name-tamer 0.1.6 → 0.1.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/lib/name-tamer.rb +5 -0
- data/lib/name-tamer/version.rb +1 -1
- data/lib/string_extras.rb +40 -0
- data/spec/name_tamer_spec.rb +6 -0
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f8e5bd6a935818948438315fc5f7a1a2ff90e173
|
4
|
+
data.tar.gz: c6b2f73fa732939faa8e0eef6c680d852374b7c2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: dca80914cdbb4d1e6d254e51c2c9104ea18410424cd75663800564ac9c597cfe19c16651467cebc20883f4359b9e394ee23fbc0291cd6dad293c801db862ec84
|
7
|
+
data.tar.gz: 4cfadfe895919caf36468c5dc2c243054db0be686f8a6bb8500c69f4f7b39aeb00e9c32e6d358eb088e6e39c36ce1f9eb2d3d2ebce5806f5618aa8ee41832b75
|
data/Gemfile.lock
CHANGED
data/lib/name-tamer.rb
CHANGED
@@ -26,6 +26,7 @@ class NameTamer
|
|
26
26
|
@nice_name = name.dup # Start with the name we've received
|
27
27
|
|
28
28
|
tidy_spacing # " John Smith " -> "John Smith"
|
29
|
+
fix_encoding_errors # "RenÃ© Descartes" -> "René Descartes"
|
29
30
|
consolidate_initials # "I. B. M." -> "I.B.M."
|
30
31
|
remove_adfixes # prefixes and suffixes: "Smith, John, Jr." -> "Smith, John"
|
31
32
|
fixup_last_name_first # "Smith, John" -> "John Smith"
|
@@ -108,6 +109,10 @@ class NameTamer
|
|
108
109
|
.whitespace_to!(ASCII_SPACE)
|
109
110
|
end
|
110
111
|
|
112
|
+
def fix_encoding_errors
|
113
|
+
@nice_name.fix_encoding_errors!
|
114
|
+
end
|
115
|
+
|
111
116
|
# Remove spaces from groups of initials
|
112
117
|
def consolidate_initials
|
113
118
|
@nice_name
|
data/lib/name-tamer/version.rb
CHANGED
data/lib/string_extras.rb
CHANGED
@@ -51,6 +51,13 @@ class String
|
|
51
51
|
self # Allows chaining
|
52
52
|
end
|
53
53
|
|
54
|
+
# Strings that were wrongly encoded with single-byte encodings sometimes have
|
55
|
+
# tell-tale substrings that we can convert back into the correct UTF-8 characters
|
56
|
+
def fix_encoding_errors!
|
57
|
+
self.gsub!(BAD_ENCODING_PATTERNS) { |substring| BAD_ENCODING[substring] || substring }
|
58
|
+
self # Allows chaining
|
59
|
+
end
|
60
|
+
|
54
61
|
def upcase_first_letter!
|
55
62
|
self.gsub!(/\b\w/) { |first| first.upcase }
|
56
63
|
self # Allows chaining
|
@@ -190,4 +197,37 @@ class String
|
|
190
197
|
'Ŷ' => 'Y', 'ŷ' => 'y', 'Ÿ' => 'Y', 'Ź' => 'Z', 'ź' => 'z', 'Ż' => 'Z', 'ż' => 'z',
|
191
198
|
'Ž' => 'Z', 'ž' => 'z'
|
192
199
|
}
|
200
|
+
|
201
|
+
# When strings are mistakenly encoded as single-byte character sets, instead
|
202
|
+
# of UTF-8, there are some distinctive character combinations that we can spot
|
203
|
+
# and fix
|
204
|
+
BAD_ENCODING = {
|
205
|
+
'€' => '€', '‚' => '‚', 'Æ’' => 'ƒ', '„' => '„', '…' => '…',
|
206
|
+
'â€' => '†', '‡' => '‡', 'ˆ' => 'ˆ', '‰' => '‰', 'Å ' => 'Š',
|
207
|
+
'‹' => '‹', 'Å’' => 'Œ', 'Ž' => 'Ž', '‘' => '‘', '’' => '’',
|
208
|
+
'“' => '“', 'â€' => '”', '•' => '•', '–' => '–', '—' => '—',
|
209
|
+
'Ëœ' => '˜', 'â„¢' => '™', 'Å¡' => 'š', '›' => '›', 'Å“' => 'œ',
|
210
|
+
'ž' => 'ž', 'Ÿ' => 'Ÿ', ' ' => ' ', '¡' => '¡', '¢' => '¢',
|
211
|
+
'£' => '£', '¤' => '¤', 'Â¥' => '¥', '¦' => '¦', '§' => '§',
|
212
|
+
'¨' => '¨', '©' => '©', 'ª' => 'ª', '«' => '«', '¬' => '¬',
|
213
|
+
'Â' => '', '®' => '®', '¯' => '¯', '°' => '°', '±' => '±',
|
214
|
+
'²' => '²', '³' => '³', '´' => '´', 'µ' => 'µ', '¶' => '¶',
|
215
|
+
'·' => '·', '¸' => '¸', '¹' => '¹', 'º' => 'º', '»' => '»',
|
216
|
+
'¼' => '¼', '½' => '½', '¾' => '¾', '¿' => '¿', 'À' => 'À',
|
217
|
+
'Ã�' => 'Á', 'Â' => 'Â', 'Ã' => 'Ã', 'Ä' => 'Ä', 'Ã…' => 'Å',
|
218
|
+
'Æ' => 'Æ', 'Ç' => 'Ç', 'È' => 'È', 'É' => 'É', 'Ê' => 'Ê',
|
219
|
+
'Ë' => 'Ë', 'ÃŒ' => 'Ì', 'Ã�' => 'Í', 'ÃŽ' => 'Î', 'Ã�' => 'Ï',
|
220
|
+
'Ã�' => 'Ð', 'Ñ' => 'Ñ', 'Ã’' => 'Ò', 'Ó' => 'Ó', 'Ô' => 'Ô',
|
221
|
+
'Õ' => 'Õ', 'Ö' => 'Ö', '×' => '×', 'Ø' => 'Ø', 'Ù' => 'Ù',
|
222
|
+
'Ú' => 'Ú', 'Û' => 'Û', 'Ãœ' => 'Ü', 'Ã�' => 'Ý', 'Þ' => 'Þ',
|
223
|
+
'ß' => 'ß', 'à ' => 'à', 'á' => 'á', 'â' => 'â', 'ã' => 'ã',
|
224
|
+
'ä' => 'ä', 'Ã¥' => 'å', 'æ' => 'æ', 'ç' => 'ç', 'è' => 'è',
|
225
|
+
'é' => 'é', 'ê' => 'ê', 'ë' => 'ë', 'ì' => 'ì', 'Ã' => 'í',
|
226
|
+
'î' => 'î', 'ï' => 'ï', 'ð' => 'ð', 'ñ' => 'ñ', 'ò' => 'ò',
|
227
|
+
'ó' => 'ó', 'ô' => 'ô', 'õ' => 'õ', 'ö' => 'ö', '÷' => '÷',
|
228
|
+
'ø' => 'ø', 'ù' => 'ù', 'ú' => 'ú', 'û' => 'û', 'ü' => 'ü',
|
229
|
+
'ý' => 'ý', 'þ' => 'þ', 'ÿ' => 'ÿ'
|
230
|
+
}
|
231
|
+
|
232
|
+
BAD_ENCODING_PATTERNS = /(#{BAD_ENCODING.keys.join('|')})/
|
193
233
|
end
|
data/spec/name_tamer_spec.rb
CHANGED
@@ -181,6 +181,12 @@ describe NameTamer do
|
|
181
181
|
nn: 'Scout® Loyalty Optimizer',
|
182
182
|
sn: 'Scout Loyalty Optimizer',
|
183
183
|
s: 'scout-loyalty-optimizer'
|
184
|
+
},
|
185
|
+
{ n: 'René Descartes',
|
186
|
+
t: :person,
|
187
|
+
nn: 'René Descartes',
|
188
|
+
sn: 'René Descartes',
|
189
|
+
s:'rene-descartes'
|
184
190
|
}
|
185
191
|
]
|
186
192
|
end
|