name-tamer 0.1.6 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/lib/name-tamer.rb +5 -0
- data/lib/name-tamer/version.rb +1 -1
- data/lib/string_extras.rb +40 -0
- data/spec/name_tamer_spec.rb +6 -0
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f8e5bd6a935818948438315fc5f7a1a2ff90e173
|
4
|
+
data.tar.gz: c6b2f73fa732939faa8e0eef6c680d852374b7c2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: dca80914cdbb4d1e6d254e51c2c9104ea18410424cd75663800564ac9c597cfe19c16651467cebc20883f4359b9e394ee23fbc0291cd6dad293c801db862ec84
|
7
|
+
data.tar.gz: 4cfadfe895919caf36468c5dc2c243054db0be686f8a6bb8500c69f4f7b39aeb00e9c32e6d358eb088e6e39c36ce1f9eb2d3d2ebce5806f5618aa8ee41832b75
|
data/Gemfile.lock
CHANGED
data/lib/name-tamer.rb
CHANGED
@@ -26,6 +26,7 @@ class NameTamer
|
|
26
26
|
@nice_name = name.dup # Start with the name we've received
|
27
27
|
|
28
28
|
tidy_spacing # " John Smith " -> "John Smith"
|
29
|
+
fix_encoding_errors # "RenÃ© Descartes" -> "René Descartes"
|
29
30
|
consolidate_initials # "I. B. M." -> "I.B.M."
|
30
31
|
remove_adfixes # prefixes and suffixes: "Smith, John, Jr." -> "Smith, John"
|
31
32
|
fixup_last_name_first # "Smith, John" -> "John Smith"
|
@@ -108,6 +109,10 @@ class NameTamer
|
|
108
109
|
.whitespace_to!(ASCII_SPACE)
|
109
110
|
end
|
110
111
|
|
112
|
+
def fix_encoding_errors
|
113
|
+
@nice_name.fix_encoding_errors!
|
114
|
+
end
|
115
|
+
|
111
116
|
# Remove spaces from groups of initials
|
112
117
|
def consolidate_initials
|
113
118
|
@nice_name
|
data/lib/name-tamer/version.rb
CHANGED
data/lib/string_extras.rb
CHANGED
@@ -51,6 +51,13 @@ class String
|
|
51
51
|
self # Allows chaining
|
52
52
|
end
|
53
53
|
|
54
|
+
# Strings whose UTF-8 bytes were wrongly decoded as a single-byte encoding sometimes contain
|
55
|
+
# tell-tale substrings that we can convert back into the correct UTF-8 characters
|
56
|
+
def fix_encoding_errors!
|
57
|
+
self.gsub!(BAD_ENCODING_PATTERNS) { |substring| BAD_ENCODING[substring] || substring }
|
58
|
+
self # Allows chaining
|
59
|
+
end
|
60
|
+
|
54
61
|
def upcase_first_letter!
|
55
62
|
self.gsub!(/\b\w/) { |first| first.upcase }
|
56
63
|
self # Allows chaining
|
@@ -190,4 +197,37 @@ class String
|
|
190
197
|
'Ŷ' => 'Y', 'ŷ' => 'y', 'Ÿ' => 'Y', 'Ź' => 'Z', 'ź' => 'z', 'Ż' => 'Z', 'ż' => 'z',
|
191
198
|
'Ž' => 'Z', 'ž' => 'z'
|
192
199
|
}
|
200
|
+
|
201
|
+
# When text is mistakenly decoded as a single-byte character set, instead
|
202
|
+
# of UTF-8, there are some distinctive character combinations that we can spot
|
203
|
+
# and fix
|
204
|
+
BAD_ENCODING = {
|
205
|
+
'€' => '€', '‚' => '‚', 'Æ’' => 'ƒ', '„' => '„', '…' => '…',
|
206
|
+
'â€' => '†', '‡' => '‡', 'ˆ' => 'ˆ', '‰' => '‰', 'Å ' => 'Š',
|
207
|
+
'‹' => '‹', 'Å’' => 'Œ', 'Ž' => 'Ž', '‘' => '‘', '’' => '’',
|
208
|
+
'“' => '“', 'â€' => '”', '•' => '•', '–' => '–', '—' => '—',
|
209
|
+
'Ëœ' => '˜', 'â„¢' => '™', 'Å¡' => 'š', '›' => '›', 'Å“' => 'œ',
|
210
|
+
'ž' => 'ž', 'Ÿ' => 'Ÿ', ' ' => ' ', '¡' => '¡', '¢' => '¢',
|
211
|
+
'£' => '£', '¤' => '¤', 'Â¥' => '¥', '¦' => '¦', '§' => '§',
|
212
|
+
'¨' => '¨', '©' => '©', 'ª' => 'ª', '«' => '«', '¬' => '¬',
|
213
|
+
'Â' => '', '®' => '®', '¯' => '¯', '°' => '°', '±' => '±',
|
214
|
+
'²' => '²', '³' => '³', '´' => '´', 'µ' => 'µ', '¶' => '¶',
|
215
|
+
'·' => '·', '¸' => '¸', '¹' => '¹', 'º' => 'º', '»' => '»',
|
216
|
+
'¼' => '¼', '½' => '½', '¾' => '¾', '¿' => '¿', 'À' => 'À',
|
217
|
+
'Ã�' => 'Á', 'Â' => 'Â', 'Ã' => 'Ã', 'Ä' => 'Ä', 'Ã…' => 'Å',
|
218
|
+
'Æ' => 'Æ', 'Ç' => 'Ç', 'È' => 'È', 'É' => 'É', 'Ê' => 'Ê',
|
219
|
+
'Ë' => 'Ë', 'ÃŒ' => 'Ì', 'Ã�' => 'Í', 'ÃŽ' => 'Î', 'Ã�' => 'Ï',
|
220
|
+
'Ã�' => 'Ð', 'Ñ' => 'Ñ', 'Ã’' => 'Ò', 'Ó' => 'Ó', 'Ô' => 'Ô',
|
221
|
+
'Õ' => 'Õ', 'Ö' => 'Ö', '×' => '×', 'Ø' => 'Ø', 'Ù' => 'Ù',
|
222
|
+
'Ú' => 'Ú', 'Û' => 'Û', 'Ü' => 'Ü', 'Ã�' => 'Ý', 'Þ' => 'Þ',
|
223
|
+
'ß' => 'ß', 'à ' => 'à', 'á' => 'á', 'â' => 'â', 'ã' => 'ã',
|
224
|
+
'ä' => 'ä', 'Ã¥' => 'å', 'æ' => 'æ', 'ç' => 'ç', 'è' => 'è',
|
225
|
+
'é' => 'é', 'ê' => 'ê', 'ë' => 'ë', 'ì' => 'ì', 'Ã' => 'í',
|
226
|
+
'î' => 'î', 'ï' => 'ï', 'ð' => 'ð', 'ñ' => 'ñ', 'ò' => 'ò',
|
227
|
+
'ó' => 'ó', 'ô' => 'ô', 'õ' => 'õ', 'ö' => 'ö', '÷' => '÷',
|
228
|
+
'ø' => 'ø', 'ù' => 'ù', 'ú' => 'ú', 'û' => 'û', 'ü' => 'ü',
|
229
|
+
'ý' => 'ý', 'þ' => 'þ', 'ÿ' => 'ÿ'
|
230
|
+
}
|
231
|
+
|
232
|
+
# Matches any BAD_ENCODING key. Regexp.union escapes each key, and keys are
# tried longest-first so a short key cannot shadow a longer key it prefixes
# (regex alternation picks the first alternative that matches).
BAD_ENCODING_PATTERNS = /(#{Regexp.union(BAD_ENCODING.keys.sort_by { |key| -key.length })})/
|
193
233
|
end
|
data/spec/name_tamer_spec.rb
CHANGED
@@ -181,6 +181,12 @@ describe NameTamer do
|
|
181
181
|
nn: 'Scout® Loyalty Optimizer',
|
182
182
|
sn: 'Scout Loyalty Optimizer',
|
183
183
|
s: 'scout-loyalty-optimizer'
|
184
|
+
},
|
185
|
+
{ n: 'René Descartes',
|
186
|
+
t: :person,
|
187
|
+
nn: 'René Descartes',
|
188
|
+
sn: 'René Descartes',
|
189
|
+
s:'rene-descartes'
|
184
190
|
}
|
185
191
|
]
|
186
192
|
end
|