name_tamer 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.codeclimate.yml +18 -0
- data/.env +1 -0
- data/.gitignore +26 -0
- data/.hound.yml +6 -0
- data/.rspec +2 -0
- data/.rubocop.yml +63 -0
- data/.travis.yml +13 -0
- data/Gemfile +20 -0
- data/Guardfile +16 -0
- data/LICENSE +21 -0
- data/README.md +82 -0
- data/Rakefile +14 -0
- data/doc/maintenance.rake +76 -0
- data/doc/prefixes.csv +49 -0
- data/doc/suffixes.csv +345 -0
- data/lib/name-tamer.rb +1 -0
- data/lib/name_tamer.rb +22 -0
- data/lib/name_tamer/array.rb +8 -0
- data/lib/name_tamer/constants.rb +121 -0
- data/lib/name_tamer/name.rb +390 -0
- data/lib/name_tamer/string.rb +280 -0
- data/lib/name_tamer/text.rb +53 -0
- data/lib/name_tamer/version.rb +3 -0
- data/name_tamer.gemspec +19 -0
- data/spec/name_tamer/name_spec.rb +95 -0
- data/spec/name_tamer/string_spec.rb +5 -0
- data/spec/name_tamer/text_spec.rb +40 -0
- data/spec/spec_helper.rb +14 -0
- data/spec/support/names.yml +741 -0
- metadata +79 -0
@@ -0,0 +1,280 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
class String
|
4
|
+
# Backfill an ActiveSupport-style String#presence when nothing provides one.
# The check must be about *instance* methods: a bare `respond_to?` in a class
# body is answered by the String class object itself, not by string instances.
unless String.method_defined?(:presence)
  # @return [String, nil] the receiver, or nil when the receiver is empty
  def presence
    self unless empty?
  end
end
|
9
|
+
|
10
|
+
# Strip illegal characters out completely
|
11
|
+
# Deletes every match of +filter+ from the receiver, in place.
#
# @param filter [Regexp, String] pattern describing the unwanted characters
# @return [String] the receiver, whether or not anything was removed
def strip_unwanted!(filter)
  gsub!(filter, '')
  self
end
|
14
|
+
|
15
|
+
# Chain-friendly strip!: trims surrounding whitespace in place and always
# hands back the receiver (plain strip! returns nil when nothing changed).
#
# @return [String] the receiver
def strip_or_self!
  strip!
  self
end
|
18
|
+
|
19
|
+
# Change any whitespace into our separator character
|
20
|
+
# Replaces each run of whitespace with a single +separator+, in place.
#
# @param separator [String] text that takes the place of each whitespace run
# @return [String] the receiver
def whitespace_to!(separator)
  gsub!(/[[:space:]]+/, separator)
  self
end
|
23
|
+
|
24
|
+
# Ensure commas have no whitespace before them and exactly one space after them
|
25
|
+
# Normalizes comma spacing in place: any whitespace on either side of a comma
# is dropped and the comma is followed by exactly one space.
#
# @return [String] the receiver
def space_around_comma!
  gsub!(/[[:space:]]*,[[:space:]]*/, ', ')
  self
end
|
28
|
+
|
29
|
+
# Change some characters embedded in words to our separator character
|
30
|
+
# e.g. example.com -> example-com
|
31
|
+
# Swaps a period or slash for +separator+, in place, unless the character
# has whitespace immediately before or after it.
# e.g. example.com -> example-com
#
# @param separator [String] replacement for the embedded character
# @return [String] the receiver
def invalid_chars_to!(separator)
  gsub!(%r{(?<![[:space:]])[\.\/](?![[:space:]])}, separator)
  self
end
|
34
|
+
|
35
|
+
# Unescape percent-encoded characters
|
36
|
+
# This might introduce UTF-8 invalid byte sequence
|
37
|
+
# so we take precautions
|
38
|
+
# Decodes percent-encoded sequences (e.g. "%20") in place.
#
# Decoding can produce bytes that are not valid UTF-8, so a changed string is
# passed through ensure_safe! before it is returned.
#
# @return [String] the receiver; left untouched when decoding raises
#   Encoding::CompatibilityError or when there was nothing to decode
def safe_unescape!
  # URI.unescape was removed in Ruby 3.0; the default parser still exposes
  # the same decoding.
  decoded = URI::DEFAULT_PARSER.unescape(self)
rescue Encoding::CompatibilityError # e.g. "\u2019%80"
  self
else
  return self if self == decoded

  replace decoded
  ensure_safe!
end
|
47
|
+
|
48
|
+
# Remove HTML entities
|
49
|
+
# Replaces the receiver's content with its HTML-unescaped form
# (as produced by CGI.unescapeHTML).
#
# @return [String] the receiver
def unescape_html!
  decoded = CGI.unescapeHTML(self)
  replace(decoded)
end
|
52
|
+
|
53
|
+
# Make sure separators are not where they shouldn't be
|
54
|
+
# Tidies separator usage in place: runs of the separator collapse to a single
# one, and a separator at the start or end of a line is removed.
#
# @param separator [String, nil] the separator in use; nil or empty is a no-op
# @return [String] the receiver
def fix_separators!(separator)
  return self if separator.nil? || separator.empty?

  sep = Regexp.escape(separator)

  # No more than one of the separator in a row. The group makes the
  # quantifier cover the whole separator; without it only the separator's
  # last character would be repeated, so multi-character separators were
  # never collapsed.
  gsub!(/(?:#{sep}){2,}/, separator)

  # Remove leading/trailing separator.
  gsub!(/^#{sep}|#{sep}$/i, '')

  self
end
|
65
|
+
|
66
|
+
# Any characters that resemble latin characters might usefully be
|
67
|
+
# transliterated into ones that are easy to type on an anglophone
|
68
|
+
# keyboard.
|
69
|
+
# Transliterates non-ASCII characters in place using the APPROXIMATIONS
# table; characters without an entry are kept as they are.
#
# @return [String] the receiver
def approximate_latin_chars!
  gsub!(/[^\x00-\x7f]/u) { |char| APPROXIMATIONS.fetch(char, char) }
  self
end
|
72
|
+
|
73
|
+
# Strings that were wrongly encoded with single-byte encodings sometimes have
|
74
|
+
# tell-tale substrings that we can put back into the correct UTF-8 character
|
75
|
+
# Repairs mojibake in place: every substring matched by BAD_ENCODING_PATTERNS
# is swapped for its BAD_ENCODING entry, or kept when the table has none.
#
# @return [String] the receiver
def fix_encoding_errors!
  gsub!(BAD_ENCODING_PATTERNS) { |mojibake| BAD_ENCODING.fetch(mojibake, mojibake) }
  self
end
|
78
|
+
|
79
|
+
# Upcases the first character of every word (each word character that sits
# on a word boundary), in place.
#
# @return [String] the receiver
def upcase_first_letter!
  gsub!(/\b\w/) { |initial| initial.upcase }
  self
end
|
82
|
+
|
83
|
+
# Lowercases a single word character that follows an apostrophe and ends the
# word, e.g. the possessive in "John'S" becomes "John's". In place.
#
# @return [String] the receiver
def downcase_after_apostrophe!
  gsub!(/\'\w\b/) { |suffix| suffix.downcase } # Lowercase 's
  self
end
|
86
|
+
|
87
|
+
# Our list of terminal characters that indicate a non-celtic name used
|
88
|
+
# to include o but we removed it because of MacMurdo.
|
89
|
+
# Capitalizes the part of a name that follows a Mac/Mc prefix
# (Macdonald -> MacDonald), in place, then undoes that for a list of known
# names where "Mac" is not a prefix.
#
# Nothing happens unless the string contains a word starting with "Mc", or a
# "Mac..." word whose final letter is not one of a, c, i, z or j.
#
# @return [String] the receiver
def fix_mac!
  celtic = self =~ /\bMac[A-Za-z]{2,}[^acizj]\b/ || self =~ /\bMc/
  return self unless celtic # Allows chaining

  gsub!(/\b(Ma?c)([A-Za-z]+)/) do
    Regexp.last_match(1) + Regexp.last_match(2).capitalize
  end

  # Fix Mac exceptions
  mac_exceptions = %w[
    MacEdo MacEvicius MacHado MacHar MacHin MacHlin MacIas MacIulis MacKie
    MacKle MacKlin MacKmin MacKmurdo MacQuarie MacLise MacKenzie
  ]
  mac_exceptions.each do |mac_name|
    gsub!(/\b#{mac_name}/, mac_name.capitalize)
  end

  self # Allows chaining
end
|
102
|
+
|
103
|
+
# Fix the few surnames that are conventionally written with a lowercase "ff", e.g. ffoulkes
|
104
|
+
# Lowercases a fixed list of "Ff..." surnames in place
# (e.g. Fforde -> fforde).
#
# @return [String] the receiver
def fix_ff!
  %w[Fforbes Fforde Ffinch Ffrench Ffoulkes].each do |capitalized|
    gsub!(capitalized, capitalized.downcase)
  end

  self # Allows chaining
end
|
111
|
+
|
112
|
+
# Fixes for name modifiers followed by space
|
113
|
+
# Also replaces spaces with non-breaking spaces
|
114
|
+
# Fixes for name modifiers followed by an apostrophe, e.g. d'Artagnan, Commedia dell'Arte
|
115
|
+
# Lowercases each NAME_MODIFIERS pattern when it starts a line or follows
# whitespace and is itself followed by whitespace or a hyphen. Any ASCII
# spaces in that trailing whitespace become non-breaking spaces. Finishes by
# running fix_apostrophe_modifiers!. Works in place.
#
# @return [String] the receiver
def fix_name_modifiers!
  NAME_MODIFIERS.each do |modifier|
    pattern = /((?:[[:space:]]|^)#{modifier})([[:space:]]+|-)/

    gsub!(pattern) do
      found = Regexp.last_match
      lowered = found[1].rstrip.downcase
      joiner = found[2].tr(ASCII_SPACE, NONBREAKING_SPACE)
      "#{lowered}#{joiner}"
    end
  end

  fix_apostrophe_modifiers!
  self # Allows chaining
end
|
125
|
+
|
126
|
+
# Lowercases the modifiers Dell' and D' when they are preceded by at least
# one character and followed by a word character,
# e.g. "Commedia Dell'Arte" -> "Commedia dell'Arte". Works in place.
#
# @return [String] the receiver
def fix_apostrophe_modifiers!
  %w[Dell D].each do |modifier|
    gsub!(/(.#{modifier}')(\w)/) do
      found = Regexp.last_match
      found[1].rstrip.downcase + found[2]
    end
  end

  self # Allows chaining
end
|
133
|
+
|
134
|
+
# Upcase words with no vowels, e.g JPR Williams
|
135
|
+
# Except Ng
|
136
|
+
# Upcases, in place, every word made up solely of the consonants listed in
# the pattern (e.g. "jpr" -> "JPR"), then restores the name Ng to
# capitalized form.
#
# @return [String] the receiver
def upcase_initials!
  gsub!(/\b([bcdfghjklmnpqrstvwxz]+)\b/i) { Regexp.last_match(1).upcase }
  gsub!(/\b(NG)\b/i) { Regexp.last_match(1).capitalize } # http://en.wikipedia.org/wiki/Ng
  self
end
|
140
|
+
|
141
|
+
# Fix known last names that have spaces (not hyphens!)
|
142
|
+
# For every entry in COMPOUND_NAMES found in the receiver, replaces the
# entry's ASCII spaces with non-breaking spaces. Works in place.
#
# @return [String] the receiver
def nbsp_in_compound_name!
  COMPOUND_NAMES.each do |compound_name|
    glued = compound_name.tr(ASCII_SPACE, NONBREAKING_SPACE)
    gsub!(compound_name, glued)
  end

  self # Allows chaining
end
|
149
|
+
|
150
|
+
# Wherever a NAME_MODIFIERS pattern (matched case-insensitively) sits between
# two whitespace characters, replaces the trailing whitespace character with
# a non-breaking space. Works in place.
#
# @return [String] the receiver
def nbsp_in_name_modifier!
  NAME_MODIFIERS.each do |modifier|
    pattern = /([[:space:]]#{modifier})([[:space:]])/i
    gsub!(pattern) { "#{Regexp.last_match(1)}#{NONBREAKING_SPACE}" }
  end

  self # Allows chaining
end
|
157
|
+
|
158
|
+
# Drops the period after any single-letter word, in place
# (e.g. "J.R.R. Tolkien" -> "JRR Tolkien").
#
# @return [String] the receiver
def remove_periods_from_initials!
  gsub!(/\b([a-z])\./i, '\1')
  self
end
|
161
|
+
|
162
|
+
# Removes the space after a single-letter word (optionally followed by
# periods) when what follows the space is not a run of two or more of the
# characters listed in the lookahead, e.g. "J R R Tolkien" -> "JRR Tolkien".
# Works in place.
#
# @return [String] the receiver
def remove_spaces_from_initials!
  # \2 is empty when the initial had no period, so the backreference form
  # keeps exactly the letter and, if present, its period.
  gsub!(/\b([a-z])(\.)* \b(?![a-z0-9'\u00C0-\u00FF]{2,})/i, '\1\2')
  self
end
|
167
|
+
|
168
|
+
# Inserts a space after a dotted initial that runs straight into two or more
# letters or digits, in place (e.g. "J.Smith" -> "J. Smith").
#
# @return [String] the receiver
def ensure_space_after_initials!
  gsub!(/\b([a-z]\.)(?=[a-z0-9]{2,})/i, '\1 ')
  self
end
|
171
|
+
|
172
|
+
# Re-encodes the receiver as UTF-8 in place, asking Ruby to replace invalid
# or undefined sequences with an empty string.
#
# @return [String] the receiver
def ensure_safe!
  replacement_options = { invalid: :replace, undef: :replace, replace: '' }
  encode!('UTF-8', **replacement_options) # Doesn't fully work in Ruby 2.0
end
|
175
|
+
|
176
|
+
# Chain-friendly gsub!: performs the substitution in place and always returns
# the receiver (plain gsub! returns nil when nothing matched).
#
# @param pattern [Regexp, String] what to look for
# @param replacement [String] what to put in its place
# @return [String] the receiver
def substitute!(pattern, replacement)
  gsub!(pattern, replacement)
  self
end
|
179
|
+
|
180
|
+
# U+00A0 NO-BREAK SPACE
NONBREAKING_SPACE = "\u00A0".freeze
# U+0020, the ordinary space character
ASCII_SPACE = "\u0020".freeze
|
182
|
+
|
183
|
+
# Multi-word names whose internal spaces nbsp_in_compound_name! turns into
# non-breaking spaces.
COMPOUND_NAMES = [
  'Lane Fox',
  'Bonham Carter',
  'Pitt Rivers',
  'Lloyd Webber',
  'Sebag Montefiore',
  'Holmes à Court',
  'Holmes a Court',
  'Baron Cohen',
  'Strang Steel',
  'Wingfield Digby',
  'Service Company',
  'Corporation Company',
  'Corporation System',
  'Incorporations Limited'
].freeze
|
188
|
+
|
189
|
+
# Regexp fragments for name particles; each one is interpolated into a
# pattern by fix_name_modifiers! and nbsp_in_name_modifier!.
NAME_MODIFIERS = [
  'Al',
  'Ap',
  'Ben',
  'Dell[ae]',
  'D[aeiou]',
  'De[lrn]',
  'D[ao]s',
  'El',
  'La',
  'L[eo]',
  'V[ao]n',
  'Of',
  'San',
  'St[\.]?',
  'Zur'
].freeze
|
193
|
+
|
194
|
+
# Transliterations (like the i18n defaults)
|
195
|
+
# see https://github.com/svenfuchs/i18n/blob/master/lib/i18n/backend/transliterator.rb
|
196
|
+
APPROXIMATIONS = {
|
197
|
+
'À' => 'A', 'Á' => 'A', 'Â' => 'A', 'Ã' => 'A', 'Ä' => 'A', 'Å' => 'A', 'Æ' => 'AE',
|
198
|
+
'Ç' => 'C', 'È' => 'E', 'É' => 'E', 'Ê' => 'E', 'Ë' => 'E', 'Ì' => 'I', 'Í' => 'I',
|
199
|
+
'Î' => 'I', 'Ï' => 'I', 'Ð' => 'D', 'Ñ' => 'N', 'Ò' => 'O', 'Ó' => 'O', 'Ô' => 'O',
|
200
|
+
'Õ' => 'O', 'Ö' => 'O', '×' => 'x', 'Ø' => 'O', 'Ù' => 'U', 'Ú' => 'U', 'Û' => 'U',
|
201
|
+
'Ü' => 'U', 'Ý' => 'Y', 'Þ' => 'Th', 'ß' => 'ss', 'à' => 'a', 'á' => 'a', 'â' => 'a',
|
202
|
+
'ã' => 'a', 'ä' => 'a', 'å' => 'a', 'æ' => 'ae', 'ç' => 'c', 'è' => 'e', 'é' => 'e',
|
203
|
+
'ê' => 'e', 'ë' => 'e', 'ì' => 'i', 'í' => 'i', 'î' => 'i', 'ï' => 'i', 'ð' => 'd',
|
204
|
+
'ñ' => 'n', 'ò' => 'o', 'ó' => 'o', 'ô' => 'o', 'õ' => 'o', 'ö' => 'o', 'ø' => 'o',
|
205
|
+
'ù' => 'u', 'ú' => 'u', 'û' => 'u', 'ü' => 'u', 'ý' => 'y', 'þ' => 'th', 'ÿ' => 'y',
|
206
|
+
'Ā' => 'A', 'ā' => 'a', 'Ă' => 'A', 'ă' => 'a', 'Ą' => 'A', 'ą' => 'a', 'Ć' => 'C',
|
207
|
+
'ć' => 'c', 'Ĉ' => 'C', 'ĉ' => 'c', 'Ċ' => 'C', 'ċ' => 'c', 'Č' => 'C', 'č' => 'c',
|
208
|
+
'Ď' => 'D', 'ď' => 'd', 'Đ' => 'D', 'đ' => 'd', 'Ē' => 'E', 'ē' => 'e', 'Ĕ' => 'E',
|
209
|
+
'ĕ' => 'e', 'Ė' => 'E', 'ė' => 'e', 'Ę' => 'E', 'ę' => 'e', 'Ě' => 'E', 'ě' => 'e',
|
210
|
+
'Ĝ' => 'G', 'ĝ' => 'g', 'Ğ' => 'G', 'ğ' => 'g', 'Ġ' => 'G', 'ġ' => 'g', 'Ģ' => 'G',
|
211
|
+
'ģ' => 'g', 'Ĥ' => 'H', 'ĥ' => 'h', 'Ħ' => 'H', 'ħ' => 'h', 'Ĩ' => 'I', 'ĩ' => 'i',
|
212
|
+
'Ī' => 'I', 'ī' => 'i', 'Ĭ' => 'I', 'ĭ' => 'i', 'Į' => 'I', 'į' => 'i', 'İ' => 'I',
|
213
|
+
'ı' => 'i', 'IJ' => 'IJ', 'ij' => 'ij', 'Ĵ' => 'J', 'ĵ' => 'j', 'Ķ' => 'K', 'ķ' => 'k',
|
214
|
+
'ĸ' => 'k', 'Ĺ' => 'L', 'ĺ' => 'l', 'Ļ' => 'L', 'ļ' => 'l', 'Ľ' => 'L', 'ľ' => 'l',
|
215
|
+
'Ŀ' => 'L', 'ŀ' => 'l', 'Ł' => 'L', 'ł' => 'l', 'Ń' => 'N', 'ń' => 'n', 'Ņ' => 'N',
|
216
|
+
'ņ' => 'n', 'Ň' => 'N', 'ň' => 'n', 'ʼn' => "'n", 'Ŋ' => 'NG', 'ŋ' => 'ng',
|
217
|
+
'Ō' => 'O', 'ō' => 'o', 'Ŏ' => 'O', 'ŏ' => 'o', 'Ő' => 'O', 'ő' => 'o', 'Œ' => 'OE',
|
218
|
+
'œ' => 'oe', 'Ŕ' => 'R', 'ŕ' => 'r', 'Ŗ' => 'R', 'ŗ' => 'r', 'Ř' => 'R', 'ř' => 'r',
|
219
|
+
'Ś' => 'S', 'ś' => 's', 'Ŝ' => 'S', 'ŝ' => 's', 'Ş' => 'S', 'ş' => 's', 'Š' => 'S',
|
220
|
+
'š' => 's', 'Ţ' => 'T', 'ţ' => 't', 'Ť' => 'T', 'ť' => 't', 'Ŧ' => 'T', 'ŧ' => 't',
|
221
|
+
'Ũ' => 'U', 'ũ' => 'u', 'Ū' => 'U', 'ū' => 'u', 'Ŭ' => 'U', 'ŭ' => 'u', 'Ů' => 'U',
|
222
|
+
'ů' => 'u', 'Ű' => 'U', 'ű' => 'u', 'Ų' => 'U', 'ų' => 'u', 'Ŵ' => 'W', 'ŵ' => 'w',
|
223
|
+
'Ŷ' => 'Y', 'ŷ' => 'y', 'Ÿ' => 'Y', 'Ź' => 'Z', 'ź' => 'z', 'Ż' => 'Z', 'ż' => 'z',
|
224
|
+
'Ž' => 'Z', 'ž' => 'z'
|
225
|
+
}.freeze
|
226
|
+
|
227
|
+
# When strings are mistakenly encoded as single-byte character sets, instead
|
228
|
+
# of UTF-8, there are some distinctive character combinations that we can spot
|
229
|
+
# and fix
|
230
|
+
# Useful table here http://www.i18nqa.com/debug/utf8-debug.html
|
231
|
+
BAD_ENCODING = {
|
232
|
+
'€' => '€', '‚' => '‚', 'Æ’' => 'ƒ', '„' => '„', '…' => '…',
|
233
|
+
'†' => '†', '‡' => '‡', 'ˆ' => 'ˆ', '‰' => '‰', 'Å ' => 'Š',
|
234
|
+
'‹' => '‹', 'Å’' => 'Œ', 'Ž' => 'Ž', '‘' => '‘', '’' => '’',
|
235
|
+
'“' => '“',
|
236
|
+
'â€' => '”', # Note the invisible Ux009D in the key
|
237
|
+
'′' => '′', # Manually added. Some seem to use this instead of Ux2019
|
238
|
+
'•' => '•', '–' => '–', '—' => '—',
|
239
|
+
'Ëœ' => '˜', 'â„¢' => '™', 'Å¡' => 'š', '›' => '›', 'Å“' => 'œ',
|
240
|
+
'ž' => 'ž', 'Ÿ' => 'Ÿ', ' ' => ' ', '¡' => '¡', '¢' => '¢',
|
241
|
+
'£' => '£', '¤' => '¤', 'Â¥' => '¥', '¦' => '¦', '§' => '§',
|
242
|
+
'¨' => '¨', '©' => '©', 'ª' => 'ª', '«' => '«', '¬' => '¬',
|
243
|
+
'Â' => '', '®' => '®', '¯' => '¯', '°' => '°', '±' => '±',
|
244
|
+
'²' => '²', '³' => '³', '´' => '´', 'µ' => 'µ', '¶' => '¶',
|
245
|
+
'·' => '·', '¸' => '¸', '¹' => '¹', 'º' => 'º', '»' => '»',
|
246
|
+
'¼' => '¼', '½' => '½', '¾' => '¾', '¿' => '¿', 'À' => 'À',
|
247
|
+
'Ã�' => 'Á', 'Â' => 'Â', 'Ã' => 'Ã', 'Ä' => 'Ä', 'Ã…' => 'Å',
|
248
|
+
'Æ' => 'Æ', 'Ç' => 'Ç', 'È' => 'È', 'É' => 'É', 'Ê' => 'Ê',
|
249
|
+
'Ë' => 'Ë', 'ÃŒ' => 'Ì', "\xC3\x8D" => 'Í', 'ÃŽ' => 'Î', "\xC3\x8F" => 'Ï',
|
250
|
+
"\xC3\x90" => 'Ð', 'Ñ' => 'Ñ', 'Ã’' => 'Ò', 'Ó' => 'Ó', 'Ô' => 'Ô',
|
251
|
+
'Õ' => 'Õ', 'Ö' => 'Ö', '×' => '×', 'Ø' => 'Ø', 'Ù' => 'Ù',
|
252
|
+
'Ú' => 'Ú', 'Û' => 'Û', 'Ãœ' => 'Ü', "\xC3\x9D" => 'Ý', 'Þ' => 'Þ',
|
253
|
+
'ß' => 'ß', 'à ' => 'à', 'á' => 'á', 'â' => 'â', 'ã' => 'ã',
|
254
|
+
'ä' => 'ä', 'Ã¥' => 'å', 'æ' => 'æ', 'ç' => 'ç', 'è' => 'è',
|
255
|
+
'é' => 'é', 'ê' => 'ê', 'ë' => 'ë', 'ì' => 'ì', 'Ã' => 'í',
|
256
|
+
'î' => 'î', 'ï' => 'ï', 'ð' => 'ð', 'ñ' => 'ñ', 'ò' => 'ò',
|
257
|
+
'ó' => 'ó', 'ô' => 'ô', 'õ' => 'õ', 'ö' => 'ö', '÷' => '÷',
|
258
|
+
'ø' => 'ø', 'ù' => 'ù', 'ú' => 'ú', 'û' => 'û', 'ü' => 'ü',
|
259
|
+
'ý' => 'ý', 'þ' => 'þ', 'ÿ' => 'ÿ',
|
260
|
+
"\x00" => '' # Manually added to avoid Bad Argument exception
|
261
|
+
}.freeze
|
262
|
+
|
263
|
+
# One capture group containing an alternation of every BAD_ENCODING key, in
# table order.
BAD_ENCODING_PATTERNS = Regexp.new("(#{BAD_ENCODING.keys.join('|')})")
|
264
|
+
|
265
|
+
# Colorize strings
|
266
|
+
# Generate one method per foreground colour (e.g. #red) and one per
# foreground/background pair (e.g. #red_on_white). Foreground codes count up
# from 30 and background codes from 40, in the order the names are listed.
ansi_names = %w[black red green yellow blue magenta cyan white]

ansi_names.each.with_index(30) do |fg_name, fg_code|
  define_method(fg_name) { ansi_attributes(fg_code) }

  ansi_names.each.with_index(40) do |bg_name, bg_code|
    define_method("#{fg_name}_on_#{bg_name}") { ansi_attributes(fg_code, bg_code) }
  end
end
|
276
|
+
|
277
|
+
# Wraps the receiver in an ANSI escape sequence.
#
# @param args [Array<Integer>] attribute codes, joined with ";"
# @return [String] a new string: escape prefix, the receiver, then the
#   reset sequence "\e[0m"
def ansi_attributes(*args)
  codes = args.join(';')
  "\e[#{codes}m#{self}\e[0m"
end
|
280
|
+
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
module NameTamer
  # Wraps a piece of text and derives slugs from it.
  class Text
    # @param string [String] the text to work on
    # @param args [Hash] optional settings; the keys read by this class are
    #   :sep, :rfc3987 and :filter
    def initialize(string, args = {})
      @string = string
      @args = args
    end

    # All the potential slugs from the string
    # e.g. 'lorem ipsum dolor' -> ['lorem', 'ipsum' ,'dolor', 'lorem-ipsum', 'ipsum-dolor', 'lorem-ipsum-dolor']
    #
    # @return [Array<String>] the de-duplicated neighbours of every segment
    def slugs
      @slugs ||= segments.flat_map { |s| self.class.new(s).neighbours }.uniq
    end

    # Split the string into segments (e.g. sentences). A split happens at one
    # of . ? , : ; ! followed by whitespace, or at a whitespace-delimited
    # "/" or "-".
    #
    # @return [Array<String>]
    def segments
      string.split(%r{(?:[\.\?,:;!]|[[:space:]][/-])[[:space:]]})
    end

    # The string as a slug. Works on a copy, so the wrapped string is not
    # modified. An empty result becomes '_'.
    #
    # @return [String] lowercase slug
    def parameterize
      @parameterize ||= (
        string
          .dup
          .whitespace_to!(separator)
          .invalid_chars_to!(separator)
          .strip_unwanted!(filter)
          .fix_separators!(separator)
          .approximate_latin_chars!
          .presence || '_'
      ).downcase
    end

    # The neighbour groups reported by NameTamer for this string, each joined
    # with '-'.
    #
    # @return [Array<String>]
    def neighbours
      @neighbours ||= NameTamer[string].array.neighbours.map { |a| a.join('-') }
    end

    private

    attr_reader :string, :args

    # Separator for slugs: args[:sep] when given, otherwise SLUG_DELIMITER.
    # (The memo variable used to be misspelled @seperator.)
    def separator
      @separator ||= args[:sep] || SLUG_DELIMITER
    end

    # Whether the RFC 3987 filter was requested via args[:rfc3987].
    def rfc3987
      @rfc3987 ||= args[:rfc3987] || false
    end

    # Pattern of characters to strip: args[:filter] when given, otherwise
    # FILTER_RFC3987 or FILTER_COMPAT depending on #rfc3987.
    def filter
      @filter ||= args[:filter] || (rfc3987 ? FILTER_RFC3987 : FILTER_COMPAT)
    end
  end
end
|
data/name_tamer.gemspec
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
# Put lib/ on the load path so the version constant can be required below.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'name_tamer/version'

Gem::Specification.new do |spec|
  spec.name = 'name_tamer'
  spec.version = NameTamer::VERSION
  spec.authors = ['Dominic Sayers']
  spec.email = ['dominic@sayers.cc']
  spec.description = 'Useful methods for taming names'
  spec.summary = "Example: NameTamer['Mr. John Q. Smith III, MD'].simple_name # => John Smith"
  spec.homepage = 'https://github.com/dominicsayers/name_tamer'
  spec.license = 'MIT'

  # List tracked files NUL-delimited. $INPUT_RECORD_SEPARATOR is only defined
  # after `require 'English'`; without it the global is nil and String#split
  # falls back to splitting on whitespace, which breaks paths with spaces.
  spec.files = `git ls-files -z`.split("\x0")
  spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files = spec.files.grep(%r{^(test|spec|features|coverage)/})
  spec.require_paths = ['lib']
end
|
@@ -0,0 +1,95 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
# Specs for the three name renderings (slug, nice name, simple name): one
# hand-written invalid-UTF-8 case plus the fixture file of examples.
describe NameTamer::Name do
  context 'invalid byte sequence in UTF-8' do
    let(:name_data) { { n: "\xc3\x28", t: :person, nn: '()', sn: '()', s: '_' } } # Invalid byte sequence in UTF-8

    # Built lazily, so in the raise_error example it is constructed inside
    # the expect block.
    let(:tamed) { NameTamer[name_data[:n], contact_type: name_data[:t]] }

    if Gem::Version.new(RUBY_VERSION) <= Gem::Version.new('2')
      it 'fails to correct invalid byte sequence' do
        expect { tamed.slug }.to raise_error(
          ArgumentError,
          'invalid byte sequence in UTF-8'
        )
      end
    else
      it 'makes a slug' do
        expect(tamed.slug).to eq(name_data[:s])
      end

      it 'makes a nice name' do
        expect(tamed.nice_name).to eq(name_data[:nn])
      end

      it 'makes a searchable name' do
        expect(tamed.simple_name).to eq(name_data[:sn])
      end
    end
  end

  context 'all ruby versions' do
    let(:names) do
      fixture = File.join('spec', 'support', 'names.yml')

      # Under Psych 4 (Ruby 3.1+) YAML.load_file is a safe load and rejects
      # Symbols. The examples below read the entries with symbol keys, so
      # this trusted, in-repo fixture is loaded with unsafe_load_file where
      # that method exists.
      if YAML.respond_to?(:unsafe_load_file)
        YAML.unsafe_load_file(fixture)
      else
        YAML.load_file(fixture)
      end
    end

    it 'loads the examples correctly' do
      expect(names.length).to eq(152) # Number of examples
    end

    it 'makes a slug' do
      names.each do |name_data|
        tamed = NameTamer[name_data[:n], contact_type: name_data[:t]]
        expect(tamed.slug).to eq(name_data[:s].downcase)
      end
    end

    it 'makes a nice name' do
      names.each do |name_data|
        tamed = NameTamer[name_data[:n], contact_type: name_data[:t]]
        expect(tamed.nice_name).to eq(name_data[:nn])
      end
    end

    it 'makes a searchable name' do
      names.each do |name_data|
        tamed = NameTamer[name_data[:n], contact_type: name_data[:t]]
        expect(tamed.simple_name).to eq(name_data[:sn])
      end
    end
  end
end
|
64
|
+
|
65
|
+
# Checks the contact type NameTamer reports when none is supplied, and that
# an explicitly assigned contact type can be read back.
describe 'contact type inference' do
  {
    'Mr. John Smith' => [:person, 'a person'],
    'Di Doo Doo d.o.o.' => [:organization, 'an organization'],
    'DiDooDoo' => [:organization, 'an organization'],
    'John Smith' => [:person, 'a person']
  }.each do |raw_name, (expected_type, label)|
    it %(infers that "#{raw_name}" is #{label}) do
      expect(NameTamer[raw_name].contact_type).to eq(expected_type)
    end
  end

  it 'announces a change in contact type' do
    tamed = NameTamer::Name.new('John Smith', contact_type: :person)
    tamed.contact_type = :organization
    expect(tamed.contact_type).to eq(:organization)
  end
end
|
88
|
+
|
89
|
+
# Checks that each_word yields the lowercase words of a name.
describe 'iteration' do
  it 'iterates through the significant words of a name' do
    collected = []
    NameTamer['John Smith'].each_word { |word| collected.push(word) }
    expect(collected).to include('john', 'smith')
  end
end
|
95
|
+
end
|