name_tamer 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.codeclimate.yml +18 -0
- data/.env +1 -0
- data/.gitignore +26 -0
- data/.hound.yml +6 -0
- data/.rspec +2 -0
- data/.rubocop.yml +63 -0
- data/.travis.yml +13 -0
- data/Gemfile +20 -0
- data/Guardfile +16 -0
- data/LICENSE +21 -0
- data/README.md +82 -0
- data/Rakefile +14 -0
- data/doc/maintenance.rake +76 -0
- data/doc/prefixes.csv +49 -0
- data/doc/suffixes.csv +345 -0
- data/lib/name-tamer.rb +1 -0
- data/lib/name_tamer.rb +22 -0
- data/lib/name_tamer/array.rb +8 -0
- data/lib/name_tamer/constants.rb +121 -0
- data/lib/name_tamer/name.rb +390 -0
- data/lib/name_tamer/string.rb +280 -0
- data/lib/name_tamer/text.rb +53 -0
- data/lib/name_tamer/version.rb +3 -0
- data/name_tamer.gemspec +19 -0
- data/spec/name_tamer/name_spec.rb +95 -0
- data/spec/name_tamer/string_spec.rb +5 -0
- data/spec/name_tamer/text_spec.rb +40 -0
- data/spec/spec_helper.rb +14 -0
- data/spec/support/names.yml +741 -0
- metadata +79 -0
@@ -0,0 +1,280 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
class String
|
4
|
+
# Provide an ActiveSupport-style String#presence unless one already exists.
# String.method_defined? inspects String's instance methods; a bare
# respond_to? in the class body would ask the String class object instead.
unless String.method_defined?(:presence)
  # Returns the receiver, or nil when the string is empty.
  def presence
    empty? ? nil : self
  end
end
|
9
|
+
|
10
|
+
# Delete every match of +filter+ from the receiver, in place.
# Returns the receiver so calls can be chained.
def strip_unwanted!(filter)
  nothing = ''
  substitute!(filter, nothing)
end
|
14
|
+
|
15
|
+
# Strip leading and trailing whitespace in place. Unlike String#strip!, which
# returns nil when nothing changed, this always returns the receiver.
def strip_or_self!
  strip!
  self
end
|
18
|
+
|
19
|
+
# Collapse each run of whitespace into a single +separator+, in place.
# Returns the receiver so calls can be chained.
def whitespace_to!(separator)
  whitespace_run = /[[:space:]]+/
  substitute!(whitespace_run, separator)
end
|
23
|
+
|
24
|
+
# Normalise the spacing around commas: any whitespace before a comma is
# removed and exactly one ASCII space is left after it.
# Returns the receiver so calls can be chained.
def space_around_comma!
  loose_comma = /[[:space:]]*,[[:space:]]*/
  substitute!(loose_comma, ', ')
end
|
28
|
+
|
29
|
+
# Replace a full stop or slash with +separator+ when it has no whitespace
# on either side of it, e.g. example.com -> example-com
# Returns the receiver so calls can be chained.
def invalid_chars_to!(separator)
  embedded_punctuation = %r{(?<![[:space:]])[\.\/](?![[:space:]])}
  substitute!(embedded_punctuation, separator)
end
|
34
|
+
|
35
|
+
# Unescape percent-encoded characters in place.
#
# Decoding can introduce byte sequences that are not valid UTF-8, so the
# result is passed through ensure_safe! before it is returned.
#
# URI.unescape was removed in Ruby 3.0; the parser's own #unescape performs
# the same decoding and is available on old and new Rubies alike.
#
# Returns the receiver, untouched when there was nothing to unescape or when
# decoding raised Encoding::CompatibilityError.
def safe_unescape!
  unescaped = URI::DEFAULT_PARSER.unescape(self)
rescue Encoding::CompatibilityError # e.g. "\u2019%80"
  self
else
  return self if self == unescaped

  replace(unescaped)
  ensure_safe!
end
|
47
|
+
|
48
|
+
# Decode HTML entities (e.g. &amp; -> &) in place.
# Returns the receiver.
def unescape_html!
  decoded = CGI.unescapeHTML(self)
  replace(decoded)
end
|
52
|
+
|
53
|
+
# Make sure separators are not where they shouldn't be: collapse runs of
# +separator+ into one, then drop a separator at the very start or very end
# of the string.
#
# The edges are anchored with \A and \z. ^ and $ match at every line break,
# which would also strip separators from the middle of a multi-line string.
#
# Returns the receiver; a nil or empty separator leaves it untouched.
def fix_separators!(separator)
  return self if separator.nil? || separator.empty?

  escaped = Regexp.escape(separator)

  # No more than one of the separator in a row.
  substitute!(/#{escaped}{2,}/, separator)

  # Remove leading/trailing separator.
  substitute!(/\A#{escaped}|#{escaped}\z/, '')
end
|
65
|
+
|
66
|
+
# Transliterate, in place, non-ASCII characters that resemble latin letters
# into ones that are easy to type on an anglophone keyboard. Characters
# with no entry in APPROXIMATIONS are left as they are.
# Returns the receiver.
def approximate_latin_chars!
  gsub!(/[^\x00-\x7f]/u) { |char| APPROXIMATIONS.fetch(char, char) }
  self
end
|
72
|
+
|
73
|
+
# Strings that were wrongly decoded with a single-byte encoding contain
# tell-tale substrings. Replace each one, in place, with the UTF-8 character
# listed for it in BAD_ENCODING.
# Returns the receiver.
def fix_encoding_errors!
  gsub!(BAD_ENCODING_PATTERNS) { |garbled| BAD_ENCODING.fetch(garbled, garbled) }
  self
end
|
78
|
+
|
79
|
+
# Upcase, in place, the first character of every word.
# Returns the receiver.
def upcase_first_letter!
  gsub!(/\b\w/) { |initial| initial.upcase }
  self
end
|
82
|
+
|
83
|
+
# Downcase, in place, a single word character that follows an apostrophe
# and ends its word, e.g. the possessive in "John'S" -> "John's".
# Returns the receiver.
def downcase_after_apostrophe!
  gsub!(/\'\w\b/) { |suffix| suffix.downcase } # Lowercase 's
  self
end
|
86
|
+
|
87
|
+
# Restore the inner capital of Celtic surnames in place (Macdonald ->
# MacDonald, Mcgregor -> McGregor), then undo it for a list of known
# "Mac..." names that are not Celtic.
#
# Our list of terminal characters that indicate a non-celtic name used
# to include o but we removed it because of MacMurdo.
#
# Returns the receiver.
def fix_mac!
  looks_celtic = self =~ /\bMac[A-Za-z]{2,}[^acizj]\b/ || self =~ /\bMc/
  return self unless looks_celtic # Allows chaining

  gsub!(/\b(Ma?c)([A-Za-z]+)/) do
    prefix = Regexp.last_match(1)
    remainder = Regexp.last_match(2)
    prefix + remainder.capitalize
  end

  # Fix Mac exceptions
  mac_exceptions = %w[
    MacEdo MacEvicius MacHado MacHar MacHin MacHlin MacIas MacIulis MacKie
    MacKle MacKlin MacKmin MacKmurdo MacQuarie MacLise MacKenzie
  ]
  mac_exceptions.each do |mac_name|
    substitute!(/\b#{mac_name}/, mac_name.capitalize)
  end

  self # Allows chaining
end
|
102
|
+
|
103
|
+
# Downcase, in place, the handful of surnames that are conventionally
# written with a lower-case double f (e.g. Ffoulkes -> ffoulkes).
# Returns the receiver.
def fix_ff!
  ff_names = %w[
    Fforbes Fforde Ffinch Ffrench Ffoulkes
  ]
  ff_names.each do |ff_name|
    substitute!(ff_name, ff_name.downcase)
  end

  self # Allows chaining
end
|
111
|
+
|
112
|
+
# Fix name modifiers (van, de, la, ...) in place.
#
# A modifier from NAME_MODIFIERS that starts the string or follows
# whitespace, and is itself followed by whitespace or a hyphen, is
# downcased; ASCII spaces after it become non-breaking spaces.
# Modifiers followed by an apostrophe (d'Artagnan, Commedia dell'Arte) are
# then handled by fix_apostrophe_modifiers!.
#
# Returns the receiver.
def fix_name_modifiers!
  NAME_MODIFIERS.each do |modifier|
    modifier_pattern = /((?:[[:space:]]|^)#{modifier})([[:space:]]+|-)/

    gsub!(modifier_pattern) do
      lead = Regexp.last_match(1).rstrip.downcase
      gap = Regexp.last_match(2).tr(ASCII_SPACE, NONBREAKING_SPACE)
      "#{lead}#{gap}"
    end
  end

  fix_apostrophe_modifiers!
  self # Allows chaining
end
|
125
|
+
|
126
|
+
# Downcase, in place, a "Dell'" or "D'" modifier (together with the single
# character in front of it) when a word character follows the apostrophe,
# e.g. " D'Artagnan" -> " d'Artagnan". The pattern needs a preceding
# character, so a modifier at the very start of the string is not matched.
# Returns the receiver.
def fix_apostrophe_modifiers!
  %w[Dell D].each do |modifier|
    gsub!(/(.#{modifier}')(\w)/) do
      # Group 1 always ends in the apostrophe, so there is nothing to strip.
      "#{Regexp.last_match(1).downcase}#{Regexp.last_match(2)}"
    end
  end

  self # Allows chaining
end
|
133
|
+
|
134
|
+
# Upcase, in place, words that contain no vowels, e.g. JPR Williams.
# The surname Ng is the exception and is written with a capital N only.
# Returns the receiver.
def upcase_initials!
  gsub!(/\b([bcdfghjklmnpqrstvwxz]+)\b/i) { Regexp.last_match(1).upcase }
  gsub!(/\b(NG)\b/i) { Regexp.last_match(1).capitalize } # http://en.wikipedia.org/wiki/Ng
  self
end
|
140
|
+
|
141
|
+
# Keep known compound names that are written with spaces (not hyphens!)
# together: each ASCII space inside an occurrence of a COMPOUND_NAMES entry
# is replaced, in place, by a non-breaking space.
# Returns the receiver.
def nbsp_in_compound_name!
  COMPOUND_NAMES.each do |compound_name|
    glued = compound_name.tr(ASCII_SPACE, NONBREAKING_SPACE)
    substitute!(compound_name, glued)
  end

  self # Allows chaining
end
|
149
|
+
|
150
|
+
# Replace, in place, the single whitespace character that follows a name
# modifier (matched case-insensitively, and itself preceded by whitespace)
# with a non-breaking space.
# Returns the receiver.
def nbsp_in_name_modifier!
  NAME_MODIFIERS.each do |modifier|
    modifier_then_space = /([[:space:]]#{modifier})([[:space:]])/i
    gsub!(modifier_then_space) { "#{Regexp.last_match(1)}#{NONBREAKING_SPACE}" }
  end

  self # Allows chaining
end
|
157
|
+
|
158
|
+
# Remove, in place, the full stop that follows a single-letter initial,
# e.g. "J.R.R. Tolkien" -> "JRR Tolkien".
# Returns the receiver.
def remove_periods_from_initials!
  gsub!(/\b([a-z])\./i, '\1')
  self
end
|
161
|
+
|
162
|
+
# Close up, in place, the space between an initial and whatever follows it,
# unless what follows is a run of two or more letters, digits, apostrophes
# or U+00C0..U+00FF characters,
# e.g. "J R R Tolkien" -> "JRR Tolkien".
# Returns the receiver.
def remove_spaces_from_initials!
  # \2 is empty when the initial has no full stop after it.
  gsub!(/\b([a-z])(\.)* \b(?![a-z0-9'\u00C0-\u00FF]{2,})/i, '\1\2')
  self
end
|
167
|
+
|
168
|
+
# Insert, in place, a space after a dotted initial that runs straight into
# two or more letters or digits, e.g. "J.Smith" -> "J. Smith".
# Returns the receiver.
def ensure_space_after_initials!
  gsub!(/\b([a-z]\.)(?=[a-z0-9]{2,})/i, '\1 ')
  self
end
|
171
|
+
|
172
|
+
# Re-encode the receiver as UTF-8 in place, dropping any invalid or
# undefined byte sequences. Doesn't fully work in Ruby 2.0.
# Returns the receiver.
def ensure_safe!
  drop_bad_bytes = { invalid: :replace, undef: :replace, replace: '' }
  encode!('UTF-8', **drop_bad_bytes)
end
|
175
|
+
|
176
|
+
# Like String#gsub!, but always returns the receiver (gsub! returns nil when
# nothing matched), which makes it safe to chain.
def substitute!(pattern, replacement)
  gsub!(pattern, replacement)
  self
end
|
179
|
+
|
180
|
+
# U+00A0, used to keep multi-word names together
NONBREAKING_SPACE = "\u00a0".freeze
ASCII_SPACE = ' '.freeze

# Names written with spaces that must be kept together as one unit
COMPOUND_NAMES = [
  'Lane Fox',
  'Bonham Carter',
  'Pitt Rivers',
  'Lloyd Webber',
  'Sebag Montefiore',
  'Holmes à Court',
  'Holmes a Court',
  'Baron Cohen',
  'Strang Steel',
  'Wingfield Digby',
  'Service Company',
  'Corporation Company',
  'Corporation System',
  'Incorporations Limited'
].freeze

# Regexp fragments; each one is interpolated into a larger pattern
NAME_MODIFIERS = [
  'Al', 'Ap', 'Ben',
  'Dell[ae]', 'D[aeiou]', 'De[lrn]', 'D[ao]s',
  'El', 'La', 'L[eo]',
  'V[ao]n', 'Of', 'San',
  'St[\.]?', 'Zur'
].freeze
|
193
|
+
|
194
|
+
# Transliterations (like the i18n defaults)
|
195
|
+
# see https://github.com/svenfuchs/i18n/blob/master/lib/i18n/backend/transliterator.rb
|
196
|
+
APPROXIMATIONS = {
|
197
|
+
'À' => 'A', 'Á' => 'A', 'Â' => 'A', 'Ã' => 'A', 'Ä' => 'A', 'Å' => 'A', 'Æ' => 'AE',
|
198
|
+
'Ç' => 'C', 'È' => 'E', 'É' => 'E', 'Ê' => 'E', 'Ë' => 'E', 'Ì' => 'I', 'Í' => 'I',
|
199
|
+
'Î' => 'I', 'Ï' => 'I', 'Ð' => 'D', 'Ñ' => 'N', 'Ò' => 'O', 'Ó' => 'O', 'Ô' => 'O',
|
200
|
+
'Õ' => 'O', 'Ö' => 'O', '×' => 'x', 'Ø' => 'O', 'Ù' => 'U', 'Ú' => 'U', 'Û' => 'U',
|
201
|
+
'Ü' => 'U', 'Ý' => 'Y', 'Þ' => 'Th', 'ß' => 'ss', 'à' => 'a', 'á' => 'a', 'â' => 'a',
|
202
|
+
'ã' => 'a', 'ä' => 'a', 'å' => 'a', 'æ' => 'ae', 'ç' => 'c', 'è' => 'e', 'é' => 'e',
|
203
|
+
'ê' => 'e', 'ë' => 'e', 'ì' => 'i', 'í' => 'i', 'î' => 'i', 'ï' => 'i', 'ð' => 'd',
|
204
|
+
'ñ' => 'n', 'ò' => 'o', 'ó' => 'o', 'ô' => 'o', 'õ' => 'o', 'ö' => 'o', 'ø' => 'o',
|
205
|
+
'ù' => 'u', 'ú' => 'u', 'û' => 'u', 'ü' => 'u', 'ý' => 'y', 'þ' => 'th', 'ÿ' => 'y',
|
206
|
+
'Ā' => 'A', 'ā' => 'a', 'Ă' => 'A', 'ă' => 'a', 'Ą' => 'A', 'ą' => 'a', 'Ć' => 'C',
|
207
|
+
'ć' => 'c', 'Ĉ' => 'C', 'ĉ' => 'c', 'Ċ' => 'C', 'ċ' => 'c', 'Č' => 'C', 'č' => 'c',
|
208
|
+
'Ď' => 'D', 'ď' => 'd', 'Đ' => 'D', 'đ' => 'd', 'Ē' => 'E', 'ē' => 'e', 'Ĕ' => 'E',
|
209
|
+
'ĕ' => 'e', 'Ė' => 'E', 'ė' => 'e', 'Ę' => 'E', 'ę' => 'e', 'Ě' => 'E', 'ě' => 'e',
|
210
|
+
'Ĝ' => 'G', 'ĝ' => 'g', 'Ğ' => 'G', 'ğ' => 'g', 'Ġ' => 'G', 'ġ' => 'g', 'Ģ' => 'G',
|
211
|
+
'ģ' => 'g', 'Ĥ' => 'H', 'ĥ' => 'h', 'Ħ' => 'H', 'ħ' => 'h', 'Ĩ' => 'I', 'ĩ' => 'i',
|
212
|
+
'Ī' => 'I', 'ī' => 'i', 'Ĭ' => 'I', 'ĭ' => 'i', 'Į' => 'I', 'į' => 'i', 'İ' => 'I',
|
213
|
+
'ı' => 'i', 'IJ' => 'IJ', 'ij' => 'ij', 'Ĵ' => 'J', 'ĵ' => 'j', 'Ķ' => 'K', 'ķ' => 'k',
|
214
|
+
'ĸ' => 'k', 'Ĺ' => 'L', 'ĺ' => 'l', 'Ļ' => 'L', 'ļ' => 'l', 'Ľ' => 'L', 'ľ' => 'l',
|
215
|
+
'Ŀ' => 'L', 'ŀ' => 'l', 'Ł' => 'L', 'ł' => 'l', 'Ń' => 'N', 'ń' => 'n', 'Ņ' => 'N',
|
216
|
+
'ņ' => 'n', 'Ň' => 'N', 'ň' => 'n', 'ʼn' => "'n", 'Ŋ' => 'NG', 'ŋ' => 'ng',
|
217
|
+
'Ō' => 'O', 'ō' => 'o', 'Ŏ' => 'O', 'ŏ' => 'o', 'Ő' => 'O', 'ő' => 'o', 'Œ' => 'OE',
|
218
|
+
'œ' => 'oe', 'Ŕ' => 'R', 'ŕ' => 'r', 'Ŗ' => 'R', 'ŗ' => 'r', 'Ř' => 'R', 'ř' => 'r',
|
219
|
+
'Ś' => 'S', 'ś' => 's', 'Ŝ' => 'S', 'ŝ' => 's', 'Ş' => 'S', 'ş' => 's', 'Š' => 'S',
|
220
|
+
'š' => 's', 'Ţ' => 'T', 'ţ' => 't', 'Ť' => 'T', 'ť' => 't', 'Ŧ' => 'T', 'ŧ' => 't',
|
221
|
+
'Ũ' => 'U', 'ũ' => 'u', 'Ū' => 'U', 'ū' => 'u', 'Ŭ' => 'U', 'ŭ' => 'u', 'Ů' => 'U',
|
222
|
+
'ů' => 'u', 'Ű' => 'U', 'ű' => 'u', 'Ų' => 'U', 'ų' => 'u', 'Ŵ' => 'W', 'ŵ' => 'w',
|
223
|
+
'Ŷ' => 'Y', 'ŷ' => 'y', 'Ÿ' => 'Y', 'Ź' => 'Z', 'ź' => 'z', 'Ż' => 'Z', 'ż' => 'z',
|
224
|
+
'Ž' => 'Z', 'ž' => 'z'
|
225
|
+
}.freeze
|
226
|
+
|
227
|
+
# When strings are mistakenly encoded as single-byte character sets, instead
|
228
|
+
# of UTF-8, there are some distinctive character combinations that we can spot
|
229
|
+
# and fix
|
230
|
+
# Useful table here http://www.i18nqa.com/debug/utf8-debug.html
|
231
|
+
BAD_ENCODING = {
|
232
|
+
'€' => '€', '‚' => '‚', 'Æ’' => 'ƒ', '„' => '„', '…' => '…',
|
233
|
+
'†' => '†', '‡' => '‡', 'ˆ' => 'ˆ', '‰' => '‰', 'Å ' => 'Š',
|
234
|
+
'‹' => '‹', 'Å’' => 'Œ', 'Ž' => 'Ž', '‘' => '‘', '’' => '’',
|
235
|
+
'“' => '“',
|
236
|
+
'â€' => '”', # Note the invisible Ux009D in the key
|
237
|
+
'′' => '′', # Manually added. Some seem to use this instead of Ux2019
|
238
|
+
'•' => '•', '–' => '–', '—' => '—',
|
239
|
+
'Ëœ' => '˜', 'â„¢' => '™', 'Å¡' => 'š', '›' => '›', 'Å“' => 'œ',
|
240
|
+
'ž' => 'ž', 'Ÿ' => 'Ÿ', ' ' => ' ', '¡' => '¡', '¢' => '¢',
|
241
|
+
'£' => '£', '¤' => '¤', 'Â¥' => '¥', '¦' => '¦', '§' => '§',
|
242
|
+
'¨' => '¨', '©' => '©', 'ª' => 'ª', '«' => '«', '¬' => '¬',
|
243
|
+
'Â' => '', '®' => '®', '¯' => '¯', '°' => '°', '±' => '±',
|
244
|
+
'²' => '²', '³' => '³', '´' => '´', 'µ' => 'µ', '¶' => '¶',
|
245
|
+
'·' => '·', '¸' => '¸', '¹' => '¹', 'º' => 'º', '»' => '»',
|
246
|
+
'¼' => '¼', '½' => '½', '¾' => '¾', '¿' => '¿', 'À' => 'À',
|
247
|
+
'Ã�' => 'Á', 'Â' => 'Â', 'Ã' => 'Ã', 'Ä' => 'Ä', 'Ã…' => 'Å',
|
248
|
+
'Æ' => 'Æ', 'Ç' => 'Ç', 'È' => 'È', 'É' => 'É', 'Ê' => 'Ê',
|
249
|
+
'Ë' => 'Ë', 'ÃŒ' => 'Ì', "\xC3\x8D" => 'Í', 'ÃŽ' => 'Î', "\xC3\x8F" => 'Ï',
|
250
|
+
"\xC3\x90" => 'Ð', 'Ñ' => 'Ñ', 'Ã’' => 'Ò', 'Ó' => 'Ó', 'Ô' => 'Ô',
|
251
|
+
'Õ' => 'Õ', 'Ö' => 'Ö', '×' => '×', 'Ø' => 'Ø', 'Ù' => 'Ù',
|
252
|
+
'Ú' => 'Ú', 'Û' => 'Û', 'Ü' => 'Ü', "\xC3\x9D" => 'Ý', 'Þ' => 'Þ',
|
253
|
+
'ß' => 'ß', 'à ' => 'à', 'á' => 'á', 'â' => 'â', 'ã' => 'ã',
|
254
|
+
'ä' => 'ä', 'Ã¥' => 'å', 'æ' => 'æ', 'ç' => 'ç', 'è' => 'è',
|
255
|
+
'é' => 'é', 'ê' => 'ê', 'ë' => 'ë', 'ì' => 'ì', 'Ã' => 'í',
|
256
|
+
'î' => 'î', 'ï' => 'ï', 'ð' => 'ð', 'ñ' => 'ñ', 'ò' => 'ò',
|
257
|
+
'ó' => 'ó', 'ô' => 'ô', 'õ' => 'õ', 'ö' => 'ö', '÷' => '÷',
|
258
|
+
'ø' => 'ø', 'ù' => 'ù', 'ú' => 'ú', 'û' => 'û', 'ü' => 'ü',
|
259
|
+
'ý' => 'ý', 'þ' => 'þ', 'ÿ' => 'ÿ',
|
260
|
+
"\x00" => '' # Manually added to avoid Bad Argument exception
|
261
|
+
}.freeze
|
262
|
+
|
263
|
+
# One alternation over every key of BAD_ENCODING, wrapped in a capture group.
# Each key is escaped so that it is always matched literally, even if it
# contains a regexp metacharacter.
BAD_ENCODING_PATTERNS = /(#{BAD_ENCODING.keys.map { |key| Regexp.escape(key) }.join('|')})/
|
264
|
+
|
265
|
+
# Colorize strings
# Defines one method per foreground colour (e.g. #red) and one per
# foreground/background pair (e.g. #red_on_blue). ANSI foreground codes
# start at 30 and background codes at 40, in the order listed here.
ansi_colors = %w[black red green yellow blue magenta cyan white]

ansi_colors.each_with_index do |foreground, fg_offset|
  fg_code = 30 + fg_offset
  define_method(foreground) { ansi_attributes(fg_code) }

  ansi_colors.each_with_index do |background, bg_offset|
    bg_code = 40 + bg_offset
    define_method("#{foreground}_on_#{background}") { ansi_attributes(fg_code, bg_code) }
  end
end
|
276
|
+
|
277
|
+
# Wrap the receiver in an ANSI escape sequence that switches on the given
# attribute codes and resets all attributes afterwards.
# Returns a new string; the receiver is not modified.
def ansi_attributes(*args)
  codes = args.join(';')
  "\e[#{codes}m#{self}\e[0m"
end
|
280
|
+
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
module NameTamer
  # A piece of free text from which slugs can be derived.
  class Text
    # string - the text to work on
    # args   - options hash; the keys read here are :sep (slug separator),
    #          :rfc3987 (truthy selects FILTER_RFC3987 instead of
    #          FILTER_COMPAT) and :filter (overrides the filter pattern)
    def initialize(string, args = {})
      @string = string
      @args = args
    end

    # All the potential slugs from the string
    # e.g. 'lorem ipsum dolor' -> ['lorem', 'ipsum', 'dolor', 'lorem-ipsum', 'ipsum-dolor', 'lorem-ipsum-dolor']
    def slugs
      @slugs ||= segments.flat_map { |segment| self.class.new(segment).neighbours }.uniq
    end

    # Split the string into segments (e.g. sentences). A segment ends at a
    # punctuation mark followed by whitespace, or at a slash or hyphen that
    # has whitespace on both sides.
    def segments
      string.split(%r{(?:[\.\?,:;!]|[[:space:]][/-])[[:space:]]})
    end

    # The string as a slug; '_' when nothing survives the filtering
    def parameterize
      @parameterize ||= (
        string
          .dup
          .whitespace_to!(separator)
          .invalid_chars_to!(separator)
          .strip_unwanted!(filter)
          .fix_separators!(separator)
          .approximate_latin_chars!
          .presence || '_'
      ).downcase
    end

    # The neighbours of the tamed name's word array, each joined with '-'
    def neighbours
      @neighbours ||= NameTamer[string].array.neighbours.map { |words| words.join('-') }
    end

    private

    attr_reader :string, :args

    # Separator used between words of the slug
    def separator
      @separator ||= args[:sep] || SLUG_DELIMITER
    end

    # Whether the RFC 3987 filter was requested
    def rfc3987
      @rfc3987 ||= args[:rfc3987] || false
    end

    # Pattern of characters to strip when building the slug
    def filter
      @filter ||= args[:filter] || (rfc3987 ? FILTER_RFC3987 : FILTER_COMPAT)
    end
  end
end
|
data/name_tamer.gemspec
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
# Make the gem's own lib directory loadable so the version constant can be read.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'name_tamer/version'

Gem::Specification.new do |spec|
  spec.name = 'name_tamer'
  spec.version = NameTamer::VERSION
  spec.authors = ['Dominic Sayers']
  spec.email = ['dominic@sayers.cc']
  spec.description = 'Useful methods for taming names'
  spec.summary = "Example: NameTamer['Mr. John Q. Smith III, MD'].simple_name # => John Smith"
  spec.homepage = 'https://github.com/dominicsayers/name_tamer'
  spec.license = 'MIT'

  # Ask git for a NUL-delimited listing and split on NUL. The English alias
  # $INPUT_RECORD_SEPARATOR is nil unless 'English' has been required, which
  # makes String#split fall back to splitting on whitespace and breaks any
  # path that contains a space.
  spec.files = `git ls-files -z`.split("\x0")
  spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files = spec.files.grep(%r{^(test|spec|features|coverage)/})
  spec.require_paths = ['lib']
end
|
@@ -0,0 +1,95 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
describe NameTamer::Name do
  context 'invalid byte sequence in UTF-8' do
    # "\xc3\x28" is an invalid byte sequence in UTF-8
    let(:name_data) { { n: "\xc3\x28", t: :person, nn: '()', sn: '()', s: '_' } }
    let(:tamed_name) { NameTamer[name_data[:n], contact_type: name_data[:t]] }

    if Gem::Version.new(RUBY_VERSION) <= Gem::Version.new('2')
      it 'fails to correct invalid byte sequence' do
        expect { tamed_name.slug }.to raise_error(
          ArgumentError,
          'invalid byte sequence in UTF-8'
        )
      end
    else
      it 'makes a slug' do
        expect(tamed_name.slug).to eq(name_data[:s])
      end

      it 'makes a nice name' do
        expect(tamed_name.nice_name).to eq(name_data[:nn])
      end

      it 'makes a searchable name' do
        expect(tamed_name.simple_name).to eq(name_data[:sn])
      end
    end
  end

  context 'all ruby versions' do
    let(:names) { YAML.load_file(File.join('spec', 'support', 'names.yml')) }

    # Build the tamed name for one fixture entry
    def tame(name_data)
      NameTamer[name_data[:n], contact_type: name_data[:t]]
    end

    it 'loads the examples correctly' do
      expect(names.length).to eq(152) # Number of examples
    end

    it 'makes a slug' do
      names.each do |name_data|
        expect(tame(name_data).slug).to eq(name_data[:s].downcase)
      end
    end

    it 'makes a nice name' do
      names.each do |name_data|
        expect(tame(name_data).nice_name).to eq(name_data[:nn])
      end
    end

    it 'makes a searchable name' do
      names.each do |name_data|
        expect(tame(name_data).simple_name).to eq(name_data[:sn])
      end
    end
  end
end
|
64
|
+
|
65
|
+
describe 'contact type inference' do
  # [name, expected contact type, wording used in the example description]
  [
    ['Mr. John Smith', :person, 'a person'],
    ['Di Doo Doo d.o.o.', :organization, 'an organization'],
    ['DiDooDoo', :organization, 'an organization'],
    ['John Smith', :person, 'a person']
  ].each do |name, expected_type, wording|
    it %(infers that "#{name}" is #{wording}) do
      expect(NameTamer[name].contact_type).to eq(expected_type)
    end
  end

  it 'announces a change in contact type' do
    tamed_name = NameTamer::Name.new('John Smith', contact_type: :person)
    tamed_name.contact_type = :organization
    expect(tamed_name.contact_type).to eq(:organization)
  end
end
|
88
|
+
|
89
|
+
describe 'iteration' do
  it 'iterates through the significant words of a name' do
    collected = []
    NameTamer['John Smith'].each_word { |word| collected.push(word) }
    expect(collected).to include('john', 'smith')
  end
end
|