name_tamer 0.6.0 → 0.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.gitignore +1 -0
- data/.rubocop.yml +34 -36
- data/.travis.yml +2 -4
- data/Gemfile +9 -3
- data/Guardfile +2 -0
- data/README.md +0 -1
- data/Rakefile +2 -0
- data/lib/name-tamer.rb +2 -0
- data/lib/name_tamer.rb +1 -1
- data/lib/name_tamer/array.rb +1 -1
- data/lib/name_tamer/constants.rb +406 -53
- data/lib/name_tamer/name.rb +4 -2
- data/lib/name_tamer/string.rb +350 -67
- data/lib/name_tamer/text.rb +4 -2
- data/lib/name_tamer/version.rb +3 -1
- data/name_tamer.gemspec +4 -3
- metadata +4 -14
- data/spec/name_tamer/name_spec.rb +0 -95
- data/spec/name_tamer/string_spec.rb +0 -5
- data/spec/name_tamer/text_spec.rb +0 -40
- data/spec/spec_helper.rb +0 -14
- data/spec/support/names.yml +0 -741
data/lib/name_tamer/name.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module NameTamer
|
2
4
|
class Name
|
3
5
|
# References:
|
@@ -215,7 +217,7 @@ module NameTamer
|
|
215
217
|
lowercase = @last_name.downcase
|
216
218
|
uppercase = @last_name.upcase
|
217
219
|
@last_name = name_case(lowercase) if [uppercase, lowercase].include?(@last_name)
|
218
|
-
@nice_name = "#{@remainder} #{@last_name}"
|
220
|
+
@nice_name = +"#{@remainder} #{@last_name}"
|
219
221
|
end
|
220
222
|
|
221
223
|
# Conjoin compound names with non-breaking spaces
|
@@ -249,7 +251,7 @@ module NameTamer
|
|
249
251
|
return unless first_name || last_name
|
250
252
|
|
251
253
|
separator = first_name && last_name ? ' ' : ''
|
252
|
-
@simple_name = "#{first_name}#{separator}#{last_name}"
|
254
|
+
@simple_name = +"#{first_name}#{separator}#{last_name}"
|
253
255
|
end
|
254
256
|
|
255
257
|
def find_first_usable_name(parts)
|
data/lib/name_tamer/string.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
#
|
1
|
+
# frozen_string_literal: true
|
2
2
|
|
3
3
|
class String
|
4
4
|
unless respond_to? :presence
|
@@ -23,7 +23,8 @@ class String
|
|
23
23
|
|
24
24
|
# Ensure commas have exactly one space after them
|
25
25
|
def space_around_comma!
|
26
|
-
substitute!(/[[:space:]]*,[[:space:]]*/, ',
|
26
|
+
substitute!(/[[:space:]]*,[[:space:]]*/, ',
|
27
|
+
')
|
27
28
|
end
|
28
29
|
|
29
30
|
# Change some characters embedded in words to our separator character
|
@@ -36,10 +37,7 @@ class String
|
|
36
37
|
# This might introduce UTF-8 invalid byte sequence
|
37
38
|
# so we take precautions
|
38
39
|
def safe_unescape!
|
39
|
-
string =
|
40
|
-
rescue Encoding::CompatibilityError # e.g. "\u2019%80"
|
41
|
-
return self
|
42
|
-
else
|
40
|
+
string = CGI.unescape(gsub('+', '%2B'))
|
43
41
|
return self if self == string
|
44
42
|
replace string
|
45
43
|
ensure_safe!
|
@@ -177,51 +175,240 @@ class String
|
|
177
175
|
gsub!(pattern, replacement) || self
|
178
176
|
end
|
179
177
|
|
180
|
-
NONBREAKING_SPACE = "\u00a0"
|
181
|
-
ASCII_SPACE = ' '
|
178
|
+
NONBREAKING_SPACE = "\u00a0"
|
179
|
+
ASCII_SPACE = ' '
|
182
180
|
|
183
181
|
COMPOUND_NAMES = [
|
184
|
-
|
185
|
-
'Baron Cohen',
|
186
|
-
'
|
182
|
+
# Known families with a space in their surname
|
183
|
+
'Baron Cohen',
|
184
|
+
'Bonham Carter',
|
185
|
+
'Holmes a Court',
|
186
|
+
'Holmes à Court',
|
187
|
+
'Lane Fox',
|
188
|
+
'Lloyd Webber',
|
189
|
+
'Pitt Rivers',
|
190
|
+
'Sebag Montefiore',
|
191
|
+
'Strang Steel',
|
192
|
+
'Wedgwood Benn',
|
193
|
+
'Wingfield Digby',
|
194
|
+
# Sometimes companies appear as people
|
195
|
+
'Corporation Company',
|
196
|
+
'Corporation System',
|
197
|
+
'Incorporations Limited',
|
198
|
+
'Service Company',
|
187
199
|
].freeze
|
188
200
|
|
189
201
|
NAME_MODIFIERS = [
|
190
|
-
'Al',
|
191
|
-
'
|
202
|
+
'Al',
|
203
|
+
'Ap',
|
204
|
+
'Ben',
|
205
|
+
'D[aeiou]',
|
206
|
+
'D[ao]s',
|
207
|
+
'De[lrn]',
|
208
|
+
'Dell[ae]',
|
209
|
+
'El',
|
210
|
+
'L[eo]',
|
211
|
+
'La',
|
212
|
+
'Of',
|
213
|
+
'San',
|
214
|
+
'St[\.]?',
|
215
|
+
'V[ao]n',
|
216
|
+
'Zur',
|
192
217
|
].freeze
|
193
218
|
|
194
219
|
# Transliterations (like the i18n defaults)
|
195
220
|
# see https://github.com/svenfuchs/i18n/blob/master/lib/i18n/backend/transliterator.rb
|
196
221
|
APPROXIMATIONS = {
|
197
|
-
'
|
198
|
-
'
|
199
|
-
'
|
200
|
-
'
|
201
|
-
'
|
202
|
-
'
|
203
|
-
'
|
204
|
-
'
|
205
|
-
'
|
206
|
-
'
|
207
|
-
'
|
208
|
-
'
|
209
|
-
'
|
210
|
-
'
|
211
|
-
'
|
212
|
-
'
|
213
|
-
'
|
214
|
-
'
|
215
|
-
'
|
216
|
-
'
|
217
|
-
'
|
218
|
-
'
|
219
|
-
'
|
220
|
-
'
|
221
|
-
'
|
222
|
-
'
|
223
|
-
'
|
224
|
-
'
|
222
|
+
'İ' => 'I',
|
223
|
+
'×' => 'x',
|
224
|
+
'ß' => 'ss',
|
225
|
+
'À' => 'A',
|
226
|
+
'à' => 'a',
|
227
|
+
'Á' => 'A',
|
228
|
+
'á' => 'a',
|
229
|
+
'Â' => 'A',
|
230
|
+
'â' => 'a',
|
231
|
+
'Ã' => 'A',
|
232
|
+
'ã' => 'a',
|
233
|
+
'Ä' => 'A',
|
234
|
+
'ä' => 'a',
|
235
|
+
'Å' => 'A',
|
236
|
+
'å' => 'a',
|
237
|
+
'Æ' => 'AE',
|
238
|
+
'æ' => 'ae',
|
239
|
+
'Ç' => 'C',
|
240
|
+
'ç' => 'c',
|
241
|
+
'È' => 'E',
|
242
|
+
'è' => 'e',
|
243
|
+
'É' => 'E',
|
244
|
+
'é' => 'e',
|
245
|
+
'Ê' => 'E',
|
246
|
+
'ê' => 'e',
|
247
|
+
'Ë' => 'E',
|
248
|
+
'ë' => 'e',
|
249
|
+
'Ì' => 'I',
|
250
|
+
'ì' => 'i',
|
251
|
+
'Í' => 'I',
|
252
|
+
'í' => 'i',
|
253
|
+
'Î' => 'I',
|
254
|
+
'î' => 'i',
|
255
|
+
'Ï' => 'I',
|
256
|
+
'ï' => 'i',
|
257
|
+
'Ð' => 'D',
|
258
|
+
'ð' => 'd',
|
259
|
+
'Ñ' => 'N',
|
260
|
+
'ñ' => 'n',
|
261
|
+
'Ò' => 'O',
|
262
|
+
'ò' => 'o',
|
263
|
+
'Ó' => 'O',
|
264
|
+
'ó' => 'o',
|
265
|
+
'Ô' => 'O',
|
266
|
+
'ô' => 'o',
|
267
|
+
'Õ' => 'O',
|
268
|
+
'õ' => 'o',
|
269
|
+
'Ö' => 'O',
|
270
|
+
'ö' => 'o',
|
271
|
+
'Ø' => 'O',
|
272
|
+
'ø' => 'o',
|
273
|
+
'Ù' => 'U',
|
274
|
+
'ù' => 'u',
|
275
|
+
'Ú' => 'U',
|
276
|
+
'ú' => 'u',
|
277
|
+
'Û' => 'U',
|
278
|
+
'û' => 'u',
|
279
|
+
'Ü' => 'U',
|
280
|
+
'ü' => 'u',
|
281
|
+
'Ý' => 'Y',
|
282
|
+
'ý' => 'y',
|
283
|
+
'Þ' => 'Th',
|
284
|
+
'þ' => 'th',
|
285
|
+
'ÿ' => 'y',
|
286
|
+
'Ÿ' => 'Y',
|
287
|
+
'Ā' => 'A',
|
288
|
+
'ā' => 'a',
|
289
|
+
'Ă' => 'A',
|
290
|
+
'ă' => 'a',
|
291
|
+
'Ą' => 'A',
|
292
|
+
'ą' => 'a',
|
293
|
+
'Ć' => 'C',
|
294
|
+
'ć' => 'c',
|
295
|
+
'Ĉ' => 'C',
|
296
|
+
'ĉ' => 'c',
|
297
|
+
'Ċ' => 'C',
|
298
|
+
'ċ' => 'c',
|
299
|
+
'Č' => 'C',
|
300
|
+
'č' => 'c',
|
301
|
+
'Ď' => 'D',
|
302
|
+
'ď' => 'd',
|
303
|
+
'Đ' => 'D',
|
304
|
+
'đ' => 'd',
|
305
|
+
'Ē' => 'E',
|
306
|
+
'ē' => 'e',
|
307
|
+
'Ĕ' => 'E',
|
308
|
+
'ĕ' => 'e',
|
309
|
+
'Ė' => 'E',
|
310
|
+
'ė' => 'e',
|
311
|
+
'Ę' => 'E',
|
312
|
+
'ę' => 'e',
|
313
|
+
'Ě' => 'E',
|
314
|
+
'ě' => 'e',
|
315
|
+
'Ĝ' => 'G',
|
316
|
+
'ĝ' => 'g',
|
317
|
+
'Ğ' => 'G',
|
318
|
+
'ğ' => 'g',
|
319
|
+
'Ġ' => 'G',
|
320
|
+
'ġ' => 'g',
|
321
|
+
'Ģ' => 'G',
|
322
|
+
'ģ' => 'g',
|
323
|
+
'Ĥ' => 'H',
|
324
|
+
'ĥ' => 'h',
|
325
|
+
'Ħ' => 'H',
|
326
|
+
'ħ' => 'h',
|
327
|
+
'Ĩ' => 'I',
|
328
|
+
'ĩ' => 'i',
|
329
|
+
'Ī' => 'I',
|
330
|
+
'ī' => 'i',
|
331
|
+
'Ĭ' => 'I',
|
332
|
+
'ĭ' => 'i',
|
333
|
+
'Į' => 'I',
|
334
|
+
'į' => 'i',
|
335
|
+
'ı' => 'i',
|
336
|
+
'IJ' => 'IJ',
|
337
|
+
'ij' => 'ij',
|
338
|
+
'Ĵ' => 'J',
|
339
|
+
'ĵ' => 'j',
|
340
|
+
'Ķ' => 'K',
|
341
|
+
'ķ' => 'k',
|
342
|
+
'ĸ' => 'k',
|
343
|
+
'Ĺ' => 'L',
|
344
|
+
'ĺ' => 'l',
|
345
|
+
'Ļ' => 'L',
|
346
|
+
'ļ' => 'l',
|
347
|
+
'Ľ' => 'L',
|
348
|
+
'ľ' => 'l',
|
349
|
+
'Ŀ' => 'L',
|
350
|
+
'ŀ' => 'l',
|
351
|
+
'Ł' => 'L',
|
352
|
+
'ł' => 'l',
|
353
|
+
'Ń' => 'N',
|
354
|
+
'ń' => 'n',
|
355
|
+
'Ņ' => 'N',
|
356
|
+
'ņ' => 'n',
|
357
|
+
'Ň' => 'N',
|
358
|
+
'ň' => 'n',
|
359
|
+
'ʼn' => "'n",
|
360
|
+
'Ŋ' => 'NG',
|
361
|
+
'ŋ' => 'ng',
|
362
|
+
'Ō' => 'O',
|
363
|
+
'ō' => 'o',
|
364
|
+
'Ŏ' => 'O',
|
365
|
+
'ŏ' => 'o',
|
366
|
+
'Ő' => 'O',
|
367
|
+
'ő' => 'o',
|
368
|
+
'Œ' => 'OE',
|
369
|
+
'œ' => 'oe',
|
370
|
+
'Ŕ' => 'R',
|
371
|
+
'ŕ' => 'r',
|
372
|
+
'Ŗ' => 'R',
|
373
|
+
'ŗ' => 'r',
|
374
|
+
'Ř' => 'R',
|
375
|
+
'ř' => 'r',
|
376
|
+
'Ś' => 'S',
|
377
|
+
'ś' => 's',
|
378
|
+
'Ŝ' => 'S',
|
379
|
+
'ŝ' => 's',
|
380
|
+
'Ş' => 'S',
|
381
|
+
'ş' => 's',
|
382
|
+
'Š' => 'S',
|
383
|
+
'š' => 's',
|
384
|
+
'Ţ' => 'T',
|
385
|
+
'ţ' => 't',
|
386
|
+
'Ť' => 'T',
|
387
|
+
'ť' => 't',
|
388
|
+
'Ŧ' => 'T',
|
389
|
+
'ŧ' => 't',
|
390
|
+
'Ũ' => 'U',
|
391
|
+
'ũ' => 'u',
|
392
|
+
'Ū' => 'U',
|
393
|
+
'ū' => 'u',
|
394
|
+
'Ŭ' => 'U',
|
395
|
+
'ŭ' => 'u',
|
396
|
+
'Ů' => 'U',
|
397
|
+
'ů' => 'u',
|
398
|
+
'Ű' => 'U',
|
399
|
+
'ű' => 'u',
|
400
|
+
'Ų' => 'U',
|
401
|
+
'ų' => 'u',
|
402
|
+
'Ŵ' => 'W',
|
403
|
+
'ŵ' => 'w',
|
404
|
+
'Ŷ' => 'Y',
|
405
|
+
'ŷ' => 'y',
|
406
|
+
'Ź' => 'Z',
|
407
|
+
'ź' => 'z',
|
408
|
+
'Ż' => 'Z',
|
409
|
+
'ż' => 'z',
|
410
|
+
'ž' => 'z',
|
411
|
+
'Ž' => 'Z',
|
225
412
|
}.freeze
|
226
413
|
|
227
414
|
# When strings are mistakenly encoded as single-byte character sets, instead
|
@@ -229,34 +416,130 @@ class String
|
|
229
416
|
# and fix
|
230
417
|
# Useful table here http://www.i18nqa.com/debug/utf8-debug.html
|
231
418
|
BAD_ENCODING = {
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
419
|
+
"\xC3\x8D" => 'Í',
|
420
|
+
"\xC3\x8F" => 'Ï',
|
421
|
+
"\xC3\x90" => 'Ð',
|
422
|
+
"\xC3\x9D" => 'Ý',
|
423
|
+
'Â ' => ' ',
|
424
|
+
'¡' => '¡',
|
425
|
+
'¢' => '¢',
|
426
|
+
'£' => '£',
|
427
|
+
'¤' => '¤',
|
428
|
+
'Â¥' => '¥',
|
429
|
+
'¦' => '¦',
|
430
|
+
'§' => '§',
|
431
|
+
'¨' => '¨',
|
432
|
+
'©' => '©',
|
433
|
+
'ª' => 'ª',
|
434
|
+
'«' => '«',
|
435
|
+
'¬' => '¬',
|
436
|
+
'Â' => '',
|
437
|
+
'®' => '®',
|
438
|
+
'¯' => '¯',
|
439
|
+
'°' => '°',
|
440
|
+
'±' => '±',
|
441
|
+
'²' => '²',
|
442
|
+
'³' => '³',
|
443
|
+
'´' => '´',
|
444
|
+
'µ' => 'µ',
|
445
|
+
'¶' => '¶',
|
446
|
+
'·' => '·',
|
447
|
+
'¸' => '¸',
|
448
|
+
'¹' => '¹',
|
449
|
+
'º' => 'º',
|
450
|
+
'»' => '»',
|
451
|
+
'¼' => '¼',
|
452
|
+
'½' => '½',
|
453
|
+
'¾' => '¾',
|
454
|
+
'¿' => '¿',
|
455
|
+
'€' => '€',
|
456
|
+
'â„¢' => '™',
|
236
457
|
'â€' => '”', # Note the invisible Ux009D in the key
|
458
|
+
'†' => '†',
|
459
|
+
'‡' => '‡',
|
460
|
+
'•' => '•',
|
461
|
+
'…' => '…',
|
462
|
+
'‰' => '‰',
|
237
463
|
'′' => '′', # Manually added. Some seem to use this instead of Ux2019
|
238
|
-
'
|
239
|
-
'
|
240
|
-
'
|
241
|
-
'
|
242
|
-
'
|
243
|
-
'
|
244
|
-
'
|
245
|
-
'
|
246
|
-
'
|
247
|
-
'
|
248
|
-
'
|
249
|
-
'
|
250
|
-
|
251
|
-
'
|
252
|
-
'
|
253
|
-
'
|
254
|
-
'
|
255
|
-
'
|
256
|
-
'
|
257
|
-
'
|
258
|
-
'
|
259
|
-
'
|
464
|
+
'‹' => '‹',
|
465
|
+
'›' => '›',
|
466
|
+
'“' => '“',
|
467
|
+
'‚' => '‚',
|
468
|
+
'„' => '„',
|
469
|
+
'‘' => '‘',
|
470
|
+
'–' => '–',
|
471
|
+
'—' => '—',
|
472
|
+
'’' => '’',
|
473
|
+
'Ã ' => 'à',
|
474
|
+
'á' => 'á',
|
475
|
+
'â' => 'â',
|
476
|
+
'ã' => 'ã',
|
477
|
+
'ä' => 'ä',
|
478
|
+
'Ã¥' => 'å',
|
479
|
+
'æ' => 'æ',
|
480
|
+
'ç' => 'ç',
|
481
|
+
'è' => 'è',
|
482
|
+
'é' => 'é',
|
483
|
+
'ê' => 'ê',
|
484
|
+
'ë' => 'ë',
|
485
|
+
'ì' => 'ì',
|
486
|
+
'Ã' => 'í',
|
487
|
+
'î' => 'î',
|
488
|
+
'ï' => 'ï',
|
489
|
+
'ð' => 'ð',
|
490
|
+
'ñ' => 'ñ',
|
491
|
+
'ò' => 'ò',
|
492
|
+
'ó' => 'ó',
|
493
|
+
'ô' => 'ô',
|
494
|
+
'õ' => 'õ',
|
495
|
+
'ö' => 'ö',
|
496
|
+
'÷' => '÷',
|
497
|
+
'ø' => 'ø',
|
498
|
+
'ù' => 'ù',
|
499
|
+
'ú' => 'ú',
|
500
|
+
'û' => 'û',
|
501
|
+
'ü' => 'ü',
|
502
|
+
'ý' => 'ý',
|
503
|
+
'þ' => 'þ',
|
504
|
+
'ÿ' => 'ÿ',
|
505
|
+
'ß' => 'ß',
|
506
|
+
'ÃŒ' => 'Ì',
|
507
|
+
'Ü' => 'Ü',
|
508
|
+
'Ê' => 'Ê',
|
509
|
+
'Ú' => 'Ú',
|
510
|
+
'ÃŽ' => 'Î',
|
511
|
+
'Þ' => 'Þ',
|
512
|
+
'Ã' => 'Ã',
|
513
|
+
'È' => 'È',
|
514
|
+
'Ø' => 'Ø',
|
515
|
+
'Ö' => 'Ö',
|
516
|
+
'×' => '×',
|
517
|
+
'Ñ' => 'Ñ',
|
518
|
+
'Ã’' => 'Ò',
|
519
|
+
'Â' => 'Â',
|
520
|
+
'Ó' => 'Ó',
|
521
|
+
'Ô' => 'Ô',
|
522
|
+
'Ä' => 'Ä',
|
523
|
+
'Æ' => 'Æ',
|
524
|
+
'Ç' => 'Ç',
|
525
|
+
'Õ' => 'Õ',
|
526
|
+
'Ã…' => 'Å',
|
527
|
+
'É' => 'É',
|
528
|
+
'Ë' => 'Ë',
|
529
|
+
'Û' => 'Û',
|
530
|
+
'À' => 'À',
|
531
|
+
'Ù' => 'Ù',
|
532
|
+
'Ã�' => 'Á',
|
533
|
+
'Å ' => 'Š',
|
534
|
+
'Å¡' => 'š',
|
535
|
+
'Ÿ' => 'Ÿ',
|
536
|
+
'Ž' => 'Ž',
|
537
|
+
'ž' => 'ž',
|
538
|
+
'Å’' => 'Œ',
|
539
|
+
'Å“' => 'œ',
|
540
|
+
'Æ’' => 'ƒ',
|
541
|
+
'Ëœ' => '˜',
|
542
|
+
'ˆ' => 'ˆ',
|
260
543
|
"\x00" => '' # Manually added to avoid Bad Argument exception
|
261
544
|
}.freeze
|
262
545
|
|