name_tamer 0.6.0 → 0.6.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.gitignore +1 -0
- data/.rubocop.yml +34 -36
- data/.travis.yml +2 -4
- data/Gemfile +9 -3
- data/Guardfile +2 -0
- data/README.md +0 -1
- data/Rakefile +2 -0
- data/lib/name-tamer.rb +2 -0
- data/lib/name_tamer.rb +1 -1
- data/lib/name_tamer/array.rb +1 -1
- data/lib/name_tamer/constants.rb +406 -53
- data/lib/name_tamer/name.rb +4 -2
- data/lib/name_tamer/string.rb +350 -67
- data/lib/name_tamer/text.rb +4 -2
- data/lib/name_tamer/version.rb +3 -1
- data/name_tamer.gemspec +4 -3
- metadata +4 -14
- data/spec/name_tamer/name_spec.rb +0 -95
- data/spec/name_tamer/string_spec.rb +0 -5
- data/spec/name_tamer/text_spec.rb +0 -40
- data/spec/spec_helper.rb +0 -14
- data/spec/support/names.yml +0 -741
data/lib/name_tamer/name.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module NameTamer
|
2
4
|
class Name
|
3
5
|
# References:
|
@@ -215,7 +217,7 @@ module NameTamer
|
|
215
217
|
lowercase = @last_name.downcase
|
216
218
|
uppercase = @last_name.upcase
|
217
219
|
@last_name = name_case(lowercase) if [uppercase, lowercase].include?(@last_name)
|
218
|
-
@nice_name = "#{@remainder} #{@last_name}"
|
220
|
+
@nice_name = +"#{@remainder} #{@last_name}"
|
219
221
|
end
|
220
222
|
|
221
223
|
# Conjoin compound names with non-breaking spaces
|
@@ -249,7 +251,7 @@ module NameTamer
|
|
249
251
|
return unless first_name || last_name
|
250
252
|
|
251
253
|
separator = first_name && last_name ? ' ' : ''
|
252
|
-
@simple_name = "#{first_name}#{separator}#{last_name}"
|
254
|
+
@simple_name = +"#{first_name}#{separator}#{last_name}"
|
253
255
|
end
|
254
256
|
|
255
257
|
def find_first_usable_name(parts)
|
data/lib/name_tamer/string.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
#
|
1
|
+
# frozen_string_literal: true
|
2
2
|
|
3
3
|
class String
|
4
4
|
unless respond_to? :presence
|
@@ -23,7 +23,8 @@ class String
|
|
23
23
|
|
24
24
|
# Ensure commas have exactly one space after them
|
25
25
|
def space_around_comma!
|
26
|
-
substitute!(/[[:space:]]*,[[:space:]]*/, ',
|
26
|
+
substitute!(/[[:space:]]*,[[:space:]]*/, ',
|
27
|
+
')
|
27
28
|
end
|
28
29
|
|
29
30
|
# Change some characters embedded in words to our separator character
|
@@ -36,10 +37,7 @@ class String
|
|
36
37
|
# This might introduce UTF-8 invalid byte sequence
|
37
38
|
# so we take precautions
|
38
39
|
def safe_unescape!
|
39
|
-
string =
|
40
|
-
rescue Encoding::CompatibilityError # e.g. "\u2019%80"
|
41
|
-
return self
|
42
|
-
else
|
40
|
+
string = CGI.unescape(gsub('+', '%2B'))
|
43
41
|
return self if self == string
|
44
42
|
replace string
|
45
43
|
ensure_safe!
|
@@ -177,51 +175,240 @@ class String
|
|
177
175
|
gsub!(pattern, replacement) || self
|
178
176
|
end
|
179
177
|
|
180
|
-
NONBREAKING_SPACE = "\u00a0"
|
181
|
-
ASCII_SPACE = ' '
|
178
|
+
NONBREAKING_SPACE = "\u00a0"
|
179
|
+
ASCII_SPACE = ' '
|
182
180
|
|
183
181
|
COMPOUND_NAMES = [
|
184
|
-
|
185
|
-
'Baron Cohen',
|
186
|
-
'
|
182
|
+
# Known families with a space in their surname
|
183
|
+
'Baron Cohen',
|
184
|
+
'Bonham Carter',
|
185
|
+
'Holmes a Court',
|
186
|
+
'Holmes à Court',
|
187
|
+
'Lane Fox',
|
188
|
+
'Lloyd Webber',
|
189
|
+
'Pitt Rivers',
|
190
|
+
'Sebag Montefiore',
|
191
|
+
'Strang Steel',
|
192
|
+
'Wedgwood Benn',
|
193
|
+
'Wingfield Digby',
|
194
|
+
# Sometimes companies appear as people
|
195
|
+
'Corporation Company',
|
196
|
+
'Corporation System',
|
197
|
+
'Incorporations Limited',
|
198
|
+
'Service Company',
|
187
199
|
].freeze
|
188
200
|
|
189
201
|
NAME_MODIFIERS = [
|
190
|
-
'Al',
|
191
|
-
'
|
202
|
+
'Al',
|
203
|
+
'Ap',
|
204
|
+
'Ben',
|
205
|
+
'D[aeiou]',
|
206
|
+
'D[ao]s',
|
207
|
+
'De[lrn]',
|
208
|
+
'Dell[ae]',
|
209
|
+
'El',
|
210
|
+
'L[eo]',
|
211
|
+
'La',
|
212
|
+
'Of',
|
213
|
+
'San',
|
214
|
+
'St[\.]?',
|
215
|
+
'V[ao]n',
|
216
|
+
'Zur',
|
192
217
|
].freeze
|
193
218
|
|
194
219
|
# Transliterations (like the i18n defaults)
|
195
220
|
# see https://github.com/svenfuchs/i18n/blob/master/lib/i18n/backend/transliterator.rb
|
196
221
|
APPROXIMATIONS = {
|
197
|
-
'
|
198
|
-
'
|
199
|
-
'
|
200
|
-
'
|
201
|
-
'
|
202
|
-
'
|
203
|
-
'
|
204
|
-
'
|
205
|
-
'
|
206
|
-
'
|
207
|
-
'
|
208
|
-
'
|
209
|
-
'
|
210
|
-
'
|
211
|
-
'
|
212
|
-
'
|
213
|
-
'
|
214
|
-
'
|
215
|
-
'
|
216
|
-
'
|
217
|
-
'
|
218
|
-
'
|
219
|
-
'
|
220
|
-
'
|
221
|
-
'
|
222
|
-
'
|
223
|
-
'
|
224
|
-
'
|
222
|
+
'İ' => 'I',
|
223
|
+
'×' => 'x',
|
224
|
+
'ß' => 'ss',
|
225
|
+
'À' => 'A',
|
226
|
+
'à' => 'a',
|
227
|
+
'Á' => 'A',
|
228
|
+
'á' => 'a',
|
229
|
+
'Â' => 'A',
|
230
|
+
'â' => 'a',
|
231
|
+
'Ã' => 'A',
|
232
|
+
'ã' => 'a',
|
233
|
+
'Ä' => 'A',
|
234
|
+
'ä' => 'a',
|
235
|
+
'Å' => 'A',
|
236
|
+
'å' => 'a',
|
237
|
+
'Æ' => 'AE',
|
238
|
+
'æ' => 'ae',
|
239
|
+
'Ç' => 'C',
|
240
|
+
'ç' => 'c',
|
241
|
+
'È' => 'E',
|
242
|
+
'è' => 'e',
|
243
|
+
'É' => 'E',
|
244
|
+
'é' => 'e',
|
245
|
+
'Ê' => 'E',
|
246
|
+
'ê' => 'e',
|
247
|
+
'Ë' => 'E',
|
248
|
+
'ë' => 'e',
|
249
|
+
'Ì' => 'I',
|
250
|
+
'ì' => 'i',
|
251
|
+
'Í' => 'I',
|
252
|
+
'í' => 'i',
|
253
|
+
'Î' => 'I',
|
254
|
+
'î' => 'i',
|
255
|
+
'Ï' => 'I',
|
256
|
+
'ï' => 'i',
|
257
|
+
'Ð' => 'D',
|
258
|
+
'ð' => 'd',
|
259
|
+
'Ñ' => 'N',
|
260
|
+
'ñ' => 'n',
|
261
|
+
'Ò' => 'O',
|
262
|
+
'ò' => 'o',
|
263
|
+
'Ó' => 'O',
|
264
|
+
'ó' => 'o',
|
265
|
+
'Ô' => 'O',
|
266
|
+
'ô' => 'o',
|
267
|
+
'Õ' => 'O',
|
268
|
+
'õ' => 'o',
|
269
|
+
'Ö' => 'O',
|
270
|
+
'ö' => 'o',
|
271
|
+
'Ø' => 'O',
|
272
|
+
'ø' => 'o',
|
273
|
+
'Ù' => 'U',
|
274
|
+
'ù' => 'u',
|
275
|
+
'Ú' => 'U',
|
276
|
+
'ú' => 'u',
|
277
|
+
'Û' => 'U',
|
278
|
+
'û' => 'u',
|
279
|
+
'Ü' => 'U',
|
280
|
+
'ü' => 'u',
|
281
|
+
'Ý' => 'Y',
|
282
|
+
'ý' => 'y',
|
283
|
+
'Þ' => 'Th',
|
284
|
+
'þ' => 'th',
|
285
|
+
'ÿ' => 'y',
|
286
|
+
'Ÿ' => 'Y',
|
287
|
+
'Ā' => 'A',
|
288
|
+
'ā' => 'a',
|
289
|
+
'Ă' => 'A',
|
290
|
+
'ă' => 'a',
|
291
|
+
'Ą' => 'A',
|
292
|
+
'ą' => 'a',
|
293
|
+
'Ć' => 'C',
|
294
|
+
'ć' => 'c',
|
295
|
+
'Ĉ' => 'C',
|
296
|
+
'ĉ' => 'c',
|
297
|
+
'Ċ' => 'C',
|
298
|
+
'ċ' => 'c',
|
299
|
+
'Č' => 'C',
|
300
|
+
'č' => 'c',
|
301
|
+
'Ď' => 'D',
|
302
|
+
'ď' => 'd',
|
303
|
+
'Đ' => 'D',
|
304
|
+
'đ' => 'd',
|
305
|
+
'Ē' => 'E',
|
306
|
+
'ē' => 'e',
|
307
|
+
'Ĕ' => 'E',
|
308
|
+
'ĕ' => 'e',
|
309
|
+
'Ė' => 'E',
|
310
|
+
'ė' => 'e',
|
311
|
+
'Ę' => 'E',
|
312
|
+
'ę' => 'e',
|
313
|
+
'Ě' => 'E',
|
314
|
+
'ě' => 'e',
|
315
|
+
'Ĝ' => 'G',
|
316
|
+
'ĝ' => 'g',
|
317
|
+
'Ğ' => 'G',
|
318
|
+
'ğ' => 'g',
|
319
|
+
'Ġ' => 'G',
|
320
|
+
'ġ' => 'g',
|
321
|
+
'Ģ' => 'G',
|
322
|
+
'ģ' => 'g',
|
323
|
+
'Ĥ' => 'H',
|
324
|
+
'ĥ' => 'h',
|
325
|
+
'Ħ' => 'H',
|
326
|
+
'ħ' => 'h',
|
327
|
+
'Ĩ' => 'I',
|
328
|
+
'ĩ' => 'i',
|
329
|
+
'Ī' => 'I',
|
330
|
+
'ī' => 'i',
|
331
|
+
'Ĭ' => 'I',
|
332
|
+
'ĭ' => 'i',
|
333
|
+
'Į' => 'I',
|
334
|
+
'į' => 'i',
|
335
|
+
'ı' => 'i',
|
336
|
+
'IJ' => 'IJ',
|
337
|
+
'ij' => 'ij',
|
338
|
+
'Ĵ' => 'J',
|
339
|
+
'ĵ' => 'j',
|
340
|
+
'Ķ' => 'K',
|
341
|
+
'ķ' => 'k',
|
342
|
+
'ĸ' => 'k',
|
343
|
+
'Ĺ' => 'L',
|
344
|
+
'ĺ' => 'l',
|
345
|
+
'Ļ' => 'L',
|
346
|
+
'ļ' => 'l',
|
347
|
+
'Ľ' => 'L',
|
348
|
+
'ľ' => 'l',
|
349
|
+
'Ŀ' => 'L',
|
350
|
+
'ŀ' => 'l',
|
351
|
+
'Ł' => 'L',
|
352
|
+
'ł' => 'l',
|
353
|
+
'Ń' => 'N',
|
354
|
+
'ń' => 'n',
|
355
|
+
'Ņ' => 'N',
|
356
|
+
'ņ' => 'n',
|
357
|
+
'Ň' => 'N',
|
358
|
+
'ň' => 'n',
|
359
|
+
'ʼn' => "'n",
|
360
|
+
'Ŋ' => 'NG',
|
361
|
+
'ŋ' => 'ng',
|
362
|
+
'Ō' => 'O',
|
363
|
+
'ō' => 'o',
|
364
|
+
'Ŏ' => 'O',
|
365
|
+
'ŏ' => 'o',
|
366
|
+
'Ő' => 'O',
|
367
|
+
'ő' => 'o',
|
368
|
+
'Œ' => 'OE',
|
369
|
+
'œ' => 'oe',
|
370
|
+
'Ŕ' => 'R',
|
371
|
+
'ŕ' => 'r',
|
372
|
+
'Ŗ' => 'R',
|
373
|
+
'ŗ' => 'r',
|
374
|
+
'Ř' => 'R',
|
375
|
+
'ř' => 'r',
|
376
|
+
'Ś' => 'S',
|
377
|
+
'ś' => 's',
|
378
|
+
'Ŝ' => 'S',
|
379
|
+
'ŝ' => 's',
|
380
|
+
'Ş' => 'S',
|
381
|
+
'ş' => 's',
|
382
|
+
'Š' => 'S',
|
383
|
+
'š' => 's',
|
384
|
+
'Ţ' => 'T',
|
385
|
+
'ţ' => 't',
|
386
|
+
'Ť' => 'T',
|
387
|
+
'ť' => 't',
|
388
|
+
'Ŧ' => 'T',
|
389
|
+
'ŧ' => 't',
|
390
|
+
'Ũ' => 'U',
|
391
|
+
'ũ' => 'u',
|
392
|
+
'Ū' => 'U',
|
393
|
+
'ū' => 'u',
|
394
|
+
'Ŭ' => 'U',
|
395
|
+
'ŭ' => 'u',
|
396
|
+
'Ů' => 'U',
|
397
|
+
'ů' => 'u',
|
398
|
+
'Ű' => 'U',
|
399
|
+
'ű' => 'u',
|
400
|
+
'Ų' => 'U',
|
401
|
+
'ų' => 'u',
|
402
|
+
'Ŵ' => 'W',
|
403
|
+
'ŵ' => 'w',
|
404
|
+
'Ŷ' => 'Y',
|
405
|
+
'ŷ' => 'y',
|
406
|
+
'Ź' => 'Z',
|
407
|
+
'ź' => 'z',
|
408
|
+
'Ż' => 'Z',
|
409
|
+
'ż' => 'z',
|
410
|
+
'ž' => 'z',
|
411
|
+
'Ž' => 'Z',
|
225
412
|
}.freeze
|
226
413
|
|
227
414
|
# When strings are mistakenly encoded as single-byte character sets, instead
|
@@ -229,34 +416,130 @@ class String
|
|
229
416
|
# and fix
|
230
417
|
# Useful table here http://www.i18nqa.com/debug/utf8-debug.html
|
231
418
|
BAD_ENCODING = {
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
419
|
+
"\xC3\x8D" => 'Í',
|
420
|
+
"\xC3\x8F" => 'Ï',
|
421
|
+
"\xC3\x90" => 'Ð',
|
422
|
+
"\xC3\x9D" => 'Ý',
|
423
|
+
'Â ' => ' ',
|
424
|
+
'¡' => '¡',
|
425
|
+
'¢' => '¢',
|
426
|
+
'£' => '£',
|
427
|
+
'¤' => '¤',
|
428
|
+
'Â¥' => '¥',
|
429
|
+
'¦' => '¦',
|
430
|
+
'§' => '§',
|
431
|
+
'¨' => '¨',
|
432
|
+
'©' => '©',
|
433
|
+
'ª' => 'ª',
|
434
|
+
'«' => '«',
|
435
|
+
'¬' => '¬',
|
436
|
+
'Â' => '',
|
437
|
+
'®' => '®',
|
438
|
+
'¯' => '¯',
|
439
|
+
'°' => '°',
|
440
|
+
'±' => '±',
|
441
|
+
'²' => '²',
|
442
|
+
'³' => '³',
|
443
|
+
'´' => '´',
|
444
|
+
'µ' => 'µ',
|
445
|
+
'¶' => '¶',
|
446
|
+
'·' => '·',
|
447
|
+
'¸' => '¸',
|
448
|
+
'¹' => '¹',
|
449
|
+
'º' => 'º',
|
450
|
+
'»' => '»',
|
451
|
+
'¼' => '¼',
|
452
|
+
'½' => '½',
|
453
|
+
'¾' => '¾',
|
454
|
+
'¿' => '¿',
|
455
|
+
'€' => '€',
|
456
|
+
'â„¢' => '™',
|
236
457
|
'â€' => '”', # Note the invisible Ux009D in the key
|
458
|
+
'†' => '†',
|
459
|
+
'‡' => '‡',
|
460
|
+
'•' => '•',
|
461
|
+
'…' => '…',
|
462
|
+
'‰' => '‰',
|
237
463
|
'′' => '′', # Manually added. Some seem to use this instead of Ux2019
|
238
|
-
'
|
239
|
-
'
|
240
|
-
'
|
241
|
-
'
|
242
|
-
'
|
243
|
-
'
|
244
|
-
'
|
245
|
-
'
|
246
|
-
'
|
247
|
-
'
|
248
|
-
'
|
249
|
-
'
|
250
|
-
|
251
|
-
'
|
252
|
-
'
|
253
|
-
'
|
254
|
-
'
|
255
|
-
'
|
256
|
-
'
|
257
|
-
'
|
258
|
-
'
|
259
|
-
'
|
464
|
+
'‹' => '‹',
|
465
|
+
'›' => '›',
|
466
|
+
'“' => '“',
|
467
|
+
'‚' => '‚',
|
468
|
+
'„' => '„',
|
469
|
+
'‘' => '‘',
|
470
|
+
'–' => '–',
|
471
|
+
'—' => '—',
|
472
|
+
'’' => '’',
|
473
|
+
'Ã ' => 'à',
|
474
|
+
'á' => 'á',
|
475
|
+
'â' => 'â',
|
476
|
+
'ã' => 'ã',
|
477
|
+
'ä' => 'ä',
|
478
|
+
'Ã¥' => 'å',
|
479
|
+
'æ' => 'æ',
|
480
|
+
'ç' => 'ç',
|
481
|
+
'è' => 'è',
|
482
|
+
'é' => 'é',
|
483
|
+
'ê' => 'ê',
|
484
|
+
'ë' => 'ë',
|
485
|
+
'ì' => 'ì',
|
486
|
+
'Ã' => 'í',
|
487
|
+
'î' => 'î',
|
488
|
+
'ï' => 'ï',
|
489
|
+
'ð' => 'ð',
|
490
|
+
'ñ' => 'ñ',
|
491
|
+
'ò' => 'ò',
|
492
|
+
'ó' => 'ó',
|
493
|
+
'ô' => 'ô',
|
494
|
+
'õ' => 'õ',
|
495
|
+
'ö' => 'ö',
|
496
|
+
'÷' => '÷',
|
497
|
+
'ø' => 'ø',
|
498
|
+
'ù' => 'ù',
|
499
|
+
'ú' => 'ú',
|
500
|
+
'û' => 'û',
|
501
|
+
'ü' => 'ü',
|
502
|
+
'ý' => 'ý',
|
503
|
+
'þ' => 'þ',
|
504
|
+
'ÿ' => 'ÿ',
|
505
|
+
'ß' => 'ß',
|
506
|
+
'ÃŒ' => 'Ì',
|
507
|
+
'Ãœ' => 'Ü',
|
508
|
+
'Ê' => 'Ê',
|
509
|
+
'Ú' => 'Ú',
|
510
|
+
'ÃŽ' => 'Î',
|
511
|
+
'Þ' => 'Þ',
|
512
|
+
'Ã' => 'Ã',
|
513
|
+
'È' => 'È',
|
514
|
+
'Ø' => 'Ø',
|
515
|
+
'Ö' => 'Ö',
|
516
|
+
'×' => '×',
|
517
|
+
'Ñ' => 'Ñ',
|
518
|
+
'Ã’' => 'Ò',
|
519
|
+
'Â' => 'Â',
|
520
|
+
'Ó' => 'Ó',
|
521
|
+
'Ô' => 'Ô',
|
522
|
+
'Ä' => 'Ä',
|
523
|
+
'Æ' => 'Æ',
|
524
|
+
'Ç' => 'Ç',
|
525
|
+
'Õ' => 'Õ',
|
526
|
+
'Ã…' => 'Å',
|
527
|
+
'É' => 'É',
|
528
|
+
'Ë' => 'Ë',
|
529
|
+
'Û' => 'Û',
|
530
|
+
'À' => 'À',
|
531
|
+
'Ù' => 'Ù',
|
532
|
+
'Ã�' => 'Á',
|
533
|
+
'Å ' => 'Š',
|
534
|
+
'Å¡' => 'š',
|
535
|
+
'Ÿ' => 'Ÿ',
|
536
|
+
'Ž' => 'Ž',
|
537
|
+
'ž' => 'ž',
|
538
|
+
'Å’' => 'Œ',
|
539
|
+
'Å“' => 'œ',
|
540
|
+
'Æ’' => 'ƒ',
|
541
|
+
'Ëœ' => '˜',
|
542
|
+
'ˆ' => 'ˆ',
|
260
543
|
"\x00" => '' # Manually added to avoid Bad Argument exception
|
261
544
|
}.freeze
|
262
545
|
|