name-tamer 0.4.0 → 0.5.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +2 -0
- data/.ruby-version +1 -1
- data/doc/maintenance.rake +2 -2
- data/lib/name-tamer.rb +1 -537
- data/lib/name_tamer.rb +21 -0
- data/lib/name_tamer/array.rb +7 -0
- data/lib/name_tamer/constants.rb +121 -0
- data/lib/name_tamer/name.rb +384 -0
- data/lib/{string_extras.rb → name_tamer/string.rb} +14 -8
- data/lib/name_tamer/text.rb +53 -0
- data/lib/name_tamer/version.rb +3 -0
- data/name-tamer.gemspec +10 -10
- data/spec/{name_tamer_spec.rb → name_tamer/name_spec.rb} +2 -2
- data/spec/name_tamer/text_spec.rb +42 -0
- metadata +31 -24
- data/lib/name-tamer/version.rb +0 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: fb6ad36c6ae8e6e70a0dc780b7c16a21c044a50e
|
4
|
+
data.tar.gz: 39e85453fb141d296944dfc2541a19a8308c62d5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 50eedb83bbdef219b9ce12309e89faa53f2ca31aad1b20af54068777f7a4792abcca854479409f7259cfa23a376912a558b8c8845a892f7df4807c283a9afd0e
|
7
|
+
data.tar.gz: cef63c1ce63b49618c49f5f07eaaac89bc81839901581a18662c90cb1f652c72b63463fab27263005da32461db9bc2a0e5ac17dd6986f3b1f7395a623077cac0
|
data/.ruby-version
CHANGED
@@ -1 +1 @@
|
|
1
|
-
2.3.
|
1
|
+
2.3.1
|
data/doc/maintenance.rake
CHANGED
@@ -46,7 +46,7 @@ task :check_existing do
|
|
46
46
|
'LL.D.', 'LL.M.', 'M.Ed.', 'M.Sc.', 'Oxon.', 'Ph.D.', 'B.A.', 'Esq.', 'J.D.', 'K.C.', 'M.A.', 'M.D.', 'M.P.',
|
47
47
|
'O.K.', 'P.A.', 'Q.C.', 'III', 'Jr.', 'Sr.', 'II', 'IV', 'V'
|
48
48
|
].each do |suffix|
|
49
|
-
|
49
|
+
raise suffix unless NameTamer::ADFIXES[:suffix][:person].include? suffix
|
50
50
|
end
|
51
51
|
|
52
52
|
[
|
@@ -71,6 +71,6 @@ task :check_existing do
|
|
71
71
|
'S.L.', 'S.P.', 'S.s.', 'T.K.', 'T.Ü.', 'U.Ü.', 'Y.K.', 'А.Д.', 'І.П.', 'К.Д.', 'ПУП.', 'С.Д.', 'בע"מ', '任意組合',
|
72
72
|
'匿名組合', '合同会社', '合名会社', '合資会社', '有限会社', '有限公司', '株式会社', 'A/S', 'G/S', 'I/S', 'K/S', 'P/S', 'S/A'
|
73
73
|
].each do |suffix|
|
74
|
-
|
74
|
+
raise suffix unless NameTamer::ADFIXES[:suffix][:organization].include? suffix
|
75
75
|
end
|
76
76
|
end
|
data/lib/name-tamer.rb
CHANGED
@@ -1,537 +1 @@
|
|
1
|
-
|
2
|
-
require 'cgi'
|
3
|
-
require 'string_extras'
|
4
|
-
|
5
|
-
# References:
|
6
|
-
# http://www.w3.org/International/questions/qa-personal-names
|
7
|
-
# https://github.com/berkmancenter/namae
|
8
|
-
# https://github.com/mericson/people
|
9
|
-
# http://en.wikipedia.org/wiki/Types_of_business_entity
|
10
|
-
# http://en.wikipedia.org/wiki/List_of_post-nominal_letters_(USA)
|
11
|
-
# http://en.wikipedia.org/wiki/List_of_post-nominal_letters_(United_Kingdom)
|
12
|
-
# http://en.wikipedia.org/wiki/Nobiliary_particle
|
13
|
-
# http://en.wikipedia.org/wiki/Spanish_naming_customs
|
14
|
-
# http://linguistlist.org/pubs/tocs/JournalUnifiedStyleSheet2007.pdf [PDF]
|
15
|
-
|
16
|
-
class NameTamer
|
17
|
-
attr_reader :name
|
18
|
-
|
19
|
-
class << self
|
20
|
-
def [](name, args = {})
|
21
|
-
new name, args
|
22
|
-
end
|
23
|
-
|
24
|
-
# Make a slug from a string
|
25
|
-
def parameterize(string, args = {})
|
26
|
-
sep = args[:sep] || SLUG_DELIMITER
|
27
|
-
rfc3987 = args[:rfc3987] || false
|
28
|
-
filter = args[:filter] || (rfc3987 ? FILTER_RFC3987 : FILTER_COMPAT)
|
29
|
-
|
30
|
-
new_string = string.dup
|
31
|
-
|
32
|
-
new_string
|
33
|
-
.whitespace_to!(sep)
|
34
|
-
.invalid_chars_to!(sep)
|
35
|
-
.strip_unwanted!(filter)
|
36
|
-
.fix_separators!(sep)
|
37
|
-
.approximate_latin_chars!
|
38
|
-
|
39
|
-
# Have we got anything left?
|
40
|
-
new_string = '_' if new_string.empty?
|
41
|
-
|
42
|
-
# downcase any latin characters
|
43
|
-
new_string.downcase
|
44
|
-
end
|
45
|
-
end
|
46
|
-
|
47
|
-
def tidy_name
|
48
|
-
unless @tidy_name
|
49
|
-
@tidy_name = name.dup # Start with the name we've received
|
50
|
-
|
51
|
-
unescape # Unescape percent-encoded characters and fix UTF-8 encoding
|
52
|
-
remove_zero_width # remove zero-width characters
|
53
|
-
tidy_spacing # " John Smith " -> "John Smith"
|
54
|
-
fix_encoding_errors # "Ren\u00c3\u00a9 Descartes" -> "Ren\u00e9 Descartes"
|
55
|
-
consolidate_initials # "I. B. M." -> "I.B.M."
|
56
|
-
end
|
57
|
-
|
58
|
-
@tidy_name
|
59
|
-
end
|
60
|
-
|
61
|
-
def nice_name
|
62
|
-
unless @nice_name
|
63
|
-
@nice_name = tidy_name.dup # Start with the tidied name
|
64
|
-
|
65
|
-
remove_adfixes # prefixes and suffixes: "Smith, John, Jr." -> "Smith, John"
|
66
|
-
fixup_last_name_first # "Smith, John" -> "John Smith"
|
67
|
-
fixup_mismatched_braces # "Ceres (AZ" -> "Ceres (AZ)"
|
68
|
-
remove_adfixes # prefixes and suffixes: "Mr John Smith Jr." -> "John Smith"
|
69
|
-
name_wrangle # proper name case and non-breaking spaces
|
70
|
-
use_nonbreaking_spaces_in_compound_names
|
71
|
-
end
|
72
|
-
|
73
|
-
@nice_name
|
74
|
-
end
|
75
|
-
|
76
|
-
def simple_name
|
77
|
-
unless @simple_name
|
78
|
-
@simple_name = nice_name.dup # Start with nice name
|
79
|
-
|
80
|
-
remove_initials # "John Q. Doe" -> "John Doe"
|
81
|
-
remove_middle_names # "Philip Seymour Hoffman" -> "Philip Hoffman"
|
82
|
-
remove_periods_from_initials # "J.P.R. Williams" -> "JPR Williams"
|
83
|
-
standardize_words # "B&Q Intl" -> "B and Q International"
|
84
|
-
|
85
|
-
@simple_name.whitespace_to!(ASCII_SPACE)
|
86
|
-
end
|
87
|
-
|
88
|
-
@simple_name
|
89
|
-
end
|
90
|
-
|
91
|
-
def slug
|
92
|
-
@slug ||= NameTamer.parameterize simple_name.dup # "John Doe" -> "john-doe"
|
93
|
-
end
|
94
|
-
|
95
|
-
def contact_type
|
96
|
-
nice_name # make sure we've done the bit which infers contact_type
|
97
|
-
contact_type_best_effort
|
98
|
-
end
|
99
|
-
|
100
|
-
def contact_type=(new_contact_type)
|
101
|
-
ct_as_sym = new_contact_type.to_sym
|
102
|
-
|
103
|
-
unless @contact_type.nil? || @contact_type == ct_as_sym
|
104
|
-
puts "Changing contact type of #{@name} from #{@contact_type} to #{new_contact_type}"
|
105
|
-
end
|
106
|
-
|
107
|
-
@contact_type = ct_as_sym
|
108
|
-
end
|
109
|
-
|
110
|
-
# These lines aren't used and aren't covered by specs
|
111
|
-
# def name=(new_name)
|
112
|
-
# initialize new_name, :contact_type => @contact_type
|
113
|
-
# end
|
114
|
-
#
|
115
|
-
# def to_hash
|
116
|
-
# {
|
117
|
-
# name: name,
|
118
|
-
# nice_name: nice_name,
|
119
|
-
# simple_name: simple_name,
|
120
|
-
# slug: slug,
|
121
|
-
# contact_type: contact_type,
|
122
|
-
# last_name: last_name,
|
123
|
-
# remainder: remainder,
|
124
|
-
# adfix_found: adfix_found
|
125
|
-
# }
|
126
|
-
# end
|
127
|
-
|
128
|
-
private
|
129
|
-
|
130
|
-
#--------------------------------------------------------
|
131
|
-
# Tidy up the name we've received
|
132
|
-
#--------------------------------------------------------
|
133
|
-
|
134
|
-
def unescape
|
135
|
-
@tidy_name.ensure_safe!.safe_unescape!.unescape_html!
|
136
|
-
end
|
137
|
-
|
138
|
-
def remove_zero_width
|
139
|
-
@tidy_name.strip_unwanted!(ZERO_WIDTH_FILTER)
|
140
|
-
end
|
141
|
-
|
142
|
-
def tidy_spacing
|
143
|
-
@tidy_name
|
144
|
-
.space_around_comma!
|
145
|
-
.strip_or_self!
|
146
|
-
.whitespace_to!(ASCII_SPACE)
|
147
|
-
end
|
148
|
-
|
149
|
-
def fix_encoding_errors
|
150
|
-
@tidy_name.fix_encoding_errors!
|
151
|
-
end
|
152
|
-
|
153
|
-
# Remove spaces from groups of initials
|
154
|
-
def consolidate_initials
|
155
|
-
@tidy_name
|
156
|
-
.remove_spaces_from_initials!
|
157
|
-
.ensure_space_after_initials!
|
158
|
-
end
|
159
|
-
|
160
|
-
# An adfix is either a prefix or a suffix
|
161
|
-
def remove_adfixes
|
162
|
-
if @last_name.nil?
|
163
|
-
# Our name is still in one part, not two
|
164
|
-
loop do
|
165
|
-
@nice_name = remove_outermost_adfix(:suffix, @nice_name)
|
166
|
-
break unless @adfix_found
|
167
|
-
end
|
168
|
-
|
169
|
-
loop do
|
170
|
-
@nice_name = remove_outermost_adfix(:prefix, @nice_name)
|
171
|
-
break unless @adfix_found
|
172
|
-
end
|
173
|
-
else
|
174
|
-
# Our name is currently in two halves
|
175
|
-
loop do
|
176
|
-
@last_name = remove_outermost_adfix(:suffix, @last_name)
|
177
|
-
break unless @adfix_found
|
178
|
-
end
|
179
|
-
|
180
|
-
loop do
|
181
|
-
@remainder = remove_outermost_adfix(:prefix, @remainder)
|
182
|
-
break unless @adfix_found
|
183
|
-
end
|
184
|
-
end
|
185
|
-
end
|
186
|
-
|
187
|
-
# Names in the form "Smith, John" need to be turned around to "John Smith"
|
188
|
-
def fixup_last_name_first
|
189
|
-
return if @contact_type == :organization
|
190
|
-
|
191
|
-
parts = @nice_name.split ', '
|
192
|
-
|
193
|
-
return unless parts.count == 2
|
194
|
-
|
195
|
-
@last_name = parts[0] # Sometimes the last name alone is all caps and we can name-case it
|
196
|
-
@remainder = parts[1]
|
197
|
-
end
|
198
|
-
|
199
|
-
# Sometimes we end up with mismatched braces after adfix stripping
|
200
|
-
# e.g. "Ceres (Ceres Holdings LLC)" -> "Ceres (Ceres Holdings"
|
201
|
-
def fixup_mismatched_braces
|
202
|
-
left_brace_count = @nice_name.count '('
|
203
|
-
right_brace_count = @nice_name.count ')'
|
204
|
-
|
205
|
-
if left_brace_count > right_brace_count
|
206
|
-
@nice_name += ')'
|
207
|
-
elsif left_brace_count < right_brace_count
|
208
|
-
@nice_name = '(' + @nice_name
|
209
|
-
end
|
210
|
-
end
|
211
|
-
|
212
|
-
def name_wrangle
|
213
|
-
# Fix case if all caps or all lowercase
|
214
|
-
if @last_name.nil?
|
215
|
-
name_wrangle_single_name
|
216
|
-
else
|
217
|
-
name_wrangle_split_name
|
218
|
-
end
|
219
|
-
end
|
220
|
-
|
221
|
-
def name_wrangle_single_name
|
222
|
-
lowercase = @nice_name.downcase
|
223
|
-
uppercase = @nice_name.upcase
|
224
|
-
fix_case = false
|
225
|
-
|
226
|
-
if @contact_type == :organization
|
227
|
-
fix_case = true if @nice_name == uppercase && @nice_name.length > 4
|
228
|
-
else
|
229
|
-
fix_case = true if [uppercase, lowercase].include?(@nice_name)
|
230
|
-
end
|
231
|
-
|
232
|
-
@nice_name = name_case(lowercase) if fix_case
|
233
|
-
end
|
234
|
-
|
235
|
-
def name_wrangle_split_name
|
236
|
-
# It's a person if we've split the name, so no organization logic here
|
237
|
-
lowercase = @last_name.downcase
|
238
|
-
uppercase = @last_name.upcase
|
239
|
-
@last_name = name_case(lowercase) if [uppercase, lowercase].include?(@last_name)
|
240
|
-
@nice_name = "#{@remainder} #{@last_name}"
|
241
|
-
end
|
242
|
-
|
243
|
-
# Conjoin compound names with non-breaking spaces
|
244
|
-
def use_nonbreaking_spaces_in_compound_names
|
245
|
-
@nice_name
|
246
|
-
.nbsp_in_compound_name!
|
247
|
-
.nbsp_in_name_modifier!
|
248
|
-
end
|
249
|
-
|
250
|
-
#--------------------------------------------------------
|
251
|
-
# Make search name from nice name
|
252
|
-
#--------------------------------------------------------
|
253
|
-
|
254
|
-
# Remove initials from personal names unless they are the only identifier.
|
255
|
-
# i.e. only remove initials if there's also a proper name there
|
256
|
-
def remove_initials
|
257
|
-
return unless @contact_type == :person
|
258
|
-
|
259
|
-
temp_name = @simple_name.gsub(/\b([a-z](?:\.*\s+|\.))/i, '')
|
260
|
-
|
261
|
-
# If the name still has at least one space we're OK
|
262
|
-
@simple_name = temp_name if temp_name.include?(ASCII_SPACE)
|
263
|
-
end
|
264
|
-
|
265
|
-
def remove_middle_names
|
266
|
-
return unless @contact_type == :person
|
267
|
-
|
268
|
-
first_name, parts = find_first_usable_name(@simple_name.split)
|
269
|
-
last_name, = find_last_usable_name(parts)
|
270
|
-
|
271
|
-
return unless first_name || last_name
|
272
|
-
|
273
|
-
separator = first_name && last_name ? ' ' : ''
|
274
|
-
@simple_name = "#{first_name}#{separator}#{last_name}"
|
275
|
-
end
|
276
|
-
|
277
|
-
def find_first_usable_name(parts)
|
278
|
-
part = nil
|
279
|
-
|
280
|
-
parts.each_index do |i|
|
281
|
-
part = parts[i]
|
282
|
-
next if part.gsub(FILTER_COMPAT, '').empty?
|
283
|
-
parts = parts.slice(i + 1, parts.length) # don't use "slice!"
|
284
|
-
break
|
285
|
-
end
|
286
|
-
|
287
|
-
[part, parts]
|
288
|
-
end
|
289
|
-
|
290
|
-
def find_last_usable_name(parts)
|
291
|
-
part = nil
|
292
|
-
|
293
|
-
parts.reverse_each do |p|
|
294
|
-
next if p.gsub(FILTER_COMPAT, '').empty?
|
295
|
-
part = p
|
296
|
-
break
|
297
|
-
end
|
298
|
-
|
299
|
-
part
|
300
|
-
end
|
301
|
-
|
302
|
-
def remove_periods_from_initials
|
303
|
-
@simple_name.remove_periods_from_initials!
|
304
|
-
end
|
305
|
-
|
306
|
-
def standardize_words
|
307
|
-
@simple_name.gsub!(/ *& */, ' and ') # replace ampersand characters with ' and '
|
308
|
-
@simple_name.gsub!(/ *\+ */, ' plus ') # replace plus signs with ' plus '
|
309
|
-
@simple_name.gsub!(/\bintl\b/i, 'International') # replace 'intl' with 'International'
|
310
|
-
@simple_name.gsub!(/[־‐‑‒–—―−﹘﹣-]/, SLUG_DELIMITER) # Replace Unicode dashes with ASCII hyphen
|
311
|
-
@simple_name.strip_unwanted!(/["“”™℠®©℗]/) # remove quotes and commercial decoration
|
312
|
-
end
|
313
|
-
|
314
|
-
#--------------------------------------------------------
|
315
|
-
# Initialization and utilities
|
316
|
-
#--------------------------------------------------------
|
317
|
-
|
318
|
-
def initialize(new_name, args = {})
|
319
|
-
@name = new_name || ''
|
320
|
-
@contact_type = contact_type_from args
|
321
|
-
|
322
|
-
@tidy_name = nil
|
323
|
-
@nice_name = nil
|
324
|
-
@simple_name = nil
|
325
|
-
@slug = nil
|
326
|
-
|
327
|
-
@last_name = nil
|
328
|
-
@remainder = nil
|
329
|
-
|
330
|
-
@adfix_found = false
|
331
|
-
end
|
332
|
-
|
333
|
-
def contact_type_from(args)
|
334
|
-
args_ct = args[:contact_type]
|
335
|
-
return unless args_ct
|
336
|
-
|
337
|
-
ct = args_ct.is_a?(Symbol) ? args_ct : args_ct.dup
|
338
|
-
ct = ct.to_s unless [String, Symbol].include? ct.class
|
339
|
-
ct.downcase! if ct.class == String
|
340
|
-
ct = ct.to_sym
|
341
|
-
ct = nil unless [:person, :organization].include? ct
|
342
|
-
|
343
|
-
ct
|
344
|
-
end
|
345
|
-
|
346
|
-
# If we don't know the contact type, what's our best guess?
|
347
|
-
def contact_type_best_effort
|
348
|
-
if @contact_type
|
349
|
-
@contact_type
|
350
|
-
else
|
351
|
-
# If it's just one word we'll assume organization.
|
352
|
-
# If more then we'll assume a person
|
353
|
-
@name.include?(ASCII_SPACE) ? :person : :organization
|
354
|
-
end
|
355
|
-
end
|
356
|
-
|
357
|
-
# We pass to this routine either prefixes or suffixes
|
358
|
-
def remove_outermost_adfix(adfix_type, name_part)
|
359
|
-
ct, parts = find_contact_type_and_parts(ADFIX_PATTERNS[adfix_type], name_part)
|
360
|
-
|
361
|
-
return name_part unless @adfix_found
|
362
|
-
|
363
|
-
# If we've found a diagnostic adfix then set the contact type
|
364
|
-
self.contact_type = ct
|
365
|
-
|
366
|
-
# The remainder of the name will be in parts[0] or parts[2] depending
|
367
|
-
# on whether this is a prefix or a suffix.
|
368
|
-
# We'll also remove any trailing commas we've exposed.
|
369
|
-
(parts[0] + parts[2]).gsub(/\s*,\s*$/, '')
|
370
|
-
end
|
371
|
-
|
372
|
-
def find_contact_type_and_parts(adfixes, name_part)
|
373
|
-
ct = contact_type_best_effort
|
374
|
-
parts = name_part.partition adfixes[ct]
|
375
|
-
@adfix_found = !parts[1].empty?
|
376
|
-
|
377
|
-
return [ct, parts] if @contact_type || @adfix_found
|
378
|
-
|
379
|
-
# If the contact type is indeterminate and we didn't find a diagnostic adfix
|
380
|
-
# for a person then try again for an organization
|
381
|
-
ct = :organization
|
382
|
-
parts = name_part.partition adfixes[ct]
|
383
|
-
@adfix_found = !parts[1].empty?
|
384
|
-
|
385
|
-
[ct, parts]
|
386
|
-
end
|
387
|
-
|
388
|
-
# Original Version of NameCase:
|
389
|
-
# Copyright (c) Mark Summerfield 1998-2008. All Rights Reserved
|
390
|
-
# This module may be used/distributed/modified under the same terms as Perl itself
|
391
|
-
# http://dev.perl.org/licenses/ (GPL)
|
392
|
-
#
|
393
|
-
# Ruby Version:
|
394
|
-
# Copyright (c) Aaron Patterson 2006
|
395
|
-
# NameCase is distributed under the GPL license.
|
396
|
-
#
|
397
|
-
# Substantially modified for Xendata
|
398
|
-
# Improved in several areas, also now adds non-breaking spaces for
|
399
|
-
# compound names like "van der Pump"
|
400
|
-
def name_case(lowercase)
|
401
|
-
n = lowercase.dup # We assume the name is passed already downcased
|
402
|
-
|
403
|
-
n
|
404
|
-
.upcase_first_letter!
|
405
|
-
.downcase_after_apostrophe!
|
406
|
-
.fix_mac!
|
407
|
-
.fix_ff!
|
408
|
-
.fix_name_modifiers!
|
409
|
-
.upcase_initials!
|
410
|
-
end
|
411
|
-
|
412
|
-
#--------------------------------------------------------
|
413
|
-
# Constants
|
414
|
-
#--------------------------------------------------------
|
415
|
-
|
416
|
-
NONBREAKING_SPACE = "\u00a0"
|
417
|
-
ASCII_SPACE = "\u0020"
|
418
|
-
ADFIX_JOINERS = "[#{ASCII_SPACE}-]"
|
419
|
-
SLUG_DELIMITER = '-'
|
420
|
-
ZERO_WIDTH_FILTER = /[\u180E\u200B\u200C\u200D\u2063\uFEFF]/
|
421
|
-
|
422
|
-
# Constants for parameterizing Unicode strings for IRIs
|
423
|
-
#
|
424
|
-
# Allowed characters in an IRI segment are defined by RFC 3987
|
425
|
-
# (https://tools.ietf.org/html/rfc3987#section-2.2) as follows:
|
426
|
-
#
|
427
|
-
# isegment-nz-nc = 1*( iunreserved / pct-encoded / sub-delims
|
428
|
-
# / "@" )
|
429
|
-
# ; non-zero-length segment without any colon ":"
|
430
|
-
#
|
431
|
-
# iunreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" / ucschar
|
432
|
-
#
|
433
|
-
# pct-encoded = "%" HEXDIG HEXDIG
|
434
|
-
#
|
435
|
-
# sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
|
436
|
-
# / "*" / "+" / "," / ";" / "="
|
437
|
-
#
|
438
|
-
# ucschar = %xA0-D7FF / %xF900-FDCF / %xFDF0-FFEF
|
439
|
-
# / %x10000-1FFFD / %x20000-2FFFD / %x30000-3FFFD
|
440
|
-
# / %x40000-4FFFD / %x50000-5FFFD / %x60000-6FFFD
|
441
|
-
# / %x70000-7FFFD / %x80000-8FFFD / %x90000-9FFFD
|
442
|
-
# / %xA0000-AFFFD / %xB0000-BFFFD / %xC0000-CFFFD
|
443
|
-
# / %xD0000-DFFFD / %xE1000-EFFFD
|
444
|
-
#
|
445
|
-
# Note that we can't use Unicode code points above \uFFFF because of
|
446
|
-
# regex limitations, so we'll ignore ucschar above that point.
|
447
|
-
#
|
448
|
-
# We're using the most restrictive segment definition (isegment-nz-nc)
|
449
|
-
# to avoid any possible problems with the IRI that it one day might
|
450
|
-
# get placed in.
|
451
|
-
ALPHA = 'A-Za-z'
|
452
|
-
DIGIT = '0-9'
|
453
|
-
UCSCHAR = '\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF'
|
454
|
-
IUNRESERVED = "#{ALPHA}#{DIGIT}\\-\\._~#{UCSCHAR}"
|
455
|
-
SUBDELIMS = '!$&\'\(\)\*+,;='
|
456
|
-
ISEGMENT_NZ_NC = "#{IUNRESERVED}#{SUBDELIMS}@" # pct-encoded not needed
|
457
|
-
FILTER_RFC3987 = /[^#{ISEGMENT_NZ_NC}]/
|
458
|
-
FILTER_COMPAT = /[^#{ALPHA}#{DIGIT}\-_#{UCSCHAR}]/
|
459
|
-
|
460
|
-
# These are the prefixes and suffixes we want to remove
|
461
|
-
# If you add to the list, you can use spaces and dots where appropriate
|
462
|
-
# Ensure any single letters are followed by a dot because we'll add one to the string
|
463
|
-
# during processing, e.g. "y Cia." should be "y. Cia."
|
464
|
-
ADFIXES = {
|
465
|
-
prefix: {
|
466
|
-
person: [
|
467
|
-
'Baron', 'Baroness', 'Capt.', 'Captain', 'Col.', 'Colonel', 'Dame',
|
468
|
-
'Doctor', 'Dr.', 'Judge', 'Justice', 'Lady', 'Lieut.', 'Lieutenant',
|
469
|
-
'Lord', 'Madame', 'Major', 'Master', 'Matron', 'Messrs.', 'Mgr.',
|
470
|
-
'Miss', 'Mister', 'Mlle.', 'Mme.', 'Mons.', 'Mr.', 'Mr. & Mrs.',
|
471
|
-
'Mr. and Mrs.', 'Mrs.', 'Msgr.', 'Ms.', 'Prof.', 'Professor', 'Rev.',
|
472
|
-
'Reverend', 'Sir', 'Sister', 'The Hon.', 'The Lady.', 'The Lord',
|
473
|
-
'The Rt. Hon.'
|
474
|
-
],
|
475
|
-
organization: [
|
476
|
-
'Fa.', 'P.T.', 'P.T. Tbk.', 'U.D.'
|
477
|
-
],
|
478
|
-
before: '\\A', after: ADFIX_JOINERS
|
479
|
-
},
|
480
|
-
suffix: {
|
481
|
-
person: [
|
482
|
-
'Chartered F.C.S.I.', 'Chartered M.C.S.I.', 'I.F.R.S. Certified', 'F.Inst.L.M.', 'C.I.S.S.P.', 'F.C.I.P.S.',
|
483
|
-
'M.R.I.C.S.', 'T.M.I.E.T.', 'Dip. D.M.', 'A.A.M.S.', 'A.C.C.A.', 'A.C.M.A.', 'A.I.F.A.', 'A.W.M.A.', 'C.A.I.A.',
|
484
|
-
'C.A.P.M.', 'C.C.I.M.', 'C.D.F.A.', 'C.E.P.P.', 'C.F.B.S.', 'C.G.M.A.', 'C.I.T.P.', 'C.L.T.C.', 'C.P.C.C.',
|
485
|
-
'C.R.P.C.', 'C.R.P.S.', 'C.S.O.X.', 'C.S.S.D.', 'F.B.C.S.', 'F.C.C.A.', 'F.C.M.I.', 'F.C.S.I.', 'F.I.E.T.',
|
486
|
-
'F.I.R.P.', 'M.I.E.T.', 'M.S.F.S.', 'M.Sc. D.', 'O.R.S.C.', 'R.I.C.P.', 'B.Tech.', 'Cantab.', 'Ch.F.C.',
|
487
|
-
'D.Phil.', 'I.T.I.L. v3', 'M.Io.D.', 'S.C.M.P', 'A.C.A.', 'A.C.C.', 'A.E.P.', 'A.I.F.', 'A.S.A.', 'B.Eng.',
|
488
|
-
'C.B.V.', 'C.E.M.', 'C.Eng.', 'C.F.A.', 'C.F.F.', 'C.F.P.', 'C.F.S.', 'C.G.A.', 'C.G.B.', 'C.G.P.', 'C.I.M.',
|
489
|
-
'C.L.P.', 'C.L.U.', 'C.M.A.', 'C.M.T.', 'C.P.A.', 'C.T.A.', 'C.W.S.', 'D.B.E.', 'D.D.S.', 'D.V.M.', 'E.R.P.',
|
490
|
-
'Eng.D.', 'F.C.A.', 'F.P.C.', 'F.R.M.', 'F.R.M.', 'G.S.P.', 'L.P.S.', 'M.B.A.', 'M.B.E.', 'M.E.P.', 'M.Eng.',
|
491
|
-
'M.Jur.', 'M.P.A.', 'M.S.F.', 'M.S.P.', 'O.B.E.', 'P.C.C.', 'P.F.S.', 'P.H.R.', 'P.M.C.', 'P.M.P.', 'P.M.P.',
|
492
|
-
'P.S.P.', 'R.F.C.', 'V.M.D.', 'B.Ed.', 'B.Sc.', 'Ed.D.', 'Ed.M.', 'Hons.', 'LL.B.', 'LL.D.', 'LL.M.', 'M.Ed.',
|
493
|
-
'M.Sc.', 'Oxon.', 'Ph.D.', 'B.A.', 'C.A.', 'E.A.', 'Esq.', 'J.D.', 'K.C.', 'M.A.', 'M.D.', 'M.P.', 'M.S.',
|
494
|
-
'O.K.', 'P.A.', 'Q.C.', 'R.D.', 'III', 'Jr.', 'Sr.', 'II', 'IV', 'V'
|
495
|
-
],
|
496
|
-
organization: [
|
497
|
-
'S. de R.L. de C.V.', 'S.A.P.I. de C.V.', 'y. Cía. S. en C.', 'Private Limited', 'S.M. Pte. Ltd.',
|
498
|
-
'Cía. S. C. A.', 'y. Cía. S. C.', 'S.A. de C.V.', 'spol. s.r.o.', '(Pty.) Ltd.', '(Pvt.) Ltd.', 'A.D.S.I.Tz.',
|
499
|
-
'S.p. z.o.o.', '(Pvt.)Ltd.', 'akc. spol.', 'Cía. Ltda.', 'E.B.V.B.A.', 'P. Limited', 'S. de R.L.', 'S.I.C.A.V.',
|
500
|
-
'S.P.R.L.U.', 'А.Д.С.И.Ц.', '(P.) Ltd.', 'C. por A.', 'Comm.V.A.', 'Ltd. Şti.', 'Plc. Ltd.', 'Pte. Ltd.',
|
501
|
-
'Pty. Ltd.', 'Pvt. Ltd.', 'Soc. Col.', 'A.M.B.A.', 'A.S.B.L.', 'A.V.E.E.', 'B.V.B.A.', 'B.V.I.O.', 'C.V.B.A.',
|
502
|
-
'C.V.O.A.', 'E.E.I.G.', 'E.I.R.L.', 'E.O.O.D.', 'E.U.R.L.', 'F.M.B.A.', 'G.m.b.H.', 'Ges.b.R.', 'K.G.a.A.',
|
503
|
-
'L.L.L.P.', 'Ltd. Co.', 'Ltd. Co.', 'M.E.P.E.', 'n.y.r.t.', 'O.V.E.E.', 'P.E.E.C.', 'P.L.L.C.', 'P.L.L.C.',
|
504
|
-
'S. en C.', 'S.a.p.a.', 'S.A.R.L.', 'S.à.R.L.', 'S.A.S.U.', 'S.C.e.I.', 'S.C.O.P.', 'S.C.p.A.', 'S.C.R.I.',
|
505
|
-
'S.C.R.L.', 'S.M.B.A.', 'S.P.R.L.', 'Е.О.О.Д.', '&. Cie.', 'and Co.', 'Comm.V.', 'Limited', 'P. Ltd.',
|
506
|
-
'Part.G.', 'Sh.p.k.', '&. Co.', 'C.X.A.', 'd.n.o.', 'd.o.o.', 'E.A.D.', 'e.h.f.', 'E.P.E.', 'E.S.V.', 'F.C.P.',
|
507
|
-
'F.I.E.', 'G.b.R.', 'G.I.E.', 'G.M.K.', 'G.S.K.', 'H.U.F.', 'K.D.A.', 'k.f.t.', 'k.h.t.', 'k.k.t.', 'L.L.C.',
|
508
|
-
'L.L.P.', 'o.h.f.', 'O.H.G.', 'O.O.D.', 'O.y.j.', 'p.l.c.', 'P.S.U.', 'S.A.E.', 'S.A.S.', 'S.C.A.', 'S.C.E.',
|
509
|
-
'S.C.S.', 'S.E.M.', 'S.E.P.', 's.e.s.', 'S.G.R.', 'S.N.C.', 'S.p.A.', 'S.P.E.', 'S.R.L.', 's.r.o.', 'Unltd.',
|
510
|
-
'V.O.F.', 'V.o.G.', 'v.o.s.', 'V.Z.W.', 'z.r.t.', 'А.А.Т.', 'Е.А.Д.', 'З.А.Т.', 'К.Д.А.', 'О.О.Д.', 'Т.А.А.',
|
511
|
-
'股份有限公司', 'Ap.S.', 'Corp.', 'ltda.', 'Sh.A.', 'st.G.', 'Ultd.', 'a.b.', 'A.D.', 'A.E.', 'A.G.', 'A.S.',
|
512
|
-
'A.Ş.', 'A.y.', 'B.M.', 'b.t.', 'B.V.', 'C.A.', 'C.V.', 'd.d.', 'e.c.', 'E.E.', 'e.G.', 'E.I.', 'E.P.', 'E.T.',
|
513
|
-
'E.U.', 'e.v.', 'G.K.', 'G.P.', 'h.f.', 'Inc.', 'K.D.', 'K.G.', 'K.K.', 'k.s.', 'k.v.', 'K.y.', 'L.C.', 'L.P.',
|
514
|
-
'Ltd.', 'N.K.', 'N.L.', 'N.V.', 'O.E.', 'O.G.', 'O.Ü.', 'O.y.', 'P.C.', 'p.l.', 'Pty.', 'PUP.', 'Pvt.', 'r.t.',
|
515
|
-
'S.A.', 'S.D.', 'S.E.', 's.f.', 'S.L.', 'S.P.', 'S.s.', 'T.K.', 'T.Ü.', 'U.Ü.', 'Y.K.', 'А.Д.', 'І.П.', 'К.Д.',
|
516
|
-
'ПУП.', 'С.Д.', 'בע"מ', '任意組合', '匿名組合', '合同会社', '合名会社', '合資会社', '有限会社', '有限公司', '株式会社',
|
517
|
-
'A/S', 'G/S', 'I/S', 'K/S', 'P/S', 'S/A'
|
518
|
-
],
|
519
|
-
before: ADFIX_JOINERS, after: '\\z'
|
520
|
-
}
|
521
|
-
}
|
522
|
-
|
523
|
-
ADFIX_PATTERNS = {}
|
524
|
-
|
525
|
-
[:prefix, :suffix].each do |adfix_type|
|
526
|
-
patterns = {}
|
527
|
-
adfix = ADFIXES[adfix_type]
|
528
|
-
|
529
|
-
[:person, :organization].each do |ct|
|
530
|
-
with_optional_spaces = adfix[ct].map { |p| p.gsub(ASCII_SPACE, ' *') }
|
531
|
-
pattern_string = with_optional_spaces.join('|').gsub('.', '\.*')
|
532
|
-
patterns[ct] = /#{adfix[:before]}\(*(?:#{pattern_string})[®™\)]*#{adfix[:after]}/i
|
533
|
-
end
|
534
|
-
|
535
|
-
ADFIX_PATTERNS[adfix_type] = patterns
|
536
|
-
end
|
537
|
-
end
|
1
|
+
require 'name_tamer'
|