name_tamer 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.codeclimate.yml +18 -0
- data/.env +1 -0
- data/.gitignore +26 -0
- data/.hound.yml +6 -0
- data/.rspec +2 -0
- data/.rubocop.yml +63 -0
- data/.travis.yml +13 -0
- data/Gemfile +20 -0
- data/Guardfile +16 -0
- data/LICENSE +21 -0
- data/README.md +82 -0
- data/Rakefile +14 -0
- data/doc/maintenance.rake +76 -0
- data/doc/prefixes.csv +49 -0
- data/doc/suffixes.csv +345 -0
- data/lib/name-tamer.rb +1 -0
- data/lib/name_tamer.rb +22 -0
- data/lib/name_tamer/array.rb +8 -0
- data/lib/name_tamer/constants.rb +121 -0
- data/lib/name_tamer/name.rb +390 -0
- data/lib/name_tamer/string.rb +280 -0
- data/lib/name_tamer/text.rb +53 -0
- data/lib/name_tamer/version.rb +3 -0
- data/name_tamer.gemspec +19 -0
- data/spec/name_tamer/name_spec.rb +95 -0
- data/spec/name_tamer/string_spec.rb +5 -0
- data/spec/name_tamer/text_spec.rb +40 -0
- data/spec/spec_helper.rb +14 -0
- data/spec/support/names.yml +741 -0
- metadata +79 -0
data/lib/name-tamer.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require 'name_tamer'
|
data/lib/name_tamer.rb
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'cgi'
|
4
|
+
require 'name_tamer/string'
|
5
|
+
require 'name_tamer/array'
|
6
|
+
require 'name_tamer/constants'
|
7
|
+
|
8
|
+
# Top-level namespace and convenience entry points for the gem.
module NameTamer
  # Name and Text are loaded on first reference rather than at require time.
  autoload :Name, 'name_tamer/name'
  autoload :Text, 'name_tamer/text'

  # Shorthand constructor: NameTamer['John Smith'] builds a NameTamer::Name.
  #
  # @param name the raw name, passed straight to NameTamer::Name.new
  # @param args [Hash] options, passed straight to NameTamer::Name.new
  # @return [NameTamer::Name]
  def self.[](name, args = {})
    NameTamer::Name.new(name, args)
  end

  # Make a slug from a string
  #
  # @param string the text to turn into a slug
  # @param args [Hash] options, passed straight to NameTamer::Text.new
  # @return whatever NameTamer::Text#parameterize returns
  def self.parameterize(string, args = {})
    text = NameTamer::Text.new(string, args)
    text.parameterize
  end
end
|
@@ -0,0 +1,121 @@
|
|
1
|
+
module NameTamer
  NONBREAKING_SPACE = "\u00a0".freeze
  ASCII_SPACE = ' '.freeze
  # Character class (as regex source) for the characters that may join an
  # adfix to the rest of the name: an ASCII space or a hyphen.
  ADFIX_JOINERS = "[#{ASCII_SPACE}-]".freeze
  SLUG_DELIMITER = '-'.freeze
  ZERO_WIDTH_FILTER = /[\u180E\u200B\u200C\u200D\u2063\uFEFF]/

  # Constants for parameterizing Unicode strings for IRIs
  #
  # Allowed characters in an IRI segment are defined by RFC 3987
  # (https://tools.ietf.org/html/rfc3987#section-2.2) as follows:
  #
  #    isegment-nz-nc = 1*( iunreserved / pct-encoded / sub-delims
  #                         / "@" )
  #                   ; non-zero-length segment without any colon ":"
  #
  #    iunreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~" / ucschar
  #
  #    pct-encoded    = "%" HEXDIG HEXDIG
  #
  #    sub-delims     = "!" / "$" / "&" / "'" / "(" / ")"
  #                   / "*" / "+" / "," / ";" / "="
  #
  #    ucschar        = %xA0-D7FF / %xF900-FDCF / %xFDF0-FFEF
  #                   / %x10000-1FFFD / %x20000-2FFFD / %x30000-3FFFD
  #                   / %x40000-4FFFD / %x50000-5FFFD / %x60000-6FFFD
  #                   / %x70000-7FFFD / %x80000-8FFFD / %x90000-9FFFD
  #                   / %xA0000-AFFFD / %xB0000-BFFFD / %xC0000-CFFFD
  #                   / %xD0000-DFFFD / %xE1000-EFFFD
  #
  # Note that we can't use Unicode code points above \uFFFF because of
  # regex limitations, so we'll ignore ucschar above that point.
  #
  # We're using the most restrictive segment definition (isegment-nz-nc)
  # to avoid any possible problems with the IRI that it one day might
  # get placed in.
  #
  # The string constants below are regex character-class fragments; they are
  # only meaningful once interpolated between "[" and "]".
  ALPHA = 'A-Za-z'.freeze
  DIGIT = '0-9'.freeze
  UCSCHAR = '\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF'.freeze
  IUNRESERVED = "#{ALPHA}#{DIGIT}\\-\\._~#{UCSCHAR}".freeze
  SUBDELIMS = '!$&\'\(\)\*+,;='.freeze
  ISEGMENT_NZ_NC = "#{IUNRESERVED}#{SUBDELIMS}@".freeze # pct-encoded not needed
  FILTER_RFC3987 = /[^#{ISEGMENT_NZ_NC}]/
  FILTER_COMPAT = /[^#{ALPHA}#{DIGIT}\-_#{UCSCHAR}]/

  # These are the prefixes and suffixes we want to remove
  # If you add to the list, you can use spaces and dots where appropriate
  # Ensure any single letters are followed by a dot because we'll add one to the string
  # during processing, e.g. "y Cia." should be "y. Cia."
  #
  # Each entry is treated as literal text when the patterns are built: spaces
  # become optional runs of spaces and dots become optional runs of dots.
  # :before / :after are regex source fragments anchoring the adfix.
  ADFIXES = {
    prefix: {
      person: [
        'Baroness', 'Capt.', 'Captain', 'Col.', 'Colonel', 'Dame', 'Doctor',
        'Dr.', 'Judge', 'Justice', 'Lady', 'Lieut.', 'Lieutenant', 'Lord',
        'Madame', 'Major', 'Master', 'Matron', 'Messrs.', 'Mgr.', 'Miss',
        'Mister', 'Mlle.', 'Mme.', 'Mons.', 'Mr.', 'Mr. & Mrs.', 'Mr. and Mrs.',
        'Mrs.', 'Ms.', 'Msgr.', 'Prof.', 'Professor', 'Rev.', 'Reverend', 'Sir',
        'Sister', 'The Hon.', 'The Lady.', 'The Lord', 'The Rt. Hon.', 'Doktor',
        'Herr', 'Frau'
      ],
      organization: [
        'Fa.', 'P.T.', 'P.T. Tbk.', 'U.D.'
      ],
      before: '\\A', after: ADFIX_JOINERS
    },
    suffix: {
      person: [
        'Chartered F.C.S.I.', 'Chartered M.C.S.I.', 'I.F.R.S. Certified', 'F.Inst.L.M.', 'C.I.S.S.P.', 'F.C.I.P.S.',
        'M.R.I.C.S.', 'T.M.I.E.T.', 'Dip. D.M.', 'A.A.M.S.', 'A.C.C.A.', 'A.C.M.A.', 'A.I.F.A.', 'A.W.M.A.', 'C.A.I.A.',
        'C.A.P.M.', 'C.C.I.M.', 'C.D.F.A.', 'C.E.P.P.', 'C.F.B.S.', 'C.G.M.A.', 'C.I.T.P.', 'C.L.T.C.', 'C.P.C.C.',
        'C.R.P.C.', 'C.R.P.S.', 'C.S.O.X.', 'C.S.S.D.', 'F.B.C.S.', 'F.C.C.A.', 'F.C.M.I.', 'F.C.S.I.', 'F.I.E.T.',
        'F.I.R.P.', 'M.I.E.T.', 'M.S.F.S.', 'M.Sc. D.', 'O.R.S.C.', 'R.I.C.P.', 'B.Tech.', 'Cantab.', 'Ch.F.C.',
        'D.Phil.', 'I.T.I.L. v3', 'M.Io.D.', 'S.C.M.P.', 'A.C.A.', 'A.C.C.', 'A.E.P.', 'A.I.F.', 'A.S.A.', 'B.Eng.',
        'C.B.V.', 'C.E.M.', 'C.Eng.', 'C.F.A.', 'C.F.F.', 'C.F.P.', 'C.F.S.', 'C.G.A.', 'C.G.B.', 'C.G.P.', 'C.I.M.',
        'C.L.P.', 'C.L.U.', 'C.M.A.', 'C.M.T.', 'C.P.A.', 'C.T.A.', 'C.W.S.', 'D.B.E.', 'D.D.S.', 'D.V.M.', 'E.R.P.',
        'Eng.D.', 'F.C.A.', 'F.P.C.', 'F.R.M.', 'G.S.P.', 'L.P.S.', 'M.B.A.', 'M.B.E.', 'M.E.P.', 'M.Eng.',
        'M.Jur.', 'M.P.A.', 'M.S.F.', 'M.S.P.', 'O.B.E.', 'P.C.C.', 'P.F.S.', 'P.H.R.', 'P.M.C.', 'P.M.P.',
        'P.S.P.', 'R.F.C.', 'V.M.D.', 'B.Ed.', 'B.Sc.', 'Ed.D.', 'Ed.M.', 'Hons.', 'LL.B.', 'LL.D.', 'LL.M.', 'M.Ed.',
        'M.Sc.', 'Oxon.', 'Ph.D.', 'B.A.', 'C.A.', 'E.A.', 'Esq.', 'J.D.', 'K.C.', 'M.A.', 'M.D.', 'M.P.', 'M.S.',
        'O.K.', 'P.A.', 'Q.C.', 'R.D.', 'III', 'Jr.', 'Sr.', 'II', 'IV', 'V'
      ],
      organization: [
        'S. de R.L. de C.V.', 'S.A.P.I. de C.V.', 'y. Cía. S. en C.', 'Private Limited', 'S.M. Pte. Ltd.',
        'Cía. S. C. A.', 'y. Cía. S. C.', 'S.A. de C.V.', 'spol. s.r.o.', '(Pty.) Ltd.', '(Pvt.) Ltd.', 'A.D.S.I.Tz.',
        'S.p. z.o.o.', '(Pvt.)Ltd.', 'akc. spol.', 'Cía. Ltda.', 'E.B.V.B.A.', 'P. Limited', 'S. de R.L.', 'S.I.C.A.V.',
        'S.P.R.L.U.', 'А.Д.С.И.Ц.', '(P.) Ltd.', 'C. por A.', 'Comm.V.A.', 'Ltd. Şti.', 'Plc. Ltd.', 'Pte. Ltd.',
        'Pty. Ltd.', 'Pvt. Ltd.', 'Soc. Col.', 'A.M.B.A.', 'A.S.B.L.', 'A.V.E.E.', 'B.V.B.A.', 'B.V.I.O.', 'C.V.B.A.',
        'C.V.O.A.', 'E.E.I.G.', 'E.I.R.L.', 'E.O.O.D.', 'E.U.R.L.', 'F.M.B.A.', 'G.m.b.H.', 'Ges.b.R.', 'K.G.a.A.',
        'L.L.L.P.', 'Ltd. Co.', 'M.E.P.E.', 'n.y.r.t.', 'O.V.E.E.', 'P.E.E.C.', 'P.L.L.C.',
        'S. en C.', 'S.a.p.a.', 'S.A.R.L.', 'S.à.R.L.', 'S.A.S.U.', 'S.C.e.I.', 'S.C.O.P.', 'S.C.p.A.', 'S.C.R.I.',
        'S.C.R.L.', 'S.M.B.A.', 'S.P.R.L.', 'Е.О.О.Д.', '&. Cie.', 'and Co.', 'Comm.V.', 'Limited', 'P. Ltd.',
        'Part.G.', 'Sh.p.k.', '&. Co.', 'C.X.A.', 'd.n.o.', 'd.o.o.', 'E.A.D.', 'e.h.f.', 'E.P.E.', 'E.S.V.', 'F.C.P.',
        'F.I.E.', 'G.b.R.', 'G.I.E.', 'G.M.K.', 'G.S.K.', 'H.U.F.', 'K.D.A.', 'k.f.t.', 'k.h.t.', 'k.k.t.', 'L.L.C.',
        'L.L.P.', 'o.h.f.', 'O.H.G.', 'O.O.D.', 'O.y.j.', 'p.l.c.', 'P.S.U.', 'S.A.E.', 'S.A.S.', 'S.C.A.', 'S.C.E.',
        'S.C.S.', 'S.E.M.', 'S.E.P.', 's.e.s.', 'S.G.R.', 'S.N.C.', 'S.p.A.', 'S.P.E.', 'S.R.L.', 's.r.o.', 'Unltd.',
        'V.O.F.', 'V.o.G.', 'v.o.s.', 'V.Z.W.', 'z.r.t.', 'А.А.Т.', 'Е.А.Д.', 'З.А.Т.', 'К.Д.А.', 'О.О.Д.', 'Т.А.А.',
        '股份有限公司', 'Ap.S.', 'Corp.', 'ltda.', 'Sh.A.', 'st.G.', 'Ultd.', 'a.b.', 'A.D.', 'A.E.', 'A.G.', 'A.S.',
        'A.Ş.', 'A.y.', 'B.M.', 'b.t.', 'B.V.', 'C.A.', 'C.V.', 'd.d.', 'e.c.', 'E.E.', 'e.G.', 'E.I.', 'E.P.', 'E.T.',
        'E.U.', 'e.v.', 'G.K.', 'G.P.', 'h.f.', 'Inc.', 'K.D.', 'K.G.', 'K.K.', 'k.s.', 'k.v.', 'K.y.', 'L.C.', 'L.P.',
        'Ltd.', 'N.K.', 'N.L.', 'N.V.', 'O.E.', 'O.G.', 'O.Ü.', 'O.y.', 'P.C.', 'p.l.', 'Pty.', 'PUP.', 'Pvt.', 'r.t.',
        'S.A.', 'S.D.', 'S.E.', 's.f.', 'S.L.', 'S.P.', 'S.s.', 'T.K.', 'T.Ü.', 'U.Ü.', 'Y.K.', 'А.Д.', 'І.П.', 'К.Д.',
        'ПУП.', 'С.Д.', 'בע"מ', '任意組合', '匿名組合', '合同会社', '合名会社', '合資会社', '有限会社', '有限公司', '株式会社',
        'A/S', 'G/S', 'I/S', 'K/S', 'P/S', 'S/A'
      ],
      before: ADFIX_JOINERS, after: '\\z'
    }
  }.freeze

  # Compiled, case-insensitive matchers derived from ADFIXES, indexed as
  # ADFIX_PATTERNS[:prefix | :suffix][:person | :organization].
  #
  # Each adfix is regex-escaped first so that characters such as "(" and ")"
  # in entries like "(Pty.) Ltd." match literally instead of opening a group.
  # After escaping, every space is relaxed to " *" and every dot to "\.*", so
  # "Pty. Ltd." also matches "Pty Ltd" and "PtyLtd". The whole alternation may
  # be wrapped in stray brackets and followed by a (R) / TM mark.
  ADFIX_PATTERNS = %i[prefix suffix].each_with_object({}) do |adfix_type, by_type|
    adfix = ADFIXES[adfix_type]

    by_type[adfix_type] = %i[person organization].each_with_object({}) do |ct, patterns|
      alternatives = adfix[ct].map do |literal|
        # Block form of gsub: the replacement text is used verbatim.
        Regexp.escape(literal)
              .gsub('\\ ') { ' *' }
              .gsub('\\.') { '\\.*' }
      end

      patterns[ct] = /#{adfix[:before]}\(*(?:#{alternatives.join('|')})[®™\)]*#{adfix[:after]}/i
    end
  end
end
|
@@ -0,0 +1,390 @@
|
|
1
|
+
module NameTamer
  # Wraps a raw personal or organization name and lazily derives
  # progressively simpler forms of it:
  #
  #   name -> tidy_name -> nice_name -> simple_name -> slug -> array
  #
  # Each form is computed on first use and memoized. Most of the string
  # manipulation is delegated to bang-helpers called on String
  # (ensure_safe!, whitespace_to!, nbsp_in_compound_name!, ...) which are
  # defined outside this class; this class relies on them returning the
  # string so that calls can be chained.
  #
  # References:
  # http://www.w3.org/International/questions/qa-personal-names
  # https://github.com/berkmancenter/namae
  # https://github.com/mericson/people
  # http://en.wikipedia.org/wiki/Types_of_business_entity
  # http://en.wikipedia.org/wiki/List_of_post-nominal_letters_(USA)
  # http://en.wikipedia.org/wiki/List_of_post-nominal_letters_(United_Kingdom)
  # http://en.wikipedia.org/wiki/Nobiliary_particle
  # http://en.wikipedia.org/wiki/Spanish_naming_customs
  # http://linguistlist.org/pubs/tocs/JournalUnifiedStyleSheet2007.pdf [PDF]
  class Name
    # The name exactly as it was supplied ('' when nil was supplied).
    attr_reader :name

    # The supplied name with escaping, zero-width characters, spacing,
    # encoding errors and spaced-out initials cleaned up.
    def tidy_name
      unless @tidy_name
        # The helpers below edit @tidy_name, so it must be assigned first.
        @tidy_name = name.dup # Start with the name we've received

        unescape             # Unescape percent-encoded characters and fix UTF-8 encoding
        remove_zero_width    # remove zero-width characters
        tidy_spacing         # " John   Smith " -> "John Smith"
        fix_encoding_errors  # "Ren\u00c3\u00a9 Descartes" -> "Ren\u00e9 Descartes"
        consolidate_initials # "I. B. M." -> "I.B.M."
      end

      @tidy_name
    end

    # The tidy name without prefixes/suffixes, in "first last" order and
    # with its case fixed. Computing it may also infer the contact type.
    def nice_name
      unless @nice_name
        @nice_name = tidy_name.dup # Start with the tidied name

        remove_adfixes          # prefixes and suffixes: "Smith, John, Jr." -> "Smith, John"
        fixup_last_name_first   # "Smith, John" -> "John Smith"
        fixup_mismatched_braces # "Ceres (AZ" -> "Ceres (AZ)"
        remove_adfixes          # prefixes and suffixes: "Mr John Smith Jr." -> "John Smith"
        name_wrangle            # proper name case and non-breaking spaces
        use_nonbreaking_spaces_in_compound_names
      end

      @nice_name
    end

    # The nice name reduced to its essentials, using plain ASCII spaces.
    def simple_name
      unless @simple_name
        @simple_name = nice_name.dup # Start with nice name

        remove_initials              # "John Q. Doe" -> "John Doe"
        remove_middle_names          # "Philip Seymour Hoffman" -> "Philip Hoffman"
        remove_periods_from_initials # "J.P.R. Williams" -> "JPR Williams"
        standardize_words            # "B&Q Intl" -> "B and Q International"

        @simple_name.whitespace_to!(ASCII_SPACE)
      end

      @simple_name
    end

    # The simple name passed through NameTamer.parameterize.
    def slug
      @slug ||= NameTamer.parameterize simple_name.dup # "John Doe" -> "john-doe"
    end

    # The slug split on SLUG_DELIMITER.
    def array
      @array ||= slug.split(SLUG_DELIMITER)
    end

    # The known contact type or, failing that, a best guess.
    def contact_type
      nice_name # make sure we've done the bit which infers contact_type
      contact_type_best_effort
    end

    # Sets the contact type. When a different type was already known, a
    # notice describing the change is printed to standard output.
    #
    # @param new_contact_type anything responding to to_sym
    def contact_type=(new_contact_type)
      ct_as_sym = new_contact_type.to_sym

      unless @contact_type.nil? || @contact_type == ct_as_sym
        puts "Changing contact type of #{@name} from #{@contact_type} to #{new_contact_type}"
      end

      @contact_type = ct_as_sym
    end

    # Useful method for iterating through the words in the name
    # (the same words that #array returns).
    def each_word(&block)
      array.each(&block)
    end

    # These lines aren't used and aren't covered by specs
    #   def name=(new_name)
    #     initialize new_name, :contact_type => @contact_type
    #   end
    #
    #   def to_hash
    #     {
    #       name:         name,
    #       nice_name:    nice_name,
    #       simple_name:  simple_name,
    #       slug:         slug,
    #       contact_type: contact_type,
    #       last_name:    last_name,
    #       remainder:    remainder,
    #       adfix_found:  adfix_found
    #     }
    #   end

    private

    #--------------------------------------------------------
    # Tidy up the name we've received
    #--------------------------------------------------------

    def unescape
      @tidy_name.ensure_safe!.safe_unescape!.unescape_html!
    end

    def remove_zero_width
      @tidy_name.strip_unwanted!(ZERO_WIDTH_FILTER)
    end

    def tidy_spacing
      @tidy_name
        .space_around_comma!
        .strip_or_self!
        .whitespace_to!(ASCII_SPACE)
    end

    def fix_encoding_errors
      @tidy_name.fix_encoding_errors!
    end

    # Remove spaces from groups of initials
    def consolidate_initials
      @tidy_name
        .remove_spaces_from_initials!
        .ensure_space_after_initials!
    end

    # An adfix is either a prefix or a suffix.
    # Strips every suffix and then every prefix, either from the whole name
    # or, once the name has been split at a comma, from its two halves.
    def remove_adfixes
      if @last_name.nil?
        # Our name is still in one part, not two
        @nice_name = remove_all_adfixes(:suffix, @nice_name)
        @nice_name = remove_all_adfixes(:prefix, @nice_name)
      else
        # Our name is currently in two halves
        @last_name = remove_all_adfixes(:suffix, @last_name)
        @remainder = remove_all_adfixes(:prefix, @remainder)
      end
    end

    # Keeps peeling the outermost adfix of the given type off name_part
    # until an attempt finds nothing, then returns what is left.
    def remove_all_adfixes(adfix_type, name_part)
      loop do
        name_part = remove_outermost_adfix(adfix_type, name_part)
        break unless @adfix_found
      end

      name_part
    end

    # Names in the form "Smith, John" need to be turned around to "John Smith"
    # This only records the two halves; name_wrangle_split_name rejoins them.
    def fixup_last_name_first
      return if @contact_type == :organization

      parts = @nice_name.split ', '

      return unless parts.count == 2

      @last_name = parts[0] # Sometimes the last name alone is all caps and we can name-case it
      @remainder = parts[1]
    end

    # Sometimes we end up with mismatched braces after adfix stripping
    # e.g. "Ceres (Ceres Holdings LLC)" -> "Ceres (Ceres Holdings"
    # Adds as many brackets as are needed to even out the two counts.
    def fixup_mismatched_braces
      unclosed = @nice_name.count('(') - @nice_name.count(')')

      if unclosed > 0
        @nice_name += ')' * unclosed
      elsif unclosed < 0
        @nice_name = ('(' * -unclosed) + @nice_name
      end
    end

    def name_wrangle
      # Fix case if all caps or all lowercase
      if @last_name.nil?
        name_wrangle_single_name
      else
        name_wrangle_split_name
      end
    end

    # Re-cases the whole name when it is all one case. For organizations
    # only all-caps names longer than 4 characters are re-cased.
    def name_wrangle_single_name
      lowercase = @nice_name.downcase
      uppercase = @nice_name.upcase

      fix_case =
        if @contact_type == :organization
          @nice_name == uppercase && @nice_name.length > 4
        else
          [uppercase, lowercase].include?(@nice_name)
        end

      @nice_name = name_case(lowercase) if fix_case
    end

    def name_wrangle_split_name
      # It's a person if we've split the name, so no organization logic here
      lowercase = @last_name.downcase
      uppercase = @last_name.upcase
      @last_name = name_case(lowercase) if [uppercase, lowercase].include?(@last_name)
      @nice_name = "#{@remainder} #{@last_name}"
    end

    # Conjoin compound names with non-breaking spaces
    def use_nonbreaking_spaces_in_compound_names
      @nice_name
        .nbsp_in_compound_name!
        .nbsp_in_name_modifier!
    end

    #--------------------------------------------------------
    # Make search name from nice name
    #--------------------------------------------------------

    # Remove initials from personal names unless they are the only identifier.
    # i.e. only remove initials if there's also a proper name there
    def remove_initials
      return unless @contact_type == :person

      temp_name = @simple_name.gsub(/\b([a-z](?:\.*\s+|\.))/i, '')

      # If the name still has at least one space we're OK
      @simple_name = temp_name if temp_name.include?(ASCII_SPACE)
    end

    # Keeps only the first and the last usable word of a person's name.
    # Leaves the name untouched when it contains no usable word at all.
    def remove_middle_names
      return unless @contact_type == :person

      first_name, parts = find_first_usable_name(@simple_name.split)
      last_name, = find_last_usable_name(parts)

      return unless first_name || last_name

      separator = first_name && last_name ? ' ' : ''
      @simple_name = "#{first_name}#{separator}#{last_name}"
    end

    # Finds the first word that survives FILTER_COMPAT.
    #
    # @param parts [Array<String>] the words of the name
    # @return [Array] the word and a new array of the words after it, or
    #   nil and the untouched words when no word is usable
    def find_first_usable_name(parts)
      index = parts.index { |part| usable_name_part?(part) }

      return [nil, parts] unless index

      [parts[index], parts.drop(index + 1)] # "drop" leaves the caller's array intact
    end

    # Finds the last word that survives FILTER_COMPAT.
    #
    # @param parts [Array<String>] the words of the name
    # @return [String, nil] the word, or nil when no word is usable
    def find_last_usable_name(parts)
      parts.reverse_each.find { |part| usable_name_part?(part) }
    end

    # True when something is left of the word after removing every
    # character matched by FILTER_COMPAT.
    def usable_name_part?(part)
      !part.gsub(FILTER_COMPAT, '').empty?
    end

    def remove_periods_from_initials
      @simple_name.remove_periods_from_initials!
    end

    def standardize_words
      @simple_name.gsub!(/ *& */, ' and ') # replace ampersand characters with ' and '
      @simple_name.gsub!(/ *\+ */, ' plus ') # replace plus signs with ' plus '
      @simple_name.gsub!(/\bintl\b/i, 'International') # replace 'intl' with 'International'
      @simple_name.gsub!(/[־‐‑‒–—―−﹘﹣-]/, SLUG_DELIMITER) # Replace Unicode dashes with ASCII hyphen
      @simple_name.strip_unwanted!(/["“”™℠®©℗]/) # remove quotes and commercial decoration
    end

    #--------------------------------------------------------
    # Initialization and utilities
    #--------------------------------------------------------

    # @param new_name the raw name; nil is treated as ''
    # @param args [Hash] optional :contact_type (person or organization)
    def initialize(new_name, args = {})
      @name = new_name || ''
      @contact_type = contact_type_from args

      @tidy_name = nil
      @nice_name = nil
      @simple_name = nil
      @slug = nil

      @last_name = nil
      @remainder = nil

      @adfix_found = false
    end

    # Normalizes args[:contact_type] to :person, :organization or nil.
    # Symbols are compared as given; anything else is converted to a
    # string and downcased first.
    def contact_type_from(args)
      args_ct = args[:contact_type]
      return unless args_ct

      ct = args_ct.is_a?(Symbol) ? args_ct : args_ct.to_s.downcase.to_sym

      ct if %i[person organization].include?(ct)
    end

    # If we don't know the contact type, what's our best guess?
    def contact_type_best_effort
      return @contact_type if @contact_type

      # If it's just one word we'll assume organization.
      # If more then we'll assume a person
      @name.include?(ASCII_SPACE) ? :person : :organization
    end

    # We pass to this routine either prefixes or suffixes
    # Returns name_part unchanged when no adfix is found; otherwise records
    # the contact type the adfix implies and returns name_part without it.
    def remove_outermost_adfix(adfix_type, name_part)
      ct, parts = find_contact_type_and_parts(ADFIX_PATTERNS[adfix_type], name_part)

      return name_part unless @adfix_found

      # If we've found a diagnostic adfix then set the contact type
      self.contact_type = ct

      # The remainder of the name will be in parts[0] or parts[2] depending
      # on whether this is a prefix or a suffix.
      # We'll also remove any trailing commas we've exposed.
      (parts[0] + parts[2]).gsub(/\s*,\s*$/, '')
    end

    # Partitions name_part around the adfix pattern for the best-guess
    # contact type and sets @adfix_found accordingly.
    #
    # @return [Array] the contact type tried last and the three-element
    #   result of String#partition
    def find_contact_type_and_parts(adfixes, name_part)
      ct = contact_type_best_effort
      parts = name_part.partition adfixes[ct]
      @adfix_found = !parts[1].empty?

      return [ct, parts] if @contact_type || @adfix_found

      # If the contact type is indeterminate and we didn't find a diagnostic adfix
      # for a person then try again for an organization
      ct = :organization
      parts = name_part.partition adfixes[ct]
      @adfix_found = !parts[1].empty?

      [ct, parts]
    end

    # Original Version of NameCase:
    # Copyright (c) Mark Summerfield 1998-2008. All Rights Reserved
    # This module may be used/distributed/modified under the same terms as Perl itself
    # http://dev.perl.org/licenses/ (GPL)
    #
    # Ruby Version:
    # Copyright (c) Aaron Patterson 2006
    # NameCase is distributed under the GPL license.
    #
    # Substantially modified for Xendata
    # Improved in several areas, also now adds non-breaking spaces for
    # compound names like "van der Pump"
    def name_case(lowercase)
      n = lowercase.dup # We assume the name is passed already downcased

      n
        .upcase_first_letter!
        .downcase_after_apostrophe!
        .fix_mac!
        .fix_ff!
        .fix_name_modifiers!
        .upcase_initials!
    end
  end
end
|