name_tamer 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/name-tamer.rb ADDED
@@ -0,0 +1 @@
1
+ require 'name_tamer'
data/lib/name_tamer.rb ADDED
@@ -0,0 +1,22 @@
1
+ # encoding: utf-8
2
+
3
+ require 'cgi'
4
+ require 'name_tamer/string'
5
+ require 'name_tamer/array'
6
+ require 'name_tamer/constants'
7
+
8
# Namespace for the gem plus two convenience entry points.
module NameTamer
  autoload :Name, 'name_tamer/name'
  autoload :Text, 'name_tamer/text'

  # NameTamer['some name', opts] is shorthand for NameTamer::Name.new.
  # +args+ is handed to the constructor untouched.
  def self.[](name, args = {})
    Name.new(name, args)
  end

  # Make a slug from +string+ by delegating to NameTamer::Text#parameterize.
  def self.parameterize(string, args = {})
    Text.new(string, args).parameterize
  end
end
@@ -0,0 +1,8 @@
1
+ # encoding: utf-8
2
+
3
class Array
  # Every contiguous run of elements, ordered by start index and then by length.
  #
  #   [1, 2, 3].neighbours # => [[1], [1, 2], [1, 2, 3], [2], [2, 3], [3]]
  #
  # An empty array yields an empty array.
  def neighbours
    each_index.flat_map do |start|
      (1..length - start).map { |run_length| self[start, run_length] }
    end
  end
end
@@ -0,0 +1,121 @@
1
+ module NameTamer
2
+ NONBREAKING_SPACE = "\u00a0".freeze # U+00A0 NO-BREAK SPACE
3
+ ASCII_SPACE = ' '.freeze # plain U+0020 space
4
+ ADFIX_JOINERS = "[#{ASCII_SPACE}-]".freeze # regexp character class source: a space or a hyphen
5
+ SLUG_DELIMITER = '-'.freeze # separator used when splitting slugs into words
6
+ ZERO_WIDTH_FILTER = /[\u180E\u200B\u200C\u200D\u2063\uFEFF]/ # matches U+180E, U+200B, U+200C, U+200D, U+2063 and U+FEFF
7
+
8
+ # Constants for parameterizing Unicode strings for IRIs
9
+ #
10
+ # Allowed characters in an IRI segment are defined by RFC 3987
11
+ # (https://tools.ietf.org/html/rfc3987#section-2.2) as follows:
12
+ #
13
+ # isegment-nz-nc = 1*( iunreserved / pct-encoded / sub-delims
14
+ # / "@" )
15
+ # ; non-zero-length segment without any colon ":"
16
+ #
17
+ # iunreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" / ucschar
18
+ #
19
+ # pct-encoded = "%" HEXDIG HEXDIG
20
+ #
21
+ # sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
22
+ # / "*" / "+" / "," / ";" / "="
23
+ #
24
+ # ucschar = %xA0-D7FF / %xF900-FDCF / %xFDF0-FFEF
25
+ # / %x10000-1FFFD / %x20000-2FFFD / %x30000-3FFFD
26
+ # / %x40000-4FFFD / %x50000-5FFFD / %x60000-6FFFD
27
+ # / %x70000-7FFFD / %x80000-8FFFD / %x90000-9FFFD
28
+ # / %xA0000-AFFFD / %xB0000-BFFFD / %xC0000-CFFFD
29
+ # / %xD0000-DFFFD / %xE1000-EFFFD
30
+ #
31
+ # Note that we can't use Unicode code points above \uFFFF because of
32
+ # regex limitations, so we'll ignore ucschar above that point.
33
+ #
34
+ # We're using the most restrictive segment definition (isegment-nz-nc)
35
+ # to avoid any possible problems with the IRI that it one day might
36
+ # get placed in.
37
+ ALPHA = 'A-Za-z'.freeze # character-class fragment, not a complete regexp
38
+ DIGIT = '0-9'.freeze # character-class fragment
39
+ UCSCHAR = '\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF'.freeze # single-quoted: the \u escapes stay as text until interpolated into a regexp
40
+ IUNRESERVED = "#{ALPHA}#{DIGIT}\\-\\._~#{UCSCHAR}".freeze # ALPHA / DIGIT / "-" / "." / "_" / "~" / ucschar
41
+ SUBDELIMS = '!$&\'\(\)\*+,;='.freeze # the RFC 3987 sub-delims listed in the comment above
42
+ ISEGMENT_NZ_NC = "#{IUNRESERVED}#{SUBDELIMS}@".freeze # pct-encoded not needed; this is iunreserved + sub-delims + "@"
43
+ FILTER_RFC3987 = /[^#{ISEGMENT_NZ_NC}]/ # matches any single character outside the isegment-nz-nc set
44
+ FILTER_COMPAT = /[^#{ALPHA}#{DIGIT}\-_#{UCSCHAR}]/ # stricter: matches anything that is not ALPHA, DIGIT, "-", "_" or ucschar
45
+
46
+ # These are the prefixes and suffixes we want to remove
47
+ # If you add to the list, you can use spaces and dots where appropriate
48
+ # Ensure any single letters are followed by a dot because we'll add one to the string
49
+ # during processing, e.g. "y Cia." should be "y. Cia."
50
+ ADFIXES = { # adfix lists keyed by position (:prefix / :suffix), then by contact type
51
+ prefix: { # adfixes expected at the start of a name
52
+ person: [ # NOTE(review): 'The Lady.' carries a trailing period unlike 'The Lord' — confirm intended
53
+ 'Baroness', 'Capt.', 'Captain', 'Col.', 'Colonel', 'Dame', 'Doctor',
54
+ 'Dr.', 'Judge', 'Justice', 'Lady', 'Lieut.', 'Lieutenant', 'Lord',
55
+ 'Madame', 'Major', 'Master', 'Matron', 'Messrs.', 'Mgr.', 'Miss',
56
+ 'Mister', 'Mlle.', 'Mme.', 'Mons.', 'Mr.', 'Mr. & Mrs.', 'Mr. and Mrs.',
57
+ 'Mrs.', 'Ms.', 'Msgr.', 'Prof.', 'Professor', 'Rev.', 'Reverend', 'Sir',
58
+ 'Sister', 'The Hon.', 'The Lady.', 'The Lord', 'The Rt. Hon.', 'Doktor',
59
+ 'Herr', 'Frau'
60
+ ],
61
+ organization: [
62
+ 'Fa.', 'P.T.', 'P.T. Tbk.', 'U.D.'
63
+ ],
64
+ before: '\\A', after: ADFIX_JOINERS # regexp source placed around the alternation: start of string, then a joiner
65
+ },
66
+ suffix: { # adfixes expected at the end of a name
67
+ person: [ # NOTE(review): 'F.R.M.' and 'P.M.P.' each appear twice below; 'S.C.M.P' lacks a final period
68
+ 'Chartered F.C.S.I.', 'Chartered M.C.S.I.', 'I.F.R.S. Certified', 'F.Inst.L.M.', 'C.I.S.S.P.', 'F.C.I.P.S.',
69
+ 'M.R.I.C.S.', 'T.M.I.E.T.', 'Dip. D.M.', 'A.A.M.S.', 'A.C.C.A.', 'A.C.M.A.', 'A.I.F.A.', 'A.W.M.A.', 'C.A.I.A.',
70
+ 'C.A.P.M.', 'C.C.I.M.', 'C.D.F.A.', 'C.E.P.P.', 'C.F.B.S.', 'C.G.M.A.', 'C.I.T.P.', 'C.L.T.C.', 'C.P.C.C.',
71
+ 'C.R.P.C.', 'C.R.P.S.', 'C.S.O.X.', 'C.S.S.D.', 'F.B.C.S.', 'F.C.C.A.', 'F.C.M.I.', 'F.C.S.I.', 'F.I.E.T.',
72
+ 'F.I.R.P.', 'M.I.E.T.', 'M.S.F.S.', 'M.Sc. D.', 'O.R.S.C.', 'R.I.C.P.', 'B.Tech.', 'Cantab.', 'Ch.F.C.',
73
+ 'D.Phil.', 'I.T.I.L. v3', 'M.Io.D.', 'S.C.M.P', 'A.C.A.', 'A.C.C.', 'A.E.P.', 'A.I.F.', 'A.S.A.', 'B.Eng.',
74
+ 'C.B.V.', 'C.E.M.', 'C.Eng.', 'C.F.A.', 'C.F.F.', 'C.F.P.', 'C.F.S.', 'C.G.A.', 'C.G.B.', 'C.G.P.', 'C.I.M.',
75
+ 'C.L.P.', 'C.L.U.', 'C.M.A.', 'C.M.T.', 'C.P.A.', 'C.T.A.', 'C.W.S.', 'D.B.E.', 'D.D.S.', 'D.V.M.', 'E.R.P.',
76
+ 'Eng.D.', 'F.C.A.', 'F.P.C.', 'F.R.M.', 'F.R.M.', 'G.S.P.', 'L.P.S.', 'M.B.A.', 'M.B.E.', 'M.E.P.', 'M.Eng.',
77
+ 'M.Jur.', 'M.P.A.', 'M.S.F.', 'M.S.P.', 'O.B.E.', 'P.C.C.', 'P.F.S.', 'P.H.R.', 'P.M.C.', 'P.M.P.', 'P.M.P.',
78
+ 'P.S.P.', 'R.F.C.', 'V.M.D.', 'B.Ed.', 'B.Sc.', 'Ed.D.', 'Ed.M.', 'Hons.', 'LL.B.', 'LL.D.', 'LL.M.', 'M.Ed.',
79
+ 'M.Sc.', 'Oxon.', 'Ph.D.', 'B.A.', 'C.A.', 'E.A.', 'Esq.', 'J.D.', 'K.C.', 'M.A.', 'M.D.', 'M.P.', 'M.S.',
80
+ 'O.K.', 'P.A.', 'Q.C.', 'R.D.', 'III', 'Jr.', 'Sr.', 'II', 'IV', 'V'
81
+ ],
82
+ organization: [ # NOTE(review): 'Ltd. Co.' and 'P.L.L.C.' each appear twice below
83
+ 'S. de R.L. de C.V.', 'S.A.P.I. de C.V.', 'y. Cía. S. en C.', 'Private Limited', 'S.M. Pte. Ltd.',
84
+ 'Cía. S. C. A.', 'y. Cía. S. C.', 'S.A. de C.V.', 'spol. s.r.o.', '(Pty.) Ltd.', '(Pvt.) Ltd.', 'A.D.S.I.Tz.',
85
+ 'S.p. z.o.o.', '(Pvt.)Ltd.', 'akc. spol.', 'Cía. Ltda.', 'E.B.V.B.A.', 'P. Limited', 'S. de R.L.', 'S.I.C.A.V.',
86
+ 'S.P.R.L.U.', 'А.Д.С.И.Ц.', '(P.) Ltd.', 'C. por A.', 'Comm.V.A.', 'Ltd. Şti.', 'Plc. Ltd.', 'Pte. Ltd.',
87
+ 'Pty. Ltd.', 'Pvt. Ltd.', 'Soc. Col.', 'A.M.B.A.', 'A.S.B.L.', 'A.V.E.E.', 'B.V.B.A.', 'B.V.I.O.', 'C.V.B.A.',
88
+ 'C.V.O.A.', 'E.E.I.G.', 'E.I.R.L.', 'E.O.O.D.', 'E.U.R.L.', 'F.M.B.A.', 'G.m.b.H.', 'Ges.b.R.', 'K.G.a.A.',
89
+ 'L.L.L.P.', 'Ltd. Co.', 'Ltd. Co.', 'M.E.P.E.', 'n.y.r.t.', 'O.V.E.E.', 'P.E.E.C.', 'P.L.L.C.', 'P.L.L.C.',
90
+ 'S. en C.', 'S.a.p.a.', 'S.A.R.L.', 'S.à.R.L.', 'S.A.S.U.', 'S.C.e.I.', 'S.C.O.P.', 'S.C.p.A.', 'S.C.R.I.',
91
+ 'S.C.R.L.', 'S.M.B.A.', 'S.P.R.L.', 'Е.О.О.Д.', '&. Cie.', 'and Co.', 'Comm.V.', 'Limited', 'P. Ltd.',
92
+ 'Part.G.', 'Sh.p.k.', '&. Co.', 'C.X.A.', 'd.n.o.', 'd.o.o.', 'E.A.D.', 'e.h.f.', 'E.P.E.', 'E.S.V.', 'F.C.P.',
93
+ 'F.I.E.', 'G.b.R.', 'G.I.E.', 'G.M.K.', 'G.S.K.', 'H.U.F.', 'K.D.A.', 'k.f.t.', 'k.h.t.', 'k.k.t.', 'L.L.C.',
94
+ 'L.L.P.', 'o.h.f.', 'O.H.G.', 'O.O.D.', 'O.y.j.', 'p.l.c.', 'P.S.U.', 'S.A.E.', 'S.A.S.', 'S.C.A.', 'S.C.E.',
95
+ 'S.C.S.', 'S.E.M.', 'S.E.P.', 's.e.s.', 'S.G.R.', 'S.N.C.', 'S.p.A.', 'S.P.E.', 'S.R.L.', 's.r.o.', 'Unltd.',
96
+ 'V.O.F.', 'V.o.G.', 'v.o.s.', 'V.Z.W.', 'z.r.t.', 'А.А.Т.', 'Е.А.Д.', 'З.А.Т.', 'К.Д.А.', 'О.О.Д.', 'Т.А.А.',
97
+ '股份有限公司', 'Ap.S.', 'Corp.', 'ltda.', 'Sh.A.', 'st.G.', 'Ultd.', 'a.b.', 'A.D.', 'A.E.', 'A.G.', 'A.S.',
98
+ 'A.Ş.', 'A.y.', 'B.M.', 'b.t.', 'B.V.', 'C.A.', 'C.V.', 'd.d.', 'e.c.', 'E.E.', 'e.G.', 'E.I.', 'E.P.', 'E.T.',
99
+ 'E.U.', 'e.v.', 'G.K.', 'G.P.', 'h.f.', 'Inc.', 'K.D.', 'K.G.', 'K.K.', 'k.s.', 'k.v.', 'K.y.', 'L.C.', 'L.P.',
100
+ 'Ltd.', 'N.K.', 'N.L.', 'N.V.', 'O.E.', 'O.G.', 'O.Ü.', 'O.y.', 'P.C.', 'p.l.', 'Pty.', 'PUP.', 'Pvt.', 'r.t.',
101
+ 'S.A.', 'S.D.', 'S.E.', 's.f.', 'S.L.', 'S.P.', 'S.s.', 'T.K.', 'T.Ü.', 'U.Ü.', 'Y.K.', 'А.Д.', 'І.П.', 'К.Д.',
102
+ 'ПУП.', 'С.Д.', 'בע"מ', '任意組合', '匿名組合', '合同会社', '合名会社', '合資会社', '有限会社', '有限公司', '株式会社',
103
+ 'A/S', 'G/S', 'I/S', 'K/S', 'P/S', 'S/A'
104
+ ],
105
+ before: ADFIX_JOINERS, after: '\\z' # regexp source placed around the alternation: a joiner, then end of string
106
+ }
107
+ }.freeze # freezes the outer hash only; nested hashes and arrays stay mutable
108
+
109
# Compiled, case-insensitive regexps derived from ADFIXES:
#   ADFIX_PATTERNS[:prefix | :suffix][:person | :organization] => Regexp
# Within each adfix every ASCII space becomes " *" and every "." becomes "\.*",
# so spacing and periods are optional when matching. The alternation may be
# preceded by "(" characters and followed by ")", "®" or "™" characters.
ADFIX_PATTERNS = %i[prefix suffix].each_with_object({}) do |position, by_position|
  definition = ADFIXES[position]

  by_position[position] = %i[person organization].each_with_object({}) do |contact_type, by_type|
    alternatives = definition[contact_type]
                   .map { |adfix| adfix.gsub(ASCII_SPACE, ' *') }
                   .join('|')
                   .gsub('.', '\.*')

    by_type[contact_type] = /#{definition[:before]}\(*(?:#{alternatives})[®™\)]*#{definition[:after]}/i
  end
end
121
+ end
@@ -0,0 +1,390 @@
1
+ module NameTamer
2
+ class Name
3
+ # References:
4
+ # http://www.w3.org/International/questions/qa-personal-names
5
+ # https://github.com/berkmancenter/namae
6
+ # https://github.com/mericson/people
7
+ # http://en.wikipedia.org/wiki/Types_of_business_entity
8
+ # http://en.wikipedia.org/wiki/List_of_post-nominal_letters_(USA)
9
+ # http://en.wikipedia.org/wiki/List_of_post-nominal_letters_(United_Kingdom)
10
+ # http://en.wikipedia.org/wiki/Nobiliary_particle
11
+ # http://en.wikipedia.org/wiki/Spanish_naming_customs
12
+ # http://linguistlist.org/pubs/tocs/JournalUnifiedStyleSheet2007.pdf [PDF]
13
+ attr_reader :name
14
+
15
# Memoized, lightly cleaned copy of the name we were given.
# The received name is duplicated first, so the caller's string is never touched.
def tidy_name
  return @tidy_name if @tidy_name

  @tidy_name = name.dup

  [
    :unescape,            # undo percent/HTML escaping and fix UTF-8 encoding
    :remove_zero_width,   # drop zero-width characters
    :tidy_spacing,        # " John Smith " -> "John Smith"
    :fix_encoding_errors, # "Ren\u00c3\u00a9 Descartes" -> "Ren\u00e9 Descartes"
    :consolidate_initials # "I. B. M." -> "I.B.M."
  ].each { |step| send(step) }

  @tidy_name
end
28
+
29
# Memoized presentation form of the name, built from a copy of tidy_name.
# Adfix removal runs twice: once before and once after a "Last, First" name
# has been split into its two halves.
def nice_name
  return @nice_name if @nice_name

  @nice_name = tidy_name.dup

  [
    :remove_adfixes,                          # "Smith, John, Jr." -> "Smith, John"
    :fixup_last_name_first,                   # "Smith, John" -> "John Smith"
    :fixup_mismatched_braces,                 # "Ceres (AZ" -> "Ceres (AZ)"
    :remove_adfixes,                          # "Mr John Smith Jr." -> "John Smith"
    :name_wrangle,                            # proper name case
    :use_nonbreaking_spaces_in_compound_names # non-breaking spaces in compound names
  ].each { |step| send(step) }

  @nice_name
end
43
+
44
# Memoized search-friendly form of the name, built from a copy of nice_name.
def simple_name
  return @simple_name if @simple_name

  @simple_name = nice_name.dup

  [
    :remove_initials,              # "John Q. Doe" -> "John Doe"
    :remove_middle_names,          # "Philip Seymour Hoffman" -> "Philip Hoffman"
    :remove_periods_from_initials, # "J.P.R. Williams" -> "JPR Williams"
    :standardize_words             # "B&Q Intl" -> "B and Q International"
  ].each { |step| send(step) }

  # Finish by converting whitespace to plain ASCII spaces.
  @simple_name.whitespace_to!(ASCII_SPACE)
  @simple_name
end
58
+
59
# Memoized slug of the simple name, e.g. "John Doe" -> "john-doe".
# NameTamer.parameterize receives a copy, so @simple_name is left alone.
def slug
  return @slug if @slug

  @slug = NameTamer.parameterize(simple_name.dup)
end
62
+
63
# Memoized list of the slug's words (the slug split on SLUG_DELIMITER).
def array
  @array = slug.split(SLUG_DELIMITER) unless @array
  @array
end
66
+
67
# The contact type: the explicit or inferred one when known, otherwise a guess.
def contact_type
  # Building nice_name is what infers @contact_type from adfixes, so make
  # sure that has happened before we look at it.
  nice_name

  @contact_type || contact_type_best_effort
end
71
+
72
# Set the contact type, stored as a symbol (+new_contact_type+ must respond to #to_sym).
#
# When this replaces a different, already-known contact type a diagnostic is
# emitted with Kernel#warn, i.e. on stderr, so that a library call never
# pollutes the host program's stdout.
def contact_type=(new_contact_type)
  ct_as_sym = new_contact_type.to_sym

  if !@contact_type.nil? && @contact_type != ct_as_sym
    warn "Changing contact type of #{@name} from #{@contact_type} to #{new_contact_type}"
  end

  @contact_type = ct_as_sym
end
81
+
82
# Iterate over the words of the slug, yielding each one to the block.
# The word list is computed once and cached in @words.
def each_word(&block)
  (@words ||= slug.split(SLUG_DELIMITER)).each(&block)
end
87
+
88
+ # These lines aren't used and aren't covered by specs
89
+ # def name=(new_name)
90
+ # initialize new_name, :contact_type => @contact_type
91
+ # end
92
+ #
93
+ # def to_hash
94
+ # {
95
+ # name: name,
96
+ # nice_name: nice_name,
97
+ # simple_name: simple_name,
98
+ # slug: slug,
99
+ # contact_type: contact_type,
100
+ # last_name: last_name,
101
+ # remainder: remainder,
102
+ # adfix_found: adfix_found
103
+ # }
104
+ # end
105
+
106
+ private
107
+
108
+ #--------------------------------------------------------
109
+ # Tidy up the name we've received
110
+ #--------------------------------------------------------
111
+
112
# Run the tidy name through the three unescaping helpers, each one called on
# the result of the previous (String extensions defined outside this file).
def unescape
  safe = @tidy_name.ensure_safe!
  unescaped = safe.safe_unescape!
  unescaped.unescape_html!
end
115
+
116
# Pass ZERO_WIDTH_FILTER to String#strip_unwanted! (defined outside this file)
# on the tidy name.
def remove_zero_width
  invisible_characters = ZERO_WIDTH_FILTER
  @tidy_name.strip_unwanted!(invisible_characters)
end
119
+
120
# Normalise spacing in the tidy name: around commas, at both ends, and finally
# every whitespace character to a plain ASCII space. Each helper is called on
# the result of the previous one.
def tidy_spacing
  comma_spaced = @tidy_name.space_around_comma!
  trimmed = comma_spaced.strip_or_self!
  trimmed.whitespace_to!(ASCII_SPACE)
end
126
+
127
# Delegate to String#fix_encoding_errors! (defined outside this file) on the tidy name.
def fix_encoding_errors
  tidy = @tidy_name
  tidy.fix_encoding_errors!
end
130
+
131
# Close up groups of initials ("I. B. M." -> "I.B.M."), then make sure a
# space follows them. The second helper is called on the first one's result.
def consolidate_initials
  closed_up = @tidy_name.remove_spaces_from_initials!
  closed_up.ensure_space_after_initials!
end
137
+
138
# Strip every outermost adfix (an adfix is either a prefix or a suffix).
#
# While the name is still in one piece both kinds are stripped from
# @nice_name. Once it has been split, suffixes come off @last_name and
# prefixes come off @remainder. Suffixes are always handled first, and each
# kind is stripped repeatedly until remove_outermost_adfix finds nothing more.
def remove_adfixes
  suffix_holder, prefix_holder =
    @last_name.nil? ? %i[@nice_name @nice_name] : %i[@last_name @remainder]

  [[:suffix, suffix_holder], [:prefix, prefix_holder]].each do |adfix_type, holder|
    loop do
      current = instance_variable_get(holder)
      instance_variable_set(holder, remove_outermost_adfix(adfix_type, current))
      break unless @adfix_found
    end
  end
end
164
+
165
# Recognise "Last, First" names and remember the two halves in @last_name and
# @remainder. Organizations are skipped, as is any name that does not split
# into exactly two parts on ", ".
def fixup_last_name_first
  return if @contact_type == :organization

  pieces = @nice_name.split(', ')
  return if pieces.count != 2

  # Kept separately because the last name alone is sometimes all caps and can be name-cased.
  @last_name = pieces.first
  @remainder = pieces.last
end
176
+
177
# Sometimes we end up with mismatched braces after adfix stripping,
# e.g. "Ceres (Ceres Holdings LLC)" -> "Ceres (Ceres Holdings".
#
# Balance the round brackets in @nice_name by appending as many ")" or
# prepending as many "(" as are missing. The suffix patterns can swallow
# several closing brackets at once, so more than one may be needed.
def fixup_mismatched_braces
  surplus_left = @nice_name.count('(') - @nice_name.count(')')

  if surplus_left > 0
    @nice_name += ')' * surplus_left
  elsif surplus_left < 0
    @nice_name = ('(' * -surplus_left) + @nice_name
  end
end
189
+
190
# Fix the case of a name that is all caps or all lowercase, using the split
# handler when a "Last, First" name has been recognised (@last_name is set)
# and the single-name handler otherwise.
def name_wrangle
  return name_wrangle_split_name unless @last_name.nil?

  name_wrangle_single_name
end
198
+
199
# Re-case @nice_name when it is written in a single case.
#
# Organizations are only re-cased when entirely upper case and longer than
# four characters. Anything else is re-cased when it is entirely upper case
# or entirely lower case.
def name_wrangle_single_name
  all_upper = @nice_name == @nice_name.upcase
  all_lower = @nice_name == @nice_name.downcase

  needs_fix =
    if @contact_type == :organization
      all_upper && @nice_name.length > 4
    else
      all_upper || all_lower
    end

  @nice_name = name_case(@nice_name.downcase) if needs_fix
end
212
+
213
# Rebuild @nice_name as "<remainder> <last name>" from a split name,
# name-casing the last name first if it is written in a single case.
# A split name is always a person, so there is no organization logic here.
def name_wrangle_split_name
  folded = @last_name.downcase
  single_case = @last_name == @last_name.upcase || @last_name == folded

  @last_name = name_case(folded) if single_case
  @nice_name = [@remainder, @last_name].join(' ')
end
220
+
221
# Conjoin compound names with non-breaking spaces by applying the two
# String helpers (defined outside this file) in turn, the second on the
# result of the first.
def use_nonbreaking_spaces_in_compound_names
  compounded = @nice_name.nbsp_in_compound_name!
  compounded.nbsp_in_name_modifier!
end
227
+
228
+ #--------------------------------------------------------
229
+ # Make search name from nice name
230
+ #--------------------------------------------------------
231
+
232
# Drop initials from a person's name ("John Q. Doe" -> "John Doe"), but only
# when what is left still contains a space, i.e. when the initials are not
# the only thing identifying the person. Other contact types are left alone.
def remove_initials
  if @contact_type == :person
    without_initials = @simple_name.gsub(/\b([a-z](?:\.*\s+|\.))/i, '')
    @simple_name = without_initials if without_initials.include?(ASCII_SPACE)
  end
end
242
+
243
# Reduce a person's simple name to its first and last usable words,
# e.g. "Philip Seymour Hoffman" -> "Philip Hoffman". Nothing changes for
# other contact types or when neither word can be found.
def remove_middle_names
  return if @contact_type != :person

  given, rest = find_first_usable_name(@simple_name.split)
  family, = find_last_usable_name(rest)

  kept = [given, family].compact
  @simple_name = kept.join(' ') unless kept.empty?
end
254
+
255
# Find the first word that still has content after FILTER_COMPAT characters
# are removed.
#
# Returns [word, words_after_it]. When no word qualifies (including an empty
# list) the result is [nil, parts], so the caller can tell that nothing
# usable was found instead of receiving the last rejected word.
# The +parts+ array itself is never modified.
def find_first_usable_name(parts)
  position = parts.index { |candidate| !candidate.gsub(FILTER_COMPAT, '').empty? }
  return [nil, parts] unless position

  [parts[position], parts.drop(position + 1)]
end
267
+
268
# Return the last word that still has content after FILTER_COMPAT characters
# are removed, or nil when there is none.
def find_last_usable_name(parts)
  parts.reverse_each.find { |candidate| !candidate.gsub(FILTER_COMPAT, '').empty? }
end
279
+
280
# Delegate to String#remove_periods_from_initials! (defined outside this file)
# on the simple name, e.g. "J.P.R. Williams" -> "JPR Williams".
def remove_periods_from_initials
  simple = @simple_name
  simple.remove_periods_from_initials!
end
283
+
284
# Spell out symbols and abbreviations in the simple name, e.g.
# "B&Q Intl" -> "B and Q International". Substitutions run in the order listed.
def standardize_words
  [
    [/ *& */, ' and '],                    # ampersand -> ' and '
    [/ *\+ */, ' plus '],                  # plus sign -> ' plus '
    [/\bintl\b/i, 'International'],        # 'intl' -> 'International'
    [/[־‐‑‒–—―−﹘﹣-]/, SLUG_DELIMITER] # Unicode dashes -> ASCII hyphen
  ].each { |pattern, replacement| @simple_name.gsub!(pattern, replacement) }

  # Finally remove quotes and commercial decoration.
  @simple_name.strip_unwanted!(/["“”™℠®©℗]/)
end
291
+
292
+ #--------------------------------------------------------
293
+ # Initialization and utilities
294
+ #--------------------------------------------------------
295
+
296
# Store the raw name (nil becomes '') and the requested contact type, and
# reset every memoized or derived value.
#
# new_name - the name as received
# args     - options hash; only :contact_type is read (via contact_type_from)
def initialize(new_name, args = {})
  @name = new_name || ''
  @contact_type = contact_type_from(args)

  # Memoized representations, filled in lazily by their reader methods.
  @tidy_name = @nice_name = @simple_name = @slug = nil

  # Halves of a "Last, First" name, once one has been recognised.
  @last_name = @remainder = nil

  @adfix_found = false
end
310
+
311
# Normalise the :contact_type option into :person, :organization or nil.
#
# The value is compared case-insensitively whether it arrives as a String or
# a Symbol ('Person' and :Person both give :person); any other object is
# converted with #to_s first. Unrecognised or missing values give nil.
# The caller's object is never modified, and a symbol is only created for
# the two recognised values.
def contact_type_from(args)
  requested = args[:contact_type]
  return unless requested

  normalized = requested.to_s.downcase
  normalized.to_sym if %w[person organization].include?(normalized)
end
323
+
324
# The known contact type, or our best guess when there is none: a raw name
# containing a space is assumed to be a person, and a single word an
# organization.
def contact_type_best_effort
  return @contact_type if @contact_type

  @name.include?(ASCII_SPACE) ? :person : :organization
end
334
+
335
# Strip one outermost adfix of the given kind (:prefix or :suffix) from
# +name_part+ and return the result. Returns +name_part+ itself when no
# adfix matches.
#
# A match is diagnostic, so it also sets the contact type. Any trailing
# comma exposed by the removal is trimmed as well.
def remove_outermost_adfix(adfix_type, name_part)
  patterns = ADFIX_PATTERNS[adfix_type]
  inferred_type, (before, _adfix, after) = find_contact_type_and_parts(patterns, name_part)

  if @adfix_found
    self.contact_type = inferred_type

    # For a prefix the remaining name follows the match; for a suffix it precedes it.
    (before + after).gsub(/\s*,\s*$/, '')
  else
    name_part
  end
end
349
+
350
# Partition +name_part+ around the first adfix match and report which
# contact type's pattern was used: returns [contact_type, [before, match, after]].
# @adfix_found records whether the match was non-empty.
#
# The best-effort contact type is tried first. Only when the contact type is
# still undetermined and nothing matched is the organization pattern tried
# as a fallback.
def find_contact_type_and_parts(adfixes, name_part)
  candidates = [contact_type_best_effort]
  candidates << :organization unless @contact_type

  ct = nil
  parts = nil

  candidates.each do |candidate|
    ct = candidate
    parts = name_part.partition(adfixes[ct])
    @adfix_found = !parts[1].empty?
    break if @adfix_found
  end

  [ct, parts]
end
365
+
366
+ # Original Version of NameCase:
367
+ # Copyright (c) Mark Summerfield 1998-2008. All Rights Reserved
368
+ # This module may be used/distributed/modified under the same terms as Perl itself
369
+ # http://dev.perl.org/licenses/ (GPL)
370
+ #
371
+ # Ruby Version:
372
+ # Copyright (c) Aaron Patterson 2006
373
+ # NameCase is distributed under the GPL license.
374
+ #
375
+ # Substantially modified for Xendata
376
+ # Improved in several areas, also now adds non-breaking spaces for
377
+ # compound names like "van der Pump"
378
# Name-case a string that the caller has already downcased. The argument is
# duplicated, then each String helper (defined outside this file) is called
# in turn on the result of the previous one.
def name_case(lowercase)
  fixers = %i[
    upcase_first_letter!
    downcase_after_apostrophe!
    fix_mac!
    fix_ff!
    fix_name_modifiers!
    upcase_initials!
  ]

  fixers.reduce(lowercase.dup) { |text, fixer| text.public_send(fixer) }
end
389
+ end
390
+ end