name-tamer 0.4.0 → 0.5.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +2 -0
- data/.ruby-version +1 -1
- data/doc/maintenance.rake +2 -2
- data/lib/name-tamer.rb +1 -537
- data/lib/name_tamer.rb +21 -0
- data/lib/name_tamer/array.rb +7 -0
- data/lib/name_tamer/constants.rb +121 -0
- data/lib/name_tamer/name.rb +384 -0
- data/lib/{string_extras.rb → name_tamer/string.rb} +14 -8
- data/lib/name_tamer/text.rb +53 -0
- data/lib/name_tamer/version.rb +3 -0
- data/name-tamer.gemspec +10 -10
- data/spec/{name_tamer_spec.rb → name_tamer/name_spec.rb} +2 -2
- data/spec/name_tamer/text_spec.rb +42 -0
- metadata +31 -24
- data/lib/name-tamer/version.rb +0 -3
data/lib/name_tamer.rb
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require 'cgi'
|
3
|
+
require 'name_tamer/string'
|
4
|
+
require 'name_tamer/array'
|
5
|
+
require 'name_tamer/constants'
|
6
|
+
|
7
|
+
# Top-level entry points for the gem. Name and Text are loaded lazily
# via autoload the first time they are referenced.
module NameTamer
  autoload :Name, 'name_tamer/name'
  autoload :Text, 'name_tamer/text'

  # Shorthand constructor: NameTamer['Mr John Smith'] builds a
  # NameTamer::Name from the given name and options hash.
  def self.[](name, args = {})
    NameTamer::Name.new(name, args)
  end

  # Make a slug from a string by delegating to NameTamer::Text#parameterize.
  def self.parameterize(string, args = {})
    NameTamer::Text.new(string, args).parameterize
  end
end
|
@@ -0,0 +1,121 @@
|
|
1
|
+
module NameTamer
|
2
|
+
NONBREAKING_SPACE = "\u00a0".freeze
|
3
|
+
ASCII_SPACE = ' '.freeze
|
4
|
+
ADFIX_JOINERS = "[#{ASCII_SPACE}-]".freeze
|
5
|
+
SLUG_DELIMITER = '-'.freeze
|
6
|
+
ZERO_WIDTH_FILTER = /[\u180E\u200B\u200C\u200D\u2063\uFEFF]/
|
7
|
+
|
8
|
+
# Constants for parameterizing Unicode strings for IRIs
|
9
|
+
#
|
10
|
+
# Allowed characters in an IRI segment are defined by RFC 3987
|
11
|
+
# (https://tools.ietf.org/html/rfc3987#section-2.2) as follows:
|
12
|
+
#
|
13
|
+
# isegment-nz-nc = 1*( iunreserved / pct-encoded / sub-delims
|
14
|
+
# / "@" )
|
15
|
+
# ; non-zero-length segment without any colon ":"
|
16
|
+
#
|
17
|
+
# iunreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" / ucschar
|
18
|
+
#
|
19
|
+
# pct-encoded = "%" HEXDIG HEXDIG
|
20
|
+
#
|
21
|
+
# sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
|
22
|
+
# / "*" / "+" / "," / ";" / "="
|
23
|
+
#
|
24
|
+
# ucschar = %xA0-D7FF / %xF900-FDCF / %xFDF0-FFEF
|
25
|
+
# / %x10000-1FFFD / %x20000-2FFFD / %x30000-3FFFD
|
26
|
+
# / %x40000-4FFFD / %x50000-5FFFD / %x60000-6FFFD
|
27
|
+
# / %x70000-7FFFD / %x80000-8FFFD / %x90000-9FFFD
|
28
|
+
# / %xA0000-AFFFD / %xB0000-BFFFD / %xC0000-CFFFD
|
29
|
+
# / %xD0000-DFFFD / %xE1000-EFFFD
|
30
|
+
#
|
31
|
+
# Note that we can't use Unicode code points above \uFFFF because of
|
32
|
+
# regex limitations, so we'll ignore ucschar above that point.
|
33
|
+
#
|
34
|
+
# We're using the most restrictive segment definition (isegment-nz-nc)
|
35
|
+
# to avoid any possible problems with the IRI that it one day might
|
36
|
+
# get placed in.
|
37
|
+
ALPHA = 'A-Za-z'.freeze
|
38
|
+
DIGIT = '0-9'.freeze
|
39
|
+
UCSCHAR = '\u00A0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF'.freeze
|
40
|
+
IUNRESERVED = "#{ALPHA}#{DIGIT}\\-\\._~#{UCSCHAR}".freeze
|
41
|
+
SUBDELIMS = '!$&\'\(\)\*+,;='.freeze
|
42
|
+
ISEGMENT_NZ_NC = "#{IUNRESERVED}#{SUBDELIMS}@".freeze # pct-encoded not needed
|
43
|
+
FILTER_RFC3987 = /[^#{ISEGMENT_NZ_NC}]/
|
44
|
+
FILTER_COMPAT = /[^#{ALPHA}#{DIGIT}\-_#{UCSCHAR}]/
|
45
|
+
|
46
|
+
# These are the prefixes and suffixes we want to remove
|
47
|
+
# If you add to the list, you can use spaces and dots where appropriate
|
48
|
+
# Ensure any single letters are followed by a dot because we'll add one to the string
|
49
|
+
# during processing, e.g. "y Cia." should be "y. Cia."
|
50
|
+
ADFIXES = {
|
51
|
+
prefix: {
|
52
|
+
person: [
|
53
|
+
'Baron', 'Baroness', 'Capt.', 'Captain', 'Col.', 'Colonel', 'Dame',
|
54
|
+
'Doctor', 'Dr.', 'Judge', 'Justice', 'Lady', 'Lieut.', 'Lieutenant',
|
55
|
+
'Lord', 'Madame', 'Major', 'Master', 'Matron', 'Messrs.', 'Mgr.',
|
56
|
+
'Miss', 'Mister', 'Mlle.', 'Mme.', 'Mons.', 'Mr.', 'Mr. & Mrs.',
|
57
|
+
'Mr. and Mrs.', 'Mrs.', 'Msgr.', 'Ms.', 'Prof.', 'Professor', 'Rev.',
|
58
|
+
'Reverend', 'Sir', 'Sister', 'The Hon.', 'The Lady.', 'The Lord',
|
59
|
+
'The Rt. Hon.'
|
60
|
+
],
|
61
|
+
organization: [
|
62
|
+
'Fa.', 'P.T.', 'P.T. Tbk.', 'U.D.'
|
63
|
+
],
|
64
|
+
before: '\\A', after: ADFIX_JOINERS
|
65
|
+
},
|
66
|
+
suffix: {
|
67
|
+
person: [
|
68
|
+
'Chartered F.C.S.I.', 'Chartered M.C.S.I.', 'I.F.R.S. Certified', 'F.Inst.L.M.', 'C.I.S.S.P.', 'F.C.I.P.S.',
|
69
|
+
'M.R.I.C.S.', 'T.M.I.E.T.', 'Dip. D.M.', 'A.A.M.S.', 'A.C.C.A.', 'A.C.M.A.', 'A.I.F.A.', 'A.W.M.A.', 'C.A.I.A.',
|
70
|
+
'C.A.P.M.', 'C.C.I.M.', 'C.D.F.A.', 'C.E.P.P.', 'C.F.B.S.', 'C.G.M.A.', 'C.I.T.P.', 'C.L.T.C.', 'C.P.C.C.',
|
71
|
+
'C.R.P.C.', 'C.R.P.S.', 'C.S.O.X.', 'C.S.S.D.', 'F.B.C.S.', 'F.C.C.A.', 'F.C.M.I.', 'F.C.S.I.', 'F.I.E.T.',
|
72
|
+
'F.I.R.P.', 'M.I.E.T.', 'M.S.F.S.', 'M.Sc. D.', 'O.R.S.C.', 'R.I.C.P.', 'B.Tech.', 'Cantab.', 'Ch.F.C.',
|
73
|
+
'D.Phil.', 'I.T.I.L. v3', 'M.Io.D.', 'S.C.M.P', 'A.C.A.', 'A.C.C.', 'A.E.P.', 'A.I.F.', 'A.S.A.', 'B.Eng.',
|
74
|
+
'C.B.V.', 'C.E.M.', 'C.Eng.', 'C.F.A.', 'C.F.F.', 'C.F.P.', 'C.F.S.', 'C.G.A.', 'C.G.B.', 'C.G.P.', 'C.I.M.',
|
75
|
+
'C.L.P.', 'C.L.U.', 'C.M.A.', 'C.M.T.', 'C.P.A.', 'C.T.A.', 'C.W.S.', 'D.B.E.', 'D.D.S.', 'D.V.M.', 'E.R.P.',
|
76
|
+
'Eng.D.', 'F.C.A.', 'F.P.C.', 'F.R.M.', 'F.R.M.', 'G.S.P.', 'L.P.S.', 'M.B.A.', 'M.B.E.', 'M.E.P.', 'M.Eng.',
|
77
|
+
'M.Jur.', 'M.P.A.', 'M.S.F.', 'M.S.P.', 'O.B.E.', 'P.C.C.', 'P.F.S.', 'P.H.R.', 'P.M.C.', 'P.M.P.', 'P.M.P.',
|
78
|
+
'P.S.P.', 'R.F.C.', 'V.M.D.', 'B.Ed.', 'B.Sc.', 'Ed.D.', 'Ed.M.', 'Hons.', 'LL.B.', 'LL.D.', 'LL.M.', 'M.Ed.',
|
79
|
+
'M.Sc.', 'Oxon.', 'Ph.D.', 'B.A.', 'C.A.', 'E.A.', 'Esq.', 'J.D.', 'K.C.', 'M.A.', 'M.D.', 'M.P.', 'M.S.',
|
80
|
+
'O.K.', 'P.A.', 'Q.C.', 'R.D.', 'III', 'Jr.', 'Sr.', 'II', 'IV', 'V'
|
81
|
+
],
|
82
|
+
organization: [
|
83
|
+
'S. de R.L. de C.V.', 'S.A.P.I. de C.V.', 'y. Cía. S. en C.', 'Private Limited', 'S.M. Pte. Ltd.',
|
84
|
+
'Cía. S. C. A.', 'y. Cía. S. C.', 'S.A. de C.V.', 'spol. s.r.o.', '(Pty.) Ltd.', '(Pvt.) Ltd.', 'A.D.S.I.Tz.',
|
85
|
+
'S.p. z.o.o.', '(Pvt.)Ltd.', 'akc. spol.', 'Cía. Ltda.', 'E.B.V.B.A.', 'P. Limited', 'S. de R.L.', 'S.I.C.A.V.',
|
86
|
+
'S.P.R.L.U.', 'А.Д.С.И.Ц.', '(P.) Ltd.', 'C. por A.', 'Comm.V.A.', 'Ltd. Şti.', 'Plc. Ltd.', 'Pte. Ltd.',
|
87
|
+
'Pty. Ltd.', 'Pvt. Ltd.', 'Soc. Col.', 'A.M.B.A.', 'A.S.B.L.', 'A.V.E.E.', 'B.V.B.A.', 'B.V.I.O.', 'C.V.B.A.',
|
88
|
+
'C.V.O.A.', 'E.E.I.G.', 'E.I.R.L.', 'E.O.O.D.', 'E.U.R.L.', 'F.M.B.A.', 'G.m.b.H.', 'Ges.b.R.', 'K.G.a.A.',
|
89
|
+
'L.L.L.P.', 'Ltd. Co.', 'Ltd. Co.', 'M.E.P.E.', 'n.y.r.t.', 'O.V.E.E.', 'P.E.E.C.', 'P.L.L.C.', 'P.L.L.C.',
|
90
|
+
'S. en C.', 'S.a.p.a.', 'S.A.R.L.', 'S.à.R.L.', 'S.A.S.U.', 'S.C.e.I.', 'S.C.O.P.', 'S.C.p.A.', 'S.C.R.I.',
|
91
|
+
'S.C.R.L.', 'S.M.B.A.', 'S.P.R.L.', 'Е.О.О.Д.', '&. Cie.', 'and Co.', 'Comm.V.', 'Limited', 'P. Ltd.',
|
92
|
+
'Part.G.', 'Sh.p.k.', '&. Co.', 'C.X.A.', 'd.n.o.', 'd.o.o.', 'E.A.D.', 'e.h.f.', 'E.P.E.', 'E.S.V.', 'F.C.P.',
|
93
|
+
'F.I.E.', 'G.b.R.', 'G.I.E.', 'G.M.K.', 'G.S.K.', 'H.U.F.', 'K.D.A.', 'k.f.t.', 'k.h.t.', 'k.k.t.', 'L.L.C.',
|
94
|
+
'L.L.P.', 'o.h.f.', 'O.H.G.', 'O.O.D.', 'O.y.j.', 'p.l.c.', 'P.S.U.', 'S.A.E.', 'S.A.S.', 'S.C.A.', 'S.C.E.',
|
95
|
+
'S.C.S.', 'S.E.M.', 'S.E.P.', 's.e.s.', 'S.G.R.', 'S.N.C.', 'S.p.A.', 'S.P.E.', 'S.R.L.', 's.r.o.', 'Unltd.',
|
96
|
+
'V.O.F.', 'V.o.G.', 'v.o.s.', 'V.Z.W.', 'z.r.t.', 'А.А.Т.', 'Е.А.Д.', 'З.А.Т.', 'К.Д.А.', 'О.О.Д.', 'Т.А.А.',
|
97
|
+
'股份有限公司', 'Ap.S.', 'Corp.', 'ltda.', 'Sh.A.', 'st.G.', 'Ultd.', 'a.b.', 'A.D.', 'A.E.', 'A.G.', 'A.S.',
|
98
|
+
'A.Ş.', 'A.y.', 'B.M.', 'b.t.', 'B.V.', 'C.A.', 'C.V.', 'd.d.', 'e.c.', 'E.E.', 'e.G.', 'E.I.', 'E.P.', 'E.T.',
|
99
|
+
'E.U.', 'e.v.', 'G.K.', 'G.P.', 'h.f.', 'Inc.', 'K.D.', 'K.G.', 'K.K.', 'k.s.', 'k.v.', 'K.y.', 'L.C.', 'L.P.',
|
100
|
+
'Ltd.', 'N.K.', 'N.L.', 'N.V.', 'O.E.', 'O.G.', 'O.Ü.', 'O.y.', 'P.C.', 'p.l.', 'Pty.', 'PUP.', 'Pvt.', 'r.t.',
|
101
|
+
'S.A.', 'S.D.', 'S.E.', 's.f.', 'S.L.', 'S.P.', 'S.s.', 'T.K.', 'T.Ü.', 'U.Ü.', 'Y.K.', 'А.Д.', 'І.П.', 'К.Д.',
|
102
|
+
'ПУП.', 'С.Д.', 'בע"מ', '任意組合', '匿名組合', '合同会社', '合名会社', '合資会社', '有限会社', '有限公司', '株式会社',
|
103
|
+
'A/S', 'G/S', 'I/S', 'K/S', 'P/S', 'S/A'
|
104
|
+
],
|
105
|
+
before: ADFIX_JOINERS, after: '\\z'
|
106
|
+
}
|
107
|
+
}.freeze
|
108
|
+
|
109
|
+
# Pre-compiled, case-insensitive regexes for stripping adfixes, indexed as
# ADFIX_PATTERNS[:prefix | :suffix][:person | :organization].
#
# Each entry in ADFIXES is turned into one alternative: spaces become
# optional (' *') and every dot becomes optional/repeatable ('\.*'). The
# alternation is wrapped in the :before/:after anchors for that adfix type,
# and may be surrounded by parentheses or followed by "®" / "™".
ADFIX_PATTERNS = [:prefix, :suffix].each_with_object({}) do |adfix_type, by_type|
  adfix = ADFIXES[adfix_type]

  by_type[adfix_type] = [:person, :organization].each_with_object({}) do |ct, by_contact_type|
    alternatives = adfix[ct]
                   .map { |entry| entry.gsub(ASCII_SPACE, ' *') }
                   .join('|')
                   .gsub('.', '\.*')

    by_contact_type[ct] = /#{adfix[:before]}\(*(?:#{alternatives})[®™\)]*#{adfix[:after]}/i
  end
end
|
121
|
+
end
|
@@ -0,0 +1,384 @@
|
|
1
|
+
module NameTamer
|
2
|
+
class Name
|
3
|
+
# References:
|
4
|
+
# http://www.w3.org/International/questions/qa-personal-names
|
5
|
+
# https://github.com/berkmancenter/namae
|
6
|
+
# https://github.com/mericson/people
|
7
|
+
# http://en.wikipedia.org/wiki/Types_of_business_entity
|
8
|
+
# http://en.wikipedia.org/wiki/List_of_post-nominal_letters_(USA)
|
9
|
+
# http://en.wikipedia.org/wiki/List_of_post-nominal_letters_(United_Kingdom)
|
10
|
+
# http://en.wikipedia.org/wiki/Nobiliary_particle
|
11
|
+
# http://en.wikipedia.org/wiki/Spanish_naming_customs
|
12
|
+
# http://linguistlist.org/pubs/tocs/JournalUnifiedStyleSheet2007.pdf [PDF]
|
13
|
+
attr_reader :name
|
14
|
+
|
15
|
+
# The received name after basic clean-up. Computed once and memoized
# in @tidy_name.
def tidy_name
  return @tidy_name if @tidy_name

  @tidy_name = name.dup # Start with the name we've received

  # Each step is a private method that operates on @tidy_name; order matters.
  [
    :unescape,            # Unescape percent-encoded characters and fix UTF-8 encoding
    :remove_zero_width,   # remove zero-width characters
    :tidy_spacing,        # " John Smith " -> "John Smith"
    :fix_encoding_errors, # "Ren\u00c3\u00a9 Descartes" -> "Ren\u00e9 Descartes"
    :consolidate_initials # "I. B. M." -> "I.B.M."
  ].each { |step| send(step) }

  @tidy_name
end
|
28
|
+
|
29
|
+
# The tidied name with adfixes removed, "Last, First" order fixed and
# case corrected. Computed once and memoized in @nice_name.
def nice_name
  return @nice_name if @nice_name

  @nice_name = tidy_name.dup # Start with the tidied name

  # Private steps working on @nice_name (and @last_name/@remainder once the
  # name has been split); order matters.
  [
    :remove_adfixes,          # prefixes and suffixes: "Smith, John, Jr." -> "Smith, John"
    :fixup_last_name_first,   # "Smith, John" -> "John Smith"
    :fixup_mismatched_braces, # "Ceres (AZ" -> "Ceres (AZ)"
    :remove_adfixes,          # prefixes and suffixes: "Mr John Smith Jr." -> "John Smith"
    :name_wrangle,            # proper name case and non-breaking spaces
    :use_nonbreaking_spaces_in_compound_names
  ].each { |step| send(step) }

  @nice_name
end
|
43
|
+
|
44
|
+
# A simplified form of nice_name intended for searching and slugs.
# Computed once and memoized in @simple_name.
def simple_name
  return @simple_name if @simple_name

  @simple_name = nice_name.dup # Start with nice name

  # Private steps working on @simple_name; order matters.
  [
    :remove_initials,              # "John Q. Doe" -> "John Doe"
    :remove_middle_names,          # "Philip Seymour Hoffman" -> "Philip Hoffman"
    :remove_periods_from_initials, # "J.P.R. Williams" -> "JPR Williams"
    :standardize_words             # "B&Q Intl" -> "B and Q International"
  ].each { |step| send(step) }

  # Finally normalize whitespace to plain ASCII spaces.
  @simple_name.whitespace_to!(ASCII_SPACE)

  @simple_name
end
|
58
|
+
|
59
|
+
# Slug form of the simple name, e.g. "John Doe" -> "john-doe".
# Memoized in @slug; works on a copy so simple_name itself is not altered.
def slug
  @slug ||= NameTamer.parameterize(simple_name.dup)
end
|
62
|
+
|
63
|
+
# The slug split on SLUG_DELIMITER into its component words. Memoized.
def array
  @array ||= slug.split(SLUG_DELIMITER)
end
|
66
|
+
|
67
|
+
# The contact type: the stored @contact_type if there is one, otherwise
# the best-effort guess.
def contact_type
  # Building nice_name runs adfix removal, which may set @contact_type,
  # so make sure that has happened before answering.
  nice_name

  contact_type_best_effort
end
|
71
|
+
|
72
|
+
# Set the contact type (anything responding to #to_sym). When this replaces
# a different, already-known contact type, a notice is printed to stdout.
def contact_type=(new_contact_type)
  ct_as_sym = new_contact_type.to_sym
  overriding = !@contact_type.nil? && @contact_type != ct_as_sym

  puts "Changing contact type of #{@name} from #{@contact_type} to #{new_contact_type}" if overriding

  @contact_type = ct_as_sym
end
|
81
|
+
|
82
|
+
# These lines aren't used and aren't covered by specs
|
83
|
+
# def name=(new_name)
|
84
|
+
# initialize new_name, :contact_type => @contact_type
|
85
|
+
# end
|
86
|
+
#
|
87
|
+
# def to_hash
|
88
|
+
# {
|
89
|
+
# name: name,
|
90
|
+
# nice_name: nice_name,
|
91
|
+
# simple_name: simple_name,
|
92
|
+
# slug: slug,
|
93
|
+
# contact_type: contact_type,
|
94
|
+
# last_name: last_name,
|
95
|
+
# remainder: remainder,
|
96
|
+
# adfix_found: adfix_found
|
97
|
+
# }
|
98
|
+
# end
|
99
|
+
|
100
|
+
private
|
101
|
+
|
102
|
+
#--------------------------------------------------------
|
103
|
+
# Tidy up the name we've received
|
104
|
+
#--------------------------------------------------------
|
105
|
+
|
106
|
+
# Unescape percent-encoded and HTML-escaped characters in @tidy_name.
# The three bang helpers are String extensions defined outside this file;
# each is called on the result of the previous one.
def unescape
  safe = @tidy_name.ensure_safe!
  unescaped = safe.safe_unescape!
  unescaped.unescape_html!
end
|
109
|
+
|
110
|
+
# Strip the characters matched by ZERO_WIDTH_FILTER from @tidy_name.
def remove_zero_width
  @tidy_name.strip_unwanted! ZERO_WIDTH_FILTER
end
|
113
|
+
|
114
|
+
# Normalize spacing in @tidy_name, e.g. " John Smith " -> "John Smith".
# Runs the String extensions in order: fix spacing around commas, strip
# the ends, then convert whitespace to plain ASCII spaces.
def tidy_spacing
  comma_fixed = @tidy_name.space_around_comma!
  stripped = comma_fixed.strip_or_self!
  stripped.whitespace_to!(ASCII_SPACE)
end
|
120
|
+
|
121
|
+
# Repair mis-encoded characters in @tidy_name,
# e.g. "Ren\u00c3\u00a9 Descartes" -> "Ren\u00e9 Descartes".
# Delegates to the String extension of the same name.
def fix_encoding_errors
  @tidy_name.fix_encoding_errors!()
end
|
124
|
+
|
125
|
+
# Remove spaces from groups of initials ("I. B. M." -> "I.B.M.") and then
# make sure a space follows the initials. Both helpers are String
# extensions defined outside this file.
def consolidate_initials
  compacted = @tidy_name.remove_spaces_from_initials!
  compacted.ensure_space_after_initials!
end
|
131
|
+
|
132
|
+
# An adfix is either a prefix or a suffix.
#
# Strips every recognised suffix and then every recognised prefix. While
# the name is still a single string (@last_name is nil) both are removed
# from @nice_name. Once it has been split into "Last, First" halves,
# suffixes are removed from @last_name and prefixes from @remainder.
def remove_adfixes
  if @last_name.nil?
    # Our name is still in one part, not two
    @nice_name = remove_all_adfixes(:suffix, @nice_name)
    @nice_name = remove_all_adfixes(:prefix, @nice_name)
  else
    # Our name is currently in two halves
    @last_name = remove_all_adfixes(:suffix, @last_name)
    @remainder = remove_all_adfixes(:prefix, @remainder)
  end
end

# Repeatedly strip the outermost adfix of the given type from name_part
# until remove_outermost_adfix reports (via @adfix_found) that nothing more
# was found. Returns the stripped string.
def remove_all_adfixes(adfix_type, name_part)
  loop do
    name_part = remove_outermost_adfix(adfix_type, name_part)
    break unless @adfix_found
  end

  name_part
end
|
158
|
+
|
159
|
+
# Names in the form "Smith, John" need to be turned around to "John Smith".
#
# This only records the two halves in @last_name and @remainder;
# name_wrangle_split_name joins them back together later. Nothing happens
# for organizations or when the name does not split into exactly two parts
# on ", ".
def fixup_last_name_first
  return if @contact_type == :organization

  surname, given, *extra = @nice_name.split(', ')
  return if given.nil? || !extra.empty?

  @last_name = surname # Sometimes the last name alone is all caps and we can name-case it
  @remainder = given
end
|
170
|
+
|
171
|
+
# Sometimes we end up with mismatched braces after adfix stripping
# e.g. "Ceres (Ceres Holdings LLC)" -> "Ceres (Ceres Holdings"
#
# Compares the number of "(" and ")" in @nice_name and adds a single
# closing brace at the end, or a single opening brace at the start,
# depending on which kind is in short supply.
def fixup_mismatched_braces
  case @nice_name.count('(') <=> @nice_name.count(')')
  when 1 then @nice_name = "#{@nice_name})"
  when -1 then @nice_name = "(#{@nice_name}"
  end
end
|
183
|
+
|
184
|
+
# Fix case if all caps or all lowercase. Dispatches on whether the name
# has been split into @last_name/@remainder.
def name_wrangle
  @last_name.nil? ? name_wrangle_single_name : name_wrangle_split_name
end
|
192
|
+
|
193
|
+
# Name-case an unsplit @nice_name when its case looks wrong:
# * organizations: only when entirely upper case and longer than 4 characters
# * anything else: when entirely upper case or entirely lower case
def name_wrangle_single_name
  downcased = @nice_name.downcase
  all_upper = @nice_name == @nice_name.upcase

  needs_fix =
    if @contact_type == :organization
      all_upper && @nice_name.length > 4
    else
      all_upper || @nice_name == downcased
    end

  @nice_name = name_case(downcased) if needs_fix
end
|
206
|
+
|
207
|
+
# Re-join a split name as "<remainder> <last name>", name-casing the last
# name first if it is entirely upper or lower case.
# It's a person if we've split the name, so no organization logic here.
def name_wrangle_split_name
  downcased = @last_name.downcase
  single_case = @last_name == @last_name.upcase || @last_name == downcased

  @last_name = name_case(downcased) if single_case
  @nice_name = [@remainder, @last_name].join(' ')
end
|
214
|
+
|
215
|
+
# Conjoin compound names with non-breaking spaces. Both helpers are String
# extensions defined outside this file; the second is called on the result
# of the first.
def use_nonbreaking_spaces_in_compound_names
  compounded = @nice_name.nbsp_in_compound_name!
  compounded.nbsp_in_name_modifier!
end
|
221
|
+
|
222
|
+
#--------------------------------------------------------
|
223
|
+
# Make search name from nice name
|
224
|
+
#--------------------------------------------------------
|
225
|
+
|
226
|
+
# Remove initials from personal names unless they are the only identifier.
# i.e. only remove initials if there's also a proper name there
#
# An "initial" here is a single ASCII letter at a word boundary followed
# either by optional dots and whitespace, or by a dot.
def remove_initials
  return unless @contact_type == :person

  without_initials = @simple_name.gsub(/\b([a-z](?:\.*\s+|\.))/i, '')

  # Only accept the result if the name still has at least one space
  return unless without_initials.include?(ASCII_SPACE)

  @simple_name = without_initials
end
|
236
|
+
|
237
|
+
# Reduce a person's name to first and last usable words,
# e.g. "Philip Seymour Hoffman" -> "Philip Hoffman".
# Leaves @simple_name alone for non-persons or when no word was found.
def remove_middle_names
  return unless @contact_type == :person

  first_name, rest = find_first_usable_name(@simple_name.split)
  last_name = find_last_usable_name(rest)

  return unless first_name || last_name

  # Join whichever of the two we found with a single space
  @simple_name = [first_name, last_name].compact.join(' ')
end
|
248
|
+
|
249
|
+
# Find the first word that still has content after FILTER_COMPAT strips
# disallowed characters.
#
# Returns [word, words_after_it]. When no word qualifies, the result is
# [last word (nil for an empty list), the untouched list]. The caller's
# array is never modified.
def find_first_usable_name(parts)
  position = parts.index { |candidate| !candidate.gsub(FILTER_COMPAT, '').empty? }

  return [parts.last, parts] if position.nil?

  [parts[position], parts.slice(position + 1, parts.length)]
end
|
261
|
+
|
262
|
+
# Find the last word that still has content after FILTER_COMPAT strips
# disallowed characters. Returns that word, or nil when there is none.
def find_last_usable_name(parts)
  parts.reverse_each.find { |candidate| !candidate.gsub(FILTER_COMPAT, '').empty? }
end
|
273
|
+
|
274
|
+
# "J.P.R. Williams" -> "JPR Williams". Delegates to the String extension
# of the same name, applied to @simple_name.
def remove_periods_from_initials
  @simple_name.remove_periods_from_initials!()
end
|
277
|
+
|
278
|
+
# Normalize symbols and abbreviations in @simple_name,
# e.g. "B&Q Intl" -> "B and Q International".
def standardize_words
  # Applied in order, in place
  [
    [/ *& */, ' and '],                 # replace ampersand characters with ' and '
    [/ *\+ */, ' plus '],               # replace plus signs with ' plus '
    [/\bintl\b/i, 'International'],     # replace 'intl' with 'International'
    [/[־‐‑‒–—―−﹘﹣-]/, SLUG_DELIMITER] # Replace Unicode dashes with ASCII hyphen
  ].each { |pattern, replacement| @simple_name.gsub!(pattern, replacement) }

  @simple_name.strip_unwanted!(/["“”™℠®©℗]/) # remove quotes and commercial decoration
end
|
285
|
+
|
286
|
+
#--------------------------------------------------------
|
287
|
+
# Initialization and utilities
|
288
|
+
#--------------------------------------------------------
|
289
|
+
|
290
|
+
# new_name:: the raw name; nil is treated as an empty string
# args::     options hash; :contact_type is read (see contact_type_from)
#
# All derived forms start as nil and are computed lazily on first access.
def initialize(new_name, args = {})
  @name = new_name || ''
  @contact_type = contact_type_from(args)

  # Memoized results
  @tidy_name = @nice_name = @simple_name = @slug = nil

  # Halves of a "Last, First" name, set by fixup_last_name_first
  @last_name = @remainder = nil

  # Whether the most recent adfix search matched anything
  @adfix_found = false
end
|
304
|
+
|
305
|
+
# Extract a valid contact type from the options hash.
#
# args:: options hash (nil is tolerated); only :contact_type is read
#
# The value may be a String, a Symbol or anything else responding to #to_s.
# It is compared case-insensitively, so 'Person', :Person and :person are
# all accepted. The caller's value is never modified.
#
# Returns :person or :organization, or nil when the option is missing,
# falsy or not one of those two.
def contact_type_from(args)
  requested = (args || {})[:contact_type]
  return unless requested

  normalized = requested.to_s.downcase.to_sym
  normalized if [:person, :organization].include?(normalized)
end
|
317
|
+
|
318
|
+
# If we don't know the contact type, what's our best guess?
#
# Returns the stored @contact_type when there is one. Otherwise guesses
# from the raw name: if it's just one word (no ASCII space) we assume an
# organization, if more then we assume a person.
def contact_type_best_effort
  @contact_type || (@name.include?(ASCII_SPACE) ? :person : :organization)
end
|
328
|
+
|
329
|
+
# We pass to this routine either prefixes or suffixes
#
# adfix_type:: :prefix or :suffix, used as the key into ADFIX_PATTERNS
# name_part::  the string to strip
#
# Returns name_part unchanged when no adfix was found (@adfix_found is
# false); otherwise returns it with the first match removed and records the
# contact type the matched adfix implies.
def remove_outermost_adfix(adfix_type, name_part)
  ct, parts = find_contact_type_and_parts(ADFIX_PATTERNS[adfix_type], name_part)

  return name_part unless @adfix_found

  # If we've found a diagnostic adfix then set the contact type
  self.contact_type = ct

  # parts is the String#partition result [before, match, after]; dropping
  # the match leaves the rest of the name in parts[0] and parts[2].
  # We also remove any trailing comma we've exposed. The pattern is anchored
  # with \z so only the very end of the string is touched, never the end of
  # an interior line.
  (parts[0] + parts[2]).sub(/\s*,\s*\z/, '')
end
|
343
|
+
|
344
|
+
# Look for an adfix in name_part and work out which contact type it implies.
#
# adfixes::   hash of regexes keyed by :person / :organization
# name_part:: the string to search
#
# Tries the pattern for the best-effort contact type first. If the contact
# type is indeterminate (@contact_type is nil) and that found nothing, tries
# the organization pattern as well, unless it was the one just tried.
#
# Sets @adfix_found and returns [contact_type, parts], where parts is the
# String#partition result [before, match, after] of the last pattern tried.
def find_contact_type_and_parts(adfixes, name_part)
  candidates = [contact_type_best_effort]
  candidates << :organization unless @contact_type

  outcome = nil

  # uniq avoids scanning twice when the best guess is already :organization
  candidates.uniq.each do |ct|
    parts = name_part.partition(adfixes[ct])
    @adfix_found = !parts[1].empty?
    outcome = [ct, parts]
    break if @adfix_found
  end

  outcome
end
|
359
|
+
|
360
|
+
# Original Version of NameCase:
|
361
|
+
# Copyright (c) Mark Summerfield 1998-2008. All Rights Reserved
|
362
|
+
# This module may be used/distributed/modified under the same terms as Perl itself
|
363
|
+
# http://dev.perl.org/licenses/ (GPL)
|
364
|
+
#
|
365
|
+
# Ruby Version:
|
366
|
+
# Copyright (c) Aaron Patterson 2006
|
367
|
+
# NameCase is distributed under the GPL license.
|
368
|
+
#
|
369
|
+
# Substantially modified for Xendata
|
370
|
+
# Improved in several areas, also now adds non-breaking spaces for
|
371
|
+
# compound names like "van der Pump"
|
372
|
+
# Apply proper-name capitalization to an already-downcased name.
# Works on a copy, so the argument itself is not modified. Each step is a
# String extension defined outside this file, chained on the result of the
# previous one.
def name_case(lowercase)
  lowercase
    .dup # We assume the name is passed already downcased
    .upcase_first_letter!
    .downcase_after_apostrophe!
    .fix_mac!
    .fix_ff!
    .fix_name_modifiers!
    .upcase_initials!
end
|
383
|
+
end
|
384
|
+
end
|