namor 0.3.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/namor/namor.rb +33 -9
- data/lib/namor/version.rb +1 -1
- data/spec/lib/namor_spec.rb +28 -24
- metadata +3 -3
data/lib/namor/namor.rb
CHANGED
@@ -19,7 +19,28 @@ class Namor::Namor
|
|
19
19
|
suppression_list = @config[:suppress] || []
|
20
20
|
suppression_re = Regexp.new('\b?' + (suppression_list + (opts[:suppress]||[])).compact.map(&:upcase).join('|') + '\b?')
|
21
21
|
|
22
|
-
name && name.upcase.gsub(/^[ZX]{2,}/, '').gsub(suppression_re, '').gsub(/\b(JR|SR|II|III|IV)\b/i, '').gsub(/\([^\(]*\)/, '').gsub(/\./, ' ').gsub(/[_'
|
22
|
+
name && name.upcase.gsub(/^[ZX]{2,}/, '').gsub(suppression_re, '').gsub(/\b(JR|SR|II|III|IV)\b/i, '').gsub(/\([^\(]*\)/, '').gsub(/\./, ' ').gsub(/[_'\&]/, '').gsub(/,\s*$/, '').gsub(/ +/, ' ').strip
|
23
|
+
end
|
24
|
+
|
25
|
+
def fullscrub(name)
|
26
|
+
final_cleaning(scrub(name))
|
27
|
+
end
|
28
|
+
|
29
|
+
def demaiden(lastname)
|
30
|
+
return [nil,nil] unless lastname && !lastname.empty?
|
31
|
+
if lastname =~ /\-/
|
32
|
+
[lastname.gsub(/ /, ''), lastname.split(/\-/).last.gsub(/ /, '')]
|
33
|
+
else
|
34
|
+
[lastname.gsub(/ /, ''), lastname.split(/ /).last]
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
def final_cleaning(name)
|
39
|
+
if name && !name.empty?
|
40
|
+
name.gsub(/\-/, '')
|
41
|
+
else
|
42
|
+
nil
|
43
|
+
end
|
23
44
|
end
|
24
45
|
|
25
46
|
def extract(name, opts = {})
|
@@ -30,7 +51,7 @@ class Namor::Namor
|
|
30
51
|
if detitled_name =~ /,/
|
31
52
|
# "last, first[ middle]"
|
32
53
|
lastname, firstname = detitled_name.split(/\s*,\s*/)
|
33
|
-
lastname
|
54
|
+
lastname, de_maidened_last = demaiden(lastname)
|
34
55
|
middlename = nil
|
35
56
|
if firstname && firstname =~ / /
|
36
57
|
pieces = firstname.split(/ +/)
|
@@ -38,26 +59,29 @@ class Namor::Namor
|
|
38
59
|
middlename = pieces.join if pieces.any?
|
39
60
|
end
|
40
61
|
else
|
41
|
-
# "first [middle ]last"
|
62
|
+
# "first [middle-initial ]last" or "first everything-else-is-the-lastname"
|
42
63
|
pieces = detitled_name.split(' ')
|
43
64
|
firstname = pieces.shift
|
44
|
-
middlename = nil
|
45
65
|
if pieces.count > 1 && pieces.first.length == 1
|
46
66
|
# assume this is a middle initial
|
47
67
|
middlename = pieces.shift
|
68
|
+
else
|
69
|
+
middlename = nil
|
48
70
|
end
|
49
71
|
|
50
|
-
lastname = pieces.join
|
72
|
+
lastname, de_maidened_last = demaiden(pieces.join(' '))
|
51
73
|
end
|
52
74
|
|
53
|
-
firstname =
|
54
|
-
middlename =
|
55
|
-
lastname =
|
75
|
+
firstname = final_cleaning(firstname)
|
76
|
+
middlename = final_cleaning(middlename)
|
77
|
+
lastname = final_cleaning(lastname)
|
78
|
+
de_maidened_last = final_cleaning(de_maidened_last)
|
56
79
|
|
57
80
|
fm = [firstname, middlename].compact.join(' ')
|
58
81
|
fullname = [lastname, fm].compact.join(',')
|
82
|
+
nee_fullname = [de_maidened_last, fm].compact.join(',')
|
59
83
|
|
60
|
-
[firstname, middlename, lastname, fullname]
|
84
|
+
[firstname, middlename, lastname, fullname, nee_fullname]
|
61
85
|
end
|
62
86
|
|
63
87
|
def extract_with_cluster(name, opts = {})
|
data/lib/namor/version.rb
CHANGED
data/spec/lib/namor_spec.rb
CHANGED
@@ -9,71 +9,75 @@ describe "name extract" do
|
|
9
9
|
end
|
10
10
|
|
11
11
|
it "should handle 2-part names without commas" do
|
12
|
-
@namor.extract("john smith").should == ['JOHN', nil, 'SMITH', 'SMITH,JOHN']
|
12
|
+
@namor.extract("john smith").should == ['JOHN', nil, 'SMITH', 'SMITH,JOHN', 'SMITH,JOHN']
|
13
13
|
end
|
14
14
|
|
15
15
|
it "should handle 2-part names with commas" do
|
16
|
-
@namor.extract("SMITH, JOHN").should == ['JOHN', nil, 'SMITH', 'SMITH,JOHN']
|
16
|
+
@namor.extract("SMITH, JOHN").should == ['JOHN', nil, 'SMITH', 'SMITH,JOHN', 'SMITH,JOHN']
|
17
17
|
end
|
18
18
|
|
19
19
|
it "should handle 2-part names with commas and middle initials" do
|
20
|
-
@namor.extract("SMITH, JOHN R").should == ['JOHN', 'R', 'SMITH', 'SMITH,JOHN R']
|
20
|
+
@namor.extract("SMITH, JOHN R").should == ['JOHN', 'R', 'SMITH', 'SMITH,JOHN R', 'SMITH,JOHN R']
|
21
21
|
end
|
22
22
|
|
23
23
|
it "should handle 2-part names with commas and middle initials" do
|
24
|
-
@namor.extract("SMITH, JOHN R").should == ['JOHN', 'R', 'SMITH', 'SMITH,JOHN R']
|
24
|
+
@namor.extract("SMITH, JOHN R").should == ['JOHN', 'R', 'SMITH', 'SMITH,JOHN R', 'SMITH,JOHN R']
|
25
25
|
end
|
26
26
|
|
27
27
|
it "should strip elements within parentheses" do
|
28
|
-
@namor.extract("SMITH, JOHN (Jacko) R").should == ['JOHN', 'R', 'SMITH', 'SMITH,JOHN R']
|
28
|
+
@namor.extract("SMITH, JOHN (Jacko) R").should == ['JOHN', 'R', 'SMITH', 'SMITH,JOHN R', 'SMITH,JOHN R']
|
29
29
|
end
|
30
30
|
|
31
31
|
it "should drop periods" do
|
32
|
-
@namor.extract("John R. Smith").should == ['JOHN', 'R', 'SMITH', 'SMITH,JOHN R']
|
32
|
+
@namor.extract("John R. Smith").should == ['JOHN', 'R', 'SMITH', 'SMITH,JOHN R', 'SMITH,JOHN R']
|
33
33
|
end
|
34
34
|
|
35
35
|
it "should drop spaces in last name (only when input has a comma)" do
|
36
|
-
@namor.extract("Smith Jones, Mary").should == ['MARY', nil, 'SMITHJONES', 'SMITHJONES,MARY']
|
36
|
+
@namor.extract("Smith Jones, Mary").should == ['MARY', nil, 'SMITHJONES', 'SMITHJONES,MARY', 'JONES,MARY']
|
37
37
|
end
|
38
38
|
|
39
39
|
it "should drop dashes, apostrophes, ampersands" do
|
40
|
-
@namor.extract("Mary Smith-Jones").should == ['MARY', nil, 'SMITHJONES', 'SMITHJONES,MARY']
|
41
|
-
@namor.extract("Mary S. O'Keefe").should == ['MARY', 'S', 'OKEEFE', 'OKEEFE,MARY S']
|
42
|
-
@namor.extract("Jean-Michel Claude").should == ['JEANMICHEL', nil, 'CLAUDE', 'CLAUDE,JEANMICHEL']
|
43
|
-
@namor.extract("Smith, Bob & Sue").should == ['BOB', 'SUE', 'SMITH', 'SMITH,BOB SUE']
|
44
|
-
@namor.extract("Research & Development").should == ['RESEARCH', nil, 'DEVELOPMENT', 'DEVELOPMENT,RESEARCH']
|
40
|
+
@namor.extract("Mary Smith-Jones").should == ['MARY', nil, 'SMITHJONES', 'SMITHJONES,MARY', 'JONES,MARY']
|
41
|
+
@namor.extract("Mary S. O'Keefe").should == ['MARY', 'S', 'OKEEFE', 'OKEEFE,MARY S', 'OKEEFE,MARY S']
|
42
|
+
@namor.extract("Jean-Michel Claude").should == ['JEANMICHEL', nil, 'CLAUDE', 'CLAUDE,JEANMICHEL', 'CLAUDE,JEANMICHEL']
|
43
|
+
@namor.extract("Smith, Bob & Sue").should == ['BOB', 'SUE', 'SMITH', 'SMITH,BOB SUE', 'SMITH,BOB SUE']
|
44
|
+
@namor.extract("Research & Development").should == ['RESEARCH', nil, 'DEVELOPMENT', 'DEVELOPMENT,RESEARCH', 'DEVELOPMENT,RESEARCH']
|
45
45
|
end
|
46
46
|
|
47
47
|
it "should concatenate extract name pieces" do
|
48
|
-
@namor.extract("rajesh kumar vishnu garuda").should == ['RAJESH', nil, 'KUMARVISHNUGARUDA', 'KUMARVISHNUGARUDA,RAJESH']
|
49
|
-
@namor.extract("Kumar, Rajesh Vishnu Garuda").should == ['RAJESH', 'VISHNUGARUDA', 'KUMAR', 'KUMAR,RAJESH VISHNUGARUDA']
|
48
|
+
@namor.extract("rajesh kumar vishnu garuda").should == ['RAJESH', nil, 'KUMARVISHNUGARUDA', 'KUMARVISHNUGARUDA,RAJESH', 'GARUDA,RAJESH']
|
49
|
+
@namor.extract("Kumar, Rajesh Vishnu Garuda").should == ['RAJESH', 'VISHNUGARUDA', 'KUMAR', 'KUMAR,RAJESH VISHNUGARUDA', 'KUMAR,RAJESH VISHNUGARUDA']
|
50
50
|
end
|
51
51
|
|
52
52
|
it "should excise suffixes like 'Jr.' from lastnames" do
|
53
|
-
@namor.extract("Smith Jr, Edward M").should == ['EDWARD', 'M', 'SMITH', 'SMITH,EDWARD M']
|
53
|
+
@namor.extract("Smith Jr, Edward M").should == ['EDWARD', 'M', 'SMITH', 'SMITH,EDWARD M', 'SMITH,EDWARD M']
|
54
54
|
end
|
55
55
|
|
56
56
|
it "should excise terms from optional suppression list" do
|
57
|
-
@namor.extract("Smith Jr, Edward M MD DDS").should == ['EDWARD', 'M', 'SMITH', 'SMITH,EDWARD M']
|
58
|
-
@namor.extract("Smith Jr, Edward M M.D.").should == ['EDWARD', 'M', 'SMITH', 'SMITH,EDWARD M']
|
59
|
-
@namor.extract("Smith Jr, Edward III MD PHD").should == ['EDWARD', 'PHD', 'SMITH', 'SMITH,EDWARD PHD']
|
57
|
+
@namor.extract("Smith Jr, Edward M MD DDS").should == ['EDWARD', 'M', 'SMITH', 'SMITH,EDWARD M', 'SMITH,EDWARD M']
|
58
|
+
@namor.extract("Smith Jr, Edward M M.D.").should == ['EDWARD', 'M', 'SMITH', 'SMITH,EDWARD M', 'SMITH,EDWARD M']
|
59
|
+
@namor.extract("Smith Jr, Edward III MD PHD").should == ['EDWARD', 'PHD', 'SMITH', 'SMITH,EDWARD PHD', 'SMITH,EDWARD PHD']
|
60
60
|
end
|
61
61
|
|
62
62
|
it "should handle pathological cases" do
|
63
|
-
@namor.extract(", Mary Smith").should == ['MARY', 'SMITH', nil, 'MARY SMITH']
|
63
|
+
@namor.extract(", Mary Smith").should == ['MARY', 'SMITH', nil, 'MARY SMITH', 'MARY SMITH']
|
64
64
|
end
|
65
65
|
|
66
66
|
it "should squash multi-part last names" do
|
67
|
-
@namor.extract("Al Hassan, Bashar").should == ['BASHAR', nil, 'ALHASSAN', 'ALHASSAN,BASHAR']
|
68
|
-
@namor.extract("Bashar Al-Hassan").should == ['BASHAR', nil, 'ALHASSAN', 'ALHASSAN,BASHAR']
|
67
|
+
@namor.extract("Al Hassan, Bashar").should == ['BASHAR', nil, 'ALHASSAN', 'ALHASSAN,BASHAR', 'HASSAN,BASHAR']
|
68
|
+
@namor.extract("Bashar Al-Hassan").should == ['BASHAR', nil, 'ALHASSAN', 'ALHASSAN,BASHAR', 'HASSAN,BASHAR']
|
69
69
|
end
|
70
70
|
|
71
71
|
it "should squash hyphenated first names" do
|
72
|
-
@namor.extract("Smith,Anne-Marie").should == ['ANNEMARIE', nil, 'SMITH', 'SMITH,ANNEMARIE']
|
72
|
+
@namor.extract("Smith,Anne-Marie").should == ['ANNEMARIE', nil, 'SMITH', 'SMITH,ANNEMARIE', 'SMITH,ANNEMARIE']
|
73
73
|
end
|
74
74
|
|
75
75
|
it "should treat some cases with periods as first.last" do
|
76
|
-
@namor.extract("john.smith").should == ['JOHN', nil, 'SMITH', 'SMITH,JOHN']
|
76
|
+
@namor.extract("john.smith").should == ['JOHN', nil, 'SMITH', 'SMITH,JOHN', 'SMITH,JOHN']
|
77
|
+
end
|
78
|
+
|
79
|
+
it "should generate estimated maiden names" do
|
80
|
+
@namor.extract("Jones-De Quento, Maria").should == ['MARIA', nil, 'JONESDEQUENTO', 'JONESDEQUENTO,MARIA', 'DEQUENTO,MARIA']
|
77
81
|
end
|
78
82
|
end
|
79
83
|
|
@@ -109,7 +113,7 @@ describe "name componentization" do
|
|
109
113
|
end
|
110
114
|
|
111
115
|
it "should scrub individual name components of punctuation and titles" do
|
112
|
-
@namor.
|
116
|
+
@namor.fullscrub('Foxworthy-Smythe, ESQ.').should == 'FOXWORTHYSMYTHE'
|
113
117
|
end
|
114
118
|
|
115
119
|
it "should delete strings inside parens" do
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: namor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.0
|
4
|
+
version: 0.3.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -108,7 +108,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
108
108
|
version: '0'
|
109
109
|
segments:
|
110
110
|
- 0
|
111
|
-
hash:
|
111
|
+
hash: 3235644830650956760
|
112
112
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
113
113
|
none: false
|
114
114
|
requirements:
|
@@ -117,7 +117,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
117
117
|
version: '0'
|
118
118
|
segments:
|
119
119
|
- 0
|
120
|
-
hash:
|
120
|
+
hash: 3235644830650956760
|
121
121
|
requirements: []
|
122
122
|
rubyforge_project:
|
123
123
|
rubygems_version: 1.8.24
|