namor 0.5.4 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/namor.rb +1 -0
- data/lib/namor/comparator.rb +85 -0
- data/lib/namor/namor.rb +15 -9
- data/lib/namor/version.rb +1 -1
- data/spec/lib/comparator_spec.rb +59 -0
- data/spec/lib/namor_spec.rb +7 -0
- metadata +7 -4
data/lib/namor/comparator.rb
CHANGED
@@ -0,0 +1,85 @@
|
|
1
|
+
# MULTI-MATCHING via components
|
2
|
+
# go through all users
|
3
|
+
# group by distinct sets of components
|
4
|
+
# pick a (small) subset of component-keys, say <10. Maybe random sample?
|
5
|
+
# build a set of matching rules
|
6
|
+
# run the subset * the full corpus * the matching rules
|
7
|
+
|
8
|
+
class Namor::Comparator
|
9
|
+
attr_reader :corpus
|
10
|
+
|
11
|
+
def initialize(corpus)
|
12
|
+
@corpus = corpus
|
13
|
+
|
14
|
+
prep_missing_initials
|
15
|
+
end
|
16
|
+
|
17
|
+
def crunch(record)
|
18
|
+
(@corpus - [record]).each_with_object([]) do |candidate,matches|
|
19
|
+
if evaluate(record, candidate)
|
20
|
+
matches << candidate
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
def evaluate(record, candidate)
|
26
|
+
[:missing_initials].each do |rule|
|
27
|
+
return true if send(rule, record, candidate)
|
28
|
+
end
|
29
|
+
false
|
30
|
+
end
|
31
|
+
|
32
|
+
# don't need an 'identical' test - assuming that the input record does not appear in the corpus
|
33
|
+
# def identical(a,b)
|
34
|
+
# a == b
|
35
|
+
# end
|
36
|
+
|
37
|
+
# must have at least 2 long (non-initial-only) components in each
|
38
|
+
# those long parts must be identical
|
39
|
+
# only one of the names can have any initials
|
40
|
+
def missing_initials(a,b)
|
41
|
+
longnames_a = a.select {|s| s.length > 1}
|
42
|
+
longnames_b = b.select {|s| s.length > 1}
|
43
|
+
inits_a = a.select {|s| s.length == 1}
|
44
|
+
inits_b = b.select {|s| s.length == 1}
|
45
|
+
|
46
|
+
longnames_a.count >= 2 && longnames_b.count >= 2 && longnames_a == longnames_b && (inits_a.empty? || inits_b.empty?)
|
47
|
+
end
|
48
|
+
|
49
|
+
def prep_missing_initials
|
50
|
+
@corpus_missing_initials = corpus.each_with_object(Set.new) do |rec,set|
|
51
|
+
without_initials = rec.select {|s| s.length > 1}
|
52
|
+
if without_initials.count >= 2
|
53
|
+
set << without_initials
|
54
|
+
end
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
# must have at least 1 long (non-initial-only) component in each
|
59
|
+
# those long parts must be identical
|
60
|
+
# all initials should correspond to non-matched longnames in the other input
|
61
|
+
def matching_initials(a,b)
|
62
|
+
longnames_a = a.select {|s| s.length > 1}
|
63
|
+
longnames_b = b.select {|s| s.length > 1}
|
64
|
+
inits_a = a.select {|s| s.length == 1}
|
65
|
+
inits_b = b.select {|s| s.length == 1}
|
66
|
+
|
67
|
+
return false unless longnames_a.count >= 1 && longnames_b.count >= 1
|
68
|
+
|
69
|
+
unmatched_longnames_a = longnames_a - longnames_b
|
70
|
+
unmatched_longnames_b = longnames_b - longnames_a
|
71
|
+
unmatched_inits_a = unmatched_longnames_a.map {|s| s[0]}
|
72
|
+
unmatched_inits_b = unmatched_longnames_b.map {|s| s[0]}
|
73
|
+
|
74
|
+
inits_a == unmatched_inits_b && inits_b == unmatched_inits_a
|
75
|
+
end
|
76
|
+
|
77
|
+
# ignore any initials. look for cases where there is exactly one name component that differs between the inputs.
|
78
|
+
def matching_all_but_one(a,b)
|
79
|
+
longnames_a = a.select {|s| s.length > 1}
|
80
|
+
longnames_b = b.select {|s| s.length > 1}
|
81
|
+
|
82
|
+
((longnames_a | longnames_b) - (longnames_a & longnames_b)).count == 1
|
83
|
+
end
|
84
|
+
|
85
|
+
end
|
data/lib/namor/namor.rb
CHANGED
@@ -20,6 +20,11 @@ class Namor::Namor
|
|
20
20
|
# Regexp.new(bits.join('|'))
|
21
21
|
end
|
22
22
|
|
23
|
+
def suppress(name, supplist)
|
24
|
+
@re_cache[supplist] ||= suppression_re(supplist)
|
25
|
+
name && name.upcase.gsub(@re_cache[supplist], '')
|
26
|
+
end
|
27
|
+
|
23
28
|
# clean up a single name component
|
24
29
|
# * output all converted to uppercase
|
25
30
|
# * strip leading ZZ+ or XX+ (frequently used as invalid-account prefixes)
|
@@ -31,7 +36,7 @@ class Namor::Namor
|
|
31
36
|
def scrub(name, opts = {})
|
32
37
|
@re_cache[opts[:suppress]] ||= suppression_re(opts[:suppress])
|
33
38
|
|
34
|
-
name && name.upcase.gsub(/^[ZX]{2,}/, '').gsub(@re_cache[opts[:suppress]], '').gsub(/\b(JR|SR|II|III|IV)\b/i, '').gsub(/\([^\
|
39
|
+
name && name.upcase.gsub(/^[ZX]{2,}/, '').gsub(@re_cache[opts[:suppress]], '').gsub(/\b(JR|SR|II|III|IV)\b/i, '').gsub(/\([^\)]*\)/, '').gsub(/\[[^\]]*\]/, '').gsub(/\./, ' ').gsub(/[_'\&]/, '').gsub(/,\s*$/, '').gsub(/ +/, ' ').strip
|
35
40
|
end
|
36
41
|
|
37
42
|
def fullscrub(name, opts = {})
|
@@ -44,8 +49,9 @@ class Namor::Namor
|
|
44
49
|
s && s.gsub(/[- ]/, '')
|
45
50
|
end
|
46
51
|
|
47
|
-
def demaiden(lastname)
|
52
|
+
def demaiden(lastname, opts = {})
|
48
53
|
return [nil,nil] unless lastname && !lastname.empty?
|
54
|
+
lastname = suppress(lastname, opts[:suppress]) if opts[:suppress]
|
49
55
|
if lastname =~ /\-/
|
50
56
|
[lastname.upcase.gsub(/ /, ''), lastname.split(/\-/).last.gsub(/ /, '')]
|
51
57
|
else
|
@@ -114,17 +120,17 @@ class Namor::Namor
|
|
114
120
|
ary << ary[4].gsub(/\W/, '_')
|
115
121
|
end
|
116
122
|
|
117
|
-
def extract_from_pieces(hash)
|
123
|
+
def extract_from_pieces(hash, opts = {})
|
118
124
|
assemble(
|
119
|
-
scrub(hash[:first]),
|
120
|
-
scrub(hash[:middle]),
|
121
|
-
scrub_and_squash(hash[:last]),
|
122
|
-
scrub_and_squash((s = demaiden(hash[:last])) && s.last)
|
125
|
+
scrub(hash[:first], opts),
|
126
|
+
scrub(hash[:middle], opts),
|
127
|
+
scrub_and_squash(hash[:last], opts),
|
128
|
+
scrub_and_squash((s = demaiden(hash[:last], opts)) && s.last, opts)
|
123
129
|
)
|
124
130
|
end
|
125
131
|
|
126
|
-
def extract_from_pieces_with_cluster(hash)
|
127
|
-
ary = extract_from_pieces(hash)
|
132
|
+
def extract_from_pieces_with_cluster(hash, opts = {})
|
133
|
+
ary = extract_from_pieces(hash, opts)
|
128
134
|
ary << ary[3].gsub(/\W/, '_')
|
129
135
|
ary << ary[4].gsub(/\W/, '_')
|
130
136
|
end
|
data/spec/lib/comparator_spec.rb
CHANGED
@@ -0,0 +1,59 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
describe "Comparator" do
|
4
|
+
def explode(name)
|
5
|
+
name.gsub(/[,._-]/, ' ').split.map(&:upcase).sort
|
6
|
+
end
|
7
|
+
|
8
|
+
def try(rule, name1, name2)
|
9
|
+
@comp.send(rule, explode(name1), explode(name2)).should be_true
|
10
|
+
end
|
11
|
+
|
12
|
+
def bust(rule, name1, name2)
|
13
|
+
@comp.send(rule, explode(name1), explode(name2)).should be_false
|
14
|
+
end
|
15
|
+
|
16
|
+
before :all do
|
17
|
+
names = [
|
18
|
+
"michael g palmer",
|
19
|
+
"francis l palmer",
|
20
|
+
"michael palmer"
|
21
|
+
]
|
22
|
+
corpus = names.map {|name| explode(name)}
|
23
|
+
@comp = Namor::Comparator.new(corpus)
|
24
|
+
end
|
25
|
+
|
26
|
+
it "finds names that match without initials" do
|
27
|
+
try(:missing_initials, "michael palmer", "michael g palmer")
|
28
|
+
try(:missing_initials, "michael palmer", "Q michael palmer")
|
29
|
+
try(:missing_initials, "michael palmer", "Michael N Palmer x")
|
30
|
+
bust(:missing_initials, "michael palmer", "Michael P")
|
31
|
+
bust(:missing_initials, "michael palmer", "Michael John Palmer")
|
32
|
+
|
33
|
+
matches = @comp.crunch(explode("michael palmer"))
|
34
|
+
matches.should == [explode("michael g palmer")]
|
35
|
+
matches = @comp.crunch(explode("palmer michael"))
|
36
|
+
matches.should == [explode("michael g palmer")]
|
37
|
+
matches = @comp.crunch(explode("michael g palmer"))
|
38
|
+
matches.should == [explode("michael palmer")]
|
39
|
+
end
|
40
|
+
|
41
|
+
it "finds names that match initials to names" do
|
42
|
+
try(:matching_initials, "fred jones", "f jones")
|
43
|
+
try(:matching_initials, "fred jones", "jones f")
|
44
|
+
try(:matching_initials, "fred jones", "fred j")
|
45
|
+
try(:matching_initials, "fred xavier jones", "fred x jones")
|
46
|
+
try(:matching_initials, "fred xavier jones", "xavier jones f")
|
47
|
+
bust(:matching_initials, "fred xavier jones", "fred jones")
|
48
|
+
bust(:matching_initials, "fred xavier jones", "fred q jones")
|
49
|
+
bust(:matching_initials, "fred x jones", "fred q jones")
|
50
|
+
bust(:matching_initials, "fred xavier jones", "homer simpson")
|
51
|
+
end
|
52
|
+
|
53
|
+
it "finds names that match on all but one long names" do
|
54
|
+
try(:matching_all_but_one, "john philip sousa", "john sousa")
|
55
|
+
try(:matching_all_but_one, "philip sousa", "philip john sousa")
|
56
|
+
bust(:matching_all_but_one, "john philip sousa", "philip john sousa")
|
57
|
+
try(:matching_all_but_one, "Helen Q. Glorpworth-Smythe", "helen smythe")
|
58
|
+
end
|
59
|
+
end
|
data/spec/lib/namor_spec.rb
CHANGED
@@ -28,6 +28,10 @@ describe "name extract" do
|
|
28
28
|
@namor.extract("SMITH, JOHN (Jacko) R").should == ['JOHN', 'R', 'SMITH', 'SMITH,JOHN R', 'SMITH,JOHN R']
|
29
29
|
end
|
30
30
|
|
31
|
+
it "should strip elements within square brackets" do
|
32
|
+
@namor.extract("SMITH, JOHN [Jacko] R").should == ['JOHN', 'R', 'SMITH', 'SMITH,JOHN R', 'SMITH,JOHN R']
|
33
|
+
end
|
34
|
+
|
31
35
|
it "should drop periods" do
|
32
36
|
@namor.extract("John R. Smith").should == ['JOHN', 'R', 'SMITH', 'SMITH,JOHN R', 'SMITH,JOHN R']
|
33
37
|
end
|
@@ -56,6 +60,7 @@ describe "name extract" do
|
|
56
60
|
it "should excise terms from optional suppression list" do
|
57
61
|
@namor.extract("Smith Jr, Edward M MD DDS").should == ['EDWARD', 'M', 'SMITH', 'SMITH,EDWARD M', 'SMITH,EDWARD M']
|
58
62
|
@namor.extract("Smith Jr, Edward M M.D.").should == ['EDWARD', 'M', 'SMITH', 'SMITH,EDWARD M', 'SMITH,EDWARD M']
|
63
|
+
@namor.extract("Smith Jr, Edward M M.D. [Oph,Ped Orth]").should == ['EDWARD', 'M', 'SMITH', 'SMITH,EDWARD M', 'SMITH,EDWARD M']
|
59
64
|
@namor.extract("Smith Jr, Edward III MD PHD").should == ['EDWARD', 'PHD', 'SMITH', 'SMITH,EDWARD PHD', 'SMITH,EDWARD PHD']
|
60
65
|
end
|
61
66
|
|
@@ -88,6 +93,8 @@ describe "name extract" do
|
|
88
93
|
@namor.extract_from_pieces_with_cluster(:first => 'John', :middle => 'M', :last => 'Smith').should == ['JOHN', 'M', 'SMITH', 'SMITH,JOHN M', 'SMITH,JOHN M', 'SMITH_JOHN_M', 'SMITH_JOHN_M']
|
89
94
|
@namor.extract_from_pieces_with_cluster(:first => 'Susan', :last => 'Smith-Jones').should == ['SUSAN', nil, 'SMITHJONES', 'SMITHJONES,SUSAN', 'JONES,SUSAN', 'SMITHJONES_SUSAN', 'JONES_SUSAN']
|
90
95
|
|
96
|
+
@namor.extract_from_pieces_with_cluster({:first => 'Susan', :last => 'Smith-Jones MD PHD'}, {:suppress => ['MD', 'PHD']}).should == ['SUSAN', nil, 'SMITHJONES', 'SMITHJONES,SUSAN', 'JONES,SUSAN', 'SMITHJONES_SUSAN', 'JONES_SUSAN']
|
97
|
+
|
91
98
|
@namor.extract_from_pieces(:last => 'Smith').should == [nil,nil, 'SMITH', 'SMITH', 'SMITH']
|
92
99
|
|
93
100
|
@namor.extract_from_pieces(:first => 'Mary', :last => 'Smith Jones').should == ['MARY',nil, 'SMITHJONES', 'SMITHJONES,MARY', 'JONES,MARY']
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: namor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.4
|
4
|
+
version: 0.6.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-10-29 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rake
|
@@ -89,9 +89,11 @@ files:
|
|
89
89
|
- README.md
|
90
90
|
- Rakefile
|
91
91
|
- lib/namor.rb
|
92
|
+
- lib/namor/comparator.rb
|
92
93
|
- lib/namor/namor.rb
|
93
94
|
- lib/namor/version.rb
|
94
95
|
- namor.gemspec
|
96
|
+
- spec/lib/comparator_spec.rb
|
95
97
|
- spec/lib/namor_spec.rb
|
96
98
|
- spec/spec_helper.rb
|
97
99
|
homepage: ''
|
@@ -108,7 +110,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
108
110
|
version: '0'
|
109
111
|
segments:
|
110
112
|
- 0
|
111
|
-
hash: -
|
113
|
+
hash: -145397464691454724
|
112
114
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
113
115
|
none: false
|
114
116
|
requirements:
|
@@ -117,7 +119,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
117
119
|
version: '0'
|
118
120
|
segments:
|
119
121
|
- 0
|
120
|
-
hash: -
|
122
|
+
hash: -145397464691454724
|
121
123
|
requirements: []
|
122
124
|
rubyforge_project:
|
123
125
|
rubygems_version: 1.8.24
|
@@ -125,5 +127,6 @@ signing_key:
|
|
125
127
|
specification_version: 3
|
126
128
|
summary: Parse & extract pieces of names
|
127
129
|
test_files:
|
130
|
+
- spec/lib/comparator_spec.rb
|
128
131
|
- spec/lib/namor_spec.rb
|
129
132
|
- spec/spec_helper.rb
|