namor 0.5.4 → 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/namor.rb +1 -0
- data/lib/namor/comparator.rb +85 -0
- data/lib/namor/namor.rb +15 -9
- data/lib/namor/version.rb +1 -1
- data/spec/lib/comparator_spec.rb +59 -0
- data/spec/lib/namor_spec.rb +7 -0
- metadata +7 -4
data/lib/namor.rb
CHANGED
(diff not shown here: +1 -0)
data/lib/namor/comparator.rb
ADDED
@@ -0,0 +1,85 @@
|
|
1
|
+
# MULTI-MATCHING via components
|
2
|
+
# go through all users
|
3
|
+
# group by distinct sets of components
|
4
|
+
# pick a (small) subset of component-keys, say <10. Maybe random sample?
|
5
|
+
# build a set of matching rules
|
6
|
+
# run the subset * the full corpus * the matching rules
|
7
|
+
|
8
|
+
# Set is used by #prep_missing_initials. It is only autoloaded from Ruby 3.2 on,
# so load it here rather than relying on whichever file requires this one.
require 'set'

# Finds records in a corpus that look like variants of a given name.
#
# A "record" (a corpus entry, or an argument to any rule method) is an Array of
# name-component Strings. A component of length 1 is treated as an initial;
# anything longer is a "long name". The rules compare components without
# regard to their order inside a record.
class Namor::Comparator
  # Rules tried by #evaluate, in order. Each symbol names a two-argument
  # predicate method of this class. #matching_initials and
  # #matching_all_but_one are not enabled here; they can be called directly.
  RULES = [:missing_initials].freeze

  attr_reader :corpus

  # corpus - Array of records to search with #crunch.
  def initialize(corpus)
    @corpus = corpus

    prep_missing_initials
  end

  # Returns the corpus records, in corpus order, that #evaluate accepts as a
  # match for +record+. Entries equal to +record+ itself are left out first.
  def crunch(record)
    (@corpus - [record]).select { |candidate| evaluate(record, candidate) }
  end

  # Returns true when any rule in RULES accepts the pair, false otherwise.
  def evaluate(record, candidate)
    RULES.any? { |rule| send(rule, record, candidate) }
  end

  # don't need an 'identical' test - assuming that the input record does not appear in the corpus
  # def identical(a,b)
  #   a == b
  # end

  # must have at least 2 long (non-initial-only) components in each
  # those long parts must be identical
  # only one of the names can have any initials
  #
  # Returns true or false.
  def missing_initials(a, b)
    # Sorted so the answer does not depend on the order of components.
    longnames_a = long_names(a).sort
    longnames_b = long_names(b).sort

    # Equality below also guarantees that b has at least 2 long names.
    longnames_a.count >= 2 &&
      longnames_a == longnames_b &&
      (initials(a).empty? || initials(b).empty?)
  end

  # Builds @corpus_missing_initials: the Set of initial-free versions of every
  # corpus record that has at least 2 long names. Returns that Set.
  #
  # NOTE(review): nothing in this class reads the Set yet - presumably it is
  # meant as a lookup index for #missing_initials; confirm before removing.
  def prep_missing_initials
    @corpus_missing_initials = corpus.each_with_object(Set.new) do |rec, set|
      without_initials = long_names(rec)
      set << without_initials if without_initials.count >= 2
    end
  end

  # must have at least 1 long (non-initial-only) component in each
  # all initials should correspond to non-matched longnames in the other input
  #
  # NOTE(review): the code does not require the two records to share any long
  # name (FRED S / F SMITH matches) - confirm that this is intended.
  #
  # Returns true or false.
  def matching_initials(a, b)
    longnames_a = long_names(a)
    longnames_b = long_names(b)

    return false if longnames_a.empty? || longnames_b.empty?

    # First letters of the long names that the other record does not spell out.
    expected_inits_a = (longnames_b - longnames_a).map { |s| s[0] }
    expected_inits_b = (longnames_a - longnames_b).map { |s| s[0] }

    # Sorted on both sides so component order cannot cause a false mismatch.
    initials(a).sort == expected_inits_a.sort &&
      initials(b).sort == expected_inits_b.sort
  end

  # ignore any initials. look for cases where there is exactly one name component that differs between the inputs.
  #
  # Returns true or false.
  def matching_all_but_one(a, b)
    longnames_a = long_names(a)
    longnames_b = long_names(b)

    # Symmetric difference: long names present in exactly one of the records.
    ((longnames_a | longnames_b) - (longnames_a & longnames_b)).count == 1
  end

  private

  # Components longer than one character.
  def long_names(record)
    record.select { |s| s.length > 1 }
  end

  # Components that are exactly one character long.
  def initials(record)
    record.select { |s| s.length == 1 }
  end
end
|
data/lib/namor/namor.rb
CHANGED
@@ -20,6 +20,11 @@ class Namor::Namor
|
|
20
20
|
# Regexp.new(bits.join('|'))
|
21
21
|
end
|
22
22
|
|
23
|
+
# Upper-cases +name+ and deletes everything matched by the suppression
# pattern for +supplist+. The pattern is built by suppression_re on first use
# and kept in @re_cache, keyed by +supplist+. A nil/false +name+ is returned
# unchanged.
def suppress(name, supplist)
  pattern = (@re_cache[supplist] ||= suppression_re(supplist))
  return name unless name

  name.upcase.gsub(pattern, '')
end
|
27
|
+
|
23
28
|
# clean up a single name component
|
24
29
|
# * output all converted to uppercase
|
25
30
|
# * strip leading ZZ+ or XX+ (frequently used as invalid-account prefixes)
|
@@ -31,7 +36,7 @@ class Namor::Namor
|
|
31
36
|
def scrub(name, opts = {})
|
32
37
|
@re_cache[opts[:suppress]] ||= suppression_re(opts[:suppress])
|
33
38
|
|
34
|
-
name && name.upcase.gsub(/^[ZX]{2,}/, '').gsub(@re_cache[opts[:suppress]], '').gsub(/\b(JR|SR|II|III|IV)\b/i, '').gsub(/\([^\
|
39
|
+
name && name.upcase.gsub(/^[ZX]{2,}/, '').gsub(@re_cache[opts[:suppress]], '').gsub(/\b(JR|SR|II|III|IV)\b/i, '').gsub(/\([^\)]*\)/, '').gsub(/\[[^\]]*\]/, '').gsub(/\./, ' ').gsub(/[_'\&]/, '').gsub(/,\s*$/, '').gsub(/ +/, ' ').strip
|
35
40
|
end
|
36
41
|
|
37
42
|
def fullscrub(name, opts = {})
|
@@ -44,8 +49,9 @@ class Namor::Namor
|
|
44
49
|
s && s.gsub(/[- ]/, '')
|
45
50
|
end
|
46
51
|
|
47
|
-
def demaiden(lastname)
|
52
|
+
def demaiden(lastname, opts = {})
|
48
53
|
return [nil,nil] unless lastname && !lastname.empty?
|
54
|
+
lastname = suppress(lastname, opts[:suppress]) if opts[:suppress]
|
49
55
|
if lastname =~ /\-/
|
50
56
|
[lastname.upcase.gsub(/ /, ''), lastname.split(/\-/).last.gsub(/ /, '')]
|
51
57
|
else
|
@@ -114,17 +120,17 @@ class Namor::Namor
|
|
114
120
|
ary << ary[4].gsub(/\W/, '_')
|
115
121
|
end
|
116
122
|
|
117
|
-
def extract_from_pieces(hash)
|
123
|
+
def extract_from_pieces(hash, opts = {})
|
118
124
|
assemble(
|
119
|
-
scrub(hash[:first]),
|
120
|
-
scrub(hash[:middle]),
|
121
|
-
scrub_and_squash(hash[:last]),
|
122
|
-
scrub_and_squash((s = demaiden(hash[:last])) && s.last)
|
125
|
+
scrub(hash[:first], opts),
|
126
|
+
scrub(hash[:middle], opts),
|
127
|
+
scrub_and_squash(hash[:last], opts),
|
128
|
+
scrub_and_squash((s = demaiden(hash[:last], opts)) && s.last, opts)
|
123
129
|
)
|
124
130
|
end
|
125
131
|
|
126
|
-
def extract_from_pieces_with_cluster(hash)
|
127
|
-
ary = extract_from_pieces(hash)
|
132
|
+
def extract_from_pieces_with_cluster(hash, opts = {})
|
133
|
+
ary = extract_from_pieces(hash, opts)
|
128
134
|
ary << ary[3].gsub(/\W/, '_')
|
129
135
|
ary << ary[4].gsub(/\W/, '_')
|
130
136
|
end
|
data/lib/namor/version.rb
CHANGED
(diff not shown here: +1 -1)
data/spec/lib/comparator_spec.rb
ADDED
@@ -0,0 +1,59 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
describe "Comparator" do
  # Breaks a free-form name into upper-cased components, sorted alphabetically.
  # Commas, periods, underscores and hyphens are treated as separators.
  def explode(name)
    pieces = name.gsub(/[,._-]/, ' ').split
    pieces.map { |piece| piece.upcase }.sort
  end

  # Expects +rule+ to accept the pair of names.
  def try(rule, name1, name2)
    verdict = @comp.send(rule, explode(name1), explode(name2))
    verdict.should be_true
  end

  # Expects +rule+ to reject the pair of names.
  def bust(rule, name1, name2)
    verdict = @comp.send(rule, explode(name1), explode(name2))
    verdict.should be_false
  end

  # One comparator, built once, shared by every example below.
  before :all do
    corpus = [
      "michael g palmer",
      "francis l palmer",
      "michael palmer"
    ].map { |entry| explode(entry) }

    @comp = Namor::Comparator.new(corpus)
  end

  it "finds names that match without initials" do
    try(:missing_initials, "michael palmer", "michael g palmer")
    try(:missing_initials, "michael palmer", "Q michael palmer")
    try(:missing_initials, "michael palmer", "Michael N Palmer x")
    bust(:missing_initials, "michael palmer", "Michael P")
    bust(:missing_initials, "michael palmer", "Michael John Palmer")

    # crunch searches the corpus set up in before(:all)
    @comp.crunch(explode("michael palmer")).should == [explode("michael g palmer")]
    @comp.crunch(explode("palmer michael")).should == [explode("michael g palmer")]
    @comp.crunch(explode("michael g palmer")).should == [explode("michael palmer")]
  end

  it "finds names that match initials to names" do
    try(:matching_initials, "fred jones", "f jones")
    try(:matching_initials, "fred jones", "jones f")
    try(:matching_initials, "fred jones", "fred j")
    try(:matching_initials, "fred xavier jones", "fred x jones")
    try(:matching_initials, "fred xavier jones", "xavier jones f")
    bust(:matching_initials, "fred xavier jones", "fred jones")
    bust(:matching_initials, "fred xavier jones", "fred q jones")
    bust(:matching_initials, "fred x jones", "fred q jones")
    bust(:matching_initials, "fred xavier jones", "homer simpson")
  end

  it "finds names that match on all but one long names" do
    try(:matching_all_but_one, "john philip sousa", "john sousa")
    try(:matching_all_but_one, "philip sousa", "philip john sousa")
    bust(:matching_all_but_one, "john philip sousa", "philip john sousa")
    try(:matching_all_but_one, "Helen Q. Glorpworth-Smythe", "helen smythe")
  end
end
|
data/spec/lib/namor_spec.rb
CHANGED
@@ -28,6 +28,10 @@ describe "name extract" do
|
|
28
28
|
@namor.extract("SMITH, JOHN (Jacko) R").should == ['JOHN', 'R', 'SMITH', 'SMITH,JOHN R', 'SMITH,JOHN R']
|
29
29
|
end
|
30
30
|
|
31
|
+
it "should strip elements within square brackets" do
|
32
|
+
@namor.extract("SMITH, JOHN [Jacko] R").should == ['JOHN', 'R', 'SMITH', 'SMITH,JOHN R', 'SMITH,JOHN R']
|
33
|
+
end
|
34
|
+
|
31
35
|
it "should drop periods" do
|
32
36
|
@namor.extract("John R. Smith").should == ['JOHN', 'R', 'SMITH', 'SMITH,JOHN R', 'SMITH,JOHN R']
|
33
37
|
end
|
@@ -56,6 +60,7 @@ describe "name extract" do
|
|
56
60
|
it "should excise terms from optional suppression list" do
|
57
61
|
@namor.extract("Smith Jr, Edward M MD DDS").should == ['EDWARD', 'M', 'SMITH', 'SMITH,EDWARD M', 'SMITH,EDWARD M']
|
58
62
|
@namor.extract("Smith Jr, Edward M M.D.").should == ['EDWARD', 'M', 'SMITH', 'SMITH,EDWARD M', 'SMITH,EDWARD M']
|
63
|
+
@namor.extract("Smith Jr, Edward M M.D. [Oph,Ped Orth]").should == ['EDWARD', 'M', 'SMITH', 'SMITH,EDWARD M', 'SMITH,EDWARD M']
|
59
64
|
@namor.extract("Smith Jr, Edward III MD PHD").should == ['EDWARD', 'PHD', 'SMITH', 'SMITH,EDWARD PHD', 'SMITH,EDWARD PHD']
|
60
65
|
end
|
61
66
|
|
@@ -88,6 +93,8 @@ describe "name extract" do
|
|
88
93
|
@namor.extract_from_pieces_with_cluster(:first => 'John', :middle => 'M', :last => 'Smith').should == ['JOHN', 'M', 'SMITH', 'SMITH,JOHN M', 'SMITH,JOHN M', 'SMITH_JOHN_M', 'SMITH_JOHN_M']
|
89
94
|
@namor.extract_from_pieces_with_cluster(:first => 'Susan', :last => 'Smith-Jones').should == ['SUSAN', nil, 'SMITHJONES', 'SMITHJONES,SUSAN', 'JONES,SUSAN', 'SMITHJONES_SUSAN', 'JONES_SUSAN']
|
90
95
|
|
96
|
+
@namor.extract_from_pieces_with_cluster({:first => 'Susan', :last => 'Smith-Jones MD PHD'}, {:suppress => ['MD', 'PHD']}).should == ['SUSAN', nil, 'SMITHJONES', 'SMITHJONES,SUSAN', 'JONES,SUSAN', 'SMITHJONES_SUSAN', 'JONES_SUSAN']
|
97
|
+
|
91
98
|
@namor.extract_from_pieces(:last => 'Smith').should == [nil,nil, 'SMITH', 'SMITH', 'SMITH']
|
92
99
|
|
93
100
|
@namor.extract_from_pieces(:first => 'Mary', :last => 'Smith Jones').should == ['MARY',nil, 'SMITHJONES', 'SMITHJONES,MARY', 'JONES,MARY']
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: namor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.4
|
4
|
+
version: 0.6.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-10-29 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rake
|
@@ -89,9 +89,11 @@ files:
|
|
89
89
|
- README.md
|
90
90
|
- Rakefile
|
91
91
|
- lib/namor.rb
|
92
|
+
- lib/namor/comparator.rb
|
92
93
|
- lib/namor/namor.rb
|
93
94
|
- lib/namor/version.rb
|
94
95
|
- namor.gemspec
|
96
|
+
- spec/lib/comparator_spec.rb
|
95
97
|
- spec/lib/namor_spec.rb
|
96
98
|
- spec/spec_helper.rb
|
97
99
|
homepage: ''
|
@@ -108,7 +110,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
108
110
|
version: '0'
|
109
111
|
segments:
|
110
112
|
- 0
|
111
|
-
hash: -
|
113
|
+
hash: -145397464691454724
|
112
114
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
113
115
|
none: false
|
114
116
|
requirements:
|
@@ -117,7 +119,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
117
119
|
version: '0'
|
118
120
|
segments:
|
119
121
|
- 0
|
120
|
-
hash: -
|
122
|
+
hash: -145397464691454724
|
121
123
|
requirements: []
|
122
124
|
rubyforge_project:
|
123
125
|
rubygems_version: 1.8.24
|
@@ -125,5 +127,6 @@ signing_key:
|
|
125
127
|
specification_version: 3
|
126
128
|
summary: Parse & extract pieces of names
|
127
129
|
test_files:
|
130
|
+
- spec/lib/comparator_spec.rb
|
128
131
|
- spec/lib/namor_spec.rb
|
129
132
|
- spec/spec_helper.rb
|