namor 0.5.4 → 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
data/lib/namor.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  require_relative "namor/version"
2
2
  require_relative "namor/namor"
3
+ require_relative "namor/comparator"
3
4
 
4
5
  module Namor
5
6
  end
@@ -0,0 +1,85 @@
1
+ # MULTI-MATCHING via components
2
+ # go through all users
3
+ # group by distinct sets of components
4
+ # pick a (small) subset of component-keys, say <10. Maybe random sample?
5
+ # build a set of matching rules
6
+ # run the subset * the full corpus * the matching rules
7
+
8
# Set is used by #prep_missing_initials; it is only autoloaded on newer Rubies,
# so require it explicitly.
require "set"

module Namor
  # Compares name records against a corpus to find likely matches.
  #
  # A record is an Array of String name components. A component of length 1 is
  # treated as an initial; a component longer than 1 is a "long" name.
  # Long-name lists are compared with Array#==, so the rules are sensitive to
  # component order: pass components in a canonical order (the spec sorts them).
  class Comparator
    # Rules consulted by #evaluate, in order. Each entry names a method taking
    # two records and returning a truthy value when they match.
    RULES = [:missing_initials].freeze

    attr_reader :corpus

    # corpus - Array of records to search with #crunch.
    def initialize(corpus)
      @corpus = corpus

      prep_missing_initials
    end

    # Returns the corpus records, other than any equal to +record+ itself,
    # for which #evaluate reports a match.
    def crunch(record)
      (@corpus - [record]).select { |candidate| evaluate(record, candidate) }
    end

    # Returns true when any rule in RULES matches the pair, false otherwise.
    def evaluate(record, candidate)
      RULES.any? { |rule| send(rule, record, candidate) }
    end

    # don't need an 'identical' test - assuming that the input record does not appear in the corpus
    # def identical(a,b)
    #   a == b
    # end

    # must have at least 2 long (non-initial-only) components in each
    # those long parts must be identical
    # only one of the names can have any initials
    def missing_initials(a, b)
      longnames_a = long_names(a)
      longnames_b = long_names(b)

      longnames_a.size >= 2 &&
        longnames_b.size >= 2 &&
        longnames_a == longnames_b &&
        (initials(a).empty? || initials(b).empty?)
    end

    # Builds @corpus_missing_initials: the Set of long-name lists (initials
    # dropped) of every corpus record that has at least two long names.
    # Returns that Set. No method in this class reads it yet.
    def prep_missing_initials
      @corpus_missing_initials = corpus.each_with_object(Set.new) do |rec, set|
        without_initials = long_names(rec)
        set << without_initials if without_initials.size >= 2
      end
    end

    # must have at least 1 long (non-initial-only) component in each
    # all initials should correspond to non-matched longnames in the other input
    # (and every non-matched longname must be covered by an initial on the other side)
    def matching_initials(a, b)
      longnames_a = long_names(a)
      longnames_b = long_names(b)

      return false if longnames_a.empty? || longnames_b.empty?

      # First letters of the long names that appear on one side only.
      unmatched_inits_a = (longnames_a - longnames_b).map { |s| s[0] }
      unmatched_inits_b = (longnames_b - longnames_a).map { |s| s[0] }

      initials(a) == unmatched_inits_b && initials(b) == unmatched_inits_a
    end

    # ignore any initials. look for cases where there is exactly one name component that differs between the inputs.
    def matching_all_but_one(a, b)
      longnames_a = long_names(a)
      longnames_b = long_names(b)

      # Symmetric difference: long names present in exactly one of the inputs.
      ((longnames_a | longnames_b) - (longnames_a & longnames_b)).size == 1
    end

    private

    # Components longer than one character.
    def long_names(components)
      components.select { |s| s.length > 1 }
    end

    # Components that are exactly one character long.
    def initials(components)
      components.select { |s| s.length == 1 }
    end
  end
end
data/lib/namor/namor.rb CHANGED
@@ -20,6 +20,11 @@ class Namor::Namor
20
20
  # Regexp.new(bits.join('|'))
21
21
  end
22
22
 
23
# Upcases +name+ and deletes every match of the suppression pattern for
# +supplist+. The pattern comes from suppression_re and is memoized in
# @re_cache under +supplist+ (the cache is filled even when +name+ is nil).
# A nil/false +name+ is returned unchanged.
def suppress(name, supplist)
  pattern = (@re_cache[supplist] ||= suppression_re(supplist))
  return name unless name

  name.upcase.gsub(pattern, '')
end
27
+
23
28
  # clean up a single name component
24
29
  # * output all converted to uppercase
25
30
  # * strip leading ZZ+ or XX+ (frequently used as invalid-account prefixes)
@@ -31,7 +36,7 @@ class Namor::Namor
31
36
  def scrub(name, opts = {})
32
37
  @re_cache[opts[:suppress]] ||= suppression_re(opts[:suppress])
33
38
 
34
- name && name.upcase.gsub(/^[ZX]{2,}/, '').gsub(@re_cache[opts[:suppress]], '').gsub(/\b(JR|SR|II|III|IV)\b/i, '').gsub(/\([^\(]*\)/, '').gsub(/\./, ' ').gsub(/[_'\&]/, '').gsub(/,\s*$/, '').gsub(/ +/, ' ').strip
39
+ name && name.upcase.gsub(/^[ZX]{2,}/, '').gsub(@re_cache[opts[:suppress]], '').gsub(/\b(JR|SR|II|III|IV)\b/i, '').gsub(/\([^\)]*\)/, '').gsub(/\[[^\]]*\]/, '').gsub(/\./, ' ').gsub(/[_'\&]/, '').gsub(/,\s*$/, '').gsub(/ +/, ' ').strip
35
40
  end
36
41
 
37
42
  def fullscrub(name, opts = {})
@@ -44,8 +49,9 @@ class Namor::Namor
44
49
  s && s.gsub(/[- ]/, '')
45
50
  end
46
51
 
47
- def demaiden(lastname)
52
+ def demaiden(lastname, opts = {})
48
53
  return [nil,nil] unless lastname && !lastname.empty?
54
+ lastname = suppress(lastname, opts[:suppress]) if opts[:suppress]
49
55
  if lastname =~ /\-/
50
56
  [lastname.upcase.gsub(/ /, ''), lastname.split(/\-/).last.gsub(/ /, '')]
51
57
  else
@@ -114,17 +120,17 @@ class Namor::Namor
114
120
  ary << ary[4].gsub(/\W/, '_')
115
121
  end
116
122
 
117
- def extract_from_pieces(hash)
123
+ def extract_from_pieces(hash, opts = {})
118
124
  assemble(
119
- scrub(hash[:first]),
120
- scrub(hash[:middle]),
121
- scrub_and_squash(hash[:last]),
122
- scrub_and_squash((s = demaiden(hash[:last])) && s.last)
125
+ scrub(hash[:first], opts),
126
+ scrub(hash[:middle], opts),
127
+ scrub_and_squash(hash[:last], opts),
128
+ scrub_and_squash((s = demaiden(hash[:last], opts)) && s.last, opts)
123
129
  )
124
130
  end
125
131
 
126
- def extract_from_pieces_with_cluster(hash)
127
- ary = extract_from_pieces(hash)
132
+ def extract_from_pieces_with_cluster(hash, opts = {})
133
+ ary = extract_from_pieces(hash, opts)
128
134
  ary << ary[3].gsub(/\W/, '_')
129
135
  ary << ary[4].gsub(/\W/, '_')
130
136
  end
data/lib/namor/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Namor
2
- VERSION = "0.5.4"
2
+ VERSION = "0.6.0"
3
3
  end
@@ -0,0 +1,59 @@
1
+ require "spec_helper"
2
+
3
describe "Comparator" do
  # Splits a free-form name on whitespace and , . _ - into upper-cased,
  # sorted components, the record shape handed to the comparator here.
  def explode(name)
    tokens = name.gsub(/[,._-]/, ' ').split
    tokens.map(&:upcase).sort
  end

  # Expects +rule+ to accept the two names as a match.
  def try(rule, name1, name2)
    outcome = @comp.send(rule, explode(name1), explode(name2))
    outcome.should be_true
  end

  # Expects +rule+ to reject the two names.
  def bust(rule, name1, name2)
    outcome = @comp.send(rule, explode(name1), explode(name2))
    outcome.should be_false
  end

  before :all do
    corpus = [
      "michael g palmer",
      "francis l palmer",
      "michael palmer"
    ].map { |name| explode(name) }
    @comp = Namor::Comparator.new(corpus)
  end

  it "finds names that match without initials" do
    [
      [:try,  "michael palmer", "michael g palmer"],
      [:try,  "michael palmer", "Q michael palmer"],
      [:try,  "michael palmer", "Michael N Palmer x"],
      [:bust, "michael palmer", "Michael P"],
      [:bust, "michael palmer", "Michael John Palmer"]
    ].each do |expectation, left, right|
      send(expectation, :missing_initials, left, right)
    end

    # query name => the single corpus entry crunch should return for it
    [
      ["michael palmer",   "michael g palmer"],
      ["palmer michael",   "michael g palmer"],
      ["michael g palmer", "michael palmer"]
    ].each do |query, expected|
      matches = @comp.crunch(explode(query))
      matches.should == [explode(expected)]
    end
  end

  it "finds names that match initials to names" do
    [
      [:try,  "fred jones",        "f jones"],
      [:try,  "fred jones",        "jones f"],
      [:try,  "fred jones",        "fred j"],
      [:try,  "fred xavier jones", "fred x jones"],
      [:try,  "fred xavier jones", "xavier jones f"],
      [:bust, "fred xavier jones", "fred jones"],
      [:bust, "fred xavier jones", "fred q jones"],
      [:bust, "fred x jones",      "fred q jones"],
      [:bust, "fred xavier jones", "homer simpson"]
    ].each do |expectation, left, right|
      send(expectation, :matching_initials, left, right)
    end
  end

  it "finds names that match on all but one long names" do
    [
      [:try,  "john philip sousa",          "john sousa"],
      [:try,  "philip sousa",               "philip john sousa"],
      [:bust, "john philip sousa",          "philip john sousa"],
      [:try,  "Helen Q. Glorpworth-Smythe", "helen smythe"]
    ].each do |expectation, left, right|
      send(expectation, :matching_all_but_one, left, right)
    end
  end
end
@@ -28,6 +28,10 @@ describe "name extract" do
28
28
  @namor.extract("SMITH, JOHN (Jacko) R").should == ['JOHN', 'R', 'SMITH', 'SMITH,JOHN R', 'SMITH,JOHN R']
29
29
  end
30
30
 
31
+ it "should strip elements within square brackets" do
32
+ @namor.extract("SMITH, JOHN [Jacko] R").should == ['JOHN', 'R', 'SMITH', 'SMITH,JOHN R', 'SMITH,JOHN R']
33
+ end
34
+
31
35
  it "should drop periods" do
32
36
  @namor.extract("John R. Smith").should == ['JOHN', 'R', 'SMITH', 'SMITH,JOHN R', 'SMITH,JOHN R']
33
37
  end
@@ -56,6 +60,7 @@ describe "name extract" do
56
60
  it "should excise terms from optional suppression list" do
57
61
  @namor.extract("Smith Jr, Edward M MD DDS").should == ['EDWARD', 'M', 'SMITH', 'SMITH,EDWARD M', 'SMITH,EDWARD M']
58
62
  @namor.extract("Smith Jr, Edward M M.D.").should == ['EDWARD', 'M', 'SMITH', 'SMITH,EDWARD M', 'SMITH,EDWARD M']
63
+ @namor.extract("Smith Jr, Edward M M.D. [Oph,Ped Orth]").should == ['EDWARD', 'M', 'SMITH', 'SMITH,EDWARD M', 'SMITH,EDWARD M']
59
64
  @namor.extract("Smith Jr, Edward III MD PHD").should == ['EDWARD', 'PHD', 'SMITH', 'SMITH,EDWARD PHD', 'SMITH,EDWARD PHD']
60
65
  end
61
66
 
@@ -88,6 +93,8 @@ describe "name extract" do
88
93
  @namor.extract_from_pieces_with_cluster(:first => 'John', :middle => 'M', :last => 'Smith').should == ['JOHN', 'M', 'SMITH', 'SMITH,JOHN M', 'SMITH,JOHN M', 'SMITH_JOHN_M', 'SMITH_JOHN_M']
89
94
  @namor.extract_from_pieces_with_cluster(:first => 'Susan', :last => 'Smith-Jones').should == ['SUSAN', nil, 'SMITHJONES', 'SMITHJONES,SUSAN', 'JONES,SUSAN', 'SMITHJONES_SUSAN', 'JONES_SUSAN']
90
95
 
96
+ @namor.extract_from_pieces_with_cluster({:first => 'Susan', :last => 'Smith-Jones MD PHD'}, {:suppress => ['MD', 'PHD']}).should == ['SUSAN', nil, 'SMITHJONES', 'SMITHJONES,SUSAN', 'JONES,SUSAN', 'SMITHJONES_SUSAN', 'JONES_SUSAN']
97
+
91
98
  @namor.extract_from_pieces(:last => 'Smith').should == [nil,nil, 'SMITH', 'SMITH', 'SMITH']
92
99
 
93
100
  @namor.extract_from_pieces(:first => 'Mary', :last => 'Smith Jones').should == ['MARY',nil, 'SMITHJONES', 'SMITHJONES,MARY', 'JONES,MARY']
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: namor
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.4
4
+ version: 0.6.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-09-12 00:00:00.000000000 Z
12
+ date: 2012-10-29 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rake
@@ -89,9 +89,11 @@ files:
89
89
  - README.md
90
90
  - Rakefile
91
91
  - lib/namor.rb
92
+ - lib/namor/comparator.rb
92
93
  - lib/namor/namor.rb
93
94
  - lib/namor/version.rb
94
95
  - namor.gemspec
96
+ - spec/lib/comparator_spec.rb
95
97
  - spec/lib/namor_spec.rb
96
98
  - spec/spec_helper.rb
97
99
  homepage: ''
@@ -108,7 +110,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
108
110
  version: '0'
109
111
  segments:
110
112
  - 0
111
- hash: -2394129238240412470
113
+ hash: -145397464691454724
112
114
  required_rubygems_version: !ruby/object:Gem::Requirement
113
115
  none: false
114
116
  requirements:
@@ -117,7 +119,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
117
119
  version: '0'
118
120
  segments:
119
121
  - 0
120
- hash: -2394129238240412470
122
+ hash: -145397464691454724
121
123
  requirements: []
122
124
  rubyforge_project:
123
125
  rubygems_version: 1.8.24
@@ -125,5 +127,6 @@ signing_key:
125
127
  specification_version: 3
126
128
  summary: Parse & extract pieces of names
127
129
  test_files:
130
+ - spec/lib/comparator_spec.rb
128
131
  - spec/lib/namor_spec.rb
129
132
  - spec/spec_helper.rb