namor 0.5.4 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/namor.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  require_relative "namor/version"
2
2
  require_relative "namor/namor"
3
+ require_relative "namor/comparator"
3
4
 
4
5
  module Namor
5
6
  end
@@ -0,0 +1,85 @@
1
+ # MULTI-MATCHING via components
2
+ # go through all users
3
+ # group by distinct sets of components
4
+ # pick a (small) subset of component-keys, say <10. Maybe random sample?
5
+ # build a set of matching rules
6
+ # run the subset * the full corpus * the matching rules
7
+
8
require "set"

module Namor
  # Compares an "exploded" name (an Array of String components) against a
  # corpus of other exploded names and reports which corpus entries look like
  # the same name.
  #
  # A component of length 1 is treated as an initial; any longer component is
  # treated as a long (full) name part. Components are compared as unordered
  # collections, so callers do not have to pre-sort them.
  class Comparator
    # Rules consulted by #evaluate, in order. Each entry names a public
    # predicate that takes two component arrays.
    RULES = [:missing_initials].freeze

    attr_reader :corpus

    # corpus - Array of records, each record an Array of String components.
    def initialize(corpus)
      @corpus = corpus

      prep_missing_initials
    end

    # Returns the corpus records, other than +record+ itself, for which
    # #evaluate is true. Every corpus entry equal to +record+ is excluded.
    def crunch(record)
      (@corpus - [record]).select { |candidate| evaluate(record, candidate) }
    end

    # True when at least one rule in RULES accepts the pair.
    def evaluate(record, candidate)
      RULES.any? { |rule| send(rule, record, candidate) }
    end

    # don't need an 'identical' test - assuming that the input record does not appear in the corpus
    # def identical(a,b)
    #   a == b
    # end

    # must have at least 2 long (non-initial-only) components in each
    # those long parts must be identical (in any order)
    # only one of the names can have any initials
    def missing_initials(a, b)
      long_a = long_parts(a).sort
      long_b = long_parts(b).sort

      # Equal arrays have equal sizes, so checking one size covers both sides.
      return false unless long_a.size >= 2 && long_a == long_b

      initials(a).empty? || initials(b).empty?
    end

    # Builds the set of long-name-only keys for every corpus record that has
    # at least 2 long components. Keys are sorted so they use the same
    # canonical form that #missing_initials compares. Returns the set.
    def prep_missing_initials
      @corpus_missing_initials = corpus.each_with_object(Set.new) do |rec, set|
        without_initials = long_parts(rec).sort
        set << without_initials if without_initials.size >= 2
      end
    end

    # must have at least 1 long (non-initial-only) component in each
    # all initials should correspond to non-matched longnames in the other input
    def matching_initials(a, b)
      long_a = long_parts(a)
      long_b = long_parts(b)

      return false if long_a.empty? || long_b.empty?

      # First letters of the long parts that the other side does not spell out.
      first_letters_a = (long_a - long_b).map { |s| s[0] }.sort
      first_letters_b = (long_b - long_a).map { |s| s[0] }.sort

      initials(a).sort == first_letters_b && initials(b).sort == first_letters_a
    end

    # ignore any initials. true when exactly one long component appears in
    # one input but not in the other.
    def matching_all_but_one(a, b)
      long_a = long_parts(a)
      long_b = long_parts(b)

      ((long_a | long_b) - (long_a & long_b)).count == 1
    end

    private

    # Components longer than a single character.
    def long_parts(components)
      components.select { |s| s.length > 1 }
    end

    # Components that are exactly one character long.
    def initials(components)
      components.select { |s| s.length == 1 }
    end
  end
end
data/lib/namor/namor.rb CHANGED
@@ -20,6 +20,11 @@ class Namor::Namor
20
20
  # Regexp.new(bits.join('|'))
21
21
  end
22
22
 
23
+ def suppress(name, supplist)
24
+ @re_cache[supplist] ||= suppression_re(supplist)
25
+ name && name.upcase.gsub(@re_cache[supplist], '')
26
+ end
27
+
23
28
  # clean up a single name component
24
29
  # * output all converted to uppercase
25
30
  # * strip leading ZZ+ or XX+ (frequently used as invalid-account prefixes)
@@ -31,7 +36,7 @@ class Namor::Namor
31
36
  def scrub(name, opts = {})
32
37
  @re_cache[opts[:suppress]] ||= suppression_re(opts[:suppress])
33
38
 
34
- name && name.upcase.gsub(/^[ZX]{2,}/, '').gsub(@re_cache[opts[:suppress]], '').gsub(/\b(JR|SR|II|III|IV)\b/i, '').gsub(/\([^\(]*\)/, '').gsub(/\./, ' ').gsub(/[_'\&]/, '').gsub(/,\s*$/, '').gsub(/ +/, ' ').strip
39
+ name && name.upcase.gsub(/^[ZX]{2,}/, '').gsub(@re_cache[opts[:suppress]], '').gsub(/\b(JR|SR|II|III|IV)\b/i, '').gsub(/\([^\)]*\)/, '').gsub(/\[[^\]]*\]/, '').gsub(/\./, ' ').gsub(/[_'\&]/, '').gsub(/,\s*$/, '').gsub(/ +/, ' ').strip
35
40
  end
36
41
 
37
42
  def fullscrub(name, opts = {})
@@ -44,8 +49,9 @@ class Namor::Namor
44
49
  s && s.gsub(/[- ]/, '')
45
50
  end
46
51
 
47
- def demaiden(lastname)
52
+ def demaiden(lastname, opts = {})
48
53
  return [nil,nil] unless lastname && !lastname.empty?
54
+ lastname = suppress(lastname, opts[:suppress]) if opts[:suppress]
49
55
  if lastname =~ /\-/
50
56
  [lastname.upcase.gsub(/ /, ''), lastname.split(/\-/).last.gsub(/ /, '')]
51
57
  else
@@ -114,17 +120,17 @@ class Namor::Namor
114
120
  ary << ary[4].gsub(/\W/, '_')
115
121
  end
116
122
 
117
- def extract_from_pieces(hash)
123
+ def extract_from_pieces(hash, opts = {})
118
124
  assemble(
119
- scrub(hash[:first]),
120
- scrub(hash[:middle]),
121
- scrub_and_squash(hash[:last]),
122
- scrub_and_squash((s = demaiden(hash[:last])) && s.last)
125
+ scrub(hash[:first], opts),
126
+ scrub(hash[:middle], opts),
127
+ scrub_and_squash(hash[:last], opts),
128
+ scrub_and_squash((s = demaiden(hash[:last], opts)) && s.last, opts)
123
129
  )
124
130
  end
125
131
 
126
- def extract_from_pieces_with_cluster(hash)
127
- ary = extract_from_pieces(hash)
132
+ def extract_from_pieces_with_cluster(hash, opts = {})
133
+ ary = extract_from_pieces(hash, opts)
128
134
  ary << ary[3].gsub(/\W/, '_')
129
135
  ary << ary[4].gsub(/\W/, '_')
130
136
  end
data/lib/namor/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Namor
2
- VERSION = "0.5.4"
2
+ VERSION = "0.6.0"
3
3
  end
@@ -0,0 +1,59 @@
1
+ require "spec_helper"
2
+
3
describe "Comparator" do
  # Splits a display name into sorted, upcased components. Commas, periods,
  # underscores and hyphens are all treated as separators.
  def explode(name)
    tokens = name.gsub(/[,._-]/, ' ').split
    tokens.map(&:upcase).sort
  end

  # Result of applying +rule+ to the exploded forms of both names.
  def verdict(rule, name1, name2)
    @comp.send(rule, explode(name1), explode(name2))
  end

  # Expects the rule to accept the pair.
  def try(rule, name1, name2)
    verdict(rule, name1, name2).should be_true
  end

  # Expects the rule to reject the pair.
  def bust(rule, name1, name2)
    verdict(rule, name1, name2).should be_false
  end

  before :all do
    corpus = ["michael g palmer", "francis l palmer", "michael palmer"].map do |name|
      explode(name)
    end
    @comp = Namor::Comparator.new(corpus)
  end

  it "finds names that match without initials" do
    [
      "michael g palmer",
      "Q michael palmer",
      "Michael N Palmer x"
    ].each { |other| try(:missing_initials, "michael palmer", other) }

    [
      "Michael P",
      "Michael John Palmer"
    ].each { |other| bust(:missing_initials, "michael palmer", other) }

    # query name => the single corpus entry that crunch should return
    {
      "michael palmer" => "michael g palmer",
      "palmer michael" => "michael g palmer",
      "michael g palmer" => "michael palmer"
    }.each do |query, expected|
      matches = @comp.crunch(explode(query))
      matches.should == [explode(expected)]
    end
  end

  it "finds names that match initials to names" do
    [
      ["fred jones", "f jones"],
      ["fred jones", "jones f"],
      ["fred jones", "fred j"],
      ["fred xavier jones", "fred x jones"],
      ["fred xavier jones", "xavier jones f"]
    ].each { |left, right| try(:matching_initials, left, right) }

    [
      ["fred xavier jones", "fred jones"],
      ["fred xavier jones", "fred q jones"],
      ["fred x jones", "fred q jones"],
      ["fred xavier jones", "homer simpson"]
    ].each { |left, right| bust(:matching_initials, left, right) }
  end

  it "finds names that match on all but one long names" do
    [
      ["john philip sousa", "john sousa"],
      ["philip sousa", "philip john sousa"],
      ["Helen Q. Glorpworth-Smythe", "helen smythe"]
    ].each { |left, right| try(:matching_all_but_one, left, right) }

    bust(:matching_all_but_one, "john philip sousa", "philip john sousa")
  end
end
@@ -28,6 +28,10 @@ describe "name extract" do
28
28
  @namor.extract("SMITH, JOHN (Jacko) R").should == ['JOHN', 'R', 'SMITH', 'SMITH,JOHN R', 'SMITH,JOHN R']
29
29
  end
30
30
 
31
+ it "should strip elements within square brackets" do
32
+ @namor.extract("SMITH, JOHN [Jacko] R").should == ['JOHN', 'R', 'SMITH', 'SMITH,JOHN R', 'SMITH,JOHN R']
33
+ end
34
+
31
35
  it "should drop periods" do
32
36
  @namor.extract("John R. Smith").should == ['JOHN', 'R', 'SMITH', 'SMITH,JOHN R', 'SMITH,JOHN R']
33
37
  end
@@ -56,6 +60,7 @@ describe "name extract" do
56
60
  it "should excise terms from optional suppression list" do
57
61
  @namor.extract("Smith Jr, Edward M MD DDS").should == ['EDWARD', 'M', 'SMITH', 'SMITH,EDWARD M', 'SMITH,EDWARD M']
58
62
  @namor.extract("Smith Jr, Edward M M.D.").should == ['EDWARD', 'M', 'SMITH', 'SMITH,EDWARD M', 'SMITH,EDWARD M']
63
+ @namor.extract("Smith Jr, Edward M M.D. [Oph,Ped Orth]").should == ['EDWARD', 'M', 'SMITH', 'SMITH,EDWARD M', 'SMITH,EDWARD M']
59
64
  @namor.extract("Smith Jr, Edward III MD PHD").should == ['EDWARD', 'PHD', 'SMITH', 'SMITH,EDWARD PHD', 'SMITH,EDWARD PHD']
60
65
  end
61
66
 
@@ -88,6 +93,8 @@ describe "name extract" do
88
93
  @namor.extract_from_pieces_with_cluster(:first => 'John', :middle => 'M', :last => 'Smith').should == ['JOHN', 'M', 'SMITH', 'SMITH,JOHN M', 'SMITH,JOHN M', 'SMITH_JOHN_M', 'SMITH_JOHN_M']
89
94
  @namor.extract_from_pieces_with_cluster(:first => 'Susan', :last => 'Smith-Jones').should == ['SUSAN', nil, 'SMITHJONES', 'SMITHJONES,SUSAN', 'JONES,SUSAN', 'SMITHJONES_SUSAN', 'JONES_SUSAN']
90
95
 
96
+ @namor.extract_from_pieces_with_cluster({:first => 'Susan', :last => 'Smith-Jones MD PHD'}, {:suppress => ['MD', 'PHD']}).should == ['SUSAN', nil, 'SMITHJONES', 'SMITHJONES,SUSAN', 'JONES,SUSAN', 'SMITHJONES_SUSAN', 'JONES_SUSAN']
97
+
91
98
  @namor.extract_from_pieces(:last => 'Smith').should == [nil,nil, 'SMITH', 'SMITH', 'SMITH']
92
99
 
93
100
  @namor.extract_from_pieces(:first => 'Mary', :last => 'Smith Jones').should == ['MARY',nil, 'SMITHJONES', 'SMITHJONES,MARY', 'JONES,MARY']
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: namor
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.4
4
+ version: 0.6.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-09-12 00:00:00.000000000 Z
12
+ date: 2012-10-29 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rake
@@ -89,9 +89,11 @@ files:
89
89
  - README.md
90
90
  - Rakefile
91
91
  - lib/namor.rb
92
+ - lib/namor/comparator.rb
92
93
  - lib/namor/namor.rb
93
94
  - lib/namor/version.rb
94
95
  - namor.gemspec
96
+ - spec/lib/comparator_spec.rb
95
97
  - spec/lib/namor_spec.rb
96
98
  - spec/spec_helper.rb
97
99
  homepage: ''
@@ -108,7 +110,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
108
110
  version: '0'
109
111
  segments:
110
112
  - 0
111
- hash: -2394129238240412470
113
+ hash: -145397464691454724
112
114
  required_rubygems_version: !ruby/object:Gem::Requirement
113
115
  none: false
114
116
  requirements:
@@ -117,7 +119,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
117
119
  version: '0'
118
120
  segments:
119
121
  - 0
120
- hash: -2394129238240412470
122
+ hash: -145397464691454724
121
123
  requirements: []
122
124
  rubyforge_project:
123
125
  rubygems_version: 1.8.24
@@ -125,5 +127,6 @@ signing_key:
125
127
  specification_version: 3
126
128
  summary: Parse & extract pieces of names
127
129
  test_files:
130
+ - spec/lib/comparator_spec.rb
128
131
  - spec/lib/namor_spec.rb
129
132
  - spec/spec_helper.rb