namor 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +1 -0
- data/lib/namor/namor.rb +77 -0
- data/lib/namor/version.rb +1 -1
- data/lib/namor.rb +2 -47
- data/spec/lib/namor_spec.rb +48 -17
- metadata +38 -12
data/.gitignore
CHANGED
data/lib/namor/namor.rb
ADDED
@@ -0,0 +1,77 @@
|
|
1
|
+
module Namor
  # Normalizes personal names: upcases them, strips titles/suffixes and
  # punctuation, and splits them into first / middle / last components.
  #
  # Nested inside +module Namor+ (instead of +class Namor::Namor+) so the file
  # loads even when nothing has defined the +Namor+ namespace yet.
  class Namor
    # Titles/suffixes that are always dropped when they appear as a whole word
    # inside a name (used by #extract). Note that +I++ also removes a lone "I".
    SUFFIX_WORD_RE = /\b(?:MD|JR|SR|I+|IV)\b/i

    # Same list, matched against one already-upcased token (used by #components).
    SUFFIX_TOKEN_RE = /\A(?:MD|JR|SR|I+|IV)\z/

    # @param opts [Hash, nil] configuration; see #config.
    def initialize(opts = {})
      config(opts)
    end

    # Replaces the current configuration.
    #
    # @param opts [Hash, nil] the only key read by this class is +:suppress+,
    #   a list of extra terms to remove from names. +nil+ is treated as +{}+
    #   so later calls do not fail on +@config[:suppress]+.
    # @return [Hash] the stored configuration.
    def config(opts)
      @config = opts || {}
    end

    # Splits a name into its parts.
    #
    # Two input shapes are understood: "last, first[ middle]" (when the cleaned
    # name contains a comma) and "first [middle-initial ]last" otherwise.
    #
    # @param name [String, nil] raw name.
    # @return [Array] +[firstname, middlename, lastname, fullname]+ where
    #   missing parts are +nil+ and +fullname+ is "LAST,FIRST MIDDLE";
    #   +[]+ when +name+ is +nil+ or nothing is left after clean-up.
    def extract(name)
      return [] if name.nil?

      cleaned = scrub(name)
      middlename = nil

      if cleaned.include?(',')
        # "last, first[ middle]": anything after a second comma is ignored.
        lastname, given = cleaned.split(/\s*,\s*/)
        lastname = lastname.to_s.delete(' ')
        given_parts = given.to_s.split(/ +/)
        firstname = given_parts.shift
        middlename = given_parts.join
      else
        # "first [middle ]last"
        pieces = cleaned.split(' ')
        firstname = pieces.shift
        # A single letter followed by more words is assumed to be a middle initial.
        middlename = pieces.shift if pieces.count > 1 && pieces.first.length == 1
        lastname = pieces.join
      end

      firstname = presence(firstname)
      middlename = presence(middlename)
      lastname = presence(lastname)
      # Nothing usable (e.g. "", "Jr.", ","): same answer as for nil input.
      return [] unless firstname || middlename || lastname

      given_names = [firstname, middlename].compact.join(' ')
      # Skip empty halves so a lone last name does not get a dangling comma.
      fullname = [lastname, presence(given_names)].compact.join(',')

      [firstname, middlename, lastname, fullname]
    end

    # Like #extract, with a cluster label appended: the full name with every
    # non-word character replaced by "_".
    #
    # @param name [String, nil]
    # @return [Array] the #extract result plus the label, or +[]+.
    def extract_with_cluster(name)
      parts = extract(name)
      return [] if parts.empty?

      parts << parts.last.gsub(/\W/, '_')
    end

    # Breaks one or more names into a sorted, de-duplicated list of upcased
    # tokens, dropping bracketed/parenthesized text, built-in suffixes and
    # configured suppression terms.
    #
    # @param args [Array<String, nil>] names; +nil+ entries are skipped.
    # @return [Array<String>]
    def components(*args)
      suppressed = Array(@config[:suppress]).map { |term| term.to_s.upcase }

      tokens = args.compact.flat_map do |name|
        name.gsub(/\([^\(]*\)/, '')
            .gsub(/\[[^\[]*\]/, '')
            .gsub(/[\(\)\[\]\']/, '')
            .gsub(/[,._-]/, ' ')
            .split(/\s+/)
            .map(&:upcase)
      end

      tokens.reject { |token| token.empty? || suppressed.include?(token) || token =~ SUFFIX_TOKEN_RE }
            .uniq
            .sort
    end

    private

    # Upcases +name+ and removes suppression terms, built-in suffixes,
    # parenthesized text, punctuation, a trailing comma and repeated spaces.
    # Suppression runs before punctuation is dropped, so a configured term
    # such as "Ph.D." is matched against the text as typed.
    def scrub(name)
      text = name.to_s.upcase
      suppress_re = suppression_regexp
      text = text.gsub(suppress_re, '') if suppress_re

      text.gsub(SUFFIX_WORD_RE, '')
          .gsub(/\([^\(]*\)/, '')
          .gsub(/[_.'-]/, '')
          .gsub(/,\s*$/, '')
          .gsub(/ +/, ' ')
          .strip
    end

    # Builds a case-insensitive regexp matching any configured suppression term
    # as a whole word, or nil when there are none. Terms are escaped so they
    # are matched literally. The lookarounds behave like +\b+ for plain words
    # and also work for terms that start or end with punctuation.
    def suppression_regexp
      terms = Array(@config[:suppress]).map(&:to_s).reject(&:empty?)
      return nil if terms.empty?

      Regexp.new("(?<!\\w)(?:#{Regexp.union(terms).source})(?!\\w)", Regexp::IGNORECASE)
    end

    # Returns +value+ unless it is nil or empty, in which case nil.
    def presence(value)
      value unless value.nil? || value.empty?
    end
  end
end
|
data/lib/namor/version.rb
CHANGED
data/lib/namor.rb
CHANGED
@@ -1,50 +1,5 @@
|
|
1
|
-
|
1
|
+
require_relative "namor/version"
|
2
|
+
require_relative "namor/namor"
|
2
3
|
|
3
4
|
module Namor
|
4
|
-
def self.extract(name, args = {})
|
5
|
-
return [] if name.nil?
|
6
|
-
|
7
|
-
suppression_list = args[:suppress] || []
|
8
|
-
suppression_re = suppression_list.join('|')
|
9
|
-
|
10
|
-
detitled_name = name.upcase.gsub(/\b(#{suppression_re})\b/i, '').gsub(/\b(MD|JR|SR|I+|IV)\b/i, '').gsub(/\([^\(]*\)/, '').gsub(/[_.'-]/, '').gsub(/,\s*$/, '').gsub(/ +/, ' ').strip
|
11
|
-
|
12
|
-
if detitled_name =~ /,/
|
13
|
-
# "last, first[ middle]"
|
14
|
-
lastname, firstname = detitled_name.split(/\s*,\s*/)
|
15
|
-
lastname.gsub!(/ /, '')
|
16
|
-
middlename = nil
|
17
|
-
if firstname && firstname =~ / /
|
18
|
-
pieces = firstname.split(/ +/)
|
19
|
-
firstname = pieces.shift
|
20
|
-
middlename = pieces.join if pieces.any?
|
21
|
-
end
|
22
|
-
else
|
23
|
-
# "first [middle ]last"
|
24
|
-
pieces = detitled_name.split(' ')
|
25
|
-
firstname = pieces.shift
|
26
|
-
middlename = nil
|
27
|
-
if pieces.count > 1 && pieces.first.length == 1
|
28
|
-
# assume this is a middle initial
|
29
|
-
middlename = pieces.shift
|
30
|
-
end
|
31
|
-
|
32
|
-
lastname = pieces.join
|
33
|
-
end
|
34
|
-
|
35
|
-
firstname = nil if firstname.empty?
|
36
|
-
middlename = nil if middlename && middlename.empty?
|
37
|
-
lastname = nil if lastname.empty?
|
38
|
-
|
39
|
-
fm = [firstname, middlename].compact.join(' ')
|
40
|
-
fullname = [lastname, fm].compact.join(',')
|
41
|
-
|
42
|
-
[firstname, middlename, lastname, fullname]
|
43
|
-
end
|
44
|
-
|
45
|
-
def self.extract_with_cluster(name, args = {})
|
46
|
-
ary = extract(name, args)
|
47
|
-
return [] if ary.empty?
|
48
|
-
ary << ary.last.gsub(/\W/, '_')
|
49
|
-
end
|
50
5
|
end
|
data/spec/lib/namor_spec.rb
CHANGED
@@ -3,61 +3,92 @@
|
|
3
3
|
require "spec_helper"
|
4
4
|
|
5
5
|
# Examples for Namor::Namor#extract, which returns
# [firstname, middlename, lastname, fullname].
describe "name extract" do
  before(:all) do
    @namor = Namor::Namor.new
    @namor.config(:suppress => ['MD', 'DDS'])
  end

  it "should handle 2-part names without commas" do
    @namor.extract("john smith").should == ['JOHN', nil, 'SMITH', 'SMITH,JOHN']
  end

  it "should handle 2-part names with commas" do
    @namor.extract("SMITH, JOHN").should == ['JOHN', nil, 'SMITH', 'SMITH,JOHN']
  end

  it "should handle 2-part names with commas and middle initials" do
    @namor.extract("SMITH, JOHN R").should == ['JOHN', 'R', 'SMITH', 'SMITH,JOHN R']
  end

  # This slot used to hold a verbatim copy of the example above; it now covers
  # the comma-less form, where a single letter after the first name is taken
  # as the middle initial.
  it "should handle names without commas and with middle initials" do
    @namor.extract("john r smith").should == ['JOHN', 'R', 'SMITH', 'SMITH,JOHN R']
  end

  it "should strip elements within parentheses" do
    @namor.extract("SMITH, JOHN (Jacko) R").should == ['JOHN', 'R', 'SMITH', 'SMITH,JOHN R']
  end

  it "should drop periods" do
    @namor.extract("John R. Smith").should == ['JOHN', 'R', 'SMITH', 'SMITH,JOHN R']
  end

  it "should drop spaces in last name (only when input has a comma)" do
    @namor.extract("Smith Jones, Mary").should == ['MARY', nil, 'SMITHJONES', 'SMITHJONES,MARY']
  end

  it "should drop dashes & apostrophes" do
    @namor.extract("Mary Smith-Jones").should == ['MARY', nil, 'SMITHJONES', 'SMITHJONES,MARY']
    @namor.extract("Mary S. O'Keefe").should == ['MARY', 'S', 'OKEEFE', 'OKEEFE,MARY S']
    @namor.extract("Jean-Michel Claude").should == ['JEANMICHEL', nil, 'CLAUDE', 'CLAUDE,JEANMICHEL']
  end

  it "should concatenate extract name pieces" do
    @namor.extract("rajesh kumar vishnu garuda").should == ['RAJESH', nil, 'KUMARVISHNUGARUDA', 'KUMARVISHNUGARUDA,RAJESH']
    @namor.extract("Kumar, Rajesh Vishnu Garuda").should == ['RAJESH', 'VISHNUGARUDA', 'KUMAR', 'KUMAR,RAJESH VISHNUGARUDA']
  end

  it "should excise suffixes like 'Jr.' from lastnames" do
    @namor.extract("Smith Jr, Edward M").should == ['EDWARD', 'M', 'SMITH', 'SMITH,EDWARD M']
  end

  it "should excise terms from optional suppression list" do
    @namor.extract("Smith Jr, Edward M MD DDS").should == ['EDWARD', 'M', 'SMITH', 'SMITH,EDWARD M']
    # PHD is not in the suppression list, so it survives as the middle name.
    @namor.extract("Smith Jr, Edward III MD PHD").should == ['EDWARD', 'PHD', 'SMITH', 'SMITH,EDWARD PHD']
  end

  it "should handle pathological cases" do
    @namor.extract(", Mary Smith").should == ['MARY', 'SMITH', nil, 'MARY SMITH']
  end
end
|
58
63
|
|
59
64
|
# Examples for Namor::Namor#extract_with_cluster, whose last element is the
# full name with non-word characters turned into underscores.
describe "with cluster coding" do
  before(:all) do
    # The constructor hands its options to #config, so this sets the same
    # suppression list as a separate config call would.
    @namor = Namor::Namor.new(:suppress => %w[MD DDS])
  end

  it "should generate cluster labels" do
    cluster_label = @namor.extract_with_cluster("Smith Jr, Edward III MD PHD").last
    cluster_label.should == 'SMITH_EDWARD_PHD'
  end
end
|
74
|
+
|
75
|
+
# Examples for Namor::Namor#components, which returns the sorted, unique,
# upcased tokens of a name.
describe "name componentization" do
  before(:all) do
    # The constructor hands its options to #config.
    @namor = Namor::Namor.new(:suppress => %w[esq])
  end

  it "should include initials" do
    tokens = @namor.components("john q. smith")
    tokens.should == %w[JOHN Q SMITH]
  end

  it "should excise common suffixes" do
    ["john smith III", "john smith jr"].each do |raw_name|
      @namor.components(raw_name).should == %w[JOHN SMITH]
    end
  end

  it "should excise from suppression list" do
    # "esk" is not configured, so it is kept; "esq" is configured and dropped.
    @namor.components("john smith esk.").should == %w[ESK JOHN SMITH]
    @namor.components("john smith esq.").should == %w[JOHN SMITH]
  end
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: namor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-08-
|
12
|
+
date: 2012-08-23 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rake
|
16
|
-
requirement:
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ~>
|
@@ -21,10 +21,15 @@ dependencies:
|
|
21
21
|
version: 0.9.2
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements:
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ~>
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: 0.9.2
|
25
30
|
- !ruby/object:Gem::Dependency
|
26
31
|
name: rspec
|
27
|
-
requirement:
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
28
33
|
none: false
|
29
34
|
requirements:
|
30
35
|
- - ~>
|
@@ -32,10 +37,15 @@ dependencies:
|
|
32
37
|
version: 2.9.0
|
33
38
|
type: :development
|
34
39
|
prerelease: false
|
35
|
-
version_requirements:
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ~>
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: 2.9.0
|
36
46
|
- !ruby/object:Gem::Dependency
|
37
47
|
name: guard-rspec
|
38
|
-
requirement:
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
39
49
|
none: false
|
40
50
|
requirements:
|
41
51
|
- - ~>
|
@@ -43,10 +53,15 @@ dependencies:
|
|
43
53
|
version: 0.7.0
|
44
54
|
type: :development
|
45
55
|
prerelease: false
|
46
|
-
version_requirements:
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ~>
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: 0.7.0
|
47
62
|
- !ruby/object:Gem::Dependency
|
48
63
|
name: ruby_gntp
|
49
|
-
requirement:
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
50
65
|
none: false
|
51
66
|
requirements:
|
52
67
|
- - ~>
|
@@ -54,7 +69,12 @@ dependencies:
|
|
54
69
|
version: 0.3.4
|
55
70
|
type: :development
|
56
71
|
prerelease: false
|
57
|
-
version_requirements:
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ~>
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: 0.3.4
|
58
78
|
description: Munging English names
|
59
79
|
email:
|
60
80
|
- jmay@pobox.com
|
@@ -68,6 +88,7 @@ files:
|
|
68
88
|
- README.md
|
69
89
|
- Rakefile
|
70
90
|
- lib/namor.rb
|
91
|
+
- lib/namor/namor.rb
|
71
92
|
- lib/namor/version.rb
|
72
93
|
- namor.gemspec
|
73
94
|
- spec/lib/namor_spec.rb
|
@@ -84,19 +105,24 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
84
105
|
- - ! '>='
|
85
106
|
- !ruby/object:Gem::Version
|
86
107
|
version: '0'
|
108
|
+
segments:
|
109
|
+
- 0
|
110
|
+
hash: 3634688006053498514
|
87
111
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
88
112
|
none: false
|
89
113
|
requirements:
|
90
114
|
- - ! '>='
|
91
115
|
- !ruby/object:Gem::Version
|
92
116
|
version: '0'
|
117
|
+
segments:
|
118
|
+
- 0
|
119
|
+
hash: 3634688006053498514
|
93
120
|
requirements: []
|
94
121
|
rubyforge_project:
|
95
|
-
rubygems_version: 1.8.
|
122
|
+
rubygems_version: 1.8.24
|
96
123
|
signing_key:
|
97
124
|
specification_version: 3
|
98
125
|
summary: Parse & extract pieces of names
|
99
126
|
test_files:
|
100
127
|
- spec/lib/namor_spec.rb
|
101
128
|
- spec/spec_helper.rb
|
102
|
-
has_rdoc:
|