nameable 0.5.1 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +3 -0
- data/Gemfile +2 -9
- data/Gemfile.lock +24 -23
- data/{LICENSE → LICENSE.txt} +2 -0
- data/README.md +68 -0
- data/Rakefile +1 -38
- data/{examples/nameable_web_service.rb → bin/nameable_web_service} +4 -2
- data/data/app_c.csv +151672 -0
- data/data/yob2013.txt +33072 -0
- data/lib/nameable.rb +6 -216
- data/lib/nameable/error.rb +5 -0
- data/lib/nameable/extensions.rb +5 -0
- data/lib/nameable/latin.rb +247 -0
- data/lib/nameable/latin/patterns.rb +34 -0
- data/lib/nameable/version.rb +3 -0
- data/nameable.gemspec +24 -0
- data/spec/nameable/latin_spec.rb +284 -0
- data/spec/nameable_spec.rb +5 -17
- data/spec/spec_helper.rb +5 -6
- metadata +53 -39
- data/History.txt +0 -11
- data/README.rdoc +0 -29
- data/TODO +0 -3
- data/VERSION +0 -1
- data/examples/test.rb +0 -45
data/lib/nameable.rb
CHANGED
@@ -1,220 +1,10 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
1
|
+
require "nameable/version"
|
2
|
+
require "nameable/error"
|
3
|
+
require "nameable/latin"
|
4
|
+
require "nameable/extensions"
|
5
5
|
|
6
6
|
module Nameable
|
7
|
-
|
8
|
-
|
9
|
-
##
|
10
|
-
# Raised if something other than a valid Name is supplied
|
11
|
-
class InvalidNameError < StandardError
|
12
|
-
end
|
13
|
-
|
14
|
-
##
|
15
|
-
# Regex's to match the detritus that people add to their names
|
16
|
-
module Patterns
|
17
|
-
PREFIX = {
|
18
|
-
"Mr." => /^\(*(mr\.*|mister)\)*$/i,
|
19
|
-
"Mrs." => /^\(*(mrs\.*|misses)\)*$/i,
|
20
|
-
"Ms." => /^\(*(ms\.*|miss)\)*$/i,
|
21
|
-
"Dr." => /^\(*(dr\.*|doctor)\)*$/i,
|
22
|
-
"Rev." => /^\(*(rev\.*|reverand)\)*$/i,
|
23
|
-
"Fr." => /^\(*(fr\.*|friar)\)*$/i,
|
24
|
-
"Master" => /^\(*(master)\)*$/i,
|
25
|
-
"Sir" => /^\(*(sir)\)*$/i
|
26
|
-
}
|
27
|
-
|
28
|
-
SUFFIX = {
|
29
|
-
"Sr." => /^\(*(sr\.?|senior)\)*$/i,
|
30
|
-
"Jr." => /^\(*(jr\.?|junior)\)*$/i,
|
31
|
-
"Esq." => /^\(*(esq\.?|esquire)\)*$/i,
|
32
|
-
"Ph.D." => /^\(*(phd\.?)\)*$/i
|
33
|
-
}
|
34
|
-
|
35
|
-
SUFFIX_GENERATIONAL_ROMAN = /^\(*[IVX.]+\)*$/i
|
36
|
-
SUFFIX_ACADEMIC = /^(APR|RPh|MD|MA|DMD|DDS|PharmD|EngD|DPhil|JD|DD|DO|BA|BS|BSc|BE|BFA|MA|MS|MSc|MFA|MLA|MBA)$/i
|
37
|
-
SUFFIX_PROFESSIONAL = /^(PE|CSA|CPA|CPL|CME|CEng|OFM|CSV|Douchebag)$/i
|
38
|
-
SUFFIX_ABBREVIATION = /^[A-Z.]+[A-Z.]+$/ # It should be at least 2 letters
|
39
|
-
|
40
|
-
LAST_NAME_PRE_DANGLERS = /^(mc|vere|von|van|da|de|del|della|di|da|pietro|vanden|du|st|la|ter|ten)$/i
|
41
|
-
O_LAST_NAME_PRE_CONCATS = /^(o'|o`)$/i
|
42
|
-
# MC_LAST_NAME_PRE_CONCAT = /^(mc|da|de)$/i
|
43
|
-
# ST_LAST_NAME_PRE_CONCAT = /^(st)\.*$/i
|
44
|
-
end
|
45
|
-
|
46
|
-
attr_accessor :prefix, :first, :middle, :last, :suffix
|
47
|
-
|
48
|
-
##
|
49
|
-
#
|
50
|
-
def initialize(parts={})
|
51
|
-
self.prefix = parts[:prefix] ? parts[:prefix] : nil
|
52
|
-
self.first = parts[:first] ? parts[:first] : nil
|
53
|
-
self.middle = parts[:middle] ? parts[:middle] : nil
|
54
|
-
self.last = parts[:last] ? parts[:last] : nil
|
55
|
-
self.suffix = parts[:suffix] ? parts[:suffix] : nil
|
56
|
-
end
|
57
|
-
|
58
|
-
##
|
59
|
-
# name is an Array
|
60
|
-
def extract_prefix(name)
|
61
|
-
return unless name and name.size > 1 and @prefix.nil? and @first.nil?
|
62
|
-
Patterns::PREFIX.each_pair do |pretty, regex|
|
63
|
-
if name.first =~ regex
|
64
|
-
@prefix = pretty
|
65
|
-
name.delete(name.first)
|
66
|
-
return
|
67
|
-
end
|
68
|
-
end
|
69
|
-
end
|
70
|
-
|
71
|
-
##
|
72
|
-
# name is an Array
|
73
|
-
def extract_suffix(name)
|
74
|
-
return unless name and name.size >= 3
|
75
|
-
|
76
|
-
(name.size - 1).downto(2) do |n|
|
77
|
-
suff = nil
|
78
|
-
|
79
|
-
Patterns::SUFFIX.each_pair do |pretty, regex|
|
80
|
-
suff = pretty if name[n] =~ regex
|
81
|
-
end
|
82
|
-
|
83
|
-
if name[n] =~ Patterns::SUFFIX_ACADEMIC or name[n] =~ Patterns::SUFFIX_PROFESSIONAL or name[n] =~ Patterns::SUFFIX_GENERATIONAL_ROMAN
|
84
|
-
suff = name[n].upcase.gsub(/\./,'')
|
85
|
-
end
|
86
|
-
|
87
|
-
if name.join != name.join.upcase and name[n].length > 1 and name[n] =~ Patterns::SUFFIX_ABBREVIATION
|
88
|
-
suff = name[n].upcase.gsub(/\./,'')
|
89
|
-
end
|
90
|
-
|
91
|
-
if suff
|
92
|
-
@suffix = @suffix ? "#{suff}, #{@suffix}" : suff
|
93
|
-
name.delete_at(n)
|
94
|
-
end
|
95
|
-
|
96
|
-
end
|
97
|
-
end
|
98
|
-
|
99
|
-
##
|
100
|
-
# name is an Array
|
101
|
-
def extract_first(name)
|
102
|
-
return unless name and name.size >= 1
|
103
|
-
|
104
|
-
@first = name.first
|
105
|
-
name.delete_at(0)
|
106
|
-
|
107
|
-
@first.capitalize! unless @first =~ /[a-z]/ and @first =~ /[A-Z]/
|
108
|
-
end
|
109
|
-
|
110
|
-
##
|
111
|
-
# name is an Array
|
112
|
-
def extract_last(name)
|
113
|
-
return unless name and name.size >= 1
|
114
|
-
|
115
|
-
@last = name.last
|
116
|
-
name.delete_at(name.size - 1)
|
117
|
-
|
118
|
-
@last.capitalize! unless @last =~ /[a-z]/ and @last =~ /[A-Z]/
|
119
|
-
end
|
120
|
-
|
121
|
-
##
|
122
|
-
# name is an Array
|
123
|
-
def extract_middle(name)
|
124
|
-
return unless name and name.size >= 1
|
125
|
-
|
126
|
-
(name.size - 1).downto(0) do |n|
|
127
|
-
next unless name[n]
|
128
|
-
|
129
|
-
if name[n] =~ Patterns::LAST_NAME_PRE_DANGLERS
|
130
|
-
@last = "#{name[n].downcase.capitalize} #{@last}"
|
131
|
-
elsif name[n] =~ Patterns::O_LAST_NAME_PRE_CONCATS
|
132
|
-
@last = "O'#{@last}"
|
133
|
-
# elsif name[n] =~ Patterns::MC_LAST_NAME_PRE_CONCAT
|
134
|
-
# @last = "#{name[n].downcase.capitalize} #{@last}"
|
135
|
-
# elsif name[n] =~ Patterns::ST_LAST_NAME_PRE_CONCAT
|
136
|
-
# @last = "St. #{@last}"
|
137
|
-
elsif name[n] =~ /-+/ and n > 0 and name[n-1]
|
138
|
-
@last = "#{name[n-1]}-#{@last}"
|
139
|
-
name[n-1] = nil
|
140
|
-
else
|
141
|
-
@middle = @middle ? "#{name[n]} #{@middle}" : name[n]
|
142
|
-
end
|
143
|
-
|
144
|
-
name.delete_at(n)
|
145
|
-
end
|
146
|
-
|
147
|
-
@middle.capitalize! if @middle and !(@middle =~ /[a-z]/ and @middle =~ /[A-Z]/)
|
148
|
-
@middle = "#{@middle}." if @middle and @middle.size == 1
|
149
|
-
end
|
150
|
-
|
151
|
-
def parse(name)
|
152
|
-
raise InvalidNameError unless name
|
153
|
-
if name.class == String
|
154
|
-
if name.index(',')
|
155
|
-
name = "#{$2} #{$1}" if name =~ /^([a-z]+)\s*,\s*,*(.*)/i
|
156
|
-
end
|
157
|
-
|
158
|
-
name = name.strip.split(/\s+/)
|
159
|
-
end
|
160
|
-
|
161
|
-
name = name.first.split(/[^[:alnum:]]+/) if name.size == 1 and name.first.split(/[^[:alnum:]]+/)
|
162
|
-
|
163
|
-
extract_prefix(name)
|
164
|
-
extract_suffix(name)
|
165
|
-
extract_first(name)
|
166
|
-
extract_last(name)
|
167
|
-
extract_middle(name)
|
168
|
-
|
169
|
-
raise InvalidNameError, "A parseable name was not found. #{name.inspect}" unless @first
|
170
|
-
|
171
|
-
self
|
172
|
-
end
|
173
|
-
|
174
|
-
def to_s
|
175
|
-
[@prefix, @first, @middle, @last].compact.join(' ') + (@suffix ? ", #{@suffix}" : "")
|
176
|
-
end
|
177
|
-
|
178
|
-
def to_name
|
179
|
-
to_nameable
|
180
|
-
end
|
181
|
-
|
182
|
-
def to_fullname
|
183
|
-
to_s
|
184
|
-
end
|
185
|
-
|
186
|
-
def to_prefix
|
187
|
-
@prefix
|
188
|
-
end
|
189
|
-
|
190
|
-
def to_firstname
|
191
|
-
@first
|
192
|
-
end
|
193
|
-
|
194
|
-
def to_lastname
|
195
|
-
@last
|
196
|
-
end
|
197
|
-
|
198
|
-
def to_middlename
|
199
|
-
@middle
|
200
|
-
end
|
201
|
-
|
202
|
-
def to_suffix
|
203
|
-
@suffix
|
204
|
-
end
|
205
|
-
|
206
|
-
def to_nameable
|
207
|
-
[@first, @last].compact.join(' ')
|
208
|
-
end
|
209
|
-
|
210
|
-
def to_hash
|
211
|
-
return {
|
212
|
-
:prefix => @prefix,
|
213
|
-
:first => @first,
|
214
|
-
:middle => @middle,
|
215
|
-
:last => @last,
|
216
|
-
:suffix => @suffix
|
217
|
-
}
|
218
|
-
end
|
7
|
+
def self.parse(name)
|
8
|
+
Nameable::Latin.new.parse(name)
|
219
9
|
end
|
220
10
|
end
|
@@ -0,0 +1,247 @@
|
|
1
|
+
require 'csv'
|
2
|
+
require 'nameable/error'
|
3
|
+
require 'nameable/latin/patterns'
|
4
|
+
|
5
|
+
module Nameable
  ##
  # Parses and normalises personal names written in the Latin alphabet.
  #
  # A name is broken into five parts -- prefix, first, middle, last and
  # suffix -- using the regular expressions in Nameable::Latin::Patterns.
  # Gender and ethnicity look-ups are answered from CSV tables shipped in the
  # gem's data directory; each table is loaded on first use and shared by
  # every instance.
  class Latin
    # Process-wide look-up tables, filled lazily by the load_huge_* methods.
    # Keys are lower-cased names.
    @@first_names = {}
    @@last_names = {}

    attr_accessor :prefix, :first, :middle, :last, :suffix

    ##
    # Builds a name from already-known parts.
    #
    # Accepts either a single Hash with any of the keys :prefix, :first,
    # :middle, :last, :suffix, or up to three positional strings:
    # (first), (first, last) or (first, middle, last).
    def initialize(*args)
      @prefix = @first = @middle = @last = @suffix = nil

      if args.size == 1 && args.first.is_a?(Hash)
        parts = args.first
        @prefix = parts[:prefix] || nil
        @first  = parts[:first]  || nil
        @middle = parts[:middle] || nil
        @last   = parts[:last]   || nil
        @suffix = parts[:suffix] || nil
      else
        @first  = args.shift unless args.empty?
        # Only grab a middle name if a last name is still left after it.
        @middle = args.shift if args.size >= 2
        @last   = args.shift unless args.empty?
      end
    end

    ##
    # Moves a leading title ("Mr.", "Dr.", ...) from +name+ into the prefix.
    #
    # name is an Array of tokens; the matched token is removed from it.
    # Does nothing when a prefix or first name is already set, or when fewer
    # than two tokens remain.
    def extract_prefix(name)
      return unless name && name.size > 1 && @prefix.nil? && @first.nil?

      Patterns::PREFIX.each_pair do |pretty, regex|
        next unless name.first =~ regex

        @prefix = pretty
        # Drop only the leading token. Array#delete would also remove every
        # later token that happens to be equal to it.
        name.shift
        return
      end
    end

    ##
    # Moves trailing suffix tokens ("Jr.", "PhD", "III", ...) from +name+
    # into the suffix, scanning right to left and never touching the first
    # two tokens.
    #
    # name is an Array of tokens; matched tokens are removed from it.
    # Several suffixes are joined with ", " in their original order.
    def extract_suffix(name)
      return unless name && name.size >= 3

      (name.size - 1).downto(2) do |n|
        token = name[n]
        suff = nil

        # When several patterns match, the last one listed wins.
        Patterns::SUFFIX.each_pair do |pretty, regex|
          suff = pretty if token =~ regex
        end

        if token =~ Patterns::SUFFIX_ACADEMIC ||
           token =~ Patterns::SUFFIX_PROFESSIONAL ||
           token =~ Patterns::SUFFIX_GENERATIONAL_ROMAN
          suff = token.upcase.gsub(/\./, '')
        end

        # The generic abbreviation pattern is only tried when nothing more
        # specific matched and the remaining name is not entirely upper-case.
        if !suff && name.join != name.join.upcase && token.length > 1 &&
           token =~ Patterns::SUFFIX_ABBREVIATION
          suff = token.upcase.gsub(/\./, '')
        end

        next unless suff

        @suffix = @suffix ? "#{suff}, #{@suffix}" : suff
        name.delete_at(n)
      end
    end

    ##
    # Takes the first token of +name+ as the first name.
    #
    # name is an Array of tokens; the token is removed from it.
    def extract_first(name)
      return unless name && name.size >= 1

      @first = normalize_case(name.shift)
    end

    ##
    # Takes the last token of +name+ as the last name, collapsing runs of
    # quote characters to a single apostrophe and runs of hyphens to one.
    #
    # name is an Array of tokens; the token is removed from it.
    def extract_last(name)
      return unless name && name.size >= 1

      surname = name.pop.gsub(/['`"]+/, "'").gsub(/-+/, '-')
      @last = normalize_case(surname)
    end

    ##
    # Distributes whatever is left in +name+ between the last name
    # (particles such as "van", an "O'" prefix, hyphenated halves) and the
    # middle name. Call after extract_first and extract_last.
    #
    # name is an Array of tokens; processed tokens are removed from it.
    def extract_middle(name)
      return unless name && name.size >= 1

      (name.size - 1).downto(0) do |n|
        token = name[n]
        next unless token # slot already consumed by the hyphen branch below

        if token =~ Patterns::LAST_NAME_PRE_DANGLERS
          @last = "#{token.downcase.capitalize} #{@last}"
        elsif token =~ Patterns::O_LAST_NAME_PRE_CONCATS
          @last = "O'#{@last}"
        elsif token =~ /\-/ && n > 0 && name[n - 1]
          # A hyphen token joins the token before it onto the last name.
          @last = "#{name[n - 1].gsub(/\-/, '')}-#{@last}"
          name[n - 1] = nil
        else
          @middle = @middle ? "#{token} #{@middle}" : token
        end

        name.delete_at(n)
      end

      @middle = normalize_case(@middle)
      # A lone initial gets a trailing period.
      @middle = "#{@middle}." if @middle && @middle.size == 1
    end

    ##
    # Parses +name+ (a String, or an Array of tokens) into its parts and
    # returns self.
    #
    # A String of the form "Last, First ..." is reordered first. A name that
    # consists of a single token is split on non-alphanumeric characters.
    # The caller's String or Array is never modified.
    #
    # Raises InvalidNameError when +name+ is nil, is neither a String nor
    # Array-like, or yields no first name.
    def parse(name)
      raise InvalidNameError unless name

      tokens = tokenize(name)

      extract_prefix(tokens)
      extract_suffix(tokens)
      extract_first(tokens)
      extract_last(tokens)
      extract_middle(tokens)

      raise InvalidNameError, "A parseable name was not found. #{name.inspect}" unless @first

      self
    end

    # http://www.ssa.gov/oact/babynames/limits.html
    ##
    # Fills the shared first-name table from data/yob2013.txt. Each row is
    # read as (name, gender, number); a name listed under both genders is
    # assigned to the one with the larger number, ties going to female.
    # NOTE(review): the third column is presumably the SSA occurrence count
    # rather than a rank -- confirm against the data file.
    def load_huge_gender_table
      ranked = Hash.new { |table, first| table[first] = {} }

      # Stream the file row by row instead of materialising it first.
      CSV.foreach(data_file('yob2013.txt')) do |first, gender, rank|
        ranked[first.downcase][gender.downcase] = rank.to_i
      end

      ranked.each do |first, ranks|
        male = ranks['m']
        female = ranks['f']

        if male && female
          @@first_names[first] = male > female ? :male : :female
        elsif male
          @@first_names[first] = :male
        elsif female
          @@first_names[first] = :female
        end
      end
    end

    # http://www.census.gov/genealogy/www/data/2000surnames/index.html
    ##
    # Fills the shared last-name table from data/app_c.csv, skipping the
    # header row. Each entry maps a lower-cased surname to a Hash of its
    # rank, count and per-group percentages.
    def load_huge_ethnicity_table
      CSV.foreach(data_file('app_c.csv')) do |name, rank, count, _prop100k, _cum_prop100k, pctwhite, pctblack, pctapi, pctaian, pct2prace, pcthispanic|
        next if name == 'name'

        @@last_names[name.downcase] = {
          rank: rank.to_i,
          count: count.to_i,
          percent_white: pctwhite.to_f,
          percent_black: pctblack.to_f,
          percent_asian_pacific_islander: pctapi.to_f,
          percent_american_indian_alaska_native: pctaian.to_f,
          percent_two_or_more_races: pct2prace.to_f,
          percent_hispanic: pcthispanic.to_f
        }
      end
    end

    ##
    # Returns :male, :female or :unknown for the current first name.
    #
    # The answer is looked up on every call, so it follows later changes to
    # +first+; the shared table is loaded on first use.
    def gender
      load_huge_gender_table if @@first_names.empty?
      @@first_names[@first.to_s.downcase] || :unknown
    end

    ##
    # Returns the table entry (a Hash) for the current last name, or
    # :unknown when there is no last name or no entry for it.
    #
    # The answer is looked up on every call, so it follows later changes to
    # +last+; the shared table is loaded on first use.
    def ethnicity
      load_huge_ethnicity_table if @@last_names.empty?
      (@last && @@last_names[@last.downcase]) || :unknown
    end

    # True when the first name is listed as male.
    def male?
      gender == :male
    end

    # True when the first name is listed as female.
    def female?
      gender == :female
    end

    # Full name: "Prefix First Middle Last, Suffix", omitting absent parts.
    def to_s
      [@prefix, @first, @middle, @last].compact.join(' ') + (@suffix ? ", #{@suffix}" : "")
    end

    # Alias for to_nameable.
    def to_name
      to_nameable
    end

    # Alias for to_s.
    def to_fullname
      to_s
    end

    def to_prefix
      @prefix
    end

    def to_firstname
      @first
    end

    def to_lastname
      @last
    end

    def to_middlename
      @middle
    end

    def to_suffix
      @suffix
    end

    # Short form: "First Last", omitting absent parts.
    def to_nameable
      [@first, @last].compact.join(' ')
    end

    # The five name parts as a Hash keyed by symbol.
    def to_hash
      {
        :prefix => @prefix,
        :first  => @first,
        :middle => @middle,
        :last   => @last,
        :suffix => @suffix
      }
    end

    private

    # Turns parse input into a fresh Array of tokens the extract_* methods
    # are free to consume.
    def tokenize(name)
      if name.is_a?(String)
        if name.index(',') && name =~ /^([a-z]+)\s*,\s*,*(.*)/i
          name = "#{Regexp.last_match(2)} #{Regexp.last_match(1)}"
        end
        tokens = name.strip.split(/\s+/)
      elsif name.respond_to?(:to_ary)
        # compact returns a new Array, so the caller's list is left intact.
        tokens = name.to_ary.compact
      else
        raise InvalidNameError, "Expected a String or an Array of name parts, got #{name.class}"
      end

      # A lone token such as "john.smith" is split on punctuation.
      tokens = tokens.first.to_s.split(/[^[:alnum:]]+/) if tokens.size == 1
      tokens
    end

    # Returns +text+ capitalised unless it is already mixed-case (contains
    # both a lower-case and an upper-case ASCII letter). Never mutates
    # +text+, so frozen or caller-owned strings are safe.
    def normalize_case(text)
      return text unless text
      return text if text =~ /[a-z]/ && text =~ /[A-Z]/

      text.capitalize
    end

    # Absolute path of a file in the gem's data directory.
    def data_file(filename)
      File.expand_path(File.join('..', '..', '..', 'data', filename), __FILE__)
    end
  end
end
|