fullname-parser 1.0.5 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/lib/fullname/parser.rb +161 -78
- data/lib/fullname/parser/version.rb +1 -1
- metadata +7 -10
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 8701ad30b034d522d4116053672296dad64fe2b8
|
4
|
+
data.tar.gz: 00b0315fbefe4988a2aadde0c14a0f128baf7271
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: f3498c76c53588b73363fa76652fd1d268282e8742116931793330eab1cc9becbbc8b5f65fe20647be02e0b5755d7c729e6cf4f309b436eda1d82adba56d5e74
|
7
|
+
data.tar.gz: 4472a7dc0e50d9129c0ab96b7b0006b588f1f5db1c96dccf79f21d34870ad09f2a498c1481311e94ce69e9c75c13a36019b69a632c34017f1d840f6dc250973d
|
data/lib/fullname/parser.rb
CHANGED
@@ -101,91 +101,174 @@ module Fullname
|
|
101
101
|
'9th' => 'IX',
|
102
102
|
} unless const_defined?(:CONVERSION)
|
103
103
|
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
104
|
+
class Error < StandardError; end
|
105
|
+
class Identifier
|
106
|
+
attr_reader :name, :original_name, :prefix, :firstname, :middlename, :lastname, :suffix
|
107
|
+
def initialize(name)
|
108
|
+
@original_name = name.dup
|
109
|
+
@name = name.dup
|
110
|
+
@prefix_list = []
|
111
|
+
@suffix_list = []
|
112
|
+
sanitize!
|
113
|
+
flip_parts!
|
114
|
+
breakup!
|
115
|
+
end
|
116
|
+
|
117
|
+
private
|
118
|
+
|
119
|
+
def sanitize!
|
120
|
+
# replace "’" with "'"
|
121
|
+
name.gsub!(/’/, "'")
|
122
|
+
# remove parenthesized text, including the parentheses themselves
|
123
|
+
# ex. 'Susan M. (Scully) Schultz' => 'Susan M. Schultz'
|
124
|
+
# 'Jay (Jung) Heum Kim' => 'Jay Heum Kim'
|
125
|
+
name.gsub!(/\(.*?\)/, ' ')
|
126
|
+
name.gsub!(/\(|\)/, '')
|
127
|
+
# remove quoted strings
|
128
|
+
# Darin "Derry" Ronald Anderson => 'Darin Ronald Anderson'
|
129
|
+
# Nancy M. "Shelli" Egger => 'Nancy M. Egger'
|
130
|
+
# Nicole 'nikki' Adame => 'Nicole Adame'
|
131
|
+
name.gsub!(/".*?"/, ' ')
|
132
|
+
name.gsub!(/'.*?'/i, ' ')
|
133
|
+
|
134
|
+
# remove curly brackets
|
135
|
+
# Henry C.{Harry} Wilson => 'Henry C. Wilson'
|
136
|
+
# Cellestine {Steen} Armstrong => 'Cellestine Armstrong'
|
137
|
+
name.gsub!(/\{.*?\}/, ' ')
|
138
|
+
# remove stray tokens that contain no letters (e.g. a lone ".")
|
139
|
+
# ex. "William . D. 'Bill' Beard" => "William D. 'Bill' Beard"
|
140
|
+
# this regexp also removes any other whitespace-delimited run of non-letter characters
|
141
|
+
name.gsub!(/\s+[^a-zA-Z]+\s+/, ' ')
|
142
|
+
# Why we use substitute(sub) comma to whitespace, not global substitute(gsub).
|
143
|
+
# the reason is the substitution applies for suffix splitting, not for replacing
|
144
|
+
# bad data. As we want, convert "Marvene A Gordon, JD" to "Marvene A Gordon JD",
|
145
|
+
# so that the suffix will get into the split array.
|
146
|
+
# and, standardize suffix as '2nd' => 'II', '3rd' => 'III'
|
147
|
+
CONVERSION.each_pair do |finder, replacer|
|
148
|
+
name.gsub!(Regexp.new("\\b#{Regexp.escape(finder)}\\b", true), replacer)
|
148
149
|
end
|
149
150
|
end
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
151
|
+
|
152
|
+
def extract_suffix(str)
|
153
|
+
list = []
|
154
|
+
loop do
|
155
|
+
m = /(.*)[, ](.+)/.match(str)
|
156
|
+
break unless m
|
157
|
+
remaining = m[1]
|
158
|
+
last_part = m[2].strip
|
159
|
+
last_part_downcase = last_part.downcase
|
160
|
+
if IGNORABLE_SUFFIXES.include?(last_part_downcase)
|
161
|
+
list.unshift([last_part, false])
|
162
|
+
elsif SUFFIX_LIST.include?(last_part_downcase) || GLOBAL_SUFFIX_LIST.include?(last_part_downcase)
|
163
|
+
list.unshift([last_part, true])
|
164
|
+
else
|
165
|
+
break
|
166
|
+
end
|
167
|
+
str = remaining.gsub(/[, ]+$/, '').strip
|
161
168
|
end
|
169
|
+
[str, list]
|
162
170
|
end
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
171
|
+
|
172
|
+
def extract_prefix(str)
|
173
|
+
list = []
|
174
|
+
loop do
|
175
|
+
m = /(.+?)[, ](.+)/.match(str)
|
176
|
+
break unless m
|
177
|
+
remining = m[2]
|
178
|
+
first_part = m[1]
|
179
|
+
first_part_downcase = first_part.downcase
|
180
|
+
if IGNORABLE_PREFIXS.include?(first_part_downcase)
|
181
|
+
# skip words
|
182
|
+
elsif PREFIX_LIST.include?(first_part_downcase)
|
183
|
+
list.push(first_part)
|
184
|
+
else
|
185
|
+
break
|
186
|
+
end
|
187
|
+
str = remining.gsub(/^[, ]+/, '').strip
|
188
|
+
end
|
189
|
+
[str, list]
|
176
190
|
end
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
191
|
+
|
192
|
+
def extract_suffix_before_flipping_parts
|
193
|
+
remaining, list = extract_suffix(name)
|
194
|
+
@name = remaining
|
195
|
+
@suffix_list += list
|
181
196
|
end
|
182
|
-
|
183
|
-
|
184
|
-
|
197
|
+
|
198
|
+
def flip_parts!
|
199
|
+
extract_suffix_before_flipping_parts
|
200
|
+
parts = name.split(/,/)
|
201
|
+
case parts.size
|
202
|
+
when 1
|
203
|
+
when 2
|
204
|
+
remining, list = extract_suffix(parts[0])
|
205
|
+
@name = [parts[1], remining].join(' ').strip.gsub(/ +/, ' ')
|
206
|
+
@suffix_list += list
|
207
|
+
when 3
|
208
|
+
remining, list = extract_suffix(parts[0..1].join(' '))
|
209
|
+
@name = [parts[2], remining].join(' ').strip.gsub(/ +/, ' ')
|
210
|
+
@suffix_list += list
|
211
|
+
else
|
212
|
+
fail Error.new("name [ #{name} ] has >2 commas, don't know how to parse")
|
213
|
+
end
|
214
|
+
|
215
|
+
extract_prefix_after_flipping_parts
|
185
216
|
end
|
186
|
-
|
187
|
-
|
188
|
-
|
217
|
+
|
218
|
+
def extract_prefix_after_flipping_parts
|
219
|
+
remaining, list = extract_prefix(name)
|
220
|
+
@name = remaining
|
221
|
+
@prefix_list += list
|
222
|
+
end
|
223
|
+
|
224
|
+
def breakup!
|
225
|
+
parts = name.split(/[, ]+/)
|
226
|
+
|
227
|
+
# process prefix
|
228
|
+
@prefix = @prefix_list.join(' ') if @prefix_list.any?
|
229
|
+
|
230
|
+
# process lastname
|
231
|
+
# Loop around until we run into a name that is not contained in the LAST_NAME_EXTENSIONS
|
232
|
+
last_name_arr = []
|
233
|
+
last_name_arr.push(parts.pop)
|
234
|
+
last_name_arr.push(parts.pop) while parts.length > 1 && LAST_NAME_EXTENSIONS.include?(parts.last.downcase)
|
235
|
+
@lastname = last_name_arr.reverse.join(' ')
|
236
|
+
|
237
|
+
# process firstname and middlename
|
238
|
+
@firstname = parts.shift if parts.length >= 1
|
239
|
+
@middlename = parts.join(' ') if parts.length > 0
|
240
|
+
if firstname.nil? && prefix
|
241
|
+
@firstname = prefix
|
242
|
+
@prefix = nil
|
243
|
+
end
|
244
|
+
|
245
|
+
# move lastname to firstname, move first suffix to lastname
|
246
|
+
if firstname.nil? && @suffix_list.any? && SUFFIX_CAN_BE_LASTNAME.include?(@suffix_list.first.first.downcase)
|
247
|
+
@firstname = lastname
|
248
|
+
@lastname = @suffix_list.shift.first
|
249
|
+
end
|
250
|
+
|
251
|
+
# move lastname to middlename, move first suffix to lastname
|
252
|
+
if lastname =~ /^[A-Z]\.?$/i && @suffix_list.any? && SUFFIX_CAN_BE_LASTNAME.include?(@suffix_list.first.first.downcase)
|
253
|
+
@middlename = [middlename, lastname].compact.join(' ')
|
254
|
+
@lastname = @suffix_list.shift.first
|
255
|
+
end
|
256
|
+
|
257
|
+
# process suffix
|
258
|
+
@suffix_list.delete_if { |_, ignore_able| !ignore_able }
|
259
|
+
@suffix = @suffix_list.any? ? @suffix_list.first.first : nil
|
260
|
+
end
|
261
|
+
end
|
262
|
+
|
263
|
+
def parse_fullname(name)
|
264
|
+
i = Identifier.new(name)
|
265
|
+
return {
|
266
|
+
prefix: i.prefix,
|
267
|
+
first: i.firstname,
|
268
|
+
middle: i.middlename,
|
269
|
+
last: i.lastname,
|
270
|
+
suffix: i.suffix
|
271
|
+
}
|
189
272
|
end # << parse_fullname
|
190
273
|
extend self
|
191
274
|
end
|
metadata
CHANGED
@@ -1,15 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fullname-parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0
|
5
|
-
prerelease:
|
4
|
+
version: 1.1.0
|
6
5
|
platform: ruby
|
7
6
|
authors:
|
8
7
|
- xiaohui
|
9
8
|
autorequire:
|
10
9
|
bindir: bin
|
11
10
|
cert_chain: []
|
12
|
-
date:
|
11
|
+
date: 2016-02-16 00:00:00.000000000 Z
|
13
12
|
dependencies: []
|
14
13
|
description: For parsing people's fullname into pieces(prefix/first/middle/last/suffix)
|
15
14
|
email:
|
@@ -25,27 +24,25 @@ files:
|
|
25
24
|
- lib/fullname/parser/version.rb
|
26
25
|
homepage: http://github.com/xiaohui-zhangxh/fullname-parser
|
27
26
|
licenses: []
|
27
|
+
metadata: {}
|
28
28
|
post_install_message:
|
29
29
|
rdoc_options: []
|
30
30
|
require_paths:
|
31
31
|
- lib
|
32
32
|
required_ruby_version: !ruby/object:Gem::Requirement
|
33
|
-
none: false
|
34
33
|
requirements:
|
35
|
-
- -
|
34
|
+
- - '>='
|
36
35
|
- !ruby/object:Gem::Version
|
37
36
|
version: '0'
|
38
37
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
39
|
-
none: false
|
40
38
|
requirements:
|
41
|
-
- -
|
39
|
+
- - '>='
|
42
40
|
- !ruby/object:Gem::Version
|
43
41
|
version: '0'
|
44
42
|
requirements: []
|
45
43
|
rubyforge_project:
|
46
|
-
rubygems_version:
|
44
|
+
rubygems_version: 2.4.8
|
47
45
|
signing_key:
|
48
|
-
specification_version:
|
46
|
+
specification_version: 4
|
49
47
|
summary: Split fullname into pieces(prefix/first/middle/last/suffix)
|
50
48
|
test_files: []
|
51
|
-
has_rdoc:
|