fullname-parser 1.0.5 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/fullname/parser.rb +161 -78
- data/lib/fullname/parser/version.rb +1 -1
- metadata +7 -10
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 8701ad30b034d522d4116053672296dad64fe2b8
|
4
|
+
data.tar.gz: 00b0315fbefe4988a2aadde0c14a0f128baf7271
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: f3498c76c53588b73363fa76652fd1d268282e8742116931793330eab1cc9becbbc8b5f65fe20647be02e0b5755d7c729e6cf4f309b436eda1d82adba56d5e74
|
7
|
+
data.tar.gz: 4472a7dc0e50d9129c0ab96b7b0006b588f1f5db1c96dccf79f21d34870ad09f2a498c1481311e94ce69e9c75c13a36019b69a632c34017f1d840f6dc250973d
|
data/lib/fullname/parser.rb
CHANGED
@@ -101,91 +101,174 @@ module Fullname
|
|
101
101
|
'9th' => 'IX',
|
102
102
|
} unless const_defined?(:CONVERSION)
|
103
103
|
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
104
|
+
class Error < StandardError; end
|
105
|
+
class Identifier
|
106
|
+
attr_reader :name, :original_name, :prefix, :firstname, :middlename, :lastname, :suffix
|
107
|
+
def initialize(name)
|
108
|
+
@original_name = name.dup
|
109
|
+
@name = name.dup
|
110
|
+
@prefix_list = []
|
111
|
+
@suffix_list = []
|
112
|
+
sanitize!
|
113
|
+
flip_parts!
|
114
|
+
breakup!
|
115
|
+
end
|
116
|
+
|
117
|
+
private
|
118
|
+
|
119
|
+
def sanitize!
|
120
|
+
# replace "’" with "'"
|
121
|
+
name.gsub!(/’/, "'")
|
122
|
+
# remove parenthesized text (including the parentheses), then any stray unmatched parentheses
|
123
|
+
# ex. 'Susan M. (Scully) Schultz' => 'Susan M. Schultz'
|
124
|
+
# 'Jay (Jung) Heum Kim' => 'Jay Heum Kim'
|
125
|
+
name.gsub!(/\(.*?\)/, ' ')
|
126
|
+
name.gsub!(/\(|\)/, '')
|
127
|
+
# remove quoted strings
|
128
|
+
# Darin "Derry" Ronald Anderson => 'Darin Ronald Anderson'
|
129
|
+
# Nancy M. "Shelli" Egger => 'Nancy M. Egger'
|
130
|
+
# Nicole 'nikki' Adame => 'Nicole Adame'
|
131
|
+
name.gsub!(/".*?"/, ' ')
|
132
|
+
name.gsub!(/'.*?'/i, ' ')
|
133
|
+
|
134
|
+
# remove curly brackets
|
135
|
+
# Henry C.{Harry} Wilson => 'Henry C. Wilson'
|
136
|
+
# Cellestine {Steen} Armstrong => 'Cellestine Armstrong'
|
137
|
+
name.gsub!(/\{.*?\}/, ' ')
|
138
|
+
# remove exceptional names
|
139
|
+
# ex. "William . D. 'Bill' Beard" => "William D. 'Bill' Beard"
|
140
|
+
# (this regexp replaces any whitespace-delimited run of non-letter characters with a single space)
|
141
|
+
name.gsub!(/\s+[^a-zA-Z]+\s+/, ' ')
|
142
|
+
# Why we use substitute(sub) comma to whitespace, not global substitute(gsub).
|
143
|
+
# the reason is the substitution applies for suffix splitting, not for replacing
|
144
|
+
# bad data. As we want, convert "Marvene A Gordon, JD" to "Marvene A Gordon JD",
|
145
|
+
# so that the suffix will get into the split array.
|
146
|
+
# also standardize ordinal suffixes, e.g. '2nd' => 'II', '3rd' => 'III'
|
147
|
+
CONVERSION.each_pair do |finder, replacer|
|
148
|
+
name.gsub!(Regexp.new("\\b#{Regexp.escape(finder)}\\b", true), replacer)
|
148
149
|
end
|
149
150
|
end
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
151
|
+
|
152
|
+
def extract_suffix(str)
|
153
|
+
list = []
|
154
|
+
loop do
|
155
|
+
m = /(.*)[, ](.+)/.match(str)
|
156
|
+
break unless m
|
157
|
+
remaining = m[1]
|
158
|
+
last_part = m[2].strip
|
159
|
+
last_part_downcase = last_part.downcase
|
160
|
+
if IGNORABLE_SUFFIXES.include?(last_part_downcase)
|
161
|
+
list.unshift([last_part, false])
|
162
|
+
elsif SUFFIX_LIST.include?(last_part_downcase) || GLOBAL_SUFFIX_LIST.include?(last_part_downcase)
|
163
|
+
list.unshift([last_part, true])
|
164
|
+
else
|
165
|
+
break
|
166
|
+
end
|
167
|
+
str = remaining.gsub(/[, ]+$/, '').strip
|
161
168
|
end
|
169
|
+
[str, list]
|
162
170
|
end
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
171
|
+
|
172
|
+
def extract_prefix(str)
|
173
|
+
list = []
|
174
|
+
loop do
|
175
|
+
m = /(.+?)[, ](.+)/.match(str)
|
176
|
+
break unless m
|
177
|
+
remining = m[2]
|
178
|
+
first_part = m[1]
|
179
|
+
first_part_downcase = first_part.downcase
|
180
|
+
if IGNORABLE_PREFIXS.include?(first_part_downcase)
|
181
|
+
# skip words
|
182
|
+
elsif PREFIX_LIST.include?(first_part_downcase)
|
183
|
+
list.push(first_part)
|
184
|
+
else
|
185
|
+
break
|
186
|
+
end
|
187
|
+
str = remining.gsub(/^[, ]+/, '').strip
|
188
|
+
end
|
189
|
+
[str, list]
|
176
190
|
end
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
191
|
+
|
192
|
+
def extract_suffix_before_flipping_parts
|
193
|
+
remaining, list = extract_suffix(name)
|
194
|
+
@name = remaining
|
195
|
+
@suffix_list += list
|
181
196
|
end
|
182
|
-
|
183
|
-
|
184
|
-
|
197
|
+
|
198
|
+
def flip_parts!
|
199
|
+
extract_suffix_before_flipping_parts
|
200
|
+
parts = name.split(/,/)
|
201
|
+
case parts.size
|
202
|
+
when 1
|
203
|
+
when 2
|
204
|
+
remining, list = extract_suffix(parts[0])
|
205
|
+
@name = [parts[1], remining].join(' ').strip.gsub(/ +/, ' ')
|
206
|
+
@suffix_list += list
|
207
|
+
when 3
|
208
|
+
remining, list = extract_suffix(parts[0..1].join(' '))
|
209
|
+
@name = [parts[2], remining].join(' ').strip.gsub(/ +/, ' ')
|
210
|
+
@suffix_list += list
|
211
|
+
else
|
212
|
+
fail Error.new("name [ #{name} ] has >2 commas, don't know how to parse")
|
213
|
+
end
|
214
|
+
|
215
|
+
extract_prefix_after_flipping_parts
|
185
216
|
end
|
186
|
-
|
187
|
-
|
188
|
-
|
217
|
+
|
218
|
+
def extract_prefix_after_flipping_parts
|
219
|
+
remaining, list = extract_prefix(name)
|
220
|
+
@name = remaining
|
221
|
+
@prefix_list += list
|
222
|
+
end
|
223
|
+
|
224
|
+
def breakup!
|
225
|
+
parts = name.split(/[, ]+/)
|
226
|
+
|
227
|
+
# process prefix
|
228
|
+
@prefix = @prefix_list.join(' ') if @prefix_list.any?
|
229
|
+
|
230
|
+
# process lastname
|
231
|
+
# Loop around until we run into a name that is not contained in the LAST_NAME_EXTENSIONS
|
232
|
+
last_name_arr = []
|
233
|
+
last_name_arr.push(parts.pop)
|
234
|
+
last_name_arr.push(parts.pop) while parts.length > 1 && LAST_NAME_EXTENSIONS.include?(parts.last.downcase)
|
235
|
+
@lastname = last_name_arr.reverse.join(' ')
|
236
|
+
|
237
|
+
# process firstname and middlename
|
238
|
+
@firstname = parts.shift if parts.length >= 1
|
239
|
+
@middlename = parts.join(' ') if parts.length > 0
|
240
|
+
if firstname.nil? && prefix
|
241
|
+
@firstname = prefix
|
242
|
+
@prefix = nil
|
243
|
+
end
|
244
|
+
|
245
|
+
# move lastname to firstname, move first suffix to lastname
|
246
|
+
if firstname.nil? && @suffix_list.any? && SUFFIX_CAN_BE_LASTNAME.include?(@suffix_list.first.first.downcase)
|
247
|
+
@firstname = lastname
|
248
|
+
@lastname = @suffix_list.shift.first
|
249
|
+
end
|
250
|
+
|
251
|
+
# move lastname to middlename, move first suffix to lastname
|
252
|
+
if lastname =~ /^[A-Z]\.?$/i && @suffix_list.any? && SUFFIX_CAN_BE_LASTNAME.include?(@suffix_list.first.first.downcase)
|
253
|
+
@middlename = [middlename, lastname].compact.join(' ')
|
254
|
+
@lastname = @suffix_list.shift.first
|
255
|
+
end
|
256
|
+
|
257
|
+
# process suffix
|
258
|
+
@suffix_list.delete_if { |_, ignore_able| !ignore_able }
|
259
|
+
@suffix = @suffix_list.any? ? @suffix_list.first.first : nil
|
260
|
+
end
|
261
|
+
end
|
262
|
+
|
263
|
+
def parse_fullname(name)
|
264
|
+
i = Identifier.new(name)
|
265
|
+
return {
|
266
|
+
prefix: i.prefix,
|
267
|
+
first: i.firstname,
|
268
|
+
middle: i.middlename,
|
269
|
+
last: i.lastname,
|
270
|
+
suffix: i.suffix
|
271
|
+
}
|
189
272
|
end # << parse_fullname
|
190
273
|
extend self
|
191
274
|
end
|
metadata
CHANGED
@@ -1,15 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fullname-parser
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0
|
5
|
-
prerelease:
|
4
|
+
version: 1.1.0
|
6
5
|
platform: ruby
|
7
6
|
authors:
|
8
7
|
- xiaohui
|
9
8
|
autorequire:
|
10
9
|
bindir: bin
|
11
10
|
cert_chain: []
|
12
|
-
date:
|
11
|
+
date: 2016-02-16 00:00:00.000000000 Z
|
13
12
|
dependencies: []
|
14
13
|
description: For parsing people's fullname into pieces(prefix/first/middle/last/suffix)
|
15
14
|
email:
|
@@ -25,27 +24,25 @@ files:
|
|
25
24
|
- lib/fullname/parser/version.rb
|
26
25
|
homepage: http://github.com/xiaohui-zhangxh/fullname-parser
|
27
26
|
licenses: []
|
27
|
+
metadata: {}
|
28
28
|
post_install_message:
|
29
29
|
rdoc_options: []
|
30
30
|
require_paths:
|
31
31
|
- lib
|
32
32
|
required_ruby_version: !ruby/object:Gem::Requirement
|
33
|
-
none: false
|
34
33
|
requirements:
|
35
|
-
- -
|
34
|
+
- - '>='
|
36
35
|
- !ruby/object:Gem::Version
|
37
36
|
version: '0'
|
38
37
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
39
|
-
none: false
|
40
38
|
requirements:
|
41
|
-
- -
|
39
|
+
- - '>='
|
42
40
|
- !ruby/object:Gem::Version
|
43
41
|
version: '0'
|
44
42
|
requirements: []
|
45
43
|
rubyforge_project:
|
46
|
-
rubygems_version:
|
44
|
+
rubygems_version: 2.4.8
|
47
45
|
signing_key:
|
48
|
-
specification_version:
|
46
|
+
specification_version: 4
|
49
47
|
summary: Split fullname into pieces(prefix/first/middle/last/suffix)
|
50
48
|
test_files: []
|
51
|
-
has_rdoc:
|