fullname-parser 1.0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +18 -0
- data/README.md +18 -0
- data/fullname-parser.gemspec +17 -0
- data/lib/fullname/parser.rb +185 -0
- data/lib/fullname/parser/version.rb +6 -0
- metadata +51 -0
data/.gitignore
ADDED
data/README.md
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
fullname_parser
|
2
|
+
===============
|
3
|
+
|
4
|
+
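There are two ways to call `parse_fullname`: as a module method on `Fullname::Parser`, or as a plain method after `include Fullname::Parser`: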
|
5
|
+
|
6
|
+
require 'fullname/parser'
|
7
|
+
Fullname::Parser.parse_fullname("Xiaohui Zhang")
|
8
|
+
|
9
|
+
=> {:last=>"Zhang", :middle=>nil, :first=>"Xiaohui", :prefix=>nil, :suffix=>nil}
|
10
|
+
|
11
|
+
or
|
12
|
+
|
13
|
+
require 'fullname/parser'
|
14
|
+
include Fullname::Parser
|
15
|
+
parse_fullname("Xiaohui Zhang")
|
16
|
+
|
17
|
+
=> {:last=>"Zhang", :middle=>nil, :first=>"Xiaohui", :prefix=>nil, :suffix=>nil}
|
18
|
+
|
@@ -0,0 +1,17 @@
|
|
1
|
+
$:.unshift File.expand_path("../lib", __FILE__)
|
2
|
+
require 'fullname/parser/version'
|
3
|
+
|
4
|
+
# Gem specification for fullname-parser.
Gem::Specification.new do |spec|
  # Identity and version (the version constant comes from the required version file).
  spec.name        = "fullname-parser"
  spec.version     = Fullname::Parser::VERSION
  spec.platform    = Gem::Platform::RUBY

  # Author / project information.
  spec.authors     = ['xiaohui']
  spec.email       = ['xiaohui@zhangxh.net']
  spec.homepage    = 'http://github.com/xiaohui-zhangxh/fullname-parser'
  spec.summary     = "Split fullname into pieces(prefix/first/middle/last/suffix)"
  spec.description = "For parsing people's fullname into pieces(prefix/first/middle/last/suffix)"

  # Packaged files are whatever `git ls-files` prints, one path per line.
  spec.files = `git ls-files`.split("\n")

  # Executables are the listed paths under bin/, with the "bin/" prefix removed;
  # paths that do not match yield nil and are dropped by compact.
  spec.executables = `git ls-files`.split("\n").map { |path| path[/^bin\/(.*)/, 1] }.compact

  spec.require_path = 'lib'
end
|
@@ -0,0 +1,185 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require File.expand_path('../parser/version', __FILE__)
|
3
|
+
|
4
|
+
module Fullname
  # Splits a person's full name into prefix / first / middle / last / suffix.
  #
  # Every word list below is assigned only when the constant is not already
  # defined, so a definition that exists before this file is loaded is kept.
  # All list entries are lower case; tokens are down-cased before lookup.
  module Parser

    # When "II" or "III" or even "IV" appear in the Middle Name/Suffix slot, it can safely be assumed that they are Suffixes.
    # (John Smith has a son named John Smith II, who has a son named John Smith III, etc.) However, nobody (except a king)
    # puts "I" after their name to indicate that they are the "first." If anything, they put "Sr." Therefore, a letter "I"
    # appearing in the Middle Name/Suffix slot can be assumed to be their Middle Initial.
    # So 'i' is left out of GENERATION_LIST.
    #
    # Also, almost nobody (except a king) reaches beyond 'v', so later generations are not recognised either.
    GENERATION_LIST = [
      #'i',
      'ii', 'iii', 'iv', 'v',
      # 'vi', 'vii', 'viii', 'ix', 'x', 'xi', 'xii', 'xiii', 'xiv', 'xv', 'xvi', 'xvii', 'xviii', 'xix', 'xx',
    ] unless const_defined?(:GENERATION_LIST)

    # Generational suffixes plus junior/senior. Recognised at the end of a name and reported as :suffix.
    GLOBAL_SUFFIX_LIST = GENERATION_LIST + [
      'jr', 'jr.',
      'sr', 'sr.',
    ] unless const_defined?(:GLOBAL_SUFFIX_LIST)

    # Other trailing tokens that are recognised at the end of a name and reported as :suffix.
    SUFFIX_LIST = [
      'b.a.',
      'capt.', 'col.', 'cfa', 'c.f.a', 'c.f.a.', 'cpa', 'c.p.a', 'c.p.a.',
      'edd', 'ed.d',
      'mph',
      'pc', 'p.c.', 'psyd', 'psyd.', 'psy.d', 'phd', 'phd.', 'ph.d', 'ph.d.',
      'r.s.m.',
      'usn',
    ] unless const_defined?(:SUFFIX_LIST)

    # Trailing tokens that are stripped from the name but never reported as :suffix.
    IGNORABLE_SUFFIXES = [
      'do', 'd.o.', 'd.o', 'dds', 'd.d.s.',
      'esq', 'esq.',
      'md', 'm.d.', 'm.d',
      'mr.', 'ms.', 'mrs.',
      'jd', 'jd.', 'j.d.',
      'retd', 'ret.', 'retd.',
      'usmc',
    ] unless const_defined?(:IGNORABLE_SUFFIXES)

    # Suffix tokens that are promoted to the last name when stripping them
    # left no first name, or left only an initial as the last name (e.g. "Anh Do").
    SUFFIX_CAN_BE_LASTNAME = [
      'do',
    ] unless const_defined?(:SUFFIX_CAN_BE_LASTNAME)

    # Leading tokens collected into :prefix.
    PREFIX_LIST = [
      'asst.',
      'attorney', 'atty.',
      'bg', 'brig', 'gen',
      'colonel', 'cardinal', 'capt', 'capt.', 'captain', 'cdr', 'col' , 'col.', 'congressman', 'cpt',
      'dir.', 'dr', 'dr.',
      'exec.',
      'general', 'gen', 'gen.',
      'honorable', 'hon', 'hon.',
      'judge', 'justice', 'chiefjustice',
      'lieutenant', 'lcdr', 'lt', 'lt.', 'ltc', 'ltcol.', 'ltcol', 'ltjg',
      'mr', 'mr.', 'ms', 'ms.', 'mrs', 'mrs.', 'maj', 'maj.', 'major', 'miss',
      'president', 'prof', 'prof.', 'professor',
      'reverend', 'rev', 'rev.',
      'sheriff',
      'sr', 'sr.'

    ] unless const_defined?(:PREFIX_LIST)

    # Leading tokens that are dropped without being recorded in :prefix.
    IGNORABLE_PREFIXS = [
      'the',
    ] unless const_defined?(:IGNORABLE_PREFIXS)


    # These will be considered part of the last name
    LAST_NAME_EXTENSIONS = [
      'bar', 'ben',
      'da', 'dal', 'dan', 'de', 'del', 'den', 'der', 'des', 'dela', 'della', 'di', 'do', 'du',
      'el',
      'la', 'le', 'lo',
      'mac', 'mc',
      'san',
      'st', 'st.', 'sta', 'sta.',
      'van','von', 'ver', 'vanden', 'vander'
    ] unless const_defined?(:LAST_NAME_EXTENSIONS)

    # Ordinal tokens are rewritten to Roman numerals before suffix matching
    # (only those present in GENERATION_LIST are then treated as suffixes).
    CONVERSION = {
      '1st' => 'I',
      '2nd' => 'II',
      '3rd' => 'III',
      '4th' => 'IV',
      '5th' => 'V',
      '6th' => 'VI',
      '7th' => 'VII',
      '8th' => 'VIII',
      '9th' => 'IX',
    } unless const_defined?(:CONVERSION)

    # Parse a full name into its pieces.
    #
    # name - String holding the full name. Parenthesised, double-quoted,
    #        single-quoted and {braced} nicknames are discarded.
    #
    # Returns a Hash with the keys :last, :middle, :first, :prefix and :suffix
    # (missing pieces are nil). When the cleaned-up name holds one token or
    # none, only { :last => cleaned_name } is returned.
    def parse_fullname(name)
      first_name  = nil
      middle_name = nil
      prefix      = nil

      # Normalise the typographic apostrophe to a plain one.
      name = name.gsub(/’/, "'")

      # Drop parenthesised text, then any unbalanced parentheses left over.
      # ex. 'Susan M. (Scully) Schultz' => 'Susan M. Schultz'
      #     'Jay (Jung) Heum Kim'       => 'Jay Heum Kim'
      name = name.gsub(/\(.*?\)/, ' ').gsub(/\(|\)/, '')

      # Drop quoted nicknames.
      #     Darin "Derry" Ronald Anderson => 'Darin Ronald Anderson'
      #     Nancy M. "Shelli" Egger       => 'Nancy M. Egger'
      #     Nicole 'nikki' Adame          => 'Nicole Adame'
      # A single quote with a letter directly before (opening) or directly
      # after (closing) is an apostrophe inside a name, not a quote mark, so
      # "Shaun O'Neil D'Arcy" is left intact instead of losing "'Neil D'".
      name = name.gsub(/".*?"/, ' ').gsub(/(?<![[:alpha:]])'.*?'(?![[:alpha:]])/, ' ')

      # Drop nicknames in curly brackets.
      #     Henry C.{Harry} Wilson       => 'Henry C. Wilson'
      #     Cellestine {Steen} Armstrong => 'Cellestine Armstrong'
      name = name.gsub(/\{.*?\}/, ' ')

      # Drop stray tokens that contain no letters at all.
      # ex. "William . D. 'Bill' Beard" => "William D. 'Bill' Beard"
      # [[:alpha:]] (rather than a-zA-Z) keeps tokens written entirely in
      # non-Latin letters from being mistaken for punctuation and deleted.
      name = name.gsub(/\s+[^[:alpha:]]+\s+/, ' ')

      # Commas become whitespace so that a trailing suffix ends up as its own
      # token ("Marvene A Gordon, JD" => "Marvene A Gordon JD"); ordinals are
      # standardised on the way ('2nd' => 'II', '3rd' => 'III').
      cleaned = name.gsub(',', ' ').strip
      tokens  = cleaned.split(/\s+/).map { |n| CONVERSION[n.downcase] || n }

      # Nothing to split: report the cleaned text (no padding left behind by
      # the removals above, no stray comma) as the last name.
      return { :last => cleaned } if tokens.length <= 1

      # Peel suffixes off the end, always leaving at least one token.
      # Each entry is [token, reportable]; ignorable suffixes are remembered
      # too because one of them may turn out to be the last name (see below).
      suffix_arr = []
      while tokens.length > 1
        candidate = tokens.last.downcase
        if IGNORABLE_SUFFIXES.include?(candidate)
          suffix_arr.unshift([tokens.pop, false])
        elsif SUFFIX_LIST.include?(candidate) || GLOBAL_SUFFIX_LIST.include?(candidate)
          suffix_arr.unshift([tokens.pop, true])
        else
          break
        end
      end

      # Peel prefixes off the front until a token is not a known prefix,
      # again leaving at least one token.
      # ex(FL): 'Lt Col Marvene A Gordon', 'The Honorable Dexter F George'
      prefix_arr = []
      while tokens.length > 1
        candidate = tokens.first.downcase
        if IGNORABLE_PREFIXS.include?(candidate)
          tokens.shift
        elsif PREFIX_LIST.include?(candidate)
          prefix_arr.push(tokens.shift)
        else
          break
        end
      end
      prefix = prefix_arr.join(' ') unless prefix_arr.empty?

      # The final token is the last name; preceding LAST_NAME_EXTENSIONS
      # tokens ("van der Rohe") are pulled in as long as one token remains.
      last_name_arr = [tokens.pop]
      last_name_arr.push(tokens.pop) while tokens.length > 1 && LAST_NAME_EXTENSIONS.include?(tokens.last.downcase)
      last_name = last_name_arr.reverse.join(' ')

      first_name  = tokens.shift unless tokens.empty?
      middle_name = tokens.join(' ') unless tokens.empty?

      # Only a prefix and a last name were found: treat the prefix as the first name.
      if first_name.nil? && prefix
        first_name = prefix
        prefix = nil
      end

      # No first name, and the first stripped suffix can be a surname
      # ("Anh Do"): the token taken as last name is really the first name.
      if first_name.nil? && suffix_arr.any? && SUFFIX_CAN_BE_LASTNAME.include?(suffix_arr.first.first.downcase)
        first_name = last_name
        last_name = suffix_arr.shift.first
      end

      # The "last name" is only an initial and the first stripped suffix can
      # be a surname: the initial joins the middle name instead.
      if last_name =~ /\A[A-Z]\.?\z/i && suffix_arr.any? && SUFFIX_CAN_BE_LASTNAME.include?(suffix_arr.first.first.downcase)
        middle_name = [middle_name, last_name].compact.join(' ')
        last_name = suffix_arr.shift.first
      end

      # Forget the ignorable suffixes; only the first remaining one is returned.
      suffix_arr.delete_if { |_token, reportable| !reportable }
      suffix = suffix_arr.empty? ? nil : suffix_arr.first.first

      return { :last => last_name, :middle => middle_name, :first => first_name, :prefix => prefix, :suffix => suffix }
    end # << parse_fullname

    # Makes parse_fullname callable as Fullname::Parser.parse_fullname in
    # addition to being available through `include Fullname::Parser`.
    extend self
  end
end
|
metadata
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: fullname-parser
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.3
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- xiaohui
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2013-05-15 00:00:00.000000000 Z
|
13
|
+
dependencies: []
|
14
|
+
description: For parsing people's fullname into pieces(prefix/first/middle/last/suffix)
|
15
|
+
email:
|
16
|
+
- xiaohui@zhangxh.net
|
17
|
+
executables: []
|
18
|
+
extensions: []
|
19
|
+
extra_rdoc_files: []
|
20
|
+
files:
|
21
|
+
- .gitignore
|
22
|
+
- README.md
|
23
|
+
- fullname-parser.gemspec
|
24
|
+
- lib/fullname/parser.rb
|
25
|
+
- lib/fullname/parser/version.rb
|
26
|
+
homepage: http://github.com/xiaohui-zhangxh/fullname-parser
|
27
|
+
licenses: []
|
28
|
+
post_install_message:
|
29
|
+
rdoc_options: []
|
30
|
+
require_paths:
|
31
|
+
- lib
|
32
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ! '>='
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: '0'
|
44
|
+
requirements: []
|
45
|
+
rubyforge_project:
|
46
|
+
rubygems_version: 1.8.25
|
47
|
+
signing_key:
|
48
|
+
specification_version: 3
|
49
|
+
summary: Split fullname into pieces(prefix/first/middle/last/suffix)
|
50
|
+
test_files: []
|
51
|
+
has_rdoc:
|