fullname-parser 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +18 -0
- data/README.md +18 -0
- data/fullname-parser.gemspec +17 -0
- data/lib/fullname/parser.rb +185 -0
- data/lib/fullname/parser/version.rb +6 -0
- metadata +51 -0
data/.gitignore
ADDED
data/README.md
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
fullname_parser
|
2
|
+
===============
|
3
|
+
|
4
|
+
There are two ways to use this function:
|
5
|
+
|
6
|
+
require 'fullname/parser'
|
7
|
+
Fullname::Parser.parse_fullname("Xiaohui Zhang")
|
8
|
+
|
9
|
+
=> {:last=>"Zhang", :middle=>nil, :first=>"Xiaohui", :prefix=>nil, :suffix=>nil}
|
10
|
+
|
11
|
+
or
|
12
|
+
|
13
|
+
require 'fullname/parser'
|
14
|
+
include Fullname::Parser
|
15
|
+
parse_fullname("Xiaohui Zhang")
|
16
|
+
|
17
|
+
=> {:last=>"Zhang", :middle=>nil, :first=>"Xiaohui", :prefix=>nil, :suffix=>nil}
|
18
|
+
|
@@ -0,0 +1,17 @@
|
|
1
|
+
$:.unshift File.expand_path("../lib", __FILE__)
|
2
|
+
require 'fullname/parser/version'
|
3
|
+
|
4
|
+
# Gem packaging metadata for fullname-parser.
Gem::Specification.new do |s|
  s.name        = "fullname-parser"
  s.version     = Fullname::Parser::VERSION
  s.platform    = Gem::Platform::RUBY
  s.authors     = ['xiaohui']
  s.email       = ['xiaohui@zhangxh.net']
  s.homepage    = 'http://github.com/xiaohui-zhangxh/fullname-parser'
  s.summary     = "Split fullname into pieces(prefix/first/middle/last/suffix)"
  s.description = "For parsing people's fullname into pieces(prefix/first/middle/last/suffix)"

  # Ask git for the tracked files once, so the file list and the executable
  # list are derived from the same snapshot instead of two separate shell-outs.
  tracked_files = `git ls-files`.split("\n")

  s.files        = tracked_files
  # Anything tracked under bin/ is exposed as an executable (path relative to bin/).
  s.executables  = tracked_files.map { |f| f[%r{\Abin/(.*)}, 1] }.compact
  s.require_path = 'lib'
end
|
@@ -0,0 +1,185 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require File.expand_path('../parser/version', __FILE__)
|
3
|
+
|
4
|
+
module Fullname
  # Splits a person's full name into prefix / first / middle / last / suffix.
  #
  # Usage (either form works because the module extends itself):
  #
  #   Fullname::Parser.parse_fullname("Xiaohui Zhang")
  #   # => {:last=>"Zhang", :middle=>nil, :first=>"Xiaohui", :prefix=>nil, :suffix=>nil}
  #
  #   include Fullname::Parser
  #   parse_fullname("Xiaohui Zhang")
  #
  # Every word list below is assigned only when the constant is not already
  # defined (presumably so an application can supply its own list first --
  # confirm before relying on it). The lists are left unfrozen on purpose so
  # that existing callers which tweak them keep working. All entries are
  # lower-case; tokens are down-cased before lookup.
  module Parser

    # Generational suffixes. "II", "III" and "IV" in the middle-name/suffix slot
    # can safely be treated as suffixes (John Smith II, John Smith III, ...).
    # Nobody except a king writes "I" after their name -- they would use "Sr." --
    # so a lone "I" is treated as a middle initial and is deliberately absent.
    # Generations after "V" are practically unheard of and are left out as well.
    GENERATION_LIST = %w[
      ii iii iv v
    ] unless const_defined?(:GENERATION_LIST)

    # Suffixes that are always reported as a suffix.
    GLOBAL_SUFFIX_LIST = GENERATION_LIST + %w[
      jr jr.
      sr sr.
    ] unless const_defined?(:GLOBAL_SUFFIX_LIST)

    # Professional / rank suffixes that are reported as a suffix.
    SUFFIX_LIST = %w[
      b.a.
      capt. col. cfa c.f.a c.f.a. cpa c.p.a c.p.a.
      edd ed.d
      mph
      pc p.c. psyd psyd. psy.d phd phd. ph.d ph.d.
      r.s.m.
      usn
    ] unless const_defined?(:SUFFIX_LIST)

    # Trailing words that are stripped from the name but never reported.
    IGNORABLE_SUFFIXES = %w[
      do d.o. d.o dds d.d.s.
      esq esq.
      md m.d. m.d
      mr. ms. mrs.
      jd jd. j.d.
      retd ret. retd.
      usmc
    ] unless const_defined?(:IGNORABLE_SUFFIXES)

    # Ignorable suffixes that may in fact be the person's surname (e.g. "Anh Do").
    SUFFIX_CAN_BE_LASTNAME = %w[
      do
    ] unless const_defined?(:SUFFIX_CAN_BE_LASTNAME)

    # Leading titles that are collected into :prefix.
    PREFIX_LIST = %w[
      asst.
      attorney atty.
      bg brig gen
      colonel cardinal capt capt. captain cdr col col. congressman cpt
      dir. dr dr.
      exec.
      general gen gen.
      honorable hon hon.
      judge justice chiefjustice
      lieutenant lcdr lt lt. ltc ltcol. ltcol ltjg
      mr mr. ms ms. mrs mrs. maj maj. major miss
      president prof prof. professor
      reverend rev rev.
      sheriff
      sr sr.
    ] unless const_defined?(:PREFIX_LIST)

    # Leading words that are dropped without being reported.
    # (The constant name keeps its historical spelling; it is public.)
    IGNORABLE_PREFIXS = %w[
      the
    ] unless const_defined?(:IGNORABLE_PREFIXS)

    # Particles that are considered part of the last name when they precede it.
    LAST_NAME_EXTENSIONS = %w[
      bar ben
      da dal dan de del den der des dela della di do du
      el
      la le lo
      mac mc
      san
      st st. sta sta.
      van von ver vanden vander
    ] unless const_defined?(:LAST_NAME_EXTENSIONS)

    # Ordinal spellings normalised to roman numerals before suffix matching.
    CONVERSION = {
      '1st' => 'I',
      '2nd' => 'II',
      '3rd' => 'III',
      '4th' => 'IV',
      '5th' => 'V',
      '6th' => 'VI',
      '7th' => 'VII',
      '8th' => 'VIII',
      '9th' => 'IX',
    } unless const_defined?(:CONVERSION)

    # Parse +name+ into its components.
    #
    # name - the full name; nil and non-String values are converted with #to_s.
    #
    # Returns a Hash with the keys :last, :middle, :first, :prefix and :suffix
    # (missing parts are nil). When the cleaned-up name has at most one word the
    # Hash contains only :last, holding that word (or "" for blank input).
    # Only the first reportable suffix is returned.
    def parse_fullname(name)
      # nil used to blow up with NoMethodError on the first substitution.
      name = name.to_s

      # Normalise the typographic apostrophe to a plain one.
      name = name.tr("\u2019", "'")

      # Drop parenthesised text, then any unbalanced parenthesis left behind.
      #   'Susan M. (Scully) Schultz' => 'Susan M. Schultz'
      #   'Jay (Jung) Heum Kim'       => 'Jay Heum Kim'
      name = name.gsub(/\(.*?\)/, ' ').delete('()')

      # Drop quoted nicknames.
      #   'Darin "Derry" Ronald Anderson' => 'Darin Ronald Anderson'
      #   "Nicole 'nikki' Adame"          => 'Nicole Adame'
      # A single quote only opens/closes a nickname when it is not glued to a
      # letter on its outer side; otherwise the apostrophes of "D'Arcy O'Connor"
      # would be paired up and the text between them deleted.
      name = name.gsub(/".*?"/, ' ')
      name = name.gsub(/(?<![A-Za-z])'.*?'(?![A-Za-z])/, ' ')

      # Drop text in curly brackets.
      #   'Henry C.{Harry} Wilson' => 'Henry C. Wilson'
      name = name.gsub(/\{.*?\}/, ' ')

      # Drop stand-alone tokens that contain no ASCII letter.
      #   "William . D. Beard" => "William D. Beard"
      # NOTE(review): this also removes tokens written entirely in a non-Latin
      # script -- confirm whether that is intended.
      name = name.gsub(/\s+[^a-zA-Z]+\s+/, ' ')

      # Commas become whitespace so that a suffix such as the "JD" in
      # "Marvene A Gordon, JD" ends up as its own token.
      cleaned = name.tr(',', ' ').strip

      # Standardise ordinals ('2nd' => 'II', '3rd' => 'III', ...).
      tokens = cleaned.split(/\s+/).map { |token| CONVERSION[token.downcase] || token }

      # A single word (or nothing) cannot be split; hand back the cleaned word
      # rather than the working string with its stray commas and padding.
      return { :last => cleaned } if tokens.length <= 1

      # Peel suffixes off the end. Each entry is [word, reportable?]; ignorable
      # ones are remembered too because they may turn out to be the surname.
      suffixes = []
      while tokens.length > 1
        candidate = tokens.last.downcase
        if IGNORABLE_SUFFIXES.include?(candidate)
          suffixes.unshift([tokens.pop, false])
        elsif SUFFIX_LIST.include?(candidate) || GLOBAL_SUFFIX_LIST.include?(candidate)
          suffixes.unshift([tokens.pop, true])
        else
          break
        end
      end

      # Peel titles off the front until a word that is not a known prefix.
      #   'Lt Col Marvene A Gordon', 'The Honorable Dexter F George'
      prefixes = []
      while tokens.length > 1
        candidate = tokens.first.downcase
        if IGNORABLE_PREFIXS.include?(candidate)
          tokens.shift
        elsif PREFIX_LIST.include?(candidate)
          prefixes.push(tokens.shift)
        else
          break
        end
      end
      prefix = prefixes.empty? ? nil : prefixes.join(' ')

      # The final word is the last name, together with any particles
      # ("van", "de la", ...) directly in front of it. At least one word is
      # always left for the first name.
      last_parts = [tokens.pop]
      last_parts.unshift(tokens.pop) while tokens.length > 1 && LAST_NAME_EXTENSIONS.include?(tokens.last.downcase)
      last_name = last_parts.join(' ')

      first_name  = tokens.shift
      middle_name = tokens.empty? ? nil : tokens.join(' ')

      # Nothing left for a first name: the "prefix" was really the first name.
      if first_name.nil? && prefix
        first_name = prefix
        prefix = nil
      end

      # True when the leading stripped suffix could actually be a surname.
      # Evaluated lazily because the list shrinks between the two checks below.
      surname_like_suffix = lambda do
        !suffixes.empty? && SUFFIX_CAN_BE_LASTNAME.include?(suffixes.first.first.downcase)
      end

      # "Anh Do": the only remaining word is the first name, "Do" the surname.
      if first_name.nil? && surname_like_suffix.call
        first_name = last_name
        last_name = suffixes.shift.first
      end

      # "John A. Do": a bare initial as last name is really a middle initial.
      if last_name =~ /\A[A-Z]\.?\z/i && surname_like_suffix.call
        middle_name = [middle_name, last_name].compact.join(' ')
        last_name = suffixes.shift.first
      end

      # Report only the first suffix that is not ignorable.
      reportable = suffixes.find { |_word, keep| keep }
      suffix = reportable && reportable.first

      { :last => last_name, :middle => middle_name, :first => first_name, :prefix => prefix, :suffix => suffix }
    end # << parse_fullname

    extend self
  end
end
|
metadata
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: fullname-parser
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.3
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- xiaohui
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2013-05-15 00:00:00.000000000 Z
|
13
|
+
dependencies: []
|
14
|
+
description: For parsing people's fullname into pieces(prefix/first/middle/last/suffix)
|
15
|
+
email:
|
16
|
+
- xiaohui@zhangxh.net
|
17
|
+
executables: []
|
18
|
+
extensions: []
|
19
|
+
extra_rdoc_files: []
|
20
|
+
files:
|
21
|
+
- .gitignore
|
22
|
+
- README.md
|
23
|
+
- fullname-parser.gemspec
|
24
|
+
- lib/fullname/parser.rb
|
25
|
+
- lib/fullname/parser/version.rb
|
26
|
+
homepage: http://github.com/xiaohui-zhangxh/fullname-parser
|
27
|
+
licenses: []
|
28
|
+
post_install_message:
|
29
|
+
rdoc_options: []
|
30
|
+
require_paths:
|
31
|
+
- lib
|
32
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ! '>='
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: '0'
|
44
|
+
requirements: []
|
45
|
+
rubyforge_project:
|
46
|
+
rubygems_version: 1.8.25
|
47
|
+
signing_key:
|
48
|
+
specification_version: 3
|
49
|
+
summary: Split fullname into pieces(prefix/first/middle/last/suffix)
|
50
|
+
test_files: []
|
51
|
+
has_rdoc:
|