fullname-parser 1.0.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,18 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ coverage
6
+ InstalledFiles
7
+ lib/bundler/man
8
+ pkg
9
+ rdoc
10
+ spec/reports
11
+ test/tmp
12
+ test/version_tmp
13
+ tmp
14
+
15
+ # YARD artifacts
16
+ .yardoc
17
+ _yardoc
18
+ doc/
@@ -0,0 +1,18 @@
1
+ fullname_parser
2
+ ===============
3
+
4
+ There are two ways to call `parse_fullname`:
5
+
6
+ require 'fullname/parser'
7
+ Fullname::Parser.parse_fullname("Xiaohui Zhang")
8
+
9
+ => {:last=>"Zhang", :middle=>nil, :first=>"Xiaohui", :prefix=>nil, :suffix=>nil}
10
+
11
+ or
12
+
13
+ require 'fullname/parser'
14
+ include Fullname::Parser
15
+ parse_fullname("Xiaohui Zhang")
16
+
17
+ => {:last=>"Zhang", :middle=>nil, :first=>"Xiaohui", :prefix=>nil, :suffix=>nil}
18
+
@@ -0,0 +1,17 @@
1
# Gem specification for fullname-parser.
# The version is read from lib/fullname/parser/version.rb, so lib/ must be on
# the load path before that file is required.
$LOAD_PATH.unshift File.expand_path("../lib", __FILE__)
require 'fullname/parser/version'

Gem::Specification.new do |s|
  s.name        = "fullname-parser"
  s.version     = Fullname::Parser::VERSION
  s.platform    = Gem::Platform::RUBY
  s.authors     = ['xiaohui']
  s.email       = ['xiaohui@zhangxh.net']
  s.homepage    = 'http://github.com/xiaohui-zhangxh/fullname-parser'
  s.summary     = "Split fullname into pieces(prefix/first/middle/last/suffix)"
  s.description = "For parsing people's fullname into pieces(prefix/first/middle/last/suffix)"

  # Ask git for the tracked files once and reuse the list for both attributes.
  tracked_files = `git ls-files`.split("\n")

  s.files        = tracked_files
  # Executables are the tracked files under bin/, listed without the "bin/" prefix.
  s.executables  = tracked_files.map { |f| f[%r{\Abin/(.*)}, 1] }.compact
  s.require_path = 'lib'
end
@@ -0,0 +1,185 @@
1
+ # encoding: utf-8
2
+ require File.expand_path('../parser/version', __FILE__)
3
+
4
module Fullname
  # Splits a person's full name into prefix / first / middle / last / suffix.
  #
  # The module extends itself, so it can be used either as
  # Fullname::Parser.parse_fullname(name) or mixed in with
  # `include Fullname::Parser`.
  #
  # Every word list below is stored lower-case; tokens are down-cased before
  # they are looked up. Each constant is only assigned when it is not already
  # defined.
  module Parser

    # Generational suffixes that are recognised.
    #
    # "II", "III" and "IV" in the middle-name/suffix slot can safely be treated
    # as suffixes (John Smith has a son John Smith II, and so on). Nobody except
    # a king writes "I" after their name -- they would use "Sr." -- so a lone
    # "I" is assumed to be a middle initial and is deliberately left out.
    # Suffixes beyond "V" are also left out because almost nobody reaches them.
    GENERATION_LIST = [
      #'i',
      'ii', 'iii', 'iv', 'v',
      # 'vi', 'vii', 'viii', 'ix', 'x', 'xi', 'xii', 'xiii', 'xiv', 'xv', 'xvi', 'xvii', 'xviii', 'xix', 'xx',
    ] unless const_defined?(:GENERATION_LIST)

    # Generational suffixes plus junior/senior markers.
    GLOBAL_SUFFIX_LIST = GENERATION_LIST + [
      'jr', 'jr.',
      'sr', 'sr.',
    ] unless const_defined?(:GLOBAL_SUFFIX_LIST)

    # Trailing tokens that are stripped from the name and may be reported as :suffix.
    SUFFIX_LIST = [
      'b.a.',
      'capt.', 'col.', 'cfa', 'c.f.a', 'c.f.a.', 'cpa', 'c.p.a', 'c.p.a.',
      'edd', 'ed.d',
      'mph',
      'pc', 'p.c.', 'psyd', 'psyd.', 'psy.d', 'phd', 'phd.', 'ph.d', 'ph.d.',
      'r.s.m.',
      'usn',
    ] unless const_defined?(:SUFFIX_LIST)

    # Trailing tokens that are stripped from the name but never reported as :suffix.
    IGNORABLE_SUFFIXES = [
      'do', 'd.o.', 'd.o', 'dds', 'd.d.s.',
      'esq', 'esq.',
      'md', 'm.d.', 'm.d',
      'mr.', 'ms.', 'mrs.',
      'jd', 'jd.', 'j.d.',
      'retd', 'ret.', 'retd.',
      'usmc',
    ] unless const_defined?(:IGNORABLE_SUFFIXES)

    # Stripped suffix tokens that are put back as the last name when the
    # remaining tokens leave no first name, or leave only an initial as last name.
    SUFFIX_CAN_BE_LASTNAME = [
      'do',
    ] unless const_defined?(:SUFFIX_CAN_BE_LASTNAME)

    # Leading tokens that are collected into :prefix.
    PREFIX_LIST = [
      'asst.',
      'attorney', 'atty.',
      'bg', 'brig', 'gen',
      'colonel', 'cardinal', 'capt', 'capt.', 'captain', 'cdr', 'col' , 'col.', 'congressman', 'cpt',
      'dir.', 'dr', 'dr.',
      'exec.',
      'general', 'gen', 'gen.',
      'honorable', 'hon', 'hon.',
      'judge', 'justice', 'chiefjustice',
      'lieutenant', 'lcdr', 'lt', 'lt.', 'ltc', 'ltcol.', 'ltcol', 'ltjg',
      'mr', 'mr.', 'ms', 'ms.', 'mrs', 'mrs.', 'maj', 'maj.', 'major', 'miss',
      'president', 'prof', 'prof.', 'professor',
      'reverend', 'rev', 'rev.',
      'sheriff',
      'sr', 'sr.'

    ] unless const_defined?(:PREFIX_LIST)

    # Leading tokens that are dropped without being added to :prefix.
    IGNORABLE_PREFIXS = [
      'the',
    ] unless const_defined?(:IGNORABLE_PREFIXS)


    # Tokens that, when they directly precede the last name, become part of it.
    LAST_NAME_EXTENSIONS = [
      'bar', 'ben',
      'da', 'dal', 'dan', 'de', 'del', 'den', 'der', 'des', 'dela', 'della', 'di', 'do', 'du',
      'el',
      'la', 'le', 'lo',
      'mac', 'mc',
      'san',
      'st', 'st.', 'sta', 'sta.',
      'van','von', 'ver', 'vanden', 'vander'
    ] unless const_defined?(:LAST_NAME_EXTENSIONS)

    # Ordinal tokens and the roman numeral each one is rewritten to before
    # suffix detection (lookup is done on the down-cased token).
    CONVERSION = {
      '1st' => 'I',
      '2nd' => 'II',
      '3rd' => 'III',
      '4th' => 'IV',
      '5th' => 'V',
      '6th' => 'VI',
      '7th' => 'VII',
      '8th' => 'VIII',
      '9th' => 'IX',
    } unless const_defined?(:CONVERSION)

    # Parse a full name into its components.
    #
    # @param name [String, nil] the raw name; nil is treated as an empty string
    # @return [Hash] a hash with the keys :last, :middle, :first, :prefix and
    #   :suffix; components that were not found are nil. When the cleaned name
    #   holds at most one token, that token (or "") is returned as :last.
    def parse_fullname(name)
      name = name.to_s

      # Normalise the typographic apostrophe to a plain one.
      name = name.gsub("\u2019", "'")

      # Drop parenthesised text, then any unbalanced parenthesis left over.
      #   'Susan M. (Scully) Schultz' => 'Susan M. Schultz'
      #   'Jay (Jung) Heum Kim'       => 'Jay Heum Kim'
      name = name.gsub(/\(.*?\)/, ' ').gsub(/\(|\)/, '')

      # Drop quoted nicknames.
      #   Darin "Derry" Ronald Anderson => 'Darin Ronald Anderson'
      #   Nancy M. "Shelli" Egger       => 'Nancy M. Egger'
      #   Nicole 'nikki' Adame          => 'Nicole Adame'
      # A single-quoted nickname must open at the start of a token and close at
      # the end of one; otherwise apostrophes inside real names would be paired
      # up and "D'Arcy O'Neil" would lose everything between them.
      name = name.gsub(/".*?"/, ' ').gsub(/(?<!\S)'.*?'(?![^\s,])/, ' ')

      # Drop text in curly brackets.
      #   Henry C.{Harry} Wilson       => 'Henry C. Wilson'
      #   Cellestine {Steen} Armstrong => 'Cellestine Armstrong'
      name = name.gsub(/\{.*?\}/, ' ')

      # Collapse runs of non-letter characters that stand alone between whitespace.
      #   "William . D. Beard" => "William D. Beard"
      name = name.gsub(/\s+[^a-zA-Z]+\s+/, ' ')

      # Commas become whitespace so that a suffix written as
      # "Marvene A Gordon, JD" ends up as its own token, and ordinals are
      # standardised ('2nd' => 'II', '3rd' => 'III').
      cleaned    = name.tr(',', ' ').strip
      name_split = cleaned.split(/\s+/).map { |n| CONVERSION[n.downcase] || n }

      # Zero or one token: nothing to split. Return the cleaned token with the
      # same set of keys as every other result.
      if name_split.length <= 1
        return { :last => cleaned, :middle => nil, :first => nil, :prefix => nil, :suffix => nil }
      end

      # Peel suffixes off the end, always leaving at least one token behind.
      # Each entry is [token, reportable]; ignorable suffixes are remembered too
      # because one of them may have to be restored as the last name below.
      suffix_arr = []
      while name_split.length > 1
        candidate = name_split.last.downcase
        if IGNORABLE_SUFFIXES.include?(candidate)
          suffix_arr.unshift([name_split.pop, false])
        elsif SUFFIX_LIST.include?(candidate) || GLOBAL_SUFFIX_LIST.include?(candidate)
          suffix_arr.unshift([name_split.pop, true])
        else
          break
        end
      end

      # Peel prefixes off the front, again leaving at least one token behind.
      #   'Lt Col Marvene A Gordon', 'The Honorable Dexter F George'
      prefix_arr = []
      while name_split.length > 1
        candidate = name_split.first.downcase
        if IGNORABLE_PREFIXS.include?(candidate)
          name_split.shift
        elsif PREFIX_LIST.include?(candidate)
          prefix_arr.push(name_split.shift)
        else
          break
        end
      end
      prefix = prefix_arr.empty? ? nil : prefix_arr.join(' ')

      # The last token is the last name; preceding extension words
      # ("van", "de", ...) are folded into it as long as one token remains.
      last_name_arr = [name_split.pop]
      while name_split.length > 1 && LAST_NAME_EXTENSIONS.include?(name_split.last.downcase)
        last_name_arr.push(name_split.pop)
      end
      last_name = last_name_arr.reverse.join(' ')

      first_name  = name_split.shift
      middle_name = name_split.empty? ? nil : name_split.join(' ')

      # Only a prefix and a last name were found: report the prefix as the first name.
      if first_name.nil? && prefix
        first_name = prefix
        prefix     = nil
      end

      # A stripped "suffix" that can be a surname is restored as the last name
      # when no first name is left ...
      if first_name.nil? && suffix_arr.any? && SUFFIX_CAN_BE_LASTNAME.include?(suffix_arr.first.first.downcase)
        first_name = last_name
        last_name  = suffix_arr.shift.first
      end
      # ... or when the detected last name is only an initial, which then
      # joins the middle name.
      if last_name =~ /\A[A-Z]\.?\z/i && suffix_arr.any? && SUFFIX_CAN_BE_LASTNAME.include?(suffix_arr.first.first.downcase)
        middle_name = [middle_name, last_name].compact.join(' ')
        last_name   = suffix_arr.shift.first
      end

      # Only the first reportable suffix is returned.
      reportable = suffix_arr.select { |_token, keep| keep }
      suffix     = reportable.empty? ? nil : reportable.first.first

      { :last => last_name, :middle => middle_name, :first => first_name, :prefix => prefix, :suffix => suffix }
    end # << parse_fullname

    extend self
  end
end
@@ -0,0 +1,6 @@
1
+
2
module Fullname
  module Parser
    # Version string of the fullname-parser gem; frozen so that it cannot be
    # modified in place at runtime.
    VERSION = '1.0.3'.freeze
  end
end
metadata ADDED
@@ -0,0 +1,51 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: fullname-parser
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.3
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - xiaohui
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-05-15 00:00:00.000000000 Z
13
+ dependencies: []
14
+ description: For parsing people's fullname into pieces(prefix/first/middle/last/suffix)
15
+ email:
16
+ - xiaohui@zhangxh.net
17
+ executables: []
18
+ extensions: []
19
+ extra_rdoc_files: []
20
+ files:
21
+ - .gitignore
22
+ - README.md
23
+ - fullname-parser.gemspec
24
+ - lib/fullname/parser.rb
25
+ - lib/fullname/parser/version.rb
26
+ homepage: http://github.com/xiaohui-zhangxh/fullname-parser
27
+ licenses: []
28
+ post_install_message:
29
+ rdoc_options: []
30
+ require_paths:
31
+ - lib
32
+ required_ruby_version: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ required_rubygems_version: !ruby/object:Gem::Requirement
39
+ none: false
40
+ requirements:
41
+ - - ! '>='
42
+ - !ruby/object:Gem::Version
43
+ version: '0'
44
+ requirements: []
45
+ rubyforge_project:
46
+ rubygems_version: 1.8.25
47
+ signing_key:
48
+ specification_version: 3
49
+ summary: Split fullname into pieces(prefix/first/middle/last/suffix)
50
+ test_files: []
51
+ has_rdoc: