nomener 0.2.6 → 0.2.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -0
- data/lib/nomener/helper.rb +12 -15
- data/lib/nomener/name.rb +25 -30
- data/lib/nomener/parser.rb +93 -102
- data/lib/nomener/version.rb +1 -1
- data/spec/nomener/nomener_name_spec.rb +27 -0
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA1:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: cd56c9231c23b185899cb3f0c85f7bfd050fbd0c
|
|
4
|
+
data.tar.gz: b37745c76f6263bd7fb771aab566286f44937693
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: c314a3ad037c4a9fdb120ef77fbb9c40c7998574f99c74f8062120dbefa940e7995c4def79637793b21f6d0f41d3efa747c80813fbc4403c09c06197f1e3998d
|
|
7
|
+
data.tar.gz: dcf8213aeb321016c3e01c798b379c620b178e0456780d1a6a65cd9ce43c55f2a1ed101b032888303cdded55d558b164da14b036ca4089f073dcd0a483bd34c7
|
data/README.md
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
# Nomener
|
|
2
2
|
[](http://badge.fury.io/rb/nomener)
|
|
3
3
|
[](https://travis-ci.org/dan-ding/nomener)
|
|
4
|
+
[](https://codeclimate.com/github/dan-ding/nomener)
|
|
4
5
|
|
|
5
6
|
Nomener assists with parsing people's names that they give themselves (or other people). Nomener ~~is~~ was a fork of [People](https://github.com/dan-ding/people) as it uses some code contributed there. It's currently geared towards western-style name formatting; however, other cultural name formatting is (or would like to be) supported. Currently it attempts to parse names through pattern matching without using large(r) dictionary/library/data files (except for name decorations and suffixes, see usage). It may not be possible to do without such files in all languages.
|
|
6
7
|
|
data/lib/nomener/helper.rb
CHANGED
|
@@ -2,9 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
# For Ruby 1.9.3, 2.0.0
|
|
4
4
|
rv = RUBY_VERSION.split(".")[(0..1)].join("")
|
|
5
|
-
if
|
|
6
|
-
require "string-scrub"
|
|
7
|
-
end
|
|
5
|
+
require "string-scrub" if(rv >= '19' && rv < '21')
|
|
8
6
|
|
|
9
7
|
module Nomener
|
|
10
8
|
module Helper
|
|
@@ -20,24 +18,23 @@ module Nomener
|
|
|
20
18
|
#
|
|
21
19
|
# Returns a string which is (ideally) pretty much the same as it was given.
|
|
22
20
|
def self.reformat(name, leftleft = '"', rightright = '"', left = "'", right = "'")
|
|
23
|
-
|
|
24
|
-
|
|
21
|
+
nomen = name.dup
|
|
22
|
+
nomen.scrub! # remove illegal characters
|
|
25
23
|
|
|
26
24
|
# translate fullwidth to typewriter
|
|
27
|
-
|
|
25
|
+
nomen.tr!("\uFF02\uFF07", "\u0022\u0027")
|
|
28
26
|
|
|
29
|
-
|
|
30
|
-
|
|
27
|
+
nomen.tr!("\u0022\u00AB\u201C\u201E\u2036\u300E\u301D\u301F\uFE43", leftleft) # replace left double quotes
|
|
28
|
+
nomen.tr!("\u0022\u00BB\u201D\u201F\u2033\u300F\u301E\uFE44", rightright) # replace right double quotes
|
|
31
29
|
|
|
32
|
-
|
|
33
|
-
|
|
30
|
+
nomen.tr!("\u0027\u2018\u201A\u2035\u2039\u300C\uFE41\uFF62", left) # replace left single quotes
|
|
31
|
+
nomen.tr!("\u0027\u2019\u201B\u2032\u203A\u300D\uFE42\uFF62", right) # replace left single quotes
|
|
34
32
|
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
n.strip! # trim space
|
|
33
|
+
nomen.gsub!(/[^\p{Alpha}\-&\/ \.\,\'\"#{leftleft}#{rightright}#{left}#{right}\(\)]/, " ") # what others may be in a name?
|
|
34
|
+
nomen.squeeze! " "
|
|
35
|
+
nomen.strip!
|
|
39
36
|
|
|
40
|
-
|
|
37
|
+
nomen
|
|
41
38
|
end
|
|
42
39
|
|
|
43
40
|
end
|
data/lib/nomener/name.rb
CHANGED
|
@@ -50,37 +50,32 @@ module Nomener
|
|
|
50
50
|
|
|
51
51
|
fix = last.dup
|
|
52
52
|
|
|
53
|
-
# if there are multiple last names separated by spaces
|
|
54
|
-
fix = fix.split(" ").map { |v| v.capitalize }.join " "
|
|
55
|
-
|
|
56
53
|
# if there are multiple last names separated by a dash
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
fix.gsub!(/('\p{Alpha})/) { |s| (s[-1] != 't') ? s.upcase : s } #no cap 't
|
|
83
|
-
end
|
|
54
|
+
fix = fix.split("-").map { |v|
|
|
55
|
+
v.split(" ").map { |w| w.capitalize }.join " "
|
|
56
|
+
}.join "-"
|
|
57
|
+
|
|
58
|
+
# anything beginning with Mac and not ending in [aciozj], except for a few
|
|
59
|
+
fix.sub!(/Mac(?!
|
|
60
|
+
hin|
|
|
61
|
+
hlen|
|
|
62
|
+
har|
|
|
63
|
+
kle|
|
|
64
|
+
klin|
|
|
65
|
+
kie|
|
|
66
|
+
hado| # Portugese
|
|
67
|
+
evicius| # Lithuanian
|
|
68
|
+
iulis| # Lithuanian
|
|
69
|
+
ias # Lithuanian
|
|
70
|
+
)([\p{Alpha}]{2,}[^aAcCiIoOzZjJ])\b/x) { |s| "Mac#{$1.capitalize}" }
|
|
71
|
+
|
|
72
|
+
fix.sub! /\bMacmurdo\b/, "MacMurdo" # fix MacMurdo
|
|
73
|
+
|
|
74
|
+
# anything beginning with Mc, Mcdonald == McDonald
|
|
75
|
+
fix.sub!(/Mc(\p{Alpha}{2,})/) { |s| "Mc#{$1.capitalize}" }
|
|
76
|
+
|
|
77
|
+
# names like D'Angelo or Van 't Hooft, no cap 't
|
|
78
|
+
fix.gsub!(/('\p{Alpha})(?=\p{Alpha})/) { |s| "'#{$1[(1..-1)].capitalize}" }
|
|
84
79
|
|
|
85
80
|
fix
|
|
86
81
|
end
|
data/lib/nomener/parser.rb
CHANGED
|
@@ -15,7 +15,7 @@ module Nomener
|
|
|
15
15
|
TRAILER_TRASH = /[,|\s]+$/
|
|
16
16
|
|
|
17
17
|
# regex for name characters we aren't going to use
|
|
18
|
-
DIRTY_STUFF = /[^,'(?:\p{Alpha}(?<\.))\p{Alpha}]{2,}/
|
|
18
|
+
DIRTY_STUFF = /[^,'(?:\p{Alpha}(?<\.))\p{Alpha}\p{Blank}]{2,}/
|
|
19
19
|
|
|
20
20
|
# regex for boundaries we'll use to find leftover nickname boundaries
|
|
21
21
|
NICKNAME_LEFTOVER = /["'\(\)]{2}/
|
|
@@ -62,85 +62,58 @@ module Nomener
|
|
|
62
62
|
# Returns a hash of name parts or nil
|
|
63
63
|
# Raises ArgumentError if 'name' is not a string or is empty
|
|
64
64
|
def self.parse!(name, format = {:order => :auto, :spacelimit => 0})
|
|
65
|
-
raise ArgumentError,
|
|
65
|
+
raise ArgumentError, "Name to parse not provided" unless (name.kind_of?(String) && !name.empty?)
|
|
66
66
|
|
|
67
67
|
name = Nomener::Helper.reformat name
|
|
68
|
+
newname = { :title => "", :first => "", :nick => "", :middle => "", :last => "", :suffix => "" }
|
|
68
69
|
|
|
69
70
|
# grab any identified nickname before working on the rest
|
|
70
|
-
nick = parse_nick! name
|
|
71
|
+
newname[:nick] = parse_nick! name
|
|
71
72
|
cleanup! name
|
|
72
73
|
|
|
73
74
|
# grab any suffixes we can find
|
|
74
|
-
suffix = parse_suffix! name
|
|
75
|
+
newname[:suffix] = parse_suffix! name
|
|
75
76
|
cleanup! name
|
|
76
77
|
|
|
77
|
-
title = parse_title! name
|
|
78
|
-
|
|
78
|
+
newname[:title] = parse_title! name
|
|
79
|
+
name = dustoff name
|
|
80
|
+
|
|
81
|
+
newname[:last] = name # possibly mononyms
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
case name
|
|
85
|
+
when /,/ # if there's a comma, it may be a useful hint
|
|
86
|
+
clues = name.split(",").each { |i| i.strip! }
|
|
87
|
+
|
|
88
|
+
raise ParseError, "Could not decipher commas in \"#{name}\"" if clues.length > 2
|
|
79
89
|
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
last, first = clues
|
|
94
|
-
|
|
95
|
-
# Mies van der Rohe, Ludwig
|
|
96
|
-
# Snepscheut, Jan L. A. van de
|
|
97
|
-
# check the last by comparing a re-ordering of the name
|
|
98
|
-
first_parts = first.split " "
|
|
99
|
-
unless first_parts.length == 1
|
|
100
|
-
check = parse_last!("#{first} #{last}", :fl)
|
|
101
|
-
# let's trust the full name
|
|
102
|
-
if check != last
|
|
103
|
-
first = "#{first} #{last}".sub(check, '').strip
|
|
104
|
-
last = check
|
|
105
|
-
end
|
|
90
|
+
# convention is last, first when there's a comma
|
|
91
|
+
newname[:last], newname[:first] = clues
|
|
92
|
+
|
|
93
|
+
# check the last by comparing a re-ordering of the name
|
|
94
|
+
# Mies van der Rohe, Ludwig
|
|
95
|
+
# Snepscheut, Jan L. A. van de
|
|
96
|
+
unless newname[:first].nil? || newname[:first].split(" ").length == 1
|
|
97
|
+
check = parse_last!("#{newname[:first]} #{newname[:last]}", :fl)
|
|
98
|
+
|
|
99
|
+
# let's trust the full name
|
|
100
|
+
if check != newname[:last]
|
|
101
|
+
newname[:first] = "#{newname[:first]} #{newname[:last]}".sub(check, "").strip
|
|
102
|
+
newname[:last] = check
|
|
106
103
|
end
|
|
107
|
-
# titles are part of the first name
|
|
108
|
-
title = parse_title!(first) if title.nil? || title.empty?
|
|
109
|
-
elsif clues.length == 1
|
|
110
|
-
last = clues.shift
|
|
111
|
-
else
|
|
112
|
-
raise ParseError, "Could not decipher commas in \"#{name}\""
|
|
113
104
|
end
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
105
|
+
|
|
106
|
+
# titles which are part of the first name...
|
|
107
|
+
newname[:title] = parse_title!(newname[:first]) if newname[:title].empty?
|
|
108
|
+
|
|
109
|
+
when / / # no comma, check for space on first then last
|
|
110
|
+
newname[:last] = parse_last!(name, format[:order])
|
|
111
|
+
newname[:first], newname[:middle] = parse_first!(name, format[:spacelimit])
|
|
120
112
|
end
|
|
121
113
|
|
|
122
|
-
|
|
123
|
-
:title => (title || "").strip,
|
|
124
|
-
:suffix => (suffix || "").strip,
|
|
125
|
-
:nick => (nick || "").strip,
|
|
126
|
-
:first => (first || "").strip,
|
|
127
|
-
:last => (last || "").strip,
|
|
128
|
-
:middle => (middle || "").strip
|
|
129
|
-
}
|
|
130
|
-
end
|
|
114
|
+
cleanup! newname[:last], newname[:first], newname[:middle]
|
|
131
115
|
|
|
132
|
-
|
|
133
|
-
# Modifies given string in place.
|
|
134
|
-
#
|
|
135
|
-
# dirty - string to clean up
|
|
136
|
-
#
|
|
137
|
-
# Returns nothing
|
|
138
|
-
def self.cleanup!(dirty)
|
|
139
|
-
dirty.gsub! DIRTY_STUFF, ''
|
|
140
|
-
dirty.squeeze! " "
|
|
141
|
-
# remove any trailing commas or whitespace
|
|
142
|
-
dirty.gsub! TRAILER_TRASH, ''
|
|
143
|
-
dirty.strip!
|
|
116
|
+
newname
|
|
144
117
|
end
|
|
145
118
|
|
|
146
119
|
# Internal: pull off a title if we can
|
|
@@ -153,13 +126,9 @@ module Nomener
|
|
|
153
126
|
titles = []
|
|
154
127
|
nm.gsub! TITLES do |title|
|
|
155
128
|
titles << title.strip
|
|
156
|
-
|
|
129
|
+
""
|
|
157
130
|
end
|
|
158
|
-
|
|
159
|
-
t.gsub! PERIOD, ' '
|
|
160
|
-
t.squeeze! " "
|
|
161
|
-
t.strip!
|
|
162
|
-
t
|
|
131
|
+
dustoff titles.join(" ")
|
|
163
132
|
end
|
|
164
133
|
|
|
165
134
|
# Internal: pull off what suffixes we can
|
|
@@ -172,13 +141,9 @@ module Nomener
|
|
|
172
141
|
suffixes = []
|
|
173
142
|
nm.gsub! SUFFIXES do |suffix|
|
|
174
143
|
suffixes << suffix.strip
|
|
175
|
-
|
|
144
|
+
""
|
|
176
145
|
end
|
|
177
|
-
|
|
178
|
-
s.gsub! /\./, ' '
|
|
179
|
-
s.squeeze! " "
|
|
180
|
-
s.strip!
|
|
181
|
-
s
|
|
146
|
+
dustoff suffixes.join(" ")
|
|
182
147
|
end
|
|
183
148
|
|
|
184
149
|
# Internal: parse nickname out of string. presuming it's in quotes
|
|
@@ -189,14 +154,12 @@ module Nomener
|
|
|
189
154
|
# Returns string of the nickname found or an empty string
|
|
190
155
|
def self.parse_nick!(nm)
|
|
191
156
|
nick = ""
|
|
192
|
-
nm.sub! NICKNAME
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
nick
|
|
198
|
-
nick.strip!
|
|
199
|
-
nick
|
|
157
|
+
nm.sub! NICKNAME do |z|
|
|
158
|
+
nick = $1.strip
|
|
159
|
+
""
|
|
160
|
+
end
|
|
161
|
+
nm.sub! NICKNAME_LEFTOVER, ""
|
|
162
|
+
dustoff nick
|
|
200
163
|
end
|
|
201
164
|
|
|
202
165
|
# Internal: parse last name from string
|
|
@@ -207,24 +170,22 @@ module Nomener
|
|
|
207
170
|
#
|
|
208
171
|
# Returns string of the last name found or an empty string
|
|
209
172
|
def self.parse_last!(nm, format = :fl)
|
|
210
|
-
last =
|
|
173
|
+
last = ""
|
|
211
174
|
|
|
212
|
-
if format == :auto
|
|
213
|
-
|
|
214
|
-
# format = :lcf if !nm.index(',').nil?
|
|
215
|
-
end
|
|
175
|
+
format = :fl if (format == :auto && nm.index(",").nil?)
|
|
176
|
+
format = :lcf if (format == :auto && nm.index(","))
|
|
216
177
|
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
last
|
|
225
|
-
nm.sub!(
|
|
226
|
-
nm.sub!(',', "").strip!
|
|
178
|
+
# these constants should have the named match :fam
|
|
179
|
+
n = nm.match( FIRSTLAST_MATCHER ) if format == :fl
|
|
180
|
+
n = nm.match( LASTFIRST_MATCHER ) if format == :lf
|
|
181
|
+
n = nm.match( LASTCOMFIRST_MATCHER ) if format == :lcf
|
|
182
|
+
|
|
183
|
+
unless n.nil?
|
|
184
|
+
last = n[:fam].strip if n[:fam]
|
|
185
|
+
nm.sub!(last, "")
|
|
186
|
+
nm.sub!(",", "")
|
|
227
187
|
end
|
|
188
|
+
|
|
228
189
|
last
|
|
229
190
|
end
|
|
230
191
|
|
|
@@ -236,11 +197,41 @@ module Nomener
|
|
|
236
197
|
#
|
|
237
198
|
# Returns an array containing the first name and middle name if any
|
|
238
199
|
def self.parse_first!(nm, namecount = 0)
|
|
239
|
-
nm.tr!
|
|
240
|
-
|
|
200
|
+
nm.tr! ".", " "
|
|
201
|
+
nm.squeeze! " "
|
|
202
|
+
first, middle = nm.split " ", namecount
|
|
241
203
|
|
|
242
204
|
[first || "", middle || ""]
|
|
243
205
|
end
|
|
244
206
|
|
|
207
|
+
private
|
|
208
|
+
# Internal: Clean up a string where there are numerous consecutive and trailing non-name characters.
|
|
209
|
+
# Modifies given string in place.
|
|
210
|
+
#
|
|
211
|
+
# args - strings to clean up
|
|
212
|
+
#
|
|
213
|
+
# Returns nothing
|
|
214
|
+
def self.cleanup!(*args)
|
|
215
|
+
args.each do |dirty|
|
|
216
|
+
next if(dirty.nil? || !dirty.kind_of?(String))
|
|
217
|
+
|
|
218
|
+
dirty.gsub! DIRTY_STUFF, ""
|
|
219
|
+
dirty.squeeze! " "
|
|
220
|
+
# remove any trailing commas or whitespace
|
|
221
|
+
dirty.gsub! TRAILER_TRASH, ""
|
|
222
|
+
dirty.strip!
|
|
223
|
+
end
|
|
224
|
+
end
|
|
225
|
+
|
|
226
|
+
# Internal: a softer clean we keep re-using
|
|
227
|
+
#
|
|
228
|
+
# str - the string to dust off
|
|
229
|
+
#
|
|
230
|
+
# Returns the nice clean string
|
|
231
|
+
def self.dustoff(str)
|
|
232
|
+
str = str.gsub PERIOD, " "
|
|
233
|
+
str = str.squeeze " "
|
|
234
|
+
str = str.strip
|
|
235
|
+
end
|
|
245
236
|
end
|
|
246
237
|
end
|
data/lib/nomener/version.rb
CHANGED
|
@@ -81,6 +81,33 @@ RSpec.describe "Nomener::Name" do
|
|
|
81
81
|
end
|
|
82
82
|
end
|
|
83
83
|
|
|
84
|
+
context "with last name alternates" do
|
|
85
|
+
name = Nomener::Name.new("Joe Smith")
|
|
86
|
+
|
|
87
|
+
it "returns from surname the last name" do
|
|
88
|
+
expect(name.surname).to eq "Smith"
|
|
89
|
+
end
|
|
90
|
+
|
|
91
|
+
it "returns from family the last name" do
|
|
92
|
+
expect(name.family).to eq "Smith"
|
|
93
|
+
end
|
|
94
|
+
end
|
|
95
|
+
|
|
96
|
+
context "with first name alternate" do
|
|
97
|
+
name = Nomener::Name.new("Joe Smith")
|
|
98
|
+
|
|
99
|
+
it "returns from surname the last name" do
|
|
100
|
+
expect(name.given).to eq "Joe"
|
|
101
|
+
end
|
|
102
|
+
end
|
|
103
|
+
|
|
104
|
+
context "with a name method to_h" do
|
|
105
|
+
it "responds with a hash" do
|
|
106
|
+
name = Nomener::Name.new("Joe Smith")
|
|
107
|
+
expect(name.to_h).to be_a Hash
|
|
108
|
+
end
|
|
109
|
+
end
|
|
110
|
+
|
|
84
111
|
context "with capit" do
|
|
85
112
|
name = Nomener::Name.new
|
|
86
113
|
[
|