namae 0.11.2 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.travis.yml +9 -10
- data/BSDL +1 -1
- data/Gemfile +10 -24
- data/README.md +22 -4
- data/features/lists.feature +11 -1
- data/features/step_definitions/namae_steps.rb +5 -0
- data/features/support/env.rb +0 -9
- data/lib/namae/parser.rb +224 -170
- data/lib/namae/parser.y +56 -22
- data/lib/namae/utility.rb +4 -0
- data/lib/namae/version.rb +3 -3
- data/namae.gemspec +12 -14
- data/spec/namae/parser_spec.rb +61 -10
- data/spec/thread_safety_spec.rb +25 -0
- metadata +11 -12
- data/.autotest +0 -21
data/lib/namae/parser.y
CHANGED
@@ -3,7 +3,7 @@
|
|
3
3
|
|
4
4
|
class Namae::Parser
|
5
5
|
|
6
|
-
token COMMA UWORD LWORD PWORD NICK AND APPELLATION TITLE SUFFIX
|
6
|
+
token COMMA UWORD LWORD PWORD NICK AND APPELLATION TITLE SUFFIX UPARTICLE
|
7
7
|
|
8
8
|
expect 0
|
9
9
|
|
@@ -20,7 +20,7 @@ rule
|
|
20
20
|
| sort_order
|
21
21
|
|
22
22
|
honorific : APPELLATION { result = Name.new(:appellation => val[0]) }
|
23
|
-
|
|
23
|
+
| titles { result = Name.new(:title => val[0]) }
|
24
24
|
|
25
25
|
display_order : u_words word opt_suffices opt_titles
|
26
26
|
{
|
@@ -43,6 +43,14 @@ rule
|
|
43
43
|
result = Name.new(:given => val[0], :particle => val[1],
|
44
44
|
:family => val[2])
|
45
45
|
}
|
46
|
+
| u_words UPARTICLE last
|
47
|
+
{
|
48
|
+
result = if include_particle_in_family?
|
49
|
+
Name.new(:given => val[0], :family => val[1,2].join(' '))
|
50
|
+
else
|
51
|
+
Name.new(:given => val[0], :particle => val[1], :family => val[2])
|
52
|
+
end
|
53
|
+
}
|
46
54
|
| von last
|
47
55
|
{
|
48
56
|
result = Name.new(:particle => val[0], :family => val[1])
|
@@ -53,6 +61,14 @@ rule
|
|
53
61
|
result = Name.new({ :family => val[0], :suffix => val[2][0],
|
54
62
|
:given => val[2][1] }, !!val[2][0])
|
55
63
|
}
|
64
|
+
| UPARTICLE last COMMA first
|
65
|
+
{
|
66
|
+
result = if include_particle_in_family?
|
67
|
+
Name.new({ :family => val[0,2].join(' '), :suffix => val[3][0], :given => val[3][1] }, !!val[3][0])
|
68
|
+
else
|
69
|
+
Name.new({ :particle => val[0], :family => val[1], :suffix => val[3][0], :given => val[3][1] }, !!val[3][0])
|
70
|
+
end
|
71
|
+
}
|
56
72
|
| von last COMMA first
|
57
73
|
{
|
58
74
|
result = Name.new({ :particle => val[0], :family => val[1],
|
@@ -100,26 +116,35 @@ rule
|
|
100
116
|
| titles TITLE { result = val.join(' ') }
|
101
117
|
|
102
118
|
---- header
|
103
|
-
require 'singleton'
|
104
119
|
require 'strscan'
|
105
120
|
|
106
121
|
---- inner
|
107
122
|
|
108
|
-
|
123
|
+
@defaults = {
|
124
|
+
:debug => false,
|
125
|
+
:prefer_comma_as_separator => false,
|
126
|
+
:include_particle_in_family => false,
|
127
|
+
:comma => ',',
|
128
|
+
:stops => ',;',
|
129
|
+
:separator => /\s*(\band\b|\&|;)\s*/i,
|
130
|
+
:title => /\s*\b(sir|lord|count(ess)?|(gen|adm|col|maj|capt|cmdr|lt|sgt|cpl|pvt|pastor|pr|reverend|rev|elder|deacon|deaconess|father|fr|rabbi|cantor|vicar|prof|dr|md|ph\.?d)\.?)(\s+|$)/i,
|
131
|
+
:suffix => /\s*\b(JR|Jr|jr|SR|Sr|sr|[IVX]{2,})(\.|\b)/,
|
132
|
+
:appellation => /\s*\b((mrs?|ms|fr|hr)\.?|miss|herr|frau)(\s+|$)/i,
|
133
|
+
:uppercase_particle => /\s*\b((Da|De|Di|De\sLa|Du|Der|Des|Da|St|Saint|Les|Van)\.?)(\s+|$)/
|
134
|
+
}
|
135
|
+
|
136
|
+
class << self
|
137
|
+
attr_reader :defaults
|
138
|
+
|
139
|
+
def instance
|
140
|
+
Thread.current[:namae] ||= new
|
141
|
+
end
|
142
|
+
end
|
109
143
|
|
110
144
|
attr_reader :options, :input
|
111
145
|
|
112
|
-
def initialize
|
113
|
-
@
|
114
|
-
:debug => false,
|
115
|
-
:prefer_comma_as_separator => false,
|
116
|
-
:comma => ',',
|
117
|
-
:stops => ',;',
|
118
|
-
:separator => /\s*(\band\b|\&|;)\s*/i,
|
119
|
-
:title => /\s*\b(sir|lord|count(ess)?|(gen|adm|col|maj|capt|cmdr|lt|sgt|cpl|pvt|pastor|pr|reverend|rev|elder|deacon|deaconess|father|fr|vicar|prof|dr|md|ph\.?d)\.?)(\s+|$)/i,
|
120
|
-
:suffix => /\s*\b(JR|Jr|jr|SR|Sr|sr|[IVX]{2,})(\.|\b)/,
|
121
|
-
:appellation => /\s*\b((mrs?|ms|fr|hr)\.?|miss|herr|frau)(\s+|$)/i
|
122
|
-
}
|
146
|
+
def initialize(options = {})
|
147
|
+
@options = self.class.defaults.merge(options)
|
123
148
|
end
|
124
149
|
|
125
150
|
def debug?
|
@@ -134,6 +159,10 @@ require 'strscan'
|
|
134
159
|
options[:comma]
|
135
160
|
end
|
136
161
|
|
162
|
+
def include_particle_in_family?
|
163
|
+
options[:include_particle_in_family]
|
164
|
+
end
|
165
|
+
|
137
166
|
def stops
|
138
167
|
options[:stops]
|
139
168
|
end
|
@@ -150,26 +179,29 @@ require 'strscan'
|
|
150
179
|
options[:appellation]
|
151
180
|
end
|
152
181
|
|
182
|
+
def uppercase_particle
|
183
|
+
options[:uppercase_particle]
|
184
|
+
end
|
185
|
+
|
153
186
|
def prefer_comma_as_separator?
|
154
187
|
options[:prefer_comma_as_separator]
|
155
188
|
end
|
156
189
|
|
157
|
-
def parse(
|
158
|
-
parse!(
|
190
|
+
def parse(string)
|
191
|
+
parse!(string)
|
159
192
|
rescue => e
|
160
193
|
warn e.message if debug?
|
161
194
|
[]
|
162
195
|
end
|
163
196
|
|
164
197
|
def parse!(string)
|
165
|
-
input
|
198
|
+
@input = StringScanner.new(normalize(string))
|
166
199
|
reset
|
167
200
|
do_parse
|
168
201
|
end
|
169
202
|
|
170
203
|
def normalize(string)
|
171
|
-
string
|
172
|
-
string
|
204
|
+
string.scrub.strip
|
173
205
|
end
|
174
206
|
|
175
207
|
def reset
|
@@ -220,11 +252,11 @@ require 'strscan'
|
|
220
252
|
end
|
221
253
|
|
222
254
|
def will_see_suffix?
|
223
|
-
input.
|
255
|
+
input.rest.strip.split(/\s+/)[0] =~ suffix
|
224
256
|
end
|
225
257
|
|
226
258
|
def will_see_initial?
|
227
|
-
input.
|
259
|
+
input.rest.strip.split(/\s+/)[0] =~ /^[[:upper:]]+\b/
|
228
260
|
end
|
229
261
|
|
230
262
|
def seen_full_name?
|
@@ -256,6 +288,8 @@ require 'strscan'
|
|
256
288
|
else
|
257
289
|
consume_word(:UWORD, input.matched)
|
258
290
|
end
|
291
|
+
when input.scan(uppercase_particle)
|
292
|
+
consume_word(:UPARTICLE, input.matched.strip)
|
259
293
|
when input.scan(/((\\\w+)?\{[^\}]*\})*[[:upper:]][^\s#{stops}]*/)
|
260
294
|
consume_word(:UWORD, input.matched)
|
261
295
|
when input.scan(/((\\\w+)?\{[^\}]*\})*[[:lower:]][^\s#{stops}]*/)
|
data/lib/namae/utility.rb
CHANGED
data/lib/namae/version.rb
CHANGED
data/namae.gemspec
CHANGED
@@ -2,23 +2,22 @@
|
|
2
2
|
# DO NOT EDIT THIS FILE DIRECTLY
|
3
3
|
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
4
|
# -*- encoding: utf-8 -*-
|
5
|
-
# stub: namae
|
5
|
+
# stub: namae 1.1.0 ruby lib
|
6
6
|
|
7
7
|
Gem::Specification.new do |s|
|
8
8
|
s.name = "namae".freeze
|
9
|
-
s.version = "
|
9
|
+
s.version = "1.1.0"
|
10
10
|
|
11
11
|
s.required_rubygems_version = Gem::Requirement.new(">= 0".freeze) if s.respond_to? :required_rubygems_version=
|
12
12
|
s.require_paths = ["lib".freeze]
|
13
13
|
s.authors = ["Sylvester Keil".freeze, "Dan Collis-Puro".freeze]
|
14
|
-
s.date = "
|
15
|
-
s.description = " Namae (\
|
14
|
+
s.date = "2021-03-12"
|
15
|
+
s.description = " Namae (\u540D\u524D) is a parser for human names. It recognizes personal names of various cultural backgrounds and tries to split them into their component parts (e.g., given and family names, honorifics etc.). ".freeze
|
16
16
|
s.email = ["sylvester@keil.or.at".freeze, "dan@collispuro.com".freeze]
|
17
17
|
s.extra_rdoc_files = [
|
18
18
|
"README.md"
|
19
19
|
]
|
20
20
|
s.files = [
|
21
|
-
".autotest",
|
22
21
|
".codeclimate.yml",
|
23
22
|
".coveralls.yml",
|
24
23
|
".document",
|
@@ -50,23 +49,22 @@ Gem::Specification.new do |s|
|
|
50
49
|
"spec/namae/name_spec.rb",
|
51
50
|
"spec/namae/parser_spec.rb",
|
52
51
|
"spec/namae/utility_spec.rb",
|
53
|
-
"spec/spec_helper.rb"
|
52
|
+
"spec/spec_helper.rb",
|
53
|
+
"spec/thread_safety_spec.rb"
|
54
54
|
]
|
55
55
|
s.homepage = "https://github.com/berkmancenter/namae".freeze
|
56
56
|
s.licenses = ["AGPL-3.0".freeze]
|
57
|
-
s.rubygems_version = "2.
|
58
|
-
s.summary = "Namae (\
|
57
|
+
s.rubygems_version = "3.2.3".freeze
|
58
|
+
s.summary = "Namae (\u540D\u524D) parses personal names and splits them into their component parts.".freeze
|
59
59
|
|
60
60
|
if s.respond_to? :specification_version then
|
61
61
|
s.specification_version = 4
|
62
|
+
end
|
62
63
|
|
63
|
-
|
64
|
-
|
65
|
-
else
|
66
|
-
s.add_dependency(%q<racc>.freeze, ["= 1.4.9"])
|
67
|
-
end
|
64
|
+
if s.respond_to? :add_runtime_dependency then
|
65
|
+
s.add_development_dependency(%q<racc>.freeze, ["~> 1.4"])
|
68
66
|
else
|
69
|
-
s.add_dependency(%q<racc>.freeze, ["
|
67
|
+
s.add_dependency(%q<racc>.freeze, ["~> 1.4"])
|
70
68
|
end
|
71
69
|
end
|
72
70
|
|
data/spec/namae/parser_spec.rb
CHANGED
@@ -1,10 +1,5 @@
|
|
1
1
|
module Namae
|
2
2
|
describe 'Parser' do
|
3
|
-
|
4
|
-
it 'does not respond to .new' do
|
5
|
-
expect(Parser).not_to respond_to(:new)
|
6
|
-
end
|
7
|
-
|
8
3
|
describe '.instance' do
|
9
4
|
let(:parser) { Parser.instance }
|
10
5
|
|
@@ -120,7 +115,7 @@ module Namae
|
|
120
115
|
end
|
121
116
|
end
|
122
117
|
|
123
|
-
%w{Pastor Pr. Reverend Rev. Elder Deacon Deaconess Father Fr. Vicar}.each do |title|
|
118
|
+
%w{Pastor Pr. Reverend Rev. Elder Deacon Deaconess Father Fr. Vicar Rabbi Cantor}.each do |title|
|
124
119
|
describe "the next token is #{title.inspect}" do
|
125
120
|
before { parser.send(:input).string = title }
|
126
121
|
it 'returns a TITLE token' do
|
@@ -149,10 +144,6 @@ module Namae
|
|
149
144
|
expect(parser.parse!('Ichiro')[0].given).to eq('Ichiro')
|
150
145
|
end
|
151
146
|
|
152
|
-
it 'removes numbers' do
|
153
|
-
expect(parser.parse!('Ichiro 20156')[0].given).to eq('Ichiro')
|
154
|
-
end
|
155
|
-
|
156
147
|
it 'treats "Lord Byron" as a title and family name' do
|
157
148
|
expect(parser.parse!('Lord Byron')[0].values_at(:family, :title)).to eq(['Byron', 'Lord'])
|
158
149
|
end
|
@@ -200,6 +191,66 @@ module Namae
|
|
200
191
|
expect(parser.parse!('Bernado Franecki Ph.D.')[0].values_at(:given, :family, :title)).to eq(['Bernado', 'Franecki', 'Ph.D.'])
|
201
192
|
#expect(parser.parse!('Bernado Franecki, Ph.D.')[0].values_at(:given, :family, :title)).to eq(['Bernado', 'Franecki', 'Ph.D.'])
|
202
193
|
end
|
194
|
+
|
195
|
+
it 'parses consecutive titles in display order' do
|
196
|
+
expect(parser.parse!('Lt. Col. Bernado Franecki')[0].values_at(:given, :family, :title)).to eq(['Bernado', 'Franecki', 'Lt. Col.'])
|
197
|
+
end
|
198
|
+
|
199
|
+
context 'when include_particle_in_family is false' do
|
200
|
+
let(:parser) { Parser.new(include_particle_in_family: false) }
|
201
|
+
|
202
|
+
it 'parses common capitalized particles as the family name in display order' do
|
203
|
+
expect(parser.parse!('Carlos De Silva')[0].values_at(:given, :family, :particle)).to eq(['Carlos', 'Silva', 'De'])
|
204
|
+
end
|
205
|
+
|
206
|
+
it 'parses common capitalized particles with punctuation as the family name in display order' do
|
207
|
+
expect(parser.parse!('Matt St. Hilaire')[0].values_at(:given, :family, :particle)).to eq(['Matt', 'Hilaire', 'St.'])
|
208
|
+
end
|
209
|
+
|
210
|
+
it 'parses common lowercase particles as a particle, not family name in display order' do
|
211
|
+
expect(parser.parse!('Carlos de Silva')[0].values_at(:given, :family, :particle)).to eq(['Carlos', 'Silva', 'de'])
|
212
|
+
end
|
213
|
+
|
214
|
+
it 'parses common capitalized particles as the family name in sort order' do
|
215
|
+
expect(parser.parse!('De Silva, Carlos')[0].values_at(:given, :family, :particle)).to eq(['Carlos', 'Silva', 'De'])
|
216
|
+
end
|
217
|
+
|
218
|
+
it 'parses common lowercase particles as a particle, not family name in sort order' do
|
219
|
+
expect(parser.parse!('de Silva, Carlos')[0].values_at(:given, :family, :particle)).to eq(['Carlos', 'Silva', 'de'])
|
220
|
+
end
|
221
|
+
|
222
|
+
it 'parses common capitalized particles with punctuation as the family name in display order' do
|
223
|
+
expect(parser.parse!('St. Hilaire, Matt')[0].values_at(:given, :family, :particle)).to eq(['Matt', 'Hilaire', 'St.'])
|
224
|
+
end
|
225
|
+
end
|
226
|
+
|
227
|
+
context 'when include_particle_in_family is true' do
|
228
|
+
let(:parser) { Parser.new(include_particle_in_family: true) }
|
229
|
+
|
230
|
+
it 'parses common capitalized particles as the family name in display order' do
|
231
|
+
expect(parser.parse!('Carlos De Silva')[0].values_at(:given, :family, :particle)).to eq(['Carlos', 'De Silva', nil])
|
232
|
+
end
|
233
|
+
|
234
|
+
it 'parses common capitalized particles with punctuation as the family name in display order' do
|
235
|
+
expect(parser.parse!('Matt St. Hilaire')[0].values_at(:given, :family, :particle)).to eq(['Matt', 'St. Hilaire', nil])
|
236
|
+
end
|
237
|
+
|
238
|
+
it 'parses common lowercase particles as a particle, not family name in display order' do
|
239
|
+
expect(parser.parse!('Carlos de Silva')[0].values_at(:given, :family, :particle)).to eq(['Carlos', 'Silva', 'de'])
|
240
|
+
end
|
241
|
+
|
242
|
+
it 'parses common capitalized particles as the family name in sort order' do
|
243
|
+
expect(parser.parse!('De Silva, Carlos')[0].values_at(:given, :family, :particle)).to eq(['Carlos', 'De Silva', nil])
|
244
|
+
end
|
245
|
+
|
246
|
+
it 'parses common lowercase particles as a particle, not family name in sort order' do
|
247
|
+
expect(parser.parse!('de Silva, Carlos')[0].values_at(:given, :family, :particle)).to eq(['Carlos', 'Silva', 'de'])
|
248
|
+
end
|
249
|
+
|
250
|
+
it 'parses common capitalized particles with punctuation as the family name in display order' do
|
251
|
+
expect(parser.parse!('St. Hilaire, Matt')[0].values_at(:given, :family, :particle)).to eq(['Matt', 'St. Hilaire', nil])
|
252
|
+
end
|
253
|
+
end
|
203
254
|
end
|
204
255
|
end
|
205
256
|
|
@@ -0,0 +1,25 @@
|
|
1
|
+
module Namae
|
2
|
+
describe 'Parser using threads' do
|
3
|
+
let(:name_1_str) { "Foo Bar" }
|
4
|
+
let(:name_2_str) { "Baz" }
|
5
|
+
let(:name_1) { Namae.parse(name_1_str).first }
|
6
|
+
let(:name_2) { Namae.parse(name_2_str).first }
|
7
|
+
|
8
|
+
def compare(string, expectation)
|
9
|
+
name = Namae.parse(string).first
|
10
|
+
given_name_match = expectation.given == name.given
|
11
|
+
family_name_match = expectation.family == name.family
|
12
|
+
raise unless given_name_match && family_name_match
|
13
|
+
end
|
14
|
+
|
15
|
+
it 'has no conflicts' do
|
16
|
+
[[name_1_str, name_1], [name_2_str, name_2]].map do |string, expectation|
|
17
|
+
Thread.new do
|
18
|
+
1000.times do
|
19
|
+
compare(string, expectation)
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end.each(&:join)
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
metadata
CHANGED
@@ -1,30 +1,30 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: namae
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 1.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sylvester Keil
|
8
8
|
- Dan Collis-Puro
|
9
|
-
autorequire:
|
9
|
+
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2021-03-12 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: racc
|
16
16
|
requirement: !ruby/object:Gem::Requirement
|
17
17
|
requirements:
|
18
|
-
- -
|
18
|
+
- - "~>"
|
19
19
|
- !ruby/object:Gem::Version
|
20
|
-
version: 1.4
|
20
|
+
version: '1.4'
|
21
21
|
type: :development
|
22
22
|
prerelease: false
|
23
23
|
version_requirements: !ruby/object:Gem::Requirement
|
24
24
|
requirements:
|
25
|
-
- -
|
25
|
+
- - "~>"
|
26
26
|
- !ruby/object:Gem::Version
|
27
|
-
version: 1.4
|
27
|
+
version: '1.4'
|
28
28
|
description: " Namae (名前) is a parser for human names. It recognizes personal names
|
29
29
|
of various cultural backgrounds and tries to split them into their component parts
|
30
30
|
(e.g., given and family names, honorifics etc.). "
|
@@ -36,7 +36,6 @@ extensions: []
|
|
36
36
|
extra_rdoc_files:
|
37
37
|
- README.md
|
38
38
|
files:
|
39
|
-
- ".autotest"
|
40
39
|
- ".codeclimate.yml"
|
41
40
|
- ".coveralls.yml"
|
42
41
|
- ".document"
|
@@ -69,11 +68,12 @@ files:
|
|
69
68
|
- spec/namae/parser_spec.rb
|
70
69
|
- spec/namae/utility_spec.rb
|
71
70
|
- spec/spec_helper.rb
|
71
|
+
- spec/thread_safety_spec.rb
|
72
72
|
homepage: https://github.com/berkmancenter/namae
|
73
73
|
licenses:
|
74
74
|
- AGPL-3.0
|
75
75
|
metadata: {}
|
76
|
-
post_install_message:
|
76
|
+
post_install_message:
|
77
77
|
rdoc_options: []
|
78
78
|
require_paths:
|
79
79
|
- lib
|
@@ -88,9 +88,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
88
88
|
- !ruby/object:Gem::Version
|
89
89
|
version: '0'
|
90
90
|
requirements: []
|
91
|
-
|
92
|
-
|
93
|
-
signing_key:
|
91
|
+
rubygems_version: 3.2.3
|
92
|
+
signing_key:
|
94
93
|
specification_version: 4
|
95
94
|
summary: Namae (名前) parses personal names and splits them into their component parts.
|
96
95
|
test_files: []
|
data/.autotest
DELETED
@@ -1,21 +0,0 @@
|
|
1
|
-
require 'bundler'
|
2
|
-
begin
|
3
|
-
if RUBY_PLATFORM =~ /darwin/
|
4
|
-
Bundler.setup(:default, :development, :debug, :test, :osx)
|
5
|
-
require 'autotest/fsevent'
|
6
|
-
else
|
7
|
-
Bundler.setup(:default, :development, :debug, :test)
|
8
|
-
end
|
9
|
-
rescue Bundler::BundlerError => e
|
10
|
-
$stderr.puts e.message
|
11
|
-
$stderr.puts "Run `bundle install` to install missing gems"
|
12
|
-
exit e.status_code
|
13
|
-
end
|
14
|
-
|
15
|
-
|
16
|
-
Autotest.add_hook :initialize do |at|
|
17
|
-
at.add_mapping(/.+\.y$/) do |f,_|
|
18
|
-
system 'rake clean racc'
|
19
|
-
end
|
20
|
-
end
|
21
|
-
|