namae 0.11.2 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.travis.yml +9 -10
- data/BSDL +1 -1
- data/Gemfile +10 -24
- data/README.md +22 -4
- data/features/lists.feature +11 -1
- data/features/step_definitions/namae_steps.rb +5 -0
- data/features/support/env.rb +0 -9
- data/lib/namae/parser.rb +224 -170
- data/lib/namae/parser.y +56 -22
- data/lib/namae/utility.rb +4 -0
- data/lib/namae/version.rb +3 -3
- data/namae.gemspec +12 -14
- data/spec/namae/parser_spec.rb +61 -10
- data/spec/thread_safety_spec.rb +25 -0
- metadata +11 -12
- data/.autotest +0 -21
data/lib/namae/parser.y
CHANGED
@@ -3,7 +3,7 @@
|
|
3
3
|
|
4
4
|
class Namae::Parser
|
5
5
|
|
6
|
-
token COMMA UWORD LWORD PWORD NICK AND APPELLATION TITLE SUFFIX
|
6
|
+
token COMMA UWORD LWORD PWORD NICK AND APPELLATION TITLE SUFFIX UPARTICLE
|
7
7
|
|
8
8
|
expect 0
|
9
9
|
|
@@ -20,7 +20,7 @@ rule
|
|
20
20
|
| sort_order
|
21
21
|
|
22
22
|
honorific : APPELLATION { result = Name.new(:appellation => val[0]) }
|
23
|
-
|
|
23
|
+
| titles { result = Name.new(:title => val[0]) }
|
24
24
|
|
25
25
|
display_order : u_words word opt_suffices opt_titles
|
26
26
|
{
|
@@ -43,6 +43,14 @@ rule
|
|
43
43
|
result = Name.new(:given => val[0], :particle => val[1],
|
44
44
|
:family => val[2])
|
45
45
|
}
|
46
|
+
| u_words UPARTICLE last
|
47
|
+
{
|
48
|
+
result = if include_particle_in_family?
|
49
|
+
Name.new(:given => val[0], :family => val[1,2].join(' '))
|
50
|
+
else
|
51
|
+
Name.new(:given => val[0], :particle => val[1], :family => val[2])
|
52
|
+
end
|
53
|
+
}
|
46
54
|
| von last
|
47
55
|
{
|
48
56
|
result = Name.new(:particle => val[0], :family => val[1])
|
@@ -53,6 +61,14 @@ rule
|
|
53
61
|
result = Name.new({ :family => val[0], :suffix => val[2][0],
|
54
62
|
:given => val[2][1] }, !!val[2][0])
|
55
63
|
}
|
64
|
+
| UPARTICLE last COMMA first
|
65
|
+
{
|
66
|
+
result = if include_particle_in_family?
|
67
|
+
Name.new({ :family => val[0,2].join(' '), :suffix => val[3][0], :given => val[3][1] }, !!val[3][0])
|
68
|
+
else
|
69
|
+
Name.new({ :particle => val[0], :family => val[1], :suffix => val[3][0], :given => val[3][1] }, !!val[3][0])
|
70
|
+
end
|
71
|
+
}
|
56
72
|
| von last COMMA first
|
57
73
|
{
|
58
74
|
result = Name.new({ :particle => val[0], :family => val[1],
|
@@ -100,26 +116,35 @@ rule
|
|
100
116
|
| titles TITLE { result = val.join(' ') }
|
101
117
|
|
102
118
|
---- header
|
103
|
-
require 'singleton'
|
104
119
|
require 'strscan'
|
105
120
|
|
106
121
|
---- inner
|
107
122
|
|
108
|
-
|
123
|
+
@defaults = {
|
124
|
+
:debug => false,
|
125
|
+
:prefer_comma_as_separator => false,
|
126
|
+
:include_particle_in_family => false,
|
127
|
+
:comma => ',',
|
128
|
+
:stops => ',;',
|
129
|
+
:separator => /\s*(\band\b|\&|;)\s*/i,
|
130
|
+
:title => /\s*\b(sir|lord|count(ess)?|(gen|adm|col|maj|capt|cmdr|lt|sgt|cpl|pvt|pastor|pr|reverend|rev|elder|deacon|deaconess|father|fr|rabbi|cantor|vicar|prof|dr|md|ph\.?d)\.?)(\s+|$)/i,
|
131
|
+
:suffix => /\s*\b(JR|Jr|jr|SR|Sr|sr|[IVX]{2,})(\.|\b)/,
|
132
|
+
:appellation => /\s*\b((mrs?|ms|fr|hr)\.?|miss|herr|frau)(\s+|$)/i,
|
133
|
+
:uppercase_particle => /\s*\b((Da|De|Di|De\sLa|Du|Der|Des|Da|St|Saint|Les|Van)\.?)(\s+|$)/
|
134
|
+
}
|
135
|
+
|
136
|
+
class << self
|
137
|
+
attr_reader :defaults
|
138
|
+
|
139
|
+
def instance
|
140
|
+
Thread.current[:namae] ||= new
|
141
|
+
end
|
142
|
+
end
|
109
143
|
|
110
144
|
attr_reader :options, :input
|
111
145
|
|
112
|
-
def initialize
|
113
|
-
@
|
114
|
-
:debug => false,
|
115
|
-
:prefer_comma_as_separator => false,
|
116
|
-
:comma => ',',
|
117
|
-
:stops => ',;',
|
118
|
-
:separator => /\s*(\band\b|\&|;)\s*/i,
|
119
|
-
:title => /\s*\b(sir|lord|count(ess)?|(gen|adm|col|maj|capt|cmdr|lt|sgt|cpl|pvt|pastor|pr|reverend|rev|elder|deacon|deaconess|father|fr|vicar|prof|dr|md|ph\.?d)\.?)(\s+|$)/i,
|
120
|
-
:suffix => /\s*\b(JR|Jr|jr|SR|Sr|sr|[IVX]{2,})(\.|\b)/,
|
121
|
-
:appellation => /\s*\b((mrs?|ms|fr|hr)\.?|miss|herr|frau)(\s+|$)/i
|
122
|
-
}
|
146
|
+
def initialize(options = {})
|
147
|
+
@options = self.class.defaults.merge(options)
|
123
148
|
end
|
124
149
|
|
125
150
|
def debug?
|
@@ -134,6 +159,10 @@ require 'strscan'
|
|
134
159
|
options[:comma]
|
135
160
|
end
|
136
161
|
|
162
|
+
def include_particle_in_family?
|
163
|
+
options[:include_particle_in_family]
|
164
|
+
end
|
165
|
+
|
137
166
|
def stops
|
138
167
|
options[:stops]
|
139
168
|
end
|
@@ -150,26 +179,29 @@ require 'strscan'
|
|
150
179
|
options[:appellation]
|
151
180
|
end
|
152
181
|
|
182
|
+
def uppercase_particle
|
183
|
+
options[:uppercase_particle]
|
184
|
+
end
|
185
|
+
|
153
186
|
def prefer_comma_as_separator?
|
154
187
|
options[:prefer_comma_as_separator]
|
155
188
|
end
|
156
189
|
|
157
|
-
def parse(
|
158
|
-
parse!(
|
190
|
+
def parse(string)
|
191
|
+
parse!(string)
|
159
192
|
rescue => e
|
160
193
|
warn e.message if debug?
|
161
194
|
[]
|
162
195
|
end
|
163
196
|
|
164
197
|
def parse!(string)
|
165
|
-
input
|
198
|
+
@input = StringScanner.new(normalize(string))
|
166
199
|
reset
|
167
200
|
do_parse
|
168
201
|
end
|
169
202
|
|
170
203
|
def normalize(string)
|
171
|
-
string
|
172
|
-
string
|
204
|
+
string.scrub.strip
|
173
205
|
end
|
174
206
|
|
175
207
|
def reset
|
@@ -220,11 +252,11 @@ require 'strscan'
|
|
220
252
|
end
|
221
253
|
|
222
254
|
def will_see_suffix?
|
223
|
-
input.
|
255
|
+
input.rest.strip.split(/\s+/)[0] =~ suffix
|
224
256
|
end
|
225
257
|
|
226
258
|
def will_see_initial?
|
227
|
-
input.
|
259
|
+
input.rest.strip.split(/\s+/)[0] =~ /^[[:upper:]]+\b/
|
228
260
|
end
|
229
261
|
|
230
262
|
def seen_full_name?
|
@@ -256,6 +288,8 @@ require 'strscan'
|
|
256
288
|
else
|
257
289
|
consume_word(:UWORD, input.matched)
|
258
290
|
end
|
291
|
+
when input.scan(uppercase_particle)
|
292
|
+
consume_word(:UPARTICLE, input.matched.strip)
|
259
293
|
when input.scan(/((\\\w+)?\{[^\}]*\})*[[:upper:]][^\s#{stops}]*/)
|
260
294
|
consume_word(:UWORD, input.matched)
|
261
295
|
when input.scan(/((\\\w+)?\{[^\}]*\})*[[:lower:]][^\s#{stops}]*/)
|
data/lib/namae/utility.rb
CHANGED
data/lib/namae/version.rb
CHANGED
data/namae.gemspec
CHANGED
@@ -2,23 +2,22 @@
|
|
2
2
|
# DO NOT EDIT THIS FILE DIRECTLY
|
3
3
|
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
4
|
# -*- encoding: utf-8 -*-
|
5
|
-
# stub: namae
|
5
|
+
# stub: namae 1.1.0 ruby lib
|
6
6
|
|
7
7
|
Gem::Specification.new do |s|
|
8
8
|
s.name = "namae".freeze
|
9
|
-
s.version = "
|
9
|
+
s.version = "1.1.0"
|
10
10
|
|
11
11
|
s.required_rubygems_version = Gem::Requirement.new(">= 0".freeze) if s.respond_to? :required_rubygems_version=
|
12
12
|
s.require_paths = ["lib".freeze]
|
13
13
|
s.authors = ["Sylvester Keil".freeze, "Dan Collis-Puro".freeze]
|
14
|
-
s.date = "
|
15
|
-
s.description = " Namae (\
|
14
|
+
s.date = "2021-03-12"
|
15
|
+
s.description = " Namae (\u540D\u524D) is a parser for human names. It recognizes personal names of various cultural backgrounds and tries to split them into their component parts (e.g., given and family names, honorifics etc.). ".freeze
|
16
16
|
s.email = ["sylvester@keil.or.at".freeze, "dan@collispuro.com".freeze]
|
17
17
|
s.extra_rdoc_files = [
|
18
18
|
"README.md"
|
19
19
|
]
|
20
20
|
s.files = [
|
21
|
-
".autotest",
|
22
21
|
".codeclimate.yml",
|
23
22
|
".coveralls.yml",
|
24
23
|
".document",
|
@@ -50,23 +49,22 @@ Gem::Specification.new do |s|
|
|
50
49
|
"spec/namae/name_spec.rb",
|
51
50
|
"spec/namae/parser_spec.rb",
|
52
51
|
"spec/namae/utility_spec.rb",
|
53
|
-
"spec/spec_helper.rb"
|
52
|
+
"spec/spec_helper.rb",
|
53
|
+
"spec/thread_safety_spec.rb"
|
54
54
|
]
|
55
55
|
s.homepage = "https://github.com/berkmancenter/namae".freeze
|
56
56
|
s.licenses = ["AGPL-3.0".freeze]
|
57
|
-
s.rubygems_version = "2.
|
58
|
-
s.summary = "Namae (\
|
57
|
+
s.rubygems_version = "3.2.3".freeze
|
58
|
+
s.summary = "Namae (\u540D\u524D) parses personal names and splits them into their component parts.".freeze
|
59
59
|
|
60
60
|
if s.respond_to? :specification_version then
|
61
61
|
s.specification_version = 4
|
62
|
+
end
|
62
63
|
|
63
|
-
|
64
|
-
|
65
|
-
else
|
66
|
-
s.add_dependency(%q<racc>.freeze, ["= 1.4.9"])
|
67
|
-
end
|
64
|
+
if s.respond_to? :add_runtime_dependency then
|
65
|
+
s.add_development_dependency(%q<racc>.freeze, ["~> 1.4"])
|
68
66
|
else
|
69
|
-
s.add_dependency(%q<racc>.freeze, ["
|
67
|
+
s.add_dependency(%q<racc>.freeze, ["~> 1.4"])
|
70
68
|
end
|
71
69
|
end
|
72
70
|
|
data/spec/namae/parser_spec.rb
CHANGED
@@ -1,10 +1,5 @@
|
|
1
1
|
module Namae
|
2
2
|
describe 'Parser' do
|
3
|
-
|
4
|
-
it 'does not respond to .new' do
|
5
|
-
expect(Parser).not_to respond_to(:new)
|
6
|
-
end
|
7
|
-
|
8
3
|
describe '.instance' do
|
9
4
|
let(:parser) { Parser.instance }
|
10
5
|
|
@@ -120,7 +115,7 @@ module Namae
|
|
120
115
|
end
|
121
116
|
end
|
122
117
|
|
123
|
-
%w{Pastor Pr. Reverend Rev. Elder Deacon Deaconess Father Fr. Vicar}.each do |title|
|
118
|
+
%w{Pastor Pr. Reverend Rev. Elder Deacon Deaconess Father Fr. Vicar Rabbi Cantor}.each do |title|
|
124
119
|
describe "the next token is #{title.inspect}" do
|
125
120
|
before { parser.send(:input).string = title }
|
126
121
|
it 'returns a TITLE token' do
|
@@ -149,10 +144,6 @@ module Namae
|
|
149
144
|
expect(parser.parse!('Ichiro')[0].given).to eq('Ichiro')
|
150
145
|
end
|
151
146
|
|
152
|
-
it 'removes numbers' do
|
153
|
-
expect(parser.parse!('Ichiro 20156')[0].given).to eq('Ichiro')
|
154
|
-
end
|
155
|
-
|
156
147
|
it 'treats "Lord Byron" as a title and family name' do
|
157
148
|
expect(parser.parse!('Lord Byron')[0].values_at(:family, :title)).to eq(['Byron', 'Lord'])
|
158
149
|
end
|
@@ -200,6 +191,66 @@ module Namae
|
|
200
191
|
expect(parser.parse!('Bernado Franecki Ph.D.')[0].values_at(:given, :family, :title)).to eq(['Bernado', 'Franecki', 'Ph.D.'])
|
201
192
|
#expect(parser.parse!('Bernado Franecki, Ph.D.')[0].values_at(:given, :family, :title)).to eq(['Bernado', 'Franecki', 'Ph.D.'])
|
202
193
|
end
|
194
|
+
|
195
|
+
it 'parses consecutive titles in display order' do
|
196
|
+
expect(parser.parse!('Lt. Col. Bernado Franecki')[0].values_at(:given, :family, :title)).to eq(['Bernado', 'Franecki', 'Lt. Col.'])
|
197
|
+
end
|
198
|
+
|
199
|
+
context 'when include_particle_in_family is false' do
|
200
|
+
let(:parser) { Parser.new(include_particle_in_family: false) }
|
201
|
+
|
202
|
+
it 'parses common capitalized particles as the family name in display order' do
|
203
|
+
expect(parser.parse!('Carlos De Silva')[0].values_at(:given, :family, :particle)).to eq(['Carlos', 'Silva', 'De'])
|
204
|
+
end
|
205
|
+
|
206
|
+
it 'parses common capitalized particles with punctuation as the family name in display order' do
|
207
|
+
expect(parser.parse!('Matt St. Hilaire')[0].values_at(:given, :family, :particle)).to eq(['Matt', 'Hilaire', 'St.'])
|
208
|
+
end
|
209
|
+
|
210
|
+
it 'parses common lowercase particles as a particle, not family name in display order' do
|
211
|
+
expect(parser.parse!('Carlos de Silva')[0].values_at(:given, :family, :particle)).to eq(['Carlos', 'Silva', 'de'])
|
212
|
+
end
|
213
|
+
|
214
|
+
it 'parses common capitalized particles as the family name in sort order' do
|
215
|
+
expect(parser.parse!('De Silva, Carlos')[0].values_at(:given, :family, :particle)).to eq(['Carlos', 'Silva', 'De'])
|
216
|
+
end
|
217
|
+
|
218
|
+
it 'parses common lowercase particles as a particle, not family name in sort order' do
|
219
|
+
expect(parser.parse!('de Silva, Carlos')[0].values_at(:given, :family, :particle)).to eq(['Carlos', 'Silva', 'de'])
|
220
|
+
end
|
221
|
+
|
222
|
+
it 'parses common capitalized particles with punctuation as the family name in display order' do
|
223
|
+
expect(parser.parse!('St. Hilaire, Matt')[0].values_at(:given, :family, :particle)).to eq(['Matt', 'Hilaire', 'St.'])
|
224
|
+
end
|
225
|
+
end
|
226
|
+
|
227
|
+
context 'when include_particle_in_family is true' do
|
228
|
+
let(:parser) { Parser.new(include_particle_in_family: true) }
|
229
|
+
|
230
|
+
it 'parses common capitalized particles as the family name in display order' do
|
231
|
+
expect(parser.parse!('Carlos De Silva')[0].values_at(:given, :family, :particle)).to eq(['Carlos', 'De Silva', nil])
|
232
|
+
end
|
233
|
+
|
234
|
+
it 'parses common capitalized particles with punctuation as the family name in display order' do
|
235
|
+
expect(parser.parse!('Matt St. Hilaire')[0].values_at(:given, :family, :particle)).to eq(['Matt', 'St. Hilaire', nil])
|
236
|
+
end
|
237
|
+
|
238
|
+
it 'parses common lowercase particles as a particle, not family name in display order' do
|
239
|
+
expect(parser.parse!('Carlos de Silva')[0].values_at(:given, :family, :particle)).to eq(['Carlos', 'Silva', 'de'])
|
240
|
+
end
|
241
|
+
|
242
|
+
it 'parses common capitalized particles as the family name in sort order' do
|
243
|
+
expect(parser.parse!('De Silva, Carlos')[0].values_at(:given, :family, :particle)).to eq(['Carlos', 'De Silva', nil])
|
244
|
+
end
|
245
|
+
|
246
|
+
it 'parses common lowercase particles as a particle, not family name in sort order' do
|
247
|
+
expect(parser.parse!('de Silva, Carlos')[0].values_at(:given, :family, :particle)).to eq(['Carlos', 'Silva', 'de'])
|
248
|
+
end
|
249
|
+
|
250
|
+
it 'parses common capitalized particles with punctuation as the family name in display order' do
|
251
|
+
expect(parser.parse!('St. Hilaire, Matt')[0].values_at(:given, :family, :particle)).to eq(['Matt', 'St. Hilaire', nil])
|
252
|
+
end
|
253
|
+
end
|
203
254
|
end
|
204
255
|
end
|
205
256
|
|
@@ -0,0 +1,25 @@
|
|
1
|
+
module Namae
|
2
|
+
describe 'Parser using threads' do
|
3
|
+
let(:name_1_str) { "Foo Bar" }
|
4
|
+
let(:name_2_str) { "Baz" }
|
5
|
+
let(:name_1) { Namae.parse(name_1_str).first }
|
6
|
+
let(:name_2) { Namae.parse(name_2_str).first }
|
7
|
+
|
8
|
+
def compare(string, expectation)
|
9
|
+
name = Namae.parse(string).first
|
10
|
+
given_name_match = expectation.given == name.given
|
11
|
+
family_name_match = expectation.family == name.family
|
12
|
+
raise unless given_name_match && family_name_match
|
13
|
+
end
|
14
|
+
|
15
|
+
it 'has no conflicts' do
|
16
|
+
[[name_1_str, name_1], [name_2_str, name_2]].map do |string, expectation|
|
17
|
+
Thread.new do
|
18
|
+
1000.times do
|
19
|
+
compare(string, expectation)
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end.each(&:join)
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
metadata
CHANGED
@@ -1,30 +1,30 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: namae
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 1.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sylvester Keil
|
8
8
|
- Dan Collis-Puro
|
9
|
-
autorequire:
|
9
|
+
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2021-03-12 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: racc
|
16
16
|
requirement: !ruby/object:Gem::Requirement
|
17
17
|
requirements:
|
18
|
-
- -
|
18
|
+
- - "~>"
|
19
19
|
- !ruby/object:Gem::Version
|
20
|
-
version: 1.4
|
20
|
+
version: '1.4'
|
21
21
|
type: :development
|
22
22
|
prerelease: false
|
23
23
|
version_requirements: !ruby/object:Gem::Requirement
|
24
24
|
requirements:
|
25
|
-
- -
|
25
|
+
- - "~>"
|
26
26
|
- !ruby/object:Gem::Version
|
27
|
-
version: 1.4
|
27
|
+
version: '1.4'
|
28
28
|
description: " Namae (名前) is a parser for human names. It recognizes personal names
|
29
29
|
of various cultural backgrounds and tries to split them into their component parts
|
30
30
|
(e.g., given and family names, honorifics etc.). "
|
@@ -36,7 +36,6 @@ extensions: []
|
|
36
36
|
extra_rdoc_files:
|
37
37
|
- README.md
|
38
38
|
files:
|
39
|
-
- ".autotest"
|
40
39
|
- ".codeclimate.yml"
|
41
40
|
- ".coveralls.yml"
|
42
41
|
- ".document"
|
@@ -69,11 +68,12 @@ files:
|
|
69
68
|
- spec/namae/parser_spec.rb
|
70
69
|
- spec/namae/utility_spec.rb
|
71
70
|
- spec/spec_helper.rb
|
71
|
+
- spec/thread_safety_spec.rb
|
72
72
|
homepage: https://github.com/berkmancenter/namae
|
73
73
|
licenses:
|
74
74
|
- AGPL-3.0
|
75
75
|
metadata: {}
|
76
|
-
post_install_message:
|
76
|
+
post_install_message:
|
77
77
|
rdoc_options: []
|
78
78
|
require_paths:
|
79
79
|
- lib
|
@@ -88,9 +88,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
88
88
|
- !ruby/object:Gem::Version
|
89
89
|
version: '0'
|
90
90
|
requirements: []
|
91
|
-
|
92
|
-
|
93
|
-
signing_key:
|
91
|
+
rubygems_version: 3.2.3
|
92
|
+
signing_key:
|
94
93
|
specification_version: 4
|
95
94
|
summary: Namae (名前) parses personal names and splits them into their component parts.
|
96
95
|
test_files: []
|
data/.autotest
DELETED
@@ -1,21 +0,0 @@
|
|
1
|
-
require 'bundler'
|
2
|
-
begin
|
3
|
-
if RUBY_PLATFORM =~ /darwin/
|
4
|
-
Bundler.setup(:default, :development, :debug, :test, :osx)
|
5
|
-
require 'autotest/fsevent'
|
6
|
-
else
|
7
|
-
Bundler.setup(:default, :development, :debug, :test)
|
8
|
-
end
|
9
|
-
rescue Bundler::BundlerError => e
|
10
|
-
$stderr.puts e.message
|
11
|
-
$stderr.puts "Run `bundle install` to install missing gems"
|
12
|
-
exit e.status_code
|
13
|
-
end
|
14
|
-
|
15
|
-
|
16
|
-
Autotest.add_hook :initialize do |at|
|
17
|
-
at.add_mapping(/.+\.y$/) do |f,_|
|
18
|
-
system 'rake clean racc'
|
19
|
-
end
|
20
|
-
end
|
21
|
-
|