namae 1.0.2 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/features/lists.feature +4 -2
- data/features/step_definitions/namae_steps.rb +5 -0
- data/lib/namae/parser.rb +197 -149
- data/lib/namae/parser.y +31 -3
- data/lib/namae/version.rb +2 -2
- data/namae.gemspec +3 -3
- data/spec/namae/parser_spec.rb +60 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 82fa8955c4f650ccbcb6bf67db18f005eafd2f1f09252b8d98203a6f04949ed2
|
4
|
+
data.tar.gz: c96965c52193db381f8fceb0e8bfc34453f62127c846b6cb593e804c25394908
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 14644528eb8d587a2fd0064fbddfa6cfe8925d7a1b9b3f6a75b33c599f3bbb9a43b53e951e4ec8fab9506c5edc2ef62abff77fc1f43cca1ce3489c5ab4c36f8f
|
7
|
+
data.tar.gz: 921b0c60e964b8e5f6154008ab8923d59a2df5523484e38618f3ddddbd89eb2d0ad99ffdbbc9d90629cced3051e38fffb05617657338018afd8913cefc6e891c
|
data/features/lists.feature
CHANGED
@@ -115,7 +115,8 @@ Feature: Parse a list of names
|
|
115
115
|
| B | Malcom |
|
116
116
|
|
117
117
|
Scenario: A list of names with particles separated by commas
|
118
|
-
Given
|
118
|
+
Given I want to include particles in the family name
|
119
|
+
And a parser that prefers commas as separators
|
119
120
|
When I parse the names "Di Proctor, M., von Cooper, P."
|
120
121
|
Then the names should be:
|
121
122
|
| given | family |
|
@@ -128,7 +129,8 @@ Feature: Parse a list of names
|
|
128
129
|
| P | Cooper |
|
129
130
|
|
130
131
|
Scenario: A list of names with two consecutive accented characters
|
131
|
-
Given
|
132
|
+
Given I want to include particles in the family name
|
133
|
+
And a parser that prefers commas as separators
|
132
134
|
When I parse the names "Çakıroğlu, Ü., Başıbüyük, B."
|
133
135
|
Then the names should be:
|
134
136
|
| given | family |
|
@@ -2,6 +2,11 @@ Given /^a parser that prefers commas as separators$/ do
|
|
2
2
|
Namae::Parser.instance.options[:prefer_comma_as_separator] = true
|
3
3
|
end
|
4
4
|
|
5
|
+
Given /^I want to include particles in the family name$/ do
|
6
|
+
Namae::Parser.instance.options[:include_particle_in_family] = true
|
7
|
+
end
|
8
|
+
|
9
|
+
|
5
10
|
When /^I parse the name "(.*)"$/ do |string|
|
6
11
|
@name = Namae.parse!(string)[0]
|
7
12
|
end
|
data/lib/namae/parser.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
#
|
2
2
|
# DO NOT MODIFY!!!!
|
3
|
-
# This file is automatically generated by Racc 1.
|
3
|
+
# This file is automatically generated by Racc 1.5.2
|
4
4
|
# from Racc grammar file "".
|
5
5
|
#
|
6
6
|
|
@@ -11,17 +11,19 @@ require 'strscan'
|
|
11
11
|
module Namae
|
12
12
|
class Parser < Racc::Parser
|
13
13
|
|
14
|
-
module_eval(<<'...end parser.y/module_eval...', 'parser.y',
|
14
|
+
module_eval(<<'...end parser.y/module_eval...', 'parser.y', 122)
|
15
15
|
|
16
16
|
@defaults = {
|
17
17
|
:debug => false,
|
18
18
|
:prefer_comma_as_separator => false,
|
19
|
+
:include_particle_in_family => false,
|
19
20
|
:comma => ',',
|
20
21
|
:stops => ',;',
|
21
22
|
:separator => /\s*(\band\b|\&|;)\s*/i,
|
22
23
|
:title => /\s*\b(sir|lord|count(ess)?|(gen|adm|col|maj|capt|cmdr|lt|sgt|cpl|pvt|pastor|pr|reverend|rev|elder|deacon|deaconess|father|fr|rabbi|cantor|vicar|prof|dr|md|ph\.?d)\.?)(\s+|$)/i,
|
23
24
|
:suffix => /\s*\b(JR|Jr|jr|SR|Sr|sr|[IVX]{2,})(\.|\b)/,
|
24
|
-
:appellation => /\s*\b((mrs?|ms|fr|hr)\.?|miss|herr|frau)(\s+|$)/i
|
25
|
+
:appellation => /\s*\b((mrs?|ms|fr|hr)\.?|miss|herr|frau)(\s+|$)/i,
|
26
|
+
:uppercase_particle => /\s*\b((Da|De|Di|De\sLa|Du|Der|Des|Da|St|Saint|Les|Van)\.?)(\s+|$)/
|
25
27
|
}
|
26
28
|
|
27
29
|
class << self
|
@@ -50,6 +52,10 @@ module_eval(<<'...end parser.y/module_eval...', 'parser.y', 106)
|
|
50
52
|
options[:comma]
|
51
53
|
end
|
52
54
|
|
55
|
+
def include_particle_in_family?
|
56
|
+
options[:include_particle_in_family]
|
57
|
+
end
|
58
|
+
|
53
59
|
def stops
|
54
60
|
options[:stops]
|
55
61
|
end
|
@@ -66,6 +72,10 @@ module_eval(<<'...end parser.y/module_eval...', 'parser.y', 106)
|
|
66
72
|
options[:appellation]
|
67
73
|
end
|
68
74
|
|
75
|
+
def uppercase_particle
|
76
|
+
options[:uppercase_particle]
|
77
|
+
end
|
78
|
+
|
69
79
|
def prefer_comma_as_separator?
|
70
80
|
options[:prefer_comma_as_separator]
|
71
81
|
end
|
@@ -171,6 +181,8 @@ module_eval(<<'...end parser.y/module_eval...', 'parser.y', 106)
|
|
171
181
|
else
|
172
182
|
consume_word(:UWORD, input.matched)
|
173
183
|
end
|
184
|
+
when input.scan(uppercase_particle)
|
185
|
+
consume_word(:UPARTICLE, input.matched.strip)
|
174
186
|
when input.scan(/((\\\w+)?\{[^\}]*\})*[[:upper:]][^\s#{stops}]*/)
|
175
187
|
consume_word(:UWORD, input.matched)
|
176
188
|
when input.scan(/((\\\w+)?\{[^\}]*\})*[[:lower:]][^\s#{stops}]*/)
|
@@ -195,133 +207,143 @@ module_eval(<<'...end parser.y/module_eval...', 'parser.y', 106)
|
|
195
207
|
##### State transition tables begin ###
|
196
208
|
|
197
209
|
racc_action_table = [
|
198
|
-
-
|
199
|
-
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
210
|
+
-41, 18, 25, 34, -42, 35, 36, -41, 19, -41,
|
211
|
+
-41, -42, 40, -42, -42, 15, 13, 16, 46, 52,
|
212
|
+
7, 17, 62, 12, 15, 24, 16, 27, 15, 13,
|
213
|
+
16, 17, 29, 7, 17, 66, 12, 15, 24, 16,
|
214
|
+
27, 73, 60, 59, 61, 29, 74, 46, -40, -36,
|
215
|
+
-24, 60, 59, 61, 66, -40, 69, 25, 46, 60,
|
216
|
+
59, 61, 60, 59, 61, 17, 46, 46, 46, 46,
|
217
|
+
60, 59, 61, 15, 24, 16, 17, 46, 34, 32,
|
218
|
+
35, 34, 38, 35, 34, 32, 35, -21, -21, -21,
|
219
|
+
34, 49, 35, 34, 32, 35, 34, 38, 35, -22,
|
220
|
+
-22, -22, 34, 53, 35, 34, 32, 35, 34, 32,
|
221
|
+
35, -21, -21, -21, 60, 59, 61, 60, 59, 61,
|
222
|
+
66 ]
|
209
223
|
|
210
224
|
racc_action_check = [
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
225
|
+
15, 1, 8, 39, 16, 39, 11, 15, 1, 15,
|
226
|
+
15, 16, 18, 16, 16, 0, 0, 0, 26, 31,
|
227
|
+
0, 0, 37, 0, 9, 9, 9, 9, 19, 19,
|
228
|
+
19, 44, 9, 19, 19, 45, 19, 22, 22, 22,
|
229
|
+
22, 56, 56, 56, 56, 22, 57, 47, 13, 56,
|
230
|
+
13, 36, 36, 36, 57, 13, 50, 65, 36, 52,
|
231
|
+
52, 52, 62, 62, 62, 67, 52, 68, 71, 62,
|
232
|
+
69, 69, 69, 5, 5, 5, 77, 69, 10, 10,
|
233
|
+
10, 12, 12, 12, 23, 23, 23, 24, 24, 24,
|
234
|
+
27, 27, 27, 28, 28, 28, 29, 29, 29, 32,
|
235
|
+
32, 32, 33, 33, 33, 42, 42, 42, 48, 48,
|
236
|
+
48, 49, 49, 49, 74, 74, 74, 80, 80, 80,
|
237
|
+
79 ]
|
222
238
|
|
223
239
|
racc_action_pointer = [
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
nil,
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
240
|
+
12, 1, nil, nil, nil, 70, nil, nil, -7, 21,
|
241
|
+
75, 4, 78, 48, nil, 0, 4, nil, 12, 25,
|
242
|
+
nil, nil, 34, 81, 84, nil, 8, 87, 90, 93,
|
243
|
+
nil, 17, 96, 99, nil, nil, 48, 20, nil, 0,
|
244
|
+
nil, nil, 102, nil, 22, 25, nil, 37, 105, 108,
|
245
|
+
54, nil, 56, nil, nil, nil, 39, 44, nil, nil,
|
246
|
+
nil, nil, 59, nil, nil, 48, nil, 56, 57, 67,
|
247
|
+
nil, 58, nil, nil, 111, nil, nil, 67, nil, 110,
|
248
|
+
114, nil ]
|
232
249
|
|
233
250
|
racc_action_default = [
|
234
|
-
-1, -
|
235
|
-
-
|
236
|
-
|
237
|
-
-
|
238
|
-
|
239
|
-
-
|
240
|
-
-
|
241
|
-
-
|
251
|
+
-1, -51, -2, -4, -5, -51, -8, -9, -10, -25,
|
252
|
+
-51, -51, -51, -21, -30, -32, -33, -49, -51, -51,
|
253
|
+
-6, -7, -51, -51, -40, -50, -43, -51, -51, -51,
|
254
|
+
-31, -16, -24, -25, -32, -33, -38, -51, -24, -25,
|
255
|
+
82, -3, -51, -16, -47, -44, -45, -43, -51, -24,
|
256
|
+
-14, -15, -38, -23, -17, -26, -39, -28, -34, -40,
|
257
|
+
-41, -42, -38, -14, -11, -48, -46, -47, -43, -38,
|
258
|
+
-19, -51, -35, -37, -51, -18, -12, -47, -20, -27,
|
259
|
+
-29, -13 ]
|
242
260
|
|
243
261
|
racc_goto_table = [
|
244
|
-
3,
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
262
|
+
3, 1, 45, 44, 54, 20, 64, 21, 31, 26,
|
263
|
+
37, 23, 80, 2, 71, 28, 8, nil, 30, 3,
|
264
|
+
70, 43, 26, 45, 67, 47, 50, 51, 42, 76,
|
265
|
+
75, 30, 41, 48, nil, 8, nil, 78, 9, 81,
|
266
|
+
63, nil, 30, 22, 45, 77, 68, 79, 30, nil,
|
267
|
+
39, nil, nil, nil, nil, nil, 72, 9, nil, nil,
|
268
|
+
nil, nil, nil, nil, nil, 39, nil, 39, nil, nil,
|
250
269
|
nil, nil, nil, nil, nil, nil, nil, nil, nil, nil,
|
251
|
-
|
270
|
+
72 ]
|
252
271
|
|
253
272
|
racc_goto_check = [
|
254
|
-
3,
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
273
|
+
3, 1, 17, 9, 13, 3, 10, 4, 11, 3,
|
274
|
+
11, 12, 15, 2, 16, 12, 7, nil, 18, 3,
|
275
|
+
13, 11, 3, 17, 9, 11, 11, 11, 12, 10,
|
276
|
+
13, 18, 2, 12, nil, 7, nil, 13, 8, 10,
|
277
|
+
11, nil, 18, 8, 17, 9, 11, 17, 18, nil,
|
278
|
+
8, nil, nil, nil, nil, nil, 3, 8, nil, nil,
|
279
|
+
nil, nil, nil, nil, nil, 8, nil, 8, nil, nil,
|
259
280
|
nil, nil, nil, nil, nil, nil, nil, nil, nil, nil,
|
260
|
-
|
261
|
-
nil, nil, nil, 3 ]
|
281
|
+
3 ]
|
262
282
|
|
263
283
|
racc_goto_pointer = [
|
264
|
-
nil,
|
265
|
-
|
284
|
+
nil, 1, 13, 0, 2, nil, nil, 16, 38, -23,
|
285
|
+
-38, -2, 6, -32, nil, -62, -42, -24, 9 ]
|
266
286
|
|
267
287
|
racc_goto_default = [
|
268
|
-
nil, nil, nil,
|
269
|
-
11, 10, nil,
|
288
|
+
nil, nil, nil, 58, 4, 5, 6, 65, 33, nil,
|
289
|
+
nil, 11, 10, nil, 55, 56, nil, 57, 14 ]
|
270
290
|
|
271
291
|
racc_reduce_table = [
|
272
292
|
0, 0, :racc_error,
|
273
|
-
0,
|
274
|
-
1,
|
275
|
-
3,
|
276
|
-
1,
|
277
|
-
1, 13, :_reduce_none,
|
278
|
-
2, 13, :_reduce_6,
|
279
|
-
2, 13, :_reduce_7,
|
280
|
-
1, 13, :_reduce_none,
|
281
|
-
1, 16, :_reduce_9,
|
282
|
-
1, 16, :_reduce_10,
|
283
|
-
4, 15, :_reduce_11,
|
284
|
-
5, 15, :_reduce_12,
|
285
|
-
6, 15, :_reduce_13,
|
286
|
-
3, 15, :_reduce_14,
|
287
|
-
2, 15, :_reduce_15,
|
288
|
-
3, 17, :_reduce_16,
|
289
|
-
4, 17, :_reduce_17,
|
290
|
-
5, 17, :_reduce_18,
|
291
|
-
1, 22, :_reduce_none,
|
292
|
-
2, 22, :_reduce_20,
|
293
|
-
3, 22, :_reduce_21,
|
294
|
-
1, 21, :_reduce_none,
|
295
|
-
1, 21, :_reduce_none,
|
296
|
-
1, 23, :_reduce_24,
|
297
|
-
3, 23, :_reduce_25,
|
298
|
-
1, 23, :_reduce_26,
|
299
|
-
3, 23, :_reduce_27,
|
300
|
-
1, 18, :_reduce_none,
|
301
|
-
2, 18, :_reduce_29,
|
302
|
-
1, 28, :_reduce_none,
|
303
|
-
1, 28, :_reduce_none,
|
304
|
-
1, 25, :_reduce_none,
|
305
|
-
2, 25, :_reduce_33,
|
306
|
-
0, 26, :_reduce_none,
|
307
|
-
1, 26, :_reduce_none,
|
308
|
-
0, 24, :_reduce_none,
|
309
|
-
1, 24, :_reduce_none,
|
310
|
-
1, 14, :_reduce_none,
|
293
|
+
0, 13, :_reduce_1,
|
294
|
+
1, 13, :_reduce_2,
|
295
|
+
3, 13, :_reduce_3,
|
296
|
+
1, 14, :_reduce_4,
|
311
297
|
1, 14, :_reduce_none,
|
298
|
+
2, 14, :_reduce_6,
|
299
|
+
2, 14, :_reduce_7,
|
312
300
|
1, 14, :_reduce_none,
|
313
|
-
|
314
|
-
1,
|
315
|
-
|
316
|
-
|
317
|
-
|
301
|
+
1, 17, :_reduce_9,
|
302
|
+
1, 17, :_reduce_10,
|
303
|
+
4, 16, :_reduce_11,
|
304
|
+
5, 16, :_reduce_12,
|
305
|
+
6, 16, :_reduce_13,
|
306
|
+
3, 16, :_reduce_14,
|
307
|
+
3, 16, :_reduce_15,
|
308
|
+
2, 16, :_reduce_16,
|
309
|
+
3, 18, :_reduce_17,
|
310
|
+
4, 18, :_reduce_18,
|
311
|
+
4, 18, :_reduce_19,
|
312
|
+
5, 18, :_reduce_20,
|
313
|
+
1, 24, :_reduce_none,
|
314
|
+
2, 24, :_reduce_22,
|
315
|
+
3, 24, :_reduce_23,
|
316
|
+
1, 23, :_reduce_none,
|
317
|
+
1, 23, :_reduce_none,
|
318
|
+
1, 25, :_reduce_26,
|
319
|
+
3, 25, :_reduce_27,
|
320
|
+
1, 25, :_reduce_28,
|
321
|
+
3, 25, :_reduce_29,
|
318
322
|
1, 20, :_reduce_none,
|
323
|
+
2, 20, :_reduce_31,
|
324
|
+
1, 30, :_reduce_none,
|
325
|
+
1, 30, :_reduce_none,
|
326
|
+
1, 27, :_reduce_none,
|
327
|
+
2, 27, :_reduce_35,
|
328
|
+
0, 28, :_reduce_none,
|
329
|
+
1, 28, :_reduce_none,
|
330
|
+
0, 26, :_reduce_none,
|
331
|
+
1, 26, :_reduce_none,
|
332
|
+
1, 15, :_reduce_none,
|
333
|
+
1, 15, :_reduce_none,
|
334
|
+
1, 15, :_reduce_none,
|
335
|
+
0, 21, :_reduce_none,
|
336
|
+
1, 21, :_reduce_none,
|
319
337
|
1, 29, :_reduce_none,
|
320
|
-
2, 29, :
|
338
|
+
2, 29, :_reduce_46,
|
339
|
+
0, 22, :_reduce_none,
|
340
|
+
1, 22, :_reduce_none,
|
341
|
+
1, 19, :_reduce_none,
|
342
|
+
2, 19, :_reduce_50 ]
|
321
343
|
|
322
|
-
racc_reduce_n =
|
344
|
+
racc_reduce_n = 51
|
323
345
|
|
324
|
-
racc_shift_n =
|
346
|
+
racc_shift_n = 82
|
325
347
|
|
326
348
|
racc_token_table = {
|
327
349
|
false => 0,
|
@@ -334,9 +356,10 @@ racc_token_table = {
|
|
334
356
|
:AND => 7,
|
335
357
|
:APPELLATION => 8,
|
336
358
|
:TITLE => 9,
|
337
|
-
:SUFFIX => 10
|
359
|
+
:SUFFIX => 10,
|
360
|
+
:UPARTICLE => 11 }
|
338
361
|
|
339
|
-
racc_nt_base =
|
362
|
+
racc_nt_base = 12
|
340
363
|
|
341
364
|
racc_use_result_var = true
|
342
365
|
|
@@ -368,6 +391,7 @@ Racc_token_to_s_table = [
|
|
368
391
|
"APPELLATION",
|
369
392
|
"TITLE",
|
370
393
|
"SUFFIX",
|
394
|
+
"UPARTICLE",
|
371
395
|
"$start",
|
372
396
|
"names",
|
373
397
|
"name",
|
@@ -375,6 +399,7 @@ Racc_token_to_s_table = [
|
|
375
399
|
"display_order",
|
376
400
|
"honorific",
|
377
401
|
"sort_order",
|
402
|
+
"titles",
|
378
403
|
"u_words",
|
379
404
|
"opt_suffices",
|
380
405
|
"opt_titles",
|
@@ -385,8 +410,7 @@ Racc_token_to_s_table = [
|
|
385
410
|
"words",
|
386
411
|
"opt_comma",
|
387
412
|
"suffices",
|
388
|
-
"u_word"
|
389
|
-
"titles" ]
|
413
|
+
"u_word" ]
|
390
414
|
|
391
415
|
Racc_debug_parser = false
|
392
416
|
|
@@ -493,14 +517,26 @@ module_eval(<<'.,.,', 'parser.y', 42)
|
|
493
517
|
|
494
518
|
module_eval(<<'.,.,', 'parser.y', 47)
|
495
519
|
def _reduce_15(val, _values, result)
|
496
|
-
|
520
|
+
result = if include_particle_in_family?
|
521
|
+
Name.new(:given => val[0], :family => val[1,2].join(' '))
|
522
|
+
else
|
523
|
+
Name.new(:given => val[0], :particle => val[1], :family => val[2])
|
524
|
+
end
|
497
525
|
|
498
526
|
result
|
499
527
|
end
|
500
528
|
.,.,
|
501
529
|
|
502
|
-
module_eval(<<'.,.,', 'parser.y',
|
530
|
+
module_eval(<<'.,.,', 'parser.y', 55)
|
503
531
|
def _reduce_16(val, _values, result)
|
532
|
+
result = Name.new(:particle => val[0], :family => val[1])
|
533
|
+
|
534
|
+
result
|
535
|
+
end
|
536
|
+
.,.,
|
537
|
+
|
538
|
+
module_eval(<<'.,.,', 'parser.y', 60)
|
539
|
+
def _reduce_17(val, _values, result)
|
504
540
|
result = Name.new({ :family => val[0], :suffix => val[2][0],
|
505
541
|
:given => val[2][1] }, !!val[2][0])
|
506
542
|
|
@@ -508,8 +544,20 @@ module_eval(<<'.,.,', 'parser.y', 52)
|
|
508
544
|
end
|
509
545
|
.,.,
|
510
546
|
|
511
|
-
module_eval(<<'.,.,', 'parser.y',
|
512
|
-
def
|
547
|
+
module_eval(<<'.,.,', 'parser.y', 65)
|
548
|
+
def _reduce_18(val, _values, result)
|
549
|
+
result = if include_particle_in_family?
|
550
|
+
Name.new({ :family => val[0,2].join(' '), :suffix => val[3][0], :given => val[3][1] }, !!val[3][0])
|
551
|
+
else
|
552
|
+
Name.new({ :particle => val[0], :family => val[1], :suffix => val[3][0], :given => val[3][1] }, !!val[3][0])
|
553
|
+
end
|
554
|
+
|
555
|
+
result
|
556
|
+
end
|
557
|
+
.,.,
|
558
|
+
|
559
|
+
module_eval(<<'.,.,', 'parser.y', 73)
|
560
|
+
def _reduce_19(val, _values, result)
|
513
561
|
result = Name.new({ :particle => val[0], :family => val[1],
|
514
562
|
:suffix => val[3][0], :given => val[3][1] }, !!val[3][0])
|
515
563
|
|
@@ -517,8 +565,8 @@ module_eval(<<'.,.,', 'parser.y', 57)
|
|
517
565
|
end
|
518
566
|
.,.,
|
519
567
|
|
520
|
-
module_eval(<<'.,.,', 'parser.y',
|
521
|
-
def
|
568
|
+
module_eval(<<'.,.,', 'parser.y', 78)
|
569
|
+
def _reduce_20(val, _values, result)
|
522
570
|
result = Name.new({ :particle => val[0,2].join(' '), :family => val[2],
|
523
571
|
:suffix => val[4][0], :given => val[4][1] }, !!val[4][0])
|
524
572
|
|
@@ -526,80 +574,76 @@ module_eval(<<'.,.,', 'parser.y', 62)
|
|
526
574
|
end
|
527
575
|
.,.,
|
528
576
|
|
529
|
-
# reduce
|
577
|
+
# reduce 21 omitted
|
530
578
|
|
531
|
-
module_eval(<<'.,.,', 'parser.y',
|
532
|
-
def
|
579
|
+
module_eval(<<'.,.,', 'parser.y', 84)
|
580
|
+
def _reduce_22(val, _values, result)
|
533
581
|
result = val.join(' ')
|
534
582
|
result
|
535
583
|
end
|
536
584
|
.,.,
|
537
585
|
|
538
|
-
module_eval(<<'.,.,', 'parser.y',
|
539
|
-
def
|
586
|
+
module_eval(<<'.,.,', 'parser.y', 85)
|
587
|
+
def _reduce_23(val, _values, result)
|
540
588
|
result = val.join(' ')
|
541
589
|
result
|
542
590
|
end
|
543
591
|
.,.,
|
544
592
|
|
545
|
-
# reduce
|
593
|
+
# reduce 24 omitted
|
546
594
|
|
547
|
-
# reduce
|
595
|
+
# reduce 25 omitted
|
548
596
|
|
549
|
-
module_eval(<<'.,.,', 'parser.y',
|
550
|
-
def
|
597
|
+
module_eval(<<'.,.,', 'parser.y', 89)
|
598
|
+
def _reduce_26(val, _values, result)
|
551
599
|
result = [nil,val[0]]
|
552
600
|
result
|
553
601
|
end
|
554
602
|
.,.,
|
555
603
|
|
556
|
-
module_eval(<<'.,.,', 'parser.y',
|
557
|
-
def
|
604
|
+
module_eval(<<'.,.,', 'parser.y', 90)
|
605
|
+
def _reduce_27(val, _values, result)
|
558
606
|
result = [val[2],val[0]]
|
559
607
|
result
|
560
608
|
end
|
561
609
|
.,.,
|
562
610
|
|
563
|
-
module_eval(<<'.,.,', 'parser.y',
|
564
|
-
def
|
611
|
+
module_eval(<<'.,.,', 'parser.y', 91)
|
612
|
+
def _reduce_28(val, _values, result)
|
565
613
|
result = [val[0],nil]
|
566
614
|
result
|
567
615
|
end
|
568
616
|
.,.,
|
569
617
|
|
570
|
-
module_eval(<<'.,.,', 'parser.y',
|
571
|
-
def
|
618
|
+
module_eval(<<'.,.,', 'parser.y', 92)
|
619
|
+
def _reduce_29(val, _values, result)
|
572
620
|
result = [val[0],val[2]]
|
573
621
|
result
|
574
622
|
end
|
575
623
|
.,.,
|
576
624
|
|
577
|
-
# reduce
|
625
|
+
# reduce 30 omitted
|
578
626
|
|
579
|
-
module_eval(<<'.,.,', 'parser.y',
|
580
|
-
def
|
627
|
+
module_eval(<<'.,.,', 'parser.y', 95)
|
628
|
+
def _reduce_31(val, _values, result)
|
581
629
|
result = val.join(' ')
|
582
630
|
result
|
583
631
|
end
|
584
632
|
.,.,
|
585
633
|
|
586
|
-
# reduce
|
634
|
+
# reduce 32 omitted
|
587
635
|
|
588
|
-
# reduce
|
636
|
+
# reduce 33 omitted
|
589
637
|
|
590
|
-
# reduce
|
638
|
+
# reduce 34 omitted
|
591
639
|
|
592
|
-
module_eval(<<'.,.,', 'parser.y',
|
593
|
-
def
|
640
|
+
module_eval(<<'.,.,', 'parser.y', 100)
|
641
|
+
def _reduce_35(val, _values, result)
|
594
642
|
result = val.join(' ')
|
595
643
|
result
|
596
644
|
end
|
597
645
|
.,.,
|
598
646
|
|
599
|
-
# reduce 34 omitted
|
600
|
-
|
601
|
-
# reduce 35 omitted
|
602
|
-
|
603
647
|
# reduce 36 omitted
|
604
648
|
|
605
649
|
# reduce 37 omitted
|
@@ -616,21 +660,25 @@ module_eval(<<'.,.,', 'parser.y', 84)
|
|
616
660
|
|
617
661
|
# reduce 43 omitted
|
618
662
|
|
619
|
-
|
620
|
-
|
663
|
+
# reduce 44 omitted
|
664
|
+
|
665
|
+
# reduce 45 omitted
|
666
|
+
|
667
|
+
module_eval(<<'.,.,', 'parser.y', 110)
|
668
|
+
def _reduce_46(val, _values, result)
|
621
669
|
result = val.join(' ')
|
622
670
|
result
|
623
671
|
end
|
624
672
|
.,.,
|
625
673
|
|
626
|
-
# reduce
|
674
|
+
# reduce 47 omitted
|
627
675
|
|
628
|
-
# reduce
|
676
|
+
# reduce 48 omitted
|
629
677
|
|
630
|
-
# reduce
|
678
|
+
# reduce 49 omitted
|
631
679
|
|
632
|
-
module_eval(<<'.,.,', 'parser.y',
|
633
|
-
def
|
680
|
+
module_eval(<<'.,.,', 'parser.y', 115)
|
681
|
+
def _reduce_50(val, _values, result)
|
634
682
|
result = val.join(' ')
|
635
683
|
result
|
636
684
|
end
|
data/lib/namae/parser.y
CHANGED
@@ -3,7 +3,7 @@
|
|
3
3
|
|
4
4
|
class Namae::Parser
|
5
5
|
|
6
|
-
token COMMA UWORD LWORD PWORD NICK AND APPELLATION TITLE SUFFIX
|
6
|
+
token COMMA UWORD LWORD PWORD NICK AND APPELLATION TITLE SUFFIX UPARTICLE
|
7
7
|
|
8
8
|
expect 0
|
9
9
|
|
@@ -20,7 +20,7 @@ rule
|
|
20
20
|
| sort_order
|
21
21
|
|
22
22
|
honorific : APPELLATION { result = Name.new(:appellation => val[0]) }
|
23
|
-
|
|
23
|
+
| titles { result = Name.new(:title => val[0]) }
|
24
24
|
|
25
25
|
display_order : u_words word opt_suffices opt_titles
|
26
26
|
{
|
@@ -43,6 +43,14 @@ rule
|
|
43
43
|
result = Name.new(:given => val[0], :particle => val[1],
|
44
44
|
:family => val[2])
|
45
45
|
}
|
46
|
+
| u_words UPARTICLE last
|
47
|
+
{
|
48
|
+
result = if include_particle_in_family?
|
49
|
+
Name.new(:given => val[0], :family => val[1,2].join(' '))
|
50
|
+
else
|
51
|
+
Name.new(:given => val[0], :particle => val[1], :family => val[2])
|
52
|
+
end
|
53
|
+
}
|
46
54
|
| von last
|
47
55
|
{
|
48
56
|
result = Name.new(:particle => val[0], :family => val[1])
|
@@ -53,6 +61,14 @@ rule
|
|
53
61
|
result = Name.new({ :family => val[0], :suffix => val[2][0],
|
54
62
|
:given => val[2][1] }, !!val[2][0])
|
55
63
|
}
|
64
|
+
| UPARTICLE last COMMA first
|
65
|
+
{
|
66
|
+
result = if include_particle_in_family?
|
67
|
+
Name.new({ :family => val[0,2].join(' '), :suffix => val[3][0], :given => val[3][1] }, !!val[3][0])
|
68
|
+
else
|
69
|
+
Name.new({ :particle => val[0], :family => val[1], :suffix => val[3][0], :given => val[3][1] }, !!val[3][0])
|
70
|
+
end
|
71
|
+
}
|
56
72
|
| von last COMMA first
|
57
73
|
{
|
58
74
|
result = Name.new({ :particle => val[0], :family => val[1],
|
@@ -107,12 +123,14 @@ require 'strscan'
|
|
107
123
|
@defaults = {
|
108
124
|
:debug => false,
|
109
125
|
:prefer_comma_as_separator => false,
|
126
|
+
:include_particle_in_family => false,
|
110
127
|
:comma => ',',
|
111
128
|
:stops => ',;',
|
112
129
|
:separator => /\s*(\band\b|\&|;)\s*/i,
|
113
130
|
:title => /\s*\b(sir|lord|count(ess)?|(gen|adm|col|maj|capt|cmdr|lt|sgt|cpl|pvt|pastor|pr|reverend|rev|elder|deacon|deaconess|father|fr|rabbi|cantor|vicar|prof|dr|md|ph\.?d)\.?)(\s+|$)/i,
|
114
131
|
:suffix => /\s*\b(JR|Jr|jr|SR|Sr|sr|[IVX]{2,})(\.|\b)/,
|
115
|
-
:appellation => /\s*\b((mrs?|ms|fr|hr)\.?|miss|herr|frau)(\s+|$)/i
|
132
|
+
:appellation => /\s*\b((mrs?|ms|fr|hr)\.?|miss|herr|frau)(\s+|$)/i,
|
133
|
+
:uppercase_particle => /\s*\b((Da|De|Di|De\sLa|Du|Der|Des|Da|St|Saint|Les|Van)\.?)(\s+|$)/
|
116
134
|
}
|
117
135
|
|
118
136
|
class << self
|
@@ -141,6 +159,10 @@ require 'strscan'
|
|
141
159
|
options[:comma]
|
142
160
|
end
|
143
161
|
|
162
|
+
def include_particle_in_family?
|
163
|
+
options[:include_particle_in_family]
|
164
|
+
end
|
165
|
+
|
144
166
|
def stops
|
145
167
|
options[:stops]
|
146
168
|
end
|
@@ -157,6 +179,10 @@ require 'strscan'
|
|
157
179
|
options[:appellation]
|
158
180
|
end
|
159
181
|
|
182
|
+
def uppercase_particle
|
183
|
+
options[:uppercase_particle]
|
184
|
+
end
|
185
|
+
|
160
186
|
def prefer_comma_as_separator?
|
161
187
|
options[:prefer_comma_as_separator]
|
162
188
|
end
|
@@ -262,6 +288,8 @@ require 'strscan'
|
|
262
288
|
else
|
263
289
|
consume_word(:UWORD, input.matched)
|
264
290
|
end
|
291
|
+
when input.scan(uppercase_particle)
|
292
|
+
consume_word(:UPARTICLE, input.matched.strip)
|
265
293
|
when input.scan(/((\\\w+)?\{[^\}]*\})*[[:upper:]][^\s#{stops}]*/)
|
266
294
|
consume_word(:UWORD, input.matched)
|
267
295
|
when input.scan(/((\\\w+)?\{[^\}]*\})*[[:lower:]][^\s#{stops}]*/)
|
data/lib/namae/version.rb
CHANGED
data/namae.gemspec
CHANGED
@@ -2,16 +2,16 @@
|
|
2
2
|
# DO NOT EDIT THIS FILE DIRECTLY
|
3
3
|
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
4
|
# -*- encoding: utf-8 -*-
|
5
|
-
# stub: namae 1.0
|
5
|
+
# stub: namae 1.1.0 ruby lib
|
6
6
|
|
7
7
|
Gem::Specification.new do |s|
|
8
8
|
s.name = "namae".freeze
|
9
|
-
s.version = "1.0
|
9
|
+
s.version = "1.1.0"
|
10
10
|
|
11
11
|
s.required_rubygems_version = Gem::Requirement.new(">= 0".freeze) if s.respond_to? :required_rubygems_version=
|
12
12
|
s.require_paths = ["lib".freeze]
|
13
13
|
s.authors = ["Sylvester Keil".freeze, "Dan Collis-Puro".freeze]
|
14
|
-
s.date = "2021-
|
14
|
+
s.date = "2021-03-12"
|
15
15
|
s.description = " Namae (\u540D\u524D) is a parser for human names. It recognizes personal names of various cultural backgrounds and tries to split them into their component parts (e.g., given and family names, honorifics etc.). ".freeze
|
16
16
|
s.email = ["sylvester@keil.or.at".freeze, "dan@collispuro.com".freeze]
|
17
17
|
s.extra_rdoc_files = [
|
data/spec/namae/parser_spec.rb
CHANGED
@@ -191,6 +191,66 @@ module Namae
|
|
191
191
|
expect(parser.parse!('Bernado Franecki Ph.D.')[0].values_at(:given, :family, :title)).to eq(['Bernado', 'Franecki', 'Ph.D.'])
|
192
192
|
#expect(parser.parse!('Bernado Franecki, Ph.D.')[0].values_at(:given, :family, :title)).to eq(['Bernado', 'Franecki', 'Ph.D.'])
|
193
193
|
end
|
194
|
+
|
195
|
+
it 'parses consecutive titles in display order' do
|
196
|
+
expect(parser.parse!('Lt. Col. Bernado Franecki')[0].values_at(:given, :family, :title)).to eq(['Bernado', 'Franecki', 'Lt. Col.'])
|
197
|
+
end
|
198
|
+
|
199
|
+
context 'when include_particle_in_family is false' do
|
200
|
+
let(:parser) { Parser.new(include_particle_in_family: false) }
|
201
|
+
|
202
|
+
it 'parses common capitalized particles as the family name in display order' do
|
203
|
+
expect(parser.parse!('Carlos De Silva')[0].values_at(:given, :family, :particle)).to eq(['Carlos', 'Silva', 'De'])
|
204
|
+
end
|
205
|
+
|
206
|
+
it 'parses common capitalized particles with punctuation as the family name in display order' do
|
207
|
+
expect(parser.parse!('Matt St. Hilaire')[0].values_at(:given, :family, :particle)).to eq(['Matt', 'Hilaire', 'St.'])
|
208
|
+
end
|
209
|
+
|
210
|
+
it 'parses common lowercase particles as a particle, not family name in display order' do
|
211
|
+
expect(parser.parse!('Carlos de Silva')[0].values_at(:given, :family, :particle)).to eq(['Carlos', 'Silva', 'de'])
|
212
|
+
end
|
213
|
+
|
214
|
+
it 'parses common capitalized particles as the family name in sort order' do
|
215
|
+
expect(parser.parse!('De Silva, Carlos')[0].values_at(:given, :family, :particle)).to eq(['Carlos', 'Silva', 'De'])
|
216
|
+
end
|
217
|
+
|
218
|
+
it 'parses common lowercase particles as a particle, not family name in sort order' do
|
219
|
+
expect(parser.parse!('de Silva, Carlos')[0].values_at(:given, :family, :particle)).to eq(['Carlos', 'Silva', 'de'])
|
220
|
+
end
|
221
|
+
|
222
|
+
it 'parses common capitalized particles with punctuation as the family name in display order' do
|
223
|
+
expect(parser.parse!('St. Hilaire, Matt')[0].values_at(:given, :family, :particle)).to eq(['Matt', 'Hilaire', 'St.'])
|
224
|
+
end
|
225
|
+
end
|
226
|
+
|
227
|
+
context 'when include_particle_in_family is true' do
|
228
|
+
let(:parser) { Parser.new(include_particle_in_family: true) }
|
229
|
+
|
230
|
+
it 'parses common capitalized particles as the family name in display order' do
|
231
|
+
expect(parser.parse!('Carlos De Silva')[0].values_at(:given, :family, :particle)).to eq(['Carlos', 'De Silva', nil])
|
232
|
+
end
|
233
|
+
|
234
|
+
it 'parses common capitalized particles with punctuation as the family name in display order' do
|
235
|
+
expect(parser.parse!('Matt St. Hilaire')[0].values_at(:given, :family, :particle)).to eq(['Matt', 'St. Hilaire', nil])
|
236
|
+
end
|
237
|
+
|
238
|
+
it 'parses common lowercase particles as a particle, not family name in display order' do
|
239
|
+
expect(parser.parse!('Carlos de Silva')[0].values_at(:given, :family, :particle)).to eq(['Carlos', 'Silva', 'de'])
|
240
|
+
end
|
241
|
+
|
242
|
+
it 'parses common capitalized particles as the family name in sort order' do
|
243
|
+
expect(parser.parse!('De Silva, Carlos')[0].values_at(:given, :family, :particle)).to eq(['Carlos', 'De Silva', nil])
|
244
|
+
end
|
245
|
+
|
246
|
+
it 'parses common lowercase particles as a particle, not family name in sort order' do
|
247
|
+
expect(parser.parse!('de Silva, Carlos')[0].values_at(:given, :family, :particle)).to eq(['Carlos', 'Silva', 'de'])
|
248
|
+
end
|
249
|
+
|
250
|
+
it 'parses common capitalized particles with punctuation as the family name in display order' do
|
251
|
+
expect(parser.parse!('St. Hilaire, Matt')[0].values_at(:given, :family, :particle)).to eq(['Matt', 'St. Hilaire', nil])
|
252
|
+
end
|
253
|
+
end
|
194
254
|
end
|
195
255
|
end
|
196
256
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: namae
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0
|
4
|
+
version: 1.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sylvester Keil
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2021-
|
12
|
+
date: 2021-03-12 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: racc
|