namae 1.0.2 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/features/lists.feature +4 -2
- data/features/step_definitions/namae_steps.rb +5 -0
- data/lib/namae/parser.rb +197 -149
- data/lib/namae/parser.y +31 -3
- data/lib/namae/version.rb +2 -2
- data/namae.gemspec +3 -3
- data/spec/namae/parser_spec.rb +60 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 82fa8955c4f650ccbcb6bf67db18f005eafd2f1f09252b8d98203a6f04949ed2
|
4
|
+
data.tar.gz: c96965c52193db381f8fceb0e8bfc34453f62127c846b6cb593e804c25394908
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 14644528eb8d587a2fd0064fbddfa6cfe8925d7a1b9b3f6a75b33c599f3bbb9a43b53e951e4ec8fab9506c5edc2ef62abff77fc1f43cca1ce3489c5ab4c36f8f
|
7
|
+
data.tar.gz: 921b0c60e964b8e5f6154008ab8923d59a2df5523484e38618f3ddddbd89eb2d0ad99ffdbbc9d90629cced3051e38fffb05617657338018afd8913cefc6e891c
|
data/features/lists.feature
CHANGED
@@ -115,7 +115,8 @@ Feature: Parse a list of names
|
|
115
115
|
| B | Malcom |
|
116
116
|
|
117
117
|
Scenario: A list of names with particles separated by commas
|
118
|
-
Given
|
118
|
+
Given I want to include particles in the family name
|
119
|
+
And a parser that prefers commas as separators
|
119
120
|
When I parse the names "Di Proctor, M., von Cooper, P."
|
120
121
|
Then the names should be:
|
121
122
|
| given | family |
|
@@ -128,7 +129,8 @@ Feature: Parse a list of names
|
|
128
129
|
| P | Cooper |
|
129
130
|
|
130
131
|
Scenario: A list of names with two consecutive accented characters
|
131
|
-
Given
|
132
|
+
Given I want to include particles in the family name
|
133
|
+
And a parser that prefers commas as separators
|
132
134
|
When I parse the names "Çakıroğlu, Ü., Başıbüyük, B."
|
133
135
|
Then the names should be:
|
134
136
|
| given | family |
|
@@ -2,6 +2,11 @@ Given /^a parser that prefers commas as separators$/ do
|
|
2
2
|
Namae::Parser.instance.options[:prefer_comma_as_separator] = true
|
3
3
|
end
|
4
4
|
|
5
|
+
Given /^I want to include particles in the family name$/ do
|
6
|
+
Namae::Parser.instance.options[:include_particle_in_family] = true
|
7
|
+
end
|
8
|
+
|
9
|
+
|
5
10
|
When /^I parse the name "(.*)"$/ do |string|
|
6
11
|
@name = Namae.parse!(string)[0]
|
7
12
|
end
|
data/lib/namae/parser.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
#
|
2
2
|
# DO NOT MODIFY!!!!
|
3
|
-
# This file is automatically generated by Racc 1.
|
3
|
+
# This file is automatically generated by Racc 1.5.2
|
4
4
|
# from Racc grammar file "".
|
5
5
|
#
|
6
6
|
|
@@ -11,17 +11,19 @@ require 'strscan'
|
|
11
11
|
module Namae
|
12
12
|
class Parser < Racc::Parser
|
13
13
|
|
14
|
-
module_eval(<<'...end parser.y/module_eval...', 'parser.y',
|
14
|
+
module_eval(<<'...end parser.y/module_eval...', 'parser.y', 122)
|
15
15
|
|
16
16
|
@defaults = {
|
17
17
|
:debug => false,
|
18
18
|
:prefer_comma_as_separator => false,
|
19
|
+
:include_particle_in_family => false,
|
19
20
|
:comma => ',',
|
20
21
|
:stops => ',;',
|
21
22
|
:separator => /\s*(\band\b|\&|;)\s*/i,
|
22
23
|
:title => /\s*\b(sir|lord|count(ess)?|(gen|adm|col|maj|capt|cmdr|lt|sgt|cpl|pvt|pastor|pr|reverend|rev|elder|deacon|deaconess|father|fr|rabbi|cantor|vicar|prof|dr|md|ph\.?d)\.?)(\s+|$)/i,
|
23
24
|
:suffix => /\s*\b(JR|Jr|jr|SR|Sr|sr|[IVX]{2,})(\.|\b)/,
|
24
|
-
:appellation => /\s*\b((mrs?|ms|fr|hr)\.?|miss|herr|frau)(\s+|$)/i
|
25
|
+
:appellation => /\s*\b((mrs?|ms|fr|hr)\.?|miss|herr|frau)(\s+|$)/i,
|
26
|
+
:uppercase_particle => /\s*\b((Da|De|Di|De\sLa|Du|Der|Des|Da|St|Saint|Les|Van)\.?)(\s+|$)/
|
25
27
|
}
|
26
28
|
|
27
29
|
class << self
|
@@ -50,6 +52,10 @@ module_eval(<<'...end parser.y/module_eval...', 'parser.y', 106)
|
|
50
52
|
options[:comma]
|
51
53
|
end
|
52
54
|
|
55
|
+
def include_particle_in_family?
|
56
|
+
options[:include_particle_in_family]
|
57
|
+
end
|
58
|
+
|
53
59
|
def stops
|
54
60
|
options[:stops]
|
55
61
|
end
|
@@ -66,6 +72,10 @@ module_eval(<<'...end parser.y/module_eval...', 'parser.y', 106)
|
|
66
72
|
options[:appellation]
|
67
73
|
end
|
68
74
|
|
75
|
+
def uppercase_particle
|
76
|
+
options[:uppercase_particle]
|
77
|
+
end
|
78
|
+
|
69
79
|
def prefer_comma_as_separator?
|
70
80
|
options[:prefer_comma_as_separator]
|
71
81
|
end
|
@@ -171,6 +181,8 @@ module_eval(<<'...end parser.y/module_eval...', 'parser.y', 106)
|
|
171
181
|
else
|
172
182
|
consume_word(:UWORD, input.matched)
|
173
183
|
end
|
184
|
+
when input.scan(uppercase_particle)
|
185
|
+
consume_word(:UPARTICLE, input.matched.strip)
|
174
186
|
when input.scan(/((\\\w+)?\{[^\}]*\})*[[:upper:]][^\s#{stops}]*/)
|
175
187
|
consume_word(:UWORD, input.matched)
|
176
188
|
when input.scan(/((\\\w+)?\{[^\}]*\})*[[:lower:]][^\s#{stops}]*/)
|
@@ -195,133 +207,143 @@ module_eval(<<'...end parser.y/module_eval...', 'parser.y', 106)
|
|
195
207
|
##### State transition tables begin ###
|
196
208
|
|
197
209
|
racc_action_table = [
|
198
|
-
-
|
199
|
-
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
210
|
+
-41, 18, 25, 34, -42, 35, 36, -41, 19, -41,
|
211
|
+
-41, -42, 40, -42, -42, 15, 13, 16, 46, 52,
|
212
|
+
7, 17, 62, 12, 15, 24, 16, 27, 15, 13,
|
213
|
+
16, 17, 29, 7, 17, 66, 12, 15, 24, 16,
|
214
|
+
27, 73, 60, 59, 61, 29, 74, 46, -40, -36,
|
215
|
+
-24, 60, 59, 61, 66, -40, 69, 25, 46, 60,
|
216
|
+
59, 61, 60, 59, 61, 17, 46, 46, 46, 46,
|
217
|
+
60, 59, 61, 15, 24, 16, 17, 46, 34, 32,
|
218
|
+
35, 34, 38, 35, 34, 32, 35, -21, -21, -21,
|
219
|
+
34, 49, 35, 34, 32, 35, 34, 38, 35, -22,
|
220
|
+
-22, -22, 34, 53, 35, 34, 32, 35, 34, 32,
|
221
|
+
35, -21, -21, -21, 60, 59, 61, 60, 59, 61,
|
222
|
+
66 ]
|
209
223
|
|
210
224
|
racc_action_check = [
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
225
|
+
15, 1, 8, 39, 16, 39, 11, 15, 1, 15,
|
226
|
+
15, 16, 18, 16, 16, 0, 0, 0, 26, 31,
|
227
|
+
0, 0, 37, 0, 9, 9, 9, 9, 19, 19,
|
228
|
+
19, 44, 9, 19, 19, 45, 19, 22, 22, 22,
|
229
|
+
22, 56, 56, 56, 56, 22, 57, 47, 13, 56,
|
230
|
+
13, 36, 36, 36, 57, 13, 50, 65, 36, 52,
|
231
|
+
52, 52, 62, 62, 62, 67, 52, 68, 71, 62,
|
232
|
+
69, 69, 69, 5, 5, 5, 77, 69, 10, 10,
|
233
|
+
10, 12, 12, 12, 23, 23, 23, 24, 24, 24,
|
234
|
+
27, 27, 27, 28, 28, 28, 29, 29, 29, 32,
|
235
|
+
32, 32, 33, 33, 33, 42, 42, 42, 48, 48,
|
236
|
+
48, 49, 49, 49, 74, 74, 74, 80, 80, 80,
|
237
|
+
79 ]
|
222
238
|
|
223
239
|
racc_action_pointer = [
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
nil,
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
240
|
+
12, 1, nil, nil, nil, 70, nil, nil, -7, 21,
|
241
|
+
75, 4, 78, 48, nil, 0, 4, nil, 12, 25,
|
242
|
+
nil, nil, 34, 81, 84, nil, 8, 87, 90, 93,
|
243
|
+
nil, 17, 96, 99, nil, nil, 48, 20, nil, 0,
|
244
|
+
nil, nil, 102, nil, 22, 25, nil, 37, 105, 108,
|
245
|
+
54, nil, 56, nil, nil, nil, 39, 44, nil, nil,
|
246
|
+
nil, nil, 59, nil, nil, 48, nil, 56, 57, 67,
|
247
|
+
nil, 58, nil, nil, 111, nil, nil, 67, nil, 110,
|
248
|
+
114, nil ]
|
232
249
|
|
233
250
|
racc_action_default = [
|
234
|
-
-1, -
|
235
|
-
-
|
236
|
-
|
237
|
-
-
|
238
|
-
|
239
|
-
-
|
240
|
-
-
|
241
|
-
-
|
251
|
+
-1, -51, -2, -4, -5, -51, -8, -9, -10, -25,
|
252
|
+
-51, -51, -51, -21, -30, -32, -33, -49, -51, -51,
|
253
|
+
-6, -7, -51, -51, -40, -50, -43, -51, -51, -51,
|
254
|
+
-31, -16, -24, -25, -32, -33, -38, -51, -24, -25,
|
255
|
+
82, -3, -51, -16, -47, -44, -45, -43, -51, -24,
|
256
|
+
-14, -15, -38, -23, -17, -26, -39, -28, -34, -40,
|
257
|
+
-41, -42, -38, -14, -11, -48, -46, -47, -43, -38,
|
258
|
+
-19, -51, -35, -37, -51, -18, -12, -47, -20, -27,
|
259
|
+
-29, -13 ]
|
242
260
|
|
243
261
|
racc_goto_table = [
|
244
|
-
3,
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
262
|
+
3, 1, 45, 44, 54, 20, 64, 21, 31, 26,
|
263
|
+
37, 23, 80, 2, 71, 28, 8, nil, 30, 3,
|
264
|
+
70, 43, 26, 45, 67, 47, 50, 51, 42, 76,
|
265
|
+
75, 30, 41, 48, nil, 8, nil, 78, 9, 81,
|
266
|
+
63, nil, 30, 22, 45, 77, 68, 79, 30, nil,
|
267
|
+
39, nil, nil, nil, nil, nil, 72, 9, nil, nil,
|
268
|
+
nil, nil, nil, nil, nil, 39, nil, 39, nil, nil,
|
250
269
|
nil, nil, nil, nil, nil, nil, nil, nil, nil, nil,
|
251
|
-
|
270
|
+
72 ]
|
252
271
|
|
253
272
|
racc_goto_check = [
|
254
|
-
3,
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
273
|
+
3, 1, 17, 9, 13, 3, 10, 4, 11, 3,
|
274
|
+
11, 12, 15, 2, 16, 12, 7, nil, 18, 3,
|
275
|
+
13, 11, 3, 17, 9, 11, 11, 11, 12, 10,
|
276
|
+
13, 18, 2, 12, nil, 7, nil, 13, 8, 10,
|
277
|
+
11, nil, 18, 8, 17, 9, 11, 17, 18, nil,
|
278
|
+
8, nil, nil, nil, nil, nil, 3, 8, nil, nil,
|
279
|
+
nil, nil, nil, nil, nil, 8, nil, 8, nil, nil,
|
259
280
|
nil, nil, nil, nil, nil, nil, nil, nil, nil, nil,
|
260
|
-
|
261
|
-
nil, nil, nil, 3 ]
|
281
|
+
3 ]
|
262
282
|
|
263
283
|
racc_goto_pointer = [
|
264
|
-
nil,
|
265
|
-
|
284
|
+
nil, 1, 13, 0, 2, nil, nil, 16, 38, -23,
|
285
|
+
-38, -2, 6, -32, nil, -62, -42, -24, 9 ]
|
266
286
|
|
267
287
|
racc_goto_default = [
|
268
|
-
nil, nil, nil,
|
269
|
-
11, 10, nil,
|
288
|
+
nil, nil, nil, 58, 4, 5, 6, 65, 33, nil,
|
289
|
+
nil, 11, 10, nil, 55, 56, nil, 57, 14 ]
|
270
290
|
|
271
291
|
racc_reduce_table = [
|
272
292
|
0, 0, :racc_error,
|
273
|
-
0,
|
274
|
-
1,
|
275
|
-
3,
|
276
|
-
1,
|
277
|
-
1, 13, :_reduce_none,
|
278
|
-
2, 13, :_reduce_6,
|
279
|
-
2, 13, :_reduce_7,
|
280
|
-
1, 13, :_reduce_none,
|
281
|
-
1, 16, :_reduce_9,
|
282
|
-
1, 16, :_reduce_10,
|
283
|
-
4, 15, :_reduce_11,
|
284
|
-
5, 15, :_reduce_12,
|
285
|
-
6, 15, :_reduce_13,
|
286
|
-
3, 15, :_reduce_14,
|
287
|
-
2, 15, :_reduce_15,
|
288
|
-
3, 17, :_reduce_16,
|
289
|
-
4, 17, :_reduce_17,
|
290
|
-
5, 17, :_reduce_18,
|
291
|
-
1, 22, :_reduce_none,
|
292
|
-
2, 22, :_reduce_20,
|
293
|
-
3, 22, :_reduce_21,
|
294
|
-
1, 21, :_reduce_none,
|
295
|
-
1, 21, :_reduce_none,
|
296
|
-
1, 23, :_reduce_24,
|
297
|
-
3, 23, :_reduce_25,
|
298
|
-
1, 23, :_reduce_26,
|
299
|
-
3, 23, :_reduce_27,
|
300
|
-
1, 18, :_reduce_none,
|
301
|
-
2, 18, :_reduce_29,
|
302
|
-
1, 28, :_reduce_none,
|
303
|
-
1, 28, :_reduce_none,
|
304
|
-
1, 25, :_reduce_none,
|
305
|
-
2, 25, :_reduce_33,
|
306
|
-
0, 26, :_reduce_none,
|
307
|
-
1, 26, :_reduce_none,
|
308
|
-
0, 24, :_reduce_none,
|
309
|
-
1, 24, :_reduce_none,
|
310
|
-
1, 14, :_reduce_none,
|
293
|
+
0, 13, :_reduce_1,
|
294
|
+
1, 13, :_reduce_2,
|
295
|
+
3, 13, :_reduce_3,
|
296
|
+
1, 14, :_reduce_4,
|
311
297
|
1, 14, :_reduce_none,
|
298
|
+
2, 14, :_reduce_6,
|
299
|
+
2, 14, :_reduce_7,
|
312
300
|
1, 14, :_reduce_none,
|
313
|
-
|
314
|
-
1,
|
315
|
-
|
316
|
-
|
317
|
-
|
301
|
+
1, 17, :_reduce_9,
|
302
|
+
1, 17, :_reduce_10,
|
303
|
+
4, 16, :_reduce_11,
|
304
|
+
5, 16, :_reduce_12,
|
305
|
+
6, 16, :_reduce_13,
|
306
|
+
3, 16, :_reduce_14,
|
307
|
+
3, 16, :_reduce_15,
|
308
|
+
2, 16, :_reduce_16,
|
309
|
+
3, 18, :_reduce_17,
|
310
|
+
4, 18, :_reduce_18,
|
311
|
+
4, 18, :_reduce_19,
|
312
|
+
5, 18, :_reduce_20,
|
313
|
+
1, 24, :_reduce_none,
|
314
|
+
2, 24, :_reduce_22,
|
315
|
+
3, 24, :_reduce_23,
|
316
|
+
1, 23, :_reduce_none,
|
317
|
+
1, 23, :_reduce_none,
|
318
|
+
1, 25, :_reduce_26,
|
319
|
+
3, 25, :_reduce_27,
|
320
|
+
1, 25, :_reduce_28,
|
321
|
+
3, 25, :_reduce_29,
|
318
322
|
1, 20, :_reduce_none,
|
323
|
+
2, 20, :_reduce_31,
|
324
|
+
1, 30, :_reduce_none,
|
325
|
+
1, 30, :_reduce_none,
|
326
|
+
1, 27, :_reduce_none,
|
327
|
+
2, 27, :_reduce_35,
|
328
|
+
0, 28, :_reduce_none,
|
329
|
+
1, 28, :_reduce_none,
|
330
|
+
0, 26, :_reduce_none,
|
331
|
+
1, 26, :_reduce_none,
|
332
|
+
1, 15, :_reduce_none,
|
333
|
+
1, 15, :_reduce_none,
|
334
|
+
1, 15, :_reduce_none,
|
335
|
+
0, 21, :_reduce_none,
|
336
|
+
1, 21, :_reduce_none,
|
319
337
|
1, 29, :_reduce_none,
|
320
|
-
2, 29, :
|
338
|
+
2, 29, :_reduce_46,
|
339
|
+
0, 22, :_reduce_none,
|
340
|
+
1, 22, :_reduce_none,
|
341
|
+
1, 19, :_reduce_none,
|
342
|
+
2, 19, :_reduce_50 ]
|
321
343
|
|
322
|
-
racc_reduce_n =
|
344
|
+
racc_reduce_n = 51
|
323
345
|
|
324
|
-
racc_shift_n =
|
346
|
+
racc_shift_n = 82
|
325
347
|
|
326
348
|
racc_token_table = {
|
327
349
|
false => 0,
|
@@ -334,9 +356,10 @@ racc_token_table = {
|
|
334
356
|
:AND => 7,
|
335
357
|
:APPELLATION => 8,
|
336
358
|
:TITLE => 9,
|
337
|
-
:SUFFIX => 10
|
359
|
+
:SUFFIX => 10,
|
360
|
+
:UPARTICLE => 11 }
|
338
361
|
|
339
|
-
racc_nt_base =
|
362
|
+
racc_nt_base = 12
|
340
363
|
|
341
364
|
racc_use_result_var = true
|
342
365
|
|
@@ -368,6 +391,7 @@ Racc_token_to_s_table = [
|
|
368
391
|
"APPELLATION",
|
369
392
|
"TITLE",
|
370
393
|
"SUFFIX",
|
394
|
+
"UPARTICLE",
|
371
395
|
"$start",
|
372
396
|
"names",
|
373
397
|
"name",
|
@@ -375,6 +399,7 @@ Racc_token_to_s_table = [
|
|
375
399
|
"display_order",
|
376
400
|
"honorific",
|
377
401
|
"sort_order",
|
402
|
+
"titles",
|
378
403
|
"u_words",
|
379
404
|
"opt_suffices",
|
380
405
|
"opt_titles",
|
@@ -385,8 +410,7 @@ Racc_token_to_s_table = [
|
|
385
410
|
"words",
|
386
411
|
"opt_comma",
|
387
412
|
"suffices",
|
388
|
-
"u_word"
|
389
|
-
"titles" ]
|
413
|
+
"u_word" ]
|
390
414
|
|
391
415
|
Racc_debug_parser = false
|
392
416
|
|
@@ -493,14 +517,26 @@ module_eval(<<'.,.,', 'parser.y', 42)
|
|
493
517
|
|
494
518
|
module_eval(<<'.,.,', 'parser.y', 47)
|
495
519
|
def _reduce_15(val, _values, result)
|
496
|
-
|
520
|
+
result = if include_particle_in_family?
|
521
|
+
Name.new(:given => val[0], :family => val[1,2].join(' '))
|
522
|
+
else
|
523
|
+
Name.new(:given => val[0], :particle => val[1], :family => val[2])
|
524
|
+
end
|
497
525
|
|
498
526
|
result
|
499
527
|
end
|
500
528
|
.,.,
|
501
529
|
|
502
|
-
module_eval(<<'.,.,', 'parser.y',
|
530
|
+
module_eval(<<'.,.,', 'parser.y', 55)
|
503
531
|
def _reduce_16(val, _values, result)
|
532
|
+
result = Name.new(:particle => val[0], :family => val[1])
|
533
|
+
|
534
|
+
result
|
535
|
+
end
|
536
|
+
.,.,
|
537
|
+
|
538
|
+
module_eval(<<'.,.,', 'parser.y', 60)
|
539
|
+
def _reduce_17(val, _values, result)
|
504
540
|
result = Name.new({ :family => val[0], :suffix => val[2][0],
|
505
541
|
:given => val[2][1] }, !!val[2][0])
|
506
542
|
|
@@ -508,8 +544,20 @@ module_eval(<<'.,.,', 'parser.y', 52)
|
|
508
544
|
end
|
509
545
|
.,.,
|
510
546
|
|
511
|
-
module_eval(<<'.,.,', 'parser.y',
|
512
|
-
def
|
547
|
+
module_eval(<<'.,.,', 'parser.y', 65)
|
548
|
+
def _reduce_18(val, _values, result)
|
549
|
+
result = if include_particle_in_family?
|
550
|
+
Name.new({ :family => val[0,2].join(' '), :suffix => val[3][0], :given => val[3][1] }, !!val[3][0])
|
551
|
+
else
|
552
|
+
Name.new({ :particle => val[0], :family => val[1], :suffix => val[3][0], :given => val[3][1] }, !!val[3][0])
|
553
|
+
end
|
554
|
+
|
555
|
+
result
|
556
|
+
end
|
557
|
+
.,.,
|
558
|
+
|
559
|
+
module_eval(<<'.,.,', 'parser.y', 73)
|
560
|
+
def _reduce_19(val, _values, result)
|
513
561
|
result = Name.new({ :particle => val[0], :family => val[1],
|
514
562
|
:suffix => val[3][0], :given => val[3][1] }, !!val[3][0])
|
515
563
|
|
@@ -517,8 +565,8 @@ module_eval(<<'.,.,', 'parser.y', 57)
|
|
517
565
|
end
|
518
566
|
.,.,
|
519
567
|
|
520
|
-
module_eval(<<'.,.,', 'parser.y',
|
521
|
-
def
|
568
|
+
module_eval(<<'.,.,', 'parser.y', 78)
|
569
|
+
def _reduce_20(val, _values, result)
|
522
570
|
result = Name.new({ :particle => val[0,2].join(' '), :family => val[2],
|
523
571
|
:suffix => val[4][0], :given => val[4][1] }, !!val[4][0])
|
524
572
|
|
@@ -526,80 +574,76 @@ module_eval(<<'.,.,', 'parser.y', 62)
|
|
526
574
|
end
|
527
575
|
.,.,
|
528
576
|
|
529
|
-
# reduce
|
577
|
+
# reduce 21 omitted
|
530
578
|
|
531
|
-
module_eval(<<'.,.,', 'parser.y',
|
532
|
-
def
|
579
|
+
module_eval(<<'.,.,', 'parser.y', 84)
|
580
|
+
def _reduce_22(val, _values, result)
|
533
581
|
result = val.join(' ')
|
534
582
|
result
|
535
583
|
end
|
536
584
|
.,.,
|
537
585
|
|
538
|
-
module_eval(<<'.,.,', 'parser.y',
|
539
|
-
def
|
586
|
+
module_eval(<<'.,.,', 'parser.y', 85)
|
587
|
+
def _reduce_23(val, _values, result)
|
540
588
|
result = val.join(' ')
|
541
589
|
result
|
542
590
|
end
|
543
591
|
.,.,
|
544
592
|
|
545
|
-
# reduce
|
593
|
+
# reduce 24 omitted
|
546
594
|
|
547
|
-
# reduce
|
595
|
+
# reduce 25 omitted
|
548
596
|
|
549
|
-
module_eval(<<'.,.,', 'parser.y',
|
550
|
-
def
|
597
|
+
module_eval(<<'.,.,', 'parser.y', 89)
|
598
|
+
def _reduce_26(val, _values, result)
|
551
599
|
result = [nil,val[0]]
|
552
600
|
result
|
553
601
|
end
|
554
602
|
.,.,
|
555
603
|
|
556
|
-
module_eval(<<'.,.,', 'parser.y',
|
557
|
-
def
|
604
|
+
module_eval(<<'.,.,', 'parser.y', 90)
|
605
|
+
def _reduce_27(val, _values, result)
|
558
606
|
result = [val[2],val[0]]
|
559
607
|
result
|
560
608
|
end
|
561
609
|
.,.,
|
562
610
|
|
563
|
-
module_eval(<<'.,.,', 'parser.y',
|
564
|
-
def
|
611
|
+
module_eval(<<'.,.,', 'parser.y', 91)
|
612
|
+
def _reduce_28(val, _values, result)
|
565
613
|
result = [val[0],nil]
|
566
614
|
result
|
567
615
|
end
|
568
616
|
.,.,
|
569
617
|
|
570
|
-
module_eval(<<'.,.,', 'parser.y',
|
571
|
-
def
|
618
|
+
module_eval(<<'.,.,', 'parser.y', 92)
|
619
|
+
def _reduce_29(val, _values, result)
|
572
620
|
result = [val[0],val[2]]
|
573
621
|
result
|
574
622
|
end
|
575
623
|
.,.,
|
576
624
|
|
577
|
-
# reduce
|
625
|
+
# reduce 30 omitted
|
578
626
|
|
579
|
-
module_eval(<<'.,.,', 'parser.y',
|
580
|
-
def
|
627
|
+
module_eval(<<'.,.,', 'parser.y', 95)
|
628
|
+
def _reduce_31(val, _values, result)
|
581
629
|
result = val.join(' ')
|
582
630
|
result
|
583
631
|
end
|
584
632
|
.,.,
|
585
633
|
|
586
|
-
# reduce
|
634
|
+
# reduce 32 omitted
|
587
635
|
|
588
|
-
# reduce
|
636
|
+
# reduce 33 omitted
|
589
637
|
|
590
|
-
# reduce
|
638
|
+
# reduce 34 omitted
|
591
639
|
|
592
|
-
module_eval(<<'.,.,', 'parser.y',
|
593
|
-
def
|
640
|
+
module_eval(<<'.,.,', 'parser.y', 100)
|
641
|
+
def _reduce_35(val, _values, result)
|
594
642
|
result = val.join(' ')
|
595
643
|
result
|
596
644
|
end
|
597
645
|
.,.,
|
598
646
|
|
599
|
-
# reduce 34 omitted
|
600
|
-
|
601
|
-
# reduce 35 omitted
|
602
|
-
|
603
647
|
# reduce 36 omitted
|
604
648
|
|
605
649
|
# reduce 37 omitted
|
@@ -616,21 +660,25 @@ module_eval(<<'.,.,', 'parser.y', 84)
|
|
616
660
|
|
617
661
|
# reduce 43 omitted
|
618
662
|
|
619
|
-
|
620
|
-
|
663
|
+
# reduce 44 omitted
|
664
|
+
|
665
|
+
# reduce 45 omitted
|
666
|
+
|
667
|
+
module_eval(<<'.,.,', 'parser.y', 110)
|
668
|
+
def _reduce_46(val, _values, result)
|
621
669
|
result = val.join(' ')
|
622
670
|
result
|
623
671
|
end
|
624
672
|
.,.,
|
625
673
|
|
626
|
-
# reduce
|
674
|
+
# reduce 47 omitted
|
627
675
|
|
628
|
-
# reduce
|
676
|
+
# reduce 48 omitted
|
629
677
|
|
630
|
-
# reduce
|
678
|
+
# reduce 49 omitted
|
631
679
|
|
632
|
-
module_eval(<<'.,.,', 'parser.y',
|
633
|
-
def
|
680
|
+
module_eval(<<'.,.,', 'parser.y', 115)
|
681
|
+
def _reduce_50(val, _values, result)
|
634
682
|
result = val.join(' ')
|
635
683
|
result
|
636
684
|
end
|
data/lib/namae/parser.y
CHANGED
@@ -3,7 +3,7 @@
|
|
3
3
|
|
4
4
|
class Namae::Parser
|
5
5
|
|
6
|
-
token COMMA UWORD LWORD PWORD NICK AND APPELLATION TITLE SUFFIX
|
6
|
+
token COMMA UWORD LWORD PWORD NICK AND APPELLATION TITLE SUFFIX UPARTICLE
|
7
7
|
|
8
8
|
expect 0
|
9
9
|
|
@@ -20,7 +20,7 @@ rule
|
|
20
20
|
| sort_order
|
21
21
|
|
22
22
|
honorific : APPELLATION { result = Name.new(:appellation => val[0]) }
|
23
|
-
|
|
23
|
+
| titles { result = Name.new(:title => val[0]) }
|
24
24
|
|
25
25
|
display_order : u_words word opt_suffices opt_titles
|
26
26
|
{
|
@@ -43,6 +43,14 @@ rule
|
|
43
43
|
result = Name.new(:given => val[0], :particle => val[1],
|
44
44
|
:family => val[2])
|
45
45
|
}
|
46
|
+
| u_words UPARTICLE last
|
47
|
+
{
|
48
|
+
result = if include_particle_in_family?
|
49
|
+
Name.new(:given => val[0], :family => val[1,2].join(' '))
|
50
|
+
else
|
51
|
+
Name.new(:given => val[0], :particle => val[1], :family => val[2])
|
52
|
+
end
|
53
|
+
}
|
46
54
|
| von last
|
47
55
|
{
|
48
56
|
result = Name.new(:particle => val[0], :family => val[1])
|
@@ -53,6 +61,14 @@ rule
|
|
53
61
|
result = Name.new({ :family => val[0], :suffix => val[2][0],
|
54
62
|
:given => val[2][1] }, !!val[2][0])
|
55
63
|
}
|
64
|
+
| UPARTICLE last COMMA first
|
65
|
+
{
|
66
|
+
result = if include_particle_in_family?
|
67
|
+
Name.new({ :family => val[0,2].join(' '), :suffix => val[3][0], :given => val[3][1] }, !!val[3][0])
|
68
|
+
else
|
69
|
+
Name.new({ :particle => val[0], :family => val[1], :suffix => val[3][0], :given => val[3][1] }, !!val[3][0])
|
70
|
+
end
|
71
|
+
}
|
56
72
|
| von last COMMA first
|
57
73
|
{
|
58
74
|
result = Name.new({ :particle => val[0], :family => val[1],
|
@@ -107,12 +123,14 @@ require 'strscan'
|
|
107
123
|
@defaults = {
|
108
124
|
:debug => false,
|
109
125
|
:prefer_comma_as_separator => false,
|
126
|
+
:include_particle_in_family => false,
|
110
127
|
:comma => ',',
|
111
128
|
:stops => ',;',
|
112
129
|
:separator => /\s*(\band\b|\&|;)\s*/i,
|
113
130
|
:title => /\s*\b(sir|lord|count(ess)?|(gen|adm|col|maj|capt|cmdr|lt|sgt|cpl|pvt|pastor|pr|reverend|rev|elder|deacon|deaconess|father|fr|rabbi|cantor|vicar|prof|dr|md|ph\.?d)\.?)(\s+|$)/i,
|
114
131
|
:suffix => /\s*\b(JR|Jr|jr|SR|Sr|sr|[IVX]{2,})(\.|\b)/,
|
115
|
-
:appellation => /\s*\b((mrs?|ms|fr|hr)\.?|miss|herr|frau)(\s+|$)/i
|
132
|
+
:appellation => /\s*\b((mrs?|ms|fr|hr)\.?|miss|herr|frau)(\s+|$)/i,
|
133
|
+
:uppercase_particle => /\s*\b((Da|De|Di|De\sLa|Du|Der|Des|Da|St|Saint|Les|Van)\.?)(\s+|$)/
|
116
134
|
}
|
117
135
|
|
118
136
|
class << self
|
@@ -141,6 +159,10 @@ require 'strscan'
|
|
141
159
|
options[:comma]
|
142
160
|
end
|
143
161
|
|
162
|
+
def include_particle_in_family?
|
163
|
+
options[:include_particle_in_family]
|
164
|
+
end
|
165
|
+
|
144
166
|
def stops
|
145
167
|
options[:stops]
|
146
168
|
end
|
@@ -157,6 +179,10 @@ require 'strscan'
|
|
157
179
|
options[:appellation]
|
158
180
|
end
|
159
181
|
|
182
|
+
def uppercase_particle
|
183
|
+
options[:uppercase_particle]
|
184
|
+
end
|
185
|
+
|
160
186
|
def prefer_comma_as_separator?
|
161
187
|
options[:prefer_comma_as_separator]
|
162
188
|
end
|
@@ -262,6 +288,8 @@ require 'strscan'
|
|
262
288
|
else
|
263
289
|
consume_word(:UWORD, input.matched)
|
264
290
|
end
|
291
|
+
when input.scan(uppercase_particle)
|
292
|
+
consume_word(:UPARTICLE, input.matched.strip)
|
265
293
|
when input.scan(/((\\\w+)?\{[^\}]*\})*[[:upper:]][^\s#{stops}]*/)
|
266
294
|
consume_word(:UWORD, input.matched)
|
267
295
|
when input.scan(/((\\\w+)?\{[^\}]*\})*[[:lower:]][^\s#{stops}]*/)
|
data/lib/namae/version.rb
CHANGED
data/namae.gemspec
CHANGED
@@ -2,16 +2,16 @@
|
|
2
2
|
# DO NOT EDIT THIS FILE DIRECTLY
|
3
3
|
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
4
|
# -*- encoding: utf-8 -*-
|
5
|
-
# stub: namae 1.0
|
5
|
+
# stub: namae 1.1.0 ruby lib
|
6
6
|
|
7
7
|
Gem::Specification.new do |s|
|
8
8
|
s.name = "namae".freeze
|
9
|
-
s.version = "1.0
|
9
|
+
s.version = "1.1.0"
|
10
10
|
|
11
11
|
s.required_rubygems_version = Gem::Requirement.new(">= 0".freeze) if s.respond_to? :required_rubygems_version=
|
12
12
|
s.require_paths = ["lib".freeze]
|
13
13
|
s.authors = ["Sylvester Keil".freeze, "Dan Collis-Puro".freeze]
|
14
|
-
s.date = "2021-
|
14
|
+
s.date = "2021-03-12"
|
15
15
|
s.description = " Namae (\u540D\u524D) is a parser for human names. It recognizes personal names of various cultural backgrounds and tries to split them into their component parts (e.g., given and family names, honorifics etc.). ".freeze
|
16
16
|
s.email = ["sylvester@keil.or.at".freeze, "dan@collispuro.com".freeze]
|
17
17
|
s.extra_rdoc_files = [
|
data/spec/namae/parser_spec.rb
CHANGED
@@ -191,6 +191,66 @@ module Namae
|
|
191
191
|
expect(parser.parse!('Bernado Franecki Ph.D.')[0].values_at(:given, :family, :title)).to eq(['Bernado', 'Franecki', 'Ph.D.'])
|
192
192
|
#expect(parser.parse!('Bernado Franecki, Ph.D.')[0].values_at(:given, :family, :title)).to eq(['Bernado', 'Franecki', 'Ph.D.'])
|
193
193
|
end
|
194
|
+
|
195
|
+
it 'parses consecutive titles in display order' do
|
196
|
+
expect(parser.parse!('Lt. Col. Bernado Franecki')[0].values_at(:given, :family, :title)).to eq(['Bernado', 'Franecki', 'Lt. Col.'])
|
197
|
+
end
|
198
|
+
|
199
|
+
context 'when include_particle_in_family is false' do
|
200
|
+
let(:parser) { Parser.new(include_particle_in_family: false) }
|
201
|
+
|
202
|
+
it 'parses common capitalized particles as the family name in display order' do
|
203
|
+
expect(parser.parse!('Carlos De Silva')[0].values_at(:given, :family, :particle)).to eq(['Carlos', 'Silva', 'De'])
|
204
|
+
end
|
205
|
+
|
206
|
+
it 'parses common capitalized particles with punctuation as the family name in display order' do
|
207
|
+
expect(parser.parse!('Matt St. Hilaire')[0].values_at(:given, :family, :particle)).to eq(['Matt', 'Hilaire', 'St.'])
|
208
|
+
end
|
209
|
+
|
210
|
+
it 'parses common lowercase particles as a particle, not family name in display order' do
|
211
|
+
expect(parser.parse!('Carlos de Silva')[0].values_at(:given, :family, :particle)).to eq(['Carlos', 'Silva', 'de'])
|
212
|
+
end
|
213
|
+
|
214
|
+
it 'parses common capitalized particles as the family name in sort order' do
|
215
|
+
expect(parser.parse!('De Silva, Carlos')[0].values_at(:given, :family, :particle)).to eq(['Carlos', 'Silva', 'De'])
|
216
|
+
end
|
217
|
+
|
218
|
+
it 'parses common lowercase particles as a particle, not family name in sort order' do
|
219
|
+
expect(parser.parse!('de Silva, Carlos')[0].values_at(:given, :family, :particle)).to eq(['Carlos', 'Silva', 'de'])
|
220
|
+
end
|
221
|
+
|
222
|
+
it 'parses common capitalized particles with punctuation as the family name in display order' do
|
223
|
+
expect(parser.parse!('St. Hilaire, Matt')[0].values_at(:given, :family, :particle)).to eq(['Matt', 'Hilaire', 'St.'])
|
224
|
+
end
|
225
|
+
end
|
226
|
+
|
227
|
+
context 'when include_particle_in_family is true' do
|
228
|
+
let(:parser) { Parser.new(include_particle_in_family: true) }
|
229
|
+
|
230
|
+
it 'parses common capitalized particles as the family name in display order' do
|
231
|
+
expect(parser.parse!('Carlos De Silva')[0].values_at(:given, :family, :particle)).to eq(['Carlos', 'De Silva', nil])
|
232
|
+
end
|
233
|
+
|
234
|
+
it 'parses common capitalized particles with punctuation as the family name in display order' do
|
235
|
+
expect(parser.parse!('Matt St. Hilaire')[0].values_at(:given, :family, :particle)).to eq(['Matt', 'St. Hilaire', nil])
|
236
|
+
end
|
237
|
+
|
238
|
+
it 'parses common lowercase particles as a particle, not family name in display order' do
|
239
|
+
expect(parser.parse!('Carlos de Silva')[0].values_at(:given, :family, :particle)).to eq(['Carlos', 'Silva', 'de'])
|
240
|
+
end
|
241
|
+
|
242
|
+
it 'parses common capitalized particles as the family name in sort order' do
|
243
|
+
expect(parser.parse!('De Silva, Carlos')[0].values_at(:given, :family, :particle)).to eq(['Carlos', 'De Silva', nil])
|
244
|
+
end
|
245
|
+
|
246
|
+
it 'parses common lowercase particles as a particle, not family name in sort order' do
|
247
|
+
expect(parser.parse!('de Silva, Carlos')[0].values_at(:given, :family, :particle)).to eq(['Carlos', 'Silva', 'de'])
|
248
|
+
end
|
249
|
+
|
250
|
+
it 'parses common capitalized particles with punctuation as the family name in display order' do
|
251
|
+
expect(parser.parse!('St. Hilaire, Matt')[0].values_at(:given, :family, :particle)).to eq(['Matt', 'St. Hilaire', nil])
|
252
|
+
end
|
253
|
+
end
|
194
254
|
end
|
195
255
|
end
|
196
256
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: namae
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0
|
4
|
+
version: 1.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sylvester Keil
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2021-
|
12
|
+
date: 2021-03-12 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: racc
|