petrovich 0.2.1 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +79 -177
- data/Rakefile +7 -9
- data/lib/petrovich.rb +78 -77
- data/lib/petrovich/case/rule.rb +63 -0
- data/lib/petrovich/case/rule/modifier.rb +19 -0
- data/lib/petrovich/case/rule/test.rb +23 -0
- data/lib/petrovich/gender.rb +39 -0
- data/lib/petrovich/gender/rule.rb +22 -0
- data/lib/petrovich/inflected.rb +18 -0
- data/lib/petrovich/inflector.rb +42 -0
- data/lib/petrovich/name.rb +75 -0
- data/lib/petrovich/rule_set.rb +118 -0
- data/lib/petrovich/unicode.rb +4 -3
- data/lib/petrovich/value.rb +12 -0
- data/lib/tasks/evaluate.rake +14 -32
- data/rules/rules.yml +186 -44
- metadata +57 -7
- data/lib/petrovich/extension.rb +0 -140
- data/lib/petrovich/rules.rb +0 -209
data/lib/tasks/evaluate.rake
CHANGED
@@ -1,39 +1,22 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
|
3
3
|
require 'csv'
|
4
|
-
require 'petrovich/unicode'
|
5
|
-
|
6
|
-
class Object
|
7
|
-
include Petrovich::Unicode
|
8
|
-
end
|
9
|
-
|
10
|
-
CASES = [
|
11
|
-
:nominative,
|
12
|
-
:genitive,
|
13
|
-
:dative,
|
14
|
-
:accusative,
|
15
|
-
:instrumental,
|
16
|
-
:prepositional
|
17
|
-
]
|
18
4
|
|
19
5
|
def check!(errors, correct, total, lemma, gender, gcase, expected)
|
20
|
-
|
21
|
-
inflection = upcase(inflector.lastname(lemma, gcase))
|
22
|
-
|
6
|
+
actual = Petrovich::Unicode.upcase(Petrovich(lastname: lemma).public_send(gcase).lastname)
|
23
7
|
total[[gender, gcase]] += 1
|
24
|
-
|
25
|
-
if inflection == expected
|
8
|
+
if actual == expected
|
26
9
|
correct[[gender, gcase]] += 1
|
27
10
|
true
|
28
11
|
else
|
29
|
-
errors << [lemma, expected,
|
30
|
-
|
12
|
+
errors << [lemma, expected, actual, [gender, gcase]]
|
13
|
+
actual
|
31
14
|
end
|
32
15
|
end
|
33
16
|
|
34
|
-
desc 'Evaluate the inflector on
|
17
|
+
desc 'Evaluate the inflector on lastnames'
|
35
18
|
task :evaluate => :petrovich do
|
36
|
-
filename = File.expand_path('../../../
|
19
|
+
filename = File.expand_path('../../../test/data/surnames.tsv', __FILE__)
|
37
20
|
errors_filename = ENV['errors'] || 'errors.tsv'
|
38
21
|
|
39
22
|
correct, total = Hash.new(0), Hash.new(0)
|
@@ -58,22 +41,21 @@ task :evaluate => :petrovich do
|
|
58
41
|
|
59
42
|
if grammemes.include? '0'
|
60
43
|
# some words are aptotic (indeclinable), so the same form is expected in every case
|
61
|
-
CASES.each do |gcase|
|
62
|
-
check!
|
44
|
+
Petrovich::CASES.each do |gcase|
|
45
|
+
check!(errors, correct, total, lemma, gender, gcase, word)
|
63
46
|
end
|
64
47
|
elsif grammemes.include? 'им'
|
65
|
-
check!
|
48
|
+
check!(errors, correct, total, lemma, gender, :nominative, word)
|
66
49
|
elsif grammemes.include? 'рд'
|
67
|
-
check!
|
50
|
+
check!(errors, correct, total, lemma, gender, :genitive, word)
|
68
51
|
elsif grammemes.include? 'дт'
|
69
|
-
check!
|
52
|
+
check!(errors, correct, total, lemma, gender, :dative, word)
|
70
53
|
elsif grammemes.include? 'вн'
|
71
|
-
check!
|
54
|
+
check!(errors, correct, total, lemma, gender, :accusative, word)
|
72
55
|
elsif grammemes.include? 'тв'
|
73
|
-
|
74
|
-
check! errors, correct, total, lemma, gender, :instrumental, word
|
56
|
+
check!(errors, correct, total, lemma, gender, :instrumental, word)
|
75
57
|
elsif grammemes.include? 'пр'
|
76
|
-
check!
|
58
|
+
check!(errors, correct, total, lemma, gender, :prepositional, word)
|
77
59
|
end
|
78
60
|
end
|
79
61
|
end
|
data/rules/rules.yml
CHANGED
@@ -1,7 +1,9 @@
|
|
1
|
+
# Порядок имеет значение!
|
2
|
+
# http://to-name.ru/surname/familnye-okonchania.htm
|
3
|
+
# http://gramma.ru/SPR/?id=2.1
|
1
4
|
lastname:
|
2
5
|
exceptions:
|
3
6
|
# Неизменяемые первые части двойных русских фамилий.
|
4
|
-
#
|
5
7
|
- gender: androgynous
|
6
8
|
test:
|
7
9
|
- бонч
|
@@ -47,153 +49,161 @@ lastname:
|
|
47
49
|
test: [гава, орота]
|
48
50
|
mods: [., ., ., ., .]
|
49
51
|
|
52
|
+
# Брыльска, *
|
50
53
|
- gender: female
|
51
54
|
test: [ска, цка]
|
52
55
|
mods: [-ой, -ой, -ую, -ой, -ой]
|
53
56
|
|
57
|
+
# Дубовицкая, Суперанская, Лесная, Барановская
|
54
58
|
- gender: female
|
55
59
|
test: [цкая, ская, ная, ая]
|
56
60
|
mods: [--ой, --ой, --ую, --ой, --ой]
|
57
61
|
|
62
|
+
# Зимняя
|
58
63
|
- gender: female
|
59
64
|
test: [яя]
|
60
65
|
mods: [--ей, --ей, --юю, --ей, --ей]
|
61
66
|
|
62
|
-
|
63
|
-
test: [на]
|
64
|
-
mods: [-ой, -ой, -у, -ой, -ой]
|
65
|
-
|
66
|
-
- gender: male
|
67
|
-
test: [иной]
|
68
|
-
mods: [-я, -ю, -я, -ем, -е]
|
69
|
-
|
67
|
+
# *, Иллуй
|
70
68
|
- gender: male
|
71
|
-
test: [уй]
|
69
|
+
test: [иной, уй]
|
72
70
|
mods: [-я, -ю, -я, -ем, -е]
|
73
71
|
|
72
|
+
# Ярица
|
74
73
|
- gender: androgynous
|
75
74
|
test: [ца]
|
76
75
|
mods: [-ы, -е, -у, -ей, -е]
|
77
76
|
|
77
|
+
# Эрих
|
78
78
|
- gender: male
|
79
79
|
test: [рих]
|
80
80
|
mods: [а, у, а, ом, е]
|
81
81
|
|
82
|
+
# Кантария
|
82
83
|
- gender: androgynous
|
83
84
|
test: [ия]
|
84
|
-
mods: [
|
85
|
+
mods: [-и, -и, -ю, -ей, -и]
|
85
86
|
|
87
|
+
# Гулиа, *, *, Джабуа, *, *, *, *
|
86
88
|
- gender: androgynous
|
87
89
|
test: [иа, аа, оа, уа, ыа, еа, юа, эа]
|
88
90
|
mods: [., ., ., ., .]
|
89
91
|
|
90
|
-
|
91
|
-
test: [их, ых]
|
92
|
-
mods: [., ., ., ., .]
|
93
|
-
|
92
|
+
# Осипенко, *, Мегрэ, *, *, Хереску, *
|
94
93
|
- gender: androgynous
|
95
94
|
test: [о, е, э, и, ы, у, ю]
|
96
95
|
mods: [., ., ., ., .]
|
97
96
|
|
97
|
+
# Широких, Больных
|
98
|
+
- gender: male
|
99
|
+
test: [их, ых]
|
100
|
+
mods: [., ., ., ., .]
|
101
|
+
|
102
|
+
# Кузнецова, Голубева, Скурихина
|
98
103
|
- gender: female
|
99
|
-
test: [ова,
|
104
|
+
test: [ова, ева, на]
|
100
105
|
mods: [-ой, -ой, -у, -ой, -ой]
|
101
106
|
|
107
|
+
# Басалыга, Лазука, Пидкуймуха, Бобча, Гуща, Рогожа, Кваша, Глоба
|
102
108
|
- gender: androgynous
|
103
|
-
test: [га, ка, ха, ча, ща, жа,
|
109
|
+
test: [га, ка, ха, ча, ща, жа, ша, a]
|
104
110
|
mods: [-и, -е, -у, -ой, -е]
|
105
111
|
|
106
|
-
|
107
|
-
test: [а]
|
108
|
-
mods: [-ы, -е, -у, -ой, -е]
|
109
|
-
|
112
|
+
# Белоконь
|
110
113
|
- gender: male
|
111
114
|
test: [ь]
|
112
115
|
mods: [-я, -ю, -я, -ем, -е]
|
113
116
|
|
114
|
-
|
115
|
-
test: [ия]
|
116
|
-
mods: [-и, -и, -ю, -ей, -и]
|
117
|
-
|
117
|
+
# Фидря
|
118
118
|
- gender: androgynous
|
119
119
|
test: [я]
|
120
120
|
mods: [-и, -е, -ю, -ей, -е]
|
121
121
|
|
122
|
+
# Воробей
|
123
|
+
- gender: male
|
124
|
+
test: [обей]
|
125
|
+
mods: [--ья, --ью, --ья, --ьем, --ье]
|
126
|
+
|
127
|
+
# Кочубей
|
122
128
|
- gender: male
|
123
129
|
test: [ей]
|
124
130
|
mods: [-я, -ю, -я, -ем, -е]
|
125
131
|
|
132
|
+
# Хачикян, Богдан, *
|
126
133
|
- gender: male
|
127
134
|
test: [ян, ан, йн]
|
128
135
|
mods: [а, у, а, ом, е]
|
129
136
|
|
137
|
+
# Волынец, Горобец
|
130
138
|
- gender: male
|
131
139
|
test: [ынец, обец]
|
132
140
|
mods: [--ца, --цу, --ца, --цем, --це]
|
133
141
|
|
142
|
+
# *, Быховец
|
143
|
+
# TODO: Проверить Брагинец
|
134
144
|
- gender: male
|
135
145
|
test: [онец, овец]
|
136
146
|
mods: [--ца, --цу, --ца, --цом, --це]
|
137
147
|
|
148
|
+
# Порывай
|
138
149
|
- gender: male
|
139
150
|
test: [ай]
|
140
151
|
mods: [-я, -ю, -я, -ем, -е]
|
141
152
|
|
153
|
+
# Дорогой, Руцкой
|
142
154
|
- gender: male
|
143
|
-
test: [кой]
|
144
|
-
mods: [-го, -му, -го, --им, -м]
|
145
|
-
|
146
|
-
- gender: male
|
147
|
-
test: [гой]
|
155
|
+
test: [гой, кой]
|
148
156
|
mods: [-го, -му, -го, --им, -м]
|
149
157
|
|
158
|
+
# Береговой
|
150
159
|
- gender: male
|
151
160
|
test: [ой]
|
152
161
|
mods: [-го, -му, -го, --ым, -м]
|
153
162
|
|
163
|
+
# *, *
|
154
164
|
- gender: male
|
155
165
|
test: [ах, ив]
|
156
166
|
mods: [а, у, а, ом, е]
|
157
167
|
|
168
|
+
# *, *, *, *
|
158
169
|
- gender: male
|
159
170
|
test: [ший, щий, жий, ний]
|
160
171
|
mods: [--его, --ему, --его, -м, --ем]
|
161
172
|
|
162
173
|
- gender: male
|
163
|
-
test: [
|
164
|
-
mods: [--ого, --ому, --ого, -м, --ом]
|
165
|
-
|
166
|
-
- gender: male
|
167
|
-
test: [кий]
|
174
|
+
test: [ый, кий]
|
168
175
|
mods: [--ого, --ому, --ого, -м, --ом]
|
169
176
|
|
177
|
+
# Хорунжий
|
170
178
|
- gender: male
|
171
179
|
test: [ий]
|
172
180
|
mods: [-я, -ю, -я, -ем, -и]
|
173
181
|
|
182
|
+
# Починок
|
174
183
|
- gender: male
|
175
184
|
test: [ок]
|
176
185
|
mods: [--ка, --ку, --ка, --ком, --ке]
|
177
186
|
|
187
|
+
# Кравец
|
178
188
|
- gender: male
|
179
189
|
test: [ец]
|
180
190
|
mods: [--ца, --цу, --ца, --цом, --це]
|
181
191
|
|
192
|
+
# Лившиц, Концевич, Гармаш, Хрущ
|
182
193
|
- gender: male
|
183
194
|
test: [ц, ч, ш, щ]
|
184
195
|
mods: [а, у, а, ем, е]
|
185
196
|
|
197
|
+
# Любен, Манн, Гордон, Прядун
|
186
198
|
- gender: male
|
187
|
-
test: [ен, нн, он,
|
199
|
+
test: [ен, нн, он, ун, б, г, д, ж, з, к, л, м, п, р, с, т, ф, х]
|
188
200
|
mods: [а, у, а, ом, е]
|
189
201
|
|
202
|
+
# Петров, Левин
|
190
203
|
- gender: male
|
191
204
|
test: [в, н]
|
192
205
|
mods: [а, у, а, ым, е]
|
193
206
|
|
194
|
-
- gender: male
|
195
|
-
test: [б, г, д, ж, з, к, л, м, п, р, с, т, ф, х]
|
196
|
-
mods: [а, у, а, ом, е]
|
197
207
|
|
198
208
|
firstname:
|
199
209
|
exceptions:
|
@@ -234,6 +244,7 @@ firstname:
|
|
234
244
|
test: [ь]
|
235
245
|
mods: [-и, -и, ., ю, -и]
|
236
246
|
|
247
|
+
# Олесь
|
237
248
|
- gender: male
|
238
249
|
test: [ь]
|
239
250
|
mods: [-я, -ю, -я, -ем, -е]
|
@@ -242,6 +253,7 @@ firstname:
|
|
242
253
|
test: [га, ка, ха, ча, ща, жа]
|
243
254
|
mods: [-и, -е, -у, -ой, -е]
|
244
255
|
|
256
|
+
# Даша, Саша
|
245
257
|
- gender: female
|
246
258
|
test: [ша]
|
247
259
|
mods: [-и, -е, -у, -ей, -е]
|
@@ -250,14 +262,17 @@ firstname:
|
|
250
262
|
test: [а]
|
251
263
|
mods: [-ы, -е, -у, -ой, -е]
|
252
264
|
|
265
|
+
# Зульфия
|
253
266
|
- gender: female
|
254
267
|
test: [ия]
|
255
268
|
mods: [-и, -и, -ю, -ей, -и]
|
256
269
|
|
270
|
+
# Светлана
|
257
271
|
- gender: female
|
258
272
|
test: [а]
|
259
273
|
mods: [-ы, -е, -у, -ой, -е]
|
260
274
|
|
275
|
+
# Юлия
|
261
276
|
- gender: female
|
262
277
|
test: [я]
|
263
278
|
mods: [-и, -е, -ю, -ей, -е]
|
@@ -270,18 +285,16 @@ firstname:
|
|
270
285
|
test: [я]
|
271
286
|
mods: [-и, -е, -ю, -ей, -е]
|
272
287
|
|
288
|
+
# Андрей, *
|
273
289
|
- gender: male
|
274
|
-
test: [
|
290
|
+
test: [ей, й]
|
275
291
|
mods: [-я, -ю, -я, -ем, -е]
|
276
292
|
|
293
|
+
# Афанасий
|
277
294
|
- gender: male
|
278
295
|
test: [ий]
|
279
296
|
mods: [-я, -ю, -я, -ем, -и]
|
280
297
|
|
281
|
-
- gender: male
|
282
|
-
test: [й]
|
283
|
-
mods: [-я, -ю, -я, -ем, -е]
|
284
|
-
|
285
298
|
- gender: male
|
286
299
|
test: [б, в, г, д, ж, з, к, л, м, н, п, р, с, т, ф, х, ц, ч]
|
287
300
|
mods: [а, у, а, ом, е]
|
@@ -291,6 +304,13 @@ firstname:
|
|
291
304
|
mods: [-и, -и, -ю, -ем, -ем]
|
292
305
|
|
293
306
|
middlename:
|
307
|
+
exceptions:
|
308
|
+
- gender: androgynous
|
309
|
+
test:
|
310
|
+
- борух
|
311
|
+
mods: [., ., ., ., .]
|
312
|
+
tags: [first_word]
|
313
|
+
|
294
314
|
suffixes:
|
295
315
|
- gender: male
|
296
316
|
test: [ич]
|
@@ -299,3 +319,125 @@ middlename:
|
|
299
319
|
- gender: female
|
300
320
|
test: [на]
|
301
321
|
mods: [-ы, -е, -у, -ой, -е]
|
322
|
+
|
323
|
+
# Эвристики для определения пола
|
324
|
+
gender:
|
325
|
+
lastname:
|
326
|
+
# Здесь андрогинные фамилии не выделены в отдельную группу. Если в группах female и male
|
327
|
+
# не будет найдено совпадений, то фамилия будет считаться андрогинной.
|
328
|
+
female:
|
329
|
+
- ова
|
330
|
+
- кая
|
331
|
+
- на
|
332
|
+
- ина
|
333
|
+
- ева
|
334
|
+
- вая
|
335
|
+
- ска
|
336
|
+
- ная
|
337
|
+
male:
|
338
|
+
- кий
|
339
|
+
- ов
|
340
|
+
- ын
|
341
|
+
- ев
|
342
|
+
- ин
|
343
|
+
firstname:
|
344
|
+
# Исключения - андрогинные имена
|
345
|
+
androgynous:
|
346
|
+
- ким
|
347
|
+
- никита
|
348
|
+
- саша
|
349
|
+
- женя
|
350
|
+
male:
|
351
|
+
- кузьма
|
352
|
+
- лука
|
353
|
+
- савва
|
354
|
+
- фома
|
355
|
+
- ей
|
356
|
+
- ий
|
357
|
+
- ый
|
358
|
+
- ой
|
359
|
+
- др
|
360
|
+
- тр
|
361
|
+
- ел
|
362
|
+
- ан
|
363
|
+
- им
|
364
|
+
- д
|
365
|
+
- ис
|
366
|
+
- рт
|
367
|
+
- кт
|
368
|
+
- ар
|
369
|
+
- ен
|
370
|
+
- ав
|
371
|
+
- он
|
372
|
+
- ил
|
373
|
+
- ир
|
374
|
+
- их
|
375
|
+
- ри
|
376
|
+
- ис
|
377
|
+
- аф
|
378
|
+
- ор
|
379
|
+
- рь
|
380
|
+
- жи
|
381
|
+
- ат
|
382
|
+
- иф
|
383
|
+
- ья
|
384
|
+
- нт
|
385
|
+
- к
|
386
|
+
- ст
|
387
|
+
- ян
|
388
|
+
female:
|
389
|
+
- фанни
|
390
|
+
- фаня
|
391
|
+
- жаден
|
392
|
+
- жейден
|
393
|
+
- ия
|
394
|
+
- а
|
395
|
+
- ся
|
396
|
+
- ина
|
397
|
+
- та
|
398
|
+
- сья
|
399
|
+
- ля
|
400
|
+
- оя
|
401
|
+
- да
|
402
|
+
- йя
|
403
|
+
- ея
|
404
|
+
- вья
|
405
|
+
- нья
|
406
|
+
- ая
|
407
|
+
- ель
|
408
|
+
- ико
|
409
|
+
- фья
|
410
|
+
- рья
|
411
|
+
- лья
|
412
|
+
- биргит
|
413
|
+
- илзе
|
414
|
+
- овь
|
415
|
+
- елли
|
416
|
+
- ои
|
417
|
+
- гюль
|
418
|
+
- ес
|
419
|
+
- есс
|
420
|
+
- есси
|
421
|
+
- естин
|
422
|
+
- естини
|
423
|
+
- истин
|
424
|
+
- ети
|
425
|
+
- нет
|
426
|
+
- сти
|
427
|
+
- сми
|
428
|
+
- лин
|
429
|
+
- линн
|
430
|
+
- ейн
|
431
|
+
- нифер
|
432
|
+
- женни
|
433
|
+
- еннис
|
434
|
+
- енн
|
435
|
+
- инн
|
436
|
+
- ерин
|
437
|
+
- ляра
|
438
|
+
- лярам
|
439
|
+
middlename:
|
440
|
+
female:
|
441
|
+
- на
|
442
|
+
male:
|
443
|
+
- ич
|