opener-tokenizer-base 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/README.md +148 -0
- data/bin/tokenizer-base +5 -0
- data/bin/tokenizer-de +5 -0
- data/bin/tokenizer-en +5 -0
- data/bin/tokenizer-es +5 -0
- data/bin/tokenizer-fr +5 -0
- data/bin/tokenizer-it +5 -0
- data/bin/tokenizer-nl +5 -0
- data/core/lib/Data/OptList.pm +256 -0
- data/core/lib/Params/Util.pm +866 -0
- data/core/lib/Sub/Exporter.pm +1101 -0
- data/core/lib/Sub/Exporter/Cookbook.pod +309 -0
- data/core/lib/Sub/Exporter/Tutorial.pod +280 -0
- data/core/lib/Sub/Exporter/Util.pm +354 -0
- data/core/lib/Sub/Install.pm +329 -0
- data/core/lib/Time/Stamp.pm +808 -0
- data/core/load-prefixes.pl +43 -0
- data/core/nonbreaking_prefixes/abbreviation_list.kaf +0 -0
- data/core/nonbreaking_prefixes/abbreviation_list.txt +444 -0
- data/core/nonbreaking_prefixes/nonbreaking_prefix.ca +533 -0
- data/core/nonbreaking_prefixes/nonbreaking_prefix.de +781 -0
- data/core/nonbreaking_prefixes/nonbreaking_prefix.el +448 -0
- data/core/nonbreaking_prefixes/nonbreaking_prefix.en +564 -0
- data/core/nonbreaking_prefixes/nonbreaking_prefix.es +758 -0
- data/core/nonbreaking_prefixes/nonbreaking_prefix.fr +1027 -0
- data/core/nonbreaking_prefixes/nonbreaking_prefix.is +697 -0
- data/core/nonbreaking_prefixes/nonbreaking_prefix.it +641 -0
- data/core/nonbreaking_prefixes/nonbreaking_prefix.nl +739 -0
- data/core/nonbreaking_prefixes/nonbreaking_prefix.pl +729 -0
- data/core/nonbreaking_prefixes/nonbreaking_prefix.pt +656 -0
- data/core/nonbreaking_prefixes/nonbreaking_prefix.ro +484 -0
- data/core/nonbreaking_prefixes/nonbreaking_prefix.ru +705 -0
- data/core/nonbreaking_prefixes/nonbreaking_prefix.sk +920 -0
- data/core/nonbreaking_prefixes/nonbreaking_prefix.sl +524 -0
- data/core/nonbreaking_prefixes/nonbreaking_prefix.sv +492 -0
- data/core/split-sentences.pl +114 -0
- data/core/text-fixer.pl +169 -0
- data/core/tokenizer-cli.pl +363 -0
- data/core/tokenizer.pl +145 -0
- data/lib/opener/tokenizers/base.rb +84 -0
- data/lib/opener/tokenizers/base/version.rb +8 -0
- data/opener-tokenizer-base.gemspec +25 -0
- metadata +134 -0
@@ -0,0 +1,641 @@
|
|
1
|
+
#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
|
2
|
+
#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
|
3
|
+
|
4
|
+
#Any single upper-case letter followed by a period is not a sentence ender (except I, occasionally, but we leave it in)
|
5
|
+
#usually upper case letters are initials in a name
|
6
|
+
A
|
7
|
+
B
|
8
|
+
C
|
9
|
+
D
|
10
|
+
E
|
11
|
+
F
|
12
|
+
G
|
13
|
+
H
|
14
|
+
I
|
15
|
+
J
|
16
|
+
K
|
17
|
+
L
|
18
|
+
M
|
19
|
+
N
|
20
|
+
O
|
21
|
+
P
|
22
|
+
Q
|
23
|
+
R
|
24
|
+
S
|
25
|
+
T
|
26
|
+
U
|
27
|
+
V
|
28
|
+
W
|
29
|
+
X
|
30
|
+
Y
|
31
|
+
Z
|
32
|
+
|
33
|
+
#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
|
34
|
+
Adj
|
35
|
+
Adm
|
36
|
+
Adv
|
37
|
+
Amn
|
38
|
+
Arch
|
39
|
+
Asst
|
40
|
+
Avv
|
41
|
+
Bart
|
42
|
+
Bcc
|
43
|
+
Bldg
|
44
|
+
Brig
|
45
|
+
Bros
|
46
|
+
C.A.P
|
47
|
+
C.P
|
48
|
+
Capt
|
49
|
+
Cc
|
50
|
+
Cmdr
|
51
|
+
Co
|
52
|
+
Col
|
53
|
+
Comdr
|
54
|
+
Con
|
55
|
+
Corp
|
56
|
+
Cpl
|
57
|
+
DR
|
58
|
+
Dott
|
59
|
+
Dr
|
60
|
+
Drs
|
61
|
+
Egr
|
62
|
+
Ens
|
63
|
+
Gen
|
64
|
+
Genn
|
65
|
+
Geom
|
66
|
+
Gov
|
67
|
+
Hon
|
68
|
+
Hosp
|
69
|
+
Hr
|
70
|
+
Id
|
71
|
+
Ing
|
72
|
+
Insp
|
73
|
+
Lt
|
74
|
+
MM
|
75
|
+
MR
|
76
|
+
MRS
|
77
|
+
MS
|
78
|
+
Maj
|
79
|
+
Messrs
|
80
|
+
Mlle
|
81
|
+
Mme
|
82
|
+
Mo
|
83
|
+
Mons
|
84
|
+
Mr
|
85
|
+
Mrs
|
86
|
+
Ms
|
87
|
+
Msgr
|
88
|
+
N.B
|
89
|
+
Op
|
90
|
+
Ord
|
91
|
+
P.S
|
92
|
+
P.T
|
93
|
+
Pfc
|
94
|
+
Ph
|
95
|
+
Prof
|
96
|
+
Pvt
|
97
|
+
RP
|
98
|
+
RSVP
|
99
|
+
Rag
|
100
|
+
Rep
|
101
|
+
Reps
|
102
|
+
Res
|
103
|
+
Rev
|
104
|
+
Rif
|
105
|
+
Rt
|
106
|
+
S.A
|
107
|
+
S.B.F
|
108
|
+
S.P.M
|
109
|
+
S.p.A
|
110
|
+
S.r.l
|
111
|
+
Sen
|
112
|
+
Sens
|
113
|
+
Sfc
|
114
|
+
Sgt
|
115
|
+
SGT
|
116
|
+
Sig
|
117
|
+
Sigg
|
118
|
+
Soc
|
119
|
+
Spett
|
120
|
+
Sr
|
121
|
+
St
|
122
|
+
Supt
|
123
|
+
Surg
|
124
|
+
V.P
|
125
|
+
|
126
|
+
# other
|
127
|
+
a.c
|
128
|
+
acc
|
129
|
+
all
|
130
|
+
banc
|
131
|
+
c.a
|
132
|
+
c.c.p
|
133
|
+
c.m
|
134
|
+
c.p
|
135
|
+
c.s
|
136
|
+
c.v
|
137
|
+
corr
|
138
|
+
dott
|
139
|
+
e.p.c
|
140
|
+
ecc
|
141
|
+
es
|
142
|
+
fatt
|
143
|
+
gg
|
144
|
+
int
|
145
|
+
lett
|
146
|
+
ogg
|
147
|
+
on
|
148
|
+
p.c
|
149
|
+
p.c.c
|
150
|
+
p.es
|
151
|
+
p.f
|
152
|
+
p.r
|
153
|
+
p.v
|
154
|
+
post
|
155
|
+
pp
|
156
|
+
racc
|
157
|
+
ric
|
158
|
+
s.n.c
|
159
|
+
seg
|
160
|
+
sgg
|
161
|
+
ss
|
162
|
+
tel
|
163
|
+
u.s
|
164
|
+
v.r
|
165
|
+
v.s
|
166
|
+
|
167
|
+
#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
|
168
|
+
v
|
169
|
+
vs
|
170
|
+
i.e
|
171
|
+
rev
|
172
|
+
e.g
|
173
|
+
|
174
|
+
#Numbers only. These act as non-breaking prefixes only when followed by a numeric sequence; otherwise the period may end a sentence
|
175
|
+
# add the NUMERIC_ONLY marker (wrapped in '#' characters, as in the entries below) after the word to enable this behavior
|
176
|
+
#This case is mostly for the English "No." which can either be a sentence of its own, or
|
177
|
+
#if followed by a number, a non-breaking prefix
|
178
|
+
No #NUMERIC_ONLY#
|
179
|
+
Nos
|
180
|
+
Art #NUMERIC_ONLY#
|
181
|
+
Nr
|
182
|
+
pp #NUMERIC_ONLY#
|
183
|
+
|
184
|
+
#unified abbreviation list
|
185
|
+
Acad
|
186
|
+
Adj
|
187
|
+
Adm
|
188
|
+
Adv
|
189
|
+
Affl
|
190
|
+
Ag
|
191
|
+
Apr
|
192
|
+
Art
|
193
|
+
Asst
|
194
|
+
Av
|
195
|
+
Avg
|
196
|
+
B.ches-du-Rh
|
197
|
+
Bart
|
198
|
+
Bco
|
199
|
+
Bldg
|
200
|
+
Brig
|
201
|
+
Bros
|
202
|
+
C.a
|
203
|
+
C.p.c.n
|
204
|
+
Ca
|
205
|
+
Capt
|
206
|
+
Cdt
|
207
|
+
Cf
|
208
|
+
Ch.-Mme
|
209
|
+
Chap
|
210
|
+
Cie
|
211
|
+
Cmdr
|
212
|
+
Col
|
213
|
+
Comdr
|
214
|
+
Con
|
215
|
+
Corp
|
216
|
+
Cpl
|
217
|
+
DR
|
218
|
+
DRA
|
219
|
+
Da
|
220
|
+
Dec
|
221
|
+
Dep
|
222
|
+
Dic
|
223
|
+
Dn
|
224
|
+
Dr
|
225
|
+
Dra
|
226
|
+
Dras
|
227
|
+
Drs
|
228
|
+
Eng
|
229
|
+
Enga
|
230
|
+
Engas
|
231
|
+
Engos
|
232
|
+
Ens
|
233
|
+
Ets
|
234
|
+
Euro
|
235
|
+
Ev
|
236
|
+
Ex
|
237
|
+
Excmo
|
238
|
+
Exmo
|
239
|
+
Exo
|
240
|
+
Fa
|
241
|
+
Fco
|
242
|
+
Feb
|
243
|
+
Febbr
|
244
|
+
Fig
|
245
|
+
Fr
|
246
|
+
Gar
|
247
|
+
Gen
|
248
|
+
Gir
|
249
|
+
Gl
|
250
|
+
Gov
|
251
|
+
Hno
|
252
|
+
Hon
|
253
|
+
Hosp
|
254
|
+
Hr
|
255
|
+
Ilmo
|
256
|
+
Insp
|
257
|
+
J.-C
|
258
|
+
Jan
|
259
|
+
Jeu
|
260
|
+
Jr
|
261
|
+
Jul
|
262
|
+
Jun
|
263
|
+
Lda
|
264
|
+
Lieut
|
265
|
+
Lt
|
266
|
+
Lun
|
267
|
+
MM
|
268
|
+
MR
|
269
|
+
MRS
|
270
|
+
MS
|
271
|
+
MSc
|
272
|
+
Magg
|
273
|
+
Maj
|
274
|
+
Mar
|
275
|
+
Me
|
276
|
+
Mej
|
277
|
+
Mer
|
278
|
+
Mes
|
279
|
+
Messrs
|
280
|
+
Mgr
|
281
|
+
Mgrs
|
282
|
+
Mll
|
283
|
+
Mlle
|
284
|
+
Mlle(s)
|
285
|
+
Mme
|
286
|
+
Mme(s)
|
287
|
+
Mr
|
288
|
+
Mrs
|
289
|
+
Ms
|
290
|
+
Msgr
|
291
|
+
Mw
|
292
|
+
Nov
|
293
|
+
Npr
|
294
|
+
Nr
|
295
|
+
O.d.J
|
296
|
+
Okt
|
297
|
+
Op
|
298
|
+
Ord
|
299
|
+
Ott
|
300
|
+
Oz
|
301
|
+
P
|
302
|
+
P.D
|
303
|
+
P.ej
|
304
|
+
P.p.c
|
305
|
+
Pas
|
306
|
+
Pfc
|
307
|
+
Ph
|
308
|
+
Prim
|
309
|
+
Prof
|
310
|
+
Pte
|
311
|
+
Pts
|
312
|
+
Pvt
|
313
|
+
Rep
|
314
|
+
Reps
|
315
|
+
Res
|
316
|
+
Rev
|
317
|
+
Revd
|
318
|
+
Rh
|
319
|
+
Riv
|
320
|
+
Rt
|
321
|
+
S.Em
|
322
|
+
S.Exc
|
323
|
+
S.a.r.l
|
324
|
+
Sen
|
325
|
+
Sens
|
326
|
+
Sep
|
327
|
+
Sept
|
328
|
+
Sett
|
329
|
+
Sfc
|
330
|
+
Sgt
|
331
|
+
Sl
|
332
|
+
Sr
|
333
|
+
Sra
|
334
|
+
Sras
|
335
|
+
Srs
|
336
|
+
Srta
|
337
|
+
St
|
338
|
+
ST
|
339
|
+
Sta
|
340
|
+
Ste
|
341
|
+
Sto
|
342
|
+
Supt
|
343
|
+
Surg
|
344
|
+
Tj
|
345
|
+
Tr
|
346
|
+
Ud
|
347
|
+
Uds
|
348
|
+
V.Exc
|
349
|
+
Vd
|
350
|
+
Vda
|
351
|
+
Vds
|
352
|
+
Vz
|
353
|
+
Z.D
|
354
|
+
Z.D.H
|
355
|
+
Z.E
|
356
|
+
Z.Em
|
357
|
+
Z.H
|
358
|
+
Z.K.H
|
359
|
+
Z.K.M
|
360
|
+
Z.M
|
361
|
+
a
|
362
|
+
a./s
|
363
|
+
a.C
|
364
|
+
a.g.v
|
365
|
+
a.l
|
366
|
+
abrev
|
367
|
+
abs
|
368
|
+
ac
|
369
|
+
acc
|
370
|
+
acron
|
371
|
+
adj
|
372
|
+
adm
|
373
|
+
adr
|
374
|
+
adv
|
375
|
+
ag
|
376
|
+
alt
|
377
|
+
anal
|
378
|
+
anat
|
379
|
+
angl
|
380
|
+
appos
|
381
|
+
apr
|
382
|
+
apr
|
383
|
+
asc
|
384
|
+
atm
|
385
|
+
auj
|
386
|
+
aux
|
387
|
+
av
|
388
|
+
avg
|
389
|
+
avr
|
390
|
+
b
|
391
|
+
b.a.o
|
392
|
+
b.a.p
|
393
|
+
b.a.r
|
394
|
+
bacc
|
395
|
+
bat
|
396
|
+
bc
|
397
|
+
bd
|
398
|
+
bde
|
399
|
+
bgen
|
400
|
+
bijv
|
401
|
+
bijz
|
402
|
+
br
|
403
|
+
bv
|
404
|
+
c
|
405
|
+
c.-a-d
|
406
|
+
c.a.f
|
407
|
+
c.i
|
408
|
+
cc
|
409
|
+
cf
|
410
|
+
cft
|
411
|
+
ch
|
412
|
+
ch.-l
|
413
|
+
chbre
|
414
|
+
chbs
|
415
|
+
chf
|
416
|
+
col
|
417
|
+
coll
|
418
|
+
cpl
|
419
|
+
cpt
|
420
|
+
cpte
|
421
|
+
cta
|
422
|
+
d
|
423
|
+
d.c
|
424
|
+
d.w.z
|
425
|
+
dcha
|
426
|
+
dec
|
427
|
+
def
|
428
|
+
dem
|
429
|
+
dep
|
430
|
+
dept
|
431
|
+
dhr
|
432
|
+
dic
|
433
|
+
dipl
|
434
|
+
dispo
|
435
|
+
div
|
436
|
+
dpto
|
437
|
+
dr
|
438
|
+
dr.h.c
|
439
|
+
dra
|
440
|
+
dras
|
441
|
+
drs
|
442
|
+
ds
|
443
|
+
dz
|
444
|
+
e.c
|
445
|
+
e.g
|
446
|
+
e.g
|
447
|
+
e.k
|
448
|
+
eccles
|
449
|
+
ecol
|
450
|
+
econ
|
451
|
+
ed
|
452
|
+
ej
|
453
|
+
env
|
454
|
+
ep
|
455
|
+
eq
|
456
|
+
et
|
457
|
+
etc
|
458
|
+
ev
|
459
|
+
ex
|
460
|
+
exmo
|
461
|
+
exo
|
462
|
+
exp
|
463
|
+
expo
|
464
|
+
f.a.c
|
465
|
+
fa
|
466
|
+
fam
|
467
|
+
fasc
|
468
|
+
fbg
|
469
|
+
feb
|
470
|
+
febbr
|
471
|
+
fem
|
472
|
+
fevr
|
473
|
+
ff
|
474
|
+
fl
|
475
|
+
fol
|
476
|
+
fr
|
477
|
+
fs
|
478
|
+
fut
|
479
|
+
gd
|
480
|
+
gde
|
481
|
+
gdes
|
482
|
+
gds
|
483
|
+
gen
|
484
|
+
genn
|
485
|
+
gl
|
486
|
+
grd
|
487
|
+
h.-t
|
488
|
+
hab
|
489
|
+
i.e
|
490
|
+
i.p.v
|
491
|
+
i.s.m
|
492
|
+
i.t.t
|
493
|
+
i.v.m
|
494
|
+
ibid
|
495
|
+
id
|
496
|
+
imp
|
497
|
+
ing
|
498
|
+
ir
|
499
|
+
iron
|
500
|
+
itd
|
501
|
+
itn
|
502
|
+
itp
|
503
|
+
izq
|
504
|
+
j
|
505
|
+
janv
|
506
|
+
jhr
|
507
|
+
jkvr
|
508
|
+
jr
|
509
|
+
l
|
510
|
+
lat
|
511
|
+
lex
|
512
|
+
lgen
|
513
|
+
lib
|
514
|
+
lieut
|
515
|
+
liv
|
516
|
+
lkol
|
517
|
+
loc
|
518
|
+
lof
|
519
|
+
m
|
520
|
+
m.a.w
|
521
|
+
m.b.t
|
522
|
+
m.b.v
|
523
|
+
m.h.o
|
524
|
+
m.i
|
525
|
+
m.i.v
|
526
|
+
magg
|
527
|
+
maj
|
528
|
+
mar
|
529
|
+
mas
|
530
|
+
max
|
531
|
+
med
|
532
|
+
mevr
|
533
|
+
min
|
534
|
+
mll
|
535
|
+
mr
|
536
|
+
ms
|
537
|
+
mtr
|
538
|
+
mtrs
|
539
|
+
n
|
540
|
+
n
|
541
|
+
n.f
|
542
|
+
n.f.pl
|
543
|
+
n.m
|
544
|
+
n.m.pl
|
545
|
+
nov
|
546
|
+
npr
|
547
|
+
o
|
548
|
+
o.b.s
|
549
|
+
obs
|
550
|
+
oct
|
551
|
+
okt
|
552
|
+
ord
|
553
|
+
ott
|
554
|
+
oz
|
555
|
+
p
|
556
|
+
p
|
557
|
+
p.a
|
558
|
+
p.ej
|
559
|
+
p.ex
|
560
|
+
p.g.c.d
|
561
|
+
p.i
|
562
|
+
p.j
|
563
|
+
p.m
|
564
|
+
p.o
|
565
|
+
p.p
|
566
|
+
p.p.c.d
|
567
|
+
p.p.c.m
|
568
|
+
p.pa
|
569
|
+
p.pr
|
570
|
+
pl
|
571
|
+
plv
|
572
|
+
poe
|
573
|
+
pp
|
574
|
+
pp
|
575
|
+
pr
|
576
|
+
pr
|
577
|
+
pres
|
578
|
+
prev
|
579
|
+
prof
|
580
|
+
px
|
581
|
+
q.s
|
582
|
+
qqch
|
583
|
+
qqf
|
584
|
+
qqn
|
585
|
+
qqns
|
586
|
+
r.-de-ch
|
587
|
+
r.p.m
|
588
|
+
rc
|
589
|
+
rd
|
590
|
+
ref
|
591
|
+
refl
|
592
|
+
reg
|
593
|
+
rev
|
594
|
+
ro
|
595
|
+
rte
|
596
|
+
s
|
597
|
+
s
|
598
|
+
s.a
|
599
|
+
s.b.f
|
600
|
+
s.d
|
601
|
+
s.e
|
602
|
+
s.l
|
603
|
+
s.l.n.d
|
604
|
+
s.l.p
|
605
|
+
s.t.p
|
606
|
+
s.v.p
|
607
|
+
s/c
|
608
|
+
sc
|
609
|
+
sett
|
610
|
+
sf
|
611
|
+
sgt
|
612
|
+
sl
|
613
|
+
sr
|
614
|
+
sra
|
615
|
+
sras
|
616
|
+
srs
|
617
|
+
ss
|
618
|
+
sto
|
619
|
+
t
|
620
|
+
t.s.v.p
|
621
|
+
tec
|
622
|
+
tel
|
623
|
+
terr
|
624
|
+
tg
|
625
|
+
tint
|
626
|
+
tit
|
627
|
+
tj
|
628
|
+
tr
|
629
|
+
travx
|
630
|
+
v
|
631
|
+
v.intr
|
632
|
+
v.tr
|
633
|
+
v.w.t
|
634
|
+
var
|
635
|
+
vs
|
636
|
+
vta
|
637
|
+
vx
|
638
|
+
z.v
|
639
|
+
zool
|
640
|
+
Št
|
641
|
+
št
|