opener-tokenizer-base 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +148 -0
  3. data/bin/tokenizer-base +5 -0
  4. data/bin/tokenizer-de +5 -0
  5. data/bin/tokenizer-en +5 -0
  6. data/bin/tokenizer-es +5 -0
  7. data/bin/tokenizer-fr +5 -0
  8. data/bin/tokenizer-it +5 -0
  9. data/bin/tokenizer-nl +5 -0
  10. data/core/lib/Data/OptList.pm +256 -0
  11. data/core/lib/Params/Util.pm +866 -0
  12. data/core/lib/Sub/Exporter.pm +1101 -0
  13. data/core/lib/Sub/Exporter/Cookbook.pod +309 -0
  14. data/core/lib/Sub/Exporter/Tutorial.pod +280 -0
  15. data/core/lib/Sub/Exporter/Util.pm +354 -0
  16. data/core/lib/Sub/Install.pm +329 -0
  17. data/core/lib/Time/Stamp.pm +808 -0
  18. data/core/load-prefixes.pl +43 -0
  19. data/core/nonbreaking_prefixes/abbreviation_list.kaf +0 -0
  20. data/core/nonbreaking_prefixes/abbreviation_list.txt +444 -0
  21. data/core/nonbreaking_prefixes/nonbreaking_prefix.ca +533 -0
  22. data/core/nonbreaking_prefixes/nonbreaking_prefix.de +781 -0
  23. data/core/nonbreaking_prefixes/nonbreaking_prefix.el +448 -0
  24. data/core/nonbreaking_prefixes/nonbreaking_prefix.en +564 -0
  25. data/core/nonbreaking_prefixes/nonbreaking_prefix.es +758 -0
  26. data/core/nonbreaking_prefixes/nonbreaking_prefix.fr +1027 -0
  27. data/core/nonbreaking_prefixes/nonbreaking_prefix.is +697 -0
  28. data/core/nonbreaking_prefixes/nonbreaking_prefix.it +641 -0
  29. data/core/nonbreaking_prefixes/nonbreaking_prefix.nl +739 -0
  30. data/core/nonbreaking_prefixes/nonbreaking_prefix.pl +729 -0
  31. data/core/nonbreaking_prefixes/nonbreaking_prefix.pt +656 -0
  32. data/core/nonbreaking_prefixes/nonbreaking_prefix.ro +484 -0
  33. data/core/nonbreaking_prefixes/nonbreaking_prefix.ru +705 -0
  34. data/core/nonbreaking_prefixes/nonbreaking_prefix.sk +920 -0
  35. data/core/nonbreaking_prefixes/nonbreaking_prefix.sl +524 -0
  36. data/core/nonbreaking_prefixes/nonbreaking_prefix.sv +492 -0
  37. data/core/split-sentences.pl +114 -0
  38. data/core/text-fixer.pl +169 -0
  39. data/core/tokenizer-cli.pl +363 -0
  40. data/core/tokenizer.pl +145 -0
  41. data/lib/opener/tokenizers/base.rb +84 -0
  42. data/lib/opener/tokenizers/base/version.rb +8 -0
  43. data/opener-tokenizer-base.gemspec +25 -0
  44. metadata +134 -0
@@ -0,0 +1,641 @@
1
+ #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
2
+ #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
3
+
4
+ #Any single upper-case letter followed by a period is not a sentence ender ("I." occasionally is, but we leave it in)
5
+ #usually upper case letters are initials in a name
6
+ A
7
+ B
8
+ C
9
+ D
10
+ E
11
+ F
12
+ G
13
+ H
14
+ I
15
+ J
16
+ K
17
+ L
18
+ M
19
+ N
20
+ O
21
+ P
22
+ Q
23
+ R
24
+ S
25
+ T
26
+ U
27
+ V
28
+ W
29
+ X
30
+ Y
31
+ Z
32
+
33
+ #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
34
+ Adj
35
+ Adm
36
+ Adv
37
+ Amn
38
+ Arch
39
+ Asst
40
+ Avv
41
+ Bart
42
+ Bcc
43
+ Bldg
44
+ Brig
45
+ Bros
46
+ C.A.P
47
+ C.P
48
+ Capt
49
+ Cc
50
+ Cmdr
51
+ Co
52
+ Col
53
+ Comdr
54
+ Con
55
+ Corp
56
+ Cpl
57
+ DR
58
+ Dott
59
+ Dr
60
+ Drs
61
+ Egr
62
+ Ens
63
+ Gen
64
+ Genn
65
+ Geom
66
+ Gov
67
+ Hon
68
+ Hosp
69
+ Hr
70
+ Id
71
+ Ing
72
+ Insp
73
+ Lt
74
+ MM
75
+ MR
76
+ MRS
77
+ MS
78
+ Maj
79
+ Messrs
80
+ Mlle
81
+ Mme
82
+ Mo
83
+ Mons
84
+ Mr
85
+ Mrs
86
+ Ms
87
+ Msgr
88
+ N.B
89
+ Op
90
+ Ord
91
+ P.S
92
+ P.T
93
+ Pfc
94
+ Ph
95
+ Prof
96
+ Pvt
97
+ RP
98
+ RSVP
99
+ Rag
100
+ Rep
101
+ Reps
102
+ Res
103
+ Rev
104
+ Rif
105
+ Rt
106
+ S.A
107
+ S.B.F
108
+ S.P.M
109
+ S.p.A
110
+ S.r.l
111
+ Sen
112
+ Sens
113
+ Sfc
114
+ Sgt
115
+ SGT
116
+ Sig
117
+ Sigg
118
+ Soc
119
+ Spett
120
+ Sr
121
+ St
122
+ Supt
123
+ Surg
124
+ V.P
125
+
126
+ # other
127
+ a.c
128
+ acc
129
+ all
130
+ banc
131
+ c.a
132
+ c.c.p
133
+ c.m
134
+ c.p
135
+ c.s
136
+ c.v
137
+ corr
138
+ dott
139
+ e.p.c
140
+ ecc
141
+ es
142
+ fatt
143
+ gg
144
+ int
145
+ lett
146
+ ogg
147
+ on
148
+ p.c
149
+ p.c.c
150
+ p.es
151
+ p.f
152
+ p.r
153
+ p.v
154
+ post
155
+ pp
156
+ racc
157
+ ric
158
+ s.n.c
159
+ seg
160
+ sgg
161
+ ss
162
+ tel
163
+ u.s
164
+ v.r
165
+ v.s
166
+
167
+ #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
168
+ v
169
+ vs
170
+ i.e
171
+ rev
172
+ e.g
173
+
174
+ #Numbers only. These act as non-breaking prefixes ONLY when followed by a numeric sequence; otherwise the period may end a sentence
175
+ # add the marker #NUMERIC_ONLY# after the word to enable this behaviour
176
+ #This case is mostly for the English "No." which can either be a sentence of its own, or
177
+ #if followed by a number, a non-breaking prefix
178
+ No #NUMERIC_ONLY#
179
+ Nos
180
+ Art #NUMERIC_ONLY#
181
+ Nr
182
+ pp #NUMERIC_ONLY#
183
+
184
+ #unified abbreviation list
185
+ Acad
186
+ Adj
187
+ Adm
188
+ Adv
189
+ Affl
190
+ Ag
191
+ Apr
192
+ Art
193
+ Asst
194
+ Av
195
+ Avg
196
+ B.ches-du-Rh
197
+ Bart
198
+ Bco
199
+ Bldg
200
+ Brig
201
+ Bros
202
+ C.a
203
+ C.p.c.n
204
+ Ca
205
+ Capt
206
+ Cdt
207
+ Cf
208
+ Ch.-Mme
209
+ Chap
210
+ Cie
211
+ Cmdr
212
+ Col
213
+ Comdr
214
+ Con
215
+ Corp
216
+ Cpl
217
+ DR
218
+ DRA
219
+ Da
220
+ Dec
221
+ Dep
222
+ Dic
223
+ Dn
224
+ Dr
225
+ Dra
226
+ Dras
227
+ Drs
228
+ Eng
229
+ Enga
230
+ Engas
231
+ Engos
232
+ Ens
233
+ Ets
234
+ Euro
235
+ Ev
236
+ Ex
237
+ Excmo
238
+ Exmo
239
+ Exo
240
+ Fa
241
+ Fco
242
+ Feb
243
+ Febbr
244
+ Fig
245
+ Fr
246
+ Gar
247
+ Gen
248
+ Gir
249
+ Gl
250
+ Gov
251
+ Hno
252
+ Hon
253
+ Hosp
254
+ Hr
255
+ Ilmo
256
+ Insp
257
+ J.-C
258
+ Jan
259
+ Jeu
260
+ Jr
261
+ Jul
262
+ Jun
263
+ Lda
264
+ Lieut
265
+ Lt
266
+ Lun
267
+ MM
268
+ MR
269
+ MRS
270
+ MS
271
+ MSc
272
+ Magg
273
+ Maj
274
+ Mar
275
+ Me
276
+ Mej
277
+ Mer
278
+ Mes
279
+ Messrs
280
+ Mgr
281
+ Mgrs
282
+ Mll
283
+ Mlle
284
+ Mlle(s)
285
+ Mme
286
+ Mme(s)
287
+ Mr
288
+ Mrs
289
+ Ms
290
+ Msgr
291
+ Mw
292
+ Nov
293
+ Npr
294
+ Nr
295
+ O.d.J
296
+ Okt
297
+ Op
298
+ Ord
299
+ Ott
300
+ Oz
301
+ P
302
+ P.D
303
+ P.ej
304
+ P.p.c
305
+ Pas
306
+ Pfc
307
+ Ph
308
+ Prim
309
+ Prof
310
+ Pte
311
+ Pts
312
+ Pvt
313
+ Rep
314
+ Reps
315
+ Res
316
+ Rev
317
+ Revd
318
+ Rh
319
+ Riv
320
+ Rt
321
+ S.Em
322
+ S.Exc
323
+ S.a.r.l
324
+ Sen
325
+ Sens
326
+ Sep
327
+ Sept
328
+ Sett
329
+ Sfc
330
+ Sgt
331
+ Sl
332
+ Sr
333
+ Sra
334
+ Sras
335
+ Srs
336
+ Srta
337
+ St
338
+ ST
339
+ Sta
340
+ Ste
341
+ Sto
342
+ Supt
343
+ Surg
344
+ Tj
345
+ Tr
346
+ Ud
347
+ Uds
348
+ V.Exc
349
+ Vd
350
+ Vda
351
+ Vds
352
+ Vz
353
+ Z.D
354
+ Z.D.H
355
+ Z.E
356
+ Z.Em
357
+ Z.H
358
+ Z.K.H
359
+ Z.K.M
360
+ Z.M
361
+ a
362
+ a./s
363
+ a.C
364
+ a.g.v
365
+ a.l
366
+ abrev
367
+ abs
368
+ ac
369
+ acc
370
+ acron
371
+ adj
372
+ adm
373
+ adr
374
+ adv
375
+ ag
376
+ alt
377
+ anal
378
+ anat
379
+ angl
380
+ appos
381
+ apr
382
+ apr
383
+ asc
384
+ atm
385
+ auj
386
+ aux
387
+ av
388
+ avg
389
+ avr
390
+ b
391
+ b.a.o
392
+ b.a.p
393
+ b.a.r
394
+ bacc
395
+ bat
396
+ bc
397
+ bd
398
+ bde
399
+ bgen
400
+ bijv
401
+ bijz
402
+ br
403
+ bv
404
+ c
405
+ c.-a-d
406
+ c.a.f
407
+ c.i
408
+ cc
409
+ cf
410
+ cft
411
+ ch
412
+ ch.-l
413
+ chbre
414
+ chbs
415
+ chf
416
+ col
417
+ coll
418
+ cpl
419
+ cpt
420
+ cpte
421
+ cta
422
+ d
423
+ d.c
424
+ d.w.z
425
+ dcha
426
+ dec
427
+ def
428
+ dem
429
+ dep
430
+ dept
431
+ dhr
432
+ dic
433
+ dipl
434
+ dispo
435
+ div
436
+ dpto
437
+ dr
438
+ dr.h.c
439
+ dra
440
+ dras
441
+ drs
442
+ ds
443
+ dz
444
+ e.c
445
+ e.g
446
+ e.g
447
+ e.k
448
+ eccles
449
+ ecol
450
+ econ
451
+ ed
452
+ ej
453
+ env
454
+ ep
455
+ eq
456
+ et
457
+ etc
458
+ ev
459
+ ex
460
+ exmo
461
+ exo
462
+ exp
463
+ expo
464
+ f.a.c
465
+ fa
466
+ fam
467
+ fasc
468
+ fbg
469
+ feb
470
+ febbr
471
+ fem
472
+ fevr
473
+ ff
474
+ fl
475
+ fol
476
+ fr
477
+ fs
478
+ fut
479
+ gd
480
+ gde
481
+ gdes
482
+ gds
483
+ gen
484
+ genn
485
+ gl
486
+ grd
487
+ h.-t
488
+ hab
489
+ i.e
490
+ i.p.v
491
+ i.s.m
492
+ i.t.t
493
+ i.v.m
494
+ ibid
495
+ id
496
+ imp
497
+ ing
498
+ ir
499
+ iron
500
+ itd
501
+ itn
502
+ itp
503
+ izq
504
+ j
505
+ janv
506
+ jhr
507
+ jkvr
508
+ jr
509
+ l
510
+ lat
511
+ lex
512
+ lgen
513
+ lib
514
+ lieut
515
+ liv
516
+ lkol
517
+ loc
518
+ lof
519
+ m
520
+ m.a.w
521
+ m.b.t
522
+ m.b.v
523
+ m.h.o
524
+ m.i
525
+ m.i.v
526
+ magg
527
+ maj
528
+ mar
529
+ mas
530
+ max
531
+ med
532
+ mevr
533
+ min
534
+ mll
535
+ mr
536
+ ms
537
+ mtr
538
+ mtrs
539
+ n
540
+ n
541
+ n.f
542
+ n.f.pl
543
+ n.m
544
+ n.m.pl
545
+ nov
546
+ npr
547
+ o
548
+ o.b.s
549
+ obs
550
+ oct
551
+ okt
552
+ ord
553
+ ott
554
+ oz
555
+ p
556
+ p
557
+ p.a
558
+ p.ej
559
+ p.ex
560
+ p.g.c.d
561
+ p.i
562
+ p.j
563
+ p.m
564
+ p.o
565
+ p.p
566
+ p.p.c.d
567
+ p.p.c.m
568
+ p.pa
569
+ p.pr
570
+ pl
571
+ plv
572
+ poe
573
+ pp
574
+ pp
575
+ pr
576
+ pr
577
+ pres
578
+ prev
579
+ prof
580
+ px
581
+ q.s
582
+ qqch
583
+ qqf
584
+ qqn
585
+ qqns
586
+ r.-de-ch
587
+ r.p.m
588
+ rc
589
+ rd
590
+ ref
591
+ refl
592
+ reg
593
+ rev
594
+ ro
595
+ rte
596
+ s
597
+ s
598
+ s.a
599
+ s.b.f
600
+ s.d
601
+ s.e
602
+ s.l
603
+ s.l.n.d
604
+ s.l.p
605
+ s.t.p
606
+ s.v.p
607
+ s/c
608
+ sc
609
+ sett
610
+ sf
611
+ sgt
612
+ sl
613
+ sr
614
+ sra
615
+ sras
616
+ srs
617
+ ss
618
+ sto
619
+ t
620
+ t.s.v.p
621
+ tec
622
+ tel
623
+ terr
624
+ tg
625
+ tint
626
+ tit
627
+ tj
628
+ tr
629
+ travx
630
+ v
631
+ v.intr
632
+ v.tr
633
+ v.w.t
634
+ var
635
+ vs
636
+ vta
637
+ vx
638
+ z.v
639
+ zool
640
+ Št
641
+ št