opener-tokenizer-base 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +148 -0
  3. data/bin/tokenizer-base +5 -0
  4. data/bin/tokenizer-de +5 -0
  5. data/bin/tokenizer-en +5 -0
  6. data/bin/tokenizer-es +5 -0
  7. data/bin/tokenizer-fr +5 -0
  8. data/bin/tokenizer-it +5 -0
  9. data/bin/tokenizer-nl +5 -0
  10. data/core/lib/Data/OptList.pm +256 -0
  11. data/core/lib/Params/Util.pm +866 -0
  12. data/core/lib/Sub/Exporter.pm +1101 -0
  13. data/core/lib/Sub/Exporter/Cookbook.pod +309 -0
  14. data/core/lib/Sub/Exporter/Tutorial.pod +280 -0
  15. data/core/lib/Sub/Exporter/Util.pm +354 -0
  16. data/core/lib/Sub/Install.pm +329 -0
  17. data/core/lib/Time/Stamp.pm +808 -0
  18. data/core/load-prefixes.pl +43 -0
  19. data/core/nonbreaking_prefixes/abbreviation_list.kaf +0 -0
  20. data/core/nonbreaking_prefixes/abbreviation_list.txt +444 -0
  21. data/core/nonbreaking_prefixes/nonbreaking_prefix.ca +533 -0
  22. data/core/nonbreaking_prefixes/nonbreaking_prefix.de +781 -0
  23. data/core/nonbreaking_prefixes/nonbreaking_prefix.el +448 -0
  24. data/core/nonbreaking_prefixes/nonbreaking_prefix.en +564 -0
  25. data/core/nonbreaking_prefixes/nonbreaking_prefix.es +758 -0
  26. data/core/nonbreaking_prefixes/nonbreaking_prefix.fr +1027 -0
  27. data/core/nonbreaking_prefixes/nonbreaking_prefix.is +697 -0
  28. data/core/nonbreaking_prefixes/nonbreaking_prefix.it +641 -0
  29. data/core/nonbreaking_prefixes/nonbreaking_prefix.nl +739 -0
  30. data/core/nonbreaking_prefixes/nonbreaking_prefix.pl +729 -0
  31. data/core/nonbreaking_prefixes/nonbreaking_prefix.pt +656 -0
  32. data/core/nonbreaking_prefixes/nonbreaking_prefix.ro +484 -0
  33. data/core/nonbreaking_prefixes/nonbreaking_prefix.ru +705 -0
  34. data/core/nonbreaking_prefixes/nonbreaking_prefix.sk +920 -0
  35. data/core/nonbreaking_prefixes/nonbreaking_prefix.sl +524 -0
  36. data/core/nonbreaking_prefixes/nonbreaking_prefix.sv +492 -0
  37. data/core/split-sentences.pl +114 -0
  38. data/core/text-fixer.pl +169 -0
  39. data/core/tokenizer-cli.pl +363 -0
  40. data/core/tokenizer.pl +145 -0
  41. data/lib/opener/tokenizers/base.rb +84 -0
  42. data/lib/opener/tokenizers/base/version.rb +8 -0
  43. data/opener-tokenizer-base.gemspec +25 -0
  44. metadata +134 -0
@@ -0,0 +1,781 @@
1
+ #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
2
+ #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
3
+
4
+ #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
5
+ #usually upper case letters are initials in a name
6
+ #no German words end in single lower-case letters, so we throw those in too.
7
+ A
8
+ B
9
+ C
10
+ D
11
+ E
12
+ F
13
+ G
14
+ H
15
+ I
16
+ J
17
+ K
18
+ L
19
+ M
20
+ N
21
+ O
22
+ P
23
+ Q
24
+ R
25
+ S
26
+ T
27
+ U
28
+ V
29
+ W
30
+ X
31
+ Y
32
+ Z
33
+ a
34
+ b
35
+ c
36
+ d
37
+ e
38
+ f
39
+ g
40
+ h
41
+ i
42
+ j
43
+ k
44
+ l
45
+ m
46
+ n
47
+ o
48
+ p
49
+ q
50
+ r
51
+ s
52
+ t
53
+ u
54
+ v
55
+ w
56
+ x
57
+ y
58
+ z
59
+
60
+
61
+ #Roman Numerals. A dot after one of these is not a sentence break in German.
62
+ I
63
+ II
64
+ III
65
+ IV
66
+ V
67
+ VI
68
+ VII
69
+ VIII
70
+ IX
71
+ X
72
+ XI
73
+ XII
74
+ XIII
75
+ XIV
76
+ XV
77
+ XVI
78
+ XVII
79
+ XVIII
80
+ XIX
81
+ XX
82
+ i
83
+ ii
84
+ iii
85
+ iv
86
+ v
87
+ vi
88
+ vii
89
+ viii
90
+ ix
91
+ x
92
+ xi
93
+ xii
94
+ xiii
95
+ xiv
96
+ xv
97
+ xvi
98
+ xvii
99
+ xviii
100
+ xix
101
+ xx
102
+
103
+ #Titles and Honorifics
104
+ Adj
105
+ Adm
106
+ Adv
107
+ Asst
108
+ Bart
109
+ Bldg
110
+ Brig
111
+ Bros
112
+ Capt
113
+ Cmdr
114
+ Col
115
+ Comdr
116
+ Con
117
+ Corp
118
+ Cpl
119
+ DR
120
+ Dr
121
+ Ens
122
+ Gen
123
+ Gov
124
+ Hon
125
+ Hosp
126
+ Insp
127
+ Lt
128
+ MM
129
+ MR
130
+ MRS
131
+ MS
132
+ Maj
133
+ Messrs
134
+ Mlle
135
+ Mme
136
+ Mr
137
+ Mrs
138
+ Ms
139
+ Msgr
140
+ Op
141
+ Ord
142
+ Pfc
143
+ Ph
144
+ Prof
145
+ Pvt
146
+ Rep
147
+ Reps
148
+ Res
149
+ Rev
150
+ Rt
151
+ Sen
152
+ Sens
153
+ Sfc
154
+ Sgt
155
+ Sr
156
+ St
157
+ Supt
158
+ Surg
159
+
160
+ #Misc abbreviations
161
+ Mio
162
+ Mrd
163
+ bzw
164
+ v
165
+ vs
166
+ usw
167
+ d.h
168
+ z.B
169
+ u.a
170
+ etc
171
+ Mrd
172
+ MwSt
173
+ ggf
174
+ d.J
175
+ D.h
176
+ m.E
177
+ vgl
178
+ I.F
179
+ z.T
180
+ sogen
181
+ ff
182
+ u.E
183
+ g.U
184
+ g.g.A
185
+ c.-à-d
186
+ Buchst
187
+ u.s.w
188
+ sog
189
+ u.ä
190
+ Std
191
+ evtl
192
+ Zt
193
+ Chr
194
+ u.U
195
+ o.ä
196
+ Ltd
197
+ b.A
198
+ z.Zt
199
+ spp
200
+ sen
201
+ SA
202
+ k.o
203
+ jun
204
+ i.H.v
205
+ dgl
206
+ dergl
207
+ Co
208
+ zzt
209
+ usf
210
+ s.p.a
211
+ Dkr
212
+ Corp
213
+ bzgl
214
+ BSE
215
+
216
+ #Number indicators
217
+ # add #NUMERIC_ONLY# after the word if it should ONLY be non-breaking when a 0-9 digit follows it
218
+ No
219
+ Nos
220
+ Art
221
+ Nr
222
+ pp
223
+ ca
224
+ Ca
225
+
226
+ #Ordinals are written with a trailing period in German - "1." = "1st" in English
227
+ 1
228
+ 2
229
+ 3
230
+ 4
231
+ 5
232
+ 6
233
+ 7
234
+ 8
235
+ 9
236
+ 10
237
+ 11
238
+ 12
239
+ 13
240
+ 14
241
+ 15
242
+ 16
243
+ 17
244
+ 18
245
+ 19
246
+ 20
247
+ 21
248
+ 22
249
+ 23
250
+ 24
251
+ 25
252
+ 26
253
+ 27
254
+ 28
255
+ 29
256
+ 30
257
+ 31
258
+ 32
259
+ 33
260
+ 34
261
+ 35
262
+ 36
263
+ 37
264
+ 38
265
+ 39
266
+ 40
267
+ 41
268
+ 42
269
+ 43
270
+ 44
271
+ 45
272
+ 46
273
+ 47
274
+ 48
275
+ 49
276
+ 50
277
+ 51
278
+ 52
279
+ 53
280
+ 54
281
+ 55
282
+ 56
283
+ 57
284
+ 58
285
+ 59
286
+ 60
287
+ 61
288
+ 62
289
+ 63
290
+ 64
291
+ 65
292
+ 66
293
+ 67
294
+ 68
295
+ 69
296
+ 70
297
+ 71
298
+ 72
299
+ 73
300
+ 74
301
+ 75
302
+ 76
303
+ 77
304
+ 78
305
+ 79
306
+ 80
307
+ 81
308
+ 82
309
+ 83
310
+ 84
311
+ 85
312
+ 86
313
+ 87
314
+ 88
315
+ 89
316
+ 90
317
+ 91
318
+ 92
319
+ 93
320
+ 94
321
+ 95
322
+ 96
323
+ 97
324
+ 98
325
+ 99
326
+
327
+ #unified abbreviation list
328
+ Acad
329
+ Adj
330
+ Adm
331
+ Adv
332
+ Affl
333
+ Apr
334
+ Art
335
+ Asst
336
+ Aug
337
+ Av
338
+ Avg
339
+ B.ches-du-Rh
340
+ Bart
341
+ Bco
342
+ Bldg
343
+ Brig
344
+ Bros
345
+ C.a
346
+ C.p.c.n
347
+ Ca
348
+ Capt
349
+ Cdt
350
+ Cf
351
+ Ch.-Mme
352
+ Chap
353
+ Cie
354
+ Cmdr
355
+ Col
356
+ Comdr
357
+ Con
358
+ Corp
359
+ Cpl
360
+ DR
361
+ DRA
362
+ Da
363
+ Dec
364
+ Dep
365
+ Dez
366
+ Dn
367
+ Dr
368
+ Dra
369
+ Dras
370
+ Drs
371
+ Eng
372
+ Enga
373
+ Engas
374
+ Engos
375
+ Ens
376
+ Ets
377
+ Euro
378
+ Ev
379
+ Ex
380
+ Excmo
381
+ Exmo
382
+ Exo
383
+ Fa
384
+ Fco
385
+ Feb
386
+ Fig
387
+ Fr
388
+ Gar
389
+ Gen
390
+ Gir
391
+ Gl
392
+ Gov
393
+ Hno
394
+ Hon
395
+ Hosp
396
+ Hr
397
+ Ilmo
398
+ Insp
399
+ J.-C
400
+ Jan
401
+ Jän
402
+ Jeu
403
+ Jr
404
+ Jul
405
+ Jun
406
+ Lda
407
+ Lieut
408
+ Lt
409
+ Lun
410
+ MM
411
+ MR
412
+ MRS
413
+ MS
414
+ MSc
415
+ Maj
416
+ Mar
417
+ Me
418
+ Mej
419
+ Mer
420
+ Mes
421
+ Messrs
422
+ Mgr
423
+ Mgrs
424
+ Mll
425
+ Mlle
426
+ Mlle(s)
427
+ Mme
428
+ Mme(s)
429
+ Mr
430
+ Mrs
431
+ Ms
432
+ Msgr
433
+ Mw
434
+ Nov
435
+ Npr
436
+ Nr
437
+ O.d.J
438
+ Okt
439
+ Op
440
+ Ord
441
+ Oz
442
+ P
443
+ P.D
444
+ P.ej
445
+ P.p.c
446
+ Pas
447
+ Pfc
448
+ Ph
449
+ Prim
450
+ Prof
451
+ Pte
452
+ Pts
453
+ Pvt
454
+ Rep
455
+ Reps
456
+ Res
457
+ Rev
458
+ Revd
459
+ Rh
460
+ Riv
461
+ Rt
462
+ S.Em
463
+ S.Exc
464
+ S.a.r.l
465
+ Sen
466
+ Sens
467
+ Sep
468
+ Sept
469
+ Sfc
470
+ Sgt
471
+ SGT
472
+ Sl
473
+ Sr
474
+ Sra
475
+ Sras
476
+ Srs
477
+ Srta
478
+ St
479
+ ST
480
+ Sta
481
+ Ste
482
+ Sto
483
+ Supt
484
+ Surg
485
+ Tj
486
+ Tr
487
+ Ud
488
+ Uds
489
+ V.Exc
490
+ Vd
491
+ Vda
492
+ Vds
493
+ Vz
494
+ Z.D
495
+ Z.D.H
496
+ Z.E
497
+ Z.Em
498
+ Z.H
499
+ Z.K.H
500
+ Z.K.M
501
+ Z.M
502
+ a
503
+ a./s
504
+ a.C
505
+ a.g.v
506
+ a.l
507
+ abrev
508
+ abs
509
+ ac
510
+ acc
511
+ acron
512
+ adj
513
+ adm
514
+ adr
515
+ adv
516
+ alt
517
+ anal
518
+ anat
519
+ angl
520
+ appos
521
+ apr
522
+ apr
523
+ asc
524
+ atm
525
+ aug
526
+ auj
527
+ aux
528
+ av
529
+ avg
530
+ avr
531
+ b
532
+ b.a.o
533
+ b.a.p
534
+ b.a.r
535
+ bacc
536
+ bat
537
+ bc
538
+ bd
539
+ bde
540
+ bgen
541
+ bijv
542
+ bijz
543
+ br
544
+ bv
545
+ c
546
+ c.-a-d
547
+ c.a.f
548
+ c.i
549
+ cc
550
+ cf
551
+ cft
552
+ ch
553
+ ch.-l
554
+ chbre
555
+ chbs
556
+ chf
557
+ col
558
+ coll
559
+ cpl
560
+ cpt
561
+ cpte
562
+ cta
563
+ d
564
+ d.c
565
+ d.w.z
566
+ dcha
567
+ dec
568
+ def
569
+ dem
570
+ dep
571
+ dept
572
+ dez
573
+ dhr
574
+ dipl
575
+ dispo
576
+ div
577
+ dpto
578
+ dr
579
+ dr.h.c
580
+ dra
581
+ dras
582
+ drs
583
+ ds
584
+ dz
585
+ e.c
586
+ e.g
587
+ e.g
588
+ e.k
589
+ eccles
590
+ ecol
591
+ econ
592
+ ed
593
+ ej
594
+ env
595
+ ep
596
+ eq
597
+ et
598
+ etc
599
+ ev
600
+ ex
601
+ exmo
602
+ exo
603
+ exp
604
+ expo
605
+ f.a.c
606
+ fa
607
+ fam
608
+ fasc
609
+ fbg
610
+ feb
611
+ fem
612
+ fevr
613
+ ff
614
+ fl
615
+ fol
616
+ fr
617
+ fs
618
+ fut
619
+ gd
620
+ gde
621
+ gdes
622
+ gds
623
+ gen
624
+ gl
625
+ grd
626
+ h.-t
627
+ hab
628
+ i.e
629
+ i.p.v
630
+ i.s.m
631
+ i.t.t
632
+ i.v.m
633
+ ibid
634
+ id
635
+ imp
636
+ ing
637
+ ir
638
+ iron
639
+ itd
640
+ itn
641
+ itp
642
+ izq
643
+ j
644
+ jan
645
+ jän
646
+ janv
647
+ jhr
648
+ jkvr
649
+ jr
650
+ l
651
+ lat
652
+ lex
653
+ lgen
654
+ lib
655
+ lieut
656
+ liv
657
+ lkol
658
+ loc
659
+ lof
660
+ m
661
+ m.a.w
662
+ m.b.t
663
+ m.b.v
664
+ m.h.o
665
+ m.i
666
+ m.i.v
667
+ maj
668
+ mar
669
+ mas
670
+ max
671
+ med
672
+ mevr
673
+ min
674
+ mll
675
+ mr
676
+ ms
677
+ mtr
678
+ mtrs
679
+ n
680
+ n
681
+ n.f
682
+ n.f.pl
683
+ n.m
684
+ n.m.pl
685
+ nov
686
+ npr
687
+ o
688
+ o.b.s
689
+ obs
690
+ oct
691
+ okt
692
+ ord
693
+ oz
694
+ p
695
+ p
696
+ p.a
697
+ p.ej
698
+ p.ex
699
+ p.g.c.d
700
+ p.i
701
+ p.j
702
+ p.m
703
+ p.o
704
+ p.p
705
+ p.p.c.d
706
+ p.p.c.m
707
+ p.pa
708
+ p.pr
709
+ pl
710
+ plv
711
+ poe
712
+ pp
713
+ pp
714
+ pr
715
+ pr
716
+ pres
717
+ prev
718
+ prof
719
+ px
720
+ q.s
721
+ qqch
722
+ qqf
723
+ qqn
724
+ qqns
725
+ r.-de-ch
726
+ r.p.m
727
+ rc
728
+ rd
729
+ ref
730
+ refl
731
+ reg
732
+ rev
733
+ ro
734
+ rte
735
+ s
736
+ s
737
+ s.a
738
+ s.b.f
739
+ s.d
740
+ s.e
741
+ s.l
742
+ s.l.n.d
743
+ s.l.p
744
+ s.t.p
745
+ s.v.p
746
+ s/c
747
+ sc
748
+ sep
749
+ sept
750
+ sf
751
+ sgt
752
+ sl
753
+ sr
754
+ sra
755
+ sras
756
+ srs
757
+ ss
758
+ sto
759
+ t
760
+ t.s.v.p
761
+ tec
762
+ tel
763
+ terr
764
+ tg
765
+ tint
766
+ tit
767
+ tj
768
+ tr
769
+ travx
770
+ v
771
+ v.intr
772
+ v.tr
773
+ v.w.t
774
+ var
775
+ vs
776
+ vta
777
+ vx
778
+ z.v
779
+ zool
780
+ Št
781
+ št