opener-tokenizer-base 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +148 -0
  3. data/bin/tokenizer-base +5 -0
  4. data/bin/tokenizer-de +5 -0
  5. data/bin/tokenizer-en +5 -0
  6. data/bin/tokenizer-es +5 -0
  7. data/bin/tokenizer-fr +5 -0
  8. data/bin/tokenizer-it +5 -0
  9. data/bin/tokenizer-nl +5 -0
  10. data/core/lib/Data/OptList.pm +256 -0
  11. data/core/lib/Params/Util.pm +866 -0
  12. data/core/lib/Sub/Exporter.pm +1101 -0
  13. data/core/lib/Sub/Exporter/Cookbook.pod +309 -0
  14. data/core/lib/Sub/Exporter/Tutorial.pod +280 -0
  15. data/core/lib/Sub/Exporter/Util.pm +354 -0
  16. data/core/lib/Sub/Install.pm +329 -0
  17. data/core/lib/Time/Stamp.pm +808 -0
  18. data/core/load-prefixes.pl +43 -0
  19. data/core/nonbreaking_prefixes/abbreviation_list.kaf +0 -0
  20. data/core/nonbreaking_prefixes/abbreviation_list.txt +444 -0
  21. data/core/nonbreaking_prefixes/nonbreaking_prefix.ca +533 -0
  22. data/core/nonbreaking_prefixes/nonbreaking_prefix.de +781 -0
  23. data/core/nonbreaking_prefixes/nonbreaking_prefix.el +448 -0
  24. data/core/nonbreaking_prefixes/nonbreaking_prefix.en +564 -0
  25. data/core/nonbreaking_prefixes/nonbreaking_prefix.es +758 -0
  26. data/core/nonbreaking_prefixes/nonbreaking_prefix.fr +1027 -0
  27. data/core/nonbreaking_prefixes/nonbreaking_prefix.is +697 -0
  28. data/core/nonbreaking_prefixes/nonbreaking_prefix.it +641 -0
  29. data/core/nonbreaking_prefixes/nonbreaking_prefix.nl +739 -0
  30. data/core/nonbreaking_prefixes/nonbreaking_prefix.pl +729 -0
  31. data/core/nonbreaking_prefixes/nonbreaking_prefix.pt +656 -0
  32. data/core/nonbreaking_prefixes/nonbreaking_prefix.ro +484 -0
  33. data/core/nonbreaking_prefixes/nonbreaking_prefix.ru +705 -0
  34. data/core/nonbreaking_prefixes/nonbreaking_prefix.sk +920 -0
  35. data/core/nonbreaking_prefixes/nonbreaking_prefix.sl +524 -0
  36. data/core/nonbreaking_prefixes/nonbreaking_prefix.sv +492 -0
  37. data/core/split-sentences.pl +114 -0
  38. data/core/text-fixer.pl +169 -0
  39. data/core/tokenizer-cli.pl +363 -0
  40. data/core/tokenizer.pl +145 -0
  41. data/lib/opener/tokenizers/base.rb +84 -0
  42. data/lib/opener/tokenizers/base/version.rb +8 -0
  43. data/opener-tokenizer-base.gemspec +25 -0
  44. metadata +134 -0
@@ -0,0 +1,739 @@
1
+ #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
2
+ #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
3
+ #Sources: http://nl.wikipedia.org/wiki/Lijst_van_afkortingen
4
+ # http://nl.wikipedia.org/wiki/Aanspreekvorm
5
+ # http://nl.wikipedia.org/wiki/Titulatuur_in_het_Nederlands_hoger_onderwijs
6
+ #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
7
+ #usually upper case letters are initials in a name
8
+ A
9
+ B
10
+ C
11
+ D
12
+ E
13
+ F
14
+ G
15
+ H
16
+ I
17
+ J
18
+ K
19
+ L
20
+ M
21
+ N
22
+ O
23
+ P
24
+ Q
25
+ R
26
+ S
27
+ T
28
+ U
29
+ V
30
+ W
31
+ X
32
+ Y
33
+ Z
34
+
35
+ #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
36
+ bacc
37
+ bc
38
+ bgen
39
+ c.i
40
+ dhr
41
+ dr
42
+ dr.h.c
43
+ drs
44
+ ds
45
+ eint
46
+ fa
47
+ Fa
48
+ fam
49
+ gen
50
+ genm
51
+ ing
52
+ ir
53
+ jhr
54
+ jkvr
55
+ jr
56
+ kand
57
+ kol
58
+ lgen
59
+ lkol
60
+ Lt
61
+ maj
62
+ Mej
63
+ mevr
64
+ Mme
65
+ mr
66
+ Mw
67
+ o.b.s
68
+ plv
69
+ prof
70
+ ritm
71
+ tint
72
+ Vz
73
+ Z.D
74
+ Z.D.H
75
+ Z.E
76
+ Z.Em
77
+ Z.H
78
+ Z.K.H
79
+ Z.K.M
80
+ Z.M
81
+ z.v
82
+
83
+ #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
84
+ #we seem to have a lot of these in Dutch, e.g. i.p.v - in plaats van (instead of) - which never ends a sentence
85
+ a.g.v
86
+ bijv
87
+ bijz
88
+ bv
89
+ d.w.z
90
+ e.c
91
+ e.g
92
+ e.k
93
+ ev
94
+ i.p.v
95
+ i.s.m
96
+ i.t.t
97
+ i.v.m
98
+ m.a.w
99
+ m.b.t
100
+ m.b.v
101
+ m.h.o
102
+ m.i
103
+ m.i.v
104
+ v.w.t
105
+
106
+ #Numbers only. These act as non-breaking prefixes only when followed by a numeric sequence
107
+ # add NUMERIC_ONLY after the word for this function
108
+ #This case is mostly for the English "No." which can either be a sentence of its own, or
109
+ #if followed by a number, a non-breaking prefix
110
+ Nr #NUMERIC_ONLY#
111
+ Nrs
112
+ nrs
113
+ nr #NUMERIC_ONLY#
114
+
115
+ #others
116
+ a.g.v
117
+ aanh
118
+ aanw
119
+ aardew
120
+ aardr
121
+ abs
122
+ abstr
123
+ adj
124
+ adm
125
+ afb
126
+ afd
127
+ afk
128
+ afl
129
+ Afr
130
+ al
131
+ ald
132
+ alg
133
+ Am
134
+ amb
135
+ ambt
136
+ anat
137
+ antrop
138
+ apoth
139
+ Ar
140
+ arch
141
+ archeol
142
+ art
143
+ bacc
144
+ betr
145
+ bez
146
+ bgen
147
+ bibl
148
+ bijl
149
+ blz
150
+ Br
151
+ bv
152
+ bw
153
+ c.i
154
+ ca
155
+ cat
156
+ centr
157
+ cf
158
+ cfr
159
+ Cie
160
+ cmpl
161
+ Comp
162
+ conf
163
+ ct
164
+ d.w.z
165
+ dal
166
+ derg
167
+ Dhr
168
+ dir
169
+ div
170
+ Dr
171
+ dr
172
+ dr.h.c
173
+ dra
174
+ e.c
175
+ e.g
176
+ e.k
177
+ ed
178
+ eint
179
+ Em
180
+ em
181
+ enz
182
+ etc
183
+ excl
184
+ fig
185
+ fl
186
+ fr
187
+ geb
188
+ gen
189
+ genm
190
+ get
191
+ gld
192
+ i.p.v
193
+ i.s.m
194
+ i.t.t
195
+ i.v.m
196
+ id
197
+ incl
198
+ intern
199
+ jl
200
+ kand
201
+ kol
202
+ Kon
203
+ kr
204
+ kt
205
+ lab
206
+ lgen
207
+ lic
208
+ lkol
209
+ ll
210
+ Lt
211
+ lt
212
+ lw
213
+ m.a.w
214
+ m.b.t
215
+ m.b.v
216
+ m.h.o
217
+ m.i
218
+ m.i.v
219
+ maj
220
+ max
221
+ Mevr
222
+ mevr
223
+ Mgr
224
+ mi
225
+ min
226
+ mld
227
+ mln
228
+ Mme
229
+ mw
230
+ Ndl
231
+ Ned
232
+ Nl
233
+ nl
234
+ No
235
+ no
236
+ nr
237
+ Nrs
238
+ o.a
239
+ o.b.s
240
+ ob
241
+ obl
242
+ ong
243
+ onov
244
+ opm
245
+ org
246
+ ov
247
+ pag
248
+ par
249
+ penn
250
+ plm
251
+ prov
252
+ pseud
253
+ red
254
+ ref
255
+ resp
256
+ ritm
257
+ Secr
258
+ soc
259
+ Sr
260
+ St
261
+ st
262
+ tab
263
+ tel
264
+ tint
265
+ tk
266
+ Ued
267
+ uitsl
268
+ v.w.t
269
+ ver
270
+ vgl
271
+ vnl
272
+ vnw
273
+ voorz
274
+ ww
275
+ Z.D
276
+ Z.D.H
277
+ Z.E
278
+ Z.Em
279
+ Z.H
280
+ Z.K.H
281
+ Z.K.M
282
+ Z.M
283
+ z.v
284
+ zat
285
+ zg
286
+
287
+
288
+ #unified abbreviation list
289
+ Acad
290
+ Adj
291
+ Adm
292
+ Adv
293
+ Affl
294
+ Apr
295
+ Art
296
+ Asst
297
+ Aug
298
+ Av
299
+ Avg
300
+ B.ches-du-Rh
301
+ Bart
302
+ Bco
303
+ Bldg
304
+ Brig
305
+ Bros
306
+ C.a
307
+ C.p.c.n
308
+ Ca
309
+ Capt
310
+ Cdt
311
+ Cf
312
+ Ch.-Mme
313
+ Chap
314
+ Cie
315
+ Cmdr
316
+ Col
317
+ Comdr
318
+ Con
319
+ Corp
320
+ Cpl
321
+ DR
322
+ DRA
323
+ Da
324
+ Dec
325
+ Dep
326
+ Dn
327
+ Dr
328
+ Dra
329
+ Dras
330
+ Drs
331
+ Eng
332
+ Enga
333
+ Engas
334
+ Engos
335
+ Ens
336
+ Ets
337
+ Euro
338
+ Ev
339
+ Ex
340
+ Excmo
341
+ Exmo
342
+ Exo
343
+ Fa
344
+ Fco
345
+ Feb
346
+ Fig
347
+ Fr
348
+ Gar
349
+ Gen
350
+ Gir
351
+ Gl
352
+ Gov
353
+ Hno
354
+ Hon
355
+ Hosp
356
+ Hr
357
+ Ilmo
358
+ Insp
359
+ J.-C
360
+ Jan
361
+ Jeu
362
+ Jr
363
+ Jul
364
+ Jun
365
+ Lda
366
+ Lieut
367
+ Lt
368
+ Lun
369
+ MM
370
+ MR
371
+ MRS
372
+ MS
373
+ MSc
374
+ Maj
375
+ Mar
376
+ Me
377
+ Mej
378
+ Mer
379
+ Mes
380
+ Messrs
381
+ Mgr
382
+ Mgrs
383
+ Mll
384
+ Mlle
385
+ Mlle(s)
386
+ Mme
387
+ Mme(s)
388
+ Mr
389
+ Mrs
390
+ Ms
391
+ Msgr
392
+ Mw
393
+ Nov
394
+ Npr
395
+ Nr
396
+ O.d.J
397
+ Oct
398
+ Okt
399
+ Op
400
+ Ord
401
+ Oz
402
+ P
403
+ P.D
404
+ P.ej
405
+ P.p.c
406
+ Pas
407
+ Pfc
408
+ Ph
409
+ Prim
410
+ Prof
411
+ Pte
412
+ Pts
413
+ Pvt
414
+ Rep
415
+ Reps
416
+ Res
417
+ Rev
418
+ Revd
419
+ Rh
420
+ Riv
421
+ Rt
422
+ S.Em
423
+ S.Exc
424
+ S.a.r.l
425
+ Sen
426
+ Sens
427
+ Sep
428
+ Sept
429
+ Sfc
430
+ Sgt
431
+ SGT
432
+ Sl
433
+ Sr
434
+ Sra
435
+ Sras
436
+ Srs
437
+ Srta
438
+ St
439
+ ST
440
+ Sta
441
+ Ste
442
+ Sto
443
+ Supt
444
+ Surg
445
+ Tj
446
+ Tr
447
+ Ud
448
+ Uds
449
+ V.Exc
450
+ Vd
451
+ Vda
452
+ Vds
453
+ Vz
454
+ Z.D
455
+ Z.D.H
456
+ Z.E
457
+ Z.Em
458
+ Z.H
459
+ Z.K.H
460
+ Z.K.M
461
+ Z.M
462
+ a
463
+ a./s
464
+ a.C
465
+ a.g.v
466
+ a.l
467
+ abrev
468
+ abs
469
+ ac
470
+ acc
471
+ acron
472
+ adj
473
+ adm
474
+ adr
475
+ adv
476
+ alt
477
+ anal
478
+ anat
479
+ angl
480
+ appos
481
+ apr
482
+ apr
483
+ asc
484
+ atm
485
+ aug
486
+ auj
487
+ aux
488
+ av
489
+ avg
490
+ avr
491
+ b
492
+ b.a.o
493
+ b.a.p
494
+ b.a.r
495
+ bacc
496
+ bat
497
+ bc
498
+ bd
499
+ bde
500
+ bgen
501
+ bijv
502
+ bijz
503
+ br
504
+ bv
505
+ c
506
+ c.-a-d
507
+ c.a.f
508
+ c.i
509
+ cc
510
+ cf
511
+ cft
512
+ ch
513
+ ch.-l
514
+ chbre
515
+ chbs
516
+ chf
517
+ col
518
+ coll
519
+ cpl
520
+ cpt
521
+ cpte
522
+ cta
523
+ d
524
+ d.c
525
+ d.w.z
526
+ dcha
527
+ dec
528
+ def
529
+ dem
530
+ dep
531
+ dept
532
+ dhr
533
+ dipl
534
+ dispo
535
+ div
536
+ dpto
537
+ dr
538
+ dr.h.c
539
+ dra
540
+ dras
541
+ drs
542
+ ds
543
+ dz
544
+ e.c
545
+ e.g
546
+ e.g
547
+ e.k
548
+ eccles
549
+ ecol
550
+ econ
551
+ ed
552
+ ej
553
+ env
554
+ ep
555
+ eq
556
+ et
557
+ etc
558
+ ev
559
+ ex
560
+ exmo
561
+ exo
562
+ exp
563
+ expo
564
+ f.a.c
565
+ fa
566
+ fam
567
+ fasc
568
+ fbg
569
+ feb
570
+ fem
571
+ fevr
572
+ ff
573
+ fl
574
+ fol
575
+ fr
576
+ fs
577
+ fut
578
+ gd
579
+ gde
580
+ gdes
581
+ gds
582
+ gen
583
+ gl
584
+ grd
585
+ h.-t
586
+ hab
587
+ i.e
588
+ i.p.v
589
+ i.s.m
590
+ i.t.t
591
+ i.v.m
592
+ ibid
593
+ id
594
+ imp
595
+ ing
596
+ ir
597
+ iron
598
+ itd
599
+ itn
600
+ itp
601
+ izq
602
+ j
603
+ jan
604
+ janv
605
+ jhr
606
+ jkvr
607
+ jr
608
+ l
609
+ lat
610
+ lex
611
+ lgen
612
+ lib
613
+ lieut
614
+ liv
615
+ lkol
616
+ loc
617
+ lof
618
+ m
619
+ m.a.w
620
+ m.b.t
621
+ m.b.v
622
+ m.h.o
623
+ m.i
624
+ m.i.v
625
+ maj
626
+ mar
627
+ mas
628
+ max
629
+ med
630
+ mevr
631
+ min
632
+ mll
633
+ mr
634
+ ms
635
+ mtr
636
+ mtrs
637
+ n
638
+ n
639
+ n.f
640
+ n.f.pl
641
+ n.m
642
+ n.m.pl
643
+ nov
644
+ npr
645
+ o
646
+ o.b.s
647
+ obs
648
+ oct
649
+ okt
650
+ ord
651
+ oz
652
+ p
653
+ p
654
+ p.a
655
+ p.ej
656
+ p.ex
657
+ p.g.c.d
658
+ p.i
659
+ p.j
660
+ p.m
661
+ p.o
662
+ p.p
663
+ p.p.c.d
664
+ p.p.c.m
665
+ p.pa
666
+ p.pr
667
+ pl
668
+ plv
669
+ poe
670
+ pp
671
+ pp
672
+ pr
673
+ pr
674
+ pres
675
+ prev
676
+ prof
677
+ px
678
+ q.s
679
+ qqch
680
+ qqf
681
+ qqn
682
+ qqns
683
+ r.-de-ch
684
+ r.p.m
685
+ rc
686
+ rd
687
+ ref
688
+ refl
689
+ reg
690
+ rev
691
+ ro
692
+ rte
693
+ s
694
+ s
695
+ s.a
696
+ s.b.f
697
+ s.d
698
+ s.e
699
+ s.l
700
+ s.l.n.d
701
+ s.l.p
702
+ s.t.p
703
+ s.v.p
704
+ s/c
705
+ sc
706
+ sep
707
+ sept
708
+ sf
709
+ sgt
710
+ sl
711
+ sr
712
+ sra
713
+ sras
714
+ srs
715
+ ss
716
+ sto
717
+ t
718
+ t.s.v.p
719
+ tec
720
+ tel
721
+ terr
722
+ tg
723
+ tint
724
+ tit
725
+ tj
726
+ tr
727
+ travx
728
+ v
729
+ v.intr
730
+ v.tr
731
+ v.w.t
732
+ var
733
+ vs
734
+ vta
735
+ vx
736
+ z.v
737
+ zool
738
+ Št
739
+ št