opener-tokenizer-base 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (44) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +148 -0
  3. data/bin/tokenizer-base +5 -0
  4. data/bin/tokenizer-de +5 -0
  5. data/bin/tokenizer-en +5 -0
  6. data/bin/tokenizer-es +5 -0
  7. data/bin/tokenizer-fr +5 -0
  8. data/bin/tokenizer-it +5 -0
  9. data/bin/tokenizer-nl +5 -0
  10. data/core/lib/Data/OptList.pm +256 -0
  11. data/core/lib/Params/Util.pm +866 -0
  12. data/core/lib/Sub/Exporter.pm +1101 -0
  13. data/core/lib/Sub/Exporter/Cookbook.pod +309 -0
  14. data/core/lib/Sub/Exporter/Tutorial.pod +280 -0
  15. data/core/lib/Sub/Exporter/Util.pm +354 -0
  16. data/core/lib/Sub/Install.pm +329 -0
  17. data/core/lib/Time/Stamp.pm +808 -0
  18. data/core/load-prefixes.pl +43 -0
  19. data/core/nonbreaking_prefixes/abbreviation_list.kaf +0 -0
  20. data/core/nonbreaking_prefixes/abbreviation_list.txt +444 -0
  21. data/core/nonbreaking_prefixes/nonbreaking_prefix.ca +533 -0
  22. data/core/nonbreaking_prefixes/nonbreaking_prefix.de +781 -0
  23. data/core/nonbreaking_prefixes/nonbreaking_prefix.el +448 -0
  24. data/core/nonbreaking_prefixes/nonbreaking_prefix.en +564 -0
  25. data/core/nonbreaking_prefixes/nonbreaking_prefix.es +758 -0
  26. data/core/nonbreaking_prefixes/nonbreaking_prefix.fr +1027 -0
  27. data/core/nonbreaking_prefixes/nonbreaking_prefix.is +697 -0
  28. data/core/nonbreaking_prefixes/nonbreaking_prefix.it +641 -0
  29. data/core/nonbreaking_prefixes/nonbreaking_prefix.nl +739 -0
  30. data/core/nonbreaking_prefixes/nonbreaking_prefix.pl +729 -0
  31. data/core/nonbreaking_prefixes/nonbreaking_prefix.pt +656 -0
  32. data/core/nonbreaking_prefixes/nonbreaking_prefix.ro +484 -0
  33. data/core/nonbreaking_prefixes/nonbreaking_prefix.ru +705 -0
  34. data/core/nonbreaking_prefixes/nonbreaking_prefix.sk +920 -0
  35. data/core/nonbreaking_prefixes/nonbreaking_prefix.sl +524 -0
  36. data/core/nonbreaking_prefixes/nonbreaking_prefix.sv +492 -0
  37. data/core/split-sentences.pl +114 -0
  38. data/core/text-fixer.pl +169 -0
  39. data/core/tokenizer-cli.pl +363 -0
  40. data/core/tokenizer.pl +145 -0
  41. data/lib/opener/tokenizers/base.rb +84 -0
  42. data/lib/opener/tokenizers/base/version.rb +8 -0
  43. data/opener-tokenizer-base.gemspec +25 -0
  44. metadata +134 -0
@@ -0,0 +1,739 @@
1
+ #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
2
+ #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
3
+ #Sources: http://nl.wikipedia.org/wiki/Lijst_van_afkortingen
4
+ # http://nl.wikipedia.org/wiki/Aanspreekvorm
5
+ # http://nl.wikipedia.org/wiki/Titulatuur_in_het_Nederlands_hoger_onderwijs
6
+ #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
7
+ #usually upper case letters are initials in a name
8
+ A
9
+ B
10
+ C
11
+ D
12
+ E
13
+ F
14
+ G
15
+ H
16
+ I
17
+ J
18
+ K
19
+ L
20
+ M
21
+ N
22
+ O
23
+ P
24
+ Q
25
+ R
26
+ S
27
+ T
28
+ U
29
+ V
30
+ W
31
+ X
32
+ Y
33
+ Z
34
+
35
+ #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
36
+ bacc
37
+ bc
38
+ bgen
39
+ c.i
40
+ dhr
41
+ dr
42
+ dr.h.c
43
+ drs
44
+ ds
45
+ eint
46
+ fa
47
+ Fa
48
+ fam
49
+ gen
50
+ genm
51
+ ing
52
+ ir
53
+ jhr
54
+ jkvr
55
+ jr
56
+ kand
57
+ kol
58
+ lgen
59
+ lkol
60
+ Lt
61
+ maj
62
+ Mej
63
+ mevr
64
+ Mme
65
+ mr
66
+ Mw
67
+ o.b.s
68
+ plv
69
+ prof
70
+ ritm
71
+ tint
72
+ Vz
73
+ Z.D
74
+ Z.D.H
75
+ Z.E
76
+ Z.Em
77
+ Z.H
78
+ Z.K.H
79
+ Z.K.M
80
+ Z.M
81
+ z.v
82
+
83
+ #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
84
+ #we seem to have a lot of these in Dutch, e.g. i.p.v - in plaats van (instead of) - which never ends a sentence
85
+ a.g.v
86
+ bijv
87
+ bijz
88
+ bv
89
+ d.w.z
90
+ e.c
91
+ e.g
92
+ e.k
93
+ ev
94
+ i.p.v
95
+ i.s.m
96
+ i.t.t
97
+ i.v.m
98
+ m.a.w
99
+ m.b.t
100
+ m.b.v
101
+ m.h.o
102
+ m.i
103
+ m.i.v
104
+ v.w.t
105
+
106
+ #Numbers only. These should only act as non-breaking prefixes (i.e. suppress a sentence break) when followed by a numeric sequence
107
+ # add #NUMERIC_ONLY# after the word to enable this behaviour
108
+ #This case is mostly for the English "No." which can either be a sentence of its own, or
109
+ #if followed by a number, a non-breaking prefix
110
+ Nr #NUMERIC_ONLY#
111
+ Nrs
112
+ nrs
113
+ nr #NUMERIC_ONLY#
114
+
115
+ #others
116
+ a.g.v
117
+ aanh
118
+ aanw
119
+ aardew
120
+ aardr
121
+ abs
122
+ abstr
123
+ adj
124
+ adm
125
+ afb
126
+ afd
127
+ afk
128
+ afl
129
+ Afr
130
+ al
131
+ ald
132
+ alg
133
+ Am
134
+ amb
135
+ ambt
136
+ anat
137
+ antrop
138
+ apoth
139
+ Ar
140
+ arch
141
+ archeol
142
+ art
143
+ bacc
144
+ betr
145
+ bez
146
+ bgen
147
+ bibl
148
+ bijl
149
+ blz
150
+ Br
151
+ bv
152
+ bw
153
+ c.i
154
+ ca
155
+ cat
156
+ centr
157
+ cf
158
+ cfr
159
+ Cie
160
+ cmpl
161
+ Comp
162
+ conf
163
+ ct
164
+ d.w.z
165
+ dal
166
+ derg
167
+ Dhr
168
+ dir
169
+ div
170
+ Dr
171
+ dr
172
+ dr.h.c
173
+ dra
174
+ e.c
175
+ e.g
176
+ e.k
177
+ ed
178
+ eint
179
+ Em
180
+ em
181
+ enz
182
+ etc
183
+ excl
184
+ fig
185
+ fl
186
+ fr
187
+ geb
188
+ gen
189
+ genm
190
+ get
191
+ gld
192
+ i.p.v
193
+ i.s.m
194
+ i.t.t
195
+ i.v.m
196
+ id
197
+ incl
198
+ intern
199
+ jl
200
+ kand
201
+ kol
202
+ Kon
203
+ kr
204
+ kt
205
+ lab
206
+ lgen
207
+ lic
208
+ lkol
209
+ ll
210
+ Lt
211
+ lt
212
+ lw
213
+ m.a.w
214
+ m.b.t
215
+ m.b.v
216
+ m.h.o
217
+ m.i
218
+ m.i.v
219
+ maj
220
+ max
221
+ Mevr
222
+ mevr
223
+ Mgr
224
+ mi
225
+ min
226
+ mld
227
+ mln
228
+ Mme
229
+ mw
230
+ Ndl
231
+ Ned
232
+ Nl
233
+ nl
234
+ No
235
+ no
236
+ nr
237
+ Nrs
238
+ o.a
239
+ o.b.s
240
+ ob
241
+ obl
242
+ ong
243
+ onov
244
+ opm
245
+ org
246
+ ov
247
+ pag
248
+ par
249
+ penn
250
+ plm
251
+ prov
252
+ pseud
253
+ red
254
+ ref
255
+ resp
256
+ ritm
257
+ Secr
258
+ soc
259
+ Sr
260
+ St
261
+ st
262
+ tab
263
+ tel
264
+ tint
265
+ tk
266
+ Ued
267
+ uitsl
268
+ v.w.t
269
+ ver
270
+ vgl
271
+ vnl
272
+ vnw
273
+ voorz
274
+ ww
275
+ Z.D
276
+ Z.D.H
277
+ Z.E
278
+ Z.Em
279
+ Z.H
280
+ Z.K.H
281
+ Z.K.M
282
+ Z.M
283
+ z.v
284
+ zat
285
+ zg
286
+
287
+
288
+ #unified abbreviation list
289
+ Acad
290
+ Adj
291
+ Adm
292
+ Adv
293
+ Affl
294
+ Apr
295
+ Art
296
+ Asst
297
+ Aug
298
+ Av
299
+ Avg
300
+ B.ches-du-Rh
301
+ Bart
302
+ Bco
303
+ Bldg
304
+ Brig
305
+ Bros
306
+ C.a
307
+ C.p.c.n
308
+ Ca
309
+ Capt
310
+ Cdt
311
+ Cf
312
+ Ch.-Mme
313
+ Chap
314
+ Cie
315
+ Cmdr
316
+ Col
317
+ Comdr
318
+ Con
319
+ Corp
320
+ Cpl
321
+ DR
322
+ DRA
323
+ Da
324
+ Dec
325
+ Dep
326
+ Dn
327
+ Dr
328
+ Dra
329
+ Dras
330
+ Drs
331
+ Eng
332
+ Enga
333
+ Engas
334
+ Engos
335
+ Ens
336
+ Ets
337
+ Euro
338
+ Ev
339
+ Ex
340
+ Excmo
341
+ Exmo
342
+ Exo
343
+ Fa
344
+ Fco
345
+ Feb
346
+ Fig
347
+ Fr
348
+ Gar
349
+ Gen
350
+ Gir
351
+ Gl
352
+ Gov
353
+ Hno
354
+ Hon
355
+ Hosp
356
+ Hr
357
+ Ilmo
358
+ Insp
359
+ J.-C
360
+ Jan
361
+ Jeu
362
+ Jr
363
+ Jul
364
+ Jun
365
+ Lda
366
+ Lieut
367
+ Lt
368
+ Lun
369
+ MM
370
+ MR
371
+ MRS
372
+ MS
373
+ MSc
374
+ Maj
375
+ Mar
376
+ Me
377
+ Mej
378
+ Mer
379
+ Mes
380
+ Messrs
381
+ Mgr
382
+ Mgrs
383
+ Mll
384
+ Mlle
385
+ Mlle(s)
386
+ Mme
387
+ Mme(s)
388
+ Mr
389
+ Mrs
390
+ Ms
391
+ Msgr
392
+ Mw
393
+ Nov
394
+ Npr
395
+ Nr
396
+ O.d.J
397
+ Oct
398
+ Okt
399
+ Op
400
+ Ord
401
+ Oz
402
+ P
403
+ P.D
404
+ P.ej
405
+ P.p.c
406
+ Pas
407
+ Pfc
408
+ Ph
409
+ Prim
410
+ Prof
411
+ Pte
412
+ Pts
413
+ Pvt
414
+ Rep
415
+ Reps
416
+ Res
417
+ Rev
418
+ Revd
419
+ Rh
420
+ Riv
421
+ Rt
422
+ S.Em
423
+ S.Exc
424
+ S.a.r.l
425
+ Sen
426
+ Sens
427
+ Sep
428
+ Sept
429
+ Sfc
430
+ Sgt
431
+ SGT
432
+ Sl
433
+ Sr
434
+ Sra
435
+ Sras
436
+ Srs
437
+ Srta
438
+ St
439
+ ST
440
+ Sta
441
+ Ste
442
+ Sto
443
+ Supt
444
+ Surg
445
+ Tj
446
+ Tr
447
+ Ud
448
+ Uds
449
+ V.Exc
450
+ Vd
451
+ Vda
452
+ Vds
453
+ Vz
454
+ Z.D
455
+ Z.D.H
456
+ Z.E
457
+ Z.Em
458
+ Z.H
459
+ Z.K.H
460
+ Z.K.M
461
+ Z.M
462
+ a
463
+ a./s
464
+ a.C
465
+ a.g.v
466
+ a.l
467
+ abrev
468
+ abs
469
+ ac
470
+ acc
471
+ acron
472
+ adj
473
+ adm
474
+ adr
475
+ adv
476
+ alt
477
+ anal
478
+ anat
479
+ angl
480
+ appos
481
+ apr
482
+ apr
483
+ asc
484
+ atm
485
+ aug
486
+ auj
487
+ aux
488
+ av
489
+ avg
490
+ avr
491
+ b
492
+ b.a.o
493
+ b.a.p
494
+ b.a.r
495
+ bacc
496
+ bat
497
+ bc
498
+ bd
499
+ bde
500
+ bgen
501
+ bijv
502
+ bijz
503
+ br
504
+ bv
505
+ c
506
+ c.-a-d
507
+ c.a.f
508
+ c.i
509
+ cc
510
+ cf
511
+ cft
512
+ ch
513
+ ch.-l
514
+ chbre
515
+ chbs
516
+ chf
517
+ col
518
+ coll
519
+ cpl
520
+ cpt
521
+ cpte
522
+ cta
523
+ d
524
+ d.c
525
+ d.w.z
526
+ dcha
527
+ dec
528
+ def
529
+ dem
530
+ dep
531
+ dept
532
+ dhr
533
+ dipl
534
+ dispo
535
+ div
536
+ dpto
537
+ dr
538
+ dr.h.c
539
+ dra
540
+ dras
541
+ drs
542
+ ds
543
+ dz
544
+ e.c
545
+ e.g
546
+ e.g
547
+ e.k
548
+ eccles
549
+ ecol
550
+ econ
551
+ ed
552
+ ej
553
+ env
554
+ ep
555
+ eq
556
+ et
557
+ etc
558
+ ev
559
+ ex
560
+ exmo
561
+ exo
562
+ exp
563
+ expo
564
+ f.a.c
565
+ fa
566
+ fam
567
+ fasc
568
+ fbg
569
+ feb
570
+ fem
571
+ fevr
572
+ ff
573
+ fl
574
+ fol
575
+ fr
576
+ fs
577
+ fut
578
+ gd
579
+ gde
580
+ gdes
581
+ gds
582
+ gen
583
+ gl
584
+ grd
585
+ h.-t
586
+ hab
587
+ i.e
588
+ i.p.v
589
+ i.s.m
590
+ i.t.t
591
+ i.v.m
592
+ ibid
593
+ id
594
+ imp
595
+ ing
596
+ ir
597
+ iron
598
+ itd
599
+ itn
600
+ itp
601
+ izq
602
+ j
603
+ jan
604
+ janv
605
+ jhr
606
+ jkvr
607
+ jr
608
+ l
609
+ lat
610
+ lex
611
+ lgen
612
+ lib
613
+ lieut
614
+ liv
615
+ lkol
616
+ loc
617
+ lof
618
+ m
619
+ m.a.w
620
+ m.b.t
621
+ m.b.v
622
+ m.h.o
623
+ m.i
624
+ m.i.v
625
+ maj
626
+ mar
627
+ mas
628
+ max
629
+ med
630
+ mevr
631
+ min
632
+ mll
633
+ mr
634
+ ms
635
+ mtr
636
+ mtrs
637
+ n
638
+ n
639
+ n.f
640
+ n.f.pl
641
+ n.m
642
+ n.m.pl
643
+ nov
644
+ npr
645
+ o
646
+ o.b.s
647
+ obs
648
+ oct
649
+ okt
650
+ ord
651
+ oz
652
+ p
653
+ p
654
+ p.a
655
+ p.ej
656
+ p.ex
657
+ p.g.c.d
658
+ p.i
659
+ p.j
660
+ p.m
661
+ p.o
662
+ p.p
663
+ p.p.c.d
664
+ p.p.c.m
665
+ p.pa
666
+ p.pr
667
+ pl
668
+ plv
669
+ poe
670
+ pp
671
+ pp
672
+ pr
673
+ pr
674
+ pres
675
+ prev
676
+ prof
677
+ px
678
+ q.s
679
+ qqch
680
+ qqf
681
+ qqn
682
+ qqns
683
+ r.-de-ch
684
+ r.p.m
685
+ rc
686
+ rd
687
+ ref
688
+ refl
689
+ reg
690
+ rev
691
+ ro
692
+ rte
693
+ s
694
+ s
695
+ s.a
696
+ s.b.f
697
+ s.d
698
+ s.e
699
+ s.l
700
+ s.l.n.d
701
+ s.l.p
702
+ s.t.p
703
+ s.v.p
704
+ s/c
705
+ sc
706
+ sep
707
+ sept
708
+ sf
709
+ sgt
710
+ sl
711
+ sr
712
+ sra
713
+ sras
714
+ srs
715
+ ss
716
+ sto
717
+ t
718
+ t.s.v.p
719
+ tec
720
+ tel
721
+ terr
722
+ tg
723
+ tint
724
+ tit
725
+ tj
726
+ tr
727
+ travx
728
+ v
729
+ v.intr
730
+ v.tr
731
+ v.w.t
732
+ var
733
+ vs
734
+ vta
735
+ vx
736
+ z.v
737
+ zool
738
+ Št
739
+ št