opener-tokenizer-base 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +148 -0
  3. data/bin/tokenizer-base +5 -0
  4. data/bin/tokenizer-de +5 -0
  5. data/bin/tokenizer-en +5 -0
  6. data/bin/tokenizer-es +5 -0
  7. data/bin/tokenizer-fr +5 -0
  8. data/bin/tokenizer-it +5 -0
  9. data/bin/tokenizer-nl +5 -0
  10. data/core/lib/Data/OptList.pm +256 -0
  11. data/core/lib/Params/Util.pm +866 -0
  12. data/core/lib/Sub/Exporter.pm +1101 -0
  13. data/core/lib/Sub/Exporter/Cookbook.pod +309 -0
  14. data/core/lib/Sub/Exporter/Tutorial.pod +280 -0
  15. data/core/lib/Sub/Exporter/Util.pm +354 -0
  16. data/core/lib/Sub/Install.pm +329 -0
  17. data/core/lib/Time/Stamp.pm +808 -0
  18. data/core/load-prefixes.pl +43 -0
  19. data/core/nonbreaking_prefixes/abbreviation_list.kaf +0 -0
  20. data/core/nonbreaking_prefixes/abbreviation_list.txt +444 -0
  21. data/core/nonbreaking_prefixes/nonbreaking_prefix.ca +533 -0
  22. data/core/nonbreaking_prefixes/nonbreaking_prefix.de +781 -0
  23. data/core/nonbreaking_prefixes/nonbreaking_prefix.el +448 -0
  24. data/core/nonbreaking_prefixes/nonbreaking_prefix.en +564 -0
  25. data/core/nonbreaking_prefixes/nonbreaking_prefix.es +758 -0
  26. data/core/nonbreaking_prefixes/nonbreaking_prefix.fr +1027 -0
  27. data/core/nonbreaking_prefixes/nonbreaking_prefix.is +697 -0
  28. data/core/nonbreaking_prefixes/nonbreaking_prefix.it +641 -0
  29. data/core/nonbreaking_prefixes/nonbreaking_prefix.nl +739 -0
  30. data/core/nonbreaking_prefixes/nonbreaking_prefix.pl +729 -0
  31. data/core/nonbreaking_prefixes/nonbreaking_prefix.pt +656 -0
  32. data/core/nonbreaking_prefixes/nonbreaking_prefix.ro +484 -0
  33. data/core/nonbreaking_prefixes/nonbreaking_prefix.ru +705 -0
  34. data/core/nonbreaking_prefixes/nonbreaking_prefix.sk +920 -0
  35. data/core/nonbreaking_prefixes/nonbreaking_prefix.sl +524 -0
  36. data/core/nonbreaking_prefixes/nonbreaking_prefix.sv +492 -0
  37. data/core/split-sentences.pl +114 -0
  38. data/core/text-fixer.pl +169 -0
  39. data/core/tokenizer-cli.pl +363 -0
  40. data/core/tokenizer.pl +145 -0
  41. data/lib/opener/tokenizers/base.rb +84 -0
  42. data/lib/opener/tokenizers/base/version.rb +8 -0
  43. data/opener-tokenizer-base.gemspec +25 -0
  44. metadata +134 -0
@@ -0,0 +1,656 @@
1
+ #File adapted for PT by H. Leal Fontes from the EN & DE versions published with moses-2009-04-13. Last update: 10.11.2009.
2
+ #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
3
+ #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
4
+
5
+ #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
6
+ #usually upper case letters are initials in a name
7
+ A
8
+ B
9
+ C
10
+ D
11
+ E
12
+ F
13
+ G
14
+ H
15
+ I
16
+ J
17
+ K
18
+ L
19
+ M
20
+ N
21
+ O
22
+ P
23
+ Q
24
+ R
25
+ S
26
+ T
27
+ U
28
+ V
29
+ W
30
+ X
31
+ Y
32
+ Z
33
+ a
34
+ b
35
+ c
36
+ d
37
+ e
38
+ f
39
+ g
40
+ h
41
+ i
42
+ j
43
+ k
44
+ l
45
+ m
46
+ n
47
+ o
48
+ p
49
+ q
50
+ r
51
+ s
52
+ t
53
+ u
54
+ v
55
+ w
56
+ x
57
+ y
58
+ z
59
+
60
+
61
+ #Roman Numerals. A dot after one of these is not a sentence break in Portuguese.
62
+ I
63
+ II
64
+ III
65
+ IV
66
+ V
67
+ VI
68
+ VII
69
+ VIII
70
+ IX
71
+ X
72
+ XI
73
+ XII
74
+ XIII
75
+ XIV
76
+ XV
77
+ XVI
78
+ XVII
79
+ XVIII
80
+ XIX
81
+ XX
82
+ i
83
+ ii
84
+ iii
85
+ iv
86
+ v
87
+ vi
88
+ vii
89
+ viii
90
+ ix
91
+ x
92
+ xi
93
+ xii
94
+ xiii
95
+ xiv
96
+ xv
97
+ xvi
98
+ xvii
99
+ xviii
100
+ xix
101
+ xx
102
+
103
+ #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
104
+ Adj
105
+ Adm
106
+ Adv
107
+ Art
108
+ Ca
109
+ Capt
110
+ Cmdr
111
+ Col
112
+ Comdr
113
+ Con
114
+ Corp
115
+ Cpl
116
+ DR
117
+ DRA
118
+ Dr
119
+ Dra
120
+ Dras
121
+ Drs
122
+ Eng
123
+ Enga
124
+ Engas
125
+ Engos
126
+ Ex
127
+ Exo
128
+ Exmo
129
+ Fig
130
+ Gen
131
+ Hosp
132
+ Insp
133
+ Lda
134
+ MM
135
+ MR
136
+ MRS
137
+ MS
138
+ Maj
139
+ Mrs
140
+ Ms
141
+ Msgr
142
+ Op
143
+ Ord
144
+ Pfc
145
+ Ph
146
+ Prof
147
+ Pvt
148
+ Rep
149
+ Reps
150
+ Res
151
+ Rev
152
+ Rt
153
+ Sen
154
+ Sens
155
+ Sfc
156
+ Sgt
157
+ SGT
158
+ Sr
159
+ Sra
160
+ Sras
161
+ Srs
162
+ Sto
163
+ Supt
164
+ Surg
165
+ adj
166
+ adm
167
+ adv
168
+ art
169
+ cit
170
+ col
171
+ con
172
+ corp
173
+ cpl
174
+ dr
175
+ dra
176
+ dras
177
+ drs
178
+ eng
179
+ enga
180
+ engas
181
+ engos
182
+ ex
183
+ exo
184
+ exmo
185
+ fig
186
+ op
187
+ prof
188
+ sr
189
+ sra
190
+ sras
191
+ srs
192
+ sto
193
+
194
+ #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
195
+ v
196
+ vs
197
+ i.e
198
+ rev
199
+ e.g
200
+
201
+ #Numbers only. These are treated as non-breaking prefixes (i.e. they do NOT induce a break) only when followed by a numeric sequence
202
+ # add the marker #NUMERIC_ONLY# after the word to enable this behaviour
203
+ #This case is mostly for the English "No." which can either be a sentence of its own, or
204
+ #if followed by a number, a non-breaking prefix
205
+ No #NUMERIC_ONLY#
206
+ Nos
207
+ Art #NUMERIC_ONLY#
208
+ Nr
209
+ p #NUMERIC_ONLY#
210
+ pp #NUMERIC_ONLY#
211
+
212
+
213
+ #unified abbreviation list
214
+ Acad
215
+ Adj
216
+ Adm
217
+ Adv
218
+ Affl
219
+ Apr
220
+ Art
221
+ Asst
222
+ Av
223
+ Avg
224
+ B.ches-du-Rh
225
+ Bart
226
+ Bco
227
+ Bldg
228
+ Brig
229
+ Bros
230
+ C.a
231
+ C.p.c.n
232
+ Ca
233
+ Capt
234
+ Cdt
235
+ Cf
236
+ Ch.-Mme
237
+ Chap
238
+ Cie
239
+ Cmdr
240
+ Col
241
+ Comdr
242
+ Con
243
+ Corp
244
+ Cpl
245
+ DR
246
+ DRA
247
+ Da
248
+ Dec
249
+ Dep
250
+ Dn
251
+ Dr
252
+ Dra
253
+ Dras
254
+ Drs
255
+ Eng
256
+ Enga
257
+ Engas
258
+ Engos
259
+ Ens
260
+ Ets
261
+ Euro
262
+ Ev
263
+ Ex
264
+ Excmo
265
+ Exmo
266
+ Exo
267
+ Fa
268
+ Fco
269
+ Feb
270
+ Fig
271
+ Fr
272
+ Gar
273
+ Gen
274
+ Gir
275
+ Gl
276
+ Gov
277
+ Hno
278
+ Hon
279
+ Hosp
280
+ Hr
281
+ Ilmo
282
+ Insp
283
+ J.-C
284
+ Jan
285
+ Jeu
286
+ Jr
287
+ Jul
288
+ Jun
289
+ Lda
290
+ Lieut
291
+ Lt
292
+ Lun
293
+ MM
294
+ MR
295
+ MRS
296
+ MS
297
+ MSc
298
+ Maj
299
+ Mar
300
+ Me
301
+ Mej
302
+ Mer
303
+ Mes
304
+ Messrs
305
+ Mgr
306
+ Mgrs
307
+ Mll
308
+ Mlle
309
+ Mlle(s)
310
+ Mme
311
+ Mme(s)
312
+ Mr
313
+ Mrs
314
+ Ms
315
+ Msgr
316
+ Mw
317
+ Nov
318
+ Npr
319
+ Nr
320
+ O.d.J
321
+ Okt
322
+ Op
323
+ Ord
324
+ Oz
325
+ P
326
+ P.D
327
+ P.ej
328
+ P.p.c
329
+ Pas
330
+ Pfc
331
+ Ph
332
+ Prim
333
+ Prof
334
+ Pte
335
+ Pts
336
+ Pvt
337
+ Rep
338
+ Reps
339
+ Res
340
+ Rev
341
+ Revd
342
+ Rh
343
+ Riv
344
+ Rt
345
+ S.Em
346
+ S.Exc
347
+ S.a.r.l
348
+ Sen
349
+ Sens
350
+ Sep
351
+ Sept
352
+ Sfc
353
+ Sgt
354
+ Sl
355
+ Sr
356
+ Sra
357
+ Sras
358
+ Srs
359
+ Srta
360
+ St
361
+ ST
362
+ Sta
363
+ Ste
364
+ Sto
365
+ Supt
366
+ Surg
367
+ Tj
368
+ Tr
369
+ Ud
370
+ Uds
371
+ V.Exc
372
+ Vd
373
+ Vda
374
+ Vds
375
+ Vz
376
+ Z.D
377
+ Z.D.H
378
+ Z.E
379
+ Z.Em
380
+ Z.H
381
+ Z.K.H
382
+ Z.K.M
383
+ Z.M
384
+ a
385
+ a./s
386
+ a.C
387
+ a.g.v
388
+ a.l
389
+ abrev
390
+ abs
391
+ ac
392
+ acc
393
+ acron
394
+ adj
395
+ adm
396
+ adr
397
+ adv
398
+ alt
399
+ anal
400
+ anat
401
+ angl
402
+ appos
403
+ apr
404
+ apr
405
+ asc
406
+ atm
407
+ auj
408
+ aux
409
+ av
410
+ avg
411
+ avr
412
+ b
413
+ b.a.o
414
+ b.a.p
415
+ b.a.r
416
+ bacc
417
+ bat
418
+ bc
419
+ bd
420
+ bde
421
+ bgen
422
+ bijv
423
+ bijz
424
+ br
425
+ bv
426
+ c
427
+ c.-a-d
428
+ c.a.f
429
+ c.i
430
+ cc
431
+ cf
432
+ cft
433
+ ch
434
+ ch.-l
435
+ chbre
436
+ chbs
437
+ chf
438
+ col
439
+ coll
440
+ cpl
441
+ cpt
442
+ cpte
443
+ cta
444
+ d
445
+ d.c
446
+ d.w.z
447
+ dcha
448
+ dec
449
+ def
450
+ dem
451
+ dep
452
+ dept
453
+ dhr
454
+ dipl
455
+ dispo
456
+ div
457
+ dpto
458
+ dr
459
+ dr.h.c
460
+ dra
461
+ dras
462
+ drs
463
+ ds
464
+ dz
465
+ e.c
466
+ e.g
467
+ e.g
468
+ e.k
469
+ eccles
470
+ ecol
471
+ econ
472
+ ed
473
+ ej
474
+ env
475
+ ep
476
+ eq
477
+ et
478
+ etc
479
+ ev
480
+ ex
481
+ exmo
482
+ exo
483
+ exp
484
+ expo
485
+ f.a.c
486
+ fa
487
+ fam
488
+ fasc
489
+ fbg
490
+ feb
491
+ fem
492
+ fevr
493
+ ff
494
+ fl
495
+ fol
496
+ fr
497
+ fs
498
+ fut
499
+ gd
500
+ gde
501
+ gdes
502
+ gds
503
+ gen
504
+ gl
505
+ grd
506
+ h.-t
507
+ hab
508
+ i.e
509
+ i.p.v
510
+ i.s.m
511
+ i.t.t
512
+ i.v.m
513
+ ibid
514
+ id
515
+ imp
516
+ ing
517
+ ir
518
+ iron
519
+ itd
520
+ itn
521
+ itp
522
+ izq
523
+ j
524
+ janv
525
+ jhr
526
+ jkvr
527
+ jr
528
+ l
529
+ lat
530
+ lex
531
+ lgen
532
+ lib
533
+ lieut
534
+ liv
535
+ lkol
536
+ loc
537
+ lof
538
+ m
539
+ m.a.w
540
+ m.b.t
541
+ m.b.v
542
+ m.h.o
543
+ m.i
544
+ m.i.v
545
+ maj
546
+ mar
547
+ mas
548
+ max
549
+ med
550
+ mevr
551
+ min
552
+ mll
553
+ mr
554
+ ms
555
+ mtr
556
+ mtrs
557
+ n
558
+ n
559
+ n.f
560
+ n.f.pl
561
+ n.m
562
+ n.m.pl
563
+ npr
564
+ o
565
+ o.b.s
566
+ obs
567
+ oct
568
+ okt
569
+ ord
570
+ oz
571
+ p
572
+ p
573
+ p.a
574
+ p.ej
575
+ p.ex
576
+ p.g.c.d
577
+ p.i
578
+ p.j
579
+ p.m
580
+ p.o
581
+ p.p
582
+ p.p.c.d
583
+ p.p.c.m
584
+ p.pa
585
+ p.pr
586
+ pl
587
+ plv
588
+ poe
589
+ pp
590
+ pp
591
+ pr
592
+ pr
593
+ pres
594
+ prev
595
+ prof
596
+ px
597
+ q.s
598
+ qqch
599
+ qqf
600
+ qqn
601
+ qqns
602
+ r.-de-ch
603
+ r.p.m
604
+ rc
605
+ rd
606
+ ref
607
+ refl
608
+ reg
609
+ rev
610
+ ro
611
+ rte
612
+ s
613
+ s
614
+ s.a
615
+ s.b.f
616
+ s.d
617
+ s.e
618
+ s.l
619
+ s.l.n.d
620
+ s.l.p
621
+ s.t.p
622
+ s.v.p
623
+ s/c
624
+ sc
625
+ sf
626
+ sgt
627
+ sl
628
+ sr
629
+ sra
630
+ sras
631
+ srs
632
+ ss
633
+ sto
634
+ t
635
+ t.s.v.p
636
+ tec
637
+ tel
638
+ terr
639
+ tg
640
+ tint
641
+ tit
642
+ tj
643
+ tr
644
+ travx
645
+ v
646
+ v.intr
647
+ v.tr
648
+ v.w.t
649
+ var
650
+ vs
651
+ vta
652
+ vx
653
+ z.v
654
+ zool
655
+ Št
656
+ št