opener-tokenizer-base 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (44) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +148 -0
  3. data/bin/tokenizer-base +5 -0
  4. data/bin/tokenizer-de +5 -0
  5. data/bin/tokenizer-en +5 -0
  6. data/bin/tokenizer-es +5 -0
  7. data/bin/tokenizer-fr +5 -0
  8. data/bin/tokenizer-it +5 -0
  9. data/bin/tokenizer-nl +5 -0
  10. data/core/lib/Data/OptList.pm +256 -0
  11. data/core/lib/Params/Util.pm +866 -0
  12. data/core/lib/Sub/Exporter.pm +1101 -0
  13. data/core/lib/Sub/Exporter/Cookbook.pod +309 -0
  14. data/core/lib/Sub/Exporter/Tutorial.pod +280 -0
  15. data/core/lib/Sub/Exporter/Util.pm +354 -0
  16. data/core/lib/Sub/Install.pm +329 -0
  17. data/core/lib/Time/Stamp.pm +808 -0
  18. data/core/load-prefixes.pl +43 -0
  19. data/core/nonbreaking_prefixes/abbreviation_list.kaf +0 -0
  20. data/core/nonbreaking_prefixes/abbreviation_list.txt +444 -0
  21. data/core/nonbreaking_prefixes/nonbreaking_prefix.ca +533 -0
  22. data/core/nonbreaking_prefixes/nonbreaking_prefix.de +781 -0
  23. data/core/nonbreaking_prefixes/nonbreaking_prefix.el +448 -0
  24. data/core/nonbreaking_prefixes/nonbreaking_prefix.en +564 -0
  25. data/core/nonbreaking_prefixes/nonbreaking_prefix.es +758 -0
  26. data/core/nonbreaking_prefixes/nonbreaking_prefix.fr +1027 -0
  27. data/core/nonbreaking_prefixes/nonbreaking_prefix.is +697 -0
  28. data/core/nonbreaking_prefixes/nonbreaking_prefix.it +641 -0
  29. data/core/nonbreaking_prefixes/nonbreaking_prefix.nl +739 -0
  30. data/core/nonbreaking_prefixes/nonbreaking_prefix.pl +729 -0
  31. data/core/nonbreaking_prefixes/nonbreaking_prefix.pt +656 -0
  32. data/core/nonbreaking_prefixes/nonbreaking_prefix.ro +484 -0
  33. data/core/nonbreaking_prefixes/nonbreaking_prefix.ru +705 -0
  34. data/core/nonbreaking_prefixes/nonbreaking_prefix.sk +920 -0
  35. data/core/nonbreaking_prefixes/nonbreaking_prefix.sl +524 -0
  36. data/core/nonbreaking_prefixes/nonbreaking_prefix.sv +492 -0
  37. data/core/split-sentences.pl +114 -0
  38. data/core/text-fixer.pl +169 -0
  39. data/core/tokenizer-cli.pl +363 -0
  40. data/core/tokenizer.pl +145 -0
  41. data/lib/opener/tokenizers/base.rb +84 -0
  42. data/lib/opener/tokenizers/base/version.rb +8 -0
  43. data/opener-tokenizer-base.gemspec +25 -0
  44. metadata +134 -0
@@ -0,0 +1,656 @@
1
+ #File adapted for PT by H. Leal Fontes from the EN & DE versions published with moses-2009-04-13. Last update: 10.11.2009.
2
+ #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
3
+ #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
4
+
5
+ #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
6
+ #usually upper case letters are initials in a name
7
+ A
8
+ B
9
+ C
10
+ D
11
+ E
12
+ F
13
+ G
14
+ H
15
+ I
16
+ J
17
+ K
18
+ L
19
+ M
20
+ N
21
+ O
22
+ P
23
+ Q
24
+ R
25
+ S
26
+ T
27
+ U
28
+ V
29
+ W
30
+ X
31
+ Y
32
+ Z
33
+ a
34
+ b
35
+ c
36
+ d
37
+ e
38
+ f
39
+ g
40
+ h
41
+ i
42
+ j
43
+ k
44
+ l
45
+ m
46
+ n
47
+ o
48
+ p
49
+ q
50
+ r
51
+ s
52
+ t
53
+ u
54
+ v
55
+ w
56
+ x
57
+ y
58
+ z
59
+
60
+
61
+ #Roman Numerals. A dot after one of these is not a sentence break in Portuguese.
62
+ I
63
+ II
64
+ III
65
+ IV
66
+ V
67
+ VI
68
+ VII
69
+ VIII
70
+ IX
71
+ X
72
+ XI
73
+ XII
74
+ XIII
75
+ XIV
76
+ XV
77
+ XVI
78
+ XVII
79
+ XVIII
80
+ XIX
81
+ XX
82
+ i
83
+ ii
84
+ iii
85
+ iv
86
+ v
87
+ vi
88
+ vii
89
+ viii
90
+ ix
91
+ x
92
+ xi
93
+ xii
94
+ xiii
95
+ xiv
96
+ xv
97
+ xvi
98
+ xvii
99
+ xviii
100
+ xix
101
+ xx
102
+
103
+ #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
104
+ Adj
105
+ Adm
106
+ Adv
107
+ Art
108
+ Ca
109
+ Capt
110
+ Cmdr
111
+ Col
112
+ Comdr
113
+ Con
114
+ Corp
115
+ Cpl
116
+ DR
117
+ DRA
118
+ Dr
119
+ Dra
120
+ Dras
121
+ Drs
122
+ Eng
123
+ Enga
124
+ Engas
125
+ Engos
126
+ Ex
127
+ Exo
128
+ Exmo
129
+ Fig
130
+ Gen
131
+ Hosp
132
+ Insp
133
+ Lda
134
+ MM
135
+ MR
136
+ MRS
137
+ MS
138
+ Maj
139
+ Mrs
140
+ Ms
141
+ Msgr
142
+ Op
143
+ Ord
144
+ Pfc
145
+ Ph
146
+ Prof
147
+ Pvt
148
+ Rep
149
+ Reps
150
+ Res
151
+ Rev
152
+ Rt
153
+ Sen
154
+ Sens
155
+ Sfc
156
+ Sgt
157
+ SGT
158
+ Sr
159
+ Sra
160
+ Sras
161
+ Srs
162
+ Sto
163
+ Supt
164
+ Surg
165
+ adj
166
+ adm
167
+ adv
168
+ art
169
+ cit
170
+ col
171
+ con
172
+ corp
173
+ cpl
174
+ dr
175
+ dra
176
+ dras
177
+ drs
178
+ eng
179
+ enga
180
+ engas
181
+ engos
182
+ ex
183
+ exo
184
+ exmo
185
+ fig
186
+ op
187
+ prof
188
+ sr
189
+ sra
190
+ sras
191
+ srs
192
+ sto
193
+
194
+ #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
195
+ v
196
+ vs
197
+ i.e
198
+ rev
199
+ e.g
200
+
201
+ #Numbers only. These act as non-breaking prefixes ONLY when followed by a numeric sequence
202
+ # add the marker #NUMERIC_ONLY# after the word to enable this behavior
203
+ #This case is mostly for the English "No.", which can either be a sentence of its own, or
204
+ #if followed by a number, a non-breaking prefix
205
+ No #NUMERIC_ONLY#
206
+ Nos
207
+ Art #NUMERIC_ONLY#
208
+ Nr
209
+ p #NUMERIC_ONLY#
210
+ pp #NUMERIC_ONLY#
211
+
212
+
213
+ #unified abbreviation list
214
+ Acad
215
+ Adj
216
+ Adm
217
+ Adv
218
+ Affl
219
+ Apr
220
+ Art
221
+ Asst
222
+ Av
223
+ Avg
224
+ B.ches-du-Rh
225
+ Bart
226
+ Bco
227
+ Bldg
228
+ Brig
229
+ Bros
230
+ C.a
231
+ C.p.c.n
232
+ Ca
233
+ Capt
234
+ Cdt
235
+ Cf
236
+ Ch.-Mme
237
+ Chap
238
+ Cie
239
+ Cmdr
240
+ Col
241
+ Comdr
242
+ Con
243
+ Corp
244
+ Cpl
245
+ DR
246
+ DRA
247
+ Da
248
+ Dec
249
+ Dep
250
+ Dn
251
+ Dr
252
+ Dra
253
+ Dras
254
+ Drs
255
+ Eng
256
+ Enga
257
+ Engas
258
+ Engos
259
+ Ens
260
+ Ets
261
+ Euro
262
+ Ev
263
+ Ex
264
+ Excmo
265
+ Exmo
266
+ Exo
267
+ Fa
268
+ Fco
269
+ Feb
270
+ Fig
271
+ Fr
272
+ Gar
273
+ Gen
274
+ Gir
275
+ Gl
276
+ Gov
277
+ Hno
278
+ Hon
279
+ Hosp
280
+ Hr
281
+ Ilmo
282
+ Insp
283
+ J.-C
284
+ Jan
285
+ Jeu
286
+ Jr
287
+ Jul
288
+ Jun
289
+ Lda
290
+ Lieut
291
+ Lt
292
+ Lun
293
+ MM
294
+ MR
295
+ MRS
296
+ MS
297
+ MSc
298
+ Maj
299
+ Mar
300
+ Me
301
+ Mej
302
+ Mer
303
+ Mes
304
+ Messrs
305
+ Mgr
306
+ Mgrs
307
+ Mll
308
+ Mlle
309
+ Mlle(s)
310
+ Mme
311
+ Mme(s)
312
+ Mr
313
+ Mrs
314
+ Ms
315
+ Msgr
316
+ Mw
317
+ Nov
318
+ Npr
319
+ Nr
320
+ O.d.J
321
+ Okt
322
+ Op
323
+ Ord
324
+ Oz
325
+ P
326
+ P.D
327
+ P.ej
328
+ P.p.c
329
+ Pas
330
+ Pfc
331
+ Ph
332
+ Prim
333
+ Prof
334
+ Pte
335
+ Pts
336
+ Pvt
337
+ Rep
338
+ Reps
339
+ Res
340
+ Rev
341
+ Revd
342
+ Rh
343
+ Riv
344
+ Rt
345
+ S.Em
346
+ S.Exc
347
+ S.a.r.l
348
+ Sen
349
+ Sens
350
+ Sep
351
+ Sept
352
+ Sfc
353
+ Sgt
354
+ Sl
355
+ Sr
356
+ Sra
357
+ Sras
358
+ Srs
359
+ Srta
360
+ St
361
+ ST
362
+ Sta
363
+ Ste
364
+ Sto
365
+ Supt
366
+ Surg
367
+ Tj
368
+ Tr
369
+ Ud
370
+ Uds
371
+ V.Exc
372
+ Vd
373
+ Vda
374
+ Vds
375
+ Vz
376
+ Z.D
377
+ Z.D.H
378
+ Z.E
379
+ Z.Em
380
+ Z.H
381
+ Z.K.H
382
+ Z.K.M
383
+ Z.M
384
+ a
385
+ a./s
386
+ a.C
387
+ a.g.v
388
+ a.l
389
+ abrev
390
+ abs
391
+ ac
392
+ acc
393
+ acron
394
+ adj
395
+ adm
396
+ adr
397
+ adv
398
+ alt
399
+ anal
400
+ anat
401
+ angl
402
+ appos
403
+ apr
404
+ apr
405
+ asc
406
+ atm
407
+ auj
408
+ aux
409
+ av
410
+ avg
411
+ avr
412
+ b
413
+ b.a.o
414
+ b.a.p
415
+ b.a.r
416
+ bacc
417
+ bat
418
+ bc
419
+ bd
420
+ bde
421
+ bgen
422
+ bijv
423
+ bijz
424
+ br
425
+ bv
426
+ c
427
+ c.-a-d
428
+ c.a.f
429
+ c.i
430
+ cc
431
+ cf
432
+ cft
433
+ ch
434
+ ch.-l
435
+ chbre
436
+ chbs
437
+ chf
438
+ col
439
+ coll
440
+ cpl
441
+ cpt
442
+ cpte
443
+ cta
444
+ d
445
+ d.c
446
+ d.w.z
447
+ dcha
448
+ dec
449
+ def
450
+ dem
451
+ dep
452
+ dept
453
+ dhr
454
+ dipl
455
+ dispo
456
+ div
457
+ dpto
458
+ dr
459
+ dr.h.c
460
+ dra
461
+ dras
462
+ drs
463
+ ds
464
+ dz
465
+ e.c
466
+ e.g
467
+ e.g
468
+ e.k
469
+ eccles
470
+ ecol
471
+ econ
472
+ ed
473
+ ej
474
+ env
475
+ ep
476
+ eq
477
+ et
478
+ etc
479
+ ev
480
+ ex
481
+ exmo
482
+ exo
483
+ exp
484
+ expo
485
+ f.a.c
486
+ fa
487
+ fam
488
+ fasc
489
+ fbg
490
+ feb
491
+ fem
492
+ fevr
493
+ ff
494
+ fl
495
+ fol
496
+ fr
497
+ fs
498
+ fut
499
+ gd
500
+ gde
501
+ gdes
502
+ gds
503
+ gen
504
+ gl
505
+ grd
506
+ h.-t
507
+ hab
508
+ i.e
509
+ i.p.v
510
+ i.s.m
511
+ i.t.t
512
+ i.v.m
513
+ ibid
514
+ id
515
+ imp
516
+ ing
517
+ ir
518
+ iron
519
+ itd
520
+ itn
521
+ itp
522
+ izq
523
+ j
524
+ janv
525
+ jhr
526
+ jkvr
527
+ jr
528
+ l
529
+ lat
530
+ lex
531
+ lgen
532
+ lib
533
+ lieut
534
+ liv
535
+ lkol
536
+ loc
537
+ lof
538
+ m
539
+ m.a.w
540
+ m.b.t
541
+ m.b.v
542
+ m.h.o
543
+ m.i
544
+ m.i.v
545
+ maj
546
+ mar
547
+ mas
548
+ max
549
+ med
550
+ mevr
551
+ min
552
+ mll
553
+ mr
554
+ ms
555
+ mtr
556
+ mtrs
557
+ n
558
+ n
559
+ n.f
560
+ n.f.pl
561
+ n.m
562
+ n.m.pl
563
+ npr
564
+ o
565
+ o.b.s
566
+ obs
567
+ oct
568
+ okt
569
+ ord
570
+ oz
571
+ p
572
+ p
573
+ p.a
574
+ p.ej
575
+ p.ex
576
+ p.g.c.d
577
+ p.i
578
+ p.j
579
+ p.m
580
+ p.o
581
+ p.p
582
+ p.p.c.d
583
+ p.p.c.m
584
+ p.pa
585
+ p.pr
586
+ pl
587
+ plv
588
+ poe
589
+ pp
590
+ pp
591
+ pr
592
+ pr
593
+ pres
594
+ prev
595
+ prof
596
+ px
597
+ q.s
598
+ qqch
599
+ qqf
600
+ qqn
601
+ qqns
602
+ r.-de-ch
603
+ r.p.m
604
+ rc
605
+ rd
606
+ ref
607
+ refl
608
+ reg
609
+ rev
610
+ ro
611
+ rte
612
+ s
613
+ s
614
+ s.a
615
+ s.b.f
616
+ s.d
617
+ s.e
618
+ s.l
619
+ s.l.n.d
620
+ s.l.p
621
+ s.t.p
622
+ s.v.p
623
+ s/c
624
+ sc
625
+ sf
626
+ sgt
627
+ sl
628
+ sr
629
+ sra
630
+ sras
631
+ srs
632
+ ss
633
+ sto
634
+ t
635
+ t.s.v.p
636
+ tec
637
+ tel
638
+ terr
639
+ tg
640
+ tint
641
+ tit
642
+ tj
643
+ tr
644
+ travx
645
+ v
646
+ v.intr
647
+ v.tr
648
+ v.w.t
649
+ var
650
+ vs
651
+ vta
652
+ vx
653
+ z.v
654
+ zool
655
+ Št
656
+ št