opener-tokenizer-base 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (44) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +148 -0
  3. data/bin/tokenizer-base +5 -0
  4. data/bin/tokenizer-de +5 -0
  5. data/bin/tokenizer-en +5 -0
  6. data/bin/tokenizer-es +5 -0
  7. data/bin/tokenizer-fr +5 -0
  8. data/bin/tokenizer-it +5 -0
  9. data/bin/tokenizer-nl +5 -0
  10. data/core/lib/Data/OptList.pm +256 -0
  11. data/core/lib/Params/Util.pm +866 -0
  12. data/core/lib/Sub/Exporter.pm +1101 -0
  13. data/core/lib/Sub/Exporter/Cookbook.pod +309 -0
  14. data/core/lib/Sub/Exporter/Tutorial.pod +280 -0
  15. data/core/lib/Sub/Exporter/Util.pm +354 -0
  16. data/core/lib/Sub/Install.pm +329 -0
  17. data/core/lib/Time/Stamp.pm +808 -0
  18. data/core/load-prefixes.pl +43 -0
  19. data/core/nonbreaking_prefixes/abbreviation_list.kaf +0 -0
  20. data/core/nonbreaking_prefixes/abbreviation_list.txt +444 -0
  21. data/core/nonbreaking_prefixes/nonbreaking_prefix.ca +533 -0
  22. data/core/nonbreaking_prefixes/nonbreaking_prefix.de +781 -0
  23. data/core/nonbreaking_prefixes/nonbreaking_prefix.el +448 -0
  24. data/core/nonbreaking_prefixes/nonbreaking_prefix.en +564 -0
  25. data/core/nonbreaking_prefixes/nonbreaking_prefix.es +758 -0
  26. data/core/nonbreaking_prefixes/nonbreaking_prefix.fr +1027 -0
  27. data/core/nonbreaking_prefixes/nonbreaking_prefix.is +697 -0
  28. data/core/nonbreaking_prefixes/nonbreaking_prefix.it +641 -0
  29. data/core/nonbreaking_prefixes/nonbreaking_prefix.nl +739 -0
  30. data/core/nonbreaking_prefixes/nonbreaking_prefix.pl +729 -0
  31. data/core/nonbreaking_prefixes/nonbreaking_prefix.pt +656 -0
  32. data/core/nonbreaking_prefixes/nonbreaking_prefix.ro +484 -0
  33. data/core/nonbreaking_prefixes/nonbreaking_prefix.ru +705 -0
  34. data/core/nonbreaking_prefixes/nonbreaking_prefix.sk +920 -0
  35. data/core/nonbreaking_prefixes/nonbreaking_prefix.sl +524 -0
  36. data/core/nonbreaking_prefixes/nonbreaking_prefix.sv +492 -0
  37. data/core/split-sentences.pl +114 -0
  38. data/core/text-fixer.pl +169 -0
  39. data/core/tokenizer-cli.pl +363 -0
  40. data/core/tokenizer.pl +145 -0
  41. data/lib/opener/tokenizers/base.rb +84 -0
  42. data/lib/opener/tokenizers/base/version.rb +8 -0
  43. data/opener-tokenizer-base.gemspec +25 -0
  44. metadata +134 -0
@@ -0,0 +1,641 @@
1
+ #Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
2
+ #Special cases are included for prefixes that ONLY appear before 0-9 numbers.
3
+
4
+ #any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
5
+ #usually upper case letters are initials in a name
6
+ A
7
+ B
8
+ C
9
+ D
10
+ E
11
+ F
12
+ G
13
+ H
14
+ I
15
+ J
16
+ K
17
+ L
18
+ M
19
+ N
20
+ O
21
+ P
22
+ Q
23
+ R
24
+ S
25
+ T
26
+ U
27
+ V
28
+ W
29
+ X
30
+ Y
31
+ Z
32
+
33
+ #List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
34
+ Adj
35
+ Adm
36
+ Adv
37
+ Amn
38
+ Arch
39
+ Asst
40
+ Avv
41
+ Bart
42
+ Bcc
43
+ Bldg
44
+ Brig
45
+ Bros
46
+ C.A.P
47
+ C.P
48
+ Capt
49
+ Cc
50
+ Cmdr
51
+ Co
52
+ Col
53
+ Comdr
54
+ Con
55
+ Corp
56
+ Cpl
57
+ DR
58
+ Dott
59
+ Dr
60
+ Drs
61
+ Egr
62
+ Ens
63
+ Gen
64
+ Genn
65
+ Geom
66
+ Gov
67
+ Hon
68
+ Hosp
69
+ Hr
70
+ Id
71
+ Ing
72
+ Insp
73
+ Lt
74
+ MM
75
+ MR
76
+ MRS
77
+ MS
78
+ Maj
79
+ Messrs
80
+ Mlle
81
+ Mme
82
+ Mo
83
+ Mons
84
+ Mr
85
+ Mrs
86
+ Ms
87
+ Msgr
88
+ N.B
89
+ Op
90
+ Ord
91
+ P.S
92
+ P.T
93
+ Pfc
94
+ Ph
95
+ Prof
96
+ Pvt
97
+ RP
98
+ RSVP
99
+ Rag
100
+ Rep
101
+ Reps
102
+ Res
103
+ Rev
104
+ Rif
105
+ Rt
106
+ S.A
107
+ S.B.F
108
+ S.P.M
109
+ S.p.A
110
+ S.r.l
111
+ Sen
112
+ Sens
113
+ Sfc
114
+ Sgt
115
+ SGT
116
+ Sig
117
+ Sigg
118
+ Soc
119
+ Spett
120
+ Sr
121
+ St
122
+ Supt
123
+ Surg
124
+ V.P
125
+
126
+ # other
127
+ a.c
128
+ acc
129
+ all
130
+ banc
131
+ c.a
132
+ c.c.p
133
+ c.m
134
+ c.p
135
+ c.s
136
+ c.v
137
+ corr
138
+ dott
139
+ e.p.c
140
+ ecc
141
+ es
142
+ fatt
143
+ gg
144
+ int
145
+ lett
146
+ ogg
147
+ on
148
+ p.c
149
+ p.c.c
150
+ p.es
151
+ p.f
152
+ p.r
153
+ p.v
154
+ post
155
+ pp
156
+ racc
157
+ ric
158
+ s.n.c
159
+ seg
160
+ sgg
161
+ ss
162
+ tel
163
+ u.s
164
+ v.r
165
+ v.s
166
+
167
+ #misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
168
+ v
169
+ vs
170
+ i.e
171
+ rev
172
+ e.g
173
+
174
+ #Numbers only. These are treated as non-breaking prefixes ONLY when followed by a numeric sequence; otherwise the period may end a sentence
175
+ # add the marker #NUMERIC_ONLY# after the word to enable this behavior
176
+ #This case is mostly for the English "No." which can either be a sentence of its own, or
177
+ #if followed by a number, a non-breaking prefix
178
+ No #NUMERIC_ONLY#
179
+ Nos
180
+ Art #NUMERIC_ONLY#
181
+ Nr
182
+ pp #NUMERIC_ONLY#
183
+
184
+ #unified abbreviation list
185
+ Acad
186
+ Adj
187
+ Adm
188
+ Adv
189
+ Affl
190
+ Ag
191
+ Apr
192
+ Art
193
+ Asst
194
+ Av
195
+ Avg
196
+ B.ches-du-Rh
197
+ Bart
198
+ Bco
199
+ Bldg
200
+ Brig
201
+ Bros
202
+ C.a
203
+ C.p.c.n
204
+ Ca
205
+ Capt
206
+ Cdt
207
+ Cf
208
+ Ch.-Mme
209
+ Chap
210
+ Cie
211
+ Cmdr
212
+ Col
213
+ Comdr
214
+ Con
215
+ Corp
216
+ Cpl
217
+ DR
218
+ DRA
219
+ Da
220
+ Dec
221
+ Dep
222
+ Dic
223
+ Dn
224
+ Dr
225
+ Dra
226
+ Dras
227
+ Drs
228
+ Eng
229
+ Enga
230
+ Engas
231
+ Engos
232
+ Ens
233
+ Ets
234
+ Euro
235
+ Ev
236
+ Ex
237
+ Excmo
238
+ Exmo
239
+ Exo
240
+ Fa
241
+ Fco
242
+ Feb
243
+ Febbr
244
+ Fig
245
+ Fr
246
+ Gar
247
+ Gen
248
+ Gir
249
+ Gl
250
+ Gov
251
+ Hno
252
+ Hon
253
+ Hosp
254
+ Hr
255
+ Ilmo
256
+ Insp
257
+ J.-C
258
+ Jan
259
+ Jeu
260
+ Jr
261
+ Jul
262
+ Jun
263
+ Lda
264
+ Lieut
265
+ Lt
266
+ Lun
267
+ MM
268
+ MR
269
+ MRS
270
+ MS
271
+ MSc
272
+ Magg
273
+ Maj
274
+ Mar
275
+ Me
276
+ Mej
277
+ Mer
278
+ Mes
279
+ Messrs
280
+ Mgr
281
+ Mgrs
282
+ Mll
283
+ Mlle
284
+ Mlle(s)
285
+ Mme
286
+ Mme(s)
287
+ Mr
288
+ Mrs
289
+ Ms
290
+ Msgr
291
+ Mw
292
+ Nov
293
+ Npr
294
+ Nr
295
+ O.d.J
296
+ Okt
297
+ Op
298
+ Ord
299
+ Ott
300
+ Oz
301
+ P
302
+ P.D
303
+ P.ej
304
+ P.p.c
305
+ Pas
306
+ Pfc
307
+ Ph
308
+ Prim
309
+ Prof
310
+ Pte
311
+ Pts
312
+ Pvt
313
+ Rep
314
+ Reps
315
+ Res
316
+ Rev
317
+ Revd
318
+ Rh
319
+ Riv
320
+ Rt
321
+ S.Em
322
+ S.Exc
323
+ S.a.r.l
324
+ Sen
325
+ Sens
326
+ Sep
327
+ Sept
328
+ Sett
329
+ Sfc
330
+ Sgt
331
+ Sl
332
+ Sr
333
+ Sra
334
+ Sras
335
+ Srs
336
+ Srta
337
+ St
338
+ ST
339
+ Sta
340
+ Ste
341
+ Sto
342
+ Supt
343
+ Surg
344
+ Tj
345
+ Tr
346
+ Ud
347
+ Uds
348
+ V.Exc
349
+ Vd
350
+ Vda
351
+ Vds
352
+ Vz
353
+ Z.D
354
+ Z.D.H
355
+ Z.E
356
+ Z.Em
357
+ Z.H
358
+ Z.K.H
359
+ Z.K.M
360
+ Z.M
361
+ a
362
+ a./s
363
+ a.C
364
+ a.g.v
365
+ a.l
366
+ abrev
367
+ abs
368
+ ac
369
+ acc
370
+ acron
371
+ adj
372
+ adm
373
+ adr
374
+ adv
375
+ ag
376
+ alt
377
+ anal
378
+ anat
379
+ angl
380
+ appos
381
+ apr
382
+ apr
383
+ asc
384
+ atm
385
+ auj
386
+ aux
387
+ av
388
+ avg
389
+ avr
390
+ b
391
+ b.a.o
392
+ b.a.p
393
+ b.a.r
394
+ bacc
395
+ bat
396
+ bc
397
+ bd
398
+ bde
399
+ bgen
400
+ bijv
401
+ bijz
402
+ br
403
+ bv
404
+ c
405
+ c.-a-d
406
+ c.a.f
407
+ c.i
408
+ cc
409
+ cf
410
+ cft
411
+ ch
412
+ ch.-l
413
+ chbre
414
+ chbs
415
+ chf
416
+ col
417
+ coll
418
+ cpl
419
+ cpt
420
+ cpte
421
+ cta
422
+ d
423
+ d.c
424
+ d.w.z
425
+ dcha
426
+ dec
427
+ def
428
+ dem
429
+ dep
430
+ dept
431
+ dhr
432
+ dic
433
+ dipl
434
+ dispo
435
+ div
436
+ dpto
437
+ dr
438
+ dr.h.c
439
+ dra
440
+ dras
441
+ drs
442
+ ds
443
+ dz
444
+ e.c
445
+ e.g
446
+ e.g
447
+ e.k
448
+ eccles
449
+ ecol
450
+ econ
451
+ ed
452
+ ej
453
+ env
454
+ ep
455
+ eq
456
+ et
457
+ etc
458
+ ev
459
+ ex
460
+ exmo
461
+ exo
462
+ exp
463
+ expo
464
+ f.a.c
465
+ fa
466
+ fam
467
+ fasc
468
+ fbg
469
+ feb
470
+ febbr
471
+ fem
472
+ fevr
473
+ ff
474
+ fl
475
+ fol
476
+ fr
477
+ fs
478
+ fut
479
+ gd
480
+ gde
481
+ gdes
482
+ gds
483
+ gen
484
+ genn
485
+ gl
486
+ grd
487
+ h.-t
488
+ hab
489
+ i.e
490
+ i.p.v
491
+ i.s.m
492
+ i.t.t
493
+ i.v.m
494
+ ibid
495
+ id
496
+ imp
497
+ ing
498
+ ir
499
+ iron
500
+ itd
501
+ itn
502
+ itp
503
+ izq
504
+ j
505
+ janv
506
+ jhr
507
+ jkvr
508
+ jr
509
+ l
510
+ lat
511
+ lex
512
+ lgen
513
+ lib
514
+ lieut
515
+ liv
516
+ lkol
517
+ loc
518
+ lof
519
+ m
520
+ m.a.w
521
+ m.b.t
522
+ m.b.v
523
+ m.h.o
524
+ m.i
525
+ m.i.v
526
+ magg
527
+ maj
528
+ mar
529
+ mas
530
+ max
531
+ med
532
+ mevr
533
+ min
534
+ mll
535
+ mr
536
+ ms
537
+ mtr
538
+ mtrs
539
+ n
540
+ n
541
+ n.f
542
+ n.f.pl
543
+ n.m
544
+ n.m.pl
545
+ nov
546
+ npr
547
+ o
548
+ o.b.s
549
+ obs
550
+ oct
551
+ okt
552
+ ord
553
+ ott
554
+ oz
555
+ p
556
+ p
557
+ p.a
558
+ p.ej
559
+ p.ex
560
+ p.g.c.d
561
+ p.i
562
+ p.j
563
+ p.m
564
+ p.o
565
+ p.p
566
+ p.p.c.d
567
+ p.p.c.m
568
+ p.pa
569
+ p.pr
570
+ pl
571
+ plv
572
+ poe
573
+ pp
574
+ pp
575
+ pr
576
+ pr
577
+ pres
578
+ prev
579
+ prof
580
+ px
581
+ q.s
582
+ qqch
583
+ qqf
584
+ qqn
585
+ qqns
586
+ r.-de-ch
587
+ r.p.m
588
+ rc
589
+ rd
590
+ ref
591
+ refl
592
+ reg
593
+ rev
594
+ ro
595
+ rte
596
+ s
597
+ s
598
+ s.a
599
+ s.b.f
600
+ s.d
601
+ s.e
602
+ s.l
603
+ s.l.n.d
604
+ s.l.p
605
+ s.t.p
606
+ s.v.p
607
+ s/c
608
+ sc
609
+ sett
610
+ sf
611
+ sgt
612
+ sl
613
+ sr
614
+ sra
615
+ sras
616
+ srs
617
+ ss
618
+ sto
619
+ t
620
+ t.s.v.p
621
+ tec
622
+ tel
623
+ terr
624
+ tg
625
+ tint
626
+ tit
627
+ tj
628
+ tr
629
+ travx
630
+ v
631
+ v.intr
632
+ v.tr
633
+ v.w.t
634
+ var
635
+ vs
636
+ vta
637
+ vx
638
+ z.v
639
+ zool
640
+ Št
641
+ št