opener-tokenizer-base 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +148 -0
  3. data/bin/tokenizer-base +5 -0
  4. data/bin/tokenizer-de +5 -0
  5. data/bin/tokenizer-en +5 -0
  6. data/bin/tokenizer-es +5 -0
  7. data/bin/tokenizer-fr +5 -0
  8. data/bin/tokenizer-it +5 -0
  9. data/bin/tokenizer-nl +5 -0
  10. data/core/lib/Data/OptList.pm +256 -0
  11. data/core/lib/Params/Util.pm +866 -0
  12. data/core/lib/Sub/Exporter.pm +1101 -0
  13. data/core/lib/Sub/Exporter/Cookbook.pod +309 -0
  14. data/core/lib/Sub/Exporter/Tutorial.pod +280 -0
  15. data/core/lib/Sub/Exporter/Util.pm +354 -0
  16. data/core/lib/Sub/Install.pm +329 -0
  17. data/core/lib/Time/Stamp.pm +808 -0
  18. data/core/load-prefixes.pl +43 -0
  19. data/core/nonbreaking_prefixes/abbreviation_list.kaf +0 -0
  20. data/core/nonbreaking_prefixes/abbreviation_list.txt +444 -0
  21. data/core/nonbreaking_prefixes/nonbreaking_prefix.ca +533 -0
  22. data/core/nonbreaking_prefixes/nonbreaking_prefix.de +781 -0
  23. data/core/nonbreaking_prefixes/nonbreaking_prefix.el +448 -0
  24. data/core/nonbreaking_prefixes/nonbreaking_prefix.en +564 -0
  25. data/core/nonbreaking_prefixes/nonbreaking_prefix.es +758 -0
  26. data/core/nonbreaking_prefixes/nonbreaking_prefix.fr +1027 -0
  27. data/core/nonbreaking_prefixes/nonbreaking_prefix.is +697 -0
  28. data/core/nonbreaking_prefixes/nonbreaking_prefix.it +641 -0
  29. data/core/nonbreaking_prefixes/nonbreaking_prefix.nl +739 -0
  30. data/core/nonbreaking_prefixes/nonbreaking_prefix.pl +729 -0
  31. data/core/nonbreaking_prefixes/nonbreaking_prefix.pt +656 -0
  32. data/core/nonbreaking_prefixes/nonbreaking_prefix.ro +484 -0
  33. data/core/nonbreaking_prefixes/nonbreaking_prefix.ru +705 -0
  34. data/core/nonbreaking_prefixes/nonbreaking_prefix.sk +920 -0
  35. data/core/nonbreaking_prefixes/nonbreaking_prefix.sl +524 -0
  36. data/core/nonbreaking_prefixes/nonbreaking_prefix.sv +492 -0
  37. data/core/split-sentences.pl +114 -0
  38. data/core/text-fixer.pl +169 -0
  39. data/core/tokenizer-cli.pl +363 -0
  40. data/core/tokenizer.pl +145 -0
  41. data/lib/opener/tokenizers/base.rb +84 -0
  42. data/lib/opener/tokenizers/base/version.rb +8 -0
  43. data/opener-tokenizer-base.gemspec +25 -0
  44. metadata +134 -0
@@ -0,0 +1,524 @@
1
+ dr
2
+ Dr
3
+ itd
4
+ itn
5
+ št #NUMERIC_ONLY#
6
+ Št #NUMERIC_ONLY#
7
+ d
8
+ jan
9
+ Jan
10
+ feb
11
+ Feb
12
+ mar
13
+ Mar
14
+ apr
15
+ Apr
16
+ jun
17
+ Jun
18
+ jul
19
+ Jul
20
+ avg
21
+ Avg
22
+ sept
23
+ Sept
24
+ sep
25
+ Sep
26
+ okt
27
+ Okt
28
+ nov
29
+ Nov
30
+ dec
31
+ Dec
32
+ tj
33
+ Tj
34
+ npr
35
+ Npr
36
+ sl
37
+ Sl
38
+ op
39
+ Op
40
+ gl
41
+ Gl
42
+ oz
43
+ Oz
44
+ prev
45
+ dipl
46
+ ing
47
+ prim
48
+ Prim
49
+ cf
50
+ Cf
51
+ gl
52
+ Gl
53
+ A
54
+ B
55
+ C
56
+ D
57
+ E
58
+ F
59
+ G
60
+ H
61
+ I
62
+ J
63
+ K
64
+ L
65
+ M
66
+ N
67
+ O
68
+ P
69
+ Q
70
+ R
71
+ S
72
+ T
73
+ U
74
+ V
75
+ W
76
+ X
77
+ Y
78
+ Z
79
+
80
+ #unified abbreviation list
81
+ Acad
82
+ Adj
83
+ Adm
84
+ Adv
85
+ Affl
86
+ Apr
87
+ Art
88
+ Asst
89
+ Av
90
+ Avg
91
+ B.ches-du-Rh
92
+ Bart
93
+ Bco
94
+ Bldg
95
+ Brig
96
+ Bros
97
+ C.a
98
+ C.p.c.n
99
+ Ca
100
+ Capt
101
+ Cdt
102
+ Cf
103
+ Ch.-Mme
104
+ Chap
105
+ Cie
106
+ Cmdr
107
+ Col
108
+ Comdr
109
+ Con
110
+ Corp
111
+ Cpl
112
+ DR
113
+ DRA
114
+ Da
115
+ Dec
116
+ Dep
117
+ Dn
118
+ Dr
119
+ Dra
120
+ Dras
121
+ Drs
122
+ Eng
123
+ Enga
124
+ Engas
125
+ Engos
126
+ Ens
127
+ Ets
128
+ Euro
129
+ Ev
130
+ Ex
131
+ Excmo
132
+ Exmo
133
+ Exo
134
+ Fa
135
+ Fco
136
+ Feb
137
+ Fig
138
+ Fr
139
+ Gar
140
+ Gen
141
+ Gir
142
+ Gl
143
+ Gov
144
+ Hno
145
+ Hon
146
+ Hosp
147
+ Hr
148
+ Ilmo
149
+ Insp
150
+ J.-C
151
+ Jan
152
+ Jeu
153
+ Jr
154
+ Jul
155
+ Jun
156
+ Lda
157
+ Lieut
158
+ Lt
159
+ Lun
160
+ MM
161
+ MR
162
+ MRS
163
+ MS
164
+ MSc
165
+ Maj
166
+ Mar
167
+ Me
168
+ Mej
169
+ Mer
170
+ Mes
171
+ Messrs
172
+ Mgr
173
+ Mgrs
174
+ Mll
175
+ Mlle
176
+ Mlle(s)
177
+ Mme
178
+ Mme(s)
179
+ Mr
180
+ Mrs
181
+ Ms
182
+ Msgr
183
+ Mw
184
+ Nov
185
+ Npr
186
+ Nr
187
+ O.d.J
188
+ Okt
189
+ Op
190
+ Ord
191
+ Oz
192
+ P
193
+ P.D
194
+ P.ej
195
+ P.p.c
196
+ Pas
197
+ Pfc
198
+ Ph
199
+ Prim
200
+ Prof
201
+ Pte
202
+ Pts
203
+ Pvt
204
+ Rep
205
+ Reps
206
+ Res
207
+ Rev
208
+ Revd
209
+ Rh
210
+ Riv
211
+ Rt
212
+ S.Em
213
+ S.Exc
214
+ S.a.r.l
215
+ Sen
216
+ Sens
217
+ Sep
218
+ Sept
219
+ Sfc
220
+ Sgt
221
+ SGT
222
+ Sl
223
+ Sr
224
+ Sra
225
+ Sras
226
+ Srs
227
+ Srta
228
+ St
229
+ ST
230
+ Sta
231
+ Ste
232
+ Sto
233
+ Supt
234
+ Surg
235
+ Tj
236
+ Tr
237
+ Ud
238
+ Uds
239
+ V.Exc
240
+ Vd
241
+ Vda
242
+ Vds
243
+ Vz
244
+ Z.D
245
+ Z.D.H
246
+ Z.E
247
+ Z.Em
248
+ Z.H
249
+ Z.K.H
250
+ Z.K.M
251
+ Z.M
252
+ a
253
+ a./s
254
+ a.C
255
+ a.g.v
256
+ a.l
257
+ abrev
258
+ abs
259
+ ac
260
+ acc
261
+ acron
262
+ adj
263
+ adm
264
+ adr
265
+ adv
266
+ alt
267
+ anal
268
+ anat
269
+ angl
270
+ appos
271
+ apr
272
+ apr
273
+ asc
274
+ atm
275
+ auj
276
+ aux
277
+ av
278
+ avg
279
+ avr
280
+ b
281
+ b.a.o
282
+ b.a.p
283
+ b.a.r
284
+ bacc
285
+ bat
286
+ bc
287
+ bd
288
+ bde
289
+ bgen
290
+ bijv
291
+ bijz
292
+ br
293
+ bv
294
+ c
295
+ c.-a-d
296
+ c.a.f
297
+ c.i
298
+ cc
299
+ cf
300
+ cft
301
+ ch
302
+ ch.-l
303
+ chbre
304
+ chbs
305
+ chf
306
+ col
307
+ coll
308
+ cpl
309
+ cpt
310
+ cpte
311
+ cta
312
+ d
313
+ d.c
314
+ d.w.z
315
+ dcha
316
+ dec
317
+ def
318
+ dem
319
+ dep
320
+ dept
321
+ dhr
322
+ dipl
323
+ dispo
324
+ div
325
+ dpto
326
+ dr
327
+ dr.h.c
328
+ dra
329
+ dras
330
+ drs
331
+ ds
332
+ dz
333
+ e.c
334
+ e.g
335
+ e.g
336
+ e.k
337
+ eccles
338
+ ecol
339
+ econ
340
+ ed
341
+ ej
342
+ env
343
+ ep
344
+ eq
345
+ et
346
+ etc
347
+ ev
348
+ ex
349
+ exmo
350
+ exo
351
+ exp
352
+ expo
353
+ f.a.c
354
+ fa
355
+ fam
356
+ fasc
357
+ fbg
358
+ feb
359
+ fem
360
+ fevr
361
+ ff
362
+ fl
363
+ fol
364
+ fr
365
+ fs
366
+ fut
367
+ gd
368
+ gde
369
+ gdes
370
+ gds
371
+ gen
372
+ gl
373
+ grd
374
+ h.-t
375
+ hab
376
+ i.e
377
+ i.p.v
378
+ i.s.m
379
+ i.t.t
380
+ i.v.m
381
+ ibid
382
+ id
383
+ imp
384
+ ing
385
+ ir
386
+ iron
387
+ itd
388
+ itn
389
+ itp
390
+ izq
391
+ j
392
+ janv
393
+ jhr
394
+ jkvr
395
+ jr
396
+ l
397
+ lat
398
+ lex
399
+ lgen
400
+ lib
401
+ lieut
402
+ liv
403
+ lkol
404
+ loc
405
+ lof
406
+ m
407
+ m.a.w
408
+ m.b.t
409
+ m.b.v
410
+ m.h.o
411
+ m.i
412
+ m.i.v
413
+ maj
414
+ mar
415
+ mas
416
+ max
417
+ med
418
+ mevr
419
+ min
420
+ mll
421
+ mr
422
+ ms
423
+ mtr
424
+ mtrs
425
+ n
426
+ n
427
+ n.f
428
+ n.f.pl
429
+ n.m
430
+ n.m.pl
431
+ npr
432
+ o
433
+ o.b.s
434
+ obs
435
+ oct
436
+ okt
437
+ ord
438
+ oz
439
+ p
440
+ p
441
+ p.a
442
+ p.ej
443
+ p.ex
444
+ p.g.c.d
445
+ p.i
446
+ p.j
447
+ p.m
448
+ p.o
449
+ p.p
450
+ p.p.c.d
451
+ p.p.c.m
452
+ p.pa
453
+ p.pr
454
+ pl
455
+ plv
456
+ poe
457
+ pp
458
+ pp
459
+ pr
460
+ pr
461
+ pres
462
+ prev
463
+ prof
464
+ px
465
+ q.s
466
+ qqch
467
+ qqf
468
+ qqn
469
+ qqns
470
+ r.-de-ch
471
+ r.p.m
472
+ rc
473
+ rd
474
+ ref
475
+ refl
476
+ reg
477
+ rev
478
+ ro
479
+ rte
480
+ s
481
+ s
482
+ s.a
483
+ s.b.f
484
+ s.d
485
+ s.e
486
+ s.l
487
+ s.l.n.d
488
+ s.l.p
489
+ s.t.p
490
+ s.v.p
491
+ s/c
492
+ sc
493
+ sf
494
+ sgt
495
+ sl
496
+ sr
497
+ sra
498
+ sras
499
+ srs
500
+ ss
501
+ sto
502
+ t
503
+ t.s.v.p
504
+ tec
505
+ tel
506
+ terr
507
+ tg
508
+ tint
509
+ tit
510
+ tj
511
+ tr
512
+ travx
513
+ v
514
+ v.intr
515
+ v.tr
516
+ v.w.t
517
+ var
518
+ vs
519
+ vta
520
+ vx
521
+ z.v
522
+ zool
523
+ Št
524
+ št