opener-tokenizer-base 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +148 -0
  3. data/bin/tokenizer-base +5 -0
  4. data/bin/tokenizer-de +5 -0
  5. data/bin/tokenizer-en +5 -0
  6. data/bin/tokenizer-es +5 -0
  7. data/bin/tokenizer-fr +5 -0
  8. data/bin/tokenizer-it +5 -0
  9. data/bin/tokenizer-nl +5 -0
  10. data/core/lib/Data/OptList.pm +256 -0
  11. data/core/lib/Params/Util.pm +866 -0
  12. data/core/lib/Sub/Exporter.pm +1101 -0
  13. data/core/lib/Sub/Exporter/Cookbook.pod +309 -0
  14. data/core/lib/Sub/Exporter/Tutorial.pod +280 -0
  15. data/core/lib/Sub/Exporter/Util.pm +354 -0
  16. data/core/lib/Sub/Install.pm +329 -0
  17. data/core/lib/Time/Stamp.pm +808 -0
  18. data/core/load-prefixes.pl +43 -0
  19. data/core/nonbreaking_prefixes/abbreviation_list.kaf +0 -0
  20. data/core/nonbreaking_prefixes/abbreviation_list.txt +444 -0
  21. data/core/nonbreaking_prefixes/nonbreaking_prefix.ca +533 -0
  22. data/core/nonbreaking_prefixes/nonbreaking_prefix.de +781 -0
  23. data/core/nonbreaking_prefixes/nonbreaking_prefix.el +448 -0
  24. data/core/nonbreaking_prefixes/nonbreaking_prefix.en +564 -0
  25. data/core/nonbreaking_prefixes/nonbreaking_prefix.es +758 -0
  26. data/core/nonbreaking_prefixes/nonbreaking_prefix.fr +1027 -0
  27. data/core/nonbreaking_prefixes/nonbreaking_prefix.is +697 -0
  28. data/core/nonbreaking_prefixes/nonbreaking_prefix.it +641 -0
  29. data/core/nonbreaking_prefixes/nonbreaking_prefix.nl +739 -0
  30. data/core/nonbreaking_prefixes/nonbreaking_prefix.pl +729 -0
  31. data/core/nonbreaking_prefixes/nonbreaking_prefix.pt +656 -0
  32. data/core/nonbreaking_prefixes/nonbreaking_prefix.ro +484 -0
  33. data/core/nonbreaking_prefixes/nonbreaking_prefix.ru +705 -0
  34. data/core/nonbreaking_prefixes/nonbreaking_prefix.sk +920 -0
  35. data/core/nonbreaking_prefixes/nonbreaking_prefix.sl +524 -0
  36. data/core/nonbreaking_prefixes/nonbreaking_prefix.sv +492 -0
  37. data/core/split-sentences.pl +114 -0
  38. data/core/text-fixer.pl +169 -0
  39. data/core/tokenizer-cli.pl +363 -0
  40. data/core/tokenizer.pl +145 -0
  41. data/lib/opener/tokenizers/base.rb +84 -0
  42. data/lib/opener/tokenizers/base/version.rb +8 -0
  43. data/opener-tokenizer-base.gemspec +25 -0
  44. metadata +134 -0
@@ -0,0 +1,492 @@
1
+ #single upper case letters are usually initials
2
+ A
3
+ B
4
+ C
5
+ D
6
+ E
7
+ F
8
+ G
9
+ H
10
+ I
11
+ J
12
+ K
13
+ L
14
+ M
15
+ N
16
+ O
17
+ P
18
+ Q
19
+ R
20
+ S
21
+ T
22
+ U
23
+ V
24
+ W
25
+ X
26
+ Y
27
+ Z
28
+ #misc abbreviations
29
+ AB
30
+ G
31
+ VG
32
+ dvs
33
+ etc
34
+ from
35
+ iaf
36
+ jfr
37
+ kl
38
+ kr
39
+ mao
40
+ mfl
41
+ mm
42
+ osv
43
+ pga
44
+ tex
45
+ tom
46
+ vs
47
+
48
+ #unified abbreviation list
49
+ Acad
50
+ Adj
51
+ Adm
52
+ Adv
53
+ Affl
54
+ Apr
55
+ Art
56
+ Asst
57
+ Av
58
+ Avg
59
+ B.ches-du-Rh
60
+ Bart
61
+ Bco
62
+ Bldg
63
+ Brig
64
+ Bros
65
+ C.a
66
+ C.p.c.n
67
+ Ca
68
+ Capt
69
+ Cdt
70
+ Cf
71
+ Ch.-Mme
72
+ Chap
73
+ Cie
74
+ Cmdr
75
+ Col
76
+ Comdr
77
+ Con
78
+ Corp
79
+ Cpl
80
+ DR
81
+ DRA
82
+ Da
83
+ Dec
84
+ Dep
85
+ Dn
86
+ Dr
87
+ Dra
88
+ Dras
89
+ Drs
90
+ Eng
91
+ Enga
92
+ Engas
93
+ Engos
94
+ Ens
95
+ Ets
96
+ Euro
97
+ Ev
98
+ Ex
99
+ Excmo
100
+ Exmo
101
+ Exo
102
+ Fa
103
+ Fco
104
+ Feb
105
+ Fig
106
+ Fr
107
+ Gar
108
+ Gen
109
+ Gir
110
+ Gl
111
+ Gov
112
+ Hno
113
+ Hon
114
+ Hosp
115
+ Hr
116
+ Ilmo
117
+ Insp
118
+ J.-C
119
+ Jan
120
+ Jeu
121
+ Jr
122
+ Jul
123
+ Jun
124
+ Lda
125
+ Lieut
126
+ Lt
127
+ Lun
128
+ MM
129
+ MR
130
+ MRS
131
+ MS
132
+ MSc
133
+ Maj
134
+ Mar
135
+ Me
136
+ Mej
137
+ Mer
138
+ Mes
139
+ Messrs
140
+ Mgr
141
+ Mgrs
142
+ Mll
143
+ Mlle
144
+ Mlle(s)
145
+ Mme
146
+ Mme(s)
147
+ Mr
148
+ Mrs
149
+ Ms
150
+ Msgr
151
+ Mw
152
+ Nov
153
+ Npr
154
+ Nr
155
+ O.d.J
156
+ Okt
157
+ Op
158
+ Ord
159
+ Oz
160
+ P
161
+ P.D
162
+ P.ej
163
+ P.p.c
164
+ Pas
165
+ Pfc
166
+ Ph
167
+ Prim
168
+ Prof
169
+ Pte
170
+ Pts
171
+ Pvt
172
+ Rep
173
+ Reps
174
+ Res
175
+ Rev
176
+ Revd
177
+ Rh
178
+ Riv
179
+ Rt
180
+ S.Em
181
+ S.Exc
182
+ S.a.r.l
183
+ Sen
184
+ Sens
185
+ Sep
186
+ Sept
187
+ Sfc
188
+ Sgt
189
+ SGT
190
+ Sl
191
+ Sr
192
+ Sra
193
+ Sras
194
+ Srs
195
+ Srta
196
+ St
197
+ ST
198
+ Sta
199
+ Ste
200
+ Sto
201
+ Supt
202
+ Surg
203
+ Tj
204
+ Tr
205
+ Ud
206
+ Uds
207
+ V.Exc
208
+ Vd
209
+ Vda
210
+ Vds
211
+ Vz
212
+ Z.D
213
+ Z.D.H
214
+ Z.E
215
+ Z.Em
216
+ Z.H
217
+ Z.K.H
218
+ Z.K.M
219
+ Z.M
220
+ a
221
+ a./s
222
+ a.C
223
+ a.g.v
224
+ a.l
225
+ abrev
226
+ abs
227
+ ac
228
+ acc
229
+ acron
230
+ adj
231
+ adm
232
+ adr
233
+ adv
234
+ alt
235
+ anal
236
+ anat
237
+ angl
238
+ appos
239
+ apr
240
+ apr
241
+ asc
242
+ atm
243
+ auj
244
+ aux
245
+ av
246
+ avg
247
+ avr
248
+ b
249
+ b.a.o
250
+ b.a.p
251
+ b.a.r
252
+ bacc
253
+ bat
254
+ bc
255
+ bd
256
+ bde
257
+ bgen
258
+ bijv
259
+ bijz
260
+ br
261
+ bv
262
+ c
263
+ c.-a-d
264
+ c.a.f
265
+ c.i
266
+ cc
267
+ cf
268
+ cft
269
+ ch
270
+ ch.-l
271
+ chbre
272
+ chbs
273
+ chf
274
+ col
275
+ coll
276
+ cpl
277
+ cpt
278
+ cpte
279
+ cta
280
+ d
281
+ d.c
282
+ d.w.z
283
+ dcha
284
+ dec
285
+ def
286
+ dem
287
+ dep
288
+ dept
289
+ dhr
290
+ dipl
291
+ dispo
292
+ div
293
+ dpto
294
+ dr
295
+ dr.h.c
296
+ dra
297
+ dras
298
+ drs
299
+ ds
300
+ dz
301
+ e.c
302
+ e.g
303
+ e.g
304
+ e.k
305
+ eccles
306
+ ecol
307
+ econ
308
+ ed
309
+ ej
310
+ env
311
+ ep
312
+ eq
313
+ et
314
+ etc
315
+ ev
316
+ ex
317
+ exmo
318
+ exo
319
+ exp
320
+ expo
321
+ f.a.c
322
+ fa
323
+ fam
324
+ fasc
325
+ fbg
326
+ feb
327
+ fem
328
+ fevr
329
+ ff
330
+ fl
331
+ fol
332
+ fr
333
+ fs
334
+ fut
335
+ gd
336
+ gde
337
+ gdes
338
+ gds
339
+ gen
340
+ gl
341
+ grd
342
+ h.-t
343
+ hab
344
+ i.e
345
+ i.p.v
346
+ i.s.m
347
+ i.t.t
348
+ i.v.m
349
+ ibid
350
+ id
351
+ imp
352
+ ing
353
+ ir
354
+ iron
355
+ itd
356
+ itn
357
+ itp
358
+ izq
359
+ j
360
+ janv
361
+ jhr
362
+ jkvr
363
+ jr
364
+ l
365
+ lat
366
+ lex
367
+ lgen
368
+ lib
369
+ lieut
370
+ liv
371
+ lkol
372
+ loc
373
+ lof
374
+ m
375
+ m.a.w
376
+ m.b.t
377
+ m.b.v
378
+ m.h.o
379
+ m.i
380
+ m.i.v
381
+ maj
382
+ mar
383
+ mas
384
+ max
385
+ med
386
+ mevr
387
+ min
388
+ mll
389
+ mr
390
+ ms
391
+ mtr
392
+ mtrs
393
+ n
394
+ n
395
+ n.f
396
+ n.f.pl
397
+ n.m
398
+ n.m.pl
399
+ npr
400
+ o
401
+ o.b.s
402
+ obs
403
+ oct
404
+ okt
405
+ ord
406
+ oz
407
+ p
408
+ p
409
+ p.a
410
+ p.ej
411
+ p.ex
412
+ p.g.c.d
413
+ p.i
414
+ p.j
415
+ p.m
416
+ p.o
417
+ p.p
418
+ p.p.c.d
419
+ p.p.c.m
420
+ p.pa
421
+ p.pr
422
+ pl
423
+ plv
424
+ poe
425
+ pp
426
+ pp
427
+ pr
428
+ pr
429
+ pres
430
+ prev
431
+ prof
432
+ px
433
+ q.s
434
+ qqch
435
+ qqf
436
+ qqn
437
+ qqns
438
+ r.-de-ch
439
+ r.p.m
440
+ rc
441
+ rd
442
+ ref
443
+ refl
444
+ reg
445
+ rev
446
+ ro
447
+ rte
448
+ s
449
+ s
450
+ s.a
451
+ s.b.f
452
+ s.d
453
+ s.e
454
+ s.l
455
+ s.l.n.d
456
+ s.l.p
457
+ s.t.p
458
+ s.v.p
459
+ s/c
460
+ sc
461
+ sf
462
+ sgt
463
+ sl
464
+ sr
465
+ sra
466
+ sras
467
+ srs
468
+ ss
469
+ sto
470
+ t
471
+ t.s.v.p
472
+ tec
473
+ tel
474
+ terr
475
+ tg
476
+ tint
477
+ tit
478
+ tj
479
+ tr
480
+ travx
481
+ v
482
+ v.intr
483
+ v.tr
484
+ v.w.t
485
+ var
486
+ vs
487
+ vta
488
+ vx
489
+ z.v
490
+ zool
491
+ Št
492
+ št