opener-tokenizer-base 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +148 -0
  3. data/bin/tokenizer-base +5 -0
  4. data/bin/tokenizer-de +5 -0
  5. data/bin/tokenizer-en +5 -0
  6. data/bin/tokenizer-es +5 -0
  7. data/bin/tokenizer-fr +5 -0
  8. data/bin/tokenizer-it +5 -0
  9. data/bin/tokenizer-nl +5 -0
  10. data/core/lib/Data/OptList.pm +256 -0
  11. data/core/lib/Params/Util.pm +866 -0
  12. data/core/lib/Sub/Exporter.pm +1101 -0
  13. data/core/lib/Sub/Exporter/Cookbook.pod +309 -0
  14. data/core/lib/Sub/Exporter/Tutorial.pod +280 -0
  15. data/core/lib/Sub/Exporter/Util.pm +354 -0
  16. data/core/lib/Sub/Install.pm +329 -0
  17. data/core/lib/Time/Stamp.pm +808 -0
  18. data/core/load-prefixes.pl +43 -0
  19. data/core/nonbreaking_prefixes/abbreviation_list.kaf +0 -0
  20. data/core/nonbreaking_prefixes/abbreviation_list.txt +444 -0
  21. data/core/nonbreaking_prefixes/nonbreaking_prefix.ca +533 -0
  22. data/core/nonbreaking_prefixes/nonbreaking_prefix.de +781 -0
  23. data/core/nonbreaking_prefixes/nonbreaking_prefix.el +448 -0
  24. data/core/nonbreaking_prefixes/nonbreaking_prefix.en +564 -0
  25. data/core/nonbreaking_prefixes/nonbreaking_prefix.es +758 -0
  26. data/core/nonbreaking_prefixes/nonbreaking_prefix.fr +1027 -0
  27. data/core/nonbreaking_prefixes/nonbreaking_prefix.is +697 -0
  28. data/core/nonbreaking_prefixes/nonbreaking_prefix.it +641 -0
  29. data/core/nonbreaking_prefixes/nonbreaking_prefix.nl +739 -0
  30. data/core/nonbreaking_prefixes/nonbreaking_prefix.pl +729 -0
  31. data/core/nonbreaking_prefixes/nonbreaking_prefix.pt +656 -0
  32. data/core/nonbreaking_prefixes/nonbreaking_prefix.ro +484 -0
  33. data/core/nonbreaking_prefixes/nonbreaking_prefix.ru +705 -0
  34. data/core/nonbreaking_prefixes/nonbreaking_prefix.sk +920 -0
  35. data/core/nonbreaking_prefixes/nonbreaking_prefix.sl +524 -0
  36. data/core/nonbreaking_prefixes/nonbreaking_prefix.sv +492 -0
  37. data/core/split-sentences.pl +114 -0
  38. data/core/text-fixer.pl +169 -0
  39. data/core/tokenizer-cli.pl +363 -0
  40. data/core/tokenizer.pl +145 -0
  41. data/lib/opener/tokenizers/base.rb +84 -0
  42. data/lib/opener/tokenizers/base/version.rb +8 -0
  43. data/opener-tokenizer-base.gemspec +25 -0
  44. metadata +134 -0
@@ -0,0 +1,533 @@
1
+ Dr
2
+ Dra
3
+ pàg
4
+ p
5
+ c
6
+ av
7
+ Sr
8
+ Sra
9
+ adm
10
+ esq
11
+ Prof
12
+ S.A
13
+ S.L
14
+ p.e
15
+ ptes
16
+ Sta
17
+ St
18
+ pl
19
+ màx
20
+ cast
21
+ dir
22
+ nre
23
+ fra
24
+ admdora
25
+ Emm
26
+ Excma
27
+ espf
28
+ dc
29
+ admdor
30
+ tel
31
+ angl
32
+ aprox
33
+ ca
34
+ dept
35
+ dj
36
+ dl
37
+ dt
38
+ ds
39
+ dg
40
+ dv
41
+ ed
42
+ entl
43
+ al
44
+ i.e
45
+ maj
46
+ smin
47
+ n
48
+ núm
49
+ pta
50
+ A
51
+ B
52
+ C
53
+ D
54
+ E
55
+ F
56
+ G
57
+ H
58
+ I
59
+ J
60
+ K
61
+ L
62
+ M
63
+ N
64
+ O
65
+ P
66
+ Q
67
+ R
68
+ S
69
+ T
70
+ U
71
+ V
72
+ W
73
+ X
74
+ Y
75
+ Z
76
+
77
+ #unified abbreviation list
78
+ Abr
79
+ Acad
80
+ Adj
81
+ Adm
82
+ Adv
83
+ Affl
84
+ Ago
85
+ Apr
86
+ Art
87
+ Asst
88
+ Av
89
+ Avg
90
+ B.ches-du-Rh
91
+ Bart
92
+ Bco
93
+ Bldg
94
+ Brig
95
+ Bros
96
+ C.a
97
+ C.p.c.n
98
+ Ca
99
+ Capt
100
+ Cdt
101
+ Cf
102
+ Ch.-Mme
103
+ Chap
104
+ Cie
105
+ Cmdr
106
+ Col
107
+ Comdr
108
+ Con
109
+ Corp
110
+ Cpl
111
+ DR
112
+ DRA
113
+ Da
114
+ Dec
115
+ Dep
116
+ Dn
117
+ Dr
118
+ Dra
119
+ Dras
120
+ Drs
121
+ Eng
122
+ Enga
123
+ Engas
124
+ Engos
125
+ Ens
126
+ Ets
127
+ Euro
128
+ Ev
129
+ Ex
130
+ Excmo
131
+ Exmo
132
+ Exo
133
+ Fa
134
+ Fco
135
+ Feb
136
+ Fig
137
+ Fr
138
+ Gar
139
+ Gen
140
+ Gir
141
+ Gl
142
+ Gov
143
+ Hno
144
+ Hon
145
+ Hosp
146
+ Hr
147
+ Ilmo
148
+ Insp
149
+ J.-C
150
+ Jan
151
+ Jeu
152
+ Jr
153
+ Jul
154
+ Jun
155
+ Lda
156
+ Lieut
157
+ Lt
158
+ Lun
159
+ MM
160
+ MR
161
+ MRS
162
+ MS
163
+ MSc
164
+ Mai
165
+ Maj
166
+ Mar
167
+ Me
168
+ Mej
169
+ Mer
170
+ Mes
171
+ Messrs
172
+ Mgr
173
+ Mgrs
174
+ Mll
175
+ Mlle
176
+ Mlle(s)
177
+ Mme
178
+ Mme(s)
179
+ Mr
180
+ Mrs
181
+ Ms
182
+ Msgr
183
+ Mw
184
+ Nov
185
+ Npr
186
+ Nr
187
+ O.d.J
188
+ Oct
189
+ Okt
190
+ Op
191
+ Ord
192
+ Oz
193
+ P
194
+ P.D
195
+ P.ej
196
+ P.p.c
197
+ Pas
198
+ Pfc
199
+ Ph
200
+ Prim
201
+ Prof
202
+ Pte
203
+ Pts
204
+ Pvt
205
+ Rep
206
+ Reps
207
+ Res
208
+ Rev
209
+ Revd
210
+ Rh
211
+ Riv
212
+ Rt
213
+ S.Em
214
+ S.Exc
215
+ S.a.r.l
216
+ Sen
217
+ Sens
218
+ Sep
219
+ Sept
220
+ Sfc
221
+ Sgt
222
+ SGT
223
+ Sl
224
+ Sr
225
+ Sra
226
+ Sras
227
+ Srs
228
+ Srta
229
+ St
230
+ ST
231
+ Sta
232
+ Ste
233
+ Sto
234
+ Supt
235
+ Surg
236
+ Tj
237
+ Tr
238
+ Ud
239
+ Uds
240
+ V.Exc
241
+ Vd
242
+ Vda
243
+ Vds
244
+ Vz
245
+ Z.D
246
+ Z.D.H
247
+ Z.E
248
+ Z.Em
249
+ Z.H
250
+ Z.K.H
251
+ Z.K.M
252
+ Z.M
253
+ a
254
+ a./s
255
+ a.C
256
+ a.g.v
257
+ a.l
258
+ abr
259
+ abrev
260
+ abs
261
+ ac
262
+ acc
263
+ acron
264
+ adj
265
+ adm
266
+ adr
267
+ adv
268
+ alt
269
+ ago
270
+ anal
271
+ anat
272
+ angl
273
+ appos
274
+ apr
275
+ apr
276
+ asc
277
+ atm
278
+ auj
279
+ aux
280
+ av
281
+ avg
282
+ avr
283
+ b
284
+ b.a.o
285
+ b.a.p
286
+ b.a.r
287
+ bacc
288
+ bat
289
+ bc
290
+ bd
291
+ bde
292
+ bgen
293
+ bijv
294
+ bijz
295
+ br
296
+ bv
297
+ c
298
+ c.-a-d
299
+ c.a.f
300
+ c.i
301
+ cc
302
+ cf
303
+ cft
304
+ ch
305
+ ch.-l
306
+ chbre
307
+ chbs
308
+ chf
309
+ col
310
+ coll
311
+ cpl
312
+ cpt
313
+ cpte
314
+ cta
315
+ d
316
+ d.c
317
+ d.w.z
318
+ dcha
319
+ dec
320
+ def
321
+ dem
322
+ dep
323
+ dept
324
+ dhr
325
+ dipl
326
+ dispo
327
+ div
328
+ dpto
329
+ dr
330
+ dr.h.c
331
+ dra
332
+ dras
333
+ drs
334
+ ds
335
+ dz
336
+ e.c
337
+ e.g
338
+ e.g
339
+ e.k
340
+ eccles
341
+ ecol
342
+ econ
343
+ ed
344
+ ej
345
+ env
346
+ ep
347
+ eq
348
+ et
349
+ etc
350
+ ev
351
+ ex
352
+ exmo
353
+ exo
354
+ exp
355
+ expo
356
+ f.a.c
357
+ fa
358
+ fam
359
+ fasc
360
+ fbg
361
+ feb
362
+ fem
363
+ fevr
364
+ ff
365
+ fl
366
+ fol
367
+ fr
368
+ fs
369
+ fut
370
+ gd
371
+ gde
372
+ gdes
373
+ gds
374
+ gen
375
+ gl
376
+ grd
377
+ h.-t
378
+ hab
379
+ i.e
380
+ i.p.v
381
+ i.s.m
382
+ i.t.t
383
+ i.v.m
384
+ ibid
385
+ id
386
+ imp
387
+ ing
388
+ ir
389
+ iron
390
+ itd
391
+ itn
392
+ itp
393
+ izq
394
+ j
395
+ janv
396
+ jhr
397
+ jkvr
398
+ jr
399
+ jul
400
+ jun
401
+ l
402
+ lat
403
+ lex
404
+ lgen
405
+ lib
406
+ lieut
407
+ liv
408
+ lkol
409
+ loc
410
+ lof
411
+ m
412
+ m.a.w
413
+ m.b.t
414
+ m.b.v
415
+ m.h.o
416
+ m.i
417
+ m.i.v
418
+ mai
419
+ maj
420
+ mar
421
+ mas
422
+ max
423
+ med
424
+ mevr
425
+ min
426
+ mll
427
+ mr
428
+ ms
429
+ mtr
430
+ mtrs
431
+ n
432
+ n
433
+ n.f
434
+ n.f.pl
435
+ n.m
436
+ n.m.pl
437
+ nov
438
+ npr
439
+ o
440
+ o.b.s
441
+ obs
442
+ oct
443
+ okt
444
+ ord
445
+ oz
446
+ p
447
+ p
448
+ p.a
449
+ p.ej
450
+ p.ex
451
+ p.g.c.d
452
+ p.i
453
+ p.j
454
+ p.m
455
+ p.o
456
+ p.p
457
+ p.p.c.d
458
+ p.p.c.m
459
+ p.pa
460
+ p.pr
461
+ pl
462
+ plv
463
+ poe
464
+ pp
465
+ pp
466
+ pr
467
+ pr
468
+ pres
469
+ prev
470
+ prof
471
+ px
472
+ q.s
473
+ qqch
474
+ qqf
475
+ qqn
476
+ qqns
477
+ r.-de-ch
478
+ r.p.m
479
+ rc
480
+ rd
481
+ ref
482
+ refl
483
+ reg
484
+ rev
485
+ ro
486
+ rte
487
+ s
488
+ s
489
+ s.a
490
+ s.b.f
491
+ s.d
492
+ s.e
493
+ s.l
494
+ s.l.n.d
495
+ s.l.p
496
+ s.t.p
497
+ s.v.p
498
+ s/c
499
+ sc
500
+ sep
501
+ sept
502
+ sf
503
+ sgt
504
+ sl
505
+ sr
506
+ sra
507
+ sras
508
+ srs
509
+ ss
510
+ sto
511
+ t
512
+ t.s.v.p
513
+ tec
514
+ tel
515
+ terr
516
+ tg
517
+ tint
518
+ tit
519
+ tj
520
+ tr
521
+ travx
522
+ v
523
+ v.intr
524
+ v.tr
525
+ v.w.t
526
+ var
527
+ vs
528
+ vta
529
+ vx
530
+ z.v
531
+ zool
532
+ Št
533
+ št