opener-tokenizer-base 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +148 -0
  3. data/bin/tokenizer-base +5 -0
  4. data/bin/tokenizer-de +5 -0
  5. data/bin/tokenizer-en +5 -0
  6. data/bin/tokenizer-es +5 -0
  7. data/bin/tokenizer-fr +5 -0
  8. data/bin/tokenizer-it +5 -0
  9. data/bin/tokenizer-nl +5 -0
  10. data/core/lib/Data/OptList.pm +256 -0
  11. data/core/lib/Params/Util.pm +866 -0
  12. data/core/lib/Sub/Exporter.pm +1101 -0
  13. data/core/lib/Sub/Exporter/Cookbook.pod +309 -0
  14. data/core/lib/Sub/Exporter/Tutorial.pod +280 -0
  15. data/core/lib/Sub/Exporter/Util.pm +354 -0
  16. data/core/lib/Sub/Install.pm +329 -0
  17. data/core/lib/Time/Stamp.pm +808 -0
  18. data/core/load-prefixes.pl +43 -0
  19. data/core/nonbreaking_prefixes/abbreviation_list.kaf +0 -0
  20. data/core/nonbreaking_prefixes/abbreviation_list.txt +444 -0
  21. data/core/nonbreaking_prefixes/nonbreaking_prefix.ca +533 -0
  22. data/core/nonbreaking_prefixes/nonbreaking_prefix.de +781 -0
  23. data/core/nonbreaking_prefixes/nonbreaking_prefix.el +448 -0
  24. data/core/nonbreaking_prefixes/nonbreaking_prefix.en +564 -0
  25. data/core/nonbreaking_prefixes/nonbreaking_prefix.es +758 -0
  26. data/core/nonbreaking_prefixes/nonbreaking_prefix.fr +1027 -0
  27. data/core/nonbreaking_prefixes/nonbreaking_prefix.is +697 -0
  28. data/core/nonbreaking_prefixes/nonbreaking_prefix.it +641 -0
  29. data/core/nonbreaking_prefixes/nonbreaking_prefix.nl +739 -0
  30. data/core/nonbreaking_prefixes/nonbreaking_prefix.pl +729 -0
  31. data/core/nonbreaking_prefixes/nonbreaking_prefix.pt +656 -0
  32. data/core/nonbreaking_prefixes/nonbreaking_prefix.ro +484 -0
  33. data/core/nonbreaking_prefixes/nonbreaking_prefix.ru +705 -0
  34. data/core/nonbreaking_prefixes/nonbreaking_prefix.sk +920 -0
  35. data/core/nonbreaking_prefixes/nonbreaking_prefix.sl +524 -0
  36. data/core/nonbreaking_prefixes/nonbreaking_prefix.sv +492 -0
  37. data/core/split-sentences.pl +114 -0
  38. data/core/text-fixer.pl +169 -0
  39. data/core/tokenizer-cli.pl +363 -0
  40. data/core/tokenizer.pl +145 -0
  41. data/lib/opener/tokenizers/base.rb +84 -0
  42. data/lib/opener/tokenizers/base/version.rb +8 -0
  43. data/opener-tokenizer-base.gemspec +25 -0
  44. metadata +134 -0
@@ -0,0 +1,43 @@
1
+ #!/usr/bin/perl -w
2
+
3
+ # Moses nonbreaking prefixes loader
4
+ # changed by Andoni Azpeitia (#2013/12/17)
5
+
6
+ use FindBin;
7
+ use utf8;
8
+
9
# Directory that holds the nonbreaking_prefix.<language> files; it sits next
# to this script (FindBin gives the script's own directory).
my $mydir = join('', $FindBin::Bin, '/nonbreaking_prefixes');

# Prefix table filled in by load_prefixes (prefix => 1, or 2 for entries
# tagged #NUMERIC_ONLY#).
my %NONBREAKING_PREFIX;

# Language code passed to the most recent load_prefixes call.
my $LANGUAGE;
12
+
13
# Load the nonbreaking-prefix (abbreviation) list for one language.
#
# Argument: the language code; it is used as the suffix of the prefix file
#           name, i.e. "$mydir/nonbreaking_prefix.<code>".
# Returns:  a reference to the file-level %NONBREAKING_PREFIX hash. Keys are
#           the prefixes read from the file; the value is 2 when the line is
#           tagged "#NUMERIC_ONLY#" and 1 otherwise.
# Dies:     when neither the requested file nor the English fall-back exists,
#           or when the selected file cannot be opened.
#
# Note: %NONBREAKING_PREFIX is not cleared here, so entries accumulate when
# this sub is called more than once.
sub load_prefixes {
    $LANGUAGE = shift(@_);

    my $prefixfile = "$mydir/nonbreaking_prefix.$LANGUAGE";

    # Default back to English if we don't have a language-specific prefix file.
    if (!(-e $prefixfile)) {
        $prefixfile = "$mydir/nonbreaking_prefix.en";
        print STDERR "WARNING: No known abbreviations for language '$LANGUAGE', attempting fall-back to English version...\n";
        die ("ERROR: No abbreviations files found in $mydir\n") unless (-e $prefixfile);
    }

    # Fail loudly when the file exists but cannot be read; an unchecked open
    # would silently produce an empty prefix table.
    open(my $prefix_fh, "<:utf8", $prefixfile)
        or die ("ERROR: Cannot open abbreviations file $prefixfile: $!\n");

    # A lexical loop variable keeps the caller's $_ untouched.
    while (my $item = <$prefix_fh>) {
        # Remove the line ending, including the CR of CRLF-terminated files,
        # so it never becomes part of a hash key.
        $item =~ s/\r?\n?\z//;

        # Skip blank lines and comment lines. Compare against the empty
        # string rather than testing truthiness so a line "0" is kept.
        next if $item eq '';
        next if substr($item, 0, 1) eq "#";

        # Non-greedy capture: the whole whitespace run before the marker is
        # consumed by \s+, so the key carries no trailing blanks.
        if ($item =~ /\A(.*?)\s+\#NUMERIC_ONLY\#/) {
            $NONBREAKING_PREFIX{$1} = 2;
        } else {
            $NONBREAKING_PREFIX{$item} = 1;
        }
    }
    close($prefix_fh);

    return \%NONBREAKING_PREFIX;
}
42
+
43
+ 1;
@@ -0,0 +1,444 @@
1
+ Acad
2
+ Adj
3
+ Adm
4
+ Adv
5
+ Affl
6
+ Apr
7
+ Art
8
+ Asst
9
+ Av
10
+ Avg
11
+ B.ches-du-Rh
12
+ Bart
13
+ Bco
14
+ Bldg
15
+ Brig
16
+ Bros
17
+ C.a
18
+ C.p.c.n
19
+ Ca
20
+ Capt
21
+ Cdt
22
+ Cf
23
+ Ch.-Mme
24
+ Chap
25
+ Cie
26
+ Cmdr
27
+ Col
28
+ Comdr
29
+ Con
30
+ Corp
31
+ Cpl
32
+ DR
33
+ DRA
34
+ Da
35
+ Dec
36
+ Dep
37
+ Dn
38
+ Dr
39
+ Dra
40
+ Dras
41
+ Drs
42
+ Eng
43
+ Enga
44
+ Engas
45
+ Engos
46
+ Ens
47
+ Ets
48
+ Euro
49
+ Ev
50
+ Ex
51
+ Excmo
52
+ Exmo
53
+ Exo
54
+ Fa
55
+ Fco
56
+ Feb
57
+ Fig
58
+ Fr
59
+ Gar
60
+ Gen
61
+ Gir
62
+ Gl
63
+ Gov
64
+ Hno
65
+ Hon
66
+ Hosp
67
+ Hr
68
+ Ilmo
69
+ Insp
70
+ J.-C
71
+ Jan
72
+ Jeu
73
+ Jr
74
+ Jul
75
+ Jun
76
+ Lda
77
+ Lieut
78
+ Lt
79
+ Lun
80
+ MM
81
+ MR
82
+ MRS
83
+ MS
84
+ MSc
85
+ Maj
86
+ Mar
87
+ Me
88
+ Mej
89
+ Mer
90
+ Mes
91
+ Messrs
92
+ Mgr
93
+ Mgrs
94
+ Mll
95
+ Mlle
96
+ Mlle(s)
97
+ Mme
98
+ Mme(s)
99
+ Mr
100
+ Mrs
101
+ Ms
102
+ Msgr
103
+ Mw
104
+ Nov
105
+ Npr
106
+ Nr
107
+ O.d.J
108
+ Okt
109
+ Op
110
+ Ord
111
+ Oz
112
+ P
113
+ P.D
114
+ P.ej
115
+ P.p.c
116
+ Pas
117
+ Pfc
118
+ Ph
119
+ Prim
120
+ Prof
121
+ Pte
122
+ Pts
123
+ Pvt
124
+ Rep
125
+ Reps
126
+ Res
127
+ Rev
128
+ Revd
129
+ Rh
130
+ Riv
131
+ Rt
132
+ S.Em
133
+ S.Exc
134
+ S.a.r.l
135
+ Sen
136
+ Sens
137
+ Sep
138
+ Sept
139
+ Sfc
140
+ Sgt
141
+ SGT
142
+ Sl
143
+ Sr
144
+ Sra
145
+ Sras
146
+ Srs
147
+ Srta
148
+ St
149
+ ST
150
+ Sta
151
+ Ste
152
+ Sto
153
+ Supt
154
+ Surg
155
+ Tj
156
+ Tr
157
+ Ud
158
+ Uds
159
+ V.Exc
160
+ Vd
161
+ Vda
162
+ Vds
163
+ Vz
164
+ Z.D
165
+ Z.D.H
166
+ Z.E
167
+ Z.Em
168
+ Z.H
169
+ Z.K.H
170
+ Z.K.M
171
+ Z.M
172
+ a
173
+ a./s
174
+ a.C
175
+ a.g.v
176
+ a.l
177
+ abrev
178
+ abs
179
+ ac
180
+ acc
181
+ acron
182
+ adj
183
+ adm
184
+ adr
185
+ adv
186
+ alt
187
+ anal
188
+ anat
189
+ angl
190
+ appos
191
+ apr
192
+ apr
193
+ asc
194
+ atm
195
+ auj
196
+ aux
197
+ av
198
+ avg
199
+ avr
200
+ b
201
+ b.a.o
202
+ b.a.p
203
+ b.a.r
204
+ bacc
205
+ bat
206
+ bc
207
+ bd
208
+ bde
209
+ bgen
210
+ bijv
211
+ bijz
212
+ br
213
+ bv
214
+ c
215
+ c.-a-d
216
+ c.a.f
217
+ c.i
218
+ cc
219
+ cf
220
+ cft
221
+ ch
222
+ ch.-l
223
+ chbre
224
+ chbs
225
+ chf
226
+ col
227
+ coll
228
+ cpl
229
+ cpt
230
+ cpte
231
+ cta
232
+ d
233
+ d.c
234
+ d.w.z
235
+ dcha
236
+ dec
237
+ def
238
+ dem
239
+ dep
240
+ dept
241
+ dhr
242
+ dipl
243
+ dispo
244
+ div
245
+ dpto
246
+ dr
247
+ dr.h.c
248
+ dra
249
+ dras
250
+ drs
251
+ ds
252
+ dz
253
+ e.c
254
+ e.g
255
+ e.g
256
+ e.k
257
+ eccles
258
+ ecol
259
+ econ
260
+ ed
261
+ ej
262
+ env
263
+ ep
264
+ eq
265
+ et
266
+ etc
267
+ ev
268
+ ex
269
+ exmo
270
+ exo
271
+ exp
272
+ expo
273
+ f.a.c
274
+ fa
275
+ fam
276
+ fasc
277
+ fbg
278
+ feb
279
+ fem
280
+ fevr
281
+ ff
282
+ fl
283
+ fol
284
+ fr
285
+ fs
286
+ fut
287
+ gd
288
+ gde
289
+ gdes
290
+ gds
291
+ gen
292
+ gl
293
+ grd
294
+ h.-t
295
+ hab
296
+ i.e
297
+ i.p.v
298
+ i.s.m
299
+ i.t.t
300
+ i.v.m
301
+ ibid
302
+ id
303
+ imp
304
+ ing
305
+ ir
306
+ iron
307
+ itd
308
+ itn
309
+ itp
310
+ izq
311
+ j
312
+ janv
313
+ jhr
314
+ jkvr
315
+ jr
316
+ l
317
+ lat
318
+ lex
319
+ lgen
320
+ lib
321
+ lieut
322
+ liv
323
+ lkol
324
+ loc
325
+ lof
326
+ m
327
+ m.a.w
328
+ m.b.t
329
+ m.b.v
330
+ m.h.o
331
+ m.i
332
+ m.i.v
333
+ maj
334
+ mar
335
+ mas
336
+ max
337
+ med
338
+ mevr
339
+ min
340
+ mll
341
+ mr
342
+ ms
343
+ mtr
344
+ mtrs
345
+ n
346
+ n
347
+ n.f
348
+ n.f.pl
349
+ n.m
350
+ n.m.pl
351
+ npr
352
+ o
353
+ o.b.s
354
+ obs
355
+ oct
356
+ okt
357
+ ord
358
+ oz
359
+ p
360
+ p
361
+ p.a
362
+ p.ej
363
+ p.ex
364
+ p.g.c.d
365
+ p.i
366
+ p.j
367
+ p.m
368
+ p.o
369
+ p.p
370
+ p.p.c.d
371
+ p.p.c.m
372
+ p.pa
373
+ p.pr
374
+ pl
375
+ plv
376
+ poe
377
+ pp
378
+ pp
379
+ pr
380
+ pr
381
+ pres
382
+ prev
383
+ prof
384
+ px
385
+ q.s
386
+ qqch
387
+ qqf
388
+ qqn
389
+ qqns
390
+ r.-de-ch
391
+ r.p.m
392
+ rc
393
+ rd
394
+ ref
395
+ refl
396
+ reg
397
+ rev
398
+ ro
399
+ rte
400
+ s
401
+ s
402
+ s.a
403
+ s.b.f
404
+ s.d
405
+ s.e
406
+ s.l
407
+ s.l.n.d
408
+ s.l.p
409
+ s.t.p
410
+ s.v.p
411
+ s/c
412
+ sc
413
+ sf
414
+ sgt
415
+ sl
416
+ sr
417
+ sra
418
+ sras
419
+ srs
420
+ ss
421
+ sto
422
+ t
423
+ t.s.v.p
424
+ tec
425
+ tel
426
+ terr
427
+ tg
428
+ tint
429
+ tit
430
+ tj
431
+ tr
432
+ travx
433
+ v
434
+ v.intr
435
+ v.tr
436
+ v.w.t
437
+ var
438
+ vs
439
+ vta
440
+ vx
441
+ z.v
442
+ zool
443
+ �t
444
+ �t