opener-tokenizer-base 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (44) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +148 -0
  3. data/bin/tokenizer-base +5 -0
  4. data/bin/tokenizer-de +5 -0
  5. data/bin/tokenizer-en +5 -0
  6. data/bin/tokenizer-es +5 -0
  7. data/bin/tokenizer-fr +5 -0
  8. data/bin/tokenizer-it +5 -0
  9. data/bin/tokenizer-nl +5 -0
  10. data/core/lib/Data/OptList.pm +256 -0
  11. data/core/lib/Params/Util.pm +866 -0
  12. data/core/lib/Sub/Exporter.pm +1101 -0
  13. data/core/lib/Sub/Exporter/Cookbook.pod +309 -0
  14. data/core/lib/Sub/Exporter/Tutorial.pod +280 -0
  15. data/core/lib/Sub/Exporter/Util.pm +354 -0
  16. data/core/lib/Sub/Install.pm +329 -0
  17. data/core/lib/Time/Stamp.pm +808 -0
  18. data/core/load-prefixes.pl +43 -0
  19. data/core/nonbreaking_prefixes/abbreviation_list.kaf +0 -0
  20. data/core/nonbreaking_prefixes/abbreviation_list.txt +444 -0
  21. data/core/nonbreaking_prefixes/nonbreaking_prefix.ca +533 -0
  22. data/core/nonbreaking_prefixes/nonbreaking_prefix.de +781 -0
  23. data/core/nonbreaking_prefixes/nonbreaking_prefix.el +448 -0
  24. data/core/nonbreaking_prefixes/nonbreaking_prefix.en +564 -0
  25. data/core/nonbreaking_prefixes/nonbreaking_prefix.es +758 -0
  26. data/core/nonbreaking_prefixes/nonbreaking_prefix.fr +1027 -0
  27. data/core/nonbreaking_prefixes/nonbreaking_prefix.is +697 -0
  28. data/core/nonbreaking_prefixes/nonbreaking_prefix.it +641 -0
  29. data/core/nonbreaking_prefixes/nonbreaking_prefix.nl +739 -0
  30. data/core/nonbreaking_prefixes/nonbreaking_prefix.pl +729 -0
  31. data/core/nonbreaking_prefixes/nonbreaking_prefix.pt +656 -0
  32. data/core/nonbreaking_prefixes/nonbreaking_prefix.ro +484 -0
  33. data/core/nonbreaking_prefixes/nonbreaking_prefix.ru +705 -0
  34. data/core/nonbreaking_prefixes/nonbreaking_prefix.sk +920 -0
  35. data/core/nonbreaking_prefixes/nonbreaking_prefix.sl +524 -0
  36. data/core/nonbreaking_prefixes/nonbreaking_prefix.sv +492 -0
  37. data/core/split-sentences.pl +114 -0
  38. data/core/text-fixer.pl +169 -0
  39. data/core/tokenizer-cli.pl +363 -0
  40. data/core/tokenizer.pl +145 -0
  41. data/lib/opener/tokenizers/base.rb +84 -0
  42. data/lib/opener/tokenizers/base/version.rb +8 -0
  43. data/opener-tokenizer-base.gemspec +25 -0
  44. metadata +134 -0
@@ -0,0 +1,43 @@
1
+ #!/usr/bin/perl -w
2
+
3
+ # Moses nonbreaking prefixes loader
4
+ # changed by Andoni Azpeitia (#2013/12/17)
5
+
6
+ use FindBin;
7
+ use utf8;
8
+
9
# Directory that holds the per-language "nonbreaking_prefix.<lang>" files;
# it is looked up next to this script.
my $mydir = "$FindBin::Bin"."/nonbreaking_prefixes";

# Prefix table filled by load_prefixes(). It is file-level state that is never
# cleared here, so entries from successive calls accumulate.
my %NONBREAKING_PREFIX = ();

# Language code passed to the most recent load_prefixes() call.
my $LANGUAGE;

# load_prefixes($language)
#
# Reads "$mydir/nonbreaking_prefix.$language" and records every entry in
# %NONBREAKING_PREFIX:
#   2  for an entry followed by the "#NUMERIC_ONLY#" marker,
#   1  for every other entry.
# Blank lines and lines starting with "#" are skipped. Surrounding whitespace
# (including the "\r" of CRLF files) is not part of the stored key.
#
# When no file exists for $language, a warning is printed on STDERR and the
# English file is used instead.
#
# Returns: a reference to %NONBREAKING_PREFIX.
# Dies:    when neither the language file nor the English file exists, or when
#          the selected file cannot be opened.
sub load_prefixes {
    $LANGUAGE = shift(@_);

    my $prefixfile = "$mydir/nonbreaking_prefix.$LANGUAGE";

    # default back to English if we don't have a language-specific prefix file
    if (!(-e $prefixfile)) {
        $prefixfile = "$mydir/nonbreaking_prefix.en";
        print STDERR "WARNING: No known abbreviations for language '$LANGUAGE', attempting fall-back to English version...\n";
        die ("ERROR: No abbreviations files found in $mydir\n") unless (-e $prefixfile);
    }

    # Lexical handle plus a checked open: a file that exists but cannot be
    # read must not silently produce an empty prefix table.
    open(my $prefix_fh, "<:utf8", $prefixfile)
        or die "ERROR: Cannot open abbreviations file $prefixfile: $!\n";

    while (my $item = <$prefix_fh>) {
        # Trim both ends; chomp alone would leave "\r" and trailing blanks
        # inside the hash key, and such a key can never match a token.
        $item =~ s/\A\s+//;
        $item =~ s/\s+\z//;

        # Compare against the empty string rather than testing truthiness,
        # so that an entry consisting of "0" is not dropped.
        next if $item eq '';
        next if substr($item, 0, 1) eq "#";

        # Non-greedy capture: the whole whitespace run before the marker is
        # consumed by \s+ instead of leaking into the key.
        if ($item =~ /\A(.*?)\s+\#NUMERIC_ONLY\#/) {
            $NONBREAKING_PREFIX{$1} = 2;
        }
        else {
            $NONBREAKING_PREFIX{$item} = 1;
        }
    }
    close($prefix_fh);

    return \%NONBREAKING_PREFIX;
}
42
+
43
+ 1;
@@ -0,0 +1,444 @@
1
+ Acad
2
+ Adj
3
+ Adm
4
+ Adv
5
+ Affl
6
+ Apr
7
+ Art
8
+ Asst
9
+ Av
10
+ Avg
11
+ B.ches-du-Rh
12
+ Bart
13
+ Bco
14
+ Bldg
15
+ Brig
16
+ Bros
17
+ C.a
18
+ C.p.c.n
19
+ Ca
20
+ Capt
21
+ Cdt
22
+ Cf
23
+ Ch.-Mme
24
+ Chap
25
+ Cie
26
+ Cmdr
27
+ Col
28
+ Comdr
29
+ Con
30
+ Corp
31
+ Cpl
32
+ DR
33
+ DRA
34
+ Da
35
+ Dec
36
+ Dep
37
+ Dn
38
+ Dr
39
+ Dra
40
+ Dras
41
+ Drs
42
+ Eng
43
+ Enga
44
+ Engas
45
+ Engos
46
+ Ens
47
+ Ets
48
+ Euro
49
+ Ev
50
+ Ex
51
+ Excmo
52
+ Exmo
53
+ Exo
54
+ Fa
55
+ Fco
56
+ Feb
57
+ Fig
58
+ Fr
59
+ Gar
60
+ Gen
61
+ Gir
62
+ Gl
63
+ Gov
64
+ Hno
65
+ Hon
66
+ Hosp
67
+ Hr
68
+ Ilmo
69
+ Insp
70
+ J.-C
71
+ Jan
72
+ Jeu
73
+ Jr
74
+ Jul
75
+ Jun
76
+ Lda
77
+ Lieut
78
+ Lt
79
+ Lun
80
+ MM
81
+ MR
82
+ MRS
83
+ MS
84
+ MSc
85
+ Maj
86
+ Mar
87
+ Me
88
+ Mej
89
+ Mer
90
+ Mes
91
+ Messrs
92
+ Mgr
93
+ Mgrs
94
+ Mll
95
+ Mlle
96
+ Mlle(s)
97
+ Mme
98
+ Mme(s)
99
+ Mr
100
+ Mrs
101
+ Ms
102
+ Msgr
103
+ Mw
104
+ Nov
105
+ Npr
106
+ Nr
107
+ O.d.J
108
+ Okt
109
+ Op
110
+ Ord
111
+ Oz
112
+ P
113
+ P.D
114
+ P.ej
115
+ P.p.c
116
+ Pas
117
+ Pfc
118
+ Ph
119
+ Prim
120
+ Prof
121
+ Pte
122
+ Pts
123
+ Pvt
124
+ Rep
125
+ Reps
126
+ Res
127
+ Rev
128
+ Revd
129
+ Rh
130
+ Riv
131
+ Rt
132
+ S.Em
133
+ S.Exc
134
+ S.a.r.l
135
+ Sen
136
+ Sens
137
+ Sep
138
+ Sept
139
+ Sfc
140
+ Sgt
141
+ SGT
142
+ Sl
143
+ Sr
144
+ Sra
145
+ Sras
146
+ Srs
147
+ Srta
148
+ St
149
+ ST
150
+ Sta
151
+ Ste
152
+ Sto
153
+ Supt
154
+ Surg
155
+ Tj
156
+ Tr
157
+ Ud
158
+ Uds
159
+ V.Exc
160
+ Vd
161
+ Vda
162
+ Vds
163
+ Vz
164
+ Z.D
165
+ Z.D.H
166
+ Z.E
167
+ Z.Em
168
+ Z.H
169
+ Z.K.H
170
+ Z.K.M
171
+ Z.M
172
+ a
173
+ a./s
174
+ a.C
175
+ a.g.v
176
+ a.l
177
+ abrev
178
+ abs
179
+ ac
180
+ acc
181
+ acron
182
+ adj
183
+ adm
184
+ adr
185
+ adv
186
+ alt
187
+ anal
188
+ anat
189
+ angl
190
+ appos
191
+ apr
192
+ apr
193
+ asc
194
+ atm
195
+ auj
196
+ aux
197
+ av
198
+ avg
199
+ avr
200
+ b
201
+ b.a.o
202
+ b.a.p
203
+ b.a.r
204
+ bacc
205
+ bat
206
+ bc
207
+ bd
208
+ bde
209
+ bgen
210
+ bijv
211
+ bijz
212
+ br
213
+ bv
214
+ c
215
+ c.-a-d
216
+ c.a.f
217
+ c.i
218
+ cc
219
+ cf
220
+ cft
221
+ ch
222
+ ch.-l
223
+ chbre
224
+ chbs
225
+ chf
226
+ col
227
+ coll
228
+ cpl
229
+ cpt
230
+ cpte
231
+ cta
232
+ d
233
+ d.c
234
+ d.w.z
235
+ dcha
236
+ dec
237
+ def
238
+ dem
239
+ dep
240
+ dept
241
+ dhr
242
+ dipl
243
+ dispo
244
+ div
245
+ dpto
246
+ dr
247
+ dr.h.c
248
+ dra
249
+ dras
250
+ drs
251
+ ds
252
+ dz
253
+ e.c
254
+ e.g
255
+ e.g
256
+ e.k
257
+ eccles
258
+ ecol
259
+ econ
260
+ ed
261
+ ej
262
+ env
263
+ ep
264
+ eq
265
+ et
266
+ etc
267
+ ev
268
+ ex
269
+ exmo
270
+ exo
271
+ exp
272
+ expo
273
+ f.a.c
274
+ fa
275
+ fam
276
+ fasc
277
+ fbg
278
+ feb
279
+ fem
280
+ fevr
281
+ ff
282
+ fl
283
+ fol
284
+ fr
285
+ fs
286
+ fut
287
+ gd
288
+ gde
289
+ gdes
290
+ gds
291
+ gen
292
+ gl
293
+ grd
294
+ h.-t
295
+ hab
296
+ i.e
297
+ i.p.v
298
+ i.s.m
299
+ i.t.t
300
+ i.v.m
301
+ ibid
302
+ id
303
+ imp
304
+ ing
305
+ ir
306
+ iron
307
+ itd
308
+ itn
309
+ itp
310
+ izq
311
+ j
312
+ janv
313
+ jhr
314
+ jkvr
315
+ jr
316
+ l
317
+ lat
318
+ lex
319
+ lgen
320
+ lib
321
+ lieut
322
+ liv
323
+ lkol
324
+ loc
325
+ lof
326
+ m
327
+ m.a.w
328
+ m.b.t
329
+ m.b.v
330
+ m.h.o
331
+ m.i
332
+ m.i.v
333
+ maj
334
+ mar
335
+ mas
336
+ max
337
+ med
338
+ mevr
339
+ min
340
+ mll
341
+ mr
342
+ ms
343
+ mtr
344
+ mtrs
345
+ n
346
+ n
347
+ n.f
348
+ n.f.pl
349
+ n.m
350
+ n.m.pl
351
+ npr
352
+ o
353
+ o.b.s
354
+ obs
355
+ oct
356
+ okt
357
+ ord
358
+ oz
359
+ p
360
+ p
361
+ p.a
362
+ p.ej
363
+ p.ex
364
+ p.g.c.d
365
+ p.i
366
+ p.j
367
+ p.m
368
+ p.o
369
+ p.p
370
+ p.p.c.d
371
+ p.p.c.m
372
+ p.pa
373
+ p.pr
374
+ pl
375
+ plv
376
+ poe
377
+ pp
378
+ pp
379
+ pr
380
+ pr
381
+ pres
382
+ prev
383
+ prof
384
+ px
385
+ q.s
386
+ qqch
387
+ qqf
388
+ qqn
389
+ qqns
390
+ r.-de-ch
391
+ r.p.m
392
+ rc
393
+ rd
394
+ ref
395
+ refl
396
+ reg
397
+ rev
398
+ ro
399
+ rte
400
+ s
401
+ s
402
+ s.a
403
+ s.b.f
404
+ s.d
405
+ s.e
406
+ s.l
407
+ s.l.n.d
408
+ s.l.p
409
+ s.t.p
410
+ s.v.p
411
+ s/c
412
+ sc
413
+ sf
414
+ sgt
415
+ sl
416
+ sr
417
+ sra
418
+ sras
419
+ srs
420
+ ss
421
+ sto
422
+ t
423
+ t.s.v.p
424
+ tec
425
+ tel
426
+ terr
427
+ tg
428
+ tint
429
+ tit
430
+ tj
431
+ tr
432
+ travx
433
+ v
434
+ v.intr
435
+ v.tr
436
+ v.w.t
437
+ var
438
+ vs
439
+ vta
440
+ vx
441
+ z.v
442
+ zool
443
+ �t
444
+ �t