scylla 0.4.3 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +1 -0
- data/Gemfile.lock +10 -0
- data/VERSION +1 -1
- data/lib/scylla/generator.rb +1 -1
- data/lib/scylla/lms/13375P33K.lm +156 -156
- data/lib/scylla/lms/arabic.lm +133 -133
- data/lib/scylla/lms/bulgarian.lm +122 -122
- data/lib/scylla/lms/catalan.lm +151 -151
- data/lib/scylla/lms/danish.lm +137 -137
- data/lib/scylla/lms/english.lm +207 -207
- data/lib/scylla/lms/french.lm +400 -400
- data/lib/scylla/lms/japanese.lm +400 -400
- data/lib/scylla/lms/korean.lm +233 -233
- data/lib/scylla/lms/norwegian.lm +398 -398
- data/lib/scylla/lms/spanish.lm +98 -98
- data/lib/scylla/lms/swedish.lm +123 -123
- data/lib/scylla/lms/tagalog.lm +223 -223
- data/lib/scylla/lms/welsh.lm +234 -234
- data/lib/scylla/resources.rb +10 -10
- data/scylla.gemspec +17 -40
- data/source_texts/catalan.txt +28 -28
- data/source_texts/danish.txt +62 -62
- data/source_texts/english.txt +10 -10
- data/source_texts/french.txt +470 -77
- data/source_texts/japanese.txt +453 -199
- data/source_texts/norwegian.txt +96 -63
- data/source_texts/spanish.txt +269 -269
- data/test/classifier_test.rb +2 -2
- data/test/fixtures/lms/13375p33k.lm +156 -156
- data/test/fixtures/lms/danish.lm +137 -137
- data/test/fixtures/lms/english.lm +207 -207
- data/test/fixtures/lms/french.lm +400 -400
- data/test/fixtures/lms/hindi.lm +400 -0
- data/test/fixtures/lms/italian.lm +400 -0
- data/test/fixtures/lms/japanese.lm +400 -400
- data/test/fixtures/lms/norwegian.lm +400 -0
- data/test/fixtures/lms/spanish.lm +98 -98
- data/test/fixtures/source_texts/danish.txt +62 -62
- data/test/fixtures/source_texts/english.txt +10 -10
- data/test/fixtures/source_texts/french.txt +470 -77
- data/test/fixtures/source_texts/hindi.txt +199 -0
- data/test/fixtures/source_texts/italian.txt +120 -0
- data/test/fixtures/source_texts/japanese.txt +453 -199
- data/test/fixtures/source_texts/norwegian.txt +190 -0
- data/test/fixtures/source_texts/spanish.txt +269 -269
- data/test/fixtures/test_languages/english +61 -0
- data/test/fixtures/test_languages/french +0 -0
- data/test/fixtures/test_languages/german +29 -0
- data/test/fixtures/test_languages/hindi +3 -0
- data/test/fixtures/test_languages/italian +6 -0
- data/test/fixtures/test_languages/japanese +79 -0
- data/test/fixtures/test_languages/norwegian +14 -0
- data/test/fixtures/test_languages/spanish +22 -0
- data/test/generator_test.rb +0 -1
- data/test/language_test.rb +28 -0
- metadata +20 -43
- data/lib/scylla/lms/esperanto.lm +0 -400
- data/lib/scylla/lms/hungarian.lm +0 -400
- data/lib/scylla/lms/irish.lm +0 -400
- data/lib/scylla/lms/kannada.lm +0 -400
- data/lib/scylla/lms/latin.lm +0 -400
- data/lib/scylla/lms/malay.lm +0 -400
- data/lib/scylla/lms/marathi.lm +0 -400
- data/lib/scylla/lms/mingo.lm +0 -400
- data/lib/scylla/lms/nepali.lm +0 -400
- data/lib/scylla/lms/quechua.lm +0 -400
- data/lib/scylla/lms/rumantsch.lm +0 -400
- data/lib/scylla/lms/sanskrit.lm +0 -400
- data/lib/scylla/lms/scots_gaelic.lm +0 -400
- data/lib/scylla/lms/serbian.lm +0 -400
- data/lib/scylla/lms/swahili.lm +0 -400
- data/lib/scylla/lms/tamil.lm +0 -400
- data/lib/scylla/lms/ukrainian.lm +0 -400
- data/lib/scylla/lms/yiddish.lm +0 -400
- data/source_texts/esperanto.txt +0 -199
- data/source_texts/hungarian.txt +0 -102
- data/source_texts/irish.txt +0 -209
- data/source_texts/kannada.txt +0 -283
- data/source_texts/latin.txt +0 -120
- data/source_texts/malay.txt +0 -108
- data/source_texts/marathi.txt +0 -100
- data/source_texts/mingo.txt +0 -146
- data/source_texts/nepali.txt +0 -131
- data/source_texts/quechua.txt +0 -108
- data/source_texts/rumantsch.txt +0 -110
- data/source_texts/sanskrit.txt +0 -135
- data/source_texts/scots_gaelic.txt +0 -93
- data/source_texts/serbian.txt +0 -121
- data/source_texts/swahili.txt +0 -120
- data/source_texts/tamil.txt +0 -167
- data/source_texts/ukrainian.txt +0 -214
- data/source_texts/yiddish-utf.txt +0 -83
- data/test/fixtures/lms/kannada.lm +0 -400
- data/test/fixtures/source_texts/kannada.txt +0 -283
data/test/fixtures/lms/danish.lm
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
_
|
1
|
+
_ 16386
|
2
2
|
e 5759
|
3
3
|
r 3334
|
4
4
|
n 3061
|
@@ -10,26 +10,26 @@ s 2224
|
|
10
10
|
o 1932
|
11
11
|
l 1884
|
12
12
|
g 1617
|
13
|
-
__ 1390
|
14
13
|
k 1278
|
15
14
|
m 1273
|
16
15
|
er 1209
|
17
|
-
e_
|
16
|
+
e_ 1175
|
17
|
+
__ 1083
|
18
18
|
de 1045
|
19
19
|
en 993
|
20
20
|
� 940
|
21
21
|
f 939
|
22
|
-
r_
|
22
|
+
r_ 828
|
23
23
|
v 770
|
24
|
-
t_
|
24
|
+
t_ 725
|
25
25
|
an 724
|
26
26
|
n_ 710
|
27
27
|
u 605
|
28
28
|
nd 598
|
29
29
|
b 585
|
30
30
|
et 574
|
31
|
+
_s 546
|
31
32
|
. 546
|
32
|
-
_s 545
|
33
33
|
re 543
|
34
34
|
te 540
|
35
35
|
st 530
|
@@ -39,9 +39,9 @@ g_ 519
|
|
39
39
|
_o 501
|
40
40
|
_d 484
|
41
41
|
, 480
|
42
|
-
|
42
|
+
,_ 478
|
43
43
|
h 474
|
44
|
-
|
44
|
+
ge 474
|
45
45
|
_a 462
|
46
46
|
in 440
|
47
47
|
p 436
|
@@ -50,11 +50,12 @@ _f 423
|
|
50
50
|
og 415
|
51
51
|
or 411
|
52
52
|
ti 406
|
53
|
-
|
53
|
+
._ 405
|
54
|
+
et_ 396
|
54
55
|
_e 390
|
55
56
|
ed 381
|
56
|
-
_i 378
|
57
57
|
_m 378
|
58
|
+
_i 378
|
58
59
|
sk 365
|
59
60
|
ne 365
|
60
61
|
le 353
|
@@ -64,15 +65,14 @@ el 342
|
|
64
65
|
og_ 338
|
65
66
|
� 332
|
66
67
|
ø 332
|
67
|
-
d_
|
68
|
+
d_ 326
|
68
69
|
me 324
|
69
70
|
ng 317
|
70
71
|
_de 314
|
71
72
|
_og_ 313
|
72
|
-
|
73
|
+
å 293
|
73
74
|
ig 293
|
74
75
|
� 293
|
75
|
-
å 293
|
76
76
|
_b 288
|
77
77
|
� 286
|
78
78
|
æ 286
|
@@ -83,8 +83,8 @@ ri 273
|
|
83
83
|
s_ 271
|
84
84
|
D 268
|
85
85
|
nde 263
|
86
|
-
li 262
|
87
86
|
om 262
|
87
|
+
li 262
|
88
88
|
ma 259
|
89
89
|
ve 257
|
90
90
|
y 256
|
@@ -95,29 +95,25 @@ at 245
|
|
95
95
|
il 244
|
96
96
|
and 241
|
97
97
|
es 240
|
98
|
-
al 234
|
99
98
|
be 234
|
99
|
+
al 234
|
100
100
|
is 233
|
101
101
|
fo 232
|
102
102
|
se 232
|
103
103
|
ns 229
|
104
104
|
la 224
|
105
|
-
_D
|
106
|
-
[ 221
|
107
|
-
] 221
|
105
|
+
_D 223
|
108
106
|
on 221
|
109
107
|
rk 219
|
110
108
|
_af 217
|
111
|
-
[_ 217
|
112
|
-
_] 217
|
113
109
|
den 216
|
114
110
|
der 214
|
115
111
|
_me 210
|
116
112
|
_k 210
|
117
|
-
m_
|
113
|
+
m_ 209
|
118
114
|
ing 207
|
119
115
|
_v 203
|
120
|
-
k_
|
116
|
+
k_ 195
|
121
117
|
ra 191
|
122
118
|
f_ 188
|
123
119
|
af_ 186
|
@@ -125,182 +121,178 @@ for 184
|
|
125
121
|
_p 181
|
126
122
|
_af_ 180
|
127
123
|
ol 174
|
124
|
+
ere 172
|
128
125
|
_fo 172
|
129
126
|
ark 172
|
130
|
-
ere 172
|
131
|
-
ste 169
|
132
127
|
lan 169
|
128
|
+
ste 169
|
133
129
|
te_ 168
|
134
|
-
.[ 168
|
135
|
-
.[_ 165
|
136
|
-
_l 165
|
137
130
|
mar 165
|
138
|
-
|
131
|
+
_l 165
|
132
|
+
l_ 164
|
139
133
|
ll 162
|
140
134
|
ter 161
|
141
135
|
j 159
|
142
136
|
ske 159
|
143
137
|
om_ 155
|
138
|
+
land 153
|
139
|
+
mark 153
|
144
140
|
Da 153
|
141
|
+
ke_ 153
|
145
142
|
den_ 153
|
146
143
|
ha 153
|
147
|
-
mark 153
|
148
|
-
ke_ 153
|
149
|
-
land 153
|
150
|
-
_st 151
|
151
144
|
ni 151
|
152
145
|
ed_ 151
|
146
|
+
_st 151
|
153
147
|
_for 149
|
154
148
|
so 149
|
155
|
-
ta 148
|
156
149
|
Dan 148
|
150
|
+
ta 148
|
157
151
|
ger 147
|
158
152
|
nge 144
|
159
153
|
det 143
|
160
154
|
re_ 140
|
161
155
|
ede 139
|
162
|
-
vi 138
|
163
|
-
nm 138
|
164
156
|
nma 138
|
157
|
+
nm 138
|
158
|
+
vi 138
|
165
159
|
nmark 137
|
166
160
|
_en 137
|
167
161
|
nmar 137
|
168
|
-
anmar 136
|
169
|
-
anma 136
|
170
162
|
anm 136
|
163
|
+
anma 136
|
164
|
+
anmar 136
|
171
165
|
ev 135
|
172
166
|
rs 135
|
173
|
-
un 133
|
174
|
-
Danma 133
|
175
|
-
_Da 133
|
176
|
-
]_ 133
|
177
|
-
S 133
|
178
167
|
der_ 133
|
168
|
+
Danma 133
|
179
169
|
Danm 133
|
180
|
-
|
170
|
+
S 133
|
171
|
+
_Da 133
|
172
|
+
un 133
|
181
173
|
ans 132
|
182
174
|
_er 131
|
183
|
-
med 130
|
184
175
|
da 130
|
176
|
+
med 130
|
185
177
|
_Dan 129
|
186
|
-
io 127
|
187
178
|
ro 127
|
179
|
+
io 127
|
188
180
|
til 126
|
189
|
-
ik 125
|
190
181
|
som 125
|
191
182
|
_er_ 125
|
192
|
-
|
183
|
+
ik 125
|
193
184
|
rn 124
|
185
|
+
_ti 124
|
194
186
|
ds 123
|
195
|
-
em 122
|
196
187
|
_u 122
|
197
|
-
|
188
|
+
em 122
|
198
189
|
eg 121
|
190
|
+
�_ 121
|
199
191
|
å_ 121
|
200
192
|
_ha 120
|
201
193
|
_Danm 120
|
202
194
|
rt 120
|
203
|
-
_med 119
|
204
195
|
ld 119
|
196
|
+
_med 119
|
197
|
+
_r 118
|
205
198
|
som_ 118
|
206
199
|
to 117
|
207
200
|
_so 116
|
208
|
-
ske_ 116
|
209
201
|
_g 116
|
202
|
+
ske_ 116
|
210
203
|
det_ 115
|
211
|
-
_r 115
|
212
|
-
tr 114
|
213
204
|
ern 114
|
214
|
-
ar_ 114
|
215
205
|
end 114
|
206
|
+
ar_ 114
|
207
|
+
tr 114
|
216
208
|
_som 113
|
217
|
-
ud 111
|
218
|
-
ko 111
|
219
209
|
id 111
|
220
|
-
|
210
|
+
ko 111
|
211
|
+
ud 111
|
221
212
|
_som_ 110
|
213
|
+
del 110
|
222
214
|
_til 109
|
223
|
-
lig 108
|
224
|
-
nsk 108
|
225
215
|
si 108
|
226
|
-
|
216
|
+
lig 108
|
227
217
|
va 108
|
218
|
+
mi 108
|
219
|
+
nsk 108
|
228
220
|
ls 107
|
229
|
-
|
230
|
-
|
221
|
+
_be 105
|
222
|
+
�r 105
|
231
223
|
bl 105
|
232
224
|
ka 105
|
233
|
-
|
234
|
-
_be 105
|
225
|
+
ion 105
|
235
226
|
ind 105
|
227
|
+
ør 105
|
236
228
|
gs 105
|
237
229
|
lle 104
|
238
230
|
_da 104
|
239
231
|
t� 103
|
240
|
-
_S
|
232
|
+
_S 103
|
241
233
|
ne_ 102
|
242
234
|
med_ 102
|
243
|
-
dt 101
|
244
|
-
tt 101
|
245
235
|
ag 101
|
246
236
|
_en_ 101
|
237
|
+
tt 101
|
238
|
+
dt 101
|
247
239
|
r� 100
|
248
|
-
c 100
|
249
240
|
_� 100
|
241
|
+
c 100
|
250
242
|
ansk 99
|
251
|
-
ie 99
|
252
243
|
nt 99
|
253
244
|
dan 99
|
245
|
+
ie 99
|
254
246
|
_med_ 98
|
255
247
|
or_ 97
|
256
|
-
ær 95
|
257
|
-
il_ 95
|
258
|
-
- 95
|
259
248
|
�r 95
|
249
|
+
- 95
|
260
250
|
De 95
|
261
|
-
|
262
|
-
|
251
|
+
il_ 95
|
252
|
+
ær 95
|
253
|
+
na 94
|
263
254
|
nin 94
|
255
|
+
rne 94
|
264
256
|
ning 94
|
265
257
|
ner 94
|
266
|
-
|
267
|
-
at_ 92
|
268
|
-
til_ 92
|
269
|
-
fr 92
|
258
|
+
lt 94
|
270
259
|
I 92
|
271
|
-
|
260
|
+
fr 92
|
261
|
+
til_ 92
|
262
|
+
at_ 92
|
272
263
|
op 91
|
273
|
-
|
264
|
+
ru 91
|
265
|
+
_dan 89
|
274
266
|
rd 89
|
267
|
+
ige 89
|
275
268
|
_bl 89
|
276
|
-
|
269
|
+
erne 89
|
277
270
|
ge_ 89
|
278
|
-
ige 89
|
279
|
-
gt 88
|
280
|
-
v_ 88
|
281
271
|
ng_ 88
|
272
|
+
v_ 88
|
273
|
+
gt 88
|
274
|
+
kr 87
|
282
275
|
tte 87
|
283
276
|
a_ 87
|
284
|
-
|
277
|
+
_re 87
|
285
278
|
inge 87
|
286
|
-
kr 87
|
287
279
|
_den 87
|
280
|
+
p� 87
|
288
281
|
dans 86
|
282
|
+
men 86
|
289
283
|
s� 86
|
290
284
|
dansk 86
|
291
|
-
men 86
|
292
|
-
ver 85
|
293
|
-
_til_ 85
|
294
|
-
isk 85
|
295
285
|
it 85
|
296
|
-
|
297
|
-
|
286
|
+
isk 85
|
287
|
+
_til_ 85
|
288
|
+
ver 85
|
289
|
+
am 84
|
298
290
|
f� 84
|
299
291
|
_dans 84
|
300
|
-
|
301
|
-
|
302
|
-
est 83
|
292
|
+
_at 84
|
293
|
+
els 84
|
303
294
|
es_ 83
|
295
|
+
est 83
|
304
296
|
ur 82
|
305
297
|
gen 82
|
306
298
|
_den_ 82
|
@@ -308,93 +300,101 @@ he 81
|
|
308
300
|
_ud 81
|
309
301
|
_at_ 81
|
310
302
|
_n 81
|
311
|
-
ble 80
|
312
303
|
ene 80
|
304
|
+
ble 80
|
305
|
+
sa 79
|
313
306
|
od 79
|
314
307
|
und 79
|
315
|
-
sa 79
|
316
|
-
ede_ 79
|
317
308
|
_. 79
|
309
|
+
ede_ 79
|
318
310
|
ande 79
|
311
|
+
eri 78
|
312
|
+
nde_ 78
|
319
313
|
_in 78
|
320
314
|
_la 78
|
321
|
-
nde_ 78
|
322
|
-
eri 78
|
323
|
-
ende 77
|
324
315
|
ov 77
|
325
|
-
|
326
|
-
rk_ 76
|
316
|
+
ende 77
|
327
317
|
_I 76
|
328
318
|
r. 76
|
319
|
+
_fr 76
|
329
320
|
tor 76
|
330
321
|
av 75
|
331
|
-
lk 75
|
332
322
|
lev 75
|
333
|
-
|
323
|
+
lk 75
|
334
324
|
sk_ 75
|
335
|
-
|
325
|
+
rk_ 75
|
326
|
+
ing_ 75
|
336
327
|
_si 74
|
337
328
|
an_ 74
|
338
|
-
|
339
|
-
us 73
|
340
|
-
på 73
|
329
|
+
ft 74
|
341
330
|
mm 73
|
331
|
+
på 73
|
342
332
|
F 73
|
343
|
-
|
344
|
-
|
333
|
+
_._ 73
|
334
|
+
us 73
|
335
|
+
e. 73
|
336
|
+
e,_ 72
|
345
337
|
di 72
|
346
|
-
e, 72
|
347
338
|
rin 72
|
348
|
-
|
349
|
-
|
350
|
-
|
339
|
+
e, 72
|
340
|
+
_De 72
|
341
|
+
E 72
|
342
|
+
nske 71
|
351
343
|
_på 71
|
352
344
|
_der 71
|
353
|
-
|
345
|
+
_p� 71
|
346
|
+
_lan 71
|
347
|
+
get 70
|
354
348
|
le_ 70
|
349
|
+
ark_ 70
|
355
350
|
st� 70
|
356
|
-
get 70
|
357
351
|
gi 70
|
358
|
-
e,_ 69
|
359
|
-
pr 69
|
360
352
|
ist 69
|
353
|
+
pr 69
|
354
|
+
nd_ 68
|
361
355
|
var 68
|
362
|
-
blev 68
|
363
|
-
mark_ 68
|
364
|
-
_ble 68
|
365
356
|
_blev 68
|
357
|
+
_ble 68
|
358
|
+
blev 68
|
366
359
|
ks 68
|
360
|
+
mark_ 67
|
367
361
|
på_ 67
|
368
|
-
_va 67
|
369
|
-
nd_ 67
|
370
|
-
anske 67
|
371
362
|
ss 67
|
372
|
-
|
373
|
-
|
374
|
-
|
363
|
+
anske 67
|
364
|
+
_va 67
|
365
|
+
_( 66
|
366
|
+
_land 66
|
367
|
+
ati 66
|
375
368
|
tio 66
|
376
369
|
lse 66
|
377
|
-
|
370
|
+
år 66
|
371
|
+
fi 66
|
378
372
|
_på_ 66
|
379
373
|
) 66
|
380
374
|
tion 66
|
381
|
-
|
382
|
-
fi 66
|
375
|
+
�r 66
|
383
376
|
( 66
|
384
|
-
|
385
|
-
one 65
|
377
|
+
gr 66
|
386
378
|
ef 65
|
387
379
|
sto 65
|
388
380
|
kt 65
|
389
|
-
|
381
|
+
one 65
|
390
382
|
sen 64
|
391
|
-
else 64
|
392
383
|
ev_ 64
|
393
|
-
|
384
|
+
else 64
|
394
385
|
A 63
|
395
386
|
ende_ 63
|
396
387
|
ren 63
|
388
|
+
ring 63
|
389
|
+
_ko 62
|
397
390
|
for_ 62
|
398
391
|
ho 62
|
399
|
-
|
392
|
+
dr 61
|
393
|
+
ig_ 61
|
394
|
+
ste_ 61
|
400
395
|
rig 61
|
396
|
+
lev_ 60
|
397
|
+
ret 60
|
398
|
+
blev_ 60
|
399
|
+
_E 59
|
400
|
+
fø 59
|