scylla 0.4.3 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +1 -0
- data/Gemfile.lock +10 -0
- data/VERSION +1 -1
- data/lib/scylla/generator.rb +1 -1
- data/lib/scylla/lms/13375P33K.lm +156 -156
- data/lib/scylla/lms/arabic.lm +133 -133
- data/lib/scylla/lms/bulgarian.lm +122 -122
- data/lib/scylla/lms/catalan.lm +151 -151
- data/lib/scylla/lms/danish.lm +137 -137
- data/lib/scylla/lms/english.lm +207 -207
- data/lib/scylla/lms/french.lm +400 -400
- data/lib/scylla/lms/japanese.lm +400 -400
- data/lib/scylla/lms/korean.lm +233 -233
- data/lib/scylla/lms/norwegian.lm +398 -398
- data/lib/scylla/lms/spanish.lm +98 -98
- data/lib/scylla/lms/swedish.lm +123 -123
- data/lib/scylla/lms/tagalog.lm +223 -223
- data/lib/scylla/lms/welsh.lm +234 -234
- data/lib/scylla/resources.rb +10 -10
- data/scylla.gemspec +17 -40
- data/source_texts/catalan.txt +28 -28
- data/source_texts/danish.txt +62 -62
- data/source_texts/english.txt +10 -10
- data/source_texts/french.txt +470 -77
- data/source_texts/japanese.txt +453 -199
- data/source_texts/norwegian.txt +96 -63
- data/source_texts/spanish.txt +269 -269
- data/test/classifier_test.rb +2 -2
- data/test/fixtures/lms/13375p33k.lm +156 -156
- data/test/fixtures/lms/danish.lm +137 -137
- data/test/fixtures/lms/english.lm +207 -207
- data/test/fixtures/lms/french.lm +400 -400
- data/test/fixtures/lms/hindi.lm +400 -0
- data/test/fixtures/lms/italian.lm +400 -0
- data/test/fixtures/lms/japanese.lm +400 -400
- data/test/fixtures/lms/norwegian.lm +400 -0
- data/test/fixtures/lms/spanish.lm +98 -98
- data/test/fixtures/source_texts/danish.txt +62 -62
- data/test/fixtures/source_texts/english.txt +10 -10
- data/test/fixtures/source_texts/french.txt +470 -77
- data/test/fixtures/source_texts/hindi.txt +199 -0
- data/test/fixtures/source_texts/italian.txt +120 -0
- data/test/fixtures/source_texts/japanese.txt +453 -199
- data/test/fixtures/source_texts/norwegian.txt +190 -0
- data/test/fixtures/source_texts/spanish.txt +269 -269
- data/test/fixtures/test_languages/english +61 -0
- data/test/fixtures/test_languages/french +0 -0
- data/test/fixtures/test_languages/german +29 -0
- data/test/fixtures/test_languages/hindi +3 -0
- data/test/fixtures/test_languages/italian +6 -0
- data/test/fixtures/test_languages/japanese +79 -0
- data/test/fixtures/test_languages/norwegian +14 -0
- data/test/fixtures/test_languages/spanish +22 -0
- data/test/generator_test.rb +0 -1
- data/test/language_test.rb +28 -0
- metadata +20 -43
- data/lib/scylla/lms/esperanto.lm +0 -400
- data/lib/scylla/lms/hungarian.lm +0 -400
- data/lib/scylla/lms/irish.lm +0 -400
- data/lib/scylla/lms/kannada.lm +0 -400
- data/lib/scylla/lms/latin.lm +0 -400
- data/lib/scylla/lms/malay.lm +0 -400
- data/lib/scylla/lms/marathi.lm +0 -400
- data/lib/scylla/lms/mingo.lm +0 -400
- data/lib/scylla/lms/nepali.lm +0 -400
- data/lib/scylla/lms/quechua.lm +0 -400
- data/lib/scylla/lms/rumantsch.lm +0 -400
- data/lib/scylla/lms/sanskrit.lm +0 -400
- data/lib/scylla/lms/scots_gaelic.lm +0 -400
- data/lib/scylla/lms/serbian.lm +0 -400
- data/lib/scylla/lms/swahili.lm +0 -400
- data/lib/scylla/lms/tamil.lm +0 -400
- data/lib/scylla/lms/ukrainian.lm +0 -400
- data/lib/scylla/lms/yiddish.lm +0 -400
- data/source_texts/esperanto.txt +0 -199
- data/source_texts/hungarian.txt +0 -102
- data/source_texts/irish.txt +0 -209
- data/source_texts/kannada.txt +0 -283
- data/source_texts/latin.txt +0 -120
- data/source_texts/malay.txt +0 -108
- data/source_texts/marathi.txt +0 -100
- data/source_texts/mingo.txt +0 -146
- data/source_texts/nepali.txt +0 -131
- data/source_texts/quechua.txt +0 -108
- data/source_texts/rumantsch.txt +0 -110
- data/source_texts/sanskrit.txt +0 -135
- data/source_texts/scots_gaelic.txt +0 -93
- data/source_texts/serbian.txt +0 -121
- data/source_texts/swahili.txt +0 -120
- data/source_texts/tamil.txt +0 -167
- data/source_texts/ukrainian.txt +0 -214
- data/source_texts/yiddish-utf.txt +0 -83
- data/test/fixtures/lms/kannada.lm +0 -400
- data/test/fixtures/source_texts/kannada.txt +0 -283
@@ -1,4 +1,4 @@
|
|
1
|
-
_
|
1
|
+
_ 34451
|
2
2
|
a 10584
|
3
3
|
e 10526
|
4
4
|
o 7129
|
@@ -10,29 +10,28 @@ l 5112
|
|
10
10
|
d 4622
|
11
11
|
t 3867
|
12
12
|
c 3674
|
13
|
-
a_
|
14
|
-
e_
|
13
|
+
a_ 2975
|
14
|
+
e_ 2938
|
15
15
|
u 2914
|
16
16
|
� 2572
|
17
|
-
s_
|
17
|
+
s_ 2494
|
18
18
|
de 2468
|
19
|
-
|
20
|
-
_d 2335
|
19
|
+
_d 2324
|
21
20
|
p 2303
|
22
21
|
m 2195
|
23
|
-
_de
|
24
|
-
o_
|
22
|
+
_de 2087
|
23
|
+
o_ 1866
|
25
24
|
_e 1761
|
26
|
-
n_
|
25
|
+
n_ 1757
|
27
26
|
en 1746
|
28
|
-
de_
|
29
|
-
_de_
|
27
|
+
de_ 1679
|
28
|
+
_de_ 1591
|
30
29
|
la 1459
|
31
30
|
es 1454
|
32
31
|
_l 1371
|
33
32
|
, 1276
|
34
|
-
,_
|
35
|
-
l_
|
33
|
+
,_ 1240
|
34
|
+
l_ 1223
|
36
35
|
os 1221
|
37
36
|
er 1194
|
38
37
|
on 1122
|
@@ -40,13 +39,13 @@ as 1103
|
|
40
39
|
ci 1095
|
41
40
|
_p 1071
|
42
41
|
el 1049
|
43
|
-
_c
|
42
|
+
_c 1047
|
44
43
|
an 1032
|
45
44
|
ra 1018
|
46
45
|
al 997
|
47
46
|
g 941
|
48
47
|
_la 932
|
49
|
-
os_
|
48
|
+
os_ 930
|
50
49
|
nt 926
|
51
50
|
te 922
|
52
51
|
co 901
|
@@ -54,11 +53,11 @@ b 896
|
|
54
53
|
_a 893
|
55
54
|
re 888
|
56
55
|
ta 858
|
56
|
+
ad 827
|
57
57
|
ri 825
|
58
|
-
ad 825
|
59
58
|
ar 816
|
60
|
-
la_ 814
|
61
59
|
or 812
|
60
|
+
la_ 812
|
62
61
|
el_ 796
|
63
62
|
_s 795
|
64
63
|
. 766
|
@@ -69,7 +68,7 @@ do 751
|
|
69
68
|
st 737
|
70
69
|
ro 725
|
71
70
|
y 717
|
72
|
-
as_
|
71
|
+
as_ 715
|
73
72
|
_la_ 712
|
74
73
|
na 691
|
75
74
|
ue 688
|
@@ -78,40 +77,39 @@ in 658
|
|
78
77
|
_en 644
|
79
78
|
ca 643
|
80
79
|
ic 635
|
81
|
-
en_
|
82
|
-
es_ 621
|
80
|
+
en_ 631
|
83
81
|
da 621
|
84
82
|
ia 620
|
83
|
+
es_ 620
|
85
84
|
E 615
|
86
85
|
to 609
|
87
86
|
lo 605
|
88
87
|
_m 597
|
89
88
|
f 596
|
90
89
|
_co 590
|
91
|
-
y_
|
90
|
+
y_ 562
|
92
91
|
� 557
|
93
92
|
í 557
|
94
93
|
_el 553
|
95
94
|
ti 535
|
96
95
|
no 532
|
97
96
|
_en_ 529
|
98
|
-
._ 524
|
99
97
|
_y 523
|
100
98
|
un 513
|
101
99
|
ent 513
|
102
100
|
le 507
|
103
101
|
_el_ 505
|
104
|
-
r_
|
102
|
+
r_ 499
|
105
103
|
io 495
|
106
|
-
_y_
|
107
|
-
�n 488
|
104
|
+
_y_ 492
|
108
105
|
ón 488
|
106
|
+
�n 488
|
109
107
|
i� 487
|
110
108
|
po 475
|
111
109
|
a� 472
|
112
|
-
_E 470
|
113
110
|
se 468
|
114
111
|
sp 460
|
112
|
+
_E 459
|
115
113
|
q 456
|
116
114
|
qu 455
|
117
115
|
� 453
|
@@ -120,6 +118,7 @@ C 451
|
|
120
118
|
tr 450
|
121
119
|
_t 444
|
122
120
|
ma 440
|
121
|
+
._ 437
|
123
122
|
id 432
|
124
123
|
ac 431
|
125
124
|
ió 428
|
@@ -129,27 +128,27 @@ om 425
|
|
129
128
|
ne 414
|
130
129
|
nte 414
|
131
130
|
con 410
|
132
|
-
do_
|
131
|
+
do_ 406
|
133
132
|
nc 406
|
133
|
+
nd 404
|
134
134
|
_r 403
|
135
135
|
li 403
|
136
|
-
nd 403
|
137
136
|
ie 401
|
138
137
|
si 399
|
139
138
|
me 396
|
140
139
|
añ 395
|
141
|
-
á 391
|
142
140
|
� 391
|
141
|
+
á 391
|
143
142
|
h 383
|
144
143
|
pr 382
|
145
|
-
�n_ 376
|
146
|
-
ón_ 376
|
147
144
|
spa 375
|
148
|
-
|
145
|
+
�n_ 375
|
146
|
+
ón_ 375
|
149
147
|
que 374
|
150
148
|
ión 373
|
151
149
|
ue_ 370
|
152
150
|
pa� 363
|
151
|
+
_C 358
|
153
152
|
ec 341
|
154
153
|
mi 340
|
155
154
|
ión_ 339
|
@@ -162,50 +161,50 @@ _pr 328
|
|
162
161
|
A 327
|
163
162
|
te_ 327
|
164
163
|
que_ 324
|
165
|
-
_q 319
|
166
164
|
_qu 319
|
165
|
+
_q 319
|
167
166
|
it 317
|
168
167
|
pañ 315
|
169
168
|
Es 313
|
170
169
|
_se 310
|
171
170
|
mo 310
|
171
|
+
spa� 309
|
172
172
|
_lo 309
|
173
173
|
spañ 309
|
174
|
-
spa� 309
|
175
174
|
_que 308
|
176
175
|
_po 307
|
177
176
|
los 305
|
178
177
|
_es 304
|
179
|
-
al_
|
178
|
+
al_ 302
|
180
179
|
ol 302
|
181
|
-
aci 300
|
182
180
|
ci� 300
|
181
|
+
aci 300
|
183
182
|
ció 299
|
184
183
|
los_ 296
|
184
|
+
a, 295
|
185
185
|
I 295
|
186
|
-
a, 294
|
187
186
|
ado 294
|
187
|
+
a,_ 293
|
188
188
|
ur 293
|
189
|
-
a,_ 292
|
190
|
-
_que_ 291
|
191
189
|
_i 291
|
190
|
+
_que_ 291
|
192
191
|
ción 285
|
193
|
-
_Es
|
192
|
+
_Es 282
|
194
193
|
su 282
|
195
|
-
ña 278
|
196
194
|
�a 278
|
195
|
+
ña 278
|
197
196
|
z 275
|
198
197
|
_f 275
|
199
|
-
_u 271
|
200
198
|
ica 271
|
201
|
-
|
199
|
+
_u 271
|
202
200
|
del 270
|
201
|
+
on_ 270
|
203
202
|
cia 267
|
204
203
|
ce 267
|
205
204
|
_del 266
|
206
205
|
del_ 261
|
207
|
-
� 261
|
208
206
|
é 261
|
207
|
+
� 261
|
209
208
|
nci 260
|
210
209
|
_del_ 259
|
211
210
|
tu 258
|
@@ -217,51 +216,52 @@ am 255
|
|
217
216
|
_los_ 254
|
218
217
|
sta 253
|
219
218
|
_un 252
|
220
|
-
Espa 250
|
221
219
|
Espa� 250
|
222
220
|
Esp 250
|
223
|
-
|
221
|
+
Espa 250
|
222
|
+
s,_ 249
|
224
223
|
s, 249
|
225
|
-
|
224
|
+
ra_ 248
|
226
225
|
est 245
|
227
|
-
ll 245
|
228
226
|
ab 245
|
227
|
+
ll 245
|
229
228
|
las 244
|
230
229
|
por 244
|
231
|
-
ía 243
|
232
230
|
�a 243
|
231
|
+
__ 243
|
233
232
|
aña 243
|
234
|
-
|
233
|
+
ía 243
|
235
234
|
r� 239
|
235
|
+
at 239
|
236
236
|
so 234
|
237
237
|
paña 234
|
238
|
-
_A 233
|
239
|
-
im 231
|
240
238
|
_a_ 231
|
239
|
+
im 231
|
241
240
|
las_ 230
|
241
|
+
_A 228
|
242
242
|
ns 228
|
243
|
-
_Esp 227
|
244
|
-
_Espa 227
|
245
|
-
cu 226
|
246
243
|
em 226
|
247
|
-
|
244
|
+
_Espa 226
|
245
|
+
_Esp 226
|
246
|
+
cu 226
|
248
247
|
j 224
|
248
|
+
na_ 224
|
249
249
|
ul 220
|
250
|
-
ant 219
|
251
250
|
P 219
|
251
|
+
ant 219
|
252
252
|
ente 218
|
253
253
|
rr 218
|
254
254
|
to_ 217
|
255
255
|
nte_ 217
|
256
256
|
) 216
|
257
|
-
( 216
|
258
257
|
_n 216
|
258
|
+
( 216
|
259
259
|
dad 215
|
260
|
-
ia_ 215
|
261
260
|
se_ 215
|
261
|
+
ia_ 215
|
262
262
|
_( 214
|
263
|
-
il 213
|
264
263
|
vi 213
|
264
|
+
il 213
|
265
265
|
L 211
|
266
266
|
ter 209
|
267
267
|
_pa 206
|
@@ -270,7 +270,7 @@ men 203
|
|
270
270
|
era 202
|
271
271
|
ran 201
|
272
272
|
les 201
|
273
|
-
da_
|
273
|
+
da_ 201
|
274
274
|
ig 198
|
275
275
|
_su 198
|
276
276
|
o, 197
|
@@ -280,121 +280,121 @@ tra 193
|
|
280
280
|
res 192
|
281
281
|
cio 190
|
282
282
|
com 190
|
283
|
+
ida 189
|
283
284
|
one 189
|
284
285
|
_ca 189
|
285
|
-
|
286
|
+
ed 188
|
286
287
|
M 188
|
287
288
|
S 187
|
288
|
-
ed 187
|
289
|
-
t� 185
|
290
289
|
_las 185
|
291
290
|
m� 185
|
292
|
-
ona 185
|
293
291
|
_las_ 185
|
294
|
-
|
292
|
+
ona 185
|
293
|
+
t� 185
|
295
294
|
ion 184
|
295
|
+
ha 184
|
296
|
+
od 184
|
296
297
|
nes 183
|
297
298
|
no_ 182
|
298
|
-
od 182
|
299
299
|
ale 180
|
300
|
-
_P 179
|
301
|
-
br 178
|
302
300
|
sa 178
|
303
301
|
_com 178
|
302
|
+
br 178
|
304
303
|
_por 177
|
305
304
|
mp 177
|
306
305
|
bi 176
|
307
306
|
_in 176
|
308
|
-
pro 175
|
309
307
|
ist 175
|
308
|
+
pro 175
|
309
|
+
aci� 173
|
310
310
|
ació 173
|
311
311
|
dos 173
|
312
|
-
aci� 173
|
313
312
|
ct 172
|
314
313
|
des 172
|
315
314
|
oc 172
|
316
315
|
eg 167
|
317
|
-
_I 167
|
318
316
|
_al 167
|
319
317
|
an_ 166
|
318
|
+
_I 166
|
320
319
|
por_ 166
|
321
320
|
ero 165
|
322
321
|
_pro 164
|
323
|
-
_por_ 164
|
324
322
|
_se_ 164
|
325
|
-
|
323
|
+
_por_ 164
|
326
324
|
ño 163
|
327
|
-
�a_ 163
|
328
325
|
�o 163
|
326
|
+
ía_ 162
|
329
327
|
_v 162
|
328
|
+
�a_ 162
|
330
329
|
va 161
|
331
330
|
ment 161
|
332
331
|
lo_ 160
|
333
332
|
iv 160
|
333
|
+
_P 160
|
334
334
|
gu 159
|
335
335
|
ndo 159
|
336
336
|
mu 158
|
337
|
-
_. 157
|
338
|
-
_si 156
|
339
337
|
et 156
|
338
|
+
_si 156
|
340
339
|
ici 155
|
341
|
-
fi 155
|
342
340
|
d_ 155
|
343
|
-
|
341
|
+
fi 155
|
344
342
|
ria 154
|
345
|
-
|
343
|
+
go 154
|
346
344
|
a. 152
|
347
|
-
ron 152
|
348
345
|
mo_ 152
|
349
|
-
|
350
|
-
ones 151
|
351
|
-
_ha 151
|
346
|
+
ron 152
|
352
347
|
op 151
|
348
|
+
_ha 151
|
349
|
+
ones 151
|
350
|
+
ga 151
|
353
351
|
za 151
|
354
|
-
_L 150
|
355
352
|
us 150
|
356
|
-
_S 149
|
357
353
|
mb 149
|
358
|
-
ca_ 148
|
359
|
-
ba 148
|
360
354
|
año 148
|
355
|
+
ba 148
|
361
356
|
Ca 148
|
362
|
-
sti 147
|
363
357
|
_pe 147
|
364
|
-
|
358
|
+
sti 147
|
365
359
|
ncia 147
|
360
|
+
ado_ 147
|
366
361
|
ua 146
|
367
362
|
uc 146
|
368
363
|
ico 146
|
364
|
+
_S 145
|
369
365
|
nes_ 145
|
370
366
|
s. 144
|
371
|
-
ña_ 144
|
372
367
|
�a_ 144
|
373
368
|
ve 144
|
369
|
+
ña_ 144
|
374
370
|
rio 143
|
375
371
|
cion 143
|
372
|
+
_L 142
|
373
|
+
_M 142
|
376
374
|
_con_ 142
|
377
375
|
con_ 142
|
378
|
-
_._ 142
|
379
376
|
ente_ 142
|
380
377
|
ip 141
|
381
378
|
rc 141
|
382
|
-
io_ 140
|
383
379
|
ntr 140
|
384
|
-
|
380
|
+
io_ 140
|
385
381
|
tor 139
|
382
|
+
ca_ 139
|
383
|
+
nto 139
|
386
384
|
_g 138
|
387
385
|
ob 138
|
388
|
-
ta_ 138
|
389
386
|
par 138
|
390
|
-
|
387
|
+
ta_ 138
|
391
388
|
ir 137
|
389
|
+
G 137
|
392
390
|
aña_ 136
|
393
391
|
bl 136
|
394
392
|
n� 136
|
395
393
|
ante 136
|
396
394
|
dos_ 135
|
397
|
-
|
395
|
+
err 134
|
398
396
|
�_ 134
|
397
|
+
rm 134
|
398
|
+
ó_ 134
|
399
399
|
eri 134
|
400
|
-
|
400
|
+
ori 134
|