scylla 0.4.3 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +1 -0
- data/Gemfile.lock +10 -0
- data/VERSION +1 -1
- data/lib/scylla/generator.rb +1 -1
- data/lib/scylla/lms/13375P33K.lm +156 -156
- data/lib/scylla/lms/arabic.lm +133 -133
- data/lib/scylla/lms/bulgarian.lm +122 -122
- data/lib/scylla/lms/catalan.lm +151 -151
- data/lib/scylla/lms/danish.lm +137 -137
- data/lib/scylla/lms/english.lm +207 -207
- data/lib/scylla/lms/french.lm +400 -400
- data/lib/scylla/lms/japanese.lm +400 -400
- data/lib/scylla/lms/korean.lm +233 -233
- data/lib/scylla/lms/norwegian.lm +398 -398
- data/lib/scylla/lms/spanish.lm +98 -98
- data/lib/scylla/lms/swedish.lm +123 -123
- data/lib/scylla/lms/tagalog.lm +223 -223
- data/lib/scylla/lms/welsh.lm +234 -234
- data/lib/scylla/resources.rb +10 -10
- data/scylla.gemspec +17 -40
- data/source_texts/catalan.txt +28 -28
- data/source_texts/danish.txt +62 -62
- data/source_texts/english.txt +10 -10
- data/source_texts/french.txt +470 -77
- data/source_texts/japanese.txt +453 -199
- data/source_texts/norwegian.txt +96 -63
- data/source_texts/spanish.txt +269 -269
- data/test/classifier_test.rb +2 -2
- data/test/fixtures/lms/13375p33k.lm +156 -156
- data/test/fixtures/lms/danish.lm +137 -137
- data/test/fixtures/lms/english.lm +207 -207
- data/test/fixtures/lms/french.lm +400 -400
- data/test/fixtures/lms/hindi.lm +400 -0
- data/test/fixtures/lms/italian.lm +400 -0
- data/test/fixtures/lms/japanese.lm +400 -400
- data/test/fixtures/lms/norwegian.lm +400 -0
- data/test/fixtures/lms/spanish.lm +98 -98
- data/test/fixtures/source_texts/danish.txt +62 -62
- data/test/fixtures/source_texts/english.txt +10 -10
- data/test/fixtures/source_texts/french.txt +470 -77
- data/test/fixtures/source_texts/hindi.txt +199 -0
- data/test/fixtures/source_texts/italian.txt +120 -0
- data/test/fixtures/source_texts/japanese.txt +453 -199
- data/test/fixtures/source_texts/norwegian.txt +190 -0
- data/test/fixtures/source_texts/spanish.txt +269 -269
- data/test/fixtures/test_languages/english +61 -0
- data/test/fixtures/test_languages/french +0 -0
- data/test/fixtures/test_languages/german +29 -0
- data/test/fixtures/test_languages/hindi +3 -0
- data/test/fixtures/test_languages/italian +6 -0
- data/test/fixtures/test_languages/japanese +79 -0
- data/test/fixtures/test_languages/norwegian +14 -0
- data/test/fixtures/test_languages/spanish +22 -0
- data/test/generator_test.rb +0 -1
- data/test/language_test.rb +28 -0
- metadata +20 -43
- data/lib/scylla/lms/esperanto.lm +0 -400
- data/lib/scylla/lms/hungarian.lm +0 -400
- data/lib/scylla/lms/irish.lm +0 -400
- data/lib/scylla/lms/kannada.lm +0 -400
- data/lib/scylla/lms/latin.lm +0 -400
- data/lib/scylla/lms/malay.lm +0 -400
- data/lib/scylla/lms/marathi.lm +0 -400
- data/lib/scylla/lms/mingo.lm +0 -400
- data/lib/scylla/lms/nepali.lm +0 -400
- data/lib/scylla/lms/quechua.lm +0 -400
- data/lib/scylla/lms/rumantsch.lm +0 -400
- data/lib/scylla/lms/sanskrit.lm +0 -400
- data/lib/scylla/lms/scots_gaelic.lm +0 -400
- data/lib/scylla/lms/serbian.lm +0 -400
- data/lib/scylla/lms/swahili.lm +0 -400
- data/lib/scylla/lms/tamil.lm +0 -400
- data/lib/scylla/lms/ukrainian.lm +0 -400
- data/lib/scylla/lms/yiddish.lm +0 -400
- data/source_texts/esperanto.txt +0 -199
- data/source_texts/hungarian.txt +0 -102
- data/source_texts/irish.txt +0 -209
- data/source_texts/kannada.txt +0 -283
- data/source_texts/latin.txt +0 -120
- data/source_texts/malay.txt +0 -108
- data/source_texts/marathi.txt +0 -100
- data/source_texts/mingo.txt +0 -146
- data/source_texts/nepali.txt +0 -131
- data/source_texts/quechua.txt +0 -108
- data/source_texts/rumantsch.txt +0 -110
- data/source_texts/sanskrit.txt +0 -135
- data/source_texts/scots_gaelic.txt +0 -93
- data/source_texts/serbian.txt +0 -121
- data/source_texts/swahili.txt +0 -120
- data/source_texts/tamil.txt +0 -167
- data/source_texts/ukrainian.txt +0 -214
- data/source_texts/yiddish-utf.txt +0 -83
- data/test/fixtures/lms/kannada.lm +0 -400
- data/test/fixtures/source_texts/kannada.txt +0 -283
data/lib/scylla/lms/catalan.lm
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
_
|
1
|
+
_ 15194
|
2
2
|
a 4057
|
3
3
|
e 3799
|
4
4
|
l 2679
|
@@ -10,49 +10,49 @@ r 1853
|
|
10
10
|
o 1493
|
11
11
|
c 1484
|
12
12
|
� 1204
|
13
|
+
a_ 1195
|
13
14
|
d 1193
|
14
|
-
|
15
|
-
s_ 1131
|
15
|
+
s_ 1143
|
16
16
|
u 1083
|
17
|
-
__
|
17
|
+
__ 912
|
18
18
|
m 842
|
19
|
-
_d
|
19
|
+
_d 773
|
20
20
|
es 765
|
21
21
|
en 764
|
22
22
|
p 702
|
23
|
-
_l
|
23
|
+
_l 682
|
24
24
|
al 681
|
25
25
|
de 636
|
26
|
-
e_
|
27
|
-
_e
|
26
|
+
e_ 617
|
27
|
+
_e 596
|
28
28
|
ta 559
|
29
29
|
_de 544
|
30
|
-
l_
|
30
|
+
l_ 510
|
31
31
|
g 500
|
32
32
|
, 490
|
33
|
-
,_
|
33
|
+
,_ 484
|
34
34
|
la 470
|
35
35
|
at 463
|
36
36
|
_c 454
|
37
37
|
el 452
|
38
38
|
le 449
|
39
|
-
_a
|
39
|
+
_a 447
|
40
|
+
es_ 430
|
40
41
|
ca 427
|
41
|
-
es_ 425
|
42
42
|
er 409
|
43
43
|
ci 399
|
44
|
-
_p
|
44
|
+
_p 397
|
45
45
|
. 394
|
46
|
-
t_
|
46
|
+
t_ 391
|
47
47
|
an 383
|
48
48
|
nt 382
|
49
|
-
à 364
|
50
49
|
� 364
|
50
|
+
à 364
|
51
51
|
re 359
|
52
52
|
st 352
|
53
53
|
ar 348
|
54
|
+
n_ 346
|
54
55
|
b 345
|
55
|
-
n_ 343
|
56
56
|
de_ 339
|
57
57
|
_de_ 333
|
58
58
|
ra 319
|
@@ -60,26 +60,27 @@ f 305
|
|
60
60
|
' 300
|
61
61
|
on 298
|
62
62
|
el_ 295
|
63
|
+
._ 286
|
64
|
+
la_ 270
|
63
65
|
v 269
|
64
|
-
la_ 269
|
65
66
|
ll 266
|
66
|
-
_i
|
67
|
+
_i 263
|
67
68
|
ic 258
|
68
|
-
|
69
|
+
i_ 256
|
69
70
|
ue 253
|
70
|
-
i_ 252
|
71
71
|
or 251
|
72
72
|
_la 250
|
73
73
|
q 246
|
74
74
|
qu 242
|
75
75
|
_la_ 238
|
76
|
+
_m 236
|
76
77
|
in 236
|
77
78
|
te 235
|
78
79
|
tal 233
|
79
80
|
_ca 232
|
80
|
-
it 227
|
81
81
|
ia 227
|
82
|
-
|
82
|
+
it 227
|
83
|
+
_s 221
|
83
84
|
ti 220
|
84
85
|
om 216
|
85
86
|
co 214
|
@@ -87,9 +88,8 @@ en_ 213
|
|
87
88
|
ent 212
|
88
89
|
na 211
|
89
90
|
que 211
|
90
|
-
_el 204
|
91
91
|
ri 204
|
92
|
-
|
92
|
+
_el 204
|
93
93
|
un 198
|
94
94
|
i� 197
|
95
95
|
ata 195
|
@@ -100,15 +100,15 @@ E 188
|
|
100
100
|
atal 187
|
101
101
|
ng 183
|
102
102
|
len 181
|
103
|
-
|
103
|
+
_i_ 181
|
104
|
+
_a_ 180
|
104
105
|
� 179
|
105
106
|
A 179
|
106
|
-
|
107
|
+
ó 179
|
108
|
+
� 177
|
107
109
|
é 177
|
108
|
-
gu 177
|
109
110
|
r_ 177
|
110
|
-
|
111
|
-
� 177
|
111
|
+
gu 177
|
112
112
|
nc 176
|
113
113
|
ci� 175
|
114
114
|
l� 173
|
@@ -118,283 +118,283 @@ di 170
|
|
118
118
|
_co 170
|
119
119
|
pe 169
|
120
120
|
_en 168
|
121
|
-
[ 164
|
122
|
-
] 164
|
123
121
|
li 164
|
124
|
-
C 163
|
125
122
|
ac 163
|
123
|
+
C 163
|
124
|
+
_t 155
|
126
125
|
me 154
|
127
126
|
del 153
|
128
|
-
_del 152
|
129
127
|
í 152
|
128
|
+
_del 152
|
130
129
|
� 152
|
131
|
-
_t 152
|
132
|
-
l' 151
|
133
130
|
lle 151
|
131
|
+
l' 151
|
134
132
|
cat 151
|
135
133
|
_el_ 150
|
136
134
|
ls 149
|
137
135
|
tr 149
|
138
136
|
- 148
|
139
|
-
_es 147
|
140
137
|
pa 147
|
138
|
+
_es 147
|
141
139
|
ts 146
|
142
|
-
_l' 145
|
143
140
|
) 145
|
141
|
+
_l' 145
|
144
142
|
là 144
|
145
|
-
_f 143
|
146
143
|
est 143
|
144
|
+
_f 143
|
147
145
|
ua 142
|
148
146
|
ne 140
|
147
|
+
_. 139
|
149
148
|
_ll 138
|
150
149
|
pr 138
|
151
|
-
ro 137
|
152
|
-
catal 137
|
153
150
|
cata 137
|
151
|
+
catal 137
|
152
|
+
ro 137
|
153
|
+
al_ 137
|
154
154
|
( 135
|
155
|
-
al_ 135
|
156
|
-
_cata 134
|
157
155
|
_cat 134
|
156
|
+
_cata 134
|
158
157
|
ma 133
|
159
|
-
ió 133
|
160
158
|
h 133
|
161
|
-
|
162
|
-
|
159
|
+
ió 133
|
160
|
+
nt_ 132
|
161
|
+
ts_ 132
|
163
162
|
_en_ 130
|
163
|
+
_( 130
|
164
164
|
oc 130
|
165
|
-
ts_ 130
|
166
|
-
_. 128
|
167
165
|
sta 127
|
166
|
+
ls_ 127
|
167
|
+
_del_ 126
|
168
168
|
_q 126
|
169
|
-
ls_ 126
|
170
169
|
del_ 126
|
171
|
-
_del_ 126
|
172
|
-
L 124
|
173
170
|
_qu 124
|
171
|
+
L 124
|
172
|
+
_._ 124
|
174
173
|
am 123
|
175
|
-
|
176
|
-
|
177
|
-
ica 122
|
178
|
-
nci 122
|
174
|
+
ó_ 123
|
175
|
+
�_ 123
|
179
176
|
no 122
|
180
|
-
|
181
|
-
|
177
|
+
nci 122
|
178
|
+
ica 122
|
179
|
+
_o 120
|
182
180
|
et 120
|
181
|
+
és 120
|
183
182
|
ció 120
|
183
|
+
�s 120
|
184
184
|
ni 118
|
185
|
-
_._ 118
|
186
|
-
_o 118
|
187
185
|
és_ 117
|
188
186
|
�s_ 117
|
189
|
-
al� 116
|
190
187
|
ale 116
|
188
|
+
al� 116
|
191
189
|
les 116
|
192
|
-
_lle 115
|
193
|
-
eng 115
|
194
|
-
leng 115
|
195
190
|
_pe 115
|
191
|
+
leng 115
|
192
|
+
eng 115
|
193
|
+
_lle 115
|
194
|
+
atal� 114
|
196
195
|
tal� 114
|
197
|
-
talà 114
|
198
196
|
_v 114
|
197
|
+
talà 114
|
199
198
|
alà 114
|
200
|
-
|
199
|
+
ió_ 113
|
201
200
|
y 113
|
202
|
-
ió_ 112
|
203
201
|
as 112
|
204
|
-
|
202
|
+
llen 111
|
205
203
|
fi 111
|
204
|
+
per 111
|
206
205
|
va 111
|
207
|
-
llen 111
|
208
206
|
tat 110
|
209
207
|
ad 109
|
210
|
-
les_ 109
|
211
208
|
aci 109
|
209
|
+
les_ 109
|
212
210
|
là_ 109
|
213
|
-
|
214
|
-
at_ 108
|
211
|
+
at_ 109
|
215
212
|
_que 108
|
213
|
+
lleng 108
|
216
214
|
_pr 107
|
217
215
|
po 106
|
218
216
|
tu 106
|
219
|
-
o_
|
217
|
+
o_ 106
|
218
|
+
_C 105
|
220
219
|
m� 105
|
221
220
|
ant 104
|
222
221
|
_llen 103
|
223
222
|
� 102
|
223
|
+
com 102
|
224
224
|
è 102
|
225
225
|
g� 102
|
226
226
|
t� 102
|
227
|
-
com 102
|
228
227
|
ec 101
|
229
228
|
da 101
|
229
|
+
ció_ 100
|
230
|
+
ue_ 100
|
230
231
|
que_ 100
|
231
232
|
ues 100
|
232
|
-
|
233
|
-
ció_ 99
|
233
|
+
na_ 99
|
234
234
|
_que_ 99
|
235
235
|
cia 98
|
236
|
-
na_ 98
|
237
|
-
_C 98
|
238
|
-
_r 97
|
239
236
|
_al 97
|
237
|
+
_r 97
|
240
238
|
ny 97
|
241
|
-
_com 96
|
242
|
-
a, 96
|
243
239
|
_h 96
|
240
|
+
_com 96
|
244
241
|
is 96
|
242
|
+
a, 96
|
245
243
|
ngu 95
|
246
|
-
a,_
|
244
|
+
a,_ 95
|
247
245
|
mi 94
|
246
|
+
ix 93
|
248
247
|
ia_ 93
|
249
248
|
sti 93
|
250
|
-
|
249
|
+
_u 92
|
251
250
|
ques 92
|
252
251
|
ns_ 91
|
253
|
-
|
254
|
-
d' 90
|
255
|
-
em 90
|
256
|
-
_u 90
|
252
|
+
os 90
|
257
253
|
alà_ 90
|
254
|
+
em 90
|
258
255
|
gua 90
|
256
|
+
enc 90
|
259
257
|
_d' 90
|
260
|
-
|
261
|
-
rt 89
|
262
|
-
men 89
|
263
|
-
aci� 89
|
264
|
-
r� 89
|
258
|
+
d' 90
|
265
259
|
_per 89
|
260
|
+
r� 89
|
266
261
|
to 89
|
262
|
+
men 89
|
267
263
|
ació 89
|
268
|
-
|
264
|
+
ca_ 89
|
265
|
+
rt 89
|
266
|
+
aci� 89
|
269
267
|
ct 88
|
270
|
-
|
271
|
-
mb 87
|
268
|
+
_se 88
|
272
269
|
� 87
|
273
|
-
|
270
|
+
mb 87
|
271
|
+
ü 87
|
274
272
|
j 86
|
275
273
|
er_ 86
|
276
|
-
|
277
|
-
lengu 85
|
274
|
+
P 86
|
278
275
|
engu 85
|
276
|
+
ngua 85
|
279
277
|
engua 85
|
278
|
+
lengu 85
|
280
279
|
ol 85
|
281
280
|
I 84
|
281
|
+
io 84
|
282
282
|
ons 84
|
283
283
|
_le 84
|
284
|
-
io 84
|
285
284
|
tre 83
|
286
285
|
si 83
|
287
|
-
res 82
|
288
286
|
� 82
|
287
|
+
res 82
|
288
|
+
_� 82
|
289
289
|
gü 82
|
290
290
|
ei 81
|
291
|
-
|
291
|
+
_n 81
|
292
292
|
par 80
|
293
293
|
ut 80
|
294
294
|
ent_ 79
|
295
|
-
|
295
|
+
ica_ 79
|
296
|
+
so 78
|
296
297
|
ng� 78
|
297
298
|
esta 78
|
298
|
-
so 78
|
299
299
|
ur 78
|
300
|
-
_[ 77
|
301
|
-
ngü 77
|
302
|
-
bl 77
|
303
300
|
: 77
|
304
|
-
|
305
|
-
|
306
|
-
�s 76
|
307
|
-
ís 76
|
301
|
+
bl 77
|
302
|
+
ngü 77
|
308
303
|
_pa 76
|
304
|
+
�s 76
|
309
305
|
_les 76
|
310
306
|
_di 76
|
311
|
-
|
307
|
+
eg 76
|
308
|
+
ís 76
|
309
|
+
_les_ 76
|
312
310
|
ha 75
|
311
|
+
_re 75
|
313
312
|
con 74
|
314
313
|
_un 74
|
315
|
-
tic 73
|
316
314
|
_g 73
|
317
315
|
ran 73
|
316
|
+
tic 73
|
318
317
|
mp 73
|
319
318
|
mo 72
|
319
|
+
_, 71
|
320
320
|
lu 70
|
321
321
|
_po 70
|
322
|
+
_,_ 70
|
322
323
|
els 69
|
323
324
|
Ca 69
|
324
325
|
rs 68
|
326
|
+
enci 68
|
325
327
|
T 68
|
326
|
-
els_ 68
|
327
328
|
ie 68
|
329
|
+
els_ 68
|
328
330
|
fic 68
|
329
|
-
enci 68
|
330
|
-
amb 67
|
331
|
-
rd 67
|
332
331
|
ua_ 67
|
332
|
+
rd 67
|
333
|
+
amb 67
|
334
|
+
ta_ 67
|
333
335
|
_no 66
|
334
|
-
ta_ 66
|
335
|
-
]_ 66
|
336
336
|
ot 66
|
337
|
-
|
338
|
-
|
337
|
+
_P 66
|
338
|
+
m_ 66
|
339
339
|
ial 65
|
340
340
|
ter 65
|
341
|
-
des 65
|
342
341
|
nta 65
|
343
|
-
|
342
|
+
ment 65
|
343
|
+
des 65
|
344
344
|
_b 64
|
345
345
|
ià 64
|
346
346
|
_ha 63
|
347
347
|
eix 63
|
348
|
+
)_ 63
|
349
|
+
ra_ 63
|
350
|
+
gua_ 62
|
348
351
|
�n 62
|
352
|
+
ngua_ 62
|
349
353
|
vi 62
|
350
|
-
|
351
|
-
ra_ 62
|
354
|
+
als 62
|
352
355
|
àn 62
|
353
|
-
|
354
|
-
ngua_ 62
|
356
|
+
do 62
|
355
357
|
re_ 62
|
356
|
-
|
358
|
+
ar_ 61
|
357
359
|
lenc 61
|
358
|
-
V 61
|
359
|
-
if 61
|
360
360
|
us 61
|
361
|
-
|
362
|
-
|
363
|
-
alenc 60
|
364
|
-
lenci 60
|
361
|
+
if 61
|
362
|
+
V 61
|
365
363
|
alen 60
|
366
364
|
M 60
|
365
|
+
_mo 60
|
366
|
+
lenci 60
|
367
|
+
alenc 60
|
368
|
+
nd 60
|
369
|
+
ing 59
|
370
|
+
ell 59
|
367
371
|
fe 59
|
368
|
-
_, 59
|
369
372
|
� 59
|
370
373
|
lo 59
|
371
|
-
|
372
|
-
ing 59
|
373
|
-
_ma 58
|
374
|
-
�sti 58
|
374
|
+
_Ca 58
|
375
375
|
íst 58
|
376
|
-
ce 58
|
377
|
-
�st 58
|
378
376
|
sp 58
|
377
|
+
�sti 58
|
379
378
|
ísti 58
|
380
379
|
'a 58
|
381
|
-
|
382
|
-
|
383
|
-
|
380
|
+
_E 58
|
381
|
+
ce 58
|
382
|
+
_A 58
|
383
|
+
_ma 58
|
384
|
+
�st 58
|
384
385
|
_é 57
|
385
|
-
|
386
|
+
_con 57
|
387
|
+
_és 57
|
386
388
|
ita 57
|
387
389
|
_va 57
|
388
|
-
|
389
|
-
_és_ 56
|
390
|
-
_E 56
|
391
|
-
om_ 56
|
390
|
+
� 57
|
392
391
|
S 56
|
392
|
+
_- 56
|
393
393
|
ya 56
|
394
|
-
_A 56
|
395
394
|
D 56
|
396
|
-
_
|
397
|
-
|
398
|
-
ul 55
|
399
|
-
sa 55
|
395
|
+
_és_ 56
|
396
|
+
om_ 56
|
400
397
|
_� 55
|
398
|
+
ul 55
|
399
|
+
era 55
|
400
|
+
� 55
|