scylla 0.5.0 → 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +4 -2
- data/Gemfile.lock +16 -1
- data/lib/scylla/classifier.rb +1 -1
- data/lib/scylla/generator.rb +16 -4
- data/lib/scylla/lms/afrikaans.lm +232 -232
- data/lib/scylla/lms/arabic.lm +175 -175
- data/lib/scylla/lms/bulgarian.lm +225 -225
- data/lib/scylla/lms/catalan.lm +309 -309
- data/lib/scylla/lms/danish.lm +167 -167
- data/lib/scylla/lms/english.lm +398 -398
- data/lib/scylla/lms/finnish.lm +237 -237
- data/lib/scylla/lms/french.lm +148 -148
- data/lib/scylla/lms/german.lm +258 -258
- data/lib/scylla/lms/greek.lm +236 -236
- data/lib/scylla/lms/hebrew.lm +154 -154
- data/lib/scylla/lms/hindi.lm +139 -139
- data/lib/scylla/lms/icelandic.lm +239 -239
- data/lib/scylla/lms/indonesian.lm +244 -244
- data/lib/scylla/lms/italian.lm +248 -248
- data/lib/scylla/lms/japanese.lm +90 -90
- data/lib/scylla/lms/korean.lm +306 -306
- data/lib/scylla/lms/norwegian.lm +193 -193
- data/lib/scylla/lms/polish.lm +241 -241
- data/lib/scylla/lms/portuguese.lm +232 -232
- data/lib/scylla/lms/romanian.lm +246 -246
- data/lib/scylla/lms/slovak.lm +242 -242
- data/lib/scylla/lms/slovenian.lm +229 -229
- data/lib/scylla/lms/spanish.lm +164 -164
- data/lib/scylla/lms/swedish.lm +157 -157
- data/lib/scylla/lms/tagalog.lm +247 -247
- data/lib/scylla/lms/thai.lm +252 -252
- data/lib/scylla/lms/turkish.lm +285 -285
- data/lib/scylla/lms/vietnamese.lm +250 -250
- data/lib/scylla/lms/welsh.lm +248 -248
- data/lib/scylla/resources.rb +1 -9
- data/lib/scylla.rb +4 -0
- data/scylla.gemspec +2 -120
- data/source_texts/english.txt +62 -27
- data/test/classifier_test.rb +1 -3
- data/test/fixtures/lms/danish.lm +173 -173
- data/test/fixtures/lms/english.lm +220 -220
- data/test/fixtures/lms/french.lm +175 -175
- data/test/fixtures/lms/german.lm +254 -254
- data/test/fixtures/lms/hindi.lm +139 -139
- data/test/fixtures/lms/italian.lm +236 -236
- data/test/fixtures/lms/japanese.lm +88 -88
- data/test/fixtures/lms/norwegian.lm +182 -182
- data/test/fixtures/lms/spanish.lm +164 -164
- data/test/fixtures/test_languages/spanish +0 -1
- data/test/generator_test.rb +13 -0
- data/test/helper.rb +2 -0
- metadata +18 -25
- data/.document +0 -5
- data/lib/scylla/lms/13375P33K.lm +0 -400
- data/scylla-0.1.0.gem +0 -0
- data/source_texts/13375P33K.txt +0 -199
- data/test/fixtures/lms/13375p33k.lm +0 -400
- data/test/fixtures/source_texts/13375P33K.txt +0 -199
data/lib/scylla/lms/catalan.lm
CHANGED
@@ -1,400 +1,400 @@
|
|
1
|
-
_
|
2
|
-
a
|
3
|
-
e
|
4
|
-
l
|
5
|
-
s
|
6
|
-
i
|
7
|
-
n
|
8
|
-
t
|
9
|
-
r
|
10
|
-
o
|
11
|
-
c
|
12
|
-
�
|
13
|
-
|
14
|
-
|
15
|
-
s_
|
16
|
-
u
|
17
|
-
__
|
18
|
-
m
|
19
|
-
_d
|
20
|
-
|
21
|
-
|
22
|
-
p
|
1
|
+
_ 14596
|
2
|
+
a 4010
|
3
|
+
e 3751
|
4
|
+
l 2647
|
5
|
+
s 2301
|
6
|
+
i 2270
|
7
|
+
n 2179
|
8
|
+
t 2145
|
9
|
+
r 1821
|
10
|
+
o 1472
|
11
|
+
c 1470
|
12
|
+
� 1190
|
13
|
+
d 1181
|
14
|
+
a_ 1172
|
15
|
+
s_ 1146
|
16
|
+
u 1070
|
17
|
+
__ 872
|
18
|
+
m 828
|
19
|
+
_d 769
|
20
|
+
en 755
|
21
|
+
es 750
|
22
|
+
p 696
|
23
23
|
_l 682
|
24
|
-
al
|
25
|
-
de
|
24
|
+
al 674
|
25
|
+
de 638
|
26
26
|
e_ 617
|
27
|
-
_e
|
28
|
-
ta
|
29
|
-
_de
|
30
|
-
l_
|
31
|
-
g
|
32
|
-
|
33
|
-
,
|
34
|
-
|
35
|
-
|
36
|
-
_c
|
37
|
-
|
38
|
-
|
39
|
-
_a
|
40
|
-
es_
|
41
|
-
ca
|
42
|
-
er
|
43
|
-
ci
|
44
|
-
_p
|
45
|
-
.
|
46
|
-
t_
|
47
|
-
an
|
48
|
-
nt
|
49
|
-
�
|
50
|
-
à
|
51
|
-
re
|
52
|
-
st
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
de_
|
57
|
-
_de_
|
58
|
-
ra
|
59
|
-
f
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
ic
|
69
|
-
i_
|
70
|
-
ue
|
71
|
-
or
|
72
|
-
|
73
|
-
|
74
|
-
qu
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
tal 233
|
80
|
-
_ca 232
|
81
|
-
ia 227
|
27
|
+
_e 615
|
28
|
+
ta 555
|
29
|
+
_de 552
|
30
|
+
l_ 517
|
31
|
+
g 496
|
32
|
+
la 486
|
33
|
+
, 483
|
34
|
+
,_ 477
|
35
|
+
le 475
|
36
|
+
_c 459
|
37
|
+
at 458
|
38
|
+
el 448
|
39
|
+
_a 443
|
40
|
+
es_ 427
|
41
|
+
ca 424
|
42
|
+
er 400
|
43
|
+
ci 398
|
44
|
+
_p 396
|
45
|
+
. 391
|
46
|
+
t_ 390
|
47
|
+
an 380
|
48
|
+
nt 376
|
49
|
+
� 360
|
50
|
+
à 360
|
51
|
+
re 356
|
52
|
+
st 351
|
53
|
+
n_ 347
|
54
|
+
b 341
|
55
|
+
ar 339
|
56
|
+
de_ 335
|
57
|
+
_de_ 331
|
58
|
+
ra 315
|
59
|
+
f 301
|
60
|
+
on 294
|
61
|
+
el_ 293
|
62
|
+
la_ 272
|
63
|
+
_la 270
|
64
|
+
v 265
|
65
|
+
ll 264
|
66
|
+
_i 264
|
67
|
+
._ 259
|
68
|
+
ic 254
|
69
|
+
i_ 253
|
70
|
+
ue 248
|
71
|
+
or 245
|
72
|
+
q 243
|
73
|
+
_la_ 239
|
74
|
+
qu 239
|
75
|
+
_ca 236
|
76
|
+
in 234
|
77
|
+
te 233
|
78
|
+
tal 231
|
82
79
|
it 227
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
co
|
87
|
-
en_
|
88
|
-
|
89
|
-
na
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
80
|
+
ia 225
|
81
|
+
_s 223
|
82
|
+
ti 215
|
83
|
+
co 212
|
84
|
+
en_ 211
|
85
|
+
om 211
|
86
|
+
na 210
|
87
|
+
ent 209
|
88
|
+
que 208
|
89
|
+
_m 206
|
90
|
+
_el 201
|
91
|
+
se 199
|
92
|
+
ri 198
|
94
93
|
i� 197
|
95
|
-
|
96
|
-
|
97
|
-
ns 193
|
94
|
+
un 196
|
95
|
+
ata 194
|
98
96
|
x 191
|
99
|
-
|
100
|
-
atal
|
97
|
+
ns 191
|
98
|
+
atal 186
|
99
|
+
E 185
|
100
|
+
l� 185
|
101
|
+
len 184
|
101
102
|
ng 183
|
102
|
-
|
103
|
-
_i_ 181
|
103
|
+
r_ 182
|
104
104
|
_a_ 180
|
105
|
-
|
106
|
-
|
107
|
-
ó
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
gu 177
|
112
|
-
nc 176
|
105
|
+
_i_ 180
|
106
|
+
� 178
|
107
|
+
ó 178
|
108
|
+
A 177
|
109
|
+
di 177
|
110
|
+
�_ 175
|
113
111
|
ci� 175
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
_en
|
121
|
-
li
|
122
|
-
|
123
|
-
C
|
124
|
-
|
125
|
-
|
112
|
+
à_ 175
|
113
|
+
nc 175
|
114
|
+
é 173
|
115
|
+
gu 173
|
116
|
+
� 173
|
117
|
+
_co 171
|
118
|
+
_en 170
|
119
|
+
li 168
|
120
|
+
pe 167
|
121
|
+
C 162
|
122
|
+
ac 162
|
123
|
+
_t 157
|
124
|
+
_es 155
|
126
125
|
del 153
|
127
|
-
í 152
|
128
126
|
_del 152
|
129
|
-
� 152
|
130
127
|
lle 151
|
131
|
-
|
132
|
-
cat
|
133
|
-
|
134
|
-
|
135
|
-
|
128
|
+
� 150
|
129
|
+
cat 150
|
130
|
+
me 150
|
131
|
+
í 150
|
132
|
+
_el_ 149
|
136
133
|
- 148
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
est
|
144
|
-
|
145
|
-
ua
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
ro 137
|
153
|
-
al_ 137
|
154
|
-
( 135
|
134
|
+
tr 148
|
135
|
+
al_ 145
|
136
|
+
pa 145
|
137
|
+
ts 145
|
138
|
+
là 145
|
139
|
+
ls 144
|
140
|
+
est 142
|
141
|
+
_ll 142
|
142
|
+
ua 141
|
143
|
+
_f 141
|
144
|
+
ne 136
|
145
|
+
cata 136
|
146
|
+
catal 136
|
147
|
+
pr 136
|
148
|
+
les 135
|
155
149
|
_cat 134
|
156
150
|
_cata 134
|
157
|
-
|
158
|
-
h 133
|
151
|
+
_en_ 133
|
159
152
|
ió 133
|
160
|
-
|
153
|
+
ro 133
|
161
154
|
ts_ 132
|
162
|
-
|
163
|
-
|
164
|
-
oc
|
165
|
-
|
166
|
-
|
155
|
+
nt_ 131
|
156
|
+
h 130
|
157
|
+
oc 129
|
158
|
+
_. 129
|
159
|
+
_o 129
|
160
|
+
ma 128
|
161
|
+
_q 127
|
162
|
+
da 126
|
167
163
|
_del_ 126
|
168
|
-
|
164
|
+
sta 126
|
169
165
|
del_ 126
|
170
|
-
|
171
|
-
|
172
|
-
_
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
no 122
|
166
|
+
ls_ 125
|
167
|
+
_qu 125
|
168
|
+
�_ 124
|
169
|
+
ó_ 124
|
170
|
+
L 122
|
171
|
+
am 122
|
177
172
|
nci 122
|
178
|
-
ica
|
179
|
-
|
180
|
-
et 120
|
181
|
-
és 120
|
173
|
+
ica 121
|
174
|
+
_lle 120
|
182
175
|
ció 120
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
�
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
176
|
+
et 119
|
177
|
+
no 119
|
178
|
+
_._ 119
|
179
|
+
�s 117
|
180
|
+
_v 117
|
181
|
+
és 117
|
182
|
+
ni 117
|
183
|
+
ale 117
|
191
184
|
leng 115
|
185
|
+
al� 115
|
192
186
|
eng 115
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
talà
|
198
|
-
|
199
|
-
ió_ 113
|
187
|
+
�s_ 114
|
188
|
+
és_ 114
|
189
|
+
ió_ 114
|
190
|
+
tal� 113
|
191
|
+
talà 113
|
192
|
+
atal� 113
|
200
193
|
y 113
|
201
|
-
|
202
|
-
|
194
|
+
alà 113
|
195
|
+
_pe 113
|
196
|
+
_le 112
|
203
197
|
fi 111
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
198
|
+
llen 111
|
199
|
+
as 111
|
200
|
+
per 109
|
201
|
+
tat 109
|
202
|
+
_que 109
|
208
203
|
aci 109
|
209
|
-
les_ 109
|
210
|
-
là_ 109
|
211
204
|
at_ 109
|
212
|
-
|
205
|
+
là_ 108
|
213
206
|
lleng 108
|
214
|
-
|
215
|
-
|
207
|
+
ad 108
|
208
|
+
va 108
|
209
|
+
_llen 107
|
216
210
|
tu 106
|
211
|
+
po 106
|
217
212
|
o_ 106
|
218
|
-
_C
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
� 102
|
223
|
-
com 102
|
224
|
-
è 102
|
213
|
+
_C 106
|
214
|
+
les_ 105
|
215
|
+
_pr 105
|
216
|
+
na_ 104
|
225
217
|
g� 102
|
226
|
-
|
218
|
+
ant 102
|
219
|
+
m� 102
|
220
|
+
t� 101
|
221
|
+
ció_ 101
|
227
222
|
ec 101
|
228
|
-
|
229
|
-
|
230
|
-
ue_ 100
|
223
|
+
com 101
|
224
|
+
è 100
|
231
225
|
que_ 100
|
232
|
-
|
233
|
-
|
234
|
-
|
226
|
+
_que_ 100
|
227
|
+
� 100
|
228
|
+
ue_ 100
|
229
|
+
a, 100
|
230
|
+
a,_ 99
|
231
|
+
_r 99
|
235
232
|
cia 98
|
236
|
-
_al 97
|
237
|
-
_r 97
|
238
233
|
ny 97
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
a, 96
|
234
|
+
_com 97
|
235
|
+
ues 97
|
236
|
+
_se 97
|
243
237
|
ngu 95
|
244
|
-
|
245
|
-
|
238
|
+
_h 95
|
239
|
+
is 95
|
240
|
+
ia_ 95
|
241
|
+
_les 95
|
242
|
+
_al 94
|
243
|
+
ns_ 94
|
244
|
+
mi 93
|
246
245
|
ix 93
|
247
|
-
ia_ 93
|
248
246
|
sti 93
|
249
|
-
_u
|
250
|
-
ques 92
|
251
|
-
ns_ 91
|
252
|
-
os 90
|
253
|
-
alà_ 90
|
254
|
-
em 90
|
247
|
+
_u 91
|
255
248
|
gua 90
|
256
|
-
enc
|
257
|
-
_d' 90
|
258
|
-
d' 90
|
259
|
-
_per 89
|
260
|
-
r� 89
|
249
|
+
enc 89
|
261
250
|
to 89
|
262
|
-
|
263
|
-
|
264
|
-
ca_ 89
|
265
|
-
rt 89
|
251
|
+
os 89
|
252
|
+
alà_ 89
|
266
253
|
aci� 89
|
267
|
-
|
268
|
-
|
254
|
+
ació 89
|
255
|
+
em 89
|
256
|
+
ques 89
|
257
|
+
er_ 88
|
269
258
|
� 87
|
259
|
+
_per 87
|
270
260
|
mb 87
|
261
|
+
r� 87
|
271
262
|
ü 87
|
263
|
+
men 87
|
264
|
+
ct 87
|
265
|
+
rt 86
|
272
266
|
j 86
|
273
|
-
er_ 86
|
274
|
-
P 86
|
275
|
-
engu 85
|
276
|
-
ngua 85
|
277
267
|
engua 85
|
278
268
|
lengu 85
|
279
269
|
ol 85
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
270
|
+
engu 85
|
271
|
+
ngua 85
|
272
|
+
P 84
|
273
|
+
_di 84
|
284
274
|
tre 83
|
285
|
-
|
275
|
+
io 82
|
286
276
|
� 82
|
287
|
-
|
288
|
-
_� 82
|
277
|
+
I 82
|
289
278
|
gü 82
|
279
|
+
res 82
|
280
|
+
ons 82
|
281
|
+
si 82
|
282
|
+
lo 81
|
290
283
|
ei 81
|
291
|
-
|
292
|
-
|
293
|
-
ut
|
294
|
-
|
295
|
-
ica_ 79
|
296
|
-
so 78
|
284
|
+
_� 80
|
285
|
+
_n 79
|
286
|
+
ut 79
|
287
|
+
so 79
|
297
288
|
ng� 78
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
bl 77
|
289
|
+
par 78
|
290
|
+
ent_ 78
|
291
|
+
esta 77
|
302
292
|
ngü 77
|
303
|
-
|
293
|
+
ur 77
|
304
294
|
�s 76
|
305
|
-
|
306
|
-
_di 76
|
307
|
-
eg 76
|
295
|
+
bl 76
|
308
296
|
ís 76
|
309
|
-
|
310
|
-
|
297
|
+
eg 75
|
298
|
+
_pa 75
|
311
299
|
_re 75
|
312
|
-
|
300
|
+
_g 74
|
301
|
+
_les_ 74
|
302
|
+
_po 74
|
313
303
|
_un 74
|
314
|
-
_g 73
|
315
304
|
ran 73
|
316
|
-
|
305
|
+
con 73
|
306
|
+
ha 73
|
317
307
|
mp 73
|
318
308
|
mo 72
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
_,_ 70
|
323
|
-
els 69
|
309
|
+
lu 71
|
310
|
+
tic 70
|
311
|
+
ua_ 69
|
324
312
|
Ca 69
|
325
|
-
|
313
|
+
des 69
|
314
|
+
fic 68
|
326
315
|
enci 68
|
327
316
|
T 68
|
328
|
-
ie 68
|
329
317
|
els_ 68
|
330
|
-
|
331
|
-
|
332
|
-
rd 67
|
318
|
+
els 68
|
319
|
+
rs 67
|
333
320
|
amb 67
|
334
|
-
|
335
|
-
|
321
|
+
ie 67
|
322
|
+
m_ 67
|
336
323
|
ot 66
|
324
|
+
ta_ 66
|
337
325
|
_P 66
|
338
|
-
|
326
|
+
rd 65
|
327
|
+
re_ 65
|
339
328
|
ial 65
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
|
344
|
-
|
329
|
+
do 64
|
330
|
+
ter 64
|
331
|
+
_, 64
|
332
|
+
_no 64
|
333
|
+
ngua_ 64
|
345
334
|
ià 64
|
346
|
-
|
335
|
+
nta 64
|
336
|
+
gua_ 64
|
337
|
+
ment 63
|
347
338
|
eix 63
|
348
|
-
|
339
|
+
_b 63
|
340
|
+
_,_ 63
|
349
341
|
ra_ 63
|
350
|
-
gua_ 62
|
351
|
-
�n 62
|
352
|
-
ngua_ 62
|
353
342
|
vi 62
|
354
|
-
|
355
|
-
|
356
|
-
do 62
|
357
|
-
re_ 62
|
343
|
+
_ha 62
|
344
|
+
V 61
|
358
345
|
ar_ 61
|
346
|
+
�n 61
|
359
347
|
lenc 61
|
360
|
-
|
348
|
+
àn 61
|
361
349
|
if 61
|
362
|
-
V 61
|
363
350
|
alen 60
|
364
|
-
M 60
|
365
|
-
_mo 60
|
366
351
|
lenci 60
|
352
|
+
_E 60
|
367
353
|
alenc 60
|
368
354
|
nd 60
|
369
|
-
|
355
|
+
M 60
|
370
356
|
ell 59
|
357
|
+
lan 59
|
371
358
|
fe 59
|
359
|
+
als 59
|
372
360
|
� 59
|
373
|
-
|
374
|
-
|
375
|
-
|
361
|
+
_va 59
|
362
|
+
s. 59
|
363
|
+
ing 59
|
364
|
+
us 59
|
376
365
|
sp 58
|
377
|
-
�sti 58
|
378
366
|
ísti 58
|
379
|
-
|
380
|
-
|
381
|
-
ce 58
|
367
|
+
�sti 58
|
368
|
+
íst 58
|
382
369
|
_A 58
|
383
|
-
_ma 58
|
384
370
|
�st 58
|
385
|
-
|
371
|
+
ce 57
|
386
372
|
_con 57
|
387
|
-
_és 57
|
388
|
-
ita 57
|
389
|
-
_va 57
|
390
373
|
� 57
|
391
|
-
|
392
|
-
_
|
374
|
+
ita 57
|
375
|
+
_� 56
|
376
|
+
_Ca 56
|
377
|
+
_és 56
|
393
378
|
ya 56
|
394
379
|
D 56
|
395
|
-
|
396
|
-
|
397
|
-
_
|
398
|
-
ul 55
|
399
|
-
era 55
|
380
|
+
_ma 56
|
381
|
+
_- 56
|
382
|
+
_é 56
|
400
383
|
� 55
|
384
|
+
_és_ 55
|
385
|
+
sa 55
|
386
|
+
cià 55
|
387
|
+
om_ 55
|
388
|
+
lt 54
|
389
|
+
ul 54
|
390
|
+
ca_ 54
|
391
|
+
ret 54
|
392
|
+
S 54
|
393
|
+
era 54
|
394
|
+
ncia 54
|
395
|
+
il 53
|
396
|
+
ion 53
|
397
|
+
ste 53
|
398
|
+
s,_ 53
|
399
|
+
nya 53
|
400
|
+
s, 53
|