scylla 0.5.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +4 -2
- data/Gemfile.lock +16 -1
- data/lib/scylla/classifier.rb +1 -1
- data/lib/scylla/generator.rb +16 -4
- data/lib/scylla/lms/afrikaans.lm +232 -232
- data/lib/scylla/lms/arabic.lm +175 -175
- data/lib/scylla/lms/bulgarian.lm +225 -225
- data/lib/scylla/lms/catalan.lm +309 -309
- data/lib/scylla/lms/danish.lm +167 -167
- data/lib/scylla/lms/english.lm +398 -398
- data/lib/scylla/lms/finnish.lm +237 -237
- data/lib/scylla/lms/french.lm +148 -148
- data/lib/scylla/lms/german.lm +258 -258
- data/lib/scylla/lms/greek.lm +236 -236
- data/lib/scylla/lms/hebrew.lm +154 -154
- data/lib/scylla/lms/hindi.lm +139 -139
- data/lib/scylla/lms/icelandic.lm +239 -239
- data/lib/scylla/lms/indonesian.lm +244 -244
- data/lib/scylla/lms/italian.lm +248 -248
- data/lib/scylla/lms/japanese.lm +90 -90
- data/lib/scylla/lms/korean.lm +306 -306
- data/lib/scylla/lms/norwegian.lm +193 -193
- data/lib/scylla/lms/polish.lm +241 -241
- data/lib/scylla/lms/portuguese.lm +232 -232
- data/lib/scylla/lms/romanian.lm +246 -246
- data/lib/scylla/lms/slovak.lm +242 -242
- data/lib/scylla/lms/slovenian.lm +229 -229
- data/lib/scylla/lms/spanish.lm +164 -164
- data/lib/scylla/lms/swedish.lm +157 -157
- data/lib/scylla/lms/tagalog.lm +247 -247
- data/lib/scylla/lms/thai.lm +252 -252
- data/lib/scylla/lms/turkish.lm +285 -285
- data/lib/scylla/lms/vietnamese.lm +250 -250
- data/lib/scylla/lms/welsh.lm +248 -248
- data/lib/scylla/resources.rb +1 -9
- data/lib/scylla.rb +4 -0
- data/scylla.gemspec +2 -120
- data/source_texts/english.txt +62 -27
- data/test/classifier_test.rb +1 -3
- data/test/fixtures/lms/danish.lm +173 -173
- data/test/fixtures/lms/english.lm +220 -220
- data/test/fixtures/lms/french.lm +175 -175
- data/test/fixtures/lms/german.lm +254 -254
- data/test/fixtures/lms/hindi.lm +139 -139
- data/test/fixtures/lms/italian.lm +236 -236
- data/test/fixtures/lms/japanese.lm +88 -88
- data/test/fixtures/lms/norwegian.lm +182 -182
- data/test/fixtures/lms/spanish.lm +164 -164
- data/test/fixtures/test_languages/spanish +0 -1
- data/test/generator_test.rb +13 -0
- data/test/helper.rb +2 -0
- metadata +18 -25
- data/.document +0 -5
- data/lib/scylla/lms/13375P33K.lm +0 -400
- data/scylla-0.1.0.gem +0 -0
- data/source_texts/13375P33K.txt +0 -199
- data/test/fixtures/lms/13375p33k.lm +0 -400
- data/test/fixtures/source_texts/13375P33K.txt +0 -199
data/lib/scylla/lms/romanian.lm
CHANGED
@@ -11,390 +11,390 @@ s 194
|
|
11
11
|
o 188
|
12
12
|
l 174
|
13
13
|
e_ 170
|
14
|
-
a_
|
14
|
+
a_ 158
|
15
15
|
d 115
|
16
16
|
m 114
|
17
17
|
p 113
|
18
|
-
i_
|
18
|
+
i_ 112
|
19
19
|
in 101
|
20
|
-
_c
|
20
|
+
_c 93
|
21
21
|
ar 87
|
22
|
-
_s
|
22
|
+
_s 76
|
23
23
|
re 71
|
24
24
|
_d 69
|
25
25
|
, 66
|
26
|
-
de 64
|
27
26
|
_a 64
|
27
|
+
de 64
|
28
28
|
ra 63
|
29
29
|
at 62
|
30
30
|
f 57
|
31
|
-
_p
|
31
|
+
_p 56
|
32
32
|
,_ 54
|
33
33
|
st 53
|
34
34
|
ta 53
|
35
35
|
ti 51
|
36
|
+
n_ 50
|
36
37
|
ca 50
|
37
38
|
te 50
|
38
|
-
n_ 50
|
39
39
|
si 47
|
40
40
|
u_ 46
|
41
|
-
_i 45
|
42
41
|
un 45
|
42
|
+
_i 45
|
43
43
|
_de 43
|
44
44
|
g 42
|
45
|
-
ri 41
|
46
45
|
ce 41
|
46
|
+
ri 41
|
47
|
+
_m 41
|
47
48
|
nt 41
|
48
49
|
ul 40
|
49
|
-
_m 39
|
50
|
-
or 39
|
51
50
|
b 39
|
52
|
-
|
51
|
+
or 39
|
52
|
+
es 39
|
53
53
|
de_ 38
|
54
|
-
|
54
|
+
_o 38
|
55
55
|
_in 38
|
56
56
|
la 37
|
57
|
+
ma 37
|
57
58
|
are 37
|
58
|
-
_ca
|
59
|
-
|
59
|
+
_ca 37
|
60
|
+
t_ 35
|
60
61
|
tr 34
|
61
62
|
. 34
|
62
|
-
t_ 33
|
63
|
-
ea 33
|
64
63
|
ci 33
|
65
64
|
_f 33
|
66
|
-
|
65
|
+
ea 33
|
67
66
|
el 32
|
68
|
-
|
69
|
-
oa 30
|
67
|
+
_de_ 32
|
70
68
|
ni 30
|
71
|
-
|
72
|
-
|
73
|
-
se 29
|
69
|
+
oa 30
|
70
|
+
ia 30
|
74
71
|
te_ 29
|
75
72
|
v 29
|
73
|
+
se 29
|
74
|
+
ta_ 29
|
76
75
|
cu 29
|
76
|
+
in_ 29
|
77
77
|
_si 28
|
78
|
+
_u 28
|
79
|
+
l_ 27
|
78
80
|
as 27
|
79
81
|
z 27
|
80
82
|
re_ 27
|
81
|
-
|
83
|
+
ac 26
|
82
84
|
si_ 26
|
83
|
-
l_ 26
|
84
|
-
le 26
|
85
85
|
ic 26
|
86
|
-
an 26
|
87
|
-
ac 26
|
88
|
-
ne 26
|
89
86
|
_l 26
|
87
|
+
le 26
|
88
|
+
ne 26
|
89
|
+
an 26
|
90
90
|
_t 25
|
91
91
|
pe 25
|
92
92
|
li 25
|
93
|
-
ei
|
94
|
-
|
93
|
+
ei 25
|
94
|
+
_ma 25
|
95
|
+
_un 24
|
95
96
|
er 24
|
96
|
-
|
97
|
-
_un 23
|
98
|
-
lu 23
|
97
|
+
int 24
|
99
98
|
est 23
|
99
|
+
lu 23
|
100
|
+
sa 23
|
101
|
+
are_ 23
|
100
102
|
ie 22
|
101
|
-
la_ 22
|
102
103
|
ste 22
|
104
|
+
la_ 22
|
103
105
|
pa 22
|
104
|
-
sa 21
|
105
|
-
_ma 21
|
106
106
|
car 21
|
107
|
-
|
108
|
-
il 20
|
107
|
+
it 20
|
109
108
|
na 20
|
110
109
|
_e 20
|
110
|
+
il 20
|
111
111
|
al 20
|
112
|
-
|
113
|
-
_pe 19
|
114
|
-
da 19
|
115
|
-
en 19
|
116
|
-
ai 19
|
117
|
-
_n 19
|
112
|
+
e, 20
|
118
113
|
to 19
|
119
|
-
ii 19
|
120
114
|
_si_ 19
|
115
|
+
ai 19
|
116
|
+
da 19
|
117
|
+
ii 19
|
118
|
+
_pe 19
|
119
|
+
_n 19
|
121
120
|
care 19
|
121
|
+
en 19
|
122
122
|
_cu 18
|
123
|
-
|
123
|
+
este 18
|
124
|
+
ui 18
|
124
125
|
ara 18
|
126
|
+
ata 18
|
127
|
+
ap 18
|
125
128
|
o_ 18
|
126
129
|
cu_ 18
|
127
|
-
ata 18
|
128
|
-
ui 18
|
129
130
|
au 18
|
130
|
-
este 18
|
131
|
-
lo 17
|
132
131
|
am 17
|
132
|
+
_sa 17
|
133
133
|
_in_ 17
|
134
134
|
me 17
|
135
|
+
_car 17
|
136
|
+
lo 17
|
135
137
|
D 17
|
136
|
-
|
137
|
-
care_ 16
|
138
|
-
_care 16
|
139
|
-
fa 16
|
138
|
+
r_ 17
|
140
139
|
ra_ 16
|
141
140
|
om 16
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
._ 16
|
141
|
+
ru 16
|
142
|
+
sa_ 16
|
143
|
+
fa 16
|
146
144
|
pr 16
|
147
|
-
|
145
|
+
co 16
|
146
|
+
._ 16
|
147
|
+
_care 16
|
148
|
+
care_ 16
|
149
|
+
un_ 16
|
148
150
|
pe_ 15
|
149
|
-
h 15
|
150
|
-
ste_ 15
|
151
|
-
ru 15
|
152
151
|
_cu_ 15
|
153
|
-
|
152
|
+
a, 15
|
153
|
+
ste_ 15
|
154
154
|
e,_ 15
|
155
155
|
ul_ 15
|
156
|
-
|
157
|
-
|
156
|
+
h 15
|
157
|
+
este_ 15
|
158
|
+
_o_ 15
|
158
159
|
rt 15
|
159
|
-
|
160
|
-
nu 14
|
161
|
-
ca_ 14
|
162
|
-
le_ 14
|
163
|
-
a, 14
|
164
|
-
tra 14
|
165
|
-
is 14
|
160
|
+
se_ 15
|
166
161
|
_b 14
|
167
|
-
_v 14
|
168
|
-
ata_ 14
|
169
|
-
ace 14
|
170
162
|
_ce 14
|
163
|
+
tra 14
|
171
164
|
ur 14
|
172
|
-
|
165
|
+
ca_ 14
|
166
|
+
ei_ 14
|
167
|
+
le_ 14
|
173
168
|
_se 14
|
174
|
-
|
175
|
-
|
176
|
-
|
169
|
+
nu 14
|
170
|
+
is 14
|
171
|
+
tu 14
|
172
|
+
ace 14
|
173
|
+
ata_ 14
|
174
|
+
_v 14
|
175
|
+
io 13
|
177
176
|
mi 13
|
177
|
+
a,_ 13
|
178
|
+
ot 13
|
179
|
+
sc 13
|
178
180
|
lui 13
|
181
|
+
zi 13
|
179
182
|
mu 13
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
+
__ 13
|
184
|
+
au_ 13
|
185
|
+
_sa_ 12
|
186
|
+
fe 12
|
187
|
+
oar 12
|
183
188
|
nd 12
|
184
|
-
bi 12
|
185
|
-
po 12
|
186
|
-
ei_ 12
|
187
|
-
ve 12
|
188
|
-
na_ 12
|
189
|
-
ut 12
|
190
|
-
ii_ 12
|
191
|
-
ec 12
|
192
|
-
_ac 12
|
193
189
|
ele 12
|
194
|
-
oar 12
|
195
|
-
os 12
|
196
|
-
pi 12
|
197
|
-
fe 12
|
198
190
|
_fa 12
|
199
|
-
|
191
|
+
ec 12
|
192
|
+
os 12
|
193
|
+
_un_ 12
|
194
|
+
ut 12
|
195
|
+
ve 12
|
196
|
+
po 12
|
200
197
|
ui_ 12
|
198
|
+
bi 12
|
199
|
+
pi 12
|
200
|
+
na_ 12
|
201
|
+
_ac 12
|
202
|
+
ii_ 12
|
203
|
+
_r 11
|
201
204
|
ir 11
|
202
205
|
_pe_ 11
|
203
|
-
|
206
|
+
ci_ 11
|
204
207
|
tat 11
|
205
|
-
_un_ 11
|
206
|
-
di 11
|
207
|
-
E 11
|
208
208
|
_se_ 11
|
209
|
-
_r 11
|
210
|
-
ent 11
|
211
|
-
fo 11
|
212
209
|
et 11
|
210
|
+
E 11
|
211
|
+
um 11
|
213
212
|
ea_ 11
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
ici 10
|
219
|
-
_sa_ 10
|
220
|
-
ai_ 10
|
221
|
-
" 10
|
222
|
-
I 10
|
213
|
+
ent 11
|
214
|
+
fo 11
|
215
|
+
di 11
|
216
|
+
ntr 10
|
223
217
|
pu 10
|
224
|
-
|
225
|
-
ulu 10
|
218
|
+
_co 10
|
226
219
|
cel 10
|
227
|
-
_pa 10
|
228
220
|
_est 10
|
229
|
-
|
221
|
+
ne_ 10
|
222
|
+
at_ 10
|
223
|
+
_este 10
|
224
|
+
ai_ 10
|
225
|
+
op 10
|
226
|
+
_pa 10
|
230
227
|
ol 10
|
231
228
|
im 10
|
232
|
-
|
233
|
-
_este 10
|
234
|
-
_co 10
|
229
|
+
ulu 10
|
235
230
|
ga 10
|
236
|
-
|
231
|
+
i, 10
|
232
|
+
_es 10
|
233
|
+
ici 10
|
234
|
+
_la 10
|
235
|
+
hi 9
|
236
|
+
_ar 9
|
237
|
+
ele_ 9
|
238
|
+
iu 9
|
239
|
+
su 9
|
240
|
+
sta 9
|
237
241
|
mo 9
|
238
|
-
|
239
|
-
or_ 9
|
240
|
-
_la_ 9
|
241
|
-
ului 9
|
242
|
-
ori 9
|
243
|
-
ati 9
|
242
|
+
id 9
|
244
243
|
ns 9
|
245
244
|
on 9
|
246
|
-
su 9
|
247
245
|
_ca_ 9
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
246
|
+
ati 9
|
247
|
+
ori 9
|
248
|
+
or_ 9
|
249
|
+
ului 9
|
250
|
+
_la_ 9
|
251
|
+
_nu 9
|
253
252
|
_di 9
|
254
|
-
|
253
|
+
tru 9
|
254
|
+
A 9
|
255
255
|
lui_ 9
|
256
|
-
|
256
|
+
_D 9
|
257
|
+
i. 8
|
257
258
|
i,_ 8
|
258
|
-
_int 8
|
259
259
|
ad 8
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
_ace 8
|
265
|
-
_g 8
|
266
|
-
tru 8
|
267
|
-
ine 8
|
268
|
-
oc 8
|
269
|
-
ost 8
|
270
|
-
dat 8
|
271
|
-
mar 8
|
260
|
+
ce_ 8
|
261
|
+
mai 8
|
262
|
+
tie 8
|
263
|
+
nc 8
|
272
264
|
ate 8
|
273
|
-
em 8
|
274
|
-
ini 8
|
275
265
|
ng 8
|
276
|
-
gi 8
|
277
|
-
nc 8
|
278
|
-
mai 8
|
279
266
|
_a_ 8
|
280
|
-
|
267
|
+
str 8
|
268
|
+
mai_ 8
|
269
|
+
lt 8
|
270
|
+
lor 8
|
271
|
+
nic 8
|
272
|
+
oc 8
|
273
|
+
em 8
|
274
|
+
ia_ 8
|
281
275
|
mul 8
|
282
|
-
|
276
|
+
ost 8
|
277
|
+
_fo 8
|
283
278
|
ti_ 8
|
284
|
-
|
285
|
-
|
286
|
-
|
279
|
+
I 8
|
280
|
+
une 8
|
281
|
+
_g 8
|
282
|
+
gi 8
|
283
|
+
dat 8
|
284
|
+
par 8
|
285
|
+
_lu 8
|
287
286
|
ara_ 8
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
287
|
+
ine 8
|
288
|
+
mar 8
|
289
|
+
ini 8
|
290
|
+
_int 8
|
291
|
+
_ace 8
|
292
|
+
_pr 8
|
293
|
+
min 7
|
294
|
+
din 7
|
295
|
+
_E 7
|
296
|
+
tul 7
|
297
|
+
lin 7
|
298
|
+
- 7
|
299
|
+
esc 7
|
300
|
+
ina 7
|
301
|
+
j 7
|
302
|
+
_mai 7
|
303
|
+
far 7
|
297
304
|
art 7
|
298
|
-
|
299
|
-
|
305
|
+
cr 7
|
306
|
+
_da 7
|
300
307
|
pl 7
|
308
|
+
_mai_ 7
|
309
|
+
tin 7
|
310
|
+
una 7
|
311
|
+
_din 7
|
312
|
+
_ci 7
|
301
313
|
m_ 7
|
302
|
-
|
314
|
+
pri 7
|
303
315
|
ator 7
|
316
|
+
az 7
|
317
|
+
rea 7
|
304
318
|
res 7
|
305
|
-
|
306
|
-
mp 7
|
319
|
+
ani 7
|
307
320
|
sin 7
|
308
|
-
|
309
|
-
j 7
|
310
|
-
cr 7
|
311
|
-
min 7
|
312
|
-
tul 7
|
321
|
+
ato 7
|
313
322
|
fi 7
|
314
|
-
|
315
|
-
- 7
|
316
|
-
ind 7
|
317
|
-
ina 7
|
318
|
-
' 7
|
319
|
-
_E 7
|
320
|
-
rea 7
|
321
|
-
_pr 7
|
322
|
-
una 7
|
323
|
+
ran 7
|
323
324
|
_or 7
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
lin 7
|
328
|
-
ia_ 7
|
325
|
+
mp 7
|
326
|
+
ilo 7
|
327
|
+
ind 7
|
329
328
|
nta 7
|
330
329
|
tor 7
|
331
|
-
|
332
|
-
|
333
|
-
rti 6
|
334
|
-
_ti 6
|
335
|
-
_z 6
|
336
|
-
_to 6
|
337
|
-
iv 6
|
338
|
-
_st 6
|
330
|
+
do 7
|
331
|
+
ro 6
|
339
332
|
ez 6
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
_tr 6
|
333
|
+
C 6
|
334
|
+
lor_ 6
|
335
|
+
nde 6
|
344
336
|
it_ 6
|
345
337
|
pin 6
|
346
|
-
nde 6
|
347
|
-
_do 6
|
348
|
-
s_ 6
|
349
|
-
no 6
|
350
|
-
nti 6
|
351
|
-
iat 6
|
352
|
-
_op 6
|
353
|
-
tre 6
|
354
|
-
? 6
|
355
|
-
ge 6
|
356
338
|
ag 6
|
357
|
-
|
358
|
-
|
339
|
+
ie_ 6
|
340
|
+
nei 6
|
341
|
+
eri 6
|
342
|
+
_pu 6
|
343
|
+
c_ 6
|
344
|
+
ba 6
|
345
|
+
iat 6
|
346
|
+
tot 6
|
347
|
+
s_ 6
|
348
|
+
da_ 6
|
349
|
+
omu 6
|
350
|
+
gr 6
|
359
351
|
ric 6
|
360
|
-
|
352
|
+
rti 6
|
353
|
+
ra,_ 6
|
361
354
|
so 6
|
362
|
-
|
363
|
-
|
364
|
-
|
355
|
+
no 6
|
356
|
+
bo 6
|
357
|
+
rul 6
|
365
358
|
ch 6
|
366
|
-
|
367
|
-
nu_ 6
|
368
|
-
lor_ 6
|
369
|
-
og 6
|
370
|
-
c_ 6
|
371
|
-
_po 6
|
372
|
-
omu 6
|
373
|
-
up 6
|
374
|
-
ari 6
|
359
|
+
_op 6
|
375
360
|
pt 6
|
376
|
-
|
377
|
-
|
378
|
-
_pu 6
|
379
|
-
nei 6
|
380
|
-
S 6
|
381
|
-
ra,_ 6
|
382
|
-
ba 6
|
383
|
-
ro 6
|
384
|
-
gr 6
|
385
|
-
ilor 6
|
386
|
-
ie_ 6
|
361
|
+
ari 6
|
362
|
+
_ti 6
|
387
363
|
intr 6
|
364
|
+
_mo 6
|
365
|
+
oas 6
|
366
|
+
iv 6
|
367
|
+
e. 6
|
388
368
|
imp 6
|
369
|
+
oare 6
|
370
|
+
st_ 6
|
371
|
+
_z 6
|
372
|
+
_do 6
|
373
|
+
ate_ 6
|
389
374
|
_cel 6
|
390
|
-
|
375
|
+
_tr 6
|
376
|
+
S 6
|
377
|
+
tre 6
|
378
|
+
_po 6
|
379
|
+
vi 6
|
380
|
+
_st 6
|
381
|
+
og 6
|
382
|
+
ast 6
|
391
383
|
rm 6
|
384
|
+
man 6
|
385
|
+
_mar 6
|
386
|
+
ilor 6
|
392
387
|
ra, 6
|
393
|
-
|
388
|
+
nu_ 6
|
389
|
+
ica 6
|
390
|
+
ge 6
|
391
|
+
_to 6
|
392
|
+
up 6
|
393
|
+
nti 6
|
394
394
|
P 5
|
395
|
-
|
396
|
-
rin 5
|
395
|
+
ing 5
|
397
396
|
ora 5
|
398
|
-
|
399
|
-
|
400
|
-
|
397
|
+
rat 5
|
398
|
+
_al 5
|
399
|
+
ril 5
|
400
|
+
ult 5
|