scylla 0.5.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +4 -2
- data/Gemfile.lock +16 -1
- data/lib/scylla/classifier.rb +1 -1
- data/lib/scylla/generator.rb +16 -4
- data/lib/scylla/lms/afrikaans.lm +232 -232
- data/lib/scylla/lms/arabic.lm +175 -175
- data/lib/scylla/lms/bulgarian.lm +225 -225
- data/lib/scylla/lms/catalan.lm +309 -309
- data/lib/scylla/lms/danish.lm +167 -167
- data/lib/scylla/lms/english.lm +398 -398
- data/lib/scylla/lms/finnish.lm +237 -237
- data/lib/scylla/lms/french.lm +148 -148
- data/lib/scylla/lms/german.lm +258 -258
- data/lib/scylla/lms/greek.lm +236 -236
- data/lib/scylla/lms/hebrew.lm +154 -154
- data/lib/scylla/lms/hindi.lm +139 -139
- data/lib/scylla/lms/icelandic.lm +239 -239
- data/lib/scylla/lms/indonesian.lm +244 -244
- data/lib/scylla/lms/italian.lm +248 -248
- data/lib/scylla/lms/japanese.lm +90 -90
- data/lib/scylla/lms/korean.lm +306 -306
- data/lib/scylla/lms/norwegian.lm +193 -193
- data/lib/scylla/lms/polish.lm +241 -241
- data/lib/scylla/lms/portuguese.lm +232 -232
- data/lib/scylla/lms/romanian.lm +246 -246
- data/lib/scylla/lms/slovak.lm +242 -242
- data/lib/scylla/lms/slovenian.lm +229 -229
- data/lib/scylla/lms/spanish.lm +164 -164
- data/lib/scylla/lms/swedish.lm +157 -157
- data/lib/scylla/lms/tagalog.lm +247 -247
- data/lib/scylla/lms/thai.lm +252 -252
- data/lib/scylla/lms/turkish.lm +285 -285
- data/lib/scylla/lms/vietnamese.lm +250 -250
- data/lib/scylla/lms/welsh.lm +248 -248
- data/lib/scylla/resources.rb +1 -9
- data/lib/scylla.rb +4 -0
- data/scylla.gemspec +2 -120
- data/source_texts/english.txt +62 -27
- data/test/classifier_test.rb +1 -3
- data/test/fixtures/lms/danish.lm +173 -173
- data/test/fixtures/lms/english.lm +220 -220
- data/test/fixtures/lms/french.lm +175 -175
- data/test/fixtures/lms/german.lm +254 -254
- data/test/fixtures/lms/hindi.lm +139 -139
- data/test/fixtures/lms/italian.lm +236 -236
- data/test/fixtures/lms/japanese.lm +88 -88
- data/test/fixtures/lms/norwegian.lm +182 -182
- data/test/fixtures/lms/spanish.lm +164 -164
- data/test/fixtures/test_languages/spanish +0 -1
- data/test/generator_test.rb +13 -0
- data/test/helper.rb +2 -0
- metadata +18 -25
- data/.document +0 -5
- data/lib/scylla/lms/13375P33K.lm +0 -400
- data/scylla-0.1.0.gem +0 -0
- data/source_texts/13375P33K.txt +0 -199
- data/test/fixtures/lms/13375p33k.lm +0 -400
- data/test/fixtures/source_texts/13375P33K.txt +0 -199
data/lib/scylla/lms/catalan.lm
CHANGED
@@ -1,400 +1,400 @@
|
|
1
|
-
_
|
2
|
-
a
|
3
|
-
e
|
4
|
-
l
|
5
|
-
s
|
6
|
-
i
|
7
|
-
n
|
8
|
-
t
|
9
|
-
r
|
10
|
-
o
|
11
|
-
c
|
12
|
-
�
|
13
|
-
|
14
|
-
|
15
|
-
s_
|
16
|
-
u
|
17
|
-
__
|
18
|
-
m
|
19
|
-
_d
|
20
|
-
|
21
|
-
|
22
|
-
p
|
1
|
+
_ 14596
|
2
|
+
a 4010
|
3
|
+
e 3751
|
4
|
+
l 2647
|
5
|
+
s 2301
|
6
|
+
i 2270
|
7
|
+
n 2179
|
8
|
+
t 2145
|
9
|
+
r 1821
|
10
|
+
o 1472
|
11
|
+
c 1470
|
12
|
+
� 1190
|
13
|
+
d 1181
|
14
|
+
a_ 1172
|
15
|
+
s_ 1146
|
16
|
+
u 1070
|
17
|
+
__ 872
|
18
|
+
m 828
|
19
|
+
_d 769
|
20
|
+
en 755
|
21
|
+
es 750
|
22
|
+
p 696
|
23
23
|
_l 682
|
24
|
-
al
|
25
|
-
de
|
24
|
+
al 674
|
25
|
+
de 638
|
26
26
|
e_ 617
|
27
|
-
_e
|
28
|
-
ta
|
29
|
-
_de
|
30
|
-
l_
|
31
|
-
g
|
32
|
-
|
33
|
-
,
|
34
|
-
|
35
|
-
|
36
|
-
_c
|
37
|
-
|
38
|
-
|
39
|
-
_a
|
40
|
-
es_
|
41
|
-
ca
|
42
|
-
er
|
43
|
-
ci
|
44
|
-
_p
|
45
|
-
.
|
46
|
-
t_
|
47
|
-
an
|
48
|
-
nt
|
49
|
-
�
|
50
|
-
à
|
51
|
-
re
|
52
|
-
st
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
de_
|
57
|
-
_de_
|
58
|
-
ra
|
59
|
-
f
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
ic
|
69
|
-
i_
|
70
|
-
ue
|
71
|
-
or
|
72
|
-
|
73
|
-
|
74
|
-
qu
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
tal 233
|
80
|
-
_ca 232
|
81
|
-
ia 227
|
27
|
+
_e 615
|
28
|
+
ta 555
|
29
|
+
_de 552
|
30
|
+
l_ 517
|
31
|
+
g 496
|
32
|
+
la 486
|
33
|
+
, 483
|
34
|
+
,_ 477
|
35
|
+
le 475
|
36
|
+
_c 459
|
37
|
+
at 458
|
38
|
+
el 448
|
39
|
+
_a 443
|
40
|
+
es_ 427
|
41
|
+
ca 424
|
42
|
+
er 400
|
43
|
+
ci 398
|
44
|
+
_p 396
|
45
|
+
. 391
|
46
|
+
t_ 390
|
47
|
+
an 380
|
48
|
+
nt 376
|
49
|
+
� 360
|
50
|
+
à 360
|
51
|
+
re 356
|
52
|
+
st 351
|
53
|
+
n_ 347
|
54
|
+
b 341
|
55
|
+
ar 339
|
56
|
+
de_ 335
|
57
|
+
_de_ 331
|
58
|
+
ra 315
|
59
|
+
f 301
|
60
|
+
on 294
|
61
|
+
el_ 293
|
62
|
+
la_ 272
|
63
|
+
_la 270
|
64
|
+
v 265
|
65
|
+
ll 264
|
66
|
+
_i 264
|
67
|
+
._ 259
|
68
|
+
ic 254
|
69
|
+
i_ 253
|
70
|
+
ue 248
|
71
|
+
or 245
|
72
|
+
q 243
|
73
|
+
_la_ 239
|
74
|
+
qu 239
|
75
|
+
_ca 236
|
76
|
+
in 234
|
77
|
+
te 233
|
78
|
+
tal 231
|
82
79
|
it 227
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
co
|
87
|
-
en_
|
88
|
-
|
89
|
-
na
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
80
|
+
ia 225
|
81
|
+
_s 223
|
82
|
+
ti 215
|
83
|
+
co 212
|
84
|
+
en_ 211
|
85
|
+
om 211
|
86
|
+
na 210
|
87
|
+
ent 209
|
88
|
+
que 208
|
89
|
+
_m 206
|
90
|
+
_el 201
|
91
|
+
se 199
|
92
|
+
ri 198
|
94
93
|
i� 197
|
95
|
-
|
96
|
-
|
97
|
-
ns 193
|
94
|
+
un 196
|
95
|
+
ata 194
|
98
96
|
x 191
|
99
|
-
|
100
|
-
atal
|
97
|
+
ns 191
|
98
|
+
atal 186
|
99
|
+
E 185
|
100
|
+
l� 185
|
101
|
+
len 184
|
101
102
|
ng 183
|
102
|
-
|
103
|
-
_i_ 181
|
103
|
+
r_ 182
|
104
104
|
_a_ 180
|
105
|
-
|
106
|
-
|
107
|
-
ó
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
gu 177
|
112
|
-
nc 176
|
105
|
+
_i_ 180
|
106
|
+
� 178
|
107
|
+
ó 178
|
108
|
+
A 177
|
109
|
+
di 177
|
110
|
+
�_ 175
|
113
111
|
ci� 175
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
_en
|
121
|
-
li
|
122
|
-
|
123
|
-
C
|
124
|
-
|
125
|
-
|
112
|
+
à_ 175
|
113
|
+
nc 175
|
114
|
+
é 173
|
115
|
+
gu 173
|
116
|
+
� 173
|
117
|
+
_co 171
|
118
|
+
_en 170
|
119
|
+
li 168
|
120
|
+
pe 167
|
121
|
+
C 162
|
122
|
+
ac 162
|
123
|
+
_t 157
|
124
|
+
_es 155
|
126
125
|
del 153
|
127
|
-
í 152
|
128
126
|
_del 152
|
129
|
-
� 152
|
130
127
|
lle 151
|
131
|
-
|
132
|
-
cat
|
133
|
-
|
134
|
-
|
135
|
-
|
128
|
+
� 150
|
129
|
+
cat 150
|
130
|
+
me 150
|
131
|
+
í 150
|
132
|
+
_el_ 149
|
136
133
|
- 148
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
est
|
144
|
-
|
145
|
-
ua
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
ro 137
|
153
|
-
al_ 137
|
154
|
-
( 135
|
134
|
+
tr 148
|
135
|
+
al_ 145
|
136
|
+
pa 145
|
137
|
+
ts 145
|
138
|
+
là 145
|
139
|
+
ls 144
|
140
|
+
est 142
|
141
|
+
_ll 142
|
142
|
+
ua 141
|
143
|
+
_f 141
|
144
|
+
ne 136
|
145
|
+
cata 136
|
146
|
+
catal 136
|
147
|
+
pr 136
|
148
|
+
les 135
|
155
149
|
_cat 134
|
156
150
|
_cata 134
|
157
|
-
|
158
|
-
h 133
|
151
|
+
_en_ 133
|
159
152
|
ió 133
|
160
|
-
|
153
|
+
ro 133
|
161
154
|
ts_ 132
|
162
|
-
|
163
|
-
|
164
|
-
oc
|
165
|
-
|
166
|
-
|
155
|
+
nt_ 131
|
156
|
+
h 130
|
157
|
+
oc 129
|
158
|
+
_. 129
|
159
|
+
_o 129
|
160
|
+
ma 128
|
161
|
+
_q 127
|
162
|
+
da 126
|
167
163
|
_del_ 126
|
168
|
-
|
164
|
+
sta 126
|
169
165
|
del_ 126
|
170
|
-
|
171
|
-
|
172
|
-
_
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
no 122
|
166
|
+
ls_ 125
|
167
|
+
_qu 125
|
168
|
+
�_ 124
|
169
|
+
ó_ 124
|
170
|
+
L 122
|
171
|
+
am 122
|
177
172
|
nci 122
|
178
|
-
ica
|
179
|
-
|
180
|
-
et 120
|
181
|
-
és 120
|
173
|
+
ica 121
|
174
|
+
_lle 120
|
182
175
|
ció 120
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
�
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
176
|
+
et 119
|
177
|
+
no 119
|
178
|
+
_._ 119
|
179
|
+
�s 117
|
180
|
+
_v 117
|
181
|
+
és 117
|
182
|
+
ni 117
|
183
|
+
ale 117
|
191
184
|
leng 115
|
185
|
+
al� 115
|
192
186
|
eng 115
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
talà
|
198
|
-
|
199
|
-
ió_ 113
|
187
|
+
�s_ 114
|
188
|
+
és_ 114
|
189
|
+
ió_ 114
|
190
|
+
tal� 113
|
191
|
+
talà 113
|
192
|
+
atal� 113
|
200
193
|
y 113
|
201
|
-
|
202
|
-
|
194
|
+
alà 113
|
195
|
+
_pe 113
|
196
|
+
_le 112
|
203
197
|
fi 111
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
198
|
+
llen 111
|
199
|
+
as 111
|
200
|
+
per 109
|
201
|
+
tat 109
|
202
|
+
_que 109
|
208
203
|
aci 109
|
209
|
-
les_ 109
|
210
|
-
là_ 109
|
211
204
|
at_ 109
|
212
|
-
|
205
|
+
là_ 108
|
213
206
|
lleng 108
|
214
|
-
|
215
|
-
|
207
|
+
ad 108
|
208
|
+
va 108
|
209
|
+
_llen 107
|
216
210
|
tu 106
|
211
|
+
po 106
|
217
212
|
o_ 106
|
218
|
-
_C
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
� 102
|
223
|
-
com 102
|
224
|
-
è 102
|
213
|
+
_C 106
|
214
|
+
les_ 105
|
215
|
+
_pr 105
|
216
|
+
na_ 104
|
225
217
|
g� 102
|
226
|
-
|
218
|
+
ant 102
|
219
|
+
m� 102
|
220
|
+
t� 101
|
221
|
+
ció_ 101
|
227
222
|
ec 101
|
228
|
-
|
229
|
-
|
230
|
-
ue_ 100
|
223
|
+
com 101
|
224
|
+
è 100
|
231
225
|
que_ 100
|
232
|
-
|
233
|
-
|
234
|
-
|
226
|
+
_que_ 100
|
227
|
+
� 100
|
228
|
+
ue_ 100
|
229
|
+
a, 100
|
230
|
+
a,_ 99
|
231
|
+
_r 99
|
235
232
|
cia 98
|
236
|
-
_al 97
|
237
|
-
_r 97
|
238
233
|
ny 97
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
a, 96
|
234
|
+
_com 97
|
235
|
+
ues 97
|
236
|
+
_se 97
|
243
237
|
ngu 95
|
244
|
-
|
245
|
-
|
238
|
+
_h 95
|
239
|
+
is 95
|
240
|
+
ia_ 95
|
241
|
+
_les 95
|
242
|
+
_al 94
|
243
|
+
ns_ 94
|
244
|
+
mi 93
|
246
245
|
ix 93
|
247
|
-
ia_ 93
|
248
246
|
sti 93
|
249
|
-
_u
|
250
|
-
ques 92
|
251
|
-
ns_ 91
|
252
|
-
os 90
|
253
|
-
alà_ 90
|
254
|
-
em 90
|
247
|
+
_u 91
|
255
248
|
gua 90
|
256
|
-
enc
|
257
|
-
_d' 90
|
258
|
-
d' 90
|
259
|
-
_per 89
|
260
|
-
r� 89
|
249
|
+
enc 89
|
261
250
|
to 89
|
262
|
-
|
263
|
-
|
264
|
-
ca_ 89
|
265
|
-
rt 89
|
251
|
+
os 89
|
252
|
+
alà_ 89
|
266
253
|
aci� 89
|
267
|
-
|
268
|
-
|
254
|
+
ació 89
|
255
|
+
em 89
|
256
|
+
ques 89
|
257
|
+
er_ 88
|
269
258
|
� 87
|
259
|
+
_per 87
|
270
260
|
mb 87
|
261
|
+
r� 87
|
271
262
|
ü 87
|
263
|
+
men 87
|
264
|
+
ct 87
|
265
|
+
rt 86
|
272
266
|
j 86
|
273
|
-
er_ 86
|
274
|
-
P 86
|
275
|
-
engu 85
|
276
|
-
ngua 85
|
277
267
|
engua 85
|
278
268
|
lengu 85
|
279
269
|
ol 85
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
270
|
+
engu 85
|
271
|
+
ngua 85
|
272
|
+
P 84
|
273
|
+
_di 84
|
284
274
|
tre 83
|
285
|
-
|
275
|
+
io 82
|
286
276
|
� 82
|
287
|
-
|
288
|
-
_� 82
|
277
|
+
I 82
|
289
278
|
gü 82
|
279
|
+
res 82
|
280
|
+
ons 82
|
281
|
+
si 82
|
282
|
+
lo 81
|
290
283
|
ei 81
|
291
|
-
|
292
|
-
|
293
|
-
ut
|
294
|
-
|
295
|
-
ica_ 79
|
296
|
-
so 78
|
284
|
+
_� 80
|
285
|
+
_n 79
|
286
|
+
ut 79
|
287
|
+
so 79
|
297
288
|
ng� 78
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
bl 77
|
289
|
+
par 78
|
290
|
+
ent_ 78
|
291
|
+
esta 77
|
302
292
|
ngü 77
|
303
|
-
|
293
|
+
ur 77
|
304
294
|
�s 76
|
305
|
-
|
306
|
-
_di 76
|
307
|
-
eg 76
|
295
|
+
bl 76
|
308
296
|
ís 76
|
309
|
-
|
310
|
-
|
297
|
+
eg 75
|
298
|
+
_pa 75
|
311
299
|
_re 75
|
312
|
-
|
300
|
+
_g 74
|
301
|
+
_les_ 74
|
302
|
+
_po 74
|
313
303
|
_un 74
|
314
|
-
_g 73
|
315
304
|
ran 73
|
316
|
-
|
305
|
+
con 73
|
306
|
+
ha 73
|
317
307
|
mp 73
|
318
308
|
mo 72
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
_,_ 70
|
323
|
-
els 69
|
309
|
+
lu 71
|
310
|
+
tic 70
|
311
|
+
ua_ 69
|
324
312
|
Ca 69
|
325
|
-
|
313
|
+
des 69
|
314
|
+
fic 68
|
326
315
|
enci 68
|
327
316
|
T 68
|
328
|
-
ie 68
|
329
317
|
els_ 68
|
330
|
-
|
331
|
-
|
332
|
-
rd 67
|
318
|
+
els 68
|
319
|
+
rs 67
|
333
320
|
amb 67
|
334
|
-
|
335
|
-
|
321
|
+
ie 67
|
322
|
+
m_ 67
|
336
323
|
ot 66
|
324
|
+
ta_ 66
|
337
325
|
_P 66
|
338
|
-
|
326
|
+
rd 65
|
327
|
+
re_ 65
|
339
328
|
ial 65
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
|
344
|
-
|
329
|
+
do 64
|
330
|
+
ter 64
|
331
|
+
_, 64
|
332
|
+
_no 64
|
333
|
+
ngua_ 64
|
345
334
|
ià 64
|
346
|
-
|
335
|
+
nta 64
|
336
|
+
gua_ 64
|
337
|
+
ment 63
|
347
338
|
eix 63
|
348
|
-
|
339
|
+
_b 63
|
340
|
+
_,_ 63
|
349
341
|
ra_ 63
|
350
|
-
gua_ 62
|
351
|
-
�n 62
|
352
|
-
ngua_ 62
|
353
342
|
vi 62
|
354
|
-
|
355
|
-
|
356
|
-
do 62
|
357
|
-
re_ 62
|
343
|
+
_ha 62
|
344
|
+
V 61
|
358
345
|
ar_ 61
|
346
|
+
�n 61
|
359
347
|
lenc 61
|
360
|
-
|
348
|
+
àn 61
|
361
349
|
if 61
|
362
|
-
V 61
|
363
350
|
alen 60
|
364
|
-
M 60
|
365
|
-
_mo 60
|
366
351
|
lenci 60
|
352
|
+
_E 60
|
367
353
|
alenc 60
|
368
354
|
nd 60
|
369
|
-
|
355
|
+
M 60
|
370
356
|
ell 59
|
357
|
+
lan 59
|
371
358
|
fe 59
|
359
|
+
als 59
|
372
360
|
� 59
|
373
|
-
|
374
|
-
|
375
|
-
|
361
|
+
_va 59
|
362
|
+
s. 59
|
363
|
+
ing 59
|
364
|
+
us 59
|
376
365
|
sp 58
|
377
|
-
�sti 58
|
378
366
|
ísti 58
|
379
|
-
|
380
|
-
|
381
|
-
ce 58
|
367
|
+
�sti 58
|
368
|
+
íst 58
|
382
369
|
_A 58
|
383
|
-
_ma 58
|
384
370
|
�st 58
|
385
|
-
|
371
|
+
ce 57
|
386
372
|
_con 57
|
387
|
-
_és 57
|
388
|
-
ita 57
|
389
|
-
_va 57
|
390
373
|
� 57
|
391
|
-
|
392
|
-
_
|
374
|
+
ita 57
|
375
|
+
_� 56
|
376
|
+
_Ca 56
|
377
|
+
_és 56
|
393
378
|
ya 56
|
394
379
|
D 56
|
395
|
-
|
396
|
-
|
397
|
-
_
|
398
|
-
ul 55
|
399
|
-
era 55
|
380
|
+
_ma 56
|
381
|
+
_- 56
|
382
|
+
_é 56
|
400
383
|
� 55
|
384
|
+
_és_ 55
|
385
|
+
sa 55
|
386
|
+
cià 55
|
387
|
+
om_ 55
|
388
|
+
lt 54
|
389
|
+
ul 54
|
390
|
+
ca_ 54
|
391
|
+
ret 54
|
392
|
+
S 54
|
393
|
+
era 54
|
394
|
+
ncia 54
|
395
|
+
il 53
|
396
|
+
ion 53
|
397
|
+
ste 53
|
398
|
+
s,_ 53
|
399
|
+
nya 53
|
400
|
+
s, 53
|