scylla 0.5.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +4 -2
- data/Gemfile.lock +16 -1
- data/lib/scylla/classifier.rb +1 -1
- data/lib/scylla/generator.rb +16 -4
- data/lib/scylla/lms/afrikaans.lm +232 -232
- data/lib/scylla/lms/arabic.lm +175 -175
- data/lib/scylla/lms/bulgarian.lm +225 -225
- data/lib/scylla/lms/catalan.lm +309 -309
- data/lib/scylla/lms/danish.lm +167 -167
- data/lib/scylla/lms/english.lm +398 -398
- data/lib/scylla/lms/finnish.lm +237 -237
- data/lib/scylla/lms/french.lm +148 -148
- data/lib/scylla/lms/german.lm +258 -258
- data/lib/scylla/lms/greek.lm +236 -236
- data/lib/scylla/lms/hebrew.lm +154 -154
- data/lib/scylla/lms/hindi.lm +139 -139
- data/lib/scylla/lms/icelandic.lm +239 -239
- data/lib/scylla/lms/indonesian.lm +244 -244
- data/lib/scylla/lms/italian.lm +248 -248
- data/lib/scylla/lms/japanese.lm +90 -90
- data/lib/scylla/lms/korean.lm +306 -306
- data/lib/scylla/lms/norwegian.lm +193 -193
- data/lib/scylla/lms/polish.lm +241 -241
- data/lib/scylla/lms/portuguese.lm +232 -232
- data/lib/scylla/lms/romanian.lm +246 -246
- data/lib/scylla/lms/slovak.lm +242 -242
- data/lib/scylla/lms/slovenian.lm +229 -229
- data/lib/scylla/lms/spanish.lm +164 -164
- data/lib/scylla/lms/swedish.lm +157 -157
- data/lib/scylla/lms/tagalog.lm +247 -247
- data/lib/scylla/lms/thai.lm +252 -252
- data/lib/scylla/lms/turkish.lm +285 -285
- data/lib/scylla/lms/vietnamese.lm +250 -250
- data/lib/scylla/lms/welsh.lm +248 -248
- data/lib/scylla/resources.rb +1 -9
- data/lib/scylla.rb +4 -0
- data/scylla.gemspec +2 -120
- data/source_texts/english.txt +62 -27
- data/test/classifier_test.rb +1 -3
- data/test/fixtures/lms/danish.lm +173 -173
- data/test/fixtures/lms/english.lm +220 -220
- data/test/fixtures/lms/french.lm +175 -175
- data/test/fixtures/lms/german.lm +254 -254
- data/test/fixtures/lms/hindi.lm +139 -139
- data/test/fixtures/lms/italian.lm +236 -236
- data/test/fixtures/lms/japanese.lm +88 -88
- data/test/fixtures/lms/norwegian.lm +182 -182
- data/test/fixtures/lms/spanish.lm +164 -164
- data/test/fixtures/test_languages/spanish +0 -1
- data/test/generator_test.rb +13 -0
- data/test/helper.rb +2 -0
- metadata +18 -25
- data/.document +0 -5
- data/lib/scylla/lms/13375P33K.lm +0 -400
- data/scylla-0.1.0.gem +0 -0
- data/source_texts/13375P33K.txt +0 -199
- data/test/fixtures/lms/13375p33k.lm +0 -400
- data/test/fixtures/source_texts/13375P33K.txt +0 -199
@@ -10,32 +10,32 @@ t 205
|
|
10
10
|
an 185
|
11
11
|
g 152
|
12
12
|
s 152
|
13
|
-
d 149
|
14
13
|
m 149
|
15
|
-
|
14
|
+
d 149
|
16
15
|
ng 108
|
16
|
+
l 108
|
17
17
|
p 99
|
18
|
-
a_
|
18
|
+
a_ 94
|
19
19
|
o 91
|
20
20
|
er 85
|
21
21
|
b 85
|
22
|
-
n_
|
22
|
+
n_ 82
|
23
23
|
, 77
|
24
24
|
ka 76
|
25
|
+
an_ 75
|
25
26
|
_d 74
|
26
|
-
an_ 74
|
27
27
|
h 73
|
28
|
-
i_
|
29
|
-
|
30
|
-
ar 65
|
28
|
+
i_ 73
|
29
|
+
,_ 67
|
31
30
|
_m 65
|
32
|
-
|
31
|
+
ar 65
|
32
|
+
en 65
|
33
33
|
ta 59
|
34
34
|
di 59
|
35
35
|
me 58
|
36
|
+
_k 57
|
36
37
|
ang 56
|
37
38
|
ra 55
|
38
|
-
_k 55
|
39
39
|
at 54
|
40
40
|
ak 54
|
41
41
|
y 53
|
@@ -45,356 +45,356 @@ ga 48
|
|
45
45
|
in 46
|
46
46
|
ya 43
|
47
47
|
. 42
|
48
|
-
tu 41
|
49
48
|
_di 41
|
49
|
+
tu 41
|
50
50
|
un 40
|
51
51
|
_p 39
|
52
|
-
j 38
|
53
52
|
la 38
|
54
|
-
|
53
|
+
j 38
|
55
54
|
ma 37
|
55
|
+
_s 37
|
56
56
|
k_ 36
|
57
57
|
na 36
|
58
58
|
ah 35
|
59
59
|
ri 34
|
60
|
-
_t 33
|
61
|
-
se 33
|
62
|
-
ke 33
|
63
60
|
ng_ 33
|
64
61
|
g_ 33
|
62
|
+
_t 33
|
63
|
+
se 33
|
65
64
|
be 33
|
66
|
-
|
67
|
-
as 32
|
68
|
-
pe 32
|
65
|
+
ke 33
|
69
66
|
_b 32
|
70
67
|
__ 32
|
71
|
-
|
68
|
+
al 32
|
69
|
+
pe 32
|
70
|
+
as 32
|
72
71
|
men 31
|
73
|
-
|
72
|
+
ia 31
|
73
|
+
h_ 30
|
74
74
|
_ke 29
|
75
|
-
|
76
|
-
P 29
|
75
|
+
_men 29
|
77
76
|
si 29
|
77
|
+
P 29
|
78
|
+
ny 28
|
78
79
|
ti 28
|
79
|
-
it 28
|
80
80
|
pa 28
|
81
|
-
|
81
|
+
it 28
|
82
82
|
em 27
|
83
83
|
_a 27
|
84
|
-
sa 26
|
85
84
|
am 26
|
85
|
+
sa 26
|
86
86
|
kan 26
|
87
87
|
u_ 25
|
88
|
-
eng 24
|
89
|
-
te 24
|
90
|
-
_pe 24
|
91
88
|
c 24
|
92
89
|
ang_ 24
|
90
|
+
eng 24
|
91
|
+
_pe 24
|
92
|
+
te 24
|
93
|
+
S 23
|
94
|
+
s_ 23
|
93
95
|
nga 23
|
96
|
+
ba 23
|
94
97
|
ja 23
|
95
98
|
_se 23
|
96
|
-
|
97
|
-
ba 23
|
98
|
-
S 23
|
99
|
-
ber 22
|
99
|
+
r_ 22
|
100
100
|
li 22
|
101
101
|
ni 22
|
102
102
|
el 22
|
103
|
+
ber 22
|
103
104
|
di_ 22
|
104
105
|
nt 22
|
105
|
-
r_ 22
|
106
|
-
_da 21
|
107
106
|
t_ 21
|
107
|
+
_da 21
|
108
|
+
ad 20
|
109
|
+
ak_ 20
|
110
|
+
_ka 20
|
108
111
|
ik 20
|
109
|
-
|
112
|
+
_P 20
|
110
113
|
nya 20
|
111
|
-
|
114
|
+
bu 20
|
112
115
|
ata 20
|
113
|
-
ak_ 20
|
114
|
-
ara 19
|
115
116
|
_i 19
|
116
|
-
|
117
|
+
gan 19
|
117
118
|
ran 19
|
119
|
+
._ 19
|
120
|
+
is 19
|
118
121
|
us 19
|
119
|
-
_P 19
|
120
|
-
gan 19
|
121
122
|
ap 19
|
122
|
-
|
123
|
+
ara 19
|
123
124
|
ru 18
|
125
|
+
dan 18
|
124
126
|
_be 18
|
125
127
|
era 18
|
126
|
-
_ka 18
|
127
|
-
dan 18
|
128
128
|
ah_ 17
|
129
|
+
nd 17
|
130
|
+
a, 17
|
129
131
|
ari 17
|
130
|
-
- 17
|
131
|
-
to 17
|
132
132
|
es 17
|
133
|
-
nd 17
|
134
133
|
ur 17
|
135
|
-
a, 17
|
136
134
|
uk 17
|
137
|
-
|
135
|
+
- 17
|
136
|
+
to 17
|
138
137
|
ala 16
|
139
|
-
ha 16
|
140
|
-
yang 16
|
141
|
-
ter 16
|
142
|
-
ol 16
|
143
138
|
su 16
|
139
|
+
yan 16
|
140
|
+
ter 16
|
144
141
|
I 16
|
145
|
-
ngan 16
|
146
142
|
ung 16
|
147
|
-
|
148
|
-
|
143
|
+
ol 16
|
144
|
+
ha 16
|
145
|
+
yang 16
|
146
|
+
ngan 16
|
149
147
|
M 15
|
150
|
-
kan_ 15
|
151
|
-
_ya 15
|
152
|
-
_di_ 15
|
153
148
|
_ber 15
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
149
|
+
_di_ 15
|
150
|
+
_ya 15
|
151
|
+
kan_ 15
|
152
|
+
J 15
|
153
|
+
_y 15
|
158
154
|
lu 14
|
159
|
-
|
160
|
-
itu 14
|
161
|
-
" 14
|
162
|
-
ok 14
|
155
|
+
du 14
|
163
156
|
ek 14
|
164
157
|
_yang 14
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
158
|
+
A 14
|
159
|
+
ok 14
|
160
|
+
de 14
|
161
|
+
_yan 14
|
162
|
+
itu 14
|
163
|
+
enga 14
|
164
|
+
_S 14
|
169
165
|
a,_ 13
|
170
|
-
|
171
|
-
|
166
|
+
ku 13
|
167
|
+
_te 13
|
168
|
+
ko 13
|
172
169
|
yang_ 13
|
173
|
-
|
174
|
-
ge 13
|
175
|
-
R 13
|
170
|
+
ai 13
|
176
171
|
da_ 13
|
172
|
+
rang 13
|
177
173
|
at_ 13
|
174
|
+
l_ 13
|
175
|
+
ina 13
|
176
|
+
eri 13
|
177
|
+
ge 13
|
178
178
|
D 13
|
179
|
-
|
179
|
+
per 13
|
180
|
+
_dan 13
|
180
181
|
kar 13
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
ul 12
|
186
|
-
et 12
|
187
|
-
w 12
|
188
|
-
po 12
|
189
|
-
ut 12
|
182
|
+
R 13
|
183
|
+
dan_ 13
|
184
|
+
ua 12
|
185
|
+
ia_ 12
|
190
186
|
ya_ 12
|
191
187
|
gi 12
|
192
|
-
|
193
|
-
|
188
|
+
po 12
|
189
|
+
ul 12
|
190
|
+
et 12
|
194
191
|
_dan_ 12
|
195
|
-
|
196
|
-
ua 12
|
192
|
+
e_ 12
|
197
193
|
ju 12
|
194
|
+
rin 12
|
198
195
|
pen 12
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
ama 11
|
203
|
-
mp 11
|
204
|
-
ca 11
|
205
|
-
_pen 11
|
206
|
-
re 11
|
207
|
-
ika 11
|
208
|
-
_J 11
|
196
|
+
aka 12
|
197
|
+
ut 12
|
198
|
+
w 12
|
209
199
|
B 11
|
200
|
+
_J 11
|
201
|
+
mp 11
|
202
|
+
ngg 11
|
203
|
+
ed 11
|
210
204
|
_ter 11
|
211
205
|
gg 11
|
212
206
|
na_ 11
|
213
|
-
|
214
|
-
|
207
|
+
_pen 11
|
208
|
+
nya_ 11
|
215
209
|
or 11
|
210
|
+
ika 11
|
211
|
+
re 11
|
212
|
+
ca 11
|
213
|
+
ama 11
|
214
|
+
ena 11
|
216
215
|
lan 11
|
216
|
+
uh 10
|
217
|
+
arin 10
|
218
|
+
wa 10
|
219
|
+
ela 10
|
220
|
+
_T 10
|
217
221
|
ni_ 10
|
222
|
+
Z 10
|
223
|
+
im 10
|
224
|
+
adi 10
|
218
225
|
ngk 10
|
219
|
-
ab 10
|
220
|
-
gk 10
|
221
|
-
mi 10
|
222
226
|
arina 10
|
223
|
-
T 10
|
224
|
-
eru 10
|
225
227
|
Zar 10
|
226
|
-
uk_ 10
|
227
|
-
Za 10
|
228
|
-
arin 10
|
229
|
-
ela 10
|
230
|
-
adi 10
|
231
|
-
rina 10
|
232
|
-
wa 10
|
233
|
-
rt 10
|
234
|
-
Z 10
|
235
228
|
us_ 10
|
236
|
-
Zari 10
|
237
|
-
im 10
|
238
|
-
_T 10
|
239
229
|
Zarin 10
|
240
|
-
|
230
|
+
rt 10
|
231
|
+
rina 10
|
232
|
+
mi 10
|
233
|
+
uk_ 10
|
234
|
+
T 10
|
235
|
+
gk 10
|
241
236
|
ini 10
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
i. 9
|
247
|
-
mu 9
|
248
|
-
ntu 9
|
249
|
-
i,_ 9
|
237
|
+
eru 10
|
238
|
+
Za 10
|
239
|
+
Zari 10
|
240
|
+
ab 10
|
250
241
|
ing 9
|
251
|
-
|
252
|
-
gan_ 9
|
253
|
-
eb 9
|
254
|
-
ngan_ 9
|
242
|
+
os 9
|
255
243
|
ta_ 9
|
256
|
-
|
244
|
+
tu_ 9
|
245
|
+
_c 9
|
246
|
+
i, 9
|
257
247
|
si_ 9
|
258
|
-
|
259
|
-
engan 9
|
260
|
-
asi 9
|
261
|
-
_de 9
|
248
|
+
jad 9
|
262
249
|
_ta 9
|
263
|
-
|
250
|
+
i,_ 9
|
251
|
+
mu 9
|
264
252
|
K 9
|
265
|
-
|
266
|
-
|
253
|
+
gan_ 9
|
254
|
+
asi 9
|
255
|
+
i. 9
|
267
256
|
ag 9
|
268
|
-
|
257
|
+
_mem 9
|
258
|
+
engan 9
|
259
|
+
ada 9
|
260
|
+
il 9
|
261
|
+
mem 9
|
262
|
+
ant 9
|
263
|
+
apa 9
|
269
264
|
jadi 9
|
265
|
+
_de 9
|
266
|
+
ngan_ 9
|
267
|
+
eb 9
|
268
|
+
ntu 9
|
269
|
+
_A 8
|
270
|
+
ip 8
|
271
|
+
man 8
|
272
|
+
_in 8
|
273
|
+
emb 8
|
274
|
+
oh 8
|
270
275
|
_Zari 8
|
276
|
+
nj 8
|
277
|
+
u, 8
|
278
|
+
a. 8
|
279
|
+
eny 8
|
280
|
+
_Za 8
|
281
|
+
_Z 8
|
271
282
|
dia 8
|
272
|
-
oko 8
|
273
|
-
_ma 8
|
274
283
|
ki 8
|
275
|
-
|
284
|
+
_I 8
|
285
|
+
oko 8
|
286
|
+
n. 8
|
287
|
+
pat 8
|
276
288
|
_meng 8
|
289
|
+
st 8
|
290
|
+
_Zar 8
|
291
|
+
ian 8
|
292
|
+
ri_ 8
|
293
|
+
_ma 8
|
294
|
+
_M 8
|
295
|
+
meng 8
|
277
296
|
mb 8
|
278
|
-
au 8
|
279
|
-
le 8
|
280
|
-
man 8
|
281
|
-
_A 8
|
282
297
|
rk 8
|
283
|
-
|
284
|
-
|
285
|
-
oh 8
|
286
|
-
_Zar 8
|
287
|
-
isi 8
|
298
|
+
L 8
|
299
|
+
le 8
|
288
300
|
itu_ 8
|
289
|
-
|
290
|
-
eny 8
|
301
|
+
isi 8
|
291
302
|
aga 8
|
292
|
-
|
293
|
-
_Za 8
|
294
|
-
ri_ 8
|
303
|
+
au 8
|
295
304
|
ar_ 8
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
meng 8
|
302
|
-
_Z 8
|
305
|
+
ep 8
|
306
|
+
tan 7
|
307
|
+
ej 7
|
308
|
+
_ini 7
|
309
|
+
art 7
|
303
310
|
bur 7
|
304
|
-
gga 7
|
305
|
-
ot 7
|
306
311
|
ay 7
|
307
|
-
uga 7
|
308
|
-
rl 7
|
309
|
-
Ta 7
|
310
312
|
nu 7
|
311
|
-
|
312
|
-
|
313
|
-
ug 7
|
314
|
-
ej 7
|
315
|
-
ce 7
|
316
|
-
_peng 7
|
317
|
-
ng,_ 7
|
318
|
-
tak 7
|
319
|
-
ada_ 7
|
320
|
-
ro 7
|
313
|
+
ita 7
|
314
|
+
gga 7
|
321
315
|
atan 7
|
316
|
+
tak 7
|
317
|
+
ger 7
|
318
|
+
gka 7
|
319
|
+
eka 7
|
320
|
+
_Ta 7
|
321
|
+
aya 7
|
322
|
+
toko 7
|
323
|
+
arang 7
|
324
|
+
Ja 7
|
325
|
+
pu 7
|
326
|
+
rah 7
|
327
|
+
ngka 7
|
322
328
|
ntuk_ 7
|
323
|
-
|
324
|
-
|
325
|
-
as_ 7
|
329
|
+
p_ 7
|
330
|
+
Ta 7
|
326
331
|
erl 7
|
327
|
-
art 7
|
328
|
-
ih 7
|
329
|
-
ma_ 7
|
330
|
-
um 7
|
331
|
-
eka 7
|
332
|
-
lu_ 7
|
333
|
-
and 7
|
334
|
-
_it 7
|
335
|
-
tok 7
|
336
|
-
nda 7
|
337
|
-
ntuk 7
|
338
332
|
ngga 7
|
333
|
+
ma_ 7
|
334
|
+
uga 7
|
335
|
+
peng 7
|
336
|
+
aran 7
|
337
|
+
_per 7
|
339
338
|
alan 7
|
340
|
-
ib 7
|
341
|
-
g, 7
|
342
|
-
rah 7
|
343
|
-
_Ja 7
|
344
|
-
gka 7
|
345
|
-
aya 7
|
346
|
-
pu 7
|
347
|
-
_Ta 7
|
348
|
-
Ja 7
|
349
|
-
ger 7
|
350
|
-
_I 7
|
351
|
-
meny 7
|
352
|
-
tuk 7
|
353
|
-
a. 7
|
354
|
-
n. 7
|
355
|
-
_itu 7
|
356
339
|
tuk_ 7
|
340
|
+
_Ja 7
|
341
|
+
_peng 7
|
342
|
+
ug 7
|
343
|
+
ce 7
|
344
|
+
nda 7
|
357
345
|
ng, 7
|
358
|
-
|
346
|
+
g, 7
|
347
|
+
and 7
|
348
|
+
sia 7
|
349
|
+
_itu 7
|
359
350
|
_ba 7
|
351
|
+
rl 7
|
352
|
+
as_ 7
|
353
|
+
lu_ 7
|
354
|
+
ada_ 7
|
355
|
+
ro 7
|
356
|
+
ng,_ 7
|
360
357
|
al_ 7
|
361
|
-
|
362
|
-
aran 7
|
358
|
+
tok 7
|
363
359
|
g,_ 7
|
360
|
+
ntuk 7
|
361
|
+
um 7
|
362
|
+
tuk 7
|
364
363
|
kal 7
|
365
|
-
|
366
|
-
|
367
|
-
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
|
372
|
-
pi 6
|
373
|
-
den 6
|
374
|
-
kart 6
|
375
|
-
_L 6
|
376
|
-
akart 6
|
377
|
-
lis 6
|
378
|
-
mel 6
|
364
|
+
ib 7
|
365
|
+
_meny 7
|
366
|
+
_it 7
|
367
|
+
meny 7
|
368
|
+
ih 7
|
369
|
+
an. 7
|
370
|
+
ot 7
|
379
371
|
ngi 6
|
372
|
+
up 6
|
373
|
+
aj 6
|
374
|
+
rang_ 6
|
375
|
+
an, 6
|
376
|
+
rta 6
|
377
|
+
_tak 6
|
378
|
+
isa 6
|
379
|
+
tokoh 6
|
380
|
+
Po 6
|
381
|
+
_po 6
|
382
|
+
_U 6
|
383
|
+
nta 6
|
384
|
+
Pol 6
|
385
|
+
akar 6
|
386
|
+
on 6
|
387
|
+
ga_ 6
|
388
|
+
f 6
|
380
389
|
_den 6
|
390
|
+
ura 6
|
391
|
+
ema 6
|
392
|
+
kat 6
|
393
|
+
n, 6
|
381
394
|
n,_ 6
|
382
|
-
|
383
|
-
|
395
|
+
koh 6
|
396
|
+
arta 6
|
384
397
|
aha 6
|
385
|
-
|
386
|
-
PR 6
|
387
|
-
ra_ 6
|
398
|
+
mun 6
|
388
399
|
oli 6
|
389
|
-
an,_ 6
|
390
|
-
sus 6
|
391
|
-
an. 6
|
392
|
-
kat 6
|
393
|
-
n, 6
|
394
|
-
_itu_ 6
|
395
|
-
ang, 6
|
396
|
-
Po 6
|
397
400
|
tin 6
|
398
|
-
mun 6
|
399
|
-
U 6
|
400
|
-
nge 6
|