scylla 0.5.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +4 -2
- data/Gemfile.lock +16 -1
- data/lib/scylla/classifier.rb +1 -1
- data/lib/scylla/generator.rb +16 -4
- data/lib/scylla/lms/afrikaans.lm +232 -232
- data/lib/scylla/lms/arabic.lm +175 -175
- data/lib/scylla/lms/bulgarian.lm +225 -225
- data/lib/scylla/lms/catalan.lm +309 -309
- data/lib/scylla/lms/danish.lm +167 -167
- data/lib/scylla/lms/english.lm +398 -398
- data/lib/scylla/lms/finnish.lm +237 -237
- data/lib/scylla/lms/french.lm +148 -148
- data/lib/scylla/lms/german.lm +258 -258
- data/lib/scylla/lms/greek.lm +236 -236
- data/lib/scylla/lms/hebrew.lm +154 -154
- data/lib/scylla/lms/hindi.lm +139 -139
- data/lib/scylla/lms/icelandic.lm +239 -239
- data/lib/scylla/lms/indonesian.lm +244 -244
- data/lib/scylla/lms/italian.lm +248 -248
- data/lib/scylla/lms/japanese.lm +90 -90
- data/lib/scylla/lms/korean.lm +306 -306
- data/lib/scylla/lms/norwegian.lm +193 -193
- data/lib/scylla/lms/polish.lm +241 -241
- data/lib/scylla/lms/portuguese.lm +232 -232
- data/lib/scylla/lms/romanian.lm +246 -246
- data/lib/scylla/lms/slovak.lm +242 -242
- data/lib/scylla/lms/slovenian.lm +229 -229
- data/lib/scylla/lms/spanish.lm +164 -164
- data/lib/scylla/lms/swedish.lm +157 -157
- data/lib/scylla/lms/tagalog.lm +247 -247
- data/lib/scylla/lms/thai.lm +252 -252
- data/lib/scylla/lms/turkish.lm +285 -285
- data/lib/scylla/lms/vietnamese.lm +250 -250
- data/lib/scylla/lms/welsh.lm +248 -248
- data/lib/scylla/resources.rb +1 -9
- data/lib/scylla.rb +4 -0
- data/scylla.gemspec +2 -120
- data/source_texts/english.txt +62 -27
- data/test/classifier_test.rb +1 -3
- data/test/fixtures/lms/danish.lm +173 -173
- data/test/fixtures/lms/english.lm +220 -220
- data/test/fixtures/lms/french.lm +175 -175
- data/test/fixtures/lms/german.lm +254 -254
- data/test/fixtures/lms/hindi.lm +139 -139
- data/test/fixtures/lms/italian.lm +236 -236
- data/test/fixtures/lms/japanese.lm +88 -88
- data/test/fixtures/lms/norwegian.lm +182 -182
- data/test/fixtures/lms/spanish.lm +164 -164
- data/test/fixtures/test_languages/spanish +0 -1
- data/test/generator_test.rb +13 -0
- data/test/helper.rb +2 -0
- metadata +18 -25
- data/.document +0 -5
- data/lib/scylla/lms/13375P33K.lm +0 -400
- data/scylla-0.1.0.gem +0 -0
- data/source_texts/13375P33K.txt +0 -199
- data/test/fixtures/lms/13375p33k.lm +0 -400
- data/test/fixtures/source_texts/13375P33K.txt +0 -199
data/test/fixtures/lms/danish.lm
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
_
|
1
|
+
_ 16370
|
2
2
|
e 5759
|
3
3
|
r 3334
|
4
4
|
n 3061
|
@@ -12,389 +12,389 @@ l 1884
|
|
12
12
|
g 1617
|
13
13
|
k 1278
|
14
14
|
m 1273
|
15
|
-
er
|
16
|
-
e_
|
17
|
-
__
|
15
|
+
er 1210
|
16
|
+
e_ 1186
|
17
|
+
__ 1181
|
18
18
|
de 1045
|
19
19
|
en 993
|
20
20
|
� 940
|
21
21
|
f 939
|
22
|
-
r_
|
22
|
+
r_ 840
|
23
23
|
v 770
|
24
|
-
t_
|
24
|
+
t_ 732
|
25
25
|
an 724
|
26
|
-
n_
|
26
|
+
n_ 716
|
27
27
|
u 605
|
28
28
|
nd 598
|
29
29
|
b 585
|
30
30
|
et 574
|
31
|
-
_s
|
31
|
+
_s 552
|
32
32
|
. 546
|
33
33
|
re 543
|
34
|
+
er_ 541
|
34
35
|
te 540
|
36
|
+
en_ 530
|
35
37
|
st 530
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
_o 501
|
40
|
-
_d 484
|
38
|
+
g_ 521
|
39
|
+
_o 503
|
40
|
+
_d 487
|
41
41
|
, 480
|
42
42
|
,_ 478
|
43
43
|
h 474
|
44
44
|
ge 474
|
45
|
-
_a
|
45
|
+
_a 463
|
46
46
|
in 440
|
47
47
|
p 436
|
48
|
+
_f 431
|
48
49
|
ar 430
|
49
|
-
_f 423
|
50
50
|
og 415
|
51
51
|
or 411
|
52
52
|
ti 406
|
53
53
|
._ 405
|
54
|
-
|
55
|
-
|
54
|
+
_e 404
|
55
|
+
et_ 399
|
56
|
+
_m 382
|
57
|
+
_i 381
|
56
58
|
ed 381
|
57
|
-
_m 378
|
58
|
-
_i 378
|
59
|
-
sk 365
|
60
59
|
ne 365
|
60
|
+
sk 365
|
61
61
|
le 353
|
62
|
-
_og
|
63
|
-
ke
|
62
|
+
_og 346
|
63
|
+
ke 344
|
64
64
|
el 342
|
65
65
|
og_ 338
|
66
|
-
� 332
|
67
66
|
ø 332
|
68
|
-
|
67
|
+
� 332
|
68
|
+
d_ 330
|
69
69
|
me 324
|
70
70
|
ng 317
|
71
|
-
|
72
|
-
|
73
|
-
å 293
|
74
|
-
ig 293
|
71
|
+
_og_ 315
|
72
|
+
_de 315
|
75
73
|
� 293
|
76
|
-
|
77
|
-
|
74
|
+
ig 293
|
75
|
+
å 293
|
76
|
+
_b 290
|
78
77
|
æ 286
|
79
|
-
|
78
|
+
� 286
|
79
|
+
i_ 285
|
80
|
+
s_ 276
|
81
|
+
de_ 275
|
80
82
|
_h 274
|
81
|
-
de_ 274
|
82
83
|
ri 273
|
83
|
-
s_ 271
|
84
84
|
D 268
|
85
85
|
nde 263
|
86
|
-
om 262
|
87
86
|
li 262
|
87
|
+
om 262
|
88
88
|
ma 259
|
89
89
|
ve 257
|
90
|
+
_t 256
|
90
91
|
y 256
|
91
92
|
af 254
|
92
|
-
_i_
|
93
|
-
_t 249
|
93
|
+
_i_ 254
|
94
94
|
at 245
|
95
95
|
il 244
|
96
|
+
es 241
|
96
97
|
and 241
|
97
|
-
es 240
|
98
98
|
be 234
|
99
99
|
al 234
|
100
100
|
is 233
|
101
101
|
fo 232
|
102
102
|
se 232
|
103
103
|
ns 229
|
104
|
+
_D 226
|
104
105
|
la 224
|
105
|
-
_D 223
|
106
106
|
on 221
|
107
107
|
rk 219
|
108
108
|
_af 217
|
109
109
|
den 216
|
110
110
|
der 214
|
111
|
+
_k 212
|
111
112
|
_me 210
|
112
|
-
_k 210
|
113
113
|
m_ 209
|
114
114
|
ing 207
|
115
115
|
_v 203
|
116
|
-
k_
|
116
|
+
k_ 202
|
117
117
|
ra 191
|
118
118
|
f_ 188
|
119
119
|
af_ 186
|
120
120
|
for 184
|
121
|
-
_p
|
121
|
+
_p 184
|
122
|
+
l_ 183
|
122
123
|
_af_ 180
|
123
124
|
ol 174
|
124
|
-
ere
|
125
|
-
_fo 172
|
125
|
+
ere 173
|
126
126
|
ark 172
|
127
|
+
_fo 172
|
127
128
|
lan 169
|
128
129
|
ste 169
|
129
130
|
te_ 168
|
131
|
+
_l 166
|
130
132
|
mar 165
|
131
|
-
_l 165
|
132
|
-
l_ 164
|
133
133
|
ll 162
|
134
134
|
ter 161
|
135
|
+
ske 160
|
135
136
|
j 159
|
136
|
-
|
137
|
+
ke_ 155
|
137
138
|
om_ 155
|
138
|
-
|
139
|
-
mark 153
|
139
|
+
ha 153
|
140
140
|
Da 153
|
141
|
-
|
141
|
+
mark 153
|
142
142
|
den_ 153
|
143
|
-
|
143
|
+
land 153
|
144
|
+
_st 151
|
144
145
|
ni 151
|
145
146
|
ed_ 151
|
146
|
-
_st 151
|
147
147
|
_for 149
|
148
148
|
so 149
|
149
|
-
Dan 148
|
150
149
|
ta 148
|
150
|
+
Dan 148
|
151
151
|
ger 147
|
152
|
+
_er 145
|
152
153
|
nge 144
|
153
154
|
det 143
|
154
|
-
re_
|
155
|
+
re_ 141
|
155
156
|
ede 139
|
156
157
|
nma 138
|
157
158
|
nm 138
|
158
159
|
vi 138
|
159
|
-
nmark 137
|
160
|
-
_en 137
|
161
160
|
nmar 137
|
161
|
+
_en 137
|
162
|
+
nmark 137
|
163
|
+
anmar 136
|
162
164
|
anm 136
|
165
|
+
_Da 136
|
163
166
|
anma 136
|
164
|
-
anmar 136
|
165
167
|
ev 135
|
166
168
|
rs 135
|
167
|
-
der_
|
168
|
-
Danma 133
|
169
|
+
der_ 134
|
169
170
|
Danm 133
|
170
171
|
S 133
|
171
|
-
|
172
|
+
Danma 133
|
172
173
|
un 133
|
174
|
+
_Dan 132
|
173
175
|
ans 132
|
174
|
-
_er 131
|
175
|
-
da 130
|
176
176
|
med 130
|
177
|
-
|
177
|
+
da 130
|
178
178
|
ro 127
|
179
179
|
io 127
|
180
180
|
til 126
|
181
|
-
som 125
|
182
181
|
_er_ 125
|
183
182
|
ik 125
|
184
|
-
|
183
|
+
som 125
|
185
184
|
_ti 124
|
185
|
+
rn 124
|
186
|
+
em 123
|
186
187
|
ds 123
|
188
|
+
�_ 123
|
189
|
+
å_ 123
|
187
190
|
_u 122
|
188
|
-
em 122
|
189
191
|
eg 121
|
190
|
-
|
191
|
-
å_ 121
|
192
|
+
_Danm 121
|
192
193
|
_ha 120
|
193
|
-
_Danm 120
|
194
194
|
rt 120
|
195
|
-
ld 119
|
196
195
|
_med 119
|
197
|
-
|
196
|
+
_so 119
|
197
|
+
ld 119
|
198
|
+
_g 118
|
198
199
|
som_ 118
|
199
200
|
to 117
|
200
|
-
_so 116
|
201
|
-
_g 116
|
202
201
|
ske_ 116
|
202
|
+
_som 116
|
203
203
|
det_ 115
|
204
|
-
|
204
|
+
_r 115
|
205
205
|
end 114
|
206
|
+
ern 114
|
206
207
|
ar_ 114
|
207
208
|
tr 114
|
208
|
-
|
209
|
+
_som_ 113
|
209
210
|
id 111
|
210
|
-
ko 111
|
211
211
|
ud 111
|
212
|
-
|
212
|
+
ko 111
|
213
213
|
del 110
|
214
214
|
_til 109
|
215
|
-
si 108
|
216
|
-
lig 108
|
217
215
|
va 108
|
218
|
-
mi 108
|
219
216
|
nsk 108
|
217
|
+
mi 108
|
218
|
+
si 108
|
219
|
+
lig 108
|
220
|
+
_be 107
|
220
221
|
ls 107
|
221
|
-
_be 105
|
222
|
-
�r 105
|
223
|
-
bl 105
|
224
|
-
ka 105
|
225
222
|
ion 105
|
226
223
|
ind 105
|
227
|
-
|
224
|
+
_da 105
|
225
|
+
�r 105
|
226
|
+
bl 105
|
228
227
|
gs 105
|
228
|
+
ør 105
|
229
|
+
ka 105
|
230
|
+
_S 104
|
229
231
|
lle 104
|
230
|
-
|
232
|
+
_� 104
|
233
|
+
dt 103
|
231
234
|
t� 103
|
232
|
-
_S 103
|
233
235
|
ne_ 102
|
234
236
|
med_ 102
|
235
|
-
|
237
|
+
tt 102
|
236
238
|
_en_ 101
|
237
|
-
|
238
|
-
dt 101
|
239
|
+
ag 101
|
239
240
|
r� 100
|
240
|
-
_� 100
|
241
241
|
c 100
|
242
242
|
ansk 99
|
243
|
-
nt 99
|
244
|
-
dan 99
|
245
243
|
ie 99
|
244
|
+
dan 99
|
245
|
+
nt 99
|
246
246
|
_med_ 98
|
247
247
|
or_ 97
|
248
|
-
|
249
|
-
- 95
|
248
|
+
il_ 96
|
250
249
|
De 95
|
251
|
-
|
250
|
+
�r 95
|
252
251
|
ær 95
|
253
|
-
na 94
|
254
|
-
nin 94
|
255
252
|
rne 94
|
253
|
+
lt 94
|
256
254
|
ning 94
|
255
|
+
na 94
|
257
256
|
ner 94
|
258
|
-
|
259
|
-
I 92
|
260
|
-
fr 92
|
257
|
+
nin 94
|
261
258
|
til_ 92
|
259
|
+
fr 92
|
260
|
+
I 92
|
262
261
|
at_ 92
|
263
262
|
op 91
|
264
263
|
ru 91
|
265
|
-
|
264
|
+
ge_ 91
|
265
|
+
_dan 90
|
266
|
+
erne 89
|
267
|
+
v_ 89
|
266
268
|
rd 89
|
269
|
+
ng_ 89
|
267
270
|
ige 89
|
268
271
|
_bl 89
|
269
|
-
|
270
|
-
|
271
|
-
ng_ 88
|
272
|
-
v_ 88
|
272
|
+
_. 88
|
273
|
+
a_ 88
|
273
274
|
gt 88
|
275
|
+
p� 87
|
274
276
|
kr 87
|
277
|
+
_den 87
|
275
278
|
tte 87
|
276
|
-
a_ 87
|
277
|
-
_re 87
|
278
279
|
inge 87
|
279
|
-
_den 87
|
280
|
-
p� 87
|
281
|
-
dans 86
|
282
|
-
men 86
|
283
280
|
s� 86
|
281
|
+
men 86
|
282
|
+
dans 86
|
284
283
|
dansk 86
|
285
|
-
it 85
|
286
|
-
isk 85
|
287
284
|
_til_ 85
|
285
|
+
isk 85
|
288
286
|
ver 85
|
289
|
-
|
290
|
-
|
287
|
+
es_ 85
|
288
|
+
it 85
|
289
|
+
_re 84
|
291
290
|
_dans 84
|
292
|
-
_at 84
|
293
291
|
els 84
|
294
|
-
|
292
|
+
_at 84
|
293
|
+
am 84
|
294
|
+
f� 84
|
295
|
+
_n 83
|
295
296
|
est 83
|
296
297
|
ur 82
|
297
|
-
gen 82
|
298
298
|
_den_ 82
|
299
|
+
gen 82
|
299
300
|
he 81
|
300
|
-
_ud 81
|
301
301
|
_at_ 81
|
302
|
-
|
303
|
-
ene 80
|
302
|
+
_ud 81
|
304
303
|
ble 80
|
305
|
-
|
306
|
-
|
307
|
-
und 79
|
308
|
-
_. 79
|
304
|
+
ene 80
|
305
|
+
rk_ 80
|
309
306
|
ede_ 79
|
307
|
+
e. 79
|
310
308
|
ande 79
|
311
|
-
|
312
|
-
|
309
|
+
und 79
|
310
|
+
od 79
|
311
|
+
sa 79
|
312
|
+
_I 79
|
313
313
|
_in 78
|
314
|
+
nde_ 78
|
315
|
+
_fr 78
|
314
316
|
_la 78
|
317
|
+
eri 78
|
318
|
+
sk_ 77
|
315
319
|
ov 77
|
316
320
|
ende 77
|
317
|
-
|
318
|
-
|
319
|
-
|
321
|
+
r. 77
|
322
|
+
_._ 76
|
323
|
+
ing_ 76
|
324
|
+
_si 76
|
320
325
|
tor 76
|
326
|
+
lk 75
|
321
327
|
av 75
|
322
328
|
lev 75
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
ing_ 75
|
327
|
-
_si 74
|
328
|
-
an_ 74
|
329
|
+
an_ 75
|
330
|
+
ark_ 75
|
331
|
+
mm 74
|
329
332
|
ft 74
|
330
|
-
|
333
|
+
_De 73
|
331
334
|
på 73
|
332
335
|
F 73
|
333
|
-
_._ 73
|
334
336
|
us 73
|
335
|
-
|
336
|
-
e,_ 72
|
337
|
-
di 72
|
337
|
+
le_ 72
|
338
338
|
rin 72
|
339
|
+
e,_ 72
|
339
340
|
e, 72
|
340
|
-
|
341
|
+
di 72
|
342
|
+
nd_ 72
|
341
343
|
E 72
|
342
|
-
nske 71
|
343
|
-
_på 71
|
344
344
|
_der 71
|
345
|
-
|
345
|
+
mark_ 71
|
346
|
+
_på 71
|
347
|
+
nske 71
|
346
348
|
_lan 71
|
347
|
-
|
348
|
-
le_ 70
|
349
|
-
ark_ 70
|
349
|
+
_p� 71
|
350
350
|
st� 70
|
351
|
+
get 70
|
351
352
|
gi 70
|
353
|
+
ks 69
|
352
354
|
ist 69
|
353
355
|
pr 69
|
354
|
-
nd_ 68
|
355
|
-
var 68
|
356
356
|
_blev 68
|
357
357
|
_ble 68
|
358
358
|
blev 68
|
359
|
-
|
360
|
-
mark_ 67
|
361
|
-
på_ 67
|
359
|
+
var 68
|
362
360
|
ss 67
|
361
|
+
på_ 67
|
363
362
|
anske 67
|
364
363
|
_va 67
|
365
|
-
|
366
|
-
_land 66
|
364
|
+
fi 66
|
367
365
|
ati 66
|
368
366
|
tio 66
|
369
367
|
lse 66
|
370
|
-
år 66
|
371
|
-
fi 66
|
372
|
-
_på_ 66
|
373
|
-
) 66
|
374
368
|
tion 66
|
375
|
-
|
376
|
-
( 66
|
369
|
+
_på_ 66
|
377
370
|
gr 66
|
378
|
-
|
371
|
+
�r 66
|
372
|
+
år 66
|
373
|
+
_land 66
|
379
374
|
sto 65
|
380
|
-
kt 65
|
381
375
|
one 65
|
382
|
-
|
376
|
+
ef 65
|
377
|
+
kt 65
|
383
378
|
ev_ 64
|
379
|
+
sen 64
|
384
380
|
else 64
|
385
381
|
A 63
|
386
|
-
ende_ 63
|
387
|
-
ren 63
|
388
382
|
ring 63
|
389
|
-
|
390
|
-
|
383
|
+
ren 63
|
384
|
+
ende_ 63
|
385
|
+
e._ 62
|
391
386
|
ho 62
|
392
|
-
|
393
|
-
|
387
|
+
for_ 62
|
388
|
+
_ko 62
|
389
|
+
ig_ 62
|
390
|
+
n, 61
|
394
391
|
ste_ 61
|
395
392
|
rig 61
|
396
|
-
|
393
|
+
n,_ 61
|
394
|
+
dr 61
|
397
395
|
ret 60
|
396
|
+
lev_ 60
|
398
397
|
blev_ 60
|
399
|
-
|
400
|
-
|
398
|
+
_F 60
|
399
|
+
_E 60
|
400
|
+
nger 59
|