scylla 0.5.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +4 -2
- data/Gemfile.lock +16 -1
- data/lib/scylla/classifier.rb +1 -1
- data/lib/scylla/generator.rb +16 -4
- data/lib/scylla/lms/afrikaans.lm +232 -232
- data/lib/scylla/lms/arabic.lm +175 -175
- data/lib/scylla/lms/bulgarian.lm +225 -225
- data/lib/scylla/lms/catalan.lm +309 -309
- data/lib/scylla/lms/danish.lm +167 -167
- data/lib/scylla/lms/english.lm +398 -398
- data/lib/scylla/lms/finnish.lm +237 -237
- data/lib/scylla/lms/french.lm +148 -148
- data/lib/scylla/lms/german.lm +258 -258
- data/lib/scylla/lms/greek.lm +236 -236
- data/lib/scylla/lms/hebrew.lm +154 -154
- data/lib/scylla/lms/hindi.lm +139 -139
- data/lib/scylla/lms/icelandic.lm +239 -239
- data/lib/scylla/lms/indonesian.lm +244 -244
- data/lib/scylla/lms/italian.lm +248 -248
- data/lib/scylla/lms/japanese.lm +90 -90
- data/lib/scylla/lms/korean.lm +306 -306
- data/lib/scylla/lms/norwegian.lm +193 -193
- data/lib/scylla/lms/polish.lm +241 -241
- data/lib/scylla/lms/portuguese.lm +232 -232
- data/lib/scylla/lms/romanian.lm +246 -246
- data/lib/scylla/lms/slovak.lm +242 -242
- data/lib/scylla/lms/slovenian.lm +229 -229
- data/lib/scylla/lms/spanish.lm +164 -164
- data/lib/scylla/lms/swedish.lm +157 -157
- data/lib/scylla/lms/tagalog.lm +247 -247
- data/lib/scylla/lms/thai.lm +252 -252
- data/lib/scylla/lms/turkish.lm +285 -285
- data/lib/scylla/lms/vietnamese.lm +250 -250
- data/lib/scylla/lms/welsh.lm +248 -248
- data/lib/scylla/resources.rb +1 -9
- data/lib/scylla.rb +4 -0
- data/scylla.gemspec +2 -120
- data/source_texts/english.txt +62 -27
- data/test/classifier_test.rb +1 -3
- data/test/fixtures/lms/danish.lm +173 -173
- data/test/fixtures/lms/english.lm +220 -220
- data/test/fixtures/lms/french.lm +175 -175
- data/test/fixtures/lms/german.lm +254 -254
- data/test/fixtures/lms/hindi.lm +139 -139
- data/test/fixtures/lms/italian.lm +236 -236
- data/test/fixtures/lms/japanese.lm +88 -88
- data/test/fixtures/lms/norwegian.lm +182 -182
- data/test/fixtures/lms/spanish.lm +164 -164
- data/test/fixtures/test_languages/spanish +0 -1
- data/test/generator_test.rb +13 -0
- data/test/helper.rb +2 -0
- metadata +18 -25
- data/.document +0 -5
- data/lib/scylla/lms/13375P33K.lm +0 -400
- data/scylla-0.1.0.gem +0 -0
- data/source_texts/13375P33K.txt +0 -199
- data/test/fixtures/lms/13375p33k.lm +0 -400
- data/test/fixtures/source_texts/13375P33K.txt +0 -199
data/test/fixtures/lms/french.lm
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
_
|
1
|
+
_ 32286
|
2
2
|
e 11820
|
3
3
|
s 6385
|
4
4
|
a 6245
|
@@ -7,394 +7,394 @@ n 6115
|
|
7
7
|
t 5635
|
8
8
|
r 5480
|
9
9
|
l 4711
|
10
|
-
e_
|
10
|
+
e_ 4337
|
11
11
|
u 4260
|
12
12
|
o 4196
|
13
13
|
� 3243
|
14
14
|
d 3178
|
15
|
-
s_
|
15
|
+
s_ 2976
|
16
16
|
c 2591
|
17
17
|
p 2396
|
18
|
-
_d
|
18
|
+
_d 2360
|
19
19
|
m 2162
|
20
20
|
es 2138
|
21
|
-
é 2115
|
22
21
|
� 2115
|
23
|
-
|
22
|
+
é 2115
|
23
|
+
_l 1881
|
24
24
|
t_ 1784
|
25
|
-
de
|
26
|
-
le
|
27
|
-
es_
|
28
|
-
en
|
29
|
-
on
|
25
|
+
de 1595
|
26
|
+
le 1539
|
27
|
+
es_ 1523
|
28
|
+
en 1500
|
29
|
+
on 1430
|
30
30
|
_de 1338
|
31
|
-
_e
|
31
|
+
_e 1314
|
32
32
|
nt 1258
|
33
33
|
an 1256
|
34
|
+
_p 1247
|
34
35
|
, 1245
|
35
|
-
|
36
|
-
,_ 1207
|
36
|
+
,_ 1209
|
37
37
|
re 1163
|
38
|
-
n_
|
38
|
+
n_ 1086
|
39
39
|
ti 1028
|
40
|
-
is
|
40
|
+
is 982
|
41
41
|
de_ 975
|
42
|
-
la
|
42
|
+
la 906
|
43
43
|
ra 900
|
44
|
-
_de_
|
44
|
+
_de_ 877
|
45
45
|
a_ 863
|
46
46
|
g 846
|
47
|
-
_s
|
47
|
+
_s 834
|
48
48
|
v 796
|
49
49
|
f 777
|
50
|
-
_c
|
51
|
-
_a
|
50
|
+
_c 773
|
51
|
+
_a 746
|
52
52
|
ai 735
|
53
|
-
te
|
53
|
+
te 724
|
54
54
|
ent 708
|
55
55
|
� 698
|
56
|
+
_le 691
|
56
57
|
� 688
|
57
58
|
� 687
|
58
59
|
q 683
|
59
|
-
|
60
|
+
_la 679
|
60
61
|
qu 672
|
61
62
|
ar 672
|
62
|
-
le_
|
63
|
+
le_ 665
|
63
64
|
in 664
|
64
65
|
nt_ 663
|
65
|
-
et
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
it 656
|
66
|
+
et 663
|
67
|
+
it 660
|
68
|
+
se 659
|
69
|
+
me 658
|
70
70
|
� 652
|
71
71
|
�� 652
|
72
72
|
’ 652
|
73
73
|
ur 643
|
74
|
-
ce
|
74
|
+
ce 641
|
75
75
|
la_ 638
|
76
|
+
_la_ 637
|
76
77
|
ne 632
|
77
78
|
b 630
|
78
|
-
_la_ 625
|
79
79
|
er 620
|
80
|
-
ue
|
80
|
+
ue 616
|
81
|
+
__ 614
|
82
|
+
ns 601
|
81
83
|
io 600
|
82
|
-
ns 600
|
83
84
|
. 599
|
84
85
|
u_ 577
|
85
86
|
ion 572
|
86
|
-
et_
|
87
|
+
et_ 556
|
88
|
+
_� 545
|
87
89
|
st 545
|
88
|
-
_� 544
|
89
90
|
r_ 541
|
90
91
|
ie 538
|
91
|
-
_m
|
92
|
+
_m 538
|
92
93
|
ri 535
|
93
94
|
pa 531
|
94
95
|
_et 529
|
95
96
|
ran 527
|
96
|
-
at
|
97
|
+
at 526
|
97
98
|
au 525
|
98
99
|
co 521
|
100
|
+
nc 520
|
99
101
|
_et_ 520
|
100
|
-
nc 519
|
101
102
|
les 515
|
103
|
+
li 512
|
102
104
|
ou 510
|
103
|
-
li 508
|
104
105
|
tr 498
|
105
106
|
al 479
|
106
|
-
ta
|
107
|
+
ta 469
|
107
108
|
ro 467
|
108
109
|
h 459
|
109
|
-
les_
|
110
|
-
que
|
110
|
+
les_ 457
|
111
|
+
que 455
|
111
112
|
tio 431
|
112
113
|
eu 429
|
113
114
|
tion 429
|
114
115
|
r� 428
|
116
|
+
em 428
|
115
117
|
_f 424
|
116
|
-
re_
|
117
|
-
on_
|
118
|
-
|
118
|
+
re_ 423
|
119
|
+
on_ 419
|
120
|
+
_r 413
|
119
121
|
x 410
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
or 400
|
122
|
+
_en 408
|
123
|
+
_pa 404
|
124
|
+
or 402
|
124
125
|
rt 400
|
125
126
|
po 397
|
127
|
+
des 388
|
126
128
|
ll 386
|
127
129
|
si 382
|
128
|
-
|
129
|
-
__ 379
|
130
|
-
oi 378
|
130
|
+
oi 380
|
131
131
|
pr 375
|
132
132
|
anc 374
|
133
133
|
un 373
|
134
|
-
ent_
|
134
|
+
ent_ 372
|
135
135
|
en_ 368
|
136
|
-
nce
|
137
|
-
|
136
|
+
nce 367
|
137
|
+
e, 362
|
138
138
|
_t 362
|
139
|
-
|
140
|
-
|
139
|
+
des_ 362
|
140
|
+
e,_ 362
|
141
|
+
._ 362
|
141
142
|
� 359
|
143
|
+
è 359
|
142
144
|
ne_ 356
|
143
145
|
ir 352
|
144
146
|
par 352
|
145
|
-
e,_ 351
|
146
|
-
e, 351
|
147
147
|
_des 347
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
du
|
148
|
+
_le_ 345
|
149
|
+
_co 345
|
150
|
+
ce_ 344
|
151
|
+
du 343
|
152
152
|
_des_ 338
|
153
|
-
ns_
|
154
|
-
el
|
155
|
-
l’ 328
|
156
|
-
l� 328
|
153
|
+
ns_ 338
|
154
|
+
el 331
|
157
155
|
l� 328
|
158
|
-
_l�
|
159
|
-
_l�
|
160
|
-
|
156
|
+
_l� 328
|
157
|
+
_l� 328
|
158
|
+
l� 328
|
159
|
+
l’ 328
|
160
|
+
_l’ 328
|
161
|
+
ui 327
|
161
162
|
us 325
|
162
|
-
ui 325
|
163
163
|
F 324
|
164
|
+
nd 323
|
164
165
|
ré 322
|
165
|
-
nd 321
|
166
166
|
ati 321
|
167
167
|
ance 311
|
168
168
|
t� 311
|
169
|
-
n�
|
169
|
+
n� 308
|
170
|
+
_F 307
|
170
171
|
ve 307
|
171
172
|
ion_ 306
|
172
|
-
|
173
|
+
om 306
|
174
|
+
_en_ 305
|
173
175
|
L 305
|
174
|
-
om 305
|
175
|
-
_en_ 304
|
176
176
|
so 304
|
177
|
+
_les 304
|
177
178
|
is_ 302
|
178
|
-
_les 302
|
179
|
-
é_ 299
|
180
179
|
�_ 299
|
181
|
-
ue_
|
180
|
+
ue_ 299
|
181
|
+
é_ 299
|
182
182
|
iq 296
|
183
|
+
_les_ 296
|
183
184
|
iqu 296
|
184
185
|
�e 294
|
186
|
+
_par 294
|
185
187
|
ée 294
|
186
|
-
_les_ 294
|
187
|
-
_par 293
|
188
188
|
ma 293
|
189
|
-
men
|
189
|
+
men 292
|
190
190
|
à 288
|
191
191
|
� 288
|
192
192
|
Fr 287
|
193
193
|
ique 286
|
194
194
|
il 284
|
195
|
+
_Fr 283
|
195
196
|
Fra 283
|
196
197
|
est 283
|
197
|
-
_Fr 282
|
198
|
-
à_ 280
|
199
|
-
i_ 280
|
200
198
|
Fran 280
|
201
199
|
�_ 280
|
200
|
+
i_ 280
|
201
|
+
à_ 280
|
202
|
+
_Fra 279
|
202
203
|
mi 279
|
203
|
-
_Fra 278
|
204
204
|
pl 278
|
205
|
+
_Fran 276
|
205
206
|
té 276
|
206
|
-
|
207
|
+
_. 276
|
207
208
|
_u 274
|
209
|
+
_, 274
|
208
210
|
_à 273
|
209
211
|
_à_ 272
|
210
212
|
ranc 272
|
213
|
+
di 271
|
214
|
+
na 270
|
211
215
|
ment 270
|
212
|
-
di 270
|
213
216
|
ut 269
|
214
|
-
na 269
|
215
217
|
ss 268
|
216
|
-
|
218
|
+
_du 268
|
217
219
|
ci 267
|
218
|
-
_du 264
|
219
220
|
�s 263
|
220
221
|
és 263
|
221
222
|
ais 262
|
222
223
|
du_ 259
|
223
|
-
|
224
|
+
_au 259
|
224
225
|
atio 258
|
225
|
-
|
226
|
+
ation 258
|
227
|
+
_n 258
|
228
|
+
_pr 258
|
226
229
|
_un 257
|
227
|
-
|
230
|
+
lu 257
|
228
231
|
_du_ 256
|
229
232
|
y 255
|
230
|
-
lu 255
|
231
233
|
nce_ 255
|
232
|
-
|
234
|
+
que_ 253
|
233
235
|
ol 252
|
234
236
|
Franc 252
|
235
|
-
que_ 252
|
236
|
-
rs 251
|
237
237
|
rance 251
|
238
|
+
rs 251
|
238
239
|
tion_ 250
|
239
240
|
lle 250
|
240
241
|
pe 248
|
242
|
+
_,_ 247
|
243
|
+
s, 246
|
241
244
|
con 244
|
242
|
-
|
243
|
-
s,
|
244
|
-
|
245
|
-
|
245
|
+
ic 244
|
246
|
+
s,_ 243
|
247
|
+
te_ 242
|
248
|
+
d� 242
|
246
249
|
ire 240
|
247
|
-
|
248
|
-
|
250
|
+
ont 240
|
251
|
+
_so 240
|
249
252
|
no 239
|
250
|
-
|
251
|
-
_so 238
|
252
|
-
ic 238
|
253
|
+
res 239
|
253
254
|
ons 238
|
254
255
|
mo 236
|
255
|
-
|
256
|
-
dé 232
|
256
|
+
dé 233
|
257
257
|
i� 231
|
258
258
|
eur 230
|
259
259
|
ance_ 228
|
260
260
|
nn 227
|
261
261
|
_qu 226
|
262
|
-
ant 226
|
263
262
|
_q 226
|
263
|
+
ant 226
|
264
264
|
ct 223
|
265
|
-
|
266
|
-
est_
|
267
|
-
|
268
|
-
ni
|
265
|
+
eme 223
|
266
|
+
est_ 218
|
267
|
+
st_ 218
|
268
|
+
ni 217
|
269
|
+
lo 215
|
269
270
|
ux 215
|
270
|
-
_po 213
|
271
|
-
lo 213
|
272
271
|
ch 213
|
272
|
+
_po 213
|
273
273
|
vi 212
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
_L
|
274
|
+
_é 209
|
275
|
+
me_ 209
|
276
|
+
sa 207
|
277
|
+
_L 207
|
278
278
|
emen 205
|
279
279
|
ement 205
|
280
|
-
|
281
|
-
|
282
|
-
ec
|
280
|
+
_d� 203
|
281
|
+
se_ 203
|
282
|
+
ec 203
|
283
283
|
pu 202
|
284
284
|
ét 202
|
285
285
|
�t 202
|
286
|
-
|
287
|
-
d� 201
|
286
|
+
_es 202
|
288
287
|
d� 201
|
289
288
|
d’ 201
|
290
|
-
|
289
|
+
d� 201
|
290
|
+
ale 196
|
291
291
|
ur_ 196
|
292
292
|
to 196
|
293
|
-
|
294
|
-
|
295
|
-
_d’ 193
|
296
|
-
_d� 193
|
293
|
+
_dé 196
|
294
|
+
da 195
|
297
295
|
_o 193
|
296
|
+
_d’ 193
|
298
297
|
_d� 193
|
298
|
+
_d� 193
|
299
299
|
_est 192
|
300
|
-
tre 191
|
301
300
|
A 191
|
301
|
+
tre 191
|
302
302
|
op 191
|
303
|
-
_pl 190
|
304
|
-
da 190
|
305
303
|
au_ 190
|
304
|
+
_pl 190
|
306
305
|
ul 189
|
307
306
|
_est_ 187
|
307
|
+
x_ 186
|
308
|
+
ment_ 186
|
309
|
+
_re 185
|
310
|
+
bl 185
|
308
311
|
fr 185
|
309
|
-
|
310
|
-
_re 184
|
311
|
-
x_ 184
|
312
|
-
bl 184
|
313
|
-
_i 184
|
312
|
+
_._ 184
|
314
313
|
mp 184
|
315
|
-
|
316
|
-
ts
|
314
|
+
_i 184
|
315
|
+
ts 182
|
316
|
+
tu 180
|
317
317
|
ac 180
|
318
318
|
�r 180
|
319
|
-
tu 180
|
320
319
|
ér 180
|
321
320
|
rti 179
|
322
|
-
_se
|
323
|
-
ise
|
321
|
+
_se 177
|
322
|
+
ise 175
|
324
323
|
art 173
|
325
324
|
iè 172
|
326
325
|
ans 171
|
326
|
+
ia 170
|
327
327
|
ç 170
|
328
328
|
� 170
|
329
|
-
|
329
|
+
l_ 169
|
330
330
|
_mo 169
|
331
|
+
ux_ 169
|
331
332
|
ar_ 168
|
332
|
-
ux_ 168
|
333
333
|
an� 167
|
334
|
-
|
335
|
-
l_ 167
|
334
|
+
lus 167
|
336
335
|
gr 167
|
337
|
-
|
336
|
+
_con 167
|
338
337
|
ran� 166
|
339
|
-
|
338
|
+
P 166
|
340
339
|
nç 165
|
341
340
|
im 164
|
342
|
-
ont_ 164
|
343
|
-
une 163
|
344
341
|
_fr 163
|
345
|
-
|
342
|
+
ont_ 163
|
343
|
+
une 163
|
346
344
|
rs_ 162
|
347
|
-
|
345
|
+
son 162
|
346
|
+
_g 162
|
348
347
|
anç 161
|
349
|
-
|
348
|
+
un_ 161
|
350
349
|
ranç 160
|
351
|
-
C 160
|
352
350
|
su 160
|
353
351
|
us_ 160
|
354
|
-
_v
|
352
|
+
_v 160
|
353
|
+
C 160
|
355
354
|
ill 159
|
356
|
-
cl 158
|
357
355
|
plu 158
|
356
|
+
cl 158
|
358
357
|
par_ 158
|
359
358
|
_plu 158
|
359
|
+
_par_ 157
|
360
360
|
as 157
|
361
|
-
nça 156
|
362
361
|
�a 156
|
362
|
+
nça 156
|
363
363
|
ça 156
|
364
|
-
|
365
|
-
|
364
|
+
l� 154
|
365
|
+
ique_ 154
|
366
366
|
mm 153
|
367
|
-
|
368
|
-
�ais 152
|
367
|
+
ança 153
|
369
368
|
plus 152
|
370
|
-
nçai 152
|
371
369
|
çai 152
|
372
370
|
�ai 152
|
373
371
|
çais 152
|
374
|
-
|
372
|
+
�ais 152
|
373
|
+
nçai 152
|
375
374
|
rr 152
|
375
|
+
_plus 152
|
376
376
|
fra 151
|
377
|
+
_au_ 151
|
377
378
|
ge 150
|
378
|
-
_au_ 150
|
379
|
-
l� 149
|
380
379
|
m� 149
|
381
380
|
une_ 148
|
382
381
|
ag 147
|
382
|
+
fi 146
|
383
383
|
ell 146
|
384
384
|
ions 146
|
385
|
-
fi 146
|
386
385
|
iv 145
|
387
|
-
|
386
|
+
ie_ 145
|
387
|
+
dan 145
|
388
388
|
ien 144
|
389
|
-
|
389
|
+
ans_ 144
|
390
|
+
té_ 143
|
391
|
+
ain 143
|
390
392
|
_fra 143
|
391
393
|
our 143
|
392
|
-
|
393
|
-
té_ 143
|
394
|
-
ans_ 143
|
394
|
+
ep 143
|
395
395
|
elle 142
|
396
396
|
fran 142
|
397
|
-
ep 142
|
398
397
|
_fran 142
|
399
|
-
|
400
|
-
|
398
|
+
éc 141
|
399
|
+
�c 141
|
400
|
+
res_ 140
|