scylla 0.4.3 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +1 -0
- data/Gemfile.lock +10 -0
- data/VERSION +1 -1
- data/lib/scylla/generator.rb +1 -1
- data/lib/scylla/lms/13375P33K.lm +156 -156
- data/lib/scylla/lms/arabic.lm +133 -133
- data/lib/scylla/lms/bulgarian.lm +122 -122
- data/lib/scylla/lms/catalan.lm +151 -151
- data/lib/scylla/lms/danish.lm +137 -137
- data/lib/scylla/lms/english.lm +207 -207
- data/lib/scylla/lms/french.lm +400 -400
- data/lib/scylla/lms/japanese.lm +400 -400
- data/lib/scylla/lms/korean.lm +233 -233
- data/lib/scylla/lms/norwegian.lm +398 -398
- data/lib/scylla/lms/spanish.lm +98 -98
- data/lib/scylla/lms/swedish.lm +123 -123
- data/lib/scylla/lms/tagalog.lm +223 -223
- data/lib/scylla/lms/welsh.lm +234 -234
- data/lib/scylla/resources.rb +10 -10
- data/scylla.gemspec +17 -40
- data/source_texts/catalan.txt +28 -28
- data/source_texts/danish.txt +62 -62
- data/source_texts/english.txt +10 -10
- data/source_texts/french.txt +470 -77
- data/source_texts/japanese.txt +453 -199
- data/source_texts/norwegian.txt +96 -63
- data/source_texts/spanish.txt +269 -269
- data/test/classifier_test.rb +2 -2
- data/test/fixtures/lms/13375p33k.lm +156 -156
- data/test/fixtures/lms/danish.lm +137 -137
- data/test/fixtures/lms/english.lm +207 -207
- data/test/fixtures/lms/french.lm +400 -400
- data/test/fixtures/lms/hindi.lm +400 -0
- data/test/fixtures/lms/italian.lm +400 -0
- data/test/fixtures/lms/japanese.lm +400 -400
- data/test/fixtures/lms/norwegian.lm +400 -0
- data/test/fixtures/lms/spanish.lm +98 -98
- data/test/fixtures/source_texts/danish.txt +62 -62
- data/test/fixtures/source_texts/english.txt +10 -10
- data/test/fixtures/source_texts/french.txt +470 -77
- data/test/fixtures/source_texts/hindi.txt +199 -0
- data/test/fixtures/source_texts/italian.txt +120 -0
- data/test/fixtures/source_texts/japanese.txt +453 -199
- data/test/fixtures/source_texts/norwegian.txt +190 -0
- data/test/fixtures/source_texts/spanish.txt +269 -269
- data/test/fixtures/test_languages/english +61 -0
- data/test/fixtures/test_languages/french +0 -0
- data/test/fixtures/test_languages/german +29 -0
- data/test/fixtures/test_languages/hindi +3 -0
- data/test/fixtures/test_languages/italian +6 -0
- data/test/fixtures/test_languages/japanese +79 -0
- data/test/fixtures/test_languages/norwegian +14 -0
- data/test/fixtures/test_languages/spanish +22 -0
- data/test/generator_test.rb +0 -1
- data/test/language_test.rb +28 -0
- metadata +20 -43
- data/lib/scylla/lms/esperanto.lm +0 -400
- data/lib/scylla/lms/hungarian.lm +0 -400
- data/lib/scylla/lms/irish.lm +0 -400
- data/lib/scylla/lms/kannada.lm +0 -400
- data/lib/scylla/lms/latin.lm +0 -400
- data/lib/scylla/lms/malay.lm +0 -400
- data/lib/scylla/lms/marathi.lm +0 -400
- data/lib/scylla/lms/mingo.lm +0 -400
- data/lib/scylla/lms/nepali.lm +0 -400
- data/lib/scylla/lms/quechua.lm +0 -400
- data/lib/scylla/lms/rumantsch.lm +0 -400
- data/lib/scylla/lms/sanskrit.lm +0 -400
- data/lib/scylla/lms/scots_gaelic.lm +0 -400
- data/lib/scylla/lms/serbian.lm +0 -400
- data/lib/scylla/lms/swahili.lm +0 -400
- data/lib/scylla/lms/tamil.lm +0 -400
- data/lib/scylla/lms/ukrainian.lm +0 -400
- data/lib/scylla/lms/yiddish.lm +0 -400
- data/source_texts/esperanto.txt +0 -199
- data/source_texts/hungarian.txt +0 -102
- data/source_texts/irish.txt +0 -209
- data/source_texts/kannada.txt +0 -283
- data/source_texts/latin.txt +0 -120
- data/source_texts/malay.txt +0 -108
- data/source_texts/marathi.txt +0 -100
- data/source_texts/mingo.txt +0 -146
- data/source_texts/nepali.txt +0 -131
- data/source_texts/quechua.txt +0 -108
- data/source_texts/rumantsch.txt +0 -110
- data/source_texts/sanskrit.txt +0 -135
- data/source_texts/scots_gaelic.txt +0 -93
- data/source_texts/serbian.txt +0 -121
- data/source_texts/swahili.txt +0 -120
- data/source_texts/tamil.txt +0 -167
- data/source_texts/ukrainian.txt +0 -214
- data/source_texts/yiddish-utf.txt +0 -83
- data/test/fixtures/lms/kannada.lm +0 -400
- data/test/fixtures/source_texts/kannada.txt +0 -283
data/lib/scylla/lms/swedish.lm
CHANGED
@@ -1,12 +1,12 @@
|
|
1
|
-
_
|
1
|
+
_ 20778
|
2
2
|
e 4682
|
3
3
|
r 4185
|
4
4
|
a 4010
|
5
5
|
n 3825
|
6
6
|
t 3529
|
7
7
|
i 2832
|
8
|
+
__ 2797
|
8
9
|
s 2764
|
9
|
-
__ 2494
|
10
10
|
l 2337
|
11
11
|
� 2073
|
12
12
|
d 2021
|
@@ -16,15 +16,15 @@ k 1402
|
|
16
16
|
m 1359
|
17
17
|
v 1194
|
18
18
|
er 1164
|
19
|
-
r_
|
19
|
+
r_ 1003
|
20
20
|
en 976
|
21
|
-
ä 888
|
22
21
|
� 888
|
22
|
+
ä 888
|
23
23
|
de 842
|
24
24
|
n_ 833
|
25
|
-
t_
|
25
|
+
t_ 813
|
26
|
+
a_ 780
|
26
27
|
f 745
|
27
|
-
a_ 744
|
28
28
|
ar 732
|
29
29
|
u 729
|
30
30
|
an 718
|
@@ -33,16 +33,16 @@ h 695
|
|
33
33
|
st 647
|
34
34
|
_s 643
|
35
35
|
in 614
|
36
|
-
ö 597
|
37
36
|
� 597
|
37
|
+
ö 597
|
38
38
|
nd 582
|
39
|
-
e_
|
39
|
+
e_ 579
|
40
40
|
ri 540
|
41
41
|
c 527
|
42
42
|
et 526
|
43
43
|
en_ 523
|
44
|
-
� 523
|
45
44
|
å 523
|
45
|
+
� 523
|
46
46
|
ge 519
|
47
47
|
ra 512
|
48
48
|
. 509
|
@@ -51,7 +51,8 @@ ti 471
|
|
51
51
|
ig 467
|
52
52
|
_i 461
|
53
53
|
la 437
|
54
|
-
s_
|
54
|
+
s_ 424
|
55
|
+
._ 419
|
55
56
|
b 412
|
56
57
|
ta 403
|
57
58
|
re 402
|
@@ -64,24 +65,23 @@ oc 379
|
|
64
65
|
_m 376
|
65
66
|
, 376
|
66
67
|
ll 375
|
67
|
-
,_
|
68
|
+
,_ 372
|
68
69
|
_d 364
|
69
70
|
ng 362
|
70
|
-
._ 353
|
71
71
|
er_ 349
|
72
72
|
and 345
|
73
73
|
sk 343
|
74
74
|
na 342
|
75
75
|
om 341
|
76
76
|
at 339
|
77
|
-
_�
|
77
|
+
_� 339
|
78
78
|
al 337
|
79
|
+
_S 336
|
79
80
|
ka 334
|
80
|
-
_S 325
|
81
81
|
i_ 318
|
82
82
|
or 315
|
83
|
-
�r 311
|
84
83
|
är 311
|
84
|
+
�r 311
|
85
85
|
ns 304
|
86
86
|
_e 303
|
87
87
|
tt 298
|
@@ -90,107 +90,108 @@ _oc 288
|
|
90
90
|
ch 287
|
91
91
|
ige 282
|
92
92
|
eri 281
|
93
|
-
ar_
|
93
|
+
ar_ 281
|
94
94
|
ver 277
|
95
95
|
h_ 273
|
96
|
-
�r 272
|
97
96
|
ör 272
|
97
|
+
�r 272
|
98
98
|
ed 271
|
99
99
|
och 269
|
100
100
|
ch_ 269
|
101
101
|
_och 268
|
102
102
|
och_ 267
|
103
|
-
y 266
|
104
|
-
_och_ 266
|
105
103
|
_i_ 266
|
104
|
+
_och_ 266
|
105
|
+
y 266
|
106
106
|
li 260
|
107
|
-
Sv 257
|
108
107
|
ing 257
|
108
|
+
Sv 257
|
109
109
|
Sve 256
|
110
|
-
rig 254
|
111
110
|
me 254
|
111
|
+
rig 254
|
112
112
|
on 254
|
113
113
|
le 252
|
114
114
|
_t 250
|
115
|
-
d_
|
115
|
+
d_ 249
|
116
116
|
_de 247
|
117
117
|
is 246
|
118
|
-
j 245
|
119
118
|
_v 245
|
119
|
+
j 245
|
120
|
+
et_ 244
|
120
121
|
es 243
|
121
|
-
|
122
|
-
|
122
|
+
m_ 242
|
123
|
+
_k 241
|
123
124
|
rige 238
|
124
125
|
nde 237
|
125
126
|
_h 235
|
126
|
-
_k 232
|
127
127
|
_l 230
|
128
128
|
ni 229
|
129
129
|
_p 229
|
130
130
|
il 228
|
131
|
+
erig 226
|
132
|
+
veri 226
|
131
133
|
erige 226
|
132
134
|
verig 226
|
133
135
|
f� 226
|
134
|
-
|
135
|
-
veri 226
|
136
|
-
Sver 224
|
136
|
+
_Sv 225
|
137
137
|
Sveri 224
|
138
|
-
|
138
|
+
_Sve 224
|
139
|
+
Sver 224
|
139
140
|
de_ 219
|
140
|
-
_Sve 219
|
141
|
-
av 217
|
142
141
|
ter 217
|
143
|
-
|
142
|
+
av 217
|
144
143
|
va 212
|
144
|
+
v_ 212
|
145
145
|
da 209
|
146
146
|
nt 206
|
147
|
+
_r 205
|
147
148
|
ne 205
|
148
149
|
ga 204
|
149
150
|
ik 199
|
151
|
+
_Sver 198
|
150
152
|
lan 198
|
151
153
|
r� 196
|
152
154
|
_b 196
|
153
155
|
fö 195
|
154
|
-
_Sver 193
|
155
156
|
g_ 193
|
156
157
|
rn 191
|
157
|
-
l_
|
158
|
+
l_ 191
|
158
159
|
om_ 190
|
159
|
-
_av 187
|
160
160
|
ha 187
|
161
|
+
_av 187
|
161
162
|
se 187
|
162
163
|
av_ 186
|
163
|
-
�n 184
|
164
164
|
än 184
|
165
|
+
�n 184
|
165
166
|
ad 179
|
166
167
|
_ä 178
|
167
168
|
ska 176
|
168
169
|
_me 174
|
169
170
|
_av_ 174
|
170
171
|
_in 173
|
171
|
-
_r 173
|
172
|
-
land 172
|
173
|
-
för 172
|
174
172
|
so 172
|
173
|
+
för 172
|
174
|
+
land 172
|
175
175
|
ol 171
|
176
176
|
it 167
|
177
177
|
sta 166
|
178
|
+
är_ 164
|
178
179
|
_u 164
|
179
180
|
�r_ 164
|
180
|
-
är_ 164
|
181
|
-
kt 163
|
182
181
|
to 163
|
182
|
+
kt 163
|
183
183
|
der 161
|
184
|
+
ra_ 161
|
185
|
+
v� 160
|
184
186
|
ma 160
|
185
187
|
un 160
|
186
|
-
v� 160
|
187
|
-
_ha 159
|
188
188
|
l� 159
|
189
|
+
_ha 159
|
189
190
|
_f� 159
|
190
191
|
tr 158
|
191
192
|
rs 156
|
192
|
-
am 152
|
193
193
|
ag 152
|
194
|
+
am 152
|
194
195
|
_st 151
|
195
196
|
ka_ 151
|
196
197
|
_en 150
|
@@ -198,76 +199,74 @@ era 148
|
|
198
199
|
io 147
|
199
200
|
ro 146
|
200
201
|
�n 143
|
201
|
-
ån 143
|
202
|
-
�_ 143
|
203
202
|
å_ 143
|
204
|
-
|
203
|
+
�_ 143
|
204
|
+
ån 143
|
205
205
|
den 142
|
206
|
-
|
206
|
+
- 142
|
207
207
|
sa 142
|
208
|
+
ts 142
|
208
209
|
_fö 141
|
209
210
|
tt_ 139
|
210
|
-
] 139
|
211
211
|
_är 139
|
212
|
-
[ 139
|
213
|
-
_ti 138
|
214
212
|
ut 138
|
213
|
+
_ti 138
|
215
214
|
_är_ 137
|
215
|
+
med 136
|
216
216
|
ion 136
|
217
217
|
ill 136
|
218
|
-
med 136
|
219
|
-
ge_ 131
|
220
218
|
gen 131
|
221
|
-
|
222
|
-
som 129
|
219
|
+
ge_ 131
|
223
220
|
nin 129
|
224
221
|
ning 129
|
225
|
-
|
222
|
+
som 129
|
226
223
|
rd 128
|
224
|
+
_so 128
|
227
225
|
rna 127
|
228
226
|
be 127
|
229
227
|
gs 126
|
230
228
|
vi 126
|
231
229
|
ko 125
|
232
230
|
ens 124
|
233
|
-
es_ 124
|
234
231
|
_n 124
|
235
|
-
|
232
|
+
es_ 124
|
236
233
|
di 123
|
234
|
+
t� 123
|
237
235
|
lä 123
|
236
|
+
an_ 122
|
238
237
|
til 122
|
238
|
+
rt 122
|
239
239
|
ige_ 122
|
240
|
-
an_ 122
|
241
240
|
vä 122
|
242
|
-
|
241
|
+
rige_ 121
|
242
|
+
rk 121
|
243
243
|
har 121
|
244
244
|
_för 121
|
245
|
-
|
246
|
-
rige_ 121
|
247
|
-
till 119
|
245
|
+
_g 119
|
248
246
|
som_ 119
|
249
|
-
|
247
|
+
till 119
|
250
248
|
_l� 119
|
251
|
-
|
252
|
-
_med 118
|
253
|
-
_har 118
|
249
|
+
as 119
|
254
250
|
ck 118
|
251
|
+
_har 118
|
252
|
+
_med 118
|
255
253
|
ll_ 118
|
256
254
|
_- 117
|
257
255
|
ande 117
|
258
|
-
ska_ 116
|
259
256
|
har_ 116
|
257
|
+
ska_ 116
|
260
258
|
no 115
|
261
|
-
_som 115
|
262
259
|
ds 115
|
263
260
|
dr 115
|
264
|
-
|
265
|
-
_en_ 114
|
261
|
+
_som 115
|
266
262
|
ade 114
|
267
263
|
ke 114
|
268
|
-
|
264
|
+
_en_ 114
|
265
|
+
_har_ 114
|
266
|
+
_re 114
|
269
267
|
na_ 113
|
270
268
|
nn 113
|
269
|
+
pe 113
|
271
270
|
lt 112
|
272
271
|
del 112
|
273
272
|
_till 111
|
@@ -275,126 +274,127 @@ _til 111
|
|
275
274
|
k_ 110
|
276
275
|
fr 109
|
277
276
|
_som_ 109
|
278
|
-
mi 107
|
279
277
|
pr 107
|
278
|
+
mi 107
|
280
279
|
ng_ 106
|
281
280
|
D 106
|
282
281
|
em 105
|
283
|
-
den_ 104
|
284
282
|
ent 104
|
285
283
|
var 104
|
284
|
+
den_ 104
|
286
285
|
gr 103
|
287
|
-
si 102
|
288
286
|
nsk 102
|
289
|
-
|
287
|
+
si 102
|
290
288
|
att 101
|
289
|
+
s� 101
|
291
290
|
m� 101
|
292
291
|
ger 101
|
293
|
-
tio 100
|
294
292
|
ste 100
|
295
293
|
län 100
|
294
|
+
tio 100
|
296
295
|
_lä 99
|
296
|
+
re_ 99
|
297
297
|
ern 99
|
298
298
|
tal 97
|
299
299
|
det 97
|
300
|
-
|
301
|
-
ed_ 96
|
302
|
-
ta_ 96
|
300
|
+
ta_ 97
|
303
301
|
tion 96
|
304
302
|
kr 96
|
305
|
-
|
303
|
+
ed_ 96
|
306
304
|
ten 96
|
305
|
+
_va 96
|
307
306
|
isk 95
|
308
307
|
ill_ 94
|
309
308
|
id 94
|
310
|
-
[_ 93
|
311
309
|
ot 93
|
312
310
|
ks 93
|
313
|
-
_] 93
|
314
|
-
ur 92
|
315
311
|
are 92
|
316
312
|
ss 92
|
317
313
|
sv 92
|
318
314
|
ven 92
|
315
|
+
ur 92
|
316
|
+
_D 91
|
319
317
|
till_ 90
|
320
|
-
ell 89
|
321
|
-
_fr 89
|
322
318
|
ati 89
|
323
|
-
|
324
|
-
|
325
|
-
lla 88
|
319
|
+
_fr 89
|
320
|
+
ell 89
|
326
321
|
rå 88
|
322
|
+
lla 88
|
323
|
+
lig 88
|
324
|
+
med_ 88
|
327
325
|
ld 88
|
328
|
-
_D 87
|
329
326
|
ru 87
|
330
|
-
pp 86
|
331
327
|
�r 86
|
332
328
|
_län 86
|
333
329
|
år 86
|
330
|
+
pp 86
|
334
331
|
gar 85
|
335
|
-
der_ 85
|
336
332
|
ing_ 85
|
337
|
-
|
333
|
+
der_ 85
|
338
334
|
N 83
|
339
335
|
ls 83
|
336
|
+
he 83
|
337
|
+
_med_ 82
|
340
338
|
nder 82
|
341
339
|
p� 82
|
342
|
-
_med_ 82
|
343
|
-
_re 82
|
344
340
|
rl 82
|
345
341
|
up 81
|
346
342
|
one 81
|
347
|
-
ft 80
|
348
343
|
ns_ 80
|
349
|
-
|
344
|
+
ft 80
|
350
345
|
på 80
|
346
|
+
st� 80
|
351
347
|
rin 80
|
352
348
|
t. 79
|
353
349
|
) 78
|
354
|
-
( 78
|
355
350
|
erna 78
|
356
351
|
ner 78
|
357
|
-
|
358
|
-
j� 77
|
359
|
-
eg 77
|
360
|
-
nte 77
|
361
|
-
_den 77
|
352
|
+
( 78
|
362
353
|
E 77
|
354
|
+
pa 77
|
355
|
+
nte 77
|
356
|
+
_på 77
|
357
|
+
eg 77
|
363
358
|
ät 77
|
359
|
+
_den 77
|
360
|
+
_p� 77
|
361
|
+
j� 77
|
364
362
|
�t 77
|
365
|
-
_på 77
|
366
363
|
_( 77
|
367
|
-
pa 77
|
368
364
|
r, 76
|
369
|
-
|
365
|
+
rg 76
|
370
366
|
på_ 76
|
371
367
|
-_ 76
|
372
|
-
|
368
|
+
ie 76
|
369
|
+
r,_ 75
|
373
370
|
h� 75
|
374
|
-
|
371
|
+
det_ 75
|
375
372
|
tor 75
|
376
373
|
rna_ 75
|
377
|
-
|
374
|
+
und 75
|
378
375
|
n. 75
|
379
|
-
r,_ 75
|
380
|
-
nde_ 74
|
381
376
|
F 74
|
382
|
-
|
383
|
-
äl 73
|
384
|
-
_ut 73
|
385
|
-
�l 73
|
377
|
+
nde_ 74
|
386
378
|
_på_ 73
|
379
|
+
ensk 73
|
387
380
|
ges 73
|
381
|
+
ring 73
|
388
382
|
ist 73
|
389
|
-
|
383
|
+
äl 73
|
390
384
|
dra 73
|
391
|
-
|
392
|
-
|
393
|
-
|
385
|
+
_ut 73
|
386
|
+
rä 73
|
387
|
+
�l 73
|
394
388
|
r. 72
|
395
|
-
|
389
|
+
ett 72
|
390
|
+
_vi 72
|
391
|
+
_be 71
|
396
392
|
ms 71
|
397
|
-
ter_ 71
|
398
393
|
gen_ 71
|
399
394
|
sta_ 71
|
400
|
-
|
395
|
+
_-_ 71
|
396
|
+
ter_ 71
|
397
|
+
dig 70
|
398
|
+
riges 70
|
399
|
+
ges_ 70
|
400
|
+
iges 70
|