scylla 0.7.5 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/scylla/classifier.rb +1 -1
- data/scylla.gemspec +1 -1
- data/test/fixtures/lms/danish.lm +168 -168
- data/test/fixtures/lms/english.lm +217 -217
- data/test/fixtures/lms/french.lm +216 -216
- data/test/fixtures/lms/german.lm +274 -274
- data/test/fixtures/lms/hindi.lm +241 -241
- data/test/fixtures/lms/italian.lm +280 -280
- data/test/fixtures/lms/japanese.lm +110 -110
- data/test/fixtures/lms/norwegian.lm +239 -239
- data/test/fixtures/lms/spanish.lm +188 -188
- data/test/generator_test.rb +1 -1
- metadata +4 -4
data/lib/scylla/classifier.rb
CHANGED
|
@@ -5,7 +5,7 @@ module Scylla
|
|
|
5
5
|
# limit : Up to how many matching language results should be displayed
|
|
6
6
|
# ngrams : The total number of ngrams that are stored for each language
|
|
7
7
|
# threshold: The threshold score for matches
|
|
8
|
-
def initialize(limit = 10, ngrams = 400, threshold = 1.
|
|
8
|
+
def initialize(limit = 10, ngrams = 400, threshold = 1.04)
|
|
9
9
|
@limit = limit
|
|
10
10
|
@ngrams = ngrams
|
|
11
11
|
@threshold = threshold
|
data/scylla.gemspec
CHANGED
data/test/fixtures/lms/danish.lm
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
_
|
|
1
|
+
_ 14426
|
|
2
2
|
e 5759
|
|
3
3
|
r 3334
|
|
4
4
|
n 3061
|
|
@@ -12,389 +12,389 @@ l 1884
|
|
|
12
12
|
g 1617
|
|
13
13
|
k 1278
|
|
14
14
|
m 1273
|
|
15
|
-
er
|
|
16
|
-
e_
|
|
17
|
-
__ 1181
|
|
15
|
+
er 1209
|
|
16
|
+
e_ 1202
|
|
18
17
|
de 1045
|
|
19
18
|
en 993
|
|
20
19
|
� 940
|
|
21
20
|
f 939
|
|
22
|
-
r_
|
|
21
|
+
r_ 854
|
|
23
22
|
v 770
|
|
24
|
-
t_
|
|
23
|
+
t_ 738
|
|
24
|
+
n_ 725
|
|
25
25
|
an 724
|
|
26
|
-
n_ 716
|
|
27
26
|
u 605
|
|
28
27
|
nd 598
|
|
29
28
|
b 585
|
|
30
29
|
et 574
|
|
31
|
-
_s
|
|
30
|
+
_s 562
|
|
32
31
|
. 546
|
|
32
|
+
er_ 545
|
|
33
33
|
re 543
|
|
34
|
-
er_ 541
|
|
35
34
|
te 540
|
|
36
|
-
en_
|
|
35
|
+
en_ 536
|
|
37
36
|
st 530
|
|
38
|
-
g_
|
|
37
|
+
g_ 523
|
|
38
|
+
._ 513
|
|
39
39
|
_o 503
|
|
40
|
-
_d
|
|
40
|
+
_d 488
|
|
41
41
|
, 480
|
|
42
|
-
,_
|
|
43
|
-
h 474
|
|
42
|
+
,_ 479
|
|
44
43
|
ge 474
|
|
44
|
+
h 474
|
|
45
45
|
_a 463
|
|
46
46
|
in 440
|
|
47
47
|
p 436
|
|
48
|
-
_f
|
|
48
|
+
_f 432
|
|
49
49
|
ar 430
|
|
50
50
|
og 415
|
|
51
51
|
or 411
|
|
52
52
|
ti 406
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
_m 382
|
|
57
|
-
_i 381
|
|
53
|
+
_e 406
|
|
54
|
+
et_ 401
|
|
55
|
+
_m 384
|
|
58
56
|
ed 381
|
|
59
|
-
|
|
57
|
+
_i 381
|
|
60
58
|
sk 365
|
|
59
|
+
ne 365
|
|
61
60
|
le 353
|
|
62
61
|
_og 346
|
|
63
|
-
ke
|
|
62
|
+
ke 343
|
|
64
63
|
el 342
|
|
64
|
+
d_ 338
|
|
65
65
|
og_ 338
|
|
66
|
-
ø 332
|
|
67
66
|
� 332
|
|
68
|
-
|
|
67
|
+
ø 332
|
|
69
68
|
me 324
|
|
70
69
|
ng 317
|
|
71
|
-
_og_ 315
|
|
72
70
|
_de 315
|
|
73
|
-
|
|
71
|
+
_og_ 315
|
|
74
72
|
ig 293
|
|
75
73
|
å 293
|
|
74
|
+
� 293
|
|
76
75
|
_b 290
|
|
76
|
+
i_ 287
|
|
77
77
|
æ 286
|
|
78
78
|
� 286
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
de_ 275
|
|
79
|
+
s_ 278
|
|
80
|
+
de_ 276
|
|
82
81
|
_h 274
|
|
83
82
|
ri 273
|
|
84
83
|
D 268
|
|
84
|
+
_D 267
|
|
85
85
|
nde 263
|
|
86
|
-
li 262
|
|
87
86
|
om 262
|
|
87
|
+
li 262
|
|
88
88
|
ma 259
|
|
89
|
+
_t 259
|
|
89
90
|
ve 257
|
|
90
|
-
_t 256
|
|
91
91
|
y 256
|
|
92
|
-
af 254
|
|
93
92
|
_i_ 254
|
|
93
|
+
af 254
|
|
94
94
|
at 245
|
|
95
95
|
il 244
|
|
96
|
-
es 241
|
|
97
96
|
and 241
|
|
98
|
-
|
|
97
|
+
es 240
|
|
99
98
|
al 234
|
|
99
|
+
be 234
|
|
100
100
|
is 233
|
|
101
|
-
fo 232
|
|
102
101
|
se 232
|
|
102
|
+
fo 232
|
|
103
103
|
ns 229
|
|
104
|
-
_D 226
|
|
105
104
|
la 224
|
|
106
105
|
on 221
|
|
107
106
|
rk 219
|
|
108
107
|
_af 217
|
|
109
108
|
den 216
|
|
110
109
|
der 214
|
|
111
|
-
_k
|
|
112
|
-
|
|
113
|
-
|
|
110
|
+
_k 213
|
|
111
|
+
k_ 212
|
|
112
|
+
_me 212
|
|
113
|
+
m_ 210
|
|
114
114
|
ing 207
|
|
115
115
|
_v 203
|
|
116
|
-
k_ 202
|
|
117
116
|
ra 191
|
|
118
117
|
f_ 188
|
|
119
118
|
af_ 186
|
|
119
|
+
l_ 186
|
|
120
|
+
_p 186
|
|
120
121
|
for 184
|
|
121
|
-
_p 184
|
|
122
|
-
l_ 183
|
|
123
122
|
_af_ 180
|
|
124
123
|
ol 174
|
|
125
|
-
ere 173
|
|
126
|
-
ark 172
|
|
127
124
|
_fo 172
|
|
125
|
+
ark 172
|
|
126
|
+
ere 172
|
|
128
127
|
lan 169
|
|
129
128
|
ste 169
|
|
130
129
|
te_ 168
|
|
131
|
-
_l
|
|
130
|
+
_l 168
|
|
132
131
|
mar 165
|
|
133
132
|
ll 162
|
|
134
133
|
ter 161
|
|
135
|
-
ske
|
|
134
|
+
ske 159
|
|
136
135
|
j 159
|
|
137
|
-
ke_
|
|
136
|
+
ke_ 157
|
|
138
137
|
om_ 155
|
|
139
|
-
|
|
138
|
+
land 153
|
|
139
|
+
ed_ 153
|
|
140
140
|
Da 153
|
|
141
|
+
_Da 153
|
|
141
142
|
mark 153
|
|
143
|
+
ha 153
|
|
142
144
|
den_ 153
|
|
143
|
-
|
|
144
|
-
_st 151
|
|
145
|
+
_st 153
|
|
145
146
|
ni 151
|
|
146
|
-
ed_ 151
|
|
147
|
-
_for 149
|
|
148
147
|
so 149
|
|
149
|
-
|
|
148
|
+
_for 149
|
|
150
149
|
Dan 148
|
|
150
|
+
_Dan 148
|
|
151
|
+
ta 148
|
|
151
152
|
ger 147
|
|
152
|
-
_er
|
|
153
|
+
_er 146
|
|
154
|
+
re_ 144
|
|
153
155
|
nge 144
|
|
154
156
|
det 143
|
|
155
|
-
re_ 141
|
|
156
157
|
ede 139
|
|
157
|
-
nma 138
|
|
158
158
|
nm 138
|
|
159
|
+
nma 138
|
|
159
160
|
vi 138
|
|
160
161
|
nmar 137
|
|
161
162
|
_en 137
|
|
162
163
|
nmark 137
|
|
163
|
-
anmar 136
|
|
164
|
-
anm 136
|
|
165
|
-
_Da 136
|
|
166
164
|
anma 136
|
|
165
|
+
anm 136
|
|
166
|
+
anmar 136
|
|
167
167
|
ev 135
|
|
168
168
|
rs 135
|
|
169
169
|
der_ 134
|
|
170
|
-
Danm 133
|
|
171
170
|
S 133
|
|
172
|
-
Danma 133
|
|
173
171
|
un 133
|
|
174
|
-
|
|
172
|
+
Danma 133
|
|
173
|
+
Danm 133
|
|
174
|
+
_Danm 133
|
|
175
175
|
ans 132
|
|
176
|
-
med 130
|
|
177
176
|
da 130
|
|
178
|
-
|
|
177
|
+
med 130
|
|
179
178
|
io 127
|
|
179
|
+
ro 127
|
|
180
|
+
_er_ 126
|
|
180
181
|
til 126
|
|
181
|
-
_er_ 125
|
|
182
182
|
ik 125
|
|
183
183
|
som 125
|
|
184
184
|
_ti 124
|
|
185
185
|
rn 124
|
|
186
|
-
em 123
|
|
187
|
-
ds 123
|
|
188
|
-
�_ 123
|
|
189
186
|
å_ 123
|
|
187
|
+
�_ 123
|
|
188
|
+
ds 123
|
|
189
|
+
em 122
|
|
190
190
|
_u 122
|
|
191
191
|
eg 121
|
|
192
|
-
_Danm 121
|
|
193
|
-
_ha 120
|
|
194
192
|
rt 120
|
|
195
|
-
|
|
196
|
-
|
|
193
|
+
_ha 120
|
|
194
|
+
_r 120
|
|
195
|
+
_so 120
|
|
197
196
|
ld 119
|
|
197
|
+
_med 119
|
|
198
198
|
_g 118
|
|
199
199
|
som_ 118
|
|
200
|
+
_S 118
|
|
200
201
|
to 117
|
|
201
202
|
ske_ 116
|
|
203
|
+
det_ 116
|
|
202
204
|
_som 116
|
|
203
|
-
det_ 115
|
|
204
|
-
_r 115
|
|
205
|
-
end 114
|
|
206
|
-
ern 114
|
|
207
205
|
ar_ 114
|
|
206
|
+
ern 114
|
|
207
|
+
end 114
|
|
208
208
|
tr 114
|
|
209
209
|
_som_ 113
|
|
210
|
-
id 111
|
|
211
|
-
ud 111
|
|
212
210
|
ko 111
|
|
211
|
+
ud 111
|
|
212
|
+
id 111
|
|
213
213
|
del 110
|
|
214
214
|
_til 109
|
|
215
|
-
va 108
|
|
216
215
|
nsk 108
|
|
217
216
|
mi 108
|
|
218
|
-
si 108
|
|
219
217
|
lig 108
|
|
218
|
+
va 108
|
|
219
|
+
si 108
|
|
220
220
|
_be 107
|
|
221
221
|
ls 107
|
|
222
|
+
ne_ 106
|
|
223
|
+
_� 106
|
|
224
|
+
_da 105
|
|
225
|
+
ka 105
|
|
222
226
|
ion 105
|
|
223
227
|
ind 105
|
|
224
|
-
_da 105
|
|
225
|
-
�r 105
|
|
226
|
-
bl 105
|
|
227
228
|
gs 105
|
|
229
|
+
bl 105
|
|
230
|
+
�r 105
|
|
228
231
|
ør 105
|
|
229
|
-
ka 105
|
|
230
|
-
_S 104
|
|
231
232
|
lle 104
|
|
232
|
-
|
|
233
|
-
|
|
233
|
+
med_ 104
|
|
234
|
+
_. 104
|
|
234
235
|
t� 103
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
tt 102
|
|
238
|
-
_en_ 101
|
|
236
|
+
_._ 103
|
|
237
|
+
tt 101
|
|
239
238
|
ag 101
|
|
240
|
-
|
|
239
|
+
_en_ 101
|
|
240
|
+
dt 101
|
|
241
241
|
c 100
|
|
242
|
+
r� 100
|
|
243
|
+
nt 99
|
|
244
|
+
dan 99
|
|
242
245
|
ansk 99
|
|
243
246
|
ie 99
|
|
244
|
-
dan 99
|
|
245
|
-
nt 99
|
|
246
247
|
_med_ 98
|
|
247
248
|
or_ 97
|
|
248
|
-
il_
|
|
249
|
+
il_ 97
|
|
250
|
+
_De 95
|
|
249
251
|
De 95
|
|
250
252
|
�r 95
|
|
251
253
|
ær 95
|
|
252
|
-
rne 94
|
|
253
|
-
lt 94
|
|
254
|
-
ning 94
|
|
255
254
|
na 94
|
|
255
|
+
lt 94
|
|
256
256
|
ner 94
|
|
257
|
+
rne 94
|
|
258
|
+
ning 94
|
|
257
259
|
nin 94
|
|
258
|
-
|
|
260
|
+
v_ 94
|
|
261
|
+
til_ 93
|
|
259
262
|
fr 92
|
|
260
|
-
I 92
|
|
261
263
|
at_ 92
|
|
264
|
+
I 92
|
|
265
|
+
ge_ 91
|
|
262
266
|
op 91
|
|
263
267
|
ru 91
|
|
264
|
-
|
|
268
|
+
ng_ 90
|
|
269
|
+
a_ 90
|
|
265
270
|
_dan 90
|
|
271
|
+
_I 90
|
|
266
272
|
erne 89
|
|
267
|
-
|
|
273
|
+
_bl 89
|
|
268
274
|
rd 89
|
|
269
|
-
ng_ 89
|
|
270
275
|
ige 89
|
|
271
|
-
|
|
272
|
-
_. 88
|
|
273
|
-
a_ 88
|
|
276
|
+
_re 89
|
|
274
277
|
gt 88
|
|
278
|
+
inge 87
|
|
279
|
+
tte 87
|
|
275
280
|
p� 87
|
|
276
281
|
kr 87
|
|
277
282
|
_den 87
|
|
278
|
-
|
|
279
|
-
inge 87
|
|
283
|
+
rk_ 86
|
|
280
284
|
s� 86
|
|
285
|
+
dansk 86
|
|
281
286
|
men 86
|
|
282
287
|
dans 86
|
|
283
|
-
dansk 86
|
|
284
|
-
_til_ 85
|
|
285
|
-
isk 85
|
|
286
288
|
ver 85
|
|
287
|
-
|
|
289
|
+
isk 85
|
|
288
290
|
it 85
|
|
289
|
-
|
|
291
|
+
_til_ 85
|
|
292
|
+
am 84
|
|
293
|
+
f� 84
|
|
290
294
|
_dans 84
|
|
295
|
+
es_ 84
|
|
291
296
|
els 84
|
|
292
297
|
_at 84
|
|
293
|
-
am 84
|
|
294
|
-
f� 84
|
|
295
|
-
_n 83
|
|
296
298
|
est 83
|
|
297
|
-
|
|
299
|
+
_n 83
|
|
298
300
|
_den_ 82
|
|
301
|
+
ur 82
|
|
299
302
|
gen 82
|
|
303
|
+
ark_ 81
|
|
304
|
+
_ud 81
|
|
300
305
|
he 81
|
|
306
|
+
sk_ 81
|
|
301
307
|
_at_ 81
|
|
302
|
-
_ud 81
|
|
303
308
|
ble 80
|
|
304
309
|
ene 80
|
|
305
|
-
rk_ 80
|
|
306
310
|
ede_ 79
|
|
307
|
-
e. 79
|
|
308
|
-
ande 79
|
|
309
|
-
und 79
|
|
310
311
|
od 79
|
|
312
|
+
ande 79
|
|
311
313
|
sa 79
|
|
312
|
-
|
|
314
|
+
und 79
|
|
313
315
|
_in 78
|
|
316
|
+
_la 78
|
|
314
317
|
nde_ 78
|
|
315
318
|
_fr 78
|
|
316
|
-
_la 78
|
|
317
319
|
eri 78
|
|
318
|
-
sk_ 77
|
|
319
320
|
ov 77
|
|
320
321
|
ende 77
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
_si 76
|
|
322
|
+
_si 77
|
|
323
|
+
ing_ 77
|
|
324
|
+
r. 76
|
|
325
325
|
tor 76
|
|
326
|
-
|
|
326
|
+
mark_ 76
|
|
327
327
|
av 75
|
|
328
|
+
nd_ 75
|
|
328
329
|
lev 75
|
|
330
|
+
r._ 75
|
|
329
331
|
an_ 75
|
|
330
|
-
|
|
331
|
-
mm 74
|
|
332
|
+
lk 75
|
|
332
333
|
ft 74
|
|
333
|
-
|
|
334
|
+
mm 73
|
|
335
|
+
us 73
|
|
334
336
|
på 73
|
|
337
|
+
e. 73
|
|
335
338
|
F 73
|
|
336
|
-
|
|
339
|
+
di 72
|
|
340
|
+
E 72
|
|
337
341
|
le_ 72
|
|
338
|
-
rin 72
|
|
339
342
|
e,_ 72
|
|
343
|
+
rin 72
|
|
340
344
|
e, 72
|
|
341
|
-
di 72
|
|
342
|
-
nd_ 72
|
|
343
|
-
E 72
|
|
344
|
-
_der 71
|
|
345
|
-
mark_ 71
|
|
346
345
|
_på 71
|
|
347
346
|
nske 71
|
|
348
347
|
_lan 71
|
|
348
|
+
_der 71
|
|
349
349
|
_p� 71
|
|
350
|
-
|
|
350
|
+
e._ 71
|
|
351
351
|
get 70
|
|
352
|
+
st� 70
|
|
352
353
|
gi 70
|
|
353
|
-
ks 69
|
|
354
|
-
ist 69
|
|
355
354
|
pr 69
|
|
355
|
+
_,_ 69
|
|
356
|
+
ist 69
|
|
357
|
+
_, 69
|
|
358
|
+
_E 69
|
|
356
359
|
_blev 68
|
|
357
|
-
|
|
358
|
-
blev 68
|
|
360
|
+
ks 68
|
|
359
361
|
var 68
|
|
360
|
-
|
|
362
|
+
blev 68
|
|
363
|
+
_ble 68
|
|
364
|
+
_va 67
|
|
361
365
|
på_ 67
|
|
362
366
|
anske 67
|
|
363
|
-
|
|
364
|
-
fi 66
|
|
365
|
-
ati 66
|
|
367
|
+
ss 67
|
|
366
368
|
tio 66
|
|
367
369
|
lse 66
|
|
370
|
+
år 66
|
|
368
371
|
tion 66
|
|
369
|
-
|
|
370
|
-
gr 66
|
|
372
|
+
fi 66
|
|
371
373
|
�r 66
|
|
372
|
-
|
|
374
|
+
_F 66
|
|
375
|
+
gr 66
|
|
373
376
|
_land 66
|
|
374
|
-
|
|
375
|
-
|
|
377
|
+
_på_ 66
|
|
378
|
+
ati 66
|
|
376
379
|
ef 65
|
|
380
|
+
one 65
|
|
381
|
+
sto 65
|
|
377
382
|
kt 65
|
|
378
|
-
ev_ 64
|
|
379
|
-
sen 64
|
|
380
383
|
else 64
|
|
381
|
-
|
|
382
|
-
|
|
384
|
+
sen 64
|
|
385
|
+
ev_ 64
|
|
383
386
|
ren 63
|
|
384
387
|
ende_ 63
|
|
385
|
-
|
|
386
|
-
|
|
388
|
+
A 63
|
|
389
|
+
ig_ 63
|
|
390
|
+
ring 63
|
|
387
391
|
for_ 62
|
|
392
|
+
ho 62
|
|
388
393
|
_ko 62
|
|
389
|
-
|
|
390
|
-
n, 61
|
|
391
|
-
ste_ 61
|
|
394
|
+
ere_ 61
|
|
392
395
|
rig 61
|
|
393
|
-
|
|
396
|
+
ste_ 61
|
|
394
397
|
dr 61
|
|
395
398
|
ret 60
|
|
396
|
-
lev_ 60
|
|
397
399
|
blev_ 60
|
|
398
|
-
|
|
399
|
-
_E 60
|
|
400
|
-
nger 59
|
|
400
|
+
lev_ 60
|