scylla 0.5.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +4 -2
- data/Gemfile.lock +16 -1
- data/lib/scylla/classifier.rb +1 -1
- data/lib/scylla/generator.rb +16 -4
- data/lib/scylla/lms/afrikaans.lm +232 -232
- data/lib/scylla/lms/arabic.lm +175 -175
- data/lib/scylla/lms/bulgarian.lm +225 -225
- data/lib/scylla/lms/catalan.lm +309 -309
- data/lib/scylla/lms/danish.lm +167 -167
- data/lib/scylla/lms/english.lm +398 -398
- data/lib/scylla/lms/finnish.lm +237 -237
- data/lib/scylla/lms/french.lm +148 -148
- data/lib/scylla/lms/german.lm +258 -258
- data/lib/scylla/lms/greek.lm +236 -236
- data/lib/scylla/lms/hebrew.lm +154 -154
- data/lib/scylla/lms/hindi.lm +139 -139
- data/lib/scylla/lms/icelandic.lm +239 -239
- data/lib/scylla/lms/indonesian.lm +244 -244
- data/lib/scylla/lms/italian.lm +248 -248
- data/lib/scylla/lms/japanese.lm +90 -90
- data/lib/scylla/lms/korean.lm +306 -306
- data/lib/scylla/lms/norwegian.lm +193 -193
- data/lib/scylla/lms/polish.lm +241 -241
- data/lib/scylla/lms/portuguese.lm +232 -232
- data/lib/scylla/lms/romanian.lm +246 -246
- data/lib/scylla/lms/slovak.lm +242 -242
- data/lib/scylla/lms/slovenian.lm +229 -229
- data/lib/scylla/lms/spanish.lm +164 -164
- data/lib/scylla/lms/swedish.lm +157 -157
- data/lib/scylla/lms/tagalog.lm +247 -247
- data/lib/scylla/lms/thai.lm +252 -252
- data/lib/scylla/lms/turkish.lm +285 -285
- data/lib/scylla/lms/vietnamese.lm +250 -250
- data/lib/scylla/lms/welsh.lm +248 -248
- data/lib/scylla/resources.rb +1 -9
- data/lib/scylla.rb +4 -0
- data/scylla.gemspec +2 -120
- data/source_texts/english.txt +62 -27
- data/test/classifier_test.rb +1 -3
- data/test/fixtures/lms/danish.lm +173 -173
- data/test/fixtures/lms/english.lm +220 -220
- data/test/fixtures/lms/french.lm +175 -175
- data/test/fixtures/lms/german.lm +254 -254
- data/test/fixtures/lms/hindi.lm +139 -139
- data/test/fixtures/lms/italian.lm +236 -236
- data/test/fixtures/lms/japanese.lm +88 -88
- data/test/fixtures/lms/norwegian.lm +182 -182
- data/test/fixtures/lms/spanish.lm +164 -164
- data/test/fixtures/test_languages/spanish +0 -1
- data/test/generator_test.rb +13 -0
- data/test/helper.rb +2 -0
- metadata +18 -25
- data/.document +0 -5
- data/lib/scylla/lms/13375P33K.lm +0 -400
- data/scylla-0.1.0.gem +0 -0
- data/source_texts/13375P33K.txt +0 -199
- data/test/fixtures/lms/13375p33k.lm +0 -400
- data/test/fixtures/source_texts/13375P33K.txt +0 -199
data/lib/scylla/lms/tagalog.lm
CHANGED
@@ -1,14 +1,14 @@
|
|
1
|
-
_
|
1
|
+
_ 1354
|
2
2
|
a 856
|
3
3
|
n 460
|
4
4
|
i 336
|
5
5
|
g 302
|
6
6
|
ng 210
|
7
7
|
t 183
|
8
|
-
g_
|
8
|
+
g_ 176
|
9
9
|
o 173
|
10
|
-
ng_
|
11
|
-
a_
|
10
|
+
ng_ 170
|
11
|
+
a_ 159
|
12
12
|
k 155
|
13
13
|
an 155
|
14
14
|
s 150
|
@@ -21,380 +21,380 @@ u 103
|
|
21
21
|
na 101
|
22
22
|
ang 98
|
23
23
|
. 85
|
24
|
-
ang_
|
24
|
+
ang_ 84
|
25
25
|
la 80
|
26
26
|
in 79
|
27
27
|
_a 77
|
28
28
|
r 77
|
29
29
|
b 72
|
30
30
|
ka 72
|
31
|
-
sa 67
|
32
31
|
ak 67
|
32
|
+
sa 67
|
33
33
|
pa 65
|
34
34
|
_m 65
|
35
|
-
ma 64
|
36
35
|
_s 64
|
36
|
+
ma 64
|
37
37
|
h 63
|
38
38
|
_na 61
|
39
|
+
at 57
|
39
40
|
d 57
|
40
41
|
al 57
|
41
|
-
at 54
|
42
|
-
_p 52
|
43
42
|
ag 52
|
43
|
+
_p 52
|
44
44
|
ta 49
|
45
|
-
|
45
|
+
ay 46
|
46
46
|
ni 45
|
47
|
+
_k 45
|
47
48
|
t_ 44
|
48
49
|
_sa 43
|
49
|
-
" 42
|
50
50
|
ya 42
|
51
|
-
_ng 42
|
52
51
|
n_ 42
|
53
|
-
|
52
|
+
_ng 42
|
54
53
|
it 40
|
55
54
|
_pa 40
|
56
|
-
|
57
|
-
e 39
|
55
|
+
o_ 39
|
58
56
|
on 39
|
57
|
+
e 39
|
58
|
+
ga 39
|
59
|
+
y_ 38
|
59
60
|
sa_ 38
|
60
|
-
y_ 37
|
61
61
|
iy 36
|
62
62
|
il 36
|
63
|
-
o_ 35
|
64
|
-
ala 35
|
65
63
|
w 35
|
66
|
-
|
64
|
+
ala 35
|
67
65
|
am 34
|
66
|
+
_ng_ 34
|
68
67
|
._ 33
|
69
68
|
_sa_ 33
|
70
|
-
i_ 32
|
71
|
-
_ma 32
|
72
69
|
na_ 32
|
70
|
+
_ma 32
|
71
|
+
i_ 32
|
72
|
+
, 31
|
73
|
+
,_ 31
|
73
74
|
_an 31
|
74
75
|
ra 31
|
75
|
-
, 31
|
76
76
|
ba 30
|
77
|
-
_t 29
|
78
|
-
iya 29
|
79
77
|
_ang_ 29
|
78
|
+
ar 29
|
79
|
+
ay_ 29
|
80
80
|
_ni 29
|
81
81
|
_ang 29
|
82
|
-
|
82
|
+
iya 29
|
83
|
+
_t 29
|
83
84
|
_i 28
|
84
85
|
_ka 28
|
85
86
|
_l 27
|
86
|
-
|
87
|
-
|
87
|
+
li 27
|
88
|
+
is 27
|
88
89
|
hi 27
|
89
|
-
ong 27
|
90
90
|
ha 27
|
91
|
-
|
91
|
+
ong 27
|
92
92
|
aka 27
|
93
|
-
|
94
|
-
ko 26
|
95
|
-
gi 26
|
93
|
+
as 27
|
96
94
|
_na_ 26
|
97
|
-
|
95
|
+
gi 26
|
96
|
+
ko 26
|
98
97
|
ap 25
|
99
98
|
ri 25
|
100
|
-
|
101
|
-
|
99
|
+
to 25
|
100
|
+
ah 24
|
102
101
|
ong_ 24
|
103
102
|
un 24
|
104
|
-
|
103
|
+
lan 24
|
104
|
+
di 23
|
105
|
+
wa 23
|
105
106
|
ata 23
|
106
|
-
|
107
|
+
at_ 23
|
107
108
|
um 23
|
108
|
-
o. 23
|
109
|
-
di 23
|
110
109
|
si 23
|
111
|
-
|
110
|
+
o. 23
|
111
|
+
s_ 23
|
112
|
+
ing 23
|
112
113
|
ti 22
|
113
|
-
s_ 22
|
114
114
|
ki 22
|
115
|
-
niy 21
|
116
|
-
ab 21
|
117
|
-
_d 21
|
118
|
-
niya 21
|
119
|
-
_niy 21
|
120
115
|
mo 21
|
121
|
-
an_ 21
|
122
116
|
_niya 21
|
123
|
-
|
124
|
-
|
117
|
+
_niy 21
|
118
|
+
niya 21
|
119
|
+
_d 21
|
120
|
+
an_ 21
|
121
|
+
niy 21
|
122
|
+
ab 21
|
125
123
|
N 20
|
124
|
+
a. 20
|
125
|
+
aw 19
|
126
126
|
yo 19
|
127
127
|
ila 19
|
128
|
-
- 19
|
129
128
|
ot 19
|
130
|
-
|
131
|
-
_mo 18
|
132
|
-
nga 18
|
133
|
-
_ak 18
|
129
|
+
- 19
|
134
130
|
ig 18
|
131
|
+
_mo 18
|
135
132
|
A 18
|
133
|
+
_ak 18
|
134
|
+
nga 18
|
136
135
|
lang 18
|
137
|
-
nag 17
|
138
|
-
ama 17
|
139
136
|
tu 17
|
137
|
+
ama 17
|
138
|
+
nag 17
|
139
|
+
_b 17
|
140
|
+
in_ 16
|
140
141
|
ina 16
|
141
142
|
P 16
|
142
|
-
in_ 16
|
143
143
|
ali 16
|
144
|
-
"_ 16
|
145
|
-
ara 16
|
146
|
-
_b 16
|
147
144
|
aki 16
|
148
|
-
|
145
|
+
ara 16
|
146
|
+
ing_ 15
|
147
|
+
ya_ 15
|
148
|
+
_si 15
|
149
149
|
lang_ 15
|
150
|
+
o._ 15
|
150
151
|
_at 15
|
151
|
-
|
152
|
-
ing_ 15
|
153
|
-
o._ 14
|
152
|
+
ga_ 14
|
154
153
|
ai 14
|
155
|
-
ib 14
|
156
|
-
nd 14
|
157
|
-
aa 14
|
158
|
-
_r 14
|
159
154
|
da 14
|
155
|
+
_h 14
|
156
|
+
_r 14
|
160
157
|
bi 14
|
161
|
-
|
158
|
+
iya_ 14
|
159
|
+
aa 14
|
160
|
+
nd 14
|
161
|
+
ib 14
|
162
|
+
ik 13
|
162
163
|
ro 13
|
164
|
+
ako 13
|
165
|
+
lo 13
|
163
166
|
_at_ 13
|
164
|
-
hin 13
|
165
167
|
ito 13
|
166
|
-
|
167
|
-
|
168
|
-
ako 13
|
169
|
-
ya_ 13
|
170
|
-
_h 13
|
171
|
-
iya_ 13
|
172
|
-
la_ 12
|
173
|
-
rin 12
|
168
|
+
hin 13
|
169
|
+
_ta 12
|
174
170
|
Na 12
|
171
|
+
rin 12
|
172
|
+
mag 12
|
173
|
+
a, 12
|
175
174
|
K 12
|
175
|
+
_la 12
|
176
|
+
ahi 12
|
176
177
|
ul 12
|
177
178
|
S 12
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
mag 12
|
183
|
-
k_ 11
|
184
|
-
pan 11
|
179
|
+
a,_ 12
|
180
|
+
la_ 12
|
181
|
+
man 11
|
182
|
+
go 11
|
185
183
|
_tu 11
|
186
|
-
mi 11
|
187
|
-
a,_ 11
|
188
184
|
asa 11
|
189
|
-
|
185
|
+
_di 11
|
190
186
|
apa 11
|
191
|
-
|
192
|
-
_mag 11
|
193
|
-
'y 11
|
194
|
-
'y_ 11
|
187
|
+
M 11
|
195
188
|
gk 11
|
196
189
|
lu 11
|
190
|
+
k_ 11
|
191
|
+
kin 11
|
197
192
|
_ri 11
|
198
|
-
|
199
|
-
|
200
|
-
|
193
|
+
_mag 11
|
194
|
+
ilan 11
|
195
|
+
mi 11
|
196
|
+
pan 11
|
201
197
|
ung 11
|
202
|
-
|
203
|
-
|
198
|
+
agk 10
|
199
|
+
akin 10
|
200
|
+
mg 10
|
201
|
+
Ma 10
|
204
202
|
mga 10
|
205
|
-
|
206
|
-
wal 10
|
207
|
-
nak 10
|
208
|
-
_N 10
|
209
|
-
awa 10
|
203
|
+
_K 10
|
210
204
|
ot_ 10
|
211
|
-
|
212
|
-
|
205
|
+
awa 10
|
206
|
+
ir 10
|
213
207
|
pa_ 10
|
214
|
-
|
215
|
-
|
216
|
-
|
208
|
+
uma 10
|
209
|
+
ilang 10
|
210
|
+
wal 10
|
217
211
|
mga_ 10
|
218
|
-
|
212
|
+
is_ 10
|
213
|
+
yon 10
|
219
214
|
yan 10
|
220
|
-
.. 10
|
221
|
-
ir 10
|
222
|
-
Ka 10
|
223
|
-
akin 10
|
224
215
|
wala 10
|
225
216
|
ari 10
|
226
|
-
ilang 10
|
227
|
-
pi 10
|
228
|
-
uma 10
|
229
217
|
I 10
|
230
|
-
|
231
|
-
|
218
|
+
pi 10
|
219
|
+
Ka 10
|
220
|
+
mu 10
|
221
|
+
_is 10
|
222
|
+
nak 10
|
223
|
+
.. 10
|
224
|
+
su 10
|
225
|
+
ini 9
|
232
226
|
it_ 9
|
233
227
|
n, 9
|
234
|
-
|
235
|
-
|
236
|
-
_aki 9
|
237
|
-
agka 9
|
228
|
+
n,_ 9
|
229
|
+
_N 9
|
238
230
|
n. 9
|
239
|
-
pag 9
|
240
231
|
san 9
|
241
232
|
no 9
|
242
|
-
isa 9
|
243
|
-
nan 9
|
244
|
-
_mga 9
|
245
233
|
_mg 9
|
234
|
+
_Ka 9
|
235
|
+
_mga_ 9
|
236
|
+
_ba 9
|
246
237
|
pu 9
|
247
|
-
|
238
|
+
nan 9
|
239
|
+
isa 9
|
240
|
+
pag 9
|
241
|
+
aba 9
|
248
242
|
_A 9
|
249
|
-
|
250
|
-
|
243
|
+
_P 9
|
244
|
+
_aki 9
|
245
|
+
_akin 9
|
246
|
+
agka 9
|
247
|
+
gka 9
|
248
|
+
alan 9
|
251
249
|
An 9
|
252
|
-
|
253
|
-
_Ka 9
|
254
|
-
aha 8
|
255
|
-
aman 8
|
256
|
-
p_ 8
|
257
|
-
ob 8
|
258
|
-
_lu 8
|
259
|
-
_ba 8
|
250
|
+
_mga 9
|
260
251
|
ayo 8
|
252
|
+
igi 8
|
261
253
|
si_ 8
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
254
|
+
Pa 8
|
255
|
+
p_ 8
|
256
|
+
naka 8
|
257
|
+
.n 8
|
258
|
+
_wala 8
|
259
|
+
aha 8
|
260
|
+
_nag 8
|
268
261
|
niya_ 8
|
269
262
|
yang 8
|
270
|
-
as_ 8
|
271
263
|
mo_ 8
|
272
|
-
aga 8
|
273
|
-
_nag 8
|
274
|
-
aking 8
|
275
|
-
tan 8
|
276
|
-
tay 8
|
277
|
-
dib 8
|
278
|
-
a' 8
|
279
|
-
abi 8
|
280
|
-
l_ 8
|
281
|
-
_wala 8
|
282
|
-
naka 8
|
283
|
-
_wal 8
|
284
|
-
alang 8
|
285
264
|
t. 8
|
286
|
-
|
287
|
-
|
265
|
+
aman 8
|
266
|
+
abi 8
|
267
|
+
_wa 8
|
288
268
|
ngi 8
|
289
269
|
_ako 8
|
290
|
-
|
291
|
-
|
270
|
+
l_ 8
|
271
|
+
king 8
|
272
|
+
di_ 8
|
273
|
+
tay 8
|
274
|
+
aking 8
|
275
|
+
dib 8
|
276
|
+
umi 8
|
277
|
+
_w 8
|
278
|
+
alang 8
|
292
279
|
mat 8
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
280
|
+
_wal 8
|
281
|
+
ili 8
|
282
|
+
aga 8
|
283
|
+
_lu 8
|
284
|
+
oy 8
|
285
|
+
tan 8
|
286
|
+
ob 8
|
287
|
+
as_ 8
|
288
|
+
iti 8
|
289
|
+
ut 7
|
290
|
+
tak 7
|
298
291
|
B 7
|
292
|
+
tat 7
|
293
|
+
tang 7
|
299
294
|
kat 7
|
300
|
-
|
301
|
-
|
302
|
-
|
295
|
+
o,_ 7
|
296
|
+
bo 7
|
297
|
+
oy_ 7
|
303
298
|
king_ 7
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
299
|
+
kan 7
|
300
|
+
anga 7
|
301
|
+
mata 7
|
302
|
+
ana 7
|
303
|
+
ala_ 7
|
304
|
+
_M 7
|
305
|
+
lis 7
|
306
|
+
e_ 7
|
309
307
|
er 7
|
310
|
-
|
311
|
-
ut 7
|
312
|
-
ram 7
|
313
|
-
tak 7
|
308
|
+
_pag 7
|
314
309
|
_ko 7
|
315
|
-
|
316
|
-
n,_ 7
|
317
|
-
o' 7
|
318
|
-
_An 7
|
319
|
-
_pa_ 7
|
320
|
-
lis 7
|
321
|
-
o, 7
|
322
|
-
ala_ 7
|
323
|
-
ka_ 7
|
310
|
+
ip 7
|
324
311
|
ku 7
|
325
|
-
|
326
|
-
tat 7
|
327
|
-
kan 7
|
328
|
-
_mo_ 7
|
329
|
-
_si_ 7
|
330
|
-
bo 7
|
312
|
+
g- 7
|
331
313
|
to_ 7
|
314
|
+
_hi 7
|
315
|
+
o, 7
|
316
|
+
_mo_ 7
|
317
|
+
tum 7
|
332
318
|
pak 7
|
319
|
+
ban 7
|
320
|
+
a._ 7
|
321
|
+
_si_ 7
|
322
|
+
gin 7
|
323
|
+
tin 7
|
324
|
+
ra_ 7
|
325
|
+
_An 7
|
326
|
+
ka_ 7
|
327
|
+
ram 7
|
328
|
+
iyo 7
|
329
|
+
_isa 7
|
330
|
+
oo 7
|
331
|
+
_pa_ 7
|
332
|
+
ail 7
|
333
333
|
_rin 7
|
334
|
-
tum 7
|
335
|
-
tang 7
|
336
|
-
anga 7
|
337
|
-
ana 7
|
338
|
-
mata 7
|
339
|
-
ita 6
|
340
|
-
o'y_ 6
|
341
|
-
pal 6
|
342
|
-
_iyo 6
|
343
|
-
?" 6
|
344
|
-
lis_ 6
|
345
|
-
_ku 6
|
346
|
-
nit 6
|
347
|
-
up 6
|
348
|
-
st 6
|
349
334
|
par 6
|
350
|
-
_hi 6
|
351
|
-
_" 6
|
352
|
-
man_ 6
|
353
|
-
nang 6
|
354
|
-
ung_ 6
|
355
|
-
ak_ 6
|
356
|
-
walan 6
|
357
|
-
os 6
|
358
335
|
aila 6
|
359
|
-
|
336
|
+
_ha 6
|
337
|
+
ha_ 6
|
338
|
+
siy 6
|
339
|
+
_iy 6
|
340
|
+
_S 6
|
341
|
+
pal 6
|
342
|
+
kun 6
|
343
|
+
isan 6
|
344
|
+
agi 6
|
345
|
+
Sa 6
|
346
|
+
mba 6
|
347
|
+
isang 6
|
348
|
+
tang_ 6
|
349
|
+
kah 6
|
350
|
+
st 6
|
360
351
|
ag- 6
|
361
|
-
|
362
|
-
|
352
|
+
c 6
|
353
|
+
lis_ 6
|
363
354
|
sang 6
|
364
|
-
|
365
|
-
|
366
|
-
mb 6
|
367
|
-
o'y 6
|
368
|
-
_tum 6
|
355
|
+
_nga 6
|
356
|
+
nang 6
|
369
357
|
ap_ 6
|
370
|
-
|
358
|
+
nt 6
|
359
|
+
os 6
|
360
|
+
ago 6
|
361
|
+
ig_ 6
|
362
|
+
man_ 6
|
371
363
|
gay 6
|
364
|
+
_mu 6
|
372
365
|
_par 6
|
373
|
-
|
374
|
-
|
375
|
-
|
376
|
-
|
366
|
+
bang 6
|
367
|
+
up 6
|
368
|
+
kak 6
|
369
|
+
ak_ 6
|
377
370
|
siya 6
|
378
|
-
|
379
|
-
|
380
|
-
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
kun 6
|
385
|
-
Sa 6
|
386
|
-
isan 6
|
387
|
-
kah 6
|
371
|
+
ita 6
|
372
|
+
ung_ 6
|
373
|
+
yang_ 6
|
374
|
+
gan 6
|
375
|
+
d_ 6
|
376
|
+
_iyo 6
|
388
377
|
una 6
|
389
|
-
agi 6
|
390
|
-
.na 6
|
391
|
-
kak 6
|
392
378
|
_Ma 6
|
393
|
-
|
394
|
-
|
379
|
+
ula 6
|
380
|
+
im 6
|
381
|
+
_ku 6
|
382
|
+
.na 6
|
383
|
+
_tum 6
|
384
|
+
walan 6
|
385
|
+
_Na 6
|
386
|
+
kaka 6
|
387
|
+
nit 6
|
388
|
+
.N 6
|
389
|
+
mb 6
|
395
390
|
_I 6
|
396
391
|
tata 6
|
397
|
-
|
398
|
-
|
399
|
-
|
400
|
-
|
392
|
+
han 5
|
393
|
+
nig_ 5
|
394
|
+
uh 5
|
395
|
+
bat 5
|
396
|
+
ito. 5
|
397
|
+
siya_ 5
|
398
|
+
hawak 5
|
399
|
+
go_ 5
|
400
|
+
ot. 5
|