scylla 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (80) hide show
  1. data/Gemfile +0 -1
  2. data/Gemfile.lock +0 -10
  3. data/README.rdoc +22 -0
  4. data/VERSION +1 -1
  5. data/bin/scylla +13 -0
  6. data/lib/scylla/classifier.rb +2 -2
  7. data/lib/scylla/generator.rb +1 -1
  8. data/lib/scylla/lms/13375P33K.lm +400 -0
  9. data/lib/scylla/lms/afrikaans.lm +400 -0
  10. data/lib/scylla/lms/arabic.lm +400 -0
  11. data/lib/scylla/lms/bulgarian.lm +400 -0
  12. data/lib/scylla/lms/catalan.lm +400 -0
  13. data/lib/scylla/lms/chinese.lm +400 -0
  14. data/lib/scylla/lms/danish.lm +400 -0
  15. data/lib/scylla/lms/english.lm +400 -0
  16. data/lib/scylla/lms/esperanto.lm +400 -0
  17. data/lib/scylla/lms/finnish.lm +400 -0
  18. data/lib/scylla/lms/french.lm +400 -0
  19. data/lib/scylla/lms/german.lm +400 -0
  20. data/lib/scylla/lms/greek-iso8859-7.lm +400 -0
  21. data/lib/scylla/lms/hebrew.lm +400 -0
  22. data/lib/scylla/lms/hindi.lm +400 -0
  23. data/lib/scylla/lms/hungarian.lm +400 -0
  24. data/lib/scylla/lms/icelandic.lm +400 -0
  25. data/lib/scylla/lms/indonesian.lm +400 -0
  26. data/lib/scylla/lms/irish.lm +400 -0
  27. data/lib/scylla/lms/italian.lm +400 -0
  28. data/lib/scylla/lms/japanese.lm +400 -0
  29. data/lib/scylla/lms/kannada.lm +400 -0
  30. data/lib/scylla/lms/korean.lm +400 -0
  31. data/lib/scylla/lms/latin.lm +400 -0
  32. data/lib/scylla/lms/malay.lm +400 -0
  33. data/lib/scylla/lms/marathi.lm +400 -0
  34. data/lib/scylla/lms/mingo.lm +400 -0
  35. data/lib/scylla/lms/nepali.lm +400 -0
  36. data/lib/scylla/lms/norwegian.lm +400 -0
  37. data/lib/scylla/lms/polish.lm +400 -0
  38. data/lib/scylla/lms/portuguese.lm +400 -0
  39. data/lib/scylla/lms/quechua.lm +400 -0
  40. data/lib/scylla/lms/romanian.lm +400 -0
  41. data/lib/scylla/lms/rumantsch.lm +400 -0
  42. data/lib/scylla/lms/russian.lm +400 -0
  43. data/lib/scylla/lms/sanskrit.lm +400 -0
  44. data/lib/scylla/lms/scots_gaelic.lm +400 -0
  45. data/lib/scylla/lms/serbian-ascii.lm +400 -0
  46. data/lib/scylla/lms/slovak-ascii.lm +400 -0
  47. data/lib/scylla/lms/slovenian-ascii.lm +400 -0
  48. data/lib/scylla/lms/spanish.lm +400 -0
  49. data/lib/scylla/lms/swahili.lm +400 -0
  50. data/lib/scylla/lms/swedish.lm +400 -0
  51. data/lib/scylla/lms/tagalog.lm +400 -0
  52. data/lib/scylla/lms/tamil.lm +400 -0
  53. data/lib/scylla/lms/thai.lm +400 -0
  54. data/lib/scylla/lms/turkish.lm +400 -0
  55. data/lib/scylla/lms/ukrainian-koi8_u.lm +400 -0
  56. data/lib/scylla/lms/vietnamese.lm +400 -0
  57. data/lib/scylla/lms/welsh.lm +400 -0
  58. data/lib/scylla/lms/yiddish-utf.lm +400 -0
  59. data/lib/scylla/loader.rb +8 -1
  60. data/scylla-0.1.0.gem +0 -0
  61. data/scylla.gemspec +69 -3
  62. data/source_texts/kannada.txt +283 -0
  63. data/test/classifier_test.rb +7 -0
  64. data/test/fixtures/lms/13375p33k.lm +400 -0
  65. data/test/fixtures/lms/danish.lm +400 -0
  66. data/test/fixtures/lms/english.lm +400 -0
  67. data/test/fixtures/lms/french.lm +400 -0
  68. data/test/fixtures/lms/german.lm +400 -0
  69. data/test/fixtures/lms/japanese.lm +400 -0
  70. data/test/fixtures/lms/kannada.lm +400 -0
  71. data/test/fixtures/lms/spanish.lm +400 -0
  72. data/test/fixtures/source_texts/13375P33K.txt +199 -0
  73. data/test/fixtures/source_texts/japanese.txt +199 -0
  74. data/test/fixtures/source_texts/kannada.txt +283 -0
  75. data/test/generator_test.rb +10 -7
  76. data/test/helper.rb +5 -6
  77. data/test/loader_test.rb +1 -0
  78. data/test/scylla_test.rb +1 -0
  79. metadata +78 -14
  80. data/source_texts/armenian.txt +0 -86
@@ -0,0 +1,400 @@
1
+ _ 1242
2
+ a 715
3
+ n 366
4
+ e 308
5
+ i 298
6
+ u 211
7
+ r 208
8
+ k 205
9
+ t 205
10
+ an 185
11
+ g 152
12
+ s 152
13
+ d 149
14
+ m 149
15
+ l 108
16
+ ng 108
17
+ p 99
18
+ a_ 91
19
+ o 91
20
+ er 85
21
+ b 85
22
+ n_ 81
23
+ , 77
24
+ ka 76
25
+ _d 74
26
+ an_ 74
27
+ h 73
28
+ i_ 72
29
+ en 65
30
+ ar 65
31
+ _m 65
32
+ ,_ 64
33
+ ta 59
34
+ di 59
35
+ me 58
36
+ ang 56
37
+ ra 55
38
+ _k 55
39
+ at 54
40
+ ak 54
41
+ y 53
42
+ _me 51
43
+ da 49
44
+ ga 48
45
+ in 46
46
+ ya 43
47
+ . 42
48
+ tu 41
49
+ _di 41
50
+ un 40
51
+ _p 39
52
+ j 38
53
+ la 38
54
+ _s 37
55
+ ma 37
56
+ k_ 36
57
+ na 36
58
+ ah 35
59
+ ri 34
60
+ _t 33
61
+ se 33
62
+ ke 33
63
+ ng_ 33
64
+ g_ 33
65
+ be 33
66
+ al 32
67
+ as 32
68
+ pe 32
69
+ _b 32
70
+ __ 32
71
+ ia 31
72
+ men 31
73
+ _men 29
74
+ _ke 29
75
+ h_ 29
76
+ P 29
77
+ si 29
78
+ ti 28
79
+ it 28
80
+ pa 28
81
+ ny 28
82
+ em 27
83
+ _a 27
84
+ sa 26
85
+ am 26
86
+ kan 26
87
+ u_ 25
88
+ eng 24
89
+ te 24
90
+ _pe 24
91
+ c 24
92
+ ang_ 24
93
+ nga 23
94
+ ja 23
95
+ _se 23
96
+ s_ 23
97
+ ba 23
98
+ S 23
99
+ ber 22
100
+ li 22
101
+ ni 22
102
+ el 22
103
+ di_ 22
104
+ nt 22
105
+ r_ 22
106
+ _da 21
107
+ t_ 21
108
+ ik 20
109
+ bu 20
110
+ nya 20
111
+ ad 20
112
+ ata 20
113
+ ak_ 20
114
+ ara 19
115
+ _i 19
116
+ is 19
117
+ ran 19
118
+ us 19
119
+ _P 19
120
+ gan 19
121
+ ap 19
122
+ ._ 19
123
+ ru 18
124
+ _be 18
125
+ era 18
126
+ _ka 18
127
+ dan 18
128
+ ah_ 17
129
+ ari 17
130
+ - 17
131
+ to 17
132
+ es 17
133
+ nd 17
134
+ ur 17
135
+ a, 17
136
+ uk 17
137
+ yan 16
138
+ ala 16
139
+ ha 16
140
+ yang 16
141
+ ter 16
142
+ ol 16
143
+ su 16
144
+ I 16
145
+ ngan 16
146
+ ung 16
147
+ _y 15
148
+ J 15
149
+ M 15
150
+ kan_ 15
151
+ _ya 15
152
+ _di_ 15
153
+ _ber 15
154
+ A 14
155
+ du 14
156
+ enga 14
157
+ _yan 14
158
+ lu 14
159
+ de 14
160
+ itu 14
161
+ " 14
162
+ ok 14
163
+ ek 14
164
+ _yang 14
165
+ ai 13
166
+ dan_ 13
167
+ rang 13
168
+ per 13
169
+ a,_ 13
170
+ _dan 13
171
+ eri 13
172
+ yang_ 13
173
+ l_ 13
174
+ ge 13
175
+ R 13
176
+ da_ 13
177
+ at_ 13
178
+ D 13
179
+ _te 13
180
+ kar 13
181
+ ko 13
182
+ _S 13
183
+ ina 13
184
+ ku 13
185
+ ul 12
186
+ et 12
187
+ w 12
188
+ po 12
189
+ ut 12
190
+ ya_ 12
191
+ gi 12
192
+ e_ 12
193
+ aka 12
194
+ _dan_ 12
195
+ rin 12
196
+ ua 12
197
+ ju 12
198
+ pen 12
199
+ nya_ 11
200
+ ngg 11
201
+ ena 11
202
+ ama 11
203
+ mp 11
204
+ ca 11
205
+ _pen 11
206
+ re 11
207
+ ika 11
208
+ _J 11
209
+ B 11
210
+ _ter 11
211
+ gg 11
212
+ na_ 11
213
+ ia_ 11
214
+ ed 11
215
+ or 11
216
+ lan 11
217
+ ni_ 10
218
+ ngk 10
219
+ ab 10
220
+ gk 10
221
+ mi 10
222
+ arina 10
223
+ T 10
224
+ eru 10
225
+ Zar 10
226
+ uk_ 10
227
+ Za 10
228
+ arin 10
229
+ ela 10
230
+ adi 10
231
+ rina 10
232
+ wa 10
233
+ rt 10
234
+ Z 10
235
+ us_ 10
236
+ Zari 10
237
+ im 10
238
+ _T 10
239
+ Zarin 10
240
+ uh 10
241
+ ini 10
242
+ il 9
243
+ jad 9
244
+ _mem 9
245
+ i, 9
246
+ i. 9
247
+ mu 9
248
+ ntu 9
249
+ i,_ 9
250
+ ing 9
251
+ _c 9
252
+ gan_ 9
253
+ eb 9
254
+ ngan_ 9
255
+ ta_ 9
256
+ ada 9
257
+ si_ 9
258
+ apa 9
259
+ engan 9
260
+ asi 9
261
+ _de 9
262
+ _ta 9
263
+ mem 9
264
+ K 9
265
+ ant 9
266
+ tu_ 9
267
+ ag 9
268
+ os 9
269
+ jadi 9
270
+ _Zari 8
271
+ dia 8
272
+ oko 8
273
+ _ma 8
274
+ ki 8
275
+ nj 8
276
+ _meng 8
277
+ mb 8
278
+ au 8
279
+ le 8
280
+ man 8
281
+ _A 8
282
+ rk 8
283
+ pat 8
284
+ ep 8
285
+ oh 8
286
+ _Zar 8
287
+ isi 8
288
+ itu_ 8
289
+ emb 8
290
+ eny 8
291
+ aga 8
292
+ ip 8
293
+ _Za 8
294
+ ri_ 8
295
+ ar_ 8
296
+ _in 8
297
+ st 8
298
+ ian 8
299
+ _M 8
300
+ L 8
301
+ meng 8
302
+ _Z 8
303
+ bur 7
304
+ gga 7
305
+ ot 7
306
+ ay 7
307
+ uga 7
308
+ rl 7
309
+ Ta 7
310
+ nu 7
311
+ tan 7
312
+ peng 7
313
+ ug 7
314
+ ej 7
315
+ ce 7
316
+ _peng 7
317
+ ng,_ 7
318
+ tak 7
319
+ ada_ 7
320
+ ro 7
321
+ atan 7
322
+ ntuk_ 7
323
+ _ini 7
324
+ _meny 7
325
+ as_ 7
326
+ erl 7
327
+ art 7
328
+ ih 7
329
+ ma_ 7
330
+ um 7
331
+ eka 7
332
+ lu_ 7
333
+ and 7
334
+ _it 7
335
+ tok 7
336
+ nda 7
337
+ ntuk 7
338
+ ngga 7
339
+ alan 7
340
+ ib 7
341
+ g, 7
342
+ rah 7
343
+ _Ja 7
344
+ gka 7
345
+ aya 7
346
+ pu 7
347
+ _Ta 7
348
+ Ja 7
349
+ ger 7
350
+ _I 7
351
+ meny 7
352
+ tuk 7
353
+ a. 7
354
+ n. 7
355
+ _itu 7
356
+ tuk_ 7
357
+ ng, 7
358
+ _per 7
359
+ _ba 7
360
+ al_ 7
361
+ ita 7
362
+ aran 7
363
+ g,_ 7
364
+ kal 7
365
+ arang 7
366
+ p_ 7
367
+ ngka 7
368
+ toko 7
369
+ u, 7
370
+ sia 7
371
+ pol 6
372
+ pi 6
373
+ den 6
374
+ kart 6
375
+ _L 6
376
+ akart 6
377
+ lis 6
378
+ mel 6
379
+ ngi 6
380
+ _den 6
381
+ n,_ 6
382
+ Pol 6
383
+ ang,_ 6
384
+ aha 6
385
+ uny 6
386
+ PR 6
387
+ ra_ 6
388
+ oli 6
389
+ an,_ 6
390
+ sus 6
391
+ an. 6
392
+ kat 6
393
+ n, 6
394
+ _itu_ 6
395
+ ang, 6
396
+ Po 6
397
+ tin 6
398
+ mun 6
399
+ U 6
400
+ nge 6