scylla 0.4.3 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (94) hide show
  1. data/Gemfile +1 -0
  2. data/Gemfile.lock +10 -0
  3. data/VERSION +1 -1
  4. data/lib/scylla/generator.rb +1 -1
  5. data/lib/scylla/lms/13375P33K.lm +156 -156
  6. data/lib/scylla/lms/arabic.lm +133 -133
  7. data/lib/scylla/lms/bulgarian.lm +122 -122
  8. data/lib/scylla/lms/catalan.lm +151 -151
  9. data/lib/scylla/lms/danish.lm +137 -137
  10. data/lib/scylla/lms/english.lm +207 -207
  11. data/lib/scylla/lms/french.lm +400 -400
  12. data/lib/scylla/lms/japanese.lm +400 -400
  13. data/lib/scylla/lms/korean.lm +233 -233
  14. data/lib/scylla/lms/norwegian.lm +398 -398
  15. data/lib/scylla/lms/spanish.lm +98 -98
  16. data/lib/scylla/lms/swedish.lm +123 -123
  17. data/lib/scylla/lms/tagalog.lm +223 -223
  18. data/lib/scylla/lms/welsh.lm +234 -234
  19. data/lib/scylla/resources.rb +10 -10
  20. data/scylla.gemspec +17 -40
  21. data/source_texts/catalan.txt +28 -28
  22. data/source_texts/danish.txt +62 -62
  23. data/source_texts/english.txt +10 -10
  24. data/source_texts/french.txt +470 -77
  25. data/source_texts/japanese.txt +453 -199
  26. data/source_texts/norwegian.txt +96 -63
  27. data/source_texts/spanish.txt +269 -269
  28. data/test/classifier_test.rb +2 -2
  29. data/test/fixtures/lms/13375p33k.lm +156 -156
  30. data/test/fixtures/lms/danish.lm +137 -137
  31. data/test/fixtures/lms/english.lm +207 -207
  32. data/test/fixtures/lms/french.lm +400 -400
  33. data/test/fixtures/lms/hindi.lm +400 -0
  34. data/test/fixtures/lms/italian.lm +400 -0
  35. data/test/fixtures/lms/japanese.lm +400 -400
  36. data/test/fixtures/lms/norwegian.lm +400 -0
  37. data/test/fixtures/lms/spanish.lm +98 -98
  38. data/test/fixtures/source_texts/danish.txt +62 -62
  39. data/test/fixtures/source_texts/english.txt +10 -10
  40. data/test/fixtures/source_texts/french.txt +470 -77
  41. data/test/fixtures/source_texts/hindi.txt +199 -0
  42. data/test/fixtures/source_texts/italian.txt +120 -0
  43. data/test/fixtures/source_texts/japanese.txt +453 -199
  44. data/test/fixtures/source_texts/norwegian.txt +190 -0
  45. data/test/fixtures/source_texts/spanish.txt +269 -269
  46. data/test/fixtures/test_languages/english +61 -0
  47. data/test/fixtures/test_languages/french +0 -0
  48. data/test/fixtures/test_languages/german +29 -0
  49. data/test/fixtures/test_languages/hindi +3 -0
  50. data/test/fixtures/test_languages/italian +6 -0
  51. data/test/fixtures/test_languages/japanese +79 -0
  52. data/test/fixtures/test_languages/norwegian +14 -0
  53. data/test/fixtures/test_languages/spanish +22 -0
  54. data/test/generator_test.rb +0 -1
  55. data/test/language_test.rb +28 -0
  56. metadata +20 -43
  57. data/lib/scylla/lms/esperanto.lm +0 -400
  58. data/lib/scylla/lms/hungarian.lm +0 -400
  59. data/lib/scylla/lms/irish.lm +0 -400
  60. data/lib/scylla/lms/kannada.lm +0 -400
  61. data/lib/scylla/lms/latin.lm +0 -400
  62. data/lib/scylla/lms/malay.lm +0 -400
  63. data/lib/scylla/lms/marathi.lm +0 -400
  64. data/lib/scylla/lms/mingo.lm +0 -400
  65. data/lib/scylla/lms/nepali.lm +0 -400
  66. data/lib/scylla/lms/quechua.lm +0 -400
  67. data/lib/scylla/lms/rumantsch.lm +0 -400
  68. data/lib/scylla/lms/sanskrit.lm +0 -400
  69. data/lib/scylla/lms/scots_gaelic.lm +0 -400
  70. data/lib/scylla/lms/serbian.lm +0 -400
  71. data/lib/scylla/lms/swahili.lm +0 -400
  72. data/lib/scylla/lms/tamil.lm +0 -400
  73. data/lib/scylla/lms/ukrainian.lm +0 -400
  74. data/lib/scylla/lms/yiddish.lm +0 -400
  75. data/source_texts/esperanto.txt +0 -199
  76. data/source_texts/hungarian.txt +0 -102
  77. data/source_texts/irish.txt +0 -209
  78. data/source_texts/kannada.txt +0 -283
  79. data/source_texts/latin.txt +0 -120
  80. data/source_texts/malay.txt +0 -108
  81. data/source_texts/marathi.txt +0 -100
  82. data/source_texts/mingo.txt +0 -146
  83. data/source_texts/nepali.txt +0 -131
  84. data/source_texts/quechua.txt +0 -108
  85. data/source_texts/rumantsch.txt +0 -110
  86. data/source_texts/sanskrit.txt +0 -135
  87. data/source_texts/scots_gaelic.txt +0 -93
  88. data/source_texts/serbian.txt +0 -121
  89. data/source_texts/swahili.txt +0 -120
  90. data/source_texts/tamil.txt +0 -167
  91. data/source_texts/ukrainian.txt +0 -214
  92. data/source_texts/yiddish-utf.txt +0 -83
  93. data/test/fixtures/lms/kannada.lm +0 -400
  94. data/test/fixtures/source_texts/kannada.txt +0 -283
@@ -1,4 +1,4 @@
1
- _ 1354
1
+ _ 1358
2
2
  a 856
3
3
  n 460
4
4
  i 336
@@ -9,14 +9,14 @@ g_ 175
9
9
  o 173
10
10
  ng_ 169
11
11
  a_ 156
12
- an 155
13
12
  k 155
13
+ an 155
14
14
  s 150
15
15
  l 147
16
16
  _n 132
17
17
  m 131
18
- y 105
19
18
  p 105
19
+ y 105
20
20
  u 103
21
21
  na 101
22
22
  ang 98
@@ -26,14 +26,14 @@ la 80
26
26
  in 79
27
27
  _a 77
28
28
  r 77
29
- ka 72
30
29
  b 72
30
+ ka 72
31
31
  sa 67
32
32
  ak 67
33
- _m 65
34
33
  pa 65
35
- _s 64
34
+ _m 65
36
35
  ma 64
36
+ _s 64
37
37
  h 63
38
38
  _na 61
39
39
  d 57
@@ -46,355 +46,355 @@ _k 45
46
46
  ni 45
47
47
  t_ 44
48
48
  _sa 43
49
- n_ 42
50
49
  " 42
51
50
  ya 42
52
51
  _ng 42
52
+ n_ 42
53
53
  ay 41
54
54
  it 40
55
55
  _pa 40
56
56
  ga 39
57
- on 39
58
57
  e 39
58
+ on 39
59
59
  sa_ 38
60
60
  y_ 37
61
61
  iy 36
62
62
  il 36
63
- ala 35
64
63
  o_ 35
64
+ ala 35
65
65
  w 35
66
66
  _ng_ 34
67
67
  am 34
68
+ ._ 33
68
69
  _sa_ 33
70
+ i_ 32
69
71
  _ma 32
70
72
  na_ 32
71
- i_ 32
72
- ._ 32
73
73
  _an 31
74
74
  ra 31
75
75
  , 31
76
76
  ba 30
77
- _ang 29
77
+ _t 29
78
+ iya 29
78
79
  _ang_ 29
79
80
  _ni 29
81
+ _ang 29
80
82
  ar 29
81
- iya 29
82
- _t 29
83
83
  _i 28
84
84
  _ka 28
85
- li 27
86
85
  _l 27
87
- ha 27
86
+ ,_ 27
88
87
  as 27
89
- is 27
90
- aka 27
91
88
  hi 27
92
- ,_ 27
93
89
  ong 27
94
- _na_ 26
90
+ ha 27
91
+ is 27
92
+ aka 27
93
+ li 27
95
94
  ko 26
96
95
  gi 26
97
- ri 25
98
- ap 25
96
+ _na_ 26
99
97
  to 25
98
+ ap 25
99
+ ri 25
100
+ lan 24
101
+ ay_ 24
100
102
  ong_ 24
101
103
  un 24
102
104
  ah 24
103
- lan 24
104
- ay_ 24
105
105
  ata 23
106
+ ing 23
106
107
  um 23
107
- si 23
108
- wa 23
109
108
  o. 23
110
- ing 23
111
109
  di 23
112
- ki 22
110
+ si 23
111
+ wa 23
113
112
  ti 22
114
113
  s_ 22
115
- _niy 21
114
+ ki 22
116
115
  niy 21
117
- _d 21
118
116
  ab 21
119
- _niya 21
120
- an_ 21
117
+ _d 21
121
118
  niya 21
119
+ _niy 21
122
120
  mo 21
121
+ an_ 21
122
+ _niya 21
123
+ a. 20
123
124
  at_ 20
124
125
  N 20
125
- a. 20
126
- ila 19
127
- ot 19
128
126
  yo 19
127
+ ila 19
129
128
  - 19
129
+ ot 19
130
130
  aw 19
131
+ _mo 18
131
132
  nga 18
132
133
  _ak 18
133
134
  ig 18
134
- _mo 18
135
- lang 18
136
135
  A 18
136
+ lang 18
137
137
  nag 17
138
138
  ama 17
139
139
  tu 17
140
- in_ 16
141
- _b 16
142
- ali 16
143
140
  ina 16
144
- aki 16
145
- ara 16
146
141
  P 16
142
+ in_ 16
143
+ ali 16
147
144
  "_ 16
148
- _at 15
145
+ ara 16
146
+ _b 16
147
+ aki 16
149
148
  ' 15
150
- ing_ 15
151
149
  lang_ 15
150
+ _at 15
152
151
  _si 15
152
+ ing_ 15
153
+ o._ 14
153
154
  ai 14
154
- bi 14
155
+ ib 14
156
+ nd 14
155
157
  aa 14
156
158
  _r 14
157
- ga_ 14
158
- o._ 14
159
- nd 14
160
- ib 14
161
159
  da 14
162
- hin 13
160
+ bi 14
161
+ ga_ 14
163
162
  ro 13
164
- iya_ 13
165
- lo 13
166
- ito 13
167
163
  _at_ 13
164
+ hin 13
165
+ ito 13
166
+ lo 13
168
167
  ik 13
169
- _h 13
170
168
  ako 13
171
169
  ya_ 13
172
- _ta 12
173
- ul 12
170
+ _h 13
171
+ iya_ 13
174
172
  la_ 12
175
- _la 12
176
- Na 12
177
- mag 12
178
- S 12
179
173
  rin 12
180
- a, 12
174
+ Na 12
181
175
  K 12
176
+ ul 12
177
+ S 12
182
178
  ahi 12
183
- man 11
184
- a,_ 11
185
- _ri 11
179
+ _ta 12
180
+ _la 12
181
+ a, 12
182
+ mag 12
186
183
  k_ 11
187
- lu 11
188
- 'y 11
189
- ." 11
190
- gk 11
191
- kin 11
184
+ pan 11
185
+ _tu 11
192
186
  mi 11
193
- go 11
194
- 'y_ 11
187
+ a,_ 11
195
188
  asa 11
196
- M 11
197
- _mag 11
198
189
  ilan 11
199
190
  apa 11
191
+ kin 11
192
+ _mag 11
193
+ 'y 11
194
+ 'y_ 11
195
+ gk 11
196
+ lu 11
197
+ _ri 11
198
+ man 11
199
+ go 11
200
+ M 11
200
201
  ung 11
201
- _tu 11
202
- pan 11
202
+ ." 11
203
203
  _di 11
204
- ot_ 10
205
- Ma 10
204
+ mga 10
206
205
  is_ 10
207
206
  wal 10
208
207
  nak 10
209
- mg 10
210
- mu 10
211
- _is 10
208
+ _N 10
209
+ awa 10
210
+ ot_ 10
212
211
  yon 10
213
- I 10
214
- mga 10
215
- ir 10
216
- ilang 10
217
- mga_ 10
212
+ _is 10
213
+ pa_ 10
218
214
  su 10
219
- uma 10
220
- yan 10
215
+ _K 10
221
216
  agk 10
217
+ mga_ 10
218
+ mu 10
219
+ yan 10
222
220
  .. 10
223
- pa_ 10
224
- awa 10
221
+ ir 10
225
222
  Ka 10
226
- wala 10
227
- pi 10
228
223
  akin 10
224
+ wala 10
229
225
  ari 10
226
+ ilang 10
227
+ pi 10
228
+ uma 10
229
+ I 10
230
+ mg 10
231
+ Ma 10
232
+ it_ 9
233
+ n, 9
234
+ _akin 9
235
+ gka 9
230
236
  _aki 9
231
- An 9
232
- _K 9
233
- ini 9
234
- pu 9
235
- no 9
236
- aba 9
237
- _mg 9
238
- n. 9
239
- _mga_ 9
240
- _mga 9
241
237
  agka 9
242
- gka 9
243
- isa 9
244
- n, 9
245
- _A 9
238
+ n. 9
246
239
  pag 9
247
- alan 9
248
- nan 9
249
240
  san 9
250
- it_ 9
251
- _N 9
252
- _akin 9
253
- ayo 8
254
- _lu 8
255
- naka 8
256
- _wala 8
257
- t. 8
241
+ no 9
242
+ isa 9
243
+ nan 9
244
+ _mga 9
245
+ _mg 9
246
+ pu 9
247
+ alan 9
248
+ _A 9
249
+ _mga_ 9
250
+ ini 9
251
+ An 9
252
+ aba 9
253
+ _Ka 9
258
254
  aha 8
259
- _ako 8
255
+ aman 8
256
+ p_ 8
260
257
  ob 8
258
+ _lu 8
259
+ _ba 8
260
+ ayo 8
261
261
  si_ 8
262
- mo_ 8
263
- _nag 8
264
- _wa 8
265
- dib 8
266
- niya_ 8
267
- mat 8
268
- alang 8
269
- aman 8
270
- ngi 8
262
+ _w 8
271
263
  ili 8
272
- aga 8
273
- as_ 8
274
- Pa 8
275
- l_ 8
264
+ king 8
265
+ _P 8
276
266
  di_ 8
277
- tay 8
278
- _Ka 8
279
267
  iti 8
280
- _w 8
268
+ niya_ 8
269
+ yang 8
270
+ as_ 8
271
+ mo_ 8
272
+ aga 8
273
+ _nag 8
281
274
  aking 8
282
- igi 8
275
+ tan 8
276
+ tay 8
277
+ dib 8
278
+ a' 8
283
279
  abi 8
284
- _ba 8
285
- umi 8
280
+ l_ 8
281
+ _wala 8
282
+ naka 8
286
283
  _wal 8
287
- a' 8
288
- _P 8
289
- king 8
290
- tan 8
291
- p_ 8
292
- yang 8
293
- pak 7
294
- _isa 7
295
- er 7
296
- _rin 7
284
+ alang 8
285
+ t. 8
286
+ umi 8
287
+ Pa 8
288
+ ngi 8
289
+ _ako 8
290
+ _wa 8
291
+ igi 8
292
+ mat 8
293
+ e_ 7
297
294
  _pag 7
298
- tak 7
299
- anga 7
300
- tum 7
295
+ tin 7
296
+ _M 7
297
+ ra_ 7
301
298
  B 7
302
- tat 7
303
- _An 7
304
- _mo_ 7
305
- ail 7
306
- o, 7
299
+ kat 7
300
+ _Na 7
301
+ ip 7
302
+ _isa 7
307
303
  king_ 7
308
- ut 7
309
- a._ 7
304
+ g- 7
305
+ .n 7
306
+ ban 7
307
+ oo 7
308
+ gin 7
309
+ er 7
310
310
  iyo 7
311
- mata 7
311
+ ut 7
312
+ ram 7
313
+ tak 7
314
+ _ko 7
315
+ ail 7
312
316
  n,_ 7
313
317
  o' 7
314
- kan 7
318
+ _An 7
315
319
  _pa_ 7
320
+ lis 7
321
+ o, 7
316
322
  ala_ 7
323
+ ka_ 7
324
+ ku 7
325
+ a._ 7
326
+ tat 7
327
+ kan 7
328
+ _mo_ 7
329
+ _si_ 7
317
330
  bo 7
318
- .n 7
319
- ra_ 7
320
- e_ 7
321
- g- 7
322
- oo 7
323
- gin 7
331
+ to_ 7
332
+ pak 7
333
+ _rin 7
334
+ tum 7
324
335
  tang 7
325
- _ko 7
326
- lis 7
327
- ban 7
328
- _si_ 7
329
- ram 7
330
- ip 7
331
- ka_ 7
332
- tin 7
336
+ anga 7
333
337
  ana 7
334
- to_ 7
335
- _M 7
336
- ku 7
337
- kat 7
338
- kak 6
339
- ula 6
340
- ago 6
338
+ mata 7
339
+ ita 6
340
+ o'y_ 6
341
+ pal 6
342
+ _iyo 6
343
+ ?" 6
341
344
  lis_ 6
342
- _nga 6
343
- .na 6
344
- siy 6
345
- isang 6
346
- ung_ 6
345
+ _ku 6
346
+ nit 6
347
+ up 6
348
+ st 6
347
349
  par 6
348
- _I 6
350
+ _hi 6
351
+ _" 6
349
352
  man_ 6
350
- up 6
351
- pal 6
352
353
  nang 6
353
- ig_ 6
354
- _" 6
355
- kah 6
354
+ ung_ 6
355
+ ak_ 6
356
+ walan 6
357
+ os 6
356
358
  aila 6
357
- tang_ 6
358
- agi 6
359
- nt 6
360
- siya 6
361
- nit 6
362
- isan 6
359
+ ig_ 6
360
+ ag- 6
361
+ yang_ 6
362
+ ? 6
363
363
  sang 6
364
- _iy 6
364
+ d_ 6
365
365
  _mu 6
366
+ mb 6
366
367
  o'y 6
367
- Sa 6
368
- _S 6
369
- c 6
370
368
  _tum 6
371
- mb 6
372
- kun 6
373
- tata 6
374
- _par 6
375
- _ha 6
376
- ? 6
377
- mba 6
378
- yang_ 6
379
- gay 6
380
369
  ap_ 6
381
370
  bang 6
382
- _Na 6
371
+ gay 6
372
+ _par 6
383
373
  im 6
384
- d_ 6
385
- o,_ 6
386
- os 6
387
- ag- 6
374
+ ago 6
375
+ _S 6
376
+ kaka 6
377
+ siya 6
378
+ _ha 6
379
+ c 6
380
+ _iy 6
381
+ siy 6
382
+ tang_ 6
383
+ _nga 6
384
+ kun 6
385
+ Sa 6
386
+ isan 6
387
+ kah 6
388
388
  una 6
389
+ agi 6
390
+ .na 6
391
+ kak 6
389
392
  _Ma 6
390
- ak_ 6
391
- _hi 6
392
- st 6
393
- o'y_ 6
394
- ita 6
395
- _ku 6
396
- gan 6
397
- walan 6
398
- kaka 6
399
- _iyo 6
400
393
  o." 6
394
+ o,_ 6
395
+ _I 6
396
+ tata 6
397
+ nt 6
398
+ ula 6
399
+ mba 6
400
+ isang 6