scylla 0.4.3 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94)
  1. data/Gemfile +1 -0
  2. data/Gemfile.lock +10 -0
  3. data/VERSION +1 -1
  4. data/lib/scylla/generator.rb +1 -1
  5. data/lib/scylla/lms/13375P33K.lm +156 -156
  6. data/lib/scylla/lms/arabic.lm +133 -133
  7. data/lib/scylla/lms/bulgarian.lm +122 -122
  8. data/lib/scylla/lms/catalan.lm +151 -151
  9. data/lib/scylla/lms/danish.lm +137 -137
  10. data/lib/scylla/lms/english.lm +207 -207
  11. data/lib/scylla/lms/french.lm +400 -400
  12. data/lib/scylla/lms/japanese.lm +400 -400
  13. data/lib/scylla/lms/korean.lm +233 -233
  14. data/lib/scylla/lms/norwegian.lm +398 -398
  15. data/lib/scylla/lms/spanish.lm +98 -98
  16. data/lib/scylla/lms/swedish.lm +123 -123
  17. data/lib/scylla/lms/tagalog.lm +223 -223
  18. data/lib/scylla/lms/welsh.lm +234 -234
  19. data/lib/scylla/resources.rb +10 -10
  20. data/scylla.gemspec +17 -40
  21. data/source_texts/catalan.txt +28 -28
  22. data/source_texts/danish.txt +62 -62
  23. data/source_texts/english.txt +10 -10
  24. data/source_texts/french.txt +470 -77
  25. data/source_texts/japanese.txt +453 -199
  26. data/source_texts/norwegian.txt +96 -63
  27. data/source_texts/spanish.txt +269 -269
  28. data/test/classifier_test.rb +2 -2
  29. data/test/fixtures/lms/13375p33k.lm +156 -156
  30. data/test/fixtures/lms/danish.lm +137 -137
  31. data/test/fixtures/lms/english.lm +207 -207
  32. data/test/fixtures/lms/french.lm +400 -400
  33. data/test/fixtures/lms/hindi.lm +400 -0
  34. data/test/fixtures/lms/italian.lm +400 -0
  35. data/test/fixtures/lms/japanese.lm +400 -400
  36. data/test/fixtures/lms/norwegian.lm +400 -0
  37. data/test/fixtures/lms/spanish.lm +98 -98
  38. data/test/fixtures/source_texts/danish.txt +62 -62
  39. data/test/fixtures/source_texts/english.txt +10 -10
  40. data/test/fixtures/source_texts/french.txt +470 -77
  41. data/test/fixtures/source_texts/hindi.txt +199 -0
  42. data/test/fixtures/source_texts/italian.txt +120 -0
  43. data/test/fixtures/source_texts/japanese.txt +453 -199
  44. data/test/fixtures/source_texts/norwegian.txt +190 -0
  45. data/test/fixtures/source_texts/spanish.txt +269 -269
  46. data/test/fixtures/test_languages/english +61 -0
  47. data/test/fixtures/test_languages/french +0 -0
  48. data/test/fixtures/test_languages/german +29 -0
  49. data/test/fixtures/test_languages/hindi +3 -0
  50. data/test/fixtures/test_languages/italian +6 -0
  51. data/test/fixtures/test_languages/japanese +79 -0
  52. data/test/fixtures/test_languages/norwegian +14 -0
  53. data/test/fixtures/test_languages/spanish +22 -0
  54. data/test/generator_test.rb +0 -1
  55. data/test/language_test.rb +28 -0
  56. metadata +20 -43
  57. data/lib/scylla/lms/esperanto.lm +0 -400
  58. data/lib/scylla/lms/hungarian.lm +0 -400
  59. data/lib/scylla/lms/irish.lm +0 -400
  60. data/lib/scylla/lms/kannada.lm +0 -400
  61. data/lib/scylla/lms/latin.lm +0 -400
  62. data/lib/scylla/lms/malay.lm +0 -400
  63. data/lib/scylla/lms/marathi.lm +0 -400
  64. data/lib/scylla/lms/mingo.lm +0 -400
  65. data/lib/scylla/lms/nepali.lm +0 -400
  66. data/lib/scylla/lms/quechua.lm +0 -400
  67. data/lib/scylla/lms/rumantsch.lm +0 -400
  68. data/lib/scylla/lms/sanskrit.lm +0 -400
  69. data/lib/scylla/lms/scots_gaelic.lm +0 -400
  70. data/lib/scylla/lms/serbian.lm +0 -400
  71. data/lib/scylla/lms/swahili.lm +0 -400
  72. data/lib/scylla/lms/tamil.lm +0 -400
  73. data/lib/scylla/lms/ukrainian.lm +0 -400
  74. data/lib/scylla/lms/yiddish.lm +0 -400
  75. data/source_texts/esperanto.txt +0 -199
  76. data/source_texts/hungarian.txt +0 -102
  77. data/source_texts/irish.txt +0 -209
  78. data/source_texts/kannada.txt +0 -283
  79. data/source_texts/latin.txt +0 -120
  80. data/source_texts/malay.txt +0 -108
  81. data/source_texts/marathi.txt +0 -100
  82. data/source_texts/mingo.txt +0 -146
  83. data/source_texts/nepali.txt +0 -131
  84. data/source_texts/quechua.txt +0 -108
  85. data/source_texts/rumantsch.txt +0 -110
  86. data/source_texts/sanskrit.txt +0 -135
  87. data/source_texts/scots_gaelic.txt +0 -93
  88. data/source_texts/serbian.txt +0 -121
  89. data/source_texts/swahili.txt +0 -120
  90. data/source_texts/tamil.txt +0 -167
  91. data/source_texts/ukrainian.txt +0 -214
  92. data/source_texts/yiddish-utf.txt +0 -83
  93. data/test/fixtures/lms/kannada.lm +0 -400
  94. data/test/fixtures/source_texts/kannada.txt +0 -283
data/Gemfile CHANGED
@@ -13,4 +13,5 @@ end
13
13
  group :test do
14
14
  gem "shoulda", ">= 0"
15
15
  gem "mocha", "~> 0.9.12", :require => nil
16
+ gem "ruby-debug", "~> 0.10.4"
16
17
  end
data/Gemfile.lock CHANGED
@@ -1,13 +1,22 @@
1
1
  GEM
2
2
  remote: http://rubygems.org/
3
3
  specs:
4
+ columnize (0.3.4)
4
5
  git (1.2.5)
5
6
  jeweler (1.6.4)
6
7
  bundler (~> 1.0)
7
8
  git (>= 1.2.5)
8
9
  rake
10
+ linecache (0.46)
11
+ rbx-require-relative (> 0.0.4)
9
12
  mocha (0.9.12)
10
13
  rake (0.9.2)
14
+ rbx-require-relative (0.0.5)
15
+ ruby-debug (0.10.4)
16
+ columnize (>= 0.1)
17
+ ruby-debug-base (~> 0.10.4.0)
18
+ ruby-debug-base (0.10.4)
19
+ linecache (>= 0.3)
11
20
  shoulda (2.11.3)
12
21
 
13
22
  PLATFORMS
@@ -17,4 +26,5 @@ DEPENDENCIES
17
26
  bundler (~> 1.0.0)
18
27
  jeweler (~> 1.6.4)
19
28
  mocha (~> 0.9.12)
29
+ ruby-debug (~> 0.10.4)
20
30
  shoulda
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.4.3
1
+ 0.5.0
data/lib/scylla/generator.rb CHANGED
@@ -49,7 +49,7 @@ module Scylla
49
49
  input.each_line {|line| text += line.strip }
50
50
  input = text
51
51
  ngram = Hash.new
52
- input.split(/[0-9\s]/).each do |word|
52
+ input.split(/[\d\s\[\]]/).each do |word|
53
53
  word = "_" + word + "_";
54
54
  len = word.size
55
55
  for i in 0..word.size
data/lib/scylla/lms/13375P33K.lm CHANGED
@@ -1,5 +1,5 @@
1
- _ 23034
2
- __ 3952
1
+ _ 23082
2
+ __ 3963
3
3
  | 1114
4
4
  |_ 748
5
5
  n 708
@@ -21,14 +21,14 @@ p 425
21
21
  _|_ 395
22
22
  h_ 365
23
23
  _d 363
24
- z_ 350
25
24
  n_ 350
25
+ z_ 350
26
26
  c 345
27
27
  v 339
28
28
  N 331
29
29
  _r_ 326
30
- d_ 319
31
30
  _p 319
31
+ d_ 319
32
32
  f 302
33
33
  l 297
34
34
  u 295
@@ -47,12 +47,12 @@ k 229
47
47
  f_ 221
48
48
  _c 220
49
49
  H 208
50
- U 203
51
50
  _h 203
52
- w 202
51
+ U 203
53
52
  _w 202
53
+ w 202
54
54
  m_ 199
55
- _, 195
55
+ _, 196
56
56
  _u 193
57
57
  \ 190
58
58
  _f_ 190
@@ -62,9 +62,9 @@ T 187
62
62
  L 187
63
63
  v_ 184
64
64
  _N 179
65
- _,_ 175
66
- D 174
65
+ _,_ 176
67
66
  I 174
67
+ D 174
68
68
  _m 173
69
69
  p_ 170
70
70
  _g 170
@@ -80,25 +80,25 @@ _l 153
80
80
  g_ 153
81
81
  _b 152
82
82
  _t_ 151
83
+ |\ 149
83
84
  _|\ 149
84
85
  _. 149
85
- |\ 149
86
86
  _( 148
87
87
  R_ 143
88
88
  l_ 140
89
89
  _k 138
90
90
  k_ 136
91
- c_ 134
92
91
  _._ 134
93
- _w_ 133
92
+ c_ 134
94
93
  w_ 133
94
+ _w_ 133
95
95
  T_ 130
96
96
  N_ 125
97
97
  - 121
98
98
  _m_ 120
99
99
  +_ 119
100
- E 118
101
100
  _y_ 118
101
+ E 118
102
102
  O 117
103
103
  _g_ 114
104
104
  _p_ 112
@@ -107,294 +107,294 @@ x 107
107
107
  o 107
108
108
  W 106
109
109
  i 105
110
- |\| 104
110
+ _c_ 104
111
111
  _|\| 104
112
+ |\| 104
112
113
  \| 104
113
- _c_ 104
114
- M 102
115
114
  u_ 102
116
115
  $ 102
116
+ M 102
117
117
  vv 101
118
118
  H_ 97
119
119
  L_ 94
120
120
  D_ 91
121
121
  _I 90
122
+ (_ 89
122
123
  F 88
123
- / 86
124
- \/ 86
125
124
  _R 86
125
+ \/ 86
126
+ / 86
126
127
  _T 84
127
128
  s 84
128
129
  _W 84
129
- _U 83
130
130
  b_ 83
131
131
  _b_ 83
132
+ _U 83
132
133
  _l_ 81
133
134
  x_ 81
134
135
  _v_ 79
136
+ _(_ 79
135
137
  _D 78
136
138
  _u_ 78
137
- (_ 77
138
139
  _vv 77
139
140
  _s 76
140
141
  $_ 72
141
142
  P 70
142
143
  _L 69
143
144
  _M 68
144
- _(_ 67
145
145
  e_ 67
146
- @R 66
147
146
  \|_ 66
148
- _k_ 66
149
- _|\|_ 66
150
147
  |\|_ 66
148
+ @R 66
151
149
  Y 66
152
- -| 65
150
+ _|\|_ 66
151
+ _k_ 66
153
152
  |- 65
153
+ -| 65
154
154
  _T_ 64
155
155
  _x 62
156
- _+_ 62
157
- vv_ 62
158
156
  @n 62
157
+ vv_ 62
158
+ _+_ 62
159
159
  _F 61
160
- F_ 60
161
160
  IN 60
162
161
  _N_ 60
162
+ F_ 60
163
163
  _H 59
164
164
  _P 59
165
165
  @_ 57
166
- _i 56
167
166
  _a 56
168
- C 54
167
+ _i 56
169
168
  _@R 54
169
+ C 54
170
+ gh 50
170
171
  s_ 50
171
172
  tz 50
172
- gh 50
173
- _R_ 49
174
173
  _vv_ 49
175
- _C 48
174
+ _R_ 49
176
175
  ! 48
177
176
  _F_ 48
178
- _+h 47
177
+ _C 48
179
178
  _O 47
180
179
  +h 47
180
+ _+h 47
181
181
  cH 47
182
182
  |-| 47
183
183
  _x_ 46
184
- _|\/ 45
185
- \/| 45
186
184
  /| 45
187
185
  |\/| 45
188
- _|\/| 45
189
186
  |\/ 45
187
+ _|\/| 45
188
+ \/| 45
189
+ _|\/ 45
190
+ tz_ 44
190
191
  u|_ 44
191
192
  u| 44
192
- tz_ 44
193
193
  E_ 44
194
+ Wh 43
194
195
  _IN 43
195
- Ul 43
196
196
  _gh 43
197
- Wh 43
197
+ Ul 43
198
198
  ) 43
199
- _s_ 42
200
199
  || 42
200
+ _s_ 42
201
201
  A 41
202
202
  Wh_ 41
203
- @R_ 40
204
- Or 40
205
- _L_ 40
206
203
  p| 40
204
+ Or 40
205
+ @R_ 40
207
206
  nd 40
208
- z,_ 39
209
- _@R_ 39
210
- p@ 39
211
- _Wh 39
207
+ _L_ 40
212
208
  G 39
209
+ z, 39
210
+ _Wh 39
213
211
  _p@ 39
214
212
  _wh 39
213
+ z,_ 39
215
214
  wh 39
216
- z, 39
215
+ p@ 39
216
+ _@R_ 39
217
217
  d, 38
218
+ nD 38
218
219
  rz 38
219
220
  _rz 38
220
- nD 38
221
221
  aR 37
222
+ (h 37
222
223
  df 37
224
+ c| 37
223
225
  _D_ 37
224
- (h 37
225
226
  M_ 37
226
227
  _Wh_ 37
227
- c| 37
228
228
  HE 36
229
- (h_ 36
230
229
  _d, 36
230
+ (h_ 36
231
231
  Up 36
232
- _nT 35
233
- +H 35
234
- _@n 35
235
232
  nT 35
236
233
  LL 35
234
+ _@n 35
237
235
  _d@ 35
238
- cH_ 35
239
236
  d@ 35
240
- B 34
237
+ _nT 35
238
+ cH_ 35
239
+ +H 35
240
+ pdf 34
241
241
  _u| 34
242
- _u|_ 34
243
- j 34
242
+ B 34
243
+ _Up 34
244
+ pd 34
244
245
  _j 34
245
246
  n. 34
246
- _Up 34
247
247
  y, 34
248
- pdf 34
249
- pd 34
250
- _@$ 33
248
+ _u|_ 34
249
+ j 34
250
+ lt 33
251
251
  _|| 33
252
- d,_ 33
253
252
  @$ 33
254
- lt 33
255
- Y_ 32
256
- _aR 32
257
- _zUl 32
258
- Rc 32
259
- q 32
260
- p|_ 32
261
- _$ 32
262
- _) 32
263
- _p| 32
253
+ d,_ 33
254
+ _@$ 33
264
255
  _zU 32
265
- z. 32
266
- zUl 32
256
+ Rc 32
267
257
  _p|_ 32
268
- y,_ 32
269
- _aRc 32
258
+ _) 32
259
+ _$ 32
260
+ p|_ 32
270
261
  aRc 32
262
+ Y_ 32
263
+ @r 32
264
+ _aRc 32
265
+ zUl 32
266
+ _p| 32
267
+ _zUl 32
271
268
  M@ 32
272
269
  zU 32
273
- @r 32
270
+ _aR 32
271
+ z. 32
272
+ y,_ 32
273
+ q 32
274
274
  _B 31
275
+ -|_ 31
276
+ o_ 31
275
277
  _d,_ 31
276
278
  |-|_ 31
277
- o_ 31
278
279
  nd_ 31
279
- -|_ 31
280
280
  |\/|_ 30
281
+ \/|_ 30
281
282
  _M_ 30
282
283
  /|_ 30
283
- \/|_ 30
284
- @g 29
285
284
  _b|_ 29
286
- _+H 29
287
285
  |__ 29
288
- b| 29
289
- gh_ 29
290
- r. 29
291
286
  _b| 29
292
- h@ 29
293
- O_ 29
287
+ r. 29
288
+ @g 29
294
289
  b|_ 29
290
+ O_ 29
291
+ _+H 29
292
+ -_ 29
295
293
  )_ 29
296
- PH 28
297
- ||_ 28
298
- De 28
294
+ gh_ 29
295
+ b| 29
296
+ h@ 29
299
297
  G_ 28
300
- _nT_ 28
298
+ ||_ 28
299
+ ve 28
300
+ _nd 28
301
+ z._ 28
301
302
  nT_ 28
302
- _H_ 28
303
303
  ND 28
304
+ PH 28
304
305
  n._ 28
305
- ve 28
306
- z._ 28
306
+ _nT_ 28
307
+ De 28
307
308
  K 28
308
- _nd 28
309
- D, 27
309
+ _H_ 28
310
310
  W_ 27
311
311
  I_ 27
312
- _PH 27
313
312
  _K 27
313
+ D, 27
314
314
  _n. 27
315
315
  ve_ 27
316
- ph 26
316
+ _PH 27
317
+ r._ 26
317
318
  rE 26
319
+ n,_ 26
320
+ Up_ 26
321
+ _rE 26
318
322
  _+hO 26
319
323
  _tz 26
320
- _rE 26
324
+ _(h 26
325
+ LL_ 26
321
326
  @n_ 26
327
+ +hO 26
322
328
  hO 26
323
- Up_ 26
324
329
  n, 26
325
- _(h 26
326
- +hO 26
327
- LL_ 26
328
- r._ 26
329
- n,_ 26
330
+ ph 26
331
+ fOr 25
332
+ _tz_ 25
330
333
  _(h_ 25
334
+ rz_ 25
331
335
  = 25
336
+ _rz_ 25
337
+ +HE 25
332
338
  fO 25
333
- rz_ 25
334
- != 25
335
- _r. 25
336
- +o 25
337
339
  _+HE 25
338
- +HE 25
340
+ _r. 25
341
+ != 25
339
342
  P_ 25
340
- _tz_ 25
341
- _rz_ 25
342
- fOr 25
343
- _pR_ 24
344
- _wh_ 24
345
- Ult 24
343
+ +o 25
344
+ pR_ 24
345
+ _!= 24
346
+ _zUlt 24
347
+ $. 24
346
348
  _pR 24
349
+ ltz 24
347
350
  _Up_ 24
348
- nt 24
351
+ pR 24
352
+ zUlt 24
353
+ _iN 24
354
+ _wh_ 24
355
+ _+o 24
356
+ _nt 24
349
357
  wh_ 24
350
- Or_ 24
358
+ iN 24
351
359
  \/_ 24
352
- ' 24
353
- _nt 24
354
360
  zUltz 24
355
- iN 24
356
- _! 24
357
- _zUlt 24
358
361
  @g_ 24
359
- Ultz 24
362
+ ' 24
363
+ _pR_ 24
360
364
  /_ 24
361
- pR_ 24
362
- pR 24
363
- zUlt 24
364
- _+o 24
365
- _!= 24
366
- $. 24
367
- ltz 24
365
+ nt 24
366
+ Ult 24
368
367
  In 24
369
- _iN 24
370
- RcH 23
371
- rv_ 23
372
- rv 23
368
+ Ultz 24
369
+ Or_ 24
370
+ _! 24
373
371
  _|\|d 23
374
- |d 23
375
- _pd 23
376
- |\|d 23
377
- aRcH 23
378
- _rv 23
372
+ _n,_ 23
379
373
  \|d 23
380
- g|_ 23
381
374
  d. 23
375
+ _rv 23
376
+ aRcH 23
377
+ RcH 23
378
+ be 23
379
+ rv_ 23
382
380
  !_ 23
383
- _aRcH 23
384
- _n,_ 23
381
+ _rv_ 23
382
+ _n, 23
383
+ rv 23
384
+ _be 23
385
385
  _pdf 23
386
386
  rc 23
387
+ _aRcH 23
387
388
  ,. 23
389
+ |d 23
390
+ g|_ 23
391
+ |\|d 23
388
392
  D,_ 23
389
- _be 23
390
- _rv_ 23
393
+ _pd 23
391
394
  g| 23
392
- be 23
393
- _n, 23
394
- |_| 22
395
- \|d_ 22
396
- BuT 22
397
395
  ,,_ 22
398
- Bu 22
399
- nc 22
400
- $._ 22
396
+ |_| 22
397
+ _- 22
398
+ |d_ 22
399
+ d._ 22
400
+ _r._ 22