scylla 0.4.3 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (94) hide show
  1. data/Gemfile +1 -0
  2. data/Gemfile.lock +10 -0
  3. data/VERSION +1 -1
  4. data/lib/scylla/generator.rb +1 -1
  5. data/lib/scylla/lms/13375P33K.lm +156 -156
  6. data/lib/scylla/lms/arabic.lm +133 -133
  7. data/lib/scylla/lms/bulgarian.lm +122 -122
  8. data/lib/scylla/lms/catalan.lm +151 -151
  9. data/lib/scylla/lms/danish.lm +137 -137
  10. data/lib/scylla/lms/english.lm +207 -207
  11. data/lib/scylla/lms/french.lm +400 -400
  12. data/lib/scylla/lms/japanese.lm +400 -400
  13. data/lib/scylla/lms/korean.lm +233 -233
  14. data/lib/scylla/lms/norwegian.lm +398 -398
  15. data/lib/scylla/lms/spanish.lm +98 -98
  16. data/lib/scylla/lms/swedish.lm +123 -123
  17. data/lib/scylla/lms/tagalog.lm +223 -223
  18. data/lib/scylla/lms/welsh.lm +234 -234
  19. data/lib/scylla/resources.rb +10 -10
  20. data/scylla.gemspec +17 -40
  21. data/source_texts/catalan.txt +28 -28
  22. data/source_texts/danish.txt +62 -62
  23. data/source_texts/english.txt +10 -10
  24. data/source_texts/french.txt +470 -77
  25. data/source_texts/japanese.txt +453 -199
  26. data/source_texts/norwegian.txt +96 -63
  27. data/source_texts/spanish.txt +269 -269
  28. data/test/classifier_test.rb +2 -2
  29. data/test/fixtures/lms/13375p33k.lm +156 -156
  30. data/test/fixtures/lms/danish.lm +137 -137
  31. data/test/fixtures/lms/english.lm +207 -207
  32. data/test/fixtures/lms/french.lm +400 -400
  33. data/test/fixtures/lms/hindi.lm +400 -0
  34. data/test/fixtures/lms/italian.lm +400 -0
  35. data/test/fixtures/lms/japanese.lm +400 -400
  36. data/test/fixtures/lms/norwegian.lm +400 -0
  37. data/test/fixtures/lms/spanish.lm +98 -98
  38. data/test/fixtures/source_texts/danish.txt +62 -62
  39. data/test/fixtures/source_texts/english.txt +10 -10
  40. data/test/fixtures/source_texts/french.txt +470 -77
  41. data/test/fixtures/source_texts/hindi.txt +199 -0
  42. data/test/fixtures/source_texts/italian.txt +120 -0
  43. data/test/fixtures/source_texts/japanese.txt +453 -199
  44. data/test/fixtures/source_texts/norwegian.txt +190 -0
  45. data/test/fixtures/source_texts/spanish.txt +269 -269
  46. data/test/fixtures/test_languages/english +61 -0
  47. data/test/fixtures/test_languages/french +0 -0
  48. data/test/fixtures/test_languages/german +29 -0
  49. data/test/fixtures/test_languages/hindi +3 -0
  50. data/test/fixtures/test_languages/italian +6 -0
  51. data/test/fixtures/test_languages/japanese +79 -0
  52. data/test/fixtures/test_languages/norwegian +14 -0
  53. data/test/fixtures/test_languages/spanish +22 -0
  54. data/test/generator_test.rb +0 -1
  55. data/test/language_test.rb +28 -0
  56. metadata +20 -43
  57. data/lib/scylla/lms/esperanto.lm +0 -400
  58. data/lib/scylla/lms/hungarian.lm +0 -400
  59. data/lib/scylla/lms/irish.lm +0 -400
  60. data/lib/scylla/lms/kannada.lm +0 -400
  61. data/lib/scylla/lms/latin.lm +0 -400
  62. data/lib/scylla/lms/malay.lm +0 -400
  63. data/lib/scylla/lms/marathi.lm +0 -400
  64. data/lib/scylla/lms/mingo.lm +0 -400
  65. data/lib/scylla/lms/nepali.lm +0 -400
  66. data/lib/scylla/lms/quechua.lm +0 -400
  67. data/lib/scylla/lms/rumantsch.lm +0 -400
  68. data/lib/scylla/lms/sanskrit.lm +0 -400
  69. data/lib/scylla/lms/scots_gaelic.lm +0 -400
  70. data/lib/scylla/lms/serbian.lm +0 -400
  71. data/lib/scylla/lms/swahili.lm +0 -400
  72. data/lib/scylla/lms/tamil.lm +0 -400
  73. data/lib/scylla/lms/ukrainian.lm +0 -400
  74. data/lib/scylla/lms/yiddish.lm +0 -400
  75. data/source_texts/esperanto.txt +0 -199
  76. data/source_texts/hungarian.txt +0 -102
  77. data/source_texts/irish.txt +0 -209
  78. data/source_texts/kannada.txt +0 -283
  79. data/source_texts/latin.txt +0 -120
  80. data/source_texts/malay.txt +0 -108
  81. data/source_texts/marathi.txt +0 -100
  82. data/source_texts/mingo.txt +0 -146
  83. data/source_texts/nepali.txt +0 -131
  84. data/source_texts/quechua.txt +0 -108
  85. data/source_texts/rumantsch.txt +0 -110
  86. data/source_texts/sanskrit.txt +0 -135
  87. data/source_texts/scots_gaelic.txt +0 -93
  88. data/source_texts/serbian.txt +0 -121
  89. data/source_texts/swahili.txt +0 -120
  90. data/source_texts/tamil.txt +0 -167
  91. data/source_texts/ukrainian.txt +0 -214
  92. data/source_texts/yiddish-utf.txt +0 -83
  93. data/test/fixtures/lms/kannada.lm +0 -400
  94. data/test/fixtures/source_texts/kannada.txt +0 -283
data/Gemfile CHANGED
@@ -13,4 +13,5 @@ end
13
13
  group :test do
14
14
  gem "shoulda", ">= 0"
15
15
  gem "mocha", "~> 0.9.12", :require => nil
16
+ gem "ruby-debug", "~> 0.10.4"
16
17
  end
data/Gemfile.lock CHANGED
@@ -1,13 +1,22 @@
1
1
  GEM
2
2
  remote: http://rubygems.org/
3
3
  specs:
4
+ columnize (0.3.4)
4
5
  git (1.2.5)
5
6
  jeweler (1.6.4)
6
7
  bundler (~> 1.0)
7
8
  git (>= 1.2.5)
8
9
  rake
10
+ linecache (0.46)
11
+ rbx-require-relative (> 0.0.4)
9
12
  mocha (0.9.12)
10
13
  rake (0.9.2)
14
+ rbx-require-relative (0.0.5)
15
+ ruby-debug (0.10.4)
16
+ columnize (>= 0.1)
17
+ ruby-debug-base (~> 0.10.4.0)
18
+ ruby-debug-base (0.10.4)
19
+ linecache (>= 0.3)
11
20
  shoulda (2.11.3)
12
21
 
13
22
  PLATFORMS
@@ -17,4 +26,5 @@ DEPENDENCIES
17
26
  bundler (~> 1.0.0)
18
27
  jeweler (~> 1.6.4)
19
28
  mocha (~> 0.9.12)
29
+ ruby-debug (~> 0.10.4)
20
30
  shoulda
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.4.3
1
+ 0.5.0
@@ -49,7 +49,7 @@ module Scylla
49
49
  input.each_line {|line| text += line.strip }
50
50
  input = text
51
51
  ngram = Hash.new
52
- input.split(/[0-9\s]/).each do |word|
52
+ input.split(/[\d\s\[\]]/).each do |word|
53
53
  word = "_" + word + "_";
54
54
  len = word.size
55
55
  for i in 0..word.size
@@ -1,5 +1,5 @@
1
- _ 23034
2
- __ 3952
1
+ _ 23082
2
+ __ 3963
3
3
  | 1114
4
4
  |_ 748
5
5
  n 708
@@ -21,14 +21,14 @@ p 425
21
21
  _|_ 395
22
22
  h_ 365
23
23
  _d 363
24
- z_ 350
25
24
  n_ 350
25
+ z_ 350
26
26
  c 345
27
27
  v 339
28
28
  N 331
29
29
  _r_ 326
30
- d_ 319
31
30
  _p 319
31
+ d_ 319
32
32
  f 302
33
33
  l 297
34
34
  u 295
@@ -47,12 +47,12 @@ k 229
47
47
  f_ 221
48
48
  _c 220
49
49
  H 208
50
- U 203
51
50
  _h 203
52
- w 202
51
+ U 203
53
52
  _w 202
53
+ w 202
54
54
  m_ 199
55
- _, 195
55
+ _, 196
56
56
  _u 193
57
57
  \ 190
58
58
  _f_ 190
@@ -62,9 +62,9 @@ T 187
62
62
  L 187
63
63
  v_ 184
64
64
  _N 179
65
- _,_ 175
66
- D 174
65
+ _,_ 176
67
66
  I 174
67
+ D 174
68
68
  _m 173
69
69
  p_ 170
70
70
  _g 170
@@ -80,25 +80,25 @@ _l 153
80
80
  g_ 153
81
81
  _b 152
82
82
  _t_ 151
83
+ |\ 149
83
84
  _|\ 149
84
85
  _. 149
85
- |\ 149
86
86
  _( 148
87
87
  R_ 143
88
88
  l_ 140
89
89
  _k 138
90
90
  k_ 136
91
- c_ 134
92
91
  _._ 134
93
- _w_ 133
92
+ c_ 134
94
93
  w_ 133
94
+ _w_ 133
95
95
  T_ 130
96
96
  N_ 125
97
97
  - 121
98
98
  _m_ 120
99
99
  +_ 119
100
- E 118
101
100
  _y_ 118
101
+ E 118
102
102
  O 117
103
103
  _g_ 114
104
104
  _p_ 112
@@ -107,294 +107,294 @@ x 107
107
107
  o 107
108
108
  W 106
109
109
  i 105
110
- |\| 104
110
+ _c_ 104
111
111
  _|\| 104
112
+ |\| 104
112
113
  \| 104
113
- _c_ 104
114
- M 102
115
114
  u_ 102
116
115
  $ 102
116
+ M 102
117
117
  vv 101
118
118
  H_ 97
119
119
  L_ 94
120
120
  D_ 91
121
121
  _I 90
122
+ (_ 89
122
123
  F 88
123
- / 86
124
- \/ 86
125
124
  _R 86
125
+ \/ 86
126
+ / 86
126
127
  _T 84
127
128
  s 84
128
129
  _W 84
129
- _U 83
130
130
  b_ 83
131
131
  _b_ 83
132
+ _U 83
132
133
  _l_ 81
133
134
  x_ 81
134
135
  _v_ 79
136
+ _(_ 79
135
137
  _D 78
136
138
  _u_ 78
137
- (_ 77
138
139
  _vv 77
139
140
  _s 76
140
141
  $_ 72
141
142
  P 70
142
143
  _L 69
143
144
  _M 68
144
- _(_ 67
145
145
  e_ 67
146
- @R 66
147
146
  \|_ 66
148
- _k_ 66
149
- _|\|_ 66
150
147
  |\|_ 66
148
+ @R 66
151
149
  Y 66
152
- -| 65
150
+ _|\|_ 66
151
+ _k_ 66
153
152
  |- 65
153
+ -| 65
154
154
  _T_ 64
155
155
  _x 62
156
- _+_ 62
157
- vv_ 62
158
156
  @n 62
157
+ vv_ 62
158
+ _+_ 62
159
159
  _F 61
160
- F_ 60
161
160
  IN 60
162
161
  _N_ 60
162
+ F_ 60
163
163
  _H 59
164
164
  _P 59
165
165
  @_ 57
166
- _i 56
167
166
  _a 56
168
- C 54
167
+ _i 56
169
168
  _@R 54
169
+ C 54
170
+ gh 50
170
171
  s_ 50
171
172
  tz 50
172
- gh 50
173
- _R_ 49
174
173
  _vv_ 49
175
- _C 48
174
+ _R_ 49
176
175
  ! 48
177
176
  _F_ 48
178
- _+h 47
177
+ _C 48
179
178
  _O 47
180
179
  +h 47
180
+ _+h 47
181
181
  cH 47
182
182
  |-| 47
183
183
  _x_ 46
184
- _|\/ 45
185
- \/| 45
186
184
  /| 45
187
185
  |\/| 45
188
- _|\/| 45
189
186
  |\/ 45
187
+ _|\/| 45
188
+ \/| 45
189
+ _|\/ 45
190
+ tz_ 44
190
191
  u|_ 44
191
192
  u| 44
192
- tz_ 44
193
193
  E_ 44
194
+ Wh 43
194
195
  _IN 43
195
- Ul 43
196
196
  _gh 43
197
- Wh 43
197
+ Ul 43
198
198
  ) 43
199
- _s_ 42
200
199
  || 42
200
+ _s_ 42
201
201
  A 41
202
202
  Wh_ 41
203
- @R_ 40
204
- Or 40
205
- _L_ 40
206
203
  p| 40
204
+ Or 40
205
+ @R_ 40
207
206
  nd 40
208
- z,_ 39
209
- _@R_ 39
210
- p@ 39
211
- _Wh 39
207
+ _L_ 40
212
208
  G 39
209
+ z, 39
210
+ _Wh 39
213
211
  _p@ 39
214
212
  _wh 39
213
+ z,_ 39
215
214
  wh 39
216
- z, 39
215
+ p@ 39
216
+ _@R_ 39
217
217
  d, 38
218
+ nD 38
218
219
  rz 38
219
220
  _rz 38
220
- nD 38
221
221
  aR 37
222
+ (h 37
222
223
  df 37
224
+ c| 37
223
225
  _D_ 37
224
- (h 37
225
226
  M_ 37
226
227
  _Wh_ 37
227
- c| 37
228
228
  HE 36
229
- (h_ 36
230
229
  _d, 36
230
+ (h_ 36
231
231
  Up 36
232
- _nT 35
233
- +H 35
234
- _@n 35
235
232
  nT 35
236
233
  LL 35
234
+ _@n 35
237
235
  _d@ 35
238
- cH_ 35
239
236
  d@ 35
240
- B 34
237
+ _nT 35
238
+ cH_ 35
239
+ +H 35
240
+ pdf 34
241
241
  _u| 34
242
- _u|_ 34
243
- j 34
242
+ B 34
243
+ _Up 34
244
+ pd 34
244
245
  _j 34
245
246
  n. 34
246
- _Up 34
247
247
  y, 34
248
- pdf 34
249
- pd 34
250
- _@$ 33
248
+ _u|_ 34
249
+ j 34
250
+ lt 33
251
251
  _|| 33
252
- d,_ 33
253
252
  @$ 33
254
- lt 33
255
- Y_ 32
256
- _aR 32
257
- _zUl 32
258
- Rc 32
259
- q 32
260
- p|_ 32
261
- _$ 32
262
- _) 32
263
- _p| 32
253
+ d,_ 33
254
+ _@$ 33
264
255
  _zU 32
265
- z. 32
266
- zUl 32
256
+ Rc 32
267
257
  _p|_ 32
268
- y,_ 32
269
- _aRc 32
258
+ _) 32
259
+ _$ 32
260
+ p|_ 32
270
261
  aRc 32
262
+ Y_ 32
263
+ @r 32
264
+ _aRc 32
265
+ zUl 32
266
+ _p| 32
267
+ _zUl 32
271
268
  M@ 32
272
269
  zU 32
273
- @r 32
270
+ _aR 32
271
+ z. 32
272
+ y,_ 32
273
+ q 32
274
274
  _B 31
275
+ -|_ 31
276
+ o_ 31
275
277
  _d,_ 31
276
278
  |-|_ 31
277
- o_ 31
278
279
  nd_ 31
279
- -|_ 31
280
280
  |\/|_ 30
281
+ \/|_ 30
281
282
  _M_ 30
282
283
  /|_ 30
283
- \/|_ 30
284
- @g 29
285
284
  _b|_ 29
286
- _+H 29
287
285
  |__ 29
288
- b| 29
289
- gh_ 29
290
- r. 29
291
286
  _b| 29
292
- h@ 29
293
- O_ 29
287
+ r. 29
288
+ @g 29
294
289
  b|_ 29
290
+ O_ 29
291
+ _+H 29
292
+ -_ 29
295
293
  )_ 29
296
- PH 28
297
- ||_ 28
298
- De 28
294
+ gh_ 29
295
+ b| 29
296
+ h@ 29
299
297
  G_ 28
300
- _nT_ 28
298
+ ||_ 28
299
+ ve 28
300
+ _nd 28
301
+ z._ 28
301
302
  nT_ 28
302
- _H_ 28
303
303
  ND 28
304
+ PH 28
304
305
  n._ 28
305
- ve 28
306
- z._ 28
306
+ _nT_ 28
307
+ De 28
307
308
  K 28
308
- _nd 28
309
- D, 27
309
+ _H_ 28
310
310
  W_ 27
311
311
  I_ 27
312
- _PH 27
313
312
  _K 27
313
+ D, 27
314
314
  _n. 27
315
315
  ve_ 27
316
- ph 26
316
+ _PH 27
317
+ r._ 26
317
318
  rE 26
319
+ n,_ 26
320
+ Up_ 26
321
+ _rE 26
318
322
  _+hO 26
319
323
  _tz 26
320
- _rE 26
324
+ _(h 26
325
+ LL_ 26
321
326
  @n_ 26
327
+ +hO 26
322
328
  hO 26
323
- Up_ 26
324
329
  n, 26
325
- _(h 26
326
- +hO 26
327
- LL_ 26
328
- r._ 26
329
- n,_ 26
330
+ ph 26
331
+ fOr 25
332
+ _tz_ 25
330
333
  _(h_ 25
334
+ rz_ 25
331
335
  = 25
336
+ _rz_ 25
337
+ +HE 25
332
338
  fO 25
333
- rz_ 25
334
- != 25
335
- _r. 25
336
- +o 25
337
339
  _+HE 25
338
- +HE 25
340
+ _r. 25
341
+ != 25
339
342
  P_ 25
340
- _tz_ 25
341
- _rz_ 25
342
- fOr 25
343
- _pR_ 24
344
- _wh_ 24
345
- Ult 24
343
+ +o 25
344
+ pR_ 24
345
+ _!= 24
346
+ _zUlt 24
347
+ $. 24
346
348
  _pR 24
349
+ ltz 24
347
350
  _Up_ 24
348
- nt 24
351
+ pR 24
352
+ zUlt 24
353
+ _iN 24
354
+ _wh_ 24
355
+ _+o 24
356
+ _nt 24
349
357
  wh_ 24
350
- Or_ 24
358
+ iN 24
351
359
  \/_ 24
352
- ' 24
353
- _nt 24
354
360
  zUltz 24
355
- iN 24
356
- _! 24
357
- _zUlt 24
358
361
  @g_ 24
359
- Ultz 24
362
+ ' 24
363
+ _pR_ 24
360
364
  /_ 24
361
- pR_ 24
362
- pR 24
363
- zUlt 24
364
- _+o 24
365
- _!= 24
366
- $. 24
367
- ltz 24
365
+ nt 24
366
+ Ult 24
368
367
  In 24
369
- _iN 24
370
- RcH 23
371
- rv_ 23
372
- rv 23
368
+ Ultz 24
369
+ Or_ 24
370
+ _! 24
373
371
  _|\|d 23
374
- |d 23
375
- _pd 23
376
- |\|d 23
377
- aRcH 23
378
- _rv 23
372
+ _n,_ 23
379
373
  \|d 23
380
- g|_ 23
381
374
  d. 23
375
+ _rv 23
376
+ aRcH 23
377
+ RcH 23
378
+ be 23
379
+ rv_ 23
382
380
  !_ 23
383
- _aRcH 23
384
- _n,_ 23
381
+ _rv_ 23
382
+ _n, 23
383
+ rv 23
384
+ _be 23
385
385
  _pdf 23
386
386
  rc 23
387
+ _aRcH 23
387
388
  ,. 23
389
+ |d 23
390
+ g|_ 23
391
+ |\|d 23
388
392
  D,_ 23
389
- _be 23
390
- _rv_ 23
393
+ _pd 23
391
394
  g| 23
392
- be 23
393
- _n, 23
394
- |_| 22
395
- \|d_ 22
396
- BuT 22
397
395
  ,,_ 22
398
- Bu 22
399
- nc 22
400
- $._ 22
396
+ |_| 22
397
+ _- 22
398
+ |d_ 22
399
+ d._ 22
400
+ _r._ 22