scylla 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80) hide show
  1. data/Gemfile +0 -1
  2. data/Gemfile.lock +0 -10
  3. data/README.rdoc +22 -0
  4. data/VERSION +1 -1
  5. data/bin/scylla +13 -0
  6. data/lib/scylla/classifier.rb +2 -2
  7. data/lib/scylla/generator.rb +1 -1
  8. data/lib/scylla/lms/13375P33K.lm +400 -0
  9. data/lib/scylla/lms/afrikaans.lm +400 -0
  10. data/lib/scylla/lms/arabic.lm +400 -0
  11. data/lib/scylla/lms/bulgarian.lm +400 -0
  12. data/lib/scylla/lms/catalan.lm +400 -0
  13. data/lib/scylla/lms/chinese.lm +400 -0
  14. data/lib/scylla/lms/danish.lm +400 -0
  15. data/lib/scylla/lms/english.lm +400 -0
  16. data/lib/scylla/lms/esperanto.lm +400 -0
  17. data/lib/scylla/lms/finnish.lm +400 -0
  18. data/lib/scylla/lms/french.lm +400 -0
  19. data/lib/scylla/lms/german.lm +400 -0
  20. data/lib/scylla/lms/greek-iso8859-7.lm +400 -0
  21. data/lib/scylla/lms/hebrew.lm +400 -0
  22. data/lib/scylla/lms/hindi.lm +400 -0
  23. data/lib/scylla/lms/hungarian.lm +400 -0
  24. data/lib/scylla/lms/icelandic.lm +400 -0
  25. data/lib/scylla/lms/indonesian.lm +400 -0
  26. data/lib/scylla/lms/irish.lm +400 -0
  27. data/lib/scylla/lms/italian.lm +400 -0
  28. data/lib/scylla/lms/japanese.lm +400 -0
  29. data/lib/scylla/lms/kannada.lm +400 -0
  30. data/lib/scylla/lms/korean.lm +400 -0
  31. data/lib/scylla/lms/latin.lm +400 -0
  32. data/lib/scylla/lms/malay.lm +400 -0
  33. data/lib/scylla/lms/marathi.lm +400 -0
  34. data/lib/scylla/lms/mingo.lm +400 -0
  35. data/lib/scylla/lms/nepali.lm +400 -0
  36. data/lib/scylla/lms/norwegian.lm +400 -0
  37. data/lib/scylla/lms/polish.lm +400 -0
  38. data/lib/scylla/lms/portuguese.lm +400 -0
  39. data/lib/scylla/lms/quechua.lm +400 -0
  40. data/lib/scylla/lms/romanian.lm +400 -0
  41. data/lib/scylla/lms/rumantsch.lm +400 -0
  42. data/lib/scylla/lms/russian.lm +400 -0
  43. data/lib/scylla/lms/sanskrit.lm +400 -0
  44. data/lib/scylla/lms/scots_gaelic.lm +400 -0
  45. data/lib/scylla/lms/serbian-ascii.lm +400 -0
  46. data/lib/scylla/lms/slovak-ascii.lm +400 -0
  47. data/lib/scylla/lms/slovenian-ascii.lm +400 -0
  48. data/lib/scylla/lms/spanish.lm +400 -0
  49. data/lib/scylla/lms/swahili.lm +400 -0
  50. data/lib/scylla/lms/swedish.lm +400 -0
  51. data/lib/scylla/lms/tagalog.lm +400 -0
  52. data/lib/scylla/lms/tamil.lm +400 -0
  53. data/lib/scylla/lms/thai.lm +400 -0
  54. data/lib/scylla/lms/turkish.lm +400 -0
  55. data/lib/scylla/lms/ukrainian-koi8_u.lm +400 -0
  56. data/lib/scylla/lms/vietnamese.lm +400 -0
  57. data/lib/scylla/lms/welsh.lm +400 -0
  58. data/lib/scylla/lms/yiddish-utf.lm +400 -0
  59. data/lib/scylla/loader.rb +8 -1
  60. data/scylla-0.1.0.gem +0 -0
  61. data/scylla.gemspec +69 -3
  62. data/source_texts/kannada.txt +283 -0
  63. data/test/classifier_test.rb +7 -0
  64. data/test/fixtures/lms/13375p33k.lm +400 -0
  65. data/test/fixtures/lms/danish.lm +400 -0
  66. data/test/fixtures/lms/english.lm +400 -0
  67. data/test/fixtures/lms/french.lm +400 -0
  68. data/test/fixtures/lms/german.lm +400 -0
  69. data/test/fixtures/lms/japanese.lm +400 -0
  70. data/test/fixtures/lms/kannada.lm +400 -0
  71. data/test/fixtures/lms/spanish.lm +400 -0
  72. data/test/fixtures/source_texts/13375P33K.txt +199 -0
  73. data/test/fixtures/source_texts/japanese.txt +199 -0
  74. data/test/fixtures/source_texts/kannada.txt +283 -0
  75. data/test/generator_test.rb +10 -7
  76. data/test/helper.rb +5 -6
  77. data/test/loader_test.rb +1 -0
  78. data/test/scylla_test.rb +1 -0
  79. metadata +78 -14
  80. data/source_texts/armenian.txt +0 -86
data/Gemfile CHANGED
@@ -13,5 +13,4 @@ end
13
13
  group :test do
14
14
  gem "shoulda", ">= 0"
15
15
  gem "mocha", "~> 0.9.12", :require => nil
16
- gem "ruby-debug"
17
16
  end
data/Gemfile.lock CHANGED
@@ -1,22 +1,13 @@
1
1
  GEM
2
2
  remote: http://rubygems.org/
3
3
  specs:
4
- columnize (0.3.4)
5
4
  git (1.2.5)
6
5
  jeweler (1.6.4)
7
6
  bundler (~> 1.0)
8
7
  git (>= 1.2.5)
9
8
  rake
10
- linecache (0.46)
11
- rbx-require-relative (> 0.0.4)
12
9
  mocha (0.9.12)
13
10
  rake (0.9.2)
14
- rbx-require-relative (0.0.5)
15
- ruby-debug (0.10.4)
16
- columnize (>= 0.1)
17
- ruby-debug-base (~> 0.10.4.0)
18
- ruby-debug-base (0.10.4)
19
- linecache (>= 0.3)
20
11
  shoulda (2.11.3)
21
12
 
22
13
  PLATFORMS
@@ -26,5 +17,4 @@ DEPENDENCIES
26
17
  bundler (~> 1.0.0)
27
18
  jeweler (~> 1.6.4)
28
19
  mocha (~> 0.9.12)
29
- ruby-debug
30
20
  shoulda
data/README.rdoc CHANGED
@@ -2,6 +2,28 @@
2
2
 
3
3
  Scylla is a language categorizing gem that allows you to guess the language of a given text. Scylla is a Ruby port of TextCat (http://www.let.rug.nl/~vannoord/TextCat) and is based on the text categorization algorithm presented in Cavnar, W. B. and J. M. Trenkle, ``N-Gram-Based Text Categorization'' In Proceedings of Third Annual Symposium on Document Analysis and Information Retrieval, Las Vegas, NV, UNLV Publications/Reprographics, pp. 161-175, 11-13 April 1994.
4
4
 
5
+ Installation:
6
+
7
+ gem install scylla
8
+
9
+ Usage:
10
+
11
+ require 'scylla'
12
+
13
+ "this is english text".language
14
+ => "english"
15
+
16
+ "Este es un texto español".language
17
+ => "spanish"
18
+
19
+ Multiple results for other possible languages:
20
+
21
+ "isso poderia ser confundido com espanhol, bem".language
22
+ => "portuguese"
23
+
24
+ "isso poderia ser confundido com espanhol, bem".guess
25
+ => ["portuguese", "spanish"]
26
+
5
27
  == Contributing to scylla
6
28
 
7
29
  * Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.0
1
+ 0.2.0
data/bin/scylla ADDED
@@ -0,0 +1,13 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'scylla'
4
+ phrase = ""
5
+ puts "Welcome to Scylla Language Guesser"
6
+ puts "Enter a phrase which you would like to identify"
7
+ puts "Type exit to quit"
8
+ while(phrase != "exit")
9
+ puts "Phrase:"
10
+ STDOUT.flush
11
+ phrase = gets.chomp
12
+ puts phrase.guess.join(" or ")
13
+ end
@@ -1,6 +1,6 @@
1
1
  module Scylla
2
2
  class Classifier
3
- attr_accessor :limit, :dir, :ngrams, :threshold, :input
3
+ attr_accessor :limit, :ngrams, :threshold, :input
4
4
 
5
5
  # limit : Up to how many matching language results should be displayed
6
6
  # ngrams : The total number of ngrams that are stored for each language
@@ -30,7 +30,7 @@ module Scylla
30
30
  results = Hash.new
31
31
  languages = Scylla::Loader.languages
32
32
  if languages.empty?
33
- p "No languages (.lm files) found. Please run rake scylla:train after placing your training texts in the source_texts directory."
33
+ p "No languages (.lm files) found in + " + Scylla::Loader.dir + ". Please run rake scylla:train after placing your training texts in the source_texts directory."
34
34
  return
35
35
  end
36
36
  sg = Scylla::Generator.new
@@ -14,7 +14,7 @@ module Scylla
14
14
  # and creates language maps using ngram frequencies. The maps are stored in
15
15
  # lib/scylla/lms as .lm files
16
16
  def train
17
- languages = Dir.glob("**/*.lm")
17
+ languages = Dir.glob(@dirlm + "/*.lm")
18
18
  textpaths = Dir.glob(@dirtext + "/*.txt")
19
19
  languages.each {|l| File.delete(l) }
20
20
  textpaths.each do |path|
@@ -0,0 +1,400 @@
1
+ _ 23034
2
+ __ 3952
3
+ | 1114
4
+ |_ 748
5
+ n 708
6
+ r 683
7
+ , 659
8
+ ,_ 614
9
+ _| 605
10
+ . 575
11
+ _n 563
12
+ h 516
13
+ _r 510
14
+ ._ 505
15
+ d 493
16
+ z 485
17
+ @ 447
18
+ r_ 443
19
+ t 428
20
+ p 425
21
+ _|_ 395
22
+ h_ 365
23
+ _d 363
24
+ z_ 350
25
+ n_ 350
26
+ c 345
27
+ v 339
28
+ N 331
29
+ _r_ 326
30
+ d_ 319
31
+ _p 319
32
+ f 302
33
+ l 297
34
+ u 295
35
+ _n_ 285
36
+ + 273
37
+ R 271
38
+ _z 264
39
+ m 258
40
+ t_ 251
41
+ _t 250
42
+ y 248
43
+ _d_ 241
44
+ g 240
45
+ _f 239
46
+ k 229
47
+ f_ 221
48
+ _c 220
49
+ H 208
50
+ U 203
51
+ _h 203
52
+ w 202
53
+ _w 202
54
+ m_ 199
55
+ _, 195
56
+ _u 193
57
+ \ 190
58
+ _f_ 190
59
+ _z_ 189
60
+ _+ 187
61
+ T 187
62
+ L 187
63
+ v_ 184
64
+ _N 179
65
+ _,_ 175
66
+ D 174
67
+ I 174
68
+ _m 173
69
+ p_ 170
70
+ _g 170
71
+ _y 169
72
+ ( 169
73
+ _v 168
74
+ y_ 165
75
+ b 163
76
+ _@ 162
77
+ e 158
78
+ _h_ 156
79
+ _l 153
80
+ g_ 153
81
+ _b 152
82
+ _t_ 151
83
+ _|\ 149
84
+ _. 149
85
+ |\ 149
86
+ _( 148
87
+ R_ 143
88
+ l_ 140
89
+ _k 138
90
+ k_ 136
91
+ c_ 134
92
+ _._ 134
93
+ _w_ 133
94
+ w_ 133
95
+ T_ 130
96
+ N_ 125
97
+ - 121
98
+ _m_ 120
99
+ +_ 119
100
+ E 118
101
+ _y_ 118
102
+ O 117
103
+ _g_ 114
104
+ _p_ 112
105
+ a 111
106
+ x 107
107
+ o 107
108
+ W 106
109
+ i 105
110
+ |\| 104
111
+ _|\| 104
112
+ \| 104
113
+ _c_ 104
114
+ M 102
115
+ u_ 102
116
+ $ 102
117
+ vv 101
118
+ H_ 97
119
+ L_ 94
120
+ D_ 91
121
+ _I 90
122
+ F 88
123
+ / 86
124
+ \/ 86
125
+ _R 86
126
+ _T 84
127
+ s 84
128
+ _W 84
129
+ _U 83
130
+ b_ 83
131
+ _b_ 83
132
+ _l_ 81
133
+ x_ 81
134
+ _v_ 79
135
+ _D 78
136
+ _u_ 78
137
+ (_ 77
138
+ _vv 77
139
+ _s 76
140
+ $_ 72
141
+ P 70
142
+ _L 69
143
+ _M 68
144
+ _(_ 67
145
+ e_ 67
146
+ @R 66
147
+ \|_ 66
148
+ _k_ 66
149
+ _|\|_ 66
150
+ |\|_ 66
151
+ Y 66
152
+ -| 65
153
+ |- 65
154
+ _T_ 64
155
+ _x 62
156
+ _+_ 62
157
+ vv_ 62
158
+ @n 62
159
+ _F 61
160
+ F_ 60
161
+ IN 60
162
+ _N_ 60
163
+ _H 59
164
+ _P 59
165
+ @_ 57
166
+ _i 56
167
+ _a 56
168
+ C 54
169
+ _@R 54
170
+ s_ 50
171
+ tz 50
172
+ gh 50
173
+ _R_ 49
174
+ _vv_ 49
175
+ _C 48
176
+ ! 48
177
+ _F_ 48
178
+ _+h 47
179
+ _O 47
180
+ +h 47
181
+ cH 47
182
+ |-| 47
183
+ _x_ 46
184
+ _|\/ 45
185
+ \/| 45
186
+ /| 45
187
+ |\/| 45
188
+ _|\/| 45
189
+ |\/ 45
190
+ u|_ 44
191
+ u| 44
192
+ tz_ 44
193
+ E_ 44
194
+ _IN 43
195
+ Ul 43
196
+ _gh 43
197
+ Wh 43
198
+ ) 43
199
+ _s_ 42
200
+ || 42
201
+ A 41
202
+ Wh_ 41
203
+ @R_ 40
204
+ Or 40
205
+ _L_ 40
206
+ p| 40
207
+ nd 40
208
+ z,_ 39
209
+ _@R_ 39
210
+ p@ 39
211
+ _Wh 39
212
+ G 39
213
+ _p@ 39
214
+ _wh 39
215
+ wh 39
216
+ z, 39
217
+ d, 38
218
+ rz 38
219
+ _rz 38
220
+ nD 38
221
+ aR 37
222
+ df 37
223
+ _D_ 37
224
+ (h 37
225
+ M_ 37
226
+ _Wh_ 37
227
+ c| 37
228
+ HE 36
229
+ (h_ 36
230
+ _d, 36
231
+ Up 36
232
+ _nT 35
233
+ +H 35
234
+ _@n 35
235
+ nT 35
236
+ LL 35
237
+ _d@ 35
238
+ cH_ 35
239
+ d@ 35
240
+ B 34
241
+ _u| 34
242
+ _u|_ 34
243
+ j 34
244
+ _j 34
245
+ n. 34
246
+ _Up 34
247
+ y, 34
248
+ pdf 34
249
+ pd 34
250
+ _@$ 33
251
+ _|| 33
252
+ d,_ 33
253
+ @$ 33
254
+ lt 33
255
+ Y_ 32
256
+ _aR 32
257
+ _zUl 32
258
+ Rc 32
259
+ q 32
260
+ p|_ 32
261
+ _$ 32
262
+ _) 32
263
+ _p| 32
264
+ _zU 32
265
+ z. 32
266
+ zUl 32
267
+ _p|_ 32
268
+ y,_ 32
269
+ _aRc 32
270
+ aRc 32
271
+ M@ 32
272
+ zU 32
273
+ @r 32
274
+ _B 31
275
+ _d,_ 31
276
+ |-|_ 31
277
+ o_ 31
278
+ nd_ 31
279
+ -|_ 31
280
+ |\/|_ 30
281
+ _M_ 30
282
+ /|_ 30
283
+ \/|_ 30
284
+ @g 29
285
+ _b|_ 29
286
+ _+H 29
287
+ |__ 29
288
+ b| 29
289
+ gh_ 29
290
+ r. 29
291
+ _b| 29
292
+ h@ 29
293
+ O_ 29
294
+ b|_ 29
295
+ )_ 29
296
+ PH 28
297
+ ||_ 28
298
+ De 28
299
+ G_ 28
300
+ _nT_ 28
301
+ nT_ 28
302
+ _H_ 28
303
+ ND 28
304
+ n._ 28
305
+ ve 28
306
+ z._ 28
307
+ K 28
308
+ _nd 28
309
+ D, 27
310
+ W_ 27
311
+ I_ 27
312
+ _PH 27
313
+ _K 27
314
+ _n. 27
315
+ ve_ 27
316
+ ph 26
317
+ rE 26
318
+ _+hO 26
319
+ _tz 26
320
+ _rE 26
321
+ @n_ 26
322
+ hO 26
323
+ Up_ 26
324
+ n, 26
325
+ _(h 26
326
+ +hO 26
327
+ LL_ 26
328
+ r._ 26
329
+ n,_ 26
330
+ _(h_ 25
331
+ = 25
332
+ fO 25
333
+ rz_ 25
334
+ != 25
335
+ _r. 25
336
+ +o 25
337
+ _+HE 25
338
+ +HE 25
339
+ P_ 25
340
+ _tz_ 25
341
+ _rz_ 25
342
+ fOr 25
343
+ _pR_ 24
344
+ _wh_ 24
345
+ Ult 24
346
+ _pR 24
347
+ _Up_ 24
348
+ nt 24
349
+ wh_ 24
350
+ Or_ 24
351
+ \/_ 24
352
+ ' 24
353
+ _nt 24
354
+ zUltz 24
355
+ iN 24
356
+ _! 24
357
+ _zUlt 24
358
+ @g_ 24
359
+ Ultz 24
360
+ /_ 24
361
+ pR_ 24
362
+ pR 24
363
+ zUlt 24
364
+ _+o 24
365
+ _!= 24
366
+ $. 24
367
+ ltz 24
368
+ In 24
369
+ _iN 24
370
+ RcH 23
371
+ rv_ 23
372
+ rv 23
373
+ _|\|d 23
374
+ |d 23
375
+ _pd 23
376
+ |\|d 23
377
+ aRcH 23
378
+ _rv 23
379
+ \|d 23
380
+ g|_ 23
381
+ d. 23
382
+ !_ 23
383
+ _aRcH 23
384
+ _n,_ 23
385
+ _pdf 23
386
+ rc 23
387
+ ,. 23
388
+ D,_ 23
389
+ _be 23
390
+ _rv_ 23
391
+ g| 23
392
+ be 23
393
+ _n, 23
394
+ |_| 22
395
+ \|d_ 22
396
+ BuT 22
397
+ ,,_ 22
398
+ Bu 22
399
+ nc 22
400
+ $._ 22