scylla 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (80) hide show
  1. data/Gemfile +0 -1
  2. data/Gemfile.lock +0 -10
  3. data/README.rdoc +22 -0
  4. data/VERSION +1 -1
  5. data/bin/scylla +13 -0
  6. data/lib/scylla/classifier.rb +2 -2
  7. data/lib/scylla/generator.rb +1 -1
  8. data/lib/scylla/lms/13375P33K.lm +400 -0
  9. data/lib/scylla/lms/afrikaans.lm +400 -0
  10. data/lib/scylla/lms/arabic.lm +400 -0
  11. data/lib/scylla/lms/bulgarian.lm +400 -0
  12. data/lib/scylla/lms/catalan.lm +400 -0
  13. data/lib/scylla/lms/chinese.lm +400 -0
  14. data/lib/scylla/lms/danish.lm +400 -0
  15. data/lib/scylla/lms/english.lm +400 -0
  16. data/lib/scylla/lms/esperanto.lm +400 -0
  17. data/lib/scylla/lms/finnish.lm +400 -0
  18. data/lib/scylla/lms/french.lm +400 -0
  19. data/lib/scylla/lms/german.lm +400 -0
  20. data/lib/scylla/lms/greek-iso8859-7.lm +400 -0
  21. data/lib/scylla/lms/hebrew.lm +400 -0
  22. data/lib/scylla/lms/hindi.lm +400 -0
  23. data/lib/scylla/lms/hungarian.lm +400 -0
  24. data/lib/scylla/lms/icelandic.lm +400 -0
  25. data/lib/scylla/lms/indonesian.lm +400 -0
  26. data/lib/scylla/lms/irish.lm +400 -0
  27. data/lib/scylla/lms/italian.lm +400 -0
  28. data/lib/scylla/lms/japanese.lm +400 -0
  29. data/lib/scylla/lms/kannada.lm +400 -0
  30. data/lib/scylla/lms/korean.lm +400 -0
  31. data/lib/scylla/lms/latin.lm +400 -0
  32. data/lib/scylla/lms/malay.lm +400 -0
  33. data/lib/scylla/lms/marathi.lm +400 -0
  34. data/lib/scylla/lms/mingo.lm +400 -0
  35. data/lib/scylla/lms/nepali.lm +400 -0
  36. data/lib/scylla/lms/norwegian.lm +400 -0
  37. data/lib/scylla/lms/polish.lm +400 -0
  38. data/lib/scylla/lms/portuguese.lm +400 -0
  39. data/lib/scylla/lms/quechua.lm +400 -0
  40. data/lib/scylla/lms/romanian.lm +400 -0
  41. data/lib/scylla/lms/rumantsch.lm +400 -0
  42. data/lib/scylla/lms/russian.lm +400 -0
  43. data/lib/scylla/lms/sanskrit.lm +400 -0
  44. data/lib/scylla/lms/scots_gaelic.lm +400 -0
  45. data/lib/scylla/lms/serbian-ascii.lm +400 -0
  46. data/lib/scylla/lms/slovak-ascii.lm +400 -0
  47. data/lib/scylla/lms/slovenian-ascii.lm +400 -0
  48. data/lib/scylla/lms/spanish.lm +400 -0
  49. data/lib/scylla/lms/swahili.lm +400 -0
  50. data/lib/scylla/lms/swedish.lm +400 -0
  51. data/lib/scylla/lms/tagalog.lm +400 -0
  52. data/lib/scylla/lms/tamil.lm +400 -0
  53. data/lib/scylla/lms/thai.lm +400 -0
  54. data/lib/scylla/lms/turkish.lm +400 -0
  55. data/lib/scylla/lms/ukrainian-koi8_u.lm +400 -0
  56. data/lib/scylla/lms/vietnamese.lm +400 -0
  57. data/lib/scylla/lms/welsh.lm +400 -0
  58. data/lib/scylla/lms/yiddish-utf.lm +400 -0
  59. data/lib/scylla/loader.rb +8 -1
  60. data/scylla-0.1.0.gem +0 -0
  61. data/scylla.gemspec +69 -3
  62. data/source_texts/kannada.txt +283 -0
  63. data/test/classifier_test.rb +7 -0
  64. data/test/fixtures/lms/13375p33k.lm +400 -0
  65. data/test/fixtures/lms/danish.lm +400 -0
  66. data/test/fixtures/lms/english.lm +400 -0
  67. data/test/fixtures/lms/french.lm +400 -0
  68. data/test/fixtures/lms/german.lm +400 -0
  69. data/test/fixtures/lms/japanese.lm +400 -0
  70. data/test/fixtures/lms/kannada.lm +400 -0
  71. data/test/fixtures/lms/spanish.lm +400 -0
  72. data/test/fixtures/source_texts/13375P33K.txt +199 -0
  73. data/test/fixtures/source_texts/japanese.txt +199 -0
  74. data/test/fixtures/source_texts/kannada.txt +283 -0
  75. data/test/generator_test.rb +10 -7
  76. data/test/helper.rb +5 -6
  77. data/test/loader_test.rb +1 -0
  78. data/test/scylla_test.rb +1 -0
  79. metadata +78 -14
  80. data/source_texts/armenian.txt +0 -86
@@ -0,0 +1,400 @@
1
+ � 1822
2
+ _ 734
3
+ _� 343
4
+ � 273
5
+ � 222
6
+ א 222
7
+ � 203
8
+ ע 203
9
+ ע� 178
10
+ �� 178
11
+ �� 167
12
+ א� 167
13
+ � 158
14
+ י 158
15
+ � 133
16
+ ר 133
17
+ י� 126
18
+ �� 126
19
+ ַ 122
20
+ � 122
21
+ � 108
22
+ ט 108
23
+ �� 101
24
+ ַ� 101
25
+ אַ 100
26
+ �ַ 100
27
+ ן 98
28
+ � 98
29
+ �� 91
30
+ ר� 91
31
+ _א 86
32
+ � 82
33
+ ל 82
34
+ �ַ� 81
35
+ אַ� 81
36
+ ו 81
37
+ � 81
38
+ �_ 77
39
+ ן_ 77
40
+ ו� 73
41
+ �� 73
42
+ נ 68
43
+ � 68
44
+ � 67
45
+ �ָ 67
46
+ אָ 67
47
+ ָ 67
48
+ ס 65
49
+ � 65
50
+ �� 64
51
+ ָ� 64
52
+ �� 64
53
+ ל� 64
54
+ אָ� 64
55
+ �ָ� 64
56
+ ער 63
57
+ �ר 63
58
+ נ� 62
59
+ �� 62
60
+ �� 62
61
+ ט� 62
62
+ פ 61
63
+ �� 61
64
+ � 61
65
+ ד� 61
66
+ פ� 61
67
+ �� 61
68
+ � 61
69
+ ד 61
70
+ �� 55
71
+ א� 55
72
+ � 51
73
+ מ 51
74
+ � 50
75
+ ק 50
76
+ ש 48
77
+ � 48
78
+ _א� 46
79
+ מ� 46
80
+ �� 46
81
+ � 43
82
+ �� 43
83
+ ֿ 43
84
+ ק� 43
85
+ � 42
86
+ �ֿ 42
87
+ ֿ� 42
88
+ ז 42
89
+ פֿ 42
90
+ �� 42
91
+ �ֿ� 41
92
+ װ 41
93
+ � 41
94
+ ײ 41
95
+ פֿ� 41
96
+ � 41
97
+ װ� 40
98
+ _אַ 40
99
+ � 40
100
+ �� 40
101
+ _א� 40
102
+ ג 40
103
+ �� 38
104
+ ש� 38
105
+ ג� 37
106
+ ס� 37
107
+ �� 37
108
+ �� 37
109
+ , 36
110
+ �_ 35
111
+ ט_ 35
112
+ ער� 35
113
+ �ר� 35
114
+ ר_ 34
115
+ �_ 34
116
+ �� 34
117
+ ז� 34
118
+ � 32
119
+ ב 32
120
+ ב� 30
121
+ �� 30
122
+ ּ 28
123
+ _פ 28
124
+ _פ� 28
125
+ � 28
126
+ ה 26
127
+ _ד� 26
128
+ ,_ 26
129
+ �ע 26
130
+ � 26
131
+ דע 26
132
+ _ד 26
133
+ �� 25
134
+ ּ� 25
135
+ �ע� 24
136
+ �ע� 24
137
+ �ר_ 24
138
+ ון 24
139
+ ער_ 24
140
+ _װ 24
141
+ �ע 24
142
+ גע� 24
143
+ ה� 24
144
+ �� 24
145
+ גע 24
146
+ �ן 24
147
+ דע� 24
148
+ �ן_ 23
149
+ _װ� 23
150
+ ון_ 23
151
+ �ל 22
152
+ �� 22
153
+ � 22
154
+ �ע 22
155
+ ײַ 22
156
+ צ 22
157
+ �� 22
158
+ �ַ 22
159
+ צ� 22
160
+ לע 22
161
+ ַל 22
162
+ ײ� 22
163
+ �_ 21
164
+ ס_ 21
165
+ או 20
166
+ _מ 20
167
+ �ו 20
168
+ �ר 20
169
+ �ַר 20
170
+ _פֿ 20
171
+ �ַ� 20
172
+ �_ 20
173
+ ײַ� 20
174
+ �ו� 20
175
+ או� 20
176
+ ַר 20
177
+ ַ_ 20
178
+ _מ� 19
179
+ אי� 19
180
+ �ַל 19
181
+ פּ 19
182
+ �י 19
183
+ ױ 19
184
+ � 19
185
+ אי 19
186
+ �י� 19
187
+ �ע� 19
188
+ �י 19
189
+ לע� 19
190
+ �ּ 19
191
+ די 19
192
+ � 18
193
+ �א� 18
194
+ _ז� 18
195
+ אַ_ 18
196
+ " 18
197
+ _ז 18
198
+ רא 18
199
+ �א 18
200
+ רא� 18
201
+ ם 18
202
+ �ַ_ 18
203
+ ַר� 18
204
+ ך 18
205
+ � 18
206
+ ַל� 18
207
+ �ר� 18
208
+ �ל� 18
209
+ �� 17
210
+ �ער 17
211
+ פּ� 17
212
+ �_ 17
213
+ י_ 17
214
+ ײ� 17
215
+ �ּ� 17
216
+ כ 16
217
+ ע_ 16
218
+ �_ 16
219
+ - 16
220
+ _אי 16
221
+ � 16
222
+ �ון 16
223
+ �ע 15
224
+ נע 15
225
+ _ש 15
226
+ מע 15
227
+ ױ� 15
228
+ �� 15
229
+ . 15
230
+ �_ 15
231
+ ך_ 15
232
+ �ע 15
233
+ _או 15
234
+ �ש 14
235
+ �ק 14
236
+ יק 14
237
+ זי 14
238
+ �י 14
239
+ יש 14
240
+ װא 13
241
+ �א� 13
242
+ �ע� 13
243
+ װע 13
244
+ יִ 13
245
+ יט 13
246
+ יִ� 13
247
+ _י� 13
248
+ הא 13
249
+ ִ� 13
250
+ �� 13
251
+ י� 13
252
+ װא� 13
253
+ � 13
254
+ �ע 13
255
+ �� 13
256
+ �ע 13
257
+ ִ 13
258
+ טע 13
259
+ �א 13
260
+ �ִ 13
261
+ הא� 13
262
+ �ן 13
263
+ _י 13
264
+ װע� 13
265
+ �א� 13
266
+ �ע� 13
267
+ ען 13
268
+ _דע 13
269
+ מע� 13
270
+ �ִ� 13
271
+ �א 13
272
+ �ט 13
273
+ ענ� 12
274
+ �ֿו 12
275
+ �נ� 12
276
+ �נ 12
277
+ ענ 12
278
+ _ק 12
279
+ ני 12
280
+ ַז 12
281
+ ַט 12
282
+ _ק� 12
283
+ ַנ� 12
284
+ ֿא� 12
285
+ �ֿא 12
286
+ �י 12
287
+ �א 12
288
+ ֿו� 12
289
+ ַנ 12
290
+ �ו� 12
291
+ �י 12
292
+ �ו 12
293
+ �נ 12
294
+ לי 12
295
+ �נ� 12
296
+ ֿא 12
297
+ �ז 12
298
+ _ש� 12
299
+ �ט 12
300
+ ֿו 12
301
+ �א� 12
302
+ �� 11
303
+ �ע� 11
304
+ רע 11
305
+ �ע 11
306
+ _ה� 11
307
+ �אָ 11
308
+ ן� 11
309
+ _צ� 11
310
+ �ס 11
311
+ �י� 11
312
+ �ָט 11
313
+ על 11
314
+ טא 11
315
+ _ה 11
316
+ �ט 11
317
+ �ל 11
318
+ _ג 11
319
+ _זי 11
320
+ _ב 11
321
+ �ן 11
322
+ לי� 11
323
+ ין 11
324
+ �א 11
325
+ _ב� 11
326
+ ָט 11
327
+ �ך 11
328
+ _צ 11
329
+ יך 11
330
+ עס 11
331
+ טע� 11
332
+ __ 11
333
+ "� 10
334
+ �ל� 10
335
+ �י� 10
336
+ �י� 10
337
+ �_ 10
338
+ שע 10
339
+ ומ 10
340
+ -� 10
341
+ �ע 10
342
+ �ַז 10
343
+ יש� 10
344
+ �ן_ 10
345
+ _ל 10
346
+ די_ 10
347
+ ם_ 10
348
+ �ש� 10
349
+ ען_ 10
350
+ �מ 10
351
+ יק� 10
352
+ ני� 10
353
+ �ל 10
354
+ �י_ 10
355
+ _ג� 10
356
+ ל_ 10
357
+ �ק� 10
358
+ �א� 10
359
+ �_ 10
360
+ _װע 10
361
+ זי� 10
362
+ טא� 10
363
+ קל 10
364
+ קל� 10
365
+ מי 9
366
+ מי� 9
367
+ �ק 9
368
+ �ס 9
369
+ _הא 9
370
+ �ין 9
371
+ די� 9
372
+ יס 9
373
+ �אָ 9
374
+ יך_ 9
375
+ ,� 9
376
+ �י 9
377
+ �י� 9
378
+ שט 9
379
+ ָס 9
380
+ �ן_ 9
381
+ _די 9
382
+ �ס 9
383
+ בע 9
384
+ בע� 9
385
+ נע� 9
386
+ �י� 9
387
+ �ע� 9
388
+ _ל� 9
389
+ �ך_ 9
390
+ �ע� 9
391
+ �אָ 9
392
+ �ָס 9
393
+ ין_ 9
394
+ עק 9
395
+ _װא 9
396
+ �ט 9
397
+ �אַ 9
398
+ �ע 9
399
+ �ע� 9
400
+ רע� 9
data/lib/scylla/loader.rb CHANGED
@@ -1,16 +1,20 @@
1
1
  module Scylla
2
2
  class Loader
3
+ @@dir = DEFAULT_TARGET_DIR
3
4
  # Loads all the language maps once into memory using the .lm files located
4
5
  # in lib/scylla/lm
5
6
  def self.load_language_maps
6
7
  languages = Hash.new
7
- Dir.glob("**/*.lm").each do |filepath|
8
+ Dir.glob(File.join(@@dir, "*.lm")).each do |filepath|
8
9
  language = File.basename(filepath, ".lm")
9
10
  languages[language] = language_map(filepath)
10
11
  end
11
12
  return languages
12
13
  end
13
14
 
15
+ def self.dir
16
+ return @@dir
17
+ end
14
18
  # Returns a single language map from a specified .lm file
15
19
  def self.language_map(path)
16
20
  rank, ngram = 1, Hash.new
@@ -24,6 +28,9 @@ module Scylla
24
28
  return ngram
25
29
  end
26
30
 
31
+ def self.set_dir(dir)
32
+ @@dir = dir
33
+ end
27
34
  # Loads all maps from the .lm files, or loads them from memory if the
28
35
  # files have already been read and loaded.
29
36
  def self.languages
data/scylla-0.1.0.gem ADDED
Binary file
data/scylla.gemspec CHANGED
@@ -5,13 +5,15 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{scylla}
8
- s.version = "0.1.0"
8
+ s.version = "0.2.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Ashwin Hegde"]
12
- s.date = %q{2011-08-25}
12
+ s.date = %q{2011-08-26}
13
+ s.default_executable = %q{scylla}
13
14
  s.description = %q{Allows for text categorization by guessing the language of a given text using n-grams}
14
15
  s.email = %q{ahegde@zendesk.com}
16
+ s.executables = ["scylla"]
15
17
  s.extra_rdoc_files = [
16
18
  "LICENSE.txt",
17
19
  "README.rdoc"
@@ -24,17 +26,69 @@ Gem::Specification.new do |s|
24
26
  "README.rdoc",
25
27
  "Rakefile",
26
28
  "VERSION",
29
+ "bin/scylla",
27
30
  "lib/scylla.rb",
28
31
  "lib/scylla/classifier.rb",
29
32
  "lib/scylla/generator.rb",
33
+ "lib/scylla/lms/13375P33K.lm",
34
+ "lib/scylla/lms/afrikaans.lm",
35
+ "lib/scylla/lms/arabic.lm",
36
+ "lib/scylla/lms/bulgarian.lm",
37
+ "lib/scylla/lms/catalan.lm",
38
+ "lib/scylla/lms/chinese.lm",
39
+ "lib/scylla/lms/danish.lm",
40
+ "lib/scylla/lms/english.lm",
41
+ "lib/scylla/lms/esperanto.lm",
42
+ "lib/scylla/lms/finnish.lm",
43
+ "lib/scylla/lms/french.lm",
44
+ "lib/scylla/lms/german.lm",
45
+ "lib/scylla/lms/greek-iso8859-7.lm",
46
+ "lib/scylla/lms/hebrew.lm",
47
+ "lib/scylla/lms/hindi.lm",
48
+ "lib/scylla/lms/hungarian.lm",
49
+ "lib/scylla/lms/icelandic.lm",
50
+ "lib/scylla/lms/indonesian.lm",
51
+ "lib/scylla/lms/irish.lm",
52
+ "lib/scylla/lms/italian.lm",
53
+ "lib/scylla/lms/japanese.lm",
54
+ "lib/scylla/lms/kannada.lm",
55
+ "lib/scylla/lms/korean.lm",
56
+ "lib/scylla/lms/latin.lm",
57
+ "lib/scylla/lms/malay.lm",
58
+ "lib/scylla/lms/marathi.lm",
59
+ "lib/scylla/lms/mingo.lm",
60
+ "lib/scylla/lms/nepali.lm",
61
+ "lib/scylla/lms/norwegian.lm",
62
+ "lib/scylla/lms/polish.lm",
63
+ "lib/scylla/lms/portuguese.lm",
64
+ "lib/scylla/lms/quechua.lm",
65
+ "lib/scylla/lms/romanian.lm",
66
+ "lib/scylla/lms/rumantsch.lm",
67
+ "lib/scylla/lms/russian.lm",
68
+ "lib/scylla/lms/sanskrit.lm",
69
+ "lib/scylla/lms/scots_gaelic.lm",
70
+ "lib/scylla/lms/serbian-ascii.lm",
71
+ "lib/scylla/lms/slovak-ascii.lm",
72
+ "lib/scylla/lms/slovenian-ascii.lm",
73
+ "lib/scylla/lms/spanish.lm",
74
+ "lib/scylla/lms/swahili.lm",
75
+ "lib/scylla/lms/swedish.lm",
76
+ "lib/scylla/lms/tagalog.lm",
77
+ "lib/scylla/lms/tamil.lm",
78
+ "lib/scylla/lms/thai.lm",
79
+ "lib/scylla/lms/turkish.lm",
80
+ "lib/scylla/lms/ukrainian-koi8_u.lm",
81
+ "lib/scylla/lms/vietnamese.lm",
82
+ "lib/scylla/lms/welsh.lm",
83
+ "lib/scylla/lms/yiddish-utf.lm",
30
84
  "lib/scylla/loader.rb",
31
85
  "lib/scylla/string.rb",
32
86
  "lib/scylla/tasks.rb",
87
+ "scylla-0.1.0.gem",
33
88
  "scylla.gemspec",
34
89
  "source_texts/13375P33K.txt",
35
90
  "source_texts/afrikaans.txt",
36
91
  "source_texts/arabic.txt",
37
- "source_texts/armenian.txt",
38
92
  "source_texts/bulgarian.txt",
39
93
  "source_texts/catalan.txt",
40
94
  "source_texts/chinese.txt",
@@ -53,6 +107,7 @@ Gem::Specification.new do |s|
53
107
  "source_texts/irish.txt",
54
108
  "source_texts/italian.txt",
55
109
  "source_texts/japanese.txt",
110
+ "source_texts/kannada.txt",
56
111
  "source_texts/korean.txt",
57
112
  "source_texts/latin.txt",
58
113
  "source_texts/malay.txt",
@@ -83,10 +138,21 @@ Gem::Specification.new do |s|
83
138
  "source_texts/welsh.txt",
84
139
  "source_texts/yiddish-utf.txt",
85
140
  "test/classifier_test.rb",
141
+ "test/fixtures/lms/13375p33k.lm",
142
+ "test/fixtures/lms/danish.lm",
143
+ "test/fixtures/lms/english.lm",
144
+ "test/fixtures/lms/french.lm",
145
+ "test/fixtures/lms/german.lm",
146
+ "test/fixtures/lms/japanese.lm",
147
+ "test/fixtures/lms/kannada.lm",
148
+ "test/fixtures/lms/spanish.lm",
149
+ "test/fixtures/source_texts/13375P33K.txt",
86
150
  "test/fixtures/source_texts/danish.txt",
87
151
  "test/fixtures/source_texts/english.txt",
88
152
  "test/fixtures/source_texts/french.txt",
89
153
  "test/fixtures/source_texts/german.txt",
154
+ "test/fixtures/source_texts/japanese.txt",
155
+ "test/fixtures/source_texts/kannada.txt",
90
156
  "test/fixtures/source_texts/spanish.txt",
91
157
  "test/generator_test.rb",
92
158
  "test/helper.rb",