scylla 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +0 -1
- data/Gemfile.lock +0 -10
- data/README.rdoc +22 -0
- data/VERSION +1 -1
- data/bin/scylla +13 -0
- data/lib/scylla/classifier.rb +2 -2
- data/lib/scylla/generator.rb +1 -1
- data/lib/scylla/lms/13375P33K.lm +400 -0
- data/lib/scylla/lms/afrikaans.lm +400 -0
- data/lib/scylla/lms/arabic.lm +400 -0
- data/lib/scylla/lms/bulgarian.lm +400 -0
- data/lib/scylla/lms/catalan.lm +400 -0
- data/lib/scylla/lms/chinese.lm +400 -0
- data/lib/scylla/lms/danish.lm +400 -0
- data/lib/scylla/lms/english.lm +400 -0
- data/lib/scylla/lms/esperanto.lm +400 -0
- data/lib/scylla/lms/finnish.lm +400 -0
- data/lib/scylla/lms/french.lm +400 -0
- data/lib/scylla/lms/german.lm +400 -0
- data/lib/scylla/lms/greek-iso8859-7.lm +400 -0
- data/lib/scylla/lms/hebrew.lm +400 -0
- data/lib/scylla/lms/hindi.lm +400 -0
- data/lib/scylla/lms/hungarian.lm +400 -0
- data/lib/scylla/lms/icelandic.lm +400 -0
- data/lib/scylla/lms/indonesian.lm +400 -0
- data/lib/scylla/lms/irish.lm +400 -0
- data/lib/scylla/lms/italian.lm +400 -0
- data/lib/scylla/lms/japanese.lm +400 -0
- data/lib/scylla/lms/kannada.lm +400 -0
- data/lib/scylla/lms/korean.lm +400 -0
- data/lib/scylla/lms/latin.lm +400 -0
- data/lib/scylla/lms/malay.lm +400 -0
- data/lib/scylla/lms/marathi.lm +400 -0
- data/lib/scylla/lms/mingo.lm +400 -0
- data/lib/scylla/lms/nepali.lm +400 -0
- data/lib/scylla/lms/norwegian.lm +400 -0
- data/lib/scylla/lms/polish.lm +400 -0
- data/lib/scylla/lms/portuguese.lm +400 -0
- data/lib/scylla/lms/quechua.lm +400 -0
- data/lib/scylla/lms/romanian.lm +400 -0
- data/lib/scylla/lms/rumantsch.lm +400 -0
- data/lib/scylla/lms/russian.lm +400 -0
- data/lib/scylla/lms/sanskrit.lm +400 -0
- data/lib/scylla/lms/scots_gaelic.lm +400 -0
- data/lib/scylla/lms/serbian-ascii.lm +400 -0
- data/lib/scylla/lms/slovak-ascii.lm +400 -0
- data/lib/scylla/lms/slovenian-ascii.lm +400 -0
- data/lib/scylla/lms/spanish.lm +400 -0
- data/lib/scylla/lms/swahili.lm +400 -0
- data/lib/scylla/lms/swedish.lm +400 -0
- data/lib/scylla/lms/tagalog.lm +400 -0
- data/lib/scylla/lms/tamil.lm +400 -0
- data/lib/scylla/lms/thai.lm +400 -0
- data/lib/scylla/lms/turkish.lm +400 -0
- data/lib/scylla/lms/ukrainian-koi8_u.lm +400 -0
- data/lib/scylla/lms/vietnamese.lm +400 -0
- data/lib/scylla/lms/welsh.lm +400 -0
- data/lib/scylla/lms/yiddish-utf.lm +400 -0
- data/lib/scylla/loader.rb +8 -1
- data/scylla-0.1.0.gem +0 -0
- data/scylla.gemspec +69 -3
- data/source_texts/kannada.txt +283 -0
- data/test/classifier_test.rb +7 -0
- data/test/fixtures/lms/13375p33k.lm +400 -0
- data/test/fixtures/lms/danish.lm +400 -0
- data/test/fixtures/lms/english.lm +400 -0
- data/test/fixtures/lms/french.lm +400 -0
- data/test/fixtures/lms/german.lm +400 -0
- data/test/fixtures/lms/japanese.lm +400 -0
- data/test/fixtures/lms/kannada.lm +400 -0
- data/test/fixtures/lms/spanish.lm +400 -0
- data/test/fixtures/source_texts/13375P33K.txt +199 -0
- data/test/fixtures/source_texts/japanese.txt +199 -0
- data/test/fixtures/source_texts/kannada.txt +283 -0
- data/test/generator_test.rb +10 -7
- data/test/helper.rb +5 -6
- data/test/loader_test.rb +1 -0
- data/test/scylla_test.rb +1 -0
- metadata +78 -14
- data/source_texts/armenian.txt +0 -86
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,22 +1,13 @@
|
|
1
1
|
GEM
|
2
2
|
remote: http://rubygems.org/
|
3
3
|
specs:
|
4
|
-
columnize (0.3.4)
|
5
4
|
git (1.2.5)
|
6
5
|
jeweler (1.6.4)
|
7
6
|
bundler (~> 1.0)
|
8
7
|
git (>= 1.2.5)
|
9
8
|
rake
|
10
|
-
linecache (0.46)
|
11
|
-
rbx-require-relative (> 0.0.4)
|
12
9
|
mocha (0.9.12)
|
13
10
|
rake (0.9.2)
|
14
|
-
rbx-require-relative (0.0.5)
|
15
|
-
ruby-debug (0.10.4)
|
16
|
-
columnize (>= 0.1)
|
17
|
-
ruby-debug-base (~> 0.10.4.0)
|
18
|
-
ruby-debug-base (0.10.4)
|
19
|
-
linecache (>= 0.3)
|
20
11
|
shoulda (2.11.3)
|
21
12
|
|
22
13
|
PLATFORMS
|
@@ -26,5 +17,4 @@ DEPENDENCIES
|
|
26
17
|
bundler (~> 1.0.0)
|
27
18
|
jeweler (~> 1.6.4)
|
28
19
|
mocha (~> 0.9.12)
|
29
|
-
ruby-debug
|
30
20
|
shoulda
|
data/README.rdoc
CHANGED
@@ -2,6 +2,28 @@
|
|
2
2
|
|
3
3
|
Scylla is a language categorizing gem that allows you to guess the language of a given text. Scylla is a Ruby port of TextCat (http://www.let.rug.nl/~vannoord/TextCat) and is based on the text categorization algorithm presented in Cavnar, W. B. and J. M. Trenkle, ``N-Gram-Based Text Categorization'' In Proceedings of Third Annual Symposium on Document Analysis and Information Retrieval, Las Vegas, NV, UNLV Publications/Reprographics, pp. 161-175, 11-13 April 1994.
|
4
4
|
|
5
|
+
Installation:
|
6
|
+
|
7
|
+
gem install scylla
|
8
|
+
|
9
|
+
Usage:
|
10
|
+
|
11
|
+
require 'scylla'
|
12
|
+
|
13
|
+
"this is english text".language
|
14
|
+
=> "english"
|
15
|
+
|
16
|
+
"Este es un texto español".language
|
17
|
+
=> "spanish"
|
18
|
+
|
19
|
+
Multiple results for other possible languages:
|
20
|
+
|
21
|
+
"isso poderia ser confundido com espanhol, bem".language
|
22
|
+
=> "portuguese"
|
23
|
+
|
24
|
+
"isso poderia ser confundido com espanhol, bem".guess
|
25
|
+
=> ["portuguese", "spanish"]
|
26
|
+
|
5
27
|
== Contributing to scylla
|
6
28
|
|
7
29
|
* Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
0.2.0
|
data/bin/scylla
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'scylla'
|
4
|
+
phrase = ""
|
5
|
+
puts "Welcome to Scylla Language Guesser"
|
6
|
+
puts "Enter a phrase which you would like to identify"
|
7
|
+
puts "Type exit to quit"
|
8
|
+
while(phrase != "exit")
|
9
|
+
puts "Phrase:"
|
10
|
+
STDOUT.flush
|
11
|
+
phrase = gets.chomp
|
12
|
+
puts phrase.guess.join(" or ")
|
13
|
+
end
|
data/lib/scylla/classifier.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
module Scylla
|
2
2
|
class Classifier
|
3
|
-
attr_accessor :limit, :
|
3
|
+
attr_accessor :limit, :ngrams, :threshold, :input
|
4
4
|
|
5
5
|
# limit : Up to how many matching language results should be displayed
|
6
6
|
# ngrams : The total number of ngrams that are stored for each language
|
@@ -30,7 +30,7 @@ module Scylla
|
|
30
30
|
results = Hash.new
|
31
31
|
languages = Scylla::Loader.languages
|
32
32
|
if languages.empty?
|
33
|
-
p "No languages (.lm files) found. Please run rake scylla:train after placing your training texts in the source_texts directory."
|
33
|
+
p "No languages (.lm files) found in + " + Scylla::Loader.dir + ". Please run rake scylla:train after placing your training texts in the source_texts directory."
|
34
34
|
return
|
35
35
|
end
|
36
36
|
sg = Scylla::Generator.new
|
data/lib/scylla/generator.rb
CHANGED
@@ -14,7 +14,7 @@ module Scylla
|
|
14
14
|
# and creates language maps using ngram frequencies. The maps are stored in
|
15
15
|
# lib/scylla/lms as .lm files
|
16
16
|
def train
|
17
|
-
languages = Dir.glob("
|
17
|
+
languages = Dir.glob(@dirlm + "/*.lm")
|
18
18
|
textpaths = Dir.glob(@dirtext + "/*.txt")
|
19
19
|
languages.each {|l| File.delete(l) }
|
20
20
|
textpaths.each do |path|
|
@@ -0,0 +1,400 @@
|
|
1
|
+
_ 23034
|
2
|
+
__ 3952
|
3
|
+
| 1114
|
4
|
+
|_ 748
|
5
|
+
n 708
|
6
|
+
r 683
|
7
|
+
, 659
|
8
|
+
,_ 614
|
9
|
+
_| 605
|
10
|
+
. 575
|
11
|
+
_n 563
|
12
|
+
h 516
|
13
|
+
_r 510
|
14
|
+
._ 505
|
15
|
+
d 493
|
16
|
+
z 485
|
17
|
+
@ 447
|
18
|
+
r_ 443
|
19
|
+
t 428
|
20
|
+
p 425
|
21
|
+
_|_ 395
|
22
|
+
h_ 365
|
23
|
+
_d 363
|
24
|
+
z_ 350
|
25
|
+
n_ 350
|
26
|
+
c 345
|
27
|
+
v 339
|
28
|
+
N 331
|
29
|
+
_r_ 326
|
30
|
+
d_ 319
|
31
|
+
_p 319
|
32
|
+
f 302
|
33
|
+
l 297
|
34
|
+
u 295
|
35
|
+
_n_ 285
|
36
|
+
+ 273
|
37
|
+
R 271
|
38
|
+
_z 264
|
39
|
+
m 258
|
40
|
+
t_ 251
|
41
|
+
_t 250
|
42
|
+
y 248
|
43
|
+
_d_ 241
|
44
|
+
g 240
|
45
|
+
_f 239
|
46
|
+
k 229
|
47
|
+
f_ 221
|
48
|
+
_c 220
|
49
|
+
H 208
|
50
|
+
U 203
|
51
|
+
_h 203
|
52
|
+
w 202
|
53
|
+
_w 202
|
54
|
+
m_ 199
|
55
|
+
_, 195
|
56
|
+
_u 193
|
57
|
+
\ 190
|
58
|
+
_f_ 190
|
59
|
+
_z_ 189
|
60
|
+
_+ 187
|
61
|
+
T 187
|
62
|
+
L 187
|
63
|
+
v_ 184
|
64
|
+
_N 179
|
65
|
+
_,_ 175
|
66
|
+
D 174
|
67
|
+
I 174
|
68
|
+
_m 173
|
69
|
+
p_ 170
|
70
|
+
_g 170
|
71
|
+
_y 169
|
72
|
+
( 169
|
73
|
+
_v 168
|
74
|
+
y_ 165
|
75
|
+
b 163
|
76
|
+
_@ 162
|
77
|
+
e 158
|
78
|
+
_h_ 156
|
79
|
+
_l 153
|
80
|
+
g_ 153
|
81
|
+
_b 152
|
82
|
+
_t_ 151
|
83
|
+
_|\ 149
|
84
|
+
_. 149
|
85
|
+
|\ 149
|
86
|
+
_( 148
|
87
|
+
R_ 143
|
88
|
+
l_ 140
|
89
|
+
_k 138
|
90
|
+
k_ 136
|
91
|
+
c_ 134
|
92
|
+
_._ 134
|
93
|
+
_w_ 133
|
94
|
+
w_ 133
|
95
|
+
T_ 130
|
96
|
+
N_ 125
|
97
|
+
- 121
|
98
|
+
_m_ 120
|
99
|
+
+_ 119
|
100
|
+
E 118
|
101
|
+
_y_ 118
|
102
|
+
O 117
|
103
|
+
_g_ 114
|
104
|
+
_p_ 112
|
105
|
+
a 111
|
106
|
+
x 107
|
107
|
+
o 107
|
108
|
+
W 106
|
109
|
+
i 105
|
110
|
+
|\| 104
|
111
|
+
_|\| 104
|
112
|
+
\| 104
|
113
|
+
_c_ 104
|
114
|
+
M 102
|
115
|
+
u_ 102
|
116
|
+
$ 102
|
117
|
+
vv 101
|
118
|
+
H_ 97
|
119
|
+
L_ 94
|
120
|
+
D_ 91
|
121
|
+
_I 90
|
122
|
+
F 88
|
123
|
+
/ 86
|
124
|
+
\/ 86
|
125
|
+
_R 86
|
126
|
+
_T 84
|
127
|
+
s 84
|
128
|
+
_W 84
|
129
|
+
_U 83
|
130
|
+
b_ 83
|
131
|
+
_b_ 83
|
132
|
+
_l_ 81
|
133
|
+
x_ 81
|
134
|
+
_v_ 79
|
135
|
+
_D 78
|
136
|
+
_u_ 78
|
137
|
+
(_ 77
|
138
|
+
_vv 77
|
139
|
+
_s 76
|
140
|
+
$_ 72
|
141
|
+
P 70
|
142
|
+
_L 69
|
143
|
+
_M 68
|
144
|
+
_(_ 67
|
145
|
+
e_ 67
|
146
|
+
@R 66
|
147
|
+
\|_ 66
|
148
|
+
_k_ 66
|
149
|
+
_|\|_ 66
|
150
|
+
|\|_ 66
|
151
|
+
Y 66
|
152
|
+
-| 65
|
153
|
+
|- 65
|
154
|
+
_T_ 64
|
155
|
+
_x 62
|
156
|
+
_+_ 62
|
157
|
+
vv_ 62
|
158
|
+
@n 62
|
159
|
+
_F 61
|
160
|
+
F_ 60
|
161
|
+
IN 60
|
162
|
+
_N_ 60
|
163
|
+
_H 59
|
164
|
+
_P 59
|
165
|
+
@_ 57
|
166
|
+
_i 56
|
167
|
+
_a 56
|
168
|
+
C 54
|
169
|
+
_@R 54
|
170
|
+
s_ 50
|
171
|
+
tz 50
|
172
|
+
gh 50
|
173
|
+
_R_ 49
|
174
|
+
_vv_ 49
|
175
|
+
_C 48
|
176
|
+
! 48
|
177
|
+
_F_ 48
|
178
|
+
_+h 47
|
179
|
+
_O 47
|
180
|
+
+h 47
|
181
|
+
cH 47
|
182
|
+
|-| 47
|
183
|
+
_x_ 46
|
184
|
+
_|\/ 45
|
185
|
+
\/| 45
|
186
|
+
/| 45
|
187
|
+
|\/| 45
|
188
|
+
_|\/| 45
|
189
|
+
|\/ 45
|
190
|
+
u|_ 44
|
191
|
+
u| 44
|
192
|
+
tz_ 44
|
193
|
+
E_ 44
|
194
|
+
_IN 43
|
195
|
+
Ul 43
|
196
|
+
_gh 43
|
197
|
+
Wh 43
|
198
|
+
) 43
|
199
|
+
_s_ 42
|
200
|
+
|| 42
|
201
|
+
A 41
|
202
|
+
Wh_ 41
|
203
|
+
@R_ 40
|
204
|
+
Or 40
|
205
|
+
_L_ 40
|
206
|
+
p| 40
|
207
|
+
nd 40
|
208
|
+
z,_ 39
|
209
|
+
_@R_ 39
|
210
|
+
p@ 39
|
211
|
+
_Wh 39
|
212
|
+
G 39
|
213
|
+
_p@ 39
|
214
|
+
_wh 39
|
215
|
+
wh 39
|
216
|
+
z, 39
|
217
|
+
d, 38
|
218
|
+
rz 38
|
219
|
+
_rz 38
|
220
|
+
nD 38
|
221
|
+
aR 37
|
222
|
+
df 37
|
223
|
+
_D_ 37
|
224
|
+
(h 37
|
225
|
+
M_ 37
|
226
|
+
_Wh_ 37
|
227
|
+
c| 37
|
228
|
+
HE 36
|
229
|
+
(h_ 36
|
230
|
+
_d, 36
|
231
|
+
Up 36
|
232
|
+
_nT 35
|
233
|
+
+H 35
|
234
|
+
_@n 35
|
235
|
+
nT 35
|
236
|
+
LL 35
|
237
|
+
_d@ 35
|
238
|
+
cH_ 35
|
239
|
+
d@ 35
|
240
|
+
B 34
|
241
|
+
_u| 34
|
242
|
+
_u|_ 34
|
243
|
+
j 34
|
244
|
+
_j 34
|
245
|
+
n. 34
|
246
|
+
_Up 34
|
247
|
+
y, 34
|
248
|
+
pdf 34
|
249
|
+
pd 34
|
250
|
+
_@$ 33
|
251
|
+
_|| 33
|
252
|
+
d,_ 33
|
253
|
+
@$ 33
|
254
|
+
lt 33
|
255
|
+
Y_ 32
|
256
|
+
_aR 32
|
257
|
+
_zUl 32
|
258
|
+
Rc 32
|
259
|
+
q 32
|
260
|
+
p|_ 32
|
261
|
+
_$ 32
|
262
|
+
_) 32
|
263
|
+
_p| 32
|
264
|
+
_zU 32
|
265
|
+
z. 32
|
266
|
+
zUl 32
|
267
|
+
_p|_ 32
|
268
|
+
y,_ 32
|
269
|
+
_aRc 32
|
270
|
+
aRc 32
|
271
|
+
M@ 32
|
272
|
+
zU 32
|
273
|
+
@r 32
|
274
|
+
_B 31
|
275
|
+
_d,_ 31
|
276
|
+
|-|_ 31
|
277
|
+
o_ 31
|
278
|
+
nd_ 31
|
279
|
+
-|_ 31
|
280
|
+
|\/|_ 30
|
281
|
+
_M_ 30
|
282
|
+
/|_ 30
|
283
|
+
\/|_ 30
|
284
|
+
@g 29
|
285
|
+
_b|_ 29
|
286
|
+
_+H 29
|
287
|
+
|__ 29
|
288
|
+
b| 29
|
289
|
+
gh_ 29
|
290
|
+
r. 29
|
291
|
+
_b| 29
|
292
|
+
h@ 29
|
293
|
+
O_ 29
|
294
|
+
b|_ 29
|
295
|
+
)_ 29
|
296
|
+
PH 28
|
297
|
+
||_ 28
|
298
|
+
De 28
|
299
|
+
G_ 28
|
300
|
+
_nT_ 28
|
301
|
+
nT_ 28
|
302
|
+
_H_ 28
|
303
|
+
ND 28
|
304
|
+
n._ 28
|
305
|
+
ve 28
|
306
|
+
z._ 28
|
307
|
+
K 28
|
308
|
+
_nd 28
|
309
|
+
D, 27
|
310
|
+
W_ 27
|
311
|
+
I_ 27
|
312
|
+
_PH 27
|
313
|
+
_K 27
|
314
|
+
_n. 27
|
315
|
+
ve_ 27
|
316
|
+
ph 26
|
317
|
+
rE 26
|
318
|
+
_+hO 26
|
319
|
+
_tz 26
|
320
|
+
_rE 26
|
321
|
+
@n_ 26
|
322
|
+
hO 26
|
323
|
+
Up_ 26
|
324
|
+
n, 26
|
325
|
+
_(h 26
|
326
|
+
+hO 26
|
327
|
+
LL_ 26
|
328
|
+
r._ 26
|
329
|
+
n,_ 26
|
330
|
+
_(h_ 25
|
331
|
+
= 25
|
332
|
+
fO 25
|
333
|
+
rz_ 25
|
334
|
+
!= 25
|
335
|
+
_r. 25
|
336
|
+
+o 25
|
337
|
+
_+HE 25
|
338
|
+
+HE 25
|
339
|
+
P_ 25
|
340
|
+
_tz_ 25
|
341
|
+
_rz_ 25
|
342
|
+
fOr 25
|
343
|
+
_pR_ 24
|
344
|
+
_wh_ 24
|
345
|
+
Ult 24
|
346
|
+
_pR 24
|
347
|
+
_Up_ 24
|
348
|
+
nt 24
|
349
|
+
wh_ 24
|
350
|
+
Or_ 24
|
351
|
+
\/_ 24
|
352
|
+
' 24
|
353
|
+
_nt 24
|
354
|
+
zUltz 24
|
355
|
+
iN 24
|
356
|
+
_! 24
|
357
|
+
_zUlt 24
|
358
|
+
@g_ 24
|
359
|
+
Ultz 24
|
360
|
+
/_ 24
|
361
|
+
pR_ 24
|
362
|
+
pR 24
|
363
|
+
zUlt 24
|
364
|
+
_+o 24
|
365
|
+
_!= 24
|
366
|
+
$. 24
|
367
|
+
ltz 24
|
368
|
+
In 24
|
369
|
+
_iN 24
|
370
|
+
RcH 23
|
371
|
+
rv_ 23
|
372
|
+
rv 23
|
373
|
+
_|\|d 23
|
374
|
+
|d 23
|
375
|
+
_pd 23
|
376
|
+
|\|d 23
|
377
|
+
aRcH 23
|
378
|
+
_rv 23
|
379
|
+
\|d 23
|
380
|
+
g|_ 23
|
381
|
+
d. 23
|
382
|
+
!_ 23
|
383
|
+
_aRcH 23
|
384
|
+
_n,_ 23
|
385
|
+
_pdf 23
|
386
|
+
rc 23
|
387
|
+
,. 23
|
388
|
+
D,_ 23
|
389
|
+
_be 23
|
390
|
+
_rv_ 23
|
391
|
+
g| 23
|
392
|
+
be 23
|
393
|
+
_n, 23
|
394
|
+
|_| 22
|
395
|
+
\|d_ 22
|
396
|
+
BuT 22
|
397
|
+
,,_ 22
|
398
|
+
Bu 22
|
399
|
+
nc 22
|
400
|
+
$._ 22
|