scylla 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/Gemfile +17 -0
- data/Gemfile.lock +30 -0
- data/LICENSE.txt +20 -0
- data/README.rdoc +19 -0
- data/Rakefile +52 -0
- data/VERSION +1 -0
- data/lib/scylla/classifier.rb +65 -0
- data/lib/scylla/generator.rb +73 -0
- data/lib/scylla/loader.rb +37 -0
- data/lib/scylla/string.rb +11 -0
- data/lib/scylla/tasks.rb +20 -0
- data/lib/scylla.rb +10 -0
- data/scylla.gemspec +117 -0
- data/source_texts/13375P33K.txt +199 -0
- data/source_texts/afrikaans.txt +114 -0
- data/source_texts/arabic.txt +576 -0
- data/source_texts/armenian.txt +86 -0
- data/source_texts/bulgarian.txt +834 -0
- data/source_texts/catalan.txt +413 -0
- data/source_texts/chinese.txt +199 -0
- data/source_texts/danish.txt +219 -0
- data/source_texts/english.txt +35 -0
- data/source_texts/esperanto.txt +199 -0
- data/source_texts/finnish.txt +71 -0
- data/source_texts/french.txt +89 -0
- data/source_texts/german.txt +137 -0
- data/source_texts/greek-iso8859-7.txt +139 -0
- data/source_texts/hebrew.txt +199 -0
- data/source_texts/hindi.txt +199 -0
- data/source_texts/hungarian.txt +102 -0
- data/source_texts/icelandic.txt +131 -0
- data/source_texts/indonesian.txt +93 -0
- data/source_texts/irish.txt +209 -0
- data/source_texts/italian.txt +120 -0
- data/source_texts/japanese.txt +199 -0
- data/source_texts/korean.txt +134 -0
- data/source_texts/latin.txt +120 -0
- data/source_texts/malay.txt +108 -0
- data/source_texts/marathi.txt +100 -0
- data/source_texts/mingo.txt +146 -0
- data/source_texts/nepali.txt +131 -0
- data/source_texts/norwegian.txt +157 -0
- data/source_texts/polish.txt +91 -0
- data/source_texts/portuguese.txt +88 -0
- data/source_texts/quechua.txt +108 -0
- data/source_texts/romanian.txt +103 -0
- data/source_texts/rumantsch.txt +110 -0
- data/source_texts/russian.txt +199 -0
- data/source_texts/sanskrit.txt +135 -0
- data/source_texts/scots_gaelic.txt +93 -0
- data/source_texts/serbian-ascii.txt +121 -0
- data/source_texts/slovak-ascii.txt +102 -0
- data/source_texts/slovenian-ascii.txt +100 -0
- data/source_texts/spanish.txt +834 -0
- data/source_texts/swahili.txt +120 -0
- data/source_texts/swedish.txt +75 -0
- data/source_texts/tagalog.txt +135 -0
- data/source_texts/tamil.txt +167 -0
- data/source_texts/thai.txt +86 -0
- data/source_texts/turkish.txt +117 -0
- data/source_texts/ukrainian-koi8_u.txt +214 -0
- data/source_texts/vietnamese.txt +92 -0
- data/source_texts/welsh.txt +148 -0
- data/source_texts/yiddish-utf.txt +83 -0
- data/test/classifier_test.rb +29 -0
- data/test/fixtures/source_texts/danish.txt +219 -0
- data/test/fixtures/source_texts/english.txt +35 -0
- data/test/fixtures/source_texts/french.txt +89 -0
- data/test/fixtures/source_texts/german.txt +137 -0
- data/test/fixtures/source_texts/spanish.txt +834 -0
- data/test/generator_test.rb +72 -0
- data/test/helper.rb +22 -0
- data/test/loader_test.rb +31 -0
- data/test/scylla_test.rb +20 -0
- metadata +173 -0
@@ -0,0 +1,72 @@
|
|
1
|
+
require 'test/helper'
|
2
|
+
|
3
|
+
# Tests for Scylla::Generator: n-gram extraction from a string, and writing
# .lm files for the fixture source texts.
class GeneratorTest < Test::Unit::TestCase
  # Rows at indexes 0..MAX_COMPARED_INDEX of a written .lm file are compared
  # against the in-memory frequency list; later rows are not checked.
  MAX_COMPARED_INDEX = 400

  context "create_lm ngrams" do
    setup do
      @text = "hello"
      @ngram = ["_", "l", "lo_", "ello", "lo", "o", "llo", "hel", "o_", "ell", "e", "ello_",
                "_he", "el", "hello", "hell", "he", "_hel", "h", "_hell", "llo_", "_h", "ll"]
      @ngram_frequencies = [["_", 2], ["l", 2], ["lo_", 1], ["ello", 1], ["lo", 1], ["o", 1],
                            ["llo", 1], ["hel", 1], ["o_", 1], ["ell", 1], ["e", 1], ["ello_", 1], ["_he", 1],
                            ["el", 1], ["hello", 1], ["hell", 1], ["he", 1], ["_hel", 1], ["h", 1], ["_hell", 1],
                            ["llo_", 1], ["_h", 1], ["ll", 1]]
    end

    should "create an array of ngrams for a given text input" do
      sg = Scylla::Generator.new
      # Checks membership only: each returned n-gram must be an expected one.
      sg.create_lm(@text).each do |res|
        assert @ngram.include?(res)
      end
    end

    should "create an array of ngrams with their associated frequencies for a given text input" do
      sg = Scylla::Generator.new
      # With the second argument set to true the results are compared against
      # [ngram, count] pairs.
      sg.create_lm(@text, true).each do |res|
        assert @ngram_frequencies.include?(res)
      end
    end
  end

  context "create .lm files out of text files" do
    setup do
      @engtext = 'test/fixtures/source_texts/english.txt'
      @englm = 'test/fixtures/lms/english.lm'
      @sg = Scylla::Generator.new('test/fixtures/source_texts', 'test/fixtures/lms')
      # Reference frequency list computed in memory from the English fixture.
      @map = @sg.create_lm(File.read(@engtext), true)
    end

    should "create lm file out of text file" do
      @sg.write_lm(@engtext)
      assert_lm_counts_match(@englm, @map)
    end

    should "create .lm files in bulk" do
      @sg.train
      # The glob is relative to the current working directory.
      languages = Dir.glob("**/*.lm")
      assert_equal 8, languages.size
      assert_lm_counts_match(@englm, @map)
    end
  end

  private

  # Asserts that the count in the last tab-separated column of each row of the
  # .lm file at +lm_path+ equals the count stored at the same index of
  # +expected+ (an array whose entries hold the count at position 1).
  # Only the first MAX_COMPARED_INDEX + 1 rows are compared.
  def assert_lm_counts_match(lm_path, expected)
    File.readlines(lm_path).each_with_index do |line, index|
      break if index > MAX_COMPARED_INDEX

      count = line.split("\t").last.strip.to_i
      assert_equal count, expected[index][1]
    end
  end
end
|
data/test/helper.rb
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
# Shared test bootstrap: activates the bundle, loads the test libraries and
# puts both lib/ and test/ on the load path before requiring the gem.
require 'rubygems'
require 'bundler'

begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => error
  # Report the Bundler failure with a hint, then exit with Bundler's own
  # status code.
  $stderr.puts error.message, "Run `bundle install` to install missing gems"
  exit error.status_code
end

require 'test/unit'
require 'shoulda'

# Resulting load path order: test/ first, then lib/, then the existing entries.
test_dir = File.dirname(__FILE__)
$LOAD_PATH.unshift(test_dir, File.join(test_dir, '..', 'lib'))

require 'scylla'
require 'mocha'

# Reopened with an empty body; nothing is added to the test base class here.
class Test::Unit::TestCase
end
|
data/test/loader_test.rb
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
require 'test/helper'
|
2
|
+
|
3
|
+
# Tests for Scylla::Loader: reading language maps and mapping each n-gram of
# an .lm file to its rank.
class LoaderTest < Test::Unit::TestCase
  context "#languages" do
    setup do
      Scylla::Loader.clear
      # NOTE(review): this .lm file is not among the fixtures listed for the
      # gem; presumably it is produced by the generator tests — confirm.
      @englm = 'test/fixtures/lms/english.lm'
    end

    context "when being read" do
      # Declared with should_eventually, i.e. a deferred shoulda test.
      should_eventually "only load from disk once" do
        Scylla::Loader.expects(:load_language_maps).once.returns([])
        Scylla::Loader.languages
        Scylla::Loader.languages
        Scylla::Loader.unstub(:load_language_maps)
      end
    end

    should "load the correct map for a language" do
      map = Scylla::Loader.language_map(@englm)
      # The n-gram in the first tab-separated column of row N (1-based) is
      # expected to map to rank N.
      File.readlines(@englm).each_with_index do |line, index|
        key = line.split("\t").first
        assert_equal index + 1, map[key]
      end
    end
  end
end
|
data/test/scylla_test.rb
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
require 'test/helper'
|
2
|
+
|
3
|
+
# Tests for the String extensions exercised here: String#language and
# String#guess.
class ScyllaTest < Test::Unit::TestCase
  context "String methods" do
    setup do
      text = "Hello? Is there anybody in there?"
      @language = text.language
      @languages = text.guess
    end

    should "load language results for strings" do
      assert_not_nil @language
      assert_not_nil @languages
      # `assert String, @language.class` can never fail: assert treats its
      # first argument (a truthy class) as the tested value and the second as
      # the failure message. assert_kind_of performs the intended type check.
      assert_kind_of String, @language
      assert_kind_of Array, @languages
      assert_equal "english", @language
      assert_equal "english", @languages.first
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,173 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: scylla
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 27
|
5
|
+
prerelease:
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 1
|
9
|
+
- 0
|
10
|
+
version: 0.1.0
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- Ashwin Hegde
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2011-08-25 00:00:00 -07:00
|
19
|
+
default_executable:
|
20
|
+
dependencies:
|
21
|
+
- !ruby/object:Gem::Dependency
|
22
|
+
name: bundler
|
23
|
+
prerelease: false
|
24
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ~>
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
hash: 23
|
30
|
+
segments:
|
31
|
+
- 1
|
32
|
+
- 0
|
33
|
+
- 0
|
34
|
+
version: 1.0.0
|
35
|
+
type: :development
|
36
|
+
version_requirements: *id001
|
37
|
+
- !ruby/object:Gem::Dependency
|
38
|
+
name: jeweler
|
39
|
+
prerelease: false
|
40
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ~>
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
hash: 7
|
46
|
+
segments:
|
47
|
+
- 1
|
48
|
+
- 6
|
49
|
+
- 4
|
50
|
+
version: 1.6.4
|
51
|
+
type: :development
|
52
|
+
version_requirements: *id002
|
53
|
+
description: Allows for text categorization by guessing the language of a given text using n-grams
|
54
|
+
email: ahegde@zendesk.com
|
55
|
+
executables: []
|
56
|
+
|
57
|
+
extensions: []
|
58
|
+
|
59
|
+
extra_rdoc_files:
|
60
|
+
- LICENSE.txt
|
61
|
+
- README.rdoc
|
62
|
+
files:
|
63
|
+
- .document
|
64
|
+
- Gemfile
|
65
|
+
- Gemfile.lock
|
66
|
+
- LICENSE.txt
|
67
|
+
- README.rdoc
|
68
|
+
- Rakefile
|
69
|
+
- VERSION
|
70
|
+
- lib/scylla.rb
|
71
|
+
- lib/scylla/classifier.rb
|
72
|
+
- lib/scylla/generator.rb
|
73
|
+
- lib/scylla/loader.rb
|
74
|
+
- lib/scylla/string.rb
|
75
|
+
- lib/scylla/tasks.rb
|
76
|
+
- scylla.gemspec
|
77
|
+
- source_texts/13375P33K.txt
|
78
|
+
- source_texts/afrikaans.txt
|
79
|
+
- source_texts/arabic.txt
|
80
|
+
- source_texts/armenian.txt
|
81
|
+
- source_texts/bulgarian.txt
|
82
|
+
- source_texts/catalan.txt
|
83
|
+
- source_texts/chinese.txt
|
84
|
+
- source_texts/danish.txt
|
85
|
+
- source_texts/english.txt
|
86
|
+
- source_texts/esperanto.txt
|
87
|
+
- source_texts/finnish.txt
|
88
|
+
- source_texts/french.txt
|
89
|
+
- source_texts/german.txt
|
90
|
+
- source_texts/greek-iso8859-7.txt
|
91
|
+
- source_texts/hebrew.txt
|
92
|
+
- source_texts/hindi.txt
|
93
|
+
- source_texts/hungarian.txt
|
94
|
+
- source_texts/icelandic.txt
|
95
|
+
- source_texts/indonesian.txt
|
96
|
+
- source_texts/irish.txt
|
97
|
+
- source_texts/italian.txt
|
98
|
+
- source_texts/japanese.txt
|
99
|
+
- source_texts/korean.txt
|
100
|
+
- source_texts/latin.txt
|
101
|
+
- source_texts/malay.txt
|
102
|
+
- source_texts/marathi.txt
|
103
|
+
- source_texts/mingo.txt
|
104
|
+
- source_texts/nepali.txt
|
105
|
+
- source_texts/norwegian.txt
|
106
|
+
- source_texts/polish.txt
|
107
|
+
- source_texts/portuguese.txt
|
108
|
+
- source_texts/quechua.txt
|
109
|
+
- source_texts/romanian.txt
|
110
|
+
- source_texts/rumantsch.txt
|
111
|
+
- source_texts/russian.txt
|
112
|
+
- source_texts/sanskrit.txt
|
113
|
+
- source_texts/scots_gaelic.txt
|
114
|
+
- source_texts/serbian-ascii.txt
|
115
|
+
- source_texts/slovak-ascii.txt
|
116
|
+
- source_texts/slovenian-ascii.txt
|
117
|
+
- source_texts/spanish.txt
|
118
|
+
- source_texts/swahili.txt
|
119
|
+
- source_texts/swedish.txt
|
120
|
+
- source_texts/tagalog.txt
|
121
|
+
- source_texts/tamil.txt
|
122
|
+
- source_texts/thai.txt
|
123
|
+
- source_texts/turkish.txt
|
124
|
+
- source_texts/ukrainian-koi8_u.txt
|
125
|
+
- source_texts/vietnamese.txt
|
126
|
+
- source_texts/welsh.txt
|
127
|
+
- source_texts/yiddish-utf.txt
|
128
|
+
- test/classifier_test.rb
|
129
|
+
- test/fixtures/source_texts/danish.txt
|
130
|
+
- test/fixtures/source_texts/english.txt
|
131
|
+
- test/fixtures/source_texts/french.txt
|
132
|
+
- test/fixtures/source_texts/german.txt
|
133
|
+
- test/fixtures/source_texts/spanish.txt
|
134
|
+
- test/generator_test.rb
|
135
|
+
- test/helper.rb
|
136
|
+
- test/loader_test.rb
|
137
|
+
- test/scylla_test.rb
|
138
|
+
has_rdoc: true
|
139
|
+
homepage: http://github.com/hashwin/scylla
|
140
|
+
licenses:
|
141
|
+
- MIT
|
142
|
+
post_install_message:
|
143
|
+
rdoc_options: []
|
144
|
+
|
145
|
+
require_paths:
|
146
|
+
- lib
|
147
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
148
|
+
none: false
|
149
|
+
requirements:
|
150
|
+
- - ">="
|
151
|
+
- !ruby/object:Gem::Version
|
152
|
+
hash: 3
|
153
|
+
segments:
|
154
|
+
- 0
|
155
|
+
version: "0"
|
156
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
157
|
+
none: false
|
158
|
+
requirements:
|
159
|
+
- - ">="
|
160
|
+
- !ruby/object:Gem::Version
|
161
|
+
hash: 3
|
162
|
+
segments:
|
163
|
+
- 0
|
164
|
+
version: "0"
|
165
|
+
requirements: []
|
166
|
+
|
167
|
+
rubyforge_project:
|
168
|
+
rubygems_version: 1.6.2
|
169
|
+
signing_key:
|
170
|
+
specification_version: 3
|
171
|
+
summary: Ruby port of Textcat language guesser
|
172
|
+
test_files: []
|
173
|
+
|