scylla 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76):
  1. data/.document +5 -0
  2. data/Gemfile +17 -0
  3. data/Gemfile.lock +30 -0
  4. data/LICENSE.txt +20 -0
  5. data/README.rdoc +19 -0
  6. data/Rakefile +52 -0
  7. data/VERSION +1 -0
  8. data/lib/scylla/classifier.rb +65 -0
  9. data/lib/scylla/generator.rb +73 -0
  10. data/lib/scylla/loader.rb +37 -0
  11. data/lib/scylla/string.rb +11 -0
  12. data/lib/scylla/tasks.rb +20 -0
  13. data/lib/scylla.rb +10 -0
  14. data/scylla.gemspec +117 -0
  15. data/source_texts/13375P33K.txt +199 -0
  16. data/source_texts/afrikaans.txt +114 -0
  17. data/source_texts/arabic.txt +576 -0
  18. data/source_texts/armenian.txt +86 -0
  19. data/source_texts/bulgarian.txt +834 -0
  20. data/source_texts/catalan.txt +413 -0
  21. data/source_texts/chinese.txt +199 -0
  22. data/source_texts/danish.txt +219 -0
  23. data/source_texts/english.txt +35 -0
  24. data/source_texts/esperanto.txt +199 -0
  25. data/source_texts/finnish.txt +71 -0
  26. data/source_texts/french.txt +89 -0
  27. data/source_texts/german.txt +137 -0
  28. data/source_texts/greek-iso8859-7.txt +139 -0
  29. data/source_texts/hebrew.txt +199 -0
  30. data/source_texts/hindi.txt +199 -0
  31. data/source_texts/hungarian.txt +102 -0
  32. data/source_texts/icelandic.txt +131 -0
  33. data/source_texts/indonesian.txt +93 -0
  34. data/source_texts/irish.txt +209 -0
  35. data/source_texts/italian.txt +120 -0
  36. data/source_texts/japanese.txt +199 -0
  37. data/source_texts/korean.txt +134 -0
  38. data/source_texts/latin.txt +120 -0
  39. data/source_texts/malay.txt +108 -0
  40. data/source_texts/marathi.txt +100 -0
  41. data/source_texts/mingo.txt +146 -0
  42. data/source_texts/nepali.txt +131 -0
  43. data/source_texts/norwegian.txt +157 -0
  44. data/source_texts/polish.txt +91 -0
  45. data/source_texts/portuguese.txt +88 -0
  46. data/source_texts/quechua.txt +108 -0
  47. data/source_texts/romanian.txt +103 -0
  48. data/source_texts/rumantsch.txt +110 -0
  49. data/source_texts/russian.txt +199 -0
  50. data/source_texts/sanskrit.txt +135 -0
  51. data/source_texts/scots_gaelic.txt +93 -0
  52. data/source_texts/serbian-ascii.txt +121 -0
  53. data/source_texts/slovak-ascii.txt +102 -0
  54. data/source_texts/slovenian-ascii.txt +100 -0
  55. data/source_texts/spanish.txt +834 -0
  56. data/source_texts/swahili.txt +120 -0
  57. data/source_texts/swedish.txt +75 -0
  58. data/source_texts/tagalog.txt +135 -0
  59. data/source_texts/tamil.txt +167 -0
  60. data/source_texts/thai.txt +86 -0
  61. data/source_texts/turkish.txt +117 -0
  62. data/source_texts/ukrainian-koi8_u.txt +214 -0
  63. data/source_texts/vietnamese.txt +92 -0
  64. data/source_texts/welsh.txt +148 -0
  65. data/source_texts/yiddish-utf.txt +83 -0
  66. data/test/classifier_test.rb +29 -0
  67. data/test/fixtures/source_texts/danish.txt +219 -0
  68. data/test/fixtures/source_texts/english.txt +35 -0
  69. data/test/fixtures/source_texts/french.txt +89 -0
  70. data/test/fixtures/source_texts/german.txt +137 -0
  71. data/test/fixtures/source_texts/spanish.txt +834 -0
  72. data/test/generator_test.rb +72 -0
  73. data/test/helper.rb +22 -0
  74. data/test/loader_test.rb +31 -0
  75. data/test/scylla_test.rb +20 -0
  76. metadata +173 -0
@@ -0,0 +1,72 @@
1
+ require 'test/helper'
2
+
3
# Tests for Scylla::Generator.
#
# The first context checks the n-grams (optionally paired with frequencies)
# that +create_lm+ produces for a short string. The second context checks
# that +write_lm+ and +train+ write .lm files whose per-line frequencies
# agree with the map +create_lm+ builds in memory from the same source text.
class GeneratorTest < Test::Unit::TestCase
  # Highest zero-based line index of the .lm file that is compared against
  # the in-memory map (i.e. the first 401 lines are checked).
  MAX_COMPARED_INDEX = 400

  context "create_lm ngrams" do
    setup do
      @text = "hello"
      @ngram = ["_", "l", "lo_", "ello", "lo", "o", "llo", "hel", "o_", "ell", "e", "ello_",
                "_he", "el", "hello", "hell", "he", "_hel", "h", "_hell", "llo_", "_h", "ll"]
      @ngram_frequencies = [["_", 2], ["l", 2], ["lo_", 1], ["ello", 1], ["lo", 1], ["o", 1],
                            ["llo", 1], ["hel", 1], ["o_", 1], ["ell", 1], ["e", 1], ["ello_", 1], ["_he", 1],
                            ["el", 1], ["hello", 1], ["hell", 1], ["he", 1], ["_hel", 1], ["h", 1], ["_hell", 1],
                            ["llo_", 1], ["_h", 1], ["ll", 1]]
    end

    should "create an array of ngrams for a given text input" do
      sg = Scylla::Generator.new
      ngram_result = sg.create_lm(@text)
      # Guard against a vacuous pass: the membership loop below asserts
      # nothing at all when the result is empty.
      assert !ngram_result.empty?, "create_lm returned no ngrams"
      ngram_result.each do |res|
        assert @ngram.include?(res)
      end
    end

    should "create an array of ngrams with their associated frequencies for a given text input" do
      sg = Scylla::Generator.new
      ngram_result = sg.create_lm(@text, true)
      # Same guard as above for the [ngram, frequency] form.
      assert !ngram_result.empty?, "create_lm returned no ngram frequencies"
      ngram_result.each do |res|
        assert @ngram_frequencies.include?(res)
      end
    end
  end

  context "create .lm files out of text files" do
    setup do
      @engtext = 'test/fixtures/source_texts/english.txt'
      @englm = 'test/fixtures/lms/english.lm'
      @sg = Scylla::Generator.new('test/fixtures/source_texts', 'test/fixtures/lms')
      # Reference map built in memory from the same text the generator
      # is asked to write to disk in the tests below.
      @map = @sg.create_lm(File.read(@engtext), true)
    end

    should "create lm file out of text file" do
      @sg.write_lm(@engtext)
      assert_lm_file_matches_map
    end

    should "create .lm files in bulk" do
      @sg.train
      # The glob is relative to the current working directory.
      assert_equal 8, Dir.glob("**/*.lm").size
      assert_lm_file_matches_map
    end
  end

  private

  # Compares the leading lines of the English .lm file with @map: the last
  # tab-separated field of line i, read as an integer, must equal @map[i][1].
  def assert_lm_file_matches_map
    File.readlines(@englm).first(MAX_COMPARED_INDEX + 1).each_with_index do |line, index|
      frequency = line.split("\t").last.strip.to_i
      assert_equal frequency, @map[index][1]
    end
  end
end
data/test/helper.rb ADDED
@@ -0,0 +1,22 @@
1
# Shared bootstrap required by every test file: sets up Bundler, loads the
# test libraries, and puts the gem's lib/ directory and this test directory
# on the load path before requiring the library under test.
require 'rubygems'
require 'bundler'

# Set up the :default and :development Bundler groups. If Bundler raises,
# print its message plus a hint and exit with the status code it supplies.
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => bundler_error
  $stderr.puts bundler_error.message, "Run `bundle install` to install missing gems"
  exit bundler_error.status_code
end

require 'test/unit'
require 'shoulda'

# lib/ is unshifted first and the test directory second, so the test
# directory ends up at the very front of the load path.
test_dir = File.dirname(__FILE__)
$LOAD_PATH.unshift(File.join(test_dir, '..', 'lib'))
$LOAD_PATH.unshift(test_dir)

require 'scylla'
require 'mocha'

# Reopened with an empty body; no suite-wide helpers are defined here.
class Test::Unit::TestCase
end
@@ -0,0 +1,31 @@
1
+ require 'test/helper'
2
+
3
# Tests for Scylla::Loader: reading of the language list and the map that
# +language_map+ builds from a single .lm file.
class LoaderTest < Test::Unit::TestCase
  context "#languages" do
    setup do
      Scylla::Loader.clear
      @englm = 'test/fixtures/lms/english.lm'
    end

    context "when being read" do
      # Deferred via should_eventually.
      should_eventually "only load from disk once" do
        Scylla::Loader.expects(:load_language_maps).once.returns([])
        Scylla::Loader.languages
        Scylla::Loader.languages
        Scylla::Loader.unstub(:load_language_maps)
      end
    end

    should "load the correct map for a language" do
      map = Scylla::Loader.language_map(@englm)
      # The first tab-separated field of each line is the key; the map is
      # expected to hold that key's 1-based line position in the file.
      File.readlines(@englm).each_with_index do |line, index|
        key = line.split("\t").first
        assert_equal index + 1, map[key]
      end
    end
  end
end
+ end
@@ -0,0 +1,20 @@
1
+ require 'test/helper'
2
+
3
# Tests the String#language and String#guess methods called on a plain
# English sentence.
class ScyllaTest < Test::Unit::TestCase
  context "String methods" do
    setup do
      text = "Hello? Is there anybody in there?"
      @language = text.language
      @languages = text.guess
    end

    should "load language results for strings" do
      assert_not_nil @language
      assert_not_nil @languages
      # These were previously `assert String, @language.class` and
      # `assert Array, @languages.class`, which always pass: assert only
      # checks that its first argument is truthy and takes the second
      # as the failure message.
      assert_kind_of String, @language
      assert_kind_of Array, @languages
      assert_equal "english", @language
      assert_equal "english", @languages.first
    end
  end
end
metadata ADDED
@@ -0,0 +1,173 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: scylla
3
+ version: !ruby/object:Gem::Version
4
+ hash: 27
5
+ prerelease:
6
+ segments:
7
+ - 0
8
+ - 1
9
+ - 0
10
+ version: 0.1.0
11
+ platform: ruby
12
+ authors:
13
+ - Ashwin Hegde
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2011-08-25 00:00:00 -07:00
19
+ default_executable:
20
+ dependencies:
21
+ - !ruby/object:Gem::Dependency
22
+ name: bundler
23
+ prerelease: false
24
+ requirement: &id001 !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ~>
28
+ - !ruby/object:Gem::Version
29
+ hash: 23
30
+ segments:
31
+ - 1
32
+ - 0
33
+ - 0
34
+ version: 1.0.0
35
+ type: :development
36
+ version_requirements: *id001
37
+ - !ruby/object:Gem::Dependency
38
+ name: jeweler
39
+ prerelease: false
40
+ requirement: &id002 !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ~>
44
+ - !ruby/object:Gem::Version
45
+ hash: 7
46
+ segments:
47
+ - 1
48
+ - 6
49
+ - 4
50
+ version: 1.6.4
51
+ type: :development
52
+ version_requirements: *id002
53
+ description: Allows for text categorization by guessing the language of a given text using n-grams
54
+ email: ahegde@zendesk.com
55
+ executables: []
56
+
57
+ extensions: []
58
+
59
+ extra_rdoc_files:
60
+ - LICENSE.txt
61
+ - README.rdoc
62
+ files:
63
+ - .document
64
+ - Gemfile
65
+ - Gemfile.lock
66
+ - LICENSE.txt
67
+ - README.rdoc
68
+ - Rakefile
69
+ - VERSION
70
+ - lib/scylla.rb
71
+ - lib/scylla/classifier.rb
72
+ - lib/scylla/generator.rb
73
+ - lib/scylla/loader.rb
74
+ - lib/scylla/string.rb
75
+ - lib/scylla/tasks.rb
76
+ - scylla.gemspec
77
+ - source_texts/13375P33K.txt
78
+ - source_texts/afrikaans.txt
79
+ - source_texts/arabic.txt
80
+ - source_texts/armenian.txt
81
+ - source_texts/bulgarian.txt
82
+ - source_texts/catalan.txt
83
+ - source_texts/chinese.txt
84
+ - source_texts/danish.txt
85
+ - source_texts/english.txt
86
+ - source_texts/esperanto.txt
87
+ - source_texts/finnish.txt
88
+ - source_texts/french.txt
89
+ - source_texts/german.txt
90
+ - source_texts/greek-iso8859-7.txt
91
+ - source_texts/hebrew.txt
92
+ - source_texts/hindi.txt
93
+ - source_texts/hungarian.txt
94
+ - source_texts/icelandic.txt
95
+ - source_texts/indonesian.txt
96
+ - source_texts/irish.txt
97
+ - source_texts/italian.txt
98
+ - source_texts/japanese.txt
99
+ - source_texts/korean.txt
100
+ - source_texts/latin.txt
101
+ - source_texts/malay.txt
102
+ - source_texts/marathi.txt
103
+ - source_texts/mingo.txt
104
+ - source_texts/nepali.txt
105
+ - source_texts/norwegian.txt
106
+ - source_texts/polish.txt
107
+ - source_texts/portuguese.txt
108
+ - source_texts/quechua.txt
109
+ - source_texts/romanian.txt
110
+ - source_texts/rumantsch.txt
111
+ - source_texts/russian.txt
112
+ - source_texts/sanskrit.txt
113
+ - source_texts/scots_gaelic.txt
114
+ - source_texts/serbian-ascii.txt
115
+ - source_texts/slovak-ascii.txt
116
+ - source_texts/slovenian-ascii.txt
117
+ - source_texts/spanish.txt
118
+ - source_texts/swahili.txt
119
+ - source_texts/swedish.txt
120
+ - source_texts/tagalog.txt
121
+ - source_texts/tamil.txt
122
+ - source_texts/thai.txt
123
+ - source_texts/turkish.txt
124
+ - source_texts/ukrainian-koi8_u.txt
125
+ - source_texts/vietnamese.txt
126
+ - source_texts/welsh.txt
127
+ - source_texts/yiddish-utf.txt
128
+ - test/classifier_test.rb
129
+ - test/fixtures/source_texts/danish.txt
130
+ - test/fixtures/source_texts/english.txt
131
+ - test/fixtures/source_texts/french.txt
132
+ - test/fixtures/source_texts/german.txt
133
+ - test/fixtures/source_texts/spanish.txt
134
+ - test/generator_test.rb
135
+ - test/helper.rb
136
+ - test/loader_test.rb
137
+ - test/scylla_test.rb
138
+ has_rdoc: true
139
+ homepage: http://github.com/hashwin/scylla
140
+ licenses:
141
+ - MIT
142
+ post_install_message:
143
+ rdoc_options: []
144
+
145
+ require_paths:
146
+ - lib
147
+ required_ruby_version: !ruby/object:Gem::Requirement
148
+ none: false
149
+ requirements:
150
+ - - ">="
151
+ - !ruby/object:Gem::Version
152
+ hash: 3
153
+ segments:
154
+ - 0
155
+ version: "0"
156
+ required_rubygems_version: !ruby/object:Gem::Requirement
157
+ none: false
158
+ requirements:
159
+ - - ">="
160
+ - !ruby/object:Gem::Version
161
+ hash: 3
162
+ segments:
163
+ - 0
164
+ version: "0"
165
+ requirements: []
166
+
167
+ rubyforge_project:
168
+ rubygems_version: 1.6.2
169
+ signing_key:
170
+ specification_version: 3
171
+ summary: Ruby port of Textcat language guesser
172
+ test_files: []
173
+