scylla 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (76) hide show
  1. data/.document +5 -0
  2. data/Gemfile +17 -0
  3. data/Gemfile.lock +30 -0
  4. data/LICENSE.txt +20 -0
  5. data/README.rdoc +19 -0
  6. data/Rakefile +52 -0
  7. data/VERSION +1 -0
  8. data/lib/scylla/classifier.rb +65 -0
  9. data/lib/scylla/generator.rb +73 -0
  10. data/lib/scylla/loader.rb +37 -0
  11. data/lib/scylla/string.rb +11 -0
  12. data/lib/scylla/tasks.rb +20 -0
  13. data/lib/scylla.rb +10 -0
  14. data/scylla.gemspec +117 -0
  15. data/source_texts/13375P33K.txt +199 -0
  16. data/source_texts/afrikaans.txt +114 -0
  17. data/source_texts/arabic.txt +576 -0
  18. data/source_texts/armenian.txt +86 -0
  19. data/source_texts/bulgarian.txt +834 -0
  20. data/source_texts/catalan.txt +413 -0
  21. data/source_texts/chinese.txt +199 -0
  22. data/source_texts/danish.txt +219 -0
  23. data/source_texts/english.txt +35 -0
  24. data/source_texts/esperanto.txt +199 -0
  25. data/source_texts/finnish.txt +71 -0
  26. data/source_texts/french.txt +89 -0
  27. data/source_texts/german.txt +137 -0
  28. data/source_texts/greek-iso8859-7.txt +139 -0
  29. data/source_texts/hebrew.txt +199 -0
  30. data/source_texts/hindi.txt +199 -0
  31. data/source_texts/hungarian.txt +102 -0
  32. data/source_texts/icelandic.txt +131 -0
  33. data/source_texts/indonesian.txt +93 -0
  34. data/source_texts/irish.txt +209 -0
  35. data/source_texts/italian.txt +120 -0
  36. data/source_texts/japanese.txt +199 -0
  37. data/source_texts/korean.txt +134 -0
  38. data/source_texts/latin.txt +120 -0
  39. data/source_texts/malay.txt +108 -0
  40. data/source_texts/marathi.txt +100 -0
  41. data/source_texts/mingo.txt +146 -0
  42. data/source_texts/nepali.txt +131 -0
  43. data/source_texts/norwegian.txt +157 -0
  44. data/source_texts/polish.txt +91 -0
  45. data/source_texts/portuguese.txt +88 -0
  46. data/source_texts/quechua.txt +108 -0
  47. data/source_texts/romanian.txt +103 -0
  48. data/source_texts/rumantsch.txt +110 -0
  49. data/source_texts/russian.txt +199 -0
  50. data/source_texts/sanskrit.txt +135 -0
  51. data/source_texts/scots_gaelic.txt +93 -0
  52. data/source_texts/serbian-ascii.txt +121 -0
  53. data/source_texts/slovak-ascii.txt +102 -0
  54. data/source_texts/slovenian-ascii.txt +100 -0
  55. data/source_texts/spanish.txt +834 -0
  56. data/source_texts/swahili.txt +120 -0
  57. data/source_texts/swedish.txt +75 -0
  58. data/source_texts/tagalog.txt +135 -0
  59. data/source_texts/tamil.txt +167 -0
  60. data/source_texts/thai.txt +86 -0
  61. data/source_texts/turkish.txt +117 -0
  62. data/source_texts/ukrainian-koi8_u.txt +214 -0
  63. data/source_texts/vietnamese.txt +92 -0
  64. data/source_texts/welsh.txt +148 -0
  65. data/source_texts/yiddish-utf.txt +83 -0
  66. data/test/classifier_test.rb +29 -0
  67. data/test/fixtures/source_texts/danish.txt +219 -0
  68. data/test/fixtures/source_texts/english.txt +35 -0
  69. data/test/fixtures/source_texts/french.txt +89 -0
  70. data/test/fixtures/source_texts/german.txt +137 -0
  71. data/test/fixtures/source_texts/spanish.txt +834 -0
  72. data/test/generator_test.rb +72 -0
  73. data/test/helper.rb +22 -0
  74. data/test/loader_test.rb +31 -0
  75. data/test/scylla_test.rb +20 -0
  76. metadata +173 -0
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ lib/**/*.rb
2
+ bin/*
3
+ -
4
+ features/**/*.feature
5
+ LICENSE.txt
data/Gemfile ADDED
@@ -0,0 +1,17 @@
# Fetch gems over HTTPS so downloads cannot be read or altered in transit.
source "https://rubygems.org"
# Add dependencies required to use your gem here.
# Example:
#   gem "activesupport", ">= 2.3.5"

# Add dependencies to develop your gem here.
# Include everything needed to run rake, tests, features, etc.
group :development do
  gem "bundler", "~> 1.0.0"
  gem "jeweler", "~> 1.6.4"
end

# Gems that are only needed while running the test suite.
group :test do
  gem "shoulda", ">= 0"
  gem "mocha", "~> 0.9.12", :require => nil
  gem "ruby-debug"
end
data/Gemfile.lock ADDED
@@ -0,0 +1,30 @@
1
+ GEM
2
+ remote: http://rubygems.org/
3
+ specs:
4
+ columnize (0.3.4)
5
+ git (1.2.5)
6
+ jeweler (1.6.4)
7
+ bundler (~> 1.0)
8
+ git (>= 1.2.5)
9
+ rake
10
+ linecache (0.46)
11
+ rbx-require-relative (> 0.0.4)
12
+ mocha (0.9.12)
13
+ rake (0.9.2)
14
+ rbx-require-relative (0.0.5)
15
+ ruby-debug (0.10.4)
16
+ columnize (>= 0.1)
17
+ ruby-debug-base (~> 0.10.4.0)
18
+ ruby-debug-base (0.10.4)
19
+ linecache (>= 0.3)
20
+ shoulda (2.11.3)
21
+
22
+ PLATFORMS
23
+ ruby
24
+
25
+ DEPENDENCIES
26
+ bundler (~> 1.0.0)
27
+ jeweler (~> 1.6.4)
28
+ mocha (~> 0.9.12)
29
+ ruby-debug
30
+ shoulda
data/LICENSE.txt ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2011 Ashwin Hegde
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,19 @@
1
+ = scylla
2
+
3
+ Scylla is a language categorizing gem that allows you to guess the language of a given text. Scylla is a Ruby port of TextCat (http://www.let.rug.nl/~vannoord/TextCat) and is based on the text categorization algorithm presented in Cavnar, W. B. and J. M. Trenkle, ``N-Gram-Based Text Categorization'' In Proceedings of Third Annual Symposium on Document Analysis and Information Retrieval, Las Vegas, NV, UNLV Publications/Reprographics, pp. 161-175, 11-13 April 1994.
4
+
5
+ == Contributing to scylla
6
+
7
+ * Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet
8
+ * Check out the issue tracker to make sure someone hasn't already requested it and/or contributed it
9
+ * Fork the project
10
+ * Start a feature/bugfix branch
11
+ * Commit and push until you are happy with your contribution
12
+ * Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
13
+ * Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or it is otherwise necessary, that is fine, but please isolate the change to its own commit so I can cherry-pick around it.
14
+
15
+ == Copyright
16
+
17
+ Copyright (c) 2011 Ashwin Hegde. See LICENSE.txt for
18
+ further details.
19
+
data/Rakefile ADDED
@@ -0,0 +1,52 @@
# encoding: utf-8

# Put lib/ on the load path so the gem and its rake tasks can be required below.
lib_path = File.expand_path(File.join(File.dirname(__FILE__), "lib"))
$LOAD_PATH.unshift(lib_path)

require 'rubygems'
require 'bundler'
require 'scylla'
require 'scylla/tasks'

# Activate the default and development gem groups; exit with Bundler's own
# status code when gems are missing.
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => error
  $stderr.puts error.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit error.status_code
end
require 'rake'

# Gem packaging tasks; the specification fields are filled in here.
require 'jeweler'
Jeweler::Tasks.new do |spec|
  # spec is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
  spec.name = "scylla"
  spec.homepage = "http://github.com/hashwin/scylla"
  spec.license = "MIT"
  spec.summary = "Ruby port of Textcat language guesser"
  spec.description = "Allows for text categorization by guessing the language of a given text using n-grams"
  spec.email = "ahegde@zendesk.com"
  spec.authors = ["Ashwin Hegde"]
  # dependencies defined in Gemfile
end
Jeweler::RubygemsDotOrgTasks.new

# `rake test` runs every test/**/*_test.rb file.
require 'rake/testtask'
Rake::TestTask.new(:test) do |runner|
  runner.libs << 'lib' << 'test'
  runner.pattern = 'test/**/*_test.rb'
  runner.verbose = true
end

task :default => :test

# RDoc generation; the title carries the contents of VERSION when that file exists.
require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  version =
    if File.exist?('VERSION')
      File.read('VERSION')
    else
      ""
    end

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "scylla #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end

# Registers the scylla:train task.
Scylla::Tasks.new
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.0
@@ -0,0 +1,65 @@
module Scylla
  # Guesses the language of a text by comparing the ranking of its n-grams
  # with the ranked language maps returned by Scylla::Loader.languages.
  class Classifier
    attr_accessor :limit, :dir, :ngrams, :threshold, :input

    # limit    : Up to how many matching language results should be returned
    #            (nil disables the cap)
    # ngrams   : The total number of ngrams that are stored for each language;
    #            also used as the penalty for an ngram a language does not have
    # threshold: Languages scoring below threshold * best score count as matches
    def initialize(limit = 10, ngrams = 400, threshold = 1.05)
      @limit = limit
      @ngrams = ngrams
      @threshold = threshold
    end

    # Classifies a string to a list of languages in order of best match
    def classify_string(text)
      @input = join_lines(text.each_line)
      classify
    end

    # Classifies a file to a list of languages in order of best match
    def classify_file(path)
      @input = join_lines(File.readlines(path))
      classify
    end

    # Classifies @input to a list of languages in order of best match.
    # Returns at most `limit` language names, or nil (after printing a warning
    # to stderr) when no language maps are available.
    def classify
      languages = Scylla::Loader.languages
      if languages.empty?
        warn "No languages (.lm files) found. Please run rake scylla:train after placing your training texts in the source_texts directory."
        return
      end

      unknown = Scylla::Generator.new.create_lm(@input)
      ranked = languages.map { |name, ngram| [name, get_score(unknown, ngram)] }
      ranked = ranked.sort { |a, b| a[1] <=> b[1] }

      # The best language always qualifies; the others must score below
      # threshold * best score (lower scores are better).
      cutoff = @threshold * ranked.first[1]
      answers = [ranked.shift[0]]
      while !ranked.empty? && ranked.first[1] < cutoff
        answers << ranked.shift[0]
      end
      @limit ? answers.first(@limit) : answers
    end

    # Gets the score of the text in question compared to a particular language.
    # unknown is the text's ngrams in ranked order; ngram maps a language's
    # ngrams to their rank. Each ngram adds the distance between the two ranks,
    # or the fixed penalty @ngrams when the language does not contain it.
    def get_score(unknown, ngram)
      score = 0
      unknown.each_with_index do |gram, position|
        rank = ngram[gram]
        score += rank ? (rank - position).abs : @ngrams
      end
      score
    end

    private

    # Strips every line and joins the non-empty ones with a single space, so
    # words on either side of a line break are not fused into one word.
    def join_lines(lines)
      lines.map { |line| line.strip }.reject { |line| line.empty? }.join(" ")
    end
  end
end
@@ -0,0 +1,73 @@
module Scylla
  # Builds ranked n-gram language maps from text and writes them as .lm files.
  class Generator
    attr_accessor :dirtext, :dirlm, :minsize, :silent

    # Longest ngram (in characters) that is counted for each word.
    MAX_NGRAM_LENGTH = 5
    # Number of top-ranked ngrams written to each .lm file.
    MAX_STORED_NGRAMS = 400

    # dirtext: The location of the source training text files
    # dirlm  : The directory the .lm language maps are written to
    # minsize: ngrams whose size is less than or equal to this are discarded
    # silent : When true, write_lm does not print a progress message
    def initialize(dirtext = DEFAULT_SOURCE_DIR, dirlm = DEFAULT_TARGET_DIR, minsize = 0, silent = false)
      @dirtext = dirtext
      @dirlm = dirlm
      @minsize = minsize
      @silent = silent
    end

    # This loads all the .txt files in the specified source training text folder
    # and creates language maps using ngram frequencies. The maps are stored in
    # dirlm as .lm files.
    #
    # NOTE(review): the cleanup glob is relative to the current working
    # directory and recursive, so every .lm file below it is deleted, not only
    # those in dirlm. Kept as is because Scylla::Loader discovers maps with the
    # same pattern; confirm before narrowing it.
    def train
      stale_maps = Dir.glob("**/*.lm")
      textpaths = Dir.glob(@dirtext + "/*.txt")
      stale_maps.each { |map| File.delete(map) }
      textpaths.each { |path| write_lm(path) }
    end

    # Reads a single text file specified by a path and writes the
    # MAX_STORED_NGRAMS most frequent ngrams to <dirlm>/<basename>.lm, one
    # "ngram<TAB>count" pair per line.
    def write_lm(path)
      text = File.read(path)
      puts "Creating language map for " + path unless @silent
      lm = create_lm(text, true)
      # Create the target directory on first use; opening the file for writing
      # fails when it is missing.
      Dir.mkdir(@dirlm) unless File.directory?(@dirlm)
      lmname = File.join(@dirlm, File.basename(path, ".txt") + ".lm")
      # Mode 'w' truncates an existing map, so no explicit delete is needed.
      File.open(lmname, 'w') do |f|
        lm.first(MAX_STORED_NGRAMS).each do |gram, count|
          f.write(gram + "\t" + count.to_s + "\n")
        end
      end
    end

    # This creates a language map for a given input string.
    # The frequencies boolean specifies whether or not the method should
    # return the frequencies of the ngrams ([ngram, count] pairs), or simply
    # an array of ngrams in sorted order (most frequent first).
    def create_lm(input, frequencies = false)
      # Join the stripped, non-empty lines with a space so that words on
      # either side of a line break stay separate words.
      text = input.each_line.map { |line| line.strip }.reject { |line| line.empty? }.join(" ")

      counts = Hash.new(0)
      text.split(/[0-9\s]/).each do |word|
        token = "_" + word + "_" # underscores mark the word boundaries
        (0...token.size).each do |start|
          longest = [MAX_NGRAM_LENGTH, token.size - start].min
          (1..longest).each { |length| counts[token[start, length]] += 1 }
        end
      end

      counts.delete_if { |gram, _| gram.size <= @minsize }
      ranked = counts.sort { |a, b| b[1] <=> a[1] }
      return ranked if frequencies
      ranked.map { |gram, _| gram }
    end
  end
end
@@ -0,0 +1,37 @@
module Scylla
  # Reads .lm language map files and keeps the parsed maps in memory.
  class Loader
    # Loads all the language maps from the .lm files found anywhere below the
    # current working directory. Returns a hash of language name (the file's
    # basename) => language map. Paths are sorted so that the result does not
    # depend on the order in which the filesystem lists the files.
    def self.load_language_maps
      languages = {}
      Dir.glob("**/*.lm").sort.each do |filepath|
        languages[File.basename(filepath, ".lm")] = language_map(filepath)
      end
      languages
    end

    # Returns a single language map from a specified .lm file: a hash of
    # ngram => rank, where the rank is the 1-based position of the ngram among
    # the accepted lines. Only the text before the first tab of a line is
    # used; blank lines and entries starting with a digit are skipped.
    def self.language_map(path)
      rank = 1
      ngram = {}
      File.readlines(path).each do |line|
        gram = line.strip.split("\t").first
        next unless gram && gram =~ /^[^0-9\s]+/
        ngram[gram] = rank
        rank += 1
      end
      ngram
    end

    # Loads all maps from the .lm files, or returns them from memory if the
    # files have already been read and loaded. An empty result is not kept, so
    # maps that are created later in the same process are picked up.
    def self.languages
      @languages = load_language_maps if @languages.nil? || @languages.empty?
      @languages
    end

    # Forgets the loaded maps; the next call to languages reads the files again.
    def self.clear
      @languages = nil
    end
  end
end
@@ -0,0 +1,11 @@
# Convenience methods that run a default Scylla::Classifier on the string.
class String
  # Returns what Scylla::Classifier#classify_string returns for this string:
  # the matching language names, best match first.
  def guess
    Scylla::Classifier.new.classify_string(self)
  end

  # Returns the best matching language name, or nil when the classifier
  # produced no result (Array() turns a nil result into an empty list).
  def language
    Array(guess).first
  end
end
@@ -0,0 +1,20 @@
require 'rake'
require 'rake/tasklib'

module Scylla
  # Rake task library; Scylla::Tasks.new registers the scylla:train task.
  class Tasks < ::Rake::TaskLib
    def initialize
      define_training_task
    end

    # Defines scylla:train, which runs Scylla::Generator#train with the
    # generator's default settings.
    def define_training_task
      namespace :scylla do
        desc "Trains Scylla in new languages"
        task :train do
          # Loaded on demand so a Rakefile that only requires 'scylla/tasks'
          # can still run the task.
          require 'scylla' unless defined?(Scylla::Generator)
          Scylla::Generator.new.train
        end
      end
    end
  end
end
data/lib/scylla.rb ADDED
@@ -0,0 +1,10 @@
1
+ # encoding: utf-8
module Scylla
  # Directory that contains this file; both defaults are built from it.
  lib_root = File.dirname(__FILE__)

  # Default directory of the training texts (Scylla::Generator's dirtext).
  DEFAULT_SOURCE_DIR = File.join(lib_root, "..", "source_texts")
  # Default directory for the generated .lm files (Scylla::Generator's dirlm).
  DEFAULT_TARGET_DIR = File.join(lib_root, "scylla", "lms")
end
6
+
7
+ require 'scylla/classifier'
8
+ require 'scylla/generator'
9
+ require 'scylla/loader'
10
+ require 'scylla/string'
data/scylla.gemspec ADDED
@@ -0,0 +1,117 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = %q{scylla}
8
+ s.version = "0.1.0"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["Ashwin Hegde"]
12
+ s.date = %q{2011-08-25}
13
+ s.description = %q{Allows for text categorization by guessing the language of a given text using n-grams}
14
+ s.email = %q{ahegde@zendesk.com}
15
+ s.extra_rdoc_files = [
16
+ "LICENSE.txt",
17
+ "README.rdoc"
18
+ ]
19
+ s.files = [
20
+ ".document",
21
+ "Gemfile",
22
+ "Gemfile.lock",
23
+ "LICENSE.txt",
24
+ "README.rdoc",
25
+ "Rakefile",
26
+ "VERSION",
27
+ "lib/scylla.rb",
28
+ "lib/scylla/classifier.rb",
29
+ "lib/scylla/generator.rb",
30
+ "lib/scylla/loader.rb",
31
+ "lib/scylla/string.rb",
32
+ "lib/scylla/tasks.rb",
33
+ "scylla.gemspec",
34
+ "source_texts/13375P33K.txt",
35
+ "source_texts/afrikaans.txt",
36
+ "source_texts/arabic.txt",
37
+ "source_texts/armenian.txt",
38
+ "source_texts/bulgarian.txt",
39
+ "source_texts/catalan.txt",
40
+ "source_texts/chinese.txt",
41
+ "source_texts/danish.txt",
42
+ "source_texts/english.txt",
43
+ "source_texts/esperanto.txt",
44
+ "source_texts/finnish.txt",
45
+ "source_texts/french.txt",
46
+ "source_texts/german.txt",
47
+ "source_texts/greek-iso8859-7.txt",
48
+ "source_texts/hebrew.txt",
49
+ "source_texts/hindi.txt",
50
+ "source_texts/hungarian.txt",
51
+ "source_texts/icelandic.txt",
52
+ "source_texts/indonesian.txt",
53
+ "source_texts/irish.txt",
54
+ "source_texts/italian.txt",
55
+ "source_texts/japanese.txt",
56
+ "source_texts/korean.txt",
57
+ "source_texts/latin.txt",
58
+ "source_texts/malay.txt",
59
+ "source_texts/marathi.txt",
60
+ "source_texts/mingo.txt",
61
+ "source_texts/nepali.txt",
62
+ "source_texts/norwegian.txt",
63
+ "source_texts/polish.txt",
64
+ "source_texts/portuguese.txt",
65
+ "source_texts/quechua.txt",
66
+ "source_texts/romanian.txt",
67
+ "source_texts/rumantsch.txt",
68
+ "source_texts/russian.txt",
69
+ "source_texts/sanskrit.txt",
70
+ "source_texts/scots_gaelic.txt",
71
+ "source_texts/serbian-ascii.txt",
72
+ "source_texts/slovak-ascii.txt",
73
+ "source_texts/slovenian-ascii.txt",
74
+ "source_texts/spanish.txt",
75
+ "source_texts/swahili.txt",
76
+ "source_texts/swedish.txt",
77
+ "source_texts/tagalog.txt",
78
+ "source_texts/tamil.txt",
79
+ "source_texts/thai.txt",
80
+ "source_texts/turkish.txt",
81
+ "source_texts/ukrainian-koi8_u.txt",
82
+ "source_texts/vietnamese.txt",
83
+ "source_texts/welsh.txt",
84
+ "source_texts/yiddish-utf.txt",
85
+ "test/classifier_test.rb",
86
+ "test/fixtures/source_texts/danish.txt",
87
+ "test/fixtures/source_texts/english.txt",
88
+ "test/fixtures/source_texts/french.txt",
89
+ "test/fixtures/source_texts/german.txt",
90
+ "test/fixtures/source_texts/spanish.txt",
91
+ "test/generator_test.rb",
92
+ "test/helper.rb",
93
+ "test/loader_test.rb",
94
+ "test/scylla_test.rb"
95
+ ]
96
+ s.homepage = %q{http://github.com/hashwin/scylla}
97
+ s.licenses = ["MIT"]
98
+ s.require_paths = ["lib"]
99
+ s.rubygems_version = %q{1.6.2}
100
+ s.summary = %q{Ruby port of Textcat language guesser}
101
+
102
+ if s.respond_to? :specification_version then
103
+ s.specification_version = 3
104
+
105
+ if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
106
+ s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
107
+ s.add_development_dependency(%q<jeweler>, ["~> 1.6.4"])
108
+ else
109
+ s.add_dependency(%q<bundler>, ["~> 1.0.0"])
110
+ s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
111
+ end
112
+ else
113
+ s.add_dependency(%q<bundler>, ["~> 1.0.0"])
114
+ s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
115
+ end
116
+ end
117
+