RubyGems - langusta - Versions diffs - 0.1.1 → 0.2.0 - Mend

langusta 0.1.1 → 0.2.0

Files changed (37)

data/.travis.yml +7 -0
data/Gemfile +10 -7
data/Gemfile.lock +12 -16
data/{README.rdoc → README.md} +27 -10
data/Rakefile +3 -10
data/VERSION +1 -1
data/langusta.gemspec +23 -47
data/lib/langusta.rb +36 -10
data/lib/langusta/codepoints.rb +19 -0
data/lib/langusta/command.rb +3 -3
data/lib/langusta/detector.rb +16 -13
data/lib/langusta/detector_factory.rb +11 -5
data/lib/langusta/guard.rb +22 -0
data/lib/langusta/inspector.rb +7 -0
data/lib/langusta/java_property_reader.rb +2 -3
data/lib/langusta/lang_profile.rb +12 -18
data/lib/langusta/language_detection_facade.rb +2 -2
data/lib/langusta/n_gram.rb +20 -25
data/lib/langusta/regex_helper.rb +15 -10
data/lib/langusta/tag_extractor.rb +5 -5
data/lib/langusta/unicode_block.rb +34 -34
data/test/helper.rb +12 -3
data/test/quality/test_falsified.rb +3 -3
data/test/test_command.rb +1 -0
data/test/test_detector.rb +18 -17
data/test/test_detector_factory.rb +17 -5
data/test/test_java_property_reader.rb +2 -1
data/test/test_lang_profile.rb +37 -31
data/test/test_language.rb +1 -0
data/test/test_language_detection_facade.rb +1 -1
data/test/test_langusta.rb +6 -6
data/test/test_n_gram.rb +87 -75
data/test/test_tag_extractor.rb +19 -18
data/test/test_unicode_block.rb +2 -1
metadata +54 -156
data/lib/langusta/ucs2_string.rb +0 -70
data/test/test_ucs2_string.rb +0 -9

data/.travis.yml ADDED Viewed

@@ -0,0 +1,7 @@
+before_install: "sudo apt-get -y install libonig-dev libyajl-dev"
+language: ruby
+rvm:
+  - 1.8.7
+  - 1.9.2
+  - 1.9.3

data/Gemfile CHANGED Viewed

@@ -1,11 +1,14 @@
 source :gemcutter
-gem "oniguruma", "1.1.0"
-gem "yajl-ruby", "0.8.2"
-group :development do
-  gem "bundler", "~> 1.0.0"
-  gem "jeweler", "~> 1.5.2"
-  gem "rcov"
+platform :mri_18 do
+  gem "oniguruma", "1.1.0"
+end
+gem "yajl-ruby", "0.8.2", :require => 'yajl'
+gem "bundler"
+gem "jeweler"
+group :test do
   gem "mocha"
-  gem "ruby-debug"
 end

data/Gemfile.lock CHANGED Viewed

@@ -1,32 +1,28 @@
 GEM
   remote: http://rubygems.org/
   specs:
-    columnize (0.3.2)
     git (1.2.5)
-    jeweler (1.5.2)
-      bundler (~> 1.0.0)
+    jeweler (1.8.3)
+      bundler (~> 1.0)
       git (>= 1.2.5)
       rake
-    linecache (0.43)
-    mocha (0.9.12)
+      rdoc
+    json (1.6.5)
+    metaclass (0.0.1)
+    mocha (0.10.5)
+      metaclass (~> 0.0.1)
     oniguruma (1.1.0)
-    rake (0.8.7)
-    rcov (0.9.9)
-    ruby-debug (0.10.4)
-      columnize (>= 0.1)
-      ruby-debug-base (~> 0.10.4.0)
-    ruby-debug-base (0.10.4)
-      linecache (>= 0.3)
+    rake (0.9.2.2)
+    rdoc (3.12)
+      json (~> 1.4)
     yajl-ruby (0.8.2)
 PLATFORMS
   ruby
 DEPENDENCIES
-  bundler (~> 1.0.0)
-  jeweler (~> 1.5.2)
+  bundler
+  jeweler
   mocha
   oniguruma (= 1.1.0)
-  rcov
-  ruby-debug
   yajl-ruby (= 0.8.2)

data/{README.rdoc → README.md} RENAMED Viewed

@@ -1,25 +1,41 @@
-= langusta
+# langusta
 Langusta is a language detection library based on a method designed and implemented by Nakatani Shuyo. This work is almost a direct 1-to-1 port of the original Java library which can be found at: http://code.google.com/p/language-detection.
 For more information about the method (naive bayesian classification), have a look at this presentation: http://www.slideshare.net/shuyo/language-detection-library-for-java. This implementation uses some resources from the original library, specifically the language profiles.
-== Runtime dependencies
+## Build status
-* oniguruma - regular expressions swiss army knife
+[![Build Status](https://secure.travis-ci.org/jasiek/langusta.png?branch=master)](http://travis-ci.org/jasiek/langusta)
+## Runtime dependencies
+* oniguruma - regular expressions swiss army knife (only required for 1.8.7)
 * yajl-ruby - a quick and elegant JSON parser
-== Usage
+## Usage
+The simplest way to use this library is to use the facade provided with this package.
-See lib/langusta/language_detection_facade.rb for an example, a canonical way to use the library is through this class.
+```ruby
+require 'langusta'
+facade = Langusta::LanguageDetectionFacade.new
+facade.detect('zażółć gęślą jaźń') #=> 'pl'
+```
-== Compatibility
+If you don't need all 49 profiles, you can boost your detection speed and reduce memory consumption by writing your own facade-like class.
+## Compatibility
 * Ruby 1.8.7
+* Ruby 1.9.2
+* Ruby 1.9.3
+## Caveats
-A version for Ruby 1.9 is in the works.
+Langusta is a memory hog - 49 profiles will take up about 80MB of RAM.
-== Contributing to langusta
+## Contributing to langusta
 * Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet
 * Check out the issue tracker to make sure someone already hasn't requested it and/or contributed it
@@ -29,6 +45,7 @@ A version for Ruby 1.9 is in the works.
 * Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
 * Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or is otherwise necessary, that is fine, but please isolate to its own commit so I can cherry-pick around it.
-== Copyright
+## Copyright
+Copyright (c) 2011, 2012 Jan Szumiec. See LICENSE.txt for further details.
-Copyright (c) 2011 Jan Szumiec. See LICENSE.txt for further details.

data/Rakefile CHANGED Viewed

@@ -24,21 +24,14 @@ Jeweler::RubygemsDotOrgTasks.new
 require 'rake/testtask'
 Rake::TestTask.new(:test) do |test|
-  test.libs << 'lib' << 'test'
-  test.pattern = 'test/test_*.rb'
-  test.verbose = true
-end
-require 'rcov/rcovtask'
-Rcov::RcovTask.new do |test|
-  test.libs << 'test'
+  test.libs << 'lib' << 'test' << '.'
   test.pattern = 'test/test_*.rb'
   test.verbose = true
 end
 task :default => :test
-require 'rake/rdoctask'
+require 'rdoc/task'
 Rake::RDocTask.new do |rdoc|
   version = File.exist?('VERSION') ? File.read('VERSION') : ""
@@ -49,7 +42,7 @@ Rake::RDocTask.new do |rdoc|
 end
 Rake::TestTask.new('test:quality') do |test|
-  test.libs << 'test/quality'
+  test.libs << 'test/quality' << 'lib' << '.'
   test.pattern = 'test/quality/test_*.rb'
   test.verbose = true
 end

data/VERSION CHANGED Viewed

@@ -1 +1 @@
-0.1.1
+0.2.0

data/langusta.gemspec CHANGED Viewed

@@ -4,26 +4,26 @@
 # -*- encoding: utf-8 -*-
 Gem::Specification.new do |s|
-  s.name = %q{langusta}
-  s.version = "0.1.1"
+  s.name = "langusta"
+  s.version = "0.2.0"
   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
   s.authors = ["Jan Szumiec"]
-  s.date = %q{2011-04-10}
-  s.default_executable = %q{langusta}
-  s.description = %q{Highly accurate language detection library, uses naive bayesian filter.}
-  s.email = %q{jan.szumiec@gmail.com}
+  s.date = "2012-03-04"
+  s.description = "Highly accurate language detection library, uses naive bayesian filter."
+  s.email = "jan.szumiec@gmail.com"
   s.executables = ["langusta"]
   s.extra_rdoc_files = [
     "LICENSE.txt",
-    "README.rdoc"
+    "README.md"
   ]
   s.files = [
     ".document",
+    ".travis.yml",
     "Gemfile",
     "Gemfile.lock",
     "LICENSE.txt",
-    "README.rdoc",
+    "README.md",
     "Rakefile",
     "VERSION",
     "bin/langusta",
@@ -31,9 +31,12 @@ Gem::Specification.new do |s|
     "data/uppercase.bin",
     "langusta.gemspec",
     "lib/langusta.rb",
+    "lib/langusta/codepoints.rb",
     "lib/langusta/command.rb",
     "lib/langusta/detector.rb",
     "lib/langusta/detector_factory.rb",
+    "lib/langusta/guard.rb",
+    "lib/langusta/inspector.rb",
     "lib/langusta/java_property_reader.rb",
     "lib/langusta/lang_profile.rb",
     "lib/langusta/language.rb",
@@ -41,7 +44,6 @@ Gem::Specification.new do |s|
     "lib/langusta/n_gram.rb",
     "lib/langusta/regex_helper.rb",
     "lib/langusta/tag_extractor.rb",
-    "lib/langusta/ucs2_string.rb",
     "lib/langusta/unicode_block.rb",
     "profiles/af",
     "profiles/ar",
@@ -152,59 +154,33 @@ Gem::Specification.new do |s|
     "test/test_langusta.rb",
     "test/test_n_gram.rb",
     "test/test_tag_extractor.rb",
-    "test/test_ucs2_string.rb",
     "test/test_unicode_block.rb"
   ]
-  s.homepage = %q{http://github.com/jasiek/langusta}
+  s.homepage = "http://github.com/jasiek/langusta"
   s.licenses = ["Apache 2.0"]
   s.require_paths = ["lib"]
-  s.rubygems_version = %q{1.5.1}
-  s.summary = %q{Language detection library based on http://code.google.com/p/language-detection/.}
-  s.test_files = [
-    "test/helper.rb",
-    "test/quality/test_falsified.rb",
-    "test/test_command.rb",
-    "test/test_detector.rb",
-    "test/test_detector_factory.rb",
-    "test/test_java_property_reader.rb",
-    "test/test_lang_profile.rb",
-    "test/test_language.rb",
-    "test/test_language_detection_facade.rb",
-    "test/test_langusta.rb",
-    "test/test_n_gram.rb",
-    "test/test_tag_extractor.rb",
-    "test/test_ucs2_string.rb",
-    "test/test_unicode_block.rb"
-  ]
+  s.rubygems_version = "1.8.17"
+  s.summary = "Language detection library based on http://code.google.com/p/language-detection/."
   if s.respond_to? :specification_version then
     s.specification_version = 3
     if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
-      s.add_runtime_dependency(%q<oniguruma>, ["= 1.1.0"])
+      s.add_runtime_dependency(%q<oniguruma>, ["= 1.1.0"]) if RUBY_VERSION < "1.9"
       s.add_runtime_dependency(%q<yajl-ruby>, ["= 0.8.2"])
-      s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
-      s.add_development_dependency(%q<jeweler>, ["~> 1.5.2"])
-      s.add_development_dependency(%q<rcov>, [">= 0"])
-      s.add_development_dependency(%q<mocha>, [">= 0"])
-      s.add_development_dependency(%q<ruby-debug>, [">= 0"])
+      s.add_runtime_dependency(%q<bundler>, [">= 0"])
+      s.add_runtime_dependency(%q<jeweler>, [">= 0"])
     else
-      s.add_dependency(%q<oniguruma>, ["= 1.1.0"])
+      s.add_dependency(%q<oniguruma>, ["= 1.1.0"]) if RUBY_VERSION < "1.9"
       s.add_dependency(%q<yajl-ruby>, ["= 0.8.2"])
-      s.add_dependency(%q<bundler>, ["~> 1.0.0"])
-      s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
-      s.add_dependency(%q<rcov>, [">= 0"])
-      s.add_dependency(%q<mocha>, [">= 0"])
-      s.add_dependency(%q<ruby-debug>, [">= 0"])
+      s.add_dependency(%q<bundler>, [">= 0"])
+      s.add_dependency(%q<jeweler>, [">= 0"])
     end
   else
-    s.add_dependency(%q<oniguruma>, ["= 1.1.0"])
+    s.add_dependency(%q<oniguruma>, ["= 1.1.0"]) if RUBY_VERSION < "1.9"
     s.add_dependency(%q<yajl-ruby>, ["= 0.8.2"])
-    s.add_dependency(%q<bundler>, ["~> 1.0.0"])
-    s.add_dependency(%q<jeweler>, ["~> 1.5.2"])
-    s.add_dependency(%q<rcov>, [">= 0"])
-    s.add_dependency(%q<mocha>, [">= 0"])
-    s.add_dependency(%q<ruby-debug>, [">= 0"])
+    s.add_dependency(%q<bundler>, [">= 0"])
+    s.add_dependency(%q<jeweler>, [">= 0"])
   end
 end

data/lib/langusta.rb CHANGED Viewed

@@ -2,20 +2,18 @@ $: << File.expand_path(File.dirname(__FILE__))
 require 'rubygems'
 require 'bundler'
-Bundler.setup
+Bundler.require
 require 'optparse'
-require 'iconv'
-# Required gems
-require 'oniguruma'
-require 'yajl'
+require 'iconv' if RUBY_VERSION < "1.9"
 module Langusta
   VERSION = '0.1.1'
+  autoload :Guard, 'langusta/guard'
+  autoload :Inspector, 'langusta/inspector'
   autoload :RegexHelper, 'langusta/regex_helper'
-  autoload :UCS2String, 'langusta/ucs2_string'
+  autoload :Codepoints, 'langusta/codepoints'
   autoload :Language, 'langusta/language'
   autoload :LangProfile, 'langusta/lang_profile'
   autoload :Detector, 'langusta/detector'
@@ -33,8 +31,36 @@ module Langusta
   UPPERCASE_BIN = File.join(ABSOLUTE_PATH, 'data/uppercase.bin')
   MESSAGES_PROPERTIES = File.join(ABSOLUTE_PATH, 'data/messages.properties')
-  class DuplicateProfilesError < StandardError; end
-  class NoProfilesLoadedError < StandardError; end
-  class NoFeaturesInTextError < StandardError; end
+  class Error < StandardError; end
+  class DuplicateProfilesError < Error; end
+  class NoProfilesLoadedError < Error; end
+  class NoFeaturesInTextError < Error; end
+  UTF82CP_SELECTOR = RUBY_VERSION < "1.9" ? :utf82cp_18 : :utf82cp_19
+  CP2UTF8_SELECTOR = RUBY_VERSION < "1.9" ? :cp2utf8_18 : :cp2utf8_19
+  def self.utf82cp(utf8_string)
+    send(UTF82CP_SELECTOR, utf8_string)
+  end
+  def self.utf82cp_18(utf8_string)
+    Iconv.conv('ucs-2be', 'utf-8', utf8_string).unpack('n*')
+  end
+  def self.utf82cp_19(utf8_string)
+    utf8_string.encode('ucs-2be').unpack('n*')
+  end
+  def self.cp2utf8(cp_array)
+    send(CP2UTF8_SELECTOR, cp_array)
+  end
+  def self.cp2utf8_18(cp_array)
+    Iconv.conv('utf-8', 'ucs-2be', cp_array.pack('n*'))
+  end
+  def self.cp2utf8_19(cp_array)
+    cp_array.pack('n*').force_encoding('ucs-2be').encode('utf-8')
+  end
 end

data/lib/langusta/codepoints.rb ADDED Viewed

@@ -0,0 +1,19 @@
+module Langusta
+  module Codepoints
+    GSUB_SELECTOR = RUBY_VERSION < "1.9" ? :gsub18 : :gsub19
+    def self.gsub!(codepoint_array, regex, replacement)
+      string = Langusta.cp2utf8(codepoint_array)
+      string = send(GSUB_SELECTOR, string, regex, replacement)
+      codepoint_array.replace(Langusta.utf82cp(string))
+    end
+    def self.gsub18(string, oregex, replacement)
+      oregex.gsub(string, replacement)
+    end
+    def self.gsub19(string, regex, replacement)
+      string.gsub(regex, replacement)
+    end
+  end
+end

data/lib/langusta/command.rb CHANGED Viewed

@@ -55,7 +55,7 @@ EOF
     end
     def detect_single_lang(filename, alpha)
-      ucs2_content = UCS2String.from_utf8(File.open(filename).read)
+      ucs2_content = Langusta.utf82cp(File.open(filename).read)
       detector = @detector_factory.create(alpha)
       detector.append(ucs2_content)
@@ -64,8 +64,8 @@ EOF
     def initialize_factory(profile_directory)
       profiles = load_profiles(profile_directory)
-      profiles.each_with_index do |profile, index|
-        @detector_factory.add_profile(profile, index, profiles.length)
+      profiles.each do |profile|
+        @detector_factory.add_profile(profile)
       end
     end

data/lib/langusta/detector.rb CHANGED Viewed

@@ -13,7 +13,7 @@ module Langusta
     def initialize(factory)
       @word_lang_prob_map = factory.word_lang_prob_map
       @lang_list = factory.lang_list
-      @text = UCS2String.new('')
+      @text = []
       @langprob = nil
       @alpha = ALPHA_DEFAULT
       @n_trial = 7
@@ -25,13 +25,15 @@ module Langusta
     # Append more text to be recognized.
     # @param text [UCS2String] text to be recognized
     def append(text)
-      raise TypeError.new("Expected: UCS2String, got: #{text.class}") unless text.is_a?(UCS2String)
-      text.gsub!(RegexHelper::URL_REGEX, "\x00\x20")
-      text.gsub!(RegexHelper::MAIL_REGEX, "\x00\x20")
+      Guard.klass(text, Array, __method__)
+      text = Codepoints.gsub!(text, RegexHelper::URL_REGEX, "\x00\x20")
+      text = Codepoints.gsub!(text, RegexHelper::MAIL_REGEX, "\x00\x20")
       text = text.map do |c|
         NGram.normalize(c)
       end
-      @text = text.gsub!(RegexHelper::SPACE_REGEX, "\x00\x20")
+      @text = Codepoints.gsub!(text, RegexHelper::SPACE_REGEX, "\x00\x20")
     end
     # Detect the language.
@@ -102,17 +104,17 @@ module Langusta
     def cleaning_text
       non_latin_count = latin_count = 0
-      @text.each_char do |c|
-        if c < "\00z" && c >= "\x00A"
+      @text.each do |c|
+        if c < 0x007a && c > 0x0041 # c > "z" && c < "A"
           latin_count += 1
-        elsif c >= "\x03\x00" && UnicodeBlock.of(c) != UnicodeBlock::LATIN_EXTENDED_ADDITIONAL
+        elsif c >= 0x3000 && UnicodeBlock.of(c) != UnicodeBlock::LATIN_EXTENDED_ADDITIONAL
           non_latin_count += 1
         end
       end
       if latin_count * 2 < non_latin_count
-        text_without_latin = UCS2String.new('')
-        @text.each_char do |c|
-          text_without_latin << c if c > "\x00z" || c < "\x00A"
+        text_without_latin = []
+        @text.each do |c|
+          text_without_latin << c if c > 0x007a || c < 0x0041 # c > "z" || c < "A"
         end
         @text = text_without_latin
       end
@@ -121,7 +123,7 @@ module Langusta
     def extract_ngrams
       list = []
       ngram = NGram.new
-      @text.each_char do |char|
+      @text.each do |char|
         ngram.add_char(char)
         (1..NGram::N_GRAM).each do |n|
           w = ngram.get(n)
@@ -169,7 +171,8 @@ module Langusta
       # verbose
       weight = alpha / BASE_FREQ
       prob.length.times do |i|
-        prob[i] *= weight + lang_prob_map[i]
+        # tiny workaround for nil values in word freq array
+        prob[i] *= weight + (lang_prob_map[i] || 0.0)
       end
       true
     end