langusta 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.travis.yml +7 -0
 - data/Gemfile +10 -7
 - data/Gemfile.lock +12 -16
 - data/{README.rdoc → README.md} +27 -10
 - data/Rakefile +3 -10
 - data/VERSION +1 -1
 - data/langusta.gemspec +23 -47
 - data/lib/langusta.rb +36 -10
 - data/lib/langusta/codepoints.rb +19 -0
 - data/lib/langusta/command.rb +3 -3
 - data/lib/langusta/detector.rb +16 -13
 - data/lib/langusta/detector_factory.rb +11 -5
 - data/lib/langusta/guard.rb +22 -0
 - data/lib/langusta/inspector.rb +7 -0
 - data/lib/langusta/java_property_reader.rb +2 -3
 - data/lib/langusta/lang_profile.rb +12 -18
 - data/lib/langusta/language_detection_facade.rb +2 -2
 - data/lib/langusta/n_gram.rb +20 -25
 - data/lib/langusta/regex_helper.rb +15 -10
 - data/lib/langusta/tag_extractor.rb +5 -5
 - data/lib/langusta/unicode_block.rb +34 -34
 - data/test/helper.rb +12 -3
 - data/test/quality/test_falsified.rb +3 -3
 - data/test/test_command.rb +1 -0
 - data/test/test_detector.rb +18 -17
 - data/test/test_detector_factory.rb +17 -5
 - data/test/test_java_property_reader.rb +2 -1
 - data/test/test_lang_profile.rb +37 -31
 - data/test/test_language.rb +1 -0
 - data/test/test_language_detection_facade.rb +1 -1
 - data/test/test_langusta.rb +6 -6
 - data/test/test_n_gram.rb +87 -75
 - data/test/test_tag_extractor.rb +19 -18
 - data/test/test_unicode_block.rb +2 -1
 - metadata +54 -156
 - data/lib/langusta/ucs2_string.rb +0 -70
 - data/test/test_ucs2_string.rb +0 -9
 
    
        data/.travis.yml
    ADDED
    
    
    
        data/Gemfile
    CHANGED
    
    | 
         @@ -1,11 +1,14 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            source :gemcutter
         
     | 
| 
       2 
     | 
    
         
            -
            gem "oniguruma", "1.1.0"
         
     | 
| 
       3 
     | 
    
         
            -
            gem "yajl-ruby", "0.8.2"
         
     | 
| 
       4 
2 
     | 
    
         | 
| 
       5 
     | 
    
         
            -
             
     | 
| 
       6 
     | 
    
         
            -
              gem " 
     | 
| 
       7 
     | 
    
         
            -
             
     | 
| 
       8 
     | 
    
         
            -
             
     | 
| 
      
 3 
     | 
    
         
            +
            platform :mri_18 do
         
     | 
| 
      
 4 
     | 
    
         
            +
              gem "oniguruma", "1.1.0"
         
     | 
| 
      
 5 
     | 
    
         
            +
            end
         
     | 
| 
      
 6 
     | 
    
         
            +
             
     | 
| 
      
 7 
     | 
    
         
            +
            gem "yajl-ruby", "0.8.2", :require => 'yajl'
         
     | 
| 
      
 8 
     | 
    
         
            +
             
     | 
| 
      
 9 
     | 
    
         
            +
            gem "bundler"
         
     | 
| 
      
 10 
     | 
    
         
            +
            gem "jeweler"
         
     | 
| 
      
 11 
     | 
    
         
            +
             
     | 
| 
      
 12 
     | 
    
         
            +
            group :test do
         
     | 
| 
       9 
13 
     | 
    
         
             
              gem "mocha"
         
     | 
| 
       10 
     | 
    
         
            -
              gem "ruby-debug"
         
     | 
| 
       11 
14 
     | 
    
         
             
            end
         
     | 
    
        data/Gemfile.lock
    CHANGED
    
    | 
         @@ -1,32 +1,28 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            GEM
         
     | 
| 
       2 
2 
     | 
    
         
             
              remote: http://rubygems.org/
         
     | 
| 
       3 
3 
     | 
    
         
             
              specs:
         
     | 
| 
       4 
     | 
    
         
            -
                columnize (0.3.2)
         
     | 
| 
       5 
4 
     | 
    
         
             
                git (1.2.5)
         
     | 
| 
       6 
     | 
    
         
            -
                jeweler (1. 
     | 
| 
       7 
     | 
    
         
            -
                  bundler (~> 1.0 
     | 
| 
      
 5 
     | 
    
         
            +
                jeweler (1.8.3)
         
     | 
| 
      
 6 
     | 
    
         
            +
                  bundler (~> 1.0)
         
     | 
| 
       8 
7 
     | 
    
         
             
                  git (>= 1.2.5)
         
     | 
| 
       9 
8 
     | 
    
         
             
                  rake
         
     | 
| 
       10 
     | 
    
         
            -
             
     | 
| 
       11 
     | 
    
         
            -
                 
     | 
| 
      
 9 
     | 
    
         
            +
                  rdoc
         
     | 
| 
      
 10 
     | 
    
         
            +
                json (1.6.5)
         
     | 
| 
      
 11 
     | 
    
         
            +
                metaclass (0.0.1)
         
     | 
| 
      
 12 
     | 
    
         
            +
                mocha (0.10.5)
         
     | 
| 
      
 13 
     | 
    
         
            +
                  metaclass (~> 0.0.1)
         
     | 
| 
       12 
14 
     | 
    
         
             
                oniguruma (1.1.0)
         
     | 
| 
       13 
     | 
    
         
            -
                rake (0. 
     | 
| 
       14 
     | 
    
         
            -
                 
     | 
| 
       15 
     | 
    
         
            -
             
     | 
| 
       16 
     | 
    
         
            -
                  columnize (>= 0.1)
         
     | 
| 
       17 
     | 
    
         
            -
                  ruby-debug-base (~> 0.10.4.0)
         
     | 
| 
       18 
     | 
    
         
            -
                ruby-debug-base (0.10.4)
         
     | 
| 
       19 
     | 
    
         
            -
                  linecache (>= 0.3)
         
     | 
| 
      
 15 
     | 
    
         
            +
                rake (0.9.2.2)
         
     | 
| 
      
 16 
     | 
    
         
            +
                rdoc (3.12)
         
     | 
| 
      
 17 
     | 
    
         
            +
                  json (~> 1.4)
         
     | 
| 
       20 
18 
     | 
    
         
             
                yajl-ruby (0.8.2)
         
     | 
| 
       21 
19 
     | 
    
         | 
| 
       22 
20 
     | 
    
         
             
            PLATFORMS
         
     | 
| 
       23 
21 
     | 
    
         
             
              ruby
         
     | 
| 
       24 
22 
     | 
    
         | 
| 
       25 
23 
     | 
    
         
             
            DEPENDENCIES
         
     | 
| 
       26 
     | 
    
         
            -
              bundler 
     | 
| 
       27 
     | 
    
         
            -
              jeweler 
     | 
| 
      
 24 
     | 
    
         
            +
              bundler
         
     | 
| 
      
 25 
     | 
    
         
            +
              jeweler
         
     | 
| 
       28 
26 
     | 
    
         
             
              mocha
         
     | 
| 
       29 
27 
     | 
    
         
             
              oniguruma (= 1.1.0)
         
     | 
| 
       30 
     | 
    
         
            -
              rcov
         
     | 
| 
       31 
     | 
    
         
            -
              ruby-debug
         
     | 
| 
       32 
28 
     | 
    
         
             
              yajl-ruby (= 0.8.2)
         
     | 
    
        data/{README.rdoc → README.md}
    RENAMED
    
    | 
         @@ -1,25 +1,41 @@ 
     | 
|
| 
       1 
     | 
    
         
            -
             
     | 
| 
      
 1 
     | 
    
         
            +
            # langusta
         
     | 
| 
       2 
2 
     | 
    
         | 
| 
       3 
3 
     | 
    
         
             
            Langusta is a language detection library based on a method designed and implemented by Nakatani Shuyo. This work is almost a direct 1-to-1 port of the original Java library which can be found at: http://code.google.com/p/language-detection.
         
     | 
| 
       4 
4 
     | 
    
         | 
| 
       5 
5 
     | 
    
         
             
            For more information about the method (naive bayesian classification), have a look at this presentation: http://www.slideshare.net/shuyo/language-detection-library-for-java. This implementation uses some resources from the original library, specifically the language profiles.
         
     | 
| 
       6 
6 
     | 
    
         | 
| 
       7 
     | 
    
         
            -
             
     | 
| 
      
 7 
     | 
    
         
            +
            ## Build status
         
     | 
| 
       8 
8 
     | 
    
         | 
| 
       9 
     | 
    
         
            -
             
     | 
| 
      
 9 
     | 
    
         
            +
            [Build Status](http://travis-ci.org/jasiek/langusta)
         
     | 
| 
      
 10 
     | 
    
         
            +
             
     | 
| 
      
 11 
     | 
    
         
            +
            ## Runtime dependencies
         
     | 
| 
      
 12 
     | 
    
         
            +
             
     | 
| 
      
 13 
     | 
    
         
            +
            * oniguruma - regular expressions swiss army knife (only required for 1.8.7)
         
     | 
| 
       10 
14 
     | 
    
         
             
            * yajl-ruby - a quick and elegant JSON parser
         
     | 
| 
       11 
15 
     | 
    
         | 
| 
       12 
     | 
    
         
            -
             
     | 
| 
      
 16 
     | 
    
         
            +
            ## Usage
         
     | 
| 
      
 17 
     | 
    
         
            +
             
     | 
| 
      
 18 
     | 
    
         
            +
            The simplest way to use this library is to use the facade provided with this package.
         
     | 
| 
       13 
19 
     | 
    
         | 
| 
       14 
     | 
    
         
            -
             
     | 
| 
      
 20 
     | 
    
         
            +
            ```ruby
         
     | 
| 
      
 21 
     | 
    
         
            +
            require 'langusta'
         
     | 
| 
      
 22 
     | 
    
         
            +
            facade = Langusta::LanguageDetectionFacade.new
         
     | 
| 
      
 23 
     | 
    
         
            +
            facade.detect('zażółć gęślą jaźń') #=> 'pl'
         
     | 
| 
      
 24 
     | 
    
         
            +
            ```
         
     | 
| 
       15 
25 
     | 
    
         | 
| 
       16 
     | 
    
         
            -
             
     | 
| 
      
 26 
     | 
    
         
            +
            If you don't need all 49 profiles, you can boost your detection speed and reduce memory consumption by writing your own facade-like class.
         
     | 
| 
      
 27 
     | 
    
         
            +
             
     | 
| 
      
 28 
     | 
    
         
            +
            ## Compatibility
         
     | 
| 
       17 
29 
     | 
    
         | 
| 
       18 
30 
     | 
    
         
             
            * Ruby 1.8.7
         
     | 
| 
      
 31 
     | 
    
         
            +
            * Ruby 1.9.2
         
     | 
| 
      
 32 
     | 
    
         
            +
            * Ruby 1.9.3
         
     | 
| 
      
 33 
     | 
    
         
            +
             
     | 
| 
      
 34 
     | 
    
         
            +
            ## Caveats
         
     | 
| 
       19 
35 
     | 
    
         | 
| 
       20 
     | 
    
         
            -
             
     | 
| 
      
 36 
     | 
    
         
            +
            Langusta is a memory hog - 49 profiles will take up about 80MB of RAM.
         
     | 
| 
       21 
37 
     | 
    
         | 
| 
       22 
     | 
    
         
            -
             
     | 
| 
      
 38 
     | 
    
         
            +
            ## Contributing to langusta
         
     | 
| 
       23 
39 
     | 
    
         | 
| 
       24 
40 
     | 
    
         
             
            * Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet
         
     | 
| 
       25 
41 
     | 
    
         
             
            * Check out the issue tracker to make sure someone already hasn't requested it and/or contributed it
         
     | 
| 
         @@ -29,6 +45,7 @@ A version for Ruby 1.9 is in the works. 
     | 
|
| 
       29 
45 
     | 
    
         
             
            * Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
         
     | 
| 
       30 
46 
     | 
    
         
             
            * Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or is otherwise necessary, that is fine, but please isolate to its own commit so I can cherry-pick around it.
         
     | 
| 
       31 
47 
     | 
    
         | 
| 
       32 
     | 
    
         
            -
             
     | 
| 
      
 48 
     | 
    
         
            +
            ## Copyright
         
     | 
| 
      
 49 
     | 
    
         
            +
             
     | 
| 
      
 50 
     | 
    
         
            +
            Copyright (c) 2011, 2012 Jan Szumiec. See LICENSE.txt for further details.
         
     | 
| 
       33 
51 
     | 
    
         | 
| 
       34 
     | 
    
         
            -
            Copyright (c) 2011 Jan Szumiec. See LICENSE.txt for further details.
         
     | 
    
        data/Rakefile
    CHANGED
    
    | 
         @@ -24,21 +24,14 @@ Jeweler::RubygemsDotOrgTasks.new 
     | 
|
| 
       24 
24 
     | 
    
         | 
| 
       25 
25 
     | 
    
         
             
            require 'rake/testtask'
         
     | 
| 
       26 
26 
     | 
    
         
             
            Rake::TestTask.new(:test) do |test|
         
     | 
| 
       27 
     | 
    
         
            -
              test.libs << 'lib' << 'test'
         
     | 
| 
       28 
     | 
    
         
            -
              test.pattern = 'test/test_*.rb'
         
     | 
| 
       29 
     | 
    
         
            -
              test.verbose = true
         
     | 
| 
       30 
     | 
    
         
            -
            end
         
     | 
| 
       31 
     | 
    
         
            -
             
     | 
| 
       32 
     | 
    
         
            -
            require 'rcov/rcovtask'
         
     | 
| 
       33 
     | 
    
         
            -
            Rcov::RcovTask.new do |test|
         
     | 
| 
       34 
     | 
    
         
            -
              test.libs << 'test'
         
     | 
| 
      
 27 
     | 
    
         
            +
              test.libs << 'lib' << 'test' << '.'
         
     | 
| 
       35 
28 
     | 
    
         
             
              test.pattern = 'test/test_*.rb'
         
     | 
| 
       36 
29 
     | 
    
         
             
              test.verbose = true
         
     | 
| 
       37 
30 
     | 
    
         
             
            end
         
     | 
| 
       38 
31 
     | 
    
         | 
| 
       39 
32 
     | 
    
         
             
            task :default => :test
         
     | 
| 
       40 
33 
     | 
    
         | 
| 
       41 
     | 
    
         
            -
            require ' 
     | 
| 
      
 34 
     | 
    
         
            +
            require 'rdoc/task'
         
     | 
| 
       42 
35 
     | 
    
         
             
            Rake::RDocTask.new do |rdoc|
         
     | 
| 
       43 
36 
     | 
    
         
             
              version = File.exist?('VERSION') ? File.read('VERSION') : ""
         
     | 
| 
       44 
37 
     | 
    
         | 
| 
         @@ -49,7 +42,7 @@ Rake::RDocTask.new do |rdoc| 
     | 
|
| 
       49 
42 
     | 
    
         
             
            end
         
     | 
| 
       50 
43 
     | 
    
         | 
| 
       51 
44 
     | 
    
         
             
            Rake::TestTask.new('test:quality') do |test|
         
     | 
| 
       52 
     | 
    
         
            -
              test.libs << 'test/quality'
         
     | 
| 
      
 45 
     | 
    
         
            +
              test.libs << 'test/quality' << 'lib' << '.'
         
     | 
| 
       53 
46 
     | 
    
         
             
              test.pattern = 'test/quality/test_*.rb'
         
     | 
| 
       54 
47 
     | 
    
         
             
              test.verbose = true
         
     | 
| 
       55 
48 
     | 
    
         
             
            end
         
     | 
    
        data/VERSION
    CHANGED
    
    | 
         @@ -1 +1 @@ 
     | 
|
| 
       1 
     | 
    
         
            -
            0. 
     | 
| 
      
 1 
     | 
    
         
            +
            0.2.0
         
     | 
    
        data/langusta.gemspec
    CHANGED
    
    | 
         @@ -4,26 +4,26 @@ 
     | 
|
| 
       4 
4 
     | 
    
         
             
            # -*- encoding: utf-8 -*-
         
     | 
| 
       5 
5 
     | 
    
         | 
| 
       6 
6 
     | 
    
         
             
            Gem::Specification.new do |s|
         
     | 
| 
       7 
     | 
    
         
            -
              s.name =  
     | 
| 
       8 
     | 
    
         
            -
              s.version = "0. 
     | 
| 
      
 7 
     | 
    
         
            +
              s.name = "langusta"
         
     | 
| 
      
 8 
     | 
    
         
            +
              s.version = "0.2.0"
         
     | 
| 
       9 
9 
     | 
    
         | 
| 
       10 
10 
     | 
    
         
             
              s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
         
     | 
| 
       11 
11 
     | 
    
         
             
              s.authors = ["Jan Szumiec"]
         
     | 
| 
       12 
     | 
    
         
            -
              s.date =  
     | 
| 
       13 
     | 
    
         
            -
              s. 
     | 
| 
       14 
     | 
    
         
            -
              s. 
     | 
| 
       15 
     | 
    
         
            -
              s.email = %q{jan.szumiec@gmail.com}
         
     | 
| 
      
 12 
     | 
    
         
            +
              s.date = "2012-03-04"
         
     | 
| 
      
 13 
     | 
    
         
            +
              s.description = "Highly accurate language detection library, uses naive bayesian filter."
         
     | 
| 
      
 14 
     | 
    
         
            +
              s.email = "jan.szumiec@gmail.com"
         
     | 
| 
       16 
15 
     | 
    
         
             
              s.executables = ["langusta"]
         
     | 
| 
       17 
16 
     | 
    
         
             
              s.extra_rdoc_files = [
         
     | 
| 
       18 
17 
     | 
    
         
             
                "LICENSE.txt",
         
     | 
| 
       19 
     | 
    
         
            -
                "README. 
     | 
| 
      
 18 
     | 
    
         
            +
                "README.md"
         
     | 
| 
       20 
19 
     | 
    
         
             
              ]
         
     | 
| 
       21 
20 
     | 
    
         
             
              s.files = [
         
     | 
| 
       22 
21 
     | 
    
         
             
                ".document",
         
     | 
| 
      
 22 
     | 
    
         
            +
                ".travis.yml",
         
     | 
| 
       23 
23 
     | 
    
         
             
                "Gemfile",
         
     | 
| 
       24 
24 
     | 
    
         
             
                "Gemfile.lock",
         
     | 
| 
       25 
25 
     | 
    
         
             
                "LICENSE.txt",
         
     | 
| 
       26 
     | 
    
         
            -
                "README. 
     | 
| 
      
 26 
     | 
    
         
            +
                "README.md",
         
     | 
| 
       27 
27 
     | 
    
         
             
                "Rakefile",
         
     | 
| 
       28 
28 
     | 
    
         
             
                "VERSION",
         
     | 
| 
       29 
29 
     | 
    
         
             
                "bin/langusta",
         
     | 
| 
         @@ -31,9 +31,12 @@ Gem::Specification.new do |s| 
     | 
|
| 
       31 
31 
     | 
    
         
             
                "data/uppercase.bin",
         
     | 
| 
       32 
32 
     | 
    
         
             
                "langusta.gemspec",
         
     | 
| 
       33 
33 
     | 
    
         
             
                "lib/langusta.rb",
         
     | 
| 
      
 34 
     | 
    
         
            +
                "lib/langusta/codepoints.rb",
         
     | 
| 
       34 
35 
     | 
    
         
             
                "lib/langusta/command.rb",
         
     | 
| 
       35 
36 
     | 
    
         
             
                "lib/langusta/detector.rb",
         
     | 
| 
       36 
37 
     | 
    
         
             
                "lib/langusta/detector_factory.rb",
         
     | 
| 
      
 38 
     | 
    
         
            +
                "lib/langusta/guard.rb",
         
     | 
| 
      
 39 
     | 
    
         
            +
                "lib/langusta/inspector.rb",
         
     | 
| 
       37 
40 
     | 
    
         
             
                "lib/langusta/java_property_reader.rb",
         
     | 
| 
       38 
41 
     | 
    
         
             
                "lib/langusta/lang_profile.rb",
         
     | 
| 
       39 
42 
     | 
    
         
             
                "lib/langusta/language.rb",
         
     | 
| 
         @@ -41,7 +44,6 @@ Gem::Specification.new do |s| 
     | 
|
| 
       41 
44 
     | 
    
         
             
                "lib/langusta/n_gram.rb",
         
     | 
| 
       42 
45 
     | 
    
         
             
                "lib/langusta/regex_helper.rb",
         
     | 
| 
       43 
46 
     | 
    
         
             
                "lib/langusta/tag_extractor.rb",
         
     | 
| 
       44 
     | 
    
         
            -
                "lib/langusta/ucs2_string.rb",
         
     | 
| 
       45 
47 
     | 
    
         
             
                "lib/langusta/unicode_block.rb",
         
     | 
| 
       46 
48 
     | 
    
         
             
                "profiles/af",
         
     | 
| 
       47 
49 
     | 
    
         
             
                "profiles/ar",
         
     | 
| 
         @@ -152,59 +154,33 @@ Gem::Specification.new do |s| 
     | 
|
| 
       152 
154 
     | 
    
         
             
                "test/test_langusta.rb",
         
     | 
| 
       153 
155 
     | 
    
         
             
                "test/test_n_gram.rb",
         
     | 
| 
       154 
156 
     | 
    
         
             
                "test/test_tag_extractor.rb",
         
     | 
| 
       155 
     | 
    
         
            -
                "test/test_ucs2_string.rb",
         
     | 
| 
       156 
157 
     | 
    
         
             
                "test/test_unicode_block.rb"
         
     | 
| 
       157 
158 
     | 
    
         
             
              ]
         
     | 
| 
       158 
     | 
    
         
            -
              s.homepage =  
     | 
| 
      
 159 
     | 
    
         
            +
              s.homepage = "http://github.com/jasiek/langusta"
         
     | 
| 
       159 
160 
     | 
    
         
             
              s.licenses = ["Apache 2.0"]
         
     | 
| 
       160 
161 
     | 
    
         
             
              s.require_paths = ["lib"]
         
     | 
| 
       161 
     | 
    
         
            -
              s.rubygems_version =  
     | 
| 
       162 
     | 
    
         
            -
              s.summary =  
     | 
| 
       163 
     | 
    
         
            -
              s.test_files = [
         
     | 
| 
       164 
     | 
    
         
            -
                "test/helper.rb",
         
     | 
| 
       165 
     | 
    
         
            -
                "test/quality/test_falsified.rb",
         
     | 
| 
       166 
     | 
    
         
            -
                "test/test_command.rb",
         
     | 
| 
       167 
     | 
    
         
            -
                "test/test_detector.rb",
         
     | 
| 
       168 
     | 
    
         
            -
                "test/test_detector_factory.rb",
         
     | 
| 
       169 
     | 
    
         
            -
                "test/test_java_property_reader.rb",
         
     | 
| 
       170 
     | 
    
         
            -
                "test/test_lang_profile.rb",
         
     | 
| 
       171 
     | 
    
         
            -
                "test/test_language.rb",
         
     | 
| 
       172 
     | 
    
         
            -
                "test/test_language_detection_facade.rb",
         
     | 
| 
       173 
     | 
    
         
            -
                "test/test_langusta.rb",
         
     | 
| 
       174 
     | 
    
         
            -
                "test/test_n_gram.rb",
         
     | 
| 
       175 
     | 
    
         
            -
                "test/test_tag_extractor.rb",
         
     | 
| 
       176 
     | 
    
         
            -
                "test/test_ucs2_string.rb",
         
     | 
| 
       177 
     | 
    
         
            -
                "test/test_unicode_block.rb"
         
     | 
| 
       178 
     | 
    
         
            -
              ]
         
     | 
| 
      
 162 
     | 
    
         
            +
              s.rubygems_version = "1.8.17"
         
     | 
| 
      
 163 
     | 
    
         
            +
              s.summary = "Language detection library based on http://code.google.com/p/language-detection/."
         
     | 
| 
       179 
164 
     | 
    
         | 
| 
       180 
165 
     | 
    
         
             
              if s.respond_to? :specification_version then
         
     | 
| 
       181 
166 
     | 
    
         
             
                s.specification_version = 3
         
     | 
| 
       182 
167 
     | 
    
         | 
| 
       183 
168 
     | 
    
         
             
                if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
         
     | 
| 
       184 
     | 
    
         
            -
                  s.add_runtime_dependency(%q<oniguruma>, ["= 1.1.0"])
         
     | 
| 
      
 169 
     | 
    
         
            +
                  s.add_runtime_dependency(%q<oniguruma>, ["= 1.1.0"]) if RUBY_VERSION < "1.9"
         
     | 
| 
       185 
170 
     | 
    
         
             
                  s.add_runtime_dependency(%q<yajl-ruby>, ["= 0.8.2"])
         
     | 
| 
       186 
     | 
    
         
            -
                  s. 
     | 
| 
       187 
     | 
    
         
            -
                  s. 
     | 
| 
       188 
     | 
    
         
            -
                  s.add_development_dependency(%q<rcov>, [">= 0"])
         
     | 
| 
       189 
     | 
    
         
            -
                  s.add_development_dependency(%q<mocha>, [">= 0"])
         
     | 
| 
       190 
     | 
    
         
            -
                  s.add_development_dependency(%q<ruby-debug>, [">= 0"])
         
     | 
| 
      
 171 
     | 
    
         
            +
                  s.add_runtime_dependency(%q<bundler>, [">= 0"])
         
     | 
| 
      
 172 
     | 
    
         
            +
                  s.add_runtime_dependency(%q<jeweler>, [">= 0"])
         
     | 
| 
       191 
173 
     | 
    
         
             
                else
         
     | 
| 
       192 
     | 
    
         
            -
                  s.add_dependency(%q<oniguruma>, ["= 1.1.0"])
         
     | 
| 
      
 174 
     | 
    
         
            +
                  s.add_dependency(%q<oniguruma>, ["= 1.1.0"]) if RUBY_VERSION < "1.9"
         
     | 
| 
       193 
175 
     | 
    
         
             
                  s.add_dependency(%q<yajl-ruby>, ["= 0.8.2"])
         
     | 
| 
       194 
     | 
    
         
            -
                  s.add_dependency(%q<bundler>, [" 
     | 
| 
       195 
     | 
    
         
            -
                  s.add_dependency(%q<jeweler>, [" 
     | 
| 
       196 
     | 
    
         
            -
                  s.add_dependency(%q<rcov>, [">= 0"])
         
     | 
| 
       197 
     | 
    
         
            -
                  s.add_dependency(%q<mocha>, [">= 0"])
         
     | 
| 
       198 
     | 
    
         
            -
                  s.add_dependency(%q<ruby-debug>, [">= 0"])
         
     | 
| 
      
 176 
     | 
    
         
            +
                  s.add_dependency(%q<bundler>, [">= 0"])
         
     | 
| 
      
 177 
     | 
    
         
            +
                  s.add_dependency(%q<jeweler>, [">= 0"])
         
     | 
| 
       199 
178 
     | 
    
         
             
                end
         
     | 
| 
       200 
179 
     | 
    
         
             
              else
         
     | 
| 
       201 
     | 
    
         
            -
                s.add_dependency(%q<oniguruma>, ["= 1.1.0"])
         
     | 
| 
      
 180 
     | 
    
         
            +
                s.add_dependency(%q<oniguruma>, ["= 1.1.0"]) if RUBY_VERSION < "1.9"
         
     | 
| 
       202 
181 
     | 
    
         
             
                s.add_dependency(%q<yajl-ruby>, ["= 0.8.2"])
         
     | 
| 
       203 
     | 
    
         
            -
                s.add_dependency(%q<bundler>, [" 
     | 
| 
       204 
     | 
    
         
            -
                s.add_dependency(%q<jeweler>, [" 
     | 
| 
       205 
     | 
    
         
            -
                s.add_dependency(%q<rcov>, [">= 0"])
         
     | 
| 
       206 
     | 
    
         
            -
                s.add_dependency(%q<mocha>, [">= 0"])
         
     | 
| 
       207 
     | 
    
         
            -
                s.add_dependency(%q<ruby-debug>, [">= 0"])
         
     | 
| 
      
 182 
     | 
    
         
            +
                s.add_dependency(%q<bundler>, [">= 0"])
         
     | 
| 
      
 183 
     | 
    
         
            +
                s.add_dependency(%q<jeweler>, [">= 0"])
         
     | 
| 
       208 
184 
     | 
    
         
             
              end
         
     | 
| 
       209 
185 
     | 
    
         
             
            end
         
     | 
| 
       210 
186 
     | 
    
         | 
    
        data/lib/langusta.rb
    CHANGED
    
    | 
         @@ -2,20 +2,18 @@ $: << File.expand_path(File.dirname(__FILE__)) 
     | 
|
| 
       2 
2 
     | 
    
         | 
| 
       3 
3 
     | 
    
         
             
            require 'rubygems'
         
     | 
| 
       4 
4 
     | 
    
         
             
            require 'bundler'
         
     | 
| 
       5 
     | 
    
         
            -
            Bundler. 
     | 
| 
      
 5 
     | 
    
         
            +
            Bundler.require
         
     | 
| 
       6 
6 
     | 
    
         | 
| 
       7 
7 
     | 
    
         
             
            require 'optparse'
         
     | 
| 
       8 
     | 
    
         
            -
            require 'iconv'
         
     | 
| 
       9 
     | 
    
         
            -
             
     | 
| 
       10 
     | 
    
         
            -
            # Required gems
         
     | 
| 
       11 
     | 
    
         
            -
            require 'oniguruma'
         
     | 
| 
       12 
     | 
    
         
            -
            require 'yajl'
         
     | 
| 
      
 8 
     | 
    
         
            +
            require 'iconv' if RUBY_VERSION < "1.9"
         
     | 
| 
       13 
9 
     | 
    
         | 
| 
       14 
10 
     | 
    
         
             
            module Langusta
         
     | 
| 
       15 
11 
     | 
    
         
             
              VERSION = '0.1.1'
         
     | 
| 
       16 
12 
     | 
    
         | 
| 
      
 13 
     | 
    
         
            +
              autoload :Guard, 'langusta/guard'
         
     | 
| 
      
 14 
     | 
    
         
            +
              autoload :Inspector, 'langusta/inspector'
         
     | 
| 
       17 
15 
     | 
    
         
             
              autoload :RegexHelper, 'langusta/regex_helper'
         
     | 
| 
       18 
     | 
    
         
            -
              autoload : 
     | 
| 
      
 16 
     | 
    
         
            +
              autoload :Codepoints, 'langusta/codepoints'
         
     | 
| 
       19 
17 
     | 
    
         
             
              autoload :Language, 'langusta/language'
         
     | 
| 
       20 
18 
     | 
    
         
             
              autoload :LangProfile, 'langusta/lang_profile'
         
     | 
| 
       21 
19 
     | 
    
         
             
              autoload :Detector, 'langusta/detector'
         
     | 
| 
         @@ -33,8 +31,36 @@ module Langusta 
     | 
|
| 
       33 
31 
     | 
    
         
             
              UPPERCASE_BIN = File.join(ABSOLUTE_PATH, 'data/uppercase.bin')
         
     | 
| 
       34 
32 
     | 
    
         
             
              MESSAGES_PROPERTIES = File.join(ABSOLUTE_PATH, 'data/messages.properties')
         
     | 
| 
       35 
33 
     | 
    
         | 
| 
       36 
     | 
    
         
            -
              class  
     | 
| 
       37 
     | 
    
         
            -
              class  
     | 
| 
       38 
     | 
    
         
            -
              class  
     | 
| 
      
 34 
     | 
    
         
            +
              class Error < StandardError; end
         
     | 
| 
      
 35 
     | 
    
         
            +
              class DuplicateProfilesError < Error; end
         
     | 
| 
      
 36 
     | 
    
         
            +
              class NoProfilesLoadedError < Error; end
         
     | 
| 
      
 37 
     | 
    
         
            +
              class NoFeaturesInTextError < Error; end
         
     | 
| 
      
 38 
     | 
    
         
            +
             
     | 
| 
      
 39 
     | 
    
         
            +
              UTF82CP_SELECTOR = RUBY_VERSION < "1.9" ? :utf82cp_18 : :utf82cp_19
         
     | 
| 
      
 40 
     | 
    
         
            +
              CP2UTF8_SELECTOR = RUBY_VERSION < "1.9" ? :cp2utf8_18 : :cp2utf8_19
         
     | 
| 
      
 41 
     | 
    
         
            +
             
     | 
| 
      
 42 
     | 
    
         
            +
              def self.utf82cp(utf8_string)
         
     | 
| 
      
 43 
     | 
    
         
            +
                send(UTF82CP_SELECTOR, utf8_string)
         
     | 
| 
      
 44 
     | 
    
         
            +
              end
         
     | 
| 
      
 45 
     | 
    
         
            +
             
     | 
| 
      
 46 
     | 
    
         
            +
              def self.utf82cp_18(utf8_string)
         
     | 
| 
      
 47 
     | 
    
         
            +
                Iconv.conv('ucs-2be', 'utf-8', utf8_string).unpack('n*')
         
     | 
| 
      
 48 
     | 
    
         
            +
              end
         
     | 
| 
      
 49 
     | 
    
         
            +
             
     | 
| 
      
 50 
     | 
    
         
            +
              def self.utf82cp_19(utf8_string)
         
     | 
| 
      
 51 
     | 
    
         
            +
                utf8_string.encode('ucs-2be').unpack('n*')
         
     | 
| 
      
 52 
     | 
    
         
            +
              end
         
     | 
| 
      
 53 
     | 
    
         
            +
             
     | 
| 
      
 54 
     | 
    
         
            +
              def self.cp2utf8(cp_array)
         
     | 
| 
      
 55 
     | 
    
         
            +
                send(CP2UTF8_SELECTOR, cp_array)
         
     | 
| 
      
 56 
     | 
    
         
            +
              end
         
     | 
| 
      
 57 
     | 
    
         
            +
             
     | 
| 
      
 58 
     | 
    
         
            +
              def self.cp2utf8_18(cp_array)
         
     | 
| 
      
 59 
     | 
    
         
            +
                Iconv.conv('utf-8', 'ucs-2be', cp_array.pack('n*'))
         
     | 
| 
      
 60 
     | 
    
         
            +
              end
         
     | 
| 
      
 61 
     | 
    
         
            +
             
     | 
| 
      
 62 
     | 
    
         
            +
              def self.cp2utf8_19(cp_array)
         
     | 
| 
      
 63 
     | 
    
         
            +
                cp_array.pack('n*').force_encoding('ucs-2be').encode('utf-8')
         
     | 
| 
      
 64 
     | 
    
         
            +
              end
         
     | 
| 
       39 
65 
     | 
    
         
             
            end
         
     | 
| 
       40 
66 
     | 
    
         | 
| 
         @@ -0,0 +1,19 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            module Langusta
         
     | 
| 
      
 2 
     | 
    
         
            +
              module Codepoints
         
     | 
| 
      
 3 
     | 
    
         
            +
                GSUB_SELECTOR = RUBY_VERSION < "1.9" ? :gsub18 : :gsub19
         
     | 
| 
      
 4 
     | 
    
         
            +
             
     | 
| 
      
 5 
     | 
    
         
            +
                def self.gsub!(codepoint_array, regex, replacement)
         
     | 
| 
      
 6 
     | 
    
         
            +
                  string = Langusta.cp2utf8(codepoint_array)
         
     | 
| 
      
 7 
     | 
    
         
            +
                  string = send(GSUB_SELECTOR, string, regex, replacement)
         
     | 
| 
      
 8 
     | 
    
         
            +
                  codepoint_array.replace(Langusta.utf82cp(string))
         
     | 
| 
      
 9 
     | 
    
         
            +
                end
         
     | 
| 
      
 10 
     | 
    
         
            +
             
     | 
| 
      
 11 
     | 
    
         
            +
                def self.gsub18(string, oregex, replacement)
         
     | 
| 
      
 12 
     | 
    
         
            +
                  oregex.gsub(string, replacement)
         
     | 
| 
      
 13 
     | 
    
         
            +
                end
         
     | 
| 
      
 14 
     | 
    
         
            +
             
     | 
| 
      
 15 
     | 
    
         
            +
                def self.gsub19(string, regex, replacement)
         
     | 
| 
      
 16 
     | 
    
         
            +
                  string.gsub(regex, replacement)
         
     | 
| 
      
 17 
     | 
    
         
            +
                end
         
     | 
| 
      
 18 
     | 
    
         
            +
              end
         
     | 
| 
      
 19 
     | 
    
         
            +
            end
         
     | 
    
        data/lib/langusta/command.rb
    CHANGED
    
    | 
         @@ -55,7 +55,7 @@ EOF 
     | 
|
| 
       55 
55 
     | 
    
         
             
                end
         
     | 
| 
       56 
56 
     | 
    
         | 
| 
       57 
57 
     | 
    
         
             
                def detect_single_lang(filename, alpha)
         
     | 
| 
       58 
     | 
    
         
            -
                  ucs2_content =  
     | 
| 
      
 58 
     | 
    
         
            +
                  ucs2_content = Langusta.utf82cp(File.open(filename).read)
         
     | 
| 
       59 
59 
     | 
    
         
             
                  detector = @detector_factory.create(alpha)
         
     | 
| 
       60 
60 
     | 
    
         
             
                  detector.append(ucs2_content)
         
     | 
| 
       61 
61 
     | 
    
         | 
| 
         @@ -64,8 +64,8 @@ EOF 
     | 
|
| 
       64 
64 
     | 
    
         | 
| 
       65 
65 
     | 
    
         
             
                def initialize_factory(profile_directory)
         
     | 
| 
       66 
66 
     | 
    
         
             
                  profiles = load_profiles(profile_directory)
         
     | 
| 
       67 
     | 
    
         
            -
                  profiles. 
     | 
| 
       68 
     | 
    
         
            -
                    @detector_factory.add_profile(profile 
     | 
| 
      
 67 
     | 
    
         
            +
                  profiles.each do |profile|
         
     | 
| 
      
 68 
     | 
    
         
            +
                    @detector_factory.add_profile(profile)
         
     | 
| 
       69 
69 
     | 
    
         
             
                  end
         
     | 
| 
       70 
70 
     | 
    
         
             
                end
         
     | 
| 
       71 
71 
     | 
    
         | 
    
        data/lib/langusta/detector.rb
    CHANGED
    
    | 
         @@ -13,7 +13,7 @@ module Langusta 
     | 
|
| 
       13 
13 
     | 
    
         
             
                def initialize(factory)
         
     | 
| 
       14 
14 
     | 
    
         
             
                  @word_lang_prob_map = factory.word_lang_prob_map
         
     | 
| 
       15 
15 
     | 
    
         
             
                  @lang_list = factory.lang_list
         
     | 
| 
       16 
     | 
    
         
            -
                  @text =  
     | 
| 
      
 16 
     | 
    
         
            +
                  @text = []
         
     | 
| 
       17 
17 
     | 
    
         
             
                  @langprob = nil
         
     | 
| 
       18 
18 
     | 
    
         
             
                  @alpha = ALPHA_DEFAULT
         
     | 
| 
       19 
19 
     | 
    
         
             
                  @n_trial = 7
         
     | 
| 
         @@ -25,13 +25,15 @@ module Langusta 
     | 
|
| 
       25 
25 
     | 
    
         
             
                # Append more text to be recognized.
         
     | 
| 
       26 
26 
     | 
    
         
             
                # @param text [UCS2String] text to be recognized
         
     | 
| 
       27 
27 
     | 
    
         
             
                def append(text)
         
     | 
| 
       28 
     | 
    
         
            -
                   
     | 
| 
       29 
     | 
    
         
            -
             
     | 
| 
       30 
     | 
    
         
            -
                  text.gsub!(RegexHelper:: 
     | 
| 
      
 28 
     | 
    
         
            +
                  Guard.klass(text, Array, __method__)
         
     | 
| 
      
 29 
     | 
    
         
            +
             
     | 
| 
      
 30 
     | 
    
         
            +
                  text = Codepoints.gsub!(text, RegexHelper::URL_REGEX, "\x00\x20")
         
     | 
| 
      
 31 
     | 
    
         
            +
                  text = Codepoints.gsub!(text, RegexHelper::MAIL_REGEX, "\x00\x20")
         
     | 
| 
      
 32 
     | 
    
         
            +
             
     | 
| 
       31 
33 
     | 
    
         
             
                  text = text.map do |c|
         
     | 
| 
       32 
34 
     | 
    
         
             
                    NGram.normalize(c)
         
     | 
| 
       33 
35 
     | 
    
         
             
                  end
         
     | 
| 
       34 
     | 
    
         
            -
                  @text =  
     | 
| 
      
 36 
     | 
    
         
            +
                  @text = Codepoints.gsub!(text, RegexHelper::SPACE_REGEX, "\x00\x20")
         
     | 
| 
       35 
37 
     | 
    
         
             
                end
         
     | 
| 
       36 
38 
     | 
    
         | 
| 
       37 
39 
     | 
    
         
             
                # Detect the language.
         
     | 
| 
         @@ -102,17 +104,17 @@ module Langusta 
     | 
|
| 
       102 
104 
     | 
    
         | 
| 
       103 
105 
     | 
    
         
             
                def cleaning_text
         
     | 
| 
       104 
106 
     | 
    
         
             
                  non_latin_count = latin_count = 0
         
     | 
| 
       105 
     | 
    
         
            -
                  @text. 
     | 
| 
       106 
     | 
    
         
            -
                    if c < " 
     | 
| 
      
 107 
     | 
    
         
            +
                  @text.each do |c|
         
     | 
| 
      
 108 
     | 
    
         
            +
                    if c < 0x007a && c > 0x0041 # c > "z" && c < "A"
         
     | 
| 
       107 
109 
     | 
    
         
             
                      latin_count += 1
         
     | 
| 
       108 
     | 
    
         
            -
                    elsif c >=  
     | 
| 
      
 110 
     | 
    
         
            +
                    elsif c >= 0x3000 && UnicodeBlock.of(c) != UnicodeBlock::LATIN_EXTENDED_ADDITIONAL
         
     | 
| 
       109 
111 
     | 
    
         
             
                      non_latin_count += 1
         
     | 
| 
       110 
112 
     | 
    
         
             
                    end
         
     | 
| 
       111 
113 
     | 
    
         
             
                  end
         
     | 
| 
       112 
114 
     | 
    
         
             
                  if latin_count * 2 < non_latin_count
         
     | 
| 
       113 
     | 
    
         
            -
                    text_without_latin =  
     | 
| 
       114 
     | 
    
         
            -
                    @text. 
     | 
| 
       115 
     | 
    
         
            -
                      text_without_latin << c if c > " 
     | 
| 
      
 115 
     | 
    
         
            +
                    text_without_latin = []
         
     | 
| 
      
 116 
     | 
    
         
            +
                    @text.each do |c|
         
     | 
| 
      
 117 
     | 
    
         
            +
                      text_without_latin << c if c > 0x007a || c < 0x0041 # c > "z" || c < "A"
         
     | 
| 
       116 
118 
     | 
    
         
             
                    end
         
     | 
| 
       117 
119 
     | 
    
         
             
                    @text = text_without_latin
         
     | 
| 
       118 
120 
     | 
    
         
             
                  end
         
     | 
| 
         @@ -121,7 +123,7 @@ module Langusta 
     | 
|
| 
       121 
123 
     | 
    
         
             
                def extract_ngrams
         
     | 
| 
       122 
124 
     | 
    
         
             
                  list = []
         
     | 
| 
       123 
125 
     | 
    
         
             
                  ngram = NGram.new
         
     | 
| 
       124 
     | 
    
         
            -
                  @text. 
     | 
| 
      
 126 
     | 
    
         
            +
                  @text.each do |char|
         
     | 
| 
       125 
127 
     | 
    
         
             
                    ngram.add_char(char)
         
     | 
| 
       126 
128 
     | 
    
         
             
                    (1..NGram::N_GRAM).each do |n|
         
     | 
| 
       127 
129 
     | 
    
         
             
                      w = ngram.get(n)
         
     | 
| 
         @@ -169,7 +171,8 @@ module Langusta 
     | 
|
| 
       169 
171 
     | 
    
         
             
                  # verbose
         
     | 
| 
       170 
172 
     | 
    
         
             
                  weight = alpha / BASE_FREQ
         
     | 
| 
       171 
173 
     | 
    
         
             
                  prob.length.times do |i|
         
     | 
| 
       172 
     | 
    
         
            -
                     
     | 
| 
      
 174 
     | 
    
         
            +
                    # tiny workaround for nil values in word freq array
         
     | 
| 
      
 175 
     | 
    
         
            +
                    prob[i] *= weight + (lang_prob_map[i] || 0.0)
         
     | 
| 
       173 
176 
     | 
    
         
             
                  end
         
     | 
| 
       174 
177 
     | 
    
         
             
                  true
         
     | 
| 
       175 
178 
     | 
    
         
             
                end
         
     |