RubyGems - langusta - Versions diffs - 0.1.1 → 0.2.0 - Mend

langusta 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37) hide show

data/.travis.yml +7 -0
data/Gemfile +10 -7
data/Gemfile.lock +12 -16
data/{README.rdoc → README.md} +27 -10
data/Rakefile +3 -10
data/VERSION +1 -1
data/langusta.gemspec +23 -47
data/lib/langusta.rb +36 -10
data/lib/langusta/codepoints.rb +19 -0
data/lib/langusta/command.rb +3 -3
data/lib/langusta/detector.rb +16 -13
data/lib/langusta/detector_factory.rb +11 -5
data/lib/langusta/guard.rb +22 -0
data/lib/langusta/inspector.rb +7 -0
data/lib/langusta/java_property_reader.rb +2 -3
data/lib/langusta/lang_profile.rb +12 -18
data/lib/langusta/language_detection_facade.rb +2 -2
data/lib/langusta/n_gram.rb +20 -25
data/lib/langusta/regex_helper.rb +15 -10
data/lib/langusta/tag_extractor.rb +5 -5
data/lib/langusta/unicode_block.rb +34 -34
data/test/helper.rb +12 -3
data/test/quality/test_falsified.rb +3 -3
data/test/test_command.rb +1 -0
data/test/test_detector.rb +18 -17
data/test/test_detector_factory.rb +17 -5
data/test/test_java_property_reader.rb +2 -1
data/test/test_lang_profile.rb +37 -31
data/test/test_language.rb +1 -0
data/test/test_language_detection_facade.rb +1 -1
data/test/test_langusta.rb +6 -6
data/test/test_n_gram.rb +87 -75
data/test/test_tag_extractor.rb +19 -18
data/test/test_unicode_block.rb +2 -1
metadata +54 -156
data/lib/langusta/ucs2_string.rb +0 -70
data/test/test_ucs2_string.rb +0 -9

data/test/test_tag_extractor.rb CHANGED Viewed

@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 require 'test/helper'
 class TagExtractorTest < Test::Unit::TestCase
@@ -6,22 +7,22 @@ class TagExtractorTest < Test::Unit::TestCase
     assert_nil(extractor.target)
     assert_equal(0, extractor.threshold)
-    extractor2 = TagExtractor.new(UCS2String.from_utf8("abstract"), 10)
-    assert_equal(UCS2String.from_utf8("abstract"), extractor2.target)
+    extractor2 = TagExtractor.new(str2cp("abstract"), 10)
+    assert_equal(str2cp("abstract"), extractor2.target)
     assert_equal(10, extractor2.threshold)
   end
   def test_set_tag
     extractor = TagExtractor.new(nil, 0)
-    extractor.tag = UCS2String.from_utf8("")
-    assert_equal(UCS2String.from_utf8(""), extractor.tag)
+    extractor.tag = str2cp("")
+    assert_equal(str2cp(""), extractor.tag)
     extractor.tag = nil
     assert_nil(extractor.tag)
   end
   def test_add
     extractor = TagExtractor.new(nil, 0)
-    extractor.add(UCS2String.from_utf8(""))
+    extractor.add(str2cp(""))
     extractor.add(nil)
   end
@@ -32,13 +33,13 @@ class TagExtractorTest < Test::Unit::TestCase
   end
   def test_normal_scenario
-    extractor = TagExtractor.new(UCS2String.from_utf8("abstract"), 10)
+    extractor = TagExtractor.new(str2cp("abstract"), 10)
     assert_equal(0, extractor.count)
     profile = LangProfile.new("en")
     # normal
-    extractor.tag = UCS2String.from_utf8("abstract")
-    extractor.add(UCS2String.from_utf8("This is a sample text."))
+    extractor.tag = str2cp("abstract")
+    extractor.add(str2cp("This is a sample text."))
     extractor.close_tag(profile)
     assert_equal(1, extractor.count)
     assert_equal(17, profile.n_words[0])
@@ -46,26 +47,26 @@ class TagExtractorTest < Test::Unit::TestCase
     assert_equal(17, profile.n_words[2])
     # too short
-    extractor.tag = UCS2String.from_utf8("abstract")
-    extractor.add(UCS2String.from_utf8("sample"))
+    extractor.tag = str2cp("abstract")
+    extractor.add(str2cp("sample"))
     extractor.close_tag(profile)
     assert_equal(1, extractor.count)
     # other tags
-    extractor.tag = UCS2String.from_utf8("div")
-    extractor.add(UCS2String.from_utf8("This is a sample text which is enough long."))
+    extractor.tag = str2cp("div")
+    extractor.add(str2cp("This is a sample text which is enough long."))
     extractor.close_tag(profile)
     assert_equal(1, extractor.count)
   end
   def test_clear
-    extractor = TagExtractor.new(UCS2String.from_utf8("abstract"), 10)
-    extractor.tag = UCS2String.from_utf8("abstract")
-    extractor.add(UCS2String.from_utf8("This is a sample text."))
-    assert_equal(UCS2String.from_utf8("This is a sample text."), extractor.buffer)
-    assert_equal(UCS2String.from_utf8("abstract"), extractor.tag)
+    extractor = TagExtractor.new(str2cp("abstract"), 10)
+    extractor.tag = str2cp("abstract")
+    extractor.add(str2cp("This is a sample text."))
+    assert_equal(str2cp("This is a sample text."), extractor.buffer)
+    assert_equal(str2cp("abstract"), extractor.tag)
     extractor.clear
-    assert_equal(UCS2String.from_utf8(""), extractor.buffer)
+    assert_equal(str2cp(""), extractor.buffer)
     assert_nil(extractor.tag)
   end
 end

data/test/test_unicode_block.rb CHANGED Viewed

@@ -1,8 +1,9 @@
+# -*- coding: utf-8 -*-
 require 'test/helper'
 class UnicodeBlockTest < Test::Unit::TestCase
   def test_upper_case
-    ["\x00\x47", "\x01\x10", "\x01\x64", "\x03\xd5", "\x04\xa2", "\x10\xc3", "\x21\x60", "\xa7\x60"].each do |cp|
+    [0x0047, 0x0110, 0x0164, 0x03d5, 0x04a2, 0x10c3, 0x2160, 0xa760].each do |cp|
       assert(Langusta::UnicodeBlock.is_upper_case?(cp))
     end
   end

metadata CHANGED Viewed

@@ -1,144 +1,64 @@
---- !ruby/object:Gem::Specification
+--- !ruby/object:Gem::Specification
 name: langusta
-version: !ruby/object:Gem::Version
-  hash: 25
+version: !ruby/object:Gem::Version
+  version: 0.2.0
   prerelease:
-  segments:
-  - 0
-  - 1
-  - 1
-  version: 0.1.1
 platform: ruby
-authors:
+authors:
 - Jan Szumiec
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2011-04-10 00:00:00 +02:00
-default_executable: langusta
-dependencies:
-- !ruby/object:Gem::Dependency
-  type: :runtime
-  requirement: &id001 !ruby/object:Gem::Requirement
-    none: false
-    requirements:
-    - - "="
-      - !ruby/object:Gem::Version
-        hash: 19
-        segments:
-        - 1
-        - 1
-        - 0
-        version: 1.1.0
-  name: oniguruma
-  version_requirements: *id001
-  prerelease: false
-- !ruby/object:Gem::Dependency
-  type: :runtime
-  requirement: &id002 !ruby/object:Gem::Requirement
+date: 2012-03-04 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: yajl-ruby
+  requirement: &2152186680 !ruby/object:Gem::Requirement
     none: false
-    requirements:
-    - - "="
-      - !ruby/object:Gem::Version
-        hash: 59
-        segments:
-        - 0
-        - 8
-        - 2
+    requirements:
+    - - =
+      - !ruby/object:Gem::Version
         version: 0.8.2
-  name: yajl-ruby
-  version_requirements: *id002
+  type: :runtime
   prerelease: false
-- !ruby/object:Gem::Dependency
-  type: :development
-  requirement: &id003 !ruby/object:Gem::Requirement
-    none: false
-    requirements:
-    - - ~>
-      - !ruby/object:Gem::Version
-        hash: 23
-        segments:
-        - 1
-        - 0
-        - 0
-        version: 1.0.0
+  version_requirements: *2152186680
+- !ruby/object:Gem::Dependency
   name: bundler
-  version_requirements: *id003
-  prerelease: false
-- !ruby/object:Gem::Dependency
-  type: :development
-  requirement: &id004 !ruby/object:Gem::Requirement
-    none: false
-    requirements:
-    - - ~>
-      - !ruby/object:Gem::Version
-        hash: 7
-        segments:
-        - 1
-        - 5
-        - 2
-        version: 1.5.2
-  name: jeweler
-  version_requirements: *id004
-  prerelease: false
-- !ruby/object:Gem::Dependency
-  type: :development
-  requirement: &id005 !ruby/object:Gem::Requirement
+  requirement: &2152184160 !ruby/object:Gem::Requirement
     none: false
-    requirements:
-    - - ">="
-      - !ruby/object:Gem::Version
-        hash: 3
-        segments:
-        - 0
-        version: "0"
-  name: rcov
-  version_requirements: *id005
-  prerelease: false
-- !ruby/object:Gem::Dependency
-  type: :development
-  requirement: &id006 !ruby/object:Gem::Requirement
-    none: false
-    requirements:
-    - - ">="
-      - !ruby/object:Gem::Version
-        hash: 3
-        segments:
-        - 0
-        version: "0"
-  name: mocha
-  version_requirements: *id006
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
   prerelease: false
-- !ruby/object:Gem::Dependency
-  type: :development
-  requirement: &id007 !ruby/object:Gem::Requirement
+  version_requirements: *2152184160
+- !ruby/object:Gem::Dependency
+  name: jeweler
+  requirement: &2152182040 !ruby/object:Gem::Requirement
     none: false
-    requirements:
-    - - ">="
-      - !ruby/object:Gem::Version
-        hash: 3
-        segments:
-        - 0
-        version: "0"
-  name: ruby-debug
-  version_requirements: *id007
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
   prerelease: false
+  version_requirements: *2152182040
 description: Highly accurate language detection library, uses naive bayesian filter.
 email: jan.szumiec@gmail.com
-executables:
+executables:
 - langusta
 extensions: []
-extra_rdoc_files:
+extra_rdoc_files:
 - LICENSE.txt
-- README.rdoc
-files:
+- README.md
+files:
 - .document
+- .travis.yml
 - Gemfile
 - Gemfile.lock
 - LICENSE.txt
-- README.rdoc
+- README.md
 - Rakefile
 - VERSION
 - bin/langusta
@@ -146,9 +66,12 @@ files:
 - data/uppercase.bin
 - langusta.gemspec
 - lib/langusta.rb
+- lib/langusta/codepoints.rb
 - lib/langusta/command.rb
 - lib/langusta/detector.rb
 - lib/langusta/detector_factory.rb
+- lib/langusta/guard.rb
+- lib/langusta/inspector.rb
 - lib/langusta/java_property_reader.rb
 - lib/langusta/lang_profile.rb
 - lib/langusta/language.rb
@@ -156,7 +79,6 @@ files:
 - lib/langusta/n_gram.rb
 - lib/langusta/regex_helper.rb
 - lib/langusta/tag_extractor.rb
-- lib/langusta/ucs2_string.rb
 - lib/langusta/unicode_block.rb
 - profiles/af
 - profiles/ar
@@ -267,54 +189,30 @@ files:
 - test/test_langusta.rb
 - test/test_n_gram.rb
 - test/test_tag_extractor.rb
-- test/test_ucs2_string.rb
 - test/test_unicode_block.rb
-has_rdoc: true
 homepage: http://github.com/jasiek/langusta
-licenses:
+licenses:
 - Apache 2.0
 post_install_message:
 rdoc_options: []
-require_paths:
+require_paths:
 - lib
-required_ruby_version: !ruby/object:Gem::Requirement
+required_ruby_version: !ruby/object:Gem::Requirement
   none: false
-  requirements:
-  - - ">="
-    - !ruby/object:Gem::Version
-      hash: 3
-      segments:
-      - 0
-      version: "0"
-required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
   none: false
-  requirements:
-  - - ">="
-    - !ruby/object:Gem::Version
-      hash: 3
-      segments:
-      - 0
-      version: "0"
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 1.5.1
+rubygems_version: 1.8.17
 signing_key:
 specification_version: 3
 summary: Language detection library based on http://code.google.com/p/language-detection/.
-test_files:
-- test/helper.rb
-- test/quality/test_falsified.rb
-- test/test_command.rb
-- test/test_detector.rb
-- test/test_detector_factory.rb
-- test/test_java_property_reader.rb
-- test/test_lang_profile.rb
-- test/test_language.rb
-- test/test_language_detection_facade.rb
-- test/test_langusta.rb
-- test/test_n_gram.rb
-- test/test_tag_extractor.rb
-- test/test_ucs2_string.rb
-- test/test_unicode_block.rb
+test_files: []

data/lib/langusta/ucs2_string.rb DELETED Viewed

@@ -1,70 +0,0 @@
-module Langusta
-  class UCS2String
-    include Enumerable
-    UTF8_TO_UCS2BE_ICONV = Iconv.new('ucs-2be', 'utf-8')
-    UCS2BE_TO_UCS2BE_ICONV = Iconv.new('ucs-2be', 'ucs-2be')
-    attr_reader :underlying
-    def self.from_utf8(utf8_string)
-      self.new(UTF8_TO_UCS2BE_ICONV.iconv(utf8_string))
-    end
-    def initialize(underlying)
-      @underlying = UCS2BE_TO_UCS2BE_ICONV.iconv(underlying)
-    end
-    def [](index)
-      @underlying[index / 2, 2]
-    end
-    def gsub!(oregexp, subst)
-      oregexp.gsub!(@underlying, subst)
-      self
-    end
-    def map(&blk)
-      mapped = []
-      each_char do |char|
-        mapped << blk.call(char)
-      end
-      return UCS2String.new(mapped.join)
-    end
-    def hash
-      @underlying.hash
-    end
-    def <<(ucs2string)
-      case ucs2string
-      when UCS2String
-        @underlying += ucs2string.underlying
-      when String
-        @underlying += ucs2string
-      else
-        raise TypeError
-      end
-      self
-    end
-    def each_char(&blk)
-      (0..(@underlying.length - 2)).step(2) do |index|
-        blk.call(@underlying[index, 2])
-      end
-    end
-    alias :each :each_char
-    def eql?(other)
-      other.is_a?(UCS2String) && self.underlying.eql?(other.underlying)
-    end
-    def ==(other)
-      self.underlying == other.underlying
-    end
-    def size
-      @underlying.size / 2
-    end
-    alias :length :size
-  end
-end

data/test/test_ucs2_string.rb DELETED Viewed

@@ -1,9 +0,0 @@
-require 'test/helper'
-class UCS2StringTest < Test::Unit::TestCase
-  def test_invalid_unicode_sequences_raise_an_error
-    assert_raises(Iconv::IllegalSequence) do
-      UCS2String.from_utf8("\xc0")
-    end
-  end
-end