RubyGems - langusta - Versions diffs - 0.1.1 → 0.2.0 - Mend

langusta 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37)

data/.travis.yml +7 -0
data/Gemfile +10 -7
data/Gemfile.lock +12 -16
data/{README.rdoc → README.md} +27 -10
data/Rakefile +3 -10
data/VERSION +1 -1
data/langusta.gemspec +23 -47
data/lib/langusta.rb +36 -10
data/lib/langusta/codepoints.rb +19 -0
data/lib/langusta/command.rb +3 -3
data/lib/langusta/detector.rb +16 -13
data/lib/langusta/detector_factory.rb +11 -5
data/lib/langusta/guard.rb +22 -0
data/lib/langusta/inspector.rb +7 -0
data/lib/langusta/java_property_reader.rb +2 -3
data/lib/langusta/lang_profile.rb +12 -18
data/lib/langusta/language_detection_facade.rb +2 -2
data/lib/langusta/n_gram.rb +20 -25
data/lib/langusta/regex_helper.rb +15 -10
data/lib/langusta/tag_extractor.rb +5 -5
data/lib/langusta/unicode_block.rb +34 -34
data/test/helper.rb +12 -3
data/test/quality/test_falsified.rb +3 -3
data/test/test_command.rb +1 -0
data/test/test_detector.rb +18 -17
data/test/test_detector_factory.rb +17 -5
data/test/test_java_property_reader.rb +2 -1
data/test/test_lang_profile.rb +37 -31
data/test/test_language.rb +1 -0
data/test/test_language_detection_facade.rb +1 -1
data/test/test_langusta.rb +6 -6
data/test/test_n_gram.rb +87 -75
data/test/test_tag_extractor.rb +19 -18
data/test/test_unicode_block.rb +2 -1
metadata +54 -156
data/lib/langusta/ucs2_string.rb +0 -70
data/test/test_ucs2_string.rb +0 -9

data/lib/langusta/detector_factory.rb CHANGED

@@ -1,5 +1,7 @@
 module Langusta
   class DetectorFactory
+    include Inspector
     attr_reader :word_lang_prob_map, :lang_list
     def initialize
@@ -11,15 +13,15 @@ module Langusta
     # @param [LangProfile] language profile to be added.
     # @param [Fixnum] index at which the language profile is to be added.
     # @param [Fixnum] counts how many language profiles are to be added to this factory in total.
-    def add_profile(profile, index, langsize)
+    def add_profile(profile)
       raise DuplicateProfilesError.new(profile.name) if @lang_list.include?(profile.name)
       @lang_list << profile.name
+      last_lang_index = @lang_list.size - 1
       profile.freq.keys.each do |word|
-        if not @word_lang_prob_map.has_key?(word)
-          @word_lang_prob_map[word] = Array.new(langsize, 0.0)
-        end
+        @word_lang_prob_map[word] ||= []
         prob = 1.0 * profile.freq[word] / profile.n_words[word.length - 1]
-        @word_lang_prob_map[word][index] = prob
+        @word_lang_prob_map[word][last_lang_index] = prob
       end
     end
@@ -35,6 +37,10 @@ module Langusta
       end
     end
+    def inspect
+      "#<#{self.class.name}:0x#{object_ptr} (#{@lang_list.size} profile(s))"
+    end
     private
     def create_detector
       raise NoProfilesLoadedError if @lang_list.empty?

data/lib/langusta/guard.rb ADDED

@@ -0,0 +1,22 @@
+module Langusta
+  module Guard
+    def self.klass(argument, klass, _method)
+      return unless $debug
+      raise TypeError.new("#{_method}: expected #{klass} got: #{argument.class}") unless argument.is_a?(klass)
+    end
+    def self.codepoint(codepoint, _method)
+      return unless $debug
+      raise ArgumentError.new([_method, ':', codepoint.to_s(16)].join) unless (0x0000..0xffff).include?(codepoint)
+    end
+    def self.codepoint_array(array, _method)
+      return unless $debug
+      raise TypeError.new("#{_method}: expected Array, got: #{array.class}") unless array.is_a?(Array)
+      cp = array.find do |cp|
+        ! (0x0000..0xffff).include?(cp)
+      end && (raise ArgumentError.new("#{_method}: bad codepoint: #{cp}"))
+    end
+  end
+end

data/lib/langusta/inspector.rb ADDED

@@ -0,0 +1,7 @@
+module Langusta
+  module Inspector
+    def object_ptr
+      (object_id * 2).to_s(16)
+    end
+  end
+end

data/lib/langusta/java_property_reader.rb CHANGED

@@ -3,7 +3,7 @@ module Langusta
     # This is a minimal implementation, don't expect this to actually work.
     def initialize(filename)
-      @lines = File.open(filename).read
+      @lines = File.open(filename).readlines
       parse()
     end
@@ -28,8 +28,7 @@ module Langusta
       codepoints = value.scan(/([0-9A-F]{4})/)
       codepoints.map do |cp|
         int_cp = cp.first.to_i(16)
-        [int_cp / 256, int_cp % 256].pack("c*")
-      end.join
+      end
     end
   end
 end

data/lib/langusta/lang_profile.rb CHANGED

@@ -11,34 +11,29 @@ module Langusta
     # @return [LangProfile]
     def self.load_from_file(filename)
       json = Yajl::Parser.parse(File.new(filename))
-      profile = self.new
-      name = json['name']
-      n_words = json['n_words']
       freq = json['freq'].inject({}) do |acc, kv|
         key, value = kv
-        acc[UCS2String.from_utf8(key)] = value
+        acc[Langusta.utf82cp(key)] = value
         acc
       end
-      profile.populate_json(name, freq, n_words)
-      profile
-    end
-    def initialize(name=nil)
-      @name = name
-      @freq = {}
-      @n_words = Array.new(NGram::N_GRAM, 0)
+      self.new(json['name'] || (raise CorruptProfileError.new("Missing profile name")),
+               freq,
+               json['n_words'] || (raise CorruptProfileError.new("Missing number of words value")))
     end
-    def populate_json(name, freq, n_words)
+    def initialize(name, freq={}, n_words = Array.new(NGram::N_GRAM, 0))
+      Guard.klass(name, String, __method__)
       @name, @freq, @n_words = name, freq, n_words
     end
     # Adds a given NGram to this language profile. This operation is expected to be invoked multiple times for the same arguments.
-    # @param gram [UCS2String]
+    # @param gram [Array<Fixnum>]
     def add(gram)
-      raise TypeError.new("UCS2String or NilClass expected, got: #{gram.class}") unless gram.is_a?(UCS2String) or gram.is_a?(NilClass)
-      return if @name.nil? or gram.nil?
+      return if gram.nil?
+      Guard.klass(gram, Array, __method__)
       length = gram.size
       return if length < 1 or length > NGram::N_GRAM
       @n_words[length - 1] += 1
@@ -47,7 +42,6 @@ module Langusta
     end
     def omit_less_freq
-      return if @name.nil?
       threshold = @n_words[0] / LESS_FREQ_RATIO
       threshold = MINIMUM_FREQ if threshold < MINIMUM_FREQ
       keys = Set.new(@freq.keys)
@@ -59,7 +53,7 @@ module Langusta
           @freq.delete(key)
         else
           # temp workaround
-          if RegexHelper::ROMAN_REGEX.match(key.underlying)
+          if RegexHelper::ROMAN_REGEX.match(Langusta.cp2utf8(key))
             roman += count
           end
         end
@@ -69,7 +63,7 @@ module Langusta
         keys2 = Set.new(@freq.keys)
         keys2.each do |key|
           # temp workaround
-          if RegexHelper::INCL_ROMAN_REGEX.match(key.underlying)
+          if RegexHelper::INCL_ROMAN_REGEX.match(Langusta.cp2utf8(key))
             @n_words[key.size - 1] -= @freq[key]
             @freq.delete(key)
           end

data/lib/langusta/language_detection_facade.rb CHANGED

@@ -3,8 +3,8 @@ module Langusta
     def initialize
       @factory = DetectorFactory.new
       profiles = load_profiles()
-      profiles.each_with_index do |profile, index|
-        @factory.add_profile(profile, index, profiles.length)
+      profiles.each do |profile|
+        @factory.add_profile(profile)
       end
     end

data/lib/langusta/n_gram.rb CHANGED

@@ -3,17 +3,10 @@ module Langusta
   # constructed on a character by character basis.
   class NGram
     N_GRAM = 3
-    UCS2_SPACE = "\x00\x20"
+    UCS2_SPACE = 0x0020
     def self.calculate_latin1_excluded
-      internal_hash = JavaPropertyReader.new(MESSAGES_PROPERTIES).underlying_hash
-      _, value = internal_hash.find do |k, v|
-        k == "NGram.LATIN1_EXCLUDE"
-      end
-      (0..(value.length - 2)).step(2).map do |index|
-        value[index, 2]
-      end
+      JavaPropertyReader.new(MESSAGES_PROPERTIES)["NGram.LATIN1_EXCLUDE"]
     end
     LATIN1_EXCLUDED = self.calculate_latin1_excluded()
@@ -28,10 +21,9 @@ module Langusta
       internal_hash.select do |key, _|
         /KANJI_[0-9]{1}/ =~ key
       end.each do |_, chars|
-        key = chars[0..1]
-        m[key] = key
-        (2..(chars.length - 2)).step(2) do |n|
-          m[chars[n, 2]] = key
+        key = chars.first
+        chars.each do |cp|
+          m[cp] = key
         end
       end
       m
@@ -44,27 +36,27 @@ module Langusta
       block = UnicodeBlock.of(ch)
       case block
       when UnicodeBlock::BASIC_LATIN
-        (ch < "\x00A" || (ch < "\x00a" && ch > "\x00Z") || ch > "\x00z") ? UCS2_SPACE : ch
+        (ch < 0x0041 || (ch < 0x0061 && ch > 0x005a) || ch > 0x007a) ? UCS2_SPACE : ch
       when UnicodeBlock::LATIN_1_SUPPLEMENT
         LATIN1_EXCLUDED.include?(ch) ? UCS2_SPACE : ch
       when UnicodeBlock::GENERAL_PUNCTUATION
         UCS2_SPACE
       when UnicodeBlock::ARABIC
-        (ch == "\x06\xcc") ? "\x06\x4a" : ch
+        (ch == 0x06cc) ? 0x064a : ch
       when UnicodeBlock::LATIN_EXTENDED_ADDITIONAL
-        (ch >= "\x1e\xa0") ? "\x1e\xc3" : ch
+        (ch >= 0x1ea0) ? 0x1ec3 : ch
       when UnicodeBlock::HIRAGANA
-        "\x30\x42"
+        0x3042
       when UnicodeBlock::KATAKANA
-        "\x30\xa2"
+        0x30a2
       when UnicodeBlock::BOPOMOFO
-        "\x31\x05"
+        0x3105
       when UnicodeBlock::BOPOMOFO_EXTENDED
-        "\x31\x05"
+        0x3105
       when UnicodeBlock::CJK_UNIFIED_IDEOGRAPHS
         cjk_map.has_key?(ch) ? cjk_map[ch] : ch
       when UnicodeBlock::HANGUL_SYLLABES
-        "\xac\x00"
+        0xac00
       else
         ch
       end
@@ -77,22 +69,25 @@ module Langusta
     # Retrieves an n-sized NGram from the current sequence.
     # @param n [Integer] length of NGram.
-    # @return [UCS2String] n-sized NGram.
+    # @return [Array<Integer>] n-sized NGram.
     def get(n)
       return nil if @capitalword
       len = @grams.length
       return nil if n < 1 || n > 3 || len < n
       if n == 1
         ch = @grams[len - 1]
-        return (ch == UCS2_SPACE) ? nil : UCS2String.new(ch)
+        return (ch == UCS2_SPACE) ? nil : [ch]
       else
-        return UCS2String.new(@grams[len - n, len].join)
+        return @grams[len - n, len]
       end
     end
     # Adds a single character to an NGram sequence.
-    # @param character [String[2]] Two-byte Unicode codepoint.
+    # @param character [Fixnum] Two-byte Unicode codepoint.
     def add_char(character)
+      Guard.klass(character, Fixnum, __method__)
+      Guard.codepoint(character, __method__)
       character = NGram.normalize(character)
       lastchar = @grams[-1]
       if lastchar == UCS2_SPACE

data/lib/langusta/regex_helper.rb CHANGED

@@ -1,15 +1,20 @@
 module Langusta
   module RegexHelper
-    include Oniguruma
-    def self._u16(string)
-      string.unpack("U*").pack("n*")
+    if RUBY_VERSION < "1.9"
+      include Oniguruma
+      ROMAN_REGEX = ORegexp.new("^[a-z]$", :options => OPTION_IGNORECASE)
+      INCL_ROMAN_REGEX = ORegexp.new(".*[a-z].*", :options => OPTION_IGNORECASE)
+      URL_REGEX = ORegexp.new("https?://[-_.?&~;+=/#0-9a-z]+", :options => OPTION_IGNORECASE)
+      MAIL_REGEX = ORegexp.new("[-_.0-9a-z]+@[-_0-9a-z]+[-_.0-9a-z]+", :options => OPTION_IGNORECASE)
+      SPACE_REGEX = ORegexp.new(" +")
+    else
+      # /ui stands for UTF-8 case-insensitive regexp.
+      ROMAN_REGEX = /^[a-z]$/ui
+      INCL_ROMAN_REGEX = /.*[a-z].*/ui
+      URL_REGEX = Regexp.new("https?://[-_.?&~;+=/#a-z0-9]+")
+      MAIL_REGEX = /[-_.a-z0-9]+@[-_a-z0-9]+[-_.a-z0-9]+/ui
+      SPACE_REGEX = / +/
     end
-    ROMAN_REGEX = ORegexp.new(_u16("^[A-Za-z]$"), "", "UTF16_BE", "java")
-    INCL_ROMAN_REGEX = ORegexp.new(_u16(".*[A-Za-z].*"), "", "UTF16_BE", "java")
-    URL_REGEX = ORegexp.new(_u16("https?://[-_.?&~;+=/#0-9A-Za-z]+"), "", "UTF16_BE", "java")
-    MAIL_REGEX = ORegexp.new(_u16("[-_.0-9A-Za-z]+@[-_0-9A-Za-z]+[-_.0-9A-Za-z]+"), "", "UTF_16BE", "java")
-    SPACE_REGEX = ORegexp.new(_u16(" +"), "", "UTF16_BE", "java")
   end
 end

data/lib/langusta/tag_extractor.rb CHANGED

@@ -7,26 +7,26 @@ module Langusta
       @target = tag
       @threshold = threshold
       @count = 0
-      @buffer = UCS2String.new("")
+      @buffer = []
       @tag = nil
     end
     def add(line)
       if @target == @tag && line
-        @buffer << line
+        @buffer += line
       end
     end
     def clear
       @tag = nil
-      @buffer = UCS2String.new("")
+      @buffer = []
     end
     def close_tag(profile)
       if profile && @tag == @target && @buffer.length > @threshold
         gram = NGram.new
-        @buffer.each_char do |char|
-          gram.add_char(char)
+        @buffer.each do |codepoint|
+          gram.add_char(codepoint)
           (1..NGram::N_GRAM).each do |n|
             profile.add(gram.get(n))
           end

data/lib/langusta/unicode_block.rb CHANGED

@@ -2,44 +2,44 @@ module Langusta
   module UnicodeBlock
     # Half-baked implementation of Java's UnicodeBlock.
-    OTHER = 0
-    BASIC_LATIN = 1
-    LATIN_1_SUPPLEMENT = 2
-    GENERAL_PUNCTUATION = 3
-    ARABIC = 4
-    LATIN_EXTENDED_ADDITIONAL = 5
-    HIRAGANA = 6
-    KATAKANA = 7
-    BOPOMOFO = 8
-    BOPOMOFO_EXTENDED = 9
-    CJK_UNIFIED_IDEOGRAPHS = 10
-    HANGUL_SYLLABES = 11
+    OTHER                           = 0
+    BASIC_LATIN                     = 1
+    LATIN_1_SUPPLEMENT              = 2
+    GENERAL_PUNCTUATION             = 3
+    ARABIC                          = 4
+    LATIN_EXTENDED_ADDITIONAL       = 5
+    HIRAGANA                        = 6
+    KATAKANA                        = 7
+    BOPOMOFO                        = 8
+    BOPOMOFO_EXTENDED               = 9
+    CJK_UNIFIED_IDEOGRAPHS          = 10
+    HANGUL_SYLLABES                 = 11
-    BASIC_LATIN_RANGE = "\x00\x00".."\x00\x7f"
-    LATIN_1_SUPPLEMENT_RANGE = "\x00\x80".."\x00\xff"
-    GENERAL_PUNCTUATION_RANGE = "\x20\x00".."\x20\x6f"
-    ARABIC_RANGE = "\x06\x00".."\x06\xff"
-    LATIN_EXTENDED_ADDITIONAL_RANGE = "\x1e\x00".."\x1e\xff"
-    HIRAGANA_RANGE = "\x30\x40".."\x30\x9f"
-    KATAKANA_RANGE = "\x30\xa0".."\x30\xff"
-    BOPOMOFO_RANGE = "\x31\x00".."\x31\xbf"
-    BOPOMOFO_EXTENDED_RANGE = "\x31\xa0".."\x31\xbf"
-    CJK_UNIFIED_IDEOGRAPHS_RANGE = "\x4e\x00".."\x9f\xff"
-    HANGUL_SYLLABES_RANGE = "\xac\x00".."\xd7\xaf"
+    BASIC_LATIN_RANGE               = 0x0000..0x007f
+    LATIN_1_SUPPLEMENT_RANGE        = 0x0080..0x00ff
+    GENERAL_PUNCTUATION_RANGE       = 0x2000..0x206f
+    ARABIC_RANGE                    = 0x0600..0x06ff
+    LATIN_EXTENDED_ADDITIONAL_RANGE = 0x1e00..0x1eff
+    HIRAGANA_RANGE                  = 0x3040..0x309f
+    KATAKANA_RANGE                  = 0x30a0..0x30ff
+    BOPOMOFO_RANGE                  = 0x3100..0x31bf
+    BOPOMOFO_EXTENDED_RANGE         = 0x31a0..0x31bf
+    CJK_UNIFIED_IDEOGRAPHS_RANGE    = 0x4e00..0x9fff
+    HANGUL_SYLLABES_RANGE           = 0xac00..0xd7af
     def self.of(character)
       case character
-      when BASIC_LATIN_RANGE then return BASIC_LATIN
-      when LATIN_1_SUPPLEMENT_RANGE then return LATIN_1_SUPPLEMENT
-      when GENERAL_PUNCTUATION_RANGE then return GENERAL_PUNCTUATION
-      when ARABIC_RANGE then return ARABIC
+      when BASIC_LATIN_RANGE               then return BASIC_LATIN
+      when LATIN_1_SUPPLEMENT_RANGE        then return LATIN_1_SUPPLEMENT
+      when GENERAL_PUNCTUATION_RANGE       then return GENERAL_PUNCTUATION
+      when ARABIC_RANGE                    then return ARABIC
       when LATIN_EXTENDED_ADDITIONAL_RANGE then return LATIN_EXTENDED_ADDITIONAL
-      when HIRAGANA_RANGE then return HIRAGANA
-      when KATAKANA_RANGE then return KATAKANA
-      when BOPOMOFO_RANGE then return BOPOMOFO
-      when BOPOMOFO_EXTENDED_RANGE then return BOPOMOFO_EXTENDED
-      when CJK_UNIFIED_IDEOGRAPHS_RANGE then return CJK_UNIFIED_IDEOGRAPHS
-      when HANGUL_SYLLABES_RANGE then return HANGUL_SYLLABES
+      when HIRAGANA_RANGE                  then return HIRAGANA
+      when KATAKANA_RANGE                  then return KATAKANA
+      when BOPOMOFO_RANGE                  then return BOPOMOFO
+      when BOPOMOFO_EXTENDED_RANGE         then return BOPOMOFO_EXTENDED
+      when CJK_UNIFIED_IDEOGRAPHS_RANGE    then return CJK_UNIFIED_IDEOGRAPHS
+      when HANGUL_SYLLABES_RANGE           then return HANGUL_SYLLABES
       else
         return OTHER
       end
@@ -50,7 +50,7 @@ module Langusta
     end
     def self.compute_upper_case_table
-      File.open(UPPERCASE_BIN).read
+      File.open(UPPERCASE_BIN).read.unpack('n*')
     end
   end
 end

data/test/helper.rb CHANGED

@@ -1,20 +1,29 @@
 require 'rubygems'
 require 'bundler'
 begin
-  Bundler.setup(:default, :development)
+  Bundler.setup(:default, :test)
 rescue Bundler::BundlerError => e
   $stderr.puts e.message
   $stderr.puts "Run `bundle install` to install missing gems"
   exit e.status_code
 end
 require 'test/unit'
+require 'mocha'
 $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
 $LOAD_PATH.unshift(File.dirname(__FILE__))
 require 'langusta'
-require 'ruby-debug'
-require 'mocha'
 class Test::Unit::TestCase
   include Langusta
+  def str2cp(ascii_string)
+    Langusta.utf82cp(ascii_string)
+  end
+  def utf82cp(utf8_string)
+    Langusta.utf82cp(utf8_string)
+  end
 end
+$debug = true