rmmseg 0.1.2 → 0.1.3
This diff shows the changes between package versions that have been publicly released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
- data/History.txt +6 -0
- data/TODO.txt +3 -0
- data/lib/rmmseg/algorithm.rb +22 -12
- data/lib/rmmseg/chunk.rb +16 -10
- data/lib/rmmseg/complex_algorithm.rb +6 -6
- data/lib/rmmseg/dictionary.rb +1 -1
- data/lib/rmmseg/ferret.rb +4 -11
- data/lib/rmmseg/token.rb +31 -10
- data/lib/rmmseg.rb +1 -1
- data/spec/simple_algorithm_spec.rb +4 -4
- metadata +2 -2
data/History.txt
CHANGED
@@ -1,3 +1,9 @@
+=== 0.1.3 / 2008-02-28
+
+* Make RMMSeg Token campatible to Ferret Token.
+* Use while instead of loop for performance improvement.
+* Avoid many costly String#jlength call for performance improvement (use only 70% time and 40% memory as before).
+
 === 0.1.2 / 2008-02-25

 * Add cache to find_match_words: performance improved.
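The String#jlength item above is the big win in this release: on Ruby 1.8 (with jcode loaded), jlength re-scans the whole string to count multibyte characters, so calling it after every appended character in find_match_words adds a lot of per-iteration work. 0.1.3 keeps an explicit character counter instead. Below is a minimal, self-contained sketch of the two shapes; the constants, sample characters and the jlength shim are illustrative, not the gem's code. (The "use while instead of loop" item is visible directly in the algorithm.rb and ferret.rb hunks further down.)

# Shim so the snippet also runs on modern Ruby, where String#jlength no longer exists.
class String
  unless method_defined?(:jlength)
    def jlength
      scan(/./mu).size   # count characters, not bytes
    end
  end
end

MAX_WORD_LENGTH = 4
chars = %w[中 文 分 词 测 试]

# 0.1.2 shape: re-measure the growing buffer on every iteration.
str = String.new
chars.each do |c|
  str << c
  break if str.jlength >= MAX_WORD_LENGTH   # scans the whole buffer each time
end

# 0.1.3 shape: keep a running character count alongside the buffer.
str = String.new
strlen = 0
chars.each do |c|
  break if strlen >= MAX_WORD_LENGTH        # constant-time check
  str << c
  strlen += 1
end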
data/TODO.txt
CHANGED
data/lib/rmmseg/algorithm.rb
CHANGED
@@ -9,6 +9,8 @@ module RMMSeg
   # words. This module is the common operations shared by
   # SimpleAlgorithm and ComplexAlgorithm .
   module Algorithm
+    MATCH_CACHE_MAX_LENGTH = 3
+
     # Initialize a new instance of Algorithm, the +text+ will
     # then be segmented by this instance.
     def initialize(text)
@@ -16,7 +18,8 @@ module RMMSeg
       @chars = text.each_char
       @index = 0
       @byte_index = 0
-      @match_cache = Array.new
+      @match_cache = Array.new(MATCH_CACHE_MAX_LENGTH)
+      @match_cache_idx = 0
     end

     # Get the next Token recognized.
@@ -45,10 +48,11 @@ module RMMSeg
     # of words.
     def segment
       words = Array.new
-
-
-
+
+      token = next_token
+      until token.nil?
         words << token.text
+        token = next_token
       end

       words
@@ -83,7 +87,7 @@ module RMMSeg
       @byte_index += i - @index
       @index = i

-      return Token.new(@text, start_pos, end_pos)
+      return Token.new(@text[start_pos...end_pos], start_pos, end_pos)
     end

     # Use rules to filter the +chunks+ to get the most
@@ -103,7 +107,7 @@ module RMMSeg
       end

       word = chunks[0][0]
-      token = Token.new(
+      token = Token.new(word.text, @byte_index, @byte_index+word.byte_size)

       @index += word.length
       @byte_index += word.byte_size
@@ -123,25 +127,31 @@ module RMMSeg

       dic = Dictionary.instance
       str = String.new
+      strlen = 0
       words = Array.new
       i = index
-
-
-
+
+      while i < chars.length &&
+          !basic_latin?(chars[i]) &&
+          strlen < Config.max_word_length
+
         str << chars[i]
+        strlen += 1
+
         if dic.has_word?(str)
           words << dic.get_word(str)
         end
         i += 1
-        break if str.jlength >= Config.max_word_length
       end

       if words.empty?
         words << Word.new(chars[index], Word::TYPES[:unrecognized])
       end

-      @match_cache
-      @
+      @match_cache[@match_cache_idx] = [index, words]
+      @match_cache_idx += 1
+      @match_cache_idx = 0 if @match_cache_idx == MATCH_CACHE_MAX_LENGTH
+
       words
     end
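The cache change above replaces the plain @match_cache array with a fixed-size one plus a write index that wraps around at MATCH_CACHE_MAX_LENGTH, so the oldest entry is simply overwritten. Only the write side appears in this hunk; the following standalone sketch of the same rotate-on-write pattern is illustrative (class and method names, and the lookup side, are not the gem's):

# A fixed-size, overwrite-oldest cache in the style of @match_cache above.
class RotatingCache
  def initialize(size)
    @size  = size
    @slots = Array.new(size)
    @idx   = 0
  end

  # Store a [key, value] pair, overwriting the oldest slot once full.
  def store(key, value)
    @slots[@idx] = [key, value]
    @idx += 1
    @idx = 0 if @idx == @size   # wrap around, as find_match_words does
  end

  # Linear scan over the handful of slots; nil on a miss.
  def fetch(key)
    entry = @slots.find { |slot| slot && slot[0] == key }
    entry && entry[1]
  end
end

cache = RotatingCache.new(3)    # MATCH_CACHE_MAX_LENGTH is 3 above
cache.store(7, [:some_words])
cache.fetch(7)   # => [:some_words]
cache.fetch(9)   # => nil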
data/lib/rmmseg/chunk.rb
CHANGED
@@ -4,7 +4,11 @@ module RMMSeg

     # The sum of length of all words.
     def self.total_length(words)
-
+      len = 0
+      for word in words
+        len += word.length
+      end
+      len
     end

     # The average length of words.
@@ -15,21 +19,23 @@ module RMMSeg
     # The square of the standard deviation of length of all words.
     def self.variance(words)
       avglen = average_length(words)
-
-
-
-
+      sqr_sum = 0.0
+      for word in words
+        tmp = word.length - avglen
+        sqr_sum += tmp*tmp
+      end
+      Math.sqrt(sqr_sum)
     end

     # The sum of all frequencies of one-character words.
     def self.degree_of_morphemic_freedom(words)
-
+      sum = 0
+      for word in words
         if word.length == 1 && word.type == Word::TYPES[:cjk_word]
-          sum
-        else
-          sum
+          sum += word.frequency
         end
-
+      end
+      sum
     end
   end
 end

data/lib/rmmseg/complex_algorithm.rb
CHANGED
@@ -24,27 +24,27 @@ module RMMSeg
     # starting from +@index+ .
     def create_chunks
       chunks = Array.new
-      find_match_words(@chars, @index)
+      for w0 in find_match_words(@chars, @index)
         index0 = @index + w0.length
         if index0 < @chars.length
-          find_match_words(@chars, index0)
+          for w1 in find_match_words(@chars, index0)
             index1 = index0 + w1.length
             if index1 < @chars.length
-              find_match_words(@chars, index1)
+              for w2 in find_match_words(@chars, index1)
                 if w2.type == Word::TYPES[:unrecognized]
                   chunks << [w0, w1]
                 else
                   chunks << [w0, w1, w2]
                 end
-
+              end
             elsif index1 == @chars.length
               chunks << [w0, w1]
             end
-
+          end
         elsif index0 == @chars.length
           chunks << [w0]
         end
-
+      end

       chunks
     end
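For a concrete feel of the three chunk metrics, here is a toy computation that mirrors the methods above. Word is a stand-in Struct (the real RMMSeg::Word also carries its text, byte size and type constants), and average_length is assumed to be the total length divided by the number of words:

Word = Struct.new(:length, :type, :frequency)

words = [Word.new(2, :cjk_word, 0),
         Word.new(1, :cjk_word, 34),
         Word.new(1, :cjk_word, 5)]

total   = words.inject(0) { |len, w| len + w.length }             # => 4
avglen  = total.to_f / words.size                                 # => 1.33
sqr_sum = words.inject(0.0) { |s, w| s + (w.length - avglen)**2 }
Math.sqrt(sqr_sum)                                                # => ~0.82, what variance() above returns

words.select { |w| w.length == 1 && w.type == :cjk_word }.
      inject(0) { |s, w| s + w.frequency }                        # => 39, the degree of morphemic freedom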
data/lib/rmmseg/dictionary.rb
CHANGED
data/lib/rmmseg/ferret.rb
CHANGED
@@ -39,12 +39,7 @@ module RMMSeg

     # Get next token
     def next
-      tk = @algor.next_token
-      if tk.nil?
-        nil
-      else
-        ::Ferret::Analysis::Token.new(tk.text, tk.start_pos, tk.end_pos)
-      end
+      @algor.next_token
     end

     # Get the text being tokenized
@@ -91,13 +86,11 @@ module RMMSeg

     # Get next token, skip stand alone Chinese punctuations.
     def next
-      token =
+      token = @stream.next
       dic = Dictionary.instance
-      loop do
-        token = @stream.next
-        break if token.nil?

-
+      until token.nil? || !(dic.include? token.text)
+        token = @stream.next
       end

       token
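The rewritten filter keeps pulling tokens while the current token's text is found in the dictionary (per the comment, stand-alone Chinese punctuation) and stops at the first miss or at the end of the stream. A stub illustration of that shape, using plain Ruby stand-ins rather than Ferret objects:

stream = ["，", "中文", "。", "分词"].each             # pretend token stream
dic    = ["，", "。"]                                  # pretend punctuation entries
next_from_stream = lambda { stream.next rescue nil }   # nil once exhausted

token = next_from_stream.call
until token.nil? || !dic.include?(token)
  token = next_from_stream.call
end
token   # => "中文", the first token whose text is not in the dictionary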
data/lib/rmmseg/token.rb
CHANGED
@@ -2,31 +2,52 @@ module RMMSeg
   # A Token consists of a term's text and the start and end offset
   # of the term.
   class Token
-    # Text of the token.
-    def text
-      @text[@start_pos...@end_pos]
-    end
-
     # Does this token contain any characters?
     def empty?
-      @start_pos == @end_pos
+      @start == @end
     end

+    # The text of the token
+    attr_accessor :text
+
     # The start position of the token. This is *byte* index instead of
     # character.
-
+    attr_accessor :start

     # The one greater than the position of the last byte of the
     # token. This is *byte* index instead of character.
-
+    attr_accessor :end
+
+    # See Ferret document for Token.
+    attr_accessor :pos_inc

     # +text+ is the ref to the whole text. In other words:
     # +text[start_pos...end_pos]+ should be the string held by this
     # token.
     def initialize(text, start_pos, end_pos)
       @text = text
-      @start_pos = start_pos
-      @end_pos = end_pos
+      @start = start_pos
+      @end = end_pos
+      @pos_inc = 1
+    end
+
+    def <=> other
+      if @start > other.start
+        return 1
+      elsif @start < other.start
+        return -1
+      elsif @end > other.end
+        return 1
+      elsif @end < other.end
+        return -1
+      else
+        return @text <=> other.text
+      end
+    end
+    include Comparable
+
+    def to_s
+      @text.dup
     end
   end
 end
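With text, start, end and pos_inc exposed as accessors and Comparable mixed in, an RMMSeg Token can now stand in for a Ferret token directly, which is why the wrapper in ferret.rb above was dropped. A small usage sketch against the class as shown, reusing the byte offsets from the spec further below (the second token is made up for the comparison):

require 'rmmseg/token'   # lib/rmmseg/token.rb in this gem

a = RMMSeg::Token.new("中文", 12, 18)   # byte offsets
b = RMMSeg::Token.new("文章", 18, 24)

a.text      # => "中文"
a.start     # => 12
a.end       # => 18
a.pos_inc   # => 1
a.empty?    # => false
a < b       # => true, via <=> and Comparable
[b, a].sort.map { |t| t.to_s }   # => ["中文", "文章"]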
data/lib/rmmseg.rb
CHANGED
data/spec/simple_algorithm_spec.rb
CHANGED
@@ -30,8 +30,8 @@ describe "simple algorithm" do
     3.times { algor.next_token }
     token = algor.next_token
     token.text.should == "paragraph"
-    token.
-    token.
+    token.start.should == 10
+    token.end.should == 19
   end

   it "should handle byte positions of Chinese well" do
@@ -40,7 +40,7 @@ describe "simple algorithm" do
     2.times { algor.next_token }
     token = algor.next_token
     token.text.should == "中文"
-    token.
-    token.
+    token.start.should == 12
+    token.end.should == 18
   end
 end
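The expected offsets in the Chinese example follow from UTF-8 byte lengths: each of the two CJK characters occupies three bytes, so the token spans six bytes. A quick check:

"中文".bytesize   # => 6 on Ruby 1.9+; on 1.8, this gem's era, "中文".size gives the same byte count
18 - 12           # => 6, i.e. token.end - token.start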
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: rmmseg
 version: !ruby/object:Gem::Version
-  version: 0.1.2
+  version: 0.1.3
 platform: ruby
 authors:
 - pluskid
@@ -9,7 +9,7 @@ autorequire:
 bindir: bin
 cert_chain: []

-date: 2008-02-
+date: 2008-02-27 00:00:00 -08:00
 default_executable:
 dependencies: []