RubyGems - rmmseg - Versions diffs - 0.1.2 → 0.1.3 - Mend

rmmseg 0.1.2 → 0.1.3

Files changed (11) hide show

data/History.txt +6 -0
data/TODO.txt +3 -0
data/lib/rmmseg/algorithm.rb +22 -12
data/lib/rmmseg/chunk.rb +16 -10
data/lib/rmmseg/complex_algorithm.rb +6 -6
data/lib/rmmseg/dictionary.rb +1 -1
data/lib/rmmseg/ferret.rb +4 -11
data/lib/rmmseg/token.rb +31 -10
data/lib/rmmseg.rb +1 -1
data/spec/simple_algorithm_spec.rb +4 -4
metadata +2 -2

data/History.txt CHANGED Viewed

@@ -1,3 +1,9 @@
+=== 0.1.3 / 2008-02-28
+* Make RMMSeg Token compatible with Ferret Token.
+* Use while instead of loop for performance improvement.
+* Avoid many costly String#jlength calls for performance improvement (now uses only 70% of the time and 40% of the memory used before).
 === 0.1.2 / 2008-02-25
 * Add cache to find_match_words: performance improved.

data/TODO.txt CHANGED Viewed

@@ -1,4 +1,7 @@
 === TODO
+* Release 0.1.3 before adding C stuff.
+* Implement a C version of jcode.
+* Implement a C version of string_ref.
 * Avoid Memory Leak
 * Improve Performance

data/lib/rmmseg/algorithm.rb CHANGED Viewed

@@ -9,6 +9,8 @@ module RMMSeg
   # words. This module is the common operations shared by
   # SimpleAlgorithm and ComplexAlgorithm .
   module Algorithm
+    MATCH_CACHE_MAX_LENGTH = 3
     # Initialize a new instance of Algorithm, the +text+ will
     # then be segmented by this instance.
     def initialize(text)
@@ -16,7 +18,8 @@ module RMMSeg
       @chars = text.each_char
       @index = 0
       @byte_index = 0
-      @match_cache = Array.new
+      @match_cache = Array.new(MATCH_CACHE_MAX_LENGTH)
+      @match_cache_idx = 0
     end
     # Get the next Token recognized.
@@ -45,10 +48,11 @@ module RMMSeg
     # of words.
     def segment
       words = Array.new
-      loop do
-        token = next_token
-        break if token.nil?
+      token = next_token
+      until token.nil?
         words << token.text
+        token = next_token
       end
       words
@@ -83,7 +87,7 @@ module RMMSeg
       @byte_index += i - @index
       @index = i
-      return Token.new(@text, start_pos, end_pos)
+      return Token.new(@text[start_pos...end_pos], start_pos, end_pos)
     end
     # Use rules to filter the +chunks+ to get the most
@@ -103,7 +107,7 @@ module RMMSeg
       end
       word = chunks[0][0]
-      token = Token.new(@text, @byte_index, @byte_index+word.byte_size)
+      token = Token.new(word.text, @byte_index, @byte_index+word.byte_size)
       @index += word.length
       @byte_index += word.byte_size
@@ -123,25 +127,31 @@ module RMMSeg
       dic = Dictionary.instance
       str = String.new
+      strlen = 0
       words = Array.new
       i = index
-      loop do
-        break if i >= chars.length || basic_latin?(chars[i])
+      while i < chars.length               &&
+          !basic_latin?(chars[i])          &&
+          strlen < Config.max_word_length
         str << chars[i]
+        strlen += 1
         if dic.has_word?(str)
           words << dic.get_word(str)
         end
         i += 1
-        break if str.jlength >= Config.max_word_length
       end
       if words.empty?
         words << Word.new(chars[index], Word::TYPES[:unrecognized])
       end
-      @match_cache << [index, words]
-      @match_cache.shift if @match_cache.length > 4
+      @match_cache[@match_cache_idx] = [index, words]
+      @match_cache_idx += 1
+      @match_cache_idx = 0 if @match_cache_idx == MATCH_CACHE_MAX_LENGTH
       words
     end

data/lib/rmmseg/chunk.rb CHANGED Viewed

@@ -4,7 +4,11 @@ module RMMSeg
     # The sum of length of all words.
     def self.total_length(words)
-      words.inject(0) { |len, word| len + word.length }
+      len = 0
+      for word in words
+        len += word.length
+      end
+      len
     end
     # The average length of words.
@@ -15,21 +19,23 @@ module RMMSeg
     # The square of the standard deviation of length of all words.
     def self.variance(words)
       avglen = average_length(words)
-      Math.sqrt(words.inject(0.0) { |sqr_sum, word|
-                  tmp = word.length - avglen
-                  sqr_sum + tmp*tmp
-                })
+      sqr_sum = 0.0
+      for word in words
+        tmp = word.length - avglen
+        sqr_sum += tmp*tmp
+      end
+      Math.sqrt(sqr_sum)
     end
     # The sum of all frequencies of one-character words.
     def self.degree_of_morphemic_freedom(words)
-      words.inject(0) { |sum, word|
+      sum = 0
+      for word in words
         if word.length == 1 && word.type == Word::TYPES[:cjk_word]
-          sum + word.frequency
-        else
-          sum
+          sum += word.frequency
         end
-      }
+      end
+      sum
     end
   end
 end

data/lib/rmmseg/complex_algorithm.rb CHANGED Viewed

@@ -24,27 +24,27 @@ module RMMSeg
     # starting from +@index+ .
     def create_chunks
       chunks = Array.new
-      find_match_words(@chars, @index).each { |w0|
+      for w0 in find_match_words(@chars, @index)
         index0 = @index + w0.length
         if index0 < @chars.length
-          find_match_words(@chars, index0).each { |w1|
+          for w1 in find_match_words(@chars, index0)
             index1 = index0 + w1.length
             if index1 < @chars.length
-              find_match_words(@chars, index1).each { |w2|
+              for w2 in find_match_words(@chars, index1)
                 if w2.type == Word::TYPES[:unrecognized]
                   chunks << [w0, w1]
                 else
                   chunks << [w0, w1, w2]
                 end
-              }
+              end
             elsif index1 == @chars.length
               chunks << [w0, w1]
             end
-          }
+          end
         elsif index0 == @chars.length
           chunks << [w0]
         end
-      }
+      end
       chunks
     end

data/lib/rmmseg/dictionary.rb CHANGED Viewed

@@ -25,7 +25,7 @@ module RMMSeg
       if word == true
         word = Word.new(value.dup, Word::TYPES[:cjk_word])
         @dic[value] = word
-      elsif word.is_a? String
+      elsif String === word
         word = Word.new(value.dup, Word::TYPES[:cjk_word], word.to_i)
         @dic[value] = word
       end

data/lib/rmmseg/ferret.rb CHANGED Viewed

@@ -39,12 +39,7 @@ module RMMSeg
       # Get next token
       def next
-        tk = @algor.next_token
-        if tk.nil?
-          nil
-        else
-          ::Ferret::Analysis::Token.new(tk.text, tk.start_pos, tk.end_pos)
-        end
+        @algor.next_token
       end
       # Get the text being tokenized
@@ -91,13 +86,11 @@ module RMMSeg
       # Get next token, skip stand alone Chinese punctuations.
       def next
-        token = nil
+        token = @stream.next
         dic = Dictionary.instance
-        loop do
-          token = @stream.next
-          break if token.nil?
-          break unless dic.include? token.text
+        until token.nil? || !(dic.include? token.text)
+          token = @stream.next
         end
         token

data/lib/rmmseg/token.rb CHANGED Viewed

@@ -2,31 +2,52 @@ module RMMSeg
   # A Token consists of a term's text and the start and end offset
   # of the term.
   class Token
-    # Text of the token.
-    def text
-      @text[@start_pos...@end_pos]
-    end
     # Does this token contain any characters?
     def empty?
-      @start_pos == @end_pos
+      @start == @end
     end
+    # The text of the token
+    attr_accessor :text
     # The start position of the token. This is a *byte* index, not a
     # character index.
-    attr_reader :start_pos
+    attr_accessor :start
     # One greater than the position of the last byte of the
     # token. This is a *byte* index, not a character index.
-    attr_reader :end_pos
+    attr_accessor :end
+    # See Ferret document for Token.
+    attr_accessor :pos_inc
     # +text+ is the string held by this token; +start_pos+ and
     # +end_pos+ are the *byte* offsets at which the token starts and
     # ends in the text being segmented.
     def initialize(text, start_pos, end_pos)
       @text = text
-      @start_pos = start_pos
-      @end_pos = end_pos
+      @start = start_pos
+      @end = end_pos
+      @pos_inc = 1
+    end
+    def <=> other
+      if @start > other.start
+        return 1
+      elsif @start < other.start
+        return -1
+      elsif @end > other.end
+        return 1
+      elsif @end < other.end
+        return -1
+      else
+        return @text <=> other.text
+      end
+    end
+    include Comparable
+    def to_s
+      @text.dup
     end
   end
 end

data/lib/rmmseg.rb CHANGED Viewed

@@ -6,7 +6,7 @@ require 'rmmseg/simple_algorithm'
 require 'rmmseg/complex_algorithm'
 module RMMSeg
-  VERSION = '0.1.2'
+  VERSION = '0.1.3'
   # Segment +text+ using the algorithm configured.
   def segment(text)

data/spec/simple_algorithm_spec.rb CHANGED Viewed

@@ -30,8 +30,8 @@ describe "simple algorithm" do
     3.times { algor.next_token }
     token = algor.next_token
     token.text.should == "paragraph"
-    token.start_pos.should == 10
-    token.end_pos.should == 19
+    token.start.should == 10
+    token.end.should == 19
   end
   it "should handle byte positions of Chinese well" do
@@ -40,7 +40,7 @@ describe "simple algorithm" do
     2.times { algor.next_token }
     token = algor.next_token
     token.text.should == "中文"
-    token.start_pos.should == 12
-    token.end_pos.should == 18
+    token.start.should == 12
+    token.end.should == 18
   end
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: rmmseg
 version: !ruby/object:Gem::Version
-  version: 0.1.2
+  version: 0.1.3
 platform: ruby
 authors:
 - pluskid
@@ -9,7 +9,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2008-02-25 00:00:00 -08:00
+date: 2008-02-27 00:00:00 -08:00
 default_executable:
 dependencies: []