RubyGems - rmmseg - Versions diffs - 0.1.5 → 0.1.6 - Mend

rmmseg 0.1.5 → 0.1.6

This diff compares the contents of two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.

Files changed (9)

data/History.txt +6 -0
data/lib/rmmseg.rb +1 -1
data/lib/rmmseg/algorithm.rb +6 -8
data/lib/rmmseg/complex_algorithm.rb +48 -8
data/lib/rmmseg/config.rb +5 -3
data/lib/rmmseg/ferret.rb +3 -6
data/lib/rmmseg/simple_algorithm.rb +17 -5
data/lib/rmmseg/token.rb +0 -5
metadata +2 -2

data/History.txt CHANGED

@@ -1,3 +1,9 @@
+* Construct Ferret Token directly.
+=== 0.1.6 / 2008-03-16
+* Optimize for simple algorithm. One time faster than before. And less memory usage.
 === 0.1.5 / 2008-03-03
 * Bug fix: Ferret Token is not Duck-Typing. We need to construct Ferret token instead of reuse RMMSeg Token.

data/lib/rmmseg.rb CHANGED

@@ -6,7 +6,7 @@ require 'rmmseg/simple_algorithm'
 require 'rmmseg/complex_algorithm'
 module RMMSeg
-  VERSION = '0.1.5'
+  VERSION = '0.1.6'
   # Segment +text+ using the algorithm configured.
   def segment(text)

data/lib/rmmseg/algorithm.rb CHANGED

@@ -9,17 +9,15 @@ module RMMSeg
   # words. This module is the common operations shared by
   # SimpleAlgorithm and ComplexAlgorithm .
   module Algorithm
-    MATCH_CACHE_MAX_LENGTH = 3
     # Initialize a new instance of Algorithm, the +text+ will
-    # then be segmented by this instance.
-    def initialize(text)
+    # then be segmented by this instance. +token+ is the class
+    # which will be used to construct the result token.
+    def initialize(text, token=Token)
       @text = text
       @chars = text.each_char
       @index = 0
       @byte_index = 0
-      @match_cache = Array.new(MATCH_CACHE_MAX_LENGTH)
-      @match_cache_idx = 0
+      @token = token
     end
     # Get the next Token recognized.
@@ -32,7 +30,7 @@ module RMMSeg
         token = get_cjk_word
       end
-      if token.empty?
+      if token.start == token.end # empty
         return next_token
       else
         return token
@@ -82,7 +80,7 @@ module RMMSeg
       @byte_index += i - @index
       @index = i
-      return Token.new(@text[start_pos...end_pos], start_pos, end_pos)
+      return @token.new(@text[start_pos...end_pos], start_pos, end_pos)
     end
     # Find all words occuring in the dictionary starting from

data/lib/rmmseg/complex_algorithm.rb CHANGED

@@ -6,11 +6,13 @@ require 'rmmseg/lsdmfocw_rule'
 module RMMSeg
   class ComplexAlgorithm
+    MATCH_CACHE_MAX_LENGTH = 3
     include Algorithm
     # Create a new ComplexAlgorithm . Rules used by this algorithm
     # includes MMRule , LAWLRule , SVWLRule and LSDMFOCWRule .
-    def initialize(text)
+    def initialize(text, token=Token)
       super
       @rules = [
                 MMRule,
@@ -18,16 +20,13 @@ module RMMSeg
                 SVWLRule,
                 LSDMFOCWRule
                ]
+      @match_cache = Array.new(MATCH_CACHE_MAX_LENGTH)
+      @match_cache_idx = 0
     end
     # Get the most proper CJK word.
     def get_cjk_word
-      get_cjk_word_from_chunks(create_chunks)
-    end
-    # Use rules to filter the +chunks+ to get the most
-    # apropos CJK word.
-    def get_cjk_word_from_chunks(chunks)
+      chunks = create_chunks
       i = 0
       while i < @rules.length
         break if chunks.length < 2
@@ -42,7 +41,7 @@ module RMMSeg
       end
       word = chunks[0][0]
-      token = Token.new(word.text, @byte_index, @byte_index+word.byte_size)
+      token = @token.new(word.text, @byte_index, @byte_index+word.byte_size)
       @index += word.length
       @byte_index += word.byte_size
@@ -78,5 +77,46 @@ module RMMSeg
       chunks
     end
+    # Find all words occuring in the dictionary starting from
+    # +index+ . The maximum word length is determined by
+    # +Config.max_word_length+ .
+    def find_match_words(index)
+      for i, w in @match_cache
+        if i == index
+          return w
+        end
+      end
+      dic = Dictionary.instance
+      str = String.new
+      strlen = 0
+      words = Array.new
+      i = index
+      while i < @chars.length               &&
+          !basic_latin?(@chars[i])          &&
+          strlen < Config.max_word_length
+        str << @chars[i]
+        strlen += 1
+        if dic.has_word?(str)
+          words << dic.get_word(str)
+        end
+        i += 1
+      end
+      if words.empty?
+        words << Word.new(@chars[index], Word::TYPES[:unrecognized])
+      end
+      @match_cache[@match_cache_idx] = [index, words]
+      @match_cache_idx += 1
+      @match_cache_idx = 0 if @match_cache_idx == MATCH_CACHE_MAX_LENGTH
+      words
+    end
   end
 end

data/lib/rmmseg/config.rb CHANGED

@@ -25,9 +25,11 @@ module RMMSeg
         @algorithm = algor
       end
       # Get an instance of the algorithm object corresponding to the
-      # algorithm name configured.
-      def algorithm_instance(text)
-        RMMSeg.const_get("#{@algorithm}".capitalize+"Algorithm").new(text)
+      # algorithm name configured. +tok+ is the class of the token oject
+      # to be returned. For example, if you want to use with Ferret, you
+      # should provide +::Ferret::Analysis::Token+ .
+      def algorithm_instance(text, tok=Token)
+        RMMSeg.const_get("#{@algorithm}".capitalize+"Algorithm").new(text, tok)
       end
       # Get the behavior description when an unresolved ambiguity occured.

data/lib/rmmseg/ferret.rb CHANGED

@@ -39,11 +39,7 @@ module RMMSeg
       # Get next token
       def next
-        tok = @algor.next_token
-        if tok
-          tok = ::Ferret::Analysis::Token.new(tok.text, tok.start, tok.end)
-        end
-        tok
+        @algor.next_token
       end
       # Get the text being tokenized
@@ -54,7 +50,8 @@ module RMMSeg
       # Set the text to be tokenized
       def text=(str)
         @text = str
-        @algor = RMMSeg::Config.algorithm_instance(@text)
+        @algor = RMMSeg::Config.algorithm_instance(@text,
+                                                   ::Ferret::Analysis::Token)
       end
     end

data/lib/rmmseg/simple_algorithm.rb CHANGED

@@ -7,17 +7,29 @@ module RMMSeg
     # Create a new SimpleAlgorithm . The only rule used by this
     # algorithm is MMRule .
-    def initialize(text)
+    def initialize(text, token=Token)
       super
     end
     # Get the most proper CJK word.
     def get_cjk_word
-      word = find_match_words(@index).last
-      token = Token.new(word.text, @byte_index, @byte_index+word.byte_size)
+      dic = Dictionary.instance
+      i = Config.max_word_length
+      if i + @index > @chars.length
+        i = @chars.length - @index
+      end
+      chars = @chars[@index, i]
+      word = chars.join
-      @index += word.length
-      @byte_index += word.byte_size
+      while i > 1 && !dic.has_word?(word)
+        i -= 1
+        word.slice!(-chars[i].size,chars[i].size) # truncate last char
+      end
+      token = @token.new(word, @byte_index, @byte_index+word.size)
+      @index += i
+      @byte_index += word.size
       return token
     end

data/lib/rmmseg/token.rb CHANGED

@@ -2,11 +2,6 @@ module RMMSeg
   # A Token consists of a term's text and the start and end offset
   # of the term.
   class Token
-    # Does this token contain any characters?
-    def empty?
-      @start == @end
-    end
     # The text of the token
     attr_accessor :text

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: rmmseg
 version: !ruby/object:Gem::Version
-  version: 0.1.5
+  version: 0.1.6
 platform: ruby
 authors:
 - pluskid
@@ -9,7 +9,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2008-03-04 00:00:00 +00:00
+date: 2008-03-16 00:00:00 +00:00
 default_executable:
 dependencies: []