RubyGems - rmmseg - Versions diffs - 0.1.3 → 0.1.4 - Mend

rmmseg 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

data/History.txt +5 -0
data/TODO.txt +0 -3
data/lib/rmmseg/algorithm.rb +7 -37
data/lib/rmmseg/complex_algorithm.rb +37 -7
data/lib/rmmseg/dictionary.rb +10 -0
data/lib/rmmseg/lawl_rule.rb +1 -1
data/lib/rmmseg/lsdmfocw_rule.rb +1 -1
data/lib/rmmseg/mm_rule.rb +1 -1
data/lib/rmmseg/simple_algorithm.rb +9 -6
data/lib/rmmseg/svwl_rule.rb +1 -1
data/lib/rmmseg.rb +1 -1
data/spec/lawl_rule_spec.rb +1 -1
data/spec/lsdmfocw_rule_spec.rb +1 -1
data/spec/mm_rule_spec.rb +1 -1
data/spec/svwl_rule_spec.rb +1 -1
metadata +2 -2

data/History.txt CHANGED Viewed

@@ -1,3 +1,8 @@
+=== 0.1.4 / 2008-03-02
+* Let users store their own customized words in the Dictionary after it has been loaded.
+* Improved performance of SimpleAlgorithm.
 === 0.1.3 / 2008-02-28
 * Make RMMSeg Token compatible with Ferret Token.

data/TODO.txt CHANGED Viewed

@@ -1,7 +1,4 @@
 === TODO
-* Release 0.1.3 before adding C staffs.
-* Implement a C version of jcode.
-* Implement a C version of string_ref.
 * Avoid Memory Leak
 * Improve Performance

data/lib/rmmseg/algorithm.rb CHANGED Viewed

@@ -26,15 +26,10 @@ module RMMSeg
     def next_token
       return nil if @index >= @chars.length
-      current = @chars[@index]
-      orig_index = @index
-      token = nil
-      len = 0
-      if basic_latin?(current)
+      if basic_latin?(@chars[@index])
         token = get_basic_latin_word
       else
-        token = get_cjk_word(create_chunks)
+        token = get_cjk_word
       end
       if token.empty?
@@ -90,35 +85,10 @@ module RMMSeg
       return Token.new(@text[start_pos...end_pos], start_pos, end_pos)
     end
-    # Use rules to filter the +chunks+ to get the most
-    # apropos CJK word.
-    def get_cjk_word(chunks)
-      i = 0
-      while i < @rules.length
-        break if chunks.length < 2
-        chunks = @rules[i].filter(chunks)
-        i += 1
-      end
-      if chunks.length > 1
-        if Config.on_ambiguity == :raise_exception
-          raise Ambiguity, "Can't solve ambiguity on #{chunks}"
-        end
-      end
-      word = chunks[0][0]
-      token = Token.new(word.text, @byte_index, @byte_index+word.byte_size)
-      @index += word.length
-      @byte_index += word.byte_size
-      return token
-    end
     # Find all words occurring in the dictionary starting from
     # +index+ . The maximum word length is determined by
     # +Config.max_word_length+ .
-    def find_match_words(chars, index)
+    def find_match_words(index)
       for i, w in @match_cache
         if i == index
           return w
@@ -131,11 +101,11 @@ module RMMSeg
       words = Array.new
       i = index
-      while i < chars.length               &&
-          !basic_latin?(chars[i])          &&
+      while i < @chars.length               &&
+          !basic_latin?(@chars[i])          &&
           strlen < Config.max_word_length
-        str << chars[i]
+        str << @chars[i]
         strlen += 1
         if dic.has_word?(str)
@@ -145,7 +115,7 @@ module RMMSeg
       end
       if words.empty?
-        words << Word.new(chars[index], Word::TYPES[:unrecognized])
+        words << Word.new(@chars[index], Word::TYPES[:unrecognized])
       end
       @match_cache[@match_cache_idx] = [index, words]

data/lib/rmmseg/complex_algorithm.rb CHANGED Viewed

@@ -13,24 +13,54 @@ module RMMSeg
     def initialize(text)
       super
       @rules = [
-                MMRule.new,
-                LAWLRule.new,
-                SVWLRule.new,
-                LSDMFOCWRule.new
+                MMRule,
+                LAWLRule,
+                SVWLRule,
+                LSDMFOCWRule
                ]
     end
+    # Get the most appropriate CJK word.
+    def get_cjk_word
+      get_cjk_word_from_chunks(create_chunks)
+    end
+    # Use rules to filter the +chunks+ to get the most
+    # apropos CJK word.
+    def get_cjk_word_from_chunks(chunks)
+      i = 0
+      while i < @rules.length
+        break if chunks.length < 2
+        chunks = @rules[i].filter(chunks)
+        i += 1
+      end
+      if chunks.length > 1
+        if Config.on_ambiguity == :raise_exception
+          raise Ambiguity, "Can't solve ambiguity on #{chunks}"
+        end
+      end
+      word = chunks[0][0]
+      token = Token.new(word.text, @byte_index, @byte_index+word.byte_size)
+      @index += word.length
+      @byte_index += word.byte_size
+      return token
+    end
     # Create all possible three-word (or less) chunks
     # starting from +@index+ .
     def create_chunks
       chunks = Array.new
-      for w0 in find_match_words(@chars, @index)
+      for w0 in find_match_words(@index)
         index0 = @index + w0.length
         if index0 < @chars.length
-          for w1 in find_match_words(@chars, index0)
+          for w1 in find_match_words(index0)
             index1 = index0 + w1.length
             if index1 < @chars.length
-              for w2 in find_match_words(@chars, index1)
+              for w2 in find_match_words(index1)
                 if w2.type == Word::TYPES[:unrecognized]
                   chunks << [w0, w1]
                 else

data/lib/rmmseg/dictionary.rb CHANGED Viewed

@@ -18,6 +18,16 @@ module RMMSeg
       @dic.has_key?(value)
     end
+    # Store a new word to dictionary.
+    # +w+ may be:
+    # * an instance of Word.
+    # * +true+, then this is a normal word.
+    # * a String(which can be converted to a Number) or Number.
+    #   The number is the frequency of the word.
+    def store_word(key, w=true)
+      @dic[key] = w
+    end
     # Get an instance of Word corresponding to +value+ .
     def get_word(value)
       word = @dic[value]

data/lib/rmmseg/lawl_rule.rb CHANGED Viewed

@@ -3,7 +3,7 @@ require 'rmmseg/rule_helper'
 module RMMSeg
   # Largest average word length rule.
   class LAWLRule
-    def filter(chunks)
+    def self.filter(chunks)
       chunks.take_highest { |a, b|
         Chunk::average_length(a) <=> Chunk::average_length(b)
       }

data/lib/rmmseg/lsdmfocw_rule.rb CHANGED Viewed

@@ -4,7 +4,7 @@ module RMMSeg
   # Largest sum of degree of morphemic freedom of one-character
   # words rule.
   class LSDMFOCWRule
-    def filter(chunks)
+    def self.filter(chunks)
       chunks.take_highest { |a, b|
         Chunk::degree_of_morphemic_freedom(a) <=> Chunk::degree_of_morphemic_freedom(b)
       }

data/lib/rmmseg/mm_rule.rb CHANGED Viewed

@@ -4,7 +4,7 @@ module RMMSeg
   # Maximum matching rule, select the chunks with the
   # maximum length.
   class MMRule
-    def filter(chunks)
+    def self.filter(chunks)
       chunks.take_highest { |a, b|
         Chunk::total_length(a) <=> Chunk::total_length(b)
       }

data/lib/rmmseg/simple_algorithm.rb CHANGED Viewed

@@ -9,14 +9,17 @@ module RMMSeg
     # algorithm is MMRule .
     def initialize(text)
       super
-      @rules = [ MMRule.new ]
     end
-    # Create all possible one-word chunks starting from +@index+ .
-    def create_chunks
-      find_match_words(@chars, @index).map { |word|
-        [word]
-      }
+    # Get the most appropriate CJK word.
+    def get_cjk_word
+      word = find_match_words(@index).last
+      token = Token.new(word.text, @byte_index, @byte_index+word.byte_size)
+      @index += word.length
+      @byte_index += word.byte_size
+      return token
     end
   end
 end

data/lib/rmmseg/svwl_rule.rb CHANGED Viewed

@@ -3,7 +3,7 @@ require 'rmmseg/rule_helper'
 module RMMSeg
   # Smallest variance of word length rule.
   class SVWLRule
-    def filter(chunks)
+    def self.filter(chunks)
       chunks.take_highest { |a, b|
         Chunk::variance(b) <=> Chunk::variance(a)
       }

data/lib/rmmseg.rb CHANGED Viewed

@@ -6,7 +6,7 @@ require 'rmmseg/simple_algorithm'
 require 'rmmseg/complex_algorithm'
 module RMMSeg
-  VERSION = '0.1.3'
+  VERSION = '0.1.4'
   # Segment +text+ using the algorithm configured.
   def segment(text)

data/spec/lawl_rule_spec.rb CHANGED Viewed

@@ -8,7 +8,7 @@ describe "largest average word length rule" do
               gen_words(["国际", "化"]),
               gen_words(["国", "际", "化"])
              ]
-    chunks = RMMSeg::LAWLRule.new.filter(chunks)
+    chunks = RMMSeg::LAWLRule.filter(chunks)
     chunks.length.should == 1
     chunks[0][0].text.should == "国际化"
   end

data/spec/lsdmfocw_rule_spec.rb CHANGED Viewed

@@ -7,7 +7,7 @@ describe "largest sum of degree of morphemic freedom of one-character words rule
               gen_words(["主要", "是", "因为"], [nil, 100, nil]),
               gen_words(["主", "要是", "因为"], [10, nil, nil])
              ]
-    chunks = RMMSeg::LSDMFOCWRule.new.filter(chunks)
+    chunks = RMMSeg::LSDMFOCWRule.filter(chunks)
     chunks.length.should == 1
     chunks[0][0].text.should == "主要"
   end

data/spec/mm_rule_spec.rb CHANGED Viewed

@@ -9,7 +9,7 @@ describe 'maximum matching rule' do
               gen_words(["眼看", "就要", "来"]),
               gen_words(["眼", "看", "就"])
              ]
-    chunks = RMMSeg::MMRule.new.filter(chunks)
+    chunks = RMMSeg::MMRule.filter(chunks)
     chunks.length.should == 2
   end
 end

data/spec/svwl_rule_spec.rb CHANGED Viewed

@@ -7,7 +7,7 @@ describe "smallest variance of word length rule" do
               gen_words(["研究", "生命", "起源"]),
               gen_words(["研究生", "命", "起源"])
              ]
-    chunks = RMMSeg::SVWLRule.new.filter(chunks)
+    chunks = RMMSeg::SVWLRule.filter(chunks)
     chunks.length.should == 1
     chunks[0][0].text.should == "研究"
   end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: rmmseg
 version: !ruby/object:Gem::Version
-  version: 0.1.3
+  version: 0.1.4
 platform: ruby
 authors:
 - pluskid
@@ -9,7 +9,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2008-02-27 00:00:00 -08:00
+date: 2008-03-02 00:00:00 +00:00
 default_executable:
 dependencies: []