loyal_rmmseg 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README.txt +74 -0
- data/lib/rmmseg/algorithm.rb +138 -0
- data/lib/rmmseg/amibguity.rb +4 -0
- data/lib/rmmseg/chunk.rb +41 -0
- data/lib/rmmseg/complex_algorithm.rb +122 -0
- data/lib/rmmseg/config.rb +62 -0
- data/lib/rmmseg/dictionary.rb +80 -0
- data/lib/rmmseg/ferret.rb +109 -0
- data/lib/rmmseg/lawl_rule.rb +12 -0
- data/lib/rmmseg/lsdmfocw_rule.rb +13 -0
- data/lib/rmmseg/mm_rule.rb +13 -0
- data/lib/rmmseg/rule_helper.rb +28 -0
- data/lib/rmmseg/simple_algorithm.rb +37 -0
- data/lib/rmmseg/svwl_rule.rb +12 -0
- data/lib/rmmseg/token.rb +29 -0
- data/lib/rmmseg/word.rb +38 -0
- data/lib/rmmseg.rb +15 -0
- metadata +96 -0
data/README.txt
ADDED
@@ -0,0 +1,74 @@
|
|
1
|
+
= rmmseg
|
2
|
+
by pluskid
|
3
|
+
http://rmmseg.rubyforge.org
|
4
|
+
|
5
|
+
== DESCRIPTION:
|
6
|
+
|
7
|
+
RMMSeg is an implementation of MMSEG Chinese word segmentation
|
8
|
+
algorithm. It is based on two variants of maximum matching
|
9
|
+
algorithms. Two algorithms are available for using:
|
10
|
+
|
11
|
+
* simple algorithm that uses only forward maximum matching.
|
12
|
+
* complex algorithm that uses three-word chunk maximum matching and 3
|
13
|
+
additional rules to solve ambiguities.
|
14
|
+
|
15
|
+
For more information about the algorithm, please refer to the
|
16
|
+
following essays:
|
17
|
+
|
18
|
+
* http://technology.chtsai.org/mmseg/
|
19
|
+
* http://pluskid.lifegoo.com/?p=261
|
20
|
+
|
21
|
+
== FEATURES/PROBLEMS:
|
22
|
+
|
23
|
+
* Provides +rmmseg+ command line tool for quick and easy way to access
|
24
|
+
the word segment feature.
|
25
|
+
* Provides an +Analyzer+ for integrating with Ferret[http://ferret.davebalmain.com/trac].
|
26
|
+
|
27
|
+
== SYNOPSIS:
|
28
|
+
|
29
|
+
Using the command line tool +rmmseg+ is simple:
|
30
|
+
$ rmmseg --separator _ < input.txt
|
31
|
+
passing option +-h+ can get an overview of all supported options.
|
32
|
+
|
33
|
+
Using the +Analyzer+ for Ferret is even easier:
|
34
|
+
|
35
|
+
require 'rmmseg'
|
36
|
+
require 'rmmseg/ferret'
|
37
|
+
|
38
|
+
analyzer = RMMSeg::Ferret::Analyzer.new
|
39
|
+
index = Ferret::Index::Index.new(:analyzer => analyzer)
|
40
|
+
|
41
|
+
For more details, please refer to the {homepage usage section}[http://rmmseg.rubyforge.org/index.html#Usage].
|
42
|
+
|
43
|
+
== REQUIREMENTS:
|
44
|
+
|
45
|
+
* ruby
|
46
|
+
|
47
|
+
== INSTALL:
|
48
|
+
|
49
|
+
* sudo gem install rmmseg
|
50
|
+
|
51
|
+
== LICENSE:
|
52
|
+
|
53
|
+
(The MIT License)
|
54
|
+
|
55
|
+
Copyright (c) 2008 FIX
|
56
|
+
|
57
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
58
|
+
a copy of this software and associated documentation files (the
|
59
|
+
'Software'), to deal in the Software without restriction, including
|
60
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
61
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
62
|
+
permit persons to whom the Software is furnished to do so, subject to
|
63
|
+
the following conditions:
|
64
|
+
|
65
|
+
The above copyright notice and this permission notice shall be
|
66
|
+
included in all copies or substantial portions of the Software.
|
67
|
+
|
68
|
+
THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
|
69
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
70
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
71
|
+
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
72
|
+
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
73
|
+
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
74
|
+
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
@@ -0,0 +1,138 @@
|
|
1
|
+
require 'jcode'
|
2
|
+
require 'rmmseg/dictionary'
|
3
|
+
require 'rmmseg/word'
|
4
|
+
require 'rmmseg/chunk'
|
5
|
+
require 'rmmseg/token'
|
6
|
+
|
7
|
+
module RMMSeg
  # An algorithm can segment a piece of text into an array of
  # words. This module is the common operations shared by
  # SimpleAlgorithm and ComplexAlgorithm ; the including class is
  # expected to provide a +get_cjk_word+ method.
  module Algorithm
    # Initialize a new instance of Algorithm, the +text+ will
    # then be segmented by this instance. +token+ is the class
    # which will be used to construct the result token.
    #
    # NOTE(review): relies on Ruby 1.8 + jcode semantics, where
    # String#each_char called without a block returns an indexable
    # character sequence; on Ruby 1.9+ it returns an Enumerator
    # that does not respond to #[] / #length as used below.
    def initialize(text, token=Token)
      @text = text
      @chars = text.each_char
      @index = 0       # character position in @chars
      @byte_index = 0  # byte offset into @text, kept in sync with @index
      @token = token
    end

    # Get the next Token recognized, or +nil+ at end of text.
    def next_token
      return nil if @index >= @chars.length

      if basic_latin?(@chars[@index])
        token = get_basic_latin_word
      else
        token = get_cjk_word
      end

      # Skip tokens covering no bytes (e.g. runs of pure
      # punctuation/whitespace) and recurse for the next real one.
      if token.start == token.end # empty
        return next_token
      else
        return token
      end
    end

    # Segment the string in +text+ into an array
    # of words (plain strings, in order of appearance).
    def segment
      words = Array.new

      token = next_token
      until token.nil?
        words << token.text
        token = next_token
      end

      words
    end

    # Skip whitespaces and punctuation to extract a basic latin
    # word.
    def get_basic_latin_word
      start_pos = nil
      end_pos = nil

      # Skip leading non-word characters.
      i = @index
      while i < @chars.length &&
          basic_latin?(@chars[i]) &&
          nonword_char?(@chars[i])
        i += 1
      end

      # Consume word characters until a non-word or non-latin
      # character is met.
      start_pos = @byte_index + i - @index
      while i < @chars.length && basic_latin?(@chars[i])
        break if nonword_char?(@chars[i])
        i += 1
      end

      # Skip trailing non-word characters.
      end_pos = @byte_index + i - @index
      while i < @chars.length &&
          basic_latin?(@chars[i]) &&
          nonword_char?(@chars[i])
        i += 1
      end

      # Latin characters are single-byte, so byte and character
      # positions advance by the same amount here.
      @byte_index += i - @index
      @index = i

      return @token.new(@text[start_pos...end_pos], start_pos, end_pos)
    end

    # Find all words occurring in the dictionary starting from
    # +index+ . The maximum word length is determined by
    # +Config.max_word_length+ .
    #
    # NOTE(review): reads @match_cache / @match_cache_idx and the
    # constant MATCH_CACHE_MAX_LENGTH, none of which are defined in
    # this module -- only ComplexAlgorithm defines them, and it also
    # redefines an identical find_match_words (whose lexical scope
    # can resolve the constant). This copy therefore appears to be
    # dead code; confirm before relying on it.
    def find_match_words(index)
      # Serve from the memoization cache when possible.
      for i, w in @match_cache
        if i == index
          return w
        end
      end

      dic = Dictionary.instance
      str = String.new
      strlen = 0
      words = Array.new
      i = index

      # Grow the candidate one character at a time, collecting every
      # prefix that is a dictionary word.
      while i < @chars.length &&
          !basic_latin?(@chars[i]) &&
          strlen < Config.max_word_length

        str << @chars[i]
        strlen += 1

        if dic.has_word?(str)
          words << dic.get_word(str)
        end
        i += 1
      end

      # Nothing matched: fall back to a single unrecognized character.
      if words.empty?
        words << Word.new(@chars[index], Word::TYPES[:unrecognized])
      end

      @match_cache[@match_cache_idx] = [index, words]
      @match_cache_idx += 1
      @match_cache_idx = 0 if @match_cache_idx == MATCH_CACHE_MAX_LENGTH

      words
    end

    # Determine whether a character is a basic latin character.
    # (Under 1.8 + jcode a multibyte character's string has
    # byte length > 1.)
    def basic_latin?(char)
      char.length == 1
    end

    # Matches a single character that cannot be part of a word.
    NONWORD_CHAR_RE = /^\W$/
    # Determine whether a character can NOT be part of a basic latin
    # word (truthy for punctuation/whitespace).
    def nonword_char?(char)
      NONWORD_CHAR_RE =~ char
    end
  end
end
|
data/lib/rmmseg/chunk.rb
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
module RMMSeg
  # A Chunk is an array of one or more successive Word objects.
  # These module functions compute the statistics that the
  # disambiguation rules compare chunks by.
  module Chunk
    # Sum of the (character) lengths of all words.
    def self.total_length(words)
      words.inject(0) { |sum, word| sum + word.length }
    end

    # Arithmetic mean of the word lengths.
    def self.average_length(words)
      total_length(words).to_f / words.size
    end

    # A dispersion measure of the word lengths: the square root of
    # the sum of squared deviations from the mean. (Despite the
    # method name, this is not the statistical variance; rules only
    # use it for relative comparison, where the ordering agrees.)
    def self.variance(words)
      mean = average_length(words)
      sum_of_squares = words.inject(0.0) do |acc, word|
        deviation = word.length - mean
        acc + deviation * deviation
      end
      Math.sqrt(sum_of_squares)
    end

    # Sum of the frequencies of all one-character CJK words.
    def self.degree_of_morphemic_freedom(words)
      words.inject(0) do |sum, word|
        if word.length == 1 && word.type == Word::TYPES[:cjk_word]
          sum + word.frequency
        else
          sum
        end
      end
    end
  end
end
|
@@ -0,0 +1,122 @@
|
|
1
|
+
require 'rmmseg/algorithm'
|
2
|
+
require 'rmmseg/mm_rule'
|
3
|
+
require 'rmmseg/lawl_rule'
|
4
|
+
require 'rmmseg/svwl_rule'
|
5
|
+
require 'rmmseg/lsdmfocw_rule'
|
6
|
+
|
7
|
+
module RMMSeg
  # Chunk-based segmentation: builds all possible three-word chunks
  # from the current position and filters them through a sequence of
  # disambiguation rules.
  class ComplexAlgorithm
    # Number of find_match_words results memoized at once.
    MATCH_CACHE_MAX_LENGTH = 3

    include Algorithm

    # Create a new ComplexAlgorithm . Rules used by this algorithm
    # includes MMRule , LAWLRule , SVWLRule and LSDMFOCWRule ,
    # applied in that order.
    def initialize(text, token=Token)
      super
      @rules = [
        MMRule,
        LAWLRule,
        SVWLRule,
        LSDMFOCWRule
      ]
      # Fixed-size cache of [index, words] pairs for find_match_words,
      # overwritten in round-robin order via @match_cache_idx.
      @match_cache = Array.new(MATCH_CACHE_MAX_LENGTH)
      @match_cache_idx = 0
    end

    # Get the most proper CJK word: filter the candidate chunks
    # through @rules until at most one survives (or the rules run
    # out), then emit the first word of the first remaining chunk.
    def get_cjk_word
      chunks = create_chunks
      i = 0
      while i < @rules.length
        break if chunks.length < 2 # already unambiguous
        chunks = @rules[i].filter(chunks)
        i += 1
      end

      # More than one chunk survived every rule: unresolved ambiguity.
      if chunks.length > 1
        if Config.on_ambiguity == :raise_exception
          raise Ambiguity, "Can't solve ambiguity on #{chunks}"
        end
      end

      word = chunks[0][0]
      token = @token.new(word.text, @byte_index, @byte_index+word.byte_size)

      @index += word.length
      @byte_index += word.byte_size

      return token
    end

    # Create all possible three-word (or less) chunks
    # starting from +@index+ .
    def create_chunks
      chunks = Array.new
      for w0 in find_match_words(@index)
        index0 = @index + w0.length
        if index0 < @chars.length
          for w1 in find_match_words(index0)
            index1 = index0 + w1.length
            if index1 < @chars.length
              for w2 in find_match_words(index1)
                # An unrecognized third word adds no information;
                # keep the two-word chunk instead.
                if w2.type == Word::TYPES[:unrecognized]
                  chunks << [w0, w1]
                else
                  chunks << [w0, w1, w2]
                end
              end
            elsif index1 == @chars.length # w1 ends exactly at text end
              chunks << [w0, w1]
            end
          end
        elsif index0 == @chars.length # w0 ends exactly at text end
          chunks << [w0]
        end
      end

      chunks
    end

    # Find all words occurring in the dictionary starting from
    # +index+ . The maximum word length is determined by
    # +Config.max_word_length+ .
    #
    # NOTE(review): duplicates Algorithm#find_match_words. The copy
    # here is the one actually used (a class method shadows the
    # included module's), and it can resolve MATCH_CACHE_MAX_LENGTH
    # in this class's lexical scope -- the module copy could not, so
    # this duplication is load-bearing; do not "deduplicate" it.
    def find_match_words(index)
      # Serve from the memoization cache when possible.
      for i, w in @match_cache
        if i == index
          return w
        end
      end

      dic = Dictionary.instance
      str = String.new
      strlen = 0
      words = Array.new
      i = index

      # Grow the candidate one character at a time, collecting every
      # prefix that is a dictionary word.
      while i < @chars.length &&
          !basic_latin?(@chars[i]) &&
          strlen < Config.max_word_length

        str << @chars[i]
        strlen += 1

        if dic.has_word?(str)
          words << dic.get_word(str)
        end
        i += 1
      end

      # Nothing matched: fall back to a single unrecognized character.
      if words.empty?
        words << Word.new(@chars[index], Word::TYPES[:unrecognized])
      end

      @match_cache[@match_cache_idx] = [index, words]
      @match_cache_idx += 1
      @match_cache_idx = 0 if @match_cache_idx == MATCH_CACHE_MAX_LENGTH

      words
    end

  end
end
|
@@ -0,0 +1,62 @@
|
|
1
|
+
require 'rmmseg/simple_algorithm'
|
2
|
+
require 'rmmseg/complex_algorithm'
|
3
|
+
|
4
|
+
module RMMSeg
  # Global configuration of RMMSeg, stored in class-level instance
  # variables and accessed through singleton-class methods.
  class Config
    @algorithm = :complex
    @on_ambiguity = :select_first
    data_dir = File.join(File.dirname(__FILE__), "..", "..", "data")
    @dictionaries = [[File.join(data_dir, "chars.dic"), true],
                     [File.join(data_dir, "words.dic"), false]]
    @max_word_length = 4

    class << self
      # The name of the segmentation algorithm currently in use.
      attr_reader :algorithm

      # Select the algorithm used to segment. Valid values are
      # +:complex+ (the default) and +:simple+; anything else raises
      # ArgumentError.
      def algorithm=(algor)
        unless [:complex, :simple].include?(algor)
          raise ArgumentError, "Unknown algorithm #{algor}"
        end
        @algorithm = algor
      end

      # Build an instance of the configured algorithm class for
      # +text+. +tok+ is the class of the token object the algorithm
      # should produce; pass +::Ferret::Analysis::Token+ when
      # integrating with Ferret.
      def algorithm_instance(text, tok=Token)
        klass_name = "#{@algorithm}".capitalize + "Algorithm"
        RMMSeg.const_get(klass_name).new(text, tok)
      end

      # The behavior used when an unresolved ambiguity occurs.
      attr_reader :on_ambiguity

      # Set the behavior on an unresolved ambiguity. Valid values
      # are +:raise_exception+ and +:select_first+ (the default);
      # anything else raises ArgumentError.
      def on_ambiguity=(behavior)
        unless [:raise_exception, :select_first].include?(behavior)
          raise ArgumentError, "Unknown behavior on ambiguity: #{behavior}"
        end
        @on_ambiguity = behavior
      end

      # An array of dictionary specs, each of the form
      # [file, whether_dic_include_frequency_info]. Set this before
      # the dictionaries are first used (they load lazily), or call
      # Dictionary.instance.reload afterwards.
      attr_accessor :dictionaries

      # The maximum length of a CJK word (default 4). Very large
      # values slow down segmentation.
      attr_accessor :max_word_length
    end
  end
end
|
@@ -0,0 +1,80 @@
|
|
1
|
+
require 'singleton'
|
2
|
+
|
3
|
+
module RMMSeg
  # The dictionary is a singleton object which is lazily initialized.
  # *NOTE* dictionary data should use the UNIX line-break '\n' instead
  # of DOS '\r\n'.
  class Dictionary
    include Singleton

    # Initialize and load dictionaries from files specified by
    # +Config.dictionaries+ . Runs on first access of
    # Dictionary.instance (Singleton).
    def initialize
      load_dictionaries
    end

    # Determine whether +value+ is a word in the dictionary.
    def has_word?(value)
      @dic.has_key?(value)
    end

    # Store a new word to dictionary.
    # +w+ may be:
    # * an instance of Word.
    # * +true+, then this is a normal word.
    # * a String (which can be converted to a Number) or Number --
    #   the frequency of the word.
    def store_word(key, w=true)
      @dic[key] = w
    end

    # Get an instance of Word corresponding to +value+ . A stored
    # placeholder (+true+ or a frequency string) is converted into a
    # Word object on first access and cached back into the hash.
    def get_word(value)
      word = @dic[value]
      # Construct a Word lazily
      if word == true
        word = Word.new(value.dup, Word::TYPES[:cjk_word])
        @dic[value] = word
      elsif String === word
        # Frequency was kept as a raw string; convert now.
        word = Word.new(value.dup, Word::TYPES[:cjk_word], word.to_i)
        @dic[value] = word
      end
      word
    end

    # Reload all dictionary files.
    def reload
      @dic = nil
      load_dictionaries
    end

    private
    # Rebuild @dic from every file configured in Config.dictionaries.
    def load_dictionaries
      @dic = Hash.new
      Config.dictionaries.each { |file, has_freq|
        if has_freq
          load_dictionary_with_freq(file)
        else
          load_dictionary(file)
        end
      }
    end

    # Load a "<word> <frequency>" per-line dictionary. The frequency
    # stays a string until first fetched through get_word.
    def load_dictionary_with_freq(file)
      File.open(file, "r") { |f|
        f.each_line { |line|
          pair = line.split(" ")
          @dic[pair[0]] = pair[1]
        }
      }
    end

    # Load a one-bare-word-per-line dictionary; the trailing
    # newline byte is stripped in place.
    def load_dictionary(file)
      File.open(file, "r") { |f|
        f.each_line { |line|
          line.slice!(-1) # chop!
          @dic[line] = true
        }
      }
    end
  end
end
|
@@ -0,0 +1,109 @@
|
|
1
|
+
# This file integrate RMMSeg with Ferret
|
2
|
+
require 'singleton'
|
3
|
+
require 'rubygems'
|
4
|
+
require 'ferret'
|
5
|
+
require 'rmmseg'
|
6
|
+
|
7
|
+
module RMMSeg
  # Integration of RMMSeg with the Ferret full-text search library.
  module Ferret
    # The Analyzer class can be used with Ferret .
    class Analyzer < ::Ferret::Analysis::Analyzer

      # Construct an Analyzer. An optional block can be used to
      # add more +TokenFilter+s. e.g.
      #
      #   analyzer = RMMSeg::Ferret::Analyzer.new { |tokenizer|
      #     Ferret::Analysis::LowerCaseFilter.new(tokenizer)
      #   }
      #
      def initialize(&brk)
        @brk = brk
      end

      # Build the token stream for +text+. The +field+ argument is
      # required by the Ferret Analyzer interface but unused here.
      def token_stream(field, text)
        t = PunctuationFilter.new(Tokenizer.new(text))
        if @brk
          # Let the user-supplied block wrap the stream in filters.
          @brk.call(t)
        else
          t
        end
      end
    end

    # The Tokenizer tokenizes text with the RMMSeg algorithm
    # selected by RMMSeg::Config.
    class Tokenizer < ::Ferret::Analysis::TokenStream
      # Create a new Tokenizer to tokenize +str+ .
      def initialize(str)
        self.text = str
      end

      # Get next token (nil when the text is exhausted).
      def next
        @algor.next_token
      end

      # Get the text being tokenized.
      def text
        @text
      end

      # Set the text to be tokenized and reset segmentation state
      # with a fresh algorithm instance.
      def text=(str)
        @text = str
        @algor = RMMSeg::Config.algorithm_instance(@text,
                                                   ::Ferret::Analysis::Token)
      end
    end

    # PunctuationFilter filters out the stand-alone Chinese
    # punctuation tokens.
    class PunctuationFilter < ::Ferret::Analysis::TokenStream
      # The punctuation dictionary.
      class Dictionary
        include Singleton

        DIC_FILE = File.join(File.dirname(__FILE__),
                             "..",
                             "..",
                             "data",
                             "punctuation.dic")
        # Load the punctuation list, one entry per line.
        def initialize
          @dic = Hash.new
          File.open(DIC_FILE, "r") do |f|
            f.each_line { |line|
              # Frozen chomped strings used purely as set members.
              @dic[line.chomp.freeze] = nil
            }
          end
        end

        # Whether +str+ is a known punctuation entry.
        def include?(str)
          @dic.has_key?(str)
        end
      end

      # Wrap +stream+, whose tokens will be filtered.
      def initialize(stream)
        @stream = stream
      end

      # Get next token, skipping stand-alone Chinese punctuation.
      def next
        token = @stream.next
        dic = Dictionary.instance

        until token.nil? || !(dic.include? token.text)
          token = @stream.next
        end

        token
      end

      # Delegate to the wrapped stream.
      def text
        @stream.text
      end

      # Delegate to the wrapped stream.
      def text=(str)
        @stream.text = str
      end
    end
  end
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
require 'rmmseg/rule_helper'
|
2
|
+
|
3
|
+
module RMMSeg
  # Largest sum of degree of morphemic freedom of one-character
  # words rule: prefer chunks whose single-character words are the
  # most frequent.
  class LSDMFOCWRule
    # Keep only the chunks with the highest degree of morphemic
    # freedom (ties are all kept).
    def self.filter(chunks)
      chunks.take_highest do |a, b|
        Chunk.degree_of_morphemic_freedom(a) <=> Chunk.degree_of_morphemic_freedom(b)
      end
    end
  end
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
require 'rmmseg/rule_helper'
|
2
|
+
|
3
|
+
module RMMSeg
  # Maximum matching rule: select the chunks whose total word
  # length is the largest.
  class MMRule
    # Keep only the chunks with maximum total length (ties are all
    # kept).
    def self.filter(chunks)
      chunks.take_highest do |a, b|
        Chunk.total_length(a) <=> Chunk.total_length(b)
      end
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
class Array
  # Return every element tied for the highest value, where ordering
  # is decided by the given two-argument comparison block. e.g.
  #
  #   ["aaaa", "bb", "cccc"].take_highest { |a, b|
  #     a.length <=> b.length
  #   }
  #   # => ["aaaa", "cccc"]
  #
  # Single forward scan: whenever a strictly greater element is
  # found, previously collected ties are discarded.
  def take_highest
    return [] if empty?

    best = first
    winners = [best]

    drop(1).each do |candidate|
      cmp = yield(candidate, best)
      if cmp > 0
        best = candidate
        winners = [candidate]
      elsif cmp == 0
        winners << candidate
      end
    end

    winners
  end
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
require 'rmmseg/algorithm'
|
2
|
+
require 'rmmseg/mm_rule'
|
3
|
+
|
4
|
+
module RMMSeg
  # Forward-maximum-matching segmentation: at each position take the
  # longest dictionary word found. The only rule used by this
  # algorithm is MMRule .
  class SimpleAlgorithm
    include Algorithm

    # Create a new SimpleAlgorithm for +text+. +token+ is the class
    # used to construct result tokens.
    def initialize(text, token=Token)
      super
    end

    # Get the most proper CJK word: the longest prefix (at most
    # Config.max_word_length characters) present in the dictionary,
    # falling back to a single character when nothing matches.
    def get_cjk_word
      dic = Dictionary.instance
      i = Config.max_word_length
      if i + @index > @chars.length
        i = @chars.length - @index # clamp candidate to remaining text
      end
      chars = @chars[@index, i]
      word = chars.join

      # Shrink the candidate one character at a time until it is a
      # dictionary word or only one character remains.
      while i > 1 && !dic.has_word?(word)
        i -= 1
        word.slice!(-chars[i].size,chars[i].size) # truncate last char
      end

      # NOTE(review): word.size is used here as a *byte* count --
      # correct under Ruby 1.8 (+jcode), where String#size counts
      # bytes, but String#size counts characters on Ruby 1.9+.
      token = @token.new(word, @byte_index, @byte_index+word.size)

      @index += i
      @byte_index += word.size

      return token
    end
  end
end
|
data/lib/rmmseg/token.rb
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
module RMMSeg
  # A Token pairs a term's text with the start and end offsets of
  # that term inside the text it was extracted from.
  class Token
    # The text of the token.
    attr_accessor :text

    # The start position of the token, as a *byte* index rather
    # than a character index.
    attr_accessor :start

    # One greater than the position of the last byte of the token
    # (also a *byte* index).
    attr_accessor :end

    # +text+ should hold exactly the slice covered by this token,
    # i.e. +whole_text[start_pos...end_pos]+.
    def initialize(text, start_pos, end_pos)
      self.text = text
      self.start = start_pos
      self.end = end_pos
    end

    # A defensive copy of the token's text.
    def to_s
      text.dup
    end
  end
end
|
data/lib/rmmseg/word.rb
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
module RMMSeg
  # An object representing a CJK word.
  class Word
    # Word categories; each key maps to itself so the symbols can
    # be used directly as type tags.
    TYPES = {
      :unrecognized => :unrecognized,
      :basic_latin_word => :basic_latin_word,
      :cjk_word => :cjk_word
    }.freeze

    # The content text of the word.
    attr_reader :text

    # The type of the word, may be one of the values of TYPES .
    attr_reader :type

    # The frequency of the word. This value is meaningful only
    # when this is a one-character word.
    attr_reader :frequency

    # Initialize a Word object.
    # NOTE(review): String#jlength is provided by the 1.8-era jcode
    # library (character count under $KCODE = 'u'); it does not
    # exist on Ruby 1.9+.
    def initialize(text, type=TYPES[:unrecognized], frequency=nil)
      @text = text
      @type = type
      @frequency = frequency
      @length = @text.jlength # character count, cached
    end

    # The number of characters in the word. *Not* number of bytes.
    def length
      @length
    end

    # The number of bytes in the word (String#length counts bytes
    # on Ruby 1.8, which this code targets).
    def byte_size
      @text.length
    end
  end
end
|
data/lib/rmmseg.rb
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
$KCODE = 'u'
|
2
|
+
require 'jcode'
|
3
|
+
|
4
|
+
require 'rmmseg/config'
|
5
|
+
require 'rmmseg/simple_algorithm'
|
6
|
+
require 'rmmseg/complex_algorithm'
|
7
|
+
|
8
|
+
module RMMSeg
  # Library version string. NOTE(review): differs from the gem
  # metadata version (0.0.1) -- presumably inherited from upstream
  # rmmseg; confirm before relying on it.
  VERSION = '0.1.6'

  # Segment +text+ into an array of word strings, using the
  # algorithm selected by Config.algorithm. Defined as an instance
  # method, so callers must +include RMMSeg+ to use it.
  def segment(text)
    Config.algorithm_instance(text).segment
  end
end
|
metadata
ADDED
@@ -0,0 +1,96 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: loyal_rmmseg
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 29
|
5
|
+
prerelease:
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 0
|
9
|
+
- 1
|
10
|
+
version: 0.0.1
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- happy
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2012-02-02 00:00:00 +08:00
|
19
|
+
default_executable:
|
20
|
+
dependencies:
|
21
|
+
- !ruby/object:Gem::Dependency
|
22
|
+
name: rake
|
23
|
+
prerelease: false
|
24
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ">="
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
hash: 3
|
30
|
+
segments:
|
31
|
+
- 0
|
32
|
+
version: "0"
|
33
|
+
type: :runtime
|
34
|
+
version_requirements: *id001
|
35
|
+
description: Chinese Seg.
|
36
|
+
email: happy@doc5.com
|
37
|
+
executables: []
|
38
|
+
|
39
|
+
extensions: []
|
40
|
+
|
41
|
+
extra_rdoc_files: []
|
42
|
+
|
43
|
+
files:
|
44
|
+
- README.txt
|
45
|
+
- lib/rmmseg.rb
|
46
|
+
- lib/rmmseg/word.rb
|
47
|
+
- lib/rmmseg/algorithm.rb
|
48
|
+
- lib/rmmseg/rule_helper.rb
|
49
|
+
- lib/rmmseg/amibguity.rb
|
50
|
+
- lib/rmmseg/lawl_rule.rb
|
51
|
+
- lib/rmmseg/chunk.rb
|
52
|
+
- lib/rmmseg/config.rb
|
53
|
+
- lib/rmmseg/lsdmfocw_rule.rb
|
54
|
+
- lib/rmmseg/simple_algorithm.rb
|
55
|
+
- lib/rmmseg/complex_algorithm.rb
|
56
|
+
- lib/rmmseg/token.rb
|
57
|
+
- lib/rmmseg/ferret.rb
|
58
|
+
- lib/rmmseg/dictionary.rb
|
59
|
+
- lib/rmmseg/mm_rule.rb
|
60
|
+
- lib/rmmseg/svwl_rule.rb
|
61
|
+
has_rdoc: true
|
62
|
+
homepage: http://www.doc5.com
|
63
|
+
licenses:
|
64
|
+
- MIT
|
65
|
+
post_install_message:
|
66
|
+
rdoc_options: []
|
67
|
+
|
68
|
+
require_paths:
|
69
|
+
- lib
|
70
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
71
|
+
none: false
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
hash: 3
|
76
|
+
segments:
|
77
|
+
- 0
|
78
|
+
version: "0"
|
79
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
80
|
+
none: false
|
81
|
+
requirements:
|
82
|
+
- - ">="
|
83
|
+
- !ruby/object:Gem::Version
|
84
|
+
hash: 3
|
85
|
+
segments:
|
86
|
+
- 0
|
87
|
+
version: "0"
|
88
|
+
requirements: []
|
89
|
+
|
90
|
+
rubyforge_project:
|
91
|
+
rubygems_version: 1.4.2
|
92
|
+
signing_key:
|
93
|
+
specification_version: 3
|
94
|
+
summary: Nice Chinese Seg.
|
95
|
+
test_files: []
|
96
|
+
|