RubyGems - plexus-rmmseg - Versions diffs - 0.1.6 - Mend

plexus-rmmseg 0.1.6

Files changed (57) hide show

checksums.yaml +7 -0
data/.gitignore +1 -0
data/History.txt +42 -0
data/Manifest.txt +51 -0
data/README.txt +74 -0
data/Rakefile +12 -0
data/TODO.txt +5 -0
data/bin/rmmseg +65 -0
data/data/chars.dic +12638 -0
data/data/custom.dic +12 -0
data/data/punctuation.dic +79 -0
data/data/words.dic +120330 -0
data/lib/rmmseg.rb +13 -0
data/lib/rmmseg/algorithm.rb +136 -0
data/lib/rmmseg/amibguity.rb +4 -0
data/lib/rmmseg/chunk.rb +41 -0
data/lib/rmmseg/complex_algorithm.rb +122 -0
data/lib/rmmseg/config.rb +65 -0
data/lib/rmmseg/dictionary.rb +80 -0
data/lib/rmmseg/ferret.rb +109 -0
data/lib/rmmseg/lawl_rule.rb +12 -0
data/lib/rmmseg/lsdmfocw_rule.rb +13 -0
data/lib/rmmseg/mm_rule.rb +13 -0
data/lib/rmmseg/rule_helper.rb +28 -0
data/lib/rmmseg/simple_algorithm.rb +37 -0
data/lib/rmmseg/svwl_rule.rb +12 -0
data/lib/rmmseg/token.rb +30 -0
data/lib/rmmseg/version.rb +3 -0
data/lib/rmmseg/word.rb +38 -0
data/misc/ferret_example.rb +56 -0
data/misc/homepage.erb +170 -0
data/misc/homepage.html +1214 -0
data/plexus-rmmseg.gemspec +20 -0
data/spec/chunk_spec.rb +25 -0
data/spec/complex_algorithm_spec.rb +18 -0
data/spec/config_spec.rb +12 -0
data/spec/dictionary_spec.rb +20 -0
data/spec/lawl_rule_spec.rb +15 -0
data/spec/lsdmfocw_rule_spec.rb +14 -0
data/spec/mm_rule_spec.rb +15 -0
data/spec/simple_algorithm_spec.rb +46 -0
data/spec/spec_helper.rb +12 -0
data/spec/svwl_rule_spec.rb +14 -0
data/spec/word_spec.rb +9 -0
data/tasks/ann.rake +76 -0
data/tasks/annotations.rake +22 -0
data/tasks/doc.rake +48 -0
data/tasks/gem.rake +110 -0
data/tasks/homepage.rake +12 -0
data/tasks/manifest.rake +49 -0
data/tasks/post_load.rake +26 -0
data/tasks/rubyforge.rake +57 -0
data/tasks/setup.rb +227 -0
data/tasks/spec.rake +54 -0
data/tasks/svn.rake +44 -0
data/tasks/test.rake +38 -0
metadata +121 -0

data/lib/rmmseg.rb ADDED

@@ -0,0 +1,13 @@
+# -*- encoding: utf-8 -*-
+require 'rmmseg/version'
+require 'rmmseg/config'
+require 'rmmseg/simple_algorithm'
+require 'rmmseg/complex_algorithm'
+module RMMSeg
+  # Segment +text+ using the algorithm configured.
+  def segment(text)
+    Config.algorithm_instance(text).segment
+  end
+end

data/lib/rmmseg/algorithm.rb ADDED

@@ -0,0 +1,136 @@
+# -*- encoding: utf-8 -*-
+require 'rmmseg/dictionary'
+require 'rmmseg/word'
+require 'rmmseg/chunk'
+require 'rmmseg/token'
+module RMMSeg
+  # An algorithm segments a piece of text into an array of
+  # words. This module holds the operations shared by
+  # SimpleAlgorithm and ComplexAlgorithm.
+  module Algorithm
+    # Initialize a new instance of Algorithm, the +text+ will
+    # then be segmented by this instance. +token+ is the class
+    # which will be used to construct the result token.
+    def initialize(text, token=Token)
+      @text  = text
+      @chars = text.each_char.to_a
+      @index = 0
+      @byte_index = 0
+      @token = token
+    end
+    # Get the next Token recognized.
+    def next_token
+      return nil if @index >= @chars.length
+      if basic_latin?(@chars[@index])
+        token = get_basic_latin_word
+      else
+        token = get_cjk_word
+      end
+      if token.start == token.end # empty
+        return next_token
+      else
+        return token
+      end
+    end
+    # Segment the string in +text+ into an array
+    # of words.
+    def segment
+      words = Array.new
+      token = next_token
+      until token.nil?
+        words << token.text
+        token = next_token
+      end
+      words
+    end
+    # Skip whitespaces and punctuation to extract a basic latin
+    # word.
+    def get_basic_latin_word
+      start_pos = nil
+      end_pos   = nil
+      i = @index
+      while i < @chars.length     &&
+          basic_latin?(@chars[i]) &&
+          nonword_char?(@chars[i])
+        i += 1
+      end
+      start_pos = i
+      while i < @chars.length && basic_latin?(@chars[i])
+        break if nonword_char?(@chars[i])
+        i += 1
+      end
+      end_pos = i
+      while i < @chars.length      &&
+          basic_latin?(@chars[i])  &&
+          nonword_char?(@chars[i])
+        i += 1
+      end
+      @index = i
+      return @token.new(@text[start_pos...end_pos], start_pos, end_pos)
+    end
+    # Find all words occurring in the dictionary starting from
+    # +index+. The maximum word length is determined by
+    # +Config.max_word_length+.
+    def find_match_words(index)
+      for i, w in @match_cache
+        if i == index
+          return w
+        end
+      end
+      dic = Dictionary.instance
+      str = String.new
+      strlen = 0
+      words = Array.new
+      i = index
+      while i < @chars.length               &&
+          !basic_latin?(@chars[i])          &&
+          strlen < Config.max_word_length
+        str << @chars[i]
+        strlen += 1
+        if dic.has_word?(str)
+          words << dic.get_word(str)
+        end
+        i += 1
+      end
+      if words.empty?
+        words << Word.new(@chars[index], Word::TYPES[:unrecognized])
+      end
+      @match_cache[@match_cache_idx] = [index, words]
+      @match_cache_idx += 1
+      @match_cache_idx = 0 if @match_cache_idx == MATCH_CACHE_MAX_LENGTH
+      words
+    end
+    # Determine whether a character is a basic latin character.
+    def basic_latin?(char)
+      char.each_byte.to_a.length == 1
+    end
+    # Determine whether a character is a non-word character, i.e. one
+    # that cannot be part of a basic latin word.
+    NONWORD_CHAR_RE = /^\W$/
+    def nonword_char?(char)
+      NONWORD_CHAR_RE =~ char # match index (truthy) when +char+ matches the non-word pattern, nil otherwise
+    end
+  end
+end

data/lib/rmmseg/amibguity.rb ADDED

@@ -0,0 +1,4 @@
+module RMMSeg
+  class Ambiguity < Exception
+  end
+end

data/lib/rmmseg/chunk.rb ADDED

@@ -0,0 +1,41 @@
+module RMMSeg
+  # A Chunk holds one or more successive Word .
+  module Chunk
+    # The sum of length of all words.
+    def self.total_length(words)
+      len = 0
+      for word in words
+        len += word.length
+      end
+      len
+    end
+    # The average length of words.
+    def self.average_length(words)
+      total_length(words).to_f/words.size
+    end
+    # The square of the standard deviation of length of all words.
+    def self.variance(words)
+      avglen = average_length(words)
+      sqr_sum = 0.0
+      for word in words
+        tmp = word.length - avglen
+        sqr_sum += tmp*tmp
+      end
+      Math.sqrt(sqr_sum)
+    end
+    # The sum of all frequencies of one-character words.
+    def self.degree_of_morphemic_freedom(words)
+      sum = 0
+      for word in words
+        if word.length == 1 && word.type == Word::TYPES[:cjk_word]
+          sum += word.frequency
+        end
+      end
+      sum
+    end
+  end
+end

data/lib/rmmseg/complex_algorithm.rb ADDED

@@ -0,0 +1,122 @@
+require 'rmmseg/algorithm'
+require 'rmmseg/mm_rule'
+require 'rmmseg/lawl_rule'
+require 'rmmseg/svwl_rule'
+require 'rmmseg/lsdmfocw_rule'
+module RMMSeg
+  class ComplexAlgorithm
+    MATCH_CACHE_MAX_LENGTH = 3
+    include Algorithm
+    # Create a new ComplexAlgorithm. The rules used by this algorithm
+    # are MMRule, LAWLRule, SVWLRule and LSDMFOCWRule.
+    def initialize(text, token=Token)
+      super
+      @rules = [
+                MMRule,
+                LAWLRule,
+                SVWLRule,
+                LSDMFOCWRule
+               ]
+      @match_cache = Array.new(MATCH_CACHE_MAX_LENGTH)
+      @match_cache_idx = 0
+    end
+    # Get the most proper CJK word.
+    def get_cjk_word
+      chunks = create_chunks
+      i = 0
+      while i < @rules.length
+        break if chunks.length < 2
+        chunks = @rules[i].filter(chunks)
+        i += 1
+      end
+      if chunks.length > 1
+        if Config.on_ambiguity == :raise_exception
+          raise Ambiguity, "Can't solve ambiguity on #{chunks}"
+        end
+      end
+      word = chunks[0][0]
+      token = @token.new(word.text, @byte_index, @byte_index+word.byte_size)
+      @index += word.length
+      @byte_index += word.byte_size
+      return token
+    end
+    # Create all possible three-word (or less) chunks
+    # starting from +@index+ .
+    def create_chunks
+      chunks = Array.new
+      for w0 in find_match_words(@index)
+        index0 = @index + w0.length
+        if index0 < @chars.length
+          for w1 in find_match_words(index0)
+            index1 = index0 + w1.length
+            if index1 < @chars.length
+              for w2 in find_match_words(index1)
+                if w2.type == Word::TYPES[:unrecognized]
+                  chunks << [w0, w1]
+                else
+                  chunks << [w0, w1, w2]
+                end
+              end
+            elsif index1 == @chars.length
+              chunks << [w0, w1]
+            end
+          end
+        elsif index0 == @chars.length
+          chunks << [w0]
+        end
+      end
+      chunks
+    end
+    # Find all words occurring in the dictionary starting from
+    # +index+. The maximum word length is determined by
+    # +Config.max_word_length+.
+    def find_match_words(index)
+      for i, w in @match_cache
+        if i == index
+          return w
+        end
+      end
+      dic = Dictionary.instance
+      str = String.new
+      strlen = 0
+      words = Array.new
+      i = index
+      while i < @chars.length               &&
+          !basic_latin?(@chars[i])          &&
+          strlen < Config.max_word_length
+        str << @chars[i]
+        strlen += 1
+        if dic.has_word?(str)
+          words << dic.get_word(str)
+        end
+        i += 1
+      end
+      if words.empty?
+        words << Word.new(@chars[index], Word::TYPES[:unrecognized])
+      end
+      @match_cache[@match_cache_idx] = [index, words]
+      @match_cache_idx += 1
+      @match_cache_idx = 0 if @match_cache_idx == MATCH_CACHE_MAX_LENGTH
+      words
+    end
+  end
+end

data/lib/rmmseg/config.rb ADDED

@@ -0,0 +1,65 @@
+require 'rmmseg/simple_algorithm'
+require 'rmmseg/complex_algorithm'
+module RMMSeg
+  # Configurations of RMMSeg.
+  class Config
+    @algorithm = :complex
+    @on_ambiguity = :select_first
+    data_dir = File.join(File.dirname(__FILE__), "..", "..", "data")
+    @dictionaries = [
+      [File.join(data_dir, "chars.dic"), true],
+      [File.join(data_dir, "words.dic"), false],
+      [File.join(data_dir, "custom.dic"), false]
+    ]
+    @max_word_length = 4
+    class << self
+      # Get the algorithm name currently using
+      def algorithm
+        @algorithm
+      end
+      # Set the algorithm name used to segment. Valid values are
+      # +:complex+ and +:simple+ . The former is the default one.
+      def algorithm=(algor)
+        unless [:complex, :simple].include? algor
+          raise ArgumentError, "Unknown algorithm #{algor}"
+        end
+        @algorithm = algor
+      end
+      # Get an instance of the algorithm object corresponding to the
+      # algorithm name configured. +tok+ is the class of the token oject
+      # to be returned. For example, if you want to use with Ferret, you
+      # should provide +::Ferret::Analysis::Token+ .
+      def algorithm_instance(text, tok=Token)
+        RMMSeg.const_get("#{@algorithm}".capitalize+"Algorithm").new(text, tok)
+      end
+      # Get the behavior description when an unresolved ambiguity occured.
+      def on_ambiguity
+        @on_ambiguity
+      end
+      # Set the behavior on an unresolved ambiguity. Valid values are
+      # +:raise_exception+ and +:select_first+ . The latter is the default
+      # one.
+      def on_ambiguity=(behavior)
+        unless [:raise_exception, :select_first].include? behavior
+          raise ArgumentError, "Unknown behavior on ambiguity: #{behavior}"
+        end
+        @on_ambiguity = behavior
+      end
+      # An array of dictionary files. Each element should be of the
+      # form: [file, whether_dic_include_frequency_info]. This should
+      # be set before the dictionaries are loaded (They are loaded
+      # only when they are used). Or else you should call
+      # Dictionary.instance.reload manually to reload the
+      # dictionaries.
+      attr_accessor :dictionaries
+      # The maximum length of a CJK word. The default value is 4. Making
+      # this value too large might slow down the segment operations.
+      attr_accessor :max_word_length
+    end
+  end
+end

data/lib/rmmseg/dictionary.rb ADDED

@@ -0,0 +1,80 @@
+require 'singleton'
+module RMMSeg
+  # The dictionary is a singleton object which is lazily initialized.
+  # *NOTE* dictionary data should use the UNIX line-break '\n' instead
+  # of DOS '\r\n'.
+  class Dictionary
+    include Singleton
+    # Initialize and load dictionaries from files specified by
+    # +Config.dictionaries+ .
+    def initialize
+      load_dictionaries
+    end
+    # Determin whether +value+ is a word in the dictionary.
+    def has_word?(value)
+      @dic.has_key?(value)
+    end
+    # Store a new word to dictionary.
+    # +w+ may be:
+    # * an instance of Word.
+    # * +true+, then this is a normal world.
+    # * a String(which can be converted to a Number) or Number.
+    #   The number is the frequency of the word.
+    def store_word(key, w=true)
+      @dic[key] = w
+    end
+    # Get an instance of Word corresponding to +value+ .
+    def get_word(value)
+      word = @dic[value]
+      # Construct a Word lazily
+      if word == true
+        word = Word.new(value.dup, Word::TYPES[:cjk_word])
+        @dic[value] = word
+      elsif String === word
+        word = Word.new(value.dup, Word::TYPES[:cjk_word], word.to_i)
+        @dic[value] = word
+      end
+      word
+    end
+    # Reload all dictionary files.
+    def reload
+      @dic = nil
+      load_dictionaries
+    end
+    private
+    def load_dictionaries
+      @dic = Hash.new
+      Config.dictionaries.each { |file, has_freq|
+        if has_freq
+          load_dictionary_with_freq(file)
+        else
+          load_dictionary(file)
+        end
+      }
+    end
+    def load_dictionary_with_freq(file)
+      File.open(file, "r") { |f|
+        f.each_line { |line|
+          pair = line.split(" ")
+          @dic[pair[0]] = pair[1]
+        }
+      }
+    end
+    def load_dictionary(file)
+      File.open(file, "r") { |f|
+        f.each_line { |line|
+          line.slice!(-1)       # chop!
+          @dic[line] = true
+        }
+      }
+    end
+  end
+end