rmmseg 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +6 -0
- data/Manifest.txt +37 -0
- data/README.txt +63 -0
- data/Rakefile +33 -0
- data/TODO.txt +3 -0
- data/bin/rmmseg +63 -0
- data/lib/rmmseg/algorithm.rb +157 -0
- data/lib/rmmseg/amibguity.rb +4 -0
- data/lib/rmmseg/chars.dic +12638 -0
- data/lib/rmmseg/chunk.rb +51 -0
- data/lib/rmmseg/complex_algorithm.rb +52 -0
- data/lib/rmmseg/config.rb +59 -0
- data/lib/rmmseg/dictionary.rb +66 -0
- data/lib/rmmseg/ferret.rb +43 -0
- data/lib/rmmseg/lawl_rule.rb +14 -0
- data/lib/rmmseg/lsdmfocw_rule.rb +15 -0
- data/lib/rmmseg/mm_rule.rb +15 -0
- data/lib/rmmseg/rule_helper.rb +22 -0
- data/lib/rmmseg/simple_algorithm.rb +22 -0
- data/lib/rmmseg/svwl_rule.rb +14 -0
- data/lib/rmmseg/token.rb +22 -0
- data/lib/rmmseg/word.rb +37 -0
- data/lib/rmmseg/words.dic +120330 -0
- data/lib/rmmseg.rb +15 -0
- data/misc/homepage.erb +93 -0
- data/misc/homepage.html +1063 -0
- data/spec/chunk_spec.rb +26 -0
- data/spec/complex_algorithm_spec.rb +18 -0
- data/spec/config_spec.rb +12 -0
- data/spec/dictionary_spec.rb +20 -0
- data/spec/lawl_rule_spec.rb +15 -0
- data/spec/lsdmfocw_rule_spec.rb +14 -0
- data/spec/mm_rule_spec.rb +15 -0
- data/spec/simple_algorithm_spec.rb +46 -0
- data/spec/spec_helper.rb +15 -0
- data/spec/svwl_rule_spec.rb +14 -0
- data/spec/word_spec.rb +9 -0
- metadata +101 -0
data/lib/rmmseg/chunk.rb
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
module RMMSeg
  # A Chunk groups one or more successive Word objects and exposes the
  # statistics the MMSEG filtering rules operate on. Every derived value
  # is computed lazily on first access and cached.
  class Chunk
    # The words held by this chunk.
    attr_reader :words

    # Build a Chunk from an array of Word.
    def initialize(words)
      @words = words
      @average_length = nil
      @total_length = nil
      @variance = nil
      @degree_of_morphemic_freedom = nil
    end

    # The sum of the lengths of all words held by this chunk (a Float).
    def total_length
      if @total_length.nil?
        sum = 0.0
        @words.each { |w| sum += w.length }
        @total_length = sum
      end
      @total_length
    end

    # The average length of the words held by this chunk.
    def average_length
      @average_length = total_length / @words.size if @average_length.nil?
      @average_length
    end

    # A spread measure over the word lengths.
    # NOTE(review): despite the historical name, this is the square root
    # of the *sum* of squared deviations (not divided by the word
    # count); the SVWL rule only needs the relative ordering, which is
    # the same either way.
    def variance
      if @variance.nil?
        sum_of_squares = 0.0
        @words.each do |w|
          diff = w.length - average_length
          sum_of_squares += diff * diff
        end
        @variance = Math.sqrt(sum_of_squares)
      end
      @variance
    end

    # The sum of the frequencies of all one-character CJK words held by
    # this chunk.
    def degree_of_morphemic_freedom
      if @degree_of_morphemic_freedom.nil?
        total = 0
        @words.each do |w|
          total += w.frequency if w.length == 1 && w.type == Word::TYPES[:cjk_word]
        end
        @degree_of_morphemic_freedom = total
      end
      @degree_of_morphemic_freedom
    end
  end
end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
require 'rmmseg/algorithm'
require 'rmmseg/mm_rule'
require 'rmmseg/lawl_rule'
require 'rmmseg/svwl_rule'
require 'rmmseg/lsdmfocw_rule'

module RMMSeg
  # The complex (three-word chunk) variant of the MMSEG algorithm.
  class ComplexAlgorithm
    include Algorithm

    # Create a new ComplexAlgorithm. The rules applied, in order, are
    # MMRule, LAWLRule, SVWLRule and LSDMFOCWRule.
    def initialize(text)
      super
      @rules = [MMRule.new, LAWLRule.new, SVWLRule.new, LSDMFOCWRule.new]
    end

    # Create all possible chunks of up to three words starting
    # at +@index+.
    def create_chunks
      chunks = []
      find_match_words(@chars, @index).each do |first|
        after_first = @index + first.length
        if after_first == @chars.length
          # First word reaches the end of the text: one-word chunk.
          chunks << Chunk.new([first])
        elsif after_first < @chars.length
          find_match_words(@chars, after_first).each do |second|
            after_second = after_first + second.length
            if after_second == @chars.length
              # Second word reaches the end: two-word chunk.
              chunks << Chunk.new([first, second])
            elsif after_second < @chars.length
              find_match_words(@chars, after_second).each do |third|
                # An unrecognized third word contributes nothing; keep
                # the two-word chunk instead.
                if third.type == Word::TYPES[:unrecognized]
                  chunks << Chunk.new([first, second])
                else
                  chunks << Chunk.new([first, second, third])
                end
              end
            end
          end
        end
      end

      chunks
    end
  end
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
require 'rmmseg/simple_algorithm'
require 'rmmseg/complex_algorithm'

module RMMSeg
  # Configurations of RMMSeg, exposed as class-level attributes.
  class Config
    @algorithm = :complex
    @on_ambiguity = :select_first
    @dictionaries = [
      [File.join(File.dirname(__FILE__), "chars.dic"), true],
      [File.join(File.dirname(__FILE__), "words.dic"), false]
    ]
    @max_word_length = 4

    class << self
      # The algorithm name currently in use (+:complex+ or +:simple+).
      attr_reader :algorithm

      # Set the algorithm name used to segment. Valid values are
      # +:complex+ and +:simple+. The former is the default one.
      def algorithm=(algor)
        unless [:complex, :simple].include?(algor)
          raise ArgumentError, "Unknown algorithm #{algor}"
        end
        @algorithm = algor
      end

      # Build an instance of the algorithm class corresponding to the
      # configured algorithm name, set up to segment +text+.
      def algorithm_instance(text)
        RMMSeg.const_get("#{@algorithm.to_s.capitalize}Algorithm").new(text)
      end

      # The behavior used when an unresolved ambiguity occurs.
      attr_reader :on_ambiguity

      # Set the behavior on an unresolved ambiguity. Valid values are
      # +:raise_exception+ and +:select_first+. The latter is the
      # default one.
      def on_ambiguity=(behavior)
        unless [:raise_exception, :select_first].include?(behavior)
          raise ArgumentError, "Unknown behavior on ambiguity: #{behavior}"
        end
        @on_ambiguity = behavior
      end

      # An array of dictionary files. Each element should be of the
      # form [file, whether_dic_include_frequency_info]. This should be
      # set before the dictionaries are loaded (they are loaded lazily,
      # only when first used); otherwise call
      # Dictionary.instance.reload manually to reload them.
      attr_accessor :dictionaries

      # The maximum length of a CJK word. The default value is 4.
      # Making this value too large might slow down segmentation.
      attr_accessor :max_word_length
    end
  end
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
require 'singleton'

module RMMSeg
  # The dictionary is a singleton object which is lazily initialized
  # from the files listed in +Config.dictionaries+.
  class Dictionary
    include Singleton

    # Initialize and load dictionaries from files specified by
    # +Config.dictionaries+ .
    def initialize
      load_dictionaries
    end

    # Determine whether +value+ is a word in the dictionary.
    def has_word?(value)
      @dic.has_key?(value)
    end

    # Get an instance of Word corresponding to +value+.
    def get_word(value)
      word = @dic[value]
      # Entries loaded from a frequency dictionary are stored as the
      # raw "text frequency" line; construct the Word lazily on first
      # access and cache it back into the table.
      if word.is_a? String
        arr = word.split(" ")
        word = Word.new(arr[0], Word::TYPES[:cjk_word], arr[1].to_i)
        @dic[value] = word
      end
      word
    end

    # Reload all dictionary files.
    def reload
      @dic = nil
      load_dictionaries
    end

    private

    # Load every configured dictionary file into +@dic+.
    def load_dictionaries
      @dic = Hash.new
      Config.dictionaries.each { |file, has_freq|
        if has_freq
          load_dictionary_with_freq(file)
        else
          load_dictionary(file)
        end
      }
    end

    # Load a dictionary whose lines are "word frequency" pairs. The raw
    # line is kept as the value; get_word parses it lazily.
    def load_dictionary_with_freq(file)
      File.open(file, "r") { |f|
        f.each_line { |line|
          pair = line.split(" ")
          @dic[pair[0]] = line
        }
      }
    end

    # Load a plain word-per-line dictionary.
    def load_dictionary(file)
      File.open(file, "r") { |f|
        f.each_line { |line|
          # Use non-bang chomp: chomp! returns nil when the line has no
          # trailing newline (e.g. the last line of the file), and the
          # previous `line.chomp!.freeze` crashed with NoMethodError in
          # that case.
          word = line.chomp.freeze
          @dic[word] = word
        }
      }
    end
  end
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
# Integration of RMMSeg with the Ferret full-text search engine.
require 'rubygems'
require 'ferret'

module RMMSeg
  module Ferret
    # An analyzer that plugs RMMSeg tokenization into Ferret.
    class Analyzer < ::Ferret::Analysis::Analyzer
      # Return a TokenStream over +text+ (+field+ is ignored).
      def token_stream(field, text)
        Tokenizer.new(text)
      end
    end

    # A TokenStream producing tokens via an RMMSeg::Algorithm.
    class Tokenizer < ::Ferret::Analysis::TokenStream
      # Create a new Tokenizer to tokenize +str+.
      def initialize(str)
        self.text = str
      end

      # Return the next Ferret token, or nil when the text is
      # exhausted.
      def next
        token = @algor.next_token
        return nil if token.nil?
        ::Ferret::Analysis::Token.new(token.text, token.start_pos, token.end_pos)
      end

      # The text being tokenized.
      def text
        @text
      end

      # Replace the text to be tokenized and reset the underlying
      # segmentation algorithm accordingly.
      def text=(str)
        @text = str
        @algor = RMMSeg::Config.algorithm_instance(@text)
      end
    end
  end
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
require 'rmmseg/rule_helper'

module RMMSeg
  # Largest average word length rule: keep the chunks whose average
  # word length ties for the maximum.
  class LAWLRule
    # Sort chunks by descending average length and keep the leading run
    # of chunks that share the best value.
    def filter(chunks)
      ordered = chunks.sort { |x, y| y.average_length <=> x.average_length }
      ordered.similar_elements { |x, y| x.average_length == y.average_length }
    end
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'rmmseg/rule_helper'

module RMMSeg
  # Largest sum of degree of morphemic freedom of one-character words
  # rule: keep the chunks tying for the highest degree.
  class LSDMFOCWRule
    # Sort chunks by descending degree of morphemic freedom and keep
    # the leading run that shares the best value.
    def filter(chunks)
      ordered = chunks.sort { |x, y|
        y.degree_of_morphemic_freedom <=> x.degree_of_morphemic_freedom
      }
      ordered.similar_elements { |x, y|
        x.degree_of_morphemic_freedom == y.degree_of_morphemic_freedom
      }
    end
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'rmmseg/rule_helper'

module RMMSeg
  # Maximum matching rule: keep the chunks whose total length ties for
  # the maximum.
  class MMRule
    # Sort chunks by descending total length and keep the leading run
    # that shares the best value.
    def filter(chunks)
      ordered = chunks.sort { |x, y| y.total_length <=> x.total_length }
      ordered.similar_elements { |x, y| x.total_length == y.total_length }
    end
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
class Array
  # Return the run of _similar_ elements starting at +index+ and
  # extending over its immediate right-hand neighbours. Similarity is
  # decided by the given block (called with two elements), or by +==+
  # when no block is given. e.g.
  #   [1,2,2,2,3,3,5].similar_elements(1)  # => [2, 2, 2]
  # and (maybe more useful example)
  #   ["Kid", "Kily", "KDE", "Foo", "Food"].similar_elements { |a, b|
  #     a[0] == b[0]
  #   }                                    # => ["Kid", "Kily", "KDE"]
  def similar_elements(index=0)
    i = index+1
    loop do
      break if i >= self.length
      if block_given?
        break unless yield(self[index], self[i])
      else
        # Fixed: the run continues *while* neighbours are equal. The
        # old code used `break if` here, so it stopped at the first
        # element equal to self[index], contradicting the documented
        # example above (it returned [2] instead of [2, 2, 2]).
        break unless self[index] == self[i]
      end
      i += 1
    end
    self[index...i]
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
require 'rmmseg/algorithm'
require 'rmmseg/mm_rule'

module RMMSeg
  # The simple (one-word chunk) variant of the MMSEG algorithm.
  class SimpleAlgorithm
    include Algorithm

    # Create a new SimpleAlgorithm. The only rule used by this
    # algorithm is MMRule.
    def initialize(text)
      super
      @rules = [ MMRule.new ]
    end

    # Create all possible one-word chunks starting from +@index+.
    def create_chunks
      matches = find_match_words(@chars, @index)
      matches.map { |word| Chunk.new([word]) }
    end
  end
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
require 'rmmseg/rule_helper'

module RMMSeg
  # Smallest variance of word length rule: keep the chunks whose
  # length variance ties for the minimum.
  class SVWLRule
    # Sort chunks by ascending variance and keep the leading run that
    # shares the best value.
    def filter(chunks)
      ordered = chunks.sort { |x, y| x.variance <=> y.variance }
      ordered.similar_elements { |x, y| x.variance == y.variance }
    end
  end
end
|
data/lib/rmmseg/token.rb
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
module RMMSeg
  # A Token pairs a term's text with the offsets delimiting it in the
  # original input.
  class Token
    # Text of the token.
    attr_reader :text

    # The start position of the token — a *byte* index, not a
    # character index.
    attr_reader :start_pos

    # One greater than the position of the last byte of the token —
    # a *byte* index, not a character index.
    attr_reader :end_pos

    # Build a token from its text and its byte-offset range.
    def initialize(text, start_pos, end_pos)
      @text, @start_pos, @end_pos = text, start_pos, end_pos
    end
  end
end
|
data/lib/rmmseg/word.rb
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
module RMMSeg
  # An object representing a CJK word.
  class Word
    # Identity map of the recognized word categories.
    TYPES = {
      :unrecognized => :unrecognized,
      :basic_latin_word => :basic_latin_word,
      :cjk_word => :cjk_word
    }.freeze

    # The content text of the word.
    attr_reader :text

    # The type of the word; one of the keys of TYPES.
    attr_reader :type

    # The frequency of the word. This value is meaningful only when
    # this is a one-character word; nil otherwise.
    attr_reader :frequency

    # Initialize a Word object.
    def initialize(text, type=TYPES[:unrecognized], frequency=nil)
      @text, @type, @frequency = text, type, frequency
    end

    # The number of characters in the word. *Not* the number of bytes.
    # NOTE(review): String#jlength is provided by the Ruby 1.8 jcode
    # library ($KCODE-aware length) — confirm it is loaded elsewhere;
    # the method does not exist on modern Rubies.
    def length
      @text.jlength
    end

    # The number of bytes in the word (String#length is byte count
    # under Ruby 1.8).
    def byte_size
      @text.length
    end
  end
end
|