RubyGems - rmmseg - Versions diffs - 0.0.1 - Mend

rmmseg 0.0.1

Files changed (38) hide show

data/History.txt +6 -0
data/Manifest.txt +37 -0
data/README.txt +63 -0
data/Rakefile +33 -0
data/TODO.txt +3 -0
data/bin/rmmseg +63 -0
data/lib/rmmseg/algorithm.rb +157 -0
data/lib/rmmseg/amibguity.rb +4 -0
data/lib/rmmseg/chars.dic +12638 -0
data/lib/rmmseg/chunk.rb +51 -0
data/lib/rmmseg/complex_algorithm.rb +52 -0
data/lib/rmmseg/config.rb +59 -0
data/lib/rmmseg/dictionary.rb +66 -0
data/lib/rmmseg/ferret.rb +43 -0
data/lib/rmmseg/lawl_rule.rb +14 -0
data/lib/rmmseg/lsdmfocw_rule.rb +15 -0
data/lib/rmmseg/mm_rule.rb +15 -0
data/lib/rmmseg/rule_helper.rb +22 -0
data/lib/rmmseg/simple_algorithm.rb +22 -0
data/lib/rmmseg/svwl_rule.rb +14 -0
data/lib/rmmseg/token.rb +22 -0
data/lib/rmmseg/word.rb +37 -0
data/lib/rmmseg/words.dic +120330 -0
data/lib/rmmseg.rb +15 -0
data/misc/homepage.erb +93 -0
data/misc/homepage.html +1063 -0
data/spec/chunk_spec.rb +26 -0
data/spec/complex_algorithm_spec.rb +18 -0
data/spec/config_spec.rb +12 -0
data/spec/dictionary_spec.rb +20 -0
data/spec/lawl_rule_spec.rb +15 -0
data/spec/lsdmfocw_rule_spec.rb +14 -0
data/spec/mm_rule_spec.rb +15 -0
data/spec/simple_algorithm_spec.rb +46 -0
data/spec/spec_helper.rb +15 -0
data/spec/svwl_rule_spec.rb +14 -0
data/spec/word_spec.rb +9 -0
metadata +101 -0

data/History.txt ADDED Viewed

@@ -0,0 +1,6 @@
+=== 0.0.1 / 2008-01-31
+* Analyser integration with Ferret.
+* rdoc added
+* Lazily init the +Word+ objects inside the +Dictionary+.
+* Handle English punctuation correctly.

data/Manifest.txt ADDED Viewed

@@ -0,0 +1,37 @@
+History.txt
+Manifest.txt
+README.txt
+Rakefile
+TODO.txt
+bin/rmmseg
+lib/rmmseg.rb
+lib/rmmseg/algorithm.rb
+lib/rmmseg/amibguity.rb
+lib/rmmseg/chars.dic
+lib/rmmseg/chunk.rb
+lib/rmmseg/complex_algorithm.rb
+lib/rmmseg/config.rb
+lib/rmmseg/dictionary.rb
+lib/rmmseg/ferret.rb
+lib/rmmseg/lawl_rule.rb
+lib/rmmseg/lsdmfocw_rule.rb
+lib/rmmseg/mm_rule.rb
+lib/rmmseg/rule_helper.rb
+lib/rmmseg/simple_algorithm.rb
+lib/rmmseg/svwl_rule.rb
+lib/rmmseg/token.rb
+lib/rmmseg/word.rb
+lib/rmmseg/words.dic
+misc/homepage.erb
+misc/homepage.html
+spec/chunk_spec.rb
+spec/complex_algorithm_spec.rb
+spec/config_spec.rb
+spec/dictionary_spec.rb
+spec/lawl_rule_spec.rb
+spec/lsdmfocw_rule_spec.rb
+spec/mm_rule_spec.rb
+spec/simple_algorithm_spec.rb
+spec/spec_helper.rb
+spec/svwl_rule_spec.rb
+spec/word_spec.rb

data/README.txt ADDED Viewed

@@ -0,0 +1,63 @@
+= rmmseg
+* http://rmmseg.rubyforge.org
+* mailto:pluskid@gmail.com
+== DESCRIPTION:
+RMMSeg is an implementation of MMSEG Chinese word segmentation
+algorithm. It is based on two variants of maximum matching
+algorithms. Two algorithms are available:
+* simple algorithm that uses only forward maximum matching.
+* complex algorithm that uses three-word chunk maximum matching and 3
+  additional rules to resolve ambiguities.
+For more information about the algorithm, please refer to the
+following essays:
+* http://technology.chtsai.org/mmseg/
+* http://pluskid.lifegoo.com/?p=261
+== FEATURES/PROBLEMS:
+* Provides the +rmmseg+ command line tool as a quick and easy way to
+  access the word segmentation feature.
+* Provides an +Analyser+ for integrating with Ferret[http://ferret.davebalmain.com/trac].
+== SYNOPSIS:
+  $ rmmseg --separator _ < input.txt
+== REQUIREMENTS:
+* ruby
+== INSTALL:
+* sudo gem install rmmseg
+== LICENSE:
+(The MIT License)
+Copyright (c) 2008 FIX
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+'Software'), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/Rakefile ADDED Viewed

@@ -0,0 +1,33 @@
+# -*- ruby -*-
+# Build script: declares the gem through Hoe and defines two tasks for
+# generating and publishing the project homepage.
+# Make the local lib/ directory available to require so that
+# 'rmmseg' (and RMMSeg::VERSION) can be loaded below.
+$: << File.join(File.dirname(__FILE__), "lib")
+require 'rubygems'
+require 'hoe'
+require 'rmmseg'
+# Gem declaration; the version number is taken from RMMSeg::VERSION.
+Hoe.new('rmmseg', RMMSeg::VERSION) do |p|
+  p.rubyforge_name = 'rmmseg'
+  p.author = 'pluskid'
+  p.email = 'pluskid@gmail.com'
+  # NOTE(review): Manifest.txt lists spec/*_spec.rb files but no
+  # spec/spec.rb -- confirm this glob matches the intended specs.
+  p.test_globs = ["spec/spec.rb"]
+  # Document the Ruby sources under lib/ and every .txt file.
+  p.rdoc_pattern = /^lib\/.*\.rb$|\.txt$/
+  p.summary = <<-END
+    RMMSeg is an implementation of MMSEG algorithm in Ruby. MMSEG is a
+    Chinese segmentation algorithm based on two variants of maximum
+    matching.
+    RMMSeg can be used as a stand alone program or as an Analyzer of
+    Ferret.
+    END
+end
+# Render misc/homepage.erb into misc/homepage.html with the gerbil tool.
+task :homepage do
+  sh "gerbil html misc/homepage.erb > misc/homepage.html"
+end
+# Copy the rendered homepage to the project's RubyForge web directory.
+task :publish_homepage do
+  sh "scp misc/homepage.html rubyforge.org:/var/www/gforge-projects/rmmseg/index.html"
+end
+# vim: syntax=Ruby

data/TODO.txt ADDED Viewed

@@ -0,0 +1,3 @@
+=== TODO
+* Add a filter to remove Chinese punctuation.

data/bin/rmmseg ADDED Viewed

@@ -0,0 +1,63 @@
+#!/usr/bin/env ruby
+$: << File.join(File.dirname(__FILE__), "..", "lib")
+require 'rmmseg'
+include RMMSeg
+require 'getoptlong'
+def print_usage
+  puts <<EOF
+#{__FILE__}  Segment Chinese text. Read from stdin and print to stdout.
+Options:
+  -h
+  --help      Print this message
+  -a
+  --algorithm Select segment algorithm. Valid values are 'complex' and
+              'simple'. 'simple' is the default one.
+  -A
+  --ambiguity Select a behavior when an ambiguity occurs. Valid values
+              are 'raise_exception' and 'select_first'. 'select_first'
+              is the default one.
+EOF
+  exit 0
+end
+separator = " "
+optparser = GetoptLong.new
+optparser.set_options(["-a", "--algorithm", GetoptLong::REQUIRED_ARGUMENT],
+                      ["-A", "--ambiguity", GetoptLong::REQUIRED_ARGUMENT],
+                      ["-s", "--separator", GetoptLong::REQUIRED_ARGUMENT],
+                      ["-h", "--help", GetoptLong::NO_ARGUMENT])
+loop do
+  begin
+    opt, arg = optparser.get
+    break if not opt
+    case opt
+    when "-h"
+      print_usage
+    when "-a"
+      Config.algorithm = arg.to_sym
+    when "-A"
+      Config.on_ambiguity = arg.to_sym
+    when "-s"
+      separator = arg
+    end
+  rescue => err
+    puts err
+    exit 1
+  end
+end
+puts segment(STDIN.read).join(separator)

data/lib/rmmseg/algorithm.rb ADDED Viewed

@@ -0,0 +1,157 @@
+require 'jcode'
+require 'rmmseg/dictionary'
+require 'rmmseg/word'
+require 'rmmseg/chunk'
+require 'rmmseg/token'
+module RMMSeg
+  # An algorithm can segment a piece of text into an array of
+  # words. This module is the common operations shared by
+  # SimpleAlgorithm and ComplexAlgorithm .
+  module Algorithm
+    # Initialize a new instance of Algorithm, the +text+ will
+    # then be segmented by this instance.
+    def initialize(text)
+      @chars = text.each_char
+      @index = 0
+      @byte_index = 0
+    end
+    # Get the next Token recognized.
+    def next_token
+      return nil if @index >= @chars.length
+      current = @chars[@index]
+      orig_index = @index
+      token = nil
+      len = 0
+      if basic_latin?(current)
+        token = get_basic_latin_word
+      else
+        token = get_cjk_word(create_chunks)
+      end
+      if token.text.empty?
+        return next_token
+      else
+        return token
+      end
+    end
+    # Segment the string in +text+ into an array
+    # of words.
+    def segment
+      words = Array.new
+      loop do
+        token = next_token
+        break if token.nil?
+        words << token.text
+      end
+      words
+    end
+    # Skip whitespaces and punctuation to extract a basic latin
+    # word.
+    def get_basic_latin_word
+      word = String.new
+      start_pos = nil
+      end_pos = nil
+      i = @index
+      while i < @chars.length     &&
+          basic_latin?(@chars[i]) &&
+          nonword_char?(@chars[i])
+        i += 1
+      end
+      start_pos = @byte_index + i - @index
+      while i < @chars.length && basic_latin?(@chars[i])
+        break if nonword_char?(@chars[i])
+        word << @chars[i]
+        i += 1
+      end
+      end_pos = @byte_index + i - @index
+      while i < @chars.length      &&
+          basic_latin?(@chars[i])  &&
+          nonword_char?(@chars[i])
+        i += 1
+      end
+      @byte_index += i - @index
+      @index = i
+      return Token.new(word, start_pos, end_pos)
+    end
+    # Use rules to filter the +chunks+ to get the most
+    # apropos CJK word.
+    def get_cjk_word(chunks)
+      i = 0
+      while i < @rules.length
+        break if chunks.length < 2
+        chunks = @rules[i].filter(chunks)
+        i += 1
+      end
+      if chunks.length > 1
+        if Config.on_ambiguity == :raise_exception
+          raise Ambiguity, "Can't solve ambiguity on #{chunks}"
+        end
+      end
+      word = chunks[0].words[0]
+      token = Token.new(word.text, @byte_index, @byte_index+word.byte_size)
+      @index += word.length
+      @byte_index += word.byte_size
+      return token
+    end
+    # Find all words occuring in the dictionary starting from
+    # +index+ . The maximum word length is determined by
+    # +Config.max_word_length+ .
+    def find_match_words(chars, index)
+      dic = Dictionary.instance
+      str = String.new
+      words = Array.new
+      i = index
+      loop do
+        break if i >= chars.length || basic_latin?(chars[i])
+        str << chars[i]
+        if dic.has_word?(str)
+          word = dic.get_word(str)
+          words << word
+        end
+        i += 1
+        break if Word.new(str).length >= Config.max_word_length
+      end
+      if words.empty?
+        words << Word.new(chars[index], Word::TYPES[:unrecognized])
+      end
+      words
+    end
+    # Determine whether a character is a basic latin character.
+    #--
+    # TODO: Implement this method in a more correct way.
+    # currently I use number of bytes in this char to determine this.
+    # If it is a one-byte char, I consider it a basic latin.
+    #++
+    def basic_latin?(char)
+      char.size == 1
+    end
+    # Determine whether a character can be part of a basic latin
+    # word.
+    def nonword_char?(char)
+      /^\W$/ =~ char
+    end
+  end
+end

data/lib/rmmseg/amibguity.rb ADDED Viewed

@@ -0,0 +1,4 @@
+module RMMSeg
+  class Ambiguity < Exception
+  end
+end