RubyGems - rmmseg - Versions diffs - 0.0.1 - Mend

Files changed (38) hide show

data/History.txt +6 -0
data/Manifest.txt +37 -0
data/README.txt +63 -0
data/Rakefile +33 -0
data/TODO.txt +3 -0
data/bin/rmmseg +63 -0
data/lib/rmmseg/algorithm.rb +157 -0
data/lib/rmmseg/amibguity.rb +4 -0
data/lib/rmmseg/chars.dic +12638 -0
data/lib/rmmseg/chunk.rb +51 -0
data/lib/rmmseg/complex_algorithm.rb +52 -0
data/lib/rmmseg/config.rb +59 -0
data/lib/rmmseg/dictionary.rb +66 -0
data/lib/rmmseg/ferret.rb +43 -0
data/lib/rmmseg/lawl_rule.rb +14 -0
data/lib/rmmseg/lsdmfocw_rule.rb +15 -0
data/lib/rmmseg/mm_rule.rb +15 -0
data/lib/rmmseg/rule_helper.rb +22 -0
data/lib/rmmseg/simple_algorithm.rb +22 -0
data/lib/rmmseg/svwl_rule.rb +14 -0
data/lib/rmmseg/token.rb +22 -0
data/lib/rmmseg/word.rb +37 -0
data/lib/rmmseg/words.dic +120330 -0
data/lib/rmmseg.rb +15 -0
data/misc/homepage.erb +93 -0
data/misc/homepage.html +1063 -0
data/spec/chunk_spec.rb +26 -0
data/spec/complex_algorithm_spec.rb +18 -0
data/spec/config_spec.rb +12 -0
data/spec/dictionary_spec.rb +20 -0
data/spec/lawl_rule_spec.rb +15 -0
data/spec/lsdmfocw_rule_spec.rb +14 -0
data/spec/mm_rule_spec.rb +15 -0
data/spec/simple_algorithm_spec.rb +46 -0
data/spec/spec_helper.rb +15 -0
data/spec/svwl_rule_spec.rb +14 -0
data/spec/word_spec.rb +9 -0
metadata +101 -0

data/spec/chunk_spec.rb ADDED Viewed

@@ -0,0 +1,26 @@
+# -*- coding: utf-8 -*-
+require File.join(File.dirname(__FILE__), 'spec_helper')
+describe 'chunk' do
+  before(:all) do
+    @words = gen_words(['中文', '中国字', '我', '中华人民共和国'],
+                       [10, 7, 100, 8])
+    @chunk = RMMSeg::Chunk.new(@words)
+  end
+  it "should return proper total length" do
+    @chunk.total_length.should == 13
+  end
+  it "should return proper average length" do
+    @chunk.average_length.should == 13.0/4
+  end
+  it "should return proper variance" do
+    @chunk.variance.to_i.should == 4
+  end
+  it "should return proper degree of morphemic freedom" do
+    @chunk.degree_of_morphemic_freedom.should == 100
+  end
+end

data/spec/complex_algorithm_spec.rb ADDED Viewed

@@ -0,0 +1,18 @@
+# -*- coding: utf-8 -*-
+require File.join(File.dirname(__FILE__), 'spec_helper')
+describe "complex algorithm" do
+  it "should behave well as svwl rule" do
+    text = "研究生命科学"
+    segs = RMMSeg::ComplexAlgorithm.new(text).segment
+    segs.length.should == 3
+    segs[0].should == "研究"
+  end
+  it "should segment a relative big chunk of Chinese" do
+    text = "主持人把一只割去头的羊放在指定处。枪响后，甲乙两队共同向羊飞驰而去，先抢到羊的同队队员互相掩护，极力向终点奔驰，双方骑手们施展各种技巧，围追堵截，拼命抢夺。叼着羊先到达终点的为胜方。获胜者按照当地的习俗，将羊当场烤熟，请众骑手共享，称为“幸福肉”。"
+    segs = RMMSeg::ComplexAlgorithm.new(text).segment
+    segs.length.should == 87
+    segs[0].should == "主持人"
+  end
+end

data/spec/config_spec.rb ADDED Viewed

@@ -0,0 +1,12 @@
+require File.join(File.dirname(__FILE__), 'spec_helper')
+describe "RMMSeg Config" do
+  it "should be able to store and retrive config values" do
+    RMMSeg::Config.algorithm = :simple
+    RMMSeg::Config.algorithm.should == :simple
+  end
+  it "should reject invalid algorithm" do
+    lambda { RMMSeg::Config.algorithm = :foobar }.should raise_error(ArgumentError)
+  end
+end

data/spec/dictionary_spec.rb ADDED Viewed

@@ -0,0 +1,20 @@
+require File.join(File.dirname(__FILE__), 'spec_helper')
+describe "dictionary" do
+  before(:all) do
+    @dic = RMMSeg::Dictionary.instance
+  end
+  it "should contain frequency information for chars" do
+    @dic.get_word("你").frequency.should == 915385
+  end
+  it "should handle words" do
+    @dic.has_word?("你们").should == true
+  end
+  it "should ignore words which exceed the maximum length" do
+    @dic.has_word?("这是一个超出长度的词组").should == false
+  end
+end

data/spec/lawl_rule_spec.rb ADDED Viewed

@@ -0,0 +1,15 @@
+# -*- coding: utf-8 -*-
+require File.join(File.dirname(__FILE__), 'spec_helper')
+describe "largest average word length rule" do
+  it "should return chunks with the maximum average word length" do
+    chunks = [
+              RMMSeg::Chunk.new(gen_words(["国际化"])),
+              RMMSeg::Chunk.new(gen_words(["国际", "化"])),
+              RMMSeg::Chunk.new(gen_words(["国", "际", "化"]))
+             ]
+    chunks = RMMSeg::LAWLRule.new.filter(chunks)
+    chunks.length.should == 1
+    chunks[0].words[0].text.should == "国际化"
+  end
+end

data/spec/lsdmfocw_rule_spec.rb ADDED Viewed

@@ -0,0 +1,14 @@
+# -*- coding: utf-8 -*-
+require File.join(File.dirname(__FILE__), 'spec_helper')
+describe "largest sum of degree of morphemic freedom of one-character words rule" do
+  it "should return chunks of the largest sum of degree of morphemic freedom of one-character words" do
+    chunks = [
+              RMMSeg::Chunk.new(gen_words(["主要", "是", "因为"], [nil, 100, nil])),
+              RMMSeg::Chunk.new(gen_words(["主", "要是", "因为"], [10, nil, nil]))
+             ]
+    chunks = RMMSeg::LSDMFOCWRule.new.filter(chunks)
+    chunks.length.should == 1
+    chunks[0].words[0].text.should == "主要"
+  end
+end

data/spec/mm_rule_spec.rb ADDED Viewed

@@ -0,0 +1,15 @@
+# -*- coding: utf-8 -*-
+require File.join(File.dirname(__FILE__), 'spec_helper')
+describe 'maximum matching rule' do
+  it "should select chunks with the maximun total length" do
+    chunks = [
+              RMMSeg::Chunk.new(gen_words(["眼看", "就要", "来了"])),
+              RMMSeg::Chunk.new(gen_words(["眼", "看", "就", "要", "来", "了"])),
+              RMMSeg::Chunk.new(gen_words(["眼看", "就要", "来"])),
+              RMMSeg::Chunk.new(gen_words(["眼", "看", "就"]))
+             ]
+    chunks = RMMSeg::MMRule.new.filter(chunks)
+    chunks.length.should == 2
+  end
+end

data/spec/simple_algorithm_spec.rb ADDED Viewed

@@ -0,0 +1,46 @@
+# -*- coding: utf-8 -*-
+require File.join(File.dirname(__FILE__), 'spec_helper')
+describe "simple algorithm" do
+  it "should handle simple cases" do
+    text = "我们都喜欢用 Ruby"
+    segs = RMMSeg::SimpleAlgorithm.new(text).segment
+    segs.length.should == 5
+    segs[0].should == "我们"
+  end
+  it "shouldn't be able to handle some case" do
+    text = "研究生命起源"
+    segs = RMMSeg::SimpleAlgorithm.new(text).segment
+    segs.length.should == 3
+    segs[0].should_not == "研究"
+    segs[0].should == "研究生"
+  end
+  it "should handle pure English as well" do
+    text = "This is a paragraph of English."
+    segs = RMMSeg::SimpleAlgorithm.new(text).segment
+    segs.length.should == 6
+    segs[0].should == "This"
+  end
+  it "should handle byte positions of English well" do
+    text = "This is a paragraph of English."
+    algor = RMMSeg::SimpleAlgorithm.new(text)
+    3.times { algor.next_token }
+    token = algor.next_token
+    token.text.should == "paragraph"
+    token.start_pos.should == 10
+    token.end_pos.should == 19
+  end
+  it "should handle byte positions of Chinese well" do
+    text = "这是一句中文"
+    algor = RMMSeg::SimpleAlgorithm.new(text)
+    2.times { algor.next_token }
+    token = algor.next_token
+    token.text.should == "中文"
+    token.start_pos.should == 12
+    token.end_pos.should == 18
+  end
+end

data/spec/spec_helper.rb ADDED Viewed

@@ -0,0 +1,15 @@
+$: << File.join(File.dirname(__FILE__), "../lib")
+require 'rmmseg'
+require 'rubygems'
+require 'spec'
+def gen_words words, freqs=nil  # Spec helper: wrap each text in `words` in an RMMSeg::Word, optionally pairing it with a frequency from `freqs`; returns the resulting array.
+  if freqs.nil?  # no frequency list supplied
+    words.map { |word| RMMSeg::Word.new(word) }  # single-argument form; any other Word attributes are left to Word.new
+  else
+    words.zip(freqs).map { |word, freq|  # pair words[i] with freqs[i]; zip fills in nil when freqs is shorter
+      RMMSeg::Word.new(word, RMMSeg::Word::TYPES[:cjk_word], freq)  # always built with the :cjk_word type; freq may be nil (callers pass nil entries)
+    }
+  end
+end

data/spec/svwl_rule_spec.rb ADDED Viewed

@@ -0,0 +1,14 @@
+# -*- coding: utf-8 -*-
+require File.join(File.dirname(__FILE__), 'spec_helper')
+describe "smallest variance of word length rule" do
+  it "should return chunks with the smallest word length variance" do
+    chunks = [
+              RMMSeg::Chunk.new(gen_words(["研究", "生命", "起源"])),
+              RMMSeg::Chunk.new(gen_words(["研究生", "命", "起源"]))
+             ]
+    chunks = RMMSeg::SVWLRule.new.filter(chunks)
+    chunks.length.should == 1
+    chunks[0].words[0].text.should == "研究"
+  end
+end

data/spec/word_spec.rb ADDED Viewed

@@ -0,0 +1,9 @@
+# -*- coding: utf-8 -*-
+require File.join(File.dirname(__FILE__), 'spec_helper')
+describe 'word' do
+  it "should return proper length on CJK words" do
+    w = RMMSeg::Word.new('中文')
+    w.length.should == 2
+  end
+end

metadata ADDED Viewed

@@ -0,0 +1,101 @@
+--- !ruby/object:Gem::Specification
+name: rmmseg
+version: !ruby/object:Gem::Version
+  version: 0.0.1
+platform: ruby
+authors:
+- pluskid
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2008-02-01 00:00:00 +08:00
+default_executable:
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: hoe
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 1.5.0
+    version:
+description: "RMMSeg is an implementation of MMSEG Chinese word segmentation algorithm. It is based on two variants of maximum matching algorithms. Two algorithms are available for use:   * simple algorithm that uses only forward maximum matching. * complex algorithm that uses three-word chunk maximum matching and 3 additional rules to solve ambiguities.  For more information about the algorithm, please refer to the following essays:  * http://technology.chtsai.org/mmseg/ * http://pluskid.lifegoo.com/?p=261"
+email: pluskid@gmail.com
+executables:
+- rmmseg
+extensions: []
+extra_rdoc_files:
+- History.txt
+- Manifest.txt
+- README.txt
+- TODO.txt
+files:
+- History.txt
+- Manifest.txt
+- README.txt
+- Rakefile
+- TODO.txt
+- bin/rmmseg
+- lib/rmmseg.rb
+- lib/rmmseg/algorithm.rb
+- lib/rmmseg/amibguity.rb
+- lib/rmmseg/chars.dic
+- lib/rmmseg/chunk.rb
+- lib/rmmseg/complex_algorithm.rb
+- lib/rmmseg/config.rb
+- lib/rmmseg/dictionary.rb
+- lib/rmmseg/ferret.rb
+- lib/rmmseg/lawl_rule.rb
+- lib/rmmseg/lsdmfocw_rule.rb
+- lib/rmmseg/mm_rule.rb
+- lib/rmmseg/rule_helper.rb
+- lib/rmmseg/simple_algorithm.rb
+- lib/rmmseg/svwl_rule.rb
+- lib/rmmseg/token.rb
+- lib/rmmseg/word.rb
+- lib/rmmseg/words.dic
+- misc/homepage.erb
+- misc/homepage.html
+- spec/chunk_spec.rb
+- spec/complex_algorithm_spec.rb
+- spec/config_spec.rb
+- spec/dictionary_spec.rb
+- spec/lawl_rule_spec.rb
+- spec/lsdmfocw_rule_spec.rb
+- spec/mm_rule_spec.rb
+- spec/simple_algorithm_spec.rb
+- spec/spec_helper.rb
+- spec/svwl_rule_spec.rb
+- spec/word_spec.rb
+has_rdoc: true
+homepage: http://rmmseg.rubyforge.org
+post_install_message:
+rdoc_options:
+- --main
+- README.txt
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+requirements: []
+rubyforge_project: rmmseg
+rubygems_version: 1.0.1
+signing_key:
+specification_version: 2
+summary: RMMSeg is an implementation of MMSEG algorithm in Ruby. MMSEG is a Chinese segmentation algorithm based on two variants of maximum matching.  RMMSeg can be used as a stand alone program or as an Analyzer of Ferret.
+test_files: []

rmmseg 0.0.1