RubyGems - plexus-rmmseg - Versions diffs - 0.1.6 - Mend

plexus-rmmseg 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (57) hide show

checksums.yaml +7 -0
data/.gitignore +1 -0
data/History.txt +42 -0
data/Manifest.txt +51 -0
data/README.txt +74 -0
data/Rakefile +12 -0
data/TODO.txt +5 -0
data/bin/rmmseg +65 -0
data/data/chars.dic +12638 -0
data/data/custom.dic +12 -0
data/data/punctuation.dic +79 -0
data/data/words.dic +120330 -0
data/lib/rmmseg.rb +13 -0
data/lib/rmmseg/algorithm.rb +136 -0
data/lib/rmmseg/amibguity.rb +4 -0
data/lib/rmmseg/chunk.rb +41 -0
data/lib/rmmseg/complex_algorithm.rb +122 -0
data/lib/rmmseg/config.rb +65 -0
data/lib/rmmseg/dictionary.rb +80 -0
data/lib/rmmseg/ferret.rb +109 -0
data/lib/rmmseg/lawl_rule.rb +12 -0
data/lib/rmmseg/lsdmfocw_rule.rb +13 -0
data/lib/rmmseg/mm_rule.rb +13 -0
data/lib/rmmseg/rule_helper.rb +28 -0
data/lib/rmmseg/simple_algorithm.rb +37 -0
data/lib/rmmseg/svwl_rule.rb +12 -0
data/lib/rmmseg/token.rb +30 -0
data/lib/rmmseg/version.rb +3 -0
data/lib/rmmseg/word.rb +38 -0
data/misc/ferret_example.rb +56 -0
data/misc/homepage.erb +170 -0
data/misc/homepage.html +1214 -0
data/plexus-rmmseg.gemspec +20 -0
data/spec/chunk_spec.rb +25 -0
data/spec/complex_algorithm_spec.rb +18 -0
data/spec/config_spec.rb +12 -0
data/spec/dictionary_spec.rb +20 -0
data/spec/lawl_rule_spec.rb +15 -0
data/spec/lsdmfocw_rule_spec.rb +14 -0
data/spec/mm_rule_spec.rb +15 -0
data/spec/simple_algorithm_spec.rb +46 -0
data/spec/spec_helper.rb +12 -0
data/spec/svwl_rule_spec.rb +14 -0
data/spec/word_spec.rb +9 -0
data/tasks/ann.rake +76 -0
data/tasks/annotations.rake +22 -0
data/tasks/doc.rake +48 -0
data/tasks/gem.rake +110 -0
data/tasks/homepage.rake +12 -0
data/tasks/manifest.rake +49 -0
data/tasks/post_load.rake +26 -0
data/tasks/rubyforge.rake +57 -0
data/tasks/setup.rb +227 -0
data/tasks/spec.rake +54 -0
data/tasks/svn.rake +44 -0
data/tasks/test.rake +38 -0
metadata +121 -0

data/plexus-rmmseg.gemspec ADDED

@@ -0,0 +1,20 @@
+require File.expand_path('../lib/rmmseg/version', __FILE__)
+Gem::Specification.new do |gem|
+  gem.name             = "plexus-rmmseg"
+  gem.version          = RMMSeg::VERSION
+  gem.authors          = ["pluskid"]
+  gem.email            = "pluskid@gmail.com"
+  gem.date             = "2008-03-16"
+  gem.summary          = "RMMSeg is an implementation of MMSEG Chinese word segmentation algorithm"
+  gem.description      = "RMMSeg is an implementation of MMSEG Chinese word segmentation algorithm. It is based on two variants of maximum matching algorithms. Two algorithms are available for using:   * simple algorithm that uses only forward maximum matching. * complex algorithm that uses three-word chunk maximum matching and 3 additonal rules to solve ambiguities.  For more information about the algorithm, please refer to the following essays:  * http://technology.chtsai.org/mmseg/ * http://pluskid.lifegoo.com/?p=261"
+  gem.homepage         = "http://rmmseg.rubyforge.org"
+  gem.license          = 'MIT'
+  gem.require_paths    = ["lib"]
+  gem.files            = `git ls-files`.split($/)
+  gem.test_files       = `git ls-files -- spec`.split($/)
+  gem.extra_rdoc_files = ["History.txt", "README.txt", "TODO.txt", "bin/rmmseg"]
+  gem.executables      = ["rmmseg"]
+  gem.rdoc_options     = ["--main", "README.txt"]
+end

data/spec/chunk_spec.rb ADDED

@@ -0,0 +1,25 @@
+# -*- coding: utf-8 -*-
+require File.join(File.dirname(__FILE__), 'spec_helper')
+describe 'chunk' do
+  before(:all) do
+    @words = gen_words(['中文', '中国字', '我', '中华人民共和国'],
+                       [10, 7, 100, 8])
+  end
+  it "should return proper total length" do
+    RMMSeg::Chunk::total_length(@words).should == 13
+  end
+  it "should return proper average length" do
+    RMMSeg::Chunk::average_length(@words).should == 13.0/4
+  end
+  it "should return proper variance" do
+    RMMSeg::Chunk::variance(@words).to_i.should == 4
+  end
+  it "should return proper degree of morphemic freedom" do
+    RMMSeg::Chunk::degree_of_morphemic_freedom(@words).should == 100
+  end
+end

data/spec/complex_algorithm_spec.rb ADDED

@@ -0,0 +1,18 @@
+# -*- coding: utf-8 -*-
+require File.join(File.dirname(__FILE__), 'spec_helper')
+describe "complex algorithm" do
+  it "should behave well as svwl rule" do
+    text = "研究生命科学"
+    segs = RMMSeg::ComplexAlgorithm.new(text).segment
+    segs.length.should == 3
+    segs[0].should == "研究"
+  end
+  it "should segment a relative big chunk of Chinese" do
+    text = "主持人把一只割去头的羊放在指定处。枪响后，甲乙两队共同向羊飞驰而去，先抢到羊的同队队员互相掩护，极力向终点奔驰，双方骑手们施展各种技巧，围追堵截，拼命抢夺。叼着羊先到达终点的为胜方。获胜者按照当地的习俗，将羊当场烤熟，请众骑手共享，称为“幸福肉”。"
+    segs = RMMSeg::ComplexAlgorithm.new(text).segment
+    segs.length.should == 87
+    segs[0].should == "主持人"
+  end
+end

data/spec/config_spec.rb ADDED

@@ -0,0 +1,12 @@
+require File.join(File.dirname(__FILE__), 'spec_helper')
+describe "RMMSeg Config" do
+  it "should be able to store and retrive config values" do
+    RMMSeg::Config.algorithm = :simple
+    RMMSeg::Config.algorithm.should == :simple
+  end
+  it "should reject invalid algorithm" do
+    lambda { RMMSeg::Config.algorithm = :foobar }.should raise_error(ArgumentError)
+  end
+end

data/spec/dictionary_spec.rb ADDED

@@ -0,0 +1,20 @@
+require File.join(File.dirname(__FILE__), 'spec_helper')
+describe "dictionary" do
+  before(:all) do
+    @dic = RMMSeg::Dictionary.instance
+  end
+  it "should contain frequency information for chars" do
+    @dic.get_word("你").frequency.should == 915385
+  end
+  it "should handle words" do
+    @dic.has_word?("你们").should == true
+  end
+  it "should ignore words which exceed the maximum length" do
+    @dic.has_word?("这是一个超出长度的词组").should == false
+  end
+end

data/spec/lawl_rule_spec.rb ADDED

@@ -0,0 +1,15 @@
+# -*- coding: utf-8 -*-
+require File.join(File.dirname(__FILE__), 'spec_helper')
+describe "largest average word length rule" do
+  it "should return chunks with the maximum average word length" do
+    chunks = [
+              gen_words(["国际化"]),
+              gen_words(["国际", "化"]),
+              gen_words(["国", "际", "化"])
+             ]
+    chunks = RMMSeg::LAWLRule.filter(chunks)
+    chunks.length.should == 1
+    chunks[0][0].text.should == "国际化"
+  end
+end

data/spec/lsdmfocw_rule_spec.rb ADDED

@@ -0,0 +1,14 @@
+# -*- coding: utf-8 -*-
+require File.join(File.dirname(__FILE__), 'spec_helper')
+describe "largest sum of degree of morphemic freedom of one-character words rule" do
+  it "should return chunks of the largest sum of degree of morphemic freedom of one-character words" do
+    chunks = [
+              gen_words(["主要", "是", "因为"], [nil, 100, nil]),
+              gen_words(["主", "要是", "因为"], [10, nil, nil])
+             ]
+    chunks = RMMSeg::LSDMFOCWRule.filter(chunks)
+    chunks.length.should == 1
+    chunks[0][0].text.should == "主要"
+  end
+end

data/spec/mm_rule_spec.rb ADDED

@@ -0,0 +1,15 @@
+# -*- coding: utf-8 -*-
+require File.join(File.dirname(__FILE__), 'spec_helper')
+describe 'maximum matching rule' do
+  it "should select chunks with the maximun total length" do
+    chunks = [
+              gen_words(["眼看", "就要", "来了"]),
+              gen_words(["眼", "看", "就", "要", "来", "了"]),
+              gen_words(["眼看", "就要", "来"]),
+              gen_words(["眼", "看", "就"])
+             ]
+    chunks = RMMSeg::MMRule.filter(chunks)
+    chunks.length.should == 2
+  end
+end

data/spec/simple_algorithm_spec.rb ADDED

@@ -0,0 +1,46 @@
+# -*- coding: utf-8 -*-
+require File.join(File.dirname(__FILE__), 'spec_helper')
+describe "simple algorithm" do
+  it "should handle simple cases" do
+    text = "我们都喜欢用 Ruby"
+    segs = RMMSeg::SimpleAlgorithm.new(text).segment
+    segs.length.should == 5
+    segs[0].should == "我们"
+  end
+  it "shouldn't be able to handle some case" do
+    text = "研究生命起源"
+    segs = RMMSeg::SimpleAlgorithm.new(text).segment
+    segs.length.should == 3
+    segs[0].should_not == "研究"
+    segs[0].should == "研究生"
+  end
+  it "should handle pure English as well" do
+    text = "This is a paragraph of English."
+    segs = RMMSeg::SimpleAlgorithm.new(text).segment
+    segs.length.should == 6
+    segs[0].should == "This"
+  end
+  it "should handle byte positions of English well" do
+    text = "This is a paragraph of English."
+    algor = RMMSeg::SimpleAlgorithm.new(text)
+    3.times { algor.next_token }
+    token = algor.next_token
+    token.text.should == "paragraph"
+    token.start.should == 10
+    token.end.should == 19
+  end
+  it "should handle byte positions of Chinese well" do
+    text = "这是一句中文"
+    algor = RMMSeg::SimpleAlgorithm.new(text)
+    2.times { algor.next_token }
+    token = algor.next_token
+    token.text.should == "中文"
+    token.start.should == 12
+    token.end.should == 18
+  end
+end

data/spec/spec_helper.rb ADDED

@@ -0,0 +1,12 @@
+$: << File.join(File.dirname(__FILE__), "../lib")
+require 'rmmseg'
+def gen_words words, freqs=nil
+  if freqs.nil?
+    words.map { |word| RMMSeg::Word.new(word) }
+  else
+    words.zip(freqs).map { |word, freq|
+      RMMSeg::Word.new(word, RMMSeg::Word::TYPES[:cjk_word], freq)
+    }
+  end
+end

data/spec/svwl_rule_spec.rb ADDED

@@ -0,0 +1,14 @@
+# -*- coding: utf-8 -*-
+require File.join(File.dirname(__FILE__), 'spec_helper')
+describe "smallest variance of word length rule" do
+  it "should return chunks with the smallest word length variance" do
+    chunks = [
+              gen_words(["研究", "生命", "起源"]),
+              gen_words(["研究生", "命", "起源"])
+             ]
+    chunks = RMMSeg::SVWLRule.filter(chunks)
+    chunks.length.should == 1
+    chunks[0][0].text.should == "研究"
+  end
+end

data/spec/word_spec.rb ADDED

@@ -0,0 +1,9 @@
+# -*- coding: utf-8 -*-
+require File.join(File.dirname(__FILE__), 'spec_helper')
+describe 'word' do
+  it "should return proper length on CJK words" do
+    w = RMMSeg::Word.new('中文')
+    w.length.should == 2
+  end
+end

data/tasks/ann.rake ADDED

@@ -0,0 +1,76 @@
+# $Id$
+begin
+  require 'bones/smtp_tls'
+rescue LoadError
+  require 'net/smtp'
+end
+require 'time'
+# Rake tasks for producing and e-mailing a release announcement.
+# All project settings are read from the PROJ object defined elsewhere.
+namespace :ann do
+  # File task: writes the announcement text to PROJ.ann_file.
+  file PROJ.ann_file do
+    puts "Generating #{PROJ.ann_file}"
+    File.open(PROJ.ann_file,'w') do |fd|
+      # Header: name/version, then optional author, URL and release name.
+      fd.puts("#{PROJ.name} version #{PROJ.version}")
+      fd.puts("    by #{Array(PROJ.authors).first}") if PROJ.authors
+      fd.puts("    #{PROJ.url}") if PROJ.url
+      fd.puts("    (the \"#{PROJ.release_name}\" release)") if PROJ.release_name
+      fd.puts
+      fd.puts("== DESCRIPTION")
+      fd.puts
+      fd.puts(PROJ.description)
+      fd.puts
+      # Replace the first line of the changes text with a "== CHANGES" heading.
+      fd.puts(PROJ.changes.sub(%r/^.*$/, '== CHANGES'))
+      fd.puts
+      # One section per configured paragraph name, taken from the readme file.
+      PROJ.ann_paragraphs.each do |p|
+        fd.puts "== #{p.upcase}"
+        fd.puts
+        fd.puts paragraphs_of(PROJ.readme_file, p).join("\n\n")
+        fd.puts
+      end
+      fd.puts PROJ.ann_text if PROJ.ann_text
+    end
+  end
+  desc "Create an announcement file"
+  task :announcement => PROJ.ann_file
+  desc "Send an email announcement"
+  task :email => PROJ.ann_file do
+    # Sender falls back to the project e-mail; recipients are always an array.
+    from = PROJ.ann_email[:from] || PROJ.email
+    to   = Array(PROJ.ann_email[:to])
+    ### build a mail header for RFC 822
+    rfc822msg =  "From: #{from}\n"
+    rfc822msg << "To: #{to.join(',')}\n"
+    rfc822msg << "Subject: [ANN] #{PROJ.name} #{PROJ.version}"
+    rfc822msg << " (#{PROJ.release_name})" if PROJ.release_name
+    rfc822msg << "\n"
+    rfc822msg << "Date: #{Time.new.rfc822}\n"
+    rfc822msg << "Message-Id: "
+    # Message-Id is built from the current time and the configured domain.
+    rfc822msg << "<#{"%.8f" % Time.now.to_f}@#{PROJ.ann_email[:domain]}>\n\n"
+    # The blank line above ends the headers; the announcement file is the body.
+    rfc822msg << File.read(PROJ.ann_file)
+    # Positional arguments for Net::SMTP.start, in this exact order.
+    params = [:server, :port, :domain, :acct, :passwd, :authtype].map do |key|
+      PROJ.ann_email[key]
+    end
+    # params[3] is the account (defaults to the project e-mail);
+    # params[4] is the password (prompted for on STDIN when not configured).
+    params[3] = PROJ.email if params[3].nil?
+    if params[4].nil?
+      STDOUT.write "Please enter your e-mail password (#{params[3]}): "
+      params[4] = STDIN.gets.chomp
+    end
+    ### send email
+    Net::SMTP.start(*params) {|smtp| smtp.sendmail(rfc822msg, from, to)}
+  end
+end  # namespace :ann
+desc 'Alias to ann:announcement'
+task :ann => 'ann:announcement'
+# Register the generated announcement file for removal by the clobber task.
+CLOBBER << PROJ.ann_file
+# EOF

data/tasks/annotations.rake ADDED

@@ -0,0 +1,22 @@
+# $Id$
+if HAVE_BONES
+desc "Enumerate all annotations"
+task :notes do
+  Bones::AnnotationExtractor.enumerate(
+      PROJ, PROJ.annotation_tags.join('|'), :tag => true)
+end
+namespace :notes do
+  PROJ.annotation_tags.each do |tag|
+    desc "Enumerate all #{tag} annotations"
+    task tag.downcase.to_sym do
+      Bones::AnnotationExtractor.enumerate(PROJ, tag)
+    end
+  end
+end
+end  # if HAVE_BONES
+# EOF

data/tasks/doc.rake ADDED

@@ -0,0 +1,48 @@
+# $Id$
+require 'rake/rdoctask'
+namespace :doc do
+  desc 'Generate RDoc documentation'
+  Rake::RDocTask.new do |rd|
+    rd.main = PROJ.rdoc_main
+    rd.rdoc_dir = PROJ.rdoc_dir
+    incl = Regexp.new(PROJ.rdoc_include.join('|'))
+    excl = Regexp.new(PROJ.rdoc_exclude.join('|'))
+    files = PROJ.files.find_all do |fn|
+              case fn
+              when excl; false
+              when incl; true
+              else false end
+            end
+    rd.rdoc_files.push(*files)
+    title = "#{PROJ.name}-#{PROJ.version} Documentation"
+    title = "#{PROJ.rubyforge_name}'s " + title if PROJ.rubyforge_name != title
+    rd.options << "-t #{title}"
+    rd.options.concat(PROJ.rdoc_opts)
+  end
+  desc 'Generate ri locally for testing'
+  task :ri => :clobber_ri do
+    sh "#{RDOC} --ri -o ri ."
+  end
+  task :clobber_ri do
+    rm_r 'ri' rescue nil
+  end
+end  # namespace :doc
+desc 'Alias to doc:rdoc'
+task :doc => 'doc:rdoc'
+desc 'Remove all build products'
+task :clobber => %w(doc:clobber_rdoc doc:clobber_ri)
+remove_desc_for_task %w(doc:clobber_rdoc)
+# EOF

data/tasks/gem.rake ADDED

@@ -0,0 +1,110 @@
+# $Id$
+require 'rake/gempackagetask'
+# Packaging tasks: builds the Gem::Specification from PROJ and defines
+# tasks to package, install and uninstall the gem.
+namespace :gem do
+  # The specification is stored on PROJ so later tasks can reuse it.
+  PROJ.spec = Gem::Specification.new do |s|
+    s.name = PROJ.name
+    s.version = PROJ.version
+    s.summary = PROJ.summary
+    s.authors = Array(PROJ.authors)
+    s.email = PROJ.email
+    s.homepage = Array(PROJ.url).first
+    s.rubyforge_project = PROJ.rubyforge_name
+    s.post_install_message = PROJ.post_install_message
+    s.description = PROJ.description
+    # Each dependency entry is splatted into add_dependency.
+    PROJ.dependencies.each do |dep|
+      s.add_dependency(*dep)
+    end
+    s.files = PROJ.files
+    s.executables = PROJ.executables.map {|fn| File.basename(fn)}
+    # Any packaged extconf.rb is registered as a native extension.
+    s.extensions = PROJ.files.grep %r/extconf\.rb$/
+    s.bindir = 'bin'
+    # Only library directories that actually exist become require paths.
+    dirs = Dir["{#{PROJ.libs.join(',')}}"]
+    s.require_paths = dirs unless dirs.empty?
+    # Extra RDoc files: matched by an include pattern, but never .rb files,
+    # files under ext, or anything matching a configured exclude pattern.
+    incl = Regexp.new(PROJ.rdoc_include.join('|'))
+    excl = PROJ.rdoc_exclude.dup.concat %w[\.rb$ ^(\.\/|\/)?ext]
+    excl = Regexp.new(excl.join('|'))
+    rdoc_files = PROJ.files.find_all do |fn|
+                   case fn
+                   when excl; false
+                   when incl; true
+                   else false end
+                 end
+    s.rdoc_options = PROJ.rdoc_opts + ['--main', PROJ.rdoc_main]
+    s.extra_rdoc_files = rdoc_files
+    s.has_rdoc = true
+    # Use the single test file when it exists as a regular file;
+    # otherwise fall back to the list of tests.
+    if test ?f, PROJ.test_file
+      s.test_file = PROJ.test_file
+    else
+      s.test_files = PROJ.tests.to_a
+    end
+    # Do any extra stuff the user wants
+#   spec_extras.each do |msg, val|
+#     case val
+#     when Proc
+#       val.call(s.send(msg))
+#     else
+#       s.send "#{msg}=", val
+#     end
+#   end
+  end
+  desc 'Show information about the gem'
+  task :debug do
+    puts PROJ.spec.to_ruby
+  end
+  # Package task for the tar/zip archives; the gem file has its own rule below.
+  pkg = Rake::PackageTask.new(PROJ.name, PROJ.version) do |pkg|
+    pkg.need_tar = PROJ.need_tar
+    pkg.need_zip = PROJ.need_zip
+    pkg.package_files += PROJ.spec.files
+  end
+  # Clear the comment on gem:package; a new description is set just below.
+  Rake::Task['gem:package'].instance_variable_set(:@full_comment, nil)
+  # Gems for a non-default platform carry the platform in the file name.
+  gem_file = if PROJ.spec.platform == Gem::Platform::RUBY
+      "#{pkg.package_name}.gem"
+    else
+      "#{pkg.package_name}-#{PROJ.spec.platform}.gem"
+    end
+  desc "Build the gem file #{gem_file}"
+  task :package => "#{pkg.package_dir}/#{gem_file}"
+  # Build the gem, then move the resulting file into the package directory.
+  file "#{pkg.package_dir}/#{gem_file}" => [pkg.package_dir] + PROJ.spec.files do
+    when_writing("Creating GEM") {
+      Gem::Builder.new(PROJ.spec).build
+      verbose(true) {
+        mv gem_file, "#{pkg.package_dir}/#{gem_file}"
+      }
+    }
+  end
+  desc 'Install the gem'
+  task :install => [:clobber, :package] do
+    sh "#{SUDO} #{GEM} install pkg/#{PROJ.spec.full_name}"
+  end
+  desc 'Uninstall the gem'
+  task :uninstall do
+    sh "#{SUDO} #{GEM} uninstall -v '#{PROJ.version}' -x #{PROJ.name}"
+  end
+end  # namespace :gem
+desc 'Alias to gem:package'
+task :gem => 'gem:package'
+task :clobber => 'gem:clobber_package'
+# Hide gem:clobber_package from the task listing.
+remove_desc_for_task %w(gem:clobber_package)
+# EOF