plexus-rmmseg 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. checksums.yaml +7 -0
  2. data/.gitignore +1 -0
  3. data/History.txt +42 -0
  4. data/Manifest.txt +51 -0
  5. data/README.txt +74 -0
  6. data/Rakefile +12 -0
  7. data/TODO.txt +5 -0
  8. data/bin/rmmseg +65 -0
  9. data/data/chars.dic +12638 -0
  10. data/data/custom.dic +12 -0
  11. data/data/punctuation.dic +79 -0
  12. data/data/words.dic +120330 -0
  13. data/lib/rmmseg.rb +13 -0
  14. data/lib/rmmseg/algorithm.rb +136 -0
  15. data/lib/rmmseg/amibguity.rb +4 -0
  16. data/lib/rmmseg/chunk.rb +41 -0
  17. data/lib/rmmseg/complex_algorithm.rb +122 -0
  18. data/lib/rmmseg/config.rb +65 -0
  19. data/lib/rmmseg/dictionary.rb +80 -0
  20. data/lib/rmmseg/ferret.rb +109 -0
  21. data/lib/rmmseg/lawl_rule.rb +12 -0
  22. data/lib/rmmseg/lsdmfocw_rule.rb +13 -0
  23. data/lib/rmmseg/mm_rule.rb +13 -0
  24. data/lib/rmmseg/rule_helper.rb +28 -0
  25. data/lib/rmmseg/simple_algorithm.rb +37 -0
  26. data/lib/rmmseg/svwl_rule.rb +12 -0
  27. data/lib/rmmseg/token.rb +30 -0
  28. data/lib/rmmseg/version.rb +3 -0
  29. data/lib/rmmseg/word.rb +38 -0
  30. data/misc/ferret_example.rb +56 -0
  31. data/misc/homepage.erb +170 -0
  32. data/misc/homepage.html +1214 -0
  33. data/plexus-rmmseg.gemspec +20 -0
  34. data/spec/chunk_spec.rb +25 -0
  35. data/spec/complex_algorithm_spec.rb +18 -0
  36. data/spec/config_spec.rb +12 -0
  37. data/spec/dictionary_spec.rb +20 -0
  38. data/spec/lawl_rule_spec.rb +15 -0
  39. data/spec/lsdmfocw_rule_spec.rb +14 -0
  40. data/spec/mm_rule_spec.rb +15 -0
  41. data/spec/simple_algorithm_spec.rb +46 -0
  42. data/spec/spec_helper.rb +12 -0
  43. data/spec/svwl_rule_spec.rb +14 -0
  44. data/spec/word_spec.rb +9 -0
  45. data/tasks/ann.rake +76 -0
  46. data/tasks/annotations.rake +22 -0
  47. data/tasks/doc.rake +48 -0
  48. data/tasks/gem.rake +110 -0
  49. data/tasks/homepage.rake +12 -0
  50. data/tasks/manifest.rake +49 -0
  51. data/tasks/post_load.rake +26 -0
  52. data/tasks/rubyforge.rake +57 -0
  53. data/tasks/setup.rb +227 -0
  54. data/tasks/spec.rake +54 -0
  55. data/tasks/svn.rake +44 -0
  56. data/tasks/test.rake +38 -0
  57. metadata +121 -0
@@ -0,0 +1,20 @@
1
+ require File.expand_path('../lib/rmmseg/version', __FILE__)
2
+
3
+ Gem::Specification.new do |gem|
4
+ gem.name = "plexus-rmmseg"
5
+ gem.version = RMMSeg::VERSION
6
+ gem.authors = ["pluskid"]
7
+ gem.email = "pluskid@gmail.com"
8
+ gem.date = "2008-03-16"
9
+ gem.summary = "RMMSeg is an implementation of MMSEG Chinese word segmentation algorithm"
10
+ gem.description = "RMMSeg is an implementation of MMSEG Chinese word segmentation algorithm. It is based on two variants of maximum matching algorithms. Two algorithms are available for using: * simple algorithm that uses only forward maximum matching. * complex algorithm that uses three-word chunk maximum matching and 3 additonal rules to solve ambiguities. For more information about the algorithm, please refer to the following essays: * http://technology.chtsai.org/mmseg/ * http://pluskid.lifegoo.com/?p=261"
11
+ gem.homepage = "http://rmmseg.rubyforge.org"
12
+ gem.license = 'MIT'
13
+
14
+ gem.require_paths = ["lib"]
15
+ gem.files = `git ls-files`.split($/)
16
+ gem.test_files = `git ls-files -- spec`.split($/)
17
+ gem.extra_rdoc_files = ["History.txt", "README.txt", "TODO.txt", "bin/rmmseg"]
18
+ gem.executables = ["rmmseg"]
19
+ gem.rdoc_options = ["--main", "README.txt"]
20
+ end
@@ -0,0 +1,25 @@
1
+ # -*- coding: utf-8 -*-
2
+ require File.join(File.dirname(__FILE__), 'spec_helper')
3
+ # Specs for the RMMSeg::Chunk statistics helpers, all run over one shared list of four words.
4
+ describe 'chunk' do
5
+ before(:all) do
6
+ @words = gen_words(['中文', '中国字', '我', '中华人民共和国'], # word texts of 2, 3, 1 and 7 characters
7
+ [10, 7, 100, 8]) # frequency paired with each word, in order
8
+ end
9
+ # 2 + 3 + 1 + 7 characters = 13
10
+ it "should return proper total length" do
11
+ RMMSeg::Chunk::total_length(@words).should == 13
12
+ end
13
+ # 13 characters spread over 4 words
14
+ it "should return proper average length" do
15
+ RMMSeg::Chunk::average_length(@words).should == 13.0/4
16
+ end
17
+ # Only the integer part of the result is checked.
18
+ it "should return proper variance" do
19
+ RMMSeg::Chunk::variance(@words).to_i.should == 4
20
+ end
21
+ # '我' is the only one-character word and has frequency 100 — presumably why 100 is expected; confirm against Chunk.
22
+ it "should return proper degree of morphemic freedom" do
23
+ RMMSeg::Chunk::degree_of_morphemic_freedom(@words).should == 100
24
+ end
25
+ end
@@ -0,0 +1,18 @@
1
+ # -*- coding: utf-8 -*-
2
+ require File.join(File.dirname(__FILE__), 'spec_helper')
3
+ # Specs for RMMSeg::ComplexAlgorithm#segment: segment count and first segment on two inputs.
4
+ describe "complex algorithm" do
5
+ it "should behave well as svwl rule" do
6
+ text = "研究生命科学"
7
+ segs = RMMSeg::ComplexAlgorithm.new(text).segment
8
+ segs.length.should == 3
9
+ segs[0].should == "研究" # not the longer "研究生"
10
+ end
11
+ # Longer passage: checks the total number of segments and the first segment only.
12
+ it "should segment a relative big chunk of Chinese" do
13
+ text = "主持人把一只割去头的羊放在指定处。枪响后,甲乙两队共同向羊飞驰而去,先抢到羊的同队队员互相掩护,极力向终点奔驰,双方骑手们施展各种技巧,围追堵截,拼命抢夺。叼着羊先到达终点的为胜方。获胜者按照当地的习俗,将羊当场烤熟,请众骑手共享,称为“幸福肉”。"
14
+ segs = RMMSeg::ComplexAlgorithm.new(text).segment
15
+ segs.length.should == 87
16
+ segs[0].should == "主持人"
17
+ end
18
+ end
@@ -0,0 +1,12 @@
1
+ require File.join(File.dirname(__FILE__), 'spec_helper')
2
+
3
+ describe "RMMSeg Config" do
4
+ it "should be able to store and retrive config values" do
5
+ RMMSeg::Config.algorithm = :simple
6
+ RMMSeg::Config.algorithm.should == :simple
7
+ end
8
+
9
+ it "should reject invalid algorithm" do
10
+ lambda { RMMSeg::Config.algorithm = :foobar }.should raise_error(ArgumentError)
11
+ end
12
+ end
@@ -0,0 +1,20 @@
1
+ require File.join(File.dirname(__FILE__), 'spec_helper')
2
+ # Specs for lookups on the shared RMMSeg::Dictionary instance.
3
+ describe "dictionary" do
4
+
5
+ before(:all) do
6
+ @dic = RMMSeg::Dictionary.instance # one instance reused by every example
7
+ end
8
+ # NOTE(review): 915385 is presumably the frequency stored in the dictionary data — confirm if the data changes.
9
+ it "should contain frequency information for chars" do
10
+ @dic.get_word("你").frequency.should == 915385
11
+ end
12
+ # A multi-character entry is reported as present.
13
+ it "should handle words" do
14
+ @dic.has_word?("你们").should == true
15
+ end
16
+ # An over-long phrase is reported as absent.
17
+ it "should ignore words which exceed the maximum length" do
18
+ @dic.has_word?("这是一个超出长度的词组").should == false
19
+ end
20
+ end
@@ -0,0 +1,15 @@
1
+ # -*- coding: utf-8 -*-
2
+ require File.join(File.dirname(__FILE__), 'spec_helper')
3
+ # Spec for RMMSeg::LAWLRule.filter: three ways of splitting the same three characters.
4
+ describe "largest average word length rule" do
5
+ it "should return chunks with the maximum average word length" do
6
+ chunks = [
7
+ gen_words(["国际化"]), # one word of 3 characters: average 3
8
+ gen_words(["国际", "化"]), # average 1.5
9
+ gen_words(["国", "际", "化"]) # average 1
10
+ ]
11
+ chunks = RMMSeg::LAWLRule.filter(chunks)
12
+ chunks.length.should == 1
13
+ chunks[0][0].text.should == "国际化" # the single-word chunk is the survivor
14
+ end
15
+ end
@@ -0,0 +1,14 @@
1
+ # -*- coding: utf-8 -*-
2
+ require File.join(File.dirname(__FILE__), 'spec_helper')
3
+ # Spec for RMMSeg::LSDMFOCWRule.filter: two chunks that differ in their one-character word.
4
+ describe "largest sum of degree of morphemic freedom of one-character words rule" do
5
+ it "should return chunks of the largest sum of degree of morphemic freedom of one-character words" do
6
+ chunks = [
7
+ gen_words(["主要", "是", "因为"], [nil, 100, nil]), # one-character word "是" with frequency 100
8
+ gen_words(["主", "要是", "因为"], [10, nil, nil]) # one-character word "主" with frequency 10
9
+ ]
10
+ chunks = RMMSeg::LSDMFOCWRule.filter(chunks)
11
+ chunks.length.should == 1
12
+ chunks[0][0].text.should == "主要" # first word of the chunk containing "是"
13
+ end
14
+ end
@@ -0,0 +1,15 @@
1
+ # -*- coding: utf-8 -*-
2
+ require File.join(File.dirname(__FILE__), 'spec_helper')
3
+
4
+ describe 'maximum matching rule' do
5
+ it "should select chunks with the maximun total length" do
6
+ chunks = [
7
+ gen_words(["眼看", "就要", "来了"]),
8
+ gen_words(["眼", "看", "就", "要", "来", "了"]),
9
+ gen_words(["眼看", "就要", "来"]),
10
+ gen_words(["眼", "看", "就"])
11
+ ]
12
+ chunks = RMMSeg::MMRule.filter(chunks)
13
+ chunks.length.should == 2
14
+ end
15
+ end
@@ -0,0 +1,46 @@
1
+ # -*- coding: utf-8 -*-
2
+ require File.join(File.dirname(__FILE__), 'spec_helper')
3
+ # Specs for RMMSeg::SimpleAlgorithm: segmentation results and the start/end offsets of tokens.
4
+ describe "simple algorithm" do
5
+ it "should handle simple cases" do
6
+ text = "我们都喜欢用 Ruby"
7
+ segs = RMMSeg::SimpleAlgorithm.new(text).segment
8
+ segs.length.should == 5
9
+ segs[0].should == "我们"
10
+ end
11
+ # Documents a limitation: the first segment comes out as "研究生" rather than "研究".
12
+ it "shouldn't be able to handle some case" do
13
+ text = "研究生命起源"
14
+ segs = RMMSeg::SimpleAlgorithm.new(text).segment
15
+ segs.length.should == 3
16
+ segs[0].should_not == "研究"
17
+ segs[0].should == "研究生"
18
+ end
19
+ # English-only input: six segments are expected, the first being "This".
20
+ it "should handle pure English as well" do
21
+ text = "This is a paragraph of English."
22
+ segs = RMMSeg::SimpleAlgorithm.new(text).segment
23
+ segs.length.should == 6
24
+ segs[0].should == "This"
25
+ end
26
+ # "paragraph" follows the 10 bytes of "This is a " and is 9 bytes long, hence 10 and 19.
27
+ it "should handle byte positions of English well" do
28
+ text = "This is a paragraph of English."
29
+ algor = RMMSeg::SimpleAlgorithm.new(text)
30
+ 3.times { algor.next_token } # skip the three tokens before "paragraph"
31
+ token = algor.next_token
32
+ token.text.should == "paragraph"
33
+ token.start.should == 10
34
+ token.end.should == 19
35
+ end
36
+ # The text has only 6 characters, so the expected 12 and 18 are byte offsets, not character offsets.
37
+ it "should handle byte positions of Chinese well" do
38
+ text = "这是一句中文"
39
+ algor = RMMSeg::SimpleAlgorithm.new(text)
40
+ 2.times { algor.next_token } # skip the two tokens before "中文"
41
+ token = algor.next_token
42
+ token.text.should == "中文"
43
+ token.start.should == 12
44
+ token.end.should == 18
45
+ end
46
+ end
@@ -0,0 +1,12 @@
1
+ $: << File.join(File.dirname(__FILE__), "../lib") # put the sibling lib directory on the load path
2
+ require 'rmmseg'
3
+ # Builds one RMMSeg::Word per text; when freqs is given, each text is paired with the frequency at the same index.
4
+ def gen_words words, freqs=nil
5
+ if freqs.nil?
6
+ words.map { |word| RMMSeg::Word.new(word) } # text only; remaining constructor arguments left at their defaults
7
+ else
8
+ words.zip(freqs).map { |word, freq|
9
+ RMMSeg::Word.new(word, RMMSeg::Word::TYPES[:cjk_word], freq) # always typed as :cjk_word
10
+ }
11
+ end
12
+ end
@@ -0,0 +1,14 @@
1
+ # -*- coding: utf-8 -*-
2
+ require File.join(File.dirname(__FILE__), 'spec_helper')
3
+ # Spec for RMMSeg::SVWLRule.filter: two splits of the same six characters.
4
+ describe "smallest variance of word length rule" do
5
+ it "should return chunks with the smallest word length variance" do
6
+ chunks = [
7
+ gen_words(["研究", "生命", "起源"]), # word lengths 2, 2, 2
8
+ gen_words(["研究生", "命", "起源"]) # word lengths 3, 1, 2
9
+ ]
10
+ chunks = RMMSeg::SVWLRule.filter(chunks)
11
+ chunks.length.should == 1
12
+ chunks[0][0].text.should == "研究" # first word of the evenly split chunk
13
+ end
14
+ end
@@ -0,0 +1,9 @@
1
+ # -*- coding: utf-8 -*-
2
+ require File.join(File.dirname(__FILE__), 'spec_helper')
3
+ # Spec for RMMSeg::Word#length on CJK text.
4
+ describe 'word' do
5
+ it "should return proper length on CJK words" do
6
+ w = RMMSeg::Word.new('中文')
7
+ w.length.should == 2 # two characters expected for the two-character text
8
+ end
9
+ end
@@ -0,0 +1,76 @@
1
+ # $Id$
2
+ # Rake tasks that generate a release announcement file and send it by e-mail.
3
+ begin
4
+ require 'bones/smtp_tls'
5
+ rescue LoadError
6
+ require 'net/smtp'
7
+ end
8
+ require 'time'
9
+
10
+ namespace :ann do
11
+ # File task: writes a header, the description, the changes and selected README paragraphs.
12
+ file PROJ.ann_file do
13
+ puts "Generating #{PROJ.ann_file}"
14
+ File.open(PROJ.ann_file,'w') do |fd|
15
+ fd.puts("#{PROJ.name} version #{PROJ.version}")
16
+ fd.puts(" by #{Array(PROJ.authors).first}") if PROJ.authors
17
+ fd.puts(" #{PROJ.url}") if PROJ.url
18
+ fd.puts(" (the \"#{PROJ.release_name}\" release)") if PROJ.release_name
19
+ fd.puts
20
+ fd.puts("== DESCRIPTION")
21
+ fd.puts
22
+ fd.puts(PROJ.description)
23
+ fd.puts
24
+ fd.puts(PROJ.changes.sub(%r/^.*$/, '== CHANGES')) # first line of the changes text becomes the heading
25
+ fd.puts
26
+ PROJ.ann_paragraphs.each do |p|
27
+ fd.puts "== #{p.upcase}"
28
+ fd.puts
29
+ fd.puts paragraphs_of(PROJ.readme_file, p).join("\n\n") # paragraphs_of is defined outside this file
30
+ fd.puts
31
+ end
32
+ fd.puts PROJ.ann_text if PROJ.ann_text
33
+ end
34
+ end
35
+
36
+ desc "Create an announcement file"
37
+ task :announcement => PROJ.ann_file
38
+
39
+ desc "Send an email announcement"
40
+ task :email => PROJ.ann_file do
41
+ from = PROJ.ann_email[:from] || PROJ.email
42
+ to = Array(PROJ.ann_email[:to])
43
+
44
+ ### build a mail header for RFC 822
45
+ rfc822msg = "From: #{from}\n"
46
+ rfc822msg << "To: #{to.join(',')}\n"
47
+ rfc822msg << "Subject: [ANN] #{PROJ.name} #{PROJ.version}"
48
+ rfc822msg << " (#{PROJ.release_name})" if PROJ.release_name
49
+ rfc822msg << "\n"
50
+ rfc822msg << "Date: #{Time.new.rfc822}\n"
51
+ rfc822msg << "Message-Id: "
52
+ rfc822msg << "<#{"%.8f" % Time.now.to_f}@#{PROJ.ann_email[:domain]}>\n\n" # id is the current time plus the configured domain
53
+ rfc822msg << File.read(PROJ.ann_file) # message body is the generated announcement
54
+ # Collected in this order because the array is splatted into Net::SMTP.start below.
55
+ params = [:server, :port, :domain, :acct, :passwd, :authtype].map do |key|
56
+ PROJ.ann_email[key]
57
+ end
58
+ # params[3] is :acct; fall back to the project e-mail address when it is not configured.
59
+ params[3] = PROJ.email if params[3].nil?
60
+ # params[4] is :passwd; when not configured it is read from standard input with a plain gets.
61
+ if params[4].nil?
62
+ STDOUT.write "Please enter your e-mail password (#{params[3]}): "
63
+ params[4] = STDIN.gets.chomp
64
+ end
65
+
66
+ ### send email
67
+ Net::SMTP.start(*params) {|smtp| smtp.sendmail(rfc822msg, from, to)}
68
+ end
69
+ end # namespace :ann
70
+
71
+ desc 'Alias to ann:announcement'
72
+ task :ann => 'ann:announcement'
73
+ # Add the announcement file to the CLOBBER list (defined outside this file).
74
+ CLOBBER << PROJ.ann_file
75
+
76
+ # EOF
@@ -0,0 +1,22 @@
1
+ # $Id$
2
+ # Rake tasks that enumerate source annotations; defined only when HAVE_BONES is truthy.
3
+ if HAVE_BONES
4
+ # `notes` enumerates every configured tag in one pass, with the :tag option enabled.
5
+ desc "Enumerate all annotations"
6
+ task :notes do
7
+ Bones::AnnotationExtractor.enumerate(
8
+ PROJ, PROJ.annotation_tags.join('|'), :tag => true) # tags joined with '|' into a single pattern string
9
+ end
10
+ # One `notes:<tag>` task per configured tag, named after the lower-cased tag.
11
+ namespace :notes do
12
+ PROJ.annotation_tags.each do |tag|
13
+ desc "Enumerate all #{tag} annotations"
14
+ task tag.downcase.to_sym do
15
+ Bones::AnnotationExtractor.enumerate(PROJ, tag)
16
+ end
17
+ end
18
+ end
19
+
20
+ end # if HAVE_BONES
21
+
22
+ # EOF
@@ -0,0 +1,48 @@
1
+ # $Id$
2
+
3
+ require 'rake/rdoctask'
4
+
5
+ namespace :doc do
6
+
7
+ desc 'Generate RDoc documentation'
8
+ Rake::RDocTask.new do |rd|
9
+ rd.main = PROJ.rdoc_main
10
+ rd.rdoc_dir = PROJ.rdoc_dir
11
+
12
+ incl = Regexp.new(PROJ.rdoc_include.join('|'))
13
+ excl = Regexp.new(PROJ.rdoc_exclude.join('|'))
14
+ files = PROJ.files.find_all do |fn|
15
+ case fn
16
+ when excl; false
17
+ when incl; true
18
+ else false end
19
+ end
20
+ rd.rdoc_files.push(*files)
21
+
22
+ title = "#{PROJ.name}-#{PROJ.version} Documentation"
23
+ title = "#{PROJ.rubyforge_name}'s " + title if PROJ.rubyforge_name != title
24
+
25
+ rd.options << "-t #{title}"
26
+ rd.options.concat(PROJ.rdoc_opts)
27
+ end
28
+
29
+ desc 'Generate ri locally for testing'
30
+ task :ri => :clobber_ri do
31
+ sh "#{RDOC} --ri -o ri ."
32
+ end
33
+
34
+ task :clobber_ri do
35
+ rm_r 'ri' rescue nil
36
+ end
37
+
38
+ end # namespace :doc
39
+
40
+ desc 'Alias to doc:rdoc'
41
+ task :doc => 'doc:rdoc'
42
+
43
+ desc 'Remove all build products'
44
+ task :clobber => %w(doc:clobber_rdoc doc:clobber_ri)
45
+
46
+ remove_desc_for_task %w(doc:clobber_rdoc)
47
+
48
+ # EOF
@@ -0,0 +1,110 @@
1
+ # $Id$
2
+ # Rake tasks for describing, building, installing and uninstalling the project's gem.
3
+ require 'rake/gempackagetask'
4
+
5
+ namespace :gem do
6
+ # Build the gem specification from the PROJ settings and store it back on PROJ.spec.
7
+ PROJ.spec = Gem::Specification.new do |s|
8
+ s.name = PROJ.name
9
+ s.version = PROJ.version
10
+ s.summary = PROJ.summary
11
+ s.authors = Array(PROJ.authors)
12
+ s.email = PROJ.email
13
+ s.homepage = Array(PROJ.url).first
14
+ s.rubyforge_project = PROJ.rubyforge_name
15
+ s.post_install_message = PROJ.post_install_message
16
+
17
+ s.description = PROJ.description
18
+
19
+ PROJ.dependencies.each do |dep|
20
+ s.add_dependency(*dep)
21
+ end
22
+
23
+ s.files = PROJ.files
24
+ s.executables = PROJ.executables.map {|fn| File.basename(fn)}
25
+ s.extensions = PROJ.files.grep %r/extconf\.rb$/ # files whose name ends in extconf.rb
26
+
27
+ s.bindir = 'bin'
28
+ dirs = Dir["{#{PROJ.libs.join(',')}}"] # only the configured lib directories that exist
29
+ s.require_paths = dirs unless dirs.empty?
30
+ # Extra RDoc files: included patterns minus the configured excludes, .rb files and paths starting with ext.
31
+ incl = Regexp.new(PROJ.rdoc_include.join('|'))
32
+ excl = PROJ.rdoc_exclude.dup.concat %w[\.rb$ ^(\.\/|\/)?ext]
33
+ excl = Regexp.new(excl.join('|'))
34
+ rdoc_files = PROJ.files.find_all do |fn|
35
+ case fn
36
+ when excl; false
37
+ when incl; true
38
+ else false end
39
+ end
40
+ s.rdoc_options = PROJ.rdoc_opts + ['--main', PROJ.rdoc_main]
41
+ s.extra_rdoc_files = rdoc_files
42
+ s.has_rdoc = true
43
+ # Use the single test file when it exists as a regular file, otherwise the whole test list.
44
+ if test ?f, PROJ.test_file
45
+ s.test_file = PROJ.test_file
46
+ else
47
+ s.test_files = PROJ.tests.to_a
48
+ end
49
+
50
+ # Do any extra stuff the user wants
51
+ # spec_extras.each do |msg, val|
52
+ # case val
53
+ # when Proc
54
+ # val.call(s.send(msg))
55
+ # else
56
+ # s.send "#{msg}=", val
57
+ # end
58
+ # end
59
+ end
60
+
61
+ desc 'Show information about the gem'
62
+ task :debug do
63
+ puts PROJ.spec.to_ruby
64
+ end
65
+ # Package task over the spec's files; the returned task object is reused below to name the gem file.
66
+ pkg = Rake::PackageTask.new(PROJ.name, PROJ.version) do |pkg|
67
+ pkg.need_tar = PROJ.need_tar
68
+ pkg.need_zip = PROJ.need_zip
69
+ pkg.package_files += PROJ.spec.files
70
+ end
71
+ Rake::Task['gem:package'].instance_variable_set(:@full_comment, nil) # blanks the stored comment; presumably so the desc below is the only one — confirm
72
+ # Gems for a non-Ruby platform carry the platform in their file name.
73
+ gem_file = if PROJ.spec.platform == Gem::Platform::RUBY
74
+ "#{pkg.package_name}.gem"
75
+ else
76
+ "#{pkg.package_name}-#{PROJ.spec.platform}.gem"
77
+ end
78
+
79
+ desc "Build the gem file #{gem_file}"
80
+ task :package => "#{pkg.package_dir}/#{gem_file}"
81
+ # Build the gem, then move the resulting file into the package directory.
82
+ file "#{pkg.package_dir}/#{gem_file}" => [pkg.package_dir] + PROJ.spec.files do
83
+ when_writing("Creating GEM") {
84
+ Gem::Builder.new(PROJ.spec).build
85
+ verbose(true) {
86
+ mv gem_file, "#{pkg.package_dir}/#{gem_file}"
87
+ }
88
+ }
89
+ end
90
+ # NOTE(review): the install path hard-codes pkg/ and has no .gem suffix, unlike gem_file above — confirm it resolves.
91
+ desc 'Install the gem'
92
+ task :install => [:clobber, :package] do
93
+ sh "#{SUDO} #{GEM} install pkg/#{PROJ.spec.full_name}"
94
+ end
95
+
96
+ desc 'Uninstall the gem'
97
+ task :uninstall do
98
+ sh "#{SUDO} #{GEM} uninstall -v '#{PROJ.version}' -x #{PROJ.name}"
99
+ end
100
+
101
+ end # namespace :gem
102
+
103
+ desc 'Alias to gem:package'
104
+ task :gem => 'gem:package'
105
+
106
+ task :clobber => 'gem:clobber_package'
107
+ # remove_desc_for_task is defined outside this file; presumably it hides the task's description — confirm.
108
+ remove_desc_for_task %w(gem:clobber_package)
109
+
110
+ # EOF