plexus-rmmseg 0.1.6

Sign up to get free protection for your applications and to get access to all the features.
Files changed (57) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +1 -0
  3. data/History.txt +42 -0
  4. data/Manifest.txt +51 -0
  5. data/README.txt +74 -0
  6. data/Rakefile +12 -0
  7. data/TODO.txt +5 -0
  8. data/bin/rmmseg +65 -0
  9. data/data/chars.dic +12638 -0
  10. data/data/custom.dic +12 -0
  11. data/data/punctuation.dic +79 -0
  12. data/data/words.dic +120330 -0
  13. data/lib/rmmseg.rb +13 -0
  14. data/lib/rmmseg/algorithm.rb +136 -0
  15. data/lib/rmmseg/amibguity.rb +4 -0
  16. data/lib/rmmseg/chunk.rb +41 -0
  17. data/lib/rmmseg/complex_algorithm.rb +122 -0
  18. data/lib/rmmseg/config.rb +65 -0
  19. data/lib/rmmseg/dictionary.rb +80 -0
  20. data/lib/rmmseg/ferret.rb +109 -0
  21. data/lib/rmmseg/lawl_rule.rb +12 -0
  22. data/lib/rmmseg/lsdmfocw_rule.rb +13 -0
  23. data/lib/rmmseg/mm_rule.rb +13 -0
  24. data/lib/rmmseg/rule_helper.rb +28 -0
  25. data/lib/rmmseg/simple_algorithm.rb +37 -0
  26. data/lib/rmmseg/svwl_rule.rb +12 -0
  27. data/lib/rmmseg/token.rb +30 -0
  28. data/lib/rmmseg/version.rb +3 -0
  29. data/lib/rmmseg/word.rb +38 -0
  30. data/misc/ferret_example.rb +56 -0
  31. data/misc/homepage.erb +170 -0
  32. data/misc/homepage.html +1214 -0
  33. data/plexus-rmmseg.gemspec +20 -0
  34. data/spec/chunk_spec.rb +25 -0
  35. data/spec/complex_algorithm_spec.rb +18 -0
  36. data/spec/config_spec.rb +12 -0
  37. data/spec/dictionary_spec.rb +20 -0
  38. data/spec/lawl_rule_spec.rb +15 -0
  39. data/spec/lsdmfocw_rule_spec.rb +14 -0
  40. data/spec/mm_rule_spec.rb +15 -0
  41. data/spec/simple_algorithm_spec.rb +46 -0
  42. data/spec/spec_helper.rb +12 -0
  43. data/spec/svwl_rule_spec.rb +14 -0
  44. data/spec/word_spec.rb +9 -0
  45. data/tasks/ann.rake +76 -0
  46. data/tasks/annotations.rake +22 -0
  47. data/tasks/doc.rake +48 -0
  48. data/tasks/gem.rake +110 -0
  49. data/tasks/homepage.rake +12 -0
  50. data/tasks/manifest.rake +49 -0
  51. data/tasks/post_load.rake +26 -0
  52. data/tasks/rubyforge.rake +57 -0
  53. data/tasks/setup.rb +227 -0
  54. data/tasks/spec.rake +54 -0
  55. data/tasks/svn.rake +44 -0
  56. data/tasks/test.rake +38 -0
  57. metadata +121 -0
@@ -0,0 +1,20 @@
require File.expand_path('../lib/rmmseg/version', __FILE__)

# Gem specification for plexus-rmmseg.
#
# The packaged file lists are produced by shelling out to `git ls-files`,
# so they reflect whatever git reports for the current working directory.
Gem::Specification.new do |gem|
  gem.name = "plexus-rmmseg"
  gem.version = RMMSeg::VERSION
  gem.authors = ["pluskid"]
  gem.email = "pluskid@gmail.com"
  gem.date = "2008-03-16"
  gem.summary = "RMMSeg is an implementation of MMSEG Chinese word segmentation algorithm"
  gem.description = "RMMSeg is an implementation of MMSEG Chinese word segmentation algorithm. It is based on two variants of maximum matching algorithms. Two algorithms are available for using: * simple algorithm that uses only forward maximum matching. * complex algorithm that uses three-word chunk maximum matching and 3 additional rules to solve ambiguities. For more information about the algorithm, please refer to the following essays: * http://technology.chtsai.org/mmseg/ * http://pluskid.lifegoo.com/?p=261"
  gem.homepage = "http://rmmseg.rubyforge.org"
  gem.license = 'MIT'

  gem.require_paths = ["lib"]
  # Split on a literal newline rather than the global `$/`, which other code
  # (or the -0 command-line switch) can reassign.
  gem.files = `git ls-files`.split("\n")
  gem.test_files = `git ls-files -- spec`.split("\n")
  gem.extra_rdoc_files = ["History.txt", "README.txt", "TODO.txt", "bin/rmmseg"]
  gem.executables = ["rmmseg"]
  gem.rdoc_options = ["--main", "README.txt"]
end
@@ -0,0 +1,25 @@
1
+ # -*- coding: utf-8 -*-
2
+ require File.join(File.dirname(__FILE__), 'spec_helper')
3
+
# Exercises the statistics helpers exposed on RMMSeg::Chunk against one fixed
# list of four words created with explicit frequencies.
describe 'chunk' do
  before(:all) do
    texts = ['中文', '中国字', '我', '中华人民共和国']
    frequencies = [10, 7, 100, 8]
    @words = gen_words(texts, frequencies)
  end

  it "should return proper total length" do
    total = RMMSeg::Chunk.total_length(@words)
    total.should == 13
  end

  it "should return proper average length" do
    expected = 13.0 / 4
    RMMSeg::Chunk.average_length(@words).should == expected
  end

  it "should return proper variance" do
    # Only the integer part of the variance is checked.
    truncated = RMMSeg::Chunk.variance(@words).to_i
    truncated.should == 4
  end

  it "should return proper degree of morphemic freedom" do
    freedom = RMMSeg::Chunk.degree_of_morphemic_freedom(@words)
    freedom.should == 100
  end
end
@@ -0,0 +1,18 @@
1
+ # -*- coding: utf-8 -*-
2
+ require File.join(File.dirname(__FILE__), 'spec_helper')
3
+
# Segmentation checks for RMMSeg::ComplexAlgorithm: a short ambiguous phrase
# and a longer paragraph, each verified by segment count and first segment.
describe "complex algorithm" do
  it "should behave well as svwl rule" do
    segments = RMMSeg::ComplexAlgorithm.new("研究生命科学").segment
    segments.length.should == 3
    segments[0].should == "研究"
  end

  it "should segment a relative big chunk of Chinese" do
    paragraph = "主持人把一只割去头的羊放在指定处。枪响后,甲乙两队共同向羊飞驰而去,先抢到羊的同队队员互相掩护,极力向终点奔驰,双方骑手们施展各种技巧,围追堵截,拼命抢夺。叼着羊先到达终点的为胜方。获胜者按照当地的习俗,将羊当场烤熟,请众骑手共享,称为“幸福肉”。"
    segments = RMMSeg::ComplexAlgorithm.new(paragraph).segment
    segments.length.should == 87
    segments[0].should == "主持人"
  end
end
@@ -0,0 +1,12 @@
1
+ require File.join(File.dirname(__FILE__), 'spec_helper')
2
+
# Specs for the RMMSeg::Config.algorithm accessor pair.
#
# The setter is called on RMMSeg::Config itself, so the value outlives a
# single example. The hooks below put back whatever was configured before
# each example so the :simple setting does not leak into later specs.
describe "RMMSeg Config" do
  before(:each) do
    @saved_algorithm = RMMSeg::Config.algorithm
  end

  after(:each) do
    # NOTE(review): assumes the getter returns a value the setter accepts;
    # a nil (never configured) value is skipped rather than written back.
    RMMSeg::Config.algorithm = @saved_algorithm unless @saved_algorithm.nil?
  end

  it "should be able to store and retrieve config values" do
    RMMSeg::Config.algorithm = :simple
    RMMSeg::Config.algorithm.should == :simple
  end

  it "should reject invalid algorithm" do
    lambda { RMMSeg::Config.algorithm = :foobar }.should raise_error(ArgumentError)
  end
end
@@ -0,0 +1,20 @@
# -*- coding: utf-8 -*-
require File.join(File.dirname(__FILE__), 'spec_helper')

# Lookup specs against the shared RMMSeg::Dictionary instance.
#
# The coding line above matches the other spec files that contain Chinese
# string literals; without it, Ruby versions that default source files to
# US-ASCII reject the literals below.
describe "dictionary" do

  before(:all) do
    @dic = RMMSeg::Dictionary.instance
  end

  it "should contain frequency information for chars" do
    @dic.get_word("你").frequency.should == 915385
  end

  it "should handle words" do
    @dic.has_word?("你们").should == true
  end

  it "should ignore words which exceed the maximum length" do
    @dic.has_word?("这是一个超出长度的词组").should == false
  end
end
@@ -0,0 +1,15 @@
1
+ # -*- coding: utf-8 -*-
2
+ require File.join(File.dirname(__FILE__), 'spec_helper')
3
+
# RMMSeg::LAWLRule.filter is given three candidate chunks for the same text
# and is expected to keep only the single-word chunk.
describe "largest average word length rule" do
  it "should return chunks with the maximum average word length" do
    candidates = [
      ["国际化"],
      ["国际", "化"],
      ["国", "际", "化"]
    ].map { |texts| gen_words(texts) }

    survivors = RMMSeg::LAWLRule.filter(candidates)

    survivors.length.should == 1
    survivors[0][0].text.should == "国际化"
  end
end
@@ -0,0 +1,14 @@
1
+ # -*- coding: utf-8 -*-
2
+ require File.join(File.dirname(__FILE__), 'spec_helper')
3
+
# RMMSeg::LSDMFOCWRule.filter is given two candidate chunks whose
# one-character words carry different frequencies (100 versus 10) and is
# expected to keep only the first chunk.
describe "largest sum of degree of morphemic freedom of one-character words rule" do
  it "should return chunks of the largest sum of degree of morphemic freedom of one-character words" do
    first_candidate = gen_words(["主要", "是", "因为"], [nil, 100, nil])
    second_candidate = gen_words(["主", "要是", "因为"], [10, nil, nil])

    survivors = RMMSeg::LSDMFOCWRule.filter([first_candidate, second_candidate])

    survivors.length.should == 1
    survivors[0][0].text.should == "主要"
  end
end
@@ -0,0 +1,15 @@
1
+ # -*- coding: utf-8 -*-
2
+ require File.join(File.dirname(__FILE__), 'spec_helper')
3
+
# RMMSeg::MMRule.filter is given four candidate chunks; two of them cover
# six characters and the other two cover fewer, and exactly two chunks are
# expected to survive.
describe 'maximum matching rule' do
  it "should select chunks with the maximum total length" do
    chunks = [
      gen_words(["眼看", "就要", "来了"]),
      gen_words(["眼", "看", "就", "要", "来", "了"]),
      gen_words(["眼看", "就要", "来"]),
      gen_words(["眼", "看", "就"])
    ]
    chunks = RMMSeg::MMRule.filter(chunks)
    chunks.length.should == 2
  end
end
@@ -0,0 +1,46 @@
1
+ # -*- coding: utf-8 -*-
2
+ require File.join(File.dirname(__FILE__), 'spec_helper')
3
+
# Specs for RMMSeg::SimpleAlgorithm: whole-text segmentation through
# #segment, and token-by-token iteration through #next_token with checks on
# the reported start/end offsets.
describe "simple algorithm" do
  it "should handle simple cases" do
    segments = RMMSeg::SimpleAlgorithm.new("我们都喜欢用 Ruby").segment
    segments.length.should == 5
    segments[0].should == "我们"
  end

  it "shouldn't be able to handle some case" do
    segments = RMMSeg::SimpleAlgorithm.new("研究生命起源").segment
    segments.length.should == 3
    # The simple algorithm is expected to take the longer first word here.
    segments[0].should_not == "研究"
    segments[0].should == "研究生"
  end

  it "should handle pure English as well" do
    segments = RMMSeg::SimpleAlgorithm.new("This is a paragraph of English.").segment
    segments.length.should == 6
    segments[0].should == "This"
  end

  it "should handle byte positions of English well" do
    tokenizer = RMMSeg::SimpleAlgorithm.new("This is a paragraph of English.")
    # Discard the first three tokens; the fourth one is inspected.
    3.times { tokenizer.next_token }
    fourth = tokenizer.next_token
    fourth.text.should == "paragraph"
    fourth.start.should == 10
    fourth.end.should == 19
  end

  it "should handle byte positions of Chinese well" do
    tokenizer = RMMSeg::SimpleAlgorithm.new("这是一句中文")
    # Discard the first two tokens; the third one is inspected.
    2.times { tokenizer.next_token }
    third = tokenizer.next_token
    third.text.should == "中文"
    third.start.should == 12
    third.end.should == 18
  end
end
@@ -0,0 +1,12 @@
1
+ $: << File.join(File.dirname(__FILE__), "../lib")
2
+ require 'rmmseg'
3
+
# Builds RMMSeg::Word fixtures for the specs.
#
# words - list of word texts.
# freqs - optional list of frequencies, paired with +words+ by position.
#         When given, every word is created with the :cjk_word type and its
#         paired frequency; when omitted (nil), each word is created from its
#         text alone.
#
# Returns an Array with one RMMSeg::Word per entry of +words+.
def gen_words(words, freqs = nil)
  return words.map { |text| RMMSeg::Word.new(text) } if freqs.nil?

  words.zip(freqs).map do |text, frequency|
    RMMSeg::Word.new(text, RMMSeg::Word::TYPES[:cjk_word], frequency)
  end
end
@@ -0,0 +1,14 @@
1
+ # -*- coding: utf-8 -*-
2
+ require File.join(File.dirname(__FILE__), 'spec_helper')
3
+
# RMMSeg::SVWLRule.filter is given two candidate chunks for the same text,
# one with evenly sized words and one without, and is expected to keep only
# the evenly sized one.
describe "smallest variance of word length rule" do
  it "should return chunks with the smallest word length variance" do
    even_chunk = gen_words(["研究", "生命", "起源"])
    uneven_chunk = gen_words(["研究生", "命", "起源"])

    survivors = RMMSeg::SVWLRule.filter([even_chunk, uneven_chunk])

    survivors.length.should == 1
    survivors[0][0].text.should == "研究"
  end
end
@@ -0,0 +1,9 @@
1
+ # -*- coding: utf-8 -*-
2
+ require File.join(File.dirname(__FILE__), 'spec_helper')
3
+
# RMMSeg::Word#length is expected to report 2 for a two-character CJK word.
describe 'word' do
  it "should return proper length on CJK words" do
    word = RMMSeg::Word.new('中文')
    word.length.should == 2
  end
end
@@ -0,0 +1,76 @@
1
+ # $Id$
2
+
3
+ begin
4
+ require 'bones/smtp_tls'
5
+ rescue LoadError
6
+ require 'net/smtp'
7
+ end
8
+ require 'time'
9
+
namespace :ann do

  # File task that writes the announcement text: a header built from the
  # project settings, the description, the changes, the selected README
  # paragraphs and an optional trailing text.
  file PROJ.ann_file do
    puts "Generating #{PROJ.ann_file}"
    File.open(PROJ.ann_file, 'w') do |out|
      out.puts("#{PROJ.name} version #{PROJ.version}")
      out.puts(" by #{Array(PROJ.authors).first}") if PROJ.authors
      out.puts(" #{PROJ.url}") if PROJ.url
      out.puts(" (the \"#{PROJ.release_name}\" release)") if PROJ.release_name
      out.puts
      out.puts("== DESCRIPTION")
      out.puts
      out.puts(PROJ.description)
      out.puts
      # The first line matched in the changes text is replaced by a heading.
      out.puts(PROJ.changes.sub(%r/^.*$/, '== CHANGES'))
      out.puts
      PROJ.ann_paragraphs.each do |section|
        out.puts "== #{section.upcase}"
        out.puts
        out.puts paragraphs_of(PROJ.readme_file, section).join("\n\n")
        out.puts
      end
      out.puts PROJ.ann_text if PROJ.ann_text
    end
  end

  desc "Create an announcement file"
  task :announcement => PROJ.ann_file

  desc "Send an email announcement"
  task :email => PROJ.ann_file do
    from = PROJ.ann_email[:from] || PROJ.email
    to = Array(PROJ.ann_email[:to])

    ### build a mail header for RFC 822
    subject = "[ANN] #{PROJ.name} #{PROJ.version}"
    subject += " (#{PROJ.release_name})" if PROJ.release_name
    date = Time.new.rfc822
    message_id = "<#{"%.8f" % Time.now.to_f}@#{PROJ.ann_email[:domain]}>"

    headers = [
      "From: #{from}",
      "To: #{to.join(',')}",
      "Subject: #{subject}",
      "Date: #{date}",
      "Message-Id: #{message_id}"
    ]
    # One blank line separates the headers from the announcement body.
    rfc822msg = headers.join("\n") + "\n\n" + File.read(PROJ.ann_file)

    # Connection settings, in the positional order passed to Net::SMTP.start.
    server, port, domain, acct, passwd, authtype =
      [:server, :port, :domain, :acct, :passwd, :authtype].map do |key|
        PROJ.ann_email[key]
      end

    acct = PROJ.email if acct.nil?

    if passwd.nil?
      STDOUT.write "Please enter your e-mail password (#{acct}): "
      passwd = STDIN.gets.chomp
    end

    ### send email
    Net::SMTP.start(server, port, domain, acct, passwd, authtype) do |smtp|
      smtp.sendmail(rfc822msg, from, to)
    end
  end
end # namespace :ann

desc 'Alias to ann:announcement'
task :ann => 'ann:announcement'

CLOBBER << PROJ.ann_file
75
+
76
+ # EOF
@@ -0,0 +1,22 @@
1
+ # $Id$
2
+
# The annotation tasks are only defined when HAVE_BONES is truthy, because
# they call Bones::AnnotationExtractor.
if HAVE_BONES

  desc "Enumerate all annotations"
  task :notes do
    # All configured tags are joined with '|' and passed as one pattern.
    pattern = PROJ.annotation_tags.join('|')
    Bones::AnnotationExtractor.enumerate(PROJ, pattern, :tag => true)
  end

  namespace :notes do
    # One task per configured tag, named after the lower-cased tag.
    PROJ.annotation_tags.each do |label|
      desc "Enumerate all #{label} annotations"
      task(label.downcase.to_sym) do
        Bones::AnnotationExtractor.enumerate(PROJ, label)
      end
    end
  end

end # if HAVE_BONES
21
+
22
+ # EOF
@@ -0,0 +1,48 @@
1
+ # $Id$
2
+
3
+ require 'rake/rdoctask'
4
+
namespace :doc do

  desc 'Generate RDoc documentation'
  Rake::RDocTask.new do |rd|
    rd.main = PROJ.rdoc_main
    rd.rdoc_dir = PROJ.rdoc_dir

    # rdoc_include / rdoc_exclude hold regexp source strings. An empty exclude
    # list must mean "exclude nothing": Regexp.new('') matches every file name
    # and would otherwise drop all files.
    incl = Regexp.new(PROJ.rdoc_include.join('|'))
    excl = PROJ.rdoc_exclude.empty? ? nil : Regexp.new(PROJ.rdoc_exclude.join('|'))
    files = PROJ.files.find_all do |fn|
      next false if excl && excl === fn
      incl === fn
    end
    rd.rdoc_files.push(*files)

    # Prefix the rubyforge project name only when it differs from the project
    # name. The previous check compared it with the full title string, which
    # made the condition effectively always true.
    title = "#{PROJ.name}-#{PROJ.version} Documentation"
    title = "#{PROJ.rubyforge_name}'s " + title if PROJ.rubyforge_name != PROJ.name

    # Hand the title to the task itself instead of packing "-t <title>" into
    # one option string, so a title containing spaces stays a single value.
    rd.title = title
    rd.options.concat(PROJ.rdoc_opts)
  end

  desc 'Generate ri locally for testing'
  task :ri => :clobber_ri do
    sh "#{RDOC} --ri -o ri ."
  end

  task :clobber_ri do
    # rm_rf tolerates a missing directory, replacing `rm_r ... rescue nil`.
    rm_rf 'ri'
  end

end # namespace :doc

desc 'Alias to doc:rdoc'
task :doc => 'doc:rdoc'

desc 'Remove all build products'
task :clobber => %w(doc:clobber_rdoc doc:clobber_ri)

remove_desc_for_task %w(doc:clobber_rdoc)
47
+
48
+ # EOF
@@ -0,0 +1,110 @@
1
+ # $Id$
2
+
3
+ require 'rake/gempackagetask'
4
+
namespace :gem do

  # Build the gem specification from the project settings and store it on
  # PROJ, where the tasks below read it back.
  PROJ.spec = Gem::Specification.new do |s|
    s.name = PROJ.name
    s.version = PROJ.version
    s.summary = PROJ.summary
    s.authors = Array(PROJ.authors)
    s.email = PROJ.email
    s.homepage = Array(PROJ.url).first
    s.rubyforge_project = PROJ.rubyforge_name
    s.post_install_message = PROJ.post_install_message

    s.description = PROJ.description

    PROJ.dependencies.each do |dep|
      s.add_dependency(*dep)
    end

    s.files = PROJ.files
    s.executables = PROJ.executables.map {|fn| File.basename(fn)}
    s.extensions = PROJ.files.grep %r/extconf\.rb$/

    s.bindir = 'bin'
    # Only library directories that exist on disk become require paths.
    dirs = Dir["{#{PROJ.libs.join(',')}}"]
    s.require_paths = dirs unless dirs.empty?

    # Extra rdoc files: anything matching the include patterns, minus the
    # exclude patterns, Ruby sources and files under ext.
    incl = Regexp.new(PROJ.rdoc_include.join('|'))
    excl = PROJ.rdoc_exclude.dup.concat %w[\.rb$ ^(\.\/|\/)?ext]
    excl = Regexp.new(excl.join('|'))
    rdoc_files = PROJ.files.find_all do |fn|
      case fn
      when excl; false
      when incl; true
      else false end
    end
    s.rdoc_options = PROJ.rdoc_opts + ['--main', PROJ.rdoc_main]
    s.extra_rdoc_files = rdoc_files
    s.has_rdoc = true

    # Use the single test file when it exists, otherwise the test file list.
    if test ?f, PROJ.test_file
      s.test_file = PROJ.test_file
    else
      s.test_files = PROJ.tests.to_a
    end

    # Do any extra stuff the user wants
    # spec_extras.each do |msg, val|
    #   case val
    #   when Proc
    #     val.call(s.send(msg))
    #   else
    #     s.send "#{msg}=", val
    #   end
    # end
  end

  desc 'Show information about the gem'
  task :debug do
    puts PROJ.spec.to_ruby
  end

  # The block parameter has its own name so it does not shadow the outer
  # `pkg` local that receives the task object.
  pkg = Rake::PackageTask.new(PROJ.name, PROJ.version) do |package|
    package.need_tar = PROJ.need_tar
    package.need_zip = PROJ.need_zip
    package.package_files += PROJ.spec.files
  end
  Rake::Task['gem:package'].instance_variable_set(:@full_comment, nil)

  # Gems for a non-Ruby platform carry the platform in their file name.
  gem_file = if PROJ.spec.platform == Gem::Platform::RUBY
    "#{pkg.package_name}.gem"
  else
    "#{pkg.package_name}-#{PROJ.spec.platform}.gem"
  end

  desc "Build the gem file #{gem_file}"
  task :package => "#{pkg.package_dir}/#{gem_file}"

  file "#{pkg.package_dir}/#{gem_file}" => [pkg.package_dir] + PROJ.spec.files do
    when_writing("Creating GEM") {
      Gem::Builder.new(PROJ.spec).build
      verbose(true) {
        mv gem_file, "#{pkg.package_dir}/#{gem_file}"
      }
    }
  end

  desc 'Install the gem'
  task :install => [:clobber, :package] do
    # Install the exact file the :package task produced instead of a
    # hard-coded "pkg/<full_name>" path.
    sh "#{SUDO} #{GEM} install #{pkg.package_dir}/#{gem_file}"
  end

  desc 'Uninstall the gem'
  task :uninstall do
    sh "#{SUDO} #{GEM} uninstall -v '#{PROJ.version}' -x #{PROJ.name}"
  end

end # namespace :gem

desc 'Alias to gem:package'
task :gem => 'gem:package'

task :clobber => 'gem:clobber_package'

remove_desc_for_task %w(gem:clobber_package)
109
+
110
+ # EOF