rmmseg 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,26 @@
1
+ # -*- coding: utf-8 -*-
2
+ require File.join(File.dirname(__FILE__), 'spec_helper')
3
+
4
+ describe 'chunk' do # Specs for the aggregate metrics exposed by RMMSeg::Chunk, all run against one shared fixture.
5
+ before(:all) do # built once for the whole group
6
+ @words = gen_words(['中文', '中国字', '我', '中华人民共和国'], # four words of 2, 3, 1 and 7 characters
7
+ [10, 7, 100, 8]) # frequency paired with each word, by position
8
+ @chunk = RMMSeg::Chunk.new(@words)
9
+ end
10
+
11
+ it "should return proper total length" do
12
+ @chunk.total_length.should == 13 # 2 + 3 + 1 + 7 characters
13
+ end
14
+
15
+ it "should return proper average length" do
16
+ @chunk.average_length.should == 13.0/4 # total length divided by the number of words
17
+ end
18
+
19
+ it "should return proper variance" do
20
+ @chunk.variance.to_i.should == 4 # NOTE(review): only the integer part is pinned; the exact formula lives in Chunk, which is not shown here
21
+ end
22
+
23
+ it "should return proper degree of morphemic freedom" do
24
+ @chunk.degree_of_morphemic_freedom.should == 100 # matches the frequency given to '我', the only one-character word; presumably only such words count - confirm in Chunk
25
+ end
26
+ end
@@ -0,0 +1,18 @@
1
+ # -*- coding: utf-8 -*-
2
+ require File.join(File.dirname(__FILE__), 'spec_helper')
3
+
4
+ describe "complex algorithm" do # End-to-end checks of RMMSeg::ComplexAlgorithm#segment on Chinese text.
5
+ it "should behave well as svwl rule" do
6
+ text = "研究生命科学"
7
+ segs = RMMSeg::ComplexAlgorithm.new(text).segment
8
+ segs.length.should == 3 # six characters are expected to split into three segments
9
+ segs[0].should == "研究" # the first segment must be the two-character word, not the longer "研究生"
10
+ end
11
+
12
+ it "should segment a relative big chunk of Chinese" do
13
+ text = "主持人把一只割去头的羊放在指定处。枪响后,甲乙两队共同向羊飞驰而去,先抢到羊的同队队员互相掩护,极力向终点奔驰,双方骑手们施展各种技巧,围追堵截,拼命抢夺。叼着羊先到达终点的为胜方。获胜者按照当地的习俗,将羊当场烤熟,请众骑手共享,称为“幸福肉”。"
14
+ segs = RMMSeg::ComplexAlgorithm.new(text).segment
15
+ segs.length.should == 87 # NOTE(review): this count presumably depends on the bundled dictionary contents - confirm before changing the dictionaries
16
+ segs[0].should == "主持人"
17
+ end
18
+ end
@@ -0,0 +1,12 @@
1
+ require File.join(File.dirname(__FILE__), 'spec_helper')
2
+
3
+ describe "RMMSeg Config" do # Specs for the RMMSeg::Config.algorithm reader/writer pair.
4
+ it "should be able to store and retrive config values" do # NOTE(review): "retrive" in this description is a typo for "retrieve"
5
+ RMMSeg::Config.algorithm = :simple
6
+ RMMSeg::Config.algorithm.should == :simple # the value just written must be read back unchanged
7
+ end
8
+
9
+ it "should reject invalid algorithm" do
10
+ lambda { RMMSeg::Config.algorithm = :foobar }.should raise_error(ArgumentError) # the assignment is wrapped in a lambda so the matcher can observe the raise
11
+ end
12
+ end
@@ -0,0 +1,20 @@
1
+ require File.join(File.dirname(__FILE__), 'spec_helper')
2
+
3
+ describe "dictionary" do # Specs for lookups on RMMSeg::Dictionary. NOTE(review): unlike the sibling spec files, this one has no "# -*- coding: utf-8 -*-" line although it contains non-ASCII literals.
4
+
5
+ before(:all) do
6
+ @dic = RMMSeg::Dictionary.instance # one dictionary object, obtained through .instance, is shared by every example
7
+ end
8
+
9
+ it "should contain frequency information for chars" do
10
+ @dic.get_word("你").frequency.should == 915385 # NOTE(review): presumably the frequency recorded for this character in the bundled chars.dic - confirm
11
+ end
12
+
13
+ it "should handle words" do
14
+ @dic.has_word?("你们").should == true # a two-character word expected to be present
15
+ end
16
+
17
+ it "should ignore words which exceed the maximum length" do
18
+ @dic.has_word?("这是一个超出长度的词组").should == false # an eleven-character phrase; the maximum length itself is defined outside this file
19
+ end
20
+ end
@@ -0,0 +1,15 @@
1
+ # -*- coding: utf-8 -*-
2
+ require File.join(File.dirname(__FILE__), 'spec_helper')
3
+
4
+ describe "largest average word length rule" do # Spec for RMMSeg::LAWLRule#filter.
5
+ it "should return chunks with the maximum average word length" do
6
+ chunks = [ # three ways of splitting the same three characters
7
+ RMMSeg::Chunk.new(gen_words(["国际化"])), # one word of 3 characters
8
+ RMMSeg::Chunk.new(gen_words(["国际", "化"])), # two words of 2 and 1 characters
9
+ RMMSeg::Chunk.new(gen_words(["国", "际", "化"])) # three words of 1 character each
10
+ ]
11
+ chunks = RMMSeg::LAWLRule.new.filter(chunks)
12
+ chunks.length.should == 1 # only one candidate may survive the filter
13
+ chunks[0].words[0].text.should == "国际化" # the survivor is the single-word chunk
14
+ end
15
+ end
@@ -0,0 +1,14 @@
1
+ # -*- coding: utf-8 -*-
2
+ require File.join(File.dirname(__FILE__), 'spec_helper')
3
+
4
+ describe "largest sum of degree of morphemic freedom of one-character words rule" do # Spec for RMMSeg::LSDMFOCWRule#filter.
5
+ it "should return chunks of the largest sum of degree of morphemic freedom of one-character words" do
6
+ chunks = [ # two candidates that differ in which character stands alone
7
+ RMMSeg::Chunk.new(gen_words(["主要", "是", "因为"], [nil, 100, nil])), # the one-character word "是" is given frequency 100
8
+ RMMSeg::Chunk.new(gen_words(["主", "要是", "因为"], [10, nil, nil])) # the one-character word "主" is given frequency 10
9
+ ]
10
+ chunks = RMMSeg::LSDMFOCWRule.new.filter(chunks)
11
+ chunks.length.should == 1 # only one candidate may survive the filter
12
+ chunks[0].words[0].text.should == "主要" # the survivor is the first chunk, whose one-character word has the higher frequency
13
+ end
14
+ end
@@ -0,0 +1,15 @@
1
+ # -*- coding: utf-8 -*-
2
+ require File.join(File.dirname(__FILE__), 'spec_helper')
3
+
4
+ describe 'maximum matching rule' do # Spec for RMMSeg::MMRule#filter.
5
+ it "should select chunks with the maximun total length" do # NOTE(review): "maximun" in this description is a typo for "maximum"
6
+ chunks = [
7
+ RMMSeg::Chunk.new(gen_words(["眼看", "就要", "来了"])), # 6 characters in total
8
+ RMMSeg::Chunk.new(gen_words(["眼", "看", "就", "要", "来", "了"])), # 6 characters in total
9
+ RMMSeg::Chunk.new(gen_words(["眼看", "就要", "来"])), # 5 characters in total
10
+ RMMSeg::Chunk.new(gen_words(["眼", "看", "就"])) # 3 characters in total
11
+ ]
12
+ chunks = RMMSeg::MMRule.new.filter(chunks)
13
+ chunks.length.should == 2 # the two six-character chunks tie, so both are expected to be kept
14
+ end
15
+ end
@@ -0,0 +1,46 @@
1
+ # -*- coding: utf-8 -*-
2
+ require File.join(File.dirname(__FILE__), 'spec_helper')
3
+
4
+ describe "simple algorithm" do # Specs for RMMSeg::SimpleAlgorithm: #segment results and the positions reported by #next_token.
5
+ it "should handle simple cases" do
6
+ text = "我们都喜欢用 Ruby" # Chinese followed by a space and an ASCII word
7
+ segs = RMMSeg::SimpleAlgorithm.new(text).segment
8
+ segs.length.should == 5
9
+ segs[0].should == "我们"
10
+ end
11
+
12
+ it "shouldn't be able to handle some case" do # documents a known limitation rather than a desired result
13
+ text = "研究生命起源"
14
+ segs = RMMSeg::SimpleAlgorithm.new(text).segment
15
+ segs.length.should == 3
16
+ segs[0].should_not == "研究" # the two-character word is not what comes out first
17
+ segs[0].should == "研究生" # the longer three-character match is taken instead
18
+ end
19
+
20
+ it "should handle pure English as well" do
21
+ text = "This is a paragraph of English."
22
+ segs = RMMSeg::SimpleAlgorithm.new(text).segment
23
+ segs.length.should == 6 # the sentence contains six words; NOTE(review): the trailing "." is presumably not emitted as a segment - confirm
24
+ segs[0].should == "This"
25
+ end
26
+
27
+ it "should handle byte positions of English well" do
28
+ text = "This is a paragraph of English."
29
+ algor = RMMSeg::SimpleAlgorithm.new(text)
30
+ 3.times { algor.next_token } # discard the three tokens that precede the one under test
31
+ token = algor.next_token
32
+ token.text.should == "paragraph"
33
+ token.start_pos.should == 10 # "This is a " occupies the first 10 bytes
34
+ token.end_pos.should == 19 # 10 + 9 bytes of "paragraph"
35
+ end
36
+
37
+ it "should handle byte positions of Chinese well" do
38
+ text = "这是一句中文"
39
+ algor = RMMSeg::SimpleAlgorithm.new(text)
40
+ 2.times { algor.next_token } # discard the two tokens that precede the one under test
41
+ token = algor.next_token
42
+ token.text.should == "中文"
43
+ token.start_pos.should == 12 # four preceding characters at 3 bytes each in UTF-8
44
+ token.end_pos.should == 18 # 12 + 6 bytes for the two characters of "中文"
45
+ end
46
+ end
@@ -0,0 +1,15 @@
1
+ $: << File.join(File.dirname(__FILE__), "../lib")
2
+ require 'rmmseg'
3
+
4
+ require 'rubygems'
5
+ require 'spec'
6
+
7
+ def gen_words words, freqs=nil # Spec helper: turns an array of strings into an array of RMMSeg::Word objects; freqs, when given, supplies one frequency per word by position.
8
+ if freqs.nil? # no frequencies supplied
9
+ words.map { |word| RMMSeg::Word.new(word) } # use the one-argument constructor and leave type and frequency to Word's defaults
10
+ else
11
+ words.zip(freqs).map { |word, freq| # pair each word with the frequency at the same index (nil where freqs is shorter)
12
+ RMMSeg::Word.new(word, RMMSeg::Word::TYPES[:cjk_word], freq) # every word built on this path is typed :cjk_word; nil frequencies are passed through as-is
13
+ }
14
+ end
15
+ end
@@ -0,0 +1,14 @@
1
+ # -*- coding: utf-8 -*-
2
+ require File.join(File.dirname(__FILE__), 'spec_helper')
3
+
4
+ describe "smallest variance of word length rule" do # Spec for RMMSeg::SVWLRule#filter.
5
+ it "should return chunks with the smallest word length variance" do
6
+ chunks = [ # two splits of the same six characters
7
+ RMMSeg::Chunk.new(gen_words(["研究", "生命", "起源"])), # word lengths 2, 2, 2
8
+ RMMSeg::Chunk.new(gen_words(["研究生", "命", "起源"])) # word lengths 3, 1, 2
9
+ ]
10
+ chunks = RMMSeg::SVWLRule.new.filter(chunks)
11
+ chunks.length.should == 1 # only one candidate may survive the filter
12
+ chunks[0].words[0].text.should == "研究" # the survivor is the chunk whose words all have the same length
13
+ end
14
+ end
data/spec/word_spec.rb ADDED
@@ -0,0 +1,9 @@
1
+ # -*- coding: utf-8 -*-
2
+ require File.join(File.dirname(__FILE__), 'spec_helper')
3
+
4
+ describe 'word' do # Spec for RMMSeg::Word#length.
5
+ it "should return proper length on CJK words" do
6
+ w = RMMSeg::Word.new('中文') # two characters, six bytes when encoded as UTF-8
7
+ w.length.should == 2 # length is expected in characters, not bytes
8
+ end
9
+ end
metadata ADDED
@@ -0,0 +1,101 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: rmmseg
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - pluskid
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2008-02-01 00:00:00 +08:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: hoe
17
+ version_requirement:
18
+ version_requirements: !ruby/object:Gem::Requirement
19
+ requirements:
20
+ - - ">="
21
+ - !ruby/object:Gem::Version
22
+ version: 1.5.0
23
+ version:
24
+ description: "RMMSeg is an implementation of MMSEG Chinese word segmentation algorithm. It is based on two variants of maximum matching algorithms. Two algorithms are available for using: * simple algorithm that uses only forward maximum matching. * complex algorithm that uses three-word chunk maximum matching and 3 additional rules to solve ambiguities. For more information about the algorithm, please refer to the following essays: * http://technology.chtsai.org/mmseg/ * http://pluskid.lifegoo.com/?p=261"
25
+ email: pluskid@gmail.com
26
+ executables:
27
+ - rmmseg
28
+ extensions: []
29
+
30
+ extra_rdoc_files:
31
+ - History.txt
32
+ - Manifest.txt
33
+ - README.txt
34
+ - TODO.txt
35
+ files:
36
+ - History.txt
37
+ - Manifest.txt
38
+ - README.txt
39
+ - Rakefile
40
+ - TODO.txt
41
+ - bin/rmmseg
42
+ - lib/rmmseg.rb
43
+ - lib/rmmseg/algorithm.rb
44
+ - lib/rmmseg/amibguity.rb
45
+ - lib/rmmseg/chars.dic
46
+ - lib/rmmseg/chunk.rb
47
+ - lib/rmmseg/complex_algorithm.rb
48
+ - lib/rmmseg/config.rb
49
+ - lib/rmmseg/dictionary.rb
50
+ - lib/rmmseg/ferret.rb
51
+ - lib/rmmseg/lawl_rule.rb
52
+ - lib/rmmseg/lsdmfocw_rule.rb
53
+ - lib/rmmseg/mm_rule.rb
54
+ - lib/rmmseg/rule_helper.rb
55
+ - lib/rmmseg/simple_algorithm.rb
56
+ - lib/rmmseg/svwl_rule.rb
57
+ - lib/rmmseg/token.rb
58
+ - lib/rmmseg/word.rb
59
+ - lib/rmmseg/words.dic
60
+ - misc/homepage.erb
61
+ - misc/homepage.html
62
+ - spec/chunk_spec.rb
63
+ - spec/complex_algorithm_spec.rb
64
+ - spec/config_spec.rb
65
+ - spec/dictionary_spec.rb
66
+ - spec/lawl_rule_spec.rb
67
+ - spec/lsdmfocw_rule_spec.rb
68
+ - spec/mm_rule_spec.rb
69
+ - spec/simple_algorithm_spec.rb
70
+ - spec/spec_helper.rb
71
+ - spec/svwl_rule_spec.rb
72
+ - spec/word_spec.rb
73
+ has_rdoc: true
74
+ homepage: http://rmmseg.rubyforge.org
75
+ post_install_message:
76
+ rdoc_options:
77
+ - --main
78
+ - README.txt
79
+ require_paths:
80
+ - lib
81
+ required_ruby_version: !ruby/object:Gem::Requirement
82
+ requirements:
83
+ - - ">="
84
+ - !ruby/object:Gem::Version
85
+ version: "0"
86
+ version:
87
+ required_rubygems_version: !ruby/object:Gem::Requirement
88
+ requirements:
89
+ - - ">="
90
+ - !ruby/object:Gem::Version
91
+ version: "0"
92
+ version:
93
+ requirements: []
94
+
95
+ rubyforge_project: rmmseg
96
+ rubygems_version: 1.0.1
97
+ signing_key:
98
+ specification_version: 2
99
+ summary: RMMSeg is an implementation of MMSEG algorithm in Ruby. MMSEG is a Chinese segmentation algorithm based on two variants of maximum matching. RMMSeg can be used as a stand alone program or as an Analyzer of Ferret.
100
+ test_files: []
101
+