rmmseg 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,26 @@
1
+ # -*- coding: utf-8 -*-
2
+ require File.join(File.dirname(__FILE__), 'spec_helper')
3
+
4
# Specs for the aggregate figures an RMMSeg::Chunk reports over its words.
describe 'chunk' do
  before(:all) do
    # Four words of 2, 3, 1 and 7 characters, each paired with a frequency.
    texts = ['中文', '中国字', '我', '中华人民共和国']
    frequencies = [10, 7, 100, 8]
    @words = gen_words(texts, frequencies)
    @chunk = RMMSeg::Chunk.new(@words)
  end

  it "should return proper total length" do
    # 2 + 3 + 1 + 7 characters.
    @chunk.total_length.should == 13
  end

  it "should return proper average length" do
    # Total length divided by the four words in the fixture.
    @chunk.average_length.should == 13.0/4
  end

  it "should return proper variance" do
    # Only the integer part of the reported variance is pinned.
    @chunk.variance.to_i.should == 4
  end

  it "should return proper degree of morphemic freedom" do
    # '我' is the only one-character word in the fixture; its frequency is 100.
    @chunk.degree_of_morphemic_freedom.should == 100
  end
end
@@ -0,0 +1,18 @@
1
+ # -*- coding: utf-8 -*-
2
+ require File.join(File.dirname(__FILE__), 'spec_helper')
3
+
4
# Specs for the segments produced by RMMSeg::ComplexAlgorithm.
describe "complex algorithm" do
  it "should behave well as svwl rule" do
    tokens = RMMSeg::ComplexAlgorithm.new("研究生命科学").segment
    tokens.length.should == 3
    tokens[0].should == "研究"
  end

  it "should segment a relative big chunk of Chinese" do
    passage = "主持人把一只割去头的羊放在指定处。枪响后,甲乙两队共同向羊飞驰而去,先抢到羊的同队队员互相掩护,极力向终点奔驰,双方骑手们施展各种技巧,围追堵截,拼命抢夺。叼着羊先到达终点的为胜方。获胜者按照当地的习俗,将羊当场烤熟,请众骑手共享,称为“幸福肉”。"
    tokens = RMMSeg::ComplexAlgorithm.new(passage).segment
    # Only the segment count and the first segment are pinned.
    tokens.length.should == 87
    tokens[0].should == "主持人"
  end
end
@@ -0,0 +1,12 @@
1
+ require File.join(File.dirname(__FILE__), 'spec_helper')
2
+
3
# Specs for the RMMSeg::Config.algorithm setting.
describe "RMMSeg Config" do
  it "should be able to store and retrive config values" do
    # Write the setting, then read it back.
    RMMSeg::Config.algorithm = :simple
    RMMSeg::Config.algorithm.should == :simple
  end

  it "should reject invalid algorithm" do
    assign_unknown = lambda { RMMSeg::Config.algorithm = :foobar }
    assign_unknown.should raise_error(ArgumentError)
  end
end
@@ -0,0 +1,20 @@
1
# -*- coding: utf-8 -*-
# The magic comment above is required because this file contains non-ASCII
# string literals; it matches the other spec files that embed CJK text.
require File.join(File.dirname(__FILE__), 'spec_helper')

# Specs for lookups against the RMMSeg::Dictionary singleton.
describe "dictionary" do

  before(:all) do
    @dic = RMMSeg::Dictionary.instance
  end

  it "should contain frequency information for chars" do
    @dic.get_word("你").frequency.should == 915385
  end

  it "should handle words" do
    @dic.has_word?("你们").should == true
  end

  it "should ignore words which exceed the maximum length" do
    @dic.has_word?("这是一个超出长度的词组").should == false
  end
end
@@ -0,0 +1,15 @@
1
+ # -*- coding: utf-8 -*-
2
+ require File.join(File.dirname(__FILE__), 'spec_helper')
3
+
4
# Spec for RMMSeg::LAWLRule#filter.
describe "largest average word length rule" do
  it "should return chunks with the maximum average word length" do
    # Three ways of splitting the same three characters: 1, 2 and 3 words.
    splits = [
      ["国际化"],
      ["国际", "化"],
      ["国", "际", "化"]
    ]
    candidates = splits.map { |texts| RMMSeg::Chunk.new(gen_words(texts)) }

    kept = RMMSeg::LAWLRule.new.filter(candidates)
    kept.length.should == 1
    kept[0].words[0].text.should == "国际化"
  end
end
@@ -0,0 +1,14 @@
1
+ # -*- coding: utf-8 -*-
2
+ require File.join(File.dirname(__FILE__), 'spec_helper')
3
+
4
# Spec for RMMSeg::LSDMFOCWRule#filter.
describe "largest sum of degree of morphemic freedom of one-character words rule" do
  it "should return chunks of the largest sum of degree of morphemic freedom of one-character words" do
    # Each entry pairs word texts with their frequencies; only the
    # one-character words are given a frequency (100 versus 10).
    fixtures = [
      [["主要", "是", "因为"], [nil, 100, nil]],
      [["主", "要是", "因为"], [10, nil, nil]]
    ]
    candidates = fixtures.map do |texts, freqs|
      RMMSeg::Chunk.new(gen_words(texts, freqs))
    end

    kept = RMMSeg::LSDMFOCWRule.new.filter(candidates)
    kept.length.should == 1
    kept[0].words[0].text.should == "主要"
  end
end
@@ -0,0 +1,15 @@
1
+ # -*- coding: utf-8 -*-
2
+ require File.join(File.dirname(__FILE__), 'spec_helper')
3
+
4
# Spec for RMMSeg::MMRule#filter.
describe 'maximum matching rule' do
  it "should select chunks with the maximun total length" do
    # The first two splits cover six characters; the last two cover fewer.
    splits = [
      ["眼看", "就要", "来了"],
      ["眼", "看", "就", "要", "来", "了"],
      ["眼看", "就要", "来"],
      ["眼", "看", "就"]
    ]
    candidates = splits.map { |texts| RMMSeg::Chunk.new(gen_words(texts)) }

    kept = RMMSeg::MMRule.new.filter(candidates)
    kept.length.should == 2
  end
end
@@ -0,0 +1,46 @@
1
+ # -*- coding: utf-8 -*-
2
+ require File.join(File.dirname(__FILE__), 'spec_helper')
3
+
4
# Specs for RMMSeg::SimpleAlgorithm: whole-text segmentation via #segment
# and token-by-token iteration via #next_token.
describe "simple algorithm" do
  it "should handle simple cases" do
    tokens = RMMSeg::SimpleAlgorithm.new("我们都喜欢用 Ruby").segment
    tokens.length.should == 5
    tokens[0].should == "我们"
  end

  it "shouldn't be able to handle some case" do
    # The expected first segment here is "研究生", not "研究".
    tokens = RMMSeg::SimpleAlgorithm.new("研究生命起源").segment
    tokens.length.should == 3
    tokens[0].should_not == "研究"
    tokens[0].should == "研究生"
  end

  it "should handle pure English as well" do
    tokens = RMMSeg::SimpleAlgorithm.new("This is a paragraph of English.").segment
    tokens.length.should == 6
    tokens[0].should == "This"
  end

  it "should handle byte positions of English well" do
    segmenter = RMMSeg::SimpleAlgorithm.new("This is a paragraph of English.")
    # Skip the three tokens that precede "paragraph".
    3.times { segmenter.next_token }
    fourth = segmenter.next_token
    fourth.text.should == "paragraph"
    fourth.start_pos.should == 10
    fourth.end_pos.should == 19
  end

  it "should handle byte positions of Chinese well" do
    segmenter = RMMSeg::SimpleAlgorithm.new("这是一句中文")
    # Skip the two tokens that precede "中文".
    2.times { segmenter.next_token }
    third = segmenter.next_token
    third.text.should == "中文"
    # Four characters precede "中文"; 12 is their size in UTF-8 bytes.
    third.start_pos.should == 12
    third.end_pos.should == 18
  end
end
@@ -0,0 +1,15 @@
1
+ $: << File.join(File.dirname(__FILE__), "../lib")
2
+ require 'rmmseg'
3
+
4
+ require 'rubygems'
5
+ require 'spec'
6
+
7
# Builds RMMSeg::Word fixtures for the specs.
#
# words - Array of word texts.
# freqs - Optional Array of frequencies, paired positionally with +words+.
#         When omitted (nil), each word is created from its text alone.
#         When given, each word is created with the :cjk_word type and its
#         paired frequency; positions with no matching entry in +freqs+
#         receive nil, because Array#zip pads with nil.
#
# Returns an Array of RMMSeg::Word, in the same order as +words+.
def gen_words(words, freqs = nil)
  return words.map { |word| RMMSeg::Word.new(word) } if freqs.nil?

  words.zip(freqs).map do |word, freq|
    RMMSeg::Word.new(word, RMMSeg::Word::TYPES[:cjk_word], freq)
  end
end
@@ -0,0 +1,14 @@
1
+ # -*- coding: utf-8 -*-
2
+ require File.join(File.dirname(__FILE__), 'spec_helper')
3
+
4
# Spec for RMMSeg::SVWLRule#filter.
describe "smallest variance of word length rule" do
  it "should return chunks with the smallest word length variance" do
    # Word lengths are 2/2/2 in the first split and 3/1/2 in the second.
    splits = [
      ["研究", "生命", "起源"],
      ["研究生", "命", "起源"]
    ]
    candidates = splits.map { |texts| RMMSeg::Chunk.new(gen_words(texts)) }

    kept = RMMSeg::SVWLRule.new.filter(candidates)
    kept.length.should == 1
    kept[0].words[0].text.should == "研究"
  end
end
data/spec/word_spec.rb ADDED
@@ -0,0 +1,9 @@
1
+ # -*- coding: utf-8 -*-
2
+ require File.join(File.dirname(__FILE__), 'spec_helper')
3
+
4
# Spec for RMMSeg::Word#length.
describe 'word' do
  it "should return proper length on CJK words" do
    # The expected length of the two-character word is 2.
    RMMSeg::Word.new('中文').length.should == 2
  end
end
metadata ADDED
@@ -0,0 +1,101 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: rmmseg
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - pluskid
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2008-02-01 00:00:00 +08:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: hoe
17
+ version_requirement:
18
+ version_requirements: !ruby/object:Gem::Requirement
19
+ requirements:
20
+ - - ">="
21
+ - !ruby/object:Gem::Version
22
+ version: 1.5.0
23
+ version:
24
+ description: "RMMSeg is an implementation of the MMSEG Chinese word segmentation algorithm. It is based on two variants of maximum matching algorithms. Two algorithms are available for use: * simple algorithm that uses only forward maximum matching. * complex algorithm that uses three-word chunk maximum matching and 3 additional rules to solve ambiguities. For more information about the algorithm, please refer to the following essays: * http://technology.chtsai.org/mmseg/ * http://pluskid.lifegoo.com/?p=261"
25
+ email: pluskid@gmail.com
26
+ executables:
27
+ - rmmseg
28
+ extensions: []
29
+
30
+ extra_rdoc_files:
31
+ - History.txt
32
+ - Manifest.txt
33
+ - README.txt
34
+ - TODO.txt
35
+ files:
36
+ - History.txt
37
+ - Manifest.txt
38
+ - README.txt
39
+ - Rakefile
40
+ - TODO.txt
41
+ - bin/rmmseg
42
+ - lib/rmmseg.rb
43
+ - lib/rmmseg/algorithm.rb
44
+ - lib/rmmseg/amibguity.rb
45
+ - lib/rmmseg/chars.dic
46
+ - lib/rmmseg/chunk.rb
47
+ - lib/rmmseg/complex_algorithm.rb
48
+ - lib/rmmseg/config.rb
49
+ - lib/rmmseg/dictionary.rb
50
+ - lib/rmmseg/ferret.rb
51
+ - lib/rmmseg/lawl_rule.rb
52
+ - lib/rmmseg/lsdmfocw_rule.rb
53
+ - lib/rmmseg/mm_rule.rb
54
+ - lib/rmmseg/rule_helper.rb
55
+ - lib/rmmseg/simple_algorithm.rb
56
+ - lib/rmmseg/svwl_rule.rb
57
+ - lib/rmmseg/token.rb
58
+ - lib/rmmseg/word.rb
59
+ - lib/rmmseg/words.dic
60
+ - misc/homepage.erb
61
+ - misc/homepage.html
62
+ - spec/chunk_spec.rb
63
+ - spec/complex_algorithm_spec.rb
64
+ - spec/config_spec.rb
65
+ - spec/dictionary_spec.rb
66
+ - spec/lawl_rule_spec.rb
67
+ - spec/lsdmfocw_rule_spec.rb
68
+ - spec/mm_rule_spec.rb
69
+ - spec/simple_algorithm_spec.rb
70
+ - spec/spec_helper.rb
71
+ - spec/svwl_rule_spec.rb
72
+ - spec/word_spec.rb
73
+ has_rdoc: true
74
+ homepage: http://rmmseg.rubyforge.org
75
+ post_install_message:
76
+ rdoc_options:
77
+ - --main
78
+ - README.txt
79
+ require_paths:
80
+ - lib
81
+ required_ruby_version: !ruby/object:Gem::Requirement
82
+ requirements:
83
+ - - ">="
84
+ - !ruby/object:Gem::Version
85
+ version: "0"
86
+ version:
87
+ required_rubygems_version: !ruby/object:Gem::Requirement
88
+ requirements:
89
+ - - ">="
90
+ - !ruby/object:Gem::Version
91
+ version: "0"
92
+ version:
93
+ requirements: []
94
+
95
+ rubyforge_project: rmmseg
96
+ rubygems_version: 1.0.1
97
+ signing_key:
98
+ specification_version: 2
99
+ summary: RMMSeg is an implementation of MMSEG algorithm in Ruby. MMSEG is a Chinese segmentation algorithm based on two variants of maximum matching. RMMSeg can be used as a stand alone program or as an Analyzer of Ferret.
100
+ test_files: []
101
+