rmmseg 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +6 -0
- data/Manifest.txt +37 -0
- data/README.txt +63 -0
- data/Rakefile +33 -0
- data/TODO.txt +3 -0
- data/bin/rmmseg +63 -0
- data/lib/rmmseg/algorithm.rb +157 -0
- data/lib/rmmseg/amibguity.rb +4 -0
- data/lib/rmmseg/chars.dic +12638 -0
- data/lib/rmmseg/chunk.rb +51 -0
- data/lib/rmmseg/complex_algorithm.rb +52 -0
- data/lib/rmmseg/config.rb +59 -0
- data/lib/rmmseg/dictionary.rb +66 -0
- data/lib/rmmseg/ferret.rb +43 -0
- data/lib/rmmseg/lawl_rule.rb +14 -0
- data/lib/rmmseg/lsdmfocw_rule.rb +15 -0
- data/lib/rmmseg/mm_rule.rb +15 -0
- data/lib/rmmseg/rule_helper.rb +22 -0
- data/lib/rmmseg/simple_algorithm.rb +22 -0
- data/lib/rmmseg/svwl_rule.rb +14 -0
- data/lib/rmmseg/token.rb +22 -0
- data/lib/rmmseg/word.rb +37 -0
- data/lib/rmmseg/words.dic +120330 -0
- data/lib/rmmseg.rb +15 -0
- data/misc/homepage.erb +93 -0
- data/misc/homepage.html +1063 -0
- data/spec/chunk_spec.rb +26 -0
- data/spec/complex_algorithm_spec.rb +18 -0
- data/spec/config_spec.rb +12 -0
- data/spec/dictionary_spec.rb +20 -0
- data/spec/lawl_rule_spec.rb +15 -0
- data/spec/lsdmfocw_rule_spec.rb +14 -0
- data/spec/mm_rule_spec.rb +15 -0
- data/spec/simple_algorithm_spec.rb +46 -0
- data/spec/spec_helper.rb +15 -0
- data/spec/svwl_rule_spec.rb +14 -0
- data/spec/word_spec.rb +9 -0
- metadata +101 -0
data/spec/chunk_spec.rb
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
require File.join(File.dirname(__FILE__), 'spec_helper')
|
3
|
+
|
4
|
+
# Exercises the aggregate statistics exposed by RMMSeg::Chunk, using one
# four-word fixture that is built once for the whole group.
describe 'chunk' do
  before(:all) do
    texts = ['中文', '中国字', '我', '中华人民共和国']
    frequencies = [10, 7, 100, 8]
    @words = gen_words(texts, frequencies)
    @chunk = RMMSeg::Chunk.new(@words)
  end

  it "should return proper total length" do
    # The fixture words are 2 + 3 + 1 + 7 characters long.
    expected_total = 13
    @chunk.total_length.should == expected_total
  end

  it "should return proper average length" do
    # Total length divided by the number of words in the fixture.
    expected_average = 13.0 / 4
    @chunk.average_length.should == expected_average
  end

  it "should return proper variance" do
    # Only the integer part of the reported variance is checked.
    truncated_variance = @chunk.variance.to_i
    truncated_variance.should == 4
  end

  it "should return proper degree of morphemic freedom" do
    # NOTE(review): '我' is the only one-character word in the fixture and it
    # was given frequency 100 -- presumably where this value comes from;
    # confirm against RMMSeg::Chunk.
    @chunk.degree_of_morphemic_freedom.should == 100
  end
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
require File.join(File.dirname(__FILE__), 'spec_helper')
|
3
|
+
|
4
|
+
# End-to-end checks for RMMSeg::ComplexAlgorithm#segment: each example
# segments a fixed sentence and verifies the segment count and first segment.
describe "complex algorithm" do
  it "should behave well as svwl rule" do
    sentence = "研究生命科学"
    algorithm = RMMSeg::ComplexAlgorithm.new(sentence)
    segments = algorithm.segment
    segments.length.should == 3
    segments[0].should == "研究"
  end

  it "should segment a relative big chunk of Chinese" do
    paragraph = "主持人把一只割去头的羊放在指定处。枪响后,甲乙两队共同向羊飞驰而去,先抢到羊的同队队员互相掩护,极力向终点奔驰,双方骑手们施展各种技巧,围追堵截,拼命抢夺。叼着羊先到达终点的为胜方。获胜者按照当地的习俗,将羊当场烤熟,请众骑手共享,称为“幸福肉”。"
    algorithm = RMMSeg::ComplexAlgorithm.new(paragraph)
    segments = algorithm.segment
    segments.length.should == 87
    segments[0].should == "主持人"
  end
end
|
data/spec/config_spec.rb
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), 'spec_helper')
|
2
|
+
|
3
|
+
# Covers the RMMSeg::Config.algorithm accessor pair: a valid symbol is
# stored and read back, an unknown one is rejected with ArgumentError.
describe "RMMSeg Config" do
  it "should be able to store and retrive config values" do
    # NOTE(review): this assigns a module-level setting and does not restore
    # the previous value afterwards.
    RMMSeg::Config.algorithm = :simple
    stored = RMMSeg::Config.algorithm
    stored.should == :simple
  end

  it "should reject invalid algorithm" do
    bad_assignment = lambda do
      RMMSeg::Config.algorithm = :foobar
    end
    bad_assignment.should raise_error(ArgumentError)
  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
# The magic comment above must stay on the first line of the file: this spec
# contains non-ASCII string literals, and Ruby 1.9 refuses to parse them in a
# source file that does not declare its encoding.
require File.join(File.dirname(__FILE__), 'spec_helper')

# Checks lookups against the shared RMMSeg::Dictionary instance: frequency
# data for a single character, presence of a known word, and absence of an
# over-long phrase.
describe "dictionary" do

  before(:all) do
    # The dictionary is obtained through .instance and reused by every example.
    @dic = RMMSeg::Dictionary.instance
  end

  it "should contain frequency information for chars" do
    @dic.get_word("你").frequency.should == 915385
  end

  it "should handle words" do
    @dic.has_word?("你们").should == true
  end

  it "should ignore words which exceed the maximum length" do
    @dic.has_word?("这是一个超出长度的词组").should == false
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
require File.join(File.dirname(__FILE__), 'spec_helper')
|
3
|
+
|
4
|
+
# Feeds RMMSeg::LAWLRule#filter three segmentations of the same three
# characters and expects only the single-word segmentation to remain.
describe "largest average word length rule" do
  it "should return chunks with the maximum average word length" do
    segmentations = [
      ["国际化"],
      ["国际", "化"],
      ["国", "际", "化"]
    ]
    candidates = segmentations.map do |texts|
      RMMSeg::Chunk.new(gen_words(texts))
    end

    survivors = RMMSeg::LAWLRule.new.filter(candidates)

    survivors.length.should == 1
    survivors[0].words[0].text.should == "国际化"
  end
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
require File.join(File.dirname(__FILE__), 'spec_helper')
|
3
|
+
|
4
|
+
# Feeds RMMSeg::LSDMFOCWRule#filter two segmentations whose one-character
# words carry different frequencies (100 versus 10) and expects the chunk
# starting with "主要" to be the only one kept.
describe "largest sum of degree of morphemic freedom of one-character words rule" do
  it "should return chunks of the largest sum of degree of morphemic freedom of one-character words" do
    # Each entry pairs the word texts with their frequencies; nil is passed
    # for the multi-character words.
    fixtures = [
      [["主要", "是", "因为"], [nil, 100, nil]],
      [["主", "要是", "因为"], [10, nil, nil]]
    ]
    candidates = fixtures.map do |texts, freqs|
      RMMSeg::Chunk.new(gen_words(texts, freqs))
    end

    survivors = RMMSeg::LSDMFOCWRule.new.filter(candidates)

    survivors.length.should == 1
    survivors[0].words[0].text.should == "主要"
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
require File.join(File.dirname(__FILE__), 'spec_helper')
|
3
|
+
|
4
|
+
# Feeds RMMSeg::MMRule#filter four candidate chunks; two of them cover six
# characters and two cover fewer, and exactly two chunks are expected back.
describe 'maximum matching rule' do
  it "should select chunks with the maximun total length" do
    segmentations = [
      ["眼看", "就要", "来了"],
      ["眼", "看", "就", "要", "来", "了"],
      ["眼看", "就要", "来"],
      ["眼", "看", "就"]
    ]
    candidates = segmentations.map do |texts|
      RMMSeg::Chunk.new(gen_words(texts))
    end

    survivors = RMMSeg::MMRule.new.filter(candidates)

    survivors.length.should == 2
  end
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
require File.join(File.dirname(__FILE__), 'spec_helper')
|
3
|
+
|
4
|
+
# Checks RMMSeg::SimpleAlgorithm both through #segment (segment count and
# first segment) and through #next_token (token text and byte offsets).
describe "simple algorithm" do
  it "should handle simple cases" do
    sentence = "我们都喜欢用 Ruby"
    segments = RMMSeg::SimpleAlgorithm.new(sentence).segment
    segments.length.should == 5
    segments[0].should == "我们"
  end

  it "shouldn't be able to handle some case" do
    # Pins a known mis-segmentation: the first segment comes out as the
    # three-character word rather than "研究".
    sentence = "研究生命起源"
    segments = RMMSeg::SimpleAlgorithm.new(sentence).segment
    segments.length.should == 3
    head = segments[0]
    head.should_not == "研究"
    head.should == "研究生"
  end

  it "should handle pure English as well" do
    sentence = "This is a paragraph of English."
    segments = RMMSeg::SimpleAlgorithm.new(sentence).segment
    segments.length.should == 6
    segments[0].should == "This"
  end

  it "should handle byte positions of English well" do
    sentence = "This is a paragraph of English."
    tokenizer = RMMSeg::SimpleAlgorithm.new(sentence)
    # Discard "This", "is" and "a"; the fourth token is the one under test.
    3.times do
      tokenizer.next_token
    end
    fourth = tokenizer.next_token
    fourth.text.should == "paragraph"
    # "This is a " occupies 10 bytes and "paragraph" is 9 bytes long.
    fourth.start_pos.should == 10
    fourth.end_pos.should == 19
  end

  it "should handle byte positions of Chinese well" do
    sentence = "这是一句中文"
    tokenizer = RMMSeg::SimpleAlgorithm.new(sentence)
    # Discard the first two tokens; the third is the one under test.
    2.times do
      tokenizer.next_token
    end
    third = tokenizer.next_token
    third.text.should == "中文"
    # Offsets are in bytes: the four preceding characters take 3 bytes each
    # in UTF-8, and "中文" takes 6.
    third.start_pos.should == 12
    third.end_pos.should == 18
  end
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
$: << File.join(File.dirname(__FILE__), "../lib")
|
2
|
+
require 'rmmseg'
|
3
|
+
|
4
|
+
require 'rubygems'
|
5
|
+
require 'spec'
|
6
|
+
|
7
|
+
# Builds an array of RMMSeg::Word objects for use as spec fixtures.
#
# words - Array of word texts.
# freqs - optional Array of frequencies, paired with +words+ by position.
#         An entry may be nil. When +freqs+ is shorter than +words+,
#         Array#zip pads the missing frequencies with nil.
#
# Without +freqs+ each word is created from its text alone, so type and
# frequency are whatever RMMSeg::Word.new defaults to. With +freqs+ each word
# is created with the :cjk_word type and its paired frequency.
#
# Returns a new Array of RMMSeg::Word in the same order as +words+.
def gen_words(words, freqs = nil)
  return words.map { |text| RMMSeg::Word.new(text) } if freqs.nil?

  cjk_type = RMMSeg::Word::TYPES[:cjk_word]
  words.zip(freqs).map do |text, freq|
    RMMSeg::Word.new(text, cjk_type, freq)
  end
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
require File.join(File.dirname(__FILE__), 'spec_helper')
|
3
|
+
|
4
|
+
# Feeds RMMSeg::SVWLRule#filter two segmentations of the same six characters
# (word lengths 2/2/2 versus 3/1/2) and expects only the evenly split one,
# starting with "研究", to remain.
describe "smallest variance of word length rule" do
  it "should return chunks with the smallest word length variance" do
    segmentations = [
      ["研究", "生命", "起源"],
      ["研究生", "命", "起源"]
    ]
    candidates = segmentations.map do |texts|
      RMMSeg::Chunk.new(gen_words(texts))
    end

    survivors = RMMSeg::SVWLRule.new.filter(candidates)

    survivors.length.should == 1
    survivors[0].words[0].text.should == "研究"
  end
end
|
data/spec/word_spec.rb
ADDED
metadata
ADDED
@@ -0,0 +1,101 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: rmmseg
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- pluskid
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2008-02-01 00:00:00 +08:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: hoe
|
17
|
+
version_requirement:
|
18
|
+
version_requirements: !ruby/object:Gem::Requirement
|
19
|
+
requirements:
|
20
|
+
- - ">="
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: 1.5.0
|
23
|
+
version:
|
24
|
+
description: "RMMSeg is an implementation of MMSEG Chinese word segmentation algorithm. It is based on two variants of maximum matching algorithms. Two algorithms are available for using: * simple algorithm that uses only forward maximum matching. * complex algorithm that uses three-word chunk maximum matching and 3 additional rules to solve ambiguities. For more information about the algorithm, please refer to the following essays: * http://technology.chtsai.org/mmseg/ * http://pluskid.lifegoo.com/?p=261"
|
25
|
+
email: pluskid@gmail.com
|
26
|
+
executables:
|
27
|
+
- rmmseg
|
28
|
+
extensions: []
|
29
|
+
|
30
|
+
extra_rdoc_files:
|
31
|
+
- History.txt
|
32
|
+
- Manifest.txt
|
33
|
+
- README.txt
|
34
|
+
- TODO.txt
|
35
|
+
files:
|
36
|
+
- History.txt
|
37
|
+
- Manifest.txt
|
38
|
+
- README.txt
|
39
|
+
- Rakefile
|
40
|
+
- TODO.txt
|
41
|
+
- bin/rmmseg
|
42
|
+
- lib/rmmseg.rb
|
43
|
+
- lib/rmmseg/algorithm.rb
|
44
|
+
- lib/rmmseg/amibguity.rb
|
45
|
+
- lib/rmmseg/chars.dic
|
46
|
+
- lib/rmmseg/chunk.rb
|
47
|
+
- lib/rmmseg/complex_algorithm.rb
|
48
|
+
- lib/rmmseg/config.rb
|
49
|
+
- lib/rmmseg/dictionary.rb
|
50
|
+
- lib/rmmseg/ferret.rb
|
51
|
+
- lib/rmmseg/lawl_rule.rb
|
52
|
+
- lib/rmmseg/lsdmfocw_rule.rb
|
53
|
+
- lib/rmmseg/mm_rule.rb
|
54
|
+
- lib/rmmseg/rule_helper.rb
|
55
|
+
- lib/rmmseg/simple_algorithm.rb
|
56
|
+
- lib/rmmseg/svwl_rule.rb
|
57
|
+
- lib/rmmseg/token.rb
|
58
|
+
- lib/rmmseg/word.rb
|
59
|
+
- lib/rmmseg/words.dic
|
60
|
+
- misc/homepage.erb
|
61
|
+
- misc/homepage.html
|
62
|
+
- spec/chunk_spec.rb
|
63
|
+
- spec/complex_algorithm_spec.rb
|
64
|
+
- spec/config_spec.rb
|
65
|
+
- spec/dictionary_spec.rb
|
66
|
+
- spec/lawl_rule_spec.rb
|
67
|
+
- spec/lsdmfocw_rule_spec.rb
|
68
|
+
- spec/mm_rule_spec.rb
|
69
|
+
- spec/simple_algorithm_spec.rb
|
70
|
+
- spec/spec_helper.rb
|
71
|
+
- spec/svwl_rule_spec.rb
|
72
|
+
- spec/word_spec.rb
|
73
|
+
has_rdoc: true
|
74
|
+
homepage: http://rmmseg.rubyforge.org
|
75
|
+
post_install_message:
|
76
|
+
rdoc_options:
|
77
|
+
- --main
|
78
|
+
- README.txt
|
79
|
+
require_paths:
|
80
|
+
- lib
|
81
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
82
|
+
requirements:
|
83
|
+
- - ">="
|
84
|
+
- !ruby/object:Gem::Version
|
85
|
+
version: "0"
|
86
|
+
version:
|
87
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
88
|
+
requirements:
|
89
|
+
- - ">="
|
90
|
+
- !ruby/object:Gem::Version
|
91
|
+
version: "0"
|
92
|
+
version:
|
93
|
+
requirements: []
|
94
|
+
|
95
|
+
rubyforge_project: rmmseg
|
96
|
+
rubygems_version: 1.0.1
|
97
|
+
signing_key:
|
98
|
+
specification_version: 2
|
99
|
+
summary: RMMSeg is an implementation of MMSEG algorithm in Ruby. MMSEG is a Chinese segmentation algorithm based on two variants of maximum matching. RMMSeg can be used as a stand alone program or as an Analyzer of Ferret.
|
100
|
+
test_files: []
|
101
|
+
|