rmmseg 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +6 -0
- data/Manifest.txt +37 -0
- data/README.txt +63 -0
- data/Rakefile +33 -0
- data/TODO.txt +3 -0
- data/bin/rmmseg +63 -0
- data/lib/rmmseg/algorithm.rb +157 -0
- data/lib/rmmseg/amibguity.rb +4 -0
- data/lib/rmmseg/chars.dic +12638 -0
- data/lib/rmmseg/chunk.rb +51 -0
- data/lib/rmmseg/complex_algorithm.rb +52 -0
- data/lib/rmmseg/config.rb +59 -0
- data/lib/rmmseg/dictionary.rb +66 -0
- data/lib/rmmseg/ferret.rb +43 -0
- data/lib/rmmseg/lawl_rule.rb +14 -0
- data/lib/rmmseg/lsdmfocw_rule.rb +15 -0
- data/lib/rmmseg/mm_rule.rb +15 -0
- data/lib/rmmseg/rule_helper.rb +22 -0
- data/lib/rmmseg/simple_algorithm.rb +22 -0
- data/lib/rmmseg/svwl_rule.rb +14 -0
- data/lib/rmmseg/token.rb +22 -0
- data/lib/rmmseg/word.rb +37 -0
- data/lib/rmmseg/words.dic +120330 -0
- data/lib/rmmseg.rb +15 -0
- data/misc/homepage.erb +93 -0
- data/misc/homepage.html +1063 -0
- data/spec/chunk_spec.rb +26 -0
- data/spec/complex_algorithm_spec.rb +18 -0
- data/spec/config_spec.rb +12 -0
- data/spec/dictionary_spec.rb +20 -0
- data/spec/lawl_rule_spec.rb +15 -0
- data/spec/lsdmfocw_rule_spec.rb +14 -0
- data/spec/mm_rule_spec.rb +15 -0
- data/spec/simple_algorithm_spec.rb +46 -0
- data/spec/spec_helper.rb +15 -0
- data/spec/svwl_rule_spec.rb +14 -0
- data/spec/word_spec.rb +9 -0
- metadata +101 -0
data/spec/chunk_spec.rb
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
require File.join(File.dirname(__FILE__), 'spec_helper')
|
3
|
+
|
4
|
+
describe 'chunk' do
|
5
|
+
before(:all) do
|
6
|
+
@words = gen_words(['中文', '中国字', '我', '中华人民共和国'],
|
7
|
+
[10, 7, 100, 8])
|
8
|
+
@chunk = RMMSeg::Chunk.new(@words)
|
9
|
+
end
|
10
|
+
|
11
|
+
it "should return proper total length" do
|
12
|
+
@chunk.total_length.should == 13
|
13
|
+
end
|
14
|
+
|
15
|
+
it "should return proper average length" do
|
16
|
+
@chunk.average_length.should == 13.0/4
|
17
|
+
end
|
18
|
+
|
19
|
+
it "should return proper variance" do
|
20
|
+
@chunk.variance.to_i.should == 4
|
21
|
+
end
|
22
|
+
|
23
|
+
it "should return proper degree of morphemic freedom" do
|
24
|
+
@chunk.degree_of_morphemic_freedom.should == 100
|
25
|
+
end
|
26
|
+
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
require File.join(File.dirname(__FILE__), 'spec_helper')
|
3
|
+
|
4
|
+
describe "complex algorithm" do
|
5
|
+
it "should behave well as svwl rule" do
|
6
|
+
text = "研究生命科学"
|
7
|
+
segs = RMMSeg::ComplexAlgorithm.new(text).segment
|
8
|
+
segs.length.should == 3
|
9
|
+
segs[0].should == "研究"
|
10
|
+
end
|
11
|
+
|
12
|
+
it "should segment a relative big chunk of Chinese" do
|
13
|
+
text = "主持人把一只割去头的羊放在指定处。枪响后,甲乙两队共同向羊飞驰而去,先抢到羊的同队队员互相掩护,极力向终点奔驰,双方骑手们施展各种技巧,围追堵截,拼命抢夺。叼着羊先到达终点的为胜方。获胜者按照当地的习俗,将羊当场烤熟,请众骑手共享,称为“幸福肉”。"
|
14
|
+
segs = RMMSeg::ComplexAlgorithm.new(text).segment
|
15
|
+
segs.length.should == 87
|
16
|
+
segs[0].should == "主持人"
|
17
|
+
end
|
18
|
+
end
|
data/spec/config_spec.rb
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), 'spec_helper')
|
2
|
+
|
3
|
+
describe "RMMSeg Config" do
|
4
|
+
it "should be able to store and retrive config values" do
|
5
|
+
RMMSeg::Config.algorithm = :simple
|
6
|
+
RMMSeg::Config.algorithm.should == :simple
|
7
|
+
end
|
8
|
+
|
9
|
+
it "should reject invalid algorithm" do
|
10
|
+
lambda { RMMSeg::Config.algorithm = :foobar }.should raise_error(ArgumentError)
|
11
|
+
end
|
12
|
+
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), 'spec_helper')
|
2
|
+
|
3
|
+
describe "dictionary" do
|
4
|
+
|
5
|
+
before(:all) do
|
6
|
+
@dic = RMMSeg::Dictionary.instance
|
7
|
+
end
|
8
|
+
|
9
|
+
it "should contain frequency information for chars" do
|
10
|
+
@dic.get_word("你").frequency.should == 915385
|
11
|
+
end
|
12
|
+
|
13
|
+
it "should handle words" do
|
14
|
+
@dic.has_word?("你们").should == true
|
15
|
+
end
|
16
|
+
|
17
|
+
it "should ignore words which exceed the maximum length" do
|
18
|
+
@dic.has_word?("这是一个超出长度的词组").should == false
|
19
|
+
end
|
20
|
+
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
require File.join(File.dirname(__FILE__), 'spec_helper')
|
3
|
+
|
4
|
+
describe "largest average word length rule" do
|
5
|
+
it "should return chunks with the maximum average word length" do
|
6
|
+
chunks = [
|
7
|
+
RMMSeg::Chunk.new(gen_words(["国际化"])),
|
8
|
+
RMMSeg::Chunk.new(gen_words(["国际", "化"])),
|
9
|
+
RMMSeg::Chunk.new(gen_words(["国", "际", "化"]))
|
10
|
+
]
|
11
|
+
chunks = RMMSeg::LAWLRule.new.filter(chunks)
|
12
|
+
chunks.length.should == 1
|
13
|
+
chunks[0].words[0].text.should == "国际化"
|
14
|
+
end
|
15
|
+
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
require File.join(File.dirname(__FILE__), 'spec_helper')
|
3
|
+
|
4
|
+
describe "largest sum of degree of morphemic freedom of one-character words rule" do
|
5
|
+
it "should return chunks of the largest sum of degree of morphemic freedom of one-character words" do
|
6
|
+
chunks = [
|
7
|
+
RMMSeg::Chunk.new(gen_words(["主要", "是", "因为"], [nil, 100, nil])),
|
8
|
+
RMMSeg::Chunk.new(gen_words(["主", "要是", "因为"], [10, nil, nil]))
|
9
|
+
]
|
10
|
+
chunks = RMMSeg::LSDMFOCWRule.new.filter(chunks)
|
11
|
+
chunks.length.should == 1
|
12
|
+
chunks[0].words[0].text.should == "主要"
|
13
|
+
end
|
14
|
+
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
require File.join(File.dirname(__FILE__), 'spec_helper')
|
3
|
+
|
4
|
+
describe 'maximum matching rule' do
|
5
|
+
it "should select chunks with the maximun total length" do
|
6
|
+
chunks = [
|
7
|
+
RMMSeg::Chunk.new(gen_words(["眼看", "就要", "来了"])),
|
8
|
+
RMMSeg::Chunk.new(gen_words(["眼", "看", "就", "要", "来", "了"])),
|
9
|
+
RMMSeg::Chunk.new(gen_words(["眼看", "就要", "来"])),
|
10
|
+
RMMSeg::Chunk.new(gen_words(["眼", "看", "就"]))
|
11
|
+
]
|
12
|
+
chunks = RMMSeg::MMRule.new.filter(chunks)
|
13
|
+
chunks.length.should == 2
|
14
|
+
end
|
15
|
+
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
require File.join(File.dirname(__FILE__), 'spec_helper')
|
3
|
+
|
4
|
+
describe "simple algorithm" do
|
5
|
+
it "should handle simple cases" do
|
6
|
+
text = "我们都喜欢用 Ruby"
|
7
|
+
segs = RMMSeg::SimpleAlgorithm.new(text).segment
|
8
|
+
segs.length.should == 5
|
9
|
+
segs[0].should == "我们"
|
10
|
+
end
|
11
|
+
|
12
|
+
it "shouldn't be able to handle some case" do
|
13
|
+
text = "研究生命起源"
|
14
|
+
segs = RMMSeg::SimpleAlgorithm.new(text).segment
|
15
|
+
segs.length.should == 3
|
16
|
+
segs[0].should_not == "研究"
|
17
|
+
segs[0].should == "研究生"
|
18
|
+
end
|
19
|
+
|
20
|
+
it "should handle pure English as well" do
|
21
|
+
text = "This is a paragraph of English."
|
22
|
+
segs = RMMSeg::SimpleAlgorithm.new(text).segment
|
23
|
+
segs.length.should == 6
|
24
|
+
segs[0].should == "This"
|
25
|
+
end
|
26
|
+
|
27
|
+
it "should handle byte positions of English well" do
|
28
|
+
text = "This is a paragraph of English."
|
29
|
+
algor = RMMSeg::SimpleAlgorithm.new(text)
|
30
|
+
3.times { algor.next_token }
|
31
|
+
token = algor.next_token
|
32
|
+
token.text.should == "paragraph"
|
33
|
+
token.start_pos.should == 10
|
34
|
+
token.end_pos.should == 19
|
35
|
+
end
|
36
|
+
|
37
|
+
it "should handle byte positions of Chinese well" do
|
38
|
+
text = "这是一句中文"
|
39
|
+
algor = RMMSeg::SimpleAlgorithm.new(text)
|
40
|
+
2.times { algor.next_token }
|
41
|
+
token = algor.next_token
|
42
|
+
token.text.should == "中文"
|
43
|
+
token.start_pos.should == 12
|
44
|
+
token.end_pos.should == 18
|
45
|
+
end
|
46
|
+
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
$: << File.join(File.dirname(__FILE__), "../lib")
|
2
|
+
require 'rmmseg'
|
3
|
+
|
4
|
+
require 'rubygems'
|
5
|
+
require 'spec'
|
6
|
+
|
7
|
+
def gen_words words, freqs=nil
|
8
|
+
if freqs.nil?
|
9
|
+
words.map { |word| RMMSeg::Word.new(word) }
|
10
|
+
else
|
11
|
+
words.zip(freqs).map { |word, freq|
|
12
|
+
RMMSeg::Word.new(word, RMMSeg::Word::TYPES[:cjk_word], freq)
|
13
|
+
}
|
14
|
+
end
|
15
|
+
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
require File.join(File.dirname(__FILE__), 'spec_helper')
|
3
|
+
|
4
|
+
describe "smallest variance of word length rule" do
|
5
|
+
it "should return chunks with the smallest word length variance" do
|
6
|
+
chunks = [
|
7
|
+
RMMSeg::Chunk.new(gen_words(["研究", "生命", "起源"])),
|
8
|
+
RMMSeg::Chunk.new(gen_words(["研究生", "命", "起源"]))
|
9
|
+
]
|
10
|
+
chunks = RMMSeg::SVWLRule.new.filter(chunks)
|
11
|
+
chunks.length.should == 1
|
12
|
+
chunks[0].words[0].text.should == "研究"
|
13
|
+
end
|
14
|
+
end
|
data/spec/word_spec.rb
ADDED
metadata
ADDED
@@ -0,0 +1,101 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: rmmseg
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- pluskid
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2008-02-01 00:00:00 +08:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: hoe
|
17
|
+
version_requirement:
|
18
|
+
version_requirements: !ruby/object:Gem::Requirement
|
19
|
+
requirements:
|
20
|
+
- - ">="
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: 1.5.0
|
23
|
+
version:
|
24
|
+
description: "RMMSeg is an implementation of MMSEG Chinese word segmentation algorithm. It is based on two variants of maximum matching algorithms. Two algorithms are available for use: * simple algorithm that uses only forward maximum matching. * complex algorithm that uses three-word chunk maximum matching and 3 additional rules to solve ambiguities. For more information about the algorithm, please refer to the following essays: * http://technology.chtsai.org/mmseg/ * http://pluskid.lifegoo.com/?p=261"
|
25
|
+
email: pluskid@gmail.com
|
26
|
+
executables:
|
27
|
+
- rmmseg
|
28
|
+
extensions: []
|
29
|
+
|
30
|
+
extra_rdoc_files:
|
31
|
+
- History.txt
|
32
|
+
- Manifest.txt
|
33
|
+
- README.txt
|
34
|
+
- TODO.txt
|
35
|
+
files:
|
36
|
+
- History.txt
|
37
|
+
- Manifest.txt
|
38
|
+
- README.txt
|
39
|
+
- Rakefile
|
40
|
+
- TODO.txt
|
41
|
+
- bin/rmmseg
|
42
|
+
- lib/rmmseg.rb
|
43
|
+
- lib/rmmseg/algorithm.rb
|
44
|
+
- lib/rmmseg/amibguity.rb
|
45
|
+
- lib/rmmseg/chars.dic
|
46
|
+
- lib/rmmseg/chunk.rb
|
47
|
+
- lib/rmmseg/complex_algorithm.rb
|
48
|
+
- lib/rmmseg/config.rb
|
49
|
+
- lib/rmmseg/dictionary.rb
|
50
|
+
- lib/rmmseg/ferret.rb
|
51
|
+
- lib/rmmseg/lawl_rule.rb
|
52
|
+
- lib/rmmseg/lsdmfocw_rule.rb
|
53
|
+
- lib/rmmseg/mm_rule.rb
|
54
|
+
- lib/rmmseg/rule_helper.rb
|
55
|
+
- lib/rmmseg/simple_algorithm.rb
|
56
|
+
- lib/rmmseg/svwl_rule.rb
|
57
|
+
- lib/rmmseg/token.rb
|
58
|
+
- lib/rmmseg/word.rb
|
59
|
+
- lib/rmmseg/words.dic
|
60
|
+
- misc/homepage.erb
|
61
|
+
- misc/homepage.html
|
62
|
+
- spec/chunk_spec.rb
|
63
|
+
- spec/complex_algorithm_spec.rb
|
64
|
+
- spec/config_spec.rb
|
65
|
+
- spec/dictionary_spec.rb
|
66
|
+
- spec/lawl_rule_spec.rb
|
67
|
+
- spec/lsdmfocw_rule_spec.rb
|
68
|
+
- spec/mm_rule_spec.rb
|
69
|
+
- spec/simple_algorithm_spec.rb
|
70
|
+
- spec/spec_helper.rb
|
71
|
+
- spec/svwl_rule_spec.rb
|
72
|
+
- spec/word_spec.rb
|
73
|
+
has_rdoc: true
|
74
|
+
homepage: http://rmmseg.rubyforge.org
|
75
|
+
post_install_message:
|
76
|
+
rdoc_options:
|
77
|
+
- --main
|
78
|
+
- README.txt
|
79
|
+
require_paths:
|
80
|
+
- lib
|
81
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
82
|
+
requirements:
|
83
|
+
- - ">="
|
84
|
+
- !ruby/object:Gem::Version
|
85
|
+
version: "0"
|
86
|
+
version:
|
87
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
88
|
+
requirements:
|
89
|
+
- - ">="
|
90
|
+
- !ruby/object:Gem::Version
|
91
|
+
version: "0"
|
92
|
+
version:
|
93
|
+
requirements: []
|
94
|
+
|
95
|
+
rubyforge_project: rmmseg
|
96
|
+
rubygems_version: 1.0.1
|
97
|
+
signing_key:
|
98
|
+
specification_version: 2
|
99
|
+
summary: RMMSeg is an implementation of MMSEG algorithm in Ruby. MMSEG is a Chinese segmentation algorithm based on two variants of maximum matching. RMMSeg can be used as a stand alone program or as an Analyzer of Ferret.
|
100
|
+
test_files: []
|
101
|
+
|