rmmseg 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +6 -0
- data/Manifest.txt +37 -0
- data/README.txt +63 -0
- data/Rakefile +33 -0
- data/TODO.txt +3 -0
- data/bin/rmmseg +63 -0
- data/lib/rmmseg/algorithm.rb +157 -0
- data/lib/rmmseg/amibguity.rb +4 -0
- data/lib/rmmseg/chars.dic +12638 -0
- data/lib/rmmseg/chunk.rb +51 -0
- data/lib/rmmseg/complex_algorithm.rb +52 -0
- data/lib/rmmseg/config.rb +59 -0
- data/lib/rmmseg/dictionary.rb +66 -0
- data/lib/rmmseg/ferret.rb +43 -0
- data/lib/rmmseg/lawl_rule.rb +14 -0
- data/lib/rmmseg/lsdmfocw_rule.rb +15 -0
- data/lib/rmmseg/mm_rule.rb +15 -0
- data/lib/rmmseg/rule_helper.rb +22 -0
- data/lib/rmmseg/simple_algorithm.rb +22 -0
- data/lib/rmmseg/svwl_rule.rb +14 -0
- data/lib/rmmseg/token.rb +22 -0
- data/lib/rmmseg/word.rb +37 -0
- data/lib/rmmseg/words.dic +120330 -0
- data/lib/rmmseg.rb +15 -0
- data/misc/homepage.erb +93 -0
- data/misc/homepage.html +1063 -0
- data/spec/chunk_spec.rb +26 -0
- data/spec/complex_algorithm_spec.rb +18 -0
- data/spec/config_spec.rb +12 -0
- data/spec/dictionary_spec.rb +20 -0
- data/spec/lawl_rule_spec.rb +15 -0
- data/spec/lsdmfocw_rule_spec.rb +14 -0
- data/spec/mm_rule_spec.rb +15 -0
- data/spec/simple_algorithm_spec.rb +46 -0
- data/spec/spec_helper.rb +15 -0
- data/spec/svwl_rule_spec.rb +14 -0
- data/spec/word_spec.rb +9 -0
- metadata +101 -0
data/History.txt
ADDED
data/Manifest.txt
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
History.txt
|
2
|
+
Manifest.txt
|
3
|
+
README.txt
|
4
|
+
Rakefile
|
5
|
+
TODO.txt
|
6
|
+
bin/rmmseg
|
7
|
+
lib/rmmseg.rb
|
8
|
+
lib/rmmseg/algorithm.rb
|
9
|
+
lib/rmmseg/amibguity.rb
|
10
|
+
lib/rmmseg/chars.dic
|
11
|
+
lib/rmmseg/chunk.rb
|
12
|
+
lib/rmmseg/complex_algorithm.rb
|
13
|
+
lib/rmmseg/config.rb
|
14
|
+
lib/rmmseg/dictionary.rb
|
15
|
+
lib/rmmseg/ferret.rb
|
16
|
+
lib/rmmseg/lawl_rule.rb
|
17
|
+
lib/rmmseg/lsdmfocw_rule.rb
|
18
|
+
lib/rmmseg/mm_rule.rb
|
19
|
+
lib/rmmseg/rule_helper.rb
|
20
|
+
lib/rmmseg/simple_algorithm.rb
|
21
|
+
lib/rmmseg/svwl_rule.rb
|
22
|
+
lib/rmmseg/token.rb
|
23
|
+
lib/rmmseg/word.rb
|
24
|
+
lib/rmmseg/words.dic
|
25
|
+
misc/homepage.erb
|
26
|
+
misc/homepage.html
|
27
|
+
spec/chunk_spec.rb
|
28
|
+
spec/complex_algorithm_spec.rb
|
29
|
+
spec/config_spec.rb
|
30
|
+
spec/dictionary_spec.rb
|
31
|
+
spec/lawl_rule_spec.rb
|
32
|
+
spec/lsdmfocw_rule_spec.rb
|
33
|
+
spec/mm_rule_spec.rb
|
34
|
+
spec/simple_algorithm_spec.rb
|
35
|
+
spec/spec_helper.rb
|
36
|
+
spec/svwl_rule_spec.rb
|
37
|
+
spec/word_spec.rb
|
data/README.txt
ADDED
@@ -0,0 +1,63 @@
|
|
1
|
+
= rmmseg
|
2
|
+
|
3
|
+
* http://rmmseg.rubyforge.org
|
4
|
+
* mailto:pluskid@gmail.com
|
5
|
+
|
6
|
+
== DESCRIPTION:
|
7
|
+
|
8
|
+
RMMSeg is an implementation of MMSEG Chinese word segmentation
|
9
|
+
algorithm. It is based on two variants of maximum matching
|
10
|
+
algorithms. Two algorithms are available:
|
11
|
+
|
12
|
+
* simple algorithm that uses only forward maximum matching.
|
13
|
+
* complex algorithm that uses three-word chunk maximum matching and 3
|
14
|
+
additional rules to resolve ambiguities.
|
15
|
+
|
16
|
+
For more information about the algorithm, please refer to the
|
17
|
+
following essays:
|
18
|
+
|
19
|
+
* http://technology.chtsai.org/mmseg/
|
20
|
+
* http://pluskid.lifegoo.com/?p=261
|
21
|
+
|
22
|
+
== FEATURES/PROBLEMS:
|
23
|
+
|
24
|
+
* Provides the +rmmseg+ command line tool as a quick and easy way to access
|
25
|
+
the word segmentation feature.
|
26
|
+
* Provides an +Analyser+ for integrating with Ferret[http://ferret.davebalmain.com/trac].
|
27
|
+
|
28
|
+
== SYNOPSIS:
|
29
|
+
|
30
|
+
$ rmmseg --separator _ < input.txt
|
31
|
+
|
32
|
+
== REQUIREMENTS:
|
33
|
+
|
34
|
+
* ruby
|
35
|
+
|
36
|
+
== INSTALL:
|
37
|
+
|
38
|
+
* sudo gem install rmmseg
|
39
|
+
|
40
|
+
== LICENSE:
|
41
|
+
|
42
|
+
(The MIT License)
|
43
|
+
|
44
|
+
Copyright (c) 2008 pluskid
|
45
|
+
|
46
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
47
|
+
a copy of this software and associated documentation files (the
|
48
|
+
'Software'), to deal in the Software without restriction, including
|
49
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
50
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
51
|
+
permit persons to whom the Software is furnished to do so, subject to
|
52
|
+
the following conditions:
|
53
|
+
|
54
|
+
The above copyright notice and this permission notice shall be
|
55
|
+
included in all copies or substantial portions of the Software.
|
56
|
+
|
57
|
+
THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
|
58
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
59
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
60
|
+
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
61
|
+
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
62
|
+
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
63
|
+
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/Rakefile
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
# -*- ruby -*-

# Make the in-tree library loadable so RMMSeg::VERSION can be read below.
$: << File.join(File.dirname(__FILE__), "lib")

require 'rubygems'
require 'hoe'
require 'rmmseg'

# Gem packaging and release tasks are generated by Hoe.
Hoe.new('rmmseg', RMMSeg::VERSION) do |p|
  p.rubyforge_name = 'rmmseg'
  p.author = 'pluskid'
  p.email = 'pluskid@gmail.com'
  # The manifest ships no spec/spec.rb; the specs are the individual
  # spec/*_spec.rb files, so glob for those instead.
  p.test_globs = ["spec/*_spec.rb"]
  p.rdoc_pattern = /^lib\/.*\.rb$|\.txt$/
  p.summary = <<-END
    RMMSeg is an implementation of MMSEG algorithm in Ruby. MMSEG is a
    Chinese segmentation algorithm based on two variants of maximum
    matching.

    RMMSeg can be used as a stand alone program or as an Analyzer of
    Ferret.
  END
end

desc "Render misc/homepage.erb into misc/homepage.html with gerbil"
task :homepage do
  sh "gerbil html misc/homepage.erb > misc/homepage.html"
end

desc "Upload misc/homepage.html to the RubyForge project site"
task :publish_homepage do
  sh "scp misc/homepage.html rubyforge.org:/var/www/gforge-projects/rmmseg/index.html"
end

# vim: syntax=Ruby
|
data/TODO.txt
ADDED
data/bin/rmmseg
ADDED
@@ -0,0 +1,63 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Command line front end: reads text from stdin, segments it and prints
# the words to stdout joined by the chosen separator.

$: << File.join(File.dirname(__FILE__), "..", "lib")

require 'rmmseg'
include RMMSeg

require 'getoptlong'

# Print the command line help to stdout and terminate with status 0.
def print_usage
  puts <<EOF
#{__FILE__}  Segment Chinese text. Read from stdin and print to stdout.

Options:
  -h
  --help       Print this message

  -a
  --algorithm  Select segment algorithm. Valid values are 'complex' and
               'simple'. 'simple' is the default one.

  -A
  --ambiguity  Select a behavior when an ambiguity occurs. Valid values
               are 'raise_exception' and 'select_first'. 'select_first'
               is the default one.

  -s
  --separator  Select the string printed between the segmented words.
               A single space is the default one.
EOF
  exit 0
end

separator = " "

optparser = GetoptLong.new
optparser.set_options(["-a", "--algorithm", GetoptLong::REQUIRED_ARGUMENT],
                      ["-A", "--ambiguity", GetoptLong::REQUIRED_ARGUMENT],
                      ["-s", "--separator", GetoptLong::REQUIRED_ARGUMENT],
                      ["-h", "--help", GetoptLong::NO_ARGUMENT])

begin
  # GetoptLong reports each option under its first listed name, i.e. the
  # short form.
  optparser.each do |opt, arg|
    case opt
    when "-h"
      print_usage

    when "-a"
      Config.algorithm = arg.to_sym

    when "-A"
      Config.on_ambiguity = arg.to_sym

    when "-s"
      separator = arg
    end
  end
rescue => err
  # stdout carries the segmented text, so diagnostics go to stderr.
  $stderr.puts err
  exit 1
end

puts segment(STDIN.read).join(separator)
|
@@ -0,0 +1,157 @@
|
|
1
|
+
require 'jcode'
|
2
|
+
require 'rmmseg/dictionary'
|
3
|
+
require 'rmmseg/word'
|
4
|
+
require 'rmmseg/chunk'
|
5
|
+
require 'rmmseg/token'
|
6
|
+
|
7
|
+
module RMMSeg
  # An algorithm can segment a piece of text into an array of
  # words. This module holds the common operations shared by
  # SimpleAlgorithm and ComplexAlgorithm .
  #
  # Besides the methods defined here, the module calls +create_chunks+
  # and reads <tt>@rules</tt> (objects responding to +filter+); neither
  # is defined in this module, so the including class has to supply
  # them.
  module Algorithm
    # Initialize a new instance of Algorithm, the +text+ will
    # then be segmented by this instance.
    def initialize(text)
      # The rest of the module indexes into @chars and asks for its
      # length, so it must be an Array. A blockless +each_char+ only
      # returns an Array under Ruby 1.8's jcode; later Rubies return an
      # Enumerator, hence the explicit +to_a+.
      @chars = text.each_char.to_a
      @index = 0        # position in @chars of the next unread character
      @byte_index = 0   # byte offset in the text of that character
    end

    # Get the next Token recognized, or +nil+ once the whole text has
    # been consumed. Tokens with an empty text (produced by a run of
    # basic latin characters that contains no word character) are
    # skipped.
    def next_token
      while @index < @chars.length
        token = if basic_latin?(@chars[@index])
                  get_basic_latin_word
                else
                  get_cjk_word(create_chunks)
                end
        return token unless token.text.empty?
      end

      nil
    end

    # Segment the text given to +initialize+ into an array of words
    # (the text of each remaining Token, in order).
    def segment
      words = []
      while (token = next_token)
        words << token.text
      end
      words
    end

    # Skip whitespaces and punctuation to extract a basic latin
    # word. The non-word characters following the word are consumed
    # as well, but are not part of the returned Token's text or
    # positions.
    def get_basic_latin_word
      first = skip_latin_nonword_chars(@index)

      last = first
      while last < @chars.length &&
            basic_latin?(@chars[last]) &&
            !nonword_char?(@chars[last])
        last += 1
      end
      word = @chars[first...last].join

      after = skip_latin_nonword_chars(last)

      # Every character walked over here is a single byte, so character
      # distances can be added to the byte offset directly.
      start_pos = @byte_index + first - @index
      end_pos = @byte_index + last - @index

      @byte_index += after - @index
      @index = after

      Token.new(word, start_pos, end_pos)
    end

    # Use rules to filter the +chunks+ to get the most
    # apropos CJK word. The rules in <tt>@rules</tt> are applied in
    # order until fewer than two chunks remain. If several chunks are
    # still left, Ambiguity is raised when +Config.on_ambiguity+ is
    # <tt>:raise_exception</tt>; otherwise the first chunk wins.
    #
    # Returns a Token for the first word of the selected chunk and
    # advances the current position past that word.
    def get_cjk_word(chunks)
      @rules.each do |rule|
        break if chunks.length < 2
        chunks = rule.filter(chunks)
      end

      if chunks.length > 1 && Config.on_ambiguity == :raise_exception
        raise Ambiguity, "Can't solve ambiguity on #{chunks}"
      end

      word = chunks[0].words[0]
      token = Token.new(word.text, @byte_index, @byte_index+word.byte_size)

      @index += word.length
      @byte_index += word.byte_size

      token
    end

    # Find all words occurring in the dictionary starting from
    # +index+ . The maximum word length is determined by
    # +Config.max_word_length+ .
    #
    # The scan stops at the end of +chars+ or at the first basic latin
    # character. When nothing matches, the single character at +index+
    # is returned as an unrecognized Word.
    def find_match_words(chars, index)
      dic = Dictionary.instance
      str = String.new
      words = []
      i = index

      while i < chars.length && !basic_latin?(chars[i])
        str << chars[i]
        words << dic.get_word(str) if dic.has_word?(str)
        i += 1
        # i - index is the number of characters collected in str so
        # far; no need to build a Word just to count them.
        break if i - index >= Config.max_word_length
      end

      if words.empty?
        words << Word.new(chars[index], Word::TYPES[:unrecognized])
      end

      words
    end

    # Determine whether a character is a basic latin character.
    #--
    # TODO: Implement this method in a more correct way.
    # currently I use number of bytes in this char to determine this.
    # If it is a one-byte char, I consider it a basic latin.
    #++
    def basic_latin?(char)
      # String#size counts bytes only on Ruby 1.8; prefer the explicit
      # byte count where the String provides one.
      bytes = char.respond_to?(:bytesize) ? char.bytesize : char.size
      bytes == 1
    end

    # Determine whether a character can NOT be part of a basic latin
    # word, i.e. whether it is a single non-word (\W) character.
    # Returns a truthy value (the match position) or +nil+.
    def nonword_char?(char)
      /^\W$/ =~ char
    end

    private

    # Starting at position +i+ of @chars, walk over consecutive basic
    # latin non-word characters and return the first position after
    # them.
    def skip_latin_nonword_chars(i)
      i += 1 while i < @chars.length &&
                   basic_latin?(@chars[i]) &&
                   nonword_char?(@chars[i])
      i
    end
  end
end
|