rmmseg 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/History.txt ADDED
@@ -0,0 +1,6 @@
1
+ === 0.0.1 / 2008-01-31
2
+
3
+ * Analyser integration with Ferret.
4
+ * rdoc added
5
+ * Lazily init the +Word+ objects inside the +Dictionary+.
6
+ * Handle English punctuation correctly.
data/Manifest.txt ADDED
@@ -0,0 +1,37 @@
1
+ History.txt
2
+ Manifest.txt
3
+ README.txt
4
+ Rakefile
5
+ TODO.txt
6
+ bin/rmmseg
7
+ lib/rmmseg.rb
8
+ lib/rmmseg/algorithm.rb
9
+ lib/rmmseg/amibguity.rb
10
+ lib/rmmseg/chars.dic
11
+ lib/rmmseg/chunk.rb
12
+ lib/rmmseg/complex_algorithm.rb
13
+ lib/rmmseg/config.rb
14
+ lib/rmmseg/dictionary.rb
15
+ lib/rmmseg/ferret.rb
16
+ lib/rmmseg/lawl_rule.rb
17
+ lib/rmmseg/lsdmfocw_rule.rb
18
+ lib/rmmseg/mm_rule.rb
19
+ lib/rmmseg/rule_helper.rb
20
+ lib/rmmseg/simple_algorithm.rb
21
+ lib/rmmseg/svwl_rule.rb
22
+ lib/rmmseg/token.rb
23
+ lib/rmmseg/word.rb
24
+ lib/rmmseg/words.dic
25
+ misc/homepage.erb
26
+ misc/homepage.html
27
+ spec/chunk_spec.rb
28
+ spec/complex_algorithm_spec.rb
29
+ spec/config_spec.rb
30
+ spec/dictionary_spec.rb
31
+ spec/lawl_rule_spec.rb
32
+ spec/lsdmfocw_rule_spec.rb
33
+ spec/mm_rule_spec.rb
34
+ spec/simple_algorithm_spec.rb
35
+ spec/spec_helper.rb
36
+ spec/svwl_rule_spec.rb
37
+ spec/word_spec.rb
data/README.txt ADDED
@@ -0,0 +1,63 @@
1
+ = rmmseg
2
+
3
+ * http://rmmseg.rubyforge.org
4
+ * mailto:pluskid@gmail.com
5
+
6
+ == DESCRIPTION:
7
+
8
+ RMMSeg is an implementation of the MMSEG Chinese word segmentation
9
+ algorithm. It is based on two variants of maximum matching
10
+ algorithms. Two algorithms are available:
11
+
12
+ * simple algorithm that uses only forward maximum matching.
13
+ * complex algorithm that uses three-word chunk maximum matching and 3
14
+ additional rules to solve ambiguities.
15
+
16
+ For more information about the algorithm, please refer to the
17
+ following essays:
18
+
19
+ * http://technology.chtsai.org/mmseg/
20
+ * http://pluskid.lifegoo.com/?p=261
21
+
22
+ == FEATURES/PROBLEMS:
23
+
24
+ * Provides the +rmmseg+ command line tool as a quick and easy way to access
25
+ the word segmentation feature.
26
+ * Provides an +Analyser+ for integrating with Ferret[http://ferret.davebalmain.com/trac].
27
+
28
+ == SYNOPSIS:
29
+
30
+ $ rmmseg --separator _ < input.txt
31
+
32
+ == REQUIREMENTS:
33
+
34
+ * ruby
35
+
36
+ == INSTALL:
37
+
38
+ * sudo gem install rmmseg
39
+
40
+ == LICENSE:
41
+
42
+ (The MIT License)
43
+
44
+ Copyright (c) 2008 pluskid
45
+
46
+ Permission is hereby granted, free of charge, to any person obtaining
47
+ a copy of this software and associated documentation files (the
48
+ 'Software'), to deal in the Software without restriction, including
49
+ without limitation the rights to use, copy, modify, merge, publish,
50
+ distribute, sublicense, and/or sell copies of the Software, and to
51
+ permit persons to whom the Software is furnished to do so, subject to
52
+ the following conditions:
53
+
54
+ The above copyright notice and this permission notice shall be
55
+ included in all copies or substantial portions of the Software.
56
+
57
+ THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
58
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
59
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
60
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
61
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
62
+ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
63
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/Rakefile ADDED
@@ -0,0 +1,33 @@
1
+ # -*- ruby -*-
2
+
3
+ $: << File.join(File.dirname(__FILE__), "lib")
4
+
5
+ require 'rubygems'
6
+ require 'hoe'
7
+ require 'rmmseg'
8
+
9
# Gem specification, handled by Hoe. The version number is read from
# the library itself (RMMSeg::VERSION).
Hoe.new('rmmseg', RMMSeg::VERSION) do |hoe|
  hoe.rubyforge_name = 'rmmseg'
  hoe.author         = 'pluskid'
  hoe.email          = 'pluskid@gmail.com'
  hoe.test_globs     = ["spec/spec.rb"]
  # rdoc is generated from the Ruby sources under lib/ and from the
  # top-level *.txt files.
  hoe.rdoc_pattern   = %r{^lib/.*\.rb$|\.txt$}
  hoe.summary        = <<-END
RMMSeg is an implementation of MMSEG algorithm in Ruby. MMSEG is a
Chinese segmentation algorithm based on two variants of maximum
matching.

RMMSeg can be used as a stand alone program or as an Analyzer of
Ferret.
  END
end

# Render the project homepage from its template with gerbil.
task(:homepage) { sh "gerbil html misc/homepage.erb > misc/homepage.html" }

# Upload the rendered homepage to the RubyForge project web space.
task(:publish_homepage) do
  sh "scp misc/homepage.html rubyforge.org:/var/www/gforge-projects/rmmseg/index.html"
end
32
+
33
+ # vim: syntax=Ruby
data/TODO.txt ADDED
@@ -0,0 +1,3 @@
1
+ === TODO
2
+
3
+ * Add a filter to remove Chinese punctuation.
data/bin/rmmseg ADDED
@@ -0,0 +1,63 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ $: << File.join(File.dirname(__FILE__), "..", "lib")
4
+
5
+ require 'rmmseg'
6
+ include RMMSeg
7
+
8
+ require 'getoptlong'
9
+
10
# Print the command line help text to stdout, then terminate the
# process with exit status 0.
#
# Every option accepted by the option parser is listed here,
# including -s/--separator.
def print_usage
  puts <<EOF
#{__FILE__}  Segment Chinese text. Read from stdin and print to stdout.

Options:
  -h
  --help       Print this message

  -a
  --algorithm  Select segment algorithm. Valid values are 'complex' and
               'simple'. 'simple' is the default one.

  -A
  --ambiguity  Select a behavior when an ambiguity occurs. Valid values
               are 'raise_exception' and 'select_first'. 'select_first'
               is the default one.

  -s
  --separator  Select the string printed between the segmented words.
               A single space is the default one.
EOF
  exit 0
end
29
+
30
# String printed between the segmented words; overridden by -s/--separator.
separator = " "

optparser = GetoptLong.new(
  ["-a", "--algorithm", GetoptLong::REQUIRED_ARGUMENT],
  ["-A", "--ambiguity", GetoptLong::REQUIRED_ARGUMENT],
  ["-s", "--separator", GetoptLong::REQUIRED_ARGUMENT],
  ["-h", "--help",      GetoptLong::NO_ARGUMENT]
)

begin
  # GetoptLong reports each option under the first name of its
  # declaration, i.e. the short form.
  optparser.each do |opt, arg|
    case opt
    when "-h"
      print_usage
    when "-a"
      Config.algorithm = arg.to_sym
    when "-A"
      Config.on_ambiguity = arg.to_sym
    when "-s"
      separator = arg
    end
  end
rescue => err
  # Diagnostics go to stderr so they never get mixed into the segmented
  # text that is written to stdout.
  $stderr.puts err
  exit 1
end

puts segment(STDIN.read).join(separator)
@@ -0,0 +1,157 @@
1
+ require 'jcode'
2
+ require 'rmmseg/dictionary'
3
+ require 'rmmseg/word'
4
+ require 'rmmseg/chunk'
5
+ require 'rmmseg/token'
6
+
7
module RMMSeg
  # An algorithm segments a piece of text into an array of words.
  # This module holds the operations shared by SimpleAlgorithm and
  # ComplexAlgorithm.
  #
  # NOTE(review): +next_token+ calls +create_chunks+ and +get_cjk_word+
  # reads <tt>@rules</tt>; neither is defined in this module, so the
  # including class presumably supplies them -- confirm against
  # SimpleAlgorithm and ComplexAlgorithm.
  module Algorithm
    # Initialize a new instance of Algorithm; the +text+ will then be
    # segmented by this instance.
    def initialize(text)
      # Depending on the Ruby version and on whether jcode is loaded,
      # String#each_char without a block returns either an Array or an
      # Enumerator. The rest of this module indexes @chars and asks for
      # its length, so it is always normalized to an Array.
      @chars = text.each_char.to_a
      @index = 0       # position in @chars of the next unread character
      @byte_index = 0  # byte offset of that character within the text
    end

    # Get the next Token recognized, or +nil+ when the whole text has
    # been consumed.
    def next_token
      return nil if @index >= @chars.length

      token = if basic_latin?(@chars[@index])
                get_basic_latin_word
              else
                get_cjk_word(create_chunks)
              end

      # A basic latin run made only of whitespace/punctuation yields an
      # empty token; drop it and carry on with what follows.
      token.text.empty? ? next_token : token
    end

    # Segment the text given to +initialize+ into an array of words
    # (the +text+ of every Token, in order).
    def segment
      words = []
      while (token = next_token)
        words << token.text
      end
      words
    end

    # Skip whitespace and punctuation to extract a basic latin word.
    #
    # Returns a Token whose positions delimit the word itself; the
    # non-word characters before and after it are consumed but are not
    # part of the token. The token text is empty when no word character
    # was found.
    def get_basic_latin_word
      word_start = skip_basic_latin_nonword(@index)

      word_end = word_start
      while word_end < @chars.length &&
            basic_latin?(@chars[word_end]) &&
            !nonword_char?(@chars[word_end])
        word_end += 1
      end
      word = @chars[word_start...word_end].join

      resume_at = skip_basic_latin_nonword(word_end)

      # Basic latin characters are one byte each, so a distance counted
      # in characters is also a distance in bytes.
      start_pos = @byte_index + word_start - @index
      end_pos   = @byte_index + word_end - @index

      @byte_index += resume_at - @index
      @index = resume_at

      Token.new(word, start_pos, end_pos)
    end

    # Use the rules to filter the +chunks+ down to the most appropriate
    # CJK word.
    #
    # The rules are applied in order until fewer than two chunks are
    # left. If several chunks survive every rule, Ambiguity is raised
    # when +Config.on_ambiguity+ is +:raise_exception+; otherwise the
    # first chunk is used. The first word of the selected chunk is
    # consumed and returned as a Token.
    def get_cjk_word(chunks)
      @rules.each do |rule|
        break if chunks.length < 2
        chunks = rule.filter(chunks)
      end

      if chunks.length > 1 && Config.on_ambiguity == :raise_exception
        raise Ambiguity, "Can't solve ambiguity on #{chunks}"
      end

      word = chunks[0].words[0]
      token = Token.new(word.text, @byte_index, @byte_index + word.byte_size)

      @index += word.length
      @byte_index += word.byte_size

      token
    end

    # Find all words occurring in the dictionary starting from
    # +index+ in +chars+. The maximum word length is determined by
    # +Config.max_word_length+.
    #
    # When the dictionary knows no word at that position, the single
    # character at +index+ is returned as an unrecognized Word.
    def find_match_words(chars, index)
      dic = Dictionary.instance
      str = String.new
      words = []
      i = index

      while i < chars.length && !basic_latin?(chars[i])
        str << chars[i]
        words << dic.get_word(str) if dic.has_word?(str)
        i += 1
        break if Word.new(str).length >= Config.max_word_length
      end

      words << Word.new(chars[index], Word::TYPES[:unrecognized]) if words.empty?

      words
    end

    # Determine whether a character is a basic latin character.
    #--
    # TODO: Implement this method in a more correct way.
    # Currently the number of bytes in the char is used to determine
    # this: a one-byte char is considered basic latin.
    #++
    def basic_latin?(char)
      # String#size counts bytes on Ruby 1.8 but characters on later
      # versions, so prefer the explicit byte count where it exists.
      byte_count = char.respond_to?(:bytesize) ? char.bytesize : char.size
      byte_count == 1
    end

    # Determine whether a character is a non-word character, i.e. one
    # that can NOT be part of a basic latin word (whitespace,
    # punctuation, ...). Returns 0 (truthy) for such a character and
    # +nil+ otherwise.
    def nonword_char?(char)
      /\A\W\z/ =~ char
    end

    # Return the first position at or after +from+ whose character is
    # not a basic latin non-word character (or the length of the text
    # when there is none).
    def skip_basic_latin_nonword(from)
      pos = from
      while pos < @chars.length &&
            basic_latin?(@chars[pos]) &&
            nonword_char?(@chars[pos])
        pos += 1
      end
      pos
    end
    private :skip_basic_latin_nonword
  end
end
@@ -0,0 +1,4 @@
1
module RMMSeg
  # Raised when more than one candidate is left after all the
  # disambiguation rules were applied and the configuration asks for an
  # exception in that situation.
  #
  # It derives from StandardError (not directly from Exception) so that
  # a plain +rescue+ clause catches it, like any other application
  # level error.
  class Ambiguity < StandardError
  end
end