rmmseg 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/History.txt ADDED
@@ -0,0 +1,6 @@
1
+ === 0.0.1 / 2008-01-31
2
+
3
+ * Analyser integration with Ferret.
4
+ * rdoc added
5
+ * Lazily init the +Word+ objects inside the +Dictionary+.
6
+ * Handle English punctuation correctly.
data/Manifest.txt ADDED
@@ -0,0 +1,37 @@
1
+ History.txt
2
+ Manifest.txt
3
+ README.txt
4
+ Rakefile
5
+ TODO.txt
6
+ bin/rmmseg
7
+ lib/rmmseg.rb
8
+ lib/rmmseg/algorithm.rb
9
+ lib/rmmseg/amibguity.rb
10
+ lib/rmmseg/chars.dic
11
+ lib/rmmseg/chunk.rb
12
+ lib/rmmseg/complex_algorithm.rb
13
+ lib/rmmseg/config.rb
14
+ lib/rmmseg/dictionary.rb
15
+ lib/rmmseg/ferret.rb
16
+ lib/rmmseg/lawl_rule.rb
17
+ lib/rmmseg/lsdmfocw_rule.rb
18
+ lib/rmmseg/mm_rule.rb
19
+ lib/rmmseg/rule_helper.rb
20
+ lib/rmmseg/simple_algorithm.rb
21
+ lib/rmmseg/svwl_rule.rb
22
+ lib/rmmseg/token.rb
23
+ lib/rmmseg/word.rb
24
+ lib/rmmseg/words.dic
25
+ misc/homepage.erb
26
+ misc/homepage.html
27
+ spec/chunk_spec.rb
28
+ spec/complex_algorithm_spec.rb
29
+ spec/config_spec.rb
30
+ spec/dictionary_spec.rb
31
+ spec/lawl_rule_spec.rb
32
+ spec/lsdmfocw_rule_spec.rb
33
+ spec/mm_rule_spec.rb
34
+ spec/simple_algorithm_spec.rb
35
+ spec/spec_helper.rb
36
+ spec/svwl_rule_spec.rb
37
+ spec/word_spec.rb
data/README.txt ADDED
@@ -0,0 +1,63 @@
1
+ = rmmseg
2
+
3
+ * http://rmmseg.rubyforge.org
4
+ * mailto:pluskid@gmail.com
5
+
6
+ == DESCRIPTION:
7
+
8
+ RMMSeg is an implementation of the MMSEG Chinese word segmentation
9
+ algorithm. It is based on two variants of maximum matching
10
+ algorithms. Two algorithms are available:
11
+
12
+ * simple algorithm that uses only forward maximum matching.
13
+ * complex algorithm that uses three-word chunk maximum matching and 3
14
+ additional rules to solve ambiguities.
15
+
16
+ For more information about the algorithm, please refer to the
17
+ following essays:
18
+
19
+ * http://technology.chtsai.org/mmseg/
20
+ * http://pluskid.lifegoo.com/?p=261
21
+
22
+ == FEATURES/PROBLEMS:
23
+
24
+ * Provides the +rmmseg+ command line tool for quick and easy access to
25
+ the word segmentation feature.
26
+ * Provides an +Analyser+ for integrating with Ferret[http://ferret.davebalmain.com/trac].
27
+
28
+ == SYNOPSIS:
29
+
30
+ $ rmmseg --separator _ < input.txt
31
+
32
+ == REQUIREMENTS:
33
+
34
+ * ruby
35
+
36
+ == INSTALL:
37
+
38
+ * sudo gem install rmmseg
39
+
40
+ == LICENSE:
41
+
42
+ (The MIT License)
43
+
44
+ Copyright (c) 2008 pluskid
45
+
46
+ Permission is hereby granted, free of charge, to any person obtaining
47
+ a copy of this software and associated documentation files (the
48
+ 'Software'), to deal in the Software without restriction, including
49
+ without limitation the rights to use, copy, modify, merge, publish,
50
+ distribute, sublicense, and/or sell copies of the Software, and to
51
+ permit persons to whom the Software is furnished to do so, subject to
52
+ the following conditions:
53
+
54
+ The above copyright notice and this permission notice shall be
55
+ included in all copies or substantial portions of the Software.
56
+
57
+ THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
58
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
59
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
60
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
61
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
62
+ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
63
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/Rakefile ADDED
@@ -0,0 +1,33 @@
1
+ # -*- ruby -*-
2
+
3
+ $: << File.join(File.dirname(__FILE__), "lib")
4
+
5
+ require 'rubygems'
6
+ require 'hoe'
7
+ require 'rmmseg'
8
+
9
+ Hoe.new('rmmseg', RMMSeg::VERSION) do |p| # gem settings handed to Hoe; RMMSeg::VERSION comes from the 'rmmseg' require above
10
+ p.rubyforge_name = 'rmmseg'
11
+ p.author = 'pluskid'
12
+ p.email = 'pluskid@gmail.com'
13
+ p.test_globs = ["spec/spec.rb"] # NOTE(review): Manifest.txt lists spec/*_spec.rb files but no spec/spec.rb -- confirm this path
14
+ p.rdoc_pattern = /^lib\/.*\.rb$|\.txt$/ # matches paths under lib/ ending in .rb, plus any path ending in .txt
15
+ p.summary = <<-END
16
+ RMMSeg is an implementation of MMSEG algorithm in Ruby. MMSEG is a
17
+ Chinese segmentation algorithm based on two variants of maximum
18
+ matching.
19
+
20
+ RMMSeg can be used as a stand alone program or as an Analyzer of
21
+ Ferret.
22
+ END
23
+ end
24
+
25
+ task :homepage do # regenerate misc/homepage.html from misc/homepage.erb by shelling out to the gerbil command
26
+ sh "gerbil html misc/homepage.erb > misc/homepage.html"
27
+ end
28
+
29
+ task :publish_homepage do # copy misc/homepage.html to rubyforge.org as the project's index.html via scp
30
+ sh "scp misc/homepage.html rubyforge.org:/var/www/gforge-projects/rmmseg/index.html"
31
+ end
32
+
33
+ # vim: syntax=Ruby
data/TODO.txt ADDED
@@ -0,0 +1,3 @@
1
+ === TODO
2
+
3
+ * Add a filter to strip Chinese punctuation.
data/bin/rmmseg ADDED
@@ -0,0 +1,63 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ $: << File.join(File.dirname(__FILE__), "..", "lib")
4
+
5
+ require 'rmmseg'
6
+ include RMMSeg
7
+
8
+ require 'getoptlong'
9
+
10
+ def print_usage
11
+ puts <<EOF
12
+ #{__FILE__} Segment Chinese text. Read from stdin and print to stdout.
13
+
14
+ Options:
15
+ -h
16
+ --help Print this message
17
+
18
+ -a
19
+ --algorithm Select segment algorithm. Valid values are 'complex' and
20
+ 'simple'. 'simple' is the default one.
21
+
22
+ -A
23
+ --ambiguity Select a behavior when an ambiguity occurs. Valid values
24
+ are 'raise_exception' and 'select_first'. 'select_first'
25
+ is the default one.
26
+ EOF
27
+ exit 0
28
+ end
29
+
30
# String placed between the segmented words on output; -s/--separator
# overrides it.
separator = " "

# Option table: each entry is [short name, long name, argument requirement].
optparser = GetoptLong.new(
  ["-a", "--algorithm", GetoptLong::REQUIRED_ARGUMENT],
  ["-A", "--ambiguity", GetoptLong::REQUIRED_ARGUMENT],
  ["-s", "--separator", GetoptLong::REQUIRED_ARGUMENT],
  ["-h", "--help", GetoptLong::NO_ARGUMENT]
)

begin
  # GetoptLong reports an option under the first name of its entry,
  # so only the short forms need to be matched here.
  optparser.each do |opt, arg|
    case opt
    when "-h" then print_usage
    when "-a" then Config.algorithm = arg.to_sym
    when "-A" then Config.on_ambiguity = arg.to_sym
    when "-s" then separator = arg
    end
  end
rescue => err
  # Diagnostics go to stderr so they never end up mixed into the
  # segmented text that is written to stdout.
  $stderr.puts err
  exit 1
end

# Segment everything read from stdin and print the words joined by the
# chosen separator.
puts segment(STDIN.read).join(separator)
@@ -0,0 +1,157 @@
1
+ require 'jcode'
2
+ require 'rmmseg/dictionary'
3
+ require 'rmmseg/word'
4
+ require 'rmmseg/chunk'
5
+ require 'rmmseg/token'
6
+
7
module RMMSeg
  # An algorithm can segment a piece of text into an array of
  # words. This module holds the operations shared by
  # SimpleAlgorithm and ComplexAlgorithm .
  #
  # The including class must supply +create_chunks+ and assign
  # <tt>@rules</tt>; this module calls/reads them but defines neither.
  module Algorithm
    # Initialize a new instance of Algorithm, the +text+ will
    # then be segmented by this instance.
    def initialize(text)
      # +to_a+ guarantees an indexable Array whether +each_char+
      # returns an Array or an Enumerator.
      @chars = text.each_char.to_a
      @index = 0       # position of the next unread character in @chars
      @byte_index = 0  # byte offset matching @index, used for Token positions
    end

    # Get the next Token recognized, or +nil+ when the whole text
    # has been consumed.
    def next_token
      while @index < @chars.length
        token =
          if basic_latin?(@chars[@index])
            get_basic_latin_word
          else
            get_cjk_word(create_chunks)
          end

        # A run made only of whitespace/punctuation produces a token
        # with empty text; drop it and keep scanning.
        return token unless token.text.empty?
      end

      nil
    end

    # Segment the text given to +initialize+ into an array of
    # words (the +text+ of every remaining Token).
    def segment
      words = []
      while (token = next_token)
        words << token.text
      end
      words
    end

    # Skip whitespaces and punctuation to extract a basic latin
    # word. Trailing whitespaces and punctuation are consumed as
    # well, but are not part of the returned Token. The Token text
    # is empty when no word character was found.
    def get_basic_latin_word
      word_start = skip_nonword_chars(@index)

      word_end = word_start
      while word_end < @chars.length &&
            basic_latin?(@chars[word_end]) &&
            !nonword_char?(@chars[word_end])
        word_end += 1
      end

      # Every character handled here is a single byte, so character
      # distances can be added to the byte offset directly.
      start_pos = @byte_index + word_start - @index
      end_pos = @byte_index + word_end - @index
      word = @chars[word_start...word_end].join

      resume_at = skip_nonword_chars(word_end)
      @byte_index += resume_at - @index
      @index = resume_at

      Token.new(word, start_pos, end_pos)
    end

    # Use rules to filter the +chunks+ to get the most
    # apropos CJK word.
    #
    # Rules are applied in order until at most one chunk is left.
    # If several chunks still remain, Ambiguity is raised when
    # +Config.on_ambiguity+ is <tt>:raise_exception</tt>; otherwise
    # the first chunk is used.
    def get_cjk_word(chunks)
      @rules.each do |rule|
        break if chunks.length < 2
        chunks = rule.filter(chunks)
      end

      if chunks.length > 1 && Config.on_ambiguity == :raise_exception
        raise Ambiguity, "Can't solve ambiguity on #{chunks}"
      end

      # Only the first word of the selected chunk is emitted.
      word = chunks[0].words[0]
      token = Token.new(word.text, @byte_index, @byte_index + word.byte_size)

      @index += word.length
      @byte_index += word.byte_size

      token
    end

    # Find all words occurring in the dictionary starting from
    # +index+ . The maximum word length is determined by
    # +Config.max_word_length+ .
    #
    # When nothing matches, the single character at +index+ is
    # returned as a Word of the unrecognized type.
    def find_match_words(chars, index)
      dic = Dictionary.instance
      str = String.new
      words = []
      i = index

      while i < chars.length && !basic_latin?(chars[i])
        str << chars[i]
        words << dic.get_word(str) if dic.has_word?(str)
        i += 1
        break if Word.new(str).length >= Config.max_word_length
      end

      if words.empty?
        words << Word.new(chars[index], Word::TYPES[:unrecognized])
      end

      words
    end

    # Determine whether a character is a basic latin character.
    #--
    # TODO: Implement this method in a more correct way.
    # currently I use number of bytes in this char to determine this.
    # If it is a one-byte char, I consider it a basic latin.
    #++
    def basic_latin?(char)
      # String#size counts characters (not bytes) on Ruby 1.9+, so
      # use the byte count whenever the String provides it.
      (char.respond_to?(:bytesize) ? char.bytesize : char.size) == 1
    end

    # Determine whether a character can NOT be part of a basic
    # latin word, i.e. it is a single non-word character such as
    # whitespace or punctuation. Returns a truthy value if so,
    # +nil+ otherwise.
    def nonword_char?(char)
      /^\W$/ =~ char
    end

    private

    # Return the first index at or after +i+ that does not hold a
    # basic latin non-word character (or the length of the text).
    def skip_nonword_chars(i)
      i += 1 while i < @chars.length &&
                   basic_latin?(@chars[i]) &&
                   nonword_char?(@chars[i])
      i
    end
  end
end
@@ -0,0 +1,4 @@
1
module RMMSeg
  # Error signalling that a segmentation ambiguity could not be
  # resolved.
  #
  # Inherits from StandardError (not Exception) so that a plain
  # +rescue+ clause catches it, as is expected of library errors.
  class Ambiguity < StandardError
  end
end