loyal_rmmseg 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.txt ADDED
@@ -0,0 +1,74 @@
+ = rmmseg
+ by pluskid
+ http://rmmseg.rubyforge.org
+
+ == DESCRIPTION:
+
+ RMMSeg is an implementation of the MMSEG Chinese word segmentation
+ algorithm. It is based on two variants of maximum matching
+ algorithms. Two algorithms are available:
+
+ * a simple algorithm that uses only forward maximum matching.
+ * a complex algorithm that uses three-word chunk maximum matching and 3
+   additional rules to resolve ambiguities.
+
+ For more information about the algorithm, please refer to the
+ following essays:
+
+ * http://technology.chtsai.org/mmseg/
+ * http://pluskid.lifegoo.com/?p=261
+
+ == FEATURES/PROBLEMS:
+
+ * Provides the +rmmseg+ command line tool for quick and easy access to
+   the word segmentation feature.
+ * Provides an +Analyzer+ for integrating with Ferret[http://ferret.davebalmain.com/trac].
+
+ == SYNOPSIS:
+
+ Using the command line tool +rmmseg+ is simple:
+   $ rmmseg --separator _ < input.txt
+ Passing the option +-h+ gives an overview of all supported options.
+
+ Using the +Analyzer+ for Ferret is even easier:
+
+   require 'rmmseg'
+   require 'rmmseg/ferret'
+
+   analyzer = RMMSeg::Ferret::Analyzer.new
+   index = Ferret::Index::Index.new(:analyzer => analyzer)
+
+ For more details, please refer to the {homepage usage section}[http://rmmseg.rubyforge.org/index.html#Usage].
+
+ == REQUIREMENTS:
+
+ * ruby
+
+ == INSTALL:
+
+ * sudo gem install rmmseg
+
+ == LICENSE:
+
+ (The MIT License)
+
+ Copyright (c) 2008 FIX
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ 'Software'), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
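The SYNOPSIS above covers the command line tool and the Ferret analyzer; the same segmentation can also be driven programmatically through the classes added later in this diff. A minimal sketch (illustrative, not part of the packaged files), assuming the gem and its bundled dictionaries are installed; the sample sentence is made up:

  require 'rmmseg'

  text = "我们都喜欢用 Ruby"
  words = RMMSeg::Config.algorithm_instance(text).segment
  puts words.join("/")   # one word per slash-separated field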
data/lib/rmmseg/algorithm.rb ADDED
@@ -0,0 +1,138 @@
+ require 'jcode'
+ require 'rmmseg/dictionary'
+ require 'rmmseg/word'
+ require 'rmmseg/chunk'
+ require 'rmmseg/token'
+
+ module RMMSeg
+   # An algorithm segments a piece of text into an array of
+   # words. This module contains the common operations shared by
+   # SimpleAlgorithm and ComplexAlgorithm .
+   module Algorithm
+     # Initialize a new instance of Algorithm. The +text+ will
+     # then be segmented by this instance. +token+ is the class
+     # which will be used to construct the resulting tokens.
+     def initialize(text, token=Token)
+       @text = text
+       @chars = text.each_char
+       @index = 0
+       @byte_index = 0
+       @token = token
+     end
+
+     # Get the next Token recognized.
+     def next_token
+       return nil if @index >= @chars.length
+
+       if basic_latin?(@chars[@index])
+         token = get_basic_latin_word
+       else
+         token = get_cjk_word
+       end
+
+       if token.start == token.end # empty
+         return next_token
+       else
+         return token
+       end
+     end
+
+     # Segment the string in +text+ into an array
+     # of words.
+     def segment
+       words = Array.new
+
+       token = next_token
+       until token.nil?
+         words << token.text
+         token = next_token
+       end
+
+       words
+     end
+
+     # Skip whitespace and punctuation to extract a basic latin
+     # word.
+     def get_basic_latin_word
+       start_pos = nil
+       end_pos = nil
+
+       i = @index
+       while i < @chars.length &&
+             basic_latin?(@chars[i]) &&
+             nonword_char?(@chars[i])
+         i += 1
+       end
+
+       start_pos = @byte_index + i - @index
+       while i < @chars.length && basic_latin?(@chars[i])
+         break if nonword_char?(@chars[i])
+         i += 1
+       end
+
+       end_pos = @byte_index + i - @index
+       while i < @chars.length &&
+             basic_latin?(@chars[i]) &&
+             nonword_char?(@chars[i])
+         i += 1
+       end
+
+       @byte_index += i - @index
+       @index = i
+
+       return @token.new(@text[start_pos...end_pos], start_pos, end_pos)
+     end
+
+     # Find all words occurring in the dictionary starting from
+     # +index+ . The maximum word length is determined by
+     # +Config.max_word_length+ .
+     def find_match_words(index)
+       for i, w in @match_cache
+         if i == index
+           return w
+         end
+       end
+
+       dic = Dictionary.instance
+       str = String.new
+       strlen = 0
+       words = Array.new
+       i = index
+
+       while i < @chars.length &&
+             !basic_latin?(@chars[i]) &&
+             strlen < Config.max_word_length
+
+         str << @chars[i]
+         strlen += 1
+
+         if dic.has_word?(str)
+           words << dic.get_word(str)
+         end
+         i += 1
+       end
+
+       if words.empty?
+         words << Word.new(@chars[index], Word::TYPES[:unrecognized])
+       end
+
+       @match_cache[@match_cache_idx] = [index, words]
+       @match_cache_idx += 1
+       @match_cache_idx = 0 if @match_cache_idx == MATCH_CACHE_MAX_LENGTH
+
+       words
+     end
+
+     # Determine whether a character is a basic latin character.
+     def basic_latin?(char)
+       char.length == 1
+     end
+
+     # Determine whether a character can be part of a basic latin
+     # word.
+     NONWORD_CHAR_RE = /^\W$/
+     def nonword_char?(char)
+       NONWORD_CHAR_RE =~ char
+     end
+   end
+ end
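Algorithm#next_token drives both segmentation strategies and reports byte offsets through the Token objects it builds. A small sketch of iterating tokens directly (illustrative, not part of the package), assuming the bundled dictionaries can be loaded:

  require 'rmmseg'

  algor = RMMSeg::ComplexAlgorithm.new("hello 世界")
  while token = algor.next_token
    puts "#{token.text} covers bytes #{token.start}...#{token.end}"
  end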
data/lib/rmmseg/amibguity.rb ADDED
@@ -0,0 +1,4 @@
+ module RMMSeg
+   class Ambiguity < Exception
+   end
+ end
data/lib/rmmseg/chunk.rb ADDED
@@ -0,0 +1,41 @@
+ module RMMSeg
+   # A Chunk holds one or more successive Word .
+   module Chunk
+
+     # The sum of the lengths of all words.
+     def self.total_length(words)
+       len = 0
+       for word in words
+         len += word.length
+       end
+       len
+     end
+
+     # The average length of the words.
+     def self.average_length(words)
+       total_length(words).to_f/words.size
+     end
+
+     # A measure of the spread of word lengths (the square root of the sum of squared deviations from the average length).
+     def self.variance(words)
+       avglen = average_length(words)
+       sqr_sum = 0.0
+       for word in words
+         tmp = word.length - avglen
+         sqr_sum += tmp*tmp
+       end
+       Math.sqrt(sqr_sum)
+     end
+
+     # The sum of the frequencies of all one-character words.
+     def self.degree_of_morphemic_freedom(words)
+       sum = 0
+       for word in words
+         if word.length == 1 && word.type == Word::TYPES[:cjk_word]
+           sum += word.frequency
+         end
+       end
+       sum
+     end
+   end
+ end
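The four Chunk statistics are exactly what the MMSEG rules later in this diff compare. A worked example with hand-built Word objects (illustrative; the sample characters are hypothetical dictionary entries, and only their lengths matter here):

  require 'rmmseg'

  w = lambda { |s| RMMSeg::Word.new(s, RMMSeg::Word::TYPES[:cjk_word]) }
  words = [w["研究"], w["生命"], w["起源"]]     # three two-character words

  RMMSeg::Chunk.total_length(words)     # => 6    characters in total
  RMMSeg::Chunk.average_length(words)   # => 2.0
  RMMSeg::Chunk.variance(words)         # => 0.0  all lengths are equal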
data/lib/rmmseg/complex_algorithm.rb ADDED
@@ -0,0 +1,122 @@
+ require 'rmmseg/algorithm'
+ require 'rmmseg/mm_rule'
+ require 'rmmseg/lawl_rule'
+ require 'rmmseg/svwl_rule'
+ require 'rmmseg/lsdmfocw_rule'
+
+ module RMMSeg
+   class ComplexAlgorithm
+     MATCH_CACHE_MAX_LENGTH = 3
+
+     include Algorithm
+
+     # Create a new ComplexAlgorithm . The rules used by this algorithm
+     # include MMRule , LAWLRule , SVWLRule and LSDMFOCWRule .
+     def initialize(text, token=Token)
+       super
+       @rules = [
+         MMRule,
+         LAWLRule,
+         SVWLRule,
+         LSDMFOCWRule
+       ]
+       @match_cache = Array.new(MATCH_CACHE_MAX_LENGTH)
+       @match_cache_idx = 0
+     end
+
+     # Get the most proper CJK word.
+     def get_cjk_word
+       chunks = create_chunks
+       i = 0
+       while i < @rules.length
+         break if chunks.length < 2
+         chunks = @rules[i].filter(chunks)
+         i += 1
+       end
+
+       if chunks.length > 1
+         if Config.on_ambiguity == :raise_exception
+           raise Ambiguity, "Can't solve ambiguity on #{chunks}"
+         end
+       end
+
+       word = chunks[0][0]
+       token = @token.new(word.text, @byte_index, @byte_index+word.byte_size)
+
+       @index += word.length
+       @byte_index += word.byte_size
+
+       return token
+     end
+
+     # Create all possible three-word (or shorter) chunks
+     # starting from +@index+ .
+     def create_chunks
+       chunks = Array.new
+       for w0 in find_match_words(@index)
+         index0 = @index + w0.length
+         if index0 < @chars.length
+           for w1 in find_match_words(index0)
+             index1 = index0 + w1.length
+             if index1 < @chars.length
+               for w2 in find_match_words(index1)
+                 if w2.type == Word::TYPES[:unrecognized]
+                   chunks << [w0, w1]
+                 else
+                   chunks << [w0, w1, w2]
+                 end
+               end
+             elsif index1 == @chars.length
+               chunks << [w0, w1]
+             end
+           end
+         elsif index0 == @chars.length
+           chunks << [w0]
+         end
+       end
+
+       chunks
+     end
+
+     # Find all words occurring in the dictionary starting from
+     # +index+ . The maximum word length is determined by
+     # +Config.max_word_length+ .
+     def find_match_words(index)
+       for i, w in @match_cache
+         if i == index
+           return w
+         end
+       end
+
+       dic = Dictionary.instance
+       str = String.new
+       strlen = 0
+       words = Array.new
+       i = index
+
+       while i < @chars.length &&
+             !basic_latin?(@chars[i]) &&
+             strlen < Config.max_word_length
+
+         str << @chars[i]
+         strlen += 1
+
+         if dic.has_word?(str)
+           words << dic.get_word(str)
+         end
+         i += 1
+       end
+
+       if words.empty?
+         words << Word.new(@chars[index], Word::TYPES[:unrecognized])
+       end
+
+       @match_cache[@match_cache_idx] = [index, words]
+       @match_cache_idx += 1
+       @match_cache_idx = 0 if @match_cache_idx == MATCH_CACHE_MAX_LENGTH
+
+       words
+     end
+
+   end
+ end
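The rule cascade inside get_cjk_word can be exercised on its own with hand-built candidate chunks. A sketch (illustrative; the words and the frequency value are made up) showing how the first rule that narrows the candidates down to one chunk ends the cascade:

  require 'rmmseg'

  cjk = RMMSeg::Word::TYPES[:cjk_word]
  chunk_a = [RMMSeg::Word.new("研究", cjk), RMMSeg::Word.new("生命", cjk)]
  chunk_b = [RMMSeg::Word.new("研", cjk, 20), RMMSeg::Word.new("究生", cjk)]

  chunks = [chunk_a, chunk_b]
  rules = [RMMSeg::MMRule, RMMSeg::LAWLRule, RMMSeg::SVWLRule, RMMSeg::LSDMFOCWRule]
  rules.each do |rule|
    break if chunks.length < 2              # same early exit as get_cjk_word
    chunks = rule.filter(chunks)
  end
  chunks.first.map { |word| word.text }     # => ["研究", "生命"]  (larger total length wins)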
data/lib/rmmseg/config.rb ADDED
@@ -0,0 +1,62 @@
+ require 'rmmseg/simple_algorithm'
+ require 'rmmseg/complex_algorithm'
+
+ module RMMSeg
+   # Configuration of RMMSeg.
+   class Config
+     @algorithm = :complex
+     @on_ambiguity = :select_first
+     data_dir = File.join(File.dirname(__FILE__), "..", "..", "data")
+     @dictionaries = [[File.join(data_dir, "chars.dic"), true],
+                      [File.join(data_dir, "words.dic"), false]]
+     @max_word_length = 4
+
+     class << self
+       # Get the name of the algorithm currently in use.
+       def algorithm
+         @algorithm
+       end
+       # Set the name of the algorithm used to segment. Valid values are
+       # +:complex+ and +:simple+ . The former is the default one.
+       def algorithm=(algor)
+         unless [:complex, :simple].include? algor
+           raise ArgumentError, "Unknown algorithm #{algor}"
+         end
+         @algorithm = algor
+       end
+       # Get an instance of the algorithm object corresponding to the
+       # algorithm name configured. +tok+ is the class of the token object
+       # to be returned. For example, if you want to use it with Ferret, you
+       # should provide +::Ferret::Analysis::Token+ .
+       def algorithm_instance(text, tok=Token)
+         RMMSeg.const_get("#{@algorithm}".capitalize+"Algorithm").new(text, tok)
+       end
+
+       # Get the behavior used when an unresolved ambiguity occurs.
+       def on_ambiguity
+         @on_ambiguity
+       end
+       # Set the behavior on an unresolved ambiguity. Valid values are
+       # +:raise_exception+ and +:select_first+ . The latter is the default
+       # one.
+       def on_ambiguity=(behavior)
+         unless [:raise_exception, :select_first].include? behavior
+           raise ArgumentError, "Unknown behavior on ambiguity: #{behavior}"
+         end
+         @on_ambiguity = behavior
+       end
+
+       # An array of dictionary files. Each element should be of the
+       # form: [file, whether_dic_includes_frequency_info]. This should
+       # be set before the dictionaries are loaded (they are loaded
+       # only when they are used). Otherwise you should call
+       # Dictionary.instance.reload manually to reload the
+       # dictionaries.
+       attr_accessor :dictionaries
+
+       # The maximum length of a CJK word. The default value is 4. Making
+       # this value too large might slow down the segmentation operations.
+       attr_accessor :max_word_length
+     end
+   end
+ end
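A configuration sketch tying the accessors above together (illustrative; the dictionary paths are hypothetical placeholders). As the comment above notes, custom dictionaries must be assigned before the first segmentation, or be followed by a manual Dictionary.instance.reload:

  require 'rmmseg'

  RMMSeg::Config.algorithm = :simple            # default is :complex
  RMMSeg::Config.on_ambiguity = :select_first   # or :raise_exception
  RMMSeg::Config.max_word_length = 4
  RMMSeg::Config.dictionaries = [
    ["/path/to/chars.dic", true],    # hypothetical path; true = file has a frequency column
    ["/path/to/words.dic", false]
  ]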
data/lib/rmmseg/dictionary.rb ADDED
@@ -0,0 +1,80 @@
+ require 'singleton'
+
+ module RMMSeg
+   # The dictionary is a singleton object which is lazily initialized.
+   # *NOTE* dictionary data should use the UNIX line-break '\n' instead
+   # of the DOS '\r\n'.
+   class Dictionary
+     include Singleton
+
+     # Initialize and load dictionaries from the files specified by
+     # +Config.dictionaries+ .
+     def initialize
+       load_dictionaries
+     end
+
+     # Determine whether +value+ is a word in the dictionary.
+     def has_word?(value)
+       @dic.has_key?(value)
+     end
+
+     # Store a new word in the dictionary.
+     # +w+ may be:
+     # * an instance of Word.
+     # * +true+, meaning this is a normal word.
+     # * a String (which can be converted to a Number) or a Number.
+     #   The number is the frequency of the word.
+     def store_word(key, w=true)
+       @dic[key] = w
+     end
+
+     # Get an instance of Word corresponding to +value+ .
+     def get_word(value)
+       word = @dic[value]
+       # Construct a Word lazily
+       if word == true
+         word = Word.new(value.dup, Word::TYPES[:cjk_word])
+         @dic[value] = word
+       elsif String === word
+         word = Word.new(value.dup, Word::TYPES[:cjk_word], word.to_i)
+         @dic[value] = word
+       end
+       word
+     end
+
+     # Reload all dictionary files.
+     def reload
+       @dic = nil
+       load_dictionaries
+     end
+
+     private
+     def load_dictionaries
+       @dic = Hash.new
+       Config.dictionaries.each { |file, has_freq|
+         if has_freq
+           load_dictionary_with_freq(file)
+         else
+           load_dictionary(file)
+         end
+       }
+     end
+
+     def load_dictionary_with_freq(file)
+       File.open(file, "r") { |f|
+         f.each_line { |line|
+           pair = line.split(" ")
+           @dic[pair[0]] = pair[1]
+         }
+       }
+     end
+     def load_dictionary(file)
+       File.open(file, "r") { |f|
+         f.each_line { |line|
+           line.slice!(-1) # chop!
+           @dic[line] = true
+         }
+       }
+     end
+   end
+ end
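A small sketch of the Dictionary API above (illustrative, not part of the package). Whether a given lookup succeeds depends on the bundled words.dic, so the first lookup is only an example:

  require 'rmmseg'

  dic = RMMSeg::Dictionary.instance      # loads the configured .dic files lazily
  dic.has_word?("中国")                   # true if the bundled words.dic contains it
  dic.store_word("自定义词")               # register an extra word at runtime
  dic.get_word("自定义词").length          # => 4 characters (the Word is built lazily)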
data/lib/rmmseg/ferret.rb ADDED
@@ -0,0 +1,109 @@
+ # This file integrates RMMSeg with Ferret.
+ require 'singleton'
+ require 'rubygems'
+ require 'ferret'
+ require 'rmmseg'
+
+ module RMMSeg
+   module Ferret
+     # The Analyzer class can be used with Ferret .
+     class Analyzer < ::Ferret::Analysis::Analyzer
+
+       # Construct an Analyzer. An optional block can be used to
+       # add more +TokenFilter+s, e.g.
+       #
+       #   analyzer = RMMSeg::Ferret::Analyzer.new { |tokenizer|
+       #     Ferret::Analysis::LowerCaseFilter.new(tokenizer)
+       #   }
+       #
+       def initialize(&brk)
+         @brk = brk
+       end
+
+       def token_stream(field, text)
+         t = PunctuationFilter.new(Tokenizer.new(text))
+         if @brk
+           @brk.call(t)
+         else
+           t
+         end
+       end
+     end
+
+     # The Tokenizer tokenizes text with RMMSeg::Algorithm.
+     class Tokenizer < ::Ferret::Analysis::TokenStream
+       # Create a new Tokenizer to tokenize +text+ .
+       def initialize(str)
+         self.text = str
+       end
+
+       # Get the next token.
+       def next
+         @algor.next_token
+       end
+
+       # Get the text being tokenized.
+       def text
+         @text
+       end
+
+       # Set the text to be tokenized.
+       def text=(str)
+         @text = str
+         @algor = RMMSeg::Config.algorithm_instance(@text,
+                                                    ::Ferret::Analysis::Token)
+       end
+     end
+
+     # PunctuationFilter filters out stand-alone Chinese
+     # punctuation tokens.
+     class PunctuationFilter < ::Ferret::Analysis::TokenStream
+       # The punctuation dictionary.
+       class Dictionary
+         include Singleton
+
+         DIC_FILE = File.join(File.dirname(__FILE__),
+                              "..",
+                              "..",
+                              "data",
+                              "punctuation.dic")
+         def initialize
+           @dic = Hash.new
+           File.open(DIC_FILE, "r") do |f|
+             f.each_line { |line|
+               @dic[line.chomp.freeze] = nil
+             }
+           end
+         end
+
+         def include?(str)
+           @dic.has_key?(str)
+         end
+       end
+
+       def initialize(stream)
+         @stream = stream
+       end
+
+       # Get the next token, skipping stand-alone Chinese punctuation.
+       def next
+         token = @stream.next
+         dic = Dictionary.instance
+
+         until token.nil? || !(dic.include? token.text)
+           token = @stream.next
+         end
+
+         token
+       end
+
+       def text
+         @stream.text
+       end
+
+       def text=(str)
+         @stream.text = str
+       end
+     end
+   end
+ end
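A usage sketch of the Analyzer above, following the block example from its own comment (illustrative; the indexing and search calls are standard Ferret usage as I understand it, and the document text is made up):

  require 'rmmseg'
  require 'rmmseg/ferret'

  analyzer = RMMSeg::Ferret::Analyzer.new { |tokenizer|
    ::Ferret::Analysis::LowerCaseFilter.new(tokenizer)
  }
  index = ::Ferret::Index::Index.new(:analyzer => analyzer)
  index << { :content => "RMMSeg 是一个中文分词程序" }
  index.search_each("content:分词") { |doc_id, score|
    puts "hit #{doc_id} scored #{score}"
  }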
data/lib/rmmseg/lawl_rule.rb ADDED
@@ -0,0 +1,12 @@
+ require 'rmmseg/rule_helper'
+
+ module RMMSeg
+   # Largest average word length rule.
+   class LAWLRule
+     def self.filter(chunks)
+       chunks.take_highest { |a, b|
+         Chunk::average_length(a) <=> Chunk::average_length(b)
+       }
+     end
+   end
+ end
data/lib/rmmseg/lsdmfocw_rule.rb ADDED
@@ -0,0 +1,13 @@
+ require 'rmmseg/rule_helper'
+
+ module RMMSeg
+   # Largest sum of degree of morphemic freedom of one-character
+   # words rule.
+   class LSDMFOCWRule
+     def self.filter(chunks)
+       chunks.take_highest { |a, b|
+         Chunk::degree_of_morphemic_freedom(a) <=> Chunk::degree_of_morphemic_freedom(b)
+       }
+     end
+   end
+ end
data/lib/rmmseg/mm_rule.rb ADDED
@@ -0,0 +1,13 @@
+ require 'rmmseg/rule_helper'
+
+ module RMMSeg
+   # Maximum matching rule: select the chunks with the
+   # maximum total length.
+   class MMRule
+     def self.filter(chunks)
+       chunks.take_highest { |a, b|
+         Chunk::total_length(a) <=> Chunk::total_length(b)
+       }
+     end
+   end
+ end
data/lib/rmmseg/rule_helper.rb ADDED
@@ -0,0 +1,28 @@
+ class Array
+   # Take the elements with the highest value. Values are compared
+   # through the block, e.g.
+   #
+   #   ["aaaa", "bb", "cccc"].take_highest { |a, b|
+   #     a.length <=> b.length
+   #   }
+   #   # => ["aaaa", "cccc"]
+   #
+   def take_highest
+     return [] if empty?
+
+     rlt = [self.first]
+     max = self.first
+
+     for i in 1...length
+       cmp = yield(self[i], max)
+       if cmp == 0
+         rlt << self[i]
+       elsif cmp > 0
+         max = self[i]
+         rlt = [max]
+       end
+     end
+
+     rlt
+   end
+ end
data/lib/rmmseg/simple_algorithm.rb ADDED
@@ -0,0 +1,37 @@
+ require 'rmmseg/algorithm'
+ require 'rmmseg/mm_rule'
+
+ module RMMSeg
+   class SimpleAlgorithm
+     include Algorithm
+
+     # Create a new SimpleAlgorithm . The only rule used by this
+     # algorithm is MMRule .
+     def initialize(text, token=Token)
+       super
+     end
+
+     # Get the most proper CJK word.
+     def get_cjk_word
+       dic = Dictionary.instance
+       i = Config.max_word_length
+       if i + @index > @chars.length
+         i = @chars.length - @index
+       end
+       chars = @chars[@index, i]
+       word = chars.join
+
+       while i > 1 && !dic.has_word?(word)
+         i -= 1
+         word.slice!(-chars[i].size, chars[i].size) # truncate last char
+       end
+
+       token = @token.new(word, @byte_index, @byte_index+word.size)
+
+       @index += i
+       @byte_index += word.size
+
+       return token
+     end
+   end
+ end
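Both algorithm classes share the Algorithm mixin, so they can be compared directly on the same input. A sketch (illustrative; the sample sentence is made up and results depend on the bundled dictionaries):

  require 'rmmseg'

  text = "研究生命起源"
  RMMSeg::SimpleAlgorithm.new(text).segment    # forward maximum matching only
  RMMSeg::ComplexAlgorithm.new(text).segment   # chunk matching plus the four rules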
data/lib/rmmseg/svwl_rule.rb ADDED
@@ -0,0 +1,12 @@
+ require 'rmmseg/rule_helper'
+
+ module RMMSeg
+   # Smallest variance of word length rule.
+   class SVWLRule
+     def self.filter(chunks)
+       chunks.take_highest { |a, b|
+         Chunk::variance(b) <=> Chunk::variance(a)
+       }
+     end
+   end
+ end
data/lib/rmmseg/token.rb ADDED
@@ -0,0 +1,29 @@
+ module RMMSeg
+   # A Token consists of a term's text and the start and end offsets
+   # of the term.
+   class Token
+     # The text of the token.
+     attr_accessor :text
+
+     # The start position of the token. This is a *byte* index, not a
+     # character index.
+     attr_accessor :start
+
+     # One greater than the position of the last byte of the
+     # token. This is a *byte* index, not a character index.
+     attr_accessor :end
+
+     # +text+ is the text held by this token. In other words, it
+     # should equal the slice +start_pos...end_pos+ (byte offsets)
+     # of the whole text being segmented.
+     def initialize(text, start_pos, end_pos)
+       @text = text
+       @start = start_pos
+       @end = end_pos
+     end
+
+     def to_s
+       @text.dup
+     end
+   end
+ end
data/lib/rmmseg/word.rb ADDED
@@ -0,0 +1,38 @@
+ module RMMSeg
+   # An object representing a CJK word.
+   class Word
+     TYPES = {
+       :unrecognized => :unrecognized,
+       :basic_latin_word => :basic_latin_word,
+       :cjk_word => :cjk_word
+     }.freeze
+
+     # The content text of the word.
+     attr_reader :text
+
+     # The type of the word, which may be one of the keys of TYPES .
+     attr_reader :type
+
+     # The frequency of the word. This value is meaningful only
+     # when this is a one-character word.
+     attr_reader :frequency
+
+     # Initialize a Word object.
+     def initialize(text, type=TYPES[:unrecognized], frequency=nil)
+       @text = text
+       @type = type
+       @frequency = frequency
+       @length = @text.jlength
+     end
+
+     # The number of characters in the word. *Not* the number of bytes.
+     def length
+       @length
+     end
+
+     # The number of bytes in the word.
+     def byte_size
+       @text.length
+     end
+   end
+ end
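The length/byte_size distinction above is easy to check: with $KCODE set to UTF-8 by lib/rmmseg.rb, each CJK character occupies three bytes (illustrative sketch):

  require 'rmmseg'

  w = RMMSeg::Word.new("中文", RMMSeg::Word::TYPES[:cjk_word])
  w.length      # => 2  characters (via String#jlength)
  w.byte_size   # => 6  bytes in UTF-8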
data/lib/rmmseg.rb ADDED
@@ -0,0 +1,15 @@
+ $KCODE = 'u'
+ require 'jcode'
+
+ require 'rmmseg/config'
+ require 'rmmseg/simple_algorithm'
+ require 'rmmseg/complex_algorithm'
+
+ module RMMSeg
+   VERSION = '0.1.6'
+
+   # Segment +text+ using the algorithm configured.
+   def segment(text)
+     Config.algorithm_instance(text).segment
+   end
+ end
metadata ADDED
@@ -0,0 +1,96 @@
+ --- !ruby/object:Gem::Specification
+ name: loyal_rmmseg
+ version: !ruby/object:Gem::Version
+   hash: 29
+   prerelease:
+   segments:
+   - 0
+   - 0
+   - 1
+   version: 0.0.1
+ platform: ruby
+ authors:
+ - happy
+ autorequire:
+ bindir: bin
+ cert_chain: []
+
+ date: 2012-02-02 00:00:00 +08:00
+ default_executable:
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: rake
+   prerelease: false
+   requirement: &id001 !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         hash: 3
+         segments:
+         - 0
+         version: "0"
+   type: :runtime
+   version_requirements: *id001
+ description: Chinese Seg.
+ email: happy@doc5.com
+ executables: []
+
+ extensions: []
+
+ extra_rdoc_files: []
+
+ files:
+ - README.txt
+ - lib/rmmseg.rb
+ - lib/rmmseg/word.rb
+ - lib/rmmseg/algorithm.rb
+ - lib/rmmseg/rule_helper.rb
+ - lib/rmmseg/amibguity.rb
+ - lib/rmmseg/lawl_rule.rb
+ - lib/rmmseg/chunk.rb
+ - lib/rmmseg/config.rb
+ - lib/rmmseg/lsdmfocw_rule.rb
+ - lib/rmmseg/simple_algorithm.rb
+ - lib/rmmseg/complex_algorithm.rb
+ - lib/rmmseg/token.rb
+ - lib/rmmseg/ferret.rb
+ - lib/rmmseg/dictionary.rb
+ - lib/rmmseg/mm_rule.rb
+ - lib/rmmseg/svwl_rule.rb
+ has_rdoc: true
+ homepage: http://www.doc5.com
+ licenses:
+ - MIT
+ post_install_message:
+ rdoc_options: []
+
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       hash: 3
+       segments:
+       - 0
+       version: "0"
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       hash: 3
+       segments:
+       - 0
+       version: "0"
+ requirements: []
+
+ rubyforge_project:
+ rubygems_version: 1.4.2
+ signing_key:
+ specification_version: 3
+ summary: Nice Chinese Seg.
+ test_files: []
+