loyal_rmmseg 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/README.txt ADDED
@@ -0,0 +1,74 @@
1
+ = rmmseg
2
+ by pluskid
3
+ http://rmmseg.rubyforge.org
4
+
5
+ == DESCRIPTION:
6
+
7
+ RMMSeg is an implementation of MMSEG Chinese word segmentation
8
+ algorithm. It is based on two variants of maximum matching
9
+ algorithms. Two algorithms are available:
10
+
11
+ * simple algorithm that uses only forward maximum matching.
12
+ * complex algorithm that uses three-word chunk maximum matching and 3
13
+ additional rules to solve ambiguities.
14
+
15
+ For more information about the algorithm, please refer to the
16
+ following essays:
17
+
18
+ * http://technology.chtsai.org/mmseg/
19
+ * http://pluskid.lifegoo.com/?p=261
20
+
21
+ == FEATURES/PROBLEMS:
22
+
23
+ * Provides +rmmseg+ command line tool for quick and easy way to access
24
+ the word segment feature.
25
+ * Provides an +Analyzer+ for integrating with Ferret[http://ferret.davebalmain.com/trac].
26
+
27
+ == SYNOPSIS:
28
+
29
+ Using the command line tool +rmmseg+ is simple:
30
+ $ rmmseg --separator _ < input.txt
31
+ Passing the +-h+ option prints an overview of all supported options.
32
+
33
+ Using the +Analyzer+ for Ferret is even easier:
34
+
35
+ require 'rmmseg'
36
+ require 'rmmseg/ferret'
37
+
38
+ analyzer = RMMSeg::Ferret::Analyzer.new
39
+ index = Ferret::Index::Index.new(:analyzer => analyzer)
40
+
41
+ For more details, please refer to the {homepage usage section}[http://rmmseg.rubyforge.org/index.html#Usage].
42
+
43
+ == REQUIREMENTS:
44
+
45
+ * ruby
46
+
47
+ == INSTALL:
48
+
49
+ * sudo gem install rmmseg
50
+
51
+ == LICENSE:
52
+
53
+ (The MIT License)
54
+
55
+ Copyright (c) 2008 FIX
56
+
57
+ Permission is hereby granted, free of charge, to any person obtaining
58
+ a copy of this software and associated documentation files (the
59
+ 'Software'), to deal in the Software without restriction, including
60
+ without limitation the rights to use, copy, modify, merge, publish,
61
+ distribute, sublicense, and/or sell copies of the Software, and to
62
+ permit persons to whom the Software is furnished to do so, subject to
63
+ the following conditions:
64
+
65
+ The above copyright notice and this permission notice shall be
66
+ included in all copies or substantial portions of the Software.
67
+
68
+ THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
69
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
70
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
71
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
72
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
73
+ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
74
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,138 @@
1
+ require 'jcode'
2
+ require 'rmmseg/dictionary'
3
+ require 'rmmseg/word'
4
+ require 'rmmseg/chunk'
5
+ require 'rmmseg/token'
6
+
7
module RMMSeg
  # An algorithm can segment a piece of text into an array of
  # words. This module holds the operations shared by
  # SimpleAlgorithm and ComplexAlgorithm. An including class must
  # provide +get_cjk_word+, which consumes characters starting at
  # +@index+, advances +@index+ / +@byte_index+ and returns a token.
  module Algorithm
    # Default number of slots in the +find_match_words+ cache, used when
    # the including class has not allocated +@match_cache+ itself.
    MATCH_CACHE_MAX_LENGTH = 3

    # Matches a single character that cannot be part of a basic latin
    # word.
    NONWORD_CHAR_RE = /^\W$/

    # Initialize a new instance of Algorithm, the +text+ will
    # then be segmented by this instance. +token+ is the class
    # which will be used to construct the result token; it is
    # instantiated as +token.new(text, start_pos, end_pos)+.
    def initialize(text, token=Token)
      @text = text
      # scan(/./m) is what jcode's block-less each_char does, and it
      # returns a real Array on every Ruby version (String#each_char
      # without a block returns an Enumerator on newer Rubies, which
      # has neither #length nor #[]).
      @chars = text.scan(/./m)
      @index = 0       # position in @chars (characters)
      @byte_index = 0  # position in the text (bytes)
      @token = token
    end

    # Get the next Token recognized, or +nil+ when the text is
    # exhausted. Empty tokens (start == end), produced by runs of
    # latin whitespace/punctuation, are skipped.
    def next_token
      while @index < @chars.length
        token = if basic_latin?(@chars[@index])
                  get_basic_latin_word
                else
                  get_cjk_word
                end
        return token unless token.start == token.end
      end
      nil
    end

    # Segment the string in +text+ into an array
    # of words (the +text+ of every token, in order).
    def segment
      words = []
      while (token = next_token)
        words << token.text
      end
      words
    end

    # Skip whitespaces and punctuation to extract a basic latin
    # word. Leading and trailing non-word latin characters are
    # consumed but not included in the token. Returns a token whose
    # start/end are byte offsets; every basic latin character is one
    # byte wide, so character steps and byte steps coincide here.
    def get_basic_latin_word
      limit = @chars.length
      i = @index

      i += 1 while i < limit && latin_separator?(@chars[i])
      word_start = i

      i += 1 while i < limit && basic_latin?(@chars[i]) && !nonword_char?(@chars[i])
      word_end = i

      i += 1 while i < limit && latin_separator?(@chars[i])

      start_pos = @byte_index + word_start - @index
      end_pos = @byte_index + word_end - @index

      @byte_index += i - @index
      @index = i

      # Build the word from the character array rather than slicing
      # @text by byte offsets, which only works on byte-indexed strings.
      @token.new(@chars[word_start...word_end].join, start_pos, end_pos)
    end

    # Find all words occurring in the dictionary starting from
    # +index+ . The maximum word length is determined by
    # +Config.max_word_length+ . When nothing matches, a single
    # unrecognized one-character Word is returned. Results are kept
    # in a small ring buffer keyed by +index+.
    def find_match_words(index)
      # The including class may have set these up already; fall back
      # to defaults so the method is usable on its own.
      @match_cache ||= Array.new(MATCH_CACHE_MAX_LENGTH)
      @match_cache_idx ||= 0

      @match_cache.each do |cached_index, cached_words|
        return cached_words if cached_index == index
      end

      dic = Dictionary.instance
      words = []
      taken = 0

      while index + taken < @chars.length &&
            !basic_latin?(@chars[index + taken]) &&
            taken < Config.max_word_length
        taken += 1
        candidate = @chars[index, taken].join
        words << dic.get_word(candidate) if dic.has_word?(candidate)
      end

      if words.empty?
        words << Word.new(@chars[index], Word::TYPES[:unrecognized])
      end

      @match_cache[@match_cache_idx] = [index, words]
      # Wrap using the actual buffer size so this works with whatever
      # cache the including class allocated.
      @match_cache_idx = (@match_cache_idx + 1) % @match_cache.length

      words
    end

    # Determine whether a character is a basic latin character,
    # i.e. whether it is encoded in exactly one byte.
    def basic_latin?(char)
      (char.respond_to?(:bytesize) ? char.bytesize : char.length) == 1
    end

    # Determine whether a character can NOT be part of a basic latin
    # word. Returns the match position (truthy) or +nil+.
    def nonword_char?(char)
      NONWORD_CHAR_RE =~ char
    end

    private

    # True for a one-byte character that is not a word character.
    def latin_separator?(char)
      basic_latin?(char) && nonword_char?(char)
    end
  end
end
@@ -0,0 +1,4 @@
1
module RMMSeg
  # Raised by ComplexAlgorithm#get_cjk_word when more than one chunk
  # survives all rules and Config.on_ambiguity is +:raise_exception+.
  #
  # Inherits from StandardError (not Exception) so that a plain
  # +rescue+ clause catches it.
  class Ambiguity < StandardError
  end
end
@@ -0,0 +1,41 @@
1
module RMMSeg
  # A Chunk holds one or more successive Word . The helpers below
  # take a chunk as a plain array of word objects responding to
  # +length+ (and, for degree_of_morphemic_freedom, +type+ and
  # +frequency+).
  module Chunk

    # The sum of length of all words.
    def self.total_length(words)
      words.inject(0) { |sum, word| sum + word.length }
    end

    # The average length of words, as a Float. NaN for an empty chunk.
    def self.average_length(words)
      total_length(words).to_f / words.size
    end

    # The variance (square of the standard deviation) of the word
    # lengths: the mean of the squared deviations from the average
    # length. Returns 0.0 for an empty chunk.
    def self.variance(words)
      return 0.0 if words.empty?

      mean = average_length(words)
      squared = words.inject(0.0) do |sum, word|
        deviation = word.length - mean
        sum + deviation * deviation
      end
      squared / words.size
    end

    # The sum of all frequencies of one-character CJK words. A word
    # without frequency information counts as 0.
    def self.degree_of_morphemic_freedom(words)
      words.inject(0) do |sum, word|
        if word.length == 1 && word.type == Word::TYPES[:cjk_word]
          sum + (word.frequency || 0)
        else
          sum
        end
      end
    end
  end
end
@@ -0,0 +1,122 @@
1
+ require 'rmmseg/algorithm'
2
+ require 'rmmseg/mm_rule'
3
+ require 'rmmseg/lawl_rule'
4
+ require 'rmmseg/svwl_rule'
5
+ require 'rmmseg/lsdmfocw_rule'
6
+
7
module RMMSeg
  # Segmentation based on three-word chunks: every candidate chunk
  # starting at the current position is generated, then the rules are
  # applied one after another until a single chunk remains.
  class ComplexAlgorithm
    # Number of slots in the find_match_words ring buffer.
    MATCH_CACHE_MAX_LENGTH = 3

    include Algorithm

    # Create a new ComplexAlgorithm . The rules applied, in order,
    # are MMRule , LAWLRule , SVWLRule and LSDMFOCWRule .
    def initialize(text, token=Token)
      super
      @rules = [MMRule, LAWLRule, SVWLRule, LSDMFOCWRule]
      @match_cache = Array.new(MATCH_CACHE_MAX_LENGTH)
      @match_cache_idx = 0
    end

    # Get the most proper CJK word at the current position and
    # advance past it. Raises Ambiguity when several chunks remain
    # after all rules and Config.on_ambiguity is +:raise_exception+ ;
    # otherwise the first remaining chunk wins.
    def get_cjk_word
      chunks = create_chunks
      @rules.each do |rule|
        break if chunks.length < 2
        chunks = rule.filter(chunks)
      end

      if chunks.length > 1 && Config.on_ambiguity == :raise_exception
        raise Ambiguity, "Can't solve ambiguity on #{chunks}"
      end

      # Only the first word of the winning chunk is emitted; the rest
      # is re-examined on the next call.
      word = chunks[0][0]
      token = @token.new(word.text, @byte_index, @byte_index+word.byte_size)

      @index += word.length
      @byte_index += word.byte_size

      token
    end

    # Create all possible three-word (or less) chunks
    # starting from +@index+ . A chunk is shortened when the text
    # ends early or when its third word is unrecognized.
    def create_chunks
      total = @chars.length
      chunks = []

      find_match_words(@index).each do |w0|
        index0 = @index + w0.length
        if index0 == total
          chunks << [w0]
          next
        end
        next unless index0 < total

        find_match_words(index0).each do |w1|
          index1 = index0 + w1.length
          if index1 == total
            chunks << [w0, w1]
          elsif index1 < total
            find_match_words(index1).each do |w2|
              chunks << if w2.type == Word::TYPES[:unrecognized]
                          [w0, w1]
                        else
                          [w0, w1, w2]
                        end
            end
          end
        end
      end

      chunks
    end

    # Find all words occurring in the dictionary starting from
    # +index+ . The maximum word length is determined by
    # +Config.max_word_length+ . Falls back to a single unrecognized
    # one-character Word. Results are remembered in a ring buffer of
    # MATCH_CACHE_MAX_LENGTH entries.
    def find_match_words(index)
      @match_cache.each do |cached_index, cached_words|
        return cached_words if cached_index == index
      end

      dic = Dictionary.instance
      words = []
      prefix = String.new
      taken = 0

      while index + taken < @chars.length &&
            !basic_latin?(@chars[index + taken]) &&
            taken < Config.max_word_length
        prefix << @chars[index + taken]
        taken += 1
        words << dic.get_word(prefix) if dic.has_word?(prefix)
      end

      words << Word.new(@chars[index], Word::TYPES[:unrecognized]) if words.empty?

      @match_cache[@match_cache_idx] = [index, words]
      @match_cache_idx += 1
      @match_cache_idx = 0 if @match_cache_idx == MATCH_CACHE_MAX_LENGTH

      words
    end

  end
end
@@ -0,0 +1,62 @@
1
+ require 'rmmseg/simple_algorithm'
2
+ require 'rmmseg/complex_algorithm'
3
+
4
module RMMSeg
  # Configurations of RMMSeg. All settings live in class-level
  # instance variables and are accessed through class methods.
  class Config
    @algorithm = :complex
    @on_ambiguity = :select_first
    data_dir = File.join(File.dirname(__FILE__), "..", "..", "data")
    @dictionaries = [["chars.dic", true], ["words.dic", false]].map do |name, has_freq|
      [File.join(data_dir, name), has_freq]
    end
    @max_word_length = 4

    class << self
      # The name of the algorithm currently in use.
      attr_reader :algorithm

      # The configured behavior for an unresolved ambiguity.
      attr_reader :on_ambiguity

      # An array of dictionary files. Each element should be of the
      # form: [file, whether_dic_include_frequency_info]. Set this
      # before the dictionaries are loaded (they are loaded lazily,
      # on first use); otherwise call Dictionary.instance.reload
      # manually to pick up the change.
      attr_accessor :dictionaries

      # The maximum length of a CJK word. The default value is 4. Making
      # this value too large might slow down the segment operations.
      attr_accessor :max_word_length

      # Set the algorithm name used to segment. Valid values are
      # +:complex+ (the default) and +:simple+ . Raises ArgumentError
      # for anything else.
      def algorithm=(algor)
        raise ArgumentError, "Unknown algorithm #{algor}" unless [:complex, :simple].include?(algor)
        @algorithm = algor
      end

      # Get an instance of the algorithm class matching the configured
      # name, constructed for +text+ . +tok+ is the class of the token
      # object to be returned. For example, to use with Ferret, pass
      # +::Ferret::Analysis::Token+ .
      def algorithm_instance(text, tok=Token)
        class_name = "#{@algorithm}".capitalize + "Algorithm"
        RMMSeg.const_get(class_name).new(text, tok)
      end

      # Set the behavior on an unresolved ambiguity. Valid values are
      # +:raise_exception+ and +:select_first+ (the default). Raises
      # ArgumentError for anything else.
      def on_ambiguity=(behavior)
        known = [:raise_exception, :select_first]
        raise ArgumentError, "Unknown behavior on ambiguity: #{behavior}" unless known.include?(behavior)
        @on_ambiguity = behavior
      end
    end
  end
end
@@ -0,0 +1,80 @@
1
+ require 'singleton'
2
+
3
module RMMSeg
  # The dictionary is a singleton object which is lazily initialized.
  # Dictionary files hold one entry per line; both UNIX '\n' and
  # DOS '\r\n' line breaks are accepted, and blank lines are ignored.
  class Dictionary
    include Singleton

    # Initialize and load dictionaries from files specified by
    # +Config.dictionaries+ .
    def initialize
      load_dictionaries
    end

    # Determine whether +value+ is a word in the dictionary.
    def has_word?(value)
      @dic.has_key?(value)
    end

    # Store a new word to dictionary.
    # +w+ may be:
    # * an instance of Word.
    # * +true+, then this is a normal word.
    # * a String(which can be converted to a Number) or Number.
    #   The number is the frequency of the word.
    def store_word(key, w=true)
      @dic[key] = w
    end

    # Get an instance of Word corresponding to +value+ , or +nil+ when
    # +value+ is not in the dictionary. Word objects are constructed
    # lazily on first access and then kept in the table.
    def get_word(value)
      entry = @dic[value]
      case entry
      when true
        entry = @dic[value] = Word.new(value.dup, Word::TYPES[:cjk_word])
      when String, Numeric
        # A raw frequency, either as read from a file (String) or as
        # given to store_word (Number).
        entry = @dic[value] = Word.new(value.dup, Word::TYPES[:cjk_word], entry.to_i)
      end
      entry
    end

    # Reload all dictionary files.
    def reload
      @dic = nil
      load_dictionaries
    end

    private

    # Rebuild the word table from every file in Config.dictionaries.
    def load_dictionaries
      @dic = Hash.new
      Config.dictionaries.each do |file, has_freq|
        if has_freq
          load_dictionary_with_freq(file)
        else
          load_dictionary(file)
        end
      end
    end

    # Load a file whose lines are "<word> <frequency>". A line without
    # a frequency is stored as a plain word.
    def load_dictionary_with_freq(file)
      File.open(file, "r") do |f|
        f.each_line do |line|
          text, freq = line.split(" ")
          next if text.nil? # blank line
          @dic[text] = freq || true
        end
      end
    end

    # Load a file holding one word per line.
    def load_dictionary(file)
      File.open(file, "r") do |f|
        f.each_line do |line|
          # chomp (rather than dropping the last byte) keeps the final
          # word intact when the file has no trailing newline and also
          # strips a DOS "\r\n".
          word = line.chomp
          next if word.empty?
          @dic[word] = true
        end
      end
    end
  end
end
@@ -0,0 +1,109 @@
1
+ # This file integrate RMMSeg with Ferret
2
+ require 'singleton'
3
+ require 'rubygems'
4
+ require 'ferret'
5
+ require 'rmmseg'
6
+
7
module RMMSeg
  module Ferret
    # The Analyzer class can be used with Ferret .
    class Analyzer < ::Ferret::Analysis::Analyzer

      # Construct an Analyzer. An optional block receives the token
      # stream and may wrap it in more +TokenFilter+s; its return
      # value becomes the stream handed back by token_stream. e.g.
      #
      #   analyzer = RMMSeg::Ferret::Analyzer.new { |tokenizer|
      #     Ferret::Analysis::LowerCaseFilter.new(tokenizer)
      #   }
      #
      def initialize(&brk)
        @brk = brk
      end

      # Build the token stream for +text+ : a Tokenizer wrapped in a
      # PunctuationFilter, optionally passed through the block given
      # to the constructor. +field+ is not used.
      def token_stream(field, text)
        stream = PunctuationFilter.new(Tokenizer.new(text))
        @brk ? @brk.call(stream) : stream
      end
    end

    # The Tokenizer tokenizes text with the algorithm configured in
    # RMMSeg::Config.
    class Tokenizer < ::Ferret::Analysis::TokenStream
      # Create a new Tokenizer to tokenize +str+
      def initialize(str)
        self.text = str
      end

      # Get next token
      def next
        @algor.next_token
      end

      # Get the text being tokenized
      def text
        @text
      end

      # Set the text to be tokenized. This also creates a fresh
      # algorithm instance producing +::Ferret::Analysis::Token+ objects.
      def text=(str)
        @text = str
        @algor = RMMSeg::Config.algorithm_instance(@text, ::Ferret::Analysis::Token)
      end
    end

    # PunctuationFilter filters out the stand alone Chinese
    # punctuation tokens.
    class PunctuationFilter < ::Ferret::Analysis::TokenStream
      # The punctuation dictionary, loaded once from DIC_FILE.
      class Dictionary
        include Singleton

        DIC_FILE = File.join(File.dirname(__FILE__),
                             "..",
                             "..",
                             "data",
                             "punctuation.dic")

        # Read every line of DIC_FILE as one punctuation entry.
        def initialize
          @dic = Hash.new
          File.open(DIC_FILE, "r") do |f|
            f.each_line do |line|
              @dic[line.chomp.freeze] = nil
            end
          end
        end

        # Whether +str+ is a known punctuation entry.
        def include?(str)
          @dic.has_key?(str)
        end
      end

      # Wrap +stream+ , the token stream to be filtered.
      def initialize(stream)
        @stream = stream
      end

      # Get next token, skip stand alone Chinese punctuations.
      # Returns +nil+ when the underlying stream is exhausted.
      def next
        dic = Dictionary.instance
        token = @stream.next
        token = @stream.next while !token.nil? && dic.include?(token.text)
        token
      end

      # The text of the underlying stream.
      def text
        @stream.text
      end

      # Replace the text of the underlying stream.
      def text=(str)
        @stream.text = str
      end
    end
  end
end
@@ -0,0 +1,12 @@
1
+ require 'rmmseg/rule_helper'
2
+
3
module RMMSeg
  # Largest average word length rule: keep the chunks whose words
  # are, on average, the longest.
  class LAWLRule
    # Returns the subset of +chunks+ with the highest
    # Chunk.average_length .
    def self.filter(chunks)
      chunks.take_highest do |left, right|
        Chunk.average_length(left) <=> Chunk.average_length(right)
      end
    end
  end
end
@@ -0,0 +1,13 @@
1
+ require 'rmmseg/rule_helper'
2
+
3
module RMMSeg
  # Largest sum of degree of morphemic freedom of one-character
  # words rule.
  class LSDMFOCWRule
    # Returns the subset of +chunks+ with the highest
    # Chunk.degree_of_morphemic_freedom .
    def self.filter(chunks)
      chunks.take_highest do |left, right|
        Chunk.degree_of_morphemic_freedom(left) <=> Chunk.degree_of_morphemic_freedom(right)
      end
    end
  end
end
@@ -0,0 +1,13 @@
1
+ require 'rmmseg/rule_helper'
2
+
3
module RMMSeg
  # Maximum matching rule, select the chunks with the
  # maximum length.
  class MMRule
    # Returns the subset of +chunks+ with the highest
    # Chunk.total_length .
    def self.filter(chunks)
      chunks.take_highest do |left, right|
        Chunk.total_length(left) <=> Chunk.total_length(right)
      end
    end
  end
end
@@ -0,0 +1,28 @@
1
class Array
  # Take the elements with the highest value. Values are compared
  # through the block, which receives two elements and must return a
  # <=>-style result. Elements that tie with the maximum are all
  # kept, in their original order. e.g
  #
  #   ["aaaa", "bb", "cccc"].take_highest { |a, b|
  #     a.length <=> b.length
  #   }
  #   # => ["aaaa", "cccc"]
  #
  # Returns an empty array when the receiver is empty.
  def take_highest
    return [] if empty?

    best = first
    winners = [best]

    each_with_index do |candidate, position|
      next if position == 0 # the first element seeded +best+

      order = yield(candidate, best)
      if order == 0
        winners << candidate
      elsif order > 0
        # A new maximum invalidates everything collected so far.
        best = candidate
        winners = [best]
      end
    end

    winners
  end
end
@@ -0,0 +1,37 @@
1
+ require 'rmmseg/algorithm'
2
+ require 'rmmseg/mm_rule'
3
+
4
module RMMSeg
  # Segmentation by forward maximum matching only.
  class SimpleAlgorithm
    include Algorithm

    # Create a new SimpleAlgorithm . This algorithm simply takes the
    # longest dictionary word at each position.
    def initialize(text, token=Token)
      super
    end

    # Get the most proper CJK word: the longest prefix of the next
    # +Config.max_word_length+ characters that is in the dictionary,
    # or the single next character when nothing matches. Advances
    # +@index+ by characters and +@byte_index+ by bytes.
    def get_cjk_word
      dic = Dictionary.instance
      count = [Config.max_word_length, @chars.length - @index].min
      chars = @chars[@index, count]
      word = chars.join

      # Drop one trailing character at a time until a dictionary word
      # (or a single character) is left.
      while count > 1 && !dic.has_word?(word)
        count -= 1
        word = chars[0, count].join
      end

      # Token offsets are byte positions, so measure the word in bytes;
      # String#size counts characters on character-aware Rubies.
      bytes = word.respond_to?(:bytesize) ? word.bytesize : word.size
      token = @token.new(word, @byte_index, @byte_index + bytes)

      @index += count
      @byte_index += bytes

      token
    end
  end
end
@@ -0,0 +1,12 @@
1
+ require 'rmmseg/rule_helper'
2
+
3
module RMMSeg
  # Smallest variance of word length rule.
  class SVWLRule
    # Returns the subset of +chunks+ with the lowest Chunk.variance .
    # The comparison is reversed so that take_highest picks the
    # smallest values.
    def self.filter(chunks)
      chunks.take_highest do |left, right|
        Chunk.variance(right) <=> Chunk.variance(left)
      end
    end
  end
end
@@ -0,0 +1,29 @@
1
module RMMSeg
  # A Token consists of a term's text and the start and end offset
  # of the term.
  class Token
    # text::  The text of the token.
    # start:: The start position of the token. This is a *byte* index,
    #         not a character index.
    # end::   One greater than the position of the last byte of the
    #         token. This is a *byte* index, not a character index.
    attr_accessor :text, :start, :end

    # +text+ is the string held by this token; +start_pos+ and
    # +end_pos+ locate it in the segmented text, i.e. the source
    # text's +start_pos...end_pos+ range should equal +text+ .
    def initialize(text, start_pos, end_pos)
      @text, @start, @end = text, start_pos, end_pos
    end

    # A copy of the token's text.
    def to_s
      @text.dup
    end
  end
end
@@ -0,0 +1,38 @@
1
module RMMSeg
  # An object representing a CJK word.
  class Word
    TYPES = {
      :unrecognized => :unrecognized,
      :basic_latin_word => :basic_latin_word,
      :cjk_word => :cjk_word
    }.freeze

    # The content text of the word.
    attr_reader :text

    # The type of the word, may be one of the key of TYPES .
    attr_reader :type

    # The frequency of the word. This value is meaningful only
    # when this is a one-character word.
    attr_reader :frequency

    # The number of characters in the word. *Not* number of bytes.
    # Computed once at construction time.
    attr_reader :length

    # Initialize a Word object.
    #
    # +text+ is the word itself, +type+ one of the values of TYPES
    # and +frequency+ an optional usage count.
    def initialize(text, type=TYPES[:unrecognized], frequency=nil)
      @text = text
      @type = type
      @frequency = frequency
      # jlength only exists when the jcode library is loaded; without
      # it String#length is used as the character count.
      @length = @text.respond_to?(:jlength) ? @text.jlength : @text.length
    end

    # The number of bytes in the word.
    def byte_size
      @text.respond_to?(:bytesize) ? @text.bytesize : @text.length
    end
  end
end
data/lib/rmmseg.rb ADDED
@@ -0,0 +1,15 @@
1
+ $KCODE = 'u'
2
+ require 'jcode'
3
+
4
+ require 'rmmseg/config'
5
+ require 'rmmseg/simple_algorithm'
6
+ require 'rmmseg/complex_algorithm'
7
+
8
module RMMSeg
  VERSION = '0.1.6'

  # Expose the module's instance methods on the module itself, so
  # +segment+ works both as RMMSeg.segment(text) and, after
  # +include RMMSeg+ , as a plain method call.
  extend self

  # Segment +text+ using the algorithm configured in Config and
  # return the result of that algorithm's +segment+ method.
  def segment(text)
    Config.algorithm_instance(text).segment
  end
end
metadata ADDED
@@ -0,0 +1,96 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: loyal_rmmseg
3
+ version: !ruby/object:Gem::Version
4
+ hash: 29
5
+ prerelease:
6
+ segments:
7
+ - 0
8
+ - 0
9
+ - 1
10
+ version: 0.0.1
11
+ platform: ruby
12
+ authors:
13
+ - happy
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2012-02-02 00:00:00 +08:00
19
+ default_executable:
20
+ dependencies:
21
+ - !ruby/object:Gem::Dependency
22
+ name: rake
23
+ prerelease: false
24
+ requirement: &id001 !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ">="
28
+ - !ruby/object:Gem::Version
29
+ hash: 3
30
+ segments:
31
+ - 0
32
+ version: "0"
33
+ type: :runtime
34
+ version_requirements: *id001
35
+ description: Chinese Seg.
36
+ email: happy@doc5.com
37
+ executables: []
38
+
39
+ extensions: []
40
+
41
+ extra_rdoc_files: []
42
+
43
+ files:
44
+ - README.txt
45
+ - lib/rmmseg.rb
46
+ - lib/rmmseg/word.rb
47
+ - lib/rmmseg/algorithm.rb
48
+ - lib/rmmseg/rule_helper.rb
49
+ - lib/rmmseg/amibguity.rb
50
+ - lib/rmmseg/lawl_rule.rb
51
+ - lib/rmmseg/chunk.rb
52
+ - lib/rmmseg/config.rb
53
+ - lib/rmmseg/lsdmfocw_rule.rb
54
+ - lib/rmmseg/simple_algorithm.rb
55
+ - lib/rmmseg/complex_algorithm.rb
56
+ - lib/rmmseg/token.rb
57
+ - lib/rmmseg/ferret.rb
58
+ - lib/rmmseg/dictionary.rb
59
+ - lib/rmmseg/mm_rule.rb
60
+ - lib/rmmseg/svwl_rule.rb
61
+ has_rdoc: true
62
+ homepage: http://www.doc5.com
63
+ licenses:
64
+ - MIT
65
+ post_install_message:
66
+ rdoc_options: []
67
+
68
+ require_paths:
69
+ - lib
70
+ required_ruby_version: !ruby/object:Gem::Requirement
71
+ none: false
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ hash: 3
76
+ segments:
77
+ - 0
78
+ version: "0"
79
+ required_rubygems_version: !ruby/object:Gem::Requirement
80
+ none: false
81
+ requirements:
82
+ - - ">="
83
+ - !ruby/object:Gem::Version
84
+ hash: 3
85
+ segments:
86
+ - 0
87
+ version: "0"
88
+ requirements: []
89
+
90
+ rubyforge_project:
91
+ rubygems_version: 1.4.2
92
+ signing_key:
93
+ specification_version: 3
94
+ summary: Nice Chinese Seg.
95
+ test_files: []
96
+