loyal_rmmseg 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.txt +74 -0
- data/lib/rmmseg/algorithm.rb +138 -0
- data/lib/rmmseg/amibguity.rb +4 -0
- data/lib/rmmseg/chunk.rb +41 -0
- data/lib/rmmseg/complex_algorithm.rb +122 -0
- data/lib/rmmseg/config.rb +62 -0
- data/lib/rmmseg/dictionary.rb +80 -0
- data/lib/rmmseg/ferret.rb +109 -0
- data/lib/rmmseg/lawl_rule.rb +12 -0
- data/lib/rmmseg/lsdmfocw_rule.rb +13 -0
- data/lib/rmmseg/mm_rule.rb +13 -0
- data/lib/rmmseg/rule_helper.rb +28 -0
- data/lib/rmmseg/simple_algorithm.rb +37 -0
- data/lib/rmmseg/svwl_rule.rb +12 -0
- data/lib/rmmseg/token.rb +29 -0
- data/lib/rmmseg/word.rb +38 -0
- data/lib/rmmseg.rb +15 -0
- metadata +96 -0
data/README.txt
ADDED
@@ -0,0 +1,74 @@
|
|
1
|
+
= rmmseg
|
2
|
+
by pluskid
|
3
|
+
http://rmmseg.rubyforge.org
|
4
|
+
|
5
|
+
== DESCRIPTION:
|
6
|
+
|
7
|
+
RMMSeg is an implementation of MMSEG Chinese word segmentation
|
8
|
+
algorithm. It is based on two variants of maximum matching
|
9
|
+
algorithms. Two algorithms are available:
|
10
|
+
|
11
|
+
* simple algorithm that uses only forward maximum matching.
|
12
|
+
* complex algorithm that uses three-word chunk maximum matching and 3
|
13
|
+
additional rules to resolve ambiguities.
|
14
|
+
|
15
|
+
For more information about the algorithm, please refer to the
|
16
|
+
following essays:
|
17
|
+
|
18
|
+
* http://technology.chtsai.org/mmseg/
|
19
|
+
* http://pluskid.lifegoo.com/?p=261
|
20
|
+
|
21
|
+
== FEATURES/PROBLEMS:
|
22
|
+
|
23
|
+
* Provides +rmmseg+ command line tool for quick and easy way to access
|
24
|
+
the word segment feature.
|
25
|
+
* Provides an +Analyzer+ for integrating with Ferret[http://ferret.davebalmain.com/trac].
|
26
|
+
|
27
|
+
== SYNOPSIS:
|
28
|
+
|
29
|
+
Using the command line tool +rmmseg+ is simple:
|
30
|
+
$ rmmseg --separator _ < input.txt
|
31
|
+
Passing the +-h+ option gives an overview of all supported options.
|
32
|
+
|
33
|
+
Using the +Analyzer+ for Ferret is even easier:
|
34
|
+
|
35
|
+
require 'rmmseg'
|
36
|
+
require 'rmmseg/ferret'
|
37
|
+
|
38
|
+
analyzer = RMMSeg::Ferret::Analyzer.new
|
39
|
+
index = Ferret::Index::Index.new(:analyzer => analyzer)
|
40
|
+
|
41
|
+
For more details, please refer to the {homepage usage section}[http://rmmseg.rubyforge.org/index.html#Usage].
|
42
|
+
|
43
|
+
== REQUIREMENTS:
|
44
|
+
|
45
|
+
* ruby
|
46
|
+
|
47
|
+
== INSTALL:
|
48
|
+
|
49
|
+
* sudo gem install rmmseg
|
50
|
+
|
51
|
+
== LICENSE:
|
52
|
+
|
53
|
+
(The MIT License)
|
54
|
+
|
55
|
+
Copyright (c) 2008 FIX
|
56
|
+
|
57
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
58
|
+
a copy of this software and associated documentation files (the
|
59
|
+
'Software'), to deal in the Software without restriction, including
|
60
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
61
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
62
|
+
permit persons to whom the Software is furnished to do so, subject to
|
63
|
+
the following conditions:
|
64
|
+
|
65
|
+
The above copyright notice and this permission notice shall be
|
66
|
+
included in all copies or substantial portions of the Software.
|
67
|
+
|
68
|
+
THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
|
69
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
70
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
71
|
+
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
72
|
+
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
73
|
+
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
74
|
+
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
@@ -0,0 +1,138 @@
|
|
1
|
+
require 'jcode'
|
2
|
+
require 'rmmseg/dictionary'
|
3
|
+
require 'rmmseg/word'
|
4
|
+
require 'rmmseg/chunk'
|
5
|
+
require 'rmmseg/token'
|
6
|
+
|
7
|
+
module RMMSeg
|
8
|
+
# An algorithm can segment a piece of text into an array of
|
9
|
+
# words. This module is the common operations shared by
|
10
|
+
# SimpleAlgorithm and ComplexAlgorithm .
|
11
|
+
module Algorithm
|
12
|
+
# Prepare the segmentation state for +text+.
#
# +text+  :: the string that this instance will segment.
# +token+ :: the class used to build every result token; it is
#            instantiated as <tt>token.new(text, start, end)</tt>.
def initialize(text, token=Token)
  @text, @token = text, token
  # NOTE(review): the rest of this module calls #length and #[] on
  # @chars, so this relies on each_char (this file requires 'jcode')
  # returning an indexable collection when called without a block --
  # confirm for the Ruby version in use.
  @chars = text.each_char
  # @index counts characters in @chars, @byte_index tracks the
  # matching offset in @text; both start at the beginning.
  @index = @byte_index = 0
end
|
22
|
+
|
23
|
+
# Return the next non-empty token, or +nil+ once every character has
# been consumed.
#
# A basic latin character at the current position is handled by
# #get_basic_latin_word, anything else by #get_cjk_word.  Tokens whose
# start and end offsets are equal are empty and are skipped.
def next_token
  while @index < @chars.length
    token = if basic_latin?(@chars[@index])
              get_basic_latin_word
            else
              get_cjk_word
            end
    return token unless token.start == token.end
  end

  nil
end
|
39
|
+
|
40
|
+
# Segment the remaining text and return the words as an array of
# strings: the +text+ of every token produced by #next_token until it
# returns +nil+.
def segment
  words = []
  until (token = next_token).nil?
    words << token.text
  end
  words
end
|
53
|
+
|
54
|
+
# Extract one basic latin word starting at the current position.
#
# Leading non-word characters are skipped, then word characters are
# consumed, then trailing non-word characters are skipped as well; the
# scan never crosses a character that is not basic latin.  The
# returned token holds the word and its start/end offsets in +@text+.
# When no word character is found the token is empty (start == end).
#
# Both +@index+ and +@byte_index+ are advanced by the number of
# characters scanned.
def get_basic_latin_word
  limit = @chars.length
  # True while +pos+ designates a basic latin character that cannot be
  # part of a word.
  separator_at = lambda { |pos|
    pos < limit && basic_latin?(@chars[pos]) && nonword_char?(@chars[pos])
  }

  cursor = @index
  cursor += 1 while separator_at.call(cursor)
  start_pos = @byte_index + (cursor - @index)

  cursor += 1 while cursor < limit &&
                    basic_latin?(@chars[cursor]) &&
                    !nonword_char?(@chars[cursor])
  end_pos = @byte_index + (cursor - @index)

  cursor += 1 while separator_at.call(cursor)

  @byte_index += cursor - @index
  @index = cursor

  @token.new(@text[start_pos...end_pos], start_pos, end_pos)
end
|
85
|
+
|
86
|
+
# Find all words occurring in the dictionary that start at character
# position +index+, shortest first.  At most +Config.max_word_length+
# characters are examined and the scan stops at the first basic latin
# character.
#
# When nothing matches, the single character at +index+ is returned
# wrapped in a Word of type +:unrecognized+, so the result is never
# empty.  Results are memoized in a small ring buffer keyed by
# +index+.
#
# The cache is created on first use, and its size is read from the
# including class, so this shared implementation also works for
# classes that do not set the cache up themselves.
def find_match_words(index)
  @match_cache ||= []
  @match_cache_idx ||= 0

  # Unused cache slots are nil and destructure to (nil, nil), which
  # never equals an index.
  @match_cache.each do |cached_index, cached_words|
    return cached_words if cached_index == index
  end

  dic = Dictionary.instance
  str = String.new
  strlen = 0
  words = []
  i = index

  while i < @chars.length &&
        !basic_latin?(@chars[i]) &&
        strlen < Config.max_word_length
    str << @chars[i]
    strlen += 1
    words << dic.get_word(str) if dic.has_word?(str)
    i += 1
  end

  if words.empty?
    words << Word.new(@chars[index], Word::TYPES[:unrecognized])
  end

  # MATCH_CACHE_MAX_LENGTH is declared by the including class
  # (ComplexAlgorithm), so it is not lexically visible from this
  # module; look it up on the receiver's class and fall back to 3, the
  # value ComplexAlgorithm uses.
  cache_size = if self.class.const_defined?(:MATCH_CACHE_MAX_LENGTH)
                 self.class::MATCH_CACHE_MAX_LENGTH
               else
                 3
               end
  @match_cache[@match_cache_idx] = [index, words]
  @match_cache_idx = (@match_cache_idx + 1) % cache_size

  words
end
|
125
|
+
|
126
|
+
# Tell whether +char+ is a basic latin character, i.e. whether its
# length is exactly 1.
# NOTE(review): this presumably relies on String#length counting bytes
# (Ruby 1.8 semantics, with multi-byte CJK characters being longer
# than 1) -- confirm for the Ruby version in use.
def basic_latin?(char)
  char.size == 1
end
|
130
|
+
|
131
|
+
# Pattern matching a line made of a single non-word (+\W+) character.
NONWORD_CHAR_RE = /^\W$/

# Tell whether +char+ can NOT be part of a basic latin word.  Returns
# the match position (truthy) when +char+ matches NONWORD_CHAR_RE and
# +nil+ otherwise.
def nonword_char?(char)
  char =~ NONWORD_CHAR_RE
end
|
137
|
+
end
|
138
|
+
end
|
data/lib/rmmseg/chunk.rb
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
module RMMSeg
  # Metrics computed over a chunk, i.e. an array of one or more
  # successive Word objects.
  module Chunk

    # Sum of the lengths of all +words+ (0 for an empty array).
    def self.total_length(words)
      words.inject(0) { |sum, word| sum + word.length }
    end

    # Average length of +words+, as a Float.
    def self.average_length(words)
      total_length(words).to_f / words.size
    end

    # Spread of the word lengths around their average.
    #
    # Despite its name, the value returned is the square root of the
    # sum of squared deviations from the average length; it is not
    # divided by the number of words.
    def self.variance(words)
      mean = average_length(words)
      squares = words.inject(0.0) { |sum, word|
        delta = word.length - mean
        sum + delta * delta
      }
      Math.sqrt(squares)
    end

    # Sum of the frequencies of the words that are one character long
    # and of type +:cjk_word+.
    def self.degree_of_morphemic_freedom(words)
      words.inject(0) { |sum, word|
        if word.length == 1 && word.type == Word::TYPES[:cjk_word]
          sum + word.frequency
        else
          sum
        end
      }
    end
  end
end
|
@@ -0,0 +1,122 @@
|
|
1
|
+
require 'rmmseg/algorithm'
|
2
|
+
require 'rmmseg/mm_rule'
|
3
|
+
require 'rmmseg/lawl_rule'
|
4
|
+
require 'rmmseg/svwl_rule'
|
5
|
+
require 'rmmseg/lsdmfocw_rule'
|
6
|
+
|
7
|
+
module RMMSeg
|
8
|
+
class ComplexAlgorithm
|
9
|
+
MATCH_CACHE_MAX_LENGTH = 3
|
10
|
+
|
11
|
+
include Algorithm
|
12
|
+
|
13
|
+
# Create a new ComplexAlgorithm for +text+; +token+ is the class used
# to build result tokens.  The rules applied, in order, are MMRule,
# LAWLRule, SVWLRule and LSDMFOCWRule.  The match cache starts out as
# MATCH_CACHE_MAX_LENGTH empty slots.
def initialize(text, token=Token)
  super
  @rules = [MMRule, LAWLRule, SVWLRule, LSDMFOCWRule]
  @match_cache_idx = 0
  @match_cache = Array.new(MATCH_CACHE_MAX_LENGTH)
end
|
26
|
+
|
27
|
+
# Pick the most suitable CJK word at the current position and return
# it as a token.
#
# Candidate chunks come from #create_chunks and are narrowed down by
# each rule in turn until fewer than two remain.  If several chunks
# survive every rule, an Ambiguity is raised when
# +Config.on_ambiguity+ is +:raise_exception+; otherwise the first
# chunk wins.  The first word of the selected chunk becomes the token
# and the character/byte positions are advanced past it.
#
# NOTE(review): Ambiguity is not defined or required by anything
# visible alongside this method -- confirm it is loaded before the
# +:raise_exception+ behavior is used.
def get_cjk_word
  candidates = create_chunks
  @rules.each do |rule|
    break if candidates.length < 2
    candidates = rule.filter(candidates)
  end

  if candidates.length > 1 && Config.on_ambiguity == :raise_exception
    raise Ambiguity, "Can't solve ambiguity on #{candidates}"
  end

  word = candidates[0][0]
  bytes = word.byte_size
  token = @token.new(word.text, @byte_index, @byte_index + bytes)

  @index += word.length
  @byte_index += bytes

  token
end
|
51
|
+
|
52
|
+
# Build every possible chunk of up to three words starting from
# +@index+ .
#
# A chunk is shorter than three words when the text ends after its
# first or second word, or when the third candidate is of type
# +:unrecognized+ (in which case only the first two words are kept).
def create_chunks
  total = @chars.length
  chunks = []

  find_match_words(@index).each do |w0|
    index0 = @index + w0.length
    if index0 == total
      chunks << [w0]
    elsif index0 < total
      find_match_words(index0).each do |w1|
        index1 = index0 + w1.length
        if index1 == total
          chunks << [w0, w1]
        elsif index1 < total
          find_match_words(index1).each do |w2|
            chunks << (w2.type == Word::TYPES[:unrecognized] ? [w0, w1] : [w0, w1, w2])
          end
        end
      end
    end
  end

  chunks
end
|
80
|
+
|
81
|
+
# Find all words occurring in the dictionary that start at character
# position +index+, shortest first.  At most +Config.max_word_length+
# characters are examined and the scan stops at the first basic latin
# character.
#
# When nothing matches, the single character at +index+ is returned
# wrapped in a Word of type +:unrecognized+ .  Results are memoized in
# a ring buffer of MATCH_CACHE_MAX_LENGTH entries keyed by +index+ .
def find_match_words(index)
  # Unused cache slots are nil and destructure to (nil, nil).
  @match_cache.each do |cached_index, cached_words|
    return cached_words if cached_index == index
  end

  dictionary = Dictionary.instance
  prefix = String.new
  taken = 0
  matches = []
  pos = index

  while pos < @chars.length &&
        !basic_latin?(@chars[pos]) &&
        taken < Config.max_word_length
    prefix << @chars[pos]
    taken += 1
    matches << dictionary.get_word(prefix) if dictionary.has_word?(prefix)
    pos += 1
  end

  matches << Word.new(@chars[index], Word::TYPES[:unrecognized]) if matches.empty?

  # Store the result and move to the next slot, wrapping around.
  @match_cache[@match_cache_idx] = [index, matches]
  @match_cache_idx = (@match_cache_idx + 1) % MATCH_CACHE_MAX_LENGTH

  matches
end
|
120
|
+
|
121
|
+
end
|
122
|
+
end
|
@@ -0,0 +1,62 @@
|
|
1
|
+
require 'rmmseg/simple_algorithm'
|
2
|
+
require 'rmmseg/complex_algorithm'
|
3
|
+
|
4
|
+
module RMMSeg
  # Configuration of RMMSeg.  Every setting is stored on the class
  # itself and accessed through class-level methods.
  class Config
    @algorithm       = :complex
    @on_ambiguity    = :select_first
    @max_word_length = 4

    base = File.join(File.dirname(__FILE__), "..", "..", "data")
    @dictionaries = [["chars.dic", true], ["words.dic", false]].map { |name, has_freq|
      [File.join(base, name), has_freq]
    }

    class << self
      # The name of the algorithm currently in use.
      attr_reader :algorithm

      # The behavior applied when an ambiguity cannot be resolved.
      attr_reader :on_ambiguity

      # An array of dictionary files. Each element should be of the
      # form: [file, whether_dic_include_frequency_info]. Set this
      # before the dictionaries are loaded (they are loaded only when
      # first used); otherwise call Dictionary.instance.reload
      # manually to reload the dictionaries.
      attr_accessor :dictionaries

      # The maximum length of a CJK word. The default value is 4.
      # Making this value too large might slow down segmentation.
      attr_accessor :max_word_length

      # Select the algorithm used to segment. Valid values are
      # +:complex+ (the default) and +:simple+ ; anything else raises
      # ArgumentError.
      def algorithm=(algor)
        raise ArgumentError, "Unknown algorithm #{algor}" unless [:complex, :simple].include?(algor)
        @algorithm = algor
      end

      # Build an instance of the configured algorithm for +text+ .
      # +tok+ is the class of the token objects to be returned. For
      # example, to use RMMSeg with Ferret, pass
      # +::Ferret::Analysis::Token+ .
      def algorithm_instance(text, tok=Token)
        klass = RMMSeg.const_get("#{@algorithm.to_s.capitalize}Algorithm")
        klass.new(text, tok)
      end

      # Select the behavior on an unresolved ambiguity. Valid values
      # are +:raise_exception+ and +:select_first+ (the default);
      # anything else raises ArgumentError.
      def on_ambiguity=(behavior)
        unless [:raise_exception, :select_first].include?(behavior)
          raise ArgumentError, "Unknown behavior on ambiguity: #{behavior}"
        end
        @on_ambiguity = behavior
      end
    end
  end
end
|
@@ -0,0 +1,80 @@
|
|
1
|
+
require 'singleton'
|
2
|
+
|
3
|
+
module RMMSeg
  # The dictionary is a singleton object which is lazily initialized:
  # the files listed in +Config.dictionaries+ are read when the
  # instance is first requested.
  #
  # Internally every entry maps the word text to either a Word, +true+
  # (a word without frequency information) or the frequency read from
  # the file; Word objects are only built when a word is first
  # requested through #get_word.
  class Dictionary
    include Singleton

    # Initialize and load dictionaries from the files specified by
    # +Config.dictionaries+ .
    def initialize
      load_dictionaries
    end

    # Determine whether +value+ is a word in the dictionary.
    def has_word?(value)
      @dic.has_key?(value)
    end

    # Store a new word in the dictionary.
    # +w+ may be:
    # * an instance of Word.
    # * +true+, then this is a normal word.
    # * a String (which can be converted to a number) or a Numeric,
    #   which is the frequency of the word.
    def store_word(key, w=true)
      @dic[key] = w
    end

    # Get the Word corresponding to +value+, or +nil+ when +value+ is
    # not in the dictionary.  The Word is constructed on first access
    # and cached for later calls.
    def get_word(value)
      entry = @dic[value]
      case entry
      when true
        @dic[value] = Word.new(value.dup, Word::TYPES[:cjk_word])
      when String, Numeric
        # The stored value is the frequency of the word.
        @dic[value] = Word.new(value.dup, Word::TYPES[:cjk_word], entry.to_i)
      else
        entry
      end
    end

    # Reload all dictionary files, discarding the current content.
    def reload
      load_dictionaries
    end

    private

    # Replace the current content with the content of every file in
    # +Config.dictionaries+ ; each element is [file, has_frequency].
    def load_dictionaries
      @dic = Hash.new
      Config.dictionaries.each { |file, has_freq|
        if has_freq
          load_dictionary_with_freq(file)
        else
          load_dictionary(file)
        end
      }
    end

    # Load a file whose lines have the form "word frequency".  Blank
    # lines are ignored; a line without a frequency column is stored
    # as a plain word so that #get_word never returns +nil+ for an
    # entry that #has_word? reports.
    def load_dictionary_with_freq(file)
      File.open(file, "r") { |f|
        f.each_line { |line|
          word, freq = line.split(" ")
          next if word.nil?
          @dic[word] = freq || true
        }
      }
    end

    # Load a file holding one word per line.  Only the line terminator
    # is stripped, so the last word is kept intact even when the file
    # does not end with a line break; blank lines are ignored.
    def load_dictionary(file)
      File.open(file, "r") { |f|
        f.each_line { |line|
          word = line.chomp
          next if word.empty?
          @dic[word] = true
        }
      }
    end
  end
end
|
@@ -0,0 +1,109 @@
|
|
1
|
+
# This file integrates RMMSeg with Ferret
|
2
|
+
require 'singleton'
|
3
|
+
require 'rubygems'
|
4
|
+
require 'ferret'
|
5
|
+
require 'rmmseg'
|
6
|
+
|
7
|
+
module RMMSeg
|
8
|
+
module Ferret
|
9
|
+
# An Analyzer that can be used with Ferret; its token streams are
# produced by RMMSeg.
class Analyzer < ::Ferret::Analysis::Analyzer

  # Construct an Analyzer. The optional block receives the default
  # token stream and returns the stream to use instead, which makes
  # it possible to add more +TokenFilter+s. e.g.
  #
  #   analyzer = RMMSeg::Ferret::Analyzer.new { |tokenizer|
  #     Ferret::Analysis::LowerCaseFilter.new(tokenizer)
  #   }
  #
  def initialize(&brk)
    @brk = brk
  end

  # Build the token stream for +text+: a Tokenizer wrapped in a
  # PunctuationFilter, passed through the block given to #new when
  # there is one. +field+ is not used.
  def token_stream(field, text)
    stream = PunctuationFilter.new(Tokenizer.new(text))
    @brk ? @brk.call(stream) : stream
  end
end
|
32
|
+
|
33
|
+
# The Tokenizer tokenizes text with the algorithm configured in
# RMMSeg::Config, producing +::Ferret::Analysis::Token+ objects.
class Tokenizer < ::Ferret::Analysis::TokenStream
  # The text being tokenized
  attr_reader :text

  # Create a new Tokenizer to tokenize +str+
  def initialize(str)
    self.text = str
  end

  # Get the next token from the underlying algorithm
  def next
    @algor.next_token
  end

  # Set the text to be tokenized; a fresh algorithm instance is
  # created for it.
  def text=(str)
    @text = str
    @algor = RMMSeg::Config.algorithm_instance(@text, ::Ferret::Analysis::Token)
  end
end
|
57
|
+
|
58
|
+
# PunctuationFilter filters out the tokens of the wrapped stream
# whose text is an entry of the punctuation dictionary.
class PunctuationFilter < ::Ferret::Analysis::TokenStream
  # The punctuation dictionary, loaded once from DIC_FILE (one entry
  # per line).
  class Dictionary
    include Singleton

    DIC_FILE = File.join(File.dirname(__FILE__),
                         "..",
                         "..",
                         "data",
                         "punctuation.dic")

    def initialize
      @dic = Hash.new
      File.open(DIC_FILE, "r") { |f|
        # Only the keys matter; the line break is stripped from each.
        f.each_line do |line|
          @dic[line.chomp.freeze] = nil
        end
      }
    end

    # Tell whether +str+ is an entry of the dictionary.
    def include?(str)
      @dic.has_key?(str)
    end
  end

  # Wrap +stream+, the token stream to be filtered.
  def initialize(stream)
    @stream = stream
  end

  # Get the next token that is not a punctuation entry, or +nil+
  # when the wrapped stream is exhausted.
  def next
    token = @stream.next
    punctuation = Dictionary.instance

    while !token.nil? && punctuation.include?(token.text)
      token = @stream.next
    end

    token
  end

  # The text of the wrapped stream.
  def text
    @stream.text
  end

  # Set the text of the wrapped stream.
  def text=(str)
    @stream.text = str
  end
end
|
108
|
+
end
|
109
|
+
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
require 'rmmseg/rule_helper'
|
2
|
+
|
3
|
+
module RMMSeg
  # Largest sum of degree of morphemic freedom of one-character
  # words rule: keep the chunks for which
  # Chunk.degree_of_morphemic_freedom is the highest.
  class LSDMFOCWRule
    def self.filter(chunks)
      chunks.take_highest do |left, right|
        Chunk.degree_of_morphemic_freedom(left) <=> Chunk.degree_of_morphemic_freedom(right)
      end
    end
  end
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
require 'rmmseg/rule_helper'
|
2
|
+
|
3
|
+
module RMMSeg
  # Maximum matching rule: keep the chunks whose total length
  # (Chunk.total_length) is the largest.
  class MMRule
    def self.filter(chunks)
      chunks.take_highest do |left, right|
        Chunk.total_length(left) <=> Chunk.total_length(right)
      end
    end
  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
class Array
  # Return the elements that rank highest, in their original order.
  # Two elements are ranked by the block, which receives (a, b) and
  # returns a negative number, zero or a positive number, like
  # <tt><=></tt>. e.g
  #
  #   ["aaaa", "bb", "cccc"].take_highest { |a, b|
  #     a.length <=> b.length
  #   }
  #   # => ["aaaa", "cccc"]
  #
  # An empty array yields an empty array.
  def take_highest
    return [] if empty?

    best = first
    winners = [best]

    self[1..-1].each do |candidate|
      order = yield(candidate, best)
      if order == 0
        # Ties with the current best are kept alongside it.
        winners << candidate
      elsif order > 0
        # A strictly better element replaces everything kept so far.
        best = candidate
        winners = [candidate]
      end
    end

    winners
  end
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
require 'rmmseg/algorithm'
|
2
|
+
require 'rmmseg/mm_rule'
|
3
|
+
|
4
|
+
module RMMSeg
|
5
|
+
class SimpleAlgorithm
|
6
|
+
include Algorithm
|
7
|
+
|
8
|
+
# Create a new SimpleAlgorithm for +text+; +token+ is the class used
# to build result tokens. All the setup is done by the shared
# Algorithm#initialize, which receives both arguments unchanged.
def initialize(text, token=Token)
  super(text, token)
end
|
13
|
+
|
14
|
+
# Get the most suitable CJK word at the current position using
# forward maximum matching.
#
# The candidate starts as the next +Config.max_word_length+ characters
# (fewer near the end of the text) and loses its last character until
# it is a dictionary word or a single character remains. The result
# is returned as a token and the character/byte positions are
# advanced past it.
def get_cjk_word
  dictionary = Dictionary.instance
  count = [Config.max_word_length, @chars.length - @index].min
  window = @chars[@index, count]
  candidate = window.join

  until count <= 1 || dictionary.has_word?(candidate)
    count -= 1
    dropped = window[count].size
    candidate.slice!(-dropped, dropped) # truncate last char
  end

  width = candidate.size
  token = @token.new(candidate, @byte_index, @byte_index + width)

  @index += count
  @byte_index += width

  token
end
|
36
|
+
end
|
37
|
+
end
|
data/lib/rmmseg/token.rb
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
module RMMSeg
  # A Token consists of a term's text together with the start and end
  # offsets of the term.
  class Token
    # The text of the token
    attr_accessor :text

    # The start position of the token. This is a *byte* index, not a
    # character index.
    attr_accessor :start

    # One greater than the position of the last byte of the token.
    # This is a *byte* index, not a character index.
    attr_accessor :end

    # +text+ is the text of the token, +start_pos+ and +end_pos+ are
    # its offsets: <tt>whole_text[start_pos...end_pos]</tt> should be
    # the string held by this token.
    def initialize(text, start_pos, end_pos)
      @text, @start, @end = text, start_pos, end_pos
    end

    # A copy of the token text.
    def to_s
      @text.dup
    end
  end
end
|
data/lib/rmmseg/word.rb
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
module RMMSeg
  # An object representing a word found while segmenting.
  class Word
    TYPES = {
      :unrecognized     => :unrecognized,
      :basic_latin_word => :basic_latin_word,
      :cjk_word         => :cjk_word
    }.freeze

    # The content text of the word.
    attr_reader :text

    # The type of the word, one of the keys of TYPES .
    attr_reader :type

    # The frequency of the word. This value is meaningful only
    # when this is a one-character word.
    attr_reader :frequency

    # The number of characters in the word (+jlength+ of the text,
    # computed once at construction). *Not* the number of bytes.
    attr_reader :length

    # Initialize a Word object from its +text+, +type+ (unrecognized
    # by default) and optional +frequency+ .
    def initialize(text, type=TYPES[:unrecognized], frequency=nil)
      @text, @type, @frequency = text, type, frequency
      @length = @text.jlength
    end

    # The number of bytes in the word, taken from String#length.
    # NOTE(review): String#length only counts bytes on Ruby 1.8 --
    # confirm the targeted Ruby version.
    def byte_size
      @text.length
    end
  end
end
|
data/lib/rmmseg.rb
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
$KCODE = 'u'
|
2
|
+
require 'jcode'
|
3
|
+
|
4
|
+
require 'rmmseg/config'
|
5
|
+
require 'rmmseg/simple_algorithm'
|
6
|
+
require 'rmmseg/complex_algorithm'
|
7
|
+
|
8
|
+
module RMMSeg
  VERSION = '0.1.6'

  # Make the methods below callable directly on the module
  # (<tt>RMMSeg.segment(text)</tt>) while still being available to
  # code that includes RMMSeg.
  extend self

  # Segment +text+ using the algorithm configured in Config and
  # return the result of that algorithm's +segment+ method.
  def segment(text)
    Config.algorithm_instance(text).segment
  end
end
|
metadata
ADDED
@@ -0,0 +1,96 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: loyal_rmmseg
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 29
|
5
|
+
prerelease:
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 0
|
9
|
+
- 1
|
10
|
+
version: 0.0.1
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- happy
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2012-02-02 00:00:00 +08:00
|
19
|
+
default_executable:
|
20
|
+
dependencies:
|
21
|
+
- !ruby/object:Gem::Dependency
|
22
|
+
name: rake
|
23
|
+
prerelease: false
|
24
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ">="
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
hash: 3
|
30
|
+
segments:
|
31
|
+
- 0
|
32
|
+
version: "0"
|
33
|
+
type: :runtime
|
34
|
+
version_requirements: *id001
|
35
|
+
description: Chinese Seg.
|
36
|
+
email: happy@doc5.com
|
37
|
+
executables: []
|
38
|
+
|
39
|
+
extensions: []
|
40
|
+
|
41
|
+
extra_rdoc_files: []
|
42
|
+
|
43
|
+
files:
|
44
|
+
- README.txt
|
45
|
+
- lib/rmmseg.rb
|
46
|
+
- lib/rmmseg/word.rb
|
47
|
+
- lib/rmmseg/algorithm.rb
|
48
|
+
- lib/rmmseg/rule_helper.rb
|
49
|
+
- lib/rmmseg/amibguity.rb
|
50
|
+
- lib/rmmseg/lawl_rule.rb
|
51
|
+
- lib/rmmseg/chunk.rb
|
52
|
+
- lib/rmmseg/config.rb
|
53
|
+
- lib/rmmseg/lsdmfocw_rule.rb
|
54
|
+
- lib/rmmseg/simple_algorithm.rb
|
55
|
+
- lib/rmmseg/complex_algorithm.rb
|
56
|
+
- lib/rmmseg/token.rb
|
57
|
+
- lib/rmmseg/ferret.rb
|
58
|
+
- lib/rmmseg/dictionary.rb
|
59
|
+
- lib/rmmseg/mm_rule.rb
|
60
|
+
- lib/rmmseg/svwl_rule.rb
|
61
|
+
has_rdoc: true
|
62
|
+
homepage: http://www.doc5.com
|
63
|
+
licenses:
|
64
|
+
- MIT
|
65
|
+
post_install_message:
|
66
|
+
rdoc_options: []
|
67
|
+
|
68
|
+
require_paths:
|
69
|
+
- lib
|
70
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
71
|
+
none: false
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
hash: 3
|
76
|
+
segments:
|
77
|
+
- 0
|
78
|
+
version: "0"
|
79
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
80
|
+
none: false
|
81
|
+
requirements:
|
82
|
+
- - ">="
|
83
|
+
- !ruby/object:Gem::Version
|
84
|
+
hash: 3
|
85
|
+
segments:
|
86
|
+
- 0
|
87
|
+
version: "0"
|
88
|
+
requirements: []
|
89
|
+
|
90
|
+
rubyforge_project:
|
91
|
+
rubygems_version: 1.4.2
|
92
|
+
signing_key:
|
93
|
+
specification_version: 3
|
94
|
+
summary: Nice Chinese Seg.
|
95
|
+
test_files: []
|
96
|
+
|