rmmseg 0.1.5 → 0.1.6
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +6 -0
- data/lib/rmmseg.rb +1 -1
- data/lib/rmmseg/algorithm.rb +6 -8
- data/lib/rmmseg/complex_algorithm.rb +48 -8
- data/lib/rmmseg/config.rb +5 -3
- data/lib/rmmseg/ferret.rb +3 -6
- data/lib/rmmseg/simple_algorithm.rb +17 -5
- data/lib/rmmseg/token.rb +0 -5
- metadata +2 -2
data/History.txt
CHANGED
@@ -1,3 +1,9 @@
|
|
1
|
+
* Construct Ferret Token directly.
|
2
|
+
|
3
|
+
=== 0.1.6 / 2008-03-16
|
4
|
+
|
5
|
+
* Optimize the simple algorithm: twice as fast as before, with less memory usage.
|
6
|
+
|
1
7
|
=== 0.1.5 / 2008-03-03
|
2
8
|
|
3
9
|
* Bug fix: Ferret Token is not duck-typed. We need to construct a Ferret Token instead of reusing the RMMSeg Token.
|
data/lib/rmmseg.rb
CHANGED
data/lib/rmmseg/algorithm.rb
CHANGED
@@ -9,17 +9,15 @@ module RMMSeg
|
|
9
9
|
# words. This module provides the common operations shared by
|
10
10
|
# SimpleAlgorithm and ComplexAlgorithm .
|
11
11
|
module Algorithm
|
12
|
-
MATCH_CACHE_MAX_LENGTH = 3
|
13
|
-
|
14
12
|
# Initialize a new instance of Algorithm, the +text+ will
|
15
|
-
# then be segmented by this instance.
|
16
|
-
|
13
|
+
# then be segmented by this instance. +token+ is the class
|
14
|
+
# which will be used to construct the result token.
|
15
|
+
def initialize(text, token=Token)
|
17
16
|
@text = text
|
18
17
|
@chars = text.each_char
|
19
18
|
@index = 0
|
20
19
|
@byte_index = 0
|
21
|
-
@
|
22
|
-
@match_cache_idx = 0
|
20
|
+
@token = token
|
23
21
|
end
|
24
22
|
|
25
23
|
# Get the next Token recognized.
|
@@ -32,7 +30,7 @@ module RMMSeg
|
|
32
30
|
token = get_cjk_word
|
33
31
|
end
|
34
32
|
|
35
|
-
if token.empty
|
33
|
+
if token.start == token.end # empty
|
36
34
|
return next_token
|
37
35
|
else
|
38
36
|
return token
|
@@ -82,7 +80,7 @@ module RMMSeg
|
|
82
80
|
@byte_index += i - @index
|
83
81
|
@index = i
|
84
82
|
|
85
|
-
return
|
83
|
+
return @token.new(@text[start_pos...end_pos], start_pos, end_pos)
|
86
84
|
end
|
87
85
|
|
88
86
|
# Find all words occurring in the dictionary starting from
|
@@ -6,11 +6,13 @@ require 'rmmseg/lsdmfocw_rule'
|
|
6
6
|
|
7
7
|
module RMMSeg
|
8
8
|
class ComplexAlgorithm
|
9
|
+
MATCH_CACHE_MAX_LENGTH = 3
|
10
|
+
|
9
11
|
include Algorithm
|
10
12
|
|
11
13
|
# Create a new ComplexAlgorithm . Rules used by this algorithm
|
12
14
|
# include MMRule , LAWLRule , SVWLRule and LSDMFOCWRule .
|
13
|
-
def initialize(text)
|
15
|
+
def initialize(text, token=Token)
|
14
16
|
super
|
15
17
|
@rules = [
|
16
18
|
MMRule,
|
@@ -18,16 +20,13 @@ module RMMSeg
|
|
18
20
|
SVWLRule,
|
19
21
|
LSDMFOCWRule
|
20
22
|
]
|
23
|
+
@match_cache = Array.new(MATCH_CACHE_MAX_LENGTH)
|
24
|
+
@match_cache_idx = 0
|
21
25
|
end
|
22
26
|
|
23
27
|
# Get the most proper CJK word.
|
24
28
|
def get_cjk_word
|
25
|
-
|
26
|
-
end
|
27
|
-
|
28
|
-
# Use rules to filter the +chunks+ to get the most
|
29
|
-
# apropos CJK word.
|
30
|
-
def get_cjk_word_from_chunks(chunks)
|
29
|
+
chunks = create_chunks
|
31
30
|
i = 0
|
32
31
|
while i < @rules.length
|
33
32
|
break if chunks.length < 2
|
@@ -42,7 +41,7 @@ module RMMSeg
|
|
42
41
|
end
|
43
42
|
|
44
43
|
word = chunks[0][0]
|
45
|
-
token =
|
44
|
+
token = @token.new(word.text, @byte_index, @byte_index+word.byte_size)
|
46
45
|
|
47
46
|
@index += word.length
|
48
47
|
@byte_index += word.byte_size
|
@@ -78,5 +77,46 @@ module RMMSeg
|
|
78
77
|
|
79
78
|
chunks
|
80
79
|
end
|
80
|
+
|
81
|
+
# Find all words occurring in the dictionary starting from
|
82
|
+
# +index+ . The maximum word length is determined by
|
83
|
+
# +Config.max_word_length+ .
|
84
|
+
def find_match_words(index)
|
85
|
+
for i, w in @match_cache
|
86
|
+
if i == index
|
87
|
+
return w
|
88
|
+
end
|
89
|
+
end
|
90
|
+
|
91
|
+
dic = Dictionary.instance
|
92
|
+
str = String.new
|
93
|
+
strlen = 0
|
94
|
+
words = Array.new
|
95
|
+
i = index
|
96
|
+
|
97
|
+
while i < @chars.length &&
|
98
|
+
!basic_latin?(@chars[i]) &&
|
99
|
+
strlen < Config.max_word_length
|
100
|
+
|
101
|
+
str << @chars[i]
|
102
|
+
strlen += 1
|
103
|
+
|
104
|
+
if dic.has_word?(str)
|
105
|
+
words << dic.get_word(str)
|
106
|
+
end
|
107
|
+
i += 1
|
108
|
+
end
|
109
|
+
|
110
|
+
if words.empty?
|
111
|
+
words << Word.new(@chars[index], Word::TYPES[:unrecognized])
|
112
|
+
end
|
113
|
+
|
114
|
+
@match_cache[@match_cache_idx] = [index, words]
|
115
|
+
@match_cache_idx += 1
|
116
|
+
@match_cache_idx = 0 if @match_cache_idx == MATCH_CACHE_MAX_LENGTH
|
117
|
+
|
118
|
+
words
|
119
|
+
end
|
120
|
+
|
81
121
|
end
|
82
122
|
end
|
data/lib/rmmseg/config.rb
CHANGED
@@ -25,9 +25,11 @@ module RMMSeg
|
|
25
25
|
@algorithm = algor
|
26
26
|
end
|
27
27
|
# Get an instance of the algorithm object corresponding to the
|
28
|
-
# algorithm name configured.
|
29
|
-
|
30
|
-
|
28
|
+
# algorithm name configured. +tok+ is the class of the token object
|
29
|
+
# to be returned. For example, if you want to use it with Ferret, you
|
30
|
+
# should provide +::Ferret::Analysis::Token+ .
|
31
|
+
def algorithm_instance(text, tok=Token)
|
32
|
+
RMMSeg.const_get("#{@algorithm}".capitalize+"Algorithm").new(text, tok)
|
31
33
|
end
|
32
34
|
|
33
35
|
# Get the behavior description when an unresolved ambiguity occurred.
|
data/lib/rmmseg/ferret.rb
CHANGED
@@ -39,11 +39,7 @@ module RMMSeg
|
|
39
39
|
|
40
40
|
# Get next token
|
41
41
|
def next
|
42
|
-
|
43
|
-
if tok
|
44
|
-
tok = ::Ferret::Analysis::Token.new(tok.text, tok.start, tok.end)
|
45
|
-
end
|
46
|
-
tok
|
42
|
+
@algor.next_token
|
47
43
|
end
|
48
44
|
|
49
45
|
# Get the text being tokenized
|
@@ -54,7 +50,8 @@ module RMMSeg
|
|
54
50
|
# Set the text to be tokenized
|
55
51
|
def text=(str)
|
56
52
|
@text = str
|
57
|
-
@algor = RMMSeg::Config.algorithm_instance(@text
|
53
|
+
@algor = RMMSeg::Config.algorithm_instance(@text,
|
54
|
+
::Ferret::Analysis::Token)
|
58
55
|
end
|
59
56
|
end
|
60
57
|
|
@@ -7,17 +7,29 @@ module RMMSeg
|
|
7
7
|
|
8
8
|
# Create a new SimpleAlgorithm . The only rule used by this
|
9
9
|
# algorithm is MMRule .
|
10
|
-
def initialize(text)
|
10
|
+
def initialize(text, token=Token)
|
11
11
|
super
|
12
12
|
end
|
13
13
|
|
14
14
|
# Get the most proper CJK word.
|
15
15
|
def get_cjk_word
|
16
|
-
|
17
|
-
|
16
|
+
dic = Dictionary.instance
|
17
|
+
i = Config.max_word_length
|
18
|
+
if i + @index > @chars.length
|
19
|
+
i = @chars.length - @index
|
20
|
+
end
|
21
|
+
chars = @chars[@index, i]
|
22
|
+
word = chars.join
|
18
23
|
|
19
|
-
|
20
|
-
|
24
|
+
while i > 1 && !dic.has_word?(word)
|
25
|
+
i -= 1
|
26
|
+
word.slice!(-chars[i].size,chars[i].size) # truncate last char
|
27
|
+
end
|
28
|
+
|
29
|
+
token = @token.new(word, @byte_index, @byte_index+word.size)
|
30
|
+
|
31
|
+
@index += i
|
32
|
+
@byte_index += word.size
|
21
33
|
|
22
34
|
return token
|
23
35
|
end
|
data/lib/rmmseg/token.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rmmseg
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- pluskid
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2008-03-
|
12
|
+
date: 2008-03-16 00:00:00 +00:00
|
13
13
|
default_executable:
|
14
14
|
dependencies: []
|
15
15
|
|