rmmseg 0.1.5 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +6 -0
- data/lib/rmmseg.rb +1 -1
- data/lib/rmmseg/algorithm.rb +6 -8
- data/lib/rmmseg/complex_algorithm.rb +48 -8
- data/lib/rmmseg/config.rb +5 -3
- data/lib/rmmseg/ferret.rb +3 -6
- data/lib/rmmseg/simple_algorithm.rb +17 -5
- data/lib/rmmseg/token.rb +0 -5
- metadata +2 -2
data/History.txt
CHANGED
@@ -1,3 +1,9 @@
|
|
1
|
+
* Construct Ferret Token directly.
|
2
|
+
|
3
|
+
=== 0.1.6 / 2008-03-16
|
4
|
+
|
5
|
+
* Optimize the simple algorithm: about twice as fast as before, with less memory usage.
|
6
|
+
|
1
7
|
=== 0.1.5 / 2008-03-03
|
2
8
|
|
3
9
|
* Bug fix: Ferret Token is not duck-typed. We need to construct a Ferret Token instead of reusing the RMMSeg Token.
|
data/lib/rmmseg.rb
CHANGED
data/lib/rmmseg/algorithm.rb
CHANGED
@@ -9,17 +9,15 @@ module RMMSeg
|
|
9
9
|
# words. This module is the common operations shared by
|
10
10
|
# SimpleAlgorithm and ComplexAlgorithm .
|
11
11
|
module Algorithm
|
12
|
-
MATCH_CACHE_MAX_LENGTH = 3
|
13
|
-
|
14
12
|
# Initialize a new instance of Algorithm, the +text+ will
|
15
|
-
# then be segmented by this instance.
|
16
|
-
|
13
|
+
# then be segmented by this instance. +token+ is the class
|
14
|
+
# which will be used to construct the result token.
|
15
|
+
def initialize(text, token=Token)
|
17
16
|
@text = text
|
18
17
|
@chars = text.each_char
|
19
18
|
@index = 0
|
20
19
|
@byte_index = 0
|
21
|
-
@
|
22
|
-
@match_cache_idx = 0
|
20
|
+
@token = token
|
23
21
|
end
|
24
22
|
|
25
23
|
# Get the next Token recognized.
|
@@ -32,7 +30,7 @@ module RMMSeg
|
|
32
30
|
token = get_cjk_word
|
33
31
|
end
|
34
32
|
|
35
|
-
if token.empty
|
33
|
+
if token.start == token.end # empty
|
36
34
|
return next_token
|
37
35
|
else
|
38
36
|
return token
|
@@ -82,7 +80,7 @@ module RMMSeg
|
|
82
80
|
@byte_index += i - @index
|
83
81
|
@index = i
|
84
82
|
|
85
|
-
return
|
83
|
+
return @token.new(@text[start_pos...end_pos], start_pos, end_pos)
|
86
84
|
end
|
87
85
|
|
88
86
|
# Find all words occurring in the dictionary starting from
|
@@ -6,11 +6,13 @@ require 'rmmseg/lsdmfocw_rule'
|
|
6
6
|
|
7
7
|
module RMMSeg
|
8
8
|
class ComplexAlgorithm
|
9
|
+
MATCH_CACHE_MAX_LENGTH = 3
|
10
|
+
|
9
11
|
include Algorithm
|
10
12
|
|
11
13
|
# Create a new ComplexAlgorithm . Rules used by this algorithm
|
12
14
|
# includes MMRule , LAWLRule , SVWLRule and LSDMFOCWRule .
|
13
|
-
def initialize(text)
|
15
|
+
def initialize(text, token=Token)
|
14
16
|
super
|
15
17
|
@rules = [
|
16
18
|
MMRule,
|
@@ -18,16 +20,13 @@ module RMMSeg
|
|
18
20
|
SVWLRule,
|
19
21
|
LSDMFOCWRule
|
20
22
|
]
|
23
|
+
@match_cache = Array.new(MATCH_CACHE_MAX_LENGTH)
|
24
|
+
@match_cache_idx = 0
|
21
25
|
end
|
22
26
|
|
23
27
|
# Get the most proper CJK word.
|
24
28
|
def get_cjk_word
|
25
|
-
|
26
|
-
end
|
27
|
-
|
28
|
-
# Use rules to filter the +chunks+ to get the most
|
29
|
-
# apropos CJK word.
|
30
|
-
def get_cjk_word_from_chunks(chunks)
|
29
|
+
chunks = create_chunks
|
31
30
|
i = 0
|
32
31
|
while i < @rules.length
|
33
32
|
break if chunks.length < 2
|
@@ -42,7 +41,7 @@ module RMMSeg
|
|
42
41
|
end
|
43
42
|
|
44
43
|
word = chunks[0][0]
|
45
|
-
token =
|
44
|
+
token = @token.new(word.text, @byte_index, @byte_index+word.byte_size)
|
46
45
|
|
47
46
|
@index += word.length
|
48
47
|
@byte_index += word.byte_size
|
@@ -78,5 +77,46 @@ module RMMSeg
|
|
78
77
|
|
79
78
|
chunks
|
80
79
|
end
|
80
|
+
|
81
|
+
# Find all words occurring in the dictionary starting from
|
82
|
+
# +index+ . The maximum word length is determined by
|
83
|
+
# +Config.max_word_length+ .
|
84
|
+
def find_match_words(index)
|
85
|
+
for i, w in @match_cache
|
86
|
+
if i == index
|
87
|
+
return w
|
88
|
+
end
|
89
|
+
end
|
90
|
+
|
91
|
+
dic = Dictionary.instance
|
92
|
+
str = String.new
|
93
|
+
strlen = 0
|
94
|
+
words = Array.new
|
95
|
+
i = index
|
96
|
+
|
97
|
+
while i < @chars.length &&
|
98
|
+
!basic_latin?(@chars[i]) &&
|
99
|
+
strlen < Config.max_word_length
|
100
|
+
|
101
|
+
str << @chars[i]
|
102
|
+
strlen += 1
|
103
|
+
|
104
|
+
if dic.has_word?(str)
|
105
|
+
words << dic.get_word(str)
|
106
|
+
end
|
107
|
+
i += 1
|
108
|
+
end
|
109
|
+
|
110
|
+
if words.empty?
|
111
|
+
words << Word.new(@chars[index], Word::TYPES[:unrecognized])
|
112
|
+
end
|
113
|
+
|
114
|
+
@match_cache[@match_cache_idx] = [index, words]
|
115
|
+
@match_cache_idx += 1
|
116
|
+
@match_cache_idx = 0 if @match_cache_idx == MATCH_CACHE_MAX_LENGTH
|
117
|
+
|
118
|
+
words
|
119
|
+
end
|
120
|
+
|
81
121
|
end
|
82
122
|
end
|
data/lib/rmmseg/config.rb
CHANGED
@@ -25,9 +25,11 @@ module RMMSeg
|
|
25
25
|
@algorithm = algor
|
26
26
|
end
|
27
27
|
# Get an instance of the algorithm object corresponding to the
|
28
|
-
# algorithm name configured.
|
29
|
-
|
30
|
-
|
28
|
+
# algorithm name configured. +tok+ is the class of the token object
|
29
|
+
# to be returned. For example, if you want to use with Ferret, you
|
30
|
+
# should provide +::Ferret::Analysis::Token+ .
|
31
|
+
def algorithm_instance(text, tok=Token)
|
32
|
+
RMMSeg.const_get("#{@algorithm}".capitalize+"Algorithm").new(text, tok)
|
31
33
|
end
|
32
34
|
|
33
35
|
# Get the behavior description when an unresolved ambiguity occurred.
|
data/lib/rmmseg/ferret.rb
CHANGED
@@ -39,11 +39,7 @@ module RMMSeg
|
|
39
39
|
|
40
40
|
# Get next token
|
41
41
|
def next
|
42
|
-
|
43
|
-
if tok
|
44
|
-
tok = ::Ferret::Analysis::Token.new(tok.text, tok.start, tok.end)
|
45
|
-
end
|
46
|
-
tok
|
42
|
+
@algor.next_token
|
47
43
|
end
|
48
44
|
|
49
45
|
# Get the text being tokenized
|
@@ -54,7 +50,8 @@ module RMMSeg
|
|
54
50
|
# Set the text to be tokenized
|
55
51
|
def text=(str)
|
56
52
|
@text = str
|
57
|
-
@algor = RMMSeg::Config.algorithm_instance(@text
|
53
|
+
@algor = RMMSeg::Config.algorithm_instance(@text,
|
54
|
+
::Ferret::Analysis::Token)
|
58
55
|
end
|
59
56
|
end
|
60
57
|
|
@@ -7,17 +7,29 @@ module RMMSeg
|
|
7
7
|
|
8
8
|
# Create a new SimpleAlgorithm . The only rule used by this
|
9
9
|
# algorithm is MMRule .
|
10
|
-
def initialize(text)
|
10
|
+
def initialize(text, token=Token)
|
11
11
|
super
|
12
12
|
end
|
13
13
|
|
14
14
|
# Get the most proper CJK word.
|
15
15
|
def get_cjk_word
|
16
|
-
|
17
|
-
|
16
|
+
dic = Dictionary.instance
|
17
|
+
i = Config.max_word_length
|
18
|
+
if i + @index > @chars.length
|
19
|
+
i = @chars.length - @index
|
20
|
+
end
|
21
|
+
chars = @chars[@index, i]
|
22
|
+
word = chars.join
|
18
23
|
|
19
|
-
|
20
|
-
|
24
|
+
while i > 1 && !dic.has_word?(word)
|
25
|
+
i -= 1
|
26
|
+
word.slice!(-chars[i].size,chars[i].size) # truncate last char
|
27
|
+
end
|
28
|
+
|
29
|
+
token = @token.new(word, @byte_index, @byte_index+word.size)
|
30
|
+
|
31
|
+
@index += i
|
32
|
+
@byte_index += word.size
|
21
33
|
|
22
34
|
return token
|
23
35
|
end
|
data/lib/rmmseg/token.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rmmseg
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- pluskid
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2008-03-
|
12
|
+
date: 2008-03-16 00:00:00 +00:00
|
13
13
|
default_executable:
|
14
14
|
dependencies: []
|
15
15
|
|