rmmseg 0.1.2 → 0.1.3
This diff shows the changes between package versions that have been publicly released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
- data/History.txt +6 -0
- data/TODO.txt +3 -0
- data/lib/rmmseg/algorithm.rb +22 -12
- data/lib/rmmseg/chunk.rb +16 -10
- data/lib/rmmseg/complex_algorithm.rb +6 -6
- data/lib/rmmseg/dictionary.rb +1 -1
- data/lib/rmmseg/ferret.rb +4 -11
- data/lib/rmmseg/token.rb +31 -10
- data/lib/rmmseg.rb +1 -1
- data/spec/simple_algorithm_spec.rb +4 -4
- metadata +2 -2
data/History.txt
CHANGED
@@ -1,3 +1,9 @@
+=== 0.1.3 / 2008-02-28
+
+* Make RMMSeg Token campatible to Ferret Token.
+* Use while instead of loop for performance improvement.
+* Avoid many costly String#jlength call for performance improvement (use only 70% time and 40% memory as before).
+
 === 0.1.2 / 2008-02-25

 * Add cache to find_match_words: performance improved.
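The String#jlength item above is the big win in this release: on Ruby 1.8 (with jcode loaded), jlength re-scans the whole string to count multibyte characters, so calling it after every appended character in find_match_words adds a lot of per-iteration work. 0.1.3 keeps an explicit character counter instead. Below is a minimal, self-contained sketch of the two shapes; the constants, sample characters and the jlength shim are illustrative, not the gem's code. (The "use while instead of loop" item is visible directly in the algorithm.rb and ferret.rb hunks further down.)

# Shim so the snippet also runs on modern Ruby, where String#jlength no longer exists.
class String
  unless method_defined?(:jlength)
    def jlength
      scan(/./mu).size   # count characters, not bytes
    end
  end
end

MAX_WORD_LENGTH = 4
chars = %w[中 文 分 词 测 试]

# 0.1.2 shape: re-measure the growing buffer on every iteration.
str = String.new
chars.each do |c|
  str << c
  break if str.jlength >= MAX_WORD_LENGTH   # scans the whole buffer each time
end

# 0.1.3 shape: keep a running character count alongside the buffer.
str = String.new
strlen = 0
chars.each do |c|
  break if strlen >= MAX_WORD_LENGTH        # constant-time check
  str << c
  strlen += 1
end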
data/TODO.txt
CHANGED
data/lib/rmmseg/algorithm.rb
CHANGED
@@ -9,6 +9,8 @@ module RMMSeg
   # words. This module is the common operations shared by
   # SimpleAlgorithm and ComplexAlgorithm .
   module Algorithm
+    MATCH_CACHE_MAX_LENGTH = 3
+
     # Initialize a new instance of Algorithm, the +text+ will
     # then be segmented by this instance.
     def initialize(text)
@@ -16,7 +18,8 @@ module RMMSeg
       @chars = text.each_char
       @index = 0
       @byte_index = 0
-      @match_cache = Array.new
+      @match_cache = Array.new(MATCH_CACHE_MAX_LENGTH)
+      @match_cache_idx = 0
     end

     # Get the next Token recognized.
@@ -45,10 +48,11 @@ module RMMSeg
     # of words.
     def segment
       words = Array.new
-
-
-
+
+      token = next_token
+      until token.nil?
         words << token.text
+        token = next_token
       end

       words
@@ -83,7 +87,7 @@ module RMMSeg
       @byte_index += i - @index
       @index = i

-      return Token.new(@text, start_pos, end_pos)
+      return Token.new(@text[start_pos...end_pos], start_pos, end_pos)
     end

     # Use rules to filter the +chunks+ to get the most
@@ -103,7 +107,7 @@ module RMMSeg
       end

       word = chunks[0][0]
-      token = Token.new(
+      token = Token.new(word.text, @byte_index, @byte_index+word.byte_size)

       @index += word.length
       @byte_index += word.byte_size
@@ -123,25 +127,31 @@ module RMMSeg

       dic = Dictionary.instance
       str = String.new
+      strlen = 0
       words = Array.new
       i = index
-
-
-
+
+      while i < chars.length &&
+          !basic_latin?(chars[i]) &&
+          strlen < Config.max_word_length
+
         str << chars[i]
+        strlen += 1
+
         if dic.has_word?(str)
           words << dic.get_word(str)
         end
         i += 1
-        break if str.jlength >= Config.max_word_length
       end

       if words.empty?
         words << Word.new(chars[index], Word::TYPES[:unrecognized])
       end

-      @match_cache
-      @
+      @match_cache[@match_cache_idx] = [index, words]
+      @match_cache_idx += 1
+      @match_cache_idx = 0 if @match_cache_idx == MATCH_CACHE_MAX_LENGTH
+
       words
     end
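The cache change above replaces the plain @match_cache array with a fixed-size one plus a write index that wraps around at MATCH_CACHE_MAX_LENGTH, so the oldest entry is simply overwritten. Only the write side appears in this hunk; the following standalone sketch of the same rotate-on-write pattern is illustrative (class and method names, and the lookup side, are not the gem's):

# A fixed-size, overwrite-oldest cache in the style of @match_cache above.
class RotatingCache
  def initialize(size)
    @size  = size
    @slots = Array.new(size)
    @idx   = 0
  end

  # Store a [key, value] pair, overwriting the oldest slot once full.
  def store(key, value)
    @slots[@idx] = [key, value]
    @idx += 1
    @idx = 0 if @idx == @size   # wrap around, as find_match_words does
  end

  # Linear scan over the handful of slots; nil on a miss.
  def fetch(key)
    entry = @slots.find { |slot| slot && slot[0] == key }
    entry && entry[1]
  end
end

cache = RotatingCache.new(3)    # MATCH_CACHE_MAX_LENGTH is 3 above
cache.store(7, [:some_words])
cache.fetch(7)   # => [:some_words]
cache.fetch(9)   # => nil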
data/lib/rmmseg/chunk.rb
CHANGED
@@ -4,7 +4,11 @@ module RMMSeg

     # The sum of length of all words.
     def self.total_length(words)
-
+      len = 0
+      for word in words
+        len += word.length
+      end
+      len
     end

     # The average length of words.
@@ -15,21 +19,23 @@ module RMMSeg
     # The square of the standard deviation of length of all words.
     def self.variance(words)
       avglen = average_length(words)
-
-
-
-
+      sqr_sum = 0.0
+      for word in words
+        tmp = word.length - avglen
+        sqr_sum += tmp*tmp
+      end
+      Math.sqrt(sqr_sum)
     end

     # The sum of all frequencies of one-character words.
     def self.degree_of_morphemic_freedom(words)
-
+      sum = 0
+      for word in words
         if word.length == 1 && word.type == Word::TYPES[:cjk_word]
-          sum
-        else
-          sum
+          sum += word.frequency
         end
-
+      end
+      sum
     end
   end
 end

data/lib/rmmseg/complex_algorithm.rb
CHANGED
@@ -24,27 +24,27 @@ module RMMSeg
     # starting from +@index+ .
     def create_chunks
       chunks = Array.new
-      find_match_words(@chars, @index)
+      for w0 in find_match_words(@chars, @index)
         index0 = @index + w0.length
         if index0 < @chars.length
-          find_match_words(@chars, index0)
+          for w1 in find_match_words(@chars, index0)
             index1 = index0 + w1.length
             if index1 < @chars.length
-              find_match_words(@chars, index1)
+              for w2 in find_match_words(@chars, index1)
                 if w2.type == Word::TYPES[:unrecognized]
                   chunks << [w0, w1]
                 else
                   chunks << [w0, w1, w2]
                 end
-
+              end
             elsif index1 == @chars.length
               chunks << [w0, w1]
             end
-
+          end
         elsif index0 == @chars.length
           chunks << [w0]
         end
-
+      end

       chunks
     end
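For a concrete feel of the three chunk metrics, here is a toy computation that mirrors the methods above. Word is a stand-in Struct (the real RMMSeg::Word also carries its text, byte size and type constants), and average_length is assumed to be the total length divided by the number of words:

Word = Struct.new(:length, :type, :frequency)

words = [Word.new(2, :cjk_word, 0),
         Word.new(1, :cjk_word, 34),
         Word.new(1, :cjk_word, 5)]

total   = words.inject(0) { |len, w| len + w.length }             # => 4
avglen  = total.to_f / words.size                                 # => 1.33
sqr_sum = words.inject(0.0) { |s, w| s + (w.length - avglen)**2 }
Math.sqrt(sqr_sum)                                                # => ~0.82, what variance() above returns

words.select { |w| w.length == 1 && w.type == :cjk_word }.
      inject(0) { |s, w| s + w.frequency }                        # => 39, the degree of morphemic freedom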
data/lib/rmmseg/dictionary.rb
CHANGED
data/lib/rmmseg/ferret.rb
CHANGED
@@ -39,12 +39,7 @@ module RMMSeg

     # Get next token
     def next
-      tk = @algor.next_token
-      if tk.nil?
-        nil
-      else
-        ::Ferret::Analysis::Token.new(tk.text, tk.start_pos, tk.end_pos)
-      end
+      @algor.next_token
     end

     # Get the text being tokenized
@@ -91,13 +86,11 @@ module RMMSeg

     # Get next token, skip stand alone Chinese punctuations.
     def next
-      token =
+      token = @stream.next
       dic = Dictionary.instance
-      loop do
-        token = @stream.next
-        break if token.nil?

-
+      until token.nil? || !(dic.include? token.text)
+        token = @stream.next
       end

       token
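The rewritten filter keeps pulling tokens while the current token's text is found in the dictionary (per the comment, stand-alone Chinese punctuation) and stops at the first miss or at the end of the stream. A stub illustration of that shape, using plain Ruby stand-ins rather than Ferret objects:

stream = ["，", "中文", "。", "分词"].each             # pretend token stream
dic    = ["，", "。"]                                  # pretend punctuation entries
next_from_stream = lambda { stream.next rescue nil }   # nil once exhausted

token = next_from_stream.call
until token.nil? || !dic.include?(token)
  token = next_from_stream.call
end
token   # => "中文", the first token whose text is not in the dictionary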
data/lib/rmmseg/token.rb
CHANGED
@@ -2,31 +2,52 @@ module RMMSeg
   # A Token consists of a term's text and the start and end offset
   # of the term.
   class Token
-    # Text of the token.
-    def text
-      @text[@start_pos...@end_pos]
-    end
-
     # Does this token contain any characters?
     def empty?
-      @start_pos == @end_pos
+      @start == @end
     end

+    # The text of the token
+    attr_accessor :text
+
     # The start position of the token. This is *byte* index instead of
     # character.
-
+    attr_accessor :start

     # The one greater than the position of the last byte of the
     # token. This is *byte* index instead of character.
-
+    attr_accessor :end
+
+    # See Ferret document for Token.
+    attr_accessor :pos_inc

     # +text+ is the ref to the whole text. In other words:
     # +text[start_pos...end_pos]+ should be the string held by this
     # token.
     def initialize(text, start_pos, end_pos)
       @text = text
-      @start_pos = start_pos
-      @end_pos = end_pos
+      @start = start_pos
+      @end = end_pos
+      @pos_inc = 1
+    end
+
+    def <=> other
+      if @start > other.start
+        return 1
+      elsif @start < other.start
+        return -1
+      elsif @end > other.end
+        return 1
+      elsif @end < other.end
+        return -1
+      else
+        return @text <=> other.text
+      end
+    end
+    include Comparable
+
+    def to_s
+      @text.dup
     end
   end
 end
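With text, start, end and pos_inc exposed as accessors and Comparable mixed in, an RMMSeg Token can now stand in for a Ferret token directly, which is why the wrapper in ferret.rb above was dropped. A small usage sketch against the class as shown, reusing the byte offsets from the spec further below (the second token is made up for the comparison):

require 'rmmseg/token'   # lib/rmmseg/token.rb in this gem

a = RMMSeg::Token.new("中文", 12, 18)   # byte offsets
b = RMMSeg::Token.new("文章", 18, 24)

a.text      # => "中文"
a.start     # => 12
a.end       # => 18
a.pos_inc   # => 1
a.empty?    # => false
a < b       # => true, via <=> and Comparable
[b, a].sort.map { |t| t.to_s }   # => ["中文", "文章"]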
data/lib/rmmseg.rb
CHANGED
data/spec/simple_algorithm_spec.rb
CHANGED
@@ -30,8 +30,8 @@ describe "simple algorithm" do
     3.times { algor.next_token }
     token = algor.next_token
     token.text.should == "paragraph"
-    token.
-    token.
+    token.start.should == 10
+    token.end.should == 19
   end

   it "should handle byte positions of Chinese well" do
@@ -40,7 +40,7 @@ describe "simple algorithm" do
     2.times { algor.next_token }
     token = algor.next_token
     token.text.should == "中文"
-    token.
-    token.
+    token.start.should == 12
+    token.end.should == 18
   end
 end
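The expected offsets in the Chinese example follow from UTF-8 byte lengths: each of the two CJK characters occupies three bytes, so the token spans six bytes. A quick check:

"中文".bytesize   # => 6 on Ruby 1.9+; on 1.8, this gem's era, "中文".size gives the same byte count
18 - 12           # => 6, i.e. token.end - token.start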
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: rmmseg
 version: !ruby/object:Gem::Version
-  version: 0.1.2
+  version: 0.1.3
 platform: ruby
 authors:
 - pluskid
@@ -9,7 +9,7 @@ autorequire:
 bindir: bin
 cert_chain: []

-date: 2008-02-
+date: 2008-02-27 00:00:00 -08:00
 default_executable:
 dependencies: []