rmmseg 0.1.2 → 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +6 -0
- data/TODO.txt +3 -0
- data/lib/rmmseg/algorithm.rb +22 -12
- data/lib/rmmseg/chunk.rb +16 -10
- data/lib/rmmseg/complex_algorithm.rb +6 -6
- data/lib/rmmseg/dictionary.rb +1 -1
- data/lib/rmmseg/ferret.rb +4 -11
- data/lib/rmmseg/token.rb +31 -10
- data/lib/rmmseg.rb +1 -1
- data/spec/simple_algorithm_spec.rb +4 -4
- metadata +2 -2
data/History.txt
CHANGED
@@ -1,3 +1,9 @@
|
|
1
|
+
=== 0.1.3 / 2008-02-28
|
2
|
+
|
3
|
+
* Make RMMSeg Token compatible with Ferret Token.
|
4
|
+
* Use while instead of loop for performance improvement.
|
5
|
+
* Avoid many costly String#jlength calls for performance improvement (uses only 70% of the time and 40% of the memory compared to before).
|
6
|
+
|
1
7
|
=== 0.1.2 / 2008-02-25
|
2
8
|
|
3
9
|
* Add cache to find_match_words: performance improved.
|
data/TODO.txt
CHANGED
data/lib/rmmseg/algorithm.rb
CHANGED
@@ -9,6 +9,8 @@ module RMMSeg
|
|
9
9
|
# words. This module is the common operations shared by
|
10
10
|
# SimpleAlgorithm and ComplexAlgorithm .
|
11
11
|
module Algorithm
|
12
|
+
MATCH_CACHE_MAX_LENGTH = 3
|
13
|
+
|
12
14
|
# Initialize a new instance of Algorithm, the +text+ will
|
13
15
|
# then be segmented by this instance.
|
14
16
|
def initialize(text)
|
@@ -16,7 +18,8 @@ module RMMSeg
|
|
16
18
|
@chars = text.each_char
|
17
19
|
@index = 0
|
18
20
|
@byte_index = 0
|
19
|
-
@match_cache = Array.new
|
21
|
+
@match_cache = Array.new(MATCH_CACHE_MAX_LENGTH)
|
22
|
+
@match_cache_idx = 0
|
20
23
|
end
|
21
24
|
|
22
25
|
# Get the next Token recognized.
|
@@ -45,10 +48,11 @@ module RMMSeg
|
|
45
48
|
# of words.
|
46
49
|
def segment
|
47
50
|
words = Array.new
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
+
|
52
|
+
token = next_token
|
53
|
+
until token.nil?
|
51
54
|
words << token.text
|
55
|
+
token = next_token
|
52
56
|
end
|
53
57
|
|
54
58
|
words
|
@@ -83,7 +87,7 @@ module RMMSeg
|
|
83
87
|
@byte_index += i - @index
|
84
88
|
@index = i
|
85
89
|
|
86
|
-
return Token.new(@text, start_pos, end_pos)
|
90
|
+
return Token.new(@text[start_pos...end_pos], start_pos, end_pos)
|
87
91
|
end
|
88
92
|
|
89
93
|
# Use rules to filter the +chunks+ to get the most
|
@@ -103,7 +107,7 @@ module RMMSeg
|
|
103
107
|
end
|
104
108
|
|
105
109
|
word = chunks[0][0]
|
106
|
-
token = Token.new(
|
110
|
+
token = Token.new(word.text, @byte_index, @byte_index+word.byte_size)
|
107
111
|
|
108
112
|
@index += word.length
|
109
113
|
@byte_index += word.byte_size
|
@@ -123,25 +127,31 @@ module RMMSeg
|
|
123
127
|
|
124
128
|
dic = Dictionary.instance
|
125
129
|
str = String.new
|
130
|
+
strlen = 0
|
126
131
|
words = Array.new
|
127
132
|
i = index
|
128
|
-
|
129
|
-
|
130
|
-
|
133
|
+
|
134
|
+
while i < chars.length &&
|
135
|
+
!basic_latin?(chars[i]) &&
|
136
|
+
strlen < Config.max_word_length
|
137
|
+
|
131
138
|
str << chars[i]
|
139
|
+
strlen += 1
|
140
|
+
|
132
141
|
if dic.has_word?(str)
|
133
142
|
words << dic.get_word(str)
|
134
143
|
end
|
135
144
|
i += 1
|
136
|
-
break if str.jlength >= Config.max_word_length
|
137
145
|
end
|
138
146
|
|
139
147
|
if words.empty?
|
140
148
|
words << Word.new(chars[index], Word::TYPES[:unrecognized])
|
141
149
|
end
|
142
150
|
|
143
|
-
@match_cache
|
144
|
-
@
|
151
|
+
@match_cache[@match_cache_idx] = [index, words]
|
152
|
+
@match_cache_idx += 1
|
153
|
+
@match_cache_idx = 0 if @match_cache_idx == MATCH_CACHE_MAX_LENGTH
|
154
|
+
|
145
155
|
words
|
146
156
|
end
|
147
157
|
|
data/lib/rmmseg/chunk.rb
CHANGED
@@ -4,7 +4,11 @@ module RMMSeg
|
|
4
4
|
|
5
5
|
# The sum of length of all words.
|
6
6
|
def self.total_length(words)
|
7
|
-
|
7
|
+
len = 0
|
8
|
+
for word in words
|
9
|
+
len += word.length
|
10
|
+
end
|
11
|
+
len
|
8
12
|
end
|
9
13
|
|
10
14
|
# The average length of words.
|
@@ -15,21 +19,23 @@ module RMMSeg
|
|
15
19
|
# A measure of the spread of word lengths: the square root of the sum of squared deviations from the average length (not divided by the word count).
|
16
20
|
def self.variance(words)
|
17
21
|
avglen = average_length(words)
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
+
sqr_sum = 0.0
|
23
|
+
for word in words
|
24
|
+
tmp = word.length - avglen
|
25
|
+
sqr_sum += tmp*tmp
|
26
|
+
end
|
27
|
+
Math.sqrt(sqr_sum)
|
22
28
|
end
|
23
29
|
|
24
30
|
# The sum of all frequencies of one-character words.
|
25
31
|
def self.degree_of_morphemic_freedom(words)
|
26
|
-
|
32
|
+
sum = 0
|
33
|
+
for word in words
|
27
34
|
if word.length == 1 && word.type == Word::TYPES[:cjk_word]
|
28
|
-
sum
|
29
|
-
else
|
30
|
-
sum
|
35
|
+
sum += word.frequency
|
31
36
|
end
|
32
|
-
|
37
|
+
end
|
38
|
+
sum
|
33
39
|
end
|
34
40
|
end
|
35
41
|
end
|
@@ -24,27 +24,27 @@ module RMMSeg
|
|
24
24
|
# starting from +@index+ .
|
25
25
|
def create_chunks
|
26
26
|
chunks = Array.new
|
27
|
-
find_match_words(@chars, @index)
|
27
|
+
for w0 in find_match_words(@chars, @index)
|
28
28
|
index0 = @index + w0.length
|
29
29
|
if index0 < @chars.length
|
30
|
-
find_match_words(@chars, index0)
|
30
|
+
for w1 in find_match_words(@chars, index0)
|
31
31
|
index1 = index0 + w1.length
|
32
32
|
if index1 < @chars.length
|
33
|
-
find_match_words(@chars, index1)
|
33
|
+
for w2 in find_match_words(@chars, index1)
|
34
34
|
if w2.type == Word::TYPES[:unrecognized]
|
35
35
|
chunks << [w0, w1]
|
36
36
|
else
|
37
37
|
chunks << [w0, w1, w2]
|
38
38
|
end
|
39
|
-
|
39
|
+
end
|
40
40
|
elsif index1 == @chars.length
|
41
41
|
chunks << [w0, w1]
|
42
42
|
end
|
43
|
-
|
43
|
+
end
|
44
44
|
elsif index0 == @chars.length
|
45
45
|
chunks << [w0]
|
46
46
|
end
|
47
|
-
|
47
|
+
end
|
48
48
|
|
49
49
|
chunks
|
50
50
|
end
|
data/lib/rmmseg/dictionary.rb
CHANGED
data/lib/rmmseg/ferret.rb
CHANGED
@@ -39,12 +39,7 @@ module RMMSeg
|
|
39
39
|
|
40
40
|
# Get next token
|
41
41
|
def next
|
42
|
-
|
43
|
-
if tk.nil?
|
44
|
-
nil
|
45
|
-
else
|
46
|
-
::Ferret::Analysis::Token.new(tk.text, tk.start_pos, tk.end_pos)
|
47
|
-
end
|
42
|
+
@algor.next_token
|
48
43
|
end
|
49
44
|
|
50
45
|
# Get the text being tokenized
|
@@ -91,13 +86,11 @@ module RMMSeg
|
|
91
86
|
|
92
87
|
# Get next token, skip stand alone Chinese punctuations.
|
93
88
|
def next
|
94
|
-
token =
|
89
|
+
token = @stream.next
|
95
90
|
dic = Dictionary.instance
|
96
|
-
loop do
|
97
|
-
token = @stream.next
|
98
|
-
break if token.nil?
|
99
91
|
|
100
|
-
|
92
|
+
until token.nil? || !(dic.include? token.text)
|
93
|
+
token = @stream.next
|
101
94
|
end
|
102
95
|
|
103
96
|
token
|
data/lib/rmmseg/token.rb
CHANGED
@@ -2,31 +2,52 @@ module RMMSeg
|
|
2
2
|
# A Token consists of a term's text and the start and end offset
|
3
3
|
# of the term.
|
4
4
|
class Token
|
5
|
-
# Text of the token.
|
6
|
-
def text
|
7
|
-
@text[@start_pos...@end_pos]
|
8
|
-
end
|
9
|
-
|
10
5
|
# Does this token contain any characters?
|
11
6
|
def empty?
|
12
|
-
@
|
7
|
+
@start == @end
|
13
8
|
end
|
14
9
|
|
10
|
+
# The text of the token
|
11
|
+
attr_accessor :text
|
12
|
+
|
15
13
|
# The start position of the token. This is *byte* index instead of
|
16
14
|
# character.
|
17
|
-
|
15
|
+
attr_accessor :start
|
18
16
|
|
19
17
|
# The one greater than the position of the last byte of the
|
20
18
|
# token. This is *byte* index instead of character.
|
21
|
-
|
19
|
+
attr_accessor :end
|
20
|
+
|
21
|
+
# See Ferret document for Token.
|
22
|
+
attr_accessor :pos_inc
|
22
23
|
|
23
24
|
# +text+ is the string held by this token; +start_pos+ and
|
24
25
|
# +end_pos+ are its *byte* offsets within the text being
|
25
26
|
# segmented.
|
26
27
|
def initialize(text, start_pos, end_pos)
|
27
28
|
@text = text
|
28
|
-
@
|
29
|
-
@
|
29
|
+
@start = start_pos
|
30
|
+
@end = end_pos
|
31
|
+
@pos_inc = 1
|
32
|
+
end
|
33
|
+
|
34
|
+
def <=> other
|
35
|
+
if @start > other.start
|
36
|
+
return 1
|
37
|
+
elsif @start < other.start
|
38
|
+
return -1
|
39
|
+
elsif @end > other.end
|
40
|
+
return 1
|
41
|
+
elsif @end < other.end
|
42
|
+
return -1
|
43
|
+
else
|
44
|
+
return @text <=> other.text
|
45
|
+
end
|
46
|
+
end
|
47
|
+
include Comparable
|
48
|
+
|
49
|
+
def to_s
|
50
|
+
@text.dup
|
30
51
|
end
|
31
52
|
end
|
32
53
|
end
|
data/lib/rmmseg.rb
CHANGED
@@ -30,8 +30,8 @@ describe "simple algorithm" do
|
|
30
30
|
3.times { algor.next_token }
|
31
31
|
token = algor.next_token
|
32
32
|
token.text.should == "paragraph"
|
33
|
-
token.
|
34
|
-
token.
|
33
|
+
token.start.should == 10
|
34
|
+
token.end.should == 19
|
35
35
|
end
|
36
36
|
|
37
37
|
it "should handle byte positions of Chinese well" do
|
@@ -40,7 +40,7 @@ describe "simple algorithm" do
|
|
40
40
|
2.times { algor.next_token }
|
41
41
|
token = algor.next_token
|
42
42
|
token.text.should == "中文"
|
43
|
-
token.
|
44
|
-
token.
|
43
|
+
token.start.should == 12
|
44
|
+
token.end.should == 18
|
45
45
|
end
|
46
46
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rmmseg
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- pluskid
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2008-02-
|
12
|
+
date: 2008-02-27 00:00:00 -08:00
|
13
13
|
default_executable:
|
14
14
|
dependencies: []
|
15
15
|
|