rmmseg 0.1.2 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
data/History.txt CHANGED
@@ -1,3 +1,9 @@
1
+ === 0.1.3 / 2008-02-28
2
+
3
+ * Make RMMSeg Token compatible with Ferret Token.
4
+ * Use while instead of loop for performance improvement.
5
+ * Avoid many costly String#jlength calls for performance improvement (uses only 70% of the time and 40% of the memory as before).
6
+
1
7
  === 0.1.2 / 2008-02-25
2
8
 
3
9
  * Add cache to find_match_words: performance improved.
data/TODO.txt CHANGED
@@ -1,4 +1,7 @@
1
1
  === TODO
2
2
 
3
+ * Release 0.1.3 before adding C stuff.
4
+ * Implement a C version of jcode.
5
+ * Implement a C version of string_ref.
3
6
  * Avoid Memory Leak
4
7
  * Improve Performance
@@ -9,6 +9,8 @@ module RMMSeg
9
9
  # words. This module is the common operations shared by
10
10
  # SimpleAlgorithm and ComplexAlgorithm .
11
11
  module Algorithm
12
+ MATCH_CACHE_MAX_LENGTH = 3
13
+
12
14
  # Initialize a new instance of Algorithm, the +text+ will
13
15
  # then be segmented by this instance.
14
16
  def initialize(text)
@@ -16,7 +18,8 @@ module RMMSeg
16
18
  @chars = text.each_char
17
19
  @index = 0
18
20
  @byte_index = 0
19
- @match_cache = Array.new
21
+ @match_cache = Array.new(MATCH_CACHE_MAX_LENGTH)
22
+ @match_cache_idx = 0
20
23
  end
21
24
 
22
25
  # Get the next Token recognized.
@@ -45,10 +48,11 @@ module RMMSeg
45
48
  # of words.
46
49
  def segment
47
50
  words = Array.new
48
- loop do
49
- token = next_token
50
- break if token.nil?
51
+
52
+ token = next_token
53
+ until token.nil?
51
54
  words << token.text
55
+ token = next_token
52
56
  end
53
57
 
54
58
  words
@@ -83,7 +87,7 @@ module RMMSeg
83
87
  @byte_index += i - @index
84
88
  @index = i
85
89
 
86
- return Token.new(@text, start_pos, end_pos)
90
+ return Token.new(@text[start_pos...end_pos], start_pos, end_pos)
87
91
  end
88
92
 
89
93
  # Use rules to filter the +chunks+ to get the most
@@ -103,7 +107,7 @@ module RMMSeg
103
107
  end
104
108
 
105
109
  word = chunks[0][0]
106
- token = Token.new(@text, @byte_index, @byte_index+word.byte_size)
110
+ token = Token.new(word.text, @byte_index, @byte_index+word.byte_size)
107
111
 
108
112
  @index += word.length
109
113
  @byte_index += word.byte_size
@@ -123,25 +127,31 @@ module RMMSeg
123
127
 
124
128
  dic = Dictionary.instance
125
129
  str = String.new
130
+ strlen = 0
126
131
  words = Array.new
127
132
  i = index
128
-
129
- loop do
130
- break if i >= chars.length || basic_latin?(chars[i])
133
+
134
+ while i < chars.length &&
135
+ !basic_latin?(chars[i]) &&
136
+ strlen < Config.max_word_length
137
+
131
138
  str << chars[i]
139
+ strlen += 1
140
+
132
141
  if dic.has_word?(str)
133
142
  words << dic.get_word(str)
134
143
  end
135
144
  i += 1
136
- break if str.jlength >= Config.max_word_length
137
145
  end
138
146
 
139
147
  if words.empty?
140
148
  words << Word.new(chars[index], Word::TYPES[:unrecognized])
141
149
  end
142
150
 
143
- @match_cache << [index, words]
144
- @match_cache.shift if @match_cache.length > 4
151
+ @match_cache[@match_cache_idx] = [index, words]
152
+ @match_cache_idx += 1
153
+ @match_cache_idx = 0 if @match_cache_idx == MATCH_CACHE_MAX_LENGTH
154
+
145
155
  words
146
156
  end
147
157
 
data/lib/rmmseg/chunk.rb CHANGED
@@ -4,7 +4,11 @@ module RMMSeg
4
4
 
5
5
  # The sum of length of all words.
6
6
  def self.total_length(words)
7
- words.inject(0) { |len, word| len + word.length }
7
+ len = 0
8
+ for word in words
9
+ len += word.length
10
+ end
11
+ len
8
12
  end
9
13
 
10
14
  # The average length of words.
@@ -15,21 +19,23 @@ module RMMSeg
15
19
  # The square of the standard deviation of length of all words.
16
20
  def self.variance(words)
17
21
  avglen = average_length(words)
18
- Math.sqrt(words.inject(0.0) { |sqr_sum, word|
19
- tmp = word.length - avglen
20
- sqr_sum + tmp*tmp
21
- })
22
+ sqr_sum = 0.0
23
+ for word in words
24
+ tmp = word.length - avglen
25
+ sqr_sum += tmp*tmp
26
+ end
27
+ Math.sqrt(sqr_sum)
22
28
  end
23
29
 
24
30
  # The sum of all frequencies of one-character words.
25
31
  def self.degree_of_morphemic_freedom(words)
26
- words.inject(0) { |sum, word|
32
+ sum = 0
33
+ for word in words
27
34
  if word.length == 1 && word.type == Word::TYPES[:cjk_word]
28
- sum + word.frequency
29
- else
30
- sum
35
+ sum += word.frequency
31
36
  end
32
- }
37
+ end
38
+ sum
33
39
  end
34
40
  end
35
41
  end
@@ -24,27 +24,27 @@ module RMMSeg
24
24
  # starting from +@index+ .
25
25
  def create_chunks
26
26
  chunks = Array.new
27
- find_match_words(@chars, @index).each { |w0|
27
+ for w0 in find_match_words(@chars, @index)
28
28
  index0 = @index + w0.length
29
29
  if index0 < @chars.length
30
- find_match_words(@chars, index0).each { |w1|
30
+ for w1 in find_match_words(@chars, index0)
31
31
  index1 = index0 + w1.length
32
32
  if index1 < @chars.length
33
- find_match_words(@chars, index1).each { |w2|
33
+ for w2 in find_match_words(@chars, index1)
34
34
  if w2.type == Word::TYPES[:unrecognized]
35
35
  chunks << [w0, w1]
36
36
  else
37
37
  chunks << [w0, w1, w2]
38
38
  end
39
- }
39
+ end
40
40
  elsif index1 == @chars.length
41
41
  chunks << [w0, w1]
42
42
  end
43
- }
43
+ end
44
44
  elsif index0 == @chars.length
45
45
  chunks << [w0]
46
46
  end
47
- }
47
+ end
48
48
 
49
49
  chunks
50
50
  end
@@ -25,7 +25,7 @@ module RMMSeg
25
25
  if word == true
26
26
  word = Word.new(value.dup, Word::TYPES[:cjk_word])
27
27
  @dic[value] = word
28
- elsif word.is_a? String
28
+ elsif String === word
29
29
  word = Word.new(value.dup, Word::TYPES[:cjk_word], word.to_i)
30
30
  @dic[value] = word
31
31
  end
data/lib/rmmseg/ferret.rb CHANGED
@@ -39,12 +39,7 @@ module RMMSeg
39
39
 
40
40
  # Get next token
41
41
  def next
42
- tk = @algor.next_token
43
- if tk.nil?
44
- nil
45
- else
46
- ::Ferret::Analysis::Token.new(tk.text, tk.start_pos, tk.end_pos)
47
- end
42
+ @algor.next_token
48
43
  end
49
44
 
50
45
  # Get the text being tokenized
@@ -91,13 +86,11 @@ module RMMSeg
91
86
 
92
87
  # Get next token, skip stand alone Chinese punctuations.
93
88
  def next
94
- token = nil
89
+ token = @stream.next
95
90
  dic = Dictionary.instance
96
- loop do
97
- token = @stream.next
98
- break if token.nil?
99
91
 
100
- break unless dic.include? token.text
92
+ until token.nil? || !(dic.include? token.text)
93
+ token = @stream.next
101
94
  end
102
95
 
103
96
  token
data/lib/rmmseg/token.rb CHANGED
@@ -2,31 +2,52 @@ module RMMSeg
2
2
  # A Token consists of a term's text and the start and end offset
3
3
  # of the term.
4
4
  class Token
5
- # Text of the token.
6
- def text
7
- @text[@start_pos...@end_pos]
8
- end
9
-
10
5
  # Does this token contain any characters?
11
6
  def empty?
12
- @start_pos == @end_pos
7
+ @start == @end
13
8
  end
14
9
 
10
+ # The text of the token
11
+ attr_accessor :text
12
+
15
13
  # The start position of the token. This is *byte* index instead of
16
14
  # character.
17
- attr_reader :start_pos
15
+ attr_accessor :start
18
16
 
19
17
  # The one greater than the position of the last byte of the
20
18
  # token. This is *byte* index instead of character.
21
- attr_reader :end_pos
19
+ attr_accessor :end
20
+
21
+ # See Ferret document for Token.
22
+ attr_accessor :pos_inc
22
23
 
23
24
  # +text+ is the ref to the whole text. In other words:
24
25
  # +text[start_pos...end_pos]+ should be the string held by this
25
26
  # token.
26
27
  def initialize(text, start_pos, end_pos)
27
28
  @text = text
28
- @start_pos = start_pos
29
- @end_pos = end_pos
29
+ @start = start_pos
30
+ @end = end_pos
31
+ @pos_inc = 1
32
+ end
33
+
34
+ def <=> other
35
+ if @start > other.start
36
+ return 1
37
+ elsif @start < other.start
38
+ return -1
39
+ elsif @end > other.end
40
+ return 1
41
+ elsif @end < other.end
42
+ return -1
43
+ else
44
+ return @text <=> other.text
45
+ end
46
+ end
47
+ include Comparable
48
+
49
+ def to_s
50
+ @text.dup
30
51
  end
31
52
  end
32
53
  end
data/lib/rmmseg.rb CHANGED
@@ -6,7 +6,7 @@ require 'rmmseg/simple_algorithm'
6
6
  require 'rmmseg/complex_algorithm'
7
7
 
8
8
  module RMMSeg
9
- VERSION = '0.1.2'
9
+ VERSION = '0.1.3'
10
10
 
11
11
  # Segment +text+ using the algorithm configured.
12
12
  def segment(text)
@@ -30,8 +30,8 @@ describe "simple algorithm" do
30
30
  3.times { algor.next_token }
31
31
  token = algor.next_token
32
32
  token.text.should == "paragraph"
33
- token.start_pos.should == 10
34
- token.end_pos.should == 19
33
+ token.start.should == 10
34
+ token.end.should == 19
35
35
  end
36
36
 
37
37
  it "should handle byte positions of Chinese well" do
@@ -40,7 +40,7 @@ describe "simple algorithm" do
40
40
  2.times { algor.next_token }
41
41
  token = algor.next_token
42
42
  token.text.should == "中文"
43
- token.start_pos.should == 12
44
- token.end_pos.should == 18
43
+ token.start.should == 12
44
+ token.end.should == 18
45
45
  end
46
46
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rmmseg
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.1.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - pluskid
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2008-02-25 00:00:00 -08:00
12
+ date: 2008-02-27 00:00:00 -08:00
13
13
  default_executable:
14
14
  dependencies: []
15
15