rmmseg 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/History.txt CHANGED
@@ -1,3 +1,9 @@
1
+ === 0.1.3 / 2008-02-28
2
+
3
+ * Make RMMSeg Token compatible with Ferret Token.
4
+ * Use while instead of loop for performance improvement.
5
+ * Avoid many costly String#jlength call for performance improvement (use only 70% time and 40% memory as before).
6
+
1
7
  === 0.1.2 / 2008-02-25
2
8
 
3
9
  * Add cache to find_match_words: performance improved.
data/TODO.txt CHANGED
@@ -1,4 +1,7 @@
1
1
  === TODO
2
2
 
3
+ * Release 0.1.3 before adding C stuff.
4
+ * Implement a C version of jcode.
5
+ * Implement a C version of string_ref.
3
6
  * Avoid Memory Leak
4
7
  * Improve Performance
@@ -9,6 +9,8 @@ module RMMSeg
9
9
  # words. This module is the common operations shared by
10
10
  # SimpleAlgorithm and ComplexAlgorithm .
11
11
  module Algorithm
12
+ MATCH_CACHE_MAX_LENGTH = 3
13
+
12
14
  # Initialize a new instance of Algorithm, the +text+ will
13
15
  # then be segmented by this instance.
14
16
  def initialize(text)
@@ -16,7 +18,8 @@ module RMMSeg
16
18
  @chars = text.each_char
17
19
  @index = 0
18
20
  @byte_index = 0
19
- @match_cache = Array.new
21
+ @match_cache = Array.new(MATCH_CACHE_MAX_LENGTH)
22
+ @match_cache_idx = 0
20
23
  end
21
24
 
22
25
  # Get the next Token recognized.
@@ -45,10 +48,11 @@ module RMMSeg
45
48
  # of words.
46
49
  def segment
47
50
  words = Array.new
48
- loop do
49
- token = next_token
50
- break if token.nil?
51
+
52
+ token = next_token
53
+ until token.nil?
51
54
  words << token.text
55
+ token = next_token
52
56
  end
53
57
 
54
58
  words
@@ -83,7 +87,7 @@ module RMMSeg
83
87
  @byte_index += i - @index
84
88
  @index = i
85
89
 
86
- return Token.new(@text, start_pos, end_pos)
90
+ return Token.new(@text[start_pos...end_pos], start_pos, end_pos)
87
91
  end
88
92
 
89
93
  # Use rules to filter the +chunks+ to get the most
@@ -103,7 +107,7 @@ module RMMSeg
103
107
  end
104
108
 
105
109
  word = chunks[0][0]
106
- token = Token.new(@text, @byte_index, @byte_index+word.byte_size)
110
+ token = Token.new(word.text, @byte_index, @byte_index+word.byte_size)
107
111
 
108
112
  @index += word.length
109
113
  @byte_index += word.byte_size
@@ -123,25 +127,31 @@ module RMMSeg
123
127
 
124
128
  dic = Dictionary.instance
125
129
  str = String.new
130
+ strlen = 0
126
131
  words = Array.new
127
132
  i = index
128
-
129
- loop do
130
- break if i >= chars.length || basic_latin?(chars[i])
133
+
134
+ while i < chars.length &&
135
+ !basic_latin?(chars[i]) &&
136
+ strlen < Config.max_word_length
137
+
131
138
  str << chars[i]
139
+ strlen += 1
140
+
132
141
  if dic.has_word?(str)
133
142
  words << dic.get_word(str)
134
143
  end
135
144
  i += 1
136
- break if str.jlength >= Config.max_word_length
137
145
  end
138
146
 
139
147
  if words.empty?
140
148
  words << Word.new(chars[index], Word::TYPES[:unrecognized])
141
149
  end
142
150
 
143
- @match_cache << [index, words]
144
- @match_cache.shift if @match_cache.length > 4
151
+ @match_cache[@match_cache_idx] = [index, words]
152
+ @match_cache_idx += 1
153
+ @match_cache_idx = 0 if @match_cache_idx == MATCH_CACHE_MAX_LENGTH
154
+
145
155
  words
146
156
  end
147
157
 
data/lib/rmmseg/chunk.rb CHANGED
@@ -4,7 +4,11 @@ module RMMSeg
4
4
 
5
5
  # The sum of length of all words.
6
6
  def self.total_length(words)
7
- words.inject(0) { |len, word| len + word.length }
7
+ len = 0
8
+ for word in words
9
+ len += word.length
10
+ end
11
+ len
8
12
  end
9
13
 
10
14
  # The average length of words.
@@ -15,21 +19,23 @@ module RMMSeg
15
19
  # The square of the standard deviation of length of all words.
16
20
  def self.variance(words)
17
21
  avglen = average_length(words)
18
- Math.sqrt(words.inject(0.0) { |sqr_sum, word|
19
- tmp = word.length - avglen
20
- sqr_sum + tmp*tmp
21
- })
22
+ sqr_sum = 0.0
23
+ for word in words
24
+ tmp = word.length - avglen
25
+ sqr_sum += tmp*tmp
26
+ end
27
+ Math.sqrt(sqr_sum)
22
28
  end
23
29
 
24
30
  # The sum of all frequencies of one-character words.
25
31
  def self.degree_of_morphemic_freedom(words)
26
- words.inject(0) { |sum, word|
32
+ sum = 0
33
+ for word in words
27
34
  if word.length == 1 && word.type == Word::TYPES[:cjk_word]
28
- sum + word.frequency
29
- else
30
- sum
35
+ sum += word.frequency
31
36
  end
32
- }
37
+ end
38
+ sum
33
39
  end
34
40
  end
35
41
  end
@@ -24,27 +24,27 @@ module RMMSeg
24
24
  # starting from +@index+ .
25
25
  def create_chunks
26
26
  chunks = Array.new
27
- find_match_words(@chars, @index).each { |w0|
27
+ for w0 in find_match_words(@chars, @index)
28
28
  index0 = @index + w0.length
29
29
  if index0 < @chars.length
30
- find_match_words(@chars, index0).each { |w1|
30
+ for w1 in find_match_words(@chars, index0)
31
31
  index1 = index0 + w1.length
32
32
  if index1 < @chars.length
33
- find_match_words(@chars, index1).each { |w2|
33
+ for w2 in find_match_words(@chars, index1)
34
34
  if w2.type == Word::TYPES[:unrecognized]
35
35
  chunks << [w0, w1]
36
36
  else
37
37
  chunks << [w0, w1, w2]
38
38
  end
39
- }
39
+ end
40
40
  elsif index1 == @chars.length
41
41
  chunks << [w0, w1]
42
42
  end
43
- }
43
+ end
44
44
  elsif index0 == @chars.length
45
45
  chunks << [w0]
46
46
  end
47
- }
47
+ end
48
48
 
49
49
  chunks
50
50
  end
@@ -25,7 +25,7 @@ module RMMSeg
25
25
  if word == true
26
26
  word = Word.new(value.dup, Word::TYPES[:cjk_word])
27
27
  @dic[value] = word
28
- elsif word.is_a? String
28
+ elsif String === word
29
29
  word = Word.new(value.dup, Word::TYPES[:cjk_word], word.to_i)
30
30
  @dic[value] = word
31
31
  end
data/lib/rmmseg/ferret.rb CHANGED
@@ -39,12 +39,7 @@ module RMMSeg
39
39
 
40
40
  # Get next token
41
41
  def next
42
- tk = @algor.next_token
43
- if tk.nil?
44
- nil
45
- else
46
- ::Ferret::Analysis::Token.new(tk.text, tk.start_pos, tk.end_pos)
47
- end
42
+ @algor.next_token
48
43
  end
49
44
 
50
45
  # Get the text being tokenized
@@ -91,13 +86,11 @@ module RMMSeg
91
86
 
92
87
  # Get next token, skip stand alone Chinese punctuations.
93
88
  def next
94
- token = nil
89
+ token = @stream.next
95
90
  dic = Dictionary.instance
96
- loop do
97
- token = @stream.next
98
- break if token.nil?
99
91
 
100
- break unless dic.include? token.text
92
+ until token.nil? || !(dic.include? token.text)
93
+ token = @stream.next
101
94
  end
102
95
 
103
96
  token
data/lib/rmmseg/token.rb CHANGED
@@ -2,31 +2,52 @@ module RMMSeg
2
2
  # A Token consists of a term's text and the start and end offset
3
3
  # of the term.
4
4
  class Token
5
- # Text of the token.
6
- def text
7
- @text[@start_pos...@end_pos]
8
- end
9
-
10
5
  # Does this token contain any characters?
11
6
  def empty?
12
- @start_pos == @end_pos
7
+ @start == @end
13
8
  end
14
9
 
10
+ # The text of the token
11
+ attr_accessor :text
12
+
15
13
  # The start position of the token. This is *byte* index instead of
16
14
  # character.
17
- attr_reader :start_pos
15
+ attr_accessor :start
18
16
 
19
17
  # The one greater than the position of the last byte of the
20
18
  # token. This is *byte* index instead of character.
21
- attr_reader :end_pos
19
+ attr_accessor :end
20
+
21
+ # See Ferret document for Token.
22
+ attr_accessor :pos_inc
22
23
 
23
24
  # +text+ is the ref to the whole text. In other words:
24
25
  # +text[start_pos...end_pos]+ should be the string held by this
25
26
  # token.
26
27
  def initialize(text, start_pos, end_pos)
27
28
  @text = text
28
- @start_pos = start_pos
29
- @end_pos = end_pos
29
+ @start = start_pos
30
+ @end = end_pos
31
+ @pos_inc = 1
32
+ end
33
+
34
+ def <=> other
35
+ if @start > other.start
36
+ return 1
37
+ elsif @start < other.start
38
+ return -1
39
+ elsif @end > other.end
40
+ return 1
41
+ elsif @end < other.end
42
+ return -1
43
+ else
44
+ return @text <=> other.text
45
+ end
46
+ end
47
+ include Comparable
48
+
49
+ def to_s
50
+ @text.dup
30
51
  end
31
52
  end
32
53
  end
data/lib/rmmseg.rb CHANGED
@@ -6,7 +6,7 @@ require 'rmmseg/simple_algorithm'
6
6
  require 'rmmseg/complex_algorithm'
7
7
 
8
8
  module RMMSeg
9
- VERSION = '0.1.2'
9
+ VERSION = '0.1.3'
10
10
 
11
11
  # Segment +text+ using the algorithm configured.
12
12
  def segment(text)
@@ -30,8 +30,8 @@ describe "simple algorithm" do
30
30
  3.times { algor.next_token }
31
31
  token = algor.next_token
32
32
  token.text.should == "paragraph"
33
- token.start_pos.should == 10
34
- token.end_pos.should == 19
33
+ token.start.should == 10
34
+ token.end.should == 19
35
35
  end
36
36
 
37
37
  it "should handle byte positions of Chinese well" do
@@ -40,7 +40,7 @@ describe "simple algorithm" do
40
40
  2.times { algor.next_token }
41
41
  token = algor.next_token
42
42
  token.text.should == "中文"
43
- token.start_pos.should == 12
44
- token.end_pos.should == 18
43
+ token.start.should == 12
44
+ token.end.should == 18
45
45
  end
46
46
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rmmseg
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.1.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - pluskid
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2008-02-25 00:00:00 -08:00
12
+ date: 2008-02-27 00:00:00 -08:00
13
13
  default_executable:
14
14
  dependencies: []
15
15