rmmseg 0.1.5 → 0.1.6

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,3 +1,9 @@
1
+ * Construct Ferret Token directly.
2
+
3
+ === 0.1.6 / 2008-03-16
4
+
5
+ * Optimize the simple algorithm. It is one time faster than before and uses less memory.
6
+
1
7
  === 0.1.5 / 2008-03-03
2
8
 
3
9
  * Bug fix: Ferret Token is not duck-typed. We need to construct a Ferret Token instead of reusing the RMMSeg Token.
@@ -6,7 +6,7 @@ require 'rmmseg/simple_algorithm'
6
6
  require 'rmmseg/complex_algorithm'
7
7
 
8
8
  module RMMSeg
9
- VERSION = '0.1.5'
9
+ VERSION = '0.1.6'
10
10
 
11
11
  # Segment +text+ using the algorithm configured.
12
12
  def segment(text)
@@ -9,17 +9,15 @@ module RMMSeg
9
9
  # words. This module is the common operations shared by
10
10
  # SimpleAlgorithm and ComplexAlgorithm .
11
11
  module Algorithm
12
- MATCH_CACHE_MAX_LENGTH = 3
13
-
14
12
  # Initialize a new instance of Algorithm, the +text+ will
15
- # then be segmented by this instance.
16
- def initialize(text)
13
+ # then be segmented by this instance. +token+ is the class
14
+ # which will be used to construct the result token.
15
+ def initialize(text, token=Token)
17
16
  @text = text
18
17
  @chars = text.each_char
19
18
  @index = 0
20
19
  @byte_index = 0
21
- @match_cache = Array.new(MATCH_CACHE_MAX_LENGTH)
22
- @match_cache_idx = 0
20
+ @token = token
23
21
  end
24
22
 
25
23
  # Get the next Token recognized.
@@ -32,7 +30,7 @@ module RMMSeg
32
30
  token = get_cjk_word
33
31
  end
34
32
 
35
- if token.empty?
33
+ if token.start == token.end # empty
36
34
  return next_token
37
35
  else
38
36
  return token
@@ -82,7 +80,7 @@ module RMMSeg
82
80
  @byte_index += i - @index
83
81
  @index = i
84
82
 
85
- return Token.new(@text[start_pos...end_pos], start_pos, end_pos)
83
+ return @token.new(@text[start_pos...end_pos], start_pos, end_pos)
86
84
  end
87
85
 
88
86
  # Find all words occurring in the dictionary starting from
@@ -6,11 +6,13 @@ require 'rmmseg/lsdmfocw_rule'
6
6
 
7
7
  module RMMSeg
8
8
  class ComplexAlgorithm
9
+ MATCH_CACHE_MAX_LENGTH = 3
10
+
9
11
  include Algorithm
10
12
 
11
13
  # Create a new ComplexAlgorithm . Rules used by this algorithm
12
14
  # include MMRule , LAWLRule , SVWLRule and LSDMFOCWRule .
13
- def initialize(text)
15
+ def initialize(text, token=Token)
14
16
  super
15
17
  @rules = [
16
18
  MMRule,
@@ -18,16 +20,13 @@ module RMMSeg
18
20
  SVWLRule,
19
21
  LSDMFOCWRule
20
22
  ]
23
+ @match_cache = Array.new(MATCH_CACHE_MAX_LENGTH)
24
+ @match_cache_idx = 0
21
25
  end
22
26
 
23
27
  # Get the most proper CJK word.
24
28
  def get_cjk_word
25
- get_cjk_word_from_chunks(create_chunks)
26
- end
27
-
28
- # Use rules to filter the +chunks+ to get the most
29
- # apropos CJK word.
30
- def get_cjk_word_from_chunks(chunks)
29
+ chunks = create_chunks
31
30
  i = 0
32
31
  while i < @rules.length
33
32
  break if chunks.length < 2
@@ -42,7 +41,7 @@ module RMMSeg
42
41
  end
43
42
 
44
43
  word = chunks[0][0]
45
- token = Token.new(word.text, @byte_index, @byte_index+word.byte_size)
44
+ token = @token.new(word.text, @byte_index, @byte_index+word.byte_size)
46
45
 
47
46
  @index += word.length
48
47
  @byte_index += word.byte_size
@@ -78,5 +77,46 @@ module RMMSeg
78
77
 
79
78
  chunks
80
79
  end
80
+
81
+ # Find all words occurring in the dictionary starting from
82
+ # +index+ . The maximum word length is determined by
83
+ # +Config.max_word_length+ .
84
+ def find_match_words(index)
85
+ for i, w in @match_cache
86
+ if i == index
87
+ return w
88
+ end
89
+ end
90
+
91
+ dic = Dictionary.instance
92
+ str = String.new
93
+ strlen = 0
94
+ words = Array.new
95
+ i = index
96
+
97
+ while i < @chars.length &&
98
+ !basic_latin?(@chars[i]) &&
99
+ strlen < Config.max_word_length
100
+
101
+ str << @chars[i]
102
+ strlen += 1
103
+
104
+ if dic.has_word?(str)
105
+ words << dic.get_word(str)
106
+ end
107
+ i += 1
108
+ end
109
+
110
+ if words.empty?
111
+ words << Word.new(@chars[index], Word::TYPES[:unrecognized])
112
+ end
113
+
114
+ @match_cache[@match_cache_idx] = [index, words]
115
+ @match_cache_idx += 1
116
+ @match_cache_idx = 0 if @match_cache_idx == MATCH_CACHE_MAX_LENGTH
117
+
118
+ words
119
+ end
120
+
81
121
  end
82
122
  end
@@ -25,9 +25,11 @@ module RMMSeg
25
25
  @algorithm = algor
26
26
  end
27
27
  # Get an instance of the algorithm object corresponding to the
28
- # algorithm name configured.
29
- def algorithm_instance(text)
30
- RMMSeg.const_get("#{@algorithm}".capitalize+"Algorithm").new(text)
28
+ # algorithm name configured. +tok+ is the class of the token object
29
+ # to be returned. For example, if you want to use with Ferret, you
30
+ # should provide +::Ferret::Analysis::Token+ .
31
+ def algorithm_instance(text, tok=Token)
32
+ RMMSeg.const_get("#{@algorithm}".capitalize+"Algorithm").new(text, tok)
31
33
  end
32
34
 
33
35
  # Get the behavior description when an unresolved ambiguity occurred.
@@ -39,11 +39,7 @@ module RMMSeg
39
39
 
40
40
  # Get next token
41
41
  def next
42
- tok = @algor.next_token
43
- if tok
44
- tok = ::Ferret::Analysis::Token.new(tok.text, tok.start, tok.end)
45
- end
46
- tok
42
+ @algor.next_token
47
43
  end
48
44
 
49
45
  # Get the text being tokenized
@@ -54,7 +50,8 @@ module RMMSeg
54
50
  # Set the text to be tokenized
55
51
  def text=(str)
56
52
  @text = str
57
- @algor = RMMSeg::Config.algorithm_instance(@text)
53
+ @algor = RMMSeg::Config.algorithm_instance(@text,
54
+ ::Ferret::Analysis::Token)
58
55
  end
59
56
  end
60
57
 
@@ -7,17 +7,29 @@ module RMMSeg
7
7
 
8
8
  # Create a new SimpleAlgorithm . The only rule used by this
9
9
  # algorithm is MMRule .
10
- def initialize(text)
10
+ def initialize(text, token=Token)
11
11
  super
12
12
  end
13
13
 
14
14
  # Get the most proper CJK word.
15
15
  def get_cjk_word
16
- word = find_match_words(@index).last
17
- token = Token.new(word.text, @byte_index, @byte_index+word.byte_size)
16
+ dic = Dictionary.instance
17
+ i = Config.max_word_length
18
+ if i + @index > @chars.length
19
+ i = @chars.length - @index
20
+ end
21
+ chars = @chars[@index, i]
22
+ word = chars.join
18
23
 
19
- @index += word.length
20
- @byte_index += word.byte_size
24
+ while i > 1 && !dic.has_word?(word)
25
+ i -= 1
26
+ word.slice!(-chars[i].size,chars[i].size) # truncate last char
27
+ end
28
+
29
+ token = @token.new(word, @byte_index, @byte_index+word.size)
30
+
31
+ @index += i
32
+ @byte_index += word.size
21
33
 
22
34
  return token
23
35
  end
@@ -2,11 +2,6 @@ module RMMSeg
2
2
  # A Token consists of a term's text and the start and end offset
3
3
  # of the term.
4
4
  class Token
5
- # Does this token contain any characters?
6
- def empty?
7
- @start == @end
8
- end
9
-
10
5
  # The text of the token
11
6
  attr_accessor :text
12
7
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rmmseg
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.5
4
+ version: 0.1.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - pluskid
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2008-03-04 00:00:00 +00:00
12
+ date: 2008-03-16 00:00:00 +00:00
13
13
  default_executable:
14
14
  dependencies: []
15
15