rmmseg 0.1.5 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,9 @@
1
+ * Construct Ferret Token directly.
2
+
3
+ === 0.1.6 / 2008-03-16
4
+
5
+ * Optimize the simple algorithm: one time faster than before (i.e. about twice as fast), with less memory usage.
6
+
1
7
  === 0.1.5 / 2008-03-03
2
8
 
3
9
  * Bug fix: Ferret Token is not duck-typed. We need to construct a Ferret Token instead of reusing the RMMSeg Token.
@@ -6,7 +6,7 @@ require 'rmmseg/simple_algorithm'
6
6
  require 'rmmseg/complex_algorithm'
7
7
 
8
8
  module RMMSeg
9
- VERSION = '0.1.5'
9
+ VERSION = '0.1.6'
10
10
 
11
11
  # Segment +text+ using the algorithm configured.
12
12
  def segment(text)
@@ -9,17 +9,15 @@ module RMMSeg
9
9
  # words. This module is the common operations shared by
10
10
  # SimpleAlgorithm and ComplexAlgorithm .
11
11
  module Algorithm
12
- MATCH_CACHE_MAX_LENGTH = 3
13
-
14
12
  # Initialize a new instance of Algorithm, the +text+ will
15
- # then be segmented by this instance.
16
- def initialize(text)
13
+ # then be segmented by this instance. +token+ is the class
14
+ # which will be used to construct the result token.
15
+ def initialize(text, token=Token)
17
16
  @text = text
18
17
  @chars = text.each_char
19
18
  @index = 0
20
19
  @byte_index = 0
21
- @match_cache = Array.new(MATCH_CACHE_MAX_LENGTH)
22
- @match_cache_idx = 0
20
+ @token = token
23
21
  end
24
22
 
25
23
  # Get the next Token recognized.
@@ -32,7 +30,7 @@ module RMMSeg
32
30
  token = get_cjk_word
33
31
  end
34
32
 
35
- if token.empty?
33
+ if token.start == token.end # empty
36
34
  return next_token
37
35
  else
38
36
  return token
@@ -82,7 +80,7 @@ module RMMSeg
82
80
  @byte_index += i - @index
83
81
  @index = i
84
82
 
85
- return Token.new(@text[start_pos...end_pos], start_pos, end_pos)
83
+ return @token.new(@text[start_pos...end_pos], start_pos, end_pos)
86
84
  end
87
85
 
88
86
  # Find all words occurring in the dictionary starting from
@@ -6,11 +6,13 @@ require 'rmmseg/lsdmfocw_rule'
6
6
 
7
7
  module RMMSeg
8
8
  class ComplexAlgorithm
9
+ MATCH_CACHE_MAX_LENGTH = 3
10
+
9
11
  include Algorithm
10
12
 
11
13
  # Create a new ComplexAlgorithm . Rules used by this algorithm
12
14
  # includes MMRule , LAWLRule , SVWLRule and LSDMFOCWRule .
13
- def initialize(text)
15
+ def initialize(text, token=Token)
14
16
  super
15
17
  @rules = [
16
18
  MMRule,
@@ -18,16 +20,13 @@ module RMMSeg
18
20
  SVWLRule,
19
21
  LSDMFOCWRule
20
22
  ]
23
+ @match_cache = Array.new(MATCH_CACHE_MAX_LENGTH)
24
+ @match_cache_idx = 0
21
25
  end
22
26
 
23
27
  # Get the most proper CJK word.
24
28
  def get_cjk_word
25
- get_cjk_word_from_chunks(create_chunks)
26
- end
27
-
28
- # Use rules to filter the +chunks+ to get the most
29
- # apropos CJK word.
30
- def get_cjk_word_from_chunks(chunks)
29
+ chunks = create_chunks
31
30
  i = 0
32
31
  while i < @rules.length
33
32
  break if chunks.length < 2
@@ -42,7 +41,7 @@ module RMMSeg
42
41
  end
43
42
 
44
43
  word = chunks[0][0]
45
- token = Token.new(word.text, @byte_index, @byte_index+word.byte_size)
44
+ token = @token.new(word.text, @byte_index, @byte_index+word.byte_size)
46
45
 
47
46
  @index += word.length
48
47
  @byte_index += word.byte_size
@@ -78,5 +77,46 @@ module RMMSeg
78
77
 
79
78
  chunks
80
79
  end
80
+
81
+ # Find all words occurring in the dictionary starting from
82
+ # +index+ . The maximum word length is determined by
83
+ # +Config.max_word_length+ .
84
+ def find_match_words(index)
85
+ for i, w in @match_cache
86
+ if i == index
87
+ return w
88
+ end
89
+ end
90
+
91
+ dic = Dictionary.instance
92
+ str = String.new
93
+ strlen = 0
94
+ words = Array.new
95
+ i = index
96
+
97
+ while i < @chars.length &&
98
+ !basic_latin?(@chars[i]) &&
99
+ strlen < Config.max_word_length
100
+
101
+ str << @chars[i]
102
+ strlen += 1
103
+
104
+ if dic.has_word?(str)
105
+ words << dic.get_word(str)
106
+ end
107
+ i += 1
108
+ end
109
+
110
+ if words.empty?
111
+ words << Word.new(@chars[index], Word::TYPES[:unrecognized])
112
+ end
113
+
114
+ @match_cache[@match_cache_idx] = [index, words]
115
+ @match_cache_idx += 1
116
+ @match_cache_idx = 0 if @match_cache_idx == MATCH_CACHE_MAX_LENGTH
117
+
118
+ words
119
+ end
120
+
81
121
  end
82
122
  end
@@ -25,9 +25,11 @@ module RMMSeg
25
25
  @algorithm = algor
26
26
  end
27
27
  # Get an instance of the algorithm object corresponding to the
28
- # algorithm name configured.
29
- def algorithm_instance(text)
30
- RMMSeg.const_get("#{@algorithm}".capitalize+"Algorithm").new(text)
28
+ # algorithm name configured. +tok+ is the class of the token object
29
+ # to be returned. For example, if you want to use with Ferret, you
30
+ # should provide +::Ferret::Analysis::Token+ .
31
+ def algorithm_instance(text, tok=Token)
32
+ RMMSeg.const_get("#{@algorithm}".capitalize+"Algorithm").new(text, tok)
31
33
  end
32
34
 
33
35
  # Get the behavior description when an unresolved ambiguity occurred.
@@ -39,11 +39,7 @@ module RMMSeg
39
39
 
40
40
  # Get next token
41
41
  def next
42
- tok = @algor.next_token
43
- if tok
44
- tok = ::Ferret::Analysis::Token.new(tok.text, tok.start, tok.end)
45
- end
46
- tok
42
+ @algor.next_token
47
43
  end
48
44
 
49
45
  # Get the text being tokenized
@@ -54,7 +50,8 @@ module RMMSeg
54
50
  # Set the text to be tokenized
55
51
  def text=(str)
56
52
  @text = str
57
- @algor = RMMSeg::Config.algorithm_instance(@text)
53
+ @algor = RMMSeg::Config.algorithm_instance(@text,
54
+ ::Ferret::Analysis::Token)
58
55
  end
59
56
  end
60
57
 
@@ -7,17 +7,29 @@ module RMMSeg
7
7
 
8
8
  # Create a new SimpleAlgorithm . The only rule used by this
9
9
  # algorithm is MMRule .
10
- def initialize(text)
10
+ def initialize(text, token=Token)
11
11
  super
12
12
  end
13
13
 
14
14
  # Get the most proper CJK word.
15
15
  def get_cjk_word
16
- word = find_match_words(@index).last
17
- token = Token.new(word.text, @byte_index, @byte_index+word.byte_size)
16
+ dic = Dictionary.instance
17
+ i = Config.max_word_length
18
+ if i + @index > @chars.length
19
+ i = @chars.length - @index
20
+ end
21
+ chars = @chars[@index, i]
22
+ word = chars.join
18
23
 
19
- @index += word.length
20
- @byte_index += word.byte_size
24
+ while i > 1 && !dic.has_word?(word)
25
+ i -= 1
26
+ word.slice!(-chars[i].size,chars[i].size) # truncate last char
27
+ end
28
+
29
+ token = @token.new(word, @byte_index, @byte_index+word.size)
30
+
31
+ @index += i
32
+ @byte_index += word.size
21
33
 
22
34
  return token
23
35
  end
@@ -2,11 +2,6 @@ module RMMSeg
2
2
  # A Token consists of a term's text and the start and end offset
3
3
  # of the term.
4
4
  class Token
5
- # Does this token contain any characters?
6
- def empty?
7
- @start == @end
8
- end
9
-
10
5
  # The text of the token
11
6
  attr_accessor :text
12
7
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rmmseg
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.5
4
+ version: 0.1.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - pluskid
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2008-03-04 00:00:00 +00:00
12
+ date: 2008-03-16 00:00:00 +00:00
13
13
  default_executable:
14
14
  dependencies: []
15
15