rmmseg 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/History.txt CHANGED
@@ -1,3 +1,8 @@
1
+ === 0.1.4 / 2008-03-02
2
+
3
+ * Let users store their customized words in the Dictionary after it has been loaded.
4
+ * Improved performance of SimpleAlgorithm.
5
+
1
6
  === 0.1.3 / 2008-02-28
2
7
 
3
8
  * Make RMMSeg Token compatible with Ferret Token.
data/TODO.txt CHANGED
@@ -1,7 +1,4 @@
1
1
  === TODO
2
2
 
3
- * Release 0.1.3 before adding C staffs.
4
- * Implement a C version of jcode.
5
- * Implement a C version of string_ref.
6
3
  * Avoid Memory Leak
7
4
  * Improve Performance
@@ -26,15 +26,10 @@ module RMMSeg
26
26
  def next_token
27
27
  return nil if @index >= @chars.length
28
28
 
29
- current = @chars[@index]
30
- orig_index = @index
31
- token = nil
32
- len = 0
33
-
34
- if basic_latin?(current)
29
+ if basic_latin?(@chars[@index])
35
30
  token = get_basic_latin_word
36
31
  else
37
- token = get_cjk_word(create_chunks)
32
+ token = get_cjk_word
38
33
  end
39
34
 
40
35
  if token.empty?
@@ -90,35 +85,10 @@ module RMMSeg
90
85
  return Token.new(@text[start_pos...end_pos], start_pos, end_pos)
91
86
  end
92
87
 
93
- # Use rules to filter the +chunks+ to get the most
94
- # apropos CJK word.
95
- def get_cjk_word(chunks)
96
- i = 0
97
- while i < @rules.length
98
- break if chunks.length < 2
99
- chunks = @rules[i].filter(chunks)
100
- i += 1
101
- end
102
-
103
- if chunks.length > 1
104
- if Config.on_ambiguity == :raise_exception
105
- raise Ambiguity, "Can't solve ambiguity on #{chunks}"
106
- end
107
- end
108
-
109
- word = chunks[0][0]
110
- token = Token.new(word.text, @byte_index, @byte_index+word.byte_size)
111
-
112
- @index += word.length
113
- @byte_index += word.byte_size
114
-
115
- return token
116
- end
117
-
118
88
  # Find all words occurring in the dictionary starting from
119
89
  # +index+ . The maximum word length is determined by
120
90
  # +Config.max_word_length+ .
121
- def find_match_words(chars, index)
91
+ def find_match_words(index)
122
92
  for i, w in @match_cache
123
93
  if i == index
124
94
  return w
@@ -131,11 +101,11 @@ module RMMSeg
131
101
  words = Array.new
132
102
  i = index
133
103
 
134
- while i < chars.length &&
135
- !basic_latin?(chars[i]) &&
104
+ while i < @chars.length &&
105
+ !basic_latin?(@chars[i]) &&
136
106
  strlen < Config.max_word_length
137
107
 
138
- str << chars[i]
108
+ str << @chars[i]
139
109
  strlen += 1
140
110
 
141
111
  if dic.has_word?(str)
@@ -145,7 +115,7 @@ module RMMSeg
145
115
  end
146
116
 
147
117
  if words.empty?
148
- words << Word.new(chars[index], Word::TYPES[:unrecognized])
118
+ words << Word.new(@chars[index], Word::TYPES[:unrecognized])
149
119
  end
150
120
 
151
121
  @match_cache[@match_cache_idx] = [index, words]
@@ -13,24 +13,54 @@ module RMMSeg
13
13
  def initialize(text)
14
14
  super
15
15
  @rules = [
16
- MMRule.new,
17
- LAWLRule.new,
18
- SVWLRule.new,
19
- LSDMFOCWRule.new
16
+ MMRule,
17
+ LAWLRule,
18
+ SVWLRule,
19
+ LSDMFOCWRule
20
20
  ]
21
21
  end
22
22
 
23
+ # Get the most proper CJK word.
24
+ def get_cjk_word
25
+ get_cjk_word_from_chunks(create_chunks)
26
+ end
27
+
28
+ # Use rules to filter the +chunks+ to get the most
29
+ # apropos CJK word.
30
+ def get_cjk_word_from_chunks(chunks)
31
+ i = 0
32
+ while i < @rules.length
33
+ break if chunks.length < 2
34
+ chunks = @rules[i].filter(chunks)
35
+ i += 1
36
+ end
37
+
38
+ if chunks.length > 1
39
+ if Config.on_ambiguity == :raise_exception
40
+ raise Ambiguity, "Can't solve ambiguity on #{chunks}"
41
+ end
42
+ end
43
+
44
+ word = chunks[0][0]
45
+ token = Token.new(word.text, @byte_index, @byte_index+word.byte_size)
46
+
47
+ @index += word.length
48
+ @byte_index += word.byte_size
49
+
50
+ return token
51
+ end
52
+
23
53
  # Create all possible three-word (or less) chunks
24
54
  # starting from +@index+ .
25
55
  def create_chunks
26
56
  chunks = Array.new
27
- for w0 in find_match_words(@chars, @index)
57
+ for w0 in find_match_words(@index)
28
58
  index0 = @index + w0.length
29
59
  if index0 < @chars.length
30
- for w1 in find_match_words(@chars, index0)
60
+ for w1 in find_match_words(index0)
31
61
  index1 = index0 + w1.length
32
62
  if index1 < @chars.length
33
- for w2 in find_match_words(@chars, index1)
63
+ for w2 in find_match_words(index1)
34
64
  if w2.type == Word::TYPES[:unrecognized]
35
65
  chunks << [w0, w1]
36
66
  else
@@ -18,6 +18,16 @@ module RMMSeg
18
18
  @dic.has_key?(value)
19
19
  end
20
20
 
21
+ # Store a new word to dictionary.
22
+ # +w+ may be:
23
+ # * an instance of Word.
24
+ # * +true+, then this is a normal word.
25
+ # * a String(which can be converted to a Number) or Number.
26
+ # The number is the frequency of the word.
27
+ def store_word(key, w=true)
28
+ @dic[key] = w
29
+ end
30
+
21
31
  # Get an instance of Word corresponding to +value+ .
22
32
  def get_word(value)
23
33
  word = @dic[value]
@@ -3,7 +3,7 @@ require 'rmmseg/rule_helper'
3
3
  module RMMSeg
4
4
  # Largest average word length rule.
5
5
  class LAWLRule
6
- def filter(chunks)
6
+ def self.filter(chunks)
7
7
  chunks.take_highest { |a, b|
8
8
  Chunk::average_length(a) <=> Chunk::average_length(b)
9
9
  }
@@ -4,7 +4,7 @@ module RMMSeg
4
4
  # Largest sum of degree of morphemic freedom of one-character
5
5
  # words rule.
6
6
  class LSDMFOCWRule
7
- def filter(chunks)
7
+ def self.filter(chunks)
8
8
  chunks.take_highest { |a, b|
9
9
  Chunk::degree_of_morphemic_freedom(a) <=> Chunk::degree_of_morphemic_freedom(b)
10
10
  }
@@ -4,7 +4,7 @@ module RMMSeg
4
4
  # Maximum matching rule, select the chunks with the
5
5
  # maximum length.
6
6
  class MMRule
7
- def filter(chunks)
7
+ def self.filter(chunks)
8
8
  chunks.take_highest { |a, b|
9
9
  Chunk::total_length(a) <=> Chunk::total_length(b)
10
10
  }
@@ -9,14 +9,17 @@ module RMMSeg
9
9
  # algorithm is MMRule .
10
10
  def initialize(text)
11
11
  super
12
- @rules = [ MMRule.new ]
13
12
  end
14
13
 
15
- # Create all possible one-word chunks starting from +@index+ .
16
- def create_chunks
17
- find_match_words(@chars, @index).map { |word|
18
- [word]
19
- }
14
+ # Get the most proper CJK word.
15
+ def get_cjk_word
16
+ word = find_match_words(@index).last
17
+ token = Token.new(word.text, @byte_index, @byte_index+word.byte_size)
18
+
19
+ @index += word.length
20
+ @byte_index += word.byte_size
21
+
22
+ return token
20
23
  end
21
24
  end
22
25
  end
@@ -3,7 +3,7 @@ require 'rmmseg/rule_helper'
3
3
  module RMMSeg
4
4
  # Smallest variance of word length rule.
5
5
  class SVWLRule
6
- def filter(chunks)
6
+ def self.filter(chunks)
7
7
  chunks.take_highest { |a, b|
8
8
  Chunk::variance(b) <=> Chunk::variance(a)
9
9
  }
data/lib/rmmseg.rb CHANGED
@@ -6,7 +6,7 @@ require 'rmmseg/simple_algorithm'
6
6
  require 'rmmseg/complex_algorithm'
7
7
 
8
8
  module RMMSeg
9
- VERSION = '0.1.3'
9
+ VERSION = '0.1.4'
10
10
 
11
11
  # Segment +text+ using the algorithm configured.
12
12
  def segment(text)
@@ -8,7 +8,7 @@ describe "largest average word length rule" do
8
8
  gen_words(["国际", "化"]),
9
9
  gen_words(["国", "际", "化"])
10
10
  ]
11
- chunks = RMMSeg::LAWLRule.new.filter(chunks)
11
+ chunks = RMMSeg::LAWLRule.filter(chunks)
12
12
  chunks.length.should == 1
13
13
  chunks[0][0].text.should == "国际化"
14
14
  end
@@ -7,7 +7,7 @@ describe "largest sum of degree of morphemic freedom of one-character words rule
7
7
  gen_words(["主要", "是", "因为"], [nil, 100, nil]),
8
8
  gen_words(["主", "要是", "因为"], [10, nil, nil])
9
9
  ]
10
- chunks = RMMSeg::LSDMFOCWRule.new.filter(chunks)
10
+ chunks = RMMSeg::LSDMFOCWRule.filter(chunks)
11
11
  chunks.length.should == 1
12
12
  chunks[0][0].text.should == "主要"
13
13
  end
data/spec/mm_rule_spec.rb CHANGED
@@ -9,7 +9,7 @@ describe 'maximum matching rule' do
9
9
  gen_words(["眼看", "就要", "来"]),
10
10
  gen_words(["眼", "看", "就"])
11
11
  ]
12
- chunks = RMMSeg::MMRule.new.filter(chunks)
12
+ chunks = RMMSeg::MMRule.filter(chunks)
13
13
  chunks.length.should == 2
14
14
  end
15
15
  end
@@ -7,7 +7,7 @@ describe "smallest variance of word length rule" do
7
7
  gen_words(["研究", "生命", "起源"]),
8
8
  gen_words(["研究生", "命", "起源"])
9
9
  ]
10
- chunks = RMMSeg::SVWLRule.new.filter(chunks)
10
+ chunks = RMMSeg::SVWLRule.filter(chunks)
11
11
  chunks.length.should == 1
12
12
  chunks[0][0].text.should == "研究"
13
13
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rmmseg
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.1.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - pluskid
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2008-02-27 00:00:00 -08:00
12
+ date: 2008-03-02 00:00:00 +00:00
13
13
  default_executable:
14
14
  dependencies: []
15
15