rmmseg 0.1.3 → 0.1.4

Sign up to get free protection for your applications and to get access to all the features.
data/History.txt CHANGED
@@ -1,3 +1,8 @@
1
+ === 0.1.4 / 2008-03-02
2
+
3
+ * Let users store their customized words in the Dictionary after it has been loaded.
4
+ * Improved performance of SimpleAlgorithm.
5
+
1
6
  === 0.1.3 / 2008-02-28
2
7
 
3
8
  * Make RMMSeg Token compatible with Ferret Token.
data/TODO.txt CHANGED
@@ -1,7 +1,4 @@
1
1
  === TODO
2
2
 
3
- * Release 0.1.3 before adding C stuff.
4
- * Implement a C version of jcode.
5
- * Implement a C version of string_ref.
6
3
  * Avoid Memory Leak
7
4
  * Improve Performance
@@ -26,15 +26,10 @@ module RMMSeg
26
26
  def next_token
27
27
  return nil if @index >= @chars.length
28
28
 
29
- current = @chars[@index]
30
- orig_index = @index
31
- token = nil
32
- len = 0
33
-
34
- if basic_latin?(current)
29
+ if basic_latin?(@chars[@index])
35
30
  token = get_basic_latin_word
36
31
  else
37
- token = get_cjk_word(create_chunks)
32
+ token = get_cjk_word
38
33
  end
39
34
 
40
35
  if token.empty?
@@ -90,35 +85,10 @@ module RMMSeg
90
85
  return Token.new(@text[start_pos...end_pos], start_pos, end_pos)
91
86
  end
92
87
 
93
- # Use rules to filter the +chunks+ to get the most
94
- # apropos CJK word.
95
- def get_cjk_word(chunks)
96
- i = 0
97
- while i < @rules.length
98
- break if chunks.length < 2
99
- chunks = @rules[i].filter(chunks)
100
- i += 1
101
- end
102
-
103
- if chunks.length > 1
104
- if Config.on_ambiguity == :raise_exception
105
- raise Ambiguity, "Can't solve ambiguity on #{chunks}"
106
- end
107
- end
108
-
109
- word = chunks[0][0]
110
- token = Token.new(word.text, @byte_index, @byte_index+word.byte_size)
111
-
112
- @index += word.length
113
- @byte_index += word.byte_size
114
-
115
- return token
116
- end
117
-
118
88
  # Find all words occurring in the dictionary starting from
119
89
  # +index+ . The maximum word length is determined by
120
90
  # +Config.max_word_length+ .
121
- def find_match_words(chars, index)
91
+ def find_match_words(index)
122
92
  for i, w in @match_cache
123
93
  if i == index
124
94
  return w
@@ -131,11 +101,11 @@ module RMMSeg
131
101
  words = Array.new
132
102
  i = index
133
103
 
134
- while i < chars.length &&
135
- !basic_latin?(chars[i]) &&
104
+ while i < @chars.length &&
105
+ !basic_latin?(@chars[i]) &&
136
106
  strlen < Config.max_word_length
137
107
 
138
- str << chars[i]
108
+ str << @chars[i]
139
109
  strlen += 1
140
110
 
141
111
  if dic.has_word?(str)
@@ -145,7 +115,7 @@ module RMMSeg
145
115
  end
146
116
 
147
117
  if words.empty?
148
- words << Word.new(chars[index], Word::TYPES[:unrecognized])
118
+ words << Word.new(@chars[index], Word::TYPES[:unrecognized])
149
119
  end
150
120
 
151
121
  @match_cache[@match_cache_idx] = [index, words]
@@ -13,24 +13,54 @@ module RMMSeg
13
13
  def initialize(text)
14
14
  super
15
15
  @rules = [
16
- MMRule.new,
17
- LAWLRule.new,
18
- SVWLRule.new,
19
- LSDMFOCWRule.new
16
+ MMRule,
17
+ LAWLRule,
18
+ SVWLRule,
19
+ LSDMFOCWRule
20
20
  ]
21
21
  end
22
22
 
23
+ # Get the most proper CJK word.
24
+ def get_cjk_word
25
+ get_cjk_word_from_chunks(create_chunks)
26
+ end
27
+
28
+ # Use rules to filter the +chunks+ to get the most
29
+ # apropos CJK word.
30
+ def get_cjk_word_from_chunks(chunks)
31
+ i = 0
32
+ while i < @rules.length
33
+ break if chunks.length < 2
34
+ chunks = @rules[i].filter(chunks)
35
+ i += 1
36
+ end
37
+
38
+ if chunks.length > 1
39
+ if Config.on_ambiguity == :raise_exception
40
+ raise Ambiguity, "Can't solve ambiguity on #{chunks}"
41
+ end
42
+ end
43
+
44
+ word = chunks[0][0]
45
+ token = Token.new(word.text, @byte_index, @byte_index+word.byte_size)
46
+
47
+ @index += word.length
48
+ @byte_index += word.byte_size
49
+
50
+ return token
51
+ end
52
+
23
53
  # Create all possible three-word (or less) chunks
24
54
  # starting from +@index+ .
25
55
  def create_chunks
26
56
  chunks = Array.new
27
- for w0 in find_match_words(@chars, @index)
57
+ for w0 in find_match_words(@index)
28
58
  index0 = @index + w0.length
29
59
  if index0 < @chars.length
30
- for w1 in find_match_words(@chars, index0)
60
+ for w1 in find_match_words(index0)
31
61
  index1 = index0 + w1.length
32
62
  if index1 < @chars.length
33
- for w2 in find_match_words(@chars, index1)
63
+ for w2 in find_match_words(index1)
34
64
  if w2.type == Word::TYPES[:unrecognized]
35
65
  chunks << [w0, w1]
36
66
  else
@@ -18,6 +18,16 @@ module RMMSeg
18
18
  @dic.has_key?(value)
19
19
  end
20
20
 
21
+ # Store a new word to dictionary.
22
+ # +w+ may be:
23
+ # * an instance of Word.
24
+ # * +true+, in which case this is a normal word.
25
+ # * a String(which can be converted to a Number) or Number.
26
+ # The number is the frequency of the word.
27
+ def store_word(key, w=true)
28
+ @dic[key] = w
29
+ end
30
+
21
31
  # Get an instance of Word corresponding to +value+ .
22
32
  def get_word(value)
23
33
  word = @dic[value]
@@ -3,7 +3,7 @@ require 'rmmseg/rule_helper'
3
3
  module RMMSeg
4
4
  # Largest average word length rule.
5
5
  class LAWLRule
6
- def filter(chunks)
6
+ def self.filter(chunks)
7
7
  chunks.take_highest { |a, b|
8
8
  Chunk::average_length(a) <=> Chunk::average_length(b)
9
9
  }
@@ -4,7 +4,7 @@ module RMMSeg
4
4
  # Largest sum of degree of morphemic freedom of one-character
5
5
  # words rule.
6
6
  class LSDMFOCWRule
7
- def filter(chunks)
7
+ def self.filter(chunks)
8
8
  chunks.take_highest { |a, b|
9
9
  Chunk::degree_of_morphemic_freedom(a) <=> Chunk::degree_of_morphemic_freedom(b)
10
10
  }
@@ -4,7 +4,7 @@ module RMMSeg
4
4
  # Maximum matching rule, select the chunks with the
5
5
  # maximum length.
6
6
  class MMRule
7
- def filter(chunks)
7
+ def self.filter(chunks)
8
8
  chunks.take_highest { |a, b|
9
9
  Chunk::total_length(a) <=> Chunk::total_length(b)
10
10
  }
@@ -9,14 +9,17 @@ module RMMSeg
9
9
  # algorithm is MMRule .
10
10
  def initialize(text)
11
11
  super
12
- @rules = [ MMRule.new ]
13
12
  end
14
13
 
15
- # Create all possible one-word chunks starting from +@index+ .
16
- def create_chunks
17
- find_match_words(@chars, @index).map { |word|
18
- [word]
19
- }
14
+ # Get the most proper CJK word.
15
+ def get_cjk_word
16
+ word = find_match_words(@index).last
17
+ token = Token.new(word.text, @byte_index, @byte_index+word.byte_size)
18
+
19
+ @index += word.length
20
+ @byte_index += word.byte_size
21
+
22
+ return token
20
23
  end
21
24
  end
22
25
  end
@@ -3,7 +3,7 @@ require 'rmmseg/rule_helper'
3
3
  module RMMSeg
4
4
  # Smallest variance of word length rule.
5
5
  class SVWLRule
6
- def filter(chunks)
6
+ def self.filter(chunks)
7
7
  chunks.take_highest { |a, b|
8
8
  Chunk::variance(b) <=> Chunk::variance(a)
9
9
  }
data/lib/rmmseg.rb CHANGED
@@ -6,7 +6,7 @@ require 'rmmseg/simple_algorithm'
6
6
  require 'rmmseg/complex_algorithm'
7
7
 
8
8
  module RMMSeg
9
- VERSION = '0.1.3'
9
+ VERSION = '0.1.4'
10
10
 
11
11
  # Segment +text+ using the algorithm configured.
12
12
  def segment(text)
@@ -8,7 +8,7 @@ describe "largest average word length rule" do
8
8
  gen_words(["国际", "化"]),
9
9
  gen_words(["国", "际", "化"])
10
10
  ]
11
- chunks = RMMSeg::LAWLRule.new.filter(chunks)
11
+ chunks = RMMSeg::LAWLRule.filter(chunks)
12
12
  chunks.length.should == 1
13
13
  chunks[0][0].text.should == "国际化"
14
14
  end
@@ -7,7 +7,7 @@ describe "largest sum of degree of morphemic freedom of one-character words rule
7
7
  gen_words(["主要", "是", "因为"], [nil, 100, nil]),
8
8
  gen_words(["主", "要是", "因为"], [10, nil, nil])
9
9
  ]
10
- chunks = RMMSeg::LSDMFOCWRule.new.filter(chunks)
10
+ chunks = RMMSeg::LSDMFOCWRule.filter(chunks)
11
11
  chunks.length.should == 1
12
12
  chunks[0][0].text.should == "主要"
13
13
  end
data/spec/mm_rule_spec.rb CHANGED
@@ -9,7 +9,7 @@ describe 'maximum matching rule' do
9
9
  gen_words(["眼看", "就要", "来"]),
10
10
  gen_words(["眼", "看", "就"])
11
11
  ]
12
- chunks = RMMSeg::MMRule.new.filter(chunks)
12
+ chunks = RMMSeg::MMRule.filter(chunks)
13
13
  chunks.length.should == 2
14
14
  end
15
15
  end
@@ -7,7 +7,7 @@ describe "smallest variance of word length rule" do
7
7
  gen_words(["研究", "生命", "起源"]),
8
8
  gen_words(["研究生", "命", "起源"])
9
9
  ]
10
- chunks = RMMSeg::SVWLRule.new.filter(chunks)
10
+ chunks = RMMSeg::SVWLRule.filter(chunks)
11
11
  chunks.length.should == 1
12
12
  chunks[0][0].text.should == "研究"
13
13
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rmmseg
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.1.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - pluskid
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2008-02-27 00:00:00 -08:00
12
+ date: 2008-03-02 00:00:00 +00:00
13
13
  default_executable:
14
14
  dependencies: []
15
15