rmmseg 0.1.3 → 0.1.4
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +5 -0
- data/TODO.txt +0 -3
- data/lib/rmmseg/algorithm.rb +7 -37
- data/lib/rmmseg/complex_algorithm.rb +37 -7
- data/lib/rmmseg/dictionary.rb +10 -0
- data/lib/rmmseg/lawl_rule.rb +1 -1
- data/lib/rmmseg/lsdmfocw_rule.rb +1 -1
- data/lib/rmmseg/mm_rule.rb +1 -1
- data/lib/rmmseg/simple_algorithm.rb +9 -6
- data/lib/rmmseg/svwl_rule.rb +1 -1
- data/lib/rmmseg.rb +1 -1
- data/spec/lawl_rule_spec.rb +1 -1
- data/spec/lsdmfocw_rule_spec.rb +1 -1
- data/spec/mm_rule_spec.rb +1 -1
- data/spec/svwl_rule_spec.rb +1 -1
- metadata +2 -2
data/History.txt
CHANGED
data/TODO.txt
CHANGED
data/lib/rmmseg/algorithm.rb
CHANGED
@@ -26,15 +26,10 @@ module RMMSeg
|
|
26
26
|
def next_token
|
27
27
|
return nil if @index >= @chars.length
|
28
28
|
|
29
|
-
|
30
|
-
orig_index = @index
|
31
|
-
token = nil
|
32
|
-
len = 0
|
33
|
-
|
34
|
-
if basic_latin?(current)
|
29
|
+
if basic_latin?(@chars[@index])
|
35
30
|
token = get_basic_latin_word
|
36
31
|
else
|
37
|
-
token = get_cjk_word
|
32
|
+
token = get_cjk_word
|
38
33
|
end
|
39
34
|
|
40
35
|
if token.empty?
|
@@ -90,35 +85,10 @@ module RMMSeg
|
|
90
85
|
return Token.new(@text[start_pos...end_pos], start_pos, end_pos)
|
91
86
|
end
|
92
87
|
|
93
|
-
# Use rules to filter the +chunks+ to get the most
|
94
|
-
# apropos CJK word.
|
95
|
-
def get_cjk_word(chunks)
|
96
|
-
i = 0
|
97
|
-
while i < @rules.length
|
98
|
-
break if chunks.length < 2
|
99
|
-
chunks = @rules[i].filter(chunks)
|
100
|
-
i += 1
|
101
|
-
end
|
102
|
-
|
103
|
-
if chunks.length > 1
|
104
|
-
if Config.on_ambiguity == :raise_exception
|
105
|
-
raise Ambiguity, "Can't solve ambiguity on #{chunks}"
|
106
|
-
end
|
107
|
-
end
|
108
|
-
|
109
|
-
word = chunks[0][0]
|
110
|
-
token = Token.new(word.text, @byte_index, @byte_index+word.byte_size)
|
111
|
-
|
112
|
-
@index += word.length
|
113
|
-
@byte_index += word.byte_size
|
114
|
-
|
115
|
-
return token
|
116
|
-
end
|
117
|
-
|
118
88
|
# Find all words occuring in the dictionary starting from
|
119
89
|
# +index+ . The maximum word length is determined by
|
120
90
|
# +Config.max_word_length+ .
|
121
|
-
def find_match_words(chars, index)
|
91
|
+
def find_match_words(index)
|
122
92
|
for i, w in @match_cache
|
123
93
|
if i == index
|
124
94
|
return w
|
@@ -131,11 +101,11 @@ module RMMSeg
|
|
131
101
|
words = Array.new
|
132
102
|
i = index
|
133
103
|
|
134
|
-
while i < chars.length &&
|
135
|
-
!basic_latin?(chars[i]) &&
|
104
|
+
while i < @chars.length &&
|
105
|
+
!basic_latin?(@chars[i]) &&
|
136
106
|
strlen < Config.max_word_length
|
137
107
|
|
138
|
-
str << chars[i]
|
108
|
+
str << @chars[i]
|
139
109
|
strlen += 1
|
140
110
|
|
141
111
|
if dic.has_word?(str)
|
@@ -145,7 +115,7 @@ module RMMSeg
|
|
145
115
|
end
|
146
116
|
|
147
117
|
if words.empty?
|
148
|
-
words << Word.new(chars[index], Word::TYPES[:unrecognized])
|
118
|
+
words << Word.new(@chars[index], Word::TYPES[:unrecognized])
|
149
119
|
end
|
150
120
|
|
151
121
|
@match_cache[@match_cache_idx] = [index, words]
|
@@ -13,24 +13,54 @@ module RMMSeg
|
|
13
13
|
def initialize(text)
|
14
14
|
super
|
15
15
|
@rules = [
|
16
|
-
MMRule
|
17
|
-
LAWLRule
|
18
|
-
SVWLRule
|
19
|
-
LSDMFOCWRule
|
16
|
+
MMRule,
|
17
|
+
LAWLRule,
|
18
|
+
SVWLRule,
|
19
|
+
LSDMFOCWRule
|
20
20
|
]
|
21
21
|
end
|
22
22
|
|
23
|
+
# Get the most proper CJK word.
|
24
|
+
def get_cjk_word
|
25
|
+
get_cjk_word_from_chunks(create_chunks)
|
26
|
+
end
|
27
|
+
|
28
|
+
# Use rules to filter the +chunks+ to get the most
|
29
|
+
# apropos CJK word.
|
30
|
+
def get_cjk_word_from_chunks(chunks)
|
31
|
+
i = 0
|
32
|
+
while i < @rules.length
|
33
|
+
break if chunks.length < 2
|
34
|
+
chunks = @rules[i].filter(chunks)
|
35
|
+
i += 1
|
36
|
+
end
|
37
|
+
|
38
|
+
if chunks.length > 1
|
39
|
+
if Config.on_ambiguity == :raise_exception
|
40
|
+
raise Ambiguity, "Can't solve ambiguity on #{chunks}"
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
44
|
+
word = chunks[0][0]
|
45
|
+
token = Token.new(word.text, @byte_index, @byte_index+word.byte_size)
|
46
|
+
|
47
|
+
@index += word.length
|
48
|
+
@byte_index += word.byte_size
|
49
|
+
|
50
|
+
return token
|
51
|
+
end
|
52
|
+
|
23
53
|
# Create all possible three-word (or less) chunks
|
24
54
|
# starting from +@index+ .
|
25
55
|
def create_chunks
|
26
56
|
chunks = Array.new
|
27
|
-
for w0 in find_match_words(@chars, @index)
|
57
|
+
for w0 in find_match_words(@index)
|
28
58
|
index0 = @index + w0.length
|
29
59
|
if index0 < @chars.length
|
30
|
-
for w1 in find_match_words(@chars, index0)
|
60
|
+
for w1 in find_match_words(index0)
|
31
61
|
index1 = index0 + w1.length
|
32
62
|
if index1 < @chars.length
|
33
|
-
for w2 in find_match_words(@chars, index1)
|
63
|
+
for w2 in find_match_words(index1)
|
34
64
|
if w2.type == Word::TYPES[:unrecognized]
|
35
65
|
chunks << [w0, w1]
|
36
66
|
else
|
data/lib/rmmseg/dictionary.rb
CHANGED
@@ -18,6 +18,16 @@ module RMMSeg
|
|
18
18
|
@dic.has_key?(value)
|
19
19
|
end
|
20
20
|
|
21
|
+
# Store a new word to dictionary.
|
22
|
+
# +w+ may be:
|
23
|
+
# * an instance of Word.
|
24
|
+
# * +true+, then this is a normal world.
|
25
|
+
# * a String(which can be converted to a Number) or Number.
|
26
|
+
# The number is the frequency of the word.
|
27
|
+
def store_word(key, w=true)
|
28
|
+
@dic[key] = w
|
29
|
+
end
|
30
|
+
|
21
31
|
# Get an instance of Word corresponding to +value+ .
|
22
32
|
def get_word(value)
|
23
33
|
word = @dic[value]
|
data/lib/rmmseg/lawl_rule.rb
CHANGED
data/lib/rmmseg/lsdmfocw_rule.rb
CHANGED
@@ -4,7 +4,7 @@ module RMMSeg
|
|
4
4
|
# Largest sum of degree of morphemic freedom of one-character
|
5
5
|
# words rule.
|
6
6
|
class LSDMFOCWRule
|
7
|
-
def filter(chunks)
|
7
|
+
def self.filter(chunks)
|
8
8
|
chunks.take_highest { |a, b|
|
9
9
|
Chunk::degree_of_morphemic_freedom(a) <=> Chunk::degree_of_morphemic_freedom(b)
|
10
10
|
}
|
data/lib/rmmseg/mm_rule.rb
CHANGED
@@ -9,14 +9,17 @@ module RMMSeg
|
|
9
9
|
# algorithm is MMRule .
|
10
10
|
def initialize(text)
|
11
11
|
super
|
12
|
-
@rules = [ MMRule.new ]
|
13
12
|
end
|
14
13
|
|
15
|
-
#
|
16
|
-
def
|
17
|
-
find_match_words(@
|
18
|
-
|
19
|
-
|
14
|
+
# Get the most proper CJK word.
|
15
|
+
def get_cjk_word
|
16
|
+
word = find_match_words(@index).last
|
17
|
+
token = Token.new(word.text, @byte_index, @byte_index+word.byte_size)
|
18
|
+
|
19
|
+
@index += word.length
|
20
|
+
@byte_index += word.byte_size
|
21
|
+
|
22
|
+
return token
|
20
23
|
end
|
21
24
|
end
|
22
25
|
end
|
data/lib/rmmseg/svwl_rule.rb
CHANGED
data/lib/rmmseg.rb
CHANGED
data/spec/lawl_rule_spec.rb
CHANGED
@@ -8,7 +8,7 @@ describe "largest average word length rule" do
|
|
8
8
|
gen_words(["国际", "化"]),
|
9
9
|
gen_words(["国", "际", "化"])
|
10
10
|
]
|
11
|
-
chunks = RMMSeg::LAWLRule.new.filter(chunks)
|
11
|
+
chunks = RMMSeg::LAWLRule.filter(chunks)
|
12
12
|
chunks.length.should == 1
|
13
13
|
chunks[0][0].text.should == "国际化"
|
14
14
|
end
|
data/spec/lsdmfocw_rule_spec.rb
CHANGED
@@ -7,7 +7,7 @@ describe "largest sum of degree of morphemic freedom of one-character words rule
|
|
7
7
|
gen_words(["主要", "是", "因为"], [nil, 100, nil]),
|
8
8
|
gen_words(["主", "要是", "因为"], [10, nil, nil])
|
9
9
|
]
|
10
|
-
chunks = RMMSeg::LSDMFOCWRule.new.filter(chunks)
|
10
|
+
chunks = RMMSeg::LSDMFOCWRule.filter(chunks)
|
11
11
|
chunks.length.should == 1
|
12
12
|
chunks[0][0].text.should == "主要"
|
13
13
|
end
|
data/spec/mm_rule_spec.rb
CHANGED
data/spec/svwl_rule_spec.rb
CHANGED
@@ -7,7 +7,7 @@ describe "smallest variance of word length rule" do
|
|
7
7
|
gen_words(["研究", "生命", "起源"]),
|
8
8
|
gen_words(["研究生", "命", "起源"])
|
9
9
|
]
|
10
|
-
chunks = RMMSeg::SVWLRule.new.filter(chunks)
|
10
|
+
chunks = RMMSeg::SVWLRule.filter(chunks)
|
11
11
|
chunks.length.should == 1
|
12
12
|
chunks[0][0].text.should == "研究"
|
13
13
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rmmseg
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.3
|
4
|
+
version: 0.1.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- pluskid
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2008-02
|
12
|
+
date: 2008-03-02 00:00:00 +00:00
|
13
13
|
default_executable:
|
14
14
|
dependencies: []
|
15
15
|
|