rmmseg 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +5 -0
- data/TODO.txt +0 -3
- data/lib/rmmseg/algorithm.rb +7 -37
- data/lib/rmmseg/complex_algorithm.rb +37 -7
- data/lib/rmmseg/dictionary.rb +10 -0
- data/lib/rmmseg/lawl_rule.rb +1 -1
- data/lib/rmmseg/lsdmfocw_rule.rb +1 -1
- data/lib/rmmseg/mm_rule.rb +1 -1
- data/lib/rmmseg/simple_algorithm.rb +9 -6
- data/lib/rmmseg/svwl_rule.rb +1 -1
- data/lib/rmmseg.rb +1 -1
- data/spec/lawl_rule_spec.rb +1 -1
- data/spec/lsdmfocw_rule_spec.rb +1 -1
- data/spec/mm_rule_spec.rb +1 -1
- data/spec/svwl_rule_spec.rb +1 -1
- metadata +2 -2
data/History.txt
CHANGED
data/TODO.txt
CHANGED
data/lib/rmmseg/algorithm.rb
CHANGED
@@ -26,15 +26,10 @@ module RMMSeg
|
|
26
26
|
def next_token
|
27
27
|
return nil if @index >= @chars.length
|
28
28
|
|
29
|
-
|
30
|
-
orig_index = @index
|
31
|
-
token = nil
|
32
|
-
len = 0
|
33
|
-
|
34
|
-
if basic_latin?(current)
|
29
|
+
if basic_latin?(@chars[@index])
|
35
30
|
token = get_basic_latin_word
|
36
31
|
else
|
37
|
-
token = get_cjk_word
|
32
|
+
token = get_cjk_word
|
38
33
|
end
|
39
34
|
|
40
35
|
if token.empty?
|
@@ -90,35 +85,10 @@ module RMMSeg
|
|
90
85
|
return Token.new(@text[start_pos...end_pos], start_pos, end_pos)
|
91
86
|
end
|
92
87
|
|
93
|
-
# Use rules to filter the +chunks+ to get the most
|
94
|
-
# apropos CJK word.
|
95
|
-
def get_cjk_word(chunks)
|
96
|
-
i = 0
|
97
|
-
while i < @rules.length
|
98
|
-
break if chunks.length < 2
|
99
|
-
chunks = @rules[i].filter(chunks)
|
100
|
-
i += 1
|
101
|
-
end
|
102
|
-
|
103
|
-
if chunks.length > 1
|
104
|
-
if Config.on_ambiguity == :raise_exception
|
105
|
-
raise Ambiguity, "Can't solve ambiguity on #{chunks}"
|
106
|
-
end
|
107
|
-
end
|
108
|
-
|
109
|
-
word = chunks[0][0]
|
110
|
-
token = Token.new(word.text, @byte_index, @byte_index+word.byte_size)
|
111
|
-
|
112
|
-
@index += word.length
|
113
|
-
@byte_index += word.byte_size
|
114
|
-
|
115
|
-
return token
|
116
|
-
end
|
117
|
-
|
118
88
|
# Find all words occurring in the dictionary starting from
|
119
89
|
# +index+ . The maximum word length is determined by
|
120
90
|
# +Config.max_word_length+ .
|
121
|
-
def find_match_words(
|
91
|
+
def find_match_words(index)
|
122
92
|
for i, w in @match_cache
|
123
93
|
if i == index
|
124
94
|
return w
|
@@ -131,11 +101,11 @@ module RMMSeg
|
|
131
101
|
words = Array.new
|
132
102
|
i = index
|
133
103
|
|
134
|
-
while i < chars.length &&
|
135
|
-
!basic_latin?(chars[i]) &&
|
104
|
+
while i < @chars.length &&
|
105
|
+
!basic_latin?(@chars[i]) &&
|
136
106
|
strlen < Config.max_word_length
|
137
107
|
|
138
|
-
str << chars[i]
|
108
|
+
str << @chars[i]
|
139
109
|
strlen += 1
|
140
110
|
|
141
111
|
if dic.has_word?(str)
|
@@ -145,7 +115,7 @@ module RMMSeg
|
|
145
115
|
end
|
146
116
|
|
147
117
|
if words.empty?
|
148
|
-
words << Word.new(chars[index], Word::TYPES[:unrecognized])
|
118
|
+
words << Word.new(@chars[index], Word::TYPES[:unrecognized])
|
149
119
|
end
|
150
120
|
|
151
121
|
@match_cache[@match_cache_idx] = [index, words]
|
@@ -13,24 +13,54 @@ module RMMSeg
|
|
13
13
|
def initialize(text)
|
14
14
|
super
|
15
15
|
@rules = [
|
16
|
-
MMRule
|
17
|
-
LAWLRule
|
18
|
-
SVWLRule
|
19
|
-
LSDMFOCWRule
|
16
|
+
MMRule,
|
17
|
+
LAWLRule,
|
18
|
+
SVWLRule,
|
19
|
+
LSDMFOCWRule
|
20
20
|
]
|
21
21
|
end
|
22
22
|
|
23
|
+
# Get the most proper CJK word.
|
24
|
+
def get_cjk_word
|
25
|
+
get_cjk_word_from_chunks(create_chunks)
|
26
|
+
end
|
27
|
+
|
28
|
+
# Use rules to filter the +chunks+ to get the most
|
29
|
+
# apropos CJK word.
|
30
|
+
def get_cjk_word_from_chunks(chunks)
|
31
|
+
i = 0
|
32
|
+
while i < @rules.length
|
33
|
+
break if chunks.length < 2
|
34
|
+
chunks = @rules[i].filter(chunks)
|
35
|
+
i += 1
|
36
|
+
end
|
37
|
+
|
38
|
+
if chunks.length > 1
|
39
|
+
if Config.on_ambiguity == :raise_exception
|
40
|
+
raise Ambiguity, "Can't solve ambiguity on #{chunks}"
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
44
|
+
word = chunks[0][0]
|
45
|
+
token = Token.new(word.text, @byte_index, @byte_index+word.byte_size)
|
46
|
+
|
47
|
+
@index += word.length
|
48
|
+
@byte_index += word.byte_size
|
49
|
+
|
50
|
+
return token
|
51
|
+
end
|
52
|
+
|
23
53
|
# Create all possible three-word (or less) chunks
|
24
54
|
# starting from +@index+ .
|
25
55
|
def create_chunks
|
26
56
|
chunks = Array.new
|
27
|
-
for w0 in find_match_words(@
|
57
|
+
for w0 in find_match_words(@index)
|
28
58
|
index0 = @index + w0.length
|
29
59
|
if index0 < @chars.length
|
30
|
-
for w1 in find_match_words(
|
60
|
+
for w1 in find_match_words(index0)
|
31
61
|
index1 = index0 + w1.length
|
32
62
|
if index1 < @chars.length
|
33
|
-
for w2 in find_match_words(
|
63
|
+
for w2 in find_match_words(index1)
|
34
64
|
if w2.type == Word::TYPES[:unrecognized]
|
35
65
|
chunks << [w0, w1]
|
36
66
|
else
|
data/lib/rmmseg/dictionary.rb
CHANGED
@@ -18,6 +18,16 @@ module RMMSeg
|
|
18
18
|
@dic.has_key?(value)
|
19
19
|
end
|
20
20
|
|
21
|
+
# Store a new word to dictionary.
|
22
|
+
# +w+ may be:
|
23
|
+
# * an instance of Word.
|
24
|
+
# * +true+, then this is a normal word.
|
25
|
+
# * a String(which can be converted to a Number) or Number.
|
26
|
+
# The number is the frequency of the word.
|
27
|
+
def store_word(key, w=true)
|
28
|
+
@dic[key] = w
|
29
|
+
end
|
30
|
+
|
21
31
|
# Get an instance of Word corresponding to +value+ .
|
22
32
|
def get_word(value)
|
23
33
|
word = @dic[value]
|
data/lib/rmmseg/lawl_rule.rb
CHANGED
data/lib/rmmseg/lsdmfocw_rule.rb
CHANGED
@@ -4,7 +4,7 @@ module RMMSeg
|
|
4
4
|
# Largest sum of degree of morphemic freedom of one-character
|
5
5
|
# words rule.
|
6
6
|
class LSDMFOCWRule
|
7
|
-
def filter(chunks)
|
7
|
+
def self.filter(chunks)
|
8
8
|
chunks.take_highest { |a, b|
|
9
9
|
Chunk::degree_of_morphemic_freedom(a) <=> Chunk::degree_of_morphemic_freedom(b)
|
10
10
|
}
|
data/lib/rmmseg/mm_rule.rb
CHANGED
@@ -9,14 +9,17 @@ module RMMSeg
|
|
9
9
|
# algorithm is MMRule .
|
10
10
|
def initialize(text)
|
11
11
|
super
|
12
|
-
@rules = [ MMRule.new ]
|
13
12
|
end
|
14
13
|
|
15
|
-
#
|
16
|
-
def
|
17
|
-
find_match_words(@
|
18
|
-
|
19
|
-
|
14
|
+
# Get the most proper CJK word.
|
15
|
+
def get_cjk_word
|
16
|
+
word = find_match_words(@index).last
|
17
|
+
token = Token.new(word.text, @byte_index, @byte_index+word.byte_size)
|
18
|
+
|
19
|
+
@index += word.length
|
20
|
+
@byte_index += word.byte_size
|
21
|
+
|
22
|
+
return token
|
20
23
|
end
|
21
24
|
end
|
22
25
|
end
|
data/lib/rmmseg/svwl_rule.rb
CHANGED
data/lib/rmmseg.rb
CHANGED
data/spec/lawl_rule_spec.rb
CHANGED
@@ -8,7 +8,7 @@ describe "largest average word length rule" do
|
|
8
8
|
gen_words(["国际", "化"]),
|
9
9
|
gen_words(["国", "际", "化"])
|
10
10
|
]
|
11
|
-
chunks = RMMSeg::LAWLRule.
|
11
|
+
chunks = RMMSeg::LAWLRule.filter(chunks)
|
12
12
|
chunks.length.should == 1
|
13
13
|
chunks[0][0].text.should == "国际化"
|
14
14
|
end
|
data/spec/lsdmfocw_rule_spec.rb
CHANGED
@@ -7,7 +7,7 @@ describe "largest sum of degree of morphemic freedom of one-character words rule
|
|
7
7
|
gen_words(["主要", "是", "因为"], [nil, 100, nil]),
|
8
8
|
gen_words(["主", "要是", "因为"], [10, nil, nil])
|
9
9
|
]
|
10
|
-
chunks = RMMSeg::LSDMFOCWRule.
|
10
|
+
chunks = RMMSeg::LSDMFOCWRule.filter(chunks)
|
11
11
|
chunks.length.should == 1
|
12
12
|
chunks[0][0].text.should == "主要"
|
13
13
|
end
|
data/spec/mm_rule_spec.rb
CHANGED
data/spec/svwl_rule_spec.rb
CHANGED
@@ -7,7 +7,7 @@ describe "smallest variance of word length rule" do
|
|
7
7
|
gen_words(["研究", "生命", "起源"]),
|
8
8
|
gen_words(["研究生", "命", "起源"])
|
9
9
|
]
|
10
|
-
chunks = RMMSeg::SVWLRule.
|
10
|
+
chunks = RMMSeg::SVWLRule.filter(chunks)
|
11
11
|
chunks.length.should == 1
|
12
12
|
chunks[0][0].text.should == "研究"
|
13
13
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rmmseg
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.3
|
4
|
+
version: 0.1.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- pluskid
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2008-02
|
12
|
+
date: 2008-03-02 00:00:00 +00:00
|
13
13
|
default_executable:
|
14
14
|
dependencies: []
|
15
15
|
|