migemo-lib 0.4.3
Sign up to get free protection for your applications and to get access to all the features.
- data/README +6 -0
- data/data/migemo-dict +166746 -0
- data/data/migemo-dict.cache +0 -0
- data/data/migemo-dict.cache.idx +0 -0
- data/data/migemo-dict.idx +0 -0
- data/doc/migemo.ja.rd +66 -0
- data/lib/migemo-dict.rb +126 -0
- data/lib/migemo-regex.rb +340 -0
- data/lib/migemo.rb +183 -0
- data/lib/migemo/core_ext/string.rb +60 -0
- data/lib/migemo/version.rb +8 -0
- data/test/cache_test.rb +28 -0
- data/test/charclass_test.rb +15 -0
- data/test/convert_test.rb +29 -0
- data/test/emacs_type_test.rb +10 -0
- data/test/insertion_test.rb +16 -0
- data/test/migemo_test.rb +50 -0
- data/test/regex_dict_test.rb +24 -0
- data/test/regex_test.rb +16 -0
- data/test/symbols_test.rb +16 -0
- data/test/test_helper.rb +22 -0
- data/test/user_dict_test.rb +22 -0
- metadata +102 -0
Binary file
|
Binary file
|
Binary file
|
data/doc/migemo.ja.rd
ADDED
@@ -0,0 +1,66 @@
|
|
1
|
+
=begin
|
2
|
+
= Ruby/Migemo: ローマ字のまま日本語をインクリメンタル検索する Ruby用のライブラリ
|
3
|
+
Ruby/Migemo はローマ字のまま日本語をインクリメンタル検索する
|
4
|
+
ためのライブラリです。
|
5
|
+
|
6
|
+
=== 使用例
|
7
|
+
|
8
|
+
% cat sample.rb
|
9
|
+
require 'migemo'
|
10
|
+
|
11
|
+
dict = MigemoStaticDict.new("migemo-dict")
|
12
|
+
dict_cache = MigemoDictCache.new("migemo-dict" + ".cache")
|
13
|
+
user_dict = MigemoUserDict.new("user-dict")
|
14
|
+
|
15
|
+
while line = gets
|
16
|
+
pattern = line.chomp
|
17
|
+
migemo = Migemo.new(pattern,dict)
|
18
|
+
migemo.optimization = 3
|
19
|
+
migemo.dict_cache = dict_cache
|
20
|
+
migemo.user_dict = user_dict
|
21
|
+
migemo.type = "ruby"
|
22
|
+
puts migemo.regex
|
23
|
+
end
|
24
|
+
|
25
|
+
== API
|
26
|
+
|
27
|
+
--- MigemoStaticDict#new(filename)
|
28
|
+
静的な辞書のオブジェクトを生成する
|
29
|
+
|
30
|
+
--- MigemoDictCache#new(filename)
|
31
|
+
静的な辞書のキャッシュのオブジェクトを生成する
|
32
|
+
|
33
|
+
--- MigemoUserDict#new(filename)
|
34
|
+
ユーザ辞書のオブジェクトを生成する
|
35
|
+
|
36
|
+
--- MigemoRegexDict#new(filename)
|
37
|
+
正規表現辞書のオブジェクトを生成する
|
38
|
+
|
39
|
+
--- Migemo#new(pattern, dict)
|
40
|
+
Migemoオブジェクトを生成する。dict には
|
41
|
+
MigemoStaticDict オブジェクトかStringを、pattern には検索パター
|
42
|
+
ンを与える
|
43
|
+
|
44
|
+
--- Migemo#regex
|
45
|
+
検索パターンを展開した正規表現の文字列を返す。
|
46
|
+
|
47
|
+
--- Migemo#type
|
48
|
+
正規表現の種類 (emacs, ruby, perl, egrep) を設定する accessor。[ruby]
|
49
|
+
|
50
|
+
--- Migemo#dict_cache
|
51
|
+
静的辞書のキャッシュを設定する accessor。
|
52
|
+
|
53
|
+
--- Migemo#user_dict
|
54
|
+
ユーザ辞書のオブジェクトを設定する accessor。
|
55
|
+
|
56
|
+
--- Migemo#regex_dict
|
57
|
+
正規表現辞書のオブジェクトを設定する accessor。
|
58
|
+
|
59
|
+
--- Migemo#insertion
|
60
|
+
1文字ごとに挟む文字列を設定する accessor。
|
61
|
+
|
62
|
+
--- Migemo#optimization
|
63
|
+
正規表現のコンパクト化のレベル (0-3) を設定する accessor。[3]
|
64
|
+
|
65
|
+
satoru@namazu.org
|
66
|
+
=end
|
data/lib/migemo-dict.rb
ADDED
@@ -0,0 +1,126 @@
|
|
1
|
+
#
|
2
|
+
# Ruby/Migemo - a library for Japanese incremental search.
|
3
|
+
#
|
4
|
+
# Copyright (C) 2001 Satoru Takabayashi <satoru@namazu.org>
|
5
|
+
# All rights reserved.
|
6
|
+
# This is free software with ABSOLUTELY NO WARRANTY.
|
7
|
+
#
|
8
|
+
# You can redistribute it and/or modify it under the terms of
|
9
|
+
# the GNU General Public License version 2.
|
10
|
+
|
11
|
+
require 'bsearch'
|
12
|
+
require 'migemo/core_ext/string'
|
13
|
+
|
14
|
+
# One dictionary entry: a lookup key together with its values.
class MigemoDictItem
  attr_reader :key, :values

  # key::    the entry's key; must not be nil.
  # values:: the entry's values; must not be nil.
  #
  # Raises a bare RuntimeError when either argument is nil.
  def initialize(key, values)
    @key, @values = key, values
    raise if @key.nil?
    raise if @values.nil?
  end
end
|
25
|
+
|
26
|
+
# Base class for file-backed dictionaries.
#
# Opens the dictionary file and keeps the handle in @dict for subclasses,
# which provide the actual search in their own #lookup.
class MigemoDict
  # filename:: path of the dictionary file; it is opened for reading and the
  #            handle stays open for the lifetime of the object.
  def initialize(filename)
    @dict = File.new(filename)
  end

  # Validates +pattern+ and returns nil; the base class performs no search.
  #
  # Raises RuntimeError ("nil pattern") when +pattern+ is nil. The check must
  # come first: the previous code downcased the pattern before testing it, so
  # a nil pattern died with NoMethodError and this error was unreachable.
  def lookup(pattern)
    raise "nil pattern" if pattern.nil?
    nil
  end

  private

  # Splits one tab-separated dictionary line into [key, values]: the first
  # field is the key, every remaining field is a value.
  #
  # Raises a bare RuntimeError when the line contains no field at all.
  def decompose(line)
    key, *values = line.chomp.split("\t")
    raise if key.nil?
    return key, values
  end
end
|
46
|
+
|
47
|
+
# Dictionary that is searched through a companion index file.
#
# "<filename>.idx" holds packed 32-bit big-endian integers; each one is used
# as a byte offset into the dictionary file (see #get_line).
class MigemoStaticDict < MigemoDict
  # filename:: path of the dictionary; "<filename>.idx" must exist as well.
  def initialize(filename)
    super(filename)
    # The index is binary data, so read it in binary mode. The block form
    # closes the handle as soon as the bytes are read instead of leaking it.
    @index = File.open(filename + ".idx", "rb") { |idx| idx.read }.unpack("N*")
  end

  # Yields a MigemoDictItem for every index position in the range returned
  # by bsearch_range. Returns nil when no range is found.
  #
  # NOTE(review): bsearch_range comes from the 'bsearch' library and
  # String#prefix_match from migemo/core_ext/string; presumably they select
  # the entries whose key starts with +pattern+ -- confirm there.
  def lookup(pattern)
    range = @index.bsearch_range do |offset|
      key, = decompose(get_line(offset))
      key.prefix_match(pattern)
    end
    return nil unless range

    range.each do |i|
      key, values = decompose(get_line(@index[i]))
      yield(MigemoDictItem.new(key, values))
    end
  end

  private

  # Returns the dictionary line that starts at byte offset +index+.
  def get_line(index)
    @dict.seek(index)
    @dict.gets
  end
end
|
72
|
+
|
73
|
+
# Dictionary that is loaded into memory when it is constructed.
class MigemoUserDict < MigemoDict
  # Reads every line of the file, drops the lines that begin with ";" and
  # keeps the remainder sorted in @lines.
  def initialize(filename)
    super
    entries = @dict.readlines.reject { |entry| entry =~ /^;/ }
    @lines = entries.sort
  end

  # Yields a MigemoDictItem for every position in the range returned by
  # bsearch_range over the sorted lines. Returns nil when no range is found.
  #
  # NOTE(review): bsearch_range and String#prefix_match are defined outside
  # this file; presumably they implement a prefix search -- confirm there.
  def lookup(pattern)
    range = @lines.bsearch_range do |entry|
      key, _values = decompose(entry)
      key.prefix_match(pattern)
    end
    return nil unless range

    range.each do |i|
      key, values = decompose(@lines[i])
      yield(MigemoDictItem.new(key, values))
    end
  end
end
|
92
|
+
|
93
|
+
# Regex dictionary; adds nothing to MigemoUserDict and exists as a distinct type.
class MigemoRegexDict < MigemoUserDict
|
94
|
+
end
|
95
|
+
|
96
|
+
# Read-only cache of stored lookup results.
#
# Record layout, as read by #decompose: a 4-byte big-endian key length, the
# key bytes, a 4-byte big-endian data length, then the Marshal-dumped data.
# "<filename>.idx" holds packed 32-bit big-endian integers, each used as the
# byte offset of one record.
class MigemoDictCache
  # filename:: path of the cache file; "<filename>.idx" must exist as well.
  def initialize(filename)
    # Both files are binary, so open them in binary mode. The index is read
    # through the block form so its handle is closed rather than leaked.
    @dict = File.new(filename, "rb")
    @index = File.open(filename + ".idx", "rb") { |idx| idx.read }.unpack("N*")
  end

  # Returns the data stored under the downcased +pattern+, or nil when the
  # search finds no record.
  #
  # Raises a bare RuntimeError when +pattern+ is nil.
  #
  # NOTE(review): bsearch_first comes from the 'bsearch' library; presumably
  # it needs the index to be ordered by key -- confirm against the cache
  # generator.
  def lookup(pattern)
    raise if pattern.nil?

    pattern = pattern.downcase
    idx = @index.bsearch_first do |offset|
      key, = decompose(offset)
      key <=> pattern
    end
    return nil unless idx

    _key, data = decompose(@index[idx])
    data
  end

  private

  # Reads the record at byte offset +idx+ and returns [key, data].
  def decompose(idx)
    @dict.seek(idx)
    keylen = @dict.read(4).unpack("N").first
    key = @dict.read(keylen).unpack("a*").first
    datalen = @dict.read(4).unpack("N").first
    # NOTE(review): Marshal.load can instantiate arbitrary objects; cache
    # files must come from a trusted source.
    data = Marshal.load(@dict.read(datalen))
    return key, data
  end
end
|
data/lib/migemo-regex.rb
ADDED
@@ -0,0 +1,340 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
#
|
3
|
+
# Ruby/Migemo - a library for Japanese incremental search.
|
4
|
+
#
|
5
|
+
# Copyright (C) 2001 Satoru Takabayashi <satoru@namazu.org>
|
6
|
+
# All rights reserved.
|
7
|
+
# This is free software with ABSOLUTELY NO WARRANTY.
|
8
|
+
#
|
9
|
+
# You can redistribute it and/or modify it under the terms of
|
10
|
+
# the GNU General Public License version 2.
|
11
|
+
|
12
|
+
module MigemoRegex
|
13
|
+
# Array of alternatives. The methods below wrap Array's results so that they
# come back as a RegexAlternation (a clone of the receiver with new contents)
# instead of a plain Array.
class RegexAlternation < Array
  def sort
    rewrap(super)
  end

  def uniq
    rewrap(super)
  end

  def map
    rewrap(super { |item| yield(item) })
  end

  # Array#delete_if works in place, so the receiver itself loses the
  # rejected items as well; the return value is a separate clone.
  def delete_if
    rewrap(super { |item| yield(item) })
  end

  def select
    rewrap(super { |item| yield(item) })
  end

  private

  # Returns a clone of the receiver whose contents are replaced by +items+.
  def rewrap(items)
    clone.replace(items)
  end
end
|
34
|
+
|
35
|
+
# Array of parts that are rendered one after another. #map returns a
# RegexConcatnation (a clone of the receiver with the mapped contents)
# instead of a plain Array.
class RegexConcatnation < Array
  def map
    mapped = super { |part| yield(part) }
    clone.replace(mapped)
  end
end
|
40
|
+
|
41
|
+
# Array subclass that tags a list of single characters as a character class; adds no behaviour.
class RegexCharClass < Array
|
42
|
+
end
|
43
|
+
|
44
|
+
# Collects candidate strings and compacts them into a tree of
# RegexAlternation / RegexConcatnation / RegexCharClass nodes.
class RegexCompiler
  attr_reader :regex

  def initialize
    @regex = RegexAlternation.new
  end

  # Appends +item+ unless it is nil, false or the empty string.
  def push(item)
    @regex.push(item) if item && item != ""
  end

  # Returns the collected items without duplicates (does not modify @regex).
  def uniq
    @regex.uniq
  end

  # Applies the optimisation passes up to +level+:
  # 1 = drop words covered by a shorter word, 2 = factor out common leading
  # characters, 3 = merge single characters into a character class.
  def optimize(level)
    @regex = optimize1(@regex) if level >= 1
    @regex = optimize2(@regex) if level >= 2
    @regex = optimize3(@regex) if level >= 3
  end

  private

  # Returns the words sorted: by their EUC-JP bytes when Encoding is
  # available, otherwise with the plain sort.
  def sort_words(words)
    defined?(Encoding) ? words.sort_by { |s| s.encode("EUC-JP") } : words.sort
  end

  # Removes every word that merely extends a word already kept.
  #
  # ["運", "運動", "運転", "日本", "日本語"] => ["運", "日本"]
  # (運|運動|運転|日本|日本語) => (運|日本)
  #
  # The result is a clone of +regex+ with the surviving words, so a
  # RegexAlternation stays a RegexAlternation. The sort_by/select chain alone
  # yields a plain Array, which the renderer rejects as an unexpected type.
  def optimize1(regex)
    prefix = nil
    kept = sort_words(regex).select do |word|
      if prefix && word.start_with?(prefix)
        false # excluded: covered by the shorter word
      else
        prefix = word
        true  # included
      end
    end
    regex.clone.replace(kept)
  end

  # Factors out the common leading character of neighbouring words.
  #
  # (あああ|ああい|ああう)
  # => (あ(あ(あ|い|う)))
  #
  # Relies on String#first / String#rest from migemo/core_ext/string.
  def optimize2(regex)
    pending = sort_words(regex) # a fresh array, consumed with shift below
    optimized = RegexAlternation.new
    until pending.empty?
      head = pending.shift
      initial = head.first
      friends = RegexAlternation.new
      # Gather the following words that share the same leading character.
      while (item = pending.first) && initial == item.first
        friends.push(item.rest)
        pending.shift
      end
      if friends.empty?
        optimized.push(head)
      else
        concat = RegexConcatnation.new
        concat.push(initial)
        friends.unshift(head.rest)
        concat.push(optimize2(friends))
        optimized.push(concat)
      end
    end
    optimized
  end

  # Merges the single-character alternatives into one character class.
  #
  # (あ|い|う|え|お)
  # => [あいうえお]
  def optimize3(regex)
    charclass = RegexCharClass.new
    if regex.instance_of?(RegexAlternation)
      # RegexAlternation#delete_if removes the items from +regex+ itself;
      # the code below depends on that in-place removal.
      regex.delete_if do |x|
        if x.instance_of?(String) && x =~ /^.$/
          charclass.push(x)
          true
        end
      end
    end

    # A lone character goes back as a plain string; two or more become a class.
    if charclass.length == 1
      regex.unshift(charclass.first)
    elsif charclass.length > 1
      regex.unshift(charclass)
    end

    regex.map do |x|
      if x.instance_of?(RegexAlternation) || x.instance_of?(RegexConcatnation)
        optimize3(x)
      else
        x
      end
    end
  end
end
|
140
|
+
|
141
|
+
# Metacharacters used when rendering: the alternation bar and the opening
# and closing group delimiters. Defaults are "|", "(" and ")".
class RegexMetachars
  attr_accessor :bar, :lparen, :rparen

  def initialize
    @bar, @lparen, @rparen = '|', '(', ')'
  end
end
|
151
|
+
|
152
|
+
# Metacharacters for the "egrep" type; uses the RegexMetachars defaults unchanged.
class RegexEgrepMetachars < RegexMetachars
|
153
|
+
end
|
154
|
+
|
155
|
+
# Metacharacters for the "perl" type: groups open with "(?:".
class RegexPerlMetachars < RegexMetachars
  def initialize
    @bar, @lparen, @rparen = '|', '(?:', ')'
  end
end
|
162
|
+
|
163
|
+
# Metacharacters for the "ruby" type; uses the RegexMetachars defaults unchanged.
class RegexRubyMetachars < RegexMetachars
|
164
|
+
end
|
165
|
+
|
166
|
+
# Metacharacters for the "emacs" type: the bar and both group delimiters
# are each preceded by a backslash.
class RegexEmacsMetachars < RegexMetachars
  def initialize
    @bar, @lparen, @rparen = '\\|', '\\(', '\\)'
  end
end
|
173
|
+
|
174
|
+
# Turns a regex tree (RegexAlternation / RegexConcatnation / RegexCharClass /
# String nodes) into a regular-expression string.
class RegexRenderer
  attr_accessor :with_paren

  # regex::     the tree to render; must not be nil (bare RuntimeError).
  # insertion:: string placed between the characters of every rendered
  #             string and between the parts of a concatenation.
  def initialize(regex, insertion)
    raise if regex.nil?

    @regex = regex
    @meta = RegexMetachars.new
    @insertion = insertion
    @with_paren = false
  end

  # Renders the tree. With with_paren the top level is rendered as a group,
  # e.g. "(a|b|c)"; otherwise its items are joined by the bar, e.g. "a|b|c".
  def render
    if @with_paren
      render0(@regex)
    else
      @regex.map { |x| render0(x) }.join(@meta.bar)
    end
  end

  # Joins +string+ and the already rendered +regexes+ with the bar.
  def join_regexes(string, regexes)
    ([string] + regexes).join(@meta.bar)
  end

  private

  # Renders an alternation; a single alternative is rendered without a
  # group. Raises a bare RuntimeError for an empty alternation.
  def render_alternation(regex)
    case regex.length
    when 0
      raise
    when 1
      render0(regex[0])
    else
      @meta.lparen + regex.map { |x| render0(x) }.join(@meta.bar) + @meta.rparen
    end
  end

  def render_concatnation(regex)
    regex.map { |x| render0(x) }.join(@insertion)
  end

  # We don't use Regexp.quote because the following regex
  # is more general (not ruby-specific) and safe to use.
  def escape_string(string)
    string.gsub(/([\x00-\x1f\x21-\x2f\x3a-\x40\x5b-\x5e\x60\x7b-\x7f])/, '\\\\\\1')
  end

  # Doubles every backslash of an already assembled character class.
  def escape_charclass(string)
    string.gsub("\\") { "\\\\" }
  end

  # Renders a character class as "[...]". Works on a copy, so the caller's
  # array is left untouched.
  def render_charclass(regex)
    chars = regex.dup
    chars.push("-") if chars.delete("-")    # "-" is literal at the end
    chars.unshift("]") if chars.delete("]") # "]" is literal at the beginning
    # A leading "^" would negate the whole class, so move it away from the
    # first position while keeping a trailing "-" last.
    if chars.length > 1 && chars.first == "^"
      chars.shift
      if chars.length > 1 && chars.last == "-"
        chars.insert(-2, "^")
      else
        chars.push("^") # also covers ["-"]: "[-^]" keeps both literal
      end
    end
    escape_charclass("[" + chars.join + "]")
  end

  # Puts the insertion string between the units of +string+, where a unit is
  # a backslash escape or a single character. The block form of gsub keeps
  # backslashes in the insertion literal; a replacement string would
  # interpret sequences such as "\\\\" or "\\0".
  def insert(string)
    return string if @insertion == ""

    tmp = string.gsub(/(\\.|.)/) { |unit| unit + @insertion }
    tmp.sub(/#{Regexp.quote(@insertion)}$/, "")
  end

  def render_string(regex)
    insert(escape_string(regex))
  end

  # Dispatches on the exact class of the node. Raises RuntimeError for any
  # other type.
  def render0(x)
    if x.instance_of?(RegexAlternation)
      render_alternation(x)
    elsif x.instance_of?(RegexConcatnation)
      render_concatnation(x)
    elsif x.instance_of?(RegexCharClass)
      render_charclass(x)
    elsif x.instance_of?(String)
      render_string(x)
    else
      raise "unexpected type: #{x} of #{x.class}"
    end
  end
end
|
262
|
+
|
263
|
+
# Renderer for the "perl" type: same as RegexRenderer but with
# RegexPerlMetachars as the metacharacter set.
class RegexPerlRenderer < RegexRenderer
  def initialize(regex, insertion)
    super
    @meta = RegexPerlMetachars.new
  end
end
|
269
|
+
|
270
|
+
# Renderer for the "ruby" type; identical to RegexPerlRenderer.
class RegexRubyRenderer < RegexPerlRenderer
|
271
|
+
end
|
272
|
+
|
273
|
+
# Renderer for the "egrep" type; identical to RegexRenderer.
class RegexEgrepRenderer < RegexRenderer
|
274
|
+
end
|
275
|
+
|
276
|
+
# Renderer for the "emacs" type: uses RegexEmacsMetachars and its own
# escaping rules.
class RegexEmacsRenderer < RegexRenderer
  # Characters that must stay unescaped in the output.
  EMACS_LITERALS = "()|<>='`{".freeze

  def initialize(regex, insertion)
    super(regex, insertion)
    @meta = RegexEmacsMetachars.new
  end

  # Escapes +string+ with Regexp.quote, then removes the backslash in front
  # of each character listed in EMACS_LITERALS.
  #
  # The quoted text is scanned one escape pair at a time, so an escaped
  # backslash is consumed together with its own escape. Stripping with one
  # gsub per character instead turns "\\\\<" (a literal backslash followed by
  # "<") into the operator "\\<".
  def escape_string(string)
    Regexp.quote(string).gsub(/\\(.)/m) do |pair|
      EMACS_LITERALS.include?($1) ? $1 : pair
    end
  end

  # Character classes are passed through without any escaping.
  def escape_charclass(string)
    string
  end
end
|
300
|
+
|
301
|
+
# Builds the metacharacter set for a regex type name.
module RegexMetacharsFactory
  # type:: nil, "ruby", "emacs", "perl" or "egrep"; nil selects the ruby set.
  #
  # Raises RuntimeError for any other value.
  def new(type)
    case type
    when nil, "ruby" then RegexRubyMetachars.new
    when "emacs" then RegexEmacsMetachars.new
    when "perl" then RegexPerlMetachars.new
    when "egrep" then RegexEgrepMetachars.new
    else raise "Unknown type: #{type}"
    end
  end
  module_function :new
end
|
320
|
+
|
321
|
+
# Builds the renderer for a regex type name.
module RegexRendererFactory
  # regex::     the regex tree handed to the renderer.
  # type::      nil, "ruby", "emacs", "perl" or "egrep"; nil selects ruby.
  # insertion:: the insertion string handed to the renderer.
  #
  # Raises RuntimeError naming the offending +type+ for any other value.
  def new(regex, type, insertion)
    case type
    when nil, "ruby"
      RegexRubyRenderer.new(regex, insertion)
    when "emacs"
      RegexEmacsRenderer.new(regex, insertion)
    when "perl"
      RegexPerlRenderer.new(regex, insertion)
    when "egrep"
      RegexEgrepRenderer.new(regex, insertion)
    else
      # Report the type; the message used to interpolate the regex tree.
      raise "Unknown type: #{type}"
    end
  end
  module_function :new
end
|
340
|
+
end
|