migemo-lib 0.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +6 -0
- data/data/migemo-dict +166746 -0
- data/data/migemo-dict.cache +0 -0
- data/data/migemo-dict.cache.idx +0 -0
- data/data/migemo-dict.idx +0 -0
- data/doc/migemo.ja.rd +66 -0
- data/lib/migemo-dict.rb +126 -0
- data/lib/migemo-regex.rb +340 -0
- data/lib/migemo.rb +183 -0
- data/lib/migemo/core_ext/string.rb +60 -0
- data/lib/migemo/version.rb +8 -0
- data/test/cache_test.rb +28 -0
- data/test/charclass_test.rb +15 -0
- data/test/convert_test.rb +29 -0
- data/test/emacs_type_test.rb +10 -0
- data/test/insertion_test.rb +16 -0
- data/test/migemo_test.rb +50 -0
- data/test/regex_dict_test.rb +24 -0
- data/test/regex_test.rb +16 -0
- data/test/symbols_test.rb +16 -0
- data/test/test_helper.rb +22 -0
- data/test/user_dict_test.rb +22 -0
- metadata +102 -0
Binary file
|
Binary file
|
Binary file
|
data/doc/migemo.ja.rd
ADDED
@@ -0,0 +1,66 @@
|
|
1
|
+
=begin
|
2
|
+
= Ruby/Migemo: ローマ字のまま日本語をインクリメンタル検索する Ruby用のライブラリ
|
3
|
+
Ruby/Migemo はローマ字のまま日本語をインクリメンタル検索する
|
4
|
+
ためのライブラリです。
|
5
|
+
|
6
|
+
=== 使用例
|
7
|
+
|
8
|
+
% cat sample.rb
|
9
|
+
require 'migemo'
|
10
|
+
|
11
|
+
dict = MigemoStaticDict.new("migemo-dict")
|
12
|
+
dict_cache = MigemoDictCache.new("migemo-dict" + ".cache")
|
13
|
+
user_dict = MigemoUserDict.new("user-dict")
|
14
|
+
|
15
|
+
while line = gets
|
16
|
+
pattern = line.chomp
|
17
|
+
migemo = Migemo.new(pattern,dict)
|
18
|
+
migemo.optimization = 3
|
19
|
+
migemo.dict_cache = dict_cache
|
20
|
+
migemo.user_dict = user_dict
|
21
|
+
migemo.type = "ruby"
|
22
|
+
puts migemo.regex
|
23
|
+
end
|
24
|
+
|
25
|
+
== API
|
26
|
+
|
27
|
+
--- MigemoStaticDict#new(filename)
|
28
|
+
静的な辞書のオブジェクトを生成する
|
29
|
+
|
30
|
+
--- MigemoDictCache#new(filename)
|
31
|
+
静的な辞書のキャッシュのオブジェクトを生成する
|
32
|
+
|
33
|
+
--- MigemoUserDict#new(filename)
|
34
|
+
ユーザ辞書のオブジェクトを生成する
|
35
|
+
|
36
|
+
--- MigemoRegexDict#new(filename)
|
37
|
+
正規表現辞書のオブジェクトを生成する
|
38
|
+
|
39
|
+
--- Migemo#new(pattern, dict)
|
40
|
+
Migemoオブジェクトを生成する。dict には
|
41
|
+
MigemoStaticDict オブジェクトかStringを、pattern には検索パター
|
42
|
+
ンを与える
|
43
|
+
|
44
|
+
--- Migemo#regex
|
45
|
+
検索パターンを展開した正規表現の文字列を返す。
|
46
|
+
|
47
|
+
--- Migemo#type
|
48
|
+
正規表現の種類 (emacs, ruby, perl) を設定する accessor。[ruby]
|
49
|
+
|
50
|
+
--- Migemo#dict_cache
|
51
|
+
静的辞書のキャッシュを設定する accessor。
|
52
|
+
|
53
|
+
--- Migemo#user_dict
|
54
|
+
ユーザ辞書のオブジェクトを設定する accessor。
|
55
|
+
|
56
|
+
--- Migemo#regex_dict
|
57
|
+
正規表現辞書のオブジェクトを設定する accessor。
|
58
|
+
|
59
|
+
--- Migemo#insertion
|
60
|
+
1文字ごとに挟む文字列を設定する accessor。
|
61
|
+
|
62
|
+
--- Migemo#optimization
|
63
|
+
正規表現のコンパクト化のレベル (0-3) を設定する accessor。[3]
|
64
|
+
|
65
|
+
satoru@namazu.org
|
66
|
+
=end
|
data/lib/migemo-dict.rb
ADDED
@@ -0,0 +1,126 @@
|
|
1
|
+
#
|
2
|
+
# Ruby/Migemo - a library for Japanese incremental search.
|
3
|
+
#
|
4
|
+
# Copyright (C) 2001 Satoru Takabayashi <satoru@namazu.org>
|
5
|
+
# All rights reserved.
|
6
|
+
# This is free software with ABSOLUTELY NO WARRANTY.
|
7
|
+
#
|
8
|
+
# You can redistribute it and/or modify it under the terms of
|
9
|
+
# the GNU General Public License version 2.
|
10
|
+
|
11
|
+
require 'bsearch'
|
12
|
+
require 'migemo/core_ext/string'
|
13
|
+
|
14
|
+
# One dictionary entry: a key together with the values stored under it.
class MigemoDictItem
  attr_reader :key, :values

  # key    - the entry's key; must not be nil.
  # values - the values stored under the key; must not be nil.
  #
  # Raises RuntimeError (with a descriptive message) when either
  # argument is nil. Validation happens before any state is assigned.
  def initialize(key, values)
    raise "nil key" if key.nil?
    raise "nil values" if values.nil?

    @key = key
    @values = values
  end
end
|
25
|
+
|
26
|
+
# Base class of the file-backed dictionaries. It opens the dictionary
# file and provides the tab-separated line parser used by subclasses.
class MigemoDict
  # Opens +filename+ for reading and keeps the handle in @dict.
  # The handle is not closed here.
  def initialize(filename)
    @dict = File.new(filename)
  end

  # Validates +pattern+. The base class performs no search and always
  # returns nil.
  #
  # Raises RuntimeError ("nil pattern") when +pattern+ is nil. The check
  # has to run before #downcase is called, otherwise a nil pattern
  # surfaces as NoMethodError and this message is never reached.
  def lookup(pattern)
    raise "nil pattern" if pattern.nil?

    pattern.downcase
    nil
  end

  private

  # Splits one dictionary line into its key (first tab-separated field)
  # and the array of remaining fields.
  #
  # Returns [key, values]. Raises RuntimeError when the line has no
  # fields at all (e.g. an empty line).
  def decompose(line)
    key, *values = line.chomp.split("\t")
    raise "no key in dictionary line: #{line.inspect}" if key.nil?

    return key, values
  end
end
|
46
|
+
|
47
|
+
# Dictionary searched directly on disk through a companion index file
# (+filename+ + ".idx") holding 32-bit big-endian offsets into the
# dictionary file.
class MigemoStaticDict < MigemoDict
  # Opens the dictionary via the superclass and loads the offset index.
  # The index file is read in binary mode inside a block so that its
  # handle is closed as soon as the offsets have been unpacked.
  def initialize(filename)
    super(filename)
    @index = File.open(filename + ".idx", "rb") { |f| f.read.unpack("N*") }
  end

  # Yields a MigemoDictItem for every index position in the range that
  # Array#bsearch_range (from the 'bsearch' library) reports for the
  # block below, which compares each entry's key to +pattern+ with
  # String#prefix_match (defined outside this file).
  #
  # Returns nil when no range is found, otherwise the range iterated.
  def lookup(pattern)
    range = @index.bsearch_range do |offset|
      key, = decompose(get_line(offset))
      key.prefix_match(pattern)
    end
    return nil unless range

    range.each do |i|
      key, values = decompose(get_line(@index[i]))
      yield(MigemoDictItem.new(key, values))
    end
  end

  private

  # Reads the dictionary line that starts at byte offset +index+.
  def get_line(index)
    @dict.seek(index)
    @dict.gets
  end
end
|
72
|
+
|
73
|
+
# Dictionary that is loaded completely into memory at construction.
class MigemoUserDict < MigemoDict
  # Reads every line of +filename+, drops the lines starting with ";"
  # and keeps the rest sorted in @lines. The file is never read again
  # after this point, so its handle is closed here instead of being
  # left open for the lifetime of the object.
  def initialize(filename)
    super(filename)
    begin
      @lines = @dict.readlines.reject { |line| /^;/ =~ line }.sort
    ensure
      @dict.close
    end
  end

  # Yields a MigemoDictItem for every position in the range that
  # Array#bsearch_range (from the 'bsearch' library) reports for the
  # block below, which compares each line's key to +pattern+ with
  # String#prefix_match (defined outside this file).
  #
  # Returns nil when no range is found, otherwise the range iterated.
  def lookup(pattern)
    range = @lines.bsearch_range do |line|
      key, = decompose(line)
      key.prefix_match(pattern)
    end
    return nil unless range

    range.each do |i|
      key, values = decompose(@lines[i])
      yield(MigemoDictItem.new(key, values))
    end
  end
end
|
92
|
+
|
93
|
+
# Regular-expression dictionary. It adds nothing of its own: loading and
# lookup are inherited unchanged from MigemoUserDict.
class MigemoRegexDict < MigemoUserDict; end
|
95
|
+
|
96
|
+
# Read-only cache stored as binary records and addressed through a
# companion index file (+filename+ + ".idx") of 32-bit big-endian
# offsets. Each record is laid out as
#   key length (N) | key bytes | data length (N) | Marshal-dumped data
class MigemoDictCache
  # Opens the cache file and loads the offset index. Both files hold
  # packed binary data and are therefore opened in binary mode. The
  # index handle is closed right after reading; the cache handle stays
  # open because #decompose seeks in it on every lookup.
  def initialize(filename)
    @dict = File.new(filename, "rb")
    @index = File.open(filename + ".idx", "rb") { |f| f.read.unpack("N*") }
  end

  # Returns the data stored for the down-cased +pattern+, or nil when
  # Array#bsearch_first (from the 'bsearch' library) finds no record
  # whose key compares equal to it.
  #
  # Raises RuntimeError when +pattern+ is nil.
  def lookup(pattern)
    raise "nil pattern" if pattern.nil?

    pattern = pattern.downcase
    position = @index.bsearch_first do |offset|
      key, = decompose(offset)
      key <=> pattern
    end
    return nil unless position

    decompose(@index[position]).last
  end

  private

  # Reads the record starting at byte offset +idx+ and returns
  # [key, data].
  def decompose(idx)
    @dict.seek(idx)
    keylen = @dict.read(4).unpack("N").first
    key = @dict.read(keylen).unpack("a*").first
    datalen = @dict.read(4).unpack("N").first
    # SECURITY NOTE(review): Marshal.load can instantiate arbitrary
    # objects; this is only safe while the cache file comes from a
    # trusted source.
    data = Marshal.load(@dict.read(datalen))
    return key, data
  end
end
|
data/lib/migemo-regex.rb
ADDED
@@ -0,0 +1,340 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
#
|
3
|
+
# Ruby/Migemo - a library for Japanese incremental search.
|
4
|
+
#
|
5
|
+
# Copyright (C) 2001 Satoru Takabayashi <satoru@namazu.org>
|
6
|
+
# All rights reserved.
|
7
|
+
# This is free software with ABSOLUTELY NO WARRANTY.
|
8
|
+
#
|
9
|
+
# You can redistribute it and/or modify it under the terms of
|
10
|
+
# the GNU General Public License version 2.
|
11
|
+
|
12
|
+
module MigemoRegex
|
13
|
+
# A list of alternative patterns. The Array methods below are overridden
# so that their result is again a RegexAlternation: a clone of the
# receiver whose contents are replaced by what Array computed.
class RegexAlternation < Array
  def sort
    rewrap(super)
  end

  def uniq
    rewrap(super)
  end

  def map
    rewrap(super { |element| yield(element) })
  end

  # Array#delete_if works in place, so the receiver itself loses the
  # matching elements in addition to the copy that is returned.
  def delete_if
    rewrap(super { |element| yield(element) })
  end

  def select
    rewrap(super { |element| yield(element) })
  end

  private

  # Returns a clone of the receiver holding +elements+.
  def rewrap(elements)
    clone.replace(elements)
  end
end
|
34
|
+
|
35
|
+
# A sequence of patterns to be matched one after another. #map is
# overridden so that it yields a RegexConcatnation again (a clone of the
# receiver refilled with the mapped elements).
class RegexConcatnation < Array
  def map
    mapped = super { |part| yield(part) }
    clone.replace(mapped)
  end
end
|
40
|
+
|
41
|
+
# Marker type for a list of single characters; it adds no behaviour to
# Array and exists so other code can tell it apart by class.
class RegexCharClass < Array; end
|
43
|
+
|
44
|
+
# Collects candidate words and optionally compacts them into a smaller
# regex tree made of RegexAlternation, RegexConcatnation and
# RegexCharClass nodes.
class RegexCompiler
  attr_reader :regex

  def initialize
    @regex = RegexAlternation.new
  end

  # Adds +item+ to the list of alternatives; nil, false and "" are
  # ignored (nil is returned in that case).
  def push(item)
    @regex.push(item) if item && item != ""
  end

  # Returns a de-duplicated copy of the collected alternatives.
  # NOTE(review): the copy is returned but not stored back into @regex,
  # so calling this for its side effect alone changes nothing. Confirm
  # what callers expect before altering it.
  def uniq
    @regex.uniq
  end

  # Applies the optimisation passes up to +level+ (1..3); each pass
  # works on the output of the previous one.
  def optimize(level)
    @regex = optimize1(@regex) if level >= 1
    @regex = optimize2(@regex) if level >= 2
    @regex = optimize3(@regex) if level >= 3
  end

  private

  # Returns +words+ as a sorted plain Array. When the runtime knows
  # about encodings the sort key is the EUC-JP form of each word.
  # Characters that cannot be represented in EUC-JP are replaced in the
  # key (instead of raising a conversion error) and the word itself is
  # used as a tie-breaker.
  def sorted_words(words)
    return words.sort unless defined?(Encoding)

    words.sort_by do |word|
      [word.encode("EUC-JP", :invalid => :replace, :undef => :replace), word]
    end
  end

  # Drops every word that merely extends a word already kept:
  #   ["運", "運動", "運転", "日本", "日本語"] => ["運", "日本"]
  #   (運|運動|運転|日本|日本語)             => (運|日本)
  #
  # Always returns a RegexAlternation; sort_by/select on their own hand
  # back a plain Array, which the renderer does not accept as a node.
  def optimize1(regex)
    prefix = nil
    kept = sorted_words(regex).select do |word|
      if prefix && word.start_with?(prefix)
        false # excluded: an extension of the previously kept word
      else
        prefix = word
        true  # included
      end
    end
    RegexAlternation.new.replace(kept)
  end

  # Factors out shared leading characters:
  #   (あああ|ああい|ああう) => (あ(あ(あ|い|う)))
  #
  # Relies on String#first and String#rest, which are defined outside
  # this file.
  def optimize2(regex)
    queue = sorted_words(regex)
    optimized = RegexAlternation.new
    until queue.empty?
      head = queue.shift
      initial = head.first
      friends = RegexAlternation.new
      # Gather the directly following words that share head's initial.
      while (item = queue.first) && initial == item.first
        friends.push(item.rest)
        queue.shift
      end
      if friends.empty?
        optimized.push(head)
      else
        concat = RegexConcatnation.new
        concat.push(initial)
        friends.unshift(head.rest)
        concat.push(optimize2(friends))
        optimized.push(concat)
      end
    end
    return optimized
  end

  # Merges single-character alternatives into one character class:
  #   (あ|い|う|え|お) => [あいうえお]
  #
  # The single-character strings are removed from +regex+ in place and
  # put back at its front, as a RegexCharClass (or as the bare string
  # when there is only one). Nested alternations and concatenations are
  # processed recursively.
  def optimize3(regex)
    charclass = RegexCharClass.new
    if regex.instance_of?(RegexAlternation)
      regex.delete_if do |x|
        single = x.instance_of?(String) && x =~ /^.$/
        charclass.push(x) if single
        single
      end
    end

    if charclass.length == 1
      regex.unshift(charclass.first)
    elsif charclass.length > 1
      regex.unshift(charclass)
    end

    regex.map do |x|
      if x.instance_of?(RegexAlternation) || x.instance_of?(RegexConcatnation)
        optimize3(x)
      else
        x
      end
    end
  end
end
|
140
|
+
|
141
|
+
# Holds the strings used for alternation and grouping when a regex is
# rendered. The defaults are "|", "(" and ")".
class RegexMetachars
  attr_accessor :bar, :lparen, :rparen

  def initialize
    @bar, @lparen, @rparen = '|', '(', ')'
  end
end
|
151
|
+
|
152
|
+
# Metacharacters for egrep output; identical to the RegexMetachars
# defaults.
class RegexEgrepMetachars < RegexMetachars; end
|
154
|
+
|
155
|
+
# Metacharacters for Perl output: groups are opened with "(?:".
class RegexPerlMetachars < RegexMetachars
  def initialize
    @bar, @lparen, @rparen = '|', '(?:', ')'
  end
end
|
162
|
+
|
163
|
+
# Metacharacters for Ruby output; identical to the RegexMetachars
# defaults.
# NOTE(review): RegexRubyRenderer derives from RegexPerlRenderer and so
# renders groups with "(?:", while this class keeps the plain "(" —
# confirm which of the two is intended.
class RegexRubyMetachars < RegexMetachars; end
|
165
|
+
|
166
|
+
# Metacharacters for Emacs output: the alternation bar and both
# parentheses are written with a leading backslash.
class RegexEmacsMetachars < RegexMetachars
  def initialize
    @bar, @lparen, @rparen = '\\|', '\\(', '\\)'
  end
end
|
173
|
+
|
174
|
+
# Turns a regex tree (RegexAlternation / RegexConcatnation /
# RegexCharClass / String nodes) into its textual form. Subclasses swap
# the metacharacter set and may override the escaping methods.
class RegexRenderer
  attr_accessor :with_paren

  # regex     - the tree to render; must not be nil.
  # insertion - string placed between adjacent characters of the
  #             rendered words (nil or "" disables it).
  #
  # Raises RuntimeError when +regex+ is nil.
  def initialize(regex, insertion)
    raise "nil regex" if regex.nil?

    @regex = regex
    @meta = RegexMetachars.new
    @insertion = insertion
    @with_paren = false
  end

  # Returns the rendered regex string. With #with_paren set the whole
  # tree is rendered as one node, e.g. "(a|b|c)"; otherwise the top-level
  # elements are joined with the bar only, e.g. "a|b|c".
  def render
    return render0(@regex) if @with_paren

    @regex.map { |x| render0(x) }.join(@meta.bar)
  end

  # Joins +string+ and every element of +regexes+ with the bar.
  def join_regexes(string, regexes)
    ([string] + regexes).join(@meta.bar)
  end

  private

  # Renders an alternation; more than one element is wrapped in the
  # grouping parentheses. Raises RuntimeError for an empty alternation.
  def render_alternation(regex)
    raise "empty alternation" if regex.length == 0
    return render0(regex[0]) if regex.length == 1

    @meta.lparen + regex.map { |x| render0(x) }.join(@meta.bar) + @meta.rparen
  end

  def render_concatnation(regex)
    regex.map { |x| render0(x) }.join(@insertion)
  end

  # We don't use Regexp.quote because the following regex
  # is more general (not ruby-specific) and safe to use.
  def escape_string(string)
    string.gsub(/([\x00-\x1f\x21-\x2f\x3a-\x40\x5b-\x5e\x60\x7b-\x7f])/, '\\\\\\1')
  end

  # Doubles every backslash inside a rendered character class.
  def escape_charclass(string)
    string.gsub(/\\/, '\\\\\\')
  end

  # Renders a character class such as "[abc]". The work is done on a
  # copy so the node passed in is left untouched. Characters that are
  # special by position are moved to where they are literal:
  #   "-" goes last, "]" goes first, and "^" is never left in first
  #   place (there it would negate the class) as long as the class
  #   contains any other character.
  def render_charclass(regex)
    chars = regex.to_a.dup
    chars.push("-") if chars.delete("-")
    chars.unshift("]") if chars.delete("]")
    if chars.first == "^"
      others = chars - ["^"]
      chars = [others.first, "^"] + others[1..-1] unless others.empty?
    end
    escape_charclass("[" + chars.join + "]")
  end

  # Puts @insertion after every character (or backslash-escaped
  # character) of +string+ except the last. The block form of gsub is
  # used so that backslashes in @insertion are copied literally rather
  # than being read as replacement escapes.
  def insert(string)
    return string if @insertion.nil? || @insertion == ""

    spaced = string.gsub(/\\.|./) { |token| token + @insertion }
    spaced.sub(/#{Regexp.quote(@insertion)}$/, "")
  end

  def render_string(regex)
    insert(escape_string(regex))
  end

  # Dispatches on the exact class of the node.
  # Raises RuntimeError for any other kind of object.
  def render0(x)
    if x.instance_of?(RegexAlternation)
      render_alternation(x)
    elsif x.instance_of?(RegexConcatnation)
      render_concatnation(x)
    elsif x.instance_of?(RegexCharClass)
      render_charclass(x)
    elsif x.instance_of?(String)
      render_string(x)
    else
      raise "unexpected type: #{x} of #{x.class}"
    end
  end
end
|
262
|
+
|
263
|
+
# Renderer that uses the Perl metacharacter set.
class RegexPerlRenderer < RegexRenderer
  def initialize(regex, insertion)
    super
    @meta = RegexPerlMetachars.new
  end
end
|
269
|
+
|
270
|
+
# Renderer for Ruby output; behaves exactly like RegexPerlRenderer.
class RegexRubyRenderer < RegexPerlRenderer; end
|
272
|
+
|
273
|
+
# Renderer for egrep output; behaves exactly like RegexRenderer.
class RegexEgrepRenderer < RegexRenderer; end
|
275
|
+
|
276
|
+
# Renderer that uses the Emacs metacharacter set and Emacs-specific
# escaping.
class RegexEmacsRenderer < RegexRenderer
  def initialize(regex, insertion)
    super
    @meta = RegexEmacsMetachars.new
  end

  # Quotes +string+ with Regexp.quote and then removes the backslash
  # again in front of ( ) | < > = ' ` and {.
  def escape_string(string)
    Regexp.quote(string).gsub(/\\([()|<>='`{])/, '\\1')
  end

  # Character classes are emitted as they are, without extra escaping.
  def escape_charclass(string)
    string
  end
end
|
300
|
+
|
301
|
+
# Builds the metacharacter object that belongs to a regex type name.
module RegexMetacharsFactory
  # type - nil, "ruby", "emacs", "perl" or "egrep" (nil selects the
  #        Ruby set).
  #
  # Raises RuntimeError for any other value.
  def new(type)
    metachars_class =
      case type
      when nil, "ruby" then RegexRubyMetachars
      when "emacs"     then RegexEmacsMetachars
      when "perl"      then RegexPerlMetachars
      when "egrep"     then RegexEgrepMetachars
      else raise "Unknown type: #{type}"
      end
    metachars_class.new
  end
  module_function :new
end
|
320
|
+
|
321
|
+
# Builds the renderer that belongs to a regex type name.
module RegexRendererFactory
  # regex     - the regex tree handed to the renderer.
  # type      - nil, "ruby", "emacs", "perl" or "egrep" (nil selects the
  #             Ruby renderer).
  # insertion - the insertion string handed to the renderer.
  #
  # Raises RuntimeError naming the offending +type+ for any other value.
  def new(regex, type, insertion)
    case type
    when nil, "ruby"
      RegexRubyRenderer.new(regex, insertion)
    when "emacs"
      RegexEmacsRenderer.new(regex, insertion)
    when "perl"
      RegexPerlRenderer.new(regex, insertion)
    when "egrep"
      RegexEgrepRenderer.new(regex, insertion)
    else
      # Report the unknown type itself, not the regex being rendered.
      raise "Unknown type: #{type}"
    end
  end
  module_function :new
end
|
340
|
+
end
|