migemo-lib 0.4.3

Sign up to get free protection for your applications and to get access to all the features.
Binary file
Binary file
@@ -0,0 +1,66 @@
1
+ =begin
2
+ = Ruby/Migemo: ローマ字のまま日本語をインクリメンタル検索する Ruby用のライブラリ
3
+ Ruby/Migemo はローマ字のまま日本語をインクリメンタル検索する
4
+ ためのライブラリです。
5
+
6
+ === 使用例
7
+
8
+ % cat sample.rb
9
+ require 'migemo'
10
+
11
+ dict = MigemoStaticDict.new("migemo-dict")
12
+ dict_cache = MigemoDictCache.new("migemo-dict" + ".cache")
13
+ user_dict = MigemoUserDict.new("user-dict")
14
+
15
+ while line = gets
16
+ pattern = line.chomp
17
+ migemo = Migemo.new(pattern,dict)
18
+ migemo.optimization = 3
19
+ migemo.dict_cache = dict_cache
20
+ migemo.user_dict = user_dict
21
+ migemo.type = "ruby"
22
+ puts migemo.regex
23
+ end
24
+
25
+ == API
26
+
27
+ --- MigemoStaticDict#new(filename)
28
+ 静的な辞書のオブジェクトを生成する
29
+
30
+ --- MigemoDictCache#new(filename)
31
+ 静的な辞書のキャッシュのオブジェクトを生成する
32
+
33
+ --- MigemoUserDict#new(filename)
34
+ ユーザ辞書のオブジェクトを生成する
35
+
36
+ --- MigemoRegexDict#new(filename)
37
+ 正規表現辞書のオブジェクトを生成する
38
+
39
+ --- Migemo#new(pattern, dict)
40
+ Migemoオブジェクトを生成する。dict には
41
+ MigemoStaticDict オブジェクトかStringを、pattern には検索パター
42
+ ンを与える
43
+
44
+ --- Migemo#regex
45
+ 検索パターンを展開した正規表現の文字列を返す。
46
+
47
+ --- Migemo#type
48
+ 正規表現の種類 (emacs, ruby, perl, egrep) を設定する accessor。[ruby]
49
+
50
+ --- Migemo#dict_cache
51
+ 静的辞書のキャッシュを設定する accessor。
52
+
53
+ --- Migemo#user_dict
54
+ ユーザ辞書のオブジェクトを設定する accessor。
55
+
56
+ --- Migemo#regex_dict
57
+ 正規表現辞書のオブジェクトを設定する accessor。
58
+
59
+ --- Migemo#insertion
60
+ 1文字ごとに挟む文字列を設定する accessor。
61
+
62
+ --- Migemo#optimization
63
+ 正規表現のコンパクト化のレベル (0-3) を設定する accessor。[3]
64
+
65
+ satoru@namazu.org
66
+ =end
@@ -0,0 +1,126 @@
1
+ #
2
+ # Ruby/Migemo - a library for Japanese incremental search.
3
+ #
4
+ # Copyright (C) 2001 Satoru Takabayashi <satoru@namazu.org>
5
+ # All rights reserved.
6
+ # This is free software with ABSOLUTELY NO WARRANTY.
7
+ #
8
+ # You can redistribute it and/or modify it under the terms of
9
+ # the GNU General Public License version 2.
10
+
11
+ require 'bsearch'
12
+ require 'migemo/core_ext/string'
13
+
14
# A single dictionary entry: a key together with the values stored under it.
class MigemoDictItem
  attr_reader :key
  attr_reader :values

  # key::    the entry's key; must not be nil.
  # values:: the values associated with the key; must not be nil.
  #
  # Raises RuntimeError (with a descriptive message) when either argument
  # is nil. Arguments are validated before any state is assigned.
  def initialize(key, values)
    raise "nil key" if key.nil?
    raise "nil values" if values.nil?

    @key = key
    @values = values
  end
end
25
+
26
# Base class for file-backed dictionaries. It opens the dictionary file and
# provides the tab-separated line parser shared by the subclasses.
class MigemoDict
  # filename:: path of the dictionary file. The handle is kept in @dict for
  #            the lifetime of the object because subclasses read from it.
  def initialize(filename)
    @dict = File.new(filename)
  end

  # Base implementation: only validates the argument and returns nil.
  # Subclasses override this with a real search.
  #
  # Raises RuntimeError ("nil pattern") when pattern is nil. The check runs
  # first; previously pattern.downcase was called before it, so a nil
  # pattern surfaced as NoMethodError and the intended error was unreachable.
  def lookup(pattern)
    raise "nil pattern" if pattern.nil?

    nil
  end

  private

  # Splits one dictionary line of the form "key<TAB>value<TAB>value..." and
  # returns [key, values]. values is an empty Array when the line has a key
  # only.
  #
  # Raises RuntimeError when the line is empty and therefore has no key.
  def decompose(line)
    # String#split never yields nil elements, so no nil filtering is needed.
    key, *values = line.chomp.split("\t")
    raise "malformed dictionary line: #{line.inspect}" if key.nil?

    return key, values
  end
end
46
+
47
# Dictionary backed by a text file plus a companion "<filename>.idx" file.
# The index is read as a sequence of 32-bit big-endian unsigned integers
# (unpack "N*"); each one is used as a byte offset passed to File#seek on
# the dictionary file.
class MigemoStaticDict < MigemoDict
  # filename:: path of the dictionary; the index is read from
  #            filename + ".idx".
  def initialize(filename)
    super(filename)
    # Block form closes the index handle as soon as it is read (the original
    # File.new(...).read left it open), and "rb" keeps the packed integers
    # intact on platforms that translate line endings in text mode.
    @index = File.open(filename + ".idx", "rb") { |idx| idx.read }.unpack("N*")
  end

  # Yields a MigemoDictItem for every entry selected by the search.
  # Returns nil when nothing matches, otherwise the value of range.each.
  #
  # NOTE(review): bsearch_range comes from the external 'bsearch' library and
  # prefix_match from migemo/core_ext/string; presumably the block result is
  # a three-way comparison and the return value is a Range of positions in
  # @index - confirm against those libraries.
  def lookup(pattern)
    range = @index.bsearch_range do |offset|
      key, = decompose(get_line(offset))
      key.prefix_match(pattern)
    end
    return unless range

    range.each do |i|
      key, values = decompose(get_line(@index[i]))
      yield(MigemoDictItem.new(key, values))
    end
  end

  private

  # Returns the dictionary line that starts at byte offset index.
  def get_line(index)
    @dict.seek(index)
    @dict.gets
  end
end
72
+
73
# Dictionary loaded entirely into memory: the file's lines are read once,
# comment and blank lines are dropped, and the rest is kept sorted in @lines.
class MigemoUserDict < MigemoDict
  # filename:: path of the user dictionary file.
  def initialize(filename)
    super(filename)
    # Lines starting with ";" are comments. Blank lines are dropped as well:
    # they have no key, so decompose would raise if the search probed one.
    @lines = @dict.readlines.reject { |line| line =~ /^;/ || line.chomp.empty? }.sort
    # Everything needed is now in memory; release the descriptor.
    @dict.close
  end

  # Yields a MigemoDictItem for every entry selected by the search.
  # Returns nil when nothing matches, otherwise the value of range.each.
  #
  # NOTE(review): bsearch_range comes from the external 'bsearch' library and
  # prefix_match from migemo/core_ext/string; presumably the return value is
  # a Range of positions in @lines - confirm against those libraries.
  def lookup(pattern)
    range = @lines.bsearch_range do |line|
      key, = decompose(line)
      key.prefix_match(pattern)
    end
    return unless range

    range.each do |i|
      key, values = decompose(@lines[i])
      yield(MigemoDictItem.new(key, values))
    end
  end
end
92
+
93
# Regex dictionary. Adds no behavior of its own; everything is inherited
# from MigemoUserDict.
class MigemoRegexDict < MigemoUserDict; end
95
+
96
# Read-only cache stored as a binary file plus a "<filename>.idx" index.
#
# Each record in the cache file is laid out as:
#   4-byte big-endian key length, key bytes,
#   4-byte big-endian data length, Marshal-serialized data.
# The index is a sequence of 32-bit big-endian unsigned integers, each used
# as the byte offset of one record.
class MigemoDictCache
  # filename:: path of the cache file; the index is read from
  #            filename + ".idx".
  def initialize(filename)
    # Both files hold packed binary data, so open them in binary mode.
    @dict = File.new(filename, "rb")
    # Block form closes the index handle once it has been read.
    @index = File.open(filename + ".idx", "rb") { |idx| idx.read }.unpack("N*")
  end

  # Returns the data stored for the down-cased pattern, or nil when the
  # search finds no record.
  #
  # Raises RuntimeError ("nil pattern") when pattern is nil.
  #
  # NOTE(review): bsearch_first comes from the external 'bsearch' library;
  # presumably it returns a position in @index or nil - confirm.
  def lookup(pattern)
    raise "nil pattern" if pattern.nil?

    pattern = pattern.downcase
    idx = @index.bsearch_first do |offset|
      key, = decompose(offset)
      key <=> pattern
    end
    return nil unless idx

    _key, data = decompose(@index[idx])
    data
  end

  private

  # Reads the record starting at byte offset idx and returns [key, data].
  def decompose(idx)
    @dict.seek(idx)
    keylen = @dict.read(4).unpack("N").first
    key = @dict.read(keylen).unpack("a*").first
    datalen = @dict.read(4).unpack("N").first
    # SECURITY: Marshal.load can instantiate arbitrary objects. Only point
    # this class at cache files from a trusted source.
    data = Marshal.load(@dict.read(datalen))
    return key, data
  end
end
@@ -0,0 +1,340 @@
1
+ # -*- coding: utf-8 -*-
2
+ #
3
+ # Ruby/Migemo - a library for Japanese incremental search.
4
+ #
5
+ # Copyright (C) 2001 Satoru Takabayashi <satoru@namazu.org>
6
+ # All rights reserved.
7
+ # This is free software with ABSOLUTELY NO WARRANTY.
8
+ #
9
+ # You can redistribute it and/or modify it under the terms of
10
+ # the GNU General Public License version 2.
11
+
12
+ module MigemoRegex
13
# Array subclass holding the branches of an alternation (a|b|c).
#
# Array's own sort/uniq/map/select may hand back a plain Array. Each
# override below copies that result into a clone of the receiver, so the
# value returned has the receiver's class.
class RegexAlternation < Array
  def sort
    sorted = super
    clone.replace(sorted)
  end

  def uniq
    deduped = super
    clone.replace(deduped)
  end

  def map
    mapped = super { |item| yield(item) }
    clone.replace(mapped)
  end

  # Array#delete_if removes the matching elements from the receiver itself;
  # the value returned here is a separate copy of the receiver after that
  # removal.
  def delete_if
    remaining = super { |item| yield(item) }
    clone.replace(remaining)
  end

  def select
    chosen = super { |item| yield(item) }
    clone.replace(chosen)
  end
end
34
+
35
# Array subclass holding the parts of a concatenation.
class RegexConcatnation < Array
  # Like Array#map, but the mapped elements are copied into a clone of the
  # receiver, so the result has the receiver's class.
  def map
    mapped = super { |part| yield(part) }
    clone.replace(mapped)
  end
end
40
+
41
# Array subclass marking a list of single characters to be rendered as a
# bracketed character class. It adds no behavior of its own.
class RegexCharClass < Array; end
43
+
44
# Collects candidate words into a RegexAlternation and optionally compacts
# that alternation in up to three passes (see #optimize).
class RegexCompiler
  attr_reader :regex

  def initialize
    @regex = RegexAlternation.new
  end

  # Appends item unless it is nil, false or the empty string.
  def push(item)
    @regex.push(item) if item && item != ""
  end

  # Removes duplicate entries from the collected alternation and returns it.
  # The result is stored back into @regex; previously the de-duplicated copy
  # was computed and thrown away, so calling this had no effect.
  def uniq
    @regex = @regex.uniq
  end

  # Applies the optimization passes cumulatively:
  # level >= 1 runs optimize1, >= 2 also optimize2, >= 3 also optimize3.
  def optimize(level)
    @regex = optimize1(@regex) if level >= 1
    @regex = optimize2(@regex) if level >= 2
    @regex = optimize3(@regex) if level >= 3
  end

  private

  # Returns the words of regex in sorted order as a new collection.
  #
  # When Encoding is available the sort key is the EUC-JP encoding of each
  # word. If any word cannot be converted (for example a character that
  # EUC-JP cannot represent) the plain string order is used instead of
  # letting the encoding error abort the optimization. Either order places
  # a word before the longer words it prefixes, which is all the passes
  # below rely on.
  def sort_words(regex)
    return regex.sort unless defined?(Encoding)

    begin
      regex.sort_by { |word| word.encode("EUC-JP") }
    rescue EncodingError
      regex.sort
    end
  end

  # Drops every word that has an already-kept word as a prefix.
  #   ["un", "undo", "unten", "nihon", "nihongo"] => ["nihon", "un"]
  #   (un|undo|unten|nihon|nihongo)                => (nihon|un)
  #
  # Always returns a RegexAlternation. (sort_by/select yield a plain Array,
  # which the renderer rejects when it is asked to render the tree itself.)
  def optimize1(regex)
    prefix = nil
    kept = sort_words(regex).select do |word|
      if prefix && word.start_with?(prefix)
        false # excluded: covered by the shorter word
      else
        prefix = word
        true # included
      end
    end
    RegexAlternation.new.replace(kept)
  end

  # Factors out common leading characters.
  #   (aaa|aab|aac) => (a(a(a|b|c)))
  #
  # NOTE(review): String#first and String#rest are not core Ruby; they are
  # presumably defined by migemo/core_ext/string as the first character and
  # the remainder - confirm.
  def optimize2(regex)
    pending = sort_words(regex) # a fresh collection, safe to consume
    optimized = RegexAlternation.new
    until pending.empty?
      head = pending.shift
      initial = head.first
      friends = RegexAlternation.new
      # Gather the following words that share head's first character.
      while (item = pending.first) && initial == item.first
        friends.push(item.rest)
        pending.shift
      end
      if friends.empty?
        optimized.push(head)
      else
        friends.unshift(head.rest)
        concat = RegexConcatnation.new
        concat.push(initial)
        concat.push(optimize2(friends))
        optimized.push(concat)
      end
    end
    optimized
  end

  # Merges single-character branches of an alternation into a character
  # class, recursing into nested alternations and concatenations.
  #   (a|b|c|d|e) => [abcde]
  def optimize3(regex)
    charclass = RegexCharClass.new
    if regex.instance_of?(RegexAlternation)
      # delete_if removes the matching entries from regex itself.
      regex.delete_if do |x|
        # \A...\z: exactly one character. With ^...$ a character followed
        # by a newline would also have been treated as a single character.
        if x.instance_of?(String) && x =~ /\A.\z/
          charclass.push(x)
          true
        end
      end
    end

    if charclass.length == 1
      regex.unshift(charclass.first) # a lone character needs no class
    elsif charclass.length > 1
      regex.unshift(charclass)
    end

    regex.map do |x|
      if x.instance_of?(RegexAlternation) || x.instance_of?(RegexConcatnation)
        optimize3(x)
      else
        x
      end
    end
  end
end
140
+
141
# Holds the three metacharacter strings used when rendering a regex:
# the alternation bar and the opening/closing group delimiters.
# Defaults are "|", "(" and ")".
class RegexMetachars
  attr_accessor :bar, :lparen, :rparen

  def initialize
    @bar, @lparen, @rparen = '|', '(', ')'
  end
end
151
+
152
# Metacharacters for egrep output; uses the RegexMetachars defaults unchanged.
class RegexEgrepMetachars < RegexMetachars; end
154
+
155
# Metacharacters for Perl output: groups open with "(?:" instead of "(".
class RegexPerlMetachars < RegexMetachars
  # Sets all three strings directly; the superclass initializer is not called.
  def initialize
    @bar, @lparen, @rparen = '|', '(?:', ')'
  end
end
162
+
163
# Metacharacters for Ruby output; uses the RegexMetachars defaults unchanged.
# NOTE(review): RegexRubyRenderer inherits the Perl renderer, which uses
# "(?:" groups, while this class keeps "(" - confirm this is intended.
class RegexRubyMetachars < RegexMetachars; end
165
+
166
# Metacharacters for Emacs output: the bar and both group delimiters are
# each preceded by a backslash.
class RegexEmacsMetachars < RegexMetachars
  # Sets all three strings directly; the superclass initializer is not called.
  def initialize
    @bar, @lparen, @rparen = '\\|', '\\(', '\\)'
  end
end
173
+
174
# Turns a regex tree (RegexAlternation / RegexConcatnation / RegexCharClass
# / String nodes) into a regex string, using the metacharacters in @meta.
class RegexRenderer
  # When true, #render renders the top-level node itself, e.g. "(a|b|c)";
  # when false it joins the top-level branches with the bar, e.g. "a|b|c".
  attr_accessor :with_paren

  # regex::     the tree to render; must not be nil (RuntimeError otherwise).
  # insertion:: string placed between consecutive characters of rendered
  #             strings and between the parts of a concatenation. "" or nil
  #             inserts nothing.
  def initialize(regex, insertion)
    raise "nil regex" if regex.nil?

    @regex = regex
    @meta = RegexMetachars.new
    @insertion = insertion
    @with_paren = false
  end

  # Returns the rendered regex string.
  def render
    return render0(@regex) if @with_paren # e.g. "(a|b|c)"

    @regex.map { |x| render0(x) }.join(@meta.bar) # e.g. "a|b|c"
  end

  # Joins string and the given already-rendered regexes with the bar.
  def join_regexes(string, regexes)
    ([string] + regexes).join(@meta.bar)
  end

  private

  # An alternation with one branch renders as that branch alone; otherwise
  # the branches are joined with the bar and wrapped in group delimiters.
  # Raises RuntimeError for an empty alternation.
  def render_alternation(regex)
    raise "empty alternation" if regex.empty?
    return render0(regex[0]) if regex.length == 1

    @meta.lparen + regex.map { |x| render0(x) }.join(@meta.bar) + @meta.rparen
  end

  def render_concatnation(regex)
    regex.map { |x| render0(x) }.join(@insertion)
  end

  # We don't use Regexp.quote because the following regex
  # is more general (not ruby-specific) and safe to use.
  # It puts a backslash before every ASCII control and punctuation character
  # except space and underscore.
  def escape_string(string)
    string.gsub(/([\x00-\x1f\x21-\x2f\x3a-\x40\x5b-\x5e\x60\x7b-\x7f])/, '\\\\\\1')
  end

  # Doubles every backslash in a rendered character class.
  def escape_charclass(string)
    string.gsub(/\\/) { '\\\\' }
  end

  # Renders a character class, positioning the members that are special
  # inside brackets so they are taken literally. The node is reordered in
  # place.
  def render_charclass(regex)
    # "^" must not come first, or the whole class would be negated.
    regex.push("^") if regex.delete("^")
    # "-" goes last so it cannot form a range.
    regex.push("-") if regex.delete("-")
    # "]" goes first so it does not close the class.
    regex.unshift("]") if regex.delete("]")
    # Only "^" and "-" present: "[-^]" keeps both literal.
    regex.reverse! if regex.first == "^" && regex.length > 1
    escape_charclass("[" + regex.join + "]")
  end

  # Places @insertion between the characters of string, treating a
  # backslash-escaped character as one unit.
  def insert(string)
    return string if @insertion.nil? || @insertion == ""

    # Block form: the insertion is appended verbatim. A replacement string
    # would reinterpret backslash sequences contained in the insertion.
    spaced = string.gsub(/(\\.|.)/) { "#{Regexp.last_match(1)}#{@insertion}" }
    # Drop the insertion that followed the final character.
    spaced.sub(/#{Regexp.quote(@insertion)}$/, "")
  end

  def render_string(regex)
    insert(escape_string(regex))
  end

  # Dispatches on the exact class of the node (subclasses are not accepted).
  # Raises RuntimeError for any other kind of node.
  def render0(x)
    if x.instance_of?(RegexAlternation)
      render_alternation(x)
    elsif x.instance_of?(RegexConcatnation)
      render_concatnation(x)
    elsif x.instance_of?(RegexCharClass)
      render_charclass(x)
    elsif x.instance_of?(String)
      render_string(x)
    else
      raise "unexpected type: #{x} of #{x.class}"
    end
  end
end
262
+
263
# Renderer for Perl output: behaves like RegexRenderer but uses
# RegexPerlMetachars for the bar and group delimiters.
class RegexPerlRenderer < RegexRenderer
  def initialize(regex, insertion)
    super
    @meta = RegexPerlMetachars.new
  end
end
269
+
270
# Renderer for Ruby output; inherits everything from RegexPerlRenderer.
class RegexRubyRenderer < RegexPerlRenderer; end
272
+
273
# Renderer for egrep output; inherits everything from RegexRenderer.
class RegexEgrepRenderer < RegexRenderer; end
275
+
276
# Renderer for Emacs output: uses RegexEmacsMetachars and Emacs-specific
# escaping rules.
class RegexEmacsRenderer < RegexRenderer
  def initialize(regex, insertion)
    super
    @meta = RegexEmacsMetachars.new
  end

  # Escapes string with Regexp.quote, then strips the backslash again from
  # in front of each of these characters: ( ) | < > = ' ` {
  def escape_string(string)
    quoted = Regexp.quote(string)
    ["(", ")", "|", "<", ">", "=", "'", "`", "{"].each do |ch|
      quoted.gsub!("\\" + ch, ch)
    end
    quoted
  end

  # Character classes are passed through without any escaping.
  def escape_charclass(string)
    string
  end
end
300
+
301
# Builds the metacharacter object for a regex flavor.
module RegexMetacharsFactory
  # type:: nil, "emacs", "perl", "ruby" or "egrep"; nil is treated as "ruby".
  #
  # Raises RuntimeError ("Unknown type: ...") for any other value.
  def new(type)
    case type
    when nil, "ruby" then RegexRubyMetachars.new
    when "emacs" then RegexEmacsMetachars.new
    when "perl" then RegexPerlMetachars.new
    when "egrep" then RegexEgrepMetachars.new
    else raise "Unknown type: #{type}"
    end
  end
  module_function :new
end
320
+
321
# Builds the renderer for a regex flavor.
module RegexRendererFactory
  # regex::     the regex tree handed to the renderer.
  # type::      nil, "emacs", "perl", "ruby" or "egrep"; nil is treated as
  #             "ruby".
  # insertion:: insertion string handed to the renderer.
  #
  # Raises RuntimeError ("Unknown type: ...") naming the rejected type for
  # any other value. (The message previously interpolated the regex tree
  # instead of the type.)
  def new(regex, type, insertion)
    case type
    when nil, "ruby"
      RegexRubyRenderer.new(regex, insertion)
    when "emacs"
      RegexEmacsRenderer.new(regex, insertion)
    when "perl"
      RegexPerlRenderer.new(regex, insertion)
    when "egrep"
      RegexEgrepRenderer.new(regex, insertion)
    else
      raise "Unknown type: #{type}"
    end
  end
  module_function :new
end
340
+ end