migemo-lib 0.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Binary file
Binary file
@@ -0,0 +1,66 @@
1
+ =begin
2
+ = Ruby/Migemo: ローマ字のまま日本語をインクリメンタル検索する Ruby用のライブラリ
3
+ Ruby/Migemo はローマ字のまま日本語をインクリメンタル検索する
4
+ ためのライブラリです。
5
+
6
+ === 使用例
7
+
8
+ % cat sample.rb
9
+ require 'migemo'
10
+
11
+ dict = MigemoStaticDict.new("migemo-dict")
12
+ dict_cache = MigemoDictCache.new("migemo-dict" + ".cache")
13
+ user_dict = MigemoUserDict.new("user-dict")
14
+
15
+ while line = gets
16
+ pattern = line.chomp
17
+ migemo = Migemo.new(pattern,dict)
18
+ migemo.optimization = 3
19
+ migemo.dict_cache = dict_cache
20
+ migemo.user_dict = user_dict
21
+ migemo.type = "ruby"
22
+ puts migemo.regex
23
+ end
24
+
25
+ == API
26
+
27
+ --- MigemoStaticDict#new(filename)
28
+ 静的な辞書のオブジェクトを生成する
29
+
30
+ --- MigemoDictCache#new(filename)
31
+ 静的な辞書のキャッシュのオブジェクトを生成する
32
+
33
+ --- MigemoUserDict#new(filename)
34
+ ユーザ辞書のオブジェクトを生成する
35
+
36
+ --- MigemoRegexDict#new(filename)
37
+ 正規表現辞書のオブジェクトを生成する
38
+
39
+ --- Migemo#new(pattern, dict)
40
+ Migemoオブジェクトを生成する。dict には
41
+ MigemoStaticDict オブジェクトかStringを、pattern には検索パター
42
+ ンを与える
43
+
44
+ --- Migemo#regex
45
+ 検索パターンを展開した正規表現の文字列を返す。
46
+
47
+ --- Migemo#type
48
+ 正規表現の種類 (emacs, ruby, perl) を設定する accessor。[ruby]
49
+
50
+ --- Migemo#dict_cache
51
+ 静的辞書のキャッシュを設定する accessor。
52
+
53
+ --- Migemo#user_dict
54
+ ユーザ辞書のオブジェクトを設定する accessor。
55
+
56
+ --- Migemo#regex_dict
57
+ 正規表現辞書のオブジェクトを設定する accessor。
58
+
59
+ --- Migemo#insertion
60
+ 1文字ごとに挟む文字列を設定する accessor。
61
+
62
+ --- Migemo#optimization
63
+ 正規表現のコンパクト化のレベル (0-3) を設定する accessor。[3]
64
+
65
+ satoru@namazu.org
66
+ =end
@@ -0,0 +1,126 @@
1
+ #
2
+ # Ruby/Migemo - a library for Japanese incremental search.
3
+ #
4
+ # Copyright (C) 2001 Satoru Takabayashi <satoru@namazu.org>
5
+ # All rights reserved.
6
+ # This is free software with ABSOLUTELY NO WARRANTY.
7
+ #
8
+ # You can redistribute it and/or modify it under the terms of
9
+ # the GNU General Public License version 2.
10
+
11
+ require 'bsearch'
12
+ require 'migemo/core_ext/string'
13
+
14
# A single dictionary entry: a lookup key together with its values.
#
# Both attributes are read-only and are guaranteed non-nil once the
# object has been constructed.
class MigemoDictItem
  attr_reader :key, :values

  # key    - the entry's lookup key.
  # values - the values stored under +key+.
  #
  # Raises RuntimeError (with a descriptive message) when either
  # argument is nil.
  def initialize(key, values)
    raise "MigemoDictItem: key must not be nil" if key.nil?
    raise "MigemoDictItem: values must not be nil" if values.nil?

    @key = key
    @values = values
  end
end
25
+
26
# Base class for file-backed dictionaries.
#
# It opens the dictionary file and supplies the tab-separated line parser
# used by the subclasses. The file handle is kept open in @dict because
# subclasses read from it after construction.
class MigemoDict
  # filename - path of the dictionary file (opened for reading).
  def initialize(filename)
    @dict = File.new(filename)
  end

  # Validates +pattern+. The base class performs no search of its own;
  # subclasses override this method. Returns nil.
  #
  # Raises RuntimeError ("nil pattern") when +pattern+ is nil.
  def lookup(pattern)
    # The nil check must come before #downcase; otherwise a nil pattern
    # fails with NoMethodError and the intended error is never raised.
    raise "nil pattern" if pattern.nil?

    pattern = pattern.downcase
    nil
  end

  private

  # Splits one dictionary line into its key and values.
  #
  # line - "key<TAB>value<TAB>value..." with an optional trailing newline.
  #
  # Returns a two-element Array: [key, values]; values may be empty.
  # Raises RuntimeError when the line contains no key at all.
  def decompose(line)
    key, *values = line.chomp.split("\t")
    raise "malformed dictionary line: missing key" if key.nil?

    [key, values]
  end
end
46
+
47
# Dictionary backed by a text file plus a companion "<filename>.idx" file.
#
# The index file is read as a sequence of 32-bit big-endian unsigned
# integers ("N*"); each one is used as a byte offset to seek to in the
# dictionary file before reading one line.
class MigemoStaticDict < MigemoDict
  # filename - path of the dictionary; "<filename>.idx" must exist too.
  def initialize(filename)
    super(filename)
    # Block form closes the index handle once it has been read, and "rb"
    # keeps the packed integers free of newline/encoding translation.
    @index = File.open(filename + ".idx", "rb") { |f| f.read }.unpack("N*")
  end

  # Yields a MigemoDictItem for every entry whose key prefix-matches
  # +pattern+.
  #
  # Relies on Array#bsearch_range and String#prefix_match, which are not
  # defined in this file (presumably supplied by the 'bsearch' and
  # 'migemo/core_ext/string' requires -- confirm). The result of
  # bsearch_range is treated as nil or as a collection of positions
  # into @index.
  #
  # Returns nil when nothing matches.
  def lookup(pattern)
    range = @index.bsearch_range do |offset|
      key, = decompose(get_line(offset))
      key.prefix_match(pattern)
    end
    return unless range

    range.each do |i|
      key, values = decompose(get_line(@index[i]))
      yield MigemoDictItem.new(key, values)
    end
  end

  private

  # Returns the line that starts at byte offset +index+ of the dictionary.
  def get_line(index)
    @dict.seek(index)
    @dict.gets
  end
end
72
+
73
# Dictionary that is loaded into memory in full when constructed.
#
# Lines starting with ";" are treated as comments and dropped; the
# remaining lines are sorted so they can be binary-searched.
class MigemoUserDict < MigemoDict
  # filename - path of the user dictionary file.
  def initialize(filename)
    super(filename)
    begin
      raw_lines = @dict.readlines
    ensure
      # Everything needed is in memory now; do not keep the handle open.
      @dict.close
    end
    # Lines with no key (empty, or tabs only) are skipped as well:
    # decompose raises on them, which would break every lookup.
    @lines = raw_lines.reject do |line|
      line =~ /^;/ || line.chomp.split("\t").empty?
    end.sort
  end

  # Yields a MigemoDictItem for every entry whose key prefix-matches
  # +pattern+.
  #
  # Relies on Array#bsearch_range and String#prefix_match, which are not
  # defined in this file (presumably supplied by the 'bsearch' and
  # 'migemo/core_ext/string' requires -- confirm).
  #
  # Returns nil when nothing matches.
  def lookup(pattern)
    range = @lines.bsearch_range do |line|
      key, = decompose(line)
      key.prefix_match(pattern)
    end
    return unless range

    range.each do |i|
      key, values = decompose(@lines[i])
      yield MigemoDictItem.new(key, values)
    end
  end
end
92
+
93
# Dictionary of regular-expression entries. It adds nothing of its own:
# loading and lookup are inherited unchanged from MigemoUserDict.
class MigemoRegexDict < MigemoUserDict; end
95
+
96
# Read-only cache of precomputed lookup results.
#
# The cache consists of a data file and a companion "<filename>.idx" file.
# The index is read as 32-bit big-endian unsigned integers ("N*"), each
# used as a byte offset into the data file. A record at such an offset is
# laid out as:
#
#   4-byte length | key bytes | 4-byte length | Marshal-dumped data
class MigemoDictCache
  # filename - path of the cache data file; "<filename>.idx" must exist.
  def initialize(filename)
    # Records are binary, so both files are opened in binary mode.
    @dict = File.new(filename, "rb")
    # Block form closes the index handle as soon as it has been read.
    @index = File.open(filename + ".idx", "rb") { |f| f.read }.unpack("N*")
  end

  # Looks up the data cached for +pattern+ (compared after downcasing).
  #
  # Relies on Array#bsearch_first, which is not defined in this file
  # (presumably supplied by the 'bsearch' require -- confirm).
  #
  # Returns the unmarshalled data, or nil when the key is not cached.
  # Raises RuntimeError ("nil pattern") when +pattern+ is nil.
  def lookup(pattern)
    raise "nil pattern" if pattern.nil?

    pattern = pattern.downcase
    position = @index.bsearch_first do |offset|
      key, = decompose(offset)
      key <=> pattern
    end
    return nil unless position

    decompose(@index[position]).last
  end

  private

  # Reads the record stored at byte offset +idx+.
  #
  # Returns [key, data].
  def decompose(idx)
    @dict.seek(idx)
    key = read_field
    # NOTE(security): Marshal.load can instantiate arbitrary objects.
    # Only open cache files that come from a trusted source.
    data = Marshal.load(read_field)
    [key, data]
  end

  # Reads one length-prefixed field at the current file position.
  #
  # Raises RuntimeError when the file ends before the field is complete.
  def read_field
    header = @dict.read(4)
    raise "truncated cache record" if header.nil? || header.bytesize < 4

    length = header.unpack("N").first
    body = @dict.read(length)
    raise "truncated cache record" if body.nil? || body.bytesize < length

    body
  end
end
@@ -0,0 +1,340 @@
1
+ # -*- coding: utf-8 -*-
2
+ #
3
+ # Ruby/Migemo - a library for Japanese incremental search.
4
+ #
5
+ # Copyright (C) 2001 Satoru Takabayashi <satoru@namazu.org>
6
+ # All rights reserved.
7
+ # This is free software with ABSOLUTELY NO WARRANTY.
8
+ #
9
+ # You can redistribute it and/or modify it under the terms of
10
+ # the GNU General Public License version 2.
11
+
12
+ module MigemoRegex
13
# Array whose elements are the alternatives of a regex ("a|b|c").
#
# Each override below runs Array's own implementation and then returns a
# clone of the receiver holding that result, so the value handed back is
# a RegexAlternation too. Note that delete_if still removes the elements
# from the receiver itself (Array#delete_if is destructive); the object it
# returns is a separate copy.
class RegexAlternation < Array
  def sort
    ordered = super
    clone.replace(ordered)
  end

  def uniq
    distinct = super
    clone.replace(distinct)
  end

  def map
    mapped = super { |item| yield(item) }
    clone.replace(mapped)
  end

  def delete_if
    remaining = super { |item| yield(item) }
    clone.replace(remaining)
  end

  def select
    chosen = super { |item| yield(item) }
    clone.replace(chosen)
  end
end
34
+
35
# Array whose elements are rendered one after another (a sequence).
#
# map runs Array#map and returns a clone of the receiver holding the
# mapped elements, so the result is a RegexConcatnation as well.
class RegexConcatnation < Array
  def map
    mapped = super { |item| yield(item) }
    clone.replace(mapped)
  end
end
40
+
41
# Array of single characters that is rendered as one bracket expression
# ("[abc]"). It adds no behaviour of its own; the distinct class only
# lets the renderer tell it apart from other arrays.
class RegexCharClass < Array; end
43
+
44
# Collects candidate strings into a RegexAlternation and optionally
# compacts that alternation in up to three passes (see #optimize).
class RegexCompiler
  attr_reader :regex

  def initialize
    @regex = RegexAlternation.new
  end

  # Appends +item+ to the alternation; nil and "" are ignored.
  def push(item)
    @regex.push(item) if item && item != ""
  end

  # Removes duplicate alternatives. The deduplicated alternation is both
  # stored and returned; previously it was only returned, so a bare
  # `compiler.uniq` call had no effect on the compiler.
  def uniq
    @regex = @regex.uniq
  end

  # Applies the optimisation passes up to +level+:
  #   >= 1  drop words that merely extend another word (optimize1)
  #   >= 2  factor out shared leading characters       (optimize2)
  #   >= 3  merge single characters into a char class  (optimize3)
  def optimize(level)
    @regex = optimize1(@regex) if level >= 1
    @regex = optimize2(@regex) if level >= 2
    @regex = optimize3(@regex) if level >= 3
  end

  private

  # Returns a new, sorted collection of the alternatives in +regex+.
  #
  # When Encoding is available the sort key is the EUC-JP form of each
  # string, which keeps the established output order. Strings that cannot
  # be converted to EUC-JP used to abort the whole compilation; in that
  # case a plain sort is used instead. Both passes that consume this only
  # need words sharing a prefix to end up next to each other, which any
  # bytewise order provides.
  def sorted_candidates(regex)
    return regex.sort unless defined?(Encoding)

    begin
      regex.sort_by { |s| s.encode("EUC-JP") }
    rescue EncodingError
      regex.sort
    end
  end

  # Drops every word that starts with an earlier (shorter) kept word,
  # because the shorter alternative already matches it.
  #
  #   ["a", "ab", "abc", "b", "bc"] => ["a", "b"]
  #   (a|ab|abc|b|bc)               => (a|b)
  #
  # Always returns a RegexAlternation. (Enumerable#sort_by yields a plain
  # Array, which the renderer does not accept as the top-level regex.)
  def optimize1(regex)
    kept = RegexAlternation.new
    prefix = nil
    sorted_candidates(regex).each do |word|
      next if prefix && word.start_with?(prefix)

      prefix = word
      kept.push(word)
    end
    kept
  end

  # Factors out the leading character shared by neighbouring alternatives,
  # recursively.
  #
  #   (aaa|aab|aac) => (a(a(a|b|c)))
  #
  # Uses String#first and String#rest, which are not defined in this file
  # (presumably from 'migemo/core_ext/string' -- confirm).
  def optimize2(regex)
    queue = sorted_candidates(regex)
    optimized = RegexAlternation.new
    until queue.empty?
      head = queue.shift
      initial = head.first
      # Gather the remainders of the following words that share +initial+.
      friends = RegexAlternation.new
      while (item = queue.first) && initial == item.first
        friends.push(item.rest)
        queue.shift
      end

      if friends.empty?
        optimized.push(head)
      else
        concat = RegexConcatnation.new
        concat.push(initial)
        friends.unshift(head.rest)
        concat.push(optimize2(friends))
        optimized.push(concat)
      end
    end
    optimized
  end

  # Merges the single-character alternatives of every alternation into one
  # character class, recursively.
  #
  #   (a|b|c|d|e) => [abcde]
  #
  # Single characters are removed from +regex+ in place and the resulting
  # class (or the lone character) is put at the front.
  def optimize3(regex)
    charclass = RegexCharClass.new
    if regex.instance_of?(RegexAlternation)
      regex.delete_if do |x|
        if x.instance_of?(String) && x =~ /^.$/
          charclass.push(x)
          true
        end
      end
    end

    if charclass.length == 1
      regex.unshift(charclass.first)
    elsif charclass.length > 1
      regex.unshift(charclass)
    end

    regex.map do |x|
      if x.instance_of?(RegexAlternation) || x.instance_of?(RegexConcatnation)
        optimize3(x)
      else
        x
      end
    end
  end
end
140
+
141
# The three metacharacter strings a renderer needs: the alternation bar
# and the opening/closing group delimiters. Defaults are "|", "(", ")".
class RegexMetachars
  attr_accessor :bar, :lparen, :rparen

  def initialize
    @bar, @lparen, @rparen = '|', '(', ')'
  end
end
151
+
152
# Metacharacters for egrep: the RegexMetachars defaults, unchanged.
class RegexEgrepMetachars < RegexMetachars; end
154
+
155
# Metacharacters for Perl: groups open with "(?:" instead of "(".
class RegexPerlMetachars < RegexMetachars
  def initialize
    @bar, @lparen, @rparen = '|', '(?:', ')'
  end
end
162
+
163
# Metacharacters for Ruby: the RegexMetachars defaults, unchanged.
class RegexRubyMetachars < RegexMetachars; end
165
+
166
# Metacharacters for Emacs: the bar and both group delimiters are each
# preceded by a backslash.
class RegexEmacsMetachars < RegexMetachars
  def initialize
    @bar, @lparen, @rparen = '\\|', '\\(', '\\)'
  end
end
173
+
174
# Turns a tree of RegexAlternation / RegexConcatnation / RegexCharClass /
# String nodes into a regex source string.
#
# Subclasses change the dialect by replacing @meta and by overriding
# escape_string / escape_charclass.
class RegexRenderer
  # When true, #render wraps the top-level alternatives in a group.
  attr_accessor :with_paren

  # regex     - the tree to render; must not be nil.
  # insertion - string placed between consecutive characters (and between
  #             the parts of a concatenation); "" or nil inserts nothing.
  #
  # Raises RuntimeError when +regex+ is nil.
  def initialize(regex, insertion)
    raise "regex must not be nil" if regex.nil?

    @regex = regex
    @meta = RegexMetachars.new
    @insertion = insertion
    @with_paren = false
  end

  # Returns the rendered regex: "(a|b|c)" with with_paren, else "a|b|c".
  def render
    if @with_paren
      render0(@regex)
    else
      @regex.map { |x| render0(x) }.join(@meta.bar)
    end
  end

  # Joins +string+ and the already rendered +regexes+ with the dialect's
  # alternation bar.
  def join_regexes(string, regexes)
    ([string] + regexes).join(@meta.bar)
  end

  private

  # A single alternative is rendered bare; several are grouped.
  def render_alternation(regex)
    raise "cannot render an empty alternation" if regex.length == 0
    return render0(regex[0]) if regex.length == 1

    @meta.lparen + regex.map { |x| render0(x) }.join(@meta.bar) + @meta.rparen
  end

  def render_concatnation(regex)
    regex.map { |x| render0(x) }.join(@insertion)
  end

  # We don't use Regexp.quote because the following regex
  # is more general (not ruby-specific) and safe to use.
  def escape_string(string)
    string.gsub(/([\x00-\x1f\x21-\x2f\x3a-\x40\x5b-\x5e\x60\x7b-\x7f])/, '\\\\\\1')
  end

  # Doubles every backslash inside a rendered bracket expression.
  def escape_charclass(string)
    string.gsub(/\\/, '\\\\\\')
  end

  # Renders a bracket expression, reordering +regex+ in place so that no
  # member is read as bracket syntax:
  #   "-" goes last, "]" goes first, and "^" is moved off the first
  #   position (a leading "^" would negate the whole class).
  # A class consisting of "^" alone cannot be fixed by reordering.
  def render_charclass(regex)
    regex.push("-") if regex.delete("-")
    regex.unshift("]") if regex.delete("]")
    regex.insert(1, regex.shift) if regex.length > 1 && regex.first == "^"
    escape_charclass("[" + regex.join + "]")
  end

  # Puts @insertion between the units of the already escaped +string+,
  # where a unit is either a backslash escape ("\x") or one character.
  def insert(string)
    return string if @insertion.nil? || @insertion == ""

    # Block form: the insertion is copied verbatim. As a replacement
    # *string*, its backslash sequences would be reinterpreted by gsub.
    spaced = string.gsub(/(\\.|.)/) { "#{$1}#{@insertion}" }
    spaced.sub(/#{Regexp.quote(@insertion)}\z/, "")
  end

  def render_string(regex)
    insert(escape_string(regex))
  end

  # Dispatches on the exact class of node +x+.
  #
  # Raises RuntimeError for any other kind of node.
  def render0(x)
    if x.instance_of?(RegexAlternation)
      render_alternation(x)
    elsif x.instance_of?(RegexConcatnation)
      render_concatnation(x)
    elsif x.instance_of?(RegexCharClass)
      render_charclass(x)
    elsif x.instance_of?(String)
      render_string(x)
    else
      raise "unexpected type: #{x} of #{x.class}"
    end
  end
end
262
+
263
# Renderer for Perl: same as RegexRenderer, but with RegexPerlMetachars
# as its metacharacters.
class RegexPerlRenderer < RegexRenderer
  def initialize(regex, insertion)
    super
    @meta = RegexPerlMetachars.new
  end
end
269
+
270
# Renderer for Ruby: identical to the Perl renderer.
class RegexRubyRenderer < RegexPerlRenderer; end
272
+
273
# Renderer for egrep: RegexRenderer with its default metacharacters.
class RegexEgrepRenderer < RegexRenderer; end
275
+
276
# Renderer for Emacs regexps, using RegexEmacsMetachars.
class RegexEmacsRenderer < RegexRenderer
  # Characters that must appear WITHOUT a backslash in an Emacs regexp to
  # be matched literally; their backslashed forms are Emacs operators.
  EMACS_BARE_CHARS = "()|<>='`{".freeze

  def initialize(regex, insertion)
    super(regex, insertion)
    @meta = RegexEmacsMetachars.new
  end

  # Quotes +string+ with Regexp.quote, then removes the backslash in front
  # of the characters listed in EMACS_BARE_CHARS.
  #
  # The quoted text is walked one escape unit at a time. A run of
  # independent substitutions could pair the second half of an escaped
  # backslash with a following bare character (quoted "\\<" would become
  # "\<", the Emacs word-boundary operator).
  def escape_string(string)
    Regexp.quote(string).gsub(/\\(.)/m) do
      EMACS_BARE_CHARS.include?($1) ? $1 : $&
    end
  end

  # Bracket expressions are emitted unchanged for Emacs.
  def escape_charclass(string)
    string
  end
end
300
+
301
# Builds the metacharacter object for a regex dialect.
module RegexMetacharsFactory
  # type - nil, "ruby", "emacs", "perl" or "egrep"; nil means "ruby".
  #
  # Raises RuntimeError for any other value.
  def new(type)
    case type
    when nil, "ruby" then RegexRubyMetachars.new
    when "emacs"     then RegexEmacsMetachars.new
    when "perl"      then RegexPerlMetachars.new
    when "egrep"     then RegexEgrepMetachars.new
    else
      raise "Unknown type: #{type}"
    end
  end
  module_function :new
end
320
+
321
# Builds the renderer for a regex dialect.
module RegexRendererFactory
  # regex     - the regex tree handed to the renderer.
  # type      - nil, "ruby", "emacs", "perl" or "egrep"; nil means "ruby".
  # insertion - insertion string handed to the renderer.
  #
  # Raises RuntimeError naming the offending +type+ for any other value.
  def new(regex, type, insertion)
    case type
    when nil, "ruby"
      RegexRubyRenderer.new(regex, insertion)
    when "emacs"
      RegexEmacsRenderer.new(regex, insertion)
    when "perl"
      RegexPerlRenderer.new(regex, insertion)
    when "egrep"
      RegexEgrepRenderer.new(regex, insertion)
    else
      # Report the unknown type, not the regex tree.
      raise "Unknown type: #{type}"
    end
  end
  module_function :new
end
340
+ end