list_matcher 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +444 -0
- data/Rakefile +9 -0
- data/examples/date_grammar.rb +49 -0
- data/lib/list_matcher/version.rb +3 -0
- data/lib/list_matcher.rb +729 -0
- data/list_matcher.gemspec +23 -0
- data/test/basic_test.rb +248 -0
- data/test/benchmarks.rb +149 -0
- data/test/stress.rb +44 -0
- metadata +87 -0
data/lib/list_matcher.rb
ADDED
@@ -0,0 +1,729 @@
|
|
1
|
+
require "list_matcher/version"
|
2
|
+
|
3
|
+
module List
|
4
|
+
class Matcher
|
5
|
+
attr_reader :atomic, :backtracking, :bound, :case_insensitive, :strip, :left_bound, :right_bound, :word_test, :normalize_whitespace, :multiline, :name, :vet
|
6
|
+
|
7
|
+
# convenience method for one-off regexen where there's no point in keeping
|
8
|
+
# around a pattern generator
|
9
|
+
# one-shot helper: builds a matcher configured by opts and immediately asks
# it for the pattern string describing list
def self.pattern(list, opts={})
  matcher = new(**opts)
  matcher.pattern(list)
end
|
12
|
+
|
13
|
+
# like self.pattern, but returns a regex rather than a string
|
14
|
+
# one-shot helper: builds a matcher configured by opts and immediately asks
# it for the regex (rather than the pattern string) describing list
def self.rx(list, opts={})
  matcher = new(**opts)
  matcher.rx(list)
end
|
17
|
+
|
18
|
+
# to make a replacement of Regexp.quote that ignores characters that only need quoting inside character classes
|
19
|
+
# matches (and captures) any single character that Regexp.quote would escape,
# except the hyphen, which only needs escaping inside a character class
QRX = begin
  quotable = (1..255).map(&:chr).reject{ |c| Regexp.quote(c) == c } - %w(-)
  Regexp.new "([#{ quotable.map{ |c| Regexp.quote c }.join }])"
end
|
20
|
+
|
21
|
+
# Configures a pattern generator.
#
# atomic:               kept for pattern generation (see the atomic reader)
# backtracking:         kept for pattern generation (see the backtracking reader)
# bound:                false for no boundaries; true or :word for \b on both sides;
#                       :string for \A and \z; :line for ^ and $; or a hash
#                       supplying :test, :left and :right
# strip:                kept as given, but forced on by normalize_whitespace
# case_insensitive:     kept for pattern generation
# multiline:            kept for pattern generation
# normalize_whitespace: also registers ' ' as a symbol whose pattern is '\s++'
# symbols:              hash of special symbols; it is deep-copied, and its keys
#                       must be strings, symbols, or regexen
# name:                 string or symbol usable as a named capture group name
# vet:                  when true, the symbol patterns are verified immediately
#
# Raises a RuntimeError for an unusable name, an unfamiliar :bound value, or a
# bad symbols key, and a SyntaxError when a :bound hash lacks :test, :left, or
# :right.
def initialize(
    atomic:               true,
    backtracking:         true,
    bound:                false,
    strip:                false,
    case_insensitive:     false,
    multiline:            false,
    normalize_whitespace: false,
    symbols:              {},
    name:                 false,
    vet:                  false
  )
  @atomic               = atomic
  @backtracking         = backtracking
  @strip                = strip || normalize_whitespace
  @case_insensitive     = case_insensitive
  @multiline            = multiline
  @symbols              = deep_dup symbols
  @_bound               = bound   # the raw option, kept so bud can pass it along unchanged
  @bound                = !!bound
  @normalize_whitespace = normalize_whitespace
  @vet                  = vet
  if name
    unless name.is_a?(String) || name.is_a?(Symbol)
      raise "the :name option must be a string or a symbol, not #{name.inspect}"
    end
    Regexp.new "(?<#{name}>.*)" # stir up any errors that might arise from using this name in a named capture
    @name = name
  end
  if bound == :string
    @word_test   = /./
    @left_bound  = '\A'
    @right_bound = '\z'
  elsif bound == :line
    @word_test   = /./
    @left_bound  = '^'
    @right_bound = '$'
  elsif bound.is_a? Hash
    @word_test   = bound[:test]  || raise(SyntaxError.new('no boundary test provided'))
    @left_bound  = bound[:left]  || raise(SyntaxError.new('no left boundary expression provided'))
    @right_bound = bound[:right] || raise(SyntaxError.new('no right boundary expression provided'))
  elsif bound === true || bound == :word
    @word_test   = /\w/
    @left_bound  = '\b'
    @right_bound = '\b'
  elsif !( bound === false )
    raise "unfamiliar value for :bound option: #{bound.inspect}"
  end
  if normalize_whitespace
    @symbols[' '] = { pattern: '\s++' }
  end
  symbols.keys.each do |k|
    raise "symbols variable #{k} is neither a string, a symbol, nor a regex" unless k.is_a?(String) || k.is_a?(Symbol) || k.is_a?(Regexp)
  end
  if vet
    Special.new( self, @symbols, [] ).verify
  end
end
|
78
|
+
|
79
|
+
# returns a new pattern matcher differing from the original only in the options specified
|
80
|
+
# clones this matcher, overriding only the options named in opts;
# vetting is repeated only when this matcher vets and new symbols are supplied
def bud(opts={})
  inherited = {
    atomic:               @atomic,
    backtracking:         @backtracking,
    bound:                @_bound,
    strip:                @strip,
    case_insensitive:     @case_insensitive,
    multiline:            @multiline,
    normalize_whitespace: @normalize_whitespace,
    symbols:              @symbols,
    name:                 @name,
    vet:                  @vet && opts[:symbols]
  }
  self.class.new(**inherited.merge(opts))
end
|
95
|
+
|
96
|
+
# converts list into a string representing a regex pattern suitable for inclusion in a larger regex
|
97
|
+
# Returns the pattern string matching the items of list, or nil when no
# non-empty items remain after cleaning. Any opts are applied by generating
# the pattern with a budded matcher instead of this one.
def pattern( list, opts={} )
  return bud(opts).pattern(list) unless opts.empty?

  # clean the list: drop nils and empty strings, then trim and squeeze as configured
  words = list.compact.map(&:to_s).reject{ |s| s.length == 0 }
  words = words.map(&:strip).reject{ |s| s.length == 0 } if strip
  words = words.map{ |s| s.gsub(/\s++/, ' ') } if normalize_whitespace
  return nil if words.empty?

  specializer = Special.new self, @symbols, words
  root = tree specializer.normalize, specializer
  root.root = true
  root.flatten
  rx = root.convert

  # modifier and named-capture groups each enclose the whole expression
  grouped = false
  if ( m = modifiers )
    rx = "(?#{m}:#{rx})"
    grouped = true
  end
  if name
    rx = "(?<#{name}>#{rx})"
    grouped = true
  end
  return rx if grouped && backtracking

  atomic && !root.atomic? ? wrap(rx) : rx
end
|
125
|
+
|
126
|
+
# Returns the inline regex option letters implied by the case_insensitive and
# multiline settings -- 'i', 'm', or 'im' -- or nil when neither is set.
#
# The answer is memoized inside a one-element array so that a nil result is
# remembered too. (Indexing the joined string directly returned only its first
# character, which dropped the 'm' whenever both options were on.)
def modifiers
  @modifiers ||= begin
    flags = [ ( 'i' if case_insensitive ), ( 'm' if multiline ) ].compact.join
    [ flags.empty? ? nil : flags ]
  end
  @modifiers[0]
end
|
133
|
+
|
134
|
+
# like pattern but it returns a regex instead of a string
|
135
|
+
# compiles the string produced by pattern for the same arguments into a Regexp
def rx(list, opts={})
  source = pattern(list, opts)
  Regexp.new source
end
|
138
|
+
|
139
|
+
# opening of the group used to wrap sub-expressions: non-capturing when
# backtracking is allowed, atomic otherwise (memoized)
def pfx
  @pfx ||= if backtracking
    '(?:'
  else
    '(?>'
  end
end
|
142
|
+
|
143
|
+
# quantifier marking something optional: plain '?' when backtracking is
# allowed, possessive '?+' otherwise (memoized)
def qmark
  @qmark ||= if backtracking
    '?'
  else
    '?+'
  end
end
|
146
|
+
|
147
|
+
# encloses s in the group opened by pfx
def wrap(s)
  opener = pfx
  opener + s + ')'
end
|
150
|
+
|
151
|
+
# number of characters wrap adds to an expression: the group opener plus the
# closing parenthesis (memoized)
def wrap_size
  @wrap_size ||= 1 + pfx.length
end
|
154
|
+
|
155
|
+
# Recursively converts list into a tree of nodes. symbols is the object
# consulted (via symbols.symbols) to recognize placeholder characters.
#
# Cases are tried in order: a single item; items that are all one character
# long; a list that best_prefix can split; a list that best_suffix can split;
# and finally plain alternation.
def tree(list, symbols)
  # a single item: one node per character
  if list.size == 1
    nodes = list[0].chars.map{ |ch| symbols.symbols(ch) || Leaf.new( self, ch ) }
    return nodes.length == 1 ? nodes.first : Sequence.new( self, *nodes )
  end

  # only single characters: ordinary ones share a character class, the rest alternate
  if list.all?{ |w| w.length == 1 }
    plain = list.reject{ |w| symbols.symbols(w) }
    char_class = nil
    if plain.size > 1
      list -= plain
      char_class = CharClass.new self, plain
    end
    alternate = list.empty? ? nil : Alternate.new( self, symbols, list )
    alternate.children.unshift char_class if alternate && char_class
    return alternate || char_class
  end

  # found a fixed-width prefix pattern
  if ( split = best_prefix(list) )
    heads, tails = split
    optional = tails.include?('')
    tails.reject!{ |w| w == '' } if optional
    head = tree heads, symbols
    tail = tree tails, symbols
    tail.optional = optional
    return Sequence.new( self, head, tail )
  end

  # found a fixed-width suffix pattern
  if ( split = best_suffix(list) )
    heads, tails = split
    optional = heads.include?('')
    heads.reject!{ |w| w == '' } if optional
    head = tree heads, symbols
    head.optional = optional
    tail = tree tails, symbols
    return Sequence.new( self, head, tail )
  end

  # nothing to factor out: one-character items that are alone under their
  # initial character and are not symbols share a character class
  singles = list.group_by{ |w| w[0] }.select do |_, ws|
    ws.size == 1 && ws[0].size == 1 && !symbols.symbols(ws[0])
  end.keys
  char_class = nil
  if singles.size > 1
    list -= singles
    char_class = CharClass.new self, singles
  end
  alternate = Alternate.new self, symbols, list
  alternate.children.unshift char_class if char_class
  alternate
end
|
202
|
+
|
203
|
+
# like Regexp.quote, but escapes only the characters matched by QRX
def self.quote(s)
  s.gsub(QRX) { |special| Regexp.quote(special) }
end
|
206
|
+
|
207
|
+
# instance-level shortcut to the class's quote method
def quote(s)
  self.class.quote(s)
end
|
210
|
+
|
211
|
+
protected
|
212
|
+
|
213
|
+
# recursively copies hashes (keys and values) and arrays; nil and symbols are
# returned as they are, and anything else is copied with dup
def deep_dup(o)
  case o
  when Hash
    Hash[ o.map{ |key, value| [ deep_dup(key), deep_dup(value) ] } ]
  when Array
    o.map{ |item| deep_dup item }
  when nil, Symbol
    o
  else
    o.dup
  end
end
|
224
|
+
|
225
|
+
# Looks for a prefix width at which the list is a single cross product of
# prefixes and suffixes, e.g. {this, that} X {cat, dog}. Among such widths the
# one whose factors have the smallest total length wins, provided that total is
# smaller than the total length of the list itself.
#
# Returns [ prefixes, suffixes ], or nil when no width qualifies.
def best_prefix(list)
  lengths = list.map(&:size)
  best_cost = lengths.inject(0){ |sum, n| sum + n }
  # if every item has the same length, leave at least one character for the suffix
  limit = lengths.uniq.count == 1 ? list[0].size - 1 : lengths.min
  best = nil
  (1..limit).each do |width|
    suffixes_for = {}
    list.each do |w|
      ( suffixes_for[ w[0...width] ] ||= [] ) << w[width..-1]
    end
    products = cross_products suffixes_for
    next unless products.size == 1
    cost = count products
    next unless cost < best_cost
    best_cost = cost
    best = products[0]
  end
  best
end
|
249
|
+
|
250
|
+
# Mirror image of best_prefix: looks for a suffix width at which the list is a
# single cross product of prefixes and suffixes, preferring the width whose
# factors have the smallest total length, provided that total is smaller than
# the total length of the list itself.
#
# Returns [ prefixes, suffixes ], or nil when no width qualifies.
def best_suffix(list)
  lengths = list.map(&:size)
  best_cost = lengths.inject(0){ |sum, n| sum + n }
  # if every item has the same length, leave at least one character for the prefix
  limit = lengths.uniq.count == 1 ? list[0].size - 1 : lengths.min
  best = nil
  (1..limit).each do |width|
    prefixes_for = {}
    list.each do |w|
      cut = w.length - width
      ( prefixes_for[ w[cut..-1] ] ||= [] ) << w[0...cut]
    end
    products = cross_products prefixes_for
    next unless products.size == 1
    cost = count products
    next unless cost < best_cost
    best_cost = cost
    # cross_products lists the grouping keys (here the suffixes) first
    best = products[0].reverse
  end
  best
end
|
275
|
+
|
276
|
+
# discover cross products -- e.g., {this, that} X {cat, dog}
|
277
|
+
# c maps each key to the list of strings found with it; keys sharing the same
# (sorted) list are merged, giving one [ sorted keys, sorted list ] pair per
# distinct list
def cross_products(c)
  by_partners = c.to_a.group_by{ |_, partners| partners.sort }
  by_partners.map do |partners, pairs|
    [ pairs.map(&:first).sort, partners ]
  end
end
|
280
|
+
|
281
|
+
# total number of characters in both halves of the first cross product in c
def count(c)
  firsts, seconds = c[0]
  firsts.map(&:size).inject(0){ |sum, n| sum + n } + seconds.map(&:size).inject(0){ |sum, n| sum + n }
end
|
285
|
+
|
286
|
+
# Assigns a placeholder character to every symbol (and, for bound engines, to
# the left and right boundary markers) and rewrites list items in terms of
# those placeholders so that the tree builder can treat each one as a single
# character.
class Special
  attr_reader :engine
  attr_accessor :specials, :list, :left, :right

  # a regex that can never match; used to split items when there are no specials
  NULL = Regexp.new '(?!)'

  # engine   - supplies bound, left_bound, right_bound, case_insensitive and word_test
  # specials - hash mapping each symbol (string, symbol, or regex) to a pattern
  #            string, an options hash, or nil (nil is only accepted for a regex)
  # list     - the strings a pattern is being generated for
  #
  # The specials hash is only read, never modified, so the same hash can be
  # handed to any number of Special instances.
  def initialize( engine, specials, list )
    @engine = engine
    @list = list
    # placeholders are taken from just above the highest character code used in the list
    max = 0
    list.each do |w|
      w.chars.each{ |c| i = c.ord; max = i if i > max }
    end
    @specials = [].tap do |ar|
      # strings and symbols come first, in reverse lexical order; regexen
      # follow, ordered by the length and then the text of their to_s form
      specials.sort do |a, b|
        a = a.first
        b = b.first
        s1 = a.is_a?(String) || a.is_a?(Symbol)
        s2 = b.is_a?(String) || b.is_a?(Symbol)
        if s1 && s2
          b.to_s <=> a.to_s
        elsif s1
          -1
        elsif s2
          1
        else
          s = a.to_s.length - b.to_s.length
          s == 0 ? a.to_s <=> b.to_s : s
        end
      end.each do |var, opts|
        c = ( max += 1 ).chr
        sp = if opts.is_a? Hash
          # read rather than delete :pattern -- the hash belongs to the engine
          # and must still carry its pattern the next time it is used
          pat = opts[:pattern]
          raise "variable #{var} requires a pattern" unless pat || var.is_a?(Regexp)
          pat ||= var.to_s
          rest = opts.reject{ |k, _| k == :pattern }
          SpecialPattern.new engine, c, var, pat, **rest
        elsif opts.is_a? String
          SpecialPattern.new engine, c, var, opts
        elsif var.is_a?(Regexp) && opts.nil?
          SpecialPattern.new engine, c, var, nil
        else
          raise "variable #{var} requires a pattern"
        end
        ar << sp
      end
    end
    if engine.bound
      c = ( max += 1 ).chr
      @left = SpecialPattern.new engine, c, c, engine.left_bound
      @specials << @left
      c = ( max += 1 ).chr
      @right = SpecialPattern.new engine, c, c, engine.right_bound
      @specials << @right
    end
  end

  # confirm that all special patterns are legitimate regexen
  def verify
    specials.each do |s|
      begin
        Regexp.new s.pat
      rescue
        raise SyntaxError.new "the symbol #{s.var} has an ill-formed pattern: #{s.pat}"
      end
    end
  end

  # placeholder character => special, filled in by normalize
  def special_map
    @special_map ||= {}
  end

  # the special standing behind placeholder character s, if normalize used it
  def symbols(s)
    special_map[s]
  end

  # reduce the list to a version ready for pattern generation: symbols are
  # replaced by their placeholder characters, literal text is downcased for
  # case-insensitive engines, and boundary placeholders are added at the edges
  # of items where the engine is bound; returns the unique items, sorted
  def normalize
    rx = if specials.empty?
      NULL
    else
      Regexp.new '(' + specials.map(&:var).map(&:to_s).join('|') + ')'
    end
    l = r = false
    list = self.list.uniq.map do |w|
      parts = w.split rx
      # String#split reports an empty leading field when the item begins with
      # a symbol; drop it so that the symbol really is the first part
      parts.shift if parts.size > 1 && parts.first.empty?
      e = parts.size - 1
      (0..e).map do |i|
        p = parts[i]
        if rx === p
          sp = specials.detect{ |s| s.var === p }
          special_map[sp.char] = sp
          # from here on p is plain text; the flags are read from sp
          p = sp.to_s
          if engine.bound
            if i == 0 && sp.left
              p = "#{left}#{p}"
              l = true
            end
            if i == e && sp.right
              p = "#{p}#{right}"
              r = true
            end
          end
        else
          p = p.downcase if engine.case_insensitive
          if engine.bound
            if i == 0 && engine.word_test === p[0]
              p = "#{left}#{p}"
              l = true
            end
            if i == e && engine.word_test === p[-1]
              p = "#{p}#{right}"
              r = true
            end
          end
        end
        p
      end.join
    end.uniq.sort
    special_map[left.char] = left if l
    special_map[right.char] = right if r
    list
  end
end
|
408
|
+
|
409
|
+
# Common behavior of the nodes making up a pattern tree. Subclasses provide
# convert, which returns the regex source for the node.
class Node
  attr_accessor :engine, :optional, :symbols, :root

  def initialize(engine, symbols)
    @engine, @symbols = engine, symbols
    @children = []
  end

  # flattens every child in turn
  def flatten
    children.each(&:flatten)
  end

  def root?
    root
  end

  def optional?
    optional
  end

  def children
    @children ||= []
  end

  # subclasses must say how they turn into regex source
  def convert
    raise NotImplementedError
  end

  # nodes are not atomic unless a subclass says otherwise
  def atomic?
    false
  end

  # appends the optional quantifier to rx when this node is optional, wrapping
  # rx in a group first unless the node is atomic
  def finalize(rx)
    return rx unless optional?
    rx = wrap rx unless atomic?
    rx + qmark
  end

  # the remaining methods hand the question on to the engine

  def bound
    engine.bound
  end

  def pfx
    engine.pfx
  end

  def qmark
    engine.qmark
  end

  def wrap(s)
    engine.wrap s
  end

  def quote(s)
    engine.quote s
  end

end
|
471
|
+
|
472
|
+
# Node standing for a symbol: char is the placeholder character, var the regex
# that recognizes the symbol in list items, and pat the regex source emitted in
# its place.
class SpecialPattern < Node
  attr_accessor :char, :var, :left, :right, :pat

  # A string or symbol var is turned into a regex matching it literally. When
  # no pat is given, the text of var is used. By default the node is atomic
  # only when var is a regex and no pat is given.
  def initialize(engine, char, var, pat, atomic: (var.is_a?(Regexp) && pat.nil?), word_left: false, word_right: false)
    super(engine, nil)
    @char = char
    @var = if var.is_a?(String) || var.is_a?(Symbol)
      Regexp.new(Regexp.quote(var.to_s))
    else
      var
    end
    @pat    = pat || var.to_s
    @atomic = atomic ? true : false
    @left   = word_left ? true : false
    @right  = word_right ? true : false
  end

  def left?
    @left
  end

  def right?
    @right
  end

  def atomic?
    @atomic
  end

  # a special is written into list items as its placeholder character
  def to_s
    char
  end

  def convert
    finalize @pat
  end
end
|
505
|
+
|
506
|
+
# Node whose children must match one after another.
class Sequence < Node

  def initialize(engine, *constituents)
    super(engine, nil)
    @children = constituents
  end

  def convert
    finalize condense( children.map(&:convert) )
  end

  # after flattening the children, splices the children of every nested,
  # non-optional sequence directly into this one
  def flatten
    super
    children.each_index.to_a.reverse.each do |i|
      child = children[i]
      next unless child.is_a?(Sequence) && !child.optional?
      children[i, 1] = child.children
    end
  end

  # looks for repeating subsequences, as in ababababab, and condenses them to (?>ab){5}
  # condensation is only done when it results in a more compact regex
  #
  # returns a condensed copy of elements, or elements itself if nothing was condensed
  def condense_repeats(elements)
    (1..(elements.size/2)).each do |width|          # length of subsequence considered
      (0...width).each do |offset|                  # offset from the start of the sequence
        # runs[n] is [ repeat count, joined text, start index, end index ] for
        # the run of equal chunks ending with chunk number n
        runs = []
        (1...(elements.size - offset)/width).each do |n|
          second = n * width + offset
          first  = second - width
          chunk  = elements[first...first + width]
          next unless chunk == elements[second...second + width]
          # extend the run that ended at the previous chunk, or start a new one
          run = runs[n] = runs[n - 1] || [ 1, chunk.join, first, nil ]
          run[0] += 1
          run[3] = second + width
          runs[n - 1] = nil
        end
        runs.compact!
        next unless runs.any?
        copy = elements.dup
        changed = false
        # work from the back so that earlier indices stay valid
        runs.reverse.each do |repeats, seq, start, finish|
          simple = atomy? seq
          length = seq.length
          if ( simple ? 0 : engine.wrap_size ) + 2 + repeats.to_s.length + length < length * repeats
            changed = true
            copy[start...finish] = ( simple ? seq : wrap(seq) ) + "{#{repeats}}"
          end
        end
        return copy if changed
      end
    end
    elements
  end

  # infer atomic patterns
  def atomy?(s)
    return true if s.size == 1
    /\A(?>\\\w|\[(?>[^\[\]\\]|\\.)++\])\z/ === s
  end

  # iterated repeat condensation: condenses until nothing changes, then joins
  def condense(elements)
    current = elements
    until current.size <= 1
      reduced = condense_repeats current
      break if reduced == current
      current = reduced
    end
    current.join
  end
end
|
582
|
+
|
583
|
+
# Node matching any one of a set of single characters.
class CharClass < Node

  attr_accessor :word, :num, :space

  # single-byte characters matched by \w, by \w after downcasing, by \d, and by \s
  WORD_CHARS    = (1..255).map(&:chr).grep(/\w/).freeze
  CI_WORD_CHARS = WORD_CHARS.map(&:downcase).uniq.freeze
  NUM_CHARS     = CI_WORD_CHARS.grep(/\d/).freeze
  SPACE_CHARS   = (1..255).map(&:chr).grep(/\s/).freeze

  # children is the list of characters to match; wherever it covers all of the
  # word, digit, or space characters, that group is removed from the list and
  # the corresponding flag is set so that \w, \d, or \s can be written instead
  def initialize(engine, children)
    super(engine, nil)
    word_chars = engine.case_insensitive ? CI_WORD_CHARS : WORD_CHARS
    if ( word_chars - children ).empty?
      self.word = true
      self.num  = false  # \w already covers the digits
      children -= word_chars
    end
    if num.nil? && ( NUM_CHARS - children ).empty?
      self.num = true
      children -= NUM_CHARS
    end
    if ( SPACE_CHARS - children ).empty?
      self.space = true
      children -= SPACE_CHARS
    end
    @children = children
  end

  def atomic?
    true
  end

  def flatten; end

  def convert
    rx = char_class children
    optional? ? rx + qmark : rx
  end

  # takes a list of characters and returns a character class expression matching it
  def char_class(chars)
    mid = ''
    unless chars.empty?
      rs = ranges(chars)
      mid = if rs.size == 1 && rs[0][0] == rs[0][1]
        cc_quote rs[0][0].chr
      else
        rs.map do |s, e|
          case e - s
          when 0 then cc_quote s.chr
          when 1 then "#{ cc_quote s.chr }#{ cc_quote e.chr }"
          else "#{ cc_quote s.chr }-#{ cc_quote e.chr }"
          end
        end.join
      end
    end
    mid += '\w' if word
    mid += '\d' if num
    mid += '\s' if space
    # a lone character or a lone \x escape needs no brackets
    return mid if mid.length == 1 || mid =~ /\A\\\w\z/
    "[#{mid}]"
  end

  # escapes c for use inside a character class
  def cc_quote(c)
    return Regexp.quote(c) if c =~ /\s/
    [ '[', ']', '\\', '-', '^' ].include?(c) ? "\\#{c}" : c
  end

  # collapses chars into [ first, last ] pairs of consecutive character codes
  def ranges(chars)
    codes = chars.map(&:ord).sort
    first = codes.shift
    codes.each_with_object([ [ first, first ] ]) do |code, spans|
      if code == spans.last[1] + 1
        spans.last[1] = code
      else
        spans << [ code, code ]
      end
    end
  end
end
|
689
|
+
|
690
|
+
# Node matching any one of several alternatives.
class Alternate < Node

  # list items are grouped by their first character and each group becomes
  # one alternative, built by the engine's tree method
  def initialize(engine, symbols, list)
    super(engine, nil)
    groups = list.group_by{ |s| s[0] }
    @children = groups.values.map{ |group| engine.tree( group, symbols ) }
  end

  # joins the alternatives with '|'; everywhere but at the root the result is
  # wrapped in a group
  def convert
    alternatives = children.map(&:convert)
    rx = alternatives.join('|')
    rx = wrap(rx) unless root?
    finalize rx
  end

  # atomic everywhere but at the root, where convert leaves it unwrapped
  def atomic?
    !root?
  end

end
|
708
|
+
|
709
|
+
# Node for a single literal character.
class Leaf < Node

  attr_reader :c

  def initialize(engine, c)
    super(engine, nil)
    @c = c
  end

  def atomic?
    true
  end

  # the character, escaped as needed, plus the optional quantifier if any
  def convert
    finalize quote(c)
  end
end
|
727
|
+
|
728
|
+
end
|
729
|
+
end
|