list_matcher 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,729 @@
1
+ require "list_matcher/version"
2
+
3
module List
  # Builds a compact regular expression that matches any item of a list of
  # strings. Common prefixes and suffixes are factored out, single characters
  # are merged into character classes and repeated subsequences are condensed
  # into counted repetitions.
  #
  # A Matcher is a reusable pattern generator: configure it once, then call
  # #pattern or #rx with as many lists as needed.
  class Matcher
    attr_reader :atomic, :backtracking, :bound, :case_insensitive, :strip, :left_bound, :right_bound, :word_test, :normalize_whitespace, :multiline, :name, :vet

    # convenience method for one-off regexen where there's no point in keeping
    # around a pattern generator
    def self.pattern(list, opts={})
      new(**opts).pattern list
    end

    # like self.pattern, but returns a regex rather than a string
    def self.rx(list, opts={})
      new(**opts).rx list
    end

    # to make a replacement of Regexp.quote that ignores characters that only need quoting inside character classes
    QRX = Regexp.new "([" + ( (1..255).map(&:chr).select{ |c| Regexp.quote(c) != c } - %w(-) ).map{ |c| Regexp.quote c }.join + "])"

    # Options:
    # atomic::               wrap the generated pattern in a group when it is not already atomic
    # backtracking::         when false, use atomic groups and possessive quantifiers
    # bound::                false, true / :word, :line, :string, or a Hash with
    #                        :test, :left and :right entries describing a custom boundary
    # strip::                strip leading and trailing whitespace from list items
    # case_insensitive::     add the i modifier and downcase literal text
    # multiline::            add the m modifier
    # normalize_whitespace:: treat any run of whitespace as equivalent (implies strip)
    # symbols::              map of String/Symbol/Regexp keys to sub-patterns
    # name::                 String or Symbol used to wrap the pattern in a named capture
    # vet::                  check eagerly that every symbol pattern compiles
    #
    # Raises RuntimeError for a bad :name, :bound or symbols key and SyntaxError
    # for an incomplete boundary Hash or (with vet) an ill-formed symbol pattern.
    def initialize(
        atomic:               true,
        backtracking:         true,
        bound:                false,
        strip:                false,
        case_insensitive:     false,
        multiline:            false,
        normalize_whitespace: false,
        symbols:              {},
        name:                 false,
        vet:                  false
      )
      @atomic               = atomic
      @backtracking         = backtracking
      @strip                = strip || normalize_whitespace
      @case_insensitive     = case_insensitive
      @multiline            = multiline
      @symbols              = deep_dup symbols
      @_bound               = bound # raw option, kept so #bud can reproduce it
      @bound                = !!bound
      @normalize_whitespace = normalize_whitespace
      @vet                  = vet
      if name
        unless name.is_a?(String) || name.is_a?(Symbol)
          raise "the :name option must be a string or a symbol, not #{name.inspect}"
        end
        # stir up any errors that might arise from using this name in a named capture
        Regexp.new "(?<#{name}>.*)"
        @name = name
      end
      if bound == :string
        @word_test   = /./
        @left_bound  = '\A'
        @right_bound = '\z'
      elsif bound == :line
        @word_test   = /./
        @left_bound  = '^'
        @right_bound = '$'
      elsif bound.is_a? Hash
        @word_test   = bound[:test]  || raise(SyntaxError.new('no boundary test provided'))
        @left_bound  = bound[:left]  || raise(SyntaxError.new('no left boundary expression provided'))
        @right_bound = bound[:right] || raise(SyntaxError.new('no right boundary expression provided'))
      elsif bound === true || bound == :word
        @word_test   = /\w/
        @left_bound  = '\b'
        @right_bound = '\b'
      elsif !( bound === false )
        raise "unfamiliar value for :bound option: #{bound.inspect}"
      end
      if normalize_whitespace
        @symbols[' '] = { pattern: '\s++' }
      end
      symbols.keys.each do |k|
        raise "symbols variable #{k} is neither a string, a symbol, nor a regex" unless k.is_a?(String) || k.is_a?(Symbol) || k.is_a?(Regexp)
      end
      if vet
        Special.new( self, @symbols, [] ).verify
      end
    end

    # returns a new pattern matcher differing from the original only in the options specified
    def bud(opts={})
      opts = {
        atomic:               @atomic,
        backtracking:         @backtracking,
        bound:                @_bound,
        strip:                @strip,
        case_insensitive:     @case_insensitive,
        multiline:            @multiline,
        normalize_whitespace: @normalize_whitespace,
        symbols:              @symbols,
        name:                 @name,
        # the inherited symbols were already vetted; only vet again when new ones are supplied
        vet:                  !!( @vet && opts[:symbols] )
      }.merge opts
      self.class.new(**opts)
    end

    # converts list into a string representing a regex pattern suitable for inclusion in a larger regex
    #
    # nil items and empty strings are ignored. Returns nil when nothing is left
    # to match. Any opts given are applied to a budded copy of this matcher.
    def pattern( list, opts={} )
      return bud(opts).pattern(list) unless opts.empty?
      list = list.compact.map(&:to_s).reject(&:empty?)
      list = list.map(&:strip).reject(&:empty?) if strip
      list = list.map{ |s| s.gsub( /\s++/, ' ' ) } if normalize_whitespace
      return nil if list.empty?
      specializer = Special.new self, @symbols, list
      list = specializer.normalize

      root = tree list, specializer
      root.root = true
      root.flatten
      rx = root.convert
      grouped = false
      if m = modifiers
        rx = "(?#{m}:#{rx})"
        grouped = true
      end
      if name
        rx = "(?<#{name}>#{rx})"
        grouped = true
      end
      # a modifier or capture group already isolates the pattern when backtracking is allowed
      return rx if grouped && backtracking
      if atomic && !root.atomic?
        wrap rx
      else
        rx
      end
    end

    # The inline modifier flags ("i", "m" or "im") implied by the options, or
    # nil when no modifier is needed.
    def modifiers
      # memoized inside an array so that a nil result is cached as well
      @modifiers ||= begin
        flags = [ ( 'i' if case_insensitive ), ( 'm' if multiline ) ].compact.join
        [ flags.empty? ? nil : flags ]
      end
      @modifiers.first
    end

    # like pattern but it returns a regex instead of a string
    #
    # NOTE: #pattern returns nil for a list with no usable items, in which case
    # Regexp.new raises TypeError.
    def rx(list, opts={})
      Regexp.new pattern(list, opts)
    end

    # opening of the group used for wrapping: non-capturing, or atomic when backtracking is off
    def pfx
      @pfx ||= backtracking ? '(?:' : '(?>'
    end

    # the "optional" quantifier: greedy, or possessive when backtracking is off
    def qmark
      @qmark ||= backtracking ? '?' : '?+'
    end

    # wraps the expression s in the matcher's grouping construct
    def wrap(s)
      pfx + s + ')'
    end

    # the number of characters #wrap adds to an expression
    def wrap_size
      @wrap_size ||= pfx.length + 1
    end

    # Recursively converts a normalized list of words into a tree of nodes.
    # symbols is the Special instance that resolves placeholder characters.
    def tree(list, symbols)
      if list.size == 1
        leaves = list[0].chars.map do |c|
          symbols.symbols(c) || Leaf.new( self, c )
        end
        if leaves.length == 1
          leaves.first
        else
          Sequence.new self, *leaves
        end
      elsif list.all?{ |w| w.length == 1 }
        # plain characters go into a character class; placeholders become alternates
        chars = list.select{ |w| !symbols.symbols(w) }
        if chars.size > 1
          list -= chars
          c = CharClass.new self, chars
        end
        a = Alternate.new self, symbols, list unless list.empty?
        a.children.unshift c if a && c
        a || c
      elsif c = best_prefix(list)   # found a fixed-width prefix pattern
        if optional = c[1].include?('')
          c[1].reject!{ |w| w == '' }
        end
        c1 = tree c[0], symbols
        c2 = tree c[1], symbols
        c2.optional = optional
        Sequence.new self, c1, c2
      elsif c = best_suffix(list)   # found a fixed-width suffix pattern
        if optional = c[0].include?('')
          c[0].reject!{ |w| w == '' }
        end
        c1 = tree c[0], symbols
        c1.optional = optional
        c2 = tree c[1], symbols
        Sequence.new self, c1, c2
      else
        grouped = list.group_by{ |w| w[0] }
        # lone single-character words with a unique first character can share a character class
        chars = grouped.select{ |_, w| w.size == 1 && w[0].size == 1 && !symbols.symbols(w[0]) }.map{ |v, _| v }
        if chars.size > 1
          list -= chars
          c = CharClass.new self, chars
        end
        a = Alternate.new self, symbols, list
        a.children.unshift c if c
        a
      end
    end

    # quotes the characters of s that are special outside of character classes
    def self.quote(s)
      s.gsub(QRX) { |c| Regexp.quote c }
    end

    def quote(s)
      self.class.quote s
    end

    protected

    # copies nested hashes and arrays so that the caller's options are never mutated
    def deep_dup(o)
      if o.is_a?(Hash)
        Hash[o.map{ |k, v| [ deep_dup(k), deep_dup(v) ] }]
      elsif o.is_a?(Array)
        o.map{ |v| deep_dup v }
      elsif o.nil? || o.is_a?(Symbol)
        o
      else
        o.dup
      end
    end

    # Looks for a prefix length at which the list is a single cross product of
    # prefixes and suffixes. Returns [prefixes, suffixes] for the split with the
    # fewest characters, or nil when no split beats listing the words in full.
    def best_prefix(list)
      acceptable = nil
      sizes = list.map(&:size)
      min = sizes.reduce 0, :+
      sizes.uniq!
      lim = sizes.count == 1 ? list[0].size - 1 : sizes.min
      (1..lim).each do |l|
        c = {}
        list.each do |w|
          head = w[0...l]
          tail = w[l..-1]
          ( c[head] ||= [] ) << tail
        end
        c = cross_products c
        if c.size == 1
          total = count(c)
          if total < min
            min = total
            acceptable = c[0]
          end
        end
      end
      acceptable
    end

    # The mirror image of #best_prefix: splits off a fixed-width suffix.
    # Returns [prefixes, suffixes] or nil.
    def best_suffix(list)
      acceptable = nil
      sizes = list.map(&:size)
      min = sizes.reduce 0, :+
      sizes.uniq!
      lim = sizes.count == 1 ? list[0].size - 1 : sizes.min
      (1..lim).each do |l|
        c = {}
        list.each do |w|
          i = w.length - l
          head = w[0...i]
          tail = w[i..-1]
          ( c[tail] ||= [] ) << head
        end
        c = cross_products c
        if c.size == 1
          total = count(c)
          if total < min
            min = total
            acceptable = c[0].reverse
          end
        end
      end
      acceptable
    end

    # discover cross products -- e.g., {this, that} X {cat, dog}
    def cross_products(c)
      c.to_a.group_by{ |_, v| v.sort }.map{ |k,v| [ v.map{ |a| a[0] }.sort, k ] }
    end

    # total number of characters in the two halves of the first cross product
    def count(c)
      c = c[0]
      c[0].map(&:size).reduce( 0, :+ ) + c[1].map(&:size).reduce( 0, :+ )
    end

    # Replaces symbols and boundary markers in the list with single placeholder
    # characters that do not occur in the list, and remembers which
    # SpecialPattern each placeholder stands for.
    #
    # Placeholders are produced with Integer#chr, starting one above the
    # highest code point found in the list.
    class Special
      attr_reader :engine
      attr_accessor :specials, :list, :left, :right

      NULL = Regexp.new '(?!)'

      def initialize( engine, specials, list )
        @engine = engine
        @list = list
        max = 0
        list.each do |w|
          w.chars.each{ |c| i = c.ord; max = i if i > max }
        end
        @specials = [].tap do |ar|
          # strings and symbols first, in reverse lexical order; then regexen by source length
          specials.sort do |a, b|
            a = a.first
            b = b.first
            s1 = a.is_a?(String) || a.is_a?(Symbol)
            s2 = b.is_a?(String) || b.is_a?(Symbol)
            if s1 && s2
              b.to_s <=> a.to_s
            elsif s1
              -1
            elsif s2
              1
            else
              s = a.to_s.length - b.to_s.length
              s == 0 ? a.to_s <=> b.to_s : s
            end
          end.each do |var, opts|
            c = ( max += 1 ).chr
            sp = if opts.is_a? Hash
              # work on a copy: the hash belongs to the matcher and is read again on every call
              opts = opts.dup
              pat = opts.delete :pattern
              raise "variable #{var} requires a pattern" unless pat || var.is_a?(Regexp)
              pat ||= var.to_s
              SpecialPattern.new engine, c, var, pat, **opts
            elsif opts.is_a? String
              SpecialPattern.new engine, c, var, opts
            elsif var.is_a?(Regexp) && opts.nil?
              SpecialPattern.new engine, c, var, nil
            else
              raise "variable #{var} requires a pattern"
            end
            ar << sp
          end
        end
        if engine.bound
          c = ( max += 1 ).chr
          @left = SpecialPattern.new engine, c, c, engine.left_bound
          @specials << @left
          c = ( max += 1 ).chr
          @right = SpecialPattern.new engine, c, c, engine.right_bound
          @specials << @right
        end
      end

      # confirm that all special patterns are legitimate regexen
      def verify
        specials.each do |s|
          begin
            Regexp.new s.pat
          rescue
            raise SyntaxError.new "the symbol #{s.var} has an ill-formed pattern: #{s.pat}"
          end
        end
      end

      # placeholder character => SpecialPattern, for the placeholders actually used
      def special_map
        @special_map ||= {}
      end

      # the SpecialPattern that the placeholder s stands for, if any
      def symbols(s)
        special_map[s]
      end

      # reduce the list to a version ready for pattern generation
      def normalize
        rx = if specials.empty?
          NULL
        else
          Regexp.new '(' + specials.map(&:var).map(&:to_s).join('|') + ')'
        end
        l = r = false
        list = self.list.uniq.map do |w|
          # split keeps the captured symbols; drop the empty strings it leaves
          # around them so a symbol at the start of a word is the first part
          parts = w.split(rx).reject(&:empty?)
          e = parts.size - 1
          (0..e).map do |i|
            p = parts[i]
            if rx === p
              sp = specials.detect{ |s| s.var === p }
              special_map[sp.char] = sp
              p = sp.to_s
              if engine.bound
                if i == 0 && sp.left?
                  p = "#{left}#{p}"
                  l = true
                end
                if i == e && sp.right?
                  p = "#{p}#{right}"
                  r = true
                end
              end
            else
              p = p.downcase if engine.case_insensitive
              if engine.bound
                if i == 0 && engine.word_test === p[0]
                  p = "#{left}#{p}"
                  l = true
                end
                if i == e && engine.word_test === p[-1]
                  p = "#{p}#{right}"
                  r = true
                end
              end
            end
            p
          end.join
        end.uniq.sort
        special_map[left.char] = left if l
        special_map[right.char] = right if r
        list
      end
    end

    # Base class of the pattern tree. Subclasses implement #convert, which
    # returns the regex source for the node.
    class Node
      attr_accessor :engine, :optional, :symbols, :root

      def initialize(engine, symbols)
        @engine = engine
        @symbols = symbols
        @children = []
      end

      # simplifies the subtree in place
      def flatten
        children.each{ |c| c.flatten }
      end

      def root?
        root
      end

      def bound
        engine.bound
      end

      def optional?
        optional
      end

      def children
        @children ||= []
      end

      def convert
        raise NotImplementedError
      end

      def pfx
        engine.pfx
      end

      def qmark
        engine.qmark
      end

      # appends the optional quantifier when needed, grouping first if the expression is not atomic
      def finalize(rx)
        if optional?
          rx = wrap rx unless atomic?
          rx += qmark
        end
        rx
      end

      def wrap(s)
        engine.wrap s
      end

      # whether a quantifier can be applied to the expression without grouping it
      def atomic?
        false
      end

      def quote(s)
        engine.quote s
      end

    end

    # A symbol or boundary marker: a placeholder character (char) standing for
    # the sub-pattern pat wherever var matches in a list item.
    class SpecialPattern < Node
      attr_accessor :char, :var, :left, :right, :pat
      def initialize(engine, char, var, pat, atomic: (var.is_a?(Regexp) && pat.nil?), word_left: false, word_right: false)
        super(engine, nil)
        @char = char
        @var = var.is_a?(String) || var.is_a?(Symbol) ? Regexp.new(Regexp.quote(var.to_s)) : var
        @pat = pat || var.to_s
        @atomic = !!atomic
        @left = !!word_left
        @right = !!word_right
      end

      # whether a boundary marker is placed before this pattern when it starts a word
      def left?
        @left
      end

      # whether a boundary marker is placed after this pattern when it ends a word
      def right?
        @right
      end

      def atomic?
        @atomic
      end

      def to_s
        self.char
      end

      def convert
        rx = @pat
        finalize rx
      end
    end

    # A series of nodes that must match one after another.
    class Sequence < Node

      def initialize(engine, *constituents)
        super(engine, nil)
        @children = constituents
      end

      def convert
        rx = condense children.map(&:convert)
        finalize rx
      end

      # splices the children of nested non-optional sequences into this one
      def flatten
        super
        # walk backwards so that insertions do not shift the indices still to be visited
        (0...children.size).to_a.reverse.each do |i|
          c = children[i]
          if c.is_a?(Sequence) && !c.optional?
            children.delete_at i
            children.insert i, *c.children
          end
        end
      end

      # looks for repeating subsequences, as in ababababab, and condenses them to (?>ab){5}
      # condensation is only done when it results in a more compact regex
      def condense_repeats(elements)
        (1..(elements.size/2)).each do |l|            # length of subsequence considered
          (0...l).each do |o|                         # offset from the start of the sequence
            dup_count = []
            (1...(elements.size - o)/l).each do |s|   # the sub-sequence number
              s2 = s * l + o
              s1 = s2 - l
              seq1 = elements[s1...s1 + l]
              seq2 = elements[s2...s2 + l]
              if seq1 == seq2
                s0 = s - 1
                # extend the run ending at the previous subsequence, or start a new one:
                # [ repetitions, text, first index, index after the last element ]
                counts = dup_count[s] = dup_count[s0] || [ 1, seq1.join, s1, nil ]
                counts[0] += 1
                counts[3] = s2 + l
                dup_count[s0] = nil
              end
            end
            dup_count.compact!
            if dup_count.any?
              copy = elements.dup
              changed = false
              # replace from the end so that earlier indices stay valid
              dup_count.reverse.each do |repeats, seq, start, finish|
                a  = atomy? seq
                sl = seq.length
                if ( a ? 0 : engine.wrap_size ) + 2 + repeats.to_s.length + sl < sl * repeats
                  changed = true
                  copy[start...finish] = ( a ? seq : wrap(seq) ) + "{#{repeats}}"
                end
              end
              return copy if changed
            end
          end
        end
        elements
      end

      # infer atomic patterns
      def atomy?(s)
        s.size == 1 || /\A(?>\\\w|\[(?>[^\[\]\\]|\\.)++\])\z/ === s
      end

      # iterated repeat condensation
      def condense(elements)
        while elements.size > 1
          condensate = condense_repeats elements
          break if condensate == elements
          elements = condensate
        end
        elements.join
      end
    end

    # A set of alternative single characters, rendered as a character class.
    class CharClass < Node

      attr_accessor :word, :num, :space

      WORD_CHARS = (1..255).map(&:chr).select{ |c| /\w/ === c }.freeze
      CI_WORD_CHARS = WORD_CHARS.map(&:downcase).uniq.freeze
      NUM_CHARS = CI_WORD_CHARS.select{ |c| /\d/ === c }.freeze
      SPACE_CHARS = (1..255).map(&:chr).select{ |c| /\s/ === c }.freeze

      # children is the list of characters; complete sets of word, digit or
      # space characters are replaced by the \w, \d and \s shorthands
      def initialize(engine, children)
        super(engine, nil)
        if engine.case_insensitive
          if ( CI_WORD_CHARS - children ).empty?
            self.word = true
            self.num = false
            children -= CI_WORD_CHARS
          end
        elsif ( WORD_CHARS - children ).empty?
          self.word = true
          self.num = false
          children -= WORD_CHARS
        end
        # num is still nil only when \w did not already swallow the digits
        if num.nil? && ( NUM_CHARS - children ).empty?
          self.num = true
          children -= NUM_CHARS
        end
        if ( SPACE_CHARS - children ).empty?
          self.space = true
          children -= SPACE_CHARS
        end
        @children = children
      end

      def atomic?
        true
      end

      def flatten; end

      def convert
        rx = char_class children
        if optional?
          rx += qmark
        end
        rx
      end

      # takes a list of characters and returns a character class expression matching it
      def char_class(chars)
        mid = if chars.empty?
          ''
        else
          rs = ranges(chars)
          if rs.size == 1 && rs[0][0] == rs[0][1]
            cc_quote rs[0][0].chr
          else
            rs.map do |s, e|
              if s == e
                cc_quote s.chr
              elsif e == s + 1
                "#{ cc_quote s.chr }#{ cc_quote e.chr }"
              else
                "#{ cc_quote s.chr }-#{ cc_quote e.chr }"
              end
            end.join
          end
        end
        mid += '\w' if word
        mid += '\d' if num
        mid += '\s' if space
        # a lone character or shorthand needs no brackets
        if mid.length == 1 || mid =~ /\A\\\w\z/
          mid
        else
          "[#{mid}]"
        end
      end

      # quotes a character for use inside a character class
      def cc_quote(c)
        return Regexp.quote(c) if c =~ /\s/
        case c
        when '['  then '\['
        when ']'  then '\]'
        when '\\' then '\\\\'
        when '-'  then '\-'
        when '^'  then '\^'
        else c
        end
      end

      # groups the code points of chars into sorted [first, last] runs of consecutive values
      def ranges(chars)
        chars = chars.map(&:ord).sort
        rs = []
        c  = chars.shift
        r  = [ c, c ]
        while chars.size > 0
          c = chars.shift
          if c == r[1] + 1
            r[1] = c
          else
            rs << r
            r = [ c, c ]
          end
        end
        rs << r
      end
    end

    # A choice between sub-patterns; words are grouped by their first character.
    class Alternate < Node

      def initialize(engine, symbols, list)
        super(engine, nil)
        @children = list.group_by{ |s| s[0] }.values.map{ |ar| engine.tree( ar, symbols ) }
      end

      def convert
        rx = children.map(&:convert).join('|')
        # the root is left bare; Matcher#pattern decides how to group it
        rx = wrap(rx) unless root?
        finalize rx
      end

      def atomic?
        !root?
      end

    end

    # A single literal character.
    class Leaf < Node

      attr_reader :c

      def initialize(engine, c)
        super(engine, nil)
        @c = c
      end

      def atomic?
        true
      end

      def convert
        rx = quote c
        finalize rx
      end
    end

  end
end