regexp_parser 2.3.1 → 2.4.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -39,6 +39,9 @@ class Regexp::Parser
39
39
  parse_token(token)
40
40
  end
41
41
 
42
+ # Trigger recursive setting of #nesting_level, which reflects how deep
43
+ # a node is in the tree. Do this at the end to account for tree rewrites.
44
+ root.nesting_level = 0
42
45
  assign_referenced_expressions
43
46
 
44
47
  if block_given?
@@ -286,17 +289,9 @@ class Regexp::Parser
286
289
  def nest(exp)
287
290
  nesting.push(exp)
288
291
  node << exp
289
- update_transplanted_subtree(exp, node)
290
292
  self.node = exp
291
293
  end
292
294
 
293
- # subtrees are transplanted to build Alternations, Intersections, Ranges
294
- def update_transplanted_subtree(exp, new_parent)
295
- exp.nesting_level = new_parent.nesting_level + 1
296
- exp.respond_to?(:each) &&
297
- exp.each { |subexp| update_transplanted_subtree(subexp, exp) }
298
- end
299
-
300
295
  def escape(token)
301
296
  case token.token
302
297
 
@@ -483,7 +478,7 @@ class Regexp::Parser
483
478
  new_token = Regexp::Token.new(
484
479
  :group,
485
480
  :passive,
486
- '', # text
481
+ '', # text (none because this group is implicit)
487
482
  target_node.ts,
488
483
  nil, # te (unused)
489
484
  target_node.level,
@@ -493,66 +488,23 @@ class Regexp::Parser
493
488
  new_group = Group::Passive.new(new_token, active_opts)
494
489
  new_group.implicit = true
495
490
  new_group << target_node
496
- increase_level(target_node)
491
+ increase_group_level(target_node)
497
492
  node.expressions[node.expressions.index(target_node)] = new_group
498
493
  target_node = new_group
499
494
  end
500
495
 
501
- case token.token
502
- when :zero_or_one
503
- target_node.quantify(:zero_or_one, token.text, 0, 1, :greedy)
504
- when :zero_or_one_reluctant
505
- target_node.quantify(:zero_or_one, token.text, 0, 1, :reluctant)
506
- when :zero_or_one_possessive
507
- target_node.quantify(:zero_or_one, token.text, 0, 1, :possessive)
508
-
509
- when :zero_or_more
510
- target_node.quantify(:zero_or_more, token.text, 0, -1, :greedy)
511
- when :zero_or_more_reluctant
512
- target_node.quantify(:zero_or_more, token.text, 0, -1, :reluctant)
513
- when :zero_or_more_possessive
514
- target_node.quantify(:zero_or_more, token.text, 0, -1, :possessive)
515
-
516
- when :one_or_more
517
- target_node.quantify(:one_or_more, token.text, 1, -1, :greedy)
518
- when :one_or_more_reluctant
519
- target_node.quantify(:one_or_more, token.text, 1, -1, :reluctant)
520
- when :one_or_more_possessive
521
- target_node.quantify(:one_or_more, token.text, 1, -1, :possessive)
522
-
523
- when :interval
524
- interval(target_node, token)
525
-
526
- else
496
+ unless token.token =~ /\A(?:zero_or_one|zero_or_more|one_or_more|interval)
497
+ (?:_greedy|_reluctant|_possessive)?\z/x
527
498
  raise UnknownTokenError.new('Quantifier', token)
528
499
  end
500
+
501
+ target_node.quantify(token, active_opts)
529
502
  end
530
503
 
531
- def increase_level(exp)
504
+ def increase_group_level(exp)
532
505
  exp.level += 1
533
- exp.respond_to?(:each) && exp.each { |subexp| increase_level(subexp) }
534
- end
535
-
536
- def interval(target_node, token)
537
- text = token.text
538
- mchr = text[text.length-1].chr =~ /[?+]/ ? text[text.length-1].chr : nil
539
- case mchr
540
- when '?'
541
- range_text = text[0...-1]
542
- mode = :reluctant
543
- when '+'
544
- range_text = text[0...-1]
545
- mode = :possessive
546
- else
547
- range_text = text
548
- mode = :greedy
549
- end
550
-
551
- range = range_text.gsub(/\{|\}/, '').split(',', 2)
552
- min = range[0].empty? ? 0 : range[0]
553
- max = range[1] ? (range[1].empty? ? -1 : range[1]) : min
554
-
555
- target_node.quantify(:interval, text, min.to_i, max.to_i, mode)
506
+ exp.quantifier.level += 1 if exp.quantifier
507
+ exp.terminal? || exp.each { |subexp| increase_group_level(subexp) }
556
508
  end
557
509
 
558
510
  def set(token)
@@ -20,7 +20,7 @@
20
20
  name = data[ts+2..te-2].pack('c*').gsub(/[\^\s_\-]/, '').downcase
21
21
 
22
22
  token = self.class.short_prop_map[name] || self.class.long_prop_map[name]
23
- raise UnknownUnicodePropertyError.new(name) unless token
23
+ validation_error(:property, name) unless token
24
24
 
25
25
  self.emit(type, token.to_sym, text)
26
26
 
@@ -28,13 +28,7 @@
28
28
 
29
29
  comment = ('#' . [^\n]* . '\n'?);
30
30
 
31
- class_name_posix = 'alnum' | 'alpha' | 'blank' |
32
- 'cntrl' | 'digit' | 'graph' |
33
- 'lower' | 'print' | 'punct' |
34
- 'space' | 'upper' | 'xdigit' |
35
- 'word' | 'ascii';
36
-
37
- class_posix = ('[:' . '^'? . class_name_posix . ':]');
31
+ class_posix = ('[:' . '^'? . [^\[\]]* . ':]');
38
32
 
39
33
 
40
34
  # these are not supported in ruby at the moment
@@ -74,8 +68,7 @@
74
68
  quantity_maximum = ',' . (digit+);
75
69
  quantity_range = (digit+) . ',' . (digit+);
76
70
  quantifier_interval = range_open . ( quantity_exact | quantity_minimum |
77
- quantity_maximum | quantity_range ) . range_close .
78
- quantifier_mode?;
71
+ quantity_maximum | quantity_range ) . range_close;
79
72
 
80
73
  quantifiers = quantifier_greedy | quantifier_reluctant |
81
74
  quantifier_possessive | quantifier_interval;
@@ -223,24 +216,28 @@
223
216
  fcall character_set;
224
217
  };
225
218
 
226
- class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error) {
219
+ class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error) {
227
220
  text = copy(data, ts, te)
228
221
 
229
222
  type = :posixclass
230
223
  class_name = text[2..-3]
231
- if class_name[0].chr == '^'
224
+ if class_name[0] == '^'
232
225
  class_name = class_name[1..-1]
233
226
  type = :nonposixclass
234
227
  end
235
228
 
229
+ unless self.class.posix_classes.include?(class_name)
230
+ validation_error(:posix_class, text)
231
+ end
232
+
236
233
  emit(type, class_name.to_sym, text)
237
234
  };
238
235
 
239
236
  # These are not supported in ruby at the moment. Enable them if they are.
240
- # collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
237
+ # collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
241
238
  # emit(:set, :collation, copy(data, ts, te))
242
239
  # };
243
- # character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
240
+ # character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
244
241
  # emit(:set, :equivalent, copy(data, ts, te))
245
242
  # };
246
243
 
@@ -323,7 +320,7 @@
323
320
 
324
321
  codepoint_sequence > (escaped_alpha, 6) $eof(premature_end_error) {
325
322
  text = copy(data, ts-1, te)
326
- if text[2].chr == '{'
323
+ if text[2] == '{'
327
324
  emit(:escape, :codepoint_list, text)
328
325
  else
329
326
  emit(:escape, :codepoint, text)
@@ -419,12 +416,12 @@
419
416
 
420
417
  backslash . anchor_char > (backslashed, 3) {
421
418
  case text = copy(data, ts, te)
422
- when '\\A'; emit(:anchor, :bos, text)
423
- when '\\z'; emit(:anchor, :eos, text)
424
- when '\\Z'; emit(:anchor, :eos_ob_eol, text)
425
- when '\\b'; emit(:anchor, :word_boundary, text)
426
- when '\\B'; emit(:anchor, :nonword_boundary, text)
427
- when '\\G'; emit(:anchor, :match_start, text)
419
+ when '\A'; emit(:anchor, :bos, text)
420
+ when '\z'; emit(:anchor, :eos, text)
421
+ when '\Z'; emit(:anchor, :eos_ob_eol, text)
422
+ when '\b'; emit(:anchor, :word_boundary, text)
423
+ when '\B'; emit(:anchor, :nonword_boundary, text)
424
+ when '\G'; emit(:anchor, :match_start, text)
428
425
  end
429
426
  };
430
427
 
@@ -477,7 +474,7 @@
477
474
  group_open . group_options >group_opened {
478
475
  text = copy(data, ts, te)
479
476
  if text[2..-1] =~ /([^\-mixdau:]|^$)|-.*([dau])/
480
- raise InvalidGroupOption.new($1 || "-#{$2}", text)
477
+ validation_error(:group_option, $1 || "-#{$2}", text)
481
478
  end
482
479
  emit_options(text)
483
480
  };
@@ -605,7 +602,7 @@
605
602
  end
606
603
  };
607
604
 
608
- quantifier_interval {
605
+ quantifier_interval {
609
606
  emit(:quantifier, :interval, copy(data, ts, te))
610
607
  };
611
608
 
@@ -686,6 +683,7 @@ class Regexp::Scanner
686
683
  end
687
684
 
688
685
  # Invalid groupOption. Used for inline options.
686
+ # TODO: should become InvalidGroupOptionError in v3.0.0 for consistency
689
687
  class InvalidGroupOption < ValidationError
690
688
  def initialize(option, text)
691
689
  super "Invalid group option #{option} in #{text}"
@@ -706,6 +704,13 @@ class Regexp::Scanner
706
704
  end
707
705
  end
708
706
 
707
+ # The POSIX class name was not recognized by the scanner.
708
+ class UnknownPosixClassError < ValidationError
709
+ def initialize(text)
710
+ super "Unknown POSIX class #{text}"
711
+ end
712
+ end
713
+
709
714
  # Scans the given regular expression text, or Regexp object and collects the
710
715
  # emitted token into an array that gets returned at the end. If a block is
711
716
  # given, it gets called for each emitted token.
@@ -771,6 +776,11 @@ class Regexp::Scanner
771
776
  File.read("#{__dir__}/scanner/properties/#{name}.csv").scan(/(.+),(.+)/).to_h
772
777
  end
773
778
 
779
+ def self.posix_classes
780
+ %w[alnum alpha ascii blank cntrl digit graph
781
+ lower print punct space upper word xdigit]
782
+ end
783
+
774
784
  # Emits an array with the details of the scanned pattern
775
785
  def emit(type, token, text)
776
786
  #puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
@@ -873,15 +883,16 @@ class Regexp::Scanner
873
883
 
874
884
  # Centralizes and unifies the handling of validation related
875
885
  # errors.
876
- def validation_error(type, what, reason)
877
- case type
878
- when :group
879
- error = InvalidGroupError.new(what, reason)
880
- when :backref
881
- error = InvalidBackrefError.new(what, reason)
882
- when :sequence
883
- error = InvalidSequenceError.new(what, reason)
884
- end
886
+ def validation_error(type, what, reason = nil)
887
+ error =
888
+ case type
889
+ when :backref then InvalidBackrefError.new(what, reason)
890
+ when :group then InvalidGroupError.new(what, reason)
891
+ when :group_option then InvalidGroupOption.new(what, reason)
892
+ when :posix_class then UnknownPosixClassError.new(what)
893
+ when :property then UnknownUnicodePropertyError.new(what)
894
+ when :sequence then InvalidSequenceError.new(what, reason)
895
+ end
885
896
 
886
897
  raise error # unless @@config.validation_ignore
887
898
  end