regexp_parser 2.3.0 → 2.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +63 -6
  3. data/Gemfile +1 -0
  4. data/README.md +12 -6
  5. data/lib/regexp_parser/error.rb +1 -1
  6. data/lib/regexp_parser/expression/base.rb +9 -57
  7. data/lib/regexp_parser/expression/classes/backreference.rb +1 -0
  8. data/lib/regexp_parser/expression/classes/character_set/range.rb +2 -2
  9. data/lib/regexp_parser/expression/classes/character_set.rb +2 -2
  10. data/lib/regexp_parser/expression/classes/{type.rb → character_type.rb} +0 -0
  11. data/lib/regexp_parser/expression/classes/conditional.rb +2 -2
  12. data/lib/regexp_parser/expression/classes/free_space.rb +1 -1
  13. data/lib/regexp_parser/expression/classes/group.rb +6 -6
  14. data/lib/regexp_parser/expression/classes/keep.rb +2 -0
  15. data/lib/regexp_parser/expression/classes/root.rb +3 -5
  16. data/lib/regexp_parser/expression/classes/{property.rb → unicode_property.rb} +1 -0
  17. data/lib/regexp_parser/expression/methods/construct.rb +43 -0
  18. data/lib/regexp_parser/expression/methods/match_length.rb +1 -1
  19. data/lib/regexp_parser/expression/methods/tests.rb +10 -1
  20. data/lib/regexp_parser/expression/quantifier.rb +41 -23
  21. data/lib/regexp_parser/expression/sequence.rb +9 -23
  22. data/lib/regexp_parser/expression/sequence_operation.rb +2 -2
  23. data/lib/regexp_parser/expression/shared.rb +85 -0
  24. data/lib/regexp_parser/expression/subexpression.rb +11 -7
  25. data/lib/regexp_parser/expression.rb +4 -2
  26. data/lib/regexp_parser/parser.rb +21 -72
  27. data/lib/regexp_parser/scanner/property.rl +1 -1
  28. data/lib/regexp_parser/scanner/scanner.rl +42 -31
  29. data/lib/regexp_parser/scanner.rb +725 -793
  30. data/lib/regexp_parser/syntax/token/escape.rb +1 -1
  31. data/lib/regexp_parser/syntax/token/unicode_property.rb +0 -5
  32. data/lib/regexp_parser/version.rb +1 -1
  33. metadata +10 -8
@@ -0,0 +1,85 @@
1
+ module Regexp::Expression
2
+ module Shared
3
+ module ClassMethods; end # filled in ./methods/*.rb
4
+
5
+ def self.included(mod)
6
+ mod.class_eval do
7
+ extend Shared::ClassMethods
8
+
9
+ attr_accessor :type, :token, :text, :ts, :te,
10
+ :level, :set_level, :conditional_level,
11
+ :options, :quantifier
12
+
13
+ attr_reader :nesting_level
14
+ end
15
+ end
16
+
17
+ def init_from_token_and_options(token, options = {})
18
+ self.type = token.type
19
+ self.token = token.token
20
+ self.text = token.text
21
+ self.ts = token.ts
22
+ self.te = token.te
23
+ self.level = token.level
24
+ self.set_level = token.set_level
25
+ self.conditional_level = token.conditional_level
26
+ self.nesting_level = 0
27
+ self.options = options || {}
28
+ end
29
+ private :init_from_token_and_options
30
+
31
+ def initialize_copy(orig)
32
+ self.text = orig.text.dup if orig.text
33
+ self.options = orig.options.dup if orig.options
34
+ self.quantifier = orig.quantifier.clone if orig.quantifier
35
+ super
36
+ end
37
+
38
+ def starts_at
39
+ ts
40
+ end
41
+
42
+ def base_length
43
+ to_s(:base).length
44
+ end
45
+
46
+ def full_length
47
+ to_s.length
48
+ end
49
+
50
+ def to_s(format = :full)
51
+ "#{parts.join}#{quantifier_affix(format)}"
52
+ end
53
+ alias :to_str :to_s
54
+
55
+ def parts
56
+ [text.dup]
57
+ end
58
+
59
+ def quantifier_affix(expression_format)
60
+ quantifier.to_s if quantified? && expression_format != :base
61
+ end
62
+
63
+ def quantified?
64
+ !quantifier.nil?
65
+ end
66
+
67
+ def offset
68
+ [starts_at, full_length]
69
+ end
70
+
71
+ def coded_offset
72
+ '@%d+%d' % offset
73
+ end
74
+
75
+ def terminal?
76
+ !respond_to?(:expressions)
77
+ end
78
+
79
+ def nesting_level=(lvl)
80
+ @nesting_level = lvl
81
+ quantifier && quantifier.nesting_level = lvl
82
+ terminal? || each { |subexp| subexp.nesting_level = lvl + 1 }
83
+ end
84
+ end
85
+ end
@@ -5,9 +5,8 @@ module Regexp::Expression
5
5
  attr_accessor :expressions
6
6
 
7
7
  def initialize(token, options = {})
8
- super
9
-
10
8
  self.expressions = []
9
+ super
11
10
  end
12
11
 
13
12
  # Override base method to clone the expressions as well.
@@ -43,16 +42,21 @@ module Regexp::Expression
43
42
  ts + to_s.length
44
43
  end
45
44
 
46
- def to_s(format = :full)
47
- # Note: the format does not get passed down to subexpressions.
48
- "#{expressions.join}#{quantifier_affix(format)}"
45
+ def parts
46
+ expressions
49
47
  end
50
48
 
51
49
  def to_h
52
- attributes.merge({
50
+ attributes.merge(
53
51
  text: to_s(:base),
54
52
  expressions: expressions.map(&:to_h)
55
- })
53
+ )
54
+ end
55
+
56
+ private
57
+
58
+ def intersperse(expressions, separator)
59
+ expressions.flat_map { |exp| [exp, separator] }.slice(0...-1)
56
60
  end
57
61
  end
58
62
  end
@@ -1,5 +1,6 @@
1
1
  require 'regexp_parser/error'
2
2
 
3
+ require 'regexp_parser/expression/shared'
3
4
  require 'regexp_parser/expression/base'
4
5
  require 'regexp_parser/expression/quantifier'
5
6
  require 'regexp_parser/expression/subexpression'
@@ -12,6 +13,7 @@ require 'regexp_parser/expression/classes/backreference'
12
13
  require 'regexp_parser/expression/classes/character_set'
13
14
  require 'regexp_parser/expression/classes/character_set/intersection'
14
15
  require 'regexp_parser/expression/classes/character_set/range'
16
+ require 'regexp_parser/expression/classes/character_type'
15
17
  require 'regexp_parser/expression/classes/conditional'
16
18
  require 'regexp_parser/expression/classes/escape_sequence'
17
19
  require 'regexp_parser/expression/classes/free_space'
@@ -19,10 +21,10 @@ require 'regexp_parser/expression/classes/group'
19
21
  require 'regexp_parser/expression/classes/keep'
20
22
  require 'regexp_parser/expression/classes/literal'
21
23
  require 'regexp_parser/expression/classes/posix_class'
22
- require 'regexp_parser/expression/classes/property'
23
24
  require 'regexp_parser/expression/classes/root'
24
- require 'regexp_parser/expression/classes/type'
25
+ require 'regexp_parser/expression/classes/unicode_property'
25
26
 
27
+ require 'regexp_parser/expression/methods/construct'
26
28
  require 'regexp_parser/expression/methods/match'
27
29
  require 'regexp_parser/expression/methods/match_length'
28
30
  require 'regexp_parser/expression/methods/options'
@@ -23,7 +23,7 @@ class Regexp::Parser
23
23
  end
24
24
 
25
25
  def parse(input, syntax = "ruby/#{RUBY_VERSION}", options: nil, &block)
26
- root = Root.build(extract_options(input, options))
26
+ root = Root.construct(options: extract_options(input, options))
27
27
 
28
28
  self.root = root
29
29
  self.node = root
@@ -39,6 +39,9 @@ class Regexp::Parser
39
39
  parse_token(token)
40
40
  end
41
41
 
42
+ # Trigger recursive setting of #nesting_level, which reflects how deep
43
+ # a node is in the tree. Do this at the end to account for tree rewrites.
44
+ root.nesting_level = 0
42
45
  assign_referenced_expressions
43
46
 
44
47
  if block_given?
@@ -197,11 +200,11 @@ class Regexp::Parser
197
200
  end
198
201
 
199
202
  def captured_group_count_at_level
200
- captured_group_counts[node.level]
203
+ captured_group_counts[node]
201
204
  end
202
205
 
203
206
  def count_captured_group
204
- captured_group_counts[node.level] += 1
207
+ captured_group_counts[node] += 1
205
208
  end
206
209
 
207
210
  def close_group
@@ -286,17 +289,9 @@ class Regexp::Parser
286
289
  def nest(exp)
287
290
  nesting.push(exp)
288
291
  node << exp
289
- update_transplanted_subtree(exp, node)
290
292
  self.node = exp
291
293
  end
292
294
 
293
- # subtrees are transplanted to build Alternations, Intersections, Ranges
294
- def update_transplanted_subtree(exp, new_parent)
295
- exp.nesting_level = new_parent.nesting_level + 1
296
- exp.respond_to?(:each) &&
297
- exp.each { |subexp| update_transplanted_subtree(subexp, exp) }
298
- end
299
-
300
295
  def escape(token)
301
296
  case token.token
302
297
 
@@ -480,79 +475,33 @@ class Regexp::Parser
480
475
  # description of the problem: https://github.com/ammar/regexp_parser/issues/3
481
476
  # rationale for this solution: https://github.com/ammar/regexp_parser/pull/69
482
477
  if target_node.quantified?
483
- new_token = Regexp::Token.new(
484
- :group,
485
- :passive,
486
- '', # text
487
- target_node.ts,
488
- nil, # te (unused)
489
- target_node.level,
490
- target_node.set_level,
491
- target_node.conditional_level
478
+ new_group = Group::Passive.construct(
479
+ token: :passive,
480
+ ts: target_node.ts,
481
+ level: target_node.level,
482
+ set_level: target_node.set_level,
483
+ conditional_level: target_node.conditional_level,
484
+ options: active_opts,
492
485
  )
493
- new_group = Group::Passive.new(new_token, active_opts)
494
486
  new_group.implicit = true
495
487
  new_group << target_node
496
- increase_level(target_node)
488
+ increase_group_level(target_node)
497
489
  node.expressions[node.expressions.index(target_node)] = new_group
498
490
  target_node = new_group
499
491
  end
500
492
 
501
- case token.token
502
- when :zero_or_one
503
- target_node.quantify(:zero_or_one, token.text, 0, 1, :greedy)
504
- when :zero_or_one_reluctant
505
- target_node.quantify(:zero_or_one, token.text, 0, 1, :reluctant)
506
- when :zero_or_one_possessive
507
- target_node.quantify(:zero_or_one, token.text, 0, 1, :possessive)
508
-
509
- when :zero_or_more
510
- target_node.quantify(:zero_or_more, token.text, 0, -1, :greedy)
511
- when :zero_or_more_reluctant
512
- target_node.quantify(:zero_or_more, token.text, 0, -1, :reluctant)
513
- when :zero_or_more_possessive
514
- target_node.quantify(:zero_or_more, token.text, 0, -1, :possessive)
515
-
516
- when :one_or_more
517
- target_node.quantify(:one_or_more, token.text, 1, -1, :greedy)
518
- when :one_or_more_reluctant
519
- target_node.quantify(:one_or_more, token.text, 1, -1, :reluctant)
520
- when :one_or_more_possessive
521
- target_node.quantify(:one_or_more, token.text, 1, -1, :possessive)
522
-
523
- when :interval
524
- interval(target_node, token)
525
-
526
- else
493
+ unless token.token =~ /\A(?:zero_or_one|zero_or_more|one_or_more|interval)
494
+ (?:_greedy|_reluctant|_possessive)?\z/x
527
495
  raise UnknownTokenError.new('Quantifier', token)
528
496
  end
497
+
498
+ target_node.quantify(token, active_opts)
529
499
  end
530
500
 
531
- def increase_level(exp)
501
+ def increase_group_level(exp)
532
502
  exp.level += 1
533
- exp.respond_to?(:each) && exp.each { |subexp| increase_level(subexp) }
534
- end
535
-
536
- def interval(target_node, token)
537
- text = token.text
538
- mchr = text[text.length-1].chr =~ /[?+]/ ? text[text.length-1].chr : nil
539
- case mchr
540
- when '?'
541
- range_text = text[0...-1]
542
- mode = :reluctant
543
- when '+'
544
- range_text = text[0...-1]
545
- mode = :possessive
546
- else
547
- range_text = text
548
- mode = :greedy
549
- end
550
-
551
- range = range_text.gsub(/\{|\}/, '').split(',', 2)
552
- min = range[0].empty? ? 0 : range[0]
553
- max = range[1] ? (range[1].empty? ? -1 : range[1]) : min
554
-
555
- target_node.quantify(:interval, text, min.to_i, max.to_i, mode)
503
+ exp.quantifier.level += 1 if exp.quantifier
504
+ exp.terminal? || exp.each { |subexp| increase_group_level(subexp) }
556
505
  end
557
506
 
558
507
  def set(token)
@@ -20,7 +20,7 @@
20
20
  name = data[ts+2..te-2].pack('c*').gsub(/[\^\s_\-]/, '').downcase
21
21
 
22
22
  token = self.class.short_prop_map[name] || self.class.long_prop_map[name]
23
- raise UnknownUnicodePropertyError.new(name) unless token
23
+ validation_error(:property, name) unless token
24
24
 
25
25
  self.emit(type, token.to_sym, text)
26
26
 
@@ -28,13 +28,7 @@
28
28
 
29
29
  comment = ('#' . [^\n]* . '\n'?);
30
30
 
31
- class_name_posix = 'alnum' | 'alpha' | 'blank' |
32
- 'cntrl' | 'digit' | 'graph' |
33
- 'lower' | 'print' | 'punct' |
34
- 'space' | 'upper' | 'xdigit' |
35
- 'word' | 'ascii';
36
-
37
- class_posix = ('[:' . '^'? . class_name_posix . ':]');
31
+ class_posix = ('[:' . '^'? . [^\[\]]* . ':]');
38
32
 
39
33
 
40
34
  # these are not supported in ruby at the moment
@@ -74,8 +68,7 @@
74
68
  quantity_maximum = ',' . (digit+);
75
69
  quantity_range = (digit+) . ',' . (digit+);
76
70
  quantifier_interval = range_open . ( quantity_exact | quantity_minimum |
77
- quantity_maximum | quantity_range ) . range_close .
78
- quantifier_mode?;
71
+ quantity_maximum | quantity_range ) . range_close;
79
72
 
80
73
  quantifiers = quantifier_greedy | quantifier_reluctant |
81
74
  quantifier_possessive | quantifier_interval;
@@ -223,24 +216,28 @@
223
216
  fcall character_set;
224
217
  };
225
218
 
226
- class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error) {
219
+ class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error) {
227
220
  text = copy(data, ts, te)
228
221
 
229
222
  type = :posixclass
230
223
  class_name = text[2..-3]
231
- if class_name[0].chr == '^'
224
+ if class_name[0] == '^'
232
225
  class_name = class_name[1..-1]
233
226
  type = :nonposixclass
234
227
  end
235
228
 
229
+ unless self.class.posix_classes.include?(class_name)
230
+ validation_error(:posix_class, text)
231
+ end
232
+
236
233
  emit(type, class_name.to_sym, text)
237
234
  };
238
235
 
239
236
  # These are not supported in ruby at the moment. Enable them if they are.
240
- # collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
237
+ # collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
241
238
  # emit(:set, :collation, copy(data, ts, te))
242
239
  # };
243
- # character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
240
+ # character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
244
241
  # emit(:set, :equivalent, copy(data, ts, te))
245
242
  # };
246
243
 
@@ -323,7 +320,7 @@
323
320
 
324
321
  codepoint_sequence > (escaped_alpha, 6) $eof(premature_end_error) {
325
322
  text = copy(data, ts-1, te)
326
- if text[2].chr == '{'
323
+ if text[2] == '{'
327
324
  emit(:escape, :codepoint_list, text)
328
325
  else
329
326
  emit(:escape, :codepoint, text)
@@ -419,12 +416,12 @@
419
416
 
420
417
  backslash . anchor_char > (backslashed, 3) {
421
418
  case text = copy(data, ts, te)
422
- when '\\A'; emit(:anchor, :bos, text)
423
- when '\\z'; emit(:anchor, :eos, text)
424
- when '\\Z'; emit(:anchor, :eos_ob_eol, text)
425
- when '\\b'; emit(:anchor, :word_boundary, text)
426
- when '\\B'; emit(:anchor, :nonword_boundary, text)
427
- when '\\G'; emit(:anchor, :match_start, text)
419
+ when '\A'; emit(:anchor, :bos, text)
420
+ when '\z'; emit(:anchor, :eos, text)
421
+ when '\Z'; emit(:anchor, :eos_ob_eol, text)
422
+ when '\b'; emit(:anchor, :word_boundary, text)
423
+ when '\B'; emit(:anchor, :nonword_boundary, text)
424
+ when '\G'; emit(:anchor, :match_start, text)
428
425
  end
429
426
  };
430
427
 
@@ -477,7 +474,7 @@
477
474
  group_open . group_options >group_opened {
478
475
  text = copy(data, ts, te)
479
476
  if text[2..-1] =~ /([^\-mixdau:]|^$)|-.*([dau])/
480
- raise InvalidGroupOption.new($1 || "-#{$2}", text)
477
+ validation_error(:group_option, $1 || "-#{$2}", text)
481
478
  end
482
479
  emit_options(text)
483
480
  };
@@ -605,7 +602,7 @@
605
602
  end
606
603
  };
607
604
 
608
- quantifier_interval {
605
+ quantifier_interval {
609
606
  emit(:quantifier, :interval, copy(data, ts, te))
610
607
  };
611
608
 
@@ -686,6 +683,7 @@ class Regexp::Scanner
686
683
  end
687
684
 
688
685
  # Invalid groupOption. Used for inline options.
686
+ # TODO: should become InvalidGroupOptionError in v3.0.0 for consistency
689
687
  class InvalidGroupOption < ValidationError
690
688
  def initialize(option, text)
691
689
  super "Invalid group option #{option} in #{text}"
@@ -706,6 +704,13 @@ class Regexp::Scanner
706
704
  end
707
705
  end
708
706
 
707
+ # The POSIX class name was not recognized by the scanner.
708
+ class UnknownPosixClassError < ValidationError
709
+ def initialize(text)
710
+ super "Unknown POSIX class #{text}"
711
+ end
712
+ end
713
+
709
714
  # Scans the given regular expression text, or Regexp object and collects the
710
715
  # emitted token into an array that gets returned at the end. If a block is
711
716
  # given, it gets called for each emitted token.
@@ -771,6 +776,11 @@ class Regexp::Scanner
771
776
  File.read("#{__dir__}/scanner/properties/#{name}.csv").scan(/(.+),(.+)/).to_h
772
777
  end
773
778
 
779
+ def self.posix_classes
780
+ %w[alnum alpha ascii blank cntrl digit graph
781
+ lower print punct space upper word xdigit]
782
+ end
783
+
774
784
  # Emits an array with the details of the scanned pattern
775
785
  def emit(type, token, text)
776
786
  #puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
@@ -873,15 +883,16 @@ class Regexp::Scanner
873
883
 
874
884
  # Centralizes and unifies the handling of validation related
875
885
  # errors.
876
- def validation_error(type, what, reason)
877
- case type
878
- when :group
879
- error = InvalidGroupError.new(what, reason)
880
- when :backref
881
- error = InvalidBackrefError.new(what, reason)
882
- when :sequence
883
- error = InvalidSequenceError.new(what, reason)
884
- end
886
+ def validation_error(type, what, reason = nil)
887
+ error =
888
+ case type
889
+ when :backref then InvalidBackrefError.new(what, reason)
890
+ when :group then InvalidGroupError.new(what, reason)
891
+ when :group_option then InvalidGroupOption.new(what, reason)
892
+ when :posix_class then UnknownPosixClassError.new(what)
893
+ when :property then UnknownUnicodePropertyError.new(what)
894
+ when :sequence then InvalidSequenceError.new(what, reason)
895
+ end
885
896
 
886
897
  raise error # unless @@config.validation_ignore
887
898
  end