regexp_parser 2.6.2 → 2.8.0

This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (45)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +67 -0
  3. data/Gemfile +2 -2
  4. data/README.md +32 -29
  5. data/lib/regexp_parser/expression/base.rb +0 -7
  6. data/lib/regexp_parser/expression/classes/alternation.rb +1 -1
  7. data/lib/regexp_parser/expression/classes/backreference.rb +4 -2
  8. data/lib/regexp_parser/expression/classes/character_set/range.rb +2 -7
  9. data/lib/regexp_parser/expression/classes/character_set.rb +3 -4
  10. data/lib/regexp_parser/expression/classes/conditional.rb +2 -6
  11. data/lib/regexp_parser/expression/classes/escape_sequence.rb +3 -1
  12. data/lib/regexp_parser/expression/classes/free_space.rb +3 -1
  13. data/lib/regexp_parser/expression/classes/group.rb +0 -22
  14. data/lib/regexp_parser/expression/classes/posix_class.rb +5 -1
  15. data/lib/regexp_parser/expression/classes/unicode_property.rb +5 -2
  16. data/lib/regexp_parser/expression/methods/construct.rb +2 -4
  17. data/lib/regexp_parser/expression/methods/parts.rb +23 -0
  18. data/lib/regexp_parser/expression/methods/printing.rb +26 -0
  19. data/lib/regexp_parser/expression/methods/tests.rb +40 -3
  20. data/lib/regexp_parser/expression/methods/traverse.rb +35 -19
  21. data/lib/regexp_parser/expression/quantifier.rb +30 -17
  22. data/lib/regexp_parser/expression/sequence.rb +5 -10
  23. data/lib/regexp_parser/expression/sequence_operation.rb +4 -9
  24. data/lib/regexp_parser/expression/shared.rb +37 -20
  25. data/lib/regexp_parser/expression/subexpression.rb +20 -15
  26. data/lib/regexp_parser/expression.rb +2 -0
  27. data/lib/regexp_parser/lexer.rb +76 -36
  28. data/lib/regexp_parser/parser.rb +97 -97
  29. data/lib/regexp_parser/scanner/errors/premature_end_error.rb +8 -0
  30. data/lib/regexp_parser/scanner/errors/scanner_error.rb +6 -0
  31. data/lib/regexp_parser/scanner/errors/validation_error.rb +63 -0
  32. data/lib/regexp_parser/scanner/mapping.rb +89 -0
  33. data/lib/regexp_parser/scanner/property.rl +2 -2
  34. data/lib/regexp_parser/scanner/scanner.rl +90 -169
  35. data/lib/regexp_parser/scanner.rb +1157 -1330
  36. data/lib/regexp_parser/syntax/token/backreference.rb +3 -0
  37. data/lib/regexp_parser/syntax/token/character_set.rb +3 -0
  38. data/lib/regexp_parser/syntax/token/escape.rb +3 -1
  39. data/lib/regexp_parser/syntax/token/meta.rb +9 -2
  40. data/lib/regexp_parser/syntax/token/unicode_property.rb +3 -0
  41. data/lib/regexp_parser/syntax/token/virtual.rb +11 -0
  42. data/lib/regexp_parser/syntax/version_lookup.rb +0 -8
  43. data/lib/regexp_parser/syntax/versions.rb +2 -0
  44. data/lib/regexp_parser/version.rb +1 -1
  45. metadata +10 -3
@@ -8,14 +8,10 @@ module Regexp::Expression

     MODES = %i[greedy possessive reluctant]

-    attr_reader :min, :max, :mode
-
     def initialize(*args)
       deprecated_old_init(*args) and return if args.count == 4 || args.count == 5

       init_from_token_and_options(*args)
-      @mode = (token.to_s[/greedy|reluctant|possessive/] || :greedy).to_sym
-      @min, @max = minmax
       # TODO: remove in v3.0.0, stop removing parts of #token (?)
       self.token = token.to_s.sub(/_(greedy|possessive|reluctant)/, '').to_sym
     end
@@ -39,9 +35,21 @@ module Regexp::Expression
     end
     alias :lazy? :reluctant?

+    def min
+      derived_data[:min]
+    end
+
+    def max
+      derived_data[:max]
+    end
+
+    def mode
+      derived_data[:mode]
+    end
+
     private

-    def deprecated_old_init(token, text, min, max, mode = :greedy)
+    def deprecated_old_init(token, text, _min, _max, _mode = :greedy)
       warn "Calling `Expression::Base#quantify` or `#{self.class}.new` with 4+ arguments "\
            "is deprecated.\nIt will no longer be supported in regexp_parser v3.0.0.\n"\
            "Please pass a Regexp::Token instead, e.g. replace `token, text, min, max, mode` "\
@@ -51,20 +59,25 @@ module Regexp::Expression
           "This is consistent with how Expression::Base instances are created. "
       @token = token
       @text = text
-      @min = min
-      @max = max
-      @mode = mode
     end

-    def minmax
-      case token
-      when /zero_or_one/ then [0, 1]
-      when /zero_or_more/ then [0, -1]
-      when /one_or_more/ then [1, -1]
-      when :interval
-        int_min = text[/\{(\d*)/, 1]
-        int_max = text[/,?(\d*)\}/, 1]
-        [int_min.to_i, (int_max.empty? ? -1 : int_max.to_i)]
+    def derived_data
+      @derived_data ||= begin
+        min, max =
+          case text[0]
+          when '?'; [0, 1]
+          when '*'; [0, -1]
+          when '+'; [1, -1]
+          else
+            int_min = text[/\{(\d*)/, 1]
+            int_max = text[/,?(\d*)\}/, 1]
+            [int_min.to_i, (int_max.empty? ? -1 : int_max.to_i)]
+          end
+
+        mod = text[/.([?+])/, 1]
+        mode = (mod == '?' && :reluctant) || (mod == '+' && :possessive) || :greedy
+
+        { min: min, max: max, mode: mode }
       end
     end
   end
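
Note on the quantifier hunks above: min, max, and mode are no longer computed eagerly in the constructor; they are derived lazily from the quantifier text via derived_data and memoized. A minimal sketch of the resulting reader behavior, assuming the top-level Regexp::Parser.parse API (only min/max/mode/lazy? shown here are affected by this change):

    require 'regexp_parser'

    # quantifier of the literal 'a' in /a+?/
    q = Regexp::Parser.parse(/a+?/)[0].quantifier

    q.min   # => 1           text[0] == '+'  ->  [1, -1]
    q.max   # => -1          (-1 stands for "no upper bound")
    q.mode  # => :reluctant  trailing '?' modifier
    q.lazy? # => true        alias of #reluctant?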
@@ -12,25 +12,20 @@ module Regexp::Expression
           level: exp.level,
           set_level: exp.set_level,
           conditional_level: params[:conditional_level] || exp.conditional_level,
+          ts: params[:ts],
         )
-        sequence.nesting_level = exp.nesting_level + 1
         sequence.options = active_opts
         exp.expressions << sequence
         sequence
       end
     end

-    def starts_at
-      expressions.first.starts_at
+    def ts
+      (head = expressions.first) ? head.ts : @ts
     end
-    alias :ts :starts_at

-    def quantify(*args)
-      target = expressions.reverse.find { |exp| !exp.is_a?(FreeSpace) }
-      target or raise Regexp::Parser::Error,
-                     "No valid target found for '#{text}' quantifier"
-
-      target.quantify(*args)
+    def quantify(token, *args)
+      extract_quantifier_target(token.text).quantify(token, *args)
     end
   end
 end
@@ -5,21 +5,16 @@ module Regexp::Expression
     alias :operands :expressions
     alias :operator :text

-    def starts_at
-      expressions.first.starts_at
+    def ts
+      (head = expressions.first) ? head.ts : @ts
     end
-    alias :ts :starts_at

     def <<(exp)
       expressions.last << exp
     end

-    def add_sequence(active_opts = {})
-      self.class::OPERAND.add_to(self, {}, active_opts)
-    end
-
-    def parts
-      intersperse(expressions, text.dup)
+    def add_sequence(active_opts = {}, params = { ts: 0 })
+      self.class::OPERAND.add_to(self, params, active_opts)
     end
   end
 end
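
Note on the two ts changes above: Sequence and SequenceOperation previously delegated to expressions.first.starts_at, which breaks for empty sequences; they now fall back to their own @ts (which Sequence.add_to receives via params[:ts]). A small sketch, assuming /ab|/ parses to an Alternation whose trailing branch is an empty Alternative:

    require 'regexp_parser'

    alt = Regexp::Parser.parse(/ab|/)[0]  # the Alternation node

    empty_branch = alt.alternatives.last
    empty_branch.expressions              # => [] - nothing follows the '|'
    empty_branch.ts                       # now answers from the branch's own @ts
                                          # instead of calling #ts on a nil child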
@@ -8,7 +8,8 @@ module Regexp::Expression

       attr_accessor :type, :token, :text, :ts, :te,
                     :level, :set_level, :conditional_level,
-                    :options
+                    :options, :parent,
+                    :custom_to_s_handling, :pre_quantifier_decorations

       attr_reader :nesting_level, :quantifier
     end
@@ -32,6 +33,10 @@ module Regexp::Expression
       self.text = orig.text.dup if orig.text
       self.options = orig.options.dup if orig.options
       self.quantifier = orig.quantifier.clone if orig.quantifier
+      self.parent = nil # updated by Subexpression#initialize_copy
+      if orig.pre_quantifier_decorations
+        self.pre_quantifier_decorations = orig.pre_quantifier_decorations.map(&:dup)
+      end
       super
     end

@@ -39,35 +44,51 @@ module Regexp::Expression
       ts
     end

+    def ends_at(include_quantifier = true)
+      ts + (include_quantifier ? full_length : base_length)
+    end
+
     def base_length
       to_s(:base).length
     end

     def full_length
-      to_s.length
-    end
-
+      to_s(:original).length
+    end
+
+    # #to_s reproduces the original source, as an unparser would.
+    #
+    # It takes an optional format argument.
+    #
+    # Example:
+    #
+    #   lit = Regexp::Parser.parse(/a +/x)[0]
+    #
+    #   lit.to_s            # => 'a+'  # default; with quantifier
+    #   lit.to_s(:full)     # => 'a+'  # default; with quantifier
+    #   lit.to_s(:base)     # => 'a'   # without quantifier
+    #   lit.to_s(:original) # => 'a +' # with quantifier AND intermittent decorations
+    #
     def to_s(format = :full)
-      "#{parts.join}#{quantifier_affix(format)}"
+      base = parts.each_with_object(''.dup) do |part, buff|
+        if part.instance_of?(String)
+          buff << part
+        elsif !part.custom_to_s_handling
+          buff << part.to_s(:original)
+        end
+      end
+      "#{base}#{pre_quantifier_decoration(format)}#{quantifier_affix(format)}"
     end
     alias :to_str :to_s

-    def parts
-      [text.dup]
+    def pre_quantifier_decoration(expression_format = :original)
+      pre_quantifier_decorations.to_a.join if expression_format == :original
     end

-    def quantifier_affix(expression_format)
+    def quantifier_affix(expression_format = :full)
       quantifier.to_s if quantified? && expression_format != :base
     end

-    def quantified?
-      !quantifier.nil?
-    end
-
-    def optional?
-      quantified? && quantifier.min == 0
-    end
-
     def offset
       [starts_at, full_length]
     end
@@ -76,10 +97,6 @@ module Regexp::Expression
       '@%d+%d' % offset
     end

-    def terminal?
-      !respond_to?(:expressions)
-    end
-
     def nesting_level=(lvl)
       @nesting_level = lvl
       quantifier && quantifier.nesting_level = lvl
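
The docstring added in the to_s hunk above already demonstrates the new formats; here is a slightly expanded sketch that also exercises the new #ends_at, using the same /a +/x example (offsets assume the literal starts at position 0):

    require 'regexp_parser'

    lit = Regexp::Parser.parse(/a +/x)[0]  # literal 'a' plus its quantifier

    lit.to_s             # => 'a+'   quantifier included, decorations dropped
    lit.to_s(:base)      # => 'a'    without the quantifier
    lit.to_s(:original)  # => 'a +'  quantifier plus pre-quantifier whitespace

    lit.ends_at          # => 3      ts + full_length (to_s(:original).length)
    lit.ends_at(false)   # => 1      ts + base_length (to_s(:base).length)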
@@ -11,17 +11,15 @@ module Regexp::Expression

     # Override base method to clone the expressions as well.
     def initialize_copy(orig)
-      self.expressions = orig.expressions.map(&:clone)
+      self.expressions = orig.expressions.map do |exp|
+        exp.clone.tap { |copy| copy.parent = self }
+      end
       super
     end

     def <<(exp)
-      if exp.is_a?(WhiteSpace) && last && last.is_a?(WhiteSpace)
-        last.merge(exp)
-      else
-        exp.nesting_level = nesting_level + 1
-        expressions << exp
-      end
+      exp.parent = self
+      expressions << exp
     end

     %w[[] at each empty? fetch index join last length values_at].each do |method|
@@ -39,11 +37,7 @@ module Regexp::Expression
     end

     def te
-      ts + to_s.length
-    end
-
-    def parts
-      expressions
+      ts + base_length
     end

     def to_h
@@ -53,10 +47,21 @@ module Regexp::Expression
       )
     end

-    private
+    def extract_quantifier_target(quantifier_description)
+      pre_quantifier_decorations = []
+      target = expressions.reverse.find do |exp|
+        if exp.decorative?
+          exp.custom_to_s_handling = true
+          pre_quantifier_decorations << exp.text
+          next
+        end
+        exp
+      end
+      target or raise Regexp::Parser::ParserError,
+                      "No valid target found for '#{quantifier_description}' quantifier"

-    def intersperse(expressions, separator)
-      expressions.flat_map { |exp| [exp, separator] }.slice(0...-1)
+      target.pre_quantifier_decorations = pre_quantifier_decorations
+      target
     end
   end
 end
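
Note on the subexpression hunks above: children now carry a parent back-reference, set both when an expression is appended via #<< and when a subexpression is cloned (clones are re-parented to the copy). A brief sketch:

    require 'regexp_parser'

    root  = Regexp::Parser.parse(/(ab)+/)
    group = root[0]

    group[0].parent.equal?(group)  # => true - the literal points back to its group

    copy = root.clone
    copy[0].parent.equal?(copy)    # => true - cloned child re-parented to the copy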
@@ -29,6 +29,8 @@ require 'regexp_parser/expression/methods/human_name'
 require 'regexp_parser/expression/methods/match'
 require 'regexp_parser/expression/methods/match_length'
 require 'regexp_parser/expression/methods/options'
+require 'regexp_parser/expression/methods/parts'
+require 'regexp_parser/expression/methods/printing'
 require 'regexp_parser/expression/methods/strfregexp'
 require 'regexp_parser/expression/methods/tests'
 require 'regexp_parser/expression/methods/traverse'
@@ -6,57 +6,75 @@ class Regexp::Lexer

   OPENING_TOKENS = %i[
     capture passive lookahead nlookahead lookbehind nlookbehind
-    atomic options options_switch named absence
+    atomic options options_switch named absence open
   ].freeze

   CLOSING_TOKENS = %i[close].freeze

   CONDITION_TOKENS = %i[condition condition_close].freeze

-  def self.lex(input, syntax = "ruby/#{RUBY_VERSION}", options: nil, &block)
-    new.lex(input, syntax, options: options, &block)
+  def self.lex(input, syntax = nil, options: nil, collect_tokens: true, &block)
+    new.lex(input, syntax, options: options, collect_tokens: collect_tokens, &block)
   end

-  def lex(input, syntax = "ruby/#{RUBY_VERSION}", options: nil, &block)
-    syntax = Regexp::Syntax.for(syntax)
+  def lex(input, syntax = nil, options: nil, collect_tokens: true, &block)
+    syntax = syntax ? Regexp::Syntax.for(syntax) : Regexp::Syntax::CURRENT

+    self.block = block
+    self.collect_tokens = collect_tokens
     self.tokens = []
+    self.prev_token = nil
+    self.preprev_token = nil
     self.nesting = 0
     self.set_nesting = 0
     self.conditional_nesting = 0
     self.shift = 0

-    last = nil
-    Regexp::Scanner.scan(input, options: options) do |type, token, text, ts, te|
+    Regexp::Scanner.scan(input, options: options, collect_tokens: false) do |type, token, text, ts, te|
       type, token = *syntax.normalize(type, token)
       syntax.check! type, token

       ascend(type, token)

-      if type == :quantifier and last
-        break_literal(last) if last.type == :literal
-        break_codepoint_list(last) if last.token == :codepoint_list
+      if (last = prev_token) &&
+         type == :quantifier &&
+         (
+           (last.type == :literal && (parts = break_literal(last))) ||
+           (last.token == :codepoint_list && (parts = break_codepoint_list(last)))
+         )
+        emit(parts[0])
+        last = parts[1]
       end

       current = Regexp::Token.new(type, token, text, ts + shift, te + shift,
                                   nesting, set_nesting, conditional_nesting)

-      current = merge_condition(current) if type == :conditional and
-        CONDITION_TOKENS.include?(token)
-
-      last.next = current if last
-      current.previous = last if last
+      if type == :conditional && CONDITION_TOKENS.include?(token)
+        current = merge_condition(current, last)
+      elsif last
+        last.next = current
+        current.previous = last
+        emit(last)
+      end

-      tokens << current
-      last = current
+      self.preprev_token = last
+      self.prev_token = current

       descend(type, token)
     end

-    if block_given?
-      tokens.map { |t| block.call(t) }
+    emit(prev_token) if prev_token
+
+    collect_tokens ? tokens : nil
+  end
+
+  def emit(token)
+    if block
+      # TODO: in v3.0.0, remove `collect_tokens:` kwarg and only collect w/o block
+      res = block.call(token)
+      tokens << res if collect_tokens
     else
-      tokens
+      tokens << token
     end
   end

@@ -66,27 +84,37 @@ class Regexp::Lexer

   private

-  attr_accessor :tokens, :nesting, :set_nesting, :conditional_nesting, :shift
+  attr_accessor :block,
+                :collect_tokens, :tokens, :prev_token, :preprev_token,
+                :nesting, :set_nesting, :conditional_nesting, :shift

   def ascend(type, token)
+    return unless CLOSING_TOKENS.include?(token)
+
     case type
     when :group, :assertion
-      self.nesting = nesting - 1 if CLOSING_TOKENS.include?(token)
+      self.nesting = nesting - 1
     when :set
-      self.set_nesting = set_nesting - 1 if token == :close
+      self.set_nesting = set_nesting - 1
     when :conditional
-      self.conditional_nesting = conditional_nesting - 1 if token == :close
+      self.conditional_nesting = conditional_nesting - 1
+    else
+      raise "unhandled nesting type #{type}"
     end
   end

   def descend(type, token)
+    return unless OPENING_TOKENS.include?(token)
+
     case type
     when :group, :assertion
-      self.nesting = nesting + 1 if OPENING_TOKENS.include?(token)
+      self.nesting = nesting + 1
     when :set
-      self.set_nesting = set_nesting + 1 if token == :open
+      self.set_nesting = set_nesting + 1
     when :conditional
-      self.conditional_nesting = conditional_nesting + 1 if token == :open
+      self.conditional_nesting = conditional_nesting + 1
+    else
+      raise "unhandled nesting type #{type}"
     end
   end

@@ -96,34 +124,46 @@ class Regexp::Lexer
     lead, last, _ = token.text.partition(/.\z/mu)
     return if lead.empty?

-    tokens.pop
-    tokens << Regexp::Token.new(:literal, :literal, lead,
+    token_1 = Regexp::Token.new(:literal, :literal, lead,
               token.ts, (token.te - last.length),
               nesting, set_nesting, conditional_nesting)
-    tokens << Regexp::Token.new(:literal, :literal, last,
+    token_2 = Regexp::Token.new(:literal, :literal, last,
               (token.ts + lead.length), token.te,
               nesting, set_nesting, conditional_nesting)
+
+    token_1.previous = preprev_token
+    token_1.next = token_2
+    token_2.previous = token_1 # .next will be set by #lex
+    [token_1, token_2]
   end

+  # if a codepoint list is followed by a quantifier, that quantifier applies
+  # to the last codepoint, e.g. /\u{61 62 63}{3}/ =~ 'abccc'
+  # c.f. #break_literal.
   def break_codepoint_list(token)
     lead, _, tail = token.text.rpartition(' ')
     return if lead.empty?

-    tokens.pop
-    tokens << Regexp::Token.new(:escape, :codepoint_list, lead + '}',
+    token_1 = Regexp::Token.new(:escape, :codepoint_list, lead + '}',
               token.ts, (token.te - tail.length),
               nesting, set_nesting, conditional_nesting)
-    tokens << Regexp::Token.new(:escape, :codepoint_list, '\u{' + tail,
+    token_2 = Regexp::Token.new(:escape, :codepoint_list, '\u{' + tail,
               (token.ts + lead.length + 1), (token.te + 3),
               nesting, set_nesting, conditional_nesting)

     self.shift = shift + 3 # one space less, but extra \, u, {, and }
+
+    token_1.previous = preprev_token
+    token_1.next = token_2
+    token_2.previous = token_1 # .next will be set by #lex
+    [token_1, token_2]
   end

-  def merge_condition(current)
-    last = tokens.pop
-    Regexp::Token.new(:conditional, :condition, last.text + current.text,
+  def merge_condition(current, last)
+    token = Regexp::Token.new(:conditional, :condition, last.text + current.text,
       last.ts, current.te, nesting, set_nesting, conditional_nesting)
+    token.previous = preprev_token # .next will be set by #lex
+    token
   end

 end # module Regexp::Lexer
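
Note on the lexer changes above: Regexp::Lexer.lex now defaults the syntax argument to the running Ruby's syntax, and (like Regexp::Scanner.scan) accepts collect_tokens: alongside a block, so tokens can be streamed without building a result array. A sketch of both call styles, assuming the block receives Regexp::Token instances as before:

    require 'regexp_parser'

    # Default: collect and return the token array
    # (with a block, the block's return values are collected instead).
    tokens = Regexp::Lexer.lex(/a+|b/)
    tokens.map(&:token)  # => [:literal, :one_or_more, :alternation, :literal]

    # Streaming: handle each token in the block, collect nothing, return nil.
    Regexp::Lexer.lex(/a+|b/, collect_tokens: false) do |token|
      puts "#{token.type} #{token.token} #{token.text.inspect} @ #{token.ts}"
    end  # => nil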