regexp_parser 2.6.2 → 2.8.0

Sign up to get free protection for your applications and access to all the features.
Files changed (45)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +67 -0
  3. data/Gemfile +2 -2
  4. data/README.md +32 -29
  5. data/lib/regexp_parser/expression/base.rb +0 -7
  6. data/lib/regexp_parser/expression/classes/alternation.rb +1 -1
  7. data/lib/regexp_parser/expression/classes/backreference.rb +4 -2
  8. data/lib/regexp_parser/expression/classes/character_set/range.rb +2 -7
  9. data/lib/regexp_parser/expression/classes/character_set.rb +3 -4
  10. data/lib/regexp_parser/expression/classes/conditional.rb +2 -6
  11. data/lib/regexp_parser/expression/classes/escape_sequence.rb +3 -1
  12. data/lib/regexp_parser/expression/classes/free_space.rb +3 -1
  13. data/lib/regexp_parser/expression/classes/group.rb +0 -22
  14. data/lib/regexp_parser/expression/classes/posix_class.rb +5 -1
  15. data/lib/regexp_parser/expression/classes/unicode_property.rb +5 -2
  16. data/lib/regexp_parser/expression/methods/construct.rb +2 -4
  17. data/lib/regexp_parser/expression/methods/parts.rb +23 -0
  18. data/lib/regexp_parser/expression/methods/printing.rb +26 -0
  19. data/lib/regexp_parser/expression/methods/tests.rb +40 -3
  20. data/lib/regexp_parser/expression/methods/traverse.rb +35 -19
  21. data/lib/regexp_parser/expression/quantifier.rb +30 -17
  22. data/lib/regexp_parser/expression/sequence.rb +5 -10
  23. data/lib/regexp_parser/expression/sequence_operation.rb +4 -9
  24. data/lib/regexp_parser/expression/shared.rb +37 -20
  25. data/lib/regexp_parser/expression/subexpression.rb +20 -15
  26. data/lib/regexp_parser/expression.rb +2 -0
  27. data/lib/regexp_parser/lexer.rb +76 -36
  28. data/lib/regexp_parser/parser.rb +97 -97
  29. data/lib/regexp_parser/scanner/errors/premature_end_error.rb +8 -0
  30. data/lib/regexp_parser/scanner/errors/scanner_error.rb +6 -0
  31. data/lib/regexp_parser/scanner/errors/validation_error.rb +63 -0
  32. data/lib/regexp_parser/scanner/mapping.rb +89 -0
  33. data/lib/regexp_parser/scanner/property.rl +2 -2
  34. data/lib/regexp_parser/scanner/scanner.rl +90 -169
  35. data/lib/regexp_parser/scanner.rb +1157 -1330
  36. data/lib/regexp_parser/syntax/token/backreference.rb +3 -0
  37. data/lib/regexp_parser/syntax/token/character_set.rb +3 -0
  38. data/lib/regexp_parser/syntax/token/escape.rb +3 -1
  39. data/lib/regexp_parser/syntax/token/meta.rb +9 -2
  40. data/lib/regexp_parser/syntax/token/unicode_property.rb +3 -0
  41. data/lib/regexp_parser/syntax/token/virtual.rb +11 -0
  42. data/lib/regexp_parser/syntax/version_lookup.rb +0 -8
  43. data/lib/regexp_parser/syntax/versions.rb +2 -0
  44. data/lib/regexp_parser/version.rb +1 -1
  45. metadata +10 -3
@@ -8,14 +8,10 @@ module Regexp::Expression
8
8
 
9
9
  MODES = %i[greedy possessive reluctant]
10
10
 
11
- attr_reader :min, :max, :mode
12
-
13
11
  def initialize(*args)
14
12
  deprecated_old_init(*args) and return if args.count == 4 || args.count == 5
15
13
 
16
14
  init_from_token_and_options(*args)
17
- @mode = (token.to_s[/greedy|reluctant|possessive/] || :greedy).to_sym
18
- @min, @max = minmax
19
15
  # TODO: remove in v3.0.0, stop removing parts of #token (?)
20
16
  self.token = token.to_s.sub(/_(greedy|possessive|reluctant)/, '').to_sym
21
17
  end
@@ -39,9 +35,21 @@ module Regexp::Expression
39
35
  end
40
36
  alias :lazy? :reluctant?
41
37
 
38
+ def min
39
+ derived_data[:min]
40
+ end
41
+
42
+ def max
43
+ derived_data[:max]
44
+ end
45
+
46
+ def mode
47
+ derived_data[:mode]
48
+ end
49
+
42
50
  private
43
51
 
44
- def deprecated_old_init(token, text, min, max, mode = :greedy)
52
+ def deprecated_old_init(token, text, _min, _max, _mode = :greedy)
45
53
  warn "Calling `Expression::Base#quantify` or `#{self.class}.new` with 4+ arguments "\
46
54
  "is deprecated.\nIt will no longer be supported in regexp_parser v3.0.0.\n"\
47
55
  "Please pass a Regexp::Token instead, e.g. replace `token, text, min, max, mode` "\
@@ -51,20 +59,25 @@ module Regexp::Expression
51
59
  "This is consistent with how Expression::Base instances are created. "
52
60
  @token = token
53
61
  @text = text
54
- @min = min
55
- @max = max
56
- @mode = mode
57
62
  end
58
63
 
59
- def minmax
60
- case token
61
- when /zero_or_one/ then [0, 1]
62
- when /zero_or_more/ then [0, -1]
63
- when /one_or_more/ then [1, -1]
64
- when :interval
65
- int_min = text[/\{(\d*)/, 1]
66
- int_max = text[/,?(\d*)\}/, 1]
67
- [int_min.to_i, (int_max.empty? ? -1 : int_max.to_i)]
64
+ def derived_data
65
+ @derived_data ||= begin
66
+ min, max =
67
+ case text[0]
68
+ when '?'; [0, 1]
69
+ when '*'; [0, -1]
70
+ when '+'; [1, -1]
71
+ else
72
+ int_min = text[/\{(\d*)/, 1]
73
+ int_max = text[/,?(\d*)\}/, 1]
74
+ [int_min.to_i, (int_max.empty? ? -1 : int_max.to_i)]
75
+ end
76
+
77
+ mod = text[/.([?+])/, 1]
78
+ mode = (mod == '?' && :reluctant) || (mod == '+' && :possessive) || :greedy
79
+
80
+ { min: min, max: max, mode: mode }
68
81
  end
69
82
  end
70
83
  end
@@ -12,25 +12,20 @@ module Regexp::Expression
12
12
  level: exp.level,
13
13
  set_level: exp.set_level,
14
14
  conditional_level: params[:conditional_level] || exp.conditional_level,
15
+ ts: params[:ts],
15
16
  )
16
- sequence.nesting_level = exp.nesting_level + 1
17
17
  sequence.options = active_opts
18
18
  exp.expressions << sequence
19
19
  sequence
20
20
  end
21
21
  end
22
22
 
23
- def starts_at
24
- expressions.first.starts_at
23
+ def ts
24
+ (head = expressions.first) ? head.ts : @ts
25
25
  end
26
- alias :ts :starts_at
27
26
 
28
- def quantify(*args)
29
- target = expressions.reverse.find { |exp| !exp.is_a?(FreeSpace) }
30
- target or raise Regexp::Parser::Error,
31
- "No valid target found for '#{text}' quantifier"
32
-
33
- target.quantify(*args)
27
+ def quantify(token, *args)
28
+ extract_quantifier_target(token.text).quantify(token, *args)
34
29
  end
35
30
  end
36
31
  end
@@ -5,21 +5,16 @@ module Regexp::Expression
5
5
  alias :operands :expressions
6
6
  alias :operator :text
7
7
 
8
- def starts_at
9
- expressions.first.starts_at
8
+ def ts
9
+ (head = expressions.first) ? head.ts : @ts
10
10
  end
11
- alias :ts :starts_at
12
11
 
13
12
  def <<(exp)
14
13
  expressions.last << exp
15
14
  end
16
15
 
17
- def add_sequence(active_opts = {})
18
- self.class::OPERAND.add_to(self, {}, active_opts)
19
- end
20
-
21
- def parts
22
- intersperse(expressions, text.dup)
16
+ def add_sequence(active_opts = {}, params = { ts: 0 })
17
+ self.class::OPERAND.add_to(self, params, active_opts)
23
18
  end
24
19
  end
25
20
  end
@@ -8,7 +8,8 @@ module Regexp::Expression
8
8
 
9
9
  attr_accessor :type, :token, :text, :ts, :te,
10
10
  :level, :set_level, :conditional_level,
11
- :options
11
+ :options, :parent,
12
+ :custom_to_s_handling, :pre_quantifier_decorations
12
13
 
13
14
  attr_reader :nesting_level, :quantifier
14
15
  end
@@ -32,6 +33,10 @@ module Regexp::Expression
32
33
  self.text = orig.text.dup if orig.text
33
34
  self.options = orig.options.dup if orig.options
34
35
  self.quantifier = orig.quantifier.clone if orig.quantifier
36
+ self.parent = nil # updated by Subexpression#initialize_copy
37
+ if orig.pre_quantifier_decorations
38
+ self.pre_quantifier_decorations = orig.pre_quantifier_decorations.map(&:dup)
39
+ end
35
40
  super
36
41
  end
37
42
 
@@ -39,35 +44,51 @@ module Regexp::Expression
39
44
  ts
40
45
  end
41
46
 
47
+ def ends_at(include_quantifier = true)
48
+ ts + (include_quantifier ? full_length : base_length)
49
+ end
50
+
42
51
  def base_length
43
52
  to_s(:base).length
44
53
  end
45
54
 
46
55
  def full_length
47
- to_s.length
48
- end
49
-
56
+ to_s(:original).length
57
+ end
58
+
59
+ # #to_s reproduces the original source, as an unparser would.
60
+ #
61
+ # It takes an optional format argument.
62
+ #
63
+ # Example:
64
+ #
65
+ # lit = Regexp::Parser.parse(/a +/x)[0]
66
+ #
67
+ # lit.to_s # => 'a+' # default; with quantifier
68
+ # lit.to_s(:full) # => 'a+' # default; with quantifier
69
+ # lit.to_s(:base) # => 'a' # without quantifier
70
+ # lit.to_s(:original) # => 'a +' # with quantifier AND intermittent decorations
71
+ #
50
72
  def to_s(format = :full)
51
- "#{parts.join}#{quantifier_affix(format)}"
73
+ base = parts.each_with_object(''.dup) do |part, buff|
74
+ if part.instance_of?(String)
75
+ buff << part
76
+ elsif !part.custom_to_s_handling
77
+ buff << part.to_s(:original)
78
+ end
79
+ end
80
+ "#{base}#{pre_quantifier_decoration(format)}#{quantifier_affix(format)}"
52
81
  end
53
82
  alias :to_str :to_s
54
83
 
55
- def parts
56
- [text.dup]
84
+ def pre_quantifier_decoration(expression_format = :original)
85
+ pre_quantifier_decorations.to_a.join if expression_format == :original
57
86
  end
58
87
 
59
- def quantifier_affix(expression_format)
88
+ def quantifier_affix(expression_format = :full)
60
89
  quantifier.to_s if quantified? && expression_format != :base
61
90
  end
62
91
 
63
- def quantified?
64
- !quantifier.nil?
65
- end
66
-
67
- def optional?
68
- quantified? && quantifier.min == 0
69
- end
70
-
71
92
  def offset
72
93
  [starts_at, full_length]
73
94
  end
@@ -76,10 +97,6 @@ module Regexp::Expression
76
97
  '@%d+%d' % offset
77
98
  end
78
99
 
79
- def terminal?
80
- !respond_to?(:expressions)
81
- end
82
-
83
100
  def nesting_level=(lvl)
84
101
  @nesting_level = lvl
85
102
  quantifier && quantifier.nesting_level = lvl
@@ -11,17 +11,15 @@ module Regexp::Expression
11
11
 
12
12
  # Override base method to clone the expressions as well.
13
13
  def initialize_copy(orig)
14
- self.expressions = orig.expressions.map(&:clone)
14
+ self.expressions = orig.expressions.map do |exp|
15
+ exp.clone.tap { |copy| copy.parent = self }
16
+ end
15
17
  super
16
18
  end
17
19
 
18
20
  def <<(exp)
19
- if exp.is_a?(WhiteSpace) && last && last.is_a?(WhiteSpace)
20
- last.merge(exp)
21
- else
22
- exp.nesting_level = nesting_level + 1
23
- expressions << exp
24
- end
21
+ exp.parent = self
22
+ expressions << exp
25
23
  end
26
24
 
27
25
  %w[[] at each empty? fetch index join last length values_at].each do |method|
@@ -39,11 +37,7 @@ module Regexp::Expression
39
37
  end
40
38
 
41
39
  def te
42
- ts + to_s.length
43
- end
44
-
45
- def parts
46
- expressions
40
+ ts + base_length
47
41
  end
48
42
 
49
43
  def to_h
@@ -53,10 +47,21 @@ module Regexp::Expression
53
47
  )
54
48
  end
55
49
 
56
- private
50
+ def extract_quantifier_target(quantifier_description)
51
+ pre_quantifier_decorations = []
52
+ target = expressions.reverse.find do |exp|
53
+ if exp.decorative?
54
+ exp.custom_to_s_handling = true
55
+ pre_quantifier_decorations << exp.text
56
+ next
57
+ end
58
+ exp
59
+ end
60
+ target or raise Regexp::Parser::ParserError,
61
+ "No valid target found for '#{quantifier_description}' quantifier"
57
62
 
58
- def intersperse(expressions, separator)
59
- expressions.flat_map { |exp| [exp, separator] }.slice(0...-1)
63
+ target.pre_quantifier_decorations = pre_quantifier_decorations
64
+ target
60
65
  end
61
66
  end
62
67
  end
@@ -29,6 +29,8 @@ require 'regexp_parser/expression/methods/human_name'
29
29
  require 'regexp_parser/expression/methods/match'
30
30
  require 'regexp_parser/expression/methods/match_length'
31
31
  require 'regexp_parser/expression/methods/options'
32
+ require 'regexp_parser/expression/methods/parts'
33
+ require 'regexp_parser/expression/methods/printing'
32
34
  require 'regexp_parser/expression/methods/strfregexp'
33
35
  require 'regexp_parser/expression/methods/tests'
34
36
  require 'regexp_parser/expression/methods/traverse'
@@ -6,57 +6,75 @@ class Regexp::Lexer
6
6
 
7
7
  OPENING_TOKENS = %i[
8
8
  capture passive lookahead nlookahead lookbehind nlookbehind
9
- atomic options options_switch named absence
9
+ atomic options options_switch named absence open
10
10
  ].freeze
11
11
 
12
12
  CLOSING_TOKENS = %i[close].freeze
13
13
 
14
14
  CONDITION_TOKENS = %i[condition condition_close].freeze
15
15
 
16
- def self.lex(input, syntax = "ruby/#{RUBY_VERSION}", options: nil, &block)
17
- new.lex(input, syntax, options: options, &block)
16
+ def self.lex(input, syntax = nil, options: nil, collect_tokens: true, &block)
17
+ new.lex(input, syntax, options: options, collect_tokens: collect_tokens, &block)
18
18
  end
19
19
 
20
- def lex(input, syntax = "ruby/#{RUBY_VERSION}", options: nil, &block)
21
- syntax = Regexp::Syntax.for(syntax)
20
+ def lex(input, syntax = nil, options: nil, collect_tokens: true, &block)
21
+ syntax = syntax ? Regexp::Syntax.for(syntax) : Regexp::Syntax::CURRENT
22
22
 
23
+ self.block = block
24
+ self.collect_tokens = collect_tokens
23
25
  self.tokens = []
26
+ self.prev_token = nil
27
+ self.preprev_token = nil
24
28
  self.nesting = 0
25
29
  self.set_nesting = 0
26
30
  self.conditional_nesting = 0
27
31
  self.shift = 0
28
32
 
29
- last = nil
30
- Regexp::Scanner.scan(input, options: options) do |type, token, text, ts, te|
33
+ Regexp::Scanner.scan(input, options: options, collect_tokens: false) do |type, token, text, ts, te|
31
34
  type, token = *syntax.normalize(type, token)
32
35
  syntax.check! type, token
33
36
 
34
37
  ascend(type, token)
35
38
 
36
- if type == :quantifier and last
37
- break_literal(last) if last.type == :literal
38
- break_codepoint_list(last) if last.token == :codepoint_list
39
+ if (last = prev_token) &&
40
+ type == :quantifier &&
41
+ (
42
+ (last.type == :literal && (parts = break_literal(last))) ||
43
+ (last.token == :codepoint_list && (parts = break_codepoint_list(last)))
44
+ )
45
+ emit(parts[0])
46
+ last = parts[1]
39
47
  end
40
48
 
41
49
  current = Regexp::Token.new(type, token, text, ts + shift, te + shift,
42
50
  nesting, set_nesting, conditional_nesting)
43
51
 
44
- current = merge_condition(current) if type == :conditional and
45
- CONDITION_TOKENS.include?(token)
46
-
47
- last.next = current if last
48
- current.previous = last if last
52
+ if type == :conditional && CONDITION_TOKENS.include?(token)
53
+ current = merge_condition(current, last)
54
+ elsif last
55
+ last.next = current
56
+ current.previous = last
57
+ emit(last)
58
+ end
49
59
 
50
- tokens << current
51
- last = current
60
+ self.preprev_token = last
61
+ self.prev_token = current
52
62
 
53
63
  descend(type, token)
54
64
  end
55
65
 
56
- if block_given?
57
- tokens.map { |t| block.call(t) }
66
+ emit(prev_token) if prev_token
67
+
68
+ collect_tokens ? tokens : nil
69
+ end
70
+
71
+ def emit(token)
72
+ if block
73
+ # TODO: in v3.0.0, remove `collect_tokens:` kwarg and only collect w/o block
74
+ res = block.call(token)
75
+ tokens << res if collect_tokens
58
76
  else
59
- tokens
77
+ tokens << token
60
78
  end
61
79
  end
62
80
 
@@ -66,27 +84,37 @@ class Regexp::Lexer
66
84
 
67
85
  private
68
86
 
69
- attr_accessor :tokens, :nesting, :set_nesting, :conditional_nesting, :shift
87
+ attr_accessor :block,
88
+ :collect_tokens, :tokens, :prev_token, :preprev_token,
89
+ :nesting, :set_nesting, :conditional_nesting, :shift
70
90
 
71
91
  def ascend(type, token)
92
+ return unless CLOSING_TOKENS.include?(token)
93
+
72
94
  case type
73
95
  when :group, :assertion
74
- self.nesting = nesting - 1 if CLOSING_TOKENS.include?(token)
96
+ self.nesting = nesting - 1
75
97
  when :set
76
- self.set_nesting = set_nesting - 1 if token == :close
98
+ self.set_nesting = set_nesting - 1
77
99
  when :conditional
78
- self.conditional_nesting = conditional_nesting - 1 if token == :close
100
+ self.conditional_nesting = conditional_nesting - 1
101
+ else
102
+ raise "unhandled nesting type #{type}"
79
103
  end
80
104
  end
81
105
 
82
106
  def descend(type, token)
107
+ return unless OPENING_TOKENS.include?(token)
108
+
83
109
  case type
84
110
  when :group, :assertion
85
- self.nesting = nesting + 1 if OPENING_TOKENS.include?(token)
111
+ self.nesting = nesting + 1
86
112
  when :set
87
- self.set_nesting = set_nesting + 1 if token == :open
113
+ self.set_nesting = set_nesting + 1
88
114
  when :conditional
89
- self.conditional_nesting = conditional_nesting + 1 if token == :open
115
+ self.conditional_nesting = conditional_nesting + 1
116
+ else
117
+ raise "unhandled nesting type #{type}"
90
118
  end
91
119
  end
92
120
 
@@ -96,34 +124,46 @@ class Regexp::Lexer
96
124
  lead, last, _ = token.text.partition(/.\z/mu)
97
125
  return if lead.empty?
98
126
 
99
- tokens.pop
100
- tokens << Regexp::Token.new(:literal, :literal, lead,
127
+ token_1 = Regexp::Token.new(:literal, :literal, lead,
101
128
  token.ts, (token.te - last.length),
102
129
  nesting, set_nesting, conditional_nesting)
103
- tokens << Regexp::Token.new(:literal, :literal, last,
130
+ token_2 = Regexp::Token.new(:literal, :literal, last,
104
131
  (token.ts + lead.length), token.te,
105
132
  nesting, set_nesting, conditional_nesting)
133
+
134
+ token_1.previous = preprev_token
135
+ token_1.next = token_2
136
+ token_2.previous = token_1 # .next will be set by #lex
137
+ [token_1, token_2]
106
138
  end
107
139
 
140
+ # if a codepoint list is followed by a quantifier, that quantifier applies
141
+ # to the last codepoint, e.g. /\u{61 62 63}{3}/ =~ 'abccc'
142
+ # c.f. #break_literal.
108
143
  def break_codepoint_list(token)
109
144
  lead, _, tail = token.text.rpartition(' ')
110
145
  return if lead.empty?
111
146
 
112
- tokens.pop
113
- tokens << Regexp::Token.new(:escape, :codepoint_list, lead + '}',
147
+ token_1 = Regexp::Token.new(:escape, :codepoint_list, lead + '}',
114
148
  token.ts, (token.te - tail.length),
115
149
  nesting, set_nesting, conditional_nesting)
116
- tokens << Regexp::Token.new(:escape, :codepoint_list, '\u{' + tail,
150
+ token_2 = Regexp::Token.new(:escape, :codepoint_list, '\u{' + tail,
117
151
  (token.ts + lead.length + 1), (token.te + 3),
118
152
  nesting, set_nesting, conditional_nesting)
119
153
 
120
154
  self.shift = shift + 3 # one space less, but extra \, u, {, and }
155
+
156
+ token_1.previous = preprev_token
157
+ token_1.next = token_2
158
+ token_2.previous = token_1 # .next will be set by #lex
159
+ [token_1, token_2]
121
160
  end
122
161
 
123
- def merge_condition(current)
124
- last = tokens.pop
125
- Regexp::Token.new(:conditional, :condition, last.text + current.text,
162
+ def merge_condition(current, last)
163
+ token = Regexp::Token.new(:conditional, :condition, last.text + current.text,
126
164
  last.ts, current.te, nesting, set_nesting, conditional_nesting)
165
+ token.previous = preprev_token # .next will be set by #lex
166
+ token
127
167
  end
128
168
 
129
169
  end # module Regexp::Lexer