regexp_parser 2.6.0 → 2.9.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (54) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile +5 -5
  3. data/LICENSE +1 -1
  4. data/lib/regexp_parser/expression/base.rb +0 -7
  5. data/lib/regexp_parser/expression/classes/alternation.rb +1 -1
  6. data/lib/regexp_parser/expression/classes/backreference.rb +17 -3
  7. data/lib/regexp_parser/expression/classes/character_set/range.rb +2 -7
  8. data/lib/regexp_parser/expression/classes/character_set.rb +4 -8
  9. data/lib/regexp_parser/expression/classes/conditional.rb +2 -6
  10. data/lib/regexp_parser/expression/classes/escape_sequence.rb +3 -1
  11. data/lib/regexp_parser/expression/classes/free_space.rb +3 -1
  12. data/lib/regexp_parser/expression/classes/group.rb +0 -22
  13. data/lib/regexp_parser/expression/classes/keep.rb +1 -1
  14. data/lib/regexp_parser/expression/classes/posix_class.rb +5 -5
  15. data/lib/regexp_parser/expression/classes/unicode_property.rb +11 -11
  16. data/lib/regexp_parser/expression/methods/construct.rb +2 -4
  17. data/lib/regexp_parser/expression/methods/match_length.rb +8 -4
  18. data/lib/regexp_parser/expression/methods/negative.rb +20 -0
  19. data/lib/regexp_parser/expression/methods/parts.rb +23 -0
  20. data/lib/regexp_parser/expression/methods/printing.rb +26 -0
  21. data/lib/regexp_parser/expression/methods/tests.rb +40 -3
  22. data/lib/regexp_parser/expression/methods/traverse.rb +35 -19
  23. data/lib/regexp_parser/expression/quantifier.rb +30 -17
  24. data/lib/regexp_parser/expression/sequence.rb +5 -10
  25. data/lib/regexp_parser/expression/sequence_operation.rb +4 -9
  26. data/lib/regexp_parser/expression/shared.rb +37 -20
  27. data/lib/regexp_parser/expression/subexpression.rb +20 -15
  28. data/lib/regexp_parser/expression.rb +34 -31
  29. data/lib/regexp_parser/lexer.rb +76 -36
  30. data/lib/regexp_parser/parser.rb +101 -100
  31. data/lib/regexp_parser/scanner/errors/premature_end_error.rb +8 -0
  32. data/lib/regexp_parser/scanner/errors/scanner_error.rb +6 -0
  33. data/lib/regexp_parser/scanner/errors/validation_error.rb +63 -0
  34. data/lib/regexp_parser/scanner/properties/long.csv +29 -0
  35. data/lib/regexp_parser/scanner/properties/short.csv +3 -0
  36. data/lib/regexp_parser/scanner/property.rl +2 -2
  37. data/lib/regexp_parser/scanner/scanner.rl +101 -172
  38. data/lib/regexp_parser/scanner.rb +1132 -1283
  39. data/lib/regexp_parser/syntax/token/backreference.rb +3 -0
  40. data/lib/regexp_parser/syntax/token/character_set.rb +3 -0
  41. data/lib/regexp_parser/syntax/token/escape.rb +3 -1
  42. data/lib/regexp_parser/syntax/token/meta.rb +9 -2
  43. data/lib/regexp_parser/syntax/token/unicode_property.rb +35 -1
  44. data/lib/regexp_parser/syntax/token/virtual.rb +11 -0
  45. data/lib/regexp_parser/syntax/token.rb +13 -13
  46. data/lib/regexp_parser/syntax/version_lookup.rb +0 -8
  47. data/lib/regexp_parser/syntax/versions.rb +3 -1
  48. data/lib/regexp_parser/syntax.rb +1 -1
  49. data/lib/regexp_parser/version.rb +1 -1
  50. data/lib/regexp_parser.rb +6 -6
  51. data/regexp_parser.gemspec +5 -5
  52. metadata +14 -8
  53. data/CHANGELOG.md +0 -601
  54. data/README.md +0 -503
@@ -5,21 +5,16 @@ module Regexp::Expression
5
5
  alias :operands :expressions
6
6
  alias :operator :text
7
7
 
8
- def starts_at
9
- expressions.first.starts_at
8
+ def ts
9
+ (head = expressions.first) ? head.ts : @ts
10
10
  end
11
- alias :ts :starts_at
12
11
 
13
12
  def <<(exp)
14
13
  expressions.last << exp
15
14
  end
16
15
 
17
- def add_sequence(active_opts = {})
18
- self.class::OPERAND.add_to(self, {}, active_opts)
19
- end
20
-
21
- def parts
22
- intersperse(expressions, text.dup)
16
+ def add_sequence(active_opts = {}, params = { ts: 0 })
17
+ self.class::OPERAND.add_to(self, params, active_opts)
23
18
  end
24
19
  end
25
20
  end
@@ -8,7 +8,8 @@ module Regexp::Expression
8
8
 
9
9
  attr_accessor :type, :token, :text, :ts, :te,
10
10
  :level, :set_level, :conditional_level,
11
- :options
11
+ :options, :parent,
12
+ :custom_to_s_handling, :pre_quantifier_decorations
12
13
 
13
14
  attr_reader :nesting_level, :quantifier
14
15
  end
@@ -32,6 +33,10 @@ module Regexp::Expression
32
33
  self.text = orig.text.dup if orig.text
33
34
  self.options = orig.options.dup if orig.options
34
35
  self.quantifier = orig.quantifier.clone if orig.quantifier
36
+ self.parent = nil # updated by Subexpression#initialize_copy
37
+ if orig.pre_quantifier_decorations
38
+ self.pre_quantifier_decorations = orig.pre_quantifier_decorations.map(&:dup)
39
+ end
35
40
  super
36
41
  end
37
42
 
@@ -39,35 +44,51 @@ module Regexp::Expression
39
44
  ts
40
45
  end
41
46
 
47
+ def ends_at(include_quantifier = true)
48
+ ts + (include_quantifier ? full_length : base_length)
49
+ end
50
+
42
51
  def base_length
43
52
  to_s(:base).length
44
53
  end
45
54
 
46
55
  def full_length
47
- to_s.length
48
- end
49
-
56
+ to_s(:original).length
57
+ end
58
+
59
+ # #to_s reproduces the original source, as an unparser would.
60
+ #
61
+ # It takes an optional format argument.
62
+ #
63
+ # Example:
64
+ #
65
+ # lit = Regexp::Parser.parse(/a +/x)[0]
66
+ #
67
+ # lit.to_s # => 'a+' # default; with quantifier
68
+ # lit.to_s(:full) # => 'a+' # default; with quantifier
69
+ # lit.to_s(:base) # => 'a' # without quantifier
70
+ # lit.to_s(:original) # => 'a +' # with quantifier AND intermittent decorations
71
+ #
50
72
  def to_s(format = :full)
51
- "#{parts.join}#{quantifier_affix(format)}"
73
+ base = parts.each_with_object(''.dup) do |part, buff|
74
+ if part.instance_of?(String)
75
+ buff << part
76
+ elsif !part.custom_to_s_handling
77
+ buff << part.to_s(:original)
78
+ end
79
+ end
80
+ "#{base}#{pre_quantifier_decoration(format)}#{quantifier_affix(format)}"
52
81
  end
53
82
  alias :to_str :to_s
54
83
 
55
- def parts
56
- [text.dup]
84
+ def pre_quantifier_decoration(expression_format = :original)
85
+ pre_quantifier_decorations.to_a.join if expression_format == :original
57
86
  end
58
87
 
59
- def quantifier_affix(expression_format)
88
+ def quantifier_affix(expression_format = :full)
60
89
  quantifier.to_s if quantified? && expression_format != :base
61
90
  end
62
91
 
63
- def quantified?
64
- !quantifier.nil?
65
- end
66
-
67
- def optional?
68
- quantified? && quantifier.min == 0
69
- end
70
-
71
92
  def offset
72
93
  [starts_at, full_length]
73
94
  end
@@ -76,10 +97,6 @@ module Regexp::Expression
76
97
  '@%d+%d' % offset
77
98
  end
78
99
 
79
- def terminal?
80
- !respond_to?(:expressions)
81
- end
82
-
83
100
  def nesting_level=(lvl)
84
101
  @nesting_level = lvl
85
102
  quantifier && quantifier.nesting_level = lvl
@@ -11,17 +11,15 @@ module Regexp::Expression
11
11
 
12
12
  # Override base method to clone the expressions as well.
13
13
  def initialize_copy(orig)
14
- self.expressions = orig.expressions.map(&:clone)
14
+ self.expressions = orig.expressions.map do |exp|
15
+ exp.clone.tap { |copy| copy.parent = self }
16
+ end
15
17
  super
16
18
  end
17
19
 
18
20
  def <<(exp)
19
- if exp.is_a?(WhiteSpace) && last && last.is_a?(WhiteSpace)
20
- last.merge(exp)
21
- else
22
- exp.nesting_level = nesting_level + 1
23
- expressions << exp
24
- end
21
+ exp.parent = self
22
+ expressions << exp
25
23
  end
26
24
 
27
25
  %w[[] at each empty? fetch index join last length values_at].each do |method|
@@ -39,11 +37,7 @@ module Regexp::Expression
39
37
  end
40
38
 
41
39
  def te
42
- ts + to_s.length
43
- end
44
-
45
- def parts
46
- expressions
40
+ ts + base_length
47
41
  end
48
42
 
49
43
  def to_h
@@ -53,10 +47,21 @@ module Regexp::Expression
53
47
  )
54
48
  end
55
49
 
56
- private
50
+ def extract_quantifier_target(quantifier_description)
51
+ pre_quantifier_decorations = []
52
+ target = expressions.reverse.find do |exp|
53
+ if exp.decorative?
54
+ exp.custom_to_s_handling = true
55
+ pre_quantifier_decorations << exp.text
56
+ next
57
+ end
58
+ exp
59
+ end
60
+ target or raise Regexp::Parser::ParserError,
61
+ "No valid target found for '#{quantifier_description}' quantifier"
57
62
 
58
- def intersperse(expressions, separator)
59
- expressions.flat_map { |exp| [exp, separator] }.slice(0...-1)
63
+ target.pre_quantifier_decorations = pre_quantifier_decorations
64
+ target
60
65
  end
61
66
  end
62
67
  end
@@ -1,34 +1,37 @@
1
- require 'regexp_parser/error'
1
+ require_relative 'error'
2
2
 
3
- require 'regexp_parser/expression/shared'
4
- require 'regexp_parser/expression/base'
5
- require 'regexp_parser/expression/quantifier'
6
- require 'regexp_parser/expression/subexpression'
7
- require 'regexp_parser/expression/sequence'
8
- require 'regexp_parser/expression/sequence_operation'
3
+ require_relative 'expression/shared'
4
+ require_relative 'expression/base'
5
+ require_relative 'expression/quantifier'
6
+ require_relative 'expression/subexpression'
7
+ require_relative 'expression/sequence'
8
+ require_relative 'expression/sequence_operation'
9
9
 
10
- require 'regexp_parser/expression/classes/alternation'
11
- require 'regexp_parser/expression/classes/anchor'
12
- require 'regexp_parser/expression/classes/backreference'
13
- require 'regexp_parser/expression/classes/character_set'
14
- require 'regexp_parser/expression/classes/character_set/intersection'
15
- require 'regexp_parser/expression/classes/character_set/range'
16
- require 'regexp_parser/expression/classes/character_type'
17
- require 'regexp_parser/expression/classes/conditional'
18
- require 'regexp_parser/expression/classes/escape_sequence'
19
- require 'regexp_parser/expression/classes/free_space'
20
- require 'regexp_parser/expression/classes/group'
21
- require 'regexp_parser/expression/classes/keep'
22
- require 'regexp_parser/expression/classes/literal'
23
- require 'regexp_parser/expression/classes/posix_class'
24
- require 'regexp_parser/expression/classes/root'
25
- require 'regexp_parser/expression/classes/unicode_property'
10
+ require_relative 'expression/classes/alternation'
11
+ require_relative 'expression/classes/anchor'
12
+ require_relative 'expression/classes/backreference'
13
+ require_relative 'expression/classes/character_set'
14
+ require_relative 'expression/classes/character_set/intersection'
15
+ require_relative 'expression/classes/character_set/range'
16
+ require_relative 'expression/classes/character_type'
17
+ require_relative 'expression/classes/conditional'
18
+ require_relative 'expression/classes/escape_sequence'
19
+ require_relative 'expression/classes/free_space'
20
+ require_relative 'expression/classes/group'
21
+ require_relative 'expression/classes/keep'
22
+ require_relative 'expression/classes/literal'
23
+ require_relative 'expression/classes/posix_class'
24
+ require_relative 'expression/classes/root'
25
+ require_relative 'expression/classes/unicode_property'
26
26
 
27
- require 'regexp_parser/expression/methods/construct'
28
- require 'regexp_parser/expression/methods/human_name'
29
- require 'regexp_parser/expression/methods/match'
30
- require 'regexp_parser/expression/methods/match_length'
31
- require 'regexp_parser/expression/methods/options'
32
- require 'regexp_parser/expression/methods/strfregexp'
33
- require 'regexp_parser/expression/methods/tests'
34
- require 'regexp_parser/expression/methods/traverse'
27
+ require_relative 'expression/methods/construct'
28
+ require_relative 'expression/methods/human_name'
29
+ require_relative 'expression/methods/match'
30
+ require_relative 'expression/methods/match_length'
31
+ require_relative 'expression/methods/negative'
32
+ require_relative 'expression/methods/options'
33
+ require_relative 'expression/methods/parts'
34
+ require_relative 'expression/methods/printing'
35
+ require_relative 'expression/methods/strfregexp'
36
+ require_relative 'expression/methods/tests'
37
+ require_relative 'expression/methods/traverse'
@@ -6,57 +6,75 @@ class Regexp::Lexer
6
6
 
7
7
  OPENING_TOKENS = %i[
8
8
  capture passive lookahead nlookahead lookbehind nlookbehind
9
- atomic options options_switch named absence
9
+ atomic options options_switch named absence open
10
10
  ].freeze
11
11
 
12
12
  CLOSING_TOKENS = %i[close].freeze
13
13
 
14
14
  CONDITION_TOKENS = %i[condition condition_close].freeze
15
15
 
16
- def self.lex(input, syntax = "ruby/#{RUBY_VERSION}", options: nil, &block)
17
- new.lex(input, syntax, options: options, &block)
16
+ def self.lex(input, syntax = nil, options: nil, collect_tokens: true, &block)
17
+ new.lex(input, syntax, options: options, collect_tokens: collect_tokens, &block)
18
18
  end
19
19
 
20
- def lex(input, syntax = "ruby/#{RUBY_VERSION}", options: nil, &block)
21
- syntax = Regexp::Syntax.for(syntax)
20
+ def lex(input, syntax = nil, options: nil, collect_tokens: true, &block)
21
+ syntax = syntax ? Regexp::Syntax.for(syntax) : Regexp::Syntax::CURRENT
22
22
 
23
+ self.block = block
24
+ self.collect_tokens = collect_tokens
23
25
  self.tokens = []
26
+ self.prev_token = nil
27
+ self.preprev_token = nil
24
28
  self.nesting = 0
25
29
  self.set_nesting = 0
26
30
  self.conditional_nesting = 0
27
31
  self.shift = 0
28
32
 
29
- last = nil
30
- Regexp::Scanner.scan(input, options: options) do |type, token, text, ts, te|
33
+ Regexp::Scanner.scan(input, options: options, collect_tokens: false) do |type, token, text, ts, te|
31
34
  type, token = *syntax.normalize(type, token)
32
35
  syntax.check! type, token
33
36
 
34
37
  ascend(type, token)
35
38
 
36
- if type == :quantifier and last
37
- break_literal(last) if last.type == :literal
38
- break_codepoint_list(last) if last.token == :codepoint_list
39
+ if (last = prev_token) &&
40
+ type == :quantifier &&
41
+ (
42
+ (last.type == :literal && (parts = break_literal(last))) ||
43
+ (last.token == :codepoint_list && (parts = break_codepoint_list(last)))
44
+ )
45
+ emit(parts[0])
46
+ last = parts[1]
39
47
  end
40
48
 
41
49
  current = Regexp::Token.new(type, token, text, ts + shift, te + shift,
42
50
  nesting, set_nesting, conditional_nesting)
43
51
 
44
- current = merge_condition(current) if type == :conditional and
45
- CONDITION_TOKENS.include?(token)
46
-
47
- last.next = current if last
48
- current.previous = last if last
52
+ if type == :conditional && CONDITION_TOKENS.include?(token)
53
+ current = merge_condition(current, last)
54
+ elsif last
55
+ last.next = current
56
+ current.previous = last
57
+ emit(last)
58
+ end
49
59
 
50
- tokens << current
51
- last = current
60
+ self.preprev_token = last
61
+ self.prev_token = current
52
62
 
53
63
  descend(type, token)
54
64
  end
55
65
 
56
- if block_given?
57
- tokens.map { |t| block.call(t) }
66
+ emit(prev_token) if prev_token
67
+
68
+ collect_tokens ? tokens : nil
69
+ end
70
+
71
+ def emit(token)
72
+ if block
73
+ # TODO: in v3.0.0, remove `collect_tokens:` kwarg and only collect w/o block
74
+ res = block.call(token)
75
+ tokens << res if collect_tokens
58
76
  else
59
- tokens
77
+ tokens << token
60
78
  end
61
79
  end
62
80
 
@@ -66,27 +84,37 @@ class Regexp::Lexer
66
84
 
67
85
  private
68
86
 
69
- attr_accessor :tokens, :nesting, :set_nesting, :conditional_nesting, :shift
87
+ attr_accessor :block,
88
+ :collect_tokens, :tokens, :prev_token, :preprev_token,
89
+ :nesting, :set_nesting, :conditional_nesting, :shift
70
90
 
71
91
  def ascend(type, token)
92
+ return unless CLOSING_TOKENS.include?(token)
93
+
72
94
  case type
73
95
  when :group, :assertion
74
- self.nesting = nesting - 1 if CLOSING_TOKENS.include?(token)
96
+ self.nesting = nesting - 1
75
97
  when :set
76
- self.set_nesting = set_nesting - 1 if token == :close
98
+ self.set_nesting = set_nesting - 1
77
99
  when :conditional
78
- self.conditional_nesting = conditional_nesting - 1 if token == :close
100
+ self.conditional_nesting = conditional_nesting - 1
101
+ else
102
+ raise "unhandled nesting type #{type}"
79
103
  end
80
104
  end
81
105
 
82
106
  def descend(type, token)
107
+ return unless OPENING_TOKENS.include?(token)
108
+
83
109
  case type
84
110
  when :group, :assertion
85
- self.nesting = nesting + 1 if OPENING_TOKENS.include?(token)
111
+ self.nesting = nesting + 1
86
112
  when :set
87
- self.set_nesting = set_nesting + 1 if token == :open
113
+ self.set_nesting = set_nesting + 1
88
114
  when :conditional
89
- self.conditional_nesting = conditional_nesting + 1 if token == :open
115
+ self.conditional_nesting = conditional_nesting + 1
116
+ else
117
+ raise "unhandled nesting type #{type}"
90
118
  end
91
119
  end
92
120
 
@@ -96,34 +124,46 @@ class Regexp::Lexer
96
124
  lead, last, _ = token.text.partition(/.\z/mu)
97
125
  return if lead.empty?
98
126
 
99
- tokens.pop
100
- tokens << Regexp::Token.new(:literal, :literal, lead,
127
+ token_1 = Regexp::Token.new(:literal, :literal, lead,
101
128
  token.ts, (token.te - last.length),
102
129
  nesting, set_nesting, conditional_nesting)
103
- tokens << Regexp::Token.new(:literal, :literal, last,
130
+ token_2 = Regexp::Token.new(:literal, :literal, last,
104
131
  (token.ts + lead.length), token.te,
105
132
  nesting, set_nesting, conditional_nesting)
133
+
134
+ token_1.previous = preprev_token
135
+ token_1.next = token_2
136
+ token_2.previous = token_1 # .next will be set by #lex
137
+ [token_1, token_2]
106
138
  end
107
139
 
140
+ # if a codepoint list is followed by a quantifier, that quantifier applies
141
+ # to the last codepoint, e.g. /\u{61 62 63}{3}/ =~ 'abccc'
142
+ # c.f. #break_literal.
108
143
  def break_codepoint_list(token)
109
144
  lead, _, tail = token.text.rpartition(' ')
110
145
  return if lead.empty?
111
146
 
112
- tokens.pop
113
- tokens << Regexp::Token.new(:escape, :codepoint_list, lead + '}',
147
+ token_1 = Regexp::Token.new(:escape, :codepoint_list, lead + '}',
114
148
  token.ts, (token.te - tail.length),
115
149
  nesting, set_nesting, conditional_nesting)
116
- tokens << Regexp::Token.new(:escape, :codepoint_list, '\u{' + tail,
150
+ token_2 = Regexp::Token.new(:escape, :codepoint_list, '\u{' + tail,
117
151
  (token.ts + lead.length + 1), (token.te + 3),
118
152
  nesting, set_nesting, conditional_nesting)
119
153
 
120
154
  self.shift = shift + 3 # one space less, but extra \, u, {, and }
155
+
156
+ token_1.previous = preprev_token
157
+ token_1.next = token_2
158
+ token_2.previous = token_1 # .next will be set by #lex
159
+ [token_1, token_2]
121
160
  end
122
161
 
123
- def merge_condition(current)
124
- last = tokens.pop
125
- Regexp::Token.new(:conditional, :condition, last.text + current.text,
162
+ def merge_condition(current, last)
163
+ token = Regexp::Token.new(:conditional, :condition, last.text + current.text,
126
164
  last.ts, current.te, nesting, set_nesting, conditional_nesting)
165
+ token.previous = preprev_token # .next will be set by #lex
166
+ token
127
167
  end
128
168
 
129
169
  end # module Regexp::Lexer