regexp_parser 1.7.0 → 2.8.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (165)
  1. checksums.yaml +4 -4
  2. data/Gemfile +8 -2
  3. data/LICENSE +1 -1
  4. data/Rakefile +6 -70
  5. data/lib/regexp_parser/error.rb +4 -0
  6. data/lib/regexp_parser/expression/base.rb +76 -0
  7. data/lib/regexp_parser/expression/classes/alternation.rb +1 -1
  8. data/lib/regexp_parser/expression/classes/anchor.rb +0 -2
  9. data/lib/regexp_parser/expression/classes/{backref.rb → backreference.rb} +22 -2
  10. data/lib/regexp_parser/expression/classes/{set → character_set}/range.rb +4 -8
  11. data/lib/regexp_parser/expression/classes/{set.rb → character_set.rb} +3 -4
  12. data/lib/regexp_parser/expression/classes/{type.rb → character_type.rb} +0 -2
  13. data/lib/regexp_parser/expression/classes/conditional.rb +11 -5
  14. data/lib/regexp_parser/expression/classes/{escape.rb → escape_sequence.rb} +15 -7
  15. data/lib/regexp_parser/expression/classes/free_space.rb +5 -5
  16. data/lib/regexp_parser/expression/classes/group.rb +28 -15
  17. data/lib/regexp_parser/expression/classes/keep.rb +2 -0
  18. data/lib/regexp_parser/expression/classes/literal.rb +1 -5
  19. data/lib/regexp_parser/expression/classes/posix_class.rb +5 -1
  20. data/lib/regexp_parser/expression/classes/root.rb +4 -19
  21. data/lib/regexp_parser/expression/classes/{property.rb → unicode_property.rb} +5 -3
  22. data/lib/regexp_parser/expression/methods/construct.rb +41 -0
  23. data/lib/regexp_parser/expression/methods/human_name.rb +43 -0
  24. data/lib/regexp_parser/expression/methods/match_length.rb +11 -7
  25. data/lib/regexp_parser/expression/methods/parts.rb +23 -0
  26. data/lib/regexp_parser/expression/methods/printing.rb +26 -0
  27. data/lib/regexp_parser/expression/methods/strfregexp.rb +1 -1
  28. data/lib/regexp_parser/expression/methods/tests.rb +47 -1
  29. data/lib/regexp_parser/expression/methods/traverse.rb +34 -18
  30. data/lib/regexp_parser/expression/quantifier.rb +57 -17
  31. data/lib/regexp_parser/expression/sequence.rb +11 -47
  32. data/lib/regexp_parser/expression/sequence_operation.rb +4 -9
  33. data/lib/regexp_parser/expression/shared.rb +111 -0
  34. data/lib/regexp_parser/expression/subexpression.rb +27 -19
  35. data/lib/regexp_parser/expression.rb +14 -141
  36. data/lib/regexp_parser/lexer.rb +83 -41
  37. data/lib/regexp_parser/parser.rb +371 -429
  38. data/lib/regexp_parser/scanner/char_type.rl +11 -11
  39. data/lib/regexp_parser/scanner/errors/premature_end_error.rb +8 -0
  40. data/lib/regexp_parser/scanner/errors/scanner_error.rb +6 -0
  41. data/lib/regexp_parser/scanner/errors/validation_error.rb +63 -0
  42. data/lib/regexp_parser/scanner/properties/long.csv +633 -0
  43. data/lib/regexp_parser/scanner/properties/short.csv +248 -0
  44. data/lib/regexp_parser/scanner/property.rl +4 -4
  45. data/lib/regexp_parser/scanner/scanner.rl +303 -368
  46. data/lib/regexp_parser/scanner.rb +1423 -1674
  47. data/lib/regexp_parser/syntax/any.rb +2 -7
  48. data/lib/regexp_parser/syntax/base.rb +92 -67
  49. data/lib/regexp_parser/syntax/token/anchor.rb +15 -0
  50. data/lib/regexp_parser/syntax/{tokens → token}/assertion.rb +2 -2
  51. data/lib/regexp_parser/syntax/token/backreference.rb +33 -0
  52. data/lib/regexp_parser/syntax/token/character_set.rb +16 -0
  53. data/lib/regexp_parser/syntax/{tokens → token}/character_type.rb +3 -3
  54. data/lib/regexp_parser/syntax/{tokens → token}/conditional.rb +3 -3
  55. data/lib/regexp_parser/syntax/token/escape.rb +33 -0
  56. data/lib/regexp_parser/syntax/{tokens → token}/group.rb +7 -7
  57. data/lib/regexp_parser/syntax/{tokens → token}/keep.rb +1 -1
  58. data/lib/regexp_parser/syntax/token/meta.rb +20 -0
  59. data/lib/regexp_parser/syntax/{tokens → token}/posix_class.rb +3 -3
  60. data/lib/regexp_parser/syntax/token/quantifier.rb +35 -0
  61. data/lib/regexp_parser/syntax/token/unicode_property.rb +733 -0
  62. data/lib/regexp_parser/syntax/token/virtual.rb +11 -0
  63. data/lib/regexp_parser/syntax/token.rb +45 -0
  64. data/lib/regexp_parser/syntax/version_lookup.rb +19 -36
  65. data/lib/regexp_parser/syntax/versions/1.8.6.rb +13 -20
  66. data/lib/regexp_parser/syntax/versions/1.9.1.rb +10 -17
  67. data/lib/regexp_parser/syntax/versions/1.9.3.rb +3 -10
  68. data/lib/regexp_parser/syntax/versions/2.0.0.rb +8 -15
  69. data/lib/regexp_parser/syntax/versions/2.2.0.rb +3 -9
  70. data/lib/regexp_parser/syntax/versions/2.3.0.rb +3 -9
  71. data/lib/regexp_parser/syntax/versions/2.4.0.rb +3 -9
  72. data/lib/regexp_parser/syntax/versions/2.4.1.rb +2 -8
  73. data/lib/regexp_parser/syntax/versions/2.5.0.rb +3 -9
  74. data/lib/regexp_parser/syntax/versions/2.6.0.rb +3 -9
  75. data/lib/regexp_parser/syntax/versions/2.6.2.rb +3 -9
  76. data/lib/regexp_parser/syntax/versions/2.6.3.rb +3 -9
  77. data/lib/regexp_parser/syntax/versions/3.1.0.rb +4 -0
  78. data/lib/regexp_parser/syntax/versions/3.2.0.rb +4 -0
  79. data/lib/regexp_parser/syntax/versions.rb +3 -1
  80. data/lib/regexp_parser/syntax.rb +8 -6
  81. data/lib/regexp_parser/token.rb +9 -20
  82. data/lib/regexp_parser/version.rb +1 -1
  83. data/lib/regexp_parser.rb +0 -2
  84. data/regexp_parser.gemspec +19 -23
  85. metadata +52 -171
  86. data/CHANGELOG.md +0 -349
  87. data/README.md +0 -470
  88. data/lib/regexp_parser/scanner/properties/long.yml +0 -594
  89. data/lib/regexp_parser/scanner/properties/short.yml +0 -237
  90. data/lib/regexp_parser/syntax/tokens/anchor.rb +0 -15
  91. data/lib/regexp_parser/syntax/tokens/backref.rb +0 -24
  92. data/lib/regexp_parser/syntax/tokens/character_set.rb +0 -13
  93. data/lib/regexp_parser/syntax/tokens/escape.rb +0 -30
  94. data/lib/regexp_parser/syntax/tokens/meta.rb +0 -13
  95. data/lib/regexp_parser/syntax/tokens/quantifier.rb +0 -35
  96. data/lib/regexp_parser/syntax/tokens/unicode_property.rb +0 -675
  97. data/lib/regexp_parser/syntax/tokens.rb +0 -45
  98. data/spec/expression/base_spec.rb +0 -94
  99. data/spec/expression/clone_spec.rb +0 -120
  100. data/spec/expression/conditional_spec.rb +0 -89
  101. data/spec/expression/free_space_spec.rb +0 -27
  102. data/spec/expression/methods/match_length_spec.rb +0 -161
  103. data/spec/expression/methods/match_spec.rb +0 -25
  104. data/spec/expression/methods/strfregexp_spec.rb +0 -224
  105. data/spec/expression/methods/tests_spec.rb +0 -99
  106. data/spec/expression/methods/traverse_spec.rb +0 -161
  107. data/spec/expression/options_spec.rb +0 -128
  108. data/spec/expression/root_spec.rb +0 -9
  109. data/spec/expression/sequence_spec.rb +0 -9
  110. data/spec/expression/subexpression_spec.rb +0 -50
  111. data/spec/expression/to_h_spec.rb +0 -26
  112. data/spec/expression/to_s_spec.rb +0 -100
  113. data/spec/lexer/all_spec.rb +0 -22
  114. data/spec/lexer/conditionals_spec.rb +0 -53
  115. data/spec/lexer/escapes_spec.rb +0 -14
  116. data/spec/lexer/keep_spec.rb +0 -10
  117. data/spec/lexer/literals_spec.rb +0 -89
  118. data/spec/lexer/nesting_spec.rb +0 -99
  119. data/spec/lexer/refcalls_spec.rb +0 -55
  120. data/spec/parser/all_spec.rb +0 -43
  121. data/spec/parser/alternation_spec.rb +0 -88
  122. data/spec/parser/anchors_spec.rb +0 -17
  123. data/spec/parser/conditionals_spec.rb +0 -179
  124. data/spec/parser/errors_spec.rb +0 -30
  125. data/spec/parser/escapes_spec.rb +0 -121
  126. data/spec/parser/free_space_spec.rb +0 -130
  127. data/spec/parser/groups_spec.rb +0 -108
  128. data/spec/parser/keep_spec.rb +0 -6
  129. data/spec/parser/posix_classes_spec.rb +0 -8
  130. data/spec/parser/properties_spec.rb +0 -115
  131. data/spec/parser/quantifiers_spec.rb +0 -51
  132. data/spec/parser/refcalls_spec.rb +0 -112
  133. data/spec/parser/set/intersections_spec.rb +0 -127
  134. data/spec/parser/set/ranges_spec.rb +0 -111
  135. data/spec/parser/sets_spec.rb +0 -178
  136. data/spec/parser/types_spec.rb +0 -18
  137. data/spec/scanner/all_spec.rb +0 -18
  138. data/spec/scanner/anchors_spec.rb +0 -21
  139. data/spec/scanner/conditionals_spec.rb +0 -128
  140. data/spec/scanner/errors_spec.rb +0 -68
  141. data/spec/scanner/escapes_spec.rb +0 -53
  142. data/spec/scanner/free_space_spec.rb +0 -133
  143. data/spec/scanner/groups_spec.rb +0 -52
  144. data/spec/scanner/keep_spec.rb +0 -10
  145. data/spec/scanner/literals_spec.rb +0 -49
  146. data/spec/scanner/meta_spec.rb +0 -18
  147. data/spec/scanner/properties_spec.rb +0 -64
  148. data/spec/scanner/quantifiers_spec.rb +0 -20
  149. data/spec/scanner/refcalls_spec.rb +0 -36
  150. data/spec/scanner/sets_spec.rb +0 -102
  151. data/spec/scanner/types_spec.rb +0 -14
  152. data/spec/spec_helper.rb +0 -15
  153. data/spec/support/runner.rb +0 -42
  154. data/spec/support/shared_examples.rb +0 -77
  155. data/spec/support/warning_extractor.rb +0 -60
  156. data/spec/syntax/syntax_spec.rb +0 -48
  157. data/spec/syntax/syntax_token_map_spec.rb +0 -23
  158. data/spec/syntax/versions/1.8.6_spec.rb +0 -17
  159. data/spec/syntax/versions/1.9.1_spec.rb +0 -10
  160. data/spec/syntax/versions/1.9.3_spec.rb +0 -9
  161. data/spec/syntax/versions/2.0.0_spec.rb +0 -13
  162. data/spec/syntax/versions/2.2.0_spec.rb +0 -9
  163. data/spec/syntax/versions/aliases_spec.rb +0 -37
  164. data/spec/token/token_spec.rb +0 -85
  165. data/lib/regexp_parser/expression/classes/{set → character_set}/intersection.rb +0 -0
@@ -1,10 +1,10 @@
1
+ require 'regexp_parser/error'
1
2
  require 'regexp_parser/expression'
2
3
 
3
4
  class Regexp::Parser
4
5
  include Regexp::Expression
5
- include Regexp::Syntax
6
6
 
7
- class ParserError < StandardError; end
7
+ class ParserError < Regexp::Parser::Error; end
8
8
 
9
9
  class UnknownTokenTypeError < ParserError
10
10
  def initialize(type, token)
@@ -18,12 +18,12 @@ class Regexp::Parser
18
18
  end
19
19
  end
20
20
 
21
- def self.parse(input, syntax = "ruby/#{RUBY_VERSION}", &block)
22
- new.parse(input, syntax, &block)
21
+ def self.parse(input, syntax = nil, options: nil, &block)
22
+ new.parse(input, syntax, options: options, &block)
23
23
  end
24
24
 
25
- def parse(input, syntax = "ruby/#{RUBY_VERSION}", &block)
26
- root = Root.build(options_from_input(input))
25
+ def parse(input, syntax = nil, options: nil, &block)
26
+ root = Root.construct(options: extract_options(input, options))
27
27
 
28
28
  self.root = root
29
29
  self.node = root
@@ -35,10 +35,13 @@ class Regexp::Parser
35
35
 
36
36
  self.captured_group_counts = Hash.new(0)
37
37
 
38
- Regexp::Lexer.scan(input, syntax) do |token|
38
+ Regexp::Lexer.scan(input, syntax, options: options, collect_tokens: false) do |token|
39
39
  parse_token(token)
40
40
  end
41
41
 
42
+ # Trigger recursive setting of #nesting_level, which reflects how deep
43
+ # a node is in the tree. Do this at the end to account for tree rewrites.
44
+ root.nesting_level = 0
42
45
  assign_referenced_expressions
43
46
 
44
47
  if block_given?
@@ -54,107 +57,173 @@ class Regexp::Parser
54
57
  :options_stack, :switching_options, :conditional_nesting,
55
58
  :captured_group_counts
56
59
 
57
- def options_from_input(input)
58
- return {} unless input.is_a?(::Regexp)
60
+ def extract_options(input, options)
61
+ if options && !input.is_a?(String)
62
+ raise ArgumentError, 'options cannot be supplied unless parsing a String'
63
+ end
59
64
 
60
- options = {}
61
- options[:i] = true if input.options & ::Regexp::IGNORECASE != 0
62
- options[:m] = true if input.options & ::Regexp::MULTILINE != 0
63
- options[:x] = true if input.options & ::Regexp::EXTENDED != 0
64
- options
65
- end
65
+ options = input.options if input.is_a?(::Regexp)
66
66
 
67
- def nest(exp)
68
- nesting.push(exp)
69
- node << exp
70
- update_transplanted_subtree(exp, node)
71
- self.node = exp
72
- end
67
+ return {} unless options
73
68
 
74
- # subtrees are transplanted to build Alternations, Intersections, Ranges
75
- def update_transplanted_subtree(exp, new_parent)
76
- exp.nesting_level = new_parent.nesting_level + 1
77
- exp.respond_to?(:each) &&
78
- exp.each { |subexp| update_transplanted_subtree(subexp, exp) }
79
- end
80
-
81
- def decrease_nesting
82
- while nesting.last.is_a?(SequenceOperation)
83
- nesting.pop
84
- self.node = nesting.last
85
- end
86
- nesting.pop
87
- yield(node) if block_given?
88
- self.node = nesting.last
89
- self.node = node.last if node.last.is_a?(SequenceOperation)
90
- end
91
-
92
- def nest_conditional(exp)
93
- conditional_nesting.push(exp)
94
- nest(exp)
69
+ enabled_options = {}
70
+ enabled_options[:i] = true if options & ::Regexp::IGNORECASE != 0
71
+ enabled_options[:m] = true if options & ::Regexp::MULTILINE != 0
72
+ enabled_options[:x] = true if options & ::Regexp::EXTENDED != 0
73
+ enabled_options
95
74
  end
96
75
 
97
76
  def parse_token(token)
98
- close_completed_character_set_range
99
-
100
77
  case token.type
101
- when :meta; meta(token)
102
- when :quantifier; quantifier(token)
103
- when :anchor; anchor(token)
104
- when :escape; escape(token)
105
- when :group; group(token)
106
- when :assertion; group(token)
107
- when :set; set(token)
108
- when :type; type(token)
109
- when :backref; backref(token)
110
- when :conditional; conditional(token)
111
- when :keep; keep(token)
112
-
113
- when :posixclass, :nonposixclass
114
- posixclass(token)
115
- when :property, :nonproperty
116
- property(token)
117
-
118
- when :literal
119
- node << Literal.new(token, active_opts)
120
- when :free_space
121
- free_space(token)
122
-
78
+ when :anchor; anchor(token)
79
+ when :assertion, :group; group(token)
80
+ when :backref; backref(token)
81
+ when :conditional; conditional(token)
82
+ when :escape; escape(token)
83
+ when :free_space; free_space(token)
84
+ when :keep; keep(token)
85
+ when :literal; literal(token)
86
+ when :meta; meta(token)
87
+ when :posixclass, :nonposixclass; posixclass(token)
88
+ when :property, :nonproperty; property(token)
89
+ when :quantifier; quantifier(token)
90
+ when :set; set(token)
91
+ when :type; type(token)
123
92
  else
124
93
  raise UnknownTokenTypeError.new(token.type, token)
125
94
  end
95
+
96
+ close_completed_character_set_range
126
97
  end
127
98
 
128
- def set(token)
99
+ def anchor(token)
129
100
  case token.token
130
- when :open
131
- open_set(token)
132
- when :close
133
- close_set
134
- when :negate
135
- negate_set
136
- when :range
137
- range(token)
138
- when :intersection
139
- intersection(token)
140
- when :collation, :equivalent
141
- node << Literal.new(token, active_opts)
101
+ when :bol; node << Anchor::BeginningOfLine.new(token, active_opts)
102
+ when :bos; node << Anchor::BOS.new(token, active_opts)
103
+ when :eol; node << Anchor::EndOfLine.new(token, active_opts)
104
+ when :eos; node << Anchor::EOS.new(token, active_opts)
105
+ when :eos_ob_eol; node << Anchor::EOSobEOL.new(token, active_opts)
106
+ when :match_start; node << Anchor::MatchStart.new(token, active_opts)
107
+ when :nonword_boundary; node << Anchor::NonWordBoundary.new(token, active_opts)
108
+ when :word_boundary; node << Anchor::WordBoundary.new(token, active_opts)
142
109
  else
143
- raise UnknownTokenError.new('CharacterSet', token)
110
+ raise UnknownTokenError.new('Anchor', token)
144
111
  end
145
112
  end
146
113
 
147
- def meta(token)
114
+ def group(token)
148
115
  case token.token
149
- when :dot
150
- node << CharacterType::Any.new(token, active_opts)
151
- when :alternation
152
- sequence_operation(Alternation, token)
116
+ when :options, :options_switch
117
+ options_group(token)
118
+ when :close
119
+ close_group
120
+ when :comment
121
+ node << Group::Comment.new(token, active_opts)
153
122
  else
154
- raise UnknownTokenError.new('Meta', token)
123
+ open_group(token)
155
124
  end
156
125
  end
157
126
 
127
+ MOD_FLAGS = %w[i m x].map(&:to_sym)
128
+ ENC_FLAGS = %w[a d u].map(&:to_sym)
129
+
130
+ def options_group(token)
131
+ positive, negative = token.text.split('-', 2)
132
+ negative ||= ''
133
+ self.switching_options = token.token.equal?(:options_switch)
134
+
135
+ opt_changes = {}
136
+ new_active_opts = active_opts.dup
137
+
138
+ MOD_FLAGS.each do |flag|
139
+ if positive.include?(flag.to_s)
140
+ opt_changes[flag] = new_active_opts[flag] = true
141
+ end
142
+ if negative.include?(flag.to_s)
143
+ opt_changes[flag] = false
144
+ new_active_opts.delete(flag)
145
+ end
146
+ end
147
+
148
+ if (enc_flag = positive.reverse[/[adu]/])
149
+ enc_flag = enc_flag.to_sym
150
+ (ENC_FLAGS - [enc_flag]).each do |other|
151
+ opt_changes[other] = false if new_active_opts[other]
152
+ new_active_opts.delete(other)
153
+ end
154
+ opt_changes[enc_flag] = new_active_opts[enc_flag] = true
155
+ end
156
+
157
+ options_stack << new_active_opts
158
+
159
+ options_group = Group::Options.new(token, active_opts)
160
+ options_group.option_changes = opt_changes
161
+
162
+ nest(options_group)
163
+ end
164
+
165
+ def open_group(token)
166
+ group_class =
167
+ case token.token
168
+ when :absence; Group::Absence
169
+ when :atomic; Group::Atomic
170
+ when :capture; Group::Capture
171
+ when :named; Group::Named
172
+ when :passive; Group::Passive
173
+
174
+ when :lookahead; Assertion::Lookahead
175
+ when :lookbehind; Assertion::Lookbehind
176
+ when :nlookahead; Assertion::NegativeLookahead
177
+ when :nlookbehind; Assertion::NegativeLookbehind
178
+
179
+ else
180
+ raise UnknownTokenError.new('Group type open', token)
181
+ end
182
+
183
+ group = group_class.new(token, active_opts)
184
+
185
+ if group.capturing?
186
+ group.number = total_captured_group_count + 1
187
+ group.number_at_level = captured_group_count_at_level + 1
188
+ count_captured_group
189
+ end
190
+
191
+ # Push the active options to the stack again. This way we can simply pop the
192
+ # stack for any group we close, no matter if it had its own options or not.
193
+ options_stack << active_opts
194
+
195
+ nest(group)
196
+ end
197
+
198
+ def total_captured_group_count
199
+ captured_group_counts.values.reduce(0, :+)
200
+ end
201
+
202
+ def captured_group_count_at_level
203
+ captured_group_counts[node]
204
+ end
205
+
206
+ def count_captured_group
207
+ captured_group_counts[node] += 1
208
+ end
209
+
210
+ def close_group
211
+ options_stack.pop unless switching_options
212
+ self.switching_options = false
213
+ decrease_nesting
214
+ end
215
+
216
+ def decrease_nesting
217
+ while nesting.last.is_a?(SequenceOperation)
218
+ nesting.pop
219
+ self.node = nesting.last
220
+ end
221
+ nesting.pop
222
+ yield(node) if block_given?
223
+ self.node = nesting.last
224
+ self.node = node.last if node.last.is_a?(SequenceOperation)
225
+ end
226
+
158
227
  def backref(token)
159
228
  case token.token
160
229
  when :name_ref
@@ -163,10 +232,18 @@ class Regexp::Parser
163
232
  node << Backreference::NameRecursionLevel.new(token, active_opts)
164
233
  when :name_call
165
234
  node << Backreference::NameCall.new(token, active_opts)
166
- when :number, :number_ref
235
+ when :number, :number_ref # TODO: split in v3.0.0
167
236
  node << Backreference::Number.new(token, active_opts)
168
237
  when :number_recursion_ref
169
- node << Backreference::NumberRecursionLevel.new(token, active_opts)
238
+ node << Backreference::NumberRecursionLevel.new(token, active_opts).tap do |exp|
239
+ # TODO: should split off new token number_recursion_rel_ref and new
240
+ # class NumberRelativeRecursionLevel in v3.0.0 to get rid of this
241
+ if exp.text =~ /[<'][+-]/
242
+ assign_effective_number(exp)
243
+ else
244
+ exp.effective_number = exp.number
245
+ end
246
+ end
170
247
  when :number_call
171
248
  node << Backreference::NumberCall.new(token, active_opts)
172
249
  when :number_rel_ref
@@ -182,31 +259,11 @@ class Regexp::Parser
182
259
  end
183
260
  end
184
261
 
185
- def type(token)
186
- case token.token
187
- when :digit
188
- node << CharacterType::Digit.new(token, active_opts)
189
- when :nondigit
190
- node << CharacterType::NonDigit.new(token, active_opts)
191
- when :hex
192
- node << CharacterType::Hex.new(token, active_opts)
193
- when :nonhex
194
- node << CharacterType::NonHex.new(token, active_opts)
195
- when :space
196
- node << CharacterType::Space.new(token, active_opts)
197
- when :nonspace
198
- node << CharacterType::NonSpace.new(token, active_opts)
199
- when :word
200
- node << CharacterType::Word.new(token, active_opts)
201
- when :nonword
202
- node << CharacterType::NonWord.new(token, active_opts)
203
- when :linebreak
204
- node << CharacterType::Linebreak.new(token, active_opts)
205
- when :xgrapheme
206
- node << CharacterType::ExtendedGrapheme.new(token, active_opts)
207
- else
208
- raise UnknownTokenError.new('CharacterType', token)
209
- end
262
+ def assign_effective_number(exp)
263
+ exp.effective_number =
264
+ exp.number + total_captured_group_count + (exp.number < 0 ? 1 : 0)
265
+ exp.effective_number > 0 ||
266
+ raise(ParserError, "Invalid reference: #{exp.reference}")
210
267
  end
211
268
 
212
269
  def conditional(token)
@@ -215,9 +272,9 @@ class Regexp::Parser
215
272
  nest_conditional(Conditional::Expression.new(token, active_opts))
216
273
  when :condition
217
274
  conditional_nesting.last.condition = Conditional::Condition.new(token, active_opts)
218
- conditional_nesting.last.add_sequence(active_opts)
275
+ conditional_nesting.last.add_sequence(active_opts, { ts: token.te })
219
276
  when :separator
220
- conditional_nesting.last.add_sequence(active_opts)
277
+ conditional_nesting.last.add_sequence(active_opts, { ts: token.te })
221
278
  self.node = conditional_nesting.last.branches.last
222
279
  when :close
223
280
  conditional_nesting.pop
@@ -234,157 +291,38 @@ class Regexp::Parser
234
291
  end
235
292
  end
236
293
 
237
- def posixclass(token)
238
- node << PosixClass.new(token, active_opts)
239
- end
240
-
241
- include Regexp::Expression::UnicodeProperty
242
-
243
- def property(token)
244
- case token.token
245
- when :alnum; node << Alnum.new(token, active_opts)
246
- when :alpha; node << Alpha.new(token, active_opts)
247
- when :ascii; node << Ascii.new(token, active_opts)
248
- when :blank; node << Blank.new(token, active_opts)
249
- when :cntrl; node << Cntrl.new(token, active_opts)
250
- when :digit; node << Digit.new(token, active_opts)
251
- when :graph; node << Graph.new(token, active_opts)
252
- when :lower; node << Lower.new(token, active_opts)
253
- when :print; node << Print.new(token, active_opts)
254
- when :punct; node << Punct.new(token, active_opts)
255
- when :space; node << Space.new(token, active_opts)
256
- when :upper; node << Upper.new(token, active_opts)
257
- when :word; node << Word.new(token, active_opts)
258
- when :xdigit; node << Xdigit.new(token, active_opts)
259
- when :xposixpunct; node << XPosixPunct.new(token, active_opts)
260
-
261
- # only in Oniguruma (old rubies)
262
- when :newline; node << Newline.new(token, active_opts)
263
-
264
- when :any; node << Any.new(token, active_opts)
265
- when :assigned; node << Assigned.new(token, active_opts)
266
-
267
- when :letter; node << Letter::Any.new(token, active_opts)
268
- when :cased_letter; node << Letter::Cased.new(token, active_opts)
269
- when :uppercase_letter; node << Letter::Uppercase.new(token, active_opts)
270
- when :lowercase_letter; node << Letter::Lowercase.new(token, active_opts)
271
- when :titlecase_letter; node << Letter::Titlecase.new(token, active_opts)
272
- when :modifier_letter; node << Letter::Modifier.new(token, active_opts)
273
- when :other_letter; node << Letter::Other.new(token, active_opts)
274
-
275
- when :mark; node << Mark::Any.new(token, active_opts)
276
- when :combining_mark; node << Mark::Combining.new(token, active_opts)
277
- when :nonspacing_mark; node << Mark::Nonspacing.new(token, active_opts)
278
- when :spacing_mark; node << Mark::Spacing.new(token, active_opts)
279
- when :enclosing_mark; node << Mark::Enclosing.new(token, active_opts)
280
-
281
- when :number; node << Number::Any.new(token, active_opts)
282
- when :decimal_number; node << Number::Decimal.new(token, active_opts)
283
- when :letter_number; node << Number::Letter.new(token, active_opts)
284
- when :other_number; node << Number::Other.new(token, active_opts)
285
-
286
- when :punctuation; node << Punctuation::Any.new(token, active_opts)
287
- when :connector_punctuation; node << Punctuation::Connector.new(token, active_opts)
288
- when :dash_punctuation; node << Punctuation::Dash.new(token, active_opts)
289
- when :open_punctuation; node << Punctuation::Open.new(token, active_opts)
290
- when :close_punctuation; node << Punctuation::Close.new(token, active_opts)
291
- when :initial_punctuation; node << Punctuation::Initial.new(token, active_opts)
292
- when :final_punctuation; node << Punctuation::Final.new(token, active_opts)
293
- when :other_punctuation; node << Punctuation::Other.new(token, active_opts)
294
-
295
- when :separator; node << Separator::Any.new(token, active_opts)
296
- when :space_separator; node << Separator::Space.new(token, active_opts)
297
- when :line_separator; node << Separator::Line.new(token, active_opts)
298
- when :paragraph_separator; node << Separator::Paragraph.new(token, active_opts)
299
-
300
- when :symbol; node << Symbol::Any.new(token, active_opts)
301
- when :math_symbol; node << Symbol::Math.new(token, active_opts)
302
- when :currency_symbol; node << Symbol::Currency.new(token, active_opts)
303
- when :modifier_symbol; node << Symbol::Modifier.new(token, active_opts)
304
- when :other_symbol; node << Symbol::Other.new(token, active_opts)
305
-
306
- when :other; node << Codepoint::Any.new(token, active_opts)
307
- when :control; node << Codepoint::Control.new(token, active_opts)
308
- when :format; node << Codepoint::Format.new(token, active_opts)
309
- when :surrogate; node << Codepoint::Surrogate.new(token, active_opts)
310
- when :private_use; node << Codepoint::PrivateUse.new(token, active_opts)
311
- when :unassigned; node << Codepoint::Unassigned.new(token, active_opts)
312
-
313
- when *Token::UnicodeProperty::Age
314
- node << Age.new(token, active_opts)
315
-
316
- when *Token::UnicodeProperty::Derived
317
- node << Derived.new(token, active_opts)
318
-
319
- when *Token::UnicodeProperty::Emoji
320
- node << Emoji.new(token, active_opts)
321
-
322
- when *Token::UnicodeProperty::Script
323
- node << Script.new(token, active_opts)
324
-
325
- when *Token::UnicodeProperty::UnicodeBlock
326
- node << Block.new(token, active_opts)
327
-
328
- else
329
- raise UnknownTokenError.new('UnicodeProperty', token)
330
- end
294
+ def nest_conditional(exp)
295
+ conditional_nesting.push(exp)
296
+ nest(exp)
331
297
  end
332
298
 
333
- def anchor(token)
334
- case token.token
335
- when :bol
336
- node << Anchor::BeginningOfLine.new(token, active_opts)
337
- when :eol
338
- node << Anchor::EndOfLine.new(token, active_opts)
339
- when :bos
340
- node << Anchor::BOS.new(token, active_opts)
341
- when :eos
342
- node << Anchor::EOS.new(token, active_opts)
343
- when :eos_ob_eol
344
- node << Anchor::EOSobEOL.new(token, active_opts)
345
- when :word_boundary
346
- node << Anchor::WordBoundary.new(token, active_opts)
347
- when :nonword_boundary
348
- node << Anchor::NonWordBoundary.new(token, active_opts)
349
- when :match_start
350
- node << Anchor::MatchStart.new(token, active_opts)
351
- else
352
- raise UnknownTokenError.new('Anchor', token)
353
- end
299
+ def nest(exp)
300
+ nesting.push(exp)
301
+ node << exp
302
+ self.node = exp
354
303
  end
355
304
 
356
305
  def escape(token)
357
306
  case token.token
358
307
 
359
- when :backspace
360
- node << EscapeSequence::Backspace.new(token, active_opts)
361
-
362
- when :escape
363
- node << EscapeSequence::AsciiEscape.new(token, active_opts)
364
- when :bell
365
- node << EscapeSequence::Bell.new(token, active_opts)
366
- when :form_feed
367
- node << EscapeSequence::FormFeed.new(token, active_opts)
368
- when :newline
369
- node << EscapeSequence::Newline.new(token, active_opts)
370
- when :carriage
371
- node << EscapeSequence::Return.new(token, active_opts)
372
- when :tab
373
- node << EscapeSequence::Tab.new(token, active_opts)
374
- when :vertical_tab
375
- node << EscapeSequence::VerticalTab.new(token, active_opts)
376
-
377
- when :hex
378
- node << EscapeSequence::Hex.new(token, active_opts)
379
- when :octal
380
- node << EscapeSequence::Octal.new(token, active_opts)
381
- when :codepoint
382
- node << EscapeSequence::Codepoint.new(token, active_opts)
383
- when :codepoint_list
384
- node << EscapeSequence::CodepointList.new(token, active_opts)
308
+ when :backspace; node << EscapeSequence::Backspace.new(token, active_opts)
309
+
310
+ when :escape; node << EscapeSequence::AsciiEscape.new(token, active_opts)
311
+ when :bell; node << EscapeSequence::Bell.new(token, active_opts)
312
+ when :form_feed; node << EscapeSequence::FormFeed.new(token, active_opts)
313
+ when :newline; node << EscapeSequence::Newline.new(token, active_opts)
314
+ when :carriage; node << EscapeSequence::Return.new(token, active_opts)
315
+ when :tab; node << EscapeSequence::Tab.new(token, active_opts)
316
+ when :vertical_tab; node << EscapeSequence::VerticalTab.new(token, active_opts)
317
+
318
+ when :codepoint; node << EscapeSequence::Codepoint.new(token, active_opts)
319
+ when :codepoint_list; node << EscapeSequence::CodepointList.new(token, active_opts)
320
+ when :hex; node << EscapeSequence::Hex.new(token, active_opts)
321
+ when :octal; node << EscapeSequence::Octal.new(token, active_opts)
385
322
 
386
323
  when :control
387
324
  if token.text =~ /\A(?:\\C-\\M|\\c\\M)/
325
+ # TODO: emit :meta_control_sequence token in v3.0.0
388
326
  node << EscapeSequence::MetaControl.new(token, active_opts)
389
327
  else
390
328
  node << EscapeSequence::Control.new(token, active_opts)
@@ -392,6 +330,7 @@ class Regexp::Parser
392
330
 
393
331
  when :meta_sequence
394
332
  if token.text =~ /\A\\M-\\[Cc]/
333
+ # TODO: emit :meta_control_sequence token in v3.0.0:
395
334
  node << EscapeSequence::MetaControl.new(token, active_opts)
396
335
  else
397
336
  node << EscapeSequence::Meta.new(token, active_opts)
@@ -399,188 +338,194 @@ class Regexp::Parser
399
338
 
400
339
  else
401
340
  # treating everything else as a literal
341
+ # TODO: maybe split this up a bit more in v3.0.0?
342
+ # E.g. escaped quantifiers or set meta chars are not the same
343
+ # as stuff that would be a literal even without the backslash.
344
+ # Right now, they all end up here.
402
345
  node << EscapeSequence::Literal.new(token, active_opts)
403
346
  end
404
347
  end
405
348
 
406
- def keep(token)
407
- node << Keep::Mark.new(token, active_opts)
408
- end
409
-
410
349
  def free_space(token)
411
350
  case token.token
412
351
  when :comment
413
352
  node << Comment.new(token, active_opts)
414
353
  when :whitespace
415
- if node.last.is_a?(WhiteSpace)
416
- node.last.merge(WhiteSpace.new(token, active_opts))
417
- else
418
- node << WhiteSpace.new(token, active_opts)
419
- end
354
+ node << WhiteSpace.new(token, active_opts)
420
355
  else
421
356
  raise UnknownTokenError.new('FreeSpace', token)
422
357
  end
423
358
  end
424
359
 
425
- def quantifier(token)
426
- offset = -1
427
- target_node = node.expressions[offset]
428
- while target_node.is_a?(FreeSpace)
429
- target_node = node.expressions[offset -= 1]
430
- end
360
+ def keep(token)
361
+ node << Keep::Mark.new(token, active_opts)
362
+ end
431
363
 
432
- target_node || raise(ArgumentError, 'No valid target found for '\
433
- "'#{token.text}' ")
364
+ def literal(token)
365
+ node << Literal.new(token, active_opts)
366
+ end
434
367
 
368
+ def meta(token)
435
369
  case token.token
436
- when :zero_or_one
437
- target_node.quantify(:zero_or_one, token.text, 0, 1, :greedy)
438
- when :zero_or_one_reluctant
439
- target_node.quantify(:zero_or_one, token.text, 0, 1, :reluctant)
440
- when :zero_or_one_possessive
441
- target_node.quantify(:zero_or_one, token.text, 0, 1, :possessive)
442
-
443
- when :zero_or_more
444
- target_node.quantify(:zero_or_more, token.text, 0, -1, :greedy)
445
- when :zero_or_more_reluctant
446
- target_node.quantify(:zero_or_more, token.text, 0, -1, :reluctant)
447
- when :zero_or_more_possessive
448
- target_node.quantify(:zero_or_more, token.text, 0, -1, :possessive)
449
-
450
- when :one_or_more
451
- target_node.quantify(:one_or_more, token.text, 1, -1, :greedy)
452
- when :one_or_more_reluctant
453
- target_node.quantify(:one_or_more, token.text, 1, -1, :reluctant)
454
- when :one_or_more_possessive
455
- target_node.quantify(:one_or_more, token.text, 1, -1, :possessive)
456
-
457
- when :interval
458
- interval(target_node, token)
459
-
370
+ when :dot
371
+ node << CharacterType::Any.new(token, active_opts)
372
+ when :alternation
373
+ sequence_operation(Alternation, token)
460
374
  else
461
- raise UnknownTokenError.new('Quantifier', token)
375
+ raise UnknownTokenError.new('Meta', token)
462
376
  end
463
377
  end
464
378
 
465
- def interval(target_node, token)
466
- text = token.text
467
- mchr = text[text.length-1].chr =~ /[?+]/ ? text[text.length-1].chr : nil
468
- case mchr
469
- when '?'
470
- range_text = text[0...-1]
471
- mode = :reluctant
472
- when '+'
473
- range_text = text[0...-1]
474
- mode = :possessive
475
- else
476
- range_text = text
477
- mode = :greedy
379
+ def sequence_operation(klass, token)
380
+ unless node.instance_of?(klass)
381
+ operator = klass.new(token, active_opts)
382
+ sequence = operator.add_sequence(active_opts, { ts: token.ts })
383
+ sequence.expressions = node.expressions
384
+ node.expressions = []
385
+ nest(operator)
478
386
  end
479
-
480
- range = range_text.gsub(/\{|\}/, '').split(',', 2)
481
- min = range[0].empty? ? 0 : range[0]
482
- max = range[1] ? (range[1].empty? ? -1 : range[1]) : min
483
-
484
- target_node.quantify(:interval, text, min.to_i, max.to_i, mode)
387
+ node.add_sequence(active_opts, { ts: token.te })
485
388
  end
486
389
 
487
- def group(token)
488
- case token.token
489
- when :options, :options_switch
490
- options_group(token)
491
- when :close
492
- close_group
493
- when :comment
494
- node << Group::Comment.new(token, active_opts)
495
- else
496
- open_group(token)
497
- end
390
+ def posixclass(token)
391
+ node << PosixClass.new(token, active_opts)
498
392
  end
499
393
 
500
- MOD_FLAGS = %w[i m x].map(&:to_sym)
501
- ENC_FLAGS = %w[a d u].map(&:to_sym)
394
+ UP = Regexp::Expression::Property
395
+ UPTokens = Regexp::Syntax::Token::Property
502
396
 
503
- def options_group(token)
504
- positive, negative = token.text.split('-', 2)
505
- negative ||= ''
506
- self.switching_options = token.token.equal?(:options_switch)
397
+ def property(token)
398
+ case token.token
399
+ when :alnum; node << UP::Alnum.new(token, active_opts)
400
+ when :alpha; node << UP::Alpha.new(token, active_opts)
401
+ when :ascii; node << UP::Ascii.new(token, active_opts)
402
+ when :blank; node << UP::Blank.new(token, active_opts)
403
+ when :cntrl; node << UP::Cntrl.new(token, active_opts)
404
+ when :digit; node << UP::Digit.new(token, active_opts)
405
+ when :graph; node << UP::Graph.new(token, active_opts)
406
+ when :lower; node << UP::Lower.new(token, active_opts)
407
+ when :print; node << UP::Print.new(token, active_opts)
408
+ when :punct; node << UP::Punct.new(token, active_opts)
409
+ when :space; node << UP::Space.new(token, active_opts)
410
+ when :upper; node << UP::Upper.new(token, active_opts)
411
+ when :word; node << UP::Word.new(token, active_opts)
412
+ when :xdigit; node << UP::Xdigit.new(token, active_opts)
413
+ when :xposixpunct; node << UP::XPosixPunct.new(token, active_opts)
507
414
 
508
- opt_changes = {}
509
- new_active_opts = active_opts.dup
415
+ # only in Oniguruma (old rubies)
416
+ when :newline; node << UP::Newline.new(token, active_opts)
417
+
418
+ when :any; node << UP::Any.new(token, active_opts)
419
+ when :assigned; node << UP::Assigned.new(token, active_opts)
420
+
421
+ when :letter; node << UP::Letter::Any.new(token, active_opts)
422
+ when :cased_letter; node << UP::Letter::Cased.new(token, active_opts)
423
+ when :uppercase_letter; node << UP::Letter::Uppercase.new(token, active_opts)
424
+ when :lowercase_letter; node << UP::Letter::Lowercase.new(token, active_opts)
425
+ when :titlecase_letter; node << UP::Letter::Titlecase.new(token, active_opts)
426
+ when :modifier_letter; node << UP::Letter::Modifier.new(token, active_opts)
427
+ when :other_letter; node << UP::Letter::Other.new(token, active_opts)
428
+
429
+ when :mark; node << UP::Mark::Any.new(token, active_opts)
430
+ when :combining_mark; node << UP::Mark::Combining.new(token, active_opts)
431
+ when :nonspacing_mark; node << UP::Mark::Nonspacing.new(token, active_opts)
432
+ when :spacing_mark; node << UP::Mark::Spacing.new(token, active_opts)
433
+ when :enclosing_mark; node << UP::Mark::Enclosing.new(token, active_opts)
434
+
435
+ when :number; node << UP::Number::Any.new(token, active_opts)
436
+ when :decimal_number; node << UP::Number::Decimal.new(token, active_opts)
437
+ when :letter_number; node << UP::Number::Letter.new(token, active_opts)
438
+ when :other_number; node << UP::Number::Other.new(token, active_opts)
439
+
440
+ when :punctuation; node << UP::Punctuation::Any.new(token, active_opts)
441
+ when :connector_punctuation; node << UP::Punctuation::Connector.new(token, active_opts)
442
+ when :dash_punctuation; node << UP::Punctuation::Dash.new(token, active_opts)
443
+ when :open_punctuation; node << UP::Punctuation::Open.new(token, active_opts)
444
+ when :close_punctuation; node << UP::Punctuation::Close.new(token, active_opts)
445
+ when :initial_punctuation; node << UP::Punctuation::Initial.new(token, active_opts)
446
+ when :final_punctuation; node << UP::Punctuation::Final.new(token, active_opts)
447
+ when :other_punctuation; node << UP::Punctuation::Other.new(token, active_opts)
448
+
449
+ when :separator; node << UP::Separator::Any.new(token, active_opts)
450
+ when :space_separator; node << UP::Separator::Space.new(token, active_opts)
451
+ when :line_separator; node << UP::Separator::Line.new(token, active_opts)
452
+ when :paragraph_separator; node << UP::Separator::Paragraph.new(token, active_opts)
453
+
454
+ when :symbol; node << UP::Symbol::Any.new(token, active_opts)
455
+ when :math_symbol; node << UP::Symbol::Math.new(token, active_opts)
456
+ when :currency_symbol; node << UP::Symbol::Currency.new(token, active_opts)
457
+ when :modifier_symbol; node << UP::Symbol::Modifier.new(token, active_opts)
458
+ when :other_symbol; node << UP::Symbol::Other.new(token, active_opts)
459
+
460
+ when :other; node << UP::Codepoint::Any.new(token, active_opts)
461
+ when :control; node << UP::Codepoint::Control.new(token, active_opts)
462
+ when :format; node << UP::Codepoint::Format.new(token, active_opts)
463
+ when :surrogate; node << UP::Codepoint::Surrogate.new(token, active_opts)
464
+ when :private_use; node << UP::Codepoint::PrivateUse.new(token, active_opts)
465
+ when :unassigned; node << UP::Codepoint::Unassigned.new(token, active_opts)
466
+
467
+ when *UPTokens::Age; node << UP::Age.new(token, active_opts)
468
+ when *UPTokens::Derived; node << UP::Derived.new(token, active_opts)
469
+ when *UPTokens::Emoji; node << UP::Emoji.new(token, active_opts)
470
+ when *UPTokens::Script; node << UP::Script.new(token, active_opts)
471
+ when *UPTokens::UnicodeBlock; node << UP::Block.new(token, active_opts)
510
472
 
511
- MOD_FLAGS.each do |flag|
512
- if positive.include?(flag.to_s)
513
- opt_changes[flag] = new_active_opts[flag] = true
514
- end
515
- if negative.include?(flag.to_s)
516
- opt_changes[flag] = false
517
- new_active_opts.delete(flag)
518
- end
473
+ else
474
+ raise UnknownTokenError.new('UnicodeProperty', token)
519
475
  end
476
+ end
520
477
 
521
- if (enc_flag = positive.reverse[/[adu]/])
522
- enc_flag = enc_flag.to_sym
523
- (ENC_FLAGS - [enc_flag]).each do |other|
524
- opt_changes[other] = false if new_active_opts[other]
525
- new_active_opts.delete(other)
526
- end
527
- opt_changes[enc_flag] = new_active_opts[enc_flag] = true
478
+ def quantifier(token)
479
+ target_node = node.extract_quantifier_target(token.text)
480
+
481
+ # in case of chained quantifiers, wrap target in an implicit passive group
482
+ # description of the problem: https://github.com/ammar/regexp_parser/issues/3
483
+ # rationale for this solution: https://github.com/ammar/regexp_parser/pull/69
484
+ if target_node.quantified?
485
+ new_group = Group::Passive.construct(
486
+ token: :passive,
487
+ ts: target_node.ts,
488
+ level: target_node.level,
489
+ set_level: target_node.set_level,
490
+ conditional_level: target_node.conditional_level,
491
+ options: active_opts,
492
+ )
493
+ new_group.implicit = true
494
+ new_group << target_node
495
+ increase_group_level(target_node)
496
+ node.expressions[node.expressions.index(target_node)] = new_group
497
+ target_node = new_group
528
498
  end
529
499
 
530
- options_stack << new_active_opts
500
+ unless token.token =~ /\A(?:zero_or_one|zero_or_more|one_or_more|interval)
501
+ (?:_greedy|_reluctant|_possessive)?\z/x
502
+ raise UnknownTokenError.new('Quantifier', token)
503
+ end
531
504
 
532
- options_group = Group::Options.new(token, active_opts)
533
- options_group.option_changes = opt_changes
505
+ target_node.quantify(token, active_opts)
506
+ end
534
507
 
535
- nest(options_group)
508
+ def increase_group_level(exp)
509
+ exp.level += 1
510
+ exp.quantifier.level += 1 if exp.quantifier
511
+ exp.terminal? || exp.each { |subexp| increase_group_level(subexp) }
536
512
  end
537
513
 
538
- def open_group(token)
514
+ def set(token)
539
515
  case token.token
540
- when :passive
541
- exp = Group::Passive.new(token, active_opts)
542
- when :atomic
543
- exp = Group::Atomic.new(token, active_opts)
544
- when :named
545
- exp = Group::Named.new(token, active_opts)
546
- when :capture
547
- exp = Group::Capture.new(token, active_opts)
548
- when :absence
549
- exp = Group::Absence.new(token, active_opts)
550
-
551
- when :lookahead
552
- exp = Assertion::Lookahead.new(token, active_opts)
553
- when :nlookahead
554
- exp = Assertion::NegativeLookahead.new(token, active_opts)
555
- when :lookbehind
556
- exp = Assertion::Lookbehind.new(token, active_opts)
557
- when :nlookbehind
558
- exp = Assertion::NegativeLookbehind.new(token, active_opts)
559
-
516
+ when :open; open_set(token)
517
+ when :close; close_set
518
+ when :negate; negate_set
519
+ when :range; range(token)
520
+ when :intersection; intersection(token)
560
521
  else
561
- raise UnknownTokenError.new('Group type open', token)
562
- end
563
-
564
- if exp.capturing?
565
- exp.number = total_captured_group_count + 1
566
- exp.number_at_level = captured_group_count_at_level + 1
567
- count_captured_group
522
+ raise UnknownTokenError.new('CharacterSet', token)
568
523
  end
569
-
570
- # Push the active options to the stack again. This way we can simply pop the
571
- # stack for any group we close, no matter if it had its own options or not.
572
- options_stack << active_opts
573
-
574
- nest(exp)
575
- end
576
-
577
- def close_group
578
- options_stack.pop unless switching_options
579
- self.switching_options = false
580
- decrease_nesting
581
524
  end
582
525
 
583
526
  def open_set(token)
527
+ # TODO: this and Quantifier are the only cases where Expression#token
528
+ # does not match the scanner/lexer output. Fix in v3.0.0.
584
529
  token.token = :character
585
530
  nest(CharacterSet.new(token, active_opts))
586
531
  end
@@ -595,59 +540,56 @@ class Regexp::Parser
595
540
 
596
541
  def range(token)
597
542
  exp = CharacterSet::Range.new(token, active_opts)
598
- scope = node.last.is_a?(CharacterSet::IntersectedSequence) ? node.last : node
543
+ scope = node.last.instance_of?(CharacterSet::IntersectedSequence) ? node.last : node
599
544
  exp << scope.expressions.pop
600
545
  nest(exp)
601
546
  end
602
547
 
603
- def close_completed_character_set_range
604
- decrease_nesting if node.is_a?(CharacterSet::Range) && node.complete?
605
- end
606
-
607
548
  def intersection(token)
608
549
  sequence_operation(CharacterSet::Intersection, token)
609
550
  end
610
551
 
611
- def sequence_operation(klass, token)
612
- unless node.is_a?(klass)
613
- operator = klass.new(token, active_opts)
614
- sequence = operator.add_sequence(active_opts)
615
- sequence.expressions = node.expressions
616
- node.expressions = []
617
- nest(operator)
552
+ def type(token)
553
+ case token.token
554
+ when :digit; node << CharacterType::Digit.new(token, active_opts)
555
+ when :hex; node << CharacterType::Hex.new(token, active_opts)
556
+ when :linebreak; node << CharacterType::Linebreak.new(token, active_opts)
557
+ when :nondigit; node << CharacterType::NonDigit.new(token, active_opts)
558
+ when :nonhex; node << CharacterType::NonHex.new(token, active_opts)
559
+ when :nonspace; node << CharacterType::NonSpace.new(token, active_opts)
560
+ when :nonword; node << CharacterType::NonWord.new(token, active_opts)
561
+ when :space; node << CharacterType::Space.new(token, active_opts)
562
+ when :word; node << CharacterType::Word.new(token, active_opts)
563
+ when :xgrapheme; node << CharacterType::ExtendedGrapheme.new(token, active_opts)
564
+ else
565
+ raise UnknownTokenError.new('CharacterType', token)
618
566
  end
619
- node.add_sequence(active_opts)
620
- end
621
-
622
- def active_opts
623
- options_stack.last
624
- end
625
-
626
- def total_captured_group_count
627
- captured_group_counts.values.reduce(0, :+)
628
- end
629
-
630
- def captured_group_count_at_level
631
- captured_group_counts[node.level]
632
567
  end
633
568
 
634
- def count_captured_group
635
- captured_group_counts[node.level] += 1
569
+ def close_completed_character_set_range
570
+ decrease_nesting if node.instance_of?(CharacterSet::Range) && node.complete?
636
571
  end
637
572
 
638
- def assign_effective_number(exp)
639
- exp.effective_number =
640
- exp.number + total_captured_group_count + (exp.number < 0 ? 1 : 0)
573
+ def active_opts
574
+ options_stack.last
641
575
  end
642
576
 
577
+ # Assigns referenced expressions to referring expressions, e.g. if there is
578
+ # an instance of Backreference::Number, its #referenced_expression is set to
579
+ # the instance of Group::Capture that it refers to via its number.
643
580
  def assign_referenced_expressions
644
- targets = {}
581
+ # find all referenceable and referring expressions
582
+ targets = { 0 => root }
583
+ referrers = []
645
584
  root.each_expression do |exp|
646
585
  exp.is_a?(Group::Capture) && targets[exp.identifier] = exp
586
+ referrers << exp if exp.referential?
647
587
  end
648
- root.each_expression do |exp|
649
- exp.respond_to?(:reference) &&
650
- exp.referenced_expression = targets[exp.reference]
588
+ # assign referenced expressions to referring expressions
589
+ # (in a second iteration because there might be forward references)
590
+ referrers.each do |exp|
591
+ exp.referenced_expression = targets[exp.reference] ||
592
+ raise(ParserError, "Invalid reference #{exp.reference} at pos #{exp.ts}")
651
593
  end
652
594
  end
653
595
  end # module Regexp::Parser