regexp_parser 1.7.0 → 2.8.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (165) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +364 -22
  3. data/Gemfile +8 -2
  4. data/LICENSE +1 -1
  5. data/README.md +124 -88
  6. data/Rakefile +6 -70
  7. data/lib/regexp_parser/error.rb +4 -0
  8. data/lib/regexp_parser/expression/base.rb +76 -0
  9. data/lib/regexp_parser/expression/classes/alternation.rb +1 -1
  10. data/lib/regexp_parser/expression/classes/anchor.rb +0 -2
  11. data/lib/regexp_parser/expression/classes/{backref.rb → backreference.rb} +22 -2
  12. data/lib/regexp_parser/expression/classes/{set → character_set}/range.rb +4 -8
  13. data/lib/regexp_parser/expression/classes/{set.rb → character_set.rb} +3 -4
  14. data/lib/regexp_parser/expression/classes/{type.rb → character_type.rb} +0 -2
  15. data/lib/regexp_parser/expression/classes/conditional.rb +11 -5
  16. data/lib/regexp_parser/expression/classes/{escape.rb → escape_sequence.rb} +15 -7
  17. data/lib/regexp_parser/expression/classes/free_space.rb +5 -5
  18. data/lib/regexp_parser/expression/classes/group.rb +28 -15
  19. data/lib/regexp_parser/expression/classes/keep.rb +2 -0
  20. data/lib/regexp_parser/expression/classes/literal.rb +1 -5
  21. data/lib/regexp_parser/expression/classes/posix_class.rb +5 -1
  22. data/lib/regexp_parser/expression/classes/root.rb +4 -19
  23. data/lib/regexp_parser/expression/classes/{property.rb → unicode_property.rb} +5 -3
  24. data/lib/regexp_parser/expression/methods/construct.rb +41 -0
  25. data/lib/regexp_parser/expression/methods/human_name.rb +43 -0
  26. data/lib/regexp_parser/expression/methods/match_length.rb +11 -7
  27. data/lib/regexp_parser/expression/methods/parts.rb +23 -0
  28. data/lib/regexp_parser/expression/methods/printing.rb +26 -0
  29. data/lib/regexp_parser/expression/methods/strfregexp.rb +1 -1
  30. data/lib/regexp_parser/expression/methods/tests.rb +47 -1
  31. data/lib/regexp_parser/expression/methods/traverse.rb +34 -18
  32. data/lib/regexp_parser/expression/quantifier.rb +57 -17
  33. data/lib/regexp_parser/expression/sequence.rb +11 -47
  34. data/lib/regexp_parser/expression/sequence_operation.rb +4 -9
  35. data/lib/regexp_parser/expression/shared.rb +111 -0
  36. data/lib/regexp_parser/expression/subexpression.rb +27 -19
  37. data/lib/regexp_parser/expression.rb +14 -141
  38. data/lib/regexp_parser/lexer.rb +83 -41
  39. data/lib/regexp_parser/parser.rb +371 -429
  40. data/lib/regexp_parser/scanner/char_type.rl +11 -11
  41. data/lib/regexp_parser/scanner/errors/premature_end_error.rb +8 -0
  42. data/lib/regexp_parser/scanner/errors/scanner_error.rb +6 -0
  43. data/lib/regexp_parser/scanner/errors/validation_error.rb +63 -0
  44. data/lib/regexp_parser/scanner/properties/long.csv +633 -0
  45. data/lib/regexp_parser/scanner/properties/short.csv +248 -0
  46. data/lib/regexp_parser/scanner/property.rl +4 -4
  47. data/lib/regexp_parser/scanner/scanner.rl +295 -368
  48. data/lib/regexp_parser/scanner.rb +1405 -1674
  49. data/lib/regexp_parser/syntax/any.rb +2 -7
  50. data/lib/regexp_parser/syntax/base.rb +92 -67
  51. data/lib/regexp_parser/syntax/token/anchor.rb +15 -0
  52. data/lib/regexp_parser/syntax/{tokens → token}/assertion.rb +2 -2
  53. data/lib/regexp_parser/syntax/token/backreference.rb +33 -0
  54. data/lib/regexp_parser/syntax/token/character_set.rb +16 -0
  55. data/lib/regexp_parser/syntax/{tokens → token}/character_type.rb +3 -3
  56. data/lib/regexp_parser/syntax/{tokens → token}/conditional.rb +3 -3
  57. data/lib/regexp_parser/syntax/token/escape.rb +33 -0
  58. data/lib/regexp_parser/syntax/{tokens → token}/group.rb +7 -7
  59. data/lib/regexp_parser/syntax/{tokens → token}/keep.rb +1 -1
  60. data/lib/regexp_parser/syntax/token/meta.rb +20 -0
  61. data/lib/regexp_parser/syntax/{tokens → token}/posix_class.rb +3 -3
  62. data/lib/regexp_parser/syntax/token/quantifier.rb +35 -0
  63. data/lib/regexp_parser/syntax/token/unicode_property.rb +733 -0
  64. data/lib/regexp_parser/syntax/token/virtual.rb +11 -0
  65. data/lib/regexp_parser/syntax/token.rb +45 -0
  66. data/lib/regexp_parser/syntax/version_lookup.rb +19 -36
  67. data/lib/regexp_parser/syntax/versions/1.8.6.rb +13 -20
  68. data/lib/regexp_parser/syntax/versions/1.9.1.rb +10 -17
  69. data/lib/regexp_parser/syntax/versions/1.9.3.rb +3 -10
  70. data/lib/regexp_parser/syntax/versions/2.0.0.rb +8 -15
  71. data/lib/regexp_parser/syntax/versions/2.2.0.rb +3 -9
  72. data/lib/regexp_parser/syntax/versions/2.3.0.rb +3 -9
  73. data/lib/regexp_parser/syntax/versions/2.4.0.rb +3 -9
  74. data/lib/regexp_parser/syntax/versions/2.4.1.rb +2 -8
  75. data/lib/regexp_parser/syntax/versions/2.5.0.rb +3 -9
  76. data/lib/regexp_parser/syntax/versions/2.6.0.rb +3 -9
  77. data/lib/regexp_parser/syntax/versions/2.6.2.rb +3 -9
  78. data/lib/regexp_parser/syntax/versions/2.6.3.rb +3 -9
  79. data/lib/regexp_parser/syntax/versions/3.1.0.rb +4 -0
  80. data/lib/regexp_parser/syntax/versions/3.2.0.rb +4 -0
  81. data/lib/regexp_parser/syntax/versions.rb +3 -1
  82. data/lib/regexp_parser/syntax.rb +8 -6
  83. data/lib/regexp_parser/token.rb +9 -20
  84. data/lib/regexp_parser/version.rb +1 -1
  85. data/lib/regexp_parser.rb +0 -2
  86. data/regexp_parser.gemspec +20 -22
  87. metadata +49 -166
  88. data/lib/regexp_parser/scanner/properties/long.yml +0 -594
  89. data/lib/regexp_parser/scanner/properties/short.yml +0 -237
  90. data/lib/regexp_parser/syntax/tokens/anchor.rb +0 -15
  91. data/lib/regexp_parser/syntax/tokens/backref.rb +0 -24
  92. data/lib/regexp_parser/syntax/tokens/character_set.rb +0 -13
  93. data/lib/regexp_parser/syntax/tokens/escape.rb +0 -30
  94. data/lib/regexp_parser/syntax/tokens/meta.rb +0 -13
  95. data/lib/regexp_parser/syntax/tokens/quantifier.rb +0 -35
  96. data/lib/regexp_parser/syntax/tokens/unicode_property.rb +0 -675
  97. data/lib/regexp_parser/syntax/tokens.rb +0 -45
  98. data/spec/expression/base_spec.rb +0 -94
  99. data/spec/expression/clone_spec.rb +0 -120
  100. data/spec/expression/conditional_spec.rb +0 -89
  101. data/spec/expression/free_space_spec.rb +0 -27
  102. data/spec/expression/methods/match_length_spec.rb +0 -161
  103. data/spec/expression/methods/match_spec.rb +0 -25
  104. data/spec/expression/methods/strfregexp_spec.rb +0 -224
  105. data/spec/expression/methods/tests_spec.rb +0 -99
  106. data/spec/expression/methods/traverse_spec.rb +0 -161
  107. data/spec/expression/options_spec.rb +0 -128
  108. data/spec/expression/root_spec.rb +0 -9
  109. data/spec/expression/sequence_spec.rb +0 -9
  110. data/spec/expression/subexpression_spec.rb +0 -50
  111. data/spec/expression/to_h_spec.rb +0 -26
  112. data/spec/expression/to_s_spec.rb +0 -100
  113. data/spec/lexer/all_spec.rb +0 -22
  114. data/spec/lexer/conditionals_spec.rb +0 -53
  115. data/spec/lexer/escapes_spec.rb +0 -14
  116. data/spec/lexer/keep_spec.rb +0 -10
  117. data/spec/lexer/literals_spec.rb +0 -89
  118. data/spec/lexer/nesting_spec.rb +0 -99
  119. data/spec/lexer/refcalls_spec.rb +0 -55
  120. data/spec/parser/all_spec.rb +0 -43
  121. data/spec/parser/alternation_spec.rb +0 -88
  122. data/spec/parser/anchors_spec.rb +0 -17
  123. data/spec/parser/conditionals_spec.rb +0 -179
  124. data/spec/parser/errors_spec.rb +0 -30
  125. data/spec/parser/escapes_spec.rb +0 -121
  126. data/spec/parser/free_space_spec.rb +0 -130
  127. data/spec/parser/groups_spec.rb +0 -108
  128. data/spec/parser/keep_spec.rb +0 -6
  129. data/spec/parser/posix_classes_spec.rb +0 -8
  130. data/spec/parser/properties_spec.rb +0 -115
  131. data/spec/parser/quantifiers_spec.rb +0 -51
  132. data/spec/parser/refcalls_spec.rb +0 -112
  133. data/spec/parser/set/intersections_spec.rb +0 -127
  134. data/spec/parser/set/ranges_spec.rb +0 -111
  135. data/spec/parser/sets_spec.rb +0 -178
  136. data/spec/parser/types_spec.rb +0 -18
  137. data/spec/scanner/all_spec.rb +0 -18
  138. data/spec/scanner/anchors_spec.rb +0 -21
  139. data/spec/scanner/conditionals_spec.rb +0 -128
  140. data/spec/scanner/errors_spec.rb +0 -68
  141. data/spec/scanner/escapes_spec.rb +0 -53
  142. data/spec/scanner/free_space_spec.rb +0 -133
  143. data/spec/scanner/groups_spec.rb +0 -52
  144. data/spec/scanner/keep_spec.rb +0 -10
  145. data/spec/scanner/literals_spec.rb +0 -49
  146. data/spec/scanner/meta_spec.rb +0 -18
  147. data/spec/scanner/properties_spec.rb +0 -64
  148. data/spec/scanner/quantifiers_spec.rb +0 -20
  149. data/spec/scanner/refcalls_spec.rb +0 -36
  150. data/spec/scanner/sets_spec.rb +0 -102
  151. data/spec/scanner/types_spec.rb +0 -14
  152. data/spec/spec_helper.rb +0 -15
  153. data/spec/support/runner.rb +0 -42
  154. data/spec/support/shared_examples.rb +0 -77
  155. data/spec/support/warning_extractor.rb +0 -60
  156. data/spec/syntax/syntax_spec.rb +0 -48
  157. data/spec/syntax/syntax_token_map_spec.rb +0 -23
  158. data/spec/syntax/versions/1.8.6_spec.rb +0 -17
  159. data/spec/syntax/versions/1.9.1_spec.rb +0 -10
  160. data/spec/syntax/versions/1.9.3_spec.rb +0 -9
  161. data/spec/syntax/versions/2.0.0_spec.rb +0 -13
  162. data/spec/syntax/versions/2.2.0_spec.rb +0 -9
  163. data/spec/syntax/versions/aliases_spec.rb +0 -37
  164. data/spec/token/token_spec.rb +0 -85
  165. /data/lib/regexp_parser/expression/classes/{set → character_set}/intersection.rb +0 -0
@@ -1,10 +1,10 @@
1
+ require 'regexp_parser/error'
1
2
  require 'regexp_parser/expression'
2
3
 
3
4
  class Regexp::Parser
4
5
  include Regexp::Expression
5
- include Regexp::Syntax
6
6
 
7
- class ParserError < StandardError; end
7
+ class ParserError < Regexp::Parser::Error; end
8
8
 
9
9
  class UnknownTokenTypeError < ParserError
10
10
  def initialize(type, token)
@@ -18,12 +18,12 @@ class Regexp::Parser
18
18
  end
19
19
  end
20
20
 
21
- def self.parse(input, syntax = "ruby/#{RUBY_VERSION}", &block)
22
- new.parse(input, syntax, &block)
21
+ def self.parse(input, syntax = nil, options: nil, &block)
22
+ new.parse(input, syntax, options: options, &block)
23
23
  end
24
24
 
25
- def parse(input, syntax = "ruby/#{RUBY_VERSION}", &block)
26
- root = Root.build(options_from_input(input))
25
+ def parse(input, syntax = nil, options: nil, &block)
26
+ root = Root.construct(options: extract_options(input, options))
27
27
 
28
28
  self.root = root
29
29
  self.node = root
@@ -35,10 +35,13 @@ class Regexp::Parser
35
35
 
36
36
  self.captured_group_counts = Hash.new(0)
37
37
 
38
- Regexp::Lexer.scan(input, syntax) do |token|
38
+ Regexp::Lexer.scan(input, syntax, options: options, collect_tokens: false) do |token|
39
39
  parse_token(token)
40
40
  end
41
41
 
42
+ # Trigger recursive setting of #nesting_level, which reflects how deep
43
+ # a node is in the tree. Do this at the end to account for tree rewrites.
44
+ root.nesting_level = 0
42
45
  assign_referenced_expressions
43
46
 
44
47
  if block_given?
@@ -54,107 +57,173 @@ class Regexp::Parser
54
57
  :options_stack, :switching_options, :conditional_nesting,
55
58
  :captured_group_counts
56
59
 
57
- def options_from_input(input)
58
- return {} unless input.is_a?(::Regexp)
60
+ def extract_options(input, options)
61
+ if options && !input.is_a?(String)
62
+ raise ArgumentError, 'options cannot be supplied unless parsing a String'
63
+ end
59
64
 
60
- options = {}
61
- options[:i] = true if input.options & ::Regexp::IGNORECASE != 0
62
- options[:m] = true if input.options & ::Regexp::MULTILINE != 0
63
- options[:x] = true if input.options & ::Regexp::EXTENDED != 0
64
- options
65
- end
65
+ options = input.options if input.is_a?(::Regexp)
66
66
 
67
- def nest(exp)
68
- nesting.push(exp)
69
- node << exp
70
- update_transplanted_subtree(exp, node)
71
- self.node = exp
72
- end
67
+ return {} unless options
73
68
 
74
- # subtrees are transplanted to build Alternations, Intersections, Ranges
75
- def update_transplanted_subtree(exp, new_parent)
76
- exp.nesting_level = new_parent.nesting_level + 1
77
- exp.respond_to?(:each) &&
78
- exp.each { |subexp| update_transplanted_subtree(subexp, exp) }
79
- end
80
-
81
- def decrease_nesting
82
- while nesting.last.is_a?(SequenceOperation)
83
- nesting.pop
84
- self.node = nesting.last
85
- end
86
- nesting.pop
87
- yield(node) if block_given?
88
- self.node = nesting.last
89
- self.node = node.last if node.last.is_a?(SequenceOperation)
90
- end
91
-
92
- def nest_conditional(exp)
93
- conditional_nesting.push(exp)
94
- nest(exp)
69
+ enabled_options = {}
70
+ enabled_options[:i] = true if options & ::Regexp::IGNORECASE != 0
71
+ enabled_options[:m] = true if options & ::Regexp::MULTILINE != 0
72
+ enabled_options[:x] = true if options & ::Regexp::EXTENDED != 0
73
+ enabled_options
95
74
  end
96
75
 
97
76
  def parse_token(token)
98
- close_completed_character_set_range
99
-
100
77
  case token.type
101
- when :meta; meta(token)
102
- when :quantifier; quantifier(token)
103
- when :anchor; anchor(token)
104
- when :escape; escape(token)
105
- when :group; group(token)
106
- when :assertion; group(token)
107
- when :set; set(token)
108
- when :type; type(token)
109
- when :backref; backref(token)
110
- when :conditional; conditional(token)
111
- when :keep; keep(token)
112
-
113
- when :posixclass, :nonposixclass
114
- posixclass(token)
115
- when :property, :nonproperty
116
- property(token)
117
-
118
- when :literal
119
- node << Literal.new(token, active_opts)
120
- when :free_space
121
- free_space(token)
122
-
78
+ when :anchor; anchor(token)
79
+ when :assertion, :group; group(token)
80
+ when :backref; backref(token)
81
+ when :conditional; conditional(token)
82
+ when :escape; escape(token)
83
+ when :free_space; free_space(token)
84
+ when :keep; keep(token)
85
+ when :literal; literal(token)
86
+ when :meta; meta(token)
87
+ when :posixclass, :nonposixclass; posixclass(token)
88
+ when :property, :nonproperty; property(token)
89
+ when :quantifier; quantifier(token)
90
+ when :set; set(token)
91
+ when :type; type(token)
123
92
  else
124
93
  raise UnknownTokenTypeError.new(token.type, token)
125
94
  end
95
+
96
+ close_completed_character_set_range
126
97
  end
127
98
 
128
- def set(token)
99
+ def anchor(token)
129
100
  case token.token
130
- when :open
131
- open_set(token)
132
- when :close
133
- close_set
134
- when :negate
135
- negate_set
136
- when :range
137
- range(token)
138
- when :intersection
139
- intersection(token)
140
- when :collation, :equivalent
141
- node << Literal.new(token, active_opts)
101
+ when :bol; node << Anchor::BeginningOfLine.new(token, active_opts)
102
+ when :bos; node << Anchor::BOS.new(token, active_opts)
103
+ when :eol; node << Anchor::EndOfLine.new(token, active_opts)
104
+ when :eos; node << Anchor::EOS.new(token, active_opts)
105
+ when :eos_ob_eol; node << Anchor::EOSobEOL.new(token, active_opts)
106
+ when :match_start; node << Anchor::MatchStart.new(token, active_opts)
107
+ when :nonword_boundary; node << Anchor::NonWordBoundary.new(token, active_opts)
108
+ when :word_boundary; node << Anchor::WordBoundary.new(token, active_opts)
142
109
  else
143
- raise UnknownTokenError.new('CharacterSet', token)
110
+ raise UnknownTokenError.new('Anchor', token)
144
111
  end
145
112
  end
146
113
 
147
- def meta(token)
114
+ def group(token)
148
115
  case token.token
149
- when :dot
150
- node << CharacterType::Any.new(token, active_opts)
151
- when :alternation
152
- sequence_operation(Alternation, token)
116
+ when :options, :options_switch
117
+ options_group(token)
118
+ when :close
119
+ close_group
120
+ when :comment
121
+ node << Group::Comment.new(token, active_opts)
153
122
  else
154
- raise UnknownTokenError.new('Meta', token)
123
+ open_group(token)
155
124
  end
156
125
  end
157
126
 
127
+ MOD_FLAGS = %w[i m x].map(&:to_sym)
128
+ ENC_FLAGS = %w[a d u].map(&:to_sym)
129
+
130
+ def options_group(token)
131
+ positive, negative = token.text.split('-', 2)
132
+ negative ||= ''
133
+ self.switching_options = token.token.equal?(:options_switch)
134
+
135
+ opt_changes = {}
136
+ new_active_opts = active_opts.dup
137
+
138
+ MOD_FLAGS.each do |flag|
139
+ if positive.include?(flag.to_s)
140
+ opt_changes[flag] = new_active_opts[flag] = true
141
+ end
142
+ if negative.include?(flag.to_s)
143
+ opt_changes[flag] = false
144
+ new_active_opts.delete(flag)
145
+ end
146
+ end
147
+
148
+ if (enc_flag = positive.reverse[/[adu]/])
149
+ enc_flag = enc_flag.to_sym
150
+ (ENC_FLAGS - [enc_flag]).each do |other|
151
+ opt_changes[other] = false if new_active_opts[other]
152
+ new_active_opts.delete(other)
153
+ end
154
+ opt_changes[enc_flag] = new_active_opts[enc_flag] = true
155
+ end
156
+
157
+ options_stack << new_active_opts
158
+
159
+ options_group = Group::Options.new(token, active_opts)
160
+ options_group.option_changes = opt_changes
161
+
162
+ nest(options_group)
163
+ end
164
+
165
+ def open_group(token)
166
+ group_class =
167
+ case token.token
168
+ when :absence; Group::Absence
169
+ when :atomic; Group::Atomic
170
+ when :capture; Group::Capture
171
+ when :named; Group::Named
172
+ when :passive; Group::Passive
173
+
174
+ when :lookahead; Assertion::Lookahead
175
+ when :lookbehind; Assertion::Lookbehind
176
+ when :nlookahead; Assertion::NegativeLookahead
177
+ when :nlookbehind; Assertion::NegativeLookbehind
178
+
179
+ else
180
+ raise UnknownTokenError.new('Group type open', token)
181
+ end
182
+
183
+ group = group_class.new(token, active_opts)
184
+
185
+ if group.capturing?
186
+ group.number = total_captured_group_count + 1
187
+ group.number_at_level = captured_group_count_at_level + 1
188
+ count_captured_group
189
+ end
190
+
191
+ # Push the active options to the stack again. This way we can simply pop the
192
+ # stack for any group we close, no matter if it had its own options or not.
193
+ options_stack << active_opts
194
+
195
+ nest(group)
196
+ end
197
+
198
+ def total_captured_group_count
199
+ captured_group_counts.values.reduce(0, :+)
200
+ end
201
+
202
+ def captured_group_count_at_level
203
+ captured_group_counts[node]
204
+ end
205
+
206
+ def count_captured_group
207
+ captured_group_counts[node] += 1
208
+ end
209
+
210
+ def close_group
211
+ options_stack.pop unless switching_options
212
+ self.switching_options = false
213
+ decrease_nesting
214
+ end
215
+
216
+ def decrease_nesting
217
+ while nesting.last.is_a?(SequenceOperation)
218
+ nesting.pop
219
+ self.node = nesting.last
220
+ end
221
+ nesting.pop
222
+ yield(node) if block_given?
223
+ self.node = nesting.last
224
+ self.node = node.last if node.last.is_a?(SequenceOperation)
225
+ end
226
+
158
227
  def backref(token)
159
228
  case token.token
160
229
  when :name_ref
@@ -163,10 +232,18 @@ class Regexp::Parser
163
232
  node << Backreference::NameRecursionLevel.new(token, active_opts)
164
233
  when :name_call
165
234
  node << Backreference::NameCall.new(token, active_opts)
166
- when :number, :number_ref
235
+ when :number, :number_ref # TODO: split in v3.0.0
167
236
  node << Backreference::Number.new(token, active_opts)
168
237
  when :number_recursion_ref
169
- node << Backreference::NumberRecursionLevel.new(token, active_opts)
238
+ node << Backreference::NumberRecursionLevel.new(token, active_opts).tap do |exp|
239
+ # TODO: should split off new token number_recursion_rel_ref and new
240
+ # class NumberRelativeRecursionLevel in v3.0.0 to get rid of this
241
+ if exp.text =~ /[<'][+-]/
242
+ assign_effective_number(exp)
243
+ else
244
+ exp.effective_number = exp.number
245
+ end
246
+ end
170
247
  when :number_call
171
248
  node << Backreference::NumberCall.new(token, active_opts)
172
249
  when :number_rel_ref
@@ -182,31 +259,11 @@ class Regexp::Parser
182
259
  end
183
260
  end
184
261
 
185
- def type(token)
186
- case token.token
187
- when :digit
188
- node << CharacterType::Digit.new(token, active_opts)
189
- when :nondigit
190
- node << CharacterType::NonDigit.new(token, active_opts)
191
- when :hex
192
- node << CharacterType::Hex.new(token, active_opts)
193
- when :nonhex
194
- node << CharacterType::NonHex.new(token, active_opts)
195
- when :space
196
- node << CharacterType::Space.new(token, active_opts)
197
- when :nonspace
198
- node << CharacterType::NonSpace.new(token, active_opts)
199
- when :word
200
- node << CharacterType::Word.new(token, active_opts)
201
- when :nonword
202
- node << CharacterType::NonWord.new(token, active_opts)
203
- when :linebreak
204
- node << CharacterType::Linebreak.new(token, active_opts)
205
- when :xgrapheme
206
- node << CharacterType::ExtendedGrapheme.new(token, active_opts)
207
- else
208
- raise UnknownTokenError.new('CharacterType', token)
209
- end
262
+ def assign_effective_number(exp)
263
+ exp.effective_number =
264
+ exp.number + total_captured_group_count + (exp.number < 0 ? 1 : 0)
265
+ exp.effective_number > 0 ||
266
+ raise(ParserError, "Invalid reference: #{exp.reference}")
210
267
  end
211
268
 
212
269
  def conditional(token)
@@ -215,9 +272,9 @@ class Regexp::Parser
215
272
  nest_conditional(Conditional::Expression.new(token, active_opts))
216
273
  when :condition
217
274
  conditional_nesting.last.condition = Conditional::Condition.new(token, active_opts)
218
- conditional_nesting.last.add_sequence(active_opts)
275
+ conditional_nesting.last.add_sequence(active_opts, { ts: token.te })
219
276
  when :separator
220
- conditional_nesting.last.add_sequence(active_opts)
277
+ conditional_nesting.last.add_sequence(active_opts, { ts: token.te })
221
278
  self.node = conditional_nesting.last.branches.last
222
279
  when :close
223
280
  conditional_nesting.pop
@@ -234,157 +291,38 @@ class Regexp::Parser
234
291
  end
235
292
  end
236
293
 
237
- def posixclass(token)
238
- node << PosixClass.new(token, active_opts)
239
- end
240
-
241
- include Regexp::Expression::UnicodeProperty
242
-
243
- def property(token)
244
- case token.token
245
- when :alnum; node << Alnum.new(token, active_opts)
246
- when :alpha; node << Alpha.new(token, active_opts)
247
- when :ascii; node << Ascii.new(token, active_opts)
248
- when :blank; node << Blank.new(token, active_opts)
249
- when :cntrl; node << Cntrl.new(token, active_opts)
250
- when :digit; node << Digit.new(token, active_opts)
251
- when :graph; node << Graph.new(token, active_opts)
252
- when :lower; node << Lower.new(token, active_opts)
253
- when :print; node << Print.new(token, active_opts)
254
- when :punct; node << Punct.new(token, active_opts)
255
- when :space; node << Space.new(token, active_opts)
256
- when :upper; node << Upper.new(token, active_opts)
257
- when :word; node << Word.new(token, active_opts)
258
- when :xdigit; node << Xdigit.new(token, active_opts)
259
- when :xposixpunct; node << XPosixPunct.new(token, active_opts)
260
-
261
- # only in Oniguruma (old rubies)
262
- when :newline; node << Newline.new(token, active_opts)
263
-
264
- when :any; node << Any.new(token, active_opts)
265
- when :assigned; node << Assigned.new(token, active_opts)
266
-
267
- when :letter; node << Letter::Any.new(token, active_opts)
268
- when :cased_letter; node << Letter::Cased.new(token, active_opts)
269
- when :uppercase_letter; node << Letter::Uppercase.new(token, active_opts)
270
- when :lowercase_letter; node << Letter::Lowercase.new(token, active_opts)
271
- when :titlecase_letter; node << Letter::Titlecase.new(token, active_opts)
272
- when :modifier_letter; node << Letter::Modifier.new(token, active_opts)
273
- when :other_letter; node << Letter::Other.new(token, active_opts)
274
-
275
- when :mark; node << Mark::Any.new(token, active_opts)
276
- when :combining_mark; node << Mark::Combining.new(token, active_opts)
277
- when :nonspacing_mark; node << Mark::Nonspacing.new(token, active_opts)
278
- when :spacing_mark; node << Mark::Spacing.new(token, active_opts)
279
- when :enclosing_mark; node << Mark::Enclosing.new(token, active_opts)
280
-
281
- when :number; node << Number::Any.new(token, active_opts)
282
- when :decimal_number; node << Number::Decimal.new(token, active_opts)
283
- when :letter_number; node << Number::Letter.new(token, active_opts)
284
- when :other_number; node << Number::Other.new(token, active_opts)
285
-
286
- when :punctuation; node << Punctuation::Any.new(token, active_opts)
287
- when :connector_punctuation; node << Punctuation::Connector.new(token, active_opts)
288
- when :dash_punctuation; node << Punctuation::Dash.new(token, active_opts)
289
- when :open_punctuation; node << Punctuation::Open.new(token, active_opts)
290
- when :close_punctuation; node << Punctuation::Close.new(token, active_opts)
291
- when :initial_punctuation; node << Punctuation::Initial.new(token, active_opts)
292
- when :final_punctuation; node << Punctuation::Final.new(token, active_opts)
293
- when :other_punctuation; node << Punctuation::Other.new(token, active_opts)
294
-
295
- when :separator; node << Separator::Any.new(token, active_opts)
296
- when :space_separator; node << Separator::Space.new(token, active_opts)
297
- when :line_separator; node << Separator::Line.new(token, active_opts)
298
- when :paragraph_separator; node << Separator::Paragraph.new(token, active_opts)
299
-
300
- when :symbol; node << Symbol::Any.new(token, active_opts)
301
- when :math_symbol; node << Symbol::Math.new(token, active_opts)
302
- when :currency_symbol; node << Symbol::Currency.new(token, active_opts)
303
- when :modifier_symbol; node << Symbol::Modifier.new(token, active_opts)
304
- when :other_symbol; node << Symbol::Other.new(token, active_opts)
305
-
306
- when :other; node << Codepoint::Any.new(token, active_opts)
307
- when :control; node << Codepoint::Control.new(token, active_opts)
308
- when :format; node << Codepoint::Format.new(token, active_opts)
309
- when :surrogate; node << Codepoint::Surrogate.new(token, active_opts)
310
- when :private_use; node << Codepoint::PrivateUse.new(token, active_opts)
311
- when :unassigned; node << Codepoint::Unassigned.new(token, active_opts)
312
-
313
- when *Token::UnicodeProperty::Age
314
- node << Age.new(token, active_opts)
315
-
316
- when *Token::UnicodeProperty::Derived
317
- node << Derived.new(token, active_opts)
318
-
319
- when *Token::UnicodeProperty::Emoji
320
- node << Emoji.new(token, active_opts)
321
-
322
- when *Token::UnicodeProperty::Script
323
- node << Script.new(token, active_opts)
324
-
325
- when *Token::UnicodeProperty::UnicodeBlock
326
- node << Block.new(token, active_opts)
327
-
328
- else
329
- raise UnknownTokenError.new('UnicodeProperty', token)
330
- end
294
+ def nest_conditional(exp)
295
+ conditional_nesting.push(exp)
296
+ nest(exp)
331
297
  end
332
298
 
333
- def anchor(token)
334
- case token.token
335
- when :bol
336
- node << Anchor::BeginningOfLine.new(token, active_opts)
337
- when :eol
338
- node << Anchor::EndOfLine.new(token, active_opts)
339
- when :bos
340
- node << Anchor::BOS.new(token, active_opts)
341
- when :eos
342
- node << Anchor::EOS.new(token, active_opts)
343
- when :eos_ob_eol
344
- node << Anchor::EOSobEOL.new(token, active_opts)
345
- when :word_boundary
346
- node << Anchor::WordBoundary.new(token, active_opts)
347
- when :nonword_boundary
348
- node << Anchor::NonWordBoundary.new(token, active_opts)
349
- when :match_start
350
- node << Anchor::MatchStart.new(token, active_opts)
351
- else
352
- raise UnknownTokenError.new('Anchor', token)
353
- end
299
+ def nest(exp)
300
+ nesting.push(exp)
301
+ node << exp
302
+ self.node = exp
354
303
  end
355
304
 
356
305
  def escape(token)
357
306
  case token.token
358
307
 
359
- when :backspace
360
- node << EscapeSequence::Backspace.new(token, active_opts)
361
-
362
- when :escape
363
- node << EscapeSequence::AsciiEscape.new(token, active_opts)
364
- when :bell
365
- node << EscapeSequence::Bell.new(token, active_opts)
366
- when :form_feed
367
- node << EscapeSequence::FormFeed.new(token, active_opts)
368
- when :newline
369
- node << EscapeSequence::Newline.new(token, active_opts)
370
- when :carriage
371
- node << EscapeSequence::Return.new(token, active_opts)
372
- when :tab
373
- node << EscapeSequence::Tab.new(token, active_opts)
374
- when :vertical_tab
375
- node << EscapeSequence::VerticalTab.new(token, active_opts)
376
-
377
- when :hex
378
- node << EscapeSequence::Hex.new(token, active_opts)
379
- when :octal
380
- node << EscapeSequence::Octal.new(token, active_opts)
381
- when :codepoint
382
- node << EscapeSequence::Codepoint.new(token, active_opts)
383
- when :codepoint_list
384
- node << EscapeSequence::CodepointList.new(token, active_opts)
308
+ when :backspace; node << EscapeSequence::Backspace.new(token, active_opts)
309
+
310
+ when :escape; node << EscapeSequence::AsciiEscape.new(token, active_opts)
311
+ when :bell; node << EscapeSequence::Bell.new(token, active_opts)
312
+ when :form_feed; node << EscapeSequence::FormFeed.new(token, active_opts)
313
+ when :newline; node << EscapeSequence::Newline.new(token, active_opts)
314
+ when :carriage; node << EscapeSequence::Return.new(token, active_opts)
315
+ when :tab; node << EscapeSequence::Tab.new(token, active_opts)
316
+ when :vertical_tab; node << EscapeSequence::VerticalTab.new(token, active_opts)
317
+
318
+ when :codepoint; node << EscapeSequence::Codepoint.new(token, active_opts)
319
+ when :codepoint_list; node << EscapeSequence::CodepointList.new(token, active_opts)
320
+ when :hex; node << EscapeSequence::Hex.new(token, active_opts)
321
+ when :octal; node << EscapeSequence::Octal.new(token, active_opts)
385
322
 
386
323
  when :control
387
324
  if token.text =~ /\A(?:\\C-\\M|\\c\\M)/
325
+ # TODO: emit :meta_control_sequence token in v3.0.0
388
326
  node << EscapeSequence::MetaControl.new(token, active_opts)
389
327
  else
390
328
  node << EscapeSequence::Control.new(token, active_opts)
@@ -392,6 +330,7 @@ class Regexp::Parser
392
330
 
393
331
  when :meta_sequence
394
332
  if token.text =~ /\A\\M-\\[Cc]/
333
+ # TODO: emit :meta_control_sequence token in v3.0.0:
395
334
  node << EscapeSequence::MetaControl.new(token, active_opts)
396
335
  else
397
336
  node << EscapeSequence::Meta.new(token, active_opts)
@@ -399,188 +338,194 @@ class Regexp::Parser
399
338
 
400
339
  else
401
340
  # treating everything else as a literal
341
+ # TODO: maybe split this up a bit more in v3.0.0?
342
+ # E.g. escaped quantifiers or set meta chars are not the same
343
+ # as stuff that would be a literal even without the backslash.
344
+ # Right now, they all end up here.
402
345
  node << EscapeSequence::Literal.new(token, active_opts)
403
346
  end
404
347
  end
405
348
 
406
- def keep(token)
407
- node << Keep::Mark.new(token, active_opts)
408
- end
409
-
410
349
  def free_space(token)
411
350
  case token.token
412
351
  when :comment
413
352
  node << Comment.new(token, active_opts)
414
353
  when :whitespace
415
- if node.last.is_a?(WhiteSpace)
416
- node.last.merge(WhiteSpace.new(token, active_opts))
417
- else
418
- node << WhiteSpace.new(token, active_opts)
419
- end
354
+ node << WhiteSpace.new(token, active_opts)
420
355
  else
421
356
  raise UnknownTokenError.new('FreeSpace', token)
422
357
  end
423
358
  end
424
359
 
425
- def quantifier(token)
426
- offset = -1
427
- target_node = node.expressions[offset]
428
- while target_node.is_a?(FreeSpace)
429
- target_node = node.expressions[offset -= 1]
430
- end
360
+ def keep(token)
361
+ node << Keep::Mark.new(token, active_opts)
362
+ end
431
363
 
432
- target_node || raise(ArgumentError, 'No valid target found for '\
433
- "'#{token.text}' ")
364
+ def literal(token)
365
+ node << Literal.new(token, active_opts)
366
+ end
434
367
 
368
+ def meta(token)
435
369
  case token.token
436
- when :zero_or_one
437
- target_node.quantify(:zero_or_one, token.text, 0, 1, :greedy)
438
- when :zero_or_one_reluctant
439
- target_node.quantify(:zero_or_one, token.text, 0, 1, :reluctant)
440
- when :zero_or_one_possessive
441
- target_node.quantify(:zero_or_one, token.text, 0, 1, :possessive)
442
-
443
- when :zero_or_more
444
- target_node.quantify(:zero_or_more, token.text, 0, -1, :greedy)
445
- when :zero_or_more_reluctant
446
- target_node.quantify(:zero_or_more, token.text, 0, -1, :reluctant)
447
- when :zero_or_more_possessive
448
- target_node.quantify(:zero_or_more, token.text, 0, -1, :possessive)
449
-
450
- when :one_or_more
451
- target_node.quantify(:one_or_more, token.text, 1, -1, :greedy)
452
- when :one_or_more_reluctant
453
- target_node.quantify(:one_or_more, token.text, 1, -1, :reluctant)
454
- when :one_or_more_possessive
455
- target_node.quantify(:one_or_more, token.text, 1, -1, :possessive)
456
-
457
- when :interval
458
- interval(target_node, token)
459
-
370
+ when :dot
371
+ node << CharacterType::Any.new(token, active_opts)
372
+ when :alternation
373
+ sequence_operation(Alternation, token)
460
374
  else
461
- raise UnknownTokenError.new('Quantifier', token)
375
+ raise UnknownTokenError.new('Meta', token)
462
376
  end
463
377
  end
464
378
 
465
- def interval(target_node, token)
466
- text = token.text
467
- mchr = text[text.length-1].chr =~ /[?+]/ ? text[text.length-1].chr : nil
468
- case mchr
469
- when '?'
470
- range_text = text[0...-1]
471
- mode = :reluctant
472
- when '+'
473
- range_text = text[0...-1]
474
- mode = :possessive
475
- else
476
- range_text = text
477
- mode = :greedy
379
+ def sequence_operation(klass, token)
380
+ unless node.instance_of?(klass)
381
+ operator = klass.new(token, active_opts)
382
+ sequence = operator.add_sequence(active_opts, { ts: token.ts })
383
+ sequence.expressions = node.expressions
384
+ node.expressions = []
385
+ nest(operator)
478
386
  end
479
-
480
- range = range_text.gsub(/\{|\}/, '').split(',', 2)
481
- min = range[0].empty? ? 0 : range[0]
482
- max = range[1] ? (range[1].empty? ? -1 : range[1]) : min
483
-
484
- target_node.quantify(:interval, text, min.to_i, max.to_i, mode)
387
+ node.add_sequence(active_opts, { ts: token.te })
485
388
  end
486
389
 
487
- def group(token)
488
- case token.token
489
- when :options, :options_switch
490
- options_group(token)
491
- when :close
492
- close_group
493
- when :comment
494
- node << Group::Comment.new(token, active_opts)
495
- else
496
- open_group(token)
497
- end
390
+ def posixclass(token)
391
+ node << PosixClass.new(token, active_opts)
498
392
  end
499
393
 
500
- MOD_FLAGS = %w[i m x].map(&:to_sym)
501
- ENC_FLAGS = %w[a d u].map(&:to_sym)
394
+ UP = Regexp::Expression::Property
395
+ UPTokens = Regexp::Syntax::Token::Property
502
396
 
503
- def options_group(token)
504
- positive, negative = token.text.split('-', 2)
505
- negative ||= ''
506
- self.switching_options = token.token.equal?(:options_switch)
397
+ def property(token)
398
+ case token.token
399
+ when :alnum; node << UP::Alnum.new(token, active_opts)
400
+ when :alpha; node << UP::Alpha.new(token, active_opts)
401
+ when :ascii; node << UP::Ascii.new(token, active_opts)
402
+ when :blank; node << UP::Blank.new(token, active_opts)
403
+ when :cntrl; node << UP::Cntrl.new(token, active_opts)
404
+ when :digit; node << UP::Digit.new(token, active_opts)
405
+ when :graph; node << UP::Graph.new(token, active_opts)
406
+ when :lower; node << UP::Lower.new(token, active_opts)
407
+ when :print; node << UP::Print.new(token, active_opts)
408
+ when :punct; node << UP::Punct.new(token, active_opts)
409
+ when :space; node << UP::Space.new(token, active_opts)
410
+ when :upper; node << UP::Upper.new(token, active_opts)
411
+ when :word; node << UP::Word.new(token, active_opts)
412
+ when :xdigit; node << UP::Xdigit.new(token, active_opts)
413
+ when :xposixpunct; node << UP::XPosixPunct.new(token, active_opts)
507
414
 
508
- opt_changes = {}
509
- new_active_opts = active_opts.dup
415
+ # only in Oniguruma (old rubies)
416
+ when :newline; node << UP::Newline.new(token, active_opts)
417
+
418
+ when :any; node << UP::Any.new(token, active_opts)
419
+ when :assigned; node << UP::Assigned.new(token, active_opts)
420
+
421
+ when :letter; node << UP::Letter::Any.new(token, active_opts)
422
+ when :cased_letter; node << UP::Letter::Cased.new(token, active_opts)
423
+ when :uppercase_letter; node << UP::Letter::Uppercase.new(token, active_opts)
424
+ when :lowercase_letter; node << UP::Letter::Lowercase.new(token, active_opts)
425
+ when :titlecase_letter; node << UP::Letter::Titlecase.new(token, active_opts)
426
+ when :modifier_letter; node << UP::Letter::Modifier.new(token, active_opts)
427
+ when :other_letter; node << UP::Letter::Other.new(token, active_opts)
428
+
429
+ when :mark; node << UP::Mark::Any.new(token, active_opts)
430
+ when :combining_mark; node << UP::Mark::Combining.new(token, active_opts)
431
+ when :nonspacing_mark; node << UP::Mark::Nonspacing.new(token, active_opts)
432
+ when :spacing_mark; node << UP::Mark::Spacing.new(token, active_opts)
433
+ when :enclosing_mark; node << UP::Mark::Enclosing.new(token, active_opts)
434
+
435
+ when :number; node << UP::Number::Any.new(token, active_opts)
436
+ when :decimal_number; node << UP::Number::Decimal.new(token, active_opts)
437
+ when :letter_number; node << UP::Number::Letter.new(token, active_opts)
438
+ when :other_number; node << UP::Number::Other.new(token, active_opts)
439
+
440
+ when :punctuation; node << UP::Punctuation::Any.new(token, active_opts)
441
+ when :connector_punctuation; node << UP::Punctuation::Connector.new(token, active_opts)
442
+ when :dash_punctuation; node << UP::Punctuation::Dash.new(token, active_opts)
443
+ when :open_punctuation; node << UP::Punctuation::Open.new(token, active_opts)
444
+ when :close_punctuation; node << UP::Punctuation::Close.new(token, active_opts)
445
+ when :initial_punctuation; node << UP::Punctuation::Initial.new(token, active_opts)
446
+ when :final_punctuation; node << UP::Punctuation::Final.new(token, active_opts)
447
+ when :other_punctuation; node << UP::Punctuation::Other.new(token, active_opts)
448
+
449
+ when :separator; node << UP::Separator::Any.new(token, active_opts)
450
+ when :space_separator; node << UP::Separator::Space.new(token, active_opts)
451
+ when :line_separator; node << UP::Separator::Line.new(token, active_opts)
452
+ when :paragraph_separator; node << UP::Separator::Paragraph.new(token, active_opts)
453
+
454
+ when :symbol; node << UP::Symbol::Any.new(token, active_opts)
455
+ when :math_symbol; node << UP::Symbol::Math.new(token, active_opts)
456
+ when :currency_symbol; node << UP::Symbol::Currency.new(token, active_opts)
457
+ when :modifier_symbol; node << UP::Symbol::Modifier.new(token, active_opts)
458
+ when :other_symbol; node << UP::Symbol::Other.new(token, active_opts)
459
+
460
+ when :other; node << UP::Codepoint::Any.new(token, active_opts)
461
+ when :control; node << UP::Codepoint::Control.new(token, active_opts)
462
+ when :format; node << UP::Codepoint::Format.new(token, active_opts)
463
+ when :surrogate; node << UP::Codepoint::Surrogate.new(token, active_opts)
464
+ when :private_use; node << UP::Codepoint::PrivateUse.new(token, active_opts)
465
+ when :unassigned; node << UP::Codepoint::Unassigned.new(token, active_opts)
466
+
467
+ when *UPTokens::Age; node << UP::Age.new(token, active_opts)
468
+ when *UPTokens::Derived; node << UP::Derived.new(token, active_opts)
469
+ when *UPTokens::Emoji; node << UP::Emoji.new(token, active_opts)
470
+ when *UPTokens::Script; node << UP::Script.new(token, active_opts)
471
+ when *UPTokens::UnicodeBlock; node << UP::Block.new(token, active_opts)
510
472
 
511
- MOD_FLAGS.each do |flag|
512
- if positive.include?(flag.to_s)
513
- opt_changes[flag] = new_active_opts[flag] = true
514
- end
515
- if negative.include?(flag.to_s)
516
- opt_changes[flag] = false
517
- new_active_opts.delete(flag)
518
- end
473
+ else
474
+ raise UnknownTokenError.new('UnicodeProperty', token)
519
475
  end
476
+ end
520
477
 
521
- if (enc_flag = positive.reverse[/[adu]/])
522
- enc_flag = enc_flag.to_sym
523
- (ENC_FLAGS - [enc_flag]).each do |other|
524
- opt_changes[other] = false if new_active_opts[other]
525
- new_active_opts.delete(other)
526
- end
527
- opt_changes[enc_flag] = new_active_opts[enc_flag] = true
478
+ def quantifier(token)
479
+ target_node = node.extract_quantifier_target(token.text)
480
+
481
+ # in case of chained quantifiers, wrap target in an implicit passive group
482
+ # description of the problem: https://github.com/ammar/regexp_parser/issues/3
483
+ # rationale for this solution: https://github.com/ammar/regexp_parser/pull/69
484
+ if target_node.quantified?
485
+ new_group = Group::Passive.construct(
486
+ token: :passive,
487
+ ts: target_node.ts,
488
+ level: target_node.level,
489
+ set_level: target_node.set_level,
490
+ conditional_level: target_node.conditional_level,
491
+ options: active_opts,
492
+ )
493
+ new_group.implicit = true
494
+ new_group << target_node
495
+ increase_group_level(target_node)
496
+ node.expressions[node.expressions.index(target_node)] = new_group
497
+ target_node = new_group
528
498
  end
529
499
 
530
- options_stack << new_active_opts
500
+ unless token.token =~ /\A(?:zero_or_one|zero_or_more|one_or_more|interval)
501
+ (?:_greedy|_reluctant|_possessive)?\z/x
502
+ raise UnknownTokenError.new('Quantifier', token)
503
+ end
531
504
 
532
- options_group = Group::Options.new(token, active_opts)
533
- options_group.option_changes = opt_changes
505
+ target_node.quantify(token, active_opts)
506
+ end
534
507
 
535
- nest(options_group)
508
+ def increase_group_level(exp)
509
+ exp.level += 1
510
+ exp.quantifier.level += 1 if exp.quantifier
511
+ exp.terminal? || exp.each { |subexp| increase_group_level(subexp) }
536
512
  end
537
513
 
538
- def open_group(token)
514
+ def set(token)
539
515
  case token.token
540
- when :passive
541
- exp = Group::Passive.new(token, active_opts)
542
- when :atomic
543
- exp = Group::Atomic.new(token, active_opts)
544
- when :named
545
- exp = Group::Named.new(token, active_opts)
546
- when :capture
547
- exp = Group::Capture.new(token, active_opts)
548
- when :absence
549
- exp = Group::Absence.new(token, active_opts)
550
-
551
- when :lookahead
552
- exp = Assertion::Lookahead.new(token, active_opts)
553
- when :nlookahead
554
- exp = Assertion::NegativeLookahead.new(token, active_opts)
555
- when :lookbehind
556
- exp = Assertion::Lookbehind.new(token, active_opts)
557
- when :nlookbehind
558
- exp = Assertion::NegativeLookbehind.new(token, active_opts)
559
-
516
+ when :open; open_set(token)
517
+ when :close; close_set
518
+ when :negate; negate_set
519
+ when :range; range(token)
520
+ when :intersection; intersection(token)
560
521
  else
561
- raise UnknownTokenError.new('Group type open', token)
562
- end
563
-
564
- if exp.capturing?
565
- exp.number = total_captured_group_count + 1
566
- exp.number_at_level = captured_group_count_at_level + 1
567
- count_captured_group
522
+ raise UnknownTokenError.new('CharacterSet', token)
568
523
  end
569
-
570
- # Push the active options to the stack again. This way we can simply pop the
571
- # stack for any group we close, no matter if it had its own options or not.
572
- options_stack << active_opts
573
-
574
- nest(exp)
575
- end
576
-
577
- def close_group
578
- options_stack.pop unless switching_options
579
- self.switching_options = false
580
- decrease_nesting
581
524
  end
582
525
 
583
526
  def open_set(token)
527
+ # TODO: this and Quantifier are the only cases where Expression#token
528
+ # does not match the scanner/lexer output. Fix in v3.0.0.
584
529
  token.token = :character
585
530
  nest(CharacterSet.new(token, active_opts))
586
531
  end
@@ -595,59 +540,56 @@ class Regexp::Parser
595
540
 
596
541
  def range(token)
597
542
  exp = CharacterSet::Range.new(token, active_opts)
598
- scope = node.last.is_a?(CharacterSet::IntersectedSequence) ? node.last : node
543
+ scope = node.last.instance_of?(CharacterSet::IntersectedSequence) ? node.last : node
599
544
  exp << scope.expressions.pop
600
545
  nest(exp)
601
546
  end
602
547
 
603
- def close_completed_character_set_range
604
- decrease_nesting if node.is_a?(CharacterSet::Range) && node.complete?
605
- end
606
-
607
548
  def intersection(token)
608
549
  sequence_operation(CharacterSet::Intersection, token)
609
550
  end
610
551
 
611
- def sequence_operation(klass, token)
612
- unless node.is_a?(klass)
613
- operator = klass.new(token, active_opts)
614
- sequence = operator.add_sequence(active_opts)
615
- sequence.expressions = node.expressions
616
- node.expressions = []
617
- nest(operator)
552
+ def type(token)
553
+ case token.token
554
+ when :digit; node << CharacterType::Digit.new(token, active_opts)
555
+ when :hex; node << CharacterType::Hex.new(token, active_opts)
556
+ when :linebreak; node << CharacterType::Linebreak.new(token, active_opts)
557
+ when :nondigit; node << CharacterType::NonDigit.new(token, active_opts)
558
+ when :nonhex; node << CharacterType::NonHex.new(token, active_opts)
559
+ when :nonspace; node << CharacterType::NonSpace.new(token, active_opts)
560
+ when :nonword; node << CharacterType::NonWord.new(token, active_opts)
561
+ when :space; node << CharacterType::Space.new(token, active_opts)
562
+ when :word; node << CharacterType::Word.new(token, active_opts)
563
+ when :xgrapheme; node << CharacterType::ExtendedGrapheme.new(token, active_opts)
564
+ else
565
+ raise UnknownTokenError.new('CharacterType', token)
618
566
  end
619
- node.add_sequence(active_opts)
620
- end
621
-
622
- def active_opts
623
- options_stack.last
624
- end
625
-
626
- def total_captured_group_count
627
- captured_group_counts.values.reduce(0, :+)
628
- end
629
-
630
- def captured_group_count_at_level
631
- captured_group_counts[node.level]
632
567
  end
633
568
 
634
- def count_captured_group
635
- captured_group_counts[node.level] += 1
569
+ def close_completed_character_set_range
570
+ decrease_nesting if node.instance_of?(CharacterSet::Range) && node.complete?
636
571
  end
637
572
 
638
- def assign_effective_number(exp)
639
- exp.effective_number =
640
- exp.number + total_captured_group_count + (exp.number < 0 ? 1 : 0)
573
+ def active_opts
574
+ options_stack.last
641
575
  end
642
576
 
577
+ # Assigns referenced expressions to referring expressions, e.g. if there is
578
+ # an instance of Backreference::Number, its #referenced_expression is set to
579
+ # the instance of Group::Capture that it refers to via its number.
643
580
  def assign_referenced_expressions
644
- targets = {}
581
+ # find all referenceable and referring expressions
582
+ targets = { 0 => root }
583
+ referrers = []
645
584
  root.each_expression do |exp|
646
585
  exp.is_a?(Group::Capture) && targets[exp.identifier] = exp
586
+ referrers << exp if exp.referential?
647
587
  end
648
- root.each_expression do |exp|
649
- exp.respond_to?(:reference) &&
650
- exp.referenced_expression = targets[exp.reference]
588
+ # assign reference expression to referring expressions
589
+ # (in a second iteration because there might be forward references)
590
+ referrers.each do |exp|
591
+ exp.referenced_expression = targets[exp.reference] ||
592
+ raise(ParserError, "Invalid reference #{exp.reference} at pos #{exp.ts}")
651
593
  end
652
594
  end
653
595
  end # module Regexp::Parser