regexp_parser 2.1.1 → 2.9.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (167)
  1. checksums.yaml +4 -4
  2. data/Gemfile +6 -5
  3. data/LICENSE +1 -1
  4. data/Rakefile +6 -70
  5. data/lib/regexp_parser/error.rb +1 -1
  6. data/lib/regexp_parser/expression/base.rb +76 -0
  7. data/lib/regexp_parser/expression/classes/alternation.rb +1 -1
  8. data/lib/regexp_parser/expression/classes/anchor.rb +0 -2
  9. data/lib/regexp_parser/expression/classes/{backref.rb → backreference.rb} +18 -3
  10. data/lib/regexp_parser/expression/classes/{set → character_set}/range.rb +2 -7
  11. data/lib/regexp_parser/expression/classes/{set.rb → character_set.rb} +4 -8
  12. data/lib/regexp_parser/expression/classes/{type.rb → character_type.rb} +0 -2
  13. data/lib/regexp_parser/expression/classes/conditional.rb +2 -6
  14. data/lib/regexp_parser/expression/classes/{escape.rb → escape_sequence.rb} +15 -7
  15. data/lib/regexp_parser/expression/classes/free_space.rb +4 -4
  16. data/lib/regexp_parser/expression/classes/group.rb +10 -22
  17. data/lib/regexp_parser/expression/classes/keep.rb +2 -0
  18. data/lib/regexp_parser/expression/classes/literal.rb +1 -5
  19. data/lib/regexp_parser/expression/classes/posix_class.rb +5 -5
  20. data/lib/regexp_parser/expression/classes/root.rb +3 -6
  21. data/lib/regexp_parser/expression/classes/{property.rb → unicode_property.rb} +10 -11
  22. data/lib/regexp_parser/expression/methods/construct.rb +41 -0
  23. data/lib/regexp_parser/expression/methods/human_name.rb +43 -0
  24. data/lib/regexp_parser/expression/methods/match_length.rb +9 -5
  25. data/lib/regexp_parser/expression/methods/negative.rb +20 -0
  26. data/lib/regexp_parser/expression/methods/parts.rb +23 -0
  27. data/lib/regexp_parser/expression/methods/printing.rb +26 -0
  28. data/lib/regexp_parser/expression/methods/strfregexp.rb +1 -1
  29. data/lib/regexp_parser/expression/methods/tests.rb +47 -1
  30. data/lib/regexp_parser/expression/methods/traverse.rb +35 -19
  31. data/lib/regexp_parser/expression/quantifier.rb +55 -24
  32. data/lib/regexp_parser/expression/sequence.rb +11 -31
  33. data/lib/regexp_parser/expression/sequence_operation.rb +4 -9
  34. data/lib/regexp_parser/expression/shared.rb +111 -0
  35. data/lib/regexp_parser/expression/subexpression.rb +26 -18
  36. data/lib/regexp_parser/expression.rb +37 -155
  37. data/lib/regexp_parser/lexer.rb +81 -39
  38. data/lib/regexp_parser/parser.rb +135 -173
  39. data/lib/regexp_parser/scanner/errors/premature_end_error.rb +8 -0
  40. data/lib/regexp_parser/scanner/errors/scanner_error.rb +6 -0
  41. data/lib/regexp_parser/scanner/errors/validation_error.rb +63 -0
  42. data/lib/regexp_parser/scanner/properties/long.csv +651 -0
  43. data/lib/regexp_parser/scanner/properties/short.csv +249 -0
  44. data/lib/regexp_parser/scanner/property.rl +2 -2
  45. data/lib/regexp_parser/scanner/scanner.rl +127 -185
  46. data/lib/regexp_parser/scanner.rb +1185 -1402
  47. data/lib/regexp_parser/syntax/any.rb +2 -7
  48. data/lib/regexp_parser/syntax/base.rb +91 -66
  49. data/lib/regexp_parser/syntax/token/anchor.rb +15 -0
  50. data/lib/regexp_parser/syntax/{tokens → token}/assertion.rb +2 -2
  51. data/lib/regexp_parser/syntax/token/backreference.rb +33 -0
  52. data/lib/regexp_parser/syntax/token/character_set.rb +16 -0
  53. data/lib/regexp_parser/syntax/{tokens → token}/character_type.rb +3 -3
  54. data/lib/regexp_parser/syntax/{tokens → token}/conditional.rb +3 -3
  55. data/lib/regexp_parser/syntax/token/escape.rb +33 -0
  56. data/lib/regexp_parser/syntax/{tokens → token}/group.rb +7 -7
  57. data/lib/regexp_parser/syntax/{tokens → token}/keep.rb +1 -1
  58. data/lib/regexp_parser/syntax/token/meta.rb +20 -0
  59. data/lib/regexp_parser/syntax/{tokens → token}/posix_class.rb +3 -3
  60. data/lib/regexp_parser/syntax/token/quantifier.rb +35 -0
  61. data/lib/regexp_parser/syntax/token/unicode_property.rb +751 -0
  62. data/lib/regexp_parser/syntax/token/virtual.rb +11 -0
  63. data/lib/regexp_parser/syntax/token.rb +45 -0
  64. data/lib/regexp_parser/syntax/version_lookup.rb +17 -34
  65. data/lib/regexp_parser/syntax/versions/1.8.6.rb +13 -20
  66. data/lib/regexp_parser/syntax/versions/1.9.1.rb +10 -17
  67. data/lib/regexp_parser/syntax/versions/1.9.3.rb +3 -10
  68. data/lib/regexp_parser/syntax/versions/2.0.0.rb +8 -15
  69. data/lib/regexp_parser/syntax/versions/2.2.0.rb +3 -9
  70. data/lib/regexp_parser/syntax/versions/2.3.0.rb +3 -9
  71. data/lib/regexp_parser/syntax/versions/2.4.0.rb +3 -9
  72. data/lib/regexp_parser/syntax/versions/2.4.1.rb +2 -8
  73. data/lib/regexp_parser/syntax/versions/2.5.0.rb +3 -9
  74. data/lib/regexp_parser/syntax/versions/2.6.0.rb +3 -9
  75. data/lib/regexp_parser/syntax/versions/2.6.2.rb +3 -9
  76. data/lib/regexp_parser/syntax/versions/2.6.3.rb +3 -9
  77. data/lib/regexp_parser/syntax/versions/3.1.0.rb +4 -0
  78. data/lib/regexp_parser/syntax/versions/3.2.0.rb +4 -0
  79. data/lib/regexp_parser/syntax/versions.rb +4 -2
  80. data/lib/regexp_parser/syntax.rb +2 -2
  81. data/lib/regexp_parser/token.rb +9 -20
  82. data/lib/regexp_parser/version.rb +1 -1
  83. data/lib/regexp_parser.rb +6 -8
  84. data/regexp_parser.gemspec +20 -22
  85. metadata +49 -171
  86. data/CHANGELOG.md +0 -494
  87. data/README.md +0 -479
  88. data/lib/regexp_parser/scanner/properties/long.yml +0 -594
  89. data/lib/regexp_parser/scanner/properties/short.yml +0 -237
  90. data/lib/regexp_parser/syntax/tokens/anchor.rb +0 -15
  91. data/lib/regexp_parser/syntax/tokens/backref.rb +0 -24
  92. data/lib/regexp_parser/syntax/tokens/character_set.rb +0 -13
  93. data/lib/regexp_parser/syntax/tokens/escape.rb +0 -30
  94. data/lib/regexp_parser/syntax/tokens/meta.rb +0 -13
  95. data/lib/regexp_parser/syntax/tokens/quantifier.rb +0 -35
  96. data/lib/regexp_parser/syntax/tokens/unicode_property.rb +0 -675
  97. data/lib/regexp_parser/syntax/tokens.rb +0 -45
  98. data/spec/expression/base_spec.rb +0 -104
  99. data/spec/expression/clone_spec.rb +0 -152
  100. data/spec/expression/conditional_spec.rb +0 -89
  101. data/spec/expression/free_space_spec.rb +0 -27
  102. data/spec/expression/methods/match_length_spec.rb +0 -161
  103. data/spec/expression/methods/match_spec.rb +0 -25
  104. data/spec/expression/methods/strfregexp_spec.rb +0 -224
  105. data/spec/expression/methods/tests_spec.rb +0 -99
  106. data/spec/expression/methods/traverse_spec.rb +0 -161
  107. data/spec/expression/options_spec.rb +0 -128
  108. data/spec/expression/subexpression_spec.rb +0 -50
  109. data/spec/expression/to_h_spec.rb +0 -26
  110. data/spec/expression/to_s_spec.rb +0 -108
  111. data/spec/lexer/all_spec.rb +0 -22
  112. data/spec/lexer/conditionals_spec.rb +0 -53
  113. data/spec/lexer/delimiters_spec.rb +0 -68
  114. data/spec/lexer/escapes_spec.rb +0 -14
  115. data/spec/lexer/keep_spec.rb +0 -10
  116. data/spec/lexer/literals_spec.rb +0 -64
  117. data/spec/lexer/nesting_spec.rb +0 -99
  118. data/spec/lexer/refcalls_spec.rb +0 -60
  119. data/spec/parser/all_spec.rb +0 -43
  120. data/spec/parser/alternation_spec.rb +0 -88
  121. data/spec/parser/anchors_spec.rb +0 -17
  122. data/spec/parser/conditionals_spec.rb +0 -179
  123. data/spec/parser/errors_spec.rb +0 -30
  124. data/spec/parser/escapes_spec.rb +0 -121
  125. data/spec/parser/free_space_spec.rb +0 -130
  126. data/spec/parser/groups_spec.rb +0 -108
  127. data/spec/parser/keep_spec.rb +0 -6
  128. data/spec/parser/options_spec.rb +0 -28
  129. data/spec/parser/posix_classes_spec.rb +0 -8
  130. data/spec/parser/properties_spec.rb +0 -115
  131. data/spec/parser/quantifiers_spec.rb +0 -68
  132. data/spec/parser/refcalls_spec.rb +0 -117
  133. data/spec/parser/set/intersections_spec.rb +0 -127
  134. data/spec/parser/set/ranges_spec.rb +0 -111
  135. data/spec/parser/sets_spec.rb +0 -178
  136. data/spec/parser/types_spec.rb +0 -18
  137. data/spec/scanner/all_spec.rb +0 -18
  138. data/spec/scanner/anchors_spec.rb +0 -21
  139. data/spec/scanner/conditionals_spec.rb +0 -128
  140. data/spec/scanner/delimiters_spec.rb +0 -52
  141. data/spec/scanner/errors_spec.rb +0 -67
  142. data/spec/scanner/escapes_spec.rb +0 -64
  143. data/spec/scanner/free_space_spec.rb +0 -165
  144. data/spec/scanner/groups_spec.rb +0 -61
  145. data/spec/scanner/keep_spec.rb +0 -10
  146. data/spec/scanner/literals_spec.rb +0 -39
  147. data/spec/scanner/meta_spec.rb +0 -18
  148. data/spec/scanner/options_spec.rb +0 -36
  149. data/spec/scanner/properties_spec.rb +0 -64
  150. data/spec/scanner/quantifiers_spec.rb +0 -25
  151. data/spec/scanner/refcalls_spec.rb +0 -55
  152. data/spec/scanner/sets_spec.rb +0 -151
  153. data/spec/scanner/types_spec.rb +0 -14
  154. data/spec/spec_helper.rb +0 -16
  155. data/spec/support/runner.rb +0 -42
  156. data/spec/support/shared_examples.rb +0 -77
  157. data/spec/support/warning_extractor.rb +0 -60
  158. data/spec/syntax/syntax_spec.rb +0 -48
  159. data/spec/syntax/syntax_token_map_spec.rb +0 -23
  160. data/spec/syntax/versions/1.8.6_spec.rb +0 -17
  161. data/spec/syntax/versions/1.9.1_spec.rb +0 -10
  162. data/spec/syntax/versions/1.9.3_spec.rb +0 -9
  163. data/spec/syntax/versions/2.0.0_spec.rb +0 -13
  164. data/spec/syntax/versions/2.2.0_spec.rb +0 -9
  165. data/spec/syntax/versions/aliases_spec.rb +0 -37
  166. data/spec/token/token_spec.rb +0 -85
  167. /data/lib/regexp_parser/expression/classes/{set → character_set}/intersection.rb +0 -0
@@ -1,5 +1,5 @@
1
- require 'regexp_parser/error'
2
- require 'regexp_parser/expression'
1
+ require_relative 'error'
2
+ require_relative 'expression'
3
3
 
4
4
  class Regexp::Parser
5
5
  include Regexp::Expression
@@ -18,12 +18,12 @@ class Regexp::Parser
18
18
  end
19
19
  end
20
20
 
21
- def self.parse(input, syntax = "ruby/#{RUBY_VERSION}", options: nil, &block)
21
+ def self.parse(input, syntax = nil, options: nil, &block)
22
22
  new.parse(input, syntax, options: options, &block)
23
23
  end
24
24
 
25
- def parse(input, syntax = "ruby/#{RUBY_VERSION}", options: nil, &block)
26
- root = Root.build(extract_options(input, options))
25
+ def parse(input, syntax = nil, options: nil, &block)
26
+ root = Root.construct(options: extract_options(input, options))
27
27
 
28
28
  self.root = root
29
29
  self.node = root
@@ -35,10 +35,13 @@ class Regexp::Parser
35
35
 
36
36
  self.captured_group_counts = Hash.new(0)
37
37
 
38
- Regexp::Lexer.scan(input, syntax, options: options) do |token|
38
+ Regexp::Lexer.scan(input, syntax, options: options, collect_tokens: false) do |token|
39
39
  parse_token(token)
40
40
  end
41
41
 
42
+ # Trigger recursive setting of #nesting_level, which reflects how deep
43
+ # a node is in the tree. Do this at the end to account for tree rewrites.
44
+ root.nesting_level = 0
42
45
  assign_referenced_expressions
43
46
 
44
47
  if block_given?
@@ -197,11 +200,11 @@ class Regexp::Parser
197
200
  end
198
201
 
199
202
  def captured_group_count_at_level
200
- captured_group_counts[node.level]
203
+ captured_group_counts[node]
201
204
  end
202
205
 
203
206
  def count_captured_group
204
- captured_group_counts[node.level] += 1
207
+ captured_group_counts[node] += 1
205
208
  end
206
209
 
207
210
  def close_group
@@ -229,10 +232,18 @@ class Regexp::Parser
229
232
  node << Backreference::NameRecursionLevel.new(token, active_opts)
230
233
  when :name_call
231
234
  node << Backreference::NameCall.new(token, active_opts)
232
- when :number, :number_ref
235
+ when :number, :number_ref # TODO: split in v3.0.0
233
236
  node << Backreference::Number.new(token, active_opts)
234
237
  when :number_recursion_ref
235
- node << Backreference::NumberRecursionLevel.new(token, active_opts)
238
+ node << Backreference::NumberRecursionLevel.new(token, active_opts).tap do |exp|
239
+ # TODO: should split off new token number_recursion_rel_ref and new
240
+ # class NumberRelativeRecursionLevel in v3.0.0 to get rid of this
241
+ if exp.text =~ /[<'][+-]/
242
+ assign_effective_number(exp)
243
+ else
244
+ exp.effective_number = exp.number
245
+ end
246
+ end
236
247
  when :number_call
237
248
  node << Backreference::NumberCall.new(token, active_opts)
238
249
  when :number_rel_ref
@@ -251,6 +262,8 @@ class Regexp::Parser
251
262
  def assign_effective_number(exp)
252
263
  exp.effective_number =
253
264
  exp.number + total_captured_group_count + (exp.number < 0 ? 1 : 0)
265
+ exp.effective_number > 0 ||
266
+ raise(ParserError, "Invalid reference: #{exp.reference}")
254
267
  end
255
268
 
256
269
  def conditional(token)
@@ -259,9 +272,9 @@ class Regexp::Parser
259
272
  nest_conditional(Conditional::Expression.new(token, active_opts))
260
273
  when :condition
261
274
  conditional_nesting.last.condition = Conditional::Condition.new(token, active_opts)
262
- conditional_nesting.last.add_sequence(active_opts)
275
+ conditional_nesting.last.add_sequence(active_opts, { ts: token.te })
263
276
  when :separator
264
- conditional_nesting.last.add_sequence(active_opts)
277
+ conditional_nesting.last.add_sequence(active_opts, { ts: token.te })
265
278
  self.node = conditional_nesting.last.branches.last
266
279
  when :close
267
280
  conditional_nesting.pop
@@ -286,17 +299,9 @@ class Regexp::Parser
286
299
  def nest(exp)
287
300
  nesting.push(exp)
288
301
  node << exp
289
- update_transplanted_subtree(exp, node)
290
302
  self.node = exp
291
303
  end
292
304
 
293
- # subtrees are transplanted to build Alternations, Intersections, Ranges
294
- def update_transplanted_subtree(exp, new_parent)
295
- exp.nesting_level = new_parent.nesting_level + 1
296
- exp.respond_to?(:each) &&
297
- exp.each { |subexp| update_transplanted_subtree(subexp, exp) }
298
- end
299
-
300
305
  def escape(token)
301
306
  case token.token
302
307
 
@@ -317,6 +322,7 @@ class Regexp::Parser
317
322
 
318
323
  when :control
319
324
  if token.text =~ /\A(?:\\C-\\M|\\c\\M)/
325
+ # TODO: emit :meta_control_sequence token in v3.0.0
320
326
  node << EscapeSequence::MetaControl.new(token, active_opts)
321
327
  else
322
328
  node << EscapeSequence::Control.new(token, active_opts)
@@ -324,6 +330,7 @@ class Regexp::Parser
324
330
 
325
331
  when :meta_sequence
326
332
  if token.text =~ /\A\\M-\\[Cc]/
333
+ # TODO: emit :meta_control_sequence token in v3.0.0
327
334
  node << EscapeSequence::MetaControl.new(token, active_opts)
328
335
  else
329
336
  node << EscapeSequence::Meta.new(token, active_opts)
@@ -344,11 +351,7 @@ class Regexp::Parser
344
351
  when :comment
345
352
  node << Comment.new(token, active_opts)
346
353
  when :whitespace
347
- if node.last.is_a?(WhiteSpace)
348
- node.last.merge(WhiteSpace.new(token, active_opts))
349
- else
350
- node << WhiteSpace.new(token, active_opts)
351
- end
354
+ node << WhiteSpace.new(token, active_opts)
352
355
  else
353
356
  raise UnknownTokenError.new('FreeSpace', token)
354
357
  end
@@ -374,98 +377,99 @@ class Regexp::Parser
374
377
  end
375
378
 
376
379
  def sequence_operation(klass, token)
377
- unless node.is_a?(klass)
380
+ unless node.instance_of?(klass)
378
381
  operator = klass.new(token, active_opts)
379
- sequence = operator.add_sequence(active_opts)
382
+ sequence = operator.add_sequence(active_opts, { ts: token.ts })
380
383
  sequence.expressions = node.expressions
381
384
  node.expressions = []
382
385
  nest(operator)
383
386
  end
384
- node.add_sequence(active_opts)
387
+ node.add_sequence(active_opts, { ts: token.te })
385
388
  end
386
389
 
387
390
  def posixclass(token)
388
391
  node << PosixClass.new(token, active_opts)
389
392
  end
390
393
 
391
- include Regexp::Expression::UnicodeProperty
392
- UPTokens = Regexp::Syntax::Token::UnicodeProperty
394
+ UP = Regexp::Expression::Property
395
+ UPTokens = Regexp::Syntax::Token::Property
393
396
 
394
397
  def property(token)
395
398
  case token.token
396
- when :alnum; node << Alnum.new(token, active_opts)
397
- when :alpha; node << Alpha.new(token, active_opts)
398
- when :ascii; node << Ascii.new(token, active_opts)
399
- when :blank; node << Blank.new(token, active_opts)
400
- when :cntrl; node << Cntrl.new(token, active_opts)
401
- when :digit; node << Digit.new(token, active_opts)
402
- when :graph; node << Graph.new(token, active_opts)
403
- when :lower; node << Lower.new(token, active_opts)
404
- when :print; node << Print.new(token, active_opts)
405
- when :punct; node << Punct.new(token, active_opts)
406
- when :space; node << Space.new(token, active_opts)
407
- when :upper; node << Upper.new(token, active_opts)
408
- when :word; node << Word.new(token, active_opts)
409
- when :xdigit; node << Xdigit.new(token, active_opts)
410
- when :xposixpunct; node << XPosixPunct.new(token, active_opts)
399
+ when :alnum; node << UP::Alnum.new(token, active_opts)
400
+ when :alpha; node << UP::Alpha.new(token, active_opts)
401
+ when :ascii; node << UP::Ascii.new(token, active_opts)
402
+ when :blank; node << UP::Blank.new(token, active_opts)
403
+ when :cntrl; node << UP::Cntrl.new(token, active_opts)
404
+ when :digit; node << UP::Digit.new(token, active_opts)
405
+ when :graph; node << UP::Graph.new(token, active_opts)
406
+ when :lower; node << UP::Lower.new(token, active_opts)
407
+ when :print; node << UP::Print.new(token, active_opts)
408
+ when :punct; node << UP::Punct.new(token, active_opts)
409
+ when :space; node << UP::Space.new(token, active_opts)
410
+ when :upper; node << UP::Upper.new(token, active_opts)
411
+ when :word; node << UP::Word.new(token, active_opts)
412
+ when :xdigit; node << UP::Xdigit.new(token, active_opts)
413
+ when :xposixpunct; node << UP::XPosixPunct.new(token, active_opts)
411
414
 
412
415
  # only in Oniguruma (old rubies)
413
- when :newline; node << Newline.new(token, active_opts)
414
-
415
- when :any; node << Any.new(token, active_opts)
416
- when :assigned; node << Assigned.new(token, active_opts)
417
-
418
- when :letter; node << Letter::Any.new(token, active_opts)
419
- when :cased_letter; node << Letter::Cased.new(token, active_opts)
420
- when :uppercase_letter; node << Letter::Uppercase.new(token, active_opts)
421
- when :lowercase_letter; node << Letter::Lowercase.new(token, active_opts)
422
- when :titlecase_letter; node << Letter::Titlecase.new(token, active_opts)
423
- when :modifier_letter; node << Letter::Modifier.new(token, active_opts)
424
- when :other_letter; node << Letter::Other.new(token, active_opts)
425
-
426
- when :mark; node << Mark::Any.new(token, active_opts)
427
- when :combining_mark; node << Mark::Combining.new(token, active_opts)
428
- when :nonspacing_mark; node << Mark::Nonspacing.new(token, active_opts)
429
- when :spacing_mark; node << Mark::Spacing.new(token, active_opts)
430
- when :enclosing_mark; node << Mark::Enclosing.new(token, active_opts)
431
-
432
- when :number; node << Number::Any.new(token, active_opts)
433
- when :decimal_number; node << Number::Decimal.new(token, active_opts)
434
- when :letter_number; node << Number::Letter.new(token, active_opts)
435
- when :other_number; node << Number::Other.new(token, active_opts)
436
-
437
- when :punctuation; node << Punctuation::Any.new(token, active_opts)
438
- when :connector_punctuation; node << Punctuation::Connector.new(token, active_opts)
439
- when :dash_punctuation; node << Punctuation::Dash.new(token, active_opts)
440
- when :open_punctuation; node << Punctuation::Open.new(token, active_opts)
441
- when :close_punctuation; node << Punctuation::Close.new(token, active_opts)
442
- when :initial_punctuation; node << Punctuation::Initial.new(token, active_opts)
443
- when :final_punctuation; node << Punctuation::Final.new(token, active_opts)
444
- when :other_punctuation; node << Punctuation::Other.new(token, active_opts)
445
-
446
- when :separator; node << Separator::Any.new(token, active_opts)
447
- when :space_separator; node << Separator::Space.new(token, active_opts)
448
- when :line_separator; node << Separator::Line.new(token, active_opts)
449
- when :paragraph_separator; node << Separator::Paragraph.new(token, active_opts)
450
-
451
- when :symbol; node << Symbol::Any.new(token, active_opts)
452
- when :math_symbol; node << Symbol::Math.new(token, active_opts)
453
- when :currency_symbol; node << Symbol::Currency.new(token, active_opts)
454
- when :modifier_symbol; node << Symbol::Modifier.new(token, active_opts)
455
- when :other_symbol; node << Symbol::Other.new(token, active_opts)
456
-
457
- when :other; node << Codepoint::Any.new(token, active_opts)
458
- when :control; node << Codepoint::Control.new(token, active_opts)
459
- when :format; node << Codepoint::Format.new(token, active_opts)
460
- when :surrogate; node << Codepoint::Surrogate.new(token, active_opts)
461
- when :private_use; node << Codepoint::PrivateUse.new(token, active_opts)
462
- when :unassigned; node << Codepoint::Unassigned.new(token, active_opts)
463
-
464
- when *UPTokens::Age; node << Age.new(token, active_opts)
465
- when *UPTokens::Derived; node << Derived.new(token, active_opts)
466
- when *UPTokens::Emoji; node << Emoji.new(token, active_opts)
467
- when *UPTokens::Script; node << Script.new(token, active_opts)
468
- when *UPTokens::UnicodeBlock; node << Block.new(token, active_opts)
416
+ when :newline; node << UP::Newline.new(token, active_opts)
417
+
418
+ when :any; node << UP::Any.new(token, active_opts)
419
+ when :assigned; node << UP::Assigned.new(token, active_opts)
420
+
421
+ when :letter; node << UP::Letter::Any.new(token, active_opts)
422
+ when :cased_letter; node << UP::Letter::Cased.new(token, active_opts)
423
+ when :uppercase_letter; node << UP::Letter::Uppercase.new(token, active_opts)
424
+ when :lowercase_letter; node << UP::Letter::Lowercase.new(token, active_opts)
425
+ when :titlecase_letter; node << UP::Letter::Titlecase.new(token, active_opts)
426
+ when :modifier_letter; node << UP::Letter::Modifier.new(token, active_opts)
427
+ when :other_letter; node << UP::Letter::Other.new(token, active_opts)
428
+
429
+ when :mark; node << UP::Mark::Any.new(token, active_opts)
430
+ when :combining_mark; node << UP::Mark::Combining.new(token, active_opts)
431
+ when :nonspacing_mark; node << UP::Mark::Nonspacing.new(token, active_opts)
432
+ when :spacing_mark; node << UP::Mark::Spacing.new(token, active_opts)
433
+ when :enclosing_mark; node << UP::Mark::Enclosing.new(token, active_opts)
434
+
435
+ when :number; node << UP::Number::Any.new(token, active_opts)
436
+ when :decimal_number; node << UP::Number::Decimal.new(token, active_opts)
437
+ when :letter_number; node << UP::Number::Letter.new(token, active_opts)
438
+ when :other_number; node << UP::Number::Other.new(token, active_opts)
439
+
440
+ when :punctuation; node << UP::Punctuation::Any.new(token, active_opts)
441
+ when :connector_punctuation; node << UP::Punctuation::Connector.new(token, active_opts)
442
+ when :dash_punctuation; node << UP::Punctuation::Dash.new(token, active_opts)
443
+ when :open_punctuation; node << UP::Punctuation::Open.new(token, active_opts)
444
+ when :close_punctuation; node << UP::Punctuation::Close.new(token, active_opts)
445
+ when :initial_punctuation; node << UP::Punctuation::Initial.new(token, active_opts)
446
+ when :final_punctuation; node << UP::Punctuation::Final.new(token, active_opts)
447
+ when :other_punctuation; node << UP::Punctuation::Other.new(token, active_opts)
448
+
449
+ when :separator; node << UP::Separator::Any.new(token, active_opts)
450
+ when :space_separator; node << UP::Separator::Space.new(token, active_opts)
451
+ when :line_separator; node << UP::Separator::Line.new(token, active_opts)
452
+ when :paragraph_separator; node << UP::Separator::Paragraph.new(token, active_opts)
453
+
454
+ when :symbol; node << UP::Symbol::Any.new(token, active_opts)
455
+ when :math_symbol; node << UP::Symbol::Math.new(token, active_opts)
456
+ when :currency_symbol; node << UP::Symbol::Currency.new(token, active_opts)
457
+ when :modifier_symbol; node << UP::Symbol::Modifier.new(token, active_opts)
458
+ when :other_symbol; node << UP::Symbol::Other.new(token, active_opts)
459
+
460
+ when :other; node << UP::Codepoint::Any.new(token, active_opts)
461
+ when :control; node << UP::Codepoint::Control.new(token, active_opts)
462
+ when :format; node << UP::Codepoint::Format.new(token, active_opts)
463
+ when :surrogate; node << UP::Codepoint::Surrogate.new(token, active_opts)
464
+ when :private_use; node << UP::Codepoint::PrivateUse.new(token, active_opts)
465
+ when :unassigned; node << UP::Codepoint::Unassigned.new(token, active_opts)
466
+
467
+ when *UPTokens::Age; node << UP::Age.new(token, active_opts)
468
+ when *UPTokens::Derived; node << UP::Derived.new(token, active_opts)
469
+ when *UPTokens::Emoji; node << UP::Emoji.new(token, active_opts)
470
+ when *UPTokens::Enumerated; node << UP::Enumerated.new(token, active_opts)
471
+ when *UPTokens::Script; node << UP::Script.new(token, active_opts)
472
+ when *UPTokens::UnicodeBlock; node << UP::Block.new(token, active_opts)
469
473
 
470
474
  else
471
475
  raise UnknownTokenError.new('UnicodeProperty', token)
@@ -473,86 +477,39 @@ class Regexp::Parser
473
477
  end
474
478
 
475
479
  def quantifier(token)
476
- target_node = node.expressions.reverse.find { |exp| !exp.is_a?(FreeSpace) }
477
- target_node or raise ParserError, "No valid target found for '#{token.text}'"
480
+ target_node = node.extract_quantifier_target(token.text)
478
481
 
479
482
  # in case of chained quantifiers, wrap target in an implicit passive group
480
483
  # description of the problem: https://github.com/ammar/regexp_parser/issues/3
481
484
  # rationale for this solution: https://github.com/ammar/regexp_parser/pull/69
482
485
  if target_node.quantified?
483
- new_token = Regexp::Token.new(
484
- :group,
485
- :passive,
486
- '', # text
487
- target_node.ts,
488
- nil, # te (unused)
489
- target_node.level,
490
- target_node.set_level,
491
- target_node.conditional_level
486
+ new_group = Group::Passive.construct(
487
+ token: :passive,
488
+ ts: target_node.ts,
489
+ level: target_node.level,
490
+ set_level: target_node.set_level,
491
+ conditional_level: target_node.conditional_level,
492
+ options: active_opts,
492
493
  )
493
- new_group = Group::Passive.new(new_token, active_opts)
494
494
  new_group.implicit = true
495
495
  new_group << target_node
496
- increase_level(target_node)
496
+ increase_group_level(target_node)
497
497
  node.expressions[node.expressions.index(target_node)] = new_group
498
498
  target_node = new_group
499
499
  end
500
500
 
501
- case token.token
502
- when :zero_or_one
503
- target_node.quantify(:zero_or_one, token.text, 0, 1, :greedy)
504
- when :zero_or_one_reluctant
505
- target_node.quantify(:zero_or_one, token.text, 0, 1, :reluctant)
506
- when :zero_or_one_possessive
507
- target_node.quantify(:zero_or_one, token.text, 0, 1, :possessive)
508
-
509
- when :zero_or_more
510
- target_node.quantify(:zero_or_more, token.text, 0, -1, :greedy)
511
- when :zero_or_more_reluctant
512
- target_node.quantify(:zero_or_more, token.text, 0, -1, :reluctant)
513
- when :zero_or_more_possessive
514
- target_node.quantify(:zero_or_more, token.text, 0, -1, :possessive)
515
-
516
- when :one_or_more
517
- target_node.quantify(:one_or_more, token.text, 1, -1, :greedy)
518
- when :one_or_more_reluctant
519
- target_node.quantify(:one_or_more, token.text, 1, -1, :reluctant)
520
- when :one_or_more_possessive
521
- target_node.quantify(:one_or_more, token.text, 1, -1, :possessive)
522
-
523
- when :interval
524
- interval(target_node, token)
525
-
526
- else
501
+ unless token.token =~ /\A(?:zero_or_one|zero_or_more|one_or_more|interval)
502
+ (?:_greedy|_reluctant|_possessive)?\z/x
527
503
  raise UnknownTokenError.new('Quantifier', token)
528
504
  end
505
+
506
+ target_node.quantify(token, active_opts)
529
507
  end
530
508
 
531
- def increase_level(exp)
509
+ def increase_group_level(exp)
532
510
  exp.level += 1
533
- exp.respond_to?(:each) && exp.each { |subexp| increase_level(subexp) }
534
- end
535
-
536
- def interval(target_node, token)
537
- text = token.text
538
- mchr = text[text.length-1].chr =~ /[?+]/ ? text[text.length-1].chr : nil
539
- case mchr
540
- when '?'
541
- range_text = text[0...-1]
542
- mode = :reluctant
543
- when '+'
544
- range_text = text[0...-1]
545
- mode = :possessive
546
- else
547
- range_text = text
548
- mode = :greedy
549
- end
550
-
551
- range = range_text.gsub(/\{|\}/, '').split(',', 2)
552
- min = range[0].empty? ? 0 : range[0]
553
- max = range[1] ? (range[1].empty? ? -1 : range[1]) : min
554
-
555
- target_node.quantify(:interval, text, min.to_i, max.to_i, mode)
511
+ exp.quantifier.level += 1 if exp.quantifier
512
+ exp.terminal? || exp.each { |subexp| increase_group_level(subexp) }
556
513
  end
557
514
 
558
515
  def set(token)
@@ -568,6 +525,8 @@ class Regexp::Parser
568
525
  end
569
526
 
570
527
  def open_set(token)
528
+ # TODO: this and Quantifier are the only cases where Expression#token
529
+ # does not match the scanner/lexer output. Fix in v3.0.0.
571
530
  token.token = :character
572
531
  nest(CharacterSet.new(token, active_opts))
573
532
  end
@@ -582,7 +541,7 @@ class Regexp::Parser
582
541
 
583
542
  def range(token)
584
543
  exp = CharacterSet::Range.new(token, active_opts)
585
- scope = node.last.is_a?(CharacterSet::IntersectedSequence) ? node.last : node
544
+ scope = node.last.instance_of?(CharacterSet::IntersectedSequence) ? node.last : node
586
545
  exp << scope.expressions.pop
587
546
  nest(exp)
588
547
  end
@@ -609,26 +568,29 @@ class Regexp::Parser
609
568
  end
610
569
 
611
570
  def close_completed_character_set_range
612
- decrease_nesting if node.is_a?(CharacterSet::Range) && node.complete?
571
+ decrease_nesting if node.instance_of?(CharacterSet::Range) && node.complete?
613
572
  end
614
573
 
615
574
  def active_opts
616
575
  options_stack.last
617
576
  end
618
577
 
619
- # Assigns referenced expressions to refering expressions, e.g. if there is
578
+ # Assigns referenced expressions to referring expressions, e.g. if there is
620
579
  # an instance of Backreference::Number, its #referenced_expression is set to
621
580
  # the instance of Group::Capture that it refers to via its number.
622
581
  def assign_referenced_expressions
623
- targets = {}
624
- # find all referencable expressions
582
+ # find all referenceable and referring expressions
583
+ targets = { 0 => root }
584
+ referrers = []
625
585
  root.each_expression do |exp|
626
586
  exp.is_a?(Group::Capture) && targets[exp.identifier] = exp
587
+ referrers << exp if exp.referential?
627
588
  end
628
- # assign them to any refering expressions
629
- root.each_expression do |exp|
630
- exp.respond_to?(:reference) &&
631
- exp.referenced_expression = targets[exp.reference]
589
+ # assign reference expression to referring expressions
590
+ # (in a second iteration because there might be forward references)
591
+ referrers.each do |exp|
592
+ exp.referenced_expression = targets[exp.reference] ||
593
+ raise(ParserError, "Invalid reference #{exp.reference} at pos #{exp.ts}")
632
594
  end
633
595
  end
634
596
  end # class Regexp::Parser
@@ -0,0 +1,8 @@
1
+ class Regexp::Scanner
2
+ # Unexpected end of pattern
3
+ class PrematureEndError < ScannerError
4
+ def initialize(where = '')
5
+ super "Premature end of pattern at #{where}"
6
+ end
7
+ end
8
+ end
@@ -0,0 +1,6 @@
1
+ require_relative '../../../regexp_parser/error'
2
+
3
+ class Regexp::Scanner
4
+ # General scanner error (catch all)
5
+ class ScannerError < Regexp::Parser::Error; end
6
+ end
@@ -0,0 +1,63 @@
1
+ class Regexp::Scanner
2
+ # Base for all scanner validation errors
3
+ class ValidationError < ScannerError
4
+ # Centralizes and unifies the handling of validation related errors.
5
+ def self.for(type, problem, reason = nil)
6
+ types.fetch(type).new(problem, reason)
7
+ end
8
+
9
+ def self.types
10
+ @types ||= {
11
+ backref: InvalidBackrefError,
12
+ group: InvalidGroupError,
13
+ group_option: InvalidGroupOption,
14
+ posix_class: UnknownPosixClassError,
15
+ property: UnknownUnicodePropertyError,
16
+ sequence: InvalidSequenceError,
17
+ }
18
+ end
19
+ end
20
+
21
+ # Invalid sequence format. Used for escape sequences, mainly.
22
+ class InvalidSequenceError < ValidationError
23
+ def initialize(what = 'sequence', where = '')
24
+ super "Invalid #{what} at #{where}"
25
+ end
26
+ end
27
+
28
+ # Invalid group. Used for named groups.
29
+ class InvalidGroupError < ValidationError
30
+ def initialize(what, reason)
31
+ super "Invalid #{what}, #{reason}."
32
+ end
33
+ end
34
+
35
+ # Invalid group option. Used for inline options.
36
+ # TODO: should become InvalidGroupOptionError in v3.0.0 for consistency
37
+ class InvalidGroupOption < ValidationError
38
+ def initialize(option, text)
39
+ super "Invalid group option #{option} in #{text}"
40
+ end
41
+ end
42
+
43
+ # Invalid back reference. Used for name and number refs/calls.
44
+ class InvalidBackrefError < ValidationError
45
+ def initialize(what, reason)
46
+ super "Invalid back reference #{what}, #{reason}"
47
+ end
48
+ end
49
+
50
+ # The property name was not recognized by the scanner.
51
+ class UnknownUnicodePropertyError < ValidationError
52
+ def initialize(name, _)
53
+ super "Unknown unicode character property name #{name}"
54
+ end
55
+ end
56
+
57
+ # The POSIX class name was not recognized by the scanner.
58
+ class UnknownPosixClassError < ValidationError
59
+ def initialize(text, _)
60
+ super "Unknown POSIX class #{text}"
61
+ end
62
+ end
63
+ end