regexp_parser 2.6.0 → 2.10.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (57) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile +5 -5
  3. data/LICENSE +1 -1
  4. data/lib/regexp_parser/expression/base.rb +0 -7
  5. data/lib/regexp_parser/expression/classes/alternation.rb +1 -1
  6. data/lib/regexp_parser/expression/classes/backreference.rb +5 -10
  7. data/lib/regexp_parser/expression/classes/character_set/range.rb +2 -7
  8. data/lib/regexp_parser/expression/classes/character_set.rb +4 -8
  9. data/lib/regexp_parser/expression/classes/conditional.rb +2 -20
  10. data/lib/regexp_parser/expression/classes/escape_sequence.rb +21 -91
  11. data/lib/regexp_parser/expression/classes/free_space.rb +3 -1
  12. data/lib/regexp_parser/expression/classes/group.rb +0 -22
  13. data/lib/regexp_parser/expression/classes/keep.rb +1 -1
  14. data/lib/regexp_parser/expression/classes/posix_class.rb +5 -5
  15. data/lib/regexp_parser/expression/classes/unicode_property.rb +11 -11
  16. data/lib/regexp_parser/expression/methods/construct.rb +2 -4
  17. data/lib/regexp_parser/expression/methods/escape_sequence_char.rb +5 -0
  18. data/lib/regexp_parser/expression/methods/escape_sequence_codepoint.rb +68 -0
  19. data/lib/regexp_parser/expression/methods/match_length.rb +8 -4
  20. data/lib/regexp_parser/expression/methods/negative.rb +20 -0
  21. data/lib/regexp_parser/expression/methods/parts.rb +23 -0
  22. data/lib/regexp_parser/expression/methods/printing.rb +26 -0
  23. data/lib/regexp_parser/expression/methods/referenced_expressions.rb +28 -0
  24. data/lib/regexp_parser/expression/methods/tests.rb +40 -3
  25. data/lib/regexp_parser/expression/methods/traverse.rb +35 -19
  26. data/lib/regexp_parser/expression/quantifier.rb +30 -17
  27. data/lib/regexp_parser/expression/sequence.rb +5 -10
  28. data/lib/regexp_parser/expression/sequence_operation.rb +4 -9
  29. data/lib/regexp_parser/expression/shared.rb +37 -20
  30. data/lib/regexp_parser/expression/subexpression.rb +20 -15
  31. data/lib/regexp_parser/expression.rb +37 -31
  32. data/lib/regexp_parser/lexer.rb +76 -36
  33. data/lib/regexp_parser/parser.rb +107 -103
  34. data/lib/regexp_parser/scanner/errors/premature_end_error.rb +8 -0
  35. data/lib/regexp_parser/scanner/errors/scanner_error.rb +6 -0
  36. data/lib/regexp_parser/scanner/errors/validation_error.rb +63 -0
  37. data/lib/regexp_parser/scanner/properties/long.csv +29 -0
  38. data/lib/regexp_parser/scanner/properties/short.csv +3 -0
  39. data/lib/regexp_parser/scanner/property.rl +2 -2
  40. data/lib/regexp_parser/scanner/scanner.rl +101 -172
  41. data/lib/regexp_parser/scanner.rb +1171 -1365
  42. data/lib/regexp_parser/syntax/token/backreference.rb +3 -0
  43. data/lib/regexp_parser/syntax/token/character_set.rb +3 -0
  44. data/lib/regexp_parser/syntax/token/escape.rb +3 -1
  45. data/lib/regexp_parser/syntax/token/meta.rb +9 -2
  46. data/lib/regexp_parser/syntax/token/unicode_property.rb +35 -1
  47. data/lib/regexp_parser/syntax/token/virtual.rb +11 -0
  48. data/lib/regexp_parser/syntax/token.rb +13 -13
  49. data/lib/regexp_parser/syntax/version_lookup.rb +0 -8
  50. data/lib/regexp_parser/syntax/versions.rb +3 -1
  51. data/lib/regexp_parser/syntax.rb +1 -1
  52. data/lib/regexp_parser/version.rb +1 -1
  53. data/lib/regexp_parser.rb +6 -6
  54. data/regexp_parser.gemspec +5 -5
  55. metadata +17 -8
  56. data/CHANGELOG.md +0 -601
  57. data/README.md +0 -503
@@ -6,57 +6,75 @@ class Regexp::Lexer
6
6
 
7
7
  OPENING_TOKENS = %i[
8
8
  capture passive lookahead nlookahead lookbehind nlookbehind
9
- atomic options options_switch named absence
9
+ atomic options options_switch named absence open
10
10
  ].freeze
11
11
 
12
12
  CLOSING_TOKENS = %i[close].freeze
13
13
 
14
14
  CONDITION_TOKENS = %i[condition condition_close].freeze
15
15
 
16
- def self.lex(input, syntax = "ruby/#{RUBY_VERSION}", options: nil, &block)
17
- new.lex(input, syntax, options: options, &block)
16
+ def self.lex(input, syntax = nil, options: nil, collect_tokens: true, &block)
17
+ new.lex(input, syntax, options: options, collect_tokens: collect_tokens, &block)
18
18
  end
19
19
 
20
- def lex(input, syntax = "ruby/#{RUBY_VERSION}", options: nil, &block)
21
- syntax = Regexp::Syntax.for(syntax)
20
+ def lex(input, syntax = nil, options: nil, collect_tokens: true, &block)
21
+ syntax = syntax ? Regexp::Syntax.for(syntax) : Regexp::Syntax::CURRENT
22
22
 
23
+ self.block = block
24
+ self.collect_tokens = collect_tokens
23
25
  self.tokens = []
26
+ self.prev_token = nil
27
+ self.preprev_token = nil
24
28
  self.nesting = 0
25
29
  self.set_nesting = 0
26
30
  self.conditional_nesting = 0
27
31
  self.shift = 0
28
32
 
29
- last = nil
30
- Regexp::Scanner.scan(input, options: options) do |type, token, text, ts, te|
33
+ Regexp::Scanner.scan(input, options: options, collect_tokens: false) do |type, token, text, ts, te|
31
34
  type, token = *syntax.normalize(type, token)
32
35
  syntax.check! type, token
33
36
 
34
37
  ascend(type, token)
35
38
 
36
- if type == :quantifier and last
37
- break_literal(last) if last.type == :literal
38
- break_codepoint_list(last) if last.token == :codepoint_list
39
+ if (last = prev_token) &&
40
+ type == :quantifier &&
41
+ (
42
+ (last.type == :literal && (parts = break_literal(last))) ||
43
+ (last.token == :codepoint_list && (parts = break_codepoint_list(last)))
44
+ )
45
+ emit(parts[0])
46
+ last = parts[1]
39
47
  end
40
48
 
41
49
  current = Regexp::Token.new(type, token, text, ts + shift, te + shift,
42
50
  nesting, set_nesting, conditional_nesting)
43
51
 
44
- current = merge_condition(current) if type == :conditional and
45
- CONDITION_TOKENS.include?(token)
46
-
47
- last.next = current if last
48
- current.previous = last if last
52
+ if type == :conditional && CONDITION_TOKENS.include?(token)
53
+ current = merge_condition(current, last)
54
+ elsif last
55
+ last.next = current
56
+ current.previous = last
57
+ emit(last)
58
+ end
49
59
 
50
- tokens << current
51
- last = current
60
+ self.preprev_token = last
61
+ self.prev_token = current
52
62
 
53
63
  descend(type, token)
54
64
  end
55
65
 
56
- if block_given?
57
- tokens.map { |t| block.call(t) }
66
+ emit(prev_token) if prev_token
67
+
68
+ collect_tokens ? tokens : nil
69
+ end
70
+
71
+ def emit(token)
72
+ if block
73
+ # TODO: in v3.0.0, remove `collect_tokens:` kwarg and only collect w/o block
74
+ res = block.call(token)
75
+ tokens << res if collect_tokens
58
76
  else
59
- tokens
77
+ tokens << token
60
78
  end
61
79
  end
62
80
 
@@ -66,27 +84,37 @@ class Regexp::Lexer
66
84
 
67
85
  private
68
86
 
69
- attr_accessor :tokens, :nesting, :set_nesting, :conditional_nesting, :shift
87
+ attr_accessor :block,
88
+ :collect_tokens, :tokens, :prev_token, :preprev_token,
89
+ :nesting, :set_nesting, :conditional_nesting, :shift
70
90
 
71
91
  def ascend(type, token)
92
+ return unless CLOSING_TOKENS.include?(token)
93
+
72
94
  case type
73
95
  when :group, :assertion
74
- self.nesting = nesting - 1 if CLOSING_TOKENS.include?(token)
96
+ self.nesting = nesting - 1
75
97
  when :set
76
- self.set_nesting = set_nesting - 1 if token == :close
98
+ self.set_nesting = set_nesting - 1
77
99
  when :conditional
78
- self.conditional_nesting = conditional_nesting - 1 if token == :close
100
+ self.conditional_nesting = conditional_nesting - 1
101
+ else
102
+ raise "unhandled nesting type #{type}"
79
103
  end
80
104
  end
81
105
 
82
106
  def descend(type, token)
107
+ return unless OPENING_TOKENS.include?(token)
108
+
83
109
  case type
84
110
  when :group, :assertion
85
- self.nesting = nesting + 1 if OPENING_TOKENS.include?(token)
111
+ self.nesting = nesting + 1
86
112
  when :set
87
- self.set_nesting = set_nesting + 1 if token == :open
113
+ self.set_nesting = set_nesting + 1
88
114
  when :conditional
89
- self.conditional_nesting = conditional_nesting + 1 if token == :open
115
+ self.conditional_nesting = conditional_nesting + 1
116
+ else
117
+ raise "unhandled nesting type #{type}"
90
118
  end
91
119
  end
92
120
 
@@ -96,34 +124,46 @@ class Regexp::Lexer
96
124
  lead, last, _ = token.text.partition(/.\z/mu)
97
125
  return if lead.empty?
98
126
 
99
- tokens.pop
100
- tokens << Regexp::Token.new(:literal, :literal, lead,
127
+ token_1 = Regexp::Token.new(:literal, :literal, lead,
101
128
  token.ts, (token.te - last.length),
102
129
  nesting, set_nesting, conditional_nesting)
103
- tokens << Regexp::Token.new(:literal, :literal, last,
130
+ token_2 = Regexp::Token.new(:literal, :literal, last,
104
131
  (token.ts + lead.length), token.te,
105
132
  nesting, set_nesting, conditional_nesting)
133
+
134
+ token_1.previous = preprev_token
135
+ token_1.next = token_2
136
+ token_2.previous = token_1 # .next will be set by #lex
137
+ [token_1, token_2]
106
138
  end
107
139
 
140
+ # if a codepoint list is followed by a quantifier, that quantifier applies
141
+ # to the last codepoint, e.g. /\u{61 62 63}{3}/ =~ 'abccc'
142
+ # cf. #break_literal.
108
143
  def break_codepoint_list(token)
109
144
  lead, _, tail = token.text.rpartition(' ')
110
145
  return if lead.empty?
111
146
 
112
- tokens.pop
113
- tokens << Regexp::Token.new(:escape, :codepoint_list, lead + '}',
147
+ token_1 = Regexp::Token.new(:escape, :codepoint_list, lead + '}',
114
148
  token.ts, (token.te - tail.length),
115
149
  nesting, set_nesting, conditional_nesting)
116
- tokens << Regexp::Token.new(:escape, :codepoint_list, '\u{' + tail,
150
+ token_2 = Regexp::Token.new(:escape, :codepoint_list, '\u{' + tail,
117
151
  (token.ts + lead.length + 1), (token.te + 3),
118
152
  nesting, set_nesting, conditional_nesting)
119
153
 
120
154
  self.shift = shift + 3 # one space less, but extra \, u, {, and }
155
+
156
+ token_1.previous = preprev_token
157
+ token_1.next = token_2
158
+ token_2.previous = token_1 # .next will be set by #lex
159
+ [token_1, token_2]
121
160
  end
122
161
 
123
- def merge_condition(current)
124
- last = tokens.pop
125
- Regexp::Token.new(:conditional, :condition, last.text + current.text,
162
+ def merge_condition(current, last)
163
+ token = Regexp::Token.new(:conditional, :condition, last.text + current.text,
126
164
  last.ts, current.te, nesting, set_nesting, conditional_nesting)
165
+ token.previous = preprev_token # .next will be set by #lex
166
+ token
127
167
  end
128
168
 
129
169
  end # class Regexp::Lexer
@@ -1,5 +1,5 @@
1
- require 'regexp_parser/error'
2
- require 'regexp_parser/expression'
1
+ require_relative 'error'
2
+ require_relative 'expression'
3
3
 
4
4
  class Regexp::Parser
5
5
  include Regexp::Expression
@@ -18,11 +18,11 @@ class Regexp::Parser
18
18
  end
19
19
  end
20
20
 
21
- def self.parse(input, syntax = "ruby/#{RUBY_VERSION}", options: nil, &block)
21
+ def self.parse(input, syntax = nil, options: nil, &block)
22
22
  new.parse(input, syntax, options: options, &block)
23
23
  end
24
24
 
25
- def parse(input, syntax = "ruby/#{RUBY_VERSION}", options: nil, &block)
25
+ def parse(input, syntax = nil, options: nil, &block)
26
26
  root = Root.construct(options: extract_options(input, options))
27
27
 
28
28
  self.root = root
@@ -35,7 +35,7 @@ class Regexp::Parser
35
35
 
36
36
  self.captured_group_counts = Hash.new(0)
37
37
 
38
- Regexp::Lexer.scan(input, syntax, options: options) do |token|
38
+ Regexp::Lexer.scan(input, syntax, options: options, collect_tokens: false) do |token|
39
39
  parse_token(token)
40
40
  end
41
41
 
@@ -232,7 +232,7 @@ class Regexp::Parser
232
232
  node << Backreference::NameRecursionLevel.new(token, active_opts)
233
233
  when :name_call
234
234
  node << Backreference::NameCall.new(token, active_opts)
235
- when :number, :number_ref
235
+ when :number, :number_ref # TODO: split in v3.0.0
236
236
  node << Backreference::Number.new(token, active_opts)
237
237
  when :number_recursion_ref
238
238
  node << Backreference::NumberRecursionLevel.new(token, active_opts).tap do |exp|
@@ -272,9 +272,9 @@ class Regexp::Parser
272
272
  nest_conditional(Conditional::Expression.new(token, active_opts))
273
273
  when :condition
274
274
  conditional_nesting.last.condition = Conditional::Condition.new(token, active_opts)
275
- conditional_nesting.last.add_sequence(active_opts)
275
+ conditional_nesting.last.add_sequence(active_opts, { ts: token.te })
276
276
  when :separator
277
- conditional_nesting.last.add_sequence(active_opts)
277
+ conditional_nesting.last.add_sequence(active_opts, { ts: token.te })
278
278
  self.node = conditional_nesting.last.branches.last
279
279
  when :close
280
280
  conditional_nesting.pop
@@ -322,6 +322,7 @@ class Regexp::Parser
322
322
 
323
323
  when :control
324
324
  if token.text =~ /\A(?:\\C-\\M|\\c\\M)/
325
+ # TODO: emit :meta_control_sequence token in v3.0.0
325
326
  node << EscapeSequence::MetaControl.new(token, active_opts)
326
327
  else
327
328
  node << EscapeSequence::Control.new(token, active_opts)
@@ -329,6 +330,7 @@ class Regexp::Parser
329
330
 
330
331
  when :meta_sequence
331
332
  if token.text =~ /\A\\M-\\[Cc]/
333
+ # TODO: emit :meta_control_sequence token in v3.0.0
332
334
  node << EscapeSequence::MetaControl.new(token, active_opts)
333
335
  else
334
336
  node << EscapeSequence::Meta.new(token, active_opts)
@@ -349,11 +351,7 @@ class Regexp::Parser
349
351
  when :comment
350
352
  node << Comment.new(token, active_opts)
351
353
  when :whitespace
352
- if node.last.is_a?(WhiteSpace)
353
- node.last.merge(WhiteSpace.new(token, active_opts))
354
- else
355
- node << WhiteSpace.new(token, active_opts)
356
- end
354
+ node << WhiteSpace.new(token, active_opts)
357
355
  else
358
356
  raise UnknownTokenError.new('FreeSpace', token)
359
357
  end
@@ -379,98 +377,99 @@ class Regexp::Parser
379
377
  end
380
378
 
381
379
  def sequence_operation(klass, token)
382
- unless node.is_a?(klass)
380
+ unless node.instance_of?(klass)
383
381
  operator = klass.new(token, active_opts)
384
- sequence = operator.add_sequence(active_opts)
382
+ sequence = operator.add_sequence(active_opts, { ts: token.ts })
385
383
  sequence.expressions = node.expressions
386
384
  node.expressions = []
387
385
  nest(operator)
388
386
  end
389
- node.add_sequence(active_opts)
387
+ node.add_sequence(active_opts, { ts: token.te })
390
388
  end
391
389
 
392
390
  def posixclass(token)
393
391
  node << PosixClass.new(token, active_opts)
394
392
  end
395
393
 
396
- include Regexp::Expression::UnicodeProperty
397
- UPTokens = Regexp::Syntax::Token::UnicodeProperty
394
+ UP = Regexp::Expression::Property
395
+ UPTokens = Regexp::Syntax::Token::Property
398
396
 
399
397
  def property(token)
400
398
  case token.token
401
- when :alnum; node << Alnum.new(token, active_opts)
402
- when :alpha; node << Alpha.new(token, active_opts)
403
- when :ascii; node << Ascii.new(token, active_opts)
404
- when :blank; node << Blank.new(token, active_opts)
405
- when :cntrl; node << Cntrl.new(token, active_opts)
406
- when :digit; node << Digit.new(token, active_opts)
407
- when :graph; node << Graph.new(token, active_opts)
408
- when :lower; node << Lower.new(token, active_opts)
409
- when :print; node << Print.new(token, active_opts)
410
- when :punct; node << Punct.new(token, active_opts)
411
- when :space; node << Space.new(token, active_opts)
412
- when :upper; node << Upper.new(token, active_opts)
413
- when :word; node << Word.new(token, active_opts)
414
- when :xdigit; node << Xdigit.new(token, active_opts)
415
- when :xposixpunct; node << XPosixPunct.new(token, active_opts)
399
+ when :alnum; node << UP::Alnum.new(token, active_opts)
400
+ when :alpha; node << UP::Alpha.new(token, active_opts)
401
+ when :ascii; node << UP::Ascii.new(token, active_opts)
402
+ when :blank; node << UP::Blank.new(token, active_opts)
403
+ when :cntrl; node << UP::Cntrl.new(token, active_opts)
404
+ when :digit; node << UP::Digit.new(token, active_opts)
405
+ when :graph; node << UP::Graph.new(token, active_opts)
406
+ when :lower; node << UP::Lower.new(token, active_opts)
407
+ when :print; node << UP::Print.new(token, active_opts)
408
+ when :punct; node << UP::Punct.new(token, active_opts)
409
+ when :space; node << UP::Space.new(token, active_opts)
410
+ when :upper; node << UP::Upper.new(token, active_opts)
411
+ when :word; node << UP::Word.new(token, active_opts)
412
+ when :xdigit; node << UP::Xdigit.new(token, active_opts)
413
+ when :xposixpunct; node << UP::XPosixPunct.new(token, active_opts)
416
414
 
417
415
  # only in Oniguruma (old rubies)
418
- when :newline; node << Newline.new(token, active_opts)
419
-
420
- when :any; node << Any.new(token, active_opts)
421
- when :assigned; node << Assigned.new(token, active_opts)
422
-
423
- when :letter; node << Letter::Any.new(token, active_opts)
424
- when :cased_letter; node << Letter::Cased.new(token, active_opts)
425
- when :uppercase_letter; node << Letter::Uppercase.new(token, active_opts)
426
- when :lowercase_letter; node << Letter::Lowercase.new(token, active_opts)
427
- when :titlecase_letter; node << Letter::Titlecase.new(token, active_opts)
428
- when :modifier_letter; node << Letter::Modifier.new(token, active_opts)
429
- when :other_letter; node << Letter::Other.new(token, active_opts)
430
-
431
- when :mark; node << Mark::Any.new(token, active_opts)
432
- when :combining_mark; node << Mark::Combining.new(token, active_opts)
433
- when :nonspacing_mark; node << Mark::Nonspacing.new(token, active_opts)
434
- when :spacing_mark; node << Mark::Spacing.new(token, active_opts)
435
- when :enclosing_mark; node << Mark::Enclosing.new(token, active_opts)
436
-
437
- when :number; node << Number::Any.new(token, active_opts)
438
- when :decimal_number; node << Number::Decimal.new(token, active_opts)
439
- when :letter_number; node << Number::Letter.new(token, active_opts)
440
- when :other_number; node << Number::Other.new(token, active_opts)
441
-
442
- when :punctuation; node << Punctuation::Any.new(token, active_opts)
443
- when :connector_punctuation; node << Punctuation::Connector.new(token, active_opts)
444
- when :dash_punctuation; node << Punctuation::Dash.new(token, active_opts)
445
- when :open_punctuation; node << Punctuation::Open.new(token, active_opts)
446
- when :close_punctuation; node << Punctuation::Close.new(token, active_opts)
447
- when :initial_punctuation; node << Punctuation::Initial.new(token, active_opts)
448
- when :final_punctuation; node << Punctuation::Final.new(token, active_opts)
449
- when :other_punctuation; node << Punctuation::Other.new(token, active_opts)
450
-
451
- when :separator; node << Separator::Any.new(token, active_opts)
452
- when :space_separator; node << Separator::Space.new(token, active_opts)
453
- when :line_separator; node << Separator::Line.new(token, active_opts)
454
- when :paragraph_separator; node << Separator::Paragraph.new(token, active_opts)
455
-
456
- when :symbol; node << Symbol::Any.new(token, active_opts)
457
- when :math_symbol; node << Symbol::Math.new(token, active_opts)
458
- when :currency_symbol; node << Symbol::Currency.new(token, active_opts)
459
- when :modifier_symbol; node << Symbol::Modifier.new(token, active_opts)
460
- when :other_symbol; node << Symbol::Other.new(token, active_opts)
461
-
462
- when :other; node << Codepoint::Any.new(token, active_opts)
463
- when :control; node << Codepoint::Control.new(token, active_opts)
464
- when :format; node << Codepoint::Format.new(token, active_opts)
465
- when :surrogate; node << Codepoint::Surrogate.new(token, active_opts)
466
- when :private_use; node << Codepoint::PrivateUse.new(token, active_opts)
467
- when :unassigned; node << Codepoint::Unassigned.new(token, active_opts)
468
-
469
- when *UPTokens::Age; node << Age.new(token, active_opts)
470
- when *UPTokens::Derived; node << Derived.new(token, active_opts)
471
- when *UPTokens::Emoji; node << Emoji.new(token, active_opts)
472
- when *UPTokens::Script; node << Script.new(token, active_opts)
473
- when *UPTokens::UnicodeBlock; node << Block.new(token, active_opts)
416
+ when :newline; node << UP::Newline.new(token, active_opts)
417
+
418
+ when :any; node << UP::Any.new(token, active_opts)
419
+ when :assigned; node << UP::Assigned.new(token, active_opts)
420
+
421
+ when :letter; node << UP::Letter::Any.new(token, active_opts)
422
+ when :cased_letter; node << UP::Letter::Cased.new(token, active_opts)
423
+ when :uppercase_letter; node << UP::Letter::Uppercase.new(token, active_opts)
424
+ when :lowercase_letter; node << UP::Letter::Lowercase.new(token, active_opts)
425
+ when :titlecase_letter; node << UP::Letter::Titlecase.new(token, active_opts)
426
+ when :modifier_letter; node << UP::Letter::Modifier.new(token, active_opts)
427
+ when :other_letter; node << UP::Letter::Other.new(token, active_opts)
428
+
429
+ when :mark; node << UP::Mark::Any.new(token, active_opts)
430
+ when :combining_mark; node << UP::Mark::Combining.new(token, active_opts)
431
+ when :nonspacing_mark; node << UP::Mark::Nonspacing.new(token, active_opts)
432
+ when :spacing_mark; node << UP::Mark::Spacing.new(token, active_opts)
433
+ when :enclosing_mark; node << UP::Mark::Enclosing.new(token, active_opts)
434
+
435
+ when :number; node << UP::Number::Any.new(token, active_opts)
436
+ when :decimal_number; node << UP::Number::Decimal.new(token, active_opts)
437
+ when :letter_number; node << UP::Number::Letter.new(token, active_opts)
438
+ when :other_number; node << UP::Number::Other.new(token, active_opts)
439
+
440
+ when :punctuation; node << UP::Punctuation::Any.new(token, active_opts)
441
+ when :connector_punctuation; node << UP::Punctuation::Connector.new(token, active_opts)
442
+ when :dash_punctuation; node << UP::Punctuation::Dash.new(token, active_opts)
443
+ when :open_punctuation; node << UP::Punctuation::Open.new(token, active_opts)
444
+ when :close_punctuation; node << UP::Punctuation::Close.new(token, active_opts)
445
+ when :initial_punctuation; node << UP::Punctuation::Initial.new(token, active_opts)
446
+ when :final_punctuation; node << UP::Punctuation::Final.new(token, active_opts)
447
+ when :other_punctuation; node << UP::Punctuation::Other.new(token, active_opts)
448
+
449
+ when :separator; node << UP::Separator::Any.new(token, active_opts)
450
+ when :space_separator; node << UP::Separator::Space.new(token, active_opts)
451
+ when :line_separator; node << UP::Separator::Line.new(token, active_opts)
452
+ when :paragraph_separator; node << UP::Separator::Paragraph.new(token, active_opts)
453
+
454
+ when :symbol; node << UP::Symbol::Any.new(token, active_opts)
455
+ when :math_symbol; node << UP::Symbol::Math.new(token, active_opts)
456
+ when :currency_symbol; node << UP::Symbol::Currency.new(token, active_opts)
457
+ when :modifier_symbol; node << UP::Symbol::Modifier.new(token, active_opts)
458
+ when :other_symbol; node << UP::Symbol::Other.new(token, active_opts)
459
+
460
+ when :other; node << UP::Codepoint::Any.new(token, active_opts)
461
+ when :control; node << UP::Codepoint::Control.new(token, active_opts)
462
+ when :format; node << UP::Codepoint::Format.new(token, active_opts)
463
+ when :surrogate; node << UP::Codepoint::Surrogate.new(token, active_opts)
464
+ when :private_use; node << UP::Codepoint::PrivateUse.new(token, active_opts)
465
+ when :unassigned; node << UP::Codepoint::Unassigned.new(token, active_opts)
466
+
467
+ when *UPTokens::Age; node << UP::Age.new(token, active_opts)
468
+ when *UPTokens::Derived; node << UP::Derived.new(token, active_opts)
469
+ when *UPTokens::Emoji; node << UP::Emoji.new(token, active_opts)
470
+ when *UPTokens::Enumerated; node << UP::Enumerated.new(token, active_opts)
471
+ when *UPTokens::Script; node << UP::Script.new(token, active_opts)
472
+ when *UPTokens::UnicodeBlock; node << UP::Block.new(token, active_opts)
474
473
 
475
474
  else
476
475
  raise UnknownTokenError.new('UnicodeProperty', token)
@@ -478,8 +477,7 @@ class Regexp::Parser
478
477
  end
479
478
 
480
479
  def quantifier(token)
481
- target_node = node.expressions.reverse.find { |exp| !exp.is_a?(FreeSpace) }
482
- target_node or raise ParserError, "No valid target found for '#{token.text}'"
480
+ target_node = node.extract_quantifier_target(token.text)
483
481
 
484
482
  # in case of chained quantifiers, wrap target in an implicit passive group
485
483
  # description of the problem: https://github.com/ammar/regexp_parser/issues/3
@@ -527,6 +525,8 @@ class Regexp::Parser
527
525
  end
528
526
 
529
527
  def open_set(token)
528
+ # TODO: this and Quantifier are the only cases where Expression#token
529
+ # does not match the scanner/lexer output. Fix in v3.0.0.
530
530
  token.token = :character
531
531
  nest(CharacterSet.new(token, active_opts))
532
532
  end
@@ -541,7 +541,7 @@ class Regexp::Parser
541
541
 
542
542
  def range(token)
543
543
  exp = CharacterSet::Range.new(token, active_opts)
544
- scope = node.last.is_a?(CharacterSet::IntersectedSequence) ? node.last : node
544
+ scope = node.last.instance_of?(CharacterSet::IntersectedSequence) ? node.last : node
545
545
  exp << scope.expressions.pop
546
546
  nest(exp)
547
547
  end
@@ -568,28 +568,32 @@ class Regexp::Parser
568
568
  end
569
569
 
570
570
  def close_completed_character_set_range
571
- decrease_nesting if node.is_a?(CharacterSet::Range) && node.complete?
571
+ decrease_nesting if node.instance_of?(CharacterSet::Range) && node.complete?
572
572
  end
573
573
 
574
574
  def active_opts
575
575
  options_stack.last
576
576
  end
577
577
 
578
- # Assigns referenced expressions to refering expressions, e.g. if there is
578
+ # Assigns referenced expressions to referring expressions, e.g. if there is
579
579
  # an instance of Backreference::Number, its #referenced_expression is set to
580
580
  # the instance of Group::Capture that it refers to via its number.
581
581
  def assign_referenced_expressions
582
- # find all referencable expressions
583
- targets = { 0 => root }
582
+ # find all referenceable and referring expressions
583
+ targets = { 0 => [root] }
584
+ referrers = []
584
585
  root.each_expression do |exp|
585
- exp.is_a?(Group::Capture) && targets[exp.identifier] = exp
586
+ if exp.referential?
587
+ referrers << exp
588
+ elsif exp.is_a?(Group::Capture)
589
+ (targets[exp.identifier] ||= []) << exp
590
+ end
586
591
  end
587
- # assign them to any refering expressions
588
- root.each_expression do |exp|
589
- next unless exp.respond_to?(:reference)
590
-
591
- exp.referenced_expression = targets[exp.reference] ||
592
- raise(ParserError, "Invalid reference: #{exp.reference}")
592
+ # assign referenced expressions to referring expressions
593
+ # (in a second iteration because there might be forward references)
594
+ referrers.each do |exp|
595
+ exp.referenced_expressions = targets[exp.reference] ||
596
+ raise(ParserError, "Invalid reference #{exp.reference} at pos #{exp.ts}")
593
597
  end
594
598
  end
595
599
  end # class Regexp::Parser
@@ -0,0 +1,8 @@
1
+ class Regexp::Scanner
2
+ # Unexpected end of pattern
3
+ class PrematureEndError < ScannerError
4
+ def initialize(where = '')
5
+ super "Premature end of pattern at #{where}"
6
+ end
7
+ end
8
+ end
@@ -0,0 +1,6 @@
1
+ require_relative '../../../regexp_parser/error'
2
+
3
+ class Regexp::Scanner
4
+ # General scanner error (catch all)
5
+ class ScannerError < Regexp::Parser::Error; end
6
+ end
@@ -0,0 +1,63 @@
1
+ class Regexp::Scanner
2
+ # Base for all scanner validation errors
3
+ class ValidationError < ScannerError
4
+ # Centralizes and unifies the handling of validation related errors.
5
+ def self.for(type, problem, reason = nil)
6
+ types.fetch(type).new(problem, reason)
7
+ end
8
+
9
+ def self.types
10
+ @types ||= {
11
+ backref: InvalidBackrefError,
12
+ group: InvalidGroupError,
13
+ group_option: InvalidGroupOption,
14
+ posix_class: UnknownPosixClassError,
15
+ property: UnknownUnicodePropertyError,
16
+ sequence: InvalidSequenceError,
17
+ }
18
+ end
19
+ end
20
+
21
+ # Invalid sequence format. Used for escape sequences, mainly.
22
+ class InvalidSequenceError < ValidationError
23
+ def initialize(what = 'sequence', where = '')
24
+ super "Invalid #{what} at #{where}"
25
+ end
26
+ end
27
+
28
+ # Invalid group. Used for named groups.
29
+ class InvalidGroupError < ValidationError
30
+ def initialize(what, reason)
31
+ super "Invalid #{what}, #{reason}."
32
+ end
33
+ end
34
+
35
+ # Invalid group option. Used for inline options.
36
+ # TODO: should become InvalidGroupOptionError in v3.0.0 for consistency
37
+ class InvalidGroupOption < ValidationError
38
+ def initialize(option, text)
39
+ super "Invalid group option #{option} in #{text}"
40
+ end
41
+ end
42
+
43
+ # Invalid back reference. Used for name and number refs/calls.
44
+ class InvalidBackrefError < ValidationError
45
+ def initialize(what, reason)
46
+ super "Invalid back reference #{what}, #{reason}"
47
+ end
48
+ end
49
+
50
+ # The property name was not recognized by the scanner.
51
+ class UnknownUnicodePropertyError < ValidationError
52
+ def initialize(name, _)
53
+ super "Unknown unicode character property name #{name}"
54
+ end
55
+ end
56
+
57
+ # The POSIX class name was not recognized by the scanner.
58
+ class UnknownPosixClassError < ValidationError
59
+ def initialize(text, _)
60
+ super "Unknown POSIX class #{text}"
61
+ end
62
+ end
63
+ end