regexp_parser 2.7.0 → 2.8.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (43) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +55 -3
  3. data/Gemfile +2 -2
  4. data/README.md +32 -29
  5. data/lib/regexp_parser/expression/base.rb +0 -7
  6. data/lib/regexp_parser/expression/classes/alternation.rb +1 -1
  7. data/lib/regexp_parser/expression/classes/backreference.rb +4 -6
  8. data/lib/regexp_parser/expression/classes/character_set/range.rb +2 -7
  9. data/lib/regexp_parser/expression/classes/character_set.rb +3 -4
  10. data/lib/regexp_parser/expression/classes/conditional.rb +2 -14
  11. data/lib/regexp_parser/expression/classes/escape_sequence.rb +3 -1
  12. data/lib/regexp_parser/expression/classes/free_space.rb +3 -1
  13. data/lib/regexp_parser/expression/classes/group.rb +0 -22
  14. data/lib/regexp_parser/expression/classes/posix_class.rb +5 -1
  15. data/lib/regexp_parser/expression/classes/unicode_property.rb +5 -2
  16. data/lib/regexp_parser/expression/methods/construct.rb +2 -4
  17. data/lib/regexp_parser/expression/methods/parts.rb +23 -0
  18. data/lib/regexp_parser/expression/methods/printing.rb +26 -0
  19. data/lib/regexp_parser/expression/methods/tests.rb +40 -3
  20. data/lib/regexp_parser/expression/methods/traverse.rb +33 -20
  21. data/lib/regexp_parser/expression/quantifier.rb +30 -17
  22. data/lib/regexp_parser/expression/sequence.rb +5 -9
  23. data/lib/regexp_parser/expression/sequence_operation.rb +4 -9
  24. data/lib/regexp_parser/expression/shared.rb +37 -24
  25. data/lib/regexp_parser/expression/subexpression.rb +20 -18
  26. data/lib/regexp_parser/expression.rb +2 -0
  27. data/lib/regexp_parser/lexer.rb +15 -7
  28. data/lib/regexp_parser/parser.rb +85 -86
  29. data/lib/regexp_parser/scanner/errors/premature_end_error.rb +8 -0
  30. data/lib/regexp_parser/scanner/errors/scanner_error.rb +6 -0
  31. data/lib/regexp_parser/scanner/errors/validation_error.rb +63 -0
  32. data/lib/regexp_parser/scanner/mapping.rb +89 -0
  33. data/lib/regexp_parser/scanner/property.rl +1 -1
  34. data/lib/regexp_parser/scanner/scanner.rl +35 -129
  35. data/lib/regexp_parser/scanner.rb +1084 -1303
  36. data/lib/regexp_parser/syntax/token/backreference.rb +3 -0
  37. data/lib/regexp_parser/syntax/token/character_set.rb +3 -0
  38. data/lib/regexp_parser/syntax/token/escape.rb +3 -1
  39. data/lib/regexp_parser/syntax/token/meta.rb +9 -2
  40. data/lib/regexp_parser/syntax/token/unicode_property.rb +3 -0
  41. data/lib/regexp_parser/syntax/token/virtual.rb +11 -0
  42. data/lib/regexp_parser/version.rb +1 -1
  43. metadata +9 -2
@@ -8,14 +8,10 @@ module Regexp::Expression
8
8
 
9
9
  MODES = %i[greedy possessive reluctant]
10
10
 
11
- attr_reader :min, :max, :mode
12
-
13
11
  def initialize(*args)
14
12
  deprecated_old_init(*args) and return if args.count == 4 || args.count == 5
15
13
 
16
14
  init_from_token_and_options(*args)
17
- @mode = (token.to_s[/greedy|reluctant|possessive/] || :greedy).to_sym
18
- @min, @max = minmax
19
15
  # TODO: remove in v3.0.0, stop removing parts of #token (?)
20
16
  self.token = token.to_s.sub(/_(greedy|possessive|reluctant)/, '').to_sym
21
17
  end
@@ -39,9 +35,21 @@ module Regexp::Expression
39
35
  end
40
36
  alias :lazy? :reluctant?
41
37
 
38
+ def min
39
+ derived_data[:min]
40
+ end
41
+
42
+ def max
43
+ derived_data[:max]
44
+ end
45
+
46
+ def mode
47
+ derived_data[:mode]
48
+ end
49
+
42
50
  private
43
51
 
44
- def deprecated_old_init(token, text, min, max, mode = :greedy)
52
+ def deprecated_old_init(token, text, _min, _max, _mode = :greedy)
45
53
  warn "Calling `Expression::Base#quantify` or `#{self.class}.new` with 4+ arguments "\
46
54
  "is deprecated.\nIt will no longer be supported in regexp_parser v3.0.0.\n"\
47
55
  "Please pass a Regexp::Token instead, e.g. replace `token, text, min, max, mode` "\
@@ -51,20 +59,25 @@ module Regexp::Expression
51
59
  "This is consistent with how Expression::Base instances are created. "
52
60
  @token = token
53
61
  @text = text
54
- @min = min
55
- @max = max
56
- @mode = mode
57
62
  end
58
63
 
59
- def minmax
60
- case token
61
- when /zero_or_one/ then [0, 1]
62
- when /zero_or_more/ then [0, -1]
63
- when /one_or_more/ then [1, -1]
64
- when :interval
65
- int_min = text[/\{(\d*)/, 1]
66
- int_max = text[/,?(\d*)\}/, 1]
67
- [int_min.to_i, (int_max.empty? ? -1 : int_max.to_i)]
64
+ def derived_data
65
+ @derived_data ||= begin
66
+ min, max =
67
+ case text[0]
68
+ when '?'; [0, 1]
69
+ when '*'; [0, -1]
70
+ when '+'; [1, -1]
71
+ else
72
+ int_min = text[/\{(\d*)/, 1]
73
+ int_max = text[/,?(\d*)\}/, 1]
74
+ [int_min.to_i, (int_max.empty? ? -1 : int_max.to_i)]
75
+ end
76
+
77
+ mod = text[/.([?+])/, 1]
78
+ mode = (mod == '?' && :reluctant) || (mod == '+' && :possessive) || :greedy
79
+
80
+ { min: min, max: max, mode: mode }
68
81
  end
69
82
  end
70
83
  end
@@ -12,6 +12,7 @@ module Regexp::Expression
12
12
  level: exp.level,
13
13
  set_level: exp.set_level,
14
14
  conditional_level: params[:conditional_level] || exp.conditional_level,
15
+ ts: params[:ts],
15
16
  )
16
17
  sequence.options = active_opts
17
18
  exp.expressions << sequence
@@ -19,17 +20,12 @@ module Regexp::Expression
19
20
  end
20
21
  end
21
22
 
22
- def starts_at
23
- expressions.first.starts_at
23
+ def ts
24
+ (head = expressions.first) ? head.ts : @ts
24
25
  end
25
- alias :ts :starts_at
26
26
 
27
- def quantify(*args)
28
- target = expressions.reverse.find { |exp| !exp.is_a?(FreeSpace) }
29
- target or raise Regexp::Parser::Error,
30
- "No valid target found for '#{text}' quantifier"
31
-
32
- target.quantify(*args)
27
+ def quantify(token, *args)
28
+ extract_quantifier_target(token.text).quantify(token, *args)
33
29
  end
34
30
  end
35
31
  end
@@ -5,21 +5,16 @@ module Regexp::Expression
5
5
  alias :operands :expressions
6
6
  alias :operator :text
7
7
 
8
- def starts_at
9
- expressions.first.starts_at
8
+ def ts
9
+ (head = expressions.first) ? head.ts : @ts
10
10
  end
11
- alias :ts :starts_at
12
11
 
13
12
  def <<(exp)
14
13
  expressions.last << exp
15
14
  end
16
15
 
17
- def add_sequence(active_opts = {})
18
- self.class::OPERAND.add_to(self, {}, active_opts)
19
- end
20
-
21
- def parts
22
- intersperse(expressions, text.dup)
16
+ def add_sequence(active_opts = {}, params = { ts: 0 })
17
+ self.class::OPERAND.add_to(self, params, active_opts)
23
18
  end
24
19
  end
25
20
  end
@@ -8,7 +8,8 @@ module Regexp::Expression
8
8
 
9
9
  attr_accessor :type, :token, :text, :ts, :te,
10
10
  :level, :set_level, :conditional_level,
11
- :options
11
+ :options, :parent,
12
+ :custom_to_s_handling, :pre_quantifier_decorations
12
13
 
13
14
  attr_reader :nesting_level, :quantifier
14
15
  end
@@ -32,6 +33,10 @@ module Regexp::Expression
32
33
  self.text = orig.text.dup if orig.text
33
34
  self.options = orig.options.dup if orig.options
34
35
  self.quantifier = orig.quantifier.clone if orig.quantifier
36
+ self.parent = nil # updated by Subexpression#initialize_copy
37
+ if orig.pre_quantifier_decorations
38
+ self.pre_quantifier_decorations = orig.pre_quantifier_decorations.map(&:dup)
39
+ end
35
40
  super
36
41
  end
37
42
 
@@ -39,35 +44,51 @@ module Regexp::Expression
39
44
  ts
40
45
  end
41
46
 
47
+ def ends_at(include_quantifier = true)
48
+ ts + (include_quantifier ? full_length : base_length)
49
+ end
50
+
42
51
  def base_length
43
52
  to_s(:base).length
44
53
  end
45
54
 
46
55
  def full_length
47
- to_s.length
48
- end
49
-
56
+ to_s(:original).length
57
+ end
58
+
59
+ # #to_s reproduces the original source, as an unparser would.
60
+ #
61
+ # It takes an optional format argument.
62
+ #
63
+ # Example:
64
+ #
65
+ # lit = Regexp::Parser.parse(/a +/x)[0]
66
+ #
67
+ # lit.to_s # => 'a+' # default; with quantifier
68
+ # lit.to_s(:full) # => 'a+' # default; with quantifier
69
+ # lit.to_s(:base) # => 'a' # without quantifier
70
+ # lit.to_s(:original) # => 'a +' # with quantifier AND intermittent decorations
71
+ #
50
72
  def to_s(format = :full)
51
- "#{parts.join}#{quantifier_affix(format)}"
73
+ base = parts.each_with_object(''.dup) do |part, buff|
74
+ if part.instance_of?(String)
75
+ buff << part
76
+ elsif !part.custom_to_s_handling
77
+ buff << part.to_s(:original)
78
+ end
79
+ end
80
+ "#{base}#{pre_quantifier_decoration(format)}#{quantifier_affix(format)}"
52
81
  end
53
82
  alias :to_str :to_s
54
83
 
55
- def parts
56
- [text.dup]
84
+ def pre_quantifier_decoration(expression_format = :original)
85
+ pre_quantifier_decorations.to_a.join if expression_format == :original
57
86
  end
58
87
 
59
- def quantifier_affix(expression_format)
88
+ def quantifier_affix(expression_format = :full)
60
89
  quantifier.to_s if quantified? && expression_format != :base
61
90
  end
62
91
 
63
- def quantified?
64
- !quantifier.nil?
65
- end
66
-
67
- def optional?
68
- quantified? && quantifier.min == 0
69
- end
70
-
71
92
  def offset
72
93
  [starts_at, full_length]
73
94
  end
@@ -76,14 +97,6 @@ module Regexp::Expression
76
97
  '@%d+%d' % offset
77
98
  end
78
99
 
79
- def terminal?
80
- true # overridden to be false in Expression::Subexpression
81
- end
82
-
83
- def referential?
84
- false # overridden to be true e.g. in Expression::Backreference::Base
85
- end
86
-
87
100
  def nesting_level=(lvl)
88
101
  @nesting_level = lvl
89
102
  quantifier && quantifier.nesting_level = lvl
@@ -11,16 +11,15 @@ module Regexp::Expression
11
11
 
12
12
  # Override base method to clone the expressions as well.
13
13
  def initialize_copy(orig)
14
- self.expressions = orig.expressions.map(&:clone)
14
+ self.expressions = orig.expressions.map do |exp|
15
+ exp.clone.tap { |copy| copy.parent = self }
16
+ end
15
17
  super
16
18
  end
17
19
 
18
20
  def <<(exp)
19
- if exp.is_a?(WhiteSpace) && last && last.is_a?(WhiteSpace)
20
- last.merge(exp)
21
- else
22
- expressions << exp
23
- end
21
+ exp.parent = self
22
+ expressions << exp
24
23
  end
25
24
 
26
25
  %w[[] at each empty? fetch index join last length values_at].each do |method|
@@ -38,11 +37,7 @@ module Regexp::Expression
38
37
  end
39
38
 
40
39
  def te
41
- ts + to_s.length
42
- end
43
-
44
- def parts
45
- expressions
40
+ ts + base_length
46
41
  end
47
42
 
48
43
  def to_h
@@ -52,14 +47,21 @@ module Regexp::Expression
52
47
  )
53
48
  end
54
49
 
55
- def terminal?
56
- false
57
- end
58
-
59
- private
50
+ def extract_quantifier_target(quantifier_description)
51
+ pre_quantifier_decorations = []
52
+ target = expressions.reverse.find do |exp|
53
+ if exp.decorative?
54
+ exp.custom_to_s_handling = true
55
+ pre_quantifier_decorations << exp.text
56
+ next
57
+ end
58
+ exp
59
+ end
60
+ target or raise Regexp::Parser::ParserError,
61
+ "No valid target found for '#{quantifier_description}' quantifier"
60
62
 
61
- def intersperse(expressions, separator)
62
- expressions.flat_map { |exp| [exp, separator] }.slice(0...-1)
63
+ target.pre_quantifier_decorations = pre_quantifier_decorations
64
+ target
63
65
  end
64
66
  end
65
67
  end
@@ -29,6 +29,8 @@ require 'regexp_parser/expression/methods/human_name'
29
29
  require 'regexp_parser/expression/methods/match'
30
30
  require 'regexp_parser/expression/methods/match_length'
31
31
  require 'regexp_parser/expression/methods/options'
32
+ require 'regexp_parser/expression/methods/parts'
33
+ require 'regexp_parser/expression/methods/printing'
32
34
  require 'regexp_parser/expression/methods/strfregexp'
33
35
  require 'regexp_parser/expression/methods/tests'
34
36
  require 'regexp_parser/expression/methods/traverse'
@@ -6,7 +6,7 @@ class Regexp::Lexer
6
6
 
7
7
  OPENING_TOKENS = %i[
8
8
  capture passive lookahead nlookahead lookbehind nlookbehind
9
- atomic options options_switch named absence
9
+ atomic options options_switch named absence open
10
10
  ].freeze
11
11
 
12
12
  CLOSING_TOKENS = %i[close].freeze
@@ -89,24 +89,32 @@ class Regexp::Lexer
89
89
  :nesting, :set_nesting, :conditional_nesting, :shift
90
90
 
91
91
  def ascend(type, token)
92
+ return unless CLOSING_TOKENS.include?(token)
93
+
92
94
  case type
93
95
  when :group, :assertion
94
- self.nesting = nesting - 1 if CLOSING_TOKENS.include?(token)
96
+ self.nesting = nesting - 1
95
97
  when :set
96
- self.set_nesting = set_nesting - 1 if token == :close
98
+ self.set_nesting = set_nesting - 1
97
99
  when :conditional
98
- self.conditional_nesting = conditional_nesting - 1 if token == :close
100
+ self.conditional_nesting = conditional_nesting - 1
101
+ else
102
+ raise "unhandled nesting type #{type}"
99
103
  end
100
104
  end
101
105
 
102
106
  def descend(type, token)
107
+ return unless OPENING_TOKENS.include?(token)
108
+
103
109
  case type
104
110
  when :group, :assertion
105
- self.nesting = nesting + 1 if OPENING_TOKENS.include?(token)
111
+ self.nesting = nesting + 1
106
112
  when :set
107
- self.set_nesting = set_nesting + 1 if token == :open
113
+ self.set_nesting = set_nesting + 1
108
114
  when :conditional
109
- self.conditional_nesting = conditional_nesting + 1 if token == :open
115
+ self.conditional_nesting = conditional_nesting + 1
116
+ else
117
+ raise "unhandled nesting type #{type}"
110
118
  end
111
119
  end
112
120
 
@@ -232,7 +232,7 @@ class Regexp::Parser
232
232
  node << Backreference::NameRecursionLevel.new(token, active_opts)
233
233
  when :name_call
234
234
  node << Backreference::NameCall.new(token, active_opts)
235
- when :number, :number_ref
235
+ when :number, :number_ref # TODO: split in v3.0.0
236
236
  node << Backreference::Number.new(token, active_opts)
237
237
  when :number_recursion_ref
238
238
  node << Backreference::NumberRecursionLevel.new(token, active_opts).tap do |exp|
@@ -272,9 +272,9 @@ class Regexp::Parser
272
272
  nest_conditional(Conditional::Expression.new(token, active_opts))
273
273
  when :condition
274
274
  conditional_nesting.last.condition = Conditional::Condition.new(token, active_opts)
275
- conditional_nesting.last.add_sequence(active_opts)
275
+ conditional_nesting.last.add_sequence(active_opts, { ts: token.te })
276
276
  when :separator
277
- conditional_nesting.last.add_sequence(active_opts)
277
+ conditional_nesting.last.add_sequence(active_opts, { ts: token.te })
278
278
  self.node = conditional_nesting.last.branches.last
279
279
  when :close
280
280
  conditional_nesting.pop
@@ -322,6 +322,7 @@ class Regexp::Parser
322
322
 
323
323
  when :control
324
324
  if token.text =~ /\A(?:\\C-\\M|\\c\\M)/
325
+ # TODO: emit :meta_control_sequence token in v3.0.0
325
326
  node << EscapeSequence::MetaControl.new(token, active_opts)
326
327
  else
327
328
  node << EscapeSequence::Control.new(token, active_opts)
@@ -329,6 +330,7 @@ class Regexp::Parser
329
330
 
330
331
  when :meta_sequence
331
332
  if token.text =~ /\A\\M-\\[Cc]/
333
+ # TODO: emit :meta_control_sequence token in v3.0.0:
332
334
  node << EscapeSequence::MetaControl.new(token, active_opts)
333
335
  else
334
336
  node << EscapeSequence::Meta.new(token, active_opts)
@@ -349,11 +351,7 @@ class Regexp::Parser
349
351
  when :comment
350
352
  node << Comment.new(token, active_opts)
351
353
  when :whitespace
352
- if node.last.is_a?(WhiteSpace)
353
- node.last.merge(WhiteSpace.new(token, active_opts))
354
- else
355
- node << WhiteSpace.new(token, active_opts)
356
- end
354
+ node << WhiteSpace.new(token, active_opts)
357
355
  else
358
356
  raise UnknownTokenError.new('FreeSpace', token)
359
357
  end
@@ -381,96 +379,96 @@ class Regexp::Parser
381
379
  def sequence_operation(klass, token)
382
380
  unless node.instance_of?(klass)
383
381
  operator = klass.new(token, active_opts)
384
- sequence = operator.add_sequence(active_opts)
382
+ sequence = operator.add_sequence(active_opts, { ts: token.ts })
385
383
  sequence.expressions = node.expressions
386
384
  node.expressions = []
387
385
  nest(operator)
388
386
  end
389
- node.add_sequence(active_opts)
387
+ node.add_sequence(active_opts, { ts: token.te })
390
388
  end
391
389
 
392
390
  def posixclass(token)
393
391
  node << PosixClass.new(token, active_opts)
394
392
  end
395
393
 
396
- include Regexp::Expression::UnicodeProperty
397
- UPTokens = Regexp::Syntax::Token::UnicodeProperty
394
+ UP = Regexp::Expression::Property
395
+ UPTokens = Regexp::Syntax::Token::Property
398
396
 
399
397
  def property(token)
400
398
  case token.token
401
- when :alnum; node << Alnum.new(token, active_opts)
402
- when :alpha; node << Alpha.new(token, active_opts)
403
- when :ascii; node << Ascii.new(token, active_opts)
404
- when :blank; node << Blank.new(token, active_opts)
405
- when :cntrl; node << Cntrl.new(token, active_opts)
406
- when :digit; node << Digit.new(token, active_opts)
407
- when :graph; node << Graph.new(token, active_opts)
408
- when :lower; node << Lower.new(token, active_opts)
409
- when :print; node << Print.new(token, active_opts)
410
- when :punct; node << Punct.new(token, active_opts)
411
- when :space; node << Space.new(token, active_opts)
412
- when :upper; node << Upper.new(token, active_opts)
413
- when :word; node << Word.new(token, active_opts)
414
- when :xdigit; node << Xdigit.new(token, active_opts)
415
- when :xposixpunct; node << XPosixPunct.new(token, active_opts)
399
+ when :alnum; node << UP::Alnum.new(token, active_opts)
400
+ when :alpha; node << UP::Alpha.new(token, active_opts)
401
+ when :ascii; node << UP::Ascii.new(token, active_opts)
402
+ when :blank; node << UP::Blank.new(token, active_opts)
403
+ when :cntrl; node << UP::Cntrl.new(token, active_opts)
404
+ when :digit; node << UP::Digit.new(token, active_opts)
405
+ when :graph; node << UP::Graph.new(token, active_opts)
406
+ when :lower; node << UP::Lower.new(token, active_opts)
407
+ when :print; node << UP::Print.new(token, active_opts)
408
+ when :punct; node << UP::Punct.new(token, active_opts)
409
+ when :space; node << UP::Space.new(token, active_opts)
410
+ when :upper; node << UP::Upper.new(token, active_opts)
411
+ when :word; node << UP::Word.new(token, active_opts)
412
+ when :xdigit; node << UP::Xdigit.new(token, active_opts)
413
+ when :xposixpunct; node << UP::XPosixPunct.new(token, active_opts)
416
414
 
417
415
  # only in Oniguruma (old rubies)
418
- when :newline; node << Newline.new(token, active_opts)
419
-
420
- when :any; node << Any.new(token, active_opts)
421
- when :assigned; node << Assigned.new(token, active_opts)
422
-
423
- when :letter; node << Letter::Any.new(token, active_opts)
424
- when :cased_letter; node << Letter::Cased.new(token, active_opts)
425
- when :uppercase_letter; node << Letter::Uppercase.new(token, active_opts)
426
- when :lowercase_letter; node << Letter::Lowercase.new(token, active_opts)
427
- when :titlecase_letter; node << Letter::Titlecase.new(token, active_opts)
428
- when :modifier_letter; node << Letter::Modifier.new(token, active_opts)
429
- when :other_letter; node << Letter::Other.new(token, active_opts)
430
-
431
- when :mark; node << Mark::Any.new(token, active_opts)
432
- when :combining_mark; node << Mark::Combining.new(token, active_opts)
433
- when :nonspacing_mark; node << Mark::Nonspacing.new(token, active_opts)
434
- when :spacing_mark; node << Mark::Spacing.new(token, active_opts)
435
- when :enclosing_mark; node << Mark::Enclosing.new(token, active_opts)
436
-
437
- when :number; node << Number::Any.new(token, active_opts)
438
- when :decimal_number; node << Number::Decimal.new(token, active_opts)
439
- when :letter_number; node << Number::Letter.new(token, active_opts)
440
- when :other_number; node << Number::Other.new(token, active_opts)
441
-
442
- when :punctuation; node << Punctuation::Any.new(token, active_opts)
443
- when :connector_punctuation; node << Punctuation::Connector.new(token, active_opts)
444
- when :dash_punctuation; node << Punctuation::Dash.new(token, active_opts)
445
- when :open_punctuation; node << Punctuation::Open.new(token, active_opts)
446
- when :close_punctuation; node << Punctuation::Close.new(token, active_opts)
447
- when :initial_punctuation; node << Punctuation::Initial.new(token, active_opts)
448
- when :final_punctuation; node << Punctuation::Final.new(token, active_opts)
449
- when :other_punctuation; node << Punctuation::Other.new(token, active_opts)
450
-
451
- when :separator; node << Separator::Any.new(token, active_opts)
452
- when :space_separator; node << Separator::Space.new(token, active_opts)
453
- when :line_separator; node << Separator::Line.new(token, active_opts)
454
- when :paragraph_separator; node << Separator::Paragraph.new(token, active_opts)
455
-
456
- when :symbol; node << Symbol::Any.new(token, active_opts)
457
- when :math_symbol; node << Symbol::Math.new(token, active_opts)
458
- when :currency_symbol; node << Symbol::Currency.new(token, active_opts)
459
- when :modifier_symbol; node << Symbol::Modifier.new(token, active_opts)
460
- when :other_symbol; node << Symbol::Other.new(token, active_opts)
461
-
462
- when :other; node << Codepoint::Any.new(token, active_opts)
463
- when :control; node << Codepoint::Control.new(token, active_opts)
464
- when :format; node << Codepoint::Format.new(token, active_opts)
465
- when :surrogate; node << Codepoint::Surrogate.new(token, active_opts)
466
- when :private_use; node << Codepoint::PrivateUse.new(token, active_opts)
467
- when :unassigned; node << Codepoint::Unassigned.new(token, active_opts)
468
-
469
- when *UPTokens::Age; node << Age.new(token, active_opts)
470
- when *UPTokens::Derived; node << Derived.new(token, active_opts)
471
- when *UPTokens::Emoji; node << Emoji.new(token, active_opts)
472
- when *UPTokens::Script; node << Script.new(token, active_opts)
473
- when *UPTokens::UnicodeBlock; node << Block.new(token, active_opts)
416
+ when :newline; node << UP::Newline.new(token, active_opts)
417
+
418
+ when :any; node << UP::Any.new(token, active_opts)
419
+ when :assigned; node << UP::Assigned.new(token, active_opts)
420
+
421
+ when :letter; node << UP::Letter::Any.new(token, active_opts)
422
+ when :cased_letter; node << UP::Letter::Cased.new(token, active_opts)
423
+ when :uppercase_letter; node << UP::Letter::Uppercase.new(token, active_opts)
424
+ when :lowercase_letter; node << UP::Letter::Lowercase.new(token, active_opts)
425
+ when :titlecase_letter; node << UP::Letter::Titlecase.new(token, active_opts)
426
+ when :modifier_letter; node << UP::Letter::Modifier.new(token, active_opts)
427
+ when :other_letter; node << UP::Letter::Other.new(token, active_opts)
428
+
429
+ when :mark; node << UP::Mark::Any.new(token, active_opts)
430
+ when :combining_mark; node << UP::Mark::Combining.new(token, active_opts)
431
+ when :nonspacing_mark; node << UP::Mark::Nonspacing.new(token, active_opts)
432
+ when :spacing_mark; node << UP::Mark::Spacing.new(token, active_opts)
433
+ when :enclosing_mark; node << UP::Mark::Enclosing.new(token, active_opts)
434
+
435
+ when :number; node << UP::Number::Any.new(token, active_opts)
436
+ when :decimal_number; node << UP::Number::Decimal.new(token, active_opts)
437
+ when :letter_number; node << UP::Number::Letter.new(token, active_opts)
438
+ when :other_number; node << UP::Number::Other.new(token, active_opts)
439
+
440
+ when :punctuation; node << UP::Punctuation::Any.new(token, active_opts)
441
+ when :connector_punctuation; node << UP::Punctuation::Connector.new(token, active_opts)
442
+ when :dash_punctuation; node << UP::Punctuation::Dash.new(token, active_opts)
443
+ when :open_punctuation; node << UP::Punctuation::Open.new(token, active_opts)
444
+ when :close_punctuation; node << UP::Punctuation::Close.new(token, active_opts)
445
+ when :initial_punctuation; node << UP::Punctuation::Initial.new(token, active_opts)
446
+ when :final_punctuation; node << UP::Punctuation::Final.new(token, active_opts)
447
+ when :other_punctuation; node << UP::Punctuation::Other.new(token, active_opts)
448
+
449
+ when :separator; node << UP::Separator::Any.new(token, active_opts)
450
+ when :space_separator; node << UP::Separator::Space.new(token, active_opts)
451
+ when :line_separator; node << UP::Separator::Line.new(token, active_opts)
452
+ when :paragraph_separator; node << UP::Separator::Paragraph.new(token, active_opts)
453
+
454
+ when :symbol; node << UP::Symbol::Any.new(token, active_opts)
455
+ when :math_symbol; node << UP::Symbol::Math.new(token, active_opts)
456
+ when :currency_symbol; node << UP::Symbol::Currency.new(token, active_opts)
457
+ when :modifier_symbol; node << UP::Symbol::Modifier.new(token, active_opts)
458
+ when :other_symbol; node << UP::Symbol::Other.new(token, active_opts)
459
+
460
+ when :other; node << UP::Codepoint::Any.new(token, active_opts)
461
+ when :control; node << UP::Codepoint::Control.new(token, active_opts)
462
+ when :format; node << UP::Codepoint::Format.new(token, active_opts)
463
+ when :surrogate; node << UP::Codepoint::Surrogate.new(token, active_opts)
464
+ when :private_use; node << UP::Codepoint::PrivateUse.new(token, active_opts)
465
+ when :unassigned; node << UP::Codepoint::Unassigned.new(token, active_opts)
466
+
467
+ when *UPTokens::Age; node << UP::Age.new(token, active_opts)
468
+ when *UPTokens::Derived; node << UP::Derived.new(token, active_opts)
469
+ when *UPTokens::Emoji; node << UP::Emoji.new(token, active_opts)
470
+ when *UPTokens::Script; node << UP::Script.new(token, active_opts)
471
+ when *UPTokens::UnicodeBlock; node << UP::Block.new(token, active_opts)
474
472
 
475
473
  else
476
474
  raise UnknownTokenError.new('UnicodeProperty', token)
@@ -478,8 +476,7 @@ class Regexp::Parser
478
476
  end
479
477
 
480
478
  def quantifier(token)
481
- target_node = node.expressions.reverse.find { |exp| !exp.is_a?(FreeSpace) }
482
- target_node or raise ParserError, "No valid target found for '#{token.text}'"
479
+ target_node = node.extract_quantifier_target(token.text)
483
480
 
484
481
  # in case of chained quantifiers, wrap target in an implicit passive group
485
482
  # description of the problem: https://github.com/ammar/regexp_parser/issues/3
@@ -527,6 +524,8 @@ class Regexp::Parser
527
524
  end
528
525
 
529
526
  def open_set(token)
527
+ # TODO: this and Quantifier are the only cases where Expression#token
528
+ # does not match the scanner/lexer output. Fix in v3.0.0.
530
529
  token.token = :character
531
530
  nest(CharacterSet.new(token, active_opts))
532
531
  end
@@ -590,7 +589,7 @@ class Regexp::Parser
590
589
  # (in a second iteration because there might be forward references)
591
590
  referrers.each do |exp|
592
591
  exp.referenced_expression = targets[exp.reference] ||
593
- raise(ParserError, "Invalid reference: #{exp.reference}")
592
+ raise(ParserError, "Invalid reference #{exp.reference} at pos #{exp.ts}")
594
593
  end
595
594
  end
596
595
  end # module Regexp::Parser
@@ -0,0 +1,8 @@
1
+ class Regexp::Scanner
2
+ # Unexpected end of pattern
3
+ class PrematureEndError < ScannerError
4
+ def initialize(where = '')
5
+ super "Premature end of pattern at #{where}"
6
+ end
7
+ end
8
+ end
@@ -0,0 +1,6 @@
1
+ require 'regexp_parser/error'
2
+
3
+ class Regexp::Scanner
4
+ # General scanner error (catch all)
5
+ class ScannerError < Regexp::Parser::Error; end
6
+ end