regexp_parser 2.7.0 → 2.8.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +62 -3
  3. data/Gemfile +3 -3
  4. data/LICENSE +1 -1
  5. data/README.md +33 -30
  6. data/lib/regexp_parser/expression/base.rb +0 -7
  7. data/lib/regexp_parser/expression/classes/alternation.rb +1 -1
  8. data/lib/regexp_parser/expression/classes/backreference.rb +4 -6
  9. data/lib/regexp_parser/expression/classes/character_set/range.rb +2 -7
  10. data/lib/regexp_parser/expression/classes/character_set.rb +3 -4
  11. data/lib/regexp_parser/expression/classes/conditional.rb +2 -14
  12. data/lib/regexp_parser/expression/classes/escape_sequence.rb +3 -1
  13. data/lib/regexp_parser/expression/classes/free_space.rb +3 -1
  14. data/lib/regexp_parser/expression/classes/group.rb +0 -22
  15. data/lib/regexp_parser/expression/classes/posix_class.rb +5 -1
  16. data/lib/regexp_parser/expression/classes/unicode_property.rb +5 -2
  17. data/lib/regexp_parser/expression/methods/construct.rb +2 -4
  18. data/lib/regexp_parser/expression/methods/parts.rb +23 -0
  19. data/lib/regexp_parser/expression/methods/printing.rb +26 -0
  20. data/lib/regexp_parser/expression/methods/tests.rb +40 -3
  21. data/lib/regexp_parser/expression/methods/traverse.rb +33 -20
  22. data/lib/regexp_parser/expression/quantifier.rb +30 -17
  23. data/lib/regexp_parser/expression/sequence.rb +5 -9
  24. data/lib/regexp_parser/expression/sequence_operation.rb +4 -9
  25. data/lib/regexp_parser/expression/shared.rb +37 -24
  26. data/lib/regexp_parser/expression/subexpression.rb +20 -18
  27. data/lib/regexp_parser/expression.rb +2 -0
  28. data/lib/regexp_parser/lexer.rb +15 -7
  29. data/lib/regexp_parser/parser.rb +85 -86
  30. data/lib/regexp_parser/scanner/errors/premature_end_error.rb +8 -0
  31. data/lib/regexp_parser/scanner/errors/scanner_error.rb +6 -0
  32. data/lib/regexp_parser/scanner/errors/validation_error.rb +63 -0
  33. data/lib/regexp_parser/scanner/properties/long.csv +11 -0
  34. data/lib/regexp_parser/scanner/properties/short.csv +2 -0
  35. data/lib/regexp_parser/scanner/property.rl +1 -1
  36. data/lib/regexp_parser/scanner/scanner.rl +35 -129
  37. data/lib/regexp_parser/scanner.rb +1084 -1303
  38. data/lib/regexp_parser/syntax/token/backreference.rb +3 -0
  39. data/lib/regexp_parser/syntax/token/character_set.rb +3 -0
  40. data/lib/regexp_parser/syntax/token/escape.rb +3 -1
  41. data/lib/regexp_parser/syntax/token/meta.rb +9 -2
  42. data/lib/regexp_parser/syntax/token/unicode_property.rb +17 -1
  43. data/lib/regexp_parser/syntax/token/virtual.rb +11 -0
  44. data/lib/regexp_parser/version.rb +1 -1
  45. metadata +9 -3
@@ -8,14 +8,10 @@ module Regexp::Expression
8
8
 
9
9
  MODES = %i[greedy possessive reluctant]
10
10
 
11
- attr_reader :min, :max, :mode
12
-
13
11
  def initialize(*args)
14
12
  deprecated_old_init(*args) and return if args.count == 4 || args.count == 5
15
13
 
16
14
  init_from_token_and_options(*args)
17
- @mode = (token.to_s[/greedy|reluctant|possessive/] || :greedy).to_sym
18
- @min, @max = minmax
19
15
  # TODO: remove in v3.0.0, stop removing parts of #token (?)
20
16
  self.token = token.to_s.sub(/_(greedy|possessive|reluctant)/, '').to_sym
21
17
  end
@@ -39,9 +35,21 @@ module Regexp::Expression
39
35
  end
40
36
  alias :lazy? :reluctant?
41
37
 
38
+ def min
39
+ derived_data[:min]
40
+ end
41
+
42
+ def max
43
+ derived_data[:max]
44
+ end
45
+
46
+ def mode
47
+ derived_data[:mode]
48
+ end
49
+
42
50
  private
43
51
 
44
- def deprecated_old_init(token, text, min, max, mode = :greedy)
52
+ def deprecated_old_init(token, text, _min, _max, _mode = :greedy)
45
53
  warn "Calling `Expression::Base#quantify` or `#{self.class}.new` with 4+ arguments "\
46
54
  "is deprecated.\nIt will no longer be supported in regexp_parser v3.0.0.\n"\
47
55
  "Please pass a Regexp::Token instead, e.g. replace `token, text, min, max, mode` "\
@@ -51,20 +59,25 @@ module Regexp::Expression
51
59
  "This is consistent with how Expression::Base instances are created. "
52
60
  @token = token
53
61
  @text = text
54
- @min = min
55
- @max = max
56
- @mode = mode
57
62
  end
58
63
 
59
- def minmax
60
- case token
61
- when /zero_or_one/ then [0, 1]
62
- when /zero_or_more/ then [0, -1]
63
- when /one_or_more/ then [1, -1]
64
- when :interval
65
- int_min = text[/\{(\d*)/, 1]
66
- int_max = text[/,?(\d*)\}/, 1]
67
- [int_min.to_i, (int_max.empty? ? -1 : int_max.to_i)]
64
+ def derived_data
65
+ @derived_data ||= begin
66
+ min, max =
67
+ case text[0]
68
+ when '?'; [0, 1]
69
+ when '*'; [0, -1]
70
+ when '+'; [1, -1]
71
+ else
72
+ int_min = text[/\{(\d*)/, 1]
73
+ int_max = text[/,?(\d*)\}/, 1]
74
+ [int_min.to_i, (int_max.empty? ? -1 : int_max.to_i)]
75
+ end
76
+
77
+ mod = text[/.([?+])/, 1]
78
+ mode = (mod == '?' && :reluctant) || (mod == '+' && :possessive) || :greedy
79
+
80
+ { min: min, max: max, mode: mode }
68
81
  end
69
82
  end
70
83
  end
@@ -12,6 +12,7 @@ module Regexp::Expression
12
12
  level: exp.level,
13
13
  set_level: exp.set_level,
14
14
  conditional_level: params[:conditional_level] || exp.conditional_level,
15
+ ts: params[:ts],
15
16
  )
16
17
  sequence.options = active_opts
17
18
  exp.expressions << sequence
@@ -19,17 +20,12 @@ module Regexp::Expression
19
20
  end
20
21
  end
21
22
 
22
- def starts_at
23
- expressions.first.starts_at
23
+ def ts
24
+ (head = expressions.first) ? head.ts : @ts
24
25
  end
25
- alias :ts :starts_at
26
26
 
27
- def quantify(*args)
28
- target = expressions.reverse.find { |exp| !exp.is_a?(FreeSpace) }
29
- target or raise Regexp::Parser::Error,
30
- "No valid target found for '#{text}' quantifier"
31
-
32
- target.quantify(*args)
27
+ def quantify(token, *args)
28
+ extract_quantifier_target(token.text).quantify(token, *args)
33
29
  end
34
30
  end
35
31
  end
@@ -5,21 +5,16 @@ module Regexp::Expression
5
5
  alias :operands :expressions
6
6
  alias :operator :text
7
7
 
8
- def starts_at
9
- expressions.first.starts_at
8
+ def ts
9
+ (head = expressions.first) ? head.ts : @ts
10
10
  end
11
- alias :ts :starts_at
12
11
 
13
12
  def <<(exp)
14
13
  expressions.last << exp
15
14
  end
16
15
 
17
- def add_sequence(active_opts = {})
18
- self.class::OPERAND.add_to(self, {}, active_opts)
19
- end
20
-
21
- def parts
22
- intersperse(expressions, text.dup)
16
+ def add_sequence(active_opts = {}, params = { ts: 0 })
17
+ self.class::OPERAND.add_to(self, params, active_opts)
23
18
  end
24
19
  end
25
20
  end
@@ -8,7 +8,8 @@ module Regexp::Expression
8
8
 
9
9
  attr_accessor :type, :token, :text, :ts, :te,
10
10
  :level, :set_level, :conditional_level,
11
- :options
11
+ :options, :parent,
12
+ :custom_to_s_handling, :pre_quantifier_decorations
12
13
 
13
14
  attr_reader :nesting_level, :quantifier
14
15
  end
@@ -32,6 +33,10 @@ module Regexp::Expression
32
33
  self.text = orig.text.dup if orig.text
33
34
  self.options = orig.options.dup if orig.options
34
35
  self.quantifier = orig.quantifier.clone if orig.quantifier
36
+ self.parent = nil # updated by Subexpression#initialize_copy
37
+ if orig.pre_quantifier_decorations
38
+ self.pre_quantifier_decorations = orig.pre_quantifier_decorations.map(&:dup)
39
+ end
35
40
  super
36
41
  end
37
42
 
@@ -39,35 +44,51 @@ module Regexp::Expression
39
44
  ts
40
45
  end
41
46
 
47
+ def ends_at(include_quantifier = true)
48
+ ts + (include_quantifier ? full_length : base_length)
49
+ end
50
+
42
51
  def base_length
43
52
  to_s(:base).length
44
53
  end
45
54
 
46
55
  def full_length
47
- to_s.length
48
- end
49
-
56
+ to_s(:original).length
57
+ end
58
+
59
+ # #to_s reproduces the original source, as an unparser would.
60
+ #
61
+ # It takes an optional format argument.
62
+ #
63
+ # Example:
64
+ #
65
+ # lit = Regexp::Parser.parse(/a +/x)[0]
66
+ #
67
+ # lit.to_s # => 'a+' # default; with quantifier
68
+ # lit.to_s(:full) # => 'a+' # default; with quantifier
69
+ # lit.to_s(:base) # => 'a' # without quantifier
70
+ # lit.to_s(:original) # => 'a +' # with quantifier AND intermittent decorations
71
+ #
50
72
  def to_s(format = :full)
51
- "#{parts.join}#{quantifier_affix(format)}"
73
+ base = parts.each_with_object(''.dup) do |part, buff|
74
+ if part.instance_of?(String)
75
+ buff << part
76
+ elsif !part.custom_to_s_handling
77
+ buff << part.to_s(:original)
78
+ end
79
+ end
80
+ "#{base}#{pre_quantifier_decoration(format)}#{quantifier_affix(format)}"
52
81
  end
53
82
  alias :to_str :to_s
54
83
 
55
- def parts
56
- [text.dup]
84
+ def pre_quantifier_decoration(expression_format = :original)
85
+ pre_quantifier_decorations.to_a.join if expression_format == :original
57
86
  end
58
87
 
59
- def quantifier_affix(expression_format)
88
+ def quantifier_affix(expression_format = :full)
60
89
  quantifier.to_s if quantified? && expression_format != :base
61
90
  end
62
91
 
63
- def quantified?
64
- !quantifier.nil?
65
- end
66
-
67
- def optional?
68
- quantified? && quantifier.min == 0
69
- end
70
-
71
92
  def offset
72
93
  [starts_at, full_length]
73
94
  end
@@ -76,14 +97,6 @@ module Regexp::Expression
76
97
  '@%d+%d' % offset
77
98
  end
78
99
 
79
- def terminal?
80
- true # overridden to be false in Expression::Subexpression
81
- end
82
-
83
- def referential?
84
- false # overridden to be true e.g. in Expression::Backreference::Base
85
- end
86
-
87
100
  def nesting_level=(lvl)
88
101
  @nesting_level = lvl
89
102
  quantifier && quantifier.nesting_level = lvl
@@ -11,16 +11,15 @@ module Regexp::Expression
11
11
 
12
12
  # Override base method to clone the expressions as well.
13
13
  def initialize_copy(orig)
14
- self.expressions = orig.expressions.map(&:clone)
14
+ self.expressions = orig.expressions.map do |exp|
15
+ exp.clone.tap { |copy| copy.parent = self }
16
+ end
15
17
  super
16
18
  end
17
19
 
18
20
  def <<(exp)
19
- if exp.is_a?(WhiteSpace) && last && last.is_a?(WhiteSpace)
20
- last.merge(exp)
21
- else
22
- expressions << exp
23
- end
21
+ exp.parent = self
22
+ expressions << exp
24
23
  end
25
24
 
26
25
  %w[[] at each empty? fetch index join last length values_at].each do |method|
@@ -38,11 +37,7 @@ module Regexp::Expression
38
37
  end
39
38
 
40
39
  def te
41
- ts + to_s.length
42
- end
43
-
44
- def parts
45
- expressions
40
+ ts + base_length
46
41
  end
47
42
 
48
43
  def to_h
@@ -52,14 +47,21 @@ module Regexp::Expression
52
47
  )
53
48
  end
54
49
 
55
- def terminal?
56
- false
57
- end
58
-
59
- private
50
+ def extract_quantifier_target(quantifier_description)
51
+ pre_quantifier_decorations = []
52
+ target = expressions.reverse.find do |exp|
53
+ if exp.decorative?
54
+ exp.custom_to_s_handling = true
55
+ pre_quantifier_decorations << exp.text
56
+ next
57
+ end
58
+ exp
59
+ end
60
+ target or raise Regexp::Parser::ParserError,
61
+ "No valid target found for '#{quantifier_description}' quantifier"
60
62
 
61
- def intersperse(expressions, separator)
62
- expressions.flat_map { |exp| [exp, separator] }.slice(0...-1)
63
+ target.pre_quantifier_decorations = pre_quantifier_decorations
64
+ target
63
65
  end
64
66
  end
65
67
  end
@@ -29,6 +29,8 @@ require 'regexp_parser/expression/methods/human_name'
29
29
  require 'regexp_parser/expression/methods/match'
30
30
  require 'regexp_parser/expression/methods/match_length'
31
31
  require 'regexp_parser/expression/methods/options'
32
+ require 'regexp_parser/expression/methods/parts'
33
+ require 'regexp_parser/expression/methods/printing'
32
34
  require 'regexp_parser/expression/methods/strfregexp'
33
35
  require 'regexp_parser/expression/methods/tests'
34
36
  require 'regexp_parser/expression/methods/traverse'
@@ -6,7 +6,7 @@ class Regexp::Lexer
6
6
 
7
7
  OPENING_TOKENS = %i[
8
8
  capture passive lookahead nlookahead lookbehind nlookbehind
9
- atomic options options_switch named absence
9
+ atomic options options_switch named absence open
10
10
  ].freeze
11
11
 
12
12
  CLOSING_TOKENS = %i[close].freeze
@@ -89,24 +89,32 @@ class Regexp::Lexer
89
89
  :nesting, :set_nesting, :conditional_nesting, :shift
90
90
 
91
91
  def ascend(type, token)
92
+ return unless CLOSING_TOKENS.include?(token)
93
+
92
94
  case type
93
95
  when :group, :assertion
94
- self.nesting = nesting - 1 if CLOSING_TOKENS.include?(token)
96
+ self.nesting = nesting - 1
95
97
  when :set
96
- self.set_nesting = set_nesting - 1 if token == :close
98
+ self.set_nesting = set_nesting - 1
97
99
  when :conditional
98
- self.conditional_nesting = conditional_nesting - 1 if token == :close
100
+ self.conditional_nesting = conditional_nesting - 1
101
+ else
102
+ raise "unhandled nesting type #{type}"
99
103
  end
100
104
  end
101
105
 
102
106
  def descend(type, token)
107
+ return unless OPENING_TOKENS.include?(token)
108
+
103
109
  case type
104
110
  when :group, :assertion
105
- self.nesting = nesting + 1 if OPENING_TOKENS.include?(token)
111
+ self.nesting = nesting + 1
106
112
  when :set
107
- self.set_nesting = set_nesting + 1 if token == :open
113
+ self.set_nesting = set_nesting + 1
108
114
  when :conditional
109
- self.conditional_nesting = conditional_nesting + 1 if token == :open
115
+ self.conditional_nesting = conditional_nesting + 1
116
+ else
117
+ raise "unhandled nesting type #{type}"
110
118
  end
111
119
  end
112
120
 
@@ -232,7 +232,7 @@ class Regexp::Parser
232
232
  node << Backreference::NameRecursionLevel.new(token, active_opts)
233
233
  when :name_call
234
234
  node << Backreference::NameCall.new(token, active_opts)
235
- when :number, :number_ref
235
+ when :number, :number_ref # TODO: split in v3.0.0
236
236
  node << Backreference::Number.new(token, active_opts)
237
237
  when :number_recursion_ref
238
238
  node << Backreference::NumberRecursionLevel.new(token, active_opts).tap do |exp|
@@ -272,9 +272,9 @@ class Regexp::Parser
272
272
  nest_conditional(Conditional::Expression.new(token, active_opts))
273
273
  when :condition
274
274
  conditional_nesting.last.condition = Conditional::Condition.new(token, active_opts)
275
- conditional_nesting.last.add_sequence(active_opts)
275
+ conditional_nesting.last.add_sequence(active_opts, { ts: token.te })
276
276
  when :separator
277
- conditional_nesting.last.add_sequence(active_opts)
277
+ conditional_nesting.last.add_sequence(active_opts, { ts: token.te })
278
278
  self.node = conditional_nesting.last.branches.last
279
279
  when :close
280
280
  conditional_nesting.pop
@@ -322,6 +322,7 @@ class Regexp::Parser
322
322
 
323
323
  when :control
324
324
  if token.text =~ /\A(?:\\C-\\M|\\c\\M)/
325
+ # TODO: emit :meta_control_sequence token in v3.0.0
325
326
  node << EscapeSequence::MetaControl.new(token, active_opts)
326
327
  else
327
328
  node << EscapeSequence::Control.new(token, active_opts)
@@ -329,6 +330,7 @@ class Regexp::Parser
329
330
 
330
331
  when :meta_sequence
331
332
  if token.text =~ /\A\\M-\\[Cc]/
333
+ # TODO: emit :meta_control_sequence token in v3.0.0:
332
334
  node << EscapeSequence::MetaControl.new(token, active_opts)
333
335
  else
334
336
  node << EscapeSequence::Meta.new(token, active_opts)
@@ -349,11 +351,7 @@ class Regexp::Parser
349
351
  when :comment
350
352
  node << Comment.new(token, active_opts)
351
353
  when :whitespace
352
- if node.last.is_a?(WhiteSpace)
353
- node.last.merge(WhiteSpace.new(token, active_opts))
354
- else
355
- node << WhiteSpace.new(token, active_opts)
356
- end
354
+ node << WhiteSpace.new(token, active_opts)
357
355
  else
358
356
  raise UnknownTokenError.new('FreeSpace', token)
359
357
  end
@@ -381,96 +379,96 @@ class Regexp::Parser
381
379
  def sequence_operation(klass, token)
382
380
  unless node.instance_of?(klass)
383
381
  operator = klass.new(token, active_opts)
384
- sequence = operator.add_sequence(active_opts)
382
+ sequence = operator.add_sequence(active_opts, { ts: token.ts })
385
383
  sequence.expressions = node.expressions
386
384
  node.expressions = []
387
385
  nest(operator)
388
386
  end
389
- node.add_sequence(active_opts)
387
+ node.add_sequence(active_opts, { ts: token.te })
390
388
  end
391
389
 
392
390
  def posixclass(token)
393
391
  node << PosixClass.new(token, active_opts)
394
392
  end
395
393
 
396
- include Regexp::Expression::UnicodeProperty
397
- UPTokens = Regexp::Syntax::Token::UnicodeProperty
394
+ UP = Regexp::Expression::Property
395
+ UPTokens = Regexp::Syntax::Token::Property
398
396
 
399
397
  def property(token)
400
398
  case token.token
401
- when :alnum; node << Alnum.new(token, active_opts)
402
- when :alpha; node << Alpha.new(token, active_opts)
403
- when :ascii; node << Ascii.new(token, active_opts)
404
- when :blank; node << Blank.new(token, active_opts)
405
- when :cntrl; node << Cntrl.new(token, active_opts)
406
- when :digit; node << Digit.new(token, active_opts)
407
- when :graph; node << Graph.new(token, active_opts)
408
- when :lower; node << Lower.new(token, active_opts)
409
- when :print; node << Print.new(token, active_opts)
410
- when :punct; node << Punct.new(token, active_opts)
411
- when :space; node << Space.new(token, active_opts)
412
- when :upper; node << Upper.new(token, active_opts)
413
- when :word; node << Word.new(token, active_opts)
414
- when :xdigit; node << Xdigit.new(token, active_opts)
415
- when :xposixpunct; node << XPosixPunct.new(token, active_opts)
399
+ when :alnum; node << UP::Alnum.new(token, active_opts)
400
+ when :alpha; node << UP::Alpha.new(token, active_opts)
401
+ when :ascii; node << UP::Ascii.new(token, active_opts)
402
+ when :blank; node << UP::Blank.new(token, active_opts)
403
+ when :cntrl; node << UP::Cntrl.new(token, active_opts)
404
+ when :digit; node << UP::Digit.new(token, active_opts)
405
+ when :graph; node << UP::Graph.new(token, active_opts)
406
+ when :lower; node << UP::Lower.new(token, active_opts)
407
+ when :print; node << UP::Print.new(token, active_opts)
408
+ when :punct; node << UP::Punct.new(token, active_opts)
409
+ when :space; node << UP::Space.new(token, active_opts)
410
+ when :upper; node << UP::Upper.new(token, active_opts)
411
+ when :word; node << UP::Word.new(token, active_opts)
412
+ when :xdigit; node << UP::Xdigit.new(token, active_opts)
413
+ when :xposixpunct; node << UP::XPosixPunct.new(token, active_opts)
416
414
 
417
415
  # only in Oniguruma (old rubies)
418
- when :newline; node << Newline.new(token, active_opts)
419
-
420
- when :any; node << Any.new(token, active_opts)
421
- when :assigned; node << Assigned.new(token, active_opts)
422
-
423
- when :letter; node << Letter::Any.new(token, active_opts)
424
- when :cased_letter; node << Letter::Cased.new(token, active_opts)
425
- when :uppercase_letter; node << Letter::Uppercase.new(token, active_opts)
426
- when :lowercase_letter; node << Letter::Lowercase.new(token, active_opts)
427
- when :titlecase_letter; node << Letter::Titlecase.new(token, active_opts)
428
- when :modifier_letter; node << Letter::Modifier.new(token, active_opts)
429
- when :other_letter; node << Letter::Other.new(token, active_opts)
430
-
431
- when :mark; node << Mark::Any.new(token, active_opts)
432
- when :combining_mark; node << Mark::Combining.new(token, active_opts)
433
- when :nonspacing_mark; node << Mark::Nonspacing.new(token, active_opts)
434
- when :spacing_mark; node << Mark::Spacing.new(token, active_opts)
435
- when :enclosing_mark; node << Mark::Enclosing.new(token, active_opts)
436
-
437
- when :number; node << Number::Any.new(token, active_opts)
438
- when :decimal_number; node << Number::Decimal.new(token, active_opts)
439
- when :letter_number; node << Number::Letter.new(token, active_opts)
440
- when :other_number; node << Number::Other.new(token, active_opts)
441
-
442
- when :punctuation; node << Punctuation::Any.new(token, active_opts)
443
- when :connector_punctuation; node << Punctuation::Connector.new(token, active_opts)
444
- when :dash_punctuation; node << Punctuation::Dash.new(token, active_opts)
445
- when :open_punctuation; node << Punctuation::Open.new(token, active_opts)
446
- when :close_punctuation; node << Punctuation::Close.new(token, active_opts)
447
- when :initial_punctuation; node << Punctuation::Initial.new(token, active_opts)
448
- when :final_punctuation; node << Punctuation::Final.new(token, active_opts)
449
- when :other_punctuation; node << Punctuation::Other.new(token, active_opts)
450
-
451
- when :separator; node << Separator::Any.new(token, active_opts)
452
- when :space_separator; node << Separator::Space.new(token, active_opts)
453
- when :line_separator; node << Separator::Line.new(token, active_opts)
454
- when :paragraph_separator; node << Separator::Paragraph.new(token, active_opts)
455
-
456
- when :symbol; node << Symbol::Any.new(token, active_opts)
457
- when :math_symbol; node << Symbol::Math.new(token, active_opts)
458
- when :currency_symbol; node << Symbol::Currency.new(token, active_opts)
459
- when :modifier_symbol; node << Symbol::Modifier.new(token, active_opts)
460
- when :other_symbol; node << Symbol::Other.new(token, active_opts)
461
-
462
- when :other; node << Codepoint::Any.new(token, active_opts)
463
- when :control; node << Codepoint::Control.new(token, active_opts)
464
- when :format; node << Codepoint::Format.new(token, active_opts)
465
- when :surrogate; node << Codepoint::Surrogate.new(token, active_opts)
466
- when :private_use; node << Codepoint::PrivateUse.new(token, active_opts)
467
- when :unassigned; node << Codepoint::Unassigned.new(token, active_opts)
468
-
469
- when *UPTokens::Age; node << Age.new(token, active_opts)
470
- when *UPTokens::Derived; node << Derived.new(token, active_opts)
471
- when *UPTokens::Emoji; node << Emoji.new(token, active_opts)
472
- when *UPTokens::Script; node << Script.new(token, active_opts)
473
- when *UPTokens::UnicodeBlock; node << Block.new(token, active_opts)
416
+ when :newline; node << UP::Newline.new(token, active_opts)
417
+
418
+ when :any; node << UP::Any.new(token, active_opts)
419
+ when :assigned; node << UP::Assigned.new(token, active_opts)
420
+
421
+ when :letter; node << UP::Letter::Any.new(token, active_opts)
422
+ when :cased_letter; node << UP::Letter::Cased.new(token, active_opts)
423
+ when :uppercase_letter; node << UP::Letter::Uppercase.new(token, active_opts)
424
+ when :lowercase_letter; node << UP::Letter::Lowercase.new(token, active_opts)
425
+ when :titlecase_letter; node << UP::Letter::Titlecase.new(token, active_opts)
426
+ when :modifier_letter; node << UP::Letter::Modifier.new(token, active_opts)
427
+ when :other_letter; node << UP::Letter::Other.new(token, active_opts)
428
+
429
+ when :mark; node << UP::Mark::Any.new(token, active_opts)
430
+ when :combining_mark; node << UP::Mark::Combining.new(token, active_opts)
431
+ when :nonspacing_mark; node << UP::Mark::Nonspacing.new(token, active_opts)
432
+ when :spacing_mark; node << UP::Mark::Spacing.new(token, active_opts)
433
+ when :enclosing_mark; node << UP::Mark::Enclosing.new(token, active_opts)
434
+
435
+ when :number; node << UP::Number::Any.new(token, active_opts)
436
+ when :decimal_number; node << UP::Number::Decimal.new(token, active_opts)
437
+ when :letter_number; node << UP::Number::Letter.new(token, active_opts)
438
+ when :other_number; node << UP::Number::Other.new(token, active_opts)
439
+
440
+ when :punctuation; node << UP::Punctuation::Any.new(token, active_opts)
441
+ when :connector_punctuation; node << UP::Punctuation::Connector.new(token, active_opts)
442
+ when :dash_punctuation; node << UP::Punctuation::Dash.new(token, active_opts)
443
+ when :open_punctuation; node << UP::Punctuation::Open.new(token, active_opts)
444
+ when :close_punctuation; node << UP::Punctuation::Close.new(token, active_opts)
445
+ when :initial_punctuation; node << UP::Punctuation::Initial.new(token, active_opts)
446
+ when :final_punctuation; node << UP::Punctuation::Final.new(token, active_opts)
447
+ when :other_punctuation; node << UP::Punctuation::Other.new(token, active_opts)
448
+
449
+ when :separator; node << UP::Separator::Any.new(token, active_opts)
450
+ when :space_separator; node << UP::Separator::Space.new(token, active_opts)
451
+ when :line_separator; node << UP::Separator::Line.new(token, active_opts)
452
+ when :paragraph_separator; node << UP::Separator::Paragraph.new(token, active_opts)
453
+
454
+ when :symbol; node << UP::Symbol::Any.new(token, active_opts)
455
+ when :math_symbol; node << UP::Symbol::Math.new(token, active_opts)
456
+ when :currency_symbol; node << UP::Symbol::Currency.new(token, active_opts)
457
+ when :modifier_symbol; node << UP::Symbol::Modifier.new(token, active_opts)
458
+ when :other_symbol; node << UP::Symbol::Other.new(token, active_opts)
459
+
460
+ when :other; node << UP::Codepoint::Any.new(token, active_opts)
461
+ when :control; node << UP::Codepoint::Control.new(token, active_opts)
462
+ when :format; node << UP::Codepoint::Format.new(token, active_opts)
463
+ when :surrogate; node << UP::Codepoint::Surrogate.new(token, active_opts)
464
+ when :private_use; node << UP::Codepoint::PrivateUse.new(token, active_opts)
465
+ when :unassigned; node << UP::Codepoint::Unassigned.new(token, active_opts)
466
+
467
+ when *UPTokens::Age; node << UP::Age.new(token, active_opts)
468
+ when *UPTokens::Derived; node << UP::Derived.new(token, active_opts)
469
+ when *UPTokens::Emoji; node << UP::Emoji.new(token, active_opts)
470
+ when *UPTokens::Script; node << UP::Script.new(token, active_opts)
471
+ when *UPTokens::UnicodeBlock; node << UP::Block.new(token, active_opts)
474
472
 
475
473
  else
476
474
  raise UnknownTokenError.new('UnicodeProperty', token)
@@ -478,8 +476,7 @@ class Regexp::Parser
478
476
  end
479
477
 
480
478
  def quantifier(token)
481
- target_node = node.expressions.reverse.find { |exp| !exp.is_a?(FreeSpace) }
482
- target_node or raise ParserError, "No valid target found for '#{token.text}'"
479
+ target_node = node.extract_quantifier_target(token.text)
483
480
 
484
481
  # in case of chained quantifiers, wrap target in an implicit passive group
485
482
  # description of the problem: https://github.com/ammar/regexp_parser/issues/3
@@ -527,6 +524,8 @@ class Regexp::Parser
527
524
  end
528
525
 
529
526
  def open_set(token)
527
+ # TODO: this and Quantifier are the only cases where Expression#token
528
+ # does not match the scanner/lexer output. Fix in v3.0.0.
530
529
  token.token = :character
531
530
  nest(CharacterSet.new(token, active_opts))
532
531
  end
@@ -590,7 +589,7 @@ class Regexp::Parser
590
589
  # (in a second iteration because there might be forward references)
591
590
  referrers.each do |exp|
592
591
  exp.referenced_expression = targets[exp.reference] ||
593
- raise(ParserError, "Invalid reference: #{exp.reference}")
592
+ raise(ParserError, "Invalid reference #{exp.reference} at pos #{exp.ts}")
594
593
  end
595
594
  end
596
595
  end # module Regexp::Parser
@@ -0,0 +1,8 @@
1
+ class Regexp::Scanner
2
+ # Unexpected end of pattern
3
+ class PrematureEndError < ScannerError
4
+ def initialize(where = '')
5
+ super "Premature end of pattern at #{where}"
6
+ end
7
+ end
8
+ end
@@ -0,0 +1,6 @@
1
+ require 'regexp_parser/error'
2
+
3
+ class Regexp::Scanner
4
+ # General scanner error (catch all)
5
+ class ScannerError < Regexp::Parser::Error; end
6
+ end