regexp_parser 0.5.0 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (81) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +242 -0
  3. data/Gemfile +1 -0
  4. data/README.md +21 -17
  5. data/Rakefile +31 -0
  6. data/lib/regexp_parser/expression.rb +11 -9
  7. data/lib/regexp_parser/expression/classes/alternation.rb +5 -28
  8. data/lib/regexp_parser/expression/classes/backref.rb +21 -16
  9. data/lib/regexp_parser/expression/classes/escape.rb +81 -10
  10. data/lib/regexp_parser/expression/classes/group.rb +20 -20
  11. data/lib/regexp_parser/expression/classes/{character_class.rb → posix_class.rb} +2 -2
  12. data/lib/regexp_parser/expression/classes/property.rb +6 -0
  13. data/lib/regexp_parser/expression/classes/set.rb +10 -93
  14. data/lib/regexp_parser/expression/classes/set/intersection.rb +9 -0
  15. data/lib/regexp_parser/expression/classes/set/range.rb +23 -0
  16. data/lib/regexp_parser/expression/methods/strfregexp.rb +6 -4
  17. data/lib/regexp_parser/expression/methods/tests.rb +4 -14
  18. data/lib/regexp_parser/expression/methods/traverse.rb +1 -1
  19. data/lib/regexp_parser/expression/quantifier.rb +3 -4
  20. data/lib/regexp_parser/expression/sequence_operation.rb +34 -0
  21. data/lib/regexp_parser/expression/subexpression.rb +6 -10
  22. data/lib/regexp_parser/lexer.rb +13 -17
  23. data/lib/regexp_parser/parser.rb +170 -116
  24. data/lib/regexp_parser/scanner.rb +952 -2431
  25. data/lib/regexp_parser/scanner/char_type.rl +31 -0
  26. data/lib/regexp_parser/scanner/properties/long.yml +561 -0
  27. data/lib/regexp_parser/scanner/properties/short.yml +225 -0
  28. data/lib/regexp_parser/scanner/property.rl +7 -806
  29. data/lib/regexp_parser/scanner/scanner.rl +112 -154
  30. data/lib/regexp_parser/syntax/base.rb +4 -4
  31. data/lib/regexp_parser/syntax/tokens.rb +1 -0
  32. data/lib/regexp_parser/syntax/tokens/backref.rb +2 -2
  33. data/lib/regexp_parser/syntax/tokens/character_set.rb +3 -38
  34. data/lib/regexp_parser/syntax/tokens/escape.rb +2 -3
  35. data/lib/regexp_parser/syntax/tokens/group.rb +5 -4
  36. data/lib/regexp_parser/syntax/tokens/{character_class.rb → posix_class.rb} +5 -5
  37. data/lib/regexp_parser/syntax/tokens/unicode_property.rb +519 -266
  38. data/lib/regexp_parser/syntax/versions/1.8.6.rb +2 -4
  39. data/lib/regexp_parser/syntax/versions/1.9.1.rb +4 -10
  40. data/lib/regexp_parser/syntax/versions/2.0.0.rb +0 -2
  41. data/lib/regexp_parser/syntax/versions/2.4.1.rb +1 -1
  42. data/lib/regexp_parser/version.rb +1 -1
  43. data/regexp_parser.gemspec +2 -1
  44. data/test/expression/test_base.rb +2 -1
  45. data/test/expression/test_clone.rb +0 -57
  46. data/test/expression/test_set.rb +31 -8
  47. data/test/expression/test_strfregexp.rb +13 -4
  48. data/test/expression/test_subexpression.rb +25 -0
  49. data/test/expression/test_traverse.rb +25 -25
  50. data/test/helpers.rb +1 -0
  51. data/test/lexer/test_all.rb +1 -1
  52. data/test/lexer/test_conditionals.rb +9 -7
  53. data/test/lexer/test_nesting.rb +39 -21
  54. data/test/lexer/test_refcalls.rb +4 -4
  55. data/test/parser/set/test_intersections.rb +127 -0
  56. data/test/parser/set/test_ranges.rb +111 -0
  57. data/test/parser/test_all.rb +4 -1
  58. data/test/parser/test_escapes.rb +41 -9
  59. data/test/parser/test_groups.rb +22 -3
  60. data/test/parser/test_posix_classes.rb +27 -0
  61. data/test/parser/test_properties.rb +17 -290
  62. data/test/parser/test_refcalls.rb +66 -26
  63. data/test/parser/test_sets.rb +132 -129
  64. data/test/scanner/test_all.rb +1 -7
  65. data/test/scanner/test_conditionals.rb +16 -16
  66. data/test/scanner/test_errors.rb +0 -30
  67. data/test/scanner/test_escapes.rb +1 -2
  68. data/test/scanner/test_free_space.rb +28 -28
  69. data/test/scanner/test_groups.rb +35 -35
  70. data/test/scanner/test_meta.rb +1 -1
  71. data/test/scanner/test_properties.rb +87 -114
  72. data/test/scanner/test_refcalls.rb +18 -18
  73. data/test/scanner/test_scripts.rb +19 -351
  74. data/test/scanner/test_sets.rb +87 -60
  75. data/test/scanner/test_unicode_blocks.rb +4 -105
  76. data/test/support/warning_extractor.rb +1 -1
  77. data/test/syntax/test_syntax.rb +7 -0
  78. data/test/syntax/versions/test_1.8.rb +2 -4
  79. metadata +17 -7
  80. data/ChangeLog +0 -325
  81. data/test/scanner/test_emojis.rb +0 -31
@@ -7,22 +7,12 @@ module Regexp::Expression
7
7
  # # is it a :group expression
8
8
  # exp.type? :group
9
9
  #
10
- # # is it a :set, :subset, or :meta
11
- # exp.type? [:set, :subset, :meta]
10
+ # # is it a :set, or :meta
11
+ # exp.type? [:set, :meta]
12
12
  #
13
13
  def type?(test_type)
14
- case test_type
15
- when Array
16
- if test_type.include?(:*)
17
- return (test_type.include?(type) or test_type.include?(:*))
18
- else
19
- return test_type.include?(type)
20
- end
21
- when Symbol
22
- return (type == test_type or test_type == :*)
23
- else
24
- raise "Array or Symbol expected, #{test_type.class.name} given"
25
- end
14
+ test_types = Array(test_type).map(&:to_sym)
15
+ test_types.include?(:*) || test_types.include?(type)
26
16
  end
27
17
 
28
18
  # Test if this expression has the given test_token, and optionally a given
@@ -45,7 +45,7 @@ module Regexp::Expression
45
45
  # Returns a new array with the results of calling the given block once
46
46
  # for every expression. If a block is not given, returns an array with
47
47
  # each expression and its level index as an array.
48
- def map(include_self = false, &block)
48
+ def flat_map(include_self = false, &block)
49
49
  result = []
50
50
 
51
51
  each_expression(include_self) do |exp, index|
@@ -11,10 +11,9 @@ module Regexp::Expression
11
11
  @max = max
12
12
  end
13
13
 
14
- def clone
15
- copy = dup
16
- copy.instance_variable_set(:@text, text.dup)
17
- copy
14
+ def initialize_clone(other)
15
+ other.instance_variable_set(:@text, text.dup)
16
+ super
18
17
  end
19
18
 
20
19
  def to_s
@@ -0,0 +1,34 @@
1
+ module Regexp::Expression
2
+ # abstract class
3
+ class SequenceOperation < Regexp::Expression::Subexpression
4
+ alias :sequences :expressions
5
+ alias :operands :expressions
6
+ alias :operator :text
7
+
8
+ def starts_at
9
+ expressions.first.starts_at
10
+ end
11
+ alias :ts :starts_at
12
+
13
+ def <<(exp)
14
+ expressions.last << exp
15
+ end
16
+
17
+ def add_sequence
18
+ exp = self.class::OPERAND.new(level, set_level, conditional_level)
19
+ exp.nesting_level = nesting_level + 1
20
+ expressions << exp
21
+ exp
22
+ end
23
+
24
+ def quantify(token, text, min = nil, max = nil, mode = :greedy)
25
+ sequences.last.last.quantify(token, text, min, max, mode)
26
+ sequences.last.last.quantify(token, text, min, max, mode)
27
+ end
28
+
29
+ def to_s(format = :full)
30
+ sequences.map { |e| e.to_s(format) }.join(text)
31
+ sequences.map { |e| e.to_s(format) }.join(text)
32
+ end
33
+ end
34
+ end
@@ -10,26 +10,22 @@ module Regexp::Expression
10
10
  end
11
11
 
12
12
  # Override base method to clone the expressions as well.
13
- def clone
14
- copy = super
15
- copy.expressions = expressions.map(&:clone)
16
- copy
13
+ def initialize_clone(other)
14
+ other.expressions = expressions.map(&:clone)
15
+ super
17
16
  end
18
17
 
19
18
  def <<(exp)
20
19
  if exp.is_a?(WhiteSpace) && last && last.is_a?(WhiteSpace)
21
20
  last.merge(exp)
22
21
  else
22
+ exp.nesting_level = nesting_level + 1
23
23
  expressions << exp
24
24
  end
25
25
  end
26
26
 
27
- def insert(exp)
28
- expressions.insert(0, exp)
29
- end
30
-
31
- %w[[] all? any? at count each each_with_index empty?
32
- fetch find first index join last length values_at].each do |m|
27
+ %w[[] all? any? at collect count each each_with_index empty?
28
+ fetch find first index join last length map values_at].each do |m|
33
29
  define_method(m) { |*args, &block| expressions.send(m, *args, &block) }
34
30
  end
35
31
 
@@ -4,9 +4,10 @@
4
4
  # given syntax flavor.
5
5
  class Regexp::Lexer
6
6
 
7
- OPENING_TOKENS = [:capture, :options, :passive, :atomic, :named, :absence,
8
- :lookahead, :nlookahead, :lookbehind, :nlookbehind
9
- ].freeze
7
+ OPENING_TOKENS = [
8
+ :capture, :passive, :lookahead, :nlookahead, :lookbehind, :nlookbehind,
9
+ :atomic, :options, :options_switch, :named, :absence
10
+ ].freeze
10
11
 
11
12
  CLOSING_TOKENS = [:close].freeze
12
13
 
@@ -36,6 +37,7 @@ class Regexp::Lexer
36
37
  nesting, set_nesting, conditional_nesting)
37
38
 
38
39
  current = merge_literal(current) if type == :literal and
40
+ set_nesting == 0 and
39
41
  last and last.type == :literal
40
42
 
41
43
  current = merge_condition(current) if type == :conditional and
@@ -66,29 +68,23 @@ class Regexp::Lexer
66
68
  attr_accessor :tokens, :nesting, :set_nesting, :conditional_nesting
67
69
 
68
70
  def ascend(type, token)
69
- if type == :group or type == :assertion
71
+ case type
72
+ when :group, :assertion
70
73
  self.nesting = nesting - 1 if CLOSING_TOKENS.include?(token)
71
- end
72
-
73
- if type == :set or type == :subset
74
+ when :set
74
75
  self.set_nesting = set_nesting - 1 if token == :close
75
- end
76
-
77
- if type == :conditional
76
+ when :conditional
78
77
  self.conditional_nesting = conditional_nesting - 1 if token == :close
79
78
  end
80
79
  end
81
80
 
82
81
  def descend(type, token)
83
- if type == :group or type == :assertion
82
+ case type
83
+ when :group, :assertion
84
84
  self.nesting = nesting + 1 if OPENING_TOKENS.include?(token)
85
- end
86
-
87
- if type == :set or type == :subset
85
+ when :set
88
86
  self.set_nesting = set_nesting + 1 if token == :open
89
- end
90
-
91
- if type == :conditional
87
+ when :conditional
92
88
  self.conditional_nesting = conditional_nesting + 1 if token == :open
93
89
  end
94
90
  end
@@ -33,6 +33,8 @@ class Regexp::Parser
33
33
  self.switching_options = false
34
34
  self.conditional_nesting = []
35
35
 
36
+ self.captured_group_counts = Hash.new(0)
37
+
36
38
  Regexp::Lexer.scan(input, syntax) do |token|
37
39
  parse_token(token)
38
40
  end
@@ -48,7 +50,7 @@ class Regexp::Parser
48
50
 
49
51
  attr_accessor :root, :node, :nesting,
50
52
  :options_stack, :switching_options, :conditional_nesting,
51
- :current_set
53
+ :captured_group_counts
52
54
 
53
55
  def options_from_input(input)
54
56
  return {} unless input.is_a?(::Regexp)
@@ -63,9 +65,28 @@ class Regexp::Parser
63
65
  def nest(exp)
64
66
  nesting.push(exp)
65
67
  node << exp
68
+ update_transplanted_subtree(exp, node)
66
69
  self.node = exp
67
70
  end
68
71
 
72
+ # subtrees are transplanted to build Alternations, Intersections, Ranges
73
+ def update_transplanted_subtree(exp, new_parent)
74
+ exp.nesting_level = new_parent.nesting_level + 1
75
+ exp.respond_to?(:each) &&
76
+ exp.each { |subexp| update_transplanted_subtree(subexp, exp) }
77
+ end
78
+
79
+ def decrease_nesting
80
+ while nesting.last.is_a?(SequenceOperation)
81
+ nesting.pop
82
+ self.node = nesting.last
83
+ end
84
+ nesting.pop
85
+ yield(node) if block_given?
86
+ self.node = nesting.last
87
+ self.node = node.last if node.last.is_a?(SequenceOperation)
88
+ end
89
+
69
90
  def nest_conditional(exp)
70
91
  conditional_nesting.push(exp)
71
92
  node << exp
@@ -73,6 +94,8 @@ class Regexp::Parser
73
94
  end
74
95
 
75
96
  def parse_token(token)
97
+ close_completed_character_set_range
98
+
76
99
  case token.type
77
100
  when :meta; meta(token)
78
101
  when :quantifier; quantifier(token)
@@ -80,12 +103,14 @@ class Regexp::Parser
80
103
  when :escape; escape(token)
81
104
  when :group; group(token)
82
105
  when :assertion; group(token)
83
- when :set, :subset; set(token)
106
+ when :set; set(token)
84
107
  when :type; type(token)
85
108
  when :backref; backref(token)
86
109
  when :conditional; conditional(token)
87
110
  when :keep; keep(token)
88
111
 
112
+ when :posixclass, :nonposixclass
113
+ posixclass(token)
89
114
  when :property, :nonproperty
90
115
  property(token)
91
116
 
@@ -104,17 +129,15 @@ class Regexp::Parser
104
129
  when :open
105
130
  open_set(token)
106
131
  when :close
107
- close_set(token)
132
+ close_set
108
133
  when :negate
109
134
  negate_set
110
- when :member, :range, :escape, :collation, :equivalent
111
- append_set(token)
112
- when *Token::Escape::All
113
- append_set(token)
114
- when *Token::CharacterSet::All
115
- append_set(token)
116
- when *Token::UnicodeProperty::All
117
- append_set(token)
135
+ when :range
136
+ range(token)
137
+ when :intersection
138
+ intersection(token)
139
+ when :collation, :equivalent
140
+ node << Literal.new(token, active_opts)
118
141
  else
119
142
  raise UnknownTokenError.new('CharacterSet', token)
120
143
  end
@@ -125,19 +148,7 @@ class Regexp::Parser
125
148
  when :dot
126
149
  node << CharacterType::Any.new(token, active_opts)
127
150
  when :alternation
128
- if node.token == :alternation
129
- elsif node.last.is_a?(Alternation)
130
- self.node = node.last
131
- else
132
- alt = Alternation.new(token, active_opts)
133
- seq = Alternative.new(alt.level, alt.set_level, alt.conditional_level)
134
- node.expressions.count.times { seq.insert(node.expressions.pop) }
135
- alt.alternative(seq)
136
-
137
- node << alt
138
- self.node = alt
139
- end
140
- node.alternative
151
+ sequence_operation(Alternation, token)
141
152
  else
142
153
  raise UnknownTokenError.new('Meta', token)
143
154
  end
@@ -147,16 +158,16 @@ class Regexp::Parser
147
158
  case token.token
148
159
  when :name_ref
149
160
  node << Backreference::Name.new(token, active_opts)
150
- when :name_nest_ref
151
- node << Backreference::NameNestLevel.new(token, active_opts)
161
+ when :name_recursion_ref
162
+ node << Backreference::NameRecursionLevel.new(token, active_opts)
152
163
  when :name_call
153
164
  node << Backreference::NameCall.new(token, active_opts)
154
165
  when :number, :number_ref
155
166
  node << Backreference::Number.new(token, active_opts)
156
167
  when :number_rel_ref
157
168
  node << Backreference::NumberRelative.new(token, active_opts)
158
- when :number_nest_ref
159
- node << Backreference::NumberNestLevel.new(token, active_opts)
169
+ when :number_recursion_ref
170
+ node << Backreference::NumberRecursionLevel.new(token, active_opts)
160
171
  when :number_call
161
172
  node << Backreference::NumberCall.new(token, active_opts)
162
173
  when :number_rel_call
@@ -217,75 +228,81 @@ class Regexp::Parser
217
228
  end
218
229
  end
219
230
 
231
+ def posixclass(token)
232
+ node << PosixClass.new(token)
233
+ end
234
+
220
235
  include Regexp::Expression::UnicodeProperty
221
236
 
222
237
  def property(token)
223
238
  case token.token
224
- when :alnum; node << Alnum.new(token, active_opts)
225
- when :alpha; node << Alpha.new(token, active_opts)
226
- when :ascii; node << Ascii.new(token, active_opts)
227
- when :blank; node << Blank.new(token, active_opts)
228
- when :cntrl; node << Cntrl.new(token, active_opts)
229
- when :digit; node << Digit.new(token, active_opts)
230
- when :graph; node << Graph.new(token, active_opts)
231
- when :lower; node << Lower.new(token, active_opts)
232
- when :print; node << Print.new(token, active_opts)
233
- when :punct; node << Punct.new(token, active_opts)
234
- when :space; node << Space.new(token, active_opts)
235
- when :upper; node << Upper.new(token, active_opts)
236
- when :word; node << Word.new(token, active_opts)
237
- when :xdigit; node << Xdigit.new(token, active_opts)
238
- when :xposixpunct; node << XPosixPunct.new(token, active_opts)
239
+ when :alnum; node << Alnum.new(token, active_opts)
240
+ when :alpha; node << Alpha.new(token, active_opts)
241
+ when :ascii; node << Ascii.new(token, active_opts)
242
+ when :blank; node << Blank.new(token, active_opts)
243
+ when :cntrl; node << Cntrl.new(token, active_opts)
244
+ when :digit; node << Digit.new(token, active_opts)
245
+ when :graph; node << Graph.new(token, active_opts)
246
+ when :lower; node << Lower.new(token, active_opts)
247
+ when :print; node << Print.new(token, active_opts)
248
+ when :punct; node << Punct.new(token, active_opts)
249
+ when :space; node << Space.new(token, active_opts)
250
+ when :upper; node << Upper.new(token, active_opts)
251
+ when :word; node << Word.new(token, active_opts)
252
+ when :xdigit; node << Xdigit.new(token, active_opts)
253
+ when :xposixpunct; node << XPosixPunct.new(token, active_opts)
239
254
 
240
255
  # only in Oniguruma (old rubies)
241
- when :newline; node << Newline.new(token, active_opts)
242
-
243
- when :any; node << Any.new(token, active_opts)
244
- when :assigned; node << Assigned.new(token, active_opts)
245
-
246
- when :letter_any; node << Letter::Any.new(token, active_opts)
247
- when :letter_uppercase; node << Letter::Uppercase.new(token, active_opts)
248
- when :letter_lowercase; node << Letter::Lowercase.new(token, active_opts)
249
- when :letter_titlecase; node << Letter::Titlecase.new(token, active_opts)
250
- when :letter_modifier; node << Letter::Modifier.new(token, active_opts)
251
- when :letter_other; node << Letter::Other.new(token, active_opts)
252
-
253
- when :mark_any; node << Mark::Any.new(token, active_opts)
254
- when :mark_nonspacing; node << Mark::Nonspacing.new(token, active_opts)
255
- when :mark_spacing; node << Mark::Spacing.new(token, active_opts)
256
- when :mark_enclosing; node << Mark::Enclosing.new(token, active_opts)
257
-
258
- when :number_any; node << Number::Any.new(token, active_opts)
259
- when :number_decimal; node << Number::Decimal.new(token, active_opts)
260
- when :number_letter; node << Number::Letter.new(token, active_opts)
261
- when :number_other; node << Number::Other.new(token, active_opts)
262
-
263
- when :punct_any; node << Punctuation::Any.new(token, active_opts)
264
- when :punct_connector; node << Punctuation::Connector.new(token, active_opts)
265
- when :punct_dash; node << Punctuation::Dash.new(token, active_opts)
266
- when :punct_open; node << Punctuation::Open.new(token, active_opts)
267
- when :punct_close; node << Punctuation::Close.new(token, active_opts)
268
- when :punct_initial; node << Punctuation::Initial.new(token, active_opts)
269
- when :punct_final; node << Punctuation::Final.new(token, active_opts)
270
- when :punct_other; node << Punctuation::Other.new(token, active_opts)
271
-
272
- when :separator_any; node << Separator::Any.new(token, active_opts)
273
- when :separator_space; node << Separator::Space.new(token, active_opts)
274
- when :separator_line; node << Separator::Line.new(token, active_opts)
275
- when :separator_para; node << Separator::Paragraph.new(token, active_opts)
276
-
277
- when :symbol_any; node << Symbol::Any.new(token, active_opts)
278
- when :symbol_math; node << Symbol::Math.new(token, active_opts)
279
- when :symbol_currency; node << Symbol::Currency.new(token, active_opts)
280
- when :symbol_modifier; node << Symbol::Modifier.new(token, active_opts)
281
- when :symbol_other; node << Symbol::Other.new(token, active_opts)
282
-
283
- when :other; node << Codepoint::Any.new(token, active_opts)
284
- when :control; node << Codepoint::Control.new(token, active_opts)
285
- when :format; node << Codepoint::Format.new(token, active_opts)
286
- when :surrogate; node << Codepoint::Surrogate.new(token, active_opts)
287
- when :private_use; node << Codepoint::PrivateUse.new(token, active_opts)
288
- when :unassigned; node << Codepoint::Unassigned.new(token, active_opts)
256
+ when :newline; node << Newline.new(token, active_opts)
257
+
258
+ when :any; node << Any.new(token, active_opts)
259
+ when :assigned; node << Assigned.new(token, active_opts)
260
+
261
+ when :letter; node << Letter::Any.new(token, active_opts)
262
+ when :cased_letter; node << Letter::Cased.new(token, active_opts)
263
+ when :uppercase_letter; node << Letter::Uppercase.new(token, active_opts)
264
+ when :lowercase_letter; node << Letter::Lowercase.new(token, active_opts)
265
+ when :titlecase_letter; node << Letter::Titlecase.new(token, active_opts)
266
+ when :modifier_letter; node << Letter::Modifier.new(token, active_opts)
267
+ when :other_letter; node << Letter::Other.new(token, active_opts)
268
+
269
+ when :mark; node << Mark::Any.new(token, active_opts)
270
+ when :combining_mark; node << Mark::Combining.new(token, active_opts)
271
+ when :nonspacing_mark; node << Mark::Nonspacing.new(token, active_opts)
272
+ when :spacing_mark; node << Mark::Spacing.new(token, active_opts)
273
+ when :enclosing_mark; node << Mark::Enclosing.new(token, active_opts)
274
+
275
+ when :number; node << Number::Any.new(token, active_opts)
276
+ when :decimal_number; node << Number::Decimal.new(token, active_opts)
277
+ when :letter_number; node << Number::Letter.new(token, active_opts)
278
+ when :other_number; node << Number::Other.new(token, active_opts)
279
+
280
+ when :punctuation; node << Punctuation::Any.new(token, active_opts)
281
+ when :connector_punctuation; node << Punctuation::Connector.new(token, active_opts)
282
+ when :dash_punctuation; node << Punctuation::Dash.new(token, active_opts)
283
+ when :open_punctuation; node << Punctuation::Open.new(token, active_opts)
284
+ when :close_punctuation; node << Punctuation::Close.new(token, active_opts)
285
+ when :initial_punctuation; node << Punctuation::Initial.new(token, active_opts)
286
+ when :final_punctuation; node << Punctuation::Final.new(token, active_opts)
287
+ when :other_punctuation; node << Punctuation::Other.new(token, active_opts)
288
+
289
+ when :separator; node << Separator::Any.new(token, active_opts)
290
+ when :space_separator; node << Separator::Space.new(token, active_opts)
291
+ when :line_separator; node << Separator::Line.new(token, active_opts)
292
+ when :paragraph_separator; node << Separator::Paragraph.new(token, active_opts)
293
+
294
+ when :symbol; node << Symbol::Any.new(token, active_opts)
295
+ when :math_symbol; node << Symbol::Math.new(token, active_opts)
296
+ when :currency_symbol; node << Symbol::Currency.new(token, active_opts)
297
+ when :modifier_symbol; node << Symbol::Modifier.new(token, active_opts)
298
+ when :other_symbol; node << Symbol::Other.new(token, active_opts)
299
+
300
+ when :other; node << Codepoint::Any.new(token, active_opts)
301
+ when :control; node << Codepoint::Control.new(token, active_opts)
302
+ when :format; node << Codepoint::Format.new(token, active_opts)
303
+ when :surrogate; node << Codepoint::Surrogate.new(token, active_opts)
304
+ when :private_use; node << Codepoint::PrivateUse.new(token, active_opts)
305
+ when :unassigned; node << Codepoint::Unassigned.new(token, active_opts)
289
306
 
290
307
  when *Token::UnicodeProperty::Age
291
308
  node << Age.new(token, active_opts)
@@ -346,13 +363,20 @@ class Regexp::Parser
346
363
  node << EscapeSequence::Newline.new(token, active_opts)
347
364
  when :carriage
348
365
  node << EscapeSequence::Return.new(token, active_opts)
349
- when :space
350
- node << EscapeSequence::Space.new(token, active_opts)
351
366
  when :tab
352
367
  node << EscapeSequence::Tab.new(token, active_opts)
353
368
  when :vertical_tab
354
369
  node << EscapeSequence::VerticalTab.new(token, active_opts)
355
370
 
371
+ when :hex
372
+ node << EscapeSequence::Hex.new(token, active_opts)
373
+ when :octal
374
+ node << EscapeSequence::Octal.new(token, active_opts)
375
+ when :codepoint
376
+ node << EscapeSequence::Codepoint.new(token, active_opts)
377
+ when :codepoint_list
378
+ node << EscapeSequence::CodepointList.new(token, active_opts)
379
+
356
380
  when :control
357
381
  if token.text =~ /\A(?:\\C-\\M|\\c\\M)/
358
382
  node << EscapeSequence::MetaControl.new(token, active_opts)
@@ -447,7 +471,7 @@ class Regexp::Parser
447
471
  mode = :greedy
448
472
  end
449
473
 
450
- range = range_text.gsub(/\{|\}/, '').split(',', 2).each {|i| i.strip}
474
+ range = range_text.gsub(/\{|\}/, '').split(',', 2)
451
475
  min = range[0].empty? ? 0 : range[0]
452
476
  max = range[1] ? (range[1].empty? ? -1 : range[1]) : min
453
477
 
@@ -456,7 +480,7 @@ class Regexp::Parser
456
480
 
457
481
  def group(token)
458
482
  case token.token
459
- when :options
483
+ when :options, :options_switch
460
484
  options_group(token)
461
485
  when :close
462
486
  close_group
@@ -470,8 +494,7 @@ class Regexp::Parser
470
494
  def options_group(token)
471
495
  positive, negative = token.text.split('-', 2)
472
496
  negative ||= ''
473
- self.switching_options = !token.text.include?(':')
474
- # TODO: change this -^ to token.type == :options_switch in v1.0.0
497
+ self.switching_options = token.token.equal?(:options_switch)
475
498
 
476
499
  new_options = active_opts.dup
477
500
 
@@ -491,9 +514,7 @@ class Regexp::Parser
491
514
 
492
515
  options_stack << new_options
493
516
 
494
- exp = Group::Options.new(token, active_opts)
495
-
496
- nest(exp)
517
+ nest(Group::Options.new(token, active_opts))
497
518
  end
498
519
 
499
520
  def open_group(token)
@@ -522,6 +543,12 @@ class Regexp::Parser
522
543
  raise UnknownTokenError.new('Group type open', token)
523
544
  end
524
545
 
546
+ if exp.capturing?
547
+ exp.number = total_captured_group_count + 1
548
+ exp.number_at_level = captured_group_count_at_level + 1
549
+ count_captured_group
550
+ end
551
+
525
552
  # Push the active options to the stack again. This way we can simply pop the
526
553
  # stack for any group we close, no matter if it had its own options or not.
527
554
  options_stack << active_opts
@@ -530,38 +557,65 @@ class Regexp::Parser
530
557
  end
531
558
 
532
559
  def close_group
533
- nesting.pop
534
560
  options_stack.pop unless switching_options
535
561
  self.switching_options = false
536
-
537
- self.node = nesting.last
538
- self.node = node.last if node.last and node.last.is_a?(Alternation)
562
+ decrease_nesting
539
563
  end
540
564
 
541
565
  def open_set(token)
542
566
  token.token = :character
543
-
544
- if token.type == :subset
545
- current_set << CharacterSubSet.new(token, active_opts)
546
- else
547
- self.current_set = CharacterSet.new(token, active_opts)
548
- node << current_set
549
- end
567
+ nest(CharacterSet.new(token, active_opts))
550
568
  end
551
569
 
552
570
  def negate_set
553
- current_set.negate
571
+ node.negate
554
572
  end
555
573
 
556
- def append_set(token)
557
- current_set << token.text
574
+ def close_set
575
+ decrease_nesting(&:close)
558
576
  end
559
577
 
560
- def close_set(token)
561
- current_set.close
578
+ def range(token)
579
+ exp = CharacterSet::Range.new(token, active_opts)
580
+ scope = node.last.is_a?(CharacterSet::IntersectedSequence) ? node.last : node
581
+ exp << scope.expressions.pop
582
+ nest(exp)
583
+ end
584
+
585
+ def close_completed_character_set_range
586
+ decrease_nesting if node.is_a?(CharacterSet::Range) && node.complete?
587
+ end
588
+
589
+ def intersection(token)
590
+ sequence_operation(CharacterSet::Intersection, token)
591
+ end
592
+
593
+ def sequence_operation(klass, token)
594
+ if node.last.is_a?(klass)
595
+ self.node = node.last
596
+ elsif !node.is_a?(klass)
597
+ operator = klass.new(token, active_opts)
598
+ sequence = operator.add_sequence
599
+ sequence.expressions = node.expressions
600
+ node.expressions = []
601
+ nest(operator)
602
+ end
603
+ node.add_sequence
562
604
  end
563
605
 
564
606
  def active_opts
565
607
  options_stack.last
566
608
  end
609
+
610
+ def total_captured_group_count
611
+ captured_group_counts.values.reduce(0, :+)
612
+ end
613
+
614
+ def captured_group_count_at_level
615
+ captured_group_counts[node.level]
616
+ end
617
+
618
+ def count_captured_group
619
+ captured_group_counts[node.level] += 1
620
+ end
567
621
  end # module Regexp::Parser