regexp_parser 0.5.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +242 -0
  3. data/Gemfile +1 -0
  4. data/README.md +21 -17
  5. data/Rakefile +31 -0
  6. data/lib/regexp_parser/expression.rb +11 -9
  7. data/lib/regexp_parser/expression/classes/alternation.rb +5 -28
  8. data/lib/regexp_parser/expression/classes/backref.rb +21 -16
  9. data/lib/regexp_parser/expression/classes/escape.rb +81 -10
  10. data/lib/regexp_parser/expression/classes/group.rb +20 -20
  11. data/lib/regexp_parser/expression/classes/{character_class.rb → posix_class.rb} +2 -2
  12. data/lib/regexp_parser/expression/classes/property.rb +6 -0
  13. data/lib/regexp_parser/expression/classes/set.rb +10 -93
  14. data/lib/regexp_parser/expression/classes/set/intersection.rb +9 -0
  15. data/lib/regexp_parser/expression/classes/set/range.rb +23 -0
  16. data/lib/regexp_parser/expression/methods/strfregexp.rb +6 -4
  17. data/lib/regexp_parser/expression/methods/tests.rb +4 -14
  18. data/lib/regexp_parser/expression/methods/traverse.rb +1 -1
  19. data/lib/regexp_parser/expression/quantifier.rb +3 -4
  20. data/lib/regexp_parser/expression/sequence_operation.rb +34 -0
  21. data/lib/regexp_parser/expression/subexpression.rb +6 -10
  22. data/lib/regexp_parser/lexer.rb +13 -17
  23. data/lib/regexp_parser/parser.rb +170 -116
  24. data/lib/regexp_parser/scanner.rb +952 -2431
  25. data/lib/regexp_parser/scanner/char_type.rl +31 -0
  26. data/lib/regexp_parser/scanner/properties/long.yml +561 -0
  27. data/lib/regexp_parser/scanner/properties/short.yml +225 -0
  28. data/lib/regexp_parser/scanner/property.rl +7 -806
  29. data/lib/regexp_parser/scanner/scanner.rl +112 -154
  30. data/lib/regexp_parser/syntax/base.rb +4 -4
  31. data/lib/regexp_parser/syntax/tokens.rb +1 -0
  32. data/lib/regexp_parser/syntax/tokens/backref.rb +2 -2
  33. data/lib/regexp_parser/syntax/tokens/character_set.rb +3 -38
  34. data/lib/regexp_parser/syntax/tokens/escape.rb +2 -3
  35. data/lib/regexp_parser/syntax/tokens/group.rb +5 -4
  36. data/lib/regexp_parser/syntax/tokens/{character_class.rb → posix_class.rb} +5 -5
  37. data/lib/regexp_parser/syntax/tokens/unicode_property.rb +519 -266
  38. data/lib/regexp_parser/syntax/versions/1.8.6.rb +2 -4
  39. data/lib/regexp_parser/syntax/versions/1.9.1.rb +4 -10
  40. data/lib/regexp_parser/syntax/versions/2.0.0.rb +0 -2
  41. data/lib/regexp_parser/syntax/versions/2.4.1.rb +1 -1
  42. data/lib/regexp_parser/version.rb +1 -1
  43. data/regexp_parser.gemspec +2 -1
  44. data/test/expression/test_base.rb +2 -1
  45. data/test/expression/test_clone.rb +0 -57
  46. data/test/expression/test_set.rb +31 -8
  47. data/test/expression/test_strfregexp.rb +13 -4
  48. data/test/expression/test_subexpression.rb +25 -0
  49. data/test/expression/test_traverse.rb +25 -25
  50. data/test/helpers.rb +1 -0
  51. data/test/lexer/test_all.rb +1 -1
  52. data/test/lexer/test_conditionals.rb +9 -7
  53. data/test/lexer/test_nesting.rb +39 -21
  54. data/test/lexer/test_refcalls.rb +4 -4
  55. data/test/parser/set/test_intersections.rb +127 -0
  56. data/test/parser/set/test_ranges.rb +111 -0
  57. data/test/parser/test_all.rb +4 -1
  58. data/test/parser/test_escapes.rb +41 -9
  59. data/test/parser/test_groups.rb +22 -3
  60. data/test/parser/test_posix_classes.rb +27 -0
  61. data/test/parser/test_properties.rb +17 -290
  62. data/test/parser/test_refcalls.rb +66 -26
  63. data/test/parser/test_sets.rb +132 -129
  64. data/test/scanner/test_all.rb +1 -7
  65. data/test/scanner/test_conditionals.rb +16 -16
  66. data/test/scanner/test_errors.rb +0 -30
  67. data/test/scanner/test_escapes.rb +1 -2
  68. data/test/scanner/test_free_space.rb +28 -28
  69. data/test/scanner/test_groups.rb +35 -35
  70. data/test/scanner/test_meta.rb +1 -1
  71. data/test/scanner/test_properties.rb +87 -114
  72. data/test/scanner/test_refcalls.rb +18 -18
  73. data/test/scanner/test_scripts.rb +19 -351
  74. data/test/scanner/test_sets.rb +87 -60
  75. data/test/scanner/test_unicode_blocks.rb +4 -105
  76. data/test/support/warning_extractor.rb +1 -1
  77. data/test/syntax/test_syntax.rb +7 -0
  78. data/test/syntax/versions/test_1.8.rb +2 -4
  79. metadata +17 -7
  80. data/ChangeLog +0 -325
  81. data/test/scanner/test_emojis.rb +0 -31
@@ -7,22 +7,12 @@ module Regexp::Expression
7
7
  # # is it a :group expression
8
8
  # exp.type? :group
9
9
  #
10
- # # is it a :set, :subset, or :meta
11
- # exp.type? [:set, :subset, :meta]
10
+ # # is it a :set, or :meta
11
+ # exp.type? [:set, :meta]
12
12
  #
13
13
  def type?(test_type)
14
- case test_type
15
- when Array
16
- if test_type.include?(:*)
17
- return (test_type.include?(type) or test_type.include?(:*))
18
- else
19
- return test_type.include?(type)
20
- end
21
- when Symbol
22
- return (type == test_type or test_type == :*)
23
- else
24
- raise "Array or Symbol expected, #{test_type.class.name} given"
25
- end
14
+ test_types = Array(test_type).map(&:to_sym)
15
+ test_types.include?(:*) || test_types.include?(type)
26
16
  end
27
17
 
28
18
  # Test if this expression has the given test_token, and optionally a given
@@ -45,7 +45,7 @@ module Regexp::Expression
45
45
  # Returns a new array with the results of calling the given block once
46
46
  # for every expression. If a block is not given, returns an array with
47
47
  # each expression and its level index as an array.
48
- def map(include_self = false, &block)
48
+ def flat_map(include_self = false, &block)
49
49
  result = []
50
50
 
51
51
  each_expression(include_self) do |exp, index|
@@ -11,10 +11,9 @@ module Regexp::Expression
11
11
  @max = max
12
12
  end
13
13
 
14
- def clone
15
- copy = dup
16
- copy.instance_variable_set(:@text, text.dup)
17
- copy
14
+ def initialize_clone(other)
15
+ other.instance_variable_set(:@text, text.dup)
16
+ super
18
17
  end
19
18
 
20
19
  def to_s
@@ -0,0 +1,34 @@
1
+ module Regexp::Expression
2
+ # abstract class
3
+ class SequenceOperation < Regexp::Expression::Subexpression
4
+ alias :sequences :expressions
5
+ alias :operands :expressions
6
+ alias :operator :text
7
+
8
+ def starts_at
9
+ expressions.first.starts_at
10
+ end
11
+ alias :ts :starts_at
12
+
13
+ def <<(exp)
14
+ expressions.last << exp
15
+ end
16
+
17
+ def add_sequence
18
+ exp = self.class::OPERAND.new(level, set_level, conditional_level)
19
+ exp.nesting_level = nesting_level + 1
20
+ expressions << exp
21
+ exp
22
+ end
23
+
24
+ def quantify(token, text, min = nil, max = nil, mode = :greedy)
25
+ sequences.last.last.quantify(token, text, min, max, mode)
26
+ sequences.last.last.quantify(token, text, min, max, mode)
27
+ end
28
+
29
+ def to_s(format = :full)
30
+ sequences.map { |e| e.to_s(format) }.join(text)
31
+ sequences.map { |e| e.to_s(format) }.join(text)
32
+ end
33
+ end
34
+ end
@@ -10,26 +10,22 @@ module Regexp::Expression
10
10
  end
11
11
 
12
12
  # Override base method to clone the expressions as well.
13
- def clone
14
- copy = super
15
- copy.expressions = expressions.map(&:clone)
16
- copy
13
+ def initialize_clone(other)
14
+ other.expressions = expressions.map(&:clone)
15
+ super
17
16
  end
18
17
 
19
18
  def <<(exp)
20
19
  if exp.is_a?(WhiteSpace) && last && last.is_a?(WhiteSpace)
21
20
  last.merge(exp)
22
21
  else
22
+ exp.nesting_level = nesting_level + 1
23
23
  expressions << exp
24
24
  end
25
25
  end
26
26
 
27
- def insert(exp)
28
- expressions.insert(0, exp)
29
- end
30
-
31
- %w[[] all? any? at count each each_with_index empty?
32
- fetch find first index join last length values_at].each do |m|
27
+ %w[[] all? any? at collect count each each_with_index empty?
28
+ fetch find first index join last length map values_at].each do |m|
33
29
  define_method(m) { |*args, &block| expressions.send(m, *args, &block) }
34
30
  end
35
31
 
@@ -4,9 +4,10 @@
4
4
  # given syntax flavor.
5
5
  class Regexp::Lexer
6
6
 
7
- OPENING_TOKENS = [:capture, :options, :passive, :atomic, :named, :absence,
8
- :lookahead, :nlookahead, :lookbehind, :nlookbehind
9
- ].freeze
7
+ OPENING_TOKENS = [
8
+ :capture, :passive, :lookahead, :nlookahead, :lookbehind, :nlookbehind,
9
+ :atomic, :options, :options_switch, :named, :absence
10
+ ].freeze
10
11
 
11
12
  CLOSING_TOKENS = [:close].freeze
12
13
 
@@ -36,6 +37,7 @@ class Regexp::Lexer
36
37
  nesting, set_nesting, conditional_nesting)
37
38
 
38
39
  current = merge_literal(current) if type == :literal and
40
+ set_nesting == 0 and
39
41
  last and last.type == :literal
40
42
 
41
43
  current = merge_condition(current) if type == :conditional and
@@ -66,29 +68,23 @@ class Regexp::Lexer
66
68
  attr_accessor :tokens, :nesting, :set_nesting, :conditional_nesting
67
69
 
68
70
  def ascend(type, token)
69
- if type == :group or type == :assertion
71
+ case type
72
+ when :group, :assertion
70
73
  self.nesting = nesting - 1 if CLOSING_TOKENS.include?(token)
71
- end
72
-
73
- if type == :set or type == :subset
74
+ when :set
74
75
  self.set_nesting = set_nesting - 1 if token == :close
75
- end
76
-
77
- if type == :conditional
76
+ when :conditional
78
77
  self.conditional_nesting = conditional_nesting - 1 if token == :close
79
78
  end
80
79
  end
81
80
 
82
81
  def descend(type, token)
83
- if type == :group or type == :assertion
82
+ case type
83
+ when :group, :assertion
84
84
  self.nesting = nesting + 1 if OPENING_TOKENS.include?(token)
85
- end
86
-
87
- if type == :set or type == :subset
85
+ when :set
88
86
  self.set_nesting = set_nesting + 1 if token == :open
89
- end
90
-
91
- if type == :conditional
87
+ when :conditional
92
88
  self.conditional_nesting = conditional_nesting + 1 if token == :open
93
89
  end
94
90
  end
@@ -33,6 +33,8 @@ class Regexp::Parser
33
33
  self.switching_options = false
34
34
  self.conditional_nesting = []
35
35
 
36
+ self.captured_group_counts = Hash.new(0)
37
+
36
38
  Regexp::Lexer.scan(input, syntax) do |token|
37
39
  parse_token(token)
38
40
  end
@@ -48,7 +50,7 @@ class Regexp::Parser
48
50
 
49
51
  attr_accessor :root, :node, :nesting,
50
52
  :options_stack, :switching_options, :conditional_nesting,
51
- :current_set
53
+ :captured_group_counts
52
54
 
53
55
  def options_from_input(input)
54
56
  return {} unless input.is_a?(::Regexp)
@@ -63,9 +65,28 @@ class Regexp::Parser
63
65
  def nest(exp)
64
66
  nesting.push(exp)
65
67
  node << exp
68
+ update_transplanted_subtree(exp, node)
66
69
  self.node = exp
67
70
  end
68
71
 
72
+ # subtrees are transplanted to build Alternations, Intersections, Ranges
73
+ def update_transplanted_subtree(exp, new_parent)
74
+ exp.nesting_level = new_parent.nesting_level + 1
75
+ exp.respond_to?(:each) &&
76
+ exp.each { |subexp| update_transplanted_subtree(subexp, exp) }
77
+ end
78
+
79
+ def decrease_nesting
80
+ while nesting.last.is_a?(SequenceOperation)
81
+ nesting.pop
82
+ self.node = nesting.last
83
+ end
84
+ nesting.pop
85
+ yield(node) if block_given?
86
+ self.node = nesting.last
87
+ self.node = node.last if node.last.is_a?(SequenceOperation)
88
+ end
89
+
69
90
  def nest_conditional(exp)
70
91
  conditional_nesting.push(exp)
71
92
  node << exp
@@ -73,6 +94,8 @@ class Regexp::Parser
73
94
  end
74
95
 
75
96
  def parse_token(token)
97
+ close_completed_character_set_range
98
+
76
99
  case token.type
77
100
  when :meta; meta(token)
78
101
  when :quantifier; quantifier(token)
@@ -80,12 +103,14 @@ class Regexp::Parser
80
103
  when :escape; escape(token)
81
104
  when :group; group(token)
82
105
  when :assertion; group(token)
83
- when :set, :subset; set(token)
106
+ when :set; set(token)
84
107
  when :type; type(token)
85
108
  when :backref; backref(token)
86
109
  when :conditional; conditional(token)
87
110
  when :keep; keep(token)
88
111
 
112
+ when :posixclass, :nonposixclass
113
+ posixclass(token)
89
114
  when :property, :nonproperty
90
115
  property(token)
91
116
 
@@ -104,17 +129,15 @@ class Regexp::Parser
104
129
  when :open
105
130
  open_set(token)
106
131
  when :close
107
- close_set(token)
132
+ close_set
108
133
  when :negate
109
134
  negate_set
110
- when :member, :range, :escape, :collation, :equivalent
111
- append_set(token)
112
- when *Token::Escape::All
113
- append_set(token)
114
- when *Token::CharacterSet::All
115
- append_set(token)
116
- when *Token::UnicodeProperty::All
117
- append_set(token)
135
+ when :range
136
+ range(token)
137
+ when :intersection
138
+ intersection(token)
139
+ when :collation, :equivalent
140
+ node << Literal.new(token, active_opts)
118
141
  else
119
142
  raise UnknownTokenError.new('CharacterSet', token)
120
143
  end
@@ -125,19 +148,7 @@ class Regexp::Parser
125
148
  when :dot
126
149
  node << CharacterType::Any.new(token, active_opts)
127
150
  when :alternation
128
- if node.token == :alternation
129
- elsif node.last.is_a?(Alternation)
130
- self.node = node.last
131
- else
132
- alt = Alternation.new(token, active_opts)
133
- seq = Alternative.new(alt.level, alt.set_level, alt.conditional_level)
134
- node.expressions.count.times { seq.insert(node.expressions.pop) }
135
- alt.alternative(seq)
136
-
137
- node << alt
138
- self.node = alt
139
- end
140
- node.alternative
151
+ sequence_operation(Alternation, token)
141
152
  else
142
153
  raise UnknownTokenError.new('Meta', token)
143
154
  end
@@ -147,16 +158,16 @@ class Regexp::Parser
147
158
  case token.token
148
159
  when :name_ref
149
160
  node << Backreference::Name.new(token, active_opts)
150
- when :name_nest_ref
151
- node << Backreference::NameNestLevel.new(token, active_opts)
161
+ when :name_recursion_ref
162
+ node << Backreference::NameRecursionLevel.new(token, active_opts)
152
163
  when :name_call
153
164
  node << Backreference::NameCall.new(token, active_opts)
154
165
  when :number, :number_ref
155
166
  node << Backreference::Number.new(token, active_opts)
156
167
  when :number_rel_ref
157
168
  node << Backreference::NumberRelative.new(token, active_opts)
158
- when :number_nest_ref
159
- node << Backreference::NumberNestLevel.new(token, active_opts)
169
+ when :number_recursion_ref
170
+ node << Backreference::NumberRecursionLevel.new(token, active_opts)
160
171
  when :number_call
161
172
  node << Backreference::NumberCall.new(token, active_opts)
162
173
  when :number_rel_call
@@ -217,75 +228,81 @@ class Regexp::Parser
217
228
  end
218
229
  end
219
230
 
231
+ def posixclass(token)
232
+ node << PosixClass.new(token)
233
+ end
234
+
220
235
  include Regexp::Expression::UnicodeProperty
221
236
 
222
237
  def property(token)
223
238
  case token.token
224
- when :alnum; node << Alnum.new(token, active_opts)
225
- when :alpha; node << Alpha.new(token, active_opts)
226
- when :ascii; node << Ascii.new(token, active_opts)
227
- when :blank; node << Blank.new(token, active_opts)
228
- when :cntrl; node << Cntrl.new(token, active_opts)
229
- when :digit; node << Digit.new(token, active_opts)
230
- when :graph; node << Graph.new(token, active_opts)
231
- when :lower; node << Lower.new(token, active_opts)
232
- when :print; node << Print.new(token, active_opts)
233
- when :punct; node << Punct.new(token, active_opts)
234
- when :space; node << Space.new(token, active_opts)
235
- when :upper; node << Upper.new(token, active_opts)
236
- when :word; node << Word.new(token, active_opts)
237
- when :xdigit; node << Xdigit.new(token, active_opts)
238
- when :xposixpunct; node << XPosixPunct.new(token, active_opts)
239
+ when :alnum; node << Alnum.new(token, active_opts)
240
+ when :alpha; node << Alpha.new(token, active_opts)
241
+ when :ascii; node << Ascii.new(token, active_opts)
242
+ when :blank; node << Blank.new(token, active_opts)
243
+ when :cntrl; node << Cntrl.new(token, active_opts)
244
+ when :digit; node << Digit.new(token, active_opts)
245
+ when :graph; node << Graph.new(token, active_opts)
246
+ when :lower; node << Lower.new(token, active_opts)
247
+ when :print; node << Print.new(token, active_opts)
248
+ when :punct; node << Punct.new(token, active_opts)
249
+ when :space; node << Space.new(token, active_opts)
250
+ when :upper; node << Upper.new(token, active_opts)
251
+ when :word; node << Word.new(token, active_opts)
252
+ when :xdigit; node << Xdigit.new(token, active_opts)
253
+ when :xposixpunct; node << XPosixPunct.new(token, active_opts)
239
254
 
240
255
  # only in Oniguruma (old rubies)
241
- when :newline; node << Newline.new(token, active_opts)
242
-
243
- when :any; node << Any.new(token, active_opts)
244
- when :assigned; node << Assigned.new(token, active_opts)
245
-
246
- when :letter_any; node << Letter::Any.new(token, active_opts)
247
- when :letter_uppercase; node << Letter::Uppercase.new(token, active_opts)
248
- when :letter_lowercase; node << Letter::Lowercase.new(token, active_opts)
249
- when :letter_titlecase; node << Letter::Titlecase.new(token, active_opts)
250
- when :letter_modifier; node << Letter::Modifier.new(token, active_opts)
251
- when :letter_other; node << Letter::Other.new(token, active_opts)
252
-
253
- when :mark_any; node << Mark::Any.new(token, active_opts)
254
- when :mark_nonspacing; node << Mark::Nonspacing.new(token, active_opts)
255
- when :mark_spacing; node << Mark::Spacing.new(token, active_opts)
256
- when :mark_enclosing; node << Mark::Enclosing.new(token, active_opts)
257
-
258
- when :number_any; node << Number::Any.new(token, active_opts)
259
- when :number_decimal; node << Number::Decimal.new(token, active_opts)
260
- when :number_letter; node << Number::Letter.new(token, active_opts)
261
- when :number_other; node << Number::Other.new(token, active_opts)
262
-
263
- when :punct_any; node << Punctuation::Any.new(token, active_opts)
264
- when :punct_connector; node << Punctuation::Connector.new(token, active_opts)
265
- when :punct_dash; node << Punctuation::Dash.new(token, active_opts)
266
- when :punct_open; node << Punctuation::Open.new(token, active_opts)
267
- when :punct_close; node << Punctuation::Close.new(token, active_opts)
268
- when :punct_initial; node << Punctuation::Initial.new(token, active_opts)
269
- when :punct_final; node << Punctuation::Final.new(token, active_opts)
270
- when :punct_other; node << Punctuation::Other.new(token, active_opts)
271
-
272
- when :separator_any; node << Separator::Any.new(token, active_opts)
273
- when :separator_space; node << Separator::Space.new(token, active_opts)
274
- when :separator_line; node << Separator::Line.new(token, active_opts)
275
- when :separator_para; node << Separator::Paragraph.new(token, active_opts)
276
-
277
- when :symbol_any; node << Symbol::Any.new(token, active_opts)
278
- when :symbol_math; node << Symbol::Math.new(token, active_opts)
279
- when :symbol_currency; node << Symbol::Currency.new(token, active_opts)
280
- when :symbol_modifier; node << Symbol::Modifier.new(token, active_opts)
281
- when :symbol_other; node << Symbol::Other.new(token, active_opts)
282
-
283
- when :other; node << Codepoint::Any.new(token, active_opts)
284
- when :control; node << Codepoint::Control.new(token, active_opts)
285
- when :format; node << Codepoint::Format.new(token, active_opts)
286
- when :surrogate; node << Codepoint::Surrogate.new(token, active_opts)
287
- when :private_use; node << Codepoint::PrivateUse.new(token, active_opts)
288
- when :unassigned; node << Codepoint::Unassigned.new(token, active_opts)
256
+ when :newline; node << Newline.new(token, active_opts)
257
+
258
+ when :any; node << Any.new(token, active_opts)
259
+ when :assigned; node << Assigned.new(token, active_opts)
260
+
261
+ when :letter; node << Letter::Any.new(token, active_opts)
262
+ when :cased_letter; node << Letter::Cased.new(token, active_opts)
263
+ when :uppercase_letter; node << Letter::Uppercase.new(token, active_opts)
264
+ when :lowercase_letter; node << Letter::Lowercase.new(token, active_opts)
265
+ when :titlecase_letter; node << Letter::Titlecase.new(token, active_opts)
266
+ when :modifier_letter; node << Letter::Modifier.new(token, active_opts)
267
+ when :other_letter; node << Letter::Other.new(token, active_opts)
268
+
269
+ when :mark; node << Mark::Any.new(token, active_opts)
270
+ when :combining_mark; node << Mark::Combining.new(token, active_opts)
271
+ when :nonspacing_mark; node << Mark::Nonspacing.new(token, active_opts)
272
+ when :spacing_mark; node << Mark::Spacing.new(token, active_opts)
273
+ when :enclosing_mark; node << Mark::Enclosing.new(token, active_opts)
274
+
275
+ when :number; node << Number::Any.new(token, active_opts)
276
+ when :decimal_number; node << Number::Decimal.new(token, active_opts)
277
+ when :letter_number; node << Number::Letter.new(token, active_opts)
278
+ when :other_number; node << Number::Other.new(token, active_opts)
279
+
280
+ when :punctuation; node << Punctuation::Any.new(token, active_opts)
281
+ when :connector_punctuation; node << Punctuation::Connector.new(token, active_opts)
282
+ when :dash_punctuation; node << Punctuation::Dash.new(token, active_opts)
283
+ when :open_punctuation; node << Punctuation::Open.new(token, active_opts)
284
+ when :close_punctuation; node << Punctuation::Close.new(token, active_opts)
285
+ when :initial_punctuation; node << Punctuation::Initial.new(token, active_opts)
286
+ when :final_punctuation; node << Punctuation::Final.new(token, active_opts)
287
+ when :other_punctuation; node << Punctuation::Other.new(token, active_opts)
288
+
289
+ when :separator; node << Separator::Any.new(token, active_opts)
290
+ when :space_separator; node << Separator::Space.new(token, active_opts)
291
+ when :line_separator; node << Separator::Line.new(token, active_opts)
292
+ when :paragraph_separator; node << Separator::Paragraph.new(token, active_opts)
293
+
294
+ when :symbol; node << Symbol::Any.new(token, active_opts)
295
+ when :math_symbol; node << Symbol::Math.new(token, active_opts)
296
+ when :currency_symbol; node << Symbol::Currency.new(token, active_opts)
297
+ when :modifier_symbol; node << Symbol::Modifier.new(token, active_opts)
298
+ when :other_symbol; node << Symbol::Other.new(token, active_opts)
299
+
300
+ when :other; node << Codepoint::Any.new(token, active_opts)
301
+ when :control; node << Codepoint::Control.new(token, active_opts)
302
+ when :format; node << Codepoint::Format.new(token, active_opts)
303
+ when :surrogate; node << Codepoint::Surrogate.new(token, active_opts)
304
+ when :private_use; node << Codepoint::PrivateUse.new(token, active_opts)
305
+ when :unassigned; node << Codepoint::Unassigned.new(token, active_opts)
289
306
 
290
307
  when *Token::UnicodeProperty::Age
291
308
  node << Age.new(token, active_opts)
@@ -346,13 +363,20 @@ class Regexp::Parser
346
363
  node << EscapeSequence::Newline.new(token, active_opts)
347
364
  when :carriage
348
365
  node << EscapeSequence::Return.new(token, active_opts)
349
- when :space
350
- node << EscapeSequence::Space.new(token, active_opts)
351
366
  when :tab
352
367
  node << EscapeSequence::Tab.new(token, active_opts)
353
368
  when :vertical_tab
354
369
  node << EscapeSequence::VerticalTab.new(token, active_opts)
355
370
 
371
+ when :hex
372
+ node << EscapeSequence::Hex.new(token, active_opts)
373
+ when :octal
374
+ node << EscapeSequence::Octal.new(token, active_opts)
375
+ when :codepoint
376
+ node << EscapeSequence::Codepoint.new(token, active_opts)
377
+ when :codepoint_list
378
+ node << EscapeSequence::CodepointList.new(token, active_opts)
379
+
356
380
  when :control
357
381
  if token.text =~ /\A(?:\\C-\\M|\\c\\M)/
358
382
  node << EscapeSequence::MetaControl.new(token, active_opts)
@@ -447,7 +471,7 @@ class Regexp::Parser
447
471
  mode = :greedy
448
472
  end
449
473
 
450
- range = range_text.gsub(/\{|\}/, '').split(',', 2).each {|i| i.strip}
474
+ range = range_text.gsub(/\{|\}/, '').split(',', 2)
451
475
  min = range[0].empty? ? 0 : range[0]
452
476
  max = range[1] ? (range[1].empty? ? -1 : range[1]) : min
453
477
 
@@ -456,7 +480,7 @@ class Regexp::Parser
456
480
 
457
481
  def group(token)
458
482
  case token.token
459
- when :options
483
+ when :options, :options_switch
460
484
  options_group(token)
461
485
  when :close
462
486
  close_group
@@ -470,8 +494,7 @@ class Regexp::Parser
470
494
  def options_group(token)
471
495
  positive, negative = token.text.split('-', 2)
472
496
  negative ||= ''
473
- self.switching_options = !token.text.include?(':')
474
- # TODO: change this -^ to token.type == :options_switch in v1.0.0
497
+ self.switching_options = token.token.equal?(:options_switch)
475
498
 
476
499
  new_options = active_opts.dup
477
500
 
@@ -491,9 +514,7 @@ class Regexp::Parser
491
514
 
492
515
  options_stack << new_options
493
516
 
494
- exp = Group::Options.new(token, active_opts)
495
-
496
- nest(exp)
517
+ nest(Group::Options.new(token, active_opts))
497
518
  end
498
519
 
499
520
  def open_group(token)
@@ -522,6 +543,12 @@ class Regexp::Parser
522
543
  raise UnknownTokenError.new('Group type open', token)
523
544
  end
524
545
 
546
+ if exp.capturing?
547
+ exp.number = total_captured_group_count + 1
548
+ exp.number_at_level = captured_group_count_at_level + 1
549
+ count_captured_group
550
+ end
551
+
525
552
  # Push the active options to the stack again. This way we can simply pop the
526
553
  # stack for any group we close, no matter if it had its own options or not.
527
554
  options_stack << active_opts
@@ -530,38 +557,65 @@ class Regexp::Parser
530
557
  end
531
558
 
532
559
  def close_group
533
- nesting.pop
534
560
  options_stack.pop unless switching_options
535
561
  self.switching_options = false
536
-
537
- self.node = nesting.last
538
- self.node = node.last if node.last and node.last.is_a?(Alternation)
562
+ decrease_nesting
539
563
  end
540
564
 
541
565
  def open_set(token)
542
566
  token.token = :character
543
-
544
- if token.type == :subset
545
- current_set << CharacterSubSet.new(token, active_opts)
546
- else
547
- self.current_set = CharacterSet.new(token, active_opts)
548
- node << current_set
549
- end
567
+ nest(CharacterSet.new(token, active_opts))
550
568
  end
551
569
 
552
570
  def negate_set
553
- current_set.negate
571
+ node.negate
554
572
  end
555
573
 
556
- def append_set(token)
557
- current_set << token.text
574
+ def close_set
575
+ decrease_nesting(&:close)
558
576
  end
559
577
 
560
- def close_set(token)
561
- current_set.close
578
+ def range(token)
579
+ exp = CharacterSet::Range.new(token, active_opts)
580
+ scope = node.last.is_a?(CharacterSet::IntersectedSequence) ? node.last : node
581
+ exp << scope.expressions.pop
582
+ nest(exp)
583
+ end
584
+
585
+ def close_completed_character_set_range
586
+ decrease_nesting if node.is_a?(CharacterSet::Range) && node.complete?
587
+ end
588
+
589
+ def intersection(token)
590
+ sequence_operation(CharacterSet::Intersection, token)
591
+ end
592
+
593
+ def sequence_operation(klass, token)
594
+ if node.last.is_a?(klass)
595
+ self.node = node.last
596
+ elsif !node.is_a?(klass)
597
+ operator = klass.new(token, active_opts)
598
+ sequence = operator.add_sequence
599
+ sequence.expressions = node.expressions
600
+ node.expressions = []
601
+ nest(operator)
602
+ end
603
+ node.add_sequence
562
604
  end
563
605
 
564
606
  def active_opts
565
607
  options_stack.last
566
608
  end
609
+
610
+ def total_captured_group_count
611
+ captured_group_counts.values.reduce(0, :+)
612
+ end
613
+
614
+ def captured_group_count_at_level
615
+ captured_group_counts[node.level]
616
+ end
617
+
618
+ def count_captured_group
619
+ captured_group_counts[node.level] += 1
620
+ end
567
621
  end # module Regexp::Parser