regexp_parser 0.4.6 → 0.4.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 6bc36d64eb84ebef5287ca3c866c834339b6b563
4
- data.tar.gz: 59db2b321120f59697d3ce6e0c612dd745a41ffc
3
+ metadata.gz: de01aa2d195d95dd0bee1afd232f85195562a8bf
4
+ data.tar.gz: b41cb58a4e07d681da7c16473f4f03d96a792ff9
5
5
  SHA512:
6
- metadata.gz: fcf1c339c360217fbb2a1c4cedfd2eeeca199a52983fe706c7d661141a93b3793e1de0901d6c935b61dad30a78c4ad7f510bfec6abdf765f2c6e6a6edb3adefa
7
- data.tar.gz: d0fc3eb8fd70a252d60ed25f7472ad36e76d379d5660f5d9d28735b9faa7caf1e15bc69468849e91d20832e3249a582c53ce52b67c5a01e50b2f6325dcf1063e
6
+ metadata.gz: 342d6218d5553f2f2f6975f202cf650cd74c9128379348526981d829188b38836dfd88bc46b8476212d4b10aeb628479109baff075e1297c4cf69aaa4fe8ff03
7
+ data.tar.gz: 80883d05ff9bb3f5f9f296aeeb5eabde013a362c6d6c82f4eeab87438050f93fea2224f407b170b65ff55d175937c55f56d49077e0ad43b0a55832989e221544
data/ChangeLog CHANGED
@@ -1,3 +1,16 @@
1
+ Sun Oct 15 2017 Janosch Müller <janosch84@gmail.com>
2
+
3
+ * Fixed a thread safety issue (issue #45)
4
+ * Some public class methods that were only reliable for
5
+ internal use are now private instance methods (PR #46)
6
+ * Improved the usefulness of Expression#options (issue #43) -
7
+ #options and derived methods such as #i?, #m? and #x? are now
8
+ defined for all Expressions that are affected by such flags.
9
+ * Fixed scanning of whitespace following (?x) (commit 5c94bd2)
10
+ * Fixed a Parser bug where the #number attribute of traditional
11
+ numerical backreferences was not set correctly (commit 851b620)
12
+ * Bumped version to 0.4.7
13
+
1
14
  Mon Sep 18 2017 Janosch Müller <janosch84@gmail.com>
2
15
 
3
16
  * Added Parser support for hex escapes in sets (PR #36)
data/README.md CHANGED
@@ -125,9 +125,10 @@ Regexp::Scanner.scan( /(cat?([bhm]at)){3,5}/ ).map {|token| token[2]}
125
125
 
126
126
  * If the input is a ruby **Regexp** object, the scanner calls #source on it to
127
127
  get its string representation. #source does not include the options of
128
- the expression (m, i, and x) To include the options in the scan, #to_s
129
- should be called on the **Regexp** before passing it to the scanner or any
130
- of the other modules.
128
+ the expression (m, i, and x). To include the options in the scan, #to_s
129
+ should be called on the **Regexp** before passing it to the scanner or the
130
+ lexer. For the parser, however, this is not necessary. It automatically
131
+ exposes the options of a passed **Regexp** in the returned root expression.
131
132
 
132
133
  * To keep the scanner simple(r) and fairly reusable for other purposes, it
133
134
  does not perform lexical analysis on the tokens, sticking to the task
@@ -8,7 +8,7 @@ module Regexp::Expression
8
8
  attr_accessor :quantifier
9
9
  attr_accessor :options
10
10
 
11
- def initialize(token)
11
+ def initialize(token, options = {})
12
12
  @type = token.type
13
13
  @token = token.token
14
14
  @text = token.text
@@ -17,7 +17,7 @@ module Regexp::Expression
17
17
  @set_level = token.set_level
18
18
  @conditional_level = token.conditional_level
19
19
  @quantifier = nil
20
- @options = nil
20
+ @options = options
21
21
  end
22
22
 
23
23
  def clone
@@ -95,35 +95,35 @@ module Regexp::Expression
95
95
  end
96
96
 
97
97
  def multiline?
98
- (@options and @options[:m]) ? true : false
98
+ @options[:m] == true
99
99
  end
100
100
  alias :m? :multiline?
101
101
 
102
102
  def case_insensitive?
103
- (@options and @options[:i]) ? true : false
103
+ @options[:i] == true
104
104
  end
105
105
  alias :i? :case_insensitive?
106
106
  alias :ignore_case? :case_insensitive?
107
107
 
108
108
  def free_spacing?
109
- (@options and @options[:x]) ? true : false
109
+ @options[:x] == true
110
110
  end
111
111
  alias :x? :free_spacing?
112
112
  alias :extended? :free_spacing?
113
113
 
114
114
  if RUBY_VERSION >= '2.0'
115
115
  def default_classes?
116
- (@options and @options[:d]) ? true : false
116
+ @options[:d] == true
117
117
  end
118
118
  alias :d? :default_classes?
119
119
 
120
120
  def ascii_classes?
121
- (@options and @options[:a]) ? true : false
121
+ @options[:a] == true
122
122
  end
123
123
  alias :a? :ascii_classes?
124
124
 
125
125
  def unicode_classes?
126
- (@options and @options[:u]) ? true : false
126
+ @options[:u] == true
127
127
  end
128
128
  alias :u? :unicode_classes?
129
129
  end
@@ -6,18 +6,18 @@ module Regexp::Expression
6
6
  class Name < Backreference::Base
7
7
  attr_reader :name
8
8
 
9
- def initialize(token)
9
+ def initialize(token, options = {})
10
10
  @name = token.text[3..-2]
11
- super(token)
11
+ super
12
12
  end
13
13
  end
14
14
 
15
15
  class Number < Backreference::Base
16
16
  attr_reader :number
17
17
 
18
- def initialize(token)
19
- @number = token.text[3..-2]
20
- super(token)
18
+ def initialize(token, options = {})
19
+ @number = token.text[token.token.equal?(:number) ? 1..-1 : 3..-2]
20
+ super
21
21
  end
22
22
  end
23
23
 
@@ -29,9 +29,9 @@ module Regexp::Expression
29
29
  class NameCall < Backreference::Base
30
30
  attr_reader :name
31
31
 
32
- def initialize(token)
32
+ def initialize(token, options = {})
33
33
  @name = token.text[3..-2]
34
- super(token)
34
+ super
35
35
  end
36
36
  end
37
37
 
@@ -11,8 +11,8 @@ module Regexp::Expression
11
11
  class Branch < Regexp::Expression::Sequence; end
12
12
 
13
13
  class Expression < Regexp::Expression::Subexpression
14
- def initialize(token)
15
- super(token)
14
+ def initialize(token, options = {})
15
+ super
16
16
 
17
17
  @condition = nil
18
18
  @branches = []
@@ -36,9 +36,9 @@ module Regexp::Expression
36
36
  class Named < Group::Capture
37
37
  attr_reader :name
38
38
 
39
- def initialize(token)
39
+ def initialize(token, options = {})
40
40
  @name = token.text[3..-2]
41
- super(token)
41
+ super
42
42
  end
43
43
 
44
44
  def clone
@@ -1,26 +1,12 @@
1
1
  module Regexp::Expression
2
2
 
3
3
  class Root < Regexp::Expression::Subexpression
4
- def initialize
5
- super Regexp::Token.new(:expression, :root, '', 0)
4
+ def initialize(options = {})
5
+ super(Regexp::Token.new(:expression, :root, '', 0), options)
6
6
  end
7
7
 
8
- def multiline?
9
- @expressions[0].m?
10
- end
11
- alias :m? :multiline?
12
-
13
- def case_insensitive?
14
- @expressions[0].i?
15
- end
16
- alias :i? :case_insensitive?
17
- alias :ignore_case? :case_insensitive?
18
-
19
- def free_spacing?
20
- @expressions[0].x?
21
- end
22
- alias :x? :free_spacing?
23
- alias :extended? :free_spacing?
8
+ alias ignore_case? case_insensitive?
9
+ alias extended? free_spacing?
24
10
  end
25
11
 
26
12
  end
@@ -3,7 +3,7 @@ module Regexp::Expression
3
3
  class CharacterSet < Regexp::Expression::Base
4
4
  attr_accessor :members
5
5
 
6
- def initialize(token)
6
+ def initialize(token, options = {})
7
7
  @members = []
8
8
  @negative = false
9
9
  @closed = false
@@ -3,8 +3,8 @@ module Regexp::Expression
3
3
  class Subexpression < Regexp::Expression::Base
4
4
  attr_accessor :expressions
5
5
 
6
- def initialize(token)
7
- super(token)
6
+ def initialize(token, options = {})
7
+ super
8
8
 
9
9
  @expressions = []
10
10
  end
@@ -2,7 +2,7 @@
2
2
  # collects emitted tokens into an array, calculates their nesting depth,
3
3
  # normalizes tokens for the parser, and checks if they are implemented by the
4
4
  # given syntax flavor.
5
- module Regexp::Lexer
5
+ class Regexp::Lexer
6
6
 
7
7
  OPENING_TOKENS = [:capture, :options, :passive, :atomic, :named, :absence,
8
8
  :lookahead, :nlookahead, :lookbehind, :nlookbehind
@@ -11,6 +11,10 @@ module Regexp::Lexer
11
11
  CLOSING_TOKENS = [:close].freeze
12
12
 
13
13
  def self.lex(input, syntax = "ruby/#{RUBY_VERSION}", &block)
14
+ new.lex(input, syntax, &block)
15
+ end
16
+
17
+ def lex(input, syntax = "ruby/#{RUBY_VERSION}", &block)
14
18
  syntax = Regexp::Syntax.new(syntax)
15
19
 
16
20
  @tokens = []
@@ -57,7 +61,7 @@ module Regexp::Lexer
57
61
 
58
62
  protected
59
63
 
60
- def self.ascend(type, token)
64
+ def ascend(type, token)
61
65
  if type == :group or type == :assertion
62
66
  @nesting -= 1 if CLOSING_TOKENS.include?(token)
63
67
  end
@@ -71,7 +75,7 @@ module Regexp::Lexer
71
75
  end
72
76
  end
73
77
 
74
- def self.descend(type, token)
78
+ def descend(type, token)
75
79
  if type == :group or type == :assertion
76
80
  @nesting += 1 if OPENING_TOKENS.include?(token)
77
81
  end
@@ -87,7 +91,7 @@ module Regexp::Lexer
87
91
 
88
92
  # called by scan to break a literal run that is longer than one character
89
93
  # into two separate tokens when it is followed by a quantifier
90
- def self.break_literal(token)
94
+ def break_literal(token)
91
95
  text = token.text
92
96
  if text.scan(/./mu).length > 1
93
97
  lead = text.sub(/.\z/mu, "")
@@ -113,7 +117,7 @@ module Regexp::Lexer
113
117
 
114
118
  # called by scan to merge two consecutive literals. this happens when tokens
115
119
  # get normalized (as in the case of posix/bre) and end up becoming literals.
116
- def self.merge_literal(current)
120
+ def merge_literal(current)
117
121
  last = @tokens.pop
118
122
 
119
123
  Regexp::Token.new(
@@ -128,7 +132,7 @@ module Regexp::Lexer
128
132
  )
129
133
  end
130
134
 
131
- def self.merge_condition(current)
135
+ def merge_condition(current)
132
136
  last = @tokens.pop
133
137
  Regexp::Token.new(:conditional, :condition, last.text + current.text,
134
138
  last.ts, current.te, @nesting, @set_nesting, @conditional_nesting)
@@ -1,6 +1,6 @@
1
1
  require 'regexp_parser/expression'
2
2
 
3
- module Regexp::Parser
3
+ class Regexp::Parser
4
4
  include Regexp::Expression
5
5
  include Regexp::Syntax
6
6
 
@@ -19,8 +19,14 @@ module Regexp::Parser
19
19
  end
20
20
 
21
21
  def self.parse(input, syntax = "ruby/#{RUBY_VERSION}", &block)
22
- @nesting = [@root = @node = Root.new]
22
+ new.parse(input, syntax, &block)
23
+ end
24
+
25
+ def parse(input, syntax = "ruby/#{RUBY_VERSION}", &block)
26
+ @nesting = [@root = @node = Root.new(options_from_input(input))]
23
27
 
28
+ @options_stack = [@root.options]
29
+ @switching_options = false
24
30
  @conditional_nesting = []
25
31
 
26
32
  Regexp::Lexer.scan(input, syntax) do |token|
@@ -34,21 +40,33 @@ module Regexp::Parser
34
40
  end
35
41
  end
36
42
 
37
- def self.nest(exp)
43
+ private
44
+
45
+ def options_from_input(input)
46
+ return {} unless input.is_a?(::Regexp)
47
+
48
+ options = {}
49
+ options[:i] = true if input.options & ::Regexp::IGNORECASE != 0
50
+ options[:m] = true if input.options & ::Regexp::MULTILINE != 0
51
+ options[:x] = true if input.options & ::Regexp::EXTENDED != 0
52
+ options
53
+ end
54
+
55
+ def nest(exp)
38
56
  @nesting.push exp
39
57
 
40
58
  @node << exp
41
59
  @node = exp
42
60
  end
43
61
 
44
- def self.nest_conditional(exp)
62
+ def nest_conditional(exp)
45
63
  @conditional_nesting.push exp
46
64
 
47
65
  @node << exp
48
66
  @node = exp
49
67
  end
50
68
 
51
- def self.parse_token(token)
69
+ def parse_token(token)
52
70
  case token.type
53
71
  when :meta; meta(token)
54
72
  when :quantifier; quantifier(token)
@@ -66,7 +84,7 @@ module Regexp::Parser
66
84
  property(token)
67
85
 
68
86
  when :literal
69
- @node << Literal.new(token)
87
+ @node << Literal.new(token, active_opts)
70
88
  when :free_space
71
89
  free_space(token)
72
90
 
@@ -75,7 +93,7 @@ module Regexp::Parser
75
93
  end
76
94
  end
77
95
 
78
- def self.set(token)
96
+ def set(token)
79
97
  case token.token
80
98
  when :open
81
99
  open_set(token)
@@ -96,14 +114,14 @@ module Regexp::Parser
96
114
  end
97
115
  end
98
116
 
99
- def self.meta(token)
117
+ def meta(token)
100
118
  case token.token
101
119
  when :dot
102
- @node << CharacterType::Any.new(token)
120
+ @node << CharacterType::Any.new(token, active_opts)
103
121
  when :alternation
104
122
  unless @node.token == :alternation
105
123
  unless @node.last.is_a?(Alternation)
106
- alt = Alternation.new(token)
124
+ alt = Alternation.new(token, active_opts)
107
125
  seq = Alternative.new(alt.level, alt.set_level, alt.conditional_level)
108
126
 
109
127
  while @node.expressions.last
@@ -126,62 +144,62 @@ module Regexp::Parser
126
144
  end
127
145
  end
128
146
 
129
- def self.backref(token)
147
+ def backref(token)
130
148
  case token.token
131
149
  when :name_ref
132
- @node << Backreference::Name.new(token)
150
+ @node << Backreference::Name.new(token, active_opts)
133
151
  when :name_nest_ref
134
- @node << Backreference::NameNestLevel.new(token)
152
+ @node << Backreference::NameNestLevel.new(token, active_opts)
135
153
  when :name_call
136
- @node << Backreference::NameCall.new(token)
154
+ @node << Backreference::NameCall.new(token, active_opts)
137
155
  when :number, :number_ref
138
- @node << Backreference::Number.new(token)
156
+ @node << Backreference::Number.new(token, active_opts)
139
157
  when :number_rel_ref
140
- @node << Backreference::NumberRelative.new(token)
158
+ @node << Backreference::NumberRelative.new(token, active_opts)
141
159
  when :number_nest_ref
142
- @node << Backreference::NumberNestLevel.new(token)
160
+ @node << Backreference::NumberNestLevel.new(token, active_opts)
143
161
  when :number_call
144
- @node << Backreference::NumberCall.new(token)
162
+ @node << Backreference::NumberCall.new(token, active_opts)
145
163
  when :number_rel_call
146
- @node << Backreference::NumberCallRelative.new(token)
164
+ @node << Backreference::NumberCallRelative.new(token, active_opts)
147
165
  else
148
166
  raise UnknownTokenError.new('Backreference', token)
149
167
  end
150
168
  end
151
169
 
152
- def self.type(token)
170
+ def type(token)
153
171
  case token.token
154
172
  when :digit
155
- @node << CharacterType::Digit.new(token)
173
+ @node << CharacterType::Digit.new(token, active_opts)
156
174
  when :nondigit
157
- @node << CharacterType::NonDigit.new(token)
175
+ @node << CharacterType::NonDigit.new(token, active_opts)
158
176
  when :hex
159
- @node << CharacterType::Hex.new(token)
177
+ @node << CharacterType::Hex.new(token, active_opts)
160
178
  when :nonhex
161
- @node << CharacterType::NonHex.new(token)
179
+ @node << CharacterType::NonHex.new(token, active_opts)
162
180
  when :space
163
- @node << CharacterType::Space.new(token)
181
+ @node << CharacterType::Space.new(token, active_opts)
164
182
  when :nonspace
165
- @node << CharacterType::NonSpace.new(token)
183
+ @node << CharacterType::NonSpace.new(token, active_opts)
166
184
  when :word
167
- @node << CharacterType::Word.new(token)
185
+ @node << CharacterType::Word.new(token, active_opts)
168
186
  when :nonword
169
- @node << CharacterType::NonWord.new(token)
187
+ @node << CharacterType::NonWord.new(token, active_opts)
170
188
  when :linebreak
171
- @node << CharacterType::Linebreak.new(token)
189
+ @node << CharacterType::Linebreak.new(token, active_opts)
172
190
  when :xgrapheme
173
- @node << CharacterType::ExtendedGrapheme.new(token)
191
+ @node << CharacterType::ExtendedGrapheme.new(token, active_opts)
174
192
  else
175
193
  raise UnknownTokenError.new('CharacterType', token)
176
194
  end
177
195
  end
178
196
 
179
- def self.conditional(token)
197
+ def conditional(token)
180
198
  case token.token
181
199
  when :open
182
- nest_conditional(Conditional::Expression.new(token))
200
+ nest_conditional(Conditional::Expression.new(token, active_opts))
183
201
  when :condition
184
- @conditional_nesting.last.condition(Conditional::Condition.new(token))
202
+ @conditional_nesting.last.condition(Conditional::Condition.new(token, active_opts))
185
203
  @conditional_nesting.last.branch
186
204
  when :separator
187
205
  @conditional_nesting.last.branch
@@ -200,175 +218,174 @@ module Regexp::Parser
200
218
  end
201
219
  end
202
220
 
203
- def self.property(token)
204
- include Regexp::Expression::UnicodeProperty
221
+ include Regexp::Expression::UnicodeProperty
205
222
 
223
+ def property(token)
206
224
  case token.token
207
- when :alnum; @node << Alnum.new(token)
208
- when :alpha; @node << Alpha.new(token)
209
- when :any; @node << Any.new(token)
210
- when :ascii; @node << Ascii.new(token)
211
- when :blank; @node << Blank.new(token)
212
- when :cntrl; @node << Cntrl.new(token)
213
- when :digit; @node << Digit.new(token)
214
- when :graph; @node << Graph.new(token)
215
- when :lower; @node << Lower.new(token)
216
- when :print; @node << Print.new(token)
217
- when :punct; @node << Punct.new(token)
218
- when :space; @node << Space.new(token)
219
- when :upper; @node << Upper.new(token)
220
- when :word; @node << Word.new(token)
221
- when :xdigit; @node << Xdigit.new(token)
222
- when :newline; @node << Newline.new(token)
223
-
224
- when :letter_any; @node << Letter::Any.new(token)
225
- when :letter_uppercase; @node << Letter::Uppercase.new(token)
226
- when :letter_lowercase; @node << Letter::Lowercase.new(token)
227
- when :letter_titlecase; @node << Letter::Titlecase.new(token)
228
- when :letter_modifier; @node << Letter::Modifier.new(token)
229
- when :letter_other; @node << Letter::Other.new(token)
230
-
231
- when :mark_any; @node << Mark::Any.new(token)
232
- when :mark_nonspacing; @node << Mark::Nonspacing.new(token)
233
- when :mark_spacing; @node << Mark::Spacing.new(token)
234
- when :mark_enclosing; @node << Mark::Enclosing.new(token)
235
-
236
- when :number_any; @node << Number::Any.new(token)
237
- when :number_decimal; @node << Number::Decimal.new(token)
238
- when :number_letter; @node << Number::Letter.new(token)
239
- when :number_other; @node << Number::Other.new(token)
240
-
241
- when :punct_any; @node << Punctuation::Any.new(token)
242
- when :punct_connector; @node << Punctuation::Connector.new(token)
243
- when :punct_dash; @node << Punctuation::Dash.new(token)
244
- when :punct_open; @node << Punctuation::Open.new(token)
245
- when :punct_close; @node << Punctuation::Close.new(token)
246
- when :punct_initial; @node << Punctuation::Initial.new(token)
247
- when :punct_final; @node << Punctuation::Final.new(token)
248
- when :punct_other; @node << Punctuation::Other.new(token)
249
-
250
- when :separator_any; @node << Separator::Any.new(token)
251
- when :separator_space; @node << Separator::Space.new(token)
252
- when :separator_line; @node << Separator::Line.new(token)
253
- when :separator_para; @node << Separator::Paragraph.new(token)
254
-
255
- when :symbol_any; @node << Symbol::Any.new(token)
256
- when :symbol_math; @node << Symbol::Math.new(token)
257
- when :symbol_currency; @node << Symbol::Currency.new(token)
258
- when :symbol_modifier; @node << Symbol::Modifier.new(token)
259
- when :symbol_other; @node << Symbol::Other.new(token)
260
-
261
- when :other; @node << Codepoint::Any.new(token)
262
- when :control; @node << Codepoint::Control.new(token)
263
- when :format; @node << Codepoint::Format.new(token)
264
- when :surrogate; @node << Codepoint::Surrogate.new(token)
265
- when :private_use; @node << Codepoint::PrivateUse.new(token)
266
- when :unassigned; @node << Codepoint::Unassigned.new(token)
225
+ when :alnum; @node << Alnum.new(token, active_opts)
226
+ when :alpha; @node << Alpha.new(token, active_opts)
227
+ when :any; @node << Any.new(token, active_opts)
228
+ when :ascii; @node << Ascii.new(token, active_opts)
229
+ when :blank; @node << Blank.new(token, active_opts)
230
+ when :cntrl; @node << Cntrl.new(token, active_opts)
231
+ when :digit; @node << Digit.new(token, active_opts)
232
+ when :graph; @node << Graph.new(token, active_opts)
233
+ when :lower; @node << Lower.new(token, active_opts)
234
+ when :print; @node << Print.new(token, active_opts)
235
+ when :punct; @node << Punct.new(token, active_opts)
236
+ when :space; @node << Space.new(token, active_opts)
237
+ when :upper; @node << Upper.new(token, active_opts)
238
+ when :word; @node << Word.new(token, active_opts)
239
+ when :xdigit; @node << Xdigit.new(token, active_opts)
240
+ when :newline; @node << Newline.new(token, active_opts)
241
+
242
+ when :letter_any; @node << Letter::Any.new(token, active_opts)
243
+ when :letter_uppercase; @node << Letter::Uppercase.new(token, active_opts)
244
+ when :letter_lowercase; @node << Letter::Lowercase.new(token, active_opts)
245
+ when :letter_titlecase; @node << Letter::Titlecase.new(token, active_opts)
246
+ when :letter_modifier; @node << Letter::Modifier.new(token, active_opts)
247
+ when :letter_other; @node << Letter::Other.new(token, active_opts)
248
+
249
+ when :mark_any; @node << Mark::Any.new(token, active_opts)
250
+ when :mark_nonspacing; @node << Mark::Nonspacing.new(token, active_opts)
251
+ when :mark_spacing; @node << Mark::Spacing.new(token, active_opts)
252
+ when :mark_enclosing; @node << Mark::Enclosing.new(token, active_opts)
253
+
254
+ when :number_any; @node << Number::Any.new(token, active_opts)
255
+ when :number_decimal; @node << Number::Decimal.new(token, active_opts)
256
+ when :number_letter; @node << Number::Letter.new(token, active_opts)
257
+ when :number_other; @node << Number::Other.new(token, active_opts)
258
+
259
+ when :punct_any; @node << Punctuation::Any.new(token, active_opts)
260
+ when :punct_connector; @node << Punctuation::Connector.new(token, active_opts)
261
+ when :punct_dash; @node << Punctuation::Dash.new(token, active_opts)
262
+ when :punct_open; @node << Punctuation::Open.new(token, active_opts)
263
+ when :punct_close; @node << Punctuation::Close.new(token, active_opts)
264
+ when :punct_initial; @node << Punctuation::Initial.new(token, active_opts)
265
+ when :punct_final; @node << Punctuation::Final.new(token, active_opts)
266
+ when :punct_other; @node << Punctuation::Other.new(token, active_opts)
267
+
268
+ when :separator_any; @node << Separator::Any.new(token, active_opts)
269
+ when :separator_space; @node << Separator::Space.new(token, active_opts)
270
+ when :separator_line; @node << Separator::Line.new(token, active_opts)
271
+ when :separator_para; @node << Separator::Paragraph.new(token, active_opts)
272
+
273
+ when :symbol_any; @node << Symbol::Any.new(token, active_opts)
274
+ when :symbol_math; @node << Symbol::Math.new(token, active_opts)
275
+ when :symbol_currency; @node << Symbol::Currency.new(token, active_opts)
276
+ when :symbol_modifier; @node << Symbol::Modifier.new(token, active_opts)
277
+ when :symbol_other; @node << Symbol::Other.new(token, active_opts)
278
+
279
+ when :other; @node << Codepoint::Any.new(token, active_opts)
280
+ when :control; @node << Codepoint::Control.new(token, active_opts)
281
+ when :format; @node << Codepoint::Format.new(token, active_opts)
282
+ when :surrogate; @node << Codepoint::Surrogate.new(token, active_opts)
283
+ when :private_use; @node << Codepoint::PrivateUse.new(token, active_opts)
284
+ when :unassigned; @node << Codepoint::Unassigned.new(token, active_opts)
267
285
 
268
286
  when *Token::UnicodeProperty::Age
269
- @node << Age.new(token)
287
+ @node << Age.new(token, active_opts)
270
288
 
271
289
  when *Token::UnicodeProperty::Derived
272
- @node << Derived.new(token)
290
+ @node << Derived.new(token, active_opts)
273
291
 
274
292
  when *Regexp::Syntax::Token::UnicodeProperty::Script
275
- @node << Script.new(token)
293
+ @node << Script.new(token, active_opts)
276
294
 
277
295
  when *Regexp::Syntax::Token::UnicodeProperty::UnicodeBlock
278
- @node << Block.new(token)
296
+ @node << Block.new(token, active_opts)
279
297
 
280
298
  else
281
299
  raise UnknownTokenError.new('UnicodeProperty', token)
282
300
  end
283
301
  end
284
302
 
285
- def self.anchor(token)
303
+ def anchor(token)
286
304
  case token.token
287
305
  when :bol
288
- @node << Anchor::BeginningOfLine.new(token)
306
+ @node << Anchor::BeginningOfLine.new(token, active_opts)
289
307
  when :eol
290
- @node << Anchor::EndOfLine.new(token)
308
+ @node << Anchor::EndOfLine.new(token, active_opts)
291
309
  when :bos
292
- @node << Anchor::BOS.new(token)
310
+ @node << Anchor::BOS.new(token, active_opts)
293
311
  when :eos
294
- @node << Anchor::EOS.new(token)
312
+ @node << Anchor::EOS.new(token, active_opts)
295
313
  when :eos_ob_eol
296
- @node << Anchor::EOSobEOL.new(token)
314
+ @node << Anchor::EOSobEOL.new(token, active_opts)
297
315
  when :word_boundary
298
- @node << Anchor::WordBoundary.new(token)
316
+ @node << Anchor::WordBoundary.new(token, active_opts)
299
317
  when :nonword_boundary
300
- @node << Anchor::NonWordBoundary.new(token)
318
+ @node << Anchor::NonWordBoundary.new(token, active_opts)
301
319
  when :match_start
302
- @node << Anchor::MatchStart.new(token)
320
+ @node << Anchor::MatchStart.new(token, active_opts)
303
321
  else
304
322
  raise UnknownTokenError.new('Anchor', token)
305
323
  end
306
324
  end
307
325
 
308
- def self.escape(token)
326
+ def escape(token)
309
327
  case token.token
310
328
 
311
329
  when :backspace
312
- @node << EscapeSequence::Backspace.new(token)
330
+ @node << EscapeSequence::Backspace.new(token, active_opts)
313
331
 
314
332
  when :escape
315
- @node << EscapeSequence::AsciiEscape.new(token)
333
+ @node << EscapeSequence::AsciiEscape.new(token, active_opts)
316
334
  when :bell
317
- @node << EscapeSequence::Bell.new(token)
335
+ @node << EscapeSequence::Bell.new(token, active_opts)
318
336
  when :form_feed
319
- @node << EscapeSequence::FormFeed.new(token)
337
+ @node << EscapeSequence::FormFeed.new(token, active_opts)
320
338
  when :newline
321
- @node << EscapeSequence::Newline.new(token)
339
+ @node << EscapeSequence::Newline.new(token, active_opts)
322
340
  when :carriage
323
- @node << EscapeSequence::Return.new(token)
341
+ @node << EscapeSequence::Return.new(token, active_opts)
324
342
  when :space
325
- @node << EscapeSequence::Space.new(token)
343
+ @node << EscapeSequence::Space.new(token, active_opts)
326
344
  when :tab
327
- @node << EscapeSequence::Tab.new(token)
345
+ @node << EscapeSequence::Tab.new(token, active_opts)
328
346
  when :vertical_tab
329
- @node << EscapeSequence::VerticalTab.new(token)
347
+ @node << EscapeSequence::VerticalTab.new(token, active_opts)
330
348
 
331
349
  when :control
332
350
  if token.text =~ /\A(?:\\C-\\M|\\c\\M)/
333
- @node << EscapeSequence::MetaControl.new(token)
351
+ @node << EscapeSequence::MetaControl.new(token, active_opts)
334
352
  else
335
- @node << EscapeSequence::Control.new(token)
353
+ @node << EscapeSequence::Control.new(token, active_opts)
336
354
  end
337
355
 
338
356
  when :meta_sequence
339
357
  if token.text =~ /\A\\M-\\[Cc]/
340
- @node << EscapeSequence::MetaControl.new(token)
358
+ @node << EscapeSequence::MetaControl.new(token, active_opts)
341
359
  else
342
- @node << EscapeSequence::Meta.new(token)
360
+ @node << EscapeSequence::Meta.new(token, active_opts)
343
361
  end
344
362
 
345
363
  else
346
364
  # treating everything else as a literal
347
- @node << EscapeSequence::Literal.new(token)
365
+ @node << EscapeSequence::Literal.new(token, active_opts)
348
366
  end
349
367
  end
350
368
 
351
-
352
- def self.keep(token)
353
- @node << Keep::Mark.new(token)
369
+ def keep(token)
370
+ @node << Keep::Mark.new(token, active_opts)
354
371
  end
355
372
 
356
- def self.free_space(token)
373
+ def free_space(token)
357
374
  case token.token
358
375
  when :comment
359
- @node << Comment.new(token)
376
+ @node << Comment.new(token, active_opts)
360
377
  when :whitespace
361
378
  if @node.last and @node.last.is_a?(WhiteSpace)
362
- @node.last.merge(WhiteSpace.new(token))
379
+ @node.last.merge(WhiteSpace.new(token, active_opts))
363
380
  else
364
- @node << WhiteSpace.new(token)
381
+ @node << WhiteSpace.new(token, active_opts)
365
382
  end
366
383
  else
367
384
  raise UnknownTokenError.new('FreeSpace', token)
368
385
  end
369
386
  end
370
387
 
371
- def self.quantifier(token)
388
+ def quantifier(token)
372
389
  offset = -1
373
390
  target_node = @node.expressions[offset]
374
391
  while target_node and target_node.is_a?(FreeSpace)
@@ -378,15 +395,6 @@ module Regexp::Parser
378
395
  raise ArgumentError.new("No valid target found for '#{token.text}' "+
379
396
  "quantifier") unless target_node
380
397
 
381
- unless target_node
382
- if token.token == :zero_or_one
383
- raise "Quantifier given without a target, or the syntax of the group " +
384
- "or its options is incorrect"
385
- else
386
- raise "Quantifier `#{token.text}' given without a target"
387
- end
388
- end
389
-
390
398
  case token.token
391
399
  when :zero_or_one
392
400
  target_node.quantify(:zero_or_one, token.text, 0, 1, :greedy)
@@ -417,7 +425,7 @@ module Regexp::Parser
417
425
  end
418
426
  end
419
427
 
420
- def self.interval(target_node, token)
428
+ def interval(target_node, token)
421
429
  text = token.text
422
430
  mchr = text[text.length-1].chr =~ /[?+]/ ? text[text.length-1].chr : nil
423
431
  case mchr
@@ -439,91 +447,113 @@ module Regexp::Parser
439
447
  target_node.quantify(:interval, text, min.to_i, max.to_i, mode)
440
448
  end
441
449
 
442
- def self.group(token)
450
+ def group(token)
443
451
  case token.token
444
452
  when :options
445
- options(token)
453
+ options_group(token)
446
454
  when :close
447
455
  close_group
448
456
  when :comment
449
- @node << Group::Comment.new(token)
457
+ @node << Group::Comment.new(token, active_opts)
450
458
  else
451
459
  open_group(token)
452
460
  end
453
461
  end
454
462
 
455
- def self.options(token)
456
- opt = token.text.split('-', 2)
463
+ def options_group(token)
464
+ positive, negative = token.text.split('-', 2)
465
+ negative ||= ''
466
+ @switching_options = !token.text.include?(':')
467
+ # TODO: change this -^ to token.type == :options_switch in v1.0.0
468
+
469
+ new_options = active_opts.dup
457
470
 
458
- exp = Group::Options.new(token)
459
- exp.options = {
460
- :m => opt[0].include?('m') ? true : false,
461
- :i => opt[0].include?('i') ? true : false,
462
- :x => opt[0].include?('x') ? true : false,
463
- :d => opt[0].include?('d') ? true : false,
464
- :a => opt[0].include?('a') ? true : false,
465
- :u => opt[0].include?('u') ? true : false
466
- }
471
+ # Negative options have precedence. E.g. /(?i-i)a/ is case-sensitive.
472
+ %w[i m x].each do |flag|
473
+ new_options[flag.to_sym] = true if positive.include?(flag)
474
+ new_options.delete(flag.to_sym) if negative.include?(flag)
475
+ end
476
+
477
+ # Any encoding flag overrides all previous encoding flags. If there are
478
+ # multiple encoding flags in an options string, the last one wins.
479
+ # E.g. /(?dau)\w/ matches UTF8 chars but /(?dua)\w/ only ASCII chars.
480
+ if (flag = positive.reverse[/[adu]/])
481
+ %w[a d u].each { |key| new_options.delete(key.to_sym) }
482
+ new_options[flag.to_sym] = true
483
+ end
484
+
485
+ @options_stack << new_options
486
+
487
+ exp = Group::Options.new(token, active_opts)
467
488
 
468
489
  nest(exp)
469
490
  end
470
491
 
471
- def self.open_group(token)
492
+ def open_group(token)
472
493
  case token.token
473
494
  when :passive
474
- exp = Group::Passive.new(token)
495
+ exp = Group::Passive.new(token, active_opts)
475
496
  when :atomic
476
- exp = Group::Atomic.new(token)
497
+ exp = Group::Atomic.new(token, active_opts)
477
498
  when :named
478
- exp = Group::Named.new(token)
499
+ exp = Group::Named.new(token, active_opts)
479
500
  when :capture
480
- exp = Group::Capture.new(token)
501
+ exp = Group::Capture.new(token, active_opts)
481
502
  when :absence
482
- exp = Group::Absence.new(token)
503
+ exp = Group::Absence.new(token, active_opts)
483
504
 
484
505
  when :lookahead
485
- exp = Assertion::Lookahead.new(token)
506
+ exp = Assertion::Lookahead.new(token, active_opts)
486
507
  when :nlookahead
487
- exp = Assertion::NegativeLookahead.new(token)
508
+ exp = Assertion::NegativeLookahead.new(token, active_opts)
488
509
  when :lookbehind
489
- exp = Assertion::Lookbehind.new(token)
510
+ exp = Assertion::Lookbehind.new(token, active_opts)
490
511
  when :nlookbehind
491
- exp = Assertion::NegativeLookbehind.new(token)
512
+ exp = Assertion::NegativeLookbehind.new(token, active_opts)
492
513
 
493
514
  else
494
515
  raise UnknownTokenError.new('Group type open', token)
495
516
  end
496
517
 
518
+ # Push the active options to the stack again. This way we can simply pop the
519
+ # stack for any group we close, no matter if it had its own options or not.
520
+ @options_stack << active_opts
521
+
497
522
  nest(exp)
498
523
  end
499
524
 
500
- def self.close_group
525
+ def close_group
501
526
  @nesting.pop
527
+ @options_stack.pop unless @switching_options
528
+ @switching_options = false
502
529
 
503
530
  @node = @nesting.last
504
531
  @node = @node.last if @node.last and @node.last.is_a?(Alternation)
505
532
  end
506
533
 
507
- def self.open_set(token)
534
+ def open_set(token)
508
535
  token.token = :character
509
536
 
510
537
  if token.type == :subset
511
- @set << CharacterSubSet.new(token)
538
+ @set << CharacterSubSet.new(token, active_opts)
512
539
  else
513
- @node << (@set = CharacterSet.new(token))
540
+ @node << (@set = CharacterSet.new(token, active_opts))
514
541
  end
515
542
  end
516
543
 
517
- def self.negate_set
544
+ def negate_set
518
545
  @set.negate
519
546
  end
520
547
 
521
- def self.append_set(token)
548
+ def append_set(token)
522
549
  @set << token.text
523
550
  end
524
551
 
525
- def self.close_set(token)
552
+ def close_set(token)
526
553
  @set.close
527
554
  end
528
555
 
556
+ def active_opts
557
+ @options_stack.last
558
+ end
529
559
  end # class Regexp::Parser