regexp_parser 1.7.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +80 -1
  3. data/README.md +24 -12
  4. data/lib/regexp_parser/expression.rb +10 -19
  5. data/lib/regexp_parser/expression/classes/group.rb +17 -2
  6. data/lib/regexp_parser/expression/classes/root.rb +4 -16
  7. data/lib/regexp_parser/expression/quantifier.rb +9 -0
  8. data/lib/regexp_parser/expression/sequence.rb +0 -10
  9. data/lib/regexp_parser/lexer.rb +6 -6
  10. data/lib/regexp_parser/parser.rb +45 -12
  11. data/lib/regexp_parser/scanner.rb +1305 -1193
  12. data/lib/regexp_parser/scanner/char_type.rl +11 -11
  13. data/lib/regexp_parser/scanner/property.rl +2 -2
  14. data/lib/regexp_parser/scanner/scanner.rl +194 -171
  15. data/lib/regexp_parser/syntax/version_lookup.rb +2 -2
  16. data/lib/regexp_parser/version.rb +1 -1
  17. data/regexp_parser.gemspec +1 -1
  18. data/spec/expression/base_spec.rb +10 -0
  19. data/spec/expression/to_s_spec.rb +16 -0
  20. data/spec/lexer/delimiters_spec.rb +68 -0
  21. data/spec/lexer/literals_spec.rb +24 -49
  22. data/spec/parser/escapes_spec.rb +1 -1
  23. data/spec/parser/options_spec.rb +28 -0
  24. data/spec/parser/quantifiers_spec.rb +16 -0
  25. data/spec/parser/set/ranges_spec.rb +3 -3
  26. data/spec/scanner/delimiters_spec.rb +52 -0
  27. data/spec/scanner/errors_spec.rb +0 -1
  28. data/spec/scanner/escapes_spec.rb +10 -0
  29. data/spec/scanner/free_space_spec.rb +32 -0
  30. data/spec/scanner/literals_spec.rb +28 -38
  31. data/spec/scanner/options_spec.rb +36 -0
  32. data/spec/scanner/quantifiers_spec.rb +18 -13
  33. data/spec/scanner/sets_spec.rb +8 -2
  34. metadata +65 -61
  35. data/spec/expression/root_spec.rb +0 -9
  36. data/spec/expression/sequence_spec.rb +0 -9
@@ -10,17 +10,17 @@
10
10
  # --------------------------------------------------------------------------
11
11
  char_type := |*
12
12
  char_type_char {
13
- case text = text(data, ts, te, 1).first
14
- when '\d'; emit(:type, :digit, text, ts - 1, te)
15
- when '\D'; emit(:type, :nondigit, text, ts - 1, te)
16
- when '\h'; emit(:type, :hex, text, ts - 1, te)
17
- when '\H'; emit(:type, :nonhex, text, ts - 1, te)
18
- when '\s'; emit(:type, :space, text, ts - 1, te)
19
- when '\S'; emit(:type, :nonspace, text, ts - 1, te)
20
- when '\w'; emit(:type, :word, text, ts - 1, te)
21
- when '\W'; emit(:type, :nonword, text, ts - 1, te)
22
- when '\R'; emit(:type, :linebreak, text, ts - 1, te)
23
- when '\X'; emit(:type, :xgrapheme, text, ts - 1, te)
13
+ case text = copy(data, ts-1, te)
14
+ when '\d'; emit(:type, :digit, text)
15
+ when '\D'; emit(:type, :nondigit, text)
16
+ when '\h'; emit(:type, :hex, text)
17
+ when '\H'; emit(:type, :nonhex, text)
18
+ when '\s'; emit(:type, :space, text)
19
+ when '\S'; emit(:type, :nonspace, text)
20
+ when '\w'; emit(:type, :word, text)
21
+ when '\W'; emit(:type, :nonword, text)
22
+ when '\R'; emit(:type, :linebreak, text)
23
+ when '\X'; emit(:type, :xgrapheme, text)
24
24
  end
25
25
  fret;
26
26
  };
@@ -14,7 +14,7 @@
14
14
  unicode_property := |*
15
15
 
16
16
  property_sequence < eof(premature_property_end) {
17
- text = text(data, ts, te, 1).first
17
+ text = copy(data, ts-1, te)
18
18
  type = (text[1] == 'P') ^ (text[3] == '^') ? :nonproperty : :property
19
19
 
20
20
  name = data[ts+2..te-2].pack('c*').gsub(/[\^\s_\-]/, '').downcase
@@ -22,7 +22,7 @@
22
22
  token = self.class.short_prop_map[name] || self.class.long_prop_map[name]
23
23
  raise UnknownUnicodePropertyError.new(name) unless token
24
24
 
25
- self.emit(type, token.to_sym, text, ts-1, te)
25
+ self.emit(type, token.to_sym, text)
26
26
 
27
27
  fret;
28
28
  };
@@ -21,7 +21,7 @@
21
21
  set_close = ']';
22
22
  brackets = set_open | set_close;
23
23
 
24
- comment = ('#' . [^\n]* . '\n');
24
+ comment = ('#' . [^\n]* . '\n'?);
25
25
 
26
26
  class_name_posix = 'alnum' | 'alpha' | 'blank' |
27
27
  'cntrl' | 'digit' | 'graph' |
@@ -62,13 +62,17 @@
62
62
  quantifier_possessive = '?+' | '*+' | '++';
63
63
  quantifier_mode = '?' | '+';
64
64
 
65
- quantifier_interval = range_open . (digit+)? . ','? . (digit+)? .
66
- range_close . quantifier_mode?;
65
+ quantity_exact = (digit+);
66
+ quantity_minimum = (digit+) . ',';
67
+ quantity_maximum = ',' . (digit+);
68
+ quantity_range = (digit+) . ',' . (digit+);
69
+ quantifier_interval = range_open . ( quantity_exact | quantity_minimum |
70
+ quantity_maximum | quantity_range ) . range_close .
71
+ quantifier_mode?;
67
72
 
68
73
  quantifiers = quantifier_greedy | quantifier_reluctant |
69
74
  quantifier_possessive | quantifier_interval;
70
75
 
71
-
72
76
  conditional = '(?(';
73
77
 
74
78
  group_comment = '?#' . [^)]* . group_close;
@@ -114,7 +118,9 @@
114
118
  curlies | parantheses | brackets |
115
119
  line_anchor | quantifier_greedy;
116
120
 
117
- ascii_print = ((0x20..0x7e) - meta_char);
121
+ literal_delimiters = ']' | '}';
122
+
123
+ ascii_print = ((0x20..0x7e) - meta_char - '#');
118
124
  ascii_nonprint = (0x01..0x1f | 0x7f);
119
125
 
120
126
  utf8_2_byte = (0xc2..0xdf 0x80..0xbf);
@@ -122,20 +128,20 @@
122
128
  utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
123
129
 
124
130
  non_literal_escape = char_type_char | anchor_char | escaped_ascii |
125
- group_ref | keep_mark | [xucCM];
131
+ keep_mark | [xucCM];
126
132
 
127
133
  non_set_escape = (anchor_char - 'b') | group_ref | keep_mark |
128
134
  multi_codepoint_char_type | [0-9cCM];
129
135
 
130
136
  # EOF error, used where it can be detected
131
137
  action premature_end_error {
132
- text = ts ? copy(data, ts-1..-1) : data.pack('c*')
138
+ text = copy(data, ts ? ts-1 : 0, -1)
133
139
  raise PrematureEndError.new( text )
134
140
  }
135
141
 
136
142
  # Invalid sequence error, used from sequences, like escapes and sets
137
143
  action invalid_sequence_error {
138
- text = ts ? copy(data, ts-1..-1) : data.pack('c*')
144
+ text = copy(data, ts ? ts-1 : 0, -1)
139
145
  validation_error(:sequence, 'sequence', text)
140
146
  }
141
147
 
@@ -150,7 +156,7 @@
150
156
  # --------------------------------------------------------------------------
151
157
  character_set := |*
152
158
  set_close > (set_meta, 2) @set_closed {
153
- emit(:set, :close, *text(data, ts, te))
159
+ emit(:set, :close, copy(data, ts, te))
154
160
  if in_set?
155
161
  fret;
156
162
  else
@@ -159,8 +165,8 @@
159
165
  };
160
166
 
161
167
  '-]' @set_closed { # special case, emits two tokens
162
- emit(:literal, :literal, copy(data, ts..te-2), ts, te - 1)
163
- emit(:set, :close, copy(data, ts+1..te-1), ts + 1, te)
168
+ emit(:literal, :literal, copy(data, ts, te-1))
169
+ emit(:set, :close, copy(data, ts+1, te))
164
170
  if in_set?
165
171
  fret;
166
172
  else
@@ -169,33 +175,33 @@
169
175
  };
170
176
 
171
177
  '-&&' { # special case, emits two tokens
172
- emit(:literal, :literal, '-', ts, te)
173
- emit(:set, :intersection, '&&', ts, te)
178
+ emit(:literal, :literal, '-')
179
+ emit(:set, :intersection, '&&')
174
180
  };
175
181
 
176
182
  '^' {
177
- text = text(data, ts, te).first
183
+ text = copy(data, ts, te)
178
184
  if tokens.last[1] == :open
179
- emit(:set, :negate, text, ts, te)
185
+ emit(:set, :negate, text)
180
186
  else
181
- emit(:literal, :literal, text, ts, te)
187
+ emit(:literal, :literal, text)
182
188
  end
183
189
  };
184
190
 
185
191
  '-' {
186
- text = text(data, ts, te).first
192
+ text = copy(data, ts, te)
187
193
  # ranges cant start with a subset or intersection/negation/range operator
188
194
  if tokens.last[0] == :set
189
- emit(:literal, :literal, text, ts, te)
195
+ emit(:literal, :literal, text)
190
196
  else
191
- emit(:set, :range, text, ts, te)
197
+ emit(:set, :range, text)
192
198
  end
193
199
  };
194
200
 
195
201
  # Unlike ranges, intersections can start or end at set boundaries, whereupon
196
202
  # they match nothing: r = /[a&&]/; [r =~ ?a, r =~ ?&] # => [nil, nil]
197
203
  '&&' {
198
- emit(:set, :intersection, *text(data, ts, te))
204
+ emit(:set, :intersection, copy(data, ts, te))
199
205
  };
200
206
 
201
207
  backslash {
@@ -203,12 +209,12 @@
203
209
  };
204
210
 
205
211
  set_open >(open_bracket, 1) >set_opened {
206
- emit(:set, :open, *text(data, ts, te))
212
+ emit(:set, :open, copy(data, ts, te))
207
213
  fcall character_set;
208
214
  };
209
215
 
210
216
  class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error) {
211
- text = text(data, ts, te).first
217
+ text = copy(data, ts, te)
212
218
 
213
219
  type = :posixclass
214
220
  class_name = text[2..-3]
@@ -217,19 +223,19 @@
217
223
  type = :nonposixclass
218
224
  end
219
225
 
220
- emit(type, class_name.to_sym, text, ts, te)
226
+ emit(type, class_name.to_sym, text)
221
227
  };
222
228
 
223
229
  collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
224
- emit(:set, :collation, *text(data, ts, te))
230
+ emit(:set, :collation, copy(data, ts, te))
225
231
  };
226
232
 
227
233
  character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
228
- emit(:set, :equivalent, *text(data, ts, te))
234
+ emit(:set, :equivalent, copy(data, ts, te))
229
235
  };
230
236
 
231
237
  meta_char > (set_meta, 1) {
232
- emit(:literal, :literal, *text(data, ts, te))
238
+ emit(:literal, :literal, copy(data, ts, te))
233
239
  };
234
240
 
235
241
  any |
@@ -237,9 +243,8 @@
237
243
  utf8_2_byte |
238
244
  utf8_3_byte |
239
245
  utf8_4_byte {
240
- char, *rest = *text(data, ts, te)
241
- char.force_encoding('utf-8') if char.respond_to?(:force_encoding)
242
- emit(:literal, :literal, char, *rest)
246
+ text = copy(data, ts, te)
247
+ emit(:literal, :literal, text)
243
248
  };
244
249
  *|;
245
250
 
@@ -247,7 +252,7 @@
247
252
  # --------------------------------------------------------------------------
248
253
  set_escape_sequence := |*
249
254
  non_set_escape > (escaped_set_alpha, 2) {
250
- emit(:escape, :literal, *text(data, ts, te, 1))
255
+ emit(:escape, :literal, copy(data, ts-1, te))
251
256
  fret;
252
257
  };
253
258
 
@@ -263,33 +268,33 @@
263
268
  # --------------------------------------------------------------------------
264
269
  escape_sequence := |*
265
270
  [1-9] {
266
- text = text(data, ts, te, 1).first
267
- emit(:backref, :number, text, ts-1, te)
271
+ text = copy(data, ts-1, te)
272
+ emit(:backref, :number, text)
268
273
  fret;
269
274
  };
270
275
 
271
276
  octal_sequence {
272
- emit(:escape, :octal, *text(data, ts, te, 1))
277
+ emit(:escape, :octal, copy(data, ts-1, te))
273
278
  fret;
274
279
  };
275
280
 
276
281
  meta_char {
277
- case text = text(data, ts, te, 1).first
278
- when '\.'; emit(:escape, :dot, text, ts-1, te)
279
- when '\|'; emit(:escape, :alternation, text, ts-1, te)
280
- when '\^'; emit(:escape, :bol, text, ts-1, te)
281
- when '\$'; emit(:escape, :eol, text, ts-1, te)
282
- when '\?'; emit(:escape, :zero_or_one, text, ts-1, te)
283
- when '\*'; emit(:escape, :zero_or_more, text, ts-1, te)
284
- when '\+'; emit(:escape, :one_or_more, text, ts-1, te)
285
- when '\('; emit(:escape, :group_open, text, ts-1, te)
286
- when '\)'; emit(:escape, :group_close, text, ts-1, te)
287
- when '\{'; emit(:escape, :interval_open, text, ts-1, te)
288
- when '\}'; emit(:escape, :interval_close, text, ts-1, te)
289
- when '\['; emit(:escape, :set_open, text, ts-1, te)
290
- when '\]'; emit(:escape, :set_close, text, ts-1, te)
282
+ case text = copy(data, ts-1, te)
283
+ when '\.'; emit(:escape, :dot, text)
284
+ when '\|'; emit(:escape, :alternation, text)
285
+ when '\^'; emit(:escape, :bol, text)
286
+ when '\$'; emit(:escape, :eol, text)
287
+ when '\?'; emit(:escape, :zero_or_one, text)
288
+ when '\*'; emit(:escape, :zero_or_more, text)
289
+ when '\+'; emit(:escape, :one_or_more, text)
290
+ when '\('; emit(:escape, :group_open, text)
291
+ when '\)'; emit(:escape, :group_close, text)
292
+ when '\{'; emit(:escape, :interval_open, text)
293
+ when '\}'; emit(:escape, :interval_close, text)
294
+ when '\['; emit(:escape, :set_open, text)
295
+ when '\]'; emit(:escape, :set_close, text)
291
296
  when "\\\\";
292
- emit(:escape, :backslash, text, ts-1, te)
297
+ emit(:escape, :backslash, text)
293
298
  end
294
299
  fret;
295
300
  };
@@ -297,31 +302,31 @@
297
302
  escaped_ascii > (escaped_alpha, 7) {
298
303
  # \b is emitted as backspace only when inside a character set, otherwise
299
304
  # it is a word boundary anchor. A syntax might "normalize" it if needed.
300
- case text = text(data, ts, te, 1).first
301
- when '\a'; emit(:escape, :bell, text, ts-1, te)
302
- when '\b'; emit(:escape, :backspace, text, ts-1, te)
303
- when '\e'; emit(:escape, :escape, text, ts-1, te)
304
- when '\f'; emit(:escape, :form_feed, text, ts-1, te)
305
- when '\n'; emit(:escape, :newline, text, ts-1, te)
306
- when '\r'; emit(:escape, :carriage, text, ts-1, te)
307
- when '\t'; emit(:escape, :tab, text, ts-1, te)
308
- when '\v'; emit(:escape, :vertical_tab, text, ts-1, te)
305
+ case text = copy(data, ts-1, te)
306
+ when '\a'; emit(:escape, :bell, text)
307
+ when '\b'; emit(:escape, :backspace, text)
308
+ when '\e'; emit(:escape, :escape, text)
309
+ when '\f'; emit(:escape, :form_feed, text)
310
+ when '\n'; emit(:escape, :newline, text)
311
+ when '\r'; emit(:escape, :carriage, text)
312
+ when '\t'; emit(:escape, :tab, text)
313
+ when '\v'; emit(:escape, :vertical_tab, text)
309
314
  end
310
315
  fret;
311
316
  };
312
317
 
313
318
  codepoint_sequence > (escaped_alpha, 6) $eof(premature_end_error) {
314
- text = text(data, ts, te, 1).first
319
+ text = copy(data, ts-1, te)
315
320
  if text[2].chr == '{'
316
- emit(:escape, :codepoint_list, text, ts-1, te)
321
+ emit(:escape, :codepoint_list, text)
317
322
  else
318
- emit(:escape, :codepoint, text, ts-1, te)
323
+ emit(:escape, :codepoint, text)
319
324
  end
320
325
  fret;
321
326
  };
322
327
 
323
328
  hex_sequence > (escaped_alpha, 5) $eof(premature_end_error) {
324
- emit(:escape, :hex, *text(data, ts, te, 1))
329
+ emit(:escape, :hex, copy(data, ts-1, te))
325
330
  fret;
326
331
  };
327
332
 
@@ -351,8 +356,11 @@
351
356
  fcall unicode_property;
352
357
  };
353
358
 
354
- (any -- non_literal_escape) > (escaped_alpha, 1) {
355
- emit(:escape, :literal, *text(data, ts, te, 1))
359
+ (any -- non_literal_escape) |
360
+ utf8_2_byte |
361
+ utf8_3_byte |
362
+ utf8_4_byte > (escaped_alpha, 1) {
363
+ emit(:escape, :literal, copy(data, ts-1, te))
356
364
  fret;
357
365
  };
358
366
  *|;
@@ -362,9 +370,9 @@
362
370
  # --------------------------------------------------------------------------
363
371
  conditional_expression := |*
364
372
  group_lookup . ')' {
365
- text = text(data, ts, te-1).first
366
- emit(:conditional, :condition, text, ts, te-1)
367
- emit(:conditional, :condition_close, ')', te-1, te)
373
+ text = copy(data, ts, te-1)
374
+ emit(:conditional, :condition, text)
375
+ emit(:conditional, :condition_close, ')')
368
376
  };
369
377
 
370
378
  any {
@@ -381,46 +389,50 @@
381
389
  # Meta characters
382
390
  # ------------------------------------------------------------------------
383
391
  dot {
384
- emit(:meta, :dot, *text(data, ts, te))
392
+ emit(:meta, :dot, copy(data, ts, te))
385
393
  };
386
394
 
387
395
  alternation {
388
396
  if conditional_stack.last == group_depth
389
- emit(:conditional, :separator, *text(data, ts, te))
397
+ emit(:conditional, :separator, copy(data, ts, te))
390
398
  else
391
- emit(:meta, :alternation, *text(data, ts, te))
399
+ emit(:meta, :alternation, copy(data, ts, te))
392
400
  end
393
401
  };
394
402
 
395
403
  # Anchors
396
404
  # ------------------------------------------------------------------------
397
405
  beginning_of_line {
398
- emit(:anchor, :bol, *text(data, ts, te))
406
+ emit(:anchor, :bol, copy(data, ts, te))
399
407
  };
400
408
 
401
409
  end_of_line {
402
- emit(:anchor, :eol, *text(data, ts, te))
410
+ emit(:anchor, :eol, copy(data, ts, te))
403
411
  };
404
412
 
405
413
  backslash . keep_mark > (backslashed, 4) {
406
- emit(:keep, :mark, *text(data, ts, te))
414
+ emit(:keep, :mark, copy(data, ts, te))
407
415
  };
408
416
 
409
417
  backslash . anchor_char > (backslashed, 3) {
410
- case text = text(data, ts, te).first
411
- when '\\A'; emit(:anchor, :bos, text, ts, te)
412
- when '\\z'; emit(:anchor, :eos, text, ts, te)
413
- when '\\Z'; emit(:anchor, :eos_ob_eol, text, ts, te)
414
- when '\\b'; emit(:anchor, :word_boundary, text, ts, te)
415
- when '\\B'; emit(:anchor, :nonword_boundary, text, ts, te)
416
- when '\\G'; emit(:anchor, :match_start, text, ts, te)
418
+ case text = copy(data, ts, te)
419
+ when '\\A'; emit(:anchor, :bos, text)
420
+ when '\\z'; emit(:anchor, :eos, text)
421
+ when '\\Z'; emit(:anchor, :eos_ob_eol, text)
422
+ when '\\b'; emit(:anchor, :word_boundary, text)
423
+ when '\\B'; emit(:anchor, :nonword_boundary, text)
424
+ when '\\G'; emit(:anchor, :match_start, text)
417
425
  end
418
426
  };
419
427
 
428
+ literal_delimiters {
429
+ append_literal(data, ts, te)
430
+ };
431
+
420
432
  # Character sets
421
433
  # ------------------------------------------------------------------------
422
434
  set_open >set_opened {
423
- emit(:set, :open, *text(data, ts, te))
435
+ emit(:set, :open, copy(data, ts, te))
424
436
  fcall character_set;
425
437
  };
426
438
 
@@ -429,12 +441,12 @@
429
441
  # (?(condition)Y|N) conditional expression
430
442
  # ------------------------------------------------------------------------
431
443
  conditional {
432
- text = text(data, ts, te).first
444
+ text = copy(data, ts, te)
433
445
 
434
446
  conditional_stack << group_depth
435
447
 
436
- emit(:conditional, :open, text[0..-2], ts, te-1)
437
- emit(:conditional, :condition_open, '(', te-1, te)
448
+ emit(:conditional, :open, text[0..-2])
449
+ emit(:conditional, :condition_open, '(')
438
450
  fcall conditional_expression;
439
451
  };
440
452
 
@@ -445,7 +457,7 @@
445
457
  # correct closing count.
446
458
  # ------------------------------------------------------------------------
447
459
  group_open . group_comment $group_closed {
448
- emit(:group, :comment, *text(data, ts, te))
460
+ emit(:group, :comment, copy(data, ts, te))
449
461
  };
450
462
 
451
463
  # Expression options:
@@ -460,11 +472,11 @@
460
472
  # (?imxdau-imx:subexp) option on/off for subexp
461
473
  # ------------------------------------------------------------------------
462
474
  group_open . group_options >group_opened {
463
- text = text(data, ts, te).first
475
+ text = copy(data, ts, te)
464
476
  if text[2..-1] =~ /([^\-mixdau:]|^$)|-.*([dau])/
465
477
  raise InvalidGroupOption.new($1 || "-#{$2}", text)
466
478
  end
467
- emit_options(text, ts, te)
479
+ emit_options(text)
468
480
  };
469
481
 
470
482
  # Assertions
@@ -474,11 +486,11 @@
474
486
  # (?<!subexp) negative look-behind
475
487
  # ------------------------------------------------------------------------
476
488
  group_open . assertion_type >group_opened {
477
- case text = text(data, ts, te).first
478
- when '(?='; emit(:assertion, :lookahead, text, ts, te)
479
- when '(?!'; emit(:assertion, :nlookahead, text, ts, te)
480
- when '(?<='; emit(:assertion, :lookbehind, text, ts, te)
481
- when '(?<!'; emit(:assertion, :nlookbehind, text, ts, te)
489
+ case text = copy(data, ts, te)
490
+ when '(?='; emit(:assertion, :lookahead, text)
491
+ when '(?!'; emit(:assertion, :nlookahead, text)
492
+ when '(?<='; emit(:assertion, :lookbehind, text)
493
+ when '(?<!'; emit(:assertion, :nlookbehind, text)
482
494
  end
483
495
  };
484
496
 
@@ -491,32 +503,32 @@
491
503
  # (subexp) captured group
492
504
  # ------------------------------------------------------------------------
493
505
  group_open . group_type >group_opened {
494
- case text = text(data, ts, te).first
495
- when '(?:'; emit(:group, :passive, text, ts, te)
496
- when '(?>'; emit(:group, :atomic, text, ts, te)
497
- when '(?~'; emit(:group, :absence, text, ts, te)
506
+ case text = copy(data, ts, te)
507
+ when '(?:'; emit(:group, :passive, text)
508
+ when '(?>'; emit(:group, :atomic, text)
509
+ when '(?~'; emit(:group, :absence, text)
498
510
 
499
511
  when /^\(\?(?:<>|'')/
500
512
  validation_error(:group, 'named group', 'name is empty')
501
513
 
502
514
  when /^\(\?<\w*>/
503
- emit(:group, :named_ab, text, ts, te)
515
+ emit(:group, :named_ab, text)
504
516
 
505
517
  when /^\(\?'\w*'/
506
- emit(:group, :named_sq, text, ts, te)
518
+ emit(:group, :named_sq, text)
507
519
 
508
520
  end
509
521
  };
510
522
 
511
523
  group_open @group_opened {
512
- text = text(data, ts, te).first
513
- emit(:group, :capture, text, ts, te)
524
+ text = copy(data, ts, te)
525
+ emit(:group, :capture, text)
514
526
  };
515
527
 
516
528
  group_close @group_closed {
517
529
  if conditional_stack.last == group_depth + 1
518
530
  conditional_stack.pop
519
- emit(:conditional, :close, *text(data, ts, te))
531
+ emit(:conditional, :close, copy(data, ts, te))
520
532
  else
521
533
  if spacing_stack.length > 1 &&
522
534
  spacing_stack.last[:depth] == group_depth + 1
@@ -524,7 +536,7 @@
524
536
  self.free_spacing = spacing_stack.last[:free_spacing]
525
537
  end
526
538
 
527
- emit(:group, :close, *text(data, ts, te))
539
+ emit(:group, :close, copy(data, ts, te))
528
540
  end
529
541
  };
530
542
 
@@ -532,63 +544,63 @@
532
544
  # Group backreference, named and numbered
533
545
  # ------------------------------------------------------------------------
534
546
  backslash . (group_name_ref | group_number_ref) > (backslashed, 4) {
535
- case text = text(data, ts, te).first
547
+ case text = copy(data, ts, te)
536
548
  when /^\\([gk])(<>|'')/ # angle brackets
537
549
  validation_error(:backref, 'ref/call', 'ref ID is empty')
538
550
 
539
551
  when /^\\([gk])<[^\d+-]\w*>/ # angle-brackets
540
552
  if $1 == 'k'
541
- emit(:backref, :name_ref_ab, text, ts, te)
553
+ emit(:backref, :name_ref_ab, text)
542
554
  else
543
- emit(:backref, :name_call_ab, text, ts, te)
555
+ emit(:backref, :name_call_ab, text)
544
556
  end
545
557
 
546
558
  when /^\\([gk])'[^\d+-]\w*'/ #single quotes
547
559
  if $1 == 'k'
548
- emit(:backref, :name_ref_sq, text, ts, te)
560
+ emit(:backref, :name_ref_sq, text)
549
561
  else
550
- emit(:backref, :name_call_sq, text, ts, te)
562
+ emit(:backref, :name_call_sq, text)
551
563
  end
552
564
 
553
565
  when /^\\([gk])<\d+>/ # angle-brackets
554
566
  if $1 == 'k'
555
- emit(:backref, :number_ref_ab, text, ts, te)
567
+ emit(:backref, :number_ref_ab, text)
556
568
  else
557
- emit(:backref, :number_call_ab, text, ts, te)
569
+ emit(:backref, :number_call_ab, text)
558
570
  end
559
571
 
560
572
  when /^\\([gk])'\d+'/ # single quotes
561
573
  if $1 == 'k'
562
- emit(:backref, :number_ref_sq, text, ts, te)
574
+ emit(:backref, :number_ref_sq, text)
563
575
  else
564
- emit(:backref, :number_call_sq, text, ts, te)
576
+ emit(:backref, :number_call_sq, text)
565
577
  end
566
578
 
567
579
  when /^\\(?:g<\+|g<-|(k)<-)\d+>/ # angle-brackets
568
580
  if $1 == 'k'
569
- emit(:backref, :number_rel_ref_ab, text, ts, te)
581
+ emit(:backref, :number_rel_ref_ab, text)
570
582
  else
571
- emit(:backref, :number_rel_call_ab, text, ts, te)
583
+ emit(:backref, :number_rel_call_ab, text)
572
584
  end
573
585
 
574
586
  when /^\\(?:g'\+|g'-|(k)'-)\d+'/ # single quotes
575
587
  if $1 == 'k'
576
- emit(:backref, :number_rel_ref_sq, text, ts, te)
588
+ emit(:backref, :number_rel_ref_sq, text)
577
589
  else
578
- emit(:backref, :number_rel_call_sq, text, ts, te)
590
+ emit(:backref, :number_rel_call_sq, text)
579
591
  end
580
592
 
581
593
  when /^\\k<[^\d+\-]\w*[+\-]\d+>/ # angle-brackets
582
- emit(:backref, :name_recursion_ref_ab, text, ts, te)
594
+ emit(:backref, :name_recursion_ref_ab, text)
583
595
 
584
596
  when /^\\k'[^\d+\-]\w*[+\-]\d+'/ # single-quotes
585
- emit(:backref, :name_recursion_ref_sq, text, ts, te)
597
+ emit(:backref, :name_recursion_ref_sq, text)
586
598
 
587
599
  when /^\\([gk])<[+\-]?\d+[+\-]\d+>/ # angle-brackets
588
- emit(:backref, :number_recursion_ref_ab, text, ts, te)
600
+ emit(:backref, :number_recursion_ref_ab, text)
589
601
 
590
602
  when /^\\([gk])'[+\-]?\d+[+\-]\d+'/ # single-quotes
591
- emit(:backref, :number_recursion_ref_sq, text, ts, te)
603
+ emit(:backref, :number_recursion_ref_sq, text)
592
604
 
593
605
  end
594
606
  };
@@ -597,31 +609,36 @@
597
609
  # Quantifiers
598
610
  # ------------------------------------------------------------------------
599
611
  zero_or_one {
600
- case text = text(data, ts, te).first
601
- when '?' ; emit(:quantifier, :zero_or_one, text, ts, te)
602
- when '??'; emit(:quantifier, :zero_or_one_reluctant, text, ts, te)
603
- when '?+'; emit(:quantifier, :zero_or_one_possessive, text, ts, te)
612
+ case text = copy(data, ts, te)
613
+ when '?' ; emit(:quantifier, :zero_or_one, text)
614
+ when '??'; emit(:quantifier, :zero_or_one_reluctant, text)
615
+ when '?+'; emit(:quantifier, :zero_or_one_possessive, text)
604
616
  end
605
617
  };
606
618
 
607
619
  zero_or_more {
608
- case text = text(data, ts, te).first
609
- when '*' ; emit(:quantifier, :zero_or_more, text, ts, te)
610
- when '*?'; emit(:quantifier, :zero_or_more_reluctant, text, ts, te)
611
- when '*+'; emit(:quantifier, :zero_or_more_possessive, text, ts, te)
620
+ case text = copy(data, ts, te)
621
+ when '*' ; emit(:quantifier, :zero_or_more, text)
622
+ when '*?'; emit(:quantifier, :zero_or_more_reluctant, text)
623
+ when '*+'; emit(:quantifier, :zero_or_more_possessive, text)
612
624
  end
613
625
  };
614
626
 
615
627
  one_or_more {
616
- case text = text(data, ts, te).first
617
- when '+' ; emit(:quantifier, :one_or_more, text, ts, te)
618
- when '+?'; emit(:quantifier, :one_or_more_reluctant, text, ts, te)
619
- when '++'; emit(:quantifier, :one_or_more_possessive, text, ts, te)
628
+ case text = copy(data, ts, te)
629
+ when '+' ; emit(:quantifier, :one_or_more, text)
630
+ when '+?'; emit(:quantifier, :one_or_more_reluctant, text)
631
+ when '++'; emit(:quantifier, :one_or_more_possessive, text)
620
632
  end
621
633
  };
622
634
 
623
- quantifier_interval @err(premature_end_error) {
624
- emit(:quantifier, :interval, *text(data, ts, te))
635
+ quantifier_interval {
636
+ emit(:quantifier, :interval, copy(data, ts, te))
637
+ };
638
+
639
+ # Catch unmatched curly braces as literals
640
+ range_open {
641
+ append_literal(data, ts, te)
625
642
  };
626
643
 
627
644
  # Escaped sequences
@@ -632,15 +649,17 @@
632
649
 
633
650
  comment {
634
651
  if free_spacing
635
- emit(:free_space, :comment, *text(data, ts, te))
652
+ emit(:free_space, :comment, copy(data, ts, te))
636
653
  else
637
- append_literal(data, ts, te)
654
+ # consume only the pound sign (#) and backtrack to do regular scanning
655
+ append_literal(data, ts, ts + 1)
656
+ fexec ts + 1;
638
657
  end
639
658
  };
640
659
 
641
660
  space+ {
642
661
  if free_spacing
643
- emit(:free_space, :whitespace, *text(data, ts, te))
662
+ emit(:free_space, :whitespace, copy(data, ts, te))
644
663
  else
645
664
  append_literal(data, ts, te)
646
665
  end
@@ -722,21 +741,16 @@ class Regexp::Scanner
722
741
  #
723
742
  # This method may raise errors if a syntax error is encountered.
724
743
  # --------------------------------------------------------------------------
725
- def self.scan(input_object, &block)
726
- new.scan(input_object, &block)
744
+ def self.scan(input_object, options: nil, &block)
745
+ new.scan(input_object, options: options, &block)
727
746
  end
728
747
 
729
- def scan(input_object, &block)
748
+ def scan(input_object, options: nil, &block)
730
749
  self.literal = nil
731
750
  stack = []
732
751
 
733
- if input_object.is_a?(Regexp)
734
- input = input_object.source
735
- self.free_spacing = (input_object.options & Regexp::EXTENDED != 0)
736
- else
737
- input = input_object
738
- self.free_spacing = false
739
- end
752
+ input = input_object.is_a?(Regexp) ? input_object.source : input_object
753
+ self.free_spacing = free_spacing?(input_object, options)
740
754
  self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
741
755
 
742
756
  data = input.unpack("c*") if input.is_a?(String)
@@ -748,6 +762,7 @@ class Regexp::Scanner
748
762
  self.set_depth = 0
749
763
  self.group_depth = 0
750
764
  self.conditional_stack = []
765
+ self.char_pos = 0
751
766
 
752
767
  %% write data;
753
768
  %% write init;
@@ -757,7 +772,7 @@ class Regexp::Scanner
757
772
  testEof = testEof
758
773
 
759
774
  if cs == re_scanner_error
760
- text = ts ? copy(data, ts-1..-1) : data.pack('c*')
775
+ text = copy(data, ts ? ts-1 : 0, -1)
761
776
  raise ScannerError.new("Scan error at '#{text}'")
762
777
  end
763
778
 
@@ -785,22 +800,41 @@ class Regexp::Scanner
785
800
  end
786
801
 
787
802
  # Emits an array with the details of the scanned pattern
788
- def emit(type, token, text, ts, te)
803
+ def emit(type, token, text)
789
804
  #puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
790
805
 
791
806
  emit_literal if literal
792
807
 
808
+ # Ragel runs with byte-based indices (ts, te). These are of little value to
809
+ # end-users, so we keep track of char-based indices and emit those instead.
810
+ ts_char_pos = char_pos
811
+ te_char_pos = char_pos + text.length
812
+
793
813
  if block
794
- block.call type, token, text, ts, te
814
+ block.call type, token, text, ts_char_pos, te_char_pos
795
815
  end
796
816
 
797
- tokens << [type, token, text, ts, te]
817
+ tokens << [type, token, text, ts_char_pos, te_char_pos]
818
+
819
+ self.char_pos = te_char_pos
798
820
  end
799
821
 
800
822
  private
801
823
 
802
824
  attr_accessor :tokens, :literal, :block, :free_spacing, :spacing_stack,
803
- :group_depth, :set_depth, :conditional_stack
825
+ :group_depth, :set_depth, :conditional_stack, :char_pos
826
+
827
+ def free_spacing?(input_object, options)
828
+ if options && !input_object.is_a?(String)
829
+ raise ArgumentError, 'options cannot be supplied unless scanning a String'
830
+ end
831
+
832
+ options = input_object.options if input_object.is_a?(::Regexp)
833
+
834
+ return false unless options
835
+
836
+ options & Regexp::EXTENDED != 0
837
+ end
804
838
 
805
839
  def in_group?
806
840
  group_depth > 0
@@ -811,36 +845,25 @@ class Regexp::Scanner
811
845
  end
812
846
 
813
847
  # Copy from ts to te from data as text
814
- def copy(data, range)
815
- data[range].pack('c*')
816
- end
817
-
818
- # Copy from ts to te from data as text, returning an array with the text
819
- # and the offsets used to copy it.
820
- def text(data, ts, te, soff = 0)
821
- [copy(data, ts-soff..te-1), ts-soff, te]
848
+ def copy(data, ts, te)
849
+ data[ts...te].pack('c*').force_encoding('utf-8')
822
850
  end
823
851
 
824
852
  # Appends one or more characters to the literal buffer, to be emitted later
825
- # by a call to emit_literal. Contents can be a mix of ASCII and UTF-8.
853
+ # by a call to emit_literal.
826
854
  def append_literal(data, ts, te)
827
855
  self.literal = literal || []
828
- literal << text(data, ts, te)
856
+ literal << copy(data, ts, te)
829
857
  end
830
858
 
831
- # Emits the literal run collected by calls to the append_literal method,
832
- # using the total start (ts) and end (te) offsets of the run.
859
+ # Emits the literal run collected by calls to the append_literal method.
833
860
  def emit_literal
834
- ts, te = literal.first[1], literal.last[2]
835
- text = literal.map {|t| t[0]}.join
836
-
837
- text.force_encoding('utf-8') if text.respond_to?(:force_encoding)
838
-
861
+ text = literal.join
839
862
  self.literal = nil
840
- emit(:literal, :literal, text, ts, te)
863
+ emit(:literal, :literal, text)
841
864
  end
842
865
 
843
- def emit_options(text, ts, te)
866
+ def emit_options(text)
844
867
  token = nil
845
868
 
846
869
  # Ruby allows things like '(?-xxxx)' or '(?xx-xx--xx-:abc)'.
@@ -866,14 +889,14 @@ class Regexp::Scanner
866
889
  token = :options_switch
867
890
  end
868
891
 
869
- emit(:group, token, text, ts, te)
892
+ emit(:group, token, text)
870
893
  end
871
894
 
872
895
  def emit_meta_control_sequence(data, ts, te, token)
873
896
  if data.last < 0x00 || data.last > 0x7F
874
897
  validation_error(:sequence, 'escape', token.to_s)
875
898
  end
876
- emit(:escape, token, *text(data, ts, te, 1))
899
+ emit(:escape, token, copy(data, ts-1, te))
877
900
  end
878
901
 
879
902
  # Centralizes and unifies the handling of validation related