regexp_parser 1.7.0 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (36) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +80 -1
  3. data/README.md +24 -12
  4. data/lib/regexp_parser/expression.rb +10 -19
  5. data/lib/regexp_parser/expression/classes/group.rb +17 -2
  6. data/lib/regexp_parser/expression/classes/root.rb +4 -16
  7. data/lib/regexp_parser/expression/quantifier.rb +9 -0
  8. data/lib/regexp_parser/expression/sequence.rb +0 -10
  9. data/lib/regexp_parser/lexer.rb +6 -6
  10. data/lib/regexp_parser/parser.rb +45 -12
  11. data/lib/regexp_parser/scanner.rb +1305 -1193
  12. data/lib/regexp_parser/scanner/char_type.rl +11 -11
  13. data/lib/regexp_parser/scanner/property.rl +2 -2
  14. data/lib/regexp_parser/scanner/scanner.rl +194 -171
  15. data/lib/regexp_parser/syntax/version_lookup.rb +2 -2
  16. data/lib/regexp_parser/version.rb +1 -1
  17. data/regexp_parser.gemspec +1 -1
  18. data/spec/expression/base_spec.rb +10 -0
  19. data/spec/expression/to_s_spec.rb +16 -0
  20. data/spec/lexer/delimiters_spec.rb +68 -0
  21. data/spec/lexer/literals_spec.rb +24 -49
  22. data/spec/parser/escapes_spec.rb +1 -1
  23. data/spec/parser/options_spec.rb +28 -0
  24. data/spec/parser/quantifiers_spec.rb +16 -0
  25. data/spec/parser/set/ranges_spec.rb +3 -3
  26. data/spec/scanner/delimiters_spec.rb +52 -0
  27. data/spec/scanner/errors_spec.rb +0 -1
  28. data/spec/scanner/escapes_spec.rb +10 -0
  29. data/spec/scanner/free_space_spec.rb +32 -0
  30. data/spec/scanner/literals_spec.rb +28 -38
  31. data/spec/scanner/options_spec.rb +36 -0
  32. data/spec/scanner/quantifiers_spec.rb +18 -13
  33. data/spec/scanner/sets_spec.rb +8 -2
  34. metadata +65 -61
  35. data/spec/expression/root_spec.rb +0 -9
  36. data/spec/expression/sequence_spec.rb +0 -9
@@ -10,17 +10,17 @@
10
10
  # --------------------------------------------------------------------------
11
11
  char_type := |*
12
12
  char_type_char {
13
- case text = text(data, ts, te, 1).first
14
- when '\d'; emit(:type, :digit, text, ts - 1, te)
15
- when '\D'; emit(:type, :nondigit, text, ts - 1, te)
16
- when '\h'; emit(:type, :hex, text, ts - 1, te)
17
- when '\H'; emit(:type, :nonhex, text, ts - 1, te)
18
- when '\s'; emit(:type, :space, text, ts - 1, te)
19
- when '\S'; emit(:type, :nonspace, text, ts - 1, te)
20
- when '\w'; emit(:type, :word, text, ts - 1, te)
21
- when '\W'; emit(:type, :nonword, text, ts - 1, te)
22
- when '\R'; emit(:type, :linebreak, text, ts - 1, te)
23
- when '\X'; emit(:type, :xgrapheme, text, ts - 1, te)
13
+ case text = copy(data, ts-1, te)
14
+ when '\d'; emit(:type, :digit, text)
15
+ when '\D'; emit(:type, :nondigit, text)
16
+ when '\h'; emit(:type, :hex, text)
17
+ when '\H'; emit(:type, :nonhex, text)
18
+ when '\s'; emit(:type, :space, text)
19
+ when '\S'; emit(:type, :nonspace, text)
20
+ when '\w'; emit(:type, :word, text)
21
+ when '\W'; emit(:type, :nonword, text)
22
+ when '\R'; emit(:type, :linebreak, text)
23
+ when '\X'; emit(:type, :xgrapheme, text)
24
24
  end
25
25
  fret;
26
26
  };
@@ -14,7 +14,7 @@
14
14
  unicode_property := |*
15
15
 
16
16
  property_sequence < eof(premature_property_end) {
17
- text = text(data, ts, te, 1).first
17
+ text = copy(data, ts-1, te)
18
18
  type = (text[1] == 'P') ^ (text[3] == '^') ? :nonproperty : :property
19
19
 
20
20
  name = data[ts+2..te-2].pack('c*').gsub(/[\^\s_\-]/, '').downcase
@@ -22,7 +22,7 @@
22
22
  token = self.class.short_prop_map[name] || self.class.long_prop_map[name]
23
23
  raise UnknownUnicodePropertyError.new(name) unless token
24
24
 
25
- self.emit(type, token.to_sym, text, ts-1, te)
25
+ self.emit(type, token.to_sym, text)
26
26
 
27
27
  fret;
28
28
  };
@@ -21,7 +21,7 @@
21
21
  set_close = ']';
22
22
  brackets = set_open | set_close;
23
23
 
24
- comment = ('#' . [^\n]* . '\n');
24
+ comment = ('#' . [^\n]* . '\n'?);
25
25
 
26
26
  class_name_posix = 'alnum' | 'alpha' | 'blank' |
27
27
  'cntrl' | 'digit' | 'graph' |
@@ -62,13 +62,17 @@
62
62
  quantifier_possessive = '?+' | '*+' | '++';
63
63
  quantifier_mode = '?' | '+';
64
64
 
65
- quantifier_interval = range_open . (digit+)? . ','? . (digit+)? .
66
- range_close . quantifier_mode?;
65
+ quantity_exact = (digit+);
66
+ quantity_minimum = (digit+) . ',';
67
+ quantity_maximum = ',' . (digit+);
68
+ quantity_range = (digit+) . ',' . (digit+);
69
+ quantifier_interval = range_open . ( quantity_exact | quantity_minimum |
70
+ quantity_maximum | quantity_range ) . range_close .
71
+ quantifier_mode?;
67
72
 
68
73
  quantifiers = quantifier_greedy | quantifier_reluctant |
69
74
  quantifier_possessive | quantifier_interval;
70
75
 
71
-
72
76
  conditional = '(?(';
73
77
 
74
78
  group_comment = '?#' . [^)]* . group_close;
@@ -114,7 +118,9 @@
114
118
  curlies | parantheses | brackets |
115
119
  line_anchor | quantifier_greedy;
116
120
 
117
- ascii_print = ((0x20..0x7e) - meta_char);
121
+ literal_delimiters = ']' | '}';
122
+
123
+ ascii_print = ((0x20..0x7e) - meta_char - '#');
118
124
  ascii_nonprint = (0x01..0x1f | 0x7f);
119
125
 
120
126
  utf8_2_byte = (0xc2..0xdf 0x80..0xbf);
@@ -122,20 +128,20 @@
122
128
  utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
123
129
 
124
130
  non_literal_escape = char_type_char | anchor_char | escaped_ascii |
125
- group_ref | keep_mark | [xucCM];
131
+ keep_mark | [xucCM];
126
132
 
127
133
  non_set_escape = (anchor_char - 'b') | group_ref | keep_mark |
128
134
  multi_codepoint_char_type | [0-9cCM];
129
135
 
130
136
  # EOF error, used where it can be detected
131
137
  action premature_end_error {
132
- text = ts ? copy(data, ts-1..-1) : data.pack('c*')
138
+ text = copy(data, ts ? ts-1 : 0, -1)
133
139
  raise PrematureEndError.new( text )
134
140
  }
135
141
 
136
142
  # Invalid sequence error, used from sequences, like escapes and sets
137
143
  action invalid_sequence_error {
138
- text = ts ? copy(data, ts-1..-1) : data.pack('c*')
144
+ text = copy(data, ts ? ts-1 : 0, -1)
139
145
  validation_error(:sequence, 'sequence', text)
140
146
  }
141
147
 
@@ -150,7 +156,7 @@
150
156
  # --------------------------------------------------------------------------
151
157
  character_set := |*
152
158
  set_close > (set_meta, 2) @set_closed {
153
- emit(:set, :close, *text(data, ts, te))
159
+ emit(:set, :close, copy(data, ts, te))
154
160
  if in_set?
155
161
  fret;
156
162
  else
@@ -159,8 +165,8 @@
159
165
  };
160
166
 
161
167
  '-]' @set_closed { # special case, emits two tokens
162
- emit(:literal, :literal, copy(data, ts..te-2), ts, te - 1)
163
- emit(:set, :close, copy(data, ts+1..te-1), ts + 1, te)
168
+ emit(:literal, :literal, copy(data, ts, te-1))
169
+ emit(:set, :close, copy(data, ts+1, te))
164
170
  if in_set?
165
171
  fret;
166
172
  else
@@ -169,33 +175,33 @@
169
175
  };
170
176
 
171
177
  '-&&' { # special case, emits two tokens
172
- emit(:literal, :literal, '-', ts, te)
173
- emit(:set, :intersection, '&&', ts, te)
178
+ emit(:literal, :literal, '-')
179
+ emit(:set, :intersection, '&&')
174
180
  };
175
181
 
176
182
  '^' {
177
- text = text(data, ts, te).first
183
+ text = copy(data, ts, te)
178
184
  if tokens.last[1] == :open
179
- emit(:set, :negate, text, ts, te)
185
+ emit(:set, :negate, text)
180
186
  else
181
- emit(:literal, :literal, text, ts, te)
187
+ emit(:literal, :literal, text)
182
188
  end
183
189
  };
184
190
 
185
191
  '-' {
186
- text = text(data, ts, te).first
192
+ text = copy(data, ts, te)
187
193
  # ranges cant start with a subset or intersection/negation/range operator
188
194
  if tokens.last[0] == :set
189
- emit(:literal, :literal, text, ts, te)
195
+ emit(:literal, :literal, text)
190
196
  else
191
- emit(:set, :range, text, ts, te)
197
+ emit(:set, :range, text)
192
198
  end
193
199
  };
194
200
 
195
201
  # Unlike ranges, intersections can start or end at set boundaries, whereupon
196
202
  # they match nothing: r = /[a&&]/; [r =~ ?a, r =~ ?&] # => [nil, nil]
197
203
  '&&' {
198
- emit(:set, :intersection, *text(data, ts, te))
204
+ emit(:set, :intersection, copy(data, ts, te))
199
205
  };
200
206
 
201
207
  backslash {
@@ -203,12 +209,12 @@
203
209
  };
204
210
 
205
211
  set_open >(open_bracket, 1) >set_opened {
206
- emit(:set, :open, *text(data, ts, te))
212
+ emit(:set, :open, copy(data, ts, te))
207
213
  fcall character_set;
208
214
  };
209
215
 
210
216
  class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error) {
211
- text = text(data, ts, te).first
217
+ text = copy(data, ts, te)
212
218
 
213
219
  type = :posixclass
214
220
  class_name = text[2..-3]
@@ -217,19 +223,19 @@
217
223
  type = :nonposixclass
218
224
  end
219
225
 
220
- emit(type, class_name.to_sym, text, ts, te)
226
+ emit(type, class_name.to_sym, text)
221
227
  };
222
228
 
223
229
  collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
224
- emit(:set, :collation, *text(data, ts, te))
230
+ emit(:set, :collation, copy(data, ts, te))
225
231
  };
226
232
 
227
233
  character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
228
- emit(:set, :equivalent, *text(data, ts, te))
234
+ emit(:set, :equivalent, copy(data, ts, te))
229
235
  };
230
236
 
231
237
  meta_char > (set_meta, 1) {
232
- emit(:literal, :literal, *text(data, ts, te))
238
+ emit(:literal, :literal, copy(data, ts, te))
233
239
  };
234
240
 
235
241
  any |
@@ -237,9 +243,8 @@
237
243
  utf8_2_byte |
238
244
  utf8_3_byte |
239
245
  utf8_4_byte {
240
- char, *rest = *text(data, ts, te)
241
- char.force_encoding('utf-8') if char.respond_to?(:force_encoding)
242
- emit(:literal, :literal, char, *rest)
246
+ text = copy(data, ts, te)
247
+ emit(:literal, :literal, text)
243
248
  };
244
249
  *|;
245
250
 
@@ -247,7 +252,7 @@
247
252
  # --------------------------------------------------------------------------
248
253
  set_escape_sequence := |*
249
254
  non_set_escape > (escaped_set_alpha, 2) {
250
- emit(:escape, :literal, *text(data, ts, te, 1))
255
+ emit(:escape, :literal, copy(data, ts-1, te))
251
256
  fret;
252
257
  };
253
258
 
@@ -263,33 +268,33 @@
263
268
  # --------------------------------------------------------------------------
264
269
  escape_sequence := |*
265
270
  [1-9] {
266
- text = text(data, ts, te, 1).first
267
- emit(:backref, :number, text, ts-1, te)
271
+ text = copy(data, ts-1, te)
272
+ emit(:backref, :number, text)
268
273
  fret;
269
274
  };
270
275
 
271
276
  octal_sequence {
272
- emit(:escape, :octal, *text(data, ts, te, 1))
277
+ emit(:escape, :octal, copy(data, ts-1, te))
273
278
  fret;
274
279
  };
275
280
 
276
281
  meta_char {
277
- case text = text(data, ts, te, 1).first
278
- when '\.'; emit(:escape, :dot, text, ts-1, te)
279
- when '\|'; emit(:escape, :alternation, text, ts-1, te)
280
- when '\^'; emit(:escape, :bol, text, ts-1, te)
281
- when '\$'; emit(:escape, :eol, text, ts-1, te)
282
- when '\?'; emit(:escape, :zero_or_one, text, ts-1, te)
283
- when '\*'; emit(:escape, :zero_or_more, text, ts-1, te)
284
- when '\+'; emit(:escape, :one_or_more, text, ts-1, te)
285
- when '\('; emit(:escape, :group_open, text, ts-1, te)
286
- when '\)'; emit(:escape, :group_close, text, ts-1, te)
287
- when '\{'; emit(:escape, :interval_open, text, ts-1, te)
288
- when '\}'; emit(:escape, :interval_close, text, ts-1, te)
289
- when '\['; emit(:escape, :set_open, text, ts-1, te)
290
- when '\]'; emit(:escape, :set_close, text, ts-1, te)
282
+ case text = copy(data, ts-1, te)
283
+ when '\.'; emit(:escape, :dot, text)
284
+ when '\|'; emit(:escape, :alternation, text)
285
+ when '\^'; emit(:escape, :bol, text)
286
+ when '\$'; emit(:escape, :eol, text)
287
+ when '\?'; emit(:escape, :zero_or_one, text)
288
+ when '\*'; emit(:escape, :zero_or_more, text)
289
+ when '\+'; emit(:escape, :one_or_more, text)
290
+ when '\('; emit(:escape, :group_open, text)
291
+ when '\)'; emit(:escape, :group_close, text)
292
+ when '\{'; emit(:escape, :interval_open, text)
293
+ when '\}'; emit(:escape, :interval_close, text)
294
+ when '\['; emit(:escape, :set_open, text)
295
+ when '\]'; emit(:escape, :set_close, text)
291
296
  when "\\\\";
292
- emit(:escape, :backslash, text, ts-1, te)
297
+ emit(:escape, :backslash, text)
293
298
  end
294
299
  fret;
295
300
  };
@@ -297,31 +302,31 @@
297
302
  escaped_ascii > (escaped_alpha, 7) {
298
303
  # \b is emitted as backspace only when inside a character set, otherwise
299
304
  # it is a word boundary anchor. A syntax might "normalize" it if needed.
300
- case text = text(data, ts, te, 1).first
301
- when '\a'; emit(:escape, :bell, text, ts-1, te)
302
- when '\b'; emit(:escape, :backspace, text, ts-1, te)
303
- when '\e'; emit(:escape, :escape, text, ts-1, te)
304
- when '\f'; emit(:escape, :form_feed, text, ts-1, te)
305
- when '\n'; emit(:escape, :newline, text, ts-1, te)
306
- when '\r'; emit(:escape, :carriage, text, ts-1, te)
307
- when '\t'; emit(:escape, :tab, text, ts-1, te)
308
- when '\v'; emit(:escape, :vertical_tab, text, ts-1, te)
305
+ case text = copy(data, ts-1, te)
306
+ when '\a'; emit(:escape, :bell, text)
307
+ when '\b'; emit(:escape, :backspace, text)
308
+ when '\e'; emit(:escape, :escape, text)
309
+ when '\f'; emit(:escape, :form_feed, text)
310
+ when '\n'; emit(:escape, :newline, text)
311
+ when '\r'; emit(:escape, :carriage, text)
312
+ when '\t'; emit(:escape, :tab, text)
313
+ when '\v'; emit(:escape, :vertical_tab, text)
309
314
  end
310
315
  fret;
311
316
  };
312
317
 
313
318
  codepoint_sequence > (escaped_alpha, 6) $eof(premature_end_error) {
314
- text = text(data, ts, te, 1).first
319
+ text = copy(data, ts-1, te)
315
320
  if text[2].chr == '{'
316
- emit(:escape, :codepoint_list, text, ts-1, te)
321
+ emit(:escape, :codepoint_list, text)
317
322
  else
318
- emit(:escape, :codepoint, text, ts-1, te)
323
+ emit(:escape, :codepoint, text)
319
324
  end
320
325
  fret;
321
326
  };
322
327
 
323
328
  hex_sequence > (escaped_alpha, 5) $eof(premature_end_error) {
324
- emit(:escape, :hex, *text(data, ts, te, 1))
329
+ emit(:escape, :hex, copy(data, ts-1, te))
325
330
  fret;
326
331
  };
327
332
 
@@ -351,8 +356,11 @@
351
356
  fcall unicode_property;
352
357
  };
353
358
 
354
- (any -- non_literal_escape) > (escaped_alpha, 1) {
355
- emit(:escape, :literal, *text(data, ts, te, 1))
359
+ (any -- non_literal_escape) |
360
+ utf8_2_byte |
361
+ utf8_3_byte |
362
+ utf8_4_byte > (escaped_alpha, 1) {
363
+ emit(:escape, :literal, copy(data, ts-1, te))
356
364
  fret;
357
365
  };
358
366
  *|;
@@ -362,9 +370,9 @@
362
370
  # --------------------------------------------------------------------------
363
371
  conditional_expression := |*
364
372
  group_lookup . ')' {
365
- text = text(data, ts, te-1).first
366
- emit(:conditional, :condition, text, ts, te-1)
367
- emit(:conditional, :condition_close, ')', te-1, te)
373
+ text = copy(data, ts, te-1)
374
+ emit(:conditional, :condition, text)
375
+ emit(:conditional, :condition_close, ')')
368
376
  };
369
377
 
370
378
  any {
@@ -381,46 +389,50 @@
381
389
  # Meta characters
382
390
  # ------------------------------------------------------------------------
383
391
  dot {
384
- emit(:meta, :dot, *text(data, ts, te))
392
+ emit(:meta, :dot, copy(data, ts, te))
385
393
  };
386
394
 
387
395
  alternation {
388
396
  if conditional_stack.last == group_depth
389
- emit(:conditional, :separator, *text(data, ts, te))
397
+ emit(:conditional, :separator, copy(data, ts, te))
390
398
  else
391
- emit(:meta, :alternation, *text(data, ts, te))
399
+ emit(:meta, :alternation, copy(data, ts, te))
392
400
  end
393
401
  };
394
402
 
395
403
  # Anchors
396
404
  # ------------------------------------------------------------------------
397
405
  beginning_of_line {
398
- emit(:anchor, :bol, *text(data, ts, te))
406
+ emit(:anchor, :bol, copy(data, ts, te))
399
407
  };
400
408
 
401
409
  end_of_line {
402
- emit(:anchor, :eol, *text(data, ts, te))
410
+ emit(:anchor, :eol, copy(data, ts, te))
403
411
  };
404
412
 
405
413
  backslash . keep_mark > (backslashed, 4) {
406
- emit(:keep, :mark, *text(data, ts, te))
414
+ emit(:keep, :mark, copy(data, ts, te))
407
415
  };
408
416
 
409
417
  backslash . anchor_char > (backslashed, 3) {
410
- case text = text(data, ts, te).first
411
- when '\\A'; emit(:anchor, :bos, text, ts, te)
412
- when '\\z'; emit(:anchor, :eos, text, ts, te)
413
- when '\\Z'; emit(:anchor, :eos_ob_eol, text, ts, te)
414
- when '\\b'; emit(:anchor, :word_boundary, text, ts, te)
415
- when '\\B'; emit(:anchor, :nonword_boundary, text, ts, te)
416
- when '\\G'; emit(:anchor, :match_start, text, ts, te)
418
+ case text = copy(data, ts, te)
419
+ when '\\A'; emit(:anchor, :bos, text)
420
+ when '\\z'; emit(:anchor, :eos, text)
421
+ when '\\Z'; emit(:anchor, :eos_ob_eol, text)
422
+ when '\\b'; emit(:anchor, :word_boundary, text)
423
+ when '\\B'; emit(:anchor, :nonword_boundary, text)
424
+ when '\\G'; emit(:anchor, :match_start, text)
417
425
  end
418
426
  };
419
427
 
428
+ literal_delimiters {
429
+ append_literal(data, ts, te)
430
+ };
431
+
420
432
  # Character sets
421
433
  # ------------------------------------------------------------------------
422
434
  set_open >set_opened {
423
- emit(:set, :open, *text(data, ts, te))
435
+ emit(:set, :open, copy(data, ts, te))
424
436
  fcall character_set;
425
437
  };
426
438
 
@@ -429,12 +441,12 @@
429
441
  # (?(condition)Y|N) conditional expression
430
442
  # ------------------------------------------------------------------------
431
443
  conditional {
432
- text = text(data, ts, te).first
444
+ text = copy(data, ts, te)
433
445
 
434
446
  conditional_stack << group_depth
435
447
 
436
- emit(:conditional, :open, text[0..-2], ts, te-1)
437
- emit(:conditional, :condition_open, '(', te-1, te)
448
+ emit(:conditional, :open, text[0..-2])
449
+ emit(:conditional, :condition_open, '(')
438
450
  fcall conditional_expression;
439
451
  };
440
452
 
@@ -445,7 +457,7 @@
445
457
  # correct closing count.
446
458
  # ------------------------------------------------------------------------
447
459
  group_open . group_comment $group_closed {
448
- emit(:group, :comment, *text(data, ts, te))
460
+ emit(:group, :comment, copy(data, ts, te))
449
461
  };
450
462
 
451
463
  # Expression options:
@@ -460,11 +472,11 @@
460
472
  # (?imxdau-imx:subexp) option on/off for subexp
461
473
  # ------------------------------------------------------------------------
462
474
  group_open . group_options >group_opened {
463
- text = text(data, ts, te).first
475
+ text = copy(data, ts, te)
464
476
  if text[2..-1] =~ /([^\-mixdau:]|^$)|-.*([dau])/
465
477
  raise InvalidGroupOption.new($1 || "-#{$2}", text)
466
478
  end
467
- emit_options(text, ts, te)
479
+ emit_options(text)
468
480
  };
469
481
 
470
482
  # Assertions
@@ -474,11 +486,11 @@
474
486
  # (?<!subexp) negative look-behind
475
487
  # ------------------------------------------------------------------------
476
488
  group_open . assertion_type >group_opened {
477
- case text = text(data, ts, te).first
478
- when '(?='; emit(:assertion, :lookahead, text, ts, te)
479
- when '(?!'; emit(:assertion, :nlookahead, text, ts, te)
480
- when '(?<='; emit(:assertion, :lookbehind, text, ts, te)
481
- when '(?<!'; emit(:assertion, :nlookbehind, text, ts, te)
489
+ case text = copy(data, ts, te)
490
+ when '(?='; emit(:assertion, :lookahead, text)
491
+ when '(?!'; emit(:assertion, :nlookahead, text)
492
+ when '(?<='; emit(:assertion, :lookbehind, text)
493
+ when '(?<!'; emit(:assertion, :nlookbehind, text)
482
494
  end
483
495
  };
484
496
 
@@ -491,32 +503,32 @@
491
503
  # (subexp) captured group
492
504
  # ------------------------------------------------------------------------
493
505
  group_open . group_type >group_opened {
494
- case text = text(data, ts, te).first
495
- when '(?:'; emit(:group, :passive, text, ts, te)
496
- when '(?>'; emit(:group, :atomic, text, ts, te)
497
- when '(?~'; emit(:group, :absence, text, ts, te)
506
+ case text = copy(data, ts, te)
507
+ when '(?:'; emit(:group, :passive, text)
508
+ when '(?>'; emit(:group, :atomic, text)
509
+ when '(?~'; emit(:group, :absence, text)
498
510
 
499
511
  when /^\(\?(?:<>|'')/
500
512
  validation_error(:group, 'named group', 'name is empty')
501
513
 
502
514
  when /^\(\?<\w*>/
503
- emit(:group, :named_ab, text, ts, te)
515
+ emit(:group, :named_ab, text)
504
516
 
505
517
  when /^\(\?'\w*'/
506
- emit(:group, :named_sq, text, ts, te)
518
+ emit(:group, :named_sq, text)
507
519
 
508
520
  end
509
521
  };
510
522
 
511
523
  group_open @group_opened {
512
- text = text(data, ts, te).first
513
- emit(:group, :capture, text, ts, te)
524
+ text = copy(data, ts, te)
525
+ emit(:group, :capture, text)
514
526
  };
515
527
 
516
528
  group_close @group_closed {
517
529
  if conditional_stack.last == group_depth + 1
518
530
  conditional_stack.pop
519
- emit(:conditional, :close, *text(data, ts, te))
531
+ emit(:conditional, :close, copy(data, ts, te))
520
532
  else
521
533
  if spacing_stack.length > 1 &&
522
534
  spacing_stack.last[:depth] == group_depth + 1
@@ -524,7 +536,7 @@
524
536
  self.free_spacing = spacing_stack.last[:free_spacing]
525
537
  end
526
538
 
527
- emit(:group, :close, *text(data, ts, te))
539
+ emit(:group, :close, copy(data, ts, te))
528
540
  end
529
541
  };
530
542
 
@@ -532,63 +544,63 @@
532
544
  # Group backreference, named and numbered
533
545
  # ------------------------------------------------------------------------
534
546
  backslash . (group_name_ref | group_number_ref) > (backslashed, 4) {
535
- case text = text(data, ts, te).first
547
+ case text = copy(data, ts, te)
536
548
  when /^\\([gk])(<>|'')/ # angle brackets
537
549
  validation_error(:backref, 'ref/call', 'ref ID is empty')
538
550
 
539
551
  when /^\\([gk])<[^\d+-]\w*>/ # angle-brackets
540
552
  if $1 == 'k'
541
- emit(:backref, :name_ref_ab, text, ts, te)
553
+ emit(:backref, :name_ref_ab, text)
542
554
  else
543
- emit(:backref, :name_call_ab, text, ts, te)
555
+ emit(:backref, :name_call_ab, text)
544
556
  end
545
557
 
546
558
  when /^\\([gk])'[^\d+-]\w*'/ #single quotes
547
559
  if $1 == 'k'
548
- emit(:backref, :name_ref_sq, text, ts, te)
560
+ emit(:backref, :name_ref_sq, text)
549
561
  else
550
- emit(:backref, :name_call_sq, text, ts, te)
562
+ emit(:backref, :name_call_sq, text)
551
563
  end
552
564
 
553
565
  when /^\\([gk])<\d+>/ # angle-brackets
554
566
  if $1 == 'k'
555
- emit(:backref, :number_ref_ab, text, ts, te)
567
+ emit(:backref, :number_ref_ab, text)
556
568
  else
557
- emit(:backref, :number_call_ab, text, ts, te)
569
+ emit(:backref, :number_call_ab, text)
558
570
  end
559
571
 
560
572
  when /^\\([gk])'\d+'/ # single quotes
561
573
  if $1 == 'k'
562
- emit(:backref, :number_ref_sq, text, ts, te)
574
+ emit(:backref, :number_ref_sq, text)
563
575
  else
564
- emit(:backref, :number_call_sq, text, ts, te)
576
+ emit(:backref, :number_call_sq, text)
565
577
  end
566
578
 
567
579
  when /^\\(?:g<\+|g<-|(k)<-)\d+>/ # angle-brackets
568
580
  if $1 == 'k'
569
- emit(:backref, :number_rel_ref_ab, text, ts, te)
581
+ emit(:backref, :number_rel_ref_ab, text)
570
582
  else
571
- emit(:backref, :number_rel_call_ab, text, ts, te)
583
+ emit(:backref, :number_rel_call_ab, text)
572
584
  end
573
585
 
574
586
  when /^\\(?:g'\+|g'-|(k)'-)\d+'/ # single quotes
575
587
  if $1 == 'k'
576
- emit(:backref, :number_rel_ref_sq, text, ts, te)
588
+ emit(:backref, :number_rel_ref_sq, text)
577
589
  else
578
- emit(:backref, :number_rel_call_sq, text, ts, te)
590
+ emit(:backref, :number_rel_call_sq, text)
579
591
  end
580
592
 
581
593
  when /^\\k<[^\d+\-]\w*[+\-]\d+>/ # angle-brackets
582
- emit(:backref, :name_recursion_ref_ab, text, ts, te)
594
+ emit(:backref, :name_recursion_ref_ab, text)
583
595
 
584
596
  when /^\\k'[^\d+\-]\w*[+\-]\d+'/ # single-quotes
585
- emit(:backref, :name_recursion_ref_sq, text, ts, te)
597
+ emit(:backref, :name_recursion_ref_sq, text)
586
598
 
587
599
  when /^\\([gk])<[+\-]?\d+[+\-]\d+>/ # angle-brackets
588
- emit(:backref, :number_recursion_ref_ab, text, ts, te)
600
+ emit(:backref, :number_recursion_ref_ab, text)
589
601
 
590
602
  when /^\\([gk])'[+\-]?\d+[+\-]\d+'/ # single-quotes
591
- emit(:backref, :number_recursion_ref_sq, text, ts, te)
603
+ emit(:backref, :number_recursion_ref_sq, text)
592
604
 
593
605
  end
594
606
  };
@@ -597,31 +609,36 @@
597
609
  # Quantifiers
598
610
  # ------------------------------------------------------------------------
599
611
  zero_or_one {
600
- case text = text(data, ts, te).first
601
- when '?' ; emit(:quantifier, :zero_or_one, text, ts, te)
602
- when '??'; emit(:quantifier, :zero_or_one_reluctant, text, ts, te)
603
- when '?+'; emit(:quantifier, :zero_or_one_possessive, text, ts, te)
612
+ case text = copy(data, ts, te)
613
+ when '?' ; emit(:quantifier, :zero_or_one, text)
614
+ when '??'; emit(:quantifier, :zero_or_one_reluctant, text)
615
+ when '?+'; emit(:quantifier, :zero_or_one_possessive, text)
604
616
  end
605
617
  };
606
618
 
607
619
  zero_or_more {
608
- case text = text(data, ts, te).first
609
- when '*' ; emit(:quantifier, :zero_or_more, text, ts, te)
610
- when '*?'; emit(:quantifier, :zero_or_more_reluctant, text, ts, te)
611
- when '*+'; emit(:quantifier, :zero_or_more_possessive, text, ts, te)
620
+ case text = copy(data, ts, te)
621
+ when '*' ; emit(:quantifier, :zero_or_more, text)
622
+ when '*?'; emit(:quantifier, :zero_or_more_reluctant, text)
623
+ when '*+'; emit(:quantifier, :zero_or_more_possessive, text)
612
624
  end
613
625
  };
614
626
 
615
627
  one_or_more {
616
- case text = text(data, ts, te).first
617
- when '+' ; emit(:quantifier, :one_or_more, text, ts, te)
618
- when '+?'; emit(:quantifier, :one_or_more_reluctant, text, ts, te)
619
- when '++'; emit(:quantifier, :one_or_more_possessive, text, ts, te)
628
+ case text = copy(data, ts, te)
629
+ when '+' ; emit(:quantifier, :one_or_more, text)
630
+ when '+?'; emit(:quantifier, :one_or_more_reluctant, text)
631
+ when '++'; emit(:quantifier, :one_or_more_possessive, text)
620
632
  end
621
633
  };
622
634
 
623
- quantifier_interval @err(premature_end_error) {
624
- emit(:quantifier, :interval, *text(data, ts, te))
635
+ quantifier_interval {
636
+ emit(:quantifier, :interval, copy(data, ts, te))
637
+ };
638
+
639
+ # Catch unmatched curly braces as literals
640
+ range_open {
641
+ append_literal(data, ts, te)
625
642
  };
626
643
 
627
644
  # Escaped sequences
@@ -632,15 +649,17 @@
632
649
 
633
650
  comment {
634
651
  if free_spacing
635
- emit(:free_space, :comment, *text(data, ts, te))
652
+ emit(:free_space, :comment, copy(data, ts, te))
636
653
  else
637
- append_literal(data, ts, te)
654
+ # consume only the pound sign (#) and backtrack to do regular scanning
655
+ append_literal(data, ts, ts + 1)
656
+ fexec ts + 1;
638
657
  end
639
658
  };
640
659
 
641
660
  space+ {
642
661
  if free_spacing
643
- emit(:free_space, :whitespace, *text(data, ts, te))
662
+ emit(:free_space, :whitespace, copy(data, ts, te))
644
663
  else
645
664
  append_literal(data, ts, te)
646
665
  end
@@ -722,21 +741,16 @@ class Regexp::Scanner
722
741
  #
723
742
  # This method may raise errors if a syntax error is encountered.
724
743
  # --------------------------------------------------------------------------
725
- def self.scan(input_object, &block)
726
- new.scan(input_object, &block)
744
+ def self.scan(input_object, options: nil, &block)
745
+ new.scan(input_object, options: options, &block)
727
746
  end
728
747
 
729
- def scan(input_object, &block)
748
+ def scan(input_object, options: nil, &block)
730
749
  self.literal = nil
731
750
  stack = []
732
751
 
733
- if input_object.is_a?(Regexp)
734
- input = input_object.source
735
- self.free_spacing = (input_object.options & Regexp::EXTENDED != 0)
736
- else
737
- input = input_object
738
- self.free_spacing = false
739
- end
752
+ input = input_object.is_a?(Regexp) ? input_object.source : input_object
753
+ self.free_spacing = free_spacing?(input_object, options)
740
754
  self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
741
755
 
742
756
  data = input.unpack("c*") if input.is_a?(String)
@@ -748,6 +762,7 @@ class Regexp::Scanner
748
762
  self.set_depth = 0
749
763
  self.group_depth = 0
750
764
  self.conditional_stack = []
765
+ self.char_pos = 0
751
766
 
752
767
  %% write data;
753
768
  %% write init;
@@ -757,7 +772,7 @@ class Regexp::Scanner
757
772
  testEof = testEof
758
773
 
759
774
  if cs == re_scanner_error
760
- text = ts ? copy(data, ts-1..-1) : data.pack('c*')
775
+ text = copy(data, ts ? ts-1 : 0, -1)
761
776
  raise ScannerError.new("Scan error at '#{text}'")
762
777
  end
763
778
 
@@ -785,22 +800,41 @@ class Regexp::Scanner
785
800
  end
786
801
 
787
802
  # Emits an array with the details of the scanned pattern
788
- def emit(type, token, text, ts, te)
803
+ def emit(type, token, text)
789
804
  #puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
790
805
 
791
806
  emit_literal if literal
792
807
 
808
+ # Ragel runs with byte-based indices (ts, te). These are of little value to
809
+ # end-users, so we keep track of char-based indices and emit those instead.
810
+ ts_char_pos = char_pos
811
+ te_char_pos = char_pos + text.length
812
+
793
813
  if block
794
- block.call type, token, text, ts, te
814
+ block.call type, token, text, ts_char_pos, te_char_pos
795
815
  end
796
816
 
797
- tokens << [type, token, text, ts, te]
817
+ tokens << [type, token, text, ts_char_pos, te_char_pos]
818
+
819
+ self.char_pos = te_char_pos
798
820
  end
799
821
 
800
822
  private
801
823
 
802
824
  attr_accessor :tokens, :literal, :block, :free_spacing, :spacing_stack,
803
- :group_depth, :set_depth, :conditional_stack
825
+ :group_depth, :set_depth, :conditional_stack, :char_pos
826
+
827
+ def free_spacing?(input_object, options)
828
+ if options && !input_object.is_a?(String)
829
+ raise ArgumentError, 'options cannot be supplied unless scanning a String'
830
+ end
831
+
832
+ options = input_object.options if input_object.is_a?(::Regexp)
833
+
834
+ return false unless options
835
+
836
+ options & Regexp::EXTENDED != 0
837
+ end
804
838
 
805
839
  def in_group?
806
840
  group_depth > 0
@@ -811,36 +845,25 @@ class Regexp::Scanner
811
845
  end
812
846
 
813
847
  # Copy from ts to te from data as text
814
- def copy(data, range)
815
- data[range].pack('c*')
816
- end
817
-
818
- # Copy from ts to te from data as text, returning an array with the text
819
- # and the offsets used to copy it.
820
- def text(data, ts, te, soff = 0)
821
- [copy(data, ts-soff..te-1), ts-soff, te]
848
+ def copy(data, ts, te)
849
+ data[ts...te].pack('c*').force_encoding('utf-8')
822
850
  end
823
851
 
824
852
  # Appends one or more characters to the literal buffer, to be emitted later
825
- # by a call to emit_literal. Contents can be a mix of ASCII and UTF-8.
853
+ # by a call to emit_literal.
826
854
  def append_literal(data, ts, te)
827
855
  self.literal = literal || []
828
- literal << text(data, ts, te)
856
+ literal << copy(data, ts, te)
829
857
  end
830
858
 
831
- # Emits the literal run collected by calls to the append_literal method,
832
- # using the total start (ts) and end (te) offsets of the run.
859
+ # Emits the literal run collected by calls to the append_literal method.
833
860
  def emit_literal
834
- ts, te = literal.first[1], literal.last[2]
835
- text = literal.map {|t| t[0]}.join
836
-
837
- text.force_encoding('utf-8') if text.respond_to?(:force_encoding)
838
-
861
+ text = literal.join
839
862
  self.literal = nil
840
- emit(:literal, :literal, text, ts, te)
863
+ emit(:literal, :literal, text)
841
864
  end
842
865
 
843
- def emit_options(text, ts, te)
866
+ def emit_options(text)
844
867
  token = nil
845
868
 
846
869
  # Ruby allows things like '(?-xxxx)' or '(?xx-xx--xx-:abc)'.
@@ -866,14 +889,14 @@ class Regexp::Scanner
866
889
  token = :options_switch
867
890
  end
868
891
 
869
- emit(:group, token, text, ts, te)
892
+ emit(:group, token, text)
870
893
  end
871
894
 
872
895
  def emit_meta_control_sequence(data, ts, te, token)
873
896
  if data.last < 0x00 || data.last > 0x7F
874
897
  validation_error(:sequence, 'escape', token.to_s)
875
898
  end
876
- emit(:escape, token, *text(data, ts, te, 1))
899
+ emit(:escape, token, copy(data, ts-1, te))
877
900
  end
878
901
 
879
902
  # Centralizes and unifies the handling of validation related