regexp_parser 1.8.2 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -10,17 +10,17 @@
10
10
  # --------------------------------------------------------------------------
11
11
  char_type := |*
12
12
  char_type_char {
13
- case text = text(data, ts, te, 1).first
14
- when '\d'; emit(:type, :digit, text, ts - 1, te)
15
- when '\D'; emit(:type, :nondigit, text, ts - 1, te)
16
- when '\h'; emit(:type, :hex, text, ts - 1, te)
17
- when '\H'; emit(:type, :nonhex, text, ts - 1, te)
18
- when '\s'; emit(:type, :space, text, ts - 1, te)
19
- when '\S'; emit(:type, :nonspace, text, ts - 1, te)
20
- when '\w'; emit(:type, :word, text, ts - 1, te)
21
- when '\W'; emit(:type, :nonword, text, ts - 1, te)
22
- when '\R'; emit(:type, :linebreak, text, ts - 1, te)
23
- when '\X'; emit(:type, :xgrapheme, text, ts - 1, te)
13
+ case text = copy(data, ts-1, te)
14
+ when '\d'; emit(:type, :digit, text)
15
+ when '\D'; emit(:type, :nondigit, text)
16
+ when '\h'; emit(:type, :hex, text)
17
+ when '\H'; emit(:type, :nonhex, text)
18
+ when '\s'; emit(:type, :space, text)
19
+ when '\S'; emit(:type, :nonspace, text)
20
+ when '\w'; emit(:type, :word, text)
21
+ when '\W'; emit(:type, :nonword, text)
22
+ when '\R'; emit(:type, :linebreak, text)
23
+ when '\X'; emit(:type, :xgrapheme, text)
24
24
  end
25
25
  fret;
26
26
  };
@@ -14,7 +14,7 @@
14
14
  unicode_property := |*
15
15
 
16
16
  property_sequence < eof(premature_property_end) {
17
- text = text(data, ts, te, 1).first
17
+ text = copy(data, ts-1, te)
18
18
  type = (text[1] == 'P') ^ (text[3] == '^') ? :nonproperty : :property
19
19
 
20
20
  name = data[ts+2..te-2].pack('c*').gsub(/[\^\s_\-]/, '').downcase
@@ -22,7 +22,7 @@
22
22
  token = self.class.short_prop_map[name] || self.class.long_prop_map[name]
23
23
  raise UnknownUnicodePropertyError.new(name) unless token
24
24
 
25
- self.emit(type, token.to_sym, text, ts-1, te)
25
+ self.emit(type, token.to_sym, text)
26
26
 
27
27
  fret;
28
28
  };
@@ -135,13 +135,13 @@
135
135
 
136
136
  # EOF error, used where it can be detected
137
137
  action premature_end_error {
138
- text = ts ? copy(data, ts-1..-1) : data.pack('c*')
138
+ text = copy(data, ts ? ts-1 : 0, -1)
139
139
  raise PrematureEndError.new( text )
140
140
  }
141
141
 
142
142
  # Invalid sequence error, used from sequences, like escapes and sets
143
143
  action invalid_sequence_error {
144
- text = ts ? copy(data, ts-1..-1) : data.pack('c*')
144
+ text = copy(data, ts ? ts-1 : 0, -1)
145
145
  validation_error(:sequence, 'sequence', text)
146
146
  }
147
147
 
@@ -156,7 +156,7 @@
156
156
  # --------------------------------------------------------------------------
157
157
  character_set := |*
158
158
  set_close > (set_meta, 2) @set_closed {
159
- emit(:set, :close, *text(data, ts, te))
159
+ emit(:set, :close, copy(data, ts, te))
160
160
  if in_set?
161
161
  fret;
162
162
  else
@@ -165,8 +165,8 @@
165
165
  };
166
166
 
167
167
  '-]' @set_closed { # special case, emits two tokens
168
- emit(:literal, :literal, copy(data, ts..te-2), ts, te - 1)
169
- emit(:set, :close, copy(data, ts+1..te-1), ts + 1, te)
168
+ emit(:literal, :literal, copy(data, ts, te-1))
169
+ emit(:set, :close, copy(data, ts+1, te))
170
170
  if in_set?
171
171
  fret;
172
172
  else
@@ -175,33 +175,33 @@
175
175
  };
176
176
 
177
177
  '-&&' { # special case, emits two tokens
178
- emit(:literal, :literal, '-', ts, te)
179
- emit(:set, :intersection, '&&', ts, te)
178
+ emit(:literal, :literal, '-')
179
+ emit(:set, :intersection, '&&')
180
180
  };
181
181
 
182
182
  '^' {
183
- text = text(data, ts, te).first
183
+ text = copy(data, ts, te)
184
184
  if tokens.last[1] == :open
185
- emit(:set, :negate, text, ts, te)
185
+ emit(:set, :negate, text)
186
186
  else
187
- emit(:literal, :literal, text, ts, te)
187
+ emit(:literal, :literal, text)
188
188
  end
189
189
  };
190
190
 
191
191
  '-' {
192
- text = text(data, ts, te).first
192
+ text = copy(data, ts, te)
193
193
  # ranges can't start with a subset or intersection/negation/range operator
194
194
  if tokens.last[0] == :set
195
- emit(:literal, :literal, text, ts, te)
195
+ emit(:literal, :literal, text)
196
196
  else
197
- emit(:set, :range, text, ts, te)
197
+ emit(:set, :range, text)
198
198
  end
199
199
  };
200
200
 
201
201
  # Unlike ranges, intersections can start or end at set boundaries, whereupon
202
202
  # they match nothing: r = /[a&&]/; [r =~ ?a, r =~ ?&] # => [nil, nil]
203
203
  '&&' {
204
- emit(:set, :intersection, *text(data, ts, te))
204
+ emit(:set, :intersection, copy(data, ts, te))
205
205
  };
206
206
 
207
207
  backslash {
@@ -209,12 +209,12 @@
209
209
  };
210
210
 
211
211
  set_open >(open_bracket, 1) >set_opened {
212
- emit(:set, :open, *text(data, ts, te))
212
+ emit(:set, :open, copy(data, ts, te))
213
213
  fcall character_set;
214
214
  };
215
215
 
216
216
  class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error) {
217
- text = text(data, ts, te).first
217
+ text = copy(data, ts, te)
218
218
 
219
219
  type = :posixclass
220
220
  class_name = text[2..-3]
@@ -223,19 +223,19 @@
223
223
  type = :nonposixclass
224
224
  end
225
225
 
226
- emit(type, class_name.to_sym, text, ts, te)
226
+ emit(type, class_name.to_sym, text)
227
227
  };
228
228
 
229
229
  collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
230
- emit(:set, :collation, *text(data, ts, te))
230
+ emit(:set, :collation, copy(data, ts, te))
231
231
  };
232
232
 
233
233
  character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
234
- emit(:set, :equivalent, *text(data, ts, te))
234
+ emit(:set, :equivalent, copy(data, ts, te))
235
235
  };
236
236
 
237
237
  meta_char > (set_meta, 1) {
238
- emit(:literal, :literal, *text(data, ts, te))
238
+ emit(:literal, :literal, copy(data, ts, te))
239
239
  };
240
240
 
241
241
  any |
@@ -243,9 +243,8 @@
243
243
  utf8_2_byte |
244
244
  utf8_3_byte |
245
245
  utf8_4_byte {
246
- char, *rest = *text(data, ts, te)
247
- char.force_encoding('utf-8') if char.respond_to?(:force_encoding)
248
- emit(:literal, :literal, char, *rest)
246
+ text = copy(data, ts, te)
247
+ emit(:literal, :literal, text)
249
248
  };
250
249
  *|;
251
250
 
@@ -253,7 +252,7 @@
253
252
  # --------------------------------------------------------------------------
254
253
  set_escape_sequence := |*
255
254
  non_set_escape > (escaped_set_alpha, 2) {
256
- emit(:escape, :literal, *text(data, ts, te, 1))
255
+ emit(:escape, :literal, copy(data, ts-1, te))
257
256
  fret;
258
257
  };
259
258
 
@@ -269,33 +268,33 @@
269
268
  # --------------------------------------------------------------------------
270
269
  escape_sequence := |*
271
270
  [1-9] {
272
- text = text(data, ts, te, 1).first
273
- emit(:backref, :number, text, ts-1, te)
271
+ text = copy(data, ts-1, te)
272
+ emit(:backref, :number, text)
274
273
  fret;
275
274
  };
276
275
 
277
276
  octal_sequence {
278
- emit(:escape, :octal, *text(data, ts, te, 1))
277
+ emit(:escape, :octal, copy(data, ts-1, te))
279
278
  fret;
280
279
  };
281
280
 
282
281
  meta_char {
283
- case text = text(data, ts, te, 1).first
284
- when '\.'; emit(:escape, :dot, text, ts-1, te)
285
- when '\|'; emit(:escape, :alternation, text, ts-1, te)
286
- when '\^'; emit(:escape, :bol, text, ts-1, te)
287
- when '\$'; emit(:escape, :eol, text, ts-1, te)
288
- when '\?'; emit(:escape, :zero_or_one, text, ts-1, te)
289
- when '\*'; emit(:escape, :zero_or_more, text, ts-1, te)
290
- when '\+'; emit(:escape, :one_or_more, text, ts-1, te)
291
- when '\('; emit(:escape, :group_open, text, ts-1, te)
292
- when '\)'; emit(:escape, :group_close, text, ts-1, te)
293
- when '\{'; emit(:escape, :interval_open, text, ts-1, te)
294
- when '\}'; emit(:escape, :interval_close, text, ts-1, te)
295
- when '\['; emit(:escape, :set_open, text, ts-1, te)
296
- when '\]'; emit(:escape, :set_close, text, ts-1, te)
282
+ case text = copy(data, ts-1, te)
283
+ when '\.'; emit(:escape, :dot, text)
284
+ when '\|'; emit(:escape, :alternation, text)
285
+ when '\^'; emit(:escape, :bol, text)
286
+ when '\$'; emit(:escape, :eol, text)
287
+ when '\?'; emit(:escape, :zero_or_one, text)
288
+ when '\*'; emit(:escape, :zero_or_more, text)
289
+ when '\+'; emit(:escape, :one_or_more, text)
290
+ when '\('; emit(:escape, :group_open, text)
291
+ when '\)'; emit(:escape, :group_close, text)
292
+ when '\{'; emit(:escape, :interval_open, text)
293
+ when '\}'; emit(:escape, :interval_close, text)
294
+ when '\['; emit(:escape, :set_open, text)
295
+ when '\]'; emit(:escape, :set_close, text)
297
296
  when "\\\\";
298
- emit(:escape, :backslash, text, ts-1, te)
297
+ emit(:escape, :backslash, text)
299
298
  end
300
299
  fret;
301
300
  };
@@ -303,31 +302,31 @@
303
302
  escaped_ascii > (escaped_alpha, 7) {
304
303
  # \b is emitted as backspace only when inside a character set, otherwise
305
304
  # it is a word boundary anchor. A syntax might "normalize" it if needed.
306
- case text = text(data, ts, te, 1).first
307
- when '\a'; emit(:escape, :bell, text, ts-1, te)
308
- when '\b'; emit(:escape, :backspace, text, ts-1, te)
309
- when '\e'; emit(:escape, :escape, text, ts-1, te)
310
- when '\f'; emit(:escape, :form_feed, text, ts-1, te)
311
- when '\n'; emit(:escape, :newline, text, ts-1, te)
312
- when '\r'; emit(:escape, :carriage, text, ts-1, te)
313
- when '\t'; emit(:escape, :tab, text, ts-1, te)
314
- when '\v'; emit(:escape, :vertical_tab, text, ts-1, te)
305
+ case text = copy(data, ts-1, te)
306
+ when '\a'; emit(:escape, :bell, text)
307
+ when '\b'; emit(:escape, :backspace, text)
308
+ when '\e'; emit(:escape, :escape, text)
309
+ when '\f'; emit(:escape, :form_feed, text)
310
+ when '\n'; emit(:escape, :newline, text)
311
+ when '\r'; emit(:escape, :carriage, text)
312
+ when '\t'; emit(:escape, :tab, text)
313
+ when '\v'; emit(:escape, :vertical_tab, text)
315
314
  end
316
315
  fret;
317
316
  };
318
317
 
319
318
  codepoint_sequence > (escaped_alpha, 6) $eof(premature_end_error) {
320
- text = text(data, ts, te, 1).first
319
+ text = copy(data, ts-1, te)
321
320
  if text[2].chr == '{'
322
- emit(:escape, :codepoint_list, text, ts-1, te)
321
+ emit(:escape, :codepoint_list, text)
323
322
  else
324
- emit(:escape, :codepoint, text, ts-1, te)
323
+ emit(:escape, :codepoint, text)
325
324
  end
326
325
  fret;
327
326
  };
328
327
 
329
328
  hex_sequence > (escaped_alpha, 5) $eof(premature_end_error) {
330
- emit(:escape, :hex, *text(data, ts, te, 1))
329
+ emit(:escape, :hex, copy(data, ts-1, te))
331
330
  fret;
332
331
  };
333
332
 
@@ -357,8 +356,11 @@
357
356
  fcall unicode_property;
358
357
  };
359
358
 
360
- (any -- non_literal_escape) > (escaped_alpha, 1) {
361
- emit(:escape, :literal, *text(data, ts, te, 1))
359
+ (any -- non_literal_escape) |
360
+ utf8_2_byte |
361
+ utf8_3_byte |
362
+ utf8_4_byte > (escaped_alpha, 1) {
363
+ emit(:escape, :literal, copy(data, ts-1, te))
362
364
  fret;
363
365
  };
364
366
  *|;
@@ -368,9 +370,9 @@
368
370
  # --------------------------------------------------------------------------
369
371
  conditional_expression := |*
370
372
  group_lookup . ')' {
371
- text = text(data, ts, te-1).first
372
- emit(:conditional, :condition, text, ts, te-1)
373
- emit(:conditional, :condition_close, ')', te-1, te)
373
+ text = copy(data, ts, te-1)
374
+ emit(:conditional, :condition, text)
375
+ emit(:conditional, :condition_close, ')')
374
376
  };
375
377
 
376
378
  any {
@@ -387,39 +389,39 @@
387
389
  # Meta characters
388
390
  # ------------------------------------------------------------------------
389
391
  dot {
390
- emit(:meta, :dot, *text(data, ts, te))
392
+ emit(:meta, :dot, copy(data, ts, te))
391
393
  };
392
394
 
393
395
  alternation {
394
396
  if conditional_stack.last == group_depth
395
- emit(:conditional, :separator, *text(data, ts, te))
397
+ emit(:conditional, :separator, copy(data, ts, te))
396
398
  else
397
- emit(:meta, :alternation, *text(data, ts, te))
399
+ emit(:meta, :alternation, copy(data, ts, te))
398
400
  end
399
401
  };
400
402
 
401
403
  # Anchors
402
404
  # ------------------------------------------------------------------------
403
405
  beginning_of_line {
404
- emit(:anchor, :bol, *text(data, ts, te))
406
+ emit(:anchor, :bol, copy(data, ts, te))
405
407
  };
406
408
 
407
409
  end_of_line {
408
- emit(:anchor, :eol, *text(data, ts, te))
410
+ emit(:anchor, :eol, copy(data, ts, te))
409
411
  };
410
412
 
411
413
  backslash . keep_mark > (backslashed, 4) {
412
- emit(:keep, :mark, *text(data, ts, te))
414
+ emit(:keep, :mark, copy(data, ts, te))
413
415
  };
414
416
 
415
417
  backslash . anchor_char > (backslashed, 3) {
416
- case text = text(data, ts, te).first
417
- when '\\A'; emit(:anchor, :bos, text, ts, te)
418
- when '\\z'; emit(:anchor, :eos, text, ts, te)
419
- when '\\Z'; emit(:anchor, :eos_ob_eol, text, ts, te)
420
- when '\\b'; emit(:anchor, :word_boundary, text, ts, te)
421
- when '\\B'; emit(:anchor, :nonword_boundary, text, ts, te)
422
- when '\\G'; emit(:anchor, :match_start, text, ts, te)
418
+ case text = copy(data, ts, te)
419
+ when '\\A'; emit(:anchor, :bos, text)
420
+ when '\\z'; emit(:anchor, :eos, text)
421
+ when '\\Z'; emit(:anchor, :eos_ob_eol, text)
422
+ when '\\b'; emit(:anchor, :word_boundary, text)
423
+ when '\\B'; emit(:anchor, :nonword_boundary, text)
424
+ when '\\G'; emit(:anchor, :match_start, text)
423
425
  end
424
426
  };
425
427
 
@@ -430,7 +432,7 @@
430
432
  # Character sets
431
433
  # ------------------------------------------------------------------------
432
434
  set_open >set_opened {
433
- emit(:set, :open, *text(data, ts, te))
435
+ emit(:set, :open, copy(data, ts, te))
434
436
  fcall character_set;
435
437
  };
436
438
 
@@ -439,12 +441,12 @@
439
441
  # (?(condition)Y|N) conditional expression
440
442
  # ------------------------------------------------------------------------
441
443
  conditional {
442
- text = text(data, ts, te).first
444
+ text = copy(data, ts, te)
443
445
 
444
446
  conditional_stack << group_depth
445
447
 
446
- emit(:conditional, :open, text[0..-2], ts, te-1)
447
- emit(:conditional, :condition_open, '(', te-1, te)
448
+ emit(:conditional, :open, text[0..-2])
449
+ emit(:conditional, :condition_open, '(')
448
450
  fcall conditional_expression;
449
451
  };
450
452
 
@@ -455,7 +457,7 @@
455
457
  # correct closing count.
456
458
  # ------------------------------------------------------------------------
457
459
  group_open . group_comment $group_closed {
458
- emit(:group, :comment, *text(data, ts, te))
460
+ emit(:group, :comment, copy(data, ts, te))
459
461
  };
460
462
 
461
463
  # Expression options:
@@ -470,11 +472,11 @@
470
472
  # (?imxdau-imx:subexp) option on/off for subexp
471
473
  # ------------------------------------------------------------------------
472
474
  group_open . group_options >group_opened {
473
- text = text(data, ts, te).first
475
+ text = copy(data, ts, te)
474
476
  if text[2..-1] =~ /([^\-mixdau:]|^$)|-.*([dau])/
475
477
  raise InvalidGroupOption.new($1 || "-#{$2}", text)
476
478
  end
477
- emit_options(text, ts, te)
479
+ emit_options(text)
478
480
  };
479
481
 
480
482
  # Assertions
@@ -484,11 +486,11 @@
484
486
  # (?<!subexp) negative look-behind
485
487
  # ------------------------------------------------------------------------
486
488
  group_open . assertion_type >group_opened {
487
- case text = text(data, ts, te).first
488
- when '(?='; emit(:assertion, :lookahead, text, ts, te)
489
- when '(?!'; emit(:assertion, :nlookahead, text, ts, te)
490
- when '(?<='; emit(:assertion, :lookbehind, text, ts, te)
491
- when '(?<!'; emit(:assertion, :nlookbehind, text, ts, te)
489
+ case text = copy(data, ts, te)
490
+ when '(?='; emit(:assertion, :lookahead, text)
491
+ when '(?!'; emit(:assertion, :nlookahead, text)
492
+ when '(?<='; emit(:assertion, :lookbehind, text)
493
+ when '(?<!'; emit(:assertion, :nlookbehind, text)
492
494
  end
493
495
  };
494
496
 
@@ -501,32 +503,32 @@
501
503
  # (subexp) captured group
502
504
  # ------------------------------------------------------------------------
503
505
  group_open . group_type >group_opened {
504
- case text = text(data, ts, te).first
505
- when '(?:'; emit(:group, :passive, text, ts, te)
506
- when '(?>'; emit(:group, :atomic, text, ts, te)
507
- when '(?~'; emit(:group, :absence, text, ts, te)
506
+ case text = copy(data, ts, te)
507
+ when '(?:'; emit(:group, :passive, text)
508
+ when '(?>'; emit(:group, :atomic, text)
509
+ when '(?~'; emit(:group, :absence, text)
508
510
 
509
511
  when /^\(\?(?:<>|'')/
510
512
  validation_error(:group, 'named group', 'name is empty')
511
513
 
512
514
  when /^\(\?<\w*>/
513
- emit(:group, :named_ab, text, ts, te)
515
+ emit(:group, :named_ab, text)
514
516
 
515
517
  when /^\(\?'\w*'/
516
- emit(:group, :named_sq, text, ts, te)
518
+ emit(:group, :named_sq, text)
517
519
 
518
520
  end
519
521
  };
520
522
 
521
523
  group_open @group_opened {
522
- text = text(data, ts, te).first
523
- emit(:group, :capture, text, ts, te)
524
+ text = copy(data, ts, te)
525
+ emit(:group, :capture, text)
524
526
  };
525
527
 
526
528
  group_close @group_closed {
527
529
  if conditional_stack.last == group_depth + 1
528
530
  conditional_stack.pop
529
- emit(:conditional, :close, *text(data, ts, te))
531
+ emit(:conditional, :close, copy(data, ts, te))
530
532
  else
531
533
  if spacing_stack.length > 1 &&
532
534
  spacing_stack.last[:depth] == group_depth + 1
@@ -534,7 +536,7 @@
534
536
  self.free_spacing = spacing_stack.last[:free_spacing]
535
537
  end
536
538
 
537
- emit(:group, :close, *text(data, ts, te))
539
+ emit(:group, :close, copy(data, ts, te))
538
540
  end
539
541
  };
540
542
 
@@ -542,63 +544,63 @@
542
544
  # Group backreference, named and numbered
543
545
  # ------------------------------------------------------------------------
544
546
  backslash . (group_name_ref | group_number_ref) > (backslashed, 4) {
545
- case text = text(data, ts, te).first
547
+ case text = copy(data, ts, te)
546
548
  when /^\\([gk])(<>|'')/ # angle brackets
547
549
  validation_error(:backref, 'ref/call', 'ref ID is empty')
548
550
 
549
551
  when /^\\([gk])<[^\d+-]\w*>/ # angle-brackets
550
552
  if $1 == 'k'
551
- emit(:backref, :name_ref_ab, text, ts, te)
553
+ emit(:backref, :name_ref_ab, text)
552
554
  else
553
- emit(:backref, :name_call_ab, text, ts, te)
555
+ emit(:backref, :name_call_ab, text)
554
556
  end
555
557
 
556
558
  when /^\\([gk])'[^\d+-]\w*'/ #single quotes
557
559
  if $1 == 'k'
558
- emit(:backref, :name_ref_sq, text, ts, te)
560
+ emit(:backref, :name_ref_sq, text)
559
561
  else
560
- emit(:backref, :name_call_sq, text, ts, te)
562
+ emit(:backref, :name_call_sq, text)
561
563
  end
562
564
 
563
565
  when /^\\([gk])<\d+>/ # angle-brackets
564
566
  if $1 == 'k'
565
- emit(:backref, :number_ref_ab, text, ts, te)
567
+ emit(:backref, :number_ref_ab, text)
566
568
  else
567
- emit(:backref, :number_call_ab, text, ts, te)
569
+ emit(:backref, :number_call_ab, text)
568
570
  end
569
571
 
570
572
  when /^\\([gk])'\d+'/ # single quotes
571
573
  if $1 == 'k'
572
- emit(:backref, :number_ref_sq, text, ts, te)
574
+ emit(:backref, :number_ref_sq, text)
573
575
  else
574
- emit(:backref, :number_call_sq, text, ts, te)
576
+ emit(:backref, :number_call_sq, text)
575
577
  end
576
578
 
577
579
  when /^\\(?:g<\+|g<-|(k)<-)\d+>/ # angle-brackets
578
580
  if $1 == 'k'
579
- emit(:backref, :number_rel_ref_ab, text, ts, te)
581
+ emit(:backref, :number_rel_ref_ab, text)
580
582
  else
581
- emit(:backref, :number_rel_call_ab, text, ts, te)
583
+ emit(:backref, :number_rel_call_ab, text)
582
584
  end
583
585
 
584
586
  when /^\\(?:g'\+|g'-|(k)'-)\d+'/ # single quotes
585
587
  if $1 == 'k'
586
- emit(:backref, :number_rel_ref_sq, text, ts, te)
588
+ emit(:backref, :number_rel_ref_sq, text)
587
589
  else
588
- emit(:backref, :number_rel_call_sq, text, ts, te)
590
+ emit(:backref, :number_rel_call_sq, text)
589
591
  end
590
592
 
591
593
  when /^\\k<[^\d+\-]\w*[+\-]\d+>/ # angle-brackets
592
- emit(:backref, :name_recursion_ref_ab, text, ts, te)
594
+ emit(:backref, :name_recursion_ref_ab, text)
593
595
 
594
596
  when /^\\k'[^\d+\-]\w*[+\-]\d+'/ # single-quotes
595
- emit(:backref, :name_recursion_ref_sq, text, ts, te)
597
+ emit(:backref, :name_recursion_ref_sq, text)
596
598
 
597
599
  when /^\\([gk])<[+\-]?\d+[+\-]\d+>/ # angle-brackets
598
- emit(:backref, :number_recursion_ref_ab, text, ts, te)
600
+ emit(:backref, :number_recursion_ref_ab, text)
599
601
 
600
602
  when /^\\([gk])'[+\-]?\d+[+\-]\d+'/ # single-quotes
601
- emit(:backref, :number_recursion_ref_sq, text, ts, te)
603
+ emit(:backref, :number_recursion_ref_sq, text)
602
604
 
603
605
  end
604
606
  };
@@ -607,31 +609,31 @@
607
609
  # Quantifiers
608
610
  # ------------------------------------------------------------------------
609
611
  zero_or_one {
610
- case text = text(data, ts, te).first
611
- when '?' ; emit(:quantifier, :zero_or_one, text, ts, te)
612
- when '??'; emit(:quantifier, :zero_or_one_reluctant, text, ts, te)
613
- when '?+'; emit(:quantifier, :zero_or_one_possessive, text, ts, te)
612
+ case text = copy(data, ts, te)
613
+ when '?' ; emit(:quantifier, :zero_or_one, text)
614
+ when '??'; emit(:quantifier, :zero_or_one_reluctant, text)
615
+ when '?+'; emit(:quantifier, :zero_or_one_possessive, text)
614
616
  end
615
617
  };
616
618
 
617
619
  zero_or_more {
618
- case text = text(data, ts, te).first
619
- when '*' ; emit(:quantifier, :zero_or_more, text, ts, te)
620
- when '*?'; emit(:quantifier, :zero_or_more_reluctant, text, ts, te)
621
- when '*+'; emit(:quantifier, :zero_or_more_possessive, text, ts, te)
620
+ case text = copy(data, ts, te)
621
+ when '*' ; emit(:quantifier, :zero_or_more, text)
622
+ when '*?'; emit(:quantifier, :zero_or_more_reluctant, text)
623
+ when '*+'; emit(:quantifier, :zero_or_more_possessive, text)
622
624
  end
623
625
  };
624
626
 
625
627
  one_or_more {
626
- case text = text(data, ts, te).first
627
- when '+' ; emit(:quantifier, :one_or_more, text, ts, te)
628
- when '+?'; emit(:quantifier, :one_or_more_reluctant, text, ts, te)
629
- when '++'; emit(:quantifier, :one_or_more_possessive, text, ts, te)
628
+ case text = copy(data, ts, te)
629
+ when '+' ; emit(:quantifier, :one_or_more, text)
630
+ when '+?'; emit(:quantifier, :one_or_more_reluctant, text)
631
+ when '++'; emit(:quantifier, :one_or_more_possessive, text)
630
632
  end
631
633
  };
632
634
 
633
635
  quantifier_interval {
634
- emit(:quantifier, :interval, *text(data, ts, te))
636
+ emit(:quantifier, :interval, copy(data, ts, te))
635
637
  };
636
638
 
637
639
  # Catch unmatched curly braces as literals
@@ -647,7 +649,7 @@
647
649
 
648
650
  comment {
649
651
  if free_spacing
650
- emit(:free_space, :comment, *text(data, ts, te))
652
+ emit(:free_space, :comment, copy(data, ts, te))
651
653
  else
652
654
  # consume only the pound sign (#) and backtrack to do regular scanning
653
655
  append_literal(data, ts, ts + 1)
@@ -657,7 +659,7 @@
657
659
 
658
660
  space+ {
659
661
  if free_spacing
660
- emit(:free_space, :whitespace, *text(data, ts, te))
662
+ emit(:free_space, :whitespace, copy(data, ts, te))
661
663
  else
662
664
  append_literal(data, ts, te)
663
665
  end
@@ -760,6 +762,7 @@ class Regexp::Scanner
760
762
  self.set_depth = 0
761
763
  self.group_depth = 0
762
764
  self.conditional_stack = []
765
+ self.char_pos = 0
763
766
 
764
767
  %% write data;
765
768
  %% write init;
@@ -769,7 +772,7 @@ class Regexp::Scanner
769
772
  testEof = testEof
770
773
 
771
774
  if cs == re_scanner_error
772
- text = ts ? copy(data, ts-1..-1) : data.pack('c*')
775
+ text = copy(data, ts ? ts-1 : 0, -1)
773
776
  raise ScannerError.new("Scan error at '#{text}'")
774
777
  end
775
778
 
@@ -797,22 +800,29 @@ class Regexp::Scanner
797
800
  end
798
801
 
799
802
  # Emits an array with the details of the scanned pattern
800
- def emit(type, token, text, ts, te)
803
+ def emit(type, token, text)
801
804
  #puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
802
805
 
803
806
  emit_literal if literal
804
807
 
808
+ # Ragel runs with byte-based indices (ts, te). These are of little value to
809
+ # end-users, so we keep track of char-based indices and emit those instead.
810
+ ts_char_pos = char_pos
811
+ te_char_pos = char_pos + text.length
812
+
805
813
  if block
806
- block.call type, token, text, ts, te
814
+ block.call type, token, text, ts_char_pos, te_char_pos
807
815
  end
808
816
 
809
- tokens << [type, token, text, ts, te]
817
+ tokens << [type, token, text, ts_char_pos, te_char_pos]
818
+
819
+ self.char_pos = te_char_pos
810
820
  end
811
821
 
812
822
  private
813
823
 
814
824
  attr_accessor :tokens, :literal, :block, :free_spacing, :spacing_stack,
815
- :group_depth, :set_depth, :conditional_stack
825
+ :group_depth, :set_depth, :conditional_stack, :char_pos
816
826
 
817
827
  def free_spacing?(input_object, options)
818
828
  if options && !input_object.is_a?(String)
@@ -835,36 +845,25 @@ class Regexp::Scanner
835
845
  end
836
846
 
837
847
  # Copy from ts to te from data as text
838
- def copy(data, range)
839
- data[range].pack('c*')
840
- end
841
-
842
- # Copy from ts to te from data as text, returning an array with the text
843
- # and the offsets used to copy it.
844
- def text(data, ts, te, soff = 0)
845
- [copy(data, ts-soff..te-1), ts-soff, te]
848
+ def copy(data, ts, te)
849
+ data[ts...te].pack('c*').force_encoding('utf-8')
846
850
  end
847
851
 
848
852
  # Appends one or more characters to the literal buffer, to be emitted later
849
- # by a call to emit_literal. Contents can be a mix of ASCII and UTF-8.
853
+ # by a call to emit_literal.
850
854
  def append_literal(data, ts, te)
851
855
  self.literal = literal || []
852
- literal << text(data, ts, te)
856
+ literal << copy(data, ts, te)
853
857
  end
854
858
 
855
- # Emits the literal run collected by calls to the append_literal method,
856
- # using the total start (ts) and end (te) offsets of the run.
859
+ # Emits the literal run collected by calls to the append_literal method.
857
860
  def emit_literal
858
- ts, te = literal.first[1], literal.last[2]
859
- text = literal.map {|t| t[0]}.join
860
-
861
- text.force_encoding('utf-8') if text.respond_to?(:force_encoding)
862
-
861
+ text = literal.join
863
862
  self.literal = nil
864
- emit(:literal, :literal, text, ts, te)
863
+ emit(:literal, :literal, text)
865
864
  end
866
865
 
867
- def emit_options(text, ts, te)
866
+ def emit_options(text)
868
867
  token = nil
869
868
 
870
869
  # Ruby allows things like '(?-xxxx)' or '(?xx-xx--xx-:abc)'.
@@ -890,14 +889,14 @@ class Regexp::Scanner
890
889
  token = :options_switch
891
890
  end
892
891
 
893
- emit(:group, token, text, ts, te)
892
+ emit(:group, token, text)
894
893
  end
895
894
 
896
895
  def emit_meta_control_sequence(data, ts, te, token)
897
896
  if data.last < 0x00 || data.last > 0x7F
898
897
  validation_error(:sequence, 'escape', token.to_s)
899
898
  end
900
- emit(:escape, token, *text(data, ts, te, 1))
899
+ emit(:escape, token, copy(data, ts-1, te))
901
900
  end
902
901
 
903
902
  # Centralizes and unifies the handling of validation related