regexp_parser 1.8.2 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -10,17 +10,17 @@
10
10
  # --------------------------------------------------------------------------
11
11
  char_type := |*
12
12
  char_type_char {
13
- case text = text(data, ts, te, 1).first
14
- when '\d'; emit(:type, :digit, text, ts - 1, te)
15
- when '\D'; emit(:type, :nondigit, text, ts - 1, te)
16
- when '\h'; emit(:type, :hex, text, ts - 1, te)
17
- when '\H'; emit(:type, :nonhex, text, ts - 1, te)
18
- when '\s'; emit(:type, :space, text, ts - 1, te)
19
- when '\S'; emit(:type, :nonspace, text, ts - 1, te)
20
- when '\w'; emit(:type, :word, text, ts - 1, te)
21
- when '\W'; emit(:type, :nonword, text, ts - 1, te)
22
- when '\R'; emit(:type, :linebreak, text, ts - 1, te)
23
- when '\X'; emit(:type, :xgrapheme, text, ts - 1, te)
13
+ case text = copy(data, ts-1, te)
14
+ when '\d'; emit(:type, :digit, text)
15
+ when '\D'; emit(:type, :nondigit, text)
16
+ when '\h'; emit(:type, :hex, text)
17
+ when '\H'; emit(:type, :nonhex, text)
18
+ when '\s'; emit(:type, :space, text)
19
+ when '\S'; emit(:type, :nonspace, text)
20
+ when '\w'; emit(:type, :word, text)
21
+ when '\W'; emit(:type, :nonword, text)
22
+ when '\R'; emit(:type, :linebreak, text)
23
+ when '\X'; emit(:type, :xgrapheme, text)
24
24
  end
25
25
  fret;
26
26
  };
@@ -14,7 +14,7 @@
14
14
  unicode_property := |*
15
15
 
16
16
  property_sequence < eof(premature_property_end) {
17
- text = text(data, ts, te, 1).first
17
+ text = copy(data, ts-1, te)
18
18
  type = (text[1] == 'P') ^ (text[3] == '^') ? :nonproperty : :property
19
19
 
20
20
  name = data[ts+2..te-2].pack('c*').gsub(/[\^\s_\-]/, '').downcase
@@ -22,7 +22,7 @@
22
22
  token = self.class.short_prop_map[name] || self.class.long_prop_map[name]
23
23
  raise UnknownUnicodePropertyError.new(name) unless token
24
24
 
25
- self.emit(type, token.to_sym, text, ts-1, te)
25
+ self.emit(type, token.to_sym, text)
26
26
 
27
27
  fret;
28
28
  };
@@ -135,13 +135,13 @@
135
135
 
136
136
  # EOF error, used where it can be detected
137
137
  action premature_end_error {
138
- text = ts ? copy(data, ts-1..-1) : data.pack('c*')
138
+ text = copy(data, ts ? ts-1 : 0, -1)
139
139
  raise PrematureEndError.new( text )
140
140
  }
141
141
 
142
142
  # Invalid sequence error, used from sequences, like escapes and sets
143
143
  action invalid_sequence_error {
144
- text = ts ? copy(data, ts-1..-1) : data.pack('c*')
144
+ text = copy(data, ts ? ts-1 : 0, -1)
145
145
  validation_error(:sequence, 'sequence', text)
146
146
  }
147
147
 
@@ -156,7 +156,7 @@
156
156
  # --------------------------------------------------------------------------
157
157
  character_set := |*
158
158
  set_close > (set_meta, 2) @set_closed {
159
- emit(:set, :close, *text(data, ts, te))
159
+ emit(:set, :close, copy(data, ts, te))
160
160
  if in_set?
161
161
  fret;
162
162
  else
@@ -165,8 +165,8 @@
165
165
  };
166
166
 
167
167
  '-]' @set_closed { # special case, emits two tokens
168
- emit(:literal, :literal, copy(data, ts..te-2), ts, te - 1)
169
- emit(:set, :close, copy(data, ts+1..te-1), ts + 1, te)
168
+ emit(:literal, :literal, copy(data, ts, te-1))
169
+ emit(:set, :close, copy(data, ts+1, te))
170
170
  if in_set?
171
171
  fret;
172
172
  else
@@ -175,33 +175,33 @@
175
175
  };
176
176
 
177
177
  '-&&' { # special case, emits two tokens
178
- emit(:literal, :literal, '-', ts, te)
179
- emit(:set, :intersection, '&&', ts, te)
178
+ emit(:literal, :literal, '-')
179
+ emit(:set, :intersection, '&&')
180
180
  };
181
181
 
182
182
  '^' {
183
- text = text(data, ts, te).first
183
+ text = copy(data, ts, te)
184
184
  if tokens.last[1] == :open
185
- emit(:set, :negate, text, ts, te)
185
+ emit(:set, :negate, text)
186
186
  else
187
- emit(:literal, :literal, text, ts, te)
187
+ emit(:literal, :literal, text)
188
188
  end
189
189
  };
190
190
 
191
191
  '-' {
192
- text = text(data, ts, te).first
192
+ text = copy(data, ts, te)
193
193
  # ranges cant start with a subset or intersection/negation/range operator
194
194
  if tokens.last[0] == :set
195
- emit(:literal, :literal, text, ts, te)
195
+ emit(:literal, :literal, text)
196
196
  else
197
- emit(:set, :range, text, ts, te)
197
+ emit(:set, :range, text)
198
198
  end
199
199
  };
200
200
 
201
201
  # Unlike ranges, intersections can start or end at set boundaries, whereupon
202
202
  # they match nothing: r = /[a&&]/; [r =~ ?a, r =~ ?&] # => [nil, nil]
203
203
  '&&' {
204
- emit(:set, :intersection, *text(data, ts, te))
204
+ emit(:set, :intersection, copy(data, ts, te))
205
205
  };
206
206
 
207
207
  backslash {
@@ -209,12 +209,12 @@
209
209
  };
210
210
 
211
211
  set_open >(open_bracket, 1) >set_opened {
212
- emit(:set, :open, *text(data, ts, te))
212
+ emit(:set, :open, copy(data, ts, te))
213
213
  fcall character_set;
214
214
  };
215
215
 
216
216
  class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error) {
217
- text = text(data, ts, te).first
217
+ text = copy(data, ts, te)
218
218
 
219
219
  type = :posixclass
220
220
  class_name = text[2..-3]
@@ -223,19 +223,19 @@
223
223
  type = :nonposixclass
224
224
  end
225
225
 
226
- emit(type, class_name.to_sym, text, ts, te)
226
+ emit(type, class_name.to_sym, text)
227
227
  };
228
228
 
229
229
  collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
230
- emit(:set, :collation, *text(data, ts, te))
230
+ emit(:set, :collation, copy(data, ts, te))
231
231
  };
232
232
 
233
233
  character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
234
- emit(:set, :equivalent, *text(data, ts, te))
234
+ emit(:set, :equivalent, copy(data, ts, te))
235
235
  };
236
236
 
237
237
  meta_char > (set_meta, 1) {
238
- emit(:literal, :literal, *text(data, ts, te))
238
+ emit(:literal, :literal, copy(data, ts, te))
239
239
  };
240
240
 
241
241
  any |
@@ -243,9 +243,8 @@
243
243
  utf8_2_byte |
244
244
  utf8_3_byte |
245
245
  utf8_4_byte {
246
- char, *rest = *text(data, ts, te)
247
- char.force_encoding('utf-8') if char.respond_to?(:force_encoding)
248
- emit(:literal, :literal, char, *rest)
246
+ text = copy(data, ts, te)
247
+ emit(:literal, :literal, text)
249
248
  };
250
249
  *|;
251
250
 
@@ -253,7 +252,7 @@
253
252
  # --------------------------------------------------------------------------
254
253
  set_escape_sequence := |*
255
254
  non_set_escape > (escaped_set_alpha, 2) {
256
- emit(:escape, :literal, *text(data, ts, te, 1))
255
+ emit(:escape, :literal, copy(data, ts-1, te))
257
256
  fret;
258
257
  };
259
258
 
@@ -269,33 +268,33 @@
269
268
  # --------------------------------------------------------------------------
270
269
  escape_sequence := |*
271
270
  [1-9] {
272
- text = text(data, ts, te, 1).first
273
- emit(:backref, :number, text, ts-1, te)
271
+ text = copy(data, ts-1, te)
272
+ emit(:backref, :number, text)
274
273
  fret;
275
274
  };
276
275
 
277
276
  octal_sequence {
278
- emit(:escape, :octal, *text(data, ts, te, 1))
277
+ emit(:escape, :octal, copy(data, ts-1, te))
279
278
  fret;
280
279
  };
281
280
 
282
281
  meta_char {
283
- case text = text(data, ts, te, 1).first
284
- when '\.'; emit(:escape, :dot, text, ts-1, te)
285
- when '\|'; emit(:escape, :alternation, text, ts-1, te)
286
- when '\^'; emit(:escape, :bol, text, ts-1, te)
287
- when '\$'; emit(:escape, :eol, text, ts-1, te)
288
- when '\?'; emit(:escape, :zero_or_one, text, ts-1, te)
289
- when '\*'; emit(:escape, :zero_or_more, text, ts-1, te)
290
- when '\+'; emit(:escape, :one_or_more, text, ts-1, te)
291
- when '\('; emit(:escape, :group_open, text, ts-1, te)
292
- when '\)'; emit(:escape, :group_close, text, ts-1, te)
293
- when '\{'; emit(:escape, :interval_open, text, ts-1, te)
294
- when '\}'; emit(:escape, :interval_close, text, ts-1, te)
295
- when '\['; emit(:escape, :set_open, text, ts-1, te)
296
- when '\]'; emit(:escape, :set_close, text, ts-1, te)
282
+ case text = copy(data, ts-1, te)
283
+ when '\.'; emit(:escape, :dot, text)
284
+ when '\|'; emit(:escape, :alternation, text)
285
+ when '\^'; emit(:escape, :bol, text)
286
+ when '\$'; emit(:escape, :eol, text)
287
+ when '\?'; emit(:escape, :zero_or_one, text)
288
+ when '\*'; emit(:escape, :zero_or_more, text)
289
+ when '\+'; emit(:escape, :one_or_more, text)
290
+ when '\('; emit(:escape, :group_open, text)
291
+ when '\)'; emit(:escape, :group_close, text)
292
+ when '\{'; emit(:escape, :interval_open, text)
293
+ when '\}'; emit(:escape, :interval_close, text)
294
+ when '\['; emit(:escape, :set_open, text)
295
+ when '\]'; emit(:escape, :set_close, text)
297
296
  when "\\\\";
298
- emit(:escape, :backslash, text, ts-1, te)
297
+ emit(:escape, :backslash, text)
299
298
  end
300
299
  fret;
301
300
  };
@@ -303,31 +302,31 @@
303
302
  escaped_ascii > (escaped_alpha, 7) {
304
303
  # \b is emitted as backspace only when inside a character set, otherwise
305
304
  # it is a word boundary anchor. A syntax might "normalize" it if needed.
306
- case text = text(data, ts, te, 1).first
307
- when '\a'; emit(:escape, :bell, text, ts-1, te)
308
- when '\b'; emit(:escape, :backspace, text, ts-1, te)
309
- when '\e'; emit(:escape, :escape, text, ts-1, te)
310
- when '\f'; emit(:escape, :form_feed, text, ts-1, te)
311
- when '\n'; emit(:escape, :newline, text, ts-1, te)
312
- when '\r'; emit(:escape, :carriage, text, ts-1, te)
313
- when '\t'; emit(:escape, :tab, text, ts-1, te)
314
- when '\v'; emit(:escape, :vertical_tab, text, ts-1, te)
305
+ case text = copy(data, ts-1, te)
306
+ when '\a'; emit(:escape, :bell, text)
307
+ when '\b'; emit(:escape, :backspace, text)
308
+ when '\e'; emit(:escape, :escape, text)
309
+ when '\f'; emit(:escape, :form_feed, text)
310
+ when '\n'; emit(:escape, :newline, text)
311
+ when '\r'; emit(:escape, :carriage, text)
312
+ when '\t'; emit(:escape, :tab, text)
313
+ when '\v'; emit(:escape, :vertical_tab, text)
315
314
  end
316
315
  fret;
317
316
  };
318
317
 
319
318
  codepoint_sequence > (escaped_alpha, 6) $eof(premature_end_error) {
320
- text = text(data, ts, te, 1).first
319
+ text = copy(data, ts-1, te)
321
320
  if text[2].chr == '{'
322
- emit(:escape, :codepoint_list, text, ts-1, te)
321
+ emit(:escape, :codepoint_list, text)
323
322
  else
324
- emit(:escape, :codepoint, text, ts-1, te)
323
+ emit(:escape, :codepoint, text)
325
324
  end
326
325
  fret;
327
326
  };
328
327
 
329
328
  hex_sequence > (escaped_alpha, 5) $eof(premature_end_error) {
330
- emit(:escape, :hex, *text(data, ts, te, 1))
329
+ emit(:escape, :hex, copy(data, ts-1, te))
331
330
  fret;
332
331
  };
333
332
 
@@ -357,8 +356,11 @@
357
356
  fcall unicode_property;
358
357
  };
359
358
 
360
- (any -- non_literal_escape) > (escaped_alpha, 1) {
361
- emit(:escape, :literal, *text(data, ts, te, 1))
359
+ (any -- non_literal_escape) |
360
+ utf8_2_byte |
361
+ utf8_3_byte |
362
+ utf8_4_byte > (escaped_alpha, 1) {
363
+ emit(:escape, :literal, copy(data, ts-1, te))
362
364
  fret;
363
365
  };
364
366
  *|;
@@ -368,9 +370,9 @@
368
370
  # --------------------------------------------------------------------------
369
371
  conditional_expression := |*
370
372
  group_lookup . ')' {
371
- text = text(data, ts, te-1).first
372
- emit(:conditional, :condition, text, ts, te-1)
373
- emit(:conditional, :condition_close, ')', te-1, te)
373
+ text = copy(data, ts, te-1)
374
+ emit(:conditional, :condition, text)
375
+ emit(:conditional, :condition_close, ')')
374
376
  };
375
377
 
376
378
  any {
@@ -387,39 +389,39 @@
387
389
  # Meta characters
388
390
  # ------------------------------------------------------------------------
389
391
  dot {
390
- emit(:meta, :dot, *text(data, ts, te))
392
+ emit(:meta, :dot, copy(data, ts, te))
391
393
  };
392
394
 
393
395
  alternation {
394
396
  if conditional_stack.last == group_depth
395
- emit(:conditional, :separator, *text(data, ts, te))
397
+ emit(:conditional, :separator, copy(data, ts, te))
396
398
  else
397
- emit(:meta, :alternation, *text(data, ts, te))
399
+ emit(:meta, :alternation, copy(data, ts, te))
398
400
  end
399
401
  };
400
402
 
401
403
  # Anchors
402
404
  # ------------------------------------------------------------------------
403
405
  beginning_of_line {
404
- emit(:anchor, :bol, *text(data, ts, te))
406
+ emit(:anchor, :bol, copy(data, ts, te))
405
407
  };
406
408
 
407
409
  end_of_line {
408
- emit(:anchor, :eol, *text(data, ts, te))
410
+ emit(:anchor, :eol, copy(data, ts, te))
409
411
  };
410
412
 
411
413
  backslash . keep_mark > (backslashed, 4) {
412
- emit(:keep, :mark, *text(data, ts, te))
414
+ emit(:keep, :mark, copy(data, ts, te))
413
415
  };
414
416
 
415
417
  backslash . anchor_char > (backslashed, 3) {
416
- case text = text(data, ts, te).first
417
- when '\\A'; emit(:anchor, :bos, text, ts, te)
418
- when '\\z'; emit(:anchor, :eos, text, ts, te)
419
- when '\\Z'; emit(:anchor, :eos_ob_eol, text, ts, te)
420
- when '\\b'; emit(:anchor, :word_boundary, text, ts, te)
421
- when '\\B'; emit(:anchor, :nonword_boundary, text, ts, te)
422
- when '\\G'; emit(:anchor, :match_start, text, ts, te)
418
+ case text = copy(data, ts, te)
419
+ when '\\A'; emit(:anchor, :bos, text)
420
+ when '\\z'; emit(:anchor, :eos, text)
421
+ when '\\Z'; emit(:anchor, :eos_ob_eol, text)
422
+ when '\\b'; emit(:anchor, :word_boundary, text)
423
+ when '\\B'; emit(:anchor, :nonword_boundary, text)
424
+ when '\\G'; emit(:anchor, :match_start, text)
423
425
  end
424
426
  };
425
427
 
@@ -430,7 +432,7 @@
430
432
  # Character sets
431
433
  # ------------------------------------------------------------------------
432
434
  set_open >set_opened {
433
- emit(:set, :open, *text(data, ts, te))
435
+ emit(:set, :open, copy(data, ts, te))
434
436
  fcall character_set;
435
437
  };
436
438
 
@@ -439,12 +441,12 @@
439
441
  # (?(condition)Y|N) conditional expression
440
442
  # ------------------------------------------------------------------------
441
443
  conditional {
442
- text = text(data, ts, te).first
444
+ text = copy(data, ts, te)
443
445
 
444
446
  conditional_stack << group_depth
445
447
 
446
- emit(:conditional, :open, text[0..-2], ts, te-1)
447
- emit(:conditional, :condition_open, '(', te-1, te)
448
+ emit(:conditional, :open, text[0..-2])
449
+ emit(:conditional, :condition_open, '(')
448
450
  fcall conditional_expression;
449
451
  };
450
452
 
@@ -455,7 +457,7 @@
455
457
  # correct closing count.
456
458
  # ------------------------------------------------------------------------
457
459
  group_open . group_comment $group_closed {
458
- emit(:group, :comment, *text(data, ts, te))
460
+ emit(:group, :comment, copy(data, ts, te))
459
461
  };
460
462
 
461
463
  # Expression options:
@@ -470,11 +472,11 @@
470
472
  # (?imxdau-imx:subexp) option on/off for subexp
471
473
  # ------------------------------------------------------------------------
472
474
  group_open . group_options >group_opened {
473
- text = text(data, ts, te).first
475
+ text = copy(data, ts, te)
474
476
  if text[2..-1] =~ /([^\-mixdau:]|^$)|-.*([dau])/
475
477
  raise InvalidGroupOption.new($1 || "-#{$2}", text)
476
478
  end
477
- emit_options(text, ts, te)
479
+ emit_options(text)
478
480
  };
479
481
 
480
482
  # Assertions
@@ -484,11 +486,11 @@
484
486
  # (?<!subexp) negative look-behind
485
487
  # ------------------------------------------------------------------------
486
488
  group_open . assertion_type >group_opened {
487
- case text = text(data, ts, te).first
488
- when '(?='; emit(:assertion, :lookahead, text, ts, te)
489
- when '(?!'; emit(:assertion, :nlookahead, text, ts, te)
490
- when '(?<='; emit(:assertion, :lookbehind, text, ts, te)
491
- when '(?<!'; emit(:assertion, :nlookbehind, text, ts, te)
489
+ case text = copy(data, ts, te)
490
+ when '(?='; emit(:assertion, :lookahead, text)
491
+ when '(?!'; emit(:assertion, :nlookahead, text)
492
+ when '(?<='; emit(:assertion, :lookbehind, text)
493
+ when '(?<!'; emit(:assertion, :nlookbehind, text)
492
494
  end
493
495
  };
494
496
 
@@ -501,32 +503,32 @@
501
503
  # (subexp) captured group
502
504
  # ------------------------------------------------------------------------
503
505
  group_open . group_type >group_opened {
504
- case text = text(data, ts, te).first
505
- when '(?:'; emit(:group, :passive, text, ts, te)
506
- when '(?>'; emit(:group, :atomic, text, ts, te)
507
- when '(?~'; emit(:group, :absence, text, ts, te)
506
+ case text = copy(data, ts, te)
507
+ when '(?:'; emit(:group, :passive, text)
508
+ when '(?>'; emit(:group, :atomic, text)
509
+ when '(?~'; emit(:group, :absence, text)
508
510
 
509
511
  when /^\(\?(?:<>|'')/
510
512
  validation_error(:group, 'named group', 'name is empty')
511
513
 
512
514
  when /^\(\?<\w*>/
513
- emit(:group, :named_ab, text, ts, te)
515
+ emit(:group, :named_ab, text)
514
516
 
515
517
  when /^\(\?'\w*'/
516
- emit(:group, :named_sq, text, ts, te)
518
+ emit(:group, :named_sq, text)
517
519
 
518
520
  end
519
521
  };
520
522
 
521
523
  group_open @group_opened {
522
- text = text(data, ts, te).first
523
- emit(:group, :capture, text, ts, te)
524
+ text = copy(data, ts, te)
525
+ emit(:group, :capture, text)
524
526
  };
525
527
 
526
528
  group_close @group_closed {
527
529
  if conditional_stack.last == group_depth + 1
528
530
  conditional_stack.pop
529
- emit(:conditional, :close, *text(data, ts, te))
531
+ emit(:conditional, :close, copy(data, ts, te))
530
532
  else
531
533
  if spacing_stack.length > 1 &&
532
534
  spacing_stack.last[:depth] == group_depth + 1
@@ -534,7 +536,7 @@
534
536
  self.free_spacing = spacing_stack.last[:free_spacing]
535
537
  end
536
538
 
537
- emit(:group, :close, *text(data, ts, te))
539
+ emit(:group, :close, copy(data, ts, te))
538
540
  end
539
541
  };
540
542
 
@@ -542,63 +544,63 @@
542
544
  # Group backreference, named and numbered
543
545
  # ------------------------------------------------------------------------
544
546
  backslash . (group_name_ref | group_number_ref) > (backslashed, 4) {
545
- case text = text(data, ts, te).first
547
+ case text = copy(data, ts, te)
546
548
  when /^\\([gk])(<>|'')/ # angle brackets
547
549
  validation_error(:backref, 'ref/call', 'ref ID is empty')
548
550
 
549
551
  when /^\\([gk])<[^\d+-]\w*>/ # angle-brackets
550
552
  if $1 == 'k'
551
- emit(:backref, :name_ref_ab, text, ts, te)
553
+ emit(:backref, :name_ref_ab, text)
552
554
  else
553
- emit(:backref, :name_call_ab, text, ts, te)
555
+ emit(:backref, :name_call_ab, text)
554
556
  end
555
557
 
556
558
  when /^\\([gk])'[^\d+-]\w*'/ #single quotes
557
559
  if $1 == 'k'
558
- emit(:backref, :name_ref_sq, text, ts, te)
560
+ emit(:backref, :name_ref_sq, text)
559
561
  else
560
- emit(:backref, :name_call_sq, text, ts, te)
562
+ emit(:backref, :name_call_sq, text)
561
563
  end
562
564
 
563
565
  when /^\\([gk])<\d+>/ # angle-brackets
564
566
  if $1 == 'k'
565
- emit(:backref, :number_ref_ab, text, ts, te)
567
+ emit(:backref, :number_ref_ab, text)
566
568
  else
567
- emit(:backref, :number_call_ab, text, ts, te)
569
+ emit(:backref, :number_call_ab, text)
568
570
  end
569
571
 
570
572
  when /^\\([gk])'\d+'/ # single quotes
571
573
  if $1 == 'k'
572
- emit(:backref, :number_ref_sq, text, ts, te)
574
+ emit(:backref, :number_ref_sq, text)
573
575
  else
574
- emit(:backref, :number_call_sq, text, ts, te)
576
+ emit(:backref, :number_call_sq, text)
575
577
  end
576
578
 
577
579
  when /^\\(?:g<\+|g<-|(k)<-)\d+>/ # angle-brackets
578
580
  if $1 == 'k'
579
- emit(:backref, :number_rel_ref_ab, text, ts, te)
581
+ emit(:backref, :number_rel_ref_ab, text)
580
582
  else
581
- emit(:backref, :number_rel_call_ab, text, ts, te)
583
+ emit(:backref, :number_rel_call_ab, text)
582
584
  end
583
585
 
584
586
  when /^\\(?:g'\+|g'-|(k)'-)\d+'/ # single quotes
585
587
  if $1 == 'k'
586
- emit(:backref, :number_rel_ref_sq, text, ts, te)
588
+ emit(:backref, :number_rel_ref_sq, text)
587
589
  else
588
- emit(:backref, :number_rel_call_sq, text, ts, te)
590
+ emit(:backref, :number_rel_call_sq, text)
589
591
  end
590
592
 
591
593
  when /^\\k<[^\d+\-]\w*[+\-]\d+>/ # angle-brackets
592
- emit(:backref, :name_recursion_ref_ab, text, ts, te)
594
+ emit(:backref, :name_recursion_ref_ab, text)
593
595
 
594
596
  when /^\\k'[^\d+\-]\w*[+\-]\d+'/ # single-quotes
595
- emit(:backref, :name_recursion_ref_sq, text, ts, te)
597
+ emit(:backref, :name_recursion_ref_sq, text)
596
598
 
597
599
  when /^\\([gk])<[+\-]?\d+[+\-]\d+>/ # angle-brackets
598
- emit(:backref, :number_recursion_ref_ab, text, ts, te)
600
+ emit(:backref, :number_recursion_ref_ab, text)
599
601
 
600
602
  when /^\\([gk])'[+\-]?\d+[+\-]\d+'/ # single-quotes
601
- emit(:backref, :number_recursion_ref_sq, text, ts, te)
603
+ emit(:backref, :number_recursion_ref_sq, text)
602
604
 
603
605
  end
604
606
  };
@@ -607,31 +609,31 @@
607
609
  # Quantifiers
608
610
  # ------------------------------------------------------------------------
609
611
  zero_or_one {
610
- case text = text(data, ts, te).first
611
- when '?' ; emit(:quantifier, :zero_or_one, text, ts, te)
612
- when '??'; emit(:quantifier, :zero_or_one_reluctant, text, ts, te)
613
- when '?+'; emit(:quantifier, :zero_or_one_possessive, text, ts, te)
612
+ case text = copy(data, ts, te)
613
+ when '?' ; emit(:quantifier, :zero_or_one, text)
614
+ when '??'; emit(:quantifier, :zero_or_one_reluctant, text)
615
+ when '?+'; emit(:quantifier, :zero_or_one_possessive, text)
614
616
  end
615
617
  };
616
618
 
617
619
  zero_or_more {
618
- case text = text(data, ts, te).first
619
- when '*' ; emit(:quantifier, :zero_or_more, text, ts, te)
620
- when '*?'; emit(:quantifier, :zero_or_more_reluctant, text, ts, te)
621
- when '*+'; emit(:quantifier, :zero_or_more_possessive, text, ts, te)
620
+ case text = copy(data, ts, te)
621
+ when '*' ; emit(:quantifier, :zero_or_more, text)
622
+ when '*?'; emit(:quantifier, :zero_or_more_reluctant, text)
623
+ when '*+'; emit(:quantifier, :zero_or_more_possessive, text)
622
624
  end
623
625
  };
624
626
 
625
627
  one_or_more {
626
- case text = text(data, ts, te).first
627
- when '+' ; emit(:quantifier, :one_or_more, text, ts, te)
628
- when '+?'; emit(:quantifier, :one_or_more_reluctant, text, ts, te)
629
- when '++'; emit(:quantifier, :one_or_more_possessive, text, ts, te)
628
+ case text = copy(data, ts, te)
629
+ when '+' ; emit(:quantifier, :one_or_more, text)
630
+ when '+?'; emit(:quantifier, :one_or_more_reluctant, text)
631
+ when '++'; emit(:quantifier, :one_or_more_possessive, text)
630
632
  end
631
633
  };
632
634
 
633
635
  quantifier_interval {
634
- emit(:quantifier, :interval, *text(data, ts, te))
636
+ emit(:quantifier, :interval, copy(data, ts, te))
635
637
  };
636
638
 
637
639
  # Catch unmatched curly braces as literals
@@ -647,7 +649,7 @@
647
649
 
648
650
  comment {
649
651
  if free_spacing
650
- emit(:free_space, :comment, *text(data, ts, te))
652
+ emit(:free_space, :comment, copy(data, ts, te))
651
653
  else
652
654
  # consume only the pound sign (#) and backtrack to do regular scanning
653
655
  append_literal(data, ts, ts + 1)
@@ -657,7 +659,7 @@
657
659
 
658
660
  space+ {
659
661
  if free_spacing
660
- emit(:free_space, :whitespace, *text(data, ts, te))
662
+ emit(:free_space, :whitespace, copy(data, ts, te))
661
663
  else
662
664
  append_literal(data, ts, te)
663
665
  end
@@ -760,6 +762,7 @@ class Regexp::Scanner
760
762
  self.set_depth = 0
761
763
  self.group_depth = 0
762
764
  self.conditional_stack = []
765
+ self.char_pos = 0
763
766
 
764
767
  %% write data;
765
768
  %% write init;
@@ -769,7 +772,7 @@ class Regexp::Scanner
769
772
  testEof = testEof
770
773
 
771
774
  if cs == re_scanner_error
772
- text = ts ? copy(data, ts-1..-1) : data.pack('c*')
775
+ text = copy(data, ts ? ts-1 : 0, -1)
773
776
  raise ScannerError.new("Scan error at '#{text}'")
774
777
  end
775
778
 
@@ -797,22 +800,29 @@ class Regexp::Scanner
797
800
  end
798
801
 
799
802
  # Emits an array with the details of the scanned pattern
800
- def emit(type, token, text, ts, te)
803
+ def emit(type, token, text)
801
804
  #puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
802
805
 
803
806
  emit_literal if literal
804
807
 
808
+ # Ragel runs with byte-based indices (ts, te). These are of little value to
809
+ # end-users, so we keep track of char-based indices and emit those instead.
810
+ ts_char_pos = char_pos
811
+ te_char_pos = char_pos + text.length
812
+
805
813
  if block
806
- block.call type, token, text, ts, te
814
+ block.call type, token, text, ts_char_pos, te_char_pos
807
815
  end
808
816
 
809
- tokens << [type, token, text, ts, te]
817
+ tokens << [type, token, text, ts_char_pos, te_char_pos]
818
+
819
+ self.char_pos = te_char_pos
810
820
  end
811
821
 
812
822
  private
813
823
 
814
824
  attr_accessor :tokens, :literal, :block, :free_spacing, :spacing_stack,
815
- :group_depth, :set_depth, :conditional_stack
825
+ :group_depth, :set_depth, :conditional_stack, :char_pos
816
826
 
817
827
  def free_spacing?(input_object, options)
818
828
  if options && !input_object.is_a?(String)
@@ -835,36 +845,25 @@ class Regexp::Scanner
835
845
  end
836
846
 
837
847
  # Copy from ts to te from data as text
838
- def copy(data, range)
839
- data[range].pack('c*')
840
- end
841
-
842
- # Copy from ts to te from data as text, returning an array with the text
843
- # and the offsets used to copy it.
844
- def text(data, ts, te, soff = 0)
845
- [copy(data, ts-soff..te-1), ts-soff, te]
848
+ def copy(data, ts, te)
849
+ data[ts...te].pack('c*').force_encoding('utf-8')
846
850
  end
847
851
 
848
852
  # Appends one or more characters to the literal buffer, to be emitted later
849
- # by a call to emit_literal. Contents can be a mix of ASCII and UTF-8.
853
+ # by a call to emit_literal.
850
854
  def append_literal(data, ts, te)
851
855
  self.literal = literal || []
852
- literal << text(data, ts, te)
856
+ literal << copy(data, ts, te)
853
857
  end
854
858
 
855
- # Emits the literal run collected by calls to the append_literal method,
856
- # using the total start (ts) and end (te) offsets of the run.
859
+ # Emits the literal run collected by calls to the append_literal method.
857
860
  def emit_literal
858
- ts, te = literal.first[1], literal.last[2]
859
- text = literal.map {|t| t[0]}.join
860
-
861
- text.force_encoding('utf-8') if text.respond_to?(:force_encoding)
862
-
861
+ text = literal.join
863
862
  self.literal = nil
864
- emit(:literal, :literal, text, ts, te)
863
+ emit(:literal, :literal, text)
865
864
  end
866
865
 
867
- def emit_options(text, ts, te)
866
+ def emit_options(text)
868
867
  token = nil
869
868
 
870
869
  # Ruby allows things like '(?-xxxx)' or '(?xx-xx--xx-:abc)'.
@@ -890,14 +889,14 @@ class Regexp::Scanner
890
889
  token = :options_switch
891
890
  end
892
891
 
893
- emit(:group, token, text, ts, te)
892
+ emit(:group, token, text)
894
893
  end
895
894
 
896
895
  def emit_meta_control_sequence(data, ts, te, token)
897
896
  if data.last < 0x00 || data.last > 0x7F
898
897
  validation_error(:sequence, 'escape', token.to_s)
899
898
  end
900
- emit(:escape, token, *text(data, ts, te, 1))
899
+ emit(:escape, token, copy(data, ts-1, te))
901
900
  end
902
901
 
903
902
  # Centralizes and unifies the handling of validation related