regexp_parser 1.7.1 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -10,17 +10,17 @@
10
10
  # --------------------------------------------------------------------------
11
11
  char_type := |*
12
12
  char_type_char {
13
- case text = text(data, ts, te, 1).first
14
- when '\d'; emit(:type, :digit, text, ts - 1, te)
15
- when '\D'; emit(:type, :nondigit, text, ts - 1, te)
16
- when '\h'; emit(:type, :hex, text, ts - 1, te)
17
- when '\H'; emit(:type, :nonhex, text, ts - 1, te)
18
- when '\s'; emit(:type, :space, text, ts - 1, te)
19
- when '\S'; emit(:type, :nonspace, text, ts - 1, te)
20
- when '\w'; emit(:type, :word, text, ts - 1, te)
21
- when '\W'; emit(:type, :nonword, text, ts - 1, te)
22
- when '\R'; emit(:type, :linebreak, text, ts - 1, te)
23
- when '\X'; emit(:type, :xgrapheme, text, ts - 1, te)
13
+ case text = copy(data, ts-1, te)
14
+ when '\d'; emit(:type, :digit, text)
15
+ when '\D'; emit(:type, :nondigit, text)
16
+ when '\h'; emit(:type, :hex, text)
17
+ when '\H'; emit(:type, :nonhex, text)
18
+ when '\s'; emit(:type, :space, text)
19
+ when '\S'; emit(:type, :nonspace, text)
20
+ when '\w'; emit(:type, :word, text)
21
+ when '\W'; emit(:type, :nonword, text)
22
+ when '\R'; emit(:type, :linebreak, text)
23
+ when '\X'; emit(:type, :xgrapheme, text)
24
24
  end
25
25
  fret;
26
26
  };
@@ -14,7 +14,7 @@
14
14
  unicode_property := |*
15
15
 
16
16
  property_sequence < eof(premature_property_end) {
17
- text = text(data, ts, te, 1).first
17
+ text = copy(data, ts-1, te)
18
18
  type = (text[1] == 'P') ^ (text[3] == '^') ? :nonproperty : :property
19
19
 
20
20
  name = data[ts+2..te-2].pack('c*').gsub(/[\^\s_\-]/, '').downcase
@@ -22,7 +22,7 @@
22
22
  token = self.class.short_prop_map[name] || self.class.long_prop_map[name]
23
23
  raise UnknownUnicodePropertyError.new(name) unless token
24
24
 
25
- self.emit(type, token.to_sym, text, ts-1, te)
25
+ self.emit(type, token.to_sym, text)
26
26
 
27
27
  fret;
28
28
  };
@@ -3,6 +3,11 @@
3
3
  include re_char_type "char_type.rl";
4
4
  include re_property "property.rl";
5
5
 
6
+ utf8_2_byte = (0xc2..0xdf 0x80..0xbf);
7
+ utf8_3_byte = (0xe0..0xef 0x80..0xbf 0x80..0xbf);
8
+ utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
9
+ utf8_multibyte = utf8_2_byte | utf8_3_byte | utf8_4_byte;
10
+
6
11
  dot = '.';
7
12
  backslash = '\\';
8
13
  alternation = '|';
@@ -21,7 +26,7 @@
21
26
  set_close = ']';
22
27
  brackets = set_open | set_close;
23
28
 
24
- comment = ('#' . [^\n]* . '\n');
29
+ comment = ('#' . [^\n]* . '\n'?);
25
30
 
26
31
  class_name_posix = 'alnum' | 'alpha' | 'blank' |
27
32
  'cntrl' | 'digit' | 'graph' |
@@ -90,18 +95,19 @@
90
95
  group_options = '?' . ( [^!#'():<=>~]+ . ':'? ) ?;
91
96
 
92
97
  group_ref = [gk];
93
- group_name_char = (alnum | '_');
94
- group_name_id = (group_name_char . (group_name_char+)?)?;
95
- group_number = '-'? . [1-9] . ([0-9]+)?;
98
+ group_name_id_ab = ([^0-9\->] | utf8_multibyte) . ([^>] | utf8_multibyte)*;
99
+ group_name_id_sq = ([^0-9\-'] | utf8_multibyte) . ([^'] | utf8_multibyte)*;
100
+ group_number = '-'? . [1-9] . [0-9]*;
96
101
  group_level = [+\-] . [0-9]+;
97
102
 
98
- group_name = ('<' . group_name_id . '>') | ("'" . group_name_id . "'");
103
+ group_name = ('<' . group_name_id_ab? . '>') |
104
+ ("'" . group_name_id_sq? . "'");
99
105
  group_lookup = group_name | group_number;
100
106
 
101
107
  group_named = ('?' . group_name );
102
108
 
103
- group_name_ref = group_ref . (('<' . group_name_id . group_level? '>') |
104
- ("'" . group_name_id . group_level? "'"));
109
+ group_name_ref = group_ref . (('<' . group_name_id_ab? . group_level? '>') |
110
+ ("'" . group_name_id_sq? . group_level? "'"));
105
111
 
106
112
  group_number_ref = group_ref . (('<' . group_number . group_level? '>') |
107
113
  ("'" . group_number . group_level? "'"));
@@ -120,28 +126,24 @@
120
126
 
121
127
  literal_delimiters = ']' | '}';
122
128
 
123
- ascii_print = ((0x20..0x7e) - meta_char);
129
+ ascii_print = ((0x20..0x7e) - meta_char - '#');
124
130
  ascii_nonprint = (0x01..0x1f | 0x7f);
125
131
 
126
- utf8_2_byte = (0xc2..0xdf 0x80..0xbf);
127
- utf8_3_byte = (0xe0..0xef 0x80..0xbf 0x80..0xbf);
128
- utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
129
-
130
132
  non_literal_escape = char_type_char | anchor_char | escaped_ascii |
131
- group_ref | keep_mark | [xucCM];
133
+ keep_mark | [xucCM];
132
134
 
133
135
  non_set_escape = (anchor_char - 'b') | group_ref | keep_mark |
134
136
  multi_codepoint_char_type | [0-9cCM];
135
137
 
136
138
  # EOF error, used where it can be detected
137
139
  action premature_end_error {
138
- text = ts ? copy(data, ts-1..-1) : data.pack('c*')
140
+ text = copy(data, ts ? ts-1 : 0, -1)
139
141
  raise PrematureEndError.new( text )
140
142
  }
141
143
 
142
144
  # Invalid sequence error, used from sequences, like escapes and sets
143
145
  action invalid_sequence_error {
144
- text = ts ? copy(data, ts-1..-1) : data.pack('c*')
146
+ text = copy(data, ts ? ts-1 : 0, -1)
145
147
  validation_error(:sequence, 'sequence', text)
146
148
  }
147
149
 
@@ -156,7 +158,7 @@
156
158
  # --------------------------------------------------------------------------
157
159
  character_set := |*
158
160
  set_close > (set_meta, 2) @set_closed {
159
- emit(:set, :close, *text(data, ts, te))
161
+ emit(:set, :close, copy(data, ts, te))
160
162
  if in_set?
161
163
  fret;
162
164
  else
@@ -165,8 +167,8 @@
165
167
  };
166
168
 
167
169
  '-]' @set_closed { # special case, emits two tokens
168
- emit(:literal, :literal, copy(data, ts..te-2), ts, te - 1)
169
- emit(:set, :close, copy(data, ts+1..te-1), ts + 1, te)
170
+ emit(:literal, :literal, copy(data, ts, te-1))
171
+ emit(:set, :close, copy(data, ts+1, te))
170
172
  if in_set?
171
173
  fret;
172
174
  else
@@ -175,33 +177,33 @@
175
177
  };
176
178
 
177
179
  '-&&' { # special case, emits two tokens
178
- emit(:literal, :literal, '-', ts, te)
179
- emit(:set, :intersection, '&&', ts, te)
180
+ emit(:literal, :literal, '-')
181
+ emit(:set, :intersection, '&&')
180
182
  };
181
183
 
182
184
  '^' {
183
- text = text(data, ts, te).first
185
+ text = copy(data, ts, te)
184
186
  if tokens.last[1] == :open
185
- emit(:set, :negate, text, ts, te)
187
+ emit(:set, :negate, text)
186
188
  else
187
- emit(:literal, :literal, text, ts, te)
189
+ emit(:literal, :literal, text)
188
190
  end
189
191
  };
190
192
 
191
193
  '-' {
192
- text = text(data, ts, te).first
194
+ text = copy(data, ts, te)
193
195
  # ranges can't start with a subset or intersection/negation/range operator
194
196
  if tokens.last[0] == :set
195
- emit(:literal, :literal, text, ts, te)
197
+ emit(:literal, :literal, text)
196
198
  else
197
- emit(:set, :range, text, ts, te)
199
+ emit(:set, :range, text)
198
200
  end
199
201
  };
200
202
 
201
203
  # Unlike ranges, intersections can start or end at set boundaries, whereupon
202
204
  # they match nothing: r = /[a&&]/; [r =~ ?a, r =~ ?&] # => [nil, nil]
203
205
  '&&' {
204
- emit(:set, :intersection, *text(data, ts, te))
206
+ emit(:set, :intersection, copy(data, ts, te))
205
207
  };
206
208
 
207
209
  backslash {
@@ -209,12 +211,12 @@
209
211
  };
210
212
 
211
213
  set_open >(open_bracket, 1) >set_opened {
212
- emit(:set, :open, *text(data, ts, te))
214
+ emit(:set, :open, copy(data, ts, te))
213
215
  fcall character_set;
214
216
  };
215
217
 
216
218
  class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error) {
217
- text = text(data, ts, te).first
219
+ text = copy(data, ts, te)
218
220
 
219
221
  type = :posixclass
220
222
  class_name = text[2..-3]
@@ -223,29 +225,24 @@
223
225
  type = :nonposixclass
224
226
  end
225
227
 
226
- emit(type, class_name.to_sym, text, ts, te)
228
+ emit(type, class_name.to_sym, text)
227
229
  };
228
230
 
229
231
  collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
230
- emit(:set, :collation, *text(data, ts, te))
232
+ emit(:set, :collation, copy(data, ts, te))
231
233
  };
232
234
 
233
235
  character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
234
- emit(:set, :equivalent, *text(data, ts, te))
236
+ emit(:set, :equivalent, copy(data, ts, te))
235
237
  };
236
238
 
237
239
  meta_char > (set_meta, 1) {
238
- emit(:literal, :literal, *text(data, ts, te))
240
+ emit(:literal, :literal, copy(data, ts, te))
239
241
  };
240
242
 
241
- any |
242
- ascii_nonprint |
243
- utf8_2_byte |
244
- utf8_3_byte |
245
- utf8_4_byte {
246
- char, *rest = *text(data, ts, te)
247
- char.force_encoding('utf-8') if char.respond_to?(:force_encoding)
248
- emit(:literal, :literal, char, *rest)
243
+ any | ascii_nonprint | utf8_multibyte {
244
+ text = copy(data, ts, te)
245
+ emit(:literal, :literal, text)
249
246
  };
250
247
  *|;
251
248
 
@@ -253,7 +250,7 @@
253
250
  # --------------------------------------------------------------------------
254
251
  set_escape_sequence := |*
255
252
  non_set_escape > (escaped_set_alpha, 2) {
256
- emit(:escape, :literal, *text(data, ts, te, 1))
253
+ emit(:escape, :literal, copy(data, ts-1, te))
257
254
  fret;
258
255
  };
259
256
 
@@ -269,33 +266,33 @@
269
266
  # --------------------------------------------------------------------------
270
267
  escape_sequence := |*
271
268
  [1-9] {
272
- text = text(data, ts, te, 1).first
273
- emit(:backref, :number, text, ts-1, te)
269
+ text = copy(data, ts-1, te)
270
+ emit(:backref, :number, text)
274
271
  fret;
275
272
  };
276
273
 
277
274
  octal_sequence {
278
- emit(:escape, :octal, *text(data, ts, te, 1))
275
+ emit(:escape, :octal, copy(data, ts-1, te))
279
276
  fret;
280
277
  };
281
278
 
282
279
  meta_char {
283
- case text = text(data, ts, te, 1).first
284
- when '\.'; emit(:escape, :dot, text, ts-1, te)
285
- when '\|'; emit(:escape, :alternation, text, ts-1, te)
286
- when '\^'; emit(:escape, :bol, text, ts-1, te)
287
- when '\$'; emit(:escape, :eol, text, ts-1, te)
288
- when '\?'; emit(:escape, :zero_or_one, text, ts-1, te)
289
- when '\*'; emit(:escape, :zero_or_more, text, ts-1, te)
290
- when '\+'; emit(:escape, :one_or_more, text, ts-1, te)
291
- when '\('; emit(:escape, :group_open, text, ts-1, te)
292
- when '\)'; emit(:escape, :group_close, text, ts-1, te)
293
- when '\{'; emit(:escape, :interval_open, text, ts-1, te)
294
- when '\}'; emit(:escape, :interval_close, text, ts-1, te)
295
- when '\['; emit(:escape, :set_open, text, ts-1, te)
296
- when '\]'; emit(:escape, :set_close, text, ts-1, te)
280
+ case text = copy(data, ts-1, te)
281
+ when '\.'; emit(:escape, :dot, text)
282
+ when '\|'; emit(:escape, :alternation, text)
283
+ when '\^'; emit(:escape, :bol, text)
284
+ when '\$'; emit(:escape, :eol, text)
285
+ when '\?'; emit(:escape, :zero_or_one, text)
286
+ when '\*'; emit(:escape, :zero_or_more, text)
287
+ when '\+'; emit(:escape, :one_or_more, text)
288
+ when '\('; emit(:escape, :group_open, text)
289
+ when '\)'; emit(:escape, :group_close, text)
290
+ when '\{'; emit(:escape, :interval_open, text)
291
+ when '\}'; emit(:escape, :interval_close, text)
292
+ when '\['; emit(:escape, :set_open, text)
293
+ when '\]'; emit(:escape, :set_close, text)
297
294
  when "\\\\";
298
- emit(:escape, :backslash, text, ts-1, te)
295
+ emit(:escape, :backslash, text)
299
296
  end
300
297
  fret;
301
298
  };
@@ -303,31 +300,31 @@
303
300
  escaped_ascii > (escaped_alpha, 7) {
304
301
  # \b is emitted as backspace only when inside a character set, otherwise
305
302
  # it is a word boundary anchor. A syntax might "normalize" it if needed.
306
- case text = text(data, ts, te, 1).first
307
- when '\a'; emit(:escape, :bell, text, ts-1, te)
308
- when '\b'; emit(:escape, :backspace, text, ts-1, te)
309
- when '\e'; emit(:escape, :escape, text, ts-1, te)
310
- when '\f'; emit(:escape, :form_feed, text, ts-1, te)
311
- when '\n'; emit(:escape, :newline, text, ts-1, te)
312
- when '\r'; emit(:escape, :carriage, text, ts-1, te)
313
- when '\t'; emit(:escape, :tab, text, ts-1, te)
314
- when '\v'; emit(:escape, :vertical_tab, text, ts-1, te)
303
+ case text = copy(data, ts-1, te)
304
+ when '\a'; emit(:escape, :bell, text)
305
+ when '\b'; emit(:escape, :backspace, text)
306
+ when '\e'; emit(:escape, :escape, text)
307
+ when '\f'; emit(:escape, :form_feed, text)
308
+ when '\n'; emit(:escape, :newline, text)
309
+ when '\r'; emit(:escape, :carriage, text)
310
+ when '\t'; emit(:escape, :tab, text)
311
+ when '\v'; emit(:escape, :vertical_tab, text)
315
312
  end
316
313
  fret;
317
314
  };
318
315
 
319
316
  codepoint_sequence > (escaped_alpha, 6) $eof(premature_end_error) {
320
- text = text(data, ts, te, 1).first
317
+ text = copy(data, ts-1, te)
321
318
  if text[2].chr == '{'
322
- emit(:escape, :codepoint_list, text, ts-1, te)
319
+ emit(:escape, :codepoint_list, text)
323
320
  else
324
- emit(:escape, :codepoint, text, ts-1, te)
321
+ emit(:escape, :codepoint, text)
325
322
  end
326
323
  fret;
327
324
  };
328
325
 
329
- hex_sequence > (escaped_alpha, 5) $eof(premature_end_error) {
330
- emit(:escape, :hex, *text(data, ts, te, 1))
326
+ hex_sequence > (escaped_alpha, 5) @eof(premature_end_error) {
327
+ emit(:escape, :hex, copy(data, ts-1, te))
331
328
  fret;
332
329
  };
333
330
 
@@ -357,8 +354,8 @@
357
354
  fcall unicode_property;
358
355
  };
359
356
 
360
- (any -- non_literal_escape) > (escaped_alpha, 1) {
361
- emit(:escape, :literal, *text(data, ts, te, 1))
357
+ (any -- non_literal_escape) | utf8_multibyte > (escaped_alpha, 1) {
358
+ emit(:escape, :literal, copy(data, ts-1, te))
362
359
  fret;
363
360
  };
364
361
  *|;
@@ -368,9 +365,9 @@
368
365
  # --------------------------------------------------------------------------
369
366
  conditional_expression := |*
370
367
  group_lookup . ')' {
371
- text = text(data, ts, te-1).first
372
- emit(:conditional, :condition, text, ts, te-1)
373
- emit(:conditional, :condition_close, ')', te-1, te)
368
+ text = copy(data, ts, te-1)
369
+ emit(:conditional, :condition, text)
370
+ emit(:conditional, :condition_close, ')')
374
371
  };
375
372
 
376
373
  any {
@@ -387,39 +384,39 @@
387
384
  # Meta characters
388
385
  # ------------------------------------------------------------------------
389
386
  dot {
390
- emit(:meta, :dot, *text(data, ts, te))
387
+ emit(:meta, :dot, copy(data, ts, te))
391
388
  };
392
389
 
393
390
  alternation {
394
391
  if conditional_stack.last == group_depth
395
- emit(:conditional, :separator, *text(data, ts, te))
392
+ emit(:conditional, :separator, copy(data, ts, te))
396
393
  else
397
- emit(:meta, :alternation, *text(data, ts, te))
394
+ emit(:meta, :alternation, copy(data, ts, te))
398
395
  end
399
396
  };
400
397
 
401
398
  # Anchors
402
399
  # ------------------------------------------------------------------------
403
400
  beginning_of_line {
404
- emit(:anchor, :bol, *text(data, ts, te))
401
+ emit(:anchor, :bol, copy(data, ts, te))
405
402
  };
406
403
 
407
404
  end_of_line {
408
- emit(:anchor, :eol, *text(data, ts, te))
405
+ emit(:anchor, :eol, copy(data, ts, te))
409
406
  };
410
407
 
411
408
  backslash . keep_mark > (backslashed, 4) {
412
- emit(:keep, :mark, *text(data, ts, te))
409
+ emit(:keep, :mark, copy(data, ts, te))
413
410
  };
414
411
 
415
412
  backslash . anchor_char > (backslashed, 3) {
416
- case text = text(data, ts, te).first
417
- when '\\A'; emit(:anchor, :bos, text, ts, te)
418
- when '\\z'; emit(:anchor, :eos, text, ts, te)
419
- when '\\Z'; emit(:anchor, :eos_ob_eol, text, ts, te)
420
- when '\\b'; emit(:anchor, :word_boundary, text, ts, te)
421
- when '\\B'; emit(:anchor, :nonword_boundary, text, ts, te)
422
- when '\\G'; emit(:anchor, :match_start, text, ts, te)
413
+ case text = copy(data, ts, te)
414
+ when '\\A'; emit(:anchor, :bos, text)
415
+ when '\\z'; emit(:anchor, :eos, text)
416
+ when '\\Z'; emit(:anchor, :eos_ob_eol, text)
417
+ when '\\b'; emit(:anchor, :word_boundary, text)
418
+ when '\\B'; emit(:anchor, :nonword_boundary, text)
419
+ when '\\G'; emit(:anchor, :match_start, text)
423
420
  end
424
421
  };
425
422
 
@@ -430,7 +427,7 @@
430
427
  # Character sets
431
428
  # ------------------------------------------------------------------------
432
429
  set_open >set_opened {
433
- emit(:set, :open, *text(data, ts, te))
430
+ emit(:set, :open, copy(data, ts, te))
434
431
  fcall character_set;
435
432
  };
436
433
 
@@ -439,12 +436,12 @@
439
436
  # (?(condition)Y|N) conditional expression
440
437
  # ------------------------------------------------------------------------
441
438
  conditional {
442
- text = text(data, ts, te).first
439
+ text = copy(data, ts, te)
443
440
 
444
441
  conditional_stack << group_depth
445
442
 
446
- emit(:conditional, :open, text[0..-2], ts, te-1)
447
- emit(:conditional, :condition_open, '(', te-1, te)
443
+ emit(:conditional, :open, text[0..-2])
444
+ emit(:conditional, :condition_open, '(')
448
445
  fcall conditional_expression;
449
446
  };
450
447
 
@@ -455,7 +452,7 @@
455
452
  # correct closing count.
456
453
  # ------------------------------------------------------------------------
457
454
  group_open . group_comment $group_closed {
458
- emit(:group, :comment, *text(data, ts, te))
455
+ emit(:group, :comment, copy(data, ts, te))
459
456
  };
460
457
 
461
458
  # Expression options:
@@ -470,11 +467,11 @@
470
467
  # (?imxdau-imx:subexp) option on/off for subexp
471
468
  # ------------------------------------------------------------------------
472
469
  group_open . group_options >group_opened {
473
- text = text(data, ts, te).first
470
+ text = copy(data, ts, te)
474
471
  if text[2..-1] =~ /([^\-mixdau:]|^$)|-.*([dau])/
475
472
  raise InvalidGroupOption.new($1 || "-#{$2}", text)
476
473
  end
477
- emit_options(text, ts, te)
474
+ emit_options(text)
478
475
  };
479
476
 
480
477
  # Assertions
@@ -484,11 +481,11 @@
484
481
  # (?<!subexp) negative look-behind
485
482
  # ------------------------------------------------------------------------
486
483
  group_open . assertion_type >group_opened {
487
- case text = text(data, ts, te).first
488
- when '(?='; emit(:assertion, :lookahead, text, ts, te)
489
- when '(?!'; emit(:assertion, :nlookahead, text, ts, te)
490
- when '(?<='; emit(:assertion, :lookbehind, text, ts, te)
491
- when '(?<!'; emit(:assertion, :nlookbehind, text, ts, te)
484
+ case text = copy(data, ts, te)
485
+ when '(?='; emit(:assertion, :lookahead, text)
486
+ when '(?!'; emit(:assertion, :nlookahead, text)
487
+ when '(?<='; emit(:assertion, :lookbehind, text)
488
+ when '(?<!'; emit(:assertion, :nlookbehind, text)
492
489
  end
493
490
  };
494
491
 
@@ -501,32 +498,32 @@
501
498
  # (subexp) captured group
502
499
  # ------------------------------------------------------------------------
503
500
  group_open . group_type >group_opened {
504
- case text = text(data, ts, te).first
505
- when '(?:'; emit(:group, :passive, text, ts, te)
506
- when '(?>'; emit(:group, :atomic, text, ts, te)
507
- when '(?~'; emit(:group, :absence, text, ts, te)
501
+ case text = copy(data, ts, te)
502
+ when '(?:'; emit(:group, :passive, text)
503
+ when '(?>'; emit(:group, :atomic, text)
504
+ when '(?~'; emit(:group, :absence, text)
508
505
 
509
506
  when /^\(\?(?:<>|'')/
510
507
  validation_error(:group, 'named group', 'name is empty')
511
508
 
512
- when /^\(\?<\w*>/
513
- emit(:group, :named_ab, text, ts, te)
509
+ when /^\(\?<[^>]+>/
510
+ emit(:group, :named_ab, text)
514
511
 
515
- when /^\(\?'\w*'/
516
- emit(:group, :named_sq, text, ts, te)
512
+ when /^\(\?'[^']+'/
513
+ emit(:group, :named_sq, text)
517
514
 
518
515
  end
519
516
  };
520
517
 
521
518
  group_open @group_opened {
522
- text = text(data, ts, te).first
523
- emit(:group, :capture, text, ts, te)
519
+ text = copy(data, ts, te)
520
+ emit(:group, :capture, text)
524
521
  };
525
522
 
526
523
  group_close @group_closed {
527
524
  if conditional_stack.last == group_depth + 1
528
525
  conditional_stack.pop
529
- emit(:conditional, :close, *text(data, ts, te))
526
+ emit(:conditional, :close, copy(data, ts, te))
530
527
  else
531
528
  if spacing_stack.length > 1 &&
532
529
  spacing_stack.last[:depth] == group_depth + 1
@@ -534,7 +531,7 @@
534
531
  self.free_spacing = spacing_stack.last[:free_spacing]
535
532
  end
536
533
 
537
- emit(:group, :close, *text(data, ts, te))
534
+ emit(:group, :close, copy(data, ts, te))
538
535
  end
539
536
  };
540
537
 
@@ -542,63 +539,65 @@
542
539
  # Group backreference, named and numbered
543
540
  # ------------------------------------------------------------------------
544
541
  backslash . (group_name_ref | group_number_ref) > (backslashed, 4) {
545
- case text = text(data, ts, te).first
542
+ case text = copy(data, ts, te)
546
543
  when /^\\([gk])(<>|'')/ # angle brackets
547
544
  validation_error(:backref, 'ref/call', 'ref ID is empty')
548
545
 
549
- when /^\\([gk])<[^\d+-]\w*>/ # angle-brackets
546
+ # TODO: finer quirks of choosing recursive or non-recursive refs/calls.
547
+ # e.g.: `a-1` is a valid group id: 'aa'[/(?<a-1>a)\g<a-1>/] # => 'aa'
548
+ when /^\\([gk])<[^\p{digit}+\->][^>+\-]*>/ # angle-brackets
550
549
  if $1 == 'k'
551
- emit(:backref, :name_ref_ab, text, ts, te)
550
+ emit(:backref, :name_ref_ab, text)
552
551
  else
553
- emit(:backref, :name_call_ab, text, ts, te)
552
+ emit(:backref, :name_call_ab, text)
554
553
  end
555
554
 
556
- when /^\\([gk])'[^\d+-]\w*'/ #single quotes
555
+ when /^\\([gk])'[^\p{digit}+\-'][^'+\-]*'/ # single quotes
557
556
  if $1 == 'k'
558
- emit(:backref, :name_ref_sq, text, ts, te)
557
+ emit(:backref, :name_ref_sq, text)
559
558
  else
560
- emit(:backref, :name_call_sq, text, ts, te)
559
+ emit(:backref, :name_call_sq, text)
561
560
  end
562
561
 
563
562
  when /^\\([gk])<\d+>/ # angle-brackets
564
563
  if $1 == 'k'
565
- emit(:backref, :number_ref_ab, text, ts, te)
564
+ emit(:backref, :number_ref_ab, text)
566
565
  else
567
- emit(:backref, :number_call_ab, text, ts, te)
566
+ emit(:backref, :number_call_ab, text)
568
567
  end
569
568
 
570
569
  when /^\\([gk])'\d+'/ # single quotes
571
570
  if $1 == 'k'
572
- emit(:backref, :number_ref_sq, text, ts, te)
571
+ emit(:backref, :number_ref_sq, text)
573
572
  else
574
- emit(:backref, :number_call_sq, text, ts, te)
573
+ emit(:backref, :number_call_sq, text)
575
574
  end
576
575
 
577
576
  when /^\\(?:g<\+|g<-|(k)<-)\d+>/ # angle-brackets
578
577
  if $1 == 'k'
579
- emit(:backref, :number_rel_ref_ab, text, ts, te)
578
+ emit(:backref, :number_rel_ref_ab, text)
580
579
  else
581
- emit(:backref, :number_rel_call_ab, text, ts, te)
580
+ emit(:backref, :number_rel_call_ab, text)
582
581
  end
583
582
 
584
583
  when /^\\(?:g'\+|g'-|(k)'-)\d+'/ # single quotes
585
584
  if $1 == 'k'
586
- emit(:backref, :number_rel_ref_sq, text, ts, te)
585
+ emit(:backref, :number_rel_ref_sq, text)
587
586
  else
588
- emit(:backref, :number_rel_call_sq, text, ts, te)
587
+ emit(:backref, :number_rel_call_sq, text)
589
588
  end
590
589
 
591
- when /^\\k<[^\d+\-]\w*[+\-]\d+>/ # angle-brackets
592
- emit(:backref, :name_recursion_ref_ab, text, ts, te)
590
+ when /^\\k<[^\p{digit}+\->][^>]*[+\-]\d+>/ # angle-brackets
591
+ emit(:backref, :name_recursion_ref_ab, text)
593
592
 
594
- when /^\\k'[^\d+\-]\w*[+\-]\d+'/ # single-quotes
595
- emit(:backref, :name_recursion_ref_sq, text, ts, te)
593
+ when /^\\k'[^\p{digit}+\-'][^']*[+\-]\d+'/ # single-quotes
594
+ emit(:backref, :name_recursion_ref_sq, text)
596
595
 
597
596
  when /^\\([gk])<[+\-]?\d+[+\-]\d+>/ # angle-brackets
598
- emit(:backref, :number_recursion_ref_ab, text, ts, te)
597
+ emit(:backref, :number_recursion_ref_ab, text)
599
598
 
600
599
  when /^\\([gk])'[+\-]?\d+[+\-]\d+'/ # single-quotes
601
- emit(:backref, :number_recursion_ref_sq, text, ts, te)
600
+ emit(:backref, :number_recursion_ref_sq, text)
602
601
 
603
602
  end
604
603
  };
@@ -607,31 +606,31 @@
607
606
  # Quantifiers
608
607
  # ------------------------------------------------------------------------
609
608
  zero_or_one {
610
- case text = text(data, ts, te).first
611
- when '?' ; emit(:quantifier, :zero_or_one, text, ts, te)
612
- when '??'; emit(:quantifier, :zero_or_one_reluctant, text, ts, te)
613
- when '?+'; emit(:quantifier, :zero_or_one_possessive, text, ts, te)
609
+ case text = copy(data, ts, te)
610
+ when '?' ; emit(:quantifier, :zero_or_one, text)
611
+ when '??'; emit(:quantifier, :zero_or_one_reluctant, text)
612
+ when '?+'; emit(:quantifier, :zero_or_one_possessive, text)
614
613
  end
615
614
  };
616
615
 
617
616
  zero_or_more {
618
- case text = text(data, ts, te).first
619
- when '*' ; emit(:quantifier, :zero_or_more, text, ts, te)
620
- when '*?'; emit(:quantifier, :zero_or_more_reluctant, text, ts, te)
621
- when '*+'; emit(:quantifier, :zero_or_more_possessive, text, ts, te)
617
+ case text = copy(data, ts, te)
618
+ when '*' ; emit(:quantifier, :zero_or_more, text)
619
+ when '*?'; emit(:quantifier, :zero_or_more_reluctant, text)
620
+ when '*+'; emit(:quantifier, :zero_or_more_possessive, text)
622
621
  end
623
622
  };
624
623
 
625
624
  one_or_more {
626
- case text = text(data, ts, te).first
627
- when '+' ; emit(:quantifier, :one_or_more, text, ts, te)
628
- when '+?'; emit(:quantifier, :one_or_more_reluctant, text, ts, te)
629
- when '++'; emit(:quantifier, :one_or_more_possessive, text, ts, te)
625
+ case text = copy(data, ts, te)
626
+ when '+' ; emit(:quantifier, :one_or_more, text)
627
+ when '+?'; emit(:quantifier, :one_or_more_reluctant, text)
628
+ when '++'; emit(:quantifier, :one_or_more_possessive, text)
630
629
  end
631
630
  };
632
631
 
633
632
  quantifier_interval {
634
- emit(:quantifier, :interval, *text(data, ts, te))
633
+ emit(:quantifier, :interval, copy(data, ts, te))
635
634
  };
636
635
 
637
636
  # Catch unmatched curly braces as literals
@@ -647,15 +646,17 @@
647
646
 
648
647
  comment {
649
648
  if free_spacing
650
- emit(:free_space, :comment, *text(data, ts, te))
649
+ emit(:free_space, :comment, copy(data, ts, te))
651
650
  else
652
- append_literal(data, ts, te)
651
+ # consume only the pound sign (#) and backtrack to do regular scanning
652
+ append_literal(data, ts, ts + 1)
653
+ fexec ts + 1;
653
654
  end
654
655
  };
655
656
 
656
657
  space+ {
657
658
  if free_spacing
658
- emit(:free_space, :whitespace, *text(data, ts, te))
659
+ emit(:free_space, :whitespace, copy(data, ts, te))
659
660
  else
660
661
  append_literal(data, ts, te)
661
662
  end
@@ -664,11 +665,7 @@
664
665
  # Literal: any run of ASCII (printable or non-printable), and/or UTF-8,
665
666
  # except meta characters.
666
667
  # ------------------------------------------------------------------------
667
- (ascii_print -- space)+ |
668
- ascii_nonprint+ |
669
- utf8_2_byte+ |
670
- utf8_3_byte+ |
671
- utf8_4_byte+ {
668
+ (ascii_print -- space)+ | ascii_nonprint+ | utf8_multibyte+ {
672
669
  append_literal(data, ts, te)
673
670
  };
674
671
 
@@ -737,21 +734,16 @@ class Regexp::Scanner
737
734
  #
738
735
  # This method may raise errors if a syntax error is encountered.
739
736
  # --------------------------------------------------------------------------
740
- def self.scan(input_object, &block)
741
- new.scan(input_object, &block)
737
+ def self.scan(input_object, options: nil, &block)
738
+ new.scan(input_object, options: options, &block)
742
739
  end
743
740
 
744
- def scan(input_object, &block)
741
+ def scan(input_object, options: nil, &block)
745
742
  self.literal = nil
746
743
  stack = []
747
744
 
748
- if input_object.is_a?(Regexp)
749
- input = input_object.source
750
- self.free_spacing = (input_object.options & Regexp::EXTENDED != 0)
751
- else
752
- input = input_object
753
- self.free_spacing = false
754
- end
745
+ input = input_object.is_a?(Regexp) ? input_object.source : input_object
746
+ self.free_spacing = free_spacing?(input_object, options)
755
747
  self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
756
748
 
757
749
  data = input.unpack("c*") if input.is_a?(String)
@@ -763,6 +755,7 @@ class Regexp::Scanner
763
755
  self.set_depth = 0
764
756
  self.group_depth = 0
765
757
  self.conditional_stack = []
758
+ self.char_pos = 0
766
759
 
767
760
  %% write data;
768
761
  %% write init;
@@ -772,7 +765,7 @@ class Regexp::Scanner
772
765
  testEof = testEof
773
766
 
774
767
  if cs == re_scanner_error
775
- text = ts ? copy(data, ts-1..-1) : data.pack('c*')
768
+ text = copy(data, ts ? ts-1 : 0, -1)
776
769
  raise ScannerError.new("Scan error at '#{text}'")
777
770
  end
778
771
 
@@ -800,22 +793,41 @@ class Regexp::Scanner
800
793
  end
801
794
 
802
795
  # Emits an array with the details of the scanned pattern
803
- def emit(type, token, text, ts, te)
796
+ def emit(type, token, text)
804
797
  #puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
805
798
 
806
799
  emit_literal if literal
807
800
 
801
+ # Ragel runs with byte-based indices (ts, te). These are of little value to
802
+ # end-users, so we keep track of char-based indices and emit those instead.
803
+ ts_char_pos = char_pos
804
+ te_char_pos = char_pos + text.length
805
+
808
806
  if block
809
- block.call type, token, text, ts, te
807
+ block.call type, token, text, ts_char_pos, te_char_pos
810
808
  end
811
809
 
812
- tokens << [type, token, text, ts, te]
810
+ tokens << [type, token, text, ts_char_pos, te_char_pos]
811
+
812
+ self.char_pos = te_char_pos
813
813
  end
814
814
 
815
815
  private
816
816
 
817
817
  attr_accessor :tokens, :literal, :block, :free_spacing, :spacing_stack,
818
- :group_depth, :set_depth, :conditional_stack
818
+ :group_depth, :set_depth, :conditional_stack, :char_pos
819
+
820
+ def free_spacing?(input_object, options)
821
+ if options && !input_object.is_a?(String)
822
+ raise ArgumentError, 'options cannot be supplied unless scanning a String'
823
+ end
824
+
825
+ options = input_object.options if input_object.is_a?(::Regexp)
826
+
827
+ return false unless options
828
+
829
+ options & Regexp::EXTENDED != 0
830
+ end
819
831
 
820
832
  def in_group?
821
833
  group_depth > 0
@@ -826,36 +838,25 @@ class Regexp::Scanner
826
838
  end
827
839
 
828
840
  # Copy from ts to te from data as text
829
- def copy(data, range)
830
- data[range].pack('c*')
831
- end
832
-
833
- # Copy from ts to te from data as text, returning an array with the text
834
- # and the offsets used to copy it.
835
- def text(data, ts, te, soff = 0)
836
- [copy(data, ts-soff..te-1), ts-soff, te]
841
+ def copy(data, ts, te)
842
+ data[ts...te].pack('c*').force_encoding('utf-8')
837
843
  end
838
844
 
839
845
  # Appends one or more characters to the literal buffer, to be emitted later
840
- # by a call to emit_literal. Contents can be a mix of ASCII and UTF-8.
846
+ # by a call to emit_literal.
841
847
  def append_literal(data, ts, te)
842
848
  self.literal = literal || []
843
- literal << text(data, ts, te)
849
+ literal << copy(data, ts, te)
844
850
  end
845
851
 
846
- # Emits the literal run collected by calls to the append_literal method,
847
- # using the total start (ts) and end (te) offsets of the run.
852
+ # Emits the literal run collected by calls to the append_literal method.
848
853
  def emit_literal
849
- ts, te = literal.first[1], literal.last[2]
850
- text = literal.map {|t| t[0]}.join
851
-
852
- text.force_encoding('utf-8') if text.respond_to?(:force_encoding)
853
-
854
+ text = literal.join
854
855
  self.literal = nil
855
- emit(:literal, :literal, text, ts, te)
856
+ emit(:literal, :literal, text)
856
857
  end
857
858
 
858
- def emit_options(text, ts, te)
859
+ def emit_options(text)
859
860
  token = nil
860
861
 
861
862
  # Ruby allows things like '(?-xxxx)' or '(?xx-xx--xx-:abc)'.
@@ -881,14 +882,14 @@ class Regexp::Scanner
881
882
  token = :options_switch
882
883
  end
883
884
 
884
- emit(:group, token, text, ts, te)
885
+ emit(:group, token, text)
885
886
  end
886
887
 
887
888
  def emit_meta_control_sequence(data, ts, te, token)
888
889
  if data.last < 0x00 || data.last > 0x7F
889
890
  validation_error(:sequence, 'escape', token.to_s)
890
891
  end
891
- emit(:escape, token, *text(data, ts, te, 1))
892
+ emit(:escape, token, copy(data, ts-1, te))
892
893
  end
893
894
 
894
895
  # Centralizes and unifies the handling of validation related