regexp_parser 1.7.1 → 2.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -10,17 +10,17 @@
10
10
  # --------------------------------------------------------------------------
11
11
  char_type := |*
12
12
  char_type_char {
13
- case text = text(data, ts, te, 1).first
14
- when '\d'; emit(:type, :digit, text, ts - 1, te)
15
- when '\D'; emit(:type, :nondigit, text, ts - 1, te)
16
- when '\h'; emit(:type, :hex, text, ts - 1, te)
17
- when '\H'; emit(:type, :nonhex, text, ts - 1, te)
18
- when '\s'; emit(:type, :space, text, ts - 1, te)
19
- when '\S'; emit(:type, :nonspace, text, ts - 1, te)
20
- when '\w'; emit(:type, :word, text, ts - 1, te)
21
- when '\W'; emit(:type, :nonword, text, ts - 1, te)
22
- when '\R'; emit(:type, :linebreak, text, ts - 1, te)
23
- when '\X'; emit(:type, :xgrapheme, text, ts - 1, te)
13
+ case text = copy(data, ts-1, te)
14
+ when '\d'; emit(:type, :digit, text)
15
+ when '\D'; emit(:type, :nondigit, text)
16
+ when '\h'; emit(:type, :hex, text)
17
+ when '\H'; emit(:type, :nonhex, text)
18
+ when '\s'; emit(:type, :space, text)
19
+ when '\S'; emit(:type, :nonspace, text)
20
+ when '\w'; emit(:type, :word, text)
21
+ when '\W'; emit(:type, :nonword, text)
22
+ when '\R'; emit(:type, :linebreak, text)
23
+ when '\X'; emit(:type, :xgrapheme, text)
24
24
  end
25
25
  fret;
26
26
  };
@@ -14,7 +14,7 @@
14
14
  unicode_property := |*
15
15
 
16
16
  property_sequence < eof(premature_property_end) {
17
- text = text(data, ts, te, 1).first
17
+ text = copy(data, ts-1, te)
18
18
  type = (text[1] == 'P') ^ (text[3] == '^') ? :nonproperty : :property
19
19
 
20
20
  name = data[ts+2..te-2].pack('c*').gsub(/[\^\s_\-]/, '').downcase
@@ -22,7 +22,7 @@
22
22
  token = self.class.short_prop_map[name] || self.class.long_prop_map[name]
23
23
  raise UnknownUnicodePropertyError.new(name) unless token
24
24
 
25
- self.emit(type, token.to_sym, text, ts-1, te)
25
+ self.emit(type, token.to_sym, text)
26
26
 
27
27
  fret;
28
28
  };
@@ -3,6 +3,11 @@
3
3
  include re_char_type "char_type.rl";
4
4
  include re_property "property.rl";
5
5
 
6
+ utf8_2_byte = (0xc2..0xdf 0x80..0xbf);
7
+ utf8_3_byte = (0xe0..0xef 0x80..0xbf 0x80..0xbf);
8
+ utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
9
+ utf8_multibyte = utf8_2_byte | utf8_3_byte | utf8_4_byte;
10
+
6
11
  dot = '.';
7
12
  backslash = '\\';
8
13
  alternation = '|';
@@ -21,7 +26,7 @@
21
26
  set_close = ']';
22
27
  brackets = set_open | set_close;
23
28
 
24
- comment = ('#' . [^\n]* . '\n');
29
+ comment = ('#' . [^\n]* . '\n'?);
25
30
 
26
31
  class_name_posix = 'alnum' | 'alpha' | 'blank' |
27
32
  'cntrl' | 'digit' | 'graph' |
@@ -90,18 +95,19 @@
90
95
  group_options = '?' . ( [^!#'():<=>~]+ . ':'? ) ?;
91
96
 
92
97
  group_ref = [gk];
93
- group_name_char = (alnum | '_');
94
- group_name_id = (group_name_char . (group_name_char+)?)?;
95
- group_number = '-'? . [1-9] . ([0-9]+)?;
98
+ group_name_id_ab = ([^0-9\->] | utf8_multibyte) . ([^>] | utf8_multibyte)*;
99
+ group_name_id_sq = ([^0-9\-'] | utf8_multibyte) . ([^'] | utf8_multibyte)*;
100
+ group_number = '-'? . [1-9] . [0-9]*;
96
101
  group_level = [+\-] . [0-9]+;
97
102
 
98
- group_name = ('<' . group_name_id . '>') | ("'" . group_name_id . "'");
103
+ group_name = ('<' . group_name_id_ab? . '>') |
104
+ ("'" . group_name_id_sq? . "'");
99
105
  group_lookup = group_name | group_number;
100
106
 
101
107
  group_named = ('?' . group_name );
102
108
 
103
- group_name_ref = group_ref . (('<' . group_name_id . group_level? '>') |
104
- ("'" . group_name_id . group_level? "'"));
109
+ group_name_ref = group_ref . (('<' . group_name_id_ab? . group_level? '>') |
110
+ ("'" . group_name_id_sq? . group_level? "'"));
105
111
 
106
112
  group_number_ref = group_ref . (('<' . group_number . group_level? '>') |
107
113
  ("'" . group_number . group_level? "'"));
@@ -120,28 +126,24 @@
120
126
 
121
127
  literal_delimiters = ']' | '}';
122
128
 
123
- ascii_print = ((0x20..0x7e) - meta_char);
129
+ ascii_print = ((0x20..0x7e) - meta_char - '#');
124
130
  ascii_nonprint = (0x01..0x1f | 0x7f);
125
131
 
126
- utf8_2_byte = (0xc2..0xdf 0x80..0xbf);
127
- utf8_3_byte = (0xe0..0xef 0x80..0xbf 0x80..0xbf);
128
- utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
129
-
130
132
  non_literal_escape = char_type_char | anchor_char | escaped_ascii |
131
- group_ref | keep_mark | [xucCM];
133
+ keep_mark | [xucCM];
132
134
 
133
135
  non_set_escape = (anchor_char - 'b') | group_ref | keep_mark |
134
136
  multi_codepoint_char_type | [0-9cCM];
135
137
 
136
138
  # EOF error, used where it can be detected
137
139
  action premature_end_error {
138
- text = ts ? copy(data, ts-1..-1) : data.pack('c*')
140
+ text = copy(data, ts ? ts-1 : 0, -1)
139
141
  raise PrematureEndError.new( text )
140
142
  }
141
143
 
142
144
  # Invalid sequence error, used from sequences, like escapes and sets
143
145
  action invalid_sequence_error {
144
- text = ts ? copy(data, ts-1..-1) : data.pack('c*')
146
+ text = copy(data, ts ? ts-1 : 0, -1)
145
147
  validation_error(:sequence, 'sequence', text)
146
148
  }
147
149
 
@@ -156,7 +158,7 @@
156
158
  # --------------------------------------------------------------------------
157
159
  character_set := |*
158
160
  set_close > (set_meta, 2) @set_closed {
159
- emit(:set, :close, *text(data, ts, te))
161
+ emit(:set, :close, copy(data, ts, te))
160
162
  if in_set?
161
163
  fret;
162
164
  else
@@ -165,8 +167,8 @@
165
167
  };
166
168
 
167
169
  '-]' @set_closed { # special case, emits two tokens
168
- emit(:literal, :literal, copy(data, ts..te-2), ts, te - 1)
169
- emit(:set, :close, copy(data, ts+1..te-1), ts + 1, te)
170
+ emit(:literal, :literal, copy(data, ts, te-1))
171
+ emit(:set, :close, copy(data, ts+1, te))
170
172
  if in_set?
171
173
  fret;
172
174
  else
@@ -175,33 +177,33 @@
175
177
  };
176
178
 
177
179
  '-&&' { # special case, emits two tokens
178
- emit(:literal, :literal, '-', ts, te)
179
- emit(:set, :intersection, '&&', ts, te)
180
+ emit(:literal, :literal, '-')
181
+ emit(:set, :intersection, '&&')
180
182
  };
181
183
 
182
184
  '^' {
183
- text = text(data, ts, te).first
185
+ text = copy(data, ts, te)
184
186
  if tokens.last[1] == :open
185
- emit(:set, :negate, text, ts, te)
187
+ emit(:set, :negate, text)
186
188
  else
187
- emit(:literal, :literal, text, ts, te)
189
+ emit(:literal, :literal, text)
188
190
  end
189
191
  };
190
192
 
191
193
  '-' {
192
- text = text(data, ts, te).first
194
+ text = copy(data, ts, te)
193
195
  # ranges can't start with a subset or intersection/negation/range operator
194
196
  if tokens.last[0] == :set
195
- emit(:literal, :literal, text, ts, te)
197
+ emit(:literal, :literal, text)
196
198
  else
197
- emit(:set, :range, text, ts, te)
199
+ emit(:set, :range, text)
198
200
  end
199
201
  };
200
202
 
201
203
  # Unlike ranges, intersections can start or end at set boundaries, whereupon
202
204
  # they match nothing: r = /[a&&]/; [r =~ ?a, r =~ ?&] # => [nil, nil]
203
205
  '&&' {
204
- emit(:set, :intersection, *text(data, ts, te))
206
+ emit(:set, :intersection, copy(data, ts, te))
205
207
  };
206
208
 
207
209
  backslash {
@@ -209,12 +211,12 @@
209
211
  };
210
212
 
211
213
  set_open >(open_bracket, 1) >set_opened {
212
- emit(:set, :open, *text(data, ts, te))
214
+ emit(:set, :open, copy(data, ts, te))
213
215
  fcall character_set;
214
216
  };
215
217
 
216
218
  class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error) {
217
- text = text(data, ts, te).first
219
+ text = copy(data, ts, te)
218
220
 
219
221
  type = :posixclass
220
222
  class_name = text[2..-3]
@@ -223,29 +225,24 @@
223
225
  type = :nonposixclass
224
226
  end
225
227
 
226
- emit(type, class_name.to_sym, text, ts, te)
228
+ emit(type, class_name.to_sym, text)
227
229
  };
228
230
 
229
231
  collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
230
- emit(:set, :collation, *text(data, ts, te))
232
+ emit(:set, :collation, copy(data, ts, te))
231
233
  };
232
234
 
233
235
  character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
234
- emit(:set, :equivalent, *text(data, ts, te))
236
+ emit(:set, :equivalent, copy(data, ts, te))
235
237
  };
236
238
 
237
239
  meta_char > (set_meta, 1) {
238
- emit(:literal, :literal, *text(data, ts, te))
240
+ emit(:literal, :literal, copy(data, ts, te))
239
241
  };
240
242
 
241
- any |
242
- ascii_nonprint |
243
- utf8_2_byte |
244
- utf8_3_byte |
245
- utf8_4_byte {
246
- char, *rest = *text(data, ts, te)
247
- char.force_encoding('utf-8') if char.respond_to?(:force_encoding)
248
- emit(:literal, :literal, char, *rest)
243
+ any | ascii_nonprint | utf8_multibyte {
244
+ text = copy(data, ts, te)
245
+ emit(:literal, :literal, text)
249
246
  };
250
247
  *|;
251
248
 
@@ -253,7 +250,7 @@
253
250
  # --------------------------------------------------------------------------
254
251
  set_escape_sequence := |*
255
252
  non_set_escape > (escaped_set_alpha, 2) {
256
- emit(:escape, :literal, *text(data, ts, te, 1))
253
+ emit(:escape, :literal, copy(data, ts-1, te))
257
254
  fret;
258
255
  };
259
256
 
@@ -269,33 +266,33 @@
269
266
  # --------------------------------------------------------------------------
270
267
  escape_sequence := |*
271
268
  [1-9] {
272
- text = text(data, ts, te, 1).first
273
- emit(:backref, :number, text, ts-1, te)
269
+ text = copy(data, ts-1, te)
270
+ emit(:backref, :number, text)
274
271
  fret;
275
272
  };
276
273
 
277
274
  octal_sequence {
278
- emit(:escape, :octal, *text(data, ts, te, 1))
275
+ emit(:escape, :octal, copy(data, ts-1, te))
279
276
  fret;
280
277
  };
281
278
 
282
279
  meta_char {
283
- case text = text(data, ts, te, 1).first
284
- when '\.'; emit(:escape, :dot, text, ts-1, te)
285
- when '\|'; emit(:escape, :alternation, text, ts-1, te)
286
- when '\^'; emit(:escape, :bol, text, ts-1, te)
287
- when '\$'; emit(:escape, :eol, text, ts-1, te)
288
- when '\?'; emit(:escape, :zero_or_one, text, ts-1, te)
289
- when '\*'; emit(:escape, :zero_or_more, text, ts-1, te)
290
- when '\+'; emit(:escape, :one_or_more, text, ts-1, te)
291
- when '\('; emit(:escape, :group_open, text, ts-1, te)
292
- when '\)'; emit(:escape, :group_close, text, ts-1, te)
293
- when '\{'; emit(:escape, :interval_open, text, ts-1, te)
294
- when '\}'; emit(:escape, :interval_close, text, ts-1, te)
295
- when '\['; emit(:escape, :set_open, text, ts-1, te)
296
- when '\]'; emit(:escape, :set_close, text, ts-1, te)
280
+ case text = copy(data, ts-1, te)
281
+ when '\.'; emit(:escape, :dot, text)
282
+ when '\|'; emit(:escape, :alternation, text)
283
+ when '\^'; emit(:escape, :bol, text)
284
+ when '\$'; emit(:escape, :eol, text)
285
+ when '\?'; emit(:escape, :zero_or_one, text)
286
+ when '\*'; emit(:escape, :zero_or_more, text)
287
+ when '\+'; emit(:escape, :one_or_more, text)
288
+ when '\('; emit(:escape, :group_open, text)
289
+ when '\)'; emit(:escape, :group_close, text)
290
+ when '\{'; emit(:escape, :interval_open, text)
291
+ when '\}'; emit(:escape, :interval_close, text)
292
+ when '\['; emit(:escape, :set_open, text)
293
+ when '\]'; emit(:escape, :set_close, text)
297
294
  when "\\\\";
298
- emit(:escape, :backslash, text, ts-1, te)
295
+ emit(:escape, :backslash, text)
299
296
  end
300
297
  fret;
301
298
  };
@@ -303,31 +300,31 @@
303
300
  escaped_ascii > (escaped_alpha, 7) {
304
301
  # \b is emitted as backspace only when inside a character set, otherwise
305
302
  # it is a word boundary anchor. A syntax might "normalize" it if needed.
306
- case text = text(data, ts, te, 1).first
307
- when '\a'; emit(:escape, :bell, text, ts-1, te)
308
- when '\b'; emit(:escape, :backspace, text, ts-1, te)
309
- when '\e'; emit(:escape, :escape, text, ts-1, te)
310
- when '\f'; emit(:escape, :form_feed, text, ts-1, te)
311
- when '\n'; emit(:escape, :newline, text, ts-1, te)
312
- when '\r'; emit(:escape, :carriage, text, ts-1, te)
313
- when '\t'; emit(:escape, :tab, text, ts-1, te)
314
- when '\v'; emit(:escape, :vertical_tab, text, ts-1, te)
303
+ case text = copy(data, ts-1, te)
304
+ when '\a'; emit(:escape, :bell, text)
305
+ when '\b'; emit(:escape, :backspace, text)
306
+ when '\e'; emit(:escape, :escape, text)
307
+ when '\f'; emit(:escape, :form_feed, text)
308
+ when '\n'; emit(:escape, :newline, text)
309
+ when '\r'; emit(:escape, :carriage, text)
310
+ when '\t'; emit(:escape, :tab, text)
311
+ when '\v'; emit(:escape, :vertical_tab, text)
315
312
  end
316
313
  fret;
317
314
  };
318
315
 
319
316
  codepoint_sequence > (escaped_alpha, 6) $eof(premature_end_error) {
320
- text = text(data, ts, te, 1).first
317
+ text = copy(data, ts-1, te)
321
318
  if text[2].chr == '{'
322
- emit(:escape, :codepoint_list, text, ts-1, te)
319
+ emit(:escape, :codepoint_list, text)
323
320
  else
324
- emit(:escape, :codepoint, text, ts-1, te)
321
+ emit(:escape, :codepoint, text)
325
322
  end
326
323
  fret;
327
324
  };
328
325
 
329
- hex_sequence > (escaped_alpha, 5) $eof(premature_end_error) {
330
- emit(:escape, :hex, *text(data, ts, te, 1))
326
+ hex_sequence > (escaped_alpha, 5) @eof(premature_end_error) {
327
+ emit(:escape, :hex, copy(data, ts-1, te))
331
328
  fret;
332
329
  };
333
330
 
@@ -357,8 +354,8 @@
357
354
  fcall unicode_property;
358
355
  };
359
356
 
360
- (any -- non_literal_escape) > (escaped_alpha, 1) {
361
- emit(:escape, :literal, *text(data, ts, te, 1))
357
+ (any -- non_literal_escape) | utf8_multibyte > (escaped_alpha, 1) {
358
+ emit(:escape, :literal, copy(data, ts-1, te))
362
359
  fret;
363
360
  };
364
361
  *|;
@@ -368,9 +365,9 @@
368
365
  # --------------------------------------------------------------------------
369
366
  conditional_expression := |*
370
367
  group_lookup . ')' {
371
- text = text(data, ts, te-1).first
372
- emit(:conditional, :condition, text, ts, te-1)
373
- emit(:conditional, :condition_close, ')', te-1, te)
368
+ text = copy(data, ts, te-1)
369
+ emit(:conditional, :condition, text)
370
+ emit(:conditional, :condition_close, ')')
374
371
  };
375
372
 
376
373
  any {
@@ -387,39 +384,39 @@
387
384
  # Meta characters
388
385
  # ------------------------------------------------------------------------
389
386
  dot {
390
- emit(:meta, :dot, *text(data, ts, te))
387
+ emit(:meta, :dot, copy(data, ts, te))
391
388
  };
392
389
 
393
390
  alternation {
394
391
  if conditional_stack.last == group_depth
395
- emit(:conditional, :separator, *text(data, ts, te))
392
+ emit(:conditional, :separator, copy(data, ts, te))
396
393
  else
397
- emit(:meta, :alternation, *text(data, ts, te))
394
+ emit(:meta, :alternation, copy(data, ts, te))
398
395
  end
399
396
  };
400
397
 
401
398
  # Anchors
402
399
  # ------------------------------------------------------------------------
403
400
  beginning_of_line {
404
- emit(:anchor, :bol, *text(data, ts, te))
401
+ emit(:anchor, :bol, copy(data, ts, te))
405
402
  };
406
403
 
407
404
  end_of_line {
408
- emit(:anchor, :eol, *text(data, ts, te))
405
+ emit(:anchor, :eol, copy(data, ts, te))
409
406
  };
410
407
 
411
408
  backslash . keep_mark > (backslashed, 4) {
412
- emit(:keep, :mark, *text(data, ts, te))
409
+ emit(:keep, :mark, copy(data, ts, te))
413
410
  };
414
411
 
415
412
  backslash . anchor_char > (backslashed, 3) {
416
- case text = text(data, ts, te).first
417
- when '\\A'; emit(:anchor, :bos, text, ts, te)
418
- when '\\z'; emit(:anchor, :eos, text, ts, te)
419
- when '\\Z'; emit(:anchor, :eos_ob_eol, text, ts, te)
420
- when '\\b'; emit(:anchor, :word_boundary, text, ts, te)
421
- when '\\B'; emit(:anchor, :nonword_boundary, text, ts, te)
422
- when '\\G'; emit(:anchor, :match_start, text, ts, te)
413
+ case text = copy(data, ts, te)
414
+ when '\\A'; emit(:anchor, :bos, text)
415
+ when '\\z'; emit(:anchor, :eos, text)
416
+ when '\\Z'; emit(:anchor, :eos_ob_eol, text)
417
+ when '\\b'; emit(:anchor, :word_boundary, text)
418
+ when '\\B'; emit(:anchor, :nonword_boundary, text)
419
+ when '\\G'; emit(:anchor, :match_start, text)
423
420
  end
424
421
  };
425
422
 
@@ -430,7 +427,7 @@
430
427
  # Character sets
431
428
  # ------------------------------------------------------------------------
432
429
  set_open >set_opened {
433
- emit(:set, :open, *text(data, ts, te))
430
+ emit(:set, :open, copy(data, ts, te))
434
431
  fcall character_set;
435
432
  };
436
433
 
@@ -439,12 +436,12 @@
439
436
  # (?(condition)Y|N) conditional expression
440
437
  # ------------------------------------------------------------------------
441
438
  conditional {
442
- text = text(data, ts, te).first
439
+ text = copy(data, ts, te)
443
440
 
444
441
  conditional_stack << group_depth
445
442
 
446
- emit(:conditional, :open, text[0..-2], ts, te-1)
447
- emit(:conditional, :condition_open, '(', te-1, te)
443
+ emit(:conditional, :open, text[0..-2])
444
+ emit(:conditional, :condition_open, '(')
448
445
  fcall conditional_expression;
449
446
  };
450
447
 
@@ -455,7 +452,7 @@
455
452
  # correct closing count.
456
453
  # ------------------------------------------------------------------------
457
454
  group_open . group_comment $group_closed {
458
- emit(:group, :comment, *text(data, ts, te))
455
+ emit(:group, :comment, copy(data, ts, te))
459
456
  };
460
457
 
461
458
  # Expression options:
@@ -470,11 +467,11 @@
470
467
  # (?imxdau-imx:subexp) option on/off for subexp
471
468
  # ------------------------------------------------------------------------
472
469
  group_open . group_options >group_opened {
473
- text = text(data, ts, te).first
470
+ text = copy(data, ts, te)
474
471
  if text[2..-1] =~ /([^\-mixdau:]|^$)|-.*([dau])/
475
472
  raise InvalidGroupOption.new($1 || "-#{$2}", text)
476
473
  end
477
- emit_options(text, ts, te)
474
+ emit_options(text)
478
475
  };
479
476
 
480
477
  # Assertions
@@ -484,11 +481,11 @@
484
481
  # (?<!subexp) negative look-behind
485
482
  # ------------------------------------------------------------------------
486
483
  group_open . assertion_type >group_opened {
487
- case text = text(data, ts, te).first
488
- when '(?='; emit(:assertion, :lookahead, text, ts, te)
489
- when '(?!'; emit(:assertion, :nlookahead, text, ts, te)
490
- when '(?<='; emit(:assertion, :lookbehind, text, ts, te)
491
- when '(?<!'; emit(:assertion, :nlookbehind, text, ts, te)
484
+ case text = copy(data, ts, te)
485
+ when '(?='; emit(:assertion, :lookahead, text)
486
+ when '(?!'; emit(:assertion, :nlookahead, text)
487
+ when '(?<='; emit(:assertion, :lookbehind, text)
488
+ when '(?<!'; emit(:assertion, :nlookbehind, text)
492
489
  end
493
490
  };
494
491
 
@@ -501,32 +498,32 @@
501
498
  # (subexp) captured group
502
499
  # ------------------------------------------------------------------------
503
500
  group_open . group_type >group_opened {
504
- case text = text(data, ts, te).first
505
- when '(?:'; emit(:group, :passive, text, ts, te)
506
- when '(?>'; emit(:group, :atomic, text, ts, te)
507
- when '(?~'; emit(:group, :absence, text, ts, te)
501
+ case text = copy(data, ts, te)
502
+ when '(?:'; emit(:group, :passive, text)
503
+ when '(?>'; emit(:group, :atomic, text)
504
+ when '(?~'; emit(:group, :absence, text)
508
505
 
509
506
  when /^\(\?(?:<>|'')/
510
507
  validation_error(:group, 'named group', 'name is empty')
511
508
 
512
- when /^\(\?<\w*>/
513
- emit(:group, :named_ab, text, ts, te)
509
+ when /^\(\?<[^>]+>/
510
+ emit(:group, :named_ab, text)
514
511
 
515
- when /^\(\?'\w*'/
516
- emit(:group, :named_sq, text, ts, te)
512
+ when /^\(\?'[^']+'/
513
+ emit(:group, :named_sq, text)
517
514
 
518
515
  end
519
516
  };
520
517
 
521
518
  group_open @group_opened {
522
- text = text(data, ts, te).first
523
- emit(:group, :capture, text, ts, te)
519
+ text = copy(data, ts, te)
520
+ emit(:group, :capture, text)
524
521
  };
525
522
 
526
523
  group_close @group_closed {
527
524
  if conditional_stack.last == group_depth + 1
528
525
  conditional_stack.pop
529
- emit(:conditional, :close, *text(data, ts, te))
526
+ emit(:conditional, :close, copy(data, ts, te))
530
527
  else
531
528
  if spacing_stack.length > 1 &&
532
529
  spacing_stack.last[:depth] == group_depth + 1
@@ -534,7 +531,7 @@
534
531
  self.free_spacing = spacing_stack.last[:free_spacing]
535
532
  end
536
533
 
537
- emit(:group, :close, *text(data, ts, te))
534
+ emit(:group, :close, copy(data, ts, te))
538
535
  end
539
536
  };
540
537
 
@@ -542,63 +539,65 @@
542
539
  # Group backreference, named and numbered
543
540
  # ------------------------------------------------------------------------
544
541
  backslash . (group_name_ref | group_number_ref) > (backslashed, 4) {
545
- case text = text(data, ts, te).first
542
+ case text = copy(data, ts, te)
546
543
  when /^\\([gk])(<>|'')/ # angle brackets
547
544
  validation_error(:backref, 'ref/call', 'ref ID is empty')
548
545
 
549
- when /^\\([gk])<[^\d+-]\w*>/ # angle-brackets
546
+ # TODO: finer quirks of choosing recursive or non-recursive refs/calls.
547
+ # e.g.: `a-1` is a valid group id: 'aa'[/(?<a-1>a)\g<a-1>/] # => 'aa'
548
+ when /^\\([gk])<[^\p{digit}+\->][^>+\-]*>/ # angle-brackets
550
549
  if $1 == 'k'
551
- emit(:backref, :name_ref_ab, text, ts, te)
550
+ emit(:backref, :name_ref_ab, text)
552
551
  else
553
- emit(:backref, :name_call_ab, text, ts, te)
552
+ emit(:backref, :name_call_ab, text)
554
553
  end
555
554
 
556
- when /^\\([gk])'[^\d+-]\w*'/ #single quotes
555
+ when /^\\([gk])'[^\p{digit}+\-'][^'+\-]*'/ # single quotes
557
556
  if $1 == 'k'
558
- emit(:backref, :name_ref_sq, text, ts, te)
557
+ emit(:backref, :name_ref_sq, text)
559
558
  else
560
- emit(:backref, :name_call_sq, text, ts, te)
559
+ emit(:backref, :name_call_sq, text)
561
560
  end
562
561
 
563
562
  when /^\\([gk])<\d+>/ # angle-brackets
564
563
  if $1 == 'k'
565
- emit(:backref, :number_ref_ab, text, ts, te)
564
+ emit(:backref, :number_ref_ab, text)
566
565
  else
567
- emit(:backref, :number_call_ab, text, ts, te)
566
+ emit(:backref, :number_call_ab, text)
568
567
  end
569
568
 
570
569
  when /^\\([gk])'\d+'/ # single quotes
571
570
  if $1 == 'k'
572
- emit(:backref, :number_ref_sq, text, ts, te)
571
+ emit(:backref, :number_ref_sq, text)
573
572
  else
574
- emit(:backref, :number_call_sq, text, ts, te)
573
+ emit(:backref, :number_call_sq, text)
575
574
  end
576
575
 
577
576
  when /^\\(?:g<\+|g<-|(k)<-)\d+>/ # angle-brackets
578
577
  if $1 == 'k'
579
- emit(:backref, :number_rel_ref_ab, text, ts, te)
578
+ emit(:backref, :number_rel_ref_ab, text)
580
579
  else
581
- emit(:backref, :number_rel_call_ab, text, ts, te)
580
+ emit(:backref, :number_rel_call_ab, text)
582
581
  end
583
582
 
584
583
  when /^\\(?:g'\+|g'-|(k)'-)\d+'/ # single quotes
585
584
  if $1 == 'k'
586
- emit(:backref, :number_rel_ref_sq, text, ts, te)
585
+ emit(:backref, :number_rel_ref_sq, text)
587
586
  else
588
- emit(:backref, :number_rel_call_sq, text, ts, te)
587
+ emit(:backref, :number_rel_call_sq, text)
589
588
  end
590
589
 
591
- when /^\\k<[^\d+\-]\w*[+\-]\d+>/ # angle-brackets
592
- emit(:backref, :name_recursion_ref_ab, text, ts, te)
590
+ when /^\\k<[^\p{digit}+\->][^>]*[+\-]\d+>/ # angle-brackets
591
+ emit(:backref, :name_recursion_ref_ab, text)
593
592
 
594
- when /^\\k'[^\d+\-]\w*[+\-]\d+'/ # single-quotes
595
- emit(:backref, :name_recursion_ref_sq, text, ts, te)
593
+ when /^\\k'[^\p{digit}+\-'][^']*[+\-]\d+'/ # single-quotes
594
+ emit(:backref, :name_recursion_ref_sq, text)
596
595
 
597
596
  when /^\\([gk])<[+\-]?\d+[+\-]\d+>/ # angle-brackets
598
- emit(:backref, :number_recursion_ref_ab, text, ts, te)
597
+ emit(:backref, :number_recursion_ref_ab, text)
599
598
 
600
599
  when /^\\([gk])'[+\-]?\d+[+\-]\d+'/ # single-quotes
601
- emit(:backref, :number_recursion_ref_sq, text, ts, te)
600
+ emit(:backref, :number_recursion_ref_sq, text)
602
601
 
603
602
  end
604
603
  };
@@ -607,31 +606,31 @@
607
606
  # Quantifiers
608
607
  # ------------------------------------------------------------------------
609
608
  zero_or_one {
610
- case text = text(data, ts, te).first
611
- when '?' ; emit(:quantifier, :zero_or_one, text, ts, te)
612
- when '??'; emit(:quantifier, :zero_or_one_reluctant, text, ts, te)
613
- when '?+'; emit(:quantifier, :zero_or_one_possessive, text, ts, te)
609
+ case text = copy(data, ts, te)
610
+ when '?' ; emit(:quantifier, :zero_or_one, text)
611
+ when '??'; emit(:quantifier, :zero_or_one_reluctant, text)
612
+ when '?+'; emit(:quantifier, :zero_or_one_possessive, text)
614
613
  end
615
614
  };
616
615
 
617
616
  zero_or_more {
618
- case text = text(data, ts, te).first
619
- when '*' ; emit(:quantifier, :zero_or_more, text, ts, te)
620
- when '*?'; emit(:quantifier, :zero_or_more_reluctant, text, ts, te)
621
- when '*+'; emit(:quantifier, :zero_or_more_possessive, text, ts, te)
617
+ case text = copy(data, ts, te)
618
+ when '*' ; emit(:quantifier, :zero_or_more, text)
619
+ when '*?'; emit(:quantifier, :zero_or_more_reluctant, text)
620
+ when '*+'; emit(:quantifier, :zero_or_more_possessive, text)
622
621
  end
623
622
  };
624
623
 
625
624
  one_or_more {
626
- case text = text(data, ts, te).first
627
- when '+' ; emit(:quantifier, :one_or_more, text, ts, te)
628
- when '+?'; emit(:quantifier, :one_or_more_reluctant, text, ts, te)
629
- when '++'; emit(:quantifier, :one_or_more_possessive, text, ts, te)
625
+ case text = copy(data, ts, te)
626
+ when '+' ; emit(:quantifier, :one_or_more, text)
627
+ when '+?'; emit(:quantifier, :one_or_more_reluctant, text)
628
+ when '++'; emit(:quantifier, :one_or_more_possessive, text)
630
629
  end
631
630
  };
632
631
 
633
632
  quantifier_interval {
634
- emit(:quantifier, :interval, *text(data, ts, te))
633
+ emit(:quantifier, :interval, copy(data, ts, te))
635
634
  };
636
635
 
637
636
  # Catch unmatched curly braces as literals
@@ -647,15 +646,17 @@
647
646
 
648
647
  comment {
649
648
  if free_spacing
650
- emit(:free_space, :comment, *text(data, ts, te))
649
+ emit(:free_space, :comment, copy(data, ts, te))
651
650
  else
652
- append_literal(data, ts, te)
651
+ # consume only the pound sign (#) and backtrack to do regular scanning
652
+ append_literal(data, ts, ts + 1)
653
+ fexec ts + 1;
653
654
  end
654
655
  };
655
656
 
656
657
  space+ {
657
658
  if free_spacing
658
- emit(:free_space, :whitespace, *text(data, ts, te))
659
+ emit(:free_space, :whitespace, copy(data, ts, te))
659
660
  else
660
661
  append_literal(data, ts, te)
661
662
  end
@@ -664,11 +665,7 @@
664
665
  # Literal: any run of ASCII (printable or non-printable), and/or UTF-8,
665
666
  # except meta characters.
666
667
  # ------------------------------------------------------------------------
667
- (ascii_print -- space)+ |
668
- ascii_nonprint+ |
669
- utf8_2_byte+ |
670
- utf8_3_byte+ |
671
- utf8_4_byte+ {
668
+ (ascii_print -- space)+ | ascii_nonprint+ | utf8_multibyte+ {
672
669
  append_literal(data, ts, te)
673
670
  };
674
671
 
@@ -737,21 +734,16 @@ class Regexp::Scanner
737
734
  #
738
735
  # This method may raise errors if a syntax error is encountered.
739
736
  # --------------------------------------------------------------------------
740
- def self.scan(input_object, &block)
741
- new.scan(input_object, &block)
737
+ def self.scan(input_object, options: nil, &block)
738
+ new.scan(input_object, options: options, &block)
742
739
  end
743
740
 
744
- def scan(input_object, &block)
741
+ def scan(input_object, options: nil, &block)
745
742
  self.literal = nil
746
743
  stack = []
747
744
 
748
- if input_object.is_a?(Regexp)
749
- input = input_object.source
750
- self.free_spacing = (input_object.options & Regexp::EXTENDED != 0)
751
- else
752
- input = input_object
753
- self.free_spacing = false
754
- end
745
+ input = input_object.is_a?(Regexp) ? input_object.source : input_object
746
+ self.free_spacing = free_spacing?(input_object, options)
755
747
  self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
756
748
 
757
749
  data = input.unpack("c*") if input.is_a?(String)
@@ -763,6 +755,7 @@ class Regexp::Scanner
763
755
  self.set_depth = 0
764
756
  self.group_depth = 0
765
757
  self.conditional_stack = []
758
+ self.char_pos = 0
766
759
 
767
760
  %% write data;
768
761
  %% write init;
@@ -772,7 +765,7 @@ class Regexp::Scanner
772
765
  testEof = testEof
773
766
 
774
767
  if cs == re_scanner_error
775
- text = ts ? copy(data, ts-1..-1) : data.pack('c*')
768
+ text = copy(data, ts ? ts-1 : 0, -1)
776
769
  raise ScannerError.new("Scan error at '#{text}'")
777
770
  end
778
771
 
@@ -800,22 +793,41 @@ class Regexp::Scanner
800
793
  end
801
794
 
802
795
  # Emits an array with the details of the scanned pattern
803
- def emit(type, token, text, ts, te)
796
+ def emit(type, token, text)
804
797
  #puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
805
798
 
806
799
  emit_literal if literal
807
800
 
801
+ # Ragel runs with byte-based indices (ts, te). These are of little value to
802
+ # end-users, so we keep track of char-based indices and emit those instead.
803
+ ts_char_pos = char_pos
804
+ te_char_pos = char_pos + text.length
805
+
808
806
  if block
809
- block.call type, token, text, ts, te
807
+ block.call type, token, text, ts_char_pos, te_char_pos
810
808
  end
811
809
 
812
- tokens << [type, token, text, ts, te]
810
+ tokens << [type, token, text, ts_char_pos, te_char_pos]
811
+
812
+ self.char_pos = te_char_pos
813
813
  end
814
814
 
815
815
  private
816
816
 
817
817
  attr_accessor :tokens, :literal, :block, :free_spacing, :spacing_stack,
818
- :group_depth, :set_depth, :conditional_stack
818
+ :group_depth, :set_depth, :conditional_stack, :char_pos
819
+
820
+ def free_spacing?(input_object, options)
821
+ if options && !input_object.is_a?(String)
822
+ raise ArgumentError, 'options cannot be supplied unless scanning a String'
823
+ end
824
+
825
+ options = input_object.options if input_object.is_a?(::Regexp)
826
+
827
+ return false unless options
828
+
829
+ options & Regexp::EXTENDED != 0
830
+ end
819
831
 
820
832
  def in_group?
821
833
  group_depth > 0
@@ -826,36 +838,25 @@ class Regexp::Scanner
826
838
  end
827
839
 
828
840
  # Copy from ts to te from data as text
829
- def copy(data, range)
830
- data[range].pack('c*')
831
- end
832
-
833
- # Copy from ts to te from data as text, returning an array with the text
834
- # and the offsets used to copy it.
835
- def text(data, ts, te, soff = 0)
836
- [copy(data, ts-soff..te-1), ts-soff, te]
841
+ def copy(data, ts, te)
842
+ data[ts...te].pack('c*').force_encoding('utf-8')
837
843
  end
838
844
 
839
845
  # Appends one or more characters to the literal buffer, to be emitted later
840
- # by a call to emit_literal. Contents can be a mix of ASCII and UTF-8.
846
+ # by a call to emit_literal.
841
847
  def append_literal(data, ts, te)
842
848
  self.literal = literal || []
843
- literal << text(data, ts, te)
849
+ literal << copy(data, ts, te)
844
850
  end
845
851
 
846
- # Emits the literal run collected by calls to the append_literal method,
847
- # using the total start (ts) and end (te) offsets of the run.
852
+ # Emits the literal run collected by calls to the append_literal method.
848
853
  def emit_literal
849
- ts, te = literal.first[1], literal.last[2]
850
- text = literal.map {|t| t[0]}.join
851
-
852
- text.force_encoding('utf-8') if text.respond_to?(:force_encoding)
853
-
854
+ text = literal.join
854
855
  self.literal = nil
855
- emit(:literal, :literal, text, ts, te)
856
+ emit(:literal, :literal, text)
856
857
  end
857
858
 
858
- def emit_options(text, ts, te)
859
+ def emit_options(text)
859
860
  token = nil
860
861
 
861
862
  # Ruby allows things like '(?-xxxx)' or '(?xx-xx--xx-:abc)'.
@@ -881,14 +882,14 @@ class Regexp::Scanner
881
882
  token = :options_switch
882
883
  end
883
884
 
884
- emit(:group, token, text, ts, te)
885
+ emit(:group, token, text)
885
886
  end
886
887
 
887
888
  def emit_meta_control_sequence(data, ts, te, token)
888
889
  if data.last < 0x00 || data.last > 0x7F
889
890
  validation_error(:sequence, 'escape', token.to_s)
890
891
  end
891
- emit(:escape, token, *text(data, ts, te, 1))
892
+ emit(:escape, token, copy(data, ts-1, te))
892
893
  end
893
894
 
894
895
  # Centralizes and unifies the handling of validation related