regexp_parser 1.8.1 → 2.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +70 -0
  3. data/Gemfile +1 -0
  4. data/README.md +12 -11
  5. data/Rakefile +2 -2
  6. data/lib/regexp_parser/expression.rb +10 -19
  7. data/lib/regexp_parser/expression/classes/free_space.rb +1 -1
  8. data/lib/regexp_parser/expression/classes/group.rb +22 -2
  9. data/lib/regexp_parser/expression/classes/root.rb +4 -16
  10. data/lib/regexp_parser/expression/methods/match_length.rb +2 -2
  11. data/lib/regexp_parser/expression/methods/traverse.rb +2 -2
  12. data/lib/regexp_parser/expression/quantifier.rb +9 -0
  13. data/lib/regexp_parser/expression/sequence.rb +0 -10
  14. data/lib/regexp_parser/lexer.rb +2 -2
  15. data/lib/regexp_parser/parser.rb +27 -2
  16. data/lib/regexp_parser/scanner.rb +1194 -1272
  17. data/lib/regexp_parser/scanner/char_type.rl +11 -11
  18. data/lib/regexp_parser/scanner/property.rl +2 -2
  19. data/lib/regexp_parser/scanner/scanner.rl +178 -186
  20. data/lib/regexp_parser/syntax.rb +4 -4
  21. data/lib/regexp_parser/syntax/any.rb +2 -2
  22. data/lib/regexp_parser/syntax/base.rb +1 -1
  23. data/lib/regexp_parser/syntax/version_lookup.rb +4 -4
  24. data/lib/regexp_parser/version.rb +1 -1
  25. data/spec/expression/base_spec.rb +10 -0
  26. data/spec/expression/subexpression_spec.rb +1 -1
  27. data/spec/expression/to_s_spec.rb +39 -31
  28. data/spec/lexer/literals_spec.rb +24 -49
  29. data/spec/parser/errors_spec.rb +1 -1
  30. data/spec/parser/escapes_spec.rb +1 -1
  31. data/spec/parser/quantifiers_spec.rb +16 -0
  32. data/spec/parser/set/ranges_spec.rb +3 -3
  33. data/spec/scanner/escapes_spec.rb +7 -0
  34. data/spec/scanner/groups_spec.rb +10 -1
  35. data/spec/scanner/literals_spec.rb +28 -38
  36. data/spec/scanner/quantifiers_spec.rb +18 -13
  37. data/spec/scanner/sets_spec.rb +23 -5
  38. data/spec/spec_helper.rb +1 -0
  39. metadata +56 -60
  40. data/spec/expression/root_spec.rb +0 -9
  41. data/spec/expression/sequence_spec.rb +0 -9
@@ -10,17 +10,17 @@
10
10
  # --------------------------------------------------------------------------
11
11
  char_type := |*
12
12
  char_type_char {
13
- case text = text(data, ts, te, 1).first
14
- when '\d'; emit(:type, :digit, text, ts - 1, te)
15
- when '\D'; emit(:type, :nondigit, text, ts - 1, te)
16
- when '\h'; emit(:type, :hex, text, ts - 1, te)
17
- when '\H'; emit(:type, :nonhex, text, ts - 1, te)
18
- when '\s'; emit(:type, :space, text, ts - 1, te)
19
- when '\S'; emit(:type, :nonspace, text, ts - 1, te)
20
- when '\w'; emit(:type, :word, text, ts - 1, te)
21
- when '\W'; emit(:type, :nonword, text, ts - 1, te)
22
- when '\R'; emit(:type, :linebreak, text, ts - 1, te)
23
- when '\X'; emit(:type, :xgrapheme, text, ts - 1, te)
13
+ case text = copy(data, ts-1, te)
14
+ when '\d'; emit(:type, :digit, text)
15
+ when '\D'; emit(:type, :nondigit, text)
16
+ when '\h'; emit(:type, :hex, text)
17
+ when '\H'; emit(:type, :nonhex, text)
18
+ when '\s'; emit(:type, :space, text)
19
+ when '\S'; emit(:type, :nonspace, text)
20
+ when '\w'; emit(:type, :word, text)
21
+ when '\W'; emit(:type, :nonword, text)
22
+ when '\R'; emit(:type, :linebreak, text)
23
+ when '\X'; emit(:type, :xgrapheme, text)
24
24
  end
25
25
  fret;
26
26
  };
@@ -14,7 +14,7 @@
14
14
  unicode_property := |*
15
15
 
16
16
  property_sequence < eof(premature_property_end) {
17
- text = text(data, ts, te, 1).first
17
+ text = copy(data, ts-1, te)
18
18
  type = (text[1] == 'P') ^ (text[3] == '^') ? :nonproperty : :property
19
19
 
20
20
  name = data[ts+2..te-2].pack('c*').gsub(/[\^\s_\-]/, '').downcase
@@ -22,7 +22,7 @@
22
22
  token = self.class.short_prop_map[name] || self.class.long_prop_map[name]
23
23
  raise UnknownUnicodePropertyError.new(name) unless token
24
24
 
25
- self.emit(type, token.to_sym, text, ts-1, te)
25
+ self.emit(type, token.to_sym, text)
26
26
 
27
27
  fret;
28
28
  };
@@ -3,6 +3,11 @@
3
3
  include re_char_type "char_type.rl";
4
4
  include re_property "property.rl";
5
5
 
6
+ utf8_2_byte = (0xc2..0xdf 0x80..0xbf);
7
+ utf8_3_byte = (0xe0..0xef 0x80..0xbf 0x80..0xbf);
8
+ utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
9
+ utf8_multibyte = utf8_2_byte | utf8_3_byte | utf8_4_byte;
10
+
6
11
  dot = '.';
7
12
  backslash = '\\';
8
13
  alternation = '|';
@@ -32,7 +37,7 @@
32
37
  class_posix = ('[:' . '^'? . class_name_posix . ':]');
33
38
 
34
39
 
35
- # these are not supported in ruby, and need verification
40
+ # these are not supported in ruby at the moment
36
41
  collating_sequence = '[.' . (alpha | [\-])+ . '.]';
37
42
  character_equivalent = '[=' . alpha . '=]';
38
43
 
@@ -90,18 +95,19 @@
90
95
  group_options = '?' . ( [^!#'():<=>~]+ . ':'? ) ?;
91
96
 
92
97
  group_ref = [gk];
93
- group_name_char = (alnum | '_');
94
- group_name_id = (group_name_char . (group_name_char+)?)?;
95
- group_number = '-'? . [1-9] . ([0-9]+)?;
98
+ group_name_id_ab = ([^0-9\->] | utf8_multibyte) . ([^>] | utf8_multibyte)*;
99
+ group_name_id_sq = ([^0-9\-'] | utf8_multibyte) . ([^'] | utf8_multibyte)*;
100
+ group_number = '-'? . [1-9] . [0-9]*;
96
101
  group_level = [+\-] . [0-9]+;
97
102
 
98
- group_name = ('<' . group_name_id . '>') | ("'" . group_name_id . "'");
103
+ group_name = ('<' . group_name_id_ab? . '>') |
104
+ ("'" . group_name_id_sq? . "'");
99
105
  group_lookup = group_name | group_number;
100
106
 
101
107
  group_named = ('?' . group_name );
102
108
 
103
- group_name_ref = group_ref . (('<' . group_name_id . group_level? '>') |
104
- ("'" . group_name_id . group_level? "'"));
109
+ group_name_ref = group_ref . (('<' . group_name_id_ab? . group_level? '>') |
110
+ ("'" . group_name_id_sq? . group_level? "'"));
105
111
 
106
112
  group_number_ref = group_ref . (('<' . group_number . group_level? '>') |
107
113
  ("'" . group_number . group_level? "'"));
@@ -123,10 +129,6 @@
123
129
  ascii_print = ((0x20..0x7e) - meta_char - '#');
124
130
  ascii_nonprint = (0x01..0x1f | 0x7f);
125
131
 
126
- utf8_2_byte = (0xc2..0xdf 0x80..0xbf);
127
- utf8_3_byte = (0xe0..0xef 0x80..0xbf 0x80..0xbf);
128
- utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
129
-
130
132
  non_literal_escape = char_type_char | anchor_char | escaped_ascii |
131
133
  keep_mark | [xucCM];
132
134
 
@@ -135,13 +137,13 @@
135
137
 
136
138
  # EOF error, used where it can be detected
137
139
  action premature_end_error {
138
- text = ts ? copy(data, ts-1..-1) : data.pack('c*')
140
+ text = copy(data, ts ? ts-1 : 0, -1)
139
141
  raise PrematureEndError.new( text )
140
142
  }
141
143
 
142
144
  # Invalid sequence error, used from sequences, like escapes and sets
143
145
  action invalid_sequence_error {
144
- text = ts ? copy(data, ts-1..-1) : data.pack('c*')
146
+ text = copy(data, ts ? ts-1 : 0, -1)
145
147
  validation_error(:sequence, 'sequence', text)
146
148
  }
147
149
 
@@ -156,7 +158,7 @@
156
158
  # --------------------------------------------------------------------------
157
159
  character_set := |*
158
160
  set_close > (set_meta, 2) @set_closed {
159
- emit(:set, :close, *text(data, ts, te))
161
+ emit(:set, :close, copy(data, ts, te))
160
162
  if in_set?
161
163
  fret;
162
164
  else
@@ -165,8 +167,8 @@
165
167
  };
166
168
 
167
169
  '-]' @set_closed { # special case, emits two tokens
168
- emit(:literal, :literal, copy(data, ts..te-2), ts, te - 1)
169
- emit(:set, :close, copy(data, ts+1..te-1), ts + 1, te)
170
+ emit(:literal, :literal, copy(data, ts, te-1))
171
+ emit(:set, :close, copy(data, ts+1, te))
170
172
  if in_set?
171
173
  fret;
172
174
  else
@@ -175,33 +177,33 @@
175
177
  };
176
178
 
177
179
  '-&&' { # special case, emits two tokens
178
- emit(:literal, :literal, '-', ts, te)
179
- emit(:set, :intersection, '&&', ts, te)
180
+ emit(:literal, :literal, '-')
181
+ emit(:set, :intersection, '&&')
180
182
  };
181
183
 
182
184
  '^' {
183
- text = text(data, ts, te).first
185
+ text = copy(data, ts, te)
184
186
  if tokens.last[1] == :open
185
- emit(:set, :negate, text, ts, te)
187
+ emit(:set, :negate, text)
186
188
  else
187
- emit(:literal, :literal, text, ts, te)
189
+ emit(:literal, :literal, text)
188
190
  end
189
191
  };
190
192
 
191
193
  '-' {
192
- text = text(data, ts, te).first
194
+ text = copy(data, ts, te)
193
195
  # ranges can't start with a subset or intersection/negation/range operator
194
196
  if tokens.last[0] == :set
195
- emit(:literal, :literal, text, ts, te)
197
+ emit(:literal, :literal, text)
196
198
  else
197
- emit(:set, :range, text, ts, te)
199
+ emit(:set, :range, text)
198
200
  end
199
201
  };
200
202
 
201
203
  # Unlike ranges, intersections can start or end at set boundaries, whereupon
202
204
  # they match nothing: r = /[a&&]/; [r =~ ?a, r =~ ?&] # => [nil, nil]
203
205
  '&&' {
204
- emit(:set, :intersection, *text(data, ts, te))
206
+ emit(:set, :intersection, copy(data, ts, te))
205
207
  };
206
208
 
207
209
  backslash {
@@ -209,12 +211,12 @@
209
211
  };
210
212
 
211
213
  set_open >(open_bracket, 1) >set_opened {
212
- emit(:set, :open, *text(data, ts, te))
214
+ emit(:set, :open, copy(data, ts, te))
213
215
  fcall character_set;
214
216
  };
215
217
 
216
218
  class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error) {
217
- text = text(data, ts, te).first
219
+ text = copy(data, ts, te)
218
220
 
219
221
  type = :posixclass
220
222
  class_name = text[2..-3]
@@ -223,29 +225,24 @@
223
225
  type = :nonposixclass
224
226
  end
225
227
 
226
- emit(type, class_name.to_sym, text, ts, te)
228
+ emit(type, class_name.to_sym, text)
227
229
  };
228
230
 
229
- collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
230
- emit(:set, :collation, *text(data, ts, te))
231
- };
232
-
233
- character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
234
- emit(:set, :equivalent, *text(data, ts, te))
235
- };
231
+ # These are not supported in ruby at the moment. Enable them if they are.
232
+ # collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
233
+ # emit(:set, :collation, copy(data, ts, te))
234
+ # };
235
+ # character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
236
+ # emit(:set, :equivalent, copy(data, ts, te))
237
+ # };
236
238
 
237
239
  meta_char > (set_meta, 1) {
238
- emit(:literal, :literal, *text(data, ts, te))
240
+ emit(:literal, :literal, copy(data, ts, te))
239
241
  };
240
242
 
241
- any |
242
- ascii_nonprint |
243
- utf8_2_byte |
244
- utf8_3_byte |
245
- utf8_4_byte {
246
- char, *rest = *text(data, ts, te)
247
- char.force_encoding('utf-8') if char.respond_to?(:force_encoding)
248
- emit(:literal, :literal, char, *rest)
243
+ any | ascii_nonprint | utf8_multibyte {
244
+ text = copy(data, ts, te)
245
+ emit(:literal, :literal, text)
249
246
  };
250
247
  *|;
251
248
 
@@ -253,7 +250,7 @@
253
250
  # --------------------------------------------------------------------------
254
251
  set_escape_sequence := |*
255
252
  non_set_escape > (escaped_set_alpha, 2) {
256
- emit(:escape, :literal, *text(data, ts, te, 1))
253
+ emit(:escape, :literal, copy(data, ts-1, te))
257
254
  fret;
258
255
  };
259
256
 
@@ -269,33 +266,33 @@
269
266
  # --------------------------------------------------------------------------
270
267
  escape_sequence := |*
271
268
  [1-9] {
272
- text = text(data, ts, te, 1).first
273
- emit(:backref, :number, text, ts-1, te)
269
+ text = copy(data, ts-1, te)
270
+ emit(:backref, :number, text)
274
271
  fret;
275
272
  };
276
273
 
277
274
  octal_sequence {
278
- emit(:escape, :octal, *text(data, ts, te, 1))
275
+ emit(:escape, :octal, copy(data, ts-1, te))
279
276
  fret;
280
277
  };
281
278
 
282
279
  meta_char {
283
- case text = text(data, ts, te, 1).first
284
- when '\.'; emit(:escape, :dot, text, ts-1, te)
285
- when '\|'; emit(:escape, :alternation, text, ts-1, te)
286
- when '\^'; emit(:escape, :bol, text, ts-1, te)
287
- when '\$'; emit(:escape, :eol, text, ts-1, te)
288
- when '\?'; emit(:escape, :zero_or_one, text, ts-1, te)
289
- when '\*'; emit(:escape, :zero_or_more, text, ts-1, te)
290
- when '\+'; emit(:escape, :one_or_more, text, ts-1, te)
291
- when '\('; emit(:escape, :group_open, text, ts-1, te)
292
- when '\)'; emit(:escape, :group_close, text, ts-1, te)
293
- when '\{'; emit(:escape, :interval_open, text, ts-1, te)
294
- when '\}'; emit(:escape, :interval_close, text, ts-1, te)
295
- when '\['; emit(:escape, :set_open, text, ts-1, te)
296
- when '\]'; emit(:escape, :set_close, text, ts-1, te)
280
+ case text = copy(data, ts-1, te)
281
+ when '\.'; emit(:escape, :dot, text)
282
+ when '\|'; emit(:escape, :alternation, text)
283
+ when '\^'; emit(:escape, :bol, text)
284
+ when '\$'; emit(:escape, :eol, text)
285
+ when '\?'; emit(:escape, :zero_or_one, text)
286
+ when '\*'; emit(:escape, :zero_or_more, text)
287
+ when '\+'; emit(:escape, :one_or_more, text)
288
+ when '\('; emit(:escape, :group_open, text)
289
+ when '\)'; emit(:escape, :group_close, text)
290
+ when '\{'; emit(:escape, :interval_open, text)
291
+ when '\}'; emit(:escape, :interval_close, text)
292
+ when '\['; emit(:escape, :set_open, text)
293
+ when '\]'; emit(:escape, :set_close, text)
297
294
  when "\\\\";
298
- emit(:escape, :backslash, text, ts-1, te)
295
+ emit(:escape, :backslash, text)
299
296
  end
300
297
  fret;
301
298
  };
@@ -303,31 +300,31 @@
303
300
  escaped_ascii > (escaped_alpha, 7) {
304
301
  # \b is emitted as backspace only when inside a character set, otherwise
305
302
  # it is a word boundary anchor. A syntax might "normalize" it if needed.
306
- case text = text(data, ts, te, 1).first
307
- when '\a'; emit(:escape, :bell, text, ts-1, te)
308
- when '\b'; emit(:escape, :backspace, text, ts-1, te)
309
- when '\e'; emit(:escape, :escape, text, ts-1, te)
310
- when '\f'; emit(:escape, :form_feed, text, ts-1, te)
311
- when '\n'; emit(:escape, :newline, text, ts-1, te)
312
- when '\r'; emit(:escape, :carriage, text, ts-1, te)
313
- when '\t'; emit(:escape, :tab, text, ts-1, te)
314
- when '\v'; emit(:escape, :vertical_tab, text, ts-1, te)
303
+ case text = copy(data, ts-1, te)
304
+ when '\a'; emit(:escape, :bell, text)
305
+ when '\b'; emit(:escape, :backspace, text)
306
+ when '\e'; emit(:escape, :escape, text)
307
+ when '\f'; emit(:escape, :form_feed, text)
308
+ when '\n'; emit(:escape, :newline, text)
309
+ when '\r'; emit(:escape, :carriage, text)
310
+ when '\t'; emit(:escape, :tab, text)
311
+ when '\v'; emit(:escape, :vertical_tab, text)
315
312
  end
316
313
  fret;
317
314
  };
318
315
 
319
316
  codepoint_sequence > (escaped_alpha, 6) $eof(premature_end_error) {
320
- text = text(data, ts, te, 1).first
317
+ text = copy(data, ts-1, te)
321
318
  if text[2].chr == '{'
322
- emit(:escape, :codepoint_list, text, ts-1, te)
319
+ emit(:escape, :codepoint_list, text)
323
320
  else
324
- emit(:escape, :codepoint, text, ts-1, te)
321
+ emit(:escape, :codepoint, text)
325
322
  end
326
323
  fret;
327
324
  };
328
325
 
329
- hex_sequence > (escaped_alpha, 5) $eof(premature_end_error) {
330
- emit(:escape, :hex, *text(data, ts, te, 1))
326
+ hex_sequence > (escaped_alpha, 5) @eof(premature_end_error) {
327
+ emit(:escape, :hex, copy(data, ts-1, te))
331
328
  fret;
332
329
  };
333
330
 
@@ -357,8 +354,8 @@
357
354
  fcall unicode_property;
358
355
  };
359
356
 
360
- (any -- non_literal_escape) > (escaped_alpha, 1) {
361
- emit(:escape, :literal, *text(data, ts, te, 1))
357
+ (any -- non_literal_escape) | utf8_multibyte > (escaped_alpha, 1) {
358
+ emit(:escape, :literal, copy(data, ts-1, te))
362
359
  fret;
363
360
  };
364
361
  *|;
@@ -368,9 +365,9 @@
368
365
  # --------------------------------------------------------------------------
369
366
  conditional_expression := |*
370
367
  group_lookup . ')' {
371
- text = text(data, ts, te-1).first
372
- emit(:conditional, :condition, text, ts, te-1)
373
- emit(:conditional, :condition_close, ')', te-1, te)
368
+ text = copy(data, ts, te-1)
369
+ emit(:conditional, :condition, text)
370
+ emit(:conditional, :condition_close, ')')
374
371
  };
375
372
 
376
373
  any {
@@ -387,39 +384,39 @@
387
384
  # Meta characters
388
385
  # ------------------------------------------------------------------------
389
386
  dot {
390
- emit(:meta, :dot, *text(data, ts, te))
387
+ emit(:meta, :dot, copy(data, ts, te))
391
388
  };
392
389
 
393
390
  alternation {
394
391
  if conditional_stack.last == group_depth
395
- emit(:conditional, :separator, *text(data, ts, te))
392
+ emit(:conditional, :separator, copy(data, ts, te))
396
393
  else
397
- emit(:meta, :alternation, *text(data, ts, te))
394
+ emit(:meta, :alternation, copy(data, ts, te))
398
395
  end
399
396
  };
400
397
 
401
398
  # Anchors
402
399
  # ------------------------------------------------------------------------
403
400
  beginning_of_line {
404
- emit(:anchor, :bol, *text(data, ts, te))
401
+ emit(:anchor, :bol, copy(data, ts, te))
405
402
  };
406
403
 
407
404
  end_of_line {
408
- emit(:anchor, :eol, *text(data, ts, te))
405
+ emit(:anchor, :eol, copy(data, ts, te))
409
406
  };
410
407
 
411
408
  backslash . keep_mark > (backslashed, 4) {
412
- emit(:keep, :mark, *text(data, ts, te))
409
+ emit(:keep, :mark, copy(data, ts, te))
413
410
  };
414
411
 
415
412
  backslash . anchor_char > (backslashed, 3) {
416
- case text = text(data, ts, te).first
417
- when '\\A'; emit(:anchor, :bos, text, ts, te)
418
- when '\\z'; emit(:anchor, :eos, text, ts, te)
419
- when '\\Z'; emit(:anchor, :eos_ob_eol, text, ts, te)
420
- when '\\b'; emit(:anchor, :word_boundary, text, ts, te)
421
- when '\\B'; emit(:anchor, :nonword_boundary, text, ts, te)
422
- when '\\G'; emit(:anchor, :match_start, text, ts, te)
413
+ case text = copy(data, ts, te)
414
+ when '\\A'; emit(:anchor, :bos, text)
415
+ when '\\z'; emit(:anchor, :eos, text)
416
+ when '\\Z'; emit(:anchor, :eos_ob_eol, text)
417
+ when '\\b'; emit(:anchor, :word_boundary, text)
418
+ when '\\B'; emit(:anchor, :nonword_boundary, text)
419
+ when '\\G'; emit(:anchor, :match_start, text)
423
420
  end
424
421
  };
425
422
 
@@ -430,7 +427,7 @@
430
427
  # Character sets
431
428
  # ------------------------------------------------------------------------
432
429
  set_open >set_opened {
433
- emit(:set, :open, *text(data, ts, te))
430
+ emit(:set, :open, copy(data, ts, te))
434
431
  fcall character_set;
435
432
  };
436
433
 
@@ -439,12 +436,12 @@
439
436
  # (?(condition)Y|N) conditional expression
440
437
  # ------------------------------------------------------------------------
441
438
  conditional {
442
- text = text(data, ts, te).first
439
+ text = copy(data, ts, te)
443
440
 
444
441
  conditional_stack << group_depth
445
442
 
446
- emit(:conditional, :open, text[0..-2], ts, te-1)
447
- emit(:conditional, :condition_open, '(', te-1, te)
443
+ emit(:conditional, :open, text[0..-2])
444
+ emit(:conditional, :condition_open, '(')
448
445
  fcall conditional_expression;
449
446
  };
450
447
 
@@ -455,7 +452,7 @@
455
452
  # correct closing count.
456
453
  # ------------------------------------------------------------------------
457
454
  group_open . group_comment $group_closed {
458
- emit(:group, :comment, *text(data, ts, te))
455
+ emit(:group, :comment, copy(data, ts, te))
459
456
  };
460
457
 
461
458
  # Expression options:
@@ -470,11 +467,11 @@
470
467
  # (?imxdau-imx:subexp) option on/off for subexp
471
468
  # ------------------------------------------------------------------------
472
469
  group_open . group_options >group_opened {
473
- text = text(data, ts, te).first
470
+ text = copy(data, ts, te)
474
471
  if text[2..-1] =~ /([^\-mixdau:]|^$)|-.*([dau])/
475
472
  raise InvalidGroupOption.new($1 || "-#{$2}", text)
476
473
  end
477
- emit_options(text, ts, te)
474
+ emit_options(text)
478
475
  };
479
476
 
480
477
  # Assertions
@@ -484,11 +481,11 @@
484
481
  # (?<!subexp) negative look-behind
485
482
  # ------------------------------------------------------------------------
486
483
  group_open . assertion_type >group_opened {
487
- case text = text(data, ts, te).first
488
- when '(?='; emit(:assertion, :lookahead, text, ts, te)
489
- when '(?!'; emit(:assertion, :nlookahead, text, ts, te)
490
- when '(?<='; emit(:assertion, :lookbehind, text, ts, te)
491
- when '(?<!'; emit(:assertion, :nlookbehind, text, ts, te)
484
+ case text = copy(data, ts, te)
485
+ when '(?='; emit(:assertion, :lookahead, text)
486
+ when '(?!'; emit(:assertion, :nlookahead, text)
487
+ when '(?<='; emit(:assertion, :lookbehind, text)
488
+ when '(?<!'; emit(:assertion, :nlookbehind, text)
492
489
  end
493
490
  };
494
491
 
@@ -501,32 +498,32 @@
501
498
  # (subexp) captured group
502
499
  # ------------------------------------------------------------------------
503
500
  group_open . group_type >group_opened {
504
- case text = text(data, ts, te).first
505
- when '(?:'; emit(:group, :passive, text, ts, te)
506
- when '(?>'; emit(:group, :atomic, text, ts, te)
507
- when '(?~'; emit(:group, :absence, text, ts, te)
501
+ case text = copy(data, ts, te)
502
+ when '(?:'; emit(:group, :passive, text)
503
+ when '(?>'; emit(:group, :atomic, text)
504
+ when '(?~'; emit(:group, :absence, text)
508
505
 
509
506
  when /^\(\?(?:<>|'')/
510
507
  validation_error(:group, 'named group', 'name is empty')
511
508
 
512
- when /^\(\?<\w*>/
513
- emit(:group, :named_ab, text, ts, te)
509
+ when /^\(\?<[^>]+>/
510
+ emit(:group, :named_ab, text)
514
511
 
515
- when /^\(\?'\w*'/
516
- emit(:group, :named_sq, text, ts, te)
512
+ when /^\(\?'[^']+'/
513
+ emit(:group, :named_sq, text)
517
514
 
518
515
  end
519
516
  };
520
517
 
521
518
  group_open @group_opened {
522
- text = text(data, ts, te).first
523
- emit(:group, :capture, text, ts, te)
519
+ text = copy(data, ts, te)
520
+ emit(:group, :capture, text)
524
521
  };
525
522
 
526
523
  group_close @group_closed {
527
524
  if conditional_stack.last == group_depth + 1
528
525
  conditional_stack.pop
529
- emit(:conditional, :close, *text(data, ts, te))
526
+ emit(:conditional, :close, copy(data, ts, te))
530
527
  else
531
528
  if spacing_stack.length > 1 &&
532
529
  spacing_stack.last[:depth] == group_depth + 1
@@ -534,7 +531,7 @@
534
531
  self.free_spacing = spacing_stack.last[:free_spacing]
535
532
  end
536
533
 
537
- emit(:group, :close, *text(data, ts, te))
534
+ emit(:group, :close, copy(data, ts, te))
538
535
  end
539
536
  };
540
537
 
@@ -542,63 +539,65 @@
542
539
  # Group backreference, named and numbered
543
540
  # ------------------------------------------------------------------------
544
541
  backslash . (group_name_ref | group_number_ref) > (backslashed, 4) {
545
- case text = text(data, ts, te).first
542
+ case text = copy(data, ts, te)
546
543
  when /^\\([gk])(<>|'')/ # angle brackets
547
544
  validation_error(:backref, 'ref/call', 'ref ID is empty')
548
545
 
549
- when /^\\([gk])<[^\d+-]\w*>/ # angle-brackets
546
+ # TODO: finer quirks of choosing recursive or non-recursive refs/calls.
547
+ # e.g.: `a-1` is a valid group id: 'aa'[/(?<a-1>a)\g<a-1>/] # => 'aa'
548
+ when /^\\([gk])<[^\p{digit}+\->][^>+\-]*>/ # angle-brackets
550
549
  if $1 == 'k'
551
- emit(:backref, :name_ref_ab, text, ts, te)
550
+ emit(:backref, :name_ref_ab, text)
552
551
  else
553
- emit(:backref, :name_call_ab, text, ts, te)
552
+ emit(:backref, :name_call_ab, text)
554
553
  end
555
554
 
556
- when /^\\([gk])'[^\d+-]\w*'/ #single quotes
555
+ when /^\\([gk])'[^\p{digit}+\-'][^'+\-]*'/ # single quotes
557
556
  if $1 == 'k'
558
- emit(:backref, :name_ref_sq, text, ts, te)
557
+ emit(:backref, :name_ref_sq, text)
559
558
  else
560
- emit(:backref, :name_call_sq, text, ts, te)
559
+ emit(:backref, :name_call_sq, text)
561
560
  end
562
561
 
563
562
  when /^\\([gk])<\d+>/ # angle-brackets
564
563
  if $1 == 'k'
565
- emit(:backref, :number_ref_ab, text, ts, te)
564
+ emit(:backref, :number_ref_ab, text)
566
565
  else
567
- emit(:backref, :number_call_ab, text, ts, te)
566
+ emit(:backref, :number_call_ab, text)
568
567
  end
569
568
 
570
569
  when /^\\([gk])'\d+'/ # single quotes
571
570
  if $1 == 'k'
572
- emit(:backref, :number_ref_sq, text, ts, te)
571
+ emit(:backref, :number_ref_sq, text)
573
572
  else
574
- emit(:backref, :number_call_sq, text, ts, te)
573
+ emit(:backref, :number_call_sq, text)
575
574
  end
576
575
 
577
576
  when /^\\(?:g<\+|g<-|(k)<-)\d+>/ # angle-brackets
578
577
  if $1 == 'k'
579
- emit(:backref, :number_rel_ref_ab, text, ts, te)
578
+ emit(:backref, :number_rel_ref_ab, text)
580
579
  else
581
- emit(:backref, :number_rel_call_ab, text, ts, te)
580
+ emit(:backref, :number_rel_call_ab, text)
582
581
  end
583
582
 
584
583
  when /^\\(?:g'\+|g'-|(k)'-)\d+'/ # single quotes
585
584
  if $1 == 'k'
586
- emit(:backref, :number_rel_ref_sq, text, ts, te)
585
+ emit(:backref, :number_rel_ref_sq, text)
587
586
  else
588
- emit(:backref, :number_rel_call_sq, text, ts, te)
587
+ emit(:backref, :number_rel_call_sq, text)
589
588
  end
590
589
 
591
- when /^\\k<[^\d+\-]\w*[+\-]\d+>/ # angle-brackets
592
- emit(:backref, :name_recursion_ref_ab, text, ts, te)
590
+ when /^\\k<[^\p{digit}+\->][^>]*[+\-]\d+>/ # angle-brackets
591
+ emit(:backref, :name_recursion_ref_ab, text)
593
592
 
594
- when /^\\k'[^\d+\-]\w*[+\-]\d+'/ # single-quotes
595
- emit(:backref, :name_recursion_ref_sq, text, ts, te)
593
+ when /^\\k'[^\p{digit}+\-'][^']*[+\-]\d+'/ # single-quotes
594
+ emit(:backref, :name_recursion_ref_sq, text)
596
595
 
597
596
  when /^\\([gk])<[+\-]?\d+[+\-]\d+>/ # angle-brackets
598
- emit(:backref, :number_recursion_ref_ab, text, ts, te)
597
+ emit(:backref, :number_recursion_ref_ab, text)
599
598
 
600
599
  when /^\\([gk])'[+\-]?\d+[+\-]\d+'/ # single-quotes
601
- emit(:backref, :number_recursion_ref_sq, text, ts, te)
600
+ emit(:backref, :number_recursion_ref_sq, text)
602
601
 
603
602
  end
604
603
  };
@@ -607,31 +606,31 @@
607
606
  # Quantifiers
608
607
  # ------------------------------------------------------------------------
609
608
  zero_or_one {
610
- case text = text(data, ts, te).first
611
- when '?' ; emit(:quantifier, :zero_or_one, text, ts, te)
612
- when '??'; emit(:quantifier, :zero_or_one_reluctant, text, ts, te)
613
- when '?+'; emit(:quantifier, :zero_or_one_possessive, text, ts, te)
609
+ case text = copy(data, ts, te)
610
+ when '?' ; emit(:quantifier, :zero_or_one, text)
611
+ when '??'; emit(:quantifier, :zero_or_one_reluctant, text)
612
+ when '?+'; emit(:quantifier, :zero_or_one_possessive, text)
614
613
  end
615
614
  };
616
615
 
617
616
  zero_or_more {
618
- case text = text(data, ts, te).first
619
- when '*' ; emit(:quantifier, :zero_or_more, text, ts, te)
620
- when '*?'; emit(:quantifier, :zero_or_more_reluctant, text, ts, te)
621
- when '*+'; emit(:quantifier, :zero_or_more_possessive, text, ts, te)
617
+ case text = copy(data, ts, te)
618
+ when '*' ; emit(:quantifier, :zero_or_more, text)
619
+ when '*?'; emit(:quantifier, :zero_or_more_reluctant, text)
620
+ when '*+'; emit(:quantifier, :zero_or_more_possessive, text)
622
621
  end
623
622
  };
624
623
 
625
624
  one_or_more {
626
- case text = text(data, ts, te).first
627
- when '+' ; emit(:quantifier, :one_or_more, text, ts, te)
628
- when '+?'; emit(:quantifier, :one_or_more_reluctant, text, ts, te)
629
- when '++'; emit(:quantifier, :one_or_more_possessive, text, ts, te)
625
+ case text = copy(data, ts, te)
626
+ when '+' ; emit(:quantifier, :one_or_more, text)
627
+ when '+?'; emit(:quantifier, :one_or_more_reluctant, text)
628
+ when '++'; emit(:quantifier, :one_or_more_possessive, text)
630
629
  end
631
630
  };
632
631
 
633
632
  quantifier_interval {
634
- emit(:quantifier, :interval, *text(data, ts, te))
633
+ emit(:quantifier, :interval, copy(data, ts, te))
635
634
  };
636
635
 
637
636
  # Catch unmatched curly braces as literals
@@ -647,7 +646,7 @@
647
646
 
648
647
  comment {
649
648
  if free_spacing
650
- emit(:free_space, :comment, *text(data, ts, te))
649
+ emit(:free_space, :comment, copy(data, ts, te))
651
650
  else
652
651
  # consume only the pound sign (#) and backtrack to do regular scanning
653
652
  append_literal(data, ts, ts + 1)
@@ -657,7 +656,7 @@
657
656
 
658
657
  space+ {
659
658
  if free_spacing
660
- emit(:free_space, :whitespace, *text(data, ts, te))
659
+ emit(:free_space, :whitespace, copy(data, ts, te))
661
660
  else
662
661
  append_literal(data, ts, te)
663
662
  end
@@ -666,11 +665,7 @@
666
665
  # Literal: any run of ASCII (printable or non-printable), and/or UTF-8,
667
666
  # except meta characters.
668
667
  # ------------------------------------------------------------------------
669
- (ascii_print -- space)+ |
670
- ascii_nonprint+ |
671
- utf8_2_byte+ |
672
- utf8_3_byte+ |
673
- utf8_4_byte+ {
668
+ (ascii_print -- space)+ | ascii_nonprint+ | utf8_multibyte+ {
674
669
  append_literal(data, ts, te)
675
670
  };
676
671
 
@@ -760,6 +755,7 @@ class Regexp::Scanner
760
755
  self.set_depth = 0
761
756
  self.group_depth = 0
762
757
  self.conditional_stack = []
758
+ self.char_pos = 0
763
759
 
764
760
  %% write data;
765
761
  %% write init;
@@ -769,7 +765,7 @@ class Regexp::Scanner
769
765
  testEof = testEof
770
766
 
771
767
  if cs == re_scanner_error
772
- text = ts ? copy(data, ts-1..-1) : data.pack('c*')
768
+ text = copy(data, ts ? ts-1 : 0, -1)
773
769
  raise ScannerError.new("Scan error at '#{text}'")
774
770
  end
775
771
 
@@ -797,22 +793,29 @@ class Regexp::Scanner
797
793
  end
798
794
 
799
795
  # Emits an array with the details of the scanned pattern
800
- def emit(type, token, text, ts, te)
796
+ def emit(type, token, text)
801
797
  #puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
802
798
 
803
799
  emit_literal if literal
804
800
 
801
+ # Ragel runs with byte-based indices (ts, te). These are of little value to
802
+ # end-users, so we keep track of char-based indices and emit those instead.
803
+ ts_char_pos = char_pos
804
+ te_char_pos = char_pos + text.length
805
+
805
806
  if block
806
- block.call type, token, text, ts, te
807
+ block.call type, token, text, ts_char_pos, te_char_pos
807
808
  end
808
809
 
809
- tokens << [type, token, text, ts, te]
810
+ tokens << [type, token, text, ts_char_pos, te_char_pos]
811
+
812
+ self.char_pos = te_char_pos
810
813
  end
811
814
 
812
815
  private
813
816
 
814
817
  attr_accessor :tokens, :literal, :block, :free_spacing, :spacing_stack,
815
- :group_depth, :set_depth, :conditional_stack
818
+ :group_depth, :set_depth, :conditional_stack, :char_pos
816
819
 
817
820
  def free_spacing?(input_object, options)
818
821
  if options && !input_object.is_a?(String)
@@ -835,36 +838,25 @@ class Regexp::Scanner
835
838
  end
836
839
 
837
840
  # Copy from ts to te from data as text
838
- def copy(data, range)
839
- data[range].pack('c*')
840
- end
841
-
842
- # Copy from ts to te from data as text, returning an array with the text
843
- # and the offsets used to copy it.
844
- def text(data, ts, te, soff = 0)
845
- [copy(data, ts-soff..te-1), ts-soff, te]
841
+ def copy(data, ts, te)
842
+ data[ts...te].pack('c*').force_encoding('utf-8')
846
843
  end
847
844
 
848
845
  # Appends one or more characters to the literal buffer, to be emitted later
849
- # by a call to emit_literal. Contents can be a mix of ASCII and UTF-8.
846
+ # by a call to emit_literal.
850
847
  def append_literal(data, ts, te)
851
848
  self.literal = literal || []
852
- literal << text(data, ts, te)
849
+ literal << copy(data, ts, te)
853
850
  end
854
851
 
855
- # Emits the literal run collected by calls to the append_literal method,
856
- # using the total start (ts) and end (te) offsets of the run.
852
+ # Emits the literal run collected by calls to the append_literal method.
857
853
  def emit_literal
858
- ts, te = literal.first[1], literal.last[2]
859
- text = literal.map {|t| t[0]}.join
860
-
861
- text.force_encoding('utf-8') if text.respond_to?(:force_encoding)
862
-
854
+ text = literal.join
863
855
  self.literal = nil
864
- emit(:literal, :literal, text, ts, te)
856
+ emit(:literal, :literal, text)
865
857
  end
866
858
 
867
- def emit_options(text, ts, te)
859
+ def emit_options(text)
868
860
  token = nil
869
861
 
870
862
  # Ruby allows things like '(?-xxxx)' or '(?xx-xx--xx-:abc)'.
@@ -890,14 +882,14 @@ class Regexp::Scanner
890
882
  token = :options_switch
891
883
  end
892
884
 
893
- emit(:group, token, text, ts, te)
885
+ emit(:group, token, text)
894
886
  end
895
887
 
896
888
  def emit_meta_control_sequence(data, ts, te, token)
897
889
  if data.last < 0x00 || data.last > 0x7F
898
890
  validation_error(:sequence, 'escape', token.to_s)
899
891
  end
900
- emit(:escape, token, *text(data, ts, te, 1))
892
+ emit(:escape, token, copy(data, ts-1, te))
901
893
  end
902
894
 
903
895
  # Centralizes and unifies the handling of validation related