regexp_parser 1.8.1 → 2.0.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (41) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +70 -0
  3. data/Gemfile +1 -0
  4. data/README.md +12 -11
  5. data/Rakefile +2 -2
  6. data/lib/regexp_parser/expression.rb +10 -19
  7. data/lib/regexp_parser/expression/classes/free_space.rb +1 -1
  8. data/lib/regexp_parser/expression/classes/group.rb +22 -2
  9. data/lib/regexp_parser/expression/classes/root.rb +4 -16
  10. data/lib/regexp_parser/expression/methods/match_length.rb +2 -2
  11. data/lib/regexp_parser/expression/methods/traverse.rb +2 -2
  12. data/lib/regexp_parser/expression/quantifier.rb +9 -0
  13. data/lib/regexp_parser/expression/sequence.rb +0 -10
  14. data/lib/regexp_parser/lexer.rb +2 -2
  15. data/lib/regexp_parser/parser.rb +27 -2
  16. data/lib/regexp_parser/scanner.rb +1194 -1272
  17. data/lib/regexp_parser/scanner/char_type.rl +11 -11
  18. data/lib/regexp_parser/scanner/property.rl +2 -2
  19. data/lib/regexp_parser/scanner/scanner.rl +178 -186
  20. data/lib/regexp_parser/syntax.rb +4 -4
  21. data/lib/regexp_parser/syntax/any.rb +2 -2
  22. data/lib/regexp_parser/syntax/base.rb +1 -1
  23. data/lib/regexp_parser/syntax/version_lookup.rb +4 -4
  24. data/lib/regexp_parser/version.rb +1 -1
  25. data/spec/expression/base_spec.rb +10 -0
  26. data/spec/expression/subexpression_spec.rb +1 -1
  27. data/spec/expression/to_s_spec.rb +39 -31
  28. data/spec/lexer/literals_spec.rb +24 -49
  29. data/spec/parser/errors_spec.rb +1 -1
  30. data/spec/parser/escapes_spec.rb +1 -1
  31. data/spec/parser/quantifiers_spec.rb +16 -0
  32. data/spec/parser/set/ranges_spec.rb +3 -3
  33. data/spec/scanner/escapes_spec.rb +7 -0
  34. data/spec/scanner/groups_spec.rb +10 -1
  35. data/spec/scanner/literals_spec.rb +28 -38
  36. data/spec/scanner/quantifiers_spec.rb +18 -13
  37. data/spec/scanner/sets_spec.rb +23 -5
  38. data/spec/spec_helper.rb +1 -0
  39. metadata +56 -60
  40. data/spec/expression/root_spec.rb +0 -9
  41. data/spec/expression/sequence_spec.rb +0 -9
@@ -10,17 +10,17 @@
10
10
  # --------------------------------------------------------------------------
11
11
  char_type := |*
12
12
  char_type_char {
13
- case text = text(data, ts, te, 1).first
14
- when '\d'; emit(:type, :digit, text, ts - 1, te)
15
- when '\D'; emit(:type, :nondigit, text, ts - 1, te)
16
- when '\h'; emit(:type, :hex, text, ts - 1, te)
17
- when '\H'; emit(:type, :nonhex, text, ts - 1, te)
18
- when '\s'; emit(:type, :space, text, ts - 1, te)
19
- when '\S'; emit(:type, :nonspace, text, ts - 1, te)
20
- when '\w'; emit(:type, :word, text, ts - 1, te)
21
- when '\W'; emit(:type, :nonword, text, ts - 1, te)
22
- when '\R'; emit(:type, :linebreak, text, ts - 1, te)
23
- when '\X'; emit(:type, :xgrapheme, text, ts - 1, te)
13
+ case text = copy(data, ts-1, te)
14
+ when '\d'; emit(:type, :digit, text)
15
+ when '\D'; emit(:type, :nondigit, text)
16
+ when '\h'; emit(:type, :hex, text)
17
+ when '\H'; emit(:type, :nonhex, text)
18
+ when '\s'; emit(:type, :space, text)
19
+ when '\S'; emit(:type, :nonspace, text)
20
+ when '\w'; emit(:type, :word, text)
21
+ when '\W'; emit(:type, :nonword, text)
22
+ when '\R'; emit(:type, :linebreak, text)
23
+ when '\X'; emit(:type, :xgrapheme, text)
24
24
  end
25
25
  fret;
26
26
  };
@@ -14,7 +14,7 @@
14
14
  unicode_property := |*
15
15
 
16
16
  property_sequence < eof(premature_property_end) {
17
- text = text(data, ts, te, 1).first
17
+ text = copy(data, ts-1, te)
18
18
  type = (text[1] == 'P') ^ (text[3] == '^') ? :nonproperty : :property
19
19
 
20
20
  name = data[ts+2..te-2].pack('c*').gsub(/[\^\s_\-]/, '').downcase
@@ -22,7 +22,7 @@
22
22
  token = self.class.short_prop_map[name] || self.class.long_prop_map[name]
23
23
  raise UnknownUnicodePropertyError.new(name) unless token
24
24
 
25
- self.emit(type, token.to_sym, text, ts-1, te)
25
+ self.emit(type, token.to_sym, text)
26
26
 
27
27
  fret;
28
28
  };
@@ -3,6 +3,11 @@
3
3
  include re_char_type "char_type.rl";
4
4
  include re_property "property.rl";
5
5
 
6
+ utf8_2_byte = (0xc2..0xdf 0x80..0xbf);
7
+ utf8_3_byte = (0xe0..0xef 0x80..0xbf 0x80..0xbf);
8
+ utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
9
+ utf8_multibyte = utf8_2_byte | utf8_3_byte | utf8_4_byte;
10
+
6
11
  dot = '.';
7
12
  backslash = '\\';
8
13
  alternation = '|';
@@ -32,7 +37,7 @@
32
37
  class_posix = ('[:' . '^'? . class_name_posix . ':]');
33
38
 
34
39
 
35
- # these are not supported in ruby, and need verification
40
+ # these are not supported in ruby at the moment
36
41
  collating_sequence = '[.' . (alpha | [\-])+ . '.]';
37
42
  character_equivalent = '[=' . alpha . '=]';
38
43
 
@@ -90,18 +95,19 @@
90
95
  group_options = '?' . ( [^!#'():<=>~]+ . ':'? ) ?;
91
96
 
92
97
  group_ref = [gk];
93
- group_name_char = (alnum | '_');
94
- group_name_id = (group_name_char . (group_name_char+)?)?;
95
- group_number = '-'? . [1-9] . ([0-9]+)?;
98
+ group_name_id_ab = ([^0-9\->] | utf8_multibyte) . ([^>] | utf8_multibyte)*;
99
+ group_name_id_sq = ([^0-9\-'] | utf8_multibyte) . ([^'] | utf8_multibyte)*;
100
+ group_number = '-'? . [1-9] . [0-9]*;
96
101
  group_level = [+\-] . [0-9]+;
97
102
 
98
- group_name = ('<' . group_name_id . '>') | ("'" . group_name_id . "'");
103
+ group_name = ('<' . group_name_id_ab? . '>') |
104
+ ("'" . group_name_id_sq? . "'");
99
105
  group_lookup = group_name | group_number;
100
106
 
101
107
  group_named = ('?' . group_name );
102
108
 
103
- group_name_ref = group_ref . (('<' . group_name_id . group_level? '>') |
104
- ("'" . group_name_id . group_level? "'"));
109
+ group_name_ref = group_ref . (('<' . group_name_id_ab? . group_level? '>') |
110
+ ("'" . group_name_id_sq? . group_level? "'"));
105
111
 
106
112
  group_number_ref = group_ref . (('<' . group_number . group_level? '>') |
107
113
  ("'" . group_number . group_level? "'"));
@@ -123,10 +129,6 @@
123
129
  ascii_print = ((0x20..0x7e) - meta_char - '#');
124
130
  ascii_nonprint = (0x01..0x1f | 0x7f);
125
131
 
126
- utf8_2_byte = (0xc2..0xdf 0x80..0xbf);
127
- utf8_3_byte = (0xe0..0xef 0x80..0xbf 0x80..0xbf);
128
- utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
129
-
130
132
  non_literal_escape = char_type_char | anchor_char | escaped_ascii |
131
133
  keep_mark | [xucCM];
132
134
 
@@ -135,13 +137,13 @@
135
137
 
136
138
  # EOF error, used where it can be detected
137
139
  action premature_end_error {
138
- text = ts ? copy(data, ts-1..-1) : data.pack('c*')
140
+ text = copy(data, ts ? ts-1 : 0, -1)
139
141
  raise PrematureEndError.new( text )
140
142
  }
141
143
 
142
144
  # Invalid sequence error, used from sequences, like escapes and sets
143
145
  action invalid_sequence_error {
144
- text = ts ? copy(data, ts-1..-1) : data.pack('c*')
146
+ text = copy(data, ts ? ts-1 : 0, -1)
145
147
  validation_error(:sequence, 'sequence', text)
146
148
  }
147
149
 
@@ -156,7 +158,7 @@
156
158
  # --------------------------------------------------------------------------
157
159
  character_set := |*
158
160
  set_close > (set_meta, 2) @set_closed {
159
- emit(:set, :close, *text(data, ts, te))
161
+ emit(:set, :close, copy(data, ts, te))
160
162
  if in_set?
161
163
  fret;
162
164
  else
@@ -165,8 +167,8 @@
165
167
  };
166
168
 
167
169
  '-]' @set_closed { # special case, emits two tokens
168
- emit(:literal, :literal, copy(data, ts..te-2), ts, te - 1)
169
- emit(:set, :close, copy(data, ts+1..te-1), ts + 1, te)
170
+ emit(:literal, :literal, copy(data, ts, te-1))
171
+ emit(:set, :close, copy(data, ts+1, te))
170
172
  if in_set?
171
173
  fret;
172
174
  else
@@ -175,33 +177,33 @@
175
177
  };
176
178
 
177
179
  '-&&' { # special case, emits two tokens
178
- emit(:literal, :literal, '-', ts, te)
179
- emit(:set, :intersection, '&&', ts, te)
180
+ emit(:literal, :literal, '-')
181
+ emit(:set, :intersection, '&&')
180
182
  };
181
183
 
182
184
  '^' {
183
- text = text(data, ts, te).first
185
+ text = copy(data, ts, te)
184
186
  if tokens.last[1] == :open
185
- emit(:set, :negate, text, ts, te)
187
+ emit(:set, :negate, text)
186
188
  else
187
- emit(:literal, :literal, text, ts, te)
189
+ emit(:literal, :literal, text)
188
190
  end
189
191
  };
190
192
 
191
193
  '-' {
192
- text = text(data, ts, te).first
194
+ text = copy(data, ts, te)
193
195
  # ranges can't start with a subset or intersection/negation/range operator
194
196
  if tokens.last[0] == :set
195
- emit(:literal, :literal, text, ts, te)
197
+ emit(:literal, :literal, text)
196
198
  else
197
- emit(:set, :range, text, ts, te)
199
+ emit(:set, :range, text)
198
200
  end
199
201
  };
200
202
 
201
203
  # Unlike ranges, intersections can start or end at set boundaries, whereupon
202
204
  # they match nothing: r = /[a&&]/; [r =~ ?a, r =~ ?&] # => [nil, nil]
203
205
  '&&' {
204
- emit(:set, :intersection, *text(data, ts, te))
206
+ emit(:set, :intersection, copy(data, ts, te))
205
207
  };
206
208
 
207
209
  backslash {
@@ -209,12 +211,12 @@
209
211
  };
210
212
 
211
213
  set_open >(open_bracket, 1) >set_opened {
212
- emit(:set, :open, *text(data, ts, te))
214
+ emit(:set, :open, copy(data, ts, te))
213
215
  fcall character_set;
214
216
  };
215
217
 
216
218
  class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error) {
217
- text = text(data, ts, te).first
219
+ text = copy(data, ts, te)
218
220
 
219
221
  type = :posixclass
220
222
  class_name = text[2..-3]
@@ -223,29 +225,24 @@
223
225
  type = :nonposixclass
224
226
  end
225
227
 
226
- emit(type, class_name.to_sym, text, ts, te)
228
+ emit(type, class_name.to_sym, text)
227
229
  };
228
230
 
229
- collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
230
- emit(:set, :collation, *text(data, ts, te))
231
- };
232
-
233
- character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
234
- emit(:set, :equivalent, *text(data, ts, te))
235
- };
231
+ # These are not supported in ruby at the moment. Enable them if they are.
232
+ # collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
233
+ # emit(:set, :collation, copy(data, ts, te))
234
+ # };
235
+ # character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
236
+ # emit(:set, :equivalent, copy(data, ts, te))
237
+ # };
236
238
 
237
239
  meta_char > (set_meta, 1) {
238
- emit(:literal, :literal, *text(data, ts, te))
240
+ emit(:literal, :literal, copy(data, ts, te))
239
241
  };
240
242
 
241
- any |
242
- ascii_nonprint |
243
- utf8_2_byte |
244
- utf8_3_byte |
245
- utf8_4_byte {
246
- char, *rest = *text(data, ts, te)
247
- char.force_encoding('utf-8') if char.respond_to?(:force_encoding)
248
- emit(:literal, :literal, char, *rest)
243
+ any | ascii_nonprint | utf8_multibyte {
244
+ text = copy(data, ts, te)
245
+ emit(:literal, :literal, text)
249
246
  };
250
247
  *|;
251
248
 
@@ -253,7 +250,7 @@
253
250
  # --------------------------------------------------------------------------
254
251
  set_escape_sequence := |*
255
252
  non_set_escape > (escaped_set_alpha, 2) {
256
- emit(:escape, :literal, *text(data, ts, te, 1))
253
+ emit(:escape, :literal, copy(data, ts-1, te))
257
254
  fret;
258
255
  };
259
256
 
@@ -269,33 +266,33 @@
269
266
  # --------------------------------------------------------------------------
270
267
  escape_sequence := |*
271
268
  [1-9] {
272
- text = text(data, ts, te, 1).first
273
- emit(:backref, :number, text, ts-1, te)
269
+ text = copy(data, ts-1, te)
270
+ emit(:backref, :number, text)
274
271
  fret;
275
272
  };
276
273
 
277
274
  octal_sequence {
278
- emit(:escape, :octal, *text(data, ts, te, 1))
275
+ emit(:escape, :octal, copy(data, ts-1, te))
279
276
  fret;
280
277
  };
281
278
 
282
279
  meta_char {
283
- case text = text(data, ts, te, 1).first
284
- when '\.'; emit(:escape, :dot, text, ts-1, te)
285
- when '\|'; emit(:escape, :alternation, text, ts-1, te)
286
- when '\^'; emit(:escape, :bol, text, ts-1, te)
287
- when '\$'; emit(:escape, :eol, text, ts-1, te)
288
- when '\?'; emit(:escape, :zero_or_one, text, ts-1, te)
289
- when '\*'; emit(:escape, :zero_or_more, text, ts-1, te)
290
- when '\+'; emit(:escape, :one_or_more, text, ts-1, te)
291
- when '\('; emit(:escape, :group_open, text, ts-1, te)
292
- when '\)'; emit(:escape, :group_close, text, ts-1, te)
293
- when '\{'; emit(:escape, :interval_open, text, ts-1, te)
294
- when '\}'; emit(:escape, :interval_close, text, ts-1, te)
295
- when '\['; emit(:escape, :set_open, text, ts-1, te)
296
- when '\]'; emit(:escape, :set_close, text, ts-1, te)
280
+ case text = copy(data, ts-1, te)
281
+ when '\.'; emit(:escape, :dot, text)
282
+ when '\|'; emit(:escape, :alternation, text)
283
+ when '\^'; emit(:escape, :bol, text)
284
+ when '\$'; emit(:escape, :eol, text)
285
+ when '\?'; emit(:escape, :zero_or_one, text)
286
+ when '\*'; emit(:escape, :zero_or_more, text)
287
+ when '\+'; emit(:escape, :one_or_more, text)
288
+ when '\('; emit(:escape, :group_open, text)
289
+ when '\)'; emit(:escape, :group_close, text)
290
+ when '\{'; emit(:escape, :interval_open, text)
291
+ when '\}'; emit(:escape, :interval_close, text)
292
+ when '\['; emit(:escape, :set_open, text)
293
+ when '\]'; emit(:escape, :set_close, text)
297
294
  when "\\\\";
298
- emit(:escape, :backslash, text, ts-1, te)
295
+ emit(:escape, :backslash, text)
299
296
  end
300
297
  fret;
301
298
  };
@@ -303,31 +300,31 @@
303
300
  escaped_ascii > (escaped_alpha, 7) {
304
301
  # \b is emitted as backspace only when inside a character set, otherwise
305
302
  # it is a word boundary anchor. A syntax might "normalize" it if needed.
306
- case text = text(data, ts, te, 1).first
307
- when '\a'; emit(:escape, :bell, text, ts-1, te)
308
- when '\b'; emit(:escape, :backspace, text, ts-1, te)
309
- when '\e'; emit(:escape, :escape, text, ts-1, te)
310
- when '\f'; emit(:escape, :form_feed, text, ts-1, te)
311
- when '\n'; emit(:escape, :newline, text, ts-1, te)
312
- when '\r'; emit(:escape, :carriage, text, ts-1, te)
313
- when '\t'; emit(:escape, :tab, text, ts-1, te)
314
- when '\v'; emit(:escape, :vertical_tab, text, ts-1, te)
303
+ case text = copy(data, ts-1, te)
304
+ when '\a'; emit(:escape, :bell, text)
305
+ when '\b'; emit(:escape, :backspace, text)
306
+ when '\e'; emit(:escape, :escape, text)
307
+ when '\f'; emit(:escape, :form_feed, text)
308
+ when '\n'; emit(:escape, :newline, text)
309
+ when '\r'; emit(:escape, :carriage, text)
310
+ when '\t'; emit(:escape, :tab, text)
311
+ when '\v'; emit(:escape, :vertical_tab, text)
315
312
  end
316
313
  fret;
317
314
  };
318
315
 
319
316
  codepoint_sequence > (escaped_alpha, 6) $eof(premature_end_error) {
320
- text = text(data, ts, te, 1).first
317
+ text = copy(data, ts-1, te)
321
318
  if text[2].chr == '{'
322
- emit(:escape, :codepoint_list, text, ts-1, te)
319
+ emit(:escape, :codepoint_list, text)
323
320
  else
324
- emit(:escape, :codepoint, text, ts-1, te)
321
+ emit(:escape, :codepoint, text)
325
322
  end
326
323
  fret;
327
324
  };
328
325
 
329
- hex_sequence > (escaped_alpha, 5) $eof(premature_end_error) {
330
- emit(:escape, :hex, *text(data, ts, te, 1))
326
+ hex_sequence > (escaped_alpha, 5) @eof(premature_end_error) {
327
+ emit(:escape, :hex, copy(data, ts-1, te))
331
328
  fret;
332
329
  };
333
330
 
@@ -357,8 +354,8 @@
357
354
  fcall unicode_property;
358
355
  };
359
356
 
360
- (any -- non_literal_escape) > (escaped_alpha, 1) {
361
- emit(:escape, :literal, *text(data, ts, te, 1))
357
+ (any -- non_literal_escape) | utf8_multibyte > (escaped_alpha, 1) {
358
+ emit(:escape, :literal, copy(data, ts-1, te))
362
359
  fret;
363
360
  };
364
361
  *|;
@@ -368,9 +365,9 @@
368
365
  # --------------------------------------------------------------------------
369
366
  conditional_expression := |*
370
367
  group_lookup . ')' {
371
- text = text(data, ts, te-1).first
372
- emit(:conditional, :condition, text, ts, te-1)
373
- emit(:conditional, :condition_close, ')', te-1, te)
368
+ text = copy(data, ts, te-1)
369
+ emit(:conditional, :condition, text)
370
+ emit(:conditional, :condition_close, ')')
374
371
  };
375
372
 
376
373
  any {
@@ -387,39 +384,39 @@
387
384
  # Meta characters
388
385
  # ------------------------------------------------------------------------
389
386
  dot {
390
- emit(:meta, :dot, *text(data, ts, te))
387
+ emit(:meta, :dot, copy(data, ts, te))
391
388
  };
392
389
 
393
390
  alternation {
394
391
  if conditional_stack.last == group_depth
395
- emit(:conditional, :separator, *text(data, ts, te))
392
+ emit(:conditional, :separator, copy(data, ts, te))
396
393
  else
397
- emit(:meta, :alternation, *text(data, ts, te))
394
+ emit(:meta, :alternation, copy(data, ts, te))
398
395
  end
399
396
  };
400
397
 
401
398
  # Anchors
402
399
  # ------------------------------------------------------------------------
403
400
  beginning_of_line {
404
- emit(:anchor, :bol, *text(data, ts, te))
401
+ emit(:anchor, :bol, copy(data, ts, te))
405
402
  };
406
403
 
407
404
  end_of_line {
408
- emit(:anchor, :eol, *text(data, ts, te))
405
+ emit(:anchor, :eol, copy(data, ts, te))
409
406
  };
410
407
 
411
408
  backslash . keep_mark > (backslashed, 4) {
412
- emit(:keep, :mark, *text(data, ts, te))
409
+ emit(:keep, :mark, copy(data, ts, te))
413
410
  };
414
411
 
415
412
  backslash . anchor_char > (backslashed, 3) {
416
- case text = text(data, ts, te).first
417
- when '\\A'; emit(:anchor, :bos, text, ts, te)
418
- when '\\z'; emit(:anchor, :eos, text, ts, te)
419
- when '\\Z'; emit(:anchor, :eos_ob_eol, text, ts, te)
420
- when '\\b'; emit(:anchor, :word_boundary, text, ts, te)
421
- when '\\B'; emit(:anchor, :nonword_boundary, text, ts, te)
422
- when '\\G'; emit(:anchor, :match_start, text, ts, te)
413
+ case text = copy(data, ts, te)
414
+ when '\\A'; emit(:anchor, :bos, text)
415
+ when '\\z'; emit(:anchor, :eos, text)
416
+ when '\\Z'; emit(:anchor, :eos_ob_eol, text)
417
+ when '\\b'; emit(:anchor, :word_boundary, text)
418
+ when '\\B'; emit(:anchor, :nonword_boundary, text)
419
+ when '\\G'; emit(:anchor, :match_start, text)
423
420
  end
424
421
  };
425
422
 
@@ -430,7 +427,7 @@
430
427
  # Character sets
431
428
  # ------------------------------------------------------------------------
432
429
  set_open >set_opened {
433
- emit(:set, :open, *text(data, ts, te))
430
+ emit(:set, :open, copy(data, ts, te))
434
431
  fcall character_set;
435
432
  };
436
433
 
@@ -439,12 +436,12 @@
439
436
  # (?(condition)Y|N) conditional expression
440
437
  # ------------------------------------------------------------------------
441
438
  conditional {
442
- text = text(data, ts, te).first
439
+ text = copy(data, ts, te)
443
440
 
444
441
  conditional_stack << group_depth
445
442
 
446
- emit(:conditional, :open, text[0..-2], ts, te-1)
447
- emit(:conditional, :condition_open, '(', te-1, te)
443
+ emit(:conditional, :open, text[0..-2])
444
+ emit(:conditional, :condition_open, '(')
448
445
  fcall conditional_expression;
449
446
  };
450
447
 
@@ -455,7 +452,7 @@
455
452
  # correct closing count.
456
453
  # ------------------------------------------------------------------------
457
454
  group_open . group_comment $group_closed {
458
- emit(:group, :comment, *text(data, ts, te))
455
+ emit(:group, :comment, copy(data, ts, te))
459
456
  };
460
457
 
461
458
  # Expression options:
@@ -470,11 +467,11 @@
470
467
  # (?imxdau-imx:subexp) option on/off for subexp
471
468
  # ------------------------------------------------------------------------
472
469
  group_open . group_options >group_opened {
473
- text = text(data, ts, te).first
470
+ text = copy(data, ts, te)
474
471
  if text[2..-1] =~ /([^\-mixdau:]|^$)|-.*([dau])/
475
472
  raise InvalidGroupOption.new($1 || "-#{$2}", text)
476
473
  end
477
- emit_options(text, ts, te)
474
+ emit_options(text)
478
475
  };
479
476
 
480
477
  # Assertions
@@ -484,11 +481,11 @@
484
481
  # (?<!subexp) negative look-behind
485
482
  # ------------------------------------------------------------------------
486
483
  group_open . assertion_type >group_opened {
487
- case text = text(data, ts, te).first
488
- when '(?='; emit(:assertion, :lookahead, text, ts, te)
489
- when '(?!'; emit(:assertion, :nlookahead, text, ts, te)
490
- when '(?<='; emit(:assertion, :lookbehind, text, ts, te)
491
- when '(?<!'; emit(:assertion, :nlookbehind, text, ts, te)
484
+ case text = copy(data, ts, te)
485
+ when '(?='; emit(:assertion, :lookahead, text)
486
+ when '(?!'; emit(:assertion, :nlookahead, text)
487
+ when '(?<='; emit(:assertion, :lookbehind, text)
488
+ when '(?<!'; emit(:assertion, :nlookbehind, text)
492
489
  end
493
490
  };
494
491
 
@@ -501,32 +498,32 @@
501
498
  # (subexp) captured group
502
499
  # ------------------------------------------------------------------------
503
500
  group_open . group_type >group_opened {
504
- case text = text(data, ts, te).first
505
- when '(?:'; emit(:group, :passive, text, ts, te)
506
- when '(?>'; emit(:group, :atomic, text, ts, te)
507
- when '(?~'; emit(:group, :absence, text, ts, te)
501
+ case text = copy(data, ts, te)
502
+ when '(?:'; emit(:group, :passive, text)
503
+ when '(?>'; emit(:group, :atomic, text)
504
+ when '(?~'; emit(:group, :absence, text)
508
505
 
509
506
  when /^\(\?(?:<>|'')/
510
507
  validation_error(:group, 'named group', 'name is empty')
511
508
 
512
- when /^\(\?<\w*>/
513
- emit(:group, :named_ab, text, ts, te)
509
+ when /^\(\?<[^>]+>/
510
+ emit(:group, :named_ab, text)
514
511
 
515
- when /^\(\?'\w*'/
516
- emit(:group, :named_sq, text, ts, te)
512
+ when /^\(\?'[^']+'/
513
+ emit(:group, :named_sq, text)
517
514
 
518
515
  end
519
516
  };
520
517
 
521
518
  group_open @group_opened {
522
- text = text(data, ts, te).first
523
- emit(:group, :capture, text, ts, te)
519
+ text = copy(data, ts, te)
520
+ emit(:group, :capture, text)
524
521
  };
525
522
 
526
523
  group_close @group_closed {
527
524
  if conditional_stack.last == group_depth + 1
528
525
  conditional_stack.pop
529
- emit(:conditional, :close, *text(data, ts, te))
526
+ emit(:conditional, :close, copy(data, ts, te))
530
527
  else
531
528
  if spacing_stack.length > 1 &&
532
529
  spacing_stack.last[:depth] == group_depth + 1
@@ -534,7 +531,7 @@
534
531
  self.free_spacing = spacing_stack.last[:free_spacing]
535
532
  end
536
533
 
537
- emit(:group, :close, *text(data, ts, te))
534
+ emit(:group, :close, copy(data, ts, te))
538
535
  end
539
536
  };
540
537
 
@@ -542,63 +539,65 @@
542
539
  # Group backreference, named and numbered
543
540
  # ------------------------------------------------------------------------
544
541
  backslash . (group_name_ref | group_number_ref) > (backslashed, 4) {
545
- case text = text(data, ts, te).first
542
+ case text = copy(data, ts, te)
546
543
  when /^\\([gk])(<>|'')/ # angle brackets
547
544
  validation_error(:backref, 'ref/call', 'ref ID is empty')
548
545
 
549
- when /^\\([gk])<[^\d+-]\w*>/ # angle-brackets
546
+ # TODO: finer quirks of choosing recursive or non-recursive refs/calls.
547
+ # e.g.: `a-1` is a valid group id: 'aa'[/(?<a-1>a)\g<a-1>/] # => 'aa'
548
+ when /^\\([gk])<[^\p{digit}+\->][^>+\-]*>/ # angle-brackets
550
549
  if $1 == 'k'
551
- emit(:backref, :name_ref_ab, text, ts, te)
550
+ emit(:backref, :name_ref_ab, text)
552
551
  else
553
- emit(:backref, :name_call_ab, text, ts, te)
552
+ emit(:backref, :name_call_ab, text)
554
553
  end
555
554
 
556
- when /^\\([gk])'[^\d+-]\w*'/ #single quotes
555
+ when /^\\([gk])'[^\p{digit}+\-'][^'+\-]*'/ # single quotes
557
556
  if $1 == 'k'
558
- emit(:backref, :name_ref_sq, text, ts, te)
557
+ emit(:backref, :name_ref_sq, text)
559
558
  else
560
- emit(:backref, :name_call_sq, text, ts, te)
559
+ emit(:backref, :name_call_sq, text)
561
560
  end
562
561
 
563
562
  when /^\\([gk])<\d+>/ # angle-brackets
564
563
  if $1 == 'k'
565
- emit(:backref, :number_ref_ab, text, ts, te)
564
+ emit(:backref, :number_ref_ab, text)
566
565
  else
567
- emit(:backref, :number_call_ab, text, ts, te)
566
+ emit(:backref, :number_call_ab, text)
568
567
  end
569
568
 
570
569
  when /^\\([gk])'\d+'/ # single quotes
571
570
  if $1 == 'k'
572
- emit(:backref, :number_ref_sq, text, ts, te)
571
+ emit(:backref, :number_ref_sq, text)
573
572
  else
574
- emit(:backref, :number_call_sq, text, ts, te)
573
+ emit(:backref, :number_call_sq, text)
575
574
  end
576
575
 
577
576
  when /^\\(?:g<\+|g<-|(k)<-)\d+>/ # angle-brackets
578
577
  if $1 == 'k'
579
- emit(:backref, :number_rel_ref_ab, text, ts, te)
578
+ emit(:backref, :number_rel_ref_ab, text)
580
579
  else
581
- emit(:backref, :number_rel_call_ab, text, ts, te)
580
+ emit(:backref, :number_rel_call_ab, text)
582
581
  end
583
582
 
584
583
  when /^\\(?:g'\+|g'-|(k)'-)\d+'/ # single quotes
585
584
  if $1 == 'k'
586
- emit(:backref, :number_rel_ref_sq, text, ts, te)
585
+ emit(:backref, :number_rel_ref_sq, text)
587
586
  else
588
- emit(:backref, :number_rel_call_sq, text, ts, te)
587
+ emit(:backref, :number_rel_call_sq, text)
589
588
  end
590
589
 
591
- when /^\\k<[^\d+\-]\w*[+\-]\d+>/ # angle-brackets
592
- emit(:backref, :name_recursion_ref_ab, text, ts, te)
590
+ when /^\\k<[^\p{digit}+\->][^>]*[+\-]\d+>/ # angle-brackets
591
+ emit(:backref, :name_recursion_ref_ab, text)
593
592
 
594
- when /^\\k'[^\d+\-]\w*[+\-]\d+'/ # single-quotes
595
- emit(:backref, :name_recursion_ref_sq, text, ts, te)
593
+ when /^\\k'[^\p{digit}+\-'][^']*[+\-]\d+'/ # single-quotes
594
+ emit(:backref, :name_recursion_ref_sq, text)
596
595
 
597
596
  when /^\\([gk])<[+\-]?\d+[+\-]\d+>/ # angle-brackets
598
- emit(:backref, :number_recursion_ref_ab, text, ts, te)
597
+ emit(:backref, :number_recursion_ref_ab, text)
599
598
 
600
599
  when /^\\([gk])'[+\-]?\d+[+\-]\d+'/ # single-quotes
601
- emit(:backref, :number_recursion_ref_sq, text, ts, te)
600
+ emit(:backref, :number_recursion_ref_sq, text)
602
601
 
603
602
  end
604
603
  };
@@ -607,31 +606,31 @@
607
606
  # Quantifiers
608
607
  # ------------------------------------------------------------------------
609
608
  zero_or_one {
610
- case text = text(data, ts, te).first
611
- when '?' ; emit(:quantifier, :zero_or_one, text, ts, te)
612
- when '??'; emit(:quantifier, :zero_or_one_reluctant, text, ts, te)
613
- when '?+'; emit(:quantifier, :zero_or_one_possessive, text, ts, te)
609
+ case text = copy(data, ts, te)
610
+ when '?' ; emit(:quantifier, :zero_or_one, text)
611
+ when '??'; emit(:quantifier, :zero_or_one_reluctant, text)
612
+ when '?+'; emit(:quantifier, :zero_or_one_possessive, text)
614
613
  end
615
614
  };
616
615
 
617
616
  zero_or_more {
618
- case text = text(data, ts, te).first
619
- when '*' ; emit(:quantifier, :zero_or_more, text, ts, te)
620
- when '*?'; emit(:quantifier, :zero_or_more_reluctant, text, ts, te)
621
- when '*+'; emit(:quantifier, :zero_or_more_possessive, text, ts, te)
617
+ case text = copy(data, ts, te)
618
+ when '*' ; emit(:quantifier, :zero_or_more, text)
619
+ when '*?'; emit(:quantifier, :zero_or_more_reluctant, text)
620
+ when '*+'; emit(:quantifier, :zero_or_more_possessive, text)
622
621
  end
623
622
  };
624
623
 
625
624
  one_or_more {
626
- case text = text(data, ts, te).first
627
- when '+' ; emit(:quantifier, :one_or_more, text, ts, te)
628
- when '+?'; emit(:quantifier, :one_or_more_reluctant, text, ts, te)
629
- when '++'; emit(:quantifier, :one_or_more_possessive, text, ts, te)
625
+ case text = copy(data, ts, te)
626
+ when '+' ; emit(:quantifier, :one_or_more, text)
627
+ when '+?'; emit(:quantifier, :one_or_more_reluctant, text)
628
+ when '++'; emit(:quantifier, :one_or_more_possessive, text)
630
629
  end
631
630
  };
632
631
 
633
632
  quantifier_interval {
634
- emit(:quantifier, :interval, *text(data, ts, te))
633
+ emit(:quantifier, :interval, copy(data, ts, te))
635
634
  };
636
635
 
637
636
  # Catch unmatched curly braces as literals
@@ -647,7 +646,7 @@
647
646
 
648
647
  comment {
649
648
  if free_spacing
650
- emit(:free_space, :comment, *text(data, ts, te))
649
+ emit(:free_space, :comment, copy(data, ts, te))
651
650
  else
652
651
  # consume only the pound sign (#) and backtrack to do regular scanning
653
652
  append_literal(data, ts, ts + 1)
@@ -657,7 +656,7 @@
657
656
 
658
657
  space+ {
659
658
  if free_spacing
660
- emit(:free_space, :whitespace, *text(data, ts, te))
659
+ emit(:free_space, :whitespace, copy(data, ts, te))
661
660
  else
662
661
  append_literal(data, ts, te)
663
662
  end
@@ -666,11 +665,7 @@
666
665
  # Literal: any run of ASCII (printable or non-printable), and/or UTF-8,
667
666
  # except meta characters.
668
667
  # ------------------------------------------------------------------------
669
- (ascii_print -- space)+ |
670
- ascii_nonprint+ |
671
- utf8_2_byte+ |
672
- utf8_3_byte+ |
673
- utf8_4_byte+ {
668
+ (ascii_print -- space)+ | ascii_nonprint+ | utf8_multibyte+ {
674
669
  append_literal(data, ts, te)
675
670
  };
676
671
 
@@ -760,6 +755,7 @@ class Regexp::Scanner
760
755
  self.set_depth = 0
761
756
  self.group_depth = 0
762
757
  self.conditional_stack = []
758
+ self.char_pos = 0
763
759
 
764
760
  %% write data;
765
761
  %% write init;
@@ -769,7 +765,7 @@ class Regexp::Scanner
769
765
  testEof = testEof
770
766
 
771
767
  if cs == re_scanner_error
772
- text = ts ? copy(data, ts-1..-1) : data.pack('c*')
768
+ text = copy(data, ts ? ts-1 : 0, -1)
773
769
  raise ScannerError.new("Scan error at '#{text}'")
774
770
  end
775
771
 
@@ -797,22 +793,29 @@ class Regexp::Scanner
797
793
  end
798
794
 
799
795
  # Emits an array with the details of the scanned pattern
800
- def emit(type, token, text, ts, te)
796
+ def emit(type, token, text)
801
797
  #puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
802
798
 
803
799
  emit_literal if literal
804
800
 
801
+ # Ragel runs with byte-based indices (ts, te). These are of little value to
802
+ # end-users, so we keep track of char-based indices and emit those instead.
803
+ ts_char_pos = char_pos
804
+ te_char_pos = char_pos + text.length
805
+
805
806
  if block
806
- block.call type, token, text, ts, te
807
+ block.call type, token, text, ts_char_pos, te_char_pos
807
808
  end
808
809
 
809
- tokens << [type, token, text, ts, te]
810
+ tokens << [type, token, text, ts_char_pos, te_char_pos]
811
+
812
+ self.char_pos = te_char_pos
810
813
  end
811
814
 
812
815
  private
813
816
 
814
817
  attr_accessor :tokens, :literal, :block, :free_spacing, :spacing_stack,
815
- :group_depth, :set_depth, :conditional_stack
818
+ :group_depth, :set_depth, :conditional_stack, :char_pos
816
819
 
817
820
  def free_spacing?(input_object, options)
818
821
  if options && !input_object.is_a?(String)
@@ -835,36 +838,25 @@ class Regexp::Scanner
835
838
  end
836
839
 
837
840
  # Copy from ts to te from data as text
838
- def copy(data, range)
839
- data[range].pack('c*')
840
- end
841
-
842
- # Copy from ts to te from data as text, returning an array with the text
843
- # and the offsets used to copy it.
844
- def text(data, ts, te, soff = 0)
845
- [copy(data, ts-soff..te-1), ts-soff, te]
841
+ def copy(data, ts, te)
842
+ data[ts...te].pack('c*').force_encoding('utf-8')
846
843
  end
847
844
 
848
845
  # Appends one or more characters to the literal buffer, to be emitted later
849
- # by a call to emit_literal. Contents can be a mix of ASCII and UTF-8.
846
+ # by a call to emit_literal.
850
847
  def append_literal(data, ts, te)
851
848
  self.literal = literal || []
852
- literal << text(data, ts, te)
849
+ literal << copy(data, ts, te)
853
850
  end
854
851
 
855
- # Emits the literal run collected by calls to the append_literal method,
856
- # using the total start (ts) and end (te) offsets of the run.
852
+ # Emits the literal run collected by calls to the append_literal method.
857
853
  def emit_literal
858
- ts, te = literal.first[1], literal.last[2]
859
- text = literal.map {|t| t[0]}.join
860
-
861
- text.force_encoding('utf-8') if text.respond_to?(:force_encoding)
862
-
854
+ text = literal.join
863
855
  self.literal = nil
864
- emit(:literal, :literal, text, ts, te)
856
+ emit(:literal, :literal, text)
865
857
  end
866
858
 
867
- def emit_options(text, ts, te)
859
+ def emit_options(text)
868
860
  token = nil
869
861
 
870
862
  # Ruby allows things like '(?-xxxx)' or '(?xx-xx--xx-:abc)'.
@@ -890,14 +882,14 @@ class Regexp::Scanner
890
882
  token = :options_switch
891
883
  end
892
884
 
893
- emit(:group, token, text, ts, te)
885
+ emit(:group, token, text)
894
886
  end
895
887
 
896
888
  def emit_meta_control_sequence(data, ts, te, token)
897
889
  if data.last < 0x00 || data.last > 0x7F
898
890
  validation_error(:sequence, 'escape', token.to_s)
899
891
  end
900
- emit(:escape, token, *text(data, ts, te, 1))
892
+ emit(:escape, token, copy(data, ts-1, te))
901
893
  end
902
894
 
903
895
  # Centralizes and unifies the handling of validation related