regexp_parser 1.8.2 → 2.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (56) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +93 -0
  3. data/Gemfile +6 -1
  4. data/README.md +1 -4
  5. data/Rakefile +8 -8
  6. data/lib/regexp_parser.rb +1 -0
  7. data/lib/regexp_parser/error.rb +4 -0
  8. data/lib/regexp_parser/expression.rb +5 -18
  9. data/lib/regexp_parser/expression/classes/backref.rb +5 -0
  10. data/lib/regexp_parser/expression/classes/conditional.rb +11 -1
  11. data/lib/regexp_parser/expression/classes/free_space.rb +2 -2
  12. data/lib/regexp_parser/expression/classes/group.rb +28 -3
  13. data/lib/regexp_parser/expression/classes/property.rb +1 -1
  14. data/lib/regexp_parser/expression/classes/root.rb +4 -16
  15. data/lib/regexp_parser/expression/classes/set/range.rb +2 -1
  16. data/lib/regexp_parser/expression/methods/match_length.rb +2 -2
  17. data/lib/regexp_parser/expression/methods/traverse.rb +2 -2
  18. data/lib/regexp_parser/expression/quantifier.rb +10 -1
  19. data/lib/regexp_parser/expression/sequence.rb +3 -19
  20. data/lib/regexp_parser/expression/subexpression.rb +1 -1
  21. data/lib/regexp_parser/lexer.rb +2 -2
  22. data/lib/regexp_parser/parser.rb +306 -332
  23. data/lib/regexp_parser/scanner.rb +1272 -1338
  24. data/lib/regexp_parser/scanner/char_type.rl +11 -11
  25. data/lib/regexp_parser/scanner/property.rl +2 -2
  26. data/lib/regexp_parser/scanner/scanner.rl +206 -238
  27. data/lib/regexp_parser/syntax.rb +7 -7
  28. data/lib/regexp_parser/syntax/any.rb +3 -3
  29. data/lib/regexp_parser/syntax/base.rb +1 -1
  30. data/lib/regexp_parser/syntax/version_lookup.rb +2 -2
  31. data/lib/regexp_parser/syntax/versions.rb +1 -1
  32. data/lib/regexp_parser/version.rb +1 -1
  33. data/spec/expression/base_spec.rb +10 -0
  34. data/spec/expression/clone_spec.rb +36 -4
  35. data/spec/expression/free_space_spec.rb +2 -2
  36. data/spec/expression/methods/match_length_spec.rb +2 -2
  37. data/spec/expression/subexpression_spec.rb +1 -1
  38. data/spec/expression/to_s_spec.rb +39 -31
  39. data/spec/lexer/literals_spec.rb +24 -49
  40. data/spec/lexer/refcalls_spec.rb +5 -0
  41. data/spec/parser/all_spec.rb +2 -2
  42. data/spec/parser/errors_spec.rb +1 -1
  43. data/spec/parser/escapes_spec.rb +1 -1
  44. data/spec/parser/quantifiers_spec.rb +16 -0
  45. data/spec/parser/refcalls_spec.rb +5 -0
  46. data/spec/parser/set/ranges_spec.rb +3 -3
  47. data/spec/scanner/escapes_spec.rb +8 -1
  48. data/spec/scanner/groups_spec.rb +10 -1
  49. data/spec/scanner/literals_spec.rb +28 -38
  50. data/spec/scanner/quantifiers_spec.rb +18 -13
  51. data/spec/scanner/refcalls_spec.rb +19 -0
  52. data/spec/scanner/sets_spec.rb +65 -16
  53. data/spec/spec_helper.rb +1 -0
  54. metadata +4 -7
  55. data/spec/expression/root_spec.rb +0 -9
  56. data/spec/expression/sequence_spec.rb +0 -9
@@ -10,17 +10,17 @@
10
10
  # --------------------------------------------------------------------------
11
11
  char_type := |*
12
12
  char_type_char {
13
- case text = text(data, ts, te, 1).first
14
- when '\d'; emit(:type, :digit, text, ts - 1, te)
15
- when '\D'; emit(:type, :nondigit, text, ts - 1, te)
16
- when '\h'; emit(:type, :hex, text, ts - 1, te)
17
- when '\H'; emit(:type, :nonhex, text, ts - 1, te)
18
- when '\s'; emit(:type, :space, text, ts - 1, te)
19
- when '\S'; emit(:type, :nonspace, text, ts - 1, te)
20
- when '\w'; emit(:type, :word, text, ts - 1, te)
21
- when '\W'; emit(:type, :nonword, text, ts - 1, te)
22
- when '\R'; emit(:type, :linebreak, text, ts - 1, te)
23
- when '\X'; emit(:type, :xgrapheme, text, ts - 1, te)
13
+ case text = copy(data, ts-1, te)
14
+ when '\d'; emit(:type, :digit, text)
15
+ when '\D'; emit(:type, :nondigit, text)
16
+ when '\h'; emit(:type, :hex, text)
17
+ when '\H'; emit(:type, :nonhex, text)
18
+ when '\s'; emit(:type, :space, text)
19
+ when '\S'; emit(:type, :nonspace, text)
20
+ when '\w'; emit(:type, :word, text)
21
+ when '\W'; emit(:type, :nonword, text)
22
+ when '\R'; emit(:type, :linebreak, text)
23
+ when '\X'; emit(:type, :xgrapheme, text)
24
24
  end
25
25
  fret;
26
26
  };
@@ -14,7 +14,7 @@
14
14
  unicode_property := |*
15
15
 
16
16
  property_sequence < eof(premature_property_end) {
17
- text = text(data, ts, te, 1).first
17
+ text = copy(data, ts-1, te)
18
18
  type = (text[1] == 'P') ^ (text[3] == '^') ? :nonproperty : :property
19
19
 
20
20
  name = data[ts+2..te-2].pack('c*').gsub(/[\^\s_\-]/, '').downcase
@@ -22,7 +22,7 @@
22
22
  token = self.class.short_prop_map[name] || self.class.long_prop_map[name]
23
23
  raise UnknownUnicodePropertyError.new(name) unless token
24
24
 
25
- self.emit(type, token.to_sym, text, ts-1, te)
25
+ self.emit(type, token.to_sym, text)
26
26
 
27
27
  fret;
28
28
  };
@@ -3,6 +3,11 @@
3
3
  include re_char_type "char_type.rl";
4
4
  include re_property "property.rl";
5
5
 
6
+ utf8_2_byte = (0xc2..0xdf 0x80..0xbf);
7
+ utf8_3_byte = (0xe0..0xef 0x80..0xbf 0x80..0xbf);
8
+ utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
9
+ utf8_multibyte = utf8_2_byte | utf8_3_byte | utf8_4_byte;
10
+
6
11
  dot = '.';
7
12
  backslash = '\\';
8
13
  alternation = '|';
@@ -15,7 +20,7 @@
15
20
 
16
21
  group_open = '(';
17
22
  group_close = ')';
18
- parantheses = group_open | group_close;
23
+ parentheses = group_open | group_close;
19
24
 
20
25
  set_open = '[';
21
26
  set_close = ']';
@@ -32,7 +37,7 @@
32
37
  class_posix = ('[:' . '^'? . class_name_posix . ':]');
33
38
 
34
39
 
35
- # these are not supported in ruby, and need verification
40
+ # these are not supported in ruby at the moment
36
41
  collating_sequence = '[.' . (alpha | [\-])+ . '.]';
37
42
  character_equivalent = '[=' . alpha . '=]';
38
43
 
@@ -53,6 +58,8 @@
53
58
 
54
59
  meta_sequence = 'M-' . (backslash . ('c' | 'C-'))? . backslash? . any;
55
60
 
61
+ sequence_char = [CMcux];
62
+
56
63
  zero_or_one = '?' | '??' | '?+';
57
64
  zero_or_more = '*' | '*?' | '*+';
58
65
  one_or_more = '+' | '+?' | '++';
@@ -90,21 +97,26 @@
90
97
  group_options = '?' . ( [^!#'():<=>~]+ . ':'? ) ?;
91
98
 
92
99
  group_ref = [gk];
93
- group_name_char = (alnum | '_');
94
- group_name_id = (group_name_char . (group_name_char+)?)?;
95
- group_number = '-'? . [1-9] . ([0-9]+)?;
100
+ group_name_id_ab = ([^0-9\->] | utf8_multibyte) . ([^>] | utf8_multibyte)*;
101
+ group_name_id_sq = ([^0-9\-'] | utf8_multibyte) . ([^'] | utf8_multibyte)*;
102
+ group_number = '-'? . [1-9] . [0-9]*;
96
103
  group_level = [+\-] . [0-9]+;
97
104
 
98
- group_name = ('<' . group_name_id . '>') | ("'" . group_name_id . "'");
105
+ group_name = ('<' . group_name_id_ab? . '>') |
106
+ ("'" . group_name_id_sq? . "'");
99
107
  group_lookup = group_name | group_number;
100
108
 
101
109
  group_named = ('?' . group_name );
102
110
 
103
- group_name_ref = group_ref . (('<' . group_name_id . group_level? '>') |
104
- ("'" . group_name_id . group_level? "'"));
111
+ group_name_backref = 'k' . (('<' . group_name_id_ab? . group_level? '>') |
112
+ ("'" . group_name_id_sq? . group_level? "'"));
113
+ group_name_call = 'g' . (('<' . group_name_id_ab? . group_level? '>') |
114
+ ("'" . group_name_id_sq? . group_level? "'"));
105
115
 
106
- group_number_ref = group_ref . (('<' . group_number . group_level? '>') |
107
- ("'" . group_number . group_level? "'"));
116
+ group_number_backref = 'k' . (('<' . group_number . group_level? '>') |
117
+ ("'" . group_number . group_level? "'"));
118
+ group_number_call = 'g' . (('<' . ((group_number . group_level?) | '0') '>') |
119
+ ("'" . ((group_number . group_level?) | '0') "'"));
108
120
 
109
121
  group_type = group_atomic | group_passive | group_absence | group_named;
110
122
 
@@ -115,7 +127,7 @@
115
127
 
116
128
  # characters that 'break' a literal
117
129
  meta_char = dot | backslash | alternation |
118
- curlies | parantheses | brackets |
130
+ curlies | parentheses | brackets |
119
131
  line_anchor | quantifier_greedy;
120
132
 
121
133
  literal_delimiters = ']' | '}';
@@ -123,25 +135,23 @@
123
135
  ascii_print = ((0x20..0x7e) - meta_char - '#');
124
136
  ascii_nonprint = (0x01..0x1f | 0x7f);
125
137
 
126
- utf8_2_byte = (0xc2..0xdf 0x80..0xbf);
127
- utf8_3_byte = (0xe0..0xef 0x80..0xbf 0x80..0xbf);
128
- utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
129
-
130
138
  non_literal_escape = char_type_char | anchor_char | escaped_ascii |
131
- keep_mark | [xucCM];
139
+ keep_mark | sequence_char;
140
+
141
+ # escapes that also work within a character set
142
+ set_escape = backslash | brackets | escaped_ascii | property_char |
143
+ sequence_char | single_codepoint_char_type;
132
144
 
133
- non_set_escape = (anchor_char - 'b') | group_ref | keep_mark |
134
- multi_codepoint_char_type | [0-9cCM];
135
145
 
136
146
  # EOF error, used where it can be detected
137
147
  action premature_end_error {
138
- text = ts ? copy(data, ts-1..-1) : data.pack('c*')
148
+ text = copy(data, ts ? ts-1 : 0, -1)
139
149
  raise PrematureEndError.new( text )
140
150
  }
141
151
 
142
152
  # Invalid sequence error, used from sequences, like escapes and sets
143
153
  action invalid_sequence_error {
144
- text = ts ? copy(data, ts-1..-1) : data.pack('c*')
154
+ text = copy(data, ts ? ts-1 : 0, -1)
145
155
  validation_error(:sequence, 'sequence', text)
146
156
  }
147
157
 
@@ -156,7 +166,7 @@
156
166
  # --------------------------------------------------------------------------
157
167
  character_set := |*
158
168
  set_close > (set_meta, 2) @set_closed {
159
- emit(:set, :close, *text(data, ts, te))
169
+ emit(:set, :close, copy(data, ts, te))
160
170
  if in_set?
161
171
  fret;
162
172
  else
@@ -165,8 +175,8 @@
165
175
  };
166
176
 
167
177
  '-]' @set_closed { # special case, emits two tokens
168
- emit(:literal, :literal, copy(data, ts..te-2), ts, te - 1)
169
- emit(:set, :close, copy(data, ts+1..te-1), ts + 1, te)
178
+ emit(:literal, :literal, copy(data, ts, te-1))
179
+ emit(:set, :close, copy(data, ts+1, te))
170
180
  if in_set?
171
181
  fret;
172
182
  else
@@ -175,33 +185,33 @@
175
185
  };
176
186
 
177
187
  '-&&' { # special case, emits two tokens
178
- emit(:literal, :literal, '-', ts, te)
179
- emit(:set, :intersection, '&&', ts, te)
188
+ emit(:literal, :literal, '-')
189
+ emit(:set, :intersection, '&&')
180
190
  };
181
191
 
182
192
  '^' {
183
- text = text(data, ts, te).first
193
+ text = copy(data, ts, te)
184
194
  if tokens.last[1] == :open
185
- emit(:set, :negate, text, ts, te)
195
+ emit(:set, :negate, text)
186
196
  else
187
- emit(:literal, :literal, text, ts, te)
197
+ emit(:literal, :literal, text)
188
198
  end
189
199
  };
190
200
 
191
201
  '-' {
192
- text = text(data, ts, te).first
202
+ text = copy(data, ts, te)
193
203
  # ranges can't start with a subset or intersection/negation/range operator
194
204
  if tokens.last[0] == :set
195
- emit(:literal, :literal, text, ts, te)
205
+ emit(:literal, :literal, text)
196
206
  else
197
- emit(:set, :range, text, ts, te)
207
+ emit(:set, :range, text)
198
208
  end
199
209
  };
200
210
 
201
211
  # Unlike ranges, intersections can start or end at set boundaries, whereupon
202
212
  # they match nothing: r = /[a&&]/; [r =~ ?a, r =~ ?&] # => [nil, nil]
203
213
  '&&' {
204
- emit(:set, :intersection, *text(data, ts, te))
214
+ emit(:set, :intersection, copy(data, ts, te))
205
215
  };
206
216
 
207
217
  backslash {
@@ -209,12 +219,12 @@
209
219
  };
210
220
 
211
221
  set_open >(open_bracket, 1) >set_opened {
212
- emit(:set, :open, *text(data, ts, te))
222
+ emit(:set, :open, copy(data, ts, te))
213
223
  fcall character_set;
214
224
  };
215
225
 
216
226
  class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error) {
217
- text = text(data, ts, te).first
227
+ text = copy(data, ts, te)
218
228
 
219
229
  type = :posixclass
220
230
  class_name = text[2..-3]
@@ -223,45 +233,40 @@
223
233
  type = :nonposixclass
224
234
  end
225
235
 
226
- emit(type, class_name.to_sym, text, ts, te)
236
+ emit(type, class_name.to_sym, text)
227
237
  };
228
238
 
229
- collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
230
- emit(:set, :collation, *text(data, ts, te))
231
- };
232
-
233
- character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
234
- emit(:set, :equivalent, *text(data, ts, te))
235
- };
239
+ # These are not supported in ruby at the moment. Enable them if they are.
240
+ # collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
241
+ # emit(:set, :collation, copy(data, ts, te))
242
+ # };
243
+ # character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
244
+ # emit(:set, :equivalent, copy(data, ts, te))
245
+ # };
236
246
 
237
247
  meta_char > (set_meta, 1) {
238
- emit(:literal, :literal, *text(data, ts, te))
248
+ emit(:literal, :literal, copy(data, ts, te))
239
249
  };
240
250
 
241
- any |
242
- ascii_nonprint |
243
- utf8_2_byte |
244
- utf8_3_byte |
245
- utf8_4_byte {
246
- char, *rest = *text(data, ts, te)
247
- char.force_encoding('utf-8') if char.respond_to?(:force_encoding)
248
- emit(:literal, :literal, char, *rest)
251
+ any | ascii_nonprint | utf8_multibyte {
252
+ text = copy(data, ts, te)
253
+ emit(:literal, :literal, text)
249
254
  };
250
255
  *|;
251
256
 
252
257
  # set escapes scanner
253
258
  # --------------------------------------------------------------------------
254
259
  set_escape_sequence := |*
255
- non_set_escape > (escaped_set_alpha, 2) {
256
- emit(:escape, :literal, *text(data, ts, te, 1))
257
- fret;
258
- };
259
-
260
- any > (escaped_set_alpha, 1) {
260
+ set_escape > (escaped_set_alpha, 2) {
261
261
  fhold;
262
262
  fnext character_set;
263
263
  fcall escape_sequence;
264
264
  };
265
+
266
+ any > (escaped_set_alpha, 1) {
267
+ emit(:escape, :literal, copy(data, ts-1, te))
268
+ fret;
269
+ };
265
270
  *|;
266
271
 
267
272
 
@@ -269,33 +274,33 @@
269
274
  # --------------------------------------------------------------------------
270
275
  escape_sequence := |*
271
276
  [1-9] {
272
- text = text(data, ts, te, 1).first
273
- emit(:backref, :number, text, ts-1, te)
277
+ text = copy(data, ts-1, te)
278
+ emit(:backref, :number, text)
274
279
  fret;
275
280
  };
276
281
 
277
282
  octal_sequence {
278
- emit(:escape, :octal, *text(data, ts, te, 1))
283
+ emit(:escape, :octal, copy(data, ts-1, te))
279
284
  fret;
280
285
  };
281
286
 
282
287
  meta_char {
283
- case text = text(data, ts, te, 1).first
284
- when '\.'; emit(:escape, :dot, text, ts-1, te)
285
- when '\|'; emit(:escape, :alternation, text, ts-1, te)
286
- when '\^'; emit(:escape, :bol, text, ts-1, te)
287
- when '\$'; emit(:escape, :eol, text, ts-1, te)
288
- when '\?'; emit(:escape, :zero_or_one, text, ts-1, te)
289
- when '\*'; emit(:escape, :zero_or_more, text, ts-1, te)
290
- when '\+'; emit(:escape, :one_or_more, text, ts-1, te)
291
- when '\('; emit(:escape, :group_open, text, ts-1, te)
292
- when '\)'; emit(:escape, :group_close, text, ts-1, te)
293
- when '\{'; emit(:escape, :interval_open, text, ts-1, te)
294
- when '\}'; emit(:escape, :interval_close, text, ts-1, te)
295
- when '\['; emit(:escape, :set_open, text, ts-1, te)
296
- when '\]'; emit(:escape, :set_close, text, ts-1, te)
288
+ case text = copy(data, ts-1, te)
289
+ when '\.'; emit(:escape, :dot, text)
290
+ when '\|'; emit(:escape, :alternation, text)
291
+ when '\^'; emit(:escape, :bol, text)
292
+ when '\$'; emit(:escape, :eol, text)
293
+ when '\?'; emit(:escape, :zero_or_one, text)
294
+ when '\*'; emit(:escape, :zero_or_more, text)
295
+ when '\+'; emit(:escape, :one_or_more, text)
296
+ when '\('; emit(:escape, :group_open, text)
297
+ when '\)'; emit(:escape, :group_close, text)
298
+ when '\{'; emit(:escape, :interval_open, text)
299
+ when '\}'; emit(:escape, :interval_close, text)
300
+ when '\['; emit(:escape, :set_open, text)
301
+ when '\]'; emit(:escape, :set_close, text)
297
302
  when "\\\\";
298
- emit(:escape, :backslash, text, ts-1, te)
303
+ emit(:escape, :backslash, text)
299
304
  end
300
305
  fret;
301
306
  };
@@ -303,31 +308,31 @@
303
308
  escaped_ascii > (escaped_alpha, 7) {
304
309
  # \b is emitted as backspace only when inside a character set, otherwise
305
310
  # it is a word boundary anchor. A syntax might "normalize" it if needed.
306
- case text = text(data, ts, te, 1).first
307
- when '\a'; emit(:escape, :bell, text, ts-1, te)
308
- when '\b'; emit(:escape, :backspace, text, ts-1, te)
309
- when '\e'; emit(:escape, :escape, text, ts-1, te)
310
- when '\f'; emit(:escape, :form_feed, text, ts-1, te)
311
- when '\n'; emit(:escape, :newline, text, ts-1, te)
312
- when '\r'; emit(:escape, :carriage, text, ts-1, te)
313
- when '\t'; emit(:escape, :tab, text, ts-1, te)
314
- when '\v'; emit(:escape, :vertical_tab, text, ts-1, te)
311
+ case text = copy(data, ts-1, te)
312
+ when '\a'; emit(:escape, :bell, text)
313
+ when '\b'; emit(:escape, :backspace, text)
314
+ when '\e'; emit(:escape, :escape, text)
315
+ when '\f'; emit(:escape, :form_feed, text)
316
+ when '\n'; emit(:escape, :newline, text)
317
+ when '\r'; emit(:escape, :carriage, text)
318
+ when '\t'; emit(:escape, :tab, text)
319
+ when '\v'; emit(:escape, :vertical_tab, text)
315
320
  end
316
321
  fret;
317
322
  };
318
323
 
319
324
  codepoint_sequence > (escaped_alpha, 6) $eof(premature_end_error) {
320
- text = text(data, ts, te, 1).first
325
+ text = copy(data, ts-1, te)
321
326
  if text[2].chr == '{'
322
- emit(:escape, :codepoint_list, text, ts-1, te)
327
+ emit(:escape, :codepoint_list, text)
323
328
  else
324
- emit(:escape, :codepoint, text, ts-1, te)
329
+ emit(:escape, :codepoint, text)
325
330
  end
326
331
  fret;
327
332
  };
328
333
 
329
- hex_sequence > (escaped_alpha, 5) $eof(premature_end_error) {
330
- emit(:escape, :hex, *text(data, ts, te, 1))
334
+ hex_sequence > (escaped_alpha, 5) @eof(premature_end_error) {
335
+ emit(:escape, :hex, copy(data, ts-1, te))
331
336
  fret;
332
337
  };
333
338
 
@@ -357,8 +362,8 @@
357
362
  fcall unicode_property;
358
363
  };
359
364
 
360
- (any -- non_literal_escape) > (escaped_alpha, 1) {
361
- emit(:escape, :literal, *text(data, ts, te, 1))
365
+ (any -- non_literal_escape) | utf8_multibyte > (escaped_alpha, 1) {
366
+ emit(:escape, :literal, copy(data, ts-1, te))
362
367
  fret;
363
368
  };
364
369
  *|;
@@ -368,9 +373,9 @@
368
373
  # --------------------------------------------------------------------------
369
374
  conditional_expression := |*
370
375
  group_lookup . ')' {
371
- text = text(data, ts, te-1).first
372
- emit(:conditional, :condition, text, ts, te-1)
373
- emit(:conditional, :condition_close, ')', te-1, te)
376
+ text = copy(data, ts, te-1)
377
+ emit(:conditional, :condition, text)
378
+ emit(:conditional, :condition_close, ')')
374
379
  };
375
380
 
376
381
  any {
@@ -387,39 +392,39 @@
387
392
  # Meta characters
388
393
  # ------------------------------------------------------------------------
389
394
  dot {
390
- emit(:meta, :dot, *text(data, ts, te))
395
+ emit(:meta, :dot, copy(data, ts, te))
391
396
  };
392
397
 
393
398
  alternation {
394
399
  if conditional_stack.last == group_depth
395
- emit(:conditional, :separator, *text(data, ts, te))
400
+ emit(:conditional, :separator, copy(data, ts, te))
396
401
  else
397
- emit(:meta, :alternation, *text(data, ts, te))
402
+ emit(:meta, :alternation, copy(data, ts, te))
398
403
  end
399
404
  };
400
405
 
401
406
  # Anchors
402
407
  # ------------------------------------------------------------------------
403
408
  beginning_of_line {
404
- emit(:anchor, :bol, *text(data, ts, te))
409
+ emit(:anchor, :bol, copy(data, ts, te))
405
410
  };
406
411
 
407
412
  end_of_line {
408
- emit(:anchor, :eol, *text(data, ts, te))
413
+ emit(:anchor, :eol, copy(data, ts, te))
409
414
  };
410
415
 
411
416
  backslash . keep_mark > (backslashed, 4) {
412
- emit(:keep, :mark, *text(data, ts, te))
417
+ emit(:keep, :mark, copy(data, ts, te))
413
418
  };
414
419
 
415
420
  backslash . anchor_char > (backslashed, 3) {
416
- case text = text(data, ts, te).first
417
- when '\\A'; emit(:anchor, :bos, text, ts, te)
418
- when '\\z'; emit(:anchor, :eos, text, ts, te)
419
- when '\\Z'; emit(:anchor, :eos_ob_eol, text, ts, te)
420
- when '\\b'; emit(:anchor, :word_boundary, text, ts, te)
421
- when '\\B'; emit(:anchor, :nonword_boundary, text, ts, te)
422
- when '\\G'; emit(:anchor, :match_start, text, ts, te)
421
+ case text = copy(data, ts, te)
422
+ when '\\A'; emit(:anchor, :bos, text)
423
+ when '\\z'; emit(:anchor, :eos, text)
424
+ when '\\Z'; emit(:anchor, :eos_ob_eol, text)
425
+ when '\\b'; emit(:anchor, :word_boundary, text)
426
+ when '\\B'; emit(:anchor, :nonword_boundary, text)
427
+ when '\\G'; emit(:anchor, :match_start, text)
423
428
  end
424
429
  };
425
430
 
@@ -430,7 +435,7 @@
430
435
  # Character sets
431
436
  # ------------------------------------------------------------------------
432
437
  set_open >set_opened {
433
- emit(:set, :open, *text(data, ts, te))
438
+ emit(:set, :open, copy(data, ts, te))
434
439
  fcall character_set;
435
440
  };
436
441
 
@@ -439,12 +444,12 @@
439
444
  # (?(condition)Y|N) conditional expression
440
445
  # ------------------------------------------------------------------------
441
446
  conditional {
442
- text = text(data, ts, te).first
447
+ text = copy(data, ts, te)
443
448
 
444
449
  conditional_stack << group_depth
445
450
 
446
- emit(:conditional, :open, text[0..-2], ts, te-1)
447
- emit(:conditional, :condition_open, '(', te-1, te)
451
+ emit(:conditional, :open, text[0..-2])
452
+ emit(:conditional, :condition_open, '(')
448
453
  fcall conditional_expression;
449
454
  };
450
455
 
@@ -455,7 +460,7 @@
455
460
  # correct closing count.
456
461
  # ------------------------------------------------------------------------
457
462
  group_open . group_comment $group_closed {
458
- emit(:group, :comment, *text(data, ts, te))
463
+ emit(:group, :comment, copy(data, ts, te))
459
464
  };
460
465
 
461
466
  # Expression options:
@@ -470,11 +475,11 @@
470
475
  # (?imxdau-imx:subexp) option on/off for subexp
471
476
  # ------------------------------------------------------------------------
472
477
  group_open . group_options >group_opened {
473
- text = text(data, ts, te).first
478
+ text = copy(data, ts, te)
474
479
  if text[2..-1] =~ /([^\-mixdau:]|^$)|-.*([dau])/
475
480
  raise InvalidGroupOption.new($1 || "-#{$2}", text)
476
481
  end
477
- emit_options(text, ts, te)
482
+ emit_options(text)
478
483
  };
479
484
 
480
485
  # Assertions
@@ -484,11 +489,11 @@
484
489
  # (?<!subexp) negative look-behind
485
490
  # ------------------------------------------------------------------------
486
491
  group_open . assertion_type >group_opened {
487
- case text = text(data, ts, te).first
488
- when '(?='; emit(:assertion, :lookahead, text, ts, te)
489
- when '(?!'; emit(:assertion, :nlookahead, text, ts, te)
490
- when '(?<='; emit(:assertion, :lookbehind, text, ts, te)
491
- when '(?<!'; emit(:assertion, :nlookbehind, text, ts, te)
492
+ case text = copy(data, ts, te)
493
+ when '(?='; emit(:assertion, :lookahead, text)
494
+ when '(?!'; emit(:assertion, :nlookahead, text)
495
+ when '(?<='; emit(:assertion, :lookbehind, text)
496
+ when '(?<!'; emit(:assertion, :nlookbehind, text)
492
497
  end
493
498
  };
494
499
 
@@ -501,32 +506,32 @@
501
506
  # (subexp) captured group
502
507
  # ------------------------------------------------------------------------
503
508
  group_open . group_type >group_opened {
504
- case text = text(data, ts, te).first
505
- when '(?:'; emit(:group, :passive, text, ts, te)
506
- when '(?>'; emit(:group, :atomic, text, ts, te)
507
- when '(?~'; emit(:group, :absence, text, ts, te)
509
+ case text = copy(data, ts, te)
510
+ when '(?:'; emit(:group, :passive, text)
511
+ when '(?>'; emit(:group, :atomic, text)
512
+ when '(?~'; emit(:group, :absence, text)
508
513
 
509
514
  when /^\(\?(?:<>|'')/
510
515
  validation_error(:group, 'named group', 'name is empty')
511
516
 
512
- when /^\(\?<\w*>/
513
- emit(:group, :named_ab, text, ts, te)
517
+ when /^\(\?<[^>]+>/
518
+ emit(:group, :named_ab, text)
514
519
 
515
- when /^\(\?'\w*'/
516
- emit(:group, :named_sq, text, ts, te)
520
+ when /^\(\?'[^']+'/
521
+ emit(:group, :named_sq, text)
517
522
 
518
523
  end
519
524
  };
520
525
 
521
526
  group_open @group_opened {
522
- text = text(data, ts, te).first
523
- emit(:group, :capture, text, ts, te)
527
+ text = copy(data, ts, te)
528
+ emit(:group, :capture, text)
524
529
  };
525
530
 
526
531
  group_close @group_closed {
527
532
  if conditional_stack.last == group_depth + 1
528
533
  conditional_stack.pop
529
- emit(:conditional, :close, *text(data, ts, te))
534
+ emit(:conditional, :close, copy(data, ts, te))
530
535
  else
531
536
  if spacing_stack.length > 1 &&
532
537
  spacing_stack.last[:depth] == group_depth + 1
@@ -534,72 +539,42 @@
534
539
  self.free_spacing = spacing_stack.last[:free_spacing]
535
540
  end
536
541
 
537
- emit(:group, :close, *text(data, ts, te))
542
+ emit(:group, :close, copy(data, ts, te))
538
543
  end
539
544
  };
540
545
 
541
546
 
542
547
  # Group backreference, named and numbered
543
548
  # ------------------------------------------------------------------------
544
- backslash . (group_name_ref | group_number_ref) > (backslashed, 4) {
545
- case text = text(data, ts, te).first
546
- when /^\\([gk])(<>|'')/ # angle brackets
547
- validation_error(:backref, 'ref/call', 'ref ID is empty')
548
-
549
- when /^\\([gk])<[^\d+-]\w*>/ # angle-brackets
550
- if $1 == 'k'
551
- emit(:backref, :name_ref_ab, text, ts, te)
552
- else
553
- emit(:backref, :name_call_ab, text, ts, te)
554
- end
555
-
556
- when /^\\([gk])'[^\d+-]\w*'/ #single quotes
557
- if $1 == 'k'
558
- emit(:backref, :name_ref_sq, text, ts, te)
559
- else
560
- emit(:backref, :name_call_sq, text, ts, te)
561
- end
562
-
563
- when /^\\([gk])<\d+>/ # angle-brackets
564
- if $1 == 'k'
565
- emit(:backref, :number_ref_ab, text, ts, te)
566
- else
567
- emit(:backref, :number_call_ab, text, ts, te)
568
- end
569
-
570
- when /^\\([gk])'\d+'/ # single quotes
571
- if $1 == 'k'
572
- emit(:backref, :number_ref_sq, text, ts, te)
573
- else
574
- emit(:backref, :number_call_sq, text, ts, te)
575
- end
576
-
577
- when /^\\(?:g<\+|g<-|(k)<-)\d+>/ # angle-brackets
578
- if $1 == 'k'
579
- emit(:backref, :number_rel_ref_ab, text, ts, te)
580
- else
581
- emit(:backref, :number_rel_call_ab, text, ts, te)
582
- end
583
-
584
- when /^\\(?:g'\+|g'-|(k)'-)\d+'/ # single quotes
585
- if $1 == 'k'
586
- emit(:backref, :number_rel_ref_sq, text, ts, te)
587
- else
588
- emit(:backref, :number_rel_call_sq, text, ts, te)
589
- end
590
-
591
- when /^\\k<[^\d+\-]\w*[+\-]\d+>/ # angle-brackets
592
- emit(:backref, :name_recursion_ref_ab, text, ts, te)
593
-
594
- when /^\\k'[^\d+\-]\w*[+\-]\d+'/ # single-quotes
595
- emit(:backref, :name_recursion_ref_sq, text, ts, te)
596
-
597
- when /^\\([gk])<[+\-]?\d+[+\-]\d+>/ # angle-brackets
598
- emit(:backref, :number_recursion_ref_ab, text, ts, te)
599
-
600
- when /^\\([gk])'[+\-]?\d+[+\-]\d+'/ # single-quotes
601
- emit(:backref, :number_recursion_ref_sq, text, ts, te)
549
+ backslash . (group_name_backref | group_number_backref) > (backslashed, 4) {
550
+ case text = copy(data, ts, te)
551
+ when /^\\k(<>|'')/
552
+ validation_error(:backref, 'backreference', 'ref ID is empty')
553
+ when /^\\k(.)[^\p{digit}\-][^+\-]*\D$/
554
+ emit(:backref, $1 == '<' ? :name_ref_ab : :name_ref_sq, text)
555
+ when /^\\k(.)\d+\D$/
556
+ emit(:backref, $1 == '<' ? :number_ref_ab : :number_ref_sq, text)
557
+ when /^\\k(.)-\d+\D$/
558
+ emit(:backref, $1 == '<' ? :number_rel_ref_ab : :number_rel_ref_sq, text)
559
+ when /^\\k(.)[^\p{digit}\-].*[+\-]\d+\D$/
560
+ emit(:backref, $1 == '<' ? :name_recursion_ref_ab : :name_recursion_ref_sq, text)
561
+ when /^\\k(.)-?\d+[+\-]\d+\D$/
562
+ emit(:backref, $1 == '<' ? :number_recursion_ref_ab : :number_recursion_ref_sq, text)
563
+ end
564
+ };
602
565
 
566
+ # Group call, named and numbered
567
+ # ------------------------------------------------------------------------
568
+ backslash . (group_name_call | group_number_call) > (backslashed, 4) {
569
+ case text = copy(data, ts, te)
570
+ when /^\\g(<>|'')/
571
+ validation_error(:backref, 'subexpression call', 'ref ID is empty')
572
+ when /^\\g(.)[^\p{digit}+\->][^+\-]*/
573
+ emit(:backref, $1 == '<' ? :name_call_ab : :name_call_sq, text)
574
+ when /^\\g(.)\d+\D$/
575
+ emit(:backref, $1 == '<' ? :number_call_ab : :number_call_sq, text)
576
+ when /^\\g(.)[+-]\d+/
577
+ emit(:backref, $1 == '<' ? :number_rel_call_ab : :number_rel_call_sq, text)
603
578
  end
604
579
  };
605
580
 
@@ -607,31 +582,31 @@
607
582
  # Quantifiers
608
583
  # ------------------------------------------------------------------------
609
584
  zero_or_one {
610
- case text = text(data, ts, te).first
611
- when '?' ; emit(:quantifier, :zero_or_one, text, ts, te)
612
- when '??'; emit(:quantifier, :zero_or_one_reluctant, text, ts, te)
613
- when '?+'; emit(:quantifier, :zero_or_one_possessive, text, ts, te)
585
+ case text = copy(data, ts, te)
586
+ when '?' ; emit(:quantifier, :zero_or_one, text)
587
+ when '??'; emit(:quantifier, :zero_or_one_reluctant, text)
588
+ when '?+'; emit(:quantifier, :zero_or_one_possessive, text)
614
589
  end
615
590
  };
616
591
 
617
592
  zero_or_more {
618
- case text = text(data, ts, te).first
619
- when '*' ; emit(:quantifier, :zero_or_more, text, ts, te)
620
- when '*?'; emit(:quantifier, :zero_or_more_reluctant, text, ts, te)
621
- when '*+'; emit(:quantifier, :zero_or_more_possessive, text, ts, te)
593
+ case text = copy(data, ts, te)
594
+ when '*' ; emit(:quantifier, :zero_or_more, text)
595
+ when '*?'; emit(:quantifier, :zero_or_more_reluctant, text)
596
+ when '*+'; emit(:quantifier, :zero_or_more_possessive, text)
622
597
  end
623
598
  };
624
599
 
625
600
  one_or_more {
626
- case text = text(data, ts, te).first
627
- when '+' ; emit(:quantifier, :one_or_more, text, ts, te)
628
- when '+?'; emit(:quantifier, :one_or_more_reluctant, text, ts, te)
629
- when '++'; emit(:quantifier, :one_or_more_possessive, text, ts, te)
601
+ case text = copy(data, ts, te)
602
+ when '+' ; emit(:quantifier, :one_or_more, text)
603
+ when '+?'; emit(:quantifier, :one_or_more_reluctant, text)
604
+ when '++'; emit(:quantifier, :one_or_more_possessive, text)
630
605
  end
631
606
  };
632
607
 
633
608
  quantifier_interval {
634
- emit(:quantifier, :interval, *text(data, ts, te))
609
+ emit(:quantifier, :interval, copy(data, ts, te))
635
610
  };
636
611
 
637
612
  # Catch unmatched curly braces as literals
@@ -647,7 +622,7 @@
647
622
 
648
623
  comment {
649
624
  if free_spacing
650
- emit(:free_space, :comment, *text(data, ts, te))
625
+ emit(:free_space, :comment, copy(data, ts, te))
651
626
  else
652
627
  # consume only the pound sign (#) and backtrack to do regular scanning
653
628
  append_literal(data, ts, ts + 1)
@@ -657,7 +632,7 @@
657
632
 
658
633
  space+ {
659
634
  if free_spacing
660
- emit(:free_space, :whitespace, *text(data, ts, te))
635
+ emit(:free_space, :whitespace, copy(data, ts, te))
661
636
  else
662
637
  append_literal(data, ts, te)
663
638
  end
@@ -666,11 +641,7 @@
666
641
  # Literal: any run of ASCII (printable or non-printable), and/or UTF-8,
667
642
  # except meta characters.
668
643
  # ------------------------------------------------------------------------
669
- (ascii_print -- space)+ |
670
- ascii_nonprint+ |
671
- utf8_2_byte+ |
672
- utf8_3_byte+ |
673
- utf8_4_byte+ {
644
+ (ascii_print -- space)+ | ascii_nonprint+ | utf8_multibyte+ {
674
645
  append_literal(data, ts, te)
675
646
  };
676
647
 
@@ -682,10 +653,10 @@
682
653
 
683
654
  class Regexp::Scanner
684
655
  # General scanner error (catch all)
685
- class ScannerError < StandardError; end
656
+ class ScannerError < Regexp::Parser::Error; end
686
657
 
687
658
  # Base for all scanner validation errors
688
- class ValidationError < StandardError
659
+ class ValidationError < Regexp::Parser::Error
689
660
  def initialize(reason)
690
661
  super reason
691
662
  end
@@ -760,6 +731,7 @@ class Regexp::Scanner
760
731
  self.set_depth = 0
761
732
  self.group_depth = 0
762
733
  self.conditional_stack = []
734
+ self.char_pos = 0
763
735
 
764
736
  %% write data;
765
737
  %% write init;
@@ -769,7 +741,7 @@ class Regexp::Scanner
769
741
  testEof = testEof
770
742
 
771
743
  if cs == re_scanner_error
772
- text = ts ? copy(data, ts-1..-1) : data.pack('c*')
744
+ text = copy(data, ts ? ts-1 : 0, -1)
773
745
  raise ScannerError.new("Scan error at '#{text}'")
774
746
  end
775
747
 
@@ -786,7 +758,7 @@ class Regexp::Scanner
786
758
 
787
759
  # lazy-load property maps when first needed
788
760
  require 'yaml'
789
- PROP_MAPS_DIR = File.expand_path('../scanner/properties', __FILE__)
761
+ PROP_MAPS_DIR = File.join(__dir__, 'scanner', 'properties')
790
762
 
791
763
  def self.short_prop_map
792
764
  @short_prop_map ||= YAML.load_file("#{PROP_MAPS_DIR}/short.yml")
@@ -797,22 +769,29 @@ class Regexp::Scanner
797
769
  end
798
770
 
799
771
  # Emits an array with the details of the scanned pattern
800
- def emit(type, token, text, ts, te)
772
+ def emit(type, token, text)
801
773
  #puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
802
774
 
803
775
  emit_literal if literal
804
776
 
777
+ # Ragel runs with byte-based indices (ts, te). These are of little value to
778
+ # end-users, so we keep track of char-based indices and emit those instead.
779
+ ts_char_pos = char_pos
780
+ te_char_pos = char_pos + text.length
781
+
805
782
  if block
806
- block.call type, token, text, ts, te
783
+ block.call type, token, text, ts_char_pos, te_char_pos
807
784
  end
808
785
 
809
- tokens << [type, token, text, ts, te]
786
+ tokens << [type, token, text, ts_char_pos, te_char_pos]
787
+
788
+ self.char_pos = te_char_pos
810
789
  end
811
790
 
812
791
  private
813
792
 
814
793
  attr_accessor :tokens, :literal, :block, :free_spacing, :spacing_stack,
815
- :group_depth, :set_depth, :conditional_stack
794
+ :group_depth, :set_depth, :conditional_stack, :char_pos
816
795
 
817
796
  def free_spacing?(input_object, options)
818
797
  if options && !input_object.is_a?(String)
@@ -835,36 +814,25 @@ class Regexp::Scanner
835
814
  end
836
815
 
837
816
  # Copy from ts to te from data as text
838
- def copy(data, range)
839
- data[range].pack('c*')
840
- end
841
-
842
- # Copy from ts to te from data as text, returning an array with the text
843
- # and the offsets used to copy it.
844
- def text(data, ts, te, soff = 0)
845
- [copy(data, ts-soff..te-1), ts-soff, te]
817
+ def copy(data, ts, te)
818
+ data[ts...te].pack('c*').force_encoding('utf-8')
846
819
  end
847
820
 
848
821
  # Appends one or more characters to the literal buffer, to be emitted later
849
- # by a call to emit_literal. Contents can be a mix of ASCII and UTF-8.
822
+ # by a call to emit_literal.
850
823
  def append_literal(data, ts, te)
851
824
  self.literal = literal || []
852
- literal << text(data, ts, te)
825
+ literal << copy(data, ts, te)
853
826
  end
854
827
 
855
- # Emits the literal run collected by calls to the append_literal method,
856
- # using the total start (ts) and end (te) offsets of the run.
828
+ # Emits the literal run collected by calls to the append_literal method.
857
829
  def emit_literal
858
- ts, te = literal.first[1], literal.last[2]
859
- text = literal.map {|t| t[0]}.join
860
-
861
- text.force_encoding('utf-8') if text.respond_to?(:force_encoding)
862
-
830
+ text = literal.join
863
831
  self.literal = nil
864
- emit(:literal, :literal, text, ts, te)
832
+ emit(:literal, :literal, text)
865
833
  end
866
834
 
867
- def emit_options(text, ts, te)
835
+ def emit_options(text)
868
836
  token = nil
869
837
 
870
838
  # Ruby allows things like '(?-xxxx)' or '(?xx-xx--xx-:abc)'.
@@ -890,14 +858,14 @@ class Regexp::Scanner
890
858
  token = :options_switch
891
859
  end
892
860
 
893
- emit(:group, token, text, ts, te)
861
+ emit(:group, token, text)
894
862
  end
895
863
 
896
864
  def emit_meta_control_sequence(data, ts, te, token)
897
865
  if data.last < 0x00 || data.last > 0x7F
898
866
  validation_error(:sequence, 'escape', token.to_s)
899
867
  end
900
- emit(:escape, token, *text(data, ts, te, 1))
868
+ emit(:escape, token, copy(data, ts-1, te))
901
869
  end
902
870
 
903
871
  # Centralizes and unifies the handling of validation related