regexp_parser 1.8.2 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +93 -0
  3. data/Gemfile +6 -1
  4. data/README.md +1 -4
  5. data/Rakefile +8 -8
  6. data/lib/regexp_parser.rb +1 -0
  7. data/lib/regexp_parser/error.rb +4 -0
  8. data/lib/regexp_parser/expression.rb +5 -18
  9. data/lib/regexp_parser/expression/classes/backref.rb +5 -0
  10. data/lib/regexp_parser/expression/classes/conditional.rb +11 -1
  11. data/lib/regexp_parser/expression/classes/free_space.rb +2 -2
  12. data/lib/regexp_parser/expression/classes/group.rb +28 -3
  13. data/lib/regexp_parser/expression/classes/property.rb +1 -1
  14. data/lib/regexp_parser/expression/classes/root.rb +4 -16
  15. data/lib/regexp_parser/expression/classes/set/range.rb +2 -1
  16. data/lib/regexp_parser/expression/methods/match_length.rb +2 -2
  17. data/lib/regexp_parser/expression/methods/traverse.rb +2 -2
  18. data/lib/regexp_parser/expression/quantifier.rb +10 -1
  19. data/lib/regexp_parser/expression/sequence.rb +3 -19
  20. data/lib/regexp_parser/expression/subexpression.rb +1 -1
  21. data/lib/regexp_parser/lexer.rb +2 -2
  22. data/lib/regexp_parser/parser.rb +306 -332
  23. data/lib/regexp_parser/scanner.rb +1272 -1338
  24. data/lib/regexp_parser/scanner/char_type.rl +11 -11
  25. data/lib/regexp_parser/scanner/property.rl +2 -2
  26. data/lib/regexp_parser/scanner/scanner.rl +206 -238
  27. data/lib/regexp_parser/syntax.rb +7 -7
  28. data/lib/regexp_parser/syntax/any.rb +3 -3
  29. data/lib/regexp_parser/syntax/base.rb +1 -1
  30. data/lib/regexp_parser/syntax/version_lookup.rb +2 -2
  31. data/lib/regexp_parser/syntax/versions.rb +1 -1
  32. data/lib/regexp_parser/version.rb +1 -1
  33. data/spec/expression/base_spec.rb +10 -0
  34. data/spec/expression/clone_spec.rb +36 -4
  35. data/spec/expression/free_space_spec.rb +2 -2
  36. data/spec/expression/methods/match_length_spec.rb +2 -2
  37. data/spec/expression/subexpression_spec.rb +1 -1
  38. data/spec/expression/to_s_spec.rb +39 -31
  39. data/spec/lexer/literals_spec.rb +24 -49
  40. data/spec/lexer/refcalls_spec.rb +5 -0
  41. data/spec/parser/all_spec.rb +2 -2
  42. data/spec/parser/errors_spec.rb +1 -1
  43. data/spec/parser/escapes_spec.rb +1 -1
  44. data/spec/parser/quantifiers_spec.rb +16 -0
  45. data/spec/parser/refcalls_spec.rb +5 -0
  46. data/spec/parser/set/ranges_spec.rb +3 -3
  47. data/spec/scanner/escapes_spec.rb +8 -1
  48. data/spec/scanner/groups_spec.rb +10 -1
  49. data/spec/scanner/literals_spec.rb +28 -38
  50. data/spec/scanner/quantifiers_spec.rb +18 -13
  51. data/spec/scanner/refcalls_spec.rb +19 -0
  52. data/spec/scanner/sets_spec.rb +65 -16
  53. data/spec/spec_helper.rb +1 -0
  54. metadata +4 -7
  55. data/spec/expression/root_spec.rb +0 -9
  56. data/spec/expression/sequence_spec.rb +0 -9
@@ -10,17 +10,17 @@
10
10
  # --------------------------------------------------------------------------
11
11
  char_type := |*
12
12
  char_type_char {
13
- case text = text(data, ts, te, 1).first
14
- when '\d'; emit(:type, :digit, text, ts - 1, te)
15
- when '\D'; emit(:type, :nondigit, text, ts - 1, te)
16
- when '\h'; emit(:type, :hex, text, ts - 1, te)
17
- when '\H'; emit(:type, :nonhex, text, ts - 1, te)
18
- when '\s'; emit(:type, :space, text, ts - 1, te)
19
- when '\S'; emit(:type, :nonspace, text, ts - 1, te)
20
- when '\w'; emit(:type, :word, text, ts - 1, te)
21
- when '\W'; emit(:type, :nonword, text, ts - 1, te)
22
- when '\R'; emit(:type, :linebreak, text, ts - 1, te)
23
- when '\X'; emit(:type, :xgrapheme, text, ts - 1, te)
13
+ case text = copy(data, ts-1, te)
14
+ when '\d'; emit(:type, :digit, text)
15
+ when '\D'; emit(:type, :nondigit, text)
16
+ when '\h'; emit(:type, :hex, text)
17
+ when '\H'; emit(:type, :nonhex, text)
18
+ when '\s'; emit(:type, :space, text)
19
+ when '\S'; emit(:type, :nonspace, text)
20
+ when '\w'; emit(:type, :word, text)
21
+ when '\W'; emit(:type, :nonword, text)
22
+ when '\R'; emit(:type, :linebreak, text)
23
+ when '\X'; emit(:type, :xgrapheme, text)
24
24
  end
25
25
  fret;
26
26
  };
@@ -14,7 +14,7 @@
14
14
  unicode_property := |*
15
15
 
16
16
  property_sequence < eof(premature_property_end) {
17
- text = text(data, ts, te, 1).first
17
+ text = copy(data, ts-1, te)
18
18
  type = (text[1] == 'P') ^ (text[3] == '^') ? :nonproperty : :property
19
19
 
20
20
  name = data[ts+2..te-2].pack('c*').gsub(/[\^\s_\-]/, '').downcase
@@ -22,7 +22,7 @@
22
22
  token = self.class.short_prop_map[name] || self.class.long_prop_map[name]
23
23
  raise UnknownUnicodePropertyError.new(name) unless token
24
24
 
25
- self.emit(type, token.to_sym, text, ts-1, te)
25
+ self.emit(type, token.to_sym, text)
26
26
 
27
27
  fret;
28
28
  };
@@ -3,6 +3,11 @@
3
3
  include re_char_type "char_type.rl";
4
4
  include re_property "property.rl";
5
5
 
6
+ utf8_2_byte = (0xc2..0xdf 0x80..0xbf);
7
+ utf8_3_byte = (0xe0..0xef 0x80..0xbf 0x80..0xbf);
8
+ utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
9
+ utf8_multibyte = utf8_2_byte | utf8_3_byte | utf8_4_byte;
10
+
6
11
  dot = '.';
7
12
  backslash = '\\';
8
13
  alternation = '|';
@@ -15,7 +20,7 @@
15
20
 
16
21
  group_open = '(';
17
22
  group_close = ')';
18
- parantheses = group_open | group_close;
23
+ parentheses = group_open | group_close;
19
24
 
20
25
  set_open = '[';
21
26
  set_close = ']';
@@ -32,7 +37,7 @@
32
37
  class_posix = ('[:' . '^'? . class_name_posix . ':]');
33
38
 
34
39
 
35
- # these are not supported in ruby, and need verification
40
+ # these are not supported in ruby at the moment
36
41
  collating_sequence = '[.' . (alpha | [\-])+ . '.]';
37
42
  character_equivalent = '[=' . alpha . '=]';
38
43
 
@@ -53,6 +58,8 @@
53
58
 
54
59
  meta_sequence = 'M-' . (backslash . ('c' | 'C-'))? . backslash? . any;
55
60
 
61
+ sequence_char = [CMcux];
62
+
56
63
  zero_or_one = '?' | '??' | '?+';
57
64
  zero_or_more = '*' | '*?' | '*+';
58
65
  one_or_more = '+' | '+?' | '++';
@@ -90,21 +97,26 @@
90
97
  group_options = '?' . ( [^!#'():<=>~]+ . ':'? ) ?;
91
98
 
92
99
  group_ref = [gk];
93
- group_name_char = (alnum | '_');
94
- group_name_id = (group_name_char . (group_name_char+)?)?;
95
- group_number = '-'? . [1-9] . ([0-9]+)?;
100
+ group_name_id_ab = ([^0-9\->] | utf8_multibyte) . ([^>] | utf8_multibyte)*;
101
+ group_name_id_sq = ([^0-9\-'] | utf8_multibyte) . ([^'] | utf8_multibyte)*;
102
+ group_number = '-'? . [1-9] . [0-9]*;
96
103
  group_level = [+\-] . [0-9]+;
97
104
 
98
- group_name = ('<' . group_name_id . '>') | ("'" . group_name_id . "'");
105
+ group_name = ('<' . group_name_id_ab? . '>') |
106
+ ("'" . group_name_id_sq? . "'");
99
107
  group_lookup = group_name | group_number;
100
108
 
101
109
  group_named = ('?' . group_name );
102
110
 
103
- group_name_ref = group_ref . (('<' . group_name_id . group_level? '>') |
104
- ("'" . group_name_id . group_level? "'"));
111
+ group_name_backref = 'k' . (('<' . group_name_id_ab? . group_level? '>') |
112
+ ("'" . group_name_id_sq? . group_level? "'"));
113
+ group_name_call = 'g' . (('<' . group_name_id_ab? . group_level? '>') |
114
+ ("'" . group_name_id_sq? . group_level? "'"));
105
115
 
106
- group_number_ref = group_ref . (('<' . group_number . group_level? '>') |
107
- ("'" . group_number . group_level? "'"));
116
+ group_number_backref = 'k' . (('<' . group_number . group_level? '>') |
117
+ ("'" . group_number . group_level? "'"));
118
+ group_number_call = 'g' . (('<' . ((group_number . group_level?) | '0') '>') |
119
+ ("'" . ((group_number . group_level?) | '0') "'"));
108
120
 
109
121
  group_type = group_atomic | group_passive | group_absence | group_named;
110
122
 
@@ -115,7 +127,7 @@
115
127
 
116
128
  # characters that 'break' a literal
117
129
  meta_char = dot | backslash | alternation |
118
- curlies | parantheses | brackets |
130
+ curlies | parentheses | brackets |
119
131
  line_anchor | quantifier_greedy;
120
132
 
121
133
  literal_delimiters = ']' | '}';
@@ -123,25 +135,23 @@
123
135
  ascii_print = ((0x20..0x7e) - meta_char - '#');
124
136
  ascii_nonprint = (0x01..0x1f | 0x7f);
125
137
 
126
- utf8_2_byte = (0xc2..0xdf 0x80..0xbf);
127
- utf8_3_byte = (0xe0..0xef 0x80..0xbf 0x80..0xbf);
128
- utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
129
-
130
138
  non_literal_escape = char_type_char | anchor_char | escaped_ascii |
131
- keep_mark | [xucCM];
139
+ keep_mark | sequence_char;
140
+
141
+ # escapes that also work within a character set
142
+ set_escape = backslash | brackets | escaped_ascii | property_char |
143
+ sequence_char | single_codepoint_char_type;
132
144
 
133
- non_set_escape = (anchor_char - 'b') | group_ref | keep_mark |
134
- multi_codepoint_char_type | [0-9cCM];
135
145
 
136
146
  # EOF error, used where it can be detected
137
147
  action premature_end_error {
138
- text = ts ? copy(data, ts-1..-1) : data.pack('c*')
148
+ text = copy(data, ts ? ts-1 : 0, -1)
139
149
  raise PrematureEndError.new( text )
140
150
  }
141
151
 
142
152
  # Invalid sequence error, used from sequences, like escapes and sets
143
153
  action invalid_sequence_error {
144
- text = ts ? copy(data, ts-1..-1) : data.pack('c*')
154
+ text = copy(data, ts ? ts-1 : 0, -1)
145
155
  validation_error(:sequence, 'sequence', text)
146
156
  }
147
157
 
@@ -156,7 +166,7 @@
156
166
  # --------------------------------------------------------------------------
157
167
  character_set := |*
158
168
  set_close > (set_meta, 2) @set_closed {
159
- emit(:set, :close, *text(data, ts, te))
169
+ emit(:set, :close, copy(data, ts, te))
160
170
  if in_set?
161
171
  fret;
162
172
  else
@@ -165,8 +175,8 @@
165
175
  };
166
176
 
167
177
  '-]' @set_closed { # special case, emits two tokens
168
- emit(:literal, :literal, copy(data, ts..te-2), ts, te - 1)
169
- emit(:set, :close, copy(data, ts+1..te-1), ts + 1, te)
178
+ emit(:literal, :literal, copy(data, ts, te-1))
179
+ emit(:set, :close, copy(data, ts+1, te))
170
180
  if in_set?
171
181
  fret;
172
182
  else
@@ -175,33 +185,33 @@
175
185
  };
176
186
 
177
187
  '-&&' { # special case, emits two tokens
178
- emit(:literal, :literal, '-', ts, te)
179
- emit(:set, :intersection, '&&', ts, te)
188
+ emit(:literal, :literal, '-')
189
+ emit(:set, :intersection, '&&')
180
190
  };
181
191
 
182
192
  '^' {
183
- text = text(data, ts, te).first
193
+ text = copy(data, ts, te)
184
194
  if tokens.last[1] == :open
185
- emit(:set, :negate, text, ts, te)
195
+ emit(:set, :negate, text)
186
196
  else
187
- emit(:literal, :literal, text, ts, te)
197
+ emit(:literal, :literal, text)
188
198
  end
189
199
  };
190
200
 
191
201
  '-' {
192
- text = text(data, ts, te).first
202
+ text = copy(data, ts, te)
193
203
  # ranges can't start with a subset or intersection/negation/range operator
194
204
  if tokens.last[0] == :set
195
- emit(:literal, :literal, text, ts, te)
205
+ emit(:literal, :literal, text)
196
206
  else
197
- emit(:set, :range, text, ts, te)
207
+ emit(:set, :range, text)
198
208
  end
199
209
  };
200
210
 
201
211
  # Unlike ranges, intersections can start or end at set boundaries, whereupon
202
212
  # they match nothing: r = /[a&&]/; [r =~ ?a, r =~ ?&] # => [nil, nil]
203
213
  '&&' {
204
- emit(:set, :intersection, *text(data, ts, te))
214
+ emit(:set, :intersection, copy(data, ts, te))
205
215
  };
206
216
 
207
217
  backslash {
@@ -209,12 +219,12 @@
209
219
  };
210
220
 
211
221
  set_open >(open_bracket, 1) >set_opened {
212
- emit(:set, :open, *text(data, ts, te))
222
+ emit(:set, :open, copy(data, ts, te))
213
223
  fcall character_set;
214
224
  };
215
225
 
216
226
  class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error) {
217
- text = text(data, ts, te).first
227
+ text = copy(data, ts, te)
218
228
 
219
229
  type = :posixclass
220
230
  class_name = text[2..-3]
@@ -223,45 +233,40 @@
223
233
  type = :nonposixclass
224
234
  end
225
235
 
226
- emit(type, class_name.to_sym, text, ts, te)
236
+ emit(type, class_name.to_sym, text)
227
237
  };
228
238
 
229
- collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
230
- emit(:set, :collation, *text(data, ts, te))
231
- };
232
-
233
- character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
234
- emit(:set, :equivalent, *text(data, ts, te))
235
- };
239
+ # These are not supported in ruby at the moment. Enable them if they are.
240
+ # collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
241
+ # emit(:set, :collation, copy(data, ts, te))
242
+ # };
243
+ # character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
244
+ # emit(:set, :equivalent, copy(data, ts, te))
245
+ # };
236
246
 
237
247
  meta_char > (set_meta, 1) {
238
- emit(:literal, :literal, *text(data, ts, te))
248
+ emit(:literal, :literal, copy(data, ts, te))
239
249
  };
240
250
 
241
- any |
242
- ascii_nonprint |
243
- utf8_2_byte |
244
- utf8_3_byte |
245
- utf8_4_byte {
246
- char, *rest = *text(data, ts, te)
247
- char.force_encoding('utf-8') if char.respond_to?(:force_encoding)
248
- emit(:literal, :literal, char, *rest)
251
+ any | ascii_nonprint | utf8_multibyte {
252
+ text = copy(data, ts, te)
253
+ emit(:literal, :literal, text)
249
254
  };
250
255
  *|;
251
256
 
252
257
  # set escapes scanner
253
258
  # --------------------------------------------------------------------------
254
259
  set_escape_sequence := |*
255
- non_set_escape > (escaped_set_alpha, 2) {
256
- emit(:escape, :literal, *text(data, ts, te, 1))
257
- fret;
258
- };
259
-
260
- any > (escaped_set_alpha, 1) {
260
+ set_escape > (escaped_set_alpha, 2) {
261
261
  fhold;
262
262
  fnext character_set;
263
263
  fcall escape_sequence;
264
264
  };
265
+
266
+ any > (escaped_set_alpha, 1) {
267
+ emit(:escape, :literal, copy(data, ts-1, te))
268
+ fret;
269
+ };
265
270
  *|;
266
271
 
267
272
 
@@ -269,33 +274,33 @@
269
274
  # --------------------------------------------------------------------------
270
275
  escape_sequence := |*
271
276
  [1-9] {
272
- text = text(data, ts, te, 1).first
273
- emit(:backref, :number, text, ts-1, te)
277
+ text = copy(data, ts-1, te)
278
+ emit(:backref, :number, text)
274
279
  fret;
275
280
  };
276
281
 
277
282
  octal_sequence {
278
- emit(:escape, :octal, *text(data, ts, te, 1))
283
+ emit(:escape, :octal, copy(data, ts-1, te))
279
284
  fret;
280
285
  };
281
286
 
282
287
  meta_char {
283
- case text = text(data, ts, te, 1).first
284
- when '\.'; emit(:escape, :dot, text, ts-1, te)
285
- when '\|'; emit(:escape, :alternation, text, ts-1, te)
286
- when '\^'; emit(:escape, :bol, text, ts-1, te)
287
- when '\$'; emit(:escape, :eol, text, ts-1, te)
288
- when '\?'; emit(:escape, :zero_or_one, text, ts-1, te)
289
- when '\*'; emit(:escape, :zero_or_more, text, ts-1, te)
290
- when '\+'; emit(:escape, :one_or_more, text, ts-1, te)
291
- when '\('; emit(:escape, :group_open, text, ts-1, te)
292
- when '\)'; emit(:escape, :group_close, text, ts-1, te)
293
- when '\{'; emit(:escape, :interval_open, text, ts-1, te)
294
- when '\}'; emit(:escape, :interval_close, text, ts-1, te)
295
- when '\['; emit(:escape, :set_open, text, ts-1, te)
296
- when '\]'; emit(:escape, :set_close, text, ts-1, te)
288
+ case text = copy(data, ts-1, te)
289
+ when '\.'; emit(:escape, :dot, text)
290
+ when '\|'; emit(:escape, :alternation, text)
291
+ when '\^'; emit(:escape, :bol, text)
292
+ when '\$'; emit(:escape, :eol, text)
293
+ when '\?'; emit(:escape, :zero_or_one, text)
294
+ when '\*'; emit(:escape, :zero_or_more, text)
295
+ when '\+'; emit(:escape, :one_or_more, text)
296
+ when '\('; emit(:escape, :group_open, text)
297
+ when '\)'; emit(:escape, :group_close, text)
298
+ when '\{'; emit(:escape, :interval_open, text)
299
+ when '\}'; emit(:escape, :interval_close, text)
300
+ when '\['; emit(:escape, :set_open, text)
301
+ when '\]'; emit(:escape, :set_close, text)
297
302
  when "\\\\";
298
- emit(:escape, :backslash, text, ts-1, te)
303
+ emit(:escape, :backslash, text)
299
304
  end
300
305
  fret;
301
306
  };
@@ -303,31 +308,31 @@
303
308
  escaped_ascii > (escaped_alpha, 7) {
304
309
  # \b is emitted as backspace only when inside a character set, otherwise
305
310
  # it is a word boundary anchor. A syntax might "normalize" it if needed.
306
- case text = text(data, ts, te, 1).first
307
- when '\a'; emit(:escape, :bell, text, ts-1, te)
308
- when '\b'; emit(:escape, :backspace, text, ts-1, te)
309
- when '\e'; emit(:escape, :escape, text, ts-1, te)
310
- when '\f'; emit(:escape, :form_feed, text, ts-1, te)
311
- when '\n'; emit(:escape, :newline, text, ts-1, te)
312
- when '\r'; emit(:escape, :carriage, text, ts-1, te)
313
- when '\t'; emit(:escape, :tab, text, ts-1, te)
314
- when '\v'; emit(:escape, :vertical_tab, text, ts-1, te)
311
+ case text = copy(data, ts-1, te)
312
+ when '\a'; emit(:escape, :bell, text)
313
+ when '\b'; emit(:escape, :backspace, text)
314
+ when '\e'; emit(:escape, :escape, text)
315
+ when '\f'; emit(:escape, :form_feed, text)
316
+ when '\n'; emit(:escape, :newline, text)
317
+ when '\r'; emit(:escape, :carriage, text)
318
+ when '\t'; emit(:escape, :tab, text)
319
+ when '\v'; emit(:escape, :vertical_tab, text)
315
320
  end
316
321
  fret;
317
322
  };
318
323
 
319
324
  codepoint_sequence > (escaped_alpha, 6) $eof(premature_end_error) {
320
- text = text(data, ts, te, 1).first
325
+ text = copy(data, ts-1, te)
321
326
  if text[2].chr == '{'
322
- emit(:escape, :codepoint_list, text, ts-1, te)
327
+ emit(:escape, :codepoint_list, text)
323
328
  else
324
- emit(:escape, :codepoint, text, ts-1, te)
329
+ emit(:escape, :codepoint, text)
325
330
  end
326
331
  fret;
327
332
  };
328
333
 
329
- hex_sequence > (escaped_alpha, 5) $eof(premature_end_error) {
330
- emit(:escape, :hex, *text(data, ts, te, 1))
334
+ hex_sequence > (escaped_alpha, 5) @eof(premature_end_error) {
335
+ emit(:escape, :hex, copy(data, ts-1, te))
331
336
  fret;
332
337
  };
333
338
 
@@ -357,8 +362,8 @@
357
362
  fcall unicode_property;
358
363
  };
359
364
 
360
- (any -- non_literal_escape) > (escaped_alpha, 1) {
361
- emit(:escape, :literal, *text(data, ts, te, 1))
365
+ (any -- non_literal_escape) | utf8_multibyte > (escaped_alpha, 1) {
366
+ emit(:escape, :literal, copy(data, ts-1, te))
362
367
  fret;
363
368
  };
364
369
  *|;
@@ -368,9 +373,9 @@
368
373
  # --------------------------------------------------------------------------
369
374
  conditional_expression := |*
370
375
  group_lookup . ')' {
371
- text = text(data, ts, te-1).first
372
- emit(:conditional, :condition, text, ts, te-1)
373
- emit(:conditional, :condition_close, ')', te-1, te)
376
+ text = copy(data, ts, te-1)
377
+ emit(:conditional, :condition, text)
378
+ emit(:conditional, :condition_close, ')')
374
379
  };
375
380
 
376
381
  any {
@@ -387,39 +392,39 @@
387
392
  # Meta characters
388
393
  # ------------------------------------------------------------------------
389
394
  dot {
390
- emit(:meta, :dot, *text(data, ts, te))
395
+ emit(:meta, :dot, copy(data, ts, te))
391
396
  };
392
397
 
393
398
  alternation {
394
399
  if conditional_stack.last == group_depth
395
- emit(:conditional, :separator, *text(data, ts, te))
400
+ emit(:conditional, :separator, copy(data, ts, te))
396
401
  else
397
- emit(:meta, :alternation, *text(data, ts, te))
402
+ emit(:meta, :alternation, copy(data, ts, te))
398
403
  end
399
404
  };
400
405
 
401
406
  # Anchors
402
407
  # ------------------------------------------------------------------------
403
408
  beginning_of_line {
404
- emit(:anchor, :bol, *text(data, ts, te))
409
+ emit(:anchor, :bol, copy(data, ts, te))
405
410
  };
406
411
 
407
412
  end_of_line {
408
- emit(:anchor, :eol, *text(data, ts, te))
413
+ emit(:anchor, :eol, copy(data, ts, te))
409
414
  };
410
415
 
411
416
  backslash . keep_mark > (backslashed, 4) {
412
- emit(:keep, :mark, *text(data, ts, te))
417
+ emit(:keep, :mark, copy(data, ts, te))
413
418
  };
414
419
 
415
420
  backslash . anchor_char > (backslashed, 3) {
416
- case text = text(data, ts, te).first
417
- when '\\A'; emit(:anchor, :bos, text, ts, te)
418
- when '\\z'; emit(:anchor, :eos, text, ts, te)
419
- when '\\Z'; emit(:anchor, :eos_ob_eol, text, ts, te)
420
- when '\\b'; emit(:anchor, :word_boundary, text, ts, te)
421
- when '\\B'; emit(:anchor, :nonword_boundary, text, ts, te)
422
- when '\\G'; emit(:anchor, :match_start, text, ts, te)
421
+ case text = copy(data, ts, te)
422
+ when '\\A'; emit(:anchor, :bos, text)
423
+ when '\\z'; emit(:anchor, :eos, text)
424
+ when '\\Z'; emit(:anchor, :eos_ob_eol, text)
425
+ when '\\b'; emit(:anchor, :word_boundary, text)
426
+ when '\\B'; emit(:anchor, :nonword_boundary, text)
427
+ when '\\G'; emit(:anchor, :match_start, text)
423
428
  end
424
429
  };
425
430
 
@@ -430,7 +435,7 @@
430
435
  # Character sets
431
436
  # ------------------------------------------------------------------------
432
437
  set_open >set_opened {
433
- emit(:set, :open, *text(data, ts, te))
438
+ emit(:set, :open, copy(data, ts, te))
434
439
  fcall character_set;
435
440
  };
436
441
 
@@ -439,12 +444,12 @@
439
444
  # (?(condition)Y|N) conditional expression
440
445
  # ------------------------------------------------------------------------
441
446
  conditional {
442
- text = text(data, ts, te).first
447
+ text = copy(data, ts, te)
443
448
 
444
449
  conditional_stack << group_depth
445
450
 
446
- emit(:conditional, :open, text[0..-2], ts, te-1)
447
- emit(:conditional, :condition_open, '(', te-1, te)
451
+ emit(:conditional, :open, text[0..-2])
452
+ emit(:conditional, :condition_open, '(')
448
453
  fcall conditional_expression;
449
454
  };
450
455
 
@@ -455,7 +460,7 @@
455
460
  # correct closing count.
456
461
  # ------------------------------------------------------------------------
457
462
  group_open . group_comment $group_closed {
458
- emit(:group, :comment, *text(data, ts, te))
463
+ emit(:group, :comment, copy(data, ts, te))
459
464
  };
460
465
 
461
466
  # Expression options:
@@ -470,11 +475,11 @@
470
475
  # (?imxdau-imx:subexp) option on/off for subexp
471
476
  # ------------------------------------------------------------------------
472
477
  group_open . group_options >group_opened {
473
- text = text(data, ts, te).first
478
+ text = copy(data, ts, te)
474
479
  if text[2..-1] =~ /([^\-mixdau:]|^$)|-.*([dau])/
475
480
  raise InvalidGroupOption.new($1 || "-#{$2}", text)
476
481
  end
477
- emit_options(text, ts, te)
482
+ emit_options(text)
478
483
  };
479
484
 
480
485
  # Assertions
@@ -484,11 +489,11 @@
484
489
  # (?<!subexp) negative look-behind
485
490
  # ------------------------------------------------------------------------
486
491
  group_open . assertion_type >group_opened {
487
- case text = text(data, ts, te).first
488
- when '(?='; emit(:assertion, :lookahead, text, ts, te)
489
- when '(?!'; emit(:assertion, :nlookahead, text, ts, te)
490
- when '(?<='; emit(:assertion, :lookbehind, text, ts, te)
491
- when '(?<!'; emit(:assertion, :nlookbehind, text, ts, te)
492
+ case text = copy(data, ts, te)
493
+ when '(?='; emit(:assertion, :lookahead, text)
494
+ when '(?!'; emit(:assertion, :nlookahead, text)
495
+ when '(?<='; emit(:assertion, :lookbehind, text)
496
+ when '(?<!'; emit(:assertion, :nlookbehind, text)
492
497
  end
493
498
  };
494
499
 
@@ -501,32 +506,32 @@
501
506
  # (subexp) captured group
502
507
  # ------------------------------------------------------------------------
503
508
  group_open . group_type >group_opened {
504
- case text = text(data, ts, te).first
505
- when '(?:'; emit(:group, :passive, text, ts, te)
506
- when '(?>'; emit(:group, :atomic, text, ts, te)
507
- when '(?~'; emit(:group, :absence, text, ts, te)
509
+ case text = copy(data, ts, te)
510
+ when '(?:'; emit(:group, :passive, text)
511
+ when '(?>'; emit(:group, :atomic, text)
512
+ when '(?~'; emit(:group, :absence, text)
508
513
 
509
514
  when /^\(\?(?:<>|'')/
510
515
  validation_error(:group, 'named group', 'name is empty')
511
516
 
512
- when /^\(\?<\w*>/
513
- emit(:group, :named_ab, text, ts, te)
517
+ when /^\(\?<[^>]+>/
518
+ emit(:group, :named_ab, text)
514
519
 
515
- when /^\(\?'\w*'/
516
- emit(:group, :named_sq, text, ts, te)
520
+ when /^\(\?'[^']+'/
521
+ emit(:group, :named_sq, text)
517
522
 
518
523
  end
519
524
  };
520
525
 
521
526
  group_open @group_opened {
522
- text = text(data, ts, te).first
523
- emit(:group, :capture, text, ts, te)
527
+ text = copy(data, ts, te)
528
+ emit(:group, :capture, text)
524
529
  };
525
530
 
526
531
  group_close @group_closed {
527
532
  if conditional_stack.last == group_depth + 1
528
533
  conditional_stack.pop
529
- emit(:conditional, :close, *text(data, ts, te))
534
+ emit(:conditional, :close, copy(data, ts, te))
530
535
  else
531
536
  if spacing_stack.length > 1 &&
532
537
  spacing_stack.last[:depth] == group_depth + 1
@@ -534,72 +539,42 @@
534
539
  self.free_spacing = spacing_stack.last[:free_spacing]
535
540
  end
536
541
 
537
- emit(:group, :close, *text(data, ts, te))
542
+ emit(:group, :close, copy(data, ts, te))
538
543
  end
539
544
  };
540
545
 
541
546
 
542
547
  # Group backreference, named and numbered
543
548
  # ------------------------------------------------------------------------
544
- backslash . (group_name_ref | group_number_ref) > (backslashed, 4) {
545
- case text = text(data, ts, te).first
546
- when /^\\([gk])(<>|'')/ # angle brackets
547
- validation_error(:backref, 'ref/call', 'ref ID is empty')
548
-
549
- when /^\\([gk])<[^\d+-]\w*>/ # angle-brackets
550
- if $1 == 'k'
551
- emit(:backref, :name_ref_ab, text, ts, te)
552
- else
553
- emit(:backref, :name_call_ab, text, ts, te)
554
- end
555
-
556
- when /^\\([gk])'[^\d+-]\w*'/ #single quotes
557
- if $1 == 'k'
558
- emit(:backref, :name_ref_sq, text, ts, te)
559
- else
560
- emit(:backref, :name_call_sq, text, ts, te)
561
- end
562
-
563
- when /^\\([gk])<\d+>/ # angle-brackets
564
- if $1 == 'k'
565
- emit(:backref, :number_ref_ab, text, ts, te)
566
- else
567
- emit(:backref, :number_call_ab, text, ts, te)
568
- end
569
-
570
- when /^\\([gk])'\d+'/ # single quotes
571
- if $1 == 'k'
572
- emit(:backref, :number_ref_sq, text, ts, te)
573
- else
574
- emit(:backref, :number_call_sq, text, ts, te)
575
- end
576
-
577
- when /^\\(?:g<\+|g<-|(k)<-)\d+>/ # angle-brackets
578
- if $1 == 'k'
579
- emit(:backref, :number_rel_ref_ab, text, ts, te)
580
- else
581
- emit(:backref, :number_rel_call_ab, text, ts, te)
582
- end
583
-
584
- when /^\\(?:g'\+|g'-|(k)'-)\d+'/ # single quotes
585
- if $1 == 'k'
586
- emit(:backref, :number_rel_ref_sq, text, ts, te)
587
- else
588
- emit(:backref, :number_rel_call_sq, text, ts, te)
589
- end
590
-
591
- when /^\\k<[^\d+\-]\w*[+\-]\d+>/ # angle-brackets
592
- emit(:backref, :name_recursion_ref_ab, text, ts, te)
593
-
594
- when /^\\k'[^\d+\-]\w*[+\-]\d+'/ # single-quotes
595
- emit(:backref, :name_recursion_ref_sq, text, ts, te)
596
-
597
- when /^\\([gk])<[+\-]?\d+[+\-]\d+>/ # angle-brackets
598
- emit(:backref, :number_recursion_ref_ab, text, ts, te)
599
-
600
- when /^\\([gk])'[+\-]?\d+[+\-]\d+'/ # single-quotes
601
- emit(:backref, :number_recursion_ref_sq, text, ts, te)
549
+ backslash . (group_name_backref | group_number_backref) > (backslashed, 4) {
550
+ case text = copy(data, ts, te)
551
+ when /^\\k(<>|'')/
552
+ validation_error(:backref, 'backreference', 'ref ID is empty')
553
+ when /^\\k(.)[^\p{digit}\-][^+\-]*\D$/
554
+ emit(:backref, $1 == '<' ? :name_ref_ab : :name_ref_sq, text)
555
+ when /^\\k(.)\d+\D$/
556
+ emit(:backref, $1 == '<' ? :number_ref_ab : :number_ref_sq, text)
557
+ when /^\\k(.)-\d+\D$/
558
+ emit(:backref, $1 == '<' ? :number_rel_ref_ab : :number_rel_ref_sq, text)
559
+ when /^\\k(.)[^\p{digit}\-].*[+\-]\d+\D$/
560
+ emit(:backref, $1 == '<' ? :name_recursion_ref_ab : :name_recursion_ref_sq, text)
561
+ when /^\\k(.)-?\d+[+\-]\d+\D$/
562
+ emit(:backref, $1 == '<' ? :number_recursion_ref_ab : :number_recursion_ref_sq, text)
563
+ end
564
+ };
602
565
 
566
+ # Group call, named and numbered
567
+ # ------------------------------------------------------------------------
568
+ backslash . (group_name_call | group_number_call) > (backslashed, 4) {
569
+ case text = copy(data, ts, te)
570
+ when /^\\g(<>|'')/
571
+ validation_error(:backref, 'subexpression call', 'ref ID is empty')
572
+ when /^\\g(.)[^\p{digit}+\->][^+\-]*/
573
+ emit(:backref, $1 == '<' ? :name_call_ab : :name_call_sq, text)
574
+ when /^\\g(.)\d+\D$/
575
+ emit(:backref, $1 == '<' ? :number_call_ab : :number_call_sq, text)
576
+ when /^\\g(.)[+-]\d+/
577
+ emit(:backref, $1 == '<' ? :number_rel_call_ab : :number_rel_call_sq, text)
603
578
  end
604
579
  };
605
580
 
@@ -607,31 +582,31 @@
607
582
  # Quantifiers
608
583
  # ------------------------------------------------------------------------
609
584
  zero_or_one {
610
- case text = text(data, ts, te).first
611
- when '?' ; emit(:quantifier, :zero_or_one, text, ts, te)
612
- when '??'; emit(:quantifier, :zero_or_one_reluctant, text, ts, te)
613
- when '?+'; emit(:quantifier, :zero_or_one_possessive, text, ts, te)
585
+ case text = copy(data, ts, te)
586
+ when '?' ; emit(:quantifier, :zero_or_one, text)
587
+ when '??'; emit(:quantifier, :zero_or_one_reluctant, text)
588
+ when '?+'; emit(:quantifier, :zero_or_one_possessive, text)
614
589
  end
615
590
  };
616
591
 
617
592
  zero_or_more {
618
- case text = text(data, ts, te).first
619
- when '*' ; emit(:quantifier, :zero_or_more, text, ts, te)
620
- when '*?'; emit(:quantifier, :zero_or_more_reluctant, text, ts, te)
621
- when '*+'; emit(:quantifier, :zero_or_more_possessive, text, ts, te)
593
+ case text = copy(data, ts, te)
594
+ when '*' ; emit(:quantifier, :zero_or_more, text)
595
+ when '*?'; emit(:quantifier, :zero_or_more_reluctant, text)
596
+ when '*+'; emit(:quantifier, :zero_or_more_possessive, text)
622
597
  end
623
598
  };
624
599
 
625
600
  one_or_more {
626
- case text = text(data, ts, te).first
627
- when '+' ; emit(:quantifier, :one_or_more, text, ts, te)
628
- when '+?'; emit(:quantifier, :one_or_more_reluctant, text, ts, te)
629
- when '++'; emit(:quantifier, :one_or_more_possessive, text, ts, te)
601
+ case text = copy(data, ts, te)
602
+ when '+' ; emit(:quantifier, :one_or_more, text)
603
+ when '+?'; emit(:quantifier, :one_or_more_reluctant, text)
604
+ when '++'; emit(:quantifier, :one_or_more_possessive, text)
630
605
  end
631
606
  };
632
607
 
633
608
  quantifier_interval {
634
- emit(:quantifier, :interval, *text(data, ts, te))
609
+ emit(:quantifier, :interval, copy(data, ts, te))
635
610
  };
636
611
 
637
612
  # Catch unmatched curly braces as literals
@@ -647,7 +622,7 @@
647
622
 
648
623
  comment {
649
624
  if free_spacing
650
- emit(:free_space, :comment, *text(data, ts, te))
625
+ emit(:free_space, :comment, copy(data, ts, te))
651
626
  else
652
627
  # consume only the pound sign (#) and backtrack to do regular scanning
653
628
  append_literal(data, ts, ts + 1)
@@ -657,7 +632,7 @@
657
632
 
658
633
  space+ {
659
634
  if free_spacing
660
- emit(:free_space, :whitespace, *text(data, ts, te))
635
+ emit(:free_space, :whitespace, copy(data, ts, te))
661
636
  else
662
637
  append_literal(data, ts, te)
663
638
  end
@@ -666,11 +641,7 @@
666
641
  # Literal: any run of ASCII (printable or non-printable), and/or UTF-8,
667
642
  # except meta characters.
668
643
  # ------------------------------------------------------------------------
669
- (ascii_print -- space)+ |
670
- ascii_nonprint+ |
671
- utf8_2_byte+ |
672
- utf8_3_byte+ |
673
- utf8_4_byte+ {
644
+ (ascii_print -- space)+ | ascii_nonprint+ | utf8_multibyte+ {
674
645
  append_literal(data, ts, te)
675
646
  };
676
647
 
@@ -682,10 +653,10 @@
682
653
 
683
654
  class Regexp::Scanner
684
655
  # General scanner error (catch all)
685
- class ScannerError < StandardError; end
656
+ class ScannerError < Regexp::Parser::Error; end
686
657
 
687
658
  # Base for all scanner validation errors
688
- class ValidationError < StandardError
659
+ class ValidationError < Regexp::Parser::Error
689
660
  def initialize(reason)
690
661
  super reason
691
662
  end
@@ -760,6 +731,7 @@ class Regexp::Scanner
760
731
  self.set_depth = 0
761
732
  self.group_depth = 0
762
733
  self.conditional_stack = []
734
+ self.char_pos = 0
763
735
 
764
736
  %% write data;
765
737
  %% write init;
@@ -769,7 +741,7 @@ class Regexp::Scanner
769
741
  testEof = testEof
770
742
 
771
743
  if cs == re_scanner_error
772
- text = ts ? copy(data, ts-1..-1) : data.pack('c*')
744
+ text = copy(data, ts ? ts-1 : 0, -1)
773
745
  raise ScannerError.new("Scan error at '#{text}'")
774
746
  end
775
747
 
@@ -786,7 +758,7 @@ class Regexp::Scanner
786
758
 
787
759
  # lazy-load property maps when first needed
788
760
  require 'yaml'
789
- PROP_MAPS_DIR = File.expand_path('../scanner/properties', __FILE__)
761
+ PROP_MAPS_DIR = File.join(__dir__, 'scanner', 'properties')
790
762
 
791
763
  def self.short_prop_map
792
764
  @short_prop_map ||= YAML.load_file("#{PROP_MAPS_DIR}/short.yml")
@@ -797,22 +769,29 @@ class Regexp::Scanner
797
769
  end
798
770
 
799
771
  # Emits an array with the details of the scanned pattern
800
- def emit(type, token, text, ts, te)
772
+ def emit(type, token, text)
801
773
  #puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
802
774
 
803
775
  emit_literal if literal
804
776
 
777
+ # Ragel runs with byte-based indices (ts, te). These are of little value to
778
+ # end-users, so we keep track of char-based indices and emit those instead.
779
+ ts_char_pos = char_pos
780
+ te_char_pos = char_pos + text.length
781
+
805
782
  if block
806
- block.call type, token, text, ts, te
783
+ block.call type, token, text, ts_char_pos, te_char_pos
807
784
  end
808
785
 
809
- tokens << [type, token, text, ts, te]
786
+ tokens << [type, token, text, ts_char_pos, te_char_pos]
787
+
788
+ self.char_pos = te_char_pos
810
789
  end
811
790
 
812
791
  private
813
792
 
814
793
  attr_accessor :tokens, :literal, :block, :free_spacing, :spacing_stack,
815
- :group_depth, :set_depth, :conditional_stack
794
+ :group_depth, :set_depth, :conditional_stack, :char_pos
816
795
 
817
796
  def free_spacing?(input_object, options)
818
797
  if options && !input_object.is_a?(String)
@@ -835,36 +814,25 @@ class Regexp::Scanner
835
814
  end
836
815
 
837
816
  # Copy from ts to te from data as text
838
- def copy(data, range)
839
- data[range].pack('c*')
840
- end
841
-
842
- # Copy from ts to te from data as text, returning an array with the text
843
- # and the offsets used to copy it.
844
- def text(data, ts, te, soff = 0)
845
- [copy(data, ts-soff..te-1), ts-soff, te]
817
+ def copy(data, ts, te)
818
+ data[ts...te].pack('c*').force_encoding('utf-8')
846
819
  end
847
820
 
848
821
  # Appends one or more characters to the literal buffer, to be emitted later
849
- # by a call to emit_literal. Contents can be a mix of ASCII and UTF-8.
822
+ # by a call to emit_literal.
850
823
  def append_literal(data, ts, te)
851
824
  self.literal = literal || []
852
- literal << text(data, ts, te)
825
+ literal << copy(data, ts, te)
853
826
  end
854
827
 
855
- # Emits the literal run collected by calls to the append_literal method,
856
- # using the total start (ts) and end (te) offsets of the run.
828
+ # Emits the literal run collected by calls to the append_literal method.
857
829
  def emit_literal
858
- ts, te = literal.first[1], literal.last[2]
859
- text = literal.map {|t| t[0]}.join
860
-
861
- text.force_encoding('utf-8') if text.respond_to?(:force_encoding)
862
-
830
+ text = literal.join
863
831
  self.literal = nil
864
- emit(:literal, :literal, text, ts, te)
832
+ emit(:literal, :literal, text)
865
833
  end
866
834
 
867
- def emit_options(text, ts, te)
835
+ def emit_options(text)
868
836
  token = nil
869
837
 
870
838
  # Ruby allows things like '(?-xxxx)' or '(?xx-xx--xx-:abc)'.
@@ -890,14 +858,14 @@ class Regexp::Scanner
890
858
  token = :options_switch
891
859
  end
892
860
 
893
- emit(:group, token, text, ts, te)
861
+ emit(:group, token, text)
894
862
  end
895
863
 
896
864
  def emit_meta_control_sequence(data, ts, te, token)
897
865
  if data.last < 0x00 || data.last > 0x7F
898
866
  validation_error(:sequence, 'escape', token.to_s)
899
867
  end
900
- emit(:escape, token, *text(data, ts, te, 1))
868
+ emit(:escape, token, copy(data, ts-1, te))
901
869
  end
902
870
 
903
871
  # Centralizes and unifies the handling of validation related