regexp_parser 1.8.2 → 2.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +100 -0
  3. data/Gemfile +6 -1
  4. data/README.md +1 -4
  5. data/Rakefile +8 -8
  6. data/lib/regexp_parser/error.rb +4 -0
  7. data/lib/regexp_parser/expression/classes/backref.rb +5 -0
  8. data/lib/regexp_parser/expression/classes/conditional.rb +11 -1
  9. data/lib/regexp_parser/expression/classes/free_space.rb +2 -2
  10. data/lib/regexp_parser/expression/classes/group.rb +28 -3
  11. data/lib/regexp_parser/expression/classes/property.rb +1 -1
  12. data/lib/regexp_parser/expression/classes/root.rb +4 -16
  13. data/lib/regexp_parser/expression/classes/set/range.rb +2 -1
  14. data/lib/regexp_parser/expression/methods/match_length.rb +2 -2
  15. data/lib/regexp_parser/expression/methods/traverse.rb +2 -2
  16. data/lib/regexp_parser/expression/quantifier.rb +10 -1
  17. data/lib/regexp_parser/expression/sequence.rb +3 -19
  18. data/lib/regexp_parser/expression/subexpression.rb +1 -1
  19. data/lib/regexp_parser/expression.rb +7 -19
  20. data/lib/regexp_parser/lexer.rb +2 -2
  21. data/lib/regexp_parser/parser.rb +307 -332
  22. data/lib/regexp_parser/scanner/char_type.rl +11 -11
  23. data/lib/regexp_parser/scanner/property.rl +2 -2
  24. data/lib/regexp_parser/scanner/scanner.rl +209 -240
  25. data/lib/regexp_parser/scanner.rb +1275 -1340
  26. data/lib/regexp_parser/syntax/any.rb +3 -3
  27. data/lib/regexp_parser/syntax/base.rb +1 -1
  28. data/lib/regexp_parser/syntax/version_lookup.rb +2 -2
  29. data/lib/regexp_parser/syntax.rb +8 -6
  30. data/lib/regexp_parser/version.rb +1 -1
  31. data/spec/expression/base_spec.rb +10 -0
  32. data/spec/expression/clone_spec.rb +36 -4
  33. data/spec/expression/free_space_spec.rb +2 -2
  34. data/spec/expression/methods/match_length_spec.rb +2 -2
  35. data/spec/expression/subexpression_spec.rb +1 -1
  36. data/spec/expression/to_s_spec.rb +39 -31
  37. data/spec/lexer/literals_spec.rb +24 -49
  38. data/spec/lexer/refcalls_spec.rb +5 -0
  39. data/spec/parser/all_spec.rb +2 -2
  40. data/spec/parser/errors_spec.rb +1 -1
  41. data/spec/parser/escapes_spec.rb +1 -1
  42. data/spec/parser/quantifiers_spec.rb +16 -0
  43. data/spec/parser/refcalls_spec.rb +5 -0
  44. data/spec/parser/set/ranges_spec.rb +3 -3
  45. data/spec/scanner/escapes_spec.rb +8 -1
  46. data/spec/scanner/groups_spec.rb +10 -1
  47. data/spec/scanner/literals_spec.rb +28 -38
  48. data/spec/scanner/quantifiers_spec.rb +18 -13
  49. data/spec/scanner/refcalls_spec.rb +19 -0
  50. data/spec/scanner/sets_spec.rb +65 -16
  51. data/spec/spec_helper.rb +1 -0
  52. metadata +4 -7
  53. data/spec/expression/root_spec.rb +0 -9
  54. data/spec/expression/sequence_spec.rb +0 -9
@@ -10,17 +10,17 @@
10
10
  # --------------------------------------------------------------------------
11
11
  char_type := |*
12
12
  char_type_char {
13
- case text = text(data, ts, te, 1).first
14
- when '\d'; emit(:type, :digit, text, ts - 1, te)
15
- when '\D'; emit(:type, :nondigit, text, ts - 1, te)
16
- when '\h'; emit(:type, :hex, text, ts - 1, te)
17
- when '\H'; emit(:type, :nonhex, text, ts - 1, te)
18
- when '\s'; emit(:type, :space, text, ts - 1, te)
19
- when '\S'; emit(:type, :nonspace, text, ts - 1, te)
20
- when '\w'; emit(:type, :word, text, ts - 1, te)
21
- when '\W'; emit(:type, :nonword, text, ts - 1, te)
22
- when '\R'; emit(:type, :linebreak, text, ts - 1, te)
23
- when '\X'; emit(:type, :xgrapheme, text, ts - 1, te)
13
+ case text = copy(data, ts-1, te)
14
+ when '\d'; emit(:type, :digit, text)
15
+ when '\D'; emit(:type, :nondigit, text)
16
+ when '\h'; emit(:type, :hex, text)
17
+ when '\H'; emit(:type, :nonhex, text)
18
+ when '\s'; emit(:type, :space, text)
19
+ when '\S'; emit(:type, :nonspace, text)
20
+ when '\w'; emit(:type, :word, text)
21
+ when '\W'; emit(:type, :nonword, text)
22
+ when '\R'; emit(:type, :linebreak, text)
23
+ when '\X'; emit(:type, :xgrapheme, text)
24
24
  end
25
25
  fret;
26
26
  };
@@ -14,7 +14,7 @@
14
14
  unicode_property := |*
15
15
 
16
16
  property_sequence < eof(premature_property_end) {
17
- text = text(data, ts, te, 1).first
17
+ text = copy(data, ts-1, te)
18
18
  type = (text[1] == 'P') ^ (text[3] == '^') ? :nonproperty : :property
19
19
 
20
20
  name = data[ts+2..te-2].pack('c*').gsub(/[\^\s_\-]/, '').downcase
@@ -22,7 +22,7 @@
22
22
  token = self.class.short_prop_map[name] || self.class.long_prop_map[name]
23
23
  raise UnknownUnicodePropertyError.new(name) unless token
24
24
 
25
- self.emit(type, token.to_sym, text, ts-1, te)
25
+ self.emit(type, token.to_sym, text)
26
26
 
27
27
  fret;
28
28
  };
@@ -3,6 +3,11 @@
3
3
  include re_char_type "char_type.rl";
4
4
  include re_property "property.rl";
5
5
 
6
+ utf8_2_byte = (0xc2..0xdf 0x80..0xbf);
7
+ utf8_3_byte = (0xe0..0xef 0x80..0xbf 0x80..0xbf);
8
+ utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
9
+ utf8_multibyte = utf8_2_byte | utf8_3_byte | utf8_4_byte;
10
+
6
11
  dot = '.';
7
12
  backslash = '\\';
8
13
  alternation = '|';
@@ -15,7 +20,7 @@
15
20
 
16
21
  group_open = '(';
17
22
  group_close = ')';
18
- parantheses = group_open | group_close;
23
+ parentheses = group_open | group_close;
19
24
 
20
25
  set_open = '[';
21
26
  set_close = ']';
@@ -32,7 +37,7 @@
32
37
  class_posix = ('[:' . '^'? . class_name_posix . ':]');
33
38
 
34
39
 
35
- # these are not supported in ruby, and need verification
40
+ # these are not supported in ruby at the moment
36
41
  collating_sequence = '[.' . (alpha | [\-])+ . '.]';
37
42
  character_equivalent = '[=' . alpha . '=]';
38
43
 
@@ -53,6 +58,8 @@
53
58
 
54
59
  meta_sequence = 'M-' . (backslash . ('c' | 'C-'))? . backslash? . any;
55
60
 
61
+ sequence_char = [CMcux];
62
+
56
63
  zero_or_one = '?' | '??' | '?+';
57
64
  zero_or_more = '*' | '*?' | '*+';
58
65
  one_or_more = '+' | '+?' | '++';
@@ -90,21 +97,26 @@
90
97
  group_options = '?' . ( [^!#'():<=>~]+ . ':'? ) ?;
91
98
 
92
99
  group_ref = [gk];
93
- group_name_char = (alnum | '_');
94
- group_name_id = (group_name_char . (group_name_char+)?)?;
95
- group_number = '-'? . [1-9] . ([0-9]+)?;
100
+ group_name_id_ab = ([^0-9\->] | utf8_multibyte) . ([^>] | utf8_multibyte)*;
101
+ group_name_id_sq = ([^0-9\-'] | utf8_multibyte) . ([^'] | utf8_multibyte)*;
102
+ group_number = '-'? . [1-9] . [0-9]*;
96
103
  group_level = [+\-] . [0-9]+;
97
104
 
98
- group_name = ('<' . group_name_id . '>') | ("'" . group_name_id . "'");
105
+ group_name = ('<' . group_name_id_ab? . '>') |
106
+ ("'" . group_name_id_sq? . "'");
99
107
  group_lookup = group_name | group_number;
100
108
 
101
109
  group_named = ('?' . group_name );
102
110
 
103
- group_name_ref = group_ref . (('<' . group_name_id . group_level? '>') |
104
- ("'" . group_name_id . group_level? "'"));
111
+ group_name_backref = 'k' . (('<' . group_name_id_ab? . group_level? '>') |
112
+ ("'" . group_name_id_sq? . group_level? "'"));
113
+ group_name_call = 'g' . (('<' . group_name_id_ab? . group_level? '>') |
114
+ ("'" . group_name_id_sq? . group_level? "'"));
105
115
 
106
- group_number_ref = group_ref . (('<' . group_number . group_level? '>') |
107
- ("'" . group_number . group_level? "'"));
116
+ group_number_backref = 'k' . (('<' . group_number . group_level? '>') |
117
+ ("'" . group_number . group_level? "'"));
118
+ group_number_call = 'g' . (('<' . ((group_number . group_level?) | '0') '>') |
119
+ ("'" . ((group_number . group_level?) | '0') "'"));
108
120
 
109
121
  group_type = group_atomic | group_passive | group_absence | group_named;
110
122
 
@@ -115,7 +127,7 @@
115
127
 
116
128
  # characters that 'break' a literal
117
129
  meta_char = dot | backslash | alternation |
118
- curlies | parantheses | brackets |
130
+ curlies | parentheses | brackets |
119
131
  line_anchor | quantifier_greedy;
120
132
 
121
133
  literal_delimiters = ']' | '}';
@@ -123,25 +135,23 @@
123
135
  ascii_print = ((0x20..0x7e) - meta_char - '#');
124
136
  ascii_nonprint = (0x01..0x1f | 0x7f);
125
137
 
126
- utf8_2_byte = (0xc2..0xdf 0x80..0xbf);
127
- utf8_3_byte = (0xe0..0xef 0x80..0xbf 0x80..0xbf);
128
- utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
129
-
130
138
  non_literal_escape = char_type_char | anchor_char | escaped_ascii |
131
- keep_mark | [xucCM];
139
+ keep_mark | sequence_char;
140
+
141
+ # escapes that also work within a character set
142
+ set_escape = backslash | brackets | escaped_ascii | property_char |
143
+ sequence_char | single_codepoint_char_type;
132
144
 
133
- non_set_escape = (anchor_char - 'b') | group_ref | keep_mark |
134
- multi_codepoint_char_type | [0-9cCM];
135
145
 
136
146
  # EOF error, used where it can be detected
137
147
  action premature_end_error {
138
- text = ts ? copy(data, ts-1..-1) : data.pack('c*')
148
+ text = copy(data, ts ? ts-1 : 0, -1)
139
149
  raise PrematureEndError.new( text )
140
150
  }
141
151
 
142
152
  # Invalid sequence error, used from sequences, like escapes and sets
143
153
  action invalid_sequence_error {
144
- text = ts ? copy(data, ts-1..-1) : data.pack('c*')
154
+ text = copy(data, ts ? ts-1 : 0, -1)
145
155
  validation_error(:sequence, 'sequence', text)
146
156
  }
147
157
 
@@ -156,7 +166,7 @@
156
166
  # --------------------------------------------------------------------------
157
167
  character_set := |*
158
168
  set_close > (set_meta, 2) @set_closed {
159
- emit(:set, :close, *text(data, ts, te))
169
+ emit(:set, :close, copy(data, ts, te))
160
170
  if in_set?
161
171
  fret;
162
172
  else
@@ -165,8 +175,8 @@
165
175
  };
166
176
 
167
177
  '-]' @set_closed { # special case, emits two tokens
168
- emit(:literal, :literal, copy(data, ts..te-2), ts, te - 1)
169
- emit(:set, :close, copy(data, ts+1..te-1), ts + 1, te)
178
+ emit(:literal, :literal, copy(data, ts, te-1))
179
+ emit(:set, :close, copy(data, ts+1, te))
170
180
  if in_set?
171
181
  fret;
172
182
  else
@@ -175,33 +185,33 @@
175
185
  };
176
186
 
177
187
  '-&&' { # special case, emits two tokens
178
- emit(:literal, :literal, '-', ts, te)
179
- emit(:set, :intersection, '&&', ts, te)
188
+ emit(:literal, :literal, '-')
189
+ emit(:set, :intersection, '&&')
180
190
  };
181
191
 
182
192
  '^' {
183
- text = text(data, ts, te).first
193
+ text = copy(data, ts, te)
184
194
  if tokens.last[1] == :open
185
- emit(:set, :negate, text, ts, te)
195
+ emit(:set, :negate, text)
186
196
  else
187
- emit(:literal, :literal, text, ts, te)
197
+ emit(:literal, :literal, text)
188
198
  end
189
199
  };
190
200
 
191
201
  '-' {
192
- text = text(data, ts, te).first
202
+ text = copy(data, ts, te)
193
203
  # ranges cant start with a subset or intersection/negation/range operator
194
204
  if tokens.last[0] == :set
195
- emit(:literal, :literal, text, ts, te)
205
+ emit(:literal, :literal, text)
196
206
  else
197
- emit(:set, :range, text, ts, te)
207
+ emit(:set, :range, text)
198
208
  end
199
209
  };
200
210
 
201
211
  # Unlike ranges, intersections can start or end at set boundaries, whereupon
202
212
  # they match nothing: r = /[a&&]/; [r =~ ?a, r =~ ?&] # => [nil, nil]
203
213
  '&&' {
204
- emit(:set, :intersection, *text(data, ts, te))
214
+ emit(:set, :intersection, copy(data, ts, te))
205
215
  };
206
216
 
207
217
  backslash {
@@ -209,12 +219,12 @@
209
219
  };
210
220
 
211
221
  set_open >(open_bracket, 1) >set_opened {
212
- emit(:set, :open, *text(data, ts, te))
222
+ emit(:set, :open, copy(data, ts, te))
213
223
  fcall character_set;
214
224
  };
215
225
 
216
226
  class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error) {
217
- text = text(data, ts, te).first
227
+ text = copy(data, ts, te)
218
228
 
219
229
  type = :posixclass
220
230
  class_name = text[2..-3]
@@ -223,45 +233,40 @@
223
233
  type = :nonposixclass
224
234
  end
225
235
 
226
- emit(type, class_name.to_sym, text, ts, te)
236
+ emit(type, class_name.to_sym, text)
227
237
  };
228
238
 
229
- collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
230
- emit(:set, :collation, *text(data, ts, te))
231
- };
232
-
233
- character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
234
- emit(:set, :equivalent, *text(data, ts, te))
235
- };
239
+ # These are not supported in ruby at the moment. Enable them if they are.
240
+ # collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
241
+ # emit(:set, :collation, copy(data, ts, te))
242
+ # };
243
+ # character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
244
+ # emit(:set, :equivalent, copy(data, ts, te))
245
+ # };
236
246
 
237
247
  meta_char > (set_meta, 1) {
238
- emit(:literal, :literal, *text(data, ts, te))
248
+ emit(:literal, :literal, copy(data, ts, te))
239
249
  };
240
250
 
241
- any |
242
- ascii_nonprint |
243
- utf8_2_byte |
244
- utf8_3_byte |
245
- utf8_4_byte {
246
- char, *rest = *text(data, ts, te)
247
- char.force_encoding('utf-8') if char.respond_to?(:force_encoding)
248
- emit(:literal, :literal, char, *rest)
251
+ any | ascii_nonprint | utf8_multibyte {
252
+ text = copy(data, ts, te)
253
+ emit(:literal, :literal, text)
249
254
  };
250
255
  *|;
251
256
 
252
257
  # set escapes scanner
253
258
  # --------------------------------------------------------------------------
254
259
  set_escape_sequence := |*
255
- non_set_escape > (escaped_set_alpha, 2) {
256
- emit(:escape, :literal, *text(data, ts, te, 1))
257
- fret;
258
- };
259
-
260
- any > (escaped_set_alpha, 1) {
260
+ set_escape > (escaped_set_alpha, 2) {
261
261
  fhold;
262
262
  fnext character_set;
263
263
  fcall escape_sequence;
264
264
  };
265
+
266
+ any > (escaped_set_alpha, 1) {
267
+ emit(:escape, :literal, copy(data, ts-1, te))
268
+ fret;
269
+ };
265
270
  *|;
266
271
 
267
272
 
@@ -269,33 +274,33 @@
269
274
  # --------------------------------------------------------------------------
270
275
  escape_sequence := |*
271
276
  [1-9] {
272
- text = text(data, ts, te, 1).first
273
- emit(:backref, :number, text, ts-1, te)
277
+ text = copy(data, ts-1, te)
278
+ emit(:backref, :number, text)
274
279
  fret;
275
280
  };
276
281
 
277
282
  octal_sequence {
278
- emit(:escape, :octal, *text(data, ts, te, 1))
283
+ emit(:escape, :octal, copy(data, ts-1, te))
279
284
  fret;
280
285
  };
281
286
 
282
287
  meta_char {
283
- case text = text(data, ts, te, 1).first
284
- when '\.'; emit(:escape, :dot, text, ts-1, te)
285
- when '\|'; emit(:escape, :alternation, text, ts-1, te)
286
- when '\^'; emit(:escape, :bol, text, ts-1, te)
287
- when '\$'; emit(:escape, :eol, text, ts-1, te)
288
- when '\?'; emit(:escape, :zero_or_one, text, ts-1, te)
289
- when '\*'; emit(:escape, :zero_or_more, text, ts-1, te)
290
- when '\+'; emit(:escape, :one_or_more, text, ts-1, te)
291
- when '\('; emit(:escape, :group_open, text, ts-1, te)
292
- when '\)'; emit(:escape, :group_close, text, ts-1, te)
293
- when '\{'; emit(:escape, :interval_open, text, ts-1, te)
294
- when '\}'; emit(:escape, :interval_close, text, ts-1, te)
295
- when '\['; emit(:escape, :set_open, text, ts-1, te)
296
- when '\]'; emit(:escape, :set_close, text, ts-1, te)
288
+ case text = copy(data, ts-1, te)
289
+ when '\.'; emit(:escape, :dot, text)
290
+ when '\|'; emit(:escape, :alternation, text)
291
+ when '\^'; emit(:escape, :bol, text)
292
+ when '\$'; emit(:escape, :eol, text)
293
+ when '\?'; emit(:escape, :zero_or_one, text)
294
+ when '\*'; emit(:escape, :zero_or_more, text)
295
+ when '\+'; emit(:escape, :one_or_more, text)
296
+ when '\('; emit(:escape, :group_open, text)
297
+ when '\)'; emit(:escape, :group_close, text)
298
+ when '\{'; emit(:escape, :interval_open, text)
299
+ when '\}'; emit(:escape, :interval_close, text)
300
+ when '\['; emit(:escape, :set_open, text)
301
+ when '\]'; emit(:escape, :set_close, text)
297
302
  when "\\\\";
298
- emit(:escape, :backslash, text, ts-1, te)
303
+ emit(:escape, :backslash, text)
299
304
  end
300
305
  fret;
301
306
  };
@@ -303,31 +308,31 @@
303
308
  escaped_ascii > (escaped_alpha, 7) {
304
309
  # \b is emitted as backspace only when inside a character set, otherwise
305
310
  # it is a word boundary anchor. A syntax might "normalize" it if needed.
306
- case text = text(data, ts, te, 1).first
307
- when '\a'; emit(:escape, :bell, text, ts-1, te)
308
- when '\b'; emit(:escape, :backspace, text, ts-1, te)
309
- when '\e'; emit(:escape, :escape, text, ts-1, te)
310
- when '\f'; emit(:escape, :form_feed, text, ts-1, te)
311
- when '\n'; emit(:escape, :newline, text, ts-1, te)
312
- when '\r'; emit(:escape, :carriage, text, ts-1, te)
313
- when '\t'; emit(:escape, :tab, text, ts-1, te)
314
- when '\v'; emit(:escape, :vertical_tab, text, ts-1, te)
311
+ case text = copy(data, ts-1, te)
312
+ when '\a'; emit(:escape, :bell, text)
313
+ when '\b'; emit(:escape, :backspace, text)
314
+ when '\e'; emit(:escape, :escape, text)
315
+ when '\f'; emit(:escape, :form_feed, text)
316
+ when '\n'; emit(:escape, :newline, text)
317
+ when '\r'; emit(:escape, :carriage, text)
318
+ when '\t'; emit(:escape, :tab, text)
319
+ when '\v'; emit(:escape, :vertical_tab, text)
315
320
  end
316
321
  fret;
317
322
  };
318
323
 
319
324
  codepoint_sequence > (escaped_alpha, 6) $eof(premature_end_error) {
320
- text = text(data, ts, te, 1).first
325
+ text = copy(data, ts-1, te)
321
326
  if text[2].chr == '{'
322
- emit(:escape, :codepoint_list, text, ts-1, te)
327
+ emit(:escape, :codepoint_list, text)
323
328
  else
324
- emit(:escape, :codepoint, text, ts-1, te)
329
+ emit(:escape, :codepoint, text)
325
330
  end
326
331
  fret;
327
332
  };
328
333
 
329
- hex_sequence > (escaped_alpha, 5) $eof(premature_end_error) {
330
- emit(:escape, :hex, *text(data, ts, te, 1))
334
+ hex_sequence > (escaped_alpha, 5) @eof(premature_end_error) {
335
+ emit(:escape, :hex, copy(data, ts-1, te))
331
336
  fret;
332
337
  };
333
338
 
@@ -357,8 +362,8 @@
357
362
  fcall unicode_property;
358
363
  };
359
364
 
360
- (any -- non_literal_escape) > (escaped_alpha, 1) {
361
- emit(:escape, :literal, *text(data, ts, te, 1))
365
+ (any -- non_literal_escape) | utf8_multibyte > (escaped_alpha, 1) {
366
+ emit(:escape, :literal, copy(data, ts-1, te))
362
367
  fret;
363
368
  };
364
369
  *|;
@@ -368,9 +373,9 @@
368
373
  # --------------------------------------------------------------------------
369
374
  conditional_expression := |*
370
375
  group_lookup . ')' {
371
- text = text(data, ts, te-1).first
372
- emit(:conditional, :condition, text, ts, te-1)
373
- emit(:conditional, :condition_close, ')', te-1, te)
376
+ text = copy(data, ts, te-1)
377
+ emit(:conditional, :condition, text)
378
+ emit(:conditional, :condition_close, ')')
374
379
  };
375
380
 
376
381
  any {
@@ -387,39 +392,39 @@
387
392
  # Meta characters
388
393
  # ------------------------------------------------------------------------
389
394
  dot {
390
- emit(:meta, :dot, *text(data, ts, te))
395
+ emit(:meta, :dot, copy(data, ts, te))
391
396
  };
392
397
 
393
398
  alternation {
394
399
  if conditional_stack.last == group_depth
395
- emit(:conditional, :separator, *text(data, ts, te))
400
+ emit(:conditional, :separator, copy(data, ts, te))
396
401
  else
397
- emit(:meta, :alternation, *text(data, ts, te))
402
+ emit(:meta, :alternation, copy(data, ts, te))
398
403
  end
399
404
  };
400
405
 
401
406
  # Anchors
402
407
  # ------------------------------------------------------------------------
403
408
  beginning_of_line {
404
- emit(:anchor, :bol, *text(data, ts, te))
409
+ emit(:anchor, :bol, copy(data, ts, te))
405
410
  };
406
411
 
407
412
  end_of_line {
408
- emit(:anchor, :eol, *text(data, ts, te))
413
+ emit(:anchor, :eol, copy(data, ts, te))
409
414
  };
410
415
 
411
416
  backslash . keep_mark > (backslashed, 4) {
412
- emit(:keep, :mark, *text(data, ts, te))
417
+ emit(:keep, :mark, copy(data, ts, te))
413
418
  };
414
419
 
415
420
  backslash . anchor_char > (backslashed, 3) {
416
- case text = text(data, ts, te).first
417
- when '\\A'; emit(:anchor, :bos, text, ts, te)
418
- when '\\z'; emit(:anchor, :eos, text, ts, te)
419
- when '\\Z'; emit(:anchor, :eos_ob_eol, text, ts, te)
420
- when '\\b'; emit(:anchor, :word_boundary, text, ts, te)
421
- when '\\B'; emit(:anchor, :nonword_boundary, text, ts, te)
422
- when '\\G'; emit(:anchor, :match_start, text, ts, te)
421
+ case text = copy(data, ts, te)
422
+ when '\\A'; emit(:anchor, :bos, text)
423
+ when '\\z'; emit(:anchor, :eos, text)
424
+ when '\\Z'; emit(:anchor, :eos_ob_eol, text)
425
+ when '\\b'; emit(:anchor, :word_boundary, text)
426
+ when '\\B'; emit(:anchor, :nonword_boundary, text)
427
+ when '\\G'; emit(:anchor, :match_start, text)
423
428
  end
424
429
  };
425
430
 
@@ -430,7 +435,7 @@
430
435
  # Character sets
431
436
  # ------------------------------------------------------------------------
432
437
  set_open >set_opened {
433
- emit(:set, :open, *text(data, ts, te))
438
+ emit(:set, :open, copy(data, ts, te))
434
439
  fcall character_set;
435
440
  };
436
441
 
@@ -439,12 +444,12 @@
439
444
  # (?(condition)Y|N) conditional expression
440
445
  # ------------------------------------------------------------------------
441
446
  conditional {
442
- text = text(data, ts, te).first
447
+ text = copy(data, ts, te)
443
448
 
444
449
  conditional_stack << group_depth
445
450
 
446
- emit(:conditional, :open, text[0..-2], ts, te-1)
447
- emit(:conditional, :condition_open, '(', te-1, te)
451
+ emit(:conditional, :open, text[0..-2])
452
+ emit(:conditional, :condition_open, '(')
448
453
  fcall conditional_expression;
449
454
  };
450
455
 
@@ -455,7 +460,7 @@
455
460
  # correct closing count.
456
461
  # ------------------------------------------------------------------------
457
462
  group_open . group_comment $group_closed {
458
- emit(:group, :comment, *text(data, ts, te))
463
+ emit(:group, :comment, copy(data, ts, te))
459
464
  };
460
465
 
461
466
  # Expression options:
@@ -470,11 +475,11 @@
470
475
  # (?imxdau-imx:subexp) option on/off for subexp
471
476
  # ------------------------------------------------------------------------
472
477
  group_open . group_options >group_opened {
473
- text = text(data, ts, te).first
478
+ text = copy(data, ts, te)
474
479
  if text[2..-1] =~ /([^\-mixdau:]|^$)|-.*([dau])/
475
480
  raise InvalidGroupOption.new($1 || "-#{$2}", text)
476
481
  end
477
- emit_options(text, ts, te)
482
+ emit_options(text)
478
483
  };
479
484
 
480
485
  # Assertions
@@ -484,11 +489,11 @@
484
489
  # (?<!subexp) negative look-behind
485
490
  # ------------------------------------------------------------------------
486
491
  group_open . assertion_type >group_opened {
487
- case text = text(data, ts, te).first
488
- when '(?='; emit(:assertion, :lookahead, text, ts, te)
489
- when '(?!'; emit(:assertion, :nlookahead, text, ts, te)
490
- when '(?<='; emit(:assertion, :lookbehind, text, ts, te)
491
- when '(?<!'; emit(:assertion, :nlookbehind, text, ts, te)
492
+ case text = copy(data, ts, te)
493
+ when '(?='; emit(:assertion, :lookahead, text)
494
+ when '(?!'; emit(:assertion, :nlookahead, text)
495
+ when '(?<='; emit(:assertion, :lookbehind, text)
496
+ when '(?<!'; emit(:assertion, :nlookbehind, text)
492
497
  end
493
498
  };
494
499
 
@@ -501,32 +506,32 @@
501
506
  # (subexp) captured group
502
507
  # ------------------------------------------------------------------------
503
508
  group_open . group_type >group_opened {
504
- case text = text(data, ts, te).first
505
- when '(?:'; emit(:group, :passive, text, ts, te)
506
- when '(?>'; emit(:group, :atomic, text, ts, te)
507
- when '(?~'; emit(:group, :absence, text, ts, te)
509
+ case text = copy(data, ts, te)
510
+ when '(?:'; emit(:group, :passive, text)
511
+ when '(?>'; emit(:group, :atomic, text)
512
+ when '(?~'; emit(:group, :absence, text)
508
513
 
509
514
  when /^\(\?(?:<>|'')/
510
515
  validation_error(:group, 'named group', 'name is empty')
511
516
 
512
- when /^\(\?<\w*>/
513
- emit(:group, :named_ab, text, ts, te)
517
+ when /^\(\?<[^>]+>/
518
+ emit(:group, :named_ab, text)
514
519
 
515
- when /^\(\?'\w*'/
516
- emit(:group, :named_sq, text, ts, te)
520
+ when /^\(\?'[^']+'/
521
+ emit(:group, :named_sq, text)
517
522
 
518
523
  end
519
524
  };
520
525
 
521
526
  group_open @group_opened {
522
- text = text(data, ts, te).first
523
- emit(:group, :capture, text, ts, te)
527
+ text = copy(data, ts, te)
528
+ emit(:group, :capture, text)
524
529
  };
525
530
 
526
531
  group_close @group_closed {
527
532
  if conditional_stack.last == group_depth + 1
528
533
  conditional_stack.pop
529
- emit(:conditional, :close, *text(data, ts, te))
534
+ emit(:conditional, :close, copy(data, ts, te))
530
535
  else
531
536
  if spacing_stack.length > 1 &&
532
537
  spacing_stack.last[:depth] == group_depth + 1
@@ -534,72 +539,42 @@
534
539
  self.free_spacing = spacing_stack.last[:free_spacing]
535
540
  end
536
541
 
537
- emit(:group, :close, *text(data, ts, te))
542
+ emit(:group, :close, copy(data, ts, te))
538
543
  end
539
544
  };
540
545
 
541
546
 
542
547
  # Group backreference, named and numbered
543
548
  # ------------------------------------------------------------------------
544
- backslash . (group_name_ref | group_number_ref) > (backslashed, 4) {
545
- case text = text(data, ts, te).first
546
- when /^\\([gk])(<>|'')/ # angle brackets
547
- validation_error(:backref, 'ref/call', 'ref ID is empty')
548
-
549
- when /^\\([gk])<[^\d+-]\w*>/ # angle-brackets
550
- if $1 == 'k'
551
- emit(:backref, :name_ref_ab, text, ts, te)
552
- else
553
- emit(:backref, :name_call_ab, text, ts, te)
554
- end
555
-
556
- when /^\\([gk])'[^\d+-]\w*'/ #single quotes
557
- if $1 == 'k'
558
- emit(:backref, :name_ref_sq, text, ts, te)
559
- else
560
- emit(:backref, :name_call_sq, text, ts, te)
561
- end
562
-
563
- when /^\\([gk])<\d+>/ # angle-brackets
564
- if $1 == 'k'
565
- emit(:backref, :number_ref_ab, text, ts, te)
566
- else
567
- emit(:backref, :number_call_ab, text, ts, te)
568
- end
569
-
570
- when /^\\([gk])'\d+'/ # single quotes
571
- if $1 == 'k'
572
- emit(:backref, :number_ref_sq, text, ts, te)
573
- else
574
- emit(:backref, :number_call_sq, text, ts, te)
575
- end
576
-
577
- when /^\\(?:g<\+|g<-|(k)<-)\d+>/ # angle-brackets
578
- if $1 == 'k'
579
- emit(:backref, :number_rel_ref_ab, text, ts, te)
580
- else
581
- emit(:backref, :number_rel_call_ab, text, ts, te)
582
- end
583
-
584
- when /^\\(?:g'\+|g'-|(k)'-)\d+'/ # single quotes
585
- if $1 == 'k'
586
- emit(:backref, :number_rel_ref_sq, text, ts, te)
587
- else
588
- emit(:backref, :number_rel_call_sq, text, ts, te)
589
- end
590
-
591
- when /^\\k<[^\d+\-]\w*[+\-]\d+>/ # angle-brackets
592
- emit(:backref, :name_recursion_ref_ab, text, ts, te)
593
-
594
- when /^\\k'[^\d+\-]\w*[+\-]\d+'/ # single-quotes
595
- emit(:backref, :name_recursion_ref_sq, text, ts, te)
596
-
597
- when /^\\([gk])<[+\-]?\d+[+\-]\d+>/ # angle-brackets
598
- emit(:backref, :number_recursion_ref_ab, text, ts, te)
599
-
600
- when /^\\([gk])'[+\-]?\d+[+\-]\d+'/ # single-quotes
601
- emit(:backref, :number_recursion_ref_sq, text, ts, te)
549
+ backslash . (group_name_backref | group_number_backref) > (backslashed, 4) {
550
+ case text = copy(data, ts, te)
551
+ when /^\\k(<>|'')/
552
+ validation_error(:backref, 'backreference', 'ref ID is empty')
553
+ when /^\\k(.)[^\p{digit}\-][^+\-]*\D$/
554
+ emit(:backref, $1 == '<' ? :name_ref_ab : :name_ref_sq, text)
555
+ when /^\\k(.)\d+\D$/
556
+ emit(:backref, $1 == '<' ? :number_ref_ab : :number_ref_sq, text)
557
+ when /^\\k(.)-\d+\D$/
558
+ emit(:backref, $1 == '<' ? :number_rel_ref_ab : :number_rel_ref_sq, text)
559
+ when /^\\k(.)[^\p{digit}\-].*[+\-]\d+\D$/
560
+ emit(:backref, $1 == '<' ? :name_recursion_ref_ab : :name_recursion_ref_sq, text)
561
+ when /^\\k(.)-?\d+[+\-]\d+\D$/
562
+ emit(:backref, $1 == '<' ? :number_recursion_ref_ab : :number_recursion_ref_sq, text)
563
+ end
564
+ };
602
565
 
566
+ # Group call, named and numbered
567
+ # ------------------------------------------------------------------------
568
+ backslash . (group_name_call | group_number_call) > (backslashed, 4) {
569
+ case text = copy(data, ts, te)
570
+ when /^\\g(<>|'')/
571
+ validation_error(:backref, 'subexpression call', 'ref ID is empty')
572
+ when /^\\g(.)[^\p{digit}+\->][^+\-]*/
573
+ emit(:backref, $1 == '<' ? :name_call_ab : :name_call_sq, text)
574
+ when /^\\g(.)\d+\D$/
575
+ emit(:backref, $1 == '<' ? :number_call_ab : :number_call_sq, text)
576
+ when /^\\g(.)[+-]\d+/
577
+ emit(:backref, $1 == '<' ? :number_rel_call_ab : :number_rel_call_sq, text)
603
578
  end
604
579
  };
605
580
 
@@ -607,31 +582,31 @@
607
582
  # Quantifiers
608
583
  # ------------------------------------------------------------------------
609
584
  zero_or_one {
610
- case text = text(data, ts, te).first
611
- when '?' ; emit(:quantifier, :zero_or_one, text, ts, te)
612
- when '??'; emit(:quantifier, :zero_or_one_reluctant, text, ts, te)
613
- when '?+'; emit(:quantifier, :zero_or_one_possessive, text, ts, te)
585
+ case text = copy(data, ts, te)
586
+ when '?' ; emit(:quantifier, :zero_or_one, text)
587
+ when '??'; emit(:quantifier, :zero_or_one_reluctant, text)
588
+ when '?+'; emit(:quantifier, :zero_or_one_possessive, text)
614
589
  end
615
590
  };
616
591
 
617
592
  zero_or_more {
618
- case text = text(data, ts, te).first
619
- when '*' ; emit(:quantifier, :zero_or_more, text, ts, te)
620
- when '*?'; emit(:quantifier, :zero_or_more_reluctant, text, ts, te)
621
- when '*+'; emit(:quantifier, :zero_or_more_possessive, text, ts, te)
593
+ case text = copy(data, ts, te)
594
+ when '*' ; emit(:quantifier, :zero_or_more, text)
595
+ when '*?'; emit(:quantifier, :zero_or_more_reluctant, text)
596
+ when '*+'; emit(:quantifier, :zero_or_more_possessive, text)
622
597
  end
623
598
  };
624
599
 
625
600
  one_or_more {
626
- case text = text(data, ts, te).first
627
- when '+' ; emit(:quantifier, :one_or_more, text, ts, te)
628
- when '+?'; emit(:quantifier, :one_or_more_reluctant, text, ts, te)
629
- when '++'; emit(:quantifier, :one_or_more_possessive, text, ts, te)
601
+ case text = copy(data, ts, te)
602
+ when '+' ; emit(:quantifier, :one_or_more, text)
603
+ when '+?'; emit(:quantifier, :one_or_more_reluctant, text)
604
+ when '++'; emit(:quantifier, :one_or_more_possessive, text)
630
605
  end
631
606
  };
632
607
 
633
608
  quantifier_interval {
634
- emit(:quantifier, :interval, *text(data, ts, te))
609
+ emit(:quantifier, :interval, copy(data, ts, te))
635
610
  };
636
611
 
637
612
  # Catch unmatched curly braces as literals
@@ -647,7 +622,7 @@
647
622
 
648
623
  comment {
649
624
  if free_spacing
650
- emit(:free_space, :comment, *text(data, ts, te))
625
+ emit(:free_space, :comment, copy(data, ts, te))
651
626
  else
652
627
  # consume only the pound sign (#) and backtrack to do regular scanning
653
628
  append_literal(data, ts, ts + 1)
@@ -657,7 +632,7 @@
657
632
 
658
633
  space+ {
659
634
  if free_spacing
660
- emit(:free_space, :whitespace, *text(data, ts, te))
635
+ emit(:free_space, :whitespace, copy(data, ts, te))
661
636
  else
662
637
  append_literal(data, ts, te)
663
638
  end
@@ -666,11 +641,7 @@
666
641
  # Literal: any run of ASCII (pritable or non-printable), and/or UTF-8,
667
642
  # except meta characters.
668
643
  # ------------------------------------------------------------------------
669
- (ascii_print -- space)+ |
670
- ascii_nonprint+ |
671
- utf8_2_byte+ |
672
- utf8_3_byte+ |
673
- utf8_4_byte+ {
644
+ (ascii_print -- space)+ | ascii_nonprint+ | utf8_multibyte+ {
674
645
  append_literal(data, ts, te)
675
646
  };
676
647
 
@@ -680,12 +651,14 @@
680
651
  # THIS IS A GENERATED FILE, DO NOT EDIT DIRECTLY
681
652
  # This file was generated from lib/regexp_parser/scanner/scanner.rl
682
653
 
654
+ require 'regexp_parser/error'
655
+
683
656
  class Regexp::Scanner
684
657
  # General scanner error (catch all)
685
- class ScannerError < StandardError; end
658
+ class ScannerError < Regexp::Parser::Error; end
686
659
 
687
660
  # Base for all scanner validation errors
688
- class ValidationError < StandardError
661
+ class ValidationError < Regexp::Parser::Error
689
662
  def initialize(reason)
690
663
  super reason
691
664
  end
@@ -760,6 +733,7 @@ class Regexp::Scanner
760
733
  self.set_depth = 0
761
734
  self.group_depth = 0
762
735
  self.conditional_stack = []
736
+ self.char_pos = 0
763
737
 
764
738
  %% write data;
765
739
  %% write init;
@@ -769,7 +743,7 @@ class Regexp::Scanner
769
743
  testEof = testEof
770
744
 
771
745
  if cs == re_scanner_error
772
- text = ts ? copy(data, ts-1..-1) : data.pack('c*')
746
+ text = copy(data, ts ? ts-1 : 0, -1)
773
747
  raise ScannerError.new("Scan error at '#{text}'")
774
748
  end
775
749
 
@@ -786,33 +760,39 @@ class Regexp::Scanner
786
760
 
787
761
  # lazy-load property maps when first needed
788
762
  require 'yaml'
789
- PROP_MAPS_DIR = File.expand_path('../scanner/properties', __FILE__)
790
763
 
791
764
  def self.short_prop_map
792
- @short_prop_map ||= YAML.load_file("#{PROP_MAPS_DIR}/short.yml")
765
+ @short_prop_map ||= YAML.load_file("#{__dir__}/scanner/properties/short.yml")
793
766
  end
794
767
 
795
768
  def self.long_prop_map
796
- @long_prop_map ||= YAML.load_file("#{PROP_MAPS_DIR}/long.yml")
769
+ @long_prop_map ||= YAML.load_file("#{__dir__}/scanner/properties/long.yml")
797
770
  end
798
771
 
799
772
  # Emits an array with the details of the scanned pattern
800
- def emit(type, token, text, ts, te)
773
+ def emit(type, token, text)
801
774
  #puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
802
775
 
803
776
  emit_literal if literal
804
777
 
778
+ # Ragel runs with byte-based indices (ts, te). These are of little value to
779
+ # end-users, so we keep track of char-based indices and emit those instead.
780
+ ts_char_pos = char_pos
781
+ te_char_pos = char_pos + text.length
782
+
805
783
  if block
806
- block.call type, token, text, ts, te
784
+ block.call type, token, text, ts_char_pos, te_char_pos
807
785
  end
808
786
 
809
- tokens << [type, token, text, ts, te]
787
+ tokens << [type, token, text, ts_char_pos, te_char_pos]
788
+
789
+ self.char_pos = te_char_pos
810
790
  end
811
791
 
812
792
  private
813
793
 
814
794
  attr_accessor :tokens, :literal, :block, :free_spacing, :spacing_stack,
815
- :group_depth, :set_depth, :conditional_stack
795
+ :group_depth, :set_depth, :conditional_stack, :char_pos
816
796
 
817
797
  def free_spacing?(input_object, options)
818
798
  if options && !input_object.is_a?(String)
@@ -835,36 +815,25 @@ class Regexp::Scanner
835
815
  end
836
816
 
837
817
  # Copy from ts to te from data as text
838
- def copy(data, range)
839
- data[range].pack('c*')
840
- end
841
-
842
- # Copy from ts to te from data as text, returning an array with the text
843
- # and the offsets used to copy it.
844
- def text(data, ts, te, soff = 0)
845
- [copy(data, ts-soff..te-1), ts-soff, te]
818
+ def copy(data, ts, te)
819
+ data[ts...te].pack('c*').force_encoding('utf-8')
846
820
  end
847
821
 
848
822
  # Appends one or more characters to the literal buffer, to be emitted later
849
- # by a call to emit_literal. Contents can be a mix of ASCII and UTF-8.
823
+ # by a call to emit_literal.
850
824
  def append_literal(data, ts, te)
851
825
  self.literal = literal || []
852
- literal << text(data, ts, te)
826
+ literal << copy(data, ts, te)
853
827
  end
854
828
 
855
- # Emits the literal run collected by calls to the append_literal method,
856
- # using the total start (ts) and end (te) offsets of the run.
829
+ # Emits the literal run collected by calls to the append_literal method.
857
830
  def emit_literal
858
- ts, te = literal.first[1], literal.last[2]
859
- text = literal.map {|t| t[0]}.join
860
-
861
- text.force_encoding('utf-8') if text.respond_to?(:force_encoding)
862
-
831
+ text = literal.join
863
832
  self.literal = nil
864
- emit(:literal, :literal, text, ts, te)
833
+ emit(:literal, :literal, text)
865
834
  end
866
835
 
867
- def emit_options(text, ts, te)
836
+ def emit_options(text)
868
837
  token = nil
869
838
 
870
839
  # Ruby allows things like '(?-xxxx)' or '(?xx-xx--xx-:abc)'.
@@ -890,14 +859,14 @@ class Regexp::Scanner
890
859
  token = :options_switch
891
860
  end
892
861
 
893
- emit(:group, token, text, ts, te)
862
+ emit(:group, token, text)
894
863
  end
895
864
 
896
865
  def emit_meta_control_sequence(data, ts, te, token)
897
866
  if data.last < 0x00 || data.last > 0x7F
898
867
  validation_error(:sequence, 'escape', token.to_s)
899
868
  end
900
- emit(:escape, token, *text(data, ts, te, 1))
869
+ emit(:escape, token, copy(data, ts-1, te))
901
870
  end
902
871
 
903
872
  # Centralizes and unifies the handling of validation related