regexp_parser 1.7.1 → 2.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +138 -0
  3. data/Gemfile +6 -1
  4. data/README.md +23 -11
  5. data/Rakefile +8 -8
  6. data/lib/regexp_parser/error.rb +4 -0
  7. data/lib/regexp_parser/expression.rb +13 -21
  8. data/lib/regexp_parser/expression/classes/backref.rb +5 -0
  9. data/lib/regexp_parser/expression/classes/conditional.rb +11 -1
  10. data/lib/regexp_parser/expression/classes/free_space.rb +2 -2
  11. data/lib/regexp_parser/expression/classes/group.rb +28 -3
  12. data/lib/regexp_parser/expression/classes/property.rb +1 -1
  13. data/lib/regexp_parser/expression/classes/root.rb +4 -16
  14. data/lib/regexp_parser/expression/classes/set/range.rb +2 -1
  15. data/lib/regexp_parser/expression/methods/match_length.rb +2 -2
  16. data/lib/regexp_parser/expression/methods/traverse.rb +2 -2
  17. data/lib/regexp_parser/expression/quantifier.rb +10 -1
  18. data/lib/regexp_parser/expression/sequence.rb +3 -19
  19. data/lib/regexp_parser/expression/subexpression.rb +1 -1
  20. data/lib/regexp_parser/lexer.rb +6 -6
  21. data/lib/regexp_parser/parser.rb +325 -344
  22. data/lib/regexp_parser/scanner.rb +1320 -1385
  23. data/lib/regexp_parser/scanner/char_type.rl +11 -11
  24. data/lib/regexp_parser/scanner/property.rl +2 -2
  25. data/lib/regexp_parser/scanner/scanner.rl +231 -253
  26. data/lib/regexp_parser/syntax.rb +8 -6
  27. data/lib/regexp_parser/syntax/any.rb +3 -3
  28. data/lib/regexp_parser/syntax/base.rb +1 -1
  29. data/lib/regexp_parser/syntax/version_lookup.rb +4 -4
  30. data/lib/regexp_parser/version.rb +1 -1
  31. data/regexp_parser.gemspec +1 -1
  32. data/spec/expression/base_spec.rb +10 -0
  33. data/spec/expression/clone_spec.rb +36 -4
  34. data/spec/expression/free_space_spec.rb +2 -2
  35. data/spec/expression/methods/match_length_spec.rb +2 -2
  36. data/spec/expression/subexpression_spec.rb +1 -1
  37. data/spec/expression/to_s_spec.rb +39 -31
  38. data/spec/lexer/literals_spec.rb +24 -49
  39. data/spec/lexer/refcalls_spec.rb +5 -0
  40. data/spec/parser/all_spec.rb +2 -2
  41. data/spec/parser/errors_spec.rb +1 -1
  42. data/spec/parser/escapes_spec.rb +1 -1
  43. data/spec/parser/options_spec.rb +28 -0
  44. data/spec/parser/quantifiers_spec.rb +16 -0
  45. data/spec/parser/refcalls_spec.rb +5 -0
  46. data/spec/parser/set/ranges_spec.rb +3 -3
  47. data/spec/scanner/escapes_spec.rb +12 -1
  48. data/spec/scanner/free_space_spec.rb +32 -0
  49. data/spec/scanner/groups_spec.rb +10 -1
  50. data/spec/scanner/literals_spec.rb +28 -38
  51. data/spec/scanner/options_spec.rb +36 -0
  52. data/spec/scanner/quantifiers_spec.rb +18 -13
  53. data/spec/scanner/refcalls_spec.rb +19 -0
  54. data/spec/scanner/sets_spec.rb +65 -16
  55. data/spec/spec_helper.rb +1 -0
  56. metadata +61 -60
  57. data/spec/expression/root_spec.rb +0 -9
  58. data/spec/expression/sequence_spec.rb +0 -9
@@ -10,17 +10,17 @@
10
10
  # --------------------------------------------------------------------------
11
11
  char_type := |*
12
12
  char_type_char {
13
- case text = text(data, ts, te, 1).first
14
- when '\d'; emit(:type, :digit, text, ts - 1, te)
15
- when '\D'; emit(:type, :nondigit, text, ts - 1, te)
16
- when '\h'; emit(:type, :hex, text, ts - 1, te)
17
- when '\H'; emit(:type, :nonhex, text, ts - 1, te)
18
- when '\s'; emit(:type, :space, text, ts - 1, te)
19
- when '\S'; emit(:type, :nonspace, text, ts - 1, te)
20
- when '\w'; emit(:type, :word, text, ts - 1, te)
21
- when '\W'; emit(:type, :nonword, text, ts - 1, te)
22
- when '\R'; emit(:type, :linebreak, text, ts - 1, te)
23
- when '\X'; emit(:type, :xgrapheme, text, ts - 1, te)
13
+ case text = copy(data, ts-1, te)
14
+ when '\d'; emit(:type, :digit, text)
15
+ when '\D'; emit(:type, :nondigit, text)
16
+ when '\h'; emit(:type, :hex, text)
17
+ when '\H'; emit(:type, :nonhex, text)
18
+ when '\s'; emit(:type, :space, text)
19
+ when '\S'; emit(:type, :nonspace, text)
20
+ when '\w'; emit(:type, :word, text)
21
+ when '\W'; emit(:type, :nonword, text)
22
+ when '\R'; emit(:type, :linebreak, text)
23
+ when '\X'; emit(:type, :xgrapheme, text)
24
24
  end
25
25
  fret;
26
26
  };
@@ -14,7 +14,7 @@
14
14
  unicode_property := |*
15
15
 
16
16
  property_sequence < eof(premature_property_end) {
17
- text = text(data, ts, te, 1).first
17
+ text = copy(data, ts-1, te)
18
18
  type = (text[1] == 'P') ^ (text[3] == '^') ? :nonproperty : :property
19
19
 
20
20
  name = data[ts+2..te-2].pack('c*').gsub(/[\^\s_\-]/, '').downcase
@@ -22,7 +22,7 @@
22
22
  token = self.class.short_prop_map[name] || self.class.long_prop_map[name]
23
23
  raise UnknownUnicodePropertyError.new(name) unless token
24
24
 
25
- self.emit(type, token.to_sym, text, ts-1, te)
25
+ self.emit(type, token.to_sym, text)
26
26
 
27
27
  fret;
28
28
  };
@@ -3,6 +3,11 @@
3
3
  include re_char_type "char_type.rl";
4
4
  include re_property "property.rl";
5
5
 
6
+ utf8_2_byte = (0xc2..0xdf 0x80..0xbf);
7
+ utf8_3_byte = (0xe0..0xef 0x80..0xbf 0x80..0xbf);
8
+ utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
9
+ utf8_multibyte = utf8_2_byte | utf8_3_byte | utf8_4_byte;
10
+
6
11
  dot = '.';
7
12
  backslash = '\\';
8
13
  alternation = '|';
@@ -15,13 +20,13 @@
15
20
 
16
21
  group_open = '(';
17
22
  group_close = ')';
18
- parantheses = group_open | group_close;
23
+ parentheses = group_open | group_close;
19
24
 
20
25
  set_open = '[';
21
26
  set_close = ']';
22
27
  brackets = set_open | set_close;
23
28
 
24
- comment = ('#' . [^\n]* . '\n');
29
+ comment = ('#' . [^\n]* . '\n'?);
25
30
 
26
31
  class_name_posix = 'alnum' | 'alpha' | 'blank' |
27
32
  'cntrl' | 'digit' | 'graph' |
@@ -32,7 +37,7 @@
32
37
  class_posix = ('[:' . '^'? . class_name_posix . ':]');
33
38
 
34
39
 
35
- # these are not supported in ruby, and need verification
40
+ # these are not supported in ruby at the moment
36
41
  collating_sequence = '[.' . (alpha | [\-])+ . '.]';
37
42
  character_equivalent = '[=' . alpha . '=]';
38
43
 
@@ -53,6 +58,8 @@
53
58
 
54
59
  meta_sequence = 'M-' . (backslash . ('c' | 'C-'))? . backslash? . any;
55
60
 
61
+ sequence_char = [CMcux];
62
+
56
63
  zero_or_one = '?' | '??' | '?+';
57
64
  zero_or_more = '*' | '*?' | '*+';
58
65
  one_or_more = '+' | '+?' | '++';
@@ -90,21 +97,26 @@
90
97
  group_options = '?' . ( [^!#'():<=>~]+ . ':'? ) ?;
91
98
 
92
99
  group_ref = [gk];
93
- group_name_char = (alnum | '_');
94
- group_name_id = (group_name_char . (group_name_char+)?)?;
95
- group_number = '-'? . [1-9] . ([0-9]+)?;
100
+ group_name_id_ab = ([^0-9\->] | utf8_multibyte) . ([^>] | utf8_multibyte)*;
101
+ group_name_id_sq = ([^0-9\-'] | utf8_multibyte) . ([^'] | utf8_multibyte)*;
102
+ group_number = '-'? . [1-9] . [0-9]*;
96
103
  group_level = [+\-] . [0-9]+;
97
104
 
98
- group_name = ('<' . group_name_id . '>') | ("'" . group_name_id . "'");
105
+ group_name = ('<' . group_name_id_ab? . '>') |
106
+ ("'" . group_name_id_sq? . "'");
99
107
  group_lookup = group_name | group_number;
100
108
 
101
109
  group_named = ('?' . group_name );
102
110
 
103
- group_name_ref = group_ref . (('<' . group_name_id . group_level? '>') |
104
- ("'" . group_name_id . group_level? "'"));
111
+ group_name_backref = 'k' . (('<' . group_name_id_ab? . group_level? '>') |
112
+ ("'" . group_name_id_sq? . group_level? "'"));
113
+ group_name_call = 'g' . (('<' . group_name_id_ab? . group_level? '>') |
114
+ ("'" . group_name_id_sq? . group_level? "'"));
105
115
 
106
- group_number_ref = group_ref . (('<' . group_number . group_level? '>') |
107
- ("'" . group_number . group_level? "'"));
116
+ group_number_backref = 'k' . (('<' . group_number . group_level? '>') |
117
+ ("'" . group_number . group_level? "'"));
118
+ group_number_call = 'g' . (('<' . ((group_number . group_level?) | '0') '>') |
119
+ ("'" . ((group_number . group_level?) | '0') "'"));
108
120
 
109
121
  group_type = group_atomic | group_passive | group_absence | group_named;
110
122
 
@@ -115,33 +127,31 @@
115
127
 
116
128
  # characters that 'break' a literal
117
129
  meta_char = dot | backslash | alternation |
118
- curlies | parantheses | brackets |
130
+ curlies | parentheses | brackets |
119
131
  line_anchor | quantifier_greedy;
120
132
 
121
133
  literal_delimiters = ']' | '}';
122
134
 
123
- ascii_print = ((0x20..0x7e) - meta_char);
135
+ ascii_print = ((0x20..0x7e) - meta_char - '#');
124
136
  ascii_nonprint = (0x01..0x1f | 0x7f);
125
137
 
126
- utf8_2_byte = (0xc2..0xdf 0x80..0xbf);
127
- utf8_3_byte = (0xe0..0xef 0x80..0xbf 0x80..0xbf);
128
- utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
129
-
130
138
  non_literal_escape = char_type_char | anchor_char | escaped_ascii |
131
- group_ref | keep_mark | [xucCM];
139
+ keep_mark | sequence_char;
140
+
141
+ # escapes that also work within a character set
142
+ set_escape = backslash | brackets | escaped_ascii | property_char |
143
+ sequence_char | single_codepoint_char_type;
132
144
 
133
- non_set_escape = (anchor_char - 'b') | group_ref | keep_mark |
134
- multi_codepoint_char_type | [0-9cCM];
135
145
 
136
146
  # EOF error, used where it can be detected
137
147
  action premature_end_error {
138
- text = ts ? copy(data, ts-1..-1) : data.pack('c*')
148
+ text = copy(data, ts ? ts-1 : 0, -1)
139
149
  raise PrematureEndError.new( text )
140
150
  }
141
151
 
142
152
  # Invalid sequence error, used from sequences, like escapes and sets
143
153
  action invalid_sequence_error {
144
- text = ts ? copy(data, ts-1..-1) : data.pack('c*')
154
+ text = copy(data, ts ? ts-1 : 0, -1)
145
155
  validation_error(:sequence, 'sequence', text)
146
156
  }
147
157
 
@@ -156,7 +166,7 @@
156
166
  # --------------------------------------------------------------------------
157
167
  character_set := |*
158
168
  set_close > (set_meta, 2) @set_closed {
159
- emit(:set, :close, *text(data, ts, te))
169
+ emit(:set, :close, copy(data, ts, te))
160
170
  if in_set?
161
171
  fret;
162
172
  else
@@ -165,8 +175,8 @@
165
175
  };
166
176
 
167
177
  '-]' @set_closed { # special case, emits two tokens
168
- emit(:literal, :literal, copy(data, ts..te-2), ts, te - 1)
169
- emit(:set, :close, copy(data, ts+1..te-1), ts + 1, te)
178
+ emit(:literal, :literal, copy(data, ts, te-1))
179
+ emit(:set, :close, copy(data, ts+1, te))
170
180
  if in_set?
171
181
  fret;
172
182
  else
@@ -175,33 +185,33 @@
175
185
  };
176
186
 
177
187
  '-&&' { # special case, emits two tokens
178
- emit(:literal, :literal, '-', ts, te)
179
- emit(:set, :intersection, '&&', ts, te)
188
+ emit(:literal, :literal, '-')
189
+ emit(:set, :intersection, '&&')
180
190
  };
181
191
 
182
192
  '^' {
183
- text = text(data, ts, te).first
193
+ text = copy(data, ts, te)
184
194
  if tokens.last[1] == :open
185
- emit(:set, :negate, text, ts, te)
195
+ emit(:set, :negate, text)
186
196
  else
187
- emit(:literal, :literal, text, ts, te)
197
+ emit(:literal, :literal, text)
188
198
  end
189
199
  };
190
200
 
191
201
  '-' {
192
- text = text(data, ts, te).first
202
+ text = copy(data, ts, te)
193
203
  # ranges can't start with a subset or intersection/negation/range operator
194
204
  if tokens.last[0] == :set
195
- emit(:literal, :literal, text, ts, te)
205
+ emit(:literal, :literal, text)
196
206
  else
197
- emit(:set, :range, text, ts, te)
207
+ emit(:set, :range, text)
198
208
  end
199
209
  };
200
210
 
201
211
  # Unlike ranges, intersections can start or end at set boundaries, whereupon
202
212
  # they match nothing: r = /[a&&]/; [r =~ ?a, r =~ ?&] # => [nil, nil]
203
213
  '&&' {
204
- emit(:set, :intersection, *text(data, ts, te))
214
+ emit(:set, :intersection, copy(data, ts, te))
205
215
  };
206
216
 
207
217
  backslash {
@@ -209,12 +219,12 @@
209
219
  };
210
220
 
211
221
  set_open >(open_bracket, 1) >set_opened {
212
- emit(:set, :open, *text(data, ts, te))
222
+ emit(:set, :open, copy(data, ts, te))
213
223
  fcall character_set;
214
224
  };
215
225
 
216
226
  class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error) {
217
- text = text(data, ts, te).first
227
+ text = copy(data, ts, te)
218
228
 
219
229
  type = :posixclass
220
230
  class_name = text[2..-3]
@@ -223,45 +233,40 @@
223
233
  type = :nonposixclass
224
234
  end
225
235
 
226
- emit(type, class_name.to_sym, text, ts, te)
236
+ emit(type, class_name.to_sym, text)
227
237
  };
228
238
 
229
- collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
230
- emit(:set, :collation, *text(data, ts, te))
231
- };
232
-
233
- character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
234
- emit(:set, :equivalent, *text(data, ts, te))
235
- };
239
+ # These are not supported in ruby at the moment. Enable them if they are.
240
+ # collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
241
+ # emit(:set, :collation, copy(data, ts, te))
242
+ # };
243
+ # character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
244
+ # emit(:set, :equivalent, copy(data, ts, te))
245
+ # };
236
246
 
237
247
  meta_char > (set_meta, 1) {
238
- emit(:literal, :literal, *text(data, ts, te))
248
+ emit(:literal, :literal, copy(data, ts, te))
239
249
  };
240
250
 
241
- any |
242
- ascii_nonprint |
243
- utf8_2_byte |
244
- utf8_3_byte |
245
- utf8_4_byte {
246
- char, *rest = *text(data, ts, te)
247
- char.force_encoding('utf-8') if char.respond_to?(:force_encoding)
248
- emit(:literal, :literal, char, *rest)
251
+ any | ascii_nonprint | utf8_multibyte {
252
+ text = copy(data, ts, te)
253
+ emit(:literal, :literal, text)
249
254
  };
250
255
  *|;
251
256
 
252
257
  # set escapes scanner
253
258
  # --------------------------------------------------------------------------
254
259
  set_escape_sequence := |*
255
- non_set_escape > (escaped_set_alpha, 2) {
256
- emit(:escape, :literal, *text(data, ts, te, 1))
257
- fret;
258
- };
259
-
260
- any > (escaped_set_alpha, 1) {
260
+ set_escape > (escaped_set_alpha, 2) {
261
261
  fhold;
262
262
  fnext character_set;
263
263
  fcall escape_sequence;
264
264
  };
265
+
266
+ any > (escaped_set_alpha, 1) {
267
+ emit(:escape, :literal, copy(data, ts-1, te))
268
+ fret;
269
+ };
265
270
  *|;
266
271
 
267
272
 
@@ -269,33 +274,33 @@
269
274
  # --------------------------------------------------------------------------
270
275
  escape_sequence := |*
271
276
  [1-9] {
272
- text = text(data, ts, te, 1).first
273
- emit(:backref, :number, text, ts-1, te)
277
+ text = copy(data, ts-1, te)
278
+ emit(:backref, :number, text)
274
279
  fret;
275
280
  };
276
281
 
277
282
  octal_sequence {
278
- emit(:escape, :octal, *text(data, ts, te, 1))
283
+ emit(:escape, :octal, copy(data, ts-1, te))
279
284
  fret;
280
285
  };
281
286
 
282
287
  meta_char {
283
- case text = text(data, ts, te, 1).first
284
- when '\.'; emit(:escape, :dot, text, ts-1, te)
285
- when '\|'; emit(:escape, :alternation, text, ts-1, te)
286
- when '\^'; emit(:escape, :bol, text, ts-1, te)
287
- when '\$'; emit(:escape, :eol, text, ts-1, te)
288
- when '\?'; emit(:escape, :zero_or_one, text, ts-1, te)
289
- when '\*'; emit(:escape, :zero_or_more, text, ts-1, te)
290
- when '\+'; emit(:escape, :one_or_more, text, ts-1, te)
291
- when '\('; emit(:escape, :group_open, text, ts-1, te)
292
- when '\)'; emit(:escape, :group_close, text, ts-1, te)
293
- when '\{'; emit(:escape, :interval_open, text, ts-1, te)
294
- when '\}'; emit(:escape, :interval_close, text, ts-1, te)
295
- when '\['; emit(:escape, :set_open, text, ts-1, te)
296
- when '\]'; emit(:escape, :set_close, text, ts-1, te)
288
+ case text = copy(data, ts-1, te)
289
+ when '\.'; emit(:escape, :dot, text)
290
+ when '\|'; emit(:escape, :alternation, text)
291
+ when '\^'; emit(:escape, :bol, text)
292
+ when '\$'; emit(:escape, :eol, text)
293
+ when '\?'; emit(:escape, :zero_or_one, text)
294
+ when '\*'; emit(:escape, :zero_or_more, text)
295
+ when '\+'; emit(:escape, :one_or_more, text)
296
+ when '\('; emit(:escape, :group_open, text)
297
+ when '\)'; emit(:escape, :group_close, text)
298
+ when '\{'; emit(:escape, :interval_open, text)
299
+ when '\}'; emit(:escape, :interval_close, text)
300
+ when '\['; emit(:escape, :set_open, text)
301
+ when '\]'; emit(:escape, :set_close, text)
297
302
  when "\\\\";
298
- emit(:escape, :backslash, text, ts-1, te)
303
+ emit(:escape, :backslash, text)
299
304
  end
300
305
  fret;
301
306
  };
@@ -303,31 +308,31 @@
303
308
  escaped_ascii > (escaped_alpha, 7) {
304
309
  # \b is emitted as backspace only when inside a character set, otherwise
305
310
  # it is a word boundary anchor. A syntax might "normalize" it if needed.
306
- case text = text(data, ts, te, 1).first
307
- when '\a'; emit(:escape, :bell, text, ts-1, te)
308
- when '\b'; emit(:escape, :backspace, text, ts-1, te)
309
- when '\e'; emit(:escape, :escape, text, ts-1, te)
310
- when '\f'; emit(:escape, :form_feed, text, ts-1, te)
311
- when '\n'; emit(:escape, :newline, text, ts-1, te)
312
- when '\r'; emit(:escape, :carriage, text, ts-1, te)
313
- when '\t'; emit(:escape, :tab, text, ts-1, te)
314
- when '\v'; emit(:escape, :vertical_tab, text, ts-1, te)
311
+ case text = copy(data, ts-1, te)
312
+ when '\a'; emit(:escape, :bell, text)
313
+ when '\b'; emit(:escape, :backspace, text)
314
+ when '\e'; emit(:escape, :escape, text)
315
+ when '\f'; emit(:escape, :form_feed, text)
316
+ when '\n'; emit(:escape, :newline, text)
317
+ when '\r'; emit(:escape, :carriage, text)
318
+ when '\t'; emit(:escape, :tab, text)
319
+ when '\v'; emit(:escape, :vertical_tab, text)
315
320
  end
316
321
  fret;
317
322
  };
318
323
 
319
324
  codepoint_sequence > (escaped_alpha, 6) $eof(premature_end_error) {
320
- text = text(data, ts, te, 1).first
325
+ text = copy(data, ts-1, te)
321
326
  if text[2].chr == '{'
322
- emit(:escape, :codepoint_list, text, ts-1, te)
327
+ emit(:escape, :codepoint_list, text)
323
328
  else
324
- emit(:escape, :codepoint, text, ts-1, te)
329
+ emit(:escape, :codepoint, text)
325
330
  end
326
331
  fret;
327
332
  };
328
333
 
329
- hex_sequence > (escaped_alpha, 5) $eof(premature_end_error) {
330
- emit(:escape, :hex, *text(data, ts, te, 1))
334
+ hex_sequence > (escaped_alpha, 5) @eof(premature_end_error) {
335
+ emit(:escape, :hex, copy(data, ts-1, te))
331
336
  fret;
332
337
  };
333
338
 
@@ -357,8 +362,8 @@
357
362
  fcall unicode_property;
358
363
  };
359
364
 
360
- (any -- non_literal_escape) > (escaped_alpha, 1) {
361
- emit(:escape, :literal, *text(data, ts, te, 1))
365
+ (any -- non_literal_escape) | utf8_multibyte > (escaped_alpha, 1) {
366
+ emit(:escape, :literal, copy(data, ts-1, te))
362
367
  fret;
363
368
  };
364
369
  *|;
@@ -368,9 +373,9 @@
368
373
  # --------------------------------------------------------------------------
369
374
  conditional_expression := |*
370
375
  group_lookup . ')' {
371
- text = text(data, ts, te-1).first
372
- emit(:conditional, :condition, text, ts, te-1)
373
- emit(:conditional, :condition_close, ')', te-1, te)
376
+ text = copy(data, ts, te-1)
377
+ emit(:conditional, :condition, text)
378
+ emit(:conditional, :condition_close, ')')
374
379
  };
375
380
 
376
381
  any {
@@ -387,39 +392,39 @@
387
392
  # Meta characters
388
393
  # ------------------------------------------------------------------------
389
394
  dot {
390
- emit(:meta, :dot, *text(data, ts, te))
395
+ emit(:meta, :dot, copy(data, ts, te))
391
396
  };
392
397
 
393
398
  alternation {
394
399
  if conditional_stack.last == group_depth
395
- emit(:conditional, :separator, *text(data, ts, te))
400
+ emit(:conditional, :separator, copy(data, ts, te))
396
401
  else
397
- emit(:meta, :alternation, *text(data, ts, te))
402
+ emit(:meta, :alternation, copy(data, ts, te))
398
403
  end
399
404
  };
400
405
 
401
406
  # Anchors
402
407
  # ------------------------------------------------------------------------
403
408
  beginning_of_line {
404
- emit(:anchor, :bol, *text(data, ts, te))
409
+ emit(:anchor, :bol, copy(data, ts, te))
405
410
  };
406
411
 
407
412
  end_of_line {
408
- emit(:anchor, :eol, *text(data, ts, te))
413
+ emit(:anchor, :eol, copy(data, ts, te))
409
414
  };
410
415
 
411
416
  backslash . keep_mark > (backslashed, 4) {
412
- emit(:keep, :mark, *text(data, ts, te))
417
+ emit(:keep, :mark, copy(data, ts, te))
413
418
  };
414
419
 
415
420
  backslash . anchor_char > (backslashed, 3) {
416
- case text = text(data, ts, te).first
417
- when '\\A'; emit(:anchor, :bos, text, ts, te)
418
- when '\\z'; emit(:anchor, :eos, text, ts, te)
419
- when '\\Z'; emit(:anchor, :eos_ob_eol, text, ts, te)
420
- when '\\b'; emit(:anchor, :word_boundary, text, ts, te)
421
- when '\\B'; emit(:anchor, :nonword_boundary, text, ts, te)
422
- when '\\G'; emit(:anchor, :match_start, text, ts, te)
421
+ case text = copy(data, ts, te)
422
+ when '\\A'; emit(:anchor, :bos, text)
423
+ when '\\z'; emit(:anchor, :eos, text)
424
+ when '\\Z'; emit(:anchor, :eos_ob_eol, text)
425
+ when '\\b'; emit(:anchor, :word_boundary, text)
426
+ when '\\B'; emit(:anchor, :nonword_boundary, text)
427
+ when '\\G'; emit(:anchor, :match_start, text)
423
428
  end
424
429
  };
425
430
 
@@ -430,7 +435,7 @@
430
435
  # Character sets
431
436
  # ------------------------------------------------------------------------
432
437
  set_open >set_opened {
433
- emit(:set, :open, *text(data, ts, te))
438
+ emit(:set, :open, copy(data, ts, te))
434
439
  fcall character_set;
435
440
  };
436
441
 
@@ -439,12 +444,12 @@
439
444
  # (?(condition)Y|N) conditional expression
440
445
  # ------------------------------------------------------------------------
441
446
  conditional {
442
- text = text(data, ts, te).first
447
+ text = copy(data, ts, te)
443
448
 
444
449
  conditional_stack << group_depth
445
450
 
446
- emit(:conditional, :open, text[0..-2], ts, te-1)
447
- emit(:conditional, :condition_open, '(', te-1, te)
451
+ emit(:conditional, :open, text[0..-2])
452
+ emit(:conditional, :condition_open, '(')
448
453
  fcall conditional_expression;
449
454
  };
450
455
 
@@ -455,7 +460,7 @@
455
460
  # correct closing count.
456
461
  # ------------------------------------------------------------------------
457
462
  group_open . group_comment $group_closed {
458
- emit(:group, :comment, *text(data, ts, te))
463
+ emit(:group, :comment, copy(data, ts, te))
459
464
  };
460
465
 
461
466
  # Expression options:
@@ -470,11 +475,11 @@
470
475
  # (?imxdau-imx:subexp) option on/off for subexp
471
476
  # ------------------------------------------------------------------------
472
477
  group_open . group_options >group_opened {
473
- text = text(data, ts, te).first
478
+ text = copy(data, ts, te)
474
479
  if text[2..-1] =~ /([^\-mixdau:]|^$)|-.*([dau])/
475
480
  raise InvalidGroupOption.new($1 || "-#{$2}", text)
476
481
  end
477
- emit_options(text, ts, te)
482
+ emit_options(text)
478
483
  };
479
484
 
480
485
  # Assertions
@@ -484,11 +489,11 @@
484
489
  # (?<!subexp) negative look-behind
485
490
  # ------------------------------------------------------------------------
486
491
  group_open . assertion_type >group_opened {
487
- case text = text(data, ts, te).first
488
- when '(?='; emit(:assertion, :lookahead, text, ts, te)
489
- when '(?!'; emit(:assertion, :nlookahead, text, ts, te)
490
- when '(?<='; emit(:assertion, :lookbehind, text, ts, te)
491
- when '(?<!'; emit(:assertion, :nlookbehind, text, ts, te)
492
+ case text = copy(data, ts, te)
493
+ when '(?='; emit(:assertion, :lookahead, text)
494
+ when '(?!'; emit(:assertion, :nlookahead, text)
495
+ when '(?<='; emit(:assertion, :lookbehind, text)
496
+ when '(?<!'; emit(:assertion, :nlookbehind, text)
492
497
  end
493
498
  };
494
499
 
@@ -501,32 +506,32 @@
501
506
  # (subexp) captured group
502
507
  # ------------------------------------------------------------------------
503
508
  group_open . group_type >group_opened {
504
- case text = text(data, ts, te).first
505
- when '(?:'; emit(:group, :passive, text, ts, te)
506
- when '(?>'; emit(:group, :atomic, text, ts, te)
507
- when '(?~'; emit(:group, :absence, text, ts, te)
509
+ case text = copy(data, ts, te)
510
+ when '(?:'; emit(:group, :passive, text)
511
+ when '(?>'; emit(:group, :atomic, text)
512
+ when '(?~'; emit(:group, :absence, text)
508
513
 
509
514
  when /^\(\?(?:<>|'')/
510
515
  validation_error(:group, 'named group', 'name is empty')
511
516
 
512
- when /^\(\?<\w*>/
513
- emit(:group, :named_ab, text, ts, te)
517
+ when /^\(\?<[^>]+>/
518
+ emit(:group, :named_ab, text)
514
519
 
515
- when /^\(\?'\w*'/
516
- emit(:group, :named_sq, text, ts, te)
520
+ when /^\(\?'[^']+'/
521
+ emit(:group, :named_sq, text)
517
522
 
518
523
  end
519
524
  };
520
525
 
521
526
  group_open @group_opened {
522
- text = text(data, ts, te).first
523
- emit(:group, :capture, text, ts, te)
527
+ text = copy(data, ts, te)
528
+ emit(:group, :capture, text)
524
529
  };
525
530
 
526
531
  group_close @group_closed {
527
532
  if conditional_stack.last == group_depth + 1
528
533
  conditional_stack.pop
529
- emit(:conditional, :close, *text(data, ts, te))
534
+ emit(:conditional, :close, copy(data, ts, te))
530
535
  else
531
536
  if spacing_stack.length > 1 &&
532
537
  spacing_stack.last[:depth] == group_depth + 1
@@ -534,72 +539,42 @@
534
539
  self.free_spacing = spacing_stack.last[:free_spacing]
535
540
  end
536
541
 
537
- emit(:group, :close, *text(data, ts, te))
542
+ emit(:group, :close, copy(data, ts, te))
538
543
  end
539
544
  };
540
545
 
541
546
 
542
547
  # Group backreference, named and numbered
543
548
  # ------------------------------------------------------------------------
544
- backslash . (group_name_ref | group_number_ref) > (backslashed, 4) {
545
- case text = text(data, ts, te).first
546
- when /^\\([gk])(<>|'')/ # angle brackets
547
- validation_error(:backref, 'ref/call', 'ref ID is empty')
548
-
549
- when /^\\([gk])<[^\d+-]\w*>/ # angle-brackets
550
- if $1 == 'k'
551
- emit(:backref, :name_ref_ab, text, ts, te)
552
- else
553
- emit(:backref, :name_call_ab, text, ts, te)
554
- end
555
-
556
- when /^\\([gk])'[^\d+-]\w*'/ #single quotes
557
- if $1 == 'k'
558
- emit(:backref, :name_ref_sq, text, ts, te)
559
- else
560
- emit(:backref, :name_call_sq, text, ts, te)
561
- end
562
-
563
- when /^\\([gk])<\d+>/ # angle-brackets
564
- if $1 == 'k'
565
- emit(:backref, :number_ref_ab, text, ts, te)
566
- else
567
- emit(:backref, :number_call_ab, text, ts, te)
568
- end
569
-
570
- when /^\\([gk])'\d+'/ # single quotes
571
- if $1 == 'k'
572
- emit(:backref, :number_ref_sq, text, ts, te)
573
- else
574
- emit(:backref, :number_call_sq, text, ts, te)
575
- end
576
-
577
- when /^\\(?:g<\+|g<-|(k)<-)\d+>/ # angle-brackets
578
- if $1 == 'k'
579
- emit(:backref, :number_rel_ref_ab, text, ts, te)
580
- else
581
- emit(:backref, :number_rel_call_ab, text, ts, te)
582
- end
583
-
584
- when /^\\(?:g'\+|g'-|(k)'-)\d+'/ # single quotes
585
- if $1 == 'k'
586
- emit(:backref, :number_rel_ref_sq, text, ts, te)
587
- else
588
- emit(:backref, :number_rel_call_sq, text, ts, te)
589
- end
590
-
591
- when /^\\k<[^\d+\-]\w*[+\-]\d+>/ # angle-brackets
592
- emit(:backref, :name_recursion_ref_ab, text, ts, te)
593
-
594
- when /^\\k'[^\d+\-]\w*[+\-]\d+'/ # single-quotes
595
- emit(:backref, :name_recursion_ref_sq, text, ts, te)
596
-
597
- when /^\\([gk])<[+\-]?\d+[+\-]\d+>/ # angle-brackets
598
- emit(:backref, :number_recursion_ref_ab, text, ts, te)
599
-
600
- when /^\\([gk])'[+\-]?\d+[+\-]\d+'/ # single-quotes
601
- emit(:backref, :number_recursion_ref_sq, text, ts, te)
549
+ backslash . (group_name_backref | group_number_backref) > (backslashed, 4) {
550
+ case text = copy(data, ts, te)
551
+ when /^\\k(<>|'')/
552
+ validation_error(:backref, 'backreference', 'ref ID is empty')
553
+ when /^\\k(.)[^\p{digit}\-][^+\-]*\D$/
554
+ emit(:backref, $1 == '<' ? :name_ref_ab : :name_ref_sq, text)
555
+ when /^\\k(.)\d+\D$/
556
+ emit(:backref, $1 == '<' ? :number_ref_ab : :number_ref_sq, text)
557
+ when /^\\k(.)-\d+\D$/
558
+ emit(:backref, $1 == '<' ? :number_rel_ref_ab : :number_rel_ref_sq, text)
559
+ when /^\\k(.)[^\p{digit}\-].*[+\-]\d+\D$/
560
+ emit(:backref, $1 == '<' ? :name_recursion_ref_ab : :name_recursion_ref_sq, text)
561
+ when /^\\k(.)-?\d+[+\-]\d+\D$/
562
+ emit(:backref, $1 == '<' ? :number_recursion_ref_ab : :number_recursion_ref_sq, text)
563
+ end
564
+ };
602
565
 
566
+ # Group call, named and numbered
567
+ # ------------------------------------------------------------------------
568
+ backslash . (group_name_call | group_number_call) > (backslashed, 4) {
569
+ case text = copy(data, ts, te)
570
+ when /^\\g(<>|'')/
571
+ validation_error(:backref, 'subexpression call', 'ref ID is empty')
572
+ when /^\\g(.)[^\p{digit}+\->][^+\-]*/
573
+ emit(:backref, $1 == '<' ? :name_call_ab : :name_call_sq, text)
574
+ when /^\\g(.)\d+\D$/
575
+ emit(:backref, $1 == '<' ? :number_call_ab : :number_call_sq, text)
576
+ when /^\\g(.)[+-]\d+/
577
+ emit(:backref, $1 == '<' ? :number_rel_call_ab : :number_rel_call_sq, text)
603
578
  end
604
579
  };
605
580
 
@@ -607,31 +582,31 @@
607
582
  # Quantifiers
608
583
  # ------------------------------------------------------------------------
609
584
  zero_or_one {
610
- case text = text(data, ts, te).first
611
- when '?' ; emit(:quantifier, :zero_or_one, text, ts, te)
612
- when '??'; emit(:quantifier, :zero_or_one_reluctant, text, ts, te)
613
- when '?+'; emit(:quantifier, :zero_or_one_possessive, text, ts, te)
585
+ case text = copy(data, ts, te)
586
+ when '?' ; emit(:quantifier, :zero_or_one, text)
587
+ when '??'; emit(:quantifier, :zero_or_one_reluctant, text)
588
+ when '?+'; emit(:quantifier, :zero_or_one_possessive, text)
614
589
  end
615
590
  };
616
591
 
617
592
  zero_or_more {
618
- case text = text(data, ts, te).first
619
- when '*' ; emit(:quantifier, :zero_or_more, text, ts, te)
620
- when '*?'; emit(:quantifier, :zero_or_more_reluctant, text, ts, te)
621
- when '*+'; emit(:quantifier, :zero_or_more_possessive, text, ts, te)
593
+ case text = copy(data, ts, te)
594
+ when '*' ; emit(:quantifier, :zero_or_more, text)
595
+ when '*?'; emit(:quantifier, :zero_or_more_reluctant, text)
596
+ when '*+'; emit(:quantifier, :zero_or_more_possessive, text)
622
597
  end
623
598
  };
624
599
 
625
600
  one_or_more {
626
- case text = text(data, ts, te).first
627
- when '+' ; emit(:quantifier, :one_or_more, text, ts, te)
628
- when '+?'; emit(:quantifier, :one_or_more_reluctant, text, ts, te)
629
- when '++'; emit(:quantifier, :one_or_more_possessive, text, ts, te)
601
+ case text = copy(data, ts, te)
602
+ when '+' ; emit(:quantifier, :one_or_more, text)
603
+ when '+?'; emit(:quantifier, :one_or_more_reluctant, text)
604
+ when '++'; emit(:quantifier, :one_or_more_possessive, text)
630
605
  end
631
606
  };
632
607
 
633
608
  quantifier_interval {
634
- emit(:quantifier, :interval, *text(data, ts, te))
609
+ emit(:quantifier, :interval, copy(data, ts, te))
635
610
  };
636
611
 
637
612
  # Catch unmatched curly braces as literals
@@ -647,15 +622,17 @@
647
622
 
648
623
  comment {
649
624
  if free_spacing
650
- emit(:free_space, :comment, *text(data, ts, te))
625
+ emit(:free_space, :comment, copy(data, ts, te))
651
626
  else
652
- append_literal(data, ts, te)
627
+ # consume only the pound sign (#) and backtrack to do regular scanning
628
+ append_literal(data, ts, ts + 1)
629
+ fexec ts + 1;
653
630
  end
654
631
  };
655
632
 
656
633
  space+ {
657
634
  if free_spacing
658
- emit(:free_space, :whitespace, *text(data, ts, te))
635
+ emit(:free_space, :whitespace, copy(data, ts, te))
659
636
  else
660
637
  append_literal(data, ts, te)
661
638
  end
@@ -664,11 +641,7 @@
664
641
  # Literal: any run of ASCII (printable or non-printable), and/or UTF-8,
665
642
  # except meta characters.
666
643
  # ------------------------------------------------------------------------
667
- (ascii_print -- space)+ |
668
- ascii_nonprint+ |
669
- utf8_2_byte+ |
670
- utf8_3_byte+ |
671
- utf8_4_byte+ {
644
+ (ascii_print -- space)+ | ascii_nonprint+ | utf8_multibyte+ {
672
645
  append_literal(data, ts, te)
673
646
  };
674
647
 
@@ -678,12 +651,14 @@
678
651
  # THIS IS A GENERATED FILE, DO NOT EDIT DIRECTLY
679
652
  # This file was generated from lib/regexp_parser/scanner/scanner.rl
680
653
 
654
+ require 'regexp_parser/error'
655
+
681
656
  class Regexp::Scanner
682
657
  # General scanner error (catch all)
683
- class ScannerError < StandardError; end
658
+ class ScannerError < Regexp::Parser::Error; end
684
659
 
685
660
  # Base for all scanner validation errors
686
- class ValidationError < StandardError
661
+ class ValidationError < Regexp::Parser::Error
687
662
  def initialize(reason)
688
663
  super reason
689
664
  end
@@ -737,21 +712,16 @@ class Regexp::Scanner
737
712
  #
738
713
  # This method may raise errors if a syntax error is encountered.
739
714
  # --------------------------------------------------------------------------
740
- def self.scan(input_object, &block)
741
- new.scan(input_object, &block)
715
+ def self.scan(input_object, options: nil, &block)
716
+ new.scan(input_object, options: options, &block)
742
717
  end
743
718
 
744
- def scan(input_object, &block)
719
+ def scan(input_object, options: nil, &block)
745
720
  self.literal = nil
746
721
  stack = []
747
722
 
748
- if input_object.is_a?(Regexp)
749
- input = input_object.source
750
- self.free_spacing = (input_object.options & Regexp::EXTENDED != 0)
751
- else
752
- input = input_object
753
- self.free_spacing = false
754
- end
723
+ input = input_object.is_a?(Regexp) ? input_object.source : input_object
724
+ self.free_spacing = free_spacing?(input_object, options)
755
725
  self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
756
726
 
757
727
  data = input.unpack("c*") if input.is_a?(String)
@@ -763,6 +733,7 @@ class Regexp::Scanner
763
733
  self.set_depth = 0
764
734
  self.group_depth = 0
765
735
  self.conditional_stack = []
736
+ self.char_pos = 0
766
737
 
767
738
  %% write data;
768
739
  %% write init;
@@ -772,7 +743,7 @@ class Regexp::Scanner
772
743
  testEof = testEof
773
744
 
774
745
  if cs == re_scanner_error
775
- text = ts ? copy(data, ts-1..-1) : data.pack('c*')
746
+ text = copy(data, ts ? ts-1 : 0, -1)
776
747
  raise ScannerError.new("Scan error at '#{text}'")
777
748
  end
778
749
 
@@ -789,33 +760,51 @@ class Regexp::Scanner
789
760
 
790
761
  # lazy-load property maps when first needed
791
762
  require 'yaml'
792
- PROP_MAPS_DIR = File.expand_path('../scanner/properties', __FILE__)
793
763
 
794
764
  def self.short_prop_map
795
- @short_prop_map ||= YAML.load_file("#{PROP_MAPS_DIR}/short.yml")
765
+ @short_prop_map ||= YAML.load_file("#{__dir__}/scanner/properties/short.yml")
796
766
  end
797
767
 
798
768
  def self.long_prop_map
799
- @long_prop_map ||= YAML.load_file("#{PROP_MAPS_DIR}/long.yml")
769
+ @long_prop_map ||= YAML.load_file("#{__dir__}/scanner/properties/long.yml")
800
770
  end
801
771
 
802
772
  # Emits an array with the details of the scanned pattern
803
- def emit(type, token, text, ts, te)
773
+ def emit(type, token, text)
804
774
  #puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
805
775
 
806
776
  emit_literal if literal
807
777
 
778
+ # Ragel runs with byte-based indices (ts, te). These are of little value to
779
+ # end-users, so we keep track of char-based indices and emit those instead.
780
+ ts_char_pos = char_pos
781
+ te_char_pos = char_pos + text.length
782
+
808
783
  if block
809
- block.call type, token, text, ts, te
784
+ block.call type, token, text, ts_char_pos, te_char_pos
810
785
  end
811
786
 
812
- tokens << [type, token, text, ts, te]
787
+ tokens << [type, token, text, ts_char_pos, te_char_pos]
788
+
789
+ self.char_pos = te_char_pos
813
790
  end
814
791
 
815
792
  private
816
793
 
817
794
  attr_accessor :tokens, :literal, :block, :free_spacing, :spacing_stack,
818
- :group_depth, :set_depth, :conditional_stack
795
+ :group_depth, :set_depth, :conditional_stack, :char_pos
796
+
797
+ def free_spacing?(input_object, options)
798
+ if options && !input_object.is_a?(String)
799
+ raise ArgumentError, 'options cannot be supplied unless scanning a String'
800
+ end
801
+
802
+ options = input_object.options if input_object.is_a?(::Regexp)
803
+
804
+ return false unless options
805
+
806
+ options & Regexp::EXTENDED != 0
807
+ end
819
808
 
820
809
  def in_group?
821
810
  group_depth > 0
@@ -826,36 +815,25 @@ class Regexp::Scanner
826
815
  end
827
816
 
828
817
  # Copy from ts to te from data as text
829
- def copy(data, range)
830
- data[range].pack('c*')
831
- end
832
-
833
- # Copy from ts to te from data as text, returning an array with the text
834
- # and the offsets used to copy it.
835
- def text(data, ts, te, soff = 0)
836
- [copy(data, ts-soff..te-1), ts-soff, te]
818
+ def copy(data, ts, te)
819
+ data[ts...te].pack('c*').force_encoding('utf-8')
837
820
  end
838
821
 
839
822
  # Appends one or more characters to the literal buffer, to be emitted later
840
- # by a call to emit_literal. Contents can be a mix of ASCII and UTF-8.
823
+ # by a call to emit_literal.
841
824
  def append_literal(data, ts, te)
842
825
  self.literal = literal || []
843
- literal << text(data, ts, te)
826
+ literal << copy(data, ts, te)
844
827
  end
845
828
 
846
- # Emits the literal run collected by calls to the append_literal method,
847
- # using the total start (ts) and end (te) offsets of the run.
829
+ # Emits the literal run collected by calls to the append_literal method.
848
830
  def emit_literal
849
- ts, te = literal.first[1], literal.last[2]
850
- text = literal.map {|t| t[0]}.join
851
-
852
- text.force_encoding('utf-8') if text.respond_to?(:force_encoding)
853
-
831
+ text = literal.join
854
832
  self.literal = nil
855
- emit(:literal, :literal, text, ts, te)
833
+ emit(:literal, :literal, text)
856
834
  end
857
835
 
858
- def emit_options(text, ts, te)
836
+ def emit_options(text)
859
837
  token = nil
860
838
 
861
839
  # Ruby allows things like '(?-xxxx)' or '(?xx-xx--xx-:abc)'.
@@ -881,14 +859,14 @@ class Regexp::Scanner
881
859
  token = :options_switch
882
860
  end
883
861
 
884
- emit(:group, token, text, ts, te)
862
+ emit(:group, token, text)
885
863
  end
886
864
 
887
865
  def emit_meta_control_sequence(data, ts, te, token)
888
866
  if data.last < 0x00 || data.last > 0x7F
889
867
  validation_error(:sequence, 'escape', token.to_s)
890
868
  end
891
- emit(:escape, token, *text(data, ts, te, 1))
869
+ emit(:escape, token, copy(data, ts-1, te))
892
870
  end
893
871
 
894
872
  # Centralizes and unifies the handling of validation related