regexp_parser 0.1.1 → 0.1.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (61) hide show
  1. checksums.yaml +7 -0
  2. data/ChangeLog +45 -0
  3. data/Rakefile +12 -44
  4. data/VERSION.yml +5 -0
  5. data/lib/regexp_parser.rb +5 -38
  6. data/lib/regexp_parser/expression.rb +68 -221
  7. data/lib/regexp_parser/expression/classes/alternation.rb +47 -0
  8. data/lib/regexp_parser/expression/classes/anchor.rb +26 -0
  9. data/lib/regexp_parser/expression/classes/backref.rb +42 -0
  10. data/lib/regexp_parser/expression/classes/escape.rb +27 -0
  11. data/lib/regexp_parser/expression/classes/group.rb +67 -0
  12. data/lib/regexp_parser/expression/classes/literal.rb +7 -0
  13. data/lib/regexp_parser/expression/{property.rb → classes/property.rb} +1 -1
  14. data/lib/regexp_parser/expression/classes/root.rb +26 -0
  15. data/lib/regexp_parser/expression/classes/set.rb +100 -0
  16. data/lib/regexp_parser/expression/classes/type.rb +17 -0
  17. data/lib/regexp_parser/expression/quantifier.rb +26 -0
  18. data/lib/regexp_parser/expression/subexpression.rb +69 -0
  19. data/lib/regexp_parser/lexer.rb +4 -4
  20. data/lib/regexp_parser/parser.rb +31 -13
  21. data/lib/regexp_parser/scanner.rb +1849 -1488
  22. data/lib/regexp_parser/scanner/property.rl +7 -2
  23. data/lib/regexp_parser/scanner/scanner.rl +377 -191
  24. data/lib/regexp_parser/syntax.rb +7 -0
  25. data/lib/regexp_parser/syntax/ruby/1.8.6.rb +4 -4
  26. data/lib/regexp_parser/syntax/ruby/1.9.1.rb +9 -9
  27. data/lib/regexp_parser/syntax/ruby/2.0.0.rb +16 -0
  28. data/lib/regexp_parser/syntax/ruby/2.1.0.rb +13 -0
  29. data/lib/regexp_parser/syntax/tokens.rb +21 -320
  30. data/lib/regexp_parser/syntax/tokens/anchor.rb +17 -0
  31. data/lib/regexp_parser/syntax/tokens/assertion.rb +15 -0
  32. data/lib/regexp_parser/syntax/tokens/backref.rb +26 -0
  33. data/lib/regexp_parser/syntax/tokens/character_set.rb +48 -0
  34. data/lib/regexp_parser/syntax/tokens/character_type.rb +16 -0
  35. data/lib/regexp_parser/syntax/tokens/escape.rb +29 -0
  36. data/lib/regexp_parser/syntax/tokens/group.rb +22 -0
  37. data/lib/regexp_parser/syntax/tokens/meta.rb +15 -0
  38. data/lib/regexp_parser/syntax/tokens/quantifier.rb +37 -0
  39. data/lib/regexp_parser/syntax/tokens/unicode_property.rb +204 -0
  40. data/lib/regexp_parser/token.rb +37 -0
  41. data/test/expression/test_all.rb +7 -0
  42. data/test/expression/test_base.rb +72 -0
  43. data/test/expression/test_clone.rb +144 -0
  44. data/test/{parser/test_expression.rb → expression/test_to_s.rb} +10 -10
  45. data/test/helpers.rb +1 -0
  46. data/test/parser/test_all.rb +1 -1
  47. data/test/parser/test_alternation.rb +35 -0
  48. data/test/parser/test_anchors.rb +2 -2
  49. data/test/parser/test_refcalls.rb +1 -1
  50. data/test/parser/test_sets.rb +54 -8
  51. data/test/scanner/test_anchors.rb +2 -2
  52. data/test/scanner/test_conditionals.rb +31 -0
  53. data/test/scanner/test_errors.rb +88 -8
  54. data/test/scanner/test_escapes.rb +4 -4
  55. data/test/scanner/test_groups.rb +7 -0
  56. data/test/scanner/test_quoting.rb +29 -0
  57. data/test/scanner/test_sets.rb +1 -0
  58. data/test/syntax/ruby/test_1.8.rb +3 -3
  59. data/test/test_all.rb +1 -1
  60. metadata +62 -48
  61. data/lib/regexp_parser/expression/set.rb +0 -59
@@ -58,7 +58,7 @@
58
58
  unicode_property := |*
59
59
 
60
60
  property_sequence < eof(premature_property_end) {
61
- text = data[ts-1..te-1].pack('c*')
61
+ text = text(data, ts, te, 1).first
62
62
  if in_set
63
63
  type = :set
64
64
  else
@@ -525,9 +525,14 @@
525
525
  self.emit(type, :script_unknown, text, ts-1, te)
526
526
 
527
527
  else
528
- raise UnknownUnicodePropertyError.new(name)
528
+ # Should this really be an error? Or would emitting
529
+ # an :unknown for the property be better?
530
+ #
531
+ # self.emit(type, :unknown, text, ts-1, te)
529
532
 
533
+ raise UnknownUnicodePropertyError.new(name)
530
534
  end
535
+
531
536
  fret;
532
537
  };
533
538
  *|;
@@ -28,6 +28,7 @@
28
28
 
29
29
  class_posix = ('[:' . '^'? . class_name_posix . ':]');
30
30
 
31
+
31
32
  # these are not supported in ruby, and need verification
32
33
  collating_sequence = '[.' . (alpha | [\-])+ . '.]';
33
34
  character_equivalent = '[=' . alpha . '=]';
@@ -41,14 +42,21 @@
41
42
  octal_sequence = [0-7]{1,3};
42
43
 
43
44
  hex_sequence = 'x' . xdigit{1,2};
45
+ hex_sequence_err = 'x' . [^0-9a-fA-F{];
44
46
  wide_hex_sequence = 'x' . '{' . xdigit{1,8} . '}';
45
47
 
48
+ hex_or_not = (xdigit|[^0-9a-fA-F}]); # note closing curly at end
49
+
50
+ wide_hex_seq_invalid = 'x' . '{' . hex_or_not{1,9};
51
+ wide_hex_seq_empty = 'x' . '{' . (space+)? . '}';
52
+
46
53
  codepoint_single = 'u' . xdigit{4};
47
54
  codepoint_list = 'u{' . (xdigit{4} . space?)+'}';
48
55
  codepoint_sequence = codepoint_single | codepoint_list;
49
56
 
50
- control_sequence = ('c' | 'C-') . alpha;
51
- meta_sequence = 'M-' . ((backslash . control_sequence) | alpha);
57
+ control_sequence = ('c' | 'C-');
58
+
59
+ meta_sequence = 'M-' . (backslash . control_sequence)?;
52
60
 
53
61
  zero_or_one = '?' | '??' | '?+';
54
62
  zero_or_more = '*' | '*?' | '*+';
@@ -59,11 +67,11 @@
59
67
  quantifier_possessive = '?+' | '*+' | '++';
60
68
  quantifier_mode = '?' | '+';
61
69
 
62
- quantifier_range = range_open . (digit+)? . ','? . (digit+)? .
70
+ quantifier_interval = range_open . (digit+)? . ','? . (digit+)? .
63
71
  range_close . quantifier_mode?;
64
72
 
65
73
  quantifiers = quantifier_greedy | quantifier_reluctant |
66
- quantifier_possessive | quantifier_range;
74
+ quantifier_possessive | quantifier_interval;
67
75
 
68
76
 
69
77
  group_comment = '?#' . [^)]+ . group_close;
@@ -76,10 +84,10 @@
76
84
  assertion_lookbehind = '?<=';
77
85
  assertion_nlookbehind = '?<!';
78
86
 
79
- group_options = '?' . ([mix]{1,3})? . '-'? . ([mix]{1,3})?;
87
+ group_options = '?' . [\-mix];
80
88
 
81
89
  group_ref = [gk];
82
- group_name = alpha . (alnum+)?;
90
+ group_name = (alnum . (alnum+)?)?;
83
91
  group_number = '-'? . [1-9] . ([0-9]+)?;
84
92
  group_level = [+\-] . [0-9]+;
85
93
 
@@ -113,7 +121,16 @@
113
121
  group_ref | [xucCM];
114
122
 
115
123
  # EOF error, used where it can be detected
116
- action premature_end_error { raise PrematureEndError }
124
+ action premature_end_error {
125
+ text = ts ? copy(data, ts-1..-1) : data.pack('c*')
126
+ raise PrematureEndError.new( text )
127
+ }
128
+
129
+ # Invalid sequence error, used from sequences, like escapes and sets
130
+ action invalid_sequence_error {
131
+ text = ts ? copy(data, ts-1..-1) : data.pack('c*')
132
+ raise InvalidSequenceError.new('sequence', text)
133
+ }
117
134
 
118
135
  # group (nesting) and set open/close actions
119
136
  action group_opened { group_depth += 1; in_group = true }
@@ -127,7 +144,7 @@
127
144
  set_type = set_depth > 1 ? :subset : :set
128
145
  set_depth -= 1; in_set = set_depth > 0 ? true : false
129
146
 
130
- self.emit(set_type, :close, data[ts..te-1].pack('c*'), ts, te)
147
+ emit(set_type, :close, *text(data, ts, te))
131
148
 
132
149
  if set_depth == 0
133
150
  fgoto main;
@@ -140,8 +157,8 @@
140
157
  set_type = set_depth > 1 ? :subset : :set
141
158
  set_depth -= 1; in_set = set_depth > 0 ? true : false
142
159
 
143
- self.emit(set_type, :member, data[ts..te-2].pack('c*'), ts, te)
144
- self.emit(set_type, :close, data[ts+1..te-1].pack('c*'), ts, te)
160
+ emit(set_type, :member, copy(data, ts..te-2), ts, te)
161
+ emit(set_type, :close, copy(data, ts+1..te-1), ts, te)
145
162
 
146
163
  if set_depth == 0
147
164
  fgoto main;
@@ -151,20 +168,20 @@
151
168
  };
152
169
 
153
170
  '^' {
154
- text = data[ts..te-1].pack('c*')
171
+ text = text(data, ts, te).first
155
172
  if @tokens.last[1] == :open
156
- self.emit(set_type, :negate, text, ts, te)
173
+ emit(set_type, :negate, text, ts, te)
157
174
  else
158
- self.emit(set_type, :member, text, ts, te)
175
+ emit(set_type, :member, text, ts, te)
159
176
  end
160
177
  };
161
178
 
162
179
  alnum . '-' . alnum {
163
- self.emit(set_type, :range, data[ts..te-1].pack('c*'), ts, te)
180
+ emit(set_type, :range, *text(data, ts, te))
164
181
  };
165
182
 
166
183
  '&&' {
167
- self.emit(set_type, :intersection, data[ts..te-1].pack('c*'), ts, te)
184
+ emit(set_type, :intersection, *text(data, ts, te))
168
185
  };
169
186
 
170
187
  '\\' {
@@ -175,12 +192,12 @@
175
192
  set_depth += 1; in_set = true
176
193
  set_type = set_depth > 1 ? :subset : :set
177
194
 
178
- self.emit(set_type, :open, data[ts..te-1].pack('c*'), ts, te)
195
+ emit(set_type, :open, *text(data, ts, te))
179
196
  fcall character_set;
180
197
  };
181
198
 
182
199
  class_posix >(open_bracket, 1) @eof(premature_end_error) {
183
- text = data[ts..te-1].pack('c*')
200
+ text = text(data, ts, te).first
184
201
 
185
202
  class_name = text[2..-3]
186
203
  if class_name[0].chr == '^'
@@ -188,21 +205,21 @@
188
205
  end
189
206
 
190
207
  token_sym = "class_#{class_name}".to_sym
191
- self.emit(set_type, token_sym, text, ts, te)
208
+ emit(set_type, token_sym, text, ts, te)
192
209
  };
193
210
 
194
211
  collating_sequence >(open_bracket, 1) @eof(premature_end_error) {
195
- self.emit(set_type, :collation, data[ts..te-1].pack('c*'), ts, te)
212
+ emit(set_type, :collation, *text(data, ts, te))
196
213
  };
197
214
 
198
215
  character_equivalent >(open_bracket, 1) @eof(premature_end_error) {
199
- self.emit(set_type, :equivalent, data[ts..te-1].pack('c*'), ts, te)
216
+ emit(set_type, :equivalent, *text(data, ts, te))
200
217
  };
201
218
 
202
219
  # exclude the closing bracket as a cleaner workaround for dealing with the
203
220
  # ambiguity caused upon exit from the unicode properties machine
204
221
  meta_char -- ']' {
205
- self.emit(set_type, :member, data[ts..te-1].pack('c*'), ts, te)
222
+ emit(set_type, :member, *text(data, ts, te))
206
223
  };
207
224
 
208
225
  any |
@@ -210,48 +227,48 @@
210
227
  utf8_2_byte |
211
228
  utf8_3_byte |
212
229
  utf8_4_byte {
213
- self.emit(set_type, :member, data[ts..te-1].pack('c*'), ts, te)
230
+ emit(set_type, :member, *text(data, ts, te))
214
231
  };
215
232
  *|;
216
233
 
217
234
  # set escapes scanner
218
235
  # --------------------------------------------------------------------------
219
236
  set_escape_sequence := |*
220
- 'b' {
221
- self.emit(set_type, :backspace, data[ts-1..te-1].pack('c*'), ts-1, te)
237
+ 'b' > (escaped_set_alpha, 2) {
238
+ emit(set_type, :backspace, *text(data, ts, te, 1))
222
239
  fret;
223
240
  };
224
241
 
225
242
  char_type {
226
- case text = data[ts-1..te-1].pack('c*')
227
- when '\d'; self.emit(set_type, :type_digit, text, ts-1, te)
228
- when '\D'; self.emit(set_type, :type_nondigit, text, ts-1, te)
229
- when '\h'; self.emit(set_type, :type_hex, text, ts-1, te)
230
- when '\H'; self.emit(set_type, :type_nonhex, text, ts-1, te)
231
- when '\s'; self.emit(set_type, :type_space, text, ts-1, te)
232
- when '\S'; self.emit(set_type, :type_nonspace, text, ts-1, te)
233
- when '\w'; self.emit(set_type, :type_word, text, ts-1, te)
234
- when '\W'; self.emit(set_type, :type_nonword, text, ts-1, te)
243
+ case text = text(data, ts, te, 1).first
244
+ when '\d'; emit(set_type, :type_digit, text, ts-1, te)
245
+ when '\D'; emit(set_type, :type_nondigit, text, ts-1, te)
246
+ when '\h'; emit(set_type, :type_hex, text, ts-1, te)
247
+ when '\H'; emit(set_type, :type_nonhex, text, ts-1, te)
248
+ when '\s'; emit(set_type, :type_space, text, ts-1, te)
249
+ when '\S'; emit(set_type, :type_nonspace, text, ts-1, te)
250
+ when '\w'; emit(set_type, :type_word, text, ts-1, te)
251
+ when '\W'; emit(set_type, :type_nonword, text, ts-1, te)
235
252
  end
236
253
  fret;
237
254
  };
238
255
 
239
256
  hex_sequence . '-\\' . hex_sequence {
240
- self.emit(set_type, :range_hex, data[ts-1..te-1].pack('c*'), ts-1, te)
257
+ emit(set_type, :range_hex, *text(data, ts, te, 1))
241
258
  fret;
242
259
  };
243
260
 
244
261
  hex_sequence {
245
- self.emit(set_type, :member_hex, data[ts-1..te-1].pack('c*'), ts-1, te)
262
+ emit(set_type, :member_hex, *text(data, ts, te, 1))
246
263
  fret;
247
264
  };
248
265
 
249
266
  meta_char | [\\\]\-\,] {
250
- self.emit(set_type, :escape, data[ts-1..te-1].pack('c*'), ts-1, te)
267
+ emit(set_type, :escape, *text(data, ts, te, 1))
251
268
  fret;
252
269
  };
253
270
 
254
- property_char > (escaped_set_alpha, 2) {
271
+ property_char > (escaped_set_alpha, 3) {
255
272
  fhold;
256
273
  fnext character_set;
257
274
  fcall unicode_property;
@@ -264,7 +281,7 @@
264
281
  utf8_2_byte |
265
282
  utf8_3_byte |
266
283
  utf8_4_byte {
267
- self.emit(set_type, :escape, data[ts-1..te-1].pack('c*'), ts-1, te)
284
+ emit(set_type, :escape, *text(data, ts, te, 1))
268
285
  fret;
269
286
  };
270
287
  *|;
@@ -274,33 +291,33 @@
274
291
  # --------------------------------------------------------------------------
275
292
  escape_sequence := |*
276
293
  [1-9] {
277
- text = data[ts-1..te-1].pack('c*')
278
- self.emit(:backref, :number, text, ts-1, te)
294
+ text = text(data, ts, te, 1).first
295
+ emit(:backref, :number, text, ts-1, te)
279
296
  fret;
280
297
  };
281
298
 
282
299
  octal_sequence {
283
- self.emit(:escape, :octal, data[ts-1..te-1].pack('c*'), ts-1, te)
300
+ emit(:escape, :octal, *text(data, ts, te, 1))
284
301
  fret;
285
302
  };
286
303
 
287
304
  meta_char {
288
- case text = data[ts-1..te-1].pack('c*')
289
- when '\.'; self.emit(:escape, :dot, text, ts-1, te)
290
- when '\|'; self.emit(:escape, :alternation, text, ts-1, te)
291
- when '\^'; self.emit(:escape, :beginning_of_line, text, ts-1, te)
292
- when '\$'; self.emit(:escape, :end_of_line, text, ts-1, te)
293
- when '\?'; self.emit(:escape, :zero_or_one, text, ts-1, te)
294
- when '\*'; self.emit(:escape, :zero_or_more, text, ts-1, te)
295
- when '\+'; self.emit(:escape, :one_or_more, text, ts-1, te)
296
- when '\('; self.emit(:escape, :group_open, text, ts-1, te)
297
- when '\)'; self.emit(:escape, :group_close, text, ts-1, te)
298
- when '\{'; self.emit(:escape, :interval_open, text, ts-1, te)
299
- when '\}'; self.emit(:escape, :interval_close, text, ts-1, te)
300
- when '\['; self.emit(:escape, :set_open, text, ts-1, te)
301
- when '\]'; self.emit(:escape, :set_close, text, ts-1, te)
305
+ case text = text(data, ts, te, 1).first
306
+ when '\.'; emit(:escape, :dot, text, ts-1, te)
307
+ when '\|'; emit(:escape, :alternation, text, ts-1, te)
308
+ when '\^'; emit(:escape, :bol, text, ts-1, te)
309
+ when '\$'; emit(:escape, :eol, text, ts-1, te)
310
+ when '\?'; emit(:escape, :zero_or_one, text, ts-1, te)
311
+ when '\*'; emit(:escape, :zero_or_more, text, ts-1, te)
312
+ when '\+'; emit(:escape, :one_or_more, text, ts-1, te)
313
+ when '\('; emit(:escape, :group_open, text, ts-1, te)
314
+ when '\)'; emit(:escape, :group_close, text, ts-1, te)
315
+ when '\{'; emit(:escape, :interval_open, text, ts-1, te)
316
+ when '\}'; emit(:escape, :interval_close, text, ts-1, te)
317
+ when '\['; emit(:escape, :set_open, text, ts-1, te)
318
+ when '\]'; emit(:escape, :set_close, text, ts-1, te)
302
319
  when "\\\\";
303
- self.emit(:escape, :backslash, text, ts-1, te)
320
+ emit(:escape, :backslash, text, ts-1, te)
304
321
  end
305
322
  fret;
306
323
  };
@@ -308,46 +325,76 @@
308
325
  escaped_ascii > (escaped_alpha, 7) {
309
326
  # \b is emitted as backspace only when inside a character set, otherwise
310
327
  # it is a word boundary anchor. A syntax might "normalize" it if needed.
311
- case text = data[ts-1..te-1].pack('c*')
312
- when '\a'; self.emit(:escape, :bell, text, ts-1, te)
313
- when '\e'; self.emit(:escape, :escape, text, ts-1, te)
314
- when '\f'; self.emit(:escape, :form_feed, text, ts-1, te)
315
- when '\n'; self.emit(:escape, :newline, text, ts-1, te)
316
- when '\r'; self.emit(:escape, :carriage, text, ts-1, te)
317
- when '\s'; self.emit(:escape, :space, text, ts-1, te)
318
- when '\t'; self.emit(:escape, :tab, text, ts-1, te)
319
- when '\v'; self.emit(:escape, :vertical_tab, text, ts-1, te)
328
+ case text = text(data, ts, te, 1).first
329
+ when '\a'; emit(:escape, :bell, text, ts-1, te)
330
+ when '\e'; emit(:escape, :escape, text, ts-1, te)
331
+ when '\f'; emit(:escape, :form_feed, text, ts-1, te)
332
+ when '\n'; emit(:escape, :newline, text, ts-1, te)
333
+ when '\r'; emit(:escape, :carriage, text, ts-1, te)
334
+ when '\s'; emit(:escape, :space, text, ts-1, te)
335
+ when '\t'; emit(:escape, :tab, text, ts-1, te)
336
+ when '\v'; emit(:escape, :vertical_tab, text, ts-1, te)
320
337
  end
321
338
  fret;
322
339
  };
323
340
 
324
- codepoint_sequence > (escaped_alpha, 6) {
325
- text = data[ts-1..te-1].pack('c*')
341
+ codepoint_sequence > (escaped_alpha, 6) $eof(premature_end_error) {
342
+ text = text(data, ts, te, 1).first
326
343
  if text[2].chr == '{'
327
- self.emit(:escape, :codepoint_list, text, ts-1, te)
344
+ emit(:escape, :codepoint_list, text, ts-1, te)
328
345
  else
329
- self.emit(:escape, :codepoint, text, ts-1, te)
346
+ emit(:escape, :codepoint, text, ts-1, te)
330
347
  end
331
348
  fret;
332
349
  };
333
350
 
334
- hex_sequence > (escaped_alpha, 5) {
335
- self.emit(:escape, :hex, data[ts-1..te-1].pack('c*'), ts-1, te)
351
+ hex_sequence > (escaped_alpha, 5) $eof(premature_end_error) {
352
+ emit(:escape, :hex, *text(data, ts, te, 1))
353
+ fret;
354
+ };
355
+
356
+ wide_hex_sequence > (escaped_alpha, 5) $eof(premature_end_error) {
357
+ emit(:escape, :hex_wide, *text(data, ts, te, 1))
336
358
  fret;
337
359
  };
338
360
 
339
- wide_hex_sequence > (escaped_alpha, 5) {
340
- self.emit(:escape, :hex_wide, data[ts-1..te-1].pack('c*'), ts-1, te)
361
+ hex_sequence_err @invalid_sequence_error {
341
362
  fret;
342
363
  };
343
364
 
344
- control_sequence > (escaped_alpha, 4) {
345
- self.emit(:escape, :control, data[ts-1..te-1].pack('c*'), ts-1, te)
365
+ (wide_hex_seq_invalid | wide_hex_seq_empty) {
366
+ raise InvalidSequenceError.new("wide hex sequence")
346
367
  fret;
347
368
  };
348
369
 
349
- meta_sequence > (backslashed, 3) {
350
- self.emit(:escape, :meta_sequence, data[ts-1..te-1].pack('c*'), ts-1, te)
370
+ control_sequence >(escaped_alpha, 4) $eof(premature_end_error) {
371
+ if data[te]
372
+ c = data[te].chr
373
+ if c =~ /[\x00-\x7F]/
374
+ emit(:escape, :control, copy(data, ts-1..te), ts-1, te+1)
375
+ p += 1
376
+ else
377
+ raise InvalidSequenceError.new("control sequence")
378
+ end
379
+ else
380
+ raise PrematureEndError.new("control sequence")
381
+ end
382
+ fret;
383
+ };
384
+
385
+ meta_sequence >(backslashed, 3) $eof(premature_end_error) {
386
+ if data[te]
387
+ c = data[te].chr
388
+ if c =~ /[\x00-\x7F]/
389
+ emit(:escape, :meta_sequence, copy(data, ts-1..te), ts-1, te+1)
390
+ p += 1
391
+ else
392
+ raise InvalidSequenceError.new("meta sequence")
393
+ end
394
+ else
395
+ raise PrematureEndError.new("meta sequence")
396
+ end
397
+ fret;
351
398
  };
352
399
 
353
400
  property_char > (escaped_alpha, 2) {
@@ -357,7 +404,7 @@
357
404
  };
358
405
 
359
406
  (any -- non_literal_escape) > (escaped_alpha, 1) {
360
- self.emit(:escape, :literal, data[ts-1..te-1].pack('c*'), ts-1, te)
407
+ emit(:escape, :literal, *text(data, ts, te, 1))
361
408
  fret;
362
409
  };
363
410
  *|;
@@ -370,32 +417,34 @@
370
417
  # Meta characters
371
418
  # ------------------------------------------------------------------------
372
419
  dot {
373
- self.emit(:meta, :dot, data[ts..te-1].pack('c*'), ts, te)
420
+ emit(:meta, :dot, *text(data, ts, te))
374
421
  };
375
422
 
376
423
  alternation {
377
- self.emit(:meta, :alternation, data[ts..te-1].pack('c*'), ts, te)
424
+ emit(:meta, :alternation, *text(data, ts, te))
378
425
  };
379
426
 
380
427
  # Anchors
381
428
  # ------------------------------------------------------------------------
382
429
  beginning_of_line {
383
- self.emit(:anchor, :beginning_of_line, data[ts..te-1].pack('c*'), ts, te)
430
+ emit(:anchor, :bol, *text(data, ts, te))
384
431
  };
385
432
 
386
433
  end_of_line {
387
- self.emit(:anchor, :end_of_line, data[ts..te-1].pack('c*'), ts, te)
434
+ emit(:anchor, :eol, *text(data, ts, te))
388
435
  };
389
436
 
390
437
  backslash . anchor_char > (backslashed, 3) {
391
- case text = data[ts..te-1].pack('c*')
392
- when '\\A'; self.emit(:anchor, :bos, text, ts, te)
393
- when '\\z'; self.emit(:anchor, :eos, text, ts, te)
394
- when '\\Z'; self.emit(:anchor, :eos_ob_eol, text, ts, te)
395
- when '\\b'; self.emit(:anchor, :word_boundary, text, ts, te)
396
- when '\\B'; self.emit(:anchor, :nonword_boundary, text, ts, te)
397
- when '\\G'; self.emit(:anchor, :match_start, text, ts, te)
398
- else raise ScannerError.new("Unsupported anchor at #{text} (char #{ts})")
438
+ case text = text(data, ts, te).first
439
+ when '\\A'; emit(:anchor, :bos, text, ts, te)
440
+ when '\\z'; emit(:anchor, :eos, text, ts, te)
441
+ when '\\Z'; emit(:anchor, :eos_ob_eol, text, ts, te)
442
+ when '\\b'; emit(:anchor, :word_boundary, text, ts, te)
443
+ when '\\B'; emit(:anchor, :nonword_boundary, text, ts, te)
444
+ when '\\G'; emit(:anchor, :match_start, text, ts, te)
445
+ else
446
+ raise ScannerError.new(
447
+ "Unexpected character in anchor at #{text} (char #{ts})")
399
448
  end
400
449
  };
401
450
 
@@ -406,15 +455,18 @@
406
455
  # \w, \W word, non-word
407
456
  # ------------------------------------------------------------------------
408
457
  backslash . char_type > (backslashed, 2) {
409
- case text = data[ts..te-1].pack('c*')
410
- when '\\d'; self.emit(:type, :digit, text, ts, te)
411
- when '\\D'; self.emit(:type, :nondigit, text, ts, te)
412
- when '\\h'; self.emit(:type, :hex, text, ts, te)
413
- when '\\H'; self.emit(:type, :nonhex, text, ts, te)
414
- when '\\s'; self.emit(:type, :space, text, ts, te)
415
- when '\\S'; self.emit(:type, :nonspace, text, ts, te)
416
- when '\\w'; self.emit(:type, :word, text, ts, te)
417
- when '\\W'; self.emit(:type, :nonword, text, ts, te)
458
+ case text = text(data, ts, te).first
459
+ when '\\d'; emit(:type, :digit, text, ts, te)
460
+ when '\\D'; emit(:type, :nondigit, text, ts, te)
461
+ when '\\h'; emit(:type, :hex, text, ts, te)
462
+ when '\\H'; emit(:type, :nonhex, text, ts, te)
463
+ when '\\s'; emit(:type, :space, text, ts, te)
464
+ when '\\S'; emit(:type, :nonspace, text, ts, te)
465
+ when '\\w'; emit(:type, :word, text, ts, te)
466
+ when '\\W'; emit(:type, :nonword, text, ts, te)
467
+ else
468
+ raise ScannerError.new(
469
+ "Unexpected character in type at #{text} (char #{ts})")
418
470
  end
419
471
  };
420
472
 
@@ -425,7 +477,7 @@
425
477
  set_depth += 1; in_set = true
426
478
  set_type = set_depth > 1 ? :subset : :set
427
479
 
428
- self.emit(set_type, :open, data[ts..te-1].pack('c*'), ts, te)
480
+ emit(set_type, :open, *text(data, ts, te))
429
481
  fcall character_set;
430
482
  };
431
483
 
@@ -435,7 +487,7 @@
435
487
  # correct closing count.
436
488
  # ------------------------------------------------------------------------
437
489
  group_open . group_comment $group_closed {
438
- self.emit(:group, :comment, data[ts..te-1].pack('c*'), ts, te)
490
+ emit(:group, :comment, *text(data, ts, te))
439
491
  };
440
492
 
441
493
  # Expression options:
@@ -447,21 +499,7 @@
447
499
  # (?imx-imx:subexp) option on/off for subexp
448
500
  # ------------------------------------------------------------------------
449
501
  group_open . group_options >group_opened {
450
- # special handling to resolve ambiguity with passive groups
451
- if data[te]
452
- c = data[te].chr
453
- if c == ':' # include the ':'
454
- self.emit(:group, :options, data[ts..te].pack('c*'), ts, te+1)
455
- p += 1
456
- elsif c == ')' # just options by themselves
457
- self.emit(:group, :options, data[ts..te-1].pack('c*'), ts, te)
458
- else
459
- raise ScannerError.new(
460
- "Unexpected '#{c}' in options sequence, ':' or ')' expected")
461
- end
462
- else
463
- raise PrematureEndError.new("options") unless data[te]
464
- end
502
+ p = scan_options(p, data, ts, te)
465
503
  };
466
504
 
467
505
  # Assertions
@@ -471,11 +509,11 @@
471
509
  # (?<!subexp) negative look-behind
472
510
  # ------------------------------------------------------------------------
473
511
  group_open . assertion_type >group_opened {
474
- case text = data[ts..te-1].pack('c*')
475
- when '(?='; self.emit(:assertion, :lookahead, text, ts, te)
476
- when '(?!'; self.emit(:assertion, :nlookahead, text, ts, te)
477
- when '(?<='; self.emit(:assertion, :lookbehind, text, ts, te)
478
- when '(?<!'; self.emit(:assertion, :nlookbehind, text, ts, te)
512
+ case text = text(data, ts, te).first
513
+ when '(?='; emit(:assertion, :lookahead, text, ts, te)
514
+ when '(?!'; emit(:assertion, :nlookahead, text, ts, te)
515
+ when '(?<='; emit(:assertion, :lookbehind, text, ts, te)
516
+ when '(?<!'; emit(:assertion, :nlookbehind, text, ts, te)
479
517
  end
480
518
  };
481
519
 
@@ -487,85 +525,103 @@
487
525
  # (subexp) captured group
488
526
  # ------------------------------------------------------------------------
489
527
  group_open . group_type >group_opened {
490
- case text = data[ts..te-1].pack('c*')
491
- when '(?:'; self.emit(:group, :passive, text, ts, te)
492
- when '(?>'; self.emit(:group, :atomic, text, ts, te)
493
-
494
- when /\(\?<\w+>/
495
- self.emit(:group, :named_ab, text, ts, te)
496
- when /\(\?'\w+'/
497
- self.emit(:group, :named_sq, text, ts, te)
528
+ case text = text(data, ts, te).first
529
+ when '(?:'; emit(:group, :passive, text, ts, te)
530
+ when '(?>'; emit(:group, :atomic, text, ts, te)
531
+
532
+ when /^\(\?<(\w*)>/
533
+ empty_name_error(:group, 'named group (ab)') if $1.empty?
534
+
535
+ emit(:group, :named_ab, text, ts, te)
536
+
537
+ when /^\(\?'(\w*)'/
538
+ empty_name_error(:group, 'named group (sq)') if $1.empty?
539
+
540
+ emit(:group, :named_sq, text, ts, te)
541
+
542
+ else
543
+ raise ScannerError.new(
544
+ "Unknown subexpression group format '#{text}'")
498
545
  end
499
546
  };
500
547
 
501
548
  group_open @group_opened {
502
- text = data[ts..te-1].pack('c*')
503
- self.emit(:group, :capture, text, ts, te)
549
+ text = text(data, ts, te).first
550
+ emit(:group, :capture, text, ts, te)
504
551
  };
505
552
 
506
553
  group_close @group_closed {
507
- self.emit(:group, :close, data[ts..te-1].pack('c*'), ts, te)
554
+ emit(:group, :close, *text(data, ts, te))
508
555
  };
509
556
 
510
557
 
511
- # Group back-reference, named and numbered
558
+ # Group backreference, named and numbered
512
559
  # ------------------------------------------------------------------------
513
560
  backslash . (group_name_ref | group_number_ref) > (backslashed, 4) {
514
- case text = data[ts..te-1].pack('c*')
515
- when /\\([gk])<[^\d-](\w+)?>/ # angle-brackets
561
+ case text = text(data, ts, te).first
562
+ when /^\\([gk])<>/ # angle brackets
563
+ empty_backref_error("ref/call (ab)")
564
+
565
+ when /^\\([gk])''/ # single quotes
566
+ empty_backref_error("ref/call (sq)")
567
+
568
+ when /^\\([gk])<[^\d-](\w+)?>/ # angle-brackets
516
569
  if $1 == 'k'
517
- self.emit(:backref, :name_ref_ab, text, ts, te)
570
+ emit(:backref, :name_ref_ab, text, ts, te)
518
571
  else
519
- self.emit(:backref, :name_call_ab, text, ts, te)
572
+ emit(:backref, :name_call_ab, text, ts, te)
520
573
  end
521
574
 
522
- when /\\([gk])'[^\d-](\w+)?'/ #single quotes
575
+ when /^\\([gk])'[^\d-](\w+)?'/ #single quotes
523
576
  if $1 == 'k'
524
- self.emit(:backref, :name_ref_sq, text, ts, te)
577
+ emit(:backref, :name_ref_sq, text, ts, te)
525
578
  else
526
- self.emit(:backref, :name_call_sq, text, ts, te)
579
+ emit(:backref, :name_call_sq, text, ts, te)
527
580
  end
528
581
 
529
- when /\\([gk])<\d+>/ # angle-brackets
582
+ when /^\\([gk])<\d+>/ # angle-brackets
530
583
  if $1 == 'k'
531
- self.emit(:backref, :number_ref_ab, text, ts, te)
584
+ emit(:backref, :number_ref_ab, text, ts, te)
532
585
  else
533
- self.emit(:backref, :number_call_ab, text, ts, te)
586
+ emit(:backref, :number_call_ab, text, ts, te)
534
587
  end
535
588
 
536
- when /\\([gk])'\d+'/ # single quotes
589
+ when /^\\([gk])'\d+'/ # single quotes
537
590
  if $1 == 'k'
538
- self.emit(:backref, :number_ref_sq, text, ts, te)
591
+ emit(:backref, :number_ref_sq, text, ts, te)
539
592
  else
540
- self.emit(:backref, :number_call_sq, text, ts, te)
593
+ emit(:backref, :number_call_sq, text, ts, te)
541
594
  end
542
595
 
543
- when /\\([gk])<-\d+>/ # angle-brackets
596
+ when /^\\([gk])<-\d+>/ # angle-brackets
544
597
  if $1 == 'k'
545
- self.emit(:backref, :number_rel_ref_ab, text, ts, te)
598
+ emit(:backref, :number_rel_ref_ab, text, ts, te)
546
599
  else
547
- self.emit(:backref, :number_rel_call_ab, text, ts, te)
600
+ emit(:backref, :number_rel_call_ab, text, ts, te)
548
601
  end
549
602
 
550
- when /\\([gk])'-\d+'/ # single quotes
603
+ when /^\\([gk])'-\d+'/ # single quotes
551
604
  if $1 == 'k'
552
- self.emit(:backref, :number_rel_ref_sq, text, ts, te)
605
+ emit(:backref, :number_rel_ref_sq, text, ts, te)
553
606
  else
554
- self.emit(:backref, :number_rel_call_sq, text, ts, te)
607
+ emit(:backref, :number_rel_call_sq, text, ts, te)
555
608
  end
556
609
 
557
- when /\\k<[^\d-](\w+)?[+\-]\d+>/ # angle-brackets
558
- self.emit(:backref, :name_nest_ref_ab, text, ts, te)
610
+ when /^\\k<[^\d-](\w+)?[+\-]\d+>/ # angle-brackets
611
+ emit(:backref, :name_nest_ref_ab, text, ts, te)
559
612
 
560
- when /\\k'[^\d-](\w+)?[+\-]\d+'/ # single-quotes
561
- self.emit(:backref, :name_nest_ref_sq, text, ts, te)
613
+ when /^\\k'[^\d-](\w+)?[+\-]\d+'/ # single-quotes
614
+ emit(:backref, :name_nest_ref_sq, text, ts, te)
562
615
 
563
- when /\\([gk])<\d+[+\-]\d+>/ # angle-brackets
564
- self.emit(:backref, :number_nest_ref_ab, text, ts, te)
616
+ when /^\\([gk])<\d+[+\-]\d+>/ # angle-brackets
617
+ emit(:backref, :number_nest_ref_ab, text, ts, te)
565
618
 
566
- when /\\([gk])'\d+[+\-]\d+'/ # single-quotes
567
- self.emit(:backref, :number_nest_ref_sq, text, ts, te)
619
+ when /^\\([gk])'\d+[+\-]\d+'/ # single-quotes
620
+ emit(:backref, :number_nest_ref_sq, text, ts, te)
568
621
 
622
+ else
623
+ raise ScannerError.new(
624
+ "Unknown backreference format '#{text}'")
569
625
  end
570
626
  };
571
627
 
@@ -573,31 +629,31 @@
573
629
  # Quantifiers
574
630
  # ------------------------------------------------------------------------
575
631
  zero_or_one {
576
- case text = data[ts..te-1].pack('c*')
577
- when '?' ; self.emit(:quantifier, :zero_or_one, text, ts, te)
578
- when '??'; self.emit(:quantifier, :zero_or_one_reluctant, text, ts, te)
579
- when '?+'; self.emit(:quantifier, :zero_or_one_possessive, text, ts, te)
632
+ case text = text(data, ts, te).first
633
+ when '?' ; emit(:quantifier, :zero_or_one, text, ts, te)
634
+ when '??'; emit(:quantifier, :zero_or_one_reluctant, text, ts, te)
635
+ when '?+'; emit(:quantifier, :zero_or_one_possessive, text, ts, te)
580
636
  end
581
637
  };
582
-
638
+
583
639
  zero_or_more {
584
- case text = data[ts..te-1].pack('c*')
585
- when '*' ; self.emit(:quantifier, :zero_or_more, text, ts, te)
586
- when '*?'; self.emit(:quantifier, :zero_or_more_reluctant, text, ts, te)
587
- when '*+'; self.emit(:quantifier, :zero_or_more_possessive, text, ts, te)
640
+ case text = text(data, ts, te).first
641
+ when '*' ; emit(:quantifier, :zero_or_more, text, ts, te)
642
+ when '*?'; emit(:quantifier, :zero_or_more_reluctant, text, ts, te)
643
+ when '*+'; emit(:quantifier, :zero_or_more_possessive, text, ts, te)
588
644
  end
589
645
  };
590
-
646
+
591
647
  one_or_more {
592
- case text = data[ts..te-1].pack('c*')
593
- when '+' ; self.emit(:quantifier, :one_or_more, text, ts, te)
594
- when '+?'; self.emit(:quantifier, :one_or_more_reluctant, text, ts, te)
595
- when '++'; self.emit(:quantifier, :one_or_more_possessive, text, ts, te)
648
+ case text = text(data, ts, te).first
649
+ when '+' ; emit(:quantifier, :one_or_more, text, ts, te)
650
+ when '+?'; emit(:quantifier, :one_or_more_reluctant, text, ts, te)
651
+ when '++'; emit(:quantifier, :one_or_more_possessive, text, ts, te)
596
652
  end
597
653
  };
598
654
 
599
- quantifier_range @err(premature_end_error) {
600
- self.emit(:quantifier, :interval, data[ts..te-1].pack('c*'), ts, te)
655
+ quantifier_interval @err(premature_end_error) {
656
+ emit(:quantifier, :interval, *text(data, ts, te))
601
657
  };
602
658
 
603
659
  # Escaped sequences
@@ -614,35 +670,67 @@
614
670
  utf8_2_byte+ |
615
671
  utf8_3_byte+ |
616
672
  utf8_4_byte+ {
617
- self.append_literal(data, ts, te)
673
+ append_literal(data, ts, te)
618
674
  };
619
675
 
620
676
  *|;
621
677
  }%%
622
678
 
679
+ # THIS IS A GENERATED FILE, DO NOT EDIT DIRECTLY
680
+ # This file was generated from scanner.rl
623
681
 
624
682
  module Regexp::Scanner
625
683
  %% write data;
626
684
 
685
+ # General scanner error (catch all)
627
686
  class ScannerError < StandardError
628
687
  def initialize(what)
629
688
  super what
630
689
  end
631
690
  end
632
691
 
692
+ # Base for all scanner validation errors
693
+ class ValidationError < StandardError
694
+ def initialize(reason)
695
+ super reason
696
+ end
697
+ end
698
+
699
+ # Unexpected end of pattern
633
700
  class PrematureEndError < ScannerError
634
701
  def initialize(where = '')
635
- super "Premature end of pattern: #{where}"
702
+ super "Premature end of pattern at #{where}"
703
+ end
704
+ end
705
+
706
+ # Invalid sequence format. Used for escape sequences, mainly.
707
+ class InvalidSequenceError < ValidationError
708
+ def initialize(what = 'sequence', where = '')
709
+ super "Invalid #{what} at #{where}"
710
+ end
711
+ end
712
+
713
+ # Invalid group. Used for named groups.
714
+ class InvalidGroupError < ValidationError
715
+ def initialize(what, reason)
716
+ super "Invalid #{what}, #{reason}."
717
+ end
718
+ end
719
+
720
+ # Invalid back reference. Used for named and numbered refs/calls.
721
+ class InvalidBackrefError < ValidationError
722
+ def initialize(what, reason)
723
+ super "Invalid back reference #{what}, #{reason}"
636
724
  end
637
725
  end
638
726
 
639
- class UnknownUnicodePropertyError < ScannerError
727
+ # The property name was not recognized by the scanner.
728
+ class UnknownUnicodePropertyError < ValidationError
640
729
  def initialize(name)
641
730
  super "Unknown unicode character property name #{name}"
642
731
  end
643
732
  end
644
733
 
645
-
646
734
  # Scans the given regular expression text, or Regexp object and collects the
647
735
  # emitted token into an array that gets returned at the end. If a block is
648
736
  # given, it gets called for each emitted token.
@@ -665,42 +753,107 @@ module Regexp::Scanner
665
753
  %% write init;
666
754
  %% write exec;
667
755
 
756
+ if cs == re_scanner_error
757
+ text = ts ? copy(data, ts-1..-1) : data.pack('c*')
758
+ raise ScannerError.new("Scan error at '#{text}'")
759
+ end
760
+
668
761
  raise PrematureEndError.new("(missing group closing paranthesis) "+
669
762
  "[#{in_group}:#{group_depth}]") if in_group
670
763
  raise PrematureEndError.new("(missing set closing bracket) "+
671
764
  "[#{in_set}:#{set_depth}]") if in_set
672
765
 
673
766
  # when the entire expression is a literal run
674
- self.emit_literal if @literal
767
+ emit_literal if @literal
675
768
 
676
769
  @tokens
677
770
  end
678
771
 
679
- # appends one or more characters to the literal buffer, to be emitted later
680
- # by a call to emit_literal. contents a mix of ASCII and UTF-8
772
+ private
773
+
774
+ # Ragel's regex-based scan of the group options introduced a lot of
775
+ # ambiguity, so we just ask it to find the beginning of what looks
776
+ # like an options run and handle the rest in here.
777
+ def self.scan_options(p, data, ts, te)
778
+ text = text(data, ts, te).first
779
+
780
+ options_char, options_length = true, 0
781
+
782
+ # Copy while we have option characters. The maximum is 7, as in (?mix-mix;
783
+ # even though such a sequence doesn't make sense, it is possible.
784
+ while options_char and options_length < 7
785
+ if data[te + options_length]
786
+ c = data[te + options_length].chr
787
+
788
+ if c =~ /[-mix]/
789
+ text << c ; p += 1 ; options_length += 1
790
+ else
791
+ options_char = false
792
+ end
793
+ else
794
+ raise PrematureEndError.new("expression options `#{text}'")
795
+ end
796
+ end
797
+
798
+ if data[te + options_length]
799
+ c = data[te + options_length].chr
800
+
801
+ if c == ':'
802
+ # Include the ':' in the options text
803
+ text << c ; p += 1 ; options_length += 1
804
+ emit(:group, :options, text, ts, te + options_length)
805
+
806
+ elsif c == ')'
807
+ # Don't include the closing ')', let group_close handle it.
808
+ emit(:group, :options, text, ts, te + options_length)
809
+
810
+ else
811
+ # Plain Regexp reports this as 'undefined group option'
812
+ raise ScannerError.new(
813
+ "Unexpected `#{c}' in options sequence, ':' or ')' expected")
814
+ end
815
+ else
816
+ raise PrematureEndError.new("expression options `#{text}'")
817
+ end
818
+
819
+ p # return the new value of the data pointer
820
+ end
821
+
822
+ # Copy the given range of data as text
823
+ def self.copy(data, range)
824
+ data[range].pack('c*')
825
+ end
826
+
827
+ # Copy from ts to te from data as text, returning an array with the text
828
+ # and the offsets used to copy it.
829
+ def self.text(data, ts, te, soff = 0)
830
+ [copy(data, ts-soff..te-1), ts-soff, te]
831
+ end
832
+
833
+ # Appends one or more characters to the literal buffer, to be emitted later
834
+ # by a call to emit_literal. Contents can be a mix of ASCII and UTF-8.
681
835
  def self.append_literal(data, ts, te)
682
836
  @literal ||= []
683
- @literal << [data[ts..te-1].pack('c*'), ts, te]
837
+ @literal << text(data, ts, te)
684
838
  end
685
839
 
686
- # emits the collected literal run collected by one or more calls to the
687
- # append_literal method
840
+ # Emits the literal run collected by calls to the append_literal method,
841
+ # using the total start (ts) and end (te) offsets of the run.
688
842
  def self.emit_literal
689
843
  ts, te = @literal.first[1], @literal.last[2]
690
844
  text = @literal.map {|t| t[0]}.join
691
845
 
692
846
  text.force_encoding('utf-8') if text.respond_to?(:force_encoding)
693
847
 
694
- self.emit(:literal, :literal, text, ts, te)
695
848
  @literal = nil
849
+ emit(:literal, :literal, text, ts, te)
696
850
  end
697
851
 
852
+ # Emits an array with the details of the scanned pattern
698
853
  def self.emit(type, token, text, ts, te)
699
- #puts " > emit: #{type}:#{token} '#{text}' [#{ts}..#{te}]"
854
+ #puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
700
855
 
701
- if @literal and type != :literal
702
- self.emit_literal
703
- end
856
+ emit_literal if @literal
704
857
 
705
858
  if @block
706
859
  @block.call type, token, text, ts, te
@@ -709,4 +862,37 @@ module Regexp::Scanner
709
862
  @tokens << [type, token, text, ts, te]
710
863
  end
711
864
 
865
+ # Centralizes and unifies the handling of validation related
866
+ # errors.
867
+ def self.validation_error(type, what, reason)
868
+ case type
869
+ when :group
870
+ error = InvalidGroupError.new(what, reason)
871
+ when :backref
872
+ error = InvalidBackrefError.new(what, reason)
873
+ when :sequence
874
+ error = InvalidSequenceError.new(what, reason)
875
+ else
876
+ error = ValidationError.new('expression')
877
+ end
878
+
879
+ # TODO: configuration option to treat scanner level validation
880
+ # errors as warnings or ignore them
881
+ if false # @@config.validation_warn
882
+ $stderr.puts error.to_s # unless @@config.validation_ignore
883
+ else
884
+ raise error # unless @@config.validation_ignore
885
+ end
886
+ end
887
+
888
+ # Used for references with an empty name or number
889
+ def self.empty_backref_error(type, what)
890
+ validation_error(:backref, what, 'ref ID is empty')
891
+ end
892
+
893
+ # Used for named expressions with an empty name
894
+ def self.empty_name_error(type, what)
895
+ validation_error(type, what, 'name is empty')
896
+ end
897
+
712
898
  end # module Regexp::Scanner