regexp_parser 2.1.1 → 2.9.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (167) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile +6 -5
  3. data/LICENSE +1 -1
  4. data/Rakefile +6 -70
  5. data/lib/regexp_parser/error.rb +1 -1
  6. data/lib/regexp_parser/expression/base.rb +76 -0
  7. data/lib/regexp_parser/expression/classes/alternation.rb +1 -1
  8. data/lib/regexp_parser/expression/classes/anchor.rb +0 -2
  9. data/lib/regexp_parser/expression/classes/{backref.rb → backreference.rb} +18 -3
  10. data/lib/regexp_parser/expression/classes/{set → character_set}/range.rb +2 -7
  11. data/lib/regexp_parser/expression/classes/{set.rb → character_set.rb} +4 -8
  12. data/lib/regexp_parser/expression/classes/{type.rb → character_type.rb} +0 -2
  13. data/lib/regexp_parser/expression/classes/conditional.rb +2 -6
  14. data/lib/regexp_parser/expression/classes/{escape.rb → escape_sequence.rb} +15 -7
  15. data/lib/regexp_parser/expression/classes/free_space.rb +4 -4
  16. data/lib/regexp_parser/expression/classes/group.rb +10 -22
  17. data/lib/regexp_parser/expression/classes/keep.rb +2 -0
  18. data/lib/regexp_parser/expression/classes/literal.rb +1 -5
  19. data/lib/regexp_parser/expression/classes/posix_class.rb +5 -5
  20. data/lib/regexp_parser/expression/classes/root.rb +3 -6
  21. data/lib/regexp_parser/expression/classes/{property.rb → unicode_property.rb} +10 -11
  22. data/lib/regexp_parser/expression/methods/construct.rb +41 -0
  23. data/lib/regexp_parser/expression/methods/human_name.rb +43 -0
  24. data/lib/regexp_parser/expression/methods/match_length.rb +9 -5
  25. data/lib/regexp_parser/expression/methods/negative.rb +20 -0
  26. data/lib/regexp_parser/expression/methods/parts.rb +23 -0
  27. data/lib/regexp_parser/expression/methods/printing.rb +26 -0
  28. data/lib/regexp_parser/expression/methods/strfregexp.rb +1 -1
  29. data/lib/regexp_parser/expression/methods/tests.rb +47 -1
  30. data/lib/regexp_parser/expression/methods/traverse.rb +35 -19
  31. data/lib/regexp_parser/expression/quantifier.rb +55 -24
  32. data/lib/regexp_parser/expression/sequence.rb +11 -31
  33. data/lib/regexp_parser/expression/sequence_operation.rb +4 -9
  34. data/lib/regexp_parser/expression/shared.rb +111 -0
  35. data/lib/regexp_parser/expression/subexpression.rb +26 -18
  36. data/lib/regexp_parser/expression.rb +37 -155
  37. data/lib/regexp_parser/lexer.rb +81 -39
  38. data/lib/regexp_parser/parser.rb +135 -173
  39. data/lib/regexp_parser/scanner/errors/premature_end_error.rb +8 -0
  40. data/lib/regexp_parser/scanner/errors/scanner_error.rb +6 -0
  41. data/lib/regexp_parser/scanner/errors/validation_error.rb +63 -0
  42. data/lib/regexp_parser/scanner/properties/long.csv +651 -0
  43. data/lib/regexp_parser/scanner/properties/short.csv +249 -0
  44. data/lib/regexp_parser/scanner/property.rl +2 -2
  45. data/lib/regexp_parser/scanner/scanner.rl +127 -185
  46. data/lib/regexp_parser/scanner.rb +1185 -1402
  47. data/lib/regexp_parser/syntax/any.rb +2 -7
  48. data/lib/regexp_parser/syntax/base.rb +91 -66
  49. data/lib/regexp_parser/syntax/token/anchor.rb +15 -0
  50. data/lib/regexp_parser/syntax/{tokens → token}/assertion.rb +2 -2
  51. data/lib/regexp_parser/syntax/token/backreference.rb +33 -0
  52. data/lib/regexp_parser/syntax/token/character_set.rb +16 -0
  53. data/lib/regexp_parser/syntax/{tokens → token}/character_type.rb +3 -3
  54. data/lib/regexp_parser/syntax/{tokens → token}/conditional.rb +3 -3
  55. data/lib/regexp_parser/syntax/token/escape.rb +33 -0
  56. data/lib/regexp_parser/syntax/{tokens → token}/group.rb +7 -7
  57. data/lib/regexp_parser/syntax/{tokens → token}/keep.rb +1 -1
  58. data/lib/regexp_parser/syntax/token/meta.rb +20 -0
  59. data/lib/regexp_parser/syntax/{tokens → token}/posix_class.rb +3 -3
  60. data/lib/regexp_parser/syntax/token/quantifier.rb +35 -0
  61. data/lib/regexp_parser/syntax/token/unicode_property.rb +751 -0
  62. data/lib/regexp_parser/syntax/token/virtual.rb +11 -0
  63. data/lib/regexp_parser/syntax/token.rb +45 -0
  64. data/lib/regexp_parser/syntax/version_lookup.rb +17 -34
  65. data/lib/regexp_parser/syntax/versions/1.8.6.rb +13 -20
  66. data/lib/regexp_parser/syntax/versions/1.9.1.rb +10 -17
  67. data/lib/regexp_parser/syntax/versions/1.9.3.rb +3 -10
  68. data/lib/regexp_parser/syntax/versions/2.0.0.rb +8 -15
  69. data/lib/regexp_parser/syntax/versions/2.2.0.rb +3 -9
  70. data/lib/regexp_parser/syntax/versions/2.3.0.rb +3 -9
  71. data/lib/regexp_parser/syntax/versions/2.4.0.rb +3 -9
  72. data/lib/regexp_parser/syntax/versions/2.4.1.rb +2 -8
  73. data/lib/regexp_parser/syntax/versions/2.5.0.rb +3 -9
  74. data/lib/regexp_parser/syntax/versions/2.6.0.rb +3 -9
  75. data/lib/regexp_parser/syntax/versions/2.6.2.rb +3 -9
  76. data/lib/regexp_parser/syntax/versions/2.6.3.rb +3 -9
  77. data/lib/regexp_parser/syntax/versions/3.1.0.rb +4 -0
  78. data/lib/regexp_parser/syntax/versions/3.2.0.rb +4 -0
  79. data/lib/regexp_parser/syntax/versions.rb +4 -2
  80. data/lib/regexp_parser/syntax.rb +2 -2
  81. data/lib/regexp_parser/token.rb +9 -20
  82. data/lib/regexp_parser/version.rb +1 -1
  83. data/lib/regexp_parser.rb +6 -8
  84. data/regexp_parser.gemspec +20 -22
  85. metadata +49 -171
  86. data/CHANGELOG.md +0 -494
  87. data/README.md +0 -479
  88. data/lib/regexp_parser/scanner/properties/long.yml +0 -594
  89. data/lib/regexp_parser/scanner/properties/short.yml +0 -237
  90. data/lib/regexp_parser/syntax/tokens/anchor.rb +0 -15
  91. data/lib/regexp_parser/syntax/tokens/backref.rb +0 -24
  92. data/lib/regexp_parser/syntax/tokens/character_set.rb +0 -13
  93. data/lib/regexp_parser/syntax/tokens/escape.rb +0 -30
  94. data/lib/regexp_parser/syntax/tokens/meta.rb +0 -13
  95. data/lib/regexp_parser/syntax/tokens/quantifier.rb +0 -35
  96. data/lib/regexp_parser/syntax/tokens/unicode_property.rb +0 -675
  97. data/lib/regexp_parser/syntax/tokens.rb +0 -45
  98. data/spec/expression/base_spec.rb +0 -104
  99. data/spec/expression/clone_spec.rb +0 -152
  100. data/spec/expression/conditional_spec.rb +0 -89
  101. data/spec/expression/free_space_spec.rb +0 -27
  102. data/spec/expression/methods/match_length_spec.rb +0 -161
  103. data/spec/expression/methods/match_spec.rb +0 -25
  104. data/spec/expression/methods/strfregexp_spec.rb +0 -224
  105. data/spec/expression/methods/tests_spec.rb +0 -99
  106. data/spec/expression/methods/traverse_spec.rb +0 -161
  107. data/spec/expression/options_spec.rb +0 -128
  108. data/spec/expression/subexpression_spec.rb +0 -50
  109. data/spec/expression/to_h_spec.rb +0 -26
  110. data/spec/expression/to_s_spec.rb +0 -108
  111. data/spec/lexer/all_spec.rb +0 -22
  112. data/spec/lexer/conditionals_spec.rb +0 -53
  113. data/spec/lexer/delimiters_spec.rb +0 -68
  114. data/spec/lexer/escapes_spec.rb +0 -14
  115. data/spec/lexer/keep_spec.rb +0 -10
  116. data/spec/lexer/literals_spec.rb +0 -64
  117. data/spec/lexer/nesting_spec.rb +0 -99
  118. data/spec/lexer/refcalls_spec.rb +0 -60
  119. data/spec/parser/all_spec.rb +0 -43
  120. data/spec/parser/alternation_spec.rb +0 -88
  121. data/spec/parser/anchors_spec.rb +0 -17
  122. data/spec/parser/conditionals_spec.rb +0 -179
  123. data/spec/parser/errors_spec.rb +0 -30
  124. data/spec/parser/escapes_spec.rb +0 -121
  125. data/spec/parser/free_space_spec.rb +0 -130
  126. data/spec/parser/groups_spec.rb +0 -108
  127. data/spec/parser/keep_spec.rb +0 -6
  128. data/spec/parser/options_spec.rb +0 -28
  129. data/spec/parser/posix_classes_spec.rb +0 -8
  130. data/spec/parser/properties_spec.rb +0 -115
  131. data/spec/parser/quantifiers_spec.rb +0 -68
  132. data/spec/parser/refcalls_spec.rb +0 -117
  133. data/spec/parser/set/intersections_spec.rb +0 -127
  134. data/spec/parser/set/ranges_spec.rb +0 -111
  135. data/spec/parser/sets_spec.rb +0 -178
  136. data/spec/parser/types_spec.rb +0 -18
  137. data/spec/scanner/all_spec.rb +0 -18
  138. data/spec/scanner/anchors_spec.rb +0 -21
  139. data/spec/scanner/conditionals_spec.rb +0 -128
  140. data/spec/scanner/delimiters_spec.rb +0 -52
  141. data/spec/scanner/errors_spec.rb +0 -67
  142. data/spec/scanner/escapes_spec.rb +0 -64
  143. data/spec/scanner/free_space_spec.rb +0 -165
  144. data/spec/scanner/groups_spec.rb +0 -61
  145. data/spec/scanner/keep_spec.rb +0 -10
  146. data/spec/scanner/literals_spec.rb +0 -39
  147. data/spec/scanner/meta_spec.rb +0 -18
  148. data/spec/scanner/options_spec.rb +0 -36
  149. data/spec/scanner/properties_spec.rb +0 -64
  150. data/spec/scanner/quantifiers_spec.rb +0 -25
  151. data/spec/scanner/refcalls_spec.rb +0 -55
  152. data/spec/scanner/sets_spec.rb +0 -151
  153. data/spec/scanner/types_spec.rb +0 -14
  154. data/spec/spec_helper.rb +0 -16
  155. data/spec/support/runner.rb +0 -42
  156. data/spec/support/shared_examples.rb +0 -77
  157. data/spec/support/warning_extractor.rb +0 -60
  158. data/spec/syntax/syntax_spec.rb +0 -48
  159. data/spec/syntax/syntax_token_map_spec.rb +0 -23
  160. data/spec/syntax/versions/1.8.6_spec.rb +0 -17
  161. data/spec/syntax/versions/1.9.1_spec.rb +0 -10
  162. data/spec/syntax/versions/1.9.3_spec.rb +0 -9
  163. data/spec/syntax/versions/2.0.0_spec.rb +0 -13
  164. data/spec/syntax/versions/2.2.0_spec.rb +0 -9
  165. data/spec/syntax/versions/aliases_spec.rb +0 -37
  166. data/spec/token/token_spec.rb +0 -85
  167. /data/lib/regexp_parser/expression/classes/{set → character_set}/intersection.rb +0 -0
@@ -28,18 +28,7 @@
28
28
 
29
29
  comment = ('#' . [^\n]* . '\n'?);
30
30
 
31
- class_name_posix = 'alnum' | 'alpha' | 'blank' |
32
- 'cntrl' | 'digit' | 'graph' |
33
- 'lower' | 'print' | 'punct' |
34
- 'space' | 'upper' | 'xdigit' |
35
- 'word' | 'ascii';
36
-
37
- class_posix = ('[:' . '^'? . class_name_posix . ':]');
38
-
39
-
40
- # these are not supported in ruby at the moment
41
- collating_sequence = '[.' . (alpha | [\-])+ . '.]';
42
- character_equivalent = '[=' . alpha . '=]';
31
+ class_posix = ('[:' . '^'? . [^\[\]]* . ':]');
43
32
 
44
33
  line_anchor = beginning_of_line | end_of_line;
45
34
  anchor_char = [AbBzZG];
@@ -65,20 +54,13 @@
65
54
  one_or_more = '+' | '+?' | '++';
66
55
 
67
56
  quantifier_greedy = '?' | '*' | '+';
68
- quantifier_reluctant = '??' | '*?' | '+?';
69
- quantifier_possessive = '?+' | '*+' | '++';
70
- quantifier_mode = '?' | '+';
71
57
 
72
58
  quantity_exact = (digit+);
73
59
  quantity_minimum = (digit+) . ',';
74
60
  quantity_maximum = ',' . (digit+);
75
61
  quantity_range = (digit+) . ',' . (digit+);
76
62
  quantifier_interval = range_open . ( quantity_exact | quantity_minimum |
77
- quantity_maximum | quantity_range ) . range_close .
78
- quantifier_mode?;
79
-
80
- quantifiers = quantifier_greedy | quantifier_reluctant |
81
- quantifier_possessive | quantifier_interval;
63
+ quantity_maximum | quantity_range ) . range_close;
82
64
 
83
65
  conditional = '(?(';
84
66
 
@@ -96,10 +78,9 @@
96
78
  # try to treat every other group head as options group, like Ruby
97
79
  group_options = '?' . ( [^!#'():<=>~]+ . ':'? ) ?;
98
80
 
99
- group_ref = [gk];
100
- group_name_id_ab = ([^0-9\->] | utf8_multibyte) . ([^>] | utf8_multibyte)*;
101
- group_name_id_sq = ([^0-9\-'] | utf8_multibyte) . ([^'] | utf8_multibyte)*;
102
- group_number = '-'? . [1-9] . [0-9]*;
81
+ group_name_id_ab = ([^!0-9\->] | utf8_multibyte) . ([^>] | utf8_multibyte)*;
82
+ group_name_id_sq = ([^0-9\-'] | utf8_multibyte) . ([^'] | utf8_multibyte)*;
83
+ group_number = '-'? . [0-9]+;
103
84
  group_level = [+\-] . [0-9]+;
104
85
 
105
86
  group_name = ('<' . group_name_id_ab? . '>') |
@@ -108,15 +89,11 @@
108
89
 
109
90
  group_named = ('?' . group_name );
110
91
 
111
- group_name_backref = 'k' . (('<' . group_name_id_ab? . group_level? '>') |
112
- ("'" . group_name_id_sq? . group_level? "'"));
113
- group_name_call = 'g' . (('<' . group_name_id_ab? . group_level? '>') |
114
- ("'" . group_name_id_sq? . group_level? "'"));
92
+ group_ref_body = (('<' . (group_name_id_ab? | group_number) . group_level? '>') |
93
+ ("'" . (group_name_id_sq? | group_number) . group_level? "'"));
115
94
 
116
- group_number_backref = 'k' . (('<' . group_number . group_level? '>') |
117
- ("'" . group_number . group_level? "'"));
118
- group_number_call = 'g' . (('<' . ((group_number . group_level?) | '0') '>') |
119
- ("'" . ((group_number . group_level?) | '0') "'"));
95
+ group_ref = 'k' . group_ref_body;
96
+ group_call = 'g' . group_ref_body;
120
97
 
121
98
  group_type = group_atomic | group_passive | group_absence | group_named;
122
99
 
@@ -139,20 +116,21 @@
139
116
  keep_mark | sequence_char;
140
117
 
141
118
  # escapes that also work within a character set
142
- set_escape = backslash | brackets | escaped_ascii | property_char |
119
+ set_escape = backslash | brackets | escaped_ascii |
120
+ octal_sequence | property_char |
143
121
  sequence_char | single_codepoint_char_type;
144
122
 
145
123
 
146
124
  # EOF error, used where it can be detected
147
125
  action premature_end_error {
148
126
  text = copy(data, ts ? ts-1 : 0, -1)
149
- raise PrematureEndError.new( text )
127
+ raise PrematureEndError.new(text)
150
128
  }
151
129
 
152
130
  # Invalid sequence error, used from sequences, like escapes and sets
153
131
  action invalid_sequence_error {
154
132
  text = copy(data, ts ? ts-1 : 0, -1)
155
- validation_error(:sequence, 'sequence', text)
133
+ raise ValidationError.for(:sequence, 'sequence', text)
156
134
  }
157
135
 
158
136
  # group (nesting) and set open/close actions
@@ -175,8 +153,8 @@
175
153
  };
176
154
 
177
155
  '-]' @set_closed { # special case, emits two tokens
178
- emit(:literal, :literal, copy(data, ts, te-1))
179
- emit(:set, :close, copy(data, ts+1, te))
156
+ emit(:literal, :literal, '-')
157
+ emit(:set, :close, ']')
180
158
  if in_set?
181
159
  fret;
182
160
  else
@@ -190,28 +168,27 @@
190
168
  };
191
169
 
192
170
  '^' {
193
- text = copy(data, ts, te)
194
- if tokens.last[1] == :open
195
- emit(:set, :negate, text)
171
+ if prev_token[1] == :open
172
+ emit(:set, :negate, '^')
196
173
  else
197
- emit(:literal, :literal, text)
174
+ emit(:literal, :literal, '^')
198
175
  end
199
176
  };
200
177
 
201
178
  '-' {
202
- text = copy(data, ts, te)
203
- # ranges cant start with a subset or intersection/negation/range operator
204
- if tokens.last[0] == :set
205
- emit(:literal, :literal, text)
179
+ # ranges cant start with the opening bracket, a subset, or
180
+ # intersection/negation/range operators
181
+ if prev_token[0] == :set
182
+ emit(:literal, :literal, '-')
206
183
  else
207
- emit(:set, :range, text)
184
+ emit(:set, :range, '-')
208
185
  end
209
186
  };
210
187
 
211
188
  # Unlike ranges, intersections can start or end at set boundaries, whereupon
212
189
  # they match nothing: r = /[a&&]/; [r =~ ?a, r =~ ?&] # => [nil, nil]
213
190
  '&&' {
214
- emit(:set, :intersection, copy(data, ts, te))
191
+ emit(:set, :intersection, '&&')
215
192
  };
216
193
 
217
194
  backslash {
@@ -219,31 +196,27 @@
219
196
  };
220
197
 
221
198
  set_open >(open_bracket, 1) >set_opened {
222
- emit(:set, :open, copy(data, ts, te))
199
+ emit(:set, :open, '[')
223
200
  fcall character_set;
224
201
  };
225
202
 
226
- class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error) {
203
+ class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error) {
227
204
  text = copy(data, ts, te)
228
205
 
229
206
  type = :posixclass
230
207
  class_name = text[2..-3]
231
- if class_name[0].chr == '^'
208
+ if class_name[0] == '^'
232
209
  class_name = class_name[1..-1]
233
210
  type = :nonposixclass
234
211
  end
235
212
 
213
+ unless self.class.posix_classes.include?(class_name)
214
+ raise ValidationError.for(:posix_class, text)
215
+ end
216
+
236
217
  emit(type, class_name.to_sym, text)
237
218
  };
238
219
 
239
- # These are not supported in ruby at the moment. Enable them if they are.
240
- # collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
241
- # emit(:set, :collation, copy(data, ts, te))
242
- # };
243
- # character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
244
- # emit(:set, :equivalent, copy(data, ts, te))
245
- # };
246
-
247
220
  meta_char > (set_meta, 1) {
248
221
  emit(:literal, :literal, copy(data, ts, te))
249
222
  };
@@ -257,12 +230,22 @@
257
230
  # set escapes scanner
258
231
  # --------------------------------------------------------------------------
259
232
  set_escape_sequence := |*
233
+ # Special case: in sets, octal sequences have higher priority than backrefs
234
+ octal_sequence {
235
+ emit(:escape, :octal, copy(data, ts-1, te))
236
+ fret;
237
+ };
238
+
239
+ # Scan all other escapes that work in sets with the generic escape scanner
260
240
  set_escape > (escaped_set_alpha, 2) {
261
241
  fhold;
262
242
  fnext character_set;
263
243
  fcall escape_sequence;
264
244
  };
265
245
 
246
+ # Treat all remaining escapes - those not supported in sets - as literal.
247
+ # (This currently includes \^, \-, \&, \:, although these could potentially
248
+ # be meta chars when not escaped, depending on their position in the set.)
266
249
  any > (escaped_set_alpha, 1) {
267
250
  emit(:escape, :literal, copy(data, ts-1, te))
268
251
  fret;
@@ -284,6 +267,13 @@
284
267
  fret;
285
268
  };
286
269
 
270
+ [8-9] . [0-9] { # special case, emits two tokens
271
+ text = copy(data, ts-1, te)
272
+ emit(:escape, :literal, text[0, 2])
273
+ emit(:literal, :literal, text[2])
274
+ fret;
275
+ };
276
+
287
277
  meta_char {
288
278
  case text = copy(data, ts-1, te)
289
279
  when '\.'; emit(:escape, :dot, text)
@@ -323,7 +313,7 @@
323
313
 
324
314
  codepoint_sequence > (escaped_alpha, 6) $eof(premature_end_error) {
325
315
  text = copy(data, ts-1, te)
326
- if text[2].chr == '{'
316
+ if text[2] == '{'
327
317
  emit(:escape, :codepoint_list, text)
328
318
  else
329
319
  emit(:escape, :codepoint, text)
@@ -374,6 +364,7 @@
374
364
  conditional_expression := |*
375
365
  group_lookup . ')' {
376
366
  text = copy(data, ts, te-1)
367
+ text =~ /[^0]/ or raise ValidationError.for(:backref, 'condition', 'invalid ref ID')
377
368
  emit(:conditional, :condition, text)
378
369
  emit(:conditional, :condition_close, ')')
379
370
  };
@@ -419,12 +410,12 @@
419
410
 
420
411
  backslash . anchor_char > (backslashed, 3) {
421
412
  case text = copy(data, ts, te)
422
- when '\\A'; emit(:anchor, :bos, text)
423
- when '\\z'; emit(:anchor, :eos, text)
424
- when '\\Z'; emit(:anchor, :eos_ob_eol, text)
425
- when '\\b'; emit(:anchor, :word_boundary, text)
426
- when '\\B'; emit(:anchor, :nonword_boundary, text)
427
- when '\\G'; emit(:anchor, :match_start, text)
413
+ when '\A'; emit(:anchor, :bos, text)
414
+ when '\z'; emit(:anchor, :eos, text)
415
+ when '\Z'; emit(:anchor, :eos_ob_eol, text)
416
+ when '\b'; emit(:anchor, :word_boundary, text)
417
+ when '\B'; emit(:anchor, :nonword_boundary, text)
418
+ when '\G'; emit(:anchor, :match_start, text)
428
419
  end
429
420
  };
430
421
 
@@ -456,10 +447,9 @@
456
447
 
457
448
  # (?#...) comments: parsed as a single expression, without introducing a
458
449
  # new nesting level. Comments may not include parentheses, escaped or not.
459
- # special case for close, action performed on all transitions to get the
460
- # correct closing count.
450
+ # special case for close to get the correct closing count.
461
451
  # ------------------------------------------------------------------------
462
- group_open . group_comment $group_closed {
452
+ (group_open . group_comment) @group_closed {
463
453
  emit(:group, :comment, copy(data, ts, te))
464
454
  };
465
455
 
@@ -474,10 +464,10 @@
474
464
  #
475
465
  # (?imxdau-imx:subexp) option on/off for subexp
476
466
  # ------------------------------------------------------------------------
477
- group_open . group_options >group_opened {
467
+ (group_open . group_options) >group_opened {
478
468
  text = copy(data, ts, te)
479
469
  if text[2..-1] =~ /([^\-mixdau:]|^$)|-.*([dau])/
480
- raise InvalidGroupOption.new($1 || "-#{$2}", text)
470
+ raise ValidationError.for(:group_option, $1 || "-#{$2}", text)
481
471
  end
482
472
  emit_options(text)
483
473
  };
@@ -488,7 +478,7 @@
488
478
  # (?<=subexp) look-behind
489
479
  # (?<!subexp) negative look-behind
490
480
  # ------------------------------------------------------------------------
491
- group_open . assertion_type >group_opened {
481
+ (group_open . assertion_type) >group_opened {
492
482
  case text = copy(data, ts, te)
493
483
  when '(?='; emit(:assertion, :lookahead, text)
494
484
  when '(?!'; emit(:assertion, :nlookahead, text)
@@ -505,14 +495,14 @@
505
495
  # (?'name'subexp) named group (single quoted version)
506
496
  # (subexp) captured group
507
497
  # ------------------------------------------------------------------------
508
- group_open . group_type >group_opened {
498
+ (group_open . group_type) >group_opened {
509
499
  case text = copy(data, ts, te)
510
500
  when '(?:'; emit(:group, :passive, text)
511
501
  when '(?>'; emit(:group, :atomic, text)
512
502
  when '(?~'; emit(:group, :absence, text)
513
503
 
514
504
  when /^\(\?(?:<>|'')/
515
- validation_error(:group, 'named group', 'name is empty')
505
+ raise ValidationError.for(:group, 'named group', 'name is empty')
516
506
 
517
507
  when /^\(\?<[^>]+>/
518
508
  emit(:group, :named_ab, text)
@@ -531,50 +521,52 @@
531
521
  group_close @group_closed {
532
522
  if conditional_stack.last == group_depth + 1
533
523
  conditional_stack.pop
534
- emit(:conditional, :close, copy(data, ts, te))
535
- else
524
+ emit(:conditional, :close, ')')
525
+ elsif group_depth >= 0
536
526
  if spacing_stack.length > 1 &&
537
527
  spacing_stack.last[:depth] == group_depth + 1
538
528
  spacing_stack.pop
539
529
  self.free_spacing = spacing_stack.last[:free_spacing]
540
530
  end
541
531
 
542
- emit(:group, :close, copy(data, ts, te))
532
+ emit(:group, :close, ')')
533
+ else
534
+ raise ValidationError.for(:group, 'group', 'unmatched close parenthesis')
543
535
  end
544
536
  };
545
537
 
546
538
 
547
539
  # Group backreference, named and numbered
548
540
  # ------------------------------------------------------------------------
549
- backslash . (group_name_backref | group_number_backref) > (backslashed, 4) {
541
+ backslash . (group_ref) > (backslashed, 4) {
550
542
  case text = copy(data, ts, te)
551
- when /^\\k(<>|'')/
552
- validation_error(:backref, 'backreference', 'ref ID is empty')
553
- when /^\\k(.)[^\p{digit}\-][^+\-]*\D$/
543
+ when /^\\k(.)[^0-9\-][^+\-]*['>]$/
554
544
  emit(:backref, $1 == '<' ? :name_ref_ab : :name_ref_sq, text)
555
- when /^\\k(.)\d+\D$/
545
+ when /^\\k(.)0*[1-9]\d*['>]$/
556
546
  emit(:backref, $1 == '<' ? :number_ref_ab : :number_ref_sq, text)
557
- when /^\\k(.)-\d+\D$/
547
+ when /^\\k(.)-0*[1-9]\d*['>]$/
558
548
  emit(:backref, $1 == '<' ? :number_rel_ref_ab : :number_rel_ref_sq, text)
559
- when /^\\k(.)[^\p{digit}\-].*[+\-]\d+\D$/
549
+ when /^\\k(.)[^0-9\-].*[+\-]\d+['>]$/
560
550
  emit(:backref, $1 == '<' ? :name_recursion_ref_ab : :name_recursion_ref_sq, text)
561
- when /^\\k(.)-?\d+[+\-]\d+\D$/
551
+ when /^\\k(.)-?0*[1-9]\d*[+\-]\d+['>]$/
562
552
  emit(:backref, $1 == '<' ? :number_recursion_ref_ab : :number_recursion_ref_sq, text)
553
+ else
554
+ raise ValidationError.for(:backref, 'backreference', 'invalid ref ID')
563
555
  end
564
556
  };
565
557
 
566
558
  # Group call, named and numbered
567
559
  # ------------------------------------------------------------------------
568
- backslash . (group_name_call | group_number_call) > (backslashed, 4) {
560
+ backslash . (group_call) > (backslashed, 4) {
569
561
  case text = copy(data, ts, te)
570
- when /^\\g(<>|'')/
571
- validation_error(:backref, 'subexpression call', 'ref ID is empty')
572
- when /^\\g(.)[^\p{digit}+\->][^+\-]*/
562
+ when /^\\g(.)[^0-9+\-].*['>]$/
573
563
  emit(:backref, $1 == '<' ? :name_call_ab : :name_call_sq, text)
574
- when /^\\g(.)\d+\D$/
564
+ when /^\\g(.)(?:0|0*[1-9]\d*)['>]$/
575
565
  emit(:backref, $1 == '<' ? :number_call_ab : :number_call_sq, text)
576
- when /^\\g(.)[+-]\d+/
566
+ when /^\\g(.)[+-]0*[1-9]\d*/
577
567
  emit(:backref, $1 == '<' ? :number_rel_call_ab : :number_rel_call_sq, text)
568
+ else
569
+ raise ValidationError.for(:backref, 'subexpression call', 'invalid ref ID')
578
570
  end
579
571
  };
580
572
 
@@ -605,7 +597,7 @@
605
597
  end
606
598
  };
607
599
 
608
- quantifier_interval {
600
+ quantifier_interval {
609
601
  emit(:quantifier, :interval, copy(data, ts, te))
610
602
  };
611
603
 
@@ -648,87 +640,35 @@
648
640
  *|;
649
641
  }%%
650
642
 
651
- # THIS IS A GENERATED FILE, DO NOT EDIT DIRECTLY
652
- # This file was generated from lib/regexp_parser/scanner/scanner.rl
653
-
654
- require 'regexp_parser/error'
643
+ require_relative 'scanner/errors/scanner_error'
644
+ require_relative 'scanner/errors/premature_end_error'
645
+ require_relative 'scanner/errors/validation_error'
655
646
 
656
647
  class Regexp::Scanner
657
- # General scanner error (catch all)
658
- class ScannerError < Regexp::Parser::Error; end
659
-
660
- # Base for all scanner validation errors
661
- class ValidationError < Regexp::Parser::Error
662
- def initialize(reason)
663
- super reason
664
- end
665
- end
666
-
667
- # Unexpected end of pattern
668
- class PrematureEndError < ScannerError
669
- def initialize(where = '')
670
- super "Premature end of pattern at #{where}"
671
- end
672
- end
673
-
674
- # Invalid sequence format. Used for escape sequences, mainly.
675
- class InvalidSequenceError < ValidationError
676
- def initialize(what = 'sequence', where = '')
677
- super "Invalid #{what} at #{where}"
678
- end
679
- end
680
-
681
- # Invalid group. Used for named groups.
682
- class InvalidGroupError < ValidationError
683
- def initialize(what, reason)
684
- super "Invalid #{what}, #{reason}."
685
- end
686
- end
687
-
688
- # Invalid groupOption. Used for inline options.
689
- class InvalidGroupOption < ValidationError
690
- def initialize(option, text)
691
- super "Invalid group option #{option} in #{text}"
692
- end
693
- end
694
-
695
- # Invalid back reference. Used for name a number refs/calls.
696
- class InvalidBackrefError < ValidationError
697
- def initialize(what, reason)
698
- super "Invalid back reference #{what}, #{reason}"
699
- end
700
- end
701
-
702
- # The property name was not recognized by the scanner.
703
- class UnknownUnicodePropertyError < ValidationError
704
- def initialize(name)
705
- super "Unknown unicode character property name #{name}"
706
- end
707
- end
708
-
709
648
  # Scans the given regular expression text, or Regexp object and collects the
710
649
  # emitted token into an array that gets returned at the end. If a block is
711
650
  # given, it gets called for each emitted token.
712
651
  #
713
652
  # This method may raise errors if a syntax error is encountered.
714
653
  # --------------------------------------------------------------------------
715
- def self.scan(input_object, options: nil, &block)
716
- new.scan(input_object, options: options, &block)
654
+ def self.scan(input_object, options: nil, collect_tokens: true, &block)
655
+ new.scan(input_object, options: options, collect_tokens: collect_tokens, &block)
717
656
  end
718
657
 
719
- def scan(input_object, options: nil, &block)
720
- self.literal = nil
658
+ def scan(input_object, options: nil, collect_tokens: true, &block)
659
+ self.collect_tokens = collect_tokens
660
+ self.literal_run = nil
721
661
  stack = []
722
662
 
723
663
  input = input_object.is_a?(Regexp) ? input_object.source : input_object
724
664
  self.free_spacing = free_spacing?(input_object, options)
725
665
  self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
726
666
 
727
- data = input.unpack("c*") if input.is_a?(String)
667
+ data = input.unpack("c*")
728
668
  eof = data.length
729
669
 
730
670
  self.tokens = []
731
- self.block = block_given? ? block : nil
671
+ self.block = block
732
672
 
733
673
  self.set_depth = 0
734
674
  self.group_depth = 0
@@ -753,46 +693,64 @@ class Regexp::Scanner
753
693
  "[#{set_depth}]") if in_set?
754
694
 
755
695
  # when the entire expression is a literal run
756
- emit_literal if literal
696
+ emit_literal if literal_run
757
697
 
758
698
  tokens
759
699
  end
760
700
 
761
701
  # lazy-load property maps when first needed
762
- require 'yaml'
763
-
764
702
  def self.short_prop_map
765
- @short_prop_map ||= YAML.load_file("#{__dir__}/scanner/properties/short.yml")
703
+ @short_prop_map ||= parse_prop_map('short')
766
704
  end
767
705
 
768
706
  def self.long_prop_map
769
- @long_prop_map ||= YAML.load_file("#{__dir__}/scanner/properties/long.yml")
707
+ @long_prop_map ||= parse_prop_map('long')
708
+ end
709
+
710
+ def self.parse_prop_map(name)
711
+ File.read("#{__dir__}/scanner/properties/#{name}.csv").scan(/(.+),(.+)/).to_h
712
+ end
713
+
714
+ def self.posix_classes
715
+ %w[alnum alpha ascii blank cntrl digit graph
716
+ lower print punct space upper word xdigit]
770
717
  end
771
718
 
772
719
  # Emits an array with the details of the scanned pattern
773
720
  def emit(type, token, text)
774
721
  #puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
775
722
 
776
- emit_literal if literal
723
+ emit_literal if literal_run
777
724
 
778
725
  # Ragel runs with byte-based indices (ts, te). These are of little value to
779
726
  # end-users, so we keep track of char-based indices and emit those instead.
780
727
  ts_char_pos = char_pos
781
728
  te_char_pos = char_pos + text.length
782
729
 
783
- if block
784
- block.call type, token, text, ts_char_pos, te_char_pos
785
- end
730
+ tok = [type, token, text, ts_char_pos, te_char_pos]
786
731
 
787
- tokens << [type, token, text, ts_char_pos, te_char_pos]
732
+ self.prev_token = tok
788
733
 
789
734
  self.char_pos = te_char_pos
735
+
736
+ if block
737
+ block.call type, token, text, ts_char_pos, te_char_pos
738
+ # TODO: in v3.0.0, remove `collect_tokens:` kwarg and only collect if no block given
739
+ tokens << tok if collect_tokens
740
+ elsif collect_tokens
741
+ tokens << tok
742
+ end
790
743
  end
791
744
 
745
+ attr_accessor :literal_run # only public for #||= to work on ruby <= 2.5
746
+
792
747
  private
793
748
 
794
- attr_accessor :tokens, :literal, :block, :free_spacing, :spacing_stack,
795
- :group_depth, :set_depth, :conditional_stack, :char_pos
749
+ attr_accessor :block,
750
+ :collect_tokens, :tokens, :prev_token,
751
+ :free_spacing, :spacing_stack,
752
+ :group_depth, :set_depth, :conditional_stack,
753
+ :char_pos
796
754
 
797
755
  def free_spacing?(input_object, options)
798
756
  if options && !input_object.is_a?(String)
@@ -822,14 +780,13 @@ class Regexp::Scanner
822
780
  # Appends one or more characters to the literal buffer, to be emitted later
823
781
  # by a call to emit_literal.
824
782
  def append_literal(data, ts, te)
825
- self.literal = literal || []
826
- literal << copy(data, ts, te)
783
+ (self.literal_run ||= []) << copy(data, ts, te)
827
784
  end
828
785
 
829
786
  # Emits the literal run collected by calls to the append_literal method.
830
787
  def emit_literal
831
- text = literal.join
832
- self.literal = nil
788
+ text = literal_run.join
789
+ self.literal_run = nil
833
790
  emit(:literal, :literal, text)
834
791
  end
835
792
 
@@ -864,23 +821,8 @@ class Regexp::Scanner
864
821
 
865
822
  def emit_meta_control_sequence(data, ts, te, token)
866
823
  if data.last < 0x00 || data.last > 0x7F
867
- validation_error(:sequence, 'escape', token.to_s)
824
+ raise ValidationError.for(:sequence, 'escape', token.to_s)
868
825
  end
869
826
  emit(:escape, token, copy(data, ts-1, te))
870
827
  end
871
-
872
- # Centralizes and unifies the handling of validation related
873
- # errors.
874
- def validation_error(type, what, reason)
875
- case type
876
- when :group
877
- error = InvalidGroupError.new(what, reason)
878
- when :backref
879
- error = InvalidBackrefError.new(what, reason)
880
- when :sequence
881
- error = InvalidSequenceError.new(what, reason)
882
- end
883
-
884
- raise error # unless @@config.validation_ignore
885
- end
886
828
  end # module Regexp::Scanner