regexp_parser 2.6.0 → 2.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. checksums.yaml +4 -4
  2. data/Gemfile +5 -5
  3. data/LICENSE +1 -1
  4. data/lib/regexp_parser/expression/base.rb +0 -7
  5. data/lib/regexp_parser/expression/classes/alternation.rb +1 -1
  6. data/lib/regexp_parser/expression/classes/backreference.rb +5 -10
  7. data/lib/regexp_parser/expression/classes/character_set/range.rb +2 -7
  8. data/lib/regexp_parser/expression/classes/character_set.rb +4 -8
  9. data/lib/regexp_parser/expression/classes/conditional.rb +2 -20
  10. data/lib/regexp_parser/expression/classes/escape_sequence.rb +21 -91
  11. data/lib/regexp_parser/expression/classes/free_space.rb +3 -1
  12. data/lib/regexp_parser/expression/classes/group.rb +0 -22
  13. data/lib/regexp_parser/expression/classes/keep.rb +1 -1
  14. data/lib/regexp_parser/expression/classes/posix_class.rb +5 -5
  15. data/lib/regexp_parser/expression/classes/unicode_property.rb +11 -11
  16. data/lib/regexp_parser/expression/methods/construct.rb +2 -4
  17. data/lib/regexp_parser/expression/methods/escape_sequence_char.rb +5 -0
  18. data/lib/regexp_parser/expression/methods/escape_sequence_codepoint.rb +68 -0
  19. data/lib/regexp_parser/expression/methods/match_length.rb +8 -4
  20. data/lib/regexp_parser/expression/methods/negative.rb +20 -0
  21. data/lib/regexp_parser/expression/methods/parts.rb +23 -0
  22. data/lib/regexp_parser/expression/methods/printing.rb +26 -0
  23. data/lib/regexp_parser/expression/methods/referenced_expressions.rb +28 -0
  24. data/lib/regexp_parser/expression/methods/tests.rb +40 -3
  25. data/lib/regexp_parser/expression/methods/traverse.rb +35 -19
  26. data/lib/regexp_parser/expression/quantifier.rb +30 -17
  27. data/lib/regexp_parser/expression/sequence.rb +5 -10
  28. data/lib/regexp_parser/expression/sequence_operation.rb +4 -9
  29. data/lib/regexp_parser/expression/shared.rb +37 -20
  30. data/lib/regexp_parser/expression/subexpression.rb +20 -15
  31. data/lib/regexp_parser/expression.rb +37 -31
  32. data/lib/regexp_parser/lexer.rb +76 -36
  33. data/lib/regexp_parser/parser.rb +107 -103
  34. data/lib/regexp_parser/scanner/errors/premature_end_error.rb +8 -0
  35. data/lib/regexp_parser/scanner/errors/scanner_error.rb +6 -0
  36. data/lib/regexp_parser/scanner/errors/validation_error.rb +63 -0
  37. data/lib/regexp_parser/scanner/properties/long.csv +29 -0
  38. data/lib/regexp_parser/scanner/properties/short.csv +3 -0
  39. data/lib/regexp_parser/scanner/property.rl +2 -2
  40. data/lib/regexp_parser/scanner/scanner.rl +101 -172
  41. data/lib/regexp_parser/scanner.rb +1171 -1365
  42. data/lib/regexp_parser/syntax/token/backreference.rb +3 -0
  43. data/lib/regexp_parser/syntax/token/character_set.rb +3 -0
  44. data/lib/regexp_parser/syntax/token/escape.rb +3 -1
  45. data/lib/regexp_parser/syntax/token/meta.rb +9 -2
  46. data/lib/regexp_parser/syntax/token/unicode_property.rb +35 -1
  47. data/lib/regexp_parser/syntax/token/virtual.rb +11 -0
  48. data/lib/regexp_parser/syntax/token.rb +13 -13
  49. data/lib/regexp_parser/syntax/version_lookup.rb +0 -8
  50. data/lib/regexp_parser/syntax/versions.rb +3 -1
  51. data/lib/regexp_parser/syntax.rb +1 -1
  52. data/lib/regexp_parser/version.rb +1 -1
  53. data/lib/regexp_parser.rb +6 -6
  54. data/regexp_parser.gemspec +5 -5
  55. metadata +17 -8
  56. data/CHANGELOG.md +0 -601
  57. data/README.md +0 -503
@@ -7,6 +7,8 @@ age=12.0,age=12.0
7
7
  age=12.1,age=12.1
8
8
  age=13.0,age=13.0
9
9
  age=14.0,age=14.0
10
+ age=15.0,age=15.0
11
+ age=15.1,age=15.1
10
12
  age=2.0,age=2.0
11
13
  age=2.1,age=2.1
12
14
  age=3.0,age=3.0
@@ -97,6 +99,7 @@ emojimodifierbase,emoji_modifier_base
97
99
  emojipresentation,emoji_presentation
98
100
  enclosingmark,enclosing_mark
99
101
  ethiopic,ethiopic
102
+ extendedpictographic,extended_pictographic
100
103
  extender,extender
101
104
  finalpunctuation,final_punctuation
102
105
  format,format
@@ -106,6 +109,19 @@ gothic,gothic
106
109
  grantha,grantha
107
110
  graph,graph
108
111
  graphemebase,grapheme_base
112
+ graphemeclusterbreak=control,grapheme_cluster_break=control
113
+ graphemeclusterbreak=cr,grapheme_cluster_break=cr
114
+ graphemeclusterbreak=extend,grapheme_cluster_break=extend
115
+ graphemeclusterbreak=l,grapheme_cluster_break=l
116
+ graphemeclusterbreak=lf,grapheme_cluster_break=lf
117
+ graphemeclusterbreak=lv,grapheme_cluster_break=lv
118
+ graphemeclusterbreak=lvt,grapheme_cluster_break=lvt
119
+ graphemeclusterbreak=prepend,grapheme_cluster_break=prepend
120
+ graphemeclusterbreak=regionalindicator,grapheme_cluster_break=regional_indicator
121
+ graphemeclusterbreak=spacingmark,grapheme_cluster_break=spacingmark
122
+ graphemeclusterbreak=t,grapheme_cluster_break=t
123
+ graphemeclusterbreak=v,grapheme_cluster_break=v
124
+ graphemeclusterbreak=zwj,grapheme_cluster_break=zwj
109
125
  graphemeextend,grapheme_extend
110
126
  graphemelink,grapheme_link
111
127
  greek,greek
@@ -121,11 +137,14 @@ hebrew,hebrew
121
137
  hexdigit,hex_digit
122
138
  hiragana,hiragana
123
139
  hyphen,hyphen
140
+ idcompatmathcontinue,id_compat_math_continue
141
+ idcompatmathstart,id_compat_math_start
124
142
  idcontinue,id_continue
125
143
  ideographic,ideographic
126
144
  idsbinaryoperator,ids_binary_operator
127
145
  idstart,id_start
128
146
  idstrinaryoperator,ids_trinary_operator
147
+ idsunaryoperator,ids_unary_operator
129
148
  imperialaramaic,imperial_aramaic
130
149
  inadlam,in_adlam
131
150
  inaegeannumbers,in_aegean_numbers
@@ -139,6 +158,7 @@ inancientsymbols,in_ancient_symbols
139
158
  inarabic,in_arabic
140
159
  inarabicextendeda,in_arabic_extended_a
141
160
  inarabicextendedb,in_arabic_extended_b
161
+ inarabicextendedc,in_arabic_extended_c
142
162
  inarabicmathematicalalphabeticsymbols,in_arabic_mathematical_alphabetic_symbols
143
163
  inarabicpresentationformsa,in_arabic_presentation_forms_a
144
164
  inarabicpresentationformsb,in_arabic_presentation_forms_b
@@ -186,6 +206,8 @@ incjkunifiedideographsextensiond,in_cjk_unified_ideographs_extension_d
186
206
  incjkunifiedideographsextensione,in_cjk_unified_ideographs_extension_e
187
207
  incjkunifiedideographsextensionf,in_cjk_unified_ideographs_extension_f
188
208
  incjkunifiedideographsextensiong,in_cjk_unified_ideographs_extension_g
209
+ incjkunifiedideographsextensionh,in_cjk_unified_ideographs_extension_h
210
+ incjkunifiedideographsextensioni,in_cjk_unified_ideographs_extension_i
189
211
  incombiningdiacriticalmarks,in_combining_diacritical_marks
190
212
  incombiningdiacriticalmarksextended,in_combining_diacritical_marks_extended
191
213
  incombiningdiacriticalmarksforsymbols,in_combining_diacritical_marks_for_symbols
@@ -205,10 +227,12 @@ incyrillic,in_cyrillic
205
227
  incyrillicextendeda,in_cyrillic_extended_a
206
228
  incyrillicextendedb,in_cyrillic_extended_b
207
229
  incyrillicextendedc,in_cyrillic_extended_c
230
+ incyrillicextendedd,in_cyrillic_extended_d
208
231
  incyrillicsupplement,in_cyrillic_supplement
209
232
  indeseret,in_deseret
210
233
  indevanagari,in_devanagari
211
234
  indevanagariextended,in_devanagari_extended
235
+ indevanagariextendeda,in_devanagari_extended_a
212
236
  indingbats,in_dingbats
213
237
  indivesakuru,in_dives_akuru
214
238
  indogra,in_dogra
@@ -268,6 +292,7 @@ inipaextensions,in_ipa_extensions
268
292
  initialpunctuation,initial_punctuation
269
293
  injavanese,in_javanese
270
294
  inkaithi,in_kaithi
295
+ inkaktoviknumerals,in_kaktovik_numerals
271
296
  inkanaextendeda,in_kana_extended_a
272
297
  inkanaextendedb,in_kana_extended_b
273
298
  inkanasupplement,in_kana_supplement
@@ -276,6 +301,7 @@ inkangxiradicals,in_kangxi_radicals
276
301
  inkannada,in_kannada
277
302
  inkatakana,in_katakana
278
303
  inkatakanaphoneticextensions,in_katakana_phonetic_extensions
304
+ inkawi,in_kawi
279
305
  inkayahli,in_kayah_li
280
306
  inkharoshthi,in_kharoshthi
281
307
  inkhitansmallscript,in_khitan_small_script
@@ -339,6 +365,7 @@ inmyanmar,in_myanmar
339
365
  inmyanmarextendeda,in_myanmar_extended_a
340
366
  inmyanmarextendedb,in_myanmar_extended_b
341
367
  innabataean,in_nabataean
368
+ innagmundari,in_nag_mundari
342
369
  innandinagari,in_nandinagari
343
370
  innewa,in_newa
344
371
  innewtailue,in_new_tai_lue
@@ -457,6 +484,7 @@ joincontrol,join_control
457
484
  kaithi,kaithi
458
485
  kannada,kannada
459
486
  katakana,katakana
487
+ kawi,kawi
460
488
  kayahli,kayah_li
461
489
  kharoshthi,kharoshthi
462
490
  khitansmallscript,khitan_small_script
@@ -503,6 +531,7 @@ mro,mro
503
531
  multani,multani
504
532
  myanmar,myanmar
505
533
  nabataean,nabataean
534
+ nagmundari,nag_mundari
506
535
  nandinagari,nandinagari
507
536
  newa,newa
508
537
  newline,newline
@@ -57,6 +57,7 @@ emod,emoji_modifier
57
57
  epres,emoji_presentation
58
58
  ethi,ethiopic
59
59
  ext,extender
60
+ extpict,extended_pictographic
60
61
  geor,georgian
61
62
  glag,glagolitic
62
63
  gong,gunjala_gondi
@@ -85,6 +86,7 @@ ideo,ideographic
85
86
  ids,id_start
86
87
  idsb,ids_binary_operator
87
88
  idst,ids_trinary_operator
89
+ idsu,ids_unary_operator
88
90
  ital,old_italic
89
91
  java,javanese
90
92
  joinc,join_control
@@ -133,6 +135,7 @@ mtei,meetei_mayek
133
135
  mult,multani
134
136
  mymr,myanmar
135
137
  n,number
138
+ nagm,nag_mundari
136
139
  nand,nandinagari
137
140
  narb,old_north_arabian
138
141
  nbat,nabataean
@@ -17,10 +17,10 @@
17
17
  text = copy(data, ts-1, te)
18
18
  type = (text[1] == 'P') ^ (text[3] == '^') ? :nonproperty : :property
19
19
 
20
- name = data[ts+2..te-2].pack('c*').gsub(/[\^\s_\-]/, '').downcase
20
+ name = text[3..-2].gsub(/[\^\s_\-]/, '').downcase
21
21
 
22
22
  token = self.class.short_prop_map[name] || self.class.long_prop_map[name]
23
- validation_error(:property, name) unless token
23
+ raise ValidationError.for(:property, name) unless token
24
24
 
25
25
  self.emit(type, token.to_sym, text)
26
26
 
@@ -30,11 +30,6 @@
30
30
 
31
31
  class_posix = ('[:' . '^'? . [^\[\]]* . ':]');
32
32
 
33
-
34
- # these are not supported in ruby at the moment
35
- collating_sequence = '[.' . (alpha | [\-])+ . '.]';
36
- character_equivalent = '[=' . alpha . '=]';
37
-
38
33
  line_anchor = beginning_of_line | end_of_line;
39
34
  anchor_char = [AbBzZG];
40
35
 
@@ -59,9 +54,6 @@
59
54
  one_or_more = '+' | '+?' | '++';
60
55
 
61
56
  quantifier_greedy = '?' | '*' | '+';
62
- quantifier_reluctant = '??' | '*?' | '+?';
63
- quantifier_possessive = '?+' | '*+' | '++';
64
- quantifier_mode = '?' | '+';
65
57
 
66
58
  quantity_exact = (digit+);
67
59
  quantity_minimum = (digit+) . ',';
@@ -70,9 +62,6 @@
70
62
  quantifier_interval = range_open . ( quantity_exact | quantity_minimum |
71
63
  quantity_maximum | quantity_range ) . range_close;
72
64
 
73
- quantifiers = quantifier_greedy | quantifier_reluctant |
74
- quantifier_possessive | quantifier_interval;
75
-
76
65
  conditional = '(?(';
77
66
 
78
67
  group_comment = '?#' . [^)]* . group_close;
@@ -89,10 +78,9 @@
89
78
  # try to treat every other group head as options group, like Ruby
90
79
  group_options = '?' . ( [^!#'():<=>~]+ . ':'? ) ?;
91
80
 
92
- group_ref = [gk];
93
- group_name_id_ab = ([^0-9\->] | utf8_multibyte) . ([^>] | utf8_multibyte)*;
94
- group_name_id_sq = ([^0-9\-'] | utf8_multibyte) . ([^'] | utf8_multibyte)*;
95
- group_number = '-'? . [1-9] . [0-9]*;
81
+ group_name_id_ab = ([^!=0-9\->] | utf8_multibyte) . ([^>] | utf8_multibyte)*;
82
+ group_name_id_sq = ([^0-9\-'] | utf8_multibyte) . ([^'] | utf8_multibyte)*;
83
+ group_number = '-'? . [0-9]+;
96
84
  group_level = [+\-] . [0-9]+;
97
85
 
98
86
  group_name = ('<' . group_name_id_ab? . '>') |
@@ -101,15 +89,11 @@
101
89
 
102
90
  group_named = ('?' . group_name );
103
91
 
104
- group_name_backref = 'k' . (('<' . group_name_id_ab? . group_level? '>') |
105
- ("'" . group_name_id_sq? . group_level? "'"));
106
- group_name_call = 'g' . (('<' . group_name_id_ab? . group_level? '>') |
107
- ("'" . group_name_id_sq? . group_level? "'"));
92
+ group_ref_body = (('<' . (group_name_id_ab? | group_number) . group_level? '>') |
93
+ ("'" . (group_name_id_sq? | group_number) . group_level? "'"));
108
94
 
109
- group_number_backref = 'k' . (('<' . group_number . group_level? '>') |
110
- ("'" . group_number . group_level? "'"));
111
- group_number_call = 'g' . (('<' . ((group_number . group_level?) | '0') '>') |
112
- ("'" . ((group_number . group_level?) | '0') "'"));
95
+ group_ref = 'k' . group_ref_body;
96
+ group_call = 'g' . group_ref_body;
113
97
 
114
98
  group_type = group_atomic | group_passive | group_absence | group_named;
115
99
 
@@ -132,20 +116,21 @@
132
116
  keep_mark | sequence_char;
133
117
 
134
118
  # escapes that also work within a character set
135
- set_escape = backslash | brackets | escaped_ascii | property_char |
119
+ set_escape = backslash | brackets | escaped_ascii |
120
+ octal_sequence | property_char |
136
121
  sequence_char | single_codepoint_char_type;
137
122
 
138
123
 
139
124
  # EOF error, used where it can be detected
140
125
  action premature_end_error {
141
126
  text = copy(data, ts ? ts-1 : 0, -1)
142
- raise PrematureEndError.new( text )
127
+ raise PrematureEndError.new(text)
143
128
  }
144
129
 
145
130
  # Invalid sequence error, used from sequences, like escapes and sets
146
131
  action invalid_sequence_error {
147
132
  text = copy(data, ts ? ts-1 : 0, -1)
148
- validation_error(:sequence, 'sequence', text)
133
+ raise ValidationError.for(:sequence, 'sequence', text)
149
134
  }
150
135
 
151
136
  # group (nesting) and set open/close actions
@@ -168,8 +153,8 @@
168
153
  };
169
154
 
170
155
  '-]' @set_closed { # special case, emits two tokens
171
- emit(:literal, :literal, copy(data, ts, te-1))
172
- emit(:set, :close, copy(data, ts+1, te))
156
+ emit(:literal, :literal, '-')
157
+ emit(:set, :close, ']')
173
158
  if in_set?
174
159
  fret;
175
160
  else
@@ -183,28 +168,27 @@
183
168
  };
184
169
 
185
170
  '^' {
186
- text = copy(data, ts, te)
187
- if tokens.last[1] == :open
188
- emit(:set, :negate, text)
171
+ if prev_token[1] == :open
172
+ emit(:set, :negate, '^')
189
173
  else
190
- emit(:literal, :literal, text)
174
+ emit(:literal, :literal, '^')
191
175
  end
192
176
  };
193
177
 
194
178
  '-' {
195
- text = copy(data, ts, te)
196
- # ranges cant start with a subset or intersection/negation/range operator
197
- if tokens.last[0] == :set
198
- emit(:literal, :literal, text)
179
+ # ranges cant start with the opening bracket, a subset, or
180
+ # intersection/negation/range operators
181
+ if prev_token[0] == :set
182
+ emit(:literal, :literal, '-')
199
183
  else
200
- emit(:set, :range, text)
184
+ emit(:set, :range, '-')
201
185
  end
202
186
  };
203
187
 
204
188
  # Unlike ranges, intersections can start or end at set boundaries, whereupon
205
189
  # they match nothing: r = /[a&&]/; [r =~ ?a, r =~ ?&] # => [nil, nil]
206
190
  '&&' {
207
- emit(:set, :intersection, copy(data, ts, te))
191
+ emit(:set, :intersection, '&&')
208
192
  };
209
193
 
210
194
  backslash {
@@ -212,7 +196,7 @@
212
196
  };
213
197
 
214
198
  set_open >(open_bracket, 1) >set_opened {
215
- emit(:set, :open, copy(data, ts, te))
199
+ emit(:set, :open, '[')
216
200
  fcall character_set;
217
201
  };
218
202
 
@@ -227,20 +211,12 @@
227
211
  end
228
212
 
229
213
  unless self.class.posix_classes.include?(class_name)
230
- validation_error(:posix_class, text)
214
+ raise ValidationError.for(:posix_class, text)
231
215
  end
232
216
 
233
217
  emit(type, class_name.to_sym, text)
234
218
  };
235
219
 
236
- # These are not supported in ruby at the moment. Enable them if they are.
237
- # collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
238
- # emit(:set, :collation, copy(data, ts, te))
239
- # };
240
- # character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
241
- # emit(:set, :equivalent, copy(data, ts, te))
242
- # };
243
-
244
220
  meta_char > (set_meta, 1) {
245
221
  emit(:literal, :literal, copy(data, ts, te))
246
222
  };
@@ -254,12 +230,22 @@
254
230
  # set escapes scanner
255
231
  # --------------------------------------------------------------------------
256
232
  set_escape_sequence := |*
233
+ # Special case: in sets, octal sequences have higher priority than backrefs
234
+ octal_sequence {
235
+ emit(:escape, :octal, copy(data, ts-1, te))
236
+ fret;
237
+ };
238
+
239
+ # Scan all other escapes that work in sets with the generic escape scanner
257
240
  set_escape > (escaped_set_alpha, 2) {
258
241
  fhold;
259
242
  fnext character_set;
260
243
  fcall escape_sequence;
261
244
  };
262
245
 
246
+ # Treat all remaining escapes - those not supported in sets - as literal.
247
+ # (This currently includes \^, \-, \&, \:, although these could potentially
248
+ # be meta chars when not escaped, depending on their position in the set.)
263
249
  any > (escaped_set_alpha, 1) {
264
250
  emit(:escape, :literal, copy(data, ts-1, te))
265
251
  fret;
@@ -281,6 +267,13 @@
281
267
  fret;
282
268
  };
283
269
 
270
+ [8-9] . [0-9] { # special case, emits two tokens
271
+ text = copy(data, ts-1, te)
272
+ emit(:escape, :literal, text[0, 2])
273
+ emit(:literal, :literal, text[2])
274
+ fret;
275
+ };
276
+
284
277
  meta_char {
285
278
  case text = copy(data, ts-1, te)
286
279
  when '\.'; emit(:escape, :dot, text)
@@ -371,6 +364,7 @@
371
364
  conditional_expression := |*
372
365
  group_lookup . ')' {
373
366
  text = copy(data, ts, te-1)
367
+ text =~ /[^0]/ or raise ValidationError.for(:backref, 'condition', 'invalid ref ID')
374
368
  emit(:conditional, :condition, text)
375
369
  emit(:conditional, :condition_close, ')')
376
370
  };
@@ -453,10 +447,9 @@
453
447
 
454
448
  # (?#...) comments: parsed as a single expression, without introducing a
455
449
  # new nesting level. Comments may not include parentheses, escaped or not.
456
- # special case for close, action performed on all transitions to get the
457
- # correct closing count.
450
+ # special case for close to get the correct closing count.
458
451
  # ------------------------------------------------------------------------
459
- group_open . group_comment $group_closed {
452
+ (group_open . group_comment) @group_closed {
460
453
  emit(:group, :comment, copy(data, ts, te))
461
454
  };
462
455
 
@@ -471,10 +464,10 @@
471
464
  #
472
465
  # (?imxdau-imx:subexp) option on/off for subexp
473
466
  # ------------------------------------------------------------------------
474
- group_open . group_options >group_opened {
467
+ (group_open . group_options) >group_opened {
475
468
  text = copy(data, ts, te)
476
469
  if text[2..-1] =~ /([^\-mixdau:]|^$)|-.*([dau])/
477
- validation_error(:group_option, $1 || "-#{$2}", text)
470
+ raise ValidationError.for(:group_option, $1 || "-#{$2}", text)
478
471
  end
479
472
  emit_options(text)
480
473
  };
@@ -485,7 +478,7 @@
485
478
  # (?<=subexp) look-behind
486
479
  # (?<!subexp) negative look-behind
487
480
  # ------------------------------------------------------------------------
488
- group_open . assertion_type >group_opened {
481
+ (group_open . assertion_type) >group_opened {
489
482
  case text = copy(data, ts, te)
490
483
  when '(?='; emit(:assertion, :lookahead, text)
491
484
  when '(?!'; emit(:assertion, :nlookahead, text)
@@ -502,14 +495,14 @@
502
495
  # (?'name'subexp) named group (single quoted version)
503
496
  # (subexp) captured group
504
497
  # ------------------------------------------------------------------------
505
- group_open . group_type >group_opened {
498
+ (group_open . group_type) >group_opened {
506
499
  case text = copy(data, ts, te)
507
500
  when '(?:'; emit(:group, :passive, text)
508
501
  when '(?>'; emit(:group, :atomic, text)
509
502
  when '(?~'; emit(:group, :absence, text)
510
503
 
511
504
  when /^\(\?(?:<>|'')/
512
- validation_error(:group, 'named group', 'name is empty')
505
+ raise ValidationError.for(:group, 'named group', 'name is empty')
513
506
 
514
507
  when /^\(\?<[^>]+>/
515
508
  emit(:group, :named_ab, text)
@@ -528,50 +521,52 @@
528
521
  group_close @group_closed {
529
522
  if conditional_stack.last == group_depth + 1
530
523
  conditional_stack.pop
531
- emit(:conditional, :close, copy(data, ts, te))
532
- else
524
+ emit(:conditional, :close, ')')
525
+ elsif group_depth >= 0
533
526
  if spacing_stack.length > 1 &&
534
527
  spacing_stack.last[:depth] == group_depth + 1
535
528
  spacing_stack.pop
536
529
  self.free_spacing = spacing_stack.last[:free_spacing]
537
530
  end
538
531
 
539
- emit(:group, :close, copy(data, ts, te))
532
+ emit(:group, :close, ')')
533
+ else
534
+ raise ValidationError.for(:group, 'group', 'unmatched close parenthesis')
540
535
  end
541
536
  };
542
537
 
543
538
 
544
539
  # Group backreference, named and numbered
545
540
  # ------------------------------------------------------------------------
546
- backslash . (group_name_backref | group_number_backref) > (backslashed, 4) {
541
+ backslash . (group_ref) > (backslashed, 4) {
547
542
  case text = copy(data, ts, te)
548
- when /^\\k(<>|'')/
549
- validation_error(:backref, 'backreference', 'ref ID is empty')
550
- when /^\\k(.)[^\p{digit}\-][^+\-]*\D$/
543
+ when /^\\k(.)[^0-9\-][^+\-]*['>]$/
551
544
  emit(:backref, $1 == '<' ? :name_ref_ab : :name_ref_sq, text)
552
- when /^\\k(.)\d+\D$/
545
+ when /^\\k(.)0*[1-9]\d*['>]$/
553
546
  emit(:backref, $1 == '<' ? :number_ref_ab : :number_ref_sq, text)
554
- when /^\\k(.)-\d+\D$/
547
+ when /^\\k(.)-0*[1-9]\d*['>]$/
555
548
  emit(:backref, $1 == '<' ? :number_rel_ref_ab : :number_rel_ref_sq, text)
556
- when /^\\k(.)[^\p{digit}\-].*[+\-]\d+\D$/
549
+ when /^\\k(.)[^0-9\-].*[+\-]\d+['>]$/
557
550
  emit(:backref, $1 == '<' ? :name_recursion_ref_ab : :name_recursion_ref_sq, text)
558
- when /^\\k(.)-?\d+[+\-]\d+\D$/
551
+ when /^\\k(.)-?0*[1-9]\d*[+\-]\d+['>]$/
559
552
  emit(:backref, $1 == '<' ? :number_recursion_ref_ab : :number_recursion_ref_sq, text)
553
+ else
554
+ raise ValidationError.for(:backref, 'backreference', 'invalid ref ID')
560
555
  end
561
556
  };
562
557
 
563
558
  # Group call, named and numbered
564
559
  # ------------------------------------------------------------------------
565
- backslash . (group_name_call | group_number_call) > (backslashed, 4) {
560
+ backslash . (group_call) > (backslashed, 4) {
566
561
  case text = copy(data, ts, te)
567
- when /^\\g(<>|'')/
568
- validation_error(:backref, 'subexpression call', 'ref ID is empty')
569
- when /^\\g(.)[^\p{digit}+\->][^+\-]*/
562
+ when /^\\g(.)[^0-9+\-].*['>]$/
570
563
  emit(:backref, $1 == '<' ? :name_call_ab : :name_call_sq, text)
571
- when /^\\g(.)\d+\D$/
564
+ when /^\\g(.)(?:0|0*[1-9]\d*)['>]$/
572
565
  emit(:backref, $1 == '<' ? :number_call_ab : :number_call_sq, text)
573
- when /^\\g(.)[+-]\d+/
566
+ when /^\\g(.)[+-]0*[1-9]\d*/
574
567
  emit(:backref, $1 == '<' ? :number_rel_call_ab : :number_rel_call_sq, text)
568
+ else
569
+ raise ValidationError.for(:backref, 'subexpression call', 'invalid ref ID')
575
570
  end
576
571
  };
577
572
 
@@ -645,95 +640,35 @@
645
640
  *|;
646
641
  }%%
647
642
 
648
- # THIS IS A GENERATED FILE, DO NOT EDIT DIRECTLY
649
- # This file was generated from lib/regexp_parser/scanner/scanner.rl
650
-
651
- require 'regexp_parser/error'
643
+ require_relative 'scanner/errors/scanner_error'
644
+ require_relative 'scanner/errors/premature_end_error'
645
+ require_relative 'scanner/errors/validation_error'
652
646
 
653
647
  class Regexp::Scanner
654
- # General scanner error (catch all)
655
- class ScannerError < Regexp::Parser::Error; end
656
-
657
- # Base for all scanner validation errors
658
- class ValidationError < Regexp::Parser::Error
659
- def initialize(reason)
660
- super reason
661
- end
662
- end
663
-
664
- # Unexpected end of pattern
665
- class PrematureEndError < ScannerError
666
- def initialize(where = '')
667
- super "Premature end of pattern at #{where}"
668
- end
669
- end
670
-
671
- # Invalid sequence format. Used for escape sequences, mainly.
672
- class InvalidSequenceError < ValidationError
673
- def initialize(what = 'sequence', where = '')
674
- super "Invalid #{what} at #{where}"
675
- end
676
- end
677
-
678
- # Invalid group. Used for named groups.
679
- class InvalidGroupError < ValidationError
680
- def initialize(what, reason)
681
- super "Invalid #{what}, #{reason}."
682
- end
683
- end
684
-
685
- # Invalid groupOption. Used for inline options.
686
- # TODO: should become InvalidGroupOptionError in v3.0.0 for consistency
687
- class InvalidGroupOption < ValidationError
688
- def initialize(option, text)
689
- super "Invalid group option #{option} in #{text}"
690
- end
691
- end
692
-
693
- # Invalid back reference. Used for name a number refs/calls.
694
- class InvalidBackrefError < ValidationError
695
- def initialize(what, reason)
696
- super "Invalid back reference #{what}, #{reason}"
697
- end
698
- end
699
-
700
- # The property name was not recognized by the scanner.
701
- class UnknownUnicodePropertyError < ValidationError
702
- def initialize(name)
703
- super "Unknown unicode character property name #{name}"
704
- end
705
- end
706
-
707
- # The POSIX class name was not recognized by the scanner.
708
- class UnknownPosixClassError < ValidationError
709
- def initialize(text)
710
- super "Unknown POSIX class #{text}"
711
- end
712
- end
713
-
714
648
  # Scans the given regular expression text, or Regexp object and collects the
715
649
  # emitted token into an array that gets returned at the end. If a block is
716
650
  # given, it gets called for each emitted token.
717
651
  #
718
652
  # This method may raise errors if a syntax error is encountered.
719
653
  # --------------------------------------------------------------------------
720
- def self.scan(input_object, options: nil, &block)
721
- new.scan(input_object, options: options, &block)
654
+ def self.scan(input_object, options: nil, collect_tokens: true, &block)
655
+ new.scan(input_object, options: options, collect_tokens: collect_tokens, &block)
722
656
  end
723
657
 
724
- def scan(input_object, options: nil, &block)
725
- self.literal = nil
658
+ def scan(input_object, options: nil, collect_tokens: true, &block)
659
+ self.collect_tokens = collect_tokens
660
+ self.literal_run = nil
726
661
  stack = []
727
662
 
728
663
  input = input_object.is_a?(Regexp) ? input_object.source : input_object
729
664
  self.free_spacing = free_spacing?(input_object, options)
730
665
  self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
731
666
 
732
- data = input.unpack("c*") if input.is_a?(String)
667
+ data = input.unpack("c*")
733
668
  eof = data.length
734
669
 
735
670
  self.tokens = []
736
- self.block = block_given? ? block : nil
671
+ self.block = block
737
672
 
738
673
  self.set_depth = 0
739
674
  self.group_depth = 0
@@ -758,7 +693,7 @@ class Regexp::Scanner
758
693
  "[#{set_depth}]") if in_set?
759
694
 
760
695
  # when the entire expression is a literal run
761
- emit_literal if literal
696
+ emit_literal if literal_run
762
697
 
763
698
  tokens
764
699
  end
@@ -785,26 +720,37 @@ class Regexp::Scanner
785
720
  def emit(type, token, text)
786
721
  #puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
787
722
 
788
- emit_literal if literal
723
+ emit_literal if literal_run
789
724
 
790
725
  # Ragel runs with byte-based indices (ts, te). These are of little value to
791
726
  # end-users, so we keep track of char-based indices and emit those instead.
792
727
  ts_char_pos = char_pos
793
728
  te_char_pos = char_pos + text.length
794
729
 
795
- if block
796
- block.call type, token, text, ts_char_pos, te_char_pos
797
- end
730
+ tok = [type, token, text, ts_char_pos, te_char_pos]
798
731
 
799
- tokens << [type, token, text, ts_char_pos, te_char_pos]
732
+ self.prev_token = tok
800
733
 
801
734
  self.char_pos = te_char_pos
735
+
736
+ if block
737
+ block.call type, token, text, ts_char_pos, te_char_pos
738
+ # TODO: in v3.0.0, remove `collect_tokens:` kwarg and only collect if no block given
739
+ tokens << tok if collect_tokens
740
+ elsif collect_tokens
741
+ tokens << tok
742
+ end
802
743
  end
803
744
 
745
+ attr_accessor :literal_run # only public for #||= to work on ruby <= 2.5
746
+
804
747
  private
805
748
 
806
- attr_accessor :tokens, :literal, :block, :free_spacing, :spacing_stack,
807
- :group_depth, :set_depth, :conditional_stack, :char_pos
749
+ attr_accessor :block,
750
+ :collect_tokens, :tokens, :prev_token,
751
+ :free_spacing, :spacing_stack,
752
+ :group_depth, :set_depth, :conditional_stack,
753
+ :char_pos
808
754
 
809
755
  def free_spacing?(input_object, options)
810
756
  if options && !input_object.is_a?(String)
@@ -834,14 +780,13 @@ class Regexp::Scanner
834
780
  # Appends one or more characters to the literal buffer, to be emitted later
835
781
  # by a call to emit_literal.
836
782
  def append_literal(data, ts, te)
837
- self.literal = literal || []
838
- literal << copy(data, ts, te)
783
+ (self.literal_run ||= []) << copy(data, ts, te)
839
784
  end
840
785
 
841
786
  # Emits the literal run collected by calls to the append_literal method.
842
787
  def emit_literal
843
- text = literal.join
844
- self.literal = nil
788
+ text = literal_run.join
789
+ self.literal_run = nil
845
790
  emit(:literal, :literal, text)
846
791
  end
847
792
 
@@ -876,24 +821,8 @@ class Regexp::Scanner
876
821
 
877
822
  def emit_meta_control_sequence(data, ts, te, token)
878
823
  if data.last < 0x00 || data.last > 0x7F
879
- validation_error(:sequence, 'escape', token.to_s)
824
+ raise ValidationError.for(:sequence, 'escape', token.to_s)
880
825
  end
881
826
  emit(:escape, token, copy(data, ts-1, te))
882
827
  end
883
-
884
- # Centralizes and unifies the handling of validation related
885
- # errors.
886
- def validation_error(type, what, reason = nil)
887
- error =
888
- case type
889
- when :backref then InvalidBackrefError.new(what, reason)
890
- when :group then InvalidGroupError.new(what, reason)
891
- when :group_option then InvalidGroupOption.new(what, reason)
892
- when :posix_class then UnknownPosixClassError.new(what)
893
- when :property then UnknownUnicodePropertyError.new(what)
894
- when :sequence then InvalidSequenceError.new(what, reason)
895
- end
896
-
897
- raise error # unless @@config.validation_ignore
898
- end
899
828
  end # module Regexp::Scanner