regexp_parser 1.5.0 → 1.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87):
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +59 -0
  3. data/Gemfile +3 -3
  4. data/README.md +14 -6
  5. data/Rakefile +3 -4
  6. data/lib/regexp_parser/expression.rb +6 -43
  7. data/lib/regexp_parser/expression/classes/conditional.rb +3 -2
  8. data/lib/regexp_parser/expression/classes/escape.rb +0 -4
  9. data/lib/regexp_parser/expression/methods/match.rb +13 -0
  10. data/lib/regexp_parser/expression/methods/match_length.rb +1 -1
  11. data/lib/regexp_parser/expression/methods/options.rb +35 -0
  12. data/lib/regexp_parser/expression/methods/strfregexp.rb +0 -1
  13. data/lib/regexp_parser/expression/methods/tests.rb +6 -15
  14. data/lib/regexp_parser/expression/methods/traverse.rb +3 -1
  15. data/lib/regexp_parser/expression/sequence.rb +3 -2
  16. data/lib/regexp_parser/expression/sequence_operation.rb +2 -6
  17. data/lib/regexp_parser/lexer.rb +4 -25
  18. data/lib/regexp_parser/parser.rb +40 -33
  19. data/lib/regexp_parser/scanner.rb +1208 -1353
  20. data/lib/regexp_parser/scanner/char_type.rl +0 -3
  21. data/lib/regexp_parser/scanner/properties/long.yml +15 -1
  22. data/lib/regexp_parser/scanner/properties/short.yml +5 -0
  23. data/lib/regexp_parser/scanner/scanner.rl +116 -202
  24. data/lib/regexp_parser/syntax/tokens/unicode_property.rb +30 -0
  25. data/lib/regexp_parser/syntax/versions/2.6.2.rb +10 -0
  26. data/lib/regexp_parser/syntax/versions/2.6.3.rb +10 -0
  27. data/lib/regexp_parser/version.rb +1 -1
  28. data/spec/expression/base_spec.rb +14 -0
  29. data/spec/expression/methods/match_length_spec.rb +20 -0
  30. data/spec/expression/methods/match_spec.rb +25 -0
  31. data/spec/expression/methods/tests_spec.rb +2 -0
  32. data/spec/expression/methods/traverse_spec.rb +21 -0
  33. data/spec/expression/options_spec.rb +128 -0
  34. data/spec/expression/root_spec.rb +9 -0
  35. data/spec/expression/sequence_spec.rb +9 -0
  36. data/spec/lexer/conditionals_spec.rb +49 -119
  37. data/spec/lexer/delimiters_spec.rb +68 -0
  38. data/spec/lexer/escapes_spec.rb +8 -32
  39. data/spec/lexer/keep_spec.rb +5 -17
  40. data/spec/lexer/literals_spec.rb +73 -110
  41. data/spec/lexer/nesting_spec.rb +86 -117
  42. data/spec/lexer/refcalls_spec.rb +51 -50
  43. data/spec/parser/all_spec.rb +13 -1
  44. data/spec/parser/anchors_spec.rb +9 -23
  45. data/spec/parser/conditionals_spec.rb +9 -9
  46. data/spec/parser/errors_spec.rb +22 -43
  47. data/spec/parser/escapes_spec.rb +33 -44
  48. data/spec/parser/free_space_spec.rb +25 -4
  49. data/spec/parser/groups_spec.rb +98 -257
  50. data/spec/parser/keep_spec.rb +2 -15
  51. data/spec/parser/options_spec.rb +28 -0
  52. data/spec/parser/posix_classes_spec.rb +5 -24
  53. data/spec/parser/properties_spec.rb +42 -54
  54. data/spec/parser/quantifiers_spec.rb +42 -283
  55. data/spec/parser/refcalls_spec.rb +60 -185
  56. data/spec/parser/set/intersections_spec.rb +17 -17
  57. data/spec/parser/set/ranges_spec.rb +17 -17
  58. data/spec/parser/sets_spec.rb +5 -5
  59. data/spec/parser/types_spec.rb +11 -36
  60. data/spec/scanner/anchors_spec.rb +13 -28
  61. data/spec/scanner/conditionals_spec.rb +121 -173
  62. data/spec/scanner/delimiters_spec.rb +52 -0
  63. data/spec/scanner/errors_spec.rb +64 -87
  64. data/spec/scanner/escapes_spec.rb +53 -50
  65. data/spec/scanner/free_space_spec.rb +102 -165
  66. data/spec/scanner/groups_spec.rb +45 -64
  67. data/spec/scanner/keep_spec.rb +5 -28
  68. data/spec/scanner/literals_spec.rb +45 -81
  69. data/spec/scanner/meta_spec.rb +13 -33
  70. data/spec/scanner/options_spec.rb +36 -0
  71. data/spec/scanner/properties_spec.rb +43 -286
  72. data/spec/scanner/quantifiers_spec.rb +13 -28
  73. data/spec/scanner/refcalls_spec.rb +32 -48
  74. data/spec/scanner/sets_spec.rb +88 -102
  75. data/spec/scanner/types_spec.rb +10 -25
  76. data/spec/spec_helper.rb +1 -0
  77. data/spec/support/shared_examples.rb +77 -0
  78. data/spec/syntax/syntax_spec.rb +4 -0
  79. data/spec/syntax/versions/1.8.6_spec.rb +12 -33
  80. data/spec/syntax/versions/1.9.1_spec.rb +5 -18
  81. data/spec/syntax/versions/1.9.3_spec.rb +4 -17
  82. data/spec/syntax/versions/2.0.0_spec.rb +8 -23
  83. data/spec/syntax/versions/2.2.0_spec.rb +4 -17
  84. data/spec/syntax/versions/aliases_spec.rb +27 -109
  85. metadata +28 -10
  86. data/spec/scanner/scripts_spec.rb +0 -49
  87. data/spec/scanner/unicode_blocks_spec.rb +0 -28
@@ -21,9 +21,6 @@
21
21
  when '\W'; emit(:type, :nonword, text, ts - 1, te)
22
22
  when '\R'; emit(:type, :linebreak, text, ts - 1, te)
23
23
  when '\X'; emit(:type, :xgrapheme, text, ts - 1, te)
24
- else
25
- raise ScannerError.new(
26
- "Unexpected character in type at #{text} (char #{ts})")
27
24
  end
28
25
  fret;
29
26
  };
@@ -6,6 +6,8 @@ adlam: adlam
6
6
  age=1.1: age=1.1
7
7
  age=10.0: age=10.0
8
8
  age=11.0: age=11.0
9
+ age=12.0: age=12.0
10
+ age=12.1: age=12.1
9
11
  age=2.0: age=2.0
10
12
  age=2.1: age=2.1
11
13
  age=3.0: age=3.0
@@ -64,7 +66,6 @@ changeswhenuppercased: changes_when_uppercased
64
66
  cherokee: cherokee
65
67
  closepunctuation: close_punctuation
66
68
  cntrl: cntrl
67
- combiningmark: combining_mark
68
69
  common: common
69
70
  connectorpunctuation: connector_punctuation
70
71
  control: control
@@ -86,6 +87,7 @@ dogra: dogra
86
87
  duployan: duployan
87
88
  egyptianhieroglyphs: egyptian_hieroglyphs
88
89
  elbasan: elbasan
90
+ elymaic: elymaic
89
91
  emoji: emoji
90
92
  emojicomponent: emoji_component
91
93
  emojimodifier: emoji_modifier
@@ -206,8 +208,10 @@ indogra: in_dogra
206
208
  indominotiles: in_domino_tiles
207
209
  induployan: in_duployan
208
210
  inearlydynasticcuneiform: in_early_dynastic_cuneiform
211
+ inegyptianhieroglyphformatcontrols: in_egyptian_hieroglyph_format_controls
209
212
  inegyptianhieroglyphs: in_egyptian_hieroglyphs
210
213
  inelbasan: in_elbasan
214
+ inelymaic: in_elymaic
211
215
  inemoticons: in_emoticons
212
216
  inenclosedalphanumerics: in_enclosed_alphanumerics
213
217
  inenclosedalphanumericsupplement: in_enclosed_alphanumeric_supplement
@@ -322,12 +326,14 @@ inmyanmar: in_myanmar
322
326
  inmyanmarextendeda: in_myanmar_extended_a
323
327
  inmyanmarextendedb: in_myanmar_extended_b
324
328
  innabataean: in_nabataean
329
+ innandinagari: in_nandinagari
325
330
  innewa: in_newa
326
331
  innewtailue: in_new_tai_lue
327
332
  innko: in_nko
328
333
  innoblock: in_no_block
329
334
  innumberforms: in_number_forms
330
335
  innushu: in_nushu
336
+ innyiakengpuachuehmong: in_nyiakeng_puachue_hmong
331
337
  inogham: in_ogham
332
338
  inolchiki: in_ol_chiki
333
339
  inoldhungarian: in_old_hungarian
@@ -343,6 +349,7 @@ inoriya: in_oriya
343
349
  inornamentaldingbats: in_ornamental_dingbats
344
350
  inosage: in_osage
345
351
  inosmanya: in_osmanya
352
+ inottomansiyaqnumbers: in_ottoman_siyaq_numbers
346
353
  inpahawhhmong: in_pahawh_hmong
347
354
  inpalmyrene: in_palmyrene
348
355
  inpaucinhau: in_pau_cin_hau
@@ -368,6 +375,7 @@ insiddham: in_siddham
368
375
  insinhala: in_sinhala
369
376
  insinhalaarchaicnumbers: in_sinhala_archaic_numbers
370
377
  insmallformvariants: in_small_form_variants
378
+ insmallkanaextension: in_small_kana_extension
371
379
  insogdian: in_sogdian
372
380
  insorasompeng: in_sora_sompeng
373
381
  insoyombo: in_soyombo
@@ -386,6 +394,7 @@ insupplementaryprivateuseareaa: in_supplementary_private_use_area_a
386
394
  insupplementaryprivateuseareab: in_supplementary_private_use_area_b
387
395
  insuttonsignwriting: in_sutton_signwriting
388
396
  insylotinagri: in_syloti_nagri
397
+ insymbolsandpictographsextendeda: in_symbols_and_pictographs_extended_a
389
398
  insyriac: in_syriac
390
399
  insyriacsupplement: in_syriac_supplement
391
400
  intagalog: in_tagalog
@@ -397,6 +406,7 @@ intaiviet: in_tai_viet
397
406
  intaixuanjingsymbols: in_tai_xuan_jing_symbols
398
407
  intakri: in_takri
399
408
  intamil: in_tamil
409
+ intamilsupplement: in_tamil_supplement
400
410
  intangut: in_tangut
401
411
  intangutcomponents: in_tangut_components
402
412
  intelugu: in_telugu
@@ -414,6 +424,7 @@ invariationselectors: in_variation_selectors
414
424
  invariationselectorssupplement: in_variation_selectors_supplement
415
425
  invedicextensions: in_vedic_extensions
416
426
  inverticalforms: in_vertical_forms
427
+ inwancho: in_wancho
417
428
  inwarangciti: in_warang_citi
418
429
  inyijinghexagramsymbols: in_yijing_hexagram_symbols
419
430
  inyiradicals: in_yi_radicals
@@ -469,6 +480,7 @@ mro: mro
469
480
  multani: multani
470
481
  myanmar: myanmar
471
482
  nabataean: nabataean
483
+ nandinagari: nandinagari
472
484
  newa: newa
473
485
  newline: newline
474
486
  newtailue: new_tai_lue
@@ -477,6 +489,7 @@ noncharactercodepoint: noncharacter_code_point
477
489
  nonspacingmark: nonspacing_mark
478
490
  number: number
479
491
  nushu: nushu
492
+ nyiakengpuachuehmong: nyiakeng_puachue_hmong
480
493
  ogham: ogham
481
494
  olchiki: ol_chiki
482
495
  oldhungarian: old_hungarian
@@ -569,6 +582,7 @@ uppercase: uppercase
569
582
  uppercaseletter: uppercase_letter
570
583
  vai: vai
571
584
  variationselector: variation_selector
585
+ wancho: wancho
572
586
  warangciti: warang_citi
573
587
  whitespace: white_space
574
588
  word: word
@@ -31,6 +31,7 @@ cher: cherokee
31
31
  ci: case_ignorable
32
32
  cn: unassigned
33
33
  co: private_use
34
+ combiningmark: mark
34
35
  copt: coptic
35
36
  cprt: cypriot
36
37
  cs: surrogate
@@ -49,6 +50,7 @@ dsrt: deseret
49
50
  dupl: duployan
50
51
  egyp: egyptian_hieroglyphs
51
52
  elba: elbasan
53
+ elym: elymaic
52
54
  ethi: ethiopic
53
55
  ext: extender
54
56
  geor: georgian
@@ -72,6 +74,7 @@ hex: hex_digit
72
74
  hira: hiragana
73
75
  hluw: anatolian_hieroglyphs
74
76
  hmng: pahawh_hmong
77
+ hmnp: nyiakeng_puachue_hmong
75
78
  hung: old_hungarian
76
79
  idc: id_continue
77
80
  ideo: ideographic
@@ -125,6 +128,7 @@ mtei: meetei_mayek
125
128
  mult: multani
126
129
  mymr: myanmar
127
130
  n: number
131
+ nand: nandinagari
128
132
  narb: old_north_arabian
129
133
  nbat: nabataean
130
134
  nchar: noncharacter_code_point
@@ -216,6 +220,7 @@ uideo: unified_ideograph
216
220
  vaii: vai
217
221
  vs: variation_selector
218
222
  wara: warang_citi
223
+ wcho: wancho
219
224
  wspace: white_space
220
225
  xidc: xid_continue
221
226
  xids: xid_start
@@ -21,7 +21,7 @@
21
21
  set_close = ']';
22
22
  brackets = set_open | set_close;
23
23
 
24
- comment = ('#' . [^\n]* . '\n');
24
+ comment = ('#' . [^\n]* . '\n'?);
25
25
 
26
26
  class_name_posix = 'alnum' | 'alpha' | 'blank' |
27
27
  'cntrl' | 'digit' | 'graph' |
@@ -49,9 +49,9 @@
49
49
  codepoint_list = 'u{' . xdigit{1,6} . (space . xdigit{1,6})* . '}';
50
50
  codepoint_sequence = codepoint_single | codepoint_list;
51
51
 
52
- control_sequence = ('c' | 'C-') . (backslash . 'M-')?;
52
+ control_sequence = ('c' | 'C-') . (backslash . 'M-')? . backslash? . any;
53
53
 
54
- meta_sequence = 'M-' . (backslash . control_sequence)?;
54
+ meta_sequence = 'M-' . (backslash . ('c' | 'C-'))? . backslash? . any;
55
55
 
56
56
  zero_or_one = '?' | '??' | '?+';
57
57
  zero_or_more = '*' | '*?' | '*+';
@@ -62,13 +62,17 @@
62
62
  quantifier_possessive = '?+' | '*+' | '++';
63
63
  quantifier_mode = '?' | '+';
64
64
 
65
- quantifier_interval = range_open . (digit+)? . ','? . (digit+)? .
66
- range_close . quantifier_mode?;
65
+ quantity_exact = (digit+);
66
+ quantity_minimum = (digit+) . ',';
67
+ quantity_maximum = ',' . (digit+);
68
+ quantity_range = (digit+) . ',' . (digit+);
69
+ quantifier_interval = range_open . ( quantity_exact | quantity_minimum |
70
+ quantity_maximum | quantity_range ) . range_close .
71
+ quantifier_mode?;
67
72
 
68
73
  quantifiers = quantifier_greedy | quantifier_reluctant |
69
74
  quantifier_possessive | quantifier_interval;
70
75
 
71
-
72
76
  conditional = '(?(';
73
77
 
74
78
  group_comment = '?#' . [^)]* . group_close;
@@ -82,7 +86,8 @@
82
86
  assertion_lookbehind = '?<=';
83
87
  assertion_nlookbehind = '?<!';
84
88
 
85
- group_options = '?' . [\-mixdau];
89
+ # try to treat every other group head as options group, like Ruby
90
+ group_options = '?' . ( [^!#'():<=>~]+ . ':'? ) ?;
86
91
 
87
92
  group_ref = [gk];
88
93
  group_name_char = (alnum | '_');
@@ -113,7 +118,9 @@
113
118
  curlies | parantheses | brackets |
114
119
  line_anchor | quantifier_greedy;
115
120
 
116
- ascii_print = ((0x20..0x7e) - meta_char);
121
+ literal_delimiters = ']' | '}';
122
+
123
+ ascii_print = ((0x20..0x7e) - meta_char - '#');
117
124
  ascii_nonprint = (0x01..0x1f | 0x7f);
118
125
 
119
126
  utf8_2_byte = (0xc2..0xdf 0x80..0xbf);
@@ -121,7 +128,7 @@
121
128
  utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
122
129
 
123
130
  non_literal_escape = char_type_char | anchor_char | escaped_ascii |
124
- group_ref | keep_mark | [xucCM];
131
+ keep_mark | [xucCM];
125
132
 
126
133
  non_set_escape = (anchor_char - 'b') | group_ref | keep_mark |
127
134
  multi_codepoint_char_type | [0-9cCM];
@@ -135,41 +142,35 @@
135
142
  # Invalid sequence error, used from sequences, like escapes and sets
136
143
  action invalid_sequence_error {
137
144
  text = ts ? copy(data, ts-1..-1) : data.pack('c*')
138
- raise InvalidSequenceError.new('sequence', text)
145
+ validation_error(:sequence, 'sequence', text)
139
146
  }
140
147
 
141
148
  # group (nesting) and set open/close actions
142
- action group_opened { self.group_depth = group_depth + 1; in_group = true }
143
- action group_closed { self.group_depth = group_depth - 1; in_group = group_depth > 0 ? true : false }
149
+ action group_opened { self.group_depth = group_depth + 1 }
150
+ action group_closed { self.group_depth = group_depth - 1 }
151
+ action set_opened { self.set_depth = set_depth + 1 }
152
+ action set_closed { self.set_depth = set_depth - 1 }
144
153
 
145
154
  # Character set scanner, continues consuming characters until it meets the
146
155
  # closing bracket of the set.
147
156
  # --------------------------------------------------------------------------
148
157
  character_set := |*
149
- set_close > (set_meta, 2) {
150
- set_depth -= 1
151
- in_set = set_depth > 0 ? true : false
152
-
158
+ set_close > (set_meta, 2) @set_closed {
153
159
  emit(:set, :close, *text(data, ts, te))
154
-
155
- if set_depth == 0
156
- fgoto main;
157
- else
160
+ if in_set?
158
161
  fret;
162
+ else
163
+ fgoto main;
159
164
  end
160
165
  };
161
166
 
162
- '-]' { # special case, emits two tokens
163
- set_depth -= 1
164
- in_set = set_depth > 0 ? true : false
165
-
166
- emit(:literal, :literal, copy(data, ts..te-2), ts, te)
167
- emit(:set, :close, copy(data, ts+1..te-1), ts, te)
168
-
169
- if set_depth == 0
170
- fgoto main;
171
- else
167
+ '-]' @set_closed { # special case, emits two tokens
168
+ emit(:literal, :literal, copy(data, ts..te-2), ts, te - 1)
169
+ emit(:set, :close, copy(data, ts+1..te-1), ts + 1, te)
170
+ if in_set?
172
171
  fret;
172
+ else
173
+ fgoto main;
173
174
  end
174
175
  };
175
176
 
@@ -207,14 +208,12 @@
207
208
  fcall set_escape_sequence;
208
209
  };
209
210
 
210
- set_open >(open_bracket, 1) {
211
- set_depth += 1
212
-
211
+ set_open >(open_bracket, 1) >set_opened {
213
212
  emit(:set, :open, *text(data, ts, te))
214
213
  fcall character_set;
215
214
  };
216
215
 
217
- class_posix >(open_bracket, 1) @eof(premature_end_error) {
216
+ class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error) {
218
217
  text = text(data, ts, te).first
219
218
 
220
219
  type = :posixclass
@@ -227,11 +226,11 @@
227
226
  emit(type, class_name.to_sym, text, ts, te)
228
227
  };
229
228
 
230
- collating_sequence >(open_bracket, 1) @eof(premature_end_error) {
229
+ collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
231
230
  emit(:set, :collation, *text(data, ts, te))
232
231
  };
233
232
 
234
- character_equivalent >(open_bracket, 1) @eof(premature_end_error) {
233
+ character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
235
234
  emit(:set, :equivalent, *text(data, ts, te))
236
235
  };
237
236
 
@@ -337,44 +336,24 @@
337
336
  };
338
337
 
339
338
  control_sequence >(escaped_alpha, 4) $eof(premature_end_error) {
340
- if data[te]
341
- c = data[te].chr
342
- if c =~ /[\x00-\x7F]/
343
- emit(:escape, :control, copy(data, ts-1..te), ts-1, te+1)
344
- p += 1
345
- else
346
- raise InvalidSequenceError.new("control sequence")
347
- end
348
- else
349
- raise PrematureEndError.new("control sequence")
350
- end
339
+ emit_meta_control_sequence(data, ts, te, :control)
351
340
  fret;
352
341
  };
353
342
 
354
343
  meta_sequence >(backslashed, 3) $eof(premature_end_error) {
355
- if data[te]
356
- c = data[te].chr
357
- if c =~ /[\x00-\x7F]/
358
- emit(:escape, :meta_sequence, copy(data, ts-1..te), ts-1, te+1)
359
- p += 1
360
- else
361
- raise InvalidSequenceError.new("meta sequence")
362
- end
363
- else
364
- raise PrematureEndError.new("meta sequence")
365
- end
344
+ emit_meta_control_sequence(data, ts, te, :meta_sequence)
366
345
  fret;
367
346
  };
368
347
 
369
348
  char_type_char > (escaped_alpha, 2) {
370
349
  fhold;
371
- fnext *(in_set ? fentry(character_set) : fentry(main));
350
+ fnext *(in_set? ? fentry(character_set) : fentry(main));
372
351
  fcall char_type;
373
352
  };
374
353
 
375
354
  property_char > (escaped_alpha, 2) {
376
355
  fhold;
377
- fnext *(in_set ? fentry(character_set) : fentry(main));
356
+ fnext *(in_set? ? fentry(character_set) : fentry(main));
378
357
  fcall unicode_property;
379
358
  };
380
359
 
@@ -412,8 +391,7 @@
412
391
  };
413
392
 
414
393
  alternation {
415
- if in_conditional and conditional_stack.length > 0 and
416
- conditional_stack.last[1] == group_depth
394
+ if conditional_stack.last == group_depth
417
395
  emit(:conditional, :separator, *text(data, ts, te))
418
396
  else
419
397
  emit(:meta, :alternation, *text(data, ts, te))
@@ -442,18 +420,16 @@
442
420
  when '\\b'; emit(:anchor, :word_boundary, text, ts, te)
443
421
  when '\\B'; emit(:anchor, :nonword_boundary, text, ts, te)
444
422
  when '\\G'; emit(:anchor, :match_start, text, ts, te)
445
- else
446
- raise ScannerError.new(
447
- "Unexpected character in anchor at #{text} (char #{ts})")
448
423
  end
449
424
  };
450
425
 
426
+ literal_delimiters {
427
+ append_literal(data, ts, te)
428
+ };
429
+
451
430
  # Character sets
452
431
  # ------------------------------------------------------------------------
453
- set_open {
454
- set_depth += 1
455
- in_set = true
456
-
432
+ set_open >set_opened {
457
433
  emit(:set, :open, *text(data, ts, te))
458
434
  fcall character_set;
459
435
  };
@@ -465,9 +441,7 @@
465
441
  conditional {
466
442
  text = text(data, ts, te).first
467
443
 
468
- in_conditional = true unless in_conditional
469
- conditional_depth += 1
470
- conditional_stack << [conditional_depth, group_depth]
444
+ conditional_stack << group_depth
471
445
 
472
446
  emit(:conditional, :open, text[0..-2], ts, te-1)
473
447
  emit(:conditional, :condition_open, '(', te-1, te)
@@ -496,7 +470,11 @@
496
470
  # (?imxdau-imx:subexp) option on/off for subexp
497
471
  # ------------------------------------------------------------------------
498
472
  group_open . group_options >group_opened {
499
- p = scan_options(p, data, ts, te)
473
+ text = text(data, ts, te).first
474
+ if text[2..-1] =~ /([^\-mixdau:]|^$)|-.*([dau])/
475
+ raise InvalidGroupOption.new($1 || "-#{$2}", text)
476
+ end
477
+ emit_options(text, ts, te)
500
478
  };
501
479
 
502
480
  # Assertions
@@ -528,19 +506,15 @@
528
506
  when '(?>'; emit(:group, :atomic, text, ts, te)
529
507
  when '(?~'; emit(:group, :absence, text, ts, te)
530
508
 
531
- when /^\(\?<(\w*)>/
532
- empty_name_error(:group, 'named group (ab)') if $1.empty?
509
+ when /^\(\?(?:<>|'')/
510
+ validation_error(:group, 'named group', 'name is empty')
533
511
 
512
+ when /^\(\?<\w*>/
534
513
  emit(:group, :named_ab, text, ts, te)
535
514
 
536
- when /^\(\?'(\w*)'/
537
- empty_name_error(:group, 'named group (sq)') if $1.empty?
538
-
515
+ when /^\(\?'\w*'/
539
516
  emit(:group, :named_sq, text, ts, te)
540
517
 
541
- else
542
- raise ScannerError.new(
543
- "Unknown subexpression group format '#{text}'")
544
518
  end
545
519
  };
546
520
 
@@ -550,20 +524,13 @@
550
524
  };
551
525
 
552
526
  group_close @group_closed {
553
- if in_conditional and conditional_stack.last and
554
- conditional_stack.last[1] == (group_depth + 1)
555
-
556
- emit(:conditional, :close, *text(data, ts, te))
527
+ if conditional_stack.last == group_depth + 1
557
528
  conditional_stack.pop
558
-
559
- if conditional_stack.length == 0
560
- in_conditional = false
561
- end
529
+ emit(:conditional, :close, *text(data, ts, te))
562
530
  else
563
- if spacing_stack.length > 1 and
564
- spacing_stack.last[:depth] == (group_depth + 1)
531
+ if spacing_stack.length > 1 &&
532
+ spacing_stack.last[:depth] == group_depth + 1
565
533
  spacing_stack.pop
566
-
567
534
  self.free_spacing = spacing_stack.last[:free_spacing]
568
535
  end
569
536
 
@@ -576,11 +543,8 @@
576
543
  # ------------------------------------------------------------------------
577
544
  backslash . (group_name_ref | group_number_ref) > (backslashed, 4) {
578
545
  case text = text(data, ts, te).first
579
- when /^\\([gk])<>/ # angle brackets
580
- empty_backref_error("ref/call (ab)")
581
-
582
- when /^\\([gk])''/ # single quotes
583
- empty_backref_error("ref/call (sq)")
546
+ when /^\\([gk])(<>|'')/ # angle brackets
547
+ validation_error(:backref, 'ref/call', 'ref ID is empty')
584
548
 
585
549
  when /^\\([gk])<[^\d+-]\w*>/ # angle-brackets
586
550
  if $1 == 'k'
@@ -636,9 +600,6 @@
636
600
  when /^\\([gk])'[+\-]?\d+[+\-]\d+'/ # single-quotes
637
601
  emit(:backref, :number_recursion_ref_sq, text, ts, te)
638
602
 
639
- else
640
- raise ScannerError.new(
641
- "Unknown backreference format '#{text}'")
642
603
  end
643
604
  };
644
605
 
@@ -669,10 +630,15 @@
669
630
  end
670
631
  };
671
632
 
672
- quantifier_interval @err(premature_end_error) {
633
+ quantifier_interval {
673
634
  emit(:quantifier, :interval, *text(data, ts, te))
674
635
  };
675
636
 
637
+ # Catch unmatched curly braces as literals
638
+ range_open {
639
+ append_literal(data, ts, te)
640
+ };
641
+
676
642
  # Escaped sequences
677
643
  # ------------------------------------------------------------------------
678
644
  backslash > (backslashed, 1) {
@@ -771,22 +737,17 @@ class Regexp::Scanner
771
737
  #
772
738
  # This method may raise errors if a syntax error is encountered.
773
739
  # --------------------------------------------------------------------------
774
- def self.scan(input_object, &block)
775
- new.scan(input_object, &block)
740
+ def self.scan(input_object, options: nil, &block)
741
+ new.scan(input_object, options: options, &block)
776
742
  end
777
743
 
778
- def scan(input_object, &block)
744
+ def scan(input_object, options: nil, &block)
779
745
  self.literal = nil
780
746
  stack = []
781
747
 
782
- if input_object.is_a?(Regexp)
783
- input = input_object.source
784
- self.free_spacing = (input_object.options & Regexp::EXTENDED != 0)
785
- else
786
- input = input_object
787
- self.free_spacing = false
788
- end
789
-
748
+ input = input_object.is_a?(Regexp) ? input_object.source : input_object
749
+ self.free_spacing = free_spacing?(input_object, options)
750
+ self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
790
751
 
791
752
  data = input.unpack("c*") if input.is_a?(String)
792
753
  eof = data.length
@@ -794,15 +755,9 @@ class Regexp::Scanner
794
755
  self.tokens = []
795
756
  self.block = block_given? ? block : nil
796
757
 
797
- self.in_group = false
758
+ self.set_depth = 0
798
759
  self.group_depth = 0
799
- self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
800
-
801
- in_set = false
802
- set_depth = 0
803
- in_conditional = false
804
- conditional_depth = 0
805
- conditional_stack = []
760
+ self.conditional_stack = []
806
761
 
807
762
  %% write data;
808
763
  %% write init;
@@ -817,9 +772,9 @@ class Regexp::Scanner
817
772
  end
818
773
 
819
774
  raise PrematureEndError.new("(missing group closing paranthesis) "+
820
- "[#{in_group}:#{group_depth}]") if in_group
775
+ "[#{group_depth}]") if in_group?
821
776
  raise PrematureEndError.new("(missing set closing bracket) "+
822
- "[#{in_set}:#{set_depth}]") if in_set
777
+ "[#{set_depth}]") if in_set?
823
778
 
824
779
  # when the entire expression is a literal run
825
780
  emit_literal if literal
@@ -854,62 +809,27 @@ class Regexp::Scanner
854
809
 
855
810
  private
856
811
 
857
- attr_accessor :tokens, :literal, :block,
858
- :in_group, :group_depth,
859
- :free_spacing, :spacing_stack
860
-
861
- # Ragel's regex-based scan of the group options introduced a lot of
862
- # ambiguity, so we just ask it to find the beginning of what looks
863
- # like an options run and handle the rest in here.
864
- def scan_options(p, data, ts, te)
865
- text = text(data, ts, te).first
866
-
867
- options_char, options_length = true, 0
812
+ attr_accessor :tokens, :literal, :block, :free_spacing, :spacing_stack,
813
+ :group_depth, :set_depth, :conditional_stack
868
814
 
869
- # Copy while we have option characters. There is no maximum length,
870
- # as ruby allows things like '(?xxxxxxxxx-xxxxxxxxxxxxx:abc)'.
871
- negative_options = false
872
- while options_char
873
- if data[te + options_length]
874
- c = data[te + options_length].chr
875
-
876
- if c =~ /[-mixdau]/
877
- negative_options = true if c == '-'
878
-
879
- raise InvalidGroupOption.new(c, text) if negative_options and
880
- c =~ /[dau]/
881
-
882
- text << c ; p += 1 ; options_length += 1
883
- else
884
- options_char = false
885
- end
886
- else
887
- raise PrematureEndError.new("expression options `#{text}'")
888
- end
815
+ def free_spacing?(input_object, options)
816
+ if options && !input_object.is_a?(String)
817
+ raise ArgumentError, 'options cannot be supplied unless scanning a String'
889
818
  end
890
819
 
891
- if data[te + options_length]
892
- c = data[te + options_length].chr
820
+ options = input_object.options if input_object.is_a?(::Regexp)
893
821
 
894
- if c == ':'
895
- # Include the ':' in the options text
896
- text << c ; p += 1 ; options_length += 1
897
- emit_options(text, ts, te + options_length)
822
+ return false unless options
898
823
 
899
- elsif c == ')'
900
- # Don't include the closing ')', let group_close handle it.
901
- emit_options(text, ts, te + options_length)
824
+ options & Regexp::EXTENDED != 0
825
+ end
902
826
 
903
- else
904
- # Plain Regexp reports this as 'undefined group option'
905
- raise ScannerError.new(
906
- "Unexpected `#{c}' in options sequence, ':' or ')' expected")
907
- end
908
- else
909
- raise PrematureEndError.new("expression options `#{text}'")
910
- end
827
+ def in_group?
828
+ group_depth > 0
829
+ end
911
830
 
912
- p # return the new value of the data pointer
831
+ def in_set?
832
+ set_depth > 0
913
833
  end
914
834
 
915
835
  # Copy from ts to te from data as text
@@ -945,32 +865,39 @@ class Regexp::Scanner
945
865
  def emit_options(text, ts, te)
946
866
  token = nil
947
867
 
948
- if text =~ /\(\?([mixdau]*)-?([mix]*)(:)?/
949
- positive, negative, group_local = $1, $2, $3
868
+ # Ruby allows things like '(?-xxxx)' or '(?xx-xx--xx-:abc)'.
869
+ text =~ /\(\?([mixdau]*)(-(?:[mix]*))*(:)?/
870
+ positive, negative, group_local = $1, $2, $3
950
871
 
951
- if positive.include?('x')
952
- self.free_spacing = true
953
- end
872
+ if positive.include?('x')
873
+ self.free_spacing = true
874
+ end
954
875
 
955
- # If the x appears in both, treat it like ruby does, the second cancels
956
- # the first.
957
- if negative.include?('x')
958
- self.free_spacing = false
959
- end
876
+ # If the x appears in both, treat it like ruby does, the second cancels
877
+ # the first.
878
+ if negative && negative.include?('x')
879
+ self.free_spacing = false
880
+ end
960
881
 
961
- if group_local
962
- spacing_stack << {:free_spacing => free_spacing, :depth => group_depth}
963
- token = :options
964
- else
965
- # switch for parent group level
966
- spacing_stack.last[:free_spacing] = free_spacing
967
- token = :options_switch
968
- end
882
+ if group_local
883
+ spacing_stack << {:free_spacing => free_spacing, :depth => group_depth}
884
+ token = :options
885
+ else
886
+ # switch for parent group level
887
+ spacing_stack.last[:free_spacing] = free_spacing
888
+ token = :options_switch
969
889
  end
970
890
 
971
891
  emit(:group, token, text, ts, te)
972
892
  end
973
893
 
894
+ def emit_meta_control_sequence(data, ts, te, token)
895
+ if data.last < 0x00 || data.last > 0x7F
896
+ validation_error(:sequence, 'escape', token.to_s)
897
+ end
898
+ emit(:escape, token, *text(data, ts, te, 1))
899
+ end
900
+
974
901
  # Centralizes and unifies the handling of validation related
975
902
  # errors.
976
903
  def validation_error(type, what, reason)
@@ -981,21 +908,8 @@ class Regexp::Scanner
981
908
  error = InvalidBackrefError.new(what, reason)
982
909
  when :sequence
983
910
  error = InvalidSequenceError.new(what, reason)
984
- else
985
- error = ValidationError.new('expression')
986
911
  end
987
912
 
988
913
  raise error # unless @@config.validation_ignore
989
914
  end
990
-
991
- # Used for references with an empty name or number
992
- def empty_backref_error(type, what)
993
- validation_error(:backref, what, 'ref ID is empty')
994
- end
995
-
996
- # Used for named expressions with an empty name
997
- def empty_name_error(type, what)
998
- validation_error(type, what, 'name is empty')
999
- end
1000
-
1001
915
  end # module Regexp::Scanner