regexp_parser 1.4.0 → 1.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (171)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +66 -1
  3. data/Gemfile +3 -3
  4. data/README.md +11 -18
  5. data/Rakefile +3 -4
  6. data/lib/regexp_parser/expression.rb +28 -53
  7. data/lib/regexp_parser/expression/classes/backref.rb +18 -10
  8. data/lib/regexp_parser/expression/classes/conditional.rb +7 -2
  9. data/lib/regexp_parser/expression/classes/escape.rb +0 -4
  10. data/lib/regexp_parser/expression/classes/group.rb +4 -2
  11. data/lib/regexp_parser/expression/classes/keep.rb +1 -3
  12. data/lib/regexp_parser/expression/methods/match.rb +13 -0
  13. data/lib/regexp_parser/expression/methods/match_length.rb +172 -0
  14. data/lib/regexp_parser/expression/methods/options.rb +35 -0
  15. data/lib/regexp_parser/expression/methods/strfregexp.rb +0 -1
  16. data/lib/regexp_parser/expression/methods/tests.rb +6 -15
  17. data/lib/regexp_parser/expression/methods/traverse.rb +3 -1
  18. data/lib/regexp_parser/expression/quantifier.rb +2 -2
  19. data/lib/regexp_parser/expression/sequence.rb +3 -6
  20. data/lib/regexp_parser/expression/sequence_operation.rb +2 -6
  21. data/lib/regexp_parser/expression/subexpression.rb +3 -5
  22. data/lib/regexp_parser/lexer.rb +30 -44
  23. data/lib/regexp_parser/parser.rb +47 -24
  24. data/lib/regexp_parser/scanner.rb +1228 -1367
  25. data/lib/regexp_parser/scanner/char_type.rl +0 -3
  26. data/lib/regexp_parser/scanner/properties/long.yml +15 -1
  27. data/lib/regexp_parser/scanner/properties/short.yml +5 -0
  28. data/lib/regexp_parser/scanner/scanner.rl +101 -194
  29. data/lib/regexp_parser/syntax/tokens.rb +2 -10
  30. data/lib/regexp_parser/syntax/tokens/unicode_property.rb +30 -0
  31. data/lib/regexp_parser/syntax/versions/2.6.2.rb +10 -0
  32. data/lib/regexp_parser/syntax/versions/2.6.3.rb +10 -0
  33. data/lib/regexp_parser/version.rb +1 -1
  34. data/regexp_parser.gemspec +2 -2
  35. data/spec/expression/base_spec.rb +94 -0
  36. data/spec/expression/clone_spec.rb +120 -0
  37. data/spec/expression/conditional_spec.rb +89 -0
  38. data/spec/expression/free_space_spec.rb +27 -0
  39. data/spec/expression/methods/match_length_spec.rb +161 -0
  40. data/spec/expression/methods/match_spec.rb +25 -0
  41. data/spec/expression/methods/strfregexp_spec.rb +224 -0
  42. data/spec/expression/methods/tests_spec.rb +99 -0
  43. data/spec/expression/methods/traverse_spec.rb +161 -0
  44. data/spec/expression/options_spec.rb +128 -0
  45. data/spec/expression/root_spec.rb +9 -0
  46. data/spec/expression/sequence_spec.rb +9 -0
  47. data/spec/expression/subexpression_spec.rb +50 -0
  48. data/spec/expression/to_h_spec.rb +26 -0
  49. data/spec/expression/to_s_spec.rb +100 -0
  50. data/spec/lexer/all_spec.rb +22 -0
  51. data/spec/lexer/conditionals_spec.rb +53 -0
  52. data/spec/lexer/delimiters_spec.rb +68 -0
  53. data/spec/lexer/escapes_spec.rb +14 -0
  54. data/spec/lexer/keep_spec.rb +10 -0
  55. data/spec/lexer/literals_spec.rb +89 -0
  56. data/spec/lexer/nesting_spec.rb +99 -0
  57. data/spec/lexer/refcalls_spec.rb +55 -0
  58. data/spec/parser/all_spec.rb +43 -0
  59. data/spec/parser/alternation_spec.rb +88 -0
  60. data/spec/parser/anchors_spec.rb +17 -0
  61. data/spec/parser/conditionals_spec.rb +179 -0
  62. data/spec/parser/errors_spec.rb +30 -0
  63. data/spec/parser/escapes_spec.rb +121 -0
  64. data/spec/parser/free_space_spec.rb +130 -0
  65. data/spec/parser/groups_spec.rb +108 -0
  66. data/spec/parser/keep_spec.rb +6 -0
  67. data/spec/parser/posix_classes_spec.rb +8 -0
  68. data/spec/parser/properties_spec.rb +115 -0
  69. data/spec/parser/quantifiers_spec.rb +52 -0
  70. data/spec/parser/refcalls_spec.rb +112 -0
  71. data/spec/parser/set/intersections_spec.rb +127 -0
  72. data/spec/parser/set/ranges_spec.rb +111 -0
  73. data/spec/parser/sets_spec.rb +178 -0
  74. data/spec/parser/types_spec.rb +18 -0
  75. data/spec/scanner/all_spec.rb +18 -0
  76. data/spec/scanner/anchors_spec.rb +21 -0
  77. data/spec/scanner/conditionals_spec.rb +128 -0
  78. data/spec/scanner/delimiters_spec.rb +52 -0
  79. data/spec/scanner/errors_spec.rb +67 -0
  80. data/spec/scanner/escapes_spec.rb +53 -0
  81. data/spec/scanner/free_space_spec.rb +133 -0
  82. data/spec/scanner/groups_spec.rb +52 -0
  83. data/spec/scanner/keep_spec.rb +10 -0
  84. data/spec/scanner/literals_spec.rb +49 -0
  85. data/spec/scanner/meta_spec.rb +18 -0
  86. data/spec/scanner/properties_spec.rb +64 -0
  87. data/spec/scanner/quantifiers_spec.rb +20 -0
  88. data/spec/scanner/refcalls_spec.rb +36 -0
  89. data/spec/scanner/sets_spec.rb +102 -0
  90. data/spec/scanner/types_spec.rb +14 -0
  91. data/spec/spec_helper.rb +15 -0
  92. data/{test → spec}/support/runner.rb +9 -8
  93. data/spec/support/shared_examples.rb +77 -0
  94. data/{test → spec}/support/warning_extractor.rb +5 -7
  95. data/spec/syntax/syntax_spec.rb +48 -0
  96. data/spec/syntax/syntax_token_map_spec.rb +23 -0
  97. data/spec/syntax/versions/1.8.6_spec.rb +17 -0
  98. data/spec/syntax/versions/1.9.1_spec.rb +10 -0
  99. data/spec/syntax/versions/1.9.3_spec.rb +9 -0
  100. data/spec/syntax/versions/2.0.0_spec.rb +13 -0
  101. data/spec/syntax/versions/2.2.0_spec.rb +9 -0
  102. data/spec/syntax/versions/aliases_spec.rb +37 -0
  103. data/spec/token/token_spec.rb +85 -0
  104. metadata +149 -144
  105. data/test/expression/test_all.rb +0 -12
  106. data/test/expression/test_base.rb +0 -90
  107. data/test/expression/test_clone.rb +0 -89
  108. data/test/expression/test_conditionals.rb +0 -113
  109. data/test/expression/test_free_space.rb +0 -35
  110. data/test/expression/test_set.rb +0 -84
  111. data/test/expression/test_strfregexp.rb +0 -230
  112. data/test/expression/test_subexpression.rb +0 -58
  113. data/test/expression/test_tests.rb +0 -99
  114. data/test/expression/test_to_h.rb +0 -59
  115. data/test/expression/test_to_s.rb +0 -104
  116. data/test/expression/test_traverse.rb +0 -161
  117. data/test/helpers.rb +0 -10
  118. data/test/lexer/test_all.rb +0 -41
  119. data/test/lexer/test_conditionals.rb +0 -127
  120. data/test/lexer/test_keep.rb +0 -24
  121. data/test/lexer/test_literals.rb +0 -130
  122. data/test/lexer/test_nesting.rb +0 -132
  123. data/test/lexer/test_refcalls.rb +0 -56
  124. data/test/parser/set/test_intersections.rb +0 -127
  125. data/test/parser/set/test_ranges.rb +0 -111
  126. data/test/parser/test_all.rb +0 -64
  127. data/test/parser/test_alternation.rb +0 -92
  128. data/test/parser/test_anchors.rb +0 -34
  129. data/test/parser/test_conditionals.rb +0 -187
  130. data/test/parser/test_errors.rb +0 -63
  131. data/test/parser/test_escapes.rb +0 -134
  132. data/test/parser/test_free_space.rb +0 -139
  133. data/test/parser/test_groups.rb +0 -289
  134. data/test/parser/test_keep.rb +0 -21
  135. data/test/parser/test_posix_classes.rb +0 -27
  136. data/test/parser/test_properties.rb +0 -134
  137. data/test/parser/test_quantifiers.rb +0 -301
  138. data/test/parser/test_refcalls.rb +0 -186
  139. data/test/parser/test_sets.rb +0 -179
  140. data/test/parser/test_types.rb +0 -50
  141. data/test/scanner/test_all.rb +0 -38
  142. data/test/scanner/test_anchors.rb +0 -38
  143. data/test/scanner/test_conditionals.rb +0 -184
  144. data/test/scanner/test_errors.rb +0 -91
  145. data/test/scanner/test_escapes.rb +0 -56
  146. data/test/scanner/test_free_space.rb +0 -200
  147. data/test/scanner/test_groups.rb +0 -79
  148. data/test/scanner/test_keep.rb +0 -35
  149. data/test/scanner/test_literals.rb +0 -89
  150. data/test/scanner/test_meta.rb +0 -40
  151. data/test/scanner/test_properties.rb +0 -312
  152. data/test/scanner/test_quantifiers.rb +0 -37
  153. data/test/scanner/test_refcalls.rb +0 -52
  154. data/test/scanner/test_scripts.rb +0 -53
  155. data/test/scanner/test_sets.rb +0 -119
  156. data/test/scanner/test_types.rb +0 -35
  157. data/test/scanner/test_unicode_blocks.rb +0 -30
  158. data/test/support/disable_autotest.rb +0 -8
  159. data/test/syntax/test_all.rb +0 -6
  160. data/test/syntax/test_syntax.rb +0 -61
  161. data/test/syntax/test_syntax_token_map.rb +0 -25
  162. data/test/syntax/versions/test_1.8.rb +0 -55
  163. data/test/syntax/versions/test_1.9.1.rb +0 -36
  164. data/test/syntax/versions/test_1.9.3.rb +0 -32
  165. data/test/syntax/versions/test_2.0.0.rb +0 -37
  166. data/test/syntax/versions/test_2.2.0.rb +0 -32
  167. data/test/syntax/versions/test_aliases.rb +0 -129
  168. data/test/syntax/versions/test_all.rb +0 -5
  169. data/test/test_all.rb +0 -5
  170. data/test/token/test_all.rb +0 -2
  171. data/test/token/test_token.rb +0 -107
@@ -21,9 +21,6 @@
21
21
  when '\W'; emit(:type, :nonword, text, ts - 1, te)
22
22
  when '\R'; emit(:type, :linebreak, text, ts - 1, te)
23
23
  when '\X'; emit(:type, :xgrapheme, text, ts - 1, te)
24
- else
25
- raise ScannerError.new(
26
- "Unexpected character in type at #{text} (char #{ts})")
27
24
  end
28
25
  fret;
29
26
  };
@@ -6,6 +6,8 @@ adlam: adlam
6
6
  age=1.1: age=1.1
7
7
  age=10.0: age=10.0
8
8
  age=11.0: age=11.0
9
+ age=12.0: age=12.0
10
+ age=12.1: age=12.1
9
11
  age=2.0: age=2.0
10
12
  age=2.1: age=2.1
11
13
  age=3.0: age=3.0
@@ -64,7 +66,6 @@ changeswhenuppercased: changes_when_uppercased
64
66
  cherokee: cherokee
65
67
  closepunctuation: close_punctuation
66
68
  cntrl: cntrl
67
- combiningmark: combining_mark
68
69
  common: common
69
70
  connectorpunctuation: connector_punctuation
70
71
  control: control
@@ -86,6 +87,7 @@ dogra: dogra
86
87
  duployan: duployan
87
88
  egyptianhieroglyphs: egyptian_hieroglyphs
88
89
  elbasan: elbasan
90
+ elymaic: elymaic
89
91
  emoji: emoji
90
92
  emojicomponent: emoji_component
91
93
  emojimodifier: emoji_modifier
@@ -206,8 +208,10 @@ indogra: in_dogra
206
208
  indominotiles: in_domino_tiles
207
209
  induployan: in_duployan
208
210
  inearlydynasticcuneiform: in_early_dynastic_cuneiform
211
+ inegyptianhieroglyphformatcontrols: in_egyptian_hieroglyph_format_controls
209
212
  inegyptianhieroglyphs: in_egyptian_hieroglyphs
210
213
  inelbasan: in_elbasan
214
+ inelymaic: in_elymaic
211
215
  inemoticons: in_emoticons
212
216
  inenclosedalphanumerics: in_enclosed_alphanumerics
213
217
  inenclosedalphanumericsupplement: in_enclosed_alphanumeric_supplement
@@ -322,12 +326,14 @@ inmyanmar: in_myanmar
322
326
  inmyanmarextendeda: in_myanmar_extended_a
323
327
  inmyanmarextendedb: in_myanmar_extended_b
324
328
  innabataean: in_nabataean
329
+ innandinagari: in_nandinagari
325
330
  innewa: in_newa
326
331
  innewtailue: in_new_tai_lue
327
332
  innko: in_nko
328
333
  innoblock: in_no_block
329
334
  innumberforms: in_number_forms
330
335
  innushu: in_nushu
336
+ innyiakengpuachuehmong: in_nyiakeng_puachue_hmong
331
337
  inogham: in_ogham
332
338
  inolchiki: in_ol_chiki
333
339
  inoldhungarian: in_old_hungarian
@@ -343,6 +349,7 @@ inoriya: in_oriya
343
349
  inornamentaldingbats: in_ornamental_dingbats
344
350
  inosage: in_osage
345
351
  inosmanya: in_osmanya
352
+ inottomansiyaqnumbers: in_ottoman_siyaq_numbers
346
353
  inpahawhhmong: in_pahawh_hmong
347
354
  inpalmyrene: in_palmyrene
348
355
  inpaucinhau: in_pau_cin_hau
@@ -368,6 +375,7 @@ insiddham: in_siddham
368
375
  insinhala: in_sinhala
369
376
  insinhalaarchaicnumbers: in_sinhala_archaic_numbers
370
377
  insmallformvariants: in_small_form_variants
378
+ insmallkanaextension: in_small_kana_extension
371
379
  insogdian: in_sogdian
372
380
  insorasompeng: in_sora_sompeng
373
381
  insoyombo: in_soyombo
@@ -386,6 +394,7 @@ insupplementaryprivateuseareaa: in_supplementary_private_use_area_a
386
394
  insupplementaryprivateuseareab: in_supplementary_private_use_area_b
387
395
  insuttonsignwriting: in_sutton_signwriting
388
396
  insylotinagri: in_syloti_nagri
397
+ insymbolsandpictographsextendeda: in_symbols_and_pictographs_extended_a
389
398
  insyriac: in_syriac
390
399
  insyriacsupplement: in_syriac_supplement
391
400
  intagalog: in_tagalog
@@ -397,6 +406,7 @@ intaiviet: in_tai_viet
397
406
  intaixuanjingsymbols: in_tai_xuan_jing_symbols
398
407
  intakri: in_takri
399
408
  intamil: in_tamil
409
+ intamilsupplement: in_tamil_supplement
400
410
  intangut: in_tangut
401
411
  intangutcomponents: in_tangut_components
402
412
  intelugu: in_telugu
@@ -414,6 +424,7 @@ invariationselectors: in_variation_selectors
414
424
  invariationselectorssupplement: in_variation_selectors_supplement
415
425
  invedicextensions: in_vedic_extensions
416
426
  inverticalforms: in_vertical_forms
427
+ inwancho: in_wancho
417
428
  inwarangciti: in_warang_citi
418
429
  inyijinghexagramsymbols: in_yijing_hexagram_symbols
419
430
  inyiradicals: in_yi_radicals
@@ -469,6 +480,7 @@ mro: mro
469
480
  multani: multani
470
481
  myanmar: myanmar
471
482
  nabataean: nabataean
483
+ nandinagari: nandinagari
472
484
  newa: newa
473
485
  newline: newline
474
486
  newtailue: new_tai_lue
@@ -477,6 +489,7 @@ noncharactercodepoint: noncharacter_code_point
477
489
  nonspacingmark: nonspacing_mark
478
490
  number: number
479
491
  nushu: nushu
492
+ nyiakengpuachuehmong: nyiakeng_puachue_hmong
480
493
  ogham: ogham
481
494
  olchiki: ol_chiki
482
495
  oldhungarian: old_hungarian
@@ -569,6 +582,7 @@ uppercase: uppercase
569
582
  uppercaseletter: uppercase_letter
570
583
  vai: vai
571
584
  variationselector: variation_selector
585
+ wancho: wancho
572
586
  warangciti: warang_citi
573
587
  whitespace: white_space
574
588
  word: word
@@ -31,6 +31,7 @@ cher: cherokee
31
31
  ci: case_ignorable
32
32
  cn: unassigned
33
33
  co: private_use
34
+ combiningmark: mark
34
35
  copt: coptic
35
36
  cprt: cypriot
36
37
  cs: surrogate
@@ -49,6 +50,7 @@ dsrt: deseret
49
50
  dupl: duployan
50
51
  egyp: egyptian_hieroglyphs
51
52
  elba: elbasan
53
+ elym: elymaic
52
54
  ethi: ethiopic
53
55
  ext: extender
54
56
  geor: georgian
@@ -72,6 +74,7 @@ hex: hex_digit
72
74
  hira: hiragana
73
75
  hluw: anatolian_hieroglyphs
74
76
  hmng: pahawh_hmong
77
+ hmnp: nyiakeng_puachue_hmong
75
78
  hung: old_hungarian
76
79
  idc: id_continue
77
80
  ideo: ideographic
@@ -125,6 +128,7 @@ mtei: meetei_mayek
125
128
  mult: multani
126
129
  mymr: myanmar
127
130
  n: number
131
+ nand: nandinagari
128
132
  narb: old_north_arabian
129
133
  nbat: nabataean
130
134
  nchar: noncharacter_code_point
@@ -216,6 +220,7 @@ uideo: unified_ideograph
216
220
  vaii: vai
217
221
  vs: variation_selector
218
222
  wara: warang_citi
223
+ wcho: wancho
219
224
  wspace: white_space
220
225
  xidc: xid_continue
221
226
  xids: xid_start
@@ -49,9 +49,9 @@
49
49
  codepoint_list = 'u{' . xdigit{1,6} . (space . xdigit{1,6})* . '}';
50
50
  codepoint_sequence = codepoint_single | codepoint_list;
51
51
 
52
- control_sequence = ('c' | 'C-') . (backslash . 'M-')?;
52
+ control_sequence = ('c' | 'C-') . (backslash . 'M-')? . backslash? . any;
53
53
 
54
- meta_sequence = 'M-' . (backslash . control_sequence)?;
54
+ meta_sequence = 'M-' . (backslash . ('c' | 'C-'))? . backslash? . any;
55
55
 
56
56
  zero_or_one = '?' | '??' | '?+';
57
57
  zero_or_more = '*' | '*?' | '*+';
@@ -62,13 +62,17 @@
62
62
  quantifier_possessive = '?+' | '*+' | '++';
63
63
  quantifier_mode = '?' | '+';
64
64
 
65
- quantifier_interval = range_open . (digit+)? . ','? . (digit+)? .
66
- range_close . quantifier_mode?;
65
+ quantity_exact = (digit+);
66
+ quantity_minimum = (digit+) . ',';
67
+ quantity_maximum = ',' . (digit+);
68
+ quantity_range = (digit+) . ',' . (digit+);
69
+ quantifier_interval = range_open . ( quantity_exact | quantity_minimum |
70
+ quantity_maximum | quantity_range ) . range_close .
71
+ quantifier_mode?;
67
72
 
68
73
  quantifiers = quantifier_greedy | quantifier_reluctant |
69
74
  quantifier_possessive | quantifier_interval;
70
75
 
71
-
72
76
  conditional = '(?(';
73
77
 
74
78
  group_comment = '?#' . [^)]* . group_close;
@@ -82,7 +86,8 @@
82
86
  assertion_lookbehind = '?<=';
83
87
  assertion_nlookbehind = '?<!';
84
88
 
85
- group_options = '?' . [\-mixdau];
89
+ # try to treat every other group head as options group, like Ruby
90
+ group_options = '?' . ( [^!#'():<=>~]+ . ':'? ) ?;
86
91
 
87
92
  group_ref = [gk];
88
93
  group_name_char = (alnum | '_');
@@ -113,6 +118,8 @@
113
118
  curlies | parantheses | brackets |
114
119
  line_anchor | quantifier_greedy;
115
120
 
121
+ literal_delimiters = ']' | '}';
122
+
116
123
  ascii_print = ((0x20..0x7e) - meta_char);
117
124
  ascii_nonprint = (0x01..0x1f | 0x7f);
118
125
 
@@ -135,41 +142,35 @@
135
142
  # Invalid sequence error, used from sequences, like escapes and sets
136
143
  action invalid_sequence_error {
137
144
  text = ts ? copy(data, ts-1..-1) : data.pack('c*')
138
- raise InvalidSequenceError.new('sequence', text)
145
+ validation_error(:sequence, 'sequence', text)
139
146
  }
140
147
 
141
148
  # group (nesting) and set open/close actions
142
- action group_opened { self.group_depth = group_depth + 1; in_group = true }
143
- action group_closed { self.group_depth = group_depth - 1; in_group = group_depth > 0 ? true : false }
149
+ action group_opened { self.group_depth = group_depth + 1 }
150
+ action group_closed { self.group_depth = group_depth - 1 }
151
+ action set_opened { self.set_depth = set_depth + 1 }
152
+ action set_closed { self.set_depth = set_depth - 1 }
144
153
 
145
154
  # Character set scanner, continues consuming characters until it meets the
146
155
  # closing bracket of the set.
147
156
  # --------------------------------------------------------------------------
148
157
  character_set := |*
149
- set_close > (set_meta, 2) {
150
- set_depth -= 1
151
- in_set = set_depth > 0 ? true : false
152
-
158
+ set_close > (set_meta, 2) @set_closed {
153
159
  emit(:set, :close, *text(data, ts, te))
154
-
155
- if set_depth == 0
156
- fgoto main;
157
- else
160
+ if in_set?
158
161
  fret;
162
+ else
163
+ fgoto main;
159
164
  end
160
165
  };
161
166
 
162
- '-]' { # special case, emits two tokens
163
- set_depth -= 1
164
- in_set = set_depth > 0 ? true : false
165
-
166
- emit(:literal, :literal, copy(data, ts..te-2), ts, te)
167
- emit(:set, :close, copy(data, ts+1..te-1), ts, te)
168
-
169
- if set_depth == 0
170
- fgoto main;
171
- else
167
+ '-]' @set_closed { # special case, emits two tokens
168
+ emit(:literal, :literal, copy(data, ts..te-2), ts, te - 1)
169
+ emit(:set, :close, copy(data, ts+1..te-1), ts + 1, te)
170
+ if in_set?
172
171
  fret;
172
+ else
173
+ fgoto main;
173
174
  end
174
175
  };
175
176
 
@@ -207,14 +208,12 @@
207
208
  fcall set_escape_sequence;
208
209
  };
209
210
 
210
- set_open >(open_bracket, 1) {
211
- set_depth += 1
212
-
211
+ set_open >(open_bracket, 1) >set_opened {
213
212
  emit(:set, :open, *text(data, ts, te))
214
213
  fcall character_set;
215
214
  };
216
215
 
217
- class_posix >(open_bracket, 1) @eof(premature_end_error) {
216
+ class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error) {
218
217
  text = text(data, ts, te).first
219
218
 
220
219
  type = :posixclass
@@ -227,11 +226,11 @@
227
226
  emit(type, class_name.to_sym, text, ts, te)
228
227
  };
229
228
 
230
- collating_sequence >(open_bracket, 1) @eof(premature_end_error) {
229
+ collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
231
230
  emit(:set, :collation, *text(data, ts, te))
232
231
  };
233
232
 
234
- character_equivalent >(open_bracket, 1) @eof(premature_end_error) {
233
+ character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
235
234
  emit(:set, :equivalent, *text(data, ts, te))
236
235
  };
237
236
 
@@ -337,44 +336,24 @@
337
336
  };
338
337
 
339
338
  control_sequence >(escaped_alpha, 4) $eof(premature_end_error) {
340
- if data[te]
341
- c = data[te].chr
342
- if c =~ /[\x00-\x7F]/
343
- emit(:escape, :control, copy(data, ts-1..te), ts-1, te+1)
344
- p += 1
345
- else
346
- raise InvalidSequenceError.new("control sequence")
347
- end
348
- else
349
- raise PrematureEndError.new("control sequence")
350
- end
339
+ emit_meta_control_sequence(data, ts, te, :control)
351
340
  fret;
352
341
  };
353
342
 
354
343
  meta_sequence >(backslashed, 3) $eof(premature_end_error) {
355
- if data[te]
356
- c = data[te].chr
357
- if c =~ /[\x00-\x7F]/
358
- emit(:escape, :meta_sequence, copy(data, ts-1..te), ts-1, te+1)
359
- p += 1
360
- else
361
- raise InvalidSequenceError.new("meta sequence")
362
- end
363
- else
364
- raise PrematureEndError.new("meta sequence")
365
- end
344
+ emit_meta_control_sequence(data, ts, te, :meta_sequence)
366
345
  fret;
367
346
  };
368
347
 
369
348
  char_type_char > (escaped_alpha, 2) {
370
349
  fhold;
371
- fnext *(in_set ? fentry(character_set) : fentry(main));
350
+ fnext *(in_set? ? fentry(character_set) : fentry(main));
372
351
  fcall char_type;
373
352
  };
374
353
 
375
354
  property_char > (escaped_alpha, 2) {
376
355
  fhold;
377
- fnext *(in_set ? fentry(character_set) : fentry(main));
356
+ fnext *(in_set? ? fentry(character_set) : fentry(main));
378
357
  fcall unicode_property;
379
358
  };
380
359
 
@@ -412,8 +391,7 @@
412
391
  };
413
392
 
414
393
  alternation {
415
- if in_conditional and conditional_stack.length > 0 and
416
- conditional_stack.last[1] == group_depth
394
+ if conditional_stack.last == group_depth
417
395
  emit(:conditional, :separator, *text(data, ts, te))
418
396
  else
419
397
  emit(:meta, :alternation, *text(data, ts, te))
@@ -442,18 +420,16 @@
442
420
  when '\\b'; emit(:anchor, :word_boundary, text, ts, te)
443
421
  when '\\B'; emit(:anchor, :nonword_boundary, text, ts, te)
444
422
  when '\\G'; emit(:anchor, :match_start, text, ts, te)
445
- else
446
- raise ScannerError.new(
447
- "Unexpected character in anchor at #{text} (char #{ts})")
448
423
  end
449
424
  };
450
425
 
426
+ literal_delimiters {
427
+ append_literal(data, ts, te)
428
+ };
429
+
451
430
  # Character sets
452
431
  # ------------------------------------------------------------------------
453
- set_open {
454
- set_depth += 1
455
- in_set = true
456
-
432
+ set_open >set_opened {
457
433
  emit(:set, :open, *text(data, ts, te))
458
434
  fcall character_set;
459
435
  };
@@ -465,9 +441,7 @@
465
441
  conditional {
466
442
  text = text(data, ts, te).first
467
443
 
468
- in_conditional = true unless in_conditional
469
- conditional_depth += 1
470
- conditional_stack << [conditional_depth, group_depth]
444
+ conditional_stack << group_depth
471
445
 
472
446
  emit(:conditional, :open, text[0..-2], ts, te-1)
473
447
  emit(:conditional, :condition_open, '(', te-1, te)
@@ -496,7 +470,11 @@
496
470
  # (?imxdau-imx:subexp) option on/off for subexp
497
471
  # ------------------------------------------------------------------------
498
472
  group_open . group_options >group_opened {
499
- p = scan_options(p, data, ts, te)
473
+ text = text(data, ts, te).first
474
+ if text[2..-1] =~ /([^\-mixdau:]|^$)|-.*([dau])/
475
+ raise InvalidGroupOption.new($1 || "-#{$2}", text)
476
+ end
477
+ emit_options(text, ts, te)
500
478
  };
501
479
 
502
480
  # Assertions
@@ -528,19 +506,15 @@
528
506
  when '(?>'; emit(:group, :atomic, text, ts, te)
529
507
  when '(?~'; emit(:group, :absence, text, ts, te)
530
508
 
531
- when /^\(\?<(\w*)>/
532
- empty_name_error(:group, 'named group (ab)') if $1.empty?
509
+ when /^\(\?(?:<>|'')/
510
+ validation_error(:group, 'named group', 'name is empty')
533
511
 
512
+ when /^\(\?<\w*>/
534
513
  emit(:group, :named_ab, text, ts, te)
535
514
 
536
- when /^\(\?'(\w*)'/
537
- empty_name_error(:group, 'named group (sq)') if $1.empty?
538
-
515
+ when /^\(\?'\w*'/
539
516
  emit(:group, :named_sq, text, ts, te)
540
517
 
541
- else
542
- raise ScannerError.new(
543
- "Unknown subexpression group format '#{text}'")
544
518
  end
545
519
  };
546
520
 
@@ -550,20 +524,13 @@
550
524
  };
551
525
 
552
526
  group_close @group_closed {
553
- if in_conditional and conditional_stack.last and
554
- conditional_stack.last[1] == (group_depth + 1)
555
-
556
- emit(:conditional, :close, *text(data, ts, te))
527
+ if conditional_stack.last == group_depth + 1
557
528
  conditional_stack.pop
558
-
559
- if conditional_stack.length == 0
560
- in_conditional = false
561
- end
529
+ emit(:conditional, :close, *text(data, ts, te))
562
530
  else
563
- if spacing_stack.length > 1 and
564
- spacing_stack.last[:depth] == (group_depth + 1)
531
+ if spacing_stack.length > 1 &&
532
+ spacing_stack.last[:depth] == group_depth + 1
565
533
  spacing_stack.pop
566
-
567
534
  self.free_spacing = spacing_stack.last[:free_spacing]
568
535
  end
569
536
 
@@ -576,11 +543,8 @@
576
543
  # ------------------------------------------------------------------------
577
544
  backslash . (group_name_ref | group_number_ref) > (backslashed, 4) {
578
545
  case text = text(data, ts, te).first
579
- when /^\\([gk])<>/ # angle brackets
580
- empty_backref_error("ref/call (ab)")
581
-
582
- when /^\\([gk])''/ # single quotes
583
- empty_backref_error("ref/call (sq)")
546
+ when /^\\([gk])(<>|'')/ # angle brackets
547
+ validation_error(:backref, 'ref/call', 'ref ID is empty')
584
548
 
585
549
  when /^\\([gk])<[^\d+-]\w*>/ # angle-brackets
586
550
  if $1 == 'k'
@@ -636,9 +600,6 @@
636
600
  when /^\\([gk])'[+\-]?\d+[+\-]\d+'/ # single-quotes
637
601
  emit(:backref, :number_recursion_ref_sq, text, ts, te)
638
602
 
639
- else
640
- raise ScannerError.new(
641
- "Unknown backreference format '#{text}'")
642
603
  end
643
604
  };
644
605
 
@@ -669,10 +630,15 @@
669
630
  end
670
631
  };
671
632
 
672
- quantifier_interval @err(premature_end_error) {
633
+ quantifier_interval {
673
634
  emit(:quantifier, :interval, *text(data, ts, te))
674
635
  };
675
636
 
637
+ # Catch unmatched curly braces as literals
638
+ range_open {
639
+ append_literal(data, ts, te)
640
+ };
641
+
676
642
  # Escaped sequences
677
643
  # ------------------------------------------------------------------------
678
644
  backslash > (backslashed, 1) {
@@ -786,7 +752,7 @@ class Regexp::Scanner
786
752
  input = input_object
787
753
  self.free_spacing = false
788
754
  end
789
-
755
+ self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
790
756
 
791
757
  data = input.unpack("c*") if input.is_a?(String)
792
758
  eof = data.length
@@ -794,15 +760,9 @@ class Regexp::Scanner
794
760
  self.tokens = []
795
761
  self.block = block_given? ? block : nil
796
762
 
797
- self.in_group = false
763
+ self.set_depth = 0
798
764
  self.group_depth = 0
799
- self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
800
-
801
- in_set = false
802
- set_depth = 0
803
- in_conditional = false
804
- conditional_depth = 0
805
- conditional_stack = []
765
+ self.conditional_stack = []
806
766
 
807
767
  %% write data;
808
768
  %% write init;
@@ -817,9 +777,9 @@ class Regexp::Scanner
817
777
  end
818
778
 
819
779
  raise PrematureEndError.new("(missing group closing paranthesis) "+
820
- "[#{in_group}:#{group_depth}]") if in_group
780
+ "[#{group_depth}]") if in_group?
821
781
  raise PrematureEndError.new("(missing set closing bracket) "+
822
- "[#{in_set}:#{set_depth}]") if in_set
782
+ "[#{set_depth}]") if in_set?
823
783
 
824
784
  # when the entire expression is a literal run
825
785
  emit_literal if literal
@@ -854,62 +814,15 @@ class Regexp::Scanner
854
814
 
855
815
  private
856
816
 
857
- attr_accessor :tokens, :literal, :block,
858
- :in_group, :group_depth,
859
- :free_spacing, :spacing_stack
860
-
861
- # Ragel's regex-based scan of the group options introduced a lot of
862
- # ambiguity, so we just ask it to find the beginning of what looks
863
- # like an options run and handle the rest in here.
864
- def scan_options(p, data, ts, te)
865
- text = text(data, ts, te).first
817
+ attr_accessor :tokens, :literal, :block, :free_spacing, :spacing_stack,
818
+ :group_depth, :set_depth, :conditional_stack
866
819
 
867
- options_char, options_length = true, 0
868
-
869
- # Copy while we have option characters. There is no maximum length,
870
- # as ruby allows things like '(?xxxxxxxxx-xxxxxxxxxxxxx:abc)'.
871
- negative_options = false
872
- while options_char
873
- if data[te + options_length]
874
- c = data[te + options_length].chr
875
-
876
- if c =~ /[-mixdau]/
877
- negative_options = true if c == '-'
878
-
879
- raise InvalidGroupOption.new(c, text) if negative_options and
880
- c =~ /[dau]/
881
-
882
- text << c ; p += 1 ; options_length += 1
883
- else
884
- options_char = false
885
- end
886
- else
887
- raise PrematureEndError.new("expression options `#{text}'")
888
- end
889
- end
890
-
891
- if data[te + options_length]
892
- c = data[te + options_length].chr
893
-
894
- if c == ':'
895
- # Include the ':' in the options text
896
- text << c ; p += 1 ; options_length += 1
897
- emit_options(text, ts, te + options_length)
898
-
899
- elsif c == ')'
900
- # Don't include the closing ')', let group_close handle it.
901
- emit_options(text, ts, te + options_length)
902
-
903
- else
904
- # Plain Regexp reports this as 'undefined group option'
905
- raise ScannerError.new(
906
- "Unexpected `#{c}' in options sequence, ':' or ')' expected")
907
- end
908
- else
909
- raise PrematureEndError.new("expression options `#{text}'")
910
- end
820
+ def in_group?
821
+ group_depth > 0
822
+ end
911
823
 
912
- p # return the new value of the data pointer
824
+ def in_set?
825
+ set_depth > 0
913
826
  end
914
827
 
915
828
  # Copy from ts to te from data as text
@@ -945,32 +858,39 @@ class Regexp::Scanner
945
858
  def emit_options(text, ts, te)
946
859
  token = nil
947
860
 
948
- if text =~ /\(\?([mixdau]*)-?([mix]*)(:)?/
949
- positive, negative, group_local = $1, $2, $3
861
+ # Ruby allows things like '(?-xxxx)' or '(?xx-xx--xx-:abc)'.
862
+ text =~ /\(\?([mixdau]*)(-(?:[mix]*))*(:)?/
863
+ positive, negative, group_local = $1, $2, $3
950
864
 
951
- if positive.include?('x')
952
- self.free_spacing = true
953
- end
865
+ if positive.include?('x')
866
+ self.free_spacing = true
867
+ end
954
868
 
955
- # If the x appears in both, treat it like ruby does, the second cancels
956
- # the first.
957
- if negative.include?('x')
958
- self.free_spacing = false
959
- end
869
+ # If the x appears in both, treat it like ruby does, the second cancels
870
+ # the first.
871
+ if negative && negative.include?('x')
872
+ self.free_spacing = false
873
+ end
960
874
 
961
- if group_local
962
- spacing_stack << {:free_spacing => free_spacing, :depth => group_depth}
963
- token = :options
964
- else
965
- # switch for parent group level
966
- spacing_stack.last[:free_spacing] = free_spacing
967
- token = :options_switch
968
- end
875
+ if group_local
876
+ spacing_stack << {:free_spacing => free_spacing, :depth => group_depth}
877
+ token = :options
878
+ else
879
+ # switch for parent group level
880
+ spacing_stack.last[:free_spacing] = free_spacing
881
+ token = :options_switch
969
882
  end
970
883
 
971
884
  emit(:group, token, text, ts, te)
972
885
  end
973
886
 
887
+ def emit_meta_control_sequence(data, ts, te, token)
888
+ if data.last < 0x00 || data.last > 0x7F
889
+ validation_error(:sequence, 'escape', token.to_s)
890
+ end
891
+ emit(:escape, token, *text(data, ts, te, 1))
892
+ end
893
+
974
894
  # Centralizes and unifies the handling of validation related
975
895
  # errors.
976
896
  def validation_error(type, what, reason)
@@ -981,21 +901,8 @@ class Regexp::Scanner
981
901
  error = InvalidBackrefError.new(what, reason)
982
902
  when :sequence
983
903
  error = InvalidSequenceError.new(what, reason)
984
- else
985
- error = ValidationError.new('expression')
986
904
  end
987
905
 
988
906
  raise error # unless @@config.validation_ignore
989
907
  end
990
-
991
- # Used for references with an empty name or number
992
- def empty_backref_error(type, what)
993
- validation_error(:backref, what, 'ref ID is empty')
994
- end
995
-
996
- # Used for named expressions with an empty name
997
- def empty_name_error(type, what)
998
- validation_error(type, what, 'name is empty')
999
- end
1000
-
1001
908
  end # module Regexp::Scanner