regexp_parser 1.7.0 → 2.8.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (165) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +364 -22
  3. data/Gemfile +8 -2
  4. data/LICENSE +1 -1
  5. data/README.md +124 -88
  6. data/Rakefile +6 -70
  7. data/lib/regexp_parser/error.rb +4 -0
  8. data/lib/regexp_parser/expression/base.rb +76 -0
  9. data/lib/regexp_parser/expression/classes/alternation.rb +1 -1
  10. data/lib/regexp_parser/expression/classes/anchor.rb +0 -2
  11. data/lib/regexp_parser/expression/classes/{backref.rb → backreference.rb} +22 -2
  12. data/lib/regexp_parser/expression/classes/{set → character_set}/range.rb +4 -8
  13. data/lib/regexp_parser/expression/classes/{set.rb → character_set.rb} +3 -4
  14. data/lib/regexp_parser/expression/classes/{type.rb → character_type.rb} +0 -2
  15. data/lib/regexp_parser/expression/classes/conditional.rb +11 -5
  16. data/lib/regexp_parser/expression/classes/{escape.rb → escape_sequence.rb} +15 -7
  17. data/lib/regexp_parser/expression/classes/free_space.rb +5 -5
  18. data/lib/regexp_parser/expression/classes/group.rb +28 -15
  19. data/lib/regexp_parser/expression/classes/keep.rb +2 -0
  20. data/lib/regexp_parser/expression/classes/literal.rb +1 -5
  21. data/lib/regexp_parser/expression/classes/posix_class.rb +5 -1
  22. data/lib/regexp_parser/expression/classes/root.rb +4 -19
  23. data/lib/regexp_parser/expression/classes/{property.rb → unicode_property.rb} +5 -3
  24. data/lib/regexp_parser/expression/methods/construct.rb +41 -0
  25. data/lib/regexp_parser/expression/methods/human_name.rb +43 -0
  26. data/lib/regexp_parser/expression/methods/match_length.rb +11 -7
  27. data/lib/regexp_parser/expression/methods/parts.rb +23 -0
  28. data/lib/regexp_parser/expression/methods/printing.rb +26 -0
  29. data/lib/regexp_parser/expression/methods/strfregexp.rb +1 -1
  30. data/lib/regexp_parser/expression/methods/tests.rb +47 -1
  31. data/lib/regexp_parser/expression/methods/traverse.rb +34 -18
  32. data/lib/regexp_parser/expression/quantifier.rb +57 -17
  33. data/lib/regexp_parser/expression/sequence.rb +11 -47
  34. data/lib/regexp_parser/expression/sequence_operation.rb +4 -9
  35. data/lib/regexp_parser/expression/shared.rb +111 -0
  36. data/lib/regexp_parser/expression/subexpression.rb +27 -19
  37. data/lib/regexp_parser/expression.rb +14 -141
  38. data/lib/regexp_parser/lexer.rb +83 -41
  39. data/lib/regexp_parser/parser.rb +371 -429
  40. data/lib/regexp_parser/scanner/char_type.rl +11 -11
  41. data/lib/regexp_parser/scanner/errors/premature_end_error.rb +8 -0
  42. data/lib/regexp_parser/scanner/errors/scanner_error.rb +6 -0
  43. data/lib/regexp_parser/scanner/errors/validation_error.rb +63 -0
  44. data/lib/regexp_parser/scanner/properties/long.csv +633 -0
  45. data/lib/regexp_parser/scanner/properties/short.csv +248 -0
  46. data/lib/regexp_parser/scanner/property.rl +4 -4
  47. data/lib/regexp_parser/scanner/scanner.rl +295 -368
  48. data/lib/regexp_parser/scanner.rb +1405 -1674
  49. data/lib/regexp_parser/syntax/any.rb +2 -7
  50. data/lib/regexp_parser/syntax/base.rb +92 -67
  51. data/lib/regexp_parser/syntax/token/anchor.rb +15 -0
  52. data/lib/regexp_parser/syntax/{tokens → token}/assertion.rb +2 -2
  53. data/lib/regexp_parser/syntax/token/backreference.rb +33 -0
  54. data/lib/regexp_parser/syntax/token/character_set.rb +16 -0
  55. data/lib/regexp_parser/syntax/{tokens → token}/character_type.rb +3 -3
  56. data/lib/regexp_parser/syntax/{tokens → token}/conditional.rb +3 -3
  57. data/lib/regexp_parser/syntax/token/escape.rb +33 -0
  58. data/lib/regexp_parser/syntax/{tokens → token}/group.rb +7 -7
  59. data/lib/regexp_parser/syntax/{tokens → token}/keep.rb +1 -1
  60. data/lib/regexp_parser/syntax/token/meta.rb +20 -0
  61. data/lib/regexp_parser/syntax/{tokens → token}/posix_class.rb +3 -3
  62. data/lib/regexp_parser/syntax/token/quantifier.rb +35 -0
  63. data/lib/regexp_parser/syntax/token/unicode_property.rb +733 -0
  64. data/lib/regexp_parser/syntax/token/virtual.rb +11 -0
  65. data/lib/regexp_parser/syntax/token.rb +45 -0
  66. data/lib/regexp_parser/syntax/version_lookup.rb +19 -36
  67. data/lib/regexp_parser/syntax/versions/1.8.6.rb +13 -20
  68. data/lib/regexp_parser/syntax/versions/1.9.1.rb +10 -17
  69. data/lib/regexp_parser/syntax/versions/1.9.3.rb +3 -10
  70. data/lib/regexp_parser/syntax/versions/2.0.0.rb +8 -15
  71. data/lib/regexp_parser/syntax/versions/2.2.0.rb +3 -9
  72. data/lib/regexp_parser/syntax/versions/2.3.0.rb +3 -9
  73. data/lib/regexp_parser/syntax/versions/2.4.0.rb +3 -9
  74. data/lib/regexp_parser/syntax/versions/2.4.1.rb +2 -8
  75. data/lib/regexp_parser/syntax/versions/2.5.0.rb +3 -9
  76. data/lib/regexp_parser/syntax/versions/2.6.0.rb +3 -9
  77. data/lib/regexp_parser/syntax/versions/2.6.2.rb +3 -9
  78. data/lib/regexp_parser/syntax/versions/2.6.3.rb +3 -9
  79. data/lib/regexp_parser/syntax/versions/3.1.0.rb +4 -0
  80. data/lib/regexp_parser/syntax/versions/3.2.0.rb +4 -0
  81. data/lib/regexp_parser/syntax/versions.rb +3 -1
  82. data/lib/regexp_parser/syntax.rb +8 -6
  83. data/lib/regexp_parser/token.rb +9 -20
  84. data/lib/regexp_parser/version.rb +1 -1
  85. data/lib/regexp_parser.rb +0 -2
  86. data/regexp_parser.gemspec +20 -22
  87. metadata +49 -166
  88. data/lib/regexp_parser/scanner/properties/long.yml +0 -594
  89. data/lib/regexp_parser/scanner/properties/short.yml +0 -237
  90. data/lib/regexp_parser/syntax/tokens/anchor.rb +0 -15
  91. data/lib/regexp_parser/syntax/tokens/backref.rb +0 -24
  92. data/lib/regexp_parser/syntax/tokens/character_set.rb +0 -13
  93. data/lib/regexp_parser/syntax/tokens/escape.rb +0 -30
  94. data/lib/regexp_parser/syntax/tokens/meta.rb +0 -13
  95. data/lib/regexp_parser/syntax/tokens/quantifier.rb +0 -35
  96. data/lib/regexp_parser/syntax/tokens/unicode_property.rb +0 -675
  97. data/lib/regexp_parser/syntax/tokens.rb +0 -45
  98. data/spec/expression/base_spec.rb +0 -94
  99. data/spec/expression/clone_spec.rb +0 -120
  100. data/spec/expression/conditional_spec.rb +0 -89
  101. data/spec/expression/free_space_spec.rb +0 -27
  102. data/spec/expression/methods/match_length_spec.rb +0 -161
  103. data/spec/expression/methods/match_spec.rb +0 -25
  104. data/spec/expression/methods/strfregexp_spec.rb +0 -224
  105. data/spec/expression/methods/tests_spec.rb +0 -99
  106. data/spec/expression/methods/traverse_spec.rb +0 -161
  107. data/spec/expression/options_spec.rb +0 -128
  108. data/spec/expression/root_spec.rb +0 -9
  109. data/spec/expression/sequence_spec.rb +0 -9
  110. data/spec/expression/subexpression_spec.rb +0 -50
  111. data/spec/expression/to_h_spec.rb +0 -26
  112. data/spec/expression/to_s_spec.rb +0 -100
  113. data/spec/lexer/all_spec.rb +0 -22
  114. data/spec/lexer/conditionals_spec.rb +0 -53
  115. data/spec/lexer/escapes_spec.rb +0 -14
  116. data/spec/lexer/keep_spec.rb +0 -10
  117. data/spec/lexer/literals_spec.rb +0 -89
  118. data/spec/lexer/nesting_spec.rb +0 -99
  119. data/spec/lexer/refcalls_spec.rb +0 -55
  120. data/spec/parser/all_spec.rb +0 -43
  121. data/spec/parser/alternation_spec.rb +0 -88
  122. data/spec/parser/anchors_spec.rb +0 -17
  123. data/spec/parser/conditionals_spec.rb +0 -179
  124. data/spec/parser/errors_spec.rb +0 -30
  125. data/spec/parser/escapes_spec.rb +0 -121
  126. data/spec/parser/free_space_spec.rb +0 -130
  127. data/spec/parser/groups_spec.rb +0 -108
  128. data/spec/parser/keep_spec.rb +0 -6
  129. data/spec/parser/posix_classes_spec.rb +0 -8
  130. data/spec/parser/properties_spec.rb +0 -115
  131. data/spec/parser/quantifiers_spec.rb +0 -51
  132. data/spec/parser/refcalls_spec.rb +0 -112
  133. data/spec/parser/set/intersections_spec.rb +0 -127
  134. data/spec/parser/set/ranges_spec.rb +0 -111
  135. data/spec/parser/sets_spec.rb +0 -178
  136. data/spec/parser/types_spec.rb +0 -18
  137. data/spec/scanner/all_spec.rb +0 -18
  138. data/spec/scanner/anchors_spec.rb +0 -21
  139. data/spec/scanner/conditionals_spec.rb +0 -128
  140. data/spec/scanner/errors_spec.rb +0 -68
  141. data/spec/scanner/escapes_spec.rb +0 -53
  142. data/spec/scanner/free_space_spec.rb +0 -133
  143. data/spec/scanner/groups_spec.rb +0 -52
  144. data/spec/scanner/keep_spec.rb +0 -10
  145. data/spec/scanner/literals_spec.rb +0 -49
  146. data/spec/scanner/meta_spec.rb +0 -18
  147. data/spec/scanner/properties_spec.rb +0 -64
  148. data/spec/scanner/quantifiers_spec.rb +0 -20
  149. data/spec/scanner/refcalls_spec.rb +0 -36
  150. data/spec/scanner/sets_spec.rb +0 -102
  151. data/spec/scanner/types_spec.rb +0 -14
  152. data/spec/spec_helper.rb +0 -15
  153. data/spec/support/runner.rb +0 -42
  154. data/spec/support/shared_examples.rb +0 -77
  155. data/spec/support/warning_extractor.rb +0 -60
  156. data/spec/syntax/syntax_spec.rb +0 -48
  157. data/spec/syntax/syntax_token_map_spec.rb +0 -23
  158. data/spec/syntax/versions/1.8.6_spec.rb +0 -17
  159. data/spec/syntax/versions/1.9.1_spec.rb +0 -10
  160. data/spec/syntax/versions/1.9.3_spec.rb +0 -9
  161. data/spec/syntax/versions/2.0.0_spec.rb +0 -13
  162. data/spec/syntax/versions/2.2.0_spec.rb +0 -9
  163. data/spec/syntax/versions/aliases_spec.rb +0 -37
  164. data/spec/token/token_spec.rb +0 -85
  165. /data/lib/regexp_parser/expression/classes/{set → character_set}/intersection.rb +0 -0
@@ -3,6 +3,11 @@
3
3
  include re_char_type "char_type.rl";
4
4
  include re_property "property.rl";
5
5
 
6
+ utf8_2_byte = (0xc2..0xdf 0x80..0xbf);
7
+ utf8_3_byte = (0xe0..0xef 0x80..0xbf 0x80..0xbf);
8
+ utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
9
+ utf8_multibyte = utf8_2_byte | utf8_3_byte | utf8_4_byte;
10
+
6
11
  dot = '.';
7
12
  backslash = '\\';
8
13
  alternation = '|';
@@ -15,26 +20,15 @@
15
20
 
16
21
  group_open = '(';
17
22
  group_close = ')';
18
- parantheses = group_open | group_close;
23
+ parentheses = group_open | group_close;
19
24
 
20
25
  set_open = '[';
21
26
  set_close = ']';
22
27
  brackets = set_open | set_close;
23
28
 
24
- comment = ('#' . [^\n]* . '\n');
25
-
26
- class_name_posix = 'alnum' | 'alpha' | 'blank' |
27
- 'cntrl' | 'digit' | 'graph' |
28
- 'lower' | 'print' | 'punct' |
29
- 'space' | 'upper' | 'xdigit' |
30
- 'word' | 'ascii';
31
-
32
- class_posix = ('[:' . '^'? . class_name_posix . ':]');
33
-
29
+ comment = ('#' . [^\n]* . '\n'?);
34
30
 
35
- # these are not supported in ruby, and need verification
36
- collating_sequence = '[.' . (alpha | [\-])+ . '.]';
37
- character_equivalent = '[=' . alpha . '=]';
31
+ class_posix = ('[:' . '^'? . [^\[\]]* . ':]');
38
32
 
39
33
  line_anchor = beginning_of_line | end_of_line;
40
34
  anchor_char = [AbBzZG];
@@ -53,21 +47,20 @@
53
47
 
54
48
  meta_sequence = 'M-' . (backslash . ('c' | 'C-'))? . backslash? . any;
55
49
 
50
+ sequence_char = [CMcux];
51
+
56
52
  zero_or_one = '?' | '??' | '?+';
57
53
  zero_or_more = '*' | '*?' | '*+';
58
54
  one_or_more = '+' | '+?' | '++';
59
55
 
60
56
  quantifier_greedy = '?' | '*' | '+';
61
- quantifier_reluctant = '??' | '*?' | '+?';
62
- quantifier_possessive = '?+' | '*+' | '++';
63
- quantifier_mode = '?' | '+';
64
-
65
- quantifier_interval = range_open . (digit+)? . ','? . (digit+)? .
66
- range_close . quantifier_mode?;
67
-
68
- quantifiers = quantifier_greedy | quantifier_reluctant |
69
- quantifier_possessive | quantifier_interval;
70
57
 
58
+ quantity_exact = (digit+);
59
+ quantity_minimum = (digit+) . ',';
60
+ quantity_maximum = ',' . (digit+);
61
+ quantity_range = (digit+) . ',' . (digit+);
62
+ quantifier_interval = range_open . ( quantity_exact | quantity_minimum |
63
+ quantity_maximum | quantity_range ) . range_close;
71
64
 
72
65
  conditional = '(?(';
73
66
 
@@ -85,22 +78,22 @@
85
78
  # try to treat every other group head as options group, like Ruby
86
79
  group_options = '?' . ( [^!#'():<=>~]+ . ':'? ) ?;
87
80
 
88
- group_ref = [gk];
89
- group_name_char = (alnum | '_');
90
- group_name_id = (group_name_char . (group_name_char+)?)?;
91
- group_number = '-'? . [1-9] . ([0-9]+)?;
81
+ group_name_id_ab = ([^!0-9\->] | utf8_multibyte) . ([^>] | utf8_multibyte)*;
82
+ group_name_id_sq = ([^0-9\-'] | utf8_multibyte) . ([^'] | utf8_multibyte)*;
83
+ group_number = '-'? . [0-9]+;
92
84
  group_level = [+\-] . [0-9]+;
93
85
 
94
- group_name = ('<' . group_name_id . '>') | ("'" . group_name_id . "'");
86
+ group_name = ('<' . group_name_id_ab? . '>') |
87
+ ("'" . group_name_id_sq? . "'");
95
88
  group_lookup = group_name | group_number;
96
89
 
97
90
  group_named = ('?' . group_name );
98
91
 
99
- group_name_ref = group_ref . (('<' . group_name_id . group_level? '>') |
100
- ("'" . group_name_id . group_level? "'"));
92
+ group_ref_body = (('<' . (group_name_id_ab? | group_number) . group_level? '>') |
93
+ ("'" . (group_name_id_sq? | group_number) . group_level? "'"));
101
94
 
102
- group_number_ref = group_ref . (('<' . group_number . group_level? '>') |
103
- ("'" . group_number . group_level? "'"));
95
+ group_ref = 'k' . group_ref_body;
96
+ group_call = 'g' . group_ref_body;
104
97
 
105
98
  group_type = group_atomic | group_passive | group_absence | group_named;
106
99
 
@@ -111,32 +104,33 @@
111
104
 
112
105
  # characters that 'break' a literal
113
106
  meta_char = dot | backslash | alternation |
114
- curlies | parantheses | brackets |
107
+ curlies | parentheses | brackets |
115
108
  line_anchor | quantifier_greedy;
116
109
 
117
- ascii_print = ((0x20..0x7e) - meta_char);
118
- ascii_nonprint = (0x01..0x1f | 0x7f);
110
+ literal_delimiters = ']' | '}';
119
111
 
120
- utf8_2_byte = (0xc2..0xdf 0x80..0xbf);
121
- utf8_3_byte = (0xe0..0xef 0x80..0xbf 0x80..0xbf);
122
- utf8_4_byte = (0xf0..0xf4 0x80..0xbf 0x80..0xbf 0x80..0xbf);
112
+ ascii_print = ((0x20..0x7e) - meta_char - '#');
113
+ ascii_nonprint = (0x01..0x1f | 0x7f);
123
114
 
124
115
  non_literal_escape = char_type_char | anchor_char | escaped_ascii |
125
- group_ref | keep_mark | [xucCM];
116
+ keep_mark | sequence_char;
117
+
118
+ # escapes that also work within a character set
119
+ set_escape = backslash | brackets | escaped_ascii |
120
+ octal_sequence | property_char |
121
+ sequence_char | single_codepoint_char_type;
126
122
 
127
- non_set_escape = (anchor_char - 'b') | group_ref | keep_mark |
128
- multi_codepoint_char_type | [0-9cCM];
129
123
 
130
124
  # EOF error, used where it can be detected
131
125
  action premature_end_error {
132
- text = ts ? copy(data, ts-1..-1) : data.pack('c*')
133
- raise PrematureEndError.new( text )
126
+ text = copy(data, ts ? ts-1 : 0, -1)
127
+ raise PrematureEndError.new(text)
134
128
  }
135
129
 
136
130
  # Invalid sequence error, used from sequences, like escapes and sets
137
131
  action invalid_sequence_error {
138
- text = ts ? copy(data, ts-1..-1) : data.pack('c*')
139
- validation_error(:sequence, 'sequence', text)
132
+ text = copy(data, ts ? ts-1 : 0, -1)
133
+ raise ValidationError.for(:sequence, 'sequence', text)
140
134
  }
141
135
 
142
136
  # group (nesting) and set open/close actions
@@ -150,7 +144,7 @@
150
144
  # --------------------------------------------------------------------------
151
145
  character_set := |*
152
146
  set_close > (set_meta, 2) @set_closed {
153
- emit(:set, :close, *text(data, ts, te))
147
+ emit(:set, :close, copy(data, ts, te))
154
148
  if in_set?
155
149
  fret;
156
150
  else
@@ -159,8 +153,8 @@
159
153
  };
160
154
 
161
155
  '-]' @set_closed { # special case, emits two tokens
162
- emit(:literal, :literal, copy(data, ts..te-2), ts, te - 1)
163
- emit(:set, :close, copy(data, ts+1..te-1), ts + 1, te)
156
+ emit(:literal, :literal, '-')
157
+ emit(:set, :close, ']')
164
158
  if in_set?
165
159
  fret;
166
160
  else
@@ -169,33 +163,32 @@
169
163
  };
170
164
 
171
165
  '-&&' { # special case, emits two tokens
172
- emit(:literal, :literal, '-', ts, te)
173
- emit(:set, :intersection, '&&', ts, te)
166
+ emit(:literal, :literal, '-')
167
+ emit(:set, :intersection, '&&')
174
168
  };
175
169
 
176
170
  '^' {
177
- text = text(data, ts, te).first
178
- if tokens.last[1] == :open
179
- emit(:set, :negate, text, ts, te)
171
+ if prev_token[1] == :open
172
+ emit(:set, :negate, '^')
180
173
  else
181
- emit(:literal, :literal, text, ts, te)
174
+ emit(:literal, :literal, '^')
182
175
  end
183
176
  };
184
177
 
185
178
  '-' {
186
- text = text(data, ts, te).first
187
- # ranges cant start with a subset or intersection/negation/range operator
188
- if tokens.last[0] == :set
189
- emit(:literal, :literal, text, ts, te)
179
+ # ranges cant start with the opening bracket, a subset, or
180
+ # intersection/negation/range operators
181
+ if prev_token[0] == :set
182
+ emit(:literal, :literal, '-')
190
183
  else
191
- emit(:set, :range, text, ts, te)
184
+ emit(:set, :range, '-')
192
185
  end
193
186
  };
194
187
 
195
188
  # Unlike ranges, intersections can start or end at set boundaries, whereupon
196
189
  # they match nothing: r = /[a&&]/; [r =~ ?a, r =~ ?&] # => [nil, nil]
197
190
  '&&' {
198
- emit(:set, :intersection, *text(data, ts, te))
191
+ emit(:set, :intersection, '&&')
199
192
  };
200
193
 
201
194
  backslash {
@@ -203,59 +196,60 @@
203
196
  };
204
197
 
205
198
  set_open >(open_bracket, 1) >set_opened {
206
- emit(:set, :open, *text(data, ts, te))
199
+ emit(:set, :open, '[')
207
200
  fcall character_set;
208
201
  };
209
202
 
210
- class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error) {
211
- text = text(data, ts, te).first
203
+ class_posix >(open_bracket, 1) @set_closed @eof(premature_end_error) {
204
+ text = copy(data, ts, te)
212
205
 
213
206
  type = :posixclass
214
207
  class_name = text[2..-3]
215
- if class_name[0].chr == '^'
208
+ if class_name[0] == '^'
216
209
  class_name = class_name[1..-1]
217
210
  type = :nonposixclass
218
211
  end
219
212
 
220
- emit(type, class_name.to_sym, text, ts, te)
221
- };
222
-
223
- collating_sequence >(open_bracket, 1) @set_closed @eof(premature_end_error) {
224
- emit(:set, :collation, *text(data, ts, te))
225
- };
213
+ unless self.class.posix_classes.include?(class_name)
214
+ raise ValidationError.for(:posix_class, text)
215
+ end
226
216
 
227
- character_equivalent >(open_bracket, 1) @set_closed @eof(premature_end_error) {
228
- emit(:set, :equivalent, *text(data, ts, te))
217
+ emit(type, class_name.to_sym, text)
229
218
  };
230
219
 
231
220
  meta_char > (set_meta, 1) {
232
- emit(:literal, :literal, *text(data, ts, te))
221
+ emit(:literal, :literal, copy(data, ts, te))
233
222
  };
234
223
 
235
- any |
236
- ascii_nonprint |
237
- utf8_2_byte |
238
- utf8_3_byte |
239
- utf8_4_byte {
240
- char, *rest = *text(data, ts, te)
241
- char.force_encoding('utf-8') if char.respond_to?(:force_encoding)
242
- emit(:literal, :literal, char, *rest)
224
+ any | ascii_nonprint | utf8_multibyte {
225
+ text = copy(data, ts, te)
226
+ emit(:literal, :literal, text)
243
227
  };
244
228
  *|;
245
229
 
246
230
  # set escapes scanner
247
231
  # --------------------------------------------------------------------------
248
232
  set_escape_sequence := |*
249
- non_set_escape > (escaped_set_alpha, 2) {
250
- emit(:escape, :literal, *text(data, ts, te, 1))
233
+ # Special case: in sets, octal sequences have higher priority than backrefs
234
+ octal_sequence {
235
+ emit(:escape, :octal, copy(data, ts-1, te))
251
236
  fret;
252
237
  };
253
238
 
254
- any > (escaped_set_alpha, 1) {
239
+ # Scan all other escapes that work in sets with the generic escape scanner
240
+ set_escape > (escaped_set_alpha, 2) {
255
241
  fhold;
256
242
  fnext character_set;
257
243
  fcall escape_sequence;
258
244
  };
245
+
246
+ # Treat all remaining escapes - those not supported in sets - as literal.
247
+ # (This currently includes \^, \-, \&, \:, although these could potentially
248
+ # be meta chars when not escaped, depending on their position in the set.)
249
+ any > (escaped_set_alpha, 1) {
250
+ emit(:escape, :literal, copy(data, ts-1, te))
251
+ fret;
252
+ };
259
253
  *|;
260
254
 
261
255
 
@@ -263,33 +257,33 @@
263
257
  # --------------------------------------------------------------------------
264
258
  escape_sequence := |*
265
259
  [1-9] {
266
- text = text(data, ts, te, 1).first
267
- emit(:backref, :number, text, ts-1, te)
260
+ text = copy(data, ts-1, te)
261
+ emit(:backref, :number, text)
268
262
  fret;
269
263
  };
270
264
 
271
265
  octal_sequence {
272
- emit(:escape, :octal, *text(data, ts, te, 1))
266
+ emit(:escape, :octal, copy(data, ts-1, te))
273
267
  fret;
274
268
  };
275
269
 
276
270
  meta_char {
277
- case text = text(data, ts, te, 1).first
278
- when '\.'; emit(:escape, :dot, text, ts-1, te)
279
- when '\|'; emit(:escape, :alternation, text, ts-1, te)
280
- when '\^'; emit(:escape, :bol, text, ts-1, te)
281
- when '\$'; emit(:escape, :eol, text, ts-1, te)
282
- when '\?'; emit(:escape, :zero_or_one, text, ts-1, te)
283
- when '\*'; emit(:escape, :zero_or_more, text, ts-1, te)
284
- when '\+'; emit(:escape, :one_or_more, text, ts-1, te)
285
- when '\('; emit(:escape, :group_open, text, ts-1, te)
286
- when '\)'; emit(:escape, :group_close, text, ts-1, te)
287
- when '\{'; emit(:escape, :interval_open, text, ts-1, te)
288
- when '\}'; emit(:escape, :interval_close, text, ts-1, te)
289
- when '\['; emit(:escape, :set_open, text, ts-1, te)
290
- when '\]'; emit(:escape, :set_close, text, ts-1, te)
271
+ case text = copy(data, ts-1, te)
272
+ when '\.'; emit(:escape, :dot, text)
273
+ when '\|'; emit(:escape, :alternation, text)
274
+ when '\^'; emit(:escape, :bol, text)
275
+ when '\$'; emit(:escape, :eol, text)
276
+ when '\?'; emit(:escape, :zero_or_one, text)
277
+ when '\*'; emit(:escape, :zero_or_more, text)
278
+ when '\+'; emit(:escape, :one_or_more, text)
279
+ when '\('; emit(:escape, :group_open, text)
280
+ when '\)'; emit(:escape, :group_close, text)
281
+ when '\{'; emit(:escape, :interval_open, text)
282
+ when '\}'; emit(:escape, :interval_close, text)
283
+ when '\['; emit(:escape, :set_open, text)
284
+ when '\]'; emit(:escape, :set_close, text)
291
285
  when "\\\\";
292
- emit(:escape, :backslash, text, ts-1, te)
286
+ emit(:escape, :backslash, text)
293
287
  end
294
288
  fret;
295
289
  };
@@ -297,31 +291,31 @@
297
291
  escaped_ascii > (escaped_alpha, 7) {
298
292
  # \b is emitted as backspace only when inside a character set, otherwise
299
293
  # it is a word boundary anchor. A syntax might "normalize" it if needed.
300
- case text = text(data, ts, te, 1).first
301
- when '\a'; emit(:escape, :bell, text, ts-1, te)
302
- when '\b'; emit(:escape, :backspace, text, ts-1, te)
303
- when '\e'; emit(:escape, :escape, text, ts-1, te)
304
- when '\f'; emit(:escape, :form_feed, text, ts-1, te)
305
- when '\n'; emit(:escape, :newline, text, ts-1, te)
306
- when '\r'; emit(:escape, :carriage, text, ts-1, te)
307
- when '\t'; emit(:escape, :tab, text, ts-1, te)
308
- when '\v'; emit(:escape, :vertical_tab, text, ts-1, te)
294
+ case text = copy(data, ts-1, te)
295
+ when '\a'; emit(:escape, :bell, text)
296
+ when '\b'; emit(:escape, :backspace, text)
297
+ when '\e'; emit(:escape, :escape, text)
298
+ when '\f'; emit(:escape, :form_feed, text)
299
+ when '\n'; emit(:escape, :newline, text)
300
+ when '\r'; emit(:escape, :carriage, text)
301
+ when '\t'; emit(:escape, :tab, text)
302
+ when '\v'; emit(:escape, :vertical_tab, text)
309
303
  end
310
304
  fret;
311
305
  };
312
306
 
313
307
  codepoint_sequence > (escaped_alpha, 6) $eof(premature_end_error) {
314
- text = text(data, ts, te, 1).first
315
- if text[2].chr == '{'
316
- emit(:escape, :codepoint_list, text, ts-1, te)
308
+ text = copy(data, ts-1, te)
309
+ if text[2] == '{'
310
+ emit(:escape, :codepoint_list, text)
317
311
  else
318
- emit(:escape, :codepoint, text, ts-1, te)
312
+ emit(:escape, :codepoint, text)
319
313
  end
320
314
  fret;
321
315
  };
322
316
 
323
- hex_sequence > (escaped_alpha, 5) $eof(premature_end_error) {
324
- emit(:escape, :hex, *text(data, ts, te, 1))
317
+ hex_sequence > (escaped_alpha, 5) @eof(premature_end_error) {
318
+ emit(:escape, :hex, copy(data, ts-1, te))
325
319
  fret;
326
320
  };
327
321
 
@@ -351,8 +345,8 @@
351
345
  fcall unicode_property;
352
346
  };
353
347
 
354
- (any -- non_literal_escape) > (escaped_alpha, 1) {
355
- emit(:escape, :literal, *text(data, ts, te, 1))
348
+ (any -- non_literal_escape) | utf8_multibyte > (escaped_alpha, 1) {
349
+ emit(:escape, :literal, copy(data, ts-1, te))
356
350
  fret;
357
351
  };
358
352
  *|;
@@ -362,9 +356,9 @@
362
356
  # --------------------------------------------------------------------------
363
357
  conditional_expression := |*
364
358
  group_lookup . ')' {
365
- text = text(data, ts, te-1).first
366
- emit(:conditional, :condition, text, ts, te-1)
367
- emit(:conditional, :condition_close, ')', te-1, te)
359
+ text = copy(data, ts, te-1)
360
+ emit(:conditional, :condition, text)
361
+ emit(:conditional, :condition_close, ')')
368
362
  };
369
363
 
370
364
  any {
@@ -381,46 +375,50 @@
381
375
  # Meta characters
382
376
  # ------------------------------------------------------------------------
383
377
  dot {
384
- emit(:meta, :dot, *text(data, ts, te))
378
+ emit(:meta, :dot, copy(data, ts, te))
385
379
  };
386
380
 
387
381
  alternation {
388
382
  if conditional_stack.last == group_depth
389
- emit(:conditional, :separator, *text(data, ts, te))
383
+ emit(:conditional, :separator, copy(data, ts, te))
390
384
  else
391
- emit(:meta, :alternation, *text(data, ts, te))
385
+ emit(:meta, :alternation, copy(data, ts, te))
392
386
  end
393
387
  };
394
388
 
395
389
  # Anchors
396
390
  # ------------------------------------------------------------------------
397
391
  beginning_of_line {
398
- emit(:anchor, :bol, *text(data, ts, te))
392
+ emit(:anchor, :bol, copy(data, ts, te))
399
393
  };
400
394
 
401
395
  end_of_line {
402
- emit(:anchor, :eol, *text(data, ts, te))
396
+ emit(:anchor, :eol, copy(data, ts, te))
403
397
  };
404
398
 
405
399
  backslash . keep_mark > (backslashed, 4) {
406
- emit(:keep, :mark, *text(data, ts, te))
400
+ emit(:keep, :mark, copy(data, ts, te))
407
401
  };
408
402
 
409
403
  backslash . anchor_char > (backslashed, 3) {
410
- case text = text(data, ts, te).first
411
- when '\\A'; emit(:anchor, :bos, text, ts, te)
412
- when '\\z'; emit(:anchor, :eos, text, ts, te)
413
- when '\\Z'; emit(:anchor, :eos_ob_eol, text, ts, te)
414
- when '\\b'; emit(:anchor, :word_boundary, text, ts, te)
415
- when '\\B'; emit(:anchor, :nonword_boundary, text, ts, te)
416
- when '\\G'; emit(:anchor, :match_start, text, ts, te)
404
+ case text = copy(data, ts, te)
405
+ when '\A'; emit(:anchor, :bos, text)
406
+ when '\z'; emit(:anchor, :eos, text)
407
+ when '\Z'; emit(:anchor, :eos_ob_eol, text)
408
+ when '\b'; emit(:anchor, :word_boundary, text)
409
+ when '\B'; emit(:anchor, :nonword_boundary, text)
410
+ when '\G'; emit(:anchor, :match_start, text)
417
411
  end
418
412
  };
419
413
 
414
+ literal_delimiters {
415
+ append_literal(data, ts, te)
416
+ };
417
+
420
418
  # Character sets
421
419
  # ------------------------------------------------------------------------
422
420
  set_open >set_opened {
423
- emit(:set, :open, *text(data, ts, te))
421
+ emit(:set, :open, copy(data, ts, te))
424
422
  fcall character_set;
425
423
  };
426
424
 
@@ -429,23 +427,22 @@
429
427
  # (?(condition)Y|N) conditional expression
430
428
  # ------------------------------------------------------------------------
431
429
  conditional {
432
- text = text(data, ts, te).first
430
+ text = copy(data, ts, te)
433
431
 
434
432
  conditional_stack << group_depth
435
433
 
436
- emit(:conditional, :open, text[0..-2], ts, te-1)
437
- emit(:conditional, :condition_open, '(', te-1, te)
434
+ emit(:conditional, :open, text[0..-2])
435
+ emit(:conditional, :condition_open, '(')
438
436
  fcall conditional_expression;
439
437
  };
440
438
 
441
439
 
442
440
  # (?#...) comments: parsed as a single expression, without introducing a
443
441
  # new nesting level. Comments may not include parentheses, escaped or not.
444
- # special case for close, action performed on all transitions to get the
445
- # correct closing count.
442
+ # special case for close to get the correct closing count.
446
443
  # ------------------------------------------------------------------------
447
- group_open . group_comment $group_closed {
448
- emit(:group, :comment, *text(data, ts, te))
444
+ (group_open . group_comment) @group_closed {
445
+ emit(:group, :comment, copy(data, ts, te))
449
446
  };
450
447
 
451
448
  # Expression options:
@@ -459,12 +456,12 @@
459
456
  #
460
457
  # (?imxdau-imx:subexp) option on/off for subexp
461
458
  # ------------------------------------------------------------------------
462
- group_open . group_options >group_opened {
463
- text = text(data, ts, te).first
459
+ (group_open . group_options) >group_opened {
460
+ text = copy(data, ts, te)
464
461
  if text[2..-1] =~ /([^\-mixdau:]|^$)|-.*([dau])/
465
- raise InvalidGroupOption.new($1 || "-#{$2}", text)
462
+ raise ValidationError.for(:group_option, $1 || "-#{$2}", text)
466
463
  end
467
- emit_options(text, ts, te)
464
+ emit_options(text)
468
465
  };
469
466
 
470
467
  # Assertions
@@ -473,12 +470,12 @@
473
470
  # (?<=subexp) look-behind
474
471
  # (?<!subexp) negative look-behind
475
472
  # ------------------------------------------------------------------------
476
- group_open . assertion_type >group_opened {
477
- case text = text(data, ts, te).first
478
- when '(?='; emit(:assertion, :lookahead, text, ts, te)
479
- when '(?!'; emit(:assertion, :nlookahead, text, ts, te)
480
- when '(?<='; emit(:assertion, :lookbehind, text, ts, te)
481
- when '(?<!'; emit(:assertion, :nlookbehind, text, ts, te)
473
+ (group_open . assertion_type) >group_opened {
474
+ case text = copy(data, ts, te)
475
+ when '(?='; emit(:assertion, :lookahead, text)
476
+ when '(?!'; emit(:assertion, :nlookahead, text)
477
+ when '(?<='; emit(:assertion, :lookbehind, text)
478
+ when '(?<!'; emit(:assertion, :nlookbehind, text)
482
479
  end
483
480
  };
484
481
 
@@ -490,106 +487,78 @@
490
487
  # (?'name'subexp) named group (single quoted version)
491
488
  # (subexp) captured group
492
489
  # ------------------------------------------------------------------------
493
- group_open . group_type >group_opened {
494
- case text = text(data, ts, te).first
495
- when '(?:'; emit(:group, :passive, text, ts, te)
496
- when '(?>'; emit(:group, :atomic, text, ts, te)
497
- when '(?~'; emit(:group, :absence, text, ts, te)
490
+ (group_open . group_type) >group_opened {
491
+ case text = copy(data, ts, te)
492
+ when '(?:'; emit(:group, :passive, text)
493
+ when '(?>'; emit(:group, :atomic, text)
494
+ when '(?~'; emit(:group, :absence, text)
498
495
 
499
496
  when /^\(\?(?:<>|'')/
500
- validation_error(:group, 'named group', 'name is empty')
497
+ raise ValidationError.for(:group, 'named group', 'name is empty')
501
498
 
502
- when /^\(\?<\w*>/
503
- emit(:group, :named_ab, text, ts, te)
499
+ when /^\(\?<[^>]+>/
500
+ emit(:group, :named_ab, text)
504
501
 
505
- when /^\(\?'\w*'/
506
- emit(:group, :named_sq, text, ts, te)
502
+ when /^\(\?'[^']+'/
503
+ emit(:group, :named_sq, text)
507
504
 
508
505
  end
509
506
  };
510
507
 
511
508
  group_open @group_opened {
512
- text = text(data, ts, te).first
513
- emit(:group, :capture, text, ts, te)
509
+ text = copy(data, ts, te)
510
+ emit(:group, :capture, text)
514
511
  };
515
512
 
516
513
  group_close @group_closed {
517
514
  if conditional_stack.last == group_depth + 1
518
515
  conditional_stack.pop
519
- emit(:conditional, :close, *text(data, ts, te))
520
- else
516
+ emit(:conditional, :close, ')')
517
+ elsif group_depth >= 0
521
518
  if spacing_stack.length > 1 &&
522
519
  spacing_stack.last[:depth] == group_depth + 1
523
520
  spacing_stack.pop
524
521
  self.free_spacing = spacing_stack.last[:free_spacing]
525
522
  end
526
523
 
527
- emit(:group, :close, *text(data, ts, te))
524
+ emit(:group, :close, ')')
525
+ else
526
+ raise ValidationError.for(:group, 'group', 'unmatched close parenthesis')
528
527
  end
529
528
  };
530
529
 
531
530
 
532
531
  # Group backreference, named and numbered
533
532
  # ------------------------------------------------------------------------
534
- backslash . (group_name_ref | group_number_ref) > (backslashed, 4) {
535
- case text = text(data, ts, te).first
536
- when /^\\([gk])(<>|'')/ # angle brackets
537
- validation_error(:backref, 'ref/call', 'ref ID is empty')
538
-
539
- when /^\\([gk])<[^\d+-]\w*>/ # angle-brackets
540
- if $1 == 'k'
541
- emit(:backref, :name_ref_ab, text, ts, te)
542
- else
543
- emit(:backref, :name_call_ab, text, ts, te)
544
- end
545
-
546
- when /^\\([gk])'[^\d+-]\w*'/ #single quotes
547
- if $1 == 'k'
548
- emit(:backref, :name_ref_sq, text, ts, te)
549
- else
550
- emit(:backref, :name_call_sq, text, ts, te)
551
- end
552
-
553
- when /^\\([gk])<\d+>/ # angle-brackets
554
- if $1 == 'k'
555
- emit(:backref, :number_ref_ab, text, ts, te)
556
- else
557
- emit(:backref, :number_call_ab, text, ts, te)
558
- end
559
-
560
- when /^\\([gk])'\d+'/ # single quotes
561
- if $1 == 'k'
562
- emit(:backref, :number_ref_sq, text, ts, te)
563
- else
564
- emit(:backref, :number_call_sq, text, ts, te)
565
- end
566
-
567
- when /^\\(?:g<\+|g<-|(k)<-)\d+>/ # angle-brackets
568
- if $1 == 'k'
569
- emit(:backref, :number_rel_ref_ab, text, ts, te)
570
- else
571
- emit(:backref, :number_rel_call_ab, text, ts, te)
572
- end
573
-
574
- when /^\\(?:g'\+|g'-|(k)'-)\d+'/ # single quotes
575
- if $1 == 'k'
576
- emit(:backref, :number_rel_ref_sq, text, ts, te)
577
- else
578
- emit(:backref, :number_rel_call_sq, text, ts, te)
579
- end
580
-
581
- when /^\\k<[^\d+\-]\w*[+\-]\d+>/ # angle-brackets
582
- emit(:backref, :name_recursion_ref_ab, text, ts, te)
583
-
584
- when /^\\k'[^\d+\-]\w*[+\-]\d+'/ # single-quotes
585
- emit(:backref, :name_recursion_ref_sq, text, ts, te)
586
-
587
- when /^\\([gk])<[+\-]?\d+[+\-]\d+>/ # angle-brackets
588
- emit(:backref, :number_recursion_ref_ab, text, ts, te)
589
-
590
- when /^\\([gk])'[+\-]?\d+[+\-]\d+'/ # single-quotes
591
- emit(:backref, :number_recursion_ref_sq, text, ts, te)
533
+ backslash . (group_ref) > (backslashed, 4) {
534
+ case text = copy(data, ts, te)
535
+ when /^\\k(.)[^0-9\-][^+\-]*['>]$/
536
+ emit(:backref, $1 == '<' ? :name_ref_ab : :name_ref_sq, text)
537
+ when /^\\k(.)[1-9]\d*['>]$/
538
+ emit(:backref, $1 == '<' ? :number_ref_ab : :number_ref_sq, text)
539
+ when /^\\k(.)-[1-9]\d*['>]$/
540
+ emit(:backref, $1 == '<' ? :number_rel_ref_ab : :number_rel_ref_sq, text)
541
+ when /^\\k(.)[^0-9\-].*[+\-]\d+['>]$/
542
+ emit(:backref, $1 == '<' ? :name_recursion_ref_ab : :name_recursion_ref_sq, text)
543
+ when /^\\k(.)-?[1-9]\d*[+\-]\d+['>]$/
544
+ emit(:backref, $1 == '<' ? :number_recursion_ref_ab : :number_recursion_ref_sq, text)
545
+ else
546
+ raise ValidationError.for(:backref, 'backreference', 'invalid ref ID')
547
+ end
548
+ };
592
549
 
550
+ # Group call, named and numbered
551
+ # ------------------------------------------------------------------------
552
+ backslash . (group_call) > (backslashed, 4) {
553
+ case text = copy(data, ts, te)
554
+ when /^\\g(.)[^0-9+\-].*['>]$/
555
+ emit(:backref, $1 == '<' ? :name_call_ab : :name_call_sq, text)
556
+ when /^\\g(.)\d+['>]$/
557
+ emit(:backref, $1 == '<' ? :number_call_ab : :number_call_sq, text)
558
+ when /^\\g(.)[+-]\d+/
559
+ emit(:backref, $1 == '<' ? :number_rel_call_ab : :number_rel_call_sq, text)
560
+ else
561
+ raise ValidationError.for(:backref, 'subexpression call', 'invalid ref ID')
593
562
  end
594
563
  };
595
564
 
@@ -597,31 +566,36 @@
597
566
  # Quantifiers
598
567
  # ------------------------------------------------------------------------
599
568
  zero_or_one {
600
- case text = text(data, ts, te).first
601
- when '?' ; emit(:quantifier, :zero_or_one, text, ts, te)
602
- when '??'; emit(:quantifier, :zero_or_one_reluctant, text, ts, te)
603
- when '?+'; emit(:quantifier, :zero_or_one_possessive, text, ts, te)
569
+ case text = copy(data, ts, te)
570
+ when '?' ; emit(:quantifier, :zero_or_one, text)
571
+ when '??'; emit(:quantifier, :zero_or_one_reluctant, text)
572
+ when '?+'; emit(:quantifier, :zero_or_one_possessive, text)
604
573
  end
605
574
  };
606
575
 
607
576
  zero_or_more {
608
- case text = text(data, ts, te).first
609
- when '*' ; emit(:quantifier, :zero_or_more, text, ts, te)
610
- when '*?'; emit(:quantifier, :zero_or_more_reluctant, text, ts, te)
611
- when '*+'; emit(:quantifier, :zero_or_more_possessive, text, ts, te)
577
+ case text = copy(data, ts, te)
578
+ when '*' ; emit(:quantifier, :zero_or_more, text)
579
+ when '*?'; emit(:quantifier, :zero_or_more_reluctant, text)
580
+ when '*+'; emit(:quantifier, :zero_or_more_possessive, text)
612
581
  end
613
582
  };
614
583
 
615
584
  one_or_more {
616
- case text = text(data, ts, te).first
617
- when '+' ; emit(:quantifier, :one_or_more, text, ts, te)
618
- when '+?'; emit(:quantifier, :one_or_more_reluctant, text, ts, te)
619
- when '++'; emit(:quantifier, :one_or_more_possessive, text, ts, te)
585
+ case text = copy(data, ts, te)
586
+ when '+' ; emit(:quantifier, :one_or_more, text)
587
+ when '+?'; emit(:quantifier, :one_or_more_reluctant, text)
588
+ when '++'; emit(:quantifier, :one_or_more_possessive, text)
620
589
  end
621
590
  };
622
591
 
623
- quantifier_interval @err(premature_end_error) {
624
- emit(:quantifier, :interval, *text(data, ts, te))
592
+ quantifier_interval {
593
+ emit(:quantifier, :interval, copy(data, ts, te))
594
+ };
595
+
596
+ # Catch unmatched curly braces as literals
597
+ range_open {
598
+ append_literal(data, ts, te)
625
599
  };
626
600
 
627
601
  # Escaped sequences
@@ -632,15 +606,17 @@
632
606
 
633
607
  comment {
634
608
  if free_spacing
635
- emit(:free_space, :comment, *text(data, ts, te))
609
+ emit(:free_space, :comment, copy(data, ts, te))
636
610
  else
637
- append_literal(data, ts, te)
611
+ # consume only the pound sign (#) and backtrack to do regular scanning
612
+ append_literal(data, ts, ts + 1)
613
+ fexec ts + 1;
638
614
  end
639
615
  };
640
616
 
641
617
  space+ {
642
618
  if free_spacing
643
- emit(:free_space, :whitespace, *text(data, ts, te))
619
+ emit(:free_space, :whitespace, copy(data, ts, te))
644
620
  else
645
621
  append_literal(data, ts, te)
646
622
  end
@@ -649,105 +625,47 @@
649
625
  # Literal: any run of ASCII (printable or non-printable), and/or UTF-8,
650
626
  # except meta characters.
651
627
  # ------------------------------------------------------------------------
652
- (ascii_print -- space)+ |
653
- ascii_nonprint+ |
654
- utf8_2_byte+ |
655
- utf8_3_byte+ |
656
- utf8_4_byte+ {
628
+ (ascii_print -- space)+ | ascii_nonprint+ | utf8_multibyte+ {
657
629
  append_literal(data, ts, te)
658
630
  };
659
631
 
660
632
  *|;
661
633
  }%%
662
634
 
663
- # THIS IS A GENERATED FILE, DO NOT EDIT DIRECTLY
664
- # This file was generated from lib/regexp_parser/scanner/scanner.rl
635
+ require 'regexp_parser/scanner/errors/scanner_error'
636
+ require 'regexp_parser/scanner/errors/premature_end_error'
637
+ require 'regexp_parser/scanner/errors/validation_error'
665
638
 
666
639
  class Regexp::Scanner
667
- # General scanner error (catch all)
668
- class ScannerError < StandardError; end
669
-
670
- # Base for all scanner validation errors
671
- class ValidationError < StandardError
672
- def initialize(reason)
673
- super reason
674
- end
675
- end
676
-
677
- # Unexpected end of pattern
678
- class PrematureEndError < ScannerError
679
- def initialize(where = '')
680
- super "Premature end of pattern at #{where}"
681
- end
682
- end
683
-
684
- # Invalid sequence format. Used for escape sequences, mainly.
685
- class InvalidSequenceError < ValidationError
686
- def initialize(what = 'sequence', where = '')
687
- super "Invalid #{what} at #{where}"
688
- end
689
- end
690
-
691
- # Invalid group. Used for named groups.
692
- class InvalidGroupError < ValidationError
693
- def initialize(what, reason)
694
- super "Invalid #{what}, #{reason}."
695
- end
696
- end
697
-
698
- # Invalid groupOption. Used for inline options.
699
- class InvalidGroupOption < ValidationError
700
- def initialize(option, text)
701
- super "Invalid group option #{option} in #{text}"
702
- end
703
- end
704
-
705
- # Invalid back reference. Used for name a number refs/calls.
706
- class InvalidBackrefError < ValidationError
707
- def initialize(what, reason)
708
- super "Invalid back reference #{what}, #{reason}"
709
- end
710
- end
711
-
712
- # The property name was not recognized by the scanner.
713
- class UnknownUnicodePropertyError < ValidationError
714
- def initialize(name)
715
- super "Unknown unicode character property name #{name}"
716
- end
717
- end
718
-
719
640
  # Scans the given regular expression text, or Regexp object and collects the
720
641
  # emitted tokens into an array that gets returned at the end. If a block is
721
642
  # given, it gets called for each emitted token.
722
643
  #
723
644
  # This method may raise errors if a syntax error is encountered.
724
645
  # --------------------------------------------------------------------------
725
- def self.scan(input_object, &block)
726
- new.scan(input_object, &block)
646
+ def self.scan(input_object, options: nil, collect_tokens: true, &block)
647
+ new.scan(input_object, options: options, collect_tokens: collect_tokens, &block)
727
648
  end
728
649
 
729
- def scan(input_object, &block)
730
- self.literal = nil
650
+ def scan(input_object, options: nil, collect_tokens: true, &block)
651
+ self.collect_tokens = collect_tokens
652
+ self.literal_run = nil
731
653
  stack = []
732
654
 
733
- if input_object.is_a?(Regexp)
734
- input = input_object.source
735
- self.free_spacing = (input_object.options & Regexp::EXTENDED != 0)
736
- else
737
- input = input_object
738
- self.free_spacing = false
739
- end
655
+ input = input_object.is_a?(Regexp) ? input_object.source : input_object
656
+ self.free_spacing = free_spacing?(input_object, options)
740
657
  self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
741
658
 
742
- data = input.unpack("c*") if input.is_a?(String)
659
+ data = input.unpack("c*")
743
660
  eof = data.length
744
661
 
745
662
  self.tokens = []
746
- self.block = block_given? ? block : nil
663
+ self.block = block
747
664
 
748
665
  self.set_depth = 0
749
666
  self.group_depth = 0
750
667
  self.conditional_stack = []
668
+ self.char_pos = 0
751
669
 
752
670
  %% write data;
753
671
  %% write init;
@@ -757,7 +675,7 @@ class Regexp::Scanner
757
675
  testEof = testEof
758
676
 
759
677
  if cs == re_scanner_error
760
- text = ts ? copy(data, ts-1..-1) : data.pack('c*')
678
+ text = copy(data, ts ? ts-1 : 0, -1)
761
679
  raise ScannerError.new("Scan error at '#{text}'")
762
680
  end
763
681
 
@@ -767,40 +685,76 @@ class Regexp::Scanner
767
685
  "[#{set_depth}]") if in_set?
768
686
 
769
687
  # when the entire expression is a literal run
770
- emit_literal if literal
688
+ emit_literal if literal_run
771
689
 
772
690
  tokens
773
691
  end
774
692
 
775
693
  # lazy-load property maps when first needed
776
- require 'yaml'
777
- PROP_MAPS_DIR = File.expand_path('../scanner/properties', __FILE__)
778
-
779
694
  def self.short_prop_map
780
- @short_prop_map ||= YAML.load_file("#{PROP_MAPS_DIR}/short.yml")
695
+ @short_prop_map ||= parse_prop_map('short')
781
696
  end
782
697
 
783
698
  def self.long_prop_map
784
- @long_prop_map ||= YAML.load_file("#{PROP_MAPS_DIR}/long.yml")
699
+ @long_prop_map ||= parse_prop_map('long')
700
+ end
701
+
702
+ def self.parse_prop_map(name)
703
+ File.read("#{__dir__}/scanner/properties/#{name}.csv").scan(/(.+),(.+)/).to_h
704
+ end
705
+
706
+ def self.posix_classes
707
+ %w[alnum alpha ascii blank cntrl digit graph
708
+ lower print punct space upper word xdigit]
785
709
  end
786
710
 
787
711
  # Emits an array with the details of the scanned pattern
788
- def emit(type, token, text, ts, te)
712
+ def emit(type, token, text)
789
713
  #puts "EMIT: type: #{type}, token: #{token}, text: #{text}, ts: #{ts}, te: #{te}"
790
714
 
791
- emit_literal if literal
715
+ emit_literal if literal_run
716
+
717
+ # Ragel runs with byte-based indices (ts, te). These are of little value to
718
+ # end-users, so we keep track of char-based indices and emit those instead.
719
+ ts_char_pos = char_pos
720
+ te_char_pos = char_pos + text.length
721
+
722
+ tok = [type, token, text, ts_char_pos, te_char_pos]
723
+
724
+ self.prev_token = tok
725
+
726
+ self.char_pos = te_char_pos
792
727
 
793
728
  if block
794
- block.call type, token, text, ts, te
729
+ block.call type, token, text, ts_char_pos, te_char_pos
730
+ # TODO: in v3.0.0, remove `collect_tokens:` kwarg and only collect if no block given
731
+ tokens << tok if collect_tokens
732
+ elsif collect_tokens
733
+ tokens << tok
795
734
  end
796
-
797
- tokens << [type, token, text, ts, te]
798
735
  end
799
736
 
737
+ attr_accessor :literal_run # only public for #||= to work on ruby <= 2.5
738
+
800
739
  private
801
740
 
802
- attr_accessor :tokens, :literal, :block, :free_spacing, :spacing_stack,
803
- :group_depth, :set_depth, :conditional_stack
741
+ attr_accessor :block,
742
+ :collect_tokens, :tokens, :prev_token,
743
+ :free_spacing, :spacing_stack,
744
+ :group_depth, :set_depth, :conditional_stack,
745
+ :char_pos
746
+
747
+ def free_spacing?(input_object, options)
748
+ if options && !input_object.is_a?(String)
749
+ raise ArgumentError, 'options cannot be supplied unless scanning a String'
750
+ end
751
+
752
+ options = input_object.options if input_object.is_a?(::Regexp)
753
+
754
+ return false unless options
755
+
756
+ options & Regexp::EXTENDED != 0
757
+ end
804
758
 
805
759
  def in_group?
806
760
  group_depth > 0
@@ -811,36 +765,24 @@ class Regexp::Scanner
811
765
  end
812
766
 
813
767
  # Copy from ts to te from data as text
814
- def copy(data, range)
815
- data[range].pack('c*')
816
- end
817
-
818
- # Copy from ts to te from data as text, returning an array with the text
819
- # and the offsets used to copy it.
820
- def text(data, ts, te, soff = 0)
821
- [copy(data, ts-soff..te-1), ts-soff, te]
768
+ def copy(data, ts, te)
769
+ data[ts...te].pack('c*').force_encoding('utf-8')
822
770
  end
823
771
 
824
772
  # Appends one or more characters to the literal buffer, to be emitted later
825
- # by a call to emit_literal. Contents can be a mix of ASCII and UTF-8.
773
+ # by a call to emit_literal.
826
774
  def append_literal(data, ts, te)
827
- self.literal = literal || []
828
- literal << text(data, ts, te)
775
+ (self.literal_run ||= []) << copy(data, ts, te)
829
776
  end
830
777
 
831
- # Emits the literal run collected by calls to the append_literal method,
832
- # using the total start (ts) and end (te) offsets of the run.
778
+ # Emits the literal run collected by calls to the append_literal method.
833
779
  def emit_literal
834
- ts, te = literal.first[1], literal.last[2]
835
- text = literal.map {|t| t[0]}.join
836
-
837
- text.force_encoding('utf-8') if text.respond_to?(:force_encoding)
838
-
839
- self.literal = nil
840
- emit(:literal, :literal, text, ts, te)
780
+ text = literal_run.join
781
+ self.literal_run = nil
782
+ emit(:literal, :literal, text)
841
783
  end
842
784
 
843
- def emit_options(text, ts, te)
785
+ def emit_options(text)
844
786
  token = nil
845
787
 
846
788
  # Ruby allows things like '(?-xxxx)' or '(?xx-xx--xx-:abc)'.
@@ -866,28 +808,13 @@ class Regexp::Scanner
866
808
  token = :options_switch
867
809
  end
868
810
 
869
- emit(:group, token, text, ts, te)
811
+ emit(:group, token, text)
870
812
  end
871
813
 
872
814
  def emit_meta_control_sequence(data, ts, te, token)
873
815
  if data.last < 0x00 || data.last > 0x7F
874
- validation_error(:sequence, 'escape', token.to_s)
875
- end
876
- emit(:escape, token, *text(data, ts, te, 1))
877
- end
878
-
879
- # Centralizes and unifies the handling of validation related
880
- # errors.
881
- def validation_error(type, what, reason)
882
- case type
883
- when :group
884
- error = InvalidGroupError.new(what, reason)
885
- when :backref
886
- error = InvalidBackrefError.new(what, reason)
887
- when :sequence
888
- error = InvalidSequenceError.new(what, reason)
816
+ raise ValidationError.for(:sequence, 'escape', token.to_s)
889
817
  end
890
-
891
- raise error # unless @@config.validation_ignore
818
+ emit(:escape, token, copy(data, ts-1, te))
892
819
  end
893
820
  end # class Regexp::Scanner