jruby-prism-parser 0.23.0.pre.SNAPSHOT-java
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CHANGELOG.md +401 -0
- data/CODE_OF_CONDUCT.md +76 -0
- data/CONTRIBUTING.md +62 -0
- data/LICENSE.md +7 -0
- data/Makefile +101 -0
- data/README.md +98 -0
- data/config.yml +2902 -0
- data/docs/build_system.md +91 -0
- data/docs/configuration.md +64 -0
- data/docs/cruby_compilation.md +27 -0
- data/docs/design.md +53 -0
- data/docs/encoding.md +121 -0
- data/docs/fuzzing.md +88 -0
- data/docs/heredocs.md +36 -0
- data/docs/javascript.md +118 -0
- data/docs/local_variable_depth.md +229 -0
- data/docs/mapping.md +117 -0
- data/docs/parser_translation.md +34 -0
- data/docs/parsing_rules.md +19 -0
- data/docs/releasing.md +98 -0
- data/docs/ripper.md +36 -0
- data/docs/ruby_api.md +43 -0
- data/docs/ruby_parser_translation.md +19 -0
- data/docs/serialization.md +209 -0
- data/docs/testing.md +55 -0
- data/ext/prism/api_node.c +5098 -0
- data/ext/prism/api_pack.c +267 -0
- data/ext/prism/extconf.rb +110 -0
- data/ext/prism/extension.c +1155 -0
- data/ext/prism/extension.h +18 -0
- data/include/prism/ast.h +5807 -0
- data/include/prism/defines.h +102 -0
- data/include/prism/diagnostic.h +339 -0
- data/include/prism/encoding.h +265 -0
- data/include/prism/node.h +57 -0
- data/include/prism/options.h +230 -0
- data/include/prism/pack.h +152 -0
- data/include/prism/parser.h +732 -0
- data/include/prism/prettyprint.h +26 -0
- data/include/prism/regexp.h +33 -0
- data/include/prism/util/pm_buffer.h +155 -0
- data/include/prism/util/pm_char.h +205 -0
- data/include/prism/util/pm_constant_pool.h +209 -0
- data/include/prism/util/pm_list.h +97 -0
- data/include/prism/util/pm_memchr.h +29 -0
- data/include/prism/util/pm_newline_list.h +93 -0
- data/include/prism/util/pm_state_stack.h +42 -0
- data/include/prism/util/pm_string.h +150 -0
- data/include/prism/util/pm_string_list.h +44 -0
- data/include/prism/util/pm_strncasecmp.h +32 -0
- data/include/prism/util/pm_strpbrk.h +46 -0
- data/include/prism/version.h +29 -0
- data/include/prism.h +289 -0
- data/jruby-prism.jar +0 -0
- data/lib/prism/compiler.rb +486 -0
- data/lib/prism/debug.rb +206 -0
- data/lib/prism/desugar_compiler.rb +207 -0
- data/lib/prism/dispatcher.rb +2150 -0
- data/lib/prism/dot_visitor.rb +4634 -0
- data/lib/prism/dsl.rb +785 -0
- data/lib/prism/ffi.rb +346 -0
- data/lib/prism/lex_compat.rb +908 -0
- data/lib/prism/mutation_compiler.rb +753 -0
- data/lib/prism/node.rb +17864 -0
- data/lib/prism/node_ext.rb +212 -0
- data/lib/prism/node_inspector.rb +68 -0
- data/lib/prism/pack.rb +224 -0
- data/lib/prism/parse_result/comments.rb +177 -0
- data/lib/prism/parse_result/newlines.rb +64 -0
- data/lib/prism/parse_result.rb +498 -0
- data/lib/prism/pattern.rb +250 -0
- data/lib/prism/serialize.rb +1354 -0
- data/lib/prism/translation/parser/compiler.rb +1838 -0
- data/lib/prism/translation/parser/lexer.rb +335 -0
- data/lib/prism/translation/parser/rubocop.rb +37 -0
- data/lib/prism/translation/parser.rb +178 -0
- data/lib/prism/translation/ripper.rb +577 -0
- data/lib/prism/translation/ruby_parser.rb +1521 -0
- data/lib/prism/translation.rb +11 -0
- data/lib/prism/version.rb +3 -0
- data/lib/prism/visitor.rb +495 -0
- data/lib/prism.rb +99 -0
- data/prism.gemspec +135 -0
- data/rbi/prism.rbi +7767 -0
- data/rbi/prism_static.rbi +207 -0
- data/sig/prism.rbs +4773 -0
- data/sig/prism_static.rbs +201 -0
- data/src/diagnostic.c +400 -0
- data/src/encoding.c +5132 -0
- data/src/node.c +2786 -0
- data/src/options.c +213 -0
- data/src/pack.c +493 -0
- data/src/prettyprint.c +8881 -0
- data/src/prism.c +18406 -0
- data/src/regexp.c +638 -0
- data/src/serialize.c +1554 -0
- data/src/token_type.c +700 -0
- data/src/util/pm_buffer.c +190 -0
- data/src/util/pm_char.c +318 -0
- data/src/util/pm_constant_pool.c +322 -0
- data/src/util/pm_list.c +49 -0
- data/src/util/pm_memchr.c +35 -0
- data/src/util/pm_newline_list.c +84 -0
- data/src/util/pm_state_stack.c +25 -0
- data/src/util/pm_string.c +203 -0
- data/src/util/pm_string_list.c +28 -0
- data/src/util/pm_strncasecmp.c +24 -0
- data/src/util/pm_strpbrk.c +180 -0
- metadata +156 -0
data/src/regexp.c
ADDED
@@ -0,0 +1,638 @@
|
|
1
|
+
#include "prism/regexp.h"
|
2
|
+
|
3
|
+
/**
|
4
|
+
* This is the parser that is going to handle parsing regular expressions.
|
5
|
+
*/
|
6
|
+
typedef struct {
|
7
|
+
/** A pointer to the start of the source that we are parsing. */
|
8
|
+
const uint8_t *start;
|
9
|
+
|
10
|
+
/** A pointer to the current position in the source. */
|
11
|
+
const uint8_t *cursor;
|
12
|
+
|
13
|
+
/** A pointer to the end of the source that we are parsing. */
|
14
|
+
const uint8_t *end;
|
15
|
+
|
16
|
+
/** A list of named captures that we've found. */
|
17
|
+
pm_string_list_t *named_captures;
|
18
|
+
|
19
|
+
/** Whether the encoding has changed from the default. */
|
20
|
+
bool encoding_changed;
|
21
|
+
|
22
|
+
/** The encoding of the source. */
|
23
|
+
const pm_encoding_t *encoding;
|
24
|
+
} pm_regexp_parser_t;
|
25
|
+
|
26
|
+
/**
|
27
|
+
* This initializes a new parser with the given source.
|
28
|
+
*/
|
29
|
+
static void
|
30
|
+
pm_regexp_parser_init(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end, pm_string_list_t *named_captures, bool encoding_changed, const pm_encoding_t *encoding) {
|
31
|
+
*parser = (pm_regexp_parser_t) {
|
32
|
+
.start = start,
|
33
|
+
.cursor = start,
|
34
|
+
.end = end,
|
35
|
+
.named_captures = named_captures,
|
36
|
+
.encoding_changed = encoding_changed,
|
37
|
+
.encoding = encoding
|
38
|
+
};
|
39
|
+
}
|
40
|
+
|
41
|
+
/**
|
42
|
+
* This appends a new string to the list of named captures.
|
43
|
+
*/
|
44
|
+
static void
|
45
|
+
pm_regexp_parser_named_capture(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end) {
|
46
|
+
pm_string_t string;
|
47
|
+
pm_string_shared_init(&string, start, end);
|
48
|
+
pm_string_list_append(parser->named_captures, &string);
|
49
|
+
pm_string_free(&string);
|
50
|
+
}
|
51
|
+
|
52
|
+
/**
|
53
|
+
* Returns true if the next character is the end of the source.
|
54
|
+
*/
|
55
|
+
static inline bool
|
56
|
+
pm_regexp_char_is_eof(pm_regexp_parser_t *parser) {
|
57
|
+
return parser->cursor >= parser->end;
|
58
|
+
}
|
59
|
+
|
60
|
+
/**
|
61
|
+
* Optionally accept a char and consume it if it exists.
|
62
|
+
*/
|
63
|
+
static inline bool
|
64
|
+
pm_regexp_char_accept(pm_regexp_parser_t *parser, uint8_t value) {
|
65
|
+
if (!pm_regexp_char_is_eof(parser) && *parser->cursor == value) {
|
66
|
+
parser->cursor++;
|
67
|
+
return true;
|
68
|
+
}
|
69
|
+
return false;
|
70
|
+
}
|
71
|
+
|
72
|
+
/**
|
73
|
+
* Expect a character to be present and consume it.
|
74
|
+
*/
|
75
|
+
static inline bool
|
76
|
+
pm_regexp_char_expect(pm_regexp_parser_t *parser, uint8_t value) {
|
77
|
+
if (!pm_regexp_char_is_eof(parser) && *parser->cursor == value) {
|
78
|
+
parser->cursor++;
|
79
|
+
return true;
|
80
|
+
}
|
81
|
+
return false;
|
82
|
+
}
|
83
|
+
|
84
|
+
/**
|
85
|
+
* This advances the current token to the next instance of the given character.
|
86
|
+
*/
|
87
|
+
static bool
|
88
|
+
pm_regexp_char_find(pm_regexp_parser_t *parser, uint8_t value) {
|
89
|
+
if (pm_regexp_char_is_eof(parser)) {
|
90
|
+
return false;
|
91
|
+
}
|
92
|
+
|
93
|
+
const uint8_t *end = (const uint8_t *) pm_memchr(parser->cursor, value, (size_t) (parser->end - parser->cursor), parser->encoding_changed, parser->encoding);
|
94
|
+
if (end == NULL) {
|
95
|
+
return false;
|
96
|
+
}
|
97
|
+
|
98
|
+
parser->cursor = end + 1;
|
99
|
+
return true;
|
100
|
+
}
|
101
|
+
|
102
|
+
/**
|
103
|
+
* Range quantifiers are a special class of quantifiers that look like
|
104
|
+
*
|
105
|
+
* * {digit}
|
106
|
+
* * {digit,}
|
107
|
+
* * {digit,digit}
|
108
|
+
* * {,digit}
|
109
|
+
*
|
110
|
+
* Unfortunately, if there are any spaces in between, then this just becomes a
|
111
|
+
* regular character match expression and we have to backtrack. So when this
|
112
|
+
* function first starts running, we'll create a "save" point and then attempt
|
113
|
+
* to parse the quantifier. If it fails, we'll restore the save point and
|
114
|
+
* return.
|
115
|
+
*
|
116
|
+
* The properly track everything, we're going to build a little state machine.
|
117
|
+
* It looks something like the following:
|
118
|
+
*
|
119
|
+
* +-------+ +---------+ ------------+
|
120
|
+
* ---- lbrace ---> | start | ---- digit ---> | minimum | |
|
121
|
+
* +-------+ +---------+ <--- digit -+
|
122
|
+
* | | |
|
123
|
+
* +-------+ | | rbrace
|
124
|
+
* | comma | <----- comma +---- comma -------+ |
|
125
|
+
* +-------+ V V
|
126
|
+
* | +---------+ +---------+
|
127
|
+
* +-- digit --> | maximum | -- rbrace --> || final ||
|
128
|
+
* +---------+ +---------+
|
129
|
+
* | ^
|
130
|
+
* +- digit -+
|
131
|
+
*
|
132
|
+
* Note that by the time we've hit this function, the lbrace has already been
|
133
|
+
* consumed so we're in the start state.
|
134
|
+
*/
|
135
|
+
static bool
|
136
|
+
pm_regexp_parse_range_quantifier(pm_regexp_parser_t *parser) {
|
137
|
+
const uint8_t *savepoint = parser->cursor;
|
138
|
+
|
139
|
+
enum {
|
140
|
+
PM_REGEXP_RANGE_QUANTIFIER_STATE_START,
|
141
|
+
PM_REGEXP_RANGE_QUANTIFIER_STATE_MINIMUM,
|
142
|
+
PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM,
|
143
|
+
PM_REGEXP_RANGE_QUANTIFIER_STATE_COMMA
|
144
|
+
} state = PM_REGEXP_RANGE_QUANTIFIER_STATE_START;
|
145
|
+
|
146
|
+
while (1) {
|
147
|
+
switch (state) {
|
148
|
+
case PM_REGEXP_RANGE_QUANTIFIER_STATE_START:
|
149
|
+
switch (*parser->cursor) {
|
150
|
+
case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
|
151
|
+
parser->cursor++;
|
152
|
+
state = PM_REGEXP_RANGE_QUANTIFIER_STATE_MINIMUM;
|
153
|
+
break;
|
154
|
+
case ',':
|
155
|
+
parser->cursor++;
|
156
|
+
state = PM_REGEXP_RANGE_QUANTIFIER_STATE_COMMA;
|
157
|
+
break;
|
158
|
+
default:
|
159
|
+
parser->cursor = savepoint;
|
160
|
+
return true;
|
161
|
+
}
|
162
|
+
break;
|
163
|
+
case PM_REGEXP_RANGE_QUANTIFIER_STATE_MINIMUM:
|
164
|
+
switch (*parser->cursor) {
|
165
|
+
case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
|
166
|
+
parser->cursor++;
|
167
|
+
break;
|
168
|
+
case ',':
|
169
|
+
parser->cursor++;
|
170
|
+
state = PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM;
|
171
|
+
break;
|
172
|
+
case '}':
|
173
|
+
parser->cursor++;
|
174
|
+
return true;
|
175
|
+
default:
|
176
|
+
parser->cursor = savepoint;
|
177
|
+
return true;
|
178
|
+
}
|
179
|
+
break;
|
180
|
+
case PM_REGEXP_RANGE_QUANTIFIER_STATE_COMMA:
|
181
|
+
switch (*parser->cursor) {
|
182
|
+
case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
|
183
|
+
parser->cursor++;
|
184
|
+
state = PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM;
|
185
|
+
break;
|
186
|
+
default:
|
187
|
+
parser->cursor = savepoint;
|
188
|
+
return true;
|
189
|
+
}
|
190
|
+
break;
|
191
|
+
case PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM:
|
192
|
+
switch (*parser->cursor) {
|
193
|
+
case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
|
194
|
+
parser->cursor++;
|
195
|
+
break;
|
196
|
+
case '}':
|
197
|
+
parser->cursor++;
|
198
|
+
return true;
|
199
|
+
default:
|
200
|
+
parser->cursor = savepoint;
|
201
|
+
return true;
|
202
|
+
}
|
203
|
+
break;
|
204
|
+
}
|
205
|
+
}
|
206
|
+
|
207
|
+
return true;
|
208
|
+
}
|
209
|
+
|
210
|
+
/**
|
211
|
+
* quantifier : star-quantifier
|
212
|
+
* | plus-quantifier
|
213
|
+
* | optional-quantifier
|
214
|
+
* | range-quantifier
|
215
|
+
* | <empty>
|
216
|
+
* ;
|
217
|
+
*/
|
218
|
+
static bool
|
219
|
+
pm_regexp_parse_quantifier(pm_regexp_parser_t *parser) {
|
220
|
+
if (pm_regexp_char_is_eof(parser)) return true;
|
221
|
+
|
222
|
+
switch (*parser->cursor) {
|
223
|
+
case '*':
|
224
|
+
case '+':
|
225
|
+
case '?':
|
226
|
+
parser->cursor++;
|
227
|
+
return true;
|
228
|
+
case '{':
|
229
|
+
parser->cursor++;
|
230
|
+
return pm_regexp_parse_range_quantifier(parser);
|
231
|
+
default:
|
232
|
+
// In this case there is no quantifier.
|
233
|
+
return true;
|
234
|
+
}
|
235
|
+
}
|
236
|
+
|
237
|
+
/**
|
238
|
+
* match-posix-class : '[' '[' ':' '^'? CHAR+ ':' ']' ']'
|
239
|
+
* ;
|
240
|
+
*/
|
241
|
+
static bool
|
242
|
+
pm_regexp_parse_posix_class(pm_regexp_parser_t *parser) {
|
243
|
+
if (!pm_regexp_char_expect(parser, ':')) {
|
244
|
+
return false;
|
245
|
+
}
|
246
|
+
|
247
|
+
pm_regexp_char_accept(parser, '^');
|
248
|
+
|
249
|
+
return (
|
250
|
+
pm_regexp_char_find(parser, ':') &&
|
251
|
+
pm_regexp_char_expect(parser, ']') &&
|
252
|
+
pm_regexp_char_expect(parser, ']')
|
253
|
+
);
|
254
|
+
}
|
255
|
+
|
256
|
+
// Forward declaration because character sets can be nested.
|
257
|
+
static bool
|
258
|
+
pm_regexp_parse_lbracket(pm_regexp_parser_t *parser);
|
259
|
+
|
260
|
+
/**
|
261
|
+
* match-char-set : '[' '^'? (match-range | match-char)* ']'
|
262
|
+
* ;
|
263
|
+
*/
|
264
|
+
static bool
|
265
|
+
pm_regexp_parse_character_set(pm_regexp_parser_t *parser) {
|
266
|
+
pm_regexp_char_accept(parser, '^');
|
267
|
+
|
268
|
+
while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ']') {
|
269
|
+
switch (*parser->cursor++) {
|
270
|
+
case '[':
|
271
|
+
pm_regexp_parse_lbracket(parser);
|
272
|
+
break;
|
273
|
+
case '\\':
|
274
|
+
if (!pm_regexp_char_is_eof(parser)) {
|
275
|
+
parser->cursor++;
|
276
|
+
}
|
277
|
+
break;
|
278
|
+
default:
|
279
|
+
// do nothing, we've already advanced the cursor
|
280
|
+
break;
|
281
|
+
}
|
282
|
+
}
|
283
|
+
|
284
|
+
return pm_regexp_char_expect(parser, ']');
|
285
|
+
}
|
286
|
+
|
287
|
+
/**
|
288
|
+
* A left bracket can either mean a POSIX class or a character set.
|
289
|
+
*/
|
290
|
+
static bool
|
291
|
+
pm_regexp_parse_lbracket(pm_regexp_parser_t *parser) {
|
292
|
+
const uint8_t *reset = parser->cursor;
|
293
|
+
|
294
|
+
if ((parser->cursor + 2 < parser->end) && parser->cursor[0] == '[' && parser->cursor[1] == ':') {
|
295
|
+
parser->cursor++;
|
296
|
+
if (pm_regexp_parse_posix_class(parser)) return true;
|
297
|
+
|
298
|
+
parser->cursor = reset;
|
299
|
+
}
|
300
|
+
|
301
|
+
return pm_regexp_parse_character_set(parser);
|
302
|
+
}
|
303
|
+
|
304
|
+
// Forward declaration here since parsing groups needs to go back up the grammar
|
305
|
+
// to parse expressions within them.
|
306
|
+
static bool
|
307
|
+
pm_regexp_parse_expression(pm_regexp_parser_t *parser);
|
308
|
+
|
309
|
+
/**
 * The possible states of a single option letter within a group's option list,
 * e.g. the "i" in (?i-m:subexp).
 */
typedef enum {
    /** The letter is not a recognized option. */
    PM_REGEXP_OPTION_STATE_INVALID = 0,

    /** The option may be both turned on and turned off. */
    PM_REGEXP_OPTION_STATE_TOGGLEABLE = 1,

    /** The option may be turned on but cannot be turned off. */
    PM_REGEXP_OPTION_STATE_ADDABLE = 2,

    /** The option has been turned on. */
    PM_REGEXP_OPTION_STATE_ADDED = 3,

    /** The option has been turned off. */
    PM_REGEXP_OPTION_STATE_REMOVED = 4
} pm_regexp_option_state_t;
|
320
|
+
|
321
|
+
// Option letters are tracked in a table indexed by the letter itself. These
// macros give the lowest and highest letters that have a slot in that table,
// and the resulting number of slots.

#define PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM 'a'
#define PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM 'x'
#define PRISM_REGEXP_OPTION_STATE_SLOTS ((PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM) - (PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM) + 1)
|
327
|
+
|
328
|
+
/**
|
329
|
+
* This is the set of options that are configurable on the regular expression.
|
330
|
+
*/
|
331
|
+
typedef struct {
|
332
|
+
/** The current state of each option. */
|
333
|
+
uint8_t values[PRISM_REGEXP_OPTION_STATE_SLOTS];
|
334
|
+
} pm_regexp_options_t;
|
335
|
+
|
336
|
+
/**
|
337
|
+
* Initialize a new set of options to their default values.
|
338
|
+
*/
|
339
|
+
static void
|
340
|
+
pm_regexp_options_init(pm_regexp_options_t *options) {
|
341
|
+
memset(options, PM_REGEXP_OPTION_STATE_INVALID, sizeof(uint8_t) * PRISM_REGEXP_OPTION_STATE_SLOTS);
|
342
|
+
options->values['i' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_TOGGLEABLE;
|
343
|
+
options->values['m' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_TOGGLEABLE;
|
344
|
+
options->values['x' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_TOGGLEABLE;
|
345
|
+
options->values['d' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_ADDABLE;
|
346
|
+
options->values['a' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_ADDABLE;
|
347
|
+
options->values['u' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_ADDABLE;
|
348
|
+
}
|
349
|
+
|
350
|
+
/**
|
351
|
+
* Attempt to add the given option to the set of options. Returns true if it was
|
352
|
+
* added, false if it was already present.
|
353
|
+
*/
|
354
|
+
static bool
|
355
|
+
pm_regexp_options_add(pm_regexp_options_t *options, uint8_t key) {
|
356
|
+
if (key >= PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM) {
|
357
|
+
key = (uint8_t) (key - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM);
|
358
|
+
|
359
|
+
switch (options->values[key]) {
|
360
|
+
case PM_REGEXP_OPTION_STATE_INVALID:
|
361
|
+
case PM_REGEXP_OPTION_STATE_REMOVED:
|
362
|
+
return false;
|
363
|
+
case PM_REGEXP_OPTION_STATE_TOGGLEABLE:
|
364
|
+
case PM_REGEXP_OPTION_STATE_ADDABLE:
|
365
|
+
options->values[key] = PM_REGEXP_OPTION_STATE_ADDED;
|
366
|
+
return true;
|
367
|
+
case PM_REGEXP_OPTION_STATE_ADDED:
|
368
|
+
return true;
|
369
|
+
}
|
370
|
+
}
|
371
|
+
|
372
|
+
return false;
|
373
|
+
}
|
374
|
+
|
375
|
+
/**
|
376
|
+
* Attempt to remove the given option from the set of options. Returns true if
|
377
|
+
* it was removed, false if it was already absent.
|
378
|
+
*/
|
379
|
+
static bool
|
380
|
+
pm_regexp_options_remove(pm_regexp_options_t *options, uint8_t key) {
|
381
|
+
if (key >= PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM) {
|
382
|
+
key = (uint8_t) (key - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM);
|
383
|
+
|
384
|
+
switch (options->values[key]) {
|
385
|
+
case PM_REGEXP_OPTION_STATE_INVALID:
|
386
|
+
case PM_REGEXP_OPTION_STATE_ADDABLE:
|
387
|
+
return false;
|
388
|
+
case PM_REGEXP_OPTION_STATE_TOGGLEABLE:
|
389
|
+
case PM_REGEXP_OPTION_STATE_ADDED:
|
390
|
+
case PM_REGEXP_OPTION_STATE_REMOVED:
|
391
|
+
options->values[key] = PM_REGEXP_OPTION_STATE_REMOVED;
|
392
|
+
return true;
|
393
|
+
}
|
394
|
+
}
|
395
|
+
|
396
|
+
return false;
|
397
|
+
}
|
398
|
+
|
399
|
+
/**
|
400
|
+
* Groups can have quite a few different patterns for syntax. They basically
|
401
|
+
* just wrap a set of expressions, but they can potentially have options after a
|
402
|
+
* question mark. If there _isn't_ a question mark, then it's just a set of
|
403
|
+
* expressions. If there _is_, then here are the options:
|
404
|
+
*
|
405
|
+
* * (?#...) - inline comments
|
406
|
+
* * (?:subexp) - non-capturing group
|
407
|
+
* * (?=subexp) - positive lookahead
|
408
|
+
* * (?!subexp) - negative lookahead
|
409
|
+
* * (?>subexp) - atomic group
|
410
|
+
* * (?~subexp) - absence operator
|
411
|
+
* * (?<=subexp) - positive lookbehind
|
412
|
+
* * (?<!subexp) - negative lookbehind
|
413
|
+
* * (?<name>subexp) - named capturing group
|
414
|
+
* * (?'name'subexp) - named capturing group
|
415
|
+
* * (?(cond)yes-subexp) - conditional expression
|
416
|
+
* * (?(cond)yes-subexp|no-subexp) - conditional expression
|
417
|
+
* * (?imxdau-imx) - turn on and off configuration
|
418
|
+
* * (?imxdau-imx:subexp) - turn on and off configuration for an expression
|
419
|
+
*/
|
420
|
+
static bool
|
421
|
+
pm_regexp_parse_group(pm_regexp_parser_t *parser) {
|
422
|
+
// First, parse any options for the group.
|
423
|
+
if (pm_regexp_char_accept(parser, '?')) {
|
424
|
+
if (pm_regexp_char_is_eof(parser)) {
|
425
|
+
return false;
|
426
|
+
}
|
427
|
+
pm_regexp_options_t options;
|
428
|
+
pm_regexp_options_init(&options);
|
429
|
+
|
430
|
+
switch (*parser->cursor) {
|
431
|
+
case '#': { // inline comments
|
432
|
+
if (parser->encoding_changed && parser->encoding->multibyte) {
|
433
|
+
bool escaped = false;
|
434
|
+
|
435
|
+
// Here we're going to take a slow path and iterate through
|
436
|
+
// each multibyte character to find the close paren. We do
|
437
|
+
// this because \ can be a trailing byte in some encodings.
|
438
|
+
while (parser->cursor < parser->end) {
|
439
|
+
if (!escaped && *parser->cursor == ')') {
|
440
|
+
parser->cursor++;
|
441
|
+
return true;
|
442
|
+
}
|
443
|
+
|
444
|
+
size_t width = parser->encoding->char_width(parser->cursor, (ptrdiff_t) (parser->end - parser->cursor));
|
445
|
+
if (width == 0) return false;
|
446
|
+
|
447
|
+
escaped = (width == 1) && (*parser->cursor == '\\');
|
448
|
+
parser->cursor += width;
|
449
|
+
}
|
450
|
+
|
451
|
+
return false;
|
452
|
+
} else {
|
453
|
+
// Here we can take the fast path and use memchr to find the
|
454
|
+
// next ) because we are safe checking backward for \ since
|
455
|
+
// it cannot be a trailing character.
|
456
|
+
bool found = pm_regexp_char_find(parser, ')');
|
457
|
+
|
458
|
+
while (found && (parser->start <= parser->cursor - 2) && (*(parser->cursor - 2) == '\\')) {
|
459
|
+
found = pm_regexp_char_find(parser, ')');
|
460
|
+
}
|
461
|
+
|
462
|
+
return found;
|
463
|
+
}
|
464
|
+
}
|
465
|
+
case ':': // non-capturing group
|
466
|
+
case '=': // positive lookahead
|
467
|
+
case '!': // negative lookahead
|
468
|
+
case '>': // atomic group
|
469
|
+
case '~': // absence operator
|
470
|
+
parser->cursor++;
|
471
|
+
break;
|
472
|
+
case '<':
|
473
|
+
parser->cursor++;
|
474
|
+
if (pm_regexp_char_is_eof(parser)) {
|
475
|
+
return false;
|
476
|
+
}
|
477
|
+
|
478
|
+
switch (*parser->cursor) {
|
479
|
+
case '=': // positive lookbehind
|
480
|
+
case '!': // negative lookbehind
|
481
|
+
parser->cursor++;
|
482
|
+
break;
|
483
|
+
default: { // named capture group
|
484
|
+
const uint8_t *start = parser->cursor;
|
485
|
+
if (!pm_regexp_char_find(parser, '>')) {
|
486
|
+
return false;
|
487
|
+
}
|
488
|
+
pm_regexp_parser_named_capture(parser, start, parser->cursor - 1);
|
489
|
+
break;
|
490
|
+
}
|
491
|
+
}
|
492
|
+
break;
|
493
|
+
case '\'': { // named capture group
|
494
|
+
const uint8_t *start = ++parser->cursor;
|
495
|
+
if (!pm_regexp_char_find(parser, '\'')) {
|
496
|
+
return false;
|
497
|
+
}
|
498
|
+
|
499
|
+
pm_regexp_parser_named_capture(parser, start, parser->cursor - 1);
|
500
|
+
break;
|
501
|
+
}
|
502
|
+
case '(': // conditional expression
|
503
|
+
if (!pm_regexp_char_find(parser, ')')) {
|
504
|
+
return false;
|
505
|
+
}
|
506
|
+
break;
|
507
|
+
case 'i': case 'm': case 'x': case 'd': case 'a': case 'u': // options
|
508
|
+
while (!pm_regexp_char_is_eof(parser) && *parser->cursor != '-' && *parser->cursor != ':' && *parser->cursor != ')') {
|
509
|
+
if (!pm_regexp_options_add(&options, *parser->cursor)) {
|
510
|
+
return false;
|
511
|
+
}
|
512
|
+
parser->cursor++;
|
513
|
+
}
|
514
|
+
|
515
|
+
if (pm_regexp_char_is_eof(parser)) {
|
516
|
+
return false;
|
517
|
+
}
|
518
|
+
|
519
|
+
// If we hit a -, then we're done parsing options.
|
520
|
+
if (*parser->cursor != '-') break;
|
521
|
+
|
522
|
+
// Otherwise, fallthrough to the - case.
|
523
|
+
/* fallthrough */
|
524
|
+
case '-':
|
525
|
+
parser->cursor++;
|
526
|
+
while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ':' && *parser->cursor != ')') {
|
527
|
+
if (!pm_regexp_options_remove(&options, *parser->cursor)) {
|
528
|
+
return false;
|
529
|
+
}
|
530
|
+
parser->cursor++;
|
531
|
+
}
|
532
|
+
|
533
|
+
if (pm_regexp_char_is_eof(parser)) {
|
534
|
+
return false;
|
535
|
+
}
|
536
|
+
break;
|
537
|
+
default:
|
538
|
+
return false;
|
539
|
+
}
|
540
|
+
}
|
541
|
+
|
542
|
+
// Now, parse the expressions within this group.
|
543
|
+
while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ')') {
|
544
|
+
if (!pm_regexp_parse_expression(parser)) {
|
545
|
+
return false;
|
546
|
+
}
|
547
|
+
pm_regexp_char_accept(parser, '|');
|
548
|
+
}
|
549
|
+
|
550
|
+
// Finally, make sure we have a closing parenthesis.
|
551
|
+
return pm_regexp_char_expect(parser, ')');
|
552
|
+
}
|
553
|
+
|
554
|
+
/**
|
555
|
+
* item : anchor
|
556
|
+
* | match-posix-class
|
557
|
+
* | match-char-set
|
558
|
+
* | match-char-class
|
559
|
+
* | match-char-prop
|
560
|
+
* | match-char
|
561
|
+
* | match-any
|
562
|
+
* | group
|
563
|
+
* | quantified
|
564
|
+
* ;
|
565
|
+
*/
|
566
|
+
static bool
|
567
|
+
pm_regexp_parse_item(pm_regexp_parser_t *parser) {
|
568
|
+
switch (*parser->cursor++) {
|
569
|
+
case '^':
|
570
|
+
case '$':
|
571
|
+
return true;
|
572
|
+
case '\\':
|
573
|
+
if (!pm_regexp_char_is_eof(parser)) {
|
574
|
+
parser->cursor++;
|
575
|
+
}
|
576
|
+
return pm_regexp_parse_quantifier(parser);
|
577
|
+
case '(':
|
578
|
+
return pm_regexp_parse_group(parser) && pm_regexp_parse_quantifier(parser);
|
579
|
+
case '[':
|
580
|
+
return pm_regexp_parse_lbracket(parser) && pm_regexp_parse_quantifier(parser);
|
581
|
+
default:
|
582
|
+
return pm_regexp_parse_quantifier(parser);
|
583
|
+
}
|
584
|
+
}
|
585
|
+
|
586
|
+
/**
|
587
|
+
* expression : item+
|
588
|
+
* ;
|
589
|
+
*/
|
590
|
+
static bool
|
591
|
+
pm_regexp_parse_expression(pm_regexp_parser_t *parser) {
|
592
|
+
if (!pm_regexp_parse_item(parser)) {
|
593
|
+
return false;
|
594
|
+
}
|
595
|
+
|
596
|
+
while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ')' && *parser->cursor != '|') {
|
597
|
+
if (!pm_regexp_parse_item(parser)) {
|
598
|
+
return false;
|
599
|
+
}
|
600
|
+
}
|
601
|
+
|
602
|
+
return true;
|
603
|
+
}
|
604
|
+
|
605
|
+
/**
|
606
|
+
* pattern : EOF
|
607
|
+
* | expression EOF
|
608
|
+
* | expression '|' pattern
|
609
|
+
* ;
|
610
|
+
*/
|
611
|
+
static bool
|
612
|
+
pm_regexp_parse_pattern(pm_regexp_parser_t *parser) {
|
613
|
+
return (
|
614
|
+
(
|
615
|
+
// Exit early if the pattern is empty.
|
616
|
+
pm_regexp_char_is_eof(parser) ||
|
617
|
+
// Parse the first expression in the pattern.
|
618
|
+
pm_regexp_parse_expression(parser)
|
619
|
+
) &&
|
620
|
+
(
|
621
|
+
// Return now if we've parsed the entire pattern.
|
622
|
+
pm_regexp_char_is_eof(parser) ||
|
623
|
+
// Otherwise, we should have a pipe character.
|
624
|
+
(pm_regexp_char_expect(parser, '|') && pm_regexp_parse_pattern(parser))
|
625
|
+
)
|
626
|
+
);
|
627
|
+
}
|
628
|
+
|
629
|
+
/**
|
630
|
+
* Parse a regular expression and extract the names of all of the named capture
|
631
|
+
* groups.
|
632
|
+
*/
|
633
|
+
PRISM_EXPORTED_FUNCTION bool
|
634
|
+
pm_regexp_named_capture_group_names(const uint8_t *source, size_t size, pm_string_list_t *named_captures, bool encoding_changed, const pm_encoding_t *encoding) {
|
635
|
+
pm_regexp_parser_t parser;
|
636
|
+
pm_regexp_parser_init(&parser, source, source + size, named_captures, encoding_changed, encoding);
|
637
|
+
return pm_regexp_parse_pattern(&parser);
|
638
|
+
}
|