prism 0.13.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CHANGELOG.md +172 -0
- data/CODE_OF_CONDUCT.md +76 -0
- data/CONTRIBUTING.md +62 -0
- data/LICENSE.md +7 -0
- data/Makefile +84 -0
- data/README.md +89 -0
- data/config.yml +2481 -0
- data/docs/build_system.md +74 -0
- data/docs/building.md +22 -0
- data/docs/configuration.md +60 -0
- data/docs/design.md +53 -0
- data/docs/encoding.md +117 -0
- data/docs/fuzzing.md +93 -0
- data/docs/heredocs.md +36 -0
- data/docs/mapping.md +117 -0
- data/docs/ripper.md +36 -0
- data/docs/ruby_api.md +25 -0
- data/docs/serialization.md +181 -0
- data/docs/testing.md +55 -0
- data/ext/prism/api_node.c +4725 -0
- data/ext/prism/api_pack.c +256 -0
- data/ext/prism/extconf.rb +136 -0
- data/ext/prism/extension.c +626 -0
- data/ext/prism/extension.h +18 -0
- data/include/prism/ast.h +1932 -0
- data/include/prism/defines.h +45 -0
- data/include/prism/diagnostic.h +231 -0
- data/include/prism/enc/pm_encoding.h +95 -0
- data/include/prism/node.h +41 -0
- data/include/prism/pack.h +141 -0
- data/include/prism/parser.h +418 -0
- data/include/prism/regexp.h +19 -0
- data/include/prism/unescape.h +48 -0
- data/include/prism/util/pm_buffer.h +51 -0
- data/include/prism/util/pm_char.h +91 -0
- data/include/prism/util/pm_constant_pool.h +78 -0
- data/include/prism/util/pm_list.h +67 -0
- data/include/prism/util/pm_memchr.h +14 -0
- data/include/prism/util/pm_newline_list.h +61 -0
- data/include/prism/util/pm_state_stack.h +24 -0
- data/include/prism/util/pm_string.h +61 -0
- data/include/prism/util/pm_string_list.h +25 -0
- data/include/prism/util/pm_strpbrk.h +29 -0
- data/include/prism/version.h +4 -0
- data/include/prism.h +82 -0
- data/lib/prism/compiler.rb +465 -0
- data/lib/prism/debug.rb +157 -0
- data/lib/prism/desugar_compiler.rb +206 -0
- data/lib/prism/dispatcher.rb +2051 -0
- data/lib/prism/dsl.rb +750 -0
- data/lib/prism/ffi.rb +251 -0
- data/lib/prism/lex_compat.rb +838 -0
- data/lib/prism/mutation_compiler.rb +718 -0
- data/lib/prism/node.rb +14540 -0
- data/lib/prism/node_ext.rb +55 -0
- data/lib/prism/node_inspector.rb +68 -0
- data/lib/prism/pack.rb +185 -0
- data/lib/prism/parse_result/comments.rb +172 -0
- data/lib/prism/parse_result/newlines.rb +60 -0
- data/lib/prism/parse_result.rb +266 -0
- data/lib/prism/pattern.rb +239 -0
- data/lib/prism/ripper_compat.rb +174 -0
- data/lib/prism/serialize.rb +662 -0
- data/lib/prism/visitor.rb +470 -0
- data/lib/prism.rb +64 -0
- data/prism.gemspec +113 -0
- data/src/diagnostic.c +287 -0
- data/src/enc/pm_big5.c +52 -0
- data/src/enc/pm_euc_jp.c +58 -0
- data/src/enc/pm_gbk.c +61 -0
- data/src/enc/pm_shift_jis.c +56 -0
- data/src/enc/pm_tables.c +507 -0
- data/src/enc/pm_unicode.c +2324 -0
- data/src/enc/pm_windows_31j.c +56 -0
- data/src/node.c +2633 -0
- data/src/pack.c +493 -0
- data/src/prettyprint.c +2136 -0
- data/src/prism.c +14587 -0
- data/src/regexp.c +580 -0
- data/src/serialize.c +1899 -0
- data/src/token_type.c +349 -0
- data/src/unescape.c +637 -0
- data/src/util/pm_buffer.c +103 -0
- data/src/util/pm_char.c +272 -0
- data/src/util/pm_constant_pool.c +252 -0
- data/src/util/pm_list.c +41 -0
- data/src/util/pm_memchr.c +33 -0
- data/src/util/pm_newline_list.c +134 -0
- data/src/util/pm_state_stack.c +19 -0
- data/src/util/pm_string.c +200 -0
- data/src/util/pm_string_list.c +29 -0
- data/src/util/pm_strncasecmp.c +17 -0
- data/src/util/pm_strpbrk.c +66 -0
- metadata +138 -0
data/src/regexp.c
ADDED
@@ -0,0 +1,580 @@
|
|
1
|
+
#include "prism/regexp.h"
|
2
|
+
|
3
|
+
// This is the parser that is going to handle parsing regular expressions.
|
4
|
+
typedef struct {
|
5
|
+
const uint8_t *start;
|
6
|
+
const uint8_t *cursor;
|
7
|
+
const uint8_t *end;
|
8
|
+
pm_string_list_t *named_captures;
|
9
|
+
bool encoding_changed;
|
10
|
+
pm_encoding_t *encoding;
|
11
|
+
} pm_regexp_parser_t;
|
12
|
+
|
13
|
+
// This initializes a new parser with the given source.
|
14
|
+
static void
|
15
|
+
pm_regexp_parser_init(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end, pm_string_list_t *named_captures, bool encoding_changed, pm_encoding_t *encoding) {
|
16
|
+
*parser = (pm_regexp_parser_t) {
|
17
|
+
.start = start,
|
18
|
+
.cursor = start,
|
19
|
+
.end = end,
|
20
|
+
.named_captures = named_captures,
|
21
|
+
.encoding_changed = encoding_changed,
|
22
|
+
.encoding = encoding
|
23
|
+
};
|
24
|
+
}
|
25
|
+
|
26
|
+
// This appends a new string to the list of named captures.
|
27
|
+
static void
|
28
|
+
pm_regexp_parser_named_capture(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end) {
|
29
|
+
pm_string_t string;
|
30
|
+
pm_string_shared_init(&string, start, end);
|
31
|
+
pm_string_list_append(parser->named_captures, &string);
|
32
|
+
pm_string_free(&string);
|
33
|
+
}
|
34
|
+
|
35
|
+
// Returns true if the next character is the end of the source.
|
36
|
+
static inline bool
|
37
|
+
pm_regexp_char_is_eof(pm_regexp_parser_t *parser) {
|
38
|
+
return parser->cursor >= parser->end;
|
39
|
+
}
|
40
|
+
|
41
|
+
// Optionally accept a char and consume it if it exists.
|
42
|
+
static inline bool
|
43
|
+
pm_regexp_char_accept(pm_regexp_parser_t *parser, uint8_t value) {
|
44
|
+
if (!pm_regexp_char_is_eof(parser) && *parser->cursor == value) {
|
45
|
+
parser->cursor++;
|
46
|
+
return true;
|
47
|
+
}
|
48
|
+
return false;
|
49
|
+
}
|
50
|
+
|
51
|
+
// Expect a character to be present and consume it.
|
52
|
+
static inline bool
|
53
|
+
pm_regexp_char_expect(pm_regexp_parser_t *parser, uint8_t value) {
|
54
|
+
if (!pm_regexp_char_is_eof(parser) && *parser->cursor == value) {
|
55
|
+
parser->cursor++;
|
56
|
+
return true;
|
57
|
+
}
|
58
|
+
return false;
|
59
|
+
}
|
60
|
+
|
61
|
+
// This advances the current token to the next instance of the given character.
|
62
|
+
static bool
|
63
|
+
pm_regexp_char_find(pm_regexp_parser_t *parser, uint8_t value) {
|
64
|
+
if (pm_regexp_char_is_eof(parser)) {
|
65
|
+
return false;
|
66
|
+
}
|
67
|
+
|
68
|
+
const uint8_t *end = (const uint8_t *) pm_memchr(parser->cursor, value, (size_t) (parser->end - parser->cursor), parser->encoding_changed, parser->encoding);
|
69
|
+
if (end == NULL) {
|
70
|
+
return false;
|
71
|
+
}
|
72
|
+
|
73
|
+
parser->cursor = end + 1;
|
74
|
+
return true;
|
75
|
+
}
|
76
|
+
|
77
|
+
// Range quantifiers are a special class of quantifiers that look like
|
78
|
+
//
|
79
|
+
// * {digit}
|
80
|
+
// * {digit,}
|
81
|
+
// * {digit,digit}
|
82
|
+
// * {,digit}
|
83
|
+
//
|
84
|
+
// Unfortunately, if there are any spaces in between, then this just becomes a
|
85
|
+
// regular character match expression and we have to backtrack. So when this
|
86
|
+
// function first starts running, we'll create a "save" point and then attempt
|
87
|
+
// to parse the quantifier. If it fails, we'll restore the save point and
|
88
|
+
// return.
|
89
|
+
//
|
90
|
+
// The properly track everything, we're going to build a little state machine.
|
91
|
+
// It looks something like the following:
|
92
|
+
//
|
93
|
+
// ┌───────┐ ┌─────────┐ ────────────┐
|
94
|
+
// ──── lbrace ───> │ start │ ──── digit ───> │ minimum │ │
|
95
|
+
// └───────┘ └─────────┘ <─── digit ─┘
|
96
|
+
// │ │ │
|
97
|
+
// ┌───────┐ │ │ rbrace
|
98
|
+
// │ comma │ <───── comma ┌──── comma ───────┘ │
|
99
|
+
// └───────┘ V V
|
100
|
+
// │ ┌─────────┐ ┌─────────┐
|
101
|
+
// └── digit ──> │ maximum │ ── rbrace ──> │| final |│
|
102
|
+
// └─────────┘ └─────────┘
|
103
|
+
// │ ^
|
104
|
+
// └─ digit ─┘
|
105
|
+
//
|
106
|
+
// Note that by the time we've hit this function, the lbrace has already been
|
107
|
+
// consumed so we're in the start state.
|
108
|
+
static bool
|
109
|
+
pm_regexp_parse_range_quantifier(pm_regexp_parser_t *parser) {
|
110
|
+
const uint8_t *savepoint = parser->cursor;
|
111
|
+
|
112
|
+
enum {
|
113
|
+
PM_REGEXP_RANGE_QUANTIFIER_STATE_START,
|
114
|
+
PM_REGEXP_RANGE_QUANTIFIER_STATE_MINIMUM,
|
115
|
+
PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM,
|
116
|
+
PM_REGEXP_RANGE_QUANTIFIER_STATE_COMMA
|
117
|
+
} state = PM_REGEXP_RANGE_QUANTIFIER_STATE_START;
|
118
|
+
|
119
|
+
while (1) {
|
120
|
+
switch (state) {
|
121
|
+
case PM_REGEXP_RANGE_QUANTIFIER_STATE_START:
|
122
|
+
switch (*parser->cursor) {
|
123
|
+
case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
|
124
|
+
parser->cursor++;
|
125
|
+
state = PM_REGEXP_RANGE_QUANTIFIER_STATE_MINIMUM;
|
126
|
+
break;
|
127
|
+
case ',':
|
128
|
+
parser->cursor++;
|
129
|
+
state = PM_REGEXP_RANGE_QUANTIFIER_STATE_COMMA;
|
130
|
+
break;
|
131
|
+
default:
|
132
|
+
parser->cursor = savepoint;
|
133
|
+
return true;
|
134
|
+
}
|
135
|
+
break;
|
136
|
+
case PM_REGEXP_RANGE_QUANTIFIER_STATE_MINIMUM:
|
137
|
+
switch (*parser->cursor) {
|
138
|
+
case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
|
139
|
+
parser->cursor++;
|
140
|
+
break;
|
141
|
+
case ',':
|
142
|
+
parser->cursor++;
|
143
|
+
state = PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM;
|
144
|
+
break;
|
145
|
+
case '}':
|
146
|
+
parser->cursor++;
|
147
|
+
return true;
|
148
|
+
default:
|
149
|
+
parser->cursor = savepoint;
|
150
|
+
return true;
|
151
|
+
}
|
152
|
+
break;
|
153
|
+
case PM_REGEXP_RANGE_QUANTIFIER_STATE_COMMA:
|
154
|
+
switch (*parser->cursor) {
|
155
|
+
case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
|
156
|
+
parser->cursor++;
|
157
|
+
state = PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM;
|
158
|
+
break;
|
159
|
+
default:
|
160
|
+
parser->cursor = savepoint;
|
161
|
+
return true;
|
162
|
+
}
|
163
|
+
break;
|
164
|
+
case PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM:
|
165
|
+
switch (*parser->cursor) {
|
166
|
+
case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
|
167
|
+
parser->cursor++;
|
168
|
+
break;
|
169
|
+
case '}':
|
170
|
+
parser->cursor++;
|
171
|
+
return true;
|
172
|
+
default:
|
173
|
+
parser->cursor = savepoint;
|
174
|
+
return true;
|
175
|
+
}
|
176
|
+
break;
|
177
|
+
}
|
178
|
+
}
|
179
|
+
|
180
|
+
return true;
|
181
|
+
}
|
182
|
+
|
183
|
+
// quantifier : star-quantifier
|
184
|
+
// | plus-quantifier
|
185
|
+
// | optional-quantifier
|
186
|
+
// | range-quantifier
|
187
|
+
// | <empty>
|
188
|
+
// ;
|
189
|
+
static bool
|
190
|
+
pm_regexp_parse_quantifier(pm_regexp_parser_t *parser) {
|
191
|
+
switch (*parser->cursor) {
|
192
|
+
case '*':
|
193
|
+
case '+':
|
194
|
+
case '?':
|
195
|
+
parser->cursor++;
|
196
|
+
return true;
|
197
|
+
case '{':
|
198
|
+
parser->cursor++;
|
199
|
+
return pm_regexp_parse_range_quantifier(parser);
|
200
|
+
default:
|
201
|
+
// In this case there is no quantifier.
|
202
|
+
return true;
|
203
|
+
}
|
204
|
+
}
|
205
|
+
|
206
|
+
// match-posix-class : '[' '[' ':' '^'? CHAR+ ':' ']' ']'
|
207
|
+
// ;
|
208
|
+
static bool
|
209
|
+
pm_regexp_parse_posix_class(pm_regexp_parser_t *parser) {
|
210
|
+
if (!pm_regexp_char_expect(parser, ':')) {
|
211
|
+
return false;
|
212
|
+
}
|
213
|
+
|
214
|
+
pm_regexp_char_accept(parser, '^');
|
215
|
+
|
216
|
+
return (
|
217
|
+
pm_regexp_char_find(parser, ':') &&
|
218
|
+
pm_regexp_char_expect(parser, ']') &&
|
219
|
+
pm_regexp_char_expect(parser, ']')
|
220
|
+
);
|
221
|
+
}
|
222
|
+
|
223
|
+
// Forward declaration because character sets can be nested.
|
224
|
+
static bool
|
225
|
+
pm_regexp_parse_lbracket(pm_regexp_parser_t *parser);
|
226
|
+
|
227
|
+
// match-char-set : '[' '^'? (match-range | match-char)* ']'
|
228
|
+
// ;
|
229
|
+
static bool
|
230
|
+
pm_regexp_parse_character_set(pm_regexp_parser_t *parser) {
|
231
|
+
pm_regexp_char_accept(parser, '^');
|
232
|
+
|
233
|
+
while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ']') {
|
234
|
+
switch (*parser->cursor++) {
|
235
|
+
case '[':
|
236
|
+
pm_regexp_parse_lbracket(parser);
|
237
|
+
break;
|
238
|
+
case '\\':
|
239
|
+
if (!pm_regexp_char_is_eof(parser)) {
|
240
|
+
parser->cursor++;
|
241
|
+
}
|
242
|
+
break;
|
243
|
+
default:
|
244
|
+
// do nothing, we've already advanced the cursor
|
245
|
+
break;
|
246
|
+
}
|
247
|
+
}
|
248
|
+
|
249
|
+
return pm_regexp_char_expect(parser, ']');
|
250
|
+
}
|
251
|
+
|
252
|
+
// A left bracket can either mean a POSIX class or a character set.
|
253
|
+
static bool
|
254
|
+
pm_regexp_parse_lbracket(pm_regexp_parser_t *parser) {
|
255
|
+
const uint8_t *reset = parser->cursor;
|
256
|
+
|
257
|
+
if ((parser->cursor + 2 < parser->end) && parser->cursor[0] == '[' && parser->cursor[1] == ':') {
|
258
|
+
parser->cursor++;
|
259
|
+
if (pm_regexp_parse_posix_class(parser)) return true;
|
260
|
+
|
261
|
+
parser->cursor = reset;
|
262
|
+
}
|
263
|
+
|
264
|
+
return pm_regexp_parse_character_set(parser);
|
265
|
+
}
|
266
|
+
|
267
|
+
// Forward declaration here since parsing groups needs to go back up the grammar
|
268
|
+
// to parse expressions within them.
|
269
|
+
static bool
|
270
|
+
pm_regexp_parse_expression(pm_regexp_parser_t *parser);
|
271
|
+
|
272
|
+
// The possible states of a single option that can be configured on a regular
// expression (or from within a group).
typedef enum {
    // Not a recognized option; it can be neither added nor removed.
    PM_REGEXP_OPTION_STATE_INVALID = 0,

    // A recognized option that can be both added and removed.
    PM_REGEXP_OPTION_STATE_TOGGLEABLE = 1,

    // A recognized option that can be added but not removed.
    PM_REGEXP_OPTION_STATE_ADDABLE = 2,

    // The option has been added. It may still be removed afterwards.
    PM_REGEXP_OPTION_STATE_ADDED = 3,

    // The option has been removed. It cannot be added again afterwards.
    PM_REGEXP_OPTION_STATE_REMOVED = 4
} pm_regexp_option_state_t;
|
281
|
+
|
282
|
+
// Options are single lowercase letters. Each one is given a slot indexed by
// its distance from the lowest letter in the range, so the number of slots is
// the size of the inclusive range from the minimum to the maximum letter.
#define PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM 'a'
#define PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM 'x'
#define PRISM_REGEXP_OPTION_STATE_SLOTS (1 + PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM)

// The set of options that are configurable on the regular expression. Each
// slot holds the state of the option for the corresponding letter.
typedef struct {
    uint8_t values[PRISM_REGEXP_OPTION_STATE_SLOTS];
} pm_regexp_options_t;
|
292
|
+
|
293
|
+
// Initialize a new set of options to their default values.
|
294
|
+
static void
|
295
|
+
pm_regexp_options_init(pm_regexp_options_t *options) {
|
296
|
+
memset(options, PM_REGEXP_OPTION_STATE_INVALID, sizeof(uint8_t) * PRISM_REGEXP_OPTION_STATE_SLOTS);
|
297
|
+
options->values['i' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_TOGGLEABLE;
|
298
|
+
options->values['m' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_TOGGLEABLE;
|
299
|
+
options->values['x' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_TOGGLEABLE;
|
300
|
+
options->values['d' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_ADDABLE;
|
301
|
+
options->values['a' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_ADDABLE;
|
302
|
+
options->values['u' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_ADDABLE;
|
303
|
+
}
|
304
|
+
|
305
|
+
// Attempt to add the given option to the set of options. Returns true if it was
|
306
|
+
// added, false if it was already present.
|
307
|
+
static bool
|
308
|
+
pm_regexp_options_add(pm_regexp_options_t *options, uint8_t key) {
|
309
|
+
if (key >= PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM) {
|
310
|
+
key = (uint8_t) (key - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM);
|
311
|
+
|
312
|
+
switch (options->values[key]) {
|
313
|
+
case PM_REGEXP_OPTION_STATE_INVALID:
|
314
|
+
case PM_REGEXP_OPTION_STATE_REMOVED:
|
315
|
+
return false;
|
316
|
+
case PM_REGEXP_OPTION_STATE_TOGGLEABLE:
|
317
|
+
case PM_REGEXP_OPTION_STATE_ADDABLE:
|
318
|
+
options->values[key] = PM_REGEXP_OPTION_STATE_ADDED;
|
319
|
+
return true;
|
320
|
+
case PM_REGEXP_OPTION_STATE_ADDED:
|
321
|
+
return true;
|
322
|
+
}
|
323
|
+
}
|
324
|
+
|
325
|
+
return false;
|
326
|
+
}
|
327
|
+
|
328
|
+
// Attempt to remove the given option from the set of options. Returns true if
|
329
|
+
// it was removed, false if it was already absent.
|
330
|
+
static bool
|
331
|
+
pm_regexp_options_remove(pm_regexp_options_t *options, uint8_t key) {
|
332
|
+
if (key >= PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM) {
|
333
|
+
key = (uint8_t) (key - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM);
|
334
|
+
|
335
|
+
switch (options->values[key]) {
|
336
|
+
case PM_REGEXP_OPTION_STATE_INVALID:
|
337
|
+
case PM_REGEXP_OPTION_STATE_ADDABLE:
|
338
|
+
return false;
|
339
|
+
case PM_REGEXP_OPTION_STATE_TOGGLEABLE:
|
340
|
+
case PM_REGEXP_OPTION_STATE_ADDED:
|
341
|
+
case PM_REGEXP_OPTION_STATE_REMOVED:
|
342
|
+
options->values[key] = PM_REGEXP_OPTION_STATE_REMOVED;
|
343
|
+
return true;
|
344
|
+
}
|
345
|
+
}
|
346
|
+
|
347
|
+
return false;
|
348
|
+
}
|
349
|
+
|
350
|
+
// Groups can have quite a few different patterns for syntax. They basically
|
351
|
+
// just wrap a set of expressions, but they can potentially have options after a
|
352
|
+
// question mark. If there _isn't_ a question mark, then it's just a set of
|
353
|
+
// expressions. If there _is_, then here are the options:
|
354
|
+
//
|
355
|
+
// * (?#...) - inline comments
|
356
|
+
// * (?:subexp) - non-capturing group
|
357
|
+
// * (?=subexp) - positive lookahead
|
358
|
+
// * (?!subexp) - negative lookahead
|
359
|
+
// * (?>subexp) - atomic group
|
360
|
+
// * (?~subexp) - absence operator
|
361
|
+
// * (?<=subexp) - positive lookbehind
|
362
|
+
// * (?<!subexp) - negative lookbehind
|
363
|
+
// * (?<name>subexp) - named capturing group
|
364
|
+
// * (?'name'subexp) - named capturing group
|
365
|
+
// * (?(cond)yes-subexp) - conditional expression
|
366
|
+
// * (?(cond)yes-subexp|no-subexp) - conditional expression
|
367
|
+
// * (?imxdau-imx) - turn on and off configuration
|
368
|
+
// * (?imxdau-imx:subexp) - turn on and off configuration for an expression
|
369
|
+
//
|
370
|
+
static bool
|
371
|
+
pm_regexp_parse_group(pm_regexp_parser_t *parser) {
|
372
|
+
// First, parse any options for the group.
|
373
|
+
if (pm_regexp_char_accept(parser, '?')) {
|
374
|
+
if (pm_regexp_char_is_eof(parser)) {
|
375
|
+
return false;
|
376
|
+
}
|
377
|
+
pm_regexp_options_t options;
|
378
|
+
pm_regexp_options_init(&options);
|
379
|
+
|
380
|
+
switch (*parser->cursor) {
|
381
|
+
case '#': { // inline comments
|
382
|
+
if (parser->encoding_changed && parser->encoding->multibyte) {
|
383
|
+
bool escaped = false;
|
384
|
+
|
385
|
+
// Here we're going to take a slow path and iterate through
|
386
|
+
// each multibyte character to find the close paren. We do
|
387
|
+
// this because \ can be a trailing byte in some encodings.
|
388
|
+
while (parser->cursor < parser->end) {
|
389
|
+
if (!escaped && *parser->cursor == ')') {
|
390
|
+
parser->cursor++;
|
391
|
+
return true;
|
392
|
+
}
|
393
|
+
|
394
|
+
size_t width = parser->encoding->char_width(parser->cursor, (ptrdiff_t) (parser->end - parser->cursor));
|
395
|
+
if (width == 0) return false;
|
396
|
+
|
397
|
+
escaped = (width == 1) && (*parser->cursor == '\\');
|
398
|
+
parser->cursor += width;
|
399
|
+
}
|
400
|
+
|
401
|
+
return false;
|
402
|
+
} else {
|
403
|
+
// Here we can take the fast path and use memchr to find the
|
404
|
+
// next ) because we are safe checking backward for \ since
|
405
|
+
// it cannot be a trailing character.
|
406
|
+
bool found = pm_regexp_char_find(parser, ')');
|
407
|
+
|
408
|
+
while (found && (parser->start <= parser->cursor - 2) && (*(parser->cursor - 2) == '\\')) {
|
409
|
+
found = pm_regexp_char_find(parser, ')');
|
410
|
+
}
|
411
|
+
|
412
|
+
return found;
|
413
|
+
}
|
414
|
+
}
|
415
|
+
case ':': // non-capturing group
|
416
|
+
case '=': // positive lookahead
|
417
|
+
case '!': // negative lookahead
|
418
|
+
case '>': // atomic group
|
419
|
+
case '~': // absence operator
|
420
|
+
parser->cursor++;
|
421
|
+
break;
|
422
|
+
case '<':
|
423
|
+
parser->cursor++;
|
424
|
+
if (pm_regexp_char_is_eof(parser)) {
|
425
|
+
return false;
|
426
|
+
}
|
427
|
+
|
428
|
+
switch (*parser->cursor) {
|
429
|
+
case '=': // positive lookbehind
|
430
|
+
case '!': // negative lookbehind
|
431
|
+
parser->cursor++;
|
432
|
+
break;
|
433
|
+
default: { // named capture group
|
434
|
+
const uint8_t *start = parser->cursor;
|
435
|
+
if (!pm_regexp_char_find(parser, '>')) {
|
436
|
+
return false;
|
437
|
+
}
|
438
|
+
pm_regexp_parser_named_capture(parser, start, parser->cursor - 1);
|
439
|
+
break;
|
440
|
+
}
|
441
|
+
}
|
442
|
+
break;
|
443
|
+
case '\'': { // named capture group
|
444
|
+
const uint8_t *start = ++parser->cursor;
|
445
|
+
if (!pm_regexp_char_find(parser, '\'')) {
|
446
|
+
return false;
|
447
|
+
}
|
448
|
+
|
449
|
+
pm_regexp_parser_named_capture(parser, start, parser->cursor - 1);
|
450
|
+
break;
|
451
|
+
}
|
452
|
+
case '(': // conditional expression
|
453
|
+
if (!pm_regexp_char_find(parser, ')')) {
|
454
|
+
return false;
|
455
|
+
}
|
456
|
+
break;
|
457
|
+
case 'i': case 'm': case 'x': case 'd': case 'a': case 'u': // options
|
458
|
+
while (!pm_regexp_char_is_eof(parser) && *parser->cursor != '-' && *parser->cursor != ':' && *parser->cursor != ')') {
|
459
|
+
if (!pm_regexp_options_add(&options, *parser->cursor)) {
|
460
|
+
return false;
|
461
|
+
}
|
462
|
+
parser->cursor++;
|
463
|
+
}
|
464
|
+
|
465
|
+
if (pm_regexp_char_is_eof(parser)) {
|
466
|
+
return false;
|
467
|
+
}
|
468
|
+
|
469
|
+
// If we hit a -, then we're done parsing options.
|
470
|
+
if (*parser->cursor != '-') break;
|
471
|
+
|
472
|
+
// Otherwise, fallthrough to the - case.
|
473
|
+
/* fallthrough */
|
474
|
+
case '-':
|
475
|
+
parser->cursor++;
|
476
|
+
while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ':' && *parser->cursor != ')') {
|
477
|
+
if (!pm_regexp_options_remove(&options, *parser->cursor)) {
|
478
|
+
return false;
|
479
|
+
}
|
480
|
+
parser->cursor++;
|
481
|
+
}
|
482
|
+
|
483
|
+
if (pm_regexp_char_is_eof(parser)) {
|
484
|
+
return false;
|
485
|
+
}
|
486
|
+
break;
|
487
|
+
default:
|
488
|
+
return false;
|
489
|
+
}
|
490
|
+
}
|
491
|
+
|
492
|
+
// Now, parse the expressions within this group.
|
493
|
+
while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ')') {
|
494
|
+
if (!pm_regexp_parse_expression(parser)) {
|
495
|
+
return false;
|
496
|
+
}
|
497
|
+
pm_regexp_char_accept(parser, '|');
|
498
|
+
}
|
499
|
+
|
500
|
+
// Finally, make sure we have a closing parenthesis.
|
501
|
+
return pm_regexp_char_expect(parser, ')');
|
502
|
+
}
|
503
|
+
|
504
|
+
// item : anchor
|
505
|
+
// | match-posix-class
|
506
|
+
// | match-char-set
|
507
|
+
// | match-char-class
|
508
|
+
// | match-char-prop
|
509
|
+
// | match-char
|
510
|
+
// | match-any
|
511
|
+
// | group
|
512
|
+
// | quantified
|
513
|
+
// ;
|
514
|
+
static bool
|
515
|
+
pm_regexp_parse_item(pm_regexp_parser_t *parser) {
|
516
|
+
switch (*parser->cursor++) {
|
517
|
+
case '^':
|
518
|
+
case '$':
|
519
|
+
return true;
|
520
|
+
case '\\':
|
521
|
+
if (!pm_regexp_char_is_eof(parser)) {
|
522
|
+
parser->cursor++;
|
523
|
+
}
|
524
|
+
return pm_regexp_parse_quantifier(parser);
|
525
|
+
case '(':
|
526
|
+
return pm_regexp_parse_group(parser) && pm_regexp_parse_quantifier(parser);
|
527
|
+
case '[':
|
528
|
+
return pm_regexp_parse_lbracket(parser) && pm_regexp_parse_quantifier(parser);
|
529
|
+
default:
|
530
|
+
return pm_regexp_parse_quantifier(parser);
|
531
|
+
}
|
532
|
+
}
|
533
|
+
|
534
|
+
// expression : item+
|
535
|
+
// ;
|
536
|
+
static bool
|
537
|
+
pm_regexp_parse_expression(pm_regexp_parser_t *parser) {
|
538
|
+
if (!pm_regexp_parse_item(parser)) {
|
539
|
+
return false;
|
540
|
+
}
|
541
|
+
|
542
|
+
while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ')' && *parser->cursor != '|') {
|
543
|
+
if (!pm_regexp_parse_item(parser)) {
|
544
|
+
return false;
|
545
|
+
}
|
546
|
+
}
|
547
|
+
|
548
|
+
return true;
|
549
|
+
}
|
550
|
+
|
551
|
+
// pattern : EOF
|
552
|
+
// | expression EOF
|
553
|
+
// | expression '|' pattern
|
554
|
+
// ;
|
555
|
+
static bool
|
556
|
+
pm_regexp_parse_pattern(pm_regexp_parser_t *parser) {
|
557
|
+
return (
|
558
|
+
(
|
559
|
+
// Exit early if the pattern is empty.
|
560
|
+
pm_regexp_char_is_eof(parser) ||
|
561
|
+
// Parse the first expression in the pattern.
|
562
|
+
pm_regexp_parse_expression(parser)
|
563
|
+
) &&
|
564
|
+
(
|
565
|
+
// Return now if we've parsed the entire pattern.
|
566
|
+
pm_regexp_char_is_eof(parser) ||
|
567
|
+
// Otherwise, we should have a pipe character.
|
568
|
+
(pm_regexp_char_expect(parser, '|') && pm_regexp_parse_pattern(parser))
|
569
|
+
)
|
570
|
+
);
|
571
|
+
}
|
572
|
+
|
573
|
+
// Parse a regular expression and extract the names of all of the named capture
|
574
|
+
// groups.
|
575
|
+
PRISM_EXPORTED_FUNCTION bool
|
576
|
+
pm_regexp_named_capture_group_names(const uint8_t *source, size_t size, pm_string_list_t *named_captures, bool encoding_changed, pm_encoding_t *encoding) {
|
577
|
+
pm_regexp_parser_t parser;
|
578
|
+
pm_regexp_parser_init(&parser, source, source + size, named_captures, encoding_changed, encoding);
|
579
|
+
return pm_regexp_parse_pattern(&parser);
|
580
|
+
}
|