prism 0.13.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (95) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +172 -0
  3. data/CODE_OF_CONDUCT.md +76 -0
  4. data/CONTRIBUTING.md +62 -0
  5. data/LICENSE.md +7 -0
  6. data/Makefile +84 -0
  7. data/README.md +89 -0
  8. data/config.yml +2481 -0
  9. data/docs/build_system.md +74 -0
  10. data/docs/building.md +22 -0
  11. data/docs/configuration.md +60 -0
  12. data/docs/design.md +53 -0
  13. data/docs/encoding.md +117 -0
  14. data/docs/fuzzing.md +93 -0
  15. data/docs/heredocs.md +36 -0
  16. data/docs/mapping.md +117 -0
  17. data/docs/ripper.md +36 -0
  18. data/docs/ruby_api.md +25 -0
  19. data/docs/serialization.md +181 -0
  20. data/docs/testing.md +55 -0
  21. data/ext/prism/api_node.c +4725 -0
  22. data/ext/prism/api_pack.c +256 -0
  23. data/ext/prism/extconf.rb +136 -0
  24. data/ext/prism/extension.c +626 -0
  25. data/ext/prism/extension.h +18 -0
  26. data/include/prism/ast.h +1932 -0
  27. data/include/prism/defines.h +45 -0
  28. data/include/prism/diagnostic.h +231 -0
  29. data/include/prism/enc/pm_encoding.h +95 -0
  30. data/include/prism/node.h +41 -0
  31. data/include/prism/pack.h +141 -0
  32. data/include/prism/parser.h +418 -0
  33. data/include/prism/regexp.h +19 -0
  34. data/include/prism/unescape.h +48 -0
  35. data/include/prism/util/pm_buffer.h +51 -0
  36. data/include/prism/util/pm_char.h +91 -0
  37. data/include/prism/util/pm_constant_pool.h +78 -0
  38. data/include/prism/util/pm_list.h +67 -0
  39. data/include/prism/util/pm_memchr.h +14 -0
  40. data/include/prism/util/pm_newline_list.h +61 -0
  41. data/include/prism/util/pm_state_stack.h +24 -0
  42. data/include/prism/util/pm_string.h +61 -0
  43. data/include/prism/util/pm_string_list.h +25 -0
  44. data/include/prism/util/pm_strpbrk.h +29 -0
  45. data/include/prism/version.h +4 -0
  46. data/include/prism.h +82 -0
  47. data/lib/prism/compiler.rb +465 -0
  48. data/lib/prism/debug.rb +157 -0
  49. data/lib/prism/desugar_compiler.rb +206 -0
  50. data/lib/prism/dispatcher.rb +2051 -0
  51. data/lib/prism/dsl.rb +750 -0
  52. data/lib/prism/ffi.rb +251 -0
  53. data/lib/prism/lex_compat.rb +838 -0
  54. data/lib/prism/mutation_compiler.rb +718 -0
  55. data/lib/prism/node.rb +14540 -0
  56. data/lib/prism/node_ext.rb +55 -0
  57. data/lib/prism/node_inspector.rb +68 -0
  58. data/lib/prism/pack.rb +185 -0
  59. data/lib/prism/parse_result/comments.rb +172 -0
  60. data/lib/prism/parse_result/newlines.rb +60 -0
  61. data/lib/prism/parse_result.rb +266 -0
  62. data/lib/prism/pattern.rb +239 -0
  63. data/lib/prism/ripper_compat.rb +174 -0
  64. data/lib/prism/serialize.rb +662 -0
  65. data/lib/prism/visitor.rb +470 -0
  66. data/lib/prism.rb +64 -0
  67. data/prism.gemspec +113 -0
  68. data/src/diagnostic.c +287 -0
  69. data/src/enc/pm_big5.c +52 -0
  70. data/src/enc/pm_euc_jp.c +58 -0
  71. data/src/enc/pm_gbk.c +61 -0
  72. data/src/enc/pm_shift_jis.c +56 -0
  73. data/src/enc/pm_tables.c +507 -0
  74. data/src/enc/pm_unicode.c +2324 -0
  75. data/src/enc/pm_windows_31j.c +56 -0
  76. data/src/node.c +2633 -0
  77. data/src/pack.c +493 -0
  78. data/src/prettyprint.c +2136 -0
  79. data/src/prism.c +14587 -0
  80. data/src/regexp.c +580 -0
  81. data/src/serialize.c +1899 -0
  82. data/src/token_type.c +349 -0
  83. data/src/unescape.c +637 -0
  84. data/src/util/pm_buffer.c +103 -0
  85. data/src/util/pm_char.c +272 -0
  86. data/src/util/pm_constant_pool.c +252 -0
  87. data/src/util/pm_list.c +41 -0
  88. data/src/util/pm_memchr.c +33 -0
  89. data/src/util/pm_newline_list.c +134 -0
  90. data/src/util/pm_state_stack.c +19 -0
  91. data/src/util/pm_string.c +200 -0
  92. data/src/util/pm_string_list.c +29 -0
  93. data/src/util/pm_strncasecmp.c +17 -0
  94. data/src/util/pm_strpbrk.c +66 -0
  95. metadata +138 -0
data/src/regexp.c ADDED
@@ -0,0 +1,580 @@
1
+ #include "prism/regexp.h"
2
+
3
+ // This is the parser that is going to handle parsing regular expressions.
4
+ typedef struct {
5
+ const uint8_t *start;
6
+ const uint8_t *cursor;
7
+ const uint8_t *end;
8
+ pm_string_list_t *named_captures;
9
+ bool encoding_changed;
10
+ pm_encoding_t *encoding;
11
+ } pm_regexp_parser_t;
12
+
13
+ // This initializes a new parser with the given source.
14
+ static void
15
+ pm_regexp_parser_init(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end, pm_string_list_t *named_captures, bool encoding_changed, pm_encoding_t *encoding) {
16
+ *parser = (pm_regexp_parser_t) {
17
+ .start = start,
18
+ .cursor = start,
19
+ .end = end,
20
+ .named_captures = named_captures,
21
+ .encoding_changed = encoding_changed,
22
+ .encoding = encoding
23
+ };
24
+ }
25
+
26
+ // This appends a new string to the list of named captures.
27
+ static void
28
+ pm_regexp_parser_named_capture(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end) {
29
+ pm_string_t string;
30
+ pm_string_shared_init(&string, start, end);
31
+ pm_string_list_append(parser->named_captures, &string);
32
+ pm_string_free(&string);
33
+ }
34
+
35
+ // Returns true if the next character is the end of the source.
36
+ static inline bool
37
+ pm_regexp_char_is_eof(pm_regexp_parser_t *parser) {
38
+ return parser->cursor >= parser->end;
39
+ }
40
+
41
+ // Optionally accept a char and consume it if it exists.
42
+ static inline bool
43
+ pm_regexp_char_accept(pm_regexp_parser_t *parser, uint8_t value) {
44
+ if (!pm_regexp_char_is_eof(parser) && *parser->cursor == value) {
45
+ parser->cursor++;
46
+ return true;
47
+ }
48
+ return false;
49
+ }
50
+
51
+ // Expect a character to be present and consume it.
52
+ static inline bool
53
+ pm_regexp_char_expect(pm_regexp_parser_t *parser, uint8_t value) {
54
+ if (!pm_regexp_char_is_eof(parser) && *parser->cursor == value) {
55
+ parser->cursor++;
56
+ return true;
57
+ }
58
+ return false;
59
+ }
60
+
61
+ // This advances the current token to the next instance of the given character.
62
+ static bool
63
+ pm_regexp_char_find(pm_regexp_parser_t *parser, uint8_t value) {
64
+ if (pm_regexp_char_is_eof(parser)) {
65
+ return false;
66
+ }
67
+
68
+ const uint8_t *end = (const uint8_t *) pm_memchr(parser->cursor, value, (size_t) (parser->end - parser->cursor), parser->encoding_changed, parser->encoding);
69
+ if (end == NULL) {
70
+ return false;
71
+ }
72
+
73
+ parser->cursor = end + 1;
74
+ return true;
75
+ }
76
+
77
+ // Range quantifiers are a special class of quantifiers that look like
78
+ //
79
+ // * {digit}
80
+ // * {digit,}
81
+ // * {digit,digit}
82
+ // * {,digit}
83
+ //
84
+ // Unfortunately, if there are any spaces in between, then this just becomes a
85
+ // regular character match expression and we have to backtrack. So when this
86
+ // function first starts running, we'll create a "save" point and then attempt
87
+ // to parse the quantifier. If it fails, we'll restore the save point and
88
+ // return.
89
+ //
90
+ // The properly track everything, we're going to build a little state machine.
91
+ // It looks something like the following:
92
+ //
93
+ // ┌───────┐ ┌─────────┐ ────────────┐
94
+ // ──── lbrace ───> │ start │ ──── digit ───> │ minimum │ │
95
+ // └───────┘ └─────────┘ <─── digit ─┘
96
+ // │ │ │
97
+ // ┌───────┐ │ │ rbrace
98
+ // │ comma │ <───── comma ┌──── comma ───────┘ │
99
+ // └───────┘ V V
100
+ // │ ┌─────────┐ ┌─────────┐
101
+ // └── digit ──> │ maximum │ ── rbrace ──> │| final |│
102
+ // └─────────┘ └─────────┘
103
+ // │ ^
104
+ // └─ digit ─┘
105
+ //
106
+ // Note that by the time we've hit this function, the lbrace has already been
107
+ // consumed so we're in the start state.
108
+ static bool
109
+ pm_regexp_parse_range_quantifier(pm_regexp_parser_t *parser) {
110
+ const uint8_t *savepoint = parser->cursor;
111
+
112
+ enum {
113
+ PM_REGEXP_RANGE_QUANTIFIER_STATE_START,
114
+ PM_REGEXP_RANGE_QUANTIFIER_STATE_MINIMUM,
115
+ PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM,
116
+ PM_REGEXP_RANGE_QUANTIFIER_STATE_COMMA
117
+ } state = PM_REGEXP_RANGE_QUANTIFIER_STATE_START;
118
+
119
+ while (1) {
120
+ switch (state) {
121
+ case PM_REGEXP_RANGE_QUANTIFIER_STATE_START:
122
+ switch (*parser->cursor) {
123
+ case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
124
+ parser->cursor++;
125
+ state = PM_REGEXP_RANGE_QUANTIFIER_STATE_MINIMUM;
126
+ break;
127
+ case ',':
128
+ parser->cursor++;
129
+ state = PM_REGEXP_RANGE_QUANTIFIER_STATE_COMMA;
130
+ break;
131
+ default:
132
+ parser->cursor = savepoint;
133
+ return true;
134
+ }
135
+ break;
136
+ case PM_REGEXP_RANGE_QUANTIFIER_STATE_MINIMUM:
137
+ switch (*parser->cursor) {
138
+ case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
139
+ parser->cursor++;
140
+ break;
141
+ case ',':
142
+ parser->cursor++;
143
+ state = PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM;
144
+ break;
145
+ case '}':
146
+ parser->cursor++;
147
+ return true;
148
+ default:
149
+ parser->cursor = savepoint;
150
+ return true;
151
+ }
152
+ break;
153
+ case PM_REGEXP_RANGE_QUANTIFIER_STATE_COMMA:
154
+ switch (*parser->cursor) {
155
+ case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
156
+ parser->cursor++;
157
+ state = PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM;
158
+ break;
159
+ default:
160
+ parser->cursor = savepoint;
161
+ return true;
162
+ }
163
+ break;
164
+ case PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM:
165
+ switch (*parser->cursor) {
166
+ case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
167
+ parser->cursor++;
168
+ break;
169
+ case '}':
170
+ parser->cursor++;
171
+ return true;
172
+ default:
173
+ parser->cursor = savepoint;
174
+ return true;
175
+ }
176
+ break;
177
+ }
178
+ }
179
+
180
+ return true;
181
+ }
182
+
183
+ // quantifier : star-quantifier
184
+ // | plus-quantifier
185
+ // | optional-quantifier
186
+ // | range-quantifier
187
+ // | <empty>
188
+ // ;
189
+ static bool
190
+ pm_regexp_parse_quantifier(pm_regexp_parser_t *parser) {
191
+ switch (*parser->cursor) {
192
+ case '*':
193
+ case '+':
194
+ case '?':
195
+ parser->cursor++;
196
+ return true;
197
+ case '{':
198
+ parser->cursor++;
199
+ return pm_regexp_parse_range_quantifier(parser);
200
+ default:
201
+ // In this case there is no quantifier.
202
+ return true;
203
+ }
204
+ }
205
+
206
+ // match-posix-class : '[' '[' ':' '^'? CHAR+ ':' ']' ']'
207
+ // ;
208
+ static bool
209
+ pm_regexp_parse_posix_class(pm_regexp_parser_t *parser) {
210
+ if (!pm_regexp_char_expect(parser, ':')) {
211
+ return false;
212
+ }
213
+
214
+ pm_regexp_char_accept(parser, '^');
215
+
216
+ return (
217
+ pm_regexp_char_find(parser, ':') &&
218
+ pm_regexp_char_expect(parser, ']') &&
219
+ pm_regexp_char_expect(parser, ']')
220
+ );
221
+ }
222
+
223
+ // Forward declaration because character sets can be nested.
224
+ static bool
225
+ pm_regexp_parse_lbracket(pm_regexp_parser_t *parser);
226
+
227
+ // match-char-set : '[' '^'? (match-range | match-char)* ']'
228
+ // ;
229
+ static bool
230
+ pm_regexp_parse_character_set(pm_regexp_parser_t *parser) {
231
+ pm_regexp_char_accept(parser, '^');
232
+
233
+ while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ']') {
234
+ switch (*parser->cursor++) {
235
+ case '[':
236
+ pm_regexp_parse_lbracket(parser);
237
+ break;
238
+ case '\\':
239
+ if (!pm_regexp_char_is_eof(parser)) {
240
+ parser->cursor++;
241
+ }
242
+ break;
243
+ default:
244
+ // do nothing, we've already advanced the cursor
245
+ break;
246
+ }
247
+ }
248
+
249
+ return pm_regexp_char_expect(parser, ']');
250
+ }
251
+
252
+ // A left bracket can either mean a POSIX class or a character set.
253
+ static bool
254
+ pm_regexp_parse_lbracket(pm_regexp_parser_t *parser) {
255
+ const uint8_t *reset = parser->cursor;
256
+
257
+ if ((parser->cursor + 2 < parser->end) && parser->cursor[0] == '[' && parser->cursor[1] == ':') {
258
+ parser->cursor++;
259
+ if (pm_regexp_parse_posix_class(parser)) return true;
260
+
261
+ parser->cursor = reset;
262
+ }
263
+
264
+ return pm_regexp_parse_character_set(parser);
265
+ }
266
+
267
+ // Forward declaration here since parsing groups needs to go back up the grammar
268
+ // to parse expressions within them.
269
+ static bool
270
+ pm_regexp_parse_expression(pm_regexp_parser_t *parser);
271
+
272
// These are the states that a single option can be in, either on the regular
// expression itself or from within a group.
typedef enum {
    // The slot does not correspond to an option. This must remain 0 because
    // the option table is initialized with memset.
    PM_REGEXP_OPTION_STATE_INVALID = 0,

    // The option can be both turned on and turned off.
    PM_REGEXP_OPTION_STATE_TOGGLEABLE = 1,

    // The option can be turned on but not turned off.
    PM_REGEXP_OPTION_STATE_ADDABLE = 2,

    // The option has been turned on.
    PM_REGEXP_OPTION_STATE_ADDED = 3,

    // The option has been turned off.
    PM_REGEXP_OPTION_STATE_REMOVED = 4
} pm_regexp_option_state_t;
281
+
282
// Option states are stored in a table indexed by the option's letter. These
// are the lowest and highest letters that have a slot in that table.
#define PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM 'a'
#define PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM 'x'

// The number of slots needed to cover every letter from the minimum through
// the maximum, inclusive.
#define PRISM_REGEXP_OPTION_STATE_SLOTS (PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM + 1)
287
+
288
+ // This is the set of options that are configurable on the regular expression.
289
+ typedef struct {
290
+ uint8_t values[PRISM_REGEXP_OPTION_STATE_SLOTS];
291
+ } pm_regexp_options_t;
292
+
293
+ // Initialize a new set of options to their default values.
294
+ static void
295
+ pm_regexp_options_init(pm_regexp_options_t *options) {
296
+ memset(options, PM_REGEXP_OPTION_STATE_INVALID, sizeof(uint8_t) * PRISM_REGEXP_OPTION_STATE_SLOTS);
297
+ options->values['i' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_TOGGLEABLE;
298
+ options->values['m' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_TOGGLEABLE;
299
+ options->values['x' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_TOGGLEABLE;
300
+ options->values['d' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_ADDABLE;
301
+ options->values['a' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_ADDABLE;
302
+ options->values['u' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_ADDABLE;
303
+ }
304
+
305
+ // Attempt to add the given option to the set of options. Returns true if it was
306
+ // added, false if it was already present.
307
+ static bool
308
+ pm_regexp_options_add(pm_regexp_options_t *options, uint8_t key) {
309
+ if (key >= PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM) {
310
+ key = (uint8_t) (key - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM);
311
+
312
+ switch (options->values[key]) {
313
+ case PM_REGEXP_OPTION_STATE_INVALID:
314
+ case PM_REGEXP_OPTION_STATE_REMOVED:
315
+ return false;
316
+ case PM_REGEXP_OPTION_STATE_TOGGLEABLE:
317
+ case PM_REGEXP_OPTION_STATE_ADDABLE:
318
+ options->values[key] = PM_REGEXP_OPTION_STATE_ADDED;
319
+ return true;
320
+ case PM_REGEXP_OPTION_STATE_ADDED:
321
+ return true;
322
+ }
323
+ }
324
+
325
+ return false;
326
+ }
327
+
328
+ // Attempt to remove the given option from the set of options. Returns true if
329
+ // it was removed, false if it was already absent.
330
+ static bool
331
+ pm_regexp_options_remove(pm_regexp_options_t *options, uint8_t key) {
332
+ if (key >= PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM) {
333
+ key = (uint8_t) (key - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM);
334
+
335
+ switch (options->values[key]) {
336
+ case PM_REGEXP_OPTION_STATE_INVALID:
337
+ case PM_REGEXP_OPTION_STATE_ADDABLE:
338
+ return false;
339
+ case PM_REGEXP_OPTION_STATE_TOGGLEABLE:
340
+ case PM_REGEXP_OPTION_STATE_ADDED:
341
+ case PM_REGEXP_OPTION_STATE_REMOVED:
342
+ options->values[key] = PM_REGEXP_OPTION_STATE_REMOVED;
343
+ return true;
344
+ }
345
+ }
346
+
347
+ return false;
348
+ }
349
+
350
+ // Groups can have quite a few different patterns for syntax. They basically
351
+ // just wrap a set of expressions, but they can potentially have options after a
352
+ // question mark. If there _isn't_ a question mark, then it's just a set of
353
+ // expressions. If there _is_, then here are the options:
354
+ //
355
+ // * (?#...) - inline comments
356
+ // * (?:subexp) - non-capturing group
357
+ // * (?=subexp) - positive lookahead
358
+ // * (?!subexp) - negative lookahead
359
+ // * (?>subexp) - atomic group
360
+ // * (?~subexp) - absence operator
361
+ // * (?<=subexp) - positive lookbehind
362
+ // * (?<!subexp) - negative lookbehind
363
+ // * (?<name>subexp) - named capturing group
364
+ // * (?'name'subexp) - named capturing group
365
+ // * (?(cond)yes-subexp) - conditional expression
366
+ // * (?(cond)yes-subexp|no-subexp) - conditional expression
367
+ // * (?imxdau-imx) - turn on and off configuration
368
+ // * (?imxdau-imx:subexp) - turn on and off configuration for an expression
369
+ //
370
+ static bool
371
+ pm_regexp_parse_group(pm_regexp_parser_t *parser) {
372
+ // First, parse any options for the group.
373
+ if (pm_regexp_char_accept(parser, '?')) {
374
+ if (pm_regexp_char_is_eof(parser)) {
375
+ return false;
376
+ }
377
+ pm_regexp_options_t options;
378
+ pm_regexp_options_init(&options);
379
+
380
+ switch (*parser->cursor) {
381
+ case '#': { // inline comments
382
+ if (parser->encoding_changed && parser->encoding->multibyte) {
383
+ bool escaped = false;
384
+
385
+ // Here we're going to take a slow path and iterate through
386
+ // each multibyte character to find the close paren. We do
387
+ // this because \ can be a trailing byte in some encodings.
388
+ while (parser->cursor < parser->end) {
389
+ if (!escaped && *parser->cursor == ')') {
390
+ parser->cursor++;
391
+ return true;
392
+ }
393
+
394
+ size_t width = parser->encoding->char_width(parser->cursor, (ptrdiff_t) (parser->end - parser->cursor));
395
+ if (width == 0) return false;
396
+
397
+ escaped = (width == 1) && (*parser->cursor == '\\');
398
+ parser->cursor += width;
399
+ }
400
+
401
+ return false;
402
+ } else {
403
+ // Here we can take the fast path and use memchr to find the
404
+ // next ) because we are safe checking backward for \ since
405
+ // it cannot be a trailing character.
406
+ bool found = pm_regexp_char_find(parser, ')');
407
+
408
+ while (found && (parser->start <= parser->cursor - 2) && (*(parser->cursor - 2) == '\\')) {
409
+ found = pm_regexp_char_find(parser, ')');
410
+ }
411
+
412
+ return found;
413
+ }
414
+ }
415
+ case ':': // non-capturing group
416
+ case '=': // positive lookahead
417
+ case '!': // negative lookahead
418
+ case '>': // atomic group
419
+ case '~': // absence operator
420
+ parser->cursor++;
421
+ break;
422
+ case '<':
423
+ parser->cursor++;
424
+ if (pm_regexp_char_is_eof(parser)) {
425
+ return false;
426
+ }
427
+
428
+ switch (*parser->cursor) {
429
+ case '=': // positive lookbehind
430
+ case '!': // negative lookbehind
431
+ parser->cursor++;
432
+ break;
433
+ default: { // named capture group
434
+ const uint8_t *start = parser->cursor;
435
+ if (!pm_regexp_char_find(parser, '>')) {
436
+ return false;
437
+ }
438
+ pm_regexp_parser_named_capture(parser, start, parser->cursor - 1);
439
+ break;
440
+ }
441
+ }
442
+ break;
443
+ case '\'': { // named capture group
444
+ const uint8_t *start = ++parser->cursor;
445
+ if (!pm_regexp_char_find(parser, '\'')) {
446
+ return false;
447
+ }
448
+
449
+ pm_regexp_parser_named_capture(parser, start, parser->cursor - 1);
450
+ break;
451
+ }
452
+ case '(': // conditional expression
453
+ if (!pm_regexp_char_find(parser, ')')) {
454
+ return false;
455
+ }
456
+ break;
457
+ case 'i': case 'm': case 'x': case 'd': case 'a': case 'u': // options
458
+ while (!pm_regexp_char_is_eof(parser) && *parser->cursor != '-' && *parser->cursor != ':' && *parser->cursor != ')') {
459
+ if (!pm_regexp_options_add(&options, *parser->cursor)) {
460
+ return false;
461
+ }
462
+ parser->cursor++;
463
+ }
464
+
465
+ if (pm_regexp_char_is_eof(parser)) {
466
+ return false;
467
+ }
468
+
469
+ // If we hit a -, then we're done parsing options.
470
+ if (*parser->cursor != '-') break;
471
+
472
+ // Otherwise, fallthrough to the - case.
473
+ /* fallthrough */
474
+ case '-':
475
+ parser->cursor++;
476
+ while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ':' && *parser->cursor != ')') {
477
+ if (!pm_regexp_options_remove(&options, *parser->cursor)) {
478
+ return false;
479
+ }
480
+ parser->cursor++;
481
+ }
482
+
483
+ if (pm_regexp_char_is_eof(parser)) {
484
+ return false;
485
+ }
486
+ break;
487
+ default:
488
+ return false;
489
+ }
490
+ }
491
+
492
+ // Now, parse the expressions within this group.
493
+ while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ')') {
494
+ if (!pm_regexp_parse_expression(parser)) {
495
+ return false;
496
+ }
497
+ pm_regexp_char_accept(parser, '|');
498
+ }
499
+
500
+ // Finally, make sure we have a closing parenthesis.
501
+ return pm_regexp_char_expect(parser, ')');
502
+ }
503
+
504
+ // item : anchor
505
+ // | match-posix-class
506
+ // | match-char-set
507
+ // | match-char-class
508
+ // | match-char-prop
509
+ // | match-char
510
+ // | match-any
511
+ // | group
512
+ // | quantified
513
+ // ;
514
+ static bool
515
+ pm_regexp_parse_item(pm_regexp_parser_t *parser) {
516
+ switch (*parser->cursor++) {
517
+ case '^':
518
+ case '$':
519
+ return true;
520
+ case '\\':
521
+ if (!pm_regexp_char_is_eof(parser)) {
522
+ parser->cursor++;
523
+ }
524
+ return pm_regexp_parse_quantifier(parser);
525
+ case '(':
526
+ return pm_regexp_parse_group(parser) && pm_regexp_parse_quantifier(parser);
527
+ case '[':
528
+ return pm_regexp_parse_lbracket(parser) && pm_regexp_parse_quantifier(parser);
529
+ default:
530
+ return pm_regexp_parse_quantifier(parser);
531
+ }
532
+ }
533
+
534
+ // expression : item+
535
+ // ;
536
+ static bool
537
+ pm_regexp_parse_expression(pm_regexp_parser_t *parser) {
538
+ if (!pm_regexp_parse_item(parser)) {
539
+ return false;
540
+ }
541
+
542
+ while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ')' && *parser->cursor != '|') {
543
+ if (!pm_regexp_parse_item(parser)) {
544
+ return false;
545
+ }
546
+ }
547
+
548
+ return true;
549
+ }
550
+
551
+ // pattern : EOF
552
+ // | expression EOF
553
+ // | expression '|' pattern
554
+ // ;
555
+ static bool
556
+ pm_regexp_parse_pattern(pm_regexp_parser_t *parser) {
557
+ return (
558
+ (
559
+ // Exit early if the pattern is empty.
560
+ pm_regexp_char_is_eof(parser) ||
561
+ // Parse the first expression in the pattern.
562
+ pm_regexp_parse_expression(parser)
563
+ ) &&
564
+ (
565
+ // Return now if we've parsed the entire pattern.
566
+ pm_regexp_char_is_eof(parser) ||
567
+ // Otherwise, we should have a pipe character.
568
+ (pm_regexp_char_expect(parser, '|') && pm_regexp_parse_pattern(parser))
569
+ )
570
+ );
571
+ }
572
+
573
+ // Parse a regular expression and extract the names of all of the named capture
574
+ // groups.
575
+ PRISM_EXPORTED_FUNCTION bool
576
+ pm_regexp_named_capture_group_names(const uint8_t *source, size_t size, pm_string_list_t *named_captures, bool encoding_changed, pm_encoding_t *encoding) {
577
+ pm_regexp_parser_t parser;
578
+ pm_regexp_parser_init(&parser, source, source + size, named_captures, encoding_changed, encoding);
579
+ return pm_regexp_parse_pattern(&parser);
580
+ }