jruby-prism-parser 0.23.0.pre.SNAPSHOT-java

Sign up to get free protection for your applications and to get access to all the features.
Files changed (110) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +401 -0
  3. data/CODE_OF_CONDUCT.md +76 -0
  4. data/CONTRIBUTING.md +62 -0
  5. data/LICENSE.md +7 -0
  6. data/Makefile +101 -0
  7. data/README.md +98 -0
  8. data/config.yml +2902 -0
  9. data/docs/build_system.md +91 -0
  10. data/docs/configuration.md +64 -0
  11. data/docs/cruby_compilation.md +27 -0
  12. data/docs/design.md +53 -0
  13. data/docs/encoding.md +121 -0
  14. data/docs/fuzzing.md +88 -0
  15. data/docs/heredocs.md +36 -0
  16. data/docs/javascript.md +118 -0
  17. data/docs/local_variable_depth.md +229 -0
  18. data/docs/mapping.md +117 -0
  19. data/docs/parser_translation.md +34 -0
  20. data/docs/parsing_rules.md +19 -0
  21. data/docs/releasing.md +98 -0
  22. data/docs/ripper.md +36 -0
  23. data/docs/ruby_api.md +43 -0
  24. data/docs/ruby_parser_translation.md +19 -0
  25. data/docs/serialization.md +209 -0
  26. data/docs/testing.md +55 -0
  27. data/ext/prism/api_node.c +5098 -0
  28. data/ext/prism/api_pack.c +267 -0
  29. data/ext/prism/extconf.rb +110 -0
  30. data/ext/prism/extension.c +1155 -0
  31. data/ext/prism/extension.h +18 -0
  32. data/include/prism/ast.h +5807 -0
  33. data/include/prism/defines.h +102 -0
  34. data/include/prism/diagnostic.h +339 -0
  35. data/include/prism/encoding.h +265 -0
  36. data/include/prism/node.h +57 -0
  37. data/include/prism/options.h +230 -0
  38. data/include/prism/pack.h +152 -0
  39. data/include/prism/parser.h +732 -0
  40. data/include/prism/prettyprint.h +26 -0
  41. data/include/prism/regexp.h +33 -0
  42. data/include/prism/util/pm_buffer.h +155 -0
  43. data/include/prism/util/pm_char.h +205 -0
  44. data/include/prism/util/pm_constant_pool.h +209 -0
  45. data/include/prism/util/pm_list.h +97 -0
  46. data/include/prism/util/pm_memchr.h +29 -0
  47. data/include/prism/util/pm_newline_list.h +93 -0
  48. data/include/prism/util/pm_state_stack.h +42 -0
  49. data/include/prism/util/pm_string.h +150 -0
  50. data/include/prism/util/pm_string_list.h +44 -0
  51. data/include/prism/util/pm_strncasecmp.h +32 -0
  52. data/include/prism/util/pm_strpbrk.h +46 -0
  53. data/include/prism/version.h +29 -0
  54. data/include/prism.h +289 -0
  55. data/jruby-prism.jar +0 -0
  56. data/lib/prism/compiler.rb +486 -0
  57. data/lib/prism/debug.rb +206 -0
  58. data/lib/prism/desugar_compiler.rb +207 -0
  59. data/lib/prism/dispatcher.rb +2150 -0
  60. data/lib/prism/dot_visitor.rb +4634 -0
  61. data/lib/prism/dsl.rb +785 -0
  62. data/lib/prism/ffi.rb +346 -0
  63. data/lib/prism/lex_compat.rb +908 -0
  64. data/lib/prism/mutation_compiler.rb +753 -0
  65. data/lib/prism/node.rb +17864 -0
  66. data/lib/prism/node_ext.rb +212 -0
  67. data/lib/prism/node_inspector.rb +68 -0
  68. data/lib/prism/pack.rb +224 -0
  69. data/lib/prism/parse_result/comments.rb +177 -0
  70. data/lib/prism/parse_result/newlines.rb +64 -0
  71. data/lib/prism/parse_result.rb +498 -0
  72. data/lib/prism/pattern.rb +250 -0
  73. data/lib/prism/serialize.rb +1354 -0
  74. data/lib/prism/translation/parser/compiler.rb +1838 -0
  75. data/lib/prism/translation/parser/lexer.rb +335 -0
  76. data/lib/prism/translation/parser/rubocop.rb +37 -0
  77. data/lib/prism/translation/parser.rb +178 -0
  78. data/lib/prism/translation/ripper.rb +577 -0
  79. data/lib/prism/translation/ruby_parser.rb +1521 -0
  80. data/lib/prism/translation.rb +11 -0
  81. data/lib/prism/version.rb +3 -0
  82. data/lib/prism/visitor.rb +495 -0
  83. data/lib/prism.rb +99 -0
  84. data/prism.gemspec +135 -0
  85. data/rbi/prism.rbi +7767 -0
  86. data/rbi/prism_static.rbi +207 -0
  87. data/sig/prism.rbs +4773 -0
  88. data/sig/prism_static.rbs +201 -0
  89. data/src/diagnostic.c +400 -0
  90. data/src/encoding.c +5132 -0
  91. data/src/node.c +2786 -0
  92. data/src/options.c +213 -0
  93. data/src/pack.c +493 -0
  94. data/src/prettyprint.c +8881 -0
  95. data/src/prism.c +18406 -0
  96. data/src/regexp.c +638 -0
  97. data/src/serialize.c +1554 -0
  98. data/src/token_type.c +700 -0
  99. data/src/util/pm_buffer.c +190 -0
  100. data/src/util/pm_char.c +318 -0
  101. data/src/util/pm_constant_pool.c +322 -0
  102. data/src/util/pm_list.c +49 -0
  103. data/src/util/pm_memchr.c +35 -0
  104. data/src/util/pm_newline_list.c +84 -0
  105. data/src/util/pm_state_stack.c +25 -0
  106. data/src/util/pm_string.c +203 -0
  107. data/src/util/pm_string_list.c +28 -0
  108. data/src/util/pm_strncasecmp.c +24 -0
  109. data/src/util/pm_strpbrk.c +180 -0
  110. metadata +156 -0
data/src/regexp.c ADDED
@@ -0,0 +1,638 @@
1
+ #include "prism/regexp.h"
2
+
3
+ /**
4
+ * This is the parser that is going to handle parsing regular expressions.
5
+ */
6
+ typedef struct {
7
+ /** A pointer to the start of the source that we are parsing. */
8
+ const uint8_t *start;
9
+
10
+ /** A pointer to the current position in the source. */
11
+ const uint8_t *cursor;
12
+
13
+ /** A pointer to the end of the source that we are parsing. */
14
+ const uint8_t *end;
15
+
16
+ /** A list of named captures that we've found. */
17
+ pm_string_list_t *named_captures;
18
+
19
+ /** Whether the encoding has changed from the default. */
20
+ bool encoding_changed;
21
+
22
+ /** The encoding of the source. */
23
+ const pm_encoding_t *encoding;
24
+ } pm_regexp_parser_t;
25
+
26
+ /**
27
+ * This initializes a new parser with the given source.
28
+ */
29
+ static void
30
+ pm_regexp_parser_init(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end, pm_string_list_t *named_captures, bool encoding_changed, const pm_encoding_t *encoding) {
31
+ *parser = (pm_regexp_parser_t) {
32
+ .start = start,
33
+ .cursor = start,
34
+ .end = end,
35
+ .named_captures = named_captures,
36
+ .encoding_changed = encoding_changed,
37
+ .encoding = encoding
38
+ };
39
+ }
40
+
41
+ /**
42
+ * This appends a new string to the list of named captures.
43
+ */
44
+ static void
45
+ pm_regexp_parser_named_capture(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end) {
46
+ pm_string_t string;
47
+ pm_string_shared_init(&string, start, end);
48
+ pm_string_list_append(parser->named_captures, &string);
49
+ pm_string_free(&string);
50
+ }
51
+
52
+ /**
53
+ * Returns true if the next character is the end of the source.
54
+ */
55
+ static inline bool
56
+ pm_regexp_char_is_eof(pm_regexp_parser_t *parser) {
57
+ return parser->cursor >= parser->end;
58
+ }
59
+
60
+ /**
61
+ * Optionally accept a char and consume it if it exists.
62
+ */
63
+ static inline bool
64
+ pm_regexp_char_accept(pm_regexp_parser_t *parser, uint8_t value) {
65
+ if (!pm_regexp_char_is_eof(parser) && *parser->cursor == value) {
66
+ parser->cursor++;
67
+ return true;
68
+ }
69
+ return false;
70
+ }
71
+
72
+ /**
73
+ * Expect a character to be present and consume it.
74
+ */
75
+ static inline bool
76
+ pm_regexp_char_expect(pm_regexp_parser_t *parser, uint8_t value) {
77
+ if (!pm_regexp_char_is_eof(parser) && *parser->cursor == value) {
78
+ parser->cursor++;
79
+ return true;
80
+ }
81
+ return false;
82
+ }
83
+
84
+ /**
85
+ * This advances the current token to the next instance of the given character.
86
+ */
87
+ static bool
88
+ pm_regexp_char_find(pm_regexp_parser_t *parser, uint8_t value) {
89
+ if (pm_regexp_char_is_eof(parser)) {
90
+ return false;
91
+ }
92
+
93
+ const uint8_t *end = (const uint8_t *) pm_memchr(parser->cursor, value, (size_t) (parser->end - parser->cursor), parser->encoding_changed, parser->encoding);
94
+ if (end == NULL) {
95
+ return false;
96
+ }
97
+
98
+ parser->cursor = end + 1;
99
+ return true;
100
+ }
101
+
102
+ /**
103
+ * Range quantifiers are a special class of quantifiers that look like
104
+ *
105
+ * * {digit}
106
+ * * {digit,}
107
+ * * {digit,digit}
108
+ * * {,digit}
109
+ *
110
+ * Unfortunately, if there are any spaces in between, then this just becomes a
111
+ * regular character match expression and we have to backtrack. So when this
112
+ * function first starts running, we'll create a "save" point and then attempt
113
+ * to parse the quantifier. If it fails, we'll restore the save point and
114
+ * return.
115
+ *
116
+ * The properly track everything, we're going to build a little state machine.
117
+ * It looks something like the following:
118
+ *
119
+ * +-------+ +---------+ ------------+
120
+ * ---- lbrace ---> | start | ---- digit ---> | minimum | |
121
+ * +-------+ +---------+ <--- digit -+
122
+ * | | |
123
+ * +-------+ | | rbrace
124
+ * | comma | <----- comma +---- comma -------+ |
125
+ * +-------+ V V
126
+ * | +---------+ +---------+
127
+ * +-- digit --> | maximum | -- rbrace --> || final ||
128
+ * +---------+ +---------+
129
+ * | ^
130
+ * +- digit -+
131
+ *
132
+ * Note that by the time we've hit this function, the lbrace has already been
133
+ * consumed so we're in the start state.
134
+ */
135
+ static bool
136
+ pm_regexp_parse_range_quantifier(pm_regexp_parser_t *parser) {
137
+ const uint8_t *savepoint = parser->cursor;
138
+
139
+ enum {
140
+ PM_REGEXP_RANGE_QUANTIFIER_STATE_START,
141
+ PM_REGEXP_RANGE_QUANTIFIER_STATE_MINIMUM,
142
+ PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM,
143
+ PM_REGEXP_RANGE_QUANTIFIER_STATE_COMMA
144
+ } state = PM_REGEXP_RANGE_QUANTIFIER_STATE_START;
145
+
146
+ while (1) {
147
+ switch (state) {
148
+ case PM_REGEXP_RANGE_QUANTIFIER_STATE_START:
149
+ switch (*parser->cursor) {
150
+ case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
151
+ parser->cursor++;
152
+ state = PM_REGEXP_RANGE_QUANTIFIER_STATE_MINIMUM;
153
+ break;
154
+ case ',':
155
+ parser->cursor++;
156
+ state = PM_REGEXP_RANGE_QUANTIFIER_STATE_COMMA;
157
+ break;
158
+ default:
159
+ parser->cursor = savepoint;
160
+ return true;
161
+ }
162
+ break;
163
+ case PM_REGEXP_RANGE_QUANTIFIER_STATE_MINIMUM:
164
+ switch (*parser->cursor) {
165
+ case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
166
+ parser->cursor++;
167
+ break;
168
+ case ',':
169
+ parser->cursor++;
170
+ state = PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM;
171
+ break;
172
+ case '}':
173
+ parser->cursor++;
174
+ return true;
175
+ default:
176
+ parser->cursor = savepoint;
177
+ return true;
178
+ }
179
+ break;
180
+ case PM_REGEXP_RANGE_QUANTIFIER_STATE_COMMA:
181
+ switch (*parser->cursor) {
182
+ case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
183
+ parser->cursor++;
184
+ state = PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM;
185
+ break;
186
+ default:
187
+ parser->cursor = savepoint;
188
+ return true;
189
+ }
190
+ break;
191
+ case PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM:
192
+ switch (*parser->cursor) {
193
+ case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
194
+ parser->cursor++;
195
+ break;
196
+ case '}':
197
+ parser->cursor++;
198
+ return true;
199
+ default:
200
+ parser->cursor = savepoint;
201
+ return true;
202
+ }
203
+ break;
204
+ }
205
+ }
206
+
207
+ return true;
208
+ }
209
+
210
+ /**
211
+ * quantifier : star-quantifier
212
+ * | plus-quantifier
213
+ * | optional-quantifier
214
+ * | range-quantifier
215
+ * | <empty>
216
+ * ;
217
+ */
218
+ static bool
219
+ pm_regexp_parse_quantifier(pm_regexp_parser_t *parser) {
220
+ if (pm_regexp_char_is_eof(parser)) return true;
221
+
222
+ switch (*parser->cursor) {
223
+ case '*':
224
+ case '+':
225
+ case '?':
226
+ parser->cursor++;
227
+ return true;
228
+ case '{':
229
+ parser->cursor++;
230
+ return pm_regexp_parse_range_quantifier(parser);
231
+ default:
232
+ // In this case there is no quantifier.
233
+ return true;
234
+ }
235
+ }
236
+
237
+ /**
238
+ * match-posix-class : '[' '[' ':' '^'? CHAR+ ':' ']' ']'
239
+ * ;
240
+ */
241
+ static bool
242
+ pm_regexp_parse_posix_class(pm_regexp_parser_t *parser) {
243
+ if (!pm_regexp_char_expect(parser, ':')) {
244
+ return false;
245
+ }
246
+
247
+ pm_regexp_char_accept(parser, '^');
248
+
249
+ return (
250
+ pm_regexp_char_find(parser, ':') &&
251
+ pm_regexp_char_expect(parser, ']') &&
252
+ pm_regexp_char_expect(parser, ']')
253
+ );
254
+ }
255
+
256
+ // Forward declaration because character sets can be nested.
257
+ static bool
258
+ pm_regexp_parse_lbracket(pm_regexp_parser_t *parser);
259
+
260
+ /**
261
+ * match-char-set : '[' '^'? (match-range | match-char)* ']'
262
+ * ;
263
+ */
264
+ static bool
265
+ pm_regexp_parse_character_set(pm_regexp_parser_t *parser) {
266
+ pm_regexp_char_accept(parser, '^');
267
+
268
+ while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ']') {
269
+ switch (*parser->cursor++) {
270
+ case '[':
271
+ pm_regexp_parse_lbracket(parser);
272
+ break;
273
+ case '\\':
274
+ if (!pm_regexp_char_is_eof(parser)) {
275
+ parser->cursor++;
276
+ }
277
+ break;
278
+ default:
279
+ // do nothing, we've already advanced the cursor
280
+ break;
281
+ }
282
+ }
283
+
284
+ return pm_regexp_char_expect(parser, ']');
285
+ }
286
+
287
+ /**
288
+ * A left bracket can either mean a POSIX class or a character set.
289
+ */
290
+ static bool
291
+ pm_regexp_parse_lbracket(pm_regexp_parser_t *parser) {
292
+ const uint8_t *reset = parser->cursor;
293
+
294
+ if ((parser->cursor + 2 < parser->end) && parser->cursor[0] == '[' && parser->cursor[1] == ':') {
295
+ parser->cursor++;
296
+ if (pm_regexp_parse_posix_class(parser)) return true;
297
+
298
+ parser->cursor = reset;
299
+ }
300
+
301
+ return pm_regexp_parse_character_set(parser);
302
+ }
303
+
304
+ // Forward declaration here since parsing groups needs to go back up the grammar
305
+ // to parse expressions within them.
306
+ static bool
307
+ pm_regexp_parse_expression(pm_regexp_parser_t *parser);
308
+
309
/**
 * The state that a single option letter can be in while the options of a
 * regular expression (or of a group within it) are being parsed.
 */
typedef enum {
    /** The letter is not an option at all. */
    PM_REGEXP_OPTION_STATE_INVALID = 0,

    /** The option has not been seen yet and may be either added or removed. */
    PM_REGEXP_OPTION_STATE_TOGGLEABLE = 1,

    /** The option has not been seen yet and may only be added. */
    PM_REGEXP_OPTION_STATE_ADDABLE = 2,

    /** The option has been added. */
    PM_REGEXP_OPTION_STATE_ADDED = 3,

    /** The option has been removed. */
    PM_REGEXP_OPTION_STATE_REMOVED = 4
} pm_regexp_option_state_t;

// Option states are kept in a table with one slot per letter. The table spans
// the letters from the minimum to the maximum below, inclusive.

#define PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM 'a'
#define PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM 'x'
#define PRISM_REGEXP_OPTION_STATE_SLOTS (PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM + 1)

/**
 * The options of a regular expression (or group), stored as one state per
 * letter.
 */
typedef struct {
    /** The state of each letter, indexed by its offset from the minimum slot. */
    uint8_t values[PRISM_REGEXP_OPTION_STATE_SLOTS];
} pm_regexp_options_t;
335
+
336
+ /**
337
+ * Initialize a new set of options to their default values.
338
+ */
339
+ static void
340
+ pm_regexp_options_init(pm_regexp_options_t *options) {
341
+ memset(options, PM_REGEXP_OPTION_STATE_INVALID, sizeof(uint8_t) * PRISM_REGEXP_OPTION_STATE_SLOTS);
342
+ options->values['i' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_TOGGLEABLE;
343
+ options->values['m' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_TOGGLEABLE;
344
+ options->values['x' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_TOGGLEABLE;
345
+ options->values['d' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_ADDABLE;
346
+ options->values['a' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_ADDABLE;
347
+ options->values['u' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_ADDABLE;
348
+ }
349
+
350
+ /**
351
+ * Attempt to add the given option to the set of options. Returns true if it was
352
+ * added, false if it was already present.
353
+ */
354
+ static bool
355
+ pm_regexp_options_add(pm_regexp_options_t *options, uint8_t key) {
356
+ if (key >= PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM) {
357
+ key = (uint8_t) (key - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM);
358
+
359
+ switch (options->values[key]) {
360
+ case PM_REGEXP_OPTION_STATE_INVALID:
361
+ case PM_REGEXP_OPTION_STATE_REMOVED:
362
+ return false;
363
+ case PM_REGEXP_OPTION_STATE_TOGGLEABLE:
364
+ case PM_REGEXP_OPTION_STATE_ADDABLE:
365
+ options->values[key] = PM_REGEXP_OPTION_STATE_ADDED;
366
+ return true;
367
+ case PM_REGEXP_OPTION_STATE_ADDED:
368
+ return true;
369
+ }
370
+ }
371
+
372
+ return false;
373
+ }
374
+
375
+ /**
376
+ * Attempt to remove the given option from the set of options. Returns true if
377
+ * it was removed, false if it was already absent.
378
+ */
379
+ static bool
380
+ pm_regexp_options_remove(pm_regexp_options_t *options, uint8_t key) {
381
+ if (key >= PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM) {
382
+ key = (uint8_t) (key - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM);
383
+
384
+ switch (options->values[key]) {
385
+ case PM_REGEXP_OPTION_STATE_INVALID:
386
+ case PM_REGEXP_OPTION_STATE_ADDABLE:
387
+ return false;
388
+ case PM_REGEXP_OPTION_STATE_TOGGLEABLE:
389
+ case PM_REGEXP_OPTION_STATE_ADDED:
390
+ case PM_REGEXP_OPTION_STATE_REMOVED:
391
+ options->values[key] = PM_REGEXP_OPTION_STATE_REMOVED;
392
+ return true;
393
+ }
394
+ }
395
+
396
+ return false;
397
+ }
398
+
399
+ /**
400
+ * Groups can have quite a few different patterns for syntax. They basically
401
+ * just wrap a set of expressions, but they can potentially have options after a
402
+ * question mark. If there _isn't_ a question mark, then it's just a set of
403
+ * expressions. If there _is_, then here are the options:
404
+ *
405
+ * * (?#...) - inline comments
406
+ * * (?:subexp) - non-capturing group
407
+ * * (?=subexp) - positive lookahead
408
+ * * (?!subexp) - negative lookahead
409
+ * * (?>subexp) - atomic group
410
+ * * (?~subexp) - absence operator
411
+ * * (?<=subexp) - positive lookbehind
412
+ * * (?<!subexp) - negative lookbehind
413
+ * * (?<name>subexp) - named capturing group
414
+ * * (?'name'subexp) - named capturing group
415
+ * * (?(cond)yes-subexp) - conditional expression
416
+ * * (?(cond)yes-subexp|no-subexp) - conditional expression
417
+ * * (?imxdau-imx) - turn on and off configuration
418
+ * * (?imxdau-imx:subexp) - turn on and off configuration for an expression
419
+ */
420
+ static bool
421
+ pm_regexp_parse_group(pm_regexp_parser_t *parser) {
422
+ // First, parse any options for the group.
423
+ if (pm_regexp_char_accept(parser, '?')) {
424
+ if (pm_regexp_char_is_eof(parser)) {
425
+ return false;
426
+ }
427
+ pm_regexp_options_t options;
428
+ pm_regexp_options_init(&options);
429
+
430
+ switch (*parser->cursor) {
431
+ case '#': { // inline comments
432
+ if (parser->encoding_changed && parser->encoding->multibyte) {
433
+ bool escaped = false;
434
+
435
+ // Here we're going to take a slow path and iterate through
436
+ // each multibyte character to find the close paren. We do
437
+ // this because \ can be a trailing byte in some encodings.
438
+ while (parser->cursor < parser->end) {
439
+ if (!escaped && *parser->cursor == ')') {
440
+ parser->cursor++;
441
+ return true;
442
+ }
443
+
444
+ size_t width = parser->encoding->char_width(parser->cursor, (ptrdiff_t) (parser->end - parser->cursor));
445
+ if (width == 0) return false;
446
+
447
+ escaped = (width == 1) && (*parser->cursor == '\\');
448
+ parser->cursor += width;
449
+ }
450
+
451
+ return false;
452
+ } else {
453
+ // Here we can take the fast path and use memchr to find the
454
+ // next ) because we are safe checking backward for \ since
455
+ // it cannot be a trailing character.
456
+ bool found = pm_regexp_char_find(parser, ')');
457
+
458
+ while (found && (parser->start <= parser->cursor - 2) && (*(parser->cursor - 2) == '\\')) {
459
+ found = pm_regexp_char_find(parser, ')');
460
+ }
461
+
462
+ return found;
463
+ }
464
+ }
465
+ case ':': // non-capturing group
466
+ case '=': // positive lookahead
467
+ case '!': // negative lookahead
468
+ case '>': // atomic group
469
+ case '~': // absence operator
470
+ parser->cursor++;
471
+ break;
472
+ case '<':
473
+ parser->cursor++;
474
+ if (pm_regexp_char_is_eof(parser)) {
475
+ return false;
476
+ }
477
+
478
+ switch (*parser->cursor) {
479
+ case '=': // positive lookbehind
480
+ case '!': // negative lookbehind
481
+ parser->cursor++;
482
+ break;
483
+ default: { // named capture group
484
+ const uint8_t *start = parser->cursor;
485
+ if (!pm_regexp_char_find(parser, '>')) {
486
+ return false;
487
+ }
488
+ pm_regexp_parser_named_capture(parser, start, parser->cursor - 1);
489
+ break;
490
+ }
491
+ }
492
+ break;
493
+ case '\'': { // named capture group
494
+ const uint8_t *start = ++parser->cursor;
495
+ if (!pm_regexp_char_find(parser, '\'')) {
496
+ return false;
497
+ }
498
+
499
+ pm_regexp_parser_named_capture(parser, start, parser->cursor - 1);
500
+ break;
501
+ }
502
+ case '(': // conditional expression
503
+ if (!pm_regexp_char_find(parser, ')')) {
504
+ return false;
505
+ }
506
+ break;
507
+ case 'i': case 'm': case 'x': case 'd': case 'a': case 'u': // options
508
+ while (!pm_regexp_char_is_eof(parser) && *parser->cursor != '-' && *parser->cursor != ':' && *parser->cursor != ')') {
509
+ if (!pm_regexp_options_add(&options, *parser->cursor)) {
510
+ return false;
511
+ }
512
+ parser->cursor++;
513
+ }
514
+
515
+ if (pm_regexp_char_is_eof(parser)) {
516
+ return false;
517
+ }
518
+
519
+ // If we hit a -, then we're done parsing options.
520
+ if (*parser->cursor != '-') break;
521
+
522
+ // Otherwise, fallthrough to the - case.
523
+ /* fallthrough */
524
+ case '-':
525
+ parser->cursor++;
526
+ while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ':' && *parser->cursor != ')') {
527
+ if (!pm_regexp_options_remove(&options, *parser->cursor)) {
528
+ return false;
529
+ }
530
+ parser->cursor++;
531
+ }
532
+
533
+ if (pm_regexp_char_is_eof(parser)) {
534
+ return false;
535
+ }
536
+ break;
537
+ default:
538
+ return false;
539
+ }
540
+ }
541
+
542
+ // Now, parse the expressions within this group.
543
+ while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ')') {
544
+ if (!pm_regexp_parse_expression(parser)) {
545
+ return false;
546
+ }
547
+ pm_regexp_char_accept(parser, '|');
548
+ }
549
+
550
+ // Finally, make sure we have a closing parenthesis.
551
+ return pm_regexp_char_expect(parser, ')');
552
+ }
553
+
554
+ /**
555
+ * item : anchor
556
+ * | match-posix-class
557
+ * | match-char-set
558
+ * | match-char-class
559
+ * | match-char-prop
560
+ * | match-char
561
+ * | match-any
562
+ * | group
563
+ * | quantified
564
+ * ;
565
+ */
566
+ static bool
567
+ pm_regexp_parse_item(pm_regexp_parser_t *parser) {
568
+ switch (*parser->cursor++) {
569
+ case '^':
570
+ case '$':
571
+ return true;
572
+ case '\\':
573
+ if (!pm_regexp_char_is_eof(parser)) {
574
+ parser->cursor++;
575
+ }
576
+ return pm_regexp_parse_quantifier(parser);
577
+ case '(':
578
+ return pm_regexp_parse_group(parser) && pm_regexp_parse_quantifier(parser);
579
+ case '[':
580
+ return pm_regexp_parse_lbracket(parser) && pm_regexp_parse_quantifier(parser);
581
+ default:
582
+ return pm_regexp_parse_quantifier(parser);
583
+ }
584
+ }
585
+
586
+ /**
587
+ * expression : item+
588
+ * ;
589
+ */
590
+ static bool
591
+ pm_regexp_parse_expression(pm_regexp_parser_t *parser) {
592
+ if (!pm_regexp_parse_item(parser)) {
593
+ return false;
594
+ }
595
+
596
+ while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ')' && *parser->cursor != '|') {
597
+ if (!pm_regexp_parse_item(parser)) {
598
+ return false;
599
+ }
600
+ }
601
+
602
+ return true;
603
+ }
604
+
605
+ /**
606
+ * pattern : EOF
607
+ * | expression EOF
608
+ * | expression '|' pattern
609
+ * ;
610
+ */
611
+ static bool
612
+ pm_regexp_parse_pattern(pm_regexp_parser_t *parser) {
613
+ return (
614
+ (
615
+ // Exit early if the pattern is empty.
616
+ pm_regexp_char_is_eof(parser) ||
617
+ // Parse the first expression in the pattern.
618
+ pm_regexp_parse_expression(parser)
619
+ ) &&
620
+ (
621
+ // Return now if we've parsed the entire pattern.
622
+ pm_regexp_char_is_eof(parser) ||
623
+ // Otherwise, we should have a pipe character.
624
+ (pm_regexp_char_expect(parser, '|') && pm_regexp_parse_pattern(parser))
625
+ )
626
+ );
627
+ }
628
+
629
+ /**
630
+ * Parse a regular expression and extract the names of all of the named capture
631
+ * groups.
632
+ */
633
+ PRISM_EXPORTED_FUNCTION bool
634
+ pm_regexp_named_capture_group_names(const uint8_t *source, size_t size, pm_string_list_t *named_captures, bool encoding_changed, const pm_encoding_t *encoding) {
635
+ pm_regexp_parser_t parser;
636
+ pm_regexp_parser_init(&parser, source, source + size, named_captures, encoding_changed, encoding);
637
+ return pm_regexp_parse_pattern(&parser);
638
+ }