prism 0.29.0 → 1.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +115 -1
- data/CONTRIBUTING.md +0 -4
- data/Makefile +1 -1
- data/README.md +4 -0
- data/config.yml +920 -148
- data/docs/build_system.md +8 -11
- data/docs/fuzzing.md +1 -1
- data/docs/parsing_rules.md +4 -1
- data/docs/relocation.md +34 -0
- data/docs/ripper_translation.md +22 -0
- data/docs/serialization.md +3 -0
- data/ext/prism/api_node.c +2863 -2079
- data/ext/prism/extconf.rb +14 -37
- data/ext/prism/extension.c +241 -391
- data/ext/prism/extension.h +2 -2
- data/include/prism/ast.h +2156 -453
- data/include/prism/defines.h +58 -7
- data/include/prism/diagnostic.h +24 -6
- data/include/prism/node.h +0 -21
- data/include/prism/options.h +94 -3
- data/include/prism/parser.h +82 -40
- data/include/prism/regexp.h +18 -8
- data/include/prism/static_literals.h +3 -2
- data/include/prism/util/pm_char.h +1 -2
- data/include/prism/util/pm_constant_pool.h +0 -8
- data/include/prism/util/pm_integer.h +22 -15
- data/include/prism/util/pm_newline_list.h +11 -0
- data/include/prism/util/pm_string.h +28 -12
- data/include/prism/version.h +3 -3
- data/include/prism.h +47 -11
- data/lib/prism/compiler.rb +3 -0
- data/lib/prism/desugar_compiler.rb +111 -74
- data/lib/prism/dispatcher.rb +16 -1
- data/lib/prism/dot_visitor.rb +55 -34
- data/lib/prism/dsl.rb +660 -468
- data/lib/prism/ffi.rb +113 -8
- data/lib/prism/inspect_visitor.rb +296 -64
- data/lib/prism/lex_compat.rb +1 -1
- data/lib/prism/mutation_compiler.rb +11 -6
- data/lib/prism/node.rb +4262 -5023
- data/lib/prism/node_ext.rb +91 -14
- data/lib/prism/parse_result/comments.rb +0 -7
- data/lib/prism/parse_result/errors.rb +65 -0
- data/lib/prism/parse_result/newlines.rb +101 -11
- data/lib/prism/parse_result.rb +183 -6
- data/lib/prism/reflection.rb +12 -10
- data/lib/prism/relocation.rb +504 -0
- data/lib/prism/serialize.rb +496 -609
- data/lib/prism/string_query.rb +30 -0
- data/lib/prism/translation/parser/compiler.rb +185 -155
- data/lib/prism/translation/parser/lexer.rb +26 -4
- data/lib/prism/translation/parser.rb +9 -4
- data/lib/prism/translation/ripper.rb +23 -25
- data/lib/prism/translation/ruby_parser.rb +86 -17
- data/lib/prism/visitor.rb +3 -0
- data/lib/prism.rb +6 -8
- data/prism.gemspec +9 -5
- data/rbi/prism/dsl.rbi +521 -0
- data/rbi/prism/node.rbi +1115 -1120
- data/rbi/prism/parse_result.rbi +29 -0
- data/rbi/prism/string_query.rbi +12 -0
- data/rbi/prism/visitor.rbi +3 -0
- data/rbi/prism.rbi +36 -30
- data/sig/prism/dsl.rbs +190 -303
- data/sig/prism/mutation_compiler.rbs +1 -0
- data/sig/prism/node.rbs +678 -632
- data/sig/prism/parse_result.rbs +22 -0
- data/sig/prism/relocation.rbs +185 -0
- data/sig/prism/string_query.rbs +11 -0
- data/sig/prism/visitor.rbs +1 -0
- data/sig/prism.rbs +103 -64
- data/src/diagnostic.c +64 -28
- data/src/node.c +502 -1739
- data/src/options.c +76 -27
- data/src/prettyprint.c +188 -112
- data/src/prism.c +3376 -2293
- data/src/regexp.c +208 -71
- data/src/serialize.c +182 -50
- data/src/static_literals.c +64 -85
- data/src/token_type.c +4 -4
- data/src/util/pm_char.c +1 -1
- data/src/util/pm_constant_pool.c +0 -8
- data/src/util/pm_integer.c +53 -25
- data/src/util/pm_newline_list.c +29 -0
- data/src/util/pm_string.c +131 -80
- data/src/util/pm_strpbrk.c +32 -6
- metadata +11 -7
- data/include/prism/util/pm_string_list.h +0 -44
- data/lib/prism/debug.rb +0 -249
- data/lib/prism/translation/parser/rubocop.rb +0 -73
- data/src/util/pm_string_list.c +0 -28
data/src/regexp.c
CHANGED
@@ -1,9 +1,14 @@
|
|
1
1
|
#include "prism/regexp.h"
|
2
2
|
|
3
|
+
#define PM_REGEXP_PARSE_DEPTH_MAX 4096
|
4
|
+
|
3
5
|
/**
|
4
6
|
* This is the parser that is going to handle parsing regular expressions.
|
5
7
|
*/
|
6
8
|
typedef struct {
|
9
|
+
/** The parser that is currently being used. */
|
10
|
+
pm_parser_t *parser;
|
11
|
+
|
7
12
|
/** A pointer to the start of the source that we are parsing. */
|
8
13
|
const uint8_t *start;
|
9
14
|
|
@@ -13,39 +18,48 @@ typedef struct {
|
|
13
18
|
/** A pointer to the end of the source that we are parsing. */
|
14
19
|
const uint8_t *end;
|
15
20
|
|
16
|
-
/**
|
17
|
-
|
21
|
+
/**
|
22
|
+
* Whether or not the regular expression currently being parsed is in
|
23
|
+
* extended mode, wherein whitespace is ignored and comments are allowed.
|
24
|
+
*/
|
25
|
+
bool extended_mode;
|
18
26
|
|
19
27
|
/** Whether the encoding has changed from the default. */
|
20
28
|
bool encoding_changed;
|
21
29
|
|
22
30
|
/** The encoding of the source. */
|
23
31
|
const pm_encoding_t *encoding;
|
32
|
+
|
33
|
+
/** The callback to call when a named capture group is found. */
|
34
|
+
pm_regexp_name_callback_t name_callback;
|
35
|
+
|
36
|
+
/** The data to pass to the name callback. */
|
37
|
+
void *name_data;
|
38
|
+
|
39
|
+
/** The callback to call when a parse error is found. */
|
40
|
+
pm_regexp_error_callback_t error_callback;
|
41
|
+
|
42
|
+
/** The data to pass to the error callback. */
|
43
|
+
void *error_data;
|
24
44
|
} pm_regexp_parser_t;
|
25
45
|
|
26
46
|
/**
|
27
|
-
*
|
47
|
+
* Append an error to the parser.
|
28
48
|
*/
|
29
|
-
static void
|
30
|
-
|
31
|
-
|
32
|
-
.start = start,
|
33
|
-
.cursor = start,
|
34
|
-
.end = end,
|
35
|
-
.named_captures = named_captures,
|
36
|
-
.encoding_changed = encoding_changed,
|
37
|
-
.encoding = encoding
|
38
|
-
};
|
49
|
+
static inline void
|
50
|
+
pm_regexp_parse_error(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end, const char *message) {
|
51
|
+
parser->error_callback(start, end, message, parser->error_data);
|
39
52
|
}
|
40
53
|
|
41
54
|
/**
|
42
|
-
* This appends a new string to the list of named captures.
|
55
|
+
* This appends a new string to the list of named captures. This function
|
56
|
+
* assumes the caller has already checked the validity of the name callback.
|
43
57
|
*/
|
44
58
|
static void
|
45
59
|
pm_regexp_parser_named_capture(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end) {
|
46
60
|
pm_string_t string;
|
47
61
|
pm_string_shared_init(&string, start, end);
|
48
|
-
|
62
|
+
parser->name_callback(&string, parser->name_data);
|
49
63
|
pm_string_free(&string);
|
50
64
|
}
|
51
65
|
|
@@ -144,6 +158,11 @@ pm_regexp_parse_range_quantifier(pm_regexp_parser_t *parser) {
|
|
144
158
|
} state = PM_REGEXP_RANGE_QUANTIFIER_STATE_START;
|
145
159
|
|
146
160
|
while (1) {
|
161
|
+
if (parser->cursor >= parser->end) {
|
162
|
+
parser->cursor = savepoint;
|
163
|
+
return true;
|
164
|
+
}
|
165
|
+
|
147
166
|
switch (state) {
|
148
167
|
case PM_REGEXP_RANGE_QUANTIFIER_STATE_START:
|
149
168
|
switch (*parser->cursor) {
|
@@ -217,21 +236,24 @@ pm_regexp_parse_range_quantifier(pm_regexp_parser_t *parser) {
|
|
217
236
|
*/
|
218
237
|
static bool
|
219
238
|
pm_regexp_parse_quantifier(pm_regexp_parser_t *parser) {
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
239
|
+
while (!pm_regexp_char_is_eof(parser)) {
|
240
|
+
switch (*parser->cursor) {
|
241
|
+
case '*':
|
242
|
+
case '+':
|
243
|
+
case '?':
|
244
|
+
parser->cursor++;
|
245
|
+
break;
|
246
|
+
case '{':
|
247
|
+
parser->cursor++;
|
248
|
+
if (!pm_regexp_parse_range_quantifier(parser)) return false;
|
249
|
+
break;
|
250
|
+
default:
|
251
|
+
// In this case there is no quantifier.
|
252
|
+
return true;
|
253
|
+
}
|
234
254
|
}
|
255
|
+
|
256
|
+
return true;
|
235
257
|
}
|
236
258
|
|
237
259
|
/**
|
@@ -255,20 +277,20 @@ pm_regexp_parse_posix_class(pm_regexp_parser_t *parser) {
|
|
255
277
|
|
256
278
|
// Forward declaration because character sets can be nested.
|
257
279
|
static bool
|
258
|
-
pm_regexp_parse_lbracket(pm_regexp_parser_t *parser);
|
280
|
+
pm_regexp_parse_lbracket(pm_regexp_parser_t *parser, uint16_t depth);
|
259
281
|
|
260
282
|
/**
|
261
283
|
* match-char-set : '[' '^'? (match-range | match-char)* ']'
|
262
284
|
* ;
|
263
285
|
*/
|
264
286
|
static bool
|
265
|
-
pm_regexp_parse_character_set(pm_regexp_parser_t *parser) {
|
287
|
+
pm_regexp_parse_character_set(pm_regexp_parser_t *parser, uint16_t depth) {
|
266
288
|
pm_regexp_char_accept(parser, '^');
|
267
289
|
|
268
290
|
while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ']') {
|
269
291
|
switch (*parser->cursor++) {
|
270
292
|
case '[':
|
271
|
-
pm_regexp_parse_lbracket(parser);
|
293
|
+
pm_regexp_parse_lbracket(parser, (uint16_t) (depth + 1));
|
272
294
|
break;
|
273
295
|
case '\\':
|
274
296
|
if (!pm_regexp_char_is_eof(parser)) {
|
@@ -288,7 +310,18 @@ pm_regexp_parse_character_set(pm_regexp_parser_t *parser) {
|
|
288
310
|
* A left bracket can either mean a POSIX class or a character set.
|
289
311
|
*/
|
290
312
|
static bool
|
291
|
-
pm_regexp_parse_lbracket(pm_regexp_parser_t *parser) {
|
313
|
+
pm_regexp_parse_lbracket(pm_regexp_parser_t *parser, uint16_t depth) {
|
314
|
+
if (depth >= PM_REGEXP_PARSE_DEPTH_MAX) {
|
315
|
+
pm_regexp_parse_error(parser, parser->start, parser->end, "parse depth limit over");
|
316
|
+
return false;
|
317
|
+
}
|
318
|
+
|
319
|
+
if ((parser->cursor < parser->end) && parser->cursor[0] == ']') {
|
320
|
+
parser->cursor++;
|
321
|
+
pm_regexp_parse_error(parser, parser->cursor - 1, parser->cursor, "empty char-class");
|
322
|
+
return true;
|
323
|
+
}
|
324
|
+
|
292
325
|
const uint8_t *reset = parser->cursor;
|
293
326
|
|
294
327
|
if ((parser->cursor + 2 < parser->end) && parser->cursor[0] == '[' && parser->cursor[1] == ':') {
|
@@ -298,13 +331,13 @@ pm_regexp_parse_lbracket(pm_regexp_parser_t *parser) {
|
|
298
331
|
parser->cursor = reset;
|
299
332
|
}
|
300
333
|
|
301
|
-
return pm_regexp_parse_character_set(parser);
|
334
|
+
return pm_regexp_parse_character_set(parser, depth);
|
302
335
|
}
|
303
336
|
|
304
337
|
// Forward declaration here since parsing groups needs to go back up the grammar
|
305
338
|
// to parse expressions within them.
|
306
339
|
static bool
|
307
|
-
pm_regexp_parse_expression(pm_regexp_parser_t *parser);
|
340
|
+
pm_regexp_parse_expression(pm_regexp_parser_t *parser, uint16_t depth);
|
308
341
|
|
309
342
|
/**
|
310
343
|
* These are the states of the options that are configurable on the regular
|
@@ -396,6 +429,19 @@ pm_regexp_options_remove(pm_regexp_options_t *options, uint8_t key) {
|
|
396
429
|
return false;
|
397
430
|
}
|
398
431
|
|
432
|
+
/**
|
433
|
+
* Returns the stored state of the given option key, or 0 if the key falls outside the tracked option slot range.
|
434
|
+
*/
|
435
|
+
static uint8_t
|
436
|
+
pm_regexp_options_state(pm_regexp_options_t *options, uint8_t key) {
|
437
|
+
if (key >= PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM) {
|
438
|
+
key = (uint8_t) (key - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM);
|
439
|
+
return options->values[key];
|
440
|
+
}
|
441
|
+
|
442
|
+
return false;
|
443
|
+
}
|
444
|
+
|
399
445
|
/**
|
400
446
|
* Groups can have quite a few different patterns for syntax. They basically
|
401
447
|
* just wrap a set of expressions, but they can potentially have options after a
|
@@ -418,17 +464,27 @@ pm_regexp_options_remove(pm_regexp_options_t *options, uint8_t key) {
|
|
418
464
|
* * (?imxdau-imx:subexp) - turn on and off configuration for an expression
|
419
465
|
*/
|
420
466
|
static bool
|
421
|
-
pm_regexp_parse_group(pm_regexp_parser_t *parser) {
|
467
|
+
pm_regexp_parse_group(pm_regexp_parser_t *parser, uint16_t depth) {
|
468
|
+
const uint8_t *group_start = parser->cursor;
|
469
|
+
|
470
|
+
pm_regexp_options_t options;
|
471
|
+
pm_regexp_options_init(&options);
|
472
|
+
|
422
473
|
// First, parse any options for the group.
|
423
474
|
if (pm_regexp_char_accept(parser, '?')) {
|
424
475
|
if (pm_regexp_char_is_eof(parser)) {
|
476
|
+
pm_regexp_parse_error(parser, group_start, parser->cursor, "end pattern in group");
|
425
477
|
return false;
|
426
478
|
}
|
427
|
-
pm_regexp_options_t options;
|
428
|
-
pm_regexp_options_init(&options);
|
429
479
|
|
430
480
|
switch (*parser->cursor) {
|
431
481
|
case '#': { // inline comments
|
482
|
+
parser->cursor++;
|
483
|
+
if (pm_regexp_char_is_eof(parser)) {
|
484
|
+
pm_regexp_parse_error(parser, group_start, parser->cursor, "end pattern in group");
|
485
|
+
return false;
|
486
|
+
}
|
487
|
+
|
432
488
|
if (parser->encoding_changed && parser->encoding->multibyte) {
|
433
489
|
bool escaped = false;
|
434
490
|
|
@@ -472,6 +528,7 @@ pm_regexp_parse_group(pm_regexp_parser_t *parser) {
|
|
472
528
|
case '<':
|
473
529
|
parser->cursor++;
|
474
530
|
if (pm_regexp_char_is_eof(parser)) {
|
531
|
+
pm_regexp_parse_error(parser, group_start, parser->cursor, "end pattern with unmatched parenthesis");
|
475
532
|
return false;
|
476
533
|
}
|
477
534
|
|
@@ -485,7 +542,15 @@ pm_regexp_parse_group(pm_regexp_parser_t *parser) {
|
|
485
542
|
if (!pm_regexp_char_find(parser, '>')) {
|
486
543
|
return false;
|
487
544
|
}
|
488
|
-
|
545
|
+
|
546
|
+
if (parser->cursor - start == 1) {
|
547
|
+
pm_regexp_parse_error(parser, start, parser->cursor, "group name is empty");
|
548
|
+
}
|
549
|
+
|
550
|
+
if (parser->name_callback != NULL) {
|
551
|
+
pm_regexp_parser_named_capture(parser, start, parser->cursor - 1);
|
552
|
+
}
|
553
|
+
|
489
554
|
break;
|
490
555
|
}
|
491
556
|
}
|
@@ -496,7 +561,10 @@ pm_regexp_parse_group(pm_regexp_parser_t *parser) {
|
|
496
561
|
return false;
|
497
562
|
}
|
498
563
|
|
499
|
-
|
564
|
+
if (parser->name_callback != NULL) {
|
565
|
+
pm_regexp_parser_named_capture(parser, start, parser->cursor - 1);
|
566
|
+
}
|
567
|
+
|
500
568
|
break;
|
501
569
|
}
|
502
570
|
case '(': // conditional expression
|
@@ -516,11 +584,22 @@ pm_regexp_parse_group(pm_regexp_parser_t *parser) {
|
|
516
584
|
return false;
|
517
585
|
}
|
518
586
|
|
587
|
+
// If we are at the end of the group of options and there is no
|
588
|
+
// subexpression, then we are going to be setting the options
|
589
|
+
// for the parent group. In this case we are safe to return now.
|
590
|
+
if (*parser->cursor == ')') {
|
591
|
+
if (pm_regexp_options_state(&options, 'x') == PM_REGEXP_OPTION_STATE_ADDED) {
|
592
|
+
parser->extended_mode = true;
|
593
|
+
}
|
594
|
+
|
595
|
+
parser->cursor++;
|
596
|
+
return true;
|
597
|
+
}
|
598
|
+
|
519
599
|
// If we hit a -, then we're done parsing options.
|
520
600
|
if (*parser->cursor != '-') break;
|
521
601
|
|
522
|
-
|
523
|
-
/* fallthrough */
|
602
|
+
PRISM_FALLTHROUGH
|
524
603
|
case '-':
|
525
604
|
parser->cursor++;
|
526
605
|
while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ':' && *parser->cursor != ')') {
|
@@ -533,22 +612,57 @@ pm_regexp_parse_group(pm_regexp_parser_t *parser) {
|
|
533
612
|
if (pm_regexp_char_is_eof(parser)) {
|
534
613
|
return false;
|
535
614
|
}
|
615
|
+
|
616
|
+
// If we are at the end of the group of options and there is no
|
617
|
+
// subexpression, then we are going to be setting the options
|
618
|
+
// for the parent group. In this case we are safe to return now.
|
619
|
+
if (*parser->cursor == ')') {
|
620
|
+
switch (pm_regexp_options_state(&options, 'x')) {
|
621
|
+
case PM_REGEXP_OPTION_STATE_ADDED:
|
622
|
+
parser->extended_mode = true;
|
623
|
+
break;
|
624
|
+
case PM_REGEXP_OPTION_STATE_REMOVED:
|
625
|
+
parser->extended_mode = false;
|
626
|
+
break;
|
627
|
+
}
|
628
|
+
|
629
|
+
parser->cursor++;
|
630
|
+
return true;
|
631
|
+
}
|
632
|
+
|
536
633
|
break;
|
537
634
|
default:
|
538
|
-
|
635
|
+
parser->cursor++;
|
636
|
+
pm_regexp_parse_error(parser, parser->cursor - 1, parser->cursor, "undefined group option");
|
637
|
+
break;
|
539
638
|
}
|
540
639
|
}
|
541
640
|
|
641
|
+
bool extended_mode = parser->extended_mode;
|
642
|
+
switch (pm_regexp_options_state(&options, 'x')) {
|
643
|
+
case PM_REGEXP_OPTION_STATE_ADDED:
|
644
|
+
parser->extended_mode = true;
|
645
|
+
break;
|
646
|
+
case PM_REGEXP_OPTION_STATE_REMOVED:
|
647
|
+
parser->extended_mode = false;
|
648
|
+
break;
|
649
|
+
}
|
650
|
+
|
542
651
|
// Now, parse the expressions within this group.
|
543
652
|
while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ')') {
|
544
|
-
if (!pm_regexp_parse_expression(parser)) {
|
653
|
+
if (!pm_regexp_parse_expression(parser, (uint16_t) (depth + 1))) {
|
654
|
+
parser->extended_mode = extended_mode;
|
545
655
|
return false;
|
546
656
|
}
|
547
657
|
pm_regexp_char_accept(parser, '|');
|
548
658
|
}
|
549
659
|
|
550
660
|
// Finally, make sure we have a closing parenthesis.
|
551
|
-
|
661
|
+
parser->extended_mode = extended_mode;
|
662
|
+
if (pm_regexp_char_expect(parser, ')')) return true;
|
663
|
+
|
664
|
+
pm_regexp_parse_error(parser, group_start, parser->cursor, "end pattern with unmatched parenthesis");
|
665
|
+
return false;
|
552
666
|
}
|
553
667
|
|
554
668
|
/**
|
@@ -564,12 +678,12 @@ pm_regexp_parse_group(pm_regexp_parser_t *parser) {
|
|
564
678
|
* ;
|
565
679
|
*/
|
566
680
|
static bool
|
567
|
-
pm_regexp_parse_item(pm_regexp_parser_t *parser) {
|
681
|
+
pm_regexp_parse_item(pm_regexp_parser_t *parser, uint16_t depth) {
|
568
682
|
switch (*parser->cursor) {
|
569
683
|
case '^':
|
570
684
|
case '$':
|
571
685
|
parser->cursor++;
|
572
|
-
return
|
686
|
+
return pm_regexp_parse_quantifier(parser);
|
573
687
|
case '\\':
|
574
688
|
parser->cursor++;
|
575
689
|
if (!pm_regexp_char_is_eof(parser)) {
|
@@ -578,10 +692,26 @@ pm_regexp_parse_item(pm_regexp_parser_t *parser) {
|
|
578
692
|
return pm_regexp_parse_quantifier(parser);
|
579
693
|
case '(':
|
580
694
|
parser->cursor++;
|
581
|
-
return pm_regexp_parse_group(parser) && pm_regexp_parse_quantifier(parser);
|
695
|
+
return pm_regexp_parse_group(parser, depth) && pm_regexp_parse_quantifier(parser);
|
582
696
|
case '[':
|
583
697
|
parser->cursor++;
|
584
|
-
return pm_regexp_parse_lbracket(parser) && pm_regexp_parse_quantifier(parser);
|
698
|
+
return pm_regexp_parse_lbracket(parser, depth) && pm_regexp_parse_quantifier(parser);
|
699
|
+
case '*':
|
700
|
+
case '?':
|
701
|
+
case '+':
|
702
|
+
parser->cursor++;
|
703
|
+
pm_regexp_parse_error(parser, parser->cursor - 1, parser->cursor, "target of repeat operator is not specified");
|
704
|
+
return true;
|
705
|
+
case ')':
|
706
|
+
parser->cursor++;
|
707
|
+
pm_regexp_parse_error(parser, parser->cursor - 1, parser->cursor, "unmatched close parenthesis");
|
708
|
+
return true;
|
709
|
+
case '#':
|
710
|
+
if (parser->extended_mode) {
|
711
|
+
if (!pm_regexp_char_find(parser, '\n')) parser->cursor = parser->end;
|
712
|
+
return true;
|
713
|
+
}
|
714
|
+
PRISM_FALLTHROUGH
|
585
715
|
default: {
|
586
716
|
size_t width;
|
587
717
|
if (!parser->encoding_changed) {
|
@@ -603,13 +733,18 @@ pm_regexp_parse_item(pm_regexp_parser_t *parser) {
|
|
603
733
|
* ;
|
604
734
|
*/
|
605
735
|
static bool
|
606
|
-
pm_regexp_parse_expression(pm_regexp_parser_t *parser) {
|
607
|
-
if (
|
736
|
+
pm_regexp_parse_expression(pm_regexp_parser_t *parser, uint16_t depth) {
|
737
|
+
if (depth >= PM_REGEXP_PARSE_DEPTH_MAX) {
|
738
|
+
pm_regexp_parse_error(parser, parser->start, parser->end, "parse depth limit over");
|
739
|
+
return false;
|
740
|
+
}
|
741
|
+
|
742
|
+
if (!pm_regexp_parse_item(parser, depth)) {
|
608
743
|
return false;
|
609
744
|
}
|
610
745
|
|
611
746
|
while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ')' && *parser->cursor != '|') {
|
612
|
-
if (!pm_regexp_parse_item(parser)) {
|
747
|
+
if (!pm_regexp_parse_item(parser, depth)) {
|
613
748
|
return false;
|
614
749
|
}
|
615
750
|
}
|
@@ -625,29 +760,31 @@ pm_regexp_parse_expression(pm_regexp_parser_t *parser) {
|
|
625
760
|
*/
|
626
761
|
static bool
|
627
762
|
pm_regexp_parse_pattern(pm_regexp_parser_t *parser) {
|
628
|
-
|
629
|
-
(
|
630
|
-
|
631
|
-
|
632
|
-
|
633
|
-
|
634
|
-
) &&
|
635
|
-
(
|
636
|
-
// Return now if we've parsed the entire pattern.
|
637
|
-
pm_regexp_char_is_eof(parser) ||
|
638
|
-
// Otherwise, we should have a pipe character.
|
639
|
-
(pm_regexp_char_expect(parser, '|') && pm_regexp_parse_pattern(parser))
|
640
|
-
)
|
641
|
-
);
|
763
|
+
do {
|
764
|
+
if (pm_regexp_char_is_eof(parser)) return true;
|
765
|
+
if (!pm_regexp_parse_expression(parser, 0)) return false;
|
766
|
+
} while (pm_regexp_char_accept(parser, '|'));
|
767
|
+
|
768
|
+
return pm_regexp_char_is_eof(parser);
|
642
769
|
}
|
643
770
|
|
644
771
|
/**
|
645
772
|
* Parse a regular expression and extract the names of all of the named capture
|
646
773
|
* groups.
|
647
774
|
*/
|
648
|
-
PRISM_EXPORTED_FUNCTION
|
649
|
-
|
650
|
-
pm_regexp_parser_t
|
651
|
-
|
652
|
-
|
775
|
+
PRISM_EXPORTED_FUNCTION void
|
776
|
+
pm_regexp_parse(pm_parser_t *parser, const uint8_t *source, size_t size, bool extended_mode, pm_regexp_name_callback_t name_callback, void *name_data, pm_regexp_error_callback_t error_callback, void *error_data) {
|
777
|
+
pm_regexp_parse_pattern(&(pm_regexp_parser_t) {
|
778
|
+
.parser = parser,
|
779
|
+
.start = source,
|
780
|
+
.cursor = source,
|
781
|
+
.end = source + size,
|
782
|
+
.extended_mode = extended_mode,
|
783
|
+
.encoding_changed = parser->encoding_changed,
|
784
|
+
.encoding = parser->encoding,
|
785
|
+
.name_callback = name_callback,
|
786
|
+
.name_data = name_data,
|
787
|
+
.error_callback = error_callback,
|
788
|
+
.error_data = error_data
|
789
|
+
});
|
653
790
|
}
|