jruby-prism-parser 0.23.0.pre.SNAPSHOT-java → 1.4.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/BSDmakefile +58 -0
- data/CHANGELOG.md +284 -1
- data/CONTRIBUTING.md +0 -4
- data/Makefile +25 -18
- data/README.md +57 -6
- data/config.yml +1724 -140
- data/docs/build_system.md +39 -11
- data/docs/configuration.md +4 -0
- data/docs/cruby_compilation.md +1 -1
- data/docs/fuzzing.md +1 -1
- data/docs/parser_translation.md +14 -9
- data/docs/parsing_rules.md +4 -1
- data/docs/releasing.md +9 -11
- data/docs/relocation.md +34 -0
- data/docs/ripper_translation.md +72 -0
- data/docs/ruby_api.md +2 -1
- data/docs/serialization.md +29 -5
- data/ext/prism/api_node.c +3841 -2000
- data/ext/prism/api_pack.c +9 -0
- data/ext/prism/extconf.rb +55 -34
- data/ext/prism/extension.c +597 -346
- data/ext/prism/extension.h +6 -5
- data/include/prism/ast.h +2612 -455
- data/include/prism/defines.h +160 -2
- data/include/prism/diagnostic.h +188 -76
- data/include/prism/encoding.h +22 -4
- data/include/prism/node.h +89 -17
- data/include/prism/options.h +224 -12
- data/include/prism/pack.h +11 -0
- data/include/prism/parser.h +267 -66
- data/include/prism/prettyprint.h +8 -0
- data/include/prism/regexp.h +18 -8
- data/include/prism/static_literals.h +121 -0
- data/include/prism/util/pm_buffer.h +75 -2
- data/include/prism/util/pm_char.h +1 -2
- data/include/prism/util/pm_constant_pool.h +18 -9
- data/include/prism/util/pm_integer.h +126 -0
- data/include/prism/util/pm_list.h +1 -1
- data/include/prism/util/pm_newline_list.h +23 -3
- data/include/prism/util/pm_string.h +48 -8
- data/include/prism/version.h +3 -3
- data/include/prism.h +99 -5
- data/jruby-prism.jar +0 -0
- data/lib/prism/compiler.rb +11 -1
- data/lib/prism/desugar_compiler.rb +264 -80
- data/lib/prism/dispatcher.rb +45 -1
- data/lib/prism/dot_visitor.rb +201 -77
- data/lib/prism/dsl.rb +672 -457
- data/lib/prism/ffi.rb +308 -94
- data/lib/prism/inspect_visitor.rb +2389 -0
- data/lib/prism/lex_compat.rb +35 -16
- data/lib/prism/mutation_compiler.rb +24 -8
- data/lib/prism/node.rb +9712 -8931
- data/lib/prism/node_ext.rb +328 -32
- data/lib/prism/pack.rb +4 -0
- data/lib/prism/parse_result/comments.rb +34 -24
- data/lib/prism/parse_result/errors.rb +65 -0
- data/lib/prism/parse_result/newlines.rb +102 -12
- data/lib/prism/parse_result.rb +458 -46
- data/lib/prism/pattern.rb +28 -10
- data/lib/prism/polyfill/append_as_bytes.rb +15 -0
- data/lib/prism/polyfill/byteindex.rb +13 -0
- data/lib/prism/polyfill/unpack1.rb +14 -0
- data/lib/prism/reflection.rb +413 -0
- data/lib/prism/relocation.rb +504 -0
- data/lib/prism/serialize.rb +1940 -902
- data/lib/prism/string_query.rb +30 -0
- data/lib/prism/translation/parser/builder.rb +61 -0
- data/lib/prism/translation/parser/compiler.rb +569 -195
- data/lib/prism/translation/parser/lexer.rb +516 -39
- data/lib/prism/translation/parser.rb +188 -11
- data/lib/prism/translation/parser33.rb +12 -0
- data/lib/prism/translation/parser34.rb +12 -0
- data/lib/prism/translation/parser35.rb +12 -0
- data/lib/prism/translation/ripper/sexp.rb +125 -0
- data/lib/prism/translation/ripper/shim.rb +5 -0
- data/lib/prism/translation/ripper.rb +3267 -386
- data/lib/prism/translation/ruby_parser.rb +194 -69
- data/lib/prism/translation.rb +4 -1
- data/lib/prism/version.rb +1 -1
- data/lib/prism/visitor.rb +13 -0
- data/lib/prism.rb +17 -27
- data/prism.gemspec +59 -17
- data/rbi/prism/compiler.rbi +12 -0
- data/rbi/prism/dsl.rbi +524 -0
- data/rbi/prism/inspect_visitor.rbi +12 -0
- data/rbi/prism/node.rbi +8722 -0
- data/rbi/prism/node_ext.rbi +107 -0
- data/rbi/prism/parse_result.rbi +404 -0
- data/rbi/prism/reflection.rbi +58 -0
- data/rbi/prism/string_query.rbi +12 -0
- data/rbi/prism/translation/parser.rbi +11 -0
- data/rbi/prism/translation/parser33.rbi +6 -0
- data/rbi/prism/translation/parser34.rbi +6 -0
- data/rbi/prism/translation/parser35.rbi +6 -0
- data/rbi/prism/translation/ripper.rbi +15 -0
- data/rbi/prism/visitor.rbi +473 -0
- data/rbi/prism.rbi +44 -7745
- data/sig/prism/compiler.rbs +9 -0
- data/sig/prism/dispatcher.rbs +16 -0
- data/sig/prism/dot_visitor.rbs +6 -0
- data/sig/prism/dsl.rbs +351 -0
- data/sig/prism/inspect_visitor.rbs +22 -0
- data/sig/prism/lex_compat.rbs +10 -0
- data/sig/prism/mutation_compiler.rbs +159 -0
- data/sig/prism/node.rbs +3614 -0
- data/sig/prism/node_ext.rbs +82 -0
- data/sig/prism/pack.rbs +43 -0
- data/sig/prism/parse_result.rbs +192 -0
- data/sig/prism/pattern.rbs +13 -0
- data/sig/prism/reflection.rbs +50 -0
- data/sig/prism/relocation.rbs +185 -0
- data/sig/prism/serialize.rbs +8 -0
- data/sig/prism/string_query.rbs +11 -0
- data/sig/prism/visitor.rbs +169 -0
- data/sig/prism.rbs +248 -4767
- data/src/diagnostic.c +672 -230
- data/src/encoding.c +211 -108
- data/src/node.c +7541 -1653
- data/src/options.c +135 -20
- data/src/pack.c +33 -17
- data/src/prettyprint.c +1546 -1488
- data/src/prism.c +7822 -3044
- data/src/regexp.c +225 -73
- data/src/serialize.c +101 -77
- data/src/static_literals.c +617 -0
- data/src/token_type.c +14 -13
- data/src/util/pm_buffer.c +187 -20
- data/src/util/pm_char.c +5 -5
- data/src/util/pm_constant_pool.c +39 -19
- data/src/util/pm_integer.c +670 -0
- data/src/util/pm_list.c +1 -1
- data/src/util/pm_newline_list.c +49 -8
- data/src/util/pm_string.c +213 -33
- data/src/util/pm_strncasecmp.c +13 -1
- data/src/util/pm_strpbrk.c +32 -6
- metadata +59 -21
- data/docs/ripper.md +0 -36
- data/include/prism/util/pm_state_stack.h +0 -42
- data/include/prism/util/pm_string_list.h +0 -44
- data/lib/prism/debug.rb +0 -206
- data/lib/prism/node_inspector.rb +0 -68
- data/lib/prism/translation/parser/rubocop.rb +0 -37
- data/rbi/prism_static.rbi +0 -207
- data/sig/prism_static.rbs +0 -201
- data/src/util/pm_state_stack.c +0 -25
- data/src/util/pm_string_list.c +0 -28
data/src/regexp.c
CHANGED
@@ -1,9 +1,14 @@
|
|
1
1
|
#include "prism/regexp.h"
|
2
2
|
|
3
|
+
#define PM_REGEXP_PARSE_DEPTH_MAX 4096
|
4
|
+
|
3
5
|
/**
|
4
6
|
* This is the parser that is going to handle parsing regular expressions.
|
5
7
|
*/
|
6
8
|
typedef struct {
|
9
|
+
/** The parser that is currently being used. */
|
10
|
+
pm_parser_t *parser;
|
11
|
+
|
7
12
|
/** A pointer to the start of the source that we are parsing. */
|
8
13
|
const uint8_t *start;
|
9
14
|
|
@@ -13,39 +18,48 @@ typedef struct {
|
|
13
18
|
/** A pointer to the end of the source that we are parsing. */
|
14
19
|
const uint8_t *end;
|
15
20
|
|
16
|
-
/**
|
17
|
-
|
21
|
+
/**
|
22
|
+
* Whether or not the regular expression currently being parsed is in
|
23
|
+
* extended mode, wherein whitespace is ignored and comments are allowed.
|
24
|
+
*/
|
25
|
+
bool extended_mode;
|
18
26
|
|
19
27
|
/** Whether the encoding has changed from the default. */
|
20
28
|
bool encoding_changed;
|
21
29
|
|
22
30
|
/** The encoding of the source. */
|
23
31
|
const pm_encoding_t *encoding;
|
32
|
+
|
33
|
+
/** The callback to call when a named capture group is found. */
|
34
|
+
pm_regexp_name_callback_t name_callback;
|
35
|
+
|
36
|
+
/** The data to pass to the name callback. */
|
37
|
+
void *name_data;
|
38
|
+
|
39
|
+
/** The callback to call when a parse error is found. */
|
40
|
+
pm_regexp_error_callback_t error_callback;
|
41
|
+
|
42
|
+
/** The data to pass to the error callback. */
|
43
|
+
void *error_data;
|
24
44
|
} pm_regexp_parser_t;
|
25
45
|
|
26
46
|
/**
|
27
|
-
*
|
47
|
+
* Append an error to the parser.
|
28
48
|
*/
|
29
|
-
static void
|
30
|
-
|
31
|
-
|
32
|
-
.start = start,
|
33
|
-
.cursor = start,
|
34
|
-
.end = end,
|
35
|
-
.named_captures = named_captures,
|
36
|
-
.encoding_changed = encoding_changed,
|
37
|
-
.encoding = encoding
|
38
|
-
};
|
49
|
+
static inline void
|
50
|
+
pm_regexp_parse_error(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end, const char *message) {
|
51
|
+
parser->error_callback(start, end, message, parser->error_data);
|
39
52
|
}
|
40
53
|
|
41
54
|
/**
|
42
|
-
* This appends a new string to the list of named captures.
|
55
|
+
* This appends a new string to the list of named captures. This function
|
56
|
+
* assumes the caller has already checked the validity of the name callback.
|
43
57
|
*/
|
44
58
|
static void
|
45
59
|
pm_regexp_parser_named_capture(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end) {
|
46
60
|
pm_string_t string;
|
47
61
|
pm_string_shared_init(&string, start, end);
|
48
|
-
|
62
|
+
parser->name_callback(&string, parser->name_data);
|
49
63
|
pm_string_free(&string);
|
50
64
|
}
|
51
65
|
|
@@ -144,6 +158,11 @@ pm_regexp_parse_range_quantifier(pm_regexp_parser_t *parser) {
|
|
144
158
|
} state = PM_REGEXP_RANGE_QUANTIFIER_STATE_START;
|
145
159
|
|
146
160
|
while (1) {
|
161
|
+
if (parser->cursor >= parser->end) {
|
162
|
+
parser->cursor = savepoint;
|
163
|
+
return true;
|
164
|
+
}
|
165
|
+
|
147
166
|
switch (state) {
|
148
167
|
case PM_REGEXP_RANGE_QUANTIFIER_STATE_START:
|
149
168
|
switch (*parser->cursor) {
|
@@ -217,21 +236,24 @@ pm_regexp_parse_range_quantifier(pm_regexp_parser_t *parser) {
|
|
217
236
|
*/
|
218
237
|
static bool
|
219
238
|
pm_regexp_parse_quantifier(pm_regexp_parser_t *parser) {
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
239
|
+
while (!pm_regexp_char_is_eof(parser)) {
|
240
|
+
switch (*parser->cursor) {
|
241
|
+
case '*':
|
242
|
+
case '+':
|
243
|
+
case '?':
|
244
|
+
parser->cursor++;
|
245
|
+
break;
|
246
|
+
case '{':
|
247
|
+
parser->cursor++;
|
248
|
+
if (!pm_regexp_parse_range_quantifier(parser)) return false;
|
249
|
+
break;
|
250
|
+
default:
|
251
|
+
// In this case there is no quantifier.
|
252
|
+
return true;
|
253
|
+
}
|
234
254
|
}
|
255
|
+
|
256
|
+
return true;
|
235
257
|
}
|
236
258
|
|
237
259
|
/**
|
@@ -255,20 +277,20 @@ pm_regexp_parse_posix_class(pm_regexp_parser_t *parser) {
|
|
255
277
|
|
256
278
|
// Forward declaration because character sets can be nested.
|
257
279
|
static bool
|
258
|
-
pm_regexp_parse_lbracket(pm_regexp_parser_t *parser);
|
280
|
+
pm_regexp_parse_lbracket(pm_regexp_parser_t *parser, uint16_t depth);
|
259
281
|
|
260
282
|
/**
|
261
283
|
* match-char-set : '[' '^'? (match-range | match-char)* ']'
|
262
284
|
* ;
|
263
285
|
*/
|
264
286
|
static bool
|
265
|
-
pm_regexp_parse_character_set(pm_regexp_parser_t *parser) {
|
287
|
+
pm_regexp_parse_character_set(pm_regexp_parser_t *parser, uint16_t depth) {
|
266
288
|
pm_regexp_char_accept(parser, '^');
|
267
289
|
|
268
290
|
while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ']') {
|
269
291
|
switch (*parser->cursor++) {
|
270
292
|
case '[':
|
271
|
-
pm_regexp_parse_lbracket(parser);
|
293
|
+
pm_regexp_parse_lbracket(parser, (uint16_t) (depth + 1));
|
272
294
|
break;
|
273
295
|
case '\\':
|
274
296
|
if (!pm_regexp_char_is_eof(parser)) {
|
@@ -288,7 +310,18 @@ pm_regexp_parse_character_set(pm_regexp_parser_t *parser) {
|
|
288
310
|
* A left bracket can either mean a POSIX class or a character set.
|
289
311
|
*/
|
290
312
|
static bool
|
291
|
-
pm_regexp_parse_lbracket(pm_regexp_parser_t *parser) {
|
313
|
+
pm_regexp_parse_lbracket(pm_regexp_parser_t *parser, uint16_t depth) {
|
314
|
+
if (depth >= PM_REGEXP_PARSE_DEPTH_MAX) {
|
315
|
+
pm_regexp_parse_error(parser, parser->start, parser->end, "parse depth limit over");
|
316
|
+
return false;
|
317
|
+
}
|
318
|
+
|
319
|
+
if ((parser->cursor < parser->end) && parser->cursor[0] == ']') {
|
320
|
+
parser->cursor++;
|
321
|
+
pm_regexp_parse_error(parser, parser->cursor - 1, parser->cursor, "empty char-class");
|
322
|
+
return true;
|
323
|
+
}
|
324
|
+
|
292
325
|
const uint8_t *reset = parser->cursor;
|
293
326
|
|
294
327
|
if ((parser->cursor + 2 < parser->end) && parser->cursor[0] == '[' && parser->cursor[1] == ':') {
|
@@ -298,13 +331,13 @@ pm_regexp_parse_lbracket(pm_regexp_parser_t *parser) {
|
|
298
331
|
parser->cursor = reset;
|
299
332
|
}
|
300
333
|
|
301
|
-
return pm_regexp_parse_character_set(parser);
|
334
|
+
return pm_regexp_parse_character_set(parser, depth);
|
302
335
|
}
|
303
336
|
|
304
337
|
// Forward declaration here since parsing groups needs to go back up the grammar
|
305
338
|
// to parse expressions within them.
|
306
339
|
static bool
|
307
|
-
pm_regexp_parse_expression(pm_regexp_parser_t *parser);
|
340
|
+
pm_regexp_parse_expression(pm_regexp_parser_t *parser, uint16_t depth);
|
308
341
|
|
309
342
|
/**
|
310
343
|
* These are the states of the options that are configurable on the regular
|
@@ -396,6 +429,19 @@ pm_regexp_options_remove(pm_regexp_options_t *options, uint8_t key) {
|
|
396
429
|
return false;
|
397
430
|
}
|
398
431
|
|
432
|
+
/**
|
433
|
+
* True if the given key is set in the options.
|
434
|
+
*/
|
435
|
+
static uint8_t
|
436
|
+
pm_regexp_options_state(pm_regexp_options_t *options, uint8_t key) {
|
437
|
+
if (key >= PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM) {
|
438
|
+
key = (uint8_t) (key - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM);
|
439
|
+
return options->values[key];
|
440
|
+
}
|
441
|
+
|
442
|
+
return false;
|
443
|
+
}
|
444
|
+
|
399
445
|
/**
|
400
446
|
* Groups can have quite a few different patterns for syntax. They basically
|
401
447
|
* just wrap a set of expressions, but they can potentially have options after a
|
@@ -418,17 +464,27 @@ pm_regexp_options_remove(pm_regexp_options_t *options, uint8_t key) {
|
|
418
464
|
* * (?imxdau-imx:subexp) - turn on and off configuration for an expression
|
419
465
|
*/
|
420
466
|
static bool
|
421
|
-
pm_regexp_parse_group(pm_regexp_parser_t *parser) {
|
467
|
+
pm_regexp_parse_group(pm_regexp_parser_t *parser, uint16_t depth) {
|
468
|
+
const uint8_t *group_start = parser->cursor;
|
469
|
+
|
470
|
+
pm_regexp_options_t options;
|
471
|
+
pm_regexp_options_init(&options);
|
472
|
+
|
422
473
|
// First, parse any options for the group.
|
423
474
|
if (pm_regexp_char_accept(parser, '?')) {
|
424
475
|
if (pm_regexp_char_is_eof(parser)) {
|
476
|
+
pm_regexp_parse_error(parser, group_start, parser->cursor, "end pattern in group");
|
425
477
|
return false;
|
426
478
|
}
|
427
|
-
pm_regexp_options_t options;
|
428
|
-
pm_regexp_options_init(&options);
|
429
479
|
|
430
480
|
switch (*parser->cursor) {
|
431
481
|
case '#': { // inline comments
|
482
|
+
parser->cursor++;
|
483
|
+
if (pm_regexp_char_is_eof(parser)) {
|
484
|
+
pm_regexp_parse_error(parser, group_start, parser->cursor, "end pattern in group");
|
485
|
+
return false;
|
486
|
+
}
|
487
|
+
|
432
488
|
if (parser->encoding_changed && parser->encoding->multibyte) {
|
433
489
|
bool escaped = false;
|
434
490
|
|
@@ -472,6 +528,7 @@ pm_regexp_parse_group(pm_regexp_parser_t *parser) {
|
|
472
528
|
case '<':
|
473
529
|
parser->cursor++;
|
474
530
|
if (pm_regexp_char_is_eof(parser)) {
|
531
|
+
pm_regexp_parse_error(parser, group_start, parser->cursor, "end pattern with unmatched parenthesis");
|
475
532
|
return false;
|
476
533
|
}
|
477
534
|
|
@@ -485,7 +542,15 @@ pm_regexp_parse_group(pm_regexp_parser_t *parser) {
|
|
485
542
|
if (!pm_regexp_char_find(parser, '>')) {
|
486
543
|
return false;
|
487
544
|
}
|
488
|
-
|
545
|
+
|
546
|
+
if (parser->cursor - start == 1) {
|
547
|
+
pm_regexp_parse_error(parser, start, parser->cursor, "group name is empty");
|
548
|
+
}
|
549
|
+
|
550
|
+
if (parser->name_callback != NULL) {
|
551
|
+
pm_regexp_parser_named_capture(parser, start, parser->cursor - 1);
|
552
|
+
}
|
553
|
+
|
489
554
|
break;
|
490
555
|
}
|
491
556
|
}
|
@@ -496,7 +561,10 @@ pm_regexp_parse_group(pm_regexp_parser_t *parser) {
|
|
496
561
|
return false;
|
497
562
|
}
|
498
563
|
|
499
|
-
|
564
|
+
if (parser->name_callback != NULL) {
|
565
|
+
pm_regexp_parser_named_capture(parser, start, parser->cursor - 1);
|
566
|
+
}
|
567
|
+
|
500
568
|
break;
|
501
569
|
}
|
502
570
|
case '(': // conditional expression
|
@@ -516,11 +584,22 @@ pm_regexp_parse_group(pm_regexp_parser_t *parser) {
|
|
516
584
|
return false;
|
517
585
|
}
|
518
586
|
|
587
|
+
// If we are at the end of the group of options and there is no
|
588
|
+
// subexpression, then we are going to be setting the options
|
589
|
+
// for the parent group. In this case we are safe to return now.
|
590
|
+
if (*parser->cursor == ')') {
|
591
|
+
if (pm_regexp_options_state(&options, 'x') == PM_REGEXP_OPTION_STATE_ADDED) {
|
592
|
+
parser->extended_mode = true;
|
593
|
+
}
|
594
|
+
|
595
|
+
parser->cursor++;
|
596
|
+
return true;
|
597
|
+
}
|
598
|
+
|
519
599
|
// If we hit a -, then we're done parsing options.
|
520
600
|
if (*parser->cursor != '-') break;
|
521
601
|
|
522
|
-
|
523
|
-
/* fallthrough */
|
602
|
+
PRISM_FALLTHROUGH
|
524
603
|
case '-':
|
525
604
|
parser->cursor++;
|
526
605
|
while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ':' && *parser->cursor != ')') {
|
@@ -533,22 +612,57 @@ pm_regexp_parse_group(pm_regexp_parser_t *parser) {
|
|
533
612
|
if (pm_regexp_char_is_eof(parser)) {
|
534
613
|
return false;
|
535
614
|
}
|
615
|
+
|
616
|
+
// If we are at the end of the group of options and there is no
|
617
|
+
// subexpression, then we are going to be setting the options
|
618
|
+
// for the parent group. In this case we are safe to return now.
|
619
|
+
if (*parser->cursor == ')') {
|
620
|
+
switch (pm_regexp_options_state(&options, 'x')) {
|
621
|
+
case PM_REGEXP_OPTION_STATE_ADDED:
|
622
|
+
parser->extended_mode = true;
|
623
|
+
break;
|
624
|
+
case PM_REGEXP_OPTION_STATE_REMOVED:
|
625
|
+
parser->extended_mode = false;
|
626
|
+
break;
|
627
|
+
}
|
628
|
+
|
629
|
+
parser->cursor++;
|
630
|
+
return true;
|
631
|
+
}
|
632
|
+
|
536
633
|
break;
|
537
634
|
default:
|
538
|
-
|
635
|
+
parser->cursor++;
|
636
|
+
pm_regexp_parse_error(parser, parser->cursor - 1, parser->cursor, "undefined group option");
|
637
|
+
break;
|
539
638
|
}
|
540
639
|
}
|
541
640
|
|
641
|
+
bool extended_mode = parser->extended_mode;
|
642
|
+
switch (pm_regexp_options_state(&options, 'x')) {
|
643
|
+
case PM_REGEXP_OPTION_STATE_ADDED:
|
644
|
+
parser->extended_mode = true;
|
645
|
+
break;
|
646
|
+
case PM_REGEXP_OPTION_STATE_REMOVED:
|
647
|
+
parser->extended_mode = false;
|
648
|
+
break;
|
649
|
+
}
|
650
|
+
|
542
651
|
// Now, parse the expressions within this group.
|
543
652
|
while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ')') {
|
544
|
-
if (!pm_regexp_parse_expression(parser)) {
|
653
|
+
if (!pm_regexp_parse_expression(parser, (uint16_t) (depth + 1))) {
|
654
|
+
parser->extended_mode = extended_mode;
|
545
655
|
return false;
|
546
656
|
}
|
547
657
|
pm_regexp_char_accept(parser, '|');
|
548
658
|
}
|
549
659
|
|
550
660
|
// Finally, make sure we have a closing parenthesis.
|
551
|
-
|
661
|
+
parser->extended_mode = extended_mode;
|
662
|
+
if (pm_regexp_char_expect(parser, ')')) return true;
|
663
|
+
|
664
|
+
pm_regexp_parse_error(parser, group_start, parser->cursor, "end pattern with unmatched parenthesis");
|
665
|
+
return false;
|
552
666
|
}
|
553
667
|
|
554
668
|
/**
|
@@ -564,22 +678,53 @@ pm_regexp_parse_group(pm_regexp_parser_t *parser) {
|
|
564
678
|
* ;
|
565
679
|
*/
|
566
680
|
static bool
|
567
|
-
pm_regexp_parse_item(pm_regexp_parser_t *parser) {
|
568
|
-
switch (*parser->cursor
|
681
|
+
pm_regexp_parse_item(pm_regexp_parser_t *parser, uint16_t depth) {
|
682
|
+
switch (*parser->cursor) {
|
569
683
|
case '^':
|
570
684
|
case '$':
|
571
|
-
|
685
|
+
parser->cursor++;
|
686
|
+
return pm_regexp_parse_quantifier(parser);
|
572
687
|
case '\\':
|
688
|
+
parser->cursor++;
|
573
689
|
if (!pm_regexp_char_is_eof(parser)) {
|
574
690
|
parser->cursor++;
|
575
691
|
}
|
576
692
|
return pm_regexp_parse_quantifier(parser);
|
577
693
|
case '(':
|
578
|
-
|
694
|
+
parser->cursor++;
|
695
|
+
return pm_regexp_parse_group(parser, depth) && pm_regexp_parse_quantifier(parser);
|
579
696
|
case '[':
|
580
|
-
|
581
|
-
|
697
|
+
parser->cursor++;
|
698
|
+
return pm_regexp_parse_lbracket(parser, depth) && pm_regexp_parse_quantifier(parser);
|
699
|
+
case '*':
|
700
|
+
case '?':
|
701
|
+
case '+':
|
702
|
+
parser->cursor++;
|
703
|
+
pm_regexp_parse_error(parser, parser->cursor - 1, parser->cursor, "target of repeat operator is not specified");
|
704
|
+
return true;
|
705
|
+
case ')':
|
706
|
+
parser->cursor++;
|
707
|
+
pm_regexp_parse_error(parser, parser->cursor - 1, parser->cursor, "unmatched close parenthesis");
|
708
|
+
return true;
|
709
|
+
case '#':
|
710
|
+
if (parser->extended_mode) {
|
711
|
+
if (!pm_regexp_char_find(parser, '\n')) parser->cursor = parser->end;
|
712
|
+
return true;
|
713
|
+
}
|
714
|
+
PRISM_FALLTHROUGH
|
715
|
+
default: {
|
716
|
+
size_t width;
|
717
|
+
if (!parser->encoding_changed) {
|
718
|
+
width = pm_encoding_utf_8_char_width(parser->cursor, (ptrdiff_t) (parser->end - parser->cursor));
|
719
|
+
} else {
|
720
|
+
width = parser->encoding->char_width(parser->cursor, (ptrdiff_t) (parser->end - parser->cursor));
|
721
|
+
}
|
722
|
+
|
723
|
+
if (width == 0) return false; // TODO: add appropriate error
|
724
|
+
parser->cursor += width;
|
725
|
+
|
582
726
|
return pm_regexp_parse_quantifier(parser);
|
727
|
+
}
|
583
728
|
}
|
584
729
|
}
|
585
730
|
|
@@ -588,13 +733,18 @@ pm_regexp_parse_item(pm_regexp_parser_t *parser) {
|
|
588
733
|
* ;
|
589
734
|
*/
|
590
735
|
static bool
|
591
|
-
pm_regexp_parse_expression(pm_regexp_parser_t *parser) {
|
592
|
-
if (
|
736
|
+
pm_regexp_parse_expression(pm_regexp_parser_t *parser, uint16_t depth) {
|
737
|
+
if (depth >= PM_REGEXP_PARSE_DEPTH_MAX) {
|
738
|
+
pm_regexp_parse_error(parser, parser->start, parser->end, "parse depth limit over");
|
739
|
+
return false;
|
740
|
+
}
|
741
|
+
|
742
|
+
if (!pm_regexp_parse_item(parser, depth)) {
|
593
743
|
return false;
|
594
744
|
}
|
595
745
|
|
596
746
|
while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ')' && *parser->cursor != '|') {
|
597
|
-
if (!pm_regexp_parse_item(parser)) {
|
747
|
+
if (!pm_regexp_parse_item(parser, depth)) {
|
598
748
|
return false;
|
599
749
|
}
|
600
750
|
}
|
@@ -610,29 +760,31 @@ pm_regexp_parse_expression(pm_regexp_parser_t *parser) {
|
|
610
760
|
*/
|
611
761
|
static bool
|
612
762
|
pm_regexp_parse_pattern(pm_regexp_parser_t *parser) {
|
613
|
-
|
614
|
-
(
|
615
|
-
|
616
|
-
|
617
|
-
|
618
|
-
|
619
|
-
) &&
|
620
|
-
(
|
621
|
-
// Return now if we've parsed the entire pattern.
|
622
|
-
pm_regexp_char_is_eof(parser) ||
|
623
|
-
// Otherwise, we should have a pipe character.
|
624
|
-
(pm_regexp_char_expect(parser, '|') && pm_regexp_parse_pattern(parser))
|
625
|
-
)
|
626
|
-
);
|
763
|
+
do {
|
764
|
+
if (pm_regexp_char_is_eof(parser)) return true;
|
765
|
+
if (!pm_regexp_parse_expression(parser, 0)) return false;
|
766
|
+
} while (pm_regexp_char_accept(parser, '|'));
|
767
|
+
|
768
|
+
return pm_regexp_char_is_eof(parser);
|
627
769
|
}
|
628
770
|
|
629
771
|
/**
|
630
772
|
* Parse a regular expression and extract the names of all of the named capture
|
631
773
|
* groups.
|
632
774
|
*/
|
633
|
-
PRISM_EXPORTED_FUNCTION
|
634
|
-
|
635
|
-
pm_regexp_parser_t
|
636
|
-
|
637
|
-
|
775
|
+
PRISM_EXPORTED_FUNCTION void
|
776
|
+
pm_regexp_parse(pm_parser_t *parser, const uint8_t *source, size_t size, bool extended_mode, pm_regexp_name_callback_t name_callback, void *name_data, pm_regexp_error_callback_t error_callback, void *error_data) {
|
777
|
+
pm_regexp_parse_pattern(&(pm_regexp_parser_t) {
|
778
|
+
.parser = parser,
|
779
|
+
.start = source,
|
780
|
+
.cursor = source,
|
781
|
+
.end = source + size,
|
782
|
+
.extended_mode = extended_mode,
|
783
|
+
.encoding_changed = parser->encoding_changed,
|
784
|
+
.encoding = parser->encoding,
|
785
|
+
.name_callback = name_callback,
|
786
|
+
.name_data = name_data,
|
787
|
+
.error_callback = error_callback,
|
788
|
+
.error_data = error_data
|
789
|
+
});
|
638
790
|
}
|