jruby-prism-parser 0.23.0.pre.SNAPSHOT-java → 1.4.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (148)
  1. checksums.yaml +4 -4
  2. data/BSDmakefile +58 -0
  3. data/CHANGELOG.md +284 -1
  4. data/CONTRIBUTING.md +0 -4
  5. data/Makefile +25 -18
  6. data/README.md +57 -6
  7. data/config.yml +1724 -140
  8. data/docs/build_system.md +39 -11
  9. data/docs/configuration.md +4 -0
  10. data/docs/cruby_compilation.md +1 -1
  11. data/docs/fuzzing.md +1 -1
  12. data/docs/parser_translation.md +14 -9
  13. data/docs/parsing_rules.md +4 -1
  14. data/docs/releasing.md +9 -11
  15. data/docs/relocation.md +34 -0
  16. data/docs/ripper_translation.md +72 -0
  17. data/docs/ruby_api.md +2 -1
  18. data/docs/serialization.md +29 -5
  19. data/ext/prism/api_node.c +3841 -2000
  20. data/ext/prism/api_pack.c +9 -0
  21. data/ext/prism/extconf.rb +55 -34
  22. data/ext/prism/extension.c +597 -346
  23. data/ext/prism/extension.h +6 -5
  24. data/include/prism/ast.h +2612 -455
  25. data/include/prism/defines.h +160 -2
  26. data/include/prism/diagnostic.h +188 -76
  27. data/include/prism/encoding.h +22 -4
  28. data/include/prism/node.h +89 -17
  29. data/include/prism/options.h +224 -12
  30. data/include/prism/pack.h +11 -0
  31. data/include/prism/parser.h +267 -66
  32. data/include/prism/prettyprint.h +8 -0
  33. data/include/prism/regexp.h +18 -8
  34. data/include/prism/static_literals.h +121 -0
  35. data/include/prism/util/pm_buffer.h +75 -2
  36. data/include/prism/util/pm_char.h +1 -2
  37. data/include/prism/util/pm_constant_pool.h +18 -9
  38. data/include/prism/util/pm_integer.h +126 -0
  39. data/include/prism/util/pm_list.h +1 -1
  40. data/include/prism/util/pm_newline_list.h +23 -3
  41. data/include/prism/util/pm_string.h +48 -8
  42. data/include/prism/version.h +3 -3
  43. data/include/prism.h +99 -5
  44. data/jruby-prism.jar +0 -0
  45. data/lib/prism/compiler.rb +11 -1
  46. data/lib/prism/desugar_compiler.rb +264 -80
  47. data/lib/prism/dispatcher.rb +45 -1
  48. data/lib/prism/dot_visitor.rb +201 -77
  49. data/lib/prism/dsl.rb +672 -457
  50. data/lib/prism/ffi.rb +308 -94
  51. data/lib/prism/inspect_visitor.rb +2389 -0
  52. data/lib/prism/lex_compat.rb +35 -16
  53. data/lib/prism/mutation_compiler.rb +24 -8
  54. data/lib/prism/node.rb +9712 -8931
  55. data/lib/prism/node_ext.rb +328 -32
  56. data/lib/prism/pack.rb +4 -0
  57. data/lib/prism/parse_result/comments.rb +34 -24
  58. data/lib/prism/parse_result/errors.rb +65 -0
  59. data/lib/prism/parse_result/newlines.rb +102 -12
  60. data/lib/prism/parse_result.rb +458 -46
  61. data/lib/prism/pattern.rb +28 -10
  62. data/lib/prism/polyfill/append_as_bytes.rb +15 -0
  63. data/lib/prism/polyfill/byteindex.rb +13 -0
  64. data/lib/prism/polyfill/unpack1.rb +14 -0
  65. data/lib/prism/reflection.rb +413 -0
  66. data/lib/prism/relocation.rb +504 -0
  67. data/lib/prism/serialize.rb +1940 -902
  68. data/lib/prism/string_query.rb +30 -0
  69. data/lib/prism/translation/parser/builder.rb +61 -0
  70. data/lib/prism/translation/parser/compiler.rb +569 -195
  71. data/lib/prism/translation/parser/lexer.rb +516 -39
  72. data/lib/prism/translation/parser.rb +188 -11
  73. data/lib/prism/translation/parser33.rb +12 -0
  74. data/lib/prism/translation/parser34.rb +12 -0
  75. data/lib/prism/translation/parser35.rb +12 -0
  76. data/lib/prism/translation/ripper/sexp.rb +125 -0
  77. data/lib/prism/translation/ripper/shim.rb +5 -0
  78. data/lib/prism/translation/ripper.rb +3267 -386
  79. data/lib/prism/translation/ruby_parser.rb +194 -69
  80. data/lib/prism/translation.rb +4 -1
  81. data/lib/prism/version.rb +1 -1
  82. data/lib/prism/visitor.rb +13 -0
  83. data/lib/prism.rb +17 -27
  84. data/prism.gemspec +59 -17
  85. data/rbi/prism/compiler.rbi +12 -0
  86. data/rbi/prism/dsl.rbi +524 -0
  87. data/rbi/prism/inspect_visitor.rbi +12 -0
  88. data/rbi/prism/node.rbi +8722 -0
  89. data/rbi/prism/node_ext.rbi +107 -0
  90. data/rbi/prism/parse_result.rbi +404 -0
  91. data/rbi/prism/reflection.rbi +58 -0
  92. data/rbi/prism/string_query.rbi +12 -0
  93. data/rbi/prism/translation/parser.rbi +11 -0
  94. data/rbi/prism/translation/parser33.rbi +6 -0
  95. data/rbi/prism/translation/parser34.rbi +6 -0
  96. data/rbi/prism/translation/parser35.rbi +6 -0
  97. data/rbi/prism/translation/ripper.rbi +15 -0
  98. data/rbi/prism/visitor.rbi +473 -0
  99. data/rbi/prism.rbi +44 -7745
  100. data/sig/prism/compiler.rbs +9 -0
  101. data/sig/prism/dispatcher.rbs +16 -0
  102. data/sig/prism/dot_visitor.rbs +6 -0
  103. data/sig/prism/dsl.rbs +351 -0
  104. data/sig/prism/inspect_visitor.rbs +22 -0
  105. data/sig/prism/lex_compat.rbs +10 -0
  106. data/sig/prism/mutation_compiler.rbs +159 -0
  107. data/sig/prism/node.rbs +3614 -0
  108. data/sig/prism/node_ext.rbs +82 -0
  109. data/sig/prism/pack.rbs +43 -0
  110. data/sig/prism/parse_result.rbs +192 -0
  111. data/sig/prism/pattern.rbs +13 -0
  112. data/sig/prism/reflection.rbs +50 -0
  113. data/sig/prism/relocation.rbs +185 -0
  114. data/sig/prism/serialize.rbs +8 -0
  115. data/sig/prism/string_query.rbs +11 -0
  116. data/sig/prism/visitor.rbs +169 -0
  117. data/sig/prism.rbs +248 -4767
  118. data/src/diagnostic.c +672 -230
  119. data/src/encoding.c +211 -108
  120. data/src/node.c +7541 -1653
  121. data/src/options.c +135 -20
  122. data/src/pack.c +33 -17
  123. data/src/prettyprint.c +1546 -1488
  124. data/src/prism.c +7822 -3044
  125. data/src/regexp.c +225 -73
  126. data/src/serialize.c +101 -77
  127. data/src/static_literals.c +617 -0
  128. data/src/token_type.c +14 -13
  129. data/src/util/pm_buffer.c +187 -20
  130. data/src/util/pm_char.c +5 -5
  131. data/src/util/pm_constant_pool.c +39 -19
  132. data/src/util/pm_integer.c +670 -0
  133. data/src/util/pm_list.c +1 -1
  134. data/src/util/pm_newline_list.c +49 -8
  135. data/src/util/pm_string.c +213 -33
  136. data/src/util/pm_strncasecmp.c +13 -1
  137. data/src/util/pm_strpbrk.c +32 -6
  138. metadata +59 -21
  139. data/docs/ripper.md +0 -36
  140. data/include/prism/util/pm_state_stack.h +0 -42
  141. data/include/prism/util/pm_string_list.h +0 -44
  142. data/lib/prism/debug.rb +0 -206
  143. data/lib/prism/node_inspector.rb +0 -68
  144. data/lib/prism/translation/parser/rubocop.rb +0 -37
  145. data/rbi/prism_static.rbi +0 -207
  146. data/sig/prism_static.rbs +0 -201
  147. data/src/util/pm_state_stack.c +0 -25
  148. data/src/util/pm_string_list.c +0 -28
data/src/regexp.c CHANGED
@@ -1,9 +1,14 @@
1
1
  #include "prism/regexp.h"
2
2
 
3
+ #define PM_REGEXP_PARSE_DEPTH_MAX 4096
4
+
3
5
  /**
4
6
  * This is the parser that is going to handle parsing regular expressions.
5
7
  */
6
8
  typedef struct {
9
+ /** The parser that is currently being used. */
10
+ pm_parser_t *parser;
11
+
7
12
  /** A pointer to the start of the source that we are parsing. */
8
13
  const uint8_t *start;
9
14
 
@@ -13,39 +18,48 @@ typedef struct {
13
18
  /** A pointer to the end of the source that we are parsing. */
14
19
  const uint8_t *end;
15
20
 
16
- /** A list of named captures that we've found. */
17
- pm_string_list_t *named_captures;
21
+ /**
22
+ * Whether or not the regular expression currently being parsed is in
23
+ * extended mode, wherein whitespace is ignored and comments are allowed.
24
+ */
25
+ bool extended_mode;
18
26
 
19
27
  /** Whether the encoding has changed from the default. */
20
28
  bool encoding_changed;
21
29
 
22
30
  /** The encoding of the source. */
23
31
  const pm_encoding_t *encoding;
32
+
33
+ /** The callback to call when a named capture group is found. */
34
+ pm_regexp_name_callback_t name_callback;
35
+
36
+ /** The data to pass to the name callback. */
37
+ void *name_data;
38
+
39
+ /** The callback to call when a parse error is found. */
40
+ pm_regexp_error_callback_t error_callback;
41
+
42
+ /** The data to pass to the error callback. */
43
+ void *error_data;
24
44
  } pm_regexp_parser_t;
25
45
 
26
46
  /**
27
- * This initializes a new parser with the given source.
47
+ * Append an error to the parser.
28
48
  */
29
- static void
30
- pm_regexp_parser_init(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end, pm_string_list_t *named_captures, bool encoding_changed, const pm_encoding_t *encoding) {
31
- *parser = (pm_regexp_parser_t) {
32
- .start = start,
33
- .cursor = start,
34
- .end = end,
35
- .named_captures = named_captures,
36
- .encoding_changed = encoding_changed,
37
- .encoding = encoding
38
- };
49
+ static inline void
50
+ pm_regexp_parse_error(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end, const char *message) {
51
+ parser->error_callback(start, end, message, parser->error_data);
39
52
  }
40
53
 
41
54
  /**
42
- * This appends a new string to the list of named captures.
55
+ * This appends a new string to the list of named captures. This function
56
+ * assumes the caller has already checked the validity of the name callback.
43
57
  */
44
58
  static void
45
59
  pm_regexp_parser_named_capture(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end) {
46
60
  pm_string_t string;
47
61
  pm_string_shared_init(&string, start, end);
48
- pm_string_list_append(parser->named_captures, &string);
62
+ parser->name_callback(&string, parser->name_data);
49
63
  pm_string_free(&string);
50
64
  }
51
65
 
@@ -144,6 +158,11 @@ pm_regexp_parse_range_quantifier(pm_regexp_parser_t *parser) {
144
158
  } state = PM_REGEXP_RANGE_QUANTIFIER_STATE_START;
145
159
 
146
160
  while (1) {
161
+ if (parser->cursor >= parser->end) {
162
+ parser->cursor = savepoint;
163
+ return true;
164
+ }
165
+
147
166
  switch (state) {
148
167
  case PM_REGEXP_RANGE_QUANTIFIER_STATE_START:
149
168
  switch (*parser->cursor) {
@@ -217,21 +236,24 @@ pm_regexp_parse_range_quantifier(pm_regexp_parser_t *parser) {
217
236
  */
218
237
  static bool
219
238
  pm_regexp_parse_quantifier(pm_regexp_parser_t *parser) {
220
- if (pm_regexp_char_is_eof(parser)) return true;
221
-
222
- switch (*parser->cursor) {
223
- case '*':
224
- case '+':
225
- case '?':
226
- parser->cursor++;
227
- return true;
228
- case '{':
229
- parser->cursor++;
230
- return pm_regexp_parse_range_quantifier(parser);
231
- default:
232
- // In this case there is no quantifier.
233
- return true;
239
+ while (!pm_regexp_char_is_eof(parser)) {
240
+ switch (*parser->cursor) {
241
+ case '*':
242
+ case '+':
243
+ case '?':
244
+ parser->cursor++;
245
+ break;
246
+ case '{':
247
+ parser->cursor++;
248
+ if (!pm_regexp_parse_range_quantifier(parser)) return false;
249
+ break;
250
+ default:
251
+ // In this case there is no quantifier.
252
+ return true;
253
+ }
234
254
  }
255
+
256
+ return true;
235
257
  }
236
258
 
237
259
  /**
@@ -255,20 +277,20 @@ pm_regexp_parse_posix_class(pm_regexp_parser_t *parser) {
255
277
 
256
278
  // Forward declaration because character sets can be nested.
257
279
  static bool
258
- pm_regexp_parse_lbracket(pm_regexp_parser_t *parser);
280
+ pm_regexp_parse_lbracket(pm_regexp_parser_t *parser, uint16_t depth);
259
281
 
260
282
  /**
261
283
  * match-char-set : '[' '^'? (match-range | match-char)* ']'
262
284
  * ;
263
285
  */
264
286
  static bool
265
- pm_regexp_parse_character_set(pm_regexp_parser_t *parser) {
287
+ pm_regexp_parse_character_set(pm_regexp_parser_t *parser, uint16_t depth) {
266
288
  pm_regexp_char_accept(parser, '^');
267
289
 
268
290
  while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ']') {
269
291
  switch (*parser->cursor++) {
270
292
  case '[':
271
- pm_regexp_parse_lbracket(parser);
293
+ pm_regexp_parse_lbracket(parser, (uint16_t) (depth + 1));
272
294
  break;
273
295
  case '\\':
274
296
  if (!pm_regexp_char_is_eof(parser)) {
@@ -288,7 +310,18 @@ pm_regexp_parse_character_set(pm_regexp_parser_t *parser) {
288
310
  * A left bracket can either mean a POSIX class or a character set.
289
311
  */
290
312
  static bool
291
- pm_regexp_parse_lbracket(pm_regexp_parser_t *parser) {
313
+ pm_regexp_parse_lbracket(pm_regexp_parser_t *parser, uint16_t depth) {
314
+ if (depth >= PM_REGEXP_PARSE_DEPTH_MAX) {
315
+ pm_regexp_parse_error(parser, parser->start, parser->end, "parse depth limit over");
316
+ return false;
317
+ }
318
+
319
+ if ((parser->cursor < parser->end) && parser->cursor[0] == ']') {
320
+ parser->cursor++;
321
+ pm_regexp_parse_error(parser, parser->cursor - 1, parser->cursor, "empty char-class");
322
+ return true;
323
+ }
324
+
292
325
  const uint8_t *reset = parser->cursor;
293
326
 
294
327
  if ((parser->cursor + 2 < parser->end) && parser->cursor[0] == '[' && parser->cursor[1] == ':') {
@@ -298,13 +331,13 @@ pm_regexp_parse_lbracket(pm_regexp_parser_t *parser) {
298
331
  parser->cursor = reset;
299
332
  }
300
333
 
301
- return pm_regexp_parse_character_set(parser);
334
+ return pm_regexp_parse_character_set(parser, depth);
302
335
  }
303
336
 
304
337
  // Forward declaration here since parsing groups needs to go back up the grammar
305
338
  // to parse expressions within them.
306
339
  static bool
307
- pm_regexp_parse_expression(pm_regexp_parser_t *parser);
340
+ pm_regexp_parse_expression(pm_regexp_parser_t *parser, uint16_t depth);
308
341
 
309
342
  /**
310
343
  * These are the states of the options that are configurable on the regular
@@ -396,6 +429,19 @@ pm_regexp_options_remove(pm_regexp_options_t *options, uint8_t key) {
396
429
  return false;
397
430
  }
398
431
 
432
+ /**
433
+ * True if the given key is set in the options.
434
+ */
435
+ static uint8_t
436
+ pm_regexp_options_state(pm_regexp_options_t *options, uint8_t key) {
437
+ if (key >= PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM) {
438
+ key = (uint8_t) (key - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM);
439
+ return options->values[key];
440
+ }
441
+
442
+ return false;
443
+ }
444
+
399
445
  /**
400
446
  * Groups can have quite a few different patterns for syntax. They basically
401
447
  * just wrap a set of expressions, but they can potentially have options after a
@@ -418,17 +464,27 @@ pm_regexp_options_remove(pm_regexp_options_t *options, uint8_t key) {
418
464
  * * (?imxdau-imx:subexp) - turn on and off configuration for an expression
419
465
  */
420
466
  static bool
421
- pm_regexp_parse_group(pm_regexp_parser_t *parser) {
467
+ pm_regexp_parse_group(pm_regexp_parser_t *parser, uint16_t depth) {
468
+ const uint8_t *group_start = parser->cursor;
469
+
470
+ pm_regexp_options_t options;
471
+ pm_regexp_options_init(&options);
472
+
422
473
  // First, parse any options for the group.
423
474
  if (pm_regexp_char_accept(parser, '?')) {
424
475
  if (pm_regexp_char_is_eof(parser)) {
476
+ pm_regexp_parse_error(parser, group_start, parser->cursor, "end pattern in group");
425
477
  return false;
426
478
  }
427
- pm_regexp_options_t options;
428
- pm_regexp_options_init(&options);
429
479
 
430
480
  switch (*parser->cursor) {
431
481
  case '#': { // inline comments
482
+ parser->cursor++;
483
+ if (pm_regexp_char_is_eof(parser)) {
484
+ pm_regexp_parse_error(parser, group_start, parser->cursor, "end pattern in group");
485
+ return false;
486
+ }
487
+
432
488
  if (parser->encoding_changed && parser->encoding->multibyte) {
433
489
  bool escaped = false;
434
490
 
@@ -472,6 +528,7 @@ pm_regexp_parse_group(pm_regexp_parser_t *parser) {
472
528
  case '<':
473
529
  parser->cursor++;
474
530
  if (pm_regexp_char_is_eof(parser)) {
531
+ pm_regexp_parse_error(parser, group_start, parser->cursor, "end pattern with unmatched parenthesis");
475
532
  return false;
476
533
  }
477
534
 
@@ -485,7 +542,15 @@ pm_regexp_parse_group(pm_regexp_parser_t *parser) {
485
542
  if (!pm_regexp_char_find(parser, '>')) {
486
543
  return false;
487
544
  }
488
- pm_regexp_parser_named_capture(parser, start, parser->cursor - 1);
545
+
546
+ if (parser->cursor - start == 1) {
547
+ pm_regexp_parse_error(parser, start, parser->cursor, "group name is empty");
548
+ }
549
+
550
+ if (parser->name_callback != NULL) {
551
+ pm_regexp_parser_named_capture(parser, start, parser->cursor - 1);
552
+ }
553
+
489
554
  break;
490
555
  }
491
556
  }
@@ -496,7 +561,10 @@ pm_regexp_parse_group(pm_regexp_parser_t *parser) {
496
561
  return false;
497
562
  }
498
563
 
499
- pm_regexp_parser_named_capture(parser, start, parser->cursor - 1);
564
+ if (parser->name_callback != NULL) {
565
+ pm_regexp_parser_named_capture(parser, start, parser->cursor - 1);
566
+ }
567
+
500
568
  break;
501
569
  }
502
570
  case '(': // conditional expression
@@ -516,11 +584,22 @@ pm_regexp_parse_group(pm_regexp_parser_t *parser) {
516
584
  return false;
517
585
  }
518
586
 
587
+ // If we are at the end of the group of options and there is no
588
+ // subexpression, then we are going to be setting the options
589
+ // for the parent group. In this case we are safe to return now.
590
+ if (*parser->cursor == ')') {
591
+ if (pm_regexp_options_state(&options, 'x') == PM_REGEXP_OPTION_STATE_ADDED) {
592
+ parser->extended_mode = true;
593
+ }
594
+
595
+ parser->cursor++;
596
+ return true;
597
+ }
598
+
519
599
  // If we hit a -, then we're done parsing options.
520
600
  if (*parser->cursor != '-') break;
521
601
 
522
- // Otherwise, fallthrough to the - case.
523
- /* fallthrough */
602
+ PRISM_FALLTHROUGH
524
603
  case '-':
525
604
  parser->cursor++;
526
605
  while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ':' && *parser->cursor != ')') {
@@ -533,22 +612,57 @@ pm_regexp_parse_group(pm_regexp_parser_t *parser) {
533
612
  if (pm_regexp_char_is_eof(parser)) {
534
613
  return false;
535
614
  }
615
+
616
+ // If we are at the end of the group of options and there is no
617
+ // subexpression, then we are going to be setting the options
618
+ // for the parent group. In this case we are safe to return now.
619
+ if (*parser->cursor == ')') {
620
+ switch (pm_regexp_options_state(&options, 'x')) {
621
+ case PM_REGEXP_OPTION_STATE_ADDED:
622
+ parser->extended_mode = true;
623
+ break;
624
+ case PM_REGEXP_OPTION_STATE_REMOVED:
625
+ parser->extended_mode = false;
626
+ break;
627
+ }
628
+
629
+ parser->cursor++;
630
+ return true;
631
+ }
632
+
536
633
  break;
537
634
  default:
538
- return false;
635
+ parser->cursor++;
636
+ pm_regexp_parse_error(parser, parser->cursor - 1, parser->cursor, "undefined group option");
637
+ break;
539
638
  }
540
639
  }
541
640
 
641
+ bool extended_mode = parser->extended_mode;
642
+ switch (pm_regexp_options_state(&options, 'x')) {
643
+ case PM_REGEXP_OPTION_STATE_ADDED:
644
+ parser->extended_mode = true;
645
+ break;
646
+ case PM_REGEXP_OPTION_STATE_REMOVED:
647
+ parser->extended_mode = false;
648
+ break;
649
+ }
650
+
542
651
  // Now, parse the expressions within this group.
543
652
  while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ')') {
544
- if (!pm_regexp_parse_expression(parser)) {
653
+ if (!pm_regexp_parse_expression(parser, (uint16_t) (depth + 1))) {
654
+ parser->extended_mode = extended_mode;
545
655
  return false;
546
656
  }
547
657
  pm_regexp_char_accept(parser, '|');
548
658
  }
549
659
 
550
660
  // Finally, make sure we have a closing parenthesis.
551
- return pm_regexp_char_expect(parser, ')');
661
+ parser->extended_mode = extended_mode;
662
+ if (pm_regexp_char_expect(parser, ')')) return true;
663
+
664
+ pm_regexp_parse_error(parser, group_start, parser->cursor, "end pattern with unmatched parenthesis");
665
+ return false;
552
666
  }
553
667
 
554
668
  /**
@@ -564,22 +678,53 @@ pm_regexp_parse_group(pm_regexp_parser_t *parser) {
564
678
  * ;
565
679
  */
566
680
  static bool
567
- pm_regexp_parse_item(pm_regexp_parser_t *parser) {
568
- switch (*parser->cursor++) {
681
+ pm_regexp_parse_item(pm_regexp_parser_t *parser, uint16_t depth) {
682
+ switch (*parser->cursor) {
569
683
  case '^':
570
684
  case '$':
571
- return true;
685
+ parser->cursor++;
686
+ return pm_regexp_parse_quantifier(parser);
572
687
  case '\\':
688
+ parser->cursor++;
573
689
  if (!pm_regexp_char_is_eof(parser)) {
574
690
  parser->cursor++;
575
691
  }
576
692
  return pm_regexp_parse_quantifier(parser);
577
693
  case '(':
578
- return pm_regexp_parse_group(parser) && pm_regexp_parse_quantifier(parser);
694
+ parser->cursor++;
695
+ return pm_regexp_parse_group(parser, depth) && pm_regexp_parse_quantifier(parser);
579
696
  case '[':
580
- return pm_regexp_parse_lbracket(parser) && pm_regexp_parse_quantifier(parser);
581
- default:
697
+ parser->cursor++;
698
+ return pm_regexp_parse_lbracket(parser, depth) && pm_regexp_parse_quantifier(parser);
699
+ case '*':
700
+ case '?':
701
+ case '+':
702
+ parser->cursor++;
703
+ pm_regexp_parse_error(parser, parser->cursor - 1, parser->cursor, "target of repeat operator is not specified");
704
+ return true;
705
+ case ')':
706
+ parser->cursor++;
707
+ pm_regexp_parse_error(parser, parser->cursor - 1, parser->cursor, "unmatched close parenthesis");
708
+ return true;
709
+ case '#':
710
+ if (parser->extended_mode) {
711
+ if (!pm_regexp_char_find(parser, '\n')) parser->cursor = parser->end;
712
+ return true;
713
+ }
714
+ PRISM_FALLTHROUGH
715
+ default: {
716
+ size_t width;
717
+ if (!parser->encoding_changed) {
718
+ width = pm_encoding_utf_8_char_width(parser->cursor, (ptrdiff_t) (parser->end - parser->cursor));
719
+ } else {
720
+ width = parser->encoding->char_width(parser->cursor, (ptrdiff_t) (parser->end - parser->cursor));
721
+ }
722
+
723
+ if (width == 0) return false; // TODO: add appropriate error
724
+ parser->cursor += width;
725
+
582
726
  return pm_regexp_parse_quantifier(parser);
727
+ }
583
728
  }
584
729
  }
585
730
 
@@ -588,13 +733,18 @@ pm_regexp_parse_item(pm_regexp_parser_t *parser) {
588
733
  * ;
589
734
  */
590
735
  static bool
591
- pm_regexp_parse_expression(pm_regexp_parser_t *parser) {
592
- if (!pm_regexp_parse_item(parser)) {
736
+ pm_regexp_parse_expression(pm_regexp_parser_t *parser, uint16_t depth) {
737
+ if (depth >= PM_REGEXP_PARSE_DEPTH_MAX) {
738
+ pm_regexp_parse_error(parser, parser->start, parser->end, "parse depth limit over");
739
+ return false;
740
+ }
741
+
742
+ if (!pm_regexp_parse_item(parser, depth)) {
593
743
  return false;
594
744
  }
595
745
 
596
746
  while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ')' && *parser->cursor != '|') {
597
- if (!pm_regexp_parse_item(parser)) {
747
+ if (!pm_regexp_parse_item(parser, depth)) {
598
748
  return false;
599
749
  }
600
750
  }
@@ -610,29 +760,31 @@ pm_regexp_parse_expression(pm_regexp_parser_t *parser) {
610
760
  */
611
761
  static bool
612
762
  pm_regexp_parse_pattern(pm_regexp_parser_t *parser) {
613
- return (
614
- (
615
- // Exit early if the pattern is empty.
616
- pm_regexp_char_is_eof(parser) ||
617
- // Parse the first expression in the pattern.
618
- pm_regexp_parse_expression(parser)
619
- ) &&
620
- (
621
- // Return now if we've parsed the entire pattern.
622
- pm_regexp_char_is_eof(parser) ||
623
- // Otherwise, we should have a pipe character.
624
- (pm_regexp_char_expect(parser, '|') && pm_regexp_parse_pattern(parser))
625
- )
626
- );
763
+ do {
764
+ if (pm_regexp_char_is_eof(parser)) return true;
765
+ if (!pm_regexp_parse_expression(parser, 0)) return false;
766
+ } while (pm_regexp_char_accept(parser, '|'));
767
+
768
+ return pm_regexp_char_is_eof(parser);
627
769
  }
628
770
 
629
771
  /**
630
772
  * Parse a regular expression and extract the names of all of the named capture
631
773
  * groups.
632
774
  */
633
- PRISM_EXPORTED_FUNCTION bool
634
- pm_regexp_named_capture_group_names(const uint8_t *source, size_t size, pm_string_list_t *named_captures, bool encoding_changed, const pm_encoding_t *encoding) {
635
- pm_regexp_parser_t parser;
636
- pm_regexp_parser_init(&parser, source, source + size, named_captures, encoding_changed, encoding);
637
- return pm_regexp_parse_pattern(&parser);
775
+ PRISM_EXPORTED_FUNCTION void
776
+ pm_regexp_parse(pm_parser_t *parser, const uint8_t *source, size_t size, bool extended_mode, pm_regexp_name_callback_t name_callback, void *name_data, pm_regexp_error_callback_t error_callback, void *error_data) {
777
+ pm_regexp_parse_pattern(&(pm_regexp_parser_t) {
778
+ .parser = parser,
779
+ .start = source,
780
+ .cursor = source,
781
+ .end = source + size,
782
+ .extended_mode = extended_mode,
783
+ .encoding_changed = parser->encoding_changed,
784
+ .encoding = parser->encoding,
785
+ .name_callback = name_callback,
786
+ .name_data = name_data,
787
+ .error_callback = error_callback,
788
+ .error_data = error_data
789
+ });
638
790
  }