prism 1.3.0 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +24 -1
  3. data/config.yml +9 -0
  4. data/docs/releasing.md +1 -1
  5. data/docs/ruby_api.md +1 -1
  6. data/ext/prism/api_node.c +1814 -1303
  7. data/ext/prism/extension.c +230 -109
  8. data/ext/prism/extension.h +4 -4
  9. data/include/prism/ast.h +16 -0
  10. data/include/prism/defines.h +4 -1
  11. data/include/prism/options.h +47 -1
  12. data/include/prism/util/pm_buffer.h +10 -0
  13. data/include/prism/version.h +2 -2
  14. data/include/prism.h +4 -4
  15. data/lib/prism/dot_visitor.rb +16 -0
  16. data/lib/prism/dsl.rb +10 -2
  17. data/lib/prism/ffi.rb +45 -27
  18. data/lib/prism/inspect_visitor.rb +2 -1
  19. data/lib/prism/node.rb +48 -10
  20. data/lib/prism/parse_result/newlines.rb +1 -1
  21. data/lib/prism/parse_result.rb +52 -0
  22. data/lib/prism/polyfill/append_as_bytes.rb +15 -0
  23. data/lib/prism/reflection.rb +2 -2
  24. data/lib/prism/serialize.rb +1252 -765
  25. data/lib/prism/translation/parser/builder.rb +61 -0
  26. data/lib/prism/translation/parser/compiler.rb +192 -136
  27. data/lib/prism/translation/parser/lexer.rb +435 -61
  28. data/lib/prism/translation/parser.rb +51 -3
  29. data/lib/prism/translation/parser35.rb +12 -0
  30. data/lib/prism/translation/ripper.rb +13 -3
  31. data/lib/prism/translation/ruby_parser.rb +5 -4
  32. data/lib/prism/translation.rb +1 -0
  33. data/lib/prism.rb +3 -3
  34. data/prism.gemspec +5 -1
  35. data/rbi/prism/dsl.rbi +6 -3
  36. data/rbi/prism/node.rbi +22 -7
  37. data/rbi/prism/parse_result.rbi +17 -0
  38. data/rbi/prism/translation/parser35.rbi +6 -0
  39. data/rbi/prism.rbi +39 -36
  40. data/sig/prism/dsl.rbs +4 -2
  41. data/sig/prism/node.rbs +17 -7
  42. data/sig/prism/parse_result.rbs +10 -0
  43. data/sig/prism/serialize.rbs +4 -2
  44. data/sig/prism.rbs +22 -1
  45. data/src/diagnostic.c +2 -2
  46. data/src/node.c +21 -0
  47. data/src/options.c +31 -0
  48. data/src/prettyprint.c +30 -0
  49. data/src/prism.c +374 -118
  50. data/src/serialize.c +6 -0
  51. data/src/util/pm_buffer.c +40 -0
  52. data/src/util/pm_constant_pool.c +6 -2
  53. data/src/util/pm_strncasecmp.c +13 -1
  54. metadata +7 -7
data/src/prism.c CHANGED
@@ -1649,22 +1649,25 @@ pm_arguments_validate_block(pm_parser_t *parser, pm_arguments_t *arguments, pm_b
1649
1649
  * the function pointer or can just directly use the UTF-8 functions.
1650
1650
  */
1651
1651
  static inline size_t
1652
- char_is_identifier_start(const pm_parser_t *parser, const uint8_t *b) {
1652
+ char_is_identifier_start(const pm_parser_t *parser, const uint8_t *b, ptrdiff_t n) {
1653
+ if (n <= 0) return 0;
1654
+
1653
1655
  if (parser->encoding_changed) {
1654
1656
  size_t width;
1655
- if ((width = parser->encoding->alpha_char(b, parser->end - b)) != 0) {
1657
+
1658
+ if ((width = parser->encoding->alpha_char(b, n)) != 0) {
1656
1659
  return width;
1657
1660
  } else if (*b == '_') {
1658
1661
  return 1;
1659
1662
  } else if (*b >= 0x80) {
1660
- return parser->encoding->char_width(b, parser->end - b);
1663
+ return parser->encoding->char_width(b, n);
1661
1664
  } else {
1662
1665
  return 0;
1663
1666
  }
1664
1667
  } else if (*b < 0x80) {
1665
1668
  return (pm_encoding_unicode_table[*b] & PRISM_ENCODING_ALPHABETIC_BIT ? 1 : 0) || (*b == '_');
1666
1669
  } else {
1667
- return pm_encoding_utf_8_char_width(b, parser->end - b);
1670
+ return pm_encoding_utf_8_char_width(b, n);
1668
1671
  }
1669
1672
  }
1670
1673
 
@@ -1673,11 +1676,13 @@ char_is_identifier_start(const pm_parser_t *parser, const uint8_t *b) {
1673
1676
  * has not been changed.
1674
1677
  */
1675
1678
  static inline size_t
1676
- char_is_identifier_utf8(const uint8_t *b, const uint8_t *end) {
1677
- if (*b < 0x80) {
1679
+ char_is_identifier_utf8(const uint8_t *b, ptrdiff_t n) {
1680
+ if (n <= 0) {
1681
+ return 0;
1682
+ } else if (*b < 0x80) {
1678
1683
  return (*b == '_') || (pm_encoding_unicode_table[*b] & PRISM_ENCODING_ALPHANUMERIC_BIT ? 1 : 0);
1679
1684
  } else {
1680
- return pm_encoding_utf_8_char_width(b, end - b);
1685
+ return pm_encoding_utf_8_char_width(b, n);
1681
1686
  }
1682
1687
  }
1683
1688
 
@@ -1687,20 +1692,24 @@ char_is_identifier_utf8(const uint8_t *b, const uint8_t *end) {
1687
1692
  * it's important that it be as fast as possible.
1688
1693
  */
1689
1694
  static inline size_t
1690
- char_is_identifier(const pm_parser_t *parser, const uint8_t *b) {
1691
- if (parser->encoding_changed) {
1695
+ char_is_identifier(const pm_parser_t *parser, const uint8_t *b, ptrdiff_t n) {
1696
+ if (n <= 0) {
1697
+ return 0;
1698
+ } else if (parser->encoding_changed) {
1692
1699
  size_t width;
1693
- if ((width = parser->encoding->alnum_char(b, parser->end - b)) != 0) {
1700
+
1701
+ if ((width = parser->encoding->alnum_char(b, n)) != 0) {
1694
1702
  return width;
1695
1703
  } else if (*b == '_') {
1696
1704
  return 1;
1697
1705
  } else if (*b >= 0x80) {
1698
- return parser->encoding->char_width(b, parser->end - b);
1706
+ return parser->encoding->char_width(b, n);
1699
1707
  } else {
1700
1708
  return 0;
1701
1709
  }
1710
+ } else {
1711
+ return char_is_identifier_utf8(b, n);
1702
1712
  }
1703
- return char_is_identifier_utf8(b, parser->end);
1704
1713
  }
1705
1714
 
1706
1715
  // Here we're defining a perfect hash for the characters that are allowed in
@@ -1731,9 +1740,10 @@ char_is_global_name_punctuation(const uint8_t b) {
1731
1740
  static inline bool
1732
1741
  token_is_setter_name(pm_token_t *token) {
1733
1742
  return (
1734
- (token->type == PM_TOKEN_IDENTIFIER) &&
1743
+ (token->type == PM_TOKEN_BRACKET_LEFT_RIGHT_EQUAL) ||
1744
+ ((token->type == PM_TOKEN_IDENTIFIER) &&
1735
1745
  (token->end - token->start >= 2) &&
1736
- (token->end[-1] == '=')
1746
+ (token->end[-1] == '='))
1737
1747
  );
1738
1748
  }
1739
1749
 
@@ -2895,7 +2905,7 @@ pm_call_node_writable_p(const pm_parser_t *parser, const pm_call_node_t *node) {
2895
2905
  (node->message_loc.start != NULL) &&
2896
2906
  (node->message_loc.end[-1] != '!') &&
2897
2907
  (node->message_loc.end[-1] != '?') &&
2898
- char_is_identifier_start(parser, node->message_loc.start) &&
2908
+ char_is_identifier_start(parser, node->message_loc.start, parser->end - node->message_loc.start) &&
2899
2909
  (node->opening_loc.start == NULL) &&
2900
2910
  (node->arguments == NULL) &&
2901
2911
  (node->block == NULL)
@@ -5318,6 +5328,12 @@ pm_interpolated_string_node_append(pm_interpolated_string_node_t *node, pm_node_
5318
5328
  // should clear the mutability flags.
5319
5329
  CLEAR_FLAGS(node);
5320
5330
  break;
5331
+ case PM_X_STRING_NODE:
5332
+ case PM_INTERPOLATED_X_STRING_NODE:
5333
+ // If this is an x string, then this is a syntax error. But we want
5334
+ // to handle it here so that we don't fail the assertion.
5335
+ CLEAR_FLAGS(node);
5336
+ break;
5321
5337
  default:
5322
5338
  assert(false && "unexpected node type");
5323
5339
  break;
@@ -5652,7 +5668,7 @@ pm_lambda_node_create(
5652
5668
  */
5653
5669
  static pm_local_variable_and_write_node_t *
5654
5670
  pm_local_variable_and_write_node_create(pm_parser_t *parser, pm_node_t *target, const pm_token_t *operator, pm_node_t *value, pm_constant_id_t name, uint32_t depth) {
5655
- assert(PM_NODE_TYPE_P(target, PM_LOCAL_VARIABLE_READ_NODE) || PM_NODE_TYPE_P(target, PM_CALL_NODE));
5671
+ assert(PM_NODE_TYPE_P(target, PM_LOCAL_VARIABLE_READ_NODE) || PM_NODE_TYPE_P(target, PM_IT_LOCAL_VARIABLE_READ_NODE) || PM_NODE_TYPE_P(target, PM_CALL_NODE));
5656
5672
  assert(operator->type == PM_TOKEN_AMPERSAND_AMPERSAND_EQUAL);
5657
5673
  pm_local_variable_and_write_node_t *node = PM_NODE_ALLOC(parser, pm_local_variable_and_write_node_t);
5658
5674
 
@@ -5707,7 +5723,7 @@ pm_local_variable_operator_write_node_create(pm_parser_t *parser, pm_node_t *tar
5707
5723
  */
5708
5724
  static pm_local_variable_or_write_node_t *
5709
5725
  pm_local_variable_or_write_node_create(pm_parser_t *parser, pm_node_t *target, const pm_token_t *operator, pm_node_t *value, pm_constant_id_t name, uint32_t depth) {
5710
- assert(PM_NODE_TYPE_P(target, PM_LOCAL_VARIABLE_READ_NODE) || PM_NODE_TYPE_P(target, PM_CALL_NODE));
5726
+ assert(PM_NODE_TYPE_P(target, PM_LOCAL_VARIABLE_READ_NODE) || PM_NODE_TYPE_P(target, PM_IT_LOCAL_VARIABLE_READ_NODE) || PM_NODE_TYPE_P(target, PM_CALL_NODE));
5711
5727
  assert(operator->type == PM_TOKEN_PIPE_PIPE_EQUAL);
5712
5728
  pm_local_variable_or_write_node_t *node = PM_NODE_ALLOC(parser, pm_local_variable_or_write_node_t);
5713
5729
 
@@ -6159,7 +6175,10 @@ pm_numbered_reference_read_node_number(pm_parser_t *parser, const pm_token_t *to
6159
6175
  const uint8_t *end = token->end;
6160
6176
 
6161
6177
  ptrdiff_t diff = end - start;
6162
- assert(diff > 0 && ((unsigned long) diff < SIZE_MAX));
6178
+ assert(diff > 0);
6179
+ #if PTRDIFF_MAX > SIZE_MAX
6180
+ assert(diff < (ptrdiff_t) SIZE_MAX);
6181
+ #endif
6163
6182
  size_t length = (size_t) diff;
6164
6183
 
6165
6184
  char *digits = xcalloc(length + 1, sizeof(char));
@@ -6393,12 +6412,13 @@ pm_program_node_create(pm_parser_t *parser, pm_constant_id_list_t *locals, pm_st
6393
6412
  * Allocate and initialize new ParenthesesNode node.
6394
6413
  */
6395
6414
  static pm_parentheses_node_t *
6396
- pm_parentheses_node_create(pm_parser_t *parser, const pm_token_t *opening, pm_node_t *body, const pm_token_t *closing) {
6415
+ pm_parentheses_node_create(pm_parser_t *parser, const pm_token_t *opening, pm_node_t *body, const pm_token_t *closing, pm_node_flags_t flags) {
6397
6416
  pm_parentheses_node_t *node = PM_NODE_ALLOC(parser, pm_parentheses_node_t);
6398
6417
 
6399
6418
  *node = (pm_parentheses_node_t) {
6400
6419
  {
6401
6420
  .type = PM_PARENTHESES_NODE,
6421
+ .flags = flags,
6402
6422
  .node_id = PM_NODE_IDENTIFY(parser),
6403
6423
  .location = {
6404
6424
  .start = opening->start,
@@ -6665,6 +6685,7 @@ pm_rescue_node_create(pm_parser_t *parser, const pm_token_t *keyword) {
6665
6685
  },
6666
6686
  .keyword_loc = PM_LOCATION_TOKEN_VALUE(keyword),
6667
6687
  .operator_loc = PM_OPTIONAL_LOCATION_NOT_PROVIDED_VALUE,
6688
+ .then_keyword_loc = PM_OPTIONAL_LOCATION_NOT_PROVIDED_VALUE,
6668
6689
  .reference = NULL,
6669
6690
  .statements = NULL,
6670
6691
  .subsequent = NULL,
@@ -9082,10 +9103,10 @@ lex_global_variable(pm_parser_t *parser) {
9082
9103
  parser->current.end++;
9083
9104
  size_t width;
9084
9105
 
9085
- if (parser->current.end < parser->end && (width = char_is_identifier(parser, parser->current.end)) > 0) {
9106
+ if ((width = char_is_identifier(parser, parser->current.end, parser->end - parser->current.end)) > 0) {
9086
9107
  do {
9087
9108
  parser->current.end += width;
9088
- } while (parser->current.end < parser->end && (width = char_is_identifier(parser, parser->current.end)) > 0);
9109
+ } while ((width = char_is_identifier(parser, parser->current.end, parser->end - parser->current.end)) > 0);
9089
9110
 
9090
9111
  // $0 isn't allowed to be followed by anything.
9091
9112
  pm_diagnostic_id_t diag_id = parser->version == PM_OPTIONS_VERSION_CRUBY_3_3 ? PM_ERR_INVALID_VARIABLE_GLOBAL_3_3 : PM_ERR_INVALID_VARIABLE_GLOBAL;
@@ -9114,10 +9135,10 @@ lex_global_variable(pm_parser_t *parser) {
9114
9135
  default: {
9115
9136
  size_t width;
9116
9137
 
9117
- if ((width = char_is_identifier(parser, parser->current.end)) > 0) {
9138
+ if ((width = char_is_identifier(parser, parser->current.end, parser->end - parser->current.end)) > 0) {
9118
9139
  do {
9119
9140
  parser->current.end += width;
9120
- } while (allow_multiple && parser->current.end < parser->end && (width = char_is_identifier(parser, parser->current.end)) > 0);
9141
+ } while (allow_multiple && (width = char_is_identifier(parser, parser->current.end, parser->end - parser->current.end)) > 0);
9121
9142
  } else if (pm_char_is_whitespace(peek(parser))) {
9122
9143
  // If we get here, then we have a $ followed by whitespace,
9123
9144
  // which is not allowed.
@@ -9182,11 +9203,11 @@ lex_identifier(pm_parser_t *parser, bool previous_command_start) {
9182
9203
  bool encoding_changed = parser->encoding_changed;
9183
9204
 
9184
9205
  if (encoding_changed) {
9185
- while (current_end < end && (width = char_is_identifier(parser, current_end)) > 0) {
9206
+ while ((width = char_is_identifier(parser, current_end, end - current_end)) > 0) {
9186
9207
  current_end += width;
9187
9208
  }
9188
9209
  } else {
9189
- while (current_end < end && (width = char_is_identifier_utf8(current_end, end)) > 0) {
9210
+ while ((width = char_is_identifier_utf8(current_end, end - current_end)) > 0) {
9190
9211
  current_end += width;
9191
9212
  }
9192
9213
  }
@@ -9360,7 +9381,7 @@ lex_interpolation(pm_parser_t *parser, const uint8_t *pound) {
9360
9381
  const uint8_t *variable = pound + 2;
9361
9382
  if (*variable == '@' && pound + 3 < parser->end) variable++;
9362
9383
 
9363
- if (char_is_identifier_start(parser, variable)) {
9384
+ if (char_is_identifier_start(parser, variable, parser->end - variable)) {
9364
9385
  // At this point we're sure that we've either hit an embedded instance
9365
9386
  // or class variable. In this case we'll first need to check if we've
9366
9387
  // already consumed content.
@@ -9409,7 +9430,7 @@ lex_interpolation(pm_parser_t *parser, const uint8_t *pound) {
9409
9430
  // or a global name punctuation character, then we've hit an embedded
9410
9431
  // global variable.
9411
9432
  if (
9412
- char_is_identifier_start(parser, check) ||
9433
+ char_is_identifier_start(parser, check, parser->end - check) ||
9413
9434
  (pound[2] != '-' && (pm_char_is_decimal_digit(pound[2]) || char_is_global_name_punctuation(pound[2])))
9414
9435
  ) {
9415
9436
  // In this case we've hit an embedded global variable. First check to
@@ -9541,21 +9562,7 @@ escape_write_unicode(pm_parser_t *parser, pm_buffer_t *buffer, const uint8_t fla
9541
9562
  parser->explicit_encoding = PM_ENCODING_UTF_8_ENTRY;
9542
9563
  }
9543
9564
 
9544
- if (value <= 0x7F) { // 0xxxxxxx
9545
- pm_buffer_append_byte(buffer, (uint8_t) value);
9546
- } else if (value <= 0x7FF) { // 110xxxxx 10xxxxxx
9547
- pm_buffer_append_byte(buffer, (uint8_t) (0xC0 | (value >> 6)));
9548
- pm_buffer_append_byte(buffer, (uint8_t) (0x80 | (value & 0x3F)));
9549
- } else if (value <= 0xFFFF) { // 1110xxxx 10xxxxxx 10xxxxxx
9550
- pm_buffer_append_byte(buffer, (uint8_t) (0xE0 | (value >> 12)));
9551
- pm_buffer_append_byte(buffer, (uint8_t) (0x80 | ((value >> 6) & 0x3F)));
9552
- pm_buffer_append_byte(buffer, (uint8_t) (0x80 | (value & 0x3F)));
9553
- } else if (value <= 0x10FFFF) { // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
9554
- pm_buffer_append_byte(buffer, (uint8_t) (0xF0 | (value >> 18)));
9555
- pm_buffer_append_byte(buffer, (uint8_t) (0x80 | ((value >> 12) & 0x3F)));
9556
- pm_buffer_append_byte(buffer, (uint8_t) (0x80 | ((value >> 6) & 0x3F)));
9557
- pm_buffer_append_byte(buffer, (uint8_t) (0x80 | (value & 0x3F)));
9558
- } else {
9565
+ if (!pm_buffer_append_unicode_codepoint(buffer, value)) {
9559
9566
  pm_parser_err(parser, start, end, PM_ERR_ESCAPE_INVALID_UNICODE);
9560
9567
  pm_buffer_append_byte(buffer, 0xEF);
9561
9568
  pm_buffer_append_byte(buffer, 0xBF);
@@ -9580,28 +9587,6 @@ escape_write_byte_encoded(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t byte
9580
9587
  pm_buffer_append_byte(buffer, byte);
9581
9588
  }
9582
9589
 
9583
- /**
9584
- * Write each byte of the given escaped character into the buffer.
9585
- */
9586
- static inline void
9587
- escape_write_escape_encoded(pm_parser_t *parser, pm_buffer_t *buffer) {
9588
- size_t width;
9589
- if (parser->encoding_changed) {
9590
- width = parser->encoding->char_width(parser->current.end, parser->end - parser->current.end);
9591
- } else {
9592
- width = pm_encoding_utf_8_char_width(parser->current.end, parser->end - parser->current.end);
9593
- }
9594
-
9595
- // TODO: If the character is invalid in the given encoding, then we'll just
9596
- // push one byte into the buffer. This should actually be an error.
9597
- width = (width == 0) ? 1 : width;
9598
-
9599
- for (size_t index = 0; index < width; index++) {
9600
- escape_write_byte_encoded(parser, buffer, *parser->current.end);
9601
- parser->current.end++;
9602
- }
9603
- }
9604
-
9605
9590
  /**
9606
9591
  * The regular expression engine doesn't support the same escape sequences as
9607
9592
  * Ruby does. So first we have to read the escape sequence, and then we have to
@@ -9626,6 +9611,33 @@ escape_write_byte(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular
9626
9611
  escape_write_byte_encoded(parser, buffer, byte);
9627
9612
  }
9628
9613
 
9614
+ /**
9615
+ * Write each byte of the given escaped character into the buffer.
9616
+ */
9617
+ static inline void
9618
+ escape_write_escape_encoded(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expression_buffer, uint8_t flags) {
9619
+ size_t width;
9620
+ if (parser->encoding_changed) {
9621
+ width = parser->encoding->char_width(parser->current.end, parser->end - parser->current.end);
9622
+ } else {
9623
+ width = pm_encoding_utf_8_char_width(parser->current.end, parser->end - parser->current.end);
9624
+ }
9625
+
9626
+ if (width == 1) {
9627
+ escape_write_byte(parser, buffer, regular_expression_buffer, flags, escape_byte(*parser->current.end++, flags));
9628
+ } else if (width > 1) {
9629
+ // Valid multibyte character. Just ignore escape.
9630
+ pm_buffer_t *b = (flags & PM_ESCAPE_FLAG_REGEXP) ? regular_expression_buffer : buffer;
9631
+ pm_buffer_append_bytes(b, parser->current.end, width);
9632
+ parser->current.end += width;
9633
+ } else {
9634
+ // Assume the next character wasn't meant to be part of this escape
9635
+ // sequence since it is invalid. Add an error and move on.
9636
+ parser->current.end++;
9637
+ pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_CONTROL);
9638
+ }
9639
+ }
9640
+
9629
9641
  /**
9630
9642
  * Warn about using a space or a tab character in an escape, as opposed to using
9631
9643
  * \\s or \\t. Note that we can quite copy the source because the warning
@@ -9652,7 +9664,8 @@ escape_read_warn(pm_parser_t *parser, uint8_t flags, uint8_t flag, const char *t
9652
9664
  */
9653
9665
  static void
9654
9666
  escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expression_buffer, uint8_t flags) {
9655
- switch (peek(parser)) {
9667
+ uint8_t peeked = peek(parser);
9668
+ switch (peeked) {
9656
9669
  case '\\': {
9657
9670
  parser->current.end++;
9658
9671
  escape_write_byte(parser, buffer, regular_expression_buffer, flags, escape_byte('\\', flags));
@@ -9722,6 +9735,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expre
9722
9735
  }
9723
9736
  }
9724
9737
 
9738
+ value = escape_byte(value, flags);
9725
9739
  escape_write_byte(parser, buffer, regular_expression_buffer, flags, value);
9726
9740
  return;
9727
9741
  }
@@ -9770,7 +9784,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expre
9770
9784
 
9771
9785
  size_t whitespace;
9772
9786
  while (true) {
9773
- if ((whitespace = pm_strspn_whitespace(parser->current.end, parser->end - parser->current.end)) > 0) {
9787
+ if ((whitespace = pm_strspn_inline_whitespace(parser->current.end, parser->end - parser->current.end)) > 0) {
9774
9788
  parser->current.end += whitespace;
9775
9789
  } else if (peek(parser) == '\\' && peek_offset(parser, 1) == 'n') {
9776
9790
  // This is super hacky, but it gets us nicer error
@@ -9818,7 +9832,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expre
9818
9832
  uint32_t value = escape_unicode(parser, unicode_start, hexadecimal_length);
9819
9833
  escape_write_unicode(parser, buffer, flags, unicode_start, parser->current.end, value);
9820
9834
 
9821
- parser->current.end += pm_strspn_whitespace(parser->current.end, parser->end - parser->current.end);
9835
+ parser->current.end += pm_strspn_inline_whitespace(parser->current.end, parser->end - parser->current.end);
9822
9836
  }
9823
9837
 
9824
9838
  // ?\u{nnnn} character literal should contain only one codepoint
@@ -10049,8 +10063,13 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expre
10049
10063
  PRISM_FALLTHROUGH
10050
10064
  }
10051
10065
  default: {
10066
+ if ((flags & (PM_ESCAPE_FLAG_CONTROL | PM_ESCAPE_FLAG_META)) && !char_is_ascii_printable(peeked)) {
10067
+ size_t width = parser->encoding->char_width(parser->current.end, parser->end - parser->current.end);
10068
+ pm_parser_err(parser, parser->current.start, parser->current.end + width, PM_ERR_ESCAPE_INVALID_META);
10069
+ return;
10070
+ }
10052
10071
  if (parser->current.end < parser->end) {
10053
- escape_write_escape_encoded(parser, buffer);
10072
+ escape_write_escape_encoded(parser, buffer, regular_expression_buffer, flags);
10054
10073
  } else {
10055
10074
  pm_parser_err_current(parser, PM_ERR_INVALID_ESCAPE_CHARACTER);
10056
10075
  }
@@ -10123,7 +10142,7 @@ lex_question_mark(pm_parser_t *parser) {
10123
10142
  !(parser->encoding->alnum_char(parser->current.end, parser->end - parser->current.end) || peek(parser) == '_') ||
10124
10143
  (
10125
10144
  (parser->current.end + encoding_width >= parser->end) ||
10126
- !char_is_identifier(parser, parser->current.end + encoding_width)
10145
+ !char_is_identifier(parser, parser->current.end + encoding_width, parser->end - (parser->current.end + encoding_width))
10127
10146
  )
10128
10147
  ) {
10129
10148
  lex_state_set(parser, PM_LEX_STATE_END);
@@ -10143,21 +10162,22 @@ lex_question_mark(pm_parser_t *parser) {
10143
10162
  static pm_token_type_t
10144
10163
  lex_at_variable(pm_parser_t *parser) {
10145
10164
  pm_token_type_t type = match(parser, '@') ? PM_TOKEN_CLASS_VARIABLE : PM_TOKEN_INSTANCE_VARIABLE;
10146
- size_t width;
10165
+ const uint8_t *end = parser->end;
10147
10166
 
10148
- if (parser->current.end < parser->end && (width = char_is_identifier_start(parser, parser->current.end)) > 0) {
10167
+ size_t width;
10168
+ if ((width = char_is_identifier_start(parser, parser->current.end, end - parser->current.end)) > 0) {
10149
10169
  parser->current.end += width;
10150
10170
 
10151
- while (parser->current.end < parser->end && (width = char_is_identifier(parser, parser->current.end)) > 0) {
10171
+ while ((width = char_is_identifier(parser, parser->current.end, end - parser->current.end)) > 0) {
10152
10172
  parser->current.end += width;
10153
10173
  }
10154
- } else if (parser->current.end < parser->end && pm_char_is_decimal_digit(*parser->current.end)) {
10174
+ } else if (parser->current.end < end && pm_char_is_decimal_digit(*parser->current.end)) {
10155
10175
  pm_diagnostic_id_t diag_id = (type == PM_TOKEN_CLASS_VARIABLE) ? PM_ERR_INCOMPLETE_VARIABLE_CLASS : PM_ERR_INCOMPLETE_VARIABLE_INSTANCE;
10156
10176
  if (parser->version == PM_OPTIONS_VERSION_CRUBY_3_3) {
10157
10177
  diag_id = (type == PM_TOKEN_CLASS_VARIABLE) ? PM_ERR_INCOMPLETE_VARIABLE_CLASS_3_3 : PM_ERR_INCOMPLETE_VARIABLE_INSTANCE_3_3;
10158
10178
  }
10159
10179
 
10160
- size_t width = parser->encoding->char_width(parser->current.end, parser->end - parser->current.end);
10180
+ size_t width = parser->encoding->char_width(parser->current.end, end - parser->current.end);
10161
10181
  PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, diag_id, (int) ((parser->current.end + width) - parser->current.start), (const char *) parser->current.start);
10162
10182
  } else {
10163
10183
  pm_diagnostic_id_t diag_id = (type == PM_TOKEN_CLASS_VARIABLE) ? PM_ERR_CLASS_VARIABLE_BARE : PM_ERR_INSTANCE_VARIABLE_BARE;
@@ -11145,13 +11165,13 @@ parser_lex(pm_parser_t *parser) {
11145
11165
 
11146
11166
  if (parser->current.end >= parser->end) {
11147
11167
  parser->current.end = end;
11148
- } else if (quote == PM_HEREDOC_QUOTE_NONE && (width = char_is_identifier(parser, parser->current.end)) == 0) {
11168
+ } else if (quote == PM_HEREDOC_QUOTE_NONE && (width = char_is_identifier(parser, parser->current.end, parser->end - parser->current.end)) == 0) {
11149
11169
  parser->current.end = end;
11150
11170
  } else {
11151
11171
  if (quote == PM_HEREDOC_QUOTE_NONE) {
11152
11172
  parser->current.end += width;
11153
11173
 
11154
- while ((parser->current.end < parser->end) && (width = char_is_identifier(parser, parser->current.end))) {
11174
+ while ((width = char_is_identifier(parser, parser->current.end, parser->end - parser->current.end))) {
11155
11175
  parser->current.end += width;
11156
11176
  }
11157
11177
  } else {
@@ -11336,7 +11356,7 @@ parser_lex(pm_parser_t *parser) {
11336
11356
  } else {
11337
11357
  const uint8_t delim = peek_offset(parser, 1);
11338
11358
 
11339
- if ((delim != '\'') && (delim != '"') && !char_is_identifier(parser, parser->current.end + 1)) {
11359
+ if ((delim != '\'') && (delim != '"') && !char_is_identifier(parser, parser->current.end + 1, parser->end - (parser->current.end + 1))) {
11340
11360
  pm_parser_warn_token(parser, &parser->current, PM_WARN_AMBIGUOUS_PREFIX_AMPERSAND);
11341
11361
  }
11342
11362
  }
@@ -11774,7 +11794,7 @@ parser_lex(pm_parser_t *parser) {
11774
11794
 
11775
11795
  default: {
11776
11796
  if (*parser->current.start != '_') {
11777
- size_t width = char_is_identifier_start(parser, parser->current.start);
11797
+ size_t width = char_is_identifier_start(parser, parser->current.start, parser->end - parser->current.start);
11778
11798
 
11779
11799
  // If this isn't the beginning of an identifier, then
11780
11800
  // it's an invalid token as we've exhausted all of the
@@ -12965,7 +12985,7 @@ typedef struct {
12965
12985
 
12966
12986
  pm_binding_powers_t pm_binding_powers[PM_TOKEN_MAXIMUM] = {
12967
12987
  // rescue
12968
- [PM_TOKEN_KEYWORD_RESCUE_MODIFIER] = LEFT_ASSOCIATIVE(PM_BINDING_POWER_MODIFIER_RESCUE),
12988
+ [PM_TOKEN_KEYWORD_RESCUE_MODIFIER] = { PM_BINDING_POWER_MODIFIER_RESCUE, PM_BINDING_POWER_COMPOSITION, true, false },
12969
12989
 
12970
12990
  // if unless until while
12971
12991
  [PM_TOKEN_KEYWORD_IF_MODIFIER] = LEFT_ASSOCIATIVE(PM_BINDING_POWER_MODIFIER),
@@ -13708,7 +13728,7 @@ parse_write(pm_parser_t *parser, pm_node_t *target, pm_token_t *operator, pm_nod
13708
13728
  return target;
13709
13729
  }
13710
13730
 
13711
- if (char_is_identifier_start(parser, call->message_loc.start)) {
13731
+ if (char_is_identifier_start(parser, call->message_loc.start, parser->end - call->message_loc.start)) {
13712
13732
  // When we get here, we have a method call, because it was
13713
13733
  // previously marked as a method call but now we have an =. This
13714
13734
  // looks like:
@@ -13936,6 +13956,15 @@ parse_statements(pm_parser_t *parser, pm_context_t context, uint16_t depth) {
13936
13956
  if (PM_NODE_TYPE_P(node, PM_MISSING_NODE)) {
13937
13957
  parser_lex(parser);
13938
13958
 
13959
+ // If we are at the end of the file, then we need to stop parsing
13960
+ // the statements entirely at this point. Mark the parser as
13961
+ // recovering, as we know that EOF closes the top-level context, and
13962
+ // then break out of the loop.
13963
+ if (match1(parser, PM_TOKEN_EOF)) {
13964
+ parser->recovering = true;
13965
+ break;
13966
+ }
13967
+
13939
13968
  while (accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON));
13940
13969
  if (context_terminator(context, &parser->current)) break;
13941
13970
  } else if (!accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_EOF)) {
@@ -15051,8 +15080,8 @@ parse_rescues(pm_parser_t *parser, size_t opening_newline_index, const pm_token_
15051
15080
  case PM_TOKEN_NEWLINE:
15052
15081
  case PM_TOKEN_SEMICOLON:
15053
15082
  case PM_TOKEN_KEYWORD_THEN:
15054
- // Here we have a terminator for the rescue keyword, in which case we're
15055
- // going to just continue on.
15083
+ // Here we have a terminator for the rescue keyword, in which
15084
+ // case we're going to just continue on.
15056
15085
  break;
15057
15086
  default: {
15058
15087
  if (token_begins_expression_p(parser->current.type) || match1(parser, PM_TOKEN_USTAR)) {
@@ -15084,9 +15113,12 @@ parse_rescues(pm_parser_t *parser, size_t opening_newline_index, const pm_token_
15084
15113
  }
15085
15114
 
15086
15115
  if (accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON)) {
15087
- accept1(parser, PM_TOKEN_KEYWORD_THEN);
15116
+ if (accept1(parser, PM_TOKEN_KEYWORD_THEN)) {
15117
+ rescue->then_keyword_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(&parser->previous);
15118
+ }
15088
15119
  } else {
15089
15120
  expect1(parser, PM_TOKEN_KEYWORD_THEN, PM_ERR_RESCUE_TERM);
15121
+ rescue->then_keyword_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(&parser->previous);
15090
15122
  }
15091
15123
 
15092
15124
  if (!match3(parser, PM_TOKEN_KEYWORD_ELSE, PM_TOKEN_KEYWORD_ENSURE, PM_TOKEN_KEYWORD_END)) {
@@ -16802,6 +16834,10 @@ parse_strings(pm_parser_t *parser, pm_node_t *current, bool accepts_label, uint1
16802
16834
  // If we haven't already created our container for concatenation,
16803
16835
  // we'll do that now.
16804
16836
  if (!concating) {
16837
+ if (!PM_NODE_TYPE_P(current, PM_STRING_NODE) && !PM_NODE_TYPE_P(current, PM_INTERPOLATED_STRING_NODE)) {
16838
+ pm_parser_err_node(parser, current, PM_ERR_STRING_CONCATENATION);
16839
+ }
16840
+
16805
16841
  concating = true;
16806
16842
  pm_token_t bounds = not_provided(parser);
16807
16843
 
@@ -17040,7 +17076,7 @@ pm_slice_is_valid_local(const pm_parser_t *parser, const uint8_t *start, const u
17040
17076
  if (length == 0) return false;
17041
17077
 
17042
17078
  // First ensure that it starts with a valid identifier starting character.
17043
- size_t width = char_is_identifier_start(parser, start);
17079
+ size_t width = char_is_identifier_start(parser, start, end - start);
17044
17080
  if (width == 0) return false;
17045
17081
 
17046
17082
  // Next, ensure that it's not an uppercase character.
@@ -17053,7 +17089,7 @@ pm_slice_is_valid_local(const pm_parser_t *parser, const uint8_t *start, const u
17053
17089
  // Next, iterate through all of the bytes of the string to ensure that they
17054
17090
  // are all valid identifier characters.
17055
17091
  const uint8_t *cursor = start + width;
17056
- while ((cursor < end) && (width = char_is_identifier(parser, cursor))) cursor += width;
17092
+ while ((width = char_is_identifier(parser, cursor, end - cursor))) cursor += width;
17057
17093
  return cursor == end;
17058
17094
  }
17059
17095
 
@@ -17526,7 +17562,7 @@ parse_pattern_primitives(pm_parser_t *parser, pm_constant_id_list_t *captures, p
17526
17562
  pm_node_t *body = parse_pattern(parser, captures, PM_PARSE_PATTERN_SINGLE, PM_ERR_PATTERN_EXPRESSION_AFTER_PAREN, (uint16_t) (depth + 1));
17527
17563
  accept1(parser, PM_TOKEN_NEWLINE);
17528
17564
  expect1(parser, PM_TOKEN_PARENTHESIS_RIGHT, PM_ERR_PATTERN_TERM_PAREN);
17529
- pm_node_t *right = (pm_node_t *) pm_parentheses_node_create(parser, &opening, body, &parser->previous);
17565
+ pm_node_t *right = (pm_node_t *) pm_parentheses_node_create(parser, &opening, body, &parser->previous, 0);
17530
17566
 
17531
17567
  if (node == NULL) {
17532
17568
  node = right;
@@ -18149,12 +18185,19 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
18149
18185
  case PM_TOKEN_PARENTHESIS_LEFT:
18150
18186
  case PM_TOKEN_PARENTHESIS_LEFT_PARENTHESES: {
18151
18187
  pm_token_t opening = parser->current;
18188
+ pm_node_flags_t flags = 0;
18152
18189
 
18153
18190
  pm_node_list_t current_block_exits = { 0 };
18154
18191
  pm_node_list_t *previous_block_exits = push_block_exits(parser, &current_block_exits);
18155
18192
 
18156
18193
  parser_lex(parser);
18157
- while (accept2(parser, PM_TOKEN_SEMICOLON, PM_TOKEN_NEWLINE));
18194
+ while (true) {
18195
+ if (accept1(parser, PM_TOKEN_SEMICOLON)) {
18196
+ flags |= PM_PARENTHESES_NODE_FLAGS_MULTIPLE_STATEMENTS;
18197
+ } else if (!accept1(parser, PM_TOKEN_NEWLINE)) {
18198
+ break;
18199
+ }
18200
+ }
18158
18201
 
18159
18202
  // If this is the end of the file or we match a right parenthesis, then
18160
18203
  // we have an empty parentheses node, and we can immediately return.
@@ -18164,7 +18207,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
18164
18207
  pop_block_exits(parser, previous_block_exits);
18165
18208
  pm_node_list_free(&current_block_exits);
18166
18209
 
18167
- return (pm_node_t *) pm_parentheses_node_create(parser, &opening, NULL, &parser->previous);
18210
+ return (pm_node_t *) pm_parentheses_node_create(parser, &opening, NULL, &parser->previous, flags);
18168
18211
  }
18169
18212
 
18170
18213
  // Otherwise, we're going to parse the first statement in the list
@@ -18177,9 +18220,23 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
18177
18220
  // Determine if this statement is followed by a terminator. In the
18178
18221
  // case of a single statement, this is fine. But in the case of
18179
18222
  // multiple statements it's required.
18180
- bool terminator_found = accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON);
18223
+ bool terminator_found = false;
18224
+
18225
+ if (accept1(parser, PM_TOKEN_SEMICOLON)) {
18226
+ terminator_found = true;
18227
+ flags |= PM_PARENTHESES_NODE_FLAGS_MULTIPLE_STATEMENTS;
18228
+ } else if (accept1(parser, PM_TOKEN_NEWLINE)) {
18229
+ terminator_found = true;
18230
+ }
18231
+
18181
18232
  if (terminator_found) {
18182
- while (accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON));
18233
+ while (true) {
18234
+ if (accept1(parser, PM_TOKEN_SEMICOLON)) {
18235
+ flags |= PM_PARENTHESES_NODE_FLAGS_MULTIPLE_STATEMENTS;
18236
+ } else if (!accept1(parser, PM_TOKEN_NEWLINE)) {
18237
+ break;
18238
+ }
18239
+ }
18183
18240
  }
18184
18241
 
18185
18242
  // If we hit a right parenthesis, then we're done parsing the
@@ -18251,13 +18308,15 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
18251
18308
  pm_statements_node_t *statements = pm_statements_node_create(parser);
18252
18309
  pm_statements_node_body_append(parser, statements, statement, true);
18253
18310
 
18254
- return (pm_node_t *) pm_parentheses_node_create(parser, &opening, (pm_node_t *) statements, &parser->previous);
18311
+ return (pm_node_t *) pm_parentheses_node_create(parser, &opening, (pm_node_t *) statements, &parser->previous, flags);
18255
18312
  }
18256
18313
 
18257
18314
  // If we have more than one statement in the set of parentheses,
18258
18315
  // then we are going to parse all of them as a list of statements.
18259
18316
  // We'll do that here.
18260
18317
  context_push(parser, PM_CONTEXT_PARENS);
18318
+ flags |= PM_PARENTHESES_NODE_FLAGS_MULTIPLE_STATEMENTS;
18319
+
18261
18320
  pm_statements_node_t *statements = pm_statements_node_create(parser);
18262
18321
  pm_statements_node_body_append(parser, statements, statement, true);
18263
18322
 
@@ -18334,7 +18393,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
18334
18393
  pm_node_list_free(&current_block_exits);
18335
18394
 
18336
18395
  pm_void_statements_check(parser, statements, true);
18337
- return (pm_node_t *) pm_parentheses_node_create(parser, &opening, (pm_node_t *) statements, &parser->previous);
18396
+ return (pm_node_t *) pm_parentheses_node_create(parser, &opening, (pm_node_t *) statements, &parser->previous, flags);
18338
18397
  }
18339
18398
  case PM_TOKEN_BRACE_LEFT: {
18340
18399
  // If we were passed a current_hash_keys via the parser, then that
@@ -19380,7 +19439,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
19380
19439
  expect2(parser, PM_TOKEN_DOT, PM_TOKEN_COLON_COLON, PM_ERR_DEF_RECEIVER_TERM);
19381
19440
 
19382
19441
  operator = parser->previous;
19383
- receiver = (pm_node_t *) pm_parentheses_node_create(parser, &lparen, expression, &rparen);
19442
+ receiver = (pm_node_t *) pm_parentheses_node_create(parser, &lparen, expression, &rparen, 0);
19384
19443
 
19385
19444
  // To push `PM_CONTEXT_DEF_PARAMS` again is for the same
19386
19445
  // reason as described the above.
@@ -19467,7 +19526,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
19467
19526
  context_push(parser, PM_CONTEXT_RESCUE_MODIFIER);
19468
19527
 
19469
19528
  pm_token_t rescue_keyword = parser->previous;
19470
- pm_node_t *value = parse_expression(parser, binding_power, false, false, PM_ERR_RESCUE_MODIFIER_VALUE, (uint16_t) (depth + 1));
19529
+ pm_node_t *value = parse_expression(parser, pm_binding_powers[PM_TOKEN_KEYWORD_RESCUE_MODIFIER].right, false, false, PM_ERR_RESCUE_MODIFIER_VALUE, (uint16_t) (depth + 1));
19471
19530
  context_pop(parser);
19472
19531
 
19473
19532
  statement = (pm_node_t *) pm_rescue_modifier_node_create(parser, statement, &rescue_keyword, value);
@@ -19710,11 +19769,12 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
19710
19769
  accept1(parser, PM_TOKEN_NEWLINE);
19711
19770
 
19712
19771
  if (accept1(parser, PM_TOKEN_PARENTHESIS_LEFT)) {
19713
- arguments.opening_loc = PM_LOCATION_TOKEN_VALUE(&parser->previous);
19772
+ pm_token_t lparen = parser->previous;
19714
19773
 
19715
19774
  if (accept1(parser, PM_TOKEN_PARENTHESIS_RIGHT)) {
19716
- arguments.closing_loc = PM_LOCATION_TOKEN_VALUE(&parser->previous);
19775
+ receiver = (pm_node_t *) pm_parentheses_node_create(parser, &lparen, NULL, &parser->previous, 0);
19717
19776
  } else {
19777
+ arguments.opening_loc = PM_LOCATION_TOKEN_VALUE(&lparen);
19718
19778
  receiver = parse_expression(parser, PM_BINDING_POWER_COMPOSITION, true, false, PM_ERR_NOT_EXPRESSION, (uint16_t) (depth + 1));
19719
19779
 
19720
19780
  if (!parser->recovering) {
@@ -20687,7 +20747,7 @@ parse_assignment_value(pm_parser_t *parser, pm_binding_power_t previous_binding_
20687
20747
  pm_token_t rescue = parser->current;
20688
20748
  parser_lex(parser);
20689
20749
 
20690
- pm_node_t *right = parse_expression(parser, binding_power, false, false, PM_ERR_RESCUE_MODIFIER_VALUE, (uint16_t) (depth + 1));
20750
+ pm_node_t *right = parse_expression(parser, pm_binding_powers[PM_TOKEN_KEYWORD_RESCUE_MODIFIER].right, false, false, PM_ERR_RESCUE_MODIFIER_VALUE, (uint16_t) (depth + 1));
20691
20751
  context_pop(parser);
20692
20752
 
20693
20753
  return (pm_node_t *) pm_rescue_modifier_node_create(parser, value, &rescue, right);
@@ -20793,7 +20853,7 @@ parse_assignment_values(pm_parser_t *parser, pm_binding_power_t previous_binding
20793
20853
  }
20794
20854
  }
20795
20855
 
20796
- pm_node_t *right = parse_expression(parser, binding_power, accepts_command_call_inner, false, PM_ERR_RESCUE_MODIFIER_VALUE, (uint16_t) (depth + 1));
20856
+ pm_node_t *right = parse_expression(parser, pm_binding_powers[PM_TOKEN_KEYWORD_RESCUE_MODIFIER].right, accepts_command_call_inner, false, PM_ERR_RESCUE_MODIFIER_VALUE, (uint16_t) (depth + 1));
20797
20857
  context_pop(parser);
20798
20858
 
20799
20859
  return (pm_node_t *) pm_rescue_modifier_node_create(parser, value, &rescue, right);
@@ -20849,6 +20909,123 @@ typedef struct {
20849
20909
  bool shared;
20850
20910
  } parse_regular_expression_named_capture_data_t;
20851
20911
 
20912
+ static inline const uint8_t *
20913
+ pm_named_capture_escape_hex(pm_buffer_t *unescaped, const uint8_t *cursor, const uint8_t *end) {
20914
+ cursor++;
20915
+
20916
+ if (cursor < end && pm_char_is_hexadecimal_digit(*cursor)) {
20917
+ uint8_t value = escape_hexadecimal_digit(*cursor);
20918
+ cursor++;
20919
+
20920
+ if (cursor < end && pm_char_is_hexadecimal_digit(*cursor)) {
20921
+ value = (uint8_t) ((value << 4) | escape_hexadecimal_digit(*cursor));
20922
+ cursor++;
20923
+ }
20924
+
20925
+ pm_buffer_append_byte(unescaped, value);
20926
+ } else {
20927
+ pm_buffer_append_string(unescaped, "\\x", 2);
20928
+ }
20929
+
20930
+ return cursor;
20931
+ }
20932
+
20933
+ static inline const uint8_t *
20934
+ pm_named_capture_escape_octal(pm_buffer_t *unescaped, const uint8_t *cursor, const uint8_t *end) {
20935
+ uint8_t value = (uint8_t) (*cursor - '0');
20936
+ cursor++;
20937
+
20938
+ if (cursor < end && pm_char_is_octal_digit(*cursor)) {
20939
+ value = ((uint8_t) (value << 3)) | ((uint8_t) (*cursor - '0'));
20940
+ cursor++;
20941
+
20942
+ if (cursor < end && pm_char_is_octal_digit(*cursor)) {
20943
+ value = ((uint8_t) (value << 3)) | ((uint8_t) (*cursor - '0'));
20944
+ cursor++;
20945
+ }
20946
+ }
20947
+
20948
+ pm_buffer_append_byte(unescaped, value);
20949
+ return cursor;
20950
+ }
20951
+
20952
+ static inline const uint8_t *
20953
+ pm_named_capture_escape_unicode(pm_parser_t *parser, pm_buffer_t *unescaped, const uint8_t *cursor, const uint8_t *end) {
20954
+ const uint8_t *start = cursor - 1;
20955
+ cursor++;
20956
+
20957
+ if (cursor >= end) {
20958
+ pm_buffer_append_string(unescaped, "\\u", 2);
20959
+ return cursor;
20960
+ }
20961
+
20962
+ if (*cursor != '{') {
20963
+ size_t length = pm_strspn_hexadecimal_digit(cursor, MIN(end - cursor, 4));
20964
+ uint32_t value = escape_unicode(parser, cursor, length);
20965
+
20966
+ if (!pm_buffer_append_unicode_codepoint(unescaped, value)) {
20967
+ pm_buffer_append_string(unescaped, (const char *) start, (size_t) ((cursor + length) - start));
20968
+ }
20969
+
20970
+ return cursor + length;
20971
+ }
20972
+
20973
+ cursor++;
20974
+ for (;;) {
20975
+ while (cursor < end && *cursor == ' ') cursor++;
20976
+
20977
+ if (cursor >= end) break;
20978
+ if (*cursor == '}') {
20979
+ cursor++;
20980
+ break;
20981
+ }
20982
+
20983
+ size_t length = pm_strspn_hexadecimal_digit(cursor, end - cursor);
20984
+ uint32_t value = escape_unicode(parser, cursor, length);
20985
+
20986
+ (void) pm_buffer_append_unicode_codepoint(unescaped, value);
20987
+ cursor += length;
20988
+ }
20989
+
20990
+ return cursor;
20991
+ }
20992
+
20993
+ static void
20994
+ pm_named_capture_escape(pm_parser_t *parser, pm_buffer_t *unescaped, const uint8_t *source, const size_t length, const uint8_t *cursor) {
20995
+ const uint8_t *end = source + length;
20996
+ pm_buffer_append_string(unescaped, (const char *) source, (size_t) (cursor - source));
20997
+
20998
+ for (;;) {
20999
+ if (++cursor >= end) {
21000
+ pm_buffer_append_byte(unescaped, '\\');
21001
+ return;
21002
+ }
21003
+
21004
+ switch (*cursor) {
21005
+ case 'x':
21006
+ cursor = pm_named_capture_escape_hex(unescaped, cursor, end);
21007
+ break;
21008
+ case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7':
21009
+ cursor = pm_named_capture_escape_octal(unescaped, cursor, end);
21010
+ break;
21011
+ case 'u':
21012
+ cursor = pm_named_capture_escape_unicode(parser, unescaped, cursor, end);
21013
+ break;
21014
+ default:
21015
+ pm_buffer_append_byte(unescaped, '\\');
21016
+ break;
21017
+ }
21018
+
21019
+ const uint8_t *next_cursor = pm_memchr(cursor, '\\', (size_t) (end - cursor), parser->encoding_changed, parser->encoding);
21020
+ if (next_cursor == NULL) break;
21021
+
21022
+ pm_buffer_append_string(unescaped, (const char *) cursor, (size_t) (next_cursor - cursor));
21023
+ cursor = next_cursor;
21024
+ }
21025
+
21026
+ pm_buffer_append_string(unescaped, (const char *) cursor, (size_t) (end - cursor));
21027
+ }
21028
+
20852
21029
  /**
20853
21030
  * This callback is called when the regular expression parser encounters a named
20854
21031
  * capture group.
@@ -20863,13 +21040,32 @@ parse_regular_expression_named_capture(const pm_string_t *capture, void *data) {
20863
21040
 
20864
21041
  const uint8_t *source = pm_string_source(capture);
20865
21042
  size_t length = pm_string_length(capture);
21043
+ pm_buffer_t unescaped = { 0 };
21044
+
21045
+ // First, we need to handle escapes within the name of the capture group.
21046
+ // This is because regular expressions have three different representations
21047
+ // in prism. The first is the plain source code. The second is the
21048
+ // representation that will be sent to the regular expression engine, which
21049
+ // is the value of the "unescaped" field. This is poorly named, because it
21050
+ // actually still contains escapes, just a subset of them that the regular
21051
+ // expression engine knows how to handle. The third representation is fully
21052
+ // unescaped, which is what we need.
21053
+ const uint8_t *cursor = pm_memchr(source, '\\', length, parser->encoding_changed, parser->encoding);
21054
+ if (PRISM_UNLIKELY(cursor != NULL)) {
21055
+ pm_named_capture_escape(parser, &unescaped, source, length, cursor);
21056
+ source = (const uint8_t *) pm_buffer_value(&unescaped);
21057
+ length = pm_buffer_length(&unescaped);
21058
+ }
20866
21059
 
20867
21060
  pm_location_t location;
20868
21061
  pm_constant_id_t name;
20869
21062
 
20870
21063
  // If the name of the capture group isn't a valid identifier, we do
20871
21064
  // not add it to the local table.
20872
- if (!pm_slice_is_valid_local(parser, source, source + length)) return;
21065
+ if (!pm_slice_is_valid_local(parser, source, source + length)) {
21066
+ pm_buffer_free(&unescaped);
21067
+ return;
21068
+ }
20873
21069
 
20874
21070
  if (callback_data->shared) {
20875
21071
  // If the unescaped string is a slice of the source, then we can
@@ -20897,7 +21093,10 @@ parse_regular_expression_named_capture(const pm_string_t *capture, void *data) {
20897
21093
  if ((depth = pm_parser_local_depth_constant_id(parser, name)) == -1) {
20898
21094
  // If the local is not already a local but it is a keyword, then we
20899
21095
  // do not want to add a capture for this.
20900
- if (pm_local_is_keyword((const char *) source, length)) return;
21096
+ if (pm_local_is_keyword((const char *) source, length)) {
21097
+ pm_buffer_free(&unescaped);
21098
+ return;
21099
+ }
20901
21100
 
20902
21101
  // If the identifier is not already a local, then we will add it to
20903
21102
  // the local table.
@@ -20915,6 +21114,8 @@ parse_regular_expression_named_capture(const pm_string_t *capture, void *data) {
20915
21114
  pm_node_t *target = (pm_node_t *) pm_local_variable_target_node_create(parser, &location, name, depth == -1 ? 0 : (uint32_t) depth);
20916
21115
  pm_node_list_append(&callback_data->match->targets, target);
20917
21116
  }
21117
+
21118
+ pm_buffer_free(&unescaped);
20918
21119
  }
20919
21120
 
20920
21121
  /**
@@ -21055,7 +21256,23 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t
21055
21256
  pm_node_destroy(parser, node);
21056
21257
  return result;
21057
21258
  }
21259
+ case PM_IT_LOCAL_VARIABLE_READ_NODE: {
21260
+ pm_constant_id_t name = pm_parser_local_add_constant(parser, "it", 2);
21261
+ parser_lex(parser);
21262
+
21263
+ pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_AMPAMPEQ, (uint16_t) (depth + 1));
21264
+ pm_node_t *result = (pm_node_t *) pm_local_variable_and_write_node_create(parser, node, &token, value, name, 0);
21265
+
21266
+ parse_target_implicit_parameter(parser, node);
21267
+ pm_node_destroy(parser, node);
21268
+ return result;
21269
+ }
21058
21270
  case PM_LOCAL_VARIABLE_READ_NODE: {
21271
+ if (pm_token_is_numbered_parameter(node->location.start, node->location.end)) {
21272
+ PM_PARSER_ERR_FORMAT(parser, node->location.start, node->location.end, PM_ERR_PARAMETER_NUMBERED_RESERVED, node->location.start);
21273
+ parse_target_implicit_parameter(parser, node);
21274
+ }
21275
+
21059
21276
  pm_local_variable_read_node_t *cast = (pm_local_variable_read_node_t *) node;
21060
21277
  parser_lex(parser);
21061
21278
 
@@ -21173,7 +21390,23 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t
21173
21390
  pm_node_destroy(parser, node);
21174
21391
  return result;
21175
21392
  }
21393
+ case PM_IT_LOCAL_VARIABLE_READ_NODE: {
21394
+ pm_constant_id_t name = pm_parser_local_add_constant(parser, "it", 2);
21395
+ parser_lex(parser);
21396
+
21397
+ pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_PIPEPIPEEQ, (uint16_t) (depth + 1));
21398
+ pm_node_t *result = (pm_node_t *) pm_local_variable_or_write_node_create(parser, node, &token, value, name, 0);
21399
+
21400
+ parse_target_implicit_parameter(parser, node);
21401
+ pm_node_destroy(parser, node);
21402
+ return result;
21403
+ }
21176
21404
  case PM_LOCAL_VARIABLE_READ_NODE: {
21405
+ if (pm_token_is_numbered_parameter(node->location.start, node->location.end)) {
21406
+ PM_PARSER_ERR_FORMAT(parser, node->location.start, node->location.end, PM_ERR_PARAMETER_NUMBERED_RESERVED, node->location.start);
21407
+ parse_target_implicit_parameter(parser, node);
21408
+ }
21409
+
21177
21410
  pm_local_variable_read_node_t *cast = (pm_local_variable_read_node_t *) node;
21178
21411
  parser_lex(parser);
21179
21412
 
@@ -21301,7 +21534,23 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t
21301
21534
  pm_node_destroy(parser, node);
21302
21535
  return result;
21303
21536
  }
21537
+ case PM_IT_LOCAL_VARIABLE_READ_NODE: {
21538
+ pm_constant_id_t name = pm_parser_local_add_constant(parser, "it", 2);
21539
+ parser_lex(parser);
21540
+
21541
+ pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR, (uint16_t) (depth + 1));
21542
+ pm_node_t *result = (pm_node_t *) pm_local_variable_operator_write_node_create(parser, node, &token, value, name, 0);
21543
+
21544
+ parse_target_implicit_parameter(parser, node);
21545
+ pm_node_destroy(parser, node);
21546
+ return result;
21547
+ }
21304
21548
  case PM_LOCAL_VARIABLE_READ_NODE: {
21549
+ if (pm_token_is_numbered_parameter(node->location.start, node->location.end)) {
21550
+ PM_PARSER_ERR_FORMAT(parser, node->location.start, node->location.end, PM_ERR_PARAMETER_NUMBERED_RESERVED, node->location.start);
21551
+ parse_target_implicit_parameter(parser, node);
21552
+ }
21553
+
21305
21554
  pm_local_variable_read_node_t *cast = (pm_local_variable_read_node_t *) node;
21306
21555
  parser_lex(parser);
21307
21556
 
@@ -22035,6 +22284,10 @@ parse_expression(pm_parser_t *parser, pm_binding_power_t binding_power, bool acc
22035
22284
  static pm_statements_node_t *
22036
22285
  wrap_statements(pm_parser_t *parser, pm_statements_node_t *statements) {
22037
22286
  if (PM_PARSER_COMMAND_LINE_OPTION_P(parser)) {
22287
+ if (statements == NULL) {
22288
+ statements = pm_statements_node_create(parser);
22289
+ }
22290
+
22038
22291
  pm_arguments_node_t *arguments = pm_arguments_node_create(parser);
22039
22292
  pm_arguments_node_arguments_append(
22040
22293
  arguments,
@@ -22050,6 +22303,10 @@ wrap_statements(pm_parser_t *parser, pm_statements_node_t *statements) {
22050
22303
 
22051
22304
  if (PM_PARSER_COMMAND_LINE_OPTION_N(parser)) {
22052
22305
  if (PM_PARSER_COMMAND_LINE_OPTION_A(parser)) {
22306
+ if (statements == NULL) {
22307
+ statements = pm_statements_node_create(parser);
22308
+ }
22309
+
22053
22310
  pm_arguments_node_t *arguments = pm_arguments_node_create(parser);
22054
22311
  pm_arguments_node_arguments_append(
22055
22312
  arguments,
@@ -22118,9 +22375,7 @@ parse_program(pm_parser_t *parser) {
22118
22375
  parser_lex(parser);
22119
22376
  pm_statements_node_t *statements = parse_statements(parser, PM_CONTEXT_MAIN, 0);
22120
22377
 
22121
- if (statements == NULL) {
22122
- statements = pm_statements_node_create(parser);
22123
- } else if (!parser->parsing_eval) {
22378
+ if (statements != NULL && !parser->parsing_eval) {
22124
22379
  // If we have statements, then the top-level statement should be
22125
22380
  // explicitly checked as well. We have to do this here because
22126
22381
  // everywhere else we check all but the last statement.
@@ -22132,13 +22387,6 @@ parse_program(pm_parser_t *parser) {
22132
22387
  pm_locals_order(parser, &parser->current_scope->locals, &locals, true);
22133
22388
  pm_parser_scope_pop(parser);
22134
22389
 
22135
- // If this is an empty file, then we're still going to parse all of the
22136
- // statements in order to gather up all of the comments and such. Here we'll
22137
- // correct the location information.
22138
- if (pm_statements_node_body_length(statements) == 0) {
22139
- pm_statements_node_location_set(statements, parser->start, parser->start);
22140
- }
22141
-
22142
22390
  // At the top level, see if we need to wrap the statements in a program
22143
22391
  // node with a while loop based on the options.
22144
22392
  if (parser->command_line & (PM_OPTIONS_COMMAND_LINE_P | PM_OPTIONS_COMMAND_LINE_N)) {
@@ -22148,6 +22396,14 @@ parse_program(pm_parser_t *parser) {
22148
22396
  pm_node_list_free(&current_block_exits);
22149
22397
  }
22150
22398
 
22399
+ // If this is an empty file, then we're still going to parse all of the
22400
+ // statements in order to gather up all of the comments and such. Here we'll
22401
+ // correct the location information.
22402
+ if (statements == NULL) {
22403
+ statements = pm_statements_node_create(parser);
22404
+ pm_statements_node_location_set(statements, parser->start, parser->start);
22405
+ }
22406
+
22151
22407
  return (pm_node_t *) pm_program_node_create(parser, &locals, statements);
22152
22408
  }
22153
22409
 
@@ -22341,7 +22597,7 @@ pm_parser_init(pm_parser_t *parser, const uint8_t *source, size_t size, const pm
22341
22597
 
22342
22598
  // Scopes given from the outside are not allowed to have numbered
22343
22599
  // parameters.
22344
- parser->current_scope->parameters |= PM_SCOPE_PARAMETERS_IMPLICIT_DISALLOWED;
22600
+ parser->current_scope->parameters = ((pm_scope_parameters_t) scope->forwarding) | PM_SCOPE_PARAMETERS_IMPLICIT_DISALLOWED;
22345
22601
 
22346
22602
  for (size_t local_index = 0; local_index < scope->locals_count; local_index++) {
22347
22603
  const pm_string_t *local = pm_options_scope_local_get(scope, local_index);
@@ -22551,11 +22807,11 @@ pm_parse(pm_parser_t *parser) {
22551
22807
  * otherwise return true.
22552
22808
  */
22553
22809
  static bool
22554
- pm_parse_stream_read(pm_buffer_t *buffer, void *stream, pm_parse_stream_fgets_t *fgets) {
22810
+ pm_parse_stream_read(pm_buffer_t *buffer, void *stream, pm_parse_stream_fgets_t *stream_fgets) {
22555
22811
  #define LINE_SIZE 4096
22556
22812
  char line[LINE_SIZE];
22557
22813
 
22558
- while (memset(line, '\n', LINE_SIZE), fgets(line, LINE_SIZE, stream) != NULL) {
22814
+ while (memset(line, '\n', LINE_SIZE), stream_fgets(line, LINE_SIZE, stream) != NULL) {
22559
22815
  size_t length = LINE_SIZE;
22560
22816
  while (length > 0 && line[length - 1] == '\n') length--;
22561
22817
 
@@ -22622,16 +22878,16 @@ pm_parse_stream_unterminated_heredoc_p(pm_parser_t *parser) {
22622
22878
  * can stream stdin in to Ruby so we need to support a streaming API.
22623
22879
  */
22624
22880
  PRISM_EXPORTED_FUNCTION pm_node_t *
22625
- pm_parse_stream(pm_parser_t *parser, pm_buffer_t *buffer, void *stream, pm_parse_stream_fgets_t *fgets, const pm_options_t *options) {
22881
+ pm_parse_stream(pm_parser_t *parser, pm_buffer_t *buffer, void *stream, pm_parse_stream_fgets_t *stream_fgets, const pm_options_t *options) {
22626
22882
  pm_buffer_init(buffer);
22627
22883
 
22628
- bool eof = pm_parse_stream_read(buffer, stream, fgets);
22884
+ bool eof = pm_parse_stream_read(buffer, stream, stream_fgets);
22629
22885
  pm_parser_init(parser, (const uint8_t *) pm_buffer_value(buffer), pm_buffer_length(buffer), options);
22630
22886
  pm_node_t *node = pm_parse(parser);
22631
22887
 
22632
22888
  while (!eof && parser->error_list.size > 0 && (parser->lex_modes.index > 0 || pm_parse_stream_unterminated_heredoc_p(parser))) {
22633
22889
  pm_node_destroy(parser, node);
22634
- eof = pm_parse_stream_read(buffer, stream, fgets);
22890
+ eof = pm_parse_stream_read(buffer, stream, stream_fgets);
22635
22891
 
22636
22892
  pm_parser_free(parser);
22637
22893
  pm_parser_init(parser, (const uint8_t *) pm_buffer_value(buffer), pm_buffer_length(buffer), options);
@@ -22723,13 +22979,13 @@ pm_serialize_parse(pm_buffer_t *buffer, const uint8_t *source, size_t size, cons
22723
22979
  * given stream into to the given buffer.
22724
22980
  */
22725
22981
  PRISM_EXPORTED_FUNCTION void
22726
- pm_serialize_parse_stream(pm_buffer_t *buffer, void *stream, pm_parse_stream_fgets_t *fgets, const char *data) {
22982
+ pm_serialize_parse_stream(pm_buffer_t *buffer, void *stream, pm_parse_stream_fgets_t *stream_fgets, const char *data) {
22727
22983
  pm_parser_t parser;
22728
22984
  pm_options_t options = { 0 };
22729
22985
  pm_options_read(&options, data);
22730
22986
 
22731
22987
  pm_buffer_t parser_buffer;
22732
- pm_node_t *node = pm_parse_stream(&parser, &parser_buffer, stream, fgets, &options);
22988
+ pm_node_t *node = pm_parse_stream(&parser, &parser_buffer, stream, stream_fgets, &options);
22733
22989
  pm_serialize_header(buffer);
22734
22990
  pm_serialize_content(&parser, node, buffer);
22735
22991
  pm_buffer_append_byte(buffer, '\0');