prism 1.3.0 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +46 -1
  3. data/Makefile +2 -1
  4. data/README.md +1 -0
  5. data/config.yml +273 -37
  6. data/docs/parser_translation.md +8 -23
  7. data/docs/releasing.md +1 -1
  8. data/docs/ripper_translation.md +1 -1
  9. data/docs/ruby_api.md +1 -1
  10. data/ext/prism/api_node.c +1816 -1303
  11. data/ext/prism/extension.c +244 -110
  12. data/ext/prism/extension.h +4 -4
  13. data/include/prism/ast.h +291 -49
  14. data/include/prism/defines.h +4 -1
  15. data/include/prism/diagnostic.h +4 -0
  16. data/include/prism/options.h +89 -3
  17. data/include/prism/regexp.h +2 -2
  18. data/include/prism/util/pm_buffer.h +18 -0
  19. data/include/prism/util/pm_integer.h +4 -0
  20. data/include/prism/util/pm_list.h +6 -0
  21. data/include/prism/util/pm_string.h +12 -2
  22. data/include/prism/version.h +2 -2
  23. data/include/prism.h +41 -16
  24. data/lib/prism/compiler.rb +456 -151
  25. data/lib/prism/desugar_compiler.rb +1 -0
  26. data/lib/prism/dispatcher.rb +16 -0
  27. data/lib/prism/dot_visitor.rb +21 -1
  28. data/lib/prism/dsl.rb +13 -2
  29. data/lib/prism/ffi.rb +62 -34
  30. data/lib/prism/inspect_visitor.rb +5 -1
  31. data/lib/prism/lex_compat.rb +1 -0
  32. data/lib/prism/mutation_compiler.rb +3 -0
  33. data/lib/prism/node.rb +554 -345
  34. data/lib/prism/node_ext.rb +4 -1
  35. data/lib/prism/pack.rb +2 -0
  36. data/lib/prism/parse_result/comments.rb +1 -0
  37. data/lib/prism/parse_result/errors.rb +1 -0
  38. data/lib/prism/parse_result/newlines.rb +2 -1
  39. data/lib/prism/parse_result.rb +53 -0
  40. data/lib/prism/pattern.rb +1 -0
  41. data/lib/prism/polyfill/append_as_bytes.rb +15 -0
  42. data/lib/prism/polyfill/scan_byte.rb +14 -0
  43. data/lib/prism/polyfill/warn.rb +42 -0
  44. data/lib/prism/reflection.rb +5 -2
  45. data/lib/prism/relocation.rb +1 -0
  46. data/lib/prism/serialize.rb +1275 -783
  47. data/lib/prism/string_query.rb +1 -0
  48. data/lib/prism/translation/parser/builder.rb +62 -0
  49. data/lib/prism/translation/parser/compiler.rb +230 -152
  50. data/lib/prism/translation/parser/lexer.rb +446 -64
  51. data/lib/prism/translation/parser.rb +64 -4
  52. data/lib/prism/translation/parser33.rb +1 -0
  53. data/lib/prism/translation/parser34.rb +1 -0
  54. data/lib/prism/translation/parser35.rb +13 -0
  55. data/lib/prism/translation/parser_current.rb +24 -0
  56. data/lib/prism/translation/ripper/sexp.rb +1 -0
  57. data/lib/prism/translation/ripper.rb +30 -4
  58. data/lib/prism/translation/ruby_parser.rb +291 -7
  59. data/lib/prism/translation.rb +3 -0
  60. data/lib/prism/visitor.rb +457 -152
  61. data/lib/prism.rb +5 -3
  62. data/prism.gemspec +9 -1
  63. data/rbi/prism/dsl.rbi +9 -6
  64. data/rbi/prism/node.rbi +43 -16
  65. data/rbi/prism/parse_result.rbi +17 -0
  66. data/rbi/prism/translation/parser35.rbi +6 -0
  67. data/rbi/prism.rbi +39 -36
  68. data/sig/prism/dispatcher.rbs +3 -0
  69. data/sig/prism/dsl.rbs +7 -5
  70. data/sig/prism/node.rbs +461 -37
  71. data/sig/prism/node_ext.rbs +84 -17
  72. data/sig/prism/parse_result/comments.rbs +38 -0
  73. data/sig/prism/parse_result.rbs +14 -0
  74. data/sig/prism/reflection.rbs +1 -1
  75. data/sig/prism/serialize.rbs +4 -2
  76. data/sig/prism.rbs +22 -1
  77. data/src/diagnostic.c +9 -3
  78. data/src/node.c +23 -0
  79. data/src/options.c +33 -2
  80. data/src/prettyprint.c +32 -0
  81. data/src/prism.c +620 -242
  82. data/src/serialize.c +8 -0
  83. data/src/token_type.c +36 -34
  84. data/src/util/pm_buffer.c +40 -0
  85. data/src/util/pm_constant_pool.c +6 -2
  86. data/src/util/pm_strncasecmp.c +13 -1
  87. metadata +11 -7
data/src/prism.c CHANGED
@@ -1409,7 +1409,7 @@ pm_conditional_predicate_warn_write_literal_p(const pm_node_t *node) {
1409
1409
  static inline void
1410
1410
  pm_conditional_predicate_warn_write_literal(pm_parser_t *parser, const pm_node_t *node) {
1411
1411
  if (pm_conditional_predicate_warn_write_literal_p(node)) {
1412
- pm_parser_warn_node(parser, node, parser->version == PM_OPTIONS_VERSION_CRUBY_3_3 ? PM_WARN_EQUAL_IN_CONDITIONAL_3_3 : PM_WARN_EQUAL_IN_CONDITIONAL);
1412
+ pm_parser_warn_node(parser, node, parser->version <= PM_OPTIONS_VERSION_CRUBY_3_3 ? PM_WARN_EQUAL_IN_CONDITIONAL_3_3 : PM_WARN_EQUAL_IN_CONDITIONAL);
1413
1413
  }
1414
1414
  }
1415
1415
 
@@ -1649,22 +1649,25 @@ pm_arguments_validate_block(pm_parser_t *parser, pm_arguments_t *arguments, pm_b
1649
1649
  * the function pointer or can just directly use the UTF-8 functions.
1650
1650
  */
1651
1651
  static inline size_t
1652
- char_is_identifier_start(const pm_parser_t *parser, const uint8_t *b) {
1652
+ char_is_identifier_start(const pm_parser_t *parser, const uint8_t *b, ptrdiff_t n) {
1653
+ if (n <= 0) return 0;
1654
+
1653
1655
  if (parser->encoding_changed) {
1654
1656
  size_t width;
1655
- if ((width = parser->encoding->alpha_char(b, parser->end - b)) != 0) {
1657
+
1658
+ if ((width = parser->encoding->alpha_char(b, n)) != 0) {
1656
1659
  return width;
1657
1660
  } else if (*b == '_') {
1658
1661
  return 1;
1659
1662
  } else if (*b >= 0x80) {
1660
- return parser->encoding->char_width(b, parser->end - b);
1663
+ return parser->encoding->char_width(b, n);
1661
1664
  } else {
1662
1665
  return 0;
1663
1666
  }
1664
1667
  } else if (*b < 0x80) {
1665
1668
  return (pm_encoding_unicode_table[*b] & PRISM_ENCODING_ALPHABETIC_BIT ? 1 : 0) || (*b == '_');
1666
1669
  } else {
1667
- return pm_encoding_utf_8_char_width(b, parser->end - b);
1670
+ return pm_encoding_utf_8_char_width(b, n);
1668
1671
  }
1669
1672
  }
1670
1673
 
@@ -1673,11 +1676,13 @@ char_is_identifier_start(const pm_parser_t *parser, const uint8_t *b) {
1673
1676
  * has not been changed.
1674
1677
  */
1675
1678
  static inline size_t
1676
- char_is_identifier_utf8(const uint8_t *b, const uint8_t *end) {
1677
- if (*b < 0x80) {
1679
+ char_is_identifier_utf8(const uint8_t *b, ptrdiff_t n) {
1680
+ if (n <= 0) {
1681
+ return 0;
1682
+ } else if (*b < 0x80) {
1678
1683
  return (*b == '_') || (pm_encoding_unicode_table[*b] & PRISM_ENCODING_ALPHANUMERIC_BIT ? 1 : 0);
1679
1684
  } else {
1680
- return pm_encoding_utf_8_char_width(b, end - b);
1685
+ return pm_encoding_utf_8_char_width(b, n);
1681
1686
  }
1682
1687
  }
1683
1688
 
@@ -1687,20 +1692,24 @@ char_is_identifier_utf8(const uint8_t *b, const uint8_t *end) {
1687
1692
  * it's important that it be as fast as possible.
1688
1693
  */
1689
1694
  static inline size_t
1690
- char_is_identifier(const pm_parser_t *parser, const uint8_t *b) {
1691
- if (parser->encoding_changed) {
1695
+ char_is_identifier(const pm_parser_t *parser, const uint8_t *b, ptrdiff_t n) {
1696
+ if (n <= 0) {
1697
+ return 0;
1698
+ } else if (parser->encoding_changed) {
1692
1699
  size_t width;
1693
- if ((width = parser->encoding->alnum_char(b, parser->end - b)) != 0) {
1700
+
1701
+ if ((width = parser->encoding->alnum_char(b, n)) != 0) {
1694
1702
  return width;
1695
1703
  } else if (*b == '_') {
1696
1704
  return 1;
1697
1705
  } else if (*b >= 0x80) {
1698
- return parser->encoding->char_width(b, parser->end - b);
1706
+ return parser->encoding->char_width(b, n);
1699
1707
  } else {
1700
1708
  return 0;
1701
1709
  }
1710
+ } else {
1711
+ return char_is_identifier_utf8(b, n);
1702
1712
  }
1703
- return char_is_identifier_utf8(b, parser->end);
1704
1713
  }
1705
1714
 
1706
1715
  // Here we're defining a perfect hash for the characters that are allowed in
@@ -1731,9 +1740,10 @@ char_is_global_name_punctuation(const uint8_t b) {
1731
1740
  static inline bool
1732
1741
  token_is_setter_name(pm_token_t *token) {
1733
1742
  return (
1734
- (token->type == PM_TOKEN_IDENTIFIER) &&
1743
+ (token->type == PM_TOKEN_BRACKET_LEFT_RIGHT_EQUAL) ||
1744
+ ((token->type == PM_TOKEN_IDENTIFIER) &&
1735
1745
  (token->end - token->start >= 2) &&
1736
- (token->end[-1] == '=')
1746
+ (token->end[-1] == '='))
1737
1747
  );
1738
1748
  }
1739
1749
 
@@ -2895,7 +2905,7 @@ pm_call_node_writable_p(const pm_parser_t *parser, const pm_call_node_t *node) {
2895
2905
  (node->message_loc.start != NULL) &&
2896
2906
  (node->message_loc.end[-1] != '!') &&
2897
2907
  (node->message_loc.end[-1] != '?') &&
2898
- char_is_identifier_start(parser, node->message_loc.start) &&
2908
+ char_is_identifier_start(parser, node->message_loc.start, parser->end - node->message_loc.start) &&
2899
2909
  (node->opening_loc.start == NULL) &&
2900
2910
  (node->arguments == NULL) &&
2901
2911
  (node->block == NULL)
@@ -2966,7 +2976,7 @@ pm_call_and_write_node_create(pm_parser_t *parser, pm_call_node_t *target, const
2966
2976
  */
2967
2977
  static void
2968
2978
  pm_index_arguments_check(pm_parser_t *parser, const pm_arguments_node_t *arguments, const pm_node_t *block) {
2969
- if (parser->version != PM_OPTIONS_VERSION_CRUBY_3_3) {
2979
+ if (parser->version >= PM_OPTIONS_VERSION_CRUBY_3_4) {
2970
2980
  if (arguments != NULL && PM_NODE_FLAG_P(arguments, PM_ARGUMENTS_NODE_FLAGS_CONTAINS_KEYWORDS)) {
2971
2981
  pm_node_t *node;
2972
2982
  PM_NODE_LIST_FOREACH(&arguments->arguments, index, node) {
@@ -3864,7 +3874,7 @@ pm_def_node_create(
3864
3874
  end = end_keyword->end;
3865
3875
  }
3866
3876
 
3867
- if ((receiver != NULL) && PM_NODE_TYPE_P(receiver, PM_PARENTHESES_NODE)) {
3877
+ if (receiver != NULL) {
3868
3878
  pm_def_node_receiver_check(parser, receiver);
3869
3879
  }
3870
3880
 
@@ -4243,7 +4253,7 @@ pm_float_node_rational_create(pm_parser_t *parser, const pm_token_t *token) {
4243
4253
  const uint8_t *point = memchr(start, '.', length);
4244
4254
  assert(point && "should have a decimal point");
4245
4255
 
4246
- uint8_t *digits = malloc(length);
4256
+ uint8_t *digits = xmalloc(length);
4247
4257
  if (digits == NULL) {
4248
4258
  fputs("[pm_float_node_rational_create] Failed to allocate memory", stderr);
4249
4259
  abort();
@@ -4256,7 +4266,7 @@ pm_float_node_rational_create(pm_parser_t *parser, const pm_token_t *token) {
4256
4266
  digits[0] = '1';
4257
4267
  if (end - point > 1) memset(digits + 1, '0', (size_t) (end - point - 1));
4258
4268
  pm_integer_parse(&node->denominator, PM_INTEGER_BASE_DEFAULT, digits, digits + (end - point));
4259
- free(digits);
4269
+ xfree(digits);
4260
4270
 
4261
4271
  pm_integers_reduce(&node->numerator, &node->denominator);
4262
4272
  return node;
@@ -5269,6 +5279,10 @@ pm_interpolated_string_node_append(pm_interpolated_string_node_t *node, pm_node_
5269
5279
 
5270
5280
  switch (PM_NODE_TYPE(part)) {
5271
5281
  case PM_STRING_NODE:
5282
+ // If inner string is not frozen, clear flags for this string
5283
+ if (!PM_NODE_FLAG_P(part, PM_STRING_FLAGS_FROZEN)) {
5284
+ CLEAR_FLAGS(node);
5285
+ }
5272
5286
  part->flags = (pm_node_flags_t) ((part->flags | PM_NODE_FLAG_STATIC_LITERAL | PM_STRING_FLAGS_FROZEN) & ~PM_STRING_FLAGS_MUTABLE);
5273
5287
  break;
5274
5288
  case PM_INTERPOLATED_STRING_NODE:
@@ -5318,6 +5332,12 @@ pm_interpolated_string_node_append(pm_interpolated_string_node_t *node, pm_node_
5318
5332
  // should clear the mutability flags.
5319
5333
  CLEAR_FLAGS(node);
5320
5334
  break;
5335
+ case PM_X_STRING_NODE:
5336
+ case PM_INTERPOLATED_X_STRING_NODE:
5337
+ // If this is an x string, then this is a syntax error. But we want
5338
+ // to handle it here so that we don't fail the assertion.
5339
+ CLEAR_FLAGS(node);
5340
+ break;
5321
5341
  default:
5322
5342
  assert(false && "unexpected node type");
5323
5343
  break;
@@ -5652,7 +5672,7 @@ pm_lambda_node_create(
5652
5672
  */
5653
5673
  static pm_local_variable_and_write_node_t *
5654
5674
  pm_local_variable_and_write_node_create(pm_parser_t *parser, pm_node_t *target, const pm_token_t *operator, pm_node_t *value, pm_constant_id_t name, uint32_t depth) {
5655
- assert(PM_NODE_TYPE_P(target, PM_LOCAL_VARIABLE_READ_NODE) || PM_NODE_TYPE_P(target, PM_CALL_NODE));
5675
+ assert(PM_NODE_TYPE_P(target, PM_LOCAL_VARIABLE_READ_NODE) || PM_NODE_TYPE_P(target, PM_IT_LOCAL_VARIABLE_READ_NODE) || PM_NODE_TYPE_P(target, PM_CALL_NODE));
5656
5676
  assert(operator->type == PM_TOKEN_AMPERSAND_AMPERSAND_EQUAL);
5657
5677
  pm_local_variable_and_write_node_t *node = PM_NODE_ALLOC(parser, pm_local_variable_and_write_node_t);
5658
5678
 
@@ -5707,7 +5727,7 @@ pm_local_variable_operator_write_node_create(pm_parser_t *parser, pm_node_t *tar
5707
5727
  */
5708
5728
  static pm_local_variable_or_write_node_t *
5709
5729
  pm_local_variable_or_write_node_create(pm_parser_t *parser, pm_node_t *target, const pm_token_t *operator, pm_node_t *value, pm_constant_id_t name, uint32_t depth) {
5710
- assert(PM_NODE_TYPE_P(target, PM_LOCAL_VARIABLE_READ_NODE) || PM_NODE_TYPE_P(target, PM_CALL_NODE));
5730
+ assert(PM_NODE_TYPE_P(target, PM_LOCAL_VARIABLE_READ_NODE) || PM_NODE_TYPE_P(target, PM_IT_LOCAL_VARIABLE_READ_NODE) || PM_NODE_TYPE_P(target, PM_CALL_NODE));
5711
5731
  assert(operator->type == PM_TOKEN_PIPE_PIPE_EQUAL);
5712
5732
  pm_local_variable_or_write_node_t *node = PM_NODE_ALLOC(parser, pm_local_variable_or_write_node_t);
5713
5733
 
@@ -6159,7 +6179,10 @@ pm_numbered_reference_read_node_number(pm_parser_t *parser, const pm_token_t *to
6159
6179
  const uint8_t *end = token->end;
6160
6180
 
6161
6181
  ptrdiff_t diff = end - start;
6162
- assert(diff > 0 && ((unsigned long) diff < SIZE_MAX));
6182
+ assert(diff > 0);
6183
+ #if PTRDIFF_MAX > SIZE_MAX
6184
+ assert(diff < (ptrdiff_t) SIZE_MAX);
6185
+ #endif
6163
6186
  size_t length = (size_t) diff;
6164
6187
 
6165
6188
  char *digits = xcalloc(length + 1, sizeof(char));
@@ -6393,12 +6416,13 @@ pm_program_node_create(pm_parser_t *parser, pm_constant_id_list_t *locals, pm_st
6393
6416
  * Allocate and initialize new ParenthesesNode node.
6394
6417
  */
6395
6418
  static pm_parentheses_node_t *
6396
- pm_parentheses_node_create(pm_parser_t *parser, const pm_token_t *opening, pm_node_t *body, const pm_token_t *closing) {
6419
+ pm_parentheses_node_create(pm_parser_t *parser, const pm_token_t *opening, pm_node_t *body, const pm_token_t *closing, pm_node_flags_t flags) {
6397
6420
  pm_parentheses_node_t *node = PM_NODE_ALLOC(parser, pm_parentheses_node_t);
6398
6421
 
6399
6422
  *node = (pm_parentheses_node_t) {
6400
6423
  {
6401
6424
  .type = PM_PARENTHESES_NODE,
6425
+ .flags = flags,
6402
6426
  .node_id = PM_NODE_IDENTIFY(parser),
6403
6427
  .location = {
6404
6428
  .start = opening->start,
@@ -6665,6 +6689,7 @@ pm_rescue_node_create(pm_parser_t *parser, const pm_token_t *keyword) {
6665
6689
  },
6666
6690
  .keyword_loc = PM_LOCATION_TOKEN_VALUE(keyword),
6667
6691
  .operator_loc = PM_OPTIONAL_LOCATION_NOT_PROVIDED_VALUE,
6692
+ .then_keyword_loc = PM_OPTIONAL_LOCATION_NOT_PROVIDED_VALUE,
6668
6693
  .reference = NULL,
6669
6694
  .statements = NULL,
6670
6695
  .subsequent = NULL,
@@ -8561,85 +8586,66 @@ parser_lex_magic_comment(pm_parser_t *parser, bool semantic_token_seen) {
8561
8586
  /* Context manipulations */
8562
8587
  /******************************************************************************/
8563
8588
 
8564
- static bool
8565
- context_terminator(pm_context_t context, pm_token_t *token) {
8566
- switch (context) {
8567
- case PM_CONTEXT_MAIN:
8568
- case PM_CONTEXT_DEF_PARAMS:
8569
- case PM_CONTEXT_DEFINED:
8570
- case PM_CONTEXT_MULTI_TARGET:
8571
- case PM_CONTEXT_TERNARY:
8572
- case PM_CONTEXT_RESCUE_MODIFIER:
8573
- return token->type == PM_TOKEN_EOF;
8574
- case PM_CONTEXT_DEFAULT_PARAMS:
8575
- return token->type == PM_TOKEN_COMMA || token->type == PM_TOKEN_PARENTHESIS_RIGHT;
8576
- case PM_CONTEXT_PREEXE:
8577
- case PM_CONTEXT_POSTEXE:
8578
- return token->type == PM_TOKEN_BRACE_RIGHT;
8579
- case PM_CONTEXT_MODULE:
8580
- case PM_CONTEXT_CLASS:
8581
- case PM_CONTEXT_SCLASS:
8582
- case PM_CONTEXT_LAMBDA_DO_END:
8583
- case PM_CONTEXT_DEF:
8584
- case PM_CONTEXT_BLOCK_KEYWORDS:
8585
- return token->type == PM_TOKEN_KEYWORD_END || token->type == PM_TOKEN_KEYWORD_RESCUE || token->type == PM_TOKEN_KEYWORD_ENSURE;
8586
- case PM_CONTEXT_WHILE:
8587
- case PM_CONTEXT_UNTIL:
8588
- case PM_CONTEXT_ELSE:
8589
- case PM_CONTEXT_FOR:
8590
- case PM_CONTEXT_BEGIN_ENSURE:
8591
- case PM_CONTEXT_BLOCK_ENSURE:
8592
- case PM_CONTEXT_CLASS_ENSURE:
8593
- case PM_CONTEXT_DEF_ENSURE:
8594
- case PM_CONTEXT_LAMBDA_ENSURE:
8595
- case PM_CONTEXT_MODULE_ENSURE:
8596
- case PM_CONTEXT_SCLASS_ENSURE:
8597
- return token->type == PM_TOKEN_KEYWORD_END;
8598
- case PM_CONTEXT_LOOP_PREDICATE:
8599
- return token->type == PM_TOKEN_KEYWORD_DO || token->type == PM_TOKEN_KEYWORD_THEN;
8600
- case PM_CONTEXT_FOR_INDEX:
8601
- return token->type == PM_TOKEN_KEYWORD_IN;
8602
- case PM_CONTEXT_CASE_WHEN:
8603
- return token->type == PM_TOKEN_KEYWORD_WHEN || token->type == PM_TOKEN_KEYWORD_END || token->type == PM_TOKEN_KEYWORD_ELSE;
8604
- case PM_CONTEXT_CASE_IN:
8605
- return token->type == PM_TOKEN_KEYWORD_IN || token->type == PM_TOKEN_KEYWORD_END || token->type == PM_TOKEN_KEYWORD_ELSE;
8606
- case PM_CONTEXT_IF:
8607
- case PM_CONTEXT_ELSIF:
8608
- return token->type == PM_TOKEN_KEYWORD_ELSE || token->type == PM_TOKEN_KEYWORD_ELSIF || token->type == PM_TOKEN_KEYWORD_END;
8609
- case PM_CONTEXT_UNLESS:
8610
- return token->type == PM_TOKEN_KEYWORD_ELSE || token->type == PM_TOKEN_KEYWORD_END;
8611
- case PM_CONTEXT_EMBEXPR:
8612
- return token->type == PM_TOKEN_EMBEXPR_END;
8613
- case PM_CONTEXT_BLOCK_BRACES:
8614
- return token->type == PM_TOKEN_BRACE_RIGHT;
8615
- case PM_CONTEXT_PARENS:
8616
- return token->type == PM_TOKEN_PARENTHESIS_RIGHT;
8617
- case PM_CONTEXT_BEGIN:
8618
- case PM_CONTEXT_BEGIN_RESCUE:
8619
- case PM_CONTEXT_BLOCK_RESCUE:
8620
- case PM_CONTEXT_CLASS_RESCUE:
8621
- case PM_CONTEXT_DEF_RESCUE:
8622
- case PM_CONTEXT_LAMBDA_RESCUE:
8623
- case PM_CONTEXT_MODULE_RESCUE:
8624
- case PM_CONTEXT_SCLASS_RESCUE:
8625
- return token->type == PM_TOKEN_KEYWORD_ENSURE || token->type == PM_TOKEN_KEYWORD_RESCUE || token->type == PM_TOKEN_KEYWORD_ELSE || token->type == PM_TOKEN_KEYWORD_END;
8626
- case PM_CONTEXT_BEGIN_ELSE:
8627
- case PM_CONTEXT_BLOCK_ELSE:
8628
- case PM_CONTEXT_CLASS_ELSE:
8629
- case PM_CONTEXT_DEF_ELSE:
8630
- case PM_CONTEXT_LAMBDA_ELSE:
8631
- case PM_CONTEXT_MODULE_ELSE:
8632
- case PM_CONTEXT_SCLASS_ELSE:
8633
- return token->type == PM_TOKEN_KEYWORD_ENSURE || token->type == PM_TOKEN_KEYWORD_END;
8634
- case PM_CONTEXT_LAMBDA_BRACES:
8635
- return token->type == PM_TOKEN_BRACE_RIGHT;
8636
- case PM_CONTEXT_PREDICATE:
8637
- return token->type == PM_TOKEN_KEYWORD_THEN || token->type == PM_TOKEN_NEWLINE || token->type == PM_TOKEN_SEMICOLON;
8638
- case PM_CONTEXT_NONE:
8639
- return false;
8640
- }
8589
+ static const uint32_t context_terminators[] = {
8590
+ [PM_CONTEXT_NONE] = 0,
8591
+ [PM_CONTEXT_BEGIN] = (1 << PM_TOKEN_KEYWORD_ENSURE) | (1 << PM_TOKEN_KEYWORD_RESCUE) | (1 << PM_TOKEN_KEYWORD_ELSE) | (1 << PM_TOKEN_KEYWORD_END),
8592
+ [PM_CONTEXT_BEGIN_ENSURE] = (1 << PM_TOKEN_KEYWORD_END),
8593
+ [PM_CONTEXT_BEGIN_ELSE] = (1 << PM_TOKEN_KEYWORD_ENSURE) | (1 << PM_TOKEN_KEYWORD_END),
8594
+ [PM_CONTEXT_BEGIN_RESCUE] = (1 << PM_TOKEN_KEYWORD_ENSURE) | (1 << PM_TOKEN_KEYWORD_RESCUE) | (1 << PM_TOKEN_KEYWORD_ELSE) | (1 << PM_TOKEN_KEYWORD_END),
8595
+ [PM_CONTEXT_BLOCK_BRACES] = (1 << PM_TOKEN_BRACE_RIGHT),
8596
+ [PM_CONTEXT_BLOCK_KEYWORDS] = (1 << PM_TOKEN_KEYWORD_END) | (1 << PM_TOKEN_KEYWORD_RESCUE) | (1 << PM_TOKEN_KEYWORD_ENSURE),
8597
+ [PM_CONTEXT_BLOCK_ENSURE] = (1 << PM_TOKEN_KEYWORD_END),
8598
+ [PM_CONTEXT_BLOCK_ELSE] = (1 << PM_TOKEN_KEYWORD_ENSURE) | (1 << PM_TOKEN_KEYWORD_END),
8599
+ [PM_CONTEXT_BLOCK_RESCUE] = (1 << PM_TOKEN_KEYWORD_ENSURE) | (1 << PM_TOKEN_KEYWORD_RESCUE) | (1 << PM_TOKEN_KEYWORD_ELSE) | (1 << PM_TOKEN_KEYWORD_END),
8600
+ [PM_CONTEXT_CASE_WHEN] = (1 << PM_TOKEN_KEYWORD_WHEN) | (1 << PM_TOKEN_KEYWORD_END) | (1 << PM_TOKEN_KEYWORD_ELSE),
8601
+ [PM_CONTEXT_CASE_IN] = (1 << PM_TOKEN_KEYWORD_IN) | (1 << PM_TOKEN_KEYWORD_END) | (1 << PM_TOKEN_KEYWORD_ELSE),
8602
+ [PM_CONTEXT_CLASS] = (1 << PM_TOKEN_KEYWORD_END) | (1 << PM_TOKEN_KEYWORD_RESCUE) | (1 << PM_TOKEN_KEYWORD_ENSURE),
8603
+ [PM_CONTEXT_CLASS_ENSURE] = (1 << PM_TOKEN_KEYWORD_END),
8604
+ [PM_CONTEXT_CLASS_ELSE] = (1 << PM_TOKEN_KEYWORD_ENSURE) | (1 << PM_TOKEN_KEYWORD_END),
8605
+ [PM_CONTEXT_CLASS_RESCUE] = (1 << PM_TOKEN_KEYWORD_ENSURE) | (1 << PM_TOKEN_KEYWORD_RESCUE) | (1 << PM_TOKEN_KEYWORD_ELSE) | (1 << PM_TOKEN_KEYWORD_END),
8606
+ [PM_CONTEXT_DEF] = (1 << PM_TOKEN_KEYWORD_END) | (1 << PM_TOKEN_KEYWORD_RESCUE) | (1 << PM_TOKEN_KEYWORD_ENSURE),
8607
+ [PM_CONTEXT_DEF_ENSURE] = (1 << PM_TOKEN_KEYWORD_END),
8608
+ [PM_CONTEXT_DEF_ELSE] = (1 << PM_TOKEN_KEYWORD_ENSURE) | (1 << PM_TOKEN_KEYWORD_END),
8609
+ [PM_CONTEXT_DEF_RESCUE] = (1 << PM_TOKEN_KEYWORD_ENSURE) | (1 << PM_TOKEN_KEYWORD_RESCUE) | (1 << PM_TOKEN_KEYWORD_ELSE) | (1 << PM_TOKEN_KEYWORD_END),
8610
+ [PM_CONTEXT_DEF_PARAMS] = (1 << PM_TOKEN_EOF),
8611
+ [PM_CONTEXT_DEFINED] = (1 << PM_TOKEN_EOF),
8612
+ [PM_CONTEXT_DEFAULT_PARAMS] = (1 << PM_TOKEN_COMMA) | (1 << PM_TOKEN_PARENTHESIS_RIGHT),
8613
+ [PM_CONTEXT_ELSE] = (1 << PM_TOKEN_KEYWORD_END),
8614
+ [PM_CONTEXT_ELSIF] = (1 << PM_TOKEN_KEYWORD_ELSE) | (1 << PM_TOKEN_KEYWORD_ELSIF) | (1 << PM_TOKEN_KEYWORD_END),
8615
+ [PM_CONTEXT_EMBEXPR] = (1 << PM_TOKEN_EMBEXPR_END),
8616
+ [PM_CONTEXT_FOR] = (1 << PM_TOKEN_KEYWORD_END),
8617
+ [PM_CONTEXT_FOR_INDEX] = (1 << PM_TOKEN_KEYWORD_IN),
8618
+ [PM_CONTEXT_IF] = (1 << PM_TOKEN_KEYWORD_ELSE) | (1 << PM_TOKEN_KEYWORD_ELSIF) | (1 << PM_TOKEN_KEYWORD_END),
8619
+ [PM_CONTEXT_LAMBDA_BRACES] = (1 << PM_TOKEN_BRACE_RIGHT),
8620
+ [PM_CONTEXT_LAMBDA_DO_END] = (1 << PM_TOKEN_KEYWORD_END) | (1 << PM_TOKEN_KEYWORD_RESCUE) | (1 << PM_TOKEN_KEYWORD_ENSURE),
8621
+ [PM_CONTEXT_LAMBDA_ENSURE] = (1 << PM_TOKEN_KEYWORD_END),
8622
+ [PM_CONTEXT_LAMBDA_ELSE] = (1 << PM_TOKEN_KEYWORD_ENSURE) | (1 << PM_TOKEN_KEYWORD_END),
8623
+ [PM_CONTEXT_LAMBDA_RESCUE] = (1 << PM_TOKEN_KEYWORD_ENSURE) | (1 << PM_TOKEN_KEYWORD_RESCUE) | (1 << PM_TOKEN_KEYWORD_ELSE) | (1 << PM_TOKEN_KEYWORD_END),
8624
+ [PM_CONTEXT_LOOP_PREDICATE] = (1 << PM_TOKEN_KEYWORD_DO) | (1 << PM_TOKEN_KEYWORD_THEN),
8625
+ [PM_CONTEXT_MAIN] = (1 << PM_TOKEN_EOF),
8626
+ [PM_CONTEXT_MODULE] = (1 << PM_TOKEN_KEYWORD_END) | (1 << PM_TOKEN_KEYWORD_RESCUE) | (1 << PM_TOKEN_KEYWORD_ENSURE),
8627
+ [PM_CONTEXT_MODULE_ENSURE] = (1 << PM_TOKEN_KEYWORD_END),
8628
+ [PM_CONTEXT_MODULE_ELSE] = (1 << PM_TOKEN_KEYWORD_ENSURE) | (1 << PM_TOKEN_KEYWORD_END),
8629
+ [PM_CONTEXT_MODULE_RESCUE] = (1 << PM_TOKEN_KEYWORD_ENSURE) | (1 << PM_TOKEN_KEYWORD_RESCUE) | (1 << PM_TOKEN_KEYWORD_ELSE) | (1 << PM_TOKEN_KEYWORD_END),
8630
+ [PM_CONTEXT_MULTI_TARGET] = (1 << PM_TOKEN_EOF),
8631
+ [PM_CONTEXT_PARENS] = (1 << PM_TOKEN_PARENTHESIS_RIGHT),
8632
+ [PM_CONTEXT_POSTEXE] = (1 << PM_TOKEN_BRACE_RIGHT),
8633
+ [PM_CONTEXT_PREDICATE] = (1 << PM_TOKEN_KEYWORD_THEN) | (1 << PM_TOKEN_NEWLINE) | (1 << PM_TOKEN_SEMICOLON),
8634
+ [PM_CONTEXT_PREEXE] = (1 << PM_TOKEN_BRACE_RIGHT),
8635
+ [PM_CONTEXT_RESCUE_MODIFIER] = (1 << PM_TOKEN_EOF),
8636
+ [PM_CONTEXT_SCLASS] = (1 << PM_TOKEN_KEYWORD_END) | (1 << PM_TOKEN_KEYWORD_RESCUE) | (1 << PM_TOKEN_KEYWORD_ENSURE),
8637
+ [PM_CONTEXT_SCLASS_ENSURE] = (1 << PM_TOKEN_KEYWORD_END),
8638
+ [PM_CONTEXT_SCLASS_ELSE] = (1 << PM_TOKEN_KEYWORD_ENSURE) | (1 << PM_TOKEN_KEYWORD_END),
8639
+ [PM_CONTEXT_SCLASS_RESCUE] = (1 << PM_TOKEN_KEYWORD_ENSURE) | (1 << PM_TOKEN_KEYWORD_RESCUE) | (1 << PM_TOKEN_KEYWORD_ELSE) | (1 << PM_TOKEN_KEYWORD_END),
8640
+ [PM_CONTEXT_TERNARY] = (1 << PM_TOKEN_EOF),
8641
+ [PM_CONTEXT_UNLESS] = (1 << PM_TOKEN_KEYWORD_ELSE) | (1 << PM_TOKEN_KEYWORD_END),
8642
+ [PM_CONTEXT_UNTIL] = (1 << PM_TOKEN_KEYWORD_END),
8643
+ [PM_CONTEXT_WHILE] = (1 << PM_TOKEN_KEYWORD_END),
8644
+ };
8641
8645
 
8642
- return false;
8646
+ static inline bool
8647
+ context_terminator(pm_context_t context, pm_token_t *token) {
8648
+ return token->type < 32 && (context_terminators[context] & (1 << token->type));
8643
8649
  }
8644
8650
 
8645
8651
  /**
@@ -9082,13 +9088,13 @@ lex_global_variable(pm_parser_t *parser) {
9082
9088
  parser->current.end++;
9083
9089
  size_t width;
9084
9090
 
9085
- if (parser->current.end < parser->end && (width = char_is_identifier(parser, parser->current.end)) > 0) {
9091
+ if ((width = char_is_identifier(parser, parser->current.end, parser->end - parser->current.end)) > 0) {
9086
9092
  do {
9087
9093
  parser->current.end += width;
9088
- } while (parser->current.end < parser->end && (width = char_is_identifier(parser, parser->current.end)) > 0);
9094
+ } while ((width = char_is_identifier(parser, parser->current.end, parser->end - parser->current.end)) > 0);
9089
9095
 
9090
9096
  // $0 isn't allowed to be followed by anything.
9091
- pm_diagnostic_id_t diag_id = parser->version == PM_OPTIONS_VERSION_CRUBY_3_3 ? PM_ERR_INVALID_VARIABLE_GLOBAL_3_3 : PM_ERR_INVALID_VARIABLE_GLOBAL;
9097
+ pm_diagnostic_id_t diag_id = parser->version <= PM_OPTIONS_VERSION_CRUBY_3_3 ? PM_ERR_INVALID_VARIABLE_GLOBAL_3_3 : PM_ERR_INVALID_VARIABLE_GLOBAL;
9092
9098
  PM_PARSER_ERR_TOKEN_FORMAT_CONTENT(parser, parser->current, diag_id);
9093
9099
  }
9094
9100
 
@@ -9114,10 +9120,10 @@ lex_global_variable(pm_parser_t *parser) {
9114
9120
  default: {
9115
9121
  size_t width;
9116
9122
 
9117
- if ((width = char_is_identifier(parser, parser->current.end)) > 0) {
9123
+ if ((width = char_is_identifier(parser, parser->current.end, parser->end - parser->current.end)) > 0) {
9118
9124
  do {
9119
9125
  parser->current.end += width;
9120
- } while (allow_multiple && parser->current.end < parser->end && (width = char_is_identifier(parser, parser->current.end)) > 0);
9126
+ } while (allow_multiple && (width = char_is_identifier(parser, parser->current.end, parser->end - parser->current.end)) > 0);
9121
9127
  } else if (pm_char_is_whitespace(peek(parser))) {
9122
9128
  // If we get here, then we have a $ followed by whitespace,
9123
9129
  // which is not allowed.
@@ -9125,7 +9131,7 @@ lex_global_variable(pm_parser_t *parser) {
9125
9131
  } else {
9126
9132
  // If we get here, then we have a $ followed by something that
9127
9133
  // isn't recognized as a global variable.
9128
- pm_diagnostic_id_t diag_id = parser->version == PM_OPTIONS_VERSION_CRUBY_3_3 ? PM_ERR_INVALID_VARIABLE_GLOBAL_3_3 : PM_ERR_INVALID_VARIABLE_GLOBAL;
9134
+ pm_diagnostic_id_t diag_id = parser->version <= PM_OPTIONS_VERSION_CRUBY_3_3 ? PM_ERR_INVALID_VARIABLE_GLOBAL_3_3 : PM_ERR_INVALID_VARIABLE_GLOBAL;
9129
9135
  const uint8_t *end = parser->current.end + parser->encoding->char_width(parser->current.end, parser->end - parser->current.end);
9130
9136
  PM_PARSER_ERR_FORMAT(parser, parser->current.start, end, diag_id, (int) (end - parser->current.start), (const char *) parser->current.start);
9131
9137
  }
@@ -9182,11 +9188,11 @@ lex_identifier(pm_parser_t *parser, bool previous_command_start) {
9182
9188
  bool encoding_changed = parser->encoding_changed;
9183
9189
 
9184
9190
  if (encoding_changed) {
9185
- while (current_end < end && (width = char_is_identifier(parser, current_end)) > 0) {
9191
+ while ((width = char_is_identifier(parser, current_end, end - current_end)) > 0) {
9186
9192
  current_end += width;
9187
9193
  }
9188
9194
  } else {
9189
- while (current_end < end && (width = char_is_identifier_utf8(current_end, end)) > 0) {
9195
+ while ((width = char_is_identifier_utf8(current_end, end - current_end)) > 0) {
9190
9196
  current_end += width;
9191
9197
  }
9192
9198
  }
@@ -9360,7 +9366,7 @@ lex_interpolation(pm_parser_t *parser, const uint8_t *pound) {
9360
9366
  const uint8_t *variable = pound + 2;
9361
9367
  if (*variable == '@' && pound + 3 < parser->end) variable++;
9362
9368
 
9363
- if (char_is_identifier_start(parser, variable)) {
9369
+ if (char_is_identifier_start(parser, variable, parser->end - variable)) {
9364
9370
  // At this point we're sure that we've either hit an embedded instance
9365
9371
  // or class variable. In this case we'll first need to check if we've
9366
9372
  // already consumed content.
@@ -9409,7 +9415,7 @@ lex_interpolation(pm_parser_t *parser, const uint8_t *pound) {
9409
9415
  // or a global name punctuation character, then we've hit an embedded
9410
9416
  // global variable.
9411
9417
  if (
9412
- char_is_identifier_start(parser, check) ||
9418
+ char_is_identifier_start(parser, check, parser->end - check) ||
9413
9419
  (pound[2] != '-' && (pm_char_is_decimal_digit(pound[2]) || char_is_global_name_punctuation(pound[2])))
9414
9420
  ) {
9415
9421
  // In this case we've hit an embedded global variable. First check to
@@ -9541,21 +9547,7 @@ escape_write_unicode(pm_parser_t *parser, pm_buffer_t *buffer, const uint8_t fla
9541
9547
  parser->explicit_encoding = PM_ENCODING_UTF_8_ENTRY;
9542
9548
  }
9543
9549
 
9544
- if (value <= 0x7F) { // 0xxxxxxx
9545
- pm_buffer_append_byte(buffer, (uint8_t) value);
9546
- } else if (value <= 0x7FF) { // 110xxxxx 10xxxxxx
9547
- pm_buffer_append_byte(buffer, (uint8_t) (0xC0 | (value >> 6)));
9548
- pm_buffer_append_byte(buffer, (uint8_t) (0x80 | (value & 0x3F)));
9549
- } else if (value <= 0xFFFF) { // 1110xxxx 10xxxxxx 10xxxxxx
9550
- pm_buffer_append_byte(buffer, (uint8_t) (0xE0 | (value >> 12)));
9551
- pm_buffer_append_byte(buffer, (uint8_t) (0x80 | ((value >> 6) & 0x3F)));
9552
- pm_buffer_append_byte(buffer, (uint8_t) (0x80 | (value & 0x3F)));
9553
- } else if (value <= 0x10FFFF) { // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
9554
- pm_buffer_append_byte(buffer, (uint8_t) (0xF0 | (value >> 18)));
9555
- pm_buffer_append_byte(buffer, (uint8_t) (0x80 | ((value >> 12) & 0x3F)));
9556
- pm_buffer_append_byte(buffer, (uint8_t) (0x80 | ((value >> 6) & 0x3F)));
9557
- pm_buffer_append_byte(buffer, (uint8_t) (0x80 | (value & 0x3F)));
9558
- } else {
9550
+ if (!pm_buffer_append_unicode_codepoint(buffer, value)) {
9559
9551
  pm_parser_err(parser, start, end, PM_ERR_ESCAPE_INVALID_UNICODE);
9560
9552
  pm_buffer_append_byte(buffer, 0xEF);
9561
9553
  pm_buffer_append_byte(buffer, 0xBF);
@@ -9580,28 +9572,6 @@ escape_write_byte_encoded(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t byte
9580
9572
  pm_buffer_append_byte(buffer, byte);
9581
9573
  }
9582
9574
 
9583
- /**
9584
- * Write each byte of the given escaped character into the buffer.
9585
- */
9586
- static inline void
9587
- escape_write_escape_encoded(pm_parser_t *parser, pm_buffer_t *buffer) {
9588
- size_t width;
9589
- if (parser->encoding_changed) {
9590
- width = parser->encoding->char_width(parser->current.end, parser->end - parser->current.end);
9591
- } else {
9592
- width = pm_encoding_utf_8_char_width(parser->current.end, parser->end - parser->current.end);
9593
- }
9594
-
9595
- // TODO: If the character is invalid in the given encoding, then we'll just
9596
- // push one byte into the buffer. This should actually be an error.
9597
- width = (width == 0) ? 1 : width;
9598
-
9599
- for (size_t index = 0; index < width; index++) {
9600
- escape_write_byte_encoded(parser, buffer, *parser->current.end);
9601
- parser->current.end++;
9602
- }
9603
- }
9604
-
9605
9575
  /**
9606
9576
  * The regular expression engine doesn't support the same escape sequences as
9607
9577
  * Ruby does. So first we have to read the escape sequence, and then we have to
@@ -9626,6 +9596,33 @@ escape_write_byte(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular
9626
9596
  escape_write_byte_encoded(parser, buffer, byte);
9627
9597
  }
9628
9598
 
9599
+ /**
9600
+ * Write each byte of the given escaped character into the buffer.
9601
+ */
9602
+ static inline void
9603
+ escape_write_escape_encoded(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expression_buffer, uint8_t flags) {
9604
+ size_t width;
9605
+ if (parser->encoding_changed) {
9606
+ width = parser->encoding->char_width(parser->current.end, parser->end - parser->current.end);
9607
+ } else {
9608
+ width = pm_encoding_utf_8_char_width(parser->current.end, parser->end - parser->current.end);
9609
+ }
9610
+
9611
+ if (width == 1) {
9612
+ escape_write_byte(parser, buffer, regular_expression_buffer, flags, escape_byte(*parser->current.end++, flags));
9613
+ } else if (width > 1) {
9614
+ // Valid multibyte character. Just ignore escape.
9615
+ pm_buffer_t *b = (flags & PM_ESCAPE_FLAG_REGEXP) ? regular_expression_buffer : buffer;
9616
+ pm_buffer_append_bytes(b, parser->current.end, width);
9617
+ parser->current.end += width;
9618
+ } else {
9619
+ // Assume the next character wasn't meant to be part of this escape
9620
+ // sequence since it is invalid. Add an error and move on.
9621
+ parser->current.end++;
9622
+ pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_CONTROL);
9623
+ }
9624
+ }
9625
+
9629
9626
  /**
9630
9627
  * Warn about using a space or a tab character in an escape, as opposed to using
9631
9628
  * \\s or \\t. Note that we can't quite copy the source because the warning
@@ -9652,7 +9649,8 @@ escape_read_warn(pm_parser_t *parser, uint8_t flags, uint8_t flag, const char *t
9652
9649
  */
9653
9650
  static void
9654
9651
  escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expression_buffer, uint8_t flags) {
9655
- switch (peek(parser)) {
9652
+ uint8_t peeked = peek(parser);
9653
+ switch (peeked) {
9656
9654
  case '\\': {
9657
9655
  parser->current.end++;
9658
9656
  escape_write_byte(parser, buffer, regular_expression_buffer, flags, escape_byte('\\', flags));
@@ -9722,6 +9720,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expre
9722
9720
  }
9723
9721
  }
9724
9722
 
9723
+ value = escape_byte(value, flags);
9725
9724
  escape_write_byte(parser, buffer, regular_expression_buffer, flags, value);
9726
9725
  return;
9727
9726
  }
@@ -9770,7 +9769,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expre
9770
9769
 
9771
9770
  size_t whitespace;
9772
9771
  while (true) {
9773
- if ((whitespace = pm_strspn_whitespace(parser->current.end, parser->end - parser->current.end)) > 0) {
9772
+ if ((whitespace = pm_strspn_inline_whitespace(parser->current.end, parser->end - parser->current.end)) > 0) {
9774
9773
  parser->current.end += whitespace;
9775
9774
  } else if (peek(parser) == '\\' && peek_offset(parser, 1) == 'n') {
9776
9775
  // This is super hacky, but it gets us nicer error
@@ -9818,7 +9817,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expre
9818
9817
  uint32_t value = escape_unicode(parser, unicode_start, hexadecimal_length);
9819
9818
  escape_write_unicode(parser, buffer, flags, unicode_start, parser->current.end, value);
9820
9819
 
9821
- parser->current.end += pm_strspn_whitespace(parser->current.end, parser->end - parser->current.end);
9820
+ parser->current.end += pm_strspn_inline_whitespace(parser->current.end, parser->end - parser->current.end);
9822
9821
  }
9823
9822
 
9824
9823
  // ?\u{nnnn} character literal should contain only one codepoint
@@ -10049,8 +10048,13 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expre
10049
10048
  PRISM_FALLTHROUGH
10050
10049
  }
10051
10050
  default: {
10051
+ if ((flags & (PM_ESCAPE_FLAG_CONTROL | PM_ESCAPE_FLAG_META)) && !char_is_ascii_printable(peeked)) {
10052
+ size_t width = parser->encoding->char_width(parser->current.end, parser->end - parser->current.end);
10053
+ pm_parser_err(parser, parser->current.start, parser->current.end + width, PM_ERR_ESCAPE_INVALID_META);
10054
+ return;
10055
+ }
10052
10056
  if (parser->current.end < parser->end) {
10053
- escape_write_escape_encoded(parser, buffer);
10057
+ escape_write_escape_encoded(parser, buffer, regular_expression_buffer, flags);
10054
10058
  } else {
10055
10059
  pm_parser_err_current(parser, PM_ERR_INVALID_ESCAPE_CHARACTER);
10056
10060
  }
@@ -10123,7 +10127,7 @@ lex_question_mark(pm_parser_t *parser) {
10123
10127
  !(parser->encoding->alnum_char(parser->current.end, parser->end - parser->current.end) || peek(parser) == '_') ||
10124
10128
  (
10125
10129
  (parser->current.end + encoding_width >= parser->end) ||
10126
- !char_is_identifier(parser, parser->current.end + encoding_width)
10130
+ !char_is_identifier(parser, parser->current.end + encoding_width, parser->end - (parser->current.end + encoding_width))
10127
10131
  )
10128
10132
  ) {
10129
10133
  lex_state_set(parser, PM_LEX_STATE_END);
@@ -10143,21 +10147,22 @@ lex_question_mark(pm_parser_t *parser) {
10143
10147
  static pm_token_type_t
10144
10148
  lex_at_variable(pm_parser_t *parser) {
10145
10149
  pm_token_type_t type = match(parser, '@') ? PM_TOKEN_CLASS_VARIABLE : PM_TOKEN_INSTANCE_VARIABLE;
10146
- size_t width;
10150
+ const uint8_t *end = parser->end;
10147
10151
 
10148
- if (parser->current.end < parser->end && (width = char_is_identifier_start(parser, parser->current.end)) > 0) {
10152
+ size_t width;
10153
+ if ((width = char_is_identifier_start(parser, parser->current.end, end - parser->current.end)) > 0) {
10149
10154
  parser->current.end += width;
10150
10155
 
10151
- while (parser->current.end < parser->end && (width = char_is_identifier(parser, parser->current.end)) > 0) {
10156
+ while ((width = char_is_identifier(parser, parser->current.end, end - parser->current.end)) > 0) {
10152
10157
  parser->current.end += width;
10153
10158
  }
10154
- } else if (parser->current.end < parser->end && pm_char_is_decimal_digit(*parser->current.end)) {
10159
+ } else if (parser->current.end < end && pm_char_is_decimal_digit(*parser->current.end)) {
10155
10160
  pm_diagnostic_id_t diag_id = (type == PM_TOKEN_CLASS_VARIABLE) ? PM_ERR_INCOMPLETE_VARIABLE_CLASS : PM_ERR_INCOMPLETE_VARIABLE_INSTANCE;
10156
- if (parser->version == PM_OPTIONS_VERSION_CRUBY_3_3) {
10161
+ if (parser->version <= PM_OPTIONS_VERSION_CRUBY_3_3) {
10157
10162
  diag_id = (type == PM_TOKEN_CLASS_VARIABLE) ? PM_ERR_INCOMPLETE_VARIABLE_CLASS_3_3 : PM_ERR_INCOMPLETE_VARIABLE_INSTANCE_3_3;
10158
10163
  }
10159
10164
 
10160
- size_t width = parser->encoding->char_width(parser->current.end, parser->end - parser->current.end);
10165
+ size_t width = parser->encoding->char_width(parser->current.end, end - parser->current.end);
10161
10166
  PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, diag_id, (int) ((parser->current.end + width) - parser->current.start), (const char *) parser->current.start);
10162
10167
  } else {
10163
10168
  pm_diagnostic_id_t diag_id = (type == PM_TOKEN_CLASS_VARIABLE) ? PM_ERR_CLASS_VARIABLE_BARE : PM_ERR_INSTANCE_VARIABLE_BARE;
@@ -10829,14 +10834,37 @@ parser_lex(pm_parser_t *parser) {
10829
10834
  following = next_newline(following, parser->end - following);
10830
10835
  }
10831
10836
 
10832
- // If the lex state was ignored, or we hit a '.' or a '&.',
10833
- // we will lex the ignored newline
10837
+ // If the lex state was ignored, we will lex the
10838
+ // ignored newline.
10839
+ if (lex_state_ignored_p(parser)) {
10840
+ if (!lexed_comment) parser_lex_ignored_newline(parser);
10841
+ lexed_comment = false;
10842
+ goto lex_next_token;
10843
+ }
10844
+
10845
+ // If we hit a '.' or a '&.' we will lex the ignored
10846
+ // newline.
10847
+ if (following && (
10848
+ (peek_at(parser, following) == '.') ||
10849
+ (peek_at(parser, following) == '&' && peek_at(parser, following + 1) == '.')
10850
+ )) {
10851
+ if (!lexed_comment) parser_lex_ignored_newline(parser);
10852
+ lexed_comment = false;
10853
+ goto lex_next_token;
10854
+ }
10855
+
10856
+
10857
+ // If we are parsing as CRuby 3.5 or later and we
10858
+ // hit a '&&' or a '||' then we will lex the ignored
10859
+ // newline.
10834
10860
  if (
10835
- lex_state_ignored_p(parser) ||
10836
- (following && (
10837
- (peek_at(parser, following) == '.') ||
10838
- (peek_at(parser, following) == '&' && peek_at(parser, following + 1) == '.')
10839
- ))
10861
+ (parser->version >= PM_OPTIONS_VERSION_CRUBY_3_5) &&
10862
+ following && (
10863
+ (peek_at(parser, following) == '&' && peek_at(parser, following + 1) == '&') ||
10864
+ (peek_at(parser, following) == '|' && peek_at(parser, following + 1) == '|') ||
10865
+ (peek_at(parser, following) == 'a' && peek_at(parser, following + 1) == 'n' && peek_at(parser, following + 2) == 'd' && !char_is_identifier(parser, following + 3, parser->end - (following + 3))) ||
10866
+ (peek_at(parser, following) == 'o' && peek_at(parser, following + 1) == 'r' && !char_is_identifier(parser, following + 2, parser->end - (following + 2)))
10867
+ )
10840
10868
  ) {
10841
10869
  if (!lexed_comment) parser_lex_ignored_newline(parser);
10842
10870
  lexed_comment = false;
@@ -10876,6 +10904,63 @@ parser_lex(pm_parser_t *parser) {
10876
10904
  parser->next_start = NULL;
10877
10905
  LEX(PM_TOKEN_AMPERSAND_DOT);
10878
10906
  }
10907
+
10908
+ if (parser->version >= PM_OPTIONS_VERSION_CRUBY_3_5) {
10909
+ // If we hit an && then we are in a logical chain
10910
+ // and we need to return the logical operator.
10911
+ if (peek_at(parser, next_content) == '&' && peek_at(parser, next_content + 1) == '&') {
10912
+ if (!lexed_comment) parser_lex_ignored_newline(parser);
10913
+ lex_state_set(parser, PM_LEX_STATE_BEG);
10914
+ parser->current.start = next_content;
10915
+ parser->current.end = next_content + 2;
10916
+ parser->next_start = NULL;
10917
+ LEX(PM_TOKEN_AMPERSAND_AMPERSAND);
10918
+ }
10919
+
10920
+ // If we hit a || then we are in a logical chain and
10921
+ // we need to return the logical operator.
10922
+ if (peek_at(parser, next_content) == '|' && peek_at(parser, next_content + 1) == '|') {
10923
+ if (!lexed_comment) parser_lex_ignored_newline(parser);
10924
+ lex_state_set(parser, PM_LEX_STATE_BEG);
10925
+ parser->current.start = next_content;
10926
+ parser->current.end = next_content + 2;
10927
+ parser->next_start = NULL;
10928
+ LEX(PM_TOKEN_PIPE_PIPE);
10929
+ }
10930
+
10931
+ // If we hit an 'and' then we are in a logical chain
10932
+ // and we need to return the logical operator.
10933
+ if (
10934
+ peek_at(parser, next_content) == 'a' &&
10935
+ peek_at(parser, next_content + 1) == 'n' &&
10936
+ peek_at(parser, next_content + 2) == 'd' &&
10937
+ !char_is_identifier(parser, next_content + 3, parser->end - (next_content + 3))
10938
+ ) {
10939
+ if (!lexed_comment) parser_lex_ignored_newline(parser);
10940
+ lex_state_set(parser, PM_LEX_STATE_BEG);
10941
+ parser->current.start = next_content;
10942
+ parser->current.end = next_content + 3;
10943
+ parser->next_start = NULL;
10944
+ parser->command_start = true;
10945
+ LEX(PM_TOKEN_KEYWORD_AND);
10946
+ }
10947
+
10948
+ // If we hit an 'or' then we are in a logical chain
10949
+ // and we need to return the logical operator.
10950
+ if (
10951
+ peek_at(parser, next_content) == 'o' &&
10952
+ peek_at(parser, next_content + 1) == 'r' &&
10953
+ !char_is_identifier(parser, next_content + 2, parser->end - (next_content + 2))
10954
+ ) {
10955
+ if (!lexed_comment) parser_lex_ignored_newline(parser);
10956
+ lex_state_set(parser, PM_LEX_STATE_BEG);
10957
+ parser->current.start = next_content;
10958
+ parser->current.end = next_content + 2;
10959
+ parser->next_start = NULL;
10960
+ parser->command_start = true;
10961
+ LEX(PM_TOKEN_KEYWORD_OR);
10962
+ }
10963
+ }
10879
10964
  }
10880
10965
 
10881
10966
  // At this point we know this is a regular newline, and we can set the
@@ -11145,13 +11230,13 @@ parser_lex(pm_parser_t *parser) {
11145
11230
 
11146
11231
  if (parser->current.end >= parser->end) {
11147
11232
  parser->current.end = end;
11148
- } else if (quote == PM_HEREDOC_QUOTE_NONE && (width = char_is_identifier(parser, parser->current.end)) == 0) {
11233
+ } else if (quote == PM_HEREDOC_QUOTE_NONE && (width = char_is_identifier(parser, parser->current.end, parser->end - parser->current.end)) == 0) {
11149
11234
  parser->current.end = end;
11150
11235
  } else {
11151
11236
  if (quote == PM_HEREDOC_QUOTE_NONE) {
11152
11237
  parser->current.end += width;
11153
11238
 
11154
- while ((parser->current.end < parser->end) && (width = char_is_identifier(parser, parser->current.end))) {
11239
+ while ((width = char_is_identifier(parser, parser->current.end, parser->end - parser->current.end))) {
11155
11240
  parser->current.end += width;
11156
11241
  }
11157
11242
  } else {
@@ -11336,7 +11421,7 @@ parser_lex(pm_parser_t *parser) {
11336
11421
  } else {
11337
11422
  const uint8_t delim = peek_offset(parser, 1);
11338
11423
 
11339
- if ((delim != '\'') && (delim != '"') && !char_is_identifier(parser, parser->current.end + 1)) {
11424
+ if ((delim != '\'') && (delim != '"') && !char_is_identifier(parser, parser->current.end + 1, parser->end - (parser->current.end + 1))) {
11340
11425
  pm_parser_warn_token(parser, &parser->current, PM_WARN_AMBIGUOUS_PREFIX_AMPERSAND);
11341
11426
  }
11342
11427
  }
@@ -11774,7 +11859,7 @@ parser_lex(pm_parser_t *parser) {
11774
11859
 
11775
11860
  default: {
11776
11861
  if (*parser->current.start != '_') {
11777
- size_t width = char_is_identifier_start(parser, parser->current.start);
11862
+ size_t width = char_is_identifier_start(parser, parser->current.start, parser->end - parser->current.start);
11778
11863
 
11779
11864
  // If this isn't the beginning of an identifier, then
11780
11865
  // it's an invalid token as we've exhausted all of the
@@ -12965,7 +13050,7 @@ typedef struct {
12965
13050
 
12966
13051
  pm_binding_powers_t pm_binding_powers[PM_TOKEN_MAXIMUM] = {
12967
13052
  // rescue
12968
- [PM_TOKEN_KEYWORD_RESCUE_MODIFIER] = LEFT_ASSOCIATIVE(PM_BINDING_POWER_MODIFIER_RESCUE),
13053
+ [PM_TOKEN_KEYWORD_RESCUE_MODIFIER] = { PM_BINDING_POWER_MODIFIER_RESCUE, PM_BINDING_POWER_COMPOSITION, true, false },
12969
13054
 
12970
13055
  // if unless until while
12971
13056
  [PM_TOKEN_KEYWORD_IF_MODIFIER] = LEFT_ASSOCIATIVE(PM_BINDING_POWER_MODIFIER),
@@ -13122,14 +13207,6 @@ match8(const pm_parser_t *parser, pm_token_type_t type1, pm_token_type_t type2,
13122
13207
  return match1(parser, type1) || match1(parser, type2) || match1(parser, type3) || match1(parser, type4) || match1(parser, type5) || match1(parser, type6) || match1(parser, type7) || match1(parser, type8);
13123
13208
  }
13124
13209
 
13125
- /**
13126
- * Returns true if the current token is any of the nine given types.
13127
- */
13128
- static inline bool
13129
- match9(const pm_parser_t *parser, pm_token_type_t type1, pm_token_type_t type2, pm_token_type_t type3, pm_token_type_t type4, pm_token_type_t type5, pm_token_type_t type6, pm_token_type_t type7, pm_token_type_t type8, pm_token_type_t type9) {
13130
- return match1(parser, type1) || match1(parser, type2) || match1(parser, type3) || match1(parser, type4) || match1(parser, type5) || match1(parser, type6) || match1(parser, type7) || match1(parser, type8) || match1(parser, type9);
13131
- }
13132
-
13133
13210
  /**
13134
13211
  * If the current token is of the specified type, lex forward by one token and
13135
13212
  * return true. Otherwise, return false. For example:
@@ -13708,7 +13785,7 @@ parse_write(pm_parser_t *parser, pm_node_t *target, pm_token_t *operator, pm_nod
13708
13785
  return target;
13709
13786
  }
13710
13787
 
13711
- if (char_is_identifier_start(parser, call->message_loc.start)) {
13788
+ if (char_is_identifier_start(parser, call->message_loc.start, parser->end - call->message_loc.start)) {
13712
13789
  // When we get here, we have a method call, because it was
13713
13790
  // previously marked as a method call but now we have an =. This
13714
13791
  // looks like:
@@ -13936,6 +14013,15 @@ parse_statements(pm_parser_t *parser, pm_context_t context, uint16_t depth) {
13936
14013
  if (PM_NODE_TYPE_P(node, PM_MISSING_NODE)) {
13937
14014
  parser_lex(parser);
13938
14015
 
14016
+ // If we are at the end of the file, then we need to stop parsing
14017
+ // the statements entirely at this point. Mark the parser as
14018
+ // recovering, as we know that EOF closes the top-level context, and
14019
+ // then break out of the loop.
14020
+ if (match1(parser, PM_TOKEN_EOF)) {
14021
+ parser->recovering = true;
14022
+ break;
14023
+ }
14024
+
13939
14025
  while (accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON));
13940
14026
  if (context_terminator(context, &parser->current)) break;
13941
14027
  } else if (!accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_EOF)) {
@@ -14642,7 +14728,7 @@ parse_parameters(
14642
14728
  parser_lex(parser);
14643
14729
 
14644
14730
  pm_constant_id_t name_id = pm_parser_constant_id_token(parser, &name);
14645
- uint32_t reads = parser->version == PM_OPTIONS_VERSION_CRUBY_3_3 ? pm_locals_reads(&parser->current_scope->locals, name_id) : 0;
14731
+ uint32_t reads = parser->version <= PM_OPTIONS_VERSION_CRUBY_3_3 ? pm_locals_reads(&parser->current_scope->locals, name_id) : 0;
14646
14732
 
14647
14733
  if (accepts_blocks_in_defaults) pm_accepts_block_stack_push(parser, true);
14648
14734
  pm_node_t *value = parse_value_expression(parser, binding_power, false, false, PM_ERR_PARAMETER_NO_DEFAULT, (uint16_t) (depth + 1));
@@ -14658,7 +14744,7 @@ parse_parameters(
14658
14744
  // If the value of the parameter increased the number of
14659
14745
  // reads of that parameter, then we need to warn that we
14660
14746
  // have a circular definition.
14661
- if ((parser->version == PM_OPTIONS_VERSION_CRUBY_3_3) && (pm_locals_reads(&parser->current_scope->locals, name_id) != reads)) {
14747
+ if ((parser->version <= PM_OPTIONS_VERSION_CRUBY_3_3) && (pm_locals_reads(&parser->current_scope->locals, name_id) != reads)) {
14662
14748
  PM_PARSER_ERR_TOKEN_FORMAT_CONTENT(parser, name, PM_ERR_PARAMETER_CIRCULAR);
14663
14749
  }
14664
14750
 
@@ -14743,13 +14829,13 @@ parse_parameters(
14743
14829
 
14744
14830
  if (token_begins_expression_p(parser->current.type)) {
14745
14831
  pm_constant_id_t name_id = pm_parser_constant_id_token(parser, &local);
14746
- uint32_t reads = parser->version == PM_OPTIONS_VERSION_CRUBY_3_3 ? pm_locals_reads(&parser->current_scope->locals, name_id) : 0;
14832
+ uint32_t reads = parser->version <= PM_OPTIONS_VERSION_CRUBY_3_3 ? pm_locals_reads(&parser->current_scope->locals, name_id) : 0;
14747
14833
 
14748
14834
  if (accepts_blocks_in_defaults) pm_accepts_block_stack_push(parser, true);
14749
14835
  pm_node_t *value = parse_value_expression(parser, binding_power, false, false, PM_ERR_PARAMETER_NO_DEFAULT_KW, (uint16_t) (depth + 1));
14750
14836
  if (accepts_blocks_in_defaults) pm_accepts_block_stack_pop(parser);
14751
14837
 
14752
- if (parser->version == PM_OPTIONS_VERSION_CRUBY_3_3 && (pm_locals_reads(&parser->current_scope->locals, name_id) != reads)) {
14838
+ if (parser->version <= PM_OPTIONS_VERSION_CRUBY_3_3 && (pm_locals_reads(&parser->current_scope->locals, name_id) != reads)) {
14753
14839
  PM_PARSER_ERR_TOKEN_FORMAT_CONTENT(parser, local, PM_ERR_PARAMETER_CIRCULAR);
14754
14840
  }
14755
14841
 
@@ -15051,8 +15137,8 @@ parse_rescues(pm_parser_t *parser, size_t opening_newline_index, const pm_token_
15051
15137
  case PM_TOKEN_NEWLINE:
15052
15138
  case PM_TOKEN_SEMICOLON:
15053
15139
  case PM_TOKEN_KEYWORD_THEN:
15054
- // Here we have a terminator for the rescue keyword, in which case we're
15055
- // going to just continue on.
15140
+ // Here we have a terminator for the rescue keyword, in which
15141
+ // case we're going to just continue on.
15056
15142
  break;
15057
15143
  default: {
15058
15144
  if (token_begins_expression_p(parser->current.type) || match1(parser, PM_TOKEN_USTAR)) {
@@ -15084,9 +15170,12 @@ parse_rescues(pm_parser_t *parser, size_t opening_newline_index, const pm_token_
15084
15170
  }
15085
15171
 
15086
15172
  if (accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON)) {
15087
- accept1(parser, PM_TOKEN_KEYWORD_THEN);
15173
+ if (accept1(parser, PM_TOKEN_KEYWORD_THEN)) {
15174
+ rescue->then_keyword_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(&parser->previous);
15175
+ }
15088
15176
  } else {
15089
15177
  expect1(parser, PM_TOKEN_KEYWORD_THEN, PM_ERR_RESCUE_TERM);
15178
+ rescue->then_keyword_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(&parser->previous);
15090
15179
  }
15091
15180
 
15092
15181
  if (!match3(parser, PM_TOKEN_KEYWORD_ELSE, PM_TOKEN_KEYWORD_ENSURE, PM_TOKEN_KEYWORD_END)) {
@@ -16450,7 +16539,7 @@ parse_variable(pm_parser_t *parser) {
16450
16539
  pm_node_list_append(&current_scope->implicit_parameters, node);
16451
16540
 
16452
16541
  return node;
16453
- } else if ((parser->version != PM_OPTIONS_VERSION_CRUBY_3_3) && pm_token_is_it(parser->previous.start, parser->previous.end)) {
16542
+ } else if ((parser->version >= PM_OPTIONS_VERSION_CRUBY_3_4) && pm_token_is_it(parser->previous.start, parser->previous.end)) {
16454
16543
  pm_node_t *node = (pm_node_t *) pm_it_local_variable_read_node_create(parser, &parser->previous);
16455
16544
  pm_node_list_append(&current_scope->implicit_parameters, node);
16456
16545
 
@@ -16802,6 +16891,10 @@ parse_strings(pm_parser_t *parser, pm_node_t *current, bool accepts_label, uint1
16802
16891
  // If we haven't already created our container for concatenation,
16803
16892
  // we'll do that now.
16804
16893
  if (!concating) {
16894
+ if (!PM_NODE_TYPE_P(current, PM_STRING_NODE) && !PM_NODE_TYPE_P(current, PM_INTERPOLATED_STRING_NODE)) {
16895
+ pm_parser_err_node(parser, current, PM_ERR_STRING_CONCATENATION);
16896
+ }
16897
+
16805
16898
  concating = true;
16806
16899
  pm_token_t bounds = not_provided(parser);
16807
16900
 
@@ -17040,7 +17133,7 @@ pm_slice_is_valid_local(const pm_parser_t *parser, const uint8_t *start, const u
17040
17133
  if (length == 0) return false;
17041
17134
 
17042
17135
  // First ensure that it starts with a valid identifier starting character.
17043
- size_t width = char_is_identifier_start(parser, start);
17136
+ size_t width = char_is_identifier_start(parser, start, end - start);
17044
17137
  if (width == 0) return false;
17045
17138
 
17046
17139
  // Next, ensure that it's not an uppercase character.
@@ -17053,7 +17146,7 @@ pm_slice_is_valid_local(const pm_parser_t *parser, const uint8_t *start, const u
17053
17146
  // Next, iterate through all of the bytes of the string to ensure that they
17054
17147
  // are all valid identifier characters.
17055
17148
  const uint8_t *cursor = start + width;
17056
- while ((cursor < end) && (width = char_is_identifier(parser, cursor))) cursor += width;
17149
+ while ((width = char_is_identifier(parser, cursor, end - cursor))) cursor += width;
17057
17150
  return cursor == end;
17058
17151
  }
17059
17152
 
@@ -17376,6 +17469,14 @@ parse_pattern_primitive(pm_parser_t *parser, pm_constant_id_list_t *captures, pm
17376
17469
  // If we found a label, we need to immediately return to the caller.
17377
17470
  if (pm_symbol_node_label_p(node)) return node;
17378
17471
 
17472
+ // Call nodes (arithmetic operations) are not allowed in patterns
17473
+ if (PM_NODE_TYPE(node) == PM_CALL_NODE) {
17474
+ pm_parser_err_node(parser, node, diag_id);
17475
+ pm_missing_node_t *missing_node = pm_missing_node_create(parser, node->location.start, node->location.end);
17476
+ pm_node_destroy(parser, node);
17477
+ return (pm_node_t *) missing_node;
17478
+ }
17479
+
17379
17480
  // Now that we have a primitive, we need to check if it's part of a range.
17380
17481
  if (accept2(parser, PM_TOKEN_DOT_DOT, PM_TOKEN_DOT_DOT_DOT)) {
17381
17482
  pm_token_t operator = parser->previous;
@@ -17526,7 +17627,7 @@ parse_pattern_primitives(pm_parser_t *parser, pm_constant_id_list_t *captures, p
17526
17627
  pm_node_t *body = parse_pattern(parser, captures, PM_PARSE_PATTERN_SINGLE, PM_ERR_PATTERN_EXPRESSION_AFTER_PAREN, (uint16_t) (depth + 1));
17527
17628
  accept1(parser, PM_TOKEN_NEWLINE);
17528
17629
  expect1(parser, PM_TOKEN_PARENTHESIS_RIGHT, PM_ERR_PATTERN_TERM_PAREN);
17529
- pm_node_t *right = (pm_node_t *) pm_parentheses_node_create(parser, &opening, body, &parser->previous);
17630
+ pm_node_t *right = (pm_node_t *) pm_parentheses_node_create(parser, &opening, body, &parser->previous, 0);
17530
17631
 
17531
17632
  if (node == NULL) {
17532
17633
  node = right;
@@ -17658,7 +17759,7 @@ parse_pattern(pm_parser_t *parser, pm_constant_id_list_t *captures, uint8_t flag
17658
17759
  // Gather up all of the patterns into the list.
17659
17760
  while (accept1(parser, PM_TOKEN_COMMA)) {
17660
17761
  // Break early here in case we have a trailing comma.
17661
- if (match9(parser, PM_TOKEN_KEYWORD_THEN, PM_TOKEN_BRACE_RIGHT, PM_TOKEN_BRACKET_RIGHT, PM_TOKEN_PARENTHESIS_RIGHT, PM_TOKEN_SEMICOLON, PM_TOKEN_NEWLINE, PM_TOKEN_EOF,PM_TOKEN_KEYWORD_AND, PM_TOKEN_KEYWORD_OR)) {
17762
+ if (match7(parser, PM_TOKEN_KEYWORD_THEN, PM_TOKEN_BRACE_RIGHT, PM_TOKEN_BRACKET_RIGHT, PM_TOKEN_PARENTHESIS_RIGHT, PM_TOKEN_SEMICOLON, PM_TOKEN_KEYWORD_AND, PM_TOKEN_KEYWORD_OR)) {
17662
17763
  node = (pm_node_t *) pm_implicit_rest_node_create(parser, &parser->previous);
17663
17764
  pm_node_list_append(&nodes, node);
17664
17765
  trailing_rest = true;
@@ -18149,12 +18250,19 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
18149
18250
  case PM_TOKEN_PARENTHESIS_LEFT:
18150
18251
  case PM_TOKEN_PARENTHESIS_LEFT_PARENTHESES: {
18151
18252
  pm_token_t opening = parser->current;
18253
+ pm_node_flags_t flags = 0;
18152
18254
 
18153
18255
  pm_node_list_t current_block_exits = { 0 };
18154
18256
  pm_node_list_t *previous_block_exits = push_block_exits(parser, &current_block_exits);
18155
18257
 
18156
18258
  parser_lex(parser);
18157
- while (accept2(parser, PM_TOKEN_SEMICOLON, PM_TOKEN_NEWLINE));
18259
+ while (true) {
18260
+ if (accept1(parser, PM_TOKEN_SEMICOLON)) {
18261
+ flags |= PM_PARENTHESES_NODE_FLAGS_MULTIPLE_STATEMENTS;
18262
+ } else if (!accept1(parser, PM_TOKEN_NEWLINE)) {
18263
+ break;
18264
+ }
18265
+ }
18158
18266
 
18159
18267
  // If this is the end of the file or we match a right parenthesis, then
18160
18268
  // we have an empty parentheses node, and we can immediately return.
@@ -18164,7 +18272,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
18164
18272
  pop_block_exits(parser, previous_block_exits);
18165
18273
  pm_node_list_free(&current_block_exits);
18166
18274
 
18167
- return (pm_node_t *) pm_parentheses_node_create(parser, &opening, NULL, &parser->previous);
18275
+ return (pm_node_t *) pm_parentheses_node_create(parser, &opening, NULL, &parser->previous, flags);
18168
18276
  }
18169
18277
 
18170
18278
  // Otherwise, we're going to parse the first statement in the list
@@ -18177,9 +18285,23 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
18177
18285
  // Determine if this statement is followed by a terminator. In the
18178
18286
  // case of a single statement, this is fine. But in the case of
18179
18287
  // multiple statements it's required.
18180
- bool terminator_found = accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON);
18288
+ bool terminator_found = false;
18289
+
18290
+ if (accept1(parser, PM_TOKEN_SEMICOLON)) {
18291
+ terminator_found = true;
18292
+ flags |= PM_PARENTHESES_NODE_FLAGS_MULTIPLE_STATEMENTS;
18293
+ } else if (accept1(parser, PM_TOKEN_NEWLINE)) {
18294
+ terminator_found = true;
18295
+ }
18296
+
18181
18297
  if (terminator_found) {
18182
- while (accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON));
18298
+ while (true) {
18299
+ if (accept1(parser, PM_TOKEN_SEMICOLON)) {
18300
+ flags |= PM_PARENTHESES_NODE_FLAGS_MULTIPLE_STATEMENTS;
18301
+ } else if (!accept1(parser, PM_TOKEN_NEWLINE)) {
18302
+ break;
18303
+ }
18304
+ }
18183
18305
  }
18184
18306
 
18185
18307
  // If we hit a right parenthesis, then we're done parsing the
@@ -18251,13 +18373,15 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
18251
18373
  pm_statements_node_t *statements = pm_statements_node_create(parser);
18252
18374
  pm_statements_node_body_append(parser, statements, statement, true);
18253
18375
 
18254
- return (pm_node_t *) pm_parentheses_node_create(parser, &opening, (pm_node_t *) statements, &parser->previous);
18376
+ return (pm_node_t *) pm_parentheses_node_create(parser, &opening, (pm_node_t *) statements, &parser->previous, flags);
18255
18377
  }
18256
18378
 
18257
18379
  // If we have more than one statement in the set of parentheses,
18258
18380
  // then we are going to parse all of them as a list of statements.
18259
18381
  // We'll do that here.
18260
18382
  context_push(parser, PM_CONTEXT_PARENS);
18383
+ flags |= PM_PARENTHESES_NODE_FLAGS_MULTIPLE_STATEMENTS;
18384
+
18261
18385
  pm_statements_node_t *statements = pm_statements_node_create(parser);
18262
18386
  pm_statements_node_body_append(parser, statements, statement, true);
18263
18387
 
@@ -18334,7 +18458,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
18334
18458
  pm_node_list_free(&current_block_exits);
18335
18459
 
18336
18460
  pm_void_statements_check(parser, statements, true);
18337
- return (pm_node_t *) pm_parentheses_node_create(parser, &opening, (pm_node_t *) statements, &parser->previous);
18461
+ return (pm_node_t *) pm_parentheses_node_create(parser, &opening, (pm_node_t *) statements, &parser->previous, flags);
18338
18462
  }
18339
18463
  case PM_TOKEN_BRACE_LEFT: {
18340
18464
  // If we were passed a current_hash_keys via the parser, then that
@@ -18526,17 +18650,11 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
18526
18650
  call->closing_loc = arguments.closing_loc;
18527
18651
  call->block = arguments.block;
18528
18652
 
18529
- if (arguments.block != NULL) {
18530
- call->base.location.end = arguments.block->location.end;
18531
- } else if (arguments.closing_loc.start == NULL) {
18532
- if (arguments.arguments != NULL) {
18533
- call->base.location.end = arguments.arguments->base.location.end;
18534
- } else {
18535
- call->base.location.end = call->message_loc.end;
18536
- }
18537
- } else {
18538
- call->base.location.end = arguments.closing_loc.end;
18653
+ const uint8_t *end = pm_arguments_end(&arguments);
18654
+ if (!end) {
18655
+ end = call->message_loc.end;
18539
18656
  }
18657
+ call->base.location.end = end;
18540
18658
  }
18541
18659
  } else {
18542
18660
  // Otherwise, we know the identifier is in the local table. This
@@ -19064,7 +19182,13 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
19064
19182
  pm_binding_power_t binding_power = pm_binding_powers[parser->current.type].left;
19065
19183
 
19066
19184
  if (binding_power == PM_BINDING_POWER_UNSET || binding_power >= PM_BINDING_POWER_RANGE) {
19185
+ pm_token_t next = parser->current;
19067
19186
  parse_arguments(parser, &arguments, false, PM_TOKEN_EOF, (uint16_t) (depth + 1));
19187
+
19188
+ // Reject `foo && return bar`.
19189
+ if (!accepts_command_call && arguments.arguments != NULL) {
19190
+ PM_PARSER_ERR_TOKEN_FORMAT(parser, next, PM_ERR_EXPECT_EOL_AFTER_STATEMENT, pm_token_type_human(next.type));
19191
+ }
19068
19192
  }
19069
19193
  }
19070
19194
 
@@ -19380,7 +19504,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
19380
19504
  expect2(parser, PM_TOKEN_DOT, PM_TOKEN_COLON_COLON, PM_ERR_DEF_RECEIVER_TERM);
19381
19505
 
19382
19506
  operator = parser->previous;
19383
- receiver = (pm_node_t *) pm_parentheses_node_create(parser, &lparen, expression, &rparen);
19507
+ receiver = (pm_node_t *) pm_parentheses_node_create(parser, &lparen, expression, &rparen, 0);
19384
19508
 
19385
19509
  // To push `PM_CONTEXT_DEF_PARAMS` again is for the same
19386
19510
  // reason as described the above.
@@ -19461,13 +19585,21 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
19461
19585
  pm_do_loop_stack_push(parser, false);
19462
19586
  statements = (pm_node_t *) pm_statements_node_create(parser);
19463
19587
 
19464
- pm_node_t *statement = parse_expression(parser, PM_BINDING_POWER_DEFINED + 1, binding_power < PM_BINDING_POWER_COMPOSITION, false, PM_ERR_DEF_ENDLESS, (uint16_t) (depth + 1));
19588
+ bool allow_command_call;
19589
+ if (parser->version >= PM_OPTIONS_VERSION_CRUBY_3_5) {
19590
+ allow_command_call = accepts_command_call;
19591
+ } else {
19592
+ // Allow `def foo = puts "Hello"` but not `private def foo = puts "Hello"`
19593
+ allow_command_call = binding_power == PM_BINDING_POWER_ASSIGNMENT || binding_power < PM_BINDING_POWER_COMPOSITION;
19594
+ }
19595
+
19596
+ pm_node_t *statement = parse_expression(parser, PM_BINDING_POWER_DEFINED + 1, allow_command_call, false, PM_ERR_DEF_ENDLESS, (uint16_t) (depth + 1));
19465
19597
 
19466
19598
  if (accept1(parser, PM_TOKEN_KEYWORD_RESCUE_MODIFIER)) {
19467
19599
  context_push(parser, PM_CONTEXT_RESCUE_MODIFIER);
19468
19600
 
19469
19601
  pm_token_t rescue_keyword = parser->previous;
19470
- pm_node_t *value = parse_expression(parser, binding_power, false, false, PM_ERR_RESCUE_MODIFIER_VALUE, (uint16_t) (depth + 1));
19602
+ pm_node_t *value = parse_expression(parser, pm_binding_powers[PM_TOKEN_KEYWORD_RESCUE_MODIFIER].right, false, false, PM_ERR_RESCUE_MODIFIER_VALUE, (uint16_t) (depth + 1));
19471
19603
  context_pop(parser);
19472
19604
 
19473
19605
  statement = (pm_node_t *) pm_rescue_modifier_node_create(parser, statement, &rescue_keyword, value);
@@ -19548,18 +19680,27 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
19548
19680
  pm_token_t lparen;
19549
19681
  pm_token_t rparen;
19550
19682
  pm_node_t *expression;
19683
+
19551
19684
  context_push(parser, PM_CONTEXT_DEFINED);
19685
+ bool newline = accept1(parser, PM_TOKEN_NEWLINE);
19552
19686
 
19553
19687
  if (accept1(parser, PM_TOKEN_PARENTHESIS_LEFT)) {
19554
19688
  lparen = parser->previous;
19555
- expression = parse_expression(parser, PM_BINDING_POWER_COMPOSITION, true, false, PM_ERR_DEFINED_EXPRESSION, (uint16_t) (depth + 1));
19556
19689
 
19557
- if (parser->recovering) {
19690
+ if (newline && accept1(parser, PM_TOKEN_PARENTHESIS_RIGHT)) {
19691
+ expression = (pm_node_t *) pm_parentheses_node_create(parser, &lparen, NULL, &parser->previous, 0);
19692
+ lparen = not_provided(parser);
19558
19693
  rparen = not_provided(parser);
19559
19694
  } else {
19560
- accept1(parser, PM_TOKEN_NEWLINE);
19561
- expect1(parser, PM_TOKEN_PARENTHESIS_RIGHT, PM_ERR_EXPECT_RPAREN);
19562
- rparen = parser->previous;
19695
+ expression = parse_expression(parser, PM_BINDING_POWER_COMPOSITION, true, false, PM_ERR_DEFINED_EXPRESSION, (uint16_t) (depth + 1));
19696
+
19697
+ if (parser->recovering) {
19698
+ rparen = not_provided(parser);
19699
+ } else {
19700
+ accept1(parser, PM_TOKEN_NEWLINE);
19701
+ expect1(parser, PM_TOKEN_PARENTHESIS_RIGHT, PM_ERR_EXPECT_RPAREN);
19702
+ rparen = parser->previous;
19703
+ }
19563
19704
  }
19564
19705
  } else {
19565
19706
  lparen = not_provided(parser);
@@ -19707,14 +19848,29 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
19707
19848
  pm_arguments_t arguments = { 0 };
19708
19849
  pm_node_t *receiver = NULL;
19709
19850
 
19851
+ // If we do not accept a command call, then we also do not accept a
19852
+ // not without parentheses. In this case we need to reject this
19853
+ // syntax.
19854
+ if (!accepts_command_call && !match1(parser, PM_TOKEN_PARENTHESIS_LEFT)) {
19855
+ if (match1(parser, PM_TOKEN_PARENTHESIS_LEFT_PARENTHESES)) {
19856
+ pm_parser_err(parser, parser->previous.end, parser->previous.end + 1, PM_ERR_EXPECT_LPAREN_AFTER_NOT_LPAREN);
19857
+ } else {
19858
+ accept1(parser, PM_TOKEN_NEWLINE);
19859
+ pm_parser_err_current(parser, PM_ERR_EXPECT_LPAREN_AFTER_NOT_OTHER);
19860
+ }
19861
+
19862
+ return (pm_node_t *) pm_missing_node_create(parser, parser->current.start, parser->current.end);
19863
+ }
19864
+
19710
19865
  accept1(parser, PM_TOKEN_NEWLINE);
19711
19866
 
19712
19867
  if (accept1(parser, PM_TOKEN_PARENTHESIS_LEFT)) {
19713
- arguments.opening_loc = PM_LOCATION_TOKEN_VALUE(&parser->previous);
19868
+ pm_token_t lparen = parser->previous;
19714
19869
 
19715
19870
  if (accept1(parser, PM_TOKEN_PARENTHESIS_RIGHT)) {
19716
- arguments.closing_loc = PM_LOCATION_TOKEN_VALUE(&parser->previous);
19871
+ receiver = (pm_node_t *) pm_parentheses_node_create(parser, &lparen, NULL, &parser->previous, 0);
19717
19872
  } else {
19873
+ arguments.opening_loc = PM_LOCATION_TOKEN_VALUE(&lparen);
19718
19874
  receiver = parse_expression(parser, PM_BINDING_POWER_COMPOSITION, true, false, PM_ERR_NOT_EXPRESSION, (uint16_t) (depth + 1));
19719
19875
 
19720
19876
  if (!parser->recovering) {
@@ -20687,7 +20843,7 @@ parse_assignment_value(pm_parser_t *parser, pm_binding_power_t previous_binding_
20687
20843
  pm_token_t rescue = parser->current;
20688
20844
  parser_lex(parser);
20689
20845
 
20690
- pm_node_t *right = parse_expression(parser, binding_power, false, false, PM_ERR_RESCUE_MODIFIER_VALUE, (uint16_t) (depth + 1));
20846
+ pm_node_t *right = parse_expression(parser, pm_binding_powers[PM_TOKEN_KEYWORD_RESCUE_MODIFIER].right, false, false, PM_ERR_RESCUE_MODIFIER_VALUE, (uint16_t) (depth + 1));
20691
20847
  context_pop(parser);
20692
20848
 
20693
20849
  return (pm_node_t *) pm_rescue_modifier_node_create(parser, value, &rescue, right);
@@ -20793,7 +20949,7 @@ parse_assignment_values(pm_parser_t *parser, pm_binding_power_t previous_binding
20793
20949
  }
20794
20950
  }
20795
20951
 
20796
- pm_node_t *right = parse_expression(parser, binding_power, accepts_command_call_inner, false, PM_ERR_RESCUE_MODIFIER_VALUE, (uint16_t) (depth + 1));
20952
+ pm_node_t *right = parse_expression(parser, pm_binding_powers[PM_TOKEN_KEYWORD_RESCUE_MODIFIER].right, accepts_command_call_inner, false, PM_ERR_RESCUE_MODIFIER_VALUE, (uint16_t) (depth + 1));
20797
20953
  context_pop(parser);
20798
20954
 
20799
20955
  return (pm_node_t *) pm_rescue_modifier_node_create(parser, value, &rescue, right);
@@ -20849,6 +21005,123 @@ typedef struct {
20849
21005
  bool shared;
20850
21006
  } parse_regular_expression_named_capture_data_t;
20851
21007
 
21008
+ static inline const uint8_t *
21009
+ pm_named_capture_escape_hex(pm_buffer_t *unescaped, const uint8_t *cursor, const uint8_t *end) {
21010
+ cursor++;
21011
+
21012
+ if (cursor < end && pm_char_is_hexadecimal_digit(*cursor)) {
21013
+ uint8_t value = escape_hexadecimal_digit(*cursor);
21014
+ cursor++;
21015
+
21016
+ if (cursor < end && pm_char_is_hexadecimal_digit(*cursor)) {
21017
+ value = (uint8_t) ((value << 4) | escape_hexadecimal_digit(*cursor));
21018
+ cursor++;
21019
+ }
21020
+
21021
+ pm_buffer_append_byte(unescaped, value);
21022
+ } else {
21023
+ pm_buffer_append_string(unescaped, "\\x", 2);
21024
+ }
21025
+
21026
+ return cursor;
21027
+ }
21028
+
21029
+ static inline const uint8_t *
21030
+ pm_named_capture_escape_octal(pm_buffer_t *unescaped, const uint8_t *cursor, const uint8_t *end) {
21031
+ uint8_t value = (uint8_t) (*cursor - '0');
21032
+ cursor++;
21033
+
21034
+ if (cursor < end && pm_char_is_octal_digit(*cursor)) {
21035
+ value = ((uint8_t) (value << 3)) | ((uint8_t) (*cursor - '0'));
21036
+ cursor++;
21037
+
21038
+ if (cursor < end && pm_char_is_octal_digit(*cursor)) {
21039
+ value = ((uint8_t) (value << 3)) | ((uint8_t) (*cursor - '0'));
21040
+ cursor++;
21041
+ }
21042
+ }
21043
+
21044
+ pm_buffer_append_byte(unescaped, value);
21045
+ return cursor;
21046
+ }
21047
+
21048
+ static inline const uint8_t *
21049
+ pm_named_capture_escape_unicode(pm_parser_t *parser, pm_buffer_t *unescaped, const uint8_t *cursor, const uint8_t *end) {
21050
+ const uint8_t *start = cursor - 1;
21051
+ cursor++;
21052
+
21053
+ if (cursor >= end) {
21054
+ pm_buffer_append_string(unescaped, "\\u", 2);
21055
+ return cursor;
21056
+ }
21057
+
21058
+ if (*cursor != '{') {
21059
+ size_t length = pm_strspn_hexadecimal_digit(cursor, MIN(end - cursor, 4));
21060
+ uint32_t value = escape_unicode(parser, cursor, length);
21061
+
21062
+ if (!pm_buffer_append_unicode_codepoint(unescaped, value)) {
21063
+ pm_buffer_append_string(unescaped, (const char *) start, (size_t) ((cursor + length) - start));
21064
+ }
21065
+
21066
+ return cursor + length;
21067
+ }
21068
+
21069
+ cursor++;
21070
+ for (;;) {
21071
+ while (cursor < end && *cursor == ' ') cursor++;
21072
+
21073
+ if (cursor >= end) break;
21074
+ if (*cursor == '}') {
21075
+ cursor++;
21076
+ break;
21077
+ }
21078
+
21079
+ size_t length = pm_strspn_hexadecimal_digit(cursor, end - cursor);
21080
+ uint32_t value = escape_unicode(parser, cursor, length);
21081
+
21082
+ (void) pm_buffer_append_unicode_codepoint(unescaped, value);
21083
+ cursor += length;
21084
+ }
21085
+
21086
+ return cursor;
21087
+ }
21088
+
21089
+ static void
21090
+ pm_named_capture_escape(pm_parser_t *parser, pm_buffer_t *unescaped, const uint8_t *source, const size_t length, const uint8_t *cursor) {
21091
+ const uint8_t *end = source + length;
21092
+ pm_buffer_append_string(unescaped, (const char *) source, (size_t) (cursor - source));
21093
+
21094
+ for (;;) {
21095
+ if (++cursor >= end) {
21096
+ pm_buffer_append_byte(unescaped, '\\');
21097
+ return;
21098
+ }
21099
+
21100
+ switch (*cursor) {
21101
+ case 'x':
21102
+ cursor = pm_named_capture_escape_hex(unescaped, cursor, end);
21103
+ break;
21104
+ case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7':
21105
+ cursor = pm_named_capture_escape_octal(unescaped, cursor, end);
21106
+ break;
21107
+ case 'u':
21108
+ cursor = pm_named_capture_escape_unicode(parser, unescaped, cursor, end);
21109
+ break;
21110
+ default:
21111
+ pm_buffer_append_byte(unescaped, '\\');
21112
+ break;
21113
+ }
21114
+
21115
+ const uint8_t *next_cursor = pm_memchr(cursor, '\\', (size_t) (end - cursor), parser->encoding_changed, parser->encoding);
21116
+ if (next_cursor == NULL) break;
21117
+
21118
+ pm_buffer_append_string(unescaped, (const char *) cursor, (size_t) (next_cursor - cursor));
21119
+ cursor = next_cursor;
21120
+ }
21121
+
21122
+ pm_buffer_append_string(unescaped, (const char *) cursor, (size_t) (end - cursor));
21123
+ }
21124
+
20852
21125
  /**
20853
21126
  * This callback is called when the regular expression parser encounters a named
20854
21127
  * capture group.
@@ -20863,13 +21136,32 @@ parse_regular_expression_named_capture(const pm_string_t *capture, void *data) {
20863
21136
 
20864
21137
  const uint8_t *source = pm_string_source(capture);
20865
21138
  size_t length = pm_string_length(capture);
21139
+ pm_buffer_t unescaped = { 0 };
21140
+
21141
+ // First, we need to handle escapes within the name of the capture group.
21142
+ // This is because regular expressions have three different representations
21143
+ // in prism. The first is the plain source code. The second is the
21144
+ // representation that will be sent to the regular expression engine, which
21145
+ // is the value of the "unescaped" field. This is poorly named, because it
21146
+ // actually still contains escapes, just a subset of them that the regular
21147
+ // expression engine knows how to handle. The third representation is fully
21148
+ // unescaped, which is what we need.
21149
+ const uint8_t *cursor = pm_memchr(source, '\\', length, parser->encoding_changed, parser->encoding);
21150
+ if (PRISM_UNLIKELY(cursor != NULL)) {
21151
+ pm_named_capture_escape(parser, &unescaped, source, length, cursor);
21152
+ source = (const uint8_t *) pm_buffer_value(&unescaped);
21153
+ length = pm_buffer_length(&unescaped);
21154
+ }
20866
21155
 
20867
21156
  pm_location_t location;
20868
21157
  pm_constant_id_t name;
20869
21158
 
20870
21159
  // If the name of the capture group isn't a valid identifier, we do
20871
21160
  // not add it to the local table.
20872
- if (!pm_slice_is_valid_local(parser, source, source + length)) return;
21161
+ if (!pm_slice_is_valid_local(parser, source, source + length)) {
21162
+ pm_buffer_free(&unescaped);
21163
+ return;
21164
+ }
20873
21165
 
20874
21166
  if (callback_data->shared) {
20875
21167
  // If the unescaped string is a slice of the source, then we can
@@ -20897,7 +21189,10 @@ parse_regular_expression_named_capture(const pm_string_t *capture, void *data) {
20897
21189
  if ((depth = pm_parser_local_depth_constant_id(parser, name)) == -1) {
20898
21190
  // If the local is not already a local but it is a keyword, then we
20899
21191
  // do not want to add a capture for this.
20900
- if (pm_local_is_keyword((const char *) source, length)) return;
21192
+ if (pm_local_is_keyword((const char *) source, length)) {
21193
+ pm_buffer_free(&unescaped);
21194
+ return;
21195
+ }
20901
21196
 
20902
21197
  // If the identifier is not already a local, then we will add it to
20903
21198
  // the local table.
@@ -20915,6 +21210,8 @@ parse_regular_expression_named_capture(const pm_string_t *capture, void *data) {
20915
21210
  pm_node_t *target = (pm_node_t *) pm_local_variable_target_node_create(parser, &location, name, depth == -1 ? 0 : (uint32_t) depth);
20916
21211
  pm_node_list_append(&callback_data->match->targets, target);
20917
21212
  }
21213
+
21214
+ pm_buffer_free(&unescaped);
20918
21215
  }
20919
21216
 
20920
21217
  /**
@@ -20966,6 +21263,13 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t
20966
21263
  }
20967
21264
  PRISM_FALLTHROUGH
20968
21265
  case PM_CASE_WRITABLE: {
21266
+ // When we have `it = value`, we need to add `it` as a local
21267
+ // variable before parsing the value, in case the value
21268
+ // references the variable.
21269
+ if (PM_NODE_TYPE_P(node, PM_IT_LOCAL_VARIABLE_READ_NODE)) {
21270
+ pm_parser_local_add_location(parser, node->location.start, node->location.end, 0);
21271
+ }
21272
+
20969
21273
  parser_lex(parser);
20970
21274
  pm_node_t *value = parse_assignment_values(parser, previous_binding_power, PM_NODE_TYPE_P(node, PM_MULTI_TARGET_NODE) ? PM_BINDING_POWER_MULTI_ASSIGNMENT + 1 : binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_EQUAL, (uint16_t) (depth + 1));
20971
21275
 
@@ -21055,7 +21359,23 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t
21055
21359
  pm_node_destroy(parser, node);
21056
21360
  return result;
21057
21361
  }
21362
+ case PM_IT_LOCAL_VARIABLE_READ_NODE: {
21363
+ pm_constant_id_t name = pm_parser_local_add_constant(parser, "it", 2);
21364
+ parser_lex(parser);
21365
+
21366
+ pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_AMPAMPEQ, (uint16_t) (depth + 1));
21367
+ pm_node_t *result = (pm_node_t *) pm_local_variable_and_write_node_create(parser, node, &token, value, name, 0);
21368
+
21369
+ parse_target_implicit_parameter(parser, node);
21370
+ pm_node_destroy(parser, node);
21371
+ return result;
21372
+ }
21058
21373
  case PM_LOCAL_VARIABLE_READ_NODE: {
21374
+ if (pm_token_is_numbered_parameter(node->location.start, node->location.end)) {
21375
+ PM_PARSER_ERR_FORMAT(parser, node->location.start, node->location.end, PM_ERR_PARAMETER_NUMBERED_RESERVED, node->location.start);
21376
+ parse_target_implicit_parameter(parser, node);
21377
+ }
21378
+
21059
21379
  pm_local_variable_read_node_t *cast = (pm_local_variable_read_node_t *) node;
21060
21380
  parser_lex(parser);
21061
21381
 
@@ -21173,7 +21493,23 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t
21173
21493
  pm_node_destroy(parser, node);
21174
21494
  return result;
21175
21495
  }
21496
+ case PM_IT_LOCAL_VARIABLE_READ_NODE: {
21497
+ pm_constant_id_t name = pm_parser_local_add_constant(parser, "it", 2);
21498
+ parser_lex(parser);
21499
+
21500
+ pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_PIPEPIPEEQ, (uint16_t) (depth + 1));
21501
+ pm_node_t *result = (pm_node_t *) pm_local_variable_or_write_node_create(parser, node, &token, value, name, 0);
21502
+
21503
+ parse_target_implicit_parameter(parser, node);
21504
+ pm_node_destroy(parser, node);
21505
+ return result;
21506
+ }
21176
21507
  case PM_LOCAL_VARIABLE_READ_NODE: {
21508
+ if (pm_token_is_numbered_parameter(node->location.start, node->location.end)) {
21509
+ PM_PARSER_ERR_FORMAT(parser, node->location.start, node->location.end, PM_ERR_PARAMETER_NUMBERED_RESERVED, node->location.start);
21510
+ parse_target_implicit_parameter(parser, node);
21511
+ }
21512
+
21177
21513
  pm_local_variable_read_node_t *cast = (pm_local_variable_read_node_t *) node;
21178
21514
  parser_lex(parser);
21179
21515
 
@@ -21301,7 +21637,23 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t
21301
21637
  pm_node_destroy(parser, node);
21302
21638
  return result;
21303
21639
  }
21640
+ case PM_IT_LOCAL_VARIABLE_READ_NODE: {
21641
+ pm_constant_id_t name = pm_parser_local_add_constant(parser, "it", 2);
21642
+ parser_lex(parser);
21643
+
21644
+ pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR, (uint16_t) (depth + 1));
21645
+ pm_node_t *result = (pm_node_t *) pm_local_variable_operator_write_node_create(parser, node, &token, value, name, 0);
21646
+
21647
+ parse_target_implicit_parameter(parser, node);
21648
+ pm_node_destroy(parser, node);
21649
+ return result;
21650
+ }
21304
21651
  case PM_LOCAL_VARIABLE_READ_NODE: {
21652
+ if (pm_token_is_numbered_parameter(node->location.start, node->location.end)) {
21653
+ PM_PARSER_ERR_FORMAT(parser, node->location.start, node->location.end, PM_ERR_PARAMETER_NUMBERED_RESERVED, node->location.start);
21654
+ parse_target_implicit_parameter(parser, node);
21655
+ }
21656
+
21305
21657
  pm_local_variable_read_node_t *cast = (pm_local_variable_read_node_t *) node;
21306
21658
  parser_lex(parser);
21307
21659
 
@@ -21911,6 +22263,12 @@ parse_expression(pm_parser_t *parser, pm_binding_power_t binding_power, bool acc
21911
22263
  ) {
21912
22264
  node = parse_expression_infix(parser, node, binding_power, current_binding_powers.right, accepts_command_call, (uint16_t) (depth + 1));
21913
22265
 
22266
+ if (context_terminator(parser->current_context->context, &parser->current)) {
22267
+ // If this token terminates the current context, then we need to
22268
+ // stop parsing the expression, as it has become a statement.
22269
+ return node;
22270
+ }
22271
+
21914
22272
  switch (PM_NODE_TYPE(node)) {
21915
22273
  case PM_MULTI_WRITE_NODE:
21916
22274
  // Multi-write nodes are statements, and cannot be followed by
@@ -22035,6 +22393,10 @@ parse_expression(pm_parser_t *parser, pm_binding_power_t binding_power, bool acc
22035
22393
  static pm_statements_node_t *
22036
22394
  wrap_statements(pm_parser_t *parser, pm_statements_node_t *statements) {
22037
22395
  if (PM_PARSER_COMMAND_LINE_OPTION_P(parser)) {
22396
+ if (statements == NULL) {
22397
+ statements = pm_statements_node_create(parser);
22398
+ }
22399
+
22038
22400
  pm_arguments_node_t *arguments = pm_arguments_node_create(parser);
22039
22401
  pm_arguments_node_arguments_append(
22040
22402
  arguments,
@@ -22050,6 +22412,10 @@ wrap_statements(pm_parser_t *parser, pm_statements_node_t *statements) {
22050
22412
 
22051
22413
  if (PM_PARSER_COMMAND_LINE_OPTION_N(parser)) {
22052
22414
  if (PM_PARSER_COMMAND_LINE_OPTION_A(parser)) {
22415
+ if (statements == NULL) {
22416
+ statements = pm_statements_node_create(parser);
22417
+ }
22418
+
22053
22419
  pm_arguments_node_t *arguments = pm_arguments_node_create(parser);
22054
22420
  pm_arguments_node_arguments_append(
22055
22421
  arguments,
@@ -22118,9 +22484,7 @@ parse_program(pm_parser_t *parser) {
22118
22484
  parser_lex(parser);
22119
22485
  pm_statements_node_t *statements = parse_statements(parser, PM_CONTEXT_MAIN, 0);
22120
22486
 
22121
- if (statements == NULL) {
22122
- statements = pm_statements_node_create(parser);
22123
- } else if (!parser->parsing_eval) {
22487
+ if (statements != NULL && !parser->parsing_eval) {
22124
22488
  // If we have statements, then the top-level statement should be
22125
22489
  // explicitly checked as well. We have to do this here because
22126
22490
  // everywhere else we check all but the last statement.
@@ -22132,13 +22496,6 @@ parse_program(pm_parser_t *parser) {
22132
22496
  pm_locals_order(parser, &parser->current_scope->locals, &locals, true);
22133
22497
  pm_parser_scope_pop(parser);
22134
22498
 
22135
- // If this is an empty file, then we're still going to parse all of the
22136
- // statements in order to gather up all of the comments and such. Here we'll
22137
- // correct the location information.
22138
- if (pm_statements_node_body_length(statements) == 0) {
22139
- pm_statements_node_location_set(statements, parser->start, parser->start);
22140
- }
22141
-
22142
22499
  // At the top level, see if we need to wrap the statements in a program
22143
22500
  // node with a while loop based on the options.
22144
22501
  if (parser->command_line & (PM_OPTIONS_COMMAND_LINE_P | PM_OPTIONS_COMMAND_LINE_N)) {
@@ -22148,6 +22505,14 @@ parse_program(pm_parser_t *parser) {
22148
22505
  pm_node_list_free(&current_block_exits);
22149
22506
  }
22150
22507
 
22508
+ // If this is an empty file, then we're still going to parse all of the
22509
+ // statements in order to gather up all of the comments and such. Here we'll
22510
+ // correct the location information.
22511
+ if (statements == NULL) {
22512
+ statements = pm_statements_node_create(parser);
22513
+ pm_statements_node_location_set(statements, parser->start, parser->start);
22514
+ }
22515
+
22151
22516
  return (pm_node_t *) pm_program_node_create(parser, &locals, statements);
22152
22517
  }
22153
22518
 
@@ -22341,7 +22706,7 @@ pm_parser_init(pm_parser_t *parser, const uint8_t *source, size_t size, const pm
22341
22706
 
22342
22707
  // Scopes given from the outside are not allowed to have numbered
22343
22708
  // parameters.
22344
- parser->current_scope->parameters |= PM_SCOPE_PARAMETERS_IMPLICIT_DISALLOWED;
22709
+ parser->current_scope->parameters = ((pm_scope_parameters_t) scope->forwarding) | PM_SCOPE_PARAMETERS_IMPLICIT_DISALLOWED;
22345
22710
 
22346
22711
  for (size_t local_index = 0; local_index < scope->locals_count; local_index++) {
22347
22712
  const pm_string_t *local = pm_options_scope_local_get(scope, local_index);
@@ -22358,6 +22723,12 @@ pm_parser_init(pm_parser_t *parser, const uint8_t *source, size_t size, const pm
22358
22723
  }
22359
22724
  }
22360
22725
 
22726
+ // Now that we have established the user-provided options, check if
22727
+ // a version was given and parse as the latest version otherwise.
22728
+ if (parser->version == PM_OPTIONS_VERSION_UNSET) {
22729
+ parser->version = PM_OPTIONS_VERSION_LATEST;
22730
+ }
22731
+
22361
22732
  pm_accepts_block_stack_push(parser, true);
22362
22733
 
22363
22734
  // Skip past the UTF-8 BOM if it exists.
@@ -22411,7 +22782,7 @@ pm_parser_init(pm_parser_t *parser, const uint8_t *source, size_t size, const pm
22411
22782
  }
22412
22783
 
22413
22784
  search_shebang = false;
22414
- } else if (options->main_script && !parser->parsing_eval) {
22785
+ } else if (options != NULL && options->main_script && !parser->parsing_eval) {
22415
22786
  search_shebang = true;
22416
22787
  }
22417
22788
  }
@@ -22551,11 +22922,11 @@ pm_parse(pm_parser_t *parser) {
22551
22922
  * otherwise return true.
22552
22923
  */
22553
22924
  static bool
22554
- pm_parse_stream_read(pm_buffer_t *buffer, void *stream, pm_parse_stream_fgets_t *fgets) {
22925
+ pm_parse_stream_read(pm_buffer_t *buffer, void *stream, pm_parse_stream_fgets_t *stream_fgets, pm_parse_stream_feof_t *stream_feof) {
22555
22926
  #define LINE_SIZE 4096
22556
22927
  char line[LINE_SIZE];
22557
22928
 
22558
- while (memset(line, '\n', LINE_SIZE), fgets(line, LINE_SIZE, stream) != NULL) {
22929
+ while (memset(line, '\n', LINE_SIZE), stream_fgets(line, LINE_SIZE, stream) != NULL) {
22559
22930
  size_t length = LINE_SIZE;
22560
22931
  while (length > 0 && line[length - 1] == '\n') length--;
22561
22932
 
@@ -22587,6 +22958,12 @@ pm_parse_stream_read(pm_buffer_t *buffer, void *stream, pm_parse_stream_fgets_t
22587
22958
  if (strncmp(line, "__END__\r\n", 9) == 0) return false;
22588
22959
  break;
22589
22960
  }
22961
+
22962
+ // All data should be read via gets. If the string returned by gets
22963
+ // _doesn't_ end with a newline, then we assume we hit EOF condition.
22964
+ if (stream_feof(stream)) {
22965
+ break;
22966
+ }
22590
22967
  }
22591
22968
 
22592
22969
  return true;
@@ -22622,16 +22999,17 @@ pm_parse_stream_unterminated_heredoc_p(pm_parser_t *parser) {
22622
22999
  * can stream stdin in to Ruby so we need to support a streaming API.
22623
23000
  */
22624
23001
  PRISM_EXPORTED_FUNCTION pm_node_t *
22625
- pm_parse_stream(pm_parser_t *parser, pm_buffer_t *buffer, void *stream, pm_parse_stream_fgets_t *fgets, const pm_options_t *options) {
23002
+ pm_parse_stream(pm_parser_t *parser, pm_buffer_t *buffer, void *stream, pm_parse_stream_fgets_t *stream_fgets, pm_parse_stream_feof_t *stream_feof, const pm_options_t *options) {
22626
23003
  pm_buffer_init(buffer);
22627
23004
 
22628
- bool eof = pm_parse_stream_read(buffer, stream, fgets);
23005
+ bool eof = pm_parse_stream_read(buffer, stream, stream_fgets, stream_feof);
23006
+
22629
23007
  pm_parser_init(parser, (const uint8_t *) pm_buffer_value(buffer), pm_buffer_length(buffer), options);
22630
23008
  pm_node_t *node = pm_parse(parser);
22631
23009
 
22632
23010
  while (!eof && parser->error_list.size > 0 && (parser->lex_modes.index > 0 || pm_parse_stream_unterminated_heredoc_p(parser))) {
22633
23011
  pm_node_destroy(parser, node);
22634
- eof = pm_parse_stream_read(buffer, stream, fgets);
23012
+ eof = pm_parse_stream_read(buffer, stream, stream_fgets, stream_feof);
22635
23013
 
22636
23014
  pm_parser_free(parser);
22637
23015
  pm_parser_init(parser, (const uint8_t *) pm_buffer_value(buffer), pm_buffer_length(buffer), options);
@@ -22723,13 +23101,13 @@ pm_serialize_parse(pm_buffer_t *buffer, const uint8_t *source, size_t size, cons
22723
23101
  * given stream into to the given buffer.
22724
23102
  */
22725
23103
  PRISM_EXPORTED_FUNCTION void
22726
- pm_serialize_parse_stream(pm_buffer_t *buffer, void *stream, pm_parse_stream_fgets_t *fgets, const char *data) {
23104
+ pm_serialize_parse_stream(pm_buffer_t *buffer, void *stream, pm_parse_stream_fgets_t *stream_fgets, pm_parse_stream_feof_t *stream_feof, const char *data) {
22727
23105
  pm_parser_t parser;
22728
23106
  pm_options_t options = { 0 };
22729
23107
  pm_options_read(&options, data);
22730
23108
 
22731
23109
  pm_buffer_t parser_buffer;
22732
- pm_node_t *node = pm_parse_stream(&parser, &parser_buffer, stream, fgets, &options);
23110
+ pm_node_t *node = pm_parse_stream(&parser, &parser_buffer, stream, stream_fgets, stream_feof, &options);
22733
23111
  pm_serialize_header(buffer);
22734
23112
  pm_serialize_content(&parser, node, buffer);
22735
23113
  pm_buffer_append_byte(buffer, '\0');