prism 0.19.0 → 0.24.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +102 -1
  3. data/Makefile +5 -0
  4. data/README.md +9 -6
  5. data/config.yml +236 -38
  6. data/docs/build_system.md +19 -2
  7. data/docs/cruby_compilation.md +27 -0
  8. data/docs/parser_translation.md +34 -0
  9. data/docs/parsing_rules.md +19 -0
  10. data/docs/releasing.md +84 -16
  11. data/docs/ruby_api.md +1 -1
  12. data/docs/ruby_parser_translation.md +19 -0
  13. data/docs/serialization.md +19 -5
  14. data/ext/prism/api_node.c +1989 -1525
  15. data/ext/prism/extension.c +130 -30
  16. data/ext/prism/extension.h +2 -2
  17. data/include/prism/ast.h +1700 -505
  18. data/include/prism/defines.h +8 -0
  19. data/include/prism/diagnostic.h +49 -7
  20. data/include/prism/encoding.h +17 -0
  21. data/include/prism/options.h +40 -14
  22. data/include/prism/parser.h +34 -18
  23. data/include/prism/util/pm_buffer.h +9 -0
  24. data/include/prism/util/pm_constant_pool.h +18 -0
  25. data/include/prism/util/pm_newline_list.h +4 -14
  26. data/include/prism/util/pm_strpbrk.h +4 -1
  27. data/include/prism/version.h +2 -2
  28. data/include/prism.h +19 -2
  29. data/lib/prism/debug.rb +11 -5
  30. data/lib/prism/desugar_compiler.rb +225 -80
  31. data/lib/prism/dot_visitor.rb +36 -14
  32. data/lib/prism/dsl.rb +302 -299
  33. data/lib/prism/ffi.rb +107 -76
  34. data/lib/prism/lex_compat.rb +17 -1
  35. data/lib/prism/node.rb +4580 -2607
  36. data/lib/prism/node_ext.rb +27 -4
  37. data/lib/prism/parse_result.rb +75 -29
  38. data/lib/prism/serialize.rb +633 -305
  39. data/lib/prism/translation/parser/compiler.rb +1838 -0
  40. data/lib/prism/translation/parser/lexer.rb +335 -0
  41. data/lib/prism/translation/parser/rubocop.rb +45 -0
  42. data/lib/prism/translation/parser.rb +190 -0
  43. data/lib/prism/translation/parser33.rb +12 -0
  44. data/lib/prism/translation/parser34.rb +12 -0
  45. data/lib/prism/translation/ripper.rb +696 -0
  46. data/lib/prism/translation/ruby_parser.rb +1521 -0
  47. data/lib/prism/translation.rb +11 -0
  48. data/lib/prism.rb +1 -1
  49. data/prism.gemspec +18 -7
  50. data/rbi/prism.rbi +150 -88
  51. data/rbi/prism_static.rbi +15 -3
  52. data/sig/prism.rbs +996 -961
  53. data/sig/prism_static.rbs +123 -46
  54. data/src/diagnostic.c +264 -219
  55. data/src/encoding.c +21 -26
  56. data/src/node.c +2 -6
  57. data/src/options.c +29 -5
  58. data/src/prettyprint.c +176 -44
  59. data/src/prism.c +1499 -564
  60. data/src/serialize.c +35 -21
  61. data/src/token_type.c +353 -4
  62. data/src/util/pm_buffer.c +11 -0
  63. data/src/util/pm_constant_pool.c +37 -11
  64. data/src/util/pm_newline_list.c +6 -15
  65. data/src/util/pm_string.c +0 -7
  66. data/src/util/pm_strpbrk.c +122 -14
  67. metadata +16 -5
  68. data/docs/building.md +0 -29
  69. data/lib/prism/ripper_compat.rb +0 -207
data/src/prism.c CHANGED
@@ -51,6 +51,7 @@ debug_context(pm_context_t context) {
51
51
  case PM_CONTEXT_IF: return "IF";
52
52
  case PM_CONTEXT_MAIN: return "MAIN";
53
53
  case PM_CONTEXT_MODULE: return "MODULE";
54
+ case PM_CONTEXT_NONE: return "NONE";
54
55
  case PM_CONTEXT_PARENS: return "PARENS";
55
56
  case PM_CONTEXT_POSTEXE: return "POSTEXE";
56
57
  case PM_CONTEXT_PREDICATE: return "PREDICATE";
@@ -164,7 +165,7 @@ debug_state(pm_parser_t *parser) {
164
165
 
165
166
  PRISM_ATTRIBUTE_UNUSED static void
166
167
  debug_token(pm_token_t * token) {
167
- fprintf(stderr, "%s: \"%.*s\"\n", pm_token_type_to_str(token->type), (int) (token->end - token->start), token->start);
168
+ fprintf(stderr, "%s: \"%.*s\"\n", pm_token_type_human(token->type), (int) (token->end - token->start), token->start);
168
169
  }
169
170
 
170
171
  #endif
@@ -423,6 +424,11 @@ lex_state_beg_p(pm_parser_t *parser) {
423
424
  return lex_state_p(parser, PM_LEX_STATE_BEG_ANY) || ((parser->lex_state & (PM_LEX_STATE_ARG | PM_LEX_STATE_LABELED)) == (PM_LEX_STATE_ARG | PM_LEX_STATE_LABELED));
424
425
  }
425
426
 
427
+ static inline bool
428
+ lex_state_arg_labeled_p(pm_parser_t *parser) {
429
+ return (parser->lex_state & (PM_LEX_STATE_ARG | PM_LEX_STATE_LABELED)) == (PM_LEX_STATE_ARG | PM_LEX_STATE_LABELED);
430
+ }
431
+
426
432
  static inline bool
427
433
  lex_state_arg_p(pm_parser_t *parser) {
428
434
  return lex_state_p(parser, PM_LEX_STATE_ARG_ANY);
@@ -487,7 +493,8 @@ pm_parser_err(pm_parser_t *parser, const uint8_t *start, const uint8_t *end, pm_
487
493
  /**
488
494
  * Append an error to the list of errors on the parser using a format string.
489
495
  */
490
- #define PM_PARSER_ERR_FORMAT(parser, start, end, diag_id, ...) pm_diagnostic_list_append_format(&parser->error_list, start, end, diag_id, __VA_ARGS__)
496
+ #define PM_PARSER_ERR_FORMAT(parser, start, end, diag_id, ...) \
497
+ pm_diagnostic_list_append_format(&parser->error_list, start, end, diag_id, __VA_ARGS__)
491
498
 
492
499
  /**
493
500
  * Append an error to the list of errors on the parser using the location of the
@@ -502,7 +509,8 @@ pm_parser_err_current(pm_parser_t *parser, pm_diagnostic_id_t diag_id) {
502
509
  * Append an error to the list of errors on the parser using the given location
503
510
  * using a format string.
504
511
  */
505
- #define PM_PARSER_ERR_LOCATION_FORMAT(parser, location, diag_id, ...) pm_diagnostic_list_append_format(&parser->error_list, (location)->start, (location)->end, diag_id, __VA_ARGS__)
512
+ #define PM_PARSER_ERR_LOCATION_FORMAT(parser, location, diag_id, ...) \
513
+ PM_PARSER_ERR_FORMAT(parser, (location)->start, (location)->end, diag_id, __VA_ARGS__)
506
514
 
507
515
  /**
508
516
  * Append an error to the list of errors on the parser using the location of the
@@ -517,7 +525,15 @@ pm_parser_err_node(pm_parser_t *parser, const pm_node_t *node, pm_diagnostic_id_
517
525
  * Append an error to the list of errors on the parser using the location of the
518
526
  * given node and a format string.
519
527
  */
520
- #define PM_PARSER_ERR_NODE_FORMAT(parser, node, diag_id, ...) pm_diagnostic_list_append_format(&parser->error_list, node->location.start, node->location.end, diag_id, __VA_ARGS__)
528
+ #define PM_PARSER_ERR_NODE_FORMAT(parser, node, diag_id, ...) \
529
+ PM_PARSER_ERR_FORMAT(parser, (node)->location.start, (node)->location.end, diag_id, __VA_ARGS__)
530
+
531
+ /**
532
+ * Append an error to the list of errors on the parser using the location of the
533
+ * given node and a format string, and add on the content of the node.
534
+ */
535
+ #define PM_PARSER_ERR_NODE_FORMAT_CONTENT(parser, node, diag_id) \
536
+ PM_PARSER_ERR_NODE_FORMAT(parser, node, diag_id, (int) ((node)->location.end - (node)->location.start), (const char *) (node)->location.start)
521
537
 
522
538
  /**
523
539
  * Append an error to the list of errors on the parser using the location of the
@@ -541,16 +557,22 @@ pm_parser_err_token(pm_parser_t *parser, const pm_token_t *token, pm_diagnostic_
541
557
  * Append an error to the list of errors on the parser using the location of the
542
558
  * given token and a format string.
543
559
  */
544
- #define PM_PARSER_ERR_TOKEN_FORMAT(parser, token, diag_id, ...) pm_diagnostic_list_append_format(&parser->error_list, (token).start, (token).end, diag_id, __VA_ARGS__)
560
+ #define PM_PARSER_ERR_TOKEN_FORMAT(parser, token, diag_id, ...) \
561
+ PM_PARSER_ERR_FORMAT(parser, (token).start, (token).end, diag_id, __VA_ARGS__)
562
+
563
+ /**
564
+ * Append an error to the list of errors on the parser using the location of the
565
+ * given token and a format string, and add on the content of the token.
566
+ */
567
+ #define PM_PARSER_ERR_TOKEN_FORMAT_CONTENT(parser, token, diag_id) \
568
+ PM_PARSER_ERR_TOKEN_FORMAT(parser, token, diag_id, (int) ((token).end - (token).start), (const char *) (token).start)
545
569
 
546
570
  /**
547
571
  * Append a warning to the list of warnings on the parser.
548
572
  */
549
573
  static inline void
550
574
  pm_parser_warn(pm_parser_t *parser, const uint8_t *start, const uint8_t *end, pm_diagnostic_id_t diag_id) {
551
- if (!parser->suppress_warnings) {
552
- pm_diagnostic_list_append(&parser->warning_list, start, end, diag_id);
553
- }
575
+ pm_diagnostic_list_append(&parser->warning_list, start, end, diag_id);
554
576
  }
555
577
 
556
578
  /**
@@ -813,6 +835,9 @@ typedef struct {
813
835
 
814
836
  /** The optional block attached to the call. */
815
837
  pm_node_t *block;
838
+
839
+ /** The flag indicating whether this arguments list has forwarding argument. */
840
+ bool has_forwarding;
816
841
  } pm_arguments_t;
817
842
 
818
843
  /**
@@ -864,6 +889,105 @@ pm_arguments_validate_block(pm_parser_t *parser, pm_arguments_t *arguments, pm_b
864
889
  pm_parser_err_node(parser, (pm_node_t *) block, PM_ERR_ARGUMENT_UNEXPECTED_BLOCK);
865
890
  }
866
891
 
892
+ /******************************************************************************/
893
+ /* Basic character checks */
894
+ /******************************************************************************/
895
+
896
+ /**
897
+ * This function is used extremely frequently to lex all of the identifiers in a
898
+ * source file, so it's important that it be as fast as possible. For this
899
+ * reason we have the encoding_changed boolean to check if we need to go through
900
+ * the function pointer or can just directly use the UTF-8 functions.
901
+ */
902
+ static inline size_t
903
+ char_is_identifier_start(const pm_parser_t *parser, const uint8_t *b) {
904
+ if (parser->encoding_changed) {
905
+ size_t width;
906
+ if ((width = parser->encoding->alpha_char(b, parser->end - b)) != 0) {
907
+ return width;
908
+ } else if (*b == '_') {
909
+ return 1;
910
+ } else if (*b >= 0x80) {
911
+ return parser->encoding->char_width(b, parser->end - b);
912
+ } else {
913
+ return 0;
914
+ }
915
+ } else if (*b < 0x80) {
916
+ return (pm_encoding_unicode_table[*b] & PRISM_ENCODING_ALPHABETIC_BIT ? 1 : 0) || (*b == '_');
917
+ } else {
918
+ return pm_encoding_utf_8_char_width(b, parser->end - b);
919
+ }
920
+ }
921
+
922
+ /**
923
+ * Similar to char_is_identifier but this function assumes that the encoding
924
+ * has not been changed.
925
+ */
926
+ static inline size_t
927
+ char_is_identifier_utf8(const uint8_t *b, const uint8_t *end) {
928
+ if (*b < 0x80) {
929
+ return (*b == '_') || (pm_encoding_unicode_table[*b] & PRISM_ENCODING_ALPHANUMERIC_BIT ? 1 : 0);
930
+ } else {
931
+ return pm_encoding_utf_8_char_width(b, end - b);
932
+ }
933
+ }
934
+
935
+ /**
936
+ * Like the above, this function is also used extremely frequently to lex all of
937
+ * the identifiers in a source file once the first character has been found. So
938
+ * it's important that it be as fast as possible.
939
+ */
940
+ static inline size_t
941
+ char_is_identifier(pm_parser_t *parser, const uint8_t *b) {
942
+ if (parser->encoding_changed) {
943
+ size_t width;
944
+ if ((width = parser->encoding->alnum_char(b, parser->end - b)) != 0) {
945
+ return width;
946
+ } else if (*b == '_') {
947
+ return 1;
948
+ } else if (*b >= 0x80) {
949
+ return parser->encoding->char_width(b, parser->end - b);
950
+ } else {
951
+ return 0;
952
+ }
953
+ }
954
+ return char_is_identifier_utf8(b, parser->end);
955
+ }
956
+
957
+ // Here we're defining a perfect hash for the characters that are allowed in
958
+ // global names. This is used to quickly check the next character after a $ to
959
+ // see if it's a valid character for a global name.
960
+ #define BIT(c, idx) (((c) / 32 - 1 == idx) ? (1U << ((c) % 32)) : 0)
961
+ #define PUNCT(idx) ( \
962
+ BIT('~', idx) | BIT('*', idx) | BIT('$', idx) | BIT('?', idx) | \
963
+ BIT('!', idx) | BIT('@', idx) | BIT('/', idx) | BIT('\\', idx) | \
964
+ BIT(';', idx) | BIT(',', idx) | BIT('.', idx) | BIT('=', idx) | \
965
+ BIT(':', idx) | BIT('<', idx) | BIT('>', idx) | BIT('\"', idx) | \
966
+ BIT('&', idx) | BIT('`', idx) | BIT('\'', idx) | BIT('+', idx) | \
967
+ BIT('0', idx))
968
+
969
+ const unsigned int pm_global_name_punctuation_hash[(0x7e - 0x20 + 31) / 32] = { PUNCT(0), PUNCT(1), PUNCT(2) };
970
+
971
+ #undef BIT
972
+ #undef PUNCT
973
+
974
+ static inline bool
975
+ char_is_global_name_punctuation(const uint8_t b) {
976
+ const unsigned int i = (const unsigned int) b;
977
+ if (i <= 0x20 || 0x7e < i) return false;
978
+
979
+ return (pm_global_name_punctuation_hash[(i - 0x20) / 32] >> (i % 32)) & 1;
980
+ }
981
+
982
+ static inline bool
983
+ token_is_setter_name(pm_token_t *token) {
984
+ return (
985
+ (token->type == PM_TOKEN_IDENTIFIER) &&
986
+ (token->end - token->start >= 2) &&
987
+ (token->end[-1] == '=')
988
+ );
989
+ }
990
+
867
991
  /******************************************************************************/
868
992
  /* Node flag handling functions */
869
993
  /******************************************************************************/
@@ -884,6 +1008,22 @@ pm_node_flag_unset(pm_node_t *node, pm_node_flags_t flag) {
884
1008
  node->flags &= (pm_node_flags_t) ~flag;
885
1009
  }
886
1010
 
1011
+ /**
1012
+ * Set the repeated parameter flag on the given node.
1013
+ */
1014
+ static inline void
1015
+ pm_node_flag_set_repeated_parameter(pm_node_t *node) {
1016
+ assert(PM_NODE_TYPE(node) == PM_BLOCK_LOCAL_VARIABLE_NODE ||
1017
+ PM_NODE_TYPE(node) == PM_BLOCK_PARAMETER_NODE ||
1018
+ PM_NODE_TYPE(node) == PM_KEYWORD_REST_PARAMETER_NODE ||
1019
+ PM_NODE_TYPE(node) == PM_OPTIONAL_KEYWORD_PARAMETER_NODE ||
1020
+ PM_NODE_TYPE(node) == PM_OPTIONAL_PARAMETER_NODE ||
1021
+ PM_NODE_TYPE(node) == PM_REQUIRED_KEYWORD_PARAMETER_NODE ||
1022
+ PM_NODE_TYPE(node) == PM_REQUIRED_PARAMETER_NODE ||
1023
+ PM_NODE_TYPE(node) == PM_REST_PARAMETER_NODE);
1024
+
1025
+ pm_node_flag_set(node, PM_PARAMETER_FLAGS_REPEATED_PARAMETER);
1026
+ }
887
1027
 
888
1028
  /******************************************************************************/
889
1029
  /* Node creation functions */
@@ -977,7 +1117,7 @@ static inline void *
977
1117
  pm_alloc_node(PRISM_ATTRIBUTE_UNUSED pm_parser_t *parser, size_t size) {
978
1118
  void *memory = calloc(1, size);
979
1119
  if (memory == NULL) {
980
- fprintf(stderr, "Failed to allocate %zu bytes\n", size);
1120
+ fprintf(stderr, "Failed to allocate %d bytes\n", (int) size);
981
1121
  abort();
982
1122
  }
983
1123
  return memory;
@@ -1325,7 +1465,7 @@ pm_assoc_node_create(pm_parser_t *parser, pm_node_t *key, const pm_token_t *oper
1325
1465
  pm_assoc_node_t *node = PM_ALLOC_NODE(parser, pm_assoc_node_t);
1326
1466
  const uint8_t *end;
1327
1467
 
1328
- if (value != NULL) {
1468
+ if (value != NULL && value->location.end > key->location.end) {
1329
1469
  end = value->location.end;
1330
1470
  } else if (operator->type != PM_TOKEN_NOT_PROVIDED) {
1331
1471
  end = operator->end;
@@ -1333,6 +1473,13 @@ pm_assoc_node_create(pm_parser_t *parser, pm_node_t *key, const pm_token_t *oper
1333
1473
  end = key->location.end;
1334
1474
  }
1335
1475
 
1476
+ // Hash string keys will be frozen, so we can mark them as frozen here so
1477
+ // that the compiler picks them up and also when we check for static literal
1478
+ // on the keys it gets factored in.
1479
+ if (PM_NODE_TYPE_P(key, PM_STRING_NODE)) {
1480
+ key->flags |= PM_STRING_FLAGS_FROZEN | PM_NODE_FLAG_STATIC_LITERAL;
1481
+ }
1482
+
1336
1483
  // If the key and value of this assoc node are both static literals, then
1337
1484
  // we can mark this node as a static literal.
1338
1485
  pm_node_flags_t flags = 0;
@@ -1490,7 +1637,7 @@ pm_block_argument_node_create(pm_parser_t *parser, const pm_token_t *operator, p
1490
1637
  * Allocate and initialize a new BlockNode node.
1491
1638
  */
1492
1639
  static pm_block_node_t *
1493
- pm_block_node_create(pm_parser_t *parser, pm_constant_id_list_t *locals, uint32_t locals_body_index, const pm_token_t *opening, pm_node_t *parameters, pm_node_t *body, const pm_token_t *closing) {
1640
+ pm_block_node_create(pm_parser_t *parser, pm_constant_id_list_t *locals, const pm_token_t *opening, pm_node_t *parameters, pm_node_t *body, const pm_token_t *closing) {
1494
1641
  pm_block_node_t *node = PM_ALLOC_NODE(parser, pm_block_node_t);
1495
1642
 
1496
1643
  *node = (pm_block_node_t) {
@@ -1499,7 +1646,6 @@ pm_block_node_create(pm_parser_t *parser, pm_constant_id_list_t *locals, uint32_
1499
1646
  .location = { .start = opening->start, .end = closing->end },
1500
1647
  },
1501
1648
  .locals = *locals,
1502
- .locals_body_index = locals_body_index,
1503
1649
  .parameters = parameters,
1504
1650
  .body = body,
1505
1651
  .opening_loc = PM_LOCATION_TOKEN_VALUE(opening),
@@ -1645,12 +1791,13 @@ pm_break_node_create(pm_parser_t *parser, const pm_token_t *keyword, pm_argument
1645
1791
  * in the various specializations of this function.
1646
1792
  */
1647
1793
  static pm_call_node_t *
1648
- pm_call_node_create(pm_parser_t *parser) {
1794
+ pm_call_node_create(pm_parser_t *parser, pm_node_flags_t flags) {
1649
1795
  pm_call_node_t *node = PM_ALLOC_NODE(parser, pm_call_node_t);
1650
1796
 
1651
1797
  *node = (pm_call_node_t) {
1652
1798
  {
1653
1799
  .type = PM_CALL_NODE,
1800
+ .flags = flags,
1654
1801
  .location = PM_LOCATION_NULL_VALUE(parser),
1655
1802
  },
1656
1803
  .receiver = NULL,
@@ -1666,6 +1813,15 @@ pm_call_node_create(pm_parser_t *parser) {
1666
1813
  return node;
1667
1814
  }
1668
1815
 
1816
+ /**
1817
+ * Returns the value that the ignore visibility flag should be set to for the
1818
+ * given receiver.
1819
+ */
1820
+ static inline pm_node_flags_t
1821
+ pm_call_node_ignore_visibility_flag(const pm_node_t *receiver) {
1822
+ return PM_NODE_TYPE_P(receiver, PM_SELF_NODE) ? PM_CALL_NODE_FLAGS_IGNORE_VISIBILITY : 0;
1823
+ }
1824
+
1669
1825
  /**
1670
1826
  * Allocate and initialize a new CallNode node from an aref or an aset
1671
1827
  * expression.
@@ -1674,7 +1830,7 @@ static pm_call_node_t *
1674
1830
  pm_call_node_aref_create(pm_parser_t *parser, pm_node_t *receiver, pm_arguments_t *arguments) {
1675
1831
  pm_assert_value_expression(parser, receiver);
1676
1832
 
1677
- pm_call_node_t *node = pm_call_node_create(parser);
1833
+ pm_call_node_t *node = pm_call_node_create(parser, pm_call_node_ignore_visibility_flag(receiver));
1678
1834
 
1679
1835
  node->base.location.start = receiver->location.start;
1680
1836
  node->base.location.end = pm_arguments_end(arguments);
@@ -1700,7 +1856,7 @@ pm_call_node_binary_create(pm_parser_t *parser, pm_node_t *receiver, pm_token_t
1700
1856
  pm_assert_value_expression(parser, receiver);
1701
1857
  pm_assert_value_expression(parser, argument);
1702
1858
 
1703
- pm_call_node_t *node = pm_call_node_create(parser);
1859
+ pm_call_node_t *node = pm_call_node_create(parser, pm_call_node_ignore_visibility_flag(receiver));
1704
1860
 
1705
1861
  node->base.location.start = MIN(receiver->location.start, argument->location.start);
1706
1862
  node->base.location.end = MAX(receiver->location.end, argument->location.end);
@@ -1723,7 +1879,7 @@ static pm_call_node_t *
1723
1879
  pm_call_node_call_create(pm_parser_t *parser, pm_node_t *receiver, pm_token_t *operator, pm_token_t *message, pm_arguments_t *arguments) {
1724
1880
  pm_assert_value_expression(parser, receiver);
1725
1881
 
1726
- pm_call_node_t *node = pm_call_node_create(parser);
1882
+ pm_call_node_t *node = pm_call_node_create(parser, pm_call_node_ignore_visibility_flag(receiver));
1727
1883
 
1728
1884
  node->base.location.start = receiver->location.start;
1729
1885
  const uint8_t *end = pm_arguments_end(arguments);
@@ -1754,7 +1910,7 @@ pm_call_node_call_create(pm_parser_t *parser, pm_node_t *receiver, pm_token_t *o
1754
1910
  */
1755
1911
  static pm_call_node_t *
1756
1912
  pm_call_node_fcall_create(pm_parser_t *parser, pm_token_t *message, pm_arguments_t *arguments) {
1757
- pm_call_node_t *node = pm_call_node_create(parser);
1913
+ pm_call_node_t *node = pm_call_node_create(parser, PM_CALL_NODE_FLAGS_IGNORE_VISIBILITY);
1758
1914
 
1759
1915
  node->base.location.start = message->start;
1760
1916
  node->base.location.end = pm_arguments_end(arguments);
@@ -1776,7 +1932,7 @@ static pm_call_node_t *
1776
1932
  pm_call_node_not_create(pm_parser_t *parser, pm_node_t *receiver, pm_token_t *message, pm_arguments_t *arguments) {
1777
1933
  pm_assert_value_expression(parser, receiver);
1778
1934
 
1779
- pm_call_node_t *node = pm_call_node_create(parser);
1935
+ pm_call_node_t *node = pm_call_node_create(parser, receiver == NULL ? 0 : pm_call_node_ignore_visibility_flag(receiver));
1780
1936
 
1781
1937
  node->base.location.start = message->start;
1782
1938
  if (arguments->closing_loc.start != NULL) {
@@ -1802,7 +1958,7 @@ static pm_call_node_t *
1802
1958
  pm_call_node_shorthand_create(pm_parser_t *parser, pm_node_t *receiver, pm_token_t *operator, pm_arguments_t *arguments) {
1803
1959
  pm_assert_value_expression(parser, receiver);
1804
1960
 
1805
- pm_call_node_t *node = pm_call_node_create(parser);
1961
+ pm_call_node_t *node = pm_call_node_create(parser, pm_call_node_ignore_visibility_flag(receiver));
1806
1962
 
1807
1963
  node->base.location.start = receiver->location.start;
1808
1964
  node->base.location.end = pm_arguments_end(arguments);
@@ -1829,7 +1985,7 @@ static pm_call_node_t *
1829
1985
  pm_call_node_unary_create(pm_parser_t *parser, pm_token_t *operator, pm_node_t *receiver, const char *name) {
1830
1986
  pm_assert_value_expression(parser, receiver);
1831
1987
 
1832
- pm_call_node_t *node = pm_call_node_create(parser);
1988
+ pm_call_node_t *node = pm_call_node_create(parser, pm_call_node_ignore_visibility_flag(receiver));
1833
1989
 
1834
1990
  node->base.location.start = operator->start;
1835
1991
  node->base.location.end = receiver->location.end;
@@ -1847,7 +2003,7 @@ pm_call_node_unary_create(pm_parser_t *parser, pm_token_t *operator, pm_node_t *
1847
2003
  */
1848
2004
  static pm_call_node_t *
1849
2005
  pm_call_node_variable_call_create(pm_parser_t *parser, pm_token_t *message) {
1850
- pm_call_node_t *node = pm_call_node_create(parser);
2006
+ pm_call_node_t *node = pm_call_node_create(parser, PM_CALL_NODE_FLAGS_IGNORE_VISIBILITY);
1851
2007
 
1852
2008
  node->base.location = PM_LOCATION_TOKEN_VALUE(message);
1853
2009
  node->message_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(message);
@@ -1885,11 +2041,12 @@ pm_call_node_index_p(pm_call_node_t *node) {
1885
2041
  * operator assignment.
1886
2042
  */
1887
2043
  static inline bool
1888
- pm_call_node_writable_p(pm_call_node_t *node) {
2044
+ pm_call_node_writable_p(const pm_parser_t *parser, const pm_call_node_t *node) {
1889
2045
  return (
1890
2046
  (node->message_loc.start != NULL) &&
1891
2047
  (node->message_loc.end[-1] != '!') &&
1892
2048
  (node->message_loc.end[-1] != '?') &&
2049
+ char_is_identifier_start(parser, node->message_loc.start) &&
1893
2050
  (node->opening_loc.start == NULL) &&
1894
2051
  (node->arguments == NULL) &&
1895
2052
  (node->block == NULL)
@@ -2167,11 +2324,12 @@ pm_call_target_node_create(pm_parser_t *parser, pm_call_node_t *target) {
2167
2324
  static pm_index_target_node_t *
2168
2325
  pm_index_target_node_create(pm_parser_t *parser, pm_call_node_t *target) {
2169
2326
  pm_index_target_node_t *node = PM_ALLOC_NODE(parser, pm_index_target_node_t);
2327
+ pm_node_flags_t flags = target->base.flags;
2170
2328
 
2171
2329
  *node = (pm_index_target_node_t) {
2172
2330
  {
2173
2331
  .type = PM_INDEX_TARGET_NODE,
2174
- .flags = target->base.flags,
2332
+ .flags = flags | PM_CALL_NODE_FLAGS_ATTRIBUTE_WRITE,
2175
2333
  .location = target->base.location
2176
2334
  },
2177
2335
  .receiver = target->receiver,
@@ -2701,18 +2859,62 @@ pm_constant_write_node_create(pm_parser_t *parser, pm_constant_read_node_t *targ
2701
2859
  return node;
2702
2860
  }
2703
2861
 
2862
+ /**
2863
+ * Check if the receiver of a `def` node is allowed.
2864
+ */
2865
+ static void
2866
+ pm_def_node_receiver_check(pm_parser_t *parser, const pm_node_t *node) {
2867
+ switch (PM_NODE_TYPE(node)) {
2868
+ case PM_BEGIN_NODE: {
2869
+ const pm_begin_node_t *cast = (pm_begin_node_t *) node;
2870
+ if (cast->statements != NULL) pm_def_node_receiver_check(parser, (pm_node_t *) cast->statements);
2871
+ break;
2872
+ }
2873
+ case PM_PARENTHESES_NODE: {
2874
+ const pm_parentheses_node_t *cast = (const pm_parentheses_node_t *) node;
2875
+ if (cast->body != NULL) pm_def_node_receiver_check(parser, cast->body);
2876
+ break;
2877
+ }
2878
+ case PM_STATEMENTS_NODE: {
2879
+ const pm_statements_node_t *cast = (const pm_statements_node_t *) node;
2880
+ pm_def_node_receiver_check(parser, cast->body.nodes[cast->body.size - 1]);
2881
+ break;
2882
+ }
2883
+ case PM_ARRAY_NODE:
2884
+ case PM_FLOAT_NODE:
2885
+ case PM_IMAGINARY_NODE:
2886
+ case PM_INTEGER_NODE:
2887
+ case PM_INTERPOLATED_REGULAR_EXPRESSION_NODE:
2888
+ case PM_INTERPOLATED_STRING_NODE:
2889
+ case PM_INTERPOLATED_SYMBOL_NODE:
2890
+ case PM_INTERPOLATED_X_STRING_NODE:
2891
+ case PM_RATIONAL_NODE:
2892
+ case PM_REGULAR_EXPRESSION_NODE:
2893
+ case PM_SOURCE_ENCODING_NODE:
2894
+ case PM_SOURCE_FILE_NODE:
2895
+ case PM_SOURCE_LINE_NODE:
2896
+ case PM_STRING_NODE:
2897
+ case PM_SYMBOL_NODE:
2898
+ case PM_X_STRING_NODE:
2899
+ pm_parser_err_node(parser, node, PM_ERR_SINGLETON_FOR_LITERALS);
2900
+ break;
2901
+ default:
2902
+ break;
2903
+ }
2904
+ }
2905
+
2704
2906
  /**
2705
2907
  * Allocate and initialize a new DefNode node.
2706
2908
  */
2707
2909
  static pm_def_node_t *
2708
2910
  pm_def_node_create(
2709
2911
  pm_parser_t *parser,
2710
- const pm_token_t *name,
2912
+ pm_constant_id_t name,
2913
+ const pm_token_t *name_loc,
2711
2914
  pm_node_t *receiver,
2712
2915
  pm_parameters_node_t *parameters,
2713
2916
  pm_node_t *body,
2714
2917
  pm_constant_id_list_t *locals,
2715
- uint32_t locals_body_index,
2716
2918
  const pm_token_t *def_keyword,
2717
2919
  const pm_token_t *operator,
2718
2920
  const pm_token_t *lparen,
@@ -2729,18 +2931,21 @@ pm_def_node_create(
2729
2931
  end = end_keyword->end;
2730
2932
  }
2731
2933
 
2934
+ if ((receiver != NULL) && PM_NODE_TYPE_P(receiver, PM_PARENTHESES_NODE)) {
2935
+ pm_def_node_receiver_check(parser, receiver);
2936
+ }
2937
+
2732
2938
  *node = (pm_def_node_t) {
2733
2939
  {
2734
2940
  .type = PM_DEF_NODE,
2735
2941
  .location = { .start = def_keyword->start, .end = end },
2736
2942
  },
2737
- .name = pm_parser_constant_id_token(parser, name),
2738
- .name_loc = PM_LOCATION_TOKEN_VALUE(name),
2943
+ .name = name,
2944
+ .name_loc = PM_LOCATION_TOKEN_VALUE(name_loc),
2739
2945
  .receiver = receiver,
2740
2946
  .parameters = parameters,
2741
2947
  .body = body,
2742
2948
  .locals = *locals,
2743
- .locals_body_index = locals_body_index,
2744
2949
  .def_keyword_loc = PM_LOCATION_TOKEN_VALUE(def_keyword),
2745
2950
  .operator_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(operator),
2746
2951
  .lparen_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(lparen),
@@ -3962,9 +4167,8 @@ pm_keyword_hash_node_create(pm_parser_t *parser) {
3962
4167
  */
3963
4168
  static void
3964
4169
  pm_keyword_hash_node_elements_append(pm_keyword_hash_node_t *hash, pm_node_t *element) {
3965
- // If the element being added is not an AssocNode or does not have a symbol key, then
3966
- // we want to turn the STATIC_KEYS flag off.
3967
- // TODO: Rename the flag to SYMBOL_KEYS instead.
4170
+ // If the element being added is not an AssocNode or does not have a symbol
4171
+ // key, then we want to turn the SYMBOL_KEYS flag off.
3968
4172
  if (!PM_NODE_TYPE_P(element, PM_ASSOC_NODE) || !PM_NODE_TYPE_P(((pm_assoc_node_t *) element)->key, PM_SYMBOL_NODE)) {
3969
4173
  pm_node_flag_unset((pm_node_t *)hash, PM_KEYWORD_HASH_NODE_FLAGS_SYMBOL_KEYS);
3970
4174
  }
@@ -4051,7 +4255,6 @@ static pm_lambda_node_t *
4051
4255
  pm_lambda_node_create(
4052
4256
  pm_parser_t *parser,
4053
4257
  pm_constant_id_list_t *locals,
4054
- uint32_t locals_body_index,
4055
4258
  const pm_token_t *operator,
4056
4259
  const pm_token_t *opening,
4057
4260
  const pm_token_t *closing,
@@ -4069,7 +4272,6 @@ pm_lambda_node_create(
4069
4272
  },
4070
4273
  },
4071
4274
  .locals = *locals,
4072
- .locals_body_index = locals_body_index,
4073
4275
  .operator_loc = PM_LOCATION_TOKEN_VALUE(operator),
4074
4276
  .opening_loc = PM_LOCATION_TOKEN_VALUE(opening),
4075
4277
  .closing_loc = PM_LOCATION_TOKEN_VALUE(closing),
@@ -4161,12 +4363,10 @@ pm_local_variable_or_write_node_create(pm_parser_t *parser, pm_node_t *target, c
4161
4363
  }
4162
4364
 
4163
4365
  /**
4164
- * Allocate a new LocalVariableReadNode node.
4366
+ * Allocate a new LocalVariableReadNode node with constant_id.
4165
4367
  */
4166
4368
  static pm_local_variable_read_node_t *
4167
- pm_local_variable_read_node_create(pm_parser_t *parser, const pm_token_t *name, uint32_t depth) {
4168
- pm_constant_id_t name_id = pm_parser_constant_id_token(parser, name);
4169
-
4369
+ pm_local_variable_read_node_create_constant_id(pm_parser_t *parser, const pm_token_t *name, pm_constant_id_t name_id, uint32_t depth) {
4170
4370
  if (parser->current_param_name == name_id) {
4171
4371
  pm_parser_err_token(parser, name, PM_ERR_PARAMETER_CIRCULAR);
4172
4372
  }
@@ -4185,6 +4385,15 @@ pm_local_variable_read_node_create(pm_parser_t *parser, const pm_token_t *name,
4185
4385
  return node;
4186
4386
  }
4187
4387
 
4388
+ /**
4389
+ * Allocate a new LocalVariableReadNode node.
4390
+ */
4391
+ static pm_local_variable_read_node_t *
4392
+ pm_local_variable_read_node_create(pm_parser_t *parser, const pm_token_t *name, uint32_t depth) {
4393
+ pm_constant_id_t name_id = pm_parser_constant_id_token(parser, name);
4394
+ return pm_local_variable_read_node_create_constant_id(parser, name, name_id, depth);
4395
+ }
4396
+
4188
4397
  /**
4189
4398
  * Allocate and initialize a new LocalVariableWriteNode node.
4190
4399
  */
@@ -4210,6 +4419,57 @@ pm_local_variable_write_node_create(pm_parser_t *parser, pm_constant_id_t name,
4210
4419
  return node;
4211
4420
  }
4212
4421
 
4422
+ /**
4423
+ * Returns true if the given bounds comprise `it`.
4424
+ */
4425
+ static inline bool
4426
+ pm_token_is_it(const uint8_t *start, const uint8_t *end) {
4427
+ return (end - start == 2) && (start[0] == 'i') && (start[1] == 't');
4428
+ }
4429
+
4430
+ /**
4431
+ * Returns true if the given node is `it` default parameter.
4432
+ */
4433
+ static inline bool
4434
+ pm_node_is_it(pm_parser_t *parser, pm_node_t *node) {
4435
+ // Check if it's a local variable reference
4436
+ if (node->type != PM_CALL_NODE) {
4437
+ return false;
4438
+ }
4439
+
4440
+ // Check if it's a variable call
4441
+ pm_call_node_t *call_node = (pm_call_node_t *) node;
4442
+ if (!pm_call_node_variable_call_p(call_node)) {
4443
+ return false;
4444
+ }
4445
+
4446
+ // Check if it's called `it`
4447
+ pm_constant_id_t id = ((pm_call_node_t *)node)->name;
4448
+ pm_constant_t *constant = pm_constant_pool_id_to_constant(&parser->constant_pool, id);
4449
+ return pm_token_is_it(constant->start, constant->start + constant->length);
4450
+ }
4451
+
4452
+ /**
4453
+ * Convert a `it` variable call node to a node for `it` default parameter.
4454
+ */
4455
+ static pm_node_t *
4456
+ pm_node_check_it(pm_parser_t *parser, pm_node_t *node) {
4457
+ if (
4458
+ (parser->version != PM_OPTIONS_VERSION_CRUBY_3_3_0) &&
4459
+ !parser->current_scope->closed &&
4460
+ pm_node_is_it(parser, node)
4461
+ ) {
4462
+ if (parser->current_scope->explicit_params) {
4463
+ pm_parser_err_previous(parser, PM_ERR_IT_NOT_ALLOWED);
4464
+ } else {
4465
+ pm_node_destroy(parser, node);
4466
+ pm_constant_id_t name_id = pm_parser_constant_id_constant(parser, "0it", 3);
4467
+ node = (pm_node_t *) pm_local_variable_read_node_create_constant_id(parser, &parser->previous, name_id, 0);
4468
+ }
4469
+ }
4470
+ return node;
4471
+ }
4472
+
4213
4473
  /**
4214
4474
  * Returns true if the given bounds comprise a numbered parameter (i.e., they
4215
4475
  * are of the form /^_\d$/).
@@ -4402,13 +4662,20 @@ pm_multi_target_node_create(pm_parser_t *parser) {
4402
4662
  */
4403
4663
  static void
4404
4664
  pm_multi_target_node_targets_append(pm_parser_t *parser, pm_multi_target_node_t *node, pm_node_t *target) {
4405
- if (PM_NODE_TYPE_P(target, PM_SPLAT_NODE) || PM_NODE_TYPE_P(target, PM_IMPLICIT_REST_NODE)) {
4665
+ if (PM_NODE_TYPE_P(target, PM_SPLAT_NODE)) {
4406
4666
  if (node->rest == NULL) {
4407
4667
  node->rest = target;
4408
4668
  } else {
4409
4669
  pm_parser_err_node(parser, target, PM_ERR_MULTI_ASSIGN_MULTI_SPLATS);
4410
4670
  pm_node_list_append(&node->rights, target);
4411
4671
  }
4672
+ } else if (PM_NODE_TYPE_P(target, PM_IMPLICIT_REST_NODE)) {
4673
+ if (node->rest == NULL) {
4674
+ node->rest = target;
4675
+ } else {
4676
+ PM_PARSER_ERR_TOKEN_FORMAT_CONTENT(parser, parser->current, PM_ERR_MULTI_ASSIGN_UNEXPECTED_REST);
4677
+ pm_node_list_append(&node->rights, target);
4678
+ }
4412
4679
  } else if (node->rest == NULL) {
4413
4680
  pm_node_list_append(&node->lefts, target);
4414
4681
  } else {
@@ -5195,7 +5462,7 @@ pm_source_file_node_create(pm_parser_t *parser, const pm_token_t *file_keyword)
5195
5462
  .flags = PM_NODE_FLAG_STATIC_LITERAL,
5196
5463
  .location = PM_LOCATION_TOKEN_VALUE(file_keyword),
5197
5464
  },
5198
- .filepath = parser->filepath_string,
5465
+ .filepath = parser->filepath
5199
5466
  };
5200
5467
 
5201
5468
  return node;
@@ -5372,18 +5639,59 @@ pm_super_node_create(pm_parser_t *parser, const pm_token_t *keyword, pm_argument
5372
5639
  return node;
5373
5640
  }
5374
5641
 
5642
+ /**
5643
+ * Read through the contents of a string and check if it consists solely of US ASCII code points.
5644
+ */
5645
+ static bool
5646
+ pm_ascii_only_p(const pm_string_t *contents) {
5647
+ const size_t length = pm_string_length(contents);
5648
+ const uint8_t *source = pm_string_source(contents);
5649
+
5650
+ for (size_t index = 0; index < length; index++) {
5651
+ if (source[index] & 0x80) return false;
5652
+ }
5653
+
5654
+ return true;
5655
+ }
5656
+
5657
+ /**
5658
+ * Ruby "downgrades" the encoding of Symbols to US-ASCII if the associated
5659
+ * encoding is ASCII-compatible and the Symbol consists only of US-ASCII code
5660
+ * points. Otherwise, the encoding may be explicitly set with an escape
5661
+ * sequence.
5662
+ */
5663
+ static inline pm_node_flags_t
5664
+ parse_symbol_encoding(const pm_parser_t *parser, const pm_string_t *contents) {
5665
+ if (parser->explicit_encoding != NULL) {
5666
+ // A Symbol may optionally have its encoding explicitly set. This will
5667
+ // happen if an escape sequence results in a non-ASCII code point.
5668
+ if (parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) {
5669
+ return PM_SYMBOL_FLAGS_FORCED_UTF8_ENCODING;
5670
+ } else if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY) {
5671
+ return PM_SYMBOL_FLAGS_FORCED_BINARY_ENCODING;
5672
+ }
5673
+ } else if (pm_ascii_only_p(contents)) {
5674
+ // Ruby stipulates that all source files must use an ASCII-compatible
5675
+ // encoding. Thus, all symbols appearing in source are eligible for
5676
+ // "downgrading" to US-ASCII.
5677
+ return PM_SYMBOL_FLAGS_FORCED_US_ASCII_ENCODING;
5678
+ }
5679
+
5680
+ return 0;
5681
+ }
5682
+
5375
5683
  /**
5376
5684
  * Allocate and initialize a new SymbolNode node with the given unescaped
5377
5685
  * string.
5378
5686
  */
5379
5687
  static pm_symbol_node_t *
5380
- pm_symbol_node_create_unescaped(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *value, const pm_token_t *closing, const pm_string_t *unescaped) {
5688
+ pm_symbol_node_create_unescaped(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *value, const pm_token_t *closing, const pm_string_t *unescaped, pm_node_flags_t flags) {
5381
5689
  pm_symbol_node_t *node = PM_ALLOC_NODE(parser, pm_symbol_node_t);
5382
5690
 
5383
5691
  *node = (pm_symbol_node_t) {
5384
5692
  {
5385
5693
  .type = PM_SYMBOL_NODE,
5386
- .flags = PM_NODE_FLAG_STATIC_LITERAL,
5694
+ .flags = PM_NODE_FLAG_STATIC_LITERAL | flags,
5387
5695
  .location = {
5388
5696
  .start = (opening->type == PM_TOKEN_NOT_PROVIDED ? value->start : opening->start),
5389
5697
  .end = (closing->type == PM_TOKEN_NOT_PROVIDED ? value->end : closing->end)
@@ -5403,7 +5711,7 @@ pm_symbol_node_create_unescaped(pm_parser_t *parser, const pm_token_t *opening,
5403
5711
  */
5404
5712
  static inline pm_symbol_node_t *
5405
5713
  pm_symbol_node_create(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *value, const pm_token_t *closing) {
5406
- return pm_symbol_node_create_unescaped(parser, opening, value, closing, &PM_STRING_EMPTY);
5714
+ return pm_symbol_node_create_unescaped(parser, opening, value, closing, &PM_STRING_EMPTY, 0);
5407
5715
  }
5408
5716
 
5409
5717
  /**
@@ -5411,7 +5719,7 @@ pm_symbol_node_create(pm_parser_t *parser, const pm_token_t *opening, const pm_t
5411
5719
  */
5412
5720
  static pm_symbol_node_t *
5413
5721
  pm_symbol_node_create_current_string(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *value, const pm_token_t *closing) {
5414
- pm_symbol_node_t *node = pm_symbol_node_create_unescaped(parser, opening, value, closing, &parser->current_string);
5722
+ pm_symbol_node_t *node = pm_symbol_node_create_unescaped(parser, opening, value, closing, &parser->current_string, parse_symbol_encoding(parser, &parser->current_string));
5415
5723
  parser->current_string = PM_STRING_EMPTY;
5416
5724
  return node;
5417
5725
  }
@@ -5433,6 +5741,8 @@ pm_symbol_node_label_create(pm_parser_t *parser, const pm_token_t *token) {
5433
5741
 
5434
5742
  assert((label.end - label.start) >= 0);
5435
5743
  pm_string_shared_init(&node->unescaped, label.start, label.end);
5744
+ pm_node_flag_set((pm_node_t *) node, parse_symbol_encoding(parser, &node->unescaped));
5745
+
5436
5746
  break;
5437
5747
  }
5438
5748
  case PM_TOKEN_MISSING: {
@@ -5495,6 +5805,8 @@ pm_string_node_to_symbol_node(pm_parser_t *parser, pm_string_node_t *node, const
5495
5805
  .unescaped = node->unescaped
5496
5806
  };
5497
5807
 
5808
+ pm_node_flag_set((pm_node_t *)new_node, parse_symbol_encoding(parser, &node->unescaped));
5809
+
5498
5810
  // We are explicitly _not_ using pm_node_destroy here because we don't want
5499
5811
  // to trash the unescaped string. We could instead copy the string if we
5500
5812
  // know that it is owned, but we're taking the fast path for now.
@@ -5885,6 +6197,7 @@ pm_parser_scope_push(pm_parser_t *parser, bool closed) {
5885
6197
  .closed = closed,
5886
6198
  .explicit_params = false,
5887
6199
  .numbered_parameters = 0,
6200
+ .forwarding_params = 0,
5888
6201
  };
5889
6202
 
5890
6203
  pm_constant_id_list_init(&scope->locals);
@@ -5893,6 +6206,76 @@ pm_parser_scope_push(pm_parser_t *parser, bool closed) {
5893
6206
  return true;
5894
6207
  }
5895
6208
 
6209
+ static void
6210
+ pm_parser_scope_forwarding_param_check(pm_parser_t *parser, const pm_token_t * token, const uint8_t mask, pm_diagnostic_id_t diag)
6211
+ {
6212
+ pm_scope_t *scope = parser->current_scope;
6213
+ while (scope) {
6214
+ if (scope->forwarding_params & mask) {
6215
+ if (!scope->closed) {
6216
+ pm_parser_err_token(parser, token, diag);
6217
+ return;
6218
+ }
6219
+ return;
6220
+ }
6221
+ if (scope->closed) break;
6222
+ scope = scope->previous;
6223
+ }
6224
+
6225
+ pm_parser_err_token(parser, token, diag);
6226
+ }
6227
+
6228
+ static inline void
6229
+ pm_parser_scope_forwarding_block_check(pm_parser_t *parser, const pm_token_t * token)
6230
+ {
6231
+ pm_parser_scope_forwarding_param_check(parser, token, PM_FORWARDING_BLOCK, PM_ERR_ARGUMENT_NO_FORWARDING_AMP);
6232
+ }
6233
+
6234
+ static void
6235
+ pm_parser_scope_forwarding_positionals_check(pm_parser_t *parser, const pm_token_t * token)
6236
+ {
6237
+ pm_parser_scope_forwarding_param_check(parser, token, PM_FORWARDING_POSITIONALS, PM_ERR_ARGUMENT_NO_FORWARDING_STAR);
6238
+ }
6239
+
6240
+ static inline void
6241
+ pm_parser_scope_forwarding_all_check(pm_parser_t *parser, const pm_token_t * token)
6242
+ {
6243
+ pm_parser_scope_forwarding_param_check(parser, token, PM_FORWARDING_ALL, PM_ERR_ARGUMENT_NO_FORWARDING_ELLIPSES);
6244
+ }
6245
+
6246
+ static inline void
6247
+ pm_parser_scope_forwarding_keywords_check(pm_parser_t *parser, const pm_token_t * token)
6248
+ {
6249
+ pm_parser_scope_forwarding_param_check(parser, token, PM_FORWARDING_KEYWORDS, PM_ERR_EXPECT_EXPRESSION_AFTER_SPLAT_HASH);
6250
+ }
6251
+
6252
+ /**
6253
+ * Save the current param name as the return value and set it to the given
6254
+ * constant id.
6255
+ */
6256
+ static inline pm_constant_id_t
6257
+ pm_parser_current_param_name_set(pm_parser_t *parser, pm_constant_id_t current_param_name) {
6258
+ pm_constant_id_t saved_param_name = parser->current_param_name;
6259
+ parser->current_param_name = current_param_name;
6260
+ return saved_param_name;
6261
+ }
6262
+
6263
+ /**
6264
+ * Save the current param name as the return value and clear it.
6265
+ */
6266
+ static inline pm_constant_id_t
6267
+ pm_parser_current_param_name_unset(pm_parser_t *parser) {
6268
+ return pm_parser_current_param_name_set(parser, PM_CONSTANT_ID_UNSET);
6269
+ }
6270
+
6271
+ /**
6272
+ * Restore the current param name from the given value.
6273
+ */
6274
+ static inline void
6275
+ pm_parser_current_param_name_restore(pm_parser_t *parser, pm_constant_id_t saved_param_name) {
6276
+ parser->current_param_name = saved_param_name;
6277
+ }
6278
+
5896
6279
  /**
5897
6280
  * Check if any of the currently visible scopes contain a local variable
5898
6281
  * described by the given constant id.
@@ -5969,26 +6352,41 @@ pm_parser_local_add_owned(pm_parser_t *parser, const uint8_t *start, size_t leng
5969
6352
  return constant_id;
5970
6353
  }
5971
6354
 
6355
+ /**
6356
+ * Add a local variable from a constant string to the current scope.
6357
+ */
6358
+ static pm_constant_id_t
6359
+ pm_parser_local_add_constant(pm_parser_t *parser, const char *start, size_t length) {
6360
+ pm_constant_id_t constant_id = pm_parser_constant_id_constant(parser, start, length);
6361
+ if (constant_id != 0) pm_parser_local_add(parser, constant_id);
6362
+ return constant_id;
6363
+ }
6364
+
5972
6365
  /**
5973
6366
  * Add a parameter name to the current scope and check whether the name of the
5974
6367
  * parameter is unique or not.
6368
+ *
6369
+ * Returns `true` if this is a duplicate parameter name, otherwise returns
6370
+ * false.
5975
6371
  */
5976
- static void
6372
+ static bool
5977
6373
  pm_parser_parameter_name_check(pm_parser_t *parser, const pm_token_t *name) {
5978
6374
  // We want to check whether the parameter name is a numbered parameter or
5979
6375
  // not.
5980
6376
  pm_refute_numbered_parameter(parser, name->start, name->end);
5981
6377
 
5982
- // We want to ignore any parameter name that starts with an underscore.
5983
- if ((name->start < name->end) && (*name->start == '_')) return;
5984
-
5985
6378
  // Otherwise we'll fetch the constant id for the parameter name and check
5986
6379
  // whether it's already in the current scope.
5987
6380
  pm_constant_id_t constant_id = pm_parser_constant_id_token(parser, name);
5988
6381
 
5989
6382
  if (pm_constant_id_list_includes(&parser->current_scope->locals, constant_id)) {
5990
- pm_parser_err_token(parser, name, PM_ERR_PARAMETER_NAME_REPEAT);
6383
+ // Add an error if the parameter doesn't start with _ and has been seen before
6384
+ if ((name->start < name->end) && (*name->start != '_')) {
6385
+ pm_parser_err_token(parser, name, PM_ERR_PARAMETER_NAME_REPEAT);
6386
+ }
6387
+ return true;
5991
6388
  }
6389
+ return false;
5992
6390
  }
5993
6391
 
5994
6392
  /**
@@ -6003,105 +6401,6 @@ pm_parser_scope_pop(pm_parser_t *parser) {
6003
6401
  free(scope);
6004
6402
  }
6005
6403
 
6006
- /******************************************************************************/
6007
- /* Basic character checks */
6008
- /******************************************************************************/
6009
-
6010
- /**
6011
- * This function is used extremely frequently to lex all of the identifiers in a
6012
- * source file, so it's important that it be as fast as possible. For this
6013
- * reason we have the encoding_changed boolean to check if we need to go through
6014
- * the function pointer or can just directly use the UTF-8 functions.
6015
- */
6016
- static inline size_t
6017
- char_is_identifier_start(pm_parser_t *parser, const uint8_t *b) {
6018
- if (parser->encoding_changed) {
6019
- size_t width;
6020
- if ((width = parser->encoding->alpha_char(b, parser->end - b)) != 0) {
6021
- return width;
6022
- } else if (*b == '_') {
6023
- return 1;
6024
- } else if (*b >= 0x80) {
6025
- return parser->encoding->char_width(b, parser->end - b);
6026
- } else {
6027
- return 0;
6028
- }
6029
- } else if (*b < 0x80) {
6030
- return (pm_encoding_unicode_table[*b] & PRISM_ENCODING_ALPHABETIC_BIT ? 1 : 0) || (*b == '_');
6031
- } else {
6032
- return (size_t) (pm_encoding_utf_8_alpha_char(b, parser->end - b) || 1u);
6033
- }
6034
- }
6035
-
6036
- /**
6037
- * Similar to char_is_identifier but this function assumes that the encoding
6038
- * has not been changed.
6039
- */
6040
- static inline size_t
6041
- char_is_identifier_utf8(const uint8_t *b, const uint8_t *end) {
6042
- if (*b < 0x80) {
6043
- return (*b == '_') || (pm_encoding_unicode_table[*b] & PRISM_ENCODING_ALPHANUMERIC_BIT ? 1 : 0);
6044
- } else {
6045
- return (size_t) (pm_encoding_utf_8_alnum_char(b, end - b) || 1u);
6046
- }
6047
- }
6048
-
6049
- /**
6050
- * Like the above, this function is also used extremely frequently to lex all of
6051
- * the identifiers in a source file once the first character has been found. So
6052
- * it's important that it be as fast as possible.
6053
- */
6054
- static inline size_t
6055
- char_is_identifier(pm_parser_t *parser, const uint8_t *b) {
6056
- if (parser->encoding_changed) {
6057
- size_t width;
6058
- if ((width = parser->encoding->alnum_char(b, parser->end - b)) != 0) {
6059
- return width;
6060
- } else if (*b == '_') {
6061
- return 1;
6062
- } else if (*b >= 0x80) {
6063
- return parser->encoding->char_width(b, parser->end - b);
6064
- } else {
6065
- return 0;
6066
- }
6067
- }
6068
- return char_is_identifier_utf8(b, parser->end);
6069
- }
6070
-
6071
- // Here we're defining a perfect hash for the characters that are allowed in
6072
- // global names. This is used to quickly check the next character after a $ to
6073
- // see if it's a valid character for a global name.
6074
- #define BIT(c, idx) (((c) / 32 - 1 == idx) ? (1U << ((c) % 32)) : 0)
6075
- #define PUNCT(idx) ( \
6076
- BIT('~', idx) | BIT('*', idx) | BIT('$', idx) | BIT('?', idx) | \
6077
- BIT('!', idx) | BIT('@', idx) | BIT('/', idx) | BIT('\\', idx) | \
6078
- BIT(';', idx) | BIT(',', idx) | BIT('.', idx) | BIT('=', idx) | \
6079
- BIT(':', idx) | BIT('<', idx) | BIT('>', idx) | BIT('\"', idx) | \
6080
- BIT('&', idx) | BIT('`', idx) | BIT('\'', idx) | BIT('+', idx) | \
6081
- BIT('0', idx))
6082
-
6083
- const unsigned int pm_global_name_punctuation_hash[(0x7e - 0x20 + 31) / 32] = { PUNCT(0), PUNCT(1), PUNCT(2) };
6084
-
6085
- #undef BIT
6086
- #undef PUNCT
6087
-
6088
- static inline bool
6089
- char_is_global_name_punctuation(const uint8_t b) {
6090
- const unsigned int i = (const unsigned int) b;
6091
- if (i <= 0x20 || 0x7e < i) return false;
6092
-
6093
- return (pm_global_name_punctuation_hash[(i - 0x20) / 32] >> (i % 32)) & 1;
6094
- }
6095
-
6096
- static inline bool
6097
- token_is_setter_name(pm_token_t *token) {
6098
- return (
6099
- (token->type == PM_TOKEN_IDENTIFIER) &&
6100
- (token->end - token->start >= 2) &&
6101
- (token->end[-1] == '=')
6102
- );
6103
- }
6104
-
6105
6404
  /******************************************************************************/
6106
6405
  /* Stack helpers */
6107
6406
  /******************************************************************************/
@@ -6317,8 +6616,10 @@ parser_lex_magic_comment_encoding(pm_parser_t *parser) {
6317
6616
  */
6318
6617
  static void
6319
6618
  parser_lex_magic_comment_frozen_string_literal_value(pm_parser_t *parser, const uint8_t *start, const uint8_t *end) {
6320
- if (start + 4 <= end && pm_strncasecmp(start, (const uint8_t *) "true", 4) == 0) {
6619
+ if ((start + 4 <= end) && pm_strncasecmp(start, (const uint8_t *) "true", 4) == 0) {
6321
6620
  parser->frozen_string_literal = true;
6621
+ } else if ((start + 5 <= end) && pm_strncasecmp(start, (const uint8_t *) "false", 5) == 0) {
6622
+ parser->frozen_string_literal = false;
6322
6623
  }
6323
6624
  }
6324
6625
 
@@ -6541,21 +6842,27 @@ context_terminator(pm_context_t context, pm_token_t *token) {
6541
6842
  return token->type == PM_TOKEN_BRACE_RIGHT;
6542
6843
  case PM_CONTEXT_PREDICATE:
6543
6844
  return token->type == PM_TOKEN_KEYWORD_THEN || token->type == PM_TOKEN_NEWLINE || token->type == PM_TOKEN_SEMICOLON;
6845
+ case PM_CONTEXT_NONE:
6846
+ return false;
6544
6847
  }
6545
6848
 
6546
6849
  return false;
6547
6850
  }
6548
6851
 
6549
- static bool
6550
- context_recoverable(pm_parser_t *parser, pm_token_t *token) {
6852
+ /**
6853
+ * Returns the context that the given token is found to be terminating, or
6854
+ * returns PM_CONTEXT_NONE.
6855
+ */
6856
+ static pm_context_t
6857
+ context_recoverable(const pm_parser_t *parser, pm_token_t *token) {
6551
6858
  pm_context_node_t *context_node = parser->current_context;
6552
6859
 
6553
6860
  while (context_node != NULL) {
6554
- if (context_terminator(context_node->context, token)) return true;
6861
+ if (context_terminator(context_node->context, token)) return context_node->context;
6555
6862
  context_node = context_node->prev;
6556
6863
  }
6557
6864
 
6558
- return false;
6865
+ return PM_CONTEXT_NONE;
6559
6866
  }
6560
6867
 
6561
6868
  static bool
@@ -6583,7 +6890,7 @@ context_pop(pm_parser_t *parser) {
6583
6890
  }
6584
6891
 
6585
6892
  static bool
6586
- context_p(pm_parser_t *parser, pm_context_t context) {
6893
+ context_p(const pm_parser_t *parser, pm_context_t context) {
6587
6894
  pm_context_node_t *context_node = parser->current_context;
6588
6895
 
6589
6896
  while (context_node != NULL) {
@@ -6595,7 +6902,7 @@ context_p(pm_parser_t *parser, pm_context_t context) {
6595
6902
  }
6596
6903
 
6597
6904
  static bool
6598
- context_def_p(pm_parser_t *parser) {
6905
+ context_def_p(const pm_parser_t *parser) {
6599
6906
  pm_context_node_t *context_node = parser->current_context;
6600
6907
 
6601
6908
  while (context_node != NULL) {
@@ -6618,6 +6925,55 @@ context_def_p(pm_parser_t *parser) {
6618
6925
  return false;
6619
6926
  }
6620
6927
 
6928
+ /**
6929
+ * Returns a human readable string for the given context, used in error
6930
+ * messages.
6931
+ */
6932
+ static const char *
6933
+ context_human(pm_context_t context) {
6934
+ switch (context) {
6935
+ case PM_CONTEXT_NONE:
6936
+ assert(false && "unreachable");
6937
+ return "";
6938
+ case PM_CONTEXT_BEGIN: return "begin statement";
6939
+ case PM_CONTEXT_BLOCK_BRACES: return "'{'..'}' block";
6940
+ case PM_CONTEXT_BLOCK_KEYWORDS: return "'do'..'end' block";
6941
+ case PM_CONTEXT_CASE_WHEN: return "'when' clause";
6942
+ case PM_CONTEXT_CASE_IN: return "'in' clause";
6943
+ case PM_CONTEXT_CLASS: return "class definition";
6944
+ case PM_CONTEXT_DEF: return "method definition";
6945
+ case PM_CONTEXT_DEF_PARAMS: return "method parameters";
6946
+ case PM_CONTEXT_DEFAULT_PARAMS: return "parameter default value";
6947
+ case PM_CONTEXT_ELSE: return "'else' clause";
6948
+ case PM_CONTEXT_ELSIF: return "'elsif' clause";
6949
+ case PM_CONTEXT_EMBEXPR: return "embedded expression";
6950
+ case PM_CONTEXT_ENSURE: return "'ensure' clause";
6951
+ case PM_CONTEXT_ENSURE_DEF: return "'ensure' clause";
6952
+ case PM_CONTEXT_FOR: return "for loop";
6953
+ case PM_CONTEXT_FOR_INDEX: return "for loop index";
6954
+ case PM_CONTEXT_IF: return "if statement";
6955
+ case PM_CONTEXT_LAMBDA_BRACES: return "'{'..'}' lambda block";
6956
+ case PM_CONTEXT_LAMBDA_DO_END: return "'do'..'end' lambda block";
6957
+ case PM_CONTEXT_MAIN: return "top level context";
6958
+ case PM_CONTEXT_MODULE: return "module definition";
6959
+ case PM_CONTEXT_PARENS: return "parentheses";
6960
+ case PM_CONTEXT_POSTEXE: return "'END' block";
6961
+ case PM_CONTEXT_PREDICATE: return "predicate";
6962
+ case PM_CONTEXT_PREEXE: return "'BEGIN' block";
6963
+ case PM_CONTEXT_RESCUE_ELSE: return "'else' clause";
6964
+ case PM_CONTEXT_RESCUE_ELSE_DEF: return "'else' clause";
6965
+ case PM_CONTEXT_RESCUE: return "'rescue' clause";
6966
+ case PM_CONTEXT_RESCUE_DEF: return "'rescue' clause";
6967
+ case PM_CONTEXT_SCLASS: return "singleton class definition";
6968
+ case PM_CONTEXT_UNLESS: return "unless statement";
6969
+ case PM_CONTEXT_UNTIL: return "until statement";
6970
+ case PM_CONTEXT_WHILE: return "while statement";
6971
+ }
6972
+
6973
+ assert(false && "unreachable");
6974
+ return "";
6975
+ }
6976
+
6621
6977
  /******************************************************************************/
6622
6978
  /* Specific token lexers */
6623
6979
  /******************************************************************************/
@@ -6843,7 +7199,7 @@ lex_numeric(pm_parser_t *parser) {
6843
7199
  static pm_token_type_t
6844
7200
  lex_global_variable(pm_parser_t *parser) {
6845
7201
  if (parser->current.end >= parser->end) {
6846
- pm_parser_err_current(parser, PM_ERR_INVALID_VARIABLE_GLOBAL);
7202
+ PM_PARSER_ERR_TOKEN_FORMAT_CONTENT(parser, parser->current, PM_ERR_INVALID_VARIABLE_GLOBAL);
6847
7203
  return PM_TOKEN_GLOBAL_VARIABLE;
6848
7204
  }
6849
7205
 
@@ -6884,7 +7240,7 @@ lex_global_variable(pm_parser_t *parser) {
6884
7240
  } while (parser->current.end < parser->end && (width = char_is_identifier(parser, parser->current.end)) > 0);
6885
7241
 
6886
7242
  // $0 isn't allowed to be followed by anything.
6887
- pm_parser_err_current(parser, PM_ERR_INVALID_VARIABLE_GLOBAL);
7243
+ PM_PARSER_ERR_TOKEN_FORMAT_CONTENT(parser, parser->current, PM_ERR_INVALID_VARIABLE_GLOBAL);
6888
7244
  }
6889
7245
 
6890
7246
  return PM_TOKEN_GLOBAL_VARIABLE;
@@ -6915,7 +7271,7 @@ lex_global_variable(pm_parser_t *parser) {
6915
7271
  } else {
6916
7272
  // If we get here, then we have a $ followed by something that isn't
6917
7273
  // recognized as a global variable.
6918
- pm_parser_err_current(parser, PM_ERR_INVALID_VARIABLE_GLOBAL);
7274
+ PM_PARSER_ERR_TOKEN_FORMAT_CONTENT(parser, parser->current, PM_ERR_INVALID_VARIABLE_GLOBAL);
6919
7275
  }
6920
7276
 
6921
7277
  return PM_TOKEN_GLOBAL_VARIABLE;
@@ -7360,6 +7716,28 @@ escape_write_byte_encoded(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t byte
7360
7716
  pm_buffer_append_byte(buffer, byte);
7361
7717
  }
7362
7718
 
7719
+ /**
7720
+ * Write each byte of the given escaped character into the buffer.
7721
+ */
7722
+ static inline void
7723
+ escape_write_escape_encoded(pm_parser_t *parser, pm_buffer_t *buffer) {
7724
+ size_t width;
7725
+ if (parser->encoding_changed) {
7726
+ width = parser->encoding->char_width(parser->current.end, parser->end - parser->current.end);
7727
+ } else {
7728
+ width = pm_encoding_utf_8_char_width(parser->current.end, parser->end - parser->current.end);
7729
+ }
7730
+
7731
+ // TODO: If the character is invalid in the given encoding, then we'll just
7732
+ // push one byte into the buffer. This should actually be an error.
7733
+ width = (width == 0) ? 1 : width;
7734
+
7735
+ for (size_t index = 0; index < width; index++) {
7736
+ escape_write_byte_encoded(parser, buffer, *parser->current.end);
7737
+ parser->current.end++;
7738
+ }
7739
+ }
7740
+
7363
7741
  /**
7364
7742
  * The regular expression engine doesn't support the same escape sequences as
7365
7743
  * Ruby does. So first we have to read the escape sequence, and then we have to
@@ -7698,7 +8076,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) {
7698
8076
  /* fallthrough */
7699
8077
  default: {
7700
8078
  if (parser->current.end < parser->end) {
7701
- escape_write_byte_encoded(parser, buffer, *parser->current.end++);
8079
+ escape_write_escape_encoded(parser, buffer);
7702
8080
  }
7703
8081
  return;
7704
8082
  }
@@ -7797,10 +8175,10 @@ lex_at_variable(pm_parser_t *parser) {
7797
8175
  while (parser->current.end < parser->end && (width = char_is_identifier(parser, parser->current.end)) > 0) {
7798
8176
  parser->current.end += width;
7799
8177
  }
7800
- } else if (type == PM_TOKEN_CLASS_VARIABLE) {
7801
- pm_parser_err_current(parser, PM_ERR_INCOMPLETE_VARIABLE_CLASS);
7802
8178
  } else {
7803
- pm_parser_err_current(parser, PM_ERR_INCOMPLETE_VARIABLE_INSTANCE);
8179
+ pm_diagnostic_id_t diag_id = (type == PM_TOKEN_CLASS_VARIABLE) ? PM_ERR_INCOMPLETE_VARIABLE_CLASS : PM_ERR_INCOMPLETE_VARIABLE_INSTANCE;
8180
+ size_t width = parser->encoding->char_width(parser->current.end, parser->end - parser->current.end);
8181
+ PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, diag_id, (int) ((parser->current.end + width) - parser->current.start), (const char *) parser->current.start);
7804
8182
  }
7805
8183
 
7806
8184
  // If we're lexing an embedded variable, then we need to pop back into the
@@ -7975,14 +8353,43 @@ typedef struct {
7975
8353
  * Push the given byte into the token buffer.
7976
8354
  */
7977
8355
  static inline void
7978
- pm_token_buffer_push(pm_token_buffer_t *token_buffer, uint8_t byte) {
8356
+ pm_token_buffer_push_byte(pm_token_buffer_t *token_buffer, uint8_t byte) {
7979
8357
  pm_buffer_append_byte(&token_buffer->buffer, byte);
7980
8358
  }
7981
8359
 
8360
+ /**
8361
+ * Append the given bytes into the token buffer.
8362
+ */
8363
+ static inline void
8364
+ pm_token_buffer_push_bytes(pm_token_buffer_t *token_buffer, const uint8_t *bytes, size_t length) {
8365
+ pm_buffer_append_bytes(&token_buffer->buffer, bytes, length);
8366
+ }
8367
+
8368
+ /**
8369
+ * Push an escaped character into the token buffer.
8370
+ */
8371
+ static inline void
8372
+ pm_token_buffer_push_escaped(pm_token_buffer_t *token_buffer, pm_parser_t *parser) {
8373
+ // First, determine the width of the character to be escaped.
8374
+ size_t width;
8375
+ if (parser->encoding_changed) {
8376
+ width = parser->encoding->char_width(parser->current.end, parser->end - parser->current.end);
8377
+ } else {
8378
+ width = pm_encoding_utf_8_char_width(parser->current.end, parser->end - parser->current.end);
8379
+ }
8380
+
8381
+ // TODO: If the character is invalid in the given encoding, then we'll just
8382
+ // push one byte into the buffer. This should actually be an error.
8383
+ width = (width == 0 ? 1 : width);
8384
+
8385
+ // Now, push the bytes into the buffer.
8386
+ pm_token_buffer_push_bytes(token_buffer, parser->current.end, width);
8387
+ parser->current.end += width;
8388
+ }
8389
+
7982
8390
  /**
7983
8391
  * When we're about to return from lexing the current token and we know for sure
7984
8392
  * that we have found an escape sequence, this function is called to copy the
7985
- *
7986
8393
  * contents of the token buffer into the current string on the parser so that it
7987
8394
  * can be attached to the correct node.
7988
8395
  */
@@ -7997,7 +8404,6 @@ pm_token_buffer_copy(pm_parser_t *parser, pm_token_buffer_t *token_buffer) {
7997
8404
  * string. If we haven't pushed anything into the buffer, this means that we
7998
8405
  * never found an escape sequence, so we can directly reference the bounds of
7999
8406
  * the current string. Either way, at the return of this function it is expected
8000
- *
8001
8407
  * that parser->current_string is established in such a way that it can be
8002
8408
  * attached to a node.
8003
8409
  */
@@ -8016,7 +8422,6 @@ pm_token_buffer_flush(pm_parser_t *parser, pm_token_buffer_t *token_buffer) {
8016
8422
  * point into the buffer because we're about to provide a string that has
8017
8423
  * different content than a direct slice of the source.
8018
8424
  *
8019
- *
8020
8425
  * It is expected that the parser's current token end will be pointing at one
8021
8426
  * byte past the backslash that starts the escape sequence.
8022
8427
  */
@@ -8070,6 +8475,34 @@ pm_heredoc_strspn_inline_whitespace(pm_parser_t *parser, const uint8_t **cursor,
8070
8475
  return whitespace;
8071
8476
  }
8072
8477
 
8478
+ /**
8479
+ * Lex past the delimiter of a percent literal. Handle newlines and heredocs
8480
+ * appropriately.
8481
+ */
8482
+ static uint8_t
8483
+ pm_lex_percent_delimiter(pm_parser_t *parser) {
8484
+ size_t eol_length = match_eol(parser);
8485
+
8486
+ if (eol_length) {
8487
+ if (parser->heredoc_end) {
8488
+ // If we have already lexed a heredoc, then the newline has already
8489
+ // been added to the list. In this case we want to just flush the
8490
+ // heredoc end.
8491
+ parser_flush_heredoc_end(parser);
8492
+ } else {
8493
+ // Otherwise, we'll add the newline to the list of newlines.
8494
+ pm_newline_list_append(&parser->newline_list, parser->current.end + eol_length - 1);
8495
+ }
8496
+
8497
+ const uint8_t delimiter = *parser->current.end;
8498
+ parser->current.end += eol_length;
8499
+
8500
+ return delimiter;
8501
+ }
8502
+
8503
+ return *parser->current.end++;
8504
+ }
8505
+
8073
8506
  /**
8074
8507
  * This is a convenience macro that will set the current token type, call the
8075
8508
  * lex callback, and then return from the parser_lex function.
@@ -8635,7 +9068,7 @@ parser_lex(pm_parser_t *parser) {
8635
9068
  // this is not a valid heredoc declaration. In this case we
8636
9069
  // will add an error, but we will still return a heredoc
8637
9070
  // start.
8638
- pm_parser_err_current(parser, PM_ERR_EMBDOC_TERM);
9071
+ pm_parser_err_current(parser, PM_ERR_HEREDOC_TERM);
8639
9072
  body_start = parser->end;
8640
9073
  } else {
8641
9074
  // Otherwise, we want to indicate that the body of the
@@ -8826,12 +9259,10 @@ parser_lex(pm_parser_t *parser) {
8826
9259
  LEX(PM_TOKEN_PLUS_EQUAL);
8827
9260
  }
8828
9261
 
8829
- bool spcarg = lex_state_spcarg_p(parser, space_seen);
8830
- if (spcarg) {
8831
- pm_parser_warn_token(parser, &parser->current, PM_WARN_AMBIGUOUS_FIRST_ARGUMENT_PLUS);
8832
- }
8833
-
8834
- if (lex_state_beg_p(parser) || spcarg) {
9262
+ if (
9263
+ lex_state_beg_p(parser) ||
9264
+ (lex_state_spcarg_p(parser, space_seen) ? (pm_parser_warn_token(parser, &parser->current, PM_WARN_AMBIGUOUS_FIRST_ARGUMENT_PLUS), true) : false)
9265
+ ) {
8835
9266
  lex_state_set(parser, PM_LEX_STATE_BEG);
8836
9267
 
8837
9268
  if (pm_char_is_decimal_digit(peek(parser))) {
@@ -8871,11 +9302,12 @@ parser_lex(pm_parser_t *parser) {
8871
9302
  }
8872
9303
 
8873
9304
  bool spcarg = lex_state_spcarg_p(parser, space_seen);
8874
- if (spcarg) {
9305
+ bool is_beg = lex_state_beg_p(parser);
9306
+ if (!is_beg && spcarg) {
8875
9307
  pm_parser_warn_token(parser, &parser->current, PM_WARN_AMBIGUOUS_FIRST_ARGUMENT_MINUS);
8876
9308
  }
8877
9309
 
8878
- if (lex_state_beg_p(parser) || spcarg) {
9310
+ if (is_beg || spcarg) {
8879
9311
  lex_state_set(parser, PM_LEX_STATE_BEG);
8880
9312
  LEX(pm_char_is_decimal_digit(peek(parser)) ? PM_TOKEN_UMINUS_NUM : PM_TOKEN_UMINUS);
8881
9313
  }
@@ -9026,15 +9458,8 @@ parser_lex(pm_parser_t *parser) {
9026
9458
  pm_parser_err_current(parser, PM_ERR_INVALID_PERCENT);
9027
9459
  }
9028
9460
 
9029
- lex_mode_push_string(parser, true, false, lex_mode_incrementor(*parser->current.end), lex_mode_terminator(*parser->current.end));
9030
-
9031
- size_t eol_length = match_eol(parser);
9032
- if (eol_length) {
9033
- parser->current.end += eol_length;
9034
- pm_newline_list_append(&parser->newline_list, parser->current.end - 1);
9035
- } else {
9036
- parser->current.end++;
9037
- }
9461
+ const uint8_t delimiter = pm_lex_percent_delimiter(parser);
9462
+ lex_mode_push_string(parser, true, false, lex_mode_incrementor(delimiter), lex_mode_terminator(delimiter));
9038
9463
 
9039
9464
  if (parser->current.end < parser->end) {
9040
9465
  LEX(PM_TOKEN_STRING_BEGIN);
@@ -9054,7 +9479,7 @@ parser_lex(pm_parser_t *parser) {
9054
9479
  parser->current.end++;
9055
9480
 
9056
9481
  if (parser->current.end < parser->end) {
9057
- lex_mode_push_list(parser, false, *parser->current.end++);
9482
+ lex_mode_push_list(parser, false, pm_lex_percent_delimiter(parser));
9058
9483
  } else {
9059
9484
  lex_mode_push_list_eof(parser);
9060
9485
  }
@@ -9065,7 +9490,7 @@ parser_lex(pm_parser_t *parser) {
9065
9490
  parser->current.end++;
9066
9491
 
9067
9492
  if (parser->current.end < parser->end) {
9068
- lex_mode_push_list(parser, true, *parser->current.end++);
9493
+ lex_mode_push_list(parser, true, pm_lex_percent_delimiter(parser));
9069
9494
  } else {
9070
9495
  lex_mode_push_list_eof(parser);
9071
9496
  }
@@ -9076,9 +9501,8 @@ parser_lex(pm_parser_t *parser) {
9076
9501
  parser->current.end++;
9077
9502
 
9078
9503
  if (parser->current.end < parser->end) {
9079
- lex_mode_push_regexp(parser, lex_mode_incrementor(*parser->current.end), lex_mode_terminator(*parser->current.end));
9080
- pm_newline_list_check_append(&parser->newline_list, parser->current.end);
9081
- parser->current.end++;
9504
+ const uint8_t delimiter = pm_lex_percent_delimiter(parser);
9505
+ lex_mode_push_regexp(parser, lex_mode_incrementor(delimiter), lex_mode_terminator(delimiter));
9082
9506
  } else {
9083
9507
  lex_mode_push_regexp(parser, '\0', '\0');
9084
9508
  }
@@ -9089,9 +9513,8 @@ parser_lex(pm_parser_t *parser) {
9089
9513
  parser->current.end++;
9090
9514
 
9091
9515
  if (parser->current.end < parser->end) {
9092
- lex_mode_push_string(parser, false, false, lex_mode_incrementor(*parser->current.end), lex_mode_terminator(*parser->current.end));
9093
- pm_newline_list_check_append(&parser->newline_list, parser->current.end);
9094
- parser->current.end++;
9516
+ const uint8_t delimiter = pm_lex_percent_delimiter(parser);
9517
+ lex_mode_push_string(parser, false, false, lex_mode_incrementor(delimiter), lex_mode_terminator(delimiter));
9095
9518
  } else {
9096
9519
  lex_mode_push_string_eof(parser);
9097
9520
  }
@@ -9102,9 +9525,8 @@ parser_lex(pm_parser_t *parser) {
9102
9525
  parser->current.end++;
9103
9526
 
9104
9527
  if (parser->current.end < parser->end) {
9105
- lex_mode_push_string(parser, true, false, lex_mode_incrementor(*parser->current.end), lex_mode_terminator(*parser->current.end));
9106
- pm_newline_list_check_append(&parser->newline_list, parser->current.end);
9107
- parser->current.end++;
9528
+ const uint8_t delimiter = pm_lex_percent_delimiter(parser);
9529
+ lex_mode_push_string(parser, true, false, lex_mode_incrementor(delimiter), lex_mode_terminator(delimiter));
9108
9530
  } else {
9109
9531
  lex_mode_push_string_eof(parser);
9110
9532
  }
@@ -9115,9 +9537,9 @@ parser_lex(pm_parser_t *parser) {
9115
9537
  parser->current.end++;
9116
9538
 
9117
9539
  if (parser->current.end < parser->end) {
9118
- lex_mode_push_string(parser, false, false, lex_mode_incrementor(*parser->current.end), lex_mode_terminator(*parser->current.end));
9540
+ const uint8_t delimiter = pm_lex_percent_delimiter(parser);
9541
+ lex_mode_push_string(parser, false, false, lex_mode_incrementor(delimiter), lex_mode_terminator(delimiter));
9119
9542
  lex_state_set(parser, PM_LEX_STATE_FNAME | PM_LEX_STATE_FITEM);
9120
- parser->current.end++;
9121
9543
  } else {
9122
9544
  lex_mode_push_string_eof(parser);
9123
9545
  }
@@ -9128,7 +9550,7 @@ parser_lex(pm_parser_t *parser) {
9128
9550
  parser->current.end++;
9129
9551
 
9130
9552
  if (parser->current.end < parser->end) {
9131
- lex_mode_push_list(parser, false, *parser->current.end++);
9553
+ lex_mode_push_list(parser, false, pm_lex_percent_delimiter(parser));
9132
9554
  } else {
9133
9555
  lex_mode_push_list_eof(parser);
9134
9556
  }
@@ -9139,7 +9561,7 @@ parser_lex(pm_parser_t *parser) {
9139
9561
  parser->current.end++;
9140
9562
 
9141
9563
  if (parser->current.end < parser->end) {
9142
- lex_mode_push_list(parser, true, *parser->current.end++);
9564
+ lex_mode_push_list(parser, true, pm_lex_percent_delimiter(parser));
9143
9565
  } else {
9144
9566
  lex_mode_push_list_eof(parser);
9145
9567
  }
@@ -9150,8 +9572,8 @@ parser_lex(pm_parser_t *parser) {
9150
9572
  parser->current.end++;
9151
9573
 
9152
9574
  if (parser->current.end < parser->end) {
9153
- lex_mode_push_string(parser, true, false, lex_mode_incrementor(*parser->current.end), lex_mode_terminator(*parser->current.end));
9154
- parser->current.end++;
9575
+ const uint8_t delimiter = pm_lex_percent_delimiter(parser);
9576
+ lex_mode_push_string(parser, true, false, lex_mode_incrementor(delimiter), lex_mode_terminator(delimiter));
9155
9577
  } else {
9156
9578
  lex_mode_push_string_eof(parser);
9157
9579
  }
@@ -9195,11 +9617,21 @@ parser_lex(pm_parser_t *parser) {
9195
9617
  if (*parser->current.start != '_') {
9196
9618
  size_t width = char_is_identifier_start(parser, parser->current.start);
9197
9619
 
9198
- // If this isn't the beginning of an identifier, then it's an invalid
9199
- // token as we've exhausted all of the other options. We'll skip past
9200
- // it and return the next token.
9620
+ // If this isn't the beginning of an identifier, then
9621
+ // it's an invalid token as we've exhausted all of the
9622
+ // other options. We'll skip past it and return the next
9623
+ // token after adding an appropriate error message.
9201
9624
  if (!width) {
9202
- pm_parser_err_current(parser, PM_ERR_INVALID_TOKEN);
9625
+ pm_diagnostic_id_t diag_id;
9626
+ if (*parser->current.start >= 0x80) {
9627
+ diag_id = PM_ERR_INVALID_MULTIBYTE_CHARACTER;
9628
+ } else if (char_is_ascii_printable(*parser->current.start) || (*parser->current.start == '\\')) {
9629
+ diag_id = PM_ERR_INVALID_PRINTABLE_CHARACTER;
9630
+ } else {
9631
+ diag_id = PM_ERR_INVALID_CHARACTER;
9632
+ }
9633
+
9634
+ PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, diag_id, *parser->current.start);
9203
9635
  goto lex_next_token;
9204
9636
  }
9205
9637
 
@@ -9306,7 +9738,7 @@ parser_lex(pm_parser_t *parser) {
9306
9738
  // and then find the first one.
9307
9739
  pm_lex_mode_t *lex_mode = parser->lex_modes.current;
9308
9740
  const uint8_t *breakpoints = lex_mode->as.list.breakpoints;
9309
- const uint8_t *breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
9741
+ const uint8_t *breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
9310
9742
 
9311
9743
  // If we haven't found an escape yet, then this buffer will be
9312
9744
  // unallocated since we can refer directly to the source string.
@@ -9315,7 +9747,7 @@ parser_lex(pm_parser_t *parser) {
9315
9747
  while (breakpoint != NULL) {
9316
9748
  // If we hit a null byte, skip directly past it.
9317
9749
  if (*breakpoint == '\0') {
9318
- breakpoint = pm_strpbrk(parser, breakpoint + 1, breakpoints, parser->end - (breakpoint + 1));
9750
+ breakpoint = pm_strpbrk(parser, breakpoint + 1, breakpoints, parser->end - (breakpoint + 1), true);
9319
9751
  continue;
9320
9752
  }
9321
9753
 
@@ -9334,7 +9766,7 @@ parser_lex(pm_parser_t *parser) {
9334
9766
  // we need to continue on past it.
9335
9767
  if (lex_mode->as.list.nesting > 0) {
9336
9768
  parser->current.end = breakpoint + 1;
9337
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
9769
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
9338
9770
  lex_mode->as.list.nesting--;
9339
9771
  continue;
9340
9772
  }
@@ -9377,18 +9809,18 @@ parser_lex(pm_parser_t *parser) {
9377
9809
  case '\t':
9378
9810
  case '\v':
9379
9811
  case '\\':
9380
- pm_token_buffer_push(&token_buffer, peeked);
9812
+ pm_token_buffer_push_byte(&token_buffer, peeked);
9381
9813
  parser->current.end++;
9382
9814
  break;
9383
9815
  case '\r':
9384
9816
  parser->current.end++;
9385
9817
  if (peek(parser) != '\n') {
9386
- pm_token_buffer_push(&token_buffer, '\r');
9818
+ pm_token_buffer_push_byte(&token_buffer, '\r');
9387
9819
  break;
9388
9820
  }
9389
9821
  /* fallthrough */
9390
9822
  case '\n':
9391
- pm_token_buffer_push(&token_buffer, '\n');
9823
+ pm_token_buffer_push_byte(&token_buffer, '\n');
9392
9824
 
9393
9825
  if (parser->heredoc_end) {
9394
9826
  // ... if we are on the same line as a heredoc,
@@ -9406,21 +9838,20 @@ parser_lex(pm_parser_t *parser) {
9406
9838
  break;
9407
9839
  default:
9408
9840
  if (peeked == lex_mode->as.list.incrementor || peeked == lex_mode->as.list.terminator) {
9409
- pm_token_buffer_push(&token_buffer, peeked);
9841
+ pm_token_buffer_push_byte(&token_buffer, peeked);
9410
9842
  parser->current.end++;
9411
9843
  } else if (lex_mode->as.list.interpolation) {
9412
9844
  escape_read(parser, &token_buffer.buffer, PM_ESCAPE_FLAG_NONE);
9413
9845
  } else {
9414
- pm_token_buffer_push(&token_buffer, '\\');
9415
- pm_token_buffer_push(&token_buffer, peeked);
9416
- parser->current.end++;
9846
+ pm_token_buffer_push_byte(&token_buffer, '\\');
9847
+ pm_token_buffer_push_escaped(&token_buffer, parser);
9417
9848
  }
9418
9849
 
9419
9850
  break;
9420
9851
  }
9421
9852
 
9422
9853
  token_buffer.cursor = parser->current.end;
9423
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
9854
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
9424
9855
  continue;
9425
9856
  }
9426
9857
 
@@ -9433,7 +9864,7 @@ parser_lex(pm_parser_t *parser) {
9433
9864
  // that looked like an interpolated class or instance variable
9434
9865
  // like "#@" but wasn't actually. In this case we'll just skip
9435
9866
  // to the next breakpoint.
9436
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
9867
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
9437
9868
  continue;
9438
9869
  }
9439
9870
 
@@ -9448,7 +9879,7 @@ parser_lex(pm_parser_t *parser) {
9448
9879
  // and find the next breakpoint.
9449
9880
  assert(*breakpoint == lex_mode->as.list.incrementor);
9450
9881
  parser->current.end = breakpoint + 1;
9451
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
9882
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
9452
9883
  lex_mode->as.list.nesting++;
9453
9884
  continue;
9454
9885
  }
@@ -9487,14 +9918,14 @@ parser_lex(pm_parser_t *parser) {
9487
9918
  // regular expression. We'll use strpbrk to find the first of these
9488
9919
  // characters.
9489
9920
  const uint8_t *breakpoints = lex_mode->as.regexp.breakpoints;
9490
- const uint8_t *breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
9921
+ const uint8_t *breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false);
9491
9922
  pm_token_buffer_t token_buffer = { { 0 }, 0 };
9492
9923
 
9493
9924
  while (breakpoint != NULL) {
9494
9925
  // If we hit a null byte, skip directly past it.
9495
9926
  if (*breakpoint == '\0') {
9496
9927
  parser->current.end = breakpoint + 1;
9497
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
9928
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false);
9498
9929
  continue;
9499
9930
  }
9500
9931
 
@@ -9516,7 +9947,7 @@ parser_lex(pm_parser_t *parser) {
9516
9947
  // If the terminator is not a newline, then we can set
9517
9948
  // the next breakpoint and continue.
9518
9949
  parser->current.end = breakpoint + 1;
9519
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
9950
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false);
9520
9951
  continue;
9521
9952
  }
9522
9953
  }
@@ -9526,7 +9957,7 @@ parser_lex(pm_parser_t *parser) {
9526
9957
  if (*breakpoint == lex_mode->as.regexp.terminator) {
9527
9958
  if (lex_mode->as.regexp.nesting > 0) {
9528
9959
  parser->current.end = breakpoint + 1;
9529
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
9960
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false);
9530
9961
  lex_mode->as.regexp.nesting--;
9531
9962
  continue;
9532
9963
  }
@@ -9571,9 +10002,9 @@ parser_lex(pm_parser_t *parser) {
9571
10002
  parser->current.end++;
9572
10003
  if (peek(parser) != '\n') {
9573
10004
  if (lex_mode->as.regexp.terminator != '\r') {
9574
- pm_token_buffer_push(&token_buffer, '\\');
10005
+ pm_token_buffer_push_byte(&token_buffer, '\\');
9575
10006
  }
9576
- pm_token_buffer_push(&token_buffer, '\r');
10007
+ pm_token_buffer_push_byte(&token_buffer, '\r');
9577
10008
  break;
9578
10009
  }
9579
10010
  /* fallthrough */
@@ -9608,25 +10039,24 @@ parser_lex(pm_parser_t *parser) {
9608
10039
  case '$': case ')': case '*': case '+':
9609
10040
  case '.': case '>': case '?': case ']':
9610
10041
  case '^': case '|': case '}':
9611
- pm_token_buffer_push(&token_buffer, '\\');
10042
+ pm_token_buffer_push_byte(&token_buffer, '\\');
9612
10043
  break;
9613
10044
  default:
9614
10045
  break;
9615
10046
  }
9616
10047
 
9617
- pm_token_buffer_push(&token_buffer, peeked);
10048
+ pm_token_buffer_push_byte(&token_buffer, peeked);
9618
10049
  parser->current.end++;
9619
10050
  break;
9620
10051
  }
9621
10052
 
9622
- if (peeked < 0x80) pm_token_buffer_push(&token_buffer, '\\');
9623
- pm_token_buffer_push(&token_buffer, peeked);
9624
- parser->current.end++;
10053
+ if (peeked < 0x80) pm_token_buffer_push_byte(&token_buffer, '\\');
10054
+ pm_token_buffer_push_escaped(&token_buffer, parser);
9625
10055
  break;
9626
10056
  }
9627
10057
 
9628
10058
  token_buffer.cursor = parser->current.end;
9629
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
10059
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false);
9630
10060
  continue;
9631
10061
  }
9632
10062
 
@@ -9639,7 +10069,7 @@ parser_lex(pm_parser_t *parser) {
9639
10069
  // something that looked like an interpolated class or
9640
10070
  // instance variable like "#@" but wasn't actually. In
9641
10071
  // this case we'll just skip to the next breakpoint.
9642
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
10072
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false);
9643
10073
  continue;
9644
10074
  }
9645
10075
 
@@ -9654,7 +10084,7 @@ parser_lex(pm_parser_t *parser) {
9654
10084
  // and find the next breakpoint.
9655
10085
  assert(*breakpoint == lex_mode->as.regexp.incrementor);
9656
10086
  parser->current.end = breakpoint + 1;
9657
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
10087
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false);
9658
10088
  lex_mode->as.regexp.nesting++;
9659
10089
  continue;
9660
10090
  }
@@ -9690,7 +10120,7 @@ parser_lex(pm_parser_t *parser) {
9690
10120
  // string. We'll use strpbrk to find the first of these characters.
9691
10121
  pm_lex_mode_t *lex_mode = parser->lex_modes.current;
9692
10122
  const uint8_t *breakpoints = lex_mode->as.string.breakpoints;
9693
- const uint8_t *breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
10123
+ const uint8_t *breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
9694
10124
 
9695
10125
  // If we haven't found an escape yet, then this buffer will be
9696
10126
  // unallocated since we can refer directly to the source string.
@@ -9702,7 +10132,7 @@ parser_lex(pm_parser_t *parser) {
9702
10132
  if (lex_mode->as.string.incrementor != '\0' && *breakpoint == lex_mode->as.string.incrementor) {
9703
10133
  lex_mode->as.string.nesting++;
9704
10134
  parser->current.end = breakpoint + 1;
9705
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
10135
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
9706
10136
  continue;
9707
10137
  }
9708
10138
 
@@ -9714,7 +10144,7 @@ parser_lex(pm_parser_t *parser) {
9714
10144
  // to continue on past it.
9715
10145
  if (lex_mode->as.string.nesting > 0) {
9716
10146
  parser->current.end = breakpoint + 1;
9717
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
10147
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
9718
10148
  lex_mode->as.string.nesting--;
9719
10149
  continue;
9720
10150
  }
@@ -9756,7 +10186,7 @@ parser_lex(pm_parser_t *parser) {
9756
10186
  if (parser->heredoc_end == NULL) {
9757
10187
  pm_newline_list_append(&parser->newline_list, breakpoint);
9758
10188
  parser->current.end = breakpoint + 1;
9759
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
10189
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
9760
10190
  continue;
9761
10191
  } else {
9762
10192
  parser->current.end = breakpoint + 1;
@@ -9770,7 +10200,7 @@ parser_lex(pm_parser_t *parser) {
9770
10200
  case '\0':
9771
10201
  // Skip directly past the null character.
9772
10202
  parser->current.end = breakpoint + 1;
9773
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
10203
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
9774
10204
  break;
9775
10205
  case '\\': {
9776
10206
  // Here we hit escapes.
@@ -9788,23 +10218,23 @@ parser_lex(pm_parser_t *parser) {
9788
10218
 
9789
10219
  switch (peeked) {
9790
10220
  case '\\':
9791
- pm_token_buffer_push(&token_buffer, '\\');
10221
+ pm_token_buffer_push_byte(&token_buffer, '\\');
9792
10222
  parser->current.end++;
9793
10223
  break;
9794
10224
  case '\r':
9795
10225
  parser->current.end++;
9796
10226
  if (peek(parser) != '\n') {
9797
10227
  if (!lex_mode->as.string.interpolation) {
9798
- pm_token_buffer_push(&token_buffer, '\\');
10228
+ pm_token_buffer_push_byte(&token_buffer, '\\');
9799
10229
  }
9800
- pm_token_buffer_push(&token_buffer, '\r');
10230
+ pm_token_buffer_push_byte(&token_buffer, '\r');
9801
10231
  break;
9802
10232
  }
9803
10233
  /* fallthrough */
9804
10234
  case '\n':
9805
10235
  if (!lex_mode->as.string.interpolation) {
9806
- pm_token_buffer_push(&token_buffer, '\\');
9807
- pm_token_buffer_push(&token_buffer, '\n');
10236
+ pm_token_buffer_push_byte(&token_buffer, '\\');
10237
+ pm_token_buffer_push_byte(&token_buffer, '\n');
9808
10238
  }
9809
10239
 
9810
10240
  if (parser->heredoc_end) {
@@ -9823,24 +10253,23 @@ parser_lex(pm_parser_t *parser) {
9823
10253
  break;
9824
10254
  default:
9825
10255
  if (lex_mode->as.string.incrementor != '\0' && peeked == lex_mode->as.string.incrementor) {
9826
- pm_token_buffer_push(&token_buffer, peeked);
10256
+ pm_token_buffer_push_byte(&token_buffer, peeked);
9827
10257
  parser->current.end++;
9828
10258
  } else if (lex_mode->as.string.terminator != '\0' && peeked == lex_mode->as.string.terminator) {
9829
- pm_token_buffer_push(&token_buffer, peeked);
10259
+ pm_token_buffer_push_byte(&token_buffer, peeked);
9830
10260
  parser->current.end++;
9831
10261
  } else if (lex_mode->as.string.interpolation) {
9832
10262
  escape_read(parser, &token_buffer.buffer, PM_ESCAPE_FLAG_NONE);
9833
10263
  } else {
9834
- pm_token_buffer_push(&token_buffer, '\\');
9835
- pm_token_buffer_push(&token_buffer, peeked);
9836
- parser->current.end++;
10264
+ pm_token_buffer_push_byte(&token_buffer, '\\');
10265
+ pm_token_buffer_push_escaped(&token_buffer, parser);
9837
10266
  }
9838
10267
 
9839
10268
  break;
9840
10269
  }
9841
10270
 
9842
10271
  token_buffer.cursor = parser->current.end;
9843
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
10272
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
9844
10273
  break;
9845
10274
  }
9846
10275
  case '#': {
@@ -9851,7 +10280,7 @@ parser_lex(pm_parser_t *parser) {
9851
10280
  // looked like an interpolated class or instance variable like "#@"
9852
10281
  // but wasn't actually. In this case we'll just skip to the next
9853
10282
  // breakpoint.
9854
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
10283
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
9855
10284
  break;
9856
10285
  }
9857
10286
 
@@ -9888,15 +10317,22 @@ parser_lex(pm_parser_t *parser) {
9888
10317
  parser->next_start = NULL;
9889
10318
  }
9890
10319
 
9891
- // We'll check if we're at the end of the file. If we are, then we need to
9892
- // return the EOF token.
10320
+ // Now let's grab the information about the identifier off of the
10321
+ // current lex mode.
10322
+ pm_lex_mode_t *lex_mode = parser->lex_modes.current;
10323
+
10324
+ // We'll check if we're at the end of the file. If we are, then we
10325
+ // will add an error (because we weren't able to find the
10326
+ // terminator) but still continue parsing so that content after the
10327
+ // declaration of the heredoc can be parsed.
9893
10328
  if (parser->current.end >= parser->end) {
9894
- LEX(PM_TOKEN_EOF);
10329
+ pm_parser_err_current(parser, PM_ERR_HEREDOC_TERM);
10330
+ parser->next_start = lex_mode->as.heredoc.next_start;
10331
+ parser->heredoc_end = parser->current.end;
10332
+ lex_state_set(parser, PM_LEX_STATE_END);
10333
+ LEX(PM_TOKEN_HEREDOC_END);
9895
10334
  }
9896
10335
 
9897
- // Now let's grab the information about the identifier off of the current
9898
- // lex mode.
9899
- pm_lex_mode_t *lex_mode = parser->lex_modes.current;
9900
10336
  const uint8_t *ident_start = lex_mode->as.heredoc.ident_start;
9901
10337
  size_t ident_length = lex_mode->as.heredoc.ident_length;
9902
10338
 
@@ -9972,7 +10408,7 @@ parser_lex(pm_parser_t *parser) {
9972
10408
  breakpoints[2] = '\0';
9973
10409
  }
9974
10410
 
9975
- const uint8_t *breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
10411
+ const uint8_t *breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
9976
10412
  pm_token_buffer_t token_buffer = { { 0 }, 0 };
9977
10413
  bool was_escaped_newline = false;
9978
10414
 
@@ -9981,7 +10417,7 @@ parser_lex(pm_parser_t *parser) {
9981
10417
  case '\0':
9982
10418
  // Skip directly past the null character.
9983
10419
  parser->current.end = breakpoint + 1;
9984
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
10420
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
9985
10421
  break;
9986
10422
  case '\n': {
9987
10423
  if (parser->heredoc_end != NULL && (parser->heredoc_end > breakpoint)) {
@@ -10056,7 +10492,7 @@ parser_lex(pm_parser_t *parser) {
10056
10492
  // Otherwise we hit a newline and it wasn't followed by
10057
10493
  // a terminator, so we can continue parsing.
10058
10494
  parser->current.end = breakpoint + 1;
10059
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
10495
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
10060
10496
  break;
10061
10497
  }
10062
10498
  case '\\': {
@@ -10083,21 +10519,20 @@ parser_lex(pm_parser_t *parser) {
10083
10519
  case '\r':
10084
10520
  parser->current.end++;
10085
10521
  if (peek(parser) != '\n') {
10086
- pm_token_buffer_push(&token_buffer, '\\');
10087
- pm_token_buffer_push(&token_buffer, '\r');
10522
+ pm_token_buffer_push_byte(&token_buffer, '\\');
10523
+ pm_token_buffer_push_byte(&token_buffer, '\r');
10088
10524
  break;
10089
10525
  }
10090
10526
  /* fallthrough */
10091
10527
  case '\n':
10092
- pm_token_buffer_push(&token_buffer, '\\');
10093
- pm_token_buffer_push(&token_buffer, '\n');
10528
+ pm_token_buffer_push_byte(&token_buffer, '\\');
10529
+ pm_token_buffer_push_byte(&token_buffer, '\n');
10094
10530
  token_buffer.cursor = parser->current.end + 1;
10095
10531
  breakpoint = parser->current.end;
10096
10532
  continue;
10097
10533
  default:
10098
- parser->current.end++;
10099
- pm_token_buffer_push(&token_buffer, '\\');
10100
- pm_token_buffer_push(&token_buffer, peeked);
10534
+ pm_token_buffer_push_byte(&token_buffer, '\\');
10535
+ pm_token_buffer_push_escaped(&token_buffer, parser);
10101
10536
  break;
10102
10537
  }
10103
10538
  } else {
@@ -10105,7 +10540,7 @@ parser_lex(pm_parser_t *parser) {
10105
10540
  case '\r':
10106
10541
  parser->current.end++;
10107
10542
  if (peek(parser) != '\n') {
10108
- pm_token_buffer_push(&token_buffer, '\r');
10543
+ pm_token_buffer_push_byte(&token_buffer, '\r');
10109
10544
  break;
10110
10545
  }
10111
10546
  /* fallthrough */
@@ -10121,7 +10556,7 @@ parser_lex(pm_parser_t *parser) {
10121
10556
  }
10122
10557
 
10123
10558
  token_buffer.cursor = parser->current.end;
10124
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
10559
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
10125
10560
  break;
10126
10561
  }
10127
10562
  case '#': {
@@ -10133,7 +10568,7 @@ parser_lex(pm_parser_t *parser) {
10133
10568
  // or instance variable like "#@" but wasn't
10134
10569
  // actually. In this case we'll just skip to the
10135
10570
  // next breakpoint.
10136
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
10571
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
10137
10572
  break;
10138
10573
  }
10139
10574
 
@@ -10184,8 +10619,8 @@ parser_lex(pm_parser_t *parser) {
10184
10619
  typedef enum {
10185
10620
  PM_BINDING_POWER_UNSET = 0, // used to indicate this token cannot be used as an infix operator
10186
10621
  PM_BINDING_POWER_STATEMENT = 2,
10187
- PM_BINDING_POWER_MODIFIER = 4, // if unless until while
10188
- PM_BINDING_POWER_MODIFIER_RESCUE = 6, // rescue
10622
+ PM_BINDING_POWER_MODIFIER_RESCUE = 4, // rescue
10623
+ PM_BINDING_POWER_MODIFIER = 6, // if unless until while
10189
10624
  PM_BINDING_POWER_COMPOSITION = 8, // and or
10190
10625
  PM_BINDING_POWER_NOT = 10, // not
10191
10626
  PM_BINDING_POWER_MATCH = 12, // => in
@@ -10239,15 +10674,15 @@ typedef struct {
10239
10674
  #define RIGHT_ASSOCIATIVE_UNARY(precedence) { precedence, precedence, false, false }
10240
10675
 
10241
10676
  pm_binding_powers_t pm_binding_powers[PM_TOKEN_MAXIMUM] = {
10677
+ // rescue
10678
+ [PM_TOKEN_KEYWORD_RESCUE_MODIFIER] = LEFT_ASSOCIATIVE(PM_BINDING_POWER_MODIFIER_RESCUE),
10679
+
10242
10680
  // if unless until while
10243
10681
  [PM_TOKEN_KEYWORD_IF_MODIFIER] = LEFT_ASSOCIATIVE(PM_BINDING_POWER_MODIFIER),
10244
10682
  [PM_TOKEN_KEYWORD_UNLESS_MODIFIER] = LEFT_ASSOCIATIVE(PM_BINDING_POWER_MODIFIER),
10245
10683
  [PM_TOKEN_KEYWORD_UNTIL_MODIFIER] = LEFT_ASSOCIATIVE(PM_BINDING_POWER_MODIFIER),
10246
10684
  [PM_TOKEN_KEYWORD_WHILE_MODIFIER] = LEFT_ASSOCIATIVE(PM_BINDING_POWER_MODIFIER),
10247
10685
 
10248
- // rescue
10249
- [PM_TOKEN_KEYWORD_RESCUE_MODIFIER] = LEFT_ASSOCIATIVE(PM_BINDING_POWER_MODIFIER_RESCUE),
10250
-
10251
10686
  // and or
10252
10687
  [PM_TOKEN_KEYWORD_AND] = LEFT_ASSOCIATIVE(PM_BINDING_POWER_COMPOSITION),
10253
10688
  [PM_TOKEN_KEYWORD_OR] = LEFT_ASSOCIATIVE(PM_BINDING_POWER_COMPOSITION),
@@ -10381,14 +10816,6 @@ match4(const pm_parser_t *parser, pm_token_type_t type1, pm_token_type_t type2,
10381
10816
  return match1(parser, type1) || match1(parser, type2) || match1(parser, type3) || match1(parser, type4);
10382
10817
  }
10383
10818
 
10384
- /**
10385
- * Returns true if the current token is any of the five given types.
10386
- */
10387
- static inline bool
10388
- match5(const pm_parser_t *parser, pm_token_type_t type1, pm_token_type_t type2, pm_token_type_t type3, pm_token_type_t type4, pm_token_type_t type5) {
10389
- return match1(parser, type1) || match1(parser, type2) || match1(parser, type3) || match1(parser, type4) || match1(parser, type5);
10390
- }
10391
-
10392
10819
  /**
10393
10820
  * Returns true if the current token is any of the six given types.
10394
10821
  */
@@ -10654,7 +11081,7 @@ parse_target(pm_parser_t *parser, pm_node_t *target) {
10654
11081
  return target;
10655
11082
  case PM_BACK_REFERENCE_READ_NODE:
10656
11083
  case PM_NUMBERED_REFERENCE_READ_NODE:
10657
- pm_parser_err_node(parser, target, PM_ERR_WRITE_TARGET_READONLY);
11084
+ PM_PARSER_ERR_NODE_FORMAT_CONTENT(parser, target, PM_ERR_WRITE_TARGET_READONLY);
10658
11085
  return target;
10659
11086
  case PM_GLOBAL_VARIABLE_READ_NODE:
10660
11087
  assert(sizeof(pm_global_variable_target_node_t) == sizeof(pm_global_variable_read_node_t));
@@ -10792,7 +11219,7 @@ parse_write(pm_parser_t *parser, pm_node_t *target, pm_token_t *operator, pm_nod
10792
11219
  }
10793
11220
  case PM_BACK_REFERENCE_READ_NODE:
10794
11221
  case PM_NUMBERED_REFERENCE_READ_NODE:
10795
- pm_parser_err_node(parser, target, PM_ERR_WRITE_TARGET_READONLY);
11222
+ PM_PARSER_ERR_NODE_FORMAT_CONTENT(parser, target, PM_ERR_WRITE_TARGET_READONLY);
10796
11223
  /* fallthrough */
10797
11224
  case PM_GLOBAL_VARIABLE_READ_NODE: {
10798
11225
  pm_global_variable_write_node_t *node = pm_global_variable_write_node_create(parser, target, operator, value);
@@ -10866,7 +11293,7 @@ parse_write(pm_parser_t *parser, pm_node_t *target, pm_token_t *operator, pm_nod
10866
11293
  return target;
10867
11294
  }
10868
11295
 
10869
- if (*call->message_loc.start == '_' || parser->encoding->alnum_char(call->message_loc.start, call->message_loc.end - call->message_loc.start)) {
11296
+ if (char_is_identifier_start(parser, call->message_loc.start)) {
10870
11297
  // When we get here, we have a method call, because it was
10871
11298
  // previously marked as a method call but now we have an =. This
10872
11299
  // looks like:
@@ -10967,7 +11394,7 @@ parse_targets(pm_parser_t *parser, pm_node_t *first_target, pm_binding_power_t b
10967
11394
  pm_multi_target_node_targets_append(parser, result, target);
10968
11395
  } else if (!match1(parser, PM_TOKEN_EOF)) {
10969
11396
  // If we get here, then we have a trailing , in a multi target node.
10970
- // We'll set the implicit rest flag to indicate this.
11397
+ // We'll add an implicit rest node to represent this.
10971
11398
  pm_node_t *rest = (pm_node_t *) pm_implicit_rest_node_create(parser, &parser->previous);
10972
11399
  pm_multi_target_node_targets_append(parser, result, rest);
10973
11400
  break;
@@ -10984,6 +11411,7 @@ parse_targets(pm_parser_t *parser, pm_node_t *first_target, pm_binding_power_t b
10984
11411
  static pm_node_t *
10985
11412
  parse_targets_validate(pm_parser_t *parser, pm_node_t *first_target, pm_binding_power_t binding_power) {
10986
11413
  pm_node_t *result = parse_targets(parser, first_target, binding_power);
11414
+ accept1(parser, PM_TOKEN_NEWLINE);
10987
11415
 
10988
11416
  // Ensure that we have either an = or a ) after the targets.
10989
11417
  if (!match2(parser, PM_TOKEN_EQUAL, PM_TOKEN_PARENTHESIS_RIGHT)) {
@@ -11024,7 +11452,7 @@ parse_statements(pm_parser_t *parser, pm_context_t context) {
11024
11452
  break;
11025
11453
  }
11026
11454
 
11027
- // If we have a terminator, then we will parse all consequtive terminators
11455
+ // If we have a terminator, then we will parse all consecutive terminators
11028
11456
  // and then continue parsing the statements list.
11029
11457
  if (accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON)) {
11030
11458
  // If we have a terminator, then we will continue parsing the statements
@@ -11056,8 +11484,13 @@ parse_statements(pm_parser_t *parser, pm_context_t context) {
11056
11484
 
11057
11485
  while (accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON));
11058
11486
  if (context_terminator(context, &parser->current)) break;
11059
- } else {
11060
- expect1(parser, PM_TOKEN_NEWLINE, PM_ERR_EXPECT_EOL_AFTER_STATEMENT);
11487
+ } else if (!accept1(parser, PM_TOKEN_NEWLINE)) {
11488
+ // This is an inlined version of accept1 because the error that we
11489
+ // want to add has varargs. If this happens again, we should
11490
+ // probably extract a helper function.
11491
+ PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_EXPECT_EOL_AFTER_STATEMENT, pm_token_type_human(parser->current.type));
11492
+ parser->previous.start = parser->previous.end;
11493
+ parser->previous.type = PM_TOKEN_MISSING;
11061
11494
  }
11062
11495
  }
11063
11496
 
@@ -11084,8 +11517,9 @@ parse_assocs(pm_parser_t *parser, pm_node_t *node) {
11084
11517
 
11085
11518
  if (token_begins_expression_p(parser->current.type)) {
11086
11519
  value = parse_value_expression(parser, PM_BINDING_POWER_DEFINED, false, PM_ERR_EXPECT_EXPRESSION_AFTER_SPLAT_HASH);
11087
- } else if (pm_parser_local_depth(parser, &operator) == -1) {
11088
- pm_parser_err_token(parser, &operator, PM_ERR_EXPECT_EXPRESSION_AFTER_SPLAT_HASH);
11520
+ }
11521
+ else {
11522
+ pm_parser_scope_forwarding_keywords_check(parser, &operator);
11089
11523
  }
11090
11524
 
11091
11525
  element = (pm_node_t *) pm_assoc_splat_node_create(parser, value, &operator);
@@ -11234,13 +11668,8 @@ parse_arguments(pm_parser_t *parser, pm_arguments_t *arguments, bool accepts_for
11234
11668
  if (token_begins_expression_p(parser->current.type)) {
11235
11669
  expression = parse_value_expression(parser, PM_BINDING_POWER_DEFINED, false, PM_ERR_EXPECT_ARGUMENT);
11236
11670
  } else {
11237
- if (pm_parser_local_depth(parser, &operator) == -1) {
11238
- // A block forwarding in a method having `...` parameter (e.g. `def foo(...); bar(&); end`) is available.
11239
- pm_constant_id_t ellipsis_id = pm_parser_constant_id_constant(parser, "...", 3);
11240
- if (pm_parser_local_depth_constant_id(parser, ellipsis_id) == -1) {
11241
- pm_parser_err_token(parser, &operator, PM_ERR_ARGUMENT_NO_FORWARDING_AMP);
11242
- }
11243
- }
11671
+ // A block forwarding in a method having `...` parameter (e.g. `def foo(...); bar(&); end`) is available.
11672
+ pm_parser_scope_forwarding_block_check(parser, &operator);
11244
11673
  }
11245
11674
 
11246
11675
  argument = (pm_node_t *) pm_block_argument_node_create(parser, &operator, expression);
@@ -11258,10 +11687,7 @@ parse_arguments(pm_parser_t *parser, pm_arguments_t *arguments, bool accepts_for
11258
11687
  pm_token_t operator = parser->previous;
11259
11688
 
11260
11689
  if (match4(parser, PM_TOKEN_PARENTHESIS_RIGHT, PM_TOKEN_COMMA, PM_TOKEN_SEMICOLON, PM_TOKEN_BRACKET_RIGHT)) {
11261
- if (pm_parser_local_depth(parser, &parser->previous) == -1) {
11262
- pm_parser_err_token(parser, &operator, PM_ERR_ARGUMENT_NO_FORWARDING_STAR);
11263
- }
11264
-
11690
+ pm_parser_scope_forwarding_positionals_check(parser, &operator);
11265
11691
  argument = (pm_node_t *) pm_splat_node_create(parser, &operator, NULL);
11266
11692
  } else {
11267
11693
  pm_node_t *expression = parse_value_expression(parser, PM_BINDING_POWER_DEFINED, false, PM_ERR_EXPECT_EXPRESSION_AFTER_SPLAT);
@@ -11287,15 +11713,14 @@ parse_arguments(pm_parser_t *parser, pm_arguments_t *arguments, bool accepts_for
11287
11713
  pm_node_t *right = parse_expression(parser, PM_BINDING_POWER_RANGE, false, PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR);
11288
11714
  argument = (pm_node_t *) pm_range_node_create(parser, NULL, &operator, right);
11289
11715
  } else {
11290
- if (pm_parser_local_depth(parser, &parser->previous) == -1) {
11291
- pm_parser_err_previous(parser, PM_ERR_ARGUMENT_NO_FORWARDING_ELLIPSES);
11292
- }
11716
+ pm_parser_scope_forwarding_all_check(parser, &parser->previous);
11293
11717
  if (parsed_first_argument && terminator == PM_TOKEN_EOF) {
11294
11718
  pm_parser_err_previous(parser, PM_ERR_ARGUMENT_FORWARDING_UNBOUND);
11295
11719
  }
11296
11720
 
11297
11721
  argument = (pm_node_t *) pm_forwarding_arguments_node_create(parser, &parser->previous);
11298
11722
  parse_arguments_append(parser, arguments, argument);
11723
+ arguments->has_forwarding = true;
11299
11724
  parsed_forwarding_arguments = true;
11300
11725
  break;
11301
11726
  }
@@ -11338,6 +11763,9 @@ parse_arguments(pm_parser_t *parser, pm_arguments_t *arguments, bool accepts_for
11338
11763
  }
11339
11764
 
11340
11765
  parsed_bare_hash = true;
11766
+ } else if (accept1(parser, PM_TOKEN_KEYWORD_IN)) {
11767
+ // TODO: Could we solve this with binding powers instead?
11768
+ pm_parser_err_current(parser, PM_ERR_ARGUMENT_IN);
11341
11769
  }
11342
11770
 
11343
11771
  parse_arguments_append(parser, arguments, argument);
@@ -11414,7 +11842,9 @@ parse_required_destructured_parameter(pm_parser_t *parser) {
11414
11842
  if (accept1(parser, PM_TOKEN_IDENTIFIER)) {
11415
11843
  pm_token_t name = parser->previous;
11416
11844
  value = (pm_node_t *) pm_required_parameter_node_create(parser, &name);
11417
- pm_parser_parameter_name_check(parser, &name);
11845
+ if (pm_parser_parameter_name_check(parser, &name)) {
11846
+ pm_node_flag_set_repeated_parameter(value);
11847
+ }
11418
11848
  pm_parser_local_add_token(parser, &name);
11419
11849
  }
11420
11850
 
@@ -11424,7 +11854,9 @@ parse_required_destructured_parameter(pm_parser_t *parser) {
11424
11854
  pm_token_t name = parser->previous;
11425
11855
 
11426
11856
  param = (pm_node_t *) pm_required_parameter_node_create(parser, &name);
11427
- pm_parser_parameter_name_check(parser, &name);
11857
+ if (pm_parser_parameter_name_check(parser, &name)) {
11858
+ pm_node_flag_set_repeated_parameter(param);
11859
+ }
11428
11860
  pm_parser_local_add_token(parser, &name);
11429
11861
  }
11430
11862
 
@@ -11541,19 +11973,20 @@ parse_parameters(
11541
11973
  pm_token_t operator = parser->previous;
11542
11974
  pm_token_t name;
11543
11975
 
11976
+ bool repeated = false;
11544
11977
  if (accept1(parser, PM_TOKEN_IDENTIFIER)) {
11545
11978
  name = parser->previous;
11546
- pm_parser_parameter_name_check(parser, &name);
11979
+ repeated = pm_parser_parameter_name_check(parser, &name);
11547
11980
  pm_parser_local_add_token(parser, &name);
11548
11981
  } else {
11549
11982
  name = not_provided(parser);
11550
-
11551
- if (allows_forwarding_parameters) {
11552
- pm_parser_local_add_token(parser, &operator);
11553
- }
11983
+ parser->current_scope->forwarding_params |= PM_FORWARDING_BLOCK;
11554
11984
  }
11555
11985
 
11556
11986
  pm_block_parameter_node_t *param = pm_block_parameter_node_create(parser, &name, &operator);
11987
+ if (repeated) {
11988
+ pm_node_flag_set_repeated_parameter((pm_node_t *)param);
11989
+ }
11557
11990
  if (params->block == NULL) {
11558
11991
  pm_parameters_node_block_set(params, param);
11559
11992
  } else {
@@ -11572,9 +12005,8 @@ parse_parameters(
11572
12005
  update_parameter_state(parser, &parser->current, &order);
11573
12006
  parser_lex(parser);
11574
12007
 
11575
- if (allows_forwarding_parameters) {
11576
- pm_parser_local_add_token(parser, &parser->previous);
11577
- }
12008
+ parser->current_scope->forwarding_params |= PM_FORWARDING_BLOCK;
12009
+ parser->current_scope->forwarding_params |= PM_FORWARDING_ALL;
11578
12010
 
11579
12011
  pm_forwarding_parameter_node_t *param = pm_forwarding_parameter_node_create(parser, &parser->previous);
11580
12012
  if (params->keyword_rest != NULL) {
@@ -11626,20 +12058,23 @@ parse_parameters(
11626
12058
  }
11627
12059
 
11628
12060
  pm_token_t name = parser->previous;
11629
- pm_parser_parameter_name_check(parser, &name);
12061
+ bool repeated = pm_parser_parameter_name_check(parser, &name);
11630
12062
  pm_parser_local_add_token(parser, &name);
11631
12063
 
11632
12064
  if (accept1(parser, PM_TOKEN_EQUAL)) {
11633
12065
  pm_token_t operator = parser->previous;
11634
12066
  context_push(parser, PM_CONTEXT_DEFAULT_PARAMS);
11635
- pm_constant_id_t old_param_name = parser->current_param_name;
11636
- parser->current_param_name = pm_parser_constant_id_token(parser, &name);
12067
+
12068
+ pm_constant_id_t saved_param_name = pm_parser_current_param_name_set(parser, pm_parser_constant_id_token(parser, &name));
11637
12069
  pm_node_t *value = parse_value_expression(parser, binding_power, false, PM_ERR_PARAMETER_NO_DEFAULT);
11638
12070
 
11639
12071
  pm_optional_parameter_node_t *param = pm_optional_parameter_node_create(parser, &name, &operator, value);
12072
+ if (repeated) {
12073
+ pm_node_flag_set_repeated_parameter((pm_node_t *)param);
12074
+ }
11640
12075
  pm_parameters_node_optionals_append(params, param);
11641
12076
 
11642
- parser->current_param_name = old_param_name;
12077
+ pm_parser_current_param_name_restore(parser, saved_param_name);
11643
12078
  context_pop(parser);
11644
12079
 
11645
12080
  // If parsing the value of the parameter resulted in error recovery,
@@ -11651,9 +12086,15 @@ parse_parameters(
11651
12086
  }
11652
12087
  } else if (order > PM_PARAMETERS_ORDER_AFTER_OPTIONAL) {
11653
12088
  pm_required_parameter_node_t *param = pm_required_parameter_node_create(parser, &name);
12089
+ if (repeated) {
12090
+ pm_node_flag_set_repeated_parameter((pm_node_t *)param);
12091
+ }
11654
12092
  pm_parameters_node_requireds_append(params, (pm_node_t *) param);
11655
12093
  } else {
11656
12094
  pm_required_parameter_node_t *param = pm_required_parameter_node_create(parser, &name);
12095
+ if (repeated) {
12096
+ pm_node_flag_set_repeated_parameter((pm_node_t *)param);
12097
+ }
11657
12098
  pm_parameters_node_posts_append(params, (pm_node_t *) param);
11658
12099
  }
11659
12100
 
@@ -11668,7 +12109,7 @@ parse_parameters(
11668
12109
  pm_token_t local = name;
11669
12110
  local.end -= 1;
11670
12111
 
11671
- pm_parser_parameter_name_check(parser, &local);
12112
+ bool repeated = pm_parser_parameter_name_check(parser, &local);
11672
12113
  pm_parser_local_add_token(parser, &local);
11673
12114
 
11674
12115
  switch (parser->current.type) {
@@ -11676,6 +12117,9 @@ parse_parameters(
11676
12117
  case PM_TOKEN_PARENTHESIS_RIGHT:
11677
12118
  case PM_TOKEN_PIPE: {
11678
12119
  pm_node_t *param = (pm_node_t *) pm_required_keyword_parameter_node_create(parser, &name);
12120
+ if (repeated) {
12121
+ pm_node_flag_set_repeated_parameter(param);
12122
+ }
11679
12123
  pm_parameters_node_keywords_append(params, param);
11680
12124
  break;
11681
12125
  }
@@ -11687,6 +12131,9 @@ parse_parameters(
11687
12131
  }
11688
12132
 
11689
12133
  pm_node_t *param = (pm_node_t *) pm_required_keyword_parameter_node_create(parser, &name);
12134
+ if (repeated) {
12135
+ pm_node_flag_set_repeated_parameter(param);
12136
+ }
11690
12137
  pm_parameters_node_keywords_append(params, param);
11691
12138
  break;
11692
12139
  }
@@ -11695,17 +12142,22 @@ parse_parameters(
11695
12142
 
11696
12143
  if (token_begins_expression_p(parser->current.type)) {
11697
12144
  context_push(parser, PM_CONTEXT_DEFAULT_PARAMS);
11698
- pm_constant_id_t old_param_name = parser->current_param_name;
11699
- parser->current_param_name = pm_parser_constant_id_token(parser, &local);
12145
+
12146
+ pm_constant_id_t saved_param_name = pm_parser_current_param_name_set(parser, pm_parser_constant_id_token(parser, &local));
11700
12147
  pm_node_t *value = parse_value_expression(parser, binding_power, false, PM_ERR_PARAMETER_NO_DEFAULT_KW);
11701
- parser->current_param_name = old_param_name;
12148
+
12149
+ pm_parser_current_param_name_restore(parser, saved_param_name);
11702
12150
  context_pop(parser);
12151
+
11703
12152
  param = (pm_node_t *) pm_optional_keyword_parameter_node_create(parser, &name, value);
11704
12153
  }
11705
12154
  else {
11706
12155
  param = (pm_node_t *) pm_required_keyword_parameter_node_create(parser, &name);
11707
12156
  }
11708
12157
 
12158
+ if (repeated) {
12159
+ pm_node_flag_set_repeated_parameter(param);
12160
+ }
11709
12161
  pm_parameters_node_keywords_append(params, param);
11710
12162
 
11711
12163
  // If parsing the value of the parameter resulted in error recovery,
@@ -11728,20 +12180,21 @@ parse_parameters(
11728
12180
 
11729
12181
  pm_token_t operator = parser->previous;
11730
12182
  pm_token_t name;
11731
-
12183
+ bool repeated = false;
11732
12184
  if (accept1(parser, PM_TOKEN_IDENTIFIER)) {
11733
12185
  name = parser->previous;
11734
- pm_parser_parameter_name_check(parser, &name);
12186
+ repeated = pm_parser_parameter_name_check(parser, &name);
11735
12187
  pm_parser_local_add_token(parser, &name);
11736
12188
  } else {
11737
12189
  name = not_provided(parser);
11738
12190
 
11739
- if (allows_forwarding_parameters) {
11740
- pm_parser_local_add_token(parser, &operator);
11741
- }
12191
+ parser->current_scope->forwarding_params |= PM_FORWARDING_POSITIONALS;
11742
12192
  }
11743
12193
 
11744
12194
  pm_node_t *param = (pm_node_t *) pm_rest_parameter_node_create(parser, &operator, &name);
12195
+ if (repeated) {
12196
+ pm_node_flag_set_repeated_parameter(param);
12197
+ }
11745
12198
  if (params->rest == NULL) {
11746
12199
  pm_parameters_node_rest_set(params, param);
11747
12200
  } else {
@@ -11764,19 +12217,21 @@ parse_parameters(
11764
12217
  } else {
11765
12218
  pm_token_t name;
11766
12219
 
12220
+ bool repeated = false;
11767
12221
  if (accept1(parser, PM_TOKEN_IDENTIFIER)) {
11768
12222
  name = parser->previous;
11769
- pm_parser_parameter_name_check(parser, &name);
12223
+ repeated = pm_parser_parameter_name_check(parser, &name);
11770
12224
  pm_parser_local_add_token(parser, &name);
11771
12225
  } else {
11772
12226
  name = not_provided(parser);
11773
12227
 
11774
- if (allows_forwarding_parameters) {
11775
- pm_parser_local_add_token(parser, &operator);
11776
- }
12228
+ parser->current_scope->forwarding_params |= PM_FORWARDING_KEYWORDS;
11777
12229
  }
11778
12230
 
11779
12231
  param = (pm_node_t *) pm_keyword_rest_parameter_node_create(parser, &operator, &name);
12232
+ if (repeated) {
12233
+ pm_node_flag_set_repeated_parameter(param);
12234
+ }
11780
12235
  }
11781
12236
 
11782
12237
  if (params->keyword_rest == NULL) {
@@ -11964,25 +12419,10 @@ parse_rescues(pm_parser_t *parser, pm_begin_node_t *parent_node, bool def_p) {
11964
12419
  }
11965
12420
 
11966
12421
  static inline pm_begin_node_t *
11967
- parse_rescues_as_begin(pm_parser_t *parser, pm_statements_node_t *statements, bool def_p) {
12422
+ parse_rescues_as_begin(pm_parser_t *parser, const uint8_t *start, pm_statements_node_t *statements, bool def_p) {
11968
12423
  pm_token_t no_begin_token = not_provided(parser);
11969
12424
  pm_begin_node_t *begin_node = pm_begin_node_create(parser, &no_begin_token, statements);
11970
12425
  parse_rescues(parser, begin_node, def_p);
11971
-
11972
- // All nodes within a begin node are optional, so we look
11973
- // for the earliest possible node that we can use to set
11974
- // the BeginNode's start location
11975
- const uint8_t *start = begin_node->base.location.start;
11976
- if (begin_node->statements) {
11977
- start = begin_node->statements->base.location.start;
11978
- } else if (begin_node->rescue_clause) {
11979
- start = begin_node->rescue_clause->base.location.start;
11980
- } else if (begin_node->else_clause) {
11981
- start = begin_node->else_clause->base.location.start;
11982
- } else if (begin_node->ensure_clause) {
11983
- start = begin_node->ensure_clause->base.location.start;
11984
- }
11985
-
11986
12426
  begin_node->base.location.start = start;
11987
12427
  return begin_node;
11988
12428
  }
@@ -12012,10 +12452,13 @@ parse_block_parameters(
12012
12452
  if ((opening->type != PM_TOKEN_NOT_PROVIDED) && accept1(parser, PM_TOKEN_SEMICOLON)) {
12013
12453
  do {
12014
12454
  expect1(parser, PM_TOKEN_IDENTIFIER, PM_ERR_BLOCK_PARAM_LOCAL_VARIABLE);
12015
- pm_parser_parameter_name_check(parser, &parser->previous);
12455
+ bool repeated = pm_parser_parameter_name_check(parser, &parser->previous);
12016
12456
  pm_parser_local_add_token(parser, &parser->previous);
12017
12457
 
12018
12458
  pm_block_local_variable_node_t *local = pm_block_local_variable_node_create(parser, &parser->previous);
12459
+ if (repeated) {
12460
+ pm_node_flag_set_repeated_parameter((pm_node_t *)local);
12461
+ }
12019
12462
  pm_block_parameters_node_append_local(block_parameters, local);
12020
12463
  } while (accept1(parser, PM_TOKEN_COMMA));
12021
12464
  }
@@ -12031,8 +12474,10 @@ parse_block(pm_parser_t *parser) {
12031
12474
  pm_token_t opening = parser->previous;
12032
12475
  accept1(parser, PM_TOKEN_NEWLINE);
12033
12476
 
12477
+ pm_constant_id_t saved_param_name = pm_parser_current_param_name_unset(parser);
12034
12478
  pm_accepts_block_stack_push(parser, true);
12035
12479
  pm_parser_scope_push(parser, false);
12480
+
12036
12481
  pm_block_parameters_node_t *block_parameters = NULL;
12037
12482
 
12038
12483
  if (accept1(parser, PM_TOKEN_PIPE)) {
@@ -12053,12 +12498,6 @@ parse_block(pm_parser_t *parser) {
12053
12498
  pm_block_parameters_node_closing_set(block_parameters, &parser->previous);
12054
12499
  }
12055
12500
 
12056
- uint32_t locals_body_index = 0;
12057
-
12058
- if (block_parameters) {
12059
- locals_body_index = (uint32_t) parser->current_scope->locals.size;
12060
- }
12061
-
12062
12501
  accept1(parser, PM_TOKEN_NEWLINE);
12063
12502
  pm_node_t *statements = NULL;
12064
12503
 
@@ -12078,7 +12517,7 @@ parse_block(pm_parser_t *parser) {
12078
12517
 
12079
12518
  if (match2(parser, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ENSURE)) {
12080
12519
  assert(statements == NULL || PM_NODE_TYPE_P(statements, PM_STATEMENTS_NODE));
12081
- statements = (pm_node_t *) parse_rescues_as_begin(parser, (pm_statements_node_t *) statements, false);
12520
+ statements = (pm_node_t *) parse_rescues_as_begin(parser, opening.start, (pm_statements_node_t *) statements, false);
12082
12521
  }
12083
12522
  }
12084
12523
 
@@ -12090,13 +12529,14 @@ parse_block(pm_parser_t *parser) {
12090
12529
 
12091
12530
  if (parameters == NULL && (maximum > 0)) {
12092
12531
  parameters = (pm_node_t *) pm_numbered_parameters_node_create(parser, &(pm_location_t) { .start = opening.start, .end = parser->previous.end }, maximum);
12093
- locals_body_index = maximum;
12094
12532
  }
12095
12533
 
12096
12534
  pm_constant_id_list_t locals = parser->current_scope->locals;
12097
12535
  pm_parser_scope_pop(parser);
12098
12536
  pm_accepts_block_stack_pop(parser);
12099
- return pm_block_node_create(parser, &locals, locals_body_index, &opening, parameters, statements, &parser->previous);
12537
+ pm_parser_current_param_name_restore(parser, saved_param_name);
12538
+
12539
+ return pm_block_node_create(parser, &locals, &opening, parameters, statements, &parser->previous);
12100
12540
  }
12101
12541
 
12102
12542
  /**
@@ -12157,14 +12597,20 @@ parse_arguments_list(pm_parser_t *parser, pm_arguments_t *arguments, bool accept
12157
12597
  }
12158
12598
 
12159
12599
  if (block != NULL) {
12160
- if (arguments->block == NULL) {
12600
+ if (arguments->block == NULL && !arguments->has_forwarding) {
12161
12601
  arguments->block = (pm_node_t *) block;
12162
12602
  } else {
12163
- pm_parser_err_node(parser, (pm_node_t *) block, PM_ERR_ARGUMENT_BLOCK_MULTI);
12164
- if (arguments->arguments == NULL) {
12165
- arguments->arguments = pm_arguments_node_create(parser);
12603
+ if (arguments->has_forwarding) {
12604
+ pm_parser_err_node(parser, (pm_node_t *) block, PM_ERR_ARGUMENT_BLOCK_FORWARDING);
12605
+ } else {
12606
+ pm_parser_err_node(parser, (pm_node_t *) block, PM_ERR_ARGUMENT_BLOCK_MULTI);
12607
+ }
12608
+ if (arguments->block != NULL) {
12609
+ if (arguments->arguments == NULL) {
12610
+ arguments->arguments = pm_arguments_node_create(parser);
12611
+ }
12612
+ pm_arguments_node_arguments_append(arguments->arguments, arguments->block);
12166
12613
  }
12167
- pm_arguments_node_arguments_append(arguments->arguments, arguments->block);
12168
12614
  arguments->block = (pm_node_t *) block;
12169
12615
  }
12170
12616
  }
@@ -12384,8 +12830,14 @@ static inline pm_node_flags_t
12384
12830
  parse_unescaped_encoding(const pm_parser_t *parser) {
12385
12831
  if (parser->explicit_encoding != NULL) {
12386
12832
  if (parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) {
12833
+ // If the there's an explicit encoding and it's using a UTF-8 escape
12834
+ // sequence, then mark the string as UTF-8.
12387
12835
  return PM_STRING_FLAGS_FORCED_UTF8_ENCODING;
12388
12836
  } else if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY) {
12837
+ // If there's a non-UTF-8 escape sequence being used, then the
12838
+ // string uses the source encoding, unless the source is marked as
12839
+ // US-ASCII. In that case the string is forced as ASCII-8BIT in
12840
+ // order to keep the string valid.
12389
12841
  return PM_STRING_FLAGS_FORCED_BINARY_ENCODING;
12390
12842
  }
12391
12843
  }
@@ -12509,14 +12961,54 @@ parse_string_part(pm_parser_t *parser) {
12509
12961
  }
12510
12962
  }
12511
12963
 
12964
+ /**
12965
+ * When creating a symbol, unary operators that cannot be binary operators
12966
+ * automatically drop trailing `@` characters. This happens at the parser level,
12967
+ * such that `~@` is parsed as `~` and `!@` is parsed as `!`. We do that here.
12968
+ */
12969
+ static const uint8_t *
12970
+ parse_operator_symbol_name(const pm_token_t *name) {
12971
+ switch (name->type) {
12972
+ case PM_TOKEN_TILDE:
12973
+ case PM_TOKEN_BANG:
12974
+ if (name->end[-1] == '@') return name->end - 1;
12975
+ /* fallthrough */
12976
+ default:
12977
+ return name->end;
12978
+ }
12979
+ }
12980
+
12981
+ static pm_node_t *
12982
+ parse_operator_symbol(pm_parser_t *parser, const pm_token_t *opening, pm_lex_state_t next_state) {
12983
+ pm_token_t closing = not_provided(parser);
12984
+ pm_symbol_node_t *symbol = pm_symbol_node_create(parser, opening, &parser->current, &closing);
12985
+
12986
+ const uint8_t *end = parse_operator_symbol_name(&parser->current);
12987
+
12988
+ if (next_state != PM_LEX_STATE_NONE) lex_state_set(parser, next_state);
12989
+ parser_lex(parser);
12990
+
12991
+ pm_string_shared_init(&symbol->unescaped, parser->previous.start, end);
12992
+ pm_node_flag_set((pm_node_t *) symbol, PM_SYMBOL_FLAGS_FORCED_US_ASCII_ENCODING);
12993
+
12994
+ return (pm_node_t *) symbol;
12995
+ }
12996
+
12997
+ /**
12998
+ * Parse a symbol node. This function will get called immediately after finding
12999
+ * a symbol opening token. This handles parsing bare symbols and interpolated
13000
+ * symbols.
13001
+ */
12512
13002
  static pm_node_t *
12513
13003
  parse_symbol(pm_parser_t *parser, pm_lex_mode_t *lex_mode, pm_lex_state_t next_state) {
12514
- pm_token_t opening = parser->previous;
13004
+ const pm_token_t opening = parser->previous;
12515
13005
 
12516
13006
  if (lex_mode->mode != PM_LEX_STRING) {
12517
13007
  if (next_state != PM_LEX_STATE_NONE) lex_state_set(parser, next_state);
12518
13008
 
12519
13009
  switch (parser->current.type) {
13010
+ case PM_CASE_OPERATOR:
13011
+ return parse_operator_symbol(parser, &opening, next_state == PM_LEX_STATE_NONE ? PM_LEX_STATE_ENDFN : next_state);
12520
13012
  case PM_TOKEN_IDENTIFIER:
12521
13013
  case PM_TOKEN_CONSTANT:
12522
13014
  case PM_TOKEN_INSTANCE_VARIABLE:
@@ -12528,10 +13020,6 @@ parse_symbol(pm_parser_t *parser, pm_lex_mode_t *lex_mode, pm_lex_state_t next_s
12528
13020
  case PM_CASE_KEYWORD:
12529
13021
  parser_lex(parser);
12530
13022
  break;
12531
- case PM_CASE_OPERATOR:
12532
- lex_state_set(parser, next_state == PM_LEX_STATE_NONE ? PM_LEX_STATE_ENDFN : next_state);
12533
- parser_lex(parser);
12534
- break;
12535
13023
  default:
12536
13024
  expect2(parser, PM_TOKEN_IDENTIFIER, PM_TOKEN_METHOD_NAME, PM_ERR_SYMBOL_INVALID);
12537
13025
  break;
@@ -12541,6 +13029,8 @@ parse_symbol(pm_parser_t *parser, pm_lex_mode_t *lex_mode, pm_lex_state_t next_s
12541
13029
  pm_symbol_node_t *symbol = pm_symbol_node_create(parser, &opening, &parser->previous, &closing);
12542
13030
 
12543
13031
  pm_string_shared_init(&symbol->unescaped, parser->previous.start, parser->previous.end);
13032
+ pm_node_flag_set((pm_node_t *) symbol, parse_symbol_encoding(parser, &symbol->unescaped));
13033
+
12544
13034
  return (pm_node_t *) symbol;
12545
13035
  }
12546
13036
 
@@ -12637,7 +13127,8 @@ parse_symbol(pm_parser_t *parser, pm_lex_mode_t *lex_mode, pm_lex_state_t next_s
12637
13127
  } else {
12638
13128
  expect1(parser, PM_TOKEN_STRING_END, PM_ERR_SYMBOL_TERM_DYNAMIC);
12639
13129
  }
12640
- return (pm_node_t *) pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped);
13130
+
13131
+ return (pm_node_t *) pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped, parse_symbol_encoding(parser, &unescaped));
12641
13132
  }
12642
13133
 
12643
13134
  /**
@@ -12647,8 +13138,11 @@ parse_symbol(pm_parser_t *parser, pm_lex_mode_t *lex_mode, pm_lex_state_t next_s
12647
13138
  static inline pm_node_t *
12648
13139
  parse_undef_argument(pm_parser_t *parser) {
12649
13140
  switch (parser->current.type) {
13141
+ case PM_CASE_OPERATOR: {
13142
+ const pm_token_t opening = not_provided(parser);
13143
+ return parse_operator_symbol(parser, &opening, PM_LEX_STATE_NONE);
13144
+ }
12650
13145
  case PM_CASE_KEYWORD:
12651
- case PM_CASE_OPERATOR:
12652
13146
  case PM_TOKEN_CONSTANT:
12653
13147
  case PM_TOKEN_IDENTIFIER:
12654
13148
  case PM_TOKEN_METHOD_NAME: {
@@ -12659,6 +13153,8 @@ parse_undef_argument(pm_parser_t *parser) {
12659
13153
  pm_symbol_node_t *symbol = pm_symbol_node_create(parser, &opening, &parser->previous, &closing);
12660
13154
 
12661
13155
  pm_string_shared_init(&symbol->unescaped, parser->previous.start, parser->previous.end);
13156
+ pm_node_flag_set((pm_node_t *) symbol, parse_symbol_encoding(parser, &symbol->unescaped));
13157
+
12662
13158
  return (pm_node_t *) symbol;
12663
13159
  }
12664
13160
  case PM_TOKEN_SYMBOL_BEGIN: {
@@ -12682,21 +13178,24 @@ parse_undef_argument(pm_parser_t *parser) {
12682
13178
  static inline pm_node_t *
12683
13179
  parse_alias_argument(pm_parser_t *parser, bool first) {
12684
13180
  switch (parser->current.type) {
12685
- case PM_CASE_OPERATOR:
13181
+ case PM_CASE_OPERATOR: {
13182
+ const pm_token_t opening = not_provided(parser);
13183
+ return parse_operator_symbol(parser, &opening, first ? PM_LEX_STATE_FNAME | PM_LEX_STATE_FITEM : PM_LEX_STATE_NONE);
13184
+ }
12686
13185
  case PM_CASE_KEYWORD:
12687
13186
  case PM_TOKEN_CONSTANT:
12688
13187
  case PM_TOKEN_IDENTIFIER:
12689
13188
  case PM_TOKEN_METHOD_NAME: {
12690
- if (first) {
12691
- lex_state_set(parser, PM_LEX_STATE_FNAME | PM_LEX_STATE_FITEM);
12692
- }
12693
-
13189
+ if (first) lex_state_set(parser, PM_LEX_STATE_FNAME | PM_LEX_STATE_FITEM);
12694
13190
  parser_lex(parser);
13191
+
12695
13192
  pm_token_t opening = not_provided(parser);
12696
13193
  pm_token_t closing = not_provided(parser);
12697
13194
  pm_symbol_node_t *symbol = pm_symbol_node_create(parser, &opening, &parser->previous, &closing);
12698
13195
 
12699
13196
  pm_string_shared_init(&symbol->unescaped, parser->previous.start, parser->previous.end);
13197
+ pm_node_flag_set((pm_node_t *) symbol, parse_symbol_encoding(parser, &symbol->unescaped));
13198
+
12700
13199
  return (pm_node_t *) symbol;
12701
13200
  }
12702
13201
  case PM_TOKEN_SYMBOL_BEGIN: {
@@ -12733,6 +13232,64 @@ outer_scope_using_numbered_parameters_p(pm_parser_t *parser) {
12733
13232
  return false;
12734
13233
  }
12735
13234
 
13235
+ /**
13236
+ * These are the names of the various numbered parameters. We have them here so
13237
+ * that when we insert them into the constant pool we can use a constant string
13238
+ * and not have to allocate.
13239
+ */
13240
+ static const char * const pm_numbered_parameter_names[] = {
13241
+ "_1", "_2", "_3", "_4", "_5", "_6", "_7", "_8", "_9"
13242
+ };
13243
+
13244
+ /**
13245
+ * Parse an identifier into either a local variable read. If the local variable
13246
+ * is not found, it returns NULL instead.
13247
+ */
13248
+ static pm_local_variable_read_node_t *
13249
+ parse_variable(pm_parser_t *parser) {
13250
+ int depth;
13251
+ if ((depth = pm_parser_local_depth(parser, &parser->previous)) != -1) {
13252
+ return pm_local_variable_read_node_create(parser, &parser->previous, (uint32_t) depth);
13253
+ }
13254
+
13255
+ if (!parser->current_scope->closed && pm_token_is_numbered_parameter(parser->previous.start, parser->previous.end)) {
13256
+ // Now that we know we have a numbered parameter, we need to check
13257
+ // if it's allowed in this context. If it is, then we will create a
13258
+ // local variable read. If it's not, then we'll create a normal call
13259
+ // node but add an error.
13260
+ if (parser->current_scope->explicit_params) {
13261
+ pm_parser_err_previous(parser, PM_ERR_NUMBERED_PARAMETER_NOT_ALLOWED);
13262
+ } else if (outer_scope_using_numbered_parameters_p(parser)) {
13263
+ pm_parser_err_previous(parser, PM_ERR_NUMBERED_PARAMETER_OUTER_SCOPE);
13264
+ } else {
13265
+ // Indicate that this scope is using numbered params so that child
13266
+ // scopes cannot. We subtract the value for the character '0' to get
13267
+ // the actual integer value of the number (only _1 through _9 are
13268
+ // valid).
13269
+ uint8_t numbered_parameters = (uint8_t) (parser->previous.start[1] - '0');
13270
+ if (numbered_parameters > parser->current_scope->numbered_parameters) {
13271
+ parser->current_scope->numbered_parameters = numbered_parameters;
13272
+ pm_parser_numbered_parameters_set(parser, numbered_parameters);
13273
+ }
13274
+
13275
+ // When you use a numbered parameter, it implies the existence
13276
+ // of all of the locals that exist before it. For example,
13277
+ // referencing _2 means that _1 must exist. Therefore here we
13278
+ // loop through all of the possibilities and add them into the
13279
+ // constant pool.
13280
+ for (uint8_t numbered_parameter = 1; numbered_parameter <= numbered_parameters - 1; numbered_parameter++) {
13281
+ pm_parser_local_add_constant(parser, pm_numbered_parameter_names[numbered_parameter - 1], 2);
13282
+ }
13283
+
13284
+ // Finally we can create the local variable read node.
13285
+ pm_constant_id_t name_id = pm_parser_local_add_constant(parser, pm_numbered_parameter_names[numbered_parameters - 1], 2);
13286
+ return pm_local_variable_read_node_create_constant_id(parser, &parser->previous, name_id, 0);
13287
+ }
13288
+ }
13289
+
13290
+ return NULL;
13291
+ }
13292
+
12736
13293
  /**
12737
13294
  * Parse an identifier into either a local variable read or a call.
12738
13295
  */
@@ -12741,56 +13298,8 @@ parse_variable_call(pm_parser_t *parser) {
12741
13298
  pm_node_flags_t flags = 0;
12742
13299
 
12743
13300
  if (!match1(parser, PM_TOKEN_PARENTHESIS_LEFT) && (parser->previous.end[-1] != '!') && (parser->previous.end[-1] != '?')) {
12744
- int depth;
12745
- if ((depth = pm_parser_local_depth(parser, &parser->previous)) != -1) {
12746
- return (pm_node_t *) pm_local_variable_read_node_create(parser, &parser->previous, (uint32_t) depth);
12747
- }
12748
-
12749
- if (!parser->current_scope->closed && pm_token_is_numbered_parameter(parser->previous.start, parser->previous.end)) {
12750
- // Now that we know we have a numbered parameter, we need to check
12751
- // if it's allowed in this context. If it is, then we will create a
12752
- // local variable read. If it's not, then we'll create a normal call
12753
- // node but add an error.
12754
- if (parser->current_scope->explicit_params) {
12755
- pm_parser_err_previous(parser, PM_ERR_NUMBERED_PARAMETER_NOT_ALLOWED);
12756
- } else if (outer_scope_using_numbered_parameters_p(parser)) {
12757
- pm_parser_err_previous(parser, PM_ERR_NUMBERED_PARAMETER_OUTER_SCOPE);
12758
- } else {
12759
- // Indicate that this scope is using numbered params so that child
12760
- // scopes cannot.
12761
- uint8_t number = parser->previous.start[1];
12762
-
12763
- // We subtract the value for the character '0' to get the actual
12764
- // integer value of the number (only _1 through _9 are valid)
12765
- uint8_t numbered_parameters = (uint8_t) (number - '0');
12766
- if (numbered_parameters > parser->current_scope->numbered_parameters) {
12767
- parser->current_scope->numbered_parameters = numbered_parameters;
12768
- pm_parser_numbered_parameters_set(parser, numbered_parameters);
12769
- }
12770
-
12771
- // When you use a numbered parameter, it implies the existence
12772
- // of all of the locals that exist before it. For example,
12773
- // referencing _2 means that _1 must exist. Therefore here we
12774
- // loop through all of the possibilities and add them into the
12775
- // constant pool.
12776
- uint8_t current = '1';
12777
- uint8_t *value;
12778
-
12779
- while (current < number) {
12780
- value = malloc(2);
12781
- value[0] = '_';
12782
- value[1] = current++;
12783
- pm_parser_local_add_owned(parser, value, 2);
12784
- }
12785
-
12786
- // Now we can add the actual token that is being used. For
12787
- // this one we can add a shared version since it is directly
12788
- // referenced in the source.
12789
- pm_parser_local_add_token(parser, &parser->previous);
12790
- return (pm_node_t *) pm_local_variable_read_node_create(parser, &parser->previous, 0);
12791
- }
12792
- }
12793
-
13301
+ pm_local_variable_read_node_t *node = parse_variable(parser);
13302
+ if (node != NULL) return (pm_node_t *) node;
12794
13303
  flags |= PM_CALL_NODE_FLAGS_VARIABLE_CALL;
12795
13304
  }
12796
13305
 
@@ -13076,43 +13585,77 @@ parse_pattern_keyword_rest(pm_parser_t *parser) {
13076
13585
  return (pm_node_t *) pm_assoc_splat_node_create(parser, value, &operator);
13077
13586
  }
13078
13587
 
13588
+ /**
13589
+ * Create an implicit node for the value of a hash pattern that has omitted the
13590
+ * value. This will use an implicit local variable target.
13591
+ */
13592
+ static pm_node_t *
13593
+ parse_pattern_hash_implicit_value(pm_parser_t *parser, pm_symbol_node_t *key) {
13594
+ const pm_location_t *value_loc = &((pm_symbol_node_t *) key)->value_loc;
13595
+ pm_constant_id_t name = pm_parser_constant_id_location(parser, value_loc->start, value_loc->end);
13596
+
13597
+ int current_depth = pm_parser_local_depth_constant_id(parser, name);
13598
+ uint32_t depth;
13599
+
13600
+ if (current_depth == -1) {
13601
+ pm_parser_local_add_location(parser, value_loc->start, value_loc->end);
13602
+ depth = 0;
13603
+ } else {
13604
+ depth = (uint32_t) current_depth;
13605
+ }
13606
+
13607
+ pm_local_variable_target_node_t *target = pm_local_variable_target_node_create_values(parser, value_loc, name, depth);
13608
+ return (pm_node_t *) pm_implicit_node_create(parser, (pm_node_t *) target);
13609
+ }
13610
+
13079
13611
  /**
13080
13612
  * Parse a hash pattern.
13081
13613
  */
13082
13614
  static pm_hash_pattern_node_t *
13083
- parse_pattern_hash(pm_parser_t *parser, pm_node_t *first_assoc) {
13615
+ parse_pattern_hash(pm_parser_t *parser, pm_node_t *first_node) {
13084
13616
  pm_node_list_t assocs = { 0 };
13085
13617
  pm_node_t *rest = NULL;
13086
13618
 
13087
- switch (PM_NODE_TYPE(first_assoc)) {
13088
- case PM_ASSOC_NODE: {
13089
- if (!match7(parser, PM_TOKEN_COMMA, PM_TOKEN_KEYWORD_THEN, PM_TOKEN_BRACE_RIGHT, PM_TOKEN_BRACKET_RIGHT, PM_TOKEN_PARENTHESIS_RIGHT, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON)) {
13090
- // Here we have a value for the first assoc in the list, so we will
13091
- // parse it now and update the first assoc.
13092
- pm_node_t *value = parse_pattern(parser, false, PM_ERR_PATTERN_EXPRESSION_AFTER_KEY);
13619
+ switch (PM_NODE_TYPE(first_node)) {
13620
+ case PM_ASSOC_SPLAT_NODE:
13621
+ case PM_NO_KEYWORDS_PARAMETER_NODE:
13622
+ rest = first_node;
13623
+ break;
13624
+ case PM_SYMBOL_NODE: {
13625
+ if (pm_symbol_node_label_p(first_node)) {
13626
+ pm_node_t *value;
13627
+
13628
+ if (!match7(parser, PM_TOKEN_COMMA, PM_TOKEN_KEYWORD_THEN, PM_TOKEN_BRACE_RIGHT, PM_TOKEN_BRACKET_RIGHT, PM_TOKEN_PARENTHESIS_RIGHT, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON)) {
13629
+ // Here we have a value for the first assoc in the list, so
13630
+ // we will parse it now.
13631
+ value = parse_pattern(parser, false, PM_ERR_PATTERN_EXPRESSION_AFTER_KEY);
13632
+ } else {
13633
+ // Otherwise, we will create an implicit local variable
13634
+ // target for the value.
13635
+ value = parse_pattern_hash_implicit_value(parser, (pm_symbol_node_t *) first_node);
13636
+ }
13093
13637
 
13094
- pm_assoc_node_t *assoc = (pm_assoc_node_t *) first_assoc;
13095
- assoc->base.location.end = value->location.end;
13096
- assoc->value = value;
13097
- } else {
13098
- pm_node_t *key = ((pm_assoc_node_t *) first_assoc)->key;
13638
+ pm_token_t operator = not_provided(parser);
13639
+ pm_node_t *assoc = (pm_node_t *) pm_assoc_node_create(parser, first_node, &operator, value);
13099
13640
 
13100
- if (PM_NODE_TYPE_P(key, PM_SYMBOL_NODE)) {
13101
- const pm_location_t *value_loc = &((pm_symbol_node_t *) key)->value_loc;
13102
- pm_parser_local_add_location(parser, value_loc->start, value_loc->end);
13103
- }
13641
+ pm_node_list_append(&assocs, assoc);
13642
+ break;
13104
13643
  }
13644
+ }
13645
+ /* fallthrough */
13646
+ default: {
13647
+ // If we get anything else, then this is an error. For this we'll
13648
+ // create a missing node for the value and create an assoc node for
13649
+ // the first node in the list.
13650
+ pm_parser_err_node(parser, first_node, PM_ERR_PATTERN_HASH_KEY_LABEL);
13651
+
13652
+ pm_token_t operator = not_provided(parser);
13653
+ pm_node_t *value = (pm_node_t *) pm_missing_node_create(parser, first_node->location.start, first_node->location.end);
13654
+ pm_node_t *assoc = (pm_node_t *) pm_assoc_node_create(parser, first_node, &operator, value);
13105
13655
 
13106
- pm_node_list_append(&assocs, first_assoc);
13656
+ pm_node_list_append(&assocs, assoc);
13107
13657
  break;
13108
13658
  }
13109
- case PM_ASSOC_SPLAT_NODE:
13110
- case PM_NO_KEYWORDS_PARAMETER_NODE:
13111
- rest = first_assoc;
13112
- break;
13113
- default:
13114
- assert(false);
13115
- break;
13116
13659
  }
13117
13660
 
13118
13661
  // If there are any other assocs, then we'll parse them now.
@@ -13141,6 +13684,7 @@ parse_pattern_hash(pm_parser_t *parser, pm_node_t *first_assoc) {
13141
13684
  } else {
13142
13685
  const pm_location_t *value_loc = &((pm_symbol_node_t *) key)->value_loc;
13143
13686
  pm_parser_local_add_location(parser, value_loc->start, value_loc->end);
13687
+ value = parse_pattern_hash_implicit_value(parser, (pm_symbol_node_t *) key);
13144
13688
  }
13145
13689
 
13146
13690
  pm_token_t operator = not_provided(parser);
@@ -13246,45 +13790,29 @@ parse_pattern_primitive(pm_parser_t *parser, pm_diagnostic_id_t diag_id) {
13246
13790
  // pattern node.
13247
13791
  node = pm_hash_pattern_node_empty_create(parser, &opening, &parser->previous);
13248
13792
  } else {
13249
- pm_node_t *first_assoc;
13793
+ pm_node_t *first_node;
13250
13794
 
13251
13795
  switch (parser->current.type) {
13252
- case PM_TOKEN_LABEL: {
13796
+ case PM_TOKEN_LABEL:
13253
13797
  parser_lex(parser);
13254
-
13255
- pm_symbol_node_t *key = pm_symbol_node_label_create(parser, &parser->previous);
13256
- pm_token_t operator = not_provided(parser);
13257
-
13258
- first_assoc = (pm_node_t *) pm_assoc_node_create(parser, (pm_node_t *) key, &operator, NULL);
13798
+ first_node = (pm_node_t *) pm_symbol_node_label_create(parser, &parser->previous);
13259
13799
  break;
13260
- }
13261
13800
  case PM_TOKEN_USTAR_STAR:
13262
- first_assoc = parse_pattern_keyword_rest(parser);
13801
+ first_node = parse_pattern_keyword_rest(parser);
13263
13802
  break;
13264
- case PM_TOKEN_STRING_BEGIN: {
13265
- pm_node_t *key = parse_expression(parser, PM_BINDING_POWER_MAX, false, PM_ERR_PATTERN_HASH_KEY);
13266
- pm_token_t operator = not_provided(parser);
13267
-
13268
- if (!pm_symbol_node_label_p(key)) {
13269
- pm_parser_err_node(parser, key, PM_ERR_PATTERN_HASH_KEY_LABEL);
13270
- }
13271
-
13272
- first_assoc = (pm_node_t *) pm_assoc_node_create(parser, key, &operator, NULL);
13803
+ case PM_TOKEN_STRING_BEGIN:
13804
+ first_node = parse_expression(parser, PM_BINDING_POWER_MAX, false, PM_ERR_PATTERN_HASH_KEY);
13273
13805
  break;
13274
- }
13275
13806
  default: {
13276
13807
  parser_lex(parser);
13277
13808
  pm_parser_err_previous(parser, PM_ERR_PATTERN_HASH_KEY);
13278
13809
 
13279
- pm_missing_node_t *key = pm_missing_node_create(parser, parser->previous.start, parser->previous.end);
13280
- pm_token_t operator = not_provided(parser);
13281
-
13282
- first_assoc = (pm_node_t *) pm_assoc_node_create(parser, (pm_node_t *) key, &operator, NULL);
13810
+ first_node = (pm_node_t *) pm_missing_node_create(parser, parser->previous.start, parser->previous.end);
13283
13811
  break;
13284
13812
  }
13285
13813
  }
13286
13814
 
13287
- node = parse_pattern_hash(parser, first_assoc);
13815
+ node = parse_pattern_hash(parser, first_node);
13288
13816
 
13289
13817
  accept1(parser, PM_TOKEN_NEWLINE);
13290
13818
  expect1(parser, PM_TOKEN_BRACE_RIGHT, PM_ERR_PATTERN_TERM_BRACE);
@@ -13350,7 +13878,16 @@ parse_pattern_primitive(pm_parser_t *parser, pm_diagnostic_id_t diag_id) {
13350
13878
  switch (parser->current.type) {
13351
13879
  case PM_TOKEN_IDENTIFIER: {
13352
13880
  parser_lex(parser);
13353
- pm_node_t *variable = (pm_node_t *) pm_local_variable_read_node_create(parser, &parser->previous, 0);
13881
+ pm_node_t *variable = (pm_node_t *) parse_variable(parser);
13882
+ if (variable == NULL) {
13883
+ if (parser->version != PM_OPTIONS_VERSION_CRUBY_3_3_0 && pm_token_is_it(parser->previous.start, parser->previous.end)) {
13884
+ pm_constant_id_t name_id = pm_parser_constant_id_constant(parser, "0it", 3);
13885
+ variable = (pm_node_t *) pm_local_variable_read_node_create_constant_id(parser, &parser->previous, name_id, 0);
13886
+ } else {
13887
+ PM_PARSER_ERR_TOKEN_FORMAT_CONTENT(parser, parser->previous, PM_ERR_NO_LOCAL_VARIABLE);
13888
+ variable = (pm_node_t *) pm_local_variable_read_node_create(parser, &parser->previous, 0);
13889
+ }
13890
+ }
13354
13891
 
13355
13892
  return (pm_node_t *) pm_pinned_variable_node_create(parser, &operator, variable);
13356
13893
  }
@@ -13519,9 +14056,7 @@ parse_pattern(pm_parser_t *parser, bool top_pattern, pm_diagnostic_id_t diag_id)
13519
14056
  case PM_TOKEN_LABEL: {
13520
14057
  parser_lex(parser);
13521
14058
  pm_node_t *key = (pm_node_t *) pm_symbol_node_label_create(parser, &parser->previous);
13522
- pm_token_t operator = not_provided(parser);
13523
-
13524
- return (pm_node_t *) parse_pattern_hash(parser, (pm_node_t *) pm_assoc_node_create(parser, key, &operator, NULL));
14059
+ return (pm_node_t *) parse_pattern_hash(parser, key);
13525
14060
  }
13526
14061
  case PM_TOKEN_USTAR_STAR: {
13527
14062
  node = parse_pattern_keyword_rest(parser);
@@ -13544,8 +14079,7 @@ parse_pattern(pm_parser_t *parser, bool top_pattern, pm_diagnostic_id_t diag_id)
13544
14079
  // If we got a dynamic label symbol, then we need to treat it like the
13545
14080
  // beginning of a hash pattern.
13546
14081
  if (pm_symbol_node_label_p(node)) {
13547
- pm_token_t operator = not_provided(parser);
13548
- return (pm_node_t *) parse_pattern_hash(parser, (pm_node_t *) pm_assoc_node_create(parser, node, &operator, NULL));
14082
+ return (pm_node_t *) parse_pattern_hash(parser, node);
13549
14083
  }
13550
14084
 
13551
14085
  if (top_pattern && match1(parser, PM_TOKEN_COMMA)) {
@@ -13558,7 +14092,7 @@ parse_pattern(pm_parser_t *parser, bool top_pattern, pm_diagnostic_id_t diag_id)
13558
14092
  // Gather up all of the patterns into the list.
13559
14093
  while (accept1(parser, PM_TOKEN_COMMA)) {
13560
14094
  // Break early here in case we have a trailing comma.
13561
- if (match5(parser, PM_TOKEN_KEYWORD_THEN, PM_TOKEN_BRACE_RIGHT, PM_TOKEN_BRACKET_RIGHT, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON)) {
14095
+ if (match6(parser, PM_TOKEN_KEYWORD_THEN, PM_TOKEN_BRACE_RIGHT, PM_TOKEN_BRACKET_RIGHT, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON, PM_TOKEN_EOF)) {
13562
14096
  node = (pm_node_t *) pm_implicit_rest_node_create(parser, &parser->previous);
13563
14097
  pm_node_list_append(&nodes, node);
13564
14098
  break;
@@ -13644,7 +14178,7 @@ parse_strings(pm_parser_t *parser, pm_node_t *current) {
13644
14178
  assert(parser->current.type == PM_TOKEN_STRING_BEGIN);
13645
14179
 
13646
14180
  bool concating = false;
13647
- bool state_is_arg_labeled = lex_state_p(parser, PM_LEX_STATE_ARG | PM_LEX_STATE_LABELED);
14181
+ bool state_is_arg_labeled = lex_state_arg_labeled_p(parser);
13648
14182
 
13649
14183
  while (match1(parser, PM_TOKEN_STRING_BEGIN)) {
13650
14184
  pm_node_t *node = NULL;
@@ -13659,7 +14193,7 @@ parse_strings(pm_parser_t *parser, pm_node_t *current) {
13659
14193
  parser_lex(parser);
13660
14194
 
13661
14195
  if (match2(parser, PM_TOKEN_STRING_END, PM_TOKEN_EOF)) {
13662
- expect1(parser, PM_TOKEN_STRING_END, PM_ERR_STRING_LITERAL_TERM);
14196
+ expect1(parser, PM_TOKEN_STRING_END, PM_ERR_STRING_LITERAL_EOF);
13663
14197
  // If we get here, then we have an end immediately after a
13664
14198
  // start. In that case we'll create an empty content token and
13665
14199
  // return an uninterpolated string.
@@ -13672,7 +14206,6 @@ parse_strings(pm_parser_t *parser, pm_node_t *current) {
13672
14206
  // If we get here, then we have an end of a label immediately
13673
14207
  // after a start. In that case we'll create an empty symbol
13674
14208
  // node.
13675
- pm_token_t opening = not_provided(parser);
13676
14209
  pm_token_t content = parse_strings_empty_content(parser->previous.start);
13677
14210
  pm_symbol_node_t *symbol = pm_symbol_node_create(parser, &opening, &content, &parser->previous);
13678
14211
 
@@ -13716,15 +14249,19 @@ parse_strings(pm_parser_t *parser, pm_node_t *current) {
13716
14249
  parser_lex(parser);
13717
14250
  } while (match1(parser, PM_TOKEN_STRING_CONTENT));
13718
14251
 
13719
- expect1(parser, PM_TOKEN_STRING_END, PM_ERR_STRING_LITERAL_TERM);
14252
+ expect1(parser, PM_TOKEN_STRING_END, PM_ERR_STRING_LITERAL_EOF);
13720
14253
  node = (pm_node_t *) pm_interpolated_string_node_create(parser, &opening, &parts, &parser->previous);
13721
14254
  } else if (accept1(parser, PM_TOKEN_LABEL_END) && !state_is_arg_labeled) {
13722
- node = (pm_node_t *) pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped);
14255
+ node = (pm_node_t *) pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped, parse_symbol_encoding(parser, &unescaped));
13723
14256
  } else if (match1(parser, PM_TOKEN_EOF)) {
13724
- pm_parser_err_token(parser, &opening, PM_ERR_STRING_LITERAL_TERM);
14257
+ pm_parser_err_token(parser, &opening, PM_ERR_STRING_LITERAL_EOF);
13725
14258
  node = (pm_node_t *) pm_string_node_create_unescaped(parser, &opening, &content, &parser->current, &unescaped);
14259
+ } else if (accept1(parser, PM_TOKEN_STRING_END)) {
14260
+ node = (pm_node_t *) pm_string_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped);
13726
14261
  } else {
13727
- expect1(parser, PM_TOKEN_STRING_END, PM_ERR_STRING_LITERAL_TERM);
14262
+ PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->previous, PM_ERR_STRING_LITERAL_TERM, pm_token_type_human(parser->previous.type));
14263
+ parser->previous.start = parser->previous.end;
14264
+ parser->previous.type = PM_TOKEN_MISSING;
13728
14265
  node = (pm_node_t *) pm_string_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped);
13729
14266
  }
13730
14267
  } else if (match1(parser, PM_TOKEN_STRING_CONTENT)) {
@@ -13739,9 +14276,9 @@ parse_strings(pm_parser_t *parser, pm_node_t *current) {
13739
14276
  if (match2(parser, PM_TOKEN_STRING_END, PM_TOKEN_EOF)) {
13740
14277
  node = (pm_node_t *) pm_string_node_create_unescaped(parser, &opening, &content, &parser->current, &unescaped);
13741
14278
  pm_node_flag_set(node, parse_unescaped_encoding(parser));
13742
- expect1(parser, PM_TOKEN_STRING_END, PM_ERR_STRING_LITERAL_TERM);
14279
+ expect1(parser, PM_TOKEN_STRING_END, PM_ERR_STRING_LITERAL_EOF);
13743
14280
  } else if (accept1(parser, PM_TOKEN_LABEL_END)) {
13744
- node = (pm_node_t *) pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped);
14281
+ node = (pm_node_t *) pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped, parse_symbol_encoding(parser, &unescaped));
13745
14282
  } else {
13746
14283
  // If we get here, then we have interpolation so we'll need
13747
14284
  // to create a string or symbol node with interpolation.
@@ -13830,11 +14367,34 @@ parse_strings(pm_parser_t *parser, pm_node_t *current) {
13830
14367
  return current;
13831
14368
  }
13832
14369
 
14370
+ /**
14371
+ * Append an error to the error list on the parser using the given diagnostic
14372
+ * ID. This function is a specialization that handles formatting the specific
14373
+ * kind of error that is being appended.
14374
+ */
14375
+ static void
14376
+ pm_parser_err_prefix(pm_parser_t *parser, pm_diagnostic_id_t diag_id) {
14377
+ switch (diag_id) {
14378
+ case PM_ERR_HASH_KEY: {
14379
+ PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->previous, diag_id, pm_token_type_human(parser->previous.type));
14380
+ break;
14381
+ }
14382
+ case PM_ERR_UNARY_RECEIVER: {
14383
+ const char *human = (parser->current.type == PM_TOKEN_EOF ? "end-of-input" : pm_token_type_human(parser->current.type));
14384
+ PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->previous, diag_id, human, parser->previous.start[0]);
14385
+ break;
14386
+ }
14387
+ default:
14388
+ pm_parser_err_previous(parser, diag_id);
14389
+ break;
14390
+ }
14391
+ }
14392
+
13833
14393
  /**
13834
14394
  * Parse an expression that begins with the previous node that we just lexed.
13835
14395
  */
13836
14396
  static inline pm_node_t *
13837
- parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, bool accepts_command_call) {
14397
+ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, bool accepts_command_call, pm_diagnostic_id_t diag_id) {
13838
14398
  switch (parser->current.type) {
13839
14399
  case PM_TOKEN_BRACKET_LEFT_ARRAY: {
13840
14400
  parser_lex(parser);
@@ -13866,9 +14426,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
13866
14426
  pm_node_t *expression = NULL;
13867
14427
 
13868
14428
  if (match3(parser, PM_TOKEN_BRACKET_RIGHT, PM_TOKEN_COMMA, PM_TOKEN_EOF)) {
13869
- if (pm_parser_local_depth(parser, &parser->previous) == -1) {
13870
- pm_parser_err_token(parser, &operator, PM_ERR_ARGUMENT_NO_FORWARDING_STAR);
13871
- }
14429
+ pm_parser_scope_forwarding_positionals_check(parser, &operator);
13872
14430
  } else {
13873
14431
  expression = parse_value_expression(parser, PM_BINDING_POWER_DEFINED, false, PM_ERR_ARRAY_EXPRESSION_AFTER_STAR);
13874
14432
  }
@@ -14016,7 +14574,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
14016
14574
  // If we didn't find a terminator and we didn't find a right
14017
14575
  // parenthesis, then this is a syntax error.
14018
14576
  if (!terminator_found) {
14019
- pm_parser_err(parser, parser->current.start, parser->current.start, PM_ERR_EXPECT_EOL_AFTER_STATEMENT);
14577
+ PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_EXPECT_EOL_AFTER_STATEMENT, pm_token_type_human(parser->current.type));
14020
14578
  }
14021
14579
 
14022
14580
  // Parse each statement within the parentheses.
@@ -14045,7 +14603,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
14045
14603
  } else if (match1(parser, PM_TOKEN_PARENTHESIS_RIGHT)) {
14046
14604
  break;
14047
14605
  } else {
14048
- pm_parser_err(parser, parser->current.start, parser->current.start, PM_ERR_EXPECT_EOL_AFTER_STATEMENT);
14606
+ PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_EXPECT_EOL_AFTER_STATEMENT, pm_token_type_human(parser->current.type));
14049
14607
  }
14050
14608
  }
14051
14609
 
@@ -14113,7 +14671,8 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
14113
14671
  if (
14114
14672
  match1(parser, PM_TOKEN_PARENTHESIS_LEFT) ||
14115
14673
  (accepts_command_call && (token_begins_expression_p(parser->current.type) || match3(parser, PM_TOKEN_UAMPERSAND, PM_TOKEN_USTAR, PM_TOKEN_USTAR_STAR))) ||
14116
- (pm_accepts_block_stack_p(parser) && match2(parser, PM_TOKEN_KEYWORD_DO, PM_TOKEN_BRACE_LEFT))
14674
+ (pm_accepts_block_stack_p(parser) && match1(parser, PM_TOKEN_KEYWORD_DO)) ||
14675
+ match1(parser, PM_TOKEN_BRACE_LEFT)
14117
14676
  ) {
14118
14677
  pm_arguments_t arguments = { 0 };
14119
14678
  parse_arguments_list(parser, &arguments, true, accepts_command_call);
@@ -14237,7 +14796,8 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
14237
14796
  // a block, so we need to check for that here.
14238
14797
  if (
14239
14798
  (accepts_command_call && (token_begins_expression_p(parser->current.type) || match3(parser, PM_TOKEN_UAMPERSAND, PM_TOKEN_USTAR, PM_TOKEN_USTAR_STAR))) ||
14240
- (pm_accepts_block_stack_p(parser) && match2(parser, PM_TOKEN_KEYWORD_DO, PM_TOKEN_BRACE_LEFT))
14799
+ (pm_accepts_block_stack_p(parser) && match1(parser, PM_TOKEN_KEYWORD_DO)) ||
14800
+ match1(parser, PM_TOKEN_BRACE_LEFT)
14241
14801
  ) {
14242
14802
  pm_arguments_t arguments = { 0 };
14243
14803
  parse_arguments_list(parser, &arguments, true, accepts_command_call);
@@ -14250,6 +14810,31 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
14250
14810
 
14251
14811
  if ((binding_power == PM_BINDING_POWER_STATEMENT) && match1(parser, PM_TOKEN_COMMA)) {
14252
14812
  node = parse_targets_validate(parser, node, PM_BINDING_POWER_INDEX);
14813
+ } else {
14814
+ // Check if `it` is not going to be assigned.
14815
+ switch (parser->current.type) {
14816
+ case PM_TOKEN_AMPERSAND_AMPERSAND_EQUAL:
14817
+ case PM_TOKEN_AMPERSAND_EQUAL:
14818
+ case PM_TOKEN_CARET_EQUAL:
14819
+ case PM_TOKEN_EQUAL:
14820
+ case PM_TOKEN_GREATER_GREATER_EQUAL:
14821
+ case PM_TOKEN_LESS_LESS_EQUAL:
14822
+ case PM_TOKEN_MINUS_EQUAL:
14823
+ case PM_TOKEN_PARENTHESIS_RIGHT:
14824
+ case PM_TOKEN_PERCENT_EQUAL:
14825
+ case PM_TOKEN_PIPE_EQUAL:
14826
+ case PM_TOKEN_PIPE_PIPE_EQUAL:
14827
+ case PM_TOKEN_PLUS_EQUAL:
14828
+ case PM_TOKEN_SLASH_EQUAL:
14829
+ case PM_TOKEN_STAR_EQUAL:
14830
+ case PM_TOKEN_STAR_STAR_EQUAL:
14831
+ break;
14832
+ default:
14833
+ // Once we know it's neither a method call nor an
14834
+ // assignment, we can finally create `it` default
14835
+ // parameter.
14836
+ node = pm_node_check_it(parser, node);
14837
+ }
14253
14838
  }
14254
14839
 
14255
14840
  return node;
@@ -14286,6 +14871,9 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
14286
14871
  // If we get here, then we tried to find something in the
14287
14872
  // heredoc but couldn't actually parse anything, so we'll just
14288
14873
  // return a missing node.
14874
+ //
14875
+ // parse_string_part handles its own errors, so there is no need
14876
+ // for us to add one here.
14289
14877
  node = (pm_node_t *) pm_missing_node_create(parser, parser->previous.start, parser->previous.end);
14290
14878
  } else if (PM_NODE_TYPE_P(part, PM_STRING_NODE) && match2(parser, PM_TOKEN_HEREDOC_END, PM_TOKEN_EOF)) {
14291
14879
  // If we get here, then the part that we parsed was plain string
@@ -14549,11 +15137,11 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
14549
15137
  // for guard clauses in the form of `if` or `unless` statements.
14550
15138
  if (accept1(parser, PM_TOKEN_KEYWORD_IF_MODIFIER)) {
14551
15139
  pm_token_t keyword = parser->previous;
14552
- pm_node_t *predicate = parse_value_expression(parser, PM_BINDING_POWER_DEFINED, true, PM_ERR_CONDITIONAL_IF_PREDICATE);
15140
+ pm_node_t *predicate = parse_value_expression(parser, PM_BINDING_POWER_COMPOSITION, true, PM_ERR_CONDITIONAL_IF_PREDICATE);
14553
15141
  pattern = (pm_node_t *) pm_if_node_modifier_create(parser, pattern, &keyword, predicate);
14554
15142
  } else if (accept1(parser, PM_TOKEN_KEYWORD_UNLESS_MODIFIER)) {
14555
15143
  pm_token_t keyword = parser->previous;
14556
- pm_node_t *predicate = parse_value_expression(parser, PM_BINDING_POWER_DEFINED, true, PM_ERR_CONDITIONAL_UNLESS_PREDICATE);
15144
+ pm_node_t *predicate = parse_value_expression(parser, PM_BINDING_POWER_COMPOSITION, true, PM_ERR_CONDITIONAL_UNLESS_PREDICATE);
14557
15145
  pattern = (pm_node_t *) pm_unless_node_modifier_create(parser, pattern, &keyword, predicate);
14558
15146
  }
14559
15147
 
@@ -14742,8 +15330,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
14742
15330
  pm_token_t operator = parser->previous;
14743
15331
  pm_node_t *expression = parse_value_expression(parser, PM_BINDING_POWER_NOT, true, PM_ERR_EXPECT_EXPRESSION_AFTER_LESS_LESS);
14744
15332
 
14745
- pm_constant_id_t old_param_name = parser->current_param_name;
14746
- parser->current_param_name = 0;
15333
+ pm_constant_id_t saved_param_name = pm_parser_current_param_name_unset(parser);
14747
15334
  pm_parser_scope_push(parser, true);
14748
15335
  accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON);
14749
15336
 
@@ -14756,15 +15343,16 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
14756
15343
 
14757
15344
  if (match2(parser, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ENSURE)) {
14758
15345
  assert(statements == NULL || PM_NODE_TYPE_P(statements, PM_STATEMENTS_NODE));
14759
- statements = (pm_node_t *) parse_rescues_as_begin(parser, (pm_statements_node_t *) statements, false);
15346
+ statements = (pm_node_t *) parse_rescues_as_begin(parser, class_keyword.start, (pm_statements_node_t *) statements, false);
14760
15347
  }
14761
15348
 
14762
15349
  expect1(parser, PM_TOKEN_KEYWORD_END, PM_ERR_CLASS_TERM);
14763
-
14764
15350
  pm_constant_id_list_t locals = parser->current_scope->locals;
15351
+
14765
15352
  pm_parser_scope_pop(parser);
14766
- parser->current_param_name = old_param_name;
14767
15353
  pm_do_loop_stack_pop(parser);
15354
+ pm_parser_current_param_name_restore(parser, saved_param_name);
15355
+
14768
15356
  return (pm_node_t *) pm_singleton_class_node_create(parser, &locals, &class_keyword, &operator, expression, statements, &parser->previous);
14769
15357
  }
14770
15358
 
@@ -14790,9 +15378,9 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
14790
15378
  superclass = NULL;
14791
15379
  }
14792
15380
 
14793
- pm_constant_id_t old_param_name = parser->current_param_name;
14794
- parser->current_param_name = 0;
15381
+ pm_constant_id_t saved_param_name = pm_parser_current_param_name_unset(parser);
14795
15382
  pm_parser_scope_push(parser, true);
15383
+
14796
15384
  if (inheritance_operator.type != PM_TOKEN_NOT_PROVIDED) {
14797
15385
  expect2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON, PM_ERR_CLASS_UNEXPECTED_END);
14798
15386
  } else {
@@ -14808,7 +15396,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
14808
15396
 
14809
15397
  if (match2(parser, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ENSURE)) {
14810
15398
  assert(statements == NULL || PM_NODE_TYPE_P(statements, PM_STATEMENTS_NODE));
14811
- statements = (pm_node_t *) parse_rescues_as_begin(parser, (pm_statements_node_t *) statements, false);
15399
+ statements = (pm_node_t *) parse_rescues_as_begin(parser, class_keyword.start, (pm_statements_node_t *) statements, false);
14812
15400
  }
14813
15401
 
14814
15402
  expect1(parser, PM_TOKEN_KEYWORD_END, PM_ERR_CLASS_TERM);
@@ -14818,9 +15406,10 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
14818
15406
  }
14819
15407
 
14820
15408
  pm_constant_id_list_t locals = parser->current_scope->locals;
15409
+
14821
15410
  pm_parser_scope_pop(parser);
14822
- parser->current_param_name = old_param_name;
14823
15411
  pm_do_loop_stack_pop(parser);
15412
+ pm_parser_current_param_name_restore(parser, saved_param_name);
14824
15413
 
14825
15414
  if (!PM_NODE_TYPE_P(constant_path, PM_CONSTANT_PATH_NODE) && !(PM_NODE_TYPE_P(constant_path, PM_CONSTANT_READ_NODE))) {
14826
15415
  pm_parser_err_node(parser, constant_path, PM_ERR_CLASS_NAME);
@@ -14835,18 +15424,21 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
14835
15424
  pm_token_t operator = not_provided(parser);
14836
15425
  pm_token_t name = (pm_token_t) { .type = PM_TOKEN_MISSING, .start = def_keyword.end, .end = def_keyword.end };
14837
15426
 
14838
- // This context is necessary for lexing `...` in a bare params correctly.
14839
- // It must be pushed before lexing the first param, so it is here.
15427
+ // This context is necessary for lexing `...` in a bare params
15428
+ // correctly. It must be pushed before lexing the first param, so it
15429
+ // is here.
14840
15430
  context_push(parser, PM_CONTEXT_DEF_PARAMS);
15431
+ pm_constant_id_t saved_param_name;
15432
+
14841
15433
  parser_lex(parser);
14842
- pm_constant_id_t old_param_name = parser->current_param_name;
14843
15434
 
14844
15435
  switch (parser->current.type) {
14845
15436
  case PM_CASE_OPERATOR:
15437
+ saved_param_name = pm_parser_current_param_name_unset(parser);
14846
15438
  pm_parser_scope_push(parser, true);
14847
- parser->current_param_name = 0;
14848
15439
  lex_state_set(parser, PM_LEX_STATE_ENDFN);
14849
15440
  parser_lex(parser);
15441
+
14850
15442
  name = parser->previous;
14851
15443
  break;
14852
15444
  case PM_TOKEN_IDENTIFIER: {
@@ -14854,18 +15446,20 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
14854
15446
 
14855
15447
  if (match2(parser, PM_TOKEN_DOT, PM_TOKEN_COLON_COLON)) {
14856
15448
  receiver = parse_variable_call(parser);
15449
+ receiver = pm_node_check_it(parser, receiver);
14857
15450
 
15451
+ saved_param_name = pm_parser_current_param_name_unset(parser);
14858
15452
  pm_parser_scope_push(parser, true);
14859
- parser->current_param_name = 0;
14860
15453
  lex_state_set(parser, PM_LEX_STATE_FNAME);
14861
15454
  parser_lex(parser);
14862
15455
 
14863
15456
  operator = parser->previous;
14864
15457
  name = parse_method_definition_name(parser);
14865
15458
  } else {
15459
+ saved_param_name = pm_parser_current_param_name_unset(parser);
14866
15460
  pm_refute_numbered_parameter(parser, parser->previous.start, parser->previous.end);
14867
15461
  pm_parser_scope_push(parser, true);
14868
- parser->current_param_name = 0;
15462
+
14869
15463
  name = parser->previous;
14870
15464
  }
14871
15465
 
@@ -14882,9 +15476,10 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
14882
15476
  case PM_TOKEN_KEYWORD___FILE__:
14883
15477
  case PM_TOKEN_KEYWORD___LINE__:
14884
15478
  case PM_TOKEN_KEYWORD___ENCODING__: {
15479
+ saved_param_name = pm_parser_current_param_name_unset(parser);
14885
15480
  pm_parser_scope_push(parser, true);
14886
- parser->current_param_name = 0;
14887
15481
  parser_lex(parser);
15482
+
14888
15483
  pm_token_t identifier = parser->previous;
14889
15484
 
14890
15485
  if (match2(parser, PM_TOKEN_DOT, PM_TOKEN_COLON_COLON)) {
@@ -14946,6 +15541,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
14946
15541
  pm_token_t lparen = parser->previous;
14947
15542
  pm_node_t *expression = parse_value_expression(parser, PM_BINDING_POWER_STATEMENT, true, PM_ERR_DEF_RECEIVER);
14948
15543
 
15544
+ accept1(parser, PM_TOKEN_NEWLINE);
14949
15545
  expect1(parser, PM_TOKEN_PARENTHESIS_RIGHT, PM_ERR_EXPECT_RPAREN);
14950
15546
  pm_token_t rparen = parser->previous;
14951
15547
 
@@ -14955,8 +15551,8 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
14955
15551
  operator = parser->previous;
14956
15552
  receiver = (pm_node_t *) pm_parentheses_node_create(parser, &lparen, expression, &rparen);
14957
15553
 
15554
+ saved_param_name = pm_parser_current_param_name_unset(parser);
14958
15555
  pm_parser_scope_push(parser, true);
14959
- parser->current_param_name = 0;
14960
15556
 
14961
15557
  // We push `PM_CONTEXT_DEF_PARAMS` again for the same reason as described above.
14962
15558
  context_push(parser, PM_CONTEXT_DEF_PARAMS);
@@ -14964,8 +15560,9 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
14964
15560
  break;
14965
15561
  }
14966
15562
  default:
15563
+ saved_param_name = pm_parser_current_param_name_unset(parser);
14967
15564
  pm_parser_scope_push(parser, true);
14968
- parser->current_param_name = 0;
15565
+
14969
15566
  name = parse_method_definition_name(parser);
14970
15567
  break;
14971
15568
  }
@@ -15018,8 +15615,6 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
15018
15615
  }
15019
15616
  }
15020
15617
 
15021
- uint32_t locals_body_index = (uint32_t) parser->current_scope->locals.size;
15022
-
15023
15618
  context_pop(parser);
15024
15619
  pm_node_t *statements = NULL;
15025
15620
  pm_token_t equal;
@@ -15070,7 +15665,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
15070
15665
 
15071
15666
  if (match2(parser, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ENSURE)) {
15072
15667
  assert(statements == NULL || PM_NODE_TYPE_P(statements, PM_STATEMENTS_NODE));
15073
- statements = (pm_node_t *) parse_rescues_as_begin(parser, (pm_statements_node_t *) statements, true);
15668
+ statements = (pm_node_t *) parse_rescues_as_begin(parser, def_keyword.start, (pm_statements_node_t *) statements, true);
15074
15669
  }
15075
15670
 
15076
15671
  pm_accepts_block_stack_pop(parser);
@@ -15080,17 +15675,25 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
15080
15675
  }
15081
15676
 
15082
15677
  pm_constant_id_list_t locals = parser->current_scope->locals;
15083
- parser->current_param_name = old_param_name;
15678
+
15084
15679
  pm_parser_scope_pop(parser);
15680
+ pm_parser_current_param_name_restore(parser, saved_param_name);
15681
+
15682
+ /**
15683
+ * If the final character is @, as is the case when defining
15684
+ * methods to override the unary operators, we should ignore
15685
+ * the @ in the same way we do for symbols.
15686
+ */
15687
+ pm_constant_id_t name_id = pm_parser_constant_id_location(parser, name.start, parse_operator_symbol_name(&name));
15085
15688
 
15086
15689
  return (pm_node_t *) pm_def_node_create(
15087
15690
  parser,
15691
+ name_id,
15088
15692
  &name,
15089
15693
  receiver,
15090
15694
  params,
15091
15695
  statements,
15092
15696
  &locals,
15093
- locals_body_index,
15094
15697
  &def_keyword,
15095
15698
  &operator,
15096
15699
  &lparen,
@@ -15309,9 +15912,9 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
15309
15912
  pm_parser_err_token(parser, &name, PM_ERR_MODULE_NAME);
15310
15913
  }
15311
15914
 
15312
- pm_constant_id_t old_param_name = parser->current_param_name;
15313
- parser->current_param_name = 0;
15915
+ pm_constant_id_t saved_param_name = pm_parser_current_param_name_unset(parser);
15314
15916
  pm_parser_scope_push(parser, true);
15917
+
15315
15918
  accept2(parser, PM_TOKEN_SEMICOLON, PM_TOKEN_NEWLINE);
15316
15919
  pm_node_t *statements = NULL;
15317
15920
 
@@ -15323,12 +15926,12 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
15323
15926
 
15324
15927
  if (match2(parser, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ENSURE)) {
15325
15928
  assert(statements == NULL || PM_NODE_TYPE_P(statements, PM_STATEMENTS_NODE));
15326
- statements = (pm_node_t *) parse_rescues_as_begin(parser, (pm_statements_node_t *) statements, false);
15929
+ statements = (pm_node_t *) parse_rescues_as_begin(parser, module_keyword.start, (pm_statements_node_t *) statements, false);
15327
15930
  }
15328
15931
 
15329
15932
  pm_constant_id_list_t locals = parser->current_scope->locals;
15330
15933
  pm_parser_scope_pop(parser);
15331
- parser->current_param_name = old_param_name;
15934
+ pm_parser_current_param_name_restore(parser, saved_param_name);
15332
15935
 
15333
15936
  expect1(parser, PM_TOKEN_KEYWORD_END, PM_ERR_MODULE_TERM);
15334
15937
 
@@ -15914,6 +16517,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
15914
16517
  // context of a multiple assignment. We enforce that here. We'll
15915
16518
  // still lex past it though and create a missing node place.
15916
16519
  if (binding_power != PM_BINDING_POWER_STATEMENT) {
16520
+ pm_parser_err_prefix(parser, diag_id);
15917
16521
  return (pm_node_t *) pm_missing_node_create(parser, parser->previous.start, parser->previous.end);
15918
16522
  }
15919
16523
 
@@ -15936,7 +16540,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
15936
16540
  parser_lex(parser);
15937
16541
 
15938
16542
  pm_token_t operator = parser->previous;
15939
- pm_node_t *receiver = parse_expression(parser, pm_binding_powers[parser->previous.type].right, binding_power < PM_BINDING_POWER_MATCH, PM_ERR_UNARY_RECEIVER_BANG);
16543
+ pm_node_t *receiver = parse_expression(parser, pm_binding_powers[parser->previous.type].right, binding_power < PM_BINDING_POWER_MATCH, PM_ERR_UNARY_RECEIVER);
15940
16544
  pm_call_node_t *node = pm_call_node_unary_create(parser, &operator, receiver, "!");
15941
16545
 
15942
16546
  pm_conditional_predicate(receiver);
@@ -15946,7 +16550,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
15946
16550
  parser_lex(parser);
15947
16551
 
15948
16552
  pm_token_t operator = parser->previous;
15949
- pm_node_t *receiver = parse_expression(parser, pm_binding_powers[parser->previous.type].right, false, PM_ERR_UNARY_RECEIVER_TILDE);
16553
+ pm_node_t *receiver = parse_expression(parser, pm_binding_powers[parser->previous.type].right, false, PM_ERR_UNARY_RECEIVER);
15950
16554
  pm_call_node_t *node = pm_call_node_unary_create(parser, &operator, receiver, "~");
15951
16555
 
15952
16556
  return (pm_node_t *) node;
@@ -15955,7 +16559,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
15955
16559
  parser_lex(parser);
15956
16560
 
15957
16561
  pm_token_t operator = parser->previous;
15958
- pm_node_t *receiver = parse_expression(parser, pm_binding_powers[parser->previous.type].right, false, PM_ERR_UNARY_RECEIVER_MINUS);
16562
+ pm_node_t *receiver = parse_expression(parser, pm_binding_powers[parser->previous.type].right, false, PM_ERR_UNARY_RECEIVER);
15959
16563
  pm_call_node_t *node = pm_call_node_unary_create(parser, &operator, receiver, "-@");
15960
16564
 
15961
16565
  return (pm_node_t *) node;
@@ -15964,7 +16568,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
15964
16568
  parser_lex(parser);
15965
16569
 
15966
16570
  pm_token_t operator = parser->previous;
15967
- pm_node_t *node = parse_expression(parser, pm_binding_powers[parser->previous.type].right, false, PM_ERR_UNARY_RECEIVER_MINUS);
16571
+ pm_node_t *node = parse_expression(parser, pm_binding_powers[parser->previous.type].right, false, PM_ERR_UNARY_RECEIVER);
15968
16572
 
15969
16573
  if (accept1(parser, PM_TOKEN_STAR_STAR)) {
15970
16574
  pm_token_t exponent_operator = parser->previous;
@@ -15995,7 +16599,9 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
15995
16599
  parser_lex(parser);
15996
16600
 
15997
16601
  pm_token_t operator = parser->previous;
16602
+ pm_constant_id_t saved_param_name = pm_parser_current_param_name_unset(parser);
15998
16603
  pm_parser_scope_push(parser, false);
16604
+
15999
16605
  pm_block_parameters_node_t *block_parameters;
16000
16606
 
16001
16607
  switch (parser->current.type) {
@@ -16030,12 +16636,6 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
16030
16636
  }
16031
16637
  }
16032
16638
 
16033
- uint32_t locals_body_index = 0;
16034
-
16035
- if (block_parameters) {
16036
- locals_body_index = (uint32_t) parser->current_scope->locals.size;
16037
- }
16038
-
16039
16639
  pm_token_t opening;
16040
16640
  pm_node_t *body = NULL;
16041
16641
  parser->lambda_enclosure_nesting = previous_lambda_enclosure_nesting;
@@ -16059,7 +16659,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
16059
16659
 
16060
16660
  if (match2(parser, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ENSURE)) {
16061
16661
  assert(body == NULL || PM_NODE_TYPE_P(body, PM_STATEMENTS_NODE));
16062
- body = (pm_node_t *) parse_rescues_as_begin(parser, (pm_statements_node_t *) body, false);
16662
+ body = (pm_node_t *) parse_rescues_as_begin(parser, opening.start, (pm_statements_node_t *) body, false);
16063
16663
  }
16064
16664
 
16065
16665
  expect1(parser, PM_TOKEN_KEYWORD_END, PM_ERR_LAMBDA_TERM_END);
@@ -16070,19 +16670,21 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
16070
16670
 
16071
16671
  if (parameters == NULL && (maximum > 0)) {
16072
16672
  parameters = (pm_node_t *) pm_numbered_parameters_node_create(parser, &(pm_location_t) { .start = operator.start, .end = parser->previous.end }, maximum);
16073
- locals_body_index = maximum;
16074
16673
  }
16075
16674
 
16076
16675
  pm_constant_id_list_t locals = parser->current_scope->locals;
16676
+
16077
16677
  pm_parser_scope_pop(parser);
16078
16678
  pm_accepts_block_stack_pop(parser);
16079
- return (pm_node_t *) pm_lambda_node_create(parser, &locals, locals_body_index, &operator, &opening, &parser->previous, parameters, body);
16679
+ pm_parser_current_param_name_restore(parser, saved_param_name);
16680
+
16681
+ return (pm_node_t *) pm_lambda_node_create(parser, &locals, &operator, &opening, &parser->previous, parameters, body);
16080
16682
  }
16081
16683
  case PM_TOKEN_UPLUS: {
16082
16684
  parser_lex(parser);
16083
16685
 
16084
16686
  pm_token_t operator = parser->previous;
16085
- pm_node_t *receiver = parse_expression(parser, pm_binding_powers[parser->previous.type].right, false, PM_ERR_UNARY_RECEIVER_PLUS);
16687
+ pm_node_t *receiver = parse_expression(parser, pm_binding_powers[parser->previous.type].right, false, PM_ERR_UNARY_RECEIVER);
16086
16688
  pm_call_node_t *node = pm_call_node_unary_create(parser, &operator, receiver, "+@");
16087
16689
 
16088
16690
  return (pm_node_t *) node;
@@ -16095,12 +16697,34 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
16095
16697
 
16096
16698
  return parse_symbol(parser, &lex_mode, PM_LEX_STATE_END);
16097
16699
  }
16098
- default:
16099
- if (context_recoverable(parser, &parser->current)) {
16700
+ default: {
16701
+ pm_context_t recoverable = context_recoverable(parser, &parser->current);
16702
+
16703
+ if (recoverable != PM_CONTEXT_NONE) {
16100
16704
  parser->recovering = true;
16705
+
16706
+ // If the given error is not the generic one, then we'll add it
16707
+ // here because it will provide more context in addition to the
16708
+ // recoverable error that we will also add.
16709
+ if (diag_id != PM_ERR_CANNOT_PARSE_EXPRESSION) {
16710
+ pm_parser_err_prefix(parser, diag_id);
16711
+ }
16712
+
16713
+ // If we get here, then we are assuming this token is closing a
16714
+ // parent context, so we'll indicate that to the user so that
16715
+ // they know how we behaved.
16716
+ PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_UNEXPECTED_TOKEN_CLOSE_CONTEXT, pm_token_type_human(parser->current.type), context_human(recoverable));
16717
+ } else if (diag_id == PM_ERR_CANNOT_PARSE_EXPRESSION) {
16718
+ // We're going to make a special case here, because "cannot
16719
+ // parse expression" is pretty generic, and we know here that we
16720
+ // have an unexpected token.
16721
+ PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_UNEXPECTED_TOKEN_IGNORE, pm_token_type_human(parser->current.type));
16722
+ } else {
16723
+ pm_parser_err_prefix(parser, diag_id);
16101
16724
  }
16102
16725
 
16103
16726
  return (pm_node_t *) pm_missing_node_create(parser, parser->previous.start, parser->previous.end);
16727
+ }
16104
16728
  }
16105
16729
  }
16106
16730
 
@@ -16145,7 +16769,18 @@ parse_assignment_values(pm_parser_t *parser, pm_binding_power_t previous_binding
16145
16769
  if (is_single_value && match1(parser, PM_TOKEN_KEYWORD_RESCUE_MODIFIER)) {
16146
16770
  pm_token_t rescue = parser->current;
16147
16771
  parser_lex(parser);
16148
- pm_node_t *right = parse_expression(parser, binding_power, false, PM_ERR_RESCUE_MODIFIER_VALUE);
16772
+
16773
+ bool accepts_command_call_inner = false;
16774
+
16775
+ // RHS can accept command call iff the value is a call with arguments but without parentheses.
16776
+ if (PM_NODE_TYPE_P(value, PM_CALL_NODE)) {
16777
+ pm_call_node_t *call_node = (pm_call_node_t *)value;
16778
+ if ((call_node->arguments != NULL) && (call_node->opening_loc.start == NULL)) {
16779
+ accepts_command_call_inner = true;
16780
+ }
16781
+ }
16782
+
16783
+ pm_node_t *right = parse_expression(parser, binding_power, accepts_command_call_inner, PM_ERR_RESCUE_MODIFIER_VALUE);
16149
16784
 
16150
16785
  return (pm_node_t *) pm_rescue_modifier_node_create(parser, value, &rescue, right);
16151
16786
  }
@@ -16330,7 +16965,7 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t
16330
16965
  switch (PM_NODE_TYPE(node)) {
16331
16966
  case PM_BACK_REFERENCE_READ_NODE:
16332
16967
  case PM_NUMBERED_REFERENCE_READ_NODE:
16333
- pm_parser_err_node(parser, node, PM_ERR_WRITE_TARGET_READONLY);
16968
+ PM_PARSER_ERR_NODE_FORMAT_CONTENT(parser, node, PM_ERR_WRITE_TARGET_READONLY);
16334
16969
  /* fallthrough */
16335
16970
  case PM_GLOBAL_VARIABLE_READ_NODE: {
16336
16971
  parser_lex(parser);
@@ -16412,7 +17047,7 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t
16412
17047
  }
16413
17048
 
16414
17049
  // If this node cannot be writable, then we have an error.
16415
- if (pm_call_node_writable_p(cast)) {
17050
+ if (pm_call_node_writable_p(parser, cast)) {
16416
17051
  parse_write_name(parser, &cast->name);
16417
17052
  } else {
16418
17053
  pm_parser_err_node(parser, node, PM_ERR_WRITE_TARGET_UNEXPECTED);
@@ -16441,7 +17076,7 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t
16441
17076
  switch (PM_NODE_TYPE(node)) {
16442
17077
  case PM_BACK_REFERENCE_READ_NODE:
16443
17078
  case PM_NUMBERED_REFERENCE_READ_NODE:
16444
- pm_parser_err_node(parser, node, PM_ERR_WRITE_TARGET_READONLY);
17079
+ PM_PARSER_ERR_NODE_FORMAT_CONTENT(parser, node, PM_ERR_WRITE_TARGET_READONLY);
16445
17080
  /* fallthrough */
16446
17081
  case PM_GLOBAL_VARIABLE_READ_NODE: {
16447
17082
  parser_lex(parser);
@@ -16523,7 +17158,7 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t
16523
17158
  }
16524
17159
 
16525
17160
  // If this node cannot be writable, then we have an error.
16526
- if (pm_call_node_writable_p(cast)) {
17161
+ if (pm_call_node_writable_p(parser, cast)) {
16527
17162
  parse_write_name(parser, &cast->name);
16528
17163
  } else {
16529
17164
  pm_parser_err_node(parser, node, PM_ERR_WRITE_TARGET_UNEXPECTED);
@@ -16562,7 +17197,7 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t
16562
17197
  switch (PM_NODE_TYPE(node)) {
16563
17198
  case PM_BACK_REFERENCE_READ_NODE:
16564
17199
  case PM_NUMBERED_REFERENCE_READ_NODE:
16565
- pm_parser_err_node(parser, node, PM_ERR_WRITE_TARGET_READONLY);
17200
+ PM_PARSER_ERR_NODE_FORMAT_CONTENT(parser, node, PM_ERR_WRITE_TARGET_READONLY);
16566
17201
  /* fallthrough */
16567
17202
  case PM_GLOBAL_VARIABLE_READ_NODE: {
16568
17203
  parser_lex(parser);
@@ -16644,7 +17279,7 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t
16644
17279
  }
16645
17280
 
16646
17281
  // If this node cannot be writable, then we have an error.
16647
- if (pm_call_node_writable_p(cast)) {
17282
+ if (pm_call_node_writable_p(parser, cast)) {
16648
17283
  parse_write_name(parser, &cast->name);
16649
17284
  } else {
16650
17285
  pm_parser_err_node(parser, node, PM_ERR_WRITE_TARGET_UNEXPECTED);
@@ -17063,15 +17698,12 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t
17063
17698
  */
17064
17699
  static pm_node_t *
17065
17700
  parse_expression(pm_parser_t *parser, pm_binding_power_t binding_power, bool accepts_command_call, pm_diagnostic_id_t diag_id) {
17066
- pm_token_t recovery = parser->previous;
17067
- pm_node_t *node = parse_expression_prefix(parser, binding_power, accepts_command_call);
17701
+ pm_node_t *node = parse_expression_prefix(parser, binding_power, accepts_command_call, diag_id);
17068
17702
 
17069
17703
  switch (PM_NODE_TYPE(node)) {
17070
17704
  case PM_MISSING_NODE:
17071
17705
  // If we found a syntax error, then the type of node returned by
17072
- // parse_expression_prefix is going to be a missing node. In that
17073
- // case we need to add the error message to the parser's error list.
17074
- pm_parser_err(parser, recovery.end, recovery.end, diag_id);
17706
+ // parse_expression_prefix is going to be a missing node.
17075
17707
  return node;
17076
17708
  case PM_PRE_EXECUTION_NODE:
17077
17709
  case PM_POST_EXECUTION_NODE:
@@ -17080,7 +17712,7 @@ parse_expression(pm_parser_t *parser, pm_binding_power_t binding_power, bool acc
17080
17712
  case PM_UNDEF_NODE:
17081
17713
  // These expressions are statements, and cannot be followed by
17082
17714
  // operators (except modifiers).
17083
- if (pm_binding_powers[parser->current.type].left > PM_BINDING_POWER_MODIFIER_RESCUE) {
17715
+ if (pm_binding_powers[parser->current.type].left > PM_BINDING_POWER_MODIFIER) {
17084
17716
  return node;
17085
17717
  }
17086
17718
  break;
@@ -17175,9 +17807,14 @@ parse_expression(pm_parser_t *parser, pm_binding_power_t binding_power, bool acc
17175
17807
 
17176
17808
  static pm_node_t *
17177
17809
  parse_program(pm_parser_t *parser) {
17178
- pm_parser_scope_push(parser, !parser->current_scope);
17179
- parser_lex(parser);
17810
+ // If the current scope is NULL, then we want to push a new top level scope.
17811
+ // The current scope could exist in the event that we are parsing an eval
17812
+ // and the user has passed in scopes that already exist.
17813
+ if (parser->current_scope == NULL) {
17814
+ pm_parser_scope_push(parser, true);
17815
+ }
17180
17816
 
17817
+ parser_lex(parser);
17181
17818
  pm_statements_node_t *statements = parse_statements(parser, PM_CONTEXT_MAIN);
17182
17819
  if (!statements) {
17183
17820
  statements = pm_statements_node_create(parser);
@@ -17224,6 +17861,7 @@ pm_parser_init(pm_parser_t *parser, const uint8_t *source, size_t size, const pm
17224
17861
  .current = { .type = PM_TOKEN_EOF, .start = source, .end = source },
17225
17862
  .next_start = NULL,
17226
17863
  .heredoc_end = NULL,
17864
+ .data_loc = { .start = NULL, .end = NULL },
17227
17865
  .comment_list = { 0 },
17228
17866
  .magic_comment_list = { 0 },
17229
17867
  .warning_list = { 0 },
@@ -17234,7 +17872,7 @@ pm_parser_init(pm_parser_t *parser, const uint8_t *source, size_t size, const pm
17234
17872
  .encoding_changed_callback = NULL,
17235
17873
  .encoding_comment_start = source,
17236
17874
  .lex_callback = NULL,
17237
- .filepath_string = { 0 },
17875
+ .filepath = { 0 },
17238
17876
  .constant_pool = { 0 },
17239
17877
  .newline_list = { 0 },
17240
17878
  .integer_base = 0,
@@ -17248,8 +17886,7 @@ pm_parser_init(pm_parser_t *parser, const uint8_t *source, size_t size, const pm
17248
17886
  .in_keyword_arg = false,
17249
17887
  .current_param_name = 0,
17250
17888
  .semantic_token_seen = false,
17251
- .frozen_string_literal = false,
17252
- .suppress_warnings = false
17889
+ .frozen_string_literal = false
17253
17890
  };
17254
17891
 
17255
17892
  // Initialize the constant pool. We're going to completely guess as to the
@@ -17278,7 +17915,7 @@ pm_parser_init(pm_parser_t *parser, const uint8_t *source, size_t size, const pm
17278
17915
  // If options were provided to this parse, establish them here.
17279
17916
  if (options != NULL) {
17280
17917
  // filepath option
17281
- parser->filepath_string = options->filepath;
17918
+ parser->filepath = options->filepath;
17282
17919
 
17283
17920
  // line option
17284
17921
  parser->start_line = options->line;
@@ -17295,10 +17932,8 @@ pm_parser_init(pm_parser_t *parser, const uint8_t *source, size_t size, const pm
17295
17932
  parser->frozen_string_literal = true;
17296
17933
  }
17297
17934
 
17298
- // suppress_warnings option
17299
- if (options->suppress_warnings) {
17300
- parser->suppress_warnings = true;
17301
- }
17935
+ // version option
17936
+ parser->version = options->version;
17302
17937
 
17303
17938
  // scopes option
17304
17939
  for (size_t scope_index = 0; scope_index < options->scopes_count; scope_index++) {
@@ -17382,7 +18017,7 @@ pm_magic_comment_list_free(pm_list_t *list) {
17382
18017
  */
17383
18018
  PRISM_EXPORTED_FUNCTION void
17384
18019
  pm_parser_free(pm_parser_t *parser) {
17385
- pm_string_free(&parser->filepath_string);
18020
+ pm_string_free(&parser->filepath);
17386
18021
  pm_diagnostic_list_free(&parser->error_list);
17387
18022
  pm_diagnostic_list_free(&parser->warning_list);
17388
18023
  pm_comment_list_free(&parser->comment_list);
@@ -17484,3 +18119,303 @@ pm_serialize_parse_comments(pm_buffer_t *buffer, const uint8_t *source, size_t s
17484
18119
  #undef PM_LOCATION_NODE_VALUE
17485
18120
  #undef PM_LOCATION_NULL_VALUE
17486
18121
  #undef PM_LOCATION_TOKEN_VALUE
18122
+
18123
+ /** An error that is going to be formatted into the output. */
18124
+ typedef struct {
18125
+ /** A pointer to the diagnostic that was generated during parsing. */
18126
+ pm_diagnostic_t *error;
18127
+
18128
+ /** The start line of the diagnostic message. */
18129
+ int32_t line;
18130
+
18131
+ /** The column start of the diagnostic message. */
18132
+ uint32_t column_start;
18133
+
18134
+ /** The column end of the diagnostic message. */
18135
+ uint32_t column_end;
18136
+ } pm_error_t;
18137
+
18138
+ /** The format that will be used to format the errors into the output. */
18139
+ typedef struct {
18140
+ /** The prefix that will be used for line numbers. */
18141
+ const char *number_prefix;
18142
+
18143
+ /** The prefix that will be used for blank lines. */
18144
+ const char *blank_prefix;
18145
+
18146
+ /** The divider that will be used between sections of source code. */
18147
+ const char *divider;
18148
+
18149
+ /** The length of the blank prefix. */
18150
+ size_t blank_prefix_length;
18151
+
18152
+ /** The length of the divider. */
18153
+ size_t divider_length;
18154
+ } pm_error_format_t;
18155
+
18156
+ #define PM_COLOR_GRAY "\033[38;5;102m"
18157
+ #define PM_COLOR_RED "\033[1;31m"
18158
+ #define PM_COLOR_RESET "\033[0m"
18159
+
18160
+ static inline pm_error_t *
18161
+ pm_parser_errors_format_sort(const pm_parser_t *parser, const pm_list_t *error_list, const pm_newline_list_t *newline_list) {
18162
+ pm_error_t *errors = calloc(error_list->size, sizeof(pm_error_t));
18163
+ int32_t start_line = parser->start_line;
18164
+
18165
+ for (pm_diagnostic_t *error = (pm_diagnostic_t *) error_list->head; error != NULL; error = (pm_diagnostic_t *) error->node.next) {
18166
+ pm_line_column_t start = pm_newline_list_line_column(newline_list, error->location.start, start_line);
18167
+ pm_line_column_t end = pm_newline_list_line_column(newline_list, error->location.end, start_line);
18168
+
18169
+ // We're going to insert this error into the array in sorted order. We
18170
+ // do this by finding the first error that has a line number greater
18171
+ // than the current error and then inserting the current error before
18172
+ // that one.
18173
+ size_t index = 0;
18174
+ while (
18175
+ (index < error_list->size) &&
18176
+ (errors[index].error != NULL) &&
18177
+ (
18178
+ (errors[index].line < start.line) ||
18179
+ ((errors[index].line == start.line) && (errors[index].column_start < start.column))
18180
+ )
18181
+ ) index++;
18182
+
18183
+ // Now we're going to shift all of the errors after this one down one
18184
+ // index to make room for the new error.
18185
+ if (index + 1 < error_list->size) {
18186
+ memmove(&errors[index + 1], &errors[index], sizeof(pm_error_t) * (error_list->size - index - 1));
18187
+ }
18188
+
18189
+ // Finally, we'll insert the error into the array.
18190
+ uint32_t column_end;
18191
+ if (start.line == end.line) {
18192
+ column_end = end.column;
18193
+ } else {
18194
+ column_end = (uint32_t) (newline_list->offsets[start.line - start_line + 1] - newline_list->offsets[start.line - start_line] - 1);
18195
+ }
18196
+
18197
+ // Ensure we have at least one column of error.
18198
+ if (start.column == column_end) column_end++;
18199
+
18200
+ errors[index] = (pm_error_t) {
18201
+ .error = error,
18202
+ .line = start.line,
18203
+ .column_start = start.column,
18204
+ .column_end = column_end
18205
+ };
18206
+ }
18207
+
18208
+ return errors;
18209
+ }
18210
+
18211
+ static inline void
18212
+ pm_parser_errors_format_line(const pm_parser_t *parser, const pm_newline_list_t *newline_list, const char *number_prefix, int32_t line, pm_buffer_t *buffer) {
18213
+ size_t index = (size_t) (line - parser->start_line);
18214
+
18215
+ const uint8_t *start = &parser->start[newline_list->offsets[index]];
18216
+ const uint8_t *end;
18217
+
18218
+ if (index >= newline_list->size - 1) {
18219
+ end = parser->end;
18220
+ } else {
18221
+ end = &parser->start[newline_list->offsets[index + 1]];
18222
+ }
18223
+
18224
+ pm_buffer_append_format(buffer, number_prefix, line);
18225
+ pm_buffer_append_string(buffer, (const char *) start, (size_t) (end - start));
18226
+
18227
+ if (end == parser->end && end[-1] != '\n') {
18228
+ pm_buffer_append_string(buffer, "\n", 1);
18229
+ }
18230
+ }
18231
+
18232
+ /**
18233
+ * Format the errors on the parser into the given buffer.
18234
+ */
18235
+ PRISM_EXPORTED_FUNCTION void
18236
+ pm_parser_errors_format(const pm_parser_t *parser, pm_buffer_t *buffer, bool colorize) {
18237
+ const pm_list_t *error_list = &parser->error_list;
18238
+ assert(error_list->size != 0);
18239
+
18240
+ // First, we're going to sort all of the errors by line number using an
18241
+ // insertion sort into a newly allocated array.
18242
+ const int32_t start_line = parser->start_line;
18243
+ const pm_newline_list_t *newline_list = &parser->newline_list;
18244
+ pm_error_t *errors = pm_parser_errors_format_sort(parser, error_list, newline_list);
18245
+
18246
+ // Now we're going to determine how we're going to format line numbers and
18247
+ // blank lines based on the maximum number of digits in the line numbers
18248
+ // that are going to be displayed.
18249
+ pm_error_format_t error_format;
18250
+ int32_t max_line_number = errors[error_list->size - 1].line - start_line;
18251
+
18252
+ if (max_line_number < 10) {
18253
+ if (colorize) {
18254
+ error_format = (pm_error_format_t) {
18255
+ .number_prefix = PM_COLOR_GRAY "%1" PRIi32 " | " PM_COLOR_RESET,
18256
+ .blank_prefix = PM_COLOR_GRAY " | " PM_COLOR_RESET,
18257
+ .divider = PM_COLOR_GRAY " ~~~~~" PM_COLOR_RESET "\n"
18258
+ };
18259
+ } else {
18260
+ error_format = (pm_error_format_t) {
18261
+ .number_prefix = "%1" PRIi32 " | ",
18262
+ .blank_prefix = " | ",
18263
+ .divider = " ~~~~~\n"
18264
+ };
18265
+ }
18266
+ } else if (max_line_number < 100) {
18267
+ if (colorize) {
18268
+ error_format = (pm_error_format_t) {
18269
+ .number_prefix = PM_COLOR_GRAY "%2" PRIi32 " | " PM_COLOR_RESET,
18270
+ .blank_prefix = PM_COLOR_GRAY " | " PM_COLOR_RESET,
18271
+ .divider = PM_COLOR_GRAY " ~~~~~~" PM_COLOR_RESET "\n"
18272
+ };
18273
+ } else {
18274
+ error_format = (pm_error_format_t) {
18275
+ .number_prefix = "%2" PRIi32 " | ",
18276
+ .blank_prefix = " | ",
18277
+ .divider = " ~~~~~~\n"
18278
+ };
18279
+ }
18280
+ } else if (max_line_number < 1000) {
18281
+ if (colorize) {
18282
+ error_format = (pm_error_format_t) {
18283
+ .number_prefix = PM_COLOR_GRAY "%3" PRIi32 " | " PM_COLOR_RESET,
18284
+ .blank_prefix = PM_COLOR_GRAY " | " PM_COLOR_RESET,
18285
+ .divider = PM_COLOR_GRAY " ~~~~~~~" PM_COLOR_RESET "\n"
18286
+ };
18287
+ } else {
18288
+ error_format = (pm_error_format_t) {
18289
+ .number_prefix = "%3" PRIi32 " | ",
18290
+ .blank_prefix = " | ",
18291
+ .divider = " ~~~~~~~\n"
18292
+ };
18293
+ }
18294
+ } else if (max_line_number < 10000) {
18295
+ if (colorize) {
18296
+ error_format = (pm_error_format_t) {
18297
+ .number_prefix = PM_COLOR_GRAY "%4" PRIi32 " | " PM_COLOR_RESET,
18298
+ .blank_prefix = PM_COLOR_GRAY " | " PM_COLOR_RESET,
18299
+ .divider = PM_COLOR_GRAY " ~~~~~~~~" PM_COLOR_RESET "\n"
18300
+ };
18301
+ } else {
18302
+ error_format = (pm_error_format_t) {
18303
+ .number_prefix = "%4" PRIi32 " | ",
18304
+ .blank_prefix = " | ",
18305
+ .divider = " ~~~~~~~~\n"
18306
+ };
18307
+ }
18308
+ } else {
18309
+ if (colorize) {
18310
+ error_format = (pm_error_format_t) {
18311
+ .number_prefix = PM_COLOR_GRAY "%5" PRIi32 " | " PM_COLOR_RESET,
18312
+ .blank_prefix = PM_COLOR_GRAY " | " PM_COLOR_RESET,
18313
+ .divider = PM_COLOR_GRAY " ~~~~~~~~" PM_COLOR_RESET "\n"
18314
+ };
18315
+ } else {
18316
+ error_format = (pm_error_format_t) {
18317
+ .number_prefix = "%5" PRIi32 " | ",
18318
+ .blank_prefix = " | ",
18319
+ .divider = " ~~~~~~~~\n"
18320
+ };
18321
+ }
18322
+ }
18323
+
18324
+ error_format.blank_prefix_length = strlen(error_format.blank_prefix);
18325
+ error_format.divider_length = strlen(error_format.divider);
18326
+
18327
+ // Now we're going to iterate through every error in our error list and
18328
+ // display it. While we're iterating, we will display some padding lines of
18329
+ // the source before the error to give some context. We'll be careful not to
18330
+ // display the same line twice in case the errors are close enough in the
18331
+ // source.
18332
+ int32_t last_line = 0;
18333
+ const pm_encoding_t *encoding = parser->encoding;
18334
+
18335
+ for (size_t index = 0; index < error_list->size; index++) {
18336
+ pm_error_t *error = &errors[index];
18337
+
18338
+ // Here we determine how many lines of padding of the source to display,
18339
+ // based on the difference from the last line that was displayed.
18340
+ if (error->line - last_line > 1) {
18341
+ if (error->line - last_line > 2) {
18342
+ if ((index != 0) && (error->line - last_line > 3)) {
18343
+ pm_buffer_append_string(buffer, error_format.divider, error_format.divider_length);
18344
+ }
18345
+
18346
+ pm_buffer_append_string(buffer, " ", 2);
18347
+ pm_parser_errors_format_line(parser, newline_list, error_format.number_prefix, error->line - 2, buffer);
18348
+ }
18349
+
18350
+ pm_buffer_append_string(buffer, " ", 2);
18351
+ pm_parser_errors_format_line(parser, newline_list, error_format.number_prefix, error->line - 1, buffer);
18352
+ }
18353
+
18354
+ // If this is the first error or we're on a new line, then we'll display
18355
+ // the line that has the error in it.
18356
+ if ((index == 0) || (error->line != last_line)) {
18357
+ if (colorize) {
18358
+ pm_buffer_append_string(buffer, PM_COLOR_RED "> " PM_COLOR_RESET, 13);
18359
+ } else {
18360
+ pm_buffer_append_string(buffer, "> ", 2);
18361
+ }
18362
+ pm_parser_errors_format_line(parser, newline_list, error_format.number_prefix, error->line, buffer);
18363
+ }
18364
+
18365
+ // Now we'll display the actual error message. We'll do this by first
18366
+ // putting the prefix to the line, then a bunch of blank spaces
18367
+ // depending on the column, then as many carets as we need to display
18368
+ // the width of the error, then the error message itself.
18369
+ //
18370
+ // Note that this doesn't take into account the width of the actual
18371
+ // character when displayed in the terminal. For some east-asian
18372
+ // languages or emoji, this means it can be thrown off pretty badly. We
18373
+ // will need to solve this eventually.
18374
+ pm_buffer_append_string(buffer, " ", 2);
18375
+ pm_buffer_append_string(buffer, error_format.blank_prefix, error_format.blank_prefix_length);
18376
+
18377
+ size_t column = 0;
18378
+ const uint8_t *start = &parser->start[newline_list->offsets[error->line - start_line]];
18379
+
18380
+ while (column < error->column_end) {
18381
+ if (column < error->column_start) {
18382
+ pm_buffer_append_byte(buffer, ' ');
18383
+ } else if (colorize) {
18384
+ pm_buffer_append_string(buffer, PM_COLOR_RED "^" PM_COLOR_RESET, 12);
18385
+ } else {
18386
+ pm_buffer_append_byte(buffer, '^');
18387
+ }
18388
+
18389
+ size_t char_width = encoding->char_width(start + column, parser->end - (start + column));
18390
+ column += (char_width == 0 ? 1 : char_width);
18391
+ }
18392
+
18393
+ pm_buffer_append_byte(buffer, ' ');
18394
+
18395
+ const char *message = error->error->message;
18396
+ pm_buffer_append_string(buffer, message, strlen(message));
18397
+ pm_buffer_append_byte(buffer, '\n');
18398
+
18399
+ // Here we determine how many lines of padding to display after the
18400
+ // error, depending on where the next error is in source.
18401
+ last_line = error->line;
18402
+ int32_t next_line = (index == error_list->size - 1) ? ((int32_t) newline_list->size) : errors[index + 1].line;
18403
+
18404
+ if (next_line - last_line > 1) {
18405
+ pm_buffer_append_string(buffer, " ", 2);
18406
+ pm_parser_errors_format_line(parser, newline_list, error_format.number_prefix, ++last_line, buffer);
18407
+ }
18408
+
18409
+ if (next_line - last_line > 1) {
18410
+ pm_buffer_append_string(buffer, " ", 2);
18411
+ pm_parser_errors_format_line(parser, newline_list, error_format.number_prefix, ++last_line, buffer);
18412
+ }
18413
+ }
18414
+
18415
+ // Finally, we'll free the array of errors that we allocated.
18416
+ free(errors);
18417
+ }
18418
+
18419
+ #undef PM_COLOR_GRAY // Undefine the three PM_COLOR_* macros used by the error formatter above so they do not stay defined for the rest of the file.
18420
+ #undef PM_COLOR_RED
18421
+ #undef PM_COLOR_RESET