prism 0.19.0 → 0.24.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (69) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +102 -1
  3. data/Makefile +5 -0
  4. data/README.md +9 -6
  5. data/config.yml +236 -38
  6. data/docs/build_system.md +19 -2
  7. data/docs/cruby_compilation.md +27 -0
  8. data/docs/parser_translation.md +34 -0
  9. data/docs/parsing_rules.md +19 -0
  10. data/docs/releasing.md +84 -16
  11. data/docs/ruby_api.md +1 -1
  12. data/docs/ruby_parser_translation.md +19 -0
  13. data/docs/serialization.md +19 -5
  14. data/ext/prism/api_node.c +1989 -1525
  15. data/ext/prism/extension.c +130 -30
  16. data/ext/prism/extension.h +2 -2
  17. data/include/prism/ast.h +1700 -505
  18. data/include/prism/defines.h +8 -0
  19. data/include/prism/diagnostic.h +49 -7
  20. data/include/prism/encoding.h +17 -0
  21. data/include/prism/options.h +40 -14
  22. data/include/prism/parser.h +34 -18
  23. data/include/prism/util/pm_buffer.h +9 -0
  24. data/include/prism/util/pm_constant_pool.h +18 -0
  25. data/include/prism/util/pm_newline_list.h +4 -14
  26. data/include/prism/util/pm_strpbrk.h +4 -1
  27. data/include/prism/version.h +2 -2
  28. data/include/prism.h +19 -2
  29. data/lib/prism/debug.rb +11 -5
  30. data/lib/prism/desugar_compiler.rb +225 -80
  31. data/lib/prism/dot_visitor.rb +36 -14
  32. data/lib/prism/dsl.rb +302 -299
  33. data/lib/prism/ffi.rb +107 -76
  34. data/lib/prism/lex_compat.rb +17 -1
  35. data/lib/prism/node.rb +4580 -2607
  36. data/lib/prism/node_ext.rb +27 -4
  37. data/lib/prism/parse_result.rb +75 -29
  38. data/lib/prism/serialize.rb +633 -305
  39. data/lib/prism/translation/parser/compiler.rb +1838 -0
  40. data/lib/prism/translation/parser/lexer.rb +335 -0
  41. data/lib/prism/translation/parser/rubocop.rb +45 -0
  42. data/lib/prism/translation/parser.rb +190 -0
  43. data/lib/prism/translation/parser33.rb +12 -0
  44. data/lib/prism/translation/parser34.rb +12 -0
  45. data/lib/prism/translation/ripper.rb +696 -0
  46. data/lib/prism/translation/ruby_parser.rb +1521 -0
  47. data/lib/prism/translation.rb +11 -0
  48. data/lib/prism.rb +1 -1
  49. data/prism.gemspec +18 -7
  50. data/rbi/prism.rbi +150 -88
  51. data/rbi/prism_static.rbi +15 -3
  52. data/sig/prism.rbs +996 -961
  53. data/sig/prism_static.rbs +123 -46
  54. data/src/diagnostic.c +264 -219
  55. data/src/encoding.c +21 -26
  56. data/src/node.c +2 -6
  57. data/src/options.c +29 -5
  58. data/src/prettyprint.c +176 -44
  59. data/src/prism.c +1499 -564
  60. data/src/serialize.c +35 -21
  61. data/src/token_type.c +353 -4
  62. data/src/util/pm_buffer.c +11 -0
  63. data/src/util/pm_constant_pool.c +37 -11
  64. data/src/util/pm_newline_list.c +6 -15
  65. data/src/util/pm_string.c +0 -7
  66. data/src/util/pm_strpbrk.c +122 -14
  67. metadata +16 -5
  68. data/docs/building.md +0 -29
  69. data/lib/prism/ripper_compat.rb +0 -207
data/src/prism.c CHANGED
@@ -51,6 +51,7 @@ debug_context(pm_context_t context) {
51
51
  case PM_CONTEXT_IF: return "IF";
52
52
  case PM_CONTEXT_MAIN: return "MAIN";
53
53
  case PM_CONTEXT_MODULE: return "MODULE";
54
+ case PM_CONTEXT_NONE: return "NONE";
54
55
  case PM_CONTEXT_PARENS: return "PARENS";
55
56
  case PM_CONTEXT_POSTEXE: return "POSTEXE";
56
57
  case PM_CONTEXT_PREDICATE: return "PREDICATE";
@@ -164,7 +165,7 @@ debug_state(pm_parser_t *parser) {
164
165
 
165
166
  PRISM_ATTRIBUTE_UNUSED static void
166
167
  debug_token(pm_token_t * token) {
167
- fprintf(stderr, "%s: \"%.*s\"\n", pm_token_type_to_str(token->type), (int) (token->end - token->start), token->start);
168
+ fprintf(stderr, "%s: \"%.*s\"\n", pm_token_type_human(token->type), (int) (token->end - token->start), token->start);
168
169
  }
169
170
 
170
171
  #endif
@@ -423,6 +424,11 @@ lex_state_beg_p(pm_parser_t *parser) {
423
424
  return lex_state_p(parser, PM_LEX_STATE_BEG_ANY) || ((parser->lex_state & (PM_LEX_STATE_ARG | PM_LEX_STATE_LABELED)) == (PM_LEX_STATE_ARG | PM_LEX_STATE_LABELED));
424
425
  }
425
426
 
427
+ static inline bool
428
+ lex_state_arg_labeled_p(pm_parser_t *parser) {
429
+ return (parser->lex_state & (PM_LEX_STATE_ARG | PM_LEX_STATE_LABELED)) == (PM_LEX_STATE_ARG | PM_LEX_STATE_LABELED);
430
+ }
431
+
426
432
  static inline bool
427
433
  lex_state_arg_p(pm_parser_t *parser) {
428
434
  return lex_state_p(parser, PM_LEX_STATE_ARG_ANY);
@@ -487,7 +493,8 @@ pm_parser_err(pm_parser_t *parser, const uint8_t *start, const uint8_t *end, pm_
487
493
  /**
488
494
  * Append an error to the list of errors on the parser using a format string.
489
495
  */
490
- #define PM_PARSER_ERR_FORMAT(parser, start, end, diag_id, ...) pm_diagnostic_list_append_format(&parser->error_list, start, end, diag_id, __VA_ARGS__)
496
+ #define PM_PARSER_ERR_FORMAT(parser, start, end, diag_id, ...) \
497
+ pm_diagnostic_list_append_format(&parser->error_list, start, end, diag_id, __VA_ARGS__)
491
498
 
492
499
  /**
493
500
  * Append an error to the list of errors on the parser using the location of the
@@ -502,7 +509,8 @@ pm_parser_err_current(pm_parser_t *parser, pm_diagnostic_id_t diag_id) {
502
509
  * Append an error to the list of errors on the parser using the given location
503
510
  * using a format string.
504
511
  */
505
- #define PM_PARSER_ERR_LOCATION_FORMAT(parser, location, diag_id, ...) pm_diagnostic_list_append_format(&parser->error_list, (location)->start, (location)->end, diag_id, __VA_ARGS__)
512
+ #define PM_PARSER_ERR_LOCATION_FORMAT(parser, location, diag_id, ...) \
513
+ PM_PARSER_ERR_FORMAT(parser, (location)->start, (location)->end, diag_id, __VA_ARGS__)
506
514
 
507
515
  /**
508
516
  * Append an error to the list of errors on the parser using the location of the
@@ -517,7 +525,15 @@ pm_parser_err_node(pm_parser_t *parser, const pm_node_t *node, pm_diagnostic_id_
517
525
  * Append an error to the list of errors on the parser using the location of the
518
526
  * given node and a format string.
519
527
  */
520
- #define PM_PARSER_ERR_NODE_FORMAT(parser, node, diag_id, ...) pm_diagnostic_list_append_format(&parser->error_list, node->location.start, node->location.end, diag_id, __VA_ARGS__)
528
+ #define PM_PARSER_ERR_NODE_FORMAT(parser, node, diag_id, ...) \
529
+ PM_PARSER_ERR_FORMAT(parser, (node)->location.start, (node)->location.end, diag_id, __VA_ARGS__)
530
+
531
+ /**
532
+ * Append an error to the list of errors on the parser using the location of the
533
+ * given node and a format string, and add on the content of the node.
534
+ */
535
+ #define PM_PARSER_ERR_NODE_FORMAT_CONTENT(parser, node, diag_id) \
536
+ PM_PARSER_ERR_NODE_FORMAT(parser, node, diag_id, (int) ((node)->location.end - (node)->location.start), (const char *) (node)->location.start)
521
537
 
522
538
  /**
523
539
  * Append an error to the list of errors on the parser using the location of the
@@ -541,16 +557,22 @@ pm_parser_err_token(pm_parser_t *parser, const pm_token_t *token, pm_diagnostic_
541
557
  * Append an error to the list of errors on the parser using the location of the
542
558
  * given token and a format string.
543
559
  */
544
- #define PM_PARSER_ERR_TOKEN_FORMAT(parser, token, diag_id, ...) pm_diagnostic_list_append_format(&parser->error_list, (token).start, (token).end, diag_id, __VA_ARGS__)
560
+ #define PM_PARSER_ERR_TOKEN_FORMAT(parser, token, diag_id, ...) \
561
+ PM_PARSER_ERR_FORMAT(parser, (token).start, (token).end, diag_id, __VA_ARGS__)
562
+
563
+ /**
564
+ * Append an error to the list of errors on the parser using the location of the
565
+ * given token and a format string, and add on the content of the token.
566
+ */
567
+ #define PM_PARSER_ERR_TOKEN_FORMAT_CONTENT(parser, token, diag_id) \
568
+ PM_PARSER_ERR_TOKEN_FORMAT(parser, token, diag_id, (int) ((token).end - (token).start), (const char *) (token).start)
545
569
 
546
570
  /**
547
571
  * Append a warning to the list of warnings on the parser.
548
572
  */
549
573
  static inline void
550
574
  pm_parser_warn(pm_parser_t *parser, const uint8_t *start, const uint8_t *end, pm_diagnostic_id_t diag_id) {
551
- if (!parser->suppress_warnings) {
552
- pm_diagnostic_list_append(&parser->warning_list, start, end, diag_id);
553
- }
575
+ pm_diagnostic_list_append(&parser->warning_list, start, end, diag_id);
554
576
  }
555
577
 
556
578
  /**
@@ -813,6 +835,9 @@ typedef struct {
813
835
 
814
836
  /** The optional block attached to the call. */
815
837
  pm_node_t *block;
838
+
839
+ /** The flag indicating whether this arguments list has a forwarding argument. */
840
+ bool has_forwarding;
816
841
  } pm_arguments_t;
817
842
 
818
843
  /**
@@ -864,6 +889,105 @@ pm_arguments_validate_block(pm_parser_t *parser, pm_arguments_t *arguments, pm_b
864
889
  pm_parser_err_node(parser, (pm_node_t *) block, PM_ERR_ARGUMENT_UNEXPECTED_BLOCK);
865
890
  }
866
891
 
892
+ /******************************************************************************/
893
+ /* Basic character checks */
894
+ /******************************************************************************/
895
+
896
+ /**
897
+ * This function is used extremely frequently to lex all of the identifiers in a
898
+ * source file, so it's important that it be as fast as possible. For this
899
+ * reason we have the encoding_changed boolean to check if we need to go through
900
+ * the function pointer or can just directly use the UTF-8 functions.
901
+ */
902
+ static inline size_t
903
+ char_is_identifier_start(const pm_parser_t *parser, const uint8_t *b) {
904
+ if (parser->encoding_changed) {
905
+ size_t width;
906
+ if ((width = parser->encoding->alpha_char(b, parser->end - b)) != 0) {
907
+ return width;
908
+ } else if (*b == '_') {
909
+ return 1;
910
+ } else if (*b >= 0x80) {
911
+ return parser->encoding->char_width(b, parser->end - b);
912
+ } else {
913
+ return 0;
914
+ }
915
+ } else if (*b < 0x80) {
916
+ return (pm_encoding_unicode_table[*b] & PRISM_ENCODING_ALPHABETIC_BIT ? 1 : 0) || (*b == '_');
917
+ } else {
918
+ return pm_encoding_utf_8_char_width(b, parser->end - b);
919
+ }
920
+ }
921
+
922
+ /**
923
+ * Similar to char_is_identifier but this function assumes that the encoding
924
+ * has not been changed.
925
+ */
926
+ static inline size_t
927
+ char_is_identifier_utf8(const uint8_t *b, const uint8_t *end) {
928
+ if (*b < 0x80) {
929
+ return (*b == '_') || (pm_encoding_unicode_table[*b] & PRISM_ENCODING_ALPHANUMERIC_BIT ? 1 : 0);
930
+ } else {
931
+ return pm_encoding_utf_8_char_width(b, end - b);
932
+ }
933
+ }
934
+
935
+ /**
936
+ * Like the above, this function is also used extremely frequently to lex all of
937
+ * the identifiers in a source file once the first character has been found. So
938
+ * it's important that it be as fast as possible.
939
+ */
940
+ static inline size_t
941
+ char_is_identifier(pm_parser_t *parser, const uint8_t *b) {
942
+ if (parser->encoding_changed) {
943
+ size_t width;
944
+ if ((width = parser->encoding->alnum_char(b, parser->end - b)) != 0) {
945
+ return width;
946
+ } else if (*b == '_') {
947
+ return 1;
948
+ } else if (*b >= 0x80) {
949
+ return parser->encoding->char_width(b, parser->end - b);
950
+ } else {
951
+ return 0;
952
+ }
953
+ }
954
+ return char_is_identifier_utf8(b, parser->end);
955
+ }
956
+
957
+ // Here we're defining a perfect hash for the characters that are allowed in
958
+ // global names. This is used to quickly check the next character after a $ to
959
+ // see if it's a valid character for a global name.
960
+ #define BIT(c, idx) (((c) / 32 - 1 == idx) ? (1U << ((c) % 32)) : 0)
961
+ #define PUNCT(idx) ( \
962
+ BIT('~', idx) | BIT('*', idx) | BIT('$', idx) | BIT('?', idx) | \
963
+ BIT('!', idx) | BIT('@', idx) | BIT('/', idx) | BIT('\\', idx) | \
964
+ BIT(';', idx) | BIT(',', idx) | BIT('.', idx) | BIT('=', idx) | \
965
+ BIT(':', idx) | BIT('<', idx) | BIT('>', idx) | BIT('\"', idx) | \
966
+ BIT('&', idx) | BIT('`', idx) | BIT('\'', idx) | BIT('+', idx) | \
967
+ BIT('0', idx))
968
+
969
+ const unsigned int pm_global_name_punctuation_hash[(0x7e - 0x20 + 31) / 32] = { PUNCT(0), PUNCT(1), PUNCT(2) };
970
+
971
+ #undef BIT
972
+ #undef PUNCT
973
+
974
+ static inline bool
975
+ char_is_global_name_punctuation(const uint8_t b) {
976
+ const unsigned int i = (const unsigned int) b;
977
+ if (i <= 0x20 || 0x7e < i) return false;
978
+
979
+ return (pm_global_name_punctuation_hash[(i - 0x20) / 32] >> (i % 32)) & 1;
980
+ }
981
+
982
+ static inline bool
983
+ token_is_setter_name(pm_token_t *token) {
984
+ return (
985
+ (token->type == PM_TOKEN_IDENTIFIER) &&
986
+ (token->end - token->start >= 2) &&
987
+ (token->end[-1] == '=')
988
+ );
989
+ }
990
+
867
991
  /******************************************************************************/
868
992
  /* Node flag handling functions */
869
993
  /******************************************************************************/
@@ -884,6 +1008,22 @@ pm_node_flag_unset(pm_node_t *node, pm_node_flags_t flag) {
884
1008
  node->flags &= (pm_node_flags_t) ~flag;
885
1009
  }
886
1010
 
1011
+ /**
1012
+ * Set the repeated parameter flag on the given node.
1013
+ */
1014
+ static inline void
1015
+ pm_node_flag_set_repeated_parameter(pm_node_t *node) {
1016
+ assert(PM_NODE_TYPE(node) == PM_BLOCK_LOCAL_VARIABLE_NODE ||
1017
+ PM_NODE_TYPE(node) == PM_BLOCK_PARAMETER_NODE ||
1018
+ PM_NODE_TYPE(node) == PM_KEYWORD_REST_PARAMETER_NODE ||
1019
+ PM_NODE_TYPE(node) == PM_OPTIONAL_KEYWORD_PARAMETER_NODE ||
1020
+ PM_NODE_TYPE(node) == PM_OPTIONAL_PARAMETER_NODE ||
1021
+ PM_NODE_TYPE(node) == PM_REQUIRED_KEYWORD_PARAMETER_NODE ||
1022
+ PM_NODE_TYPE(node) == PM_REQUIRED_PARAMETER_NODE ||
1023
+ PM_NODE_TYPE(node) == PM_REST_PARAMETER_NODE);
1024
+
1025
+ pm_node_flag_set(node, PM_PARAMETER_FLAGS_REPEATED_PARAMETER);
1026
+ }
887
1027
 
888
1028
  /******************************************************************************/
889
1029
  /* Node creation functions */
@@ -977,7 +1117,7 @@ static inline void *
977
1117
  pm_alloc_node(PRISM_ATTRIBUTE_UNUSED pm_parser_t *parser, size_t size) {
978
1118
  void *memory = calloc(1, size);
979
1119
  if (memory == NULL) {
980
- fprintf(stderr, "Failed to allocate %zu bytes\n", size);
1120
+ fprintf(stderr, "Failed to allocate %d bytes\n", (int) size);
981
1121
  abort();
982
1122
  }
983
1123
  return memory;
@@ -1325,7 +1465,7 @@ pm_assoc_node_create(pm_parser_t *parser, pm_node_t *key, const pm_token_t *oper
1325
1465
  pm_assoc_node_t *node = PM_ALLOC_NODE(parser, pm_assoc_node_t);
1326
1466
  const uint8_t *end;
1327
1467
 
1328
- if (value != NULL) {
1468
+ if (value != NULL && value->location.end > key->location.end) {
1329
1469
  end = value->location.end;
1330
1470
  } else if (operator->type != PM_TOKEN_NOT_PROVIDED) {
1331
1471
  end = operator->end;
@@ -1333,6 +1473,13 @@ pm_assoc_node_create(pm_parser_t *parser, pm_node_t *key, const pm_token_t *oper
1333
1473
  end = key->location.end;
1334
1474
  }
1335
1475
 
1476
+ // Hash string keys will be frozen, so we can mark them as frozen here so
1477
+ // that the compiler picks them up and also when we check for static literal
1478
+ // on the keys it gets factored in.
1479
+ if (PM_NODE_TYPE_P(key, PM_STRING_NODE)) {
1480
+ key->flags |= PM_STRING_FLAGS_FROZEN | PM_NODE_FLAG_STATIC_LITERAL;
1481
+ }
1482
+
1336
1483
  // If the key and value of this assoc node are both static literals, then
1337
1484
  // we can mark this node as a static literal.
1338
1485
  pm_node_flags_t flags = 0;
@@ -1490,7 +1637,7 @@ pm_block_argument_node_create(pm_parser_t *parser, const pm_token_t *operator, p
1490
1637
  * Allocate and initialize a new BlockNode node.
1491
1638
  */
1492
1639
  static pm_block_node_t *
1493
- pm_block_node_create(pm_parser_t *parser, pm_constant_id_list_t *locals, uint32_t locals_body_index, const pm_token_t *opening, pm_node_t *parameters, pm_node_t *body, const pm_token_t *closing) {
1640
+ pm_block_node_create(pm_parser_t *parser, pm_constant_id_list_t *locals, const pm_token_t *opening, pm_node_t *parameters, pm_node_t *body, const pm_token_t *closing) {
1494
1641
  pm_block_node_t *node = PM_ALLOC_NODE(parser, pm_block_node_t);
1495
1642
 
1496
1643
  *node = (pm_block_node_t) {
@@ -1499,7 +1646,6 @@ pm_block_node_create(pm_parser_t *parser, pm_constant_id_list_t *locals, uint32_
1499
1646
  .location = { .start = opening->start, .end = closing->end },
1500
1647
  },
1501
1648
  .locals = *locals,
1502
- .locals_body_index = locals_body_index,
1503
1649
  .parameters = parameters,
1504
1650
  .body = body,
1505
1651
  .opening_loc = PM_LOCATION_TOKEN_VALUE(opening),
@@ -1645,12 +1791,13 @@ pm_break_node_create(pm_parser_t *parser, const pm_token_t *keyword, pm_argument
1645
1791
  * in the various specializations of this function.
1646
1792
  */
1647
1793
  static pm_call_node_t *
1648
- pm_call_node_create(pm_parser_t *parser) {
1794
+ pm_call_node_create(pm_parser_t *parser, pm_node_flags_t flags) {
1649
1795
  pm_call_node_t *node = PM_ALLOC_NODE(parser, pm_call_node_t);
1650
1796
 
1651
1797
  *node = (pm_call_node_t) {
1652
1798
  {
1653
1799
  .type = PM_CALL_NODE,
1800
+ .flags = flags,
1654
1801
  .location = PM_LOCATION_NULL_VALUE(parser),
1655
1802
  },
1656
1803
  .receiver = NULL,
@@ -1666,6 +1813,15 @@ pm_call_node_create(pm_parser_t *parser) {
1666
1813
  return node;
1667
1814
  }
1668
1815
 
1816
+ /**
1817
+ * Returns the value that the ignore visibility flag should be set to for the
1818
+ * given receiver.
1819
+ */
1820
+ static inline pm_node_flags_t
1821
+ pm_call_node_ignore_visibility_flag(const pm_node_t *receiver) {
1822
+ return PM_NODE_TYPE_P(receiver, PM_SELF_NODE) ? PM_CALL_NODE_FLAGS_IGNORE_VISIBILITY : 0;
1823
+ }
1824
+
1669
1825
  /**
1670
1826
  * Allocate and initialize a new CallNode node from an aref or an aset
1671
1827
  * expression.
@@ -1674,7 +1830,7 @@ static pm_call_node_t *
1674
1830
  pm_call_node_aref_create(pm_parser_t *parser, pm_node_t *receiver, pm_arguments_t *arguments) {
1675
1831
  pm_assert_value_expression(parser, receiver);
1676
1832
 
1677
- pm_call_node_t *node = pm_call_node_create(parser);
1833
+ pm_call_node_t *node = pm_call_node_create(parser, pm_call_node_ignore_visibility_flag(receiver));
1678
1834
 
1679
1835
  node->base.location.start = receiver->location.start;
1680
1836
  node->base.location.end = pm_arguments_end(arguments);
@@ -1700,7 +1856,7 @@ pm_call_node_binary_create(pm_parser_t *parser, pm_node_t *receiver, pm_token_t
1700
1856
  pm_assert_value_expression(parser, receiver);
1701
1857
  pm_assert_value_expression(parser, argument);
1702
1858
 
1703
- pm_call_node_t *node = pm_call_node_create(parser);
1859
+ pm_call_node_t *node = pm_call_node_create(parser, pm_call_node_ignore_visibility_flag(receiver));
1704
1860
 
1705
1861
  node->base.location.start = MIN(receiver->location.start, argument->location.start);
1706
1862
  node->base.location.end = MAX(receiver->location.end, argument->location.end);
@@ -1723,7 +1879,7 @@ static pm_call_node_t *
1723
1879
  pm_call_node_call_create(pm_parser_t *parser, pm_node_t *receiver, pm_token_t *operator, pm_token_t *message, pm_arguments_t *arguments) {
1724
1880
  pm_assert_value_expression(parser, receiver);
1725
1881
 
1726
- pm_call_node_t *node = pm_call_node_create(parser);
1882
+ pm_call_node_t *node = pm_call_node_create(parser, pm_call_node_ignore_visibility_flag(receiver));
1727
1883
 
1728
1884
  node->base.location.start = receiver->location.start;
1729
1885
  const uint8_t *end = pm_arguments_end(arguments);
@@ -1754,7 +1910,7 @@ pm_call_node_call_create(pm_parser_t *parser, pm_node_t *receiver, pm_token_t *o
1754
1910
  */
1755
1911
  static pm_call_node_t *
1756
1912
  pm_call_node_fcall_create(pm_parser_t *parser, pm_token_t *message, pm_arguments_t *arguments) {
1757
- pm_call_node_t *node = pm_call_node_create(parser);
1913
+ pm_call_node_t *node = pm_call_node_create(parser, PM_CALL_NODE_FLAGS_IGNORE_VISIBILITY);
1758
1914
 
1759
1915
  node->base.location.start = message->start;
1760
1916
  node->base.location.end = pm_arguments_end(arguments);
@@ -1776,7 +1932,7 @@ static pm_call_node_t *
1776
1932
  pm_call_node_not_create(pm_parser_t *parser, pm_node_t *receiver, pm_token_t *message, pm_arguments_t *arguments) {
1777
1933
  pm_assert_value_expression(parser, receiver);
1778
1934
 
1779
- pm_call_node_t *node = pm_call_node_create(parser);
1935
+ pm_call_node_t *node = pm_call_node_create(parser, receiver == NULL ? 0 : pm_call_node_ignore_visibility_flag(receiver));
1780
1936
 
1781
1937
  node->base.location.start = message->start;
1782
1938
  if (arguments->closing_loc.start != NULL) {
@@ -1802,7 +1958,7 @@ static pm_call_node_t *
1802
1958
  pm_call_node_shorthand_create(pm_parser_t *parser, pm_node_t *receiver, pm_token_t *operator, pm_arguments_t *arguments) {
1803
1959
  pm_assert_value_expression(parser, receiver);
1804
1960
 
1805
- pm_call_node_t *node = pm_call_node_create(parser);
1961
+ pm_call_node_t *node = pm_call_node_create(parser, pm_call_node_ignore_visibility_flag(receiver));
1806
1962
 
1807
1963
  node->base.location.start = receiver->location.start;
1808
1964
  node->base.location.end = pm_arguments_end(arguments);
@@ -1829,7 +1985,7 @@ static pm_call_node_t *
1829
1985
  pm_call_node_unary_create(pm_parser_t *parser, pm_token_t *operator, pm_node_t *receiver, const char *name) {
1830
1986
  pm_assert_value_expression(parser, receiver);
1831
1987
 
1832
- pm_call_node_t *node = pm_call_node_create(parser);
1988
+ pm_call_node_t *node = pm_call_node_create(parser, pm_call_node_ignore_visibility_flag(receiver));
1833
1989
 
1834
1990
  node->base.location.start = operator->start;
1835
1991
  node->base.location.end = receiver->location.end;
@@ -1847,7 +2003,7 @@ pm_call_node_unary_create(pm_parser_t *parser, pm_token_t *operator, pm_node_t *
1847
2003
  */
1848
2004
  static pm_call_node_t *
1849
2005
  pm_call_node_variable_call_create(pm_parser_t *parser, pm_token_t *message) {
1850
- pm_call_node_t *node = pm_call_node_create(parser);
2006
+ pm_call_node_t *node = pm_call_node_create(parser, PM_CALL_NODE_FLAGS_IGNORE_VISIBILITY);
1851
2007
 
1852
2008
  node->base.location = PM_LOCATION_TOKEN_VALUE(message);
1853
2009
  node->message_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(message);
@@ -1885,11 +2041,12 @@ pm_call_node_index_p(pm_call_node_t *node) {
1885
2041
  * operator assignment.
1886
2042
  */
1887
2043
  static inline bool
1888
- pm_call_node_writable_p(pm_call_node_t *node) {
2044
+ pm_call_node_writable_p(const pm_parser_t *parser, const pm_call_node_t *node) {
1889
2045
  return (
1890
2046
  (node->message_loc.start != NULL) &&
1891
2047
  (node->message_loc.end[-1] != '!') &&
1892
2048
  (node->message_loc.end[-1] != '?') &&
2049
+ char_is_identifier_start(parser, node->message_loc.start) &&
1893
2050
  (node->opening_loc.start == NULL) &&
1894
2051
  (node->arguments == NULL) &&
1895
2052
  (node->block == NULL)
@@ -2167,11 +2324,12 @@ pm_call_target_node_create(pm_parser_t *parser, pm_call_node_t *target) {
2167
2324
  static pm_index_target_node_t *
2168
2325
  pm_index_target_node_create(pm_parser_t *parser, pm_call_node_t *target) {
2169
2326
  pm_index_target_node_t *node = PM_ALLOC_NODE(parser, pm_index_target_node_t);
2327
+ pm_node_flags_t flags = target->base.flags;
2170
2328
 
2171
2329
  *node = (pm_index_target_node_t) {
2172
2330
  {
2173
2331
  .type = PM_INDEX_TARGET_NODE,
2174
- .flags = target->base.flags,
2332
+ .flags = flags | PM_CALL_NODE_FLAGS_ATTRIBUTE_WRITE,
2175
2333
  .location = target->base.location
2176
2334
  },
2177
2335
  .receiver = target->receiver,
@@ -2701,18 +2859,62 @@ pm_constant_write_node_create(pm_parser_t *parser, pm_constant_read_node_t *targ
2701
2859
  return node;
2702
2860
  }
2703
2861
 
2862
+ /**
2863
+ * Check if the receiver of a `def` node is allowed.
2864
+ */
2865
+ static void
2866
+ pm_def_node_receiver_check(pm_parser_t *parser, const pm_node_t *node) {
2867
+ switch (PM_NODE_TYPE(node)) {
2868
+ case PM_BEGIN_NODE: {
2869
+ const pm_begin_node_t *cast = (pm_begin_node_t *) node;
2870
+ if (cast->statements != NULL) pm_def_node_receiver_check(parser, (pm_node_t *) cast->statements);
2871
+ break;
2872
+ }
2873
+ case PM_PARENTHESES_NODE: {
2874
+ const pm_parentheses_node_t *cast = (const pm_parentheses_node_t *) node;
2875
+ if (cast->body != NULL) pm_def_node_receiver_check(parser, cast->body);
2876
+ break;
2877
+ }
2878
+ case PM_STATEMENTS_NODE: {
2879
+ const pm_statements_node_t *cast = (const pm_statements_node_t *) node;
2880
+ pm_def_node_receiver_check(parser, cast->body.nodes[cast->body.size - 1]);
2881
+ break;
2882
+ }
2883
+ case PM_ARRAY_NODE:
2884
+ case PM_FLOAT_NODE:
2885
+ case PM_IMAGINARY_NODE:
2886
+ case PM_INTEGER_NODE:
2887
+ case PM_INTERPOLATED_REGULAR_EXPRESSION_NODE:
2888
+ case PM_INTERPOLATED_STRING_NODE:
2889
+ case PM_INTERPOLATED_SYMBOL_NODE:
2890
+ case PM_INTERPOLATED_X_STRING_NODE:
2891
+ case PM_RATIONAL_NODE:
2892
+ case PM_REGULAR_EXPRESSION_NODE:
2893
+ case PM_SOURCE_ENCODING_NODE:
2894
+ case PM_SOURCE_FILE_NODE:
2895
+ case PM_SOURCE_LINE_NODE:
2896
+ case PM_STRING_NODE:
2897
+ case PM_SYMBOL_NODE:
2898
+ case PM_X_STRING_NODE:
2899
+ pm_parser_err_node(parser, node, PM_ERR_SINGLETON_FOR_LITERALS);
2900
+ break;
2901
+ default:
2902
+ break;
2903
+ }
2904
+ }
2905
+
2704
2906
  /**
2705
2907
  * Allocate and initialize a new DefNode node.
2706
2908
  */
2707
2909
  static pm_def_node_t *
2708
2910
  pm_def_node_create(
2709
2911
  pm_parser_t *parser,
2710
- const pm_token_t *name,
2912
+ pm_constant_id_t name,
2913
+ const pm_token_t *name_loc,
2711
2914
  pm_node_t *receiver,
2712
2915
  pm_parameters_node_t *parameters,
2713
2916
  pm_node_t *body,
2714
2917
  pm_constant_id_list_t *locals,
2715
- uint32_t locals_body_index,
2716
2918
  const pm_token_t *def_keyword,
2717
2919
  const pm_token_t *operator,
2718
2920
  const pm_token_t *lparen,
@@ -2729,18 +2931,21 @@ pm_def_node_create(
2729
2931
  end = end_keyword->end;
2730
2932
  }
2731
2933
 
2934
+ if ((receiver != NULL) && PM_NODE_TYPE_P(receiver, PM_PARENTHESES_NODE)) {
2935
+ pm_def_node_receiver_check(parser, receiver);
2936
+ }
2937
+
2732
2938
  *node = (pm_def_node_t) {
2733
2939
  {
2734
2940
  .type = PM_DEF_NODE,
2735
2941
  .location = { .start = def_keyword->start, .end = end },
2736
2942
  },
2737
- .name = pm_parser_constant_id_token(parser, name),
2738
- .name_loc = PM_LOCATION_TOKEN_VALUE(name),
2943
+ .name = name,
2944
+ .name_loc = PM_LOCATION_TOKEN_VALUE(name_loc),
2739
2945
  .receiver = receiver,
2740
2946
  .parameters = parameters,
2741
2947
  .body = body,
2742
2948
  .locals = *locals,
2743
- .locals_body_index = locals_body_index,
2744
2949
  .def_keyword_loc = PM_LOCATION_TOKEN_VALUE(def_keyword),
2745
2950
  .operator_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(operator),
2746
2951
  .lparen_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(lparen),
@@ -3962,9 +4167,8 @@ pm_keyword_hash_node_create(pm_parser_t *parser) {
3962
4167
  */
3963
4168
  static void
3964
4169
  pm_keyword_hash_node_elements_append(pm_keyword_hash_node_t *hash, pm_node_t *element) {
3965
- // If the element being added is not an AssocNode or does not have a symbol key, then
3966
- // we want to turn the STATIC_KEYS flag off.
3967
- // TODO: Rename the flag to SYMBOL_KEYS instead.
4170
+ // If the element being added is not an AssocNode or does not have a symbol
4171
+ // key, then we want to turn the SYMBOL_KEYS flag off.
3968
4172
  if (!PM_NODE_TYPE_P(element, PM_ASSOC_NODE) || !PM_NODE_TYPE_P(((pm_assoc_node_t *) element)->key, PM_SYMBOL_NODE)) {
3969
4173
  pm_node_flag_unset((pm_node_t *)hash, PM_KEYWORD_HASH_NODE_FLAGS_SYMBOL_KEYS);
3970
4174
  }
@@ -4051,7 +4255,6 @@ static pm_lambda_node_t *
4051
4255
  pm_lambda_node_create(
4052
4256
  pm_parser_t *parser,
4053
4257
  pm_constant_id_list_t *locals,
4054
- uint32_t locals_body_index,
4055
4258
  const pm_token_t *operator,
4056
4259
  const pm_token_t *opening,
4057
4260
  const pm_token_t *closing,
@@ -4069,7 +4272,6 @@ pm_lambda_node_create(
4069
4272
  },
4070
4273
  },
4071
4274
  .locals = *locals,
4072
- .locals_body_index = locals_body_index,
4073
4275
  .operator_loc = PM_LOCATION_TOKEN_VALUE(operator),
4074
4276
  .opening_loc = PM_LOCATION_TOKEN_VALUE(opening),
4075
4277
  .closing_loc = PM_LOCATION_TOKEN_VALUE(closing),
@@ -4161,12 +4363,10 @@ pm_local_variable_or_write_node_create(pm_parser_t *parser, pm_node_t *target, c
4161
4363
  }
4162
4364
 
4163
4365
  /**
4164
- * Allocate a new LocalVariableReadNode node.
4366
+ * Allocate a new LocalVariableReadNode node with constant_id.
4165
4367
  */
4166
4368
  static pm_local_variable_read_node_t *
4167
- pm_local_variable_read_node_create(pm_parser_t *parser, const pm_token_t *name, uint32_t depth) {
4168
- pm_constant_id_t name_id = pm_parser_constant_id_token(parser, name);
4169
-
4369
+ pm_local_variable_read_node_create_constant_id(pm_parser_t *parser, const pm_token_t *name, pm_constant_id_t name_id, uint32_t depth) {
4170
4370
  if (parser->current_param_name == name_id) {
4171
4371
  pm_parser_err_token(parser, name, PM_ERR_PARAMETER_CIRCULAR);
4172
4372
  }
@@ -4185,6 +4385,15 @@ pm_local_variable_read_node_create(pm_parser_t *parser, const pm_token_t *name,
4185
4385
  return node;
4186
4386
  }
4187
4387
 
4388
+ /**
4389
+ * Allocate a new LocalVariableReadNode node.
4390
+ */
4391
+ static pm_local_variable_read_node_t *
4392
+ pm_local_variable_read_node_create(pm_parser_t *parser, const pm_token_t *name, uint32_t depth) {
4393
+ pm_constant_id_t name_id = pm_parser_constant_id_token(parser, name);
4394
+ return pm_local_variable_read_node_create_constant_id(parser, name, name_id, depth);
4395
+ }
4396
+
4188
4397
  /**
4189
4398
  * Allocate and initialize a new LocalVariableWriteNode node.
4190
4399
  */
@@ -4210,6 +4419,57 @@ pm_local_variable_write_node_create(pm_parser_t *parser, pm_constant_id_t name,
4210
4419
  return node;
4211
4420
  }
4212
4421
 
4422
+ /**
4423
+ * Returns true if the given bounds comprise `it`.
4424
+ */
4425
+ static inline bool
4426
+ pm_token_is_it(const uint8_t *start, const uint8_t *end) {
4427
+ return (end - start == 2) && (start[0] == 'i') && (start[1] == 't');
4428
+ }
4429
+
4430
+ /**
4431
+ * Returns true if the given node is the `it` default parameter.
4432
+ */
4433
+ static inline bool
4434
+ pm_node_is_it(pm_parser_t *parser, pm_node_t *node) {
4435
+ // Check if it's a call node
4436
+ if (node->type != PM_CALL_NODE) {
4437
+ return false;
4438
+ }
4439
+
4440
+ // Check if it's a variable call
4441
+ pm_call_node_t *call_node = (pm_call_node_t *) node;
4442
+ if (!pm_call_node_variable_call_p(call_node)) {
4443
+ return false;
4444
+ }
4445
+
4446
+ // Check if it's called `it`
4447
+ pm_constant_id_t id = ((pm_call_node_t *)node)->name;
4448
+ pm_constant_t *constant = pm_constant_pool_id_to_constant(&parser->constant_pool, id);
4449
+ return pm_token_is_it(constant->start, constant->start + constant->length);
4450
+ }
4451
+
4452
+ /**
4453
+ * Convert an `it` variable call node to a node for the `it` default parameter.
4454
+ */
4455
+ static pm_node_t *
4456
+ pm_node_check_it(pm_parser_t *parser, pm_node_t *node) {
4457
+ if (
4458
+ (parser->version != PM_OPTIONS_VERSION_CRUBY_3_3_0) &&
4459
+ !parser->current_scope->closed &&
4460
+ pm_node_is_it(parser, node)
4461
+ ) {
4462
+ if (parser->current_scope->explicit_params) {
4463
+ pm_parser_err_previous(parser, PM_ERR_IT_NOT_ALLOWED);
4464
+ } else {
4465
+ pm_node_destroy(parser, node);
4466
+ pm_constant_id_t name_id = pm_parser_constant_id_constant(parser, "0it", 3);
4467
+ node = (pm_node_t *) pm_local_variable_read_node_create_constant_id(parser, &parser->previous, name_id, 0);
4468
+ }
4469
+ }
4470
+ return node;
4471
+ }
4472
+
4213
4473
  /**
4214
4474
  * Returns true if the given bounds comprise a numbered parameter (i.e., they
4215
4475
  * are of the form /^_\d$/).
@@ -4402,13 +4662,20 @@ pm_multi_target_node_create(pm_parser_t *parser) {
4402
4662
  */
4403
4663
  static void
4404
4664
  pm_multi_target_node_targets_append(pm_parser_t *parser, pm_multi_target_node_t *node, pm_node_t *target) {
4405
- if (PM_NODE_TYPE_P(target, PM_SPLAT_NODE) || PM_NODE_TYPE_P(target, PM_IMPLICIT_REST_NODE)) {
4665
+ if (PM_NODE_TYPE_P(target, PM_SPLAT_NODE)) {
4406
4666
  if (node->rest == NULL) {
4407
4667
  node->rest = target;
4408
4668
  } else {
4409
4669
  pm_parser_err_node(parser, target, PM_ERR_MULTI_ASSIGN_MULTI_SPLATS);
4410
4670
  pm_node_list_append(&node->rights, target);
4411
4671
  }
4672
+ } else if (PM_NODE_TYPE_P(target, PM_IMPLICIT_REST_NODE)) {
4673
+ if (node->rest == NULL) {
4674
+ node->rest = target;
4675
+ } else {
4676
+ PM_PARSER_ERR_TOKEN_FORMAT_CONTENT(parser, parser->current, PM_ERR_MULTI_ASSIGN_UNEXPECTED_REST);
4677
+ pm_node_list_append(&node->rights, target);
4678
+ }
4412
4679
  } else if (node->rest == NULL) {
4413
4680
  pm_node_list_append(&node->lefts, target);
4414
4681
  } else {
@@ -5195,7 +5462,7 @@ pm_source_file_node_create(pm_parser_t *parser, const pm_token_t *file_keyword)
5195
5462
  .flags = PM_NODE_FLAG_STATIC_LITERAL,
5196
5463
  .location = PM_LOCATION_TOKEN_VALUE(file_keyword),
5197
5464
  },
5198
- .filepath = parser->filepath_string,
5465
+ .filepath = parser->filepath
5199
5466
  };
5200
5467
 
5201
5468
  return node;
@@ -5372,18 +5639,59 @@ pm_super_node_create(pm_parser_t *parser, const pm_token_t *keyword, pm_argument
5372
5639
  return node;
5373
5640
  }
5374
5641
 
5642
+ /**
5643
+ * Read through the contents of a string and check if it consists solely of US ASCII code points.
5644
+ */
5645
+ static bool
5646
+ pm_ascii_only_p(const pm_string_t *contents) {
5647
+ const size_t length = pm_string_length(contents);
5648
+ const uint8_t *source = pm_string_source(contents);
5649
+
5650
+ for (size_t index = 0; index < length; index++) {
5651
+ if (source[index] & 0x80) return false;
5652
+ }
5653
+
5654
+ return true;
5655
+ }
5656
+
5657
+ /**
5658
+ * Ruby "downgrades" the encoding of Symbols to US-ASCII if the associated
5659
+ * encoding is ASCII-compatible and the Symbol consists only of US-ASCII code
5660
+ * points. Otherwise, the encoding may be explicitly set with an escape
5661
+ * sequence.
5662
+ */
5663
+ static inline pm_node_flags_t
5664
+ parse_symbol_encoding(const pm_parser_t *parser, const pm_string_t *contents) {
5665
+ if (parser->explicit_encoding != NULL) {
5666
+ // A Symbol may optionally have its encoding explicitly set. This will
5667
+ // happen if an escape sequence results in a non-ASCII code point.
5668
+ if (parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) {
5669
+ return PM_SYMBOL_FLAGS_FORCED_UTF8_ENCODING;
5670
+ } else if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY) {
5671
+ return PM_SYMBOL_FLAGS_FORCED_BINARY_ENCODING;
5672
+ }
5673
+ } else if (pm_ascii_only_p(contents)) {
5674
+ // Ruby stipulates that all source files must use an ASCII-compatible
5675
+ // encoding. Thus, all symbols appearing in source are eligible for
5676
+ // "downgrading" to US-ASCII.
5677
+ return PM_SYMBOL_FLAGS_FORCED_US_ASCII_ENCODING;
5678
+ }
5679
+
5680
+ return 0;
5681
+ }
5682
+
5375
5683
  /**
5376
5684
  * Allocate and initialize a new SymbolNode node with the given unescaped
5377
5685
  * string.
5378
5686
  */
5379
5687
  static pm_symbol_node_t *
5380
- pm_symbol_node_create_unescaped(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *value, const pm_token_t *closing, const pm_string_t *unescaped) {
5688
+ pm_symbol_node_create_unescaped(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *value, const pm_token_t *closing, const pm_string_t *unescaped, pm_node_flags_t flags) {
5381
5689
  pm_symbol_node_t *node = PM_ALLOC_NODE(parser, pm_symbol_node_t);
5382
5690
 
5383
5691
  *node = (pm_symbol_node_t) {
5384
5692
  {
5385
5693
  .type = PM_SYMBOL_NODE,
5386
- .flags = PM_NODE_FLAG_STATIC_LITERAL,
5694
+ .flags = PM_NODE_FLAG_STATIC_LITERAL | flags,
5387
5695
  .location = {
5388
5696
  .start = (opening->type == PM_TOKEN_NOT_PROVIDED ? value->start : opening->start),
5389
5697
  .end = (closing->type == PM_TOKEN_NOT_PROVIDED ? value->end : closing->end)
@@ -5403,7 +5711,7 @@ pm_symbol_node_create_unescaped(pm_parser_t *parser, const pm_token_t *opening,
5403
5711
  */
5404
5712
  static inline pm_symbol_node_t *
5405
5713
  pm_symbol_node_create(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *value, const pm_token_t *closing) {
5406
- return pm_symbol_node_create_unescaped(parser, opening, value, closing, &PM_STRING_EMPTY);
5714
+ return pm_symbol_node_create_unescaped(parser, opening, value, closing, &PM_STRING_EMPTY, 0);
5407
5715
  }
5408
5716
 
5409
5717
  /**
@@ -5411,7 +5719,7 @@ pm_symbol_node_create(pm_parser_t *parser, const pm_token_t *opening, const pm_t
5411
5719
  */
5412
5720
  static pm_symbol_node_t *
5413
5721
  pm_symbol_node_create_current_string(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *value, const pm_token_t *closing) {
5414
- pm_symbol_node_t *node = pm_symbol_node_create_unescaped(parser, opening, value, closing, &parser->current_string);
5722
+ pm_symbol_node_t *node = pm_symbol_node_create_unescaped(parser, opening, value, closing, &parser->current_string, parse_symbol_encoding(parser, &parser->current_string));
5415
5723
  parser->current_string = PM_STRING_EMPTY;
5416
5724
  return node;
5417
5725
  }
@@ -5433,6 +5741,8 @@ pm_symbol_node_label_create(pm_parser_t *parser, const pm_token_t *token) {
5433
5741
 
5434
5742
  assert((label.end - label.start) >= 0);
5435
5743
  pm_string_shared_init(&node->unescaped, label.start, label.end);
5744
+ pm_node_flag_set((pm_node_t *) node, parse_symbol_encoding(parser, &node->unescaped));
5745
+
5436
5746
  break;
5437
5747
  }
5438
5748
  case PM_TOKEN_MISSING: {
@@ -5495,6 +5805,8 @@ pm_string_node_to_symbol_node(pm_parser_t *parser, pm_string_node_t *node, const
5495
5805
  .unescaped = node->unescaped
5496
5806
  };
5497
5807
 
5808
+ pm_node_flag_set((pm_node_t *)new_node, parse_symbol_encoding(parser, &node->unescaped));
5809
+
5498
5810
  // We are explicitly _not_ using pm_node_destroy here because we don't want
5499
5811
  // to trash the unescaped string. We could instead copy the string if we
5500
5812
  // know that it is owned, but we're taking the fast path for now.
@@ -5885,6 +6197,7 @@ pm_parser_scope_push(pm_parser_t *parser, bool closed) {
5885
6197
  .closed = closed,
5886
6198
  .explicit_params = false,
5887
6199
  .numbered_parameters = 0,
6200
+ .forwarding_params = 0,
5888
6201
  };
5889
6202
 
5890
6203
  pm_constant_id_list_init(&scope->locals);
@@ -5893,6 +6206,76 @@ pm_parser_scope_push(pm_parser_t *parser, bool closed) {
5893
6206
  return true;
5894
6207
  }
5895
6208
 
6209
+ static void
6210
+ pm_parser_scope_forwarding_param_check(pm_parser_t *parser, const pm_token_t * token, const uint8_t mask, pm_diagnostic_id_t diag)
6211
+ {
6212
+ pm_scope_t *scope = parser->current_scope;
6213
+ while (scope) {
6214
+ if (scope->forwarding_params & mask) {
6215
+ if (!scope->closed) {
6216
+ pm_parser_err_token(parser, token, diag);
6217
+ return;
6218
+ }
6219
+ return;
6220
+ }
6221
+ if (scope->closed) break;
6222
+ scope = scope->previous;
6223
+ }
6224
+
6225
+ pm_parser_err_token(parser, token, diag);
6226
+ }
6227
+
6228
+ static inline void
6229
+ pm_parser_scope_forwarding_block_check(pm_parser_t *parser, const pm_token_t * token)
6230
+ {
6231
+ pm_parser_scope_forwarding_param_check(parser, token, PM_FORWARDING_BLOCK, PM_ERR_ARGUMENT_NO_FORWARDING_AMP);
6232
+ }
6233
+
6234
+ static void
6235
+ pm_parser_scope_forwarding_positionals_check(pm_parser_t *parser, const pm_token_t * token)
6236
+ {
6237
+ pm_parser_scope_forwarding_param_check(parser, token, PM_FORWARDING_POSITIONALS, PM_ERR_ARGUMENT_NO_FORWARDING_STAR);
6238
+ }
6239
+
6240
+ static inline void
6241
+ pm_parser_scope_forwarding_all_check(pm_parser_t *parser, const pm_token_t * token)
6242
+ {
6243
+ pm_parser_scope_forwarding_param_check(parser, token, PM_FORWARDING_ALL, PM_ERR_ARGUMENT_NO_FORWARDING_ELLIPSES);
6244
+ }
6245
+
6246
+ static inline void
6247
+ pm_parser_scope_forwarding_keywords_check(pm_parser_t *parser, const pm_token_t * token)
6248
+ {
6249
+ pm_parser_scope_forwarding_param_check(parser, token, PM_FORWARDING_KEYWORDS, PM_ERR_EXPECT_EXPRESSION_AFTER_SPLAT_HASH);
6250
+ }
6251
+
6252
+ /**
6253
+ * Save the current param name as the return value and set it to the given
6254
+ * constant id.
6255
+ */
6256
+ static inline pm_constant_id_t
6257
+ pm_parser_current_param_name_set(pm_parser_t *parser, pm_constant_id_t current_param_name) {
6258
+ pm_constant_id_t saved_param_name = parser->current_param_name;
6259
+ parser->current_param_name = current_param_name;
6260
+ return saved_param_name;
6261
+ }
6262
+
6263
+ /**
6264
+ * Save the current param name as the return value and clear it.
6265
+ */
6266
+ static inline pm_constant_id_t
6267
+ pm_parser_current_param_name_unset(pm_parser_t *parser) {
6268
+ return pm_parser_current_param_name_set(parser, PM_CONSTANT_ID_UNSET);
6269
+ }
6270
+
6271
+ /**
6272
+ * Restore the current param name from the given value.
6273
+ */
6274
+ static inline void
6275
+ pm_parser_current_param_name_restore(pm_parser_t *parser, pm_constant_id_t saved_param_name) {
6276
+ parser->current_param_name = saved_param_name;
6277
+ }
6278
+
5896
6279
  /**
5897
6280
  * Check if any of the currently visible scopes contain a local variable
5898
6281
  * described by the given constant id.
@@ -5969,26 +6352,41 @@ pm_parser_local_add_owned(pm_parser_t *parser, const uint8_t *start, size_t leng
5969
6352
  return constant_id;
5970
6353
  }
5971
6354
 
6355
+ /**
6356
+ * Add a local variable from a constant string to the current scope.
6357
+ */
6358
+ static pm_constant_id_t
6359
+ pm_parser_local_add_constant(pm_parser_t *parser, const char *start, size_t length) {
6360
+ pm_constant_id_t constant_id = pm_parser_constant_id_constant(parser, start, length);
6361
+ if (constant_id != 0) pm_parser_local_add(parser, constant_id);
6362
+ return constant_id;
6363
+ }
6364
+
5972
6365
  /**
5973
6366
  * Add a parameter name to the current scope and check whether the name of the
5974
6367
  * parameter is unique or not.
6368
+ *
6369
+ * Returns `true` if this is a duplicate parameter name, otherwise returns
6370
+ * false.
5975
6371
  */
5976
- static void
6372
+ static bool
5977
6373
  pm_parser_parameter_name_check(pm_parser_t *parser, const pm_token_t *name) {
5978
6374
  // We want to check whether the parameter name is a numbered parameter or
5979
6375
  // not.
5980
6376
  pm_refute_numbered_parameter(parser, name->start, name->end);
5981
6377
 
5982
- // We want to ignore any parameter name that starts with an underscore.
5983
- if ((name->start < name->end) && (*name->start == '_')) return;
5984
-
5985
6378
  // Otherwise we'll fetch the constant id for the parameter name and check
5986
6379
  // whether it's already in the current scope.
5987
6380
  pm_constant_id_t constant_id = pm_parser_constant_id_token(parser, name);
5988
6381
 
5989
6382
  if (pm_constant_id_list_includes(&parser->current_scope->locals, constant_id)) {
5990
- pm_parser_err_token(parser, name, PM_ERR_PARAMETER_NAME_REPEAT);
6383
+ // Add an error if the parameter doesn't start with _ and has been seen before
6384
+ if ((name->start < name->end) && (*name->start != '_')) {
6385
+ pm_parser_err_token(parser, name, PM_ERR_PARAMETER_NAME_REPEAT);
6386
+ }
6387
+ return true;
5991
6388
  }
6389
+ return false;
5992
6390
  }
5993
6391
 
5994
6392
  /**
@@ -6003,105 +6401,6 @@ pm_parser_scope_pop(pm_parser_t *parser) {
6003
6401
  free(scope);
6004
6402
  }
6005
6403
 
6006
- /******************************************************************************/
6007
- /* Basic character checks */
6008
- /******************************************************************************/
6009
-
6010
- /**
6011
- * This function is used extremely frequently to lex all of the identifiers in a
6012
- * source file, so it's important that it be as fast as possible. For this
6013
- * reason we have the encoding_changed boolean to check if we need to go through
6014
- * the function pointer or can just directly use the UTF-8 functions.
6015
- */
6016
- static inline size_t
6017
- char_is_identifier_start(pm_parser_t *parser, const uint8_t *b) {
6018
- if (parser->encoding_changed) {
6019
- size_t width;
6020
- if ((width = parser->encoding->alpha_char(b, parser->end - b)) != 0) {
6021
- return width;
6022
- } else if (*b == '_') {
6023
- return 1;
6024
- } else if (*b >= 0x80) {
6025
- return parser->encoding->char_width(b, parser->end - b);
6026
- } else {
6027
- return 0;
6028
- }
6029
- } else if (*b < 0x80) {
6030
- return (pm_encoding_unicode_table[*b] & PRISM_ENCODING_ALPHABETIC_BIT ? 1 : 0) || (*b == '_');
6031
- } else {
6032
- return (size_t) (pm_encoding_utf_8_alpha_char(b, parser->end - b) || 1u);
6033
- }
6034
- }
6035
-
6036
- /**
6037
- * Similar to char_is_identifier but this function assumes that the encoding
6038
- * has not been changed.
6039
- */
6040
- static inline size_t
6041
- char_is_identifier_utf8(const uint8_t *b, const uint8_t *end) {
6042
- if (*b < 0x80) {
6043
- return (*b == '_') || (pm_encoding_unicode_table[*b] & PRISM_ENCODING_ALPHANUMERIC_BIT ? 1 : 0);
6044
- } else {
6045
- return (size_t) (pm_encoding_utf_8_alnum_char(b, end - b) || 1u);
6046
- }
6047
- }
6048
-
6049
- /**
6050
- * Like the above, this function is also used extremely frequently to lex all of
6051
- * the identifiers in a source file once the first character has been found. So
6052
- * it's important that it be as fast as possible.
6053
- */
6054
- static inline size_t
6055
- char_is_identifier(pm_parser_t *parser, const uint8_t *b) {
6056
- if (parser->encoding_changed) {
6057
- size_t width;
6058
- if ((width = parser->encoding->alnum_char(b, parser->end - b)) != 0) {
6059
- return width;
6060
- } else if (*b == '_') {
6061
- return 1;
6062
- } else if (*b >= 0x80) {
6063
- return parser->encoding->char_width(b, parser->end - b);
6064
- } else {
6065
- return 0;
6066
- }
6067
- }
6068
- return char_is_identifier_utf8(b, parser->end);
6069
- }
6070
-
6071
- // Here we're defining a perfect hash for the characters that are allowed in
6072
- // global names. This is used to quickly check the next character after a $ to
6073
- // see if it's a valid character for a global name.
6074
- #define BIT(c, idx) (((c) / 32 - 1 == idx) ? (1U << ((c) % 32)) : 0)
6075
- #define PUNCT(idx) ( \
6076
- BIT('~', idx) | BIT('*', idx) | BIT('$', idx) | BIT('?', idx) | \
6077
- BIT('!', idx) | BIT('@', idx) | BIT('/', idx) | BIT('\\', idx) | \
6078
- BIT(';', idx) | BIT(',', idx) | BIT('.', idx) | BIT('=', idx) | \
6079
- BIT(':', idx) | BIT('<', idx) | BIT('>', idx) | BIT('\"', idx) | \
6080
- BIT('&', idx) | BIT('`', idx) | BIT('\'', idx) | BIT('+', idx) | \
6081
- BIT('0', idx))
6082
-
6083
- const unsigned int pm_global_name_punctuation_hash[(0x7e - 0x20 + 31) / 32] = { PUNCT(0), PUNCT(1), PUNCT(2) };
6084
-
6085
- #undef BIT
6086
- #undef PUNCT
6087
-
6088
- static inline bool
6089
- char_is_global_name_punctuation(const uint8_t b) {
6090
- const unsigned int i = (const unsigned int) b;
6091
- if (i <= 0x20 || 0x7e < i) return false;
6092
-
6093
- return (pm_global_name_punctuation_hash[(i - 0x20) / 32] >> (i % 32)) & 1;
6094
- }
6095
-
6096
- static inline bool
6097
- token_is_setter_name(pm_token_t *token) {
6098
- return (
6099
- (token->type == PM_TOKEN_IDENTIFIER) &&
6100
- (token->end - token->start >= 2) &&
6101
- (token->end[-1] == '=')
6102
- );
6103
- }
6104
-
6105
6404
  /******************************************************************************/
6106
6405
  /* Stack helpers */
6107
6406
  /******************************************************************************/
@@ -6317,8 +6616,10 @@ parser_lex_magic_comment_encoding(pm_parser_t *parser) {
6317
6616
  */
6318
6617
  static void
6319
6618
  parser_lex_magic_comment_frozen_string_literal_value(pm_parser_t *parser, const uint8_t *start, const uint8_t *end) {
6320
- if (start + 4 <= end && pm_strncasecmp(start, (const uint8_t *) "true", 4) == 0) {
6619
+ if ((start + 4 <= end) && pm_strncasecmp(start, (const uint8_t *) "true", 4) == 0) {
6321
6620
  parser->frozen_string_literal = true;
6621
+ } else if ((start + 5 <= end) && pm_strncasecmp(start, (const uint8_t *) "false", 5) == 0) {
6622
+ parser->frozen_string_literal = false;
6322
6623
  }
6323
6624
  }
6324
6625
 
@@ -6541,21 +6842,27 @@ context_terminator(pm_context_t context, pm_token_t *token) {
6541
6842
  return token->type == PM_TOKEN_BRACE_RIGHT;
6542
6843
  case PM_CONTEXT_PREDICATE:
6543
6844
  return token->type == PM_TOKEN_KEYWORD_THEN || token->type == PM_TOKEN_NEWLINE || token->type == PM_TOKEN_SEMICOLON;
6845
+ case PM_CONTEXT_NONE:
6846
+ return false;
6544
6847
  }
6545
6848
 
6546
6849
  return false;
6547
6850
  }
6548
6851
 
6549
- static bool
6550
- context_recoverable(pm_parser_t *parser, pm_token_t *token) {
6852
+ /**
6853
+ * Returns the context that the given token is found to be terminating, or
6854
+ * returns PM_CONTEXT_NONE.
6855
+ */
6856
+ static pm_context_t
6857
+ context_recoverable(const pm_parser_t *parser, pm_token_t *token) {
6551
6858
  pm_context_node_t *context_node = parser->current_context;
6552
6859
 
6553
6860
  while (context_node != NULL) {
6554
- if (context_terminator(context_node->context, token)) return true;
6861
+ if (context_terminator(context_node->context, token)) return context_node->context;
6555
6862
  context_node = context_node->prev;
6556
6863
  }
6557
6864
 
6558
- return false;
6865
+ return PM_CONTEXT_NONE;
6559
6866
  }
6560
6867
 
6561
6868
  static bool
@@ -6583,7 +6890,7 @@ context_pop(pm_parser_t *parser) {
6583
6890
  }
6584
6891
 
6585
6892
  static bool
6586
- context_p(pm_parser_t *parser, pm_context_t context) {
6893
+ context_p(const pm_parser_t *parser, pm_context_t context) {
6587
6894
  pm_context_node_t *context_node = parser->current_context;
6588
6895
 
6589
6896
  while (context_node != NULL) {
@@ -6595,7 +6902,7 @@ context_p(pm_parser_t *parser, pm_context_t context) {
6595
6902
  }
6596
6903
 
6597
6904
  static bool
6598
- context_def_p(pm_parser_t *parser) {
6905
+ context_def_p(const pm_parser_t *parser) {
6599
6906
  pm_context_node_t *context_node = parser->current_context;
6600
6907
 
6601
6908
  while (context_node != NULL) {
@@ -6618,6 +6925,55 @@ context_def_p(pm_parser_t *parser) {
6618
6925
  return false;
6619
6926
  }
6620
6927
 
6928
+ /**
6929
+ * Returns a human readable string for the given context, used in error
6930
+ * messages.
6931
+ */
6932
+ static const char *
6933
+ context_human(pm_context_t context) {
6934
+ switch (context) {
6935
+ case PM_CONTEXT_NONE:
6936
+ assert(false && "unreachable");
6937
+ return "";
6938
+ case PM_CONTEXT_BEGIN: return "begin statement";
6939
+ case PM_CONTEXT_BLOCK_BRACES: return "'{'..'}' block";
6940
+ case PM_CONTEXT_BLOCK_KEYWORDS: return "'do'..'end' block";
6941
+ case PM_CONTEXT_CASE_WHEN: return "'when' clause";
6942
+ case PM_CONTEXT_CASE_IN: return "'in' clause";
6943
+ case PM_CONTEXT_CLASS: return "class definition";
6944
+ case PM_CONTEXT_DEF: return "method definition";
6945
+ case PM_CONTEXT_DEF_PARAMS: return "method parameters";
6946
+ case PM_CONTEXT_DEFAULT_PARAMS: return "parameter default value";
6947
+ case PM_CONTEXT_ELSE: return "'else' clause";
6948
+ case PM_CONTEXT_ELSIF: return "'elsif' clause";
6949
+ case PM_CONTEXT_EMBEXPR: return "embedded expression";
6950
+ case PM_CONTEXT_ENSURE: return "'ensure' clause";
6951
+ case PM_CONTEXT_ENSURE_DEF: return "'ensure' clause";
6952
+ case PM_CONTEXT_FOR: return "for loop";
6953
+ case PM_CONTEXT_FOR_INDEX: return "for loop index";
6954
+ case PM_CONTEXT_IF: return "if statement";
6955
+ case PM_CONTEXT_LAMBDA_BRACES: return "'{'..'}' lambda block";
6956
+ case PM_CONTEXT_LAMBDA_DO_END: return "'do'..'end' lambda block";
6957
+ case PM_CONTEXT_MAIN: return "top level context";
6958
+ case PM_CONTEXT_MODULE: return "module definition";
6959
+ case PM_CONTEXT_PARENS: return "parentheses";
6960
+ case PM_CONTEXT_POSTEXE: return "'END' block";
6961
+ case PM_CONTEXT_PREDICATE: return "predicate";
6962
+ case PM_CONTEXT_PREEXE: return "'BEGIN' block";
6963
+ case PM_CONTEXT_RESCUE_ELSE: return "'else' clause";
6964
+ case PM_CONTEXT_RESCUE_ELSE_DEF: return "'else' clause";
6965
+ case PM_CONTEXT_RESCUE: return "'rescue' clause";
6966
+ case PM_CONTEXT_RESCUE_DEF: return "'rescue' clause";
6967
+ case PM_CONTEXT_SCLASS: return "singleton class definition";
6968
+ case PM_CONTEXT_UNLESS: return "unless statement";
6969
+ case PM_CONTEXT_UNTIL: return "until statement";
6970
+ case PM_CONTEXT_WHILE: return "while statement";
6971
+ }
6972
+
6973
+ assert(false && "unreachable");
6974
+ return "";
6975
+ }
6976
+
6621
6977
  /******************************************************************************/
6622
6978
  /* Specific token lexers */
6623
6979
  /******************************************************************************/
@@ -6843,7 +7199,7 @@ lex_numeric(pm_parser_t *parser) {
6843
7199
  static pm_token_type_t
6844
7200
  lex_global_variable(pm_parser_t *parser) {
6845
7201
  if (parser->current.end >= parser->end) {
6846
- pm_parser_err_current(parser, PM_ERR_INVALID_VARIABLE_GLOBAL);
7202
+ PM_PARSER_ERR_TOKEN_FORMAT_CONTENT(parser, parser->current, PM_ERR_INVALID_VARIABLE_GLOBAL);
6847
7203
  return PM_TOKEN_GLOBAL_VARIABLE;
6848
7204
  }
6849
7205
 
@@ -6884,7 +7240,7 @@ lex_global_variable(pm_parser_t *parser) {
6884
7240
  } while (parser->current.end < parser->end && (width = char_is_identifier(parser, parser->current.end)) > 0);
6885
7241
 
6886
7242
  // $0 isn't allowed to be followed by anything.
6887
- pm_parser_err_current(parser, PM_ERR_INVALID_VARIABLE_GLOBAL);
7243
+ PM_PARSER_ERR_TOKEN_FORMAT_CONTENT(parser, parser->current, PM_ERR_INVALID_VARIABLE_GLOBAL);
6888
7244
  }
6889
7245
 
6890
7246
  return PM_TOKEN_GLOBAL_VARIABLE;
@@ -6915,7 +7271,7 @@ lex_global_variable(pm_parser_t *parser) {
6915
7271
  } else {
6916
7272
  // If we get here, then we have a $ followed by something that isn't
6917
7273
  // recognized as a global variable.
6918
- pm_parser_err_current(parser, PM_ERR_INVALID_VARIABLE_GLOBAL);
7274
+ PM_PARSER_ERR_TOKEN_FORMAT_CONTENT(parser, parser->current, PM_ERR_INVALID_VARIABLE_GLOBAL);
6919
7275
  }
6920
7276
 
6921
7277
  return PM_TOKEN_GLOBAL_VARIABLE;
@@ -7360,6 +7716,28 @@ escape_write_byte_encoded(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t byte
7360
7716
  pm_buffer_append_byte(buffer, byte);
7361
7717
  }
7362
7718
 
7719
+ /**
7720
+ * Write each byte of the given escaped character into the buffer.
7721
+ */
7722
+ static inline void
7723
+ escape_write_escape_encoded(pm_parser_t *parser, pm_buffer_t *buffer) {
7724
+ size_t width;
7725
+ if (parser->encoding_changed) {
7726
+ width = parser->encoding->char_width(parser->current.end, parser->end - parser->current.end);
7727
+ } else {
7728
+ width = pm_encoding_utf_8_char_width(parser->current.end, parser->end - parser->current.end);
7729
+ }
7730
+
7731
+ // TODO: If the character is invalid in the given encoding, then we'll just
7732
+ // push one byte into the buffer. This should actually be an error.
7733
+ width = (width == 0) ? 1 : width;
7734
+
7735
+ for (size_t index = 0; index < width; index++) {
7736
+ escape_write_byte_encoded(parser, buffer, *parser->current.end);
7737
+ parser->current.end++;
7738
+ }
7739
+ }
7740
+
7363
7741
  /**
7364
7742
  * The regular expression engine doesn't support the same escape sequences as
7365
7743
  * Ruby does. So first we have to read the escape sequence, and then we have to
@@ -7698,7 +8076,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) {
7698
8076
  /* fallthrough */
7699
8077
  default: {
7700
8078
  if (parser->current.end < parser->end) {
7701
- escape_write_byte_encoded(parser, buffer, *parser->current.end++);
8079
+ escape_write_escape_encoded(parser, buffer);
7702
8080
  }
7703
8081
  return;
7704
8082
  }
@@ -7797,10 +8175,10 @@ lex_at_variable(pm_parser_t *parser) {
7797
8175
  while (parser->current.end < parser->end && (width = char_is_identifier(parser, parser->current.end)) > 0) {
7798
8176
  parser->current.end += width;
7799
8177
  }
7800
- } else if (type == PM_TOKEN_CLASS_VARIABLE) {
7801
- pm_parser_err_current(parser, PM_ERR_INCOMPLETE_VARIABLE_CLASS);
7802
8178
  } else {
7803
- pm_parser_err_current(parser, PM_ERR_INCOMPLETE_VARIABLE_INSTANCE);
8179
+ pm_diagnostic_id_t diag_id = (type == PM_TOKEN_CLASS_VARIABLE) ? PM_ERR_INCOMPLETE_VARIABLE_CLASS : PM_ERR_INCOMPLETE_VARIABLE_INSTANCE;
8180
+ size_t width = parser->encoding->char_width(parser->current.end, parser->end - parser->current.end);
8181
+ PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, diag_id, (int) ((parser->current.end + width) - parser->current.start), (const char *) parser->current.start);
7804
8182
  }
7805
8183
 
7806
8184
  // If we're lexing an embedded variable, then we need to pop back into the
@@ -7975,14 +8353,43 @@ typedef struct {
7975
8353
  * Push the given byte into the token buffer.
7976
8354
  */
7977
8355
  static inline void
7978
- pm_token_buffer_push(pm_token_buffer_t *token_buffer, uint8_t byte) {
8356
+ pm_token_buffer_push_byte(pm_token_buffer_t *token_buffer, uint8_t byte) {
7979
8357
  pm_buffer_append_byte(&token_buffer->buffer, byte);
7980
8358
  }
7981
8359
 
8360
+ /**
8361
+ * Append the given bytes into the token buffer.
8362
+ */
8363
+ static inline void
8364
+ pm_token_buffer_push_bytes(pm_token_buffer_t *token_buffer, const uint8_t *bytes, size_t length) {
8365
+ pm_buffer_append_bytes(&token_buffer->buffer, bytes, length);
8366
+ }
8367
+
8368
+ /**
8369
+ * Push an escaped character into the token buffer.
8370
+ */
8371
+ static inline void
8372
+ pm_token_buffer_push_escaped(pm_token_buffer_t *token_buffer, pm_parser_t *parser) {
8373
+ // First, determine the width of the character to be escaped.
8374
+ size_t width;
8375
+ if (parser->encoding_changed) {
8376
+ width = parser->encoding->char_width(parser->current.end, parser->end - parser->current.end);
8377
+ } else {
8378
+ width = pm_encoding_utf_8_char_width(parser->current.end, parser->end - parser->current.end);
8379
+ }
8380
+
8381
+ // TODO: If the character is invalid in the given encoding, then we'll just
8382
+ // push one byte into the buffer. This should actually be an error.
8383
+ width = (width == 0 ? 1 : width);
8384
+
8385
+ // Now, push the bytes into the buffer.
8386
+ pm_token_buffer_push_bytes(token_buffer, parser->current.end, width);
8387
+ parser->current.end += width;
8388
+ }
8389
+
7982
8390
  /**
7983
8391
  * When we're about to return from lexing the current token and we know for sure
7984
8392
  * that we have found an escape sequence, this function is called to copy the
7985
- *
7986
8393
  * contents of the token buffer into the current string on the parser so that it
7987
8394
  * can be attached to the correct node.
7988
8395
  */
@@ -7997,7 +8404,6 @@ pm_token_buffer_copy(pm_parser_t *parser, pm_token_buffer_t *token_buffer) {
7997
8404
  * string. If we haven't pushed anything into the buffer, this means that we
7998
8405
  * never found an escape sequence, so we can directly reference the bounds of
7999
8406
  * the current string. Either way, at the return of this function it is expected
8000
- *
8001
8407
  * that parser->current_string is established in such a way that it can be
8002
8408
  * attached to a node.
8003
8409
  */
@@ -8016,7 +8422,6 @@ pm_token_buffer_flush(pm_parser_t *parser, pm_token_buffer_t *token_buffer) {
8016
8422
  * point into the buffer because we're about to provide a string that has
8017
8423
  * different content than a direct slice of the source.
8018
8424
  *
8019
- *
8020
8425
  * It is expected that the parser's current token end will be pointing at one
8021
8426
  * byte past the backslash that starts the escape sequence.
8022
8427
  */
@@ -8070,6 +8475,34 @@ pm_heredoc_strspn_inline_whitespace(pm_parser_t *parser, const uint8_t **cursor,
8070
8475
  return whitespace;
8071
8476
  }
8072
8477
 
8478
+ /**
8479
+ * Lex past the delimiter of a percent literal. Handle newlines and heredocs
8480
+ * appropriately.
8481
+ */
8482
+ static uint8_t
8483
+ pm_lex_percent_delimiter(pm_parser_t *parser) {
8484
+ size_t eol_length = match_eol(parser);
8485
+
8486
+ if (eol_length) {
8487
+ if (parser->heredoc_end) {
8488
+ // If we have already lexed a heredoc, then the newline has already
8489
+ // been added to the list. In this case we want to just flush the
8490
+ // heredoc end.
8491
+ parser_flush_heredoc_end(parser);
8492
+ } else {
8493
+ // Otherwise, we'll add the newline to the list of newlines.
8494
+ pm_newline_list_append(&parser->newline_list, parser->current.end + eol_length - 1);
8495
+ }
8496
+
8497
+ const uint8_t delimiter = *parser->current.end;
8498
+ parser->current.end += eol_length;
8499
+
8500
+ return delimiter;
8501
+ }
8502
+
8503
+ return *parser->current.end++;
8504
+ }
8505
+
8073
8506
  /**
8074
8507
  * This is a convenience macro that will set the current token type, call the
8075
8508
  * lex callback, and then return from the parser_lex function.
@@ -8635,7 +9068,7 @@ parser_lex(pm_parser_t *parser) {
8635
9068
  // this is not a valid heredoc declaration. In this case we
8636
9069
  // will add an error, but we will still return a heredoc
8637
9070
  // start.
8638
- pm_parser_err_current(parser, PM_ERR_EMBDOC_TERM);
9071
+ pm_parser_err_current(parser, PM_ERR_HEREDOC_TERM);
8639
9072
  body_start = parser->end;
8640
9073
  } else {
8641
9074
  // Otherwise, we want to indicate that the body of the
@@ -8826,12 +9259,10 @@ parser_lex(pm_parser_t *parser) {
8826
9259
  LEX(PM_TOKEN_PLUS_EQUAL);
8827
9260
  }
8828
9261
 
8829
- bool spcarg = lex_state_spcarg_p(parser, space_seen);
8830
- if (spcarg) {
8831
- pm_parser_warn_token(parser, &parser->current, PM_WARN_AMBIGUOUS_FIRST_ARGUMENT_PLUS);
8832
- }
8833
-
8834
- if (lex_state_beg_p(parser) || spcarg) {
9262
+ if (
9263
+ lex_state_beg_p(parser) ||
9264
+ (lex_state_spcarg_p(parser, space_seen) ? (pm_parser_warn_token(parser, &parser->current, PM_WARN_AMBIGUOUS_FIRST_ARGUMENT_PLUS), true) : false)
9265
+ ) {
8835
9266
  lex_state_set(parser, PM_LEX_STATE_BEG);
8836
9267
 
8837
9268
  if (pm_char_is_decimal_digit(peek(parser))) {
@@ -8871,11 +9302,12 @@ parser_lex(pm_parser_t *parser) {
8871
9302
  }
8872
9303
 
8873
9304
  bool spcarg = lex_state_spcarg_p(parser, space_seen);
8874
- if (spcarg) {
9305
+ bool is_beg = lex_state_beg_p(parser);
9306
+ if (!is_beg && spcarg) {
8875
9307
  pm_parser_warn_token(parser, &parser->current, PM_WARN_AMBIGUOUS_FIRST_ARGUMENT_MINUS);
8876
9308
  }
8877
9309
 
8878
- if (lex_state_beg_p(parser) || spcarg) {
9310
+ if (is_beg || spcarg) {
8879
9311
  lex_state_set(parser, PM_LEX_STATE_BEG);
8880
9312
  LEX(pm_char_is_decimal_digit(peek(parser)) ? PM_TOKEN_UMINUS_NUM : PM_TOKEN_UMINUS);
8881
9313
  }
@@ -9026,15 +9458,8 @@ parser_lex(pm_parser_t *parser) {
9026
9458
  pm_parser_err_current(parser, PM_ERR_INVALID_PERCENT);
9027
9459
  }
9028
9460
 
9029
- lex_mode_push_string(parser, true, false, lex_mode_incrementor(*parser->current.end), lex_mode_terminator(*parser->current.end));
9030
-
9031
- size_t eol_length = match_eol(parser);
9032
- if (eol_length) {
9033
- parser->current.end += eol_length;
9034
- pm_newline_list_append(&parser->newline_list, parser->current.end - 1);
9035
- } else {
9036
- parser->current.end++;
9037
- }
9461
+ const uint8_t delimiter = pm_lex_percent_delimiter(parser);
9462
+ lex_mode_push_string(parser, true, false, lex_mode_incrementor(delimiter), lex_mode_terminator(delimiter));
9038
9463
 
9039
9464
  if (parser->current.end < parser->end) {
9040
9465
  LEX(PM_TOKEN_STRING_BEGIN);
@@ -9054,7 +9479,7 @@ parser_lex(pm_parser_t *parser) {
9054
9479
  parser->current.end++;
9055
9480
 
9056
9481
  if (parser->current.end < parser->end) {
9057
- lex_mode_push_list(parser, false, *parser->current.end++);
9482
+ lex_mode_push_list(parser, false, pm_lex_percent_delimiter(parser));
9058
9483
  } else {
9059
9484
  lex_mode_push_list_eof(parser);
9060
9485
  }
@@ -9065,7 +9490,7 @@ parser_lex(pm_parser_t *parser) {
9065
9490
  parser->current.end++;
9066
9491
 
9067
9492
  if (parser->current.end < parser->end) {
9068
- lex_mode_push_list(parser, true, *parser->current.end++);
9493
+ lex_mode_push_list(parser, true, pm_lex_percent_delimiter(parser));
9069
9494
  } else {
9070
9495
  lex_mode_push_list_eof(parser);
9071
9496
  }
@@ -9076,9 +9501,8 @@ parser_lex(pm_parser_t *parser) {
9076
9501
  parser->current.end++;
9077
9502
 
9078
9503
  if (parser->current.end < parser->end) {
9079
- lex_mode_push_regexp(parser, lex_mode_incrementor(*parser->current.end), lex_mode_terminator(*parser->current.end));
9080
- pm_newline_list_check_append(&parser->newline_list, parser->current.end);
9081
- parser->current.end++;
9504
+ const uint8_t delimiter = pm_lex_percent_delimiter(parser);
9505
+ lex_mode_push_regexp(parser, lex_mode_incrementor(delimiter), lex_mode_terminator(delimiter));
9082
9506
  } else {
9083
9507
  lex_mode_push_regexp(parser, '\0', '\0');
9084
9508
  }
@@ -9089,9 +9513,8 @@ parser_lex(pm_parser_t *parser) {
9089
9513
  parser->current.end++;
9090
9514
 
9091
9515
  if (parser->current.end < parser->end) {
9092
- lex_mode_push_string(parser, false, false, lex_mode_incrementor(*parser->current.end), lex_mode_terminator(*parser->current.end));
9093
- pm_newline_list_check_append(&parser->newline_list, parser->current.end);
9094
- parser->current.end++;
9516
+ const uint8_t delimiter = pm_lex_percent_delimiter(parser);
9517
+ lex_mode_push_string(parser, false, false, lex_mode_incrementor(delimiter), lex_mode_terminator(delimiter));
9095
9518
  } else {
9096
9519
  lex_mode_push_string_eof(parser);
9097
9520
  }
@@ -9102,9 +9525,8 @@ parser_lex(pm_parser_t *parser) {
9102
9525
  parser->current.end++;
9103
9526
 
9104
9527
  if (parser->current.end < parser->end) {
9105
- lex_mode_push_string(parser, true, false, lex_mode_incrementor(*parser->current.end), lex_mode_terminator(*parser->current.end));
9106
- pm_newline_list_check_append(&parser->newline_list, parser->current.end);
9107
- parser->current.end++;
9528
+ const uint8_t delimiter = pm_lex_percent_delimiter(parser);
9529
+ lex_mode_push_string(parser, true, false, lex_mode_incrementor(delimiter), lex_mode_terminator(delimiter));
9108
9530
  } else {
9109
9531
  lex_mode_push_string_eof(parser);
9110
9532
  }
@@ -9115,9 +9537,9 @@ parser_lex(pm_parser_t *parser) {
9115
9537
  parser->current.end++;
9116
9538
 
9117
9539
  if (parser->current.end < parser->end) {
9118
- lex_mode_push_string(parser, false, false, lex_mode_incrementor(*parser->current.end), lex_mode_terminator(*parser->current.end));
9540
+ const uint8_t delimiter = pm_lex_percent_delimiter(parser);
9541
+ lex_mode_push_string(parser, false, false, lex_mode_incrementor(delimiter), lex_mode_terminator(delimiter));
9119
9542
  lex_state_set(parser, PM_LEX_STATE_FNAME | PM_LEX_STATE_FITEM);
9120
- parser->current.end++;
9121
9543
  } else {
9122
9544
  lex_mode_push_string_eof(parser);
9123
9545
  }
@@ -9128,7 +9550,7 @@ parser_lex(pm_parser_t *parser) {
9128
9550
  parser->current.end++;
9129
9551
 
9130
9552
  if (parser->current.end < parser->end) {
9131
- lex_mode_push_list(parser, false, *parser->current.end++);
9553
+ lex_mode_push_list(parser, false, pm_lex_percent_delimiter(parser));
9132
9554
  } else {
9133
9555
  lex_mode_push_list_eof(parser);
9134
9556
  }
@@ -9139,7 +9561,7 @@ parser_lex(pm_parser_t *parser) {
9139
9561
  parser->current.end++;
9140
9562
 
9141
9563
  if (parser->current.end < parser->end) {
9142
- lex_mode_push_list(parser, true, *parser->current.end++);
9564
+ lex_mode_push_list(parser, true, pm_lex_percent_delimiter(parser));
9143
9565
  } else {
9144
9566
  lex_mode_push_list_eof(parser);
9145
9567
  }
@@ -9150,8 +9572,8 @@ parser_lex(pm_parser_t *parser) {
9150
9572
  parser->current.end++;
9151
9573
 
9152
9574
  if (parser->current.end < parser->end) {
9153
- lex_mode_push_string(parser, true, false, lex_mode_incrementor(*parser->current.end), lex_mode_terminator(*parser->current.end));
9154
- parser->current.end++;
9575
+ const uint8_t delimiter = pm_lex_percent_delimiter(parser);
9576
+ lex_mode_push_string(parser, true, false, lex_mode_incrementor(delimiter), lex_mode_terminator(delimiter));
9155
9577
  } else {
9156
9578
  lex_mode_push_string_eof(parser);
9157
9579
  }
@@ -9195,11 +9617,21 @@ parser_lex(pm_parser_t *parser) {
9195
9617
  if (*parser->current.start != '_') {
9196
9618
  size_t width = char_is_identifier_start(parser, parser->current.start);
9197
9619
 
9198
- // If this isn't the beginning of an identifier, then it's an invalid
9199
- // token as we've exhausted all of the other options. We'll skip past
9200
- // it and return the next token.
9620
+ // If this isn't the beginning of an identifier, then
9621
+ // it's an invalid token as we've exhausted all of the
9622
+ // other options. We'll skip past it and return the next
9623
+ // token after adding an appropriate error message.
9201
9624
  if (!width) {
9202
- pm_parser_err_current(parser, PM_ERR_INVALID_TOKEN);
9625
+ pm_diagnostic_id_t diag_id;
9626
+ if (*parser->current.start >= 0x80) {
9627
+ diag_id = PM_ERR_INVALID_MULTIBYTE_CHARACTER;
9628
+ } else if (char_is_ascii_printable(*parser->current.start) || (*parser->current.start == '\\')) {
9629
+ diag_id = PM_ERR_INVALID_PRINTABLE_CHARACTER;
9630
+ } else {
9631
+ diag_id = PM_ERR_INVALID_CHARACTER;
9632
+ }
9633
+
9634
+ PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, diag_id, *parser->current.start);
9203
9635
  goto lex_next_token;
9204
9636
  }
9205
9637
 
@@ -9306,7 +9738,7 @@ parser_lex(pm_parser_t *parser) {
9306
9738
  // and then find the first one.
9307
9739
  pm_lex_mode_t *lex_mode = parser->lex_modes.current;
9308
9740
  const uint8_t *breakpoints = lex_mode->as.list.breakpoints;
9309
- const uint8_t *breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
9741
+ const uint8_t *breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
9310
9742
 
9311
9743
  // If we haven't found an escape yet, then this buffer will be
9312
9744
  // unallocated since we can refer directly to the source string.
@@ -9315,7 +9747,7 @@ parser_lex(pm_parser_t *parser) {
9315
9747
  while (breakpoint != NULL) {
9316
9748
  // If we hit a null byte, skip directly past it.
9317
9749
  if (*breakpoint == '\0') {
9318
- breakpoint = pm_strpbrk(parser, breakpoint + 1, breakpoints, parser->end - (breakpoint + 1));
9750
+ breakpoint = pm_strpbrk(parser, breakpoint + 1, breakpoints, parser->end - (breakpoint + 1), true);
9319
9751
  continue;
9320
9752
  }
9321
9753
 
@@ -9334,7 +9766,7 @@ parser_lex(pm_parser_t *parser) {
9334
9766
  // we need to continue on past it.
9335
9767
  if (lex_mode->as.list.nesting > 0) {
9336
9768
  parser->current.end = breakpoint + 1;
9337
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
9769
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
9338
9770
  lex_mode->as.list.nesting--;
9339
9771
  continue;
9340
9772
  }
@@ -9377,18 +9809,18 @@ parser_lex(pm_parser_t *parser) {
9377
9809
  case '\t':
9378
9810
  case '\v':
9379
9811
  case '\\':
9380
- pm_token_buffer_push(&token_buffer, peeked);
9812
+ pm_token_buffer_push_byte(&token_buffer, peeked);
9381
9813
  parser->current.end++;
9382
9814
  break;
9383
9815
  case '\r':
9384
9816
  parser->current.end++;
9385
9817
  if (peek(parser) != '\n') {
9386
- pm_token_buffer_push(&token_buffer, '\r');
9818
+ pm_token_buffer_push_byte(&token_buffer, '\r');
9387
9819
  break;
9388
9820
  }
9389
9821
  /* fallthrough */
9390
9822
  case '\n':
9391
- pm_token_buffer_push(&token_buffer, '\n');
9823
+ pm_token_buffer_push_byte(&token_buffer, '\n');
9392
9824
 
9393
9825
  if (parser->heredoc_end) {
9394
9826
  // ... if we are on the same line as a heredoc,
@@ -9406,21 +9838,20 @@ parser_lex(pm_parser_t *parser) {
9406
9838
  break;
9407
9839
  default:
9408
9840
  if (peeked == lex_mode->as.list.incrementor || peeked == lex_mode->as.list.terminator) {
9409
- pm_token_buffer_push(&token_buffer, peeked);
9841
+ pm_token_buffer_push_byte(&token_buffer, peeked);
9410
9842
  parser->current.end++;
9411
9843
  } else if (lex_mode->as.list.interpolation) {
9412
9844
  escape_read(parser, &token_buffer.buffer, PM_ESCAPE_FLAG_NONE);
9413
9845
  } else {
9414
- pm_token_buffer_push(&token_buffer, '\\');
9415
- pm_token_buffer_push(&token_buffer, peeked);
9416
- parser->current.end++;
9846
+ pm_token_buffer_push_byte(&token_buffer, '\\');
9847
+ pm_token_buffer_push_escaped(&token_buffer, parser);
9417
9848
  }
9418
9849
 
9419
9850
  break;
9420
9851
  }
9421
9852
 
9422
9853
  token_buffer.cursor = parser->current.end;
9423
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
9854
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
9424
9855
  continue;
9425
9856
  }
9426
9857
 
@@ -9433,7 +9864,7 @@ parser_lex(pm_parser_t *parser) {
9433
9864
  // that looked like an interpolated class or instance variable
9434
9865
  // like "#@" but wasn't actually. In this case we'll just skip
9435
9866
  // to the next breakpoint.
9436
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
9867
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
9437
9868
  continue;
9438
9869
  }
9439
9870
 
@@ -9448,7 +9879,7 @@ parser_lex(pm_parser_t *parser) {
9448
9879
  // and find the next breakpoint.
9449
9880
  assert(*breakpoint == lex_mode->as.list.incrementor);
9450
9881
  parser->current.end = breakpoint + 1;
9451
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
9882
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
9452
9883
  lex_mode->as.list.nesting++;
9453
9884
  continue;
9454
9885
  }
@@ -9487,14 +9918,14 @@ parser_lex(pm_parser_t *parser) {
9487
9918
  // regular expression. We'll use strpbrk to find the first of these
9488
9919
  // characters.
9489
9920
  const uint8_t *breakpoints = lex_mode->as.regexp.breakpoints;
9490
- const uint8_t *breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
9921
+ const uint8_t *breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false);
9491
9922
  pm_token_buffer_t token_buffer = { { 0 }, 0 };
9492
9923
 
9493
9924
  while (breakpoint != NULL) {
9494
9925
  // If we hit a null byte, skip directly past it.
9495
9926
  if (*breakpoint == '\0') {
9496
9927
  parser->current.end = breakpoint + 1;
9497
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
9928
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false);
9498
9929
  continue;
9499
9930
  }
9500
9931
 
@@ -9516,7 +9947,7 @@ parser_lex(pm_parser_t *parser) {
9516
9947
  // If the terminator is not a newline, then we can set
9517
9948
  // the next breakpoint and continue.
9518
9949
  parser->current.end = breakpoint + 1;
9519
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
9950
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false);
9520
9951
  continue;
9521
9952
  }
9522
9953
  }
@@ -9526,7 +9957,7 @@ parser_lex(pm_parser_t *parser) {
9526
9957
  if (*breakpoint == lex_mode->as.regexp.terminator) {
9527
9958
  if (lex_mode->as.regexp.nesting > 0) {
9528
9959
  parser->current.end = breakpoint + 1;
9529
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
9960
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false);
9530
9961
  lex_mode->as.regexp.nesting--;
9531
9962
  continue;
9532
9963
  }
@@ -9571,9 +10002,9 @@ parser_lex(pm_parser_t *parser) {
9571
10002
  parser->current.end++;
9572
10003
  if (peek(parser) != '\n') {
9573
10004
  if (lex_mode->as.regexp.terminator != '\r') {
9574
- pm_token_buffer_push(&token_buffer, '\\');
10005
+ pm_token_buffer_push_byte(&token_buffer, '\\');
9575
10006
  }
9576
- pm_token_buffer_push(&token_buffer, '\r');
10007
+ pm_token_buffer_push_byte(&token_buffer, '\r');
9577
10008
  break;
9578
10009
  }
9579
10010
  /* fallthrough */
@@ -9608,25 +10039,24 @@ parser_lex(pm_parser_t *parser) {
9608
10039
  case '$': case ')': case '*': case '+':
9609
10040
  case '.': case '>': case '?': case ']':
9610
10041
  case '^': case '|': case '}':
9611
- pm_token_buffer_push(&token_buffer, '\\');
10042
+ pm_token_buffer_push_byte(&token_buffer, '\\');
9612
10043
  break;
9613
10044
  default:
9614
10045
  break;
9615
10046
  }
9616
10047
 
9617
- pm_token_buffer_push(&token_buffer, peeked);
10048
+ pm_token_buffer_push_byte(&token_buffer, peeked);
9618
10049
  parser->current.end++;
9619
10050
  break;
9620
10051
  }
9621
10052
 
9622
- if (peeked < 0x80) pm_token_buffer_push(&token_buffer, '\\');
9623
- pm_token_buffer_push(&token_buffer, peeked);
9624
- parser->current.end++;
10053
+ if (peeked < 0x80) pm_token_buffer_push_byte(&token_buffer, '\\');
10054
+ pm_token_buffer_push_escaped(&token_buffer, parser);
9625
10055
  break;
9626
10056
  }
9627
10057
 
9628
10058
  token_buffer.cursor = parser->current.end;
9629
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
10059
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false);
9630
10060
  continue;
9631
10061
  }
9632
10062
 
@@ -9639,7 +10069,7 @@ parser_lex(pm_parser_t *parser) {
9639
10069
  // something that looked like an interpolated class or
9640
10070
  // instance variable like "#@" but wasn't actually. In
9641
10071
  // this case we'll just skip to the next breakpoint.
9642
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
10072
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false);
9643
10073
  continue;
9644
10074
  }
9645
10075
 
@@ -9654,7 +10084,7 @@ parser_lex(pm_parser_t *parser) {
9654
10084
  // and find the next breakpoint.
9655
10085
  assert(*breakpoint == lex_mode->as.regexp.incrementor);
9656
10086
  parser->current.end = breakpoint + 1;
9657
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
10087
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false);
9658
10088
  lex_mode->as.regexp.nesting++;
9659
10089
  continue;
9660
10090
  }
@@ -9690,7 +10120,7 @@ parser_lex(pm_parser_t *parser) {
9690
10120
  // string. We'll use strpbrk to find the first of these characters.
9691
10121
  pm_lex_mode_t *lex_mode = parser->lex_modes.current;
9692
10122
  const uint8_t *breakpoints = lex_mode->as.string.breakpoints;
9693
- const uint8_t *breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
10123
+ const uint8_t *breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
9694
10124
 
9695
10125
  // If we haven't found an escape yet, then this buffer will be
9696
10126
  // unallocated since we can refer directly to the source string.
@@ -9702,7 +10132,7 @@ parser_lex(pm_parser_t *parser) {
9702
10132
  if (lex_mode->as.string.incrementor != '\0' && *breakpoint == lex_mode->as.string.incrementor) {
9703
10133
  lex_mode->as.string.nesting++;
9704
10134
  parser->current.end = breakpoint + 1;
9705
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
10135
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
9706
10136
  continue;
9707
10137
  }
9708
10138
 
@@ -9714,7 +10144,7 @@ parser_lex(pm_parser_t *parser) {
9714
10144
  // to continue on past it.
9715
10145
  if (lex_mode->as.string.nesting > 0) {
9716
10146
  parser->current.end = breakpoint + 1;
9717
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
10147
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
9718
10148
  lex_mode->as.string.nesting--;
9719
10149
  continue;
9720
10150
  }
@@ -9756,7 +10186,7 @@ parser_lex(pm_parser_t *parser) {
9756
10186
  if (parser->heredoc_end == NULL) {
9757
10187
  pm_newline_list_append(&parser->newline_list, breakpoint);
9758
10188
  parser->current.end = breakpoint + 1;
9759
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
10189
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
9760
10190
  continue;
9761
10191
  } else {
9762
10192
  parser->current.end = breakpoint + 1;
@@ -9770,7 +10200,7 @@ parser_lex(pm_parser_t *parser) {
9770
10200
  case '\0':
9771
10201
  // Skip directly past the null character.
9772
10202
  parser->current.end = breakpoint + 1;
9773
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
10203
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
9774
10204
  break;
9775
10205
  case '\\': {
9776
10206
  // Here we hit escapes.
@@ -9788,23 +10218,23 @@ parser_lex(pm_parser_t *parser) {
9788
10218
 
9789
10219
  switch (peeked) {
9790
10220
  case '\\':
9791
- pm_token_buffer_push(&token_buffer, '\\');
10221
+ pm_token_buffer_push_byte(&token_buffer, '\\');
9792
10222
  parser->current.end++;
9793
10223
  break;
9794
10224
  case '\r':
9795
10225
  parser->current.end++;
9796
10226
  if (peek(parser) != '\n') {
9797
10227
  if (!lex_mode->as.string.interpolation) {
9798
- pm_token_buffer_push(&token_buffer, '\\');
10228
+ pm_token_buffer_push_byte(&token_buffer, '\\');
9799
10229
  }
9800
- pm_token_buffer_push(&token_buffer, '\r');
10230
+ pm_token_buffer_push_byte(&token_buffer, '\r');
9801
10231
  break;
9802
10232
  }
9803
10233
  /* fallthrough */
9804
10234
  case '\n':
9805
10235
  if (!lex_mode->as.string.interpolation) {
9806
- pm_token_buffer_push(&token_buffer, '\\');
9807
- pm_token_buffer_push(&token_buffer, '\n');
10236
+ pm_token_buffer_push_byte(&token_buffer, '\\');
10237
+ pm_token_buffer_push_byte(&token_buffer, '\n');
9808
10238
  }
9809
10239
 
9810
10240
  if (parser->heredoc_end) {
@@ -9823,24 +10253,23 @@ parser_lex(pm_parser_t *parser) {
9823
10253
  break;
9824
10254
  default:
9825
10255
  if (lex_mode->as.string.incrementor != '\0' && peeked == lex_mode->as.string.incrementor) {
9826
- pm_token_buffer_push(&token_buffer, peeked);
10256
+ pm_token_buffer_push_byte(&token_buffer, peeked);
9827
10257
  parser->current.end++;
9828
10258
  } else if (lex_mode->as.string.terminator != '\0' && peeked == lex_mode->as.string.terminator) {
9829
- pm_token_buffer_push(&token_buffer, peeked);
10259
+ pm_token_buffer_push_byte(&token_buffer, peeked);
9830
10260
  parser->current.end++;
9831
10261
  } else if (lex_mode->as.string.interpolation) {
9832
10262
  escape_read(parser, &token_buffer.buffer, PM_ESCAPE_FLAG_NONE);
9833
10263
  } else {
9834
- pm_token_buffer_push(&token_buffer, '\\');
9835
- pm_token_buffer_push(&token_buffer, peeked);
9836
- parser->current.end++;
10264
+ pm_token_buffer_push_byte(&token_buffer, '\\');
10265
+ pm_token_buffer_push_escaped(&token_buffer, parser);
9837
10266
  }
9838
10267
 
9839
10268
  break;
9840
10269
  }
9841
10270
 
9842
10271
  token_buffer.cursor = parser->current.end;
9843
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
10272
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
9844
10273
  break;
9845
10274
  }
9846
10275
  case '#': {
@@ -9851,7 +10280,7 @@ parser_lex(pm_parser_t *parser) {
9851
10280
  // looked like an interpolated class or instance variable like "#@"
9852
10281
  // but wasn't actually. In this case we'll just skip to the next
9853
10282
  // breakpoint.
9854
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
10283
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
9855
10284
  break;
9856
10285
  }
9857
10286
 
@@ -9888,15 +10317,22 @@ parser_lex(pm_parser_t *parser) {
9888
10317
  parser->next_start = NULL;
9889
10318
  }
9890
10319
 
9891
- // We'll check if we're at the end of the file. If we are, then we need to
9892
- // return the EOF token.
10320
+ // Now let's grab the information about the identifier off of the
10321
+ // current lex mode.
10322
+ pm_lex_mode_t *lex_mode = parser->lex_modes.current;
10323
+
10324
+ // We'll check if we're at the end of the file. If we are, then we
10325
+ // will add an error (because we weren't able to find the
10326
+ // terminator) but still continue parsing so that content after the
10327
+ // declaration of the heredoc can be parsed.
9893
10328
  if (parser->current.end >= parser->end) {
9894
- LEX(PM_TOKEN_EOF);
10329
+ pm_parser_err_current(parser, PM_ERR_HEREDOC_TERM);
10330
+ parser->next_start = lex_mode->as.heredoc.next_start;
10331
+ parser->heredoc_end = parser->current.end;
10332
+ lex_state_set(parser, PM_LEX_STATE_END);
10333
+ LEX(PM_TOKEN_HEREDOC_END);
9895
10334
  }
9896
10335
 
9897
- // Now let's grab the information about the identifier off of the current
9898
- // lex mode.
9899
- pm_lex_mode_t *lex_mode = parser->lex_modes.current;
9900
10336
  const uint8_t *ident_start = lex_mode->as.heredoc.ident_start;
9901
10337
  size_t ident_length = lex_mode->as.heredoc.ident_length;
9902
10338
 
@@ -9972,7 +10408,7 @@ parser_lex(pm_parser_t *parser) {
9972
10408
  breakpoints[2] = '\0';
9973
10409
  }
9974
10410
 
9975
- const uint8_t *breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
10411
+ const uint8_t *breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
9976
10412
  pm_token_buffer_t token_buffer = { { 0 }, 0 };
9977
10413
  bool was_escaped_newline = false;
9978
10414
 
@@ -9981,7 +10417,7 @@ parser_lex(pm_parser_t *parser) {
9981
10417
  case '\0':
9982
10418
  // Skip directly past the null character.
9983
10419
  parser->current.end = breakpoint + 1;
9984
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
10420
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
9985
10421
  break;
9986
10422
  case '\n': {
9987
10423
  if (parser->heredoc_end != NULL && (parser->heredoc_end > breakpoint)) {
@@ -10056,7 +10492,7 @@ parser_lex(pm_parser_t *parser) {
10056
10492
  // Otherwise we hit a newline and it wasn't followed by
10057
10493
  // a terminator, so we can continue parsing.
10058
10494
  parser->current.end = breakpoint + 1;
10059
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
10495
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
10060
10496
  break;
10061
10497
  }
10062
10498
  case '\\': {
@@ -10083,21 +10519,20 @@ parser_lex(pm_parser_t *parser) {
10083
10519
  case '\r':
10084
10520
  parser->current.end++;
10085
10521
  if (peek(parser) != '\n') {
10086
- pm_token_buffer_push(&token_buffer, '\\');
10087
- pm_token_buffer_push(&token_buffer, '\r');
10522
+ pm_token_buffer_push_byte(&token_buffer, '\\');
10523
+ pm_token_buffer_push_byte(&token_buffer, '\r');
10088
10524
  break;
10089
10525
  }
10090
10526
  /* fallthrough */
10091
10527
  case '\n':
10092
- pm_token_buffer_push(&token_buffer, '\\');
10093
- pm_token_buffer_push(&token_buffer, '\n');
10528
+ pm_token_buffer_push_byte(&token_buffer, '\\');
10529
+ pm_token_buffer_push_byte(&token_buffer, '\n');
10094
10530
  token_buffer.cursor = parser->current.end + 1;
10095
10531
  breakpoint = parser->current.end;
10096
10532
  continue;
10097
10533
  default:
10098
- parser->current.end++;
10099
- pm_token_buffer_push(&token_buffer, '\\');
10100
- pm_token_buffer_push(&token_buffer, peeked);
10534
+ pm_token_buffer_push_byte(&token_buffer, '\\');
10535
+ pm_token_buffer_push_escaped(&token_buffer, parser);
10101
10536
  break;
10102
10537
  }
10103
10538
  } else {
@@ -10105,7 +10540,7 @@ parser_lex(pm_parser_t *parser) {
10105
10540
  case '\r':
10106
10541
  parser->current.end++;
10107
10542
  if (peek(parser) != '\n') {
10108
- pm_token_buffer_push(&token_buffer, '\r');
10543
+ pm_token_buffer_push_byte(&token_buffer, '\r');
10109
10544
  break;
10110
10545
  }
10111
10546
  /* fallthrough */
@@ -10121,7 +10556,7 @@ parser_lex(pm_parser_t *parser) {
10121
10556
  }
10122
10557
 
10123
10558
  token_buffer.cursor = parser->current.end;
10124
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
10559
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
10125
10560
  break;
10126
10561
  }
10127
10562
  case '#': {
@@ -10133,7 +10568,7 @@ parser_lex(pm_parser_t *parser) {
10133
10568
  // or instance variable like "#@" but wasn't
10134
10569
  // actually. In this case we'll just skip to the
10135
10570
  // next breakpoint.
10136
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
10571
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
10137
10572
  break;
10138
10573
  }
10139
10574
 
@@ -10184,8 +10619,8 @@ parser_lex(pm_parser_t *parser) {
10184
10619
  typedef enum {
10185
10620
  PM_BINDING_POWER_UNSET = 0, // used to indicate this token cannot be used as an infix operator
10186
10621
  PM_BINDING_POWER_STATEMENT = 2,
10187
- PM_BINDING_POWER_MODIFIER = 4, // if unless until while
10188
- PM_BINDING_POWER_MODIFIER_RESCUE = 6, // rescue
10622
+ PM_BINDING_POWER_MODIFIER_RESCUE = 4, // rescue
10623
+ PM_BINDING_POWER_MODIFIER = 6, // if unless until while
10189
10624
  PM_BINDING_POWER_COMPOSITION = 8, // and or
10190
10625
  PM_BINDING_POWER_NOT = 10, // not
10191
10626
  PM_BINDING_POWER_MATCH = 12, // => in
@@ -10239,15 +10674,15 @@ typedef struct {
10239
10674
  #define RIGHT_ASSOCIATIVE_UNARY(precedence) { precedence, precedence, false, false }
10240
10675
 
10241
10676
  pm_binding_powers_t pm_binding_powers[PM_TOKEN_MAXIMUM] = {
10677
+ // rescue
10678
+ [PM_TOKEN_KEYWORD_RESCUE_MODIFIER] = LEFT_ASSOCIATIVE(PM_BINDING_POWER_MODIFIER_RESCUE),
10679
+
10242
10680
  // if unless until while
10243
10681
  [PM_TOKEN_KEYWORD_IF_MODIFIER] = LEFT_ASSOCIATIVE(PM_BINDING_POWER_MODIFIER),
10244
10682
  [PM_TOKEN_KEYWORD_UNLESS_MODIFIER] = LEFT_ASSOCIATIVE(PM_BINDING_POWER_MODIFIER),
10245
10683
  [PM_TOKEN_KEYWORD_UNTIL_MODIFIER] = LEFT_ASSOCIATIVE(PM_BINDING_POWER_MODIFIER),
10246
10684
  [PM_TOKEN_KEYWORD_WHILE_MODIFIER] = LEFT_ASSOCIATIVE(PM_BINDING_POWER_MODIFIER),
10247
10685
 
10248
- // rescue
10249
- [PM_TOKEN_KEYWORD_RESCUE_MODIFIER] = LEFT_ASSOCIATIVE(PM_BINDING_POWER_MODIFIER_RESCUE),
10250
-
10251
10686
  // and or
10252
10687
  [PM_TOKEN_KEYWORD_AND] = LEFT_ASSOCIATIVE(PM_BINDING_POWER_COMPOSITION),
10253
10688
  [PM_TOKEN_KEYWORD_OR] = LEFT_ASSOCIATIVE(PM_BINDING_POWER_COMPOSITION),
@@ -10381,14 +10816,6 @@ match4(const pm_parser_t *parser, pm_token_type_t type1, pm_token_type_t type2,
10381
10816
  return match1(parser, type1) || match1(parser, type2) || match1(parser, type3) || match1(parser, type4);
10382
10817
  }
10383
10818
 
10384
- /**
10385
- * Returns true if the current token is any of the five given types.
10386
- */
10387
- static inline bool
10388
- match5(const pm_parser_t *parser, pm_token_type_t type1, pm_token_type_t type2, pm_token_type_t type3, pm_token_type_t type4, pm_token_type_t type5) {
10389
- return match1(parser, type1) || match1(parser, type2) || match1(parser, type3) || match1(parser, type4) || match1(parser, type5);
10390
- }
10391
-
10392
10819
  /**
10393
10820
  * Returns true if the current token is any of the six given types.
10394
10821
  */
@@ -10654,7 +11081,7 @@ parse_target(pm_parser_t *parser, pm_node_t *target) {
10654
11081
  return target;
10655
11082
  case PM_BACK_REFERENCE_READ_NODE:
10656
11083
  case PM_NUMBERED_REFERENCE_READ_NODE:
10657
- pm_parser_err_node(parser, target, PM_ERR_WRITE_TARGET_READONLY);
11084
+ PM_PARSER_ERR_NODE_FORMAT_CONTENT(parser, target, PM_ERR_WRITE_TARGET_READONLY);
10658
11085
  return target;
10659
11086
  case PM_GLOBAL_VARIABLE_READ_NODE:
10660
11087
  assert(sizeof(pm_global_variable_target_node_t) == sizeof(pm_global_variable_read_node_t));
@@ -10792,7 +11219,7 @@ parse_write(pm_parser_t *parser, pm_node_t *target, pm_token_t *operator, pm_nod
10792
11219
  }
10793
11220
  case PM_BACK_REFERENCE_READ_NODE:
10794
11221
  case PM_NUMBERED_REFERENCE_READ_NODE:
10795
- pm_parser_err_node(parser, target, PM_ERR_WRITE_TARGET_READONLY);
11222
+ PM_PARSER_ERR_NODE_FORMAT_CONTENT(parser, target, PM_ERR_WRITE_TARGET_READONLY);
10796
11223
  /* fallthrough */
10797
11224
  case PM_GLOBAL_VARIABLE_READ_NODE: {
10798
11225
  pm_global_variable_write_node_t *node = pm_global_variable_write_node_create(parser, target, operator, value);
@@ -10866,7 +11293,7 @@ parse_write(pm_parser_t *parser, pm_node_t *target, pm_token_t *operator, pm_nod
10866
11293
  return target;
10867
11294
  }
10868
11295
 
10869
- if (*call->message_loc.start == '_' || parser->encoding->alnum_char(call->message_loc.start, call->message_loc.end - call->message_loc.start)) {
11296
+ if (char_is_identifier_start(parser, call->message_loc.start)) {
10870
11297
  // When we get here, we have a method call, because it was
10871
11298
  // previously marked as a method call but now we have an =. This
10872
11299
  // looks like:
@@ -10967,7 +11394,7 @@ parse_targets(pm_parser_t *parser, pm_node_t *first_target, pm_binding_power_t b
10967
11394
  pm_multi_target_node_targets_append(parser, result, target);
10968
11395
  } else if (!match1(parser, PM_TOKEN_EOF)) {
10969
11396
  // If we get here, then we have a trailing , in a multi target node.
10970
- // We'll set the implicit rest flag to indicate this.
11397
+ // We'll add an implicit rest node to represent this.
10971
11398
  pm_node_t *rest = (pm_node_t *) pm_implicit_rest_node_create(parser, &parser->previous);
10972
11399
  pm_multi_target_node_targets_append(parser, result, rest);
10973
11400
  break;
@@ -10984,6 +11411,7 @@ parse_targets(pm_parser_t *parser, pm_node_t *first_target, pm_binding_power_t b
10984
11411
  static pm_node_t *
10985
11412
  parse_targets_validate(pm_parser_t *parser, pm_node_t *first_target, pm_binding_power_t binding_power) {
10986
11413
  pm_node_t *result = parse_targets(parser, first_target, binding_power);
11414
+ accept1(parser, PM_TOKEN_NEWLINE);
10987
11415
 
10988
11416
  // Ensure that we have either an = or a ) after the targets.
10989
11417
  if (!match2(parser, PM_TOKEN_EQUAL, PM_TOKEN_PARENTHESIS_RIGHT)) {
@@ -11024,7 +11452,7 @@ parse_statements(pm_parser_t *parser, pm_context_t context) {
11024
11452
  break;
11025
11453
  }
11026
11454
 
11027
- // If we have a terminator, then we will parse all consequtive terminators
11455
+ // If we have a terminator, then we will parse all consecutive terminators
11028
11456
  // and then continue parsing the statements list.
11029
11457
  if (accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON)) {
11030
11458
  // If we have a terminator, then we will continue parsing the statements
@@ -11056,8 +11484,13 @@ parse_statements(pm_parser_t *parser, pm_context_t context) {
11056
11484
 
11057
11485
  while (accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON));
11058
11486
  if (context_terminator(context, &parser->current)) break;
11059
- } else {
11060
- expect1(parser, PM_TOKEN_NEWLINE, PM_ERR_EXPECT_EOL_AFTER_STATEMENT);
11487
+ } else if (!accept1(parser, PM_TOKEN_NEWLINE)) {
11488
+ // This is an inlined version of expect1 because the error that we
11489
+ // want to add has varargs. If this happens again, we should
11490
+ // probably extract a helper function.
11491
+ PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_EXPECT_EOL_AFTER_STATEMENT, pm_token_type_human(parser->current.type));
11492
+ parser->previous.start = parser->previous.end;
11493
+ parser->previous.type = PM_TOKEN_MISSING;
11061
11494
  }
11062
11495
  }
11063
11496
 
@@ -11084,8 +11517,9 @@ parse_assocs(pm_parser_t *parser, pm_node_t *node) {
11084
11517
 
11085
11518
  if (token_begins_expression_p(parser->current.type)) {
11086
11519
  value = parse_value_expression(parser, PM_BINDING_POWER_DEFINED, false, PM_ERR_EXPECT_EXPRESSION_AFTER_SPLAT_HASH);
11087
- } else if (pm_parser_local_depth(parser, &operator) == -1) {
11088
- pm_parser_err_token(parser, &operator, PM_ERR_EXPECT_EXPRESSION_AFTER_SPLAT_HASH);
11520
+ }
11521
+ else {
11522
+ pm_parser_scope_forwarding_keywords_check(parser, &operator);
11089
11523
  }
11090
11524
 
11091
11525
  element = (pm_node_t *) pm_assoc_splat_node_create(parser, value, &operator);
@@ -11234,13 +11668,8 @@ parse_arguments(pm_parser_t *parser, pm_arguments_t *arguments, bool accepts_for
11234
11668
  if (token_begins_expression_p(parser->current.type)) {
11235
11669
  expression = parse_value_expression(parser, PM_BINDING_POWER_DEFINED, false, PM_ERR_EXPECT_ARGUMENT);
11236
11670
  } else {
11237
- if (pm_parser_local_depth(parser, &operator) == -1) {
11238
- // A block forwarding in a method having `...` parameter (e.g. `def foo(...); bar(&); end`) is available.
11239
- pm_constant_id_t ellipsis_id = pm_parser_constant_id_constant(parser, "...", 3);
11240
- if (pm_parser_local_depth_constant_id(parser, ellipsis_id) == -1) {
11241
- pm_parser_err_token(parser, &operator, PM_ERR_ARGUMENT_NO_FORWARDING_AMP);
11242
- }
11243
- }
11671
+ // A block forwarding in a method having `...` parameter (e.g. `def foo(...); bar(&); end`) is available.
11672
+ pm_parser_scope_forwarding_block_check(parser, &operator);
11244
11673
  }
11245
11674
 
11246
11675
  argument = (pm_node_t *) pm_block_argument_node_create(parser, &operator, expression);
@@ -11258,10 +11687,7 @@ parse_arguments(pm_parser_t *parser, pm_arguments_t *arguments, bool accepts_for
11258
11687
  pm_token_t operator = parser->previous;
11259
11688
 
11260
11689
  if (match4(parser, PM_TOKEN_PARENTHESIS_RIGHT, PM_TOKEN_COMMA, PM_TOKEN_SEMICOLON, PM_TOKEN_BRACKET_RIGHT)) {
11261
- if (pm_parser_local_depth(parser, &parser->previous) == -1) {
11262
- pm_parser_err_token(parser, &operator, PM_ERR_ARGUMENT_NO_FORWARDING_STAR);
11263
- }
11264
-
11690
+ pm_parser_scope_forwarding_positionals_check(parser, &operator);
11265
11691
  argument = (pm_node_t *) pm_splat_node_create(parser, &operator, NULL);
11266
11692
  } else {
11267
11693
  pm_node_t *expression = parse_value_expression(parser, PM_BINDING_POWER_DEFINED, false, PM_ERR_EXPECT_EXPRESSION_AFTER_SPLAT);
@@ -11287,15 +11713,14 @@ parse_arguments(pm_parser_t *parser, pm_arguments_t *arguments, bool accepts_for
11287
11713
  pm_node_t *right = parse_expression(parser, PM_BINDING_POWER_RANGE, false, PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR);
11288
11714
  argument = (pm_node_t *) pm_range_node_create(parser, NULL, &operator, right);
11289
11715
  } else {
11290
- if (pm_parser_local_depth(parser, &parser->previous) == -1) {
11291
- pm_parser_err_previous(parser, PM_ERR_ARGUMENT_NO_FORWARDING_ELLIPSES);
11292
- }
11716
+ pm_parser_scope_forwarding_all_check(parser, &parser->previous);
11293
11717
  if (parsed_first_argument && terminator == PM_TOKEN_EOF) {
11294
11718
  pm_parser_err_previous(parser, PM_ERR_ARGUMENT_FORWARDING_UNBOUND);
11295
11719
  }
11296
11720
 
11297
11721
  argument = (pm_node_t *) pm_forwarding_arguments_node_create(parser, &parser->previous);
11298
11722
  parse_arguments_append(parser, arguments, argument);
11723
+ arguments->has_forwarding = true;
11299
11724
  parsed_forwarding_arguments = true;
11300
11725
  break;
11301
11726
  }
@@ -11338,6 +11763,9 @@ parse_arguments(pm_parser_t *parser, pm_arguments_t *arguments, bool accepts_for
11338
11763
  }
11339
11764
 
11340
11765
  parsed_bare_hash = true;
11766
+ } else if (accept1(parser, PM_TOKEN_KEYWORD_IN)) {
11767
+ // TODO: Could we solve this with binding powers instead?
11768
+ pm_parser_err_current(parser, PM_ERR_ARGUMENT_IN);
11341
11769
  }
11342
11770
 
11343
11771
  parse_arguments_append(parser, arguments, argument);
@@ -11414,7 +11842,9 @@ parse_required_destructured_parameter(pm_parser_t *parser) {
11414
11842
  if (accept1(parser, PM_TOKEN_IDENTIFIER)) {
11415
11843
  pm_token_t name = parser->previous;
11416
11844
  value = (pm_node_t *) pm_required_parameter_node_create(parser, &name);
11417
- pm_parser_parameter_name_check(parser, &name);
11845
+ if (pm_parser_parameter_name_check(parser, &name)) {
11846
+ pm_node_flag_set_repeated_parameter(value);
11847
+ }
11418
11848
  pm_parser_local_add_token(parser, &name);
11419
11849
  }
11420
11850
 
@@ -11424,7 +11854,9 @@ parse_required_destructured_parameter(pm_parser_t *parser) {
11424
11854
  pm_token_t name = parser->previous;
11425
11855
 
11426
11856
  param = (pm_node_t *) pm_required_parameter_node_create(parser, &name);
11427
- pm_parser_parameter_name_check(parser, &name);
11857
+ if (pm_parser_parameter_name_check(parser, &name)) {
11858
+ pm_node_flag_set_repeated_parameter(param);
11859
+ }
11428
11860
  pm_parser_local_add_token(parser, &name);
11429
11861
  }
11430
11862
 
@@ -11541,19 +11973,20 @@ parse_parameters(
11541
11973
  pm_token_t operator = parser->previous;
11542
11974
  pm_token_t name;
11543
11975
 
11976
+ bool repeated = false;
11544
11977
  if (accept1(parser, PM_TOKEN_IDENTIFIER)) {
11545
11978
  name = parser->previous;
11546
- pm_parser_parameter_name_check(parser, &name);
11979
+ repeated = pm_parser_parameter_name_check(parser, &name);
11547
11980
  pm_parser_local_add_token(parser, &name);
11548
11981
  } else {
11549
11982
  name = not_provided(parser);
11550
-
11551
- if (allows_forwarding_parameters) {
11552
- pm_parser_local_add_token(parser, &operator);
11553
- }
11983
+ parser->current_scope->forwarding_params |= PM_FORWARDING_BLOCK;
11554
11984
  }
11555
11985
 
11556
11986
  pm_block_parameter_node_t *param = pm_block_parameter_node_create(parser, &name, &operator);
11987
+ if (repeated) {
11988
+ pm_node_flag_set_repeated_parameter((pm_node_t *)param);
11989
+ }
11557
11990
  if (params->block == NULL) {
11558
11991
  pm_parameters_node_block_set(params, param);
11559
11992
  } else {
@@ -11572,9 +12005,8 @@ parse_parameters(
11572
12005
  update_parameter_state(parser, &parser->current, &order);
11573
12006
  parser_lex(parser);
11574
12007
 
11575
- if (allows_forwarding_parameters) {
11576
- pm_parser_local_add_token(parser, &parser->previous);
11577
- }
12008
+ parser->current_scope->forwarding_params |= PM_FORWARDING_BLOCK;
12009
+ parser->current_scope->forwarding_params |= PM_FORWARDING_ALL;
11578
12010
 
11579
12011
  pm_forwarding_parameter_node_t *param = pm_forwarding_parameter_node_create(parser, &parser->previous);
11580
12012
  if (params->keyword_rest != NULL) {
@@ -11626,20 +12058,23 @@ parse_parameters(
11626
12058
  }
11627
12059
 
11628
12060
  pm_token_t name = parser->previous;
11629
- pm_parser_parameter_name_check(parser, &name);
12061
+ bool repeated = pm_parser_parameter_name_check(parser, &name);
11630
12062
  pm_parser_local_add_token(parser, &name);
11631
12063
 
11632
12064
  if (accept1(parser, PM_TOKEN_EQUAL)) {
11633
12065
  pm_token_t operator = parser->previous;
11634
12066
  context_push(parser, PM_CONTEXT_DEFAULT_PARAMS);
11635
- pm_constant_id_t old_param_name = parser->current_param_name;
11636
- parser->current_param_name = pm_parser_constant_id_token(parser, &name);
12067
+
12068
+ pm_constant_id_t saved_param_name = pm_parser_current_param_name_set(parser, pm_parser_constant_id_token(parser, &name));
11637
12069
  pm_node_t *value = parse_value_expression(parser, binding_power, false, PM_ERR_PARAMETER_NO_DEFAULT);
11638
12070
 
11639
12071
  pm_optional_parameter_node_t *param = pm_optional_parameter_node_create(parser, &name, &operator, value);
12072
+ if (repeated) {
12073
+ pm_node_flag_set_repeated_parameter((pm_node_t *)param);
12074
+ }
11640
12075
  pm_parameters_node_optionals_append(params, param);
11641
12076
 
11642
- parser->current_param_name = old_param_name;
12077
+ pm_parser_current_param_name_restore(parser, saved_param_name);
11643
12078
  context_pop(parser);
11644
12079
 
11645
12080
  // If parsing the value of the parameter resulted in error recovery,
@@ -11651,9 +12086,15 @@ parse_parameters(
11651
12086
  }
11652
12087
  } else if (order > PM_PARAMETERS_ORDER_AFTER_OPTIONAL) {
11653
12088
  pm_required_parameter_node_t *param = pm_required_parameter_node_create(parser, &name);
12089
+ if (repeated) {
12090
+ pm_node_flag_set_repeated_parameter((pm_node_t *)param);
12091
+ }
11654
12092
  pm_parameters_node_requireds_append(params, (pm_node_t *) param);
11655
12093
  } else {
11656
12094
  pm_required_parameter_node_t *param = pm_required_parameter_node_create(parser, &name);
12095
+ if (repeated) {
12096
+ pm_node_flag_set_repeated_parameter((pm_node_t *)param);
12097
+ }
11657
12098
  pm_parameters_node_posts_append(params, (pm_node_t *) param);
11658
12099
  }
11659
12100
 
@@ -11668,7 +12109,7 @@ parse_parameters(
11668
12109
  pm_token_t local = name;
11669
12110
  local.end -= 1;
11670
12111
 
11671
- pm_parser_parameter_name_check(parser, &local);
12112
+ bool repeated = pm_parser_parameter_name_check(parser, &local);
11672
12113
  pm_parser_local_add_token(parser, &local);
11673
12114
 
11674
12115
  switch (parser->current.type) {
@@ -11676,6 +12117,9 @@ parse_parameters(
11676
12117
  case PM_TOKEN_PARENTHESIS_RIGHT:
11677
12118
  case PM_TOKEN_PIPE: {
11678
12119
  pm_node_t *param = (pm_node_t *) pm_required_keyword_parameter_node_create(parser, &name);
12120
+ if (repeated) {
12121
+ pm_node_flag_set_repeated_parameter(param);
12122
+ }
11679
12123
  pm_parameters_node_keywords_append(params, param);
11680
12124
  break;
11681
12125
  }
@@ -11687,6 +12131,9 @@ parse_parameters(
11687
12131
  }
11688
12132
 
11689
12133
  pm_node_t *param = (pm_node_t *) pm_required_keyword_parameter_node_create(parser, &name);
12134
+ if (repeated) {
12135
+ pm_node_flag_set_repeated_parameter(param);
12136
+ }
11690
12137
  pm_parameters_node_keywords_append(params, param);
11691
12138
  break;
11692
12139
  }
@@ -11695,17 +12142,22 @@ parse_parameters(
11695
12142
 
11696
12143
  if (token_begins_expression_p(parser->current.type)) {
11697
12144
  context_push(parser, PM_CONTEXT_DEFAULT_PARAMS);
11698
- pm_constant_id_t old_param_name = parser->current_param_name;
11699
- parser->current_param_name = pm_parser_constant_id_token(parser, &local);
12145
+
12146
+ pm_constant_id_t saved_param_name = pm_parser_current_param_name_set(parser, pm_parser_constant_id_token(parser, &local));
11700
12147
  pm_node_t *value = parse_value_expression(parser, binding_power, false, PM_ERR_PARAMETER_NO_DEFAULT_KW);
11701
- parser->current_param_name = old_param_name;
12148
+
12149
+ pm_parser_current_param_name_restore(parser, saved_param_name);
11702
12150
  context_pop(parser);
12151
+
11703
12152
  param = (pm_node_t *) pm_optional_keyword_parameter_node_create(parser, &name, value);
11704
12153
  }
11705
12154
  else {
11706
12155
  param = (pm_node_t *) pm_required_keyword_parameter_node_create(parser, &name);
11707
12156
  }
11708
12157
 
12158
+ if (repeated) {
12159
+ pm_node_flag_set_repeated_parameter(param);
12160
+ }
11709
12161
  pm_parameters_node_keywords_append(params, param);
11710
12162
 
11711
12163
  // If parsing the value of the parameter resulted in error recovery,
@@ -11728,20 +12180,21 @@ parse_parameters(
11728
12180
 
11729
12181
  pm_token_t operator = parser->previous;
11730
12182
  pm_token_t name;
11731
-
12183
+ bool repeated = false;
11732
12184
  if (accept1(parser, PM_TOKEN_IDENTIFIER)) {
11733
12185
  name = parser->previous;
11734
- pm_parser_parameter_name_check(parser, &name);
12186
+ repeated = pm_parser_parameter_name_check(parser, &name);
11735
12187
  pm_parser_local_add_token(parser, &name);
11736
12188
  } else {
11737
12189
  name = not_provided(parser);
11738
12190
 
11739
- if (allows_forwarding_parameters) {
11740
- pm_parser_local_add_token(parser, &operator);
11741
- }
12191
+ parser->current_scope->forwarding_params |= PM_FORWARDING_POSITIONALS;
11742
12192
  }
11743
12193
 
11744
12194
  pm_node_t *param = (pm_node_t *) pm_rest_parameter_node_create(parser, &operator, &name);
12195
+ if (repeated) {
12196
+ pm_node_flag_set_repeated_parameter(param);
12197
+ }
11745
12198
  if (params->rest == NULL) {
11746
12199
  pm_parameters_node_rest_set(params, param);
11747
12200
  } else {
@@ -11764,19 +12217,21 @@ parse_parameters(
11764
12217
  } else {
11765
12218
  pm_token_t name;
11766
12219
 
12220
+ bool repeated = false;
11767
12221
  if (accept1(parser, PM_TOKEN_IDENTIFIER)) {
11768
12222
  name = parser->previous;
11769
- pm_parser_parameter_name_check(parser, &name);
12223
+ repeated = pm_parser_parameter_name_check(parser, &name);
11770
12224
  pm_parser_local_add_token(parser, &name);
11771
12225
  } else {
11772
12226
  name = not_provided(parser);
11773
12227
 
11774
- if (allows_forwarding_parameters) {
11775
- pm_parser_local_add_token(parser, &operator);
11776
- }
12228
+ parser->current_scope->forwarding_params |= PM_FORWARDING_KEYWORDS;
11777
12229
  }
11778
12230
 
11779
12231
  param = (pm_node_t *) pm_keyword_rest_parameter_node_create(parser, &operator, &name);
12232
+ if (repeated) {
12233
+ pm_node_flag_set_repeated_parameter(param);
12234
+ }
11780
12235
  }
11781
12236
 
11782
12237
  if (params->keyword_rest == NULL) {
@@ -11964,25 +12419,10 @@ parse_rescues(pm_parser_t *parser, pm_begin_node_t *parent_node, bool def_p) {
11964
12419
  }
11965
12420
 
11966
12421
  static inline pm_begin_node_t *
11967
- parse_rescues_as_begin(pm_parser_t *parser, pm_statements_node_t *statements, bool def_p) {
12422
+ parse_rescues_as_begin(pm_parser_t *parser, const uint8_t *start, pm_statements_node_t *statements, bool def_p) {
11968
12423
  pm_token_t no_begin_token = not_provided(parser);
11969
12424
  pm_begin_node_t *begin_node = pm_begin_node_create(parser, &no_begin_token, statements);
11970
12425
  parse_rescues(parser, begin_node, def_p);
11971
-
11972
- // All nodes within a begin node are optional, so we look
11973
- // for the earliest possible node that we can use to set
11974
- // the BeginNode's start location
11975
- const uint8_t *start = begin_node->base.location.start;
11976
- if (begin_node->statements) {
11977
- start = begin_node->statements->base.location.start;
11978
- } else if (begin_node->rescue_clause) {
11979
- start = begin_node->rescue_clause->base.location.start;
11980
- } else if (begin_node->else_clause) {
11981
- start = begin_node->else_clause->base.location.start;
11982
- } else if (begin_node->ensure_clause) {
11983
- start = begin_node->ensure_clause->base.location.start;
11984
- }
11985
-
11986
12426
  begin_node->base.location.start = start;
11987
12427
  return begin_node;
11988
12428
  }
@@ -12012,10 +12452,13 @@ parse_block_parameters(
12012
12452
  if ((opening->type != PM_TOKEN_NOT_PROVIDED) && accept1(parser, PM_TOKEN_SEMICOLON)) {
12013
12453
  do {
12014
12454
  expect1(parser, PM_TOKEN_IDENTIFIER, PM_ERR_BLOCK_PARAM_LOCAL_VARIABLE);
12015
- pm_parser_parameter_name_check(parser, &parser->previous);
12455
+ bool repeated = pm_parser_parameter_name_check(parser, &parser->previous);
12016
12456
  pm_parser_local_add_token(parser, &parser->previous);
12017
12457
 
12018
12458
  pm_block_local_variable_node_t *local = pm_block_local_variable_node_create(parser, &parser->previous);
12459
+ if (repeated) {
12460
+ pm_node_flag_set_repeated_parameter((pm_node_t *)local);
12461
+ }
12019
12462
  pm_block_parameters_node_append_local(block_parameters, local);
12020
12463
  } while (accept1(parser, PM_TOKEN_COMMA));
12021
12464
  }
@@ -12031,8 +12474,10 @@ parse_block(pm_parser_t *parser) {
12031
12474
  pm_token_t opening = parser->previous;
12032
12475
  accept1(parser, PM_TOKEN_NEWLINE);
12033
12476
 
12477
+ pm_constant_id_t saved_param_name = pm_parser_current_param_name_unset(parser);
12034
12478
  pm_accepts_block_stack_push(parser, true);
12035
12479
  pm_parser_scope_push(parser, false);
12480
+
12036
12481
  pm_block_parameters_node_t *block_parameters = NULL;
12037
12482
 
12038
12483
  if (accept1(parser, PM_TOKEN_PIPE)) {
@@ -12053,12 +12498,6 @@ parse_block(pm_parser_t *parser) {
12053
12498
  pm_block_parameters_node_closing_set(block_parameters, &parser->previous);
12054
12499
  }
12055
12500
 
12056
- uint32_t locals_body_index = 0;
12057
-
12058
- if (block_parameters) {
12059
- locals_body_index = (uint32_t) parser->current_scope->locals.size;
12060
- }
12061
-
12062
12501
  accept1(parser, PM_TOKEN_NEWLINE);
12063
12502
  pm_node_t *statements = NULL;
12064
12503
 
@@ -12078,7 +12517,7 @@ parse_block(pm_parser_t *parser) {
12078
12517
 
12079
12518
  if (match2(parser, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ENSURE)) {
12080
12519
  assert(statements == NULL || PM_NODE_TYPE_P(statements, PM_STATEMENTS_NODE));
12081
- statements = (pm_node_t *) parse_rescues_as_begin(parser, (pm_statements_node_t *) statements, false);
12520
+ statements = (pm_node_t *) parse_rescues_as_begin(parser, opening.start, (pm_statements_node_t *) statements, false);
12082
12521
  }
12083
12522
  }
12084
12523
 
@@ -12090,13 +12529,14 @@ parse_block(pm_parser_t *parser) {
12090
12529
 
12091
12530
  if (parameters == NULL && (maximum > 0)) {
12092
12531
  parameters = (pm_node_t *) pm_numbered_parameters_node_create(parser, &(pm_location_t) { .start = opening.start, .end = parser->previous.end }, maximum);
12093
- locals_body_index = maximum;
12094
12532
  }
12095
12533
 
12096
12534
  pm_constant_id_list_t locals = parser->current_scope->locals;
12097
12535
  pm_parser_scope_pop(parser);
12098
12536
  pm_accepts_block_stack_pop(parser);
12099
- return pm_block_node_create(parser, &locals, locals_body_index, &opening, parameters, statements, &parser->previous);
12537
+ pm_parser_current_param_name_restore(parser, saved_param_name);
12538
+
12539
+ return pm_block_node_create(parser, &locals, &opening, parameters, statements, &parser->previous);
12100
12540
  }
12101
12541
 
12102
12542
  /**
@@ -12157,14 +12597,20 @@ parse_arguments_list(pm_parser_t *parser, pm_arguments_t *arguments, bool accept
12157
12597
  }
12158
12598
 
12159
12599
  if (block != NULL) {
12160
- if (arguments->block == NULL) {
12600
+ if (arguments->block == NULL && !arguments->has_forwarding) {
12161
12601
  arguments->block = (pm_node_t *) block;
12162
12602
  } else {
12163
- pm_parser_err_node(parser, (pm_node_t *) block, PM_ERR_ARGUMENT_BLOCK_MULTI);
12164
- if (arguments->arguments == NULL) {
12165
- arguments->arguments = pm_arguments_node_create(parser);
12603
+ if (arguments->has_forwarding) {
12604
+ pm_parser_err_node(parser, (pm_node_t *) block, PM_ERR_ARGUMENT_BLOCK_FORWARDING);
12605
+ } else {
12606
+ pm_parser_err_node(parser, (pm_node_t *) block, PM_ERR_ARGUMENT_BLOCK_MULTI);
12607
+ }
12608
+ if (arguments->block != NULL) {
12609
+ if (arguments->arguments == NULL) {
12610
+ arguments->arguments = pm_arguments_node_create(parser);
12611
+ }
12612
+ pm_arguments_node_arguments_append(arguments->arguments, arguments->block);
12166
12613
  }
12167
- pm_arguments_node_arguments_append(arguments->arguments, arguments->block);
12168
12614
  arguments->block = (pm_node_t *) block;
12169
12615
  }
12170
12616
  }
@@ -12384,8 +12830,14 @@ static inline pm_node_flags_t
12384
12830
  parse_unescaped_encoding(const pm_parser_t *parser) {
12385
12831
  if (parser->explicit_encoding != NULL) {
12386
12832
  if (parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) {
12833
+ // If the there's an explicit encoding and it's using a UTF-8 escape
12834
+ // sequence, then mark the string as UTF-8.
12387
12835
  return PM_STRING_FLAGS_FORCED_UTF8_ENCODING;
12388
12836
  } else if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY) {
12837
+ // If there's a non-UTF-8 escape sequence being used, then the
12838
+ // string uses the source encoding, unless the source is marked as
12839
+ // US-ASCII. In that case the string is forced as ASCII-8BIT in
12840
+ // order to keep the string valid.
12389
12841
  return PM_STRING_FLAGS_FORCED_BINARY_ENCODING;
12390
12842
  }
12391
12843
  }
@@ -12509,14 +12961,54 @@ parse_string_part(pm_parser_t *parser) {
12509
12961
  }
12510
12962
  }
12511
12963
 
12964
+ /**
12965
+ * When creating a symbol, unary operators that cannot be binary operators
12966
+ * automatically drop trailing `@` characters. This happens at the parser level,
12967
+ * such that `~@` is parsed as `~` and `!@` is parsed as `!`. We do that here.
12968
+ */
12969
+ static const uint8_t *
12970
+ parse_operator_symbol_name(const pm_token_t *name) {
12971
+ switch (name->type) {
12972
+ case PM_TOKEN_TILDE:
12973
+ case PM_TOKEN_BANG:
12974
+ if (name->end[-1] == '@') return name->end - 1;
12975
+ /* fallthrough */
12976
+ default:
12977
+ return name->end;
12978
+ }
12979
+ }
12980
+
12981
+ static pm_node_t *
12982
+ parse_operator_symbol(pm_parser_t *parser, const pm_token_t *opening, pm_lex_state_t next_state) {
12983
+ pm_token_t closing = not_provided(parser);
12984
+ pm_symbol_node_t *symbol = pm_symbol_node_create(parser, opening, &parser->current, &closing);
12985
+
12986
+ const uint8_t *end = parse_operator_symbol_name(&parser->current);
12987
+
12988
+ if (next_state != PM_LEX_STATE_NONE) lex_state_set(parser, next_state);
12989
+ parser_lex(parser);
12990
+
12991
+ pm_string_shared_init(&symbol->unescaped, parser->previous.start, end);
12992
+ pm_node_flag_set((pm_node_t *) symbol, PM_SYMBOL_FLAGS_FORCED_US_ASCII_ENCODING);
12993
+
12994
+ return (pm_node_t *) symbol;
12995
+ }
12996
+
12997
+ /**
12998
+ * Parse a symbol node. This function will get called immediately after finding
12999
+ * a symbol opening token. This handles parsing bare symbols and interpolated
13000
+ * symbols.
13001
+ */
12512
13002
  static pm_node_t *
12513
13003
  parse_symbol(pm_parser_t *parser, pm_lex_mode_t *lex_mode, pm_lex_state_t next_state) {
12514
- pm_token_t opening = parser->previous;
13004
+ const pm_token_t opening = parser->previous;
12515
13005
 
12516
13006
  if (lex_mode->mode != PM_LEX_STRING) {
12517
13007
  if (next_state != PM_LEX_STATE_NONE) lex_state_set(parser, next_state);
12518
13008
 
12519
13009
  switch (parser->current.type) {
13010
+ case PM_CASE_OPERATOR:
13011
+ return parse_operator_symbol(parser, &opening, next_state == PM_LEX_STATE_NONE ? PM_LEX_STATE_ENDFN : next_state);
12520
13012
  case PM_TOKEN_IDENTIFIER:
12521
13013
  case PM_TOKEN_CONSTANT:
12522
13014
  case PM_TOKEN_INSTANCE_VARIABLE:
@@ -12528,10 +13020,6 @@ parse_symbol(pm_parser_t *parser, pm_lex_mode_t *lex_mode, pm_lex_state_t next_s
12528
13020
  case PM_CASE_KEYWORD:
12529
13021
  parser_lex(parser);
12530
13022
  break;
12531
- case PM_CASE_OPERATOR:
12532
- lex_state_set(parser, next_state == PM_LEX_STATE_NONE ? PM_LEX_STATE_ENDFN : next_state);
12533
- parser_lex(parser);
12534
- break;
12535
13023
  default:
12536
13024
  expect2(parser, PM_TOKEN_IDENTIFIER, PM_TOKEN_METHOD_NAME, PM_ERR_SYMBOL_INVALID);
12537
13025
  break;
@@ -12541,6 +13029,8 @@ parse_symbol(pm_parser_t *parser, pm_lex_mode_t *lex_mode, pm_lex_state_t next_s
12541
13029
  pm_symbol_node_t *symbol = pm_symbol_node_create(parser, &opening, &parser->previous, &closing);
12542
13030
 
12543
13031
  pm_string_shared_init(&symbol->unescaped, parser->previous.start, parser->previous.end);
13032
+ pm_node_flag_set((pm_node_t *) symbol, parse_symbol_encoding(parser, &symbol->unescaped));
13033
+
12544
13034
  return (pm_node_t *) symbol;
12545
13035
  }
12546
13036
 
@@ -12637,7 +13127,8 @@ parse_symbol(pm_parser_t *parser, pm_lex_mode_t *lex_mode, pm_lex_state_t next_s
12637
13127
  } else {
12638
13128
  expect1(parser, PM_TOKEN_STRING_END, PM_ERR_SYMBOL_TERM_DYNAMIC);
12639
13129
  }
12640
- return (pm_node_t *) pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped);
13130
+
13131
+ return (pm_node_t *) pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped, parse_symbol_encoding(parser, &unescaped));
12641
13132
  }
12642
13133
 
12643
13134
  /**
@@ -12647,8 +13138,11 @@ parse_symbol(pm_parser_t *parser, pm_lex_mode_t *lex_mode, pm_lex_state_t next_s
12647
13138
  static inline pm_node_t *
12648
13139
  parse_undef_argument(pm_parser_t *parser) {
12649
13140
  switch (parser->current.type) {
13141
+ case PM_CASE_OPERATOR: {
13142
+ const pm_token_t opening = not_provided(parser);
13143
+ return parse_operator_symbol(parser, &opening, PM_LEX_STATE_NONE);
13144
+ }
12650
13145
  case PM_CASE_KEYWORD:
12651
- case PM_CASE_OPERATOR:
12652
13146
  case PM_TOKEN_CONSTANT:
12653
13147
  case PM_TOKEN_IDENTIFIER:
12654
13148
  case PM_TOKEN_METHOD_NAME: {
@@ -12659,6 +13153,8 @@ parse_undef_argument(pm_parser_t *parser) {
12659
13153
  pm_symbol_node_t *symbol = pm_symbol_node_create(parser, &opening, &parser->previous, &closing);
12660
13154
 
12661
13155
  pm_string_shared_init(&symbol->unescaped, parser->previous.start, parser->previous.end);
13156
+ pm_node_flag_set((pm_node_t *) symbol, parse_symbol_encoding(parser, &symbol->unescaped));
13157
+
12662
13158
  return (pm_node_t *) symbol;
12663
13159
  }
12664
13160
  case PM_TOKEN_SYMBOL_BEGIN: {
@@ -12682,21 +13178,24 @@ parse_undef_argument(pm_parser_t *parser) {
12682
13178
  static inline pm_node_t *
12683
13179
  parse_alias_argument(pm_parser_t *parser, bool first) {
12684
13180
  switch (parser->current.type) {
12685
- case PM_CASE_OPERATOR:
13181
+ case PM_CASE_OPERATOR: {
13182
+ const pm_token_t opening = not_provided(parser);
13183
+ return parse_operator_symbol(parser, &opening, first ? PM_LEX_STATE_FNAME | PM_LEX_STATE_FITEM : PM_LEX_STATE_NONE);
13184
+ }
12686
13185
  case PM_CASE_KEYWORD:
12687
13186
  case PM_TOKEN_CONSTANT:
12688
13187
  case PM_TOKEN_IDENTIFIER:
12689
13188
  case PM_TOKEN_METHOD_NAME: {
12690
- if (first) {
12691
- lex_state_set(parser, PM_LEX_STATE_FNAME | PM_LEX_STATE_FITEM);
12692
- }
12693
-
13189
+ if (first) lex_state_set(parser, PM_LEX_STATE_FNAME | PM_LEX_STATE_FITEM);
12694
13190
  parser_lex(parser);
13191
+
12695
13192
  pm_token_t opening = not_provided(parser);
12696
13193
  pm_token_t closing = not_provided(parser);
12697
13194
  pm_symbol_node_t *symbol = pm_symbol_node_create(parser, &opening, &parser->previous, &closing);
12698
13195
 
12699
13196
  pm_string_shared_init(&symbol->unescaped, parser->previous.start, parser->previous.end);
13197
+ pm_node_flag_set((pm_node_t *) symbol, parse_symbol_encoding(parser, &symbol->unescaped));
13198
+
12700
13199
  return (pm_node_t *) symbol;
12701
13200
  }
12702
13201
  case PM_TOKEN_SYMBOL_BEGIN: {
@@ -12733,6 +13232,64 @@ outer_scope_using_numbered_parameters_p(pm_parser_t *parser) {
12733
13232
  return false;
12734
13233
  }
12735
13234
 
13235
+ /**
13236
+ * These are the names of the various numbered parameters. We have them here so
13237
+ * that when we insert them into the constant pool we can use a constant string
13238
+ * and not have to allocate.
13239
+ */
13240
+ static const char * const pm_numbered_parameter_names[] = {
13241
+ "_1", "_2", "_3", "_4", "_5", "_6", "_7", "_8", "_9"
13242
+ };
13243
+
13244
+ /**
13245
+ * Parse an identifier into a local variable read. If the local variable
13246
+ * is not found, it returns NULL instead.
13247
+ */
13248
+ static pm_local_variable_read_node_t *
13249
+ parse_variable(pm_parser_t *parser) {
13250
+ int depth;
13251
+ if ((depth = pm_parser_local_depth(parser, &parser->previous)) != -1) {
13252
+ return pm_local_variable_read_node_create(parser, &parser->previous, (uint32_t) depth);
13253
+ }
13254
+
13255
+ if (!parser->current_scope->closed && pm_token_is_numbered_parameter(parser->previous.start, parser->previous.end)) {
13256
+ // Now that we know we have a numbered parameter, we need to check
13257
+ // if it's allowed in this context. If it is, then we will create a
13258
+ // local variable read. If it's not, then we'll create a normal call
13259
+ // node but add an error.
13260
+ if (parser->current_scope->explicit_params) {
13261
+ pm_parser_err_previous(parser, PM_ERR_NUMBERED_PARAMETER_NOT_ALLOWED);
13262
+ } else if (outer_scope_using_numbered_parameters_p(parser)) {
13263
+ pm_parser_err_previous(parser, PM_ERR_NUMBERED_PARAMETER_OUTER_SCOPE);
13264
+ } else {
13265
+ // Indicate that this scope is using numbered params so that child
13266
+ // scopes cannot. We subtract the value for the character '0' to get
13267
+ // the actual integer value of the number (only _1 through _9 are
13268
+ // valid).
13269
+ uint8_t numbered_parameters = (uint8_t) (parser->previous.start[1] - '0');
13270
+ if (numbered_parameters > parser->current_scope->numbered_parameters) {
13271
+ parser->current_scope->numbered_parameters = numbered_parameters;
13272
+ pm_parser_numbered_parameters_set(parser, numbered_parameters);
13273
+ }
13274
+
13275
+ // When you use a numbered parameter, it implies the existence
13276
+ // of all of the locals that exist before it. For example,
13277
+ // referencing _2 means that _1 must exist. Therefore here we
13278
+ // loop through all of the possibilities and add them into the
13279
+ // constant pool.
13280
+ for (uint8_t numbered_parameter = 1; numbered_parameter <= numbered_parameters - 1; numbered_parameter++) {
13281
+ pm_parser_local_add_constant(parser, pm_numbered_parameter_names[numbered_parameter - 1], 2);
13282
+ }
13283
+
13284
+ // Finally we can create the local variable read node.
13285
+ pm_constant_id_t name_id = pm_parser_local_add_constant(parser, pm_numbered_parameter_names[numbered_parameters - 1], 2);
13286
+ return pm_local_variable_read_node_create_constant_id(parser, &parser->previous, name_id, 0);
13287
+ }
13288
+ }
13289
+
13290
+ return NULL;
13291
+ }
13292
+
12736
13293
  /**
12737
13294
  * Parse an identifier into either a local variable read or a call.
12738
13295
  */
@@ -12741,56 +13298,8 @@ parse_variable_call(pm_parser_t *parser) {
12741
13298
  pm_node_flags_t flags = 0;
12742
13299
 
12743
13300
  if (!match1(parser, PM_TOKEN_PARENTHESIS_LEFT) && (parser->previous.end[-1] != '!') && (parser->previous.end[-1] != '?')) {
12744
- int depth;
12745
- if ((depth = pm_parser_local_depth(parser, &parser->previous)) != -1) {
12746
- return (pm_node_t *) pm_local_variable_read_node_create(parser, &parser->previous, (uint32_t) depth);
12747
- }
12748
-
12749
- if (!parser->current_scope->closed && pm_token_is_numbered_parameter(parser->previous.start, parser->previous.end)) {
12750
- // Now that we know we have a numbered parameter, we need to check
12751
- // if it's allowed in this context. If it is, then we will create a
12752
- // local variable read. If it's not, then we'll create a normal call
12753
- // node but add an error.
12754
- if (parser->current_scope->explicit_params) {
12755
- pm_parser_err_previous(parser, PM_ERR_NUMBERED_PARAMETER_NOT_ALLOWED);
12756
- } else if (outer_scope_using_numbered_parameters_p(parser)) {
12757
- pm_parser_err_previous(parser, PM_ERR_NUMBERED_PARAMETER_OUTER_SCOPE);
12758
- } else {
12759
- // Indicate that this scope is using numbered params so that child
12760
- // scopes cannot.
12761
- uint8_t number = parser->previous.start[1];
12762
-
12763
- // We subtract the value for the character '0' to get the actual
12764
- // integer value of the number (only _1 through _9 are valid)
12765
- uint8_t numbered_parameters = (uint8_t) (number - '0');
12766
- if (numbered_parameters > parser->current_scope->numbered_parameters) {
12767
- parser->current_scope->numbered_parameters = numbered_parameters;
12768
- pm_parser_numbered_parameters_set(parser, numbered_parameters);
12769
- }
12770
-
12771
- // When you use a numbered parameter, it implies the existence
12772
- // of all of the locals that exist before it. For example,
12773
- // referencing _2 means that _1 must exist. Therefore here we
12774
- // loop through all of the possibilities and add them into the
12775
- // constant pool.
12776
- uint8_t current = '1';
12777
- uint8_t *value;
12778
-
12779
- while (current < number) {
12780
- value = malloc(2);
12781
- value[0] = '_';
12782
- value[1] = current++;
12783
- pm_parser_local_add_owned(parser, value, 2);
12784
- }
12785
-
12786
- // Now we can add the actual token that is being used. For
12787
- // this one we can add a shared version since it is directly
12788
- // referenced in the source.
12789
- pm_parser_local_add_token(parser, &parser->previous);
12790
- return (pm_node_t *) pm_local_variable_read_node_create(parser, &parser->previous, 0);
12791
- }
12792
- }
12793
-
13301
+ pm_local_variable_read_node_t *node = parse_variable(parser);
13302
+ if (node != NULL) return (pm_node_t *) node;
12794
13303
  flags |= PM_CALL_NODE_FLAGS_VARIABLE_CALL;
12795
13304
  }
12796
13305
 
@@ -13076,43 +13585,77 @@ parse_pattern_keyword_rest(pm_parser_t *parser) {
13076
13585
  return (pm_node_t *) pm_assoc_splat_node_create(parser, value, &operator);
13077
13586
  }
13078
13587
 
13588
+ /**
13589
+ * Create an implicit node for the value of a hash pattern that has omitted the
13590
+ * value. This will use an implicit local variable target.
13591
+ */
13592
+ static pm_node_t *
13593
+ parse_pattern_hash_implicit_value(pm_parser_t *parser, pm_symbol_node_t *key) {
13594
+ const pm_location_t *value_loc = &((pm_symbol_node_t *) key)->value_loc;
13595
+ pm_constant_id_t name = pm_parser_constant_id_location(parser, value_loc->start, value_loc->end);
13596
+
13597
+ int current_depth = pm_parser_local_depth_constant_id(parser, name);
13598
+ uint32_t depth;
13599
+
13600
+ if (current_depth == -1) {
13601
+ pm_parser_local_add_location(parser, value_loc->start, value_loc->end);
13602
+ depth = 0;
13603
+ } else {
13604
+ depth = (uint32_t) current_depth;
13605
+ }
13606
+
13607
+ pm_local_variable_target_node_t *target = pm_local_variable_target_node_create_values(parser, value_loc, name, depth);
13608
+ return (pm_node_t *) pm_implicit_node_create(parser, (pm_node_t *) target);
13609
+ }
13610
+
13079
13611
  /**
13080
13612
  * Parse a hash pattern.
13081
13613
  */
13082
13614
  static pm_hash_pattern_node_t *
13083
- parse_pattern_hash(pm_parser_t *parser, pm_node_t *first_assoc) {
13615
+ parse_pattern_hash(pm_parser_t *parser, pm_node_t *first_node) {
13084
13616
  pm_node_list_t assocs = { 0 };
13085
13617
  pm_node_t *rest = NULL;
13086
13618
 
13087
- switch (PM_NODE_TYPE(first_assoc)) {
13088
- case PM_ASSOC_NODE: {
13089
- if (!match7(parser, PM_TOKEN_COMMA, PM_TOKEN_KEYWORD_THEN, PM_TOKEN_BRACE_RIGHT, PM_TOKEN_BRACKET_RIGHT, PM_TOKEN_PARENTHESIS_RIGHT, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON)) {
13090
- // Here we have a value for the first assoc in the list, so we will
13091
- // parse it now and update the first assoc.
13092
- pm_node_t *value = parse_pattern(parser, false, PM_ERR_PATTERN_EXPRESSION_AFTER_KEY);
13619
+ switch (PM_NODE_TYPE(first_node)) {
13620
+ case PM_ASSOC_SPLAT_NODE:
13621
+ case PM_NO_KEYWORDS_PARAMETER_NODE:
13622
+ rest = first_node;
13623
+ break;
13624
+ case PM_SYMBOL_NODE: {
13625
+ if (pm_symbol_node_label_p(first_node)) {
13626
+ pm_node_t *value;
13627
+
13628
+ if (!match7(parser, PM_TOKEN_COMMA, PM_TOKEN_KEYWORD_THEN, PM_TOKEN_BRACE_RIGHT, PM_TOKEN_BRACKET_RIGHT, PM_TOKEN_PARENTHESIS_RIGHT, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON)) {
13629
+ // Here we have a value for the first assoc in the list, so
13630
+ // we will parse it now.
13631
+ value = parse_pattern(parser, false, PM_ERR_PATTERN_EXPRESSION_AFTER_KEY);
13632
+ } else {
13633
+ // Otherwise, we will create an implicit local variable
13634
+ // target for the value.
13635
+ value = parse_pattern_hash_implicit_value(parser, (pm_symbol_node_t *) first_node);
13636
+ }
13093
13637
 
13094
- pm_assoc_node_t *assoc = (pm_assoc_node_t *) first_assoc;
13095
- assoc->base.location.end = value->location.end;
13096
- assoc->value = value;
13097
- } else {
13098
- pm_node_t *key = ((pm_assoc_node_t *) first_assoc)->key;
13638
+ pm_token_t operator = not_provided(parser);
13639
+ pm_node_t *assoc = (pm_node_t *) pm_assoc_node_create(parser, first_node, &operator, value);
13099
13640
 
13100
- if (PM_NODE_TYPE_P(key, PM_SYMBOL_NODE)) {
13101
- const pm_location_t *value_loc = &((pm_symbol_node_t *) key)->value_loc;
13102
- pm_parser_local_add_location(parser, value_loc->start, value_loc->end);
13103
- }
13641
+ pm_node_list_append(&assocs, assoc);
13642
+ break;
13104
13643
  }
13644
+ }
13645
+ /* fallthrough */
13646
+ default: {
13647
+ // If we get anything else, then this is an error. For this we'll
13648
+ // create a missing node for the value and create an assoc node for
13649
+ // the first node in the list.
13650
+ pm_parser_err_node(parser, first_node, PM_ERR_PATTERN_HASH_KEY_LABEL);
13651
+
13652
+ pm_token_t operator = not_provided(parser);
13653
+ pm_node_t *value = (pm_node_t *) pm_missing_node_create(parser, first_node->location.start, first_node->location.end);
13654
+ pm_node_t *assoc = (pm_node_t *) pm_assoc_node_create(parser, first_node, &operator, value);
13105
13655
 
13106
- pm_node_list_append(&assocs, first_assoc);
13656
+ pm_node_list_append(&assocs, assoc);
13107
13657
  break;
13108
13658
  }
13109
- case PM_ASSOC_SPLAT_NODE:
13110
- case PM_NO_KEYWORDS_PARAMETER_NODE:
13111
- rest = first_assoc;
13112
- break;
13113
- default:
13114
- assert(false);
13115
- break;
13116
13659
  }
13117
13660
 
13118
13661
  // If there are any other assocs, then we'll parse them now.
@@ -13141,6 +13684,7 @@ parse_pattern_hash(pm_parser_t *parser, pm_node_t *first_assoc) {
13141
13684
  } else {
13142
13685
  const pm_location_t *value_loc = &((pm_symbol_node_t *) key)->value_loc;
13143
13686
  pm_parser_local_add_location(parser, value_loc->start, value_loc->end);
13687
+ value = parse_pattern_hash_implicit_value(parser, (pm_symbol_node_t *) key);
13144
13688
  }
13145
13689
 
13146
13690
  pm_token_t operator = not_provided(parser);
@@ -13246,45 +13790,29 @@ parse_pattern_primitive(pm_parser_t *parser, pm_diagnostic_id_t diag_id) {
13246
13790
  // pattern node.
13247
13791
  node = pm_hash_pattern_node_empty_create(parser, &opening, &parser->previous);
13248
13792
  } else {
13249
- pm_node_t *first_assoc;
13793
+ pm_node_t *first_node;
13250
13794
 
13251
13795
  switch (parser->current.type) {
13252
- case PM_TOKEN_LABEL: {
13796
+ case PM_TOKEN_LABEL:
13253
13797
  parser_lex(parser);
13254
-
13255
- pm_symbol_node_t *key = pm_symbol_node_label_create(parser, &parser->previous);
13256
- pm_token_t operator = not_provided(parser);
13257
-
13258
- first_assoc = (pm_node_t *) pm_assoc_node_create(parser, (pm_node_t *) key, &operator, NULL);
13798
+ first_node = (pm_node_t *) pm_symbol_node_label_create(parser, &parser->previous);
13259
13799
  break;
13260
- }
13261
13800
  case PM_TOKEN_USTAR_STAR:
13262
- first_assoc = parse_pattern_keyword_rest(parser);
13801
+ first_node = parse_pattern_keyword_rest(parser);
13263
13802
  break;
13264
- case PM_TOKEN_STRING_BEGIN: {
13265
- pm_node_t *key = parse_expression(parser, PM_BINDING_POWER_MAX, false, PM_ERR_PATTERN_HASH_KEY);
13266
- pm_token_t operator = not_provided(parser);
13267
-
13268
- if (!pm_symbol_node_label_p(key)) {
13269
- pm_parser_err_node(parser, key, PM_ERR_PATTERN_HASH_KEY_LABEL);
13270
- }
13271
-
13272
- first_assoc = (pm_node_t *) pm_assoc_node_create(parser, key, &operator, NULL);
13803
+ case PM_TOKEN_STRING_BEGIN:
13804
+ first_node = parse_expression(parser, PM_BINDING_POWER_MAX, false, PM_ERR_PATTERN_HASH_KEY);
13273
13805
  break;
13274
- }
13275
13806
  default: {
13276
13807
  parser_lex(parser);
13277
13808
  pm_parser_err_previous(parser, PM_ERR_PATTERN_HASH_KEY);
13278
13809
 
13279
- pm_missing_node_t *key = pm_missing_node_create(parser, parser->previous.start, parser->previous.end);
13280
- pm_token_t operator = not_provided(parser);
13281
-
13282
- first_assoc = (pm_node_t *) pm_assoc_node_create(parser, (pm_node_t *) key, &operator, NULL);
13810
+ first_node = (pm_node_t *) pm_missing_node_create(parser, parser->previous.start, parser->previous.end);
13283
13811
  break;
13284
13812
  }
13285
13813
  }
13286
13814
 
13287
- node = parse_pattern_hash(parser, first_assoc);
13815
+ node = parse_pattern_hash(parser, first_node);
13288
13816
 
13289
13817
  accept1(parser, PM_TOKEN_NEWLINE);
13290
13818
  expect1(parser, PM_TOKEN_BRACE_RIGHT, PM_ERR_PATTERN_TERM_BRACE);
@@ -13350,7 +13878,16 @@ parse_pattern_primitive(pm_parser_t *parser, pm_diagnostic_id_t diag_id) {
13350
13878
  switch (parser->current.type) {
13351
13879
  case PM_TOKEN_IDENTIFIER: {
13352
13880
  parser_lex(parser);
13353
- pm_node_t *variable = (pm_node_t *) pm_local_variable_read_node_create(parser, &parser->previous, 0);
13881
+ pm_node_t *variable = (pm_node_t *) parse_variable(parser);
13882
+ if (variable == NULL) {
13883
+ if (parser->version != PM_OPTIONS_VERSION_CRUBY_3_3_0 && pm_token_is_it(parser->previous.start, parser->previous.end)) {
13884
+ pm_constant_id_t name_id = pm_parser_constant_id_constant(parser, "0it", 3);
13885
+ variable = (pm_node_t *) pm_local_variable_read_node_create_constant_id(parser, &parser->previous, name_id, 0);
13886
+ } else {
13887
+ PM_PARSER_ERR_TOKEN_FORMAT_CONTENT(parser, parser->previous, PM_ERR_NO_LOCAL_VARIABLE);
13888
+ variable = (pm_node_t *) pm_local_variable_read_node_create(parser, &parser->previous, 0);
13889
+ }
13890
+ }
13354
13891
 
13355
13892
  return (pm_node_t *) pm_pinned_variable_node_create(parser, &operator, variable);
13356
13893
  }
@@ -13519,9 +14056,7 @@ parse_pattern(pm_parser_t *parser, bool top_pattern, pm_diagnostic_id_t diag_id)
13519
14056
  case PM_TOKEN_LABEL: {
13520
14057
  parser_lex(parser);
13521
14058
  pm_node_t *key = (pm_node_t *) pm_symbol_node_label_create(parser, &parser->previous);
13522
- pm_token_t operator = not_provided(parser);
13523
-
13524
- return (pm_node_t *) parse_pattern_hash(parser, (pm_node_t *) pm_assoc_node_create(parser, key, &operator, NULL));
14059
+ return (pm_node_t *) parse_pattern_hash(parser, key);
13525
14060
  }
13526
14061
  case PM_TOKEN_USTAR_STAR: {
13527
14062
  node = parse_pattern_keyword_rest(parser);
@@ -13544,8 +14079,7 @@ parse_pattern(pm_parser_t *parser, bool top_pattern, pm_diagnostic_id_t diag_id)
13544
14079
  // If we got a dynamic label symbol, then we need to treat it like the
13545
14080
  // beginning of a hash pattern.
13546
14081
  if (pm_symbol_node_label_p(node)) {
13547
- pm_token_t operator = not_provided(parser);
13548
- return (pm_node_t *) parse_pattern_hash(parser, (pm_node_t *) pm_assoc_node_create(parser, node, &operator, NULL));
14082
+ return (pm_node_t *) parse_pattern_hash(parser, node);
13549
14083
  }
13550
14084
 
13551
14085
  if (top_pattern && match1(parser, PM_TOKEN_COMMA)) {
@@ -13558,7 +14092,7 @@ parse_pattern(pm_parser_t *parser, bool top_pattern, pm_diagnostic_id_t diag_id)
13558
14092
  // Gather up all of the patterns into the list.
13559
14093
  while (accept1(parser, PM_TOKEN_COMMA)) {
13560
14094
  // Break early here in case we have a trailing comma.
13561
- if (match5(parser, PM_TOKEN_KEYWORD_THEN, PM_TOKEN_BRACE_RIGHT, PM_TOKEN_BRACKET_RIGHT, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON)) {
14095
+ if (match6(parser, PM_TOKEN_KEYWORD_THEN, PM_TOKEN_BRACE_RIGHT, PM_TOKEN_BRACKET_RIGHT, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON, PM_TOKEN_EOF)) {
13562
14096
  node = (pm_node_t *) pm_implicit_rest_node_create(parser, &parser->previous);
13563
14097
  pm_node_list_append(&nodes, node);
13564
14098
  break;
@@ -13644,7 +14178,7 @@ parse_strings(pm_parser_t *parser, pm_node_t *current) {
13644
14178
  assert(parser->current.type == PM_TOKEN_STRING_BEGIN);
13645
14179
 
13646
14180
  bool concating = false;
13647
- bool state_is_arg_labeled = lex_state_p(parser, PM_LEX_STATE_ARG | PM_LEX_STATE_LABELED);
14181
+ bool state_is_arg_labeled = lex_state_arg_labeled_p(parser);
13648
14182
 
13649
14183
  while (match1(parser, PM_TOKEN_STRING_BEGIN)) {
13650
14184
  pm_node_t *node = NULL;
@@ -13659,7 +14193,7 @@ parse_strings(pm_parser_t *parser, pm_node_t *current) {
13659
14193
  parser_lex(parser);
13660
14194
 
13661
14195
  if (match2(parser, PM_TOKEN_STRING_END, PM_TOKEN_EOF)) {
13662
- expect1(parser, PM_TOKEN_STRING_END, PM_ERR_STRING_LITERAL_TERM);
14196
+ expect1(parser, PM_TOKEN_STRING_END, PM_ERR_STRING_LITERAL_EOF);
13663
14197
  // If we get here, then we have an end immediately after a
13664
14198
  // start. In that case we'll create an empty content token and
13665
14199
  // return an uninterpolated string.
@@ -13672,7 +14206,6 @@ parse_strings(pm_parser_t *parser, pm_node_t *current) {
13672
14206
  // If we get here, then we have an end of a label immediately
13673
14207
  // after a start. In that case we'll create an empty symbol
13674
14208
  // node.
13675
- pm_token_t opening = not_provided(parser);
13676
14209
  pm_token_t content = parse_strings_empty_content(parser->previous.start);
13677
14210
  pm_symbol_node_t *symbol = pm_symbol_node_create(parser, &opening, &content, &parser->previous);
13678
14211
 
@@ -13716,15 +14249,19 @@ parse_strings(pm_parser_t *parser, pm_node_t *current) {
13716
14249
  parser_lex(parser);
13717
14250
  } while (match1(parser, PM_TOKEN_STRING_CONTENT));
13718
14251
 
13719
- expect1(parser, PM_TOKEN_STRING_END, PM_ERR_STRING_LITERAL_TERM);
14252
+ expect1(parser, PM_TOKEN_STRING_END, PM_ERR_STRING_LITERAL_EOF);
13720
14253
  node = (pm_node_t *) pm_interpolated_string_node_create(parser, &opening, &parts, &parser->previous);
13721
14254
  } else if (accept1(parser, PM_TOKEN_LABEL_END) && !state_is_arg_labeled) {
13722
- node = (pm_node_t *) pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped);
14255
+ node = (pm_node_t *) pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped, parse_symbol_encoding(parser, &unescaped));
13723
14256
  } else if (match1(parser, PM_TOKEN_EOF)) {
13724
- pm_parser_err_token(parser, &opening, PM_ERR_STRING_LITERAL_TERM);
14257
+ pm_parser_err_token(parser, &opening, PM_ERR_STRING_LITERAL_EOF);
13725
14258
  node = (pm_node_t *) pm_string_node_create_unescaped(parser, &opening, &content, &parser->current, &unescaped);
14259
+ } else if (accept1(parser, PM_TOKEN_STRING_END)) {
14260
+ node = (pm_node_t *) pm_string_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped);
13726
14261
  } else {
13727
- expect1(parser, PM_TOKEN_STRING_END, PM_ERR_STRING_LITERAL_TERM);
14262
+ PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->previous, PM_ERR_STRING_LITERAL_TERM, pm_token_type_human(parser->previous.type));
14263
+ parser->previous.start = parser->previous.end;
14264
+ parser->previous.type = PM_TOKEN_MISSING;
13728
14265
  node = (pm_node_t *) pm_string_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped);
13729
14266
  }
13730
14267
  } else if (match1(parser, PM_TOKEN_STRING_CONTENT)) {
@@ -13739,9 +14276,9 @@ parse_strings(pm_parser_t *parser, pm_node_t *current) {
13739
14276
  if (match2(parser, PM_TOKEN_STRING_END, PM_TOKEN_EOF)) {
13740
14277
  node = (pm_node_t *) pm_string_node_create_unescaped(parser, &opening, &content, &parser->current, &unescaped);
13741
14278
  pm_node_flag_set(node, parse_unescaped_encoding(parser));
13742
- expect1(parser, PM_TOKEN_STRING_END, PM_ERR_STRING_LITERAL_TERM);
14279
+ expect1(parser, PM_TOKEN_STRING_END, PM_ERR_STRING_LITERAL_EOF);
13743
14280
  } else if (accept1(parser, PM_TOKEN_LABEL_END)) {
13744
- node = (pm_node_t *) pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped);
14281
+ node = (pm_node_t *) pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped, parse_symbol_encoding(parser, &unescaped));
13745
14282
  } else {
13746
14283
  // If we get here, then we have interpolation so we'll need
13747
14284
  // to create a string or symbol node with interpolation.
@@ -13830,11 +14367,34 @@ parse_strings(pm_parser_t *parser, pm_node_t *current) {
13830
14367
  return current;
13831
14368
  }
13832
14369
 
14370
+ /**
14371
+ * Append an error to the error list on the parser using the given diagnostic
14372
+ * ID. This function is a specialization that handles formatting the specific
14373
+ * kind of error that is being appended.
14374
+ */
14375
+ static void
14376
+ pm_parser_err_prefix(pm_parser_t *parser, pm_diagnostic_id_t diag_id) {
14377
+ switch (diag_id) {
14378
+ case PM_ERR_HASH_KEY: {
14379
+ PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->previous, diag_id, pm_token_type_human(parser->previous.type));
14380
+ break;
14381
+ }
14382
+ case PM_ERR_UNARY_RECEIVER: {
14383
+ const char *human = (parser->current.type == PM_TOKEN_EOF ? "end-of-input" : pm_token_type_human(parser->current.type));
14384
+ PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->previous, diag_id, human, parser->previous.start[0]);
14385
+ break;
14386
+ }
14387
+ default:
14388
+ pm_parser_err_previous(parser, diag_id);
14389
+ break;
14390
+ }
14391
+ }
14392
+
13833
14393
  /**
13834
14394
  * Parse an expression that begins with the previous node that we just lexed.
13835
14395
  */
13836
14396
  static inline pm_node_t *
13837
- parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, bool accepts_command_call) {
14397
+ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, bool accepts_command_call, pm_diagnostic_id_t diag_id) {
13838
14398
  switch (parser->current.type) {
13839
14399
  case PM_TOKEN_BRACKET_LEFT_ARRAY: {
13840
14400
  parser_lex(parser);
@@ -13866,9 +14426,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
13866
14426
  pm_node_t *expression = NULL;
13867
14427
 
13868
14428
  if (match3(parser, PM_TOKEN_BRACKET_RIGHT, PM_TOKEN_COMMA, PM_TOKEN_EOF)) {
13869
- if (pm_parser_local_depth(parser, &parser->previous) == -1) {
13870
- pm_parser_err_token(parser, &operator, PM_ERR_ARGUMENT_NO_FORWARDING_STAR);
13871
- }
14429
+ pm_parser_scope_forwarding_positionals_check(parser, &operator);
13872
14430
  } else {
13873
14431
  expression = parse_value_expression(parser, PM_BINDING_POWER_DEFINED, false, PM_ERR_ARRAY_EXPRESSION_AFTER_STAR);
13874
14432
  }
@@ -14016,7 +14574,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
14016
14574
  // If we didn't find a terminator and we didn't find a right
14017
14575
  // parenthesis, then this is a syntax error.
14018
14576
  if (!terminator_found) {
14019
- pm_parser_err(parser, parser->current.start, parser->current.start, PM_ERR_EXPECT_EOL_AFTER_STATEMENT);
14577
+ PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_EXPECT_EOL_AFTER_STATEMENT, pm_token_type_human(parser->current.type));
14020
14578
  }
14021
14579
 
14022
14580
  // Parse each statement within the parentheses.
@@ -14045,7 +14603,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
14045
14603
  } else if (match1(parser, PM_TOKEN_PARENTHESIS_RIGHT)) {
14046
14604
  break;
14047
14605
  } else {
14048
- pm_parser_err(parser, parser->current.start, parser->current.start, PM_ERR_EXPECT_EOL_AFTER_STATEMENT);
14606
+ PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_EXPECT_EOL_AFTER_STATEMENT, pm_token_type_human(parser->current.type));
14049
14607
  }
14050
14608
  }
14051
14609
 
@@ -14113,7 +14671,8 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
14113
14671
  if (
14114
14672
  match1(parser, PM_TOKEN_PARENTHESIS_LEFT) ||
14115
14673
  (accepts_command_call && (token_begins_expression_p(parser->current.type) || match3(parser, PM_TOKEN_UAMPERSAND, PM_TOKEN_USTAR, PM_TOKEN_USTAR_STAR))) ||
14116
- (pm_accepts_block_stack_p(parser) && match2(parser, PM_TOKEN_KEYWORD_DO, PM_TOKEN_BRACE_LEFT))
14674
+ (pm_accepts_block_stack_p(parser) && match1(parser, PM_TOKEN_KEYWORD_DO)) ||
14675
+ match1(parser, PM_TOKEN_BRACE_LEFT)
14117
14676
  ) {
14118
14677
  pm_arguments_t arguments = { 0 };
14119
14678
  parse_arguments_list(parser, &arguments, true, accepts_command_call);
@@ -14237,7 +14796,8 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
14237
14796
  // a block, so we need to check for that here.
14238
14797
  if (
14239
14798
  (accepts_command_call && (token_begins_expression_p(parser->current.type) || match3(parser, PM_TOKEN_UAMPERSAND, PM_TOKEN_USTAR, PM_TOKEN_USTAR_STAR))) ||
14240
- (pm_accepts_block_stack_p(parser) && match2(parser, PM_TOKEN_KEYWORD_DO, PM_TOKEN_BRACE_LEFT))
14799
+ (pm_accepts_block_stack_p(parser) && match1(parser, PM_TOKEN_KEYWORD_DO)) ||
14800
+ match1(parser, PM_TOKEN_BRACE_LEFT)
14241
14801
  ) {
14242
14802
  pm_arguments_t arguments = { 0 };
14243
14803
  parse_arguments_list(parser, &arguments, true, accepts_command_call);
@@ -14250,6 +14810,31 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
14250
14810
 
14251
14811
  if ((binding_power == PM_BINDING_POWER_STATEMENT) && match1(parser, PM_TOKEN_COMMA)) {
14252
14812
  node = parse_targets_validate(parser, node, PM_BINDING_POWER_INDEX);
14813
+ } else {
14814
+ // Check if `it` is not going to be assigned.
14815
+ switch (parser->current.type) {
14816
+ case PM_TOKEN_AMPERSAND_AMPERSAND_EQUAL:
14817
+ case PM_TOKEN_AMPERSAND_EQUAL:
14818
+ case PM_TOKEN_CARET_EQUAL:
14819
+ case PM_TOKEN_EQUAL:
14820
+ case PM_TOKEN_GREATER_GREATER_EQUAL:
14821
+ case PM_TOKEN_LESS_LESS_EQUAL:
14822
+ case PM_TOKEN_MINUS_EQUAL:
14823
+ case PM_TOKEN_PARENTHESIS_RIGHT:
14824
+ case PM_TOKEN_PERCENT_EQUAL:
14825
+ case PM_TOKEN_PIPE_EQUAL:
14826
+ case PM_TOKEN_PIPE_PIPE_EQUAL:
14827
+ case PM_TOKEN_PLUS_EQUAL:
14828
+ case PM_TOKEN_SLASH_EQUAL:
14829
+ case PM_TOKEN_STAR_EQUAL:
14830
+ case PM_TOKEN_STAR_STAR_EQUAL:
14831
+ break;
14832
+ default:
14833
+ // Once we know it's neither a method call nor an
14834
+ // assignment, we can finally create `it` default
14835
+ // parameter.
14836
+ node = pm_node_check_it(parser, node);
14837
+ }
14253
14838
  }
14254
14839
 
14255
14840
  return node;
@@ -14286,6 +14871,9 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
14286
14871
  // If we get here, then we tried to find something in the
14287
14872
  // heredoc but couldn't actually parse anything, so we'll just
14288
14873
  // return a missing node.
14874
+ //
14875
+ // parse_string_part handles its own errors, so there is no need
14876
+ // for us to add one here.
14289
14877
  node = (pm_node_t *) pm_missing_node_create(parser, parser->previous.start, parser->previous.end);
14290
14878
  } else if (PM_NODE_TYPE_P(part, PM_STRING_NODE) && match2(parser, PM_TOKEN_HEREDOC_END, PM_TOKEN_EOF)) {
14291
14879
  // If we get here, then the part that we parsed was plain string
@@ -14549,11 +15137,11 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
14549
15137
  // for guard clauses in the form of `if` or `unless` statements.
14550
15138
  if (accept1(parser, PM_TOKEN_KEYWORD_IF_MODIFIER)) {
14551
15139
  pm_token_t keyword = parser->previous;
14552
- pm_node_t *predicate = parse_value_expression(parser, PM_BINDING_POWER_DEFINED, true, PM_ERR_CONDITIONAL_IF_PREDICATE);
15140
+ pm_node_t *predicate = parse_value_expression(parser, PM_BINDING_POWER_COMPOSITION, true, PM_ERR_CONDITIONAL_IF_PREDICATE);
14553
15141
  pattern = (pm_node_t *) pm_if_node_modifier_create(parser, pattern, &keyword, predicate);
14554
15142
  } else if (accept1(parser, PM_TOKEN_KEYWORD_UNLESS_MODIFIER)) {
14555
15143
  pm_token_t keyword = parser->previous;
14556
- pm_node_t *predicate = parse_value_expression(parser, PM_BINDING_POWER_DEFINED, true, PM_ERR_CONDITIONAL_UNLESS_PREDICATE);
15144
+ pm_node_t *predicate = parse_value_expression(parser, PM_BINDING_POWER_COMPOSITION, true, PM_ERR_CONDITIONAL_UNLESS_PREDICATE);
14557
15145
  pattern = (pm_node_t *) pm_unless_node_modifier_create(parser, pattern, &keyword, predicate);
14558
15146
  }
14559
15147
 
@@ -14742,8 +15330,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
14742
15330
  pm_token_t operator = parser->previous;
14743
15331
  pm_node_t *expression = parse_value_expression(parser, PM_BINDING_POWER_NOT, true, PM_ERR_EXPECT_EXPRESSION_AFTER_LESS_LESS);
14744
15332
 
14745
- pm_constant_id_t old_param_name = parser->current_param_name;
14746
- parser->current_param_name = 0;
15333
+ pm_constant_id_t saved_param_name = pm_parser_current_param_name_unset(parser);
14747
15334
  pm_parser_scope_push(parser, true);
14748
15335
  accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON);
14749
15336
 
@@ -14756,15 +15343,16 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
14756
15343
 
14757
15344
  if (match2(parser, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ENSURE)) {
14758
15345
  assert(statements == NULL || PM_NODE_TYPE_P(statements, PM_STATEMENTS_NODE));
14759
- statements = (pm_node_t *) parse_rescues_as_begin(parser, (pm_statements_node_t *) statements, false);
15346
+ statements = (pm_node_t *) parse_rescues_as_begin(parser, class_keyword.start, (pm_statements_node_t *) statements, false);
14760
15347
  }
14761
15348
 
14762
15349
  expect1(parser, PM_TOKEN_KEYWORD_END, PM_ERR_CLASS_TERM);
14763
-
14764
15350
  pm_constant_id_list_t locals = parser->current_scope->locals;
15351
+
14765
15352
  pm_parser_scope_pop(parser);
14766
- parser->current_param_name = old_param_name;
14767
15353
  pm_do_loop_stack_pop(parser);
15354
+ pm_parser_current_param_name_restore(parser, saved_param_name);
15355
+
14768
15356
  return (pm_node_t *) pm_singleton_class_node_create(parser, &locals, &class_keyword, &operator, expression, statements, &parser->previous);
14769
15357
  }
14770
15358
 
@@ -14790,9 +15378,9 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
14790
15378
  superclass = NULL;
14791
15379
  }
14792
15380
 
14793
- pm_constant_id_t old_param_name = parser->current_param_name;
14794
- parser->current_param_name = 0;
15381
+ pm_constant_id_t saved_param_name = pm_parser_current_param_name_unset(parser);
14795
15382
  pm_parser_scope_push(parser, true);
15383
+
14796
15384
  if (inheritance_operator.type != PM_TOKEN_NOT_PROVIDED) {
14797
15385
  expect2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON, PM_ERR_CLASS_UNEXPECTED_END);
14798
15386
  } else {
@@ -14808,7 +15396,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
14808
15396
 
14809
15397
  if (match2(parser, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ENSURE)) {
14810
15398
  assert(statements == NULL || PM_NODE_TYPE_P(statements, PM_STATEMENTS_NODE));
14811
- statements = (pm_node_t *) parse_rescues_as_begin(parser, (pm_statements_node_t *) statements, false);
15399
+ statements = (pm_node_t *) parse_rescues_as_begin(parser, class_keyword.start, (pm_statements_node_t *) statements, false);
14812
15400
  }
14813
15401
 
14814
15402
  expect1(parser, PM_TOKEN_KEYWORD_END, PM_ERR_CLASS_TERM);
@@ -14818,9 +15406,10 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
14818
15406
  }
14819
15407
 
14820
15408
  pm_constant_id_list_t locals = parser->current_scope->locals;
15409
+
14821
15410
  pm_parser_scope_pop(parser);
14822
- parser->current_param_name = old_param_name;
14823
15411
  pm_do_loop_stack_pop(parser);
15412
+ pm_parser_current_param_name_restore(parser, saved_param_name);
14824
15413
 
14825
15414
  if (!PM_NODE_TYPE_P(constant_path, PM_CONSTANT_PATH_NODE) && !(PM_NODE_TYPE_P(constant_path, PM_CONSTANT_READ_NODE))) {
14826
15415
  pm_parser_err_node(parser, constant_path, PM_ERR_CLASS_NAME);
@@ -14835,18 +15424,21 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
14835
15424
  pm_token_t operator = not_provided(parser);
14836
15425
  pm_token_t name = (pm_token_t) { .type = PM_TOKEN_MISSING, .start = def_keyword.end, .end = def_keyword.end };
14837
15426
 
14838
- // This context is necessary for lexing `...` in a bare params correctly.
14839
- // It must be pushed before lexing the first param, so it is here.
15427
+ // This context is necessary for lexing `...` in a bare params
15428
+ // correctly. It must be pushed before lexing the first param, so it
15429
+ // is here.
14840
15430
  context_push(parser, PM_CONTEXT_DEF_PARAMS);
15431
+ pm_constant_id_t saved_param_name;
15432
+
14841
15433
  parser_lex(parser);
14842
- pm_constant_id_t old_param_name = parser->current_param_name;
14843
15434
 
14844
15435
  switch (parser->current.type) {
14845
15436
  case PM_CASE_OPERATOR:
15437
+ saved_param_name = pm_parser_current_param_name_unset(parser);
14846
15438
  pm_parser_scope_push(parser, true);
14847
- parser->current_param_name = 0;
14848
15439
  lex_state_set(parser, PM_LEX_STATE_ENDFN);
14849
15440
  parser_lex(parser);
15441
+
14850
15442
  name = parser->previous;
14851
15443
  break;
14852
15444
  case PM_TOKEN_IDENTIFIER: {
@@ -14854,18 +15446,20 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
14854
15446
 
14855
15447
  if (match2(parser, PM_TOKEN_DOT, PM_TOKEN_COLON_COLON)) {
14856
15448
  receiver = parse_variable_call(parser);
15449
+ receiver = pm_node_check_it(parser, receiver);
14857
15450
 
15451
+ saved_param_name = pm_parser_current_param_name_unset(parser);
14858
15452
  pm_parser_scope_push(parser, true);
14859
- parser->current_param_name = 0;
14860
15453
  lex_state_set(parser, PM_LEX_STATE_FNAME);
14861
15454
  parser_lex(parser);
14862
15455
 
14863
15456
  operator = parser->previous;
14864
15457
  name = parse_method_definition_name(parser);
14865
15458
  } else {
15459
+ saved_param_name = pm_parser_current_param_name_unset(parser);
14866
15460
  pm_refute_numbered_parameter(parser, parser->previous.start, parser->previous.end);
14867
15461
  pm_parser_scope_push(parser, true);
14868
- parser->current_param_name = 0;
15462
+
14869
15463
  name = parser->previous;
14870
15464
  }
14871
15465
 
@@ -14882,9 +15476,10 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
14882
15476
  case PM_TOKEN_KEYWORD___FILE__:
14883
15477
  case PM_TOKEN_KEYWORD___LINE__:
14884
15478
  case PM_TOKEN_KEYWORD___ENCODING__: {
15479
+ saved_param_name = pm_parser_current_param_name_unset(parser);
14885
15480
  pm_parser_scope_push(parser, true);
14886
- parser->current_param_name = 0;
14887
15481
  parser_lex(parser);
15482
+
14888
15483
  pm_token_t identifier = parser->previous;
14889
15484
 
14890
15485
  if (match2(parser, PM_TOKEN_DOT, PM_TOKEN_COLON_COLON)) {
@@ -14946,6 +15541,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
14946
15541
  pm_token_t lparen = parser->previous;
14947
15542
  pm_node_t *expression = parse_value_expression(parser, PM_BINDING_POWER_STATEMENT, true, PM_ERR_DEF_RECEIVER);
14948
15543
 
15544
+ accept1(parser, PM_TOKEN_NEWLINE);
14949
15545
  expect1(parser, PM_TOKEN_PARENTHESIS_RIGHT, PM_ERR_EXPECT_RPAREN);
14950
15546
  pm_token_t rparen = parser->previous;
14951
15547
 
@@ -14955,8 +15551,8 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
14955
15551
  operator = parser->previous;
14956
15552
  receiver = (pm_node_t *) pm_parentheses_node_create(parser, &lparen, expression, &rparen);
14957
15553
 
15554
+ saved_param_name = pm_parser_current_param_name_unset(parser);
14958
15555
  pm_parser_scope_push(parser, true);
14959
- parser->current_param_name = 0;
14960
15556
 
14961
15557
  // `PM_CONTEXT_DEF_PARAMS` is pushed again here for the same reason as described above.
14962
15558
  context_push(parser, PM_CONTEXT_DEF_PARAMS);
@@ -14964,8 +15560,9 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
14964
15560
  break;
14965
15561
  }
14966
15562
  default:
15563
+ saved_param_name = pm_parser_current_param_name_unset(parser);
14967
15564
  pm_parser_scope_push(parser, true);
14968
- parser->current_param_name = 0;
15565
+
14969
15566
  name = parse_method_definition_name(parser);
14970
15567
  break;
14971
15568
  }
@@ -15018,8 +15615,6 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
15018
15615
  }
15019
15616
  }
15020
15617
 
15021
- uint32_t locals_body_index = (uint32_t) parser->current_scope->locals.size;
15022
-
15023
15618
  context_pop(parser);
15024
15619
  pm_node_t *statements = NULL;
15025
15620
  pm_token_t equal;
@@ -15070,7 +15665,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
15070
15665
 
15071
15666
  if (match2(parser, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ENSURE)) {
15072
15667
  assert(statements == NULL || PM_NODE_TYPE_P(statements, PM_STATEMENTS_NODE));
15073
- statements = (pm_node_t *) parse_rescues_as_begin(parser, (pm_statements_node_t *) statements, true);
15668
+ statements = (pm_node_t *) parse_rescues_as_begin(parser, def_keyword.start, (pm_statements_node_t *) statements, true);
15074
15669
  }
15075
15670
 
15076
15671
  pm_accepts_block_stack_pop(parser);
@@ -15080,17 +15675,25 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
15080
15675
  }
15081
15676
 
15082
15677
  pm_constant_id_list_t locals = parser->current_scope->locals;
15083
- parser->current_param_name = old_param_name;
15678
+
15084
15679
  pm_parser_scope_pop(parser);
15680
+ pm_parser_current_param_name_restore(parser, saved_param_name);
15681
+
15682
+ /**
15683
+ * If the final character is @, as is the case when defining
15684
+ * methods to override the unary operators, we should ignore
15685
+ * the @ in the same way we do for symbols.
15686
+ */
15687
+ pm_constant_id_t name_id = pm_parser_constant_id_location(parser, name.start, parse_operator_symbol_name(&name));
15085
15688
 
15086
15689
  return (pm_node_t *) pm_def_node_create(
15087
15690
  parser,
15691
+ name_id,
15088
15692
  &name,
15089
15693
  receiver,
15090
15694
  params,
15091
15695
  statements,
15092
15696
  &locals,
15093
- locals_body_index,
15094
15697
  &def_keyword,
15095
15698
  &operator,
15096
15699
  &lparen,
@@ -15309,9 +15912,9 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
15309
15912
  pm_parser_err_token(parser, &name, PM_ERR_MODULE_NAME);
15310
15913
  }
15311
15914
 
15312
- pm_constant_id_t old_param_name = parser->current_param_name;
15313
- parser->current_param_name = 0;
15915
+ pm_constant_id_t saved_param_name = pm_parser_current_param_name_unset(parser);
15314
15916
  pm_parser_scope_push(parser, true);
15917
+
15315
15918
  accept2(parser, PM_TOKEN_SEMICOLON, PM_TOKEN_NEWLINE);
15316
15919
  pm_node_t *statements = NULL;
15317
15920
 
@@ -15323,12 +15926,12 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
15323
15926
 
15324
15927
  if (match2(parser, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ENSURE)) {
15325
15928
  assert(statements == NULL || PM_NODE_TYPE_P(statements, PM_STATEMENTS_NODE));
15326
- statements = (pm_node_t *) parse_rescues_as_begin(parser, (pm_statements_node_t *) statements, false);
15929
+ statements = (pm_node_t *) parse_rescues_as_begin(parser, module_keyword.start, (pm_statements_node_t *) statements, false);
15327
15930
  }
15328
15931
 
15329
15932
  pm_constant_id_list_t locals = parser->current_scope->locals;
15330
15933
  pm_parser_scope_pop(parser);
15331
- parser->current_param_name = old_param_name;
15934
+ pm_parser_current_param_name_restore(parser, saved_param_name);
15332
15935
 
15333
15936
  expect1(parser, PM_TOKEN_KEYWORD_END, PM_ERR_MODULE_TERM);
15334
15937
 
@@ -15914,6 +16517,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
15914
16517
  // context of a multiple assignment. We enforce that here. We'll
15915
16518
  // still lex past it though and create a missing node in its place.
15916
16519
  if (binding_power != PM_BINDING_POWER_STATEMENT) {
16520
+ pm_parser_err_prefix(parser, diag_id);
15917
16521
  return (pm_node_t *) pm_missing_node_create(parser, parser->previous.start, parser->previous.end);
15918
16522
  }
15919
16523
 
@@ -15936,7 +16540,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
15936
16540
  parser_lex(parser);
15937
16541
 
15938
16542
  pm_token_t operator = parser->previous;
15939
- pm_node_t *receiver = parse_expression(parser, pm_binding_powers[parser->previous.type].right, binding_power < PM_BINDING_POWER_MATCH, PM_ERR_UNARY_RECEIVER_BANG);
16543
+ pm_node_t *receiver = parse_expression(parser, pm_binding_powers[parser->previous.type].right, binding_power < PM_BINDING_POWER_MATCH, PM_ERR_UNARY_RECEIVER);
15940
16544
  pm_call_node_t *node = pm_call_node_unary_create(parser, &operator, receiver, "!");
15941
16545
 
15942
16546
  pm_conditional_predicate(receiver);
@@ -15946,7 +16550,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
15946
16550
  parser_lex(parser);
15947
16551
 
15948
16552
  pm_token_t operator = parser->previous;
15949
- pm_node_t *receiver = parse_expression(parser, pm_binding_powers[parser->previous.type].right, false, PM_ERR_UNARY_RECEIVER_TILDE);
16553
+ pm_node_t *receiver = parse_expression(parser, pm_binding_powers[parser->previous.type].right, false, PM_ERR_UNARY_RECEIVER);
15950
16554
  pm_call_node_t *node = pm_call_node_unary_create(parser, &operator, receiver, "~");
15951
16555
 
15952
16556
  return (pm_node_t *) node;
@@ -15955,7 +16559,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
15955
16559
  parser_lex(parser);
15956
16560
 
15957
16561
  pm_token_t operator = parser->previous;
15958
- pm_node_t *receiver = parse_expression(parser, pm_binding_powers[parser->previous.type].right, false, PM_ERR_UNARY_RECEIVER_MINUS);
16562
+ pm_node_t *receiver = parse_expression(parser, pm_binding_powers[parser->previous.type].right, false, PM_ERR_UNARY_RECEIVER);
15959
16563
  pm_call_node_t *node = pm_call_node_unary_create(parser, &operator, receiver, "-@");
15960
16564
 
15961
16565
  return (pm_node_t *) node;
@@ -15964,7 +16568,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
15964
16568
  parser_lex(parser);
15965
16569
 
15966
16570
  pm_token_t operator = parser->previous;
15967
- pm_node_t *node = parse_expression(parser, pm_binding_powers[parser->previous.type].right, false, PM_ERR_UNARY_RECEIVER_MINUS);
16571
+ pm_node_t *node = parse_expression(parser, pm_binding_powers[parser->previous.type].right, false, PM_ERR_UNARY_RECEIVER);
15968
16572
 
15969
16573
  if (accept1(parser, PM_TOKEN_STAR_STAR)) {
15970
16574
  pm_token_t exponent_operator = parser->previous;
@@ -15995,7 +16599,9 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
15995
16599
  parser_lex(parser);
15996
16600
 
15997
16601
  pm_token_t operator = parser->previous;
16602
+ pm_constant_id_t saved_param_name = pm_parser_current_param_name_unset(parser);
15998
16603
  pm_parser_scope_push(parser, false);
16604
+
15999
16605
  pm_block_parameters_node_t *block_parameters;
16000
16606
 
16001
16607
  switch (parser->current.type) {
@@ -16030,12 +16636,6 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
16030
16636
  }
16031
16637
  }
16032
16638
 
16033
- uint32_t locals_body_index = 0;
16034
-
16035
- if (block_parameters) {
16036
- locals_body_index = (uint32_t) parser->current_scope->locals.size;
16037
- }
16038
-
16039
16639
  pm_token_t opening;
16040
16640
  pm_node_t *body = NULL;
16041
16641
  parser->lambda_enclosure_nesting = previous_lambda_enclosure_nesting;
@@ -16059,7 +16659,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
16059
16659
 
16060
16660
  if (match2(parser, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ENSURE)) {
16061
16661
  assert(body == NULL || PM_NODE_TYPE_P(body, PM_STATEMENTS_NODE));
16062
- body = (pm_node_t *) parse_rescues_as_begin(parser, (pm_statements_node_t *) body, false);
16662
+ body = (pm_node_t *) parse_rescues_as_begin(parser, opening.start, (pm_statements_node_t *) body, false);
16063
16663
  }
16064
16664
 
16065
16665
  expect1(parser, PM_TOKEN_KEYWORD_END, PM_ERR_LAMBDA_TERM_END);
@@ -16070,19 +16670,21 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
16070
16670
 
16071
16671
  if (parameters == NULL && (maximum > 0)) {
16072
16672
  parameters = (pm_node_t *) pm_numbered_parameters_node_create(parser, &(pm_location_t) { .start = operator.start, .end = parser->previous.end }, maximum);
16073
- locals_body_index = maximum;
16074
16673
  }
16075
16674
 
16076
16675
  pm_constant_id_list_t locals = parser->current_scope->locals;
16676
+
16077
16677
  pm_parser_scope_pop(parser);
16078
16678
  pm_accepts_block_stack_pop(parser);
16079
- return (pm_node_t *) pm_lambda_node_create(parser, &locals, locals_body_index, &operator, &opening, &parser->previous, parameters, body);
16679
+ pm_parser_current_param_name_restore(parser, saved_param_name);
16680
+
16681
+ return (pm_node_t *) pm_lambda_node_create(parser, &locals, &operator, &opening, &parser->previous, parameters, body);
16080
16682
  }
16081
16683
  case PM_TOKEN_UPLUS: {
16082
16684
  parser_lex(parser);
16083
16685
 
16084
16686
  pm_token_t operator = parser->previous;
16085
- pm_node_t *receiver = parse_expression(parser, pm_binding_powers[parser->previous.type].right, false, PM_ERR_UNARY_RECEIVER_PLUS);
16687
+ pm_node_t *receiver = parse_expression(parser, pm_binding_powers[parser->previous.type].right, false, PM_ERR_UNARY_RECEIVER);
16086
16688
  pm_call_node_t *node = pm_call_node_unary_create(parser, &operator, receiver, "+@");
16087
16689
 
16088
16690
  return (pm_node_t *) node;
@@ -16095,12 +16697,34 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
16095
16697
 
16096
16698
  return parse_symbol(parser, &lex_mode, PM_LEX_STATE_END);
16097
16699
  }
16098
- default:
16099
- if (context_recoverable(parser, &parser->current)) {
16700
+ default: {
16701
+ pm_context_t recoverable = context_recoverable(parser, &parser->current);
16702
+
16703
+ if (recoverable != PM_CONTEXT_NONE) {
16100
16704
  parser->recovering = true;
16705
+
16706
+ // If the given error is not the generic one, then we'll add it
16707
+ // here because it will provide more context in addition to the
16708
+ // recoverable error that we will also add.
16709
+ if (diag_id != PM_ERR_CANNOT_PARSE_EXPRESSION) {
16710
+ pm_parser_err_prefix(parser, diag_id);
16711
+ }
16712
+
16713
+ // If we get here, then we are assuming this token is closing a
16714
+ // parent context, so we'll indicate that to the user so that
16715
+ // they know how we behaved.
16716
+ PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_UNEXPECTED_TOKEN_CLOSE_CONTEXT, pm_token_type_human(parser->current.type), context_human(recoverable));
16717
+ } else if (diag_id == PM_ERR_CANNOT_PARSE_EXPRESSION) {
16718
+ // We're going to make a special case here, because "cannot
16719
+ // parse expression" is pretty generic, and we know here that we
16720
+ // have an unexpected token.
16721
+ PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_UNEXPECTED_TOKEN_IGNORE, pm_token_type_human(parser->current.type));
16722
+ } else {
16723
+ pm_parser_err_prefix(parser, diag_id);
16101
16724
  }
16102
16725
 
16103
16726
  return (pm_node_t *) pm_missing_node_create(parser, parser->previous.start, parser->previous.end);
16727
+ }
16104
16728
  }
16105
16729
  }
16106
16730
 
@@ -16145,7 +16769,18 @@ parse_assignment_values(pm_parser_t *parser, pm_binding_power_t previous_binding
16145
16769
  if (is_single_value && match1(parser, PM_TOKEN_KEYWORD_RESCUE_MODIFIER)) {
16146
16770
  pm_token_t rescue = parser->current;
16147
16771
  parser_lex(parser);
16148
- pm_node_t *right = parse_expression(parser, binding_power, false, PM_ERR_RESCUE_MODIFIER_VALUE);
16772
+
16773
+ bool accepts_command_call_inner = false;
16774
+
16775
+ // RHS can accept command call iff the value is a call with arguments but without parentheses.
16776
+ if (PM_NODE_TYPE_P(value, PM_CALL_NODE)) {
16777
+ pm_call_node_t *call_node = (pm_call_node_t *)value;
16778
+ if ((call_node->arguments != NULL) && (call_node->opening_loc.start == NULL)) {
16779
+ accepts_command_call_inner = true;
16780
+ }
16781
+ }
16782
+
16783
+ pm_node_t *right = parse_expression(parser, binding_power, accepts_command_call_inner, PM_ERR_RESCUE_MODIFIER_VALUE);
16149
16784
 
16150
16785
  return (pm_node_t *) pm_rescue_modifier_node_create(parser, value, &rescue, right);
16151
16786
  }
@@ -16330,7 +16965,7 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t
16330
16965
  switch (PM_NODE_TYPE(node)) {
16331
16966
  case PM_BACK_REFERENCE_READ_NODE:
16332
16967
  case PM_NUMBERED_REFERENCE_READ_NODE:
16333
- pm_parser_err_node(parser, node, PM_ERR_WRITE_TARGET_READONLY);
16968
+ PM_PARSER_ERR_NODE_FORMAT_CONTENT(parser, node, PM_ERR_WRITE_TARGET_READONLY);
16334
16969
  /* fallthrough */
16335
16970
  case PM_GLOBAL_VARIABLE_READ_NODE: {
16336
16971
  parser_lex(parser);
@@ -16412,7 +17047,7 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t
16412
17047
  }
16413
17048
 
16414
17049
  // If this node cannot be writable, then we have an error.
16415
- if (pm_call_node_writable_p(cast)) {
17050
+ if (pm_call_node_writable_p(parser, cast)) {
16416
17051
  parse_write_name(parser, &cast->name);
16417
17052
  } else {
16418
17053
  pm_parser_err_node(parser, node, PM_ERR_WRITE_TARGET_UNEXPECTED);
@@ -16441,7 +17076,7 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t
16441
17076
  switch (PM_NODE_TYPE(node)) {
16442
17077
  case PM_BACK_REFERENCE_READ_NODE:
16443
17078
  case PM_NUMBERED_REFERENCE_READ_NODE:
16444
- pm_parser_err_node(parser, node, PM_ERR_WRITE_TARGET_READONLY);
17079
+ PM_PARSER_ERR_NODE_FORMAT_CONTENT(parser, node, PM_ERR_WRITE_TARGET_READONLY);
16445
17080
  /* fallthrough */
16446
17081
  case PM_GLOBAL_VARIABLE_READ_NODE: {
16447
17082
  parser_lex(parser);
@@ -16523,7 +17158,7 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t
16523
17158
  }
16524
17159
 
16525
17160
  // If this node cannot be writable, then we have an error.
16526
- if (pm_call_node_writable_p(cast)) {
17161
+ if (pm_call_node_writable_p(parser, cast)) {
16527
17162
  parse_write_name(parser, &cast->name);
16528
17163
  } else {
16529
17164
  pm_parser_err_node(parser, node, PM_ERR_WRITE_TARGET_UNEXPECTED);
@@ -16562,7 +17197,7 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t
16562
17197
  switch (PM_NODE_TYPE(node)) {
16563
17198
  case PM_BACK_REFERENCE_READ_NODE:
16564
17199
  case PM_NUMBERED_REFERENCE_READ_NODE:
16565
- pm_parser_err_node(parser, node, PM_ERR_WRITE_TARGET_READONLY);
17200
+ PM_PARSER_ERR_NODE_FORMAT_CONTENT(parser, node, PM_ERR_WRITE_TARGET_READONLY);
16566
17201
  /* fallthrough */
16567
17202
  case PM_GLOBAL_VARIABLE_READ_NODE: {
16568
17203
  parser_lex(parser);
@@ -16644,7 +17279,7 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t
16644
17279
  }
16645
17280
 
16646
17281
  // If this node cannot be writable, then we have an error.
16647
- if (pm_call_node_writable_p(cast)) {
17282
+ if (pm_call_node_writable_p(parser, cast)) {
16648
17283
  parse_write_name(parser, &cast->name);
16649
17284
  } else {
16650
17285
  pm_parser_err_node(parser, node, PM_ERR_WRITE_TARGET_UNEXPECTED);
@@ -17063,15 +17698,12 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t
17063
17698
  */
17064
17699
  static pm_node_t *
17065
17700
  parse_expression(pm_parser_t *parser, pm_binding_power_t binding_power, bool accepts_command_call, pm_diagnostic_id_t diag_id) {
17066
- pm_token_t recovery = parser->previous;
17067
- pm_node_t *node = parse_expression_prefix(parser, binding_power, accepts_command_call);
17701
+ pm_node_t *node = parse_expression_prefix(parser, binding_power, accepts_command_call, diag_id);
17068
17702
 
17069
17703
  switch (PM_NODE_TYPE(node)) {
17070
17704
  case PM_MISSING_NODE:
17071
17705
  // If we found a syntax error, then the type of node returned by
17072
- // parse_expression_prefix is going to be a missing node. In that
17073
- // case we need to add the error message to the parser's error list.
17074
- pm_parser_err(parser, recovery.end, recovery.end, diag_id);
17706
+ // parse_expression_prefix is going to be a missing node.
17075
17707
  return node;
17076
17708
  case PM_PRE_EXECUTION_NODE:
17077
17709
  case PM_POST_EXECUTION_NODE:
@@ -17080,7 +17712,7 @@ parse_expression(pm_parser_t *parser, pm_binding_power_t binding_power, bool acc
17080
17712
  case PM_UNDEF_NODE:
17081
17713
  // These expressions are statements, and cannot be followed by
17082
17714
  // operators (except modifiers).
17083
- if (pm_binding_powers[parser->current.type].left > PM_BINDING_POWER_MODIFIER_RESCUE) {
17715
+ if (pm_binding_powers[parser->current.type].left > PM_BINDING_POWER_MODIFIER) {
17084
17716
  return node;
17085
17717
  }
17086
17718
  break;
@@ -17175,9 +17807,14 @@ parse_expression(pm_parser_t *parser, pm_binding_power_t binding_power, bool acc
17175
17807
 
17176
17808
  static pm_node_t *
17177
17809
  parse_program(pm_parser_t *parser) {
17178
- pm_parser_scope_push(parser, !parser->current_scope);
17179
- parser_lex(parser);
17810
+ // If the current scope is NULL, then we want to push a new top level scope.
17811
+ // The current scope could exist in the event that we are parsing an eval
17812
+ // and the user has passed in scopes that already exist.
17813
+ if (parser->current_scope == NULL) {
17814
+ pm_parser_scope_push(parser, true);
17815
+ }
17180
17816
 
17817
+ parser_lex(parser);
17181
17818
  pm_statements_node_t *statements = parse_statements(parser, PM_CONTEXT_MAIN);
17182
17819
  if (!statements) {
17183
17820
  statements = pm_statements_node_create(parser);
@@ -17224,6 +17861,7 @@ pm_parser_init(pm_parser_t *parser, const uint8_t *source, size_t size, const pm
17224
17861
  .current = { .type = PM_TOKEN_EOF, .start = source, .end = source },
17225
17862
  .next_start = NULL,
17226
17863
  .heredoc_end = NULL,
17864
+ .data_loc = { .start = NULL, .end = NULL },
17227
17865
  .comment_list = { 0 },
17228
17866
  .magic_comment_list = { 0 },
17229
17867
  .warning_list = { 0 },
@@ -17234,7 +17872,7 @@ pm_parser_init(pm_parser_t *parser, const uint8_t *source, size_t size, const pm
17234
17872
  .encoding_changed_callback = NULL,
17235
17873
  .encoding_comment_start = source,
17236
17874
  .lex_callback = NULL,
17237
- .filepath_string = { 0 },
17875
+ .filepath = { 0 },
17238
17876
  .constant_pool = { 0 },
17239
17877
  .newline_list = { 0 },
17240
17878
  .integer_base = 0,
@@ -17248,8 +17886,7 @@ pm_parser_init(pm_parser_t *parser, const uint8_t *source, size_t size, const pm
17248
17886
  .in_keyword_arg = false,
17249
17887
  .current_param_name = 0,
17250
17888
  .semantic_token_seen = false,
17251
- .frozen_string_literal = false,
17252
- .suppress_warnings = false
17889
+ .frozen_string_literal = false
17253
17890
  };
17254
17891
 
17255
17892
  // Initialize the constant pool. We're going to completely guess as to the
@@ -17278,7 +17915,7 @@ pm_parser_init(pm_parser_t *parser, const uint8_t *source, size_t size, const pm
17278
17915
  // If options were provided to this parse, establish them here.
17279
17916
  if (options != NULL) {
17280
17917
  // filepath option
17281
- parser->filepath_string = options->filepath;
17918
+ parser->filepath = options->filepath;
17282
17919
 
17283
17920
  // line option
17284
17921
  parser->start_line = options->line;
@@ -17295,10 +17932,8 @@ pm_parser_init(pm_parser_t *parser, const uint8_t *source, size_t size, const pm
17295
17932
  parser->frozen_string_literal = true;
17296
17933
  }
17297
17934
 
17298
- // suppress_warnings option
17299
- if (options->suppress_warnings) {
17300
- parser->suppress_warnings = true;
17301
- }
17935
+ // version option
17936
+ parser->version = options->version;
17302
17937
 
17303
17938
  // scopes option
17304
17939
  for (size_t scope_index = 0; scope_index < options->scopes_count; scope_index++) {
@@ -17382,7 +18017,7 @@ pm_magic_comment_list_free(pm_list_t *list) {
17382
18017
  */
17383
18018
  PRISM_EXPORTED_FUNCTION void
17384
18019
  pm_parser_free(pm_parser_t *parser) {
17385
- pm_string_free(&parser->filepath_string);
18020
+ pm_string_free(&parser->filepath);
17386
18021
  pm_diagnostic_list_free(&parser->error_list);
17387
18022
  pm_diagnostic_list_free(&parser->warning_list);
17388
18023
  pm_comment_list_free(&parser->comment_list);
@@ -17484,3 +18119,303 @@ pm_serialize_parse_comments(pm_buffer_t *buffer, const uint8_t *source, size_t s
17484
18119
  #undef PM_LOCATION_NODE_VALUE
17485
18120
  #undef PM_LOCATION_NULL_VALUE
17486
18121
  #undef PM_LOCATION_TOKEN_VALUE
18122
+
18123
+ /** An error that is going to be formatted into the output. */
18124
+ typedef struct {
18125
+ /** A pointer to the diagnostic that was generated during parsing. */
18126
+ pm_diagnostic_t *error;
18127
+
18128
+ /** The start line of the diagnostic message. */
18129
+ int32_t line;
18130
+
18131
+ /** The column start of the diagnostic message. */
18132
+ uint32_t column_start;
18133
+
18134
+ /** The column end of the diagnostic message. */
18135
+ uint32_t column_end;
18136
+ } pm_error_t;
18137
+
18138
+ /** The format that will be used to format the errors into the output. */
18139
+ typedef struct {
18140
+ /** The prefix that will be used for line numbers. */
18141
+ const char *number_prefix;
18142
+
18143
+ /** The prefix that will be used for blank lines. */
18144
+ const char *blank_prefix;
18145
+
18146
+ /** The divider that will be used between sections of source code. */
18147
+ const char *divider;
18148
+
18149
+ /** The length of the blank prefix. */
18150
+ size_t blank_prefix_length;
18151
+
18152
+ /** The length of the divider. */
18153
+ size_t divider_length;
18154
+ } pm_error_format_t;
18155
+
18156
+ #define PM_COLOR_GRAY "\033[38;5;102m"
18157
+ #define PM_COLOR_RED "\033[1;31m"
18158
+ #define PM_COLOR_RESET "\033[0m"
18159
+
18160
+ static inline pm_error_t *
18161
+ pm_parser_errors_format_sort(const pm_parser_t *parser, const pm_list_t *error_list, const pm_newline_list_t *newline_list) {
18162
+ pm_error_t *errors = calloc(error_list->size, sizeof(pm_error_t));
18163
+ int32_t start_line = parser->start_line;
18164
+
18165
+ for (pm_diagnostic_t *error = (pm_diagnostic_t *) error_list->head; error != NULL; error = (pm_diagnostic_t *) error->node.next) {
18166
+ pm_line_column_t start = pm_newline_list_line_column(newline_list, error->location.start, start_line);
18167
+ pm_line_column_t end = pm_newline_list_line_column(newline_list, error->location.end, start_line);
18168
+
18169
+ // We're going to insert this error into the array in sorted order. We
18170
+ // do this by finding the first error that has a line number greater
18171
+ // than the current error and then inserting the current error before
18172
+ // that one.
18173
+ size_t index = 0;
18174
+ while (
18175
+ (index < error_list->size) &&
18176
+ (errors[index].error != NULL) &&
18177
+ (
18178
+ (errors[index].line < start.line) ||
18179
+ ((errors[index].line == start.line) && (errors[index].column_start < start.column))
18180
+ )
18181
+ ) index++;
18182
+
18183
+ // Now we're going to shift all of the errors after this one down one
18184
+ // index to make room for the new error.
18185
+ if (index + 1 < error_list->size) {
18186
+ memmove(&errors[index + 1], &errors[index], sizeof(pm_error_t) * (error_list->size - index - 1));
18187
+ }
18188
+
18189
+ // Finally, we'll insert the error into the array.
18190
+ uint32_t column_end;
18191
+ if (start.line == end.line) {
18192
+ column_end = end.column;
18193
+ } else {
18194
+ column_end = (uint32_t) (newline_list->offsets[start.line - start_line + 1] - newline_list->offsets[start.line - start_line] - 1);
18195
+ }
18196
+
18197
+ // Ensure we have at least one column of error.
18198
+ if (start.column == column_end) column_end++;
18199
+
18200
+ errors[index] = (pm_error_t) {
18201
+ .error = error,
18202
+ .line = start.line,
18203
+ .column_start = start.column,
18204
+ .column_end = column_end
18205
+ };
18206
+ }
18207
+
18208
+ return errors;
18209
+ }
18210
+
18211
+ static inline void
18212
+ pm_parser_errors_format_line(const pm_parser_t *parser, const pm_newline_list_t *newline_list, const char *number_prefix, int32_t line, pm_buffer_t *buffer) {
18213
+ size_t index = (size_t) (line - parser->start_line);
18214
+
18215
+ const uint8_t *start = &parser->start[newline_list->offsets[index]];
18216
+ const uint8_t *end;
18217
+
18218
+ if (index >= newline_list->size - 1) {
18219
+ end = parser->end;
18220
+ } else {
18221
+ end = &parser->start[newline_list->offsets[index + 1]];
18222
+ }
18223
+
18224
+ pm_buffer_append_format(buffer, number_prefix, line);
18225
+ pm_buffer_append_string(buffer, (const char *) start, (size_t) (end - start));
18226
+
18227
+ if (end == parser->end && end[-1] != '\n') {
18228
+ pm_buffer_append_string(buffer, "\n", 1);
18229
+ }
18230
+ }
18231
+
18232
+ /**
18233
+ * Format the errors on the parser into the given buffer.
18234
+ */
18235
+ PRISM_EXPORTED_FUNCTION void
18236
+ pm_parser_errors_format(const pm_parser_t *parser, pm_buffer_t *buffer, bool colorize) {
18237
+ const pm_list_t *error_list = &parser->error_list;
18238
+ assert(error_list->size != 0);
18239
+
18240
+ // First, we're going to sort all of the errors by line number using an
18241
+ // insertion sort into a newly allocated array.
18242
+ const int32_t start_line = parser->start_line;
18243
+ const pm_newline_list_t *newline_list = &parser->newline_list;
18244
+ pm_error_t *errors = pm_parser_errors_format_sort(parser, error_list, newline_list);
18245
+
18246
+ // Now we're going to determine how we're going to format line numbers and
18247
+ // blank lines based on the maximum number of digits in the line numbers
18248
+ // that are going to be displayed.
18249
+ pm_error_format_t error_format;
18250
+ int32_t max_line_number = errors[error_list->size - 1].line - start_line;
18251
+
18252
+ if (max_line_number < 10) {
18253
+ if (colorize) {
18254
+ error_format = (pm_error_format_t) {
18255
+ .number_prefix = PM_COLOR_GRAY "%1" PRIi32 " | " PM_COLOR_RESET,
18256
+ .blank_prefix = PM_COLOR_GRAY " | " PM_COLOR_RESET,
18257
+ .divider = PM_COLOR_GRAY " ~~~~~" PM_COLOR_RESET "\n"
18258
+ };
18259
+ } else {
18260
+ error_format = (pm_error_format_t) {
18261
+ .number_prefix = "%1" PRIi32 " | ",
18262
+ .blank_prefix = " | ",
18263
+ .divider = " ~~~~~\n"
18264
+ };
18265
+ }
18266
+ } else if (max_line_number < 100) {
18267
+ if (colorize) {
18268
+ error_format = (pm_error_format_t) {
18269
+ .number_prefix = PM_COLOR_GRAY "%2" PRIi32 " | " PM_COLOR_RESET,
18270
+ .blank_prefix = PM_COLOR_GRAY " | " PM_COLOR_RESET,
18271
+ .divider = PM_COLOR_GRAY " ~~~~~~" PM_COLOR_RESET "\n"
18272
+ };
18273
+ } else {
18274
+ error_format = (pm_error_format_t) {
18275
+ .number_prefix = "%2" PRIi32 " | ",
18276
+ .blank_prefix = " | ",
18277
+ .divider = " ~~~~~~\n"
18278
+ };
18279
+ }
18280
+ } else if (max_line_number < 1000) {
18281
+ if (colorize) {
18282
+ error_format = (pm_error_format_t) {
18283
+ .number_prefix = PM_COLOR_GRAY "%3" PRIi32 " | " PM_COLOR_RESET,
18284
+ .blank_prefix = PM_COLOR_GRAY " | " PM_COLOR_RESET,
18285
+ .divider = PM_COLOR_GRAY " ~~~~~~~" PM_COLOR_RESET "\n"
18286
+ };
18287
+ } else {
18288
+ error_format = (pm_error_format_t) {
18289
+ .number_prefix = "%3" PRIi32 " | ",
18290
+ .blank_prefix = " | ",
18291
+ .divider = " ~~~~~~~\n"
18292
+ };
18293
+ }
18294
+ } else if (max_line_number < 10000) {
18295
+ if (colorize) {
18296
+ error_format = (pm_error_format_t) {
18297
+ .number_prefix = PM_COLOR_GRAY "%4" PRIi32 " | " PM_COLOR_RESET,
18298
+ .blank_prefix = PM_COLOR_GRAY " | " PM_COLOR_RESET,
18299
+ .divider = PM_COLOR_GRAY " ~~~~~~~~" PM_COLOR_RESET "\n"
18300
+ };
18301
+ } else {
18302
+ error_format = (pm_error_format_t) {
18303
+ .number_prefix = "%4" PRIi32 " | ",
18304
+ .blank_prefix = " | ",
18305
+ .divider = " ~~~~~~~~\n"
18306
+ };
18307
+ }
18308
+ } else {
18309
+ if (colorize) {
18310
+ error_format = (pm_error_format_t) {
18311
+ .number_prefix = PM_COLOR_GRAY "%5" PRIi32 " | " PM_COLOR_RESET,
18312
+ .blank_prefix = PM_COLOR_GRAY " | " PM_COLOR_RESET,
18313
+ .divider = PM_COLOR_GRAY " ~~~~~~~~" PM_COLOR_RESET "\n"
18314
+ };
18315
+ } else {
18316
+ error_format = (pm_error_format_t) {
18317
+ .number_prefix = "%5" PRIi32 " | ",
18318
+ .blank_prefix = " | ",
18319
+ .divider = " ~~~~~~~~\n"
18320
+ };
18321
+ }
18322
+ }
18323
+
18324
+ error_format.blank_prefix_length = strlen(error_format.blank_prefix);
18325
+ error_format.divider_length = strlen(error_format.divider);
18326
+
18327
+ // Now we're going to iterate through every error in our error list and
18328
+ // display it. While we're iterating, we will display some padding lines of
18329
+ // the source before the error to give some context. We'll be careful not to
18330
+ // display the same line twice in case the errors are close enough in the
18331
+ // source.
18332
+ int32_t last_line = 0;
18333
+ const pm_encoding_t *encoding = parser->encoding;
18334
+
18335
+ for (size_t index = 0; index < error_list->size; index++) {
18336
+ pm_error_t *error = &errors[index];
18337
+
18338
+ // Here we determine how many lines of padding of the source to display,
18339
+ // based on the difference from the last line that was displayed.
18340
+ if (error->line - last_line > 1) {
18341
+ if (error->line - last_line > 2) {
18342
+ if ((index != 0) && (error->line - last_line > 3)) {
18343
+ pm_buffer_append_string(buffer, error_format.divider, error_format.divider_length);
18344
+ }
18345
+
18346
+ pm_buffer_append_string(buffer, " ", 2);
18347
+ pm_parser_errors_format_line(parser, newline_list, error_format.number_prefix, error->line - 2, buffer);
18348
+ }
18349
+
18350
+ pm_buffer_append_string(buffer, " ", 2);
18351
+ pm_parser_errors_format_line(parser, newline_list, error_format.number_prefix, error->line - 1, buffer);
18352
+ }
18353
+
18354
+ // If this is the first error or we're on a new line, then we'll display
18355
+ // the line that has the error in it.
18356
+ if ((index == 0) || (error->line != last_line)) {
18357
+ if (colorize) {
18358
+ pm_buffer_append_string(buffer, PM_COLOR_RED "> " PM_COLOR_RESET, 13);
18359
+ } else {
18360
+ pm_buffer_append_string(buffer, "> ", 2);
18361
+ }
18362
+ pm_parser_errors_format_line(parser, newline_list, error_format.number_prefix, error->line, buffer);
18363
+ }
18364
+
18365
+ // Now we'll display the actual error message. We'll do this by first
18366
+ // putting the prefix to the line, then a bunch of blank spaces
18367
+ // depending on the column, then as many carets as we need to display
18368
+ // the width of the error, then the error message itself.
18369
+ //
18370
+ // Note that this doesn't take into account the width of the actual
18371
+ // character when displayed in the terminal. For some East Asian
18372
+ // languages or emoji, this means it can be thrown off pretty badly. We
18373
+ // will need to solve this eventually.
18374
+ pm_buffer_append_string(buffer, " ", 2);
18375
+ pm_buffer_append_string(buffer, error_format.blank_prefix, error_format.blank_prefix_length);
18376
+
18377
+ size_t column = 0;
18378
+ const uint8_t *start = &parser->start[newline_list->offsets[error->line - start_line]];
18379
+
18380
+ while (column < error->column_end) {
18381
+ if (column < error->column_start) {
18382
+ pm_buffer_append_byte(buffer, ' ');
18383
+ } else if (colorize) {
18384
+ pm_buffer_append_string(buffer, PM_COLOR_RED "^" PM_COLOR_RESET, 12);
18385
+ } else {
18386
+ pm_buffer_append_byte(buffer, '^');
18387
+ }
18388
+
18389
+ size_t char_width = encoding->char_width(start + column, parser->end - (start + column));
18390
+ column += (char_width == 0 ? 1 : char_width);
18391
+ }
18392
+
18393
+ pm_buffer_append_byte(buffer, ' ');
18394
+
18395
+ const char *message = error->error->message;
18396
+ pm_buffer_append_string(buffer, message, strlen(message));
18397
+ pm_buffer_append_byte(buffer, '\n');
18398
+
18399
+ // Here we determine how many lines of padding to display after the
18400
+ // error, depending on where the next error is in source.
18401
+ last_line = error->line;
18402
+ int32_t next_line = (index == error_list->size - 1) ? ((int32_t) newline_list->size) : errors[index + 1].line;
18403
+
18404
+ if (next_line - last_line > 1) {
18405
+ pm_buffer_append_string(buffer, " ", 2);
18406
+ pm_parser_errors_format_line(parser, newline_list, error_format.number_prefix, ++last_line, buffer);
18407
+ }
18408
+
18409
+ if (next_line - last_line > 1) {
18410
+ pm_buffer_append_string(buffer, " ", 2);
18411
+ pm_parser_errors_format_line(parser, newline_list, error_format.number_prefix, ++last_line, buffer);
18412
+ }
18413
+ }
18414
+
18415
+ // Finally, we'll free the array of errors that we allocated.
18416
+ free(errors);
18417
+ }
18418
+
18419
+ #undef PM_COLOR_GRAY
18420
+ #undef PM_COLOR_RED
18421
+ #undef PM_COLOR_RESET