prism 0.22.0 → 0.24.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (45) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +39 -1
  3. data/README.md +2 -1
  4. data/docs/releasing.md +67 -17
  5. data/docs/ruby_parser_translation.md +19 -0
  6. data/docs/serialization.md +2 -0
  7. data/ext/prism/api_node.c +1982 -1538
  8. data/ext/prism/extension.c +12 -7
  9. data/ext/prism/extension.h +2 -2
  10. data/include/prism/diagnostic.h +3 -4
  11. data/include/prism/encoding.h +7 -0
  12. data/include/prism/util/pm_constant_pool.h +1 -1
  13. data/include/prism/util/pm_newline_list.h +4 -3
  14. data/include/prism/util/pm_strpbrk.h +4 -1
  15. data/include/prism/version.h +2 -2
  16. data/lib/prism/desugar_compiler.rb +225 -80
  17. data/lib/prism/dsl.rb +302 -299
  18. data/lib/prism/ffi.rb +103 -77
  19. data/lib/prism/lex_compat.rb +1 -0
  20. data/lib/prism/node.rb +3624 -2114
  21. data/lib/prism/node_ext.rb +25 -2
  22. data/lib/prism/parse_result.rb +56 -19
  23. data/lib/prism/serialize.rb +605 -303
  24. data/lib/prism/translation/parser/compiler.rb +1 -1
  25. data/lib/prism/translation/parser/rubocop.rb +11 -3
  26. data/lib/prism/translation/parser.rb +25 -12
  27. data/lib/prism/translation/parser33.rb +12 -0
  28. data/lib/prism/translation/parser34.rb +12 -0
  29. data/lib/prism/translation/ripper.rb +696 -0
  30. data/lib/prism/translation/ruby_parser.rb +1521 -0
  31. data/lib/prism/translation.rb +3 -3
  32. data/lib/prism.rb +0 -1
  33. data/prism.gemspec +6 -2
  34. data/src/diagnostic.c +10 -11
  35. data/src/encoding.c +16 -17
  36. data/src/options.c +7 -2
  37. data/src/prettyprint.c +3 -3
  38. data/src/prism.c +172 -97
  39. data/src/serialize.c +24 -13
  40. data/src/token_type.c +3 -3
  41. data/src/util/pm_constant_pool.c +1 -1
  42. data/src/util/pm_newline_list.c +6 -3
  43. data/src/util/pm_strpbrk.c +122 -14
  44. metadata +8 -4
  45. data/lib/prism/ripper_compat.rb +0 -285
data/src/prism.c CHANGED
@@ -51,6 +51,7 @@ debug_context(pm_context_t context) {
51
51
  case PM_CONTEXT_IF: return "IF";
52
52
  case PM_CONTEXT_MAIN: return "MAIN";
53
53
  case PM_CONTEXT_MODULE: return "MODULE";
54
+ case PM_CONTEXT_NONE: return "NONE";
54
55
  case PM_CONTEXT_PARENS: return "PARENS";
55
56
  case PM_CONTEXT_POSTEXE: return "POSTEXE";
56
57
  case PM_CONTEXT_PREDICATE: return "PREDICATE";
@@ -492,7 +493,8 @@ pm_parser_err(pm_parser_t *parser, const uint8_t *start, const uint8_t *end, pm_
492
493
  /**
493
494
  * Append an error to the list of errors on the parser using a format string.
494
495
  */
495
- #define PM_PARSER_ERR_FORMAT(parser, start, end, diag_id, ...) pm_diagnostic_list_append_format(&parser->error_list, start, end, diag_id, __VA_ARGS__)
496
+ #define PM_PARSER_ERR_FORMAT(parser, start, end, diag_id, ...) \
497
+ pm_diagnostic_list_append_format(&parser->error_list, start, end, diag_id, __VA_ARGS__)
496
498
 
497
499
  /**
498
500
  * Append an error to the list of errors on the parser using the location of the
@@ -507,7 +509,8 @@ pm_parser_err_current(pm_parser_t *parser, pm_diagnostic_id_t diag_id) {
507
509
  * Append an error to the list of errors on the parser using the given location
508
510
  * using a format string.
509
511
  */
510
- #define PM_PARSER_ERR_LOCATION_FORMAT(parser, location, diag_id, ...) pm_diagnostic_list_append_format(&parser->error_list, (location)->start, (location)->end, diag_id, __VA_ARGS__)
512
+ #define PM_PARSER_ERR_LOCATION_FORMAT(parser, location, diag_id, ...) \
513
+ PM_PARSER_ERR_FORMAT(parser, (location)->start, (location)->end, diag_id, __VA_ARGS__)
511
514
 
512
515
  /**
513
516
  * Append an error to the list of errors on the parser using the location of the
@@ -522,7 +525,15 @@ pm_parser_err_node(pm_parser_t *parser, const pm_node_t *node, pm_diagnostic_id_
522
525
  * Append an error to the list of errors on the parser using the location of the
523
526
  * given node and a format string.
524
527
  */
525
- #define PM_PARSER_ERR_NODE_FORMAT(parser, node, diag_id, ...) pm_diagnostic_list_append_format(&parser->error_list, node->location.start, node->location.end, diag_id, __VA_ARGS__)
528
+ #define PM_PARSER_ERR_NODE_FORMAT(parser, node, diag_id, ...) \
529
+ PM_PARSER_ERR_FORMAT(parser, (node)->location.start, (node)->location.end, diag_id, __VA_ARGS__)
530
+
531
+ /**
532
+ * Append an error to the list of errors on the parser using the location of the
533
+ * given node and a format string, and add on the content of the node.
534
+ */
535
+ #define PM_PARSER_ERR_NODE_FORMAT_CONTENT(parser, node, diag_id) \
536
+ PM_PARSER_ERR_NODE_FORMAT(parser, node, diag_id, (int) ((node)->location.end - (node)->location.start), (const char *) (node)->location.start)
526
537
 
527
538
  /**
528
539
  * Append an error to the list of errors on the parser using the location of the
@@ -546,7 +557,15 @@ pm_parser_err_token(pm_parser_t *parser, const pm_token_t *token, pm_diagnostic_
546
557
  * Append an error to the list of errors on the parser using the location of the
547
558
  * given token and a format string.
548
559
  */
549
- #define PM_PARSER_ERR_TOKEN_FORMAT(parser, token, diag_id, ...) pm_diagnostic_list_append_format(&parser->error_list, (token).start, (token).end, diag_id, __VA_ARGS__)
560
+ #define PM_PARSER_ERR_TOKEN_FORMAT(parser, token, diag_id, ...) \
561
+ PM_PARSER_ERR_FORMAT(parser, (token).start, (token).end, diag_id, __VA_ARGS__)
562
+
563
+ /**
564
+ * Append an error to the list of errors on the parser using the location of the
565
+ * given token and a format string, and add on the content of the token.
566
+ */
567
+ #define PM_PARSER_ERR_TOKEN_FORMAT_CONTENT(parser, token, diag_id) \
568
+ PM_PARSER_ERR_TOKEN_FORMAT(parser, token, diag_id, (int) ((token).end - (token).start), (const char *) (token).start)
550
569
 
551
570
  /**
552
571
  * Append a warning to the list of warnings on the parser.
@@ -2890,7 +2909,8 @@ pm_def_node_receiver_check(pm_parser_t *parser, const pm_node_t *node) {
2890
2909
  static pm_def_node_t *
2891
2910
  pm_def_node_create(
2892
2911
  pm_parser_t *parser,
2893
- const pm_token_t *name,
2912
+ pm_constant_id_t name,
2913
+ const pm_token_t *name_loc,
2894
2914
  pm_node_t *receiver,
2895
2915
  pm_parameters_node_t *parameters,
2896
2916
  pm_node_t *body,
@@ -2920,8 +2940,8 @@ pm_def_node_create(
2920
2940
  .type = PM_DEF_NODE,
2921
2941
  .location = { .start = def_keyword->start, .end = end },
2922
2942
  },
2923
- .name = pm_parser_constant_id_token(parser, name),
2924
- .name_loc = PM_LOCATION_TOKEN_VALUE(name),
2943
+ .name = name,
2944
+ .name_loc = PM_LOCATION_TOKEN_VALUE(name_loc),
2925
2945
  .receiver = receiver,
2926
2946
  .parameters = parameters,
2927
2947
  .body = body,
@@ -4642,13 +4662,20 @@ pm_multi_target_node_create(pm_parser_t *parser) {
4642
4662
  */
4643
4663
  static void
4644
4664
  pm_multi_target_node_targets_append(pm_parser_t *parser, pm_multi_target_node_t *node, pm_node_t *target) {
4645
- if (PM_NODE_TYPE_P(target, PM_SPLAT_NODE) || PM_NODE_TYPE_P(target, PM_IMPLICIT_REST_NODE)) {
4665
+ if (PM_NODE_TYPE_P(target, PM_SPLAT_NODE)) {
4646
4666
  if (node->rest == NULL) {
4647
4667
  node->rest = target;
4648
4668
  } else {
4649
4669
  pm_parser_err_node(parser, target, PM_ERR_MULTI_ASSIGN_MULTI_SPLATS);
4650
4670
  pm_node_list_append(&node->rights, target);
4651
4671
  }
4672
+ } else if (PM_NODE_TYPE_P(target, PM_IMPLICIT_REST_NODE)) {
4673
+ if (node->rest == NULL) {
4674
+ node->rest = target;
4675
+ } else {
4676
+ PM_PARSER_ERR_TOKEN_FORMAT_CONTENT(parser, parser->current, PM_ERR_MULTI_ASSIGN_UNEXPECTED_REST);
4677
+ pm_node_list_append(&node->rights, target);
4678
+ }
4652
4679
  } else if (node->rest == NULL) {
4653
4680
  pm_node_list_append(&node->lefts, target);
4654
4681
  } else {
@@ -7172,7 +7199,7 @@ lex_numeric(pm_parser_t *parser) {
7172
7199
  static pm_token_type_t
7173
7200
  lex_global_variable(pm_parser_t *parser) {
7174
7201
  if (parser->current.end >= parser->end) {
7175
- pm_parser_err_current(parser, PM_ERR_INVALID_VARIABLE_GLOBAL);
7202
+ PM_PARSER_ERR_TOKEN_FORMAT_CONTENT(parser, parser->current, PM_ERR_INVALID_VARIABLE_GLOBAL);
7176
7203
  return PM_TOKEN_GLOBAL_VARIABLE;
7177
7204
  }
7178
7205
 
@@ -7213,7 +7240,7 @@ lex_global_variable(pm_parser_t *parser) {
7213
7240
  } while (parser->current.end < parser->end && (width = char_is_identifier(parser, parser->current.end)) > 0);
7214
7241
 
7215
7242
  // $0 isn't allowed to be followed by anything.
7216
- pm_parser_err_current(parser, PM_ERR_INVALID_VARIABLE_GLOBAL);
7243
+ PM_PARSER_ERR_TOKEN_FORMAT_CONTENT(parser, parser->current, PM_ERR_INVALID_VARIABLE_GLOBAL);
7217
7244
  }
7218
7245
 
7219
7246
  return PM_TOKEN_GLOBAL_VARIABLE;
@@ -7244,7 +7271,7 @@ lex_global_variable(pm_parser_t *parser) {
7244
7271
  } else {
7245
7272
  // If we get here, then we have a $ followed by something that isn't
7246
7273
  // recognized as a global variable.
7247
- pm_parser_err_current(parser, PM_ERR_INVALID_VARIABLE_GLOBAL);
7274
+ PM_PARSER_ERR_TOKEN_FORMAT_CONTENT(parser, parser->current, PM_ERR_INVALID_VARIABLE_GLOBAL);
7248
7275
  }
7249
7276
 
7250
7277
  return PM_TOKEN_GLOBAL_VARIABLE;
@@ -8148,10 +8175,10 @@ lex_at_variable(pm_parser_t *parser) {
8148
8175
  while (parser->current.end < parser->end && (width = char_is_identifier(parser, parser->current.end)) > 0) {
8149
8176
  parser->current.end += width;
8150
8177
  }
8151
- } else if (type == PM_TOKEN_CLASS_VARIABLE) {
8152
- pm_parser_err_current(parser, PM_ERR_INCOMPLETE_VARIABLE_CLASS);
8153
8178
  } else {
8154
- pm_parser_err_current(parser, PM_ERR_INCOMPLETE_VARIABLE_INSTANCE);
8179
+ pm_diagnostic_id_t diag_id = (type == PM_TOKEN_CLASS_VARIABLE) ? PM_ERR_INCOMPLETE_VARIABLE_CLASS : PM_ERR_INCOMPLETE_VARIABLE_INSTANCE;
8180
+ size_t width = parser->encoding->char_width(parser->current.end, parser->end - parser->current.end);
8181
+ PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, diag_id, (int) ((parser->current.end + width) - parser->current.start), (const char *) parser->current.start);
8155
8182
  }
8156
8183
 
8157
8184
  // If we're lexing an embedded variable, then we need to pop back into the
@@ -9711,7 +9738,7 @@ parser_lex(pm_parser_t *parser) {
9711
9738
  // and then find the first one.
9712
9739
  pm_lex_mode_t *lex_mode = parser->lex_modes.current;
9713
9740
  const uint8_t *breakpoints = lex_mode->as.list.breakpoints;
9714
- const uint8_t *breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
9741
+ const uint8_t *breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
9715
9742
 
9716
9743
  // If we haven't found an escape yet, then this buffer will be
9717
9744
  // unallocated since we can refer directly to the source string.
@@ -9720,7 +9747,7 @@ parser_lex(pm_parser_t *parser) {
9720
9747
  while (breakpoint != NULL) {
9721
9748
  // If we hit a null byte, skip directly past it.
9722
9749
  if (*breakpoint == '\0') {
9723
- breakpoint = pm_strpbrk(parser, breakpoint + 1, breakpoints, parser->end - (breakpoint + 1));
9750
+ breakpoint = pm_strpbrk(parser, breakpoint + 1, breakpoints, parser->end - (breakpoint + 1), true);
9724
9751
  continue;
9725
9752
  }
9726
9753
 
@@ -9739,7 +9766,7 @@ parser_lex(pm_parser_t *parser) {
9739
9766
  // we need to continue on past it.
9740
9767
  if (lex_mode->as.list.nesting > 0) {
9741
9768
  parser->current.end = breakpoint + 1;
9742
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
9769
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
9743
9770
  lex_mode->as.list.nesting--;
9744
9771
  continue;
9745
9772
  }
@@ -9824,7 +9851,7 @@ parser_lex(pm_parser_t *parser) {
9824
9851
  }
9825
9852
 
9826
9853
  token_buffer.cursor = parser->current.end;
9827
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
9854
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
9828
9855
  continue;
9829
9856
  }
9830
9857
 
@@ -9837,7 +9864,7 @@ parser_lex(pm_parser_t *parser) {
9837
9864
  // that looked like an interpolated class or instance variable
9838
9865
  // like "#@" but wasn't actually. In this case we'll just skip
9839
9866
  // to the next breakpoint.
9840
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
9867
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
9841
9868
  continue;
9842
9869
  }
9843
9870
 
@@ -9852,7 +9879,7 @@ parser_lex(pm_parser_t *parser) {
9852
9879
  // and find the next breakpoint.
9853
9880
  assert(*breakpoint == lex_mode->as.list.incrementor);
9854
9881
  parser->current.end = breakpoint + 1;
9855
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
9882
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
9856
9883
  lex_mode->as.list.nesting++;
9857
9884
  continue;
9858
9885
  }
@@ -9891,14 +9918,14 @@ parser_lex(pm_parser_t *parser) {
9891
9918
  // regular expression. We'll use strpbrk to find the first of these
9892
9919
  // characters.
9893
9920
  const uint8_t *breakpoints = lex_mode->as.regexp.breakpoints;
9894
- const uint8_t *breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
9921
+ const uint8_t *breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false);
9895
9922
  pm_token_buffer_t token_buffer = { { 0 }, 0 };
9896
9923
 
9897
9924
  while (breakpoint != NULL) {
9898
9925
  // If we hit a null byte, skip directly past it.
9899
9926
  if (*breakpoint == '\0') {
9900
9927
  parser->current.end = breakpoint + 1;
9901
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
9928
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false);
9902
9929
  continue;
9903
9930
  }
9904
9931
 
@@ -9920,7 +9947,7 @@ parser_lex(pm_parser_t *parser) {
9920
9947
  // If the terminator is not a newline, then we can set
9921
9948
  // the next breakpoint and continue.
9922
9949
  parser->current.end = breakpoint + 1;
9923
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
9950
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false);
9924
9951
  continue;
9925
9952
  }
9926
9953
  }
@@ -9930,7 +9957,7 @@ parser_lex(pm_parser_t *parser) {
9930
9957
  if (*breakpoint == lex_mode->as.regexp.terminator) {
9931
9958
  if (lex_mode->as.regexp.nesting > 0) {
9932
9959
  parser->current.end = breakpoint + 1;
9933
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
9960
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false);
9934
9961
  lex_mode->as.regexp.nesting--;
9935
9962
  continue;
9936
9963
  }
@@ -10029,7 +10056,7 @@ parser_lex(pm_parser_t *parser) {
10029
10056
  }
10030
10057
 
10031
10058
  token_buffer.cursor = parser->current.end;
10032
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
10059
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false);
10033
10060
  continue;
10034
10061
  }
10035
10062
 
@@ -10042,7 +10069,7 @@ parser_lex(pm_parser_t *parser) {
10042
10069
  // something that looked like an interpolated class or
10043
10070
  // instance variable like "#@" but wasn't actually. In
10044
10071
  // this case we'll just skip to the next breakpoint.
10045
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
10072
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false);
10046
10073
  continue;
10047
10074
  }
10048
10075
 
@@ -10057,7 +10084,7 @@ parser_lex(pm_parser_t *parser) {
10057
10084
  // and find the next breakpoint.
10058
10085
  assert(*breakpoint == lex_mode->as.regexp.incrementor);
10059
10086
  parser->current.end = breakpoint + 1;
10060
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
10087
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false);
10061
10088
  lex_mode->as.regexp.nesting++;
10062
10089
  continue;
10063
10090
  }
@@ -10093,7 +10120,7 @@ parser_lex(pm_parser_t *parser) {
10093
10120
  // string. We'll use strpbrk to find the first of these characters.
10094
10121
  pm_lex_mode_t *lex_mode = parser->lex_modes.current;
10095
10122
  const uint8_t *breakpoints = lex_mode->as.string.breakpoints;
10096
- const uint8_t *breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
10123
+ const uint8_t *breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
10097
10124
 
10098
10125
  // If we haven't found an escape yet, then this buffer will be
10099
10126
  // unallocated since we can refer directly to the source string.
@@ -10105,7 +10132,7 @@ parser_lex(pm_parser_t *parser) {
10105
10132
  if (lex_mode->as.string.incrementor != '\0' && *breakpoint == lex_mode->as.string.incrementor) {
10106
10133
  lex_mode->as.string.nesting++;
10107
10134
  parser->current.end = breakpoint + 1;
10108
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
10135
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
10109
10136
  continue;
10110
10137
  }
10111
10138
 
@@ -10117,7 +10144,7 @@ parser_lex(pm_parser_t *parser) {
10117
10144
  // to continue on past it.
10118
10145
  if (lex_mode->as.string.nesting > 0) {
10119
10146
  parser->current.end = breakpoint + 1;
10120
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
10147
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
10121
10148
  lex_mode->as.string.nesting--;
10122
10149
  continue;
10123
10150
  }
@@ -10159,7 +10186,7 @@ parser_lex(pm_parser_t *parser) {
10159
10186
  if (parser->heredoc_end == NULL) {
10160
10187
  pm_newline_list_append(&parser->newline_list, breakpoint);
10161
10188
  parser->current.end = breakpoint + 1;
10162
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
10189
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
10163
10190
  continue;
10164
10191
  } else {
10165
10192
  parser->current.end = breakpoint + 1;
@@ -10173,7 +10200,7 @@ parser_lex(pm_parser_t *parser) {
10173
10200
  case '\0':
10174
10201
  // Skip directly past the null character.
10175
10202
  parser->current.end = breakpoint + 1;
10176
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
10203
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
10177
10204
  break;
10178
10205
  case '\\': {
10179
10206
  // Here we hit escapes.
@@ -10242,7 +10269,7 @@ parser_lex(pm_parser_t *parser) {
10242
10269
  }
10243
10270
 
10244
10271
  token_buffer.cursor = parser->current.end;
10245
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
10272
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
10246
10273
  break;
10247
10274
  }
10248
10275
  case '#': {
@@ -10253,7 +10280,7 @@ parser_lex(pm_parser_t *parser) {
10253
10280
  // looked like an interpolated class or instance variable like "#@"
10254
10281
  // but wasn't actually. In this case we'll just skip to the next
10255
10282
  // breakpoint.
10256
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
10283
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
10257
10284
  break;
10258
10285
  }
10259
10286
 
@@ -10381,7 +10408,7 @@ parser_lex(pm_parser_t *parser) {
10381
10408
  breakpoints[2] = '\0';
10382
10409
  }
10383
10410
 
10384
- const uint8_t *breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
10411
+ const uint8_t *breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
10385
10412
  pm_token_buffer_t token_buffer = { { 0 }, 0 };
10386
10413
  bool was_escaped_newline = false;
10387
10414
 
@@ -10390,7 +10417,7 @@ parser_lex(pm_parser_t *parser) {
10390
10417
  case '\0':
10391
10418
  // Skip directly past the null character.
10392
10419
  parser->current.end = breakpoint + 1;
10393
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
10420
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
10394
10421
  break;
10395
10422
  case '\n': {
10396
10423
  if (parser->heredoc_end != NULL && (parser->heredoc_end > breakpoint)) {
@@ -10465,7 +10492,7 @@ parser_lex(pm_parser_t *parser) {
10465
10492
  // Otherwise we hit a newline and it wasn't followed by
10466
10493
  // a terminator, so we can continue parsing.
10467
10494
  parser->current.end = breakpoint + 1;
10468
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
10495
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
10469
10496
  break;
10470
10497
  }
10471
10498
  case '\\': {
@@ -10529,7 +10556,7 @@ parser_lex(pm_parser_t *parser) {
10529
10556
  }
10530
10557
 
10531
10558
  token_buffer.cursor = parser->current.end;
10532
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
10559
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
10533
10560
  break;
10534
10561
  }
10535
10562
  case '#': {
@@ -10541,7 +10568,7 @@ parser_lex(pm_parser_t *parser) {
10541
10568
  // or instance variable like "#@" but wasn't
10542
10569
  // actually. In this case we'll just skip to the
10543
10570
  // next breakpoint.
10544
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
10571
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
10545
10572
  break;
10546
10573
  }
10547
10574
 
@@ -11054,7 +11081,7 @@ parse_target(pm_parser_t *parser, pm_node_t *target) {
11054
11081
  return target;
11055
11082
  case PM_BACK_REFERENCE_READ_NODE:
11056
11083
  case PM_NUMBERED_REFERENCE_READ_NODE:
11057
- pm_parser_err_node(parser, target, PM_ERR_WRITE_TARGET_READONLY);
11084
+ PM_PARSER_ERR_NODE_FORMAT_CONTENT(parser, target, PM_ERR_WRITE_TARGET_READONLY);
11058
11085
  return target;
11059
11086
  case PM_GLOBAL_VARIABLE_READ_NODE:
11060
11087
  assert(sizeof(pm_global_variable_target_node_t) == sizeof(pm_global_variable_read_node_t));
@@ -11192,7 +11219,7 @@ parse_write(pm_parser_t *parser, pm_node_t *target, pm_token_t *operator, pm_nod
11192
11219
  }
11193
11220
  case PM_BACK_REFERENCE_READ_NODE:
11194
11221
  case PM_NUMBERED_REFERENCE_READ_NODE:
11195
- pm_parser_err_node(parser, target, PM_ERR_WRITE_TARGET_READONLY);
11222
+ PM_PARSER_ERR_NODE_FORMAT_CONTENT(parser, target, PM_ERR_WRITE_TARGET_READONLY);
11196
11223
  /* fallthrough */
11197
11224
  case PM_GLOBAL_VARIABLE_READ_NODE: {
11198
11225
  pm_global_variable_write_node_t *node = pm_global_variable_write_node_create(parser, target, operator, value);
@@ -11367,7 +11394,7 @@ parse_targets(pm_parser_t *parser, pm_node_t *first_target, pm_binding_power_t b
11367
11394
  pm_multi_target_node_targets_append(parser, result, target);
11368
11395
  } else if (!match1(parser, PM_TOKEN_EOF)) {
11369
11396
  // If we get here, then we have a trailing , in a multi target node.
11370
- // We'll set the implicit rest flag to indicate this.
11397
+ // We'll add an implicit rest node to represent this.
11371
11398
  pm_node_t *rest = (pm_node_t *) pm_implicit_rest_node_create(parser, &parser->previous);
11372
11399
  pm_multi_target_node_targets_append(parser, result, rest);
11373
11400
  break;
@@ -11457,8 +11484,13 @@ parse_statements(pm_parser_t *parser, pm_context_t context) {
11457
11484
 
11458
11485
  while (accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON));
11459
11486
  if (context_terminator(context, &parser->current)) break;
11460
- } else {
11461
- expect1(parser, PM_TOKEN_NEWLINE, PM_ERR_EXPECT_EOL_AFTER_STATEMENT);
11487
+ } else if (!accept1(parser, PM_TOKEN_NEWLINE)) {
11488
+ // This is an inlined version of expect1 because the error that we
11489
+ // want to add has varargs. If this happens again, we should
11490
+ // probably extract a helper function.
11491
+ PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_EXPECT_EOL_AFTER_STATEMENT, pm_token_type_human(parser->current.type));
11492
+ parser->previous.start = parser->previous.end;
11493
+ parser->previous.type = PM_TOKEN_MISSING;
11462
11494
  }
11463
11495
  }
11464
11496
 
@@ -13852,7 +13884,7 @@ parse_pattern_primitive(pm_parser_t *parser, pm_diagnostic_id_t diag_id) {
13852
13884
  pm_constant_id_t name_id = pm_parser_constant_id_constant(parser, "0it", 3);
13853
13885
  variable = (pm_node_t *) pm_local_variable_read_node_create_constant_id(parser, &parser->previous, name_id, 0);
13854
13886
  } else {
13855
- PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->previous, PM_ERR_NO_LOCAL_VARIABLE, (int) (parser->previous.end - parser->previous.start), parser->previous.start);
13887
+ PM_PARSER_ERR_TOKEN_FORMAT_CONTENT(parser, parser->previous, PM_ERR_NO_LOCAL_VARIABLE);
13856
13888
  variable = (pm_node_t *) pm_local_variable_read_node_create(parser, &parser->previous, 0);
13857
13889
  }
13858
13890
  }
@@ -14161,7 +14193,7 @@ parse_strings(pm_parser_t *parser, pm_node_t *current) {
14161
14193
  parser_lex(parser);
14162
14194
 
14163
14195
  if (match2(parser, PM_TOKEN_STRING_END, PM_TOKEN_EOF)) {
14164
- expect1(parser, PM_TOKEN_STRING_END, PM_ERR_STRING_LITERAL_TERM);
14196
+ expect1(parser, PM_TOKEN_STRING_END, PM_ERR_STRING_LITERAL_EOF);
14165
14197
  // If we get here, then we have an end immediately after a
14166
14198
  // start. In that case we'll create an empty content token and
14167
14199
  // return an uninterpolated string.
@@ -14174,7 +14206,6 @@ parse_strings(pm_parser_t *parser, pm_node_t *current) {
14174
14206
  // If we get here, then we have an end of a label immediately
14175
14207
  // after a start. In that case we'll create an empty symbol
14176
14208
  // node.
14177
- pm_token_t opening = not_provided(parser);
14178
14209
  pm_token_t content = parse_strings_empty_content(parser->previous.start);
14179
14210
  pm_symbol_node_t *symbol = pm_symbol_node_create(parser, &opening, &content, &parser->previous);
14180
14211
 
@@ -14218,15 +14249,19 @@ parse_strings(pm_parser_t *parser, pm_node_t *current) {
14218
14249
  parser_lex(parser);
14219
14250
  } while (match1(parser, PM_TOKEN_STRING_CONTENT));
14220
14251
 
14221
- expect1(parser, PM_TOKEN_STRING_END, PM_ERR_STRING_LITERAL_TERM);
14252
+ expect1(parser, PM_TOKEN_STRING_END, PM_ERR_STRING_LITERAL_EOF);
14222
14253
  node = (pm_node_t *) pm_interpolated_string_node_create(parser, &opening, &parts, &parser->previous);
14223
14254
  } else if (accept1(parser, PM_TOKEN_LABEL_END) && !state_is_arg_labeled) {
14224
14255
  node = (pm_node_t *) pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped, parse_symbol_encoding(parser, &unescaped));
14225
14256
  } else if (match1(parser, PM_TOKEN_EOF)) {
14226
- pm_parser_err_token(parser, &opening, PM_ERR_STRING_LITERAL_TERM);
14257
+ pm_parser_err_token(parser, &opening, PM_ERR_STRING_LITERAL_EOF);
14227
14258
  node = (pm_node_t *) pm_string_node_create_unescaped(parser, &opening, &content, &parser->current, &unescaped);
14259
+ } else if (accept1(parser, PM_TOKEN_STRING_END)) {
14260
+ node = (pm_node_t *) pm_string_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped);
14228
14261
  } else {
14229
- expect1(parser, PM_TOKEN_STRING_END, PM_ERR_STRING_LITERAL_TERM);
14262
+ PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->previous, PM_ERR_STRING_LITERAL_TERM, pm_token_type_human(parser->previous.type));
14263
+ parser->previous.start = parser->previous.end;
14264
+ parser->previous.type = PM_TOKEN_MISSING;
14230
14265
  node = (pm_node_t *) pm_string_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped);
14231
14266
  }
14232
14267
  } else if (match1(parser, PM_TOKEN_STRING_CONTENT)) {
@@ -14241,7 +14276,7 @@ parse_strings(pm_parser_t *parser, pm_node_t *current) {
14241
14276
  if (match2(parser, PM_TOKEN_STRING_END, PM_TOKEN_EOF)) {
14242
14277
  node = (pm_node_t *) pm_string_node_create_unescaped(parser, &opening, &content, &parser->current, &unescaped);
14243
14278
  pm_node_flag_set(node, parse_unescaped_encoding(parser));
14244
- expect1(parser, PM_TOKEN_STRING_END, PM_ERR_STRING_LITERAL_TERM);
14279
+ expect1(parser, PM_TOKEN_STRING_END, PM_ERR_STRING_LITERAL_EOF);
14245
14280
  } else if (accept1(parser, PM_TOKEN_LABEL_END)) {
14246
14281
  node = (pm_node_t *) pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped, parse_symbol_encoding(parser, &unescaped));
14247
14282
  } else {
@@ -14332,6 +14367,29 @@ parse_strings(pm_parser_t *parser, pm_node_t *current) {
14332
14367
  return current;
14333
14368
  }
14334
14369
 
14370
+ /**
14371
+ * Append an error to the error list on the parser using the given diagnostic
14372
+ * ID. This function is a specialization that handles formatting the specific
14373
+ * kind of error that is being appended.
14374
+ */
14375
+ static void
14376
+ pm_parser_err_prefix(pm_parser_t *parser, pm_diagnostic_id_t diag_id) {
14377
+ switch (diag_id) {
14378
+ case PM_ERR_HASH_KEY: {
14379
+ PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->previous, diag_id, pm_token_type_human(parser->previous.type));
14380
+ break;
14381
+ }
14382
+ case PM_ERR_UNARY_RECEIVER: {
14383
+ const char *human = (parser->current.type == PM_TOKEN_EOF ? "end-of-input" : pm_token_type_human(parser->current.type));
14384
+ PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->previous, diag_id, human, parser->previous.start[0]);
14385
+ break;
14386
+ }
14387
+ default:
14388
+ pm_parser_err_previous(parser, diag_id);
14389
+ break;
14390
+ }
14391
+ }
14392
+
14335
14393
  /**
14336
14394
  * Parse an expression that begins with the previous node that we just lexed.
14337
14395
  */
@@ -14516,7 +14574,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
14516
14574
  // If we didn't find a terminator and we didn't find a right
14517
14575
  // parenthesis, then this is a syntax error.
14518
14576
  if (!terminator_found) {
14519
- pm_parser_err(parser, parser->current.start, parser->current.start, PM_ERR_EXPECT_EOL_AFTER_STATEMENT);
14577
+ PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_EXPECT_EOL_AFTER_STATEMENT, pm_token_type_human(parser->current.type));
14520
14578
  }
14521
14579
 
14522
14580
  // Parse each statement within the parentheses.
@@ -14545,7 +14603,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
14545
14603
  } else if (match1(parser, PM_TOKEN_PARENTHESIS_RIGHT)) {
14546
14604
  break;
14547
14605
  } else {
14548
- pm_parser_err(parser, parser->current.start, parser->current.start, PM_ERR_EXPECT_EOL_AFTER_STATEMENT);
14606
+ PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_EXPECT_EOL_AFTER_STATEMENT, pm_token_type_human(parser->current.type));
14549
14607
  }
14550
14608
  }
14551
14609
 
@@ -15626,10 +15684,11 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
15626
15684
  * methods to override the unary operators, we should ignore
15627
15685
  * the @ in the same way we do for symbols.
15628
15686
  */
15629
- name.end = parse_operator_symbol_name(&name);
15687
+ pm_constant_id_t name_id = pm_parser_constant_id_location(parser, name.start, parse_operator_symbol_name(&name));
15630
15688
 
15631
15689
  return (pm_node_t *) pm_def_node_create(
15632
15690
  parser,
15691
+ name_id,
15633
15692
  &name,
15634
15693
  receiver,
15635
15694
  params,
@@ -16458,7 +16517,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
16458
16517
  // context of a multiple assignment. We enforce that here. We'll
16459
16518
  // still lex past it though and create a missing node place.
16460
16519
  if (binding_power != PM_BINDING_POWER_STATEMENT) {
16461
- pm_parser_err_previous(parser, diag_id);
16520
+ pm_parser_err_prefix(parser, diag_id);
16462
16521
  return (pm_node_t *) pm_missing_node_create(parser, parser->previous.start, parser->previous.end);
16463
16522
  }
16464
16523
 
@@ -16481,7 +16540,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
16481
16540
  parser_lex(parser);
16482
16541
 
16483
16542
  pm_token_t operator = parser->previous;
16484
- pm_node_t *receiver = parse_expression(parser, pm_binding_powers[parser->previous.type].right, binding_power < PM_BINDING_POWER_MATCH, PM_ERR_UNARY_RECEIVER_BANG);
16543
+ pm_node_t *receiver = parse_expression(parser, pm_binding_powers[parser->previous.type].right, binding_power < PM_BINDING_POWER_MATCH, PM_ERR_UNARY_RECEIVER);
16485
16544
  pm_call_node_t *node = pm_call_node_unary_create(parser, &operator, receiver, "!");
16486
16545
 
16487
16546
  pm_conditional_predicate(receiver);
@@ -16491,7 +16550,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
16491
16550
  parser_lex(parser);
16492
16551
 
16493
16552
  pm_token_t operator = parser->previous;
16494
- pm_node_t *receiver = parse_expression(parser, pm_binding_powers[parser->previous.type].right, false, PM_ERR_UNARY_RECEIVER_TILDE);
16553
+ pm_node_t *receiver = parse_expression(parser, pm_binding_powers[parser->previous.type].right, false, PM_ERR_UNARY_RECEIVER);
16495
16554
  pm_call_node_t *node = pm_call_node_unary_create(parser, &operator, receiver, "~");
16496
16555
 
16497
16556
  return (pm_node_t *) node;
@@ -16500,7 +16559,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
16500
16559
  parser_lex(parser);
16501
16560
 
16502
16561
  pm_token_t operator = parser->previous;
16503
- pm_node_t *receiver = parse_expression(parser, pm_binding_powers[parser->previous.type].right, false, PM_ERR_UNARY_RECEIVER_MINUS);
16562
+ pm_node_t *receiver = parse_expression(parser, pm_binding_powers[parser->previous.type].right, false, PM_ERR_UNARY_RECEIVER);
16504
16563
  pm_call_node_t *node = pm_call_node_unary_create(parser, &operator, receiver, "-@");
16505
16564
 
16506
16565
  return (pm_node_t *) node;
@@ -16509,7 +16568,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
16509
16568
  parser_lex(parser);
16510
16569
 
16511
16570
  pm_token_t operator = parser->previous;
16512
- pm_node_t *node = parse_expression(parser, pm_binding_powers[parser->previous.type].right, false, PM_ERR_UNARY_RECEIVER_MINUS);
16571
+ pm_node_t *node = parse_expression(parser, pm_binding_powers[parser->previous.type].right, false, PM_ERR_UNARY_RECEIVER);
16513
16572
 
16514
16573
  if (accept1(parser, PM_TOKEN_STAR_STAR)) {
16515
16574
  pm_token_t exponent_operator = parser->previous;
@@ -16625,7 +16684,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
16625
16684
  parser_lex(parser);
16626
16685
 
16627
16686
  pm_token_t operator = parser->previous;
16628
- pm_node_t *receiver = parse_expression(parser, pm_binding_powers[parser->previous.type].right, false, PM_ERR_UNARY_RECEIVER_PLUS);
16687
+ pm_node_t *receiver = parse_expression(parser, pm_binding_powers[parser->previous.type].right, false, PM_ERR_UNARY_RECEIVER);
16629
16688
  pm_call_node_t *node = pm_call_node_unary_create(parser, &operator, receiver, "+@");
16630
16689
 
16631
16690
  return (pm_node_t *) node;
@@ -16648,7 +16707,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
16648
16707
  // here because it will provide more context in addition to the
16649
16708
  // recoverable error that we will also add.
16650
16709
  if (diag_id != PM_ERR_CANNOT_PARSE_EXPRESSION) {
16651
- pm_parser_err_previous(parser, diag_id);
16710
+ pm_parser_err_prefix(parser, diag_id);
16652
16711
  }
16653
16712
 
16654
16713
  // If we get here, then we are assuming this token is closing a
@@ -16661,7 +16720,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
16661
16720
  // have an unexpected token.
16662
16721
  PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_UNEXPECTED_TOKEN_IGNORE, pm_token_type_human(parser->current.type));
16663
16722
  } else {
16664
- pm_parser_err_previous(parser, diag_id);
16723
+ pm_parser_err_prefix(parser, diag_id);
16665
16724
  }
16666
16725
 
16667
16726
  return (pm_node_t *) pm_missing_node_create(parser, parser->previous.start, parser->previous.end);
@@ -16710,7 +16769,18 @@ parse_assignment_values(pm_parser_t *parser, pm_binding_power_t previous_binding
16710
16769
  if (is_single_value && match1(parser, PM_TOKEN_KEYWORD_RESCUE_MODIFIER)) {
16711
16770
  pm_token_t rescue = parser->current;
16712
16771
  parser_lex(parser);
16713
- pm_node_t *right = parse_expression(parser, binding_power, false, PM_ERR_RESCUE_MODIFIER_VALUE);
16772
+
16773
+ bool accepts_command_call_inner = false;
16774
+
16775
+ // RHS can accept command call iff the value is a call with arguments but without parentheses.
16776
+ if (PM_NODE_TYPE_P(value, PM_CALL_NODE)) {
16777
+ pm_call_node_t *call_node = (pm_call_node_t *)value;
16778
+ if ((call_node->arguments != NULL) && (call_node->opening_loc.start == NULL)) {
16779
+ accepts_command_call_inner = true;
16780
+ }
16781
+ }
16782
+
16783
+ pm_node_t *right = parse_expression(parser, binding_power, accepts_command_call_inner, PM_ERR_RESCUE_MODIFIER_VALUE);
16714
16784
 
16715
16785
  return (pm_node_t *) pm_rescue_modifier_node_create(parser, value, &rescue, right);
16716
16786
  }
@@ -16895,7 +16965,7 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t
16895
16965
  switch (PM_NODE_TYPE(node)) {
16896
16966
  case PM_BACK_REFERENCE_READ_NODE:
16897
16967
  case PM_NUMBERED_REFERENCE_READ_NODE:
16898
- pm_parser_err_node(parser, node, PM_ERR_WRITE_TARGET_READONLY);
16968
+ PM_PARSER_ERR_NODE_FORMAT_CONTENT(parser, node, PM_ERR_WRITE_TARGET_READONLY);
16899
16969
  /* fallthrough */
16900
16970
  case PM_GLOBAL_VARIABLE_READ_NODE: {
16901
16971
  parser_lex(parser);
@@ -17006,7 +17076,7 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t
17006
17076
  switch (PM_NODE_TYPE(node)) {
17007
17077
  case PM_BACK_REFERENCE_READ_NODE:
17008
17078
  case PM_NUMBERED_REFERENCE_READ_NODE:
17009
- pm_parser_err_node(parser, node, PM_ERR_WRITE_TARGET_READONLY);
17079
+ PM_PARSER_ERR_NODE_FORMAT_CONTENT(parser, node, PM_ERR_WRITE_TARGET_READONLY);
17010
17080
  /* fallthrough */
17011
17081
  case PM_GLOBAL_VARIABLE_READ_NODE: {
17012
17082
  parser_lex(parser);
@@ -17127,7 +17197,7 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t
17127
17197
  switch (PM_NODE_TYPE(node)) {
17128
17198
  case PM_BACK_REFERENCE_READ_NODE:
17129
17199
  case PM_NUMBERED_REFERENCE_READ_NODE:
17130
- pm_parser_err_node(parser, node, PM_ERR_WRITE_TARGET_READONLY);
17200
+ PM_PARSER_ERR_NODE_FORMAT_CONTENT(parser, node, PM_ERR_WRITE_TARGET_READONLY);
17131
17201
  /* fallthrough */
17132
17202
  case PM_GLOBAL_VARIABLE_READ_NODE: {
17133
17203
  parser_lex(parser);
@@ -17791,6 +17861,7 @@ pm_parser_init(pm_parser_t *parser, const uint8_t *source, size_t size, const pm
17791
17861
  .current = { .type = PM_TOKEN_EOF, .start = source, .end = source },
17792
17862
  .next_start = NULL,
17793
17863
  .heredoc_end = NULL,
17864
+ .data_loc = { .start = NULL, .end = NULL },
17794
17865
  .comment_list = { 0 },
17795
17866
  .magic_comment_list = { 0 },
17796
17867
  .warning_list = { 0 },
@@ -18055,7 +18126,7 @@ typedef struct {
18055
18126
  pm_diagnostic_t *error;
18056
18127
 
18057
18128
  /** The start line of the diagnostic message. */
18058
- uint32_t line;
18129
+ int32_t line;
18059
18130
 
18060
18131
  /** The column start of the diagnostic message. */
18061
18132
  uint32_t column_start;
@@ -18087,12 +18158,13 @@ typedef struct {
18087
18158
  #define PM_COLOR_RESET "\033[0m"
18088
18159
 
18089
18160
  static inline pm_error_t *
18090
- pm_parser_errors_format_sort(const pm_list_t *error_list, const pm_newline_list_t *newline_list) {
18161
+ pm_parser_errors_format_sort(const pm_parser_t *parser, const pm_list_t *error_list, const pm_newline_list_t *newline_list) {
18091
18162
  pm_error_t *errors = calloc(error_list->size, sizeof(pm_error_t));
18163
+ int32_t start_line = parser->start_line;
18092
18164
 
18093
18165
  for (pm_diagnostic_t *error = (pm_diagnostic_t *) error_list->head; error != NULL; error = (pm_diagnostic_t *) error->node.next) {
18094
- pm_line_column_t start = pm_newline_list_line_column(newline_list, error->location.start);
18095
- pm_line_column_t end = pm_newline_list_line_column(newline_list, error->location.end);
18166
+ pm_line_column_t start = pm_newline_list_line_column(newline_list, error->location.start, start_line);
18167
+ pm_line_column_t end = pm_newline_list_line_column(newline_list, error->location.end, start_line);
18096
18168
 
18097
18169
  // We're going to insert this error into the array in sorted order. We
18098
18170
  // do this by finding the first error that has a line number greater
@@ -18103,8 +18175,8 @@ pm_parser_errors_format_sort(const pm_list_t *error_list, const pm_newline_list_
18103
18175
  (index < error_list->size) &&
18104
18176
  (errors[index].error != NULL) &&
18105
18177
  (
18106
- (errors[index].line < ((uint32_t) start.line)) ||
18107
- (errors[index].line == ((uint32_t) start.line) && errors[index].column_start < ((uint32_t) start.column))
18178
+ (errors[index].line < start.line) ||
18179
+ ((errors[index].line == start.line) && (errors[index].column_start < start.column))
18108
18180
  )
18109
18181
  ) index++;
18110
18182
 
@@ -18117,18 +18189,18 @@ pm_parser_errors_format_sort(const pm_list_t *error_list, const pm_newline_list_
18117
18189
  // Finally, we'll insert the error into the array.
18118
18190
  uint32_t column_end;
18119
18191
  if (start.line == end.line) {
18120
- column_end = (uint32_t) end.column;
18192
+ column_end = end.column;
18121
18193
  } else {
18122
- column_end = (uint32_t) (newline_list->offsets[start.line] - newline_list->offsets[start.line - 1] - 1);
18194
+ column_end = (uint32_t) (newline_list->offsets[start.line - start_line + 1] - newline_list->offsets[start.line - start_line] - 1);
18123
18195
  }
18124
18196
 
18125
18197
  // Ensure we have at least one column of error.
18126
- if (((uint32_t) start.column) == column_end) column_end++;
18198
+ if (start.column == column_end) column_end++;
18127
18199
 
18128
18200
  errors[index] = (pm_error_t) {
18129
18201
  .error = error,
18130
- .line = (uint32_t) start.line,
18131
- .column_start = (uint32_t) start.column,
18202
+ .line = start.line,
18203
+ .column_start = start.column,
18132
18204
  .column_end = column_end
18133
18205
  };
18134
18206
  }
@@ -18137,17 +18209,19 @@ pm_parser_errors_format_sort(const pm_list_t *error_list, const pm_newline_list_
18137
18209
  }
18138
18210
 
18139
18211
  static inline void
18140
- pm_parser_errors_format_line(const pm_parser_t *parser, const pm_newline_list_t *newline_list, const char *number_prefix, size_t line, pm_buffer_t *buffer) {
18141
- const uint8_t *start = &parser->start[newline_list->offsets[line - 1]];
18212
+ pm_parser_errors_format_line(const pm_parser_t *parser, const pm_newline_list_t *newline_list, const char *number_prefix, int32_t line, pm_buffer_t *buffer) {
18213
+ size_t index = (size_t) (line - parser->start_line);
18214
+
18215
+ const uint8_t *start = &parser->start[newline_list->offsets[index]];
18142
18216
  const uint8_t *end;
18143
18217
 
18144
- if (line >= newline_list->size) {
18218
+ if (index >= newline_list->size - 1) {
18145
18219
  end = parser->end;
18146
18220
  } else {
18147
- end = &parser->start[newline_list->offsets[line]];
18221
+ end = &parser->start[newline_list->offsets[index + 1]];
18148
18222
  }
18149
18223
 
18150
- pm_buffer_append_format(buffer, number_prefix, (uint32_t) line);
18224
+ pm_buffer_append_format(buffer, number_prefix, line);
18151
18225
  pm_buffer_append_string(buffer, (const char *) start, (size_t) (end - start));
18152
18226
 
18153
18227
  if (end == parser->end && end[-1] != '\n') {
@@ -18165,25 +18239,26 @@ pm_parser_errors_format(const pm_parser_t *parser, pm_buffer_t *buffer, bool col
18165
18239
 
18166
18240
  // First, we're going to sort all of the errors by line number using an
18167
18241
  // insertion sort into a newly allocated array.
18242
+ const int32_t start_line = parser->start_line;
18168
18243
  const pm_newline_list_t *newline_list = &parser->newline_list;
18169
- pm_error_t *errors = pm_parser_errors_format_sort(error_list, newline_list);
18244
+ pm_error_t *errors = pm_parser_errors_format_sort(parser, error_list, newline_list);
18170
18245
 
18171
18246
  // Now we're going to determine how we're going to format line numbers and
18172
18247
  // blank lines based on the maximum number of digits in the line numbers
18173
18248
  // that are going to be displayed.
18174
18249
  pm_error_format_t error_format;
18175
- size_t max_line_number = errors[error_list->size - 1].line;
18250
+ int32_t max_line_number = errors[error_list->size - 1].line - start_line;
18176
18251
 
18177
18252
  if (max_line_number < 10) {
18178
18253
  if (colorize) {
18179
18254
  error_format = (pm_error_format_t) {
18180
- .number_prefix = PM_COLOR_GRAY "%1" PRIu32 " | " PM_COLOR_RESET,
18255
+ .number_prefix = PM_COLOR_GRAY "%1" PRIi32 " | " PM_COLOR_RESET,
18181
18256
  .blank_prefix = PM_COLOR_GRAY " | " PM_COLOR_RESET,
18182
18257
  .divider = PM_COLOR_GRAY " ~~~~~" PM_COLOR_RESET "\n"
18183
18258
  };
18184
18259
  } else {
18185
18260
  error_format = (pm_error_format_t) {
18186
- .number_prefix = "%1" PRIu32 " | ",
18261
+ .number_prefix = "%1" PRIi32 " | ",
18187
18262
  .blank_prefix = " | ",
18188
18263
  .divider = " ~~~~~\n"
18189
18264
  };
@@ -18191,13 +18266,13 @@ pm_parser_errors_format(const pm_parser_t *parser, pm_buffer_t *buffer, bool col
18191
18266
  } else if (max_line_number < 100) {
18192
18267
  if (colorize) {
18193
18268
  error_format = (pm_error_format_t) {
18194
- .number_prefix = PM_COLOR_GRAY "%2" PRIu32 " | " PM_COLOR_RESET,
18269
+ .number_prefix = PM_COLOR_GRAY "%2" PRIi32 " | " PM_COLOR_RESET,
18195
18270
  .blank_prefix = PM_COLOR_GRAY " | " PM_COLOR_RESET,
18196
18271
  .divider = PM_COLOR_GRAY " ~~~~~~" PM_COLOR_RESET "\n"
18197
18272
  };
18198
18273
  } else {
18199
18274
  error_format = (pm_error_format_t) {
18200
- .number_prefix = "%2" PRIu32 " | ",
18275
+ .number_prefix = "%2" PRIi32 " | ",
18201
18276
  .blank_prefix = " | ",
18202
18277
  .divider = " ~~~~~~\n"
18203
18278
  };
@@ -18205,13 +18280,13 @@ pm_parser_errors_format(const pm_parser_t *parser, pm_buffer_t *buffer, bool col
18205
18280
  } else if (max_line_number < 1000) {
18206
18281
  if (colorize) {
18207
18282
  error_format = (pm_error_format_t) {
18208
- .number_prefix = PM_COLOR_GRAY "%3" PRIu32 " | " PM_COLOR_RESET,
18283
+ .number_prefix = PM_COLOR_GRAY "%3" PRIi32 " | " PM_COLOR_RESET,
18209
18284
  .blank_prefix = PM_COLOR_GRAY " | " PM_COLOR_RESET,
18210
18285
  .divider = PM_COLOR_GRAY " ~~~~~~~" PM_COLOR_RESET "\n"
18211
18286
  };
18212
18287
  } else {
18213
18288
  error_format = (pm_error_format_t) {
18214
- .number_prefix = "%3" PRIu32 " | ",
18289
+ .number_prefix = "%3" PRIi32 " | ",
18215
18290
  .blank_prefix = " | ",
18216
18291
  .divider = " ~~~~~~~\n"
18217
18292
  };
@@ -18219,13 +18294,13 @@ pm_parser_errors_format(const pm_parser_t *parser, pm_buffer_t *buffer, bool col
18219
18294
  } else if (max_line_number < 10000) {
18220
18295
  if (colorize) {
18221
18296
  error_format = (pm_error_format_t) {
18222
- .number_prefix = PM_COLOR_GRAY "%4" PRIu32 " | " PM_COLOR_RESET,
18297
+ .number_prefix = PM_COLOR_GRAY "%4" PRIi32 " | " PM_COLOR_RESET,
18223
18298
  .blank_prefix = PM_COLOR_GRAY " | " PM_COLOR_RESET,
18224
18299
  .divider = PM_COLOR_GRAY " ~~~~~~~~" PM_COLOR_RESET "\n"
18225
18300
  };
18226
18301
  } else {
18227
18302
  error_format = (pm_error_format_t) {
18228
- .number_prefix = "%4" PRIu32 " | ",
18303
+ .number_prefix = "%4" PRIi32 " | ",
18229
18304
  .blank_prefix = " | ",
18230
18305
  .divider = " ~~~~~~~~\n"
18231
18306
  };
@@ -18233,13 +18308,13 @@ pm_parser_errors_format(const pm_parser_t *parser, pm_buffer_t *buffer, bool col
18233
18308
  } else {
18234
18309
  if (colorize) {
18235
18310
  error_format = (pm_error_format_t) {
18236
- .number_prefix = PM_COLOR_GRAY "%5" PRIu32 " | " PM_COLOR_RESET,
18311
+ .number_prefix = PM_COLOR_GRAY "%5" PRIi32 " | " PM_COLOR_RESET,
18237
18312
  .blank_prefix = PM_COLOR_GRAY " | " PM_COLOR_RESET,
18238
18313
  .divider = PM_COLOR_GRAY " ~~~~~~~~" PM_COLOR_RESET "\n"
18239
18314
  };
18240
18315
  } else {
18241
18316
  error_format = (pm_error_format_t) {
18242
- .number_prefix = "%5" PRIu32 " | ",
18317
+ .number_prefix = "%5" PRIi32 " | ",
18243
18318
  .blank_prefix = " | ",
18244
18319
  .divider = " ~~~~~~~~\n"
18245
18320
  };
@@ -18254,7 +18329,7 @@ pm_parser_errors_format(const pm_parser_t *parser, pm_buffer_t *buffer, bool col
18254
18329
  // the source before the error to give some context. We'll be careful not to
18255
18330
  // display the same line twice in case the errors are close enough in the
18256
18331
  // source.
18257
- uint32_t last_line = 0;
18332
+ int32_t last_line = 0;
18258
18333
  const pm_encoding_t *encoding = parser->encoding;
18259
18334
 
18260
18335
  for (size_t index = 0; index < error_list->size; index++) {
@@ -18300,7 +18375,7 @@ pm_parser_errors_format(const pm_parser_t *parser, pm_buffer_t *buffer, bool col
18300
18375
  pm_buffer_append_string(buffer, error_format.blank_prefix, error_format.blank_prefix_length);
18301
18376
 
18302
18377
  size_t column = 0;
18303
- const uint8_t *start = &parser->start[newline_list->offsets[error->line - 1]];
18378
+ const uint8_t *start = &parser->start[newline_list->offsets[error->line - start_line]];
18304
18379
 
18305
18380
  while (column < error->column_end) {
18306
18381
  if (column < error->column_start) {
@@ -18324,7 +18399,7 @@ pm_parser_errors_format(const pm_parser_t *parser, pm_buffer_t *buffer, bool col
18324
18399
  // Here we determine how many lines of padding to display after the
18325
18400
  // error, depending on where the next error is in source.
18326
18401
  last_line = error->line;
18327
- size_t next_line = (index == error_list->size - 1) ? newline_list->size : errors[index + 1].line;
18402
+ int32_t next_line = (index == error_list->size - 1) ? ((int32_t) newline_list->size) : errors[index + 1].line;
18328
18403
 
18329
18404
  if (next_line - last_line > 1) {
18330
18405
  pm_buffer_append_string(buffer, " ", 2);