yarp 0.9.0 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +15 -1
  3. data/Makefile +5 -1
  4. data/config.yml +156 -125
  5. data/docs/encoding.md +5 -5
  6. data/docs/serialization.md +2 -2
  7. data/ext/yarp/api_node.c +142 -98
  8. data/ext/yarp/extension.c +21 -7
  9. data/ext/yarp/extension.h +1 -1
  10. data/include/yarp/ast.h +327 -18
  11. data/include/yarp/defines.h +2 -1
  12. data/include/yarp/diagnostic.h +3 -3
  13. data/include/yarp/enc/yp_encoding.h +10 -10
  14. data/include/yarp/parser.h +19 -19
  15. data/include/yarp/regexp.h +1 -1
  16. data/include/yarp/unescape.h +4 -4
  17. data/include/yarp/util/yp_buffer.h +3 -0
  18. data/include/yarp/util/yp_char.h +16 -16
  19. data/include/yarp/util/yp_constant_pool.h +2 -2
  20. data/include/yarp/util/yp_newline_list.h +5 -5
  21. data/include/yarp/util/yp_string.h +4 -4
  22. data/include/yarp/util/yp_string_list.h +0 -3
  23. data/include/yarp/util/yp_strpbrk.h +1 -1
  24. data/include/yarp/version.h +2 -2
  25. data/include/yarp.h +5 -4
  26. data/lib/yarp/desugar_visitor.rb +59 -122
  27. data/lib/yarp/node.rb +230 -240
  28. data/lib/yarp/serialize.rb +16 -16
  29. data/lib/yarp.rb +5 -5
  30. data/src/diagnostic.c +1 -1
  31. data/src/enc/yp_big5.c +15 -42
  32. data/src/enc/yp_euc_jp.c +16 -43
  33. data/src/enc/yp_gbk.c +19 -46
  34. data/src/enc/yp_shift_jis.c +16 -43
  35. data/src/enc/yp_tables.c +36 -38
  36. data/src/enc/yp_unicode.c +20 -25
  37. data/src/enc/yp_windows_31j.c +16 -43
  38. data/src/node.c +1271 -899
  39. data/src/prettyprint.c +87 -48
  40. data/src/regexp.c +21 -21
  41. data/src/serialize.c +28 -15
  42. data/src/unescape.c +151 -121
  43. data/src/util/yp_buffer.c +7 -2
  44. data/src/util/yp_char.c +34 -34
  45. data/src/util/yp_constant_pool.c +4 -4
  46. data/src/util/yp_memchr.c +1 -1
  47. data/src/util/yp_newline_list.c +5 -4
  48. data/src/util/yp_string.c +22 -20
  49. data/src/util/yp_string_list.c +0 -6
  50. data/src/util/yp_strncasecmp.c +3 -6
  51. data/src/util/yp_strpbrk.c +8 -8
  52. data/src/yarp.c +355 -216
  53. data/yarp.gemspec +1 -1
  54. metadata +2 -2
data/src/yarp.c CHANGED
@@ -161,14 +161,18 @@ debug_token(yp_token_t * token) {
161
161
 
162
162
  #endif
163
163
 
164
+ /* Macros for min/max. */
165
+ #define MIN(a,b) (((a)<(b))?(a):(b))
166
+ #define MAX(a,b) (((a)>(b))?(a):(b))
167
+
164
168
  /******************************************************************************/
165
169
  /* Lex mode manipulations */
166
170
  /******************************************************************************/
167
171
 
168
172
  // Returns the incrementor character that should be used to increment the
169
173
  // nesting count if one is possible.
170
- static inline char
171
- lex_mode_incrementor(const char start) {
174
+ static inline uint8_t
175
+ lex_mode_incrementor(const uint8_t start) {
172
176
  switch (start) {
173
177
  case '(':
174
178
  case '[':
@@ -182,8 +186,8 @@ lex_mode_incrementor(const char start) {
182
186
 
183
187
  // Returns the matching character that should be used to terminate a list
184
188
  // beginning with the given character.
185
- static inline char
186
- lex_mode_terminator(const char start) {
189
+ static inline uint8_t
190
+ lex_mode_terminator(const uint8_t start) {
187
191
  switch (start) {
188
192
  case '(':
189
193
  return ')';
@@ -221,9 +225,9 @@ lex_mode_push(yp_parser_t *parser, yp_lex_mode_t lex_mode) {
221
225
 
222
226
  // Push on a new list lex mode.
223
227
  static inline bool
224
- lex_mode_push_list(yp_parser_t *parser, bool interpolation, char delimiter) {
225
- char incrementor = lex_mode_incrementor(delimiter);
226
- char terminator = lex_mode_terminator(delimiter);
228
+ lex_mode_push_list(yp_parser_t *parser, bool interpolation, uint8_t delimiter) {
229
+ uint8_t incrementor = lex_mode_incrementor(delimiter);
230
+ uint8_t terminator = lex_mode_terminator(delimiter);
227
231
 
228
232
  yp_lex_mode_t lex_mode = {
229
233
  .mode = YP_LEX_LIST,
@@ -237,7 +241,7 @@ lex_mode_push_list(yp_parser_t *parser, bool interpolation, char delimiter) {
237
241
 
238
242
  // These are the places where we need to split up the content of the list.
239
243
  // We'll use strpbrk to find the first of these characters.
240
- char *breakpoints = lex_mode.as.list.breakpoints;
244
+ uint8_t *breakpoints = lex_mode.as.list.breakpoints;
241
245
  memcpy(breakpoints, "\\ \t\f\r\v\n\0\0\0", sizeof(lex_mode.as.list.breakpoints));
242
246
 
243
247
  // Now we'll add the terminator to the list of breakpoints.
@@ -260,7 +264,7 @@ lex_mode_push_list(yp_parser_t *parser, bool interpolation, char delimiter) {
260
264
 
261
265
  // Push on a new regexp lex mode.
262
266
  static inline bool
263
- lex_mode_push_regexp(yp_parser_t *parser, char incrementor, char terminator) {
267
+ lex_mode_push_regexp(yp_parser_t *parser, uint8_t incrementor, uint8_t terminator) {
264
268
  yp_lex_mode_t lex_mode = {
265
269
  .mode = YP_LEX_REGEXP,
266
270
  .as.regexp = {
@@ -273,7 +277,7 @@ lex_mode_push_regexp(yp_parser_t *parser, char incrementor, char terminator) {
273
277
  // These are the places where we need to split up the content of the
274
278
  // regular expression. We'll use strpbrk to find the first of these
275
279
  // characters.
276
- char *breakpoints = lex_mode.as.regexp.breakpoints;
280
+ uint8_t *breakpoints = lex_mode.as.regexp.breakpoints;
277
281
  memcpy(breakpoints, "\n\\#\0\0", sizeof(lex_mode.as.regexp.breakpoints));
278
282
 
279
283
  // First we'll add the terminator.
@@ -289,7 +293,7 @@ lex_mode_push_regexp(yp_parser_t *parser, char incrementor, char terminator) {
289
293
 
290
294
  // Push on a new string lex mode.
291
295
  static inline bool
292
- lex_mode_push_string(yp_parser_t *parser, bool interpolation, bool label_allowed, char incrementor, char terminator) {
296
+ lex_mode_push_string(yp_parser_t *parser, bool interpolation, bool label_allowed, uint8_t incrementor, uint8_t terminator) {
293
297
  yp_lex_mode_t lex_mode = {
294
298
  .mode = YP_LEX_STRING,
295
299
  .as.string = {
@@ -303,7 +307,7 @@ lex_mode_push_string(yp_parser_t *parser, bool interpolation, bool label_allowed
303
307
 
304
308
  // These are the places where we need to split up the content of the
305
309
  // string. We'll use strpbrk to find the first of these characters.
306
- char *breakpoints = lex_mode.as.string.breakpoints;
310
+ uint8_t *breakpoints = lex_mode.as.string.breakpoints;
307
311
  memcpy(breakpoints, "\n\\\0\0\0", sizeof(lex_mode.as.string.breakpoints));
308
312
 
309
313
  // Now add in the terminator.
@@ -380,6 +384,9 @@ lex_state_arg_p(yp_parser_t *parser) {
380
384
 
381
385
  static inline bool
382
386
  lex_state_spcarg_p(yp_parser_t *parser, bool space_seen) {
387
+ if (parser->current.end >= parser->end) {
388
+ return false;
389
+ }
383
390
  return lex_state_arg_p(parser) && space_seen && !yp_char_is_whitespace(*parser->current.end);
384
391
  }
385
392
 
@@ -420,7 +427,7 @@ debug_lex_state_set(yp_parser_t *parser, yp_lex_state_t state, char const * call
420
427
 
421
428
  // Retrieve the constant pool id for the given location.
422
429
  static inline yp_constant_id_t
423
- yp_parser_constant_id_location(yp_parser_t *parser, const char *start, const char *end) {
430
+ yp_parser_constant_id_location(yp_parser_t *parser, const uint8_t *start, const uint8_t *end) {
424
431
  return yp_constant_pool_insert(&parser->constant_pool, start, (size_t) (end - start));
425
432
  }
426
433
 
@@ -606,13 +613,45 @@ yp_scope_node_init(yp_node_t *node, yp_scope_node_t *scope) {
606
613
  /* Node creation functions */
607
614
  /******************************************************************************/
608
615
 
616
+ // Parse the decimal number represented by the range of bytes. returns
617
+ // UINT32_MAX if the number fails to parse. This function assumes that the range
618
+ // of bytes has already been validated to contain only decimal digits.
619
+ static uint32_t
620
+ parse_decimal_number(yp_parser_t *parser, const uint8_t *start, const uint8_t *end) {
621
+ ptrdiff_t diff = end - start;
622
+ assert(diff > 0 && ((unsigned long) diff < SIZE_MAX));
623
+ size_t length = (size_t) diff;
624
+
625
+ char *digits = calloc(length + 1, sizeof(char));
626
+ memcpy(digits, start, length);
627
+ digits[length] = '\0';
628
+
629
+ char *endptr;
630
+ errno = 0;
631
+ unsigned long value = strtoul(digits, &endptr, 10);
632
+
633
+ if ((digits == endptr) || (*endptr != '\0') || (errno == ERANGE)) {
634
+ yp_diagnostic_list_append(&parser->error_list, start, end, "invalid decimal number");
635
+ value = UINT32_MAX;
636
+ }
637
+
638
+ free(digits);
639
+
640
+ if (value > UINT32_MAX) {
641
+ yp_diagnostic_list_append(&parser->error_list, start, end, "invalid decimal number");
642
+ value = UINT32_MAX;
643
+ }
644
+
645
+ return (uint32_t) value;
646
+ }
647
+
609
648
  // Parse out the options for a regular expression.
610
649
  static inline yp_node_flags_t
611
650
  yp_regular_expression_flags_create(const yp_token_t *closing) {
612
651
  yp_node_flags_t flags = 0;
613
652
 
614
653
  if (closing->type == YP_TOKEN_REGEXP_END) {
615
- for (const char *flag = closing->start + 1; flag < closing->end; flag++) {
654
+ for (const uint8_t *flag = closing->start + 1; flag < closing->end; flag++) {
616
655
  switch (*flag) {
617
656
  case 'i': flags |= YP_REGULAR_EXPRESSION_FLAGS_IGNORE_CASE; break;
618
657
  case 'm': flags |= YP_REGULAR_EXPRESSION_FLAGS_MULTI_LINE; break;
@@ -654,7 +693,7 @@ yp_alloc_node(YP_ATTRIBUTE_UNUSED yp_parser_t *parser, size_t size) {
654
693
 
655
694
  // Allocate a new MissingNode node.
656
695
  static yp_missing_node_t *
657
- yp_missing_node_create(yp_parser_t *parser, const char *start, const char *end) {
696
+ yp_missing_node_create(yp_parser_t *parser, const uint8_t *start, const uint8_t *end) {
658
697
  yp_missing_node_t *node = YP_ALLOC_NODE(parser, yp_missing_node_t);
659
698
  *node = (yp_missing_node_t) {{ .type = YP_NODE_MISSING_NODE, .location = { .start = start, .end = end } }};
660
699
  return node;
@@ -923,7 +962,7 @@ yp_array_pattern_node_requireds_append(yp_array_pattern_node_t *node, yp_node_t
923
962
  static yp_assoc_node_t *
924
963
  yp_assoc_node_create(yp_parser_t *parser, yp_node_t *key, const yp_token_t *operator, yp_node_t *value) {
925
964
  yp_assoc_node_t *node = YP_ALLOC_NODE(parser, yp_assoc_node_t);
926
- const char *end;
965
+ const uint8_t *end;
927
966
 
928
967
  if (value != NULL) {
929
968
  end = value->location.end;
@@ -1107,7 +1146,7 @@ static yp_block_parameters_node_t *
1107
1146
  yp_block_parameters_node_create(yp_parser_t *parser, yp_parameters_node_t *parameters, const yp_token_t *opening) {
1108
1147
  yp_block_parameters_node_t *node = YP_ALLOC_NODE(parser, yp_block_parameters_node_t);
1109
1148
 
1110
- const char *start;
1149
+ const uint8_t *start;
1111
1150
  if (opening->type != YP_TOKEN_NOT_PROVIDED) {
1112
1151
  start = opening->start;
1113
1152
  } else if (parameters != NULL) {
@@ -1116,7 +1155,7 @@ yp_block_parameters_node_create(yp_parser_t *parser, yp_parameters_node_t *param
1116
1155
  start = NULL;
1117
1156
  }
1118
1157
 
1119
- const char *end;
1158
+ const uint8_t *end;
1120
1159
  if (parameters != NULL) {
1121
1160
  end = parameters->base.location.end;
1122
1161
  } else if (opening->type != YP_TOKEN_NOT_PROVIDED) {
@@ -1237,8 +1276,8 @@ static yp_call_node_t *
1237
1276
  yp_call_node_binary_create(yp_parser_t *parser, yp_node_t *receiver, yp_token_t *operator, yp_node_t *argument) {
1238
1277
  yp_call_node_t *node = yp_call_node_create(parser);
1239
1278
 
1240
- node->base.location.start = receiver->location.start;
1241
- node->base.location.end = argument->location.end;
1279
+ node->base.location.start = MIN(receiver->location.start, argument->location.start);
1280
+ node->base.location.end = MAX(receiver->location.end, argument->location.end);
1242
1281
 
1243
1282
  node->receiver = receiver;
1244
1283
  node->message_loc = YP_OPTIONAL_LOCATION_TOKEN_VALUE(operator);
@@ -1434,7 +1473,7 @@ yp_call_operator_write_node_create(yp_parser_t *parser, yp_call_node_t *target,
1434
1473
  .target = target,
1435
1474
  .operator_loc = YP_LOCATION_TOKEN_VALUE(operator),
1436
1475
  .value = value,
1437
- .operator_id = yp_parser_constant_id_location(parser, operator->start, operator->end - 1)
1476
+ .operator = yp_parser_constant_id_location(parser, operator->start, operator->end - 1)
1438
1477
  };
1439
1478
 
1440
1479
  return node;
@@ -1555,8 +1594,7 @@ yp_class_node_create(yp_parser_t *parser, yp_constant_id_list_t *locals, const y
1555
1594
 
1556
1595
  // Allocate and initialize a new ClassVariableAndWriteNode node.
1557
1596
  static yp_class_variable_and_write_node_t *
1558
- yp_class_variable_and_write_node_create(yp_parser_t *parser, yp_node_t *target, const yp_token_t *operator, yp_node_t *value) {
1559
- assert(YP_NODE_TYPE_P(target, YP_NODE_CLASS_VARIABLE_READ_NODE));
1597
+ yp_class_variable_and_write_node_create(yp_parser_t *parser, yp_class_variable_read_node_t *target, const yp_token_t *operator, yp_node_t *value) {
1560
1598
  assert(operator->type == YP_TOKEN_AMPERSAND_AMPERSAND_EQUAL);
1561
1599
  yp_class_variable_and_write_node_t *node = YP_ALLOC_NODE(parser, yp_class_variable_and_write_node_t);
1562
1600
 
@@ -1564,11 +1602,12 @@ yp_class_variable_and_write_node_create(yp_parser_t *parser, yp_node_t *target,
1564
1602
  {
1565
1603
  .type = YP_NODE_CLASS_VARIABLE_AND_WRITE_NODE,
1566
1604
  .location = {
1567
- .start = target->location.start,
1605
+ .start = target->base.location.start,
1568
1606
  .end = value->location.end
1569
1607
  }
1570
1608
  },
1571
- .name_loc = target->location,
1609
+ .name = target->name,
1610
+ .name_loc = target->base.location,
1572
1611
  .operator_loc = YP_LOCATION_TOKEN_VALUE(operator),
1573
1612
  .value = value
1574
1613
  };
@@ -1578,18 +1617,19 @@ yp_class_variable_and_write_node_create(yp_parser_t *parser, yp_node_t *target,
1578
1617
 
1579
1618
  // Allocate and initialize a new ClassVariableOperatorWriteNode node.
1580
1619
  static yp_class_variable_operator_write_node_t *
1581
- yp_class_variable_operator_write_node_create(yp_parser_t *parser, yp_node_t *target, const yp_token_t *operator, yp_node_t *value) {
1620
+ yp_class_variable_operator_write_node_create(yp_parser_t *parser, yp_class_variable_read_node_t *target, const yp_token_t *operator, yp_node_t *value) {
1582
1621
  yp_class_variable_operator_write_node_t *node = YP_ALLOC_NODE(parser, yp_class_variable_operator_write_node_t);
1583
1622
 
1584
1623
  *node = (yp_class_variable_operator_write_node_t) {
1585
1624
  {
1586
1625
  .type = YP_NODE_CLASS_VARIABLE_OPERATOR_WRITE_NODE,
1587
1626
  .location = {
1588
- .start = target->location.start,
1627
+ .start = target->base.location.start,
1589
1628
  .end = value->location.end
1590
1629
  }
1591
1630
  },
1592
- .name_loc = target->location,
1631
+ .name = target->name,
1632
+ .name_loc = target->base.location,
1593
1633
  .operator_loc = YP_LOCATION_TOKEN_VALUE(operator),
1594
1634
  .value = value,
1595
1635
  .operator = yp_parser_constant_id_location(parser, operator->start, operator->end - 1)
@@ -1600,8 +1640,7 @@ yp_class_variable_operator_write_node_create(yp_parser_t *parser, yp_node_t *tar
1600
1640
 
1601
1641
  // Allocate and initialize a new ClassVariableOrWriteNode node.
1602
1642
  static yp_class_variable_or_write_node_t *
1603
- yp_class_variable_or_write_node_create(yp_parser_t *parser, yp_node_t *target, const yp_token_t *operator, yp_node_t *value) {
1604
- assert(YP_NODE_TYPE_P(target, YP_NODE_CLASS_VARIABLE_READ_NODE));
1643
+ yp_class_variable_or_write_node_create(yp_parser_t *parser, yp_class_variable_read_node_t *target, const yp_token_t *operator, yp_node_t *value) {
1605
1644
  assert(operator->type == YP_TOKEN_PIPE_PIPE_EQUAL);
1606
1645
  yp_class_variable_or_write_node_t *node = YP_ALLOC_NODE(parser, yp_class_variable_or_write_node_t);
1607
1646
 
@@ -1609,11 +1648,12 @@ yp_class_variable_or_write_node_create(yp_parser_t *parser, yp_node_t *target, c
1609
1648
  {
1610
1649
  .type = YP_NODE_CLASS_VARIABLE_OR_WRITE_NODE,
1611
1650
  .location = {
1612
- .start = target->location.start,
1651
+ .start = target->base.location.start,
1613
1652
  .end = value->location.end
1614
1653
  }
1615
1654
  },
1616
- .name_loc = target->location,
1655
+ .name = target->name,
1656
+ .name_loc = target->base.location,
1617
1657
  .operator_loc = YP_LOCATION_TOKEN_VALUE(operator),
1618
1658
  .value = value
1619
1659
  };
@@ -1626,13 +1666,21 @@ static yp_class_variable_read_node_t *
1626
1666
  yp_class_variable_read_node_create(yp_parser_t *parser, const yp_token_t *token) {
1627
1667
  assert(token->type == YP_TOKEN_CLASS_VARIABLE);
1628
1668
  yp_class_variable_read_node_t *node = YP_ALLOC_NODE(parser, yp_class_variable_read_node_t);
1629
- *node = (yp_class_variable_read_node_t) {{ .type = YP_NODE_CLASS_VARIABLE_READ_NODE, .location = YP_LOCATION_TOKEN_VALUE(token) }};
1669
+
1670
+ *node = (yp_class_variable_read_node_t) {
1671
+ {
1672
+ .type = YP_NODE_CLASS_VARIABLE_READ_NODE,
1673
+ .location = YP_LOCATION_TOKEN_VALUE(token)
1674
+ },
1675
+ .name = yp_parser_constant_id_location(parser, token->start, token->end)
1676
+ };
1677
+
1630
1678
  return node;
1631
1679
  }
1632
1680
 
1633
1681
  // Initialize a new ClassVariableWriteNode node from a ClassVariableRead node.
1634
1682
  static yp_class_variable_write_node_t *
1635
- yp_class_variable_read_node_to_class_variable_write_node(yp_parser_t *parser, yp_class_variable_read_node_t *read_node, yp_token_t *operator, yp_node_t *value) {
1683
+ yp_class_variable_write_node_create(yp_parser_t *parser, yp_class_variable_read_node_t *read_node, yp_token_t *operator, yp_node_t *value) {
1636
1684
  yp_class_variable_write_node_t *node = YP_ALLOC_NODE(parser, yp_class_variable_write_node_t);
1637
1685
 
1638
1686
  *node = (yp_class_variable_write_node_t) {
@@ -1643,6 +1691,7 @@ yp_class_variable_read_node_to_class_variable_write_node(yp_parser_t *parser, yp
1643
1691
  .end = value->location.end
1644
1692
  },
1645
1693
  },
1694
+ .name = read_node->name,
1646
1695
  .name_loc = YP_LOCATION_NODE_VALUE((yp_node_t *) read_node),
1647
1696
  .operator_loc = YP_OPTIONAL_LOCATION_TOKEN_VALUE(operator),
1648
1697
  .value = value
@@ -1875,7 +1924,7 @@ yp_def_node_create(
1875
1924
  const yp_token_t *end_keyword
1876
1925
  ) {
1877
1926
  yp_def_node_t *node = YP_ALLOC_NODE(parser, yp_def_node_t);
1878
- const char *end;
1927
+ const uint8_t *end;
1879
1928
 
1880
1929
  if (end_keyword->type == YP_TOKEN_NOT_PROVIDED) {
1881
1930
  end = body->location.end;
@@ -1930,7 +1979,7 @@ yp_defined_node_create(yp_parser_t *parser, const yp_token_t *lparen, yp_node_t
1930
1979
  static yp_else_node_t *
1931
1980
  yp_else_node_create(yp_parser_t *parser, const yp_token_t *else_keyword, yp_statements_node_t *statements, const yp_token_t *end_keyword) {
1932
1981
  yp_else_node_t *node = YP_ALLOC_NODE(parser, yp_else_node_t);
1933
- const char *end = NULL;
1982
+ const uint8_t *end = NULL;
1934
1983
  if ((end_keyword->type == YP_TOKEN_NOT_PROVIDED) && (statements != NULL)) {
1935
1984
  end = statements->base.location.end;
1936
1985
  } else {
@@ -2410,7 +2459,7 @@ yp_if_node_create(yp_parser_t *parser,
2410
2459
  yp_flip_flop(predicate);
2411
2460
  yp_if_node_t *node = YP_ALLOC_NODE(parser, yp_if_node_t);
2412
2461
 
2413
- const char *end;
2462
+ const uint8_t *end;
2414
2463
  if (end_keyword->type != YP_TOKEN_NOT_PROVIDED) {
2415
2464
  end = end_keyword->end;
2416
2465
  } else if (consequent != NULL) {
@@ -2593,7 +2642,7 @@ static yp_in_node_t *
2593
2642
  yp_in_node_create(yp_parser_t *parser, yp_node_t *pattern, yp_statements_node_t *statements, const yp_token_t *in_keyword, const yp_token_t *then_keyword) {
2594
2643
  yp_in_node_t *node = YP_ALLOC_NODE(parser, yp_in_node_t);
2595
2644
 
2596
- const char *end;
2645
+ const uint8_t *end;
2597
2646
  if (statements != NULL) {
2598
2647
  end = statements->base.location.end;
2599
2648
  } else if (then_keyword->type != YP_TOKEN_NOT_PROVIDED) {
@@ -2621,8 +2670,7 @@ yp_in_node_create(yp_parser_t *parser, yp_node_t *pattern, yp_statements_node_t
2621
2670
 
2622
2671
  // Allocate and initialize a new InstanceVariableAndWriteNode node.
2623
2672
  static yp_instance_variable_and_write_node_t *
2624
- yp_instance_variable_and_write_node_create(yp_parser_t *parser, yp_node_t *target, const yp_token_t *operator, yp_node_t *value) {
2625
- assert(YP_NODE_TYPE_P(target, YP_NODE_INSTANCE_VARIABLE_READ_NODE));
2673
+ yp_instance_variable_and_write_node_create(yp_parser_t *parser, yp_instance_variable_read_node_t *target, const yp_token_t *operator, yp_node_t *value) {
2626
2674
  assert(operator->type == YP_TOKEN_AMPERSAND_AMPERSAND_EQUAL);
2627
2675
  yp_instance_variable_and_write_node_t *node = YP_ALLOC_NODE(parser, yp_instance_variable_and_write_node_t);
2628
2676
 
@@ -2630,11 +2678,12 @@ yp_instance_variable_and_write_node_create(yp_parser_t *parser, yp_node_t *targe
2630
2678
  {
2631
2679
  .type = YP_NODE_INSTANCE_VARIABLE_AND_WRITE_NODE,
2632
2680
  .location = {
2633
- .start = target->location.start,
2681
+ .start = target->base.location.start,
2634
2682
  .end = value->location.end
2635
2683
  }
2636
2684
  },
2637
- .name_loc = target->location,
2685
+ .name = target->name,
2686
+ .name_loc = target->base.location,
2638
2687
  .operator_loc = YP_LOCATION_TOKEN_VALUE(operator),
2639
2688
  .value = value
2640
2689
  };
@@ -2644,18 +2693,19 @@ yp_instance_variable_and_write_node_create(yp_parser_t *parser, yp_node_t *targe
2644
2693
 
2645
2694
  // Allocate and initialize a new InstanceVariableOperatorWriteNode node.
2646
2695
  static yp_instance_variable_operator_write_node_t *
2647
- yp_instance_variable_operator_write_node_create(yp_parser_t *parser, yp_node_t *target, const yp_token_t *operator, yp_node_t *value) {
2696
+ yp_instance_variable_operator_write_node_create(yp_parser_t *parser, yp_instance_variable_read_node_t *target, const yp_token_t *operator, yp_node_t *value) {
2648
2697
  yp_instance_variable_operator_write_node_t *node = YP_ALLOC_NODE(parser, yp_instance_variable_operator_write_node_t);
2649
2698
 
2650
2699
  *node = (yp_instance_variable_operator_write_node_t) {
2651
2700
  {
2652
2701
  .type = YP_NODE_INSTANCE_VARIABLE_OPERATOR_WRITE_NODE,
2653
2702
  .location = {
2654
- .start = target->location.start,
2703
+ .start = target->base.location.start,
2655
2704
  .end = value->location.end
2656
2705
  }
2657
2706
  },
2658
- .name_loc = target->location,
2707
+ .name = target->name,
2708
+ .name_loc = target->base.location,
2659
2709
  .operator_loc = YP_LOCATION_TOKEN_VALUE(operator),
2660
2710
  .value = value,
2661
2711
  .operator = yp_parser_constant_id_location(parser, operator->start, operator->end - 1)
@@ -2666,8 +2716,7 @@ yp_instance_variable_operator_write_node_create(yp_parser_t *parser, yp_node_t *
2666
2716
 
2667
2717
  // Allocate and initialize a new InstanceVariableOrWriteNode node.
2668
2718
  static yp_instance_variable_or_write_node_t *
2669
- yp_instance_variable_or_write_node_create(yp_parser_t *parser, yp_node_t *target, const yp_token_t *operator, yp_node_t *value) {
2670
- assert(YP_NODE_TYPE_P(target, YP_NODE_INSTANCE_VARIABLE_READ_NODE));
2719
+ yp_instance_variable_or_write_node_create(yp_parser_t *parser, yp_instance_variable_read_node_t *target, const yp_token_t *operator, yp_node_t *value) {
2671
2720
  assert(operator->type == YP_TOKEN_PIPE_PIPE_EQUAL);
2672
2721
  yp_instance_variable_or_write_node_t *node = YP_ALLOC_NODE(parser, yp_instance_variable_or_write_node_t);
2673
2722
 
@@ -2675,11 +2724,12 @@ yp_instance_variable_or_write_node_create(yp_parser_t *parser, yp_node_t *target
2675
2724
  {
2676
2725
  .type = YP_NODE_INSTANCE_VARIABLE_OR_WRITE_NODE,
2677
2726
  .location = {
2678
- .start = target->location.start,
2727
+ .start = target->base.location.start,
2679
2728
  .end = value->location.end
2680
2729
  }
2681
2730
  },
2682
- .name_loc = target->location,
2731
+ .name = target->name,
2732
+ .name_loc = target->base.location,
2683
2733
  .operator_loc = YP_LOCATION_TOKEN_VALUE(operator),
2684
2734
  .value = value
2685
2735
  };
@@ -2693,9 +2743,13 @@ yp_instance_variable_read_node_create(yp_parser_t *parser, const yp_token_t *tok
2693
2743
  assert(token->type == YP_TOKEN_INSTANCE_VARIABLE);
2694
2744
  yp_instance_variable_read_node_t *node = YP_ALLOC_NODE(parser, yp_instance_variable_read_node_t);
2695
2745
 
2696
- *node = (yp_instance_variable_read_node_t) {{
2697
- .type = YP_NODE_INSTANCE_VARIABLE_READ_NODE, .location = YP_LOCATION_TOKEN_VALUE(token)
2698
- }};
2746
+ *node = (yp_instance_variable_read_node_t) {
2747
+ {
2748
+ .type = YP_NODE_INSTANCE_VARIABLE_READ_NODE,
2749
+ .location = YP_LOCATION_TOKEN_VALUE(token)
2750
+ },
2751
+ .name = yp_parser_constant_id_location(parser, token->start, token->end)
2752
+ };
2699
2753
 
2700
2754
  return node;
2701
2755
  }
@@ -2712,6 +2766,7 @@ yp_instance_variable_write_node_create(yp_parser_t *parser, yp_instance_variable
2712
2766
  .end = value->location.end
2713
2767
  }
2714
2768
  },
2769
+ .name = read_node->name,
2715
2770
  .name_loc = YP_LOCATION_NODE_BASE_VALUE(read_node),
2716
2771
  .operator_loc = YP_OPTIONAL_LOCATION_TOKEN_VALUE(operator),
2717
2772
  .value = value
@@ -2743,8 +2798,13 @@ yp_interpolated_regular_expression_node_create(yp_parser_t *parser, const yp_tok
2743
2798
 
2744
2799
  static inline void
2745
2800
  yp_interpolated_regular_expression_node_append(yp_interpolated_regular_expression_node_t *node, yp_node_t *part) {
2801
+ if (node->base.location.start > part->location.start) {
2802
+ node->base.location.start = part->location.start;
2803
+ }
2804
+ if (node->base.location.end < part->location.end) {
2805
+ node->base.location.end = part->location.end;
2806
+ }
2746
2807
  yp_node_list_append(&node->parts, part);
2747
- node->base.location.end = part->location.end;
2748
2808
  }
2749
2809
 
2750
2810
  static inline void
@@ -2816,10 +2876,11 @@ yp_interpolated_symbol_node_create(yp_parser_t *parser, const yp_token_t *openin
2816
2876
 
2817
2877
  static inline void
2818
2878
  yp_interpolated_symbol_node_append(yp_interpolated_symbol_node_t *node, yp_node_t *part) {
2819
- yp_node_list_append(&node->parts, part);
2820
- if (!node->base.location.start) {
2879
+ if (node->parts.size == 0 && node->opening_loc.start == NULL) {
2821
2880
  node->base.location.start = part->location.start;
2822
2881
  }
2882
+
2883
+ yp_node_list_append(&node->parts, part);
2823
2884
  node->base.location.end = part->location.end;
2824
2885
  }
2825
2886
 
@@ -2959,7 +3020,7 @@ yp_lambda_node_create(
2959
3020
 
2960
3021
  // Allocate and initialize a new LocalVariableAndWriteNode node.
2961
3022
  static yp_local_variable_and_write_node_t *
2962
- yp_local_variable_and_write_node_create(yp_parser_t *parser, yp_node_t *target, const yp_token_t *operator, yp_node_t *value, yp_constant_id_t constant_id, uint32_t depth) {
3023
+ yp_local_variable_and_write_node_create(yp_parser_t *parser, yp_node_t *target, const yp_token_t *operator, yp_node_t *value, yp_constant_id_t name, uint32_t depth) {
2963
3024
  assert(YP_NODE_TYPE_P(target, YP_NODE_LOCAL_VARIABLE_READ_NODE) || YP_NODE_TYPE_P(target, YP_NODE_CALL_NODE));
2964
3025
  assert(operator->type == YP_TOKEN_AMPERSAND_AMPERSAND_EQUAL);
2965
3026
  yp_local_variable_and_write_node_t *node = YP_ALLOC_NODE(parser, yp_local_variable_and_write_node_t);
@@ -2975,7 +3036,7 @@ yp_local_variable_and_write_node_create(yp_parser_t *parser, yp_node_t *target,
2975
3036
  .name_loc = target->location,
2976
3037
  .operator_loc = YP_LOCATION_TOKEN_VALUE(operator),
2977
3038
  .value = value,
2978
- .constant_id = constant_id,
3039
+ .name = name,
2979
3040
  .depth = depth
2980
3041
  };
2981
3042
 
@@ -2984,7 +3045,7 @@ yp_local_variable_and_write_node_create(yp_parser_t *parser, yp_node_t *target,
2984
3045
 
2985
3046
  // Allocate and initialize a new LocalVariableOperatorWriteNode node.
2986
3047
  static yp_local_variable_operator_write_node_t *
2987
- yp_local_variable_operator_write_node_create(yp_parser_t *parser, yp_node_t *target, const yp_token_t *operator, yp_node_t *value, yp_constant_id_t constant_id, uint32_t depth) {
3048
+ yp_local_variable_operator_write_node_create(yp_parser_t *parser, yp_node_t *target, const yp_token_t *operator, yp_node_t *value, yp_constant_id_t name, uint32_t depth) {
2988
3049
  yp_local_variable_operator_write_node_t *node = YP_ALLOC_NODE(parser, yp_local_variable_operator_write_node_t);
2989
3050
 
2990
3051
  *node = (yp_local_variable_operator_write_node_t) {
@@ -2998,8 +3059,8 @@ yp_local_variable_operator_write_node_create(yp_parser_t *parser, yp_node_t *tar
2998
3059
  .name_loc = target->location,
2999
3060
  .operator_loc = YP_LOCATION_TOKEN_VALUE(operator),
3000
3061
  .value = value,
3001
- .constant_id = constant_id,
3002
- .operator_id = yp_parser_constant_id_location(parser, operator->start, operator->end - 1),
3062
+ .name = name,
3063
+ .operator = yp_parser_constant_id_location(parser, operator->start, operator->end - 1),
3003
3064
  .depth = depth
3004
3065
  };
3005
3066
 
@@ -3008,7 +3069,7 @@ yp_local_variable_operator_write_node_create(yp_parser_t *parser, yp_node_t *tar
3008
3069
 
3009
3070
  // Allocate and initialize a new LocalVariableOrWriteNode node.
3010
3071
  static yp_local_variable_or_write_node_t *
3011
- yp_local_variable_or_write_node_create(yp_parser_t *parser, yp_node_t *target, const yp_token_t *operator, yp_node_t *value, yp_constant_id_t constant_id, uint32_t depth) {
3072
+ yp_local_variable_or_write_node_create(yp_parser_t *parser, yp_node_t *target, const yp_token_t *operator, yp_node_t *value, yp_constant_id_t name, uint32_t depth) {
3012
3073
  assert(YP_NODE_TYPE_P(target, YP_NODE_LOCAL_VARIABLE_READ_NODE) || YP_NODE_TYPE_P(target, YP_NODE_CALL_NODE));
3013
3074
  assert(operator->type == YP_TOKEN_PIPE_PIPE_EQUAL);
3014
3075
  yp_local_variable_or_write_node_t *node = YP_ALLOC_NODE(parser, yp_local_variable_or_write_node_t);
@@ -3024,7 +3085,7 @@ yp_local_variable_or_write_node_create(yp_parser_t *parser, yp_node_t *target, c
3024
3085
  .name_loc = target->location,
3025
3086
  .operator_loc = YP_LOCATION_TOKEN_VALUE(operator),
3026
3087
  .value = value,
3027
- .constant_id = constant_id,
3088
+ .name = name,
3028
3089
  .depth = depth
3029
3090
  };
3030
3091
 
@@ -3041,7 +3102,7 @@ yp_local_variable_read_node_create(yp_parser_t *parser, const yp_token_t *name,
3041
3102
  .type = YP_NODE_LOCAL_VARIABLE_READ_NODE,
3042
3103
  .location = YP_LOCATION_TOKEN_VALUE(name)
3043
3104
  },
3044
- .constant_id = yp_parser_constant_id_token(parser, name),
3105
+ .name = yp_parser_constant_id_token(parser, name),
3045
3106
  .depth = depth
3046
3107
  };
3047
3108
 
@@ -3050,7 +3111,7 @@ yp_local_variable_read_node_create(yp_parser_t *parser, const yp_token_t *name,
3050
3111
 
3051
3112
  // Allocate and initialize a new LocalVariableWriteNode node.
3052
3113
  static yp_local_variable_write_node_t *
3053
- yp_local_variable_write_node_create(yp_parser_t *parser, yp_constant_id_t constant_id, uint32_t depth, yp_node_t *value, const yp_location_t *name_loc, const yp_token_t *operator) {
3114
+ yp_local_variable_write_node_create(yp_parser_t *parser, yp_constant_id_t name, uint32_t depth, yp_node_t *value, const yp_location_t *name_loc, const yp_token_t *operator) {
3054
3115
  yp_local_variable_write_node_t *node = YP_ALLOC_NODE(parser, yp_local_variable_write_node_t);
3055
3116
 
3056
3117
  *node = (yp_local_variable_write_node_t) {
@@ -3061,7 +3122,7 @@ yp_local_variable_write_node_create(yp_parser_t *parser, yp_constant_id_t consta
3061
3122
  .end = value->location.end
3062
3123
  }
3063
3124
  },
3064
- .constant_id = constant_id,
3125
+ .name = name,
3065
3126
  .depth = depth,
3066
3127
  .value = value,
3067
3128
  .name_loc = *name_loc,
@@ -3081,7 +3142,7 @@ yp_local_variable_target_node_create(yp_parser_t *parser, const yp_token_t *name
3081
3142
  .type = YP_NODE_LOCAL_VARIABLE_TARGET_NODE,
3082
3143
  .location = YP_LOCATION_TOKEN_VALUE(name)
3083
3144
  },
3084
- .constant_id = yp_parser_constant_id_token(parser, name),
3145
+ .name = yp_parser_constant_id_token(parser, name),
3085
3146
  .depth = 0
3086
3147
  };
3087
3148
 
@@ -3260,7 +3321,8 @@ yp_numbered_reference_read_node_create(yp_parser_t *parser, const yp_token_t *na
3260
3321
  {
3261
3322
  .type = YP_NODE_NUMBERED_REFERENCE_READ_NODE,
3262
3323
  .location = YP_LOCATION_TOKEN_VALUE(name),
3263
- }
3324
+ },
3325
+ .number = parse_decimal_number(parser, name->start + 1, name->end)
3264
3326
  };
3265
3327
 
3266
3328
  return node;
@@ -3279,7 +3341,7 @@ yp_optional_parameter_node_create(yp_parser_t *parser, const yp_token_t *name, c
3279
3341
  .end = value->location.end
3280
3342
  }
3281
3343
  },
3282
- .constant_id = yp_parser_constant_id_token(parser, name),
3344
+ .name = yp_parser_constant_id_token(parser, name),
3283
3345
  .name_loc = YP_LOCATION_TOKEN_VALUE(name),
3284
3346
  .operator_loc = YP_LOCATION_TOKEN_VALUE(operator),
3285
3347
  .value = value
@@ -3576,8 +3638,8 @@ yp_regular_expression_node_create(yp_parser_t *parser, const yp_token_t *opening
3576
3638
  .type = YP_NODE_REGULAR_EXPRESSION_NODE,
3577
3639
  .flags = yp_regular_expression_flags_create(closing),
3578
3640
  .location = {
3579
- .start = opening->start,
3580
- .end = closing->end
3641
+ .start = MIN(opening->start, closing->start),
3642
+ .end = MAX(opening->end, closing->end)
3581
3643
  }
3582
3644
  },
3583
3645
  .opening_loc = YP_LOCATION_TOKEN_VALUE(opening),
@@ -3630,7 +3692,7 @@ yp_required_parameter_node_create(yp_parser_t *parser, const yp_token_t *token)
3630
3692
  .type = YP_NODE_REQUIRED_PARAMETER_NODE,
3631
3693
  .location = YP_LOCATION_TOKEN_VALUE(token)
3632
3694
  },
3633
- .constant_id = yp_parser_constant_id_token(parser, token)
3695
+ .name = yp_parser_constant_id_token(parser, token)
3634
3696
  };
3635
3697
 
3636
3698
  return node;
@@ -3881,19 +3943,21 @@ yp_statements_node_body_length(yp_statements_node_t *node) {
3881
3943
 
3882
3944
  // Set the location of the given StatementsNode.
3883
3945
  static void
3884
- yp_statements_node_location_set(yp_statements_node_t *node, const char *start, const char *end) {
3946
+ yp_statements_node_location_set(yp_statements_node_t *node, const uint8_t *start, const uint8_t *end) {
3885
3947
  node->base.location = (yp_location_t) { .start = start, .end = end };
3886
3948
  }
3887
3949
 
3888
3950
  // Append a new node to the given StatementsNode node's body.
3889
3951
  static void
3890
3952
  yp_statements_node_body_append(yp_statements_node_t *node, yp_node_t *statement) {
3891
- if (yp_statements_node_body_length(node) == 0) {
3953
+ if (yp_statements_node_body_length(node) == 0 || statement->location.start < node->base.location.start) {
3892
3954
  node->base.location.start = statement->location.start;
3893
3955
  }
3956
+ if (statement->location.end > node->base.location.end) {
3957
+ node->base.location.end = statement->location.end;
3958
+ }
3894
3959
 
3895
3960
  yp_node_list_append(&node->body, statement);
3896
- node->base.location.end = statement->location.end;
3897
3961
 
3898
3962
  // Every statement gets marked as a place where a newline can occur.
3899
3963
  statement->flags |= YP_NODE_FLAG_NEWLINE;
@@ -3947,7 +4011,7 @@ yp_super_node_create(yp_parser_t *parser, const yp_token_t *keyword, yp_argument
3947
4011
  assert(keyword->type == YP_TOKEN_KEYWORD_SUPER);
3948
4012
  yp_super_node_t *node = YP_ALLOC_NODE(parser, yp_super_node_t);
3949
4013
 
3950
- const char *end;
4014
+ const uint8_t *end;
3951
4015
  if (arguments->block != NULL) {
3952
4016
  end = arguments->block->base.location.end;
3953
4017
  } else if (arguments->closing_loc.start != NULL) {
@@ -4038,7 +4102,7 @@ yp_symbol_node_label_create(yp_parser_t *parser, const yp_token_t *token) {
4038
4102
  // Check if the given node is a label in a hash.
4039
4103
  static bool
4040
4104
  yp_symbol_node_label_p(yp_node_t *node) {
4041
- const char *end = NULL;
4105
+ const uint8_t *end = NULL;
4042
4106
 
4043
4107
  switch (YP_NODE_TYPE(node)) {
4044
4108
  case YP_NODE_SYMBOL_NODE:
@@ -4146,7 +4210,7 @@ yp_unless_node_create(yp_parser_t *parser, const yp_token_t *keyword, yp_node_t
4146
4210
  yp_flip_flop(predicate);
4147
4211
  yp_unless_node_t *node = YP_ALLOC_NODE(parser, yp_unless_node_t);
4148
4212
 
4149
- const char *end;
4213
+ const uint8_t *end;
4150
4214
  if (statements != NULL) {
4151
4215
  end = statements->base.location.end;
4152
4216
  } else {
@@ -4363,7 +4427,7 @@ static yp_yield_node_t *
4363
4427
  yp_yield_node_create(yp_parser_t *parser, const yp_token_t *keyword, const yp_location_t *lparen_loc, yp_arguments_node_t *arguments, const yp_location_t *rparen_loc) {
4364
4428
  yp_yield_node_t *node = YP_ALLOC_NODE(parser, yp_yield_node_t);
4365
4429
 
4366
- const char *end;
4430
+ const uint8_t *end;
4367
4431
  if (rparen_loc->start != NULL) {
4368
4432
  end = rparen_loc->end;
4369
4433
  } else if (arguments != NULL) {
@@ -4437,7 +4501,7 @@ yp_parser_local_depth(yp_parser_t *parser, yp_token_t *token) {
4437
4501
 
4438
4502
  // Add a local variable from a location to the current scope.
4439
4503
  static yp_constant_id_t
4440
- yp_parser_local_add_location(yp_parser_t *parser, const char *start, const char *end) {
4504
+ yp_parser_local_add_location(yp_parser_t *parser, const uint8_t *start, const uint8_t *end) {
4441
4505
  yp_constant_id_t constant_id = yp_parser_constant_id_location(parser, start, end);
4442
4506
 
4443
4507
  if (!yp_constant_id_list_includes(&parser->current_scope->locals, constant_id)) {
@@ -4486,15 +4550,13 @@ yp_parser_scope_pop(yp_parser_t *parser) {
4486
4550
  // reason we have the encoding_changed boolean to check if we need to go through
4487
4551
  // the function pointer or can just directly use the UTF-8 functions.
4488
4552
  static inline size_t
4489
- char_is_identifier_start(yp_parser_t *parser, const char *c) {
4490
- const unsigned char uc = (unsigned char) *c;
4491
-
4553
+ char_is_identifier_start(yp_parser_t *parser, const uint8_t *b) {
4492
4554
  if (parser->encoding_changed) {
4493
- return parser->encoding.alpha_char(c, parser->end - c) || (uc == '_') || (uc >= 0x80);
4494
- } else if (uc < 0x80) {
4495
- return (yp_encoding_unicode_table[uc] & YP_ENCODING_ALPHABETIC_BIT ? 1 : 0) || (uc == '_');
4555
+ return parser->encoding.alpha_char(b, parser->end - b) || (*b == '_') || (*b >= 0x80);
4556
+ } else if (*b < 0x80) {
4557
+ return (yp_encoding_unicode_table[*b] & YP_ENCODING_ALPHABETIC_BIT ? 1 : 0) || (*b == '_');
4496
4558
  } else {
4497
- return (size_t) (yp_encoding_utf_8_alpha_char(c, parser->end - c) || 1u);
4559
+ return (size_t) (yp_encoding_utf_8_alpha_char(b, parser->end - b) || 1u);
4498
4560
  }
4499
4561
  }
4500
4562
 
@@ -4502,15 +4564,13 @@ char_is_identifier_start(yp_parser_t *parser, const char *c) {
4502
4564
  // the identifiers in a source file once the first character has been found. So
4503
4565
  // it's important that it be as fast as possible.
4504
4566
  static inline size_t
4505
- char_is_identifier(yp_parser_t *parser, const char *c) {
4506
- const unsigned char uc = (unsigned char) *c;
4507
-
4567
+ char_is_identifier(yp_parser_t *parser, const uint8_t *b) {
4508
4568
  if (parser->encoding_changed) {
4509
- return parser->encoding.alnum_char(c, parser->end - c) || (uc == '_') || (uc >= 0x80);
4510
- } else if (uc < 0x80) {
4511
- return (yp_encoding_unicode_table[uc] & YP_ENCODING_ALPHANUMERIC_BIT ? 1 : 0) || (uc == '_');
4569
+ return parser->encoding.alnum_char(b, parser->end - b) || (*b == '_') || (*b >= 0x80);
4570
+ } else if (*b < 0x80) {
4571
+ return (yp_encoding_unicode_table[*b] & YP_ENCODING_ALPHANUMERIC_BIT ? 1 : 0) || (*b == '_');
4512
4572
  } else {
4513
- return (size_t) (yp_encoding_utf_8_alnum_char(c, parser->end - c) || 1u);
4573
+ return (size_t) (yp_encoding_utf_8_alnum_char(b, parser->end - b) || 1u);
4514
4574
  }
4515
4575
  }
4516
4576
 
@@ -4532,15 +4592,15 @@ const unsigned int yp_global_name_punctuation_hash[(0x7e - 0x20 + 31) / 32] = {
4532
4592
  #undef PUNCT
4533
4593
 
4534
4594
  static inline bool
4535
- char_is_global_name_punctuation(const char c) {
4536
- const unsigned int i = (const unsigned int) c;
4595
+ char_is_global_name_punctuation(const uint8_t b) {
4596
+ const unsigned int i = (const unsigned int) b;
4537
4597
  if (i <= 0x20 || 0x7e < i) return false;
4538
4598
 
4539
- return (yp_global_name_punctuation_hash[(i - 0x20) / 32] >> (c % 32)) & 1;
4599
+ return (yp_global_name_punctuation_hash[(i - 0x20) / 32] >> (i % 32)) & 1;
4540
4600
  }
4541
4601
 
4542
4602
  static inline bool
4543
- token_is_numbered_parameter(const char *start, const char *end) {
4603
+ token_is_numbered_parameter(const uint8_t *start, const uint8_t *end) {
4544
4604
  return (end - start == 2) && (start[0] == '_') && (start[1] != '0') && (yp_char_is_decimal_digit(start[1]));
4545
4605
  }
4546
4606
 
@@ -4594,8 +4654,8 @@ yp_do_loop_stack_p(yp_parser_t *parser) {
4594
4654
 
4595
4655
  // Get the next character in the source starting from +cursor+. If that position
4596
4656
  // is beyond the end of the source then return '\0'.
4597
- static inline char
4598
- peek_at(yp_parser_t *parser, const char *cursor) {
4657
+ static inline uint8_t
4658
+ peek_at(yp_parser_t *parser, const uint8_t *cursor) {
4599
4659
  if (cursor < parser->end) {
4600
4660
  return *cursor;
4601
4661
  } else {
@@ -4606,33 +4666,33 @@ peek_at(yp_parser_t *parser, const char *cursor) {
4606
4666
  // Get the next character in the source starting from parser->current.end and
4607
4667
  // adding the given offset. If that position is beyond the end of the source
4608
4668
  // then return '\0'.
4609
- static inline char
4669
+ static inline uint8_t
4610
4670
  peek_offset(yp_parser_t *parser, ptrdiff_t offset) {
4611
4671
  return peek_at(parser, parser->current.end + offset);
4612
4672
  }
4613
4673
 
4614
4674
  // Get the next character in the source starting from parser->current.end. If
4615
4675
  // that position is beyond the end of the source then return '\0'.
4616
- static inline char
4676
+ static inline uint8_t
4617
4677
  peek(yp_parser_t *parser) {
4618
4678
  return peek_at(parser, parser->current.end);
4619
4679
  }
4620
4680
 
4621
4681
  // Get the next string of length len in the source starting from parser->current.end.
4622
4682
  // If the string extends beyond the end of the source, return the empty string ""
4623
- static inline const char*
4683
+ static inline const uint8_t *
4624
4684
  peek_string(yp_parser_t *parser, size_t len) {
4625
4685
  if (parser->current.end + len <= parser->end) {
4626
4686
  return parser->current.end;
4627
4687
  } else {
4628
- return "";
4688
+ return (const uint8_t *) "";
4629
4689
  }
4630
4690
  }
4631
4691
 
4632
4692
  // If the character to be read matches the given value, then returns true and
4633
4693
  // advanced the current pointer.
4634
4694
  static inline bool
4635
- match(yp_parser_t *parser, char value) {
4695
+ match(yp_parser_t *parser, uint8_t value) {
4636
4696
  if (peek(parser) == value) {
4637
4697
  parser->current.end++;
4638
4698
  return true;
@@ -4643,7 +4703,7 @@ match(yp_parser_t *parser, char value) {
4643
4703
  // Return the length of the line ending string starting at +cursor+, or 0 if it
4644
4704
  // is not a line ending. This function is intended to be CRLF/LF agnostic.
4645
4705
  static inline size_t
4646
- match_eol_at(yp_parser_t *parser, const char *cursor) {
4706
+ match_eol_at(yp_parser_t *parser, const uint8_t *cursor) {
4647
4707
  if (peek_at(parser, cursor) == '\n') {
4648
4708
  return 1;
4649
4709
  }
@@ -4670,8 +4730,8 @@ match_eol(yp_parser_t *parser) {
4670
4730
  }
4671
4731
 
4672
4732
  // Skip to the next newline character or NUL byte.
4673
- static inline const char *
4674
- next_newline(const char *cursor, ptrdiff_t length) {
4733
+ static inline const uint8_t *
4734
+ next_newline(const uint8_t *cursor, ptrdiff_t length) {
4675
4735
  assert(length >= 0);
4676
4736
 
4677
4737
  // Note that it's okay for us to use memchr here to look for \n because none
@@ -4682,17 +4742,17 @@ next_newline(const char *cursor, ptrdiff_t length) {
4682
4742
 
4683
4743
  // Find the start of the encoding comment. This is effectively an inlined
4684
4744
  // version of strnstr with some modifications.
4685
- static inline const char *
4686
- parser_lex_encoding_comment_start(yp_parser_t *parser, const char *cursor, ptrdiff_t remaining) {
4745
+ static inline const uint8_t *
4746
+ parser_lex_encoding_comment_start(yp_parser_t *parser, const uint8_t *cursor, ptrdiff_t remaining) {
4687
4747
  assert(remaining >= 0);
4688
4748
  size_t length = (size_t) remaining;
4689
4749
 
4690
4750
  size_t key_length = strlen("coding:");
4691
4751
  if (key_length > length) return NULL;
4692
4752
 
4693
- const char *cursor_limit = cursor + length - key_length + 1;
4753
+ const uint8_t *cursor_limit = cursor + length - key_length + 1;
4694
4754
  while ((cursor = yp_memchr(cursor, 'c', (size_t) (cursor_limit - cursor), parser->encoding_changed, &parser->encoding)) != NULL) {
4695
- if (strncmp(cursor, "coding", key_length - 1) == 0) {
4755
+ if (memcmp(cursor, "coding", key_length - 1) == 0) {
4696
4756
  size_t whitespace_after_coding = yp_strspn_inline_whitespace(cursor + key_length - 1, parser->end - (cursor + key_length - 1));
4697
4757
  size_t cur_pos = key_length + whitespace_after_coding;
4698
4758
 
@@ -4711,13 +4771,13 @@ parser_lex_encoding_comment_start(yp_parser_t *parser, const char *cursor, ptrdi
4711
4771
  // actions are necessary for it here.
4712
4772
  static void
4713
4773
  parser_lex_encoding_comment(yp_parser_t *parser) {
4714
- const char *start = parser->current.start + 1;
4715
- const char *end = next_newline(start, parser->end - start);
4774
+ const uint8_t *start = parser->current.start + 1;
4775
+ const uint8_t *end = next_newline(start, parser->end - start);
4716
4776
  if (end == NULL) end = parser->end;
4717
4777
 
4718
4778
  // These are the patterns we're going to match to find the encoding comment.
4719
4779
  // This is definitely not complete or even really correct.
4720
- const char *encoding_start = parser_lex_encoding_comment_start(parser, start, end - start);
4780
+ const uint8_t *encoding_start = parser_lex_encoding_comment_start(parser, start, end - start);
4721
4781
 
4722
4782
  // If we didn't find anything that matched our patterns, then return. Note
4723
4783
  // that this does a _very_ poor job of actually finding the encoding, and
@@ -4730,7 +4790,7 @@ parser_lex_encoding_comment(yp_parser_t *parser) {
4730
4790
 
4731
4791
  // Now determine the end of the encoding string. This is either the end of
4732
4792
  // the line, the first whitespace character, or a punctuation mark.
4733
- const char *encoding_end = yp_strpbrk(parser, encoding_start, " \t\f\r\v\n;,", end - encoding_start);
4793
+ const uint8_t *encoding_end = yp_strpbrk(parser, encoding_start, (const uint8_t *) " \t\f\r\v\n;,", end - encoding_start);
4734
4794
  encoding_end = encoding_end == NULL ? end : encoding_end;
4735
4795
 
4736
4796
  // Finally, we can determine the width of the encoding string.
@@ -4752,7 +4812,7 @@ parser_lex_encoding_comment(yp_parser_t *parser) {
4752
4812
  // Extensions like utf-8 can contain extra encoding details like,
4753
4813
  // utf-8-dos, utf-8-linux, utf-8-mac. We treat these all as utf-8 should
4754
4814
  // treat any encoding starting utf-8 as utf-8.
4755
- if ((encoding_start + 5 <= parser->end) && (yp_strncasecmp(encoding_start, "utf-8", 5) == 0)) {
4815
+ if ((encoding_start + 5 <= parser->end) && (yp_strncasecmp(encoding_start, (const uint8_t *) "utf-8", 5) == 0)) {
4756
4816
  // We don't need to do anything here because the default encoding is
4757
4817
  // already UTF-8. We'll just return.
4758
4818
  return;
@@ -4761,7 +4821,7 @@ parser_lex_encoding_comment(yp_parser_t *parser) {
4761
4821
  // Next, we're going to loop through each of the encodings that we handle
4762
4822
  // explicitly. If we found one that we understand, we'll use that value.
4763
4823
  #define ENCODING(value, prebuilt) \
4764
- if (width == sizeof(value) - 1 && encoding_start + width <= parser->end && yp_strncasecmp(encoding_start, value, width) == 0) { \
4824
+ if (width == sizeof(value) - 1 && encoding_start + width <= parser->end && yp_strncasecmp(encoding_start, (const uint8_t *) value, width) == 0) { \
4765
4825
  parser->encoding = prebuilt; \
4766
4826
  parser->encoding_changed |= true; \
4767
4827
  if (parser->encoding_changed_callback != NULL) parser->encoding_changed_callback(parser); \
@@ -4901,14 +4961,9 @@ context_push(yp_parser_t *parser, yp_context_t context) {
4901
4961
 
4902
4962
  static void
4903
4963
  context_pop(yp_parser_t *parser) {
4904
- if (parser->current_context->prev == NULL) {
4905
- free(parser->current_context);
4906
- parser->current_context = NULL;
4907
- } else {
4908
- yp_context_node_t *prev = parser->current_context->prev;
4909
- free(parser->current_context);
4910
- parser->current_context = prev;
4911
- }
4964
+ yp_context_node_t *prev = parser->current_context->prev;
4965
+ free(parser->current_context);
4966
+ parser->current_context = prev;
4912
4967
  }
4913
4968
 
4914
4969
  static bool
@@ -4992,7 +5047,8 @@ lex_numeric_prefix(yp_parser_t *parser) {
4992
5047
  // 0d1111 is a decimal number
4993
5048
  case 'd':
4994
5049
  case 'D':
4995
- if (yp_char_is_decimal_digit(*++parser->current.end)) {
5050
+ parser->current.end++;
5051
+ if (yp_char_is_decimal_digit(peek(parser))) {
4996
5052
  parser->current.end += yp_strspn_decimal_number(parser->current.end, parser->end - parser->current.end);
4997
5053
  } else {
4998
5054
  yp_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, "Invalid decimal number.");
@@ -5003,7 +5059,8 @@ lex_numeric_prefix(yp_parser_t *parser) {
5003
5059
  // 0b1111 is a binary number
5004
5060
  case 'b':
5005
5061
  case 'B':
5006
- if (yp_char_is_binary_digit(*++parser->current.end)) {
5062
+ parser->current.end++;
5063
+ if (yp_char_is_binary_digit(peek(parser))) {
5007
5064
  parser->current.end += yp_strspn_binary_number(parser->current.end, parser->end - parser->current.end);
5008
5065
  } else {
5009
5066
  yp_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, "Invalid binary number.");
@@ -5014,7 +5071,8 @@ lex_numeric_prefix(yp_parser_t *parser) {
5014
5071
  // 0o1111 is an octal number
5015
5072
  case 'o':
5016
5073
  case 'O':
5017
- if (yp_char_is_octal_digit(*++parser->current.end)) {
5074
+ parser->current.end++;
5075
+ if (yp_char_is_octal_digit(peek(parser))) {
5018
5076
  parser->current.end += yp_strspn_octal_number(parser->current.end, parser->end - parser->current.end);
5019
5077
  } else {
5020
5078
  yp_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, "Invalid octal number.");
@@ -5038,7 +5096,8 @@ lex_numeric_prefix(yp_parser_t *parser) {
5038
5096
  // 0x1111 is a hexadecimal number
5039
5097
  case 'x':
5040
5098
  case 'X':
5041
- if (yp_char_is_hexadecimal_digit(*++parser->current.end)) {
5099
+ parser->current.end++;
5100
+ if (yp_char_is_hexadecimal_digit(peek(parser))) {
5042
5101
  parser->current.end += yp_strspn_hexadecimal_number(parser->current.end, parser->end - parser->current.end);
5043
5102
  } else {
5044
5103
  yp_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, "Invalid hexadecimal number.");
@@ -5084,7 +5143,7 @@ lex_numeric(yp_parser_t *parser) {
5084
5143
  if (parser->current.end < parser->end) {
5085
5144
  type = lex_numeric_prefix(parser);
5086
5145
 
5087
- const char *end = parser->current.end;
5146
+ const uint8_t *end = parser->current.end;
5088
5147
  yp_token_type_t suffix_type = type;
5089
5148
 
5090
5149
  if (type == YP_TOKEN_INTEGER) {
@@ -5109,8 +5168,8 @@ lex_numeric(yp_parser_t *parser) {
5109
5168
  }
5110
5169
  }
5111
5170
 
5112
- const unsigned char uc = (const unsigned char) peek(parser);
5113
- if (uc != '\0' && (uc >= 0x80 || ((uc >= 'a' && uc <= 'z') || (uc >= 'A' && uc <= 'Z')) || uc == '_')) {
5171
+ const uint8_t b = peek(parser);
5172
+ if (b != '\0' && (b >= 0x80 || ((b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z')) || b == '_')) {
5114
5173
  parser->current.end = end;
5115
5174
  } else {
5116
5175
  type = suffix_type;
@@ -5122,6 +5181,11 @@ lex_numeric(yp_parser_t *parser) {
5122
5181
 
5123
5182
  static yp_token_type_t
5124
5183
  lex_global_variable(yp_parser_t *parser) {
5184
+ if (parser->current.end >= parser->end) {
5185
+ yp_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, "Invalid global variable.");
5186
+ return YP_TOKEN_GLOBAL_VARIABLE;
5187
+ }
5188
+
5125
5189
  switch (*parser->current.end) {
5126
5190
  case '~': // $~: match-data
5127
5191
  case '*': // $*: argv
@@ -5210,7 +5274,7 @@ lex_keyword(yp_parser_t *parser, const char *value, yp_lex_state_t state, yp_tok
5210
5274
  yp_lex_state_t last_state = parser->lex_state;
5211
5275
 
5212
5276
  const size_t vlen = strlen(value);
5213
- if (parser->current.start + vlen <= parser->end && strncmp(parser->current.start, value, vlen) == 0) {
5277
+ if (parser->current.start + vlen <= parser->end && memcmp(parser->current.start, value, vlen) == 0) {
5214
5278
  if (parser->lex_state & YP_LEX_STATE_FNAME) {
5215
5279
  lex_state_set(parser, YP_LEX_STATE_ENDFN);
5216
5280
  } else {
@@ -5376,7 +5440,7 @@ current_token_starts_line(yp_parser_t *parser) {
5376
5440
  // this token type.
5377
5441
  //
5378
5442
  static yp_token_type_t
5379
- lex_interpolation(yp_parser_t *parser, const char *pound) {
5443
+ lex_interpolation(yp_parser_t *parser, const uint8_t *pound) {
5380
5444
  // If there is no content following this #, then we're at the end of
5381
5445
  // the string and we can safely return string content.
5382
5446
  if (pound + 1 >= parser->end) {
@@ -5397,7 +5461,7 @@ lex_interpolation(yp_parser_t *parser, const char *pound) {
5397
5461
 
5398
5462
  // If we're looking at a @ and there's another @, then we'll skip past the
5399
5463
  // second @.
5400
- const char *variable = pound + 2;
5464
+ const uint8_t *variable = pound + 2;
5401
5465
  if (*variable == '@' && pound + 3 < parser->end) variable++;
5402
5466
 
5403
5467
  if (char_is_identifier_start(parser, variable)) {
@@ -5433,7 +5497,7 @@ lex_interpolation(yp_parser_t *parser, const char *pound) {
5433
5497
  // This is the character that we're going to check to see if it is the
5434
5498
  // start of an identifier that would indicate that this is a global
5435
5499
  // variable.
5436
- const char *check = pound + 2;
5500
+ const uint8_t *check = pound + 2;
5437
5501
 
5438
5502
  if (pound[2] == '-') {
5439
5503
  if (pound + 3 >= parser->end) {
@@ -5624,7 +5688,7 @@ parser_comment(yp_parser_t *parser, yp_comment_type_t type) {
5624
5688
  static yp_token_type_t
5625
5689
  lex_embdoc(yp_parser_t *parser) {
5626
5690
  // First, lex out the EMBDOC_BEGIN token.
5627
- const char *newline = next_newline(parser->current.end, parser->end - parser->current.end);
5691
+ const uint8_t *newline = next_newline(parser->current.end, parser->end - parser->current.end);
5628
5692
 
5629
5693
  if (newline == NULL) {
5630
5694
  parser->current.end = parser->end;
@@ -5647,9 +5711,9 @@ lex_embdoc(yp_parser_t *parser) {
5647
5711
 
5648
5712
  // If we've hit the end of the embedded documentation then we'll return that
5649
5713
  // token here.
5650
- if (strncmp(parser->current.end, "=end", 4) == 0 &&
5714
+ if (memcmp(parser->current.end, "=end", 4) == 0 &&
5651
5715
  (parser->current.end + 4 == parser->end || yp_char_is_whitespace(parser->current.end[4]))) {
5652
- const char *newline = next_newline(parser->current.end, parser->end - parser->current.end);
5716
+ const uint8_t *newline = next_newline(parser->current.end, parser->end - parser->current.end);
5653
5717
 
5654
5718
  if (newline == NULL) {
5655
5719
  parser->current.end = parser->end;
@@ -5669,7 +5733,7 @@ lex_embdoc(yp_parser_t *parser) {
5669
5733
 
5670
5734
  // Otherwise, we'll parse until the end of the line and return a line of
5671
5735
  // embedded documentation.
5672
- const char *newline = next_newline(parser->current.end, parser->end - parser->current.end);
5736
+ const uint8_t *newline = next_newline(parser->current.end, parser->end - parser->current.end);
5673
5737
 
5674
5738
  if (newline == NULL) {
5675
5739
  parser->current.end = parser->end;
@@ -5819,7 +5883,7 @@ parser_lex(yp_parser_t *parser) {
5819
5883
  LEX(YP_TOKEN_EOF);
5820
5884
 
5821
5885
  case '#': { // comments
5822
- const char *ending = next_newline(parser->current.end, parser->end - parser->current.end);
5886
+ const uint8_t *ending = next_newline(parser->current.end, parser->end - parser->current.end);
5823
5887
 
5824
5888
  parser->current.end = ending == NULL ? parser->end : ending + 1;
5825
5889
  parser->current.type = YP_TOKEN_COMMENT;
@@ -5888,7 +5952,7 @@ parser_lex(yp_parser_t *parser) {
5888
5952
  // (either . or &.) that starts the next line. If there is, then this
5889
5953
  // is going to become an ignored newline and we're going to instead
5890
5954
  // return the call operator.
5891
- const char *next_content = parser->next_start == NULL ? parser->current.end : parser->next_start;
5955
+ const uint8_t *next_content = parser->next_start == NULL ? parser->current.end : parser->next_start;
5892
5956
  next_content += yp_strspn_inline_whitespace(next_content, parser->end - next_content);
5893
5957
 
5894
5958
  if (next_content < parser->end) {
@@ -5899,15 +5963,15 @@ parser_lex(yp_parser_t *parser) {
5899
5963
  // Otherwise we'll return a regular newline.
5900
5964
  if (next_content[0] == '#') {
5901
5965
  // Here we look for a "." or "&." following a "\n".
5902
- const char *following = next_newline(next_content, parser->end - next_content);
5966
+ const uint8_t *following = next_newline(next_content, parser->end - next_content);
5903
5967
 
5904
- while (following && (following < parser->end)) {
5968
+ while (following && (following + 1 < parser->end)) {
5905
5969
  following++;
5906
5970
  following += yp_strspn_inline_whitespace(following, parser->end - following);
5907
5971
 
5908
5972
  // If this is not followed by a comment, then we can break out
5909
5973
  // of this loop.
5910
- if (*following != '#') break;
5974
+ if (peek_at(parser, following) != '#') break;
5911
5975
 
5912
5976
  // If there is a comment, then we need to find the end of the
5913
5977
  // comment and continue searching from there.
@@ -6150,7 +6214,7 @@ parser_lex(yp_parser_t *parser) {
6150
6214
 
6151
6215
  // = => =~ == === =begin
6152
6216
  case '=':
6153
- if (current_token_starts_line(parser) && strncmp(peek_string(parser, 5), "begin", 5) == 0 && yp_char_is_whitespace(peek_offset(parser, 5))) {
6217
+ if (current_token_starts_line(parser) && memcmp(peek_string(parser, 5), "begin", 5) == 0 && yp_char_is_whitespace(peek_offset(parser, 5))) {
6154
6218
  yp_token_type_t type = lex_embdoc(parser);
6155
6219
 
6156
6220
  if (type == YP_TOKEN_EOF) {
@@ -6188,7 +6252,7 @@ parser_lex(yp_parser_t *parser) {
6188
6252
  !lex_state_end_p(parser) &&
6189
6253
  (!lex_state_p(parser, YP_LEX_STATE_ARG_ANY) || lex_state_p(parser, YP_LEX_STATE_LABELED) || space_seen)
6190
6254
  ) {
6191
- const char *end = parser->current.end;
6255
+ const uint8_t *end = parser->current.end;
6192
6256
 
6193
6257
  yp_heredoc_quote_t quote = YP_HEREDOC_QUOTE_NONE;
6194
6258
  yp_heredoc_indent_t indent = YP_HEREDOC_INDENT_NONE;
@@ -6210,7 +6274,7 @@ parser_lex(yp_parser_t *parser) {
6210
6274
  quote = YP_HEREDOC_QUOTE_SINGLE;
6211
6275
  }
6212
6276
 
6213
- const char *ident_start = parser->current.end;
6277
+ const uint8_t *ident_start = parser->current.end;
6214
6278
  size_t width = 0;
6215
6279
 
6216
6280
  if (parser->current.end >= parser->end) {
@@ -6233,7 +6297,7 @@ parser_lex(yp_parser_t *parser) {
6233
6297
  }
6234
6298
 
6235
6299
  size_t ident_length = (size_t) (parser->current.end - ident_start);
6236
- if (quote != YP_HEREDOC_QUOTE_NONE && !match(parser, (char) quote)) {
6300
+ if (quote != YP_HEREDOC_QUOTE_NONE && !match(parser, (uint8_t) quote)) {
6237
6301
  // TODO: handle unterminated heredoc
6238
6302
  }
6239
6303
 
@@ -6249,7 +6313,7 @@ parser_lex(yp_parser_t *parser) {
6249
6313
  });
6250
6314
 
6251
6315
  if (parser->heredoc_end == NULL) {
6252
- const char *body_start = next_newline(parser->current.end, parser->end - parser->current.end);
6316
+ const uint8_t *body_start = next_newline(parser->current.end, parser->end - parser->current.end);
6253
6317
 
6254
6318
  if (body_start == NULL) {
6255
6319
  // If there is no newline after the heredoc identifier, then
@@ -6574,7 +6638,7 @@ parser_lex(yp_parser_t *parser) {
6574
6638
  LEX(YP_TOKEN_COLON_COLON);
6575
6639
  }
6576
6640
 
6577
- if (lex_state_end_p(parser) || yp_char_is_whitespace(*parser->current.end) || peek(parser) == '#') {
6641
+ if (lex_state_end_p(parser) || yp_char_is_whitespace(peek(parser)) || peek(parser) == '#') {
6578
6642
  lex_state_set(parser, YP_LEX_STATE_BEG);
6579
6643
  LEX(YP_TOKEN_COLON);
6580
6644
  }
@@ -6815,7 +6879,7 @@ parser_lex(yp_parser_t *parser) {
6815
6879
  if (
6816
6880
  ((parser->current.end - parser->current.start) == 7) &&
6817
6881
  current_token_starts_line(parser) &&
6818
- (strncmp(parser->current.start, "__END__", 7) == 0) &&
6882
+ (memcmp(parser->current.start, "__END__", 7) == 0) &&
6819
6883
  (parser->current.end == parser->end || match_eol(parser))
6820
6884
  )
6821
6885
  {
@@ -6891,8 +6955,8 @@ parser_lex(yp_parser_t *parser) {
6891
6955
  // Here we'll get a list of the places where strpbrk should break,
6892
6956
  // and then find the first one.
6893
6957
  yp_lex_mode_t *lex_mode = parser->lex_modes.current;
6894
- const char *breakpoints = lex_mode->as.list.breakpoints;
6895
- const char *breakpoint = yp_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
6958
+ const uint8_t *breakpoints = lex_mode->as.list.breakpoints;
6959
+ const uint8_t *breakpoint = yp_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
6896
6960
 
6897
6961
  while (breakpoint != NULL) {
6898
6962
  // If we hit a null byte, skip directly past it.
@@ -6940,10 +7004,25 @@ parser_lex(yp_parser_t *parser) {
6940
7004
  if (*breakpoint == '\\') {
6941
7005
  yp_unescape_type_t unescape_type = lex_mode->as.list.interpolation ? YP_UNESCAPE_ALL : YP_UNESCAPE_MINIMAL;
6942
7006
  size_t difference = yp_unescape_calculate_difference(parser, breakpoint, unescape_type, false);
7007
+ if (difference == 0) {
7008
+ // we're at the end of the file
7009
+ breakpoint = NULL;
7010
+ continue;
7011
+ }
6943
7012
 
6944
- // If the result is an escaped newline, then we need to
6945
- // track that newline.
6946
- yp_newline_list_check_append(&parser->newline_list, breakpoint + difference - 1);
7013
+ // If the result is an escaped newline ...
7014
+ if (breakpoint[difference - 1] == '\n') {
7015
+ if (parser->heredoc_end) {
7016
+ // ... if we are on the same line as a heredoc, flush the heredoc and
7017
+ // continue parsing after heredoc_end.
7018
+ parser->current.end = breakpoint + difference;
7019
+ parser_flush_heredoc_end(parser);
7020
+ LEX(YP_TOKEN_STRING_CONTENT);
7021
+ } else {
7022
+ // ... else track the newline.
7023
+ yp_newline_list_append(&parser->newline_list, breakpoint + difference - 1);
7024
+ }
7025
+ }
6947
7026
 
6948
7027
  breakpoint = yp_strpbrk(parser, breakpoint + difference, breakpoints, parser->end - (breakpoint + difference));
6949
7028
  continue;
@@ -6998,8 +7077,8 @@ parser_lex(yp_parser_t *parser) {
6998
7077
  // These are the places where we need to split up the content of the
6999
7078
  // regular expression. We'll use strpbrk to find the first of these
7000
7079
  // characters.
7001
- const char *breakpoints = lex_mode->as.regexp.breakpoints;
7002
- const char *breakpoint = yp_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
7080
+ const uint8_t *breakpoints = lex_mode->as.regexp.breakpoints;
7081
+ const uint8_t *breakpoint = yp_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
7003
7082
 
7004
7083
  while (breakpoint != NULL) {
7005
7084
  // If we hit a null byte, skip directly past it.
@@ -7062,9 +7141,14 @@ parser_lex(yp_parser_t *parser) {
7062
7141
  // and find the next breakpoint.
7063
7142
  if (*breakpoint == '\\') {
7064
7143
  size_t difference = yp_unescape_calculate_difference(parser, breakpoint, YP_UNESCAPE_ALL, false);
7144
+ if (difference == 0) {
7145
+ // we're at the end of the file
7146
+ breakpoint = NULL;
7147
+ continue;
7148
+ }
7065
7149
 
7066
7150
  // If the result is an escaped newline ...
7067
- if (*(breakpoint + difference - 1) == '\n') {
7151
+ if (breakpoint[difference - 1] == '\n') {
7068
7152
  if (parser->heredoc_end) {
7069
7153
  // ... if we are on the same line as a heredoc, flush the heredoc and
7070
7154
  // continue parsing after heredoc_end.
@@ -7126,8 +7210,8 @@ parser_lex(yp_parser_t *parser) {
7126
7210
 
7127
7211
  // These are the places where we need to split up the content of the
7128
7212
  // string. We'll use strpbrk to find the first of these characters.
7129
- const char *breakpoints = parser->lex_modes.current->as.string.breakpoints;
7130
- const char *breakpoint = yp_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
7213
+ const uint8_t *breakpoints = parser->lex_modes.current->as.string.breakpoints;
7214
+ const uint8_t *breakpoint = yp_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
7131
7215
 
7132
7216
  while (breakpoint != NULL) {
7133
7217
  // If we hit the incrementor, then we'll increment then nesting and
@@ -7212,9 +7296,14 @@ parser_lex(yp_parser_t *parser) {
7212
7296
  // find the next breakpoint.
7213
7297
  yp_unescape_type_t unescape_type = parser->lex_modes.current->as.string.interpolation ? YP_UNESCAPE_ALL : YP_UNESCAPE_MINIMAL;
7214
7298
  size_t difference = yp_unescape_calculate_difference(parser, breakpoint, unescape_type, false);
7299
+ if (difference == 0) {
7300
+ // we're at the end of the file
7301
+ breakpoint = NULL;
7302
+ break;
7303
+ }
7215
7304
 
7216
7305
  // If the result is an escaped newline ...
7217
- if (*(breakpoint + difference - 1) == '\n') {
7306
+ if (breakpoint[difference - 1] == '\n') {
7218
7307
  if (parser->heredoc_end) {
7219
7308
  // ... if we are on the same line as a heredoc, flush the heredoc and
7220
7309
  // continue parsing after heredoc_end.
@@ -7272,18 +7361,18 @@ parser_lex(yp_parser_t *parser) {
7272
7361
 
7273
7362
  // Now let's grab the information about the identifier off of the current
7274
7363
  // lex mode.
7275
- const char *ident_start = parser->lex_modes.current->as.heredoc.ident_start;
7364
+ const uint8_t *ident_start = parser->lex_modes.current->as.heredoc.ident_start;
7276
7365
  size_t ident_length = parser->lex_modes.current->as.heredoc.ident_length;
7277
7366
 
7278
7367
  // If we are immediately following a newline and we have hit the
7279
7368
  // terminator, then we need to return the ending of the heredoc.
7280
7369
  if (current_token_starts_line(parser)) {
7281
- const char *start = parser->current.start;
7370
+ const uint8_t *start = parser->current.start;
7282
7371
  if (parser->lex_modes.current->as.heredoc.indent != YP_HEREDOC_INDENT_NONE) {
7283
7372
  start += yp_strspn_inline_whitespace(start, parser->end - start);
7284
7373
  }
7285
7374
 
7286
- if ((start + ident_length <= parser->end) && (strncmp(start, ident_start, ident_length) == 0)) {
7375
+ if ((start + ident_length <= parser->end) && (memcmp(start, ident_start, ident_length) == 0)) {
7287
7376
  bool matched = true;
7288
7377
  bool at_end = false;
7289
7378
 
@@ -7318,14 +7407,14 @@ parser_lex(yp_parser_t *parser) {
7318
7407
  // Otherwise we'll be parsing string content. These are the places where
7319
7408
  // we need to split up the content of the heredoc. We'll use strpbrk to
7320
7409
  // find the first of these characters.
7321
- char breakpoints[] = "\n\\#";
7410
+ uint8_t breakpoints[] = "\n\\#";
7322
7411
 
7323
7412
  yp_heredoc_quote_t quote = parser->lex_modes.current->as.heredoc.quote;
7324
7413
  if (quote == YP_HEREDOC_QUOTE_SINGLE) {
7325
7414
  breakpoints[2] = '\0';
7326
7415
  }
7327
7416
 
7328
- const char *breakpoint = yp_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
7417
+ const uint8_t *breakpoint = yp_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
7329
7418
 
7330
7419
  while (breakpoint != NULL) {
7331
7420
  switch (*breakpoint) {
@@ -7342,7 +7431,7 @@ parser_lex(yp_parser_t *parser) {
7342
7431
 
7343
7432
  yp_newline_list_append(&parser->newline_list, breakpoint);
7344
7433
 
7345
- const char *start = breakpoint + 1;
7434
+ const uint8_t *start = breakpoint + 1;
7346
7435
  if (parser->lex_modes.current->as.heredoc.indent != YP_HEREDOC_INDENT_NONE) {
7347
7436
  start += yp_strspn_inline_whitespace(start, parser->end - start);
7348
7437
  }
@@ -7353,7 +7442,7 @@ parser_lex(yp_parser_t *parser) {
7353
7442
  // again and return the end of the heredoc.
7354
7443
  if (
7355
7444
  (start + ident_length <= parser->end) &&
7356
- (strncmp(start, ident_start, ident_length) == 0)
7445
+ (memcmp(start, ident_start, ident_length) == 0)
7357
7446
  ) {
7358
7447
  // Heredoc terminators must be followed by a newline, CRLF, or EOF to be valid.
7359
7448
  if (
@@ -7383,6 +7472,11 @@ parser_lex(yp_parser_t *parser) {
7383
7472
  } else {
7384
7473
  yp_unescape_type_t unescape_type = (quote == YP_HEREDOC_QUOTE_SINGLE) ? YP_UNESCAPE_MINIMAL : YP_UNESCAPE_ALL;
7385
7474
  size_t difference = yp_unescape_calculate_difference(parser, breakpoint, unescape_type, false);
7475
+ if (difference == 0) {
7476
+ // we're at the end of the file
7477
+ breakpoint = NULL;
7478
+ break;
7479
+ }
7386
7480
 
7387
7481
  yp_newline_list_check_append(&parser->newline_list, breakpoint + difference - 1);
7388
7482
 
@@ -7453,6 +7547,17 @@ yp_symbol_node_create_and_unescape(yp_parser_t *parser, const yp_token_t *openin
7453
7547
  return node;
7454
7548
  }
7455
7549
 
7550
+ static yp_string_node_t *
7551
+ yp_char_literal_node_create_and_unescape(yp_parser_t *parser, const yp_token_t *opening, const yp_token_t *content, const yp_token_t *closing, yp_unescape_type_t unescape_type) {
7552
+ yp_string_node_t *node = yp_string_node_create(parser, opening, content, closing);
7553
+
7554
+ assert((content->end - content->start) >= 0);
7555
+ yp_string_shared_init(&node->unescaped, content->start, content->end);
7556
+
7557
+ yp_unescape_manipulate_char_literal(parser, &node->unescaped, unescape_type);
7558
+ return node;
7559
+ }
7560
+
7456
7561
  static yp_string_node_t *
7457
7562
  yp_string_node_create_and_unescape(yp_parser_t *parser, const yp_token_t *opening, const yp_token_t *content, const yp_token_t *closing, yp_unescape_type_t unescape_type) {
7458
7563
  yp_string_node_t *node = yp_string_node_create(parser, opening, content, closing);
@@ -7918,10 +8023,11 @@ parse_target(yp_parser_t *parser, yp_node_t *target) {
7918
8023
  // the previous method name in, and append an =.
7919
8024
  size_t length = yp_string_length(&call->name);
7920
8025
 
7921
- char *name = calloc(length + 2, sizeof(char));
8026
+ uint8_t *name = calloc(length + 1, sizeof(uint8_t));
7922
8027
  if (name == NULL) return NULL;
7923
8028
 
7924
- snprintf(name, length + 2, "%.*s=", (int) length, yp_string_source(&call->name));
8029
+ memcpy(name, yp_string_source(&call->name), length);
8030
+ name[length] = '=';
7925
8031
 
7926
8032
  // Now switch the name to the new string.
7927
8033
  yp_string_free(&call->name);
@@ -7962,7 +8068,7 @@ parse_write(yp_parser_t *parser, yp_node_t *target, yp_token_t *operator, yp_nod
7962
8068
  case YP_NODE_MISSING_NODE:
7963
8069
  return target;
7964
8070
  case YP_NODE_CLASS_VARIABLE_READ_NODE: {
7965
- yp_class_variable_write_node_t *write_node = yp_class_variable_read_node_to_class_variable_write_node(parser, (yp_class_variable_read_node_t *) target, operator, value);
8071
+ yp_class_variable_write_node_t *write_node = yp_class_variable_write_node_create(parser, (yp_class_variable_read_node_t *) target, operator, value);
7966
8072
  yp_node_destroy(parser, target);
7967
8073
  return (yp_node_t *) write_node;
7968
8074
  }
@@ -7987,7 +8093,7 @@ parse_write(yp_parser_t *parser, yp_node_t *target, yp_token_t *operator, yp_nod
7987
8093
  case YP_NODE_LOCAL_VARIABLE_READ_NODE: {
7988
8094
  yp_local_variable_read_node_t *local_read = (yp_local_variable_read_node_t *) target;
7989
8095
 
7990
- yp_constant_id_t constant_id = local_read->constant_id;
8096
+ yp_constant_id_t constant_id = local_read->name;
7991
8097
  uint32_t depth = local_read->depth;
7992
8098
 
7993
8099
  yp_location_t name_loc = target->location;
@@ -8075,10 +8181,11 @@ parse_write(yp_parser_t *parser, yp_node_t *target, yp_token_t *operator, yp_nod
8075
8181
  // the previous method name in, and append an =.
8076
8182
  size_t length = yp_string_length(&call->name);
8077
8183
 
8078
- char *name = calloc(length + 2, sizeof(char));
8184
+ uint8_t *name = calloc(length + 1, sizeof(uint8_t));
8079
8185
  if (name == NULL) return NULL;
8080
8186
 
8081
- snprintf(name, length + 2, "%.*s=", (int) length, yp_string_source(&call->name));
8187
+ memcpy(name, yp_string_source(&call->name), length);
8188
+ name[length] = '=';
8082
8189
 
8083
8190
  // Now switch the name to the new string.
8084
8191
  yp_string_free(&call->name);
@@ -9043,10 +9150,12 @@ parse_rescues(yp_parser_t *parser, yp_begin_node_t *parent_node) {
9043
9150
  }
9044
9151
 
9045
9152
  if (!match_any_type_p(parser, 3, YP_TOKEN_KEYWORD_ELSE, YP_TOKEN_KEYWORD_ENSURE, YP_TOKEN_KEYWORD_END)) {
9153
+ yp_accepts_block_stack_push(parser, true);
9046
9154
  yp_statements_node_t *statements = parse_statements(parser, YP_CONTEXT_RESCUE);
9047
9155
  if (statements) {
9048
9156
  yp_rescue_node_statements_set(rescue, statements);
9049
9157
  }
9158
+ yp_accepts_block_stack_pop(parser);
9050
9159
  accept_any(parser, 2, YP_TOKEN_NEWLINE, YP_TOKEN_SEMICOLON);
9051
9160
  }
9052
9161
 
@@ -9063,7 +9172,7 @@ parse_rescues(yp_parser_t *parser, yp_begin_node_t *parent_node) {
9063
9172
  // since we won't know the end until we've found all consequent
9064
9173
  // clauses. This sets the end location on all rescues once we know it
9065
9174
  if (current) {
9066
- const char *end_to_set = current->base.location.end;
9175
+ const uint8_t *end_to_set = current->base.location.end;
9067
9176
  current = parent_node->rescue_clause;
9068
9177
  while (current) {
9069
9178
  current->base.location.end = end_to_set;
@@ -9077,7 +9186,9 @@ parse_rescues(yp_parser_t *parser, yp_begin_node_t *parent_node) {
9077
9186
 
9078
9187
  yp_statements_node_t *else_statements = NULL;
9079
9188
  if (!match_any_type_p(parser, 2, YP_TOKEN_KEYWORD_END, YP_TOKEN_KEYWORD_ENSURE)) {
9189
+ yp_accepts_block_stack_push(parser, true);
9080
9190
  else_statements = parse_statements(parser, YP_CONTEXT_RESCUE_ELSE);
9191
+ yp_accepts_block_stack_pop(parser);
9081
9192
  accept_any(parser, 2, YP_TOKEN_NEWLINE, YP_TOKEN_SEMICOLON);
9082
9193
  }
9083
9194
 
@@ -9091,7 +9202,9 @@ parse_rescues(yp_parser_t *parser, yp_begin_node_t *parent_node) {
9091
9202
 
9092
9203
  yp_statements_node_t *ensure_statements = NULL;
9093
9204
  if (!match_type_p(parser, YP_TOKEN_KEYWORD_END)) {
9205
+ yp_accepts_block_stack_push(parser, true);
9094
9206
  ensure_statements = parse_statements(parser, YP_CONTEXT_ENSURE);
9207
+ yp_accepts_block_stack_pop(parser);
9095
9208
  accept_any(parser, 2, YP_TOKEN_NEWLINE, YP_TOKEN_SEMICOLON);
9096
9209
  }
9097
9210
 
@@ -9116,7 +9229,7 @@ parse_rescues_as_begin(yp_parser_t *parser, yp_statements_node_t *statements) {
9116
9229
  // All nodes within a begin node are optional, so we look
9117
9230
  // for the earliest possible node that we can use to set
9118
9231
  // the BeginNode's start location
9119
- const char * start = begin_node->base.location.start;
9232
+ const uint8_t *start = begin_node->base.location.start;
9120
9233
  if (begin_node->statements) {
9121
9234
  start = begin_node->statements->base.location.start;
9122
9235
  } else if (begin_node->rescue_clause) {
@@ -9201,7 +9314,9 @@ parse_block(yp_parser_t *parser) {
9201
9314
  } else {
9202
9315
  if (!match_type_p(parser, YP_TOKEN_KEYWORD_END)) {
9203
9316
  if (!match_any_type_p(parser, 3, YP_TOKEN_KEYWORD_RESCUE, YP_TOKEN_KEYWORD_ELSE, YP_TOKEN_KEYWORD_ENSURE)) {
9317
+ yp_accepts_block_stack_push(parser, true);
9204
9318
  statements = (yp_node_t *) parse_statements(parser, YP_CONTEXT_BLOCK_KEYWORDS);
9319
+ yp_accepts_block_stack_pop(parser);
9205
9320
  }
9206
9321
 
9207
9322
  if (match_any_type_p(parser, 2, YP_TOKEN_KEYWORD_RESCUE, YP_TOKEN_KEYWORD_ENSURE)) {
@@ -9782,14 +9897,14 @@ parse_heredoc_common_whitespace(yp_parser_t *parser, yp_node_list_t *nodes) {
9782
9897
  yp_node_t *node = nodes->nodes[index];
9783
9898
 
9784
9899
  if (!YP_NODE_TYPE_P(node, YP_NODE_STRING_NODE)) continue;
9785
- yp_location_t *content_loc = &((yp_string_node_t *) node)->content_loc;
9900
+ const yp_location_t *content_loc = &((yp_string_node_t *) node)->content_loc;
9786
9901
 
9787
9902
  // If the previous node wasn't a string node, we don't want to trim
9788
9903
  // whitespace. This could happen after an interpolated expression or
9789
9904
  // variable.
9790
9905
  if (index == 0 || YP_NODE_TYPE_P(nodes->nodes[index - 1], YP_NODE_STRING_NODE)) {
9791
9906
  int cur_whitespace;
9792
- const char *cur_char = content_loc->start;
9907
+ const uint8_t *cur_char = content_loc->start;
9793
9908
 
9794
9909
  while (cur_char && cur_char < content_loc->end) {
9795
9910
  // Any empty newlines aren't included in the minimum whitespace
@@ -9880,15 +9995,15 @@ parse_heredoc_dedent(yp_parser_t *parser, yp_node_t *node, yp_heredoc_quote_t qu
9880
9995
  // destination to move bytes into. We'll also use it for bounds checking
9881
9996
  // since we don't require that these strings be null terminated.
9882
9997
  size_t dest_length = yp_string_length(string);
9883
- char *source_start = string->source;
9998
+ uint8_t *source_start = (uint8_t *) string->source;
9884
9999
 
9885
- const char *source_cursor = source_start;
9886
- const char *source_end = source_cursor + dest_length;
10000
+ const uint8_t *source_cursor = source_start;
10001
+ const uint8_t *source_end = source_cursor + dest_length;
9887
10002
 
9888
10003
  // We're going to move bytes backward in the string when we get leading
9889
10004
  // whitespace, so we'll maintain a pointer to the current position in the
9890
10005
  // string that we're writing to.
9891
- char *dest_cursor = source_start;
10006
+ uint8_t *dest_cursor = source_start;
9892
10007
 
9893
10008
  while (source_cursor < source_end) {
9894
10009
  // If we need to dedent the next element within the heredoc or the next
@@ -9915,7 +10030,7 @@ parse_heredoc_dedent(yp_parser_t *parser, yp_node_t *node, yp_heredoc_quote_t qu
9915
10030
 
9916
10031
  // At this point we have dedented all that we need to, so we need to find
9917
10032
  // the next newline.
9918
- const char *breakpoint = next_newline(source_cursor, source_end - source_cursor);
10033
+ const uint8_t *breakpoint = next_newline(source_cursor, source_end - source_cursor);
9919
10034
 
9920
10035
  if (breakpoint == NULL) {
9921
10036
  // If there isn't another newline, then we can just move the rest of the
@@ -10127,7 +10242,7 @@ parse_pattern_hash(yp_parser_t *parser, yp_node_t *first_assoc) {
10127
10242
  yp_node_t *key = ((yp_assoc_node_t *) first_assoc)->key;
10128
10243
 
10129
10244
  if (YP_NODE_TYPE_P(key, YP_NODE_SYMBOL_NODE)) {
10130
- yp_location_t *value_loc = &((yp_symbol_node_t *) key)->value_loc;
10245
+ const yp_location_t *value_loc = &((yp_symbol_node_t *) key)->value_loc;
10131
10246
  yp_parser_local_add_location(parser, value_loc->start, value_loc->end);
10132
10247
  }
10133
10248
  }
@@ -10155,7 +10270,7 @@ parse_pattern_hash(yp_parser_t *parser, yp_node_t *first_assoc) {
10155
10270
  if (!match_any_type_p(parser, 7, YP_TOKEN_COMMA, YP_TOKEN_KEYWORD_THEN, YP_TOKEN_BRACE_RIGHT, YP_TOKEN_BRACKET_RIGHT, YP_TOKEN_PARENTHESIS_RIGHT, YP_TOKEN_NEWLINE, YP_TOKEN_SEMICOLON)) {
10156
10271
  value = parse_pattern(parser, false, "Expected a pattern expression after the key.");
10157
10272
  } else {
10158
- yp_location_t *value_loc = &((yp_symbol_node_t *) key)->value_loc;
10273
+ const yp_location_t *value_loc = &((yp_symbol_node_t *) key)->value_loc;
10159
10274
  yp_parser_local_add_location(parser, value_loc->start, value_loc->end);
10160
10275
  }
10161
10276
 
@@ -10817,7 +10932,7 @@ parse_expression_prefix(yp_parser_t *parser, yp_binding_power_t binding_power) {
10817
10932
 
10818
10933
  yp_token_t closing = not_provided(parser);
10819
10934
 
10820
- return (yp_node_t *) yp_string_node_create_and_unescape(parser, &opening, &content, &closing, YP_UNESCAPE_ALL);
10935
+ return (yp_node_t *) yp_char_literal_node_create_and_unescape(parser, &opening, &content, &closing, YP_UNESCAPE_ALL);
10821
10936
  }
10822
10937
  case YP_TOKEN_CLASS_VARIABLE: {
10823
10938
  parser_lex(parser);
@@ -11362,7 +11477,9 @@ parse_expression_prefix(yp_parser_t *parser, yp_binding_power_t binding_power) {
11362
11477
 
11363
11478
  yp_node_t *statements = NULL;
11364
11479
  if (!match_any_type_p(parser, 3, YP_TOKEN_KEYWORD_RESCUE, YP_TOKEN_KEYWORD_ENSURE, YP_TOKEN_KEYWORD_END)) {
11480
+ yp_accepts_block_stack_push(parser, true);
11365
11481
  statements = (yp_node_t *) parse_statements(parser, YP_CONTEXT_SCLASS);
11482
+ yp_accepts_block_stack_pop(parser);
11366
11483
  }
11367
11484
 
11368
11485
  if (match_any_type_p(parser, 2, YP_TOKEN_KEYWORD_RESCUE, YP_TOKEN_KEYWORD_ENSURE)) {
@@ -11643,7 +11760,9 @@ parse_expression_prefix(yp_parser_t *parser, yp_binding_power_t binding_power) {
11643
11760
  yp_do_loop_stack_push(parser, false);
11644
11761
 
11645
11762
  if (!match_any_type_p(parser, 3, YP_TOKEN_KEYWORD_RESCUE, YP_TOKEN_KEYWORD_ENSURE, YP_TOKEN_KEYWORD_END)) {
11763
+ yp_accepts_block_stack_push(parser, true);
11646
11764
  statements = (yp_node_t *) parse_statements(parser, YP_CONTEXT_DEF);
11765
+ yp_accepts_block_stack_pop(parser);
11647
11766
  }
11648
11767
 
11649
11768
  if (match_any_type_p(parser, 2, YP_TOKEN_KEYWORD_RESCUE, YP_TOKEN_KEYWORD_ENSURE)) {
@@ -11933,14 +12052,9 @@ parse_expression_prefix(yp_parser_t *parser, yp_binding_power_t binding_power) {
11933
12052
  yp_array_node_t *array = yp_array_node_create(parser, &parser->previous);
11934
12053
 
11935
12054
  while (!match_any_type_p(parser, 2, YP_TOKEN_STRING_END, YP_TOKEN_EOF)) {
11936
- if (yp_array_node_size(array) == 0) {
11937
- accept(parser, YP_TOKEN_WORDS_SEP);
11938
- } else {
11939
- expect(parser, YP_TOKEN_WORDS_SEP, "Expected a separator for the symbols in a `%i` list.");
11940
- if (match_type_p(parser, YP_TOKEN_STRING_END)) break;
11941
- }
11942
-
12055
+ accept(parser, YP_TOKEN_WORDS_SEP);
11943
12056
  if (match_type_p(parser, YP_TOKEN_STRING_END)) break;
12057
+
11944
12058
  expect(parser, YP_TOKEN_STRING_CONTENT, "Expected a symbol in a `%i` list.");
11945
12059
 
11946
12060
  yp_token_t opening = not_provided(parser);
@@ -11995,6 +12109,19 @@ parse_expression_prefix(yp_parser_t *parser, yp_binding_power_t binding_power) {
11995
12109
  // to the list of child nodes.
11996
12110
  yp_node_t *part = parse_string_part(parser);
11997
12111
  yp_interpolated_symbol_node_append((yp_interpolated_symbol_node_t *) current, part);
12112
+ } else if (YP_NODE_TYPE_P(current, YP_NODE_SYMBOL_NODE)) {
12113
+ // If we hit string content and the current node is a string node,
12114
+ // then we need to convert the current node into an interpolated
12115
+ // string and add the string content to the list of child nodes.
12116
+ yp_token_t opening = not_provided(parser);
12117
+ yp_token_t closing = not_provided(parser);
12118
+ yp_interpolated_symbol_node_t *interpolated =
12119
+ yp_interpolated_symbol_node_create(parser, &opening, NULL, &closing);
12120
+ yp_interpolated_symbol_node_append(interpolated, current);
12121
+
12122
+ yp_node_t *part = parse_string_part(parser);
12123
+ yp_interpolated_symbol_node_append(interpolated, part);
12124
+ current = (yp_node_t *) interpolated;
11998
12125
  } else {
11999
12126
  assert(false && "unreachable");
12000
12127
  }
@@ -12097,12 +12224,9 @@ parse_expression_prefix(yp_parser_t *parser, yp_binding_power_t binding_power) {
12097
12224
  accept(parser, YP_TOKEN_WORDS_SEP);
12098
12225
 
12099
12226
  while (!match_any_type_p(parser, 2, YP_TOKEN_STRING_END, YP_TOKEN_EOF)) {
12100
- if (yp_array_node_size(array) == 0) {
12101
- accept(parser, YP_TOKEN_WORDS_SEP);
12102
- } else {
12103
- expect(parser, YP_TOKEN_WORDS_SEP, "Expected a separator for the strings in a `%w` list.");
12104
- if (match_type_p(parser, YP_TOKEN_STRING_END)) break;
12105
- }
12227
+ accept(parser, YP_TOKEN_WORDS_SEP);
12228
+ if (match_type_p(parser, YP_TOKEN_STRING_END)) break;
12229
+
12106
12230
  expect(parser, YP_TOKEN_STRING_CONTENT, "Expected a string in a `%w` list.");
12107
12231
 
12108
12232
  yp_token_t opening = not_provided(parser);
@@ -12152,6 +12276,19 @@ parse_expression_prefix(yp_parser_t *parser, yp_binding_power_t binding_power) {
12152
12276
  // to the list of child nodes.
12153
12277
  yp_node_t *part = parse_string_part(parser);
12154
12278
  yp_interpolated_string_node_append((yp_interpolated_string_node_t *) current, part);
12279
+ } else if (YP_NODE_TYPE_P(current, YP_NODE_STRING_NODE)) {
12280
+ // If we hit string content and the current node is a string node,
12281
+ // then we need to convert the current node into an interpolated
12282
+ // string and add the string content to the list of child nodes.
12283
+ yp_token_t opening = not_provided(parser);
12284
+ yp_token_t closing = not_provided(parser);
12285
+ yp_interpolated_string_node_t *interpolated =
12286
+ yp_interpolated_string_node_create(parser, &opening, NULL, &closing);
12287
+ yp_interpolated_string_node_append(interpolated, current);
12288
+
12289
+ yp_node_t *part = parse_string_part(parser);
12290
+ yp_interpolated_string_node_append(interpolated, part);
12291
+ current = (yp_node_t *) interpolated;
12155
12292
  } else {
12156
12293
  assert(false && "unreachable");
12157
12294
  }
@@ -12482,7 +12619,9 @@ parse_expression_prefix(yp_parser_t *parser, yp_binding_power_t binding_power) {
12482
12619
  opening = parser->previous;
12483
12620
 
12484
12621
  if (!match_any_type_p(parser, 3, YP_TOKEN_KEYWORD_END, YP_TOKEN_KEYWORD_RESCUE, YP_TOKEN_KEYWORD_ENSURE)) {
12622
+ yp_accepts_block_stack_push(parser, true);
12485
12623
  body = (yp_node_t *) parse_statements(parser, YP_CONTEXT_LAMBDA_DO_END);
12624
+ yp_accepts_block_stack_pop(parser);
12486
12625
  }
12487
12626
 
12488
12627
  if (match_any_type_p(parser, 2, YP_TOKEN_KEYWORD_RESCUE, YP_TOKEN_KEYWORD_ENSURE)) {
@@ -12759,7 +12898,7 @@ parse_expression_infix(yp_parser_t *parser, yp_node_t *node, yp_binding_power_t
12759
12898
  parser_lex(parser);
12760
12899
 
12761
12900
  yp_node_t *value = parse_expression(parser, binding_power, "Expected a value after &&=");
12762
- yp_node_t *result = (yp_node_t *) yp_class_variable_and_write_node_create(parser, node, &token, value);
12901
+ yp_node_t *result = (yp_node_t *) yp_class_variable_and_write_node_create(parser, (yp_class_variable_read_node_t *) node, &token, value);
12763
12902
 
12764
12903
  yp_node_destroy(parser, node);
12765
12904
  return result;
@@ -12783,7 +12922,7 @@ parse_expression_infix(yp_parser_t *parser, yp_node_t *node, yp_binding_power_t
12783
12922
  parser_lex(parser);
12784
12923
 
12785
12924
  yp_node_t *value = parse_expression(parser, binding_power, "Expected a value after &&=");
12786
- yp_node_t *result = (yp_node_t *) yp_instance_variable_and_write_node_create(parser, node, &token, value);
12925
+ yp_node_t *result = (yp_node_t *) yp_instance_variable_and_write_node_create(parser, (yp_instance_variable_read_node_t *) node, &token, value);
12787
12926
 
12788
12927
  yp_node_destroy(parser, node);
12789
12928
  return result;
@@ -12793,7 +12932,7 @@ parse_expression_infix(yp_parser_t *parser, yp_node_t *node, yp_binding_power_t
12793
12932
  parser_lex(parser);
12794
12933
 
12795
12934
  yp_node_t *value = parse_expression(parser, binding_power, "Expected a value after &&=");
12796
- yp_node_t *result = (yp_node_t *) yp_local_variable_and_write_node_create(parser, node, &token, value, cast->constant_id, cast->depth);
12935
+ yp_node_t *result = (yp_node_t *) yp_local_variable_and_write_node_create(parser, node, &token, value, cast->name, cast->depth);
12797
12936
 
12798
12937
  yp_node_destroy(parser, node);
12799
12938
  return result;
@@ -12860,7 +12999,7 @@ parse_expression_infix(yp_parser_t *parser, yp_node_t *node, yp_binding_power_t
12860
12999
  parser_lex(parser);
12861
13000
 
12862
13001
  yp_node_t *value = parse_expression(parser, binding_power, "Expected a value after ||=");
12863
- yp_node_t *result = (yp_node_t *) yp_class_variable_or_write_node_create(parser, node, &token, value);
13002
+ yp_node_t *result = (yp_node_t *) yp_class_variable_or_write_node_create(parser, (yp_class_variable_read_node_t *) node, &token, value);
12864
13003
 
12865
13004
  yp_node_destroy(parser, node);
12866
13005
  return result;
@@ -12884,7 +13023,7 @@ parse_expression_infix(yp_parser_t *parser, yp_node_t *node, yp_binding_power_t
12884
13023
  parser_lex(parser);
12885
13024
 
12886
13025
  yp_node_t *value = parse_expression(parser, binding_power, "Expected a value after ||=");
12887
- yp_node_t *result = (yp_node_t *) yp_instance_variable_or_write_node_create(parser, node, &token, value);
13026
+ yp_node_t *result = (yp_node_t *) yp_instance_variable_or_write_node_create(parser, (yp_instance_variable_read_node_t *) node, &token, value);
12888
13027
 
12889
13028
  yp_node_destroy(parser, node);
12890
13029
  return result;
@@ -12894,7 +13033,7 @@ parse_expression_infix(yp_parser_t *parser, yp_node_t *node, yp_binding_power_t
12894
13033
  parser_lex(parser);
12895
13034
 
12896
13035
  yp_node_t *value = parse_expression(parser, binding_power, "Expected a value after ||=");
12897
- yp_node_t *result = (yp_node_t *) yp_local_variable_or_write_node_create(parser, node, &token, value, cast->constant_id, cast->depth);
13036
+ yp_node_t *result = (yp_node_t *) yp_local_variable_or_write_node_create(parser, node, &token, value, cast->name, cast->depth);
12898
13037
 
12899
13038
  yp_node_destroy(parser, node);
12900
13039
  return result;
@@ -12971,7 +13110,7 @@ parse_expression_infix(yp_parser_t *parser, yp_node_t *node, yp_binding_power_t
12971
13110
  parser_lex(parser);
12972
13111
 
12973
13112
  yp_node_t *value = parse_expression(parser, binding_power, "Expected a value after the operator.");
12974
- yp_node_t *result = (yp_node_t *) yp_class_variable_operator_write_node_create(parser, node, &token, value);
13113
+ yp_node_t *result = (yp_node_t *) yp_class_variable_operator_write_node_create(parser, (yp_class_variable_read_node_t *) node, &token, value);
12975
13114
 
12976
13115
  yp_node_destroy(parser, node);
12977
13116
  return result;
@@ -12995,7 +13134,7 @@ parse_expression_infix(yp_parser_t *parser, yp_node_t *node, yp_binding_power_t
12995
13134
  parser_lex(parser);
12996
13135
 
12997
13136
  yp_node_t *value = parse_expression(parser, binding_power, "Expected a value after the operator.");
12998
- yp_node_t *result = (yp_node_t *) yp_instance_variable_operator_write_node_create(parser, node, &token, value);
13137
+ yp_node_t *result = (yp_node_t *) yp_instance_variable_operator_write_node_create(parser, (yp_instance_variable_read_node_t *) node, &token, value);
12999
13138
 
13000
13139
  yp_node_destroy(parser, node);
13001
13140
  return result;
@@ -13005,7 +13144,7 @@ parse_expression_infix(yp_parser_t *parser, yp_node_t *node, yp_binding_power_t
13005
13144
  parser_lex(parser);
13006
13145
 
13007
13146
  yp_node_t *value = parse_expression(parser, binding_power, "Expected a value after the operator.");
13008
- yp_node_t *result = (yp_node_t *) yp_local_variable_operator_write_node_create(parser, node, &token, value, cast->constant_id, cast->depth);
13147
+ yp_node_t *result = (yp_node_t *) yp_local_variable_operator_write_node_create(parser, node, &token, value, cast->name, cast->depth);
13009
13148
 
13010
13149
  yp_node_destroy(parser, node);
13011
13150
  return result;
@@ -13083,7 +13222,7 @@ parse_expression_infix(yp_parser_t *parser, yp_node_t *node, yp_binding_power_t
13083
13222
  yp_string_list_t named_captures;
13084
13223
  yp_string_list_init(&named_captures);
13085
13224
 
13086
- yp_location_t *content_loc = &((yp_regular_expression_node_t *) node)->content_loc;
13225
+ const yp_location_t *content_loc = &((yp_regular_expression_node_t *) node)->content_loc;
13087
13226
 
13088
13227
  if (yp_regexp_named_capture_group_names(content_loc->start, (size_t) (content_loc->end - content_loc->start), &named_captures, parser->encoding_changed, &parser->encoding)) {
13089
13228
  for (size_t index = 0; index < named_captures.length; index++) {
@@ -13507,7 +13646,7 @@ yp_parser_metadata(yp_parser_t *parser, const char *metadata) {
13507
13646
  uint32_t local_size = yp_metadata_read_u32(metadata);
13508
13647
  metadata += 4;
13509
13648
 
13510
- yp_parser_local_add_location(parser, metadata, metadata + local_size);
13649
+ yp_parser_local_add_location(parser, (const uint8_t *) metadata, (const uint8_t *) (metadata + local_size));
13511
13650
  metadata += local_size;
13512
13651
  }
13513
13652
  }
@@ -13519,7 +13658,7 @@ yp_parser_metadata(yp_parser_t *parser, const char *metadata) {
13519
13658
 
13520
13659
  // Initialize a parser with the given start and end pointers.
13521
13660
  YP_EXPORTED_FUNCTION void
13522
- yp_parser_init(yp_parser_t *parser, const char *source, size_t size, const char *filepath) {
13661
+ yp_parser_init(yp_parser_t *parser, const uint8_t *source, size_t size, const char *filepath) {
13523
13662
  assert(source != NULL);
13524
13663
 
13525
13664
  // Set filepath to the file that was passed
@@ -13591,7 +13730,7 @@ yp_parser_init(yp_parser_t *parser, const char *source, size_t size, const char
13591
13730
  yp_newline_list_init(&parser->newline_list, source, newline_size < 4 ? 4 : newline_size);
13592
13731
 
13593
13732
  // Skip past the UTF-8 BOM if it exists.
13594
- if (size >= 3 && (unsigned char) source[0] == 0xef && (unsigned char) source[1] == 0xbb && (unsigned char) source[2] == 0xbf) {
13733
+ if (size >= 3 && source[0] == 0xef && source[1] == 0xbb && source[2] == 0xbf) {
13595
13734
  parser->current.end += 3;
13596
13735
  parser->encoding_comment_start += 3;
13597
13736
  }
@@ -13599,7 +13738,7 @@ yp_parser_init(yp_parser_t *parser, const char *source, size_t size, const char
13599
13738
  // If the first two bytes of the source are a shebang, then we'll indicate
13600
13739
  // that the encoding comment is at the end of the shebang.
13601
13740
  if (peek(parser) == '#' && peek_offset(parser, 1) == '!') {
13602
- const char *encoding_comment_start = next_newline(source, (ptrdiff_t) size);
13741
+ const uint8_t *encoding_comment_start = next_newline(source, (ptrdiff_t) size);
13603
13742
  if (encoding_comment_start) {
13604
13743
  parser->encoding_comment_start = encoding_comment_start + 1;
13605
13744
  }
@@ -13671,7 +13810,7 @@ yp_serialize(yp_parser_t *parser, yp_node_t *node, yp_buffer_t *buffer) {
13671
13810
  // Parse and serialize the AST represented by the given source to the given
13672
13811
  // buffer.
13673
13812
  YP_EXPORTED_FUNCTION void
13674
- yp_parse_serialize(const char *source, size_t size, yp_buffer_t *buffer, const char *metadata) {
13813
+ yp_parse_serialize(const uint8_t *source, size_t size, yp_buffer_t *buffer, const char *metadata) {
13675
13814
  yp_parser_t parser;
13676
13815
  yp_parser_init(&parser, source, size, NULL);
13677
13816
  if (metadata) yp_parser_metadata(&parser, metadata);