prism 0.13.0 → 0.14.0

Sign up to get free protection for your applications and to get access to all the features.
data/src/prism.c CHANGED
@@ -421,6 +421,63 @@ debug_lex_state_set(pm_parser_t *parser, pm_lex_state_t state, char const * call
421
421
  #define lex_state_set(parser, state) debug_lex_state_set(parser, state, __func__, __LINE__)
422
422
  #endif
423
423
 
424
+ /******************************************************************************/
425
+ /* Diagnostic-related functions */
426
+ /******************************************************************************/
427
+
428
+ // Append an error to the list of errors on the parser.
429
+ static inline void
430
+ pm_parser_err(pm_parser_t *parser, const uint8_t *start, const uint8_t *end, pm_diagnostic_id_t diag_id) {
431
+ pm_diagnostic_list_append(&parser->error_list, start, end, diag_id);
432
+ }
433
+
434
+ // Append an error to the list of errors on the parser using the location of the
435
+ // current token.
436
+ static inline void
437
+ pm_parser_err_current(pm_parser_t *parser, pm_diagnostic_id_t diag_id) {
438
+ pm_parser_err(parser, parser->current.start, parser->current.end, diag_id);
439
+ }
440
+
441
+ // Append an error to the list of errors on the parser using the given location.
442
+ static inline void
443
+ pm_parser_err_location(pm_parser_t *parser, const pm_location_t *location, pm_diagnostic_id_t diag_id) {
444
+ pm_parser_err(parser, location->start, location->end, diag_id);
445
+ }
446
+
447
+ // Append an error to the list of errors on the parser using the location of the
448
+ // given node.
449
+ static inline void
450
+ pm_parser_err_node(pm_parser_t *parser, const pm_node_t *node, pm_diagnostic_id_t diag_id) {
451
+ pm_parser_err(parser, node->location.start, node->location.end, diag_id);
452
+ }
453
+
454
+ // Append an error to the list of errors on the parser using the location of the
455
+ // previous token.
456
+ static inline void
457
+ pm_parser_err_previous(pm_parser_t *parser, pm_diagnostic_id_t diag_id) {
458
+ pm_parser_err(parser, parser->previous.start, parser->previous.end, diag_id);
459
+ }
460
+
461
+ // Append an error to the list of errors on the parser using the location of the
462
+ // given token.
463
+ static inline void
464
+ pm_parser_err_token(pm_parser_t *parser, const pm_token_t *token, pm_diagnostic_id_t diag_id) {
465
+ pm_parser_err(parser, token->start, token->end, diag_id);
466
+ }
467
+
468
+ // Append a warning to the list of warnings on the parser.
469
+ static inline void
470
+ pm_parser_warn(pm_parser_t *parser, const uint8_t *start, const uint8_t *end, pm_diagnostic_id_t diag_id) {
471
+ pm_diagnostic_list_append(&parser->warning_list, start, end, diag_id);
472
+ }
473
+
474
+ // Append a warning to the list of warnings on the parser using the location of
475
+ // the given token.
476
+ static inline void
477
+ pm_parser_warn_token(pm_parser_t *parser, const pm_token_t *token, pm_diagnostic_id_t diag_id) {
478
+ pm_parser_warn(parser, token->start, token->end, diag_id);
479
+ }
480
+
424
481
  /******************************************************************************/
425
482
  /* Node-related functions */
426
483
  /******************************************************************************/
@@ -437,6 +494,22 @@ pm_parser_constant_id_owned(pm_parser_t *parser, const uint8_t *start, size_t le
437
494
  return pm_constant_pool_insert_owned(&parser->constant_pool, start, length);
438
495
  }
439
496
 
497
+ // Retrieve the constant pool id for the given static literal C string.
498
+ static inline pm_constant_id_t
499
+ pm_parser_constant_id_static(pm_parser_t *parser, const char *start, size_t length) {
500
+ uint8_t *owned_copy;
501
+ if (length > 0) {
502
+ owned_copy = malloc(length);
503
+ memcpy(owned_copy, start, length);
504
+ } else {
505
+ owned_copy = malloc(1);
506
+ owned_copy[0] = '\0';
507
+ }
508
+ return pm_constant_pool_insert_owned(&parser->constant_pool, owned_copy, length);
509
+ // Does not work because the static literal cannot be serialized as an offset of source
510
+ // return pm_constant_pool_insert_shared(&parser->constant_pool, start, length);
511
+ }
512
+
440
513
  // Retrieve the constant pool id for the given token.
441
514
  static inline pm_constant_id_t
442
515
  pm_parser_constant_id_token(pm_parser_t *parser, const pm_token_t *token) {
@@ -582,12 +655,7 @@ pm_arguments_validate_block(pm_parser_t *parser, pm_arguments_t *arguments, pm_b
582
655
 
583
656
  // If we didn't hit a case before this check, then at this point we need to
584
657
  // add a syntax error.
585
- pm_diagnostic_list_append(
586
- &parser->error_list,
587
- block->base.location.start,
588
- block->base.location.end,
589
- PM_ERR_ARGUMENT_UNEXPECTED_BLOCK
590
- );
658
+ pm_parser_err_node(parser, (pm_node_t *) block, PM_ERR_ARGUMENT_UNEXPECTED_BLOCK);
591
659
  }
592
660
 
593
661
  /******************************************************************************/
@@ -601,6 +669,7 @@ pm_scope_node_init(pm_node_t *node, pm_scope_node_t *scope) {
601
669
  scope->base.location.start = node->location.start;
602
670
  scope->base.location.end = node->location.end;
603
671
 
672
+ scope->ast_node = node;
604
673
  scope->parameters = NULL;
605
674
  scope->body = NULL;
606
675
  pm_constant_id_list_init(&scope->locals);
@@ -626,6 +695,11 @@ pm_scope_node_init(pm_node_t *node, pm_scope_node_t *scope) {
626
695
  scope->locals = cast->locals;
627
696
  break;
628
697
  }
698
+ case PM_FOR_NODE: {
699
+ pm_for_node_t *cast = (pm_for_node_t *)node;
700
+ scope->body = (pm_node_t *)cast->statements;
701
+ break;
702
+ }
629
703
  case PM_LAMBDA_NODE: {
630
704
  pm_lambda_node_t *cast = (pm_lambda_node_t *) node;
631
705
  if (cast->parameters) scope->parameters = cast->parameters->parameters;
@@ -679,14 +753,14 @@ parse_decimal_number(pm_parser_t *parser, const uint8_t *start, const uint8_t *e
679
753
  unsigned long value = strtoul(digits, &endptr, 10);
680
754
 
681
755
  if ((digits == endptr) || (*endptr != '\0') || (errno == ERANGE)) {
682
- pm_diagnostic_list_append(&parser->error_list, start, end, PM_ERR_INVALID_NUMBER_DECIMAL);
756
+ pm_parser_err(parser, start, end, PM_ERR_INVALID_NUMBER_DECIMAL);
683
757
  value = UINT32_MAX;
684
758
  }
685
759
 
686
760
  free(digits);
687
761
 
688
762
  if (value > UINT32_MAX) {
689
- pm_diagnostic_list_append(&parser->error_list, start, end, PM_ERR_INVALID_NUMBER_DECIMAL);
763
+ pm_parser_err(parser, start, end, PM_ERR_INVALID_NUMBER_DECIMAL);
690
764
  value = UINT32_MAX;
691
765
  }
692
766
 
@@ -907,7 +981,7 @@ pm_array_node_elements_append(pm_array_node_t *node, pm_node_t *element) {
907
981
 
908
982
  // If the element is not a static literal, then the array is not a static
909
983
  // literal. Turn that flag off.
910
- if (PM_NODE_TYPE_P(element, PM_ARRAY_NODE) || PM_NODE_TYPE_P(element, PM_HASH_NODE) || (element->flags & PM_NODE_FLAG_STATIC_LITERAL) == 0) {
984
+ if (PM_NODE_TYPE_P(element, PM_ARRAY_NODE) || PM_NODE_TYPE_P(element, PM_HASH_NODE) || PM_NODE_TYPE_P(element, PM_RANGE_NODE) || (element->flags & PM_NODE_FLAG_STATIC_LITERAL) == 0) {
911
985
  node->base.flags &= (pm_node_flags_t) ~PM_NODE_FLAG_STATIC_LITERAL;
912
986
  }
913
987
  }
@@ -1051,8 +1125,10 @@ pm_assoc_node_create(pm_parser_t *parser, pm_node_t *key, const pm_token_t *oper
1051
1125
  end = key->location.end;
1052
1126
  }
1053
1127
 
1128
+ // If the key and value of this assoc node are both static literals, then
1129
+ // we can mark this node as a static literal.
1054
1130
  pm_node_flags_t flags = 0;
1055
- if (value && !PM_NODE_TYPE_P(value, PM_ARRAY_NODE) && !PM_NODE_TYPE_P(value, PM_HASH_NODE)) {
1131
+ if (value && !PM_NODE_TYPE_P(value, PM_ARRAY_NODE) && !PM_NODE_TYPE_P(value, PM_HASH_NODE) && !PM_NODE_TYPE_P(value, PM_RANGE_NODE)) {
1056
1132
  flags = key->flags & value->flags & PM_NODE_FLAG_STATIC_LITERAL;
1057
1133
  }
1058
1134
 
@@ -1341,7 +1417,8 @@ pm_call_node_create(pm_parser_t *parser) {
1341
1417
  .opening_loc = PM_OPTIONAL_LOCATION_NOT_PROVIDED_VALUE,
1342
1418
  .arguments = NULL,
1343
1419
  .closing_loc = PM_OPTIONAL_LOCATION_NOT_PROVIDED_VALUE,
1344
- .block = NULL
1420
+ .block = NULL,
1421
+ .name = 0
1345
1422
  };
1346
1423
 
1347
1424
  return node;
@@ -1369,7 +1446,7 @@ pm_call_node_aref_create(pm_parser_t *parser, pm_node_t *receiver, pm_arguments_
1369
1446
  node->closing_loc = arguments->closing_loc;
1370
1447
  node->block = arguments->block;
1371
1448
 
1372
- pm_string_constant_init(&node->name, "[]", 2);
1449
+ node->name = pm_parser_constant_id_static(parser, "[]", 2);
1373
1450
  return node;
1374
1451
  }
1375
1452
 
@@ -1388,7 +1465,7 @@ pm_call_node_binary_create(pm_parser_t *parser, pm_node_t *receiver, pm_token_t
1388
1465
  pm_arguments_node_arguments_append(arguments, argument);
1389
1466
  node->arguments = arguments;
1390
1467
 
1391
- pm_string_shared_init(&node->name, operator->start, operator->end);
1468
+ node->name = pm_parser_constant_id_token(parser, operator);
1392
1469
  return node;
1393
1470
  }
1394
1471
 
@@ -1420,7 +1497,7 @@ pm_call_node_call_create(pm_parser_t *parser, pm_node_t *receiver, pm_token_t *o
1420
1497
  node->base.flags |= PM_CALL_NODE_FLAGS_SAFE_NAVIGATION;
1421
1498
  }
1422
1499
 
1423
- pm_string_shared_init(&node->name, message->start, message->end);
1500
+ node->name = pm_parser_constant_id_token(parser, message);
1424
1501
  return node;
1425
1502
  }
1426
1503
 
@@ -1447,7 +1524,7 @@ pm_call_node_fcall_create(pm_parser_t *parser, pm_token_t *message, pm_arguments
1447
1524
  node->closing_loc = arguments->closing_loc;
1448
1525
  node->block = arguments->block;
1449
1526
 
1450
- pm_string_shared_init(&node->name, message->start, message->end);
1527
+ node->name = pm_parser_constant_id_token(parser, message);
1451
1528
  return node;
1452
1529
  }
1453
1530
 
@@ -1469,7 +1546,7 @@ pm_call_node_not_create(pm_parser_t *parser, pm_node_t *receiver, pm_token_t *me
1469
1546
  node->arguments = arguments->arguments;
1470
1547
  node->closing_loc = arguments->closing_loc;
1471
1548
 
1472
- pm_string_constant_init(&node->name, "!", 1);
1549
+ node->name = pm_parser_constant_id_static(parser, "!", 1);
1473
1550
  return node;
1474
1551
  }
1475
1552
 
@@ -1496,7 +1573,7 @@ pm_call_node_shorthand_create(pm_parser_t *parser, pm_node_t *receiver, pm_token
1496
1573
  node->base.flags |= PM_CALL_NODE_FLAGS_SAFE_NAVIGATION;
1497
1574
  }
1498
1575
 
1499
- pm_string_constant_init(&node->name, "call", 4);
1576
+ node->name = pm_parser_constant_id_static(parser, "call", 4);
1500
1577
  return node;
1501
1578
  }
1502
1579
 
@@ -1511,7 +1588,7 @@ pm_call_node_unary_create(pm_parser_t *parser, pm_token_t *operator, pm_node_t *
1511
1588
  node->receiver = receiver;
1512
1589
  node->message_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(operator);
1513
1590
 
1514
- pm_string_constant_init(&node->name, name, strlen(name));
1591
+ node->name = pm_parser_constant_id_static(parser, name, strlen(name));
1515
1592
  return node;
1516
1593
  }
1517
1594
 
@@ -1524,7 +1601,7 @@ pm_call_node_variable_call_create(pm_parser_t *parser, pm_token_t *message) {
1524
1601
  node->base.location = PM_LOCATION_TOKEN_VALUE(message);
1525
1602
  node->message_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(message);
1526
1603
 
1527
- pm_string_shared_init(&node->name, message->start, message->end);
1604
+ node->name = pm_parser_constant_id_token(parser, message);
1528
1605
  return node;
1529
1606
  }
1530
1607
 
@@ -1537,17 +1614,18 @@ pm_call_node_variable_call_p(pm_call_node_t *node) {
1537
1614
 
1538
1615
  // Initialize the read name by reading the write name and chopping off the '='.
1539
1616
  static void
1540
- pm_call_write_read_name_init(pm_string_t *read_name, pm_string_t *write_name) {
1541
- if (write_name->length >= 1) {
1542
- size_t length = write_name->length - 1;
1617
+ pm_call_write_read_name_init(pm_parser_t *parser, pm_constant_id_t *read_name, pm_constant_id_t *write_name) {
1618
+ pm_constant_t *write_constant = pm_constant_pool_id_to_constant(&parser->constant_pool, *write_name);
1619
+ if (write_constant->length >= 1) {
1620
+ size_t length = write_constant->length - 1;
1543
1621
 
1544
1622
  void *memory = malloc(length);
1545
- memcpy(memory, write_name->source, length);
1623
+ memcpy(memory, write_constant->start, length);
1546
1624
 
1547
- pm_string_owned_init(read_name, (uint8_t *) memory, length);
1625
+ *read_name = pm_constant_pool_insert_owned(&parser->constant_pool, (uint8_t *) memory, length);
1548
1626
  } else {
1549
1627
  // We can get here if the message was missing because of a syntax error.
1550
- pm_string_constant_init(read_name, "", 0);
1628
+ *read_name = pm_parser_constant_id_static(parser, "", 0);
1551
1629
  }
1552
1630
  }
1553
1631
 
@@ -1573,13 +1651,13 @@ pm_call_and_write_node_create(pm_parser_t *parser, pm_call_node_t *target, const
1573
1651
  .opening_loc = target->opening_loc,
1574
1652
  .arguments = target->arguments,
1575
1653
  .closing_loc = target->closing_loc,
1576
- .read_name = PM_EMPTY_STRING,
1654
+ .read_name = 0,
1577
1655
  .write_name = target->name,
1578
1656
  .operator_loc = PM_LOCATION_TOKEN_VALUE(operator),
1579
1657
  .value = value
1580
1658
  };
1581
1659
 
1582
- pm_call_write_read_name_init(&node->read_name, &node->write_name);
1660
+ pm_call_write_read_name_init(parser, &node->read_name, &node->write_name);
1583
1661
 
1584
1662
  // Here we're going to free the target, since it is no longer necessary.
1585
1663
  // However, we don't want to call `pm_node_destroy` because we want to keep
@@ -1610,14 +1688,14 @@ pm_call_operator_write_node_create(pm_parser_t *parser, pm_call_node_t *target,
1610
1688
  .opening_loc = target->opening_loc,
1611
1689
  .arguments = target->arguments,
1612
1690
  .closing_loc = target->closing_loc,
1613
- .read_name = PM_EMPTY_STRING,
1691
+ .read_name = 0,
1614
1692
  .write_name = target->name,
1615
1693
  .operator = pm_parser_constant_id_location(parser, operator->start, operator->end - 1),
1616
1694
  .operator_loc = PM_LOCATION_TOKEN_VALUE(operator),
1617
1695
  .value = value
1618
1696
  };
1619
1697
 
1620
- pm_call_write_read_name_init(&node->read_name, &node->write_name);
1698
+ pm_call_write_read_name_init(parser, &node->read_name, &node->write_name);
1621
1699
 
1622
1700
  // Here we're going to free the target, since it is no longer necessary.
1623
1701
  // However, we don't want to call `pm_node_destroy` because we want to keep
@@ -1649,13 +1727,13 @@ pm_call_or_write_node_create(pm_parser_t *parser, pm_call_node_t *target, const
1649
1727
  .opening_loc = target->opening_loc,
1650
1728
  .arguments = target->arguments,
1651
1729
  .closing_loc = target->closing_loc,
1652
- .read_name = PM_EMPTY_STRING,
1730
+ .read_name = 0,
1653
1731
  .write_name = target->name,
1654
1732
  .operator_loc = PM_LOCATION_TOKEN_VALUE(operator),
1655
1733
  .value = value
1656
1734
  };
1657
1735
 
1658
- pm_call_write_read_name_init(&node->read_name, &node->write_name);
1736
+ pm_call_write_read_name_init(parser, &node->read_name, &node->write_name);
1659
1737
 
1660
1738
  // Here we're going to free the target, since it is no longer necessary.
1661
1739
  // However, we don't want to call `pm_node_destroy` because we want to keep
@@ -3372,11 +3450,20 @@ pm_local_variable_write_node_create(pm_parser_t *parser, pm_constant_id_t name,
3372
3450
  return node;
3373
3451
  }
3374
3452
 
3453
+ static inline bool
3454
+ token_is_numbered_parameter(const uint8_t *start, const uint8_t *end) {
3455
+ return (end - start == 2) && (start[0] == '_') && (start[1] != '0') && (pm_char_is_decimal_digit(start[1]));
3456
+ }
3457
+
3375
3458
  // Allocate and initialize a new LocalVariableTargetNode node.
3376
3459
  static pm_local_variable_target_node_t *
3377
3460
  pm_local_variable_target_node_create(pm_parser_t *parser, const pm_token_t *name) {
3378
3461
  pm_local_variable_target_node_t *node = PM_ALLOC_NODE(parser, pm_local_variable_target_node_t);
3379
3462
 
3463
+ if (token_is_numbered_parameter(name->start, name->end)) {
3464
+ pm_parser_err_token(parser, name, PM_ERR_PARAMETER_NUMBERED_RESERVED);
3465
+ }
3466
+
3380
3467
  *node = (pm_local_variable_target_node_t) {
3381
3468
  {
3382
3469
  .type = PM_LOCAL_VARIABLE_TARGET_NODE,
@@ -3870,10 +3957,27 @@ pm_pre_execution_node_create(pm_parser_t *parser, const pm_token_t *keyword, con
3870
3957
  static pm_range_node_t *
3871
3958
  pm_range_node_create(pm_parser_t *parser, pm_node_t *left, const pm_token_t *operator, pm_node_t *right) {
3872
3959
  pm_range_node_t *node = PM_ALLOC_NODE(parser, pm_range_node_t);
3960
+ pm_node_flags_t flags = 0;
3961
+
3962
+ // Indicate that this node is an exclusive range if the operator is `...`.
3963
+ if (operator->type == PM_TOKEN_DOT_DOT_DOT || operator->type == PM_TOKEN_UDOT_DOT_DOT) {
3964
+ flags |= PM_RANGE_FLAGS_EXCLUDE_END;
3965
+ }
3966
+
3967
+ // Indicate that this node is a static literal (i.e., can be compiled with
3968
+ // a putobject in CRuby) if the left and right are implicit nil, explicit
3969
+ // nil, or integers.
3970
+ if (
3971
+ (left == NULL || PM_NODE_TYPE_P(left, PM_NIL_NODE) || PM_NODE_TYPE_P(left, PM_INTEGER_NODE)) &&
3972
+ (right == NULL || PM_NODE_TYPE_P(right, PM_NIL_NODE) || PM_NODE_TYPE_P(right, PM_INTEGER_NODE))
3973
+ ) {
3974
+ flags |= PM_NODE_FLAG_STATIC_LITERAL;
3975
+ }
3873
3976
 
3874
3977
  *node = (pm_range_node_t) {
3875
3978
  {
3876
3979
  .type = PM_RANGE_NODE,
3980
+ .flags = flags,
3877
3981
  .location = {
3878
3982
  .start = (left == NULL ? operator->start : left->location.start),
3879
3983
  .end = (right == NULL ? operator->end : right->location.end)
@@ -3884,15 +3988,6 @@ pm_range_node_create(pm_parser_t *parser, pm_node_t *left, const pm_token_t *ope
3884
3988
  .operator_loc = PM_LOCATION_TOKEN_VALUE(operator)
3885
3989
  };
3886
3990
 
3887
- switch (operator->type) {
3888
- case PM_TOKEN_DOT_DOT_DOT:
3889
- case PM_TOKEN_UDOT_DOT_DOT:
3890
- node->base.flags |= PM_RANGE_FLAGS_EXCLUDE_END;
3891
- break;
3892
- default:
3893
- break;
3894
- }
3895
-
3896
3991
  return node;
3897
3992
  }
3898
3993
 
@@ -3906,9 +4001,10 @@ pm_redo_node_create(pm_parser_t *parser, const pm_token_t *token) {
3906
4001
  return node;
3907
4002
  }
3908
4003
 
3909
- // Allocate a new RegularExpressionNode node.
4004
+ // Allocate and initialize a new RegularExpressionNode node with the given
4005
+ // unescaped string.
3910
4006
  static pm_regular_expression_node_t *
3911
- pm_regular_expression_node_create(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *content, const pm_token_t *closing) {
4007
+ pm_regular_expression_node_create_unescaped(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *content, const pm_token_t *closing, const pm_string_t *unescaped) {
3912
4008
  pm_regular_expression_node_t *node = PM_ALLOC_NODE(parser, pm_regular_expression_node_t);
3913
4009
 
3914
4010
  *node = (pm_regular_expression_node_t) {
@@ -3923,12 +4019,18 @@ pm_regular_expression_node_create(pm_parser_t *parser, const pm_token_t *opening
3923
4019
  .opening_loc = PM_LOCATION_TOKEN_VALUE(opening),
3924
4020
  .content_loc = PM_LOCATION_TOKEN_VALUE(content),
3925
4021
  .closing_loc = PM_LOCATION_TOKEN_VALUE(closing),
3926
- .unescaped = PM_EMPTY_STRING
4022
+ .unescaped = *unescaped
3927
4023
  };
3928
4024
 
3929
4025
  return node;
3930
4026
  }
3931
4027
 
4028
+ // Allocate and initialize a new RegularExpressionNode node.
4029
+ static inline pm_regular_expression_node_t *
4030
+ pm_regular_expression_node_create(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *content, const pm_token_t *closing) {
4031
+ return pm_regular_expression_node_create_unescaped(parser, opening, content, closing, &PM_EMPTY_STRING);
4032
+ }
4033
+
3932
4034
  // Allocate a new RequiredDestructuredParameterNode node.
3933
4035
  static pm_required_destructured_parameter_node_t *
3934
4036
  pm_required_destructured_parameter_node_create(pm_parser_t *parser, const pm_token_t *opening) {
@@ -4274,9 +4376,9 @@ pm_string_concat_node_create(pm_parser_t *parser, pm_node_t *left, pm_node_t *ri
4274
4376
  return node;
4275
4377
  }
4276
4378
 
4277
- // Allocate a new StringNode node.
4278
- static pm_string_node_t *
4279
- pm_string_node_create(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *content, const pm_token_t *closing) {
4379
+ // Allocate a new StringNode node with the given unescaped string.
4380
+ static inline pm_string_node_t *
4381
+ pm_string_node_create_unescaped(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *content, const pm_token_t *closing, const pm_string_t *string) {
4280
4382
  pm_string_node_t *node = PM_ALLOC_NODE(parser, pm_string_node_t);
4281
4383
  pm_node_flags_t flags = 0;
4282
4384
 
@@ -4296,12 +4398,27 @@ pm_string_node_create(pm_parser_t *parser, const pm_token_t *opening, const pm_t
4296
4398
  .opening_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(opening),
4297
4399
  .content_loc = PM_LOCATION_TOKEN_VALUE(content),
4298
4400
  .closing_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(closing),
4299
- .unescaped = PM_EMPTY_STRING
4401
+ .unescaped = *string
4300
4402
  };
4301
4403
 
4302
4404
  return node;
4303
4405
  }
4304
4406
 
4407
+ // Allocate a new StringNode node.
4408
+ static pm_string_node_t *
4409
+ pm_string_node_create(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *content, const pm_token_t *closing) {
4410
+ return pm_string_node_create_unescaped(parser, opening, content, closing, &PM_EMPTY_STRING);
4411
+ }
4412
+
4413
+ // Allocate a new StringNode node and create it using the current string on the
4414
+ // parser.
4415
+ static pm_string_node_t *
4416
+ pm_string_node_create_current_string(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *content, const pm_token_t *closing) {
4417
+ pm_string_node_t *node = pm_string_node_create_unescaped(parser, opening, content, closing, &parser->current_string);
4418
+ parser->current_string = PM_EMPTY_STRING;
4419
+ return node;
4420
+ }
4421
+
4305
4422
  // Allocate and initialize a new SuperNode node.
4306
4423
  static pm_super_node_t *
4307
4424
  pm_super_node_create(pm_parser_t *parser, const pm_token_t *keyword, pm_arguments_t *arguments) {
@@ -4338,9 +4455,10 @@ pm_super_node_create(pm_parser_t *parser, const pm_token_t *keyword, pm_argument
4338
4455
  return node;
4339
4456
  }
4340
4457
 
4341
- // Allocate a new SymbolNode node.
4458
+ // Allocate and initialize a new SymbolNode node with the given unescaped
4459
+ // string.
4342
4460
  static pm_symbol_node_t *
4343
- pm_symbol_node_create(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *value, const pm_token_t *closing) {
4461
+ pm_symbol_node_create_unescaped(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *value, const pm_token_t *closing, const pm_string_t *unescaped) {
4344
4462
  pm_symbol_node_t *node = PM_ALLOC_NODE(parser, pm_symbol_node_t);
4345
4463
 
4346
4464
  *node = (pm_symbol_node_t) {
@@ -4355,12 +4473,26 @@ pm_symbol_node_create(pm_parser_t *parser, const pm_token_t *opening, const pm_t
4355
4473
  .opening_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(opening),
4356
4474
  .value_loc = PM_LOCATION_TOKEN_VALUE(value),
4357
4475
  .closing_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(closing),
4358
- .unescaped = PM_EMPTY_STRING
4476
+ .unescaped = *unescaped
4359
4477
  };
4360
4478
 
4361
4479
  return node;
4362
4480
  }
4363
4481
 
4482
+ // Allocate and initialize a new SymbolNode node.
4483
+ static inline pm_symbol_node_t *
4484
+ pm_symbol_node_create(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *value, const pm_token_t *closing) {
4485
+ return pm_symbol_node_create_unescaped(parser, opening, value, closing, &PM_EMPTY_STRING);
4486
+ }
4487
+
4488
+ // Allocate and initialize a new SymbolNode node with the current string.
4489
+ static pm_symbol_node_t *
4490
+ pm_symbol_node_create_current_string(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *value, const pm_token_t *closing) {
4491
+ pm_symbol_node_t *node = pm_symbol_node_create_unescaped(parser, opening, value, closing, &parser->current_string);
4492
+ parser->current_string = PM_EMPTY_STRING;
4493
+ return node;
4494
+ }
4495
+
4364
4496
  // Allocate and initialize a new SymbolNode node from a label.
4365
4497
  static pm_symbol_node_t *
4366
4498
  pm_symbol_node_label_create(pm_parser_t *parser, const pm_token_t *token) {
@@ -4376,8 +4508,6 @@ pm_symbol_node_label_create(pm_parser_t *parser, const pm_token_t *token) {
4376
4508
 
4377
4509
  assert((label.end - label.start) >= 0);
4378
4510
  pm_string_shared_init(&node->unescaped, label.start, label.end);
4379
-
4380
- pm_unescape_manipulate_string(parser, &node->unescaped, PM_UNESCAPE_ALL);
4381
4511
  break;
4382
4512
  }
4383
4513
  case PM_TOKEN_MISSING: {
@@ -4710,9 +4840,10 @@ pm_while_node_modifier_create(pm_parser_t *parser, const pm_token_t *keyword, pm
4710
4840
  return node;
4711
4841
  }
4712
4842
 
4713
- // Allocate and initialize a new XStringNode node.
4843
+ // Allocate and initialize a new XStringNode node with the given unescaped
4844
+ // string.
4714
4845
  static pm_x_string_node_t *
4715
- pm_xstring_node_create(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *content, const pm_token_t *closing) {
4846
+ pm_xstring_node_create_unescaped(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *content, const pm_token_t *closing, const pm_string_t *unescaped) {
4716
4847
  pm_x_string_node_t *node = PM_ALLOC_NODE(parser, pm_x_string_node_t);
4717
4848
 
4718
4849
  *node = (pm_x_string_node_t) {
@@ -4726,12 +4857,18 @@ pm_xstring_node_create(pm_parser_t *parser, const pm_token_t *opening, const pm_
4726
4857
  .opening_loc = PM_LOCATION_TOKEN_VALUE(opening),
4727
4858
  .content_loc = PM_LOCATION_TOKEN_VALUE(content),
4728
4859
  .closing_loc = PM_LOCATION_TOKEN_VALUE(closing),
4729
- .unescaped = PM_EMPTY_STRING
4860
+ .unescaped = *unescaped
4730
4861
  };
4731
4862
 
4732
4863
  return node;
4733
4864
  }
4734
4865
 
4866
+ // Allocate and initialize a new XStringNode node.
4867
+ static inline pm_x_string_node_t *
4868
+ pm_xstring_node_create(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *content, const pm_token_t *closing) {
4869
+ return pm_xstring_node_create_unescaped(parser, opening, content, closing, &PM_EMPTY_STRING);
4870
+ }
4871
+
4735
4872
  // Allocate a new YieldNode node.
4736
4873
  static pm_yield_node_t *
4737
4874
  pm_yield_node_create(pm_parser_t *parser, const pm_token_t *keyword, const pm_location_t *lparen_loc, pm_arguments_node_t *arguments, const pm_location_t *rparen_loc) {
@@ -4765,8 +4902,6 @@ pm_yield_node_create(pm_parser_t *parser, const pm_token_t *keyword, const pm_lo
4765
4902
  return node;
4766
4903
  }
4767
4904
 
4768
-
4769
- #undef PM_EMPTY_STRING
4770
4905
  #undef PM_ALLOC_NODE
4771
4906
 
4772
4907
  /******************************************************************************/
@@ -4783,7 +4918,8 @@ pm_parser_scope_push(pm_parser_t *parser, bool closed) {
4783
4918
  .previous = parser->current_scope,
4784
4919
  .closed = closed,
4785
4920
  .explicit_params = false,
4786
- .numbered_params = false
4921
+ .numbered_params = false,
4922
+ .transparent = false
4787
4923
  };
4788
4924
 
4789
4925
  pm_constant_id_list_init(&scope->locals);
@@ -4792,6 +4928,25 @@ pm_parser_scope_push(pm_parser_t *parser, bool closed) {
4792
4928
  return true;
4793
4929
  }
4794
4930
 
4931
+ // Allocate and initialize a new transparent scope. Push it onto the scope stack.
4932
+ static bool
4933
+ pm_parser_scope_push_transparent(pm_parser_t *parser) {
4934
+ pm_scope_t *scope = (pm_scope_t *) malloc(sizeof(pm_scope_t));
4935
+ if (scope == NULL) return false;
4936
+
4937
+ *scope = (pm_scope_t) {
4938
+ .previous = parser->current_scope,
4939
+ .closed = false,
4940
+ .explicit_params = false,
4941
+ .numbered_params = false,
4942
+ .transparent = true
4943
+ };
4944
+
4945
+ parser->current_scope = scope;
4946
+
4947
+ return true;
4948
+ }
4949
+
4795
4950
  // Check if the current scope has a given local variable.
4796
4951
  static int
4797
4952
  pm_parser_local_depth(pm_parser_t *parser, pm_token_t *token) {
@@ -4800,7 +4955,8 @@ pm_parser_local_depth(pm_parser_t *parser, pm_token_t *token) {
4800
4955
  int depth = 0;
4801
4956
 
4802
4957
  while (scope != NULL) {
4803
- if (pm_constant_id_list_includes(&scope->locals, constant_id)) return depth;
4958
+ if (!scope->transparent &&
4959
+ pm_constant_id_list_includes(&scope->locals, constant_id)) return depth;
4804
4960
  if (scope->closed) break;
4805
4961
 
4806
4962
  scope = scope->previous;
@@ -4813,8 +4969,12 @@ pm_parser_local_depth(pm_parser_t *parser, pm_token_t *token) {
4813
4969
  // Add a constant id to the local table of the current scope.
4814
4970
  static inline void
4815
4971
  pm_parser_local_add(pm_parser_t *parser, pm_constant_id_t constant_id) {
4816
- if (!pm_constant_id_list_includes(&parser->current_scope->locals, constant_id)) {
4817
- pm_constant_id_list_append(&parser->current_scope->locals, constant_id);
4972
+ pm_scope_t *scope = parser->current_scope;
4973
+ while (scope && scope->transparent) scope = scope->previous;
4974
+
4975
+ assert(scope != NULL);
4976
+ if (!pm_constant_id_list_includes(&scope->locals, constant_id)) {
4977
+ pm_constant_id_list_append(&scope->locals, constant_id);
4818
4978
  }
4819
4979
  }
4820
4980
 
@@ -4839,18 +4999,13 @@ pm_parser_local_add_owned(pm_parser_t *parser, const uint8_t *start, size_t leng
4839
4999
  if (constant_id != 0) pm_parser_local_add(parser, constant_id);
4840
5000
  }
4841
5001
 
4842
- static inline bool
4843
- token_is_numbered_parameter(const uint8_t *start, const uint8_t *end) {
4844
- return (end - start == 2) && (start[0] == '_') && (start[1] != '0') && (pm_char_is_decimal_digit(start[1]));
4845
- }
4846
-
4847
5002
  // Add a parameter name to the current scope and check whether the name of the
4848
5003
  // parameter is unique or not.
4849
5004
  static void
4850
- pm_parser_parameter_name_check(pm_parser_t *parser, pm_token_t *name) {
5005
+ pm_parser_parameter_name_check(pm_parser_t *parser, const pm_token_t *name) {
4851
5006
  // We want to check whether the parameter name is a numbered parameter or not.
4852
5007
  if (token_is_numbered_parameter(name->start, name->end)) {
4853
- pm_diagnostic_list_append(&parser->error_list, name->start, name->end, PM_ERR_PARAMETER_NUMBERED_RESERVED);
5008
+ pm_parser_err_token(parser, name, PM_ERR_PARAMETER_NUMBERED_RESERVED);
4854
5009
  }
4855
5010
 
4856
5011
  // We want to ignore any parameter name that starts with an underscore.
@@ -4861,7 +5016,7 @@ pm_parser_parameter_name_check(pm_parser_t *parser, pm_token_t *name) {
4861
5016
  pm_constant_id_t constant_id = pm_parser_constant_id_token(parser, name);
4862
5017
 
4863
5018
  if (pm_constant_id_list_includes(&parser->current_scope->locals, constant_id)) {
4864
- pm_diagnostic_list_append(&parser->error_list, name->start, name->end, PM_ERR_PARAMETER_NAME_REPEAT);
5019
+ pm_parser_err_token(parser, name, PM_ERR_PARAMETER_NAME_REPEAT);
4865
5020
  }
4866
5021
  }
4867
5022
 
@@ -5007,17 +5162,6 @@ peek(pm_parser_t *parser) {
5007
5162
  return peek_at(parser, parser->current.end);
5008
5163
  }
5009
5164
 
5010
- // Get the next string of length len in the source starting from parser->current.end.
5011
- // If the string extends beyond the end of the source, return the empty string ""
5012
- static inline const uint8_t *
5013
- peek_string(pm_parser_t *parser, size_t len) {
5014
- if (parser->current.end + len <= parser->end) {
5015
- return parser->current.end;
5016
- } else {
5017
- return (const uint8_t *) "";
5018
- }
5019
- }
5020
-
5021
5165
  // If the character to be read matches the given value, then returns true and
5022
5166
  // advances the current pointer.
5023
5167
  static inline bool
@@ -5069,66 +5213,17 @@ next_newline(const uint8_t *cursor, ptrdiff_t length) {
5069
5213
  return memchr(cursor, '\n', (size_t) length);
5070
5214
  }
5071
5215
 
5072
- // Find the start of the encoding comment. This is effectively an inlined
5073
- // version of strnstr with some modifications.
5074
- static inline const uint8_t *
5075
- parser_lex_encoding_comment_start(pm_parser_t *parser, const uint8_t *cursor, ptrdiff_t remaining) {
5076
- assert(remaining >= 0);
5077
- size_t length = (size_t) remaining;
5078
-
5079
- size_t key_length = strlen("coding:");
5080
- if (key_length > length) return NULL;
5081
-
5082
- const uint8_t *cursor_limit = cursor + length - key_length + 1;
5083
- while ((cursor = pm_memchr(cursor, 'c', (size_t) (cursor_limit - cursor), parser->encoding_changed, &parser->encoding)) != NULL) {
5084
- if (memcmp(cursor, "coding", key_length - 1) == 0) {
5085
- size_t whitespace_after_coding = pm_strspn_inline_whitespace(cursor + key_length - 1, parser->end - (cursor + key_length - 1));
5086
- size_t cur_pos = key_length + whitespace_after_coding;
5087
-
5088
- if (cursor[cur_pos - 1] == ':' || cursor[cur_pos - 1] == '=') {
5089
- return cursor + cur_pos;
5090
- }
5091
- }
5092
-
5093
- cursor++;
5094
- }
5095
-
5096
- return NULL;
5097
- }
5098
-
5099
5216
  // Here we're going to check if this is a "magic" comment, and perform whatever
5100
5217
  // actions are necessary for it here.
5101
5218
  static void
5102
- parser_lex_encoding_comment(pm_parser_t *parser) {
5103
- const uint8_t *start = parser->current.start + 1;
5104
- const uint8_t *end = parser->current.end;
5105
-
5106
- // These are the patterns we're going to match to find the encoding comment.
5107
- // This is definitely not complete or even really correct.
5108
- const uint8_t *encoding_start = parser_lex_encoding_comment_start(parser, start, end - start);
5109
-
5110
- // If we didn't find anything that matched our patterns, then return. Note
5111
- // that this does a _very_ poor job of actually finding the encoding, and
5112
- // there is a lot of work to do here to better reflect actual magic comment
5113
- // parsing from CRuby, but this at least gets us part of the way there.
5114
- if (encoding_start == NULL) return;
5115
-
5116
- // Skip any non-newline whitespace after the "coding:" or "coding=".
5117
- encoding_start += pm_strspn_inline_whitespace(encoding_start, end - encoding_start);
5118
-
5119
- // Now determine the end of the encoding string. This is either the end of
5120
- // the line, the first whitespace character, or a punctuation mark.
5121
- const uint8_t *encoding_end = pm_strpbrk(parser, encoding_start, (const uint8_t *) " \t\f\r\v\n;,", end - encoding_start);
5122
- encoding_end = encoding_end == NULL ? end : encoding_end;
5123
-
5124
- // Finally, we can determine the width of the encoding string.
5125
- size_t width = (size_t) (encoding_end - encoding_start);
5219
+ parser_lex_magic_comment_encoding_value(pm_parser_t *parser, const uint8_t *start, const uint8_t *end) {
5220
+ size_t width = (size_t) (end - start);
5126
5221
 
5127
5222
  // First, we're going to call out to a user-defined callback if one was
5128
5223
  // provided. If they return an encoding struct that we can use, then we'll
5129
5224
  // use that here.
5130
5225
  if (parser->encoding_decode_callback != NULL) {
5131
- pm_encoding_t *encoding = parser->encoding_decode_callback(parser, encoding_start, width);
5226
+ pm_encoding_t *encoding = parser->encoding_decode_callback(parser, start, width);
5132
5227
 
5133
5228
  if (encoding != NULL) {
5134
5229
  parser->encoding = *encoding;
@@ -5140,7 +5235,7 @@ parser_lex_encoding_comment(pm_parser_t *parser) {
5140
5235
  // Extensions like utf-8 can contain extra encoding details like,
5141
5236
  // utf-8-dos, utf-8-linux, utf-8-mac. To handle all of these as utf-8, we
5142
5237
  // treat any encoding starting utf-8 as utf-8.
5143
- if ((encoding_start + 5 <= parser->end) && (pm_strncasecmp(encoding_start, (const uint8_t *) "utf-8", 5) == 0)) {
5238
+ if ((start + 5 <= end) && (pm_strncasecmp(start, (const uint8_t *) "utf-8", 5) == 0)) {
5144
5239
  // We don't need to do anything here because the default encoding is
5145
5240
  // already UTF-8. We'll just return.
5146
5241
  return;
@@ -5149,7 +5244,7 @@ parser_lex_encoding_comment(pm_parser_t *parser) {
5149
5244
  // Next, we're going to loop through each of the encodings that we handle
5150
5245
  // explicitly. If we found one that we understand, we'll use that value.
5151
5246
  #define ENCODING(value, prebuilt) \
5152
- if (width == sizeof(value) - 1 && encoding_start + width <= parser->end && pm_strncasecmp(encoding_start, (const uint8_t *) value, width) == 0) { \
5247
+ if (width == sizeof(value) - 1 && start + width <= end && pm_strncasecmp(start, (const uint8_t *) value, width) == 0) { \
5153
5248
  parser->encoding = prebuilt; \
5154
5249
  parser->encoding_changed |= true; \
5155
5250
  if (parser->encoding_changed_callback != NULL) parser->encoding_changed_callback(parser); \
@@ -5198,40 +5293,220 @@ parser_lex_encoding_comment(pm_parser_t *parser) {
5198
5293
  // didn't understand the encoding that the user was trying to use. In this
5199
5294
  // case we'll keep using the default encoding but add an error to the
5200
5295
  // parser to indicate an unsuccessful parse.
5201
- pm_diagnostic_list_append(&parser->error_list, encoding_start, encoding_end, PM_ERR_INVALID_ENCODING_MAGIC_COMMENT);
5296
+ pm_parser_err(parser, start, end, PM_ERR_INVALID_ENCODING_MAGIC_COMMENT);
5297
+ }
5298
+
5299
+ // Look for a specific pattern of "coding" and potentially set the encoding on
5300
+ // the parser.
5301
+ static void
5302
+ parser_lex_magic_comment_encoding(pm_parser_t *parser) {
5303
+ const uint8_t *cursor = parser->current.start + 1;
5304
+ const uint8_t *end = parser->current.end;
5305
+
5306
+ bool separator = false;
5307
+ while (true) {
5308
+ if (end - cursor <= 6) return;
5309
+ switch (cursor[6]) {
5310
+ case 'C': case 'c': cursor += 6; continue;
5311
+ case 'O': case 'o': cursor += 5; continue;
5312
+ case 'D': case 'd': cursor += 4; continue;
5313
+ case 'I': case 'i': cursor += 3; continue;
5314
+ case 'N': case 'n': cursor += 2; continue;
5315
+ case 'G': case 'g': cursor += 1; continue;
5316
+ case '=': case ':':
5317
+ separator = true;
5318
+ cursor += 6;
5319
+ break;
5320
+ default:
5321
+ cursor += 6;
5322
+ if (pm_char_is_whitespace(*cursor)) break;
5323
+ continue;
5324
+ }
5325
+ if (pm_strncasecmp(cursor - 6, (const uint8_t *) "coding", 6) == 0) break;
5326
+ separator = false;
5327
+ }
5328
+
5329
+ while (true) {
5330
+ do {
5331
+ if (++cursor >= end) return;
5332
+ } while (pm_char_is_whitespace(*cursor));
5333
+
5334
+ if (separator) break;
5335
+ if (*cursor != '=' && *cursor != ':') return;
5336
+
5337
+ separator = true;
5338
+ cursor++;
5339
+ }
5340
+
5341
+ const uint8_t *value_start = cursor;
5342
+ while ((*cursor == '-' || *cursor == '_' || parser->encoding.alnum_char(cursor, 1)) && ++cursor < end);
5343
+
5344
+ parser_lex_magic_comment_encoding_value(parser, value_start, cursor);
5202
5345
  }
5203
5346
 
5204
5347
  // Check if this is a magic comment that includes the frozen_string_literal
5205
5348
  // pragma. If it does, set that field on the parser.
5206
5349
  static void
5207
- parser_lex_frozen_string_literal_comment(pm_parser_t *parser) {
5208
- const uint8_t *cursor = parser->current.start + 1;
5350
+ parser_lex_magic_comment_frozen_string_literal_value(pm_parser_t *parser, const uint8_t *start, const uint8_t *end) {
5351
+ if (start + 4 <= end && pm_strncasecmp(start, (const uint8_t *) "true", 4) == 0) {
5352
+ parser->frozen_string_literal = true;
5353
+ }
5354
+ }
5355
+
5356
+ static inline bool
5357
+ pm_char_is_magic_comment_key_delimiter(const uint8_t b) {
5358
+ return b == '\'' || b == '"' || b == ':' || b == ';';
5359
+ }
5360
+
5361
+ // Find an emacs magic comment marker (-*-) within the given bounds. If one is
5362
+ // found, it returns a pointer to the start of the marker. Otherwise it returns
5363
+ // NULL.
5364
+ static inline const uint8_t *
5365
+ parser_lex_magic_comment_emacs_marker(pm_parser_t *parser, const uint8_t *cursor, const uint8_t *end) {
5366
+ while ((cursor + 3 <= end) && (cursor = pm_memchr(cursor, '-', (size_t) (end - cursor), parser->encoding_changed, &parser->encoding)) != NULL) {
5367
+ if (cursor + 3 <= end && cursor[1] == '*' && cursor[2] == '-') {
5368
+ return cursor;
5369
+ }
5370
+ cursor++;
5371
+ }
5372
+ return NULL;
5373
+ }
5374
+
5375
+ // Parse the current token on the parser to see if it's a magic comment and
5376
+ // potentially perform some action based on that. A regular expression that this
5377
+ // function is effectively matching is:
5378
+ //
5379
+ // %r"([^\\s\'\":;]+)\\s*:\\s*(\"(?:\\\\.|[^\"])*\"|[^\"\\s;]+)[\\s;]*"
5380
+ //
5381
+ // It returns true if it consumes the entire comment. Otherwise it returns
5382
+ // false.
5383
+ static inline bool
5384
+ parser_lex_magic_comment(pm_parser_t *parser, bool semantic_token_seen) {
5385
+ const uint8_t *start = parser->current.start + 1;
5209
5386
  const uint8_t *end = parser->current.end;
5387
+ if (end - start <= 7) return false;
5388
+
5389
+ const uint8_t *cursor;
5390
+ bool indicator = false;
5210
5391
 
5211
- size_t key_length = strlen("frozen_string_literal");
5212
- if (key_length > (size_t) (end - cursor)) return;
5392
+ if ((cursor = parser_lex_magic_comment_emacs_marker(parser, start, end)) != NULL) {
5393
+ start = cursor + 3;
5213
5394
 
5214
- const uint8_t *cursor_limit = cursor + (end - cursor) - key_length + 1;
5395
+ if ((cursor = parser_lex_magic_comment_emacs_marker(parser, start, end)) != NULL) {
5396
+ end = cursor;
5397
+ indicator = true;
5398
+ } else {
5399
+ // If we have a start marker but not an end marker, then we cannot
5400
+ // have a magic comment.
5401
+ return false;
5402
+ }
5403
+ }
5215
5404
 
5216
- while ((cursor = pm_memchr(cursor, 'f', (size_t) (cursor_limit - cursor), parser->encoding_changed, &parser->encoding)) != NULL) {
5217
- if (memcmp(cursor, "frozen_string_literal", key_length) == 0) {
5218
- cursor += key_length;
5219
- cursor += pm_strspn_inline_whitespace(cursor, end - cursor);
5405
+ cursor = start;
5406
+ while (cursor < end) {
5407
+ while (cursor < end && (pm_char_is_magic_comment_key_delimiter(*cursor) || pm_char_is_whitespace(*cursor))) cursor++;
5220
5408
 
5221
- if (*cursor == ':' || *cursor == '=') {
5222
- cursor++;
5223
- cursor += pm_strspn_inline_whitespace(cursor, end - cursor);
5409
+ const uint8_t *key_start = cursor;
5410
+ while (cursor < end && (!pm_char_is_magic_comment_key_delimiter(*cursor) && !pm_char_is_whitespace(*cursor))) cursor++;
5224
5411
 
5225
- if (cursor + 4 <= end && pm_strncasecmp(cursor, (const uint8_t *) "true", 4) == 0) {
5226
- parser->frozen_string_literal = true;
5227
- }
5412
+ const uint8_t *key_end = cursor;
5413
+ while (cursor < end && pm_char_is_whitespace(*cursor)) cursor++;
5414
+ if (cursor == end) break;
5228
5415
 
5229
- return;
5416
+ if (*cursor == ':') {
5417
+ cursor++;
5418
+ } else {
5419
+ if (!indicator) return false;
5420
+ continue;
5421
+ }
5422
+
5423
+ while (cursor < end && pm_char_is_whitespace(*cursor)) cursor++;
5424
+ if (cursor == end) break;
5425
+
5426
+ const uint8_t *value_start;
5427
+ const uint8_t *value_end;
5428
+
5429
+ if (*cursor == '"') {
5430
+ value_start = ++cursor;
5431
+ for (; cursor < end && *cursor != '"'; cursor++) {
5432
+ if (*cursor == '\\' && (cursor + 1 < end)) cursor++;
5230
5433
  }
5434
+ value_end = cursor;
5435
+ } else {
5436
+ value_start = cursor;
5437
+ while (cursor < end && *cursor != '"' && *cursor != ';' && !pm_char_is_whitespace(*cursor)) cursor++;
5438
+ value_end = cursor;
5231
5439
  }
5232
5440
 
5233
- cursor++;
5441
+ if (indicator) {
5442
+ while (cursor < end && (*cursor == ';' || pm_char_is_whitespace(*cursor))) cursor++;
5443
+ } else {
5444
+ while (cursor < end && pm_char_is_whitespace(*cursor)) cursor++;
5445
+ if (cursor != end) return false;
5446
+ }
5447
+
5448
+ // Here, we need to do some processing on the key to swap out dashes for
5449
+ // underscores. We only need to do this if there _is_ a dash in the key.
5450
+ pm_string_t key;
5451
+ const size_t key_length = (size_t) (key_end - key_start);
5452
+ const uint8_t *dash = pm_memchr(key_start, '-', (size_t) key_length, parser->encoding_changed, &parser->encoding);
5453
+
5454
+ if (dash == NULL) {
5455
+ pm_string_shared_init(&key, key_start, key_end);
5456
+ } else {
5457
+ size_t width = (size_t) (key_end - key_start);
5458
+ uint8_t *buffer = malloc(width);
5459
+ if (buffer == NULL) break;
5460
+
5461
+ memcpy(buffer, key_start, width);
5462
+ buffer[dash - key_start] = '_';
5463
+
5464
+ while ((dash = pm_memchr(dash + 1, '-', (size_t) (key_end - dash - 1), parser->encoding_changed, &parser->encoding)) != NULL) {
5465
+ buffer[dash - key_start] = '_';
5466
+ }
5467
+
5468
+ pm_string_owned_init(&key, buffer, width);
5469
+ }
5470
+
5471
+ // Finally, we can start checking the key against the list of known
5472
+ // magic comment keys, and potentially change state based on that.
5473
+ const uint8_t *key_source = pm_string_source(&key);
5474
+
5475
+ // We only want to attempt to compare against encoding comments if it's
5476
+ // the first line in the file (or the second in the case of a shebang).
5477
+ if (parser->current.start == parser->encoding_comment_start) {
5478
+ if (
5479
+ (key_length == 8 && pm_strncasecmp(key_source, (const uint8_t *) "encoding", 8) == 0) ||
5480
+ (key_length == 6 && pm_strncasecmp(key_source, (const uint8_t *) "coding", 6) == 0)
5481
+ ) {
5482
+ parser_lex_magic_comment_encoding_value(parser, value_start, value_end);
5483
+ }
5484
+ }
5485
+
5486
+ // We only want to handle frozen string literal comments if it's before
5487
+ // any semantic tokens have been seen.
5488
+ if (!semantic_token_seen) {
5489
+ if (key_length == 21 && pm_strncasecmp(key_source, (const uint8_t *) "frozen_string_literal", 21) == 0) {
5490
+ parser_lex_magic_comment_frozen_string_literal_value(parser, value_start, value_end);
5491
+ }
5492
+ }
5493
+
5494
+ // When we're done, we want to free the string in case we had to
5495
+ // allocate memory for it.
5496
+ pm_string_free(&key);
5497
+
5498
+ // Allocate a new magic comment node to append to the parser's list.
5499
+ pm_magic_comment_t *magic_comment;
5500
+ if ((magic_comment = (pm_magic_comment_t *) calloc(sizeof(pm_magic_comment_t), 1)) != NULL) {
5501
+ magic_comment->key_start = key_start;
5502
+ magic_comment->value_start = value_start;
5503
+ magic_comment->key_length = (uint32_t) key_length;
5504
+ magic_comment->value_length = (uint32_t) (value_end - value_start);
5505
+ pm_list_append(&parser->magic_comment_list, (pm_list_node_t *) magic_comment);
5506
+ }
5234
5507
  }
5508
+
5509
+ return true;
5235
5510
  }
5236
5511
 
5237
5512
  /******************************************************************************/
@@ -5366,7 +5641,7 @@ context_def_p(pm_parser_t *parser) {
5366
5641
  static void
5367
5642
  pm_strspn_number_validate(pm_parser_t *parser, const uint8_t *invalid) {
5368
5643
  if (invalid != NULL) {
5369
- pm_diagnostic_list_append(&parser->error_list, invalid, invalid + 1, PM_ERR_INVALID_NUMBER_UNDERSCORE);
5644
+ pm_parser_err(parser, invalid, invalid + 1, PM_ERR_INVALID_NUMBER_UNDERSCORE);
5370
5645
  }
5371
5646
  }
5372
5647
 
@@ -5430,7 +5705,7 @@ lex_optional_float_suffix(pm_parser_t *parser) {
5430
5705
  parser->current.end += pm_strspn_decimal_number_validate(parser, parser->current.end);
5431
5706
  type = PM_TOKEN_FLOAT;
5432
5707
  } else {
5433
- pm_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, PM_ERR_INVALID_FLOAT_EXPONENT);
5708
+ pm_parser_err_current(parser, PM_ERR_INVALID_FLOAT_EXPONENT);
5434
5709
  type = PM_TOKEN_FLOAT;
5435
5710
  }
5436
5711
  }
@@ -5451,7 +5726,7 @@ lex_numeric_prefix(pm_parser_t *parser) {
5451
5726
  if (pm_char_is_decimal_digit(peek(parser))) {
5452
5727
  parser->current.end += pm_strspn_decimal_number_validate(parser, parser->current.end);
5453
5728
  } else {
5454
- pm_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, PM_ERR_INVALID_NUMBER_DECIMAL);
5729
+ pm_parser_err_current(parser, PM_ERR_INVALID_NUMBER_DECIMAL);
5455
5730
  }
5456
5731
 
5457
5732
  break;
@@ -5463,7 +5738,7 @@ lex_numeric_prefix(pm_parser_t *parser) {
5463
5738
  if (pm_char_is_binary_digit(peek(parser))) {
5464
5739
  parser->current.end += pm_strspn_binary_number_validate(parser, parser->current.end);
5465
5740
  } else {
5466
- pm_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, PM_ERR_INVALID_NUMBER_BINARY);
5741
+ pm_parser_err_current(parser, PM_ERR_INVALID_NUMBER_BINARY);
5467
5742
  }
5468
5743
 
5469
5744
  parser->integer_base = PM_INTEGER_BASE_FLAGS_BINARY;
@@ -5476,7 +5751,7 @@ lex_numeric_prefix(pm_parser_t *parser) {
5476
5751
  if (pm_char_is_octal_digit(peek(parser))) {
5477
5752
  parser->current.end += pm_strspn_octal_number_validate(parser, parser->current.end);
5478
5753
  } else {
5479
- pm_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, PM_ERR_INVALID_NUMBER_OCTAL);
5754
+ pm_parser_err_current(parser, PM_ERR_INVALID_NUMBER_OCTAL);
5480
5755
  }
5481
5756
 
5482
5757
  parser->integer_base = PM_INTEGER_BASE_FLAGS_OCTAL;
@@ -5503,7 +5778,7 @@ lex_numeric_prefix(pm_parser_t *parser) {
5503
5778
  if (pm_char_is_hexadecimal_digit(peek(parser))) {
5504
5779
  parser->current.end += pm_strspn_hexadecimal_number_validate(parser, parser->current.end);
5505
5780
  } else {
5506
- pm_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, PM_ERR_INVALID_NUMBER_HEXADECIMAL);
5781
+ pm_parser_err_current(parser, PM_ERR_INVALID_NUMBER_HEXADECIMAL);
5507
5782
  }
5508
5783
 
5509
5784
  parser->integer_base = PM_INTEGER_BASE_FLAGS_HEXADECIMAL;
@@ -5581,7 +5856,7 @@ lex_numeric(pm_parser_t *parser) {
5581
5856
  static pm_token_type_t
5582
5857
  lex_global_variable(pm_parser_t *parser) {
5583
5858
  if (parser->current.end >= parser->end) {
5584
- pm_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, PM_ERR_INVALID_VARIABLE_GLOBAL);
5859
+ pm_parser_err_current(parser, PM_ERR_INVALID_VARIABLE_GLOBAL);
5585
5860
  return PM_TOKEN_GLOBAL_VARIABLE;
5586
5861
  }
5587
5862
 
@@ -5622,7 +5897,7 @@ lex_global_variable(pm_parser_t *parser) {
5622
5897
  } while (parser->current.end < parser->end && (width = char_is_identifier(parser, parser->current.end)) > 0);
5623
5898
 
5624
5899
  // $0 isn't allowed to be followed by anything.
5625
- pm_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, PM_ERR_INVALID_VARIABLE_GLOBAL);
5900
+ pm_parser_err_current(parser, PM_ERR_INVALID_VARIABLE_GLOBAL);
5626
5901
  }
5627
5902
 
5628
5903
  return PM_TOKEN_GLOBAL_VARIABLE;
@@ -5653,7 +5928,7 @@ lex_global_variable(pm_parser_t *parser) {
5653
5928
  } else {
5654
5929
  // If we get here, then we have a $ followed by something that isn't
5655
5930
  // recognized as a global variable.
5656
- pm_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, PM_ERR_INVALID_VARIABLE_GLOBAL);
5931
+ pm_parser_err_current(parser, PM_ERR_INVALID_VARIABLE_GLOBAL);
5657
5932
  }
5658
5933
 
5659
5934
  return PM_TOKEN_GLOBAL_VARIABLE;
@@ -5962,52 +6237,475 @@ lex_interpolation(pm_parser_t *parser, const uint8_t *pound) {
5962
6237
  }
5963
6238
  }
5964
6239
 
5965
- // This function is responsible for lexing either a character literal or the ?
5966
- // operator. The supported character literals are described below.
5967
- //
5968
- // \a bell, ASCII 07h (BEL)
5969
- // \b backspace, ASCII 08h (BS)
5970
- // \t horizontal tab, ASCII 09h (TAB)
5971
- // \n newline (line feed), ASCII 0Ah (LF)
5972
- // \v vertical tab, ASCII 0Bh (VT)
5973
- // \f form feed, ASCII 0Ch (FF)
5974
- // \r carriage return, ASCII 0Dh (CR)
5975
- // \e escape, ASCII 1Bh (ESC)
5976
- // \s space, ASCII 20h (SPC)
5977
- // \\ backslash
5978
- // \nnn octal bit pattern, where nnn is 1-3 octal digits ([0-7])
5979
- // \xnn hexadecimal bit pattern, where nn is 1-2 hexadecimal digits ([0-9a-fA-F])
5980
- // \unnnn Unicode character, where nnnn is exactly 4 hexadecimal digits ([0-9a-fA-F])
5981
- // \u{nnnn ...} Unicode character(s), where each nnnn is 1-6 hexadecimal digits ([0-9a-fA-F])
5982
- // \cx or \C-x control character, where x is an ASCII printable character
5983
- // \M-x meta character, where x is an ASCII printable character
5984
- // \M-\C-x meta control character, where x is an ASCII printable character
5985
- // \M-\cx same as above
5986
- // \c\M-x same as above
5987
- // \c? or \C-? delete, ASCII 7Fh (DEL)
5988
- //
5989
- static pm_token_type_t
5990
- lex_question_mark(pm_parser_t *parser) {
5991
- if (lex_state_end_p(parser)) {
5992
- lex_state_set(parser, PM_LEX_STATE_BEG);
5993
- return PM_TOKEN_QUESTION_MARK;
5994
- }
6240
+ static const uint8_t PM_ESCAPE_FLAG_NONE = 0x0;
6241
+ static const uint8_t PM_ESCAPE_FLAG_CONTROL = 0x1;
6242
+ static const uint8_t PM_ESCAPE_FLAG_META = 0x2;
6243
+ static const uint8_t PM_ESCAPE_FLAG_SINGLE = 0x4;
6244
+ static const uint8_t PM_ESCAPE_FLAG_REGEXP = 0x8;
5995
6245
 
5996
- if (parser->current.end >= parser->end) {
5997
- pm_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, PM_ERR_INCOMPLETE_QUESTION_MARK);
5998
- return PM_TOKEN_CHARACTER_LITERAL;
5999
- }
6246
+ // This is a lookup table for whether or not an ASCII character is printable.
6247
+ static const bool ascii_printable_chars[] = {
6248
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
6249
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6250
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6251
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6252
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6253
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
6254
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6255
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0
6256
+ };
6000
6257
 
6001
- if (pm_char_is_whitespace(*parser->current.end)) {
6002
- lex_state_set(parser, PM_LEX_STATE_BEG);
6003
- return PM_TOKEN_QUESTION_MARK;
6004
- }
6258
+ static inline bool
6259
+ char_is_ascii_printable(const uint8_t b) {
6260
+ return (b < 0x80) && ascii_printable_chars[b];
6261
+ }
6262
+
6263
+ // Return the value that a hexadecimal digit character represents. For example,
6264
+ // transform 'a' into 10, 'b' into 11, etc.
6265
+ static inline uint8_t
6266
+ escape_hexadecimal_digit(const uint8_t value) {
6267
+ return (uint8_t) ((value <= '9') ? (value - '0') : (value & 0x7) + 9);
6268
+ }
6269
+
6270
+ // Scan the given number of hexadecimal digits of a Unicode escape and return the
6271
+ // resulting value. This function assumes that the characters have already been
6272
+ // validated.
6273
+ static inline uint32_t
6274
+ escape_unicode(const uint8_t *string, size_t length) {
6275
+ uint32_t value = 0;
6276
+ for (size_t index = 0; index < length; index++) {
6277
+ if (index != 0) value <<= 4;
6278
+ value |= escape_hexadecimal_digit(string[index]);
6279
+ }
6280
+ return value;
6281
+ }
6282
+
6283
+ // Escape a single character value based on the given flags.
6284
+ static inline uint8_t
6285
+ escape_byte(uint8_t value, const uint8_t flags) {
6286
+ if (flags & PM_ESCAPE_FLAG_CONTROL) value &= 0x1f;
6287
+ if (flags & PM_ESCAPE_FLAG_META) value |= 0x80;
6288
+ return value;
6289
+ }
6290
+
6291
+ // Write a unicode codepoint to the given buffer.
6292
+ static inline void
6293
+ escape_write_unicode(pm_parser_t *parser, pm_buffer_t *buffer, const uint8_t *start, const uint8_t *end, uint32_t value) {
6294
+ if (value <= 0x7F) { // 0xxxxxxx
6295
+ pm_buffer_append_u8(buffer, (uint8_t) value);
6296
+ } else if (value <= 0x7FF) { // 110xxxxx 10xxxxxx
6297
+ pm_buffer_append_u8(buffer, (uint8_t) (0xC0 | (value >> 6)));
6298
+ pm_buffer_append_u8(buffer, (uint8_t) (0x80 | (value & 0x3F)));
6299
+ } else if (value <= 0xFFFF) { // 1110xxxx 10xxxxxx 10xxxxxx
6300
+ pm_buffer_append_u8(buffer, (uint8_t) (0xE0 | (value >> 12)));
6301
+ pm_buffer_append_u8(buffer, (uint8_t) (0x80 | ((value >> 6) & 0x3F)));
6302
+ pm_buffer_append_u8(buffer, (uint8_t) (0x80 | (value & 0x3F)));
6303
+ } else if (value <= 0x10FFFF) { // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
6304
+ pm_buffer_append_u8(buffer, (uint8_t) (0xF0 | (value >> 18)));
6305
+ pm_buffer_append_u8(buffer, (uint8_t) (0x80 | ((value >> 12) & 0x3F)));
6306
+ pm_buffer_append_u8(buffer, (uint8_t) (0x80 | ((value >> 6) & 0x3F)));
6307
+ pm_buffer_append_u8(buffer, (uint8_t) (0x80 | (value & 0x3F)));
6308
+ } else {
6309
+ pm_parser_err(parser, start, end, PM_ERR_ESCAPE_INVALID_UNICODE);
6310
+ pm_buffer_append_u8(buffer, 0xEF);
6311
+ pm_buffer_append_u8(buffer, 0xBF);
6312
+ pm_buffer_append_u8(buffer, 0xBD);
6313
+ }
6314
+ }
6315
+
6316
+ // The regular expression engine doesn't support the same escape sequences as
6317
+ // Ruby does. So first we have to read the escape sequence, and then we have to
6318
+ // format it like the regular expression engine expects it. For example, in Ruby
6319
+ // if we have:
6320
+ //
6321
+ // /\M-\C-?/
6322
+ //
6323
+ // then the first byte is actually 255, so we have to rewrite this as:
6324
+ //
6325
+ // /\xFF/
6326
+ //
6327
+ // Note that in this case there is a literal \ byte in the regular expression
6328
+ // source so that the regular expression engine will perform its own unescaping.
6329
+ static inline void
6330
+ escape_write_byte(pm_buffer_t *buffer, uint8_t flags, uint8_t byte) {
6331
+ if (flags & PM_ESCAPE_FLAG_REGEXP) {
6332
+ pm_buffer_append_bytes(buffer, (const uint8_t *) "\\x", 2);
6333
+
6334
+ uint8_t byte1 = (uint8_t) ((byte >> 4) & 0xF);
6335
+ uint8_t byte2 = (uint8_t) (byte & 0xF);
6336
+
6337
+ if (byte1 >= 0xA) {
6338
+ pm_buffer_append_u8(buffer, (uint8_t) ((byte1 - 0xA) + 'A'));
6339
+ } else {
6340
+ pm_buffer_append_u8(buffer, (uint8_t) (byte1 + '0'));
6341
+ }
6342
+
6343
+ if (byte2 >= 0xA) {
6344
+ pm_buffer_append_u8(buffer, (uint8_t) (byte2 - 0xA + 'A'));
6345
+ } else {
6346
+ pm_buffer_append_u8(buffer, (uint8_t) (byte2 + '0'));
6347
+ }
6348
+ } else {
6349
+ pm_buffer_append_u8(buffer, byte);
6350
+ }
6351
+ }
6352
+
6353
+ // Read the value of an escape into the buffer.
6354
+ static void
6355
+ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) {
6356
+ switch (peek(parser)) {
6357
+ case '\\': {
6358
+ parser->current.end++;
6359
+ pm_buffer_append_u8(buffer, '\\');
6360
+ return;
6361
+ }
6362
+ case '\'': {
6363
+ parser->current.end++;
6364
+ pm_buffer_append_u8(buffer, '\'');
6365
+ return;
6366
+ }
6367
+ case 'a': {
6368
+ parser->current.end++;
6369
+ pm_buffer_append_u8(buffer, '\a');
6370
+ return;
6371
+ }
6372
+ case 'b': {
6373
+ parser->current.end++;
6374
+ pm_buffer_append_u8(buffer, '\b');
6375
+ return;
6376
+ }
6377
+ case 'e': {
6378
+ parser->current.end++;
6379
+ pm_buffer_append_u8(buffer, '\033');
6380
+ return;
6381
+ }
6382
+ case 'f': {
6383
+ parser->current.end++;
6384
+ pm_buffer_append_u8(buffer, '\f');
6385
+ return;
6386
+ }
6387
+ case 'n': {
6388
+ parser->current.end++;
6389
+ pm_buffer_append_u8(buffer, '\n');
6390
+ return;
6391
+ }
6392
+ case 'r': {
6393
+ parser->current.end++;
6394
+ pm_buffer_append_u8(buffer, '\r');
6395
+ return;
6396
+ }
6397
+ case 's': {
6398
+ parser->current.end++;
6399
+ pm_buffer_append_u8(buffer, ' ');
6400
+ return;
6401
+ }
6402
+ case 't': {
6403
+ parser->current.end++;
6404
+ pm_buffer_append_u8(buffer, '\t');
6405
+ return;
6406
+ }
6407
+ case 'v': {
6408
+ parser->current.end++;
6409
+ pm_buffer_append_u8(buffer, '\v');
6410
+ return;
6411
+ }
6412
+ case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': {
6413
+ uint8_t value = (uint8_t) (*parser->current.end - '0');
6414
+ parser->current.end++;
6415
+
6416
+ if (pm_char_is_octal_digit(peek(parser))) {
6417
+ value = ((uint8_t) (value << 3)) | ((uint8_t) (*parser->current.end - '0'));
6418
+ parser->current.end++;
6419
+
6420
+ if (pm_char_is_octal_digit(peek(parser))) {
6421
+ value = ((uint8_t) (value << 3)) | ((uint8_t) (*parser->current.end - '0'));
6422
+ parser->current.end++;
6423
+ }
6424
+ }
6425
+
6426
+ pm_buffer_append_u8(buffer, value);
6427
+ return;
6428
+ }
6429
+ case 'x': {
6430
+ const uint8_t *start = parser->current.end - 1;
6431
+
6432
+ parser->current.end++;
6433
+ uint8_t byte = peek(parser);
6434
+
6435
+ if (pm_char_is_hexadecimal_digit(byte)) {
6436
+ uint8_t value = escape_hexadecimal_digit(byte);
6437
+ parser->current.end++;
6438
+
6439
+ byte = peek(parser);
6440
+ if (pm_char_is_hexadecimal_digit(byte)) {
6441
+ value = (uint8_t) ((value << 4) | escape_hexadecimal_digit(byte));
6442
+ parser->current.end++;
6443
+ }
6444
+
6445
+ if (flags & PM_ESCAPE_FLAG_REGEXP) {
6446
+ pm_buffer_append_bytes(buffer, start, (size_t) (parser->current.end - start));
6447
+ } else {
6448
+ pm_buffer_append_u8(buffer, value);
6449
+ }
6450
+ } else {
6451
+ pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_HEXADECIMAL);
6452
+ }
6453
+
6454
+ return;
6455
+ }
6456
+ case 'u': {
6457
+ const uint8_t *start = parser->current.end - 1;
6458
+ parser->current.end++;
6459
+
6460
+ if (
6461
+ (parser->current.end + 4 <= parser->end) &&
6462
+ pm_char_is_hexadecimal_digit(parser->current.end[0]) &&
6463
+ pm_char_is_hexadecimal_digit(parser->current.end[1]) &&
6464
+ pm_char_is_hexadecimal_digit(parser->current.end[2]) &&
6465
+ pm_char_is_hexadecimal_digit(parser->current.end[3])
6466
+ ) {
6467
+ uint32_t value = escape_unicode(parser->current.end, 4);
6468
+
6469
+ if (flags & PM_ESCAPE_FLAG_REGEXP) {
6470
+ pm_buffer_append_bytes(buffer, start, (size_t) (parser->current.end + 4 - start));
6471
+ } else {
6472
+ escape_write_unicode(parser, buffer, start, parser->current.end + 4, value);
6473
+ }
6474
+
6475
+ parser->current.end += 4;
6476
+ } else if (peek(parser) == '{') {
6477
+ const uint8_t *unicode_codepoints_start = parser->current.end - 2;
6478
+
6479
+ parser->current.end++;
6480
+ parser->current.end += pm_strspn_whitespace(parser->current.end, parser->end - parser->current.end);
6481
+
6482
+ const uint8_t *extra_codepoints_start = NULL;
6483
+ int codepoints_count = 0;
6484
+
6485
+ while ((parser->current.end < parser->end) && (*parser->current.end != '}')) {
6486
+ const uint8_t *unicode_start = parser->current.end;
6487
+ size_t hexadecimal_length = pm_strspn_hexadecimal_digit(parser->current.end, parser->end - parser->current.end);
6488
+
6489
+ if (hexadecimal_length > 6) {
6490
+ // \u{nnnn} character literal allows only 1-6 hexadecimal digits
6491
+ pm_parser_err(parser, unicode_start, unicode_start + hexadecimal_length, PM_ERR_ESCAPE_INVALID_UNICODE_LONG);
6492
+ } else if (hexadecimal_length == 0) {
6493
+ // there are no hexadecimal characters
6494
+ pm_parser_err(parser, unicode_start, unicode_start + hexadecimal_length, PM_ERR_ESCAPE_INVALID_UNICODE);
6495
+ return;
6496
+ }
6497
+
6498
+ parser->current.end += hexadecimal_length;
6499
+ codepoints_count++;
6500
+ if (flags & PM_ESCAPE_FLAG_SINGLE && codepoints_count == 2) {
6501
+ extra_codepoints_start = unicode_start;
6502
+ }
6503
+
6504
+ if (!(flags & PM_ESCAPE_FLAG_REGEXP)) {
6505
+ uint32_t value = escape_unicode(unicode_start, hexadecimal_length);
6506
+ escape_write_unicode(parser, buffer, unicode_start, parser->current.end, value);
6507
+ }
6508
+
6509
+ parser->current.end += pm_strspn_whitespace(parser->current.end, parser->end - parser->current.end);
6510
+ }
6511
+
6512
+ // ?\u{nnnn} character literal should contain only one codepoint and cannot be like ?\u{nnnn mmmm}
6513
+ if (flags & PM_ESCAPE_FLAG_SINGLE && codepoints_count > 1) {
6514
+ pm_parser_err(parser, extra_codepoints_start, parser->current.end - 1, PM_ERR_ESCAPE_INVALID_UNICODE_LITERAL);
6515
+ }
6516
+
6517
+ if (peek(parser) == '}') {
6518
+ parser->current.end++;
6519
+ } else {
6520
+ pm_parser_err(parser, unicode_codepoints_start, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE_TERM);
6521
+ }
6522
+
6523
+ if (flags & PM_ESCAPE_FLAG_REGEXP) {
6524
+ pm_buffer_append_bytes(buffer, unicode_codepoints_start, (size_t) (parser->current.end - unicode_codepoints_start));
6525
+ }
6526
+ } else {
6527
+ pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_UNICODE);
6528
+ }
6529
+
6530
+ return;
6531
+ }
6532
+ case 'c': {
6533
+ parser->current.end++;
6534
+ if (parser->current.end == parser->end) {
6535
+ pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_CONTROL);
6536
+ return;
6537
+ }
6538
+
6539
+ uint8_t peeked = peek(parser);
6540
+ switch (peeked) {
6541
+ case '?': {
6542
+ parser->current.end++;
6543
+ escape_write_byte(buffer, flags, escape_byte(0x7f, flags));
6544
+ return;
6545
+ }
6546
+ case '\\':
6547
+ if (flags & PM_ESCAPE_FLAG_CONTROL) {
6548
+ pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_CONTROL_REPEAT);
6549
+ return;
6550
+ }
6551
+ parser->current.end++;
6552
+ escape_read(parser, buffer, flags | PM_ESCAPE_FLAG_CONTROL);
6553
+ return;
6554
+ default: {
6555
+ if (!char_is_ascii_printable(peeked)) {
6556
+ pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_CONTROL);
6557
+ return;
6558
+ }
6559
+
6560
+ parser->current.end++;
6561
+ escape_write_byte(buffer, flags, escape_byte(peeked, flags | PM_ESCAPE_FLAG_CONTROL));
6562
+ return;
6563
+ }
6564
+ }
6565
+ }
6566
+ case 'C': {
6567
+ parser->current.end++;
6568
+ if (peek(parser) != '-') {
6569
+ pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_CONTROL);
6570
+ return;
6571
+ }
6572
+
6573
+ parser->current.end++;
6574
+ if (parser->current.end == parser->end) {
6575
+ pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_CONTROL);
6576
+ return;
6577
+ }
6578
+
6579
+ uint8_t peeked = peek(parser);
6580
+ switch (peeked) {
6581
+ case '?': {
6582
+ parser->current.end++;
6583
+ escape_write_byte(buffer, flags, escape_byte(0x7f, flags));
6584
+ return;
6585
+ }
6586
+ case '\\':
6587
+ if (flags & PM_ESCAPE_FLAG_CONTROL) {
6588
+ pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_CONTROL_REPEAT);
6589
+ return;
6590
+ }
6591
+ parser->current.end++;
6592
+ escape_read(parser, buffer, flags | PM_ESCAPE_FLAG_CONTROL);
6593
+ return;
6594
+ default: {
6595
+ if (!char_is_ascii_printable(peeked)) {
6596
+ pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_CONTROL);
6597
+ return;
6598
+ }
6599
+
6600
+ parser->current.end++;
6601
+ escape_write_byte(buffer, flags, escape_byte(peeked, flags | PM_ESCAPE_FLAG_CONTROL));
6602
+ return;
6603
+ }
6604
+ }
6605
+ }
6606
+ case 'M': {
6607
+ parser->current.end++;
6608
+ if (peek(parser) != '-') {
6609
+ pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_META);
6610
+ return;
6611
+ }
6612
+
6613
+ parser->current.end++;
6614
+ if (parser->current.end == parser->end) {
6615
+ pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_META);
6616
+ return;
6617
+ }
6618
+
6619
+ uint8_t peeked = peek(parser);
6620
+ if (peeked == '\\') {
6621
+ if (flags & PM_ESCAPE_FLAG_META) {
6622
+ pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_META_REPEAT);
6623
+ return;
6624
+ }
6625
+ parser->current.end++;
6626
+ escape_read(parser, buffer, flags | PM_ESCAPE_FLAG_META);
6627
+ return;
6628
+ }
6629
+
6630
+ if (!char_is_ascii_printable(peeked)) {
6631
+ pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_META);
6632
+ return;
6633
+ }
6634
+
6635
+ parser->current.end++;
6636
+ escape_write_byte(buffer, flags, escape_byte(peeked, flags | PM_ESCAPE_FLAG_META));
6637
+ return;
6638
+ }
6639
+ case '\r': {
6640
+ if (peek_offset(parser, 1) == '\n') {
6641
+ parser->current.end += 2;
6642
+ pm_buffer_append_u8(buffer, '\n');
6643
+ return;
6644
+ }
6645
+ }
6646
+ /* fallthrough */
6647
+ default: {
6648
+ if (parser->current.end < parser->end) {
6649
+ pm_buffer_append_u8(buffer, *parser->current.end++);
6650
+ }
6651
+ return;
6652
+ }
6653
+ }
6654
+ }
6655
+
6656
+ // This function is responsible for lexing either a character literal or the ?
6657
+ // operator. The supported character literals are described below.
6658
+ //
6659
+ // \a bell, ASCII 07h (BEL)
6660
+ // \b backspace, ASCII 08h (BS)
6661
+ // \t horizontal tab, ASCII 09h (TAB)
6662
+ // \n newline (line feed), ASCII 0Ah (LF)
6663
+ // \v vertical tab, ASCII 0Bh (VT)
6664
+ // \f form feed, ASCII 0Ch (FF)
6665
+ // \r carriage return, ASCII 0Dh (CR)
6666
+ // \e escape, ASCII 1Bh (ESC)
6667
+ // \s space, ASCII 20h (SPC)
6668
+ // \\ backslash
6669
+ // \nnn octal bit pattern, where nnn is 1-3 octal digits ([0-7])
6670
+ // \xnn hexadecimal bit pattern, where nn is 1-2 hexadecimal digits ([0-9a-fA-F])
6671
+ // \unnnn Unicode character, where nnnn is exactly 4 hexadecimal digits ([0-9a-fA-F])
6672
+ // \u{nnnn ...} Unicode character(s), where each nnnn is 1-6 hexadecimal digits ([0-9a-fA-F])
6673
+ // \cx or \C-x control character, where x is an ASCII printable character
6674
+ // \M-x meta character, where x is an ASCII printable character
6675
+ // \M-\C-x meta control character, where x is an ASCII printable character
6676
+ // \M-\cx same as above
6677
+ // \c\M-x same as above
6678
+ // \c? or \C-? delete, ASCII 7Fh (DEL)
6679
+ //
6680
+ static pm_token_type_t
6681
+ lex_question_mark(pm_parser_t *parser) {
6682
+ if (lex_state_end_p(parser)) {
6683
+ lex_state_set(parser, PM_LEX_STATE_BEG);
6684
+ return PM_TOKEN_QUESTION_MARK;
6685
+ }
6686
+
6687
+ if (parser->current.end >= parser->end) {
6688
+ pm_parser_err_current(parser, PM_ERR_INCOMPLETE_QUESTION_MARK);
6689
+ pm_string_shared_init(&parser->current_string, parser->current.start + 1, parser->current.end);
6690
+ return PM_TOKEN_CHARACTER_LITERAL;
6691
+ }
6692
+
6693
+ if (pm_char_is_whitespace(*parser->current.end)) {
6694
+ lex_state_set(parser, PM_LEX_STATE_BEG);
6695
+ return PM_TOKEN_QUESTION_MARK;
6696
+ }
6005
6697
 
6006
6698
  lex_state_set(parser, PM_LEX_STATE_BEG);
6007
6699
 
6008
- if (parser->current.start[1] == '\\') {
6700
+ if (match(parser, '\\')) {
6009
6701
  lex_state_set(parser, PM_LEX_STATE_END);
6010
- parser->current.end += pm_unescape_calculate_difference(parser, parser->current.start + 1, PM_UNESCAPE_ALL, true);
6702
+
6703
+ pm_buffer_t buffer;
6704
+ pm_buffer_init_capacity(&buffer, 3);
6705
+
6706
+ escape_read(parser, &buffer, PM_ESCAPE_FLAG_SINGLE);
6707
+ pm_string_owned_init(&parser->current_string, (uint8_t *) buffer.value, buffer.length);
6708
+
6011
6709
  return PM_TOKEN_CHARACTER_LITERAL;
6012
6710
  } else {
6013
6711
  size_t encoding_width = parser->encoding.char_width(parser->current.end, parser->end - parser->current.end);
@@ -6024,6 +6722,7 @@ lex_question_mark(pm_parser_t *parser) {
6024
6722
  ) {
6025
6723
  lex_state_set(parser, PM_LEX_STATE_END);
6026
6724
  parser->current.end += encoding_width;
6725
+ pm_string_shared_init(&parser->current_string, parser->current.start + 1, parser->current.end);
6027
6726
  return PM_TOKEN_CHARACTER_LITERAL;
6028
6727
  }
6029
6728
  }
@@ -6045,9 +6744,9 @@ lex_at_variable(pm_parser_t *parser) {
6045
6744
  parser->current.end += width;
6046
6745
  }
6047
6746
  } else if (type == PM_TOKEN_CLASS_VARIABLE) {
6048
- pm_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, PM_ERR_INCOMPLETE_VARIABLE_CLASS);
6747
+ pm_parser_err_current(parser, PM_ERR_INCOMPLETE_VARIABLE_CLASS);
6049
6748
  } else {
6050
- pm_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, PM_ERR_INCOMPLETE_VARIABLE_INSTANCE);
6749
+ pm_parser_err_current(parser, PM_ERR_INCOMPLETE_VARIABLE_INSTANCE);
6051
6750
  }
6052
6751
 
6053
6752
  // If we're lexing an embedded variable, then we need to pop back into the
@@ -6070,7 +6769,7 @@ parser_lex_callback(pm_parser_t *parser) {
6070
6769
  // Return a new comment node of the specified type.
6071
6770
  static inline pm_comment_t *
6072
6771
  parser_comment(pm_parser_t *parser, pm_comment_type_t type) {
6073
- pm_comment_t *comment = (pm_comment_t *) malloc(sizeof(pm_comment_t));
6772
+ pm_comment_t *comment = (pm_comment_t *) calloc(sizeof(pm_comment_t), 1);
6074
6773
  if (comment == NULL) return NULL;
6075
6774
 
6076
6775
  *comment = (pm_comment_t) {
@@ -6146,7 +6845,7 @@ lex_embdoc(pm_parser_t *parser) {
6146
6845
  parser_lex_callback(parser);
6147
6846
  }
6148
6847
 
6149
- pm_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, PM_ERR_EMBDOC_TERM);
6848
+ pm_parser_err_current(parser, PM_ERR_EMBDOC_TERM);
6150
6849
 
6151
6850
  comment->end = parser->current.end;
6152
6851
  pm_list_append(&parser->comment_list, (pm_list_node_t *) comment);
@@ -6177,6 +6876,113 @@ parser_flush_heredoc_end(pm_parser_t *parser) {
6177
6876
  parser->heredoc_end = NULL;
6178
6877
  }
6179
6878
 
6879
+ // When we're lexing certain types (strings, symbols, lists, etc.) we have
6880
+ // string content associated with the tokens. For example:
6881
+ //
6882
+ // "foo"
6883
+ //
6884
+ // In this case, the string content is foo. Since there is no escaping, there's
6885
+ // no need to track additional information and the token can be returned as
6886
+ // normal. However, if we have escape sequences:
6887
+ //
6888
+ // "foo\n"
6889
+ //
6890
+ // then the bytes in the string are "f", "o", "o", "\", "n", but we want to
6891
+ // provide our consumers with the string content "f", "o", "o", "\n". In these
6892
+ // cases, when we find the first escape sequence, we initialize a pm_buffer_t
6893
+ // to keep track of the string content. Then in the parser, it will
6894
+ // automatically attach the string content to the node that it belongs to.
6895
+ typedef struct {
6896
+ pm_buffer_t buffer; // Unescaped content gathered so far; only initialized once the first escape sequence is handled.
6897
+ const uint8_t *cursor; // NULL while no escape has been processed; otherwise the source position from which bytes still need to be copied into buffer.
6898
+ } pm_token_buffer_t;
6899
+
6900
+ // Push the given byte into the token buffer.
6901
+ static inline void
6902
+ pm_token_buffer_push(pm_token_buffer_t *token_buffer, uint8_t byte) {
6903
+ pm_buffer_append_u8(&token_buffer->buffer, byte);
6904
+ }
6905
+
6906
+ // When we're about to return from lexing the current token and we know for sure
6907
+ // that we have found an escape sequence, this function is called to copy the
6908
+ // contents of the token buffer into the current string on the parser so that it
6909
+ // can be attached to the correct node.
6910
+ static inline void
6911
+ pm_token_buffer_copy(pm_parser_t *parser, pm_token_buffer_t *token_buffer) {
6912
+ pm_string_owned_init(&parser->current_string, (uint8_t *) token_buffer->buffer.value, token_buffer->buffer.length);
6913
+ }
6914
+
6915
+ // When we're about to return from lexing the current token, we need to flush
6916
+ // all of the content that we have pushed into the buffer into the current
6917
+ // string. If we haven't pushed anything into the buffer, this means that we
6918
+ // never found an escape sequence, so we can directly reference the bounds of
6919
+ // the current string. Either way, at the return of this function it is expected
6920
+ // that parser->current_string is established in such a way that it can be
6921
+ // attached to a node.
6922
+ static void
6923
+ pm_token_buffer_flush(pm_parser_t *parser, pm_token_buffer_t *token_buffer) {
6924
+ if (token_buffer->cursor == NULL) {
6925
+ pm_string_shared_init(&parser->current_string, parser->current.start, parser->current.end);
6926
+ } else {
6927
+ pm_buffer_append_bytes(&token_buffer->buffer, token_buffer->cursor, (size_t) (parser->current.end - token_buffer->cursor));
6928
+ pm_token_buffer_copy(parser, token_buffer);
6929
+ }
6930
+ }
6931
+
6932
+ // When we've found an escape sequence, we need to copy everything up to this
6933
+ // point into the buffer because we're about to provide a string that has
6934
+ // different content than a direct slice of the source.
6935
+ //
6936
+ // It is expected that the parser's current token end will be pointing at one
6937
+ // byte past the backslash that starts the escape sequence.
6938
+ static void
6939
+ pm_token_buffer_escape(pm_parser_t *parser, pm_token_buffer_t *token_buffer) {
6940
+ const uint8_t *start;
6941
+ if (token_buffer->cursor == NULL) {
6942
+ pm_buffer_init_capacity(&token_buffer->buffer, 16);
6943
+ start = parser->current.start;
6944
+ } else {
6945
+ start = token_buffer->cursor;
6946
+ }
6947
+
6948
+ const uint8_t *end = parser->current.end - 1;
6949
+ pm_buffer_append_bytes(&token_buffer->buffer, start, (size_t) (end - start));
6950
+ }
6951
+
6952
+ // Effectively the same thing as pm_strspn_inline_whitespace, but in the case of
6953
+ // a tilde heredoc expands out tab characters to the nearest tab boundaries.
6954
+ static inline size_t
6955
+ pm_heredoc_strspn_inline_whitespace(pm_parser_t *parser, const uint8_t **cursor, pm_heredoc_indent_t indent) {
6956
+ size_t whitespace = 0;
6957
+
6958
+ switch (indent) {
6959
+ case PM_HEREDOC_INDENT_NONE:
6960
+ // Do nothing, we can't match a terminator with
6961
+ // indentation and there's no need to calculate common
6962
+ // whitespace.
6963
+ break;
6964
+ case PM_HEREDOC_INDENT_DASH:
6965
+ // Skip past inline whitespace.
6966
+ *cursor += pm_strspn_inline_whitespace(*cursor, parser->end - *cursor);
6967
+ break;
6968
+ case PM_HEREDOC_INDENT_TILDE:
6969
+ // Skip past inline whitespace and calculate common
6970
+ // whitespace.
6971
+ while (*cursor < parser->end && pm_char_is_inline_whitespace(**cursor)) {
6972
+ if (**cursor == '\t') {
6973
+ whitespace = (whitespace / PM_TAB_WHITESPACE_SIZE + 1) * PM_TAB_WHITESPACE_SIZE;
6974
+ } else {
6975
+ whitespace++;
6976
+ }
6977
+ (*cursor)++;
6978
+ }
6979
+
6980
+ break;
6981
+ }
6982
+
6983
+ return whitespace;
6984
+ }
6985
+
6180
6986
  // This is a convenience macro that will set the current token type, call the
6181
6987
  // lex callback, and then return from the parser_lex function.
6182
6988
  #define LEX(token_type) parser->current.type = token_type; parser_lex_callback(parser); return
@@ -6301,12 +7107,16 @@ parser_lex(pm_parser_t *parser) {
6301
7107
  parser->current.type = PM_TOKEN_COMMENT;
6302
7108
  parser_lex_callback(parser);
6303
7109
 
6304
- if (parser->current.start == parser->encoding_comment_start) {
6305
- parser_lex_encoding_comment(parser);
6306
- }
7110
+ // Here, parse the comment to see if it's a magic comment
7111
+ // and potentially change state on the parser.
7112
+ if (!parser_lex_magic_comment(parser, semantic_token_seen) && (parser->current.start == parser->encoding_comment_start)) {
7113
+ ptrdiff_t length = parser->current.end - parser->current.start;
6307
7114
 
6308
- if (!semantic_token_seen) {
6309
- parser_lex_frozen_string_literal_comment(parser);
7115
+ // If we didn't find a magic comment within the first
7116
+ // pass and we're at the start of the file, then we need
7117
+ // to do another pass to potentially find other patterns
7118
+ // for encoding comments.
7119
+ if (length >= 10) parser_lex_magic_comment_encoding(parser);
6310
7120
  }
6311
7121
 
6312
7122
  lexed_comment = true;
@@ -6588,7 +7398,7 @@ parser_lex(pm_parser_t *parser) {
6588
7398
  pm_token_type_t type = PM_TOKEN_STAR;
6589
7399
 
6590
7400
  if (lex_state_spcarg_p(parser, space_seen)) {
6591
- pm_diagnostic_list_append(&parser->warning_list, parser->current.start, parser->current.end, PM_WARN_AMBIGUOUS_PREFIX_STAR);
7401
+ pm_parser_warn_token(parser, &parser->current, PM_WARN_AMBIGUOUS_PREFIX_STAR);
6592
7402
  type = PM_TOKEN_USTAR;
6593
7403
  } else if (lex_state_beg_p(parser)) {
6594
7404
  type = PM_TOKEN_USTAR;
@@ -6626,7 +7436,7 @@ parser_lex(pm_parser_t *parser) {
6626
7436
 
6627
7437
  // = => =~ == === =begin
6628
7438
  case '=':
6629
- if (current_token_starts_line(parser) && memcmp(peek_string(parser, 5), "begin", 5) == 0 && pm_char_is_whitespace(peek_offset(parser, 5))) {
7439
+ if (current_token_starts_line(parser) && (parser->current.end + 5 <= parser->end) && memcmp(parser->current.end, "begin", 5) == 0 && pm_char_is_whitespace(peek_offset(parser, 5))) {
6630
7440
  pm_token_type_t type = lex_embdoc(parser);
6631
7441
 
6632
7442
  if (type == PM_TOKEN_EOF) {
@@ -6720,7 +7530,8 @@ parser_lex(pm_parser_t *parser) {
6720
7530
  .ident_length = ident_length,
6721
7531
  .next_start = parser->current.end,
6722
7532
  .quote = quote,
6723
- .indent = indent
7533
+ .indent = indent,
7534
+ .common_whitespace = (size_t) -1
6724
7535
  }
6725
7536
  });
6726
7537
 
@@ -6732,7 +7543,7 @@ parser_lex(pm_parser_t *parser) {
6732
7543
  // this is not a valid heredoc declaration. In this case we
6733
7544
  // will add an error, but we will still return a heredoc
6734
7545
  // start.
6735
- pm_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, PM_ERR_EMBDOC_TERM);
7546
+ pm_parser_err_current(parser, PM_ERR_EMBDOC_TERM);
6736
7547
  body_start = parser->end;
6737
7548
  } else {
6738
7549
  // Otherwise, we want to indicate that the body of the
@@ -6925,12 +7736,7 @@ parser_lex(pm_parser_t *parser) {
6925
7736
 
6926
7737
  bool spcarg = lex_state_spcarg_p(parser, space_seen);
6927
7738
  if (spcarg) {
6928
- pm_diagnostic_list_append(
6929
- &parser->warning_list,
6930
- parser->current.start,
6931
- parser->current.end,
6932
- PM_WARN_AMBIGUOUS_FIRST_ARGUMENT_PLUS
6933
- );
7739
+ pm_parser_warn_token(parser, &parser->current, PM_WARN_AMBIGUOUS_FIRST_ARGUMENT_PLUS);
6934
7740
  }
6935
7741
 
6936
7742
  if (lex_state_beg_p(parser) || spcarg) {
@@ -6974,12 +7780,7 @@ parser_lex(pm_parser_t *parser) {
6974
7780
 
6975
7781
  bool spcarg = lex_state_spcarg_p(parser, space_seen);
6976
7782
  if (spcarg) {
6977
- pm_diagnostic_list_append(
6978
- &parser->warning_list,
6979
- parser->current.start,
6980
- parser->current.end,
6981
- PM_WARN_AMBIGUOUS_FIRST_ARGUMENT_MINUS
6982
- );
7783
+ pm_parser_warn_token(parser, &parser->current, PM_WARN_AMBIGUOUS_FIRST_ARGUMENT_MINUS);
6983
7784
  }
6984
7785
 
6985
7786
  if (lex_state_beg_p(parser) || spcarg) {
@@ -7076,7 +7877,7 @@ parser_lex(pm_parser_t *parser) {
7076
7877
  }
7077
7878
 
7078
7879
  if (lex_state_spcarg_p(parser, space_seen)) {
7079
- pm_diagnostic_list_append(&parser->warning_list, parser->current.start, parser->current.end, PM_WARN_AMBIGUOUS_SLASH);
7880
+ pm_parser_warn_token(parser, &parser->current, PM_WARN_AMBIGUOUS_SLASH);
7080
7881
  lex_mode_push_regexp(parser, '\0', '/');
7081
7882
  LEX(PM_TOKEN_REGEXP_BEGIN);
7082
7883
  }
@@ -7116,7 +7917,7 @@ parser_lex(pm_parser_t *parser) {
7116
7917
  // operator because we don't want to move into the string
7117
7918
  // lex mode unnecessarily.
7118
7919
  if ((lex_state_beg_p(parser) || lex_state_arg_p(parser)) && (parser->current.end >= parser->end)) {
7119
- pm_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, PM_ERR_INVALID_PERCENT);
7920
+ pm_parser_err_current(parser, PM_ERR_INVALID_PERCENT);
7120
7921
  LEX(PM_TOKEN_PERCENT);
7121
7922
  }
7122
7923
 
@@ -7149,7 +7950,7 @@ parser_lex(pm_parser_t *parser) {
7149
7950
  // validate that here.
7150
7951
  uint8_t delimiter = peek_offset(parser, 1);
7151
7952
  if (delimiter >= 0x80 || parser->encoding.alnum_char(&delimiter, 1)) {
7152
- pm_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, PM_ERR_INVALID_PERCENT);
7953
+ pm_parser_err_current(parser, PM_ERR_INVALID_PERCENT);
7153
7954
  goto lex_next_token;
7154
7955
  }
7155
7956
 
@@ -7249,7 +8050,7 @@ parser_lex(pm_parser_t *parser) {
7249
8050
  // unparseable. In this case we'll just drop it from the parser
7250
8051
  // and skip past it and hope that the next token is something
7251
8052
  // that we can parse.
7252
- pm_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, PM_ERR_INVALID_PERCENT);
8053
+ pm_parser_err_current(parser, PM_ERR_INVALID_PERCENT);
7253
8054
  goto lex_next_token;
7254
8055
  }
7255
8056
  }
@@ -7285,7 +8086,7 @@ parser_lex(pm_parser_t *parser) {
7285
8086
  // token as we've exhausted all of the other options. We'll skip past
7286
8087
  // it and return the next token.
7287
8088
  if (!width) {
7288
- pm_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, PM_ERR_INVALID_TOKEN);
8089
+ pm_parser_err_current(parser, PM_ERR_INVALID_TOKEN);
7289
8090
  goto lex_next_token;
7290
8091
  }
7291
8092
 
@@ -7351,7 +8152,7 @@ parser_lex(pm_parser_t *parser) {
7351
8152
  }
7352
8153
  }
7353
8154
  }
7354
- case PM_LEX_LIST:
8155
+ case PM_LEX_LIST: {
7355
8156
  if (parser->next_start != NULL) {
7356
8157
  parser->current.end = parser->next_start;
7357
8158
  parser->next_start = NULL;
@@ -7394,6 +8195,10 @@ parser_lex(pm_parser_t *parser) {
7394
8195
  const uint8_t *breakpoints = lex_mode->as.list.breakpoints;
7395
8196
  const uint8_t *breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
7396
8197
 
8198
+ // If we haven't found an escape yet, then this buffer will be
8199
+ // unallocated since we can refer directly to the source string.
8200
+ pm_token_buffer_t token_buffer = { 0 };
8201
+
7397
8202
  while (breakpoint != NULL) {
7398
8203
  // If we hit a null byte, skip directly past it.
7399
8204
  if (*breakpoint == '\0') {
@@ -7405,16 +8210,18 @@ parser_lex(pm_parser_t *parser) {
7405
8210
  // now, so we can return an element of the list.
7406
8211
  if (pm_char_is_whitespace(*breakpoint)) {
7407
8212
  parser->current.end = breakpoint;
8213
+ pm_token_buffer_flush(parser, &token_buffer);
7408
8214
  LEX(PM_TOKEN_STRING_CONTENT);
7409
8215
  }
7410
8216
 
7411
- //If we hit the terminator, we need to check which token to
8217
+ // If we hit the terminator, we need to check which token to
7412
8218
  // return.
7413
8219
  if (*breakpoint == lex_mode->as.list.terminator) {
7414
8220
  // If this terminator doesn't actually close the list, then
7415
8221
  // we need to continue on past it.
7416
8222
  if (lex_mode->as.list.nesting > 0) {
7417
- breakpoint = pm_strpbrk(parser, breakpoint + 1, breakpoints, parser->end - (breakpoint + 1));
8223
+ parser->current.end = breakpoint + 1;
8224
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
7418
8225
  lex_mode->as.list.nesting--;
7419
8226
  continue;
7420
8227
  }
@@ -7423,6 +8230,7 @@ parser_lex(pm_parser_t *parser) {
7423
8230
  // past content, then we can return a list node.
7424
8231
  if (breakpoint > parser->current.start) {
7425
8232
  parser->current.end = breakpoint;
8233
+ pm_token_buffer_flush(parser, &token_buffer);
7426
8234
  LEX(PM_TOKEN_STRING_CONTENT);
7427
8235
  }
7428
8236
 
@@ -7438,59 +8246,109 @@ parser_lex(pm_parser_t *parser) {
7438
8246
  // literally. In this case we'll skip past the next character
7439
8247
  // and find the next breakpoint.
7440
8248
  if (*breakpoint == '\\') {
7441
- pm_unescape_type_t unescape_type = lex_mode->as.list.interpolation ? PM_UNESCAPE_ALL : PM_UNESCAPE_MINIMAL;
7442
- size_t difference = pm_unescape_calculate_difference(parser, breakpoint, unescape_type, false);
7443
- if (difference == 0) {
7444
- // we're at the end of the file
8249
+ parser->current.end = breakpoint + 1;
8250
+
8251
+ // If we've hit the end of the file, then break out of the
8252
+ // loop by setting the breakpoint to NULL.
8253
+ if (parser->current.end == parser->end) {
7445
8254
  breakpoint = NULL;
7446
8255
  continue;
7447
8256
  }
7448
8257
 
7449
- // If the result is an escaped newline ...
7450
- if (breakpoint[difference - 1] == '\n') {
7451
- if (parser->heredoc_end) {
7452
- // ... if we are on the same line as a heredoc, flush the heredoc and
7453
- // continue parsing after heredoc_end.
7454
- parser->current.end = breakpoint + difference;
7455
- parser_flush_heredoc_end(parser);
7456
- LEX(PM_TOKEN_STRING_CONTENT);
7457
- } else {
7458
- // ... else track the newline.
7459
- pm_newline_list_append(&parser->newline_list, breakpoint + difference - 1);
7460
- }
8258
+ pm_token_buffer_escape(parser, &token_buffer);
8259
+ uint8_t peeked = peek(parser);
8260
+
8261
+ switch (peeked) {
8262
+ case ' ':
8263
+ case '\f':
8264
+ case '\t':
8265
+ case '\v':
8266
+ case '\\':
8267
+ pm_token_buffer_push(&token_buffer, peeked);
8268
+ parser->current.end++;
8269
+ break;
8270
+ case '\r':
8271
+ parser->current.end++;
8272
+ if (peek(parser) != '\n') {
8273
+ pm_token_buffer_push(&token_buffer, '\r');
8274
+ break;
8275
+ }
8276
+ /* fallthrough */
8277
+ case '\n':
8278
+ pm_token_buffer_push(&token_buffer, '\n');
8279
+
8280
+ if (parser->heredoc_end) {
8281
+ // ... if we are on the same line as a heredoc,
8282
+ // flush the heredoc and continue parsing after
8283
+ // heredoc_end.
8284
+ parser_flush_heredoc_end(parser);
8285
+ pm_token_buffer_copy(parser, &token_buffer);
8286
+ LEX(PM_TOKEN_STRING_CONTENT);
8287
+ } else {
8288
+ // ... else track the newline.
8289
+ pm_newline_list_append(&parser->newline_list, parser->current.end);
8290
+ }
8291
+
8292
+ parser->current.end++;
8293
+ break;
8294
+ default:
8295
+ if (peeked == lex_mode->as.list.incrementor || peeked == lex_mode->as.list.terminator) {
8296
+ pm_token_buffer_push(&token_buffer, peeked);
8297
+ parser->current.end++;
8298
+ } else if (lex_mode->as.list.interpolation) {
8299
+ escape_read(parser, &token_buffer.buffer, PM_ESCAPE_FLAG_NONE);
8300
+ } else {
8301
+ pm_token_buffer_push(&token_buffer, '\\');
8302
+ pm_token_buffer_push(&token_buffer, peeked);
8303
+ parser->current.end++;
8304
+ }
8305
+
8306
+ break;
7461
8307
  }
7462
8308
 
7463
- breakpoint = pm_strpbrk(parser, breakpoint + difference, breakpoints, parser->end - (breakpoint + difference));
8309
+ token_buffer.cursor = parser->current.end;
8310
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
7464
8311
  continue;
7465
8312
  }
7466
8313
 
7467
8314
  // If we hit a #, then we will attempt to lex interpolation.
7468
8315
  if (*breakpoint == '#') {
7469
8316
  pm_token_type_t type = lex_interpolation(parser, breakpoint);
7470
- if (type != PM_TOKEN_NOT_PROVIDED) {
7471
- LEX(type);
8317
+
8318
+ if (type == PM_TOKEN_NOT_PROVIDED) {
8319
+ // If we haven't returned at this point then we had something
8320
+ // that looked like an interpolated class or instance variable
8321
+ // like "#@" but wasn't actually. In this case we'll just skip
8322
+ // to the next breakpoint.
8323
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
8324
+ continue;
7472
8325
  }
7473
8326
 
7474
- // If we haven't returned at this point then we had something
7475
- // that looked like an interpolated class or instance variable
7476
- // like "#@" but wasn't actually. In this case we'll just skip
7477
- // to the next breakpoint.
7478
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
7479
- continue;
8327
+ if (type == PM_TOKEN_STRING_CONTENT) {
8328
+ pm_token_buffer_flush(parser, &token_buffer);
8329
+ }
8330
+
8331
+ LEX(type);
7480
8332
  }
7481
8333
 
7482
8334
  // If we've hit the incrementor, then we need to skip past it
7483
8335
  // and find the next breakpoint.
7484
8336
  assert(*breakpoint == lex_mode->as.list.incrementor);
7485
- breakpoint = pm_strpbrk(parser, breakpoint + 1, breakpoints, parser->end - (breakpoint + 1));
8337
+ parser->current.end = breakpoint + 1;
8338
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
7486
8339
  lex_mode->as.list.nesting++;
7487
8340
  continue;
7488
8341
  }
7489
8342
 
7490
- // If we were unable to find a breakpoint, then this token hits the end of
7491
- // the file.
7492
- LEX(PM_TOKEN_EOF);
8343
+ if (parser->current.end > parser->current.start) {
8344
+ pm_token_buffer_flush(parser, &token_buffer);
8345
+ LEX(PM_TOKEN_STRING_CONTENT);
8346
+ }
7493
8347
 
8348
+ // If we were unable to find a breakpoint, then this token hits the
8349
+ // end of the file.
8350
+ LEX(PM_TOKEN_EOF);
8351
+ }
7494
8352
  case PM_LEX_REGEXP: {
7495
8353
  // First, we'll set to start of this token to be the current end.
7496
8354
  if (parser->next_start == NULL) {
@@ -7515,11 +8373,13 @@ parser_lex(pm_parser_t *parser) {
7515
8373
  // characters.
7516
8374
  const uint8_t *breakpoints = lex_mode->as.regexp.breakpoints;
7517
8375
  const uint8_t *breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
8376
+ pm_token_buffer_t token_buffer = { 0 };
7518
8377
 
7519
8378
  while (breakpoint != NULL) {
7520
8379
  // If we hit a null byte, skip directly past it.
7521
8380
  if (*breakpoint == '\0') {
7522
- breakpoint = pm_strpbrk(parser, breakpoint + 1, breakpoints, parser->end - (breakpoint + 1));
8381
+ parser->current.end = breakpoint + 1;
8382
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
7523
8383
  continue;
7524
8384
  }
7525
8385
 
@@ -7540,7 +8400,8 @@ parser_lex(pm_parser_t *parser) {
7540
8400
  if (lex_mode->as.regexp.terminator != '\n') {
7541
8401
  // If the terminator is not a newline, then we can set
7542
8402
  // the next breakpoint and continue.
7543
- breakpoint = pm_strpbrk(parser, breakpoint + 1, breakpoints, parser->end - (breakpoint + 1));
8403
+ parser->current.end = breakpoint + 1;
8404
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
7544
8405
  continue;
7545
8406
  }
7546
8407
  }
@@ -7549,7 +8410,8 @@ parser_lex(pm_parser_t *parser) {
7549
8410
  // token to return.
7550
8411
  if (*breakpoint == lex_mode->as.regexp.terminator) {
7551
8412
  if (lex_mode->as.regexp.nesting > 0) {
7552
- breakpoint = pm_strpbrk(parser, breakpoint + 1, breakpoints, parser->end - (breakpoint + 1));
8413
+ parser->current.end = breakpoint + 1;
8414
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
7553
8415
  lex_mode->as.regexp.nesting--;
7554
8416
  continue;
7555
8417
  }
@@ -7559,11 +8421,12 @@ parser_lex(pm_parser_t *parser) {
7559
8421
  // first.
7560
8422
  if (breakpoint > parser->current.start) {
7561
8423
  parser->current.end = breakpoint;
8424
+ pm_token_buffer_flush(parser, &token_buffer);
7562
8425
  LEX(PM_TOKEN_STRING_CONTENT);
7563
8426
  }
7564
8427
 
7565
- // Since we've hit the terminator of the regular expression, we now
7566
- // need to parse the options.
8428
+ // Since we've hit the terminator of the regular expression,
8429
+ // we now need to parse the options.
7567
8430
  parser->current.end = breakpoint + 1;
7568
8431
  parser->current.end += pm_strspn_regexp_option(parser->current.end, parser->end - parser->current.end);
7569
8432
 
@@ -7576,56 +8439,103 @@ parser_lex(pm_parser_t *parser) {
7576
8439
  // literally. In this case we'll skip past the next character
7577
8440
  // and find the next breakpoint.
7578
8441
  if (*breakpoint == '\\') {
7579
- size_t difference = pm_unescape_calculate_difference(parser, breakpoint, PM_UNESCAPE_ALL, false);
7580
- if (difference == 0) {
7581
- // we're at the end of the file
8442
+ parser->current.end = breakpoint + 1;
8443
+
8444
+ // If we've hit the end of the file, then break out of the
8445
+ // loop by setting the breakpoint to NULL.
8446
+ if (parser->current.end == parser->end) {
7582
8447
  breakpoint = NULL;
7583
8448
  continue;
7584
8449
  }
7585
8450
 
7586
- // If the result is an escaped newline ...
7587
- if (breakpoint[difference - 1] == '\n') {
7588
- if (parser->heredoc_end) {
7589
- // ... if we are on the same line as a heredoc, flush the heredoc and
7590
- // continue parsing after heredoc_end.
7591
- parser->current.end = breakpoint + difference;
7592
- parser_flush_heredoc_end(parser);
7593
- LEX(PM_TOKEN_STRING_CONTENT);
7594
- } else {
7595
- // ... else track the newline.
7596
- pm_newline_list_append(&parser->newline_list, breakpoint + difference - 1);
7597
- }
8451
+ pm_token_buffer_escape(parser, &token_buffer);
8452
+ uint8_t peeked = peek(parser);
8453
+
8454
+ switch (peeked) {
8455
+ case '\r':
8456
+ parser->current.end++;
8457
+ if (peek(parser) != '\n') {
8458
+ pm_token_buffer_push(&token_buffer, '\\');
8459
+ pm_token_buffer_push(&token_buffer, '\r');
8460
+ break;
8461
+ }
8462
+ /* fallthrough */
8463
+ case '\n':
8464
+ if (parser->heredoc_end) {
8465
+ // ... if we are on the same line as a heredoc,
8466
+ // flush the heredoc and continue parsing after
8467
+ // heredoc_end.
8468
+ parser_flush_heredoc_end(parser);
8469
+ pm_token_buffer_copy(parser, &token_buffer);
8470
+ LEX(PM_TOKEN_STRING_CONTENT);
8471
+ } else {
8472
+ // ... else track the newline.
8473
+ pm_newline_list_append(&parser->newline_list, parser->current.end);
8474
+ }
8475
+
8476
+ parser->current.end++;
8477
+ break;
8478
+ case 'c':
8479
+ case 'C':
8480
+ case 'M':
8481
+ case 'u':
8482
+ case 'x':
8483
+ escape_read(parser, &token_buffer.buffer, PM_ESCAPE_FLAG_REGEXP);
8484
+ break;
8485
+ default:
8486
+ if (lex_mode->as.regexp.terminator == '/' && peeked == '/') {
8487
+ pm_token_buffer_push(&token_buffer, peeked);
8488
+ parser->current.end++;
8489
+ break;
8490
+ }
8491
+
8492
+ if (peeked < 0x80) pm_token_buffer_push(&token_buffer, '\\');
8493
+ pm_token_buffer_push(&token_buffer, peeked);
8494
+ parser->current.end++;
8495
+ break;
7598
8496
  }
7599
8497
 
7600
- breakpoint = pm_strpbrk(parser, breakpoint + difference, breakpoints, parser->end - (breakpoint + difference));
8498
+ token_buffer.cursor = parser->current.end;
8499
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
7601
8500
  continue;
7602
8501
  }
7603
8502
 
7604
8503
  // If we hit a #, then we will attempt to lex interpolation.
7605
8504
  if (*breakpoint == '#') {
7606
8505
  pm_token_type_t type = lex_interpolation(parser, breakpoint);
7607
- if (type != PM_TOKEN_NOT_PROVIDED) {
7608
- LEX(type);
8506
+
8507
+ if (type == PM_TOKEN_NOT_PROVIDED) {
8508
+ // If we haven't returned at this point then we had
8509
+ // something that looked like an interpolated class or
8510
+ // instance variable like "#@" but wasn't actually. In
8511
+ // this case we'll just skip to the next breakpoint.
8512
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
8513
+ continue;
7609
8514
  }
7610
8515
 
7611
- // If we haven't returned at this point then we had
7612
- // something that looked like an interpolated class or
7613
- // instance variable like "#@" but wasn't actually. In this
7614
- // case we'll just skip to the next breakpoint.
7615
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
7616
- continue;
8516
+ if (type == PM_TOKEN_STRING_CONTENT) {
8517
+ pm_token_buffer_flush(parser, &token_buffer);
8518
+ }
8519
+
8520
+ LEX(type);
7617
8521
  }
7618
8522
 
7619
8523
  // If we've hit the incrementor, then we need to skip past it
7620
8524
  // and find the next breakpoint.
7621
8525
  assert(*breakpoint == lex_mode->as.regexp.incrementor);
7622
- breakpoint = pm_strpbrk(parser, breakpoint + 1, breakpoints, parser->end - (breakpoint + 1));
8526
+ parser->current.end = breakpoint + 1;
8527
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
7623
8528
  lex_mode->as.regexp.nesting++;
7624
8529
  continue;
7625
8530
  }
7626
8531
 
7627
- // At this point, the breakpoint is NULL which means we were unable to
7628
- // find anything before the end of the file.
8532
+ if (parser->current.end > parser->current.start) {
8533
+ pm_token_buffer_flush(parser, &token_buffer);
8534
+ LEX(PM_TOKEN_STRING_CONTENT);
8535
+ }
8536
+
8537
+ // If we were unable to find a breakpoint, then this token hits the
8538
+ // end of the file.
7629
8539
  LEX(PM_TOKEN_EOF);
7630
8540
  }
7631
8541
  case PM_LEX_STRING: {
@@ -7646,30 +8556,34 @@ parser_lex(pm_parser_t *parser) {
7646
8556
 
7647
8557
  // These are the places where we need to split up the content of the
7648
8558
  // string. We'll use strpbrk to find the first of these characters.
7649
- const uint8_t *breakpoints = parser->lex_modes.current->as.string.breakpoints;
8559
+ pm_lex_mode_t *lex_mode = parser->lex_modes.current;
8560
+ const uint8_t *breakpoints = lex_mode->as.string.breakpoints;
7650
8561
  const uint8_t *breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
7651
8562
 
8563
+ // If we haven't found an escape yet, then this buffer will be
8564
+ // unallocated since we can refer directly to the source string.
8565
+ pm_token_buffer_t token_buffer = { 0 };
8566
+
7652
8567
  while (breakpoint != NULL) {
7653
8568
  // If we hit the incrementor, then we'll increment the nesting and
7654
8569
  // continue lexing.
7655
- if (
7656
- parser->lex_modes.current->as.string.incrementor != '\0' &&
7657
- *breakpoint == parser->lex_modes.current->as.string.incrementor
7658
- ) {
7659
- parser->lex_modes.current->as.string.nesting++;
7660
- breakpoint = pm_strpbrk(parser, breakpoint + 1, breakpoints, parser->end - (breakpoint + 1));
8570
+ if (lex_mode->as.string.incrementor != '\0' && *breakpoint == lex_mode->as.string.incrementor) {
8571
+ lex_mode->as.string.nesting++;
8572
+ parser->current.end = breakpoint + 1;
8573
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
7661
8574
  continue;
7662
8575
  }
7663
8576
 
7664
8577
  // Note that we have to check the terminator here first because we could
7665
8578
  // potentially be parsing a % string that has a # character as the
7666
8579
  // terminator.
7667
- if (*breakpoint == parser->lex_modes.current->as.string.terminator) {
8580
+ if (*breakpoint == lex_mode->as.string.terminator) {
7668
8581
  // If this terminator doesn't actually close the string, then we need
7669
8582
  // to continue on past it.
7670
- if (parser->lex_modes.current->as.string.nesting > 0) {
7671
- breakpoint = pm_strpbrk(parser, breakpoint + 1, breakpoints, parser->end - (breakpoint + 1));
7672
- parser->lex_modes.current->as.string.nesting--;
8583
+ if (lex_mode->as.string.nesting > 0) {
8584
+ parser->current.end = breakpoint + 1;
8585
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
8586
+ lex_mode->as.string.nesting--;
7673
8587
  continue;
7674
8588
  }
7675
8589
 
@@ -7677,6 +8591,7 @@ parser_lex(pm_parser_t *parser) {
7677
8591
  // then we need to return that content as string content first.
7678
8592
  if (breakpoint > parser->current.start) {
7679
8593
  parser->current.end = breakpoint;
8594
+ pm_token_buffer_flush(parser, &token_buffer);
7680
8595
  LEX(PM_TOKEN_STRING_CONTENT);
7681
8596
  }
7682
8597
 
@@ -7690,11 +8605,7 @@ parser_lex(pm_parser_t *parser) {
7690
8605
  parser->current.end = breakpoint + 1;
7691
8606
  }
7692
8607
 
7693
- if (
7694
- parser->lex_modes.current->as.string.label_allowed &&
7695
- (peek(parser) == ':') &&
7696
- (peek_offset(parser, 1) != ':')
7697
- ) {
8608
+ if (lex_mode->as.string.label_allowed && (peek(parser) == ':') && (peek_offset(parser, 1) != ':')) {
7698
8609
  parser->current.end++;
7699
8610
  lex_state_set(parser, PM_LEX_STATE_ARG | PM_LEX_STATE_LABELED);
7700
8611
  lex_mode_pop(parser);
@@ -7712,11 +8623,13 @@ parser_lex(pm_parser_t *parser) {
7712
8623
  if (*breakpoint == '\n') {
7713
8624
  if (parser->heredoc_end == NULL) {
7714
8625
  pm_newline_list_append(&parser->newline_list, breakpoint);
7715
- breakpoint = pm_strpbrk(parser, breakpoint + 1, breakpoints, parser->end - (breakpoint + 1));
8626
+ parser->current.end = breakpoint + 1;
8627
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
7716
8628
  continue;
7717
8629
  } else {
7718
8630
  parser->current.end = breakpoint + 1;
7719
8631
  parser_flush_heredoc_end(parser);
8632
+ pm_token_buffer_flush(parser, &token_buffer);
7720
8633
  LEX(PM_TOKEN_STRING_CONTENT);
7721
8634
  }
7722
8635
  }
@@ -7724,58 +8637,110 @@ parser_lex(pm_parser_t *parser) {
7724
8637
  switch (*breakpoint) {
7725
8638
  case '\0':
7726
8639
  // Skip directly past the null character.
7727
- breakpoint = pm_strpbrk(parser, breakpoint + 1, breakpoints, parser->end - (breakpoint + 1));
8640
+ parser->current.end = breakpoint + 1;
8641
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
7728
8642
  break;
7729
8643
  case '\\': {
7730
- // If we hit escapes, then we need to treat the next token
7731
- // literally. In this case we'll skip past the next character and
7732
- // find the next breakpoint.
7733
- pm_unescape_type_t unescape_type = parser->lex_modes.current->as.string.interpolation ? PM_UNESCAPE_ALL : PM_UNESCAPE_MINIMAL;
7734
- size_t difference = pm_unescape_calculate_difference(parser, breakpoint, unescape_type, false);
7735
- if (difference == 0) {
7736
- // we're at the end of the file
8644
+ // Here we hit escapes.
8645
+ parser->current.end = breakpoint + 1;
8646
+
8647
+ // If we've hit the end of the file, then break out of
8648
+ // the loop by setting the breakpoint to NULL.
8649
+ if (parser->current.end == parser->end) {
7737
8650
  breakpoint = NULL;
7738
- break;
8651
+ continue;
7739
8652
  }
7740
8653
 
7741
- // If the result is an escaped newline ...
7742
- if (breakpoint[difference - 1] == '\n') {
7743
- if (parser->heredoc_end) {
7744
- // ... if we are on the same line as a heredoc, flush the heredoc and
7745
- // continue parsing after heredoc_end.
7746
- parser->current.end = breakpoint + difference;
7747
- parser_flush_heredoc_end(parser);
7748
- LEX(PM_TOKEN_STRING_CONTENT);
7749
- } else {
7750
- // ... else track the newline.
7751
- pm_newline_list_append(&parser->newline_list, breakpoint + difference - 1);
7752
- }
8654
+ pm_token_buffer_escape(parser, &token_buffer);
8655
+ uint8_t peeked = peek(parser);
8656
+
8657
+ switch (peeked) {
8658
+ case '\\':
8659
+ pm_token_buffer_push(&token_buffer, '\\');
8660
+ parser->current.end++;
8661
+ break;
8662
+ case '\r':
8663
+ parser->current.end++;
8664
+ if (peek(parser) != '\n') {
8665
+ if (!lex_mode->as.string.interpolation) {
8666
+ pm_token_buffer_push(&token_buffer, '\\');
8667
+ }
8668
+ pm_token_buffer_push(&token_buffer, '\r');
8669
+ break;
8670
+ }
8671
+ /* fallthrough */
8672
+ case '\n':
8673
+ if (!lex_mode->as.string.interpolation) {
8674
+ pm_token_buffer_push(&token_buffer, '\\');
8675
+ pm_token_buffer_push(&token_buffer, '\n');
8676
+ }
8677
+
8678
+ if (parser->heredoc_end) {
8679
+ // ... if we are on the same line as a heredoc,
8680
+ // flush the heredoc and continue parsing after
8681
+ // heredoc_end.
8682
+ parser_flush_heredoc_end(parser);
8683
+ pm_token_buffer_copy(parser, &token_buffer);
8684
+ LEX(PM_TOKEN_STRING_CONTENT);
8685
+ } else {
8686
+ // ... else track the newline.
8687
+ pm_newline_list_append(&parser->newline_list, parser->current.end);
8688
+ }
8689
+
8690
+ parser->current.end++;
8691
+ break;
8692
+ default:
8693
+ if (lex_mode->as.string.incrementor != '\0' && peeked == lex_mode->as.string.incrementor) {
8694
+ pm_token_buffer_push(&token_buffer, peeked);
8695
+ parser->current.end++;
8696
+ } else if (lex_mode->as.string.terminator != '\0' && peeked == lex_mode->as.string.terminator) {
8697
+ pm_token_buffer_push(&token_buffer, peeked);
8698
+ parser->current.end++;
8699
+ } else if (lex_mode->as.string.interpolation) {
8700
+ escape_read(parser, &token_buffer.buffer, PM_ESCAPE_FLAG_NONE);
8701
+ } else {
8702
+ pm_token_buffer_push(&token_buffer, '\\');
8703
+ pm_token_buffer_push(&token_buffer, peeked);
8704
+ parser->current.end++;
8705
+ }
8706
+
8707
+ break;
7753
8708
  }
7754
8709
 
7755
- breakpoint = pm_strpbrk(parser, breakpoint + difference, breakpoints, parser->end - (breakpoint + difference));
8710
+ token_buffer.cursor = parser->current.end;
8711
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
7756
8712
  break;
7757
8713
  }
7758
8714
  case '#': {
7759
8715
  pm_token_type_t type = lex_interpolation(parser, breakpoint);
7760
- if (type != PM_TOKEN_NOT_PROVIDED) {
7761
- LEX(type);
8716
+
8717
+ if (type == PM_TOKEN_NOT_PROVIDED) {
8718
+ // If we haven't returned at this point then we had something that
8719
+ // looked like an interpolated class or instance variable like "#@"
8720
+ // but wasn't actually. In this case we'll just skip to the next
8721
+ // breakpoint.
8722
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
8723
+ break;
7762
8724
  }
7763
8725
 
7764
- // If we haven't returned at this point then we had something that
7765
- // looked like an interpolated class or instance variable like "#@"
7766
- // but wasn't actually. In this case we'll just skip to the next
7767
- // breakpoint.
7768
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
7769
- break;
8726
+ if (type == PM_TOKEN_STRING_CONTENT) {
8727
+ pm_token_buffer_flush(parser, &token_buffer);
8728
+ }
8729
+
8730
+ LEX(type);
7770
8731
  }
7771
8732
  default:
7772
8733
  assert(false && "unreachable");
7773
8734
  }
7774
8735
  }
7775
8736
 
8737
+ if (parser->current.end > parser->current.start) {
8738
+ pm_token_buffer_flush(parser, &token_buffer);
8739
+ LEX(PM_TOKEN_STRING_CONTENT);
8740
+ }
8741
+
7776
8742
  // If we've hit the end of the string, then this is an unterminated
7777
8743
  // string. In that case we'll return the EOF token.
7778
- parser->current.end = parser->end;
7779
8744
  LEX(PM_TOKEN_EOF);
7780
8745
  }
7781
8746
  case PM_LEX_HEREDOC: {
@@ -7797,16 +8762,15 @@ parser_lex(pm_parser_t *parser) {
7797
8762
 
7798
8763
  // Now let's grab the information about the identifier off of the current
7799
8764
  // lex mode.
7800
- const uint8_t *ident_start = parser->lex_modes.current->as.heredoc.ident_start;
7801
- size_t ident_length = parser->lex_modes.current->as.heredoc.ident_length;
8765
+ pm_lex_mode_t *lex_mode = parser->lex_modes.current;
8766
+ const uint8_t *ident_start = lex_mode->as.heredoc.ident_start;
8767
+ size_t ident_length = lex_mode->as.heredoc.ident_length;
7802
8768
 
7803
8769
  // If we are immediately following a newline and we have hit the
7804
8770
  // terminator, then we need to return the ending of the heredoc.
7805
8771
  if (current_token_starts_line(parser)) {
7806
8772
  const uint8_t *start = parser->current.start;
7807
- if (parser->lex_modes.current->as.heredoc.indent != PM_HEREDOC_INDENT_NONE) {
7808
- start += pm_strspn_inline_whitespace(start, parser->end - start);
7809
- }
8773
+ size_t whitespace = pm_heredoc_strspn_inline_whitespace(parser, &start, lex_mode->as.heredoc.indent);
7810
8774
 
7811
8775
  if ((start + ident_length <= parser->end) && (memcmp(start, ident_start, ident_length) == 0)) {
7812
8776
  bool matched = true;
@@ -7824,10 +8788,10 @@ parser_lex(pm_parser_t *parser) {
7824
8788
  }
7825
8789
 
7826
8790
  if (matched) {
7827
- if (*parser->lex_modes.current->as.heredoc.next_start == '\\') {
8791
+ if (*lex_mode->as.heredoc.next_start == '\\') {
7828
8792
  parser->next_start = NULL;
7829
8793
  } else {
7830
- parser->next_start = parser->lex_modes.current->as.heredoc.next_start;
8794
+ parser->next_start = lex_mode->as.heredoc.next_start;
7831
8795
  parser->heredoc_end = parser->current.end;
7832
8796
  }
7833
8797
 
@@ -7838,61 +8802,91 @@ parser_lex(pm_parser_t *parser) {
7838
8802
  LEX(PM_TOKEN_HEREDOC_END);
7839
8803
  }
7840
8804
  }
8805
+
8806
+ if (
8807
+ lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_TILDE &&
8808
+ (lex_mode->as.heredoc.common_whitespace > whitespace) &&
8809
+ peek_at(parser, start) != '\n'
8810
+ ) {
8811
+ lex_mode->as.heredoc.common_whitespace = whitespace;
8812
+ }
7841
8813
  }
7842
8814
 
7843
- // Otherwise we'll be parsing string content. These are the places where
7844
- // we need to split up the content of the heredoc. We'll use strpbrk to
7845
- // find the first of these characters.
8815
+ // Otherwise we'll be parsing string content. These are the places
8816
+ // where we need to split up the content of the heredoc. We'll use
8817
+ // strpbrk to find the first of these characters.
7846
8818
  uint8_t breakpoints[] = "\n\\#";
7847
8819
 
7848
- pm_heredoc_quote_t quote = parser->lex_modes.current->as.heredoc.quote;
8820
+ pm_heredoc_quote_t quote = lex_mode->as.heredoc.quote;
7849
8821
  if (quote == PM_HEREDOC_QUOTE_SINGLE) {
7850
8822
  breakpoints[2] = '\0';
7851
8823
  }
7852
8824
 
7853
8825
  const uint8_t *breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
8826
+ pm_token_buffer_t token_buffer = { 0 };
8827
+ bool was_escaped_newline = false;
7854
8828
 
7855
8829
  while (breakpoint != NULL) {
7856
8830
  switch (*breakpoint) {
7857
8831
  case '\0':
7858
8832
  // Skip directly past the null character.
7859
- breakpoint = pm_strpbrk(parser, breakpoint + 1, breakpoints, parser->end - (breakpoint + 1));
8833
+ parser->current.end = breakpoint + 1;
8834
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
7860
8835
  break;
7861
8836
  case '\n': {
7862
8837
  if (parser->heredoc_end != NULL && (parser->heredoc_end > breakpoint)) {
7863
8838
  parser_flush_heredoc_end(parser);
7864
8839
  parser->current.end = breakpoint + 1;
8840
+ pm_token_buffer_flush(parser, &token_buffer);
7865
8841
  LEX(PM_TOKEN_STRING_CONTENT);
7866
8842
  }
7867
8843
 
7868
8844
  pm_newline_list_append(&parser->newline_list, breakpoint);
7869
8845
 
8846
+ // If we have a - or ~ heredoc, then we can match after
8847
+ // some leading whitespace.
7870
8848
  const uint8_t *start = breakpoint + 1;
7871
- if (parser->lex_modes.current->as.heredoc.indent != PM_HEREDOC_INDENT_NONE) {
7872
- start += pm_strspn_inline_whitespace(start, parser->end - start);
7873
- }
8849
+ size_t whitespace = pm_heredoc_strspn_inline_whitespace(parser, &start, lex_mode->as.heredoc.indent);
7874
8850
 
7875
- // If we have hit a newline that is followed by a valid terminator,
7876
- // then we need to return the content of the heredoc here as string
7877
- // content. Then, the next time a token is lexed, it will match
7878
- // again and return the end of the heredoc.
8851
+ // If we have hit a newline that is followed by a valid
8852
+ // terminator, then we need to return the content of the
8853
+ // heredoc here as string content. Then, the next time a
8854
+ // token is lexed, it will match again and return the
8855
+ // end of the heredoc.
7879
8856
  if (
8857
+ !was_escaped_newline &&
7880
8858
  (start + ident_length <= parser->end) &&
7881
8859
  (memcmp(start, ident_start, ident_length) == 0)
7882
8860
  ) {
7883
- // Heredoc terminators must be followed by a newline, CRLF, or EOF to be valid.
8861
+ // Heredoc terminators must be followed by a
8862
+ // newline, CRLF, or EOF to be valid.
7884
8863
  if (
7885
8864
  start + ident_length == parser->end ||
7886
8865
  match_eol_at(parser, start + ident_length)
7887
8866
  ) {
7888
8867
  parser->current.end = breakpoint + 1;
8868
+ pm_token_buffer_flush(parser, &token_buffer);
8869
+ LEX(PM_TOKEN_STRING_CONTENT);
8870
+ }
8871
+ }
8872
+
8873
+ if (lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_TILDE) {
8874
+ if ((lex_mode->as.heredoc.common_whitespace > whitespace) && peek_at(parser, start) != '\n') {
8875
+ lex_mode->as.heredoc.common_whitespace = whitespace;
8876
+ }
8877
+
8878
+ parser->current.end = breakpoint + 1;
8879
+
8880
+ if (!was_escaped_newline) {
8881
+ pm_token_buffer_flush(parser, &token_buffer);
7889
8882
  LEX(PM_TOKEN_STRING_CONTENT);
7890
8883
  }
7891
8884
  }
7892
8885
 
7893
- // Otherwise we hit a newline and it wasn't followed by a
7894
- // terminator, so we can continue parsing.
7895
- breakpoint = pm_strpbrk(parser, breakpoint + 1, breakpoints, parser->end - (breakpoint + 1));
8886
+ // Otherwise we hit a newline and it wasn't followed by
8887
+ // a terminator, so we can continue parsing.
8888
+ parser->current.end = breakpoint + 1;
8889
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
7896
8890
  break;
7897
8891
  }
7898
8892
  case '\\': {
@@ -7902,46 +8896,98 @@ parser_lex(pm_parser_t *parser) {
7902
8896
  // stop looping before the newline and not after the
7903
8897
  // newline so that we can still potentially find the
7904
8898
  // terminator of the heredoc.
7905
- size_t eol_length = match_eol_at(parser, breakpoint + 1);
7906
- if (eol_length) {
7907
- breakpoint += eol_length;
7908
- } else {
7909
- pm_unescape_type_t unescape_type = (quote == PM_HEREDOC_QUOTE_SINGLE) ? PM_UNESCAPE_MINIMAL : PM_UNESCAPE_ALL;
7910
- size_t difference = pm_unescape_calculate_difference(parser, breakpoint, unescape_type, false);
7911
- if (difference == 0) {
7912
- // we're at the end of the file
7913
- breakpoint = NULL;
7914
- break;
7915
- }
8899
+ parser->current.end = breakpoint + 1;
8900
+
8901
+ // If we've hit the end of the file, then break out of
8902
+ // the loop by setting the breakpoint to NULL.
8903
+ if (parser->current.end == parser->end) {
8904
+ breakpoint = NULL;
8905
+ continue;
8906
+ }
7916
8907
 
7917
- pm_newline_list_check_append(&parser->newline_list, breakpoint + difference - 1);
8908
+ pm_token_buffer_escape(parser, &token_buffer);
8909
+ uint8_t peeked = peek(parser);
7918
8910
 
7919
- breakpoint = pm_strpbrk(parser, breakpoint + difference, breakpoints, parser->end - (breakpoint + difference));
8911
+ if (quote == PM_HEREDOC_QUOTE_SINGLE) {
8912
+ switch (peeked) {
8913
+ case '\r':
8914
+ parser->current.end++;
8915
+ if (peek(parser) != '\n') {
8916
+ pm_token_buffer_push(&token_buffer, '\\');
8917
+ pm_token_buffer_push(&token_buffer, '\r');
8918
+ break;
8919
+ }
8920
+ /* fallthrough */
8921
+ case '\n':
8922
+ pm_token_buffer_push(&token_buffer, '\\');
8923
+ pm_token_buffer_push(&token_buffer, '\n');
8924
+ token_buffer.cursor = parser->current.end + 1;
8925
+ breakpoint = parser->current.end;
8926
+ continue;
8927
+ default:
8928
+ parser->current.end++;
8929
+ pm_token_buffer_push(&token_buffer, '\\');
8930
+ pm_token_buffer_push(&token_buffer, peeked);
8931
+ break;
8932
+ }
8933
+ } else {
8934
+ switch (peeked) {
8935
+ case '\r':
8936
+ parser->current.end++;
8937
+ if (peek(parser) != '\n') {
8938
+ pm_token_buffer_push(&token_buffer, '\r');
8939
+ break;
8940
+ }
8941
+ /* fallthrough */
8942
+ case '\n':
8943
+ was_escaped_newline = true;
8944
+ token_buffer.cursor = parser->current.end + 1;
8945
+ breakpoint = parser->current.end;
8946
+ continue;
8947
+ default:
8948
+ escape_read(parser, &token_buffer.buffer, PM_ESCAPE_FLAG_NONE);
8949
+ break;
8950
+ }
7920
8951
  }
7921
8952
 
8953
+ token_buffer.cursor = parser->current.end;
8954
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
7922
8955
  break;
7923
8956
  }
7924
8957
  case '#': {
7925
8958
  pm_token_type_t type = lex_interpolation(parser, breakpoint);
7926
- if (type != PM_TOKEN_NOT_PROVIDED) {
7927
- LEX(type);
8959
+
8960
+ if (type == PM_TOKEN_NOT_PROVIDED) {
8961
+ // If we haven't returned at this point then we had
8962
+ // something that looked like an interpolated class
8963
+ // or instance variable like "#@" but wasn't
8964
+ // actually. In this case we'll just skip to the
8965
+ // next breakpoint.
8966
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
8967
+ break;
7928
8968
  }
7929
8969
 
7930
- // If we haven't returned at this point then we had something
7931
- // that looked like an interpolated class or instance variable
7932
- // like "#@" but wasn't actually. In this case we'll just skip
7933
- // to the next breakpoint.
7934
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
7935
- break;
8970
+ if (type == PM_TOKEN_STRING_CONTENT) {
8971
+ pm_token_buffer_flush(parser, &token_buffer);
8972
+ }
8973
+
8974
+ LEX(type);
7936
8975
  }
7937
8976
  default:
7938
8977
  assert(false && "unreachable");
7939
8978
  }
8979
+
8980
+ was_escaped_newline = false;
8981
+ }
8982
+
8983
+ if (parser->current.end > parser->current.start) {
8984
+ parser->current.end = parser->end;
8985
+ pm_token_buffer_flush(parser, &token_buffer);
8986
+ LEX(PM_TOKEN_STRING_CONTENT);
7940
8987
  }
7941
8988
 
7942
8989
  // If we've hit the end of the string, then this is an unterminated
7943
8990
  // heredoc. In that case we'll return the EOF token.
7944
- parser->current.end = parser->end;
7945
8991
  LEX(PM_TOKEN_EOF);
7946
8992
  }
7947
8993
  }
@@ -7955,67 +9001,6 @@ parser_lex(pm_parser_t *parser) {
7955
9001
  /* Parse functions */
7956
9002
  /******************************************************************************/
7957
9003
 
7958
- // When we are parsing certain content, we need to unescape the content to
7959
- // provide to the consumers of the parser. The following functions accept a range
7960
- // of characters from the source and unescapes into the provided type.
7961
- //
7962
- // We have functions for unescaping regular expression nodes, string nodes,
7963
- // symbol nodes, and xstring nodes
7964
- static pm_regular_expression_node_t *
7965
- pm_regular_expression_node_create_and_unescape(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *content, const pm_token_t *closing, pm_unescape_type_t unescape_type) {
7966
- pm_regular_expression_node_t *node = pm_regular_expression_node_create(parser, opening, content, closing);
7967
-
7968
- assert((content->end - content->start) >= 0);
7969
- pm_string_shared_init(&node->unescaped, content->start, content->end);
7970
-
7971
- pm_unescape_manipulate_string(parser, &node->unescaped, unescape_type);
7972
- return node;
7973
- }
7974
-
7975
- static pm_symbol_node_t *
7976
- pm_symbol_node_create_and_unescape(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *content, const pm_token_t *closing, pm_unescape_type_t unescape_type) {
7977
- pm_symbol_node_t *node = pm_symbol_node_create(parser, opening, content, closing);
7978
-
7979
- assert((content->end - content->start) >= 0);
7980
- pm_string_shared_init(&node->unescaped, content->start, content->end);
7981
-
7982
- pm_unescape_manipulate_string(parser, &node->unescaped, unescape_type);
7983
- return node;
7984
- }
7985
-
7986
- static pm_string_node_t *
7987
- pm_char_literal_node_create_and_unescape(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *content, const pm_token_t *closing, pm_unescape_type_t unescape_type) {
7988
- pm_string_node_t *node = pm_string_node_create(parser, opening, content, closing);
7989
-
7990
- assert((content->end - content->start) >= 0);
7991
- pm_string_shared_init(&node->unescaped, content->start, content->end);
7992
-
7993
- pm_unescape_manipulate_char_literal(parser, &node->unescaped, unescape_type);
7994
- return node;
7995
- }
7996
-
7997
- static pm_string_node_t *
7998
- pm_string_node_create_and_unescape(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *content, const pm_token_t *closing, pm_unescape_type_t unescape_type) {
7999
- pm_string_node_t *node = pm_string_node_create(parser, opening, content, closing);
8000
-
8001
- assert((content->end - content->start) >= 0);
8002
- pm_string_shared_init(&node->unescaped, content->start, content->end);
8003
-
8004
- pm_unescape_manipulate_string(parser, &node->unescaped, unescape_type);
8005
- return node;
8006
- }
8007
-
8008
- static pm_x_string_node_t *
8009
- pm_xstring_node_create_and_unescape(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *content, const pm_token_t *closing) {
8010
- pm_x_string_node_t *node = pm_xstring_node_create(parser, opening, content, closing);
8011
-
8012
- assert((content->end - content->start) >= 0);
8013
- pm_string_shared_init(&node->unescaped, content->start, content->end);
8014
-
8015
- pm_unescape_manipulate_string(parser, &node->unescaped, PM_UNESCAPE_ALL);
8016
- return node;
8017
- }
8018
-
8019
9004
  // These are the various precedence rules. Because we are using a Pratt parser,
8020
9005
  // they are named binding power to represent the manner in which nodes are bound
8021
9006
  // together in the stack.
@@ -8269,7 +9254,7 @@ expect1(pm_parser_t *parser, pm_token_type_t type, pm_diagnostic_id_t diag_id) {
8269
9254
  if (accept1(parser, type)) return;
8270
9255
 
8271
9256
  const uint8_t *location = parser->previous.end;
8272
- pm_diagnostic_list_append(&parser->error_list, location, location, diag_id);
9257
+ pm_parser_err(parser, location, location, diag_id);
8273
9258
 
8274
9259
  parser->previous.start = location;
8275
9260
  parser->previous.type = PM_TOKEN_MISSING;
@@ -8282,7 +9267,7 @@ expect2(pm_parser_t *parser, pm_token_type_t type1, pm_token_type_t type2, pm_di
8282
9267
  if (accept2(parser, type1, type2)) return;
8283
9268
 
8284
9269
  const uint8_t *location = parser->previous.end;
8285
- pm_diagnostic_list_append(&parser->error_list, location, location, diag_id);
9270
+ pm_parser_err(parser, location, location, diag_id);
8286
9271
 
8287
9272
  parser->previous.start = location;
8288
9273
  parser->previous.type = PM_TOKEN_MISSING;
@@ -8294,7 +9279,7 @@ expect3(pm_parser_t *parser, pm_token_type_t type1, pm_token_type_t type2, pm_to
8294
9279
  if (accept3(parser, type1, type2, type3)) return;
8295
9280
 
8296
9281
  const uint8_t *location = parser->previous.end;
8297
- pm_diagnostic_list_append(&parser->error_list, location, location, diag_id);
9282
+ pm_parser_err(parser, location, location, diag_id);
8298
9283
 
8299
9284
  parser->previous.start = location;
8300
9285
  parser->previous.type = PM_TOKEN_MISSING;
@@ -8389,23 +9374,23 @@ parse_starred_expression(pm_parser_t *parser, pm_binding_power_t binding_power,
8389
9374
  }
8390
9375
 
8391
9376
  // Convert the name of a method into the corresponding write method name. For
8392
- // exmaple, foo would be turned into foo=.
9377
+ // example, foo would be turned into foo=.
8393
9378
  static void
8394
- parse_write_name(pm_string_t *string) {
9379
+ parse_write_name(pm_parser_t *parser, pm_constant_id_t *name_field) {
8395
9380
  // The method name needs to change. If we previously had
8396
9381
  // foo, we now need foo=. In this case we'll allocate a new
8397
9382
  // owned string, copy the previous method name in, and
8398
9383
  // append an =.
8399
- size_t length = pm_string_length(string);
9384
+ pm_constant_t *constant = pm_constant_pool_id_to_constant(&parser->constant_pool, *name_field);
9385
+ size_t length = constant->length;
8400
9386
  uint8_t *name = calloc(length + 1, sizeof(uint8_t));
8401
9387
  if (name == NULL) return;
8402
9388
 
8403
- memcpy(name, pm_string_source(string), length);
9389
+ memcpy(name, constant->start, length);
8404
9390
  name[length] = '=';
8405
9391
 
8406
9392
  // Now switch the name to the new string.
8407
- pm_string_free(string);
8408
- pm_string_owned_init(string, name, length + 1);
9393
+ *name_field = pm_constant_pool_insert_owned(&parser->constant_pool, name, length + 1);
8409
9394
  }
8410
9395
 
8411
9396
  // Convert the given node into a valid target node.
@@ -8428,7 +9413,7 @@ parse_target(pm_parser_t *parser, pm_node_t *target) {
8428
9413
  return target;
8429
9414
  case PM_BACK_REFERENCE_READ_NODE:
8430
9415
  case PM_NUMBERED_REFERENCE_READ_NODE:
8431
- pm_diagnostic_list_append(&parser->error_list, target->location.start, target->location.end, PM_ERR_WRITE_TARGET_READONLY);
9416
+ pm_parser_err_node(parser, target, PM_ERR_WRITE_TARGET_READONLY);
8432
9417
  return target;
8433
9418
  case PM_GLOBAL_VARIABLE_READ_NODE:
8434
9419
  assert(sizeof(pm_global_variable_target_node_t) == sizeof(pm_global_variable_read_node_t));
@@ -8436,7 +9421,7 @@ parse_target(pm_parser_t *parser, pm_node_t *target) {
8436
9421
  return target;
8437
9422
  case PM_LOCAL_VARIABLE_READ_NODE:
8438
9423
  if (token_is_numbered_parameter(target->location.start, target->location.end)) {
8439
- pm_diagnostic_list_append(&parser->error_list, target->location.start, target->location.end, PM_ERR_PARAMETER_NUMBERED_RESERVED);
9424
+ pm_parser_err_node(parser, target, PM_ERR_PARAMETER_NUMBERED_RESERVED);
8440
9425
  } else {
8441
9426
  assert(sizeof(pm_local_variable_target_node_t) == sizeof(pm_local_variable_read_node_t));
8442
9427
  target->type = PM_LOCAL_VARIABLE_TARGET_NODE;
@@ -8489,21 +9474,23 @@ parse_target(pm_parser_t *parser, pm_node_t *target) {
8489
9474
  pm_parser_local_add_location(parser, message.start, message.end);
8490
9475
  pm_node_destroy(parser, target);
8491
9476
 
9477
+ uint32_t depth = 0;
9478
+ for (pm_scope_t *scope = parser->current_scope; scope && scope->transparent; depth++, scope = scope->previous);
8492
9479
  const pm_token_t name = { .type = PM_TOKEN_IDENTIFIER, .start = message.start, .end = message.end };
8493
- target = (pm_node_t *) pm_local_variable_read_node_create(parser, &name, 0);
9480
+ target = (pm_node_t *) pm_local_variable_read_node_create(parser, &name, depth);
8494
9481
 
8495
9482
  assert(sizeof(pm_local_variable_target_node_t) == sizeof(pm_local_variable_read_node_t));
8496
9483
  target->type = PM_LOCAL_VARIABLE_TARGET_NODE;
8497
9484
 
8498
9485
  if (token_is_numbered_parameter(message.start, message.end)) {
8499
- pm_diagnostic_list_append(&parser->error_list, message.start, message.end, PM_ERR_PARAMETER_NUMBERED_RESERVED);
9486
+ pm_parser_err_location(parser, &message, PM_ERR_PARAMETER_NUMBERED_RESERVED);
8500
9487
  }
8501
9488
 
8502
9489
  return target;
8503
9490
  }
8504
9491
 
8505
9492
  if (*call->message_loc.start == '_' || parser->encoding.alnum_char(call->message_loc.start, call->message_loc.end - call->message_loc.start)) {
8506
- parse_write_name(&call->name);
9493
+ parse_write_name(parser, &call->name);
8507
9494
  return (pm_node_t *) call;
8508
9495
  }
8509
9496
  }
@@ -8518,9 +9505,8 @@ parse_target(pm_parser_t *parser, pm_node_t *target) {
8518
9505
  (call->message_loc.end[-1] == ']') &&
8519
9506
  (call->block == NULL)
8520
9507
  ) {
8521
- // Free the previous name and replace it with "[]=".
8522
- pm_string_free(&call->name);
8523
- pm_string_constant_init(&call->name, "[]=", 3);
9508
+ // Replace the name with "[]=".
9509
+ call->name = pm_parser_constant_id_static(parser, "[]=", 3);
8524
9510
  return target;
8525
9511
  }
8526
9512
  }
@@ -8529,7 +9515,7 @@ parse_target(pm_parser_t *parser, pm_node_t *target) {
8529
9515
  // In this case we have a node that we don't know how to convert
8530
9516
  // into a target. We need to treat it as an error. For now, we'll
8531
9517
  // mark it as an error and just skip right past it.
8532
- pm_diagnostic_list_append(&parser->error_list, target->location.start, target->location.end, PM_ERR_WRITE_TARGET_UNEXPECTED);
9518
+ pm_parser_err_node(parser, target, PM_ERR_WRITE_TARGET_UNEXPECTED);
8533
9519
  return target;
8534
9520
  }
8535
9521
  }
@@ -8542,7 +9528,7 @@ parse_target_validate(pm_parser_t *parser, pm_node_t *target) {
8542
9528
 
8543
9529
  // Ensure that we have either an = or a ) after the targets.
8544
9530
  if (!match3(parser, PM_TOKEN_EQUAL, PM_TOKEN_PARENTHESIS_RIGHT, PM_TOKEN_KEYWORD_IN)) {
8545
- pm_diagnostic_list_append(&parser->error_list, result->location.start, result->location.end, PM_ERR_WRITE_TARGET_UNEXPECTED);
9531
+ pm_parser_err_node(parser, result, PM_ERR_WRITE_TARGET_UNEXPECTED);
8546
9532
  }
8547
9533
 
8548
9534
  return result;
@@ -8568,7 +9554,7 @@ parse_write(pm_parser_t *parser, pm_node_t *target, pm_token_t *operator, pm_nod
8568
9554
  }
8569
9555
  case PM_BACK_REFERENCE_READ_NODE:
8570
9556
  case PM_NUMBERED_REFERENCE_READ_NODE:
8571
- pm_diagnostic_list_append(&parser->error_list, target->location.start, target->location.end, PM_ERR_WRITE_TARGET_READONLY);
9557
+ pm_parser_err_node(parser, target, PM_ERR_WRITE_TARGET_READONLY);
8572
9558
  /* fallthrough */
8573
9559
  case PM_GLOBAL_VARIABLE_READ_NODE: {
8574
9560
  pm_global_variable_write_node_t *node = pm_global_variable_write_node_create(parser, target, operator, value);
@@ -8577,7 +9563,7 @@ parse_write(pm_parser_t *parser, pm_node_t *target, pm_token_t *operator, pm_nod
8577
9563
  }
8578
9564
  case PM_LOCAL_VARIABLE_READ_NODE: {
8579
9565
  if (token_is_numbered_parameter(target->location.start, target->location.end)) {
8580
- pm_diagnostic_list_append(&parser->error_list, target->location.start, target->location.end, PM_ERR_PARAMETER_NUMBERED_RESERVED);
9566
+ pm_parser_err_node(parser, target, PM_ERR_PARAMETER_NUMBERED_RESERVED);
8581
9567
  }
8582
9568
 
8583
9569
  pm_local_variable_read_node_t *local_read = (pm_local_variable_read_node_t *) target;
@@ -8642,7 +9628,7 @@ parse_write(pm_parser_t *parser, pm_node_t *target, pm_token_t *operator, pm_nod
8642
9628
  target = (pm_node_t *) pm_local_variable_write_node_create(parser, constant_id, 0, value, &message, operator);
8643
9629
 
8644
9630
  if (token_is_numbered_parameter(message.start, message.end)) {
8645
- pm_diagnostic_list_append(&parser->error_list, message.start, message.end, PM_ERR_PARAMETER_NUMBERED_RESERVED);
9631
+ pm_parser_err_location(parser, &message, PM_ERR_PARAMETER_NUMBERED_RESERVED);
8646
9632
  }
8647
9633
 
8648
9634
  return target;
@@ -8665,7 +9651,7 @@ parse_write(pm_parser_t *parser, pm_node_t *target, pm_token_t *operator, pm_nod
8665
9651
  pm_arguments_node_arguments_append(arguments, value);
8666
9652
  call->base.location.end = arguments->base.location.end;
8667
9653
 
8668
- parse_write_name(&call->name);
9654
+ parse_write_name(parser, &call->name);
8669
9655
  return (pm_node_t *) call;
8670
9656
  }
8671
9657
  }
@@ -8686,9 +9672,8 @@ parse_write(pm_parser_t *parser, pm_node_t *target, pm_token_t *operator, pm_nod
8686
9672
  pm_arguments_node_arguments_append(call->arguments, value);
8687
9673
  target->location.end = value->location.end;
8688
9674
 
8689
- // Free the previous name and replace it with "[]=".
8690
- pm_string_free(&call->name);
8691
- pm_string_constant_init(&call->name, "[]=", 3);
9675
+ // Replace the name with "[]=".
9676
+ call->name = pm_parser_constant_id_static(parser, "[]=", 3);
8692
9677
  return target;
8693
9678
  }
8694
9679
 
@@ -8704,7 +9689,7 @@ parse_write(pm_parser_t *parser, pm_node_t *target, pm_token_t *operator, pm_nod
8704
9689
  // In this case we have a node that we don't know how to convert into a
8705
9690
  // target. We need to treat it as an error. For now, we'll mark it as an
8706
9691
  // error and just skip right past it.
8707
- pm_diagnostic_list_append(&parser->error_list, operator->start, operator->end, PM_ERR_WRITE_TARGET_UNEXPECTED);
9692
+ pm_parser_err_token(parser, operator, PM_ERR_WRITE_TARGET_UNEXPECTED);
8708
9693
  return target;
8709
9694
  }
8710
9695
  }
@@ -8730,7 +9715,7 @@ parse_targets(pm_parser_t *parser, pm_node_t *first_target, pm_binding_power_t b
8730
9715
  // anonymous. It can be the final target or be in the middle if
8731
9716
  // there haven't been any others yet.
8732
9717
  if (has_splat) {
8733
- pm_diagnostic_list_append(&parser->error_list, parser->previous.start, parser->previous.end, PM_ERR_MULTI_ASSIGN_MULTI_SPLATS);
9718
+ pm_parser_err_previous(parser, PM_ERR_MULTI_ASSIGN_MULTI_SPLATS);
8734
9719
  }
8735
9720
 
8736
9721
  pm_token_t star_operator = parser->previous;
@@ -8770,7 +9755,7 @@ parse_targets_validate(pm_parser_t *parser, pm_node_t *first_target, pm_binding_
8770
9755
 
8771
9756
  // Ensure that we have either an = or a ) after the targets.
8772
9757
  if (!match2(parser, PM_TOKEN_EQUAL, PM_TOKEN_PARENTHESIS_RIGHT)) {
8773
- pm_diagnostic_list_append(&parser->error_list, result->location.start, result->location.end, PM_ERR_WRITE_TARGET_UNEXPECTED);
9758
+ pm_parser_err_node(parser, result, PM_ERR_WRITE_TARGET_UNEXPECTED);
8774
9759
  }
8775
9760
 
8776
9761
  return result;
@@ -8863,7 +9848,7 @@ parse_assocs(pm_parser_t *parser, pm_node_t *node) {
8863
9848
  if (token_begins_expression_p(parser->current.type)) {
8864
9849
  value = parse_expression(parser, PM_BINDING_POWER_DEFINED, PM_ERR_EXPECT_EXPRESSION_AFTER_SPLAT_HASH);
8865
9850
  } else if (pm_parser_local_depth(parser, &operator) == -1) {
8866
- pm_diagnostic_list_append(&parser->error_list, operator.start, operator.end, PM_ERR_EXPECT_EXPRESSION_AFTER_SPLAT_HASH);
9851
+ pm_parser_err_token(parser, &operator, PM_ERR_EXPECT_EXPRESSION_AFTER_SPLAT_HASH);
8867
9852
  }
8868
9853
 
8869
9854
  element = (pm_node_t *) pm_assoc_splat_node_create(parser, value, &operator);
@@ -8970,7 +9955,7 @@ parse_arguments(pm_parser_t *parser, pm_arguments_t *arguments, bool accepts_for
8970
9955
 
8971
9956
  while (!match1(parser, PM_TOKEN_EOF)) {
8972
9957
  if (parsed_block_argument) {
8973
- pm_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, PM_ERR_ARGUMENT_AFTER_BLOCK);
9958
+ pm_parser_err_current(parser, PM_ERR_ARGUMENT_AFTER_BLOCK);
8974
9959
  }
8975
9960
 
8976
9961
  pm_node_t *argument = NULL;
@@ -8979,7 +9964,7 @@ parse_arguments(pm_parser_t *parser, pm_arguments_t *arguments, bool accepts_for
8979
9964
  case PM_TOKEN_USTAR_STAR:
8980
9965
  case PM_TOKEN_LABEL: {
8981
9966
  if (parsed_bare_hash) {
8982
- pm_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, PM_ERR_ARGUMENT_BARE_HASH);
9967
+ pm_parser_err_current(parser, PM_ERR_ARGUMENT_BARE_HASH);
8983
9968
  }
8984
9969
 
8985
9970
  pm_keyword_hash_node_t *hash = pm_keyword_hash_node_create(parser);
@@ -9001,7 +9986,7 @@ parse_arguments(pm_parser_t *parser, pm_arguments_t *arguments, bool accepts_for
9001
9986
  if (token_begins_expression_p(parser->current.type)) {
9002
9987
  expression = parse_expression(parser, PM_BINDING_POWER_DEFINED, PM_ERR_EXPECT_ARGUMENT);
9003
9988
  } else if (pm_parser_local_depth(parser, &operator) == -1) {
9004
- pm_diagnostic_list_append(&parser->error_list, operator.start, operator.end, PM_ERR_ARGUMENT_NO_FORWARDING_AMP);
9989
+ pm_parser_err_token(parser, &operator, PM_ERR_ARGUMENT_NO_FORWARDING_AMP);
9005
9990
  }
9006
9991
 
9007
9992
  argument = (pm_node_t *) pm_block_argument_node_create(parser, &operator, expression);
@@ -9020,7 +10005,7 @@ parse_arguments(pm_parser_t *parser, pm_arguments_t *arguments, bool accepts_for
9020
10005
 
9021
10006
  if (match2(parser, PM_TOKEN_PARENTHESIS_RIGHT, PM_TOKEN_COMMA)) {
9022
10007
  if (pm_parser_local_depth(parser, &parser->previous) == -1) {
9023
- pm_diagnostic_list_append(&parser->error_list, operator.start, operator.end, PM_ERR_ARGUMENT_NO_FORWARDING_STAR);
10008
+ pm_parser_err_token(parser, &operator, PM_ERR_ARGUMENT_NO_FORWARDING_STAR);
9024
10009
  }
9025
10010
 
9026
10011
  argument = (pm_node_t *) pm_splat_node_create(parser, &operator, NULL);
@@ -9028,7 +10013,7 @@ parse_arguments(pm_parser_t *parser, pm_arguments_t *arguments, bool accepts_for
9028
10013
  pm_node_t *expression = parse_expression(parser, PM_BINDING_POWER_DEFINED, PM_ERR_EXPECT_EXPRESSION_AFTER_SPLAT);
9029
10014
 
9030
10015
  if (parsed_bare_hash) {
9031
- pm_diagnostic_list_append(&parser->error_list, operator.start, expression->location.end, PM_ERR_ARGUMENT_SPLAT_AFTER_ASSOC_SPLAT);
10016
+ pm_parser_err(parser, operator.start, expression->location.end, PM_ERR_ARGUMENT_SPLAT_AFTER_ASSOC_SPLAT);
9032
10017
  }
9033
10018
 
9034
10019
  argument = (pm_node_t *) pm_splat_node_create(parser, &operator, expression);
@@ -9049,7 +10034,7 @@ parse_arguments(pm_parser_t *parser, pm_arguments_t *arguments, bool accepts_for
9049
10034
  argument = (pm_node_t *) pm_range_node_create(parser, NULL, &operator, right);
9050
10035
  } else {
9051
10036
  if (pm_parser_local_depth(parser, &parser->previous) == -1) {
9052
- pm_diagnostic_list_append(&parser->error_list, parser->previous.start, parser->previous.end, PM_ERR_ARGUMENT_NO_FORWARDING_ELLIPSES);
10037
+ pm_parser_err_previous(parser, PM_ERR_ARGUMENT_NO_FORWARDING_ELLIPSES);
9053
10038
  }
9054
10039
 
9055
10040
  argument = (pm_node_t *) pm_forwarding_arguments_node_create(parser, &parser->previous);
@@ -9066,7 +10051,7 @@ parse_arguments(pm_parser_t *parser, pm_arguments_t *arguments, bool accepts_for
9066
10051
 
9067
10052
  if (pm_symbol_node_label_p(argument) || accept1(parser, PM_TOKEN_EQUAL_GREATER)) {
9068
10053
  if (parsed_bare_hash) {
9069
- pm_diagnostic_list_append(&parser->error_list, parser->previous.start, parser->previous.end, PM_ERR_ARGUMENT_BARE_HASH);
10054
+ pm_parser_err_previous(parser, PM_ERR_ARGUMENT_BARE_HASH);
9070
10055
  }
9071
10056
 
9072
10057
  pm_token_t operator;
@@ -9145,7 +10130,7 @@ parse_required_destructured_parameter(pm_parser_t *parser) {
9145
10130
 
9146
10131
  if (node->parameters.size > 0 && match1(parser, PM_TOKEN_PARENTHESIS_RIGHT)) {
9147
10132
  if (parsed_splat) {
9148
- pm_diagnostic_list_append(&parser->error_list, parser->previous.start, parser->previous.end, PM_ERR_ARGUMENT_SPLAT_AFTER_SPLAT);
10133
+ pm_parser_err_previous(parser, PM_ERR_ARGUMENT_SPLAT_AFTER_SPLAT);
9149
10134
  }
9150
10135
 
9151
10136
  param = (pm_node_t *) pm_splat_node_create(parser, &parser->previous, NULL);
@@ -9157,7 +10142,7 @@ parse_required_destructured_parameter(pm_parser_t *parser) {
9157
10142
  param = (pm_node_t *) parse_required_destructured_parameter(parser);
9158
10143
  } else if (accept1(parser, PM_TOKEN_USTAR)) {
9159
10144
  if (parsed_splat) {
9160
- pm_diagnostic_list_append(&parser->error_list, parser->previous.start, parser->previous.end, PM_ERR_ARGUMENT_SPLAT_AFTER_SPLAT);
10145
+ pm_parser_err_previous(parser, PM_ERR_ARGUMENT_SPLAT_AFTER_SPLAT);
9161
10146
  }
9162
10147
 
9163
10148
  pm_token_t star = parser->previous;
@@ -9166,6 +10151,7 @@ parse_required_destructured_parameter(pm_parser_t *parser) {
9166
10151
  if (accept1(parser, PM_TOKEN_IDENTIFIER)) {
9167
10152
  pm_token_t name = parser->previous;
9168
10153
  value = (pm_node_t *) pm_required_parameter_node_create(parser, &name);
10154
+ pm_parser_parameter_name_check(parser, &name);
9169
10155
  pm_parser_local_add_token(parser, &name);
9170
10156
  }
9171
10157
 
@@ -9176,6 +10162,7 @@ parse_required_destructured_parameter(pm_parser_t *parser) {
9176
10162
  pm_token_t name = parser->previous;
9177
10163
 
9178
10164
  param = (pm_node_t *) pm_required_parameter_node_create(parser, &name);
10165
+ pm_parser_parameter_name_check(parser, &name);
9179
10166
  pm_parser_local_add_token(parser, &name);
9180
10167
  }
9181
10168
 
@@ -9237,12 +10224,12 @@ update_parameter_state(pm_parser_t *parser, pm_token_t *token, pm_parameters_ord
9237
10224
  }
9238
10225
 
9239
10226
  if (token->type == PM_TOKEN_USTAR && *current == PM_PARAMETERS_ORDER_AFTER_OPTIONAL) {
9240
- pm_diagnostic_list_append(&parser->error_list, token->start, token->end, PM_ERR_PARAMETER_STAR);
10227
+ pm_parser_err_token(parser, token, PM_ERR_PARAMETER_STAR);
9241
10228
  }
9242
10229
 
9243
10230
  if (*current == PM_PARAMETERS_ORDER_NOTHING_AFTER || state > *current) {
9244
10231
  // We know what transition we failed on, so we can provide a better error here.
9245
- pm_diagnostic_list_append(&parser->error_list, token->start, token->end, PM_ERR_PARAMETER_ORDER);
10232
+ pm_parser_err_token(parser, token, PM_ERR_PARAMETER_ORDER);
9246
10233
  } else if (state < *current) {
9247
10234
  *current = state;
9248
10235
  }
@@ -9297,7 +10284,7 @@ parse_parameters(
9297
10284
  if (params->block == NULL) {
9298
10285
  pm_parameters_node_block_set(params, param);
9299
10286
  } else {
9300
- pm_diagnostic_list_append(&parser->error_list, param->base.location.start, param->base.location.end, PM_ERR_PARAMETER_BLOCK_MULTI);
10287
+ pm_parser_err_node(parser, (pm_node_t *) param, PM_ERR_PARAMETER_BLOCK_MULTI);
9301
10288
  pm_parameters_node_posts_append(params, (pm_node_t *) param);
9302
10289
  }
9303
10290
 
@@ -9305,7 +10292,7 @@ parse_parameters(
9305
10292
  }
9306
10293
  case PM_TOKEN_UDOT_DOT_DOT: {
9307
10294
  if (!allows_forwarding_parameter) {
9308
- pm_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, PM_ERR_ARGUMENT_NO_FORWARDING_ELLIPSES);
10295
+ pm_parser_err_current(parser, PM_ERR_ARGUMENT_NO_FORWARDING_ELLIPSES);
9309
10296
  }
9310
10297
  if (order > PM_PARAMETERS_ORDER_NOTHING_AFTER) {
9311
10298
  update_parameter_state(parser, &parser->current, &order);
@@ -9318,7 +10305,7 @@ parse_parameters(
9318
10305
  // forwarding parameter and move the keyword rest parameter to the posts list.
9319
10306
  pm_node_t *keyword_rest = params->keyword_rest;
9320
10307
  pm_parameters_node_posts_append(params, keyword_rest);
9321
- pm_diagnostic_list_append(&parser->error_list, parser->previous.start, parser->previous.end, PM_ERR_PARAMETER_UNEXPECTED_FWD);
10308
+ pm_parser_err_previous(parser, PM_ERR_PARAMETER_UNEXPECTED_FWD);
9322
10309
  params->keyword_rest = NULL;
9323
10310
  }
9324
10311
  pm_parameters_node_keyword_rest_set(params, (pm_node_t *)param);
@@ -9337,19 +10324,19 @@ parse_parameters(
9337
10324
  parser_lex(parser);
9338
10325
  switch (parser->previous.type) {
9339
10326
  case PM_TOKEN_CONSTANT:
9340
- pm_diagnostic_list_append(&parser->error_list, parser->previous.start, parser->previous.end, PM_ERR_ARGUMENT_FORMAL_CONSTANT);
10327
+ pm_parser_err_previous(parser, PM_ERR_ARGUMENT_FORMAL_CONSTANT);
9341
10328
  break;
9342
10329
  case PM_TOKEN_INSTANCE_VARIABLE:
9343
- pm_diagnostic_list_append(&parser->error_list, parser->previous.start, parser->previous.end, PM_ERR_ARGUMENT_FORMAL_IVAR);
10330
+ pm_parser_err_previous(parser, PM_ERR_ARGUMENT_FORMAL_IVAR);
9344
10331
  break;
9345
10332
  case PM_TOKEN_GLOBAL_VARIABLE:
9346
- pm_diagnostic_list_append(&parser->error_list, parser->previous.start, parser->previous.end, PM_ERR_ARGUMENT_FORMAL_GLOBAL);
10333
+ pm_parser_err_previous(parser, PM_ERR_ARGUMENT_FORMAL_GLOBAL);
9347
10334
  break;
9348
10335
  case PM_TOKEN_CLASS_VARIABLE:
9349
- pm_diagnostic_list_append(&parser->error_list, parser->previous.start, parser->previous.end, PM_ERR_ARGUMENT_FORMAL_CLASS);
10336
+ pm_parser_err_previous(parser, PM_ERR_ARGUMENT_FORMAL_CLASS);
9350
10337
  break;
9351
10338
  case PM_TOKEN_METHOD_NAME:
9352
- pm_diagnostic_list_append(&parser->error_list, parser->previous.start, parser->previous.end, PM_ERR_PARAMETER_METHOD_NAME);
10339
+ pm_parser_err_previous(parser, PM_ERR_PARAMETER_METHOD_NAME);
9353
10340
  break;
9354
10341
  default: break;
9355
10342
  }
@@ -9466,7 +10453,7 @@ parse_parameters(
9466
10453
  if (params->rest == NULL) {
9467
10454
  pm_parameters_node_rest_set(params, param);
9468
10455
  } else {
9469
- pm_diagnostic_list_append(&parser->error_list, param->base.location.start, param->base.location.end, PM_ERR_PARAMETER_SPLAT_MULTI);
10456
+ pm_parser_err_node(parser, (pm_node_t *) param, PM_ERR_PARAMETER_SPLAT_MULTI);
9470
10457
  pm_parameters_node_posts_append(params, (pm_node_t *) param);
9471
10458
  }
9472
10459
 
@@ -9500,7 +10487,7 @@ parse_parameters(
9500
10487
  if (params->keyword_rest == NULL) {
9501
10488
  pm_parameters_node_keyword_rest_set(params, param);
9502
10489
  } else {
9503
- pm_diagnostic_list_append(&parser->error_list, param->location.start, param->location.end, PM_ERR_PARAMETER_ASSOC_SPLAT_MULTI);
10490
+ pm_parser_err_node(parser, param, PM_ERR_PARAMETER_ASSOC_SPLAT_MULTI);
9504
10491
  pm_parameters_node_posts_append(params, param);
9505
10492
  }
9506
10493
 
@@ -9518,11 +10505,11 @@ parse_parameters(
9518
10505
  if (params->rest == NULL) {
9519
10506
  pm_parameters_node_rest_set(params, param);
9520
10507
  } else {
9521
- pm_diagnostic_list_append(&parser->error_list, param->base.location.start, param->base.location.end, PM_ERR_PARAMETER_SPLAT_MULTI);
10508
+ pm_parser_err_node(parser, (pm_node_t *) param, PM_ERR_PARAMETER_SPLAT_MULTI);
9522
10509
  pm_parameters_node_posts_append(params, (pm_node_t *) param);
9523
10510
  }
9524
10511
  } else {
9525
- pm_diagnostic_list_append(&parser->error_list, parser->previous.start, parser->previous.end, PM_ERR_PARAMETER_WILD_LOOSE_COMMA);
10512
+ pm_parser_err_previous(parser, PM_ERR_PARAMETER_WILD_LOOSE_COMMA);
9526
10513
  }
9527
10514
  }
9528
10515
 
@@ -9725,9 +10712,10 @@ parse_block_parameters(
9725
10712
  }
9726
10713
 
9727
10714
  pm_block_parameters_node_t *block_parameters = pm_block_parameters_node_create(parser, parameters, opening);
9728
- if (accept1(parser, PM_TOKEN_SEMICOLON)) {
10715
+ if ((opening->type != PM_TOKEN_NOT_PROVIDED) && accept1(parser, PM_TOKEN_SEMICOLON)) {
9729
10716
  do {
9730
10717
  expect1(parser, PM_TOKEN_IDENTIFIER, PM_ERR_BLOCK_PARAM_LOCAL_VARIABLE);
10718
+ pm_parser_parameter_name_check(parser, &parser->previous);
9731
10719
  pm_parser_local_add_token(parser, &parser->previous);
9732
10720
 
9733
10721
  pm_block_local_variable_node_t *local = pm_block_local_variable_node_create(parser, &parser->previous);
@@ -9850,7 +10838,7 @@ parse_arguments_list(pm_parser_t *parser, pm_arguments_t *arguments, bool accept
9850
10838
  if (arguments->block == NULL) {
9851
10839
  arguments->block = (pm_node_t *) block;
9852
10840
  } else {
9853
- pm_diagnostic_list_append(&parser->error_list, block->base.location.start, block->base.location.end, PM_ERR_ARGUMENT_BLOCK_MULTI);
10841
+ pm_parser_err_node(parser, (pm_node_t *) block, PM_ERR_ARGUMENT_BLOCK_MULTI);
9854
10842
  if (arguments->arguments == NULL) {
9855
10843
  arguments->arguments = pm_arguments_node_create(parser);
9856
10844
  }
@@ -9873,7 +10861,7 @@ parse_predicate(pm_parser_t *parser, pm_binding_power_t binding_power, pm_contex
9873
10861
  bool predicate_closed = accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON);
9874
10862
  predicate_closed |= accept1(parser, PM_TOKEN_KEYWORD_THEN);
9875
10863
  if (!predicate_closed) {
9876
- pm_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, PM_ERR_CONDITIONAL_PREDICATE_TERM);
10864
+ pm_parser_err_current(parser, PM_ERR_CONDITIONAL_PREDICATE_TERM);
9877
10865
  }
9878
10866
 
9879
10867
  context_pop(parser);
@@ -10057,25 +11045,12 @@ parse_string_part(pm_parser_t *parser) {
10057
11045
  // "aaa #{bbb} #@ccc ddd"
10058
11046
  // ^^^^ ^ ^^^^
10059
11047
  case PM_TOKEN_STRING_CONTENT: {
10060
- pm_unescape_type_t unescape_type = PM_UNESCAPE_ALL;
10061
-
10062
- if (parser->lex_modes.current->mode == PM_LEX_HEREDOC) {
10063
- if (parser->lex_modes.current->as.heredoc.indent == PM_HEREDOC_INDENT_TILDE) {
10064
- // If we're in a tilde heredoc, we want to unescape it later
10065
- // because we don't want unescaped newlines to disappear
10066
- // before we handle them in the dedent.
10067
- unescape_type = PM_UNESCAPE_NONE;
10068
- } else if (parser->lex_modes.current->as.heredoc.quote == PM_HEREDOC_QUOTE_SINGLE) {
10069
- unescape_type = PM_UNESCAPE_MINIMAL;
10070
- }
10071
- }
10072
-
10073
- parser_lex(parser);
10074
-
10075
11048
  pm_token_t opening = not_provided(parser);
10076
11049
  pm_token_t closing = not_provided(parser);
11050
+ pm_node_t *node = (pm_node_t *) pm_string_node_create_current_string(parser, &opening, &parser->current, &closing);
10077
11051
 
10078
- return (pm_node_t *) pm_string_node_create_and_unescape(parser, &opening, &parser->previous, &closing, unescape_type);
11052
+ parser_lex(parser);
11053
+ return node;
10079
11054
  }
10080
11055
  // Here the lexer has returned the beginning of an embedded expression. In
10081
11056
  // that case we'll parse the inner statements and return that as the part.
@@ -10166,7 +11141,7 @@ parse_string_part(pm_parser_t *parser) {
10166
11141
  }
10167
11142
  default:
10168
11143
  parser_lex(parser);
10169
- pm_diagnostic_list_append(&parser->error_list, parser->previous.start, parser->previous.end, PM_ERR_CANNOT_PARSE_STRING_PART);
11144
+ pm_parser_err_previous(parser, PM_ERR_CANNOT_PARSE_STRING_PART);
10170
11145
  return NULL;
10171
11146
  }
10172
11147
  }
@@ -10177,7 +11152,6 @@ parse_symbol(pm_parser_t *parser, pm_lex_mode_t *lex_mode, pm_lex_state_t next_s
10177
11152
 
10178
11153
  if (lex_mode->mode != PM_LEX_STRING) {
10179
11154
  if (next_state != PM_LEX_STATE_NONE) lex_state_set(parser, next_state);
10180
- pm_token_t symbol;
10181
11155
 
10182
11156
  switch (parser->current.type) {
10183
11157
  case PM_TOKEN_IDENTIFIER:
@@ -10190,21 +11164,21 @@ parse_symbol(pm_parser_t *parser, pm_lex_mode_t *lex_mode, pm_lex_state_t next_s
10190
11164
  case PM_TOKEN_BACK_REFERENCE:
10191
11165
  case PM_CASE_KEYWORD:
10192
11166
  parser_lex(parser);
10193
- symbol = parser->previous;
10194
11167
  break;
10195
11168
  case PM_CASE_OPERATOR:
10196
11169
  lex_state_set(parser, next_state == PM_LEX_STATE_NONE ? PM_LEX_STATE_ENDFN : next_state);
10197
11170
  parser_lex(parser);
10198
- symbol = parser->previous;
10199
11171
  break;
10200
11172
  default:
10201
11173
  expect2(parser, PM_TOKEN_IDENTIFIER, PM_TOKEN_METHOD_NAME, PM_ERR_SYMBOL_INVALID);
10202
- symbol = parser->previous;
10203
11174
  break;
10204
11175
  }
10205
11176
 
10206
11177
  pm_token_t closing = not_provided(parser);
10207
- return (pm_node_t *) pm_symbol_node_create_and_unescape(parser, &opening, &symbol, &closing, PM_UNESCAPE_ALL);
11178
+ pm_symbol_node_t *symbol = pm_symbol_node_create(parser, &opening, &parser->previous, &closing);
11179
+
11180
+ pm_string_shared_init(&symbol->unescaped, parser->previous.start, parser->previous.end);
11181
+ return (pm_node_t *) symbol;
10208
11182
  }
10209
11183
 
10210
11184
  if (lex_mode->as.string.interpolation) {
@@ -10215,7 +11189,7 @@ parse_symbol(pm_parser_t *parser, pm_lex_mode_t *lex_mode, pm_lex_state_t next_s
10215
11189
 
10216
11190
  pm_token_t content = not_provided(parser);
10217
11191
  pm_token_t closing = parser->previous;
10218
- return (pm_node_t *) pm_symbol_node_create_and_unescape(parser, &opening, &content, &closing, PM_UNESCAPE_NONE);
11192
+ return (pm_node_t *) pm_symbol_node_create(parser, &opening, &content, &closing);
10219
11193
  }
10220
11194
 
10221
11195
  // Now we can parse the first part of the symbol.
@@ -10248,18 +11222,23 @@ parse_symbol(pm_parser_t *parser, pm_lex_mode_t *lex_mode, pm_lex_state_t next_s
10248
11222
  }
10249
11223
 
10250
11224
  pm_token_t content;
10251
- if (accept1(parser, PM_TOKEN_STRING_CONTENT)) {
10252
- content = parser->previous;
11225
+ pm_string_t unescaped;
11226
+
11227
+ if (match1(parser, PM_TOKEN_STRING_CONTENT)) {
11228
+ content = parser->current;
11229
+ unescaped = parser->current_string;
11230
+ parser_lex(parser);
10253
11231
  } else {
10254
11232
  content = (pm_token_t) { .type = PM_TOKEN_STRING_CONTENT, .start = parser->previous.end, .end = parser->previous.end };
11233
+ pm_string_shared_init(&unescaped, content.start, content.end);
10255
11234
  }
10256
11235
 
10257
11236
  if (next_state != PM_LEX_STATE_NONE) {
10258
11237
  lex_state_set(parser, next_state);
10259
11238
  }
10260
- expect1(parser, PM_TOKEN_STRING_END, PM_ERR_SYMBOL_TERM_DYNAMIC);
10261
11239
 
10262
- return (pm_node_t *) pm_symbol_node_create_and_unescape(parser, &opening, &content, &parser->previous, PM_UNESCAPE_ALL);
11240
+ expect1(parser, PM_TOKEN_STRING_END, PM_ERR_SYMBOL_TERM_DYNAMIC);
11241
+ return (pm_node_t *) pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped);
10263
11242
  }
10264
11243
 
10265
11244
  // Parse an argument to undef which can either be a bare word, a
@@ -10276,8 +11255,10 @@ parse_undef_argument(pm_parser_t *parser) {
10276
11255
 
10277
11256
  pm_token_t opening = not_provided(parser);
10278
11257
  pm_token_t closing = not_provided(parser);
11258
+ pm_symbol_node_t *symbol = pm_symbol_node_create(parser, &opening, &parser->previous, &closing);
10279
11259
 
10280
- return (pm_node_t *) pm_symbol_node_create_and_unescape(parser, &opening, &parser->previous, &closing, PM_UNESCAPE_ALL);
11260
+ pm_string_shared_init(&symbol->unescaped, parser->previous.start, parser->previous.end);
11261
+ return (pm_node_t *) symbol;
10281
11262
  }
10282
11263
  case PM_TOKEN_SYMBOL_BEGIN: {
10283
11264
  pm_lex_mode_t lex_mode = *parser->lex_modes.current;
@@ -10286,7 +11267,7 @@ parse_undef_argument(pm_parser_t *parser) {
10286
11267
  return parse_symbol(parser, &lex_mode, PM_LEX_STATE_NONE);
10287
11268
  }
10288
11269
  default:
10289
- pm_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, PM_ERR_UNDEF_ARGUMENT);
11270
+ pm_parser_err_current(parser, PM_ERR_UNDEF_ARGUMENT);
10290
11271
  return (pm_node_t *) pm_missing_node_create(parser, parser->current.start, parser->current.end);
10291
11272
  }
10292
11273
  }
@@ -10310,8 +11291,10 @@ parse_alias_argument(pm_parser_t *parser, bool first) {
10310
11291
  parser_lex(parser);
10311
11292
  pm_token_t opening = not_provided(parser);
10312
11293
  pm_token_t closing = not_provided(parser);
11294
+ pm_symbol_node_t *symbol = pm_symbol_node_create(parser, &opening, &parser->previous, &closing);
10313
11295
 
10314
- return (pm_node_t *) pm_symbol_node_create_and_unescape(parser, &opening, &parser->previous, &closing, PM_UNESCAPE_ALL);
11296
+ pm_string_shared_init(&symbol->unescaped, parser->previous.start, parser->previous.end);
11297
+ return (pm_node_t *) symbol;
10315
11298
  }
10316
11299
  case PM_TOKEN_SYMBOL_BEGIN: {
10317
11300
  pm_lex_mode_t lex_mode = *parser->lex_modes.current;
@@ -10329,7 +11312,7 @@ parse_alias_argument(pm_parser_t *parser, bool first) {
10329
11312
  parser_lex(parser);
10330
11313
  return (pm_node_t *) pm_global_variable_read_node_create(parser, &parser->previous);
10331
11314
  default:
10332
- pm_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, PM_ERR_ALIAS_ARGUMENT);
11315
+ pm_parser_err_current(parser, PM_ERR_ALIAS_ARGUMENT);
10333
11316
  return (pm_node_t *) pm_missing_node_create(parser, parser->current.start, parser->current.end);
10334
11317
  }
10335
11318
  }
@@ -10366,9 +11349,9 @@ parse_variable_call(pm_parser_t *parser) {
10366
11349
  // local variable read. If it's not, then we'll create a normal call
10367
11350
  // node but add an error.
10368
11351
  if (parser->current_scope->explicit_params) {
10369
- pm_diagnostic_list_append(&parser->error_list, parser->previous.start, parser->previous.end, PM_ERR_NUMBERED_PARAMETER_NOT_ALLOWED);
11352
+ pm_parser_err_previous(parser, PM_ERR_NUMBERED_PARAMETER_NOT_ALLOWED);
10370
11353
  } else if (outer_scope_using_numbered_params_p(parser)) {
10371
- pm_diagnostic_list_append(&parser->error_list, parser->previous.start, parser->previous.end, PM_ERR_NUMBERED_PARAMETER_OUTER_SCOPE);
11354
+ pm_parser_err_previous(parser, PM_ERR_NUMBERED_PARAMETER_OUTER_SCOPE);
10372
11355
  } else {
10373
11356
  // When you use a numbered parameter, it implies the existence
10374
11357
  // of all of the locals that exist before it. For example,
@@ -10421,76 +11404,8 @@ parse_method_definition_name(pm_parser_t *parser) {
10421
11404
  }
10422
11405
  }
10423
11406
 
10424
- static int
10425
- parse_heredoc_common_whitespace_for_single_node(pm_parser_t *parser, pm_node_t *node, int common_whitespace)
10426
- {
10427
- const pm_location_t *content_loc = &((pm_string_node_t *) node)->content_loc;
10428
- int cur_whitespace;
10429
- const uint8_t *cur_char = content_loc->start;
10430
-
10431
- while (cur_char && cur_char < content_loc->end) {
10432
- // Any empty newlines aren't included in the minimum whitespace
10433
- // calculation.
10434
- size_t eol_length;
10435
- while ((eol_length = match_eol_at(parser, cur_char))) {
10436
- cur_char += eol_length;
10437
- }
10438
-
10439
- if (cur_char == content_loc->end) break;
10440
-
10441
- cur_whitespace = 0;
10442
-
10443
- while (pm_char_is_inline_whitespace(*cur_char) && cur_char < content_loc->end) {
10444
- if (cur_char[0] == '\t') {
10445
- cur_whitespace = (cur_whitespace / PM_TAB_WHITESPACE_SIZE + 1) * PM_TAB_WHITESPACE_SIZE;
10446
- } else {
10447
- cur_whitespace++;
10448
- }
10449
- cur_char++;
10450
- }
10451
-
10452
- // If we hit a newline, then we have encountered a line that
10453
- // contains only whitespace, and it shouldn't be considered in
10454
- // the calculation of common leading whitespace.
10455
- eol_length = match_eol_at(parser, cur_char);
10456
- if (eol_length) {
10457
- cur_char += eol_length;
10458
- continue;
10459
- }
10460
-
10461
- if (cur_whitespace < common_whitespace || common_whitespace == -1) {
10462
- common_whitespace = cur_whitespace;
10463
- }
10464
-
10465
- cur_char = next_newline(cur_char + 1, parser->end - (cur_char + 1));
10466
- if (cur_char) cur_char++;
10467
- }
10468
- return common_whitespace;
10469
- }
10470
-
10471
- // Calculate the common leading whitespace for each line in a heredoc.
10472
- static int
10473
- parse_heredoc_common_whitespace(pm_parser_t *parser, pm_node_list_t *nodes) {
10474
- int common_whitespace = -1;
10475
-
10476
- for (size_t index = 0; index < nodes->size; index++) {
10477
- pm_node_t *node = nodes->nodes[index];
10478
- if (!PM_NODE_TYPE_P(node, PM_STRING_NODE)) continue;
10479
-
10480
- // If the previous node wasn't a string node, we don't want to trim
10481
- // whitespace. This could happen after an interpolated expression or
10482
- // variable.
10483
- if (index == 0 || PM_NODE_TYPE_P(nodes->nodes[index - 1], PM_STRING_NODE)) {
10484
- common_whitespace = parse_heredoc_common_whitespace_for_single_node(parser, node, common_whitespace);
10485
- }
10486
- }
10487
-
10488
- return common_whitespace;
10489
- }
10490
-
10491
- static pm_string_t *
10492
- parse_heredoc_dedent_single_node(pm_parser_t *parser, pm_string_t *string, bool dedent_node, int common_whitespace, pm_heredoc_quote_t quote)
10493
- {
11407
+ static void
11408
+ parse_heredoc_dedent_string(pm_string_t *string, size_t common_whitespace) {
10494
11409
  // Get a reference to the string struct that is being held by the string
10495
11410
  // node. This is the value we're going to actually manipulate.
10496
11411
  pm_string_ensure_owned(string);
@@ -10499,85 +11414,37 @@ parse_heredoc_dedent_single_node(pm_parser_t *parser, pm_string_t *string, bool
10499
11414
  // destination to move bytes into. We'll also use it for bounds checking
10500
11415
  // since we don't require that these strings be null terminated.
10501
11416
  size_t dest_length = pm_string_length(string);
10502
- uint8_t *source_start = (uint8_t *) string->source;
10503
-
10504
- const uint8_t *source_cursor = source_start;
11417
+ const uint8_t *source_cursor = (uint8_t *) string->source;
10505
11418
  const uint8_t *source_end = source_cursor + dest_length;
10506
11419
 
10507
11420
  // We're going to move bytes backward in the string when we get leading
10508
11421
  // whitespace, so we'll maintain a pointer to the current position in the
10509
11422
  // string that we're writing to.
10510
- uint8_t *dest_cursor = source_start;
10511
-
10512
- while (source_cursor < source_end) {
10513
- // If we need to dedent the next element within the heredoc or the next
10514
- // line within the string node, then we'll do it here.
10515
- if (dedent_node) {
10516
- int trimmed_whitespace = 0;
10517
-
10518
- // While we haven't reached the amount of common whitespace that we need
10519
- // to trim and we haven't reached the end of the string, we'll keep
10520
- // trimming whitespace. Trimming in this context means skipping over
10521
- // these bytes such that they aren't copied into the new string.
10522
- while ((source_cursor < source_end) && pm_char_is_inline_whitespace(*source_cursor) && trimmed_whitespace < common_whitespace) {
10523
- if (*source_cursor == '\t') {
10524
- trimmed_whitespace = (trimmed_whitespace / PM_TAB_WHITESPACE_SIZE + 1) * PM_TAB_WHITESPACE_SIZE;
10525
- if (trimmed_whitespace > common_whitespace) break;
10526
- } else {
10527
- trimmed_whitespace++;
10528
- }
10529
-
10530
- source_cursor++;
10531
- dest_length--;
10532
- }
10533
- }
10534
-
10535
- // At this point we have dedented all that we need to, so we need to find
10536
- // the next newline.
10537
- const uint8_t *breakpoint = next_newline(source_cursor, source_end - source_cursor);
10538
-
10539
- if (breakpoint == NULL) {
10540
- // If there isn't another newline, then we can just move the rest of the
10541
- // string and break from the loop.
10542
- memmove(dest_cursor, source_cursor, (size_t) (source_end - source_cursor));
10543
- break;
11423
+ size_t trimmed_whitespace = 0;
11424
+
11425
+ // While we haven't reached the amount of common whitespace that we need to
11426
+ // trim and we haven't reached the end of the string, we'll keep trimming
11427
+ // whitespace. Trimming in this context means skipping over these bytes such
11428
+ // that they aren't copied into the new string.
11429
+ while ((source_cursor < source_end) && pm_char_is_inline_whitespace(*source_cursor) && trimmed_whitespace < common_whitespace) {
11430
+ if (*source_cursor == '\t') {
11431
+ trimmed_whitespace = (trimmed_whitespace / PM_TAB_WHITESPACE_SIZE + 1) * PM_TAB_WHITESPACE_SIZE;
11432
+ if (trimmed_whitespace > common_whitespace) break;
11433
+ } else {
11434
+ trimmed_whitespace++;
10544
11435
  }
10545
11436
 
10546
- // Otherwise, we need to move everything including the newline, and
10547
- // then set the dedent_node flag to true.
10548
- if (breakpoint < source_end) breakpoint++;
10549
- memmove(dest_cursor, source_cursor, (size_t) (breakpoint - source_cursor));
10550
- dest_cursor += (breakpoint - source_cursor);
10551
- source_cursor = breakpoint;
10552
- dedent_node = true;
11437
+ source_cursor++;
11438
+ dest_length--;
10553
11439
  }
10554
11440
 
10555
- // We only want to write this node into the list if it has any content.
11441
+ memmove((uint8_t *) string->source, source_cursor, (size_t) (source_end - source_cursor));
10556
11442
  string->length = dest_length;
10557
-
10558
- if (dest_length != 0) {
10559
- pm_unescape_manipulate_string(parser, string, (quote == PM_HEREDOC_QUOTE_SINGLE) ? PM_UNESCAPE_MINIMAL : PM_UNESCAPE_ALL);
10560
- }
10561
- return string;
10562
11443
  }
10563
11444
 
10564
11445
  // Take a heredoc node that is indented by a ~ and trim the leading whitespace.
10565
11446
  static void
10566
- parse_heredoc_dedent(pm_parser_t *parser, pm_node_t *heredoc_node, pm_heredoc_quote_t quote)
10567
- {
10568
- pm_node_list_t *nodes;
10569
-
10570
- if (quote == PM_HEREDOC_QUOTE_BACKTICK) {
10571
- nodes = &((pm_interpolated_x_string_node_t *) heredoc_node)->parts;
10572
- } else {
10573
- nodes = &((pm_interpolated_string_node_t *) heredoc_node)->parts;
10574
- }
10575
-
10576
- // First, calculate how much common whitespace we need to trim. If there is
10577
- // none or it's 0, then we can return early.
10578
- int common_whitespace;
10579
- if ((common_whitespace = parse_heredoc_common_whitespace(parser, nodes)) <= 0) return;
10580
-
11447
+ parse_heredoc_dedent(pm_parser_t *parser, pm_node_list_t *nodes, size_t common_whitespace) {
10581
11448
  // The next node should be dedented if it's the first node in the list or if
10582
11449
  // if follows a string node.
10583
11450
  bool dedent_next = true;
@@ -10600,7 +11467,10 @@ parse_heredoc_dedent(pm_parser_t *parser, pm_node_t *heredoc_node, pm_heredoc_qu
10600
11467
  }
10601
11468
 
10602
11469
  pm_string_node_t *string_node = ((pm_string_node_t *) node);
10603
- parse_heredoc_dedent_single_node(parser, &string_node->unescaped, dedent_next, common_whitespace, quote);
11470
+ if (dedent_next) {
11471
+ parse_heredoc_dedent_string(&string_node->unescaped, common_whitespace);
11472
+ }
11473
+
10604
11474
  if (string_node->unescaped.length == 0) {
10605
11475
  pm_node_destroy(parser, node);
10606
11476
  } else {
@@ -10929,13 +11799,13 @@ parse_pattern_primitive(pm_parser_t *parser, pm_diagnostic_id_t diag_id) {
10929
11799
  case PM_TOKEN_STRING_BEGIN:
10930
11800
  key = parse_expression(parser, PM_BINDING_POWER_MAX, PM_ERR_PATTERN_HASH_KEY);
10931
11801
  if (!pm_symbol_node_label_p(key)) {
10932
- pm_diagnostic_list_append(&parser->error_list, key->location.start, key->location.end, PM_ERR_PATTERN_HASH_KEY_LABEL);
11802
+ pm_parser_err_node(parser, key, PM_ERR_PATTERN_HASH_KEY_LABEL);
10933
11803
  }
10934
11804
 
10935
11805
  break;
10936
11806
  default:
10937
11807
  parser_lex(parser);
10938
- pm_diagnostic_list_append(&parser->error_list, parser->previous.start, parser->previous.end, PM_ERR_PATTERN_HASH_KEY);
11808
+ pm_parser_err_previous(parser, PM_ERR_PATTERN_HASH_KEY);
10939
11809
  key = (pm_node_t *) pm_missing_node_create(parser, parser->previous.start, parser->previous.end);
10940
11810
  break;
10941
11811
  }
@@ -10970,7 +11840,7 @@ parse_pattern_primitive(pm_parser_t *parser, pm_diagnostic_id_t diag_id) {
10970
11840
  return (pm_node_t *) pm_range_node_create(parser, NULL, &operator, right);
10971
11841
  }
10972
11842
  default: {
10973
- pm_diagnostic_list_append(&parser->error_list, operator.start, operator.end, PM_ERR_PATTERN_EXPRESSION_AFTER_RANGE);
11843
+ pm_parser_err_token(parser, &operator, PM_ERR_PATTERN_EXPRESSION_AFTER_RANGE);
10974
11844
  pm_node_t *right = (pm_node_t *) pm_missing_node_create(parser, operator.start, operator.end);
10975
11845
  return (pm_node_t *) pm_range_node_create(parser, NULL, &operator, right);
10976
11846
  }
@@ -11058,7 +11928,7 @@ parse_pattern_primitive(pm_parser_t *parser, pm_diagnostic_id_t diag_id) {
11058
11928
  default: {
11059
11929
  // If we get here, then we have a pin operator followed by something
11060
11930
  // not understood. We'll create a missing node and return that.
11061
- pm_diagnostic_list_append(&parser->error_list, operator.start, operator.end, PM_ERR_PATTERN_EXPRESSION_AFTER_PIN);
11931
+ pm_parser_err_token(parser, &operator, PM_ERR_PATTERN_EXPRESSION_AFTER_PIN);
11062
11932
  pm_node_t *variable = (pm_node_t *) pm_missing_node_create(parser, operator.start, operator.end);
11063
11933
  return (pm_node_t *) pm_pinned_variable_node_create(parser, &operator, variable);
11064
11934
  }
@@ -11082,7 +11952,7 @@ parse_pattern_primitive(pm_parser_t *parser, pm_diagnostic_id_t diag_id) {
11082
11952
  return parse_pattern_constant_path(parser, node);
11083
11953
  }
11084
11954
  default:
11085
- pm_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, diag_id);
11955
+ pm_parser_err_current(parser, diag_id);
11086
11956
  return (pm_node_t *) pm_missing_node_create(parser, parser->current.start, parser->current.end);
11087
11957
  }
11088
11958
  }
@@ -11126,7 +11996,7 @@ parse_pattern_primitives(pm_parser_t *parser, pm_diagnostic_id_t diag_id) {
11126
11996
  break;
11127
11997
  }
11128
11998
  default: {
11129
- pm_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, diag_id);
11999
+ pm_parser_err_current(parser, diag_id);
11130
12000
  pm_node_t *right = (pm_node_t *) pm_missing_node_create(parser, parser->current.start, parser->current.end);
11131
12001
 
11132
12002
  if (node == NULL) {
@@ -11218,7 +12088,7 @@ parse_pattern(pm_parser_t *parser, bool top_pattern, pm_diagnostic_id_t diag_id)
11218
12088
  // will continue to parse the rest of the patterns, but we will indicate
11219
12089
  // it as an error.
11220
12090
  if (trailing_rest) {
11221
- pm_diagnostic_list_append(&parser->error_list, parser->previous.start, parser->previous.end, PM_ERR_PATTERN_REST);
12091
+ pm_parser_err_previous(parser, PM_ERR_PATTERN_REST);
11222
12092
  }
11223
12093
 
11224
12094
  trailing_rest = true;
@@ -11284,6 +12154,7 @@ static inline pm_node_t *
11284
12154
  parse_strings(pm_parser_t *parser) {
11285
12155
  assert(parser->current.type == PM_TOKEN_STRING_BEGIN);
11286
12156
  pm_node_t *result = NULL;
12157
+ bool state_is_arg_labeled = lex_state_p(parser, PM_LEX_STATE_ARG | PM_LEX_STATE_LABELED);
11287
12158
 
11288
12159
  while (match1(parser, PM_TOKEN_STRING_BEGIN)) {
11289
12160
  pm_node_t *node = NULL;
@@ -11301,17 +12172,30 @@ parse_strings(pm_parser_t *parser) {
11301
12172
  // start. In that case we'll create an empty content token and
11302
12173
  // return an uninterpolated string.
11303
12174
  pm_token_t content = parse_strings_empty_content(parser->previous.start);
11304
- node = (pm_node_t *) pm_string_node_create_and_unescape(parser, &opening, &content, &parser->previous, PM_UNESCAPE_NONE);
12175
+ pm_string_node_t *string = pm_string_node_create(parser, &opening, &content, &parser->previous);
12176
+
12177
+ pm_string_shared_init(&string->unescaped, content.start, content.end);
12178
+ node = (pm_node_t *) string;
11305
12179
  } else if (accept1(parser, PM_TOKEN_LABEL_END)) {
11306
12180
  // If we get here, then we have an end of a label immediately
11307
12181
  // after a start. In that case we'll create an empty symbol
11308
12182
  // node.
11309
12183
  pm_token_t opening = not_provided(parser);
11310
12184
  pm_token_t content = parse_strings_empty_content(parser->previous.start);
11311
- node = (pm_node_t *) pm_symbol_node_create(parser, &opening, &content, &parser->previous);
12185
+ pm_symbol_node_t *symbol = pm_symbol_node_create(parser, &opening, &content, &parser->previous);
12186
+
12187
+ pm_string_shared_init(&symbol->unescaped, content.start, content.end);
12188
+ node = (pm_node_t *) symbol;
11312
12189
  } else if (!lex_interpolation) {
11313
12190
  // If we don't accept interpolation then we expect the string to
11314
12191
  // start with a single string content node.
12192
+ pm_string_t unescaped;
12193
+ if (match1(parser, PM_TOKEN_EOF)) {
12194
+ unescaped = PM_EMPTY_STRING;
12195
+ } else {
12196
+ unescaped = parser->current_string;
12197
+ }
12198
+
11315
12199
  expect1(parser, PM_TOKEN_STRING_CONTENT, PM_ERR_EXPECT_STRING_CONTENT);
11316
12200
  pm_token_t content = parser->previous;
11317
12201
 
@@ -11330,21 +12214,22 @@ parse_strings(pm_parser_t *parser) {
11330
12214
  pm_node_list_t parts = PM_EMPTY_NODE_LIST;
11331
12215
 
11332
12216
  pm_token_t delimiters = not_provided(parser);
11333
- pm_node_t *part = (pm_node_t *) pm_string_node_create_and_unescape(parser, &delimiters, &content, &delimiters, PM_UNESCAPE_MINIMAL);
12217
+ pm_node_t *part = (pm_node_t *) pm_string_node_create_unescaped(parser, &delimiters, &content, &delimiters, &unescaped);
11334
12218
  pm_node_list_append(&parts, part);
11335
12219
 
11336
- while (accept1(parser, PM_TOKEN_STRING_CONTENT)) {
11337
- part = (pm_node_t *) pm_string_node_create_and_unescape(parser, &delimiters, &parser->previous, &delimiters, PM_UNESCAPE_MINIMAL);
12220
+ do {
12221
+ part = (pm_node_t *) pm_string_node_create_current_string(parser, &delimiters, &parser->current, &delimiters);
11338
12222
  pm_node_list_append(&parts, part);
11339
- }
12223
+ parser_lex(parser);
12224
+ } while (match1(parser, PM_TOKEN_STRING_CONTENT));
11340
12225
 
11341
12226
  expect1(parser, PM_TOKEN_STRING_END, PM_ERR_STRING_LITERAL_TERM);
11342
12227
  node = (pm_node_t *) pm_interpolated_string_node_create(parser, &opening, &parts, &parser->previous);
11343
- } else if (accept1(parser, PM_TOKEN_LABEL_END)) {
11344
- node = (pm_node_t *) pm_symbol_node_create_and_unescape(parser, &opening, &content, &parser->previous, PM_UNESCAPE_ALL);
12228
+ } else if (accept1(parser, PM_TOKEN_LABEL_END) && !state_is_arg_labeled) {
12229
+ node = (pm_node_t *) pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped);
11345
12230
  } else {
11346
12231
  expect1(parser, PM_TOKEN_STRING_END, PM_ERR_STRING_LITERAL_TERM);
11347
- node = (pm_node_t *) pm_string_node_create_and_unescape(parser, &opening, &content, &parser->previous, PM_UNESCAPE_MINIMAL);
12232
+ node = (pm_node_t *) pm_string_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped);
11348
12233
  }
11349
12234
  } else if (match1(parser, PM_TOKEN_STRING_CONTENT)) {
11350
12235
  // In this case we've hit string content so we know the string
@@ -11352,12 +12237,14 @@ parse_strings(pm_parser_t *parser) {
11352
12237
  // following token is the end (in which case we can return a
11353
12238
  // plain string) or if it's not then it has interpolation.
11354
12239
  pm_token_t content = parser->current;
12240
+ pm_string_t unescaped = parser->current_string;
11355
12241
  parser_lex(parser);
11356
12242
 
11357
- if (accept1(parser, PM_TOKEN_STRING_END)) {
11358
- node = (pm_node_t *) pm_string_node_create_and_unescape(parser, &opening, &content, &parser->previous, PM_UNESCAPE_ALL);
12243
+ if (match1(parser, PM_TOKEN_STRING_END)) {
12244
+ node = (pm_node_t *) pm_string_node_create_unescaped(parser, &opening, &content, &parser->current, &unescaped);
12245
+ parser_lex(parser);
11359
12246
  } else if (accept1(parser, PM_TOKEN_LABEL_END)) {
11360
- node = (pm_node_t *) pm_symbol_node_create_and_unescape(parser, &opening, &content, &parser->previous, PM_UNESCAPE_ALL);
12247
+ node = (pm_node_t *) pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped);
11361
12248
  } else {
11362
12249
  // If we get here, then we have interpolation so we'll need
11363
12250
  // to create a string or symbol node with interpolation.
@@ -11365,7 +12252,7 @@ parse_strings(pm_parser_t *parser) {
11365
12252
  pm_token_t string_opening = not_provided(parser);
11366
12253
  pm_token_t string_closing = not_provided(parser);
11367
12254
 
11368
- pm_node_t *part = (pm_node_t *) pm_string_node_create_and_unescape(parser, &string_opening, &parser->previous, &string_closing, PM_UNESCAPE_ALL);
12255
+ pm_node_t *part = (pm_node_t *) pm_string_node_create_unescaped(parser, &string_opening, &parser->previous, &string_closing, &unescaped);
11369
12256
  pm_node_list_append(&parts, part);
11370
12257
 
11371
12258
  while (!match3(parser, PM_TOKEN_STRING_END, PM_TOKEN_LABEL_END, PM_TOKEN_EOF)) {
@@ -11374,7 +12261,7 @@ parse_strings(pm_parser_t *parser) {
11374
12261
  }
11375
12262
  }
11376
12263
 
11377
- if (accept1(parser, PM_TOKEN_LABEL_END)) {
12264
+ if (accept1(parser, PM_TOKEN_LABEL_END) && !state_is_arg_labeled) {
11378
12265
  node = (pm_node_t *) pm_interpolated_symbol_node_create(parser, &opening, &parts, &parser->previous);
11379
12266
  } else {
11380
12267
  expect1(parser, PM_TOKEN_STRING_END, PM_ERR_STRING_INTERPOLATED_TERM);
@@ -11382,11 +12269,11 @@ parse_strings(pm_parser_t *parser) {
11382
12269
  }
11383
12270
  }
11384
12271
  } else {
11385
- // If we get here, then the first part of the string is not
11386
- // plain string content, in which case we need to parse the
11387
- // string as an interpolated string.
12272
+ // If we get here, then the first part of the string is not plain
12273
+ // string content, in which case we need to parse the string as an
12274
+ // interpolated string.
11388
12275
  pm_node_list_t parts = PM_EMPTY_NODE_LIST;
11389
- pm_node_t *part = NULL;
12276
+ pm_node_t *part;
11390
12277
 
11391
12278
  while (!match3(parser, PM_TOKEN_STRING_END, PM_TOKEN_LABEL_END, PM_TOKEN_EOF)) {
11392
12279
  if ((part = parse_string_part(parser)) != NULL) {
@@ -11418,7 +12305,7 @@ parse_strings(pm_parser_t *parser) {
11418
12305
  // If it cannot be concatenated with the previous node, then we'll
11419
12306
  // need to add a syntax error.
11420
12307
  if (!PM_NODE_TYPE_P(node, PM_STRING_NODE) && !PM_NODE_TYPE_P(node, PM_INTERPOLATED_STRING_NODE)) {
11421
- pm_diagnostic_list_append(&parser->error_list, node->location.start, node->location.end, PM_ERR_STRING_CONCATENATION);
12308
+ pm_parser_err_node(parser, node, PM_ERR_STRING_CONCATENATION);
11422
12309
  }
11423
12310
 
11424
12311
  // Either way we will create a concat node to hold the strings
@@ -11464,7 +12351,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
11464
12351
  element = (pm_node_t *) pm_splat_node_create(parser, &operator, expression);
11465
12352
  } else if (match2(parser, PM_TOKEN_LABEL, PM_TOKEN_USTAR_STAR)) {
11466
12353
  if (parsed_bare_hash) {
11467
- pm_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, PM_ERR_EXPRESSION_BARE_HASH);
12354
+ pm_parser_err_current(parser, PM_ERR_EXPRESSION_BARE_HASH);
11468
12355
  }
11469
12356
 
11470
12357
  pm_keyword_hash_node_t *hash = pm_keyword_hash_node_create(parser);
@@ -11480,7 +12367,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
11480
12367
 
11481
12368
  if (pm_symbol_node_label_p(element) || accept1(parser, PM_TOKEN_EQUAL_GREATER)) {
11482
12369
  if (parsed_bare_hash) {
11483
- pm_diagnostic_list_append(&parser->error_list, parser->previous.start, parser->previous.end, PM_ERR_EXPRESSION_BARE_HASH);
12370
+ pm_parser_err_previous(parser, PM_ERR_EXPRESSION_BARE_HASH);
11484
12371
  }
11485
12372
 
11486
12373
  pm_keyword_hash_node_t *hash = pm_keyword_hash_node_create(parser);
@@ -11598,7 +12485,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
11598
12485
  // If we didn't find a terminator and we didn't find a right
11599
12486
  // parenthesis, then this is a syntax error.
11600
12487
  if (!terminator_found) {
11601
- pm_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.start, PM_ERR_EXPECT_EOL_AFTER_STATEMENT);
12488
+ pm_parser_err(parser, parser->current.start, parser->current.start, PM_ERR_EXPECT_EOL_AFTER_STATEMENT);
11602
12489
  }
11603
12490
 
11604
12491
  // Parse each statement within the parentheses.
@@ -11627,7 +12514,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
11627
12514
  } else if (match1(parser, PM_TOKEN_PARENTHESIS_RIGHT)) {
11628
12515
  break;
11629
12516
  } else {
11630
- pm_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.start, PM_ERR_EXPECT_EOL_AFTER_STATEMENT);
12517
+ pm_parser_err(parser, parser->current.start, parser->current.start, PM_ERR_EXPECT_EOL_AFTER_STATEMENT);
11631
12518
  }
11632
12519
  }
11633
12520
 
@@ -11665,7 +12552,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
11665
12552
  content.start = content.start + 1;
11666
12553
 
11667
12554
  pm_token_t closing = not_provided(parser);
11668
- pm_node_t *node = (pm_node_t *) pm_char_literal_node_create_and_unescape(parser, &opening, &content, &closing, PM_UNESCAPE_ALL);
12555
+ pm_node_t *node = (pm_node_t *) pm_string_node_create_current_string(parser, &opening, &content, &closing);
11669
12556
 
11670
12557
  // Characters can be followed by strings in which case they are
11671
12558
  // automatically concatenated.
@@ -11839,9 +12726,10 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
11839
12726
  case PM_TOKEN_HEREDOC_START: {
11840
12727
  // Here we have found a heredoc. We'll parse it and add it to the
11841
12728
  // list of strings.
11842
- assert(parser->lex_modes.current->mode == PM_LEX_HEREDOC);
11843
- pm_heredoc_quote_t quote = parser->lex_modes.current->as.heredoc.quote;
11844
- pm_heredoc_indent_t indent = parser->lex_modes.current->as.heredoc.indent;
12729
+ pm_lex_mode_t *lex_mode = parser->lex_modes.current;
12730
+ assert(lex_mode->mode == PM_LEX_HEREDOC);
12731
+ pm_heredoc_quote_t quote = lex_mode->as.heredoc.quote;
12732
+ pm_heredoc_indent_t indent = lex_mode->as.heredoc.indent;
11845
12733
 
11846
12734
  parser_lex(parser);
11847
12735
  pm_token_t opening = parser->previous;
@@ -11857,9 +12745,9 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
11857
12745
  pm_token_t content = parse_strings_empty_content(parser->previous.start);
11858
12746
 
11859
12747
  if (quote == PM_HEREDOC_QUOTE_BACKTICK) {
11860
- node = (pm_node_t *) pm_xstring_node_create_and_unescape(parser, &opening, &content, &parser->previous);
12748
+ node = (pm_node_t *) pm_xstring_node_create_unescaped(parser, &opening, &content, &parser->previous, &PM_EMPTY_STRING);
11861
12749
  } else {
11862
- node = (pm_node_t *) pm_string_node_create_and_unescape(parser, &opening, &content, &parser->previous, PM_UNESCAPE_NONE);
12750
+ node = (pm_node_t *) pm_string_node_create_unescaped(parser, &opening, &content, &parser->previous, &PM_EMPTY_STRING);
11863
12751
  }
11864
12752
 
11865
12753
  node->location.end = opening.end;
@@ -11884,15 +12772,14 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
11884
12772
  cast->base.type = PM_X_STRING_NODE;
11885
12773
  }
11886
12774
 
11887
- lex_state_set(parser, PM_LEX_STATE_END);
11888
- expect1(parser, PM_TOKEN_HEREDOC_END, PM_ERR_HEREDOC_TERM);
12775
+ size_t common_whitespace = lex_mode->as.heredoc.common_whitespace;
12776
+ if (indent == PM_HEREDOC_INDENT_TILDE && (common_whitespace != (size_t) -1) && (common_whitespace != 0)) {
12777
+ parse_heredoc_dedent_string(&cast->unescaped, common_whitespace);
12778
+ }
11889
12779
 
11890
12780
  node = (pm_node_t *) cast;
11891
-
11892
- if (indent == PM_HEREDOC_INDENT_TILDE) {
11893
- int common_whitespace = parse_heredoc_common_whitespace_for_single_node(parser, node, -1);
11894
- parse_heredoc_dedent_single_node(parser, &cast->unescaped, true, common_whitespace, quote);
11895
- }
12781
+ lex_state_set(parser, PM_LEX_STATE_END);
12782
+ expect1(parser, PM_TOKEN_HEREDOC_END, PM_ERR_HEREDOC_TERM);
11896
12783
  } else {
11897
12784
  // If we get here, then we have multiple parts in the heredoc,
11898
12785
  // so we'll need to create an interpolated string node to hold
@@ -11931,8 +12818,16 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
11931
12818
 
11932
12819
  // If this is a heredoc that is indented with a ~, then we need
11933
12820
  // to dedent each line by the common leading whitespace.
11934
- if (indent == PM_HEREDOC_INDENT_TILDE) {
11935
- parse_heredoc_dedent(parser, node, quote);
12821
+ size_t common_whitespace = lex_mode->as.heredoc.common_whitespace;
12822
+ if (indent == PM_HEREDOC_INDENT_TILDE && (common_whitespace != (size_t) -1) && (common_whitespace != 0)) {
12823
+ pm_node_list_t *nodes;
12824
+ if (quote == PM_HEREDOC_QUOTE_BACKTICK) {
12825
+ nodes = &((pm_interpolated_x_string_node_t *) node)->parts;
12826
+ } else {
12827
+ nodes = &((pm_interpolated_string_node_t *) node)->parts;
12828
+ }
12829
+
12830
+ parse_heredoc_dedent(parser, nodes, common_whitespace);
11936
12831
  }
11937
12832
  }
11938
12833
 
@@ -11995,10 +12890,10 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
11995
12890
  case PM_GLOBAL_VARIABLE_READ_NODE: {
11996
12891
  if (PM_NODE_TYPE_P(old_name, PM_BACK_REFERENCE_READ_NODE) || PM_NODE_TYPE_P(old_name, PM_NUMBERED_REFERENCE_READ_NODE) || PM_NODE_TYPE_P(old_name, PM_GLOBAL_VARIABLE_READ_NODE)) {
11997
12892
  if (PM_NODE_TYPE_P(old_name, PM_NUMBERED_REFERENCE_READ_NODE)) {
11998
- pm_diagnostic_list_append(&parser->error_list, old_name->location.start, old_name->location.end, PM_ERR_ALIAS_ARGUMENT);
12893
+ pm_parser_err_node(parser, old_name, PM_ERR_ALIAS_ARGUMENT);
11999
12894
  }
12000
12895
  } else {
12001
- pm_diagnostic_list_append(&parser->error_list, old_name->location.start, old_name->location.end, PM_ERR_ALIAS_ARGUMENT);
12896
+ pm_parser_err_node(parser, old_name, PM_ERR_ALIAS_ARGUMENT);
12002
12897
  }
12003
12898
 
12004
12899
  return (pm_node_t *) pm_alias_global_variable_node_create(parser, &keyword, new_name, old_name);
@@ -12006,7 +12901,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
12006
12901
  case PM_SYMBOL_NODE:
12007
12902
  case PM_INTERPOLATED_SYMBOL_NODE: {
12008
12903
  if (!PM_NODE_TYPE_P(old_name, PM_SYMBOL_NODE) && !PM_NODE_TYPE_P(old_name, PM_INTERPOLATED_SYMBOL_NODE)) {
12009
- pm_diagnostic_list_append(&parser->error_list, old_name->location.start, old_name->location.end, PM_ERR_ALIAS_ARGUMENT);
12904
+ pm_parser_err_node(parser, old_name, PM_ERR_ALIAS_ARGUMENT);
12010
12905
  }
12011
12906
  }
12012
12907
  /* fallthrough */
@@ -12032,7 +12927,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
12032
12927
  }
12033
12928
 
12034
12929
  if (accept1(parser, PM_TOKEN_KEYWORD_END)) {
12035
- pm_diagnostic_list_append(&parser->error_list, case_keyword.start, case_keyword.end, PM_ERR_CASE_MISSING_CONDITIONS);
12930
+ pm_parser_err_token(parser, &case_keyword, PM_ERR_CASE_MISSING_CONDITIONS);
12036
12931
  return (pm_node_t *) pm_case_node_create(parser, &case_keyword, predicate, NULL, &parser->previous);
12037
12932
  }
12038
12933
 
@@ -12142,7 +13037,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
12142
13037
  // If we didn't parse any conditions (in or when) then we need to
12143
13038
  // indicate that we have an error.
12144
13039
  if (case_node->conditions.size == 0) {
12145
- pm_diagnostic_list_append(&parser->error_list, case_keyword.start, case_keyword.end, PM_ERR_CASE_MISSING_CONDITIONS);
13040
+ pm_parser_err_token(parser, &case_keyword, PM_ERR_CASE_MISSING_CONDITIONS);
12146
13041
  }
12147
13042
 
12148
13043
  accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON);
@@ -12185,12 +13080,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
12185
13080
  pm_begin_node_end_keyword_set(begin_node, &parser->previous);
12186
13081
 
12187
13082
  if ((begin_node->else_clause != NULL) && (begin_node->rescue_clause == NULL)) {
12188
- pm_diagnostic_list_append(
12189
- &parser->error_list,
12190
- begin_node->else_clause->base.location.start,
12191
- begin_node->else_clause->base.location.end,
12192
- PM_ERR_BEGIN_LONELY_ELSE
12193
- );
13083
+ pm_parser_err_node(parser, (pm_node_t *) begin_node->else_clause, PM_ERR_BEGIN_LONELY_ELSE);
12194
13084
  }
12195
13085
 
12196
13086
  return (pm_node_t *) begin_node;
@@ -12206,7 +13096,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
12206
13096
  expect1(parser, PM_TOKEN_BRACE_RIGHT, PM_ERR_BEGIN_UPCASE_TERM);
12207
13097
  pm_context_t context = parser->current_context->context;
12208
13098
  if ((context != PM_CONTEXT_MAIN) && (context != PM_CONTEXT_PREEXE)) {
12209
- pm_diagnostic_list_append(&parser->error_list, keyword.start, keyword.end, PM_ERR_BEGIN_UPCASE_TOPLEVEL);
13099
+ pm_parser_err_token(parser, &keyword, PM_ERR_BEGIN_UPCASE_TOPLEVEL);
12210
13100
  }
12211
13101
  return (pm_node_t *) pm_pre_execution_node_create(parser, &keyword, &opening, statements, &parser->previous);
12212
13102
  }
@@ -12239,7 +13129,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
12239
13129
  (parser->current_context->context == PM_CONTEXT_CLASS) ||
12240
13130
  (parser->current_context->context == PM_CONTEXT_MODULE)
12241
13131
  ) {
12242
- pm_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, PM_ERR_RETURN_INVALID);
13132
+ pm_parser_err_current(parser, PM_ERR_RETURN_INVALID);
12243
13133
  }
12244
13134
  return (pm_node_t *) pm_return_node_create(parser, &keyword, arguments.arguments);
12245
13135
  }
@@ -12305,7 +13195,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
12305
13195
  pm_node_t *constant_path = parse_expression(parser, PM_BINDING_POWER_INDEX, PM_ERR_CLASS_NAME);
12306
13196
  pm_token_t name = parser->previous;
12307
13197
  if (name.type != PM_TOKEN_CONSTANT) {
12308
- pm_diagnostic_list_append(&parser->error_list, name.start, name.end, PM_ERR_CLASS_NAME);
13198
+ pm_parser_err_token(parser, &name, PM_ERR_CLASS_NAME);
12309
13199
  }
12310
13200
 
12311
13201
  pm_token_t inheritance_operator;
@@ -12346,7 +13236,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
12346
13236
  expect1(parser, PM_TOKEN_KEYWORD_END, PM_ERR_CLASS_TERM);
12347
13237
 
12348
13238
  if (context_def_p(parser)) {
12349
- pm_diagnostic_list_append(&parser->error_list, class_keyword.start, class_keyword.end, PM_ERR_CLASS_IN_METHOD);
13239
+ pm_parser_err_token(parser, &class_keyword, PM_ERR_CLASS_IN_METHOD);
12350
13240
  }
12351
13241
 
12352
13242
  pm_constant_id_list_t locals = parser->current_scope->locals;
@@ -12354,7 +13244,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
12354
13244
  pm_do_loop_stack_pop(parser);
12355
13245
 
12356
13246
  if (!PM_NODE_TYPE_P(constant_path, PM_CONSTANT_PATH_NODE) && !(PM_NODE_TYPE_P(constant_path, PM_CONSTANT_READ_NODE))) {
12357
- pm_diagnostic_list_append(&parser->error_list, constant_path->location.start, constant_path->location.end, PM_ERR_CLASS_NAME);
13247
+ pm_parser_err_node(parser, constant_path, PM_ERR_CLASS_NAME);
12358
13248
  }
12359
13249
 
12360
13250
  return (pm_node_t *) pm_class_node_create(parser, &locals, &class_keyword, constant_path, &name, &inheritance_operator, superclass, statements, &parser->previous);
@@ -12486,7 +13376,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
12486
13376
  // If, after all that, we were unable to find a method name, add an
12487
13377
  // error to the error list.
12488
13378
  if (name.type == PM_TOKEN_MISSING) {
12489
- pm_diagnostic_list_append(&parser->error_list, parser->previous.start, parser->previous.end, PM_ERR_DEF_NAME);
13379
+ pm_parser_err_previous(parser, PM_ERR_DEF_NAME);
12490
13380
  }
12491
13381
 
12492
13382
  pm_token_t lparen;
@@ -12538,7 +13428,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
12538
13428
 
12539
13429
  if (accept1(parser, PM_TOKEN_EQUAL)) {
12540
13430
  if (token_is_setter_name(&name)) {
12541
- pm_diagnostic_list_append(&parser->error_list, name.start, name.end, PM_ERR_DEF_ENDLESS_SETTER);
13431
+ pm_parser_err_token(parser, &name, PM_ERR_DEF_ENDLESS_SETTER);
12542
13432
  }
12543
13433
  equal = parser->previous;
12544
13434
 
@@ -12656,6 +13546,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
12656
13546
  parser_lex(parser);
12657
13547
  pm_token_t for_keyword = parser->previous;
12658
13548
  pm_node_t *index;
13549
+ pm_parser_scope_push_transparent(parser);
12659
13550
 
12660
13551
  // First, parse out the first index expression.
12661
13552
  if (accept1(parser, PM_TOKEN_USTAR)) {
@@ -12670,7 +13561,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
12670
13561
  } else if (token_begins_expression_p(parser->current.type)) {
12671
13562
  index = parse_expression(parser, PM_BINDING_POWER_INDEX, PM_ERR_EXPECT_EXPRESSION_AFTER_COMMA);
12672
13563
  } else {
12673
- pm_diagnostic_list_append(&parser->error_list, for_keyword.start, for_keyword.end, PM_ERR_FOR_INDEX);
13564
+ pm_parser_err_token(parser, &for_keyword, PM_ERR_FOR_INDEX);
12674
13565
  index = (pm_node_t *) pm_missing_node_create(parser, for_keyword.start, for_keyword.end);
12675
13566
  }
12676
13567
 
@@ -12681,6 +13572,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
12681
13572
  index = parse_target(parser, index);
12682
13573
  }
12683
13574
 
13575
+ pm_parser_scope_pop(parser);
12684
13576
  pm_do_loop_stack_push(parser, true);
12685
13577
 
12686
13578
  expect1(parser, PM_TOKEN_KEYWORD_IN, PM_ERR_FOR_IN);
@@ -12700,8 +13592,10 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
12700
13592
  pm_statements_node_t *statements = NULL;
12701
13593
 
12702
13594
  if (!accept1(parser, PM_TOKEN_KEYWORD_END)) {
13595
+ pm_parser_scope_push_transparent(parser);
12703
13596
  statements = parse_statements(parser, PM_CONTEXT_FOR);
12704
13597
  expect1(parser, PM_TOKEN_KEYWORD_END, PM_ERR_FOR_TERM);
13598
+ pm_parser_scope_pop(parser);
12705
13599
  }
12706
13600
 
12707
13601
  return (pm_node_t *) pm_for_node_create(parser, index, collection, statements, &for_keyword, &in_keyword, &do_keyword, &parser->previous);
@@ -12797,7 +13691,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
12797
13691
  // syntax error. We handle that here as well.
12798
13692
  name = parser->previous;
12799
13693
  if (name.type != PM_TOKEN_CONSTANT) {
12800
- pm_diagnostic_list_append(&parser->error_list, name.start, name.end, PM_ERR_MODULE_NAME);
13694
+ pm_parser_err_token(parser, &name, PM_ERR_MODULE_NAME);
12801
13695
  }
12802
13696
 
12803
13697
  pm_parser_scope_push(parser, true);
@@ -12821,7 +13715,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
12821
13715
  expect1(parser, PM_TOKEN_KEYWORD_END, PM_ERR_MODULE_TERM);
12822
13716
 
12823
13717
  if (context_def_p(parser)) {
12824
- pm_diagnostic_list_append(&parser->error_list, module_keyword.start, module_keyword.end, PM_ERR_MODULE_IN_METHOD);
13718
+ pm_parser_err_token(parser, &module_keyword, PM_ERR_MODULE_IN_METHOD);
12825
13719
  }
12826
13720
 
12827
13721
  return (pm_node_t *) pm_module_node_create(parser, &locals, &module_keyword, constant_path, &name, statements, &parser->previous);
@@ -12891,13 +13785,13 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
12891
13785
  accept1(parser, PM_TOKEN_WORDS_SEP);
12892
13786
  if (match1(parser, PM_TOKEN_STRING_END)) break;
12893
13787
 
12894
- expect1(parser, PM_TOKEN_STRING_CONTENT, PM_ERR_LIST_I_LOWER_ELEMENT);
12895
-
12896
- pm_token_t opening = not_provided(parser);
12897
- pm_token_t closing = not_provided(parser);
13788
+ if (match1(parser, PM_TOKEN_STRING_CONTENT)) {
13789
+ pm_token_t opening = not_provided(parser);
13790
+ pm_token_t closing = not_provided(parser);
13791
+ pm_array_node_elements_append(array, (pm_node_t *) pm_symbol_node_create_current_string(parser, &opening, &parser->current, &closing));
13792
+ }
12898
13793
 
12899
- pm_node_t *symbol = (pm_node_t *) pm_symbol_node_create_and_unescape(parser, &opening, &parser->previous, &closing, PM_UNESCAPE_MINIMAL);
12900
- pm_array_node_elements_append(array, symbol);
13794
+ expect1(parser, PM_TOKEN_STRING_CONTENT, PM_ERR_LIST_I_LOWER_ELEMENT);
12901
13795
  }
12902
13796
 
12903
13797
  expect1(parser, PM_TOKEN_STRING_END, PM_ERR_LIST_I_LOWER_TERM);
@@ -12937,26 +13831,26 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
12937
13831
  // If we hit content and the current node is NULL, then this is
12938
13832
  // the first string content we've seen. In that case we're going
12939
13833
  // to create a new string node and set that to the current.
13834
+ current = (pm_node_t *) pm_symbol_node_create_current_string(parser, &opening, &parser->current, &closing);
12940
13835
  parser_lex(parser);
12941
- current = (pm_node_t *) pm_symbol_node_create_and_unescape(parser, &opening, &parser->previous, &closing, PM_UNESCAPE_ALL);
12942
13836
  } else if (PM_NODE_TYPE_P(current, PM_INTERPOLATED_SYMBOL_NODE)) {
12943
13837
  // If we hit string content and the current node is an
12944
13838
  // interpolated string, then we need to append the string content
12945
13839
  // to the list of child nodes.
12946
- pm_node_t *part = parse_string_part(parser);
12947
- pm_interpolated_symbol_node_append((pm_interpolated_symbol_node_t *) current, part);
13840
+ pm_node_t *string = (pm_node_t *) pm_string_node_create_current_string(parser, &opening, &parser->current, &closing);
13841
+ parser_lex(parser);
13842
+
13843
+ pm_interpolated_symbol_node_append((pm_interpolated_symbol_node_t *) current, string);
12948
13844
  } else if (PM_NODE_TYPE_P(current, PM_SYMBOL_NODE)) {
12949
13845
  // If we hit string content and the current node is a string node,
12950
13846
  // then we need to convert the current node into an interpolated
12951
13847
  // string and add the string content to the list of child nodes.
12952
- pm_token_t opening = not_provided(parser);
12953
- pm_token_t closing = not_provided(parser);
12954
- pm_interpolated_symbol_node_t *interpolated =
12955
- pm_interpolated_symbol_node_create(parser, &opening, NULL, &closing);
12956
- pm_interpolated_symbol_node_append(interpolated, current);
13848
+ pm_node_t *string = (pm_node_t *) pm_string_node_create_current_string(parser, &opening, &parser->previous, &closing);
13849
+ parser_lex(parser);
12957
13850
 
12958
- pm_node_t *part = parse_string_part(parser);
12959
- pm_interpolated_symbol_node_append(interpolated, part);
13851
+ pm_interpolated_symbol_node_t *interpolated = pm_interpolated_symbol_node_create(parser, &opening, NULL, &closing);
13852
+ pm_interpolated_symbol_node_append(interpolated, current);
13853
+ pm_interpolated_symbol_node_append(interpolated, string);
12960
13854
  current = (pm_node_t *) interpolated;
12961
13855
  } else {
12962
13856
  assert(false && "unreachable");
@@ -13063,12 +13957,15 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
13063
13957
  accept1(parser, PM_TOKEN_WORDS_SEP);
13064
13958
  if (match1(parser, PM_TOKEN_STRING_END)) break;
13065
13959
 
13066
- expect1(parser, PM_TOKEN_STRING_CONTENT, PM_ERR_LIST_W_LOWER_ELEMENT);
13960
+ if (match1(parser, PM_TOKEN_STRING_CONTENT)) {
13961
+ pm_token_t opening = not_provided(parser);
13962
+ pm_token_t closing = not_provided(parser);
13067
13963
 
13068
- pm_token_t opening = not_provided(parser);
13069
- pm_token_t closing = not_provided(parser);
13070
- pm_node_t *string = (pm_node_t *) pm_string_node_create_and_unescape(parser, &opening, &parser->previous, &closing, PM_UNESCAPE_WHITESPACE);
13071
- pm_array_node_elements_append(array, string);
13964
+ pm_node_t *string = (pm_node_t *) pm_string_node_create_current_string(parser, &opening, &parser->current, &closing);
13965
+ pm_array_node_elements_append(array, string);
13966
+ }
13967
+
13968
+ expect1(parser, PM_TOKEN_STRING_CONTENT, PM_ERR_LIST_W_LOWER_ELEMENT);
13072
13969
  }
13073
13970
 
13074
13971
  expect1(parser, PM_TOKEN_STRING_END, PM_ERR_LIST_W_LOWER_TERM);
@@ -13101,29 +13998,29 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
13101
13998
  break;
13102
13999
  }
13103
14000
  case PM_TOKEN_STRING_CONTENT: {
14001
+ pm_token_t opening = not_provided(parser);
14002
+ pm_token_t closing = not_provided(parser);
14003
+
14004
+ pm_node_t *string = (pm_node_t *) pm_string_node_create_current_string(parser, &opening, &parser->current, &closing);
14005
+ parser_lex(parser);
14006
+
13104
14007
  if (current == NULL) {
13105
14008
  // If we hit content and the current node is NULL, then this is
13106
14009
  // the first string content we've seen. In that case we're going
13107
14010
  // to create a new string node and set that to the current.
13108
- current = parse_string_part(parser);
14011
+ current = string;
13109
14012
  } else if (PM_NODE_TYPE_P(current, PM_INTERPOLATED_STRING_NODE)) {
13110
14013
  // If we hit string content and the current node is an
13111
14014
  // interpolated string, then we need to append the string content
13112
14015
  // to the list of child nodes.
13113
- pm_node_t *part = parse_string_part(parser);
13114
- pm_interpolated_string_node_append((pm_interpolated_string_node_t *) current, part);
14016
+ pm_interpolated_string_node_append((pm_interpolated_string_node_t *) current, string);
13115
14017
  } else if (PM_NODE_TYPE_P(current, PM_STRING_NODE)) {
13116
14018
  // If we hit string content and the current node is a string node,
13117
14019
  // then we need to convert the current node into an interpolated
13118
14020
  // string and add the string content to the list of child nodes.
13119
- pm_token_t opening = not_provided(parser);
13120
- pm_token_t closing = not_provided(parser);
13121
- pm_interpolated_string_node_t *interpolated =
13122
- pm_interpolated_string_node_create(parser, &opening, NULL, &closing);
14021
+ pm_interpolated_string_node_t *interpolated = pm_interpolated_string_node_create(parser, &opening, NULL, &closing);
13123
14022
  pm_interpolated_string_node_append(interpolated, current);
13124
-
13125
- pm_node_t *part = parse_string_part(parser);
13126
- pm_interpolated_string_node_append(interpolated, part);
14023
+ pm_interpolated_string_node_append(interpolated, string);
13127
14024
  current = (pm_node_t *) interpolated;
13128
14025
  } else {
13129
14026
  assert(false && "unreachable");
@@ -13218,7 +14115,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
13218
14115
  };
13219
14116
 
13220
14117
  parser_lex(parser);
13221
- return (pm_node_t *) pm_regular_expression_node_create_and_unescape(parser, &opening, &content, &parser->previous, PM_UNESCAPE_ALL);
14118
+ return (pm_node_t *) pm_regular_expression_node_create(parser, &opening, &content, &parser->previous);
13222
14119
  }
13223
14120
 
13224
14121
  pm_interpolated_regular_expression_node_t *node;
@@ -13228,6 +14125,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
13228
14125
  // expression at least has something in it. We'll need to check if the
13229
14126
  // following token is the end (in which case we can return a plain
13230
14127
  // regular expression) or if it's not then it has interpolation.
14128
+ pm_string_t unescaped = parser->current_string;
13231
14129
  pm_token_t content = parser->current;
13232
14130
  parser_lex(parser);
13233
14131
 
@@ -13235,7 +14133,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
13235
14133
  // without interpolation, which can be represented more succinctly and
13236
14134
  // more easily compiled.
13237
14135
  if (accept1(parser, PM_TOKEN_REGEXP_END)) {
13238
- return (pm_node_t *) pm_regular_expression_node_create_and_unescape(parser, &opening, &content, &parser->previous, PM_UNESCAPE_ALL);
14136
+ return (pm_node_t *) pm_regular_expression_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped);
13239
14137
  }
13240
14138
 
13241
14139
  // If we get here, then we have interpolation so we'll need to create
@@ -13244,7 +14142,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
13244
14142
 
13245
14143
  pm_token_t opening = not_provided(parser);
13246
14144
  pm_token_t closing = not_provided(parser);
13247
- pm_node_t *part = (pm_node_t *) pm_string_node_create_and_unescape(parser, &opening, &parser->previous, &closing, PM_UNESCAPE_ALL);
14145
+ pm_node_t *part = (pm_node_t *) pm_string_node_create_unescaped(parser, &opening, &parser->previous, &closing, &unescaped);
13248
14146
  pm_interpolated_regular_expression_node_append(node, part);
13249
14147
  } else {
13250
14148
  // If the first part of the body of the regular expression is not a
@@ -13255,9 +14153,9 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
13255
14153
 
13256
14154
  // Now that we're here and we have interpolation, we'll parse all of the
13257
14155
  // parts into the list.
14156
+ pm_node_t *part;
13258
14157
  while (!match2(parser, PM_TOKEN_REGEXP_END, PM_TOKEN_EOF)) {
13259
- pm_node_t *part = parse_string_part(parser);
13260
- if (part != NULL) {
14158
+ if ((part = parse_string_part(parser)) != NULL) {
13261
14159
  pm_interpolated_regular_expression_node_append(node, part);
13262
14160
  }
13263
14161
  }
@@ -13293,35 +14191,37 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
13293
14191
  pm_interpolated_x_string_node_t *node;
13294
14192
 
13295
14193
  if (match1(parser, PM_TOKEN_STRING_CONTENT)) {
13296
- // In this case we've hit string content so we know the string at least
13297
- // has something in it. We'll need to check if the following token is
13298
- // the end (in which case we can return a plain string) or if it's not
13299
- // then it has interpolation.
14194
+ // In this case we've hit string content so we know the string
14195
+ // at least has something in it. We'll need to check if the
14196
+ // following token is the end (in which case we can return a
14197
+ // plain string) or if it's not then it has interpolation.
14198
+ pm_string_t unescaped = parser->current_string;
13300
14199
  pm_token_t content = parser->current;
13301
14200
  parser_lex(parser);
13302
14201
 
13303
14202
  if (accept1(parser, PM_TOKEN_STRING_END)) {
13304
- return (pm_node_t *) pm_xstring_node_create_and_unescape(parser, &opening, &content, &parser->previous);
14203
+ return (pm_node_t *) pm_xstring_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped);
13305
14204
  }
13306
14205
 
13307
- // If we get here, then we have interpolation so we'll need to create
13308
- // a string node with interpolation.
14206
+ // If we get here, then we have interpolation so we'll need to
14207
+ // create a string node with interpolation.
13309
14208
  node = pm_interpolated_xstring_node_create(parser, &opening, &opening);
13310
14209
 
13311
14210
  pm_token_t opening = not_provided(parser);
13312
14211
  pm_token_t closing = not_provided(parser);
13313
- pm_node_t *part = (pm_node_t *) pm_string_node_create_and_unescape(parser, &opening, &parser->previous, &closing, PM_UNESCAPE_ALL);
14212
+ pm_node_t *part = (pm_node_t *) pm_string_node_create_unescaped(parser, &opening, &parser->previous, &closing, &unescaped);
14213
+
13314
14214
  pm_interpolated_xstring_node_append(node, part);
13315
14215
  } else {
13316
- // If the first part of the body of the string is not a string content,
13317
- // then we have interpolation and we need to create an interpolated
13318
- // string node.
14216
+ // If the first part of the body of the string is not a string
14217
+ // content, then we have interpolation and we need to create an
14218
+ // interpolated string node.
13319
14219
  node = pm_interpolated_xstring_node_create(parser, &opening, &opening);
13320
14220
  }
13321
14221
 
14222
+ pm_node_t *part;
13322
14223
  while (!match2(parser, PM_TOKEN_STRING_END, PM_TOKEN_EOF)) {
13323
- pm_node_t *part = parse_string_part(parser);
13324
- if (part != NULL) {
14224
+ if ((part = parse_string_part(parser)) != NULL) {
13325
14225
  pm_interpolated_xstring_node_append(node, part);
13326
14226
  }
13327
14227
  }
@@ -13542,7 +14442,7 @@ parse_assignment_value(pm_parser_t *parser, pm_binding_power_t previous_binding_
13542
14442
  static void
13543
14443
  parse_call_operator_write_block(pm_parser_t *parser, pm_call_node_t *call_node, const pm_token_t *operator) {
13544
14444
  if (call_node->block != NULL) {
13545
- pm_diagnostic_list_append(&parser->error_list, operator->start, operator->end, PM_ERR_OPERATOR_WRITE_BLOCK);
14445
+ pm_parser_err_token(parser, operator, PM_ERR_OPERATOR_WRITE_BLOCK);
13546
14446
  pm_node_destroy(parser, (pm_node_t *) call_node->block);
13547
14447
  call_node->block = NULL;
13548
14448
  }
@@ -13590,7 +14490,7 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t
13590
14490
  // In this case we have an = sign, but we don't know what it's for. We
13591
14491
  // need to treat it as an error. For now, we'll mark it as an error
13592
14492
  // and just skip right past it.
13593
- pm_diagnostic_list_append(&parser->error_list, token.start, token.end, PM_ERR_EXPECT_EXPRESSION_AFTER_EQUAL);
14493
+ pm_parser_err_token(parser, &token, PM_ERR_EXPECT_EXPRESSION_AFTER_EQUAL);
13594
14494
  return node;
13595
14495
  }
13596
14496
  }
@@ -13598,7 +14498,7 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t
13598
14498
  switch (PM_NODE_TYPE(node)) {
13599
14499
  case PM_BACK_REFERENCE_READ_NODE:
13600
14500
  case PM_NUMBERED_REFERENCE_READ_NODE:
13601
- pm_diagnostic_list_append(&parser->error_list, node->location.start, node->location.end, PM_ERR_WRITE_TARGET_READONLY);
14501
+ pm_parser_err_node(parser, node, PM_ERR_WRITE_TARGET_READONLY);
13602
14502
  /* fallthrough */
13603
14503
  case PM_GLOBAL_VARIABLE_READ_NODE: {
13604
14504
  parser_lex(parser);
@@ -13661,7 +14561,7 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t
13661
14561
  pm_constant_id_t constant_id = pm_parser_local_add_location(parser, message_loc.start, message_loc.end);
13662
14562
 
13663
14563
  if (token_is_numbered_parameter(message_loc.start, message_loc.end)) {
13664
- pm_diagnostic_list_append(&parser->error_list, message_loc.start, message_loc.end, PM_ERR_PARAMETER_NUMBERED_RESERVED);
14564
+ pm_parser_err_location(parser, &message_loc, PM_ERR_PARAMETER_NUMBERED_RESERVED);
13665
14565
  }
13666
14566
 
13667
14567
  parser_lex(parser);
@@ -13683,7 +14583,7 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t
13683
14583
  }
13684
14584
  case PM_MULTI_WRITE_NODE: {
13685
14585
  parser_lex(parser);
13686
- pm_diagnostic_list_append(&parser->error_list, token.start, token.end, PM_ERR_AMPAMPEQ_MULTI_ASSIGN);
14586
+ pm_parser_err_token(parser, &token, PM_ERR_AMPAMPEQ_MULTI_ASSIGN);
13687
14587
  return node;
13688
14588
  }
13689
14589
  default:
@@ -13692,7 +14592,7 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t
13692
14592
  // In this case we have an &&= sign, but we don't know what it's for.
13693
14593
  // We need to treat it as an error. For now, we'll mark it as an error
13694
14594
  // and just skip right past it.
13695
- pm_diagnostic_list_append(&parser->error_list, token.start, token.end, PM_ERR_EXPECT_EXPRESSION_AFTER_AMPAMPEQ);
14595
+ pm_parser_err_token(parser, &token, PM_ERR_EXPECT_EXPRESSION_AFTER_AMPAMPEQ);
13696
14596
  return node;
13697
14597
  }
13698
14598
  }
@@ -13700,7 +14600,7 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t
13700
14600
  switch (PM_NODE_TYPE(node)) {
13701
14601
  case PM_BACK_REFERENCE_READ_NODE:
13702
14602
  case PM_NUMBERED_REFERENCE_READ_NODE:
13703
- pm_diagnostic_list_append(&parser->error_list, node->location.start, node->location.end, PM_ERR_WRITE_TARGET_READONLY);
14603
+ pm_parser_err_node(parser, node, PM_ERR_WRITE_TARGET_READONLY);
13704
14604
  /* fallthrough */
13705
14605
  case PM_GLOBAL_VARIABLE_READ_NODE: {
13706
14606
  parser_lex(parser);
@@ -13763,7 +14663,7 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t
13763
14663
  pm_constant_id_t constant_id = pm_parser_local_add_location(parser, message_loc.start, message_loc.end);
13764
14664
 
13765
14665
  if (token_is_numbered_parameter(message_loc.start, message_loc.end)) {
13766
- pm_diagnostic_list_append(&parser->error_list, message_loc.start, message_loc.end, PM_ERR_PARAMETER_NUMBERED_RESERVED);
14666
+ pm_parser_err_location(parser, &message_loc, PM_ERR_PARAMETER_NUMBERED_RESERVED);
13767
14667
  }
13768
14668
 
13769
14669
  parser_lex(parser);
@@ -13785,7 +14685,7 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t
13785
14685
  }
13786
14686
  case PM_MULTI_WRITE_NODE: {
13787
14687
  parser_lex(parser);
13788
- pm_diagnostic_list_append(&parser->error_list, token.start, token.end, PM_ERR_PIPEPIPEEQ_MULTI_ASSIGN);
14688
+ pm_parser_err_token(parser, &token, PM_ERR_PIPEPIPEEQ_MULTI_ASSIGN);
13789
14689
  return node;
13790
14690
  }
13791
14691
  default:
@@ -13794,7 +14694,7 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t
13794
14694
  // In this case we have an ||= sign, but we don't know what it's for.
13795
14695
  // We need to treat it as an error. For now, we'll mark it as an error
13796
14696
  // and just skip right past it.
13797
- pm_diagnostic_list_append(&parser->error_list, token.start, token.end, PM_ERR_EXPECT_EXPRESSION_AFTER_PIPEPIPEEQ);
14697
+ pm_parser_err_token(parser, &token, PM_ERR_EXPECT_EXPRESSION_AFTER_PIPEPIPEEQ);
13798
14698
  return node;
13799
14699
  }
13800
14700
  }
@@ -13812,7 +14712,7 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t
13812
14712
  switch (PM_NODE_TYPE(node)) {
13813
14713
  case PM_BACK_REFERENCE_READ_NODE:
13814
14714
  case PM_NUMBERED_REFERENCE_READ_NODE:
13815
- pm_diagnostic_list_append(&parser->error_list, node->location.start, node->location.end, PM_ERR_WRITE_TARGET_READONLY);
14715
+ pm_parser_err_node(parser, node, PM_ERR_WRITE_TARGET_READONLY);
13816
14716
  /* fallthrough */
13817
14717
  case PM_GLOBAL_VARIABLE_READ_NODE: {
13818
14718
  parser_lex(parser);
@@ -13875,7 +14775,7 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t
13875
14775
  pm_constant_id_t constant_id = pm_parser_local_add_location(parser, message_loc.start, message_loc.end);
13876
14776
 
13877
14777
  if (token_is_numbered_parameter(message_loc.start, message_loc.end)) {
13878
- pm_diagnostic_list_append(&parser->error_list, message_loc.start, message_loc.end, PM_ERR_PARAMETER_NUMBERED_RESERVED);
14778
+ pm_parser_err_location(parser, &message_loc, PM_ERR_PARAMETER_NUMBERED_RESERVED);
13879
14779
  }
13880
14780
 
13881
14781
  parser_lex(parser);
@@ -13897,7 +14797,7 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t
13897
14797
  }
13898
14798
  case PM_MULTI_WRITE_NODE: {
13899
14799
  parser_lex(parser);
13900
- pm_diagnostic_list_append(&parser->error_list, token.start, token.end, PM_ERR_OPERATOR_MULTI_ASSIGN);
14800
+ pm_parser_err_token(parser, &token, PM_ERR_OPERATOR_MULTI_ASSIGN);
13901
14801
  return node;
13902
14802
  }
13903
14803
  default:
@@ -13906,7 +14806,7 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t
13906
14806
  // In this case we have an operator but we don't know what it's for.
13907
14807
  // We need to treat it as an error. For now, we'll mark it as an error
13908
14808
  // and just skip right past it.
13909
- pm_diagnostic_list_append(&parser->error_list, parser->previous.start, parser->previous.end, PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR);
14809
+ pm_parser_err_previous(parser, PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR);
13910
14810
  return node;
13911
14811
  }
13912
14812
  }
@@ -14021,7 +14921,7 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t
14021
14921
  break;
14022
14922
  }
14023
14923
  default: {
14024
- pm_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, PM_ERR_DEF_NAME);
14924
+ pm_parser_err_current(parser, PM_ERR_DEF_NAME);
14025
14925
  message = (pm_token_t) { .type = PM_TOKEN_MISSING, .start = parser->previous.end, .end = parser->previous.end };
14026
14926
  }
14027
14927
  }
@@ -14172,7 +15072,7 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t
14172
15072
  return (pm_node_t *) pm_call_node_shorthand_create(parser, node, &delimiter, &arguments);
14173
15073
  }
14174
15074
  default: {
14175
- pm_diagnostic_list_append(&parser->error_list, delimiter.start, delimiter.end, PM_ERR_CONSTANT_PATH_COLON_COLON_CONSTANT);
15075
+ pm_parser_err_token(parser, &delimiter, PM_ERR_CONSTANT_PATH_COLON_COLON_CONSTANT);
14176
15076
  pm_node_t *child = (pm_node_t *) pm_missing_node_create(parser, delimiter.start, delimiter.end);
14177
15077
  return (pm_node_t *)pm_constant_path_node_create(parser, node, &delimiter, child);
14178
15078
  }
@@ -14220,7 +15120,7 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t
14220
15120
 
14221
15121
  if (block != NULL) {
14222
15122
  if (arguments.block != NULL) {
14223
- pm_diagnostic_list_append(&parser->error_list, block->base.location.start, block->base.location.end, PM_ERR_ARGUMENT_AFTER_BLOCK);
15123
+ pm_parser_err_node(parser, (pm_node_t *) block, PM_ERR_ARGUMENT_AFTER_BLOCK);
14224
15124
  if (arguments.arguments == NULL) {
14225
15125
  arguments.arguments = pm_arguments_node_create(parser);
14226
15126
  }
@@ -14283,7 +15183,7 @@ parse_expression(pm_parser_t *parser, pm_binding_power_t binding_power, pm_diagn
14283
15183
  // parse_expression_prefix is going to be a missing node. In that case we need
14284
15184
  // to add the error message to the parser's error list.
14285
15185
  if (PM_NODE_TYPE_P(node, PM_MISSING_NODE)) {
14286
- pm_diagnostic_list_append(&parser->error_list, recovery.end, recovery.end, diag_id);
15186
+ pm_parser_err(parser, recovery.end, recovery.end, diag_id);
14287
15187
  return node;
14288
15188
  }
14289
15189
 
@@ -14428,6 +15328,7 @@ pm_parser_init(pm_parser_t *parser, const uint8_t *source, size_t size, const ch
14428
15328
  .next_start = NULL,
14429
15329
  .heredoc_end = NULL,
14430
15330
  .comment_list = PM_LIST_EMPTY,
15331
+ .magic_comment_list = PM_LIST_EMPTY,
14431
15332
  .warning_list = PM_LIST_EMPTY,
14432
15333
  .error_list = PM_LIST_EMPTY,
14433
15334
  .current_scope = NULL,
@@ -14441,6 +15342,7 @@ pm_parser_init(pm_parser_t *parser, const uint8_t *source, size_t size, const ch
14441
15342
  .constant_pool = PM_CONSTANT_POOL_EMPTY,
14442
15343
  .newline_list = PM_NEWLINE_LIST_EMPTY,
14443
15344
  .integer_base = 0,
15345
+ .current_string = PM_EMPTY_STRING,
14444
15346
  .command_start = true,
14445
15347
  .recovering = false,
14446
15348
  .encoding_changed = false,
@@ -14521,6 +15423,19 @@ pm_comment_list_free(pm_list_t *list) {
14521
15423
  }
14522
15424
  }
14523
15425
 
15426
+ // Free all of the memory associated with the magic comment list.
15427
+ static inline void
15428
+ pm_magic_comment_list_free(pm_list_t *list) {
15429
+ pm_list_node_t *node, *next;
15430
+
15431
+ for (node = list->head; node != NULL; node = next) {
15432
+ next = node->next;
15433
+
15434
+ pm_magic_comment_t *magic_comment = (pm_magic_comment_t *) node;
15435
+ free(magic_comment);
15436
+ }
15437
+ }
15438
+
14524
15439
  // Free any memory associated with the given parser.
14525
15440
  PRISM_EXPORTED_FUNCTION void
14526
15441
  pm_parser_free(pm_parser_t *parser) {
@@ -14528,6 +15443,7 @@ pm_parser_free(pm_parser_t *parser) {
14528
15443
  pm_diagnostic_list_free(&parser->error_list);
14529
15444
  pm_diagnostic_list_free(&parser->warning_list);
14530
15445
  pm_comment_list_free(&parser->comment_list);
15446
+ pm_magic_comment_list_free(&parser->magic_comment_list);
14531
15447
  pm_constant_pool_free(&parser->constant_pool);
14532
15448
  pm_newline_list_free(&parser->newline_list);
14533
15449
 
@@ -14578,10 +15494,11 @@ pm_parse_serialize(const uint8_t *source, size_t size, pm_buffer_t *buffer, cons
14578
15494
  pm_parser_free(&parser);
14579
15495
  }
14580
15496
 
14581
- #undef PM_LOCATION_NULL_VALUE
14582
- #undef PM_LOCATION_TOKEN_VALUE
14583
- #undef PM_LOCATION_NODE_VALUE
14584
- #undef PM_LOCATION_NODE_BASE_VALUE
14585
15497
  #undef PM_CASE_KEYWORD
14586
15498
  #undef PM_CASE_OPERATOR
14587
15499
  #undef PM_CASE_WRITABLE
15500
+ #undef PM_EMPTY_STRING
15501
+ #undef PM_LOCATION_NODE_BASE_VALUE
15502
+ #undef PM_LOCATION_NODE_VALUE
15503
+ #undef PM_LOCATION_NULL_VALUE
15504
+ #undef PM_LOCATION_TOKEN_VALUE