prism 0.13.0 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/src/prism.c CHANGED
@@ -421,6 +421,63 @@ debug_lex_state_set(pm_parser_t *parser, pm_lex_state_t state, char const * call
421
421
  #define lex_state_set(parser, state) debug_lex_state_set(parser, state, __func__, __LINE__)
422
422
  #endif
423
423
 
424
+ /******************************************************************************/
425
+ /* Diagnostic-related functions */
426
+ /******************************************************************************/
427
+
428
+ // Append an error to the list of errors on the parser.
429
+ static inline void
430
+ pm_parser_err(pm_parser_t *parser, const uint8_t *start, const uint8_t *end, pm_diagnostic_id_t diag_id) {
431
+ pm_diagnostic_list_append(&parser->error_list, start, end, diag_id);
432
+ }
433
+
434
+ // Append an error to the list of errors on the parser using the location of the
435
+ // current token.
436
+ static inline void
437
+ pm_parser_err_current(pm_parser_t *parser, pm_diagnostic_id_t diag_id) {
438
+ pm_parser_err(parser, parser->current.start, parser->current.end, diag_id);
439
+ }
440
+
441
+ // Append an error to the list of errors on the parser using the given location.
442
+ static inline void
443
+ pm_parser_err_location(pm_parser_t *parser, const pm_location_t *location, pm_diagnostic_id_t diag_id) {
444
+ pm_parser_err(parser, location->start, location->end, diag_id);
445
+ }
446
+
447
+ // Append an error to the list of errors on the parser using the location of the
448
+ // given node.
449
+ static inline void
450
+ pm_parser_err_node(pm_parser_t *parser, const pm_node_t *node, pm_diagnostic_id_t diag_id) {
451
+ pm_parser_err(parser, node->location.start, node->location.end, diag_id);
452
+ }
453
+
454
+ // Append an error to the list of errors on the parser using the location of the
455
+ // previous token.
456
+ static inline void
457
+ pm_parser_err_previous(pm_parser_t *parser, pm_diagnostic_id_t diag_id) {
458
+ pm_parser_err(parser, parser->previous.start, parser->previous.end, diag_id);
459
+ }
460
+
461
+ // Append an error to the list of errors on the parser using the location of the
462
+ // given token.
463
+ static inline void
464
+ pm_parser_err_token(pm_parser_t *parser, const pm_token_t *token, pm_diagnostic_id_t diag_id) {
465
+ pm_parser_err(parser, token->start, token->end, diag_id);
466
+ }
467
+
468
+ // Append a warning to the list of warnings on the parser.
469
+ static inline void
470
+ pm_parser_warn(pm_parser_t *parser, const uint8_t *start, const uint8_t *end, pm_diagnostic_id_t diag_id) {
471
+ pm_diagnostic_list_append(&parser->warning_list, start, end, diag_id);
472
+ }
473
+
474
+ // Append a warning to the list of warnings on the parser using the location of
475
+ // the given token.
476
+ static inline void
477
+ pm_parser_warn_token(pm_parser_t *parser, const pm_token_t *token, pm_diagnostic_id_t diag_id) {
478
+ pm_parser_warn(parser, token->start, token->end, diag_id);
479
+ }
480
+
424
481
  /******************************************************************************/
425
482
  /* Node-related functions */
426
483
  /******************************************************************************/
@@ -437,6 +494,22 @@ pm_parser_constant_id_owned(pm_parser_t *parser, const uint8_t *start, size_t le
437
494
  return pm_constant_pool_insert_owned(&parser->constant_pool, start, length);
438
495
  }
439
496
 
497
+ // Retrieve the constant pool id for the given static literal C string.
498
+ static inline pm_constant_id_t
499
+ pm_parser_constant_id_static(pm_parser_t *parser, const char *start, size_t length) {
500
+ uint8_t *owned_copy;
501
+ if (length > 0) {
502
+ owned_copy = malloc(length);
503
+ memcpy(owned_copy, start, length);
504
+ } else {
505
+ owned_copy = malloc(1);
506
+ owned_copy[0] = '\0';
507
+ }
508
+ return pm_constant_pool_insert_owned(&parser->constant_pool, owned_copy, length);
509
+ // Does not work because the static literal cannot be serialized as an offset of source
510
+ // return pm_constant_pool_insert_shared(&parser->constant_pool, start, length);
511
+ }
512
+
440
513
  // Retrieve the constant pool id for the given token.
441
514
  static inline pm_constant_id_t
442
515
  pm_parser_constant_id_token(pm_parser_t *parser, const pm_token_t *token) {
@@ -582,12 +655,7 @@ pm_arguments_validate_block(pm_parser_t *parser, pm_arguments_t *arguments, pm_b
582
655
 
583
656
  // If we didn't hit a case before this check, then at this point we need to
584
657
  // add a syntax error.
585
- pm_diagnostic_list_append(
586
- &parser->error_list,
587
- block->base.location.start,
588
- block->base.location.end,
589
- PM_ERR_ARGUMENT_UNEXPECTED_BLOCK
590
- );
658
+ pm_parser_err_node(parser, (pm_node_t *) block, PM_ERR_ARGUMENT_UNEXPECTED_BLOCK);
591
659
  }
592
660
 
593
661
  /******************************************************************************/
@@ -601,6 +669,7 @@ pm_scope_node_init(pm_node_t *node, pm_scope_node_t *scope) {
601
669
  scope->base.location.start = node->location.start;
602
670
  scope->base.location.end = node->location.end;
603
671
 
672
+ scope->ast_node = node;
604
673
  scope->parameters = NULL;
605
674
  scope->body = NULL;
606
675
  pm_constant_id_list_init(&scope->locals);
@@ -626,6 +695,11 @@ pm_scope_node_init(pm_node_t *node, pm_scope_node_t *scope) {
626
695
  scope->locals = cast->locals;
627
696
  break;
628
697
  }
698
+ case PM_FOR_NODE: {
699
+ pm_for_node_t *cast = (pm_for_node_t *)node;
700
+ scope->body = (pm_node_t *)cast->statements;
701
+ break;
702
+ }
629
703
  case PM_LAMBDA_NODE: {
630
704
  pm_lambda_node_t *cast = (pm_lambda_node_t *) node;
631
705
  if (cast->parameters) scope->parameters = cast->parameters->parameters;
@@ -679,14 +753,14 @@ parse_decimal_number(pm_parser_t *parser, const uint8_t *start, const uint8_t *e
679
753
  unsigned long value = strtoul(digits, &endptr, 10);
680
754
 
681
755
  if ((digits == endptr) || (*endptr != '\0') || (errno == ERANGE)) {
682
- pm_diagnostic_list_append(&parser->error_list, start, end, PM_ERR_INVALID_NUMBER_DECIMAL);
756
+ pm_parser_err(parser, start, end, PM_ERR_INVALID_NUMBER_DECIMAL);
683
757
  value = UINT32_MAX;
684
758
  }
685
759
 
686
760
  free(digits);
687
761
 
688
762
  if (value > UINT32_MAX) {
689
- pm_diagnostic_list_append(&parser->error_list, start, end, PM_ERR_INVALID_NUMBER_DECIMAL);
763
+ pm_parser_err(parser, start, end, PM_ERR_INVALID_NUMBER_DECIMAL);
690
764
  value = UINT32_MAX;
691
765
  }
692
766
 
@@ -907,7 +981,7 @@ pm_array_node_elements_append(pm_array_node_t *node, pm_node_t *element) {
907
981
 
908
982
  // If the element is not a static literal, then the array is not a static
909
983
  // literal. Turn that flag off.
910
- if (PM_NODE_TYPE_P(element, PM_ARRAY_NODE) || PM_NODE_TYPE_P(element, PM_HASH_NODE) || (element->flags & PM_NODE_FLAG_STATIC_LITERAL) == 0) {
984
+ if (PM_NODE_TYPE_P(element, PM_ARRAY_NODE) || PM_NODE_TYPE_P(element, PM_HASH_NODE) || PM_NODE_TYPE_P(element, PM_RANGE_NODE) || (element->flags & PM_NODE_FLAG_STATIC_LITERAL) == 0) {
911
985
  node->base.flags &= (pm_node_flags_t) ~PM_NODE_FLAG_STATIC_LITERAL;
912
986
  }
913
987
  }
@@ -1051,8 +1125,10 @@ pm_assoc_node_create(pm_parser_t *parser, pm_node_t *key, const pm_token_t *oper
1051
1125
  end = key->location.end;
1052
1126
  }
1053
1127
 
1128
+ // If the key and value of this assoc node are both static literals, then
1129
+ // we can mark this node as a static literal.
1054
1130
  pm_node_flags_t flags = 0;
1055
- if (value && !PM_NODE_TYPE_P(value, PM_ARRAY_NODE) && !PM_NODE_TYPE_P(value, PM_HASH_NODE)) {
1131
+ if (value && !PM_NODE_TYPE_P(value, PM_ARRAY_NODE) && !PM_NODE_TYPE_P(value, PM_HASH_NODE) && !PM_NODE_TYPE_P(value, PM_RANGE_NODE)) {
1056
1132
  flags = key->flags & value->flags & PM_NODE_FLAG_STATIC_LITERAL;
1057
1133
  }
1058
1134
 
@@ -1341,7 +1417,8 @@ pm_call_node_create(pm_parser_t *parser) {
1341
1417
  .opening_loc = PM_OPTIONAL_LOCATION_NOT_PROVIDED_VALUE,
1342
1418
  .arguments = NULL,
1343
1419
  .closing_loc = PM_OPTIONAL_LOCATION_NOT_PROVIDED_VALUE,
1344
- .block = NULL
1420
+ .block = NULL,
1421
+ .name = 0
1345
1422
  };
1346
1423
 
1347
1424
  return node;
@@ -1369,7 +1446,7 @@ pm_call_node_aref_create(pm_parser_t *parser, pm_node_t *receiver, pm_arguments_
1369
1446
  node->closing_loc = arguments->closing_loc;
1370
1447
  node->block = arguments->block;
1371
1448
 
1372
- pm_string_constant_init(&node->name, "[]", 2);
1449
+ node->name = pm_parser_constant_id_static(parser, "[]", 2);
1373
1450
  return node;
1374
1451
  }
1375
1452
 
@@ -1388,7 +1465,7 @@ pm_call_node_binary_create(pm_parser_t *parser, pm_node_t *receiver, pm_token_t
1388
1465
  pm_arguments_node_arguments_append(arguments, argument);
1389
1466
  node->arguments = arguments;
1390
1467
 
1391
- pm_string_shared_init(&node->name, operator->start, operator->end);
1468
+ node->name = pm_parser_constant_id_token(parser, operator);
1392
1469
  return node;
1393
1470
  }
1394
1471
 
@@ -1420,7 +1497,7 @@ pm_call_node_call_create(pm_parser_t *parser, pm_node_t *receiver, pm_token_t *o
1420
1497
  node->base.flags |= PM_CALL_NODE_FLAGS_SAFE_NAVIGATION;
1421
1498
  }
1422
1499
 
1423
- pm_string_shared_init(&node->name, message->start, message->end);
1500
+ node->name = pm_parser_constant_id_token(parser, message);
1424
1501
  return node;
1425
1502
  }
1426
1503
 
@@ -1447,7 +1524,7 @@ pm_call_node_fcall_create(pm_parser_t *parser, pm_token_t *message, pm_arguments
1447
1524
  node->closing_loc = arguments->closing_loc;
1448
1525
  node->block = arguments->block;
1449
1526
 
1450
- pm_string_shared_init(&node->name, message->start, message->end);
1527
+ node->name = pm_parser_constant_id_token(parser, message);
1451
1528
  return node;
1452
1529
  }
1453
1530
 
@@ -1469,7 +1546,7 @@ pm_call_node_not_create(pm_parser_t *parser, pm_node_t *receiver, pm_token_t *me
1469
1546
  node->arguments = arguments->arguments;
1470
1547
  node->closing_loc = arguments->closing_loc;
1471
1548
 
1472
- pm_string_constant_init(&node->name, "!", 1);
1549
+ node->name = pm_parser_constant_id_static(parser, "!", 1);
1473
1550
  return node;
1474
1551
  }
1475
1552
 
@@ -1496,7 +1573,7 @@ pm_call_node_shorthand_create(pm_parser_t *parser, pm_node_t *receiver, pm_token
1496
1573
  node->base.flags |= PM_CALL_NODE_FLAGS_SAFE_NAVIGATION;
1497
1574
  }
1498
1575
 
1499
- pm_string_constant_init(&node->name, "call", 4);
1576
+ node->name = pm_parser_constant_id_static(parser, "call", 4);
1500
1577
  return node;
1501
1578
  }
1502
1579
 
@@ -1511,7 +1588,7 @@ pm_call_node_unary_create(pm_parser_t *parser, pm_token_t *operator, pm_node_t *
1511
1588
  node->receiver = receiver;
1512
1589
  node->message_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(operator);
1513
1590
 
1514
- pm_string_constant_init(&node->name, name, strlen(name));
1591
+ node->name = pm_parser_constant_id_static(parser, name, strlen(name));
1515
1592
  return node;
1516
1593
  }
1517
1594
 
@@ -1524,7 +1601,7 @@ pm_call_node_variable_call_create(pm_parser_t *parser, pm_token_t *message) {
1524
1601
  node->base.location = PM_LOCATION_TOKEN_VALUE(message);
1525
1602
  node->message_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(message);
1526
1603
 
1527
- pm_string_shared_init(&node->name, message->start, message->end);
1604
+ node->name = pm_parser_constant_id_token(parser, message);
1528
1605
  return node;
1529
1606
  }
1530
1607
 
@@ -1537,17 +1614,18 @@ pm_call_node_variable_call_p(pm_call_node_t *node) {
1537
1614
 
1538
1615
  // Initialize the read name by reading the write name and chopping off the '='.
1539
1616
  static void
1540
- pm_call_write_read_name_init(pm_string_t *read_name, pm_string_t *write_name) {
1541
- if (write_name->length >= 1) {
1542
- size_t length = write_name->length - 1;
1617
+ pm_call_write_read_name_init(pm_parser_t *parser, pm_constant_id_t *read_name, pm_constant_id_t *write_name) {
1618
+ pm_constant_t *write_constant = pm_constant_pool_id_to_constant(&parser->constant_pool, *write_name);
1619
+ if (write_constant->length >= 1) {
1620
+ size_t length = write_constant->length - 1;
1543
1621
 
1544
1622
  void *memory = malloc(length);
1545
- memcpy(memory, write_name->source, length);
1623
+ memcpy(memory, write_constant->start, length);
1546
1624
 
1547
- pm_string_owned_init(read_name, (uint8_t *) memory, length);
1625
+ *read_name = pm_constant_pool_insert_owned(&parser->constant_pool, (uint8_t *) memory, length);
1548
1626
  } else {
1549
1627
  // We can get here if the message was missing because of a syntax error.
1550
- pm_string_constant_init(read_name, "", 0);
1628
+ *read_name = pm_parser_constant_id_static(parser, "", 0);
1551
1629
  }
1552
1630
  }
1553
1631
 
@@ -1573,13 +1651,13 @@ pm_call_and_write_node_create(pm_parser_t *parser, pm_call_node_t *target, const
1573
1651
  .opening_loc = target->opening_loc,
1574
1652
  .arguments = target->arguments,
1575
1653
  .closing_loc = target->closing_loc,
1576
- .read_name = PM_EMPTY_STRING,
1654
+ .read_name = 0,
1577
1655
  .write_name = target->name,
1578
1656
  .operator_loc = PM_LOCATION_TOKEN_VALUE(operator),
1579
1657
  .value = value
1580
1658
  };
1581
1659
 
1582
- pm_call_write_read_name_init(&node->read_name, &node->write_name);
1660
+ pm_call_write_read_name_init(parser, &node->read_name, &node->write_name);
1583
1661
 
1584
1662
  // Here we're going to free the target, since it is no longer necessary.
1585
1663
  // However, we don't want to call `pm_node_destroy` because we want to keep
@@ -1610,14 +1688,14 @@ pm_call_operator_write_node_create(pm_parser_t *parser, pm_call_node_t *target,
1610
1688
  .opening_loc = target->opening_loc,
1611
1689
  .arguments = target->arguments,
1612
1690
  .closing_loc = target->closing_loc,
1613
- .read_name = PM_EMPTY_STRING,
1691
+ .read_name = 0,
1614
1692
  .write_name = target->name,
1615
1693
  .operator = pm_parser_constant_id_location(parser, operator->start, operator->end - 1),
1616
1694
  .operator_loc = PM_LOCATION_TOKEN_VALUE(operator),
1617
1695
  .value = value
1618
1696
  };
1619
1697
 
1620
- pm_call_write_read_name_init(&node->read_name, &node->write_name);
1698
+ pm_call_write_read_name_init(parser, &node->read_name, &node->write_name);
1621
1699
 
1622
1700
  // Here we're going to free the target, since it is no longer necessary.
1623
1701
  // However, we don't want to call `pm_node_destroy` because we want to keep
@@ -1649,13 +1727,13 @@ pm_call_or_write_node_create(pm_parser_t *parser, pm_call_node_t *target, const
1649
1727
  .opening_loc = target->opening_loc,
1650
1728
  .arguments = target->arguments,
1651
1729
  .closing_loc = target->closing_loc,
1652
- .read_name = PM_EMPTY_STRING,
1730
+ .read_name = 0,
1653
1731
  .write_name = target->name,
1654
1732
  .operator_loc = PM_LOCATION_TOKEN_VALUE(operator),
1655
1733
  .value = value
1656
1734
  };
1657
1735
 
1658
- pm_call_write_read_name_init(&node->read_name, &node->write_name);
1736
+ pm_call_write_read_name_init(parser, &node->read_name, &node->write_name);
1659
1737
 
1660
1738
  // Here we're going to free the target, since it is no longer necessary.
1661
1739
  // However, we don't want to call `pm_node_destroy` because we want to keep
@@ -3372,11 +3450,20 @@ pm_local_variable_write_node_create(pm_parser_t *parser, pm_constant_id_t name,
3372
3450
  return node;
3373
3451
  }
3374
3452
 
3453
+ static inline bool
3454
+ token_is_numbered_parameter(const uint8_t *start, const uint8_t *end) {
3455
+ return (end - start == 2) && (start[0] == '_') && (start[1] != '0') && (pm_char_is_decimal_digit(start[1]));
3456
+ }
3457
+
3375
3458
  // Allocate and initialize a new LocalVariableTargetNode node.
3376
3459
  static pm_local_variable_target_node_t *
3377
3460
  pm_local_variable_target_node_create(pm_parser_t *parser, const pm_token_t *name) {
3378
3461
  pm_local_variable_target_node_t *node = PM_ALLOC_NODE(parser, pm_local_variable_target_node_t);
3379
3462
 
3463
+ if (token_is_numbered_parameter(name->start, name->end)) {
3464
+ pm_parser_err_token(parser, name, PM_ERR_PARAMETER_NUMBERED_RESERVED);
3465
+ }
3466
+
3380
3467
  *node = (pm_local_variable_target_node_t) {
3381
3468
  {
3382
3469
  .type = PM_LOCAL_VARIABLE_TARGET_NODE,
@@ -3870,10 +3957,27 @@ pm_pre_execution_node_create(pm_parser_t *parser, const pm_token_t *keyword, con
3870
3957
  static pm_range_node_t *
3871
3958
  pm_range_node_create(pm_parser_t *parser, pm_node_t *left, const pm_token_t *operator, pm_node_t *right) {
3872
3959
  pm_range_node_t *node = PM_ALLOC_NODE(parser, pm_range_node_t);
3960
+ pm_node_flags_t flags = 0;
3961
+
3962
+ // Indicate that this node is an exclusive range if the operator is `...`.
3963
+ if (operator->type == PM_TOKEN_DOT_DOT_DOT || operator->type == PM_TOKEN_UDOT_DOT_DOT) {
3964
+ flags |= PM_RANGE_FLAGS_EXCLUDE_END;
3965
+ }
3966
+
3967
+ // Indicate that this node is a static literal (i.e., can be compiled with
3968
+ // a putobject in CRuby) if the left and right are implicit nil, explicit
3969
+ // nil, or integers.
3970
+ if (
3971
+ (left == NULL || PM_NODE_TYPE_P(left, PM_NIL_NODE) || PM_NODE_TYPE_P(left, PM_INTEGER_NODE)) &&
3972
+ (right == NULL || PM_NODE_TYPE_P(right, PM_NIL_NODE) || PM_NODE_TYPE_P(right, PM_INTEGER_NODE))
3973
+ ) {
3974
+ flags |= PM_NODE_FLAG_STATIC_LITERAL;
3975
+ }
3873
3976
 
3874
3977
  *node = (pm_range_node_t) {
3875
3978
  {
3876
3979
  .type = PM_RANGE_NODE,
3980
+ .flags = flags,
3877
3981
  .location = {
3878
3982
  .start = (left == NULL ? operator->start : left->location.start),
3879
3983
  .end = (right == NULL ? operator->end : right->location.end)
@@ -3884,15 +3988,6 @@ pm_range_node_create(pm_parser_t *parser, pm_node_t *left, const pm_token_t *ope
3884
3988
  .operator_loc = PM_LOCATION_TOKEN_VALUE(operator)
3885
3989
  };
3886
3990
 
3887
- switch (operator->type) {
3888
- case PM_TOKEN_DOT_DOT_DOT:
3889
- case PM_TOKEN_UDOT_DOT_DOT:
3890
- node->base.flags |= PM_RANGE_FLAGS_EXCLUDE_END;
3891
- break;
3892
- default:
3893
- break;
3894
- }
3895
-
3896
3991
  return node;
3897
3992
  }
3898
3993
 
@@ -3906,9 +4001,10 @@ pm_redo_node_create(pm_parser_t *parser, const pm_token_t *token) {
3906
4001
  return node;
3907
4002
  }
3908
4003
 
3909
- // Allocate a new RegularExpressionNode node.
4004
+ // Allocate and initialize a new RegularExpressionNode node with the given
4005
+ // unescaped string.
3910
4006
  static pm_regular_expression_node_t *
3911
- pm_regular_expression_node_create(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *content, const pm_token_t *closing) {
4007
+ pm_regular_expression_node_create_unescaped(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *content, const pm_token_t *closing, const pm_string_t *unescaped) {
3912
4008
  pm_regular_expression_node_t *node = PM_ALLOC_NODE(parser, pm_regular_expression_node_t);
3913
4009
 
3914
4010
  *node = (pm_regular_expression_node_t) {
@@ -3923,12 +4019,18 @@ pm_regular_expression_node_create(pm_parser_t *parser, const pm_token_t *opening
3923
4019
  .opening_loc = PM_LOCATION_TOKEN_VALUE(opening),
3924
4020
  .content_loc = PM_LOCATION_TOKEN_VALUE(content),
3925
4021
  .closing_loc = PM_LOCATION_TOKEN_VALUE(closing),
3926
- .unescaped = PM_EMPTY_STRING
4022
+ .unescaped = *unescaped
3927
4023
  };
3928
4024
 
3929
4025
  return node;
3930
4026
  }
3931
4027
 
4028
+ // Allocate and initialize a new RegularExpressionNode node.
4029
+ static inline pm_regular_expression_node_t *
4030
+ pm_regular_expression_node_create(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *content, const pm_token_t *closing) {
4031
+ return pm_regular_expression_node_create_unescaped(parser, opening, content, closing, &PM_EMPTY_STRING);
4032
+ }
4033
+
3932
4034
  // Allocate a new RequiredDestructuredParameterNode node.
3933
4035
  static pm_required_destructured_parameter_node_t *
3934
4036
  pm_required_destructured_parameter_node_create(pm_parser_t *parser, const pm_token_t *opening) {
@@ -4274,9 +4376,9 @@ pm_string_concat_node_create(pm_parser_t *parser, pm_node_t *left, pm_node_t *ri
4274
4376
  return node;
4275
4377
  }
4276
4378
 
4277
- // Allocate a new StringNode node.
4278
- static pm_string_node_t *
4279
- pm_string_node_create(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *content, const pm_token_t *closing) {
4379
+ // Allocate a new StringNode node with the given unescaped string.
4380
+ static inline pm_string_node_t *
4381
+ pm_string_node_create_unescaped(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *content, const pm_token_t *closing, const pm_string_t *string) {
4280
4382
  pm_string_node_t *node = PM_ALLOC_NODE(parser, pm_string_node_t);
4281
4383
  pm_node_flags_t flags = 0;
4282
4384
 
@@ -4296,12 +4398,27 @@ pm_string_node_create(pm_parser_t *parser, const pm_token_t *opening, const pm_t
4296
4398
  .opening_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(opening),
4297
4399
  .content_loc = PM_LOCATION_TOKEN_VALUE(content),
4298
4400
  .closing_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(closing),
4299
- .unescaped = PM_EMPTY_STRING
4401
+ .unescaped = *string
4300
4402
  };
4301
4403
 
4302
4404
  return node;
4303
4405
  }
4304
4406
 
4407
+ // Allocate a new StringNode node.
4408
+ static pm_string_node_t *
4409
+ pm_string_node_create(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *content, const pm_token_t *closing) {
4410
+ return pm_string_node_create_unescaped(parser, opening, content, closing, &PM_EMPTY_STRING);
4411
+ }
4412
+
4413
+ // Allocate a new StringNode node and create it using the current string on the
4414
+ // parser.
4415
+ static pm_string_node_t *
4416
+ pm_string_node_create_current_string(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *content, const pm_token_t *closing) {
4417
+ pm_string_node_t *node = pm_string_node_create_unescaped(parser, opening, content, closing, &parser->current_string);
4418
+ parser->current_string = PM_EMPTY_STRING;
4419
+ return node;
4420
+ }
4421
+
4305
4422
  // Allocate and initialize a new SuperNode node.
4306
4423
  static pm_super_node_t *
4307
4424
  pm_super_node_create(pm_parser_t *parser, const pm_token_t *keyword, pm_arguments_t *arguments) {
@@ -4338,9 +4455,10 @@ pm_super_node_create(pm_parser_t *parser, const pm_token_t *keyword, pm_argument
4338
4455
  return node;
4339
4456
  }
4340
4457
 
4341
- // Allocate a new SymbolNode node.
4458
+ // Allocate and initialize a new SymbolNode node with the given unescaped
4459
+ // string.
4342
4460
  static pm_symbol_node_t *
4343
- pm_symbol_node_create(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *value, const pm_token_t *closing) {
4461
+ pm_symbol_node_create_unescaped(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *value, const pm_token_t *closing, const pm_string_t *unescaped) {
4344
4462
  pm_symbol_node_t *node = PM_ALLOC_NODE(parser, pm_symbol_node_t);
4345
4463
 
4346
4464
  *node = (pm_symbol_node_t) {
@@ -4355,12 +4473,26 @@ pm_symbol_node_create(pm_parser_t *parser, const pm_token_t *opening, const pm_t
4355
4473
  .opening_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(opening),
4356
4474
  .value_loc = PM_LOCATION_TOKEN_VALUE(value),
4357
4475
  .closing_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(closing),
4358
- .unescaped = PM_EMPTY_STRING
4476
+ .unescaped = *unescaped
4359
4477
  };
4360
4478
 
4361
4479
  return node;
4362
4480
  }
4363
4481
 
4482
+ // Allocate and initialize a new SymbolNode node.
4483
+ static inline pm_symbol_node_t *
4484
+ pm_symbol_node_create(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *value, const pm_token_t *closing) {
4485
+ return pm_symbol_node_create_unescaped(parser, opening, value, closing, &PM_EMPTY_STRING);
4486
+ }
4487
+
4488
+ // Allocate and initialize a new SymbolNode node with the current string.
4489
+ static pm_symbol_node_t *
4490
+ pm_symbol_node_create_current_string(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *value, const pm_token_t *closing) {
4491
+ pm_symbol_node_t *node = pm_symbol_node_create_unescaped(parser, opening, value, closing, &parser->current_string);
4492
+ parser->current_string = PM_EMPTY_STRING;
4493
+ return node;
4494
+ }
4495
+
4364
4496
  // Allocate and initialize a new SymbolNode node from a label.
4365
4497
  static pm_symbol_node_t *
4366
4498
  pm_symbol_node_label_create(pm_parser_t *parser, const pm_token_t *token) {
@@ -4376,8 +4508,6 @@ pm_symbol_node_label_create(pm_parser_t *parser, const pm_token_t *token) {
4376
4508
 
4377
4509
  assert((label.end - label.start) >= 0);
4378
4510
  pm_string_shared_init(&node->unescaped, label.start, label.end);
4379
-
4380
- pm_unescape_manipulate_string(parser, &node->unescaped, PM_UNESCAPE_ALL);
4381
4511
  break;
4382
4512
  }
4383
4513
  case PM_TOKEN_MISSING: {
@@ -4710,9 +4840,10 @@ pm_while_node_modifier_create(pm_parser_t *parser, const pm_token_t *keyword, pm
4710
4840
  return node;
4711
4841
  }
4712
4842
 
4713
- // Allocate and initialize a new XStringNode node.
4843
+ // Allocate and initialize a new XStringNode node with the given unescaped
4844
+ // string.
4714
4845
  static pm_x_string_node_t *
4715
- pm_xstring_node_create(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *content, const pm_token_t *closing) {
4846
+ pm_xstring_node_create_unescaped(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *content, const pm_token_t *closing, const pm_string_t *unescaped) {
4716
4847
  pm_x_string_node_t *node = PM_ALLOC_NODE(parser, pm_x_string_node_t);
4717
4848
 
4718
4849
  *node = (pm_x_string_node_t) {
@@ -4726,12 +4857,18 @@ pm_xstring_node_create(pm_parser_t *parser, const pm_token_t *opening, const pm_
4726
4857
  .opening_loc = PM_LOCATION_TOKEN_VALUE(opening),
4727
4858
  .content_loc = PM_LOCATION_TOKEN_VALUE(content),
4728
4859
  .closing_loc = PM_LOCATION_TOKEN_VALUE(closing),
4729
- .unescaped = PM_EMPTY_STRING
4860
+ .unescaped = *unescaped
4730
4861
  };
4731
4862
 
4732
4863
  return node;
4733
4864
  }
4734
4865
 
4866
+ // Allocate and initialize a new XStringNode node.
4867
+ static inline pm_x_string_node_t *
4868
+ pm_xstring_node_create(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *content, const pm_token_t *closing) {
4869
+ return pm_xstring_node_create_unescaped(parser, opening, content, closing, &PM_EMPTY_STRING);
4870
+ }
4871
+
4735
4872
  // Allocate a new YieldNode node.
4736
4873
  static pm_yield_node_t *
4737
4874
  pm_yield_node_create(pm_parser_t *parser, const pm_token_t *keyword, const pm_location_t *lparen_loc, pm_arguments_node_t *arguments, const pm_location_t *rparen_loc) {
@@ -4765,8 +4902,6 @@ pm_yield_node_create(pm_parser_t *parser, const pm_token_t *keyword, const pm_lo
4765
4902
  return node;
4766
4903
  }
4767
4904
 
4768
-
4769
- #undef PM_EMPTY_STRING
4770
4905
  #undef PM_ALLOC_NODE
4771
4906
 
4772
4907
  /******************************************************************************/
@@ -4783,7 +4918,8 @@ pm_parser_scope_push(pm_parser_t *parser, bool closed) {
4783
4918
  .previous = parser->current_scope,
4784
4919
  .closed = closed,
4785
4920
  .explicit_params = false,
4786
- .numbered_params = false
4921
+ .numbered_params = false,
4922
+ .transparent = false
4787
4923
  };
4788
4924
 
4789
4925
  pm_constant_id_list_init(&scope->locals);
@@ -4792,6 +4928,25 @@ pm_parser_scope_push(pm_parser_t *parser, bool closed) {
4792
4928
  return true;
4793
4929
  }
4794
4930
 
4931
+ // Allocate and initialize a new transparent scope. Push it onto the scope stack.
4932
+ static bool
4933
+ pm_parser_scope_push_transparent(pm_parser_t *parser) {
4934
+ pm_scope_t *scope = (pm_scope_t *) malloc(sizeof(pm_scope_t));
4935
+ if (scope == NULL) return false;
4936
+
4937
+ *scope = (pm_scope_t) {
4938
+ .previous = parser->current_scope,
4939
+ .closed = false,
4940
+ .explicit_params = false,
4941
+ .numbered_params = false,
4942
+ .transparent = true
4943
+ };
4944
+
4945
+ parser->current_scope = scope;
4946
+
4947
+ return true;
4948
+ }
4949
+
4795
4950
  // Check if the current scope has a given local variable.
4796
4951
  static int
4797
4952
  pm_parser_local_depth(pm_parser_t *parser, pm_token_t *token) {
@@ -4800,7 +4955,8 @@ pm_parser_local_depth(pm_parser_t *parser, pm_token_t *token) {
4800
4955
  int depth = 0;
4801
4956
 
4802
4957
  while (scope != NULL) {
4803
- if (pm_constant_id_list_includes(&scope->locals, constant_id)) return depth;
4958
+ if (!scope->transparent &&
4959
+ pm_constant_id_list_includes(&scope->locals, constant_id)) return depth;
4804
4960
  if (scope->closed) break;
4805
4961
 
4806
4962
  scope = scope->previous;
@@ -4813,8 +4969,12 @@ pm_parser_local_depth(pm_parser_t *parser, pm_token_t *token) {
4813
4969
  // Add a constant id to the local table of the current scope.
4814
4970
  static inline void
4815
4971
  pm_parser_local_add(pm_parser_t *parser, pm_constant_id_t constant_id) {
4816
- if (!pm_constant_id_list_includes(&parser->current_scope->locals, constant_id)) {
4817
- pm_constant_id_list_append(&parser->current_scope->locals, constant_id);
4972
+ pm_scope_t *scope = parser->current_scope;
4973
+ while (scope && scope->transparent) scope = scope->previous;
4974
+
4975
+ assert(scope != NULL);
4976
+ if (!pm_constant_id_list_includes(&scope->locals, constant_id)) {
4977
+ pm_constant_id_list_append(&scope->locals, constant_id);
4818
4978
  }
4819
4979
  }
4820
4980
 
@@ -4839,18 +4999,13 @@ pm_parser_local_add_owned(pm_parser_t *parser, const uint8_t *start, size_t leng
4839
4999
  if (constant_id != 0) pm_parser_local_add(parser, constant_id);
4840
5000
  }
4841
5001
 
4842
- static inline bool
4843
- token_is_numbered_parameter(const uint8_t *start, const uint8_t *end) {
4844
- return (end - start == 2) && (start[0] == '_') && (start[1] != '0') && (pm_char_is_decimal_digit(start[1]));
4845
- }
4846
-
4847
5002
  // Add a parameter name to the current scope and check whether the name of the
4848
5003
  // parameter is unique or not.
4849
5004
  static void
4850
- pm_parser_parameter_name_check(pm_parser_t *parser, pm_token_t *name) {
5005
+ pm_parser_parameter_name_check(pm_parser_t *parser, const pm_token_t *name) {
4851
5006
  // We want to check whether the parameter name is a numbered parameter or not.
4852
5007
  if (token_is_numbered_parameter(name->start, name->end)) {
4853
- pm_diagnostic_list_append(&parser->error_list, name->start, name->end, PM_ERR_PARAMETER_NUMBERED_RESERVED);
5008
+ pm_parser_err_token(parser, name, PM_ERR_PARAMETER_NUMBERED_RESERVED);
4854
5009
  }
4855
5010
 
4856
5011
  // We want to ignore any parameter name that starts with an underscore.
@@ -4861,7 +5016,7 @@ pm_parser_parameter_name_check(pm_parser_t *parser, pm_token_t *name) {
4861
5016
  pm_constant_id_t constant_id = pm_parser_constant_id_token(parser, name);
4862
5017
 
4863
5018
  if (pm_constant_id_list_includes(&parser->current_scope->locals, constant_id)) {
4864
- pm_diagnostic_list_append(&parser->error_list, name->start, name->end, PM_ERR_PARAMETER_NAME_REPEAT);
5019
+ pm_parser_err_token(parser, name, PM_ERR_PARAMETER_NAME_REPEAT);
4865
5020
  }
4866
5021
  }
4867
5022
 
@@ -5007,17 +5162,6 @@ peek(pm_parser_t *parser) {
5007
5162
  return peek_at(parser, parser->current.end);
5008
5163
  }
5009
5164
 
5010
- // Get the next string of length len in the source starting from parser->current.end.
5011
- // If the string extends beyond the end of the source, return the empty string ""
5012
- static inline const uint8_t *
5013
- peek_string(pm_parser_t *parser, size_t len) {
5014
- if (parser->current.end + len <= parser->end) {
5015
- return parser->current.end;
5016
- } else {
5017
- return (const uint8_t *) "";
5018
- }
5019
- }
5020
-
5021
5165
  // If the character to be read matches the given value, then returns true and
5022
5166
  // advanced the current pointer.
5023
5167
  static inline bool
@@ -5069,66 +5213,17 @@ next_newline(const uint8_t *cursor, ptrdiff_t length) {
5069
5213
  return memchr(cursor, '\n', (size_t) length);
5070
5214
  }
5071
5215
 
5072
- // Find the start of the encoding comment. This is effectively an inlined
5073
- // version of strnstr with some modifications.
5074
- static inline const uint8_t *
5075
- parser_lex_encoding_comment_start(pm_parser_t *parser, const uint8_t *cursor, ptrdiff_t remaining) {
5076
- assert(remaining >= 0);
5077
- size_t length = (size_t) remaining;
5078
-
5079
- size_t key_length = strlen("coding:");
5080
- if (key_length > length) return NULL;
5081
-
5082
- const uint8_t *cursor_limit = cursor + length - key_length + 1;
5083
- while ((cursor = pm_memchr(cursor, 'c', (size_t) (cursor_limit - cursor), parser->encoding_changed, &parser->encoding)) != NULL) {
5084
- if (memcmp(cursor, "coding", key_length - 1) == 0) {
5085
- size_t whitespace_after_coding = pm_strspn_inline_whitespace(cursor + key_length - 1, parser->end - (cursor + key_length - 1));
5086
- size_t cur_pos = key_length + whitespace_after_coding;
5087
-
5088
- if (cursor[cur_pos - 1] == ':' || cursor[cur_pos - 1] == '=') {
5089
- return cursor + cur_pos;
5090
- }
5091
- }
5092
-
5093
- cursor++;
5094
- }
5095
-
5096
- return NULL;
5097
- }
5098
-
5099
5216
  // Here we're going to check if this is a "magic" comment, and perform whatever
5100
5217
  // actions are necessary for it here.
5101
5218
  static void
5102
- parser_lex_encoding_comment(pm_parser_t *parser) {
5103
- const uint8_t *start = parser->current.start + 1;
5104
- const uint8_t *end = parser->current.end;
5105
-
5106
- // These are the patterns we're going to match to find the encoding comment.
5107
- // This is definitely not complete or even really correct.
5108
- const uint8_t *encoding_start = parser_lex_encoding_comment_start(parser, start, end - start);
5109
-
5110
- // If we didn't find anything that matched our patterns, then return. Note
5111
- // that this does a _very_ poor job of actually finding the encoding, and
5112
- // there is a lot of work to do here to better reflect actual magic comment
5113
- // parsing from CRuby, but this at least gets us part of the way there.
5114
- if (encoding_start == NULL) return;
5115
-
5116
- // Skip any non-newline whitespace after the "coding:" or "coding=".
5117
- encoding_start += pm_strspn_inline_whitespace(encoding_start, end - encoding_start);
5118
-
5119
- // Now determine the end of the encoding string. This is either the end of
5120
- // the line, the first whitespace character, or a punctuation mark.
5121
- const uint8_t *encoding_end = pm_strpbrk(parser, encoding_start, (const uint8_t *) " \t\f\r\v\n;,", end - encoding_start);
5122
- encoding_end = encoding_end == NULL ? end : encoding_end;
5123
-
5124
- // Finally, we can determine the width of the encoding string.
5125
- size_t width = (size_t) (encoding_end - encoding_start);
5219
+ parser_lex_magic_comment_encoding_value(pm_parser_t *parser, const uint8_t *start, const uint8_t *end) {
5220
+ size_t width = (size_t) (end - start);
5126
5221
 
5127
5222
  // First, we're going to call out to a user-defined callback if one was
5128
5223
  // provided. If they return an encoding struct that we can use, then we'll
5129
5224
  // use that here.
5130
5225
  if (parser->encoding_decode_callback != NULL) {
5131
- pm_encoding_t *encoding = parser->encoding_decode_callback(parser, encoding_start, width);
5226
+ pm_encoding_t *encoding = parser->encoding_decode_callback(parser, start, width);
5132
5227
 
5133
5228
  if (encoding != NULL) {
5134
5229
  parser->encoding = *encoding;
@@ -5140,7 +5235,7 @@ parser_lex_encoding_comment(pm_parser_t *parser) {
5140
5235
  // Extensions like utf-8 can contain extra encoding details like,
5141
5236
  // utf-8-dos, utf-8-linux, utf-8-mac. We treat these all as utf-8 should
5142
5237
  // treat any encoding starting utf-8 as utf-8.
5143
- if ((encoding_start + 5 <= parser->end) && (pm_strncasecmp(encoding_start, (const uint8_t *) "utf-8", 5) == 0)) {
5238
+ if ((start + 5 <= end) && (pm_strncasecmp(start, (const uint8_t *) "utf-8", 5) == 0)) {
5144
5239
  // We don't need to do anything here because the default encoding is
5145
5240
  // already UTF-8. We'll just return.
5146
5241
  return;
@@ -5149,7 +5244,7 @@ parser_lex_encoding_comment(pm_parser_t *parser) {
5149
5244
  // Next, we're going to loop through each of the encodings that we handle
5150
5245
  // explicitly. If we found one that we understand, we'll use that value.
5151
5246
  #define ENCODING(value, prebuilt) \
5152
- if (width == sizeof(value) - 1 && encoding_start + width <= parser->end && pm_strncasecmp(encoding_start, (const uint8_t *) value, width) == 0) { \
5247
+ if (width == sizeof(value) - 1 && start + width <= end && pm_strncasecmp(start, (const uint8_t *) value, width) == 0) { \
5153
5248
  parser->encoding = prebuilt; \
5154
5249
  parser->encoding_changed |= true; \
5155
5250
  if (parser->encoding_changed_callback != NULL) parser->encoding_changed_callback(parser); \
@@ -5198,40 +5293,220 @@ parser_lex_encoding_comment(pm_parser_t *parser) {
5198
5293
  // didn't understand the encoding that the user was trying to use. In this
5199
5294
  // case we'll keep using the default encoding but add an error to the
5200
5295
  // parser to indicate an unsuccessful parse.
5201
- pm_diagnostic_list_append(&parser->error_list, encoding_start, encoding_end, PM_ERR_INVALID_ENCODING_MAGIC_COMMENT);
5296
+ pm_parser_err(parser, start, end, PM_ERR_INVALID_ENCODING_MAGIC_COMMENT);
5297
+ }
5298
+
5299
+ // Look for a specific pattern of "coding" and potentially set the encoding on
5300
+ // the parser.
5301
+ static void
5302
+ parser_lex_magic_comment_encoding(pm_parser_t *parser) {
5303
+ const uint8_t *cursor = parser->current.start + 1;
5304
+ const uint8_t *end = parser->current.end;
5305
+
5306
+ bool separator = false;
5307
+ while (true) {
5308
+ if (end - cursor <= 6) return;
5309
+ switch (cursor[6]) {
5310
+ case 'C': case 'c': cursor += 6; continue;
5311
+ case 'O': case 'o': cursor += 5; continue;
5312
+ case 'D': case 'd': cursor += 4; continue;
5313
+ case 'I': case 'i': cursor += 3; continue;
5314
+ case 'N': case 'n': cursor += 2; continue;
5315
+ case 'G': case 'g': cursor += 1; continue;
5316
+ case '=': case ':':
5317
+ separator = true;
5318
+ cursor += 6;
5319
+ break;
5320
+ default:
5321
+ cursor += 6;
5322
+ if (pm_char_is_whitespace(*cursor)) break;
5323
+ continue;
5324
+ }
5325
+ if (pm_strncasecmp(cursor - 6, (const uint8_t *) "coding", 6) == 0) break;
5326
+ separator = false;
5327
+ }
5328
+
5329
+ while (true) {
5330
+ do {
5331
+ if (++cursor >= end) return;
5332
+ } while (pm_char_is_whitespace(*cursor));
5333
+
5334
+ if (separator) break;
5335
+ if (*cursor != '=' && *cursor != ':') return;
5336
+
5337
+ separator = true;
5338
+ cursor++;
5339
+ }
5340
+
5341
+ const uint8_t *value_start = cursor;
5342
+ while ((*cursor == '-' || *cursor == '_' || parser->encoding.alnum_char(cursor, 1)) && ++cursor < end);
5343
+
5344
+ parser_lex_magic_comment_encoding_value(parser, value_start, cursor);
5202
5345
  }
5203
5346
 
5204
5347
  // Check if this is a magic comment that includes the frozen_string_literal
5205
5348
  // pragma. If it does, set that field on the parser.
5206
5349
  static void
5207
- parser_lex_frozen_string_literal_comment(pm_parser_t *parser) {
5208
- const uint8_t *cursor = parser->current.start + 1;
5350
+ parser_lex_magic_comment_frozen_string_literal_value(pm_parser_t *parser, const uint8_t *start, const uint8_t *end) {
5351
+ if (start + 4 <= end && pm_strncasecmp(start, (const uint8_t *) "true", 4) == 0) {
5352
+ parser->frozen_string_literal = true;
5353
+ }
5354
+ }
5355
+
5356
+ static inline bool
5357
+ pm_char_is_magic_comment_key_delimiter(const uint8_t b) {
5358
+ return b == '\'' || b == '"' || b == ':' || b == ';';
5359
+ }
5360
+
5361
+ // Find an emacs magic comment marker (-*-) within the given bounds. If one is
5362
+ // found, it returns a pointer to the start of the marker. Otherwise it returns
5363
+ // NULL.
5364
+ static inline const uint8_t *
5365
+ parser_lex_magic_comment_emacs_marker(pm_parser_t *parser, const uint8_t *cursor, const uint8_t *end) {
5366
+ while ((cursor + 3 <= end) && (cursor = pm_memchr(cursor, '-', (size_t) (end - cursor), parser->encoding_changed, &parser->encoding)) != NULL) {
5367
+ if (cursor + 3 <= end && cursor[1] == '*' && cursor[2] == '-') {
5368
+ return cursor;
5369
+ }
5370
+ cursor++;
5371
+ }
5372
+ return NULL;
5373
+ }
5374
+
5375
+ // Parse the current token on the parser to see if it's a magic comment and
5376
+ // potentially perform some action based on that. A regular expression that this
5377
+ // function is effectively matching is:
5378
+ //
5379
+ // %r"([^\\s\'\":;]+)\\s*:\\s*(\"(?:\\\\.|[^\"])*\"|[^\"\\s;]+)[\\s;]*"
5380
+ //
5381
+ // It returns true if it consumes the entire comment. Otherwise it returns
5382
+ // false.
5383
+ static inline bool
5384
+ parser_lex_magic_comment(pm_parser_t *parser, bool semantic_token_seen) {
5385
+ const uint8_t *start = parser->current.start + 1;
5209
5386
  const uint8_t *end = parser->current.end;
5387
+ if (end - start <= 7) return false;
5388
+
5389
+ const uint8_t *cursor;
5390
+ bool indicator = false;
5210
5391
 
5211
- size_t key_length = strlen("frozen_string_literal");
5212
- if (key_length > (size_t) (end - cursor)) return;
5392
+ if ((cursor = parser_lex_magic_comment_emacs_marker(parser, start, end)) != NULL) {
5393
+ start = cursor + 3;
5213
5394
 
5214
- const uint8_t *cursor_limit = cursor + (end - cursor) - key_length + 1;
5395
+ if ((cursor = parser_lex_magic_comment_emacs_marker(parser, start, end)) != NULL) {
5396
+ end = cursor;
5397
+ indicator = true;
5398
+ } else {
5399
+ // If we have a start marker but not an end marker, then we cannot
5400
+ // have a magic comment.
5401
+ return false;
5402
+ }
5403
+ }
5215
5404
 
5216
- while ((cursor = pm_memchr(cursor, 'f', (size_t) (cursor_limit - cursor), parser->encoding_changed, &parser->encoding)) != NULL) {
5217
- if (memcmp(cursor, "frozen_string_literal", key_length) == 0) {
5218
- cursor += key_length;
5219
- cursor += pm_strspn_inline_whitespace(cursor, end - cursor);
5405
+ cursor = start;
5406
+ while (cursor < end) {
5407
+ while (cursor < end && (pm_char_is_magic_comment_key_delimiter(*cursor) || pm_char_is_whitespace(*cursor))) cursor++;
5220
5408
 
5221
- if (*cursor == ':' || *cursor == '=') {
5222
- cursor++;
5223
- cursor += pm_strspn_inline_whitespace(cursor, end - cursor);
5409
+ const uint8_t *key_start = cursor;
5410
+ while (cursor < end && (!pm_char_is_magic_comment_key_delimiter(*cursor) && !pm_char_is_whitespace(*cursor))) cursor++;
5224
5411
 
5225
- if (cursor + 4 <= end && pm_strncasecmp(cursor, (const uint8_t *) "true", 4) == 0) {
5226
- parser->frozen_string_literal = true;
5227
- }
5412
+ const uint8_t *key_end = cursor;
5413
+ while (cursor < end && pm_char_is_whitespace(*cursor)) cursor++;
5414
+ if (cursor == end) break;
5228
5415
 
5229
- return;
5416
+ if (*cursor == ':') {
5417
+ cursor++;
5418
+ } else {
5419
+ if (!indicator) return false;
5420
+ continue;
5421
+ }
5422
+
5423
+ while (cursor < end && pm_char_is_whitespace(*cursor)) cursor++;
5424
+ if (cursor == end) break;
5425
+
5426
+ const uint8_t *value_start;
5427
+ const uint8_t *value_end;
5428
+
5429
+ if (*cursor == '"') {
5430
+ value_start = ++cursor;
5431
+ for (; cursor < end && *cursor != '"'; cursor++) {
5432
+ if (*cursor == '\\' && (cursor + 1 < end)) cursor++;
5230
5433
  }
5434
+ value_end = cursor;
5435
+ } else {
5436
+ value_start = cursor;
5437
+ while (cursor < end && *cursor != '"' && *cursor != ';' && !pm_char_is_whitespace(*cursor)) cursor++;
5438
+ value_end = cursor;
5231
5439
  }
5232
5440
 
5233
- cursor++;
5441
+ if (indicator) {
5442
+ while (cursor < end && (*cursor == ';' || pm_char_is_whitespace(*cursor))) cursor++;
5443
+ } else {
5444
+ while (cursor < end && pm_char_is_whitespace(*cursor)) cursor++;
5445
+ if (cursor != end) return false;
5446
+ }
5447
+
5448
+ // Here, we need to do some processing on the key to swap out dashes for
5449
+ // underscores. We only need to do this if there _is_ a dash in the key.
5450
+ pm_string_t key;
5451
+ const size_t key_length = (size_t) (key_end - key_start);
5452
+ const uint8_t *dash = pm_memchr(key_start, '-', (size_t) key_length, parser->encoding_changed, &parser->encoding);
5453
+
5454
+ if (dash == NULL) {
5455
+ pm_string_shared_init(&key, key_start, key_end);
5456
+ } else {
5457
+ size_t width = (size_t) (key_end - key_start);
5458
+ uint8_t *buffer = malloc(width);
5459
+ if (buffer == NULL) break;
5460
+
5461
+ memcpy(buffer, key_start, width);
5462
+ buffer[dash - key_start] = '_';
5463
+
5464
+ while ((dash = pm_memchr(dash + 1, '-', (size_t) (key_end - dash - 1), parser->encoding_changed, &parser->encoding)) != NULL) {
5465
+ buffer[dash - key_start] = '_';
5466
+ }
5467
+
5468
+ pm_string_owned_init(&key, buffer, width);
5469
+ }
5470
+
5471
+ // Finally, we can start checking the key against the list of known
5472
+ // magic comment keys, and potentially change state based on that.
5473
+ const uint8_t *key_source = pm_string_source(&key);
5474
+
5475
+ // We only want to attempt to compare against encoding comments if it's
5476
+ // the first line in the file (or the second in the case of a shebang).
5477
+ if (parser->current.start == parser->encoding_comment_start) {
5478
+ if (
5479
+ (key_length == 8 && pm_strncasecmp(key_source, (const uint8_t *) "encoding", 8) == 0) ||
5480
+ (key_length == 6 && pm_strncasecmp(key_source, (const uint8_t *) "coding", 6) == 0)
5481
+ ) {
5482
+ parser_lex_magic_comment_encoding_value(parser, value_start, value_end);
5483
+ }
5484
+ }
5485
+
5486
+ // We only want to handle frozen string literal comments if it's before
5487
+ // any semantic tokens have been seen.
5488
+ if (!semantic_token_seen) {
5489
+ if (key_length == 21 && pm_strncasecmp(key_source, (const uint8_t *) "frozen_string_literal", 21) == 0) {
5490
+ parser_lex_magic_comment_frozen_string_literal_value(parser, value_start, value_end);
5491
+ }
5492
+ }
5493
+
5494
+ // When we're done, we want to free the string in case we had to
5495
+ // allocate memory for it.
5496
+ pm_string_free(&key);
5497
+
5498
+ // Allocate a new magic comment node to append to the parser's list.
5499
+ pm_magic_comment_t *magic_comment;
5500
+ if ((magic_comment = (pm_magic_comment_t *) calloc(sizeof(pm_magic_comment_t), 1)) != NULL) {
5501
+ magic_comment->key_start = key_start;
5502
+ magic_comment->value_start = value_start;
5503
+ magic_comment->key_length = (uint32_t) key_length;
5504
+ magic_comment->value_length = (uint32_t) (value_end - value_start);
5505
+ pm_list_append(&parser->magic_comment_list, (pm_list_node_t *) magic_comment);
5506
+ }
5234
5507
  }
5508
+
5509
+ return true;
5235
5510
  }
5236
5511
 
5237
5512
  /******************************************************************************/
@@ -5366,7 +5641,7 @@ context_def_p(pm_parser_t *parser) {
5366
5641
  static void
5367
5642
  pm_strspn_number_validate(pm_parser_t *parser, const uint8_t *invalid) {
5368
5643
  if (invalid != NULL) {
5369
- pm_diagnostic_list_append(&parser->error_list, invalid, invalid + 1, PM_ERR_INVALID_NUMBER_UNDERSCORE);
5644
+ pm_parser_err(parser, invalid, invalid + 1, PM_ERR_INVALID_NUMBER_UNDERSCORE);
5370
5645
  }
5371
5646
  }
5372
5647
 
@@ -5430,7 +5705,7 @@ lex_optional_float_suffix(pm_parser_t *parser) {
5430
5705
  parser->current.end += pm_strspn_decimal_number_validate(parser, parser->current.end);
5431
5706
  type = PM_TOKEN_FLOAT;
5432
5707
  } else {
5433
- pm_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, PM_ERR_INVALID_FLOAT_EXPONENT);
5708
+ pm_parser_err_current(parser, PM_ERR_INVALID_FLOAT_EXPONENT);
5434
5709
  type = PM_TOKEN_FLOAT;
5435
5710
  }
5436
5711
  }
@@ -5451,7 +5726,7 @@ lex_numeric_prefix(pm_parser_t *parser) {
5451
5726
  if (pm_char_is_decimal_digit(peek(parser))) {
5452
5727
  parser->current.end += pm_strspn_decimal_number_validate(parser, parser->current.end);
5453
5728
  } else {
5454
- pm_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, PM_ERR_INVALID_NUMBER_DECIMAL);
5729
+ pm_parser_err_current(parser, PM_ERR_INVALID_NUMBER_DECIMAL);
5455
5730
  }
5456
5731
 
5457
5732
  break;
@@ -5463,7 +5738,7 @@ lex_numeric_prefix(pm_parser_t *parser) {
5463
5738
  if (pm_char_is_binary_digit(peek(parser))) {
5464
5739
  parser->current.end += pm_strspn_binary_number_validate(parser, parser->current.end);
5465
5740
  } else {
5466
- pm_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, PM_ERR_INVALID_NUMBER_BINARY);
5741
+ pm_parser_err_current(parser, PM_ERR_INVALID_NUMBER_BINARY);
5467
5742
  }
5468
5743
 
5469
5744
  parser->integer_base = PM_INTEGER_BASE_FLAGS_BINARY;
@@ -5476,7 +5751,7 @@ lex_numeric_prefix(pm_parser_t *parser) {
5476
5751
  if (pm_char_is_octal_digit(peek(parser))) {
5477
5752
  parser->current.end += pm_strspn_octal_number_validate(parser, parser->current.end);
5478
5753
  } else {
5479
- pm_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, PM_ERR_INVALID_NUMBER_OCTAL);
5754
+ pm_parser_err_current(parser, PM_ERR_INVALID_NUMBER_OCTAL);
5480
5755
  }
5481
5756
 
5482
5757
  parser->integer_base = PM_INTEGER_BASE_FLAGS_OCTAL;
@@ -5503,7 +5778,7 @@ lex_numeric_prefix(pm_parser_t *parser) {
5503
5778
  if (pm_char_is_hexadecimal_digit(peek(parser))) {
5504
5779
  parser->current.end += pm_strspn_hexadecimal_number_validate(parser, parser->current.end);
5505
5780
  } else {
5506
- pm_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, PM_ERR_INVALID_NUMBER_HEXADECIMAL);
5781
+ pm_parser_err_current(parser, PM_ERR_INVALID_NUMBER_HEXADECIMAL);
5507
5782
  }
5508
5783
 
5509
5784
  parser->integer_base = PM_INTEGER_BASE_FLAGS_HEXADECIMAL;
@@ -5581,7 +5856,7 @@ lex_numeric(pm_parser_t *parser) {
5581
5856
  static pm_token_type_t
5582
5857
  lex_global_variable(pm_parser_t *parser) {
5583
5858
  if (parser->current.end >= parser->end) {
5584
- pm_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, PM_ERR_INVALID_VARIABLE_GLOBAL);
5859
+ pm_parser_err_current(parser, PM_ERR_INVALID_VARIABLE_GLOBAL);
5585
5860
  return PM_TOKEN_GLOBAL_VARIABLE;
5586
5861
  }
5587
5862
 
@@ -5622,7 +5897,7 @@ lex_global_variable(pm_parser_t *parser) {
5622
5897
  } while (parser->current.end < parser->end && (width = char_is_identifier(parser, parser->current.end)) > 0);
5623
5898
 
5624
5899
  // $0 isn't allowed to be followed by anything.
5625
- pm_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, PM_ERR_INVALID_VARIABLE_GLOBAL);
5900
+ pm_parser_err_current(parser, PM_ERR_INVALID_VARIABLE_GLOBAL);
5626
5901
  }
5627
5902
 
5628
5903
  return PM_TOKEN_GLOBAL_VARIABLE;
@@ -5653,7 +5928,7 @@ lex_global_variable(pm_parser_t *parser) {
5653
5928
  } else {
5654
5929
  // If we get here, then we have a $ followed by something that isn't
5655
5930
  // recognized as a global variable.
5656
- pm_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, PM_ERR_INVALID_VARIABLE_GLOBAL);
5931
+ pm_parser_err_current(parser, PM_ERR_INVALID_VARIABLE_GLOBAL);
5657
5932
  }
5658
5933
 
5659
5934
  return PM_TOKEN_GLOBAL_VARIABLE;
@@ -5962,52 +6237,475 @@ lex_interpolation(pm_parser_t *parser, const uint8_t *pound) {
5962
6237
  }
5963
6238
  }
5964
6239
 
5965
- // This function is responsible for lexing either a character literal or the ?
5966
- // operator. The supported character literals are described below.
5967
- //
5968
- // \a bell, ASCII 07h (BEL)
5969
- // \b backspace, ASCII 08h (BS)
5970
- // \t horizontal tab, ASCII 09h (TAB)
5971
- // \n newline (line feed), ASCII 0Ah (LF)
5972
- // \v vertical tab, ASCII 0Bh (VT)
5973
- // \f form feed, ASCII 0Ch (FF)
5974
- // \r carriage return, ASCII 0Dh (CR)
5975
- // \e escape, ASCII 1Bh (ESC)
5976
- // \s space, ASCII 20h (SPC)
5977
- // \\ backslash
5978
- // \nnn octal bit pattern, where nnn is 1-3 octal digits ([0-7])
5979
- // \xnn hexadecimal bit pattern, where nn is 1-2 hexadecimal digits ([0-9a-fA-F])
5980
- // \unnnn Unicode character, where nnnn is exactly 4 hexadecimal digits ([0-9a-fA-F])
5981
- // \u{nnnn ...} Unicode character(s), where each nnnn is 1-6 hexadecimal digits ([0-9a-fA-F])
5982
- // \cx or \C-x control character, where x is an ASCII printable character
5983
- // \M-x meta character, where x is an ASCII printable character
5984
- // \M-\C-x meta control character, where x is an ASCII printable character
5985
- // \M-\cx same as above
5986
- // \c\M-x same as above
5987
- // \c? or \C-? delete, ASCII 7Fh (DEL)
5988
- //
5989
- static pm_token_type_t
5990
- lex_question_mark(pm_parser_t *parser) {
5991
- if (lex_state_end_p(parser)) {
5992
- lex_state_set(parser, PM_LEX_STATE_BEG);
5993
- return PM_TOKEN_QUESTION_MARK;
5994
- }
6240
+ static const uint8_t PM_ESCAPE_FLAG_NONE = 0x0;
6241
+ static const uint8_t PM_ESCAPE_FLAG_CONTROL = 0x1;
6242
+ static const uint8_t PM_ESCAPE_FLAG_META = 0x2;
6243
+ static const uint8_t PM_ESCAPE_FLAG_SINGLE = 0x4;
6244
+ static const uint8_t PM_ESCAPE_FLAG_REGEXP = 0x8;
5995
6245
 
5996
- if (parser->current.end >= parser->end) {
5997
- pm_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, PM_ERR_INCOMPLETE_QUESTION_MARK);
5998
- return PM_TOKEN_CHARACTER_LITERAL;
5999
- }
6246
+ // This is a lookup table for whether or not an ASCII character is printable.
6247
+ static const bool ascii_printable_chars[] = {
6248
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
6249
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
6250
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6251
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6252
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6253
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
6254
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6255
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0
6256
+ };
6000
6257
 
6001
- if (pm_char_is_whitespace(*parser->current.end)) {
6002
- lex_state_set(parser, PM_LEX_STATE_BEG);
6003
- return PM_TOKEN_QUESTION_MARK;
6004
- }
6258
+ static inline bool
6259
+ char_is_ascii_printable(const uint8_t b) {
6260
+ return (b < 0x80) && ascii_printable_chars[b];
6261
+ }
6262
+
6263
+ // Return the value that a hexadecimal digit character represents. For example,
6264
+ // transform 'a' into 10, 'b' into 11, etc.
6265
+ static inline uint8_t
6266
+ escape_hexadecimal_digit(const uint8_t value) {
6267
+ return (uint8_t) ((value <= '9') ? (value - '0') : (value & 0x7) + 9);
6268
+ }
6269
+
6270
+ // Scan the 4 digits of a Unicode escape into the value. Returns the number of
6271
+ // digits scanned. This function assumes that the characters have already been
6272
+ // validated.
6273
+ static inline uint32_t
6274
+ escape_unicode(const uint8_t *string, size_t length) {
6275
+ uint32_t value = 0;
6276
+ for (size_t index = 0; index < length; index++) {
6277
+ if (index != 0) value <<= 4;
6278
+ value |= escape_hexadecimal_digit(string[index]);
6279
+ }
6280
+ return value;
6281
+ }
6282
+
6283
+ // Escape a single character value based on the given flags.
6284
+ static inline uint8_t
6285
+ escape_byte(uint8_t value, const uint8_t flags) {
6286
+ if (flags & PM_ESCAPE_FLAG_CONTROL) value &= 0x1f;
6287
+ if (flags & PM_ESCAPE_FLAG_META) value |= 0x80;
6288
+ return value;
6289
+ }
6290
+
6291
+ // Write a unicode codepoint to the given buffer.
6292
+ static inline void
6293
+ escape_write_unicode(pm_parser_t *parser, pm_buffer_t *buffer, const uint8_t *start, const uint8_t *end, uint32_t value) {
6294
+ if (value <= 0x7F) { // 0xxxxxxx
6295
+ pm_buffer_append_u8(buffer, (uint8_t) value);
6296
+ } else if (value <= 0x7FF) { // 110xxxxx 10xxxxxx
6297
+ pm_buffer_append_u8(buffer, (uint8_t) (0xC0 | (value >> 6)));
6298
+ pm_buffer_append_u8(buffer, (uint8_t) (0x80 | (value & 0x3F)));
6299
+ } else if (value <= 0xFFFF) { // 1110xxxx 10xxxxxx 10xxxxxx
6300
+ pm_buffer_append_u8(buffer, (uint8_t) (0xE0 | (value >> 12)));
6301
+ pm_buffer_append_u8(buffer, (uint8_t) (0x80 | ((value >> 6) & 0x3F)));
6302
+ pm_buffer_append_u8(buffer, (uint8_t) (0x80 | (value & 0x3F)));
6303
+ } else if (value <= 0x10FFFF) { // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
6304
+ pm_buffer_append_u8(buffer, (uint8_t) (0xF0 | (value >> 18)));
6305
+ pm_buffer_append_u8(buffer, (uint8_t) (0x80 | ((value >> 12) & 0x3F)));
6306
+ pm_buffer_append_u8(buffer, (uint8_t) (0x80 | ((value >> 6) & 0x3F)));
6307
+ pm_buffer_append_u8(buffer, (uint8_t) (0x80 | (value & 0x3F)));
6308
+ } else {
6309
+ pm_parser_err(parser, start, end, PM_ERR_ESCAPE_INVALID_UNICODE);
6310
+ pm_buffer_append_u8(buffer, 0xEF);
6311
+ pm_buffer_append_u8(buffer, 0xBF);
6312
+ pm_buffer_append_u8(buffer, 0xBD);
6313
+ }
6314
+ }
6315
+
6316
+ // The regular expression engine doesn't support the same escape sequences as
6317
+ // Ruby does. So first we have to read the escape sequence, and then we have to
6318
+ // format it like the regular expression engine expects it. For example, in Ruby
6319
+ // if we have:
6320
+ //
6321
+ // /\M-\C-?/
6322
+ //
6323
+ // then the first byte is actually 255, so we have to rewrite this as:
6324
+ //
6325
+ // /\xFF/
6326
+ //
6327
+ // Note that in this case there is a literal \ byte in the regular expression
6328
+ // source so that the regular expression engine will perform its own unescaping.
6329
+ static inline void
6330
+ escape_write_byte(pm_buffer_t *buffer, uint8_t flags, uint8_t byte) {
6331
+ if (flags & PM_ESCAPE_FLAG_REGEXP) {
6332
+ pm_buffer_append_bytes(buffer, (const uint8_t *) "\\x", 2);
6333
+
6334
+ uint8_t byte1 = (uint8_t) ((byte >> 4) & 0xF);
6335
+ uint8_t byte2 = (uint8_t) (byte & 0xF);
6336
+
6337
+ if (byte1 >= 0xA) {
6338
+ pm_buffer_append_u8(buffer, (uint8_t) ((byte1 - 0xA) + 'A'));
6339
+ } else {
6340
+ pm_buffer_append_u8(buffer, (uint8_t) (byte1 + '0'));
6341
+ }
6342
+
6343
+ if (byte2 >= 0xA) {
6344
+ pm_buffer_append_u8(buffer, (uint8_t) (byte2 - 0xA + 'A'));
6345
+ } else {
6346
+ pm_buffer_append_u8(buffer, (uint8_t) (byte2 + '0'));
6347
+ }
6348
+ } else {
6349
+ pm_buffer_append_u8(buffer, byte);
6350
+ }
6351
+ }
6352
+
6353
+ // Read the value of an escape into the buffer.
6354
+ static void
6355
+ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) {
6356
+ switch (peek(parser)) {
6357
+ case '\\': {
6358
+ parser->current.end++;
6359
+ pm_buffer_append_u8(buffer, '\\');
6360
+ return;
6361
+ }
6362
+ case '\'': {
6363
+ parser->current.end++;
6364
+ pm_buffer_append_u8(buffer, '\'');
6365
+ return;
6366
+ }
6367
+ case 'a': {
6368
+ parser->current.end++;
6369
+ pm_buffer_append_u8(buffer, '\a');
6370
+ return;
6371
+ }
6372
+ case 'b': {
6373
+ parser->current.end++;
6374
+ pm_buffer_append_u8(buffer, '\b');
6375
+ return;
6376
+ }
6377
+ case 'e': {
6378
+ parser->current.end++;
6379
+ pm_buffer_append_u8(buffer, '\033');
6380
+ return;
6381
+ }
6382
+ case 'f': {
6383
+ parser->current.end++;
6384
+ pm_buffer_append_u8(buffer, '\f');
6385
+ return;
6386
+ }
6387
+ case 'n': {
6388
+ parser->current.end++;
6389
+ pm_buffer_append_u8(buffer, '\n');
6390
+ return;
6391
+ }
6392
+ case 'r': {
6393
+ parser->current.end++;
6394
+ pm_buffer_append_u8(buffer, '\r');
6395
+ return;
6396
+ }
6397
+ case 's': {
6398
+ parser->current.end++;
6399
+ pm_buffer_append_u8(buffer, ' ');
6400
+ return;
6401
+ }
6402
+ case 't': {
6403
+ parser->current.end++;
6404
+ pm_buffer_append_u8(buffer, '\t');
6405
+ return;
6406
+ }
6407
+ case 'v': {
6408
+ parser->current.end++;
6409
+ pm_buffer_append_u8(buffer, '\v');
6410
+ return;
6411
+ }
6412
+ case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': {
6413
+ uint8_t value = (uint8_t) (*parser->current.end - '0');
6414
+ parser->current.end++;
6415
+
6416
+ if (pm_char_is_octal_digit(peek(parser))) {
6417
+ value = ((uint8_t) (value << 3)) | ((uint8_t) (*parser->current.end - '0'));
6418
+ parser->current.end++;
6419
+
6420
+ if (pm_char_is_octal_digit(peek(parser))) {
6421
+ value = ((uint8_t) (value << 3)) | ((uint8_t) (*parser->current.end - '0'));
6422
+ parser->current.end++;
6423
+ }
6424
+ }
6425
+
6426
+ pm_buffer_append_u8(buffer, value);
6427
+ return;
6428
+ }
6429
+ case 'x': {
6430
+ const uint8_t *start = parser->current.end - 1;
6431
+
6432
+ parser->current.end++;
6433
+ uint8_t byte = peek(parser);
6434
+
6435
+ if (pm_char_is_hexadecimal_digit(byte)) {
6436
+ uint8_t value = escape_hexadecimal_digit(byte);
6437
+ parser->current.end++;
6438
+
6439
+ byte = peek(parser);
6440
+ if (pm_char_is_hexadecimal_digit(byte)) {
6441
+ value = (uint8_t) ((value << 4) | escape_hexadecimal_digit(byte));
6442
+ parser->current.end++;
6443
+ }
6444
+
6445
+ if (flags & PM_ESCAPE_FLAG_REGEXP) {
6446
+ pm_buffer_append_bytes(buffer, start, (size_t) (parser->current.end - start));
6447
+ } else {
6448
+ pm_buffer_append_u8(buffer, value);
6449
+ }
6450
+ } else {
6451
+ pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_HEXADECIMAL);
6452
+ }
6453
+
6454
+ return;
6455
+ }
6456
+ case 'u': {
6457
+ const uint8_t *start = parser->current.end - 1;
6458
+ parser->current.end++;
6459
+
6460
+ if (
6461
+ (parser->current.end + 4 <= parser->end) &&
6462
+ pm_char_is_hexadecimal_digit(parser->current.end[0]) &&
6463
+ pm_char_is_hexadecimal_digit(parser->current.end[1]) &&
6464
+ pm_char_is_hexadecimal_digit(parser->current.end[2]) &&
6465
+ pm_char_is_hexadecimal_digit(parser->current.end[3])
6466
+ ) {
6467
+ uint32_t value = escape_unicode(parser->current.end, 4);
6468
+
6469
+ if (flags & PM_ESCAPE_FLAG_REGEXP) {
6470
+ pm_buffer_append_bytes(buffer, start, (size_t) (parser->current.end + 4 - start));
6471
+ } else {
6472
+ escape_write_unicode(parser, buffer, start, parser->current.end + 4, value);
6473
+ }
6474
+
6475
+ parser->current.end += 4;
6476
+ } else if (peek(parser) == '{') {
6477
+ const uint8_t *unicode_codepoints_start = parser->current.end - 2;
6478
+
6479
+ parser->current.end++;
6480
+ parser->current.end += pm_strspn_whitespace(parser->current.end, parser->end - parser->current.end);
6481
+
6482
+ const uint8_t *extra_codepoints_start = NULL;
6483
+ int codepoints_count = 0;
6484
+
6485
+ while ((parser->current.end < parser->end) && (*parser->current.end != '}')) {
6486
+ const uint8_t *unicode_start = parser->current.end;
6487
+ size_t hexadecimal_length = pm_strspn_hexadecimal_digit(parser->current.end, parser->end - parser->current.end);
6488
+
6489
+ if (hexadecimal_length > 6) {
6490
+ // \u{nnnn} character literal allows only 1-6 hexadecimal digits
6491
+ pm_parser_err(parser, unicode_start, unicode_start + hexadecimal_length, PM_ERR_ESCAPE_INVALID_UNICODE_LONG);
6492
+ } else if (hexadecimal_length == 0) {
6493
+ // there are not hexadecimal characters
6494
+ pm_parser_err(parser, unicode_start, unicode_start + hexadecimal_length, PM_ERR_ESCAPE_INVALID_UNICODE);
6495
+ return;
6496
+ }
6497
+
6498
+ parser->current.end += hexadecimal_length;
6499
+ codepoints_count++;
6500
+ if (flags & PM_ESCAPE_FLAG_SINGLE && codepoints_count == 2) {
6501
+ extra_codepoints_start = unicode_start;
6502
+ }
6503
+
6504
+ if (!(flags & PM_ESCAPE_FLAG_REGEXP)) {
6505
+ uint32_t value = escape_unicode(unicode_start, hexadecimal_length);
6506
+ escape_write_unicode(parser, buffer, unicode_start, parser->current.end, value);
6507
+ }
6508
+
6509
+ parser->current.end += pm_strspn_whitespace(parser->current.end, parser->end - parser->current.end);
6510
+ }
6511
+
6512
+ // ?\u{nnnn} character literal should contain only one codepoint and cannot be like ?\u{nnnn mmmm}
6513
+ if (flags & PM_ESCAPE_FLAG_SINGLE && codepoints_count > 1) {
6514
+ pm_parser_err(parser, extra_codepoints_start, parser->current.end - 1, PM_ERR_ESCAPE_INVALID_UNICODE_LITERAL);
6515
+ }
6516
+
6517
+ if (peek(parser) == '}') {
6518
+ parser->current.end++;
6519
+ } else {
6520
+ pm_parser_err(parser, unicode_codepoints_start, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE_TERM);
6521
+ }
6522
+
6523
+ if (flags & PM_ESCAPE_FLAG_REGEXP) {
6524
+ pm_buffer_append_bytes(buffer, unicode_codepoints_start, (size_t) (parser->current.end - unicode_codepoints_start));
6525
+ }
6526
+ } else {
6527
+ pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_UNICODE);
6528
+ }
6529
+
6530
+ return;
6531
+ }
6532
+ case 'c': {
6533
+ parser->current.end++;
6534
+ if (parser->current.end == parser->end) {
6535
+ pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_CONTROL);
6536
+ return;
6537
+ }
6538
+
6539
+ uint8_t peeked = peek(parser);
6540
+ switch (peeked) {
6541
+ case '?': {
6542
+ parser->current.end++;
6543
+ escape_write_byte(buffer, flags, escape_byte(0x7f, flags));
6544
+ return;
6545
+ }
6546
+ case '\\':
6547
+ if (flags & PM_ESCAPE_FLAG_CONTROL) {
6548
+ pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_CONTROL_REPEAT);
6549
+ return;
6550
+ }
6551
+ parser->current.end++;
6552
+ escape_read(parser, buffer, flags | PM_ESCAPE_FLAG_CONTROL);
6553
+ return;
6554
+ default: {
6555
+ if (!char_is_ascii_printable(peeked)) {
6556
+ pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_CONTROL);
6557
+ return;
6558
+ }
6559
+
6560
+ parser->current.end++;
6561
+ escape_write_byte(buffer, flags, escape_byte(peeked, flags | PM_ESCAPE_FLAG_CONTROL));
6562
+ return;
6563
+ }
6564
+ }
6565
+ }
6566
+ case 'C': {
6567
+ parser->current.end++;
6568
+ if (peek(parser) != '-') {
6569
+ pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_CONTROL);
6570
+ return;
6571
+ }
6572
+
6573
+ parser->current.end++;
6574
+ if (parser->current.end == parser->end) {
6575
+ pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_CONTROL);
6576
+ return;
6577
+ }
6578
+
6579
+ uint8_t peeked = peek(parser);
6580
+ switch (peeked) {
6581
+ case '?': {
6582
+ parser->current.end++;
6583
+ escape_write_byte(buffer, flags, escape_byte(0x7f, flags));
6584
+ return;
6585
+ }
6586
+ case '\\':
6587
+ if (flags & PM_ESCAPE_FLAG_CONTROL) {
6588
+ pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_CONTROL_REPEAT);
6589
+ return;
6590
+ }
6591
+ parser->current.end++;
6592
+ escape_read(parser, buffer, flags | PM_ESCAPE_FLAG_CONTROL);
6593
+ return;
6594
+ default: {
6595
+ if (!char_is_ascii_printable(peeked)) {
6596
+ pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_CONTROL);
6597
+ return;
6598
+ }
6599
+
6600
+ parser->current.end++;
6601
+ escape_write_byte(buffer, flags, escape_byte(peeked, flags | PM_ESCAPE_FLAG_CONTROL));
6602
+ return;
6603
+ }
6604
+ }
6605
+ }
6606
+ case 'M': {
6607
+ parser->current.end++;
6608
+ if (peek(parser) != '-') {
6609
+ pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_META);
6610
+ return;
6611
+ }
6612
+
6613
+ parser->current.end++;
6614
+ if (parser->current.end == parser->end) {
6615
+ pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_META);
6616
+ return;
6617
+ }
6618
+
6619
+ uint8_t peeked = peek(parser);
6620
+ if (peeked == '\\') {
6621
+ if (flags & PM_ESCAPE_FLAG_META) {
6622
+ pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_META_REPEAT);
6623
+ return;
6624
+ }
6625
+ parser->current.end++;
6626
+ escape_read(parser, buffer, flags | PM_ESCAPE_FLAG_META);
6627
+ return;
6628
+ }
6629
+
6630
+ if (!char_is_ascii_printable(peeked)) {
6631
+ pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_META);
6632
+ return;
6633
+ }
6634
+
6635
+ parser->current.end++;
6636
+ escape_write_byte(buffer, flags, escape_byte(peeked, flags | PM_ESCAPE_FLAG_META));
6637
+ return;
6638
+ }
6639
+ case '\r': {
6640
+ if (peek_offset(parser, 1) == '\n') {
6641
+ parser->current.end += 2;
6642
+ pm_buffer_append_u8(buffer, '\n');
6643
+ return;
6644
+ }
6645
+ }
6646
+ /* fallthrough */
6647
+ default: {
6648
+ if (parser->current.end < parser->end) {
6649
+ pm_buffer_append_u8(buffer, *parser->current.end++);
6650
+ }
6651
+ return;
6652
+ }
6653
+ }
6654
+ }
6655
+
6656
+ // This function is responsible for lexing either a character literal or the ?
6657
+ // operator. The supported character literals are described below.
6658
+ //
6659
+ // \a bell, ASCII 07h (BEL)
6660
+ // \b backspace, ASCII 08h (BS)
6661
+ // \t horizontal tab, ASCII 09h (TAB)
6662
+ // \n newline (line feed), ASCII 0Ah (LF)
6663
+ // \v vertical tab, ASCII 0Bh (VT)
6664
+ // \f form feed, ASCII 0Ch (FF)
6665
+ // \r carriage return, ASCII 0Dh (CR)
6666
+ // \e escape, ASCII 1Bh (ESC)
6667
+ // \s space, ASCII 20h (SPC)
6668
+ // \\ backslash
6669
+ // \nnn octal bit pattern, where nnn is 1-3 octal digits ([0-7])
6670
+ // \xnn hexadecimal bit pattern, where nn is 1-2 hexadecimal digits ([0-9a-fA-F])
6671
+ // \unnnn Unicode character, where nnnn is exactly 4 hexadecimal digits ([0-9a-fA-F])
6672
+ // \u{nnnn ...} Unicode character(s), where each nnnn is 1-6 hexadecimal digits ([0-9a-fA-F])
6673
+ // \cx or \C-x control character, where x is an ASCII printable character
6674
+ // \M-x meta character, where x is an ASCII printable character
6675
+ // \M-\C-x meta control character, where x is an ASCII printable character
6676
+ // \M-\cx same as above
6677
+ // \c\M-x same as above
6678
+ // \c? or \C-? delete, ASCII 7Fh (DEL)
6679
+ //
6680
+ static pm_token_type_t
6681
+ lex_question_mark(pm_parser_t *parser) {
6682
+ if (lex_state_end_p(parser)) {
6683
+ lex_state_set(parser, PM_LEX_STATE_BEG);
6684
+ return PM_TOKEN_QUESTION_MARK;
6685
+ }
6686
+
6687
+ if (parser->current.end >= parser->end) {
6688
+ pm_parser_err_current(parser, PM_ERR_INCOMPLETE_QUESTION_MARK);
6689
+ pm_string_shared_init(&parser->current_string, parser->current.start + 1, parser->current.end);
6690
+ return PM_TOKEN_CHARACTER_LITERAL;
6691
+ }
6692
+
6693
+ if (pm_char_is_whitespace(*parser->current.end)) {
6694
+ lex_state_set(parser, PM_LEX_STATE_BEG);
6695
+ return PM_TOKEN_QUESTION_MARK;
6696
+ }
6005
6697
 
6006
6698
  lex_state_set(parser, PM_LEX_STATE_BEG);
6007
6699
 
6008
- if (parser->current.start[1] == '\\') {
6700
+ if (match(parser, '\\')) {
6009
6701
  lex_state_set(parser, PM_LEX_STATE_END);
6010
- parser->current.end += pm_unescape_calculate_difference(parser, parser->current.start + 1, PM_UNESCAPE_ALL, true);
6702
+
6703
+ pm_buffer_t buffer;
6704
+ pm_buffer_init_capacity(&buffer, 3);
6705
+
6706
+ escape_read(parser, &buffer, PM_ESCAPE_FLAG_SINGLE);
6707
+ pm_string_owned_init(&parser->current_string, (uint8_t *) buffer.value, buffer.length);
6708
+
6011
6709
  return PM_TOKEN_CHARACTER_LITERAL;
6012
6710
  } else {
6013
6711
  size_t encoding_width = parser->encoding.char_width(parser->current.end, parser->end - parser->current.end);
@@ -6024,6 +6722,7 @@ lex_question_mark(pm_parser_t *parser) {
6024
6722
  ) {
6025
6723
  lex_state_set(parser, PM_LEX_STATE_END);
6026
6724
  parser->current.end += encoding_width;
6725
+ pm_string_shared_init(&parser->current_string, parser->current.start + 1, parser->current.end);
6027
6726
  return PM_TOKEN_CHARACTER_LITERAL;
6028
6727
  }
6029
6728
  }
@@ -6045,9 +6744,9 @@ lex_at_variable(pm_parser_t *parser) {
6045
6744
  parser->current.end += width;
6046
6745
  }
6047
6746
  } else if (type == PM_TOKEN_CLASS_VARIABLE) {
6048
- pm_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, PM_ERR_INCOMPLETE_VARIABLE_CLASS);
6747
+ pm_parser_err_current(parser, PM_ERR_INCOMPLETE_VARIABLE_CLASS);
6049
6748
  } else {
6050
- pm_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, PM_ERR_INCOMPLETE_VARIABLE_INSTANCE);
6749
+ pm_parser_err_current(parser, PM_ERR_INCOMPLETE_VARIABLE_INSTANCE);
6051
6750
  }
6052
6751
 
6053
6752
  // If we're lexing an embedded variable, then we need to pop back into the
@@ -6070,7 +6769,7 @@ parser_lex_callback(pm_parser_t *parser) {
6070
6769
  // Return a new comment node of the specified type.
6071
6770
  static inline pm_comment_t *
6072
6771
  parser_comment(pm_parser_t *parser, pm_comment_type_t type) {
6073
- pm_comment_t *comment = (pm_comment_t *) malloc(sizeof(pm_comment_t));
6772
+ pm_comment_t *comment = (pm_comment_t *) calloc(sizeof(pm_comment_t), 1);
6074
6773
  if (comment == NULL) return NULL;
6075
6774
 
6076
6775
  *comment = (pm_comment_t) {
@@ -6146,7 +6845,7 @@ lex_embdoc(pm_parser_t *parser) {
6146
6845
  parser_lex_callback(parser);
6147
6846
  }
6148
6847
 
6149
- pm_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, PM_ERR_EMBDOC_TERM);
6848
+ pm_parser_err_current(parser, PM_ERR_EMBDOC_TERM);
6150
6849
 
6151
6850
  comment->end = parser->current.end;
6152
6851
  pm_list_append(&parser->comment_list, (pm_list_node_t *) comment);
@@ -6177,6 +6876,113 @@ parser_flush_heredoc_end(pm_parser_t *parser) {
6177
6876
  parser->heredoc_end = NULL;
6178
6877
  }
6179
6878
 
6879
+ // When we're lexing certain types (strings, symbols, lists, etc.) we have
6880
+ // string content associated with the tokens. For example:
6881
+ //
6882
+ // "foo"
6883
+ //
6884
+ // In this case, the string content is foo. Since there is no escaping, there's
6885
+ // no need to track additional information and the token can be returned as
6886
+ // normal. However, if we have escape sequences:
6887
+ //
6888
+ // "foo\n"
6889
+ //
6890
+ // then the bytes in the string are "f", "o", "o", "\", "n", but we want to
6891
+ // provide our consumers with the string content "f", "o", "o", "\n". In these
6892
+ // cases, when we find the first escape sequence, we initialize a pm_buffer_t
6893
+ // to keep track of the string content. Then in the parser, it will
6894
+ // automatically attach the string content to the node that it belongs to.
6895
+ typedef struct {
6896
+ pm_buffer_t buffer;
6897
+ const uint8_t *cursor;
6898
+ } pm_token_buffer_t;
6899
+
6900
+ // Push the given byte into the token buffer.
6901
+ static inline void
6902
+ pm_token_buffer_push(pm_token_buffer_t *token_buffer, uint8_t byte) {
6903
+ pm_buffer_append_u8(&token_buffer->buffer, byte);
6904
+ }
6905
+
6906
+ // When we're about to return from lexing the current token and we know for sure
6907
+ // that we have found an escape sequence, this function is called to copy the
6908
+ // contents of the token buffer into the current string on the parser so that it
6909
+ // can be attached to the correct node.
6910
+ static inline void
6911
+ pm_token_buffer_copy(pm_parser_t *parser, pm_token_buffer_t *token_buffer) {
6912
+ pm_string_owned_init(&parser->current_string, (uint8_t *) token_buffer->buffer.value, token_buffer->buffer.length);
6913
+ }
6914
+
6915
+ // When we're about to return from lexing the current token, we need to flush
6916
+ // all of the content that we have pushed into the buffer into the current
6917
+ // string. If we haven't pushed anything into the buffer, this means that we
6918
+ // never found an escape sequence, so we can directly reference the bounds of
6919
+ // the current string. Either way, at the return of this function it is expected
6920
+ // that parser->current_string is established in such a way that it can be
6921
+ // attached to a node.
6922
+ static void
6923
+ pm_token_buffer_flush(pm_parser_t *parser, pm_token_buffer_t *token_buffer) {
6924
+ if (token_buffer->cursor == NULL) {
6925
+ pm_string_shared_init(&parser->current_string, parser->current.start, parser->current.end);
6926
+ } else {
6927
+ pm_buffer_append_bytes(&token_buffer->buffer, token_buffer->cursor, (size_t) (parser->current.end - token_buffer->cursor));
6928
+ pm_token_buffer_copy(parser, token_buffer);
6929
+ }
6930
+ }
6931
+
6932
+ // When we've found an escape sequence, we need to copy everything up to this
6933
+ // point into the buffer because we're about to provide a string that has
6934
+ // different content than a direct slice of the source.
6935
+ //
6936
+ // It is expected that the parser's current token end will be pointing at one
6937
+ // byte past the backslash that starts the escape sequence.
6938
+ static void
6939
+ pm_token_buffer_escape(pm_parser_t *parser, pm_token_buffer_t *token_buffer) {
6940
+ const uint8_t *start;
6941
+ if (token_buffer->cursor == NULL) {
6942
+ pm_buffer_init_capacity(&token_buffer->buffer, 16);
6943
+ start = parser->current.start;
6944
+ } else {
6945
+ start = token_buffer->cursor;
6946
+ }
6947
+
6948
+ const uint8_t *end = parser->current.end - 1;
6949
+ pm_buffer_append_bytes(&token_buffer->buffer, start, (size_t) (end - start));
6950
+ }
6951
+
6952
+ // Effectively the same thing as pm_strspn_inline_whitespace, but in the case of
6953
+ // a tilde heredoc expands out tab characters to the nearest tab boundaries.
6954
+ static inline size_t
6955
+ pm_heredoc_strspn_inline_whitespace(pm_parser_t *parser, const uint8_t **cursor, pm_heredoc_indent_t indent) {
6956
+ size_t whitespace = 0;
6957
+
6958
+ switch (indent) {
6959
+ case PM_HEREDOC_INDENT_NONE:
6960
+ // Do nothing, we can't match a terminator with
6961
+ // indentation and there's no need to calculate common
6962
+ // whitespace.
6963
+ break;
6964
+ case PM_HEREDOC_INDENT_DASH:
6965
+ // Skip past inline whitespace.
6966
+ *cursor += pm_strspn_inline_whitespace(*cursor, parser->end - *cursor);
6967
+ break;
6968
+ case PM_HEREDOC_INDENT_TILDE:
6969
+ // Skip past inline whitespace and calculate common
6970
+ // whitespace.
6971
+ while (*cursor < parser->end && pm_char_is_inline_whitespace(**cursor)) {
6972
+ if (**cursor == '\t') {
6973
+ whitespace = (whitespace / PM_TAB_WHITESPACE_SIZE + 1) * PM_TAB_WHITESPACE_SIZE;
6974
+ } else {
6975
+ whitespace++;
6976
+ }
6977
+ (*cursor)++;
6978
+ }
6979
+
6980
+ break;
6981
+ }
6982
+
6983
+ return whitespace;
6984
+ }
6985
+
6180
6986
  // This is a convenience macro that will set the current token type, call the
6181
6987
  // lex callback, and then return from the parser_lex function.
6182
6988
  #define LEX(token_type) parser->current.type = token_type; parser_lex_callback(parser); return
@@ -6301,12 +7107,16 @@ parser_lex(pm_parser_t *parser) {
6301
7107
  parser->current.type = PM_TOKEN_COMMENT;
6302
7108
  parser_lex_callback(parser);
6303
7109
 
6304
- if (parser->current.start == parser->encoding_comment_start) {
6305
- parser_lex_encoding_comment(parser);
6306
- }
7110
+ // Here, parse the comment to see if it's a magic comment
7111
+ // and potentially change state on the parser.
7112
+ if (!parser_lex_magic_comment(parser, semantic_token_seen) && (parser->current.start == parser->encoding_comment_start)) {
7113
+ ptrdiff_t length = parser->current.end - parser->current.start;
6307
7114
 
6308
- if (!semantic_token_seen) {
6309
- parser_lex_frozen_string_literal_comment(parser);
7115
+ // If we didn't find a magic comment within the first
7116
+ // pass and we're at the start of the file, then we need
7117
+ // to do another pass to potentially find other patterns
7118
+ // for encoding comments.
7119
+ if (length >= 10) parser_lex_magic_comment_encoding(parser);
6310
7120
  }
6311
7121
 
6312
7122
  lexed_comment = true;
@@ -6588,7 +7398,7 @@ parser_lex(pm_parser_t *parser) {
6588
7398
  pm_token_type_t type = PM_TOKEN_STAR;
6589
7399
 
6590
7400
  if (lex_state_spcarg_p(parser, space_seen)) {
6591
- pm_diagnostic_list_append(&parser->warning_list, parser->current.start, parser->current.end, PM_WARN_AMBIGUOUS_PREFIX_STAR);
7401
+ pm_parser_warn_token(parser, &parser->current, PM_WARN_AMBIGUOUS_PREFIX_STAR);
6592
7402
  type = PM_TOKEN_USTAR;
6593
7403
  } else if (lex_state_beg_p(parser)) {
6594
7404
  type = PM_TOKEN_USTAR;
@@ -6626,7 +7436,7 @@ parser_lex(pm_parser_t *parser) {
6626
7436
 
6627
7437
  // = => =~ == === =begin
6628
7438
  case '=':
6629
- if (current_token_starts_line(parser) && memcmp(peek_string(parser, 5), "begin", 5) == 0 && pm_char_is_whitespace(peek_offset(parser, 5))) {
7439
+ if (current_token_starts_line(parser) && (parser->current.end + 5 <= parser->end) && memcmp(parser->current.end, "begin", 5) == 0 && pm_char_is_whitespace(peek_offset(parser, 5))) {
6630
7440
  pm_token_type_t type = lex_embdoc(parser);
6631
7441
 
6632
7442
  if (type == PM_TOKEN_EOF) {
@@ -6720,7 +7530,8 @@ parser_lex(pm_parser_t *parser) {
6720
7530
  .ident_length = ident_length,
6721
7531
  .next_start = parser->current.end,
6722
7532
  .quote = quote,
6723
- .indent = indent
7533
+ .indent = indent,
7534
+ .common_whitespace = (size_t) -1
6724
7535
  }
6725
7536
  });
6726
7537
 
@@ -6732,7 +7543,7 @@ parser_lex(pm_parser_t *parser) {
6732
7543
  // this is not a valid heredoc declaration. In this case we
6733
7544
  // will add an error, but we will still return a heredoc
6734
7545
  // start.
6735
- pm_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, PM_ERR_EMBDOC_TERM);
7546
+ pm_parser_err_current(parser, PM_ERR_EMBDOC_TERM);
6736
7547
  body_start = parser->end;
6737
7548
  } else {
6738
7549
  // Otherwise, we want to indicate that the body of the
@@ -6925,12 +7736,7 @@ parser_lex(pm_parser_t *parser) {
6925
7736
 
6926
7737
  bool spcarg = lex_state_spcarg_p(parser, space_seen);
6927
7738
  if (spcarg) {
6928
- pm_diagnostic_list_append(
6929
- &parser->warning_list,
6930
- parser->current.start,
6931
- parser->current.end,
6932
- PM_WARN_AMBIGUOUS_FIRST_ARGUMENT_PLUS
6933
- );
7739
+ pm_parser_warn_token(parser, &parser->current, PM_WARN_AMBIGUOUS_FIRST_ARGUMENT_PLUS);
6934
7740
  }
6935
7741
 
6936
7742
  if (lex_state_beg_p(parser) || spcarg) {
@@ -6974,12 +7780,7 @@ parser_lex(pm_parser_t *parser) {
6974
7780
 
6975
7781
  bool spcarg = lex_state_spcarg_p(parser, space_seen);
6976
7782
  if (spcarg) {
6977
- pm_diagnostic_list_append(
6978
- &parser->warning_list,
6979
- parser->current.start,
6980
- parser->current.end,
6981
- PM_WARN_AMBIGUOUS_FIRST_ARGUMENT_MINUS
6982
- );
7783
+ pm_parser_warn_token(parser, &parser->current, PM_WARN_AMBIGUOUS_FIRST_ARGUMENT_MINUS);
6983
7784
  }
6984
7785
 
6985
7786
  if (lex_state_beg_p(parser) || spcarg) {
@@ -7076,7 +7877,7 @@ parser_lex(pm_parser_t *parser) {
7076
7877
  }
7077
7878
 
7078
7879
  if (lex_state_spcarg_p(parser, space_seen)) {
7079
- pm_diagnostic_list_append(&parser->warning_list, parser->current.start, parser->current.end, PM_WARN_AMBIGUOUS_SLASH);
7880
+ pm_parser_warn_token(parser, &parser->current, PM_WARN_AMBIGUOUS_SLASH);
7080
7881
  lex_mode_push_regexp(parser, '\0', '/');
7081
7882
  LEX(PM_TOKEN_REGEXP_BEGIN);
7082
7883
  }
@@ -7116,7 +7917,7 @@ parser_lex(pm_parser_t *parser) {
7116
7917
  // operator because we don't want to move into the string
7117
7918
  // lex mode unnecessarily.
7118
7919
  if ((lex_state_beg_p(parser) || lex_state_arg_p(parser)) && (parser->current.end >= parser->end)) {
7119
- pm_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, PM_ERR_INVALID_PERCENT);
7920
+ pm_parser_err_current(parser, PM_ERR_INVALID_PERCENT);
7120
7921
  LEX(PM_TOKEN_PERCENT);
7121
7922
  }
7122
7923
 
@@ -7149,7 +7950,7 @@ parser_lex(pm_parser_t *parser) {
7149
7950
  // validate that here.
7150
7951
  uint8_t delimiter = peek_offset(parser, 1);
7151
7952
  if (delimiter >= 0x80 || parser->encoding.alnum_char(&delimiter, 1)) {
7152
- pm_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, PM_ERR_INVALID_PERCENT);
7953
+ pm_parser_err_current(parser, PM_ERR_INVALID_PERCENT);
7153
7954
  goto lex_next_token;
7154
7955
  }
7155
7956
 
@@ -7249,7 +8050,7 @@ parser_lex(pm_parser_t *parser) {
7249
8050
  // unparseable. In this case we'll just drop it from the parser
7250
8051
  // and skip past it and hope that the next token is something
7251
8052
  // that we can parse.
7252
- pm_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, PM_ERR_INVALID_PERCENT);
8053
+ pm_parser_err_current(parser, PM_ERR_INVALID_PERCENT);
7253
8054
  goto lex_next_token;
7254
8055
  }
7255
8056
  }
@@ -7285,7 +8086,7 @@ parser_lex(pm_parser_t *parser) {
7285
8086
  // token as we've exhausted all of the other options. We'll skip past
7286
8087
  // it and return the next token.
7287
8088
  if (!width) {
7288
- pm_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, PM_ERR_INVALID_TOKEN);
8089
+ pm_parser_err_current(parser, PM_ERR_INVALID_TOKEN);
7289
8090
  goto lex_next_token;
7290
8091
  }
7291
8092
 
@@ -7351,7 +8152,7 @@ parser_lex(pm_parser_t *parser) {
7351
8152
  }
7352
8153
  }
7353
8154
  }
7354
- case PM_LEX_LIST:
8155
+ case PM_LEX_LIST: {
7355
8156
  if (parser->next_start != NULL) {
7356
8157
  parser->current.end = parser->next_start;
7357
8158
  parser->next_start = NULL;
@@ -7394,6 +8195,10 @@ parser_lex(pm_parser_t *parser) {
7394
8195
  const uint8_t *breakpoints = lex_mode->as.list.breakpoints;
7395
8196
  const uint8_t *breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
7396
8197
 
8198
+ // If we haven't found an escape yet, then this buffer will be
8199
+ // unallocated since we can refer directly to the source string.
8200
+ pm_token_buffer_t token_buffer = { 0 };
8201
+
7397
8202
  while (breakpoint != NULL) {
7398
8203
  // If we hit a null byte, skip directly past it.
7399
8204
  if (*breakpoint == '\0') {
@@ -7405,16 +8210,18 @@ parser_lex(pm_parser_t *parser) {
7405
8210
  // now, so we can return an element of the list.
7406
8211
  if (pm_char_is_whitespace(*breakpoint)) {
7407
8212
  parser->current.end = breakpoint;
8213
+ pm_token_buffer_flush(parser, &token_buffer);
7408
8214
  LEX(PM_TOKEN_STRING_CONTENT);
7409
8215
  }
7410
8216
 
7411
- //If we hit the terminator, we need to check which token to
8217
+ // If we hit the terminator, we need to check which token to
7412
8218
  // return.
7413
8219
  if (*breakpoint == lex_mode->as.list.terminator) {
7414
8220
  // If this terminator doesn't actually close the list, then
7415
8221
  // we need to continue on past it.
7416
8222
  if (lex_mode->as.list.nesting > 0) {
7417
- breakpoint = pm_strpbrk(parser, breakpoint + 1, breakpoints, parser->end - (breakpoint + 1));
8223
+ parser->current.end = breakpoint + 1;
8224
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
7418
8225
  lex_mode->as.list.nesting--;
7419
8226
  continue;
7420
8227
  }
@@ -7423,6 +8230,7 @@ parser_lex(pm_parser_t *parser) {
7423
8230
  // past content, then we can return a list node.
7424
8231
  if (breakpoint > parser->current.start) {
7425
8232
  parser->current.end = breakpoint;
8233
+ pm_token_buffer_flush(parser, &token_buffer);
7426
8234
  LEX(PM_TOKEN_STRING_CONTENT);
7427
8235
  }
7428
8236
 
@@ -7438,59 +8246,109 @@ parser_lex(pm_parser_t *parser) {
7438
8246
  // literally. In this case we'll skip past the next character
7439
8247
  // and find the next breakpoint.
7440
8248
  if (*breakpoint == '\\') {
7441
- pm_unescape_type_t unescape_type = lex_mode->as.list.interpolation ? PM_UNESCAPE_ALL : PM_UNESCAPE_MINIMAL;
7442
- size_t difference = pm_unescape_calculate_difference(parser, breakpoint, unescape_type, false);
7443
- if (difference == 0) {
7444
- // we're at the end of the file
8249
+ parser->current.end = breakpoint + 1;
8250
+
8251
+ // If we've hit the end of the file, then break out of the
8252
+ // loop by setting the breakpoint to NULL.
8253
+ if (parser->current.end == parser->end) {
7445
8254
  breakpoint = NULL;
7446
8255
  continue;
7447
8256
  }
7448
8257
 
7449
- // If the result is an escaped newline ...
7450
- if (breakpoint[difference - 1] == '\n') {
7451
- if (parser->heredoc_end) {
7452
- // ... if we are on the same line as a heredoc, flush the heredoc and
7453
- // continue parsing after heredoc_end.
7454
- parser->current.end = breakpoint + difference;
7455
- parser_flush_heredoc_end(parser);
7456
- LEX(PM_TOKEN_STRING_CONTENT);
7457
- } else {
7458
- // ... else track the newline.
7459
- pm_newline_list_append(&parser->newline_list, breakpoint + difference - 1);
7460
- }
8258
+ pm_token_buffer_escape(parser, &token_buffer);
8259
+ uint8_t peeked = peek(parser);
8260
+
8261
+ switch (peeked) {
8262
+ case ' ':
8263
+ case '\f':
8264
+ case '\t':
8265
+ case '\v':
8266
+ case '\\':
8267
+ pm_token_buffer_push(&token_buffer, peeked);
8268
+ parser->current.end++;
8269
+ break;
8270
+ case '\r':
8271
+ parser->current.end++;
8272
+ if (peek(parser) != '\n') {
8273
+ pm_token_buffer_push(&token_buffer, '\r');
8274
+ break;
8275
+ }
8276
+ /* fallthrough */
8277
+ case '\n':
8278
+ pm_token_buffer_push(&token_buffer, '\n');
8279
+
8280
+ if (parser->heredoc_end) {
8281
+ // ... if we are on the same line as a heredoc,
8282
+ // flush the heredoc and continue parsing after
8283
+ // heredoc_end.
8284
+ parser_flush_heredoc_end(parser);
8285
+ pm_token_buffer_copy(parser, &token_buffer);
8286
+ LEX(PM_TOKEN_STRING_CONTENT);
8287
+ } else {
8288
+ // ... else track the newline.
8289
+ pm_newline_list_append(&parser->newline_list, parser->current.end);
8290
+ }
8291
+
8292
+ parser->current.end++;
8293
+ break;
8294
+ default:
8295
+ if (peeked == lex_mode->as.list.incrementor || peeked == lex_mode->as.list.terminator) {
8296
+ pm_token_buffer_push(&token_buffer, peeked);
8297
+ parser->current.end++;
8298
+ } else if (lex_mode->as.list.interpolation) {
8299
+ escape_read(parser, &token_buffer.buffer, PM_ESCAPE_FLAG_NONE);
8300
+ } else {
8301
+ pm_token_buffer_push(&token_buffer, '\\');
8302
+ pm_token_buffer_push(&token_buffer, peeked);
8303
+ parser->current.end++;
8304
+ }
8305
+
8306
+ break;
7461
8307
  }
7462
8308
 
7463
- breakpoint = pm_strpbrk(parser, breakpoint + difference, breakpoints, parser->end - (breakpoint + difference));
8309
+ token_buffer.cursor = parser->current.end;
8310
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
7464
8311
  continue;
7465
8312
  }
7466
8313
 
7467
8314
  // If we hit a #, then we will attempt to lex interpolation.
7468
8315
  if (*breakpoint == '#') {
7469
8316
  pm_token_type_t type = lex_interpolation(parser, breakpoint);
7470
- if (type != PM_TOKEN_NOT_PROVIDED) {
7471
- LEX(type);
8317
+
8318
+ if (type == PM_TOKEN_NOT_PROVIDED) {
8319
+ // If we haven't returned at this point then we had something
8320
+ // that looked like an interpolated class or instance variable
8321
+ // like "#@" but wasn't actually. In this case we'll just skip
8322
+ // to the next breakpoint.
8323
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
8324
+ continue;
7472
8325
  }
7473
8326
 
7474
- // If we haven't returned at this point then we had something
7475
- // that looked like an interpolated class or instance variable
7476
- // like "#@" but wasn't actually. In this case we'll just skip
7477
- // to the next breakpoint.
7478
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
7479
- continue;
8327
+ if (type == PM_TOKEN_STRING_CONTENT) {
8328
+ pm_token_buffer_flush(parser, &token_buffer);
8329
+ }
8330
+
8331
+ LEX(type);
7480
8332
  }
7481
8333
 
7482
8334
  // If we've hit the incrementor, then we need to skip past it
7483
8335
  // and find the next breakpoint.
7484
8336
  assert(*breakpoint == lex_mode->as.list.incrementor);
7485
- breakpoint = pm_strpbrk(parser, breakpoint + 1, breakpoints, parser->end - (breakpoint + 1));
8337
+ parser->current.end = breakpoint + 1;
8338
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
7486
8339
  lex_mode->as.list.nesting++;
7487
8340
  continue;
7488
8341
  }
7489
8342
 
7490
- // If we were unable to find a breakpoint, then this token hits the end of
7491
- // the file.
7492
- LEX(PM_TOKEN_EOF);
8343
+ if (parser->current.end > parser->current.start) {
8344
+ pm_token_buffer_flush(parser, &token_buffer);
8345
+ LEX(PM_TOKEN_STRING_CONTENT);
8346
+ }
7493
8347
 
8348
+ // If we were unable to find a breakpoint, then this token hits the
8349
+ // end of the file.
8350
+ LEX(PM_TOKEN_EOF);
8351
+ }
7494
8352
  case PM_LEX_REGEXP: {
7495
8353
  // First, we'll set the start of this token to be the current end.
7496
8354
  if (parser->next_start == NULL) {
@@ -7515,11 +8373,13 @@ parser_lex(pm_parser_t *parser) {
7515
8373
  // characters.
7516
8374
  const uint8_t *breakpoints = lex_mode->as.regexp.breakpoints;
7517
8375
  const uint8_t *breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
8376
+ pm_token_buffer_t token_buffer = { 0 };
7518
8377
 
7519
8378
  while (breakpoint != NULL) {
7520
8379
  // If we hit a null byte, skip directly past it.
7521
8380
  if (*breakpoint == '\0') {
7522
- breakpoint = pm_strpbrk(parser, breakpoint + 1, breakpoints, parser->end - (breakpoint + 1));
8381
+ parser->current.end = breakpoint + 1;
8382
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
7523
8383
  continue;
7524
8384
  }
7525
8385
 
@@ -7540,7 +8400,8 @@ parser_lex(pm_parser_t *parser) {
7540
8400
  if (lex_mode->as.regexp.terminator != '\n') {
7541
8401
  // If the terminator is not a newline, then we can set
7542
8402
  // the next breakpoint and continue.
7543
- breakpoint = pm_strpbrk(parser, breakpoint + 1, breakpoints, parser->end - (breakpoint + 1));
8403
+ parser->current.end = breakpoint + 1;
8404
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
7544
8405
  continue;
7545
8406
  }
7546
8407
  }
@@ -7549,7 +8410,8 @@ parser_lex(pm_parser_t *parser) {
7549
8410
  // token to return.
7550
8411
  if (*breakpoint == lex_mode->as.regexp.terminator) {
7551
8412
  if (lex_mode->as.regexp.nesting > 0) {
7552
- breakpoint = pm_strpbrk(parser, breakpoint + 1, breakpoints, parser->end - (breakpoint + 1));
8413
+ parser->current.end = breakpoint + 1;
8414
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
7553
8415
  lex_mode->as.regexp.nesting--;
7554
8416
  continue;
7555
8417
  }
@@ -7559,11 +8421,12 @@ parser_lex(pm_parser_t *parser) {
7559
8421
  // first.
7560
8422
  if (breakpoint > parser->current.start) {
7561
8423
  parser->current.end = breakpoint;
8424
+ pm_token_buffer_flush(parser, &token_buffer);
7562
8425
  LEX(PM_TOKEN_STRING_CONTENT);
7563
8426
  }
7564
8427
 
7565
- // Since we've hit the terminator of the regular expression, we now
7566
- // need to parse the options.
8428
+ // Since we've hit the terminator of the regular expression,
8429
+ // we now need to parse the options.
7567
8430
  parser->current.end = breakpoint + 1;
7568
8431
  parser->current.end += pm_strspn_regexp_option(parser->current.end, parser->end - parser->current.end);
7569
8432
 
@@ -7576,56 +8439,103 @@ parser_lex(pm_parser_t *parser) {
7576
8439
  // literally. In this case we'll skip past the next character
7577
8440
  // and find the next breakpoint.
7578
8441
  if (*breakpoint == '\\') {
7579
- size_t difference = pm_unescape_calculate_difference(parser, breakpoint, PM_UNESCAPE_ALL, false);
7580
- if (difference == 0) {
7581
- // we're at the end of the file
8442
+ parser->current.end = breakpoint + 1;
8443
+
8444
+ // If we've hit the end of the file, then break out of the
8445
+ // loop by setting the breakpoint to NULL.
8446
+ if (parser->current.end == parser->end) {
7582
8447
  breakpoint = NULL;
7583
8448
  continue;
7584
8449
  }
7585
8450
 
7586
- // If the result is an escaped newline ...
7587
- if (breakpoint[difference - 1] == '\n') {
7588
- if (parser->heredoc_end) {
7589
- // ... if we are on the same line as a heredoc, flush the heredoc and
7590
- // continue parsing after heredoc_end.
7591
- parser->current.end = breakpoint + difference;
7592
- parser_flush_heredoc_end(parser);
7593
- LEX(PM_TOKEN_STRING_CONTENT);
7594
- } else {
7595
- // ... else track the newline.
7596
- pm_newline_list_append(&parser->newline_list, breakpoint + difference - 1);
7597
- }
8451
+ pm_token_buffer_escape(parser, &token_buffer);
8452
+ uint8_t peeked = peek(parser);
8453
+
8454
+ switch (peeked) {
8455
+ case '\r':
8456
+ parser->current.end++;
8457
+ if (peek(parser) != '\n') {
8458
+ pm_token_buffer_push(&token_buffer, '\\');
8459
+ pm_token_buffer_push(&token_buffer, '\r');
8460
+ break;
8461
+ }
8462
+ /* fallthrough */
8463
+ case '\n':
8464
+ if (parser->heredoc_end) {
8465
+ // ... if we are on the same line as a heredoc,
8466
+ // flush the heredoc and continue parsing after
8467
+ // heredoc_end.
8468
+ parser_flush_heredoc_end(parser);
8469
+ pm_token_buffer_copy(parser, &token_buffer);
8470
+ LEX(PM_TOKEN_STRING_CONTENT);
8471
+ } else {
8472
+ // ... else track the newline.
8473
+ pm_newline_list_append(&parser->newline_list, parser->current.end);
8474
+ }
8475
+
8476
+ parser->current.end++;
8477
+ break;
8478
+ case 'c':
8479
+ case 'C':
8480
+ case 'M':
8481
+ case 'u':
8482
+ case 'x':
8483
+ escape_read(parser, &token_buffer.buffer, PM_ESCAPE_FLAG_REGEXP);
8484
+ break;
8485
+ default:
8486
+ if (lex_mode->as.regexp.terminator == '/' && peeked == '/') {
8487
+ pm_token_buffer_push(&token_buffer, peeked);
8488
+ parser->current.end++;
8489
+ break;
8490
+ }
8491
+
8492
+ if (peeked < 0x80) pm_token_buffer_push(&token_buffer, '\\');
8493
+ pm_token_buffer_push(&token_buffer, peeked);
8494
+ parser->current.end++;
8495
+ break;
7598
8496
  }
7599
8497
 
7600
- breakpoint = pm_strpbrk(parser, breakpoint + difference, breakpoints, parser->end - (breakpoint + difference));
8498
+ token_buffer.cursor = parser->current.end;
8499
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
7601
8500
  continue;
7602
8501
  }
7603
8502
 
7604
8503
  // If we hit a #, then we will attempt to lex interpolation.
7605
8504
  if (*breakpoint == '#') {
7606
8505
  pm_token_type_t type = lex_interpolation(parser, breakpoint);
7607
- if (type != PM_TOKEN_NOT_PROVIDED) {
7608
- LEX(type);
8506
+
8507
+ if (type == PM_TOKEN_NOT_PROVIDED) {
8508
+ // If we haven't returned at this point then we had
8509
+ // something that looked like an interpolated class or
8510
+ // instance variable like "#@" but wasn't actually. In
8511
+ // this case we'll just skip to the next breakpoint.
8512
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
8513
+ continue;
7609
8514
  }
7610
8515
 
7611
- // If we haven't returned at this point then we had
7612
- // something that looked like an interpolated class or
7613
- // instance variable like "#@" but wasn't actually. In this
7614
- // case we'll just skip to the next breakpoint.
7615
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
7616
- continue;
8516
+ if (type == PM_TOKEN_STRING_CONTENT) {
8517
+ pm_token_buffer_flush(parser, &token_buffer);
8518
+ }
8519
+
8520
+ LEX(type);
7617
8521
  }
7618
8522
 
7619
8523
  // If we've hit the incrementor, then we need to skip past it
7620
8524
  // and find the next breakpoint.
7621
8525
  assert(*breakpoint == lex_mode->as.regexp.incrementor);
7622
- breakpoint = pm_strpbrk(parser, breakpoint + 1, breakpoints, parser->end - (breakpoint + 1));
8526
+ parser->current.end = breakpoint + 1;
8527
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
7623
8528
  lex_mode->as.regexp.nesting++;
7624
8529
  continue;
7625
8530
  }
7626
8531
 
7627
- // At this point, the breakpoint is NULL which means we were unable to
7628
- // find anything before the end of the file.
8532
+ if (parser->current.end > parser->current.start) {
8533
+ pm_token_buffer_flush(parser, &token_buffer);
8534
+ LEX(PM_TOKEN_STRING_CONTENT);
8535
+ }
8536
+
8537
+ // If we were unable to find a breakpoint, then this token hits the
8538
+ // end of the file.
7629
8539
  LEX(PM_TOKEN_EOF);
7630
8540
  }
7631
8541
  case PM_LEX_STRING: {
@@ -7646,30 +8556,34 @@ parser_lex(pm_parser_t *parser) {
7646
8556
 
7647
8557
  // These are the places where we need to split up the content of the
7648
8558
  // string. We'll use strpbrk to find the first of these characters.
7649
- const uint8_t *breakpoints = parser->lex_modes.current->as.string.breakpoints;
8559
+ pm_lex_mode_t *lex_mode = parser->lex_modes.current;
8560
+ const uint8_t *breakpoints = lex_mode->as.string.breakpoints;
7650
8561
  const uint8_t *breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
7651
8562
 
8563
+ // If we haven't found an escape yet, then this buffer will be
8564
+ // unallocated since we can refer directly to the source string.
8565
+ pm_token_buffer_t token_buffer = { 0 };
8566
+
7652
8567
  while (breakpoint != NULL) {
7653
8568
  // If we hit the incrementor, then we'll increment the nesting and
7654
8569
  // continue lexing.
7655
- if (
7656
- parser->lex_modes.current->as.string.incrementor != '\0' &&
7657
- *breakpoint == parser->lex_modes.current->as.string.incrementor
7658
- ) {
7659
- parser->lex_modes.current->as.string.nesting++;
7660
- breakpoint = pm_strpbrk(parser, breakpoint + 1, breakpoints, parser->end - (breakpoint + 1));
8570
+ if (lex_mode->as.string.incrementor != '\0' && *breakpoint == lex_mode->as.string.incrementor) {
8571
+ lex_mode->as.string.nesting++;
8572
+ parser->current.end = breakpoint + 1;
8573
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
7661
8574
  continue;
7662
8575
  }
7663
8576
 
7664
8577
  // Note that we have to check the terminator here first because we could
7665
8578
  // potentially be parsing a % string that has a # character as the
7666
8579
  // terminator.
7667
- if (*breakpoint == parser->lex_modes.current->as.string.terminator) {
8580
+ if (*breakpoint == lex_mode->as.string.terminator) {
7668
8581
  // If this terminator doesn't actually close the string, then we need
7669
8582
  // to continue on past it.
7670
- if (parser->lex_modes.current->as.string.nesting > 0) {
7671
- breakpoint = pm_strpbrk(parser, breakpoint + 1, breakpoints, parser->end - (breakpoint + 1));
7672
- parser->lex_modes.current->as.string.nesting--;
8583
+ if (lex_mode->as.string.nesting > 0) {
8584
+ parser->current.end = breakpoint + 1;
8585
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
8586
+ lex_mode->as.string.nesting--;
7673
8587
  continue;
7674
8588
  }
7675
8589
 
@@ -7677,6 +8591,7 @@ parser_lex(pm_parser_t *parser) {
7677
8591
  // then we need to return that content as string content first.
7678
8592
  if (breakpoint > parser->current.start) {
7679
8593
  parser->current.end = breakpoint;
8594
+ pm_token_buffer_flush(parser, &token_buffer);
7680
8595
  LEX(PM_TOKEN_STRING_CONTENT);
7681
8596
  }
7682
8597
 
@@ -7690,11 +8605,7 @@ parser_lex(pm_parser_t *parser) {
7690
8605
  parser->current.end = breakpoint + 1;
7691
8606
  }
7692
8607
 
7693
- if (
7694
- parser->lex_modes.current->as.string.label_allowed &&
7695
- (peek(parser) == ':') &&
7696
- (peek_offset(parser, 1) != ':')
7697
- ) {
8608
+ if (lex_mode->as.string.label_allowed && (peek(parser) == ':') && (peek_offset(parser, 1) != ':')) {
7698
8609
  parser->current.end++;
7699
8610
  lex_state_set(parser, PM_LEX_STATE_ARG | PM_LEX_STATE_LABELED);
7700
8611
  lex_mode_pop(parser);
@@ -7712,11 +8623,13 @@ parser_lex(pm_parser_t *parser) {
7712
8623
  if (*breakpoint == '\n') {
7713
8624
  if (parser->heredoc_end == NULL) {
7714
8625
  pm_newline_list_append(&parser->newline_list, breakpoint);
7715
- breakpoint = pm_strpbrk(parser, breakpoint + 1, breakpoints, parser->end - (breakpoint + 1));
8626
+ parser->current.end = breakpoint + 1;
8627
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
7716
8628
  continue;
7717
8629
  } else {
7718
8630
  parser->current.end = breakpoint + 1;
7719
8631
  parser_flush_heredoc_end(parser);
8632
+ pm_token_buffer_flush(parser, &token_buffer);
7720
8633
  LEX(PM_TOKEN_STRING_CONTENT);
7721
8634
  }
7722
8635
  }
@@ -7724,58 +8637,110 @@ parser_lex(pm_parser_t *parser) {
7724
8637
  switch (*breakpoint) {
7725
8638
  case '\0':
7726
8639
  // Skip directly past the null character.
7727
- breakpoint = pm_strpbrk(parser, breakpoint + 1, breakpoints, parser->end - (breakpoint + 1));
8640
+ parser->current.end = breakpoint + 1;
8641
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
7728
8642
  break;
7729
8643
  case '\\': {
7730
- // If we hit escapes, then we need to treat the next token
7731
- // literally. In this case we'll skip past the next character and
7732
- // find the next breakpoint.
7733
- pm_unescape_type_t unescape_type = parser->lex_modes.current->as.string.interpolation ? PM_UNESCAPE_ALL : PM_UNESCAPE_MINIMAL;
7734
- size_t difference = pm_unescape_calculate_difference(parser, breakpoint, unescape_type, false);
7735
- if (difference == 0) {
7736
- // we're at the end of the file
8644
+ // Here we hit escapes.
8645
+ parser->current.end = breakpoint + 1;
8646
+
8647
+ // If we've hit the end of the file, then break out of
8648
+ // the loop by setting the breakpoint to NULL.
8649
+ if (parser->current.end == parser->end) {
7737
8650
  breakpoint = NULL;
7738
- break;
8651
+ continue;
7739
8652
  }
7740
8653
 
7741
- // If the result is an escaped newline ...
7742
- if (breakpoint[difference - 1] == '\n') {
7743
- if (parser->heredoc_end) {
7744
- // ... if we are on the same line as a heredoc, flush the heredoc and
7745
- // continue parsing after heredoc_end.
7746
- parser->current.end = breakpoint + difference;
7747
- parser_flush_heredoc_end(parser);
7748
- LEX(PM_TOKEN_STRING_CONTENT);
7749
- } else {
7750
- // ... else track the newline.
7751
- pm_newline_list_append(&parser->newline_list, breakpoint + difference - 1);
7752
- }
8654
+ pm_token_buffer_escape(parser, &token_buffer);
8655
+ uint8_t peeked = peek(parser);
8656
+
8657
+ switch (peeked) {
8658
+ case '\\':
8659
+ pm_token_buffer_push(&token_buffer, '\\');
8660
+ parser->current.end++;
8661
+ break;
8662
+ case '\r':
8663
+ parser->current.end++;
8664
+ if (peek(parser) != '\n') {
8665
+ if (!lex_mode->as.string.interpolation) {
8666
+ pm_token_buffer_push(&token_buffer, '\\');
8667
+ }
8668
+ pm_token_buffer_push(&token_buffer, '\r');
8669
+ break;
8670
+ }
8671
+ /* fallthrough */
8672
+ case '\n':
8673
+ if (!lex_mode->as.string.interpolation) {
8674
+ pm_token_buffer_push(&token_buffer, '\\');
8675
+ pm_token_buffer_push(&token_buffer, '\n');
8676
+ }
8677
+
8678
+ if (parser->heredoc_end) {
8679
+ // ... if we are on the same line as a heredoc,
8680
+ // flush the heredoc and continue parsing after
8681
+ // heredoc_end.
8682
+ parser_flush_heredoc_end(parser);
8683
+ pm_token_buffer_copy(parser, &token_buffer);
8684
+ LEX(PM_TOKEN_STRING_CONTENT);
8685
+ } else {
8686
+ // ... else track the newline.
8687
+ pm_newline_list_append(&parser->newline_list, parser->current.end);
8688
+ }
8689
+
8690
+ parser->current.end++;
8691
+ break;
8692
+ default:
8693
+ if (lex_mode->as.string.incrementor != '\0' && peeked == lex_mode->as.string.incrementor) {
8694
+ pm_token_buffer_push(&token_buffer, peeked);
8695
+ parser->current.end++;
8696
+ } else if (lex_mode->as.string.terminator != '\0' && peeked == lex_mode->as.string.terminator) {
8697
+ pm_token_buffer_push(&token_buffer, peeked);
8698
+ parser->current.end++;
8699
+ } else if (lex_mode->as.string.interpolation) {
8700
+ escape_read(parser, &token_buffer.buffer, PM_ESCAPE_FLAG_NONE);
8701
+ } else {
8702
+ pm_token_buffer_push(&token_buffer, '\\');
8703
+ pm_token_buffer_push(&token_buffer, peeked);
8704
+ parser->current.end++;
8705
+ }
8706
+
8707
+ break;
7753
8708
  }
7754
8709
 
7755
- breakpoint = pm_strpbrk(parser, breakpoint + difference, breakpoints, parser->end - (breakpoint + difference));
8710
+ token_buffer.cursor = parser->current.end;
8711
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
7756
8712
  break;
7757
8713
  }
7758
8714
  case '#': {
7759
8715
  pm_token_type_t type = lex_interpolation(parser, breakpoint);
7760
- if (type != PM_TOKEN_NOT_PROVIDED) {
7761
- LEX(type);
8716
+
8717
+ if (type == PM_TOKEN_NOT_PROVIDED) {
8718
+ // If we haven't returned at this point then we had something that
8719
+ // looked like an interpolated class or instance variable like "#@"
8720
+ // but wasn't actually. In this case we'll just skip to the next
8721
+ // breakpoint.
8722
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
8723
+ break;
7762
8724
  }
7763
8725
 
7764
- // If we haven't returned at this point then we had something that
7765
- // looked like an interpolated class or instance variable like "#@"
7766
- // but wasn't actually. In this case we'll just skip to the next
7767
- // breakpoint.
7768
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
7769
- break;
8726
+ if (type == PM_TOKEN_STRING_CONTENT) {
8727
+ pm_token_buffer_flush(parser, &token_buffer);
8728
+ }
8729
+
8730
+ LEX(type);
7770
8731
  }
7771
8732
  default:
7772
8733
  assert(false && "unreachable");
7773
8734
  }
7774
8735
  }
7775
8736
 
8737
+ if (parser->current.end > parser->current.start) {
8738
+ pm_token_buffer_flush(parser, &token_buffer);
8739
+ LEX(PM_TOKEN_STRING_CONTENT);
8740
+ }
8741
+
7776
8742
  // If we've hit the end of the string, then this is an unterminated
7777
8743
  // string. In that case we'll return the EOF token.
7778
- parser->current.end = parser->end;
7779
8744
  LEX(PM_TOKEN_EOF);
7780
8745
  }
7781
8746
  case PM_LEX_HEREDOC: {
@@ -7797,16 +8762,15 @@ parser_lex(pm_parser_t *parser) {
7797
8762
 
7798
8763
  // Now let's grab the information about the identifier off of the current
7799
8764
  // lex mode.
7800
- const uint8_t *ident_start = parser->lex_modes.current->as.heredoc.ident_start;
7801
- size_t ident_length = parser->lex_modes.current->as.heredoc.ident_length;
8765
+ pm_lex_mode_t *lex_mode = parser->lex_modes.current;
8766
+ const uint8_t *ident_start = lex_mode->as.heredoc.ident_start;
8767
+ size_t ident_length = lex_mode->as.heredoc.ident_length;
7802
8768
 
7803
8769
  // If we are immediately following a newline and we have hit the
7804
8770
  // terminator, then we need to return the ending of the heredoc.
7805
8771
  if (current_token_starts_line(parser)) {
7806
8772
  const uint8_t *start = parser->current.start;
7807
- if (parser->lex_modes.current->as.heredoc.indent != PM_HEREDOC_INDENT_NONE) {
7808
- start += pm_strspn_inline_whitespace(start, parser->end - start);
7809
- }
8773
+ size_t whitespace = pm_heredoc_strspn_inline_whitespace(parser, &start, lex_mode->as.heredoc.indent);
7810
8774
 
7811
8775
  if ((start + ident_length <= parser->end) && (memcmp(start, ident_start, ident_length) == 0)) {
7812
8776
  bool matched = true;
@@ -7824,10 +8788,10 @@ parser_lex(pm_parser_t *parser) {
7824
8788
  }
7825
8789
 
7826
8790
  if (matched) {
7827
- if (*parser->lex_modes.current->as.heredoc.next_start == '\\') {
8791
+ if (*lex_mode->as.heredoc.next_start == '\\') {
7828
8792
  parser->next_start = NULL;
7829
8793
  } else {
7830
- parser->next_start = parser->lex_modes.current->as.heredoc.next_start;
8794
+ parser->next_start = lex_mode->as.heredoc.next_start;
7831
8795
  parser->heredoc_end = parser->current.end;
7832
8796
  }
7833
8797
 
@@ -7838,61 +8802,91 @@ parser_lex(pm_parser_t *parser) {
7838
8802
  LEX(PM_TOKEN_HEREDOC_END);
7839
8803
  }
7840
8804
  }
8805
+
8806
+ if (
8807
+ lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_TILDE &&
8808
+ (lex_mode->as.heredoc.common_whitespace > whitespace) &&
8809
+ peek_at(parser, start) != '\n'
8810
+ ) {
8811
+ lex_mode->as.heredoc.common_whitespace = whitespace;
8812
+ }
7841
8813
  }
7842
8814
 
7843
- // Otherwise we'll be parsing string content. These are the places where
7844
- // we need to split up the content of the heredoc. We'll use strpbrk to
7845
- // find the first of these characters.
8815
+ // Otherwise we'll be parsing string content. These are the places
8816
+ // where we need to split up the content of the heredoc. We'll use
8817
+ // strpbrk to find the first of these characters.
7846
8818
  uint8_t breakpoints[] = "\n\\#";
7847
8819
 
7848
- pm_heredoc_quote_t quote = parser->lex_modes.current->as.heredoc.quote;
8820
+ pm_heredoc_quote_t quote = lex_mode->as.heredoc.quote;
7849
8821
  if (quote == PM_HEREDOC_QUOTE_SINGLE) {
7850
8822
  breakpoints[2] = '\0';
7851
8823
  }
7852
8824
 
7853
8825
  const uint8_t *breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
8826
+ pm_token_buffer_t token_buffer = { 0 };
8827
+ bool was_escaped_newline = false;
7854
8828
 
7855
8829
  while (breakpoint != NULL) {
7856
8830
  switch (*breakpoint) {
7857
8831
  case '\0':
7858
8832
  // Skip directly past the null character.
7859
- breakpoint = pm_strpbrk(parser, breakpoint + 1, breakpoints, parser->end - (breakpoint + 1));
8833
+ parser->current.end = breakpoint + 1;
8834
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
7860
8835
  break;
7861
8836
  case '\n': {
7862
8837
  if (parser->heredoc_end != NULL && (parser->heredoc_end > breakpoint)) {
7863
8838
  parser_flush_heredoc_end(parser);
7864
8839
  parser->current.end = breakpoint + 1;
8840
+ pm_token_buffer_flush(parser, &token_buffer);
7865
8841
  LEX(PM_TOKEN_STRING_CONTENT);
7866
8842
  }
7867
8843
 
7868
8844
  pm_newline_list_append(&parser->newline_list, breakpoint);
7869
8845
 
8846
+ // If we have a - or ~ heredoc, then we can match after
8847
+ // some leading whitespace.
7870
8848
  const uint8_t *start = breakpoint + 1;
7871
- if (parser->lex_modes.current->as.heredoc.indent != PM_HEREDOC_INDENT_NONE) {
7872
- start += pm_strspn_inline_whitespace(start, parser->end - start);
7873
- }
8849
+ size_t whitespace = pm_heredoc_strspn_inline_whitespace(parser, &start, lex_mode->as.heredoc.indent);
7874
8850
 
7875
- // If we have hit a newline that is followed by a valid terminator,
7876
- // then we need to return the content of the heredoc here as string
7877
- // content. Then, the next time a token is lexed, it will match
7878
- // again and return the end of the heredoc.
8851
+ // If we have hit a newline that is followed by a valid
8852
+ // terminator, then we need to return the content of the
8853
+ // heredoc here as string content. Then, the next time a
8854
+ // token is lexed, it will match again and return the
8855
+ // end of the heredoc.
7879
8856
  if (
8857
+ !was_escaped_newline &&
7880
8858
  (start + ident_length <= parser->end) &&
7881
8859
  (memcmp(start, ident_start, ident_length) == 0)
7882
8860
  ) {
7883
- // Heredoc terminators must be followed by a newline, CRLF, or EOF to be valid.
8861
+ // Heredoc terminators must be followed by a
8862
+ // newline, CRLF, or EOF to be valid.
7884
8863
  if (
7885
8864
  start + ident_length == parser->end ||
7886
8865
  match_eol_at(parser, start + ident_length)
7887
8866
  ) {
7888
8867
  parser->current.end = breakpoint + 1;
8868
+ pm_token_buffer_flush(parser, &token_buffer);
8869
+ LEX(PM_TOKEN_STRING_CONTENT);
8870
+ }
8871
+ }
8872
+
8873
+ if (lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_TILDE) {
8874
+ if ((lex_mode->as.heredoc.common_whitespace > whitespace) && peek_at(parser, start) != '\n') {
8875
+ lex_mode->as.heredoc.common_whitespace = whitespace;
8876
+ }
8877
+
8878
+ parser->current.end = breakpoint + 1;
8879
+
8880
+ if (!was_escaped_newline) {
8881
+ pm_token_buffer_flush(parser, &token_buffer);
7889
8882
  LEX(PM_TOKEN_STRING_CONTENT);
7890
8883
  }
7891
8884
  }
7892
8885
 
7893
- // Otherwise we hit a newline and it wasn't followed by a
7894
- // terminator, so we can continue parsing.
7895
- breakpoint = pm_strpbrk(parser, breakpoint + 1, breakpoints, parser->end - (breakpoint + 1));
8886
+ // Otherwise we hit a newline and it wasn't followed by
8887
+ // a terminator, so we can continue parsing.
8888
+ parser->current.end = breakpoint + 1;
8889
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
7896
8890
  break;
7897
8891
  }
7898
8892
  case '\\': {
@@ -7902,46 +8896,98 @@ parser_lex(pm_parser_t *parser) {
7902
8896
  // stop looping before the newline and not after the
7903
8897
  // newline so that we can still potentially find the
7904
8898
  // terminator of the heredoc.
7905
- size_t eol_length = match_eol_at(parser, breakpoint + 1);
7906
- if (eol_length) {
7907
- breakpoint += eol_length;
7908
- } else {
7909
- pm_unescape_type_t unescape_type = (quote == PM_HEREDOC_QUOTE_SINGLE) ? PM_UNESCAPE_MINIMAL : PM_UNESCAPE_ALL;
7910
- size_t difference = pm_unescape_calculate_difference(parser, breakpoint, unescape_type, false);
7911
- if (difference == 0) {
7912
- // we're at the end of the file
7913
- breakpoint = NULL;
7914
- break;
7915
- }
8899
+ parser->current.end = breakpoint + 1;
8900
+
8901
+ // If we've hit the end of the file, then break out of
8902
+ // the loop by setting the breakpoint to NULL.
8903
+ if (parser->current.end == parser->end) {
8904
+ breakpoint = NULL;
8905
+ continue;
8906
+ }
7916
8907
 
7917
- pm_newline_list_check_append(&parser->newline_list, breakpoint + difference - 1);
8908
+ pm_token_buffer_escape(parser, &token_buffer);
8909
+ uint8_t peeked = peek(parser);
7918
8910
 
7919
- breakpoint = pm_strpbrk(parser, breakpoint + difference, breakpoints, parser->end - (breakpoint + difference));
8911
+ if (quote == PM_HEREDOC_QUOTE_SINGLE) {
8912
+ switch (peeked) {
8913
+ case '\r':
8914
+ parser->current.end++;
8915
+ if (peek(parser) != '\n') {
8916
+ pm_token_buffer_push(&token_buffer, '\\');
8917
+ pm_token_buffer_push(&token_buffer, '\r');
8918
+ break;
8919
+ }
8920
+ /* fallthrough */
8921
+ case '\n':
8922
+ pm_token_buffer_push(&token_buffer, '\\');
8923
+ pm_token_buffer_push(&token_buffer, '\n');
8924
+ token_buffer.cursor = parser->current.end + 1;
8925
+ breakpoint = parser->current.end;
8926
+ continue;
8927
+ default:
8928
+ parser->current.end++;
8929
+ pm_token_buffer_push(&token_buffer, '\\');
8930
+ pm_token_buffer_push(&token_buffer, peeked);
8931
+ break;
8932
+ }
8933
+ } else {
8934
+ switch (peeked) {
8935
+ case '\r':
8936
+ parser->current.end++;
8937
+ if (peek(parser) != '\n') {
8938
+ pm_token_buffer_push(&token_buffer, '\r');
8939
+ break;
8940
+ }
8941
+ /* fallthrough */
8942
+ case '\n':
8943
+ was_escaped_newline = true;
8944
+ token_buffer.cursor = parser->current.end + 1;
8945
+ breakpoint = parser->current.end;
8946
+ continue;
8947
+ default:
8948
+ escape_read(parser, &token_buffer.buffer, PM_ESCAPE_FLAG_NONE);
8949
+ break;
8950
+ }
7920
8951
  }
7921
8952
 
8953
+ token_buffer.cursor = parser->current.end;
8954
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
7922
8955
  break;
7923
8956
  }
7924
8957
  case '#': {
7925
8958
  pm_token_type_t type = lex_interpolation(parser, breakpoint);
7926
- if (type != PM_TOKEN_NOT_PROVIDED) {
7927
- LEX(type);
8959
+
8960
+ if (type == PM_TOKEN_NOT_PROVIDED) {
8961
+ // If we haven't returned at this point then we had
8962
+ // something that looked like an interpolated class
8963
+ // or instance variable like "#@" but wasn't
8964
+ // actually. In this case we'll just skip to the
8965
+ // next breakpoint.
8966
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
8967
+ break;
7928
8968
  }
7929
8969
 
7930
- // If we haven't returned at this point then we had something
7931
- // that looked like an interpolated class or instance variable
7932
- // like "#@" but wasn't actually. In this case we'll just skip
7933
- // to the next breakpoint.
7934
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
7935
- break;
8970
+ if (type == PM_TOKEN_STRING_CONTENT) {
8971
+ pm_token_buffer_flush(parser, &token_buffer);
8972
+ }
8973
+
8974
+ LEX(type);
7936
8975
  }
7937
8976
  default:
7938
8977
  assert(false && "unreachable");
7939
8978
  }
8979
+
8980
+ was_escaped_newline = false;
8981
+ }
8982
+
8983
+ if (parser->current.end > parser->current.start) {
8984
+ parser->current.end = parser->end;
8985
+ pm_token_buffer_flush(parser, &token_buffer);
8986
+ LEX(PM_TOKEN_STRING_CONTENT);
7940
8987
  }
7941
8988
 
7942
8989
  // If we've hit the end of the string, then this is an unterminated
7943
8990
  // heredoc. In that case we'll return the EOF token.
7944
- parser->current.end = parser->end;
7945
8991
  LEX(PM_TOKEN_EOF);
7946
8992
  }
7947
8993
  }
@@ -7955,67 +9001,6 @@ parser_lex(pm_parser_t *parser) {
7955
9001
  /* Parse functions */
7956
9002
  /******************************************************************************/
7957
9003
 
7958
- // When we are parsing certain content, we need to unescape the content to
7959
- // provide to the consumers of the parser. The following functions accept a range
7960
- // of characters from the source and unescapes into the provided type.
7961
- //
7962
- // We have functions for unescaping regular expression nodes, string nodes,
7963
- // symbol nodes, and xstring nodes
7964
- static pm_regular_expression_node_t *
7965
- pm_regular_expression_node_create_and_unescape(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *content, const pm_token_t *closing, pm_unescape_type_t unescape_type) {
7966
- pm_regular_expression_node_t *node = pm_regular_expression_node_create(parser, opening, content, closing);
7967
-
7968
- assert((content->end - content->start) >= 0);
7969
- pm_string_shared_init(&node->unescaped, content->start, content->end);
7970
-
7971
- pm_unescape_manipulate_string(parser, &node->unescaped, unescape_type);
7972
- return node;
7973
- }
7974
-
7975
- static pm_symbol_node_t *
7976
- pm_symbol_node_create_and_unescape(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *content, const pm_token_t *closing, pm_unescape_type_t unescape_type) {
7977
- pm_symbol_node_t *node = pm_symbol_node_create(parser, opening, content, closing);
7978
-
7979
- assert((content->end - content->start) >= 0);
7980
- pm_string_shared_init(&node->unescaped, content->start, content->end);
7981
-
7982
- pm_unescape_manipulate_string(parser, &node->unescaped, unescape_type);
7983
- return node;
7984
- }
7985
-
7986
- static pm_string_node_t *
7987
- pm_char_literal_node_create_and_unescape(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *content, const pm_token_t *closing, pm_unescape_type_t unescape_type) {
7988
- pm_string_node_t *node = pm_string_node_create(parser, opening, content, closing);
7989
-
7990
- assert((content->end - content->start) >= 0);
7991
- pm_string_shared_init(&node->unescaped, content->start, content->end);
7992
-
7993
- pm_unescape_manipulate_char_literal(parser, &node->unescaped, unescape_type);
7994
- return node;
7995
- }
7996
-
7997
- static pm_string_node_t *
7998
- pm_string_node_create_and_unescape(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *content, const pm_token_t *closing, pm_unescape_type_t unescape_type) {
7999
- pm_string_node_t *node = pm_string_node_create(parser, opening, content, closing);
8000
-
8001
- assert((content->end - content->start) >= 0);
8002
- pm_string_shared_init(&node->unescaped, content->start, content->end);
8003
-
8004
- pm_unescape_manipulate_string(parser, &node->unescaped, unescape_type);
8005
- return node;
8006
- }
8007
-
8008
- static pm_x_string_node_t *
8009
- pm_xstring_node_create_and_unescape(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *content, const pm_token_t *closing) {
8010
- pm_x_string_node_t *node = pm_xstring_node_create(parser, opening, content, closing);
8011
-
8012
- assert((content->end - content->start) >= 0);
8013
- pm_string_shared_init(&node->unescaped, content->start, content->end);
8014
-
8015
- pm_unescape_manipulate_string(parser, &node->unescaped, PM_UNESCAPE_ALL);
8016
- return node;
8017
- }
8018
-
8019
9004
  // These are the various precedence rules. Because we are using a Pratt parser,
8020
9005
  // they are named binding power to represent the manner in which nodes are bound
8021
9006
  // together in the stack.
@@ -8269,7 +9254,7 @@ expect1(pm_parser_t *parser, pm_token_type_t type, pm_diagnostic_id_t diag_id) {
8269
9254
  if (accept1(parser, type)) return;
8270
9255
 
8271
9256
  const uint8_t *location = parser->previous.end;
8272
- pm_diagnostic_list_append(&parser->error_list, location, location, diag_id);
9257
+ pm_parser_err(parser, location, location, diag_id);
8273
9258
 
8274
9259
  parser->previous.start = location;
8275
9260
  parser->previous.type = PM_TOKEN_MISSING;
@@ -8282,7 +9267,7 @@ expect2(pm_parser_t *parser, pm_token_type_t type1, pm_token_type_t type2, pm_di
8282
9267
  if (accept2(parser, type1, type2)) return;
8283
9268
 
8284
9269
  const uint8_t *location = parser->previous.end;
8285
- pm_diagnostic_list_append(&parser->error_list, location, location, diag_id);
9270
+ pm_parser_err(parser, location, location, diag_id);
8286
9271
 
8287
9272
  parser->previous.start = location;
8288
9273
  parser->previous.type = PM_TOKEN_MISSING;
@@ -8294,7 +9279,7 @@ expect3(pm_parser_t *parser, pm_token_type_t type1, pm_token_type_t type2, pm_to
8294
9279
  if (accept3(parser, type1, type2, type3)) return;
8295
9280
 
8296
9281
  const uint8_t *location = parser->previous.end;
8297
- pm_diagnostic_list_append(&parser->error_list, location, location, diag_id);
9282
+ pm_parser_err(parser, location, location, diag_id);
8298
9283
 
8299
9284
  parser->previous.start = location;
8300
9285
  parser->previous.type = PM_TOKEN_MISSING;
@@ -8389,23 +9374,23 @@ parse_starred_expression(pm_parser_t *parser, pm_binding_power_t binding_power,
8389
9374
  }
8390
9375
 
8391
9376
  // Convert the name of a method into the corresponding write method name. For
8392
- // exmaple, foo would be turned into foo=.
9377
+ // example, foo would be turned into foo=.
8393
9378
  static void
8394
- parse_write_name(pm_string_t *string) {
9379
+ parse_write_name(pm_parser_t *parser, pm_constant_id_t *name_field) {
8395
9380
  // The method name needs to change. If we previously had
8396
9381
  // foo, we now need foo=. In this case we'll allocate a new
8397
9382
  // owned string, copy the previous method name in, and
8398
9383
  // append an =.
8399
- size_t length = pm_string_length(string);
9384
+ pm_constant_t *constant = pm_constant_pool_id_to_constant(&parser->constant_pool, *name_field);
9385
+ size_t length = constant->length;
8400
9386
  uint8_t *name = calloc(length + 1, sizeof(uint8_t));
8401
9387
  if (name == NULL) return;
8402
9388
 
8403
- memcpy(name, pm_string_source(string), length);
9389
+ memcpy(name, constant->start, length);
8404
9390
  name[length] = '=';
8405
9391
 
8406
9392
  // Now switch the name to the new string.
8407
- pm_string_free(string);
8408
- pm_string_owned_init(string, name, length + 1);
9393
+ *name_field = pm_constant_pool_insert_owned(&parser->constant_pool, name, length + 1);
8409
9394
  }
8410
9395
 
8411
9396
  // Convert the given node into a valid target node.
@@ -8428,7 +9413,7 @@ parse_target(pm_parser_t *parser, pm_node_t *target) {
8428
9413
  return target;
8429
9414
  case PM_BACK_REFERENCE_READ_NODE:
8430
9415
  case PM_NUMBERED_REFERENCE_READ_NODE:
8431
- pm_diagnostic_list_append(&parser->error_list, target->location.start, target->location.end, PM_ERR_WRITE_TARGET_READONLY);
9416
+ pm_parser_err_node(parser, target, PM_ERR_WRITE_TARGET_READONLY);
8432
9417
  return target;
8433
9418
  case PM_GLOBAL_VARIABLE_READ_NODE:
8434
9419
  assert(sizeof(pm_global_variable_target_node_t) == sizeof(pm_global_variable_read_node_t));
@@ -8436,7 +9421,7 @@ parse_target(pm_parser_t *parser, pm_node_t *target) {
8436
9421
  return target;
8437
9422
  case PM_LOCAL_VARIABLE_READ_NODE:
8438
9423
  if (token_is_numbered_parameter(target->location.start, target->location.end)) {
8439
- pm_diagnostic_list_append(&parser->error_list, target->location.start, target->location.end, PM_ERR_PARAMETER_NUMBERED_RESERVED);
9424
+ pm_parser_err_node(parser, target, PM_ERR_PARAMETER_NUMBERED_RESERVED);
8440
9425
  } else {
8441
9426
  assert(sizeof(pm_local_variable_target_node_t) == sizeof(pm_local_variable_read_node_t));
8442
9427
  target->type = PM_LOCAL_VARIABLE_TARGET_NODE;
@@ -8489,21 +9474,23 @@ parse_target(pm_parser_t *parser, pm_node_t *target) {
8489
9474
  pm_parser_local_add_location(parser, message.start, message.end);
8490
9475
  pm_node_destroy(parser, target);
8491
9476
 
9477
+ uint32_t depth = 0;
9478
+ for (pm_scope_t *scope = parser->current_scope; scope && scope->transparent; depth++, scope = scope->previous);
8492
9479
  const pm_token_t name = { .type = PM_TOKEN_IDENTIFIER, .start = message.start, .end = message.end };
8493
- target = (pm_node_t *) pm_local_variable_read_node_create(parser, &name, 0);
9480
+ target = (pm_node_t *) pm_local_variable_read_node_create(parser, &name, depth);
8494
9481
 
8495
9482
  assert(sizeof(pm_local_variable_target_node_t) == sizeof(pm_local_variable_read_node_t));
8496
9483
  target->type = PM_LOCAL_VARIABLE_TARGET_NODE;
8497
9484
 
8498
9485
  if (token_is_numbered_parameter(message.start, message.end)) {
8499
- pm_diagnostic_list_append(&parser->error_list, message.start, message.end, PM_ERR_PARAMETER_NUMBERED_RESERVED);
9486
+ pm_parser_err_location(parser, &message, PM_ERR_PARAMETER_NUMBERED_RESERVED);
8500
9487
  }
8501
9488
 
8502
9489
  return target;
8503
9490
  }
8504
9491
 
8505
9492
  if (*call->message_loc.start == '_' || parser->encoding.alnum_char(call->message_loc.start, call->message_loc.end - call->message_loc.start)) {
8506
- parse_write_name(&call->name);
9493
+ parse_write_name(parser, &call->name);
8507
9494
  return (pm_node_t *) call;
8508
9495
  }
8509
9496
  }
@@ -8518,9 +9505,8 @@ parse_target(pm_parser_t *parser, pm_node_t *target) {
8518
9505
  (call->message_loc.end[-1] == ']') &&
8519
9506
  (call->block == NULL)
8520
9507
  ) {
8521
- // Free the previous name and replace it with "[]=".
8522
- pm_string_free(&call->name);
8523
- pm_string_constant_init(&call->name, "[]=", 3);
9508
+ // Replace the name with "[]=".
9509
+ call->name = pm_parser_constant_id_static(parser, "[]=", 3);
8524
9510
  return target;
8525
9511
  }
8526
9512
  }
@@ -8529,7 +9515,7 @@ parse_target(pm_parser_t *parser, pm_node_t *target) {
8529
9515
  // In this case we have a node that we don't know how to convert
8530
9516
  // into a target. We need to treat it as an error. For now, we'll
8531
9517
  // mark it as an error and just skip right past it.
8532
- pm_diagnostic_list_append(&parser->error_list, target->location.start, target->location.end, PM_ERR_WRITE_TARGET_UNEXPECTED);
9518
+ pm_parser_err_node(parser, target, PM_ERR_WRITE_TARGET_UNEXPECTED);
8533
9519
  return target;
8534
9520
  }
8535
9521
  }
@@ -8542,7 +9528,7 @@ parse_target_validate(pm_parser_t *parser, pm_node_t *target) {
8542
9528
 
8543
9529
  // Ensure that we have either an = or a ) after the targets.
8544
9530
  if (!match3(parser, PM_TOKEN_EQUAL, PM_TOKEN_PARENTHESIS_RIGHT, PM_TOKEN_KEYWORD_IN)) {
8545
- pm_diagnostic_list_append(&parser->error_list, result->location.start, result->location.end, PM_ERR_WRITE_TARGET_UNEXPECTED);
9531
+ pm_parser_err_node(parser, result, PM_ERR_WRITE_TARGET_UNEXPECTED);
8546
9532
  }
8547
9533
 
8548
9534
  return result;
@@ -8568,7 +9554,7 @@ parse_write(pm_parser_t *parser, pm_node_t *target, pm_token_t *operator, pm_nod
8568
9554
  }
8569
9555
  case PM_BACK_REFERENCE_READ_NODE:
8570
9556
  case PM_NUMBERED_REFERENCE_READ_NODE:
8571
- pm_diagnostic_list_append(&parser->error_list, target->location.start, target->location.end, PM_ERR_WRITE_TARGET_READONLY);
9557
+ pm_parser_err_node(parser, target, PM_ERR_WRITE_TARGET_READONLY);
8572
9558
  /* fallthrough */
8573
9559
  case PM_GLOBAL_VARIABLE_READ_NODE: {
8574
9560
  pm_global_variable_write_node_t *node = pm_global_variable_write_node_create(parser, target, operator, value);
@@ -8577,7 +9563,7 @@ parse_write(pm_parser_t *parser, pm_node_t *target, pm_token_t *operator, pm_nod
8577
9563
  }
8578
9564
  case PM_LOCAL_VARIABLE_READ_NODE: {
8579
9565
  if (token_is_numbered_parameter(target->location.start, target->location.end)) {
8580
- pm_diagnostic_list_append(&parser->error_list, target->location.start, target->location.end, PM_ERR_PARAMETER_NUMBERED_RESERVED);
9566
+ pm_parser_err_node(parser, target, PM_ERR_PARAMETER_NUMBERED_RESERVED);
8581
9567
  }
8582
9568
 
8583
9569
  pm_local_variable_read_node_t *local_read = (pm_local_variable_read_node_t *) target;
@@ -8642,7 +9628,7 @@ parse_write(pm_parser_t *parser, pm_node_t *target, pm_token_t *operator, pm_nod
8642
9628
  target = (pm_node_t *) pm_local_variable_write_node_create(parser, constant_id, 0, value, &message, operator);
8643
9629
 
8644
9630
  if (token_is_numbered_parameter(message.start, message.end)) {
8645
- pm_diagnostic_list_append(&parser->error_list, message.start, message.end, PM_ERR_PARAMETER_NUMBERED_RESERVED);
9631
+ pm_parser_err_location(parser, &message, PM_ERR_PARAMETER_NUMBERED_RESERVED);
8646
9632
  }
8647
9633
 
8648
9634
  return target;
@@ -8665,7 +9651,7 @@ parse_write(pm_parser_t *parser, pm_node_t *target, pm_token_t *operator, pm_nod
8665
9651
  pm_arguments_node_arguments_append(arguments, value);
8666
9652
  call->base.location.end = arguments->base.location.end;
8667
9653
 
8668
- parse_write_name(&call->name);
9654
+ parse_write_name(parser, &call->name);
8669
9655
  return (pm_node_t *) call;
8670
9656
  }
8671
9657
  }
@@ -8686,9 +9672,8 @@ parse_write(pm_parser_t *parser, pm_node_t *target, pm_token_t *operator, pm_nod
8686
9672
  pm_arguments_node_arguments_append(call->arguments, value);
8687
9673
  target->location.end = value->location.end;
8688
9674
 
8689
- // Free the previous name and replace it with "[]=".
8690
- pm_string_free(&call->name);
8691
- pm_string_constant_init(&call->name, "[]=", 3);
9675
+ // Replace the name with "[]=".
9676
+ call->name = pm_parser_constant_id_static(parser, "[]=", 3);
8692
9677
  return target;
8693
9678
  }
8694
9679
 
@@ -8704,7 +9689,7 @@ parse_write(pm_parser_t *parser, pm_node_t *target, pm_token_t *operator, pm_nod
8704
9689
  // In this case we have a node that we don't know how to convert into a
8705
9690
  // target. We need to treat it as an error. For now, we'll mark it as an
8706
9691
  // error and just skip right past it.
8707
- pm_diagnostic_list_append(&parser->error_list, operator->start, operator->end, PM_ERR_WRITE_TARGET_UNEXPECTED);
9692
+ pm_parser_err_token(parser, operator, PM_ERR_WRITE_TARGET_UNEXPECTED);
8708
9693
  return target;
8709
9694
  }
8710
9695
  }
@@ -8730,7 +9715,7 @@ parse_targets(pm_parser_t *parser, pm_node_t *first_target, pm_binding_power_t b
8730
9715
  // anonymous. It can be the final target or be in the middle if
8731
9716
  // there haven't been any others yet.
8732
9717
  if (has_splat) {
8733
- pm_diagnostic_list_append(&parser->error_list, parser->previous.start, parser->previous.end, PM_ERR_MULTI_ASSIGN_MULTI_SPLATS);
9718
+ pm_parser_err_previous(parser, PM_ERR_MULTI_ASSIGN_MULTI_SPLATS);
8734
9719
  }
8735
9720
 
8736
9721
  pm_token_t star_operator = parser->previous;
@@ -8770,7 +9755,7 @@ parse_targets_validate(pm_parser_t *parser, pm_node_t *first_target, pm_binding_
8770
9755
 
8771
9756
  // Ensure that we have either an = or a ) after the targets.
8772
9757
  if (!match2(parser, PM_TOKEN_EQUAL, PM_TOKEN_PARENTHESIS_RIGHT)) {
8773
- pm_diagnostic_list_append(&parser->error_list, result->location.start, result->location.end, PM_ERR_WRITE_TARGET_UNEXPECTED);
9758
+ pm_parser_err_node(parser, result, PM_ERR_WRITE_TARGET_UNEXPECTED);
8774
9759
  }
8775
9760
 
8776
9761
  return result;
@@ -8863,7 +9848,7 @@ parse_assocs(pm_parser_t *parser, pm_node_t *node) {
8863
9848
  if (token_begins_expression_p(parser->current.type)) {
8864
9849
  value = parse_expression(parser, PM_BINDING_POWER_DEFINED, PM_ERR_EXPECT_EXPRESSION_AFTER_SPLAT_HASH);
8865
9850
  } else if (pm_parser_local_depth(parser, &operator) == -1) {
8866
- pm_diagnostic_list_append(&parser->error_list, operator.start, operator.end, PM_ERR_EXPECT_EXPRESSION_AFTER_SPLAT_HASH);
9851
+ pm_parser_err_token(parser, &operator, PM_ERR_EXPECT_EXPRESSION_AFTER_SPLAT_HASH);
8867
9852
  }
8868
9853
 
8869
9854
  element = (pm_node_t *) pm_assoc_splat_node_create(parser, value, &operator);
@@ -8970,7 +9955,7 @@ parse_arguments(pm_parser_t *parser, pm_arguments_t *arguments, bool accepts_for
8970
9955
 
8971
9956
  while (!match1(parser, PM_TOKEN_EOF)) {
8972
9957
  if (parsed_block_argument) {
8973
- pm_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, PM_ERR_ARGUMENT_AFTER_BLOCK);
9958
+ pm_parser_err_current(parser, PM_ERR_ARGUMENT_AFTER_BLOCK);
8974
9959
  }
8975
9960
 
8976
9961
  pm_node_t *argument = NULL;
@@ -8979,7 +9964,7 @@ parse_arguments(pm_parser_t *parser, pm_arguments_t *arguments, bool accepts_for
8979
9964
  case PM_TOKEN_USTAR_STAR:
8980
9965
  case PM_TOKEN_LABEL: {
8981
9966
  if (parsed_bare_hash) {
8982
- pm_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, PM_ERR_ARGUMENT_BARE_HASH);
9967
+ pm_parser_err_current(parser, PM_ERR_ARGUMENT_BARE_HASH);
8983
9968
  }
8984
9969
 
8985
9970
  pm_keyword_hash_node_t *hash = pm_keyword_hash_node_create(parser);
@@ -9001,7 +9986,7 @@ parse_arguments(pm_parser_t *parser, pm_arguments_t *arguments, bool accepts_for
9001
9986
  if (token_begins_expression_p(parser->current.type)) {
9002
9987
  expression = parse_expression(parser, PM_BINDING_POWER_DEFINED, PM_ERR_EXPECT_ARGUMENT);
9003
9988
  } else if (pm_parser_local_depth(parser, &operator) == -1) {
9004
- pm_diagnostic_list_append(&parser->error_list, operator.start, operator.end, PM_ERR_ARGUMENT_NO_FORWARDING_AMP);
9989
+ pm_parser_err_token(parser, &operator, PM_ERR_ARGUMENT_NO_FORWARDING_AMP);
9005
9990
  }
9006
9991
 
9007
9992
  argument = (pm_node_t *) pm_block_argument_node_create(parser, &operator, expression);
@@ -9020,7 +10005,7 @@ parse_arguments(pm_parser_t *parser, pm_arguments_t *arguments, bool accepts_for
9020
10005
 
9021
10006
  if (match2(parser, PM_TOKEN_PARENTHESIS_RIGHT, PM_TOKEN_COMMA)) {
9022
10007
  if (pm_parser_local_depth(parser, &parser->previous) == -1) {
9023
- pm_diagnostic_list_append(&parser->error_list, operator.start, operator.end, PM_ERR_ARGUMENT_NO_FORWARDING_STAR);
10008
+ pm_parser_err_token(parser, &operator, PM_ERR_ARGUMENT_NO_FORWARDING_STAR);
9024
10009
  }
9025
10010
 
9026
10011
  argument = (pm_node_t *) pm_splat_node_create(parser, &operator, NULL);
@@ -9028,7 +10013,7 @@ parse_arguments(pm_parser_t *parser, pm_arguments_t *arguments, bool accepts_for
9028
10013
  pm_node_t *expression = parse_expression(parser, PM_BINDING_POWER_DEFINED, PM_ERR_EXPECT_EXPRESSION_AFTER_SPLAT);
9029
10014
 
9030
10015
  if (parsed_bare_hash) {
9031
- pm_diagnostic_list_append(&parser->error_list, operator.start, expression->location.end, PM_ERR_ARGUMENT_SPLAT_AFTER_ASSOC_SPLAT);
10016
+ pm_parser_err(parser, operator.start, expression->location.end, PM_ERR_ARGUMENT_SPLAT_AFTER_ASSOC_SPLAT);
9032
10017
  }
9033
10018
 
9034
10019
  argument = (pm_node_t *) pm_splat_node_create(parser, &operator, expression);
@@ -9049,7 +10034,7 @@ parse_arguments(pm_parser_t *parser, pm_arguments_t *arguments, bool accepts_for
9049
10034
  argument = (pm_node_t *) pm_range_node_create(parser, NULL, &operator, right);
9050
10035
  } else {
9051
10036
  if (pm_parser_local_depth(parser, &parser->previous) == -1) {
9052
- pm_diagnostic_list_append(&parser->error_list, parser->previous.start, parser->previous.end, PM_ERR_ARGUMENT_NO_FORWARDING_ELLIPSES);
10037
+ pm_parser_err_previous(parser, PM_ERR_ARGUMENT_NO_FORWARDING_ELLIPSES);
9053
10038
  }
9054
10039
 
9055
10040
  argument = (pm_node_t *) pm_forwarding_arguments_node_create(parser, &parser->previous);
@@ -9066,7 +10051,7 @@ parse_arguments(pm_parser_t *parser, pm_arguments_t *arguments, bool accepts_for
9066
10051
 
9067
10052
  if (pm_symbol_node_label_p(argument) || accept1(parser, PM_TOKEN_EQUAL_GREATER)) {
9068
10053
  if (parsed_bare_hash) {
9069
- pm_diagnostic_list_append(&parser->error_list, parser->previous.start, parser->previous.end, PM_ERR_ARGUMENT_BARE_HASH);
10054
+ pm_parser_err_previous(parser, PM_ERR_ARGUMENT_BARE_HASH);
9070
10055
  }
9071
10056
 
9072
10057
  pm_token_t operator;
@@ -9145,7 +10130,7 @@ parse_required_destructured_parameter(pm_parser_t *parser) {
9145
10130
 
9146
10131
  if (node->parameters.size > 0 && match1(parser, PM_TOKEN_PARENTHESIS_RIGHT)) {
9147
10132
  if (parsed_splat) {
9148
- pm_diagnostic_list_append(&parser->error_list, parser->previous.start, parser->previous.end, PM_ERR_ARGUMENT_SPLAT_AFTER_SPLAT);
10133
+ pm_parser_err_previous(parser, PM_ERR_ARGUMENT_SPLAT_AFTER_SPLAT);
9149
10134
  }
9150
10135
 
9151
10136
  param = (pm_node_t *) pm_splat_node_create(parser, &parser->previous, NULL);
@@ -9157,7 +10142,7 @@ parse_required_destructured_parameter(pm_parser_t *parser) {
9157
10142
  param = (pm_node_t *) parse_required_destructured_parameter(parser);
9158
10143
  } else if (accept1(parser, PM_TOKEN_USTAR)) {
9159
10144
  if (parsed_splat) {
9160
- pm_diagnostic_list_append(&parser->error_list, parser->previous.start, parser->previous.end, PM_ERR_ARGUMENT_SPLAT_AFTER_SPLAT);
10145
+ pm_parser_err_previous(parser, PM_ERR_ARGUMENT_SPLAT_AFTER_SPLAT);
9161
10146
  }
9162
10147
 
9163
10148
  pm_token_t star = parser->previous;
@@ -9166,6 +10151,7 @@ parse_required_destructured_parameter(pm_parser_t *parser) {
9166
10151
  if (accept1(parser, PM_TOKEN_IDENTIFIER)) {
9167
10152
  pm_token_t name = parser->previous;
9168
10153
  value = (pm_node_t *) pm_required_parameter_node_create(parser, &name);
10154
+ pm_parser_parameter_name_check(parser, &name);
9169
10155
  pm_parser_local_add_token(parser, &name);
9170
10156
  }
9171
10157
 
@@ -9176,6 +10162,7 @@ parse_required_destructured_parameter(pm_parser_t *parser) {
9176
10162
  pm_token_t name = parser->previous;
9177
10163
 
9178
10164
  param = (pm_node_t *) pm_required_parameter_node_create(parser, &name);
10165
+ pm_parser_parameter_name_check(parser, &name);
9179
10166
  pm_parser_local_add_token(parser, &name);
9180
10167
  }
9181
10168
 
@@ -9237,12 +10224,12 @@ update_parameter_state(pm_parser_t *parser, pm_token_t *token, pm_parameters_ord
9237
10224
  }
9238
10225
 
9239
10226
  if (token->type == PM_TOKEN_USTAR && *current == PM_PARAMETERS_ORDER_AFTER_OPTIONAL) {
9240
- pm_diagnostic_list_append(&parser->error_list, token->start, token->end, PM_ERR_PARAMETER_STAR);
10227
+ pm_parser_err_token(parser, token, PM_ERR_PARAMETER_STAR);
9241
10228
  }
9242
10229
 
9243
10230
  if (*current == PM_PARAMETERS_ORDER_NOTHING_AFTER || state > *current) {
9244
10231
  // We know what transition we failed on, so we can provide a better error here.
9245
- pm_diagnostic_list_append(&parser->error_list, token->start, token->end, PM_ERR_PARAMETER_ORDER);
10232
+ pm_parser_err_token(parser, token, PM_ERR_PARAMETER_ORDER);
9246
10233
  } else if (state < *current) {
9247
10234
  *current = state;
9248
10235
  }
@@ -9297,7 +10284,7 @@ parse_parameters(
9297
10284
  if (params->block == NULL) {
9298
10285
  pm_parameters_node_block_set(params, param);
9299
10286
  } else {
9300
- pm_diagnostic_list_append(&parser->error_list, param->base.location.start, param->base.location.end, PM_ERR_PARAMETER_BLOCK_MULTI);
10287
+ pm_parser_err_node(parser, (pm_node_t *) param, PM_ERR_PARAMETER_BLOCK_MULTI);
9301
10288
  pm_parameters_node_posts_append(params, (pm_node_t *) param);
9302
10289
  }
9303
10290
 
@@ -9305,7 +10292,7 @@ parse_parameters(
9305
10292
  }
9306
10293
  case PM_TOKEN_UDOT_DOT_DOT: {
9307
10294
  if (!allows_forwarding_parameter) {
9308
- pm_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, PM_ERR_ARGUMENT_NO_FORWARDING_ELLIPSES);
10295
+ pm_parser_err_current(parser, PM_ERR_ARGUMENT_NO_FORWARDING_ELLIPSES);
9309
10296
  }
9310
10297
  if (order > PM_PARAMETERS_ORDER_NOTHING_AFTER) {
9311
10298
  update_parameter_state(parser, &parser->current, &order);
@@ -9318,7 +10305,7 @@ parse_parameters(
9318
10305
  // forwarding parameter and move the keyword rest parameter to the posts list.
9319
10306
  pm_node_t *keyword_rest = params->keyword_rest;
9320
10307
  pm_parameters_node_posts_append(params, keyword_rest);
9321
- pm_diagnostic_list_append(&parser->error_list, parser->previous.start, parser->previous.end, PM_ERR_PARAMETER_UNEXPECTED_FWD);
10308
+ pm_parser_err_previous(parser, PM_ERR_PARAMETER_UNEXPECTED_FWD);
9322
10309
  params->keyword_rest = NULL;
9323
10310
  }
9324
10311
  pm_parameters_node_keyword_rest_set(params, (pm_node_t *)param);
@@ -9337,19 +10324,19 @@ parse_parameters(
9337
10324
  parser_lex(parser);
9338
10325
  switch (parser->previous.type) {
9339
10326
  case PM_TOKEN_CONSTANT:
9340
- pm_diagnostic_list_append(&parser->error_list, parser->previous.start, parser->previous.end, PM_ERR_ARGUMENT_FORMAL_CONSTANT);
10327
+ pm_parser_err_previous(parser, PM_ERR_ARGUMENT_FORMAL_CONSTANT);
9341
10328
  break;
9342
10329
  case PM_TOKEN_INSTANCE_VARIABLE:
9343
- pm_diagnostic_list_append(&parser->error_list, parser->previous.start, parser->previous.end, PM_ERR_ARGUMENT_FORMAL_IVAR);
10330
+ pm_parser_err_previous(parser, PM_ERR_ARGUMENT_FORMAL_IVAR);
9344
10331
  break;
9345
10332
  case PM_TOKEN_GLOBAL_VARIABLE:
9346
- pm_diagnostic_list_append(&parser->error_list, parser->previous.start, parser->previous.end, PM_ERR_ARGUMENT_FORMAL_GLOBAL);
10333
+ pm_parser_err_previous(parser, PM_ERR_ARGUMENT_FORMAL_GLOBAL);
9347
10334
  break;
9348
10335
  case PM_TOKEN_CLASS_VARIABLE:
9349
- pm_diagnostic_list_append(&parser->error_list, parser->previous.start, parser->previous.end, PM_ERR_ARGUMENT_FORMAL_CLASS);
10336
+ pm_parser_err_previous(parser, PM_ERR_ARGUMENT_FORMAL_CLASS);
9350
10337
  break;
9351
10338
  case PM_TOKEN_METHOD_NAME:
9352
- pm_diagnostic_list_append(&parser->error_list, parser->previous.start, parser->previous.end, PM_ERR_PARAMETER_METHOD_NAME);
10339
+ pm_parser_err_previous(parser, PM_ERR_PARAMETER_METHOD_NAME);
9353
10340
  break;
9354
10341
  default: break;
9355
10342
  }
@@ -9466,7 +10453,7 @@ parse_parameters(
9466
10453
  if (params->rest == NULL) {
9467
10454
  pm_parameters_node_rest_set(params, param);
9468
10455
  } else {
9469
- pm_diagnostic_list_append(&parser->error_list, param->base.location.start, param->base.location.end, PM_ERR_PARAMETER_SPLAT_MULTI);
10456
+ pm_parser_err_node(parser, (pm_node_t *) param, PM_ERR_PARAMETER_SPLAT_MULTI);
9470
10457
  pm_parameters_node_posts_append(params, (pm_node_t *) param);
9471
10458
  }
9472
10459
 
@@ -9500,7 +10487,7 @@ parse_parameters(
9500
10487
  if (params->keyword_rest == NULL) {
9501
10488
  pm_parameters_node_keyword_rest_set(params, param);
9502
10489
  } else {
9503
- pm_diagnostic_list_append(&parser->error_list, param->location.start, param->location.end, PM_ERR_PARAMETER_ASSOC_SPLAT_MULTI);
10490
+ pm_parser_err_node(parser, param, PM_ERR_PARAMETER_ASSOC_SPLAT_MULTI);
9504
10491
  pm_parameters_node_posts_append(params, param);
9505
10492
  }
9506
10493
 
@@ -9518,11 +10505,11 @@ parse_parameters(
9518
10505
  if (params->rest == NULL) {
9519
10506
  pm_parameters_node_rest_set(params, param);
9520
10507
  } else {
9521
- pm_diagnostic_list_append(&parser->error_list, param->base.location.start, param->base.location.end, PM_ERR_PARAMETER_SPLAT_MULTI);
10508
+ pm_parser_err_node(parser, (pm_node_t *) param, PM_ERR_PARAMETER_SPLAT_MULTI);
9522
10509
  pm_parameters_node_posts_append(params, (pm_node_t *) param);
9523
10510
  }
9524
10511
  } else {
9525
- pm_diagnostic_list_append(&parser->error_list, parser->previous.start, parser->previous.end, PM_ERR_PARAMETER_WILD_LOOSE_COMMA);
10512
+ pm_parser_err_previous(parser, PM_ERR_PARAMETER_WILD_LOOSE_COMMA);
9526
10513
  }
9527
10514
  }
9528
10515
 
@@ -9725,9 +10712,10 @@ parse_block_parameters(
9725
10712
  }
9726
10713
 
9727
10714
  pm_block_parameters_node_t *block_parameters = pm_block_parameters_node_create(parser, parameters, opening);
9728
- if (accept1(parser, PM_TOKEN_SEMICOLON)) {
10715
+ if ((opening->type != PM_TOKEN_NOT_PROVIDED) && accept1(parser, PM_TOKEN_SEMICOLON)) {
9729
10716
  do {
9730
10717
  expect1(parser, PM_TOKEN_IDENTIFIER, PM_ERR_BLOCK_PARAM_LOCAL_VARIABLE);
10718
+ pm_parser_parameter_name_check(parser, &parser->previous);
9731
10719
  pm_parser_local_add_token(parser, &parser->previous);
9732
10720
 
9733
10721
  pm_block_local_variable_node_t *local = pm_block_local_variable_node_create(parser, &parser->previous);
@@ -9850,7 +10838,7 @@ parse_arguments_list(pm_parser_t *parser, pm_arguments_t *arguments, bool accept
9850
10838
  if (arguments->block == NULL) {
9851
10839
  arguments->block = (pm_node_t *) block;
9852
10840
  } else {
9853
- pm_diagnostic_list_append(&parser->error_list, block->base.location.start, block->base.location.end, PM_ERR_ARGUMENT_BLOCK_MULTI);
10841
+ pm_parser_err_node(parser, (pm_node_t *) block, PM_ERR_ARGUMENT_BLOCK_MULTI);
9854
10842
  if (arguments->arguments == NULL) {
9855
10843
  arguments->arguments = pm_arguments_node_create(parser);
9856
10844
  }
@@ -9873,7 +10861,7 @@ parse_predicate(pm_parser_t *parser, pm_binding_power_t binding_power, pm_contex
9873
10861
  bool predicate_closed = accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON);
9874
10862
  predicate_closed |= accept1(parser, PM_TOKEN_KEYWORD_THEN);
9875
10863
  if (!predicate_closed) {
9876
- pm_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, PM_ERR_CONDITIONAL_PREDICATE_TERM);
10864
+ pm_parser_err_current(parser, PM_ERR_CONDITIONAL_PREDICATE_TERM);
9877
10865
  }
9878
10866
 
9879
10867
  context_pop(parser);
@@ -10057,25 +11045,12 @@ parse_string_part(pm_parser_t *parser) {
10057
11045
  // "aaa #{bbb} #@ccc ddd"
10058
11046
  // ^^^^ ^ ^^^^
10059
11047
  case PM_TOKEN_STRING_CONTENT: {
10060
- pm_unescape_type_t unescape_type = PM_UNESCAPE_ALL;
10061
-
10062
- if (parser->lex_modes.current->mode == PM_LEX_HEREDOC) {
10063
- if (parser->lex_modes.current->as.heredoc.indent == PM_HEREDOC_INDENT_TILDE) {
10064
- // If we're in a tilde heredoc, we want to unescape it later
10065
- // because we don't want unescaped newlines to disappear
10066
- // before we handle them in the dedent.
10067
- unescape_type = PM_UNESCAPE_NONE;
10068
- } else if (parser->lex_modes.current->as.heredoc.quote == PM_HEREDOC_QUOTE_SINGLE) {
10069
- unescape_type = PM_UNESCAPE_MINIMAL;
10070
- }
10071
- }
10072
-
10073
- parser_lex(parser);
10074
-
10075
11048
  pm_token_t opening = not_provided(parser);
10076
11049
  pm_token_t closing = not_provided(parser);
11050
+ pm_node_t *node = (pm_node_t *) pm_string_node_create_current_string(parser, &opening, &parser->current, &closing);
10077
11051
 
10078
- return (pm_node_t *) pm_string_node_create_and_unescape(parser, &opening, &parser->previous, &closing, unescape_type);
11052
+ parser_lex(parser);
11053
+ return node;
10079
11054
  }
10080
11055
  // Here the lexer has returned the beginning of an embedded expression. In
10081
11056
  // that case we'll parse the inner statements and return that as the part.
@@ -10166,7 +11141,7 @@ parse_string_part(pm_parser_t *parser) {
10166
11141
  }
10167
11142
  default:
10168
11143
  parser_lex(parser);
10169
- pm_diagnostic_list_append(&parser->error_list, parser->previous.start, parser->previous.end, PM_ERR_CANNOT_PARSE_STRING_PART);
11144
+ pm_parser_err_previous(parser, PM_ERR_CANNOT_PARSE_STRING_PART);
10170
11145
  return NULL;
10171
11146
  }
10172
11147
  }
@@ -10177,7 +11152,6 @@ parse_symbol(pm_parser_t *parser, pm_lex_mode_t *lex_mode, pm_lex_state_t next_s
10177
11152
 
10178
11153
  if (lex_mode->mode != PM_LEX_STRING) {
10179
11154
  if (next_state != PM_LEX_STATE_NONE) lex_state_set(parser, next_state);
10180
- pm_token_t symbol;
10181
11155
 
10182
11156
  switch (parser->current.type) {
10183
11157
  case PM_TOKEN_IDENTIFIER:
@@ -10190,21 +11164,21 @@ parse_symbol(pm_parser_t *parser, pm_lex_mode_t *lex_mode, pm_lex_state_t next_s
10190
11164
  case PM_TOKEN_BACK_REFERENCE:
10191
11165
  case PM_CASE_KEYWORD:
10192
11166
  parser_lex(parser);
10193
- symbol = parser->previous;
10194
11167
  break;
10195
11168
  case PM_CASE_OPERATOR:
10196
11169
  lex_state_set(parser, next_state == PM_LEX_STATE_NONE ? PM_LEX_STATE_ENDFN : next_state);
10197
11170
  parser_lex(parser);
10198
- symbol = parser->previous;
10199
11171
  break;
10200
11172
  default:
10201
11173
  expect2(parser, PM_TOKEN_IDENTIFIER, PM_TOKEN_METHOD_NAME, PM_ERR_SYMBOL_INVALID);
10202
- symbol = parser->previous;
10203
11174
  break;
10204
11175
  }
10205
11176
 
10206
11177
  pm_token_t closing = not_provided(parser);
10207
- return (pm_node_t *) pm_symbol_node_create_and_unescape(parser, &opening, &symbol, &closing, PM_UNESCAPE_ALL);
11178
+ pm_symbol_node_t *symbol = pm_symbol_node_create(parser, &opening, &parser->previous, &closing);
11179
+
11180
+ pm_string_shared_init(&symbol->unescaped, parser->previous.start, parser->previous.end);
11181
+ return (pm_node_t *) symbol;
10208
11182
  }
10209
11183
 
10210
11184
  if (lex_mode->as.string.interpolation) {
@@ -10215,7 +11189,7 @@ parse_symbol(pm_parser_t *parser, pm_lex_mode_t *lex_mode, pm_lex_state_t next_s
10215
11189
 
10216
11190
  pm_token_t content = not_provided(parser);
10217
11191
  pm_token_t closing = parser->previous;
10218
- return (pm_node_t *) pm_symbol_node_create_and_unescape(parser, &opening, &content, &closing, PM_UNESCAPE_NONE);
11192
+ return (pm_node_t *) pm_symbol_node_create(parser, &opening, &content, &closing);
10219
11193
  }
10220
11194
 
10221
11195
  // Now we can parse the first part of the symbol.
@@ -10248,18 +11222,23 @@ parse_symbol(pm_parser_t *parser, pm_lex_mode_t *lex_mode, pm_lex_state_t next_s
10248
11222
  }
10249
11223
 
10250
11224
  pm_token_t content;
10251
- if (accept1(parser, PM_TOKEN_STRING_CONTENT)) {
10252
- content = parser->previous;
11225
+ pm_string_t unescaped;
11226
+
11227
+ if (match1(parser, PM_TOKEN_STRING_CONTENT)) {
11228
+ content = parser->current;
11229
+ unescaped = parser->current_string;
11230
+ parser_lex(parser);
10253
11231
  } else {
10254
11232
  content = (pm_token_t) { .type = PM_TOKEN_STRING_CONTENT, .start = parser->previous.end, .end = parser->previous.end };
11233
+ pm_string_shared_init(&unescaped, content.start, content.end);
10255
11234
  }
10256
11235
 
10257
11236
  if (next_state != PM_LEX_STATE_NONE) {
10258
11237
  lex_state_set(parser, next_state);
10259
11238
  }
10260
- expect1(parser, PM_TOKEN_STRING_END, PM_ERR_SYMBOL_TERM_DYNAMIC);
10261
11239
 
10262
- return (pm_node_t *) pm_symbol_node_create_and_unescape(parser, &opening, &content, &parser->previous, PM_UNESCAPE_ALL);
11240
+ expect1(parser, PM_TOKEN_STRING_END, PM_ERR_SYMBOL_TERM_DYNAMIC);
11241
+ return (pm_node_t *) pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped);
10263
11242
  }
10264
11243
 
10265
11244
  // Parse an argument to undef which can either be a bare word, a
@@ -10276,8 +11255,10 @@ parse_undef_argument(pm_parser_t *parser) {
10276
11255
 
10277
11256
  pm_token_t opening = not_provided(parser);
10278
11257
  pm_token_t closing = not_provided(parser);
11258
+ pm_symbol_node_t *symbol = pm_symbol_node_create(parser, &opening, &parser->previous, &closing);
10279
11259
 
10280
- return (pm_node_t *) pm_symbol_node_create_and_unescape(parser, &opening, &parser->previous, &closing, PM_UNESCAPE_ALL);
11260
+ pm_string_shared_init(&symbol->unescaped, parser->previous.start, parser->previous.end);
11261
+ return (pm_node_t *) symbol;
10281
11262
  }
10282
11263
  case PM_TOKEN_SYMBOL_BEGIN: {
10283
11264
  pm_lex_mode_t lex_mode = *parser->lex_modes.current;
@@ -10286,7 +11267,7 @@ parse_undef_argument(pm_parser_t *parser) {
10286
11267
  return parse_symbol(parser, &lex_mode, PM_LEX_STATE_NONE);
10287
11268
  }
10288
11269
  default:
10289
- pm_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, PM_ERR_UNDEF_ARGUMENT);
11270
+ pm_parser_err_current(parser, PM_ERR_UNDEF_ARGUMENT);
10290
11271
  return (pm_node_t *) pm_missing_node_create(parser, parser->current.start, parser->current.end);
10291
11272
  }
10292
11273
  }
@@ -10310,8 +11291,10 @@ parse_alias_argument(pm_parser_t *parser, bool first) {
10310
11291
  parser_lex(parser);
10311
11292
  pm_token_t opening = not_provided(parser);
10312
11293
  pm_token_t closing = not_provided(parser);
11294
+ pm_symbol_node_t *symbol = pm_symbol_node_create(parser, &opening, &parser->previous, &closing);
10313
11295
 
10314
- return (pm_node_t *) pm_symbol_node_create_and_unescape(parser, &opening, &parser->previous, &closing, PM_UNESCAPE_ALL);
11296
+ pm_string_shared_init(&symbol->unescaped, parser->previous.start, parser->previous.end);
11297
+ return (pm_node_t *) symbol;
10315
11298
  }
10316
11299
  case PM_TOKEN_SYMBOL_BEGIN: {
10317
11300
  pm_lex_mode_t lex_mode = *parser->lex_modes.current;
@@ -10329,7 +11312,7 @@ parse_alias_argument(pm_parser_t *parser, bool first) {
10329
11312
  parser_lex(parser);
10330
11313
  return (pm_node_t *) pm_global_variable_read_node_create(parser, &parser->previous);
10331
11314
  default:
10332
- pm_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, PM_ERR_ALIAS_ARGUMENT);
11315
+ pm_parser_err_current(parser, PM_ERR_ALIAS_ARGUMENT);
10333
11316
  return (pm_node_t *) pm_missing_node_create(parser, parser->current.start, parser->current.end);
10334
11317
  }
10335
11318
  }
@@ -10366,9 +11349,9 @@ parse_variable_call(pm_parser_t *parser) {
10366
11349
  // local variable read. If it's not, then we'll create a normal call
10367
11350
  // node but add an error.
10368
11351
  if (parser->current_scope->explicit_params) {
10369
- pm_diagnostic_list_append(&parser->error_list, parser->previous.start, parser->previous.end, PM_ERR_NUMBERED_PARAMETER_NOT_ALLOWED);
11352
+ pm_parser_err_previous(parser, PM_ERR_NUMBERED_PARAMETER_NOT_ALLOWED);
10370
11353
  } else if (outer_scope_using_numbered_params_p(parser)) {
10371
- pm_diagnostic_list_append(&parser->error_list, parser->previous.start, parser->previous.end, PM_ERR_NUMBERED_PARAMETER_OUTER_SCOPE);
11354
+ pm_parser_err_previous(parser, PM_ERR_NUMBERED_PARAMETER_OUTER_SCOPE);
10372
11355
  } else {
10373
11356
  // When you use a numbered parameter, it implies the existence
10374
11357
  // of all of the locals that exist before it. For example,
@@ -10421,76 +11404,8 @@ parse_method_definition_name(pm_parser_t *parser) {
10421
11404
  }
10422
11405
  }
10423
11406
 
10424
- static int
10425
- parse_heredoc_common_whitespace_for_single_node(pm_parser_t *parser, pm_node_t *node, int common_whitespace)
10426
- {
10427
- const pm_location_t *content_loc = &((pm_string_node_t *) node)->content_loc;
10428
- int cur_whitespace;
10429
- const uint8_t *cur_char = content_loc->start;
10430
-
10431
- while (cur_char && cur_char < content_loc->end) {
10432
- // Any empty newlines aren't included in the minimum whitespace
10433
- // calculation.
10434
- size_t eol_length;
10435
- while ((eol_length = match_eol_at(parser, cur_char))) {
10436
- cur_char += eol_length;
10437
- }
10438
-
10439
- if (cur_char == content_loc->end) break;
10440
-
10441
- cur_whitespace = 0;
10442
-
10443
- while (pm_char_is_inline_whitespace(*cur_char) && cur_char < content_loc->end) {
10444
- if (cur_char[0] == '\t') {
10445
- cur_whitespace = (cur_whitespace / PM_TAB_WHITESPACE_SIZE + 1) * PM_TAB_WHITESPACE_SIZE;
10446
- } else {
10447
- cur_whitespace++;
10448
- }
10449
- cur_char++;
10450
- }
10451
-
10452
- // If we hit a newline, then we have encountered a line that
10453
- // contains only whitespace, and it shouldn't be considered in
10454
- // the calculation of common leading whitespace.
10455
- eol_length = match_eol_at(parser, cur_char);
10456
- if (eol_length) {
10457
- cur_char += eol_length;
10458
- continue;
10459
- }
10460
-
10461
- if (cur_whitespace < common_whitespace || common_whitespace == -1) {
10462
- common_whitespace = cur_whitespace;
10463
- }
10464
-
10465
- cur_char = next_newline(cur_char + 1, parser->end - (cur_char + 1));
10466
- if (cur_char) cur_char++;
10467
- }
10468
- return common_whitespace;
10469
- }
10470
-
10471
- // Calculate the common leading whitespace for each line in a heredoc.
10472
- static int
10473
- parse_heredoc_common_whitespace(pm_parser_t *parser, pm_node_list_t *nodes) {
10474
- int common_whitespace = -1;
10475
-
10476
- for (size_t index = 0; index < nodes->size; index++) {
10477
- pm_node_t *node = nodes->nodes[index];
10478
- if (!PM_NODE_TYPE_P(node, PM_STRING_NODE)) continue;
10479
-
10480
- // If the previous node wasn't a string node, we don't want to trim
10481
- // whitespace. This could happen after an interpolated expression or
10482
- // variable.
10483
- if (index == 0 || PM_NODE_TYPE_P(nodes->nodes[index - 1], PM_STRING_NODE)) {
10484
- common_whitespace = parse_heredoc_common_whitespace_for_single_node(parser, node, common_whitespace);
10485
- }
10486
- }
10487
-
10488
- return common_whitespace;
10489
- }
10490
-
10491
- static pm_string_t *
10492
- parse_heredoc_dedent_single_node(pm_parser_t *parser, pm_string_t *string, bool dedent_node, int common_whitespace, pm_heredoc_quote_t quote)
10493
- {
11407
+ static void
11408
+ parse_heredoc_dedent_string(pm_string_t *string, size_t common_whitespace) {
10494
11409
  // Get a reference to the string struct that is being held by the string
10495
11410
  // node. This is the value we're going to actually manipulate.
10496
11411
  pm_string_ensure_owned(string);
@@ -10499,85 +11414,37 @@ parse_heredoc_dedent_single_node(pm_parser_t *parser, pm_string_t *string, bool
10499
11414
  // destination to move bytes into. We'll also use it for bounds checking
10500
11415
  // since we don't require that these strings be null terminated.
10501
11416
  size_t dest_length = pm_string_length(string);
10502
- uint8_t *source_start = (uint8_t *) string->source;
10503
-
10504
- const uint8_t *source_cursor = source_start;
11417
+ const uint8_t *source_cursor = (uint8_t *) string->source;
10505
11418
  const uint8_t *source_end = source_cursor + dest_length;
10506
11419
 
10507
11420
  // We're going to move bytes backward in the string when we get leading
10508
11421
  // whitespace, so we'll maintain a pointer to the current position in the
10509
11422
  // string that we're writing to.
10510
- uint8_t *dest_cursor = source_start;
10511
-
10512
- while (source_cursor < source_end) {
10513
- // If we need to dedent the next element within the heredoc or the next
10514
- // line within the string node, then we'll do it here.
10515
- if (dedent_node) {
10516
- int trimmed_whitespace = 0;
10517
-
10518
- // While we haven't reached the amount of common whitespace that we need
10519
- // to trim and we haven't reached the end of the string, we'll keep
10520
- // trimming whitespace. Trimming in this context means skipping over
10521
- // these bytes such that they aren't copied into the new string.
10522
- while ((source_cursor < source_end) && pm_char_is_inline_whitespace(*source_cursor) && trimmed_whitespace < common_whitespace) {
10523
- if (*source_cursor == '\t') {
10524
- trimmed_whitespace = (trimmed_whitespace / PM_TAB_WHITESPACE_SIZE + 1) * PM_TAB_WHITESPACE_SIZE;
10525
- if (trimmed_whitespace > common_whitespace) break;
10526
- } else {
10527
- trimmed_whitespace++;
10528
- }
10529
-
10530
- source_cursor++;
10531
- dest_length--;
10532
- }
10533
- }
10534
-
10535
- // At this point we have dedented all that we need to, so we need to find
10536
- // the next newline.
10537
- const uint8_t *breakpoint = next_newline(source_cursor, source_end - source_cursor);
10538
-
10539
- if (breakpoint == NULL) {
10540
- // If there isn't another newline, then we can just move the rest of the
10541
- // string and break from the loop.
10542
- memmove(dest_cursor, source_cursor, (size_t) (source_end - source_cursor));
10543
- break;
11423
+ size_t trimmed_whitespace = 0;
11424
+
11425
+ // While we haven't reached the amount of common whitespace that we need to
11426
+ // trim and we haven't reached the end of the string, we'll keep trimming
11427
+ // whitespace. Trimming in this context means skipping over these bytes such
11428
+ // that they aren't copied into the new string.
11429
+ while ((source_cursor < source_end) && pm_char_is_inline_whitespace(*source_cursor) && trimmed_whitespace < common_whitespace) {
11430
+ if (*source_cursor == '\t') {
11431
+ trimmed_whitespace = (trimmed_whitespace / PM_TAB_WHITESPACE_SIZE + 1) * PM_TAB_WHITESPACE_SIZE;
11432
+ if (trimmed_whitespace > common_whitespace) break;
11433
+ } else {
11434
+ trimmed_whitespace++;
10544
11435
  }
10545
11436
 
10546
- // Otherwise, we need to move everything including the newline, and
10547
- // then set the dedent_node flag to true.
10548
- if (breakpoint < source_end) breakpoint++;
10549
- memmove(dest_cursor, source_cursor, (size_t) (breakpoint - source_cursor));
10550
- dest_cursor += (breakpoint - source_cursor);
10551
- source_cursor = breakpoint;
10552
- dedent_node = true;
11437
+ source_cursor++;
11438
+ dest_length--;
10553
11439
  }
10554
11440
 
10555
- // We only want to write this node into the list if it has any content.
11441
+ memmove((uint8_t *) string->source, source_cursor, (size_t) (source_end - source_cursor));
10556
11442
  string->length = dest_length;
10557
-
10558
- if (dest_length != 0) {
10559
- pm_unescape_manipulate_string(parser, string, (quote == PM_HEREDOC_QUOTE_SINGLE) ? PM_UNESCAPE_MINIMAL : PM_UNESCAPE_ALL);
10560
- }
10561
- return string;
10562
11443
  }
10563
11444
 
10564
11445
  // Take a heredoc node that is indented by a ~ and trim the leading whitespace.
10565
11446
  static void
10566
- parse_heredoc_dedent(pm_parser_t *parser, pm_node_t *heredoc_node, pm_heredoc_quote_t quote)
10567
- {
10568
- pm_node_list_t *nodes;
10569
-
10570
- if (quote == PM_HEREDOC_QUOTE_BACKTICK) {
10571
- nodes = &((pm_interpolated_x_string_node_t *) heredoc_node)->parts;
10572
- } else {
10573
- nodes = &((pm_interpolated_string_node_t *) heredoc_node)->parts;
10574
- }
10575
-
10576
- // First, calculate how much common whitespace we need to trim. If there is
10577
- // none or it's 0, then we can return early.
10578
- int common_whitespace;
10579
- if ((common_whitespace = parse_heredoc_common_whitespace(parser, nodes)) <= 0) return;
10580
-
11447
+ parse_heredoc_dedent(pm_parser_t *parser, pm_node_list_t *nodes, size_t common_whitespace) {
10581
11448
  // The next node should be dedented if it's the first node in the list or if
10582
11449
  // if follows a string node.
10583
11450
  bool dedent_next = true;
@@ -10600,7 +11467,10 @@ parse_heredoc_dedent(pm_parser_t *parser, pm_node_t *heredoc_node, pm_heredoc_qu
10600
11467
  }
10601
11468
 
10602
11469
  pm_string_node_t *string_node = ((pm_string_node_t *) node);
10603
- parse_heredoc_dedent_single_node(parser, &string_node->unescaped, dedent_next, common_whitespace, quote);
11470
+ if (dedent_next) {
11471
+ parse_heredoc_dedent_string(&string_node->unescaped, common_whitespace);
11472
+ }
11473
+
10604
11474
  if (string_node->unescaped.length == 0) {
10605
11475
  pm_node_destroy(parser, node);
10606
11476
  } else {
@@ -10929,13 +11799,13 @@ parse_pattern_primitive(pm_parser_t *parser, pm_diagnostic_id_t diag_id) {
10929
11799
  case PM_TOKEN_STRING_BEGIN:
10930
11800
  key = parse_expression(parser, PM_BINDING_POWER_MAX, PM_ERR_PATTERN_HASH_KEY);
10931
11801
  if (!pm_symbol_node_label_p(key)) {
10932
- pm_diagnostic_list_append(&parser->error_list, key->location.start, key->location.end, PM_ERR_PATTERN_HASH_KEY_LABEL);
11802
+ pm_parser_err_node(parser, key, PM_ERR_PATTERN_HASH_KEY_LABEL);
10933
11803
  }
10934
11804
 
10935
11805
  break;
10936
11806
  default:
10937
11807
  parser_lex(parser);
10938
- pm_diagnostic_list_append(&parser->error_list, parser->previous.start, parser->previous.end, PM_ERR_PATTERN_HASH_KEY);
11808
+ pm_parser_err_previous(parser, PM_ERR_PATTERN_HASH_KEY);
10939
11809
  key = (pm_node_t *) pm_missing_node_create(parser, parser->previous.start, parser->previous.end);
10940
11810
  break;
10941
11811
  }
@@ -10970,7 +11840,7 @@ parse_pattern_primitive(pm_parser_t *parser, pm_diagnostic_id_t diag_id) {
10970
11840
  return (pm_node_t *) pm_range_node_create(parser, NULL, &operator, right);
10971
11841
  }
10972
11842
  default: {
10973
- pm_diagnostic_list_append(&parser->error_list, operator.start, operator.end, PM_ERR_PATTERN_EXPRESSION_AFTER_RANGE);
11843
+ pm_parser_err_token(parser, &operator, PM_ERR_PATTERN_EXPRESSION_AFTER_RANGE);
10974
11844
  pm_node_t *right = (pm_node_t *) pm_missing_node_create(parser, operator.start, operator.end);
10975
11845
  return (pm_node_t *) pm_range_node_create(parser, NULL, &operator, right);
10976
11846
  }
@@ -11058,7 +11928,7 @@ parse_pattern_primitive(pm_parser_t *parser, pm_diagnostic_id_t diag_id) {
11058
11928
  default: {
11059
11929
  // If we get here, then we have a pin operator followed by something
11060
11930
  // not understood. We'll create a missing node and return that.
11061
- pm_diagnostic_list_append(&parser->error_list, operator.start, operator.end, PM_ERR_PATTERN_EXPRESSION_AFTER_PIN);
11931
+ pm_parser_err_token(parser, &operator, PM_ERR_PATTERN_EXPRESSION_AFTER_PIN);
11062
11932
  pm_node_t *variable = (pm_node_t *) pm_missing_node_create(parser, operator.start, operator.end);
11063
11933
  return (pm_node_t *) pm_pinned_variable_node_create(parser, &operator, variable);
11064
11934
  }
@@ -11082,7 +11952,7 @@ parse_pattern_primitive(pm_parser_t *parser, pm_diagnostic_id_t diag_id) {
11082
11952
  return parse_pattern_constant_path(parser, node);
11083
11953
  }
11084
11954
  default:
11085
- pm_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, diag_id);
11955
+ pm_parser_err_current(parser, diag_id);
11086
11956
  return (pm_node_t *) pm_missing_node_create(parser, parser->current.start, parser->current.end);
11087
11957
  }
11088
11958
  }
@@ -11126,7 +11996,7 @@ parse_pattern_primitives(pm_parser_t *parser, pm_diagnostic_id_t diag_id) {
11126
11996
  break;
11127
11997
  }
11128
11998
  default: {
11129
- pm_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, diag_id);
11999
+ pm_parser_err_current(parser, diag_id);
11130
12000
  pm_node_t *right = (pm_node_t *) pm_missing_node_create(parser, parser->current.start, parser->current.end);
11131
12001
 
11132
12002
  if (node == NULL) {
@@ -11218,7 +12088,7 @@ parse_pattern(pm_parser_t *parser, bool top_pattern, pm_diagnostic_id_t diag_id)
11218
12088
  // will continue to parse the rest of the patterns, but we will indicate
11219
12089
  // it as an error.
11220
12090
  if (trailing_rest) {
11221
- pm_diagnostic_list_append(&parser->error_list, parser->previous.start, parser->previous.end, PM_ERR_PATTERN_REST);
12091
+ pm_parser_err_previous(parser, PM_ERR_PATTERN_REST);
11222
12092
  }
11223
12093
 
11224
12094
  trailing_rest = true;
@@ -11284,6 +12154,7 @@ static inline pm_node_t *
11284
12154
  parse_strings(pm_parser_t *parser) {
11285
12155
  assert(parser->current.type == PM_TOKEN_STRING_BEGIN);
11286
12156
  pm_node_t *result = NULL;
12157
+ bool state_is_arg_labeled = lex_state_p(parser, PM_LEX_STATE_ARG | PM_LEX_STATE_LABELED);
11287
12158
 
11288
12159
  while (match1(parser, PM_TOKEN_STRING_BEGIN)) {
11289
12160
  pm_node_t *node = NULL;
@@ -11301,17 +12172,30 @@ parse_strings(pm_parser_t *parser) {
11301
12172
  // start. In that case we'll create an empty content token and
11302
12173
  // return an uninterpolated string.
11303
12174
  pm_token_t content = parse_strings_empty_content(parser->previous.start);
11304
- node = (pm_node_t *) pm_string_node_create_and_unescape(parser, &opening, &content, &parser->previous, PM_UNESCAPE_NONE);
12175
+ pm_string_node_t *string = pm_string_node_create(parser, &opening, &content, &parser->previous);
12176
+
12177
+ pm_string_shared_init(&string->unescaped, content.start, content.end);
12178
+ node = (pm_node_t *) string;
11305
12179
  } else if (accept1(parser, PM_TOKEN_LABEL_END)) {
11306
12180
  // If we get here, then we have an end of a label immediately
11307
12181
  // after a start. In that case we'll create an empty symbol
11308
12182
  // node.
11309
12183
  pm_token_t opening = not_provided(parser);
11310
12184
  pm_token_t content = parse_strings_empty_content(parser->previous.start);
11311
- node = (pm_node_t *) pm_symbol_node_create(parser, &opening, &content, &parser->previous);
12185
+ pm_symbol_node_t *symbol = pm_symbol_node_create(parser, &opening, &content, &parser->previous);
12186
+
12187
+ pm_string_shared_init(&symbol->unescaped, content.start, content.end);
12188
+ node = (pm_node_t *) symbol;
11312
12189
  } else if (!lex_interpolation) {
11313
12190
  // If we don't accept interpolation then we expect the string to
11314
12191
  // start with a single string content node.
12192
+ pm_string_t unescaped;
12193
+ if (match1(parser, PM_TOKEN_EOF)) {
12194
+ unescaped = PM_EMPTY_STRING;
12195
+ } else {
12196
+ unescaped = parser->current_string;
12197
+ }
12198
+
11315
12199
  expect1(parser, PM_TOKEN_STRING_CONTENT, PM_ERR_EXPECT_STRING_CONTENT);
11316
12200
  pm_token_t content = parser->previous;
11317
12201
 
@@ -11330,21 +12214,22 @@ parse_strings(pm_parser_t *parser) {
11330
12214
  pm_node_list_t parts = PM_EMPTY_NODE_LIST;
11331
12215
 
11332
12216
  pm_token_t delimiters = not_provided(parser);
11333
- pm_node_t *part = (pm_node_t *) pm_string_node_create_and_unescape(parser, &delimiters, &content, &delimiters, PM_UNESCAPE_MINIMAL);
12217
+ pm_node_t *part = (pm_node_t *) pm_string_node_create_unescaped(parser, &delimiters, &content, &delimiters, &unescaped);
11334
12218
  pm_node_list_append(&parts, part);
11335
12219
 
11336
- while (accept1(parser, PM_TOKEN_STRING_CONTENT)) {
11337
- part = (pm_node_t *) pm_string_node_create_and_unescape(parser, &delimiters, &parser->previous, &delimiters, PM_UNESCAPE_MINIMAL);
12220
+ do {
12221
+ part = (pm_node_t *) pm_string_node_create_current_string(parser, &delimiters, &parser->current, &delimiters);
11338
12222
  pm_node_list_append(&parts, part);
11339
- }
12223
+ parser_lex(parser);
12224
+ } while (match1(parser, PM_TOKEN_STRING_CONTENT));
11340
12225
 
11341
12226
  expect1(parser, PM_TOKEN_STRING_END, PM_ERR_STRING_LITERAL_TERM);
11342
12227
  node = (pm_node_t *) pm_interpolated_string_node_create(parser, &opening, &parts, &parser->previous);
11343
- } else if (accept1(parser, PM_TOKEN_LABEL_END)) {
11344
- node = (pm_node_t *) pm_symbol_node_create_and_unescape(parser, &opening, &content, &parser->previous, PM_UNESCAPE_ALL);
12228
+ } else if (accept1(parser, PM_TOKEN_LABEL_END) && !state_is_arg_labeled) {
12229
+ node = (pm_node_t *) pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped);
11345
12230
  } else {
11346
12231
  expect1(parser, PM_TOKEN_STRING_END, PM_ERR_STRING_LITERAL_TERM);
11347
- node = (pm_node_t *) pm_string_node_create_and_unescape(parser, &opening, &content, &parser->previous, PM_UNESCAPE_MINIMAL);
12232
+ node = (pm_node_t *) pm_string_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped);
11348
12233
  }
11349
12234
  } else if (match1(parser, PM_TOKEN_STRING_CONTENT)) {
11350
12235
  // In this case we've hit string content so we know the string
@@ -11352,12 +12237,14 @@ parse_strings(pm_parser_t *parser) {
11352
12237
  // following token is the end (in which case we can return a
11353
12238
  // plain string) or if it's not then it has interpolation.
11354
12239
  pm_token_t content = parser->current;
12240
+ pm_string_t unescaped = parser->current_string;
11355
12241
  parser_lex(parser);
11356
12242
 
11357
- if (accept1(parser, PM_TOKEN_STRING_END)) {
11358
- node = (pm_node_t *) pm_string_node_create_and_unescape(parser, &opening, &content, &parser->previous, PM_UNESCAPE_ALL);
12243
+ if (match1(parser, PM_TOKEN_STRING_END)) {
12244
+ node = (pm_node_t *) pm_string_node_create_unescaped(parser, &opening, &content, &parser->current, &unescaped);
12245
+ parser_lex(parser);
11359
12246
  } else if (accept1(parser, PM_TOKEN_LABEL_END)) {
11360
- node = (pm_node_t *) pm_symbol_node_create_and_unescape(parser, &opening, &content, &parser->previous, PM_UNESCAPE_ALL);
12247
+ node = (pm_node_t *) pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped);
11361
12248
  } else {
11362
12249
  // If we get here, then we have interpolation so we'll need
11363
12250
  // to create a string or symbol node with interpolation.
@@ -11365,7 +12252,7 @@ parse_strings(pm_parser_t *parser) {
11365
12252
  pm_token_t string_opening = not_provided(parser);
11366
12253
  pm_token_t string_closing = not_provided(parser);
11367
12254
 
11368
- pm_node_t *part = (pm_node_t *) pm_string_node_create_and_unescape(parser, &string_opening, &parser->previous, &string_closing, PM_UNESCAPE_ALL);
12255
+ pm_node_t *part = (pm_node_t *) pm_string_node_create_unescaped(parser, &string_opening, &parser->previous, &string_closing, &unescaped);
11369
12256
  pm_node_list_append(&parts, part);
11370
12257
 
11371
12258
  while (!match3(parser, PM_TOKEN_STRING_END, PM_TOKEN_LABEL_END, PM_TOKEN_EOF)) {
@@ -11374,7 +12261,7 @@ parse_strings(pm_parser_t *parser) {
11374
12261
  }
11375
12262
  }
11376
12263
 
11377
- if (accept1(parser, PM_TOKEN_LABEL_END)) {
12264
+ if (accept1(parser, PM_TOKEN_LABEL_END) && !state_is_arg_labeled) {
11378
12265
  node = (pm_node_t *) pm_interpolated_symbol_node_create(parser, &opening, &parts, &parser->previous);
11379
12266
  } else {
11380
12267
  expect1(parser, PM_TOKEN_STRING_END, PM_ERR_STRING_INTERPOLATED_TERM);
@@ -11382,11 +12269,11 @@ parse_strings(pm_parser_t *parser) {
11382
12269
  }
11383
12270
  }
11384
12271
  } else {
11385
- // If we get here, then the first part of the string is not
11386
- // plain string content, in which case we need to parse the
11387
- // string as an interpolated string.
12272
+ // If we get here, then the first part of the string is not plain
12273
+ // string content, in which case we need to parse the string as an
12274
+ // interpolated string.
11388
12275
  pm_node_list_t parts = PM_EMPTY_NODE_LIST;
11389
- pm_node_t *part = NULL;
12276
+ pm_node_t *part;
11390
12277
 
11391
12278
  while (!match3(parser, PM_TOKEN_STRING_END, PM_TOKEN_LABEL_END, PM_TOKEN_EOF)) {
11392
12279
  if ((part = parse_string_part(parser)) != NULL) {
@@ -11418,7 +12305,7 @@ parse_strings(pm_parser_t *parser) {
11418
12305
  // If it cannot be concatenated with the previous node, then we'll
11419
12306
  // need to add a syntax error.
11420
12307
  if (!PM_NODE_TYPE_P(node, PM_STRING_NODE) && !PM_NODE_TYPE_P(node, PM_INTERPOLATED_STRING_NODE)) {
11421
- pm_diagnostic_list_append(&parser->error_list, node->location.start, node->location.end, PM_ERR_STRING_CONCATENATION);
12308
+ pm_parser_err_node(parser, node, PM_ERR_STRING_CONCATENATION);
11422
12309
  }
11423
12310
 
11424
12311
  // Either way we will create a concat node to hold the strings
@@ -11464,7 +12351,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
11464
12351
  element = (pm_node_t *) pm_splat_node_create(parser, &operator, expression);
11465
12352
  } else if (match2(parser, PM_TOKEN_LABEL, PM_TOKEN_USTAR_STAR)) {
11466
12353
  if (parsed_bare_hash) {
11467
- pm_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, PM_ERR_EXPRESSION_BARE_HASH);
12354
+ pm_parser_err_current(parser, PM_ERR_EXPRESSION_BARE_HASH);
11468
12355
  }
11469
12356
 
11470
12357
  pm_keyword_hash_node_t *hash = pm_keyword_hash_node_create(parser);
@@ -11480,7 +12367,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
11480
12367
 
11481
12368
  if (pm_symbol_node_label_p(element) || accept1(parser, PM_TOKEN_EQUAL_GREATER)) {
11482
12369
  if (parsed_bare_hash) {
11483
- pm_diagnostic_list_append(&parser->error_list, parser->previous.start, parser->previous.end, PM_ERR_EXPRESSION_BARE_HASH);
12370
+ pm_parser_err_previous(parser, PM_ERR_EXPRESSION_BARE_HASH);
11484
12371
  }
11485
12372
 
11486
12373
  pm_keyword_hash_node_t *hash = pm_keyword_hash_node_create(parser);
@@ -11598,7 +12485,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
11598
12485
  // If we didn't find a terminator and we didn't find a right
11599
12486
  // parenthesis, then this is a syntax error.
11600
12487
  if (!terminator_found) {
11601
- pm_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.start, PM_ERR_EXPECT_EOL_AFTER_STATEMENT);
12488
+ pm_parser_err(parser, parser->current.start, parser->current.start, PM_ERR_EXPECT_EOL_AFTER_STATEMENT);
11602
12489
  }
11603
12490
 
11604
12491
  // Parse each statement within the parentheses.
@@ -11627,7 +12514,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
11627
12514
  } else if (match1(parser, PM_TOKEN_PARENTHESIS_RIGHT)) {
11628
12515
  break;
11629
12516
  } else {
11630
- pm_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.start, PM_ERR_EXPECT_EOL_AFTER_STATEMENT);
12517
+ pm_parser_err(parser, parser->current.start, parser->current.start, PM_ERR_EXPECT_EOL_AFTER_STATEMENT);
11631
12518
  }
11632
12519
  }
11633
12520
 
@@ -11665,7 +12552,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
11665
12552
  content.start = content.start + 1;
11666
12553
 
11667
12554
  pm_token_t closing = not_provided(parser);
11668
- pm_node_t *node = (pm_node_t *) pm_char_literal_node_create_and_unescape(parser, &opening, &content, &closing, PM_UNESCAPE_ALL);
12555
+ pm_node_t *node = (pm_node_t *) pm_string_node_create_current_string(parser, &opening, &content, &closing);
11669
12556
 
11670
12557
  // Characters can be followed by strings in which case they are
11671
12558
  // automatically concatenated.
@@ -11839,9 +12726,10 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
11839
12726
  case PM_TOKEN_HEREDOC_START: {
11840
12727
  // Here we have found a heredoc. We'll parse it and add it to the
11841
12728
  // list of strings.
11842
- assert(parser->lex_modes.current->mode == PM_LEX_HEREDOC);
11843
- pm_heredoc_quote_t quote = parser->lex_modes.current->as.heredoc.quote;
11844
- pm_heredoc_indent_t indent = parser->lex_modes.current->as.heredoc.indent;
12729
+ pm_lex_mode_t *lex_mode = parser->lex_modes.current;
12730
+ assert(lex_mode->mode == PM_LEX_HEREDOC);
12731
+ pm_heredoc_quote_t quote = lex_mode->as.heredoc.quote;
12732
+ pm_heredoc_indent_t indent = lex_mode->as.heredoc.indent;
11845
12733
 
11846
12734
  parser_lex(parser);
11847
12735
  pm_token_t opening = parser->previous;
@@ -11857,9 +12745,9 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
11857
12745
  pm_token_t content = parse_strings_empty_content(parser->previous.start);
11858
12746
 
11859
12747
  if (quote == PM_HEREDOC_QUOTE_BACKTICK) {
11860
- node = (pm_node_t *) pm_xstring_node_create_and_unescape(parser, &opening, &content, &parser->previous);
12748
+ node = (pm_node_t *) pm_xstring_node_create_unescaped(parser, &opening, &content, &parser->previous, &PM_EMPTY_STRING);
11861
12749
  } else {
11862
- node = (pm_node_t *) pm_string_node_create_and_unescape(parser, &opening, &content, &parser->previous, PM_UNESCAPE_NONE);
12750
+ node = (pm_node_t *) pm_string_node_create_unescaped(parser, &opening, &content, &parser->previous, &PM_EMPTY_STRING);
11863
12751
  }
11864
12752
 
11865
12753
  node->location.end = opening.end;
@@ -11884,15 +12772,14 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
11884
12772
  cast->base.type = PM_X_STRING_NODE;
11885
12773
  }
11886
12774
 
11887
- lex_state_set(parser, PM_LEX_STATE_END);
11888
- expect1(parser, PM_TOKEN_HEREDOC_END, PM_ERR_HEREDOC_TERM);
12775
+ size_t common_whitespace = lex_mode->as.heredoc.common_whitespace;
12776
+ if (indent == PM_HEREDOC_INDENT_TILDE && (common_whitespace != (size_t) -1) && (common_whitespace != 0)) {
12777
+ parse_heredoc_dedent_string(&cast->unescaped, common_whitespace);
12778
+ }
11889
12779
 
11890
12780
  node = (pm_node_t *) cast;
11891
-
11892
- if (indent == PM_HEREDOC_INDENT_TILDE) {
11893
- int common_whitespace = parse_heredoc_common_whitespace_for_single_node(parser, node, -1);
11894
- parse_heredoc_dedent_single_node(parser, &cast->unescaped, true, common_whitespace, quote);
11895
- }
12781
+ lex_state_set(parser, PM_LEX_STATE_END);
12782
+ expect1(parser, PM_TOKEN_HEREDOC_END, PM_ERR_HEREDOC_TERM);
11896
12783
  } else {
11897
12784
  // If we get here, then we have multiple parts in the heredoc,
11898
12785
  // so we'll need to create an interpolated string node to hold
@@ -11931,8 +12818,16 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
11931
12818
 
11932
12819
  // If this is a heredoc that is indented with a ~, then we need
11933
12820
  // to dedent each line by the common leading whitespace.
11934
- if (indent == PM_HEREDOC_INDENT_TILDE) {
11935
- parse_heredoc_dedent(parser, node, quote);
12821
+ size_t common_whitespace = lex_mode->as.heredoc.common_whitespace;
12822
+ if (indent == PM_HEREDOC_INDENT_TILDE && (common_whitespace != (size_t) -1) && (common_whitespace != 0)) {
12823
+ pm_node_list_t *nodes;
12824
+ if (quote == PM_HEREDOC_QUOTE_BACKTICK) {
12825
+ nodes = &((pm_interpolated_x_string_node_t *) node)->parts;
12826
+ } else {
12827
+ nodes = &((pm_interpolated_string_node_t *) node)->parts;
12828
+ }
12829
+
12830
+ parse_heredoc_dedent(parser, nodes, common_whitespace);
11936
12831
  }
11937
12832
  }
11938
12833
 
@@ -11995,10 +12890,10 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
11995
12890
  case PM_GLOBAL_VARIABLE_READ_NODE: {
11996
12891
  if (PM_NODE_TYPE_P(old_name, PM_BACK_REFERENCE_READ_NODE) || PM_NODE_TYPE_P(old_name, PM_NUMBERED_REFERENCE_READ_NODE) || PM_NODE_TYPE_P(old_name, PM_GLOBAL_VARIABLE_READ_NODE)) {
11997
12892
  if (PM_NODE_TYPE_P(old_name, PM_NUMBERED_REFERENCE_READ_NODE)) {
11998
- pm_diagnostic_list_append(&parser->error_list, old_name->location.start, old_name->location.end, PM_ERR_ALIAS_ARGUMENT);
12893
+ pm_parser_err_node(parser, old_name, PM_ERR_ALIAS_ARGUMENT);
11999
12894
  }
12000
12895
  } else {
12001
- pm_diagnostic_list_append(&parser->error_list, old_name->location.start, old_name->location.end, PM_ERR_ALIAS_ARGUMENT);
12896
+ pm_parser_err_node(parser, old_name, PM_ERR_ALIAS_ARGUMENT);
12002
12897
  }
12003
12898
 
12004
12899
  return (pm_node_t *) pm_alias_global_variable_node_create(parser, &keyword, new_name, old_name);
@@ -12006,7 +12901,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
12006
12901
  case PM_SYMBOL_NODE:
12007
12902
  case PM_INTERPOLATED_SYMBOL_NODE: {
12008
12903
  if (!PM_NODE_TYPE_P(old_name, PM_SYMBOL_NODE) && !PM_NODE_TYPE_P(old_name, PM_INTERPOLATED_SYMBOL_NODE)) {
12009
- pm_diagnostic_list_append(&parser->error_list, old_name->location.start, old_name->location.end, PM_ERR_ALIAS_ARGUMENT);
12904
+ pm_parser_err_node(parser, old_name, PM_ERR_ALIAS_ARGUMENT);
12010
12905
  }
12011
12906
  }
12012
12907
  /* fallthrough */
@@ -12032,7 +12927,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
12032
12927
  }
12033
12928
 
12034
12929
  if (accept1(parser, PM_TOKEN_KEYWORD_END)) {
12035
- pm_diagnostic_list_append(&parser->error_list, case_keyword.start, case_keyword.end, PM_ERR_CASE_MISSING_CONDITIONS);
12930
+ pm_parser_err_token(parser, &case_keyword, PM_ERR_CASE_MISSING_CONDITIONS);
12036
12931
  return (pm_node_t *) pm_case_node_create(parser, &case_keyword, predicate, NULL, &parser->previous);
12037
12932
  }
12038
12933
 
@@ -12142,7 +13037,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
12142
13037
  // If we didn't parse any conditions (in or when) then we need to
12143
13038
  // indicate that we have an error.
12144
13039
  if (case_node->conditions.size == 0) {
12145
- pm_diagnostic_list_append(&parser->error_list, case_keyword.start, case_keyword.end, PM_ERR_CASE_MISSING_CONDITIONS);
13040
+ pm_parser_err_token(parser, &case_keyword, PM_ERR_CASE_MISSING_CONDITIONS);
12146
13041
  }
12147
13042
 
12148
13043
  accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON);
@@ -12185,12 +13080,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
12185
13080
  pm_begin_node_end_keyword_set(begin_node, &parser->previous);
12186
13081
 
12187
13082
  if ((begin_node->else_clause != NULL) && (begin_node->rescue_clause == NULL)) {
12188
- pm_diagnostic_list_append(
12189
- &parser->error_list,
12190
- begin_node->else_clause->base.location.start,
12191
- begin_node->else_clause->base.location.end,
12192
- PM_ERR_BEGIN_LONELY_ELSE
12193
- );
13083
+ pm_parser_err_node(parser, (pm_node_t *) begin_node->else_clause, PM_ERR_BEGIN_LONELY_ELSE);
12194
13084
  }
12195
13085
 
12196
13086
  return (pm_node_t *) begin_node;
@@ -12206,7 +13096,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
12206
13096
  expect1(parser, PM_TOKEN_BRACE_RIGHT, PM_ERR_BEGIN_UPCASE_TERM);
12207
13097
  pm_context_t context = parser->current_context->context;
12208
13098
  if ((context != PM_CONTEXT_MAIN) && (context != PM_CONTEXT_PREEXE)) {
12209
- pm_diagnostic_list_append(&parser->error_list, keyword.start, keyword.end, PM_ERR_BEGIN_UPCASE_TOPLEVEL);
13099
+ pm_parser_err_token(parser, &keyword, PM_ERR_BEGIN_UPCASE_TOPLEVEL);
12210
13100
  }
12211
13101
  return (pm_node_t *) pm_pre_execution_node_create(parser, &keyword, &opening, statements, &parser->previous);
12212
13102
  }
@@ -12239,7 +13129,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
12239
13129
  (parser->current_context->context == PM_CONTEXT_CLASS) ||
12240
13130
  (parser->current_context->context == PM_CONTEXT_MODULE)
12241
13131
  ) {
12242
- pm_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, PM_ERR_RETURN_INVALID);
13132
+ pm_parser_err_current(parser, PM_ERR_RETURN_INVALID);
12243
13133
  }
12244
13134
  return (pm_node_t *) pm_return_node_create(parser, &keyword, arguments.arguments);
12245
13135
  }
@@ -12305,7 +13195,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
12305
13195
  pm_node_t *constant_path = parse_expression(parser, PM_BINDING_POWER_INDEX, PM_ERR_CLASS_NAME);
12306
13196
  pm_token_t name = parser->previous;
12307
13197
  if (name.type != PM_TOKEN_CONSTANT) {
12308
- pm_diagnostic_list_append(&parser->error_list, name.start, name.end, PM_ERR_CLASS_NAME);
13198
+ pm_parser_err_token(parser, &name, PM_ERR_CLASS_NAME);
12309
13199
  }
12310
13200
 
12311
13201
  pm_token_t inheritance_operator;
@@ -12346,7 +13236,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
12346
13236
  expect1(parser, PM_TOKEN_KEYWORD_END, PM_ERR_CLASS_TERM);
12347
13237
 
12348
13238
  if (context_def_p(parser)) {
12349
- pm_diagnostic_list_append(&parser->error_list, class_keyword.start, class_keyword.end, PM_ERR_CLASS_IN_METHOD);
13239
+ pm_parser_err_token(parser, &class_keyword, PM_ERR_CLASS_IN_METHOD);
12350
13240
  }
12351
13241
 
12352
13242
  pm_constant_id_list_t locals = parser->current_scope->locals;
@@ -12354,7 +13244,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
12354
13244
  pm_do_loop_stack_pop(parser);
12355
13245
 
12356
13246
  if (!PM_NODE_TYPE_P(constant_path, PM_CONSTANT_PATH_NODE) && !(PM_NODE_TYPE_P(constant_path, PM_CONSTANT_READ_NODE))) {
12357
- pm_diagnostic_list_append(&parser->error_list, constant_path->location.start, constant_path->location.end, PM_ERR_CLASS_NAME);
13247
+ pm_parser_err_node(parser, constant_path, PM_ERR_CLASS_NAME);
12358
13248
  }
12359
13249
 
12360
13250
  return (pm_node_t *) pm_class_node_create(parser, &locals, &class_keyword, constant_path, &name, &inheritance_operator, superclass, statements, &parser->previous);
@@ -12486,7 +13376,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
12486
13376
  // If, after all that, we were unable to find a method name, add an
12487
13377
  // error to the error list.
12488
13378
  if (name.type == PM_TOKEN_MISSING) {
12489
- pm_diagnostic_list_append(&parser->error_list, parser->previous.start, parser->previous.end, PM_ERR_DEF_NAME);
13379
+ pm_parser_err_previous(parser, PM_ERR_DEF_NAME);
12490
13380
  }
12491
13381
 
12492
13382
  pm_token_t lparen;
@@ -12538,7 +13428,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
12538
13428
 
12539
13429
  if (accept1(parser, PM_TOKEN_EQUAL)) {
12540
13430
  if (token_is_setter_name(&name)) {
12541
- pm_diagnostic_list_append(&parser->error_list, name.start, name.end, PM_ERR_DEF_ENDLESS_SETTER);
13431
+ pm_parser_err_token(parser, &name, PM_ERR_DEF_ENDLESS_SETTER);
12542
13432
  }
12543
13433
  equal = parser->previous;
12544
13434
 
@@ -12656,6 +13546,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
12656
13546
  parser_lex(parser);
12657
13547
  pm_token_t for_keyword = parser->previous;
12658
13548
  pm_node_t *index;
13549
+ pm_parser_scope_push_transparent(parser);
12659
13550
 
12660
13551
  // First, parse out the first index expression.
12661
13552
  if (accept1(parser, PM_TOKEN_USTAR)) {
@@ -12670,7 +13561,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
12670
13561
  } else if (token_begins_expression_p(parser->current.type)) {
12671
13562
  index = parse_expression(parser, PM_BINDING_POWER_INDEX, PM_ERR_EXPECT_EXPRESSION_AFTER_COMMA);
12672
13563
  } else {
12673
- pm_diagnostic_list_append(&parser->error_list, for_keyword.start, for_keyword.end, PM_ERR_FOR_INDEX);
13564
+ pm_parser_err_token(parser, &for_keyword, PM_ERR_FOR_INDEX);
12674
13565
  index = (pm_node_t *) pm_missing_node_create(parser, for_keyword.start, for_keyword.end);
12675
13566
  }
12676
13567
 
@@ -12681,6 +13572,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
12681
13572
  index = parse_target(parser, index);
12682
13573
  }
12683
13574
 
13575
+ pm_parser_scope_pop(parser);
12684
13576
  pm_do_loop_stack_push(parser, true);
12685
13577
 
12686
13578
  expect1(parser, PM_TOKEN_KEYWORD_IN, PM_ERR_FOR_IN);
@@ -12700,8 +13592,10 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
12700
13592
  pm_statements_node_t *statements = NULL;
12701
13593
 
12702
13594
  if (!accept1(parser, PM_TOKEN_KEYWORD_END)) {
13595
+ pm_parser_scope_push_transparent(parser);
12703
13596
  statements = parse_statements(parser, PM_CONTEXT_FOR);
12704
13597
  expect1(parser, PM_TOKEN_KEYWORD_END, PM_ERR_FOR_TERM);
13598
+ pm_parser_scope_pop(parser);
12705
13599
  }
12706
13600
 
12707
13601
  return (pm_node_t *) pm_for_node_create(parser, index, collection, statements, &for_keyword, &in_keyword, &do_keyword, &parser->previous);
@@ -12797,7 +13691,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
12797
13691
  // syntax error. We handle that here as well.
12798
13692
  name = parser->previous;
12799
13693
  if (name.type != PM_TOKEN_CONSTANT) {
12800
- pm_diagnostic_list_append(&parser->error_list, name.start, name.end, PM_ERR_MODULE_NAME);
13694
+ pm_parser_err_token(parser, &name, PM_ERR_MODULE_NAME);
12801
13695
  }
12802
13696
 
12803
13697
  pm_parser_scope_push(parser, true);
@@ -12821,7 +13715,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
12821
13715
  expect1(parser, PM_TOKEN_KEYWORD_END, PM_ERR_MODULE_TERM);
12822
13716
 
12823
13717
  if (context_def_p(parser)) {
12824
- pm_diagnostic_list_append(&parser->error_list, module_keyword.start, module_keyword.end, PM_ERR_MODULE_IN_METHOD);
13718
+ pm_parser_err_token(parser, &module_keyword, PM_ERR_MODULE_IN_METHOD);
12825
13719
  }
12826
13720
 
12827
13721
  return (pm_node_t *) pm_module_node_create(parser, &locals, &module_keyword, constant_path, &name, statements, &parser->previous);
@@ -12891,13 +13785,13 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
12891
13785
  accept1(parser, PM_TOKEN_WORDS_SEP);
12892
13786
  if (match1(parser, PM_TOKEN_STRING_END)) break;
12893
13787
 
12894
- expect1(parser, PM_TOKEN_STRING_CONTENT, PM_ERR_LIST_I_LOWER_ELEMENT);
12895
-
12896
- pm_token_t opening = not_provided(parser);
12897
- pm_token_t closing = not_provided(parser);
13788
+ if (match1(parser, PM_TOKEN_STRING_CONTENT)) {
13789
+ pm_token_t opening = not_provided(parser);
13790
+ pm_token_t closing = not_provided(parser);
13791
+ pm_array_node_elements_append(array, (pm_node_t *) pm_symbol_node_create_current_string(parser, &opening, &parser->current, &closing));
13792
+ }
12898
13793
 
12899
- pm_node_t *symbol = (pm_node_t *) pm_symbol_node_create_and_unescape(parser, &opening, &parser->previous, &closing, PM_UNESCAPE_MINIMAL);
12900
- pm_array_node_elements_append(array, symbol);
13794
+ expect1(parser, PM_TOKEN_STRING_CONTENT, PM_ERR_LIST_I_LOWER_ELEMENT);
12901
13795
  }
12902
13796
 
12903
13797
  expect1(parser, PM_TOKEN_STRING_END, PM_ERR_LIST_I_LOWER_TERM);
@@ -12937,26 +13831,26 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
12937
13831
  // If we hit content and the current node is NULL, then this is
12938
13832
  // the first string content we've seen. In that case we're going
12939
13833
  // to create a new string node and set that to the current.
13834
+ current = (pm_node_t *) pm_symbol_node_create_current_string(parser, &opening, &parser->current, &closing);
12940
13835
  parser_lex(parser);
12941
- current = (pm_node_t *) pm_symbol_node_create_and_unescape(parser, &opening, &parser->previous, &closing, PM_UNESCAPE_ALL);
12942
13836
  } else if (PM_NODE_TYPE_P(current, PM_INTERPOLATED_SYMBOL_NODE)) {
12943
13837
  // If we hit string content and the current node is an
12944
13838
  // interpolated string, then we need to append the string content
12945
13839
  // to the list of child nodes.
12946
- pm_node_t *part = parse_string_part(parser);
12947
- pm_interpolated_symbol_node_append((pm_interpolated_symbol_node_t *) current, part);
13840
+ pm_node_t *string = (pm_node_t *) pm_string_node_create_current_string(parser, &opening, &parser->current, &closing);
13841
+ parser_lex(parser);
13842
+
13843
+ pm_interpolated_symbol_node_append((pm_interpolated_symbol_node_t *) current, string);
12948
13844
  } else if (PM_NODE_TYPE_P(current, PM_SYMBOL_NODE)) {
12949
13845
  // If we hit string content and the current node is a string node,
12950
13846
  // then we need to convert the current node into an interpolated
12951
13847
  // string and add the string content to the list of child nodes.
12952
- pm_token_t opening = not_provided(parser);
12953
- pm_token_t closing = not_provided(parser);
12954
- pm_interpolated_symbol_node_t *interpolated =
12955
- pm_interpolated_symbol_node_create(parser, &opening, NULL, &closing);
12956
- pm_interpolated_symbol_node_append(interpolated, current);
13848
+ pm_node_t *string = (pm_node_t *) pm_string_node_create_current_string(parser, &opening, &parser->previous, &closing);
13849
+ parser_lex(parser);
12957
13850
 
12958
- pm_node_t *part = parse_string_part(parser);
12959
- pm_interpolated_symbol_node_append(interpolated, part);
13851
+ pm_interpolated_symbol_node_t *interpolated = pm_interpolated_symbol_node_create(parser, &opening, NULL, &closing);
13852
+ pm_interpolated_symbol_node_append(interpolated, current);
13853
+ pm_interpolated_symbol_node_append(interpolated, string);
12960
13854
  current = (pm_node_t *) interpolated;
12961
13855
  } else {
12962
13856
  assert(false && "unreachable");
@@ -13063,12 +13957,15 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
13063
13957
  accept1(parser, PM_TOKEN_WORDS_SEP);
13064
13958
  if (match1(parser, PM_TOKEN_STRING_END)) break;
13065
13959
 
13066
- expect1(parser, PM_TOKEN_STRING_CONTENT, PM_ERR_LIST_W_LOWER_ELEMENT);
13960
+ if (match1(parser, PM_TOKEN_STRING_CONTENT)) {
13961
+ pm_token_t opening = not_provided(parser);
13962
+ pm_token_t closing = not_provided(parser);
13067
13963
 
13068
- pm_token_t opening = not_provided(parser);
13069
- pm_token_t closing = not_provided(parser);
13070
- pm_node_t *string = (pm_node_t *) pm_string_node_create_and_unescape(parser, &opening, &parser->previous, &closing, PM_UNESCAPE_WHITESPACE);
13071
- pm_array_node_elements_append(array, string);
13964
+ pm_node_t *string = (pm_node_t *) pm_string_node_create_current_string(parser, &opening, &parser->current, &closing);
13965
+ pm_array_node_elements_append(array, string);
13966
+ }
13967
+
13968
+ expect1(parser, PM_TOKEN_STRING_CONTENT, PM_ERR_LIST_W_LOWER_ELEMENT);
13072
13969
  }
13073
13970
 
13074
13971
  expect1(parser, PM_TOKEN_STRING_END, PM_ERR_LIST_W_LOWER_TERM);
@@ -13101,29 +13998,29 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
13101
13998
  break;
13102
13999
  }
13103
14000
  case PM_TOKEN_STRING_CONTENT: {
14001
+ pm_token_t opening = not_provided(parser);
14002
+ pm_token_t closing = not_provided(parser);
14003
+
14004
+ pm_node_t *string = (pm_node_t *) pm_string_node_create_current_string(parser, &opening, &parser->current, &closing);
14005
+ parser_lex(parser);
14006
+
13104
14007
  if (current == NULL) {
13105
14008
  // If we hit content and the current node is NULL, then this is
13106
14009
  // the first string content we've seen. In that case we're going
13107
14010
  // to create a new string node and set that to the current.
13108
- current = parse_string_part(parser);
14011
+ current = string;
13109
14012
  } else if (PM_NODE_TYPE_P(current, PM_INTERPOLATED_STRING_NODE)) {
13110
14013
  // If we hit string content and the current node is an
13111
14014
  // interpolated string, then we need to append the string content
13112
14015
  // to the list of child nodes.
13113
- pm_node_t *part = parse_string_part(parser);
13114
- pm_interpolated_string_node_append((pm_interpolated_string_node_t *) current, part);
14016
+ pm_interpolated_string_node_append((pm_interpolated_string_node_t *) current, string);
13115
14017
  } else if (PM_NODE_TYPE_P(current, PM_STRING_NODE)) {
13116
14018
  // If we hit string content and the current node is a string node,
13117
14019
  // then we need to convert the current node into an interpolated
13118
14020
  // string and add the string content to the list of child nodes.
13119
- pm_token_t opening = not_provided(parser);
13120
- pm_token_t closing = not_provided(parser);
13121
- pm_interpolated_string_node_t *interpolated =
13122
- pm_interpolated_string_node_create(parser, &opening, NULL, &closing);
14021
+ pm_interpolated_string_node_t *interpolated = pm_interpolated_string_node_create(parser, &opening, NULL, &closing);
13123
14022
  pm_interpolated_string_node_append(interpolated, current);
13124
-
13125
- pm_node_t *part = parse_string_part(parser);
13126
- pm_interpolated_string_node_append(interpolated, part);
14023
+ pm_interpolated_string_node_append(interpolated, string);
13127
14024
  current = (pm_node_t *) interpolated;
13128
14025
  } else {
13129
14026
  assert(false && "unreachable");
@@ -13218,7 +14115,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
13218
14115
  };
13219
14116
 
13220
14117
  parser_lex(parser);
13221
- return (pm_node_t *) pm_regular_expression_node_create_and_unescape(parser, &opening, &content, &parser->previous, PM_UNESCAPE_ALL);
14118
+ return (pm_node_t *) pm_regular_expression_node_create(parser, &opening, &content, &parser->previous);
13222
14119
  }
13223
14120
 
13224
14121
  pm_interpolated_regular_expression_node_t *node;
@@ -13228,6 +14125,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
13228
14125
  // expression at least has something in it. We'll need to check if the
13229
14126
  // following token is the end (in which case we can return a plain
13230
14127
  // regular expression) or if it's not then it has interpolation.
14128
+ pm_string_t unescaped = parser->current_string;
13231
14129
  pm_token_t content = parser->current;
13232
14130
  parser_lex(parser);
13233
14131
 
@@ -13235,7 +14133,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
13235
14133
  // without interpolation, which can be represented more succinctly and
13236
14134
  // more easily compiled.
13237
14135
  if (accept1(parser, PM_TOKEN_REGEXP_END)) {
13238
- return (pm_node_t *) pm_regular_expression_node_create_and_unescape(parser, &opening, &content, &parser->previous, PM_UNESCAPE_ALL);
14136
+ return (pm_node_t *) pm_regular_expression_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped);
13239
14137
  }
13240
14138
 
13241
14139
  // If we get here, then we have interpolation so we'll need to create
@@ -13244,7 +14142,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
13244
14142
 
13245
14143
  pm_token_t opening = not_provided(parser);
13246
14144
  pm_token_t closing = not_provided(parser);
13247
- pm_node_t *part = (pm_node_t *) pm_string_node_create_and_unescape(parser, &opening, &parser->previous, &closing, PM_UNESCAPE_ALL);
14145
+ pm_node_t *part = (pm_node_t *) pm_string_node_create_unescaped(parser, &opening, &parser->previous, &closing, &unescaped);
13248
14146
  pm_interpolated_regular_expression_node_append(node, part);
13249
14147
  } else {
13250
14148
  // If the first part of the body of the regular expression is not a
@@ -13255,9 +14153,9 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
13255
14153
 
13256
14154
  // Now that we're here and we have interpolation, we'll parse all of the
13257
14155
  // parts into the list.
14156
+ pm_node_t *part;
13258
14157
  while (!match2(parser, PM_TOKEN_REGEXP_END, PM_TOKEN_EOF)) {
13259
- pm_node_t *part = parse_string_part(parser);
13260
- if (part != NULL) {
14158
+ if ((part = parse_string_part(parser)) != NULL) {
13261
14159
  pm_interpolated_regular_expression_node_append(node, part);
13262
14160
  }
13263
14161
  }
@@ -13293,35 +14191,37 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
13293
14191
  pm_interpolated_x_string_node_t *node;
13294
14192
 
13295
14193
  if (match1(parser, PM_TOKEN_STRING_CONTENT)) {
13296
- // In this case we've hit string content so we know the string at least
13297
- // has something in it. We'll need to check if the following token is
13298
- // the end (in which case we can return a plain string) or if it's not
13299
- // then it has interpolation.
14194
+ // In this case we've hit string content so we know the string
14195
+ // at least has something in it. We'll need to check if the
14196
+ // following token is the end (in which case we can return a
14197
+ // plain string) or if it's not then it has interpolation.
14198
+ pm_string_t unescaped = parser->current_string;
13300
14199
  pm_token_t content = parser->current;
13301
14200
  parser_lex(parser);
13302
14201
 
13303
14202
  if (accept1(parser, PM_TOKEN_STRING_END)) {
13304
- return (pm_node_t *) pm_xstring_node_create_and_unescape(parser, &opening, &content, &parser->previous);
14203
+ return (pm_node_t *) pm_xstring_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped);
13305
14204
  }
13306
14205
 
13307
- // If we get here, then we have interpolation so we'll need to create
13308
- // a string node with interpolation.
14206
+ // If we get here, then we have interpolation so we'll need to
14207
+ // create a string node with interpolation.
13309
14208
  node = pm_interpolated_xstring_node_create(parser, &opening, &opening);
13310
14209
 
13311
14210
  pm_token_t opening = not_provided(parser);
13312
14211
  pm_token_t closing = not_provided(parser);
13313
- pm_node_t *part = (pm_node_t *) pm_string_node_create_and_unescape(parser, &opening, &parser->previous, &closing, PM_UNESCAPE_ALL);
14212
+ pm_node_t *part = (pm_node_t *) pm_string_node_create_unescaped(parser, &opening, &parser->previous, &closing, &unescaped);
14213
+
13314
14214
  pm_interpolated_xstring_node_append(node, part);
13315
14215
  } else {
13316
- // If the first part of the body of the string is not a string content,
13317
- // then we have interpolation and we need to create an interpolated
13318
- // string node.
14216
+ // If the first part of the body of the string is not a string
14217
+ // content, then we have interpolation and we need to create an
14218
+ // interpolated string node.
13319
14219
  node = pm_interpolated_xstring_node_create(parser, &opening, &opening);
13320
14220
  }
13321
14221
 
14222
+ pm_node_t *part;
13322
14223
  while (!match2(parser, PM_TOKEN_STRING_END, PM_TOKEN_EOF)) {
13323
- pm_node_t *part = parse_string_part(parser);
13324
- if (part != NULL) {
14224
+ if ((part = parse_string_part(parser)) != NULL) {
13325
14225
  pm_interpolated_xstring_node_append(node, part);
13326
14226
  }
13327
14227
  }
@@ -13542,7 +14442,7 @@ parse_assignment_value(pm_parser_t *parser, pm_binding_power_t previous_binding_
13542
14442
  static void
13543
14443
  parse_call_operator_write_block(pm_parser_t *parser, pm_call_node_t *call_node, const pm_token_t *operator) {
13544
14444
  if (call_node->block != NULL) {
13545
- pm_diagnostic_list_append(&parser->error_list, operator->start, operator->end, PM_ERR_OPERATOR_WRITE_BLOCK);
14445
+ pm_parser_err_token(parser, operator, PM_ERR_OPERATOR_WRITE_BLOCK);
13546
14446
  pm_node_destroy(parser, (pm_node_t *) call_node->block);
13547
14447
  call_node->block = NULL;
13548
14448
  }
@@ -13590,7 +14490,7 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t
13590
14490
  // In this case we have an = sign, but we don't know what it's for. We
13591
14491
  // need to treat it as an error. For now, we'll mark it as an error
13592
14492
  // and just skip right past it.
13593
- pm_diagnostic_list_append(&parser->error_list, token.start, token.end, PM_ERR_EXPECT_EXPRESSION_AFTER_EQUAL);
14493
+ pm_parser_err_token(parser, &token, PM_ERR_EXPECT_EXPRESSION_AFTER_EQUAL);
13594
14494
  return node;
13595
14495
  }
13596
14496
  }
@@ -13598,7 +14498,7 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t
13598
14498
  switch (PM_NODE_TYPE(node)) {
13599
14499
  case PM_BACK_REFERENCE_READ_NODE:
13600
14500
  case PM_NUMBERED_REFERENCE_READ_NODE:
13601
- pm_diagnostic_list_append(&parser->error_list, node->location.start, node->location.end, PM_ERR_WRITE_TARGET_READONLY);
14501
+ pm_parser_err_node(parser, node, PM_ERR_WRITE_TARGET_READONLY);
13602
14502
  /* fallthrough */
13603
14503
  case PM_GLOBAL_VARIABLE_READ_NODE: {
13604
14504
  parser_lex(parser);
@@ -13661,7 +14561,7 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t
13661
14561
  pm_constant_id_t constant_id = pm_parser_local_add_location(parser, message_loc.start, message_loc.end);
13662
14562
 
13663
14563
  if (token_is_numbered_parameter(message_loc.start, message_loc.end)) {
13664
- pm_diagnostic_list_append(&parser->error_list, message_loc.start, message_loc.end, PM_ERR_PARAMETER_NUMBERED_RESERVED);
14564
+ pm_parser_err_location(parser, &message_loc, PM_ERR_PARAMETER_NUMBERED_RESERVED);
13665
14565
  }
13666
14566
 
13667
14567
  parser_lex(parser);
@@ -13683,7 +14583,7 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t
13683
14583
  }
13684
14584
  case PM_MULTI_WRITE_NODE: {
13685
14585
  parser_lex(parser);
13686
- pm_diagnostic_list_append(&parser->error_list, token.start, token.end, PM_ERR_AMPAMPEQ_MULTI_ASSIGN);
14586
+ pm_parser_err_token(parser, &token, PM_ERR_AMPAMPEQ_MULTI_ASSIGN);
13687
14587
  return node;
13688
14588
  }
13689
14589
  default:
@@ -13692,7 +14592,7 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t
13692
14592
  // In this case we have an &&= sign, but we don't know what it's for.
13693
14593
  // We need to treat it as an error. For now, we'll mark it as an error
13694
14594
  // and just skip right past it.
13695
- pm_diagnostic_list_append(&parser->error_list, token.start, token.end, PM_ERR_EXPECT_EXPRESSION_AFTER_AMPAMPEQ);
14595
+ pm_parser_err_token(parser, &token, PM_ERR_EXPECT_EXPRESSION_AFTER_AMPAMPEQ);
13696
14596
  return node;
13697
14597
  }
13698
14598
  }
@@ -13700,7 +14600,7 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t
13700
14600
  switch (PM_NODE_TYPE(node)) {
13701
14601
  case PM_BACK_REFERENCE_READ_NODE:
13702
14602
  case PM_NUMBERED_REFERENCE_READ_NODE:
13703
- pm_diagnostic_list_append(&parser->error_list, node->location.start, node->location.end, PM_ERR_WRITE_TARGET_READONLY);
14603
+ pm_parser_err_node(parser, node, PM_ERR_WRITE_TARGET_READONLY);
13704
14604
  /* fallthrough */
13705
14605
  case PM_GLOBAL_VARIABLE_READ_NODE: {
13706
14606
  parser_lex(parser);
@@ -13763,7 +14663,7 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t
13763
14663
  pm_constant_id_t constant_id = pm_parser_local_add_location(parser, message_loc.start, message_loc.end);
13764
14664
 
13765
14665
  if (token_is_numbered_parameter(message_loc.start, message_loc.end)) {
13766
- pm_diagnostic_list_append(&parser->error_list, message_loc.start, message_loc.end, PM_ERR_PARAMETER_NUMBERED_RESERVED);
14666
+ pm_parser_err_location(parser, &message_loc, PM_ERR_PARAMETER_NUMBERED_RESERVED);
13767
14667
  }
13768
14668
 
13769
14669
  parser_lex(parser);
@@ -13785,7 +14685,7 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t
13785
14685
  }
13786
14686
  case PM_MULTI_WRITE_NODE: {
13787
14687
  parser_lex(parser);
13788
- pm_diagnostic_list_append(&parser->error_list, token.start, token.end, PM_ERR_PIPEPIPEEQ_MULTI_ASSIGN);
14688
+ pm_parser_err_token(parser, &token, PM_ERR_PIPEPIPEEQ_MULTI_ASSIGN);
13789
14689
  return node;
13790
14690
  }
13791
14691
  default:
@@ -13794,7 +14694,7 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t
13794
14694
  // In this case we have an ||= sign, but we don't know what it's for.
13795
14695
  // We need to treat it as an error. For now, we'll mark it as an error
13796
14696
  // and just skip right past it.
13797
- pm_diagnostic_list_append(&parser->error_list, token.start, token.end, PM_ERR_EXPECT_EXPRESSION_AFTER_PIPEPIPEEQ);
14697
+ pm_parser_err_token(parser, &token, PM_ERR_EXPECT_EXPRESSION_AFTER_PIPEPIPEEQ);
13798
14698
  return node;
13799
14699
  }
13800
14700
  }
@@ -13812,7 +14712,7 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t
13812
14712
  switch (PM_NODE_TYPE(node)) {
13813
14713
  case PM_BACK_REFERENCE_READ_NODE:
13814
14714
  case PM_NUMBERED_REFERENCE_READ_NODE:
13815
- pm_diagnostic_list_append(&parser->error_list, node->location.start, node->location.end, PM_ERR_WRITE_TARGET_READONLY);
14715
+ pm_parser_err_node(parser, node, PM_ERR_WRITE_TARGET_READONLY);
13816
14716
  /* fallthrough */
13817
14717
  case PM_GLOBAL_VARIABLE_READ_NODE: {
13818
14718
  parser_lex(parser);
@@ -13875,7 +14775,7 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t
13875
14775
  pm_constant_id_t constant_id = pm_parser_local_add_location(parser, message_loc.start, message_loc.end);
13876
14776
 
13877
14777
  if (token_is_numbered_parameter(message_loc.start, message_loc.end)) {
13878
- pm_diagnostic_list_append(&parser->error_list, message_loc.start, message_loc.end, PM_ERR_PARAMETER_NUMBERED_RESERVED);
14778
+ pm_parser_err_location(parser, &message_loc, PM_ERR_PARAMETER_NUMBERED_RESERVED);
13879
14779
  }
13880
14780
 
13881
14781
  parser_lex(parser);
@@ -13897,7 +14797,7 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t
13897
14797
  }
13898
14798
  case PM_MULTI_WRITE_NODE: {
13899
14799
  parser_lex(parser);
13900
- pm_diagnostic_list_append(&parser->error_list, token.start, token.end, PM_ERR_OPERATOR_MULTI_ASSIGN);
14800
+ pm_parser_err_token(parser, &token, PM_ERR_OPERATOR_MULTI_ASSIGN);
13901
14801
  return node;
13902
14802
  }
13903
14803
  default:
@@ -13906,7 +14806,7 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t
13906
14806
  // In this case we have an operator but we don't know what it's for.
13907
14807
  // We need to treat it as an error. For now, we'll mark it as an error
13908
14808
  // and just skip right past it.
13909
- pm_diagnostic_list_append(&parser->error_list, parser->previous.start, parser->previous.end, PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR);
14809
+ pm_parser_err_previous(parser, PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR);
13910
14810
  return node;
13911
14811
  }
13912
14812
  }
@@ -14021,7 +14921,7 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t
14021
14921
  break;
14022
14922
  }
14023
14923
  default: {
14024
- pm_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, PM_ERR_DEF_NAME);
14924
+ pm_parser_err_current(parser, PM_ERR_DEF_NAME);
14025
14925
  message = (pm_token_t) { .type = PM_TOKEN_MISSING, .start = parser->previous.end, .end = parser->previous.end };
14026
14926
  }
14027
14927
  }
@@ -14172,7 +15072,7 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t
14172
15072
  return (pm_node_t *) pm_call_node_shorthand_create(parser, node, &delimiter, &arguments);
14173
15073
  }
14174
15074
  default: {
14175
- pm_diagnostic_list_append(&parser->error_list, delimiter.start, delimiter.end, PM_ERR_CONSTANT_PATH_COLON_COLON_CONSTANT);
15075
+ pm_parser_err_token(parser, &delimiter, PM_ERR_CONSTANT_PATH_COLON_COLON_CONSTANT);
14176
15076
  pm_node_t *child = (pm_node_t *) pm_missing_node_create(parser, delimiter.start, delimiter.end);
14177
15077
  return (pm_node_t *)pm_constant_path_node_create(parser, node, &delimiter, child);
14178
15078
  }
@@ -14220,7 +15120,7 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t
14220
15120
 
14221
15121
  if (block != NULL) {
14222
15122
  if (arguments.block != NULL) {
14223
- pm_diagnostic_list_append(&parser->error_list, block->base.location.start, block->base.location.end, PM_ERR_ARGUMENT_AFTER_BLOCK);
15123
+ pm_parser_err_node(parser, (pm_node_t *) block, PM_ERR_ARGUMENT_AFTER_BLOCK);
14224
15124
  if (arguments.arguments == NULL) {
14225
15125
  arguments.arguments = pm_arguments_node_create(parser);
14226
15126
  }
@@ -14283,7 +15183,7 @@ parse_expression(pm_parser_t *parser, pm_binding_power_t binding_power, pm_diagn
14283
15183
  // parse_expression_prefix is going to be a missing node. In that case we need
14284
15184
  // to add the error message to the parser's error list.
14285
15185
  if (PM_NODE_TYPE_P(node, PM_MISSING_NODE)) {
14286
- pm_diagnostic_list_append(&parser->error_list, recovery.end, recovery.end, diag_id);
15186
+ pm_parser_err(parser, recovery.end, recovery.end, diag_id);
14287
15187
  return node;
14288
15188
  }
14289
15189
 
@@ -14428,6 +15328,7 @@ pm_parser_init(pm_parser_t *parser, const uint8_t *source, size_t size, const ch
14428
15328
  .next_start = NULL,
14429
15329
  .heredoc_end = NULL,
14430
15330
  .comment_list = PM_LIST_EMPTY,
15331
+ .magic_comment_list = PM_LIST_EMPTY,
14431
15332
  .warning_list = PM_LIST_EMPTY,
14432
15333
  .error_list = PM_LIST_EMPTY,
14433
15334
  .current_scope = NULL,
@@ -14441,6 +15342,7 @@ pm_parser_init(pm_parser_t *parser, const uint8_t *source, size_t size, const ch
14441
15342
  .constant_pool = PM_CONSTANT_POOL_EMPTY,
14442
15343
  .newline_list = PM_NEWLINE_LIST_EMPTY,
14443
15344
  .integer_base = 0,
15345
+ .current_string = PM_EMPTY_STRING,
14444
15346
  .command_start = true,
14445
15347
  .recovering = false,
14446
15348
  .encoding_changed = false,
@@ -14521,6 +15423,19 @@ pm_comment_list_free(pm_list_t *list) {
14521
15423
  }
14522
15424
  }
14523
15425
 
15426
+ // Free all of the memory associated with the magic comment list: walks the list from list->head and calls free() on every node. The pm_list_t itself is not modified, so its head must not be used again afterwards.
15427
+ static inline void
15428
+ pm_magic_comment_list_free(pm_list_t *list) {
15429
+ pm_list_node_t *node, *next;
15430
+
15431
+ for (node = list->head; node != NULL; node = next) {
15432
+ next = node->next; // read the successor first; node is freed below
15433
+
15434
+ pm_magic_comment_t *magic_comment = (pm_magic_comment_t *) node; // NOTE(review): cast presumes the list node is the first member of pm_magic_comment_t -- confirm against the struct definition
15435
+ free(magic_comment);
15436
+ }
15437
+ }
15438
+
14524
15439
  // Free any memory associated with the given parser.
14525
15440
  PRISM_EXPORTED_FUNCTION void
14526
15441
  pm_parser_free(pm_parser_t *parser) {
@@ -14528,6 +15443,7 @@ pm_parser_free(pm_parser_t *parser) {
14528
15443
  pm_diagnostic_list_free(&parser->error_list);
14529
15444
  pm_diagnostic_list_free(&parser->warning_list);
14530
15445
  pm_comment_list_free(&parser->comment_list);
15446
+ pm_magic_comment_list_free(&parser->magic_comment_list);
14531
15447
  pm_constant_pool_free(&parser->constant_pool);
14532
15448
  pm_newline_list_free(&parser->newline_list);
14533
15449
 
@@ -14578,10 +15494,11 @@ pm_parse_serialize(const uint8_t *source, size_t size, pm_buffer_t *buffer, cons
14578
15494
  pm_parser_free(&parser);
14579
15495
  }
14580
15496
 
14581
- #undef PM_LOCATION_NULL_VALUE
14582
- #undef PM_LOCATION_TOKEN_VALUE
14583
- #undef PM_LOCATION_NODE_VALUE
14584
- #undef PM_LOCATION_NODE_BASE_VALUE
14585
15497
  #undef PM_CASE_KEYWORD
14586
15498
  #undef PM_CASE_OPERATOR
14587
15499
  #undef PM_CASE_WRITABLE
15500
+ #undef PM_EMPTY_STRING
15501
+ #undef PM_LOCATION_NODE_BASE_VALUE
15502
+ #undef PM_LOCATION_NODE_VALUE
15503
+ #undef PM_LOCATION_NULL_VALUE
15504
+ #undef PM_LOCATION_TOKEN_VALUE