prism 0.20.0 → 0.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/src/prism.c CHANGED
@@ -870,6 +870,105 @@ pm_arguments_validate_block(pm_parser_t *parser, pm_arguments_t *arguments, pm_b
870
870
  pm_parser_err_node(parser, (pm_node_t *) block, PM_ERR_ARGUMENT_UNEXPECTED_BLOCK);
871
871
  }
872
872
 
873
+ /******************************************************************************/
874
+ /* Basic character checks */
875
+ /******************************************************************************/
876
+
877
+ /**
878
+ * This function is used extremely frequently to lex all of the identifiers in a
879
+ * source file, so it's important that it be as fast as possible. For this
880
+ * reason we have the encoding_changed boolean to check if we need to go through
881
+ * the function pointer or can just directly use the UTF-8 functions.
+ *
+ * Returns the number of bytes to consume when the character at `b` may start
+ * an identifier, or 0 when it may not:
+ *   - a nonzero result from the alphabetic check is returned as the width,
+ *   - an underscore counts as one byte,
+ *   - a byte >= 0x80 yields whatever width the encoding reports for it
+ *     (which may itself be 0),
+ *   - every other ASCII byte is rejected with 0.
+ *
+ * NOTE(review): `*b` is read without a bounds check, so callers presumably
+ * guarantee b < parser->end -- confirm.
882
+ */
883
+ static inline size_t
884
+ char_is_identifier_start(const pm_parser_t *parser, const uint8_t *b) {
885
+ if (parser->encoding_changed) { // slow path: go through the encoding's function pointers
886
+ size_t width;
887
+ if ((width = parser->encoding->alpha_char(b, parser->end - b)) != 0) {
888
+ return width;
889
+ } else if (*b == '_') {
890
+ return 1;
891
+ } else if (*b >= 0x80) { // non-ASCII byte that alpha_char did not accept
892
+ return parser->encoding->char_width(b, parser->end - b);
893
+ } else {
894
+ return 0;
895
+ }
896
+ } else if (*b < 0x80) { // fast path, ASCII: single table lookup
897
+ return (pm_encoding_unicode_table[*b] & PRISM_ENCODING_ALPHABETIC_BIT ? 1 : 0) || (*b == '_'); // evaluates to 0 or 1
898
+ } else { // fast path, non-ASCII: width as reported by the UTF-8 helper
899
+ return pm_encoding_utf_8_char_width(b, parser->end - b);
900
+ }
901
+ }
902
+
903
+ /**
904
+ * Similar to char_is_identifier but this function assumes that the encoding
905
+ * has not been changed.
+ *
+ * `b` points at the byte to test and `end` bounds the input; `end - b` is the
+ * byte count handed to the UTF-8 width helper.
+ *
+ * Returns 1 for an ASCII underscore or an ASCII byte flagged alphanumeric in
+ * pm_encoding_unicode_table, 0 for any other ASCII byte, and for bytes >= 0x80
+ * whatever width pm_encoding_utf_8_char_width reports.
+ *
+ * NOTE(review): `*b` is read unconditionally; presumably b < end on entry --
+ * confirm against callers.
906
+ */
907
+ static inline size_t
908
+ char_is_identifier_utf8(const uint8_t *b, const uint8_t *end) {
909
+ if (*b < 0x80) {
910
+ return (*b == '_') || (pm_encoding_unicode_table[*b] & PRISM_ENCODING_ALPHANUMERIC_BIT ? 1 : 0); // evaluates to 0 or 1
911
+ } else {
912
+ return pm_encoding_utf_8_char_width(b, end - b);
913
+ }
914
+ }
915
+
916
+ /**
917
+ * Like char_is_identifier_start, this function is also used extremely
918
+ * frequently to lex all of the identifiers in a source file once the first
919
+ * character has been found. So it's important that it be as fast as possible.
+ *
+ * The difference from char_is_identifier_start is that the alphanumeric check
+ * (alnum_char) is used instead of the alphabetic one.
+ *
+ * Returns the number of bytes to consume when the character at `b` may
+ * continue an identifier, or 0 when it may not. When the encoding has not been
+ * changed the decision is delegated to char_is_identifier_utf8.
920
+ */
921
+ static inline size_t
922
+ char_is_identifier(pm_parser_t *parser, const uint8_t *b) {
923
+ if (parser->encoding_changed) { // slow path: go through the encoding's function pointers
924
+ size_t width;
925
+ if ((width = parser->encoding->alnum_char(b, parser->end - b)) != 0) {
926
+ return width;
927
+ } else if (*b == '_') {
928
+ return 1;
929
+ } else if (*b >= 0x80) { // non-ASCII byte that alnum_char did not accept
930
+ return parser->encoding->char_width(b, parser->end - b);
931
+ } else {
932
+ return 0;
933
+ }
934
+ }
935
+ return char_is_identifier_utf8(b, parser->end); // fast path: encoding unchanged
936
+ }
937
+
938
+ // Here we're defining a perfect hash for the characters that are allowed in
939
+ // global names. This is used to quickly check the next character after a $ to
940
+ // see if it's a valid character for a global name.
//
// The table is a bitset split into words: BIT(c, idx) contributes bit
// (c % 32) to word idx only when c / 32 - 1 == idx, so word 0 covers
// 0x20-0x3f, word 1 covers 0x40-0x5f and word 2 covers 0x60-0x7f. PUNCT(idx)
// ORs together the bits of every accepted character that lands in word idx.
// The array length expression (0x7e - 0x20 + 31) / 32 evaluates to 3, which
// matches the three initializers. Both helper macros are undefined again once
// the table has been built.
//
// NOTE(review): 1U << (c % 32) presumes unsigned int is at least 32 bits wide.
941
+ #define BIT(c, idx) (((c) / 32 - 1 == idx) ? (1U << ((c) % 32)) : 0)
942
+ #define PUNCT(idx) ( \
943
+ BIT('~', idx) | BIT('*', idx) | BIT('$', idx) | BIT('?', idx) | \
944
+ BIT('!', idx) | BIT('@', idx) | BIT('/', idx) | BIT('\\', idx) | \
945
+ BIT(';', idx) | BIT(',', idx) | BIT('.', idx) | BIT('=', idx) | \
946
+ BIT(':', idx) | BIT('<', idx) | BIT('>', idx) | BIT('\"', idx) | \
947
+ BIT('&', idx) | BIT('`', idx) | BIT('\'', idx) | BIT('+', idx) | \
948
+ BIT('0', idx))
949
+
950
+ const unsigned int pm_global_name_punctuation_hash[(0x7e - 0x20 + 31) / 32] = { PUNCT(0), PUNCT(1), PUNCT(2) };
951
+
952
+ #undef BIT
953
+ #undef PUNCT
954
+
/**
 * Returns true when the byte `b` has its bit set in
 * pm_global_name_punctuation_hash, i.e. it is one of the punctuation
 * characters (or the digit '0') listed when that table was built. Bytes at or
 * below 0x20 and above 0x7e are rejected before the table is consulted.
 */
955
+ static inline bool
956
+ char_is_global_name_punctuation(const uint8_t b) {
957
+ const unsigned int i = (const unsigned int) b;
958
+ if (i <= 0x20 || 0x7e < i) return false; // only 0x21..0x7e can be present in the table
959
+
960
+ return (pm_global_name_punctuation_hash[(i - 0x20) / 32] >> (i % 32)) & 1; // word = (i - 0x20) / 32, bit = i % 32
961
+ }
962
+
/**
 * Returns true when the token is an identifier that is at least two bytes
 * long and whose final byte is '='.
 */
963
+ static inline bool
964
+ token_is_setter_name(pm_token_t *token) {
965
+ return (
966
+ (token->type == PM_TOKEN_IDENTIFIER) &&
967
+ (token->end - token->start >= 2) && // length check also makes end[-1] safe to read
968
+ (token->end[-1] == '=')
969
+ );
970
+ }
971
+
873
972
  /******************************************************************************/
874
973
  /* Node flag handling functions */
875
974
  /******************************************************************************/
@@ -1923,11 +2022,12 @@ pm_call_node_index_p(pm_call_node_t *node) {
1923
2022
  * operator assignment.
1924
2023
  */
1925
2024
  static inline bool
1926
- pm_call_node_writable_p(pm_call_node_t *node) {
2025
+ pm_call_node_writable_p(const pm_parser_t *parser, const pm_call_node_t *node) {
1927
2026
  return (
1928
2027
  (node->message_loc.start != NULL) &&
1929
2028
  (node->message_loc.end[-1] != '!') &&
1930
2029
  (node->message_loc.end[-1] != '?') &&
2030
+ char_is_identifier_start(parser, node->message_loc.start) &&
1931
2031
  (node->opening_loc.start == NULL) &&
1932
2032
  (node->arguments == NULL) &&
1933
2033
  (node->block == NULL)
@@ -2744,19 +2844,21 @@ pm_constant_write_node_create(pm_parser_t *parser, pm_constant_read_node_t *targ
2744
2844
  * Check if the receiver of a `def` node is allowed.
2745
2845
  */
2746
2846
  static void
2747
- pm_check_def_receiver(pm_parser_t *parser, pm_node_t *receiver) {
2748
- switch (receiver->type) {
2847
+ pm_def_node_receiver_check(pm_parser_t *parser, const pm_node_t *node) {
2848
+ switch (PM_NODE_TYPE(node)) {
2749
2849
  case PM_BEGIN_NODE: {
2750
- pm_begin_node_t *begin_node = (pm_begin_node_t *)receiver;
2751
- pm_check_def_receiver(parser, (pm_node_t *) begin_node->statements);
2850
+ const pm_begin_node_t *cast = (pm_begin_node_t *) node;
2851
+ if (cast->statements != NULL) pm_def_node_receiver_check(parser, (pm_node_t *) cast->statements);
2752
2852
  break;
2753
2853
  }
2754
- case PM_PARENTHESES_NODE:
2755
- pm_check_def_receiver(parser, ((pm_parentheses_node_t *) receiver)->body);
2854
+ case PM_PARENTHESES_NODE: {
2855
+ const pm_parentheses_node_t *cast = (const pm_parentheses_node_t *) node;
2856
+ if (cast->body != NULL) pm_def_node_receiver_check(parser, cast->body);
2756
2857
  break;
2858
+ }
2757
2859
  case PM_STATEMENTS_NODE: {
2758
- pm_statements_node_t *statements_node = (pm_statements_node_t *)receiver;
2759
- pm_check_def_receiver(parser, statements_node->body.nodes[statements_node->body.size - 1]);
2860
+ const pm_statements_node_t *cast = (const pm_statements_node_t *) node;
2861
+ pm_def_node_receiver_check(parser, cast->body.nodes[cast->body.size - 1]);
2760
2862
  break;
2761
2863
  }
2762
2864
  case PM_ARRAY_NODE:
@@ -2775,7 +2877,10 @@ pm_check_def_receiver(pm_parser_t *parser, pm_node_t *receiver) {
2775
2877
  case PM_STRING_NODE:
2776
2878
  case PM_SYMBOL_NODE:
2777
2879
  case PM_X_STRING_NODE:
2778
- pm_parser_err_node(parser, receiver, PM_ERR_SINGLETON_FOR_LITERALS);
2880
+ pm_parser_err_node(parser, node, PM_ERR_SINGLETON_FOR_LITERALS);
2881
+ break;
2882
+ default:
2883
+ break;
2779
2884
  }
2780
2885
  }
2781
2886
 
@@ -2807,7 +2912,7 @@ pm_def_node_create(
2807
2912
  }
2808
2913
 
2809
2914
  if ((receiver != NULL) && PM_NODE_TYPE_P(receiver, PM_PARENTHESES_NODE)) {
2810
- pm_check_def_receiver(parser, receiver);
2915
+ pm_def_node_receiver_check(parser, receiver);
2811
2916
  }
2812
2917
 
2813
2918
  *node = (pm_def_node_t) {
@@ -5330,7 +5435,7 @@ pm_source_file_node_create(pm_parser_t *parser, const pm_token_t *file_keyword)
5330
5435
  .flags = PM_NODE_FLAG_STATIC_LITERAL,
5331
5436
  .location = PM_LOCATION_TOKEN_VALUE(file_keyword),
5332
5437
  },
5333
- .filepath = parser->filepath_string,
5438
+ .filepath = parser->filepath
5334
5439
  };
5335
5440
 
5336
5441
  return node;
@@ -6220,6 +6325,16 @@ pm_parser_local_add_owned(pm_parser_t *parser, const uint8_t *start, size_t leng
6220
6325
  return constant_id;
6221
6326
  }
6222
6327
 
6328
+ /**
6329
+ * Add a local variable from a constant string to the current scope.
+ *
+ * The `length` bytes at `start` are turned into a constant id through
+ * pm_parser_constant_id_constant. A nonzero id is added as a local via
+ * pm_parser_local_add; an id of 0 is not added. The id is returned to the
+ * caller in both cases.
+ *
+ * NOTE(review): presumably `start` must stay valid for as long as the
+ * constant pool is used, since no copy is made here -- confirm.
6330
+ */
6331
+ static pm_constant_id_t
6332
+ pm_parser_local_add_constant(pm_parser_t *parser, const char *start, size_t length) {
6333
+ pm_constant_id_t constant_id = pm_parser_constant_id_constant(parser, start, length);
6334
+ if (constant_id != 0) pm_parser_local_add(parser, constant_id); // skip registration when no id was produced
6335
+ return constant_id;
6336
+ }
6337
+
6223
6338
  /**
6224
6339
  * Add a parameter name to the current scope and check whether the name of the
6225
6340
  * parameter is unique or not.
@@ -6259,105 +6374,6 @@ pm_parser_scope_pop(pm_parser_t *parser) {
6259
6374
  free(scope);
6260
6375
  }
6261
6376
 
6262
- /******************************************************************************/
6263
- /* Basic character checks */
6264
- /******************************************************************************/
6265
-
6266
- /**
6267
- * This function is used extremely frequently to lex all of the identifiers in a
6268
- * source file, so it's important that it be as fast as possible. For this
6269
- * reason we have the encoding_changed boolean to check if we need to go through
6270
- * the function pointer or can just directly use the UTF-8 functions.
6271
- */
6272
- static inline size_t
6273
- char_is_identifier_start(pm_parser_t *parser, const uint8_t *b) {
6274
- if (parser->encoding_changed) {
6275
- size_t width;
6276
- if ((width = parser->encoding->alpha_char(b, parser->end - b)) != 0) {
6277
- return width;
6278
- } else if (*b == '_') {
6279
- return 1;
6280
- } else if (*b >= 0x80) {
6281
- return parser->encoding->char_width(b, parser->end - b);
6282
- } else {
6283
- return 0;
6284
- }
6285
- } else if (*b < 0x80) {
6286
- return (pm_encoding_unicode_table[*b] & PRISM_ENCODING_ALPHABETIC_BIT ? 1 : 0) || (*b == '_');
6287
- } else {
6288
- return pm_encoding_utf_8_char_width(b, parser->end - b);
6289
- }
6290
- }
6291
-
6292
- /**
6293
- * Similar to char_is_identifier but this function assumes that the encoding
6294
- * has not been changed.
6295
- */
6296
- static inline size_t
6297
- char_is_identifier_utf8(const uint8_t *b, const uint8_t *end) {
6298
- if (*b < 0x80) {
6299
- return (*b == '_') || (pm_encoding_unicode_table[*b] & PRISM_ENCODING_ALPHANUMERIC_BIT ? 1 : 0);
6300
- } else {
6301
- return pm_encoding_utf_8_char_width(b, end - b);
6302
- }
6303
- }
6304
-
6305
- /**
6306
- * Like the above, this function is also used extremely frequently to lex all of
6307
- * the identifiers in a source file once the first character has been found. So
6308
- * it's important that it be as fast as possible.
6309
- */
6310
- static inline size_t
6311
- char_is_identifier(pm_parser_t *parser, const uint8_t *b) {
6312
- if (parser->encoding_changed) {
6313
- size_t width;
6314
- if ((width = parser->encoding->alnum_char(b, parser->end - b)) != 0) {
6315
- return width;
6316
- } else if (*b == '_') {
6317
- return 1;
6318
- } else if (*b >= 0x80) {
6319
- return parser->encoding->char_width(b, parser->end - b);
6320
- } else {
6321
- return 0;
6322
- }
6323
- }
6324
- return char_is_identifier_utf8(b, parser->end);
6325
- }
6326
-
6327
- // Here we're defining a perfect hash for the characters that are allowed in
6328
- // global names. This is used to quickly check the next character after a $ to
6329
- // see if it's a valid character for a global name.
6330
- #define BIT(c, idx) (((c) / 32 - 1 == idx) ? (1U << ((c) % 32)) : 0)
6331
- #define PUNCT(idx) ( \
6332
- BIT('~', idx) | BIT('*', idx) | BIT('$', idx) | BIT('?', idx) | \
6333
- BIT('!', idx) | BIT('@', idx) | BIT('/', idx) | BIT('\\', idx) | \
6334
- BIT(';', idx) | BIT(',', idx) | BIT('.', idx) | BIT('=', idx) | \
6335
- BIT(':', idx) | BIT('<', idx) | BIT('>', idx) | BIT('\"', idx) | \
6336
- BIT('&', idx) | BIT('`', idx) | BIT('\'', idx) | BIT('+', idx) | \
6337
- BIT('0', idx))
6338
-
6339
- const unsigned int pm_global_name_punctuation_hash[(0x7e - 0x20 + 31) / 32] = { PUNCT(0), PUNCT(1), PUNCT(2) };
6340
-
6341
- #undef BIT
6342
- #undef PUNCT
6343
-
6344
- static inline bool
6345
- char_is_global_name_punctuation(const uint8_t b) {
6346
- const unsigned int i = (const unsigned int) b;
6347
- if (i <= 0x20 || 0x7e < i) return false;
6348
-
6349
- return (pm_global_name_punctuation_hash[(i - 0x20) / 32] >> (i % 32)) & 1;
6350
- }
6351
-
6352
- static inline bool
6353
- token_is_setter_name(pm_token_t *token) {
6354
- return (
6355
- (token->type == PM_TOKEN_IDENTIFIER) &&
6356
- (token->end - token->start >= 2) &&
6357
- (token->end[-1] == '=')
6358
- );
6359
- }
6360
-
6361
6377
  /******************************************************************************/
6362
6378
  /* Stack helpers */
6363
6379
  /******************************************************************************/
@@ -7673,6 +7689,28 @@ escape_write_byte_encoded(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t byte
7673
7689
  pm_buffer_append_byte(buffer, byte);
7674
7690
  }
7675
7691
 
7692
+ /**
7693
+ * Write each byte of the given escaped character into the buffer.
+ *
+ * The character is read from parser->current.end. Its width comes from the
+ * parser's encoding when the encoding has been changed, and from the UTF-8
+ * helper otherwise. Every byte of the character is passed through
+ * escape_write_byte_encoded, and parser->current.end is advanced past the
+ * character as a side effect.
+ *
+ * NOTE(review): at least one byte is always consumed, so callers presumably
+ * ensure parser->current.end < parser->end first -- confirm.
7694
+ */
7695
+ static inline void
7696
+ escape_write_escape_encoded(pm_parser_t *parser, pm_buffer_t *buffer) {
7697
+ size_t width;
7698
+ if (parser->encoding_changed) {
7699
+ width = parser->encoding->char_width(parser->current.end, parser->end - parser->current.end);
7700
+ } else {
7701
+ width = pm_encoding_utf_8_char_width(parser->current.end, parser->end - parser->current.end);
7702
+ }
7703
+
7704
+ // TODO: If the character is invalid in the given encoding, then we'll just
7705
+ // push one byte into the buffer. This should actually be an error.
7706
+ width = (width == 0) ? 1 : width;
7707
+
7708
+ for (size_t index = 0; index < width; index++) {
7709
+ escape_write_byte_encoded(parser, buffer, *parser->current.end);
7710
+ parser->current.end++; // consume the byte that was just written
7711
+ }
7712
+ }
7713
+
7676
7714
  /**
7677
7715
  * The regular expression engine doesn't support the same escape sequences as
7678
7716
  * Ruby does. So first we have to read the escape sequence, and then we have to
@@ -8011,7 +8049,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) {
8011
8049
  /* fallthrough */
8012
8050
  default: {
8013
8051
  if (parser->current.end < parser->end) {
8014
- escape_write_byte_encoded(parser, buffer, *parser->current.end++);
8052
+ escape_write_escape_encoded(parser, buffer);
8015
8053
  }
8016
8054
  return;
8017
8055
  }
@@ -8288,10 +8326,40 @@ typedef struct {
8288
8326
  * Push the given byte into the token buffer.
8289
8327
  */
8290
8328
  static inline void
8291
- pm_token_buffer_push(pm_token_buffer_t *token_buffer, uint8_t byte) {
8329
+ pm_token_buffer_push_byte(pm_token_buffer_t *token_buffer, uint8_t byte) {
8292
8330
  pm_buffer_append_byte(&token_buffer->buffer, byte);
8293
8331
  }
8294
8332
 
8333
+ /**
8334
+ * Append the given bytes into the token buffer.
+ *
+ * Forwards `length` bytes starting at `bytes` to pm_buffer_append_bytes on
+ * the buffer embedded in `token_buffer`.
8335
+ */
8336
+ static inline void
8337
+ pm_token_buffer_push_bytes(pm_token_buffer_t *token_buffer, const uint8_t *bytes, size_t length) {
8338
+ pm_buffer_append_bytes(&token_buffer->buffer, bytes, length);
8339
+ }
8340
+
8341
+ /**
8342
+ * Push an escaped character into the token buffer.
+ *
+ * The character is read from parser->current.end. Its width comes from the
+ * parser's encoding when the encoding has been changed, and from the UTF-8
+ * helper otherwise. All of its bytes are appended to the token buffer in one
+ * call, and parser->current.end is advanced past the character as a side
+ * effect.
+ *
+ * NOTE(review): at least one byte is always consumed, so callers presumably
+ * ensure parser->current.end < parser->end first -- confirm.
8343
+ */
8344
+ static inline void
8345
+ pm_token_buffer_push_escaped(pm_token_buffer_t *token_buffer, pm_parser_t *parser) {
8346
+ // First, determine the width of the character to be escaped.
8347
+ size_t width;
8348
+ if (parser->encoding_changed) {
8349
+ width = parser->encoding->char_width(parser->current.end, parser->end - parser->current.end);
8350
+ } else {
8351
+ width = pm_encoding_utf_8_char_width(parser->current.end, parser->end - parser->current.end);
8352
+ }
8353
+
8354
+ // TODO: If the character is invalid in the given encoding, then we'll just
8355
+ // push one byte into the buffer. This should actually be an error.
8356
+ width = (width == 0 ? 1 : width);
8357
+
8358
+ // Now, push the bytes into the buffer.
8359
+ pm_token_buffer_push_bytes(token_buffer, parser->current.end, width);
8360
+ parser->current.end += width; // consume the character that was just copied
8361
+ }
8362
+
8295
8363
  /**
8296
8364
  * When we're about to return from lexing the current token and we know for sure
8297
8365
  * that we have found an escape sequence, this function is called to copy the
@@ -9522,11 +9590,21 @@ parser_lex(pm_parser_t *parser) {
9522
9590
  if (*parser->current.start != '_') {
9523
9591
  size_t width = char_is_identifier_start(parser, parser->current.start);
9524
9592
 
9525
- // If this isn't the beginning of an identifier, then it's an invalid
9526
- // token as we've exhausted all of the other options. We'll skip past
9527
- // it and return the next token.
9593
+ // If this isn't the beginning of an identifier, then
9594
+ // it's an invalid token as we've exhausted all of the
9595
+ // other options. We'll skip past it and return the next
9596
+ // token after adding an appropriate error message.
9528
9597
  if (!width) {
9529
- pm_parser_err_current(parser, PM_ERR_INVALID_TOKEN);
9598
+ pm_diagnostic_id_t diag_id;
9599
+ if (*parser->current.start >= 0x80) {
9600
+ diag_id = PM_ERR_INVALID_MULTIBYTE_CHARACTER;
9601
+ } else if (char_is_ascii_printable(*parser->current.start) || (*parser->current.start == '\\')) {
9602
+ diag_id = PM_ERR_INVALID_PRINTABLE_CHARACTER;
9603
+ } else {
9604
+ diag_id = PM_ERR_INVALID_CHARACTER;
9605
+ }
9606
+
9607
+ PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, diag_id, *parser->current.start);
9530
9608
  goto lex_next_token;
9531
9609
  }
9532
9610
 
@@ -9704,18 +9782,18 @@ parser_lex(pm_parser_t *parser) {
9704
9782
  case '\t':
9705
9783
  case '\v':
9706
9784
  case '\\':
9707
- pm_token_buffer_push(&token_buffer, peeked);
9785
+ pm_token_buffer_push_byte(&token_buffer, peeked);
9708
9786
  parser->current.end++;
9709
9787
  break;
9710
9788
  case '\r':
9711
9789
  parser->current.end++;
9712
9790
  if (peek(parser) != '\n') {
9713
- pm_token_buffer_push(&token_buffer, '\r');
9791
+ pm_token_buffer_push_byte(&token_buffer, '\r');
9714
9792
  break;
9715
9793
  }
9716
9794
  /* fallthrough */
9717
9795
  case '\n':
9718
- pm_token_buffer_push(&token_buffer, '\n');
9796
+ pm_token_buffer_push_byte(&token_buffer, '\n');
9719
9797
 
9720
9798
  if (parser->heredoc_end) {
9721
9799
  // ... if we are on the same line as a heredoc,
@@ -9733,14 +9811,13 @@ parser_lex(pm_parser_t *parser) {
9733
9811
  break;
9734
9812
  default:
9735
9813
  if (peeked == lex_mode->as.list.incrementor || peeked == lex_mode->as.list.terminator) {
9736
- pm_token_buffer_push(&token_buffer, peeked);
9814
+ pm_token_buffer_push_byte(&token_buffer, peeked);
9737
9815
  parser->current.end++;
9738
9816
  } else if (lex_mode->as.list.interpolation) {
9739
9817
  escape_read(parser, &token_buffer.buffer, PM_ESCAPE_FLAG_NONE);
9740
9818
  } else {
9741
- pm_token_buffer_push(&token_buffer, '\\');
9742
- pm_token_buffer_push(&token_buffer, peeked);
9743
- parser->current.end++;
9819
+ pm_token_buffer_push_byte(&token_buffer, '\\');
9820
+ pm_token_buffer_push_escaped(&token_buffer, parser);
9744
9821
  }
9745
9822
 
9746
9823
  break;
@@ -9898,9 +9975,9 @@ parser_lex(pm_parser_t *parser) {
9898
9975
  parser->current.end++;
9899
9976
  if (peek(parser) != '\n') {
9900
9977
  if (lex_mode->as.regexp.terminator != '\r') {
9901
- pm_token_buffer_push(&token_buffer, '\\');
9978
+ pm_token_buffer_push_byte(&token_buffer, '\\');
9902
9979
  }
9903
- pm_token_buffer_push(&token_buffer, '\r');
9980
+ pm_token_buffer_push_byte(&token_buffer, '\r');
9904
9981
  break;
9905
9982
  }
9906
9983
  /* fallthrough */
@@ -9935,20 +10012,19 @@ parser_lex(pm_parser_t *parser) {
9935
10012
  case '$': case ')': case '*': case '+':
9936
10013
  case '.': case '>': case '?': case ']':
9937
10014
  case '^': case '|': case '}':
9938
- pm_token_buffer_push(&token_buffer, '\\');
10015
+ pm_token_buffer_push_byte(&token_buffer, '\\');
9939
10016
  break;
9940
10017
  default:
9941
10018
  break;
9942
10019
  }
9943
10020
 
9944
- pm_token_buffer_push(&token_buffer, peeked);
10021
+ pm_token_buffer_push_byte(&token_buffer, peeked);
9945
10022
  parser->current.end++;
9946
10023
  break;
9947
10024
  }
9948
10025
 
9949
- if (peeked < 0x80) pm_token_buffer_push(&token_buffer, '\\');
9950
- pm_token_buffer_push(&token_buffer, peeked);
9951
- parser->current.end++;
10026
+ if (peeked < 0x80) pm_token_buffer_push_byte(&token_buffer, '\\');
10027
+ pm_token_buffer_push_escaped(&token_buffer, parser);
9952
10028
  break;
9953
10029
  }
9954
10030
 
@@ -10115,23 +10191,23 @@ parser_lex(pm_parser_t *parser) {
10115
10191
 
10116
10192
  switch (peeked) {
10117
10193
  case '\\':
10118
- pm_token_buffer_push(&token_buffer, '\\');
10194
+ pm_token_buffer_push_byte(&token_buffer, '\\');
10119
10195
  parser->current.end++;
10120
10196
  break;
10121
10197
  case '\r':
10122
10198
  parser->current.end++;
10123
10199
  if (peek(parser) != '\n') {
10124
10200
  if (!lex_mode->as.string.interpolation) {
10125
- pm_token_buffer_push(&token_buffer, '\\');
10201
+ pm_token_buffer_push_byte(&token_buffer, '\\');
10126
10202
  }
10127
- pm_token_buffer_push(&token_buffer, '\r');
10203
+ pm_token_buffer_push_byte(&token_buffer, '\r');
10128
10204
  break;
10129
10205
  }
10130
10206
  /* fallthrough */
10131
10207
  case '\n':
10132
10208
  if (!lex_mode->as.string.interpolation) {
10133
- pm_token_buffer_push(&token_buffer, '\\');
10134
- pm_token_buffer_push(&token_buffer, '\n');
10209
+ pm_token_buffer_push_byte(&token_buffer, '\\');
10210
+ pm_token_buffer_push_byte(&token_buffer, '\n');
10135
10211
  }
10136
10212
 
10137
10213
  if (parser->heredoc_end) {
@@ -10150,17 +10226,16 @@ parser_lex(pm_parser_t *parser) {
10150
10226
  break;
10151
10227
  default:
10152
10228
  if (lex_mode->as.string.incrementor != '\0' && peeked == lex_mode->as.string.incrementor) {
10153
- pm_token_buffer_push(&token_buffer, peeked);
10229
+ pm_token_buffer_push_byte(&token_buffer, peeked);
10154
10230
  parser->current.end++;
10155
10231
  } else if (lex_mode->as.string.terminator != '\0' && peeked == lex_mode->as.string.terminator) {
10156
- pm_token_buffer_push(&token_buffer, peeked);
10232
+ pm_token_buffer_push_byte(&token_buffer, peeked);
10157
10233
  parser->current.end++;
10158
10234
  } else if (lex_mode->as.string.interpolation) {
10159
10235
  escape_read(parser, &token_buffer.buffer, PM_ESCAPE_FLAG_NONE);
10160
10236
  } else {
10161
- pm_token_buffer_push(&token_buffer, '\\');
10162
- pm_token_buffer_push(&token_buffer, peeked);
10163
- parser->current.end++;
10237
+ pm_token_buffer_push_byte(&token_buffer, '\\');
10238
+ pm_token_buffer_push_escaped(&token_buffer, parser);
10164
10239
  }
10165
10240
 
10166
10241
  break;
@@ -10417,21 +10492,20 @@ parser_lex(pm_parser_t *parser) {
10417
10492
  case '\r':
10418
10493
  parser->current.end++;
10419
10494
  if (peek(parser) != '\n') {
10420
- pm_token_buffer_push(&token_buffer, '\\');
10421
- pm_token_buffer_push(&token_buffer, '\r');
10495
+ pm_token_buffer_push_byte(&token_buffer, '\\');
10496
+ pm_token_buffer_push_byte(&token_buffer, '\r');
10422
10497
  break;
10423
10498
  }
10424
10499
  /* fallthrough */
10425
10500
  case '\n':
10426
- pm_token_buffer_push(&token_buffer, '\\');
10427
- pm_token_buffer_push(&token_buffer, '\n');
10501
+ pm_token_buffer_push_byte(&token_buffer, '\\');
10502
+ pm_token_buffer_push_byte(&token_buffer, '\n');
10428
10503
  token_buffer.cursor = parser->current.end + 1;
10429
10504
  breakpoint = parser->current.end;
10430
10505
  continue;
10431
10506
  default:
10432
- parser->current.end++;
10433
- pm_token_buffer_push(&token_buffer, '\\');
10434
- pm_token_buffer_push(&token_buffer, peeked);
10507
+ pm_token_buffer_push_byte(&token_buffer, '\\');
10508
+ pm_token_buffer_push_escaped(&token_buffer, parser);
10435
10509
  break;
10436
10510
  }
10437
10511
  } else {
@@ -10439,7 +10513,7 @@ parser_lex(pm_parser_t *parser) {
10439
10513
  case '\r':
10440
10514
  parser->current.end++;
10441
10515
  if (peek(parser) != '\n') {
10442
- pm_token_buffer_push(&token_buffer, '\r');
10516
+ pm_token_buffer_push_byte(&token_buffer, '\r');
10443
10517
  break;
10444
10518
  }
10445
10519
  /* fallthrough */
@@ -10715,14 +10789,6 @@ match4(const pm_parser_t *parser, pm_token_type_t type1, pm_token_type_t type2,
10715
10789
  return match1(parser, type1) || match1(parser, type2) || match1(parser, type3) || match1(parser, type4);
10716
10790
  }
10717
10791
 
10718
- /**
10719
- * Returns true if the current token is any of the five given types.
10720
- */
10721
- static inline bool
10722
- match5(const pm_parser_t *parser, pm_token_type_t type1, pm_token_type_t type2, pm_token_type_t type3, pm_token_type_t type4, pm_token_type_t type5) {
10723
- return match1(parser, type1) || match1(parser, type2) || match1(parser, type3) || match1(parser, type4) || match1(parser, type5);
10724
- }
10725
-
10726
10792
  /**
10727
10793
  * Returns true if the current token is any of the six given types.
10728
10794
  */
@@ -11359,7 +11425,7 @@ parse_statements(pm_parser_t *parser, pm_context_t context) {
11359
11425
  break;
11360
11426
  }
11361
11427
 
11362
- // If we have a terminator, then we will parse all consequtive terminators
11428
+ // If we have a terminator, then we will parse all consecutive terminators
11363
11429
  // and then continue parsing the statements list.
11364
11430
  if (accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON)) {
11365
11431
  // If we have a terminator, then we will continue parsing the statements
@@ -12321,25 +12387,10 @@ parse_rescues(pm_parser_t *parser, pm_begin_node_t *parent_node, bool def_p) {
12321
12387
  }
12322
12388
 
12323
12389
  static inline pm_begin_node_t *
12324
- parse_rescues_as_begin(pm_parser_t *parser, pm_statements_node_t *statements, bool def_p) {
12390
+ parse_rescues_as_begin(pm_parser_t *parser, const uint8_t *start, pm_statements_node_t *statements, bool def_p) {
12325
12391
  pm_token_t no_begin_token = not_provided(parser);
12326
12392
  pm_begin_node_t *begin_node = pm_begin_node_create(parser, &no_begin_token, statements);
12327
12393
  parse_rescues(parser, begin_node, def_p);
12328
-
12329
- // All nodes within a begin node are optional, so we look
12330
- // for the earliest possible node that we can use to set
12331
- // the BeginNode's start location
12332
- const uint8_t *start = begin_node->base.location.start;
12333
- if (begin_node->statements) {
12334
- start = begin_node->statements->base.location.start;
12335
- } else if (begin_node->rescue_clause) {
12336
- start = begin_node->rescue_clause->base.location.start;
12337
- } else if (begin_node->else_clause) {
12338
- start = begin_node->else_clause->base.location.start;
12339
- } else if (begin_node->ensure_clause) {
12340
- start = begin_node->ensure_clause->base.location.start;
12341
- }
12342
-
12343
12394
  begin_node->base.location.start = start;
12344
12395
  return begin_node;
12345
12396
  }
@@ -12434,7 +12485,7 @@ parse_block(pm_parser_t *parser) {
12434
12485
 
12435
12486
  if (match2(parser, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ENSURE)) {
12436
12487
  assert(statements == NULL || PM_NODE_TYPE_P(statements, PM_STATEMENTS_NODE));
12437
- statements = (pm_node_t *) parse_rescues_as_begin(parser, (pm_statements_node_t *) statements, false);
12488
+ statements = (pm_node_t *) parse_rescues_as_begin(parser, opening.start, (pm_statements_node_t *) statements, false);
12438
12489
  }
12439
12490
  }
12440
12491
 
@@ -13149,6 +13200,15 @@ outer_scope_using_numbered_parameters_p(pm_parser_t *parser) {
13149
13200
  return false;
13150
13201
  }
13151
13202
 
13203
+ /**
13204
+ * These are the names of the various numbered parameters. We have them here so
13205
+ * that when we insert them into the constant pool we can use a constant string
13206
+ * and not have to allocate.
+ *
+ * The entry at index N - 1 is the name of numbered parameter N, so the table
+ * holds nine names ("_1" through "_9"), each two characters long.
13207
+ */
13208
+ static const char * const pm_numbered_parameter_names[] = {
13209
+ "_1", "_2", "_3", "_4", "_5", "_6", "_7", "_8", "_9"
13210
+ };
13211
+
13152
13212
  /**
13153
13213
  * Parse an identifier into a local variable read. If the local variable
13154
13214
  * is not found, it returns NULL instead.
@@ -13171,12 +13231,10 @@ parse_variable(pm_parser_t *parser) {
13171
13231
  pm_parser_err_previous(parser, PM_ERR_NUMBERED_PARAMETER_OUTER_SCOPE);
13172
13232
  } else {
13173
13233
  // Indicate that this scope is using numbered params so that child
13174
- // scopes cannot.
13175
- uint8_t number = parser->previous.start[1];
13176
-
13177
- // We subtract the value for the character '0' to get the actual
13178
- // integer value of the number (only _1 through _9 are valid)
13179
- uint8_t numbered_parameters = (uint8_t) (number - '0');
13234
+ // scopes cannot. We subtract the value for the character '0' to get
13235
+ // the actual integer value of the number (only _1 through _9 are
13236
+ // valid).
13237
+ uint8_t numbered_parameters = (uint8_t) (parser->previous.start[1] - '0');
13180
13238
  if (numbered_parameters > parser->current_scope->numbered_parameters) {
13181
13239
  parser->current_scope->numbered_parameters = numbered_parameters;
13182
13240
  pm_parser_numbered_parameters_set(parser, numbered_parameters);
@@ -13187,21 +13245,13 @@ parse_variable(pm_parser_t *parser) {
13187
13245
  // referencing _2 means that _1 must exist. Therefore here we
13188
13246
  // loop through all of the possibilities and add them into the
13189
13247
  // constant pool.
13190
- uint8_t current = '1';
13191
- uint8_t *value;
13192
-
13193
- while (current < number) {
13194
- value = malloc(2);
13195
- value[0] = '_';
13196
- value[1] = current++;
13197
- pm_parser_local_add_owned(parser, value, 2);
13248
+ for (uint8_t numbered_parameter = 1; numbered_parameter <= numbered_parameters - 1; numbered_parameter++) {
13249
+ pm_parser_local_add_constant(parser, pm_numbered_parameter_names[numbered_parameter - 1], 2);
13198
13250
  }
13199
13251
 
13200
- // Now we can add the actual token that is being used. For
13201
- // this one we can add a shared version since it is directly
13202
- // referenced in the source.
13203
- pm_parser_local_add_token(parser, &parser->previous);
13204
- return pm_local_variable_read_node_create(parser, &parser->previous, 0);
13252
+ // Finally we can create the local variable read node.
13253
+ pm_constant_id_t name_id = pm_parser_local_add_constant(parser, pm_numbered_parameter_names[numbered_parameters - 1], 2);
13254
+ return pm_local_variable_read_node_create_constant_id(parser, &parser->previous, name_id, 0);
13205
13255
  }
13206
13256
  }
13207
13257
 
@@ -14010,7 +14060,7 @@ parse_pattern(pm_parser_t *parser, bool top_pattern, pm_diagnostic_id_t diag_id)
14010
14060
  // Gather up all of the patterns into the list.
14011
14061
  while (accept1(parser, PM_TOKEN_COMMA)) {
14012
14062
  // Break early here in case we have a trailing comma.
14013
- if (match5(parser, PM_TOKEN_KEYWORD_THEN, PM_TOKEN_BRACE_RIGHT, PM_TOKEN_BRACKET_RIGHT, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON)) {
14063
+ if (match6(parser, PM_TOKEN_KEYWORD_THEN, PM_TOKEN_BRACE_RIGHT, PM_TOKEN_BRACKET_RIGHT, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON, PM_TOKEN_EOF)) {
14014
14064
  node = (pm_node_t *) pm_implicit_rest_node_create(parser, &parser->previous);
14015
14065
  pm_node_list_append(&nodes, node);
14016
14066
  break;
@@ -15235,7 +15285,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
15235
15285
 
15236
15286
  if (match2(parser, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ENSURE)) {
15237
15287
  assert(statements == NULL || PM_NODE_TYPE_P(statements, PM_STATEMENTS_NODE));
15238
- statements = (pm_node_t *) parse_rescues_as_begin(parser, (pm_statements_node_t *) statements, false);
15288
+ statements = (pm_node_t *) parse_rescues_as_begin(parser, class_keyword.start, (pm_statements_node_t *) statements, false);
15239
15289
  }
15240
15290
 
15241
15291
  expect1(parser, PM_TOKEN_KEYWORD_END, PM_ERR_CLASS_TERM);
@@ -15288,7 +15338,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
15288
15338
 
15289
15339
  if (match2(parser, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ENSURE)) {
15290
15340
  assert(statements == NULL || PM_NODE_TYPE_P(statements, PM_STATEMENTS_NODE));
15291
- statements = (pm_node_t *) parse_rescues_as_begin(parser, (pm_statements_node_t *) statements, false);
15341
+ statements = (pm_node_t *) parse_rescues_as_begin(parser, class_keyword.start, (pm_statements_node_t *) statements, false);
15292
15342
  }
15293
15343
 
15294
15344
  expect1(parser, PM_TOKEN_KEYWORD_END, PM_ERR_CLASS_TERM);
@@ -15557,7 +15607,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
15557
15607
 
15558
15608
  if (match2(parser, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ENSURE)) {
15559
15609
  assert(statements == NULL || PM_NODE_TYPE_P(statements, PM_STATEMENTS_NODE));
15560
- statements = (pm_node_t *) parse_rescues_as_begin(parser, (pm_statements_node_t *) statements, true);
15610
+ statements = (pm_node_t *) parse_rescues_as_begin(parser, def_keyword.start, (pm_statements_node_t *) statements, true);
15561
15611
  }
15562
15612
 
15563
15613
  pm_accepts_block_stack_pop(parser);
@@ -15817,7 +15867,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
15817
15867
 
15818
15868
  if (match2(parser, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ENSURE)) {
15819
15869
  assert(statements == NULL || PM_NODE_TYPE_P(statements, PM_STATEMENTS_NODE));
15820
- statements = (pm_node_t *) parse_rescues_as_begin(parser, (pm_statements_node_t *) statements, false);
15870
+ statements = (pm_node_t *) parse_rescues_as_begin(parser, module_keyword.start, (pm_statements_node_t *) statements, false);
15821
15871
  }
15822
15872
 
15823
15873
  pm_constant_id_list_t locals = parser->current_scope->locals;
@@ -16550,7 +16600,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
16550
16600
 
16551
16601
  if (match2(parser, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ENSURE)) {
16552
16602
  assert(body == NULL || PM_NODE_TYPE_P(body, PM_STATEMENTS_NODE));
16553
- body = (pm_node_t *) parse_rescues_as_begin(parser, (pm_statements_node_t *) body, false);
16603
+ body = (pm_node_t *) parse_rescues_as_begin(parser, opening.start, (pm_statements_node_t *) body, false);
16554
16604
  }
16555
16605
 
16556
16606
  expect1(parser, PM_TOKEN_KEYWORD_END, PM_ERR_LAMBDA_TERM_END);
@@ -16927,7 +16977,7 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t
16927
16977
  }
16928
16978
 
16929
16979
  // If this node cannot be writable, then we have an error.
16930
- if (pm_call_node_writable_p(cast)) {
16980
+ if (pm_call_node_writable_p(parser, cast)) {
16931
16981
  parse_write_name(parser, &cast->name);
16932
16982
  } else {
16933
16983
  pm_parser_err_node(parser, node, PM_ERR_WRITE_TARGET_UNEXPECTED);
@@ -17038,7 +17088,7 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t
17038
17088
  }
17039
17089
 
17040
17090
  // If this node cannot be writable, then we have an error.
17041
- if (pm_call_node_writable_p(cast)) {
17091
+ if (pm_call_node_writable_p(parser, cast)) {
17042
17092
  parse_write_name(parser, &cast->name);
17043
17093
  } else {
17044
17094
  pm_parser_err_node(parser, node, PM_ERR_WRITE_TARGET_UNEXPECTED);
@@ -17159,7 +17209,7 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t
17159
17209
  }
17160
17210
 
17161
17211
  // If this node cannot be writable, then we have an error.
17162
- if (pm_call_node_writable_p(cast)) {
17212
+ if (pm_call_node_writable_p(parser, cast)) {
17163
17213
  parse_write_name(parser, &cast->name);
17164
17214
  } else {
17165
17215
  pm_parser_err_node(parser, node, PM_ERR_WRITE_TARGET_UNEXPECTED);
@@ -17751,7 +17801,7 @@ pm_parser_init(pm_parser_t *parser, const uint8_t *source, size_t size, const pm
17751
17801
  .encoding_changed_callback = NULL,
17752
17802
  .encoding_comment_start = source,
17753
17803
  .lex_callback = NULL,
17754
- .filepath_string = { 0 },
17804
+ .filepath = { 0 },
17755
17805
  .constant_pool = { 0 },
17756
17806
  .newline_list = { 0 },
17757
17807
  .integer_base = 0,
@@ -17794,7 +17844,7 @@ pm_parser_init(pm_parser_t *parser, const uint8_t *source, size_t size, const pm
17794
17844
  // If options were provided to this parse, establish them here.
17795
17845
  if (options != NULL) {
17796
17846
  // filepath option
17797
- parser->filepath_string = options->filepath;
17847
+ parser->filepath = options->filepath;
17798
17848
 
17799
17849
  // line option
17800
17850
  parser->start_line = options->line;
@@ -17896,7 +17946,7 @@ pm_magic_comment_list_free(pm_list_t *list) {
17896
17946
  */
17897
17947
  PRISM_EXPORTED_FUNCTION void
17898
17948
  pm_parser_free(pm_parser_t *parser) {
17899
- pm_string_free(&parser->filepath_string);
17949
+ pm_string_free(&parser->filepath);
17900
17950
  pm_diagnostic_list_free(&parser->error_list);
17901
17951
  pm_diagnostic_list_free(&parser->warning_list);
17902
17952
  pm_comment_list_free(&parser->comment_list);
@@ -18060,7 +18110,9 @@ pm_parser_errors_format_sort(const pm_list_t *error_list, const pm_newline_list_
18060
18110
 
18061
18111
  // Now we're going to shift all of the errors after this one down one
18062
18112
  // index to make room for the new error.
18063
- memcpy(&errors[index + 1], &errors[index], sizeof(pm_error_t) * (error_list->size - index - 1));
18113
+ if (index + 1 < error_list->size) {
18114
+ memmove(&errors[index + 1], &errors[index], sizeof(pm_error_t) * (error_list->size - index - 1));
18115
+ }
18064
18116
 
18065
18117
  // Finally, we'll insert the error into the array.
18066
18118
  uint32_t column_end;