prism 0.29.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +77 -1
  3. data/CONTRIBUTING.md +0 -4
  4. data/README.md +4 -0
  5. data/config.yml +498 -145
  6. data/docs/fuzzing.md +1 -1
  7. data/docs/parsing_rules.md +4 -1
  8. data/docs/ripper_translation.md +22 -0
  9. data/docs/serialization.md +3 -0
  10. data/ext/prism/api_node.c +2858 -2082
  11. data/ext/prism/extconf.rb +1 -1
  12. data/ext/prism/extension.c +203 -421
  13. data/ext/prism/extension.h +2 -2
  14. data/include/prism/ast.h +1732 -453
  15. data/include/prism/defines.h +36 -0
  16. data/include/prism/diagnostic.h +23 -6
  17. data/include/prism/node.h +0 -21
  18. data/include/prism/options.h +94 -3
  19. data/include/prism/parser.h +57 -28
  20. data/include/prism/regexp.h +18 -8
  21. data/include/prism/static_literals.h +3 -2
  22. data/include/prism/util/pm_char.h +1 -2
  23. data/include/prism/util/pm_constant_pool.h +0 -8
  24. data/include/prism/util/pm_integer.h +22 -15
  25. data/include/prism/util/pm_newline_list.h +11 -0
  26. data/include/prism/util/pm_string.h +28 -12
  27. data/include/prism/version.h +3 -3
  28. data/include/prism.h +0 -11
  29. data/lib/prism/compiler.rb +3 -0
  30. data/lib/prism/desugar_compiler.rb +111 -74
  31. data/lib/prism/dispatcher.rb +16 -1
  32. data/lib/prism/dot_visitor.rb +45 -34
  33. data/lib/prism/dsl.rb +660 -468
  34. data/lib/prism/ffi.rb +64 -6
  35. data/lib/prism/inspect_visitor.rb +294 -64
  36. data/lib/prism/lex_compat.rb +1 -1
  37. data/lib/prism/mutation_compiler.rb +11 -6
  38. data/lib/prism/node.rb +2469 -4973
  39. data/lib/prism/node_ext.rb +91 -14
  40. data/lib/prism/parse_result/comments.rb +0 -7
  41. data/lib/prism/parse_result/errors.rb +65 -0
  42. data/lib/prism/parse_result/newlines.rb +101 -11
  43. data/lib/prism/parse_result.rb +43 -3
  44. data/lib/prism/reflection.rb +10 -8
  45. data/lib/prism/serialize.rb +484 -609
  46. data/lib/prism/translation/parser/compiler.rb +152 -132
  47. data/lib/prism/translation/parser/lexer.rb +26 -4
  48. data/lib/prism/translation/parser.rb +9 -4
  49. data/lib/prism/translation/ripper.rb +22 -20
  50. data/lib/prism/translation/ruby_parser.rb +73 -13
  51. data/lib/prism/visitor.rb +3 -0
  52. data/lib/prism.rb +0 -4
  53. data/prism.gemspec +3 -5
  54. data/rbi/prism/dsl.rbi +521 -0
  55. data/rbi/prism/node.rbi +744 -4837
  56. data/rbi/prism/visitor.rbi +3 -0
  57. data/rbi/prism.rbi +36 -30
  58. data/sig/prism/dsl.rbs +190 -303
  59. data/sig/prism/mutation_compiler.rbs +1 -0
  60. data/sig/prism/node.rbs +759 -628
  61. data/sig/prism/parse_result.rbs +2 -0
  62. data/sig/prism/visitor.rbs +1 -0
  63. data/sig/prism.rbs +103 -64
  64. data/src/diagnostic.c +62 -28
  65. data/src/node.c +499 -1754
  66. data/src/options.c +76 -27
  67. data/src/prettyprint.c +156 -112
  68. data/src/prism.c +2773 -2081
  69. data/src/regexp.c +202 -69
  70. data/src/serialize.c +170 -50
  71. data/src/static_literals.c +63 -84
  72. data/src/token_type.c +4 -4
  73. data/src/util/pm_constant_pool.c +0 -8
  74. data/src/util/pm_integer.c +53 -25
  75. data/src/util/pm_newline_list.c +29 -0
  76. data/src/util/pm_string.c +130 -80
  77. data/src/util/pm_strpbrk.c +32 -6
  78. metadata +4 -6
  79. data/include/prism/util/pm_string_list.h +0 -44
  80. data/lib/prism/debug.rb +0 -249
  81. data/lib/prism/translation/parser/rubocop.rb +0 -73
  82. data/src/util/pm_string_list.c +0 -28
@@ -21,38 +21,35 @@ VALUE rb_cPrismParseError;
21
21
  VALUE rb_cPrismParseWarning;
22
22
  VALUE rb_cPrismResult;
23
23
  VALUE rb_cPrismParseResult;
24
+ VALUE rb_cPrismLexResult;
24
25
  VALUE rb_cPrismParseLexResult;
25
26
 
26
27
  VALUE rb_cPrismDebugEncoding;
27
28
 
28
- ID rb_option_id_command_line;
29
- ID rb_option_id_encoding;
30
- ID rb_option_id_filepath;
31
- ID rb_option_id_frozen_string_literal;
32
- ID rb_option_id_line;
33
- ID rb_option_id_scopes;
34
- ID rb_option_id_version;
35
- ID rb_prism_source_id_for;
29
+ ID rb_id_option_command_line;
30
+ ID rb_id_option_encoding;
31
+ ID rb_id_option_filepath;
32
+ ID rb_id_option_frozen_string_literal;
33
+ ID rb_id_option_line;
34
+ ID rb_id_option_main_script;
35
+ ID rb_id_option_partial_script;
36
+ ID rb_id_option_scopes;
37
+ ID rb_id_option_version;
38
+ ID rb_id_source_for;
36
39
 
37
40
  /******************************************************************************/
38
41
  /* IO of Ruby code */
39
42
  /******************************************************************************/
40
43
 
41
44
  /**
42
- * Check if the given VALUE is a string. If it's nil, then return NULL. If it's
43
- * not a string, then raise a type error. Otherwise return the VALUE as a C
44
- * string.
45
+ * Check if the given VALUE is a string. If it's not a string, then raise a
46
+ * TypeError. Otherwise return the VALUE as a C string.
45
47
  */
46
48
  static const char *
47
49
  check_string(VALUE value) {
48
- // If the value is nil, then we don't need to do anything.
49
- if (NIL_P(value)) {
50
- return NULL;
51
- }
52
-
53
50
  // Check if the value is a string. If it's not, then raise a type error.
54
51
  if (!RB_TYPE_P(value, T_STRING)) {
55
- rb_raise(rb_eTypeError, "wrong argument type %"PRIsVALUE" (expected String)", rb_obj_class(value));
52
+ rb_raise(rb_eTypeError, "wrong argument type %" PRIsVALUE " (expected String)", rb_obj_class(value));
56
53
  }
57
54
 
58
55
  // Otherwise, return the value as a C string.
@@ -66,7 +63,7 @@ static void
66
63
  input_load_string(pm_string_t *input, VALUE string) {
67
64
  // Check if the string is a string. If it's not, then raise a type error.
68
65
  if (!RB_TYPE_P(string, T_STRING)) {
69
- rb_raise(rb_eTypeError, "wrong argument type %"PRIsVALUE" (expected String)", rb_obj_class(string));
66
+ rb_raise(rb_eTypeError, "wrong argument type %" PRIsVALUE " (expected String)", rb_obj_class(string));
70
67
  }
71
68
 
72
69
  pm_string_constant_init(input, RSTRING_PTR(string), RSTRING_LEN(string));
@@ -135,15 +132,21 @@ build_options_i(VALUE key, VALUE value, VALUE argument) {
135
132
  pm_options_t *options = (pm_options_t *) argument;
136
133
  ID key_id = SYM2ID(key);
137
134
 
138
- if (key_id == rb_option_id_filepath) {
135
+ if (key_id == rb_id_option_filepath) {
139
136
  if (!NIL_P(value)) pm_options_filepath_set(options, check_string(value));
140
- } else if (key_id == rb_option_id_encoding) {
141
- if (!NIL_P(value)) pm_options_encoding_set(options, rb_enc_name(rb_to_encoding(value)));
142
- } else if (key_id == rb_option_id_line) {
137
+ } else if (key_id == rb_id_option_encoding) {
138
+ if (!NIL_P(value)) {
139
+ if (value == Qfalse) {
140
+ pm_options_encoding_locked_set(options, true);
141
+ } else {
142
+ pm_options_encoding_set(options, rb_enc_name(rb_to_encoding(value)));
143
+ }
144
+ }
145
+ } else if (key_id == rb_id_option_line) {
143
146
  if (!NIL_P(value)) pm_options_line_set(options, NUM2INT(value));
144
- } else if (key_id == rb_option_id_frozen_string_literal) {
147
+ } else if (key_id == rb_id_option_frozen_string_literal) {
145
148
  if (!NIL_P(value)) pm_options_frozen_string_literal_set(options, RTEST(value));
146
- } else if (key_id == rb_option_id_version) {
149
+ } else if (key_id == rb_id_option_version) {
147
150
  if (!NIL_P(value)) {
148
151
  const char *version = check_string(value);
149
152
 
@@ -151,9 +154,9 @@ build_options_i(VALUE key, VALUE value, VALUE argument) {
151
154
  rb_raise(rb_eArgError, "invalid version: %" PRIsVALUE, value);
152
155
  }
153
156
  }
154
- } else if (key_id == rb_option_id_scopes) {
157
+ } else if (key_id == rb_id_option_scopes) {
155
158
  if (!NIL_P(value)) build_options_scopes(options, value);
156
- } else if (key_id == rb_option_id_command_line) {
159
+ } else if (key_id == rb_id_option_command_line) {
157
160
  if (!NIL_P(value)) {
158
161
  const char *string = check_string(value);
159
162
  uint8_t command_line = 0;
@@ -172,6 +175,10 @@ build_options_i(VALUE key, VALUE value, VALUE argument) {
172
175
 
173
176
  pm_options_command_line_set(options, command_line);
174
177
  }
178
+ } else if (key_id == rb_id_option_main_script) {
179
+ if (!NIL_P(value)) pm_options_main_script_set(options, RTEST(value));
180
+ } else if (key_id == rb_id_option_partial_script) {
181
+ if (!NIL_P(value)) pm_options_partial_script_set(options, RTEST(value));
175
182
  } else {
176
183
  rb_raise(rb_eArgError, "unknown keyword: %" PRIsVALUE, key);
177
184
  }
@@ -206,6 +213,7 @@ build_options(VALUE argument) {
206
213
  static void
207
214
  extract_options(pm_options_t *options, VALUE filepath, VALUE keywords) {
208
215
  options->line = 1; // default
216
+
209
217
  if (!NIL_P(keywords)) {
210
218
  struct build_options_data data = { .options = options, .keywords = keywords };
211
219
  struct build_options_data *argument = &data;
@@ -246,27 +254,41 @@ string_options(int argc, VALUE *argv, pm_string_t *input, pm_options_t *options)
246
254
  * Read options for methods that look like (filepath, **options).
247
255
  */
248
256
  static void
249
- file_options(int argc, VALUE *argv, pm_string_t *input, pm_options_t *options) {
257
+ file_options(int argc, VALUE *argv, pm_string_t *input, pm_options_t *options, VALUE *encoded_filepath) {
250
258
  VALUE filepath;
251
259
  VALUE keywords;
252
260
  rb_scan_args(argc, argv, "1:", &filepath, &keywords);
253
261
 
254
262
  Check_Type(filepath, T_STRING);
263
+ *encoded_filepath = rb_str_encode_ospath(filepath);
264
+ extract_options(options, *encoded_filepath, keywords);
255
265
 
256
- extract_options(options, filepath, keywords);
266
+ const char *source = (const char *) pm_string_source(&options->filepath);
267
+ pm_string_init_result_t result;
257
268
 
258
- const char * string_source = (const char *) pm_string_source(&options->filepath);
259
-
260
- if (!pm_string_file_init(input, string_source)) {
261
- pm_options_free(options);
269
+ switch (result = pm_string_file_init(input, source)) {
270
+ case PM_STRING_INIT_SUCCESS:
271
+ break;
272
+ case PM_STRING_INIT_ERROR_GENERIC: {
273
+ pm_options_free(options);
262
274
 
263
275
  #ifdef _WIN32
264
- int e = rb_w32_map_errno(GetLastError());
276
+ int e = rb_w32_map_errno(GetLastError());
265
277
  #else
266
- int e = errno;
278
+ int e = errno;
267
279
  #endif
268
280
 
269
- rb_syserr_fail(e, string_source);
281
+ rb_syserr_fail(e, source);
282
+ break;
283
+ }
284
+ case PM_STRING_INIT_ERROR_DIRECTORY:
285
+ pm_options_free(options);
286
+ rb_syserr_fail(EISDIR, source);
287
+ break;
288
+ default:
289
+ pm_options_free(options);
290
+ rb_raise(rb_eRuntimeError, "Unknown error (%d) initializing file: %s", result, source);
291
+ break;
270
292
  }
271
293
  }
272
294
 
@@ -344,7 +366,8 @@ dump_file(int argc, VALUE *argv, VALUE self) {
344
366
  pm_string_t input;
345
367
  pm_options_t options = { 0 };
346
368
 
347
- file_options(argc, argv, &input, &options);
369
+ VALUE encoded_filepath;
370
+ file_options(argc, argv, &input, &options, &encoded_filepath);
348
371
 
349
372
  VALUE value = dump_input(&input, &options);
350
373
  pm_string_free(&input);
@@ -364,7 +387,7 @@ dump_file(int argc, VALUE *argv, VALUE self) {
364
387
  */
365
388
  static VALUE
366
389
  parser_comments(pm_parser_t *parser, VALUE source) {
367
- VALUE comments = rb_ary_new();
390
+ VALUE comments = rb_ary_new_capa(parser->comment_list.size);
368
391
 
369
392
  for (pm_comment_t *comment = (pm_comment_t *) parser->comment_list.head; comment != NULL; comment = (pm_comment_t *) comment->node.next) {
370
393
  VALUE location_argv[] = {
@@ -386,7 +409,7 @@ parser_comments(pm_parser_t *parser, VALUE source) {
386
409
  */
387
410
  static VALUE
388
411
  parser_magic_comments(pm_parser_t *parser, VALUE source) {
389
- VALUE magic_comments = rb_ary_new();
412
+ VALUE magic_comments = rb_ary_new_capa(parser->magic_comment_list.size);
390
413
 
391
414
  for (pm_magic_comment_t *magic_comment = (pm_magic_comment_t *) parser->magic_comment_list.head; magic_comment != NULL; magic_comment = (pm_magic_comment_t *) magic_comment->node.next) {
392
415
  VALUE key_loc_argv[] = {
@@ -436,7 +459,7 @@ parser_data_loc(const pm_parser_t *parser, VALUE source) {
436
459
  */
437
460
  static VALUE
438
461
  parser_errors(pm_parser_t *parser, rb_encoding *encoding, VALUE source) {
439
- VALUE errors = rb_ary_new();
462
+ VALUE errors = rb_ary_new_capa(parser->error_list.size);
440
463
  pm_diagnostic_t *error;
441
464
 
442
465
  for (error = (pm_diagnostic_t *) parser->error_list.head; error != NULL; error = (pm_diagnostic_t *) error->node.next) {
@@ -479,7 +502,7 @@ parser_errors(pm_parser_t *parser, rb_encoding *encoding, VALUE source) {
479
502
  */
480
503
  static VALUE
481
504
  parser_warnings(pm_parser_t *parser, rb_encoding *encoding, VALUE source) {
482
- VALUE warnings = rb_ary_new();
505
+ VALUE warnings = rb_ary_new_capa(parser->warning_list.size);
483
506
  pm_diagnostic_t *warning;
484
507
 
485
508
  for (warning = (pm_diagnostic_t *) parser->warning_list.head; warning != NULL; warning = (pm_diagnostic_t *) warning->node.next) {
@@ -556,9 +579,10 @@ static void
556
579
  parse_lex_token(void *data, pm_parser_t *parser, pm_token_t *token) {
557
580
  parse_lex_data_t *parse_lex_data = (parse_lex_data_t *) parser->lex_callback->data;
558
581
 
559
- VALUE yields = rb_ary_new_capa(2);
560
- rb_ary_push(yields, pm_token_new(parser, token, parse_lex_data->encoding, parse_lex_data->source));
561
- rb_ary_push(yields, INT2FIX(parser->lex_state));
582
+ VALUE yields = rb_assoc_new(
583
+ pm_token_new(parser, token, parse_lex_data->encoding, parse_lex_data->source),
584
+ INT2FIX(parser->lex_state)
585
+ );
562
586
 
563
587
  rb_ary_push(parse_lex_data->tokens, yields);
564
588
  }
@@ -599,8 +623,8 @@ parse_lex_input(pm_string_t *input, const pm_options_t *options, bool return_nod
599
623
  pm_parser_register_encoding_changed_callback(&parser, parse_lex_encoding_changed_callback);
600
624
 
601
625
  VALUE source_string = rb_str_new((const char *) pm_string_source(input), pm_string_length(input));
602
- VALUE offsets = rb_ary_new();
603
- VALUE source = rb_funcall(rb_cPrismSource, rb_prism_source_id_for, 3, source_string, LONG2NUM(parser.start_line), offsets);
626
+ VALUE offsets = rb_ary_new_capa(parser.newline_list.size);
627
+ VALUE source = rb_funcall(rb_cPrismSource, rb_id_source_for, 3, source_string, LONG2NUM(parser.start_line), offsets);
604
628
 
605
629
  parse_lex_data_t parse_lex_data = {
606
630
  .source = source,
@@ -628,16 +652,16 @@ parse_lex_input(pm_string_t *input, const pm_options_t *options, bool return_nod
628
652
  rb_ary_push(offsets, ULONG2NUM(parser.newline_list.offsets[index]));
629
653
  }
630
654
 
631
- VALUE value;
655
+ VALUE result;
632
656
  if (return_nodes) {
633
- value = rb_ary_new_capa(2);
657
+ VALUE value = rb_ary_new_capa(2);
634
658
  rb_ary_push(value, pm_ast_new(&parser, node, parse_lex_data.encoding, source));
635
659
  rb_ary_push(value, parse_lex_data.tokens);
660
+ result = parse_result_create(rb_cPrismParseLexResult, &parser, value, parse_lex_data.encoding, source);
636
661
  } else {
637
- value = parse_lex_data.tokens;
662
+ result = parse_result_create(rb_cPrismLexResult, &parser, parse_lex_data.tokens, parse_lex_data.encoding, source);
638
663
  }
639
664
 
640
- VALUE result = parse_result_create(rb_cPrismParseLexResult, &parser, value, parse_lex_data.encoding, source);
641
665
  pm_node_destroy(&parser, node);
642
666
  pm_parser_free(&parser);
643
667
 
@@ -646,10 +670,10 @@ parse_lex_input(pm_string_t *input, const pm_options_t *options, bool return_nod
646
670
 
647
671
  /**
648
672
  * call-seq:
649
- * Prism::lex(source, **options) -> Array
673
+ * Prism::lex(source, **options) -> LexResult
650
674
  *
651
- * Return an array of Token instances corresponding to the given string. For
652
- * supported options, see Prism::parse.
675
+ * Return a LexResult instance that contains an array of Token instances
676
+ * corresponding to the given string. For supported options, see Prism::parse.
653
677
  */
654
678
  static VALUE
655
679
  lex(int argc, VALUE *argv, VALUE self) {
@@ -666,17 +690,18 @@ lex(int argc, VALUE *argv, VALUE self) {
666
690
 
667
691
  /**
668
692
  * call-seq:
669
- * Prism::lex_file(filepath, **options) -> Array
693
+ * Prism::lex_file(filepath, **options) -> LexResult
670
694
  *
671
- * Return an array of Token instances corresponding to the given file. For
672
- * supported options, see Prism::parse.
695
+ * Return a LexResult instance that contains an array of Token instances
696
+ * corresponding to the given file. For supported options, see Prism::parse.
673
697
  */
674
698
  static VALUE
675
699
  lex_file(int argc, VALUE *argv, VALUE self) {
676
700
  pm_string_t input;
677
701
  pm_options_t options = { 0 };
678
702
 
679
- file_options(argc, argv, &input, &options);
703
+ VALUE encoded_filepath;
704
+ file_options(argc, argv, &input, &options, &encoded_filepath);
680
705
 
681
706
  VALUE value = parse_lex_input(&input, &options, false);
682
707
  pm_string_free(&input);
@@ -728,14 +753,27 @@ parse_input(pm_string_t *input, const pm_options_t *options) {
728
753
  * has been set. This should be a boolean or nil.
729
754
  * * `line` - the line number that the parse starts on. This should be an
730
755
  * integer or nil. Note that this is 1-indexed.
756
+ * * `main_script` - a boolean indicating whether or not the source being parsed
757
+ * is the main script being run by the interpreter. This controls whether
758
+ * or not shebangs are parsed for additional flags and whether or not the
759
+ * parser will attempt to find a matching shebang if the first one does
760
+ * not contain the word "ruby".
761
+ * * `partial_script` - when the file being parsed is considered a "partial"
762
+ * script, jumps will not be marked as errors if they are not contained
763
+ * within loops/blocks. This is used in the case that you're parsing a
764
+ * script that you know will be embedded inside another script later, but
765
+ * you do not have that context yet. For example, when parsing an ERB
766
+ * template that will be evaluated inside another script.
731
767
  * * `scopes` - the locals that are in scope surrounding the code that is being
732
768
  * parsed. This should be an array of arrays of symbols or nil. Scopes are
733
769
  * ordered from the outermost scope to the innermost one.
734
770
  * * `version` - the version of Ruby syntax that prism should use to parse Ruby
735
771
  * code. By default prism assumes you want to parse with the latest version
736
772
  * of Ruby syntax (which you can trigger with `nil` or `"latest"`). You
737
- * may also restrict the syntax to a specific version of Ruby. The
738
- * supported values are `"3.3.0"` and `"3.4.0"`.
773
+ * may also restrict the syntax to a specific version of Ruby, e.g., with `"3.3.0"`.
774
+ * To parse with the same syntax version that the current Ruby is running
775
+ * use `version: RUBY_VERSION`. Raises ArgumentError if the version is not
776
+ * currently supported by Prism.
739
777
  */
740
778
  static VALUE
741
779
  parse(int argc, VALUE *argv, VALUE self) {
@@ -761,6 +799,85 @@ parse(int argc, VALUE *argv, VALUE self) {
761
799
  return value;
762
800
  }
763
801
 
802
+ /**
803
+ * call-seq:
804
+ * Prism::parse_file(filepath, **options) -> ParseResult
805
+ *
806
+ * Parse the given file and return a ParseResult instance. For supported
807
+ * options, see Prism::parse.
808
+ */
809
+ static VALUE
810
+ parse_file(int argc, VALUE *argv, VALUE self) {
811
+ pm_string_t input;
812
+ pm_options_t options = { 0 };
813
+
814
+ VALUE encoded_filepath;
815
+ file_options(argc, argv, &input, &options, &encoded_filepath);
816
+
817
+ VALUE value = parse_input(&input, &options);
818
+ pm_string_free(&input);
819
+ pm_options_free(&options);
820
+
821
+ return value;
822
+ }
823
+
824
+ /**
825
+ * Parse the given input and return nothing.
826
+ */
827
+ static void
828
+ profile_input(pm_string_t *input, const pm_options_t *options) {
829
+ pm_parser_t parser;
830
+ pm_parser_init(&parser, pm_string_source(input), pm_string_length(input), options);
831
+
832
+ pm_node_t *node = pm_parse(&parser);
833
+ pm_node_destroy(&parser, node);
834
+ pm_parser_free(&parser);
835
+ }
836
+
837
+ /**
838
+ * call-seq:
839
+ * Prism::profile(source, **options) -> nil
840
+ *
841
+ * Parse the given string and return nothing. This method is meant to allow
842
+ * profilers to avoid the overhead of reifying the AST to Ruby. For supported
843
+ * options, see Prism::parse.
844
+ */
845
+ static VALUE
846
+ profile(int argc, VALUE *argv, VALUE self) {
847
+ pm_string_t input;
848
+ pm_options_t options = { 0 };
849
+
850
+ string_options(argc, argv, &input, &options);
851
+ profile_input(&input, &options);
852
+ pm_string_free(&input);
853
+ pm_options_free(&options);
854
+
855
+ return Qnil;
856
+ }
857
+
858
+ /**
859
+ * call-seq:
860
+ * Prism::profile_file(filepath, **options) -> nil
861
+ *
862
+ * Parse the given file and return nothing. This method is meant to allow
863
+ * profilers to avoid the overhead of reifying the AST to Ruby. For supported
864
+ * options, see Prism::parse.
865
+ */
866
+ static VALUE
867
+ profile_file(int argc, VALUE *argv, VALUE self) {
868
+ pm_string_t input;
869
+ pm_options_t options = { 0 };
870
+
871
+ VALUE encoded_filepath;
872
+ file_options(argc, argv, &input, &options, &encoded_filepath);
873
+
874
+ profile_input(&input, &options);
875
+ pm_string_free(&input);
876
+ pm_options_free(&options);
877
+
878
+ return Qnil;
879
+ }
880
+
764
881
  /**
765
882
  * An implementation of fgets that is suitable for use with Ruby IO objects.
766
883
  */
@@ -773,8 +890,8 @@ parse_stream_fgets(char *string, int size, void *stream) {
773
890
  return NULL;
774
891
  }
775
892
 
776
- const char *cstr = StringValueCStr(line);
777
- size_t length = strlen(cstr);
893
+ const char *cstr = RSTRING_PTR(line);
894
+ long length = RSTRING_LEN(line);
778
895
 
779
896
  memcpy(string, cstr, length);
780
897
  string[length] = '\0';
@@ -815,27 +932,6 @@ parse_stream(int argc, VALUE *argv, VALUE self) {
815
932
  return result;
816
933
  }
817
934
 
818
- /**
819
- * call-seq:
820
- * Prism::parse_file(filepath, **options) -> ParseResult
821
- *
822
- * Parse the given file and return a ParseResult instance. For supported
823
- * options, see Prism::parse.
824
- */
825
- static VALUE
826
- parse_file(int argc, VALUE *argv, VALUE self) {
827
- pm_string_t input;
828
- pm_options_t options = { 0 };
829
-
830
- file_options(argc, argv, &input, &options);
831
-
832
- VALUE value = parse_input(&input, &options);
833
- pm_string_free(&input);
834
- pm_options_free(&options);
835
-
836
- return value;
837
- }
838
-
839
935
  /**
840
936
  * Parse the given input and return an array of Comment objects.
841
937
  */
@@ -888,7 +984,8 @@ parse_file_comments(int argc, VALUE *argv, VALUE self) {
888
984
  pm_string_t input;
889
985
  pm_options_t options = { 0 };
890
986
 
891
- file_options(argc, argv, &input, &options);
987
+ VALUE encoded_filepath;
988
+ file_options(argc, argv, &input, &options, &encoded_filepath);
892
989
 
893
990
  VALUE value = parse_input_comments(&input, &options);
894
991
  pm_string_free(&input);
@@ -899,9 +996,9 @@ parse_file_comments(int argc, VALUE *argv, VALUE self) {
899
996
 
900
997
  /**
901
998
  * call-seq:
902
- * Prism::parse_lex(source, **options) -> ParseResult
999
+ * Prism::parse_lex(source, **options) -> ParseLexResult
903
1000
  *
904
- * Parse the given string and return a ParseResult instance that contains a
1001
+ * Parse the given string and return a ParseLexResult instance that contains a
905
1002
  * 2-element array, where the first element is the AST and the second element is
906
1003
  * an array of Token instances.
907
1004
  *
@@ -926,9 +1023,9 @@ parse_lex(int argc, VALUE *argv, VALUE self) {
926
1023
 
927
1024
  /**
928
1025
  * call-seq:
929
- * Prism::parse_lex_file(filepath, **options) -> ParseResult
1026
+ * Prism::parse_lex_file(filepath, **options) -> ParseLexResult
930
1027
  *
931
- * Parse the given file and return a ParseResult instance that contains a
1028
+ * Parse the given file and return a ParseLexResult instance that contains a
932
1029
  * 2-element array, where the first element is the AST and the second element is
933
1030
  * an array of Token instances.
934
1031
  *
@@ -943,7 +1040,8 @@ parse_lex_file(int argc, VALUE *argv, VALUE self) {
943
1040
  pm_string_t input;
944
1041
  pm_options_t options = { 0 };
945
1042
 
946
- file_options(argc, argv, &input, &options);
1043
+ VALUE encoded_filepath;
1044
+ file_options(argc, argv, &input, &options, &encoded_filepath);
947
1045
 
948
1046
  VALUE value = parse_lex_input(&input, &options, true);
949
1047
  pm_string_free(&input);
@@ -1013,7 +1111,8 @@ parse_file_success_p(int argc, VALUE *argv, VALUE self) {
1013
1111
  pm_string_t input;
1014
1112
  pm_options_t options = { 0 };
1015
1113
 
1016
- file_options(argc, argv, &input, &options);
1114
+ VALUE encoded_filepath;
1115
+ file_options(argc, argv, &input, &options, &encoded_filepath);
1017
1116
 
1018
1117
  VALUE result = parse_input_success_p(&input, &options);
1019
1118
  pm_string_free(&input);
@@ -1034,303 +1133,6 @@ parse_file_failure_p(int argc, VALUE *argv, VALUE self) {
1034
1133
  return RTEST(parse_file_success_p(argc, argv, self)) ? Qfalse : Qtrue;
1035
1134
  }
1036
1135
 
1037
- /******************************************************************************/
1038
- /* Utility functions exposed to make testing easier */
1039
- /******************************************************************************/
1040
-
1041
- /**
1042
- * call-seq:
1043
- * Debug::named_captures(source) -> Array
1044
- *
1045
- * Returns an array of strings corresponding to the named capture groups in the
1046
- * given source string. If prism was unable to parse the regular expression,
1047
- * this function returns nil.
1048
- */
1049
- static VALUE
1050
- named_captures(VALUE self, VALUE source) {
1051
- pm_string_list_t string_list = { 0 };
1052
-
1053
- if (!pm_regexp_named_capture_group_names((const uint8_t *) RSTRING_PTR(source), RSTRING_LEN(source), &string_list, false, PM_ENCODING_UTF_8_ENTRY)) {
1054
- pm_string_list_free(&string_list);
1055
- return Qnil;
1056
- }
1057
-
1058
- VALUE names = rb_ary_new();
1059
- for (size_t index = 0; index < string_list.length; index++) {
1060
- const pm_string_t *string = &string_list.strings[index];
1061
- rb_ary_push(names, rb_str_new((const char *) pm_string_source(string), pm_string_length(string)));
1062
- }
1063
-
1064
- pm_string_list_free(&string_list);
1065
- return names;
1066
- }
1067
-
1068
- /**
1069
- * call-seq:
1070
- * Debug::integer_parse(source) -> [Integer, String]
1071
- *
1072
- * Parses the given source string and returns the integer it represents, as well
1073
- * as a decimal string representation.
1074
- */
1075
- static VALUE
1076
- integer_parse(VALUE self, VALUE source) {
1077
- const uint8_t *start = (const uint8_t *) RSTRING_PTR(source);
1078
- size_t length = RSTRING_LEN(source);
1079
-
1080
- pm_integer_t integer = { 0 };
1081
- pm_integer_parse(&integer, PM_INTEGER_BASE_UNKNOWN, start, start + length);
1082
-
1083
- pm_buffer_t buffer = { 0 };
1084
- pm_integer_string(&buffer, &integer);
1085
-
1086
- VALUE string = rb_str_new(pm_buffer_value(&buffer), pm_buffer_length(&buffer));
1087
- pm_buffer_free(&buffer);
1088
-
1089
- VALUE result = rb_ary_new_capa(2);
1090
- rb_ary_push(result, pm_integer_new(&integer));
1091
- rb_ary_push(result, string);
1092
- pm_integer_free(&integer);
1093
-
1094
- return result;
1095
- }
1096
-
1097
- /**
1098
- * call-seq:
1099
- * Debug::memsize(source) -> { length: xx, memsize: xx, node_count: xx }
1100
- *
1101
- * Return a hash of information about the given source string's memory usage.
1102
- */
1103
- static VALUE
1104
- memsize(VALUE self, VALUE string) {
1105
- pm_parser_t parser;
1106
- size_t length = RSTRING_LEN(string);
1107
- pm_parser_init(&parser, (const uint8_t *) RSTRING_PTR(string), length, NULL);
1108
-
1109
- pm_node_t *node = pm_parse(&parser);
1110
- pm_memsize_t memsize;
1111
- pm_node_memsize(node, &memsize);
1112
-
1113
- pm_node_destroy(&parser, node);
1114
- pm_parser_free(&parser);
1115
-
1116
- VALUE result = rb_hash_new();
1117
- rb_hash_aset(result, ID2SYM(rb_intern("length")), INT2FIX(length));
1118
- rb_hash_aset(result, ID2SYM(rb_intern("memsize")), INT2FIX(memsize.memsize));
1119
- rb_hash_aset(result, ID2SYM(rb_intern("node_count")), INT2FIX(memsize.node_count));
1120
- return result;
1121
- }
1122
-
1123
- /**
1124
- * call-seq:
1125
- * Debug::profile_file(filepath) -> nil
1126
- *
1127
- * Parse the file, but do nothing with the result. This is used to profile the
1128
- * parser for memory and speed.
1129
- */
1130
- static VALUE
1131
- profile_file(VALUE self, VALUE filepath) {
1132
- pm_string_t input;
1133
-
1134
- const char *checked = check_string(filepath);
1135
- Check_Type(filepath, T_STRING);
1136
-
1137
- if (!pm_string_mapped_init(&input, checked)) {
1138
- #ifdef _WIN32
1139
- int e = rb_w32_map_errno(GetLastError());
1140
- #else
1141
- int e = errno;
1142
- #endif
1143
-
1144
- rb_syserr_fail(e, checked);
1145
- }
1146
-
1147
- pm_options_t options = { 0 };
1148
- pm_options_filepath_set(&options, checked);
1149
-
1150
- pm_parser_t parser;
1151
- pm_parser_init(&parser, pm_string_source(&input), pm_string_length(&input), &options);
1152
-
1153
- pm_node_t *node = pm_parse(&parser);
1154
- pm_node_destroy(&parser, node);
1155
- pm_parser_free(&parser);
1156
- pm_options_free(&options);
1157
- pm_string_free(&input);
1158
-
1159
- return Qnil;
1160
- }
1161
-
1162
- #ifndef PRISM_EXCLUDE_PRETTYPRINT
1163
-
1164
- /**
1165
- * call-seq:
1166
- * Debug::inspect_node(source) -> inspected
1167
- *
1168
- * Inspect the AST that represents the given source using the prism pretty print
1169
- * as opposed to the Ruby implementation.
1170
- */
1171
- static VALUE
1172
- inspect_node(VALUE self, VALUE source) {
1173
- pm_string_t input;
1174
- input_load_string(&input, source);
1175
-
1176
- pm_parser_t parser;
1177
- pm_parser_init(&parser, pm_string_source(&input), pm_string_length(&input), NULL);
1178
-
1179
- pm_node_t *node = pm_parse(&parser);
1180
- pm_buffer_t buffer = { 0 };
1181
-
1182
- pm_prettyprint(&buffer, &parser, node);
1183
-
1184
- rb_encoding *encoding = rb_enc_find(parser.encoding->name);
1185
- VALUE string = rb_enc_str_new(pm_buffer_value(&buffer), pm_buffer_length(&buffer), encoding);
1186
-
1187
- pm_buffer_free(&buffer);
1188
- pm_node_destroy(&parser, node);
1189
- pm_parser_free(&parser);
1190
-
1191
- return string;
1192
- }
1193
-
1194
- #endif
1195
-
1196
- /**
1197
- * call-seq:
1198
- * Debug::format_errors(source, colorize) -> String
1199
- *
1200
- * Format the errors that are found when parsing the given source string.
1201
- */
1202
- static VALUE
1203
- format_errors(VALUE self, VALUE source, VALUE colorize) {
1204
- pm_string_t input;
1205
- input_load_string(&input, source);
1206
-
1207
- pm_parser_t parser;
1208
- pm_parser_init(&parser, pm_string_source(&input), pm_string_length(&input), NULL);
1209
-
1210
- pm_node_t *node = pm_parse(&parser);
1211
- pm_buffer_t buffer = { 0 };
1212
-
1213
- pm_parser_errors_format(&parser, &parser.error_list, &buffer, RTEST(colorize), true);
1214
-
1215
- rb_encoding *encoding = rb_enc_find(parser.encoding->name);
1216
- VALUE result = rb_enc_str_new(pm_buffer_value(&buffer), pm_buffer_length(&buffer), encoding);
1217
-
1218
- pm_buffer_free(&buffer);
1219
- pm_node_destroy(&parser, node);
1220
- pm_parser_free(&parser);
1221
- pm_string_free(&input);
1222
-
1223
- return result;
1224
- }
1225
-
1226
- /**
1227
- * call-seq:
1228
- * Debug::static_inspect(source) -> String
1229
- *
1230
- * Inspect the node as it would be inspected by the warnings used in static
1231
- * literal sets.
1232
- */
1233
- static VALUE
1234
- static_inspect(int argc, VALUE *argv, VALUE self) {
1235
- pm_string_t input;
1236
- pm_options_t options = { 0 };
1237
- string_options(argc, argv, &input, &options);
1238
-
1239
- pm_parser_t parser;
1240
- pm_parser_init(&parser, pm_string_source(&input), pm_string_length(&input), &options);
1241
-
1242
- pm_node_t *program = pm_parse(&parser);
1243
- pm_node_t *node = ((pm_program_node_t *) program)->statements->body.nodes[0];
1244
-
1245
- pm_buffer_t buffer = { 0 };
1246
- pm_static_literal_inspect(&buffer, &parser.newline_list, parser.start_line, parser.encoding->name, node);
1247
-
1248
- rb_encoding *encoding = rb_enc_find(parser.encoding->name);
1249
- VALUE result = rb_enc_str_new(pm_buffer_value(&buffer), pm_buffer_length(&buffer), encoding);
1250
-
1251
- pm_buffer_free(&buffer);
1252
- pm_node_destroy(&parser, program);
1253
- pm_parser_free(&parser);
1254
- pm_string_free(&input);
1255
- pm_options_free(&options);
1256
-
1257
- return result;
1258
- }
1259
-
1260
- /**
1261
- * call-seq: Debug::Encoding.all -> Array[Debug::Encoding]
1262
- *
1263
- * Return an array of all of the encodings that prism knows about.
1264
- */
1265
- static VALUE
1266
- encoding_all(VALUE self) {
1267
- VALUE encodings = rb_ary_new();
1268
-
1269
- for (size_t index = 0; index < PM_ENCODING_MAXIMUM; index++) {
1270
- const pm_encoding_t *encoding = &pm_encodings[index];
1271
-
1272
- VALUE encoding_argv[] = { rb_str_new_cstr(encoding->name), encoding->multibyte ? Qtrue : Qfalse };
1273
- rb_ary_push(encodings, rb_class_new_instance(2, encoding_argv, rb_cPrismDebugEncoding));
1274
- }
1275
-
1276
- return encodings;
1277
- }
1278
-
1279
- static const pm_encoding_t *
1280
- encoding_find(VALUE name) {
1281
- const uint8_t *source = (const uint8_t *) RSTRING_PTR(name);
1282
- size_t length = RSTRING_LEN(name);
1283
-
1284
- const pm_encoding_t *encoding = pm_encoding_find(source, source + length);
1285
- if (encoding == NULL) { rb_raise(rb_eArgError, "Unknown encoding: %s", source); }
1286
-
1287
- return encoding;
1288
- }
1289
-
1290
- /**
1291
- * call-seq: Debug::Encoding.width(source) -> Integer
1292
- *
1293
- * Returns the width of the first character in the given string if it is valid
1294
- * in the encoding. If it is not, this function returns 0.
1295
- */
1296
- static VALUE
1297
- encoding_char_width(VALUE self, VALUE name, VALUE value) {
1298
- return ULONG2NUM(encoding_find(name)->char_width((const uint8_t *) RSTRING_PTR(value), RSTRING_LEN(value)));
1299
- }
1300
-
1301
- /**
1302
- * call-seq: Debug::Encoding.alnum?(source) -> true | false
1303
- *
1304
- * Returns true if the first character in the given string is an alphanumeric
1305
- * character in the encoding.
1306
- */
1307
- static VALUE
1308
- encoding_alnum_char(VALUE self, VALUE name, VALUE value) {
1309
- return encoding_find(name)->alnum_char((const uint8_t *) RSTRING_PTR(value), RSTRING_LEN(value)) > 0 ? Qtrue : Qfalse;
1310
- }
1311
-
1312
- /**
1313
- * call-seq: Debug::Encoding.alpha?(source) -> true | false
1314
- *
1315
- * Returns true if the first character in the given string is an alphabetic
1316
- * character in the encoding.
1317
- */
1318
- static VALUE
1319
- encoding_alpha_char(VALUE self, VALUE name, VALUE value) {
1320
- return encoding_find(name)->alpha_char((const uint8_t *) RSTRING_PTR(value), RSTRING_LEN(value)) > 0 ? Qtrue : Qfalse;
1321
- }
1322
-
1323
- /**
1324
- * call-seq: Debug::Encoding.upper?(source) -> true | false
1325
- *
1326
- * Returns true if the first character in the given string is an uppercase
1327
- * character in the encoding.
1328
- */
1329
- static VALUE
1330
- encoding_isupper_char(VALUE self, VALUE name, VALUE value) {
1331
- return encoding_find(name)->isupper_char((const uint8_t *) RSTRING_PTR(value), RSTRING_LEN(value)) ? Qtrue : Qfalse;
1332
- }
1333
-
1334
1136
  /******************************************************************************/
1335
1137
  /* Initialization of the extension */
1336
1138
  /******************************************************************************/
@@ -1364,22 +1166,23 @@ Init_prism(void) {
1364
1166
  rb_cPrismMagicComment = rb_define_class_under(rb_cPrism, "MagicComment", rb_cObject);
1365
1167
  rb_cPrismParseError = rb_define_class_under(rb_cPrism, "ParseError", rb_cObject);
1366
1168
  rb_cPrismParseWarning = rb_define_class_under(rb_cPrism, "ParseWarning", rb_cObject);
1367
-
1368
1169
  rb_cPrismResult = rb_define_class_under(rb_cPrism, "Result", rb_cObject);
1369
1170
  rb_cPrismParseResult = rb_define_class_under(rb_cPrism, "ParseResult", rb_cPrismResult);
1171
+ rb_cPrismLexResult = rb_define_class_under(rb_cPrism, "LexResult", rb_cPrismResult);
1370
1172
  rb_cPrismParseLexResult = rb_define_class_under(rb_cPrism, "ParseLexResult", rb_cPrismResult);
1371
1173
 
1372
- // Intern all of the options that we support so that we don't have to do it
1373
- // every time we parse.
1374
- rb_option_id_command_line = rb_intern_const("command_line");
1375
- rb_option_id_encoding = rb_intern_const("encoding");
1376
- rb_option_id_filepath = rb_intern_const("filepath");
1377
- rb_option_id_frozen_string_literal = rb_intern_const("frozen_string_literal");
1378
- rb_option_id_line = rb_intern_const("line");
1379
- rb_option_id_scopes = rb_intern_const("scopes");
1380
- rb_option_id_version = rb_intern_const("version");
1381
-
1382
- rb_prism_source_id_for = rb_intern("for");
1174
+ // Intern all of the IDs eagerly that we support so that we don't have to do
1175
+ // it every time we parse.
1176
+ rb_id_option_command_line = rb_intern_const("command_line");
1177
+ rb_id_option_encoding = rb_intern_const("encoding");
1178
+ rb_id_option_filepath = rb_intern_const("filepath");
1179
+ rb_id_option_frozen_string_literal = rb_intern_const("frozen_string_literal");
1180
+ rb_id_option_line = rb_intern_const("line");
1181
+ rb_id_option_main_script = rb_intern_const("main_script");
1182
+ rb_id_option_partial_script = rb_intern_const("partial_script");
1183
+ rb_id_option_scopes = rb_intern_const("scopes");
1184
+ rb_id_option_version = rb_intern_const("version");
1185
+ rb_id_source_for = rb_intern("for");
1383
1186
 
1384
1187
  /**
1385
1188
  * The version of the prism library.
@@ -1390,8 +1193,10 @@ Init_prism(void) {
1390
1193
  rb_define_singleton_method(rb_cPrism, "lex", lex, -1);
1391
1194
  rb_define_singleton_method(rb_cPrism, "lex_file", lex_file, -1);
1392
1195
  rb_define_singleton_method(rb_cPrism, "parse", parse, -1);
1393
- rb_define_singleton_method(rb_cPrism, "parse_stream", parse_stream, -1);
1394
1196
  rb_define_singleton_method(rb_cPrism, "parse_file", parse_file, -1);
1197
+ rb_define_singleton_method(rb_cPrism, "profile", profile, -1);
1198
+ rb_define_singleton_method(rb_cPrism, "profile_file", profile_file, -1);
1199
+ rb_define_singleton_method(rb_cPrism, "parse_stream", parse_stream, -1);
1395
1200
  rb_define_singleton_method(rb_cPrism, "parse_comments", parse_comments, -1);
1396
1201
  rb_define_singleton_method(rb_cPrism, "parse_file_comments", parse_file_comments, -1);
1397
1202
  rb_define_singleton_method(rb_cPrism, "parse_lex", parse_lex, -1);
@@ -1406,29 +1211,6 @@ Init_prism(void) {
1406
1211
  rb_define_singleton_method(rb_cPrism, "dump_file", dump_file, -1);
1407
1212
  #endif
1408
1213
 
1409
- // Next, the functions that will be called by the parser to perform various
1410
- // internal tasks. We expose these to make them easier to test.
1411
- VALUE rb_cPrismDebug = rb_define_module_under(rb_cPrism, "Debug");
1412
- rb_define_singleton_method(rb_cPrismDebug, "named_captures", named_captures, 1);
1413
- rb_define_singleton_method(rb_cPrismDebug, "integer_parse", integer_parse, 1);
1414
- rb_define_singleton_method(rb_cPrismDebug, "memsize", memsize, 1);
1415
- rb_define_singleton_method(rb_cPrismDebug, "profile_file", profile_file, 1);
1416
- rb_define_singleton_method(rb_cPrismDebug, "format_errors", format_errors, 2);
1417
- rb_define_singleton_method(rb_cPrismDebug, "static_inspect", static_inspect, -1);
1418
-
1419
- #ifndef PRISM_EXCLUDE_PRETTYPRINT
1420
- rb_define_singleton_method(rb_cPrismDebug, "inspect_node", inspect_node, 1);
1421
- #endif
1422
-
1423
- // Next, define the functions that are exposed through the private
1424
- // Debug::Encoding class.
1425
- rb_cPrismDebugEncoding = rb_define_class_under(rb_cPrismDebug, "Encoding", rb_cObject);
1426
- rb_define_singleton_method(rb_cPrismDebugEncoding, "all", encoding_all, 0);
1427
- rb_define_singleton_method(rb_cPrismDebugEncoding, "_width", encoding_char_width, 2);
1428
- rb_define_singleton_method(rb_cPrismDebugEncoding, "_alnum?", encoding_alnum_char, 2);
1429
- rb_define_singleton_method(rb_cPrismDebugEncoding, "_alpha?", encoding_alpha_char, 2);
1430
- rb_define_singleton_method(rb_cPrismDebugEncoding, "_upper?", encoding_isupper_char, 2);
1431
-
1432
1214
  // Next, initialize the other APIs.
1433
1215
  Init_prism_api_node();
1434
1216
  Init_prism_pack();