RubyGems - prism - Versions diffs - 0.22.0 → 0.23.0 - Mend

prism 0.22.0 → 0.23.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (35) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +24 -1
data/README.md +2 -1
data/docs/releasing.md +67 -17
data/docs/ruby_parser_translation.md +19 -0
data/docs/serialization.md +2 -0
data/ext/prism/api_node.c +784 -785
data/ext/prism/extension.c +12 -7
data/ext/prism/extension.h +2 -2
data/include/prism/diagnostic.h +3 -4
data/include/prism/encoding.h +7 -0
data/include/prism/util/pm_constant_pool.h +1 -1
data/include/prism/util/pm_strpbrk.h +4 -1
data/include/prism/version.h +2 -2
data/lib/prism/ffi.rb +1 -1
data/lib/prism/lex_compat.rb +1 -0
data/lib/prism/node_ext.rb +25 -2
data/lib/prism/parse_result.rb +44 -15
data/lib/prism/serialize.rb +12 -6
data/lib/prism/translation/parser.rb +10 -9
data/lib/prism/translation/ripper.rb +577 -0
data/lib/prism/translation/ruby_parser.rb +1521 -0
data/lib/prism/translation.rb +3 -3
data/lib/prism.rb +0 -1
data/prism.gemspec +4 -2
data/src/diagnostic.c +10 -11
data/src/encoding.c +16 -17
data/src/options.c +7 -2
data/src/prism.c +124 -64
data/src/serialize.c +24 -13
data/src/token_type.c +3 -3
data/src/util/pm_constant_pool.c +1 -1
data/src/util/pm_strpbrk.c +122 -14
metadata +6 -4
data/lib/prism/ripper_compat.rb +0 -285

data/ext/prism/extension.c CHANGED Viewed

@@ -542,8 +542,9 @@ parse_lex_input(pm_string_t *input, const pm_options_t *options, bool return_nod
     pm_parser_init(&parser, pm_string_source(input), pm_string_length(input), options);
     pm_parser_register_encoding_changed_callback(&parser, parse_lex_encoding_changed_callback);
+    VALUE source_string = rb_str_new((const char *) pm_string_source(input), pm_string_length(input));
     VALUE offsets = rb_ary_new();
-    VALUE source_argv[] = { rb_str_new((const char *) pm_string_source(input), pm_string_length(input)), ULONG2NUM(parser.start_line), offsets };
+    VALUE source_argv[] = { source_string, LONG2NUM(parser.start_line), offsets };
     VALUE source = rb_class_new_instance(3, source_argv, rb_cPrismSource);
     parse_lex_data_t parse_lex_data = {
@@ -561,17 +562,21 @@ parse_lex_input(pm_string_t *input, const pm_options_t *options, bool return_nod
     parser.lex_callback = &lex_callback;
     pm_node_t *node = pm_parse(&parser);
-    // Here we need to update the source range to have the correct newline
-    // offsets. We do it here because we've already created the object and given
-    // it over to all of the tokens.
+    // Here we need to update the Source object to have the correct
+    // encoding for the source string and the correct newline offsets.
+    // We do it here because we've already created the Source object and given
+    // it over to all of the tokens, and both of these are only set after pm_parse().
+    rb_encoding *encoding = rb_enc_find(parser.encoding->name);
+    rb_enc_associate(source_string, encoding);
     for (size_t index = 0; index < parser.newline_list.size; index++) {
-        rb_ary_push(offsets, INT2FIX(parser.newline_list.offsets[index]));
+        rb_ary_push(offsets, ULONG2NUM(parser.newline_list.offsets[index]));
     }
     VALUE value;
     if (return_nodes) {
         value = rb_ary_new_capa(2);
-        rb_ary_push(value, pm_ast_new(&parser, node, parse_lex_data.encoding));
+        rb_ary_push(value, pm_ast_new(&parser, node, parse_lex_data.encoding, source));
         rb_ary_push(value, parse_lex_data.tokens);
     } else {
         value = parse_lex_data.tokens;
@@ -650,7 +655,7 @@ parse_input(pm_string_t *input, const pm_options_t *options) {
     VALUE source = pm_source_new(&parser, encoding);
     VALUE result_argv[] = {
-        pm_ast_new(&parser, node, encoding),
+        pm_ast_new(&parser, node, encoding, source),
         parser_comments(&parser, source),
         parser_magic_comments(&parser, source),
         parser_data_loc(&parser, source),

data/ext/prism/extension.h CHANGED Viewed

@@ -1,7 +1,7 @@
 #ifndef PRISM_EXT_NODE_H
 #define PRISM_EXT_NODE_H
-#define EXPECTED_PRISM_VERSION "0.22.0"
+#define EXPECTED_PRISM_VERSION "0.23.0"
 #include <ruby.h>
 #include <ruby/encoding.h>
@@ -9,7 +9,7 @@
 VALUE pm_source_new(pm_parser_t *parser, rb_encoding *encoding);
 VALUE pm_token_new(pm_parser_t *parser, pm_token_t *token, rb_encoding *encoding, VALUE source);
-VALUE pm_ast_new(pm_parser_t *parser, pm_node_t *node, rb_encoding *encoding);
+VALUE pm_ast_new(pm_parser_t *parser, pm_node_t *node, rb_encoding *encoding, VALUE source);
 void Init_prism_api_node(void);
 void Init_prism_pack(void);

data/include/prism/diagnostic.h CHANGED Viewed

@@ -219,6 +219,7 @@ typedef enum {
     PM_ERR_MODULE_NAME,
     PM_ERR_MODULE_TERM,
     PM_ERR_MULTI_ASSIGN_MULTI_SPLATS,
+    PM_ERR_MULTI_ASSIGN_UNEXPECTED_REST,
     PM_ERR_NOT_EXPRESSION,
     PM_ERR_NO_LOCAL_VARIABLE,
     PM_ERR_NUMBER_LITERAL_UNDERSCORE,
@@ -272,6 +273,7 @@ typedef enum {
     PM_ERR_STATEMENT_UNDEF,
     PM_ERR_STRING_CONCATENATION,
     PM_ERR_STRING_INTERPOLATED_TERM,
+    PM_ERR_STRING_LITERAL_EOF,
     PM_ERR_STRING_LITERAL_TERM,
     PM_ERR_SYMBOL_INVALID,
     PM_ERR_SYMBOL_TERM_DYNAMIC,
@@ -279,10 +281,7 @@ typedef enum {
     PM_ERR_TERNARY_COLON,
     PM_ERR_TERNARY_EXPRESSION_FALSE,
     PM_ERR_TERNARY_EXPRESSION_TRUE,
-    PM_ERR_UNARY_RECEIVER_BANG,
-    PM_ERR_UNARY_RECEIVER_MINUS,
-    PM_ERR_UNARY_RECEIVER_PLUS,
-    PM_ERR_UNARY_RECEIVER_TILDE,
+    PM_ERR_UNARY_RECEIVER,
     PM_ERR_UNEXPECTED_TOKEN_CLOSE_CONTEXT,
     PM_ERR_UNEXPECTED_TOKEN_IGNORE,
     PM_ERR_UNDEF_ARGUMENT,

data/include/prism/encoding.h CHANGED Viewed

@@ -245,6 +245,13 @@ extern const pm_encoding_t pm_encodings[PM_ENCODING_MAXIMUM];
  */
 #define PM_ENCODING_US_ASCII_ENTRY (&pm_encodings[PM_ENCODING_US_ASCII])
+/**
+ * This is the ASCII-8BIT encoding. We need a reference to it so that pm_strpbrk
+ * can compare against it because invalid multibyte characters are not a thing
+ * in this encoding.
+ */
+#define PM_ENCODING_ASCII_8BIT_ENTRY (&pm_encodings[PM_ENCODING_ASCII_8BIT])
 /**
  * Parse the given name of an encoding and return a pointer to the corresponding
  * encoding struct if one can be found, otherwise return NULL.

data/include/prism/util/pm_constant_pool.h CHANGED Viewed

@@ -163,7 +163,7 @@ pm_constant_t * pm_constant_pool_id_to_constant(const pm_constant_pool_t *pool,
  * @param length The length of the constant.
  * @return The id of the constant.
  */
-pm_constant_id_t pm_constant_pool_find(pm_constant_pool_t *pool, const uint8_t *start, size_t length);
+pm_constant_id_t pm_constant_pool_find(const pm_constant_pool_t *pool, const uint8_t *start, size_t length);
 /**
  * Insert a constant into a constant pool that is a slice of a source string.

data/include/prism/util/pm_strpbrk.h CHANGED Viewed

@@ -7,6 +7,7 @@
 #define PRISM_STRPBRK_H
 #include "prism/defines.h"
+#include "prism/diagnostic.h"
 #include "prism/parser.h"
 #include <stddef.h>
@@ -35,9 +36,11 @@
  * @param source The source to search.
  * @param charset The charset to search for.
  * @param length The maximum number of bytes to search.
+ * @param validate Whether to validate that the source string is valid in the
+ *     current encoding of the parser.
  * @return A pointer to the first character in the source string that is in the
  *     charset, or NULL if no such character exists.
  */
-const uint8_t * pm_strpbrk(const pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, ptrdiff_t length);
+const uint8_t * pm_strpbrk(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, ptrdiff_t length, bool validate);
 #endif

data/include/prism/version.h CHANGED Viewed

@@ -14,7 +14,7 @@
 /**
  * The minor version of the Prism library as an int.
  */
-#define PRISM_VERSION_MINOR 22
+#define PRISM_VERSION_MINOR 23
 /**
  * The patch version of the Prism library as an int.
@@ -24,6 +24,6 @@
 /**
  * The version of the Prism library as a constant string.
  */
-#define PRISM_VERSION "0.22.0"
+#define PRISM_VERSION "0.23.0"
 #endif

data/lib/prism/ffi.rb CHANGED Viewed

@@ -317,7 +317,7 @@ module Prism
       values << (options.fetch(:frozen_string_literal, false) ? 1 : 0)
       template << "C"
-      values << { nil => 0, "3.3.0" => 1, "latest" => 0 }.fetch(options[:version])
+      values << { nil => 0, "3.3.0" => 1, "3.4.0" => 0, "latest" => 0 }.fetch(options[:version])
       template << "L"
       if (scopes = options[:scopes])

data/lib/prism/lex_compat.rb CHANGED Viewed

@@ -1,6 +1,7 @@
 # frozen_string_literal: true
 require "delegate"
+require "ripper"
 module Prism
   # This class is responsible for lexing the source using prism and then

data/lib/prism/node_ext.rb CHANGED Viewed

@@ -94,7 +94,7 @@ module Prism
     # Returns the full name of this constant. For example: "Foo"
     def full_name
-      name.name
+      name.to_s
     end
   end
@@ -135,7 +135,17 @@ module Prism
     # Returns the list of parts for the full name of this constant path.
     # For example: [:Foo, :Bar]
     def full_name_parts
-      (parent&.full_name_parts || [:""]).push(child.name)
+      parts = case parent
+      when ConstantPathNode, ConstantReadNode
+        parent.full_name_parts
+      when nil
+        [:""]
+      else
+        raise ConstantPathNode::DynamicPartsInConstantPathError,
+          "Constant path target contains dynamic parts. Cannot compute full name"
+      end
+      parts.push(child.name)
     end
     # Returns the full name of this constant path. For example: "Foo::Bar"
@@ -144,6 +154,19 @@ module Prism
     end
   end
+  class ConstantTargetNode < Node
+    # Returns the list of parts for the full name of this constant.
+    # For example: [:Foo]
+    def full_name_parts
+      [name]
+    end
+    # Returns the full name of this constant. For example: "Foo"
+    def full_name
+      name.to_s
+    end
+  end
   class ParametersNode < Node
     # Mirrors the Method#parameters method.
     def signature

data/lib/prism/parse_result.rb CHANGED Viewed

@@ -9,18 +9,16 @@ module Prism
     attr_reader :source
     # The line number where this source starts.
-    attr_accessor :start_line
+    attr_reader :start_line
     # The list of newline byte offsets in the source code.
     attr_reader :offsets
-    # Create a new source object with the given source code and newline byte
-    # offsets. If no newline byte offsets are given, they will be computed from
-    # the source code.
-    def initialize(source, start_line = 1, offsets = compute_offsets(source))
+    # Create a new source object with the given source code.
+    def initialize(source, start_line = 1, offsets = [])
       @source = source
-      @start_line = start_line
-      @offsets = offsets
+      @start_line = start_line # set after parsing is done
+      @offsets = offsets # set after parsing is done
     end
     # Perform a byteslice on the source code using the given byte offset and
@@ -56,6 +54,23 @@ module Prism
       character_offset(byte_offset) - character_offset(line_start(byte_offset))
     end
+    # Returns the offset from the start of the file for the given byte offset
+    # counting in code units for the given encoding.
+    #
+    # This method is tested with UTF-8, UTF-16, and UTF-32. If there is the
+    # concept of code units that differs from the number of characters in other
+    # encodings, it is not captured here.
+    def code_units_offset(byte_offset, encoding)
+      byteslice = source.byteslice(0, byte_offset).encode(encoding)
+      (encoding == Encoding::UTF_16LE || encoding == Encoding::UTF_16BE) ? (byteslice.bytesize / 2) : byteslice.length
+    end
+    # Returns the column number in code units for the given encoding for the
+    # given byte offset.
+    def code_units_column(byte_offset, encoding)
+      code_units_offset(byte_offset, encoding) - code_units_offset(line_start(byte_offset), encoding)
+    end
     private
     # Binary search through the offsets to find the line number for the given
@@ -77,14 +92,6 @@ module Prism
       left - 1
     end
-    # Find all of the newlines in the source code and return their byte offsets
-    # from the start of the string an array.
-    def compute_offsets(code)
-      offsets = [0]
-      code.b.scan("\n") { offsets << $~.end(0) }
-      offsets
-    end
   end
   # This represents a location in the source.
@@ -138,6 +145,11 @@ module Prism
       source.character_offset(start_offset)
     end
+    # The offset from the start of the file in code units of the given encoding.
+    def start_code_units_offset(encoding = Encoding::UTF_16LE)
+      source.code_units_offset(start_offset, encoding)
+    end
     # The byte offset from the beginning of the source where this location ends.
     def end_offset
       start_offset + length
@@ -149,6 +161,11 @@ module Prism
       source.character_offset(end_offset)
     end
+    # The offset from the start of the file in code units of the given encoding.
+    def end_code_units_offset(encoding = Encoding::UTF_16LE)
+      source.code_units_offset(end_offset, encoding)
+    end
     # The line number where this location starts.
     def start_line
       source.line(start_offset)
@@ -177,6 +194,12 @@ module Prism
       source.character_column(start_offset)
     end
+    # The column number in code units of the given encoding where this location
+    # starts from the start of the line.
+    def start_code_units_column(encoding = Encoding::UTF_16LE)
+      source.code_units_column(start_offset, encoding)
+    end
     # The column number in bytes where this location ends from the start of the
     # line.
     def end_column
@@ -189,6 +212,12 @@ module Prism
       source.character_column(end_offset)
     end
+    # The column number in code units of the given encoding where this location
+    # ends from the start of the line.
+    def end_code_units_column(encoding = Encoding::UTF_16LE)
+      source.code_units_column(end_offset, encoding)
+    end
     # Implement the hash pattern matching interface for Location.
     def deconstruct_keys(keys)
       { start_offset: start_offset, end_offset: end_offset }

data/lib/prism/serialize.rb CHANGED Viewed

@@ -27,7 +27,7 @@ module Prism
     # The minor version of prism that we are expecting to find in the serialized
     # strings.
-    MINOR_VERSION = 22
+    MINOR_VERSION = 23
     # The patch version of prism that we are expecting to find in the serialized
     # strings.
@@ -86,11 +86,15 @@ module Prism
       end
       def load_start_line
-        source.start_line = load_varsint
+        source.instance_variable_set :@start_line, load_varsint
+      end
+      def load_line_offsets
+        source.instance_variable_set :@offsets, Array.new(load_varuint) { load_varuint }
       end
       def load_comments
-        load_varuint.times.map do
+        Array.new(load_varuint) do
           case load_varuint
           when 0 then InlineComment.new(load_location)
           when 1 then EmbDocComment.new(load_location)
@@ -101,10 +105,10 @@ module Prism
       def load_metadata
         comments = load_comments
-        magic_comments = load_varuint.times.map { MagicComment.new(load_location, load_location) }
+        magic_comments = Array.new(load_varuint) { MagicComment.new(load_location, load_location) }
         data_loc = load_optional_location
-        errors = load_varuint.times.map { ParseError.new(load_embedded_string, load_location, load_error_level) }
-        warnings = load_varuint.times.map { ParseWarning.new(load_embedded_string, load_location, load_warning_level) }
+        errors = Array.new(load_varuint) { ParseError.new(load_embedded_string, load_location, load_error_level) }
+        warnings = Array.new(load_varuint) { ParseWarning.new(load_embedded_string, load_location, load_warning_level) }
         [comments, magic_comments, data_loc, errors, warnings]
       end
@@ -125,6 +129,7 @@ module Prism
         tokens = load_tokens
         encoding = load_encoding
         load_start_line
+        load_line_offsets
         comments, magic_comments, data_loc, errors, warnings = load_metadata
         tokens.each { |token,| token.value.force_encoding(encoding) }
@@ -136,6 +141,7 @@ module Prism
         load_header
         load_encoding
         load_start_line
+        load_line_offsets
         comments, magic_comments, data_loc, errors, warnings = load_metadata

data/lib/prism/translation/parser.rb CHANGED Viewed

@@ -124,20 +124,21 @@ module Prism
       # build the parser gem AST.
       #
       # If the bytesize of the source is the same as the length, then we can
-      # just use the offset directly. Otherwise, we build a hash that functions
-      # as a cache for the conversion.
-      #
-      # This is a good opportunity for some optimizations. If the source file
-      # has any multi-byte characters, this can tank the performance of the
-      # translator. We could make this significantly faster by using a
-      # different data structure for the cache.
+      # just use the offset directly. Otherwise, we build an array where the
+      # index is the byte offset and the value is the character offset.
       def build_offset_cache(source)
         if source.bytesize == source.length
           -> (offset) { offset }
         else
-          Hash.new do |hash, offset|
-            hash[offset] = source.byteslice(0, offset).length
+          offset_cache = []
+          offset = 0
+          source.each_char do |char|
+            char.bytesize.times { offset_cache << offset }
+            offset += 1
           end
+          offset_cache << offset
         end
       end