RubyGems - prism - Versions diffs - 1.1.0 → 1.2.0 - Mend

prism 1.1.0 → 1.2.0

Files changed (23) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +17 -1
data/config.yml +2 -1
data/ext/prism/extension.h +1 -1
data/include/prism/ast.h +1 -1
data/include/prism/diagnostic.h +1 -0
data/include/prism/parser.h +25 -12
data/include/prism/version.h +2 -2
data/lib/prism/node.rb +1 -1
data/lib/prism/parse_result.rb +140 -3
data/lib/prism/serialize.rb +13 -1
data/lib/prism/translation/parser.rb +3 -3
data/lib/prism/translation/ripper.rb +1 -5
data/lib/prism/translation/ruby_parser.rb +2 -2
data/prism.gemspec +1 -1
data/rbi/prism/node.rbi +5777 -1701
data/rbi/prism/parse_result.rbi +29 -0
data/rbi/prism.rbi +34 -34
data/sig/prism/node.rbs +1 -90
data/sig/prism/parse_result.rbs +20 -0
data/src/diagnostic.c +3 -1
data/src/prism.c +223 -115
metadata +2 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: ea89f88aef2ec51d2cfb5868cf873ab256393f5ba632381a3a53631c2506dbc3
-  data.tar.gz: 4da85d79e85d5cca843eb6e71cea07efc80a6e77608aa3027e1cd75c3c3b735a
+  metadata.gz: f16f842a06eec8141246c60a39f509a59817de34ab4be3a33502ada040ac1602
+  data.tar.gz: 38bd30b4ba63fe67892a0138be5b7b2c84fb6bd66e011871a07ee453be7b0aec
 SHA512:
-  metadata.gz: 1b7f92a58fa176b04aab230f49dcdc08f9b810575426402db6fc9eee0921ac5ddddf1f01f18bb9429a914955759e8d4cee5a70674e14eb0b0169018ce615780e
-  data.tar.gz: e4a5a6ba40bc7692c6c904f452dd8ff18881703c99d51969507619368748eec29936dc3f521e8967b6d728ea331f26f9f670b44daedde2106f649104aefe0c30
+  metadata.gz: 9877dc80270515e91c5357418a67721fa832c6de44d755047f576d69a3b09129d64da19e5ac767d184df74afe40982f56c88be3851ce7fc7c99c1d0bbf15ec77
+  data.tar.gz: 4578c2f1e2e934f763d6c55ae84dc076b0676ac0560c13364db1ade75425da76d64b7a4cf8fb07ea723568fbfc58c20fb8055e9dec364ac5da05765d26398d39

data/CHANGELOG.md CHANGED Viewed

@@ -6,6 +6,21 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) a
 ## [Unreleased]
+## [1.2.0] - 2024-10-10
+### Added
+- Introduce `Prism::CodeUnitsCache`.
+### Changed
+- Properly handle lexing global variables that begin with `$-`.
+- Properly reject invalid multi writes within parentheses.
+- Fix unary `*` binding power.
+- Set `contains_keywords` flag for implicit `gets` calls when `-p` is used.
+- Properly reject invalid non-associative operator patterns.
+- Do not warn about unused variables declared on negative lines.
 ## [1.1.0] - 2024-10-02
 ### Added
@@ -591,7 +606,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) a
 - 🎉 Initial release! 🎉
-[unreleased]: https://github.com/ruby/prism/compare/v1.1.0...HEAD
+[unreleased]: https://github.com/ruby/prism/compare/v1.2.0...HEAD
+[1.2.0]: https://github.com/ruby/prism/compare/v1.1.0...v1.2.0
 [1.1.0]: https://github.com/ruby/prism/compare/v1.0.0...v1.1.0
 [1.0.0]: https://github.com/ruby/prism/compare/v0.30.0...v1.0.0
 [0.30.0]: https://github.com/ruby/prism/compare/v0.29.0...v0.30.0

data/config.yml CHANGED Viewed

@@ -141,6 +141,7 @@ errors:
   - INSTANCE_VARIABLE_BARE
   - INVALID_BLOCK_EXIT
   - INVALID_CHARACTER
+  - INVALID_COMMA
   - INVALID_ENCODING_MAGIC_COMMENT
   - INVALID_ESCAPE_CHARACTER
   - INVALID_FLOAT_EXPONENT
@@ -3684,7 +3685,7 @@ nodes:
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
           end
-      `Foo, *splat, Bar` are in the `exceptions` field. `ex` is in the `exception` field.
+      `Foo, *splat, Bar` are in the `exceptions` field. `ex` is in the `reference` field.
   - name: RestParameterNode
     flags: ParameterFlags
     fields:

data/ext/prism/extension.h CHANGED Viewed

@@ -1,7 +1,7 @@
 #ifndef PRISM_EXT_NODE_H
 #define PRISM_EXT_NODE_H
-#define EXPECTED_PRISM_VERSION "1.1.0"
+#define EXPECTED_PRISM_VERSION "1.2.0"
 #include <ruby.h>
 #include <ruby/encoding.h>

data/include/prism/ast.h CHANGED Viewed

@@ -6490,7 +6490,7 @@ typedef struct pm_rescue_modifier_node {
  *     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  *     end
  *
- * `Foo, *splat, Bar` are in the `exceptions` field. `ex` is in the `exception` field.
+ * `Foo, *splat, Bar` are in the `exceptions` field. `ex` is in the `reference` field.
  *
  * Type: ::PM_RESCUE_NODE
  *

data/include/prism/diagnostic.h CHANGED Viewed

@@ -170,6 +170,7 @@ typedef enum {
     PM_ERR_INSTANCE_VARIABLE_BARE,
     PM_ERR_INVALID_BLOCK_EXIT,
     PM_ERR_INVALID_CHARACTER,
+    PM_ERR_INVALID_COMMA,
     PM_ERR_INVALID_ENCODING_MAGIC_COMMENT,
     PM_ERR_INVALID_ESCAPE_CHARACTER,
     PM_ERR_INVALID_FLOAT_EXPONENT,

data/include/prism/parser.h CHANGED Viewed

@@ -82,6 +82,23 @@ typedef enum {
     PM_HEREDOC_INDENT_TILDE,
 } pm_heredoc_indent_t;
+/**
+ * All of the information that must be stored in order to lex a heredoc.
+ */
+typedef struct {
+    /** A pointer to the start of the heredoc identifier. */
+    const uint8_t *ident_start;
+    /** The length of the heredoc identifier. */
+    size_t ident_length;
+    /** The type of quote that the heredoc uses. */
+    pm_heredoc_quote_t quote;
+    /** The type of indentation that the heredoc uses. */
+    pm_heredoc_indent_t indent;
+} pm_heredoc_lex_mode_t;
 /**
  * When lexing Ruby source, the lexer has a small amount of state to tell which
  * kind of token it is currently lexing. For example, when we find the start of
@@ -210,17 +227,10 @@ typedef struct pm_lex_mode {
         } string;
         struct {
-            /** A pointer to the start of the heredoc identifier. */
-            const uint8_t *ident_start;
-            /** The length of the heredoc identifier. */
-            size_t ident_length;
-            /** The type of quote that the heredoc uses. */
-            pm_heredoc_quote_t quote;
-            /** The type of indentation that the heredoc uses. */
-            pm_heredoc_indent_t indent;
+            /**
+             * All of the data necessary to lex a heredoc.
+             */
+            pm_heredoc_lex_mode_t base;
             /**
              * This is the pointer to the character where lexing should resume
@@ -233,7 +243,7 @@ typedef struct pm_lex_mode {
              * line so that we know how much to dedent each line in the case of
              * a tilde heredoc.
              */
-            size_t common_whitespace;
+            size_t *common_whitespace;
             /** True if the previous token ended with a line continuation. */
             bool line_continuation;
@@ -382,6 +392,9 @@ typedef enum {
     /** a rescue statement within a module statement */
     PM_CONTEXT_MODULE_RESCUE,
+    /** a multiple target expression */
+    PM_CONTEXT_MULTI_TARGET,
     /** a parenthesized expression */
     PM_CONTEXT_PARENS,

data/include/prism/version.h CHANGED Viewed

@@ -14,7 +14,7 @@
 /**
  * The minor version of the Prism library as an int.
  */
-#define PRISM_VERSION_MINOR 1
+#define PRISM_VERSION_MINOR 2
 /**
  * The patch version of the Prism library as an int.
@@ -24,6 +24,6 @@
 /**
  * The version of the Prism library as a constant string.
  */
-#define PRISM_VERSION "1.1.0"
+#define PRISM_VERSION "1.2.0"
 #endif

data/lib/prism/node.rb CHANGED Viewed

@@ -14219,7 +14219,7 @@ module Prism
   #     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
   #     end
   #
-  # `Foo, *splat, Bar` are in the `exceptions` field. `ex` is in the `exception` field.
+  # `Foo, *splat, Bar` are in the `exceptions` field. `ex` is in the `reference` field.
   class RescueNode < Node
     # Initialize a new RescueNode node.
     def initialize(source, node_id, location, flags, keyword_loc, exceptions, operator_loc, reference, statements, subsequent)

data/lib/prism/parse_result.rb CHANGED Viewed

@@ -12,6 +12,21 @@ module Prism
     def self.for(source, start_line = 1, offsets = [])
       if source.ascii_only?
         ASCIISource.new(source, start_line, offsets)
+      elsif source.encoding == Encoding::BINARY
+        source.force_encoding(Encoding::UTF_8)
+        if source.valid_encoding?
+          new(source, start_line, offsets)
+        else
+          # This is an extremely niche use case where the file is marked as
+          # binary, contains multi-byte characters, and those characters are not
+          # valid UTF-8. In this case we'll mark it as binary and fall back to
+          # treating everything as a single-byte character. This _may_ cause
+          # problems when asking for code units, but it appears to be the
+          # cleanest solution at the moment.
+          source.force_encoding(Encoding::BINARY)
+          ASCIISource.new(source, start_line, offsets)
+        end
       else
         new(source, start_line, offsets)
       end
@@ -89,8 +104,14 @@ module Prism
     # This method is tested with UTF-8, UTF-16, and UTF-32. If there is the
     # concept of code units that differs from the number of characters in other
     # encodings, it is not captured here.
+    #
+    # We purposefully replace invalid and undefined characters with replacement
+    # characters in this conversion. This happens for two reasons. First, it's
+    # possible that the given byte offset will not occur on a character
+    # boundary. Second, it's possible that the source code will contain a
+    # character that has no equivalent in the given encoding.
     def code_units_offset(byte_offset, encoding)
-      byteslice = (source.byteslice(0, byte_offset) or raise).encode(encoding)
+      byteslice = (source.byteslice(0, byte_offset) or raise).encode(encoding, invalid: :replace, undef: :replace)
       if encoding == Encoding::UTF_16LE || encoding == Encoding::UTF_16BE
         byteslice.bytesize / 2
@@ -99,6 +120,12 @@ module Prism
       end
     end
+    # Generate a cache that targets a specific encoding for calculating code
+    # unit offsets.
+    def code_units_cache(encoding)
+      CodeUnitsCache.new(source, encoding)
+    end
     # Returns the column number in code units for the given encoding for the
     # given byte offset.
     def code_units_column(byte_offset, encoding)
@@ -128,10 +155,84 @@ module Prism
     end
   end
+  # A cache that can be used to quickly compute code unit offsets from byte
+  # offsets. It purposefully provides only a single #[] method to access the
+  # cache in order to minimize surface area.
+  #
+  # Note that there are some known issues here that may or may not be addressed
+  # in the future:
+  #
+  # * The first is that there are issues when the cache computes values that are
+  #   not on character boundaries. This can result in subsequent computations
+  #   being off by one or more code units.
+  # * The second is that this cache is currently unbounded. In theory we could
+  #   introduce some kind of LRU cache to limit the number of entries, but this
+  #   has not yet been implemented.
+  #
+  class CodeUnitsCache
+    class UTF16Counter # :nodoc:
+      def initialize(source, encoding)
+        @source = source
+        @encoding = encoding
+      end
+      def count(byte_offset, byte_length)
+        @source.byteslice(byte_offset, byte_length).encode(@encoding, invalid: :replace, undef: :replace).bytesize / 2
+      end
+    end
+    class LengthCounter # :nodoc:
+      def initialize(source, encoding)
+        @source = source
+        @encoding = encoding
+      end
+      def count(byte_offset, byte_length)
+        @source.byteslice(byte_offset, byte_length).encode(@encoding, invalid: :replace, undef: :replace).length
+      end
+    end
+    private_constant :UTF16Counter, :LengthCounter
+    # Initialize a new cache with the given source and encoding.
+    def initialize(source, encoding)
+      @source = source
+      @counter =
+        if encoding == Encoding::UTF_16LE || encoding == Encoding::UTF_16BE
+          UTF16Counter.new(source, encoding)
+        else
+          LengthCounter.new(source, encoding)
+        end
+      @cache = {}
+      @offsets = []
+    end
+    # Retrieve the code units offset from the given byte offset.
+    def [](byte_offset)
+      @cache[byte_offset] ||=
+        if (index = @offsets.bsearch_index { |offset| offset > byte_offset }).nil?
+          @offsets << byte_offset
+          @counter.count(0, byte_offset)
+        elsif index == 0
+          @offsets.unshift(byte_offset)
+          @counter.count(0, byte_offset)
+        else
+          @offsets.insert(index, byte_offset)
+          offset = @offsets[index - 1]
+          @cache[offset] + @counter.count(offset, byte_offset - offset)
+        end
+    end
+  end
   # Specialized version of Prism::Source for source code that includes ASCII
   # characters only. This class is used to apply performance optimizations that
-  # cannot be applied to sources that include multibyte characters. Sources that
-  # include multibyte characters are represented by the Prism::Source class.
+  # cannot be applied to sources that include multibyte characters.
+  #
+  # In the extremely rare case that a source includes multi-byte characters but
+  # is marked as binary because of a magic encoding comment and it cannot be
+  # eagerly converted to UTF-8, this class will be used as well. This is because
+  # at that point we will treat everything as single-byte characters.
   class ASCIISource < Source
     # Return the character offset for the given byte offset.
     def character_offset(byte_offset)
@@ -153,6 +254,13 @@ module Prism
       byte_offset
     end
+    # Returns a cache that is the identity function in order to maintain the
+    # same interface. We can do this because code units are always equivalent to
+    # byte offsets for ASCII-only sources.
+    def code_units_cache(encoding)
+      ->(byte_offset) { byte_offset }
+    end
     # Specialized version of `code_units_column` that does not depend on
     # `code_units_offset`, which is a more expensive operation. This is
     # essentially the same as `Prism::Source#column`.
@@ -262,6 +370,12 @@ module Prism
       source.code_units_offset(start_offset, encoding)
     end
+    # The start offset from the start of the file in code units using the given
+    # cache to fetch or calculate the value.
+    def cached_start_code_units_offset(cache)
+      cache[start_offset]
+    end
     # The byte offset from the beginning of the source where this location ends.
     def end_offset
       start_offset + length
@@ -278,6 +392,12 @@ module Prism
       source.code_units_offset(end_offset, encoding)
     end
+    # The end offset from the start of the file in code units using the given
+    # cache to fetch or calculate the value.
+    def cached_end_code_units_offset(cache)
+      cache[end_offset]
+    end
     # The line number where this location starts.
     def start_line
       source.line(start_offset)
@@ -312,6 +432,12 @@ module Prism
       source.code_units_column(start_offset, encoding)
     end
+    # The start column in code units using the given cache to fetch or calculate
+    # the value.
+    def cached_start_code_units_column(cache)
+      cache[start_offset] - cache[source.line_start(start_offset)]
+    end
     # The column number in bytes where this location ends from the start of the
     # line.
     def end_column
@@ -330,6 +456,12 @@ module Prism
       source.code_units_column(end_offset, encoding)
     end
+    # The end column in code units using the given cache to fetch or calculate
+    # the value.
+    def cached_end_code_units_column(cache)
+      cache[end_offset] - cache[source.line_start(end_offset)]
+    end
     # Implement the hash pattern matching interface for Location.
     def deconstruct_keys(keys)
       { start_offset: start_offset, end_offset: end_offset }
@@ -579,6 +711,11 @@ module Prism
     def failure?
       !success?
     end
+    # Create a code units cache for the given encoding.
+    def code_units_cache(encoding)
+      source.code_units_cache(encoding)
+    end
   end
   # This is a result specific to the `parse` and `parse_file` methods.

data/lib/prism/serialize.rb CHANGED Viewed

@@ -18,7 +18,7 @@ module Prism
     # The minor version of prism that we are expecting to find in the serialized
     # strings.
-    MINOR_VERSION = 1
+    MINOR_VERSION = 2
     # The patch version of prism that we are expecting to find in the serialized
     # strings.
@@ -28,10 +28,21 @@ module Prism
     def self.load(input, serialized)
       input = input.dup
       source = Source.for(input)
       loader = Loader.new(source, serialized)
       result = loader.load_result
       input.force_encoding(loader.encoding)
+      # This is an extremely niche use-case where the file was marked as binary
+      # but it contained UTF-8-encoded characters. In that case we will actually
+      # put it back to UTF-8 to give the location APIs the best chance of being
+      # correct.
+      if !input.ascii_only? && input.encoding == Encoding::BINARY
+        input.force_encoding(Encoding::UTF_8)
+        input.force_encoding(Encoding::BINARY) unless input.valid_encoding?
+      end
       result
     end
@@ -267,6 +278,7 @@ module Prism
         :instance_variable_bare,
         :invalid_block_exit,
         :invalid_character,
+        :invalid_comma,
         :invalid_encoding_magic_comment,
         :invalid_escape_character,
         :invalid_float_exponent,

data/lib/prism/translation/parser.rb CHANGED Viewed

@@ -51,7 +51,7 @@ module Prism
         source = source_buffer.source
         offset_cache = build_offset_cache(source)
-        result = unwrap(Prism.parse(source, filepath: source_buffer.name, version: convert_for_prism(version), scopes: [[]], encoding: false), offset_cache)
+        result = unwrap(Prism.parse(source, filepath: source_buffer.name, version: convert_for_prism(version), partial_script: true, encoding: false), offset_cache)
         build_ast(result.value, offset_cache)
       ensure
@@ -64,7 +64,7 @@ module Prism
         source = source_buffer.source
         offset_cache = build_offset_cache(source)
-        result = unwrap(Prism.parse(source, filepath: source_buffer.name, version: convert_for_prism(version), scopes: [[]], encoding: false), offset_cache)
+        result = unwrap(Prism.parse(source, filepath: source_buffer.name, version: convert_for_prism(version), partial_script: true, encoding: false), offset_cache)
         [
           build_ast(result.value, offset_cache),
@@ -83,7 +83,7 @@ module Prism
         offset_cache = build_offset_cache(source)
         result =
           begin
-            unwrap(Prism.parse_lex(source, filepath: source_buffer.name, version: convert_for_prism(version), scopes: [[]], encoding: false), offset_cache)
+            unwrap(Prism.parse_lex(source, filepath: source_buffer.name, version: convert_for_prism(version), partial_script: true, encoding: false), offset_cache)
           rescue ::Parser::SyntaxError
             raise if !recover
           end

data/lib/prism/translation/ripper.rb CHANGED Viewed

@@ -3269,11 +3269,7 @@ module Prism
       # Lazily initialize the parse result.
       def result
-        @result ||=
-          begin
-            scopes = RUBY_VERSION >= "3.3.0" ? [] : [[]]
-            Prism.parse(source, scopes: scopes)
-          end
+        @result ||= Prism.parse(source, partial_script: true)
       end
       ##########################################################################

data/lib/prism/translation/ruby_parser.rb CHANGED Viewed

@@ -1596,13 +1596,13 @@ module Prism
       # Parse the given source and translate it into the seattlerb/ruby_parser
       # gem's Sexp format.
       def parse(source, filepath = "(string)")
-        translate(Prism.parse(source, filepath: filepath, scopes: [[]]), filepath)
+        translate(Prism.parse(source, filepath: filepath, partial_script: true), filepath)
       end
       # Parse the given file and translate it into the seattlerb/ruby_parser
       # gem's Sexp format.
       def parse_file(filepath)
-        translate(Prism.parse_file(filepath, scopes: [[]]), filepath)
+        translate(Prism.parse_file(filepath, partial_script: true), filepath)
       end
       class << self

data/prism.gemspec CHANGED Viewed

@@ -2,7 +2,7 @@
 Gem::Specification.new do |spec|
   spec.name = "prism"
-  spec.version = "1.1.0"
+  spec.version = "1.2.0"
   spec.authors = ["Shopify"]
   spec.email = ["ruby@shopify.com"]