RubyGems - prism - Versions diffs - 1.7.0 → 1.9.0 - Mend

prism 1.7.0 → 1.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (43) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +31 -1
data/Makefile +7 -1
data/config.yml +4 -4
data/docs/releasing.md +2 -4
data/docs/ripper_translation.md +8 -17
data/docs/ruby_api.md +1 -0
data/ext/prism/extension.h +1 -1
data/include/prism/ast.h +4 -4
data/include/prism/version.h +2 -2
data/lib/prism/compiler.rb +152 -152
data/lib/prism/lex_compat.rb +133 -150
data/lib/prism/node.rb +1131 -20
data/lib/prism/parse_result.rb +9 -0
data/lib/prism/serialize.rb +1 -1
data/lib/prism/translation/parser_current.rb +1 -1
data/lib/prism/translation/parser_versions.rb +36 -0
data/lib/prism/translation/ripper/filter.rb +53 -0
data/lib/prism/translation/ripper/lexer.rb +135 -0
data/lib/prism/translation/ripper.rb +84 -38
data/lib/prism/translation/ruby_parser.rb +1 -1
data/lib/prism/translation.rb +5 -5
data/lib/prism/visitor.rb +152 -152
data/lib/prism.rb +1 -14
data/prism.gemspec +5 -11
data/rbi/prism/node.rbi +3 -0
data/rbi/prism/translation/parser_versions.rbi +23 -0
data/rbi/prism.rbi +0 -3
data/sig/prism/node.rbs +4 -0
data/sig/prism/parse_result.rbs +1 -0
data/sig/prism.rbs +54 -40
data/src/prism.c +48 -27
metadata +5 -11
data/lib/prism/translation/parser33.rb +0 -13
data/lib/prism/translation/parser34.rb +0 -13
data/lib/prism/translation/parser35.rb +0 -8
data/lib/prism/translation/parser40.rb +0 -13
data/lib/prism/translation/parser41.rb +0 -13
data/rbi/prism/translation/parser33.rbi +0 -6
data/rbi/prism/translation/parser34.rbi +0 -6
data/rbi/prism/translation/parser35.rbi +0 -4
data/rbi/prism/translation/parser40.rbi +0 -6
data/rbi/prism/translation/parser41.rbi +0 -6

data/lib/prism/lex_compat.rb CHANGED Viewed

@@ -1,9 +1,6 @@
 # frozen_string_literal: true
 # :markup: markdown
-require "delegate"
-require "ripper"
 module Prism
   # This class is responsible for lexing the source using prism and then
   # converting those tokens to be compatible with Ripper. In the vast majority
@@ -202,87 +199,51 @@ module Prism
     # When we produce tokens, we produce the same arrays that Ripper does.
     # However, we add a couple of convenience methods onto them to make them a
     # little easier to work with. We delegate all other methods to the array.
-    class Token < SimpleDelegator
-      # @dynamic initialize, each, []
+    class Token < BasicObject
+      # Create a new token object with the given ripper-compatible array.
+      def initialize(array)
+        @array = array
+      end
       # The location of the token in the source.
       def location
-        self[0]
+        @array[0]
       end
       # The type of the token.
       def event
-        self[1]
+        @array[1]
       end
       # The slice of the source that this token represents.
       def value
-        self[2]
+        @array[2]
       end
       # The state of the lexer when this token was produced.
       def state
-        self[3]
+        @array[3]
       end
-    end
-    # Ripper doesn't include the rest of the token in the event, so we need to
-    # trim it down to just the content on the first line when comparing.
-    class EndContentToken < Token
+      # We want to pretend that this is just an Array.
       def ==(other) # :nodoc:
-        [self[0], self[1], self[2][0..self[2].index("\n")], self[3]] == other
+        @array == other
       end
-    end
-    # Tokens where state should be ignored
-    # used for :on_comment, :on_heredoc_end, :on_embexpr_end
-    class IgnoreStateToken < Token
-      def ==(other) # :nodoc:
-        self[0...-1] == other[0...-1]
-      end
-    end
-    # Ident tokens for the most part are exactly the same, except sometimes we
-    # know an ident is a local when ripper doesn't (when they are introduced
-    # through named captures in regular expressions). In that case we don't
-    # compare the state.
-    class IdentToken < Token
-      def ==(other) # :nodoc:
-        (self[0...-1] == other[0...-1]) && (
-          (other[3] == Ripper::EXPR_LABEL | Ripper::EXPR_END) ||
-          (other[3] & Ripper::EXPR_ARG_ANY != 0)
-        )
+      def respond_to_missing?(name, include_private = false) # :nodoc:
+        @array.respond_to?(name, include_private)
       end
-    end
-    # Ignored newlines can occasionally have a LABEL state attached to them, so
-    # we compare the state differently here.
-    class IgnoredNewlineToken < Token
-      def ==(other) # :nodoc:
-        return false unless self[0...-1] == other[0...-1]
-        if self[3] == Ripper::EXPR_ARG | Ripper::EXPR_LABELED
-          other[3] & Ripper::EXPR_ARG | Ripper::EXPR_LABELED != 0
-        else
-          self[3] == other[3]
-        end
+      def method_missing(name, ...) # :nodoc:
+        @array.send(name, ...)
       end
     end
-    # If we have an identifier that follows a method name like:
-    #
-    #     def foo bar
-    #
-    # then Ripper will mark bar as END|LABEL if there is a local in a parent
-    # scope named bar because it hasn't pushed the local table yet. We do this
-    # more accurately, so we need to allow comparing against both END and
-    # END|LABEL.
-    class ParamToken < Token
+    # Tokens where state should be ignored
+    # used for :on_sp, :on_comment, :on_heredoc_end, :on_embexpr_end
+    class IgnoreStateToken < Token
       def ==(other) # :nodoc:
-        (self[0...-1] == other[0...-1]) && (
-          (other[3] == Ripper::EXPR_END) ||
-          (other[3] == Ripper::EXPR_END | Ripper::EXPR_LABEL)
-        )
+        self[0...-1] == other[0...-1]
       end
     end
@@ -615,10 +576,15 @@ module Prism
     private_constant :Heredoc
-    attr_reader :source, :options
+    # In previous versions of Ruby, Ripper wouldn't flush the bom before the
+    # first token, so we had to have a hack in place to account for that.
+    BOM_FLUSHED = RUBY_VERSION >= "3.3.0"
+    private_constant :BOM_FLUSHED
-    def initialize(source, **options)
-      @source = source
+    attr_reader :options
+    def initialize(code, **options)
+      @code = code
       @options = options
     end
@@ -628,16 +594,14 @@ module Prism
       state = :default
       heredoc_stack = [[]] #: Array[Array[Heredoc::PlainHeredoc | Heredoc::DashHeredoc | Heredoc::DedentingHeredoc]]
-      result = Prism.lex(source, **options)
+      result = Prism.lex(@code, **options)
+      source = result.source
       result_value = result.value
-      previous_state = nil #: Ripper::Lexer::State?
+      previous_state = nil #: State?
       last_heredoc_end = nil #: Integer?
+      eof_token = nil
-      # In previous versions of Ruby, Ripper wouldn't flush the bom before the
-      # first token, so we had to have a hack in place to account for that. This
-      # checks for that behavior.
-      bom_flushed = Ripper.lex("\xEF\xBB\xBF# test")[0][0][1] == 0
-      bom = source.byteslice(0..2) == "\xEF\xBB\xBF"
+      bom = source.slice(0, 3) == "\xEF\xBB\xBF"
       result_value.each_with_index do |(token, lex_state), index|
         lineno = token.location.start_line
@@ -651,7 +615,7 @@ module Prism
         if bom && lineno == 1
           column -= 3
-          if index == 0 && column == 0 && !bom_flushed
+          if index == 0 && column == 0 && !BOM_FLUSHED
             flushed =
               case token.type
               when :BACK_REFERENCE, :INSTANCE_VARIABLE, :CLASS_VARIABLE,
@@ -675,12 +639,15 @@ module Prism
         event = RIPPER.fetch(token.type)
         value = token.value
-        lex_state = Ripper::Lexer::State.new(lex_state)
+        lex_state = Translation::Ripper::Lexer::State.cached(lex_state)
         token =
           case event
           when :on___end__
-            EndContentToken.new([[lineno, column], event, value, lex_state])
+            # Ripper doesn't include the rest of the token in the event, so we need to
+            # trim it down to just the content on the first line.
+            value = value[0..value.index("\n")]
+            Token.new([[lineno, column], event, value, lex_state])
           when :on_comment
             IgnoreStateToken.new([[lineno, column], event, value, lex_state])
           when :on_heredoc_end
@@ -688,33 +655,18 @@ module Prism
             # want to bother comparing the state on them.
             last_heredoc_end = token.location.end_offset
             IgnoreStateToken.new([[lineno, column], event, value, lex_state])
-          when :on_ident
-            if lex_state == Ripper::EXPR_END
-              # If we have an identifier that follows a method name like:
-              #
-              #     def foo bar
-              #
-              # then Ripper will mark bar as END|LABEL if there is a local in a
-              # parent scope named bar because it hasn't pushed the local table
-              # yet. We do this more accurately, so we need to allow comparing
-              # against both END and END|LABEL.
-              ParamToken.new([[lineno, column], event, value, lex_state])
-            elsif lex_state == Ripper::EXPR_END | Ripper::EXPR_LABEL
-              # In the event that we're comparing identifiers, we're going to
-              # allow a little divergence. Ripper doesn't account for local
-              # variables introduced through named captures in regexes, and we
-              # do, which accounts for this difference.
-              IdentToken.new([[lineno, column], event, value, lex_state])
-            else
-              Token.new([[lineno, column], event, value, lex_state])
-            end
           when :on_embexpr_end
             IgnoreStateToken.new([[lineno, column], event, value, lex_state])
-          when :on_ignored_nl
-            # Ignored newlines can occasionally have a LABEL state attached to
-            # them which doesn't actually impact anything. We don't mirror that
-            # state so we ignored it.
-            IgnoredNewlineToken.new([[lineno, column], event, value, lex_state])
+          when :on_words_sep
+            # Ripper emits one token each per line.
+            value.each_line.with_index do |line, index|
+              if index > 0
+                lineno += 1
+                column = 0
+              end
+              tokens << Token.new([[lineno, column], event, line, lex_state])
+            end
+            tokens.pop
           when :on_regexp_end
             # On regex end, Ripper scans and then sets end state, so the ripper
             # lexed output is begin, when it should be end. prism sets lex state
@@ -739,13 +691,14 @@ module Prism
                   counter += { on_embexpr_beg: -1, on_embexpr_end: 1 }[current_event] || 0
                 end
-                Ripper::Lexer::State.new(result_value[current_index][1])
+                Translation::Ripper::Lexer::State.cached(result_value[current_index][1])
               else
                 previous_state
               end
             Token.new([[lineno, column], event, value, lex_state])
           when :on_eof
+            eof_token = token
             previous_token = result_value[index - 1][0]
             # If we're at the end of the file and the previous token was a
@@ -768,7 +721,7 @@ module Prism
                   end_offset += 3
                 end
-                tokens << Token.new([[lineno, 0], :on_nl, source.byteslice(start_offset...end_offset), lex_state])
+                tokens << Token.new([[lineno, 0], :on_nl, source.slice(start_offset, end_offset - start_offset), lex_state])
               end
             end
@@ -859,70 +812,100 @@ module Prism
       # Drop the EOF token from the list
       tokens = tokens[0...-1]
-      # We sort by location to compare against Ripper's output
-      tokens.sort_by!(&:location)
-      Result.new(tokens, result.comments, result.magic_comments, result.data_loc, result.errors, result.warnings, Source.for(source))
-    end
-  end
-  private_constant :LexCompat
+      # We sort by location because Ripper.lex sorts.
+      # Manually implemented instead of `sort_by!(&:location)` for performance.
+      tokens.sort_by! do |token|
+        line, column = token.location
+        source.byte_offset(line, column)
+      end
-  # This is a class that wraps the Ripper lexer to produce almost exactly the
-  # same tokens.
-  class LexRipper # :nodoc:
-    attr_reader :source
+      # Add :on_sp tokens
+      tokens = add_on_sp_tokens(tokens, source, result.data_loc, bom, eof_token)
-    def initialize(source)
-      @source = source
+      Result.new(tokens, result.comments, result.magic_comments, result.data_loc, result.errors, result.warnings, source)
     end
-    def result
-      previous = [] #: [[Integer, Integer], Symbol, String, untyped] | []
-      results = [] #: Array[[[Integer, Integer], Symbol, String, untyped]]
-      lex(source).each do |token|
-        case token[1]
-        when :on_sp
-          # skip
-        when :on_tstring_content
-          if previous[1] == :on_tstring_content && (token[2].start_with?("\#$") || token[2].start_with?("\#@"))
-            previous[2] << token[2]
+    def add_on_sp_tokens(tokens, source, data_loc, bom, eof_token)
+      new_tokens = []
+      prev_token_state = Translation::Ripper::Lexer::State.cached(Translation::Ripper::EXPR_BEG)
+      prev_token_end = bom ? 3 : 0
+      tokens.each do |token|
+        line, column = token.location
+        start_offset = source.byte_offset(line, column)
+        # Ripper reports columns on line 1 without counting the BOM, so we
+        # adjust to get the real offset
+        start_offset += 3 if line == 1 && bom
+        if start_offset > prev_token_end
+          sp_value = source.slice(prev_token_end, start_offset - prev_token_end)
+          sp_line = source.line(prev_token_end)
+          sp_column = source.column(prev_token_end)
+          # Ripper reports columns on line 1 without counting the BOM
+          sp_column -= 3 if sp_line == 1 && bom
+          continuation_index = sp_value.byteindex("\\")
+          # ripper emits up to three :on_sp tokens when line continuations are used
+          if continuation_index
+            next_whitespace_index = continuation_index + 1
+            next_whitespace_index += 1 if sp_value.byteslice(next_whitespace_index) == "\r"
+            next_whitespace_index += 1
+            first_whitespace = sp_value[0...continuation_index]
+            continuation = sp_value[continuation_index...next_whitespace_index]
+            second_whitespace = sp_value[next_whitespace_index..]
+            new_tokens << IgnoreStateToken.new([
+              [sp_line, sp_column],
+              :on_sp,
+              first_whitespace,
+              prev_token_state
+            ]) unless first_whitespace.empty?
+            new_tokens << IgnoreStateToken.new([
+              [sp_line, sp_column + continuation_index],
+              :on_sp,
+              continuation,
+              prev_token_state
+            ])
+            new_tokens << IgnoreStateToken.new([
+              [sp_line + 1, 0],
+              :on_sp,
+              second_whitespace,
+              prev_token_state
+            ]) unless second_whitespace.empty?
           else
-            results << token
-            previous = token
+            new_tokens << IgnoreStateToken.new([
+              [sp_line, sp_column],
+              :on_sp,
+              sp_value,
+              prev_token_state
+            ])
           end
-        when :on_words_sep
-          if previous[1] == :on_words_sep
-            previous[2] << token[2]
-          else
-            results << token
-            previous = token
-          end
-        else
-          results << token
-          previous = token
         end
-      end
-      results
-    end
-    private
-    if Ripper.method(:lex).parameters.assoc(:keyrest)
-      def lex(source)
-        Ripper.lex(source, raise_errors: true)
+        new_tokens << token
+        prev_token_state = token.state
+        prev_token_end = start_offset + token.value.bytesize
       end
-    else
-      def lex(source)
-        ripper = Ripper::Lexer.new(source)
-        ripper.lex.tap do |result|
-          raise SyntaxError, ripper.errors.map(&:message).join(' ;') if ripper.errors.any?
+      unless data_loc # no trailing :on_sp with __END__ as it is always preceded by :on_nl
+        end_offset = eof_token.location.end_offset
+        if prev_token_end < end_offset
+          new_tokens << IgnoreStateToken.new([
+            [source.line(prev_token_end), source.column(prev_token_end)],
+            :on_sp,
+            source.slice(prev_token_end, end_offset - prev_token_end),
+            prev_token_state
+          ])
         end
       end
+      new_tokens
     end
   end
-  private_constant :LexRipper
+  private_constant :LexCompat
 end