RubyGems - yarp - Versions diffs - 0.7.0 → 0.9.0 - Mend

yarp 0.7.0 → 0.9.0

Files changed (42) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +54 -2
data/Makefile +2 -0
data/README.md +9 -5
data/config.yml +160 -93
data/docs/configuration.md +1 -0
data/docs/ruby_api.md +2 -0
data/docs/serialization.md +1 -1
data/docs/testing.md +2 -2
data/ext/yarp/api_node.c +361 -238
data/ext/yarp/extension.c +75 -26
data/ext/yarp/extension.h +2 -2
data/include/yarp/ast.h +226 -175
data/include/yarp/defines.h +5 -0
data/include/yarp/node.h +10 -0
data/include/yarp/unescape.h +4 -2
data/include/yarp/util/yp_buffer.h +9 -1
data/include/yarp/util/yp_constant_pool.h +3 -0
data/include/yarp/util/yp_list.h +7 -7
data/include/yarp/util/yp_newline_list.h +7 -0
data/include/yarp/util/yp_state_stack.h +1 -1
data/include/yarp/version.h +2 -2
data/include/yarp.h +10 -0
data/lib/yarp/desugar_visitor.rb +267 -0
data/lib/yarp/ffi.rb +89 -48
data/lib/yarp/lex_compat.rb +93 -25
data/lib/yarp/mutation_visitor.rb +683 -0
data/lib/yarp/node.rb +2061 -422
data/lib/yarp/serialize.rb +162 -120
data/lib/yarp.rb +54 -8
data/src/node.c +360 -304
data/src/prettyprint.c +190 -152
data/src/serialize.c +382 -340
data/src/token_type.c +2 -2
data/src/unescape.c +89 -77
data/src/util/yp_buffer.c +18 -0
data/src/util/yp_list.c +7 -16
data/src/util/yp_newline_list.c +10 -0
data/src/util/yp_state_stack.c +0 -6
data/src/yarp.c +941 -596
data/yarp.gemspec +3 -1
metadata +4 -2

data/lib/yarp/lex_compat.rb CHANGED Viewed

@@ -208,18 +208,9 @@ module YARP
       end
     end
-    # It is extremely non obvious which state the parser is in when comments get
-    # dispatched. Because of this we don't both comparing state when comparing
-    # against other comment tokens.
-    class CommentToken < Token
-      def ==(other)
-        self[0...-1] == other[0...-1]
-      end
-    end
-    # Heredoc end tokens are emitted in an odd order, so we don't compare the
-    # state on them.
-    class HeredocEndToken < Token
+    # Tokens where state should be ignored
+    # used for :on_comment, :on_heredoc_end, :on_embexpr_end
+    class IgnoreStateToken < Token
       def ==(other)
         self[0...-1] == other[0...-1]
       end
@@ -252,6 +243,23 @@ module YARP
       end
     end
+    # If we have an identifier that follows a method name like:
+    #
+    #     def foo bar
+    #
+    # then Ripper will mark bar as END|LABEL if there is a local in a parent
+    # scope named bar because it hasn't pushed the local table yet. We do this
+    # more accurately, so we need to allow comparing against both END and
+    # END|LABEL.
+    class ParamToken < Token
+      def ==(other)
+        (self[0...-1] == other[0...-1]) && (
+          (other[3] == Ripper::EXPR_END) ||
+          (other[3] == Ripper::EXPR_END | Ripper::EXPR_LABEL)
+        )
+      end
+    end
     # A heredoc in this case is a list of tokens that belong to the body of the
     # heredoc that should be appended onto the list of tokens when the heredoc
     # closes.
@@ -558,18 +566,45 @@ module YARP
       result_value = result.value
       previous_state = nil
-      # If there's a UTF-8 byte-order mark as the start of the file, then ripper
-      # sets every token's on the first line back by 6 bytes. It also keeps the
-      # byte order mark in the first token's value. This is weird, and I don't
-      # want to mirror that in our parser. So instead, we'll match up the values
-      # here, and then match up the locations as we process the tokens.
-      bom = source.bytes[0..2] == [0xEF, 0xBB, 0xBF]
-      result_value[0][0].value.prepend("\xEF\xBB\xBF") if bom
+      # In previous versions of Ruby, Ripper wouldn't flush the bom before the
+      # first token, so we had to have a hack in place to account for that. This
+      # checks for that behavior.
+      bom_flushed = Ripper.lex("\xEF\xBB\xBF# test")[0][0][1] == 0
+      bom = source.byteslice(0..2) == "\xEF\xBB\xBF"
       result_value.each_with_index do |(token, lex_state), index|
         lineno = token.location.start_line
         column = token.location.start_column
-        column -= index == 0 ? 6 : 3 if bom && lineno == 1
+        # If there's a UTF-8 byte-order mark as the start of the file, then for
+        # certain tokens ripper sets the first token back by 3 bytes. It also
+        # keeps the byte order mark in the first token's value. This is weird,
+        # and I don't want to mirror that in our parser. So instead, we'll match
+        # up the columns and values here.
+        if bom && lineno == 1
+          column -= 3
+          if index == 0 && column == 0 && !bom_flushed
+            flushed =
+              case token.type
+              when :BACK_REFERENCE, :INSTANCE_VARIABLE, :CLASS_VARIABLE,
+                  :GLOBAL_VARIABLE, :NUMBERED_REFERENCE, :PERCENT_LOWER_I,
+                  :PERCENT_LOWER_X, :PERCENT_LOWER_W, :PERCENT_UPPER_I,
+                  :PERCENT_UPPER_W, :STRING_BEGIN
+                true
+              when :REGEXP_BEGIN, :SYMBOL_BEGIN
+                token.value.start_with?("%")
+              else
+                false
+              end
+            unless flushed
+              column -= 3
+              value = token.value
+              value.prepend(String.new("\xEF\xBB\xBF", encoding: value.encoding))
+            end
+          end
+        end
         event = RIPPER.fetch(token.type)
         value = token.value
@@ -580,13 +615,23 @@ module YARP
           when :on___end__
             EndContentToken.new([[lineno, column], event, value, lex_state])
           when :on_comment
-            CommentToken.new([[lineno, column], event, value, lex_state])
+            IgnoreStateToken.new([[lineno, column], event, value, lex_state])
           when :on_heredoc_end
             # Heredoc end tokens can be emitted in an odd order, so we don't
             # want to bother comparing the state on them.
-            HeredocEndToken.new([[lineno, column], event, value, lex_state])
-          when :on_embexpr_end, :on_ident
-            if lex_state == Ripper::EXPR_END | Ripper::EXPR_LABEL
+            IgnoreStateToken.new([[lineno, column], event, value, lex_state])
+          when :on_ident
+            if lex_state == Ripper::EXPR_END
+              # If we have an identifier that follows a method name like:
+              #
+              #     def foo bar
+              #
+              # then Ripper will mark bar as END|LABEL if there is a local in a
+              # parent scope named bar because it hasn't pushed the local table
+              # yet. We do this more accurately, so we need to allow comparing
+              # against both END and END|LABEL.
+              ParamToken.new([[lineno, column], event, value, lex_state])
+            elsif lex_state == Ripper::EXPR_END | Ripper::EXPR_LABEL
               # In the event that we're comparing identifiers, we're going to
               # allow a little divergence. Ripper doesn't account for local
               # variables introduced through named captures in regexes, and we
@@ -595,6 +640,8 @@ module YARP
             else
               Token.new([[lineno, column], event, value, lex_state])
             end
+          when :on_embexpr_end
+            IgnoreStateToken.new([[lineno, column], event, value, lex_state])
           when :on_ignored_nl
             # Ignored newlines can occasionally have a LABEL state attached to
             # them which doesn't actually impact anything. We don't mirror that
@@ -629,6 +676,26 @@ module YARP
                 previous_state
               end
+            Token.new([[lineno, column], event, value, lex_state])
+          when :on_eof
+            previous_token = result_value[index - 1][0]
+            # If we're at the end of the file and the previous token was a
+            # comment and there is still whitespace after the comment, then
+            # Ripper will append a on_nl token (even though there isn't
+            # necessarily a newline). We mirror that here.
+            start_offset = previous_token.location.end_offset
+            end_offset = token.location.start_offset
+            if previous_token.type == :COMMENT && start_offset < end_offset
+              if bom
+                start_offset += 3
+                end_offset += 3
+              end
+              tokens << Token.new([[lineno, 0], :on_nl, source.byteslice(start_offset...end_offset), lex_state])
+            end
             Token.new([[lineno, column], event, value, lex_state])
           else
             Token.new([[lineno, column], event, value, lex_state])
@@ -713,7 +780,8 @@ module YARP
         end
       end
-      tokens.reject! { |t| t.event == :on_eof }
+      # Drop the EOF token from the list
+      tokens = tokens[0...-1]
       # We sort by location to compare against Ripper's output
       tokens.sort_by!(&:location)