RubyGems - prism - Versions diffs - 0.22.0 → 0.23.0 - Mend

prism 0.22.0 → 0.23.0

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (35)

checksums.yaml +4 -4
data/CHANGELOG.md +24 -1
data/README.md +2 -1
data/docs/releasing.md +67 -17
data/docs/ruby_parser_translation.md +19 -0
data/docs/serialization.md +2 -0
data/ext/prism/api_node.c +784 -785
data/ext/prism/extension.c +12 -7
data/ext/prism/extension.h +2 -2
data/include/prism/diagnostic.h +3 -4
data/include/prism/encoding.h +7 -0
data/include/prism/util/pm_constant_pool.h +1 -1
data/include/prism/util/pm_strpbrk.h +4 -1
data/include/prism/version.h +2 -2
data/lib/prism/ffi.rb +1 -1
data/lib/prism/lex_compat.rb +1 -0
data/lib/prism/node_ext.rb +25 -2
data/lib/prism/parse_result.rb +44 -15
data/lib/prism/serialize.rb +12 -6
data/lib/prism/translation/parser.rb +10 -9
data/lib/prism/translation/ripper.rb +577 -0
data/lib/prism/translation/ruby_parser.rb +1521 -0
data/lib/prism/translation.rb +3 -3
data/lib/prism.rb +0 -1
data/prism.gemspec +4 -2
data/src/diagnostic.c +10 -11
data/src/encoding.c +16 -17
data/src/options.c +7 -2
data/src/prism.c +124 -64
data/src/serialize.c +24 -13
data/src/token_type.c +3 -3
data/src/util/pm_constant_pool.c +1 -1
data/src/util/pm_strpbrk.c +122 -14
metadata +6 -4
data/lib/prism/ripper_compat.rb +0 -285

data/src/serialize.c CHANGED Viewed

@@ -1843,6 +1843,17 @@ pm_serialize_node(pm_parser_t *parser, pm_node_t *node, pm_buffer_t *buffer) {
     }
 }
+static void
+pm_serialize_newline_list(pm_newline_list_t *list, pm_buffer_t *buffer) {
+    uint32_t size = pm_sizet_to_u32(list->size);
+    pm_buffer_append_varuint(buffer, size);
+    for (uint32_t i = 0; i < size; i++) {
+        uint32_t offset = pm_sizet_to_u32(list->offsets[i]);
+        pm_buffer_append_varuint(buffer, offset);
+    }
+}
 static void
 pm_serialize_comment(pm_parser_t *parser, pm_comment_t *comment, pm_buffer_t *buffer) {
     // serialize type
@@ -1929,19 +1940,25 @@ pm_serialize_encoding(const pm_encoding_t *encoding, pm_buffer_t *buffer) {
     pm_buffer_append_string(buffer, encoding->name, encoding_length);
 }
-#line 218 "serialize.c.erb"
-/**
- * Serialize the encoding, metadata, nodes, and constant pool.
- */
-void
-pm_serialize_content(pm_parser_t *parser, pm_node_t *node, pm_buffer_t *buffer) {
+static void
+pm_serialize_metadata(pm_parser_t *parser, pm_buffer_t *buffer) {
     pm_serialize_encoding(parser->encoding, buffer);
     pm_buffer_append_varsint(buffer, parser->start_line);
+    pm_serialize_newline_list(&parser->newline_list, buffer);
     pm_serialize_comment_list(parser, &parser->comment_list, buffer);
     pm_serialize_magic_comment_list(parser, &parser->magic_comment_list, buffer);
     pm_serialize_data_loc(parser, buffer);
     pm_serialize_diagnostic_list(parser, &parser->error_list, buffer);
     pm_serialize_diagnostic_list(parser, &parser->warning_list, buffer);
+}
+#line 243 "serialize.c.erb"
+/**
+ * Serialize the metadata, nodes, and constant pool.
+ */
+void
+pm_serialize_content(pm_parser_t *parser, pm_node_t *node, pm_buffer_t *buffer) {
+    pm_serialize_metadata(parser, buffer);
     // Here we're going to leave space for the offset of the constant pool in
     // the buffer.
@@ -2032,13 +2049,7 @@ pm_serialize_lex(pm_buffer_t *buffer, const uint8_t *source, size_t size, const
     // Append 0 to mark end of tokens.
     pm_buffer_append_byte(buffer, 0);
-    pm_serialize_encoding(parser.encoding, buffer);
-    pm_buffer_append_varsint(buffer, parser.start_line);
-    pm_serialize_comment_list(&parser, &parser.comment_list, buffer);
-    pm_serialize_magic_comment_list(&parser, &parser.magic_comment_list, buffer);
-    pm_serialize_data_loc(&parser, buffer);
-    pm_serialize_diagnostic_list(&parser, &parser.error_list, buffer);
-    pm_serialize_diagnostic_list(&parser, &parser.warning_list, buffer);
+    pm_serialize_metadata(&parser, buffer);
     pm_node_destroy(&parser, node);
     pm_parser_free(&parser);

data/src/token_type.c CHANGED Viewed

@@ -469,7 +469,7 @@ pm_token_type_human(pm_token_type_t token_type) {
         case PM_TOKEN_HEREDOC_START:
             return "heredoc beginning";
         case PM_TOKEN_IDENTIFIER:
-            return "local variable or method identifier";
+            return "local variable or method";
         case PM_TOKEN_IGNORED_NEWLINE:
             return "ignored newline";
         case PM_TOKEN_INSTANCE_VARIABLE:
@@ -579,7 +579,7 @@ pm_token_type_human(pm_token_type_t token_type) {
         case PM_TOKEN_LABEL:
             return "label";
         case PM_TOKEN_LABEL_END:
-            return "':'";
+            return "label terminator";
         case PM_TOKEN_LAMBDA_BEGIN:
             return "'{'";
         case PM_TOKEN_LESS:
@@ -681,7 +681,7 @@ pm_token_type_human(pm_token_type_t token_type) {
         case PM_TOKEN_UPLUS:
             return "'+'";
         case PM_TOKEN_USTAR:
-            return "'*'";
+            return "*";
         case PM_TOKEN_USTAR_STAR:
             return "'**'";
         case PM_TOKEN_WORDS_SEP:

data/src/util/pm_constant_pool.c CHANGED Viewed

@@ -186,7 +186,7 @@ pm_constant_pool_id_to_constant(const pm_constant_pool_t *pool, pm_constant_id_t
  * the constant is not found.
  */
 pm_constant_id_t
-pm_constant_pool_find(pm_constant_pool_t *pool, const uint8_t *start, size_t length) {
+pm_constant_pool_find(const pm_constant_pool_t *pool, const uint8_t *start, size_t length) {
     assert(is_power_of_two(pool->capacity));
     const uint32_t mask = pool->capacity - 1;

data/src/util/pm_strpbrk.c CHANGED Viewed

@@ -1,10 +1,18 @@
 #include "prism/util/pm_strpbrk.h"
 /**
- * This is the slow path that does care about the encoding.
+ * Add an invalid multibyte character error to the parser.
+ */
+static inline void
+pm_strpbrk_invalid_multibyte_character(pm_parser_t *parser, const uint8_t *start, const uint8_t *end) {
+    pm_diagnostic_list_append_format(&parser->error_list, start, end, PM_ERR_INVALID_MULTIBYTE_CHARACTER, *start);
+}
+/**
+ * This is the default path.
  */
 static inline const uint8_t *
-pm_strpbrk_multi_byte(const pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum) {
+pm_strpbrk_utf8(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) {
     size_t index = 0;
     while (index < maximum) {
@@ -12,22 +20,39 @@ pm_strpbrk_multi_byte(const pm_parser_t *parser, const uint8_t *source, const ui
             return source + index;
         }
-        size_t width = parser->encoding->char_width(source + index, (ptrdiff_t) (maximum - index));
-        if (width == 0) {
-            return NULL;
-        }
+        if (source[index] < 0x80) {
+            index++;
+        } else {
+            size_t width = pm_encoding_utf_8_char_width(source + index, (ptrdiff_t) (maximum - index));
-        index += width;
+            if (width > 0) {
+                index += width;
+            } else if (!validate) {
+                index++;
+            } else {
+                // At this point we know we have an invalid multibyte character.
+                // We'll walk forward as far as we can until we find the next
+                // valid character so that we don't spam the user with a ton of
+                // the same kind of error.
+                const size_t start = index;
+                do {
+                    index++;
+                } while (index < maximum && pm_encoding_utf_8_char_width(source + index, (ptrdiff_t) (maximum - index)) == 0);
+                pm_strpbrk_invalid_multibyte_character(parser, source + start, source + index);
+            }
+        }
     }
     return NULL;
 }
 /**
- * This is the fast path that does not care about the encoding.
+ * This is the path when the encoding is ASCII-8BIT.
  */
 static inline const uint8_t *
-pm_strpbrk_single_byte(const uint8_t *source, const uint8_t *charset, size_t maximum) {
+pm_strpbrk_ascii_8bit(const uint8_t *source, const uint8_t *charset, size_t maximum) {
     size_t index = 0;
     while (index < maximum) {
@@ -41,6 +66,85 @@ pm_strpbrk_single_byte(const uint8_t *source, const uint8_t *charset, size_t max
     return NULL;
 }
+/**
+ * This is the slow path that does care about the encoding.
+ */
+static inline const uint8_t *
+pm_strpbrk_multi_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) {
+    size_t index = 0;
+    while (index < maximum) {
+        if (strchr((const char *) charset, source[index]) != NULL) {
+            return source + index;
+        }
+        if (source[index] < 0x80) {
+            index++;
+        } else {
+            size_t width = parser->encoding->char_width(source + index, (ptrdiff_t) (maximum - index));
+            if (width > 0) {
+                index += width;
+            } else if (!validate) {
+                index++;
+            } else {
+                // At this point we know we have an invalid multibyte character.
+                // We'll walk forward as far as we can until we find the next
+                // valid character so that we don't spam the user with a ton of
+                // the same kind of error.
+                const size_t start = index;
+                do {
+                    index++;
+                } while (index < maximum && parser->encoding->char_width(source + index, (ptrdiff_t) (maximum - index)) == 0);
+                pm_strpbrk_invalid_multibyte_character(parser, source + start, source + index);
+            }
+        }
+    }
+    return NULL;
+}
+/**
+ * This is the fast path that does not care about the encoding because we know
+ * the encoding only supports single-byte characters.
+ */
+static inline const uint8_t *
+pm_strpbrk_single_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) {
+    size_t index = 0;
+    while (index < maximum) {
+        if (strchr((const char *) charset, source[index]) != NULL) {
+            return source + index;
+        }
+        if (source[index] < 0x80 || !validate) {
+            index++;
+        } else {
+            size_t width = parser->encoding->char_width(source + index, (ptrdiff_t) (maximum - index));
+            if (width > 0) {
+                index += width;
+            } else {
+                // At this point we know we have an invalid multibyte character.
+                // We'll walk forward as far as we can until we find the next
+                // valid character so that we don't spam the user with a ton of
+                // the same kind of error.
+                const size_t start = index;
+                do {
+                    index++;
+                } while (index < maximum && parser->encoding->char_width(source + index, (ptrdiff_t) (maximum - index)) == 0);
+                pm_strpbrk_invalid_multibyte_character(parser, source + start, source + index);
+            }
+        }
+    }
+    return NULL;
+}
 /**
  * Here we have rolled our own version of strpbrk. The standard library strpbrk
  * has undefined behavior when the source string is not null-terminated. We want
@@ -57,16 +161,20 @@ pm_strpbrk_single_byte(const uint8_t *source, const uint8_t *charset, size_t max
  *
  * Finally, we want to support encodings wherein the charset could contain
  * characters that are trailing bytes of multi-byte characters. For example, in
- * Shift-JIS, the backslash character can be a trailing byte. In that case we
+ * Shift_JIS, the backslash character can be a trailing byte. In that case we
  * need to take a slower path and iterate one multi-byte character at a time.
  */
 const uint8_t *
-pm_strpbrk(const pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, ptrdiff_t length) {
+pm_strpbrk(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, ptrdiff_t length, bool validate) {
     if (length <= 0) {
         return NULL;
-    } else if (parser->encoding_changed && parser->encoding->multibyte) {
-        return pm_strpbrk_multi_byte(parser, source, charset, (size_t) length);
+    } else if (!parser->encoding_changed) {
+        return pm_strpbrk_utf8(parser, source, charset, (size_t) length, validate);
+    } else if (parser->encoding == PM_ENCODING_ASCII_8BIT_ENTRY) {
+        return pm_strpbrk_ascii_8bit(source, charset, (size_t) length);
+    } else if (parser->encoding->multibyte) {
+        return pm_strpbrk_multi_byte(parser, source, charset, (size_t) length, validate);
     } else {
-        return pm_strpbrk_single_byte(source, charset, (size_t) length);
+        return pm_strpbrk_single_byte(parser, source, charset, (size_t) length, validate);
     }
 }

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: prism
 version: !ruby/object:Gem::Version
-  version: 0.22.0
+  version: 0.23.0
 platform: ruby
 authors:
 - Shopify
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2024-02-07 00:00:00.000000000 Z
+date: 2024-02-14 00:00:00.000000000 Z
 dependencies: []
 description:
 email:
@@ -40,6 +40,7 @@ files:
 - docs/releasing.md
 - docs/ripper.md
 - docs/ruby_api.md
+- docs/ruby_parser_translation.md
 - docs/serialization.md
 - docs/testing.md
 - ext/prism/api_node.c
@@ -88,13 +89,14 @@ files:
 - lib/prism/parse_result/comments.rb
 - lib/prism/parse_result/newlines.rb
 - lib/prism/pattern.rb
-- lib/prism/ripper_compat.rb
 - lib/prism/serialize.rb
 - lib/prism/translation.rb
 - lib/prism/translation/parser.rb
 - lib/prism/translation/parser/compiler.rb
 - lib/prism/translation/parser/lexer.rb
 - lib/prism/translation/parser/rubocop.rb
+- lib/prism/translation/ripper.rb
+- lib/prism/translation/ruby_parser.rb
 - lib/prism/visitor.rb
 - prism.gemspec
 - rbi/prism.rbi
@@ -144,7 +146,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.6.0.dev
+rubygems_version: 3.4.1
 signing_key:
 specification_version: 4
 summary: Prism Ruby parser

data/lib/prism/ripper_compat.rb DELETED Viewed

@@ -1,285 +0,0 @@
-# frozen_string_literal: true
-require "ripper"
-module Prism
-  # Note: This integration is not finished, and therefore still has many
-  # inconsistencies with Ripper. If you'd like to help out, pull requests would
-  # be greatly appreciated!
-  #
-  # This class is meant to provide a compatibility layer between prism and
-  # Ripper. It functions by parsing the entire tree first and then walking it
-  # and executing each of the Ripper callbacks as it goes.
-  #
-  # This class is going to necessarily be slower than the native Ripper API. It
-  # is meant as a stopgap until developers migrate to using prism. It is also
-  # meant as a test harness for the prism parser.
-  #
-  # To use this class, you treat `Prism::RipperCompat` effectively as you would
-  # treat the `Ripper` class.
-  class RipperCompat < Visitor
-    # This class mirrors the ::Ripper::SexpBuilder subclass of ::Ripper that
-    # returns the arrays of [type, *children].
-    class SexpBuilder < RipperCompat
-      private
-      Ripper::PARSER_EVENTS.each do |event|
-        define_method(:"on_#{event}") do |*args|
-          [event, *args]
-        end
-      end
-      Ripper::SCANNER_EVENTS.each do |event|
-        define_method(:"on_#{event}") do |value|
-          [:"@#{event}", value, [lineno, column]]
-        end
-      end
-    end
-    # This class mirrors the ::Ripper::SexpBuilderPP subclass of ::Ripper that
-    # returns the same values as ::Ripper::SexpBuilder except with a couple of
-    # niceties that flatten linked lists into arrays.
-    class SexpBuilderPP < SexpBuilder
-      private
-      def _dispatch_event_new # :nodoc:
-        []
-      end
-      def _dispatch_event_push(list, item) # :nodoc:
-        list << item
-        list
-      end
-      Ripper::PARSER_EVENT_TABLE.each do |event, arity|
-        case event
-        when /_new\z/
-          alias_method :"on_#{event}", :_dispatch_event_new if arity == 0
-        when /_add\z/
-          alias_method :"on_#{event}", :_dispatch_event_push
-        end
-      end
-    end
-    # The source that is being parsed.
-    attr_reader :source
-    # The current line number of the parser.
-    attr_reader :lineno
-    # The current column number of the parser.
-    attr_reader :column
-    # Create a new RipperCompat object with the given source.
-    def initialize(source)
-      @source = source
-      @result = nil
-      @lineno = nil
-      @column = nil
-    end
-    ############################################################################
-    # Public interface
-    ############################################################################
-    # True if the parser encountered an error during parsing.
-    def error?
-      result.failure?
-    end
-    # Parse the source and return the result.
-    def parse
-      result.magic_comments.each do |magic_comment|
-        on_magic_comment(magic_comment.key, magic_comment.value)
-      end
-      if error?
-        result.errors.each do |error|
-          on_parse_error(error.message)
-        end
-        nil
-      else
-        result.value.accept(self)
-      end
-    end
-    ############################################################################
-    # Visitor methods
-    ############################################################################
-    # Visit an ArrayNode node.
-    def visit_array_node(node)
-      elements = visit_elements(node.elements) unless node.elements.empty?
-      bounds(node.location)
-      on_array(elements)
-    end
-    # Visit a CallNode node.
-    def visit_call_node(node)
-      if node.variable_call?
-        if node.message.match?(/^[[:alpha:]_]/)
-          bounds(node.message_loc)
-          return on_vcall(on_ident(node.message))
-        end
-        raise NotImplementedError, "Non-alpha variable call"
-      end
-      if node.opening_loc.nil?
-        left = visit(node.receiver)
-        if node.arguments&.arguments&.length == 1
-          right = visit(node.arguments.arguments.first)
-          on_binary(left, node.name, right)
-        elsif !node.arguments || node.arguments.empty?
-          on_unary(node.name, left)
-        else
-          raise NotImplementedError, "More than two arguments for operator"
-        end
-      else
-        raise NotImplementedError, "Non-nil opening_loc"
-      end
-    end
-    # Visit a FloatNode node.
-    def visit_float_node(node)
-      visit_number(node) { |text| on_float(text) }
-    end
-    # Visit a ImaginaryNode node.
-    def visit_imaginary_node(node)
-      visit_number(node) { |text| on_imaginary(text) }
-    end
-    # Visit an IntegerNode node.
-    def visit_integer_node(node)
-      visit_number(node) { |text| on_int(text) }
-    end
-    # Visit a ParenthesesNode node.
-    def visit_parentheses_node(node)
-      body =
-        if node.body.nil?
-          on_stmts_add(on_stmts_new, on_void_stmt)
-        else
-          visit(node.body)
-        end
-      bounds(node.location)
-      on_paren(body)
-    end
-    # Visit a ProgramNode node.
-    def visit_program_node(node)
-      statements = visit(node.statements)
-      bounds(node.location)
-      on_program(statements)
-    end
-    # Visit a RangeNode node.
-    def visit_range_node(node)
-      left = visit(node.left)
-      right = visit(node.right)
-      bounds(node.location)
-      if node.exclude_end?
-        on_dot3(left, right)
-      else
-        on_dot2(left, right)
-      end
-    end
-    # Visit a RationalNode node.
-    def visit_rational_node(node)
-      visit_number(node) { |text| on_rational(text) }
-    end
-    # Visit a StatementsNode node.
-    def visit_statements_node(node)
-      bounds(node.location)
-      node.body.inject(on_stmts_new) do |stmts, stmt|
-        on_stmts_add(stmts, visit(stmt))
-      end
-    end
-    ############################################################################
-    # Entrypoints for subclasses
-    ############################################################################
-    # This is a convenience method that runs the SexpBuilder subclass parser.
-    def self.sexp_raw(source)
-      SexpBuilder.new(source).parse
-    end
-    # This is a convenience method that runs the SexpBuilderPP subclass parser.
-    def self.sexp(source)
-      SexpBuilderPP.new(source).parse
-    end
-    private
-    # Visit a list of elements, like the elements of an array or arguments.
-    def visit_elements(elements)
-      bounds(elements.first.location)
-      elements.inject(on_args_new) do |args, element|
-        on_args_add(args, visit(element))
-      end
-    end
-    # Visit a node that represents a number. We need to explicitly handle the
-    # unary - operator.
-    def visit_number(node)
-      slice = node.slice
-      location = node.location
-      if slice[0] == "-"
-        bounds_values(location.start_line, location.start_column + 1)
-        value = yield slice[1..-1]
-        bounds(node.location)
-        on_unary(RUBY_ENGINE == "jruby" ? :- : :-@, value)
-      else
-        bounds(location)
-        yield slice
-      end
-    end
-    # This method is responsible for updating lineno and column information
-    # to reflect the current node.
-    #
-    # This method could be drastically improved with some caching on the start
-    # of every line, but for now it's good enough.
-    def bounds(location)
-      @lineno = location.start_line
-      @column = location.start_column
-    end
-    # If we need to do something unusual, we can directly update the line number
-    # and column to reflect the current node.
-    def bounds_values(lineno, column)
-      @lineno = lineno
-      @column = column
-    end
-    # Lazily initialize the parse result.
-    def result
-      @result ||= Prism.parse(source)
-    end
-    def _dispatch0; end # :nodoc:
-    def _dispatch1(_); end # :nodoc:
-    def _dispatch2(_, _); end # :nodoc:
-    def _dispatch3(_, _, _); end # :nodoc:
-    def _dispatch4(_, _, _, _); end # :nodoc:
-    def _dispatch5(_, _, _, _, _); end # :nodoc:
-    def _dispatch7(_, _, _, _, _, _, _); end # :nodoc:
-    alias_method :on_parse_error, :_dispatch1
-    alias_method :on_magic_comment, :_dispatch2
-    (Ripper::SCANNER_EVENT_TABLE.merge(Ripper::PARSER_EVENT_TABLE)).each do |event, arity|
-      alias_method :"on_#{event}", :"_dispatch#{arity}"
-    end
-  end
-end