prism 1.3.0 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +46 -1
  3. data/Makefile +2 -1
  4. data/README.md +1 -0
  5. data/config.yml +273 -37
  6. data/docs/parser_translation.md +8 -23
  7. data/docs/releasing.md +1 -1
  8. data/docs/ripper_translation.md +1 -1
  9. data/docs/ruby_api.md +1 -1
  10. data/ext/prism/api_node.c +1816 -1303
  11. data/ext/prism/extension.c +244 -110
  12. data/ext/prism/extension.h +4 -4
  13. data/include/prism/ast.h +291 -49
  14. data/include/prism/defines.h +4 -1
  15. data/include/prism/diagnostic.h +4 -0
  16. data/include/prism/options.h +89 -3
  17. data/include/prism/regexp.h +2 -2
  18. data/include/prism/util/pm_buffer.h +18 -0
  19. data/include/prism/util/pm_integer.h +4 -0
  20. data/include/prism/util/pm_list.h +6 -0
  21. data/include/prism/util/pm_string.h +12 -2
  22. data/include/prism/version.h +2 -2
  23. data/include/prism.h +41 -16
  24. data/lib/prism/compiler.rb +456 -151
  25. data/lib/prism/desugar_compiler.rb +1 -0
  26. data/lib/prism/dispatcher.rb +16 -0
  27. data/lib/prism/dot_visitor.rb +21 -1
  28. data/lib/prism/dsl.rb +13 -2
  29. data/lib/prism/ffi.rb +62 -34
  30. data/lib/prism/inspect_visitor.rb +5 -1
  31. data/lib/prism/lex_compat.rb +1 -0
  32. data/lib/prism/mutation_compiler.rb +3 -0
  33. data/lib/prism/node.rb +554 -345
  34. data/lib/prism/node_ext.rb +4 -1
  35. data/lib/prism/pack.rb +2 -0
  36. data/lib/prism/parse_result/comments.rb +1 -0
  37. data/lib/prism/parse_result/errors.rb +1 -0
  38. data/lib/prism/parse_result/newlines.rb +2 -1
  39. data/lib/prism/parse_result.rb +53 -0
  40. data/lib/prism/pattern.rb +1 -0
  41. data/lib/prism/polyfill/append_as_bytes.rb +15 -0
  42. data/lib/prism/polyfill/scan_byte.rb +14 -0
  43. data/lib/prism/polyfill/warn.rb +42 -0
  44. data/lib/prism/reflection.rb +5 -2
  45. data/lib/prism/relocation.rb +1 -0
  46. data/lib/prism/serialize.rb +1275 -783
  47. data/lib/prism/string_query.rb +1 -0
  48. data/lib/prism/translation/parser/builder.rb +62 -0
  49. data/lib/prism/translation/parser/compiler.rb +230 -152
  50. data/lib/prism/translation/parser/lexer.rb +446 -64
  51. data/lib/prism/translation/parser.rb +64 -4
  52. data/lib/prism/translation/parser33.rb +1 -0
  53. data/lib/prism/translation/parser34.rb +1 -0
  54. data/lib/prism/translation/parser35.rb +13 -0
  55. data/lib/prism/translation/parser_current.rb +24 -0
  56. data/lib/prism/translation/ripper/sexp.rb +1 -0
  57. data/lib/prism/translation/ripper.rb +30 -4
  58. data/lib/prism/translation/ruby_parser.rb +291 -7
  59. data/lib/prism/translation.rb +3 -0
  60. data/lib/prism/visitor.rb +457 -152
  61. data/lib/prism.rb +5 -3
  62. data/prism.gemspec +9 -1
  63. data/rbi/prism/dsl.rbi +9 -6
  64. data/rbi/prism/node.rbi +43 -16
  65. data/rbi/prism/parse_result.rbi +17 -0
  66. data/rbi/prism/translation/parser35.rbi +6 -0
  67. data/rbi/prism.rbi +39 -36
  68. data/sig/prism/dispatcher.rbs +3 -0
  69. data/sig/prism/dsl.rbs +7 -5
  70. data/sig/prism/node.rbs +461 -37
  71. data/sig/prism/node_ext.rbs +84 -17
  72. data/sig/prism/parse_result/comments.rbs +38 -0
  73. data/sig/prism/parse_result.rbs +14 -0
  74. data/sig/prism/reflection.rbs +1 -1
  75. data/sig/prism/serialize.rbs +4 -2
  76. data/sig/prism.rbs +22 -1
  77. data/src/diagnostic.c +9 -3
  78. data/src/node.c +23 -0
  79. data/src/options.c +33 -2
  80. data/src/prettyprint.c +32 -0
  81. data/src/prism.c +620 -242
  82. data/src/serialize.c +8 -0
  83. data/src/token_type.c +36 -34
  84. data/src/util/pm_buffer.c +40 -0
  85. data/src/util/pm_constant_pool.c +6 -2
  86. data/src/util/pm_strncasecmp.c +13 -1
  87. metadata +11 -7
@@ -1,4 +1,9 @@
1
1
  # frozen_string_literal: true
2
+ # :markup: markdown
3
+
4
+ require "strscan"
5
+ require_relative "../../polyfill/append_as_bytes"
6
+ require_relative "../../polyfill/scan_byte"
2
7
 
3
8
  module Prism
4
9
  module Translation
@@ -6,16 +11,17 @@ module Prism
6
11
  # Accepts a list of prism tokens and converts them into the expected
7
12
  # format for the parser gem.
8
13
  class Lexer
14
+ # These tokens are always skipped
15
+ TYPES_ALWAYS_SKIP = Set.new(%i[IGNORED_NEWLINE __END__ EOF])
16
+ private_constant :TYPES_ALWAYS_SKIP
17
+
9
18
  # The direct translating of types between the two lexers.
10
19
  TYPES = {
11
20
  # These tokens should never appear in the output of the lexer.
12
- EOF: nil,
13
21
  MISSING: nil,
14
22
  NOT_PROVIDED: nil,
15
- IGNORED_NEWLINE: nil,
16
23
  EMBDOC_END: nil,
17
24
  EMBDOC_LINE: nil,
18
- __END__: nil,
19
25
 
20
26
  # These tokens have more or less direct mappings.
21
27
  AMPERSAND: :tAMPER2,
@@ -191,16 +197,24 @@ module Prism
191
197
  #
192
198
  # NOTE: In edge cases like `-> (foo = -> (bar) {}) do end`, please note that `kDO` is still returned
193
199
  # instead of `kDO_LAMBDA`, which is expected: https://github.com/ruby/prism/pull/3046
194
- LAMBDA_TOKEN_TYPES = [:kDO_LAMBDA, :tLAMBDA, :tLAMBEG]
200
+ LAMBDA_TOKEN_TYPES = Set.new([:kDO_LAMBDA, :tLAMBDA, :tLAMBEG])
195
201
 
196
202
  # The `PARENTHESIS_LEFT` token in Prism is classified as either `tLPAREN` or `tLPAREN2` in the Parser gem.
197
203
  # The following token types are listed as those classified as `tLPAREN`.
198
- LPAREN_CONVERSION_TOKEN_TYPES = [
199
- :kBREAK, :kCASE, :tDIVIDE, :kFOR, :kIF, :kNEXT, :kRETURN, :kUNTIL, :kWHILE, :tAMPER, :tANDOP, :tBANG, :tCOMMA, :tDOT2, :tDOT3,
200
- :tEQL, :tLPAREN, :tLPAREN2, :tLSHFT, :tNL, :tOP_ASGN, :tOROP, :tPIPE, :tSEMI, :tSTRING_DBEG, :tUMINUS, :tUPLUS
201
- ]
204
+ LPAREN_CONVERSION_TOKEN_TYPES = Set.new([
205
+ :kBREAK, :tCARET, :kCASE, :tDIVIDE, :kFOR, :kIF, :kNEXT, :kRETURN, :kUNTIL, :kWHILE, :tAMPER, :tANDOP, :tBANG, :tCOMMA, :tDOT2, :tDOT3,
206
+ :tEQL, :tLPAREN, :tLPAREN2, :tLPAREN_ARG, :tLSHFT, :tNL, :tOP_ASGN, :tOROP, :tPIPE, :tSEMI, :tSTRING_DBEG, :tUMINUS, :tUPLUS, :tLCURLY
207
+ ])
208
+
209
+ # Types of tokens that are allowed to continue a method call with comments in-between.
210
+ # For these, the parser gem doesn't emit a newline token after the last comment.
211
+ COMMENT_CONTINUATION_TYPES = Set.new([:COMMENT, :AMPERSAND_DOT, :DOT])
212
+ private_constant :COMMENT_CONTINUATION_TYPES
213
+
214
+ # Heredocs are complex and require us to keep track of a bit of info to refer to later
215
+ HeredocData = Struct.new(:identifier, :common_whitespace, keyword_init: true)
202
216
 
203
- private_constant :TYPES, :EXPR_BEG, :EXPR_LABEL, :LAMBDA_TOKEN_TYPES, :LPAREN_CONVERSION_TOKEN_TYPES
217
+ private_constant :TYPES, :EXPR_BEG, :EXPR_LABEL, :LAMBDA_TOKEN_TYPES, :LPAREN_CONVERSION_TOKEN_TYPES, :HeredocData
204
218
 
205
219
  # The Parser::Source::Buffer that the tokens were lexed from.
206
220
  attr_reader :source_buffer
@@ -230,46 +244,78 @@ module Prism
230
244
  index = 0
231
245
  length = lexed.length
232
246
 
233
- heredoc_identifier_stack = []
247
+ heredoc_stack = []
248
+ quote_stack = []
249
+
250
+ # The parser gem emits the newline tokens for comments out of order. This saves
251
+ # that token location to emit at a later time to properly line everything up.
252
+ # https://github.com/whitequark/parser/issues/1025
253
+ comment_newline_location = nil
234
254
 
235
255
  while index < length
236
256
  token, state = lexed[index]
237
257
  index += 1
238
- next if %i[IGNORED_NEWLINE __END__ EOF].include?(token.type)
258
+ next if TYPES_ALWAYS_SKIP.include?(token.type)
239
259
 
240
260
  type = TYPES.fetch(token.type)
241
261
  value = token.value
242
- location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.end_offset])
262
+ location = range(token.location.start_offset, token.location.end_offset)
243
263
 
244
264
  case type
245
265
  when :kDO
246
- types = tokens.map(&:first)
247
- nearest_lambda_token_type = types.reverse.find { |type| LAMBDA_TOKEN_TYPES.include?(type) }
266
+ nearest_lambda_token = tokens.reverse_each.find do |token|
267
+ LAMBDA_TOKEN_TYPES.include?(token.first)
268
+ end
248
269
 
249
- if nearest_lambda_token_type == :tLAMBDA
270
+ if nearest_lambda_token&.first == :tLAMBDA
250
271
  type = :kDO_LAMBDA
251
272
  end
252
273
  when :tCHARACTER
253
274
  value.delete_prefix!("?")
275
+ # Character literals behave similar to double-quoted strings. We can use the same escaping mechanism.
276
+ value = unescape_string(value, "?")
254
277
  when :tCOMMENT
255
278
  if token.type == :EMBDOC_BEGIN
256
- start_index = index
257
279
 
258
- while !((next_token = lexed[index][0]) && next_token.type == :EMBDOC_END) && (index < length - 1)
280
+ while !((next_token = lexed[index]&.first) && next_token.type == :EMBDOC_END) && (index < length - 1)
259
281
  value += next_token.value
260
282
  index += 1
261
283
  end
262
284
 
263
- if start_index != index
264
- value += next_token.value
265
- location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[lexed[index][0].location.end_offset])
266
- index += 1
267
- end
285
+ value += next_token.value
286
+ location = range(token.location.start_offset, next_token.location.end_offset)
287
+ index += 1
268
288
  else
269
- value.chomp!
270
- location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.end_offset - 1])
289
+ is_at_eol = value.chomp!.nil?
290
+ location = range(token.location.start_offset, token.location.end_offset + (is_at_eol ? 0 : -1))
291
+
292
+ prev_token, _ = lexed[index - 2] if index - 2 >= 0
293
+ next_token, _ = lexed[index]
294
+
295
+ is_inline_comment = prev_token&.location&.start_line == token.location.start_line
296
+ if is_inline_comment && !is_at_eol && !COMMENT_CONTINUATION_TYPES.include?(next_token&.type)
297
+ tokens << [:tCOMMENT, [value, location]]
298
+
299
+ nl_location = range(token.location.end_offset - 1, token.location.end_offset)
300
+ tokens << [:tNL, [nil, nl_location]]
301
+ next
302
+ elsif is_inline_comment && next_token&.type == :COMMENT
303
+ comment_newline_location = range(token.location.end_offset - 1, token.location.end_offset)
304
+ elsif comment_newline_location && !COMMENT_CONTINUATION_TYPES.include?(next_token&.type)
305
+ tokens << [:tCOMMENT, [value, location]]
306
+ tokens << [:tNL, [nil, comment_newline_location]]
307
+ comment_newline_location = nil
308
+ next
309
+ end
271
310
  end
272
311
  when :tNL
312
+ next_token, _ = lexed[index]
313
+ # Newlines after comments are emitted out of order.
314
+ if next_token&.type == :COMMENT
315
+ comment_newline_location = location
316
+ next
317
+ end
318
+
273
319
  value = nil
274
320
  when :tFLOAT
275
321
  value = parse_float(value)
@@ -277,8 +323,8 @@ module Prism
277
323
  value = parse_complex(value)
278
324
  when :tINTEGER
279
325
  if value.start_with?("+")
280
- tokens << [:tUNARY_NUM, ["+", Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.start_offset + 1])]]
281
- location = Range.new(source_buffer, offset_cache[token.location.start_offset + 1], offset_cache[token.location.end_offset])
326
+ tokens << [:tUNARY_NUM, ["+", range(token.location.start_offset, token.location.start_offset + 1)]]
327
+ location = range(token.location.start_offset + 1, token.location.end_offset)
282
328
  end
283
329
 
284
330
  value = parse_integer(value)
@@ -297,92 +343,196 @@ module Prism
297
343
  when :tRATIONAL
298
344
  value = parse_rational(value)
299
345
  when :tSPACE
346
+ location = range(token.location.start_offset, token.location.start_offset + percent_array_leading_whitespace(value))
300
347
  value = nil
301
348
  when :tSTRING_BEG
302
- if token.type == :HEREDOC_START
303
- heredoc_identifier_stack.push(value.match(/<<[-~]?["'`]?(?<heredoc_identifier>.*?)["'`]?\z/)[:heredoc_identifier])
304
- end
305
- if ["\"", "'"].include?(value) && (next_token = lexed[index][0]) && next_token.type == :STRING_END
349
+ next_token, _ = lexed[index]
350
+ next_next_token, _ = lexed[index + 1]
351
+ basic_quotes = value == '"' || value == "'"
352
+
353
+ if basic_quotes && next_token&.type == :STRING_END
306
354
  next_location = token.location.join(next_token.location)
307
355
  type = :tSTRING
308
356
  value = ""
309
- location = Range.new(source_buffer, offset_cache[next_location.start_offset], offset_cache[next_location.end_offset])
357
+ location = range(next_location.start_offset, next_location.end_offset)
310
358
  index += 1
311
- elsif ["\"", "'"].include?(value) && (next_token = lexed[index][0]) && next_token.type == :STRING_CONTENT && next_token.value.lines.count <= 1 && (next_next_token = lexed[index + 1][0]) && next_next_token.type == :STRING_END
312
- next_location = token.location.join(next_next_token.location)
313
- type = :tSTRING
314
- value = next_token.value.gsub("\\\\", "\\")
315
- location = Range.new(source_buffer, offset_cache[next_location.start_offset], offset_cache[next_location.end_offset])
316
- index += 2
317
- elsif value.start_with?("<<")
359
+ elsif value.start_with?("'", '"', "%")
360
+ if next_token&.type == :STRING_CONTENT && next_next_token&.type == :STRING_END
361
+ string_value = next_token.value
362
+ if simplify_string?(string_value, value)
363
+ next_location = token.location.join(next_next_token.location)
364
+ if percent_array?(value)
365
+ value = percent_array_unescape(string_value)
366
+ else
367
+ value = unescape_string(string_value, value)
368
+ end
369
+ type = :tSTRING
370
+ location = range(next_location.start_offset, next_location.end_offset)
371
+ index += 2
372
+ tokens << [type, [value, location]]
373
+
374
+ next
375
+ end
376
+ end
377
+
378
+ quote_stack.push(value)
379
+ elsif token.type == :HEREDOC_START
318
380
  quote = value[2] == "-" || value[2] == "~" ? value[3] : value[2]
381
+ heredoc_type = value[2] == "-" || value[2] == "~" ? value[2] : ""
382
+ heredoc = HeredocData.new(
383
+ identifier: value.match(/<<[-~]?["'`]?(?<heredoc_identifier>.*?)["'`]?\z/)[:heredoc_identifier],
384
+ common_whitespace: 0,
385
+ )
386
+
319
387
  if quote == "`"
320
388
  type = :tXSTRING_BEG
321
- value = "<<`"
389
+ end
390
+
391
+ # The parser gem trims whitespace from squiggly heredocs. We must record
392
+ # the most common whitespace to later remove.
393
+ if heredoc_type == "~" || heredoc_type == "`"
394
+ heredoc.common_whitespace = calculate_heredoc_whitespace(index)
395
+ end
396
+
397
+ if quote == "'" || quote == '"' || quote == "`"
398
+ value = "<<#{quote}"
322
399
  else
323
- value = "<<#{quote == "'" || quote == "\"" ? quote : "\""}"
400
+ value = '<<"'
324
401
  end
402
+
403
+ heredoc_stack.push(heredoc)
404
+ quote_stack.push(value)
325
405
  end
326
406
  when :tSTRING_CONTENT
327
- unless (lines = token.value.lines).one?
328
- start_offset = offset_cache[token.location.start_offset]
329
- lines.map do |line|
330
- newline = line.end_with?("\r\n") ? "\r\n" : "\n"
407
+ is_percent_array = percent_array?(quote_stack.last)
408
+
409
+ if (lines = token.value.lines).one?
410
+ # Prism usually emits a single token for strings with line continuations.
411
+ # For squiggly heredocs they are not joined so we do that manually here.
412
+ current_string = +""
413
+ current_length = 0
414
+ start_offset = token.location.start_offset
415
+ while token.type == :STRING_CONTENT
416
+ current_length += token.value.bytesize
417
+ # Heredoc interpolation can have multiple STRING_CONTENT nodes on the same line.
418
+ prev_token, _ = lexed[index - 2] if index - 2 >= 0
419
+ is_first_token_on_line = prev_token && token.location.start_line != prev_token.location.start_line
420
+ # The parser gem only removes indentation when the heredoc is not nested
421
+ not_nested = heredoc_stack.size == 1
422
+ if is_percent_array
423
+ value = percent_array_unescape(token.value)
424
+ elsif is_first_token_on_line && not_nested && (current_heredoc = heredoc_stack.last).common_whitespace > 0
425
+ value = trim_heredoc_whitespace(token.value, current_heredoc)
426
+ end
427
+
428
+ current_string << unescape_string(value, quote_stack.last)
429
+ relevant_backslash_count = if quote_stack.last.start_with?("%W", "%I")
430
+ 0 # the last backslash escapes the newline
431
+ else
432
+ token.value[/(\\{1,})\n/, 1]&.length || 0
433
+ end
434
+ if relevant_backslash_count.even? || !interpolation?(quote_stack.last)
435
+ tokens << [:tSTRING_CONTENT, [current_string, range(start_offset, start_offset + current_length)]]
436
+ break
437
+ end
438
+ token, _ = lexed[index]
439
+ index += 1
440
+ end
441
+ else
442
+ # When the parser gem encounters a line continuation inside of a multiline string,
443
+ # it emits a single string node. The backslash (and remaining newline) is removed.
444
+ current_line = +""
445
+ adjustment = 0
446
+ start_offset = token.location.start_offset
447
+ emit = false
448
+
449
+ lines.each.with_index do |line, index|
331
450
  chomped_line = line.chomp
332
- if match = chomped_line.match(/(?<backslashes>\\+)\z/)
333
- adjustment = match[:backslashes].size / 2
334
- adjusted_line = chomped_line.delete_suffix("\\" * adjustment)
335
- if match[:backslashes].size.odd?
336
- adjusted_line.delete_suffix!("\\")
337
- adjustment += 2
451
+ backslash_count = chomped_line[/\\{1,}\z/]&.length || 0
452
+ is_interpolation = interpolation?(quote_stack.last)
453
+
454
+ if backslash_count.odd? && (is_interpolation || is_percent_array)
455
+ if is_percent_array
456
+ current_line << percent_array_unescape(line)
457
+ adjustment += 1
338
458
  else
339
- adjusted_line << newline
459
+ chomped_line.delete_suffix!("\\")
460
+ current_line << chomped_line
461
+ adjustment += 2
340
462
  end
463
+ # If the string ends with a line continuation emit the remainder
464
+ emit = index == lines.count - 1
341
465
  else
342
- adjusted_line = line
343
- adjustment = 0
466
+ current_line << line
467
+ emit = true
344
468
  end
345
469
 
346
- end_offset = start_offset + adjusted_line.length + adjustment
347
- tokens << [:tSTRING_CONTENT, [adjusted_line, Range.new(source_buffer, offset_cache[start_offset], offset_cache[end_offset])]]
348
- start_offset = end_offset
470
+ if emit
471
+ end_offset = start_offset + current_line.bytesize + adjustment
472
+ tokens << [:tSTRING_CONTENT, [unescape_string(current_line, quote_stack.last), range(start_offset, end_offset)]]
473
+ start_offset = end_offset
474
+ current_line = +""
475
+ adjustment = 0
476
+ end
349
477
  end
350
- next
351
478
  end
479
+ next
352
480
  when :tSTRING_DVAR
353
481
  value = nil
354
482
  when :tSTRING_END
355
483
  if token.type == :HEREDOC_END && value.end_with?("\n")
356
484
  newline_length = value.end_with?("\r\n") ? 2 : 1
357
- value = heredoc_identifier_stack.pop
358
- location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.end_offset - newline_length])
485
+ value = heredoc_stack.pop.identifier
486
+ location = range(token.location.start_offset, token.location.end_offset - newline_length)
359
487
  elsif token.type == :REGEXP_END
360
488
  value = value[0]
361
- location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.start_offset + 1])
489
+ location = range(token.location.start_offset, token.location.start_offset + 1)
490
+ end
491
+
492
+ if percent_array?(quote_stack.pop)
493
+ prev_token, _ = lexed[index - 2] if index - 2 >= 0
494
+ empty = %i[PERCENT_LOWER_I PERCENT_LOWER_W PERCENT_UPPER_I PERCENT_UPPER_W].include?(prev_token&.type)
495
+ ends_with_whitespace = prev_token&.type == :WORDS_SEP
496
+ # parser always emits a space token after content in a percent array, even if no actual whitespace is present.
497
+ if !empty && !ends_with_whitespace
498
+ tokens << [:tSPACE, [nil, range(token.location.start_offset, token.location.start_offset)]]
499
+ end
362
500
  end
363
501
  when :tSYMBEG
364
- if (next_token = lexed[index][0]) && next_token.type != :STRING_CONTENT && next_token.type != :EMBEXPR_BEGIN && next_token.type != :EMBVAR && next_token.type != :STRING_END
502
+ if (next_token = lexed[index]&.first) && next_token.type != :STRING_CONTENT && next_token.type != :EMBEXPR_BEGIN && next_token.type != :EMBVAR && next_token.type != :STRING_END
365
503
  next_location = token.location.join(next_token.location)
366
504
  type = :tSYMBOL
367
505
  value = next_token.value
368
506
  value = { "~@" => "~", "!@" => "!" }.fetch(value, value)
369
- location = Range.new(source_buffer, offset_cache[next_location.start_offset], offset_cache[next_location.end_offset])
507
+ location = range(next_location.start_offset, next_location.end_offset)
370
508
  index += 1
509
+ else
510
+ quote_stack.push(value)
371
511
  end
372
512
  when :tFID
373
513
  if !tokens.empty? && tokens.dig(-1, 0) == :kDEF
374
514
  type = :tIDENTIFIER
375
515
  end
376
516
  when :tXSTRING_BEG
377
- if (next_token = lexed[index][0]) && next_token.type != :STRING_CONTENT && next_token.type != :STRING_END
517
+ if (next_token = lexed[index]&.first) && !%i[STRING_CONTENT STRING_END EMBEXPR_BEGIN].include?(next_token.type)
518
+ # self.`()
378
519
  type = :tBACK_REF2
379
520
  end
521
+ quote_stack.push(value)
522
+ when :tSYMBOLS_BEG, :tQSYMBOLS_BEG, :tWORDS_BEG, :tQWORDS_BEG
523
+ if (next_token = lexed[index]&.first) && next_token.type == :WORDS_SEP
524
+ index += 1
525
+ end
526
+
527
+ quote_stack.push(value)
528
+ when :tREGEXP_BEG
529
+ quote_stack.push(value)
380
530
  end
381
531
 
382
532
  tokens << [type, [value, location]]
383
533
 
384
534
  if token.type == :REGEXP_END
385
- tokens << [:tREGEXP_OPT, [token.value[1..], Range.new(source_buffer, offset_cache[token.location.start_offset + 1], offset_cache[token.location.end_offset])]]
535
+ tokens << [:tREGEXP_OPT, [token.value[1..], range(token.location.start_offset + 1, token.location.end_offset)]]
386
536
  end
387
537
  end
388
538
 
@@ -391,6 +541,11 @@ module Prism
391
541
 
392
542
  private
393
543
 
544
+ # Creates a new parser range, taking prisms byte offsets into account
545
+ def range(start_offset, end_offset)
546
+ Range.new(source_buffer, offset_cache[start_offset], offset_cache[end_offset])
547
+ end
548
+
394
549
  # Parse an integer from the string representation.
395
550
  def parse_integer(value)
396
551
  Integer(value)
@@ -432,6 +587,233 @@ module Prism
432
587
  rescue ArgumentError
433
588
  0r
434
589
  end
590
+
591
+ # Wonky heredoc tab/spaces rules.
592
+ # https://github.com/ruby/prism/blob/v1.3.0/src/prism.c#L10548-L10558
593
+ def calculate_heredoc_whitespace(heredoc_token_index)
594
+ next_token_index = heredoc_token_index
595
+ nesting_level = 0
596
+ previous_line = -1
597
+ result = Float::MAX
598
+
599
+ while (next_token = lexed[next_token_index]&.first)
600
+ next_token_index += 1
601
+ next_next_token, _ = lexed[next_token_index]
602
+ first_token_on_line = next_token.location.start_column == 0
603
+
604
+ # String content inside nested heredocs and interpolation is ignored
605
+ if next_token.type == :HEREDOC_START || next_token.type == :EMBEXPR_BEGIN
606
+ # When interpolation is the first token of a line there is no string
607
+ # content to check against. There will be no common whitespace.
608
+ if nesting_level == 0 && first_token_on_line
609
+ result = 0
610
+ end
611
+ nesting_level += 1
612
+ elsif next_token.type == :HEREDOC_END || next_token.type == :EMBEXPR_END
613
+ nesting_level -= 1
614
+ # When we encountered the matching heredoc end, we can exit
615
+ break if nesting_level == -1
616
+ elsif next_token.type == :STRING_CONTENT && nesting_level == 0 && first_token_on_line
617
+ common_whitespace = 0
618
+ next_token.value[/^\s*/].each_char do |char|
619
+ if char == "\t"
620
+ common_whitespace = (common_whitespace / 8 + 1) * 8;
621
+ else
622
+ common_whitespace += 1
623
+ end
624
+ end
625
+
626
+ is_first_token_on_line = next_token.location.start_line != previous_line
627
+ # Whitespace is significant if followed by interpolation
628
+ whitespace_only = common_whitespace == next_token.value.length && next_next_token&.location&.start_line != next_token.location.start_line
629
+ if is_first_token_on_line && !whitespace_only && common_whitespace < result
630
+ result = common_whitespace
631
+ previous_line = next_token.location.start_line
632
+ end
633
+ end
634
+ end
635
+ result
636
+ end
637
+
638
+ # Wonky heredoc tab/spaces rules.
639
+ # https://github.com/ruby/prism/blob/v1.3.0/src/prism.c#L16528-L16545
640
+ def trim_heredoc_whitespace(string, heredoc)
641
+ trimmed_whitespace = 0
642
+ trimmed_characters = 0
643
+ while (string[trimmed_characters] == "\t" || string[trimmed_characters] == " ") && trimmed_whitespace < heredoc.common_whitespace
644
+ if string[trimmed_characters] == "\t"
645
+ trimmed_whitespace = (trimmed_whitespace / 8 + 1) * 8;
646
+ break if trimmed_whitespace > heredoc.common_whitespace
647
+ else
648
+ trimmed_whitespace += 1
649
+ end
650
+ trimmed_characters += 1
651
+ end
652
+
653
+ string[trimmed_characters..]
654
+ end
655
+
656
+ # Escape sequences that have special and should appear unescaped in the resulting string.
657
+ ESCAPES = {
658
+ "a" => "\a", "b" => "\b", "e" => "\e", "f" => "\f",
659
+ "n" => "\n", "r" => "\r", "s" => "\s", "t" => "\t",
660
+ "v" => "\v", "\\" => "\\"
661
+ }.freeze
662
+ private_constant :ESCAPES
663
+
664
+ # When one of these delimiters is encountered, then the other
665
+ # one is allowed to be escaped as well.
666
+ DELIMITER_SYMETRY = { "[" => "]", "(" => ")", "{" => "}", "<" => ">" }.freeze
667
+ private_constant :DELIMITER_SYMETRY
668
+
669
+
670
+ # https://github.com/whitequark/parser/blob/v3.3.6.0/lib/parser/lexer-strings.rl#L14
671
+ REGEXP_META_CHARACTERS = ["\\", "$", "(", ")", "*", "+", ".", "<", ">", "?", "[", "]", "^", "{", "|", "}"]
672
+ private_constant :REGEXP_META_CHARACTERS
673
+
674
+ # Apply Ruby string escaping rules
675
+ def unescape_string(string, quote)
676
+ # In single-quoted heredocs, everything is taken literally.
677
+ return string if quote == "<<'"
678
+
679
+ # OPTIMIZATION: Assume that few strings need escaping to speed up the common case.
680
+ return string unless string.include?("\\")
681
+
682
+ # Enclosing character for the string. `"` for `"foo"`, `{` for `%w{foo}`, etc.
683
+ delimiter = quote[-1]
684
+
685
+ if regexp?(quote)
686
+ # Should be escaped handled to single-quoted heredocs. The only character that is
687
+ # allowed to be escaped is the delimiter, except when that also has special meaning
688
+ # in the regexp. Since all the symetry delimiters have special meaning, they don't need
689
+ # to be considered separately.
690
+ if REGEXP_META_CHARACTERS.include?(delimiter)
691
+ string
692
+ else
693
+ # There can never be an even amount of backslashes. It would be a syntax error.
694
+ string.gsub(/\\(#{Regexp.escape(delimiter)})/, '\1')
695
+ end
696
+ elsif interpolation?(quote)
697
+ # Appending individual escape sequences may force the string out of its intended
698
+ # encoding. Start out with binary and force it back later.
699
+ result = "".b
700
+
701
+ scanner = StringScanner.new(string)
702
+ while (skipped = scanner.skip_until(/\\/))
703
+ # Append what was just skipped over, excluding the found backslash.
704
+ result.append_as_bytes(string.byteslice(scanner.pos - skipped, skipped - 1))
705
+ escape_read(result, scanner, false, false)
706
+ end
707
+
708
+ # Add remaining chars
709
+ result.append_as_bytes(string.byteslice(scanner.pos..))
710
+ result.force_encoding(source_buffer.source.encoding)
711
+ else
712
+ delimiters = Regexp.escape("#{delimiter}#{DELIMITER_SYMETRY[delimiter]}")
713
+ string.gsub(/\\([\\#{delimiters}])/, '\1')
714
+ end
715
+ end
716
+
717
+ # Certain strings are merged into a single string token.
718
+ def simplify_string?(value, quote)
719
+ case quote
720
+ when "'"
721
+ # Only simplify 'foo'
722
+ !value.include?("\n")
723
+ when '"'
724
+ # Simplify when every line ends with a line continuation, or it is the last line
725
+ value.lines.all? do |line|
726
+ !line.end_with?("\n") || line[/(\\*)$/, 1]&.length&.odd?
727
+ end
728
+ else
729
+ # %q and similar are never simplified
730
+ false
731
+ end
732
+ end
733
+
734
+ # Escape a byte value, given the control and meta flags.
735
+ def escape_build(value, control, meta)
736
+ value &= 0x9f if control
737
+ value |= 0x80 if meta
738
+ value
739
+ end
740
+
741
+ # Read an escape out of the string scanner, given the control and meta
742
+ # flags, and push the unescaped value into the result.
743
+ def escape_read(result, scanner, control, meta)
744
+ if scanner.skip("\n")
745
+ # Line continuation
746
+ elsif (value = ESCAPES[scanner.peek(1)])
747
+ # Simple single-character escape sequences like \n
748
+ result.append_as_bytes(value)
749
+ scanner.pos += 1
750
+ elsif (value = scanner.scan(/[0-7]{1,3}/))
751
+ # \nnn
752
+ result.append_as_bytes(escape_build(value.to_i(8), control, meta))
753
+ elsif (value = scanner.scan(/x[0-9a-fA-F]{1,2}/))
754
+ # \xnn
755
+ result.append_as_bytes(escape_build(value[1..].to_i(16), control, meta))
756
+ elsif (value = scanner.scan(/u[0-9a-fA-F]{4}/))
757
+ # \unnnn
758
+ result.append_as_bytes(value[1..].hex.chr(Encoding::UTF_8))
759
+ elsif scanner.skip("u{}")
760
+ # https://github.com/whitequark/parser/issues/856
761
+ elsif (value = scanner.scan(/u{.*?}/))
762
+ # \u{nnnn ...}
763
+ value[2..-2].split.each do |unicode|
764
+ result.append_as_bytes(unicode.hex.chr(Encoding::UTF_8))
765
+ end
766
+ elsif (value = scanner.scan(/c\\?(?=[[:print:]])|C-\\?(?=[[:print:]])/))
767
+ # \cx or \C-x where x is an ASCII printable character
768
+ escape_read(result, scanner, true, meta)
769
+ elsif (value = scanner.scan(/M-\\?(?=[[:print:]])/))
770
+ # \M-x where x is an ASCII printable character
771
+ escape_read(result, scanner, control, true)
772
+ elsif (byte = scanner.scan_byte)
773
+ # Something else after an escape.
774
+ if control && byte == 0x3f # ASCII '?'
775
+ result.append_as_bytes(escape_build(0x7f, false, meta))
776
+ else
777
+ result.append_as_bytes(escape_build(byte, control, meta))
778
+ end
779
+ end
780
+ end
781
+
782
+ # In a percent array, certain whitespace can be preceeded with a backslash,
783
+ # causing the following characters to be part of the previous element.
784
+ def percent_array_unescape(string)
785
+ string.gsub(/(\\)+[ \f\n\r\t\v]/) do |full_match|
786
+ full_match.delete_prefix!("\\") if Regexp.last_match[1].length.odd?
787
+ full_match
788
+ end
789
+ end
790
+
791
+ # For %-arrays whitespace, the parser gem only considers whitespace before the newline.
792
+ def percent_array_leading_whitespace(string)
793
+ return 1 if string.start_with?("\n")
794
+
795
+ leading_whitespace = 0
796
+ string.each_char do |c|
797
+ break if c == "\n"
798
+ leading_whitespace += 1
799
+ end
800
+ leading_whitespace
801
+ end
802
+
803
+ # Determine if characters preceeded by a backslash should be escaped or not
804
+ def interpolation?(quote)
805
+ !quote.end_with?("'") && !quote.start_with?("%q", "%w", "%i", "%s")
806
+ end
807
+
808
+ # Regexp allow interpolation but are handled differently during unescaping
809
+ def regexp?(quote)
810
+ quote == "/" || quote.start_with?("%r")
811
+ end
812
+
813
+ # Determine if the string is part of a %-style array.
814
+ def percent_array?(quote)
815
+ quote.start_with?("%w", "%W", "%i", "%I")
816
+ end
435
817
  end
436
818
  end
437
819
  end