RubyGems - ruby_parser - Versions diffs - 3.13.0 → 3.15.0 - Mend

ruby_parser 3.13.0 → 3.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (42) hide show

checksums.yaml +4 -4
checksums.yaml.gz.sig +0 -0
data.tar.gz.sig +0 -0
data/.autotest +18 -29
data/History.rdoc +123 -0
data/Manifest.txt +2 -0
data/README.rdoc +3 -3
data/Rakefile +41 -24
data/bin/ruby_parse +3 -1
data/bin/ruby_parse_extract_error +18 -35
data/compare/normalize.rb +43 -3
data/debugging.md +39 -0
data/lib/rp_extensions.rb +1 -1
data/lib/ruby20_parser.rb +3654 -3466
data/lib/ruby20_parser.y +504 -327
data/lib/ruby21_parser.rb +3643 -3455
data/lib/ruby21_parser.y +512 -334
data/lib/ruby22_parser.rb +3669 -3492
data/lib/ruby22_parser.y +513 -335
data/lib/ruby23_parser.rb +3692 -3499
data/lib/ruby23_parser.y +513 -335
data/lib/ruby24_parser.rb +3685 -3463
data/lib/ruby24_parser.y +517 -331
data/lib/ruby25_parser.rb +3685 -3462
data/lib/ruby25_parser.y +517 -331
data/lib/ruby26_parser.rb +3696 -3471
data/lib/ruby26_parser.y +523 -335
data/lib/ruby27_parser.rb +7224 -0
data/lib/ruby27_parser.y +2657 -0
data/lib/ruby_lexer.rb +611 -495
data/lib/ruby_lexer.rex +27 -28
data/lib/ruby_lexer.rex.rb +71 -31
data/lib/ruby_parser.rb +31 -27
data/lib/ruby_parser.yy +529 -336
data/lib/ruby_parser_extras.rb +720 -449
data/test/test_ruby_lexer.rb +1560 -1412
data/test/test_ruby_parser.rb +2611 -1912
data/test/test_ruby_parser_extras.rb +39 -4
data/tools/munge.rb +12 -6
data/tools/ripper.rb +19 -3
metadata +25 -18
metadata.gz.sig +4 -1

data/lib/ruby_lexer.rb CHANGED

@@ -4,40 +4,9 @@
 $DEBUG = true if ENV["DEBUG"]
 class RubyLexer
   # :stopdoc:
-  HAS_ENC = "".respond_to? :encoding
-  IDENT_CHAR = if HAS_ENC then
-                 /[\w\u0080-\u{10ffff}]/u
-               else
-                 /[\w\x80-\xFF]/n
-               end
   EOF = :eof_haha!
-  # ruby constants for strings (should this be moved somewhere else?)
-  STR_FUNC_BORING = 0x00
-  STR_FUNC_ESCAPE = 0x01 # TODO: remove and replace with REGEXP
-  STR_FUNC_EXPAND = 0x02
-  STR_FUNC_REGEXP = 0x04
-  STR_FUNC_QWORDS = 0x08
-  STR_FUNC_SYMBOL = 0x10
-  STR_FUNC_INDENT = 0x20 # <<-HEREDOC
-  STR_FUNC_ICNTNT = 0x40 # <<~HEREDOC
-  STR_SQUOTE = STR_FUNC_BORING
-  STR_DQUOTE = STR_FUNC_BORING | STR_FUNC_EXPAND
-  STR_XQUOTE = STR_FUNC_BORING | STR_FUNC_EXPAND
-  STR_REGEXP = STR_FUNC_REGEXP | STR_FUNC_ESCAPE | STR_FUNC_EXPAND
-  STR_SSYM   = STR_FUNC_SYMBOL
-  STR_DSYM   = STR_FUNC_SYMBOL | STR_FUNC_EXPAND
-  EXPR_BEG_ANY =  [:expr_beg, :expr_mid,  :expr_class  ]
-  EXPR_ARG_ANY =  [:expr_arg, :expr_cmdarg,            ]
-  EXPR_END_ANY =  [:expr_end, :expr_endarg, :expr_endfn]
   ESCAPES = {
     "a"    => "\007",
     "b"    => "\010",
@@ -54,6 +23,8 @@ class RubyLexer
     "c\?"  => 127.chr,
   }
+  HAS_ENC = "".respond_to? :encoding
   TOKENS = {
     "!"   => :tBANG,
     "!="  => :tNEQ,
@@ -70,13 +41,26 @@ class RubyLexer
     "->"  => :tLAMBDA,
   }
-  TAB_WIDTH = 8
-  @@regexp_cache = Hash.new { |h,k| h[k] = Regexp.new(Regexp.escape(k)) }
+  @@regexp_cache = Hash.new { |h, k| h[k] = Regexp.new(Regexp.escape(k)) }
   @@regexp_cache[nil] = nil
+  if $DEBUG then
+    attr_reader :lex_state
+    def lex_state= o
+      return if @lex_state == o
+      raise ArgumentError, "bad state: %p" % [o] unless State === o
+      warn "lex_state: %p -> %p" % [lex_state, o]
+      @lex_state = o
+    end
+  end
   # :startdoc:
+  attr_accessor :lex_state unless $DEBUG
   attr_accessor :lineno # we're bypassing oedipus' lineno handling.
   attr_accessor :brace_nest
   attr_accessor :cmdarg
@@ -90,7 +74,6 @@ class RubyLexer
   # Additional context surrounding tokens that both the lexer and
   # grammar use.
-  attr_accessor :lex_state
   attr_accessor :lex_strterm
   attr_accessor :lpar_beg
   attr_accessor :paren_nest
@@ -99,24 +82,14 @@ class RubyLexer
   attr_accessor :string_buffer
   attr_accessor :string_nest
-  if $DEBUG then
-    alias lex_state= lex_state=
-    def lex_state=o
-      return if @lex_state == o
-      c = caller.first
-      c = caller[1] if c =~ /\bresult\b/
-      warn "lex_state: %p -> %p from %s" % [@lex_state, o, c.clean_caller]
-      @lex_state = o
-    end
-  end
   # Last token read via next_token.
   attr_accessor :token
   attr_writer :comments
   def initialize _ = nil
-    @lex_state = :expr_none
+    @lex_state = nil # remove one warning under $DEBUG
+    self.lex_state = EXPR_NONE
     self.cond   = RubyParserStuff::StackState.new(:cond, $DEBUG)
     self.cmdarg = RubyParserStuff::StackState.new(:cmdarg, $DEBUG)
@@ -125,17 +98,22 @@ class RubyLexer
   end
   def arg_ambiguous
-    self.warning("Ambiguous first argument. make sure.")
+    self.warning "Ambiguous first argument. make sure."
   end
   def arg_state
-    in_arg_state? ? :expr_arg : :expr_beg
+    is_after_operator? ? EXPR_ARG : EXPR_BEG
   end
   def beginning_of_line?
     ss.bol?
   end
-  alias :bol? :beginning_of_line? # to make .rex file more readable
+  alias bol? beginning_of_line? # to make .rex file more readable
+  def check re
+    ss.check re
+  end
   def comments # TODO: remove this... maybe comment_string + attr_accessor
     c = @comments.join
@@ -143,30 +121,41 @@ class RubyLexer
     c
   end
+  def eat_whitespace
+    r = scan(/\s+/)
+    self.extra_lineno += r.count("\n") if r
+    r
+  end
   def end_of_stream?
     ss.eos?
   end
   def expr_dot?
-    lex_state == :expr_dot
+    lex_state =~ EXPR_DOT
   end
-  def expr_fname?
-    lex_state == :expr_fname
+  def expr_fname? # REFACTOR
+    lex_state =~ EXPR_FNAME
   end
   def expr_result token, text
     cond.push false
     cmdarg.push false
-    result :expr_beg, token, text
+    result EXPR_BEG, token, text
+  end
+  def fixup_lineno extra = 0
+    self.lineno += self.extra_lineno + extra
+    self.extra_lineno = 0
   end
   def heredoc here # TODO: rewrite / remove
     _, eos, func, last_line = here
-    indent         = (func & STR_FUNC_INDENT) != 0 ? "[ \t]*" : nil
-    content_indent = (func & STR_FUNC_ICNTNT) != 0
-    expand         = (func & STR_FUNC_EXPAND) != 0
+    indent         = func =~ STR_FUNC_INDENT ? "[ \t]*" : nil
+    expand         = func =~ STR_FUNC_EXPAND
+    eol            = last_line && last_line.end_with?("\r\n") ? "\r\n" : "\n"
     eos_re         = /#{indent}#{Regexp.escape eos}(\r*\n|\z)/
     err_msg        = "can't match #{eos_re.inspect} anywhere in "
@@ -175,30 +164,35 @@ class RubyLexer
     if beginning_of_line? && scan(eos_re) then
       self.lineno += 1
       ss.unread_many last_line # TODO: figure out how to remove this
-      return :tSTRING_END, eos
+      return :tSTRING_END, [eos, func] # TODO: calculate squiggle width at lex?
     end
     self.string_buffer = []
     if expand then
       case
-      when scan(/#[$@]/) then
-        ss.pos -= 1 # FIX omg stupid
+      when scan(/#(?=\$(-.|[a-zA-Z_0-9~\*\$\?!@\/\\;,\.=:<>\"\&\`\'+]))/) then
+        # TODO: !ISASCII
+        # ?! see parser_peek_variable_name
+        return :tSTRING_DVAR, matched
+      when scan(/#(?=\@\@?[a-zA-Z_])/) then
+        # TODO: !ISASCII
         return :tSTRING_DVAR, matched
       when scan(/#[{]/) then
+        self.command_start = true
         return :tSTRING_DBEG, matched
       when scan(/#/) then
-        string_buffer << '#'
+        string_buffer << "#"
       end
       begin
-        c = tokadd_string func, "\n", nil
+        c = tokadd_string func, eol, nil
         rb_compile_error err_msg if
           c == RubyLexer::EOF
-        if c != "\n" then
-          return :tSTRING_CONTENT, string_buffer.join.delete("\r")
+        if c != eol then
+          return :tSTRING_CONTENT, string_buffer.join
         else
           string_buffer << scan(/\n/)
         end
@@ -214,64 +208,26 @@ class RubyLexer
     self.lex_strterm = [:heredoc, eos, func, last_line]
-    string_content = string_buffer.join.delete("\r")
-    string_content = heredoc_dedent(string_content) if content_indent && ruby23plus?
+    string_content = begin
+                       s = string_buffer.join
+                       s.b.force_encoding Encoding::UTF_8
+                     end
     return :tSTRING_CONTENT, string_content
   end
-  def heredoc_dedent(string_content)
-    width = string_content.scan(/^[ \t]*(?=\S)/).map do |whitespace|
-      heredoc_whitespace_indent_size whitespace
-    end.min || 0
-    string_content.split("\n", -1).map do |line|
-      dedent_string line, width
-    end.join "\n"
-  end
-  def dedent_string(string, width)
-    characters_skipped = 0
-    indentation_skipped = 0
-    string.chars.each do |char|
-      break if indentation_skipped >= width
-      if char == ' '
-        characters_skipped += 1
-        indentation_skipped += 1
-      elsif char == "\t"
-        proposed = TAB_WIDTH * (indentation_skipped / TAB_WIDTH + 1)
-        break if (proposed > width)
-        characters_skipped += 1
-        indentation_skipped = proposed
-      end
-    end
-    string[characters_skipped..-1]
-  end
-  def heredoc_whitespace_indent_size(whitespace)
-    whitespace.chars.inject 0 do |size, char|
-      if char == "\t"
-        size + TAB_WIDTH
-      else
-        size + 1
-      end
-    end
-  end
   def heredoc_identifier # TODO: remove / rewrite
     term, func = nil, STR_FUNC_BORING
     self.string_buffer = []
-    heredoc_indent_mods = '-'
+    heredoc_indent_mods = "-"
     heredoc_indent_mods += '\~' if ruby23plus?
     case
     when scan(/([#{heredoc_indent_mods}]?)([\'\"\`])(.*?)\2/) then
       term = ss[2]
-      func |= STR_FUNC_INDENT unless ss[1].empty?
-      func |= STR_FUNC_ICNTNT if ss[1] == '~'
+      func |= STR_FUNC_INDENT unless ss[1].empty? # TODO: this seems wrong
+      func |= STR_FUNC_ICNTNT if ss[1] == "~"
       func |= case term
               when "\'" then
                 STR_SQUOTE
@@ -288,7 +244,7 @@ class RubyLexer
       func |= STR_DQUOTE
       unless ss[1].empty? then
         func |= STR_FUNC_INDENT
-        func |= STR_FUNC_ICNTNT if ss[1] == '~'
+        func |= STR_FUNC_ICNTNT if ss[1] == "~"
       end
       string_buffer << ss[2]
     else
@@ -304,23 +260,15 @@ class RubyLexer
     self.lex_strterm = [:heredoc, string_buffer.join, func, line]
-    if term == '`' then
+    if term == "`" then
       result nil, :tXSTRING_BEG, "`"
     else
       result nil, :tSTRING_BEG, "\""
     end
   end
-  def in_fname?
-    in_lex_state? :expr_fname
-  end
-  def in_arg_state? # TODO: rename is_after_operator?
-    in_lex_state? :expr_fname, :expr_dot
-  end
-  def in_lex_state?(*states)
-    states.include? lex_state
+  def in_fname? # REFACTOR
+    lex_state =~ EXPR_FNAME
   end
   def int_with_base base
@@ -328,42 +276,35 @@ class RubyLexer
     text = matched
     case
-    when text.end_with?('ri')
-      return result(:expr_end, :tIMAGINARY, Complex(0, Rational(text.chop.chop.to_i(base))))
-    when text.end_with?('r')
-      return result(:expr_end, :tRATIONAL, Rational(text.chop.to_i(base)))
-    when text.end_with?('i')
-      return result(:expr_end, :tIMAGINARY, Complex(0, text.chop.to_i(base)))
+    when text.end_with?("ri")
+      return result(EXPR_NUM, :tIMAGINARY, Complex(0, Rational(text.chop.chop.to_i(base))))
+    when text.end_with?("r")
+      return result(EXPR_NUM, :tRATIONAL, Rational(text.chop.to_i(base)))
+    when text.end_with?("i")
+      return result(EXPR_NUM, :tIMAGINARY, Complex(0, text.chop.to_i(base)))
     else
-      return result(:expr_end, :tINTEGER, text.to_i(base))
+      return result(EXPR_NUM, :tINTEGER, text.to_i(base))
     end
   end
+  def is_after_operator?
+    lex_state =~ EXPR_FNAME|EXPR_DOT
+  end
   def is_arg?
-    in_lex_state?(*EXPR_ARG_ANY)
+    lex_state =~ EXPR_ARG_ANY
   end
   def is_beg?
-    # TODO: in_lex_state?(*EXPR_BEG_ANY) || lex_state == [:expr_arg, :expr_labeled]
-    in_lex_state?(*EXPR_BEG_ANY, :expr_value, :expr_labeled)
+    lex_state =~ EXPR_BEG_ANY || lex_state == EXPR_LAB # yes, == EXPR_LAB
   end
   def is_end?
-    in_lex_state?(*EXPR_END_ANY)
-  end
-  def lvar_defined? id
-    # TODO: (dyna_in_block? && dvar_defined?(id)) || local_id?(id)
-    self.parser.env[id.to_sym] == :lvar
-  end
-  def ruby22_label?
-    ruby22plus? and is_label_possible?
+    lex_state =~ EXPR_END_ANY
   end
   def is_label_possible?
-    (in_lex_state?(:expr_beg, :expr_endfn) && !cmd_state) || is_arg?
+    (lex_state =~ EXPR_LABEL|EXPR_ENDFN && !cmd_state) || is_arg?
   end
   def is_label_suffix?
@@ -378,6 +319,16 @@ class RubyLexer
     lpar_beg && lpar_beg == paren_nest
   end
+  def is_local_id id
+    # maybe just make this false for now
+    self.parser.env[id.to_sym] == :lvar # HACK: this isn't remotely right
+  end
+  def lvar_defined? id
+    # TODO: (dyna_in_block? && dvar_defined?(id)) || local_id?(id)
+    self.parser.env[id.to_sym] == :lvar
+  end
   def matched
     ss.matched
   end
@@ -386,11 +337,139 @@ class RubyLexer
     not is_end?
   end
+  def parse_quote # TODO: remove / rewrite
+    beg, nnd, short_hand, c = nil, nil, false, nil
+    if scan(/[a-z0-9]{1,2}/i) then # Long-hand (e.g. %Q{}).
+      rb_compile_error "unknown type of %string" if ss.matched_size == 2
+      c, beg, short_hand = matched, getch, false
+    else                               # Short-hand (e.g. %{, %., %!, etc)
+      c, beg, short_hand = "Q", getch, true
+    end
+    if end_of_stream? or c == RubyLexer::EOF or beg == RubyLexer::EOF then
+      rb_compile_error "unterminated quoted string meets end of file"
+    end
+    # Figure nnd-char.  "\0" is special to indicate beg=nnd and that no nesting?
+    nnd = { "(" => ")", "[" => "]", "{" => "}", "<" => ">" }[beg]
+    nnd, beg = beg, "\0" if nnd.nil?
+    token_type, text = nil, "%#{c}#{beg}"
+    token_type, string_type = case c
+                              when "Q" then
+                                ch = short_hand ? nnd : c + beg
+                                text = "%#{ch}"
+                                [:tSTRING_BEG,   STR_DQUOTE]
+                              when "q" then
+                                [:tSTRING_BEG,   STR_SQUOTE]
+                              when "W" then
+                                eat_whitespace
+                                [:tWORDS_BEG,    STR_DQUOTE | STR_FUNC_QWORDS]
+                              when "w" then
+                                eat_whitespace
+                                [:tQWORDS_BEG,   STR_SQUOTE | STR_FUNC_QWORDS]
+                              when "x" then
+                                [:tXSTRING_BEG,  STR_XQUOTE]
+                              when "r" then
+                                [:tREGEXP_BEG,   STR_REGEXP]
+                              when "s" then
+                                self.lex_state = EXPR_FNAME
+                                [:tSYMBEG,       STR_SSYM]
+                              when "I" then
+                                eat_whitespace
+                                [:tSYMBOLS_BEG, STR_DQUOTE | STR_FUNC_QWORDS]
+                              when "i" then
+                                eat_whitespace
+                                [:tQSYMBOLS_BEG, STR_SQUOTE | STR_FUNC_QWORDS]
+                              end
+    rb_compile_error "Bad %string type. Expected [QqWwIixrs], found '#{c}'." if
+      token_type.nil?
+    raise "huh" unless string_type
+    string string_type, nnd, beg
+    return token_type, text
+  end
+  def parse_string quote # TODO: rewrite / remove
+    _, string_type, term, open = quote
+    space = false # FIX: remove these
+    func = string_type
+    paren = open
+    term_re = @@regexp_cache[term]
+    qwords = func =~ STR_FUNC_QWORDS
+    regexp = func =~ STR_FUNC_REGEXP
+    expand = func =~ STR_FUNC_EXPAND
+    unless func then # nil'ed from qwords below. *sigh*
+      return :tSTRING_END, nil
+    end
+    space = true if qwords and eat_whitespace
+    if self.string_nest == 0 && scan(/#{term_re}/) then
+      if qwords then
+        quote[1] = nil
+        return :tSPACE, nil
+      elsif regexp then
+        return :tREGEXP_END, self.regx_options
+      else
+        return :tSTRING_END, term
+      end
+    end
+    return :tSPACE, nil if space
+    self.string_buffer = []
+    if expand
+      case
+      when scan(/#(?=\$(-.|[a-zA-Z_0-9~\*\$\?!@\/\\;,\.=:<>\"\&\`\'+]))/) then
+        # TODO: !ISASCII
+        # ?! see parser_peek_variable_name
+        return :tSTRING_DVAR, nil
+      when scan(/#(?=\@\@?[a-zA-Z_])/) then
+        # TODO: !ISASCII
+        return :tSTRING_DVAR, nil
+      when scan(/#[{]/) then
+        self.command_start = true
+        return :tSTRING_DBEG, nil
+      when scan(/#/) then
+        string_buffer << "#"
+      end
+    end
+    if tokadd_string(func, term, paren) == RubyLexer::EOF then
+      if func =~ STR_FUNC_REGEXP then
+        rb_compile_error "unterminated regexp meets end of file"
+      else
+        rb_compile_error "unterminated string meets end of file"
+      end
+    end
+    return :tSTRING_CONTENT, string_buffer.join
+  end
+  def possibly_escape_string text, check
+    content = match[1]
+    if text =~ check then
+      content.gsub(ESC) { unescape $1 }
+    else
+      content.gsub(/\\\\/, "\\").gsub(/\\\'/, "'")
+    end
+  end
   def process_amper text
     token = if is_arg? && space_seen && !check(/\s/) then
                warning("`&' interpreted as argument prefix")
                :tAMPER
-             elsif in_lex_state? :expr_beg, :expr_mid then
+             elsif lex_state =~ EXPR_BEG|EXPR_MID then
                :tAMPER
              else
                :tAMPER2
@@ -402,7 +481,7 @@ class RubyLexer
   def process_backref text
     token = ss[1].to_sym
     # TODO: can't do lineno hack w/ symbol
-    result :expr_end, :tBACK_REF, token
+    result EXPR_END, :tBACK_REF, token
   end
   def process_begin text
@@ -420,54 +499,33 @@ class RubyLexer
   end
   def process_brace_close text
-    # matching compare/parse23.y:8561
-    cond.lexpop
-    cmdarg.lexpop
     case matched
     when "}" then
       self.brace_nest -= 1
-      self.lex_state   = :expr_endarg # TODO: :expr_end ? Look at 2.6
       return :tSTRING_DEND, matched if brace_nest < 0
+    end
+    # matching compare/parse26.y:8099
+    cond.pop
+    cmdarg.pop
+    case matched
+    when "}" then
+      self.lex_state   = ruby24minus? ? EXPR_ENDARG : EXPR_END
       return :tRCURLY, matched
     when "]" then
       self.paren_nest -= 1
-      self.lex_state   = :expr_endarg
+      self.lex_state   = ruby24minus? ? EXPR_ENDARG : EXPR_END
       return :tRBRACK, matched
     when ")" then
       self.paren_nest -= 1
-      self.lex_state   = :expr_endfn
+      self.lex_state   = EXPR_ENDFN
       return :tRPAREN, matched
     else
       raise "Unknown bracing: #{matched.inspect}"
     end
   end
-  def process_colon1 text
-    # ?: / then / when
-    if is_end? || check(/\s/) then
-      return result :expr_beg, :tCOLON, text
-    end
-    case
-    when scan(/\'/) then
-      string STR_SSYM
-    when scan(/\"/) then
-      string STR_DSYM
-    end
-    result :expr_fname, :tSYMBEG, text
-  end
-  def process_colon2 text
-    if is_beg? || in_lex_state?(:expr_class) || is_space_arg? then
-      result :expr_beg, :tCOLON3, text
-    else
-      result :expr_dot, :tCOLON2, text
-    end
-  end
   def process_brace_open text
     # matching compare/parse23.y:8694
     self.brace_nest += 1
@@ -479,67 +537,111 @@ class RubyLexer
       return expr_result(:tLAMBEG, "{")
     end
-    token = case lex_state
-            when :expr_labeled then
+    token = case
+            when lex_state =~ EXPR_LABELED then
               :tLBRACE     # hash
-            when *EXPR_ARG_ANY, :expr_end, :expr_endfn then
-              :tLCURLY     # block (primary)
-            when :expr_endarg
+            when lex_state =~ EXPR_ARG_ANY|EXPR_END|EXPR_ENDFN then
+              :tLCURLY     # block (primary) "{" in parse.y
+            when lex_state =~ EXPR_ENDARG then
               :tLBRACE_ARG # block (expr)
             else
               :tLBRACE     # hash
             end
-    # TODO: self.lex_state |= :expr_label if token != :tLBRACE_ARG
+    state = token == :tLBRACE_ARG ? EXPR_BEG : EXPR_PAR
     self.command_start = true if token != :tLBRACE
-    return expr_result(token, "{")
+    cond.push false
+    cmdarg.push false
+    result state, token, text
+  end
+  def process_colon1 text
+    # ?: / then / when
+    if is_end? || check(/\s/) then
+      return result EXPR_BEG, :tCOLON, text
+    end
+    case
+    when scan(/\'/) then
+      string STR_SSYM
+    when scan(/\"/) then
+      string STR_DSYM
+    end
+    result EXPR_FNAME, :tSYMBEG, text
+  end
+  def process_colon2 text
+    if is_beg? || lex_state =~ EXPR_CLASS || is_space_arg? then
+      result EXPR_BEG, :tCOLON3, text
+    else
+      result EXPR_DOT, :tCOLON2, text
+    end
   end
   def process_float text
     rb_compile_error "Invalid numeric format" if text =~ /__/
     case
-    when text.end_with?('ri')
-      return result(:expr_end, :tIMAGINARY, Complex(0, Rational(text.chop.chop)))
-    when text.end_with?('r')
-      return result(:expr_end, :tRATIONAL, Rational(text.chop))
-    when text.end_with?('i')
-      return result(:expr_end, :tIMAGINARY, Complex(0, text.chop.to_f))
+    when text.end_with?("ri")
+      return result EXPR_NUM, :tIMAGINARY, Complex(0, Rational(text.chop.chop))
+    when text.end_with?("i")
+      return result EXPR_NUM, :tIMAGINARY, Complex(0, text.chop.to_f)
+    when text.end_with?("r")
+      return result EXPR_NUM, :tRATIONAL,  Rational(text.chop)
     else
-      return result(:expr_end, :tFLOAT, text.to_f)
+      return result EXPR_NUM, :tFLOAT, text.to_f
     end
   end
   def process_gvar text
     text.lineno = self.lineno
-    result(:expr_end, :tGVAR, text)
+    result EXPR_END, :tGVAR, text
   end
   def process_gvar_oddity text
-    return result :expr_end, "$", "$" if text == "$" # TODO: wtf is this?
+    return result EXPR_END, "$", "$" if text == "$" # TODO: wtf is this?
     rb_compile_error "#{text.inspect} is not allowed as a global variable name"
   end
   def process_ivar text
     tok_id = text =~ /^@@/ ? :tCVAR : :tIVAR
     text.lineno = self.lineno
-    return result(:expr_end, tok_id, text)
+    result EXPR_END, tok_id, text
+  end
+  def process_label text
+    symbol = possibly_escape_string text, /^\"/
+    result EXPR_LAB, :tLABEL, [symbol, self.lineno]
+  end
+  def process_label_or_string text
+    if @was_label && text =~ /:\Z/ then
+      @was_label = nil
+      return process_label text
+    elsif text =~ /:\Z/ then
+      ss.pos -= 1 # put back ":"
+      text = text[0..-2]
+    end
+    result EXPR_END, :tSTRING, text[1..-2].gsub(/\\\\/, "\\").gsub(/\\\'/, "\'")
   end
   def process_lchevron text
-    if (!in_lex_state?(:expr_dot, :expr_class) &&
+    if (lex_state !~ EXPR_DOT|EXPR_CLASS &&
         !is_end? &&
-        (!is_arg? || space_seen)) then # TODO: || in_state(:expr_labeled)
+        (!is_arg? || lex_state =~ EXPR_LABELED || space_seen)) then
       tok = self.heredoc_identifier
       return tok if tok
     end
-    if in_arg_state? then
-      self.lex_state = :expr_arg
+    if is_after_operator? then
+      self.lex_state = EXPR_ARG
     else
-      self.command_start = true if lex_state == :expr_class
-      self.lex_state = :expr_beg
+      self.command_start = true if lex_state =~ EXPR_CLASS
+      self.lex_state = EXPR_BEG
     end
     return result(lex_state, :tLSHFT, "\<\<")
@@ -549,14 +651,14 @@ class RubyLexer
     c = matched
     hit = false
-    if c == '#' then
+    if c == "#" then
       ss.pos -= 1
       # TODO: handle magic comments
       while scan(/\s*\#.*(\n+|\z)/) do
         hit = true
         self.lineno += matched.lines.to_a.size
-        @comments << matched.gsub(/^ +#/, '#').gsub(/^ +$/, '')
+        @comments << matched.gsub(/^ +#/, "#").gsub(/^ +$/, "")
       end
       return nil if end_of_stream?
@@ -567,17 +669,15 @@ class RubyLexer
     # Replace a string of newlines with a single one
     self.lineno += matched.lines.to_a.size if scan(/\n+/)
-    # TODO: remove :expr_value -- audit all uses of it
-    c = in_lex_state?(:expr_beg, :expr_value, :expr_class,
-                      :expr_fname, :expr_dot) && !in_lex_state?(:expr_labeled)
+    c = (lex_state =~ EXPR_BEG|EXPR_CLASS|EXPR_FNAME|EXPR_DOT &&
+         lex_state !~ EXPR_LABELED)
     # TODO: figure out what token_seen is for
-    # TODO: if c || self.lex_state == [:expr_beg, :expr_labeled] then
-    if c || self.lex_state == :expr_labeled then
+    if c || self.lex_state == EXPR_LAB then # yes, == EXPR_LAB
       # ignore if !fallthrough?
       if !c && parser.in_kwarg then
         # normal newline
-        return result(:expr_beg, :tNL, nil)
+        self.command_start = true
+        return result EXPR_BEG, :tNL, nil
       else
         return # skip
       end
@@ -592,41 +692,46 @@ class RubyLexer
     self.command_start = true
-    return result(:expr_beg, :tNL, nil)
+    return result(EXPR_BEG, :tNL, nil)
   end
   def process_nthref text
     # TODO: can't do lineno hack w/ number
-    result :expr_end, :tNTH_REF, ss[1].to_i
+    result EXPR_END, :tNTH_REF, ss[1].to_i
   end
   def process_paren text
-    token = process_paren19
+    token = if is_beg? then
+              :tLPAREN
+            elsif !space_seen then
+              # foo( ... ) => method call, no ambiguity
+              :tLPAREN2
+            elsif is_space_arg? then
+              :tLPAREN_ARG
+            elsif lex_state =~ EXPR_ENDFN && !lambda_beginning? then
+              # TODO:
+              # warn("parentheses after method name is interpreted as " \
+              #      "an argument list, not a decomposed argument")
+              :tLPAREN2
+            else
+              :tLPAREN2 # plain "(" in parse.y
+            end
     self.paren_nest += 1
-    # TODO: add :expr_label to :expr_beg (set in expr_result below)
-    return expr_result(token, "(")
-  end
-  def process_paren19
-    if is_beg? then
-      :tLPAREN
-    elsif is_space_arg? then
-      :tLPAREN_ARG
-    else
-      :tLPAREN2 # plain '(' in parse.y
-    end
+    cond.push false
+    cmdarg.push false
+    result EXPR_PAR, token, text
   end
   def process_percent text
     return parse_quote if is_beg?
-    return result(:expr_beg, :tOP_ASGN, "%") if scan(/\=/)
+    return result EXPR_BEG, :tOP_ASGN, "%" if scan(/\=/)
-    return parse_quote if is_arg? && space_seen && ! check(/\s/)
+    return parse_quote if is_space_arg?(check(/\s/)) || (lex_state =~ EXPR_FITEM && check(/s/))
-    return result(:arg_state, :tPERCENT, "%")
+    return result :arg_state, :tPERCENT, "%"
   end
   def process_plus_minus text
@@ -637,33 +742,33 @@ class RubyLexer
                     [:tUMINUS, :tMINUS]
                   end
-    if in_arg_state? then
+    if is_after_operator? then
       if scan(/@/) then
-        return result(:expr_arg, utype, "#{sign}@")
+        return result(EXPR_ARG, utype, "#{sign}@")
       else
-        return result(:expr_arg, type, sign)
+        return result(EXPR_ARG, type, sign)
       end
     end
-    return result(:expr_beg, :tOP_ASGN, sign) if scan(/\=/)
+    return result(EXPR_BEG, :tOP_ASGN, sign) if scan(/\=/)
-    if (is_beg? || (is_arg? && space_seen && !check(/\s/))) then
+    if is_beg? || (is_arg? && space_seen && !check(/\s/)) then
       arg_ambiguous if is_arg?
       if check(/\d/) then
         return nil if utype == :tUPLUS
-        return result(:expr_beg, :tUMINUS_NUM, sign)
+        return result EXPR_BEG, :tUMINUS_NUM, sign
       end
-      return result(:expr_beg, utype, sign)
+      return result EXPR_BEG, utype, sign
     end
-    return result(:expr_beg, type, sign)
+    result EXPR_BEG, type, sign
   end
   def process_questionmark text
     if is_end? then
-      return result(:expr_value, :tEH, "?")
+      return result EXPR_BEG, :tEH, "?"
     end
     if end_of_stream? then
@@ -672,12 +777,12 @@ class RubyLexer
     if check(/\s|\v/) then
       unless is_arg? then
-        c2 = { " " => 's',
-              "\n" => 'n',
-              "\t" => 't',
-              "\v" => 'v',
-              "\r" => 'r',
-              "\f" => 'f' }[matched]
+        c2 = { " " => "s",
+              "\n" => "n",
+              "\t" => "t",
+              "\v" => "v",
+              "\r" => "r",
+              "\f" => "f" }[matched]
         if c2 then
           warning("invalid character syntax; use ?\\" + c2)
@@ -685,18 +790,28 @@ class RubyLexer
       end
       # ternary
-      return result(:expr_value, :tEH, "?")
+      return result EXPR_BEG, :tEH, "?"
     elsif check(/\w(?=\w)/) then # ternary, also
-      return result(:expr_beg, :tEH, "?")
+      return result EXPR_BEG, :tEH, "?"
     end
     c = if scan(/\\/) then
           self.read_escape
         else
-          ss.getch
+          getch
         end
-    return result(:expr_end, :tSTRING, c)
+    result EXPR_END, :tSTRING, c
+  end
+  def process_simple_string text
+    replacement = text[1..-2].gsub(ESC) {
+      unescape($1).b.force_encoding Encoding::UTF_8
+    }
+    replacement = replacement.b unless replacement.valid_encoding?
+    result EXPR_END, :tSTRING, replacement
   end
   def process_slash text
@@ -707,7 +822,7 @@ class RubyLexer
     end
     if scan(/\=/) then
-      return result(:expr_beg, :tOP_ASGN, "/")
+      return result(EXPR_BEG, :tOP_ASGN, "/")
     end
     if is_arg? && space_seen then
@@ -726,73 +841,68 @@ class RubyLexer
     token = nil
-    if in_arg_state? then
+    if is_after_operator? then
       case
       when scan(/\]\=/) then
         self.paren_nest -= 1 # HACK? I dunno, or bug in MRI
-        return result(:expr_arg, :tASET, "[]=")
+        return result EXPR_ARG, :tASET, "[]="
       when scan(/\]/) then
         self.paren_nest -= 1 # HACK? I dunno, or bug in MRI
-        return result(:expr_arg, :tAREF, "[]")
+        return result EXPR_ARG, :tAREF, "[]"
       else
         rb_compile_error "unexpected '['"
       end
     elsif is_beg? then
       token = :tLBRACK
-    elsif is_arg? && space_seen then
+    elsif is_arg? && (space_seen || lex_state =~ EXPR_LABELED) then
       token = :tLBRACK
     else
       token = :tLBRACK2
     end
-    # TODO: this is done by expr_result except "|EXPR_LABEL")
-    # SET_LEX_STATE(EXPR_BEG|EXPR_LABEL);
-    expr_result token, "["
-  end
-  def possibly_escape_string text, check
-    content = match[1]
-    if text =~ check then
-      content.gsub(ESC) { unescape $1 }
-    else
-      content.gsub(/\\\\/, "\\").gsub(/\\'/, "'")
-    end
+    cond.push false
+    cmdarg.push false
+    result EXPR_PAR, token, text
   end
-  def process_symbol text
-    symbol = possibly_escape_string text, /^:"/
+  def process_string # TODO: rewrite / remove
+    # matches top of parser_yylex in compare/parse23.y:8113
+    token = if lex_strterm[0] == :heredoc then
+              self.heredoc lex_strterm
+            else
+              self.parse_string lex_strterm
+            end
-    return result(:expr_end, :tSYMBOL, symbol)
-  end
+    token_type, c = token
-  def was_label?
-    @was_label = ruby22_label?
-    true
-  end
+    # matches parser_string_term from 2.3, but way off from 2.5
+    if ruby22plus? && token_type == :tSTRING_END && ["'", '"'].include?(c) then
+      if ((lex_state =~ EXPR_BEG|EXPR_ENDFN &&
+           !cond.is_in_state) || is_arg?) &&
+          is_label_suffix? then
+        scan(/:/)
+        token_type = token[0] = :tLABEL_END
+      end
+    end
-  def process_label_or_string text
-    if @was_label && text =~ /:\Z/ then
-      @was_label = nil
-      return process_label text
-    elsif text =~ /:\Z/ then
-      ss.pos -= 1 # put back ":"
-      text = text[0..-2]
+    if [:tSTRING_END, :tREGEXP_END, :tLABEL_END].include? token_type then
+      self.lex_strterm = nil
+      self.lex_state   = (token_type == :tLABEL_END) ? EXPR_PAR : EXPR_LIT
     end
-    result :expr_end, :tSTRING, text[1..-2].gsub(/\\\\/, "\\").gsub(/\\'/, "'")
+    return token
   end
-  def process_label text
-    symbol = possibly_escape_string text, /^"/
+  def process_symbol text
+    symbol = possibly_escape_string text, /^:\"/ # stupid emacs
-    result(:expr_labeled, :tLABEL, [symbol, self.lineno]) # TODO: expr_arg|expr_labeled
+    result EXPR_LIT, :tSYMBOL, symbol
   end
   def process_token text
     # matching: parse_ident in compare/parse23.y:7989
     # TODO: make this always return [token, lineno]
-    self.last_state = lex_state
+    # FIX: remove: self.last_state = lex_state
     token = self.token = text
     token << matched if scan(/[\!\?](?!=)/)
@@ -801,7 +911,7 @@ class RubyLexer
       case
       when token =~ /[!?]$/ then
         :tFID
-      when in_lex_state?(:expr_fname) && scan(/=(?:(?![~>=])|(?==>))/) then
+      when lex_state =~ EXPR_FNAME && scan(/=(?:(?![~>=])|(?==>))/) then
         # ident=, not =~ => == or followed by =>
         # TODO test lexing of a=>b vs a==>b
         token << matched
@@ -814,31 +924,33 @@ class RubyLexer
     if is_label_possible? and is_label_suffix? then
       scan(/:/)
-      # TODO: :expr_arg|:expr_labeled
-      return result :expr_labeled, :tLABEL, [token, self.lineno]
+      # TODO: propagate the lineno to ALL results
+      return result EXPR_LAB, :tLABEL, [token, self.lineno]
     end
-    # TODO: mb == ENC_CODERANGE_7BIT && !in_lex_state?(:expr_dot)
-    unless in_lex_state? :expr_dot then
+    # TODO: mb == ENC_CODERANGE_7BIT && lex_state !~ EXPR_DOT
+    if lex_state !~ EXPR_DOT then
       # See if it is a reserved word.
       keyword = RubyParserStuff::Keyword.keyword token
       return process_token_keyword keyword if keyword
-    end # unless in_lex_state? :expr_dot
+    end
     # matching: compare/parse23.y:8079
-    state = if is_beg? or is_arg? or in_lex_state? :expr_dot then
-              cmd_state ? :expr_cmdarg : :expr_arg
-            elsif in_lex_state? :expr_fname then
-              :expr_endfn
+    state = if is_beg? or is_arg? or lex_state =~ EXPR_DOT then
+              cmd_state ? EXPR_CMDARG : EXPR_ARG
+            elsif lex_state =~ EXPR_FNAME then
+              EXPR_ENDFN
             else
-              :expr_end
+              EXPR_END
             end
-    if not [:expr_dot, :expr_fname].include? last_state and
-        (tok_id == :tIDENTIFIER) and # not :expr_fname, not attrasgn
+    tok_id = :tIDENTIFIER if tok_id == :tCONSTANT && is_local_id(token)
+    if last_state !~ EXPR_DOT|EXPR_FNAME and
+        (tok_id == :tIDENTIFIER) and # not EXPR_FNAME, not attrasgn
         lvar_defined?(token) then
-      state = :expr_end # TODO: EXPR_END|EXPR_LABEL
+      state = EXPR_END|EXPR_LABEL
     end
     token.lineno = self.lineno # yes, on a string. I know... I know...
@@ -853,32 +965,30 @@ class RubyLexer
     value = [token, self.lineno]
-    return result(lex_state, keyword.id0, value) if state == :expr_fname
+    return result(lex_state, keyword.id0, value) if state =~ EXPR_FNAME
-    self.command_start = true if lex_state == :expr_beg
+    self.command_start = true if lex_state =~ EXPR_BEG
     case
-    when keyword.id0 == :kDO then
+    when keyword.id0 == :kDO then # parse26.y line 7591
       case
       when lambda_beginning? then
         self.lpar_beg = nil # lambda_beginning? == FALSE in the body of "-> do ... end"
-        self.paren_nest -= 1
-        result(lex_state, :kDO_LAMBDA, value)
+        self.paren_nest -= 1 # TODO: question this?
+        result lex_state, :kDO_LAMBDA, value
       when cond.is_in_state then
-        result(lex_state, :kDO_COND, value)
-      when cmdarg.is_in_state && state != :expr_cmdarg then
-        result(lex_state, :kDO_BLOCK, value)
-      when [:expr_beg, :expr_endarg].include?(state) then
-        result(lex_state, :kDO_BLOCK, value)
+        result lex_state, :kDO_COND, value
+      when cmdarg.is_in_state && state != EXPR_CMDARG then
+        result lex_state, :kDO_BLOCK, value
       else
-        result(lex_state, :kDO, value)
+        result lex_state, :kDO, value
       end
-    when [:expr_beg, :expr_labeled].include?(state) then
-      result(lex_state, keyword.id0, value)
+    when state =~ EXPR_PAD then
+      result lex_state, keyword.id0, value
     when keyword.id0 != keyword.id1 then
-      result(:expr_beg, keyword.id1, value) # TODO: :expr_beg|:expr_label
+      result EXPR_PAR, keyword.id1, value
     else
-      result(lex_state, keyword.id1, value)
+      result lex_state, keyword.id1, value
     end
   end
@@ -886,9 +996,9 @@ class RubyLexer
     ss.unscan # put back "_"
     if beginning_of_line? && scan(/\__END__(\r?\n|\Z)/) then
-      return [RubyLexer::EOF, RubyLexer::EOF]
-    elsif scan(/\_\w*/) then
-      return process_token matched
+      [RubyLexer::EOF, RubyLexer::EOF]
+    elsif scan(/#{IDENT_CHAR}+/) then
+      process_token matched
     end
   end
@@ -921,10 +1031,11 @@ class RubyLexer
     when scan(/s/) then                   # space
       " "
     when scan(/[0-7]{1,3}/) then          # octal constant
-      (matched.to_i(8) & 0xFF).chr
+      (matched.to_i(8) & 0xFF).chr.force_encoding Encoding::UTF_8
     when scan(/x([0-9a-fA-F]{1,2})/) then # hex constant
-      ss[1].to_i(16).chr
-    when check(/M-\\[\\MCc]/) then
+      # TODO: force encode everything to UTF-8?
+      ss[1].to_i(16).chr.force_encoding Encoding::UTF_8
+    when check(/M-\\./) then
       scan(/M-\\/) # eat it
       c = self.read_escape
       c[0] = (c[0].ord | 0x80).chr
@@ -938,6 +1049,11 @@ class RubyLexer
       c = self.read_escape
       c[0] = (c[0].ord & 0x9f).chr
       c
+    when check(/(C-|c)\\(?!u|\\)/) then
+      scan(/(C-|c)\\/) # eat it
+      c = read_escape
+      c[0] = (c[0].ord & 0x9f).chr
+      c
     when scan(/C-\?|c\?/) then
       127.chr
     when scan(/(C-|c)(.)/) then
@@ -946,15 +1062,25 @@ class RubyLexer
       c
     when scan(/^[89]/i) then # bad octal or hex... MRI ignores them :(
       matched
-    when scan(/u([0-9a-fA-F]{2,4}|\{[0-9a-fA-F]{2,6}\})/) then
-      [ss[1].delete("{}").to_i(16)].pack("U")
+    when scan(/u(\h{4})/) then
+      [ss[1].to_i(16)].pack("U")
+    when scan(/u(\h{1,3})/) then
+      rb_compile_error "Invalid escape character syntax"
+    when scan(/u\{(\h+(?:\s+\h+)*)\}/) then
+      ss[1].split.map { |s| s.to_i(16) }.pack("U*")
     when scan(/[McCx0-9]/) || end_of_stream? then
       rb_compile_error("Invalid escape character syntax")
     else
-      ss.getch
+      getch
     end.dup
   end
+  def getch # Reads one character from the scanner, treating a CR LF pair as a single LF
+    c = ss.getch
+    c = ss.getch if c == "\r" && ss.peek(1) == "\n" # CR directly followed by LF: drop the CR and return the LF instead
+    c # a lone CR is returned unchanged
+  end
   def regx_options # TODO: rewrite / remove
     good, bad = [], []
@@ -974,7 +1100,7 @@ class RubyLexer
     self.brace_nest    = 0
     self.command_start = true
     self.comments      = []
-    self.lex_state     = :expr_none
+    self.lex_state     = EXPR_NONE
     self.lex_strterm   = nil
     self.lineno        = 1
     self.lpar_beg      = nil
@@ -988,29 +1114,30 @@ class RubyLexer
     self.cmdarg.reset
   end
-  def result lex_state, token, text # :nodoc:
-    lex_state = self.arg_state if lex_state == :arg_state
-    self.lex_state = lex_state if lex_state
+  def result new_state, token, text # :nodoc:
+    new_state = self.arg_state if new_state == :arg_state # the :arg_state placeholder is resolved through self.arg_state
+    self.lex_state = new_state if new_state # nil or false leaves the current lex_state untouched
     [token, text] # the pair handed back to the caller
   end
-  def scan re
-    ss.scan re
+  def ruby22_label? # Truthy only when ruby22plus? holds and is_label_possible? also holds
+    ruby22plus? and is_label_possible?
   end
-  def check re
-    ss.check re
+  def ruby22plus? # True when the parser class reports version 22 or higher (presumably Ruby 2.2 -- confirm)
+    parser.class.version >= 22
   end
-  def eat_whitespace
-    r = scan(/\s+/)
-    self.extra_lineno += r.count("\n") if r
-    r
+  def ruby23plus? # True when the parser class reports version 23 or higher (presumably Ruby 2.3 -- confirm)
+    parser.class.version >= 23
   end
-  def fixup_lineno extra = 0
-    self.lineno += self.extra_lineno + extra
-    self.extra_lineno = 0
+  def ruby24minus? # True when the parser class reports version 24 or lower (presumably Ruby 2.4 -- confirm)
+    parser.class.version <= 24
+  end
+  def scan re # Thin delegate: forwards the pattern to the scanner held in ss and returns its result
+    ss.scan re
   end
   def scanner_class # TODO: design this out of oedipus_lex. or something.
@@ -1033,12 +1160,6 @@ class RubyLexer
     self.lex_strterm = [:strterm, type, beg, nnd]
   end
-  # TODO: consider
-  # def src= src
-  #   raise "bad src: #{src.inspect}" unless String === src
-  #   @src = RPStringScanner.new(src)
-  # end
   def tokadd_escape term # TODO: rewrite / remove
     case
     when scan(/\\\n/) then
@@ -1057,8 +1178,10 @@ class RubyLexer
       prev = self.string_buffer.last
       if term == chr && prev && prev.end_with?("(?") then
         self.string_buffer << chr
+      elsif term == chr || chr.ascii_only? then
+        self.string_buffer << matched # dunno why we keep them for ascii
       else
-        self.string_buffer << matched
+        self.string_buffer << chr # HACK? this is such a rat's nest
       end
     else
       rb_compile_error "Invalid escape character syntax"
@@ -1066,22 +1189,24 @@ class RubyLexer
   end
   def tokadd_string(func, term, paren) # TODO: rewrite / remove
-    qwords = (func & STR_FUNC_QWORDS) != 0
-    escape = (func & STR_FUNC_ESCAPE) != 0
-    expand = (func & STR_FUNC_EXPAND) != 0
-    regexp = (func & STR_FUNC_REGEXP) != 0
-    symbol = (func & STR_FUNC_SYMBOL) != 0
+    qwords = func =~ STR_FUNC_QWORDS
+    escape = func =~ STR_FUNC_ESCAPE
+    expand = func =~ STR_FUNC_EXPAND
+    regexp = func =~ STR_FUNC_REGEXP
+    symbol = func =~ STR_FUNC_SYMBOL
     paren_re = @@regexp_cache[paren]
-    term_re  = @@regexp_cache[term]
+    term_re  = if term == "\n"
+                 /#{Regexp.escape "\r"}?#{Regexp.escape "\n"}/
+               else
+                 @@regexp_cache[term]
+               end
     until end_of_stream? do
       c = nil
       handled = true
       case
-      when paren_re && scan(paren_re) then
-        self.string_nest += 1
       when scan(term_re) then
         if self.string_nest == 0 then
           ss.pos -= 1
@@ -1089,7 +1214,9 @@ class RubyLexer
         else
           self.string_nest -= 1
         end
-      when expand && scan(/#(?=[\$\@\{])/) then
+      when paren_re && scan(paren_re) then
+        self.string_nest += 1
+      when expand && scan(/#(?=[\$\@\{])/) then # TODO: this seems wrong
         ss.pos -= 1
         break
       when qwords && scan(/\s/) then
@@ -1103,7 +1230,7 @@ class RubyLexer
           string_buffer << "\n"
           next
         when qwords && scan(/\\\s/) then
-          c = ' '
+          c = " "
         when expand && scan(/\\\n/) then
           next
         when regexp && check(/\\/) then
@@ -1128,12 +1255,16 @@ class RubyLexer
       end # top case
       unless handled then
-        t = Regexp.escape term
-        x = Regexp.escape(paren) if paren && paren != "\000"
+        t = if term == "\n"
+              Regexp.escape "\r\n"
+            else
+              Regexp.escape term
+            end
+        x = Regexp.escape paren if paren && paren != "\000"
         re = if qwords then
-               /[^#{t}#{x}\#\0\\\s]+|./ # |. to pick up whatever
+               /[^#{t}#{x}\#\\\s]+|./ # |. to pick up whatever
              else
-               /[^#{t}#{x}\#\0\\]+|./
+               /[^#{t}#{x}\#\\]+|./
              end
         scan re
@@ -1173,12 +1304,15 @@ class RubyLexer
           s
         when /^[McCx0-9]/ then
           rb_compile_error("Invalid escape character syntax")
-        when /u([0-9a-fA-F]{2,4}|\{[0-9a-fA-F]{2,6}\})/ then
+        when /u(\h{4})/ then
           [$1.delete("{}").to_i(16)].pack("U")
+        when /u(\h{1,3})/ then
+          rb_compile_error("Invalid escape character syntax")
+        when /u\{(\h+(?:\s+\h+)*)\}/ then
+          $1.split.map { |s| s.to_i(16) }.pack("U*")
         else
           s
         end
-    x.force_encoding "UTF-8" if HAS_ENC
     x
   end
@@ -1186,172 +1320,154 @@ class RubyLexer
     # do nothing for now
   end
-  def ruby22plus?
-    parser.class.version >= 22
-  end
-  def ruby23plus?
-    parser.class.version >= 23
+  def was_label? # Records the current ruby22_label? answer in @was_label as a side effect
+    @was_label = ruby22_label?
+    true # always true regardless of the recorded value; presumably so it can sit in a guard condition (confirm against callers)
   end
-  def process_string # TODO: rewrite / remove
-    # matches top of parser_yylex in compare/parse23.y:8113
-    token = if lex_strterm[0] == :heredoc then
-              self.heredoc lex_strterm
-            else
-              self.parse_string lex_strterm
-            end
+  class State
+    attr_accessor :n
+    attr_accessor :names
-    token_type, c = token
+    # TODO: take a shared hash of strings for inspect/to_s
+    def initialize o, names # o: Integer bit mask; names: Hash that inspect walks to label the set bits
+      raise ArgumentError, "bad state: %p" % [o] unless Integer === o # TODO: remove
-    # matches parser_string_term
-    if ruby22plus? && token_type == :tSTRING_END && ["'", '"'].include?(c) then
-      if (([:expr_beg, :expr_endfn].include?(lex_state) &&
-           !cond.is_in_state) || is_arg?) &&
-          is_label_suffix? then
-        scan(/:/)
-        token_type = token[0] = :tLABEL_END
-      end
+      self.n = o # the bit mask
+      self.names = names # kept by reference, so entries merged in later are visible to this state
     end
-    if [:tSTRING_END, :tREGEXP_END, :tLABEL_END].include? token_type then
-      self.lex_strterm = nil
-      # TODO: :expr_beg|:expr_label
-      self.lex_state   = (token_type == :tLABEL_END) ? :expr_label : :expr_end
+    def == o # Equal when o is the same object, or has exactly the same class and the same n
+      self.equal?(o) || (o.class == self.class && o.n == self.n) # names is not part of the comparison
     end
-    return token
-  end
-  def parse_quote # TODO: remove / rewrite
-    beg, nnd, short_hand, c = nil, nil, false, nil
-    if scan(/[a-z0-9]{1,2}/i) then # Long-hand (e.g. %Q{}).
-      rb_compile_error "unknown type of %string" if ss.matched_size == 2
-      c, beg, short_hand = matched, ss.getch, false
-    else                               # Short-hand (e.g. %{, %., %!, etc)
-      c, beg, short_hand = 'Q', ss.getch, true
-    end
-    if end_of_stream? or c == RubyLexer::EOF or beg == RubyLexer::EOF then
-      rb_compile_error "unterminated quoted string meets end of file"
-    end
-    # Figure nnd-char.  "\0" is special to indicate beg=nnd and that no nesting?
-    nnd = { "(" => ")", "[" => "]", "{" => "}", "<" => ">" }[beg]
-    nnd, beg = beg, "\0" if nnd.nil?
-    token_type, text = nil, "%#{c}#{beg}"
-    token_type, string_type = case c
-                              when 'Q' then
-                                ch = short_hand ? nnd : c + beg
-                                text = "%#{ch}"
-                                [:tSTRING_BEG,   STR_DQUOTE]
-                              when 'q' then
-                                [:tSTRING_BEG,   STR_SQUOTE]
-                              when 'W' then
-                                eat_whitespace
-                                [:tWORDS_BEG,    STR_DQUOTE | STR_FUNC_QWORDS]
-                              when 'w' then
-                                eat_whitespace
-                                [:tQWORDS_BEG,   STR_SQUOTE | STR_FUNC_QWORDS]
-                              when 'x' then
-                                [:tXSTRING_BEG,  STR_XQUOTE]
-                              when 'r' then
-                                [:tREGEXP_BEG,   STR_REGEXP]
-                              when 's' then
-                                self.lex_state  = :expr_fname
-                                [:tSYMBEG,       STR_SSYM]
-                              when 'I' then
-                                eat_whitespace
-                                [:tSYMBOLS_BEG, STR_DQUOTE | STR_FUNC_QWORDS]
-                              when 'i' then
-                                eat_whitespace
-                                [:tQSYMBOLS_BEG, STR_SQUOTE | STR_FUNC_QWORDS]
-                              end
-    rb_compile_error "Bad %string type. Expected [QqWwIixrs], found '#{c}'." if
-      token_type.nil?
-    raise "huh" unless string_type
-    string string_type, nnd, beg
-    return token_type, text
-  end
-  def parse_string quote # TODO: rewrite / remove
-    _, string_type, term, open = quote
-    space = false # FIX: remove these
-    func = string_type
-    paren = open
-    term_re = @@regexp_cache[term]
-    qwords = (func & STR_FUNC_QWORDS) != 0
-    regexp = (func & STR_FUNC_REGEXP) != 0
-    expand = (func & STR_FUNC_EXPAND) != 0
-    unless func then # nil'ed from qwords below. *sigh*
-      return :tSTRING_END, nil
+    def =~ v # True when self and v have at least one bit in common
+      (self.n & v.n) != 0 # a state whose n is 0 therefore never matches anything
     end
-    space = true if qwords and eat_whitespace
-    if self.string_nest == 0 && scan(/#{term_re}/) then
-      if qwords then
-        quote[1] = nil
-        return :tSPACE, nil
-      elsif regexp then
-        return :tREGEXP_END, self.regx_options
-      else
-        return :tSTRING_END, term
-      end
+    def | v # Bitwise union of two states; returns a new State and leaves both operands unchanged
+      raise ArgumentError, "Incompatible State: %p vs %p" % [self, v] unless
+        self.names == v.names # states whose name tables differ cannot be combined
+      self.class.new(self.n | v.n, self.names) # the result keeps the same name table
     end
-    return :tSPACE, nil if space
+    def inspect # Names of every table entry sharing a bit with n, joined by |, with the EXPR_, STR_FUNC_ and STR_ prefixes removed; Value(0) when n is zero
+      return "Value(0)" if n.zero? # HACK?
-    self.string_buffer = []
-    if expand
-      case
-      when scan(/#(?=\$(-.|[a-zA-Z_0-9~\*\$\?!@\/\\;,\.=:<>\"\&\`\'+]))/) then
-        # TODO: !ISASCII
-        # ?! see parser_peek_variable_name
-        return :tSTRING_DVAR, nil
-      when scan(/#(?=\@\@?[a-zA-Z_])/) then
-        # TODO: !ISASCII
-        return :tSTRING_DVAR, nil
-      when scan(/#[{]/) then
-        self.command_start = true
-        return :tSTRING_DBEG, nil
-      when scan(/#/) then
-        string_buffer << '#'
-      end
+      names.map { |v, k| k if self =~ v }.
+        compact.
+        join("|").
+        gsub(/(?:EXPR_|STR_(?:FUNC_)?)/, "")
     end
-    if tokadd_string(func, term, paren) == RubyLexer::EOF then
-      rb_compile_error "unterminated string meets end of file"
+    alias to_s inspect
+    module Values # State constants: EXPR_* masks for lex_state and STR_* masks for string handling
+      expr_names = {} # one table shared by every EXPR_ state; filled by merge! below once the constants exist
+      EXPR_NONE    = State.new    0x0, expr_names
+      EXPR_BEG     = State.new    0x1, expr_names
+      EXPR_END     = State.new    0x2, expr_names
+      EXPR_ENDARG  = State.new    0x4, expr_names
+      EXPR_ENDFN   = State.new    0x8, expr_names
+      EXPR_ARG     = State.new   0x10, expr_names
+      EXPR_CMDARG  = State.new   0x20, expr_names
+      EXPR_MID     = State.new   0x40, expr_names
+      EXPR_FNAME   = State.new   0x80, expr_names
+      EXPR_DOT     = State.new  0x100, expr_names
+      EXPR_CLASS   = State.new  0x200, expr_names
+      EXPR_LABEL   = State.new  0x400, expr_names
+      EXPR_LABELED = State.new  0x800, expr_names
+      EXPR_FITEM   = State.new 0x1000, expr_names
+      EXPR_BEG_ANY = EXPR_BEG | EXPR_MID    | EXPR_CLASS # composite masks built with State#|
+      EXPR_ARG_ANY = EXPR_ARG | EXPR_CMDARG
+      EXPR_END_ANY = EXPR_END | EXPR_ENDARG | EXPR_ENDFN
+      # extra fake lex_state names to make things a bit cleaner
+      EXPR_LAB = EXPR_ARG|EXPR_LABELED
+      EXPR_LIT = EXPR_END|EXPR_ENDARG
+      EXPR_PAR = EXPR_BEG|EXPR_LABEL
+      EXPR_PAD = EXPR_BEG|EXPR_LABELED
+      EXPR_NUM = EXPR_LIT # same object as EXPR_LIT, not a copy
+      expr_names.merge!(EXPR_NONE    => "EXPR_NONE", # only the base states are named; composites print as a joined list of these
+                        EXPR_BEG     => "EXPR_BEG",
+                        EXPR_END     => "EXPR_END",
+                        EXPR_ENDARG  => "EXPR_ENDARG",
+                        EXPR_ENDFN   => "EXPR_ENDFN",
+                        EXPR_ARG     => "EXPR_ARG",
+                        EXPR_CMDARG  => "EXPR_CMDARG",
+                        EXPR_MID     => "EXPR_MID",
+                        EXPR_FNAME   => "EXPR_FNAME",
+                        EXPR_DOT     => "EXPR_DOT",
+                        EXPR_CLASS   => "EXPR_CLASS",
+                        EXPR_LABEL   => "EXPR_LABEL",
+                        EXPR_LABELED => "EXPR_LABELED",
+                        EXPR_FITEM   => "EXPR_FITEM")
+      # ruby constants for strings
+      str_func_names = {} # separate table, so State#| refuses to mix EXPR_ and STR_ states
+      STR_FUNC_BORING = State.new 0x00,    str_func_names
+      STR_FUNC_ESCAPE = State.new 0x01,    str_func_names
+      STR_FUNC_EXPAND = State.new 0x02,    str_func_names
+      STR_FUNC_REGEXP = State.new 0x04,    str_func_names
+      STR_FUNC_QWORDS = State.new 0x08,    str_func_names
+      STR_FUNC_SYMBOL = State.new 0x10,    str_func_names
+      STR_FUNC_INDENT = State.new 0x20,    str_func_names # <<-HEREDOC
+      STR_FUNC_LABEL  = State.new 0x40,    str_func_names
+      STR_FUNC_LIST   = State.new 0x4000,  str_func_names
+      STR_FUNC_TERM   = State.new 0x8000,  str_func_names
+      STR_FUNC_ICNTNT = State.new 0x10000, str_func_names # <<~HEREDOC -- TODO: remove?
+      # TODO: check parser25.y on how they do STR_FUNC_INDENT
+      STR_SQUOTE = STR_FUNC_BORING # same object as STR_FUNC_BORING (no bits set)
+      STR_DQUOTE = STR_FUNC_EXPAND # STR_DQUOTE and STR_XQUOTE are the same object as STR_FUNC_EXPAND
+      STR_XQUOTE = STR_FUNC_EXPAND
+      STR_REGEXP = STR_FUNC_REGEXP | STR_FUNC_ESCAPE | STR_FUNC_EXPAND
+      STR_SWORD  = STR_FUNC_QWORDS | STR_FUNC_LIST
+      STR_DWORD  = STR_FUNC_QWORDS | STR_FUNC_EXPAND | STR_FUNC_LIST
+      STR_SSYM   = STR_FUNC_SYMBOL
+      STR_DSYM   = STR_FUNC_SYMBOL | STR_FUNC_EXPAND
+      str_func_names.merge!(STR_FUNC_ESCAPE => "STR_FUNC_ESCAPE",
+                            STR_FUNC_EXPAND => "STR_FUNC_EXPAND",
+                            STR_FUNC_REGEXP => "STR_FUNC_REGEXP",
+                            STR_FUNC_QWORDS => "STR_FUNC_QWORDS",
+                            STR_FUNC_SYMBOL => "STR_FUNC_SYMBOL",
+                            STR_FUNC_INDENT => "STR_FUNC_INDENT",
+                            STR_FUNC_LABEL  => "STR_FUNC_LABEL",
+                            STR_FUNC_LIST   => "STR_FUNC_LIST",
+                            STR_FUNC_TERM   => "STR_FUNC_TERM",
+                            STR_FUNC_ICNTNT => "STR_FUNC_ICNTNT",
+                            STR_SQUOTE      => "STR_SQUOTE") # key has n == 0, which never matches =~, so inspect never prints this name
     end
-    return :tSTRING_CONTENT, string_buffer.join
+    include Values
   end
+  include State::Values
 end
 require "ruby_lexer.rex"
 if ENV["RP_LINENO_DEBUG"] then # opt-in tracing of every lineno assignment, enabled through the environment
   class RubyLexer
-    alias :old_lineno= :lineno=
     def d o # debug helper: writes o.inspect to standard error
       $stderr.puts o.inspect
     end
+    alias old_lineno= lineno= # keep the original writer reachable before it is redefined below
     def lineno= n # wrapped writer: performs the real assignment, then logs it
       self.old_lineno= n
       where = caller.first.split(/:/).first(2).join(":") # first two colon-separated fields of the immediate caller frame, normally file and line
-      d :lineno => [n, where, ss && ss.rest[0,40]]
+      d :lineno => [n, where, ss && ss.rest[0, 40]] # new value, call site, and up to 40 characters of unscanned input when ss is set
     end
   end
 end