RubyGems - fabiokung-ruby_parser - Versions diffs - 2.0.2 - Mend

fabiokung-ruby_parser 2.0.2

Files changed (15) hide show

data/.autotest +44 -0
data/History.txt +134 -0
data/Manifest.txt +13 -0
data/README.txt +86 -0
data/Rakefile +155 -0
data/bin/ruby_parse +88 -0
data/lib/gauntlet_rubyparser.rb +120 -0
data/lib/ruby_lexer.rb +1331 -0
data/lib/ruby_parser.rb +5545 -0
data/lib/ruby_parser.y +1790 -0
data/lib/ruby_parser_extras.rb +1031 -0
data/test/test_ruby_lexer.rb +1738 -0
data/test/test_ruby_parser.rb +466 -0
data/test/test_ruby_parser_extras.rb +177 -0
metadata +100 -0

data/lib/ruby_lexer.rb ADDED Viewed

@@ -0,0 +1,1331 @@
+$: << File.expand_path("~/Work/p4/zss/src/ParseTree/dev/lib") # for me, not you.
+require 'sexp'
+require 'ruby_parser_extras'
+class RubyLexer
+  # Flag that reset, a '(' and a newline token set to true; yylex copies it
+  # into a local (command_state) and then clears it.
+  attr_accessor :command_start
+  # StackState instances created in initialize (:cmdarg and :cond). Both get
+  # a false entry pushed by expr_beg_push.
+  attr_accessor :cmdarg
+  attr_accessor :cond
+  # Integer counter, starting at 0, that tokadd_string bumps when it sees the
+  # opening delimiter and lowers when it sees a nested terminator.
+  attr_accessor :nest
+  # Matches one backslash escape sequence; capture 1 is the text after the
+  # backslash (this is what yylex hands to unescape).
+  ESC_RE = /\\([0-7]{1,3}|x[0-9a-fA-F]{1,2}|M-.|(C-|c)\?|(C-|c).|[^0-7xMCc])/
+  # Additional context surrounding tokens that both the lexer and
+  # grammar use.
+  attr_reader :lex_state
+  # Description of the string-like literal currently being lexed, or nil.
+  # Built as [:strterm, type, term, open] or [:heredoc, eos, func, line].
+  attr_accessor :lex_strterm
+  attr_accessor :parser # HACK for very end of lexer... *sigh*
+  # Stream of data that yylex examines.
+  attr_reader :src
+  # Last token read via yylex.
+  attr_accessor :token
+  # Array of string fragments collected while lexing string content; the
+  # pieces are joined when a content token is produced.
+  attr_accessor :string_buffer
+  # Value of last token which had a value associated with it.
+  attr_accessor :yacc_value
+  # What handles warnings
+  attr_accessor :warnings
+  # Sentinel token that yylex returns when the input is exhausted.
+  EOF = :eof_haha!
+  # ruby constants for strings (should this be moved somewhere else?)
+  # The STR_FUNC_* values are single-bit flags that are OR-ed together.
+  STR_FUNC_BORING = 0x00
+  STR_FUNC_ESCAPE = 0x01 # TODO: remove and replace with REGEXP
+  STR_FUNC_EXPAND = 0x02
+  STR_FUNC_REGEXP = 0x04
+  STR_FUNC_AWORDS = 0x08
+  STR_FUNC_SYMBOL = 0x10
+  STR_FUNC_INDENT = 0x20 # <<-HEREDOC
+  # Flag combinations for each kind of literal. Note that STR_DQUOTE and
+  # STR_XQUOTE have the same numeric value.
+  STR_SQUOTE = STR_FUNC_BORING
+  STR_DQUOTE = STR_FUNC_BORING | STR_FUNC_EXPAND
+  STR_XQUOTE = STR_FUNC_BORING | STR_FUNC_EXPAND
+  STR_REGEXP = STR_FUNC_REGEXP | STR_FUNC_ESCAPE | STR_FUNC_EXPAND
+  STR_SSYM   = STR_FUNC_SYMBOL
+  STR_DSYM   = STR_FUNC_SYMBOL | STR_FUNC_EXPAND
+  # How the parser advances to the next token.
+  #
+  # @return true if not at end of file (EOF).
+  def advance
+    r = yylex
+    self.token = r
+    raise "yylex returned nil" unless r
+    return RubyLexer::EOF != r
+  end
+  def arg_ambiguous
+    self.warning("Ambiguous first argument. make sure.")
+  end
+  def comments
+    c = @comments.join
+    @comments.clear
+    c
+  end
+  def expr_beg_push val
+    cond.push false
+    cmdarg.push false
+    self.lex_state = :expr_beg
+    self.yacc_value = val
+  end
+  def fix_arg_lex_state
+    self.lex_state = if lex_state == :expr_fname || lex_state == :expr_dot
+                       :expr_arg
+                     else
+                       :expr_beg
+                     end
+  end
+  # Lex one piece of a heredoc body.
+  #
+  # here is the lex_strterm array built for a heredoc,
+  # [:heredoc, eos, func, last_line]: eos is the terminator text, func is a
+  # bitmask of STR_FUNC_* flags and last_line is the text that followed the
+  # heredoc identifier on its line (or nil), as saved by heredoc_identifier.
+  #
+  # Sets yacc_value and returns one of :tSTRING_END, :tSTRING_DVAR,
+  # :tSTRING_DBEG or :tSTRING_CONTENT. Reports a compile error when the
+  # input runs out before the terminator is found.
+  def heredoc here # 63 lines
+    _, eos, func, last_line = here
+    indent  = (func & STR_FUNC_INDENT) != 0
+    expand  = (func & STR_FUNC_EXPAND) != 0
+    # With the indent flag (<<-) the terminator may be preceded by blanks.
+    # NOTE(review): eos is interpolated into the pattern without
+    # Regexp.escape, so metacharacters in a quoted identifier would be
+    # treated as regexp syntax -- confirm whether that is intended.
+    eos_re  = indent ? /[ \t]*#{eos}(\r?\n|\z)/ : /#{eos}(\r?\n|\z)/
+    err_msg = "can't match #{eos_re.inspect} anywhere in "
+    rb_compile_error err_msg if
+      src.eos?
+    # Terminator line reached: put back the saved remainder of the line that
+    # introduced the heredoc so lexing resumes there, and close the string.
+    if src.beginning_of_line? && src.scan(eos_re) then
+      src.unread_many last_line # TODO: figure out how to remove this
+      self.yacc_value = eos
+      return :tSTRING_END
+    end
+    self.string_buffer = []
+    if expand then
+      # A leading '#' either starts an interpolation (returned as its own
+      # token) or is ordinary text.
+      case
+      when src.scan(/#[$@]/) then
+        src.pos -= 1 # FIX omg stupid
+        self.yacc_value = src.matched
+        return :tSTRING_DVAR
+      when src.scan(/#[{]/) then
+        self.yacc_value = src.matched
+        return :tSTRING_DBEG
+      when src.scan(/#/) then
+        string_buffer << '#'
+      end
+      # Collect content line by line. tokadd_string stops at a newline (its
+      # terminator here) or earlier; stopping anywhere else means the
+      # collected text must be emitted now.
+      until src.scan(eos_re) do
+        c = tokadd_string func, "\n", nil
+        rb_compile_error err_msg if
+          c == RubyLexer::EOF
+        if c != "\n" then
+          self.yacc_value = string_buffer.join.delete("\r")
+          return :tSTRING_CONTENT
+        else
+          string_buffer << src.scan(/\n/)
+        end
+        rb_compile_error err_msg if
+          src.eos?
+      end
+      # The loop above consumed the terminator; push it back so the next call
+      # takes the :tSTRING_END branch.
+      # tack on a NL after the heredoc token - FIX NL should not be needed
+      src.unread_many(eos + "\n") # TODO: remove this... stupid stupid stupid
+    else
+      # No interpolation: take whole lines verbatim. check does not consume,
+      # so the terminator is still in the input for the next call.
+      until src.check(eos_re) do
+        string_buffer << src.scan(/.*(\n|\z)/)
+        rb_compile_error err_msg if
+          src.eos?
+      end
+    end
+    # Stay in heredoc mode and hand back the collected text with carriage
+    # returns removed.
+    self.lex_strterm = [:heredoc, eos, func, last_line]
+    self.yacc_value = string_buffer.join.delete("\r")
+    return :tSTRING_CONTENT
+  end
+  # Try to read a heredoc identifier at the current position (yylex calls
+  # this right after scanning "<<").
+  #
+  # On success, sets lex_strterm to [:heredoc, identifier, func, line] and
+  # yacc_value to the opening quote character, and returns :tXSTRING_BEG for
+  # a backtick identifier or :tSTRING_BEG otherwise. Returns nil when no
+  # identifier form matches, and reports a compile error for an opening
+  # quote with no matching close.
+  def heredoc_identifier # 51 lines
+    term, func = nil, STR_FUNC_BORING
+    self.string_buffer = []
+    case
+    # Quoted identifier, optionally preceded by '-': <<'EOS', <<-"EOS", <<`EOS`
+    when src.scan(/(-?)(['"`])(.*?)\2/) then
+      term = src[2]
+      unless src[1].empty? then
+        func |= STR_FUNC_INDENT
+      end
+      # The quote character selects the string flavour.
+      func |= case term
+              when "\'" then
+                STR_SQUOTE
+              when '"' then
+                STR_DQUOTE
+              else
+                STR_XQUOTE
+              end
+      string_buffer << src[3]
+    # An opening quote that the branch above could not close.
+    when src.scan(/-?(['"`])(?!\1*\Z)/) then
+      rb_compile_error "unterminated here document identifier"
+    # Bare word identifier: treated like a double-quoted one.
+    when src.scan(/(-?)(\w+)/) then
+      term = '"'
+      func |= STR_DQUOTE
+      unless src[1].empty? then
+        func |= STR_FUNC_INDENT
+      end
+      string_buffer << src[2]
+    else
+      return nil
+    end
+    # Cut the remainder of the current line out of the underlying source
+    # string, leaving just a newline in its place, and step over that
+    # newline. The removed text is remembered so heredoc can push it back
+    # once the terminator has been seen.
+    if src.check(/.*\n/) then
+      # TODO: think about storing off the char range instead
+      line = src.string[src.pos, src.matched_size]
+      src.string[src.pos, src.matched_size] = "\n"
+      src.pos += 1
+    else
+      line = nil
+    end
+    self.lex_strterm = [:heredoc, string_buffer.join, func, line]
+    if term == '`' then
+      self.yacc_value = "`"
+      return :tXSTRING_BEG
+    else
+      self.yacc_value = "\""
+      return :tSTRING_BEG
+    end
+  end
+  def initialize
+    self.cond = StackState.new(:cond)
+    self.cmdarg = StackState.new(:cmdarg)
+    self.nest = 0
+    @comments = []
+    reset
+  end
+  def int_with_base base
+    rb_compile_error "Invalid numeric format" if src.matched =~ /__/
+    self.yacc_value = src.matched.to_i(base)
+    return :tINTEGER
+  end
+  def lex_state= o
+    raise "wtf?" unless Symbol === o
+    @lex_state = o
+  end
+  attr_writer :lineno
+  def lineno
+    @lineno ||= src.lineno
+  end
+  ##
+  # Parse a numeric literal from the input stream.
+  #
+  # Takes no arguments; the literal is scanned directly from src.
+  # @return A symbol which represents a token (:tINTEGER or :tFLOAT).
+  def parse_number
+    # A number always leaves the lexer in :expr_end. The branches below are
+    # tried in order and the first pattern that matches wins, so the more
+    # specific forms must stay ahead of the general ones.
+    self.lex_state = :expr_end
+    case
+    # Radix prefix immediately followed by a word boundary (no digits).
+    when src.scan(/[+-]?0[xbd]\b/) then
+      rb_compile_error "Invalid numeric format"
+    # Hexadecimal.
+    when src.scan(/[+-]?0x[a-f0-9_]+/i) then
+      int_with_base(16)
+    # Binary.
+    when src.scan(/[+-]?0b[01_]+/) then
+      int_with_base(2)
+    # Explicit decimal (0d prefix).
+    when src.scan(/[+-]?0d[0-9_]+/) then
+      int_with_base(10)
+    # Leading zero followed by octal digits and then an 8 or 9.
+    when src.scan(/[+-]?0o?[0-7_]*[89]/) then
+      rb_compile_error "Illegal octal digit."
+    # Octal, with or without the 'o'; a bare "0o" is also accepted here.
+    when src.scan(/[+-]?0o?[0-7_]+|0o/) then
+      int_with_base(8)
+    # Underscore directly before an exponent marker or decimal point.
+    when src.scan(/[+-]?[\d_]+_(e|\.)/) then
+      rb_compile_error "Trailing '_' in number."
+    # Float: digits.digits with optional exponent, or digits with exponent.
+    when src.scan(/[+-]?[\d_]+\.[\d_]+(e[+-]?[\d_]+)?\b|[+-]?[\d_]+e[+-]?[\d_]+\b/i) then
+      number = src.matched
+      if number =~ /__/ then
+        rb_compile_error "Invalid numeric format"
+      end
+      self.yacc_value = number.to_f
+      :tFLOAT
+    # A lone zero.
+    when src.scan(/[+-]?0\b/) then
+      int_with_base(10)
+    # Plain decimal integer.
+    when src.scan(/[+-]?[\d_]+\b/) then
+      int_with_base(10)
+    else
+      rb_compile_error "Bad number format"
+    end
+  end
+  def parse_quote # 58 lines
+    beg, nnd, short_hand, c = nil, nil, false, nil
+    if src.scan(/[a-z0-9]{1,2}/i) then # Long-hand (e.g. %Q{}).
+      rb_compile_error "unknown type of %string" if src.matched_size == 2
+      c, beg, short_hand = src.matched, src.getch, false
+    else                               # Short-hand (e.g. %{, %., %!, etc)
+      c, beg, short_hand = 'Q', src.getch, true
+    end
+    if src.eos? or c == RubyLexer::EOF or beg == RubyLexer::EOF then
+      rb_compile_error "unterminated quoted string meets end of file"
+    end
+    # Figure nnd-char.  "\0" is special to indicate beg=nnd and that no nesting?
+    nnd = { "(" => ")", "[" => "]", "{" => "}", "<" => ">" }[beg]
+    nnd, beg = beg, "\0" if nnd.nil?
+    token_type, self.yacc_value = nil, "%#{c}#{beg}"
+    token_type, string_type = case c
+                              when 'Q' then
+                                ch = short_hand ? nnd : c + beg
+                                self.yacc_value = "%#{ch}"
+                                [:tSTRING_BEG,   STR_DQUOTE]
+                              when 'q' then
+                                [:tSTRING_BEG,   STR_SQUOTE]
+                              when 'W' then
+                                src.scan(/\s*/)
+                                [:tWORDS_BEG,    STR_DQUOTE | STR_FUNC_AWORDS]
+                              when 'w' then
+                                src.scan(/\s*/)
+                                [:tAWORDS_BEG,   STR_SQUOTE | STR_FUNC_AWORDS]
+                              when 'x' then
+                                [:tXSTRING_BEG,  STR_XQUOTE]
+                              when 'r' then
+                                [:tREGEXP_BEG,   STR_REGEXP]
+                              when 's' then
+                                self.lex_state  = :expr_fname
+                                [:tSYMBEG,       STR_SSYM]
+                              end
+    rb_compile_error "Bad %string type. Expected [Qqwxr\W], found '#{c}'." if
+      token_type.nil?
+    self.lex_strterm = [:strterm, string_type, nnd, beg]
+    return token_type
+  end
+  def parse_string(quote) # 65 lines
+    _, string_type, term, open = quote
+    space = false # FIX: remove these
+    func = string_type
+    paren = open
+    term_re = Regexp.escape term
+    awords = (func & STR_FUNC_AWORDS) != 0
+    regexp = (func & STR_FUNC_REGEXP) != 0
+    expand = (func & STR_FUNC_EXPAND) != 0
+    unless func then # FIX: impossible, prolly needs == 0
+      self.lineno = nil
+      return :tSTRING_END
+    end
+    space = true if awords and src.scan(/\s+/)
+    if self.nest == 0 && src.scan(/#{term_re}/) then
+      if awords then
+        quote[1] = nil
+        return :tSPACE
+      elsif regexp then
+        self.yacc_value = self.regx_options
+        self.lineno = nil
+        return :tREGEXP_END
+      else
+        self.yacc_value = term
+        self.lineno = nil
+        return :tSTRING_END
+      end
+    end
+    if space then
+      return :tSPACE
+    end
+    self.string_buffer = []
+    if expand
+      case
+      when src.scan(/#(?=[$@])/) then
+        return :tSTRING_DVAR
+      when src.scan(/#[{]/) then
+        return :tSTRING_DBEG
+      when src.scan(/#/) then
+        string_buffer << '#'
+      end
+    end
+    if tokadd_string(func, term, paren) == RubyLexer::EOF then
+      rb_compile_error "unterminated string meets end of file"
+    end
+    self.yacc_value = string_buffer.join
+    return :tSTRING_CONTENT
+  end
+  def rb_compile_error msg
+    msg += ". near line #{self.lineno}: #{src.rest[/^.*/].inspect}"
+    raise SyntaxError, msg
+  end
+  # Decode one escape sequence from the input and return the resulting
+  # string. The callers visible in this file scan the backslash first, so
+  # the scanner is positioned on the character after it.
+  #
+  # Branch order matters: the first pattern that matches is used. Reports a
+  # compile error for a malformed \M, \C, \c, \x or digit escape and when
+  # the input ends right after the backslash.
+  def read_escape # 51 lines
+    case
+    when src.scan(/\\/) then                  # Backslash
+      '\\'
+    when src.scan(/n/) then                   # newline
+      "\n"
+    when src.scan(/t/) then                   # horizontal tab
+      "\t"
+    when src.scan(/r/) then                   # carriage-return
+      "\r"
+    when src.scan(/f/) then                   # form-feed
+      "\f"
+    when src.scan(/v/) then                   # vertical tab
+      "\13"
+    when src.scan(/a/) then                   # alarm(bell)
+      "\007"
+    when src.scan(/e/) then                   # escape
+      "\033"
+    when src.scan(/b/) then                   # backspace
+      "\010"
+    when src.scan(/s/) then                   # space
+      " "
+    when src.scan(/[0-7]{1,3}/) then          # octal constant
+      src.matched.to_i(8).chr
+    when src.scan(/x([0-9a-fA-F]{1,2})/) then # hex constant
+      src[1].to_i(16).chr
+    # \M-\<escape>: decode the nested escape, then set bit 0x80 on its
+    # first character.
+    when src.scan(/M-\\/) then
+      c = self.read_escape
+      c[0] = (c[0].ord | 0x80).chr
+      c
+    # \M-<char>: set bit 0x80 on the character.
+    when src.scan(/M-(.)/) then
+      c = src[1]
+      c[0] = (c[0].ord | 0x80).chr
+      c
+    # \C-\<escape> or \c\<escape>: decode the nested escape, then mask its
+    # first character with 0x9f (clears bits 0x20 and 0x40).
+    when src.scan(/C-\\|c\\/) then
+      c = self.read_escape
+      c[0] = (c[0].ord & 0x9f).chr
+      c
+    # \C-? and \c? both yield octal 177.
+    when src.scan(/C-\?|c\?/) then
+      0177.chr
+    # \C-<char> or \c<char>: mask the character with 0x9f.
+    when src.scan(/(C-|c)(.)/) then
+      c = src[2]
+      c[0] = (c[0].ord & 0x9f).chr
+      c
+    # Anything still starting with one of these characters did not form a
+    # complete escape above.
+    when src.scan(/[McCx0-9]/) || src.eos? then
+      rb_compile_error("Invalid escape character syntax")
+    else
+      # Any other character stands for itself.
+      src.getch
+    end
+  end
+  def regx_options # 15 lines
+    good, bad = [], []
+    if src.scan(/[a-z]+/) then
+      good, bad = src.matched.split(//).partition { |s| s =~ /^[ixmonesu]$/ }
+    end
+    unless bad.empty? then
+      rb_compile_error("unknown regexp option%s - %s" %
+                       [(bad.size > 1 ? "s" : ""), bad.join.inspect])
+    end
+    return good.join
+  end
+  def reset
+    self.command_start = true
+    self.lex_strterm   = nil
+    self.token         = nil
+    self.yacc_value    = nil
+    @src       = nil
+    @lex_state = nil
+  end
+  def src= src
+    raise "bad src: #{src.inspect}" unless String === src
+    @src = RPStringScanner.new(src)
+  end
+  # Copy one escape sequence, backslash included, from the input into
+  # string_buffer without decoding it (tokadd_string uses this for regexp
+  # literals). A backslash-newline pair is consumed and dropped instead.
+  #
+  # term is only passed along to the recursive call. Reports a compile
+  # error for an incomplete \M, \C, \c or \x escape, or when nothing
+  # follows the backslash.
+  def tokadd_escape term # 20 lines
+    case
+    when src.scan(/\\\n/) then
+      # just ignore
+    # Octal or hex escape.
+    when src.scan(/\\([0-7]{1,3}|x[0-9a-fA-F]{1,2})/) then
+      self.string_buffer << src.matched
+    # Meta/control prefix followed by another escape: keep the prefix and
+    # recurse for the nested escape.
+    when src.scan(/\\([MC]-|c)(?=\\)/) then
+      self.string_buffer << src.matched
+      self.tokadd_escape term
+    # Meta/control prefix followed by a plain character.
+    when src.scan(/\\([MC]-|c)(.)/) then
+      self.string_buffer << src.matched
+    # An M, c, C or x that did not complete one of the forms above.
+    when src.scan(/\\[McCx]/) then
+      rb_compile_error "Invalid escape character syntax"
+    # Any other escaped character (the /m flag lets '.' match a newline).
+    when src.scan(/\\(.)/m) then
+      self.string_buffer << src.matched
+    else
+      rb_compile_error "Invalid escape character syntax"
+    end
+  end
+  # Append string content from the input to string_buffer until something
+  # that needs separate handling is reached: the terminator at nesting
+  # level zero, whitespace in a word list, or the start of an interpolation.
+  #
+  # func is a bitmask of STR_FUNC_* flags, term is the terminator and paren
+  # is the opening delimiter (nil, or "\0" when the literal has no distinct
+  # opener). Returns the last chunk handled, or RubyLexer::EOF when the
+  # input is exhausted.
+  def tokadd_string(func, term, paren) # 105 lines
+    awords = (func & STR_FUNC_AWORDS) != 0
+    escape = (func & STR_FUNC_ESCAPE) != 0
+    expand = (func & STR_FUNC_EXPAND) != 0
+    regexp = (func & STR_FUNC_REGEXP) != 0
+    symbol = (func & STR_FUNC_SYMBOL) != 0
+    paren_re = paren.nil? ? nil : Regexp.new(Regexp.escape(paren))
+    term_re  = Regexp.new(Regexp.escape(term))
+    until src.eos? do
+      c = nil
+      handled = true
+      case
+      # Terminator outside any nesting: step back over it and stop, leaving
+      # it for the caller.
+      # NOTE(review): pos -= 1 assumes the terminator is one position wide.
+      when self.nest == 0 && src.scan(term_re) then
+        src.pos -= 1
+        break
+      # Nested opener / nested terminator adjust the nesting counter; the
+      # delimiter itself is appended below via src.matched.
+      when paren_re && src.scan(paren_re) then
+        self.nest += 1
+      when src.scan(term_re) then
+        self.nest -= 1
+      # Whitespace ends the current element of a word list.
+      when awords && src.scan(/\s/) then
+        src.pos -= 1
+        break
+      # '#' followed by $, @ or { begins an interpolation: leave it unread.
+      when expand && src.scan(/#(?=[\$\@\{])/) then
+        src.pos -= 1
+        break
+      # Any other '#' not followed by a newline is ordinary text.
+      when expand && src.scan(/#(?!\n)/) then
+        # do nothing
+      # Backslash handling; which rule applies depends on the string flags,
+      # and the rules are tried in this order.
+      when src.check(/\\/) then
+        case
+        when awords && src.scan(/\\\n/) then
+          string_buffer << "\n"
+          next
+        when awords && src.scan(/\\\s/) then
+          c = ' '
+        when expand && src.scan(/\\\n/) then
+          next
+        # Regexp literals keep their escapes undecoded.
+        when regexp && src.check(/\\/) then
+          self.tokadd_escape term
+          next
+        # Interpolating strings decode the escape.
+        when expand && src.scan(/\\/) then
+          c = self.read_escape
+        when src.scan(/\\\n/) then
+          # do nothing
+        when src.scan(/\\\\/) then
+          string_buffer << '\\' if escape
+          c = '\\'
+        when src.scan(/\\/) then
+          unless src.scan(term_re) || paren.nil? || src.scan(paren_re) then
+            string_buffer << "\\"
+          end
+        else
+          handled = false
+        end
+      else
+        handled = false
+      end # case
+      # Nothing special at this position: take a run of characters that
+      # cannot be a delimiter, '#', NUL or backslash (or, in a word list,
+      # newline or space) -- or failing that a single character.
+      unless handled then
+        t = Regexp.escape term
+        x = Regexp.escape(paren) if paren && paren != "\000"
+        re = if awords then
+               /[^#{t}#{x}\#\0\\\n\ ]+|./ # |. to pick up whatever
+             else
+               /[^#{t}#{x}\#\0\\]+|./
+             end
+        src.scan re
+        c = src.matched
+        rb_compile_error "symbol cannot contain '\\0'" if symbol && c =~ /\0/
+      end # unless handled
+      # Branches that did not set c contribute the text they scanned.
+      c ||= src.matched
+      string_buffer << c
+    end # until
+    c ||= src.matched
+    c = RubyLexer::EOF if src.eos?
+    return c
+  end
+  def unescape s
+    r = {
+      "a"    => "\007",
+      "b"    => "\010",
+      "e"    => "\033",
+      "f"    => "\f",
+      "n"    => "\n",
+      "r"    => "\r",
+      "s"    => " ",
+      "t"    => "\t",
+      "v"    => "\13",
+      "\\"   => '\\',
+      "\n"   => "",
+      "C-\?" => 0177.chr,
+      "c\?"  => 0177.chr,
+    }[s]
+    return r if r
+    case s
+    when /^[0-7]{1,3}/ then
+      $&.to_i(8).chr
+    when /^x([0-9a-fA-F]{1,2})/ then
+      $1.to_i(16).chr
+    when /^M-(.)/ then
+      ($1[0].ord | 0x80).chr
+    when /^(C-|c)(.)/ then
+      ($2[0].ord & 0x9f).chr
+    when /^[McCx0-9]/ then
+      rb_compile_error("Invalid escape character syntax")
+    else
+      s
+    end
+  end
+  def warning s
+    # do nothing for now
+  end
+  ##
+  # Returns the next token. Also sets yacc_value if needed.
+  #
+  # @return The token type as a symbol, or RubyLexer::EOF at end of input.
+  def yylex # 826 lines
+    c = ''
+    space_seen = false
+    command_state = false
+    src = self.src
+    self.token = nil
+    self.yacc_value = nil
+    return yylex_string if lex_strterm
+    command_state = self.command_start
+    self.command_start = false
+    last_state = lex_state
+    loop do # START OF CASE
+      if src.scan(/\ |\t|\r|\f|\13/) then # white spaces, 13 = '\v
+        space_seen = true
+        next
+      elsif src.check(/[^a-zA-Z]/) then
+        if src.scan(/\n|#/) then
+          self.lineno = nil
+          c = src.matched
+          if c == '#' then
+            src.unread c # ok
+            while src.scan(/\s*#.*(\n+|\z)/) do
+              @comments << src.matched.gsub(/^ +#/, '#').gsub(/^ +$/, '')
+            end
+            if src.eos? then
+              return RubyLexer::EOF
+            end
+          end
+          # Replace a string of newlines with a single one
+          src.scan(/\n+/)
+          if [:expr_beg, :expr_fname,
+              :expr_dot, :expr_class].include? lex_state then
+            next
+          end
+          self.command_start = true
+          self.lex_state = :expr_beg
+          return :tNL
+        elsif src.scan(/[\]\)\}]/) then
+          cond.lexpop
+          cmdarg.lexpop
+          self.lex_state = :expr_end
+          self.yacc_value = src.matched
+          result = {
+            ")" => :tRPAREN,
+            "]" => :tRBRACK,
+            "}" => :tRCURLY
+          }[src.matched]
+          return result
+        elsif src.check(/\./) then
+          if src.scan(/\.\.\./) then
+            self.lex_state = :expr_beg
+            self.yacc_value = "..."
+            return :tDOT3
+          elsif src.scan(/\.\./) then
+            self.lex_state = :expr_beg
+            self.yacc_value = ".."
+            return :tDOT2
+          elsif src.scan(/\.\d/) then
+            rb_compile_error "no .<digit> floating literal anymore put 0 before dot"
+          elsif src.scan(/\./) then
+            self.lex_state = :expr_dot
+            self.yacc_value = "."
+            return :tDOT
+          end
+        elsif src.scan(/\,/) then
+          self.lex_state = :expr_beg
+          self.yacc_value = ","
+          return :tCOMMA
+        elsif src.scan(/\(/) then
+          result = :tLPAREN2
+          self.command_start = true
+          if lex_state == :expr_beg || lex_state == :expr_mid then
+            result = :tLPAREN
+          elsif space_seen then
+            if lex_state == :expr_cmdarg then
+              result = :tLPAREN_ARG
+            elsif lex_state == :expr_arg then
+              warning("don't put space before argument parentheses")
+              result = :tLPAREN2
+            end
+          end
+          self.expr_beg_push "("
+          return result
+        elsif src.check(/\=/) then
+          if src.scan(/\=\=\=/) then
+            self.fix_arg_lex_state
+            self.yacc_value = "==="
+            return :tEQQ
+          elsif src.scan(/\=\=/) then
+            self.fix_arg_lex_state
+            self.yacc_value = "=="
+            return :tEQ
+          elsif src.scan(/\=~/) then
+            self.fix_arg_lex_state
+            self.yacc_value = "=~"
+            return :tMATCH
+          elsif src.scan(/\=>/) then
+            self.fix_arg_lex_state
+            self.yacc_value = "=>"
+            return :tASSOC
+          elsif src.scan(/\=/) then
+            if src.was_begin_of_line and src.scan(/begin(?=\s)/) then
+              @comments << '=' << src.matched
+              unless src.scan(/.*?\n=end\s*(\n|\z)/m) then
+                @comments.clear
+                rb_compile_error("embedded document meets end of file")
+              end
+              @comments << src.matched
+              next
+            else
+              self.fix_arg_lex_state
+              self.yacc_value = '='
+              return :tEQL
+            end
+          end
+        elsif src.scan(/\"(#{ESC_RE}|#(#{ESC_RE}|[^\{\#\@\$\"\\])|[^\"\\\#])*\"/o) then
+          self.yacc_value = src.matched[1..-2].gsub(ESC_RE) { unescape $1 }
+          self.lex_state = :expr_end
+          return :tSTRING
+        elsif src.scan(/\"/) then # FALLBACK
+          self.lex_strterm = [:strterm, STR_DQUOTE, '"', "\0"] # TODO: question this
+          self.yacc_value = "\""
+          return :tSTRING_BEG
+        elsif src.scan(/\@\@?\w*/) then
+          self.token = src.matched
+          rb_compile_error "`#{token}` is not allowed as a variable name" if
+            token =~ /\@\d/
+          return process_token(command_state)
+        elsif src.scan(/\:\:/) then
+          if (lex_state == :expr_beg ||
+              lex_state == :expr_mid ||
+              lex_state == :expr_class ||
+              (lex_state.is_argument && space_seen)) then
+            self.lex_state = :expr_beg
+            self.yacc_value = "::"
+            return :tCOLON3
+          end
+          self.lex_state = :expr_dot
+          self.yacc_value = "::"
+          return :tCOLON2
+        elsif lex_state != :expr_end && lex_state != :expr_endarg && src.scan(/:([a-zA-Z_]\w*(?:[?!]|=(?!>))?)/) then
+          self.yacc_value = src[1]
+          self.lex_state = :expr_end
+          return :tSYMBOL
+        elsif src.scan(/\:/) then
+          # ?: / then / when
+          if (lex_state == :expr_end || lex_state == :expr_endarg||
+              src.check(/\s/)) then
+            self.lex_state = :expr_beg
+            self.yacc_value = ":"
+            return :tCOLON
+          end
+          case
+          when src.scan(/\'/) then
+            self.lex_strterm = [:strterm, STR_SSYM, src.matched, "\0"]
+          when src.scan(/\"/) then
+            self.lex_strterm = [:strterm, STR_DSYM, src.matched, "\0"]
+          end
+          self.lex_state = :expr_fname
+          self.yacc_value = ":"
+          return :tSYMBEG
+        elsif src.check(/[0-9]/) then
+          return parse_number
+        elsif src.scan(/\[/) then
+          result = src.matched
+          if lex_state == :expr_fname || lex_state == :expr_dot then
+            self.lex_state = :expr_arg
+            case
+            when src.scan(/\]\=/) then
+              self.yacc_value = "[]="
+              return :tASET
+            when src.scan(/\]/) then
+              self.yacc_value = "[]"
+              return :tAREF
+            else
+              rb_compile_error "unexpected '['"
+            end
+          elsif lex_state == :expr_beg || lex_state == :expr_mid then
+            result = :tLBRACK
+          elsif lex_state.is_argument && space_seen then
+            result = :tLBRACK
+          end
+          self.expr_beg_push "["
+          return result
+        elsif src.scan(/\'(\\.|[^\'])*\'/) then
+          self.yacc_value = src.matched[1..-2].gsub(/\\\\/, "\\").gsub(/\\'/, "'")
+          self.lex_state = :expr_end
+          return :tSTRING
+        elsif src.check(/\|/) then
+          if src.scan(/\|\|\=/) then
+            self.lex_state = :expr_beg
+            self.yacc_value = "||"
+            return :tOP_ASGN
+          elsif src.scan(/\|\|/) then
+            self.lex_state = :expr_beg
+            self.yacc_value = "||"
+            return :tOROP
+          elsif src.scan(/\|\=/) then
+            self.lex_state = :expr_beg
+            self.yacc_value = "|"
+            return :tOP_ASGN
+          elsif src.scan(/\|/) then
+            self.fix_arg_lex_state
+            self.yacc_value = "|"
+            return :tPIPE
+          end
+        elsif src.scan(/\{/) then
+          result = if lex_state.is_argument || lex_state == :expr_end then
+                     :tLCURLY      #  block (primary)
+                   elsif lex_state == :expr_endarg then
+                     :tLBRACE_ARG  #  block (expr)
+                   else
+                     :tLBRACE      #  hash
+                   end
+          self.expr_beg_push "{"
+          return result
+        elsif src.scan(/[+-]/) then
+          sign = src.matched
+          utype, type = if sign == "+" then
+                          [:tUPLUS, :tPLUS]
+                        else
+                          [:tUMINUS, :tMINUS]
+                        end
+          if lex_state == :expr_fname || lex_state == :expr_dot then
+            self.lex_state = :expr_arg
+            if src.scan(/@/) then
+              self.yacc_value = "#{sign}@"
+              return utype
+            else
+              self.yacc_value = sign
+              return type
+            end
+          end
+          if src.scan(/\=/) then
+            self.lex_state = :expr_beg
+            self.yacc_value = sign
+            return :tOP_ASGN
+          end
+          if (lex_state == :expr_beg || lex_state == :expr_mid ||
+              (lex_state.is_argument && space_seen && !src.check(/\s/))) then
+            if lex_state.is_argument then
+              arg_ambiguous
+            end
+            self.lex_state = :expr_beg
+            self.yacc_value = sign
+            if src.check(/\d/) then
+              if utype == :tUPLUS then
+                return self.parse_number
+              else
+                return :tUMINUS_NUM
+              end
+            end
+            return utype
+          end
+          self.lex_state = :expr_beg
+          self.yacc_value = sign
+          return type
+        elsif src.check(/\*/) then
+          if src.scan(/\*\*=/) then
+            self.lex_state = :expr_beg
+            self.yacc_value = "**"
+            return :tOP_ASGN
+          elsif src.scan(/\*\*/) then
+            self.yacc_value = "**"
+            self.fix_arg_lex_state
+            return :tPOW
+          elsif src.scan(/\*\=/) then
+            self.lex_state = :expr_beg
+            self.yacc_value = "*"
+            return :tOP_ASGN
+          elsif src.scan(/\*/) then
+            result = if lex_state.is_argument && space_seen && src.check(/\S/) then
+                       warning("`*' interpreted as argument prefix")
+                       :tSTAR
+                     elsif lex_state == :expr_beg || lex_state == :expr_mid then
+                       :tSTAR
+                     else
+                       :tSTAR2
+                     end
+            self.yacc_value = "*"
+            self.fix_arg_lex_state
+            return result
+          end
+        elsif src.check(/\!/) then
+          if src.scan(/\!\=/) then
+            self.lex_state = :expr_beg
+            self.yacc_value = "!="
+            return :tNEQ
+          elsif src.scan(/\!~/) then
+            self.lex_state = :expr_beg
+            self.yacc_value = "!~"
+            return :tNMATCH
+          elsif src.scan(/\!/) then
+            self.lex_state = :expr_beg
+            self.yacc_value = "!"
+            return :tBANG
+          end
+        elsif src.check(/\</) then
+          if src.scan(/\<\=\>/) then
+            self.fix_arg_lex_state
+            self.yacc_value = "<=>"
+            return :tCMP
+          elsif src.scan(/\<\=/) then
+            self.fix_arg_lex_state
+            self.yacc_value = "<="
+            return :tLEQ
+          elsif src.scan(/\<\<\=/) then
+            self.fix_arg_lex_state
+            self.lex_state = :expr_beg
+            self.yacc_value = "\<\<"
+            return :tOP_ASGN
+          elsif src.scan(/\<\</) then
+            if (! [:expr_end,    :expr_dot,
+                   :expr_endarg, :expr_class].include?(lex_state) &&
+                (!lex_state.is_argument || space_seen)) then
+              tok = self.heredoc_identifier
+              if tok then
+                return tok
+              end
+            end
+            self.fix_arg_lex_state
+            self.yacc_value = "\<\<"
+            return :tLSHFT
+          elsif src.scan(/\</) then
+            self.fix_arg_lex_state
+            self.yacc_value = "<"
+            return :tLT
+          end
+        elsif src.check(/\>/) then
+          if src.scan(/\>\=/) then
+            self.fix_arg_lex_state
+            self.yacc_value = ">="
+            return :tGEQ
+          elsif src.scan(/\>\>=/) then
+            self.fix_arg_lex_state
+            self.lex_state = :expr_beg
+            self.yacc_value = ">>"
+            return :tOP_ASGN
+          elsif src.scan(/\>\>/) then
+            self.fix_arg_lex_state
+            self.yacc_value = ">>"
+            return :tRSHFT
+          elsif src.scan(/\>/) then
+            self.fix_arg_lex_state
+            self.yacc_value = ">"
+            return :tGT
+          end
+        elsif src.scan(/\`/) then
+          self.yacc_value = "`"
+          case lex_state
+          when :expr_fname then
+            self.lex_state = :expr_end
+            return :tBACK_REF2
+          when :expr_dot then
+            self.lex_state = if command_state then
+                               :expr_cmdarg
+                             else
+                               :expr_arg
+                             end
+            return :tBACK_REF2
+          end
+          self.lex_strterm = [:strterm, STR_XQUOTE, '`', "\0"]
+          return :tXSTRING_BEG
+        elsif src.scan(/\?/) then
+          if lex_state == :expr_end || lex_state == :expr_endarg then
+            self.lex_state = :expr_beg
+            self.yacc_value = "?"
+            return :tEH
+          end
+          if src.eos? then
+            rb_compile_error "incomplete character syntax"
+          end
+          if src.check(/\s|\v/) then
+            unless lex_state.is_argument then
+              c2 = { " " => 's',
+                    "\n" => 'n',
+                    "\t" => 't',
+                    "\v" => 'v',
+                    "\r" => 'r',
+                    "\f" => 'f' }[src.matched]
+              if c2 then
+                warning("invalid character syntax; use ?\\" + c2)
+              end
+            end
+            # ternary
+            self.lex_state = :expr_beg
+            self.yacc_value = "?"
+            return :tEH
+          elsif src.check(/\w(?=\w)/) then # ternary, also
+            self.lex_state = :expr_beg
+            self.yacc_value = "?"
+            return :tEH
+          end
+          c = if src.scan(/\\/) then
+                self.read_escape
+              else
+                src.getch
+              end
+          self.lex_state = :expr_end
+          self.yacc_value = c[0].ord & 0xff
+          return :tINTEGER
+        elsif src.check(/\&/) then
+          if src.scan(/\&\&\=/) then
+            self.yacc_value = "&&"
+            self.lex_state = :expr_beg
+            return :tOP_ASGN
+          elsif src.scan(/\&\&/) then
+            self.lex_state = :expr_beg
+            self.yacc_value = "&&"
+            return :tANDOP
+          elsif src.scan(/\&\=/) then
+            self.yacc_value = "&"
+            self.lex_state = :expr_beg
+            return :tOP_ASGN
+          elsif src.scan(/&/) then
+            result = if lex_state.is_argument && space_seen &&
+                         !src.check(/\s/) then
+                       warning("`&' interpreted as argument prefix")
+                       :tAMPER
+                     elsif lex_state == :expr_beg || lex_state == :expr_mid then
+                       :tAMPER
+                     else
+                       :tAMPER2
+                     end
+            self.fix_arg_lex_state
+            self.yacc_value = "&"
+            return result
+          end
+        elsif src.scan(/\//) then
+          if lex_state == :expr_beg || lex_state == :expr_mid then
+            self.lex_strterm = [:strterm, STR_REGEXP, '/', "\0"]
+            self.yacc_value = "/"
+            return :tREGEXP_BEG
+          end
+          if src.scan(/\=/) then
+            self.yacc_value = "/"
+            self.lex_state = :expr_beg
+            return :tOP_ASGN
+          end
+          if lex_state.is_argument && space_seen then
+            unless src.scan(/\s/) then
+              arg_ambiguous
+              self.lex_strterm = [:strterm, STR_REGEXP, '/', "\0"]
+              self.yacc_value = "/"
+              return :tREGEXP_BEG
+            end
+          end
+          self.fix_arg_lex_state
+          self.yacc_value = "/"
+          return :tDIVIDE
+        elsif src.scan(/\^=/) then
+          self.lex_state = :expr_beg
+          self.yacc_value = "^"
+          return :tOP_ASGN
+        elsif src.scan(/\^/) then
+          self.fix_arg_lex_state
+          self.yacc_value = "^"
+          return :tCARET
+        elsif src.scan(/\;/) then
+          self.command_start = true
+          self.lex_state = :expr_beg
+          self.yacc_value = ";"
+          return :tSEMI
+        elsif src.scan(/\~/) then
+          if lex_state == :expr_fname || lex_state == :expr_dot then
+            src.scan(/@/)
+          end
+          self.fix_arg_lex_state
+          self.yacc_value = "~"
+          return :tTILDE
+        elsif src.scan(/\\/) then
+          if src.scan(/\n/) then
+            self.lineno = nil
+            space_seen = true
+            next
+          end
+          rb_compile_error "bare backslash only allowed before newline"
+        elsif src.scan(/\%/) then
+          if lex_state == :expr_beg || lex_state == :expr_mid then
+            return parse_quote
+          end
+          if src.scan(/\=/) then
+            self.lex_state = :expr_beg
+            self.yacc_value = "%"
+            return :tOP_ASGN
+          end
+          if lex_state.is_argument && space_seen && ! src.check(/\s/) then
+            return parse_quote
+          end
+          self.fix_arg_lex_state
+          self.yacc_value = "%"
+          return :tPERCENT
+        elsif src.check(/\$/) then
+          if src.scan(/(\$_)(\w+)/) then
+            self.lex_state = :expr_end
+            self.token = src.matched
+            return process_token(command_state)
+          elsif src.scan(/\$_/) then
+            self.lex_state = :expr_end
+            self.token = src.matched
+            self.yacc_value = src.matched
+            return :tGVAR
+          elsif src.scan(/\$[~*$?!@\/\\;,.=:<>\"]|\$-\w?/) then
+            self.lex_state = :expr_end
+            self.yacc_value = src.matched
+            return :tGVAR
+          elsif src.scan(/\$([\&\`\'\+])/) then
+            self.lex_state = :expr_end
+            # Explicit reference to these vars as symbols...
+            if last_state == :expr_fname then
+              self.yacc_value = src.matched
+              return :tGVAR
+            else
+              self.yacc_value = src[1].to_sym
+              return :tBACK_REF
+            end
+          elsif src.scan(/\$([1-9]\d*)/) then
+            self.lex_state = :expr_end
+            if last_state == :expr_fname then
+              self.yacc_value = src.matched
+              return :tGVAR
+            else
+              self.yacc_value = src[1].to_i
+              return :tNTH_REF
+            end
+          elsif src.scan(/\$0/) then
+            self.lex_state = :expr_end
+            self.token = src.matched
+            return process_token(command_state)
+          elsif src.scan(/\$\W|\$\z/) then # TODO: remove?
+            self.lex_state = :expr_end
+            self.yacc_value = "$"
+            return "$"
+          elsif src.scan(/\$\w+/)
+            self.lex_state = :expr_end
+            self.token = src.matched
+            return process_token(command_state)
+          end
+        elsif src.check(/\_/) then
+          if src.beginning_of_line? && src.scan(/\__END__(\n|\Z)/) then
+            self.lineno = nil
+            return RubyLexer::EOF
+          elsif src.scan(/\_\w*/) then
+            self.token = src.matched
+            return process_token(command_state)
+          end
+        end
+      end # END OF CASE
+      if src.scan(/\004|\032|\000/) || src.eos? then # ^D, ^Z, EOF
+        return RubyLexer::EOF
+      else # alpha check
+        if src.scan(/\W/) then
+          rb_compile_error "Invalid char #{src.matched.inspect} in expression"
+        end
+      end
+      self.token = src.matched if self.src.scan(/\w+/)
+      return process_token(command_state)
+    end
+  end
+  # Classifies the identifier-like text currently held in +token+, updates
+  # +lex_state+ and +yacc_value+ to match, and returns the token type.
+  #
+  # command_state:: when truthy, a plain identifier moves the lexer to
+  #                 :expr_cmdarg instead of :expr_arg. Presumably this is the
+  #                 caller's snapshot of +command_start+ -- confirm in yylex.
+  #
+  # Returns :tGVAR, :tCVAR, :tIVAR, :tFID, :tIDENTIFIER or :tCONSTANT, or,
+  # for reserved words, an id taken from the Keyword entry (with
+  # :kDO_COND / :kDO_BLOCK / :kDO chosen specially for +do+).
+  def process_token(command_state)
+    # A token starting with a word character absorbs one trailing "!" or "?"
+    # (e.g. "empty?"), unless that character is immediately followed by "=".
+    token << src.matched if token =~ /^\w/ && src.scan(/[\!\?](?!=)/)
+    result = nil
+    # Remember the state on entry; the final local-variable check below must
+    # look at this value, not at whatever lex_state has been changed to.
+    last_state = lex_state
+    case token
+    when /^\$/ then
+      self.lex_state, result = :expr_end, :tGVAR
+    when /^@@/ then
+      # Must be tested before the single "@" pattern, which would also match.
+      self.lex_state, result = :expr_end, :tCVAR
+    when /^@/ then
+      self.lex_state, result = :expr_end, :tIVAR
+    else
+      if token =~ /[!?]$/ then
+        result = :tFID
+      else
+        if lex_state == :expr_fname then
+          # In method-name position, absorb a trailing "=" so that "ident="
+          # lexes as a single name. The "=" is left alone when it starts
+          # "=~", "=>" or "==", except that "==>" is read as "ident=" + "=>".
+          # TODO test lexing of a=>b vs a==>b
+          if src.scan(/=(?:(?![~>=])|(?==>))/) then
+            result = :tIDENTIFIER
+            token << src.matched
+          end
+        end
+        # Anything not already classified is a constant when it starts with
+        # an ASCII uppercase letter, otherwise an ordinary identifier.
+        result ||= if token =~ /^[A-Z]/ then
+                     :tCONSTANT
+                   else
+                     :tIDENTIFIER
+                   end
+      end
+      # After a "." the text is never looked up as a keyword.
+      unless lex_state == :expr_dot then
+        # See if it is a reserved word.
+        keyword = Keyword.keyword token
+        if keyword then
+          # Keep the pre-keyword state: the choices below depend on it, and
+          # lex_state is overwritten on the next line.
+          state           = lex_state
+          self.lex_state  = keyword.state
+          self.yacc_value = token
+          if keyword.id0 == :kDO then
+            self.command_start = true
+            # Which flavour of "do" is returned depends on the cond/cmdarg
+            # trackers and on the state the keyword was seen in.
+            return :kDO_COND  if cond.is_in_state
+            return :kDO_BLOCK if cmdarg.is_in_state && state != :expr_cmdarg
+            return :kDO_BLOCK if state == :expr_endarg
+            return :kDO
+          end
+          # id0 is used at the beginning of an expression, id1 everywhere
+          # else. NOTE(review): id1 is presumably the modifier form of the
+          # keyword (e.g. trailing "if") -- confirm against the Keyword table.
+          return keyword.id0 if state == :expr_beg
+          self.lex_state = :expr_beg if keyword.id0 != keyword.id1
+          return keyword.id1
+        end
+      end
+      # Not a keyword: in the states listed here the identifier may be
+      # followed by arguments, so move to an argument state; in any other
+      # state move to :expr_end.
+      if (lex_state == :expr_beg || lex_state == :expr_mid ||
+          lex_state == :expr_dot || lex_state == :expr_arg ||
+          lex_state == :expr_cmdarg) then
+        if command_state then
+          self.lex_state = :expr_cmdarg
+        else
+          self.lex_state = :expr_arg
+        end
+      else
+        self.lex_state = :expr_end
+      end
+    end
+    self.yacc_value = token
+    # If the parser's environment records this name as a local variable (and
+    # it did not follow a "."), force :expr_end regardless of the state
+    # chosen above.
+    self.lex_state = :expr_end if
+      last_state != :expr_dot && self.parser.env[token.to_sym] == :lvar
+    return result
+  end
+  def yylex_string # 23 lines
+    token = if lex_strterm[0] == :heredoc then
+              self.heredoc lex_strterm
+            else
+              self.parse_string lex_strterm
+            end
+    if token == :tSTRING_END || token == :tREGEXP_END then
+      self.lineno      = nil
+      self.lex_strterm = nil
+      self.lex_state   = :expr_end
+    end
+    return token
+  end
+end