RubyGems - parser - Versions diffs - 0.9.alpha1 → 0.9.0 - Mend

parser 0.9.alpha1 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (43) hide show

checksums.yaml +4 -4
data/.travis.yml +4 -3
data/AST_FORMAT.md +1338 -0
data/README.md +58 -3
data/Rakefile +32 -12
data/bin/benchmark +47 -0
data/bin/explain-parse +14 -0
data/bin/parse +6 -0
data/lib/parser.rb +84 -0
data/lib/parser/all.rb +2 -0
data/lib/parser/ast/node.rb +11 -0
data/lib/parser/ast/processor.rb +8 -0
data/lib/parser/base.rb +116 -0
data/lib/parser/builders/default.rb +654 -0
data/lib/parser/compatibility/ruby1_8.rb +13 -0
data/lib/parser/diagnostic.rb +44 -0
data/lib/parser/diagnostic/engine.rb +44 -0
data/lib/parser/lexer.rl +335 -245
data/lib/parser/lexer/explanation.rb +37 -0
data/lib/parser/{lexer_literal.rb → lexer/literal.rb} +22 -12
data/lib/parser/lexer/stack_state.rb +38 -0
data/lib/parser/ruby18.y +1957 -0
data/lib/parser/ruby19.y +2154 -0
data/lib/parser/source/buffer.rb +78 -0
data/lib/parser/source/map.rb +20 -0
data/lib/parser/source/map/operator.rb +15 -0
data/lib/parser/source/map/variable_assignment.rb +15 -0
data/lib/parser/source/range.rb +66 -0
data/lib/parser/static_environment.rb +12 -6
data/parser.gemspec +23 -13
data/test/helper.rb +45 -0
data/test/parse_helper.rb +204 -0
data/test/racc_coverage_helper.rb +130 -0
data/test/test_diagnostic.rb +47 -0
data/test/test_diagnostic_engine.rb +58 -0
data/test/test_lexer.rb +601 -357
data/test/test_lexer_stack_state.rb +69 -0
data/test/test_parse_helper.rb +74 -0
data/test/test_parser.rb +3654 -0
data/test/test_source_buffer.rb +80 -0
data/test/test_source_range.rb +51 -0
data/test/test_static_environment.rb +1 -4
metadata +137 -12

data/lib/parser/compatibility/ruby1_8.rb ADDED

@@ -0,0 +1,13 @@
+class String  # Reopens core String to extend % with %{name} substitution from a Hash.
+  alias original_percent %
+  def %(arg, *args)
+    if arg.is_a?(Hash)  # Hash argument: fill in named placeholders instead of positional formatting.
+      gsub(/%\{(\w+)\}/) do
+        arg[$1.to_sym]  # Placeholder name is looked up as a Symbol key; String keys are not consulted.
+      end
+    else  # Any other argument: defer to the built-in formatter saved by the alias.
+      original_percent(arg, *args)
+    end
+  end
+end

data/lib/parser/diagnostic.rb ADDED

@@ -0,0 +1,44 @@
+module Parser
+  class Diagnostic  # Frozen record of one diagnostic: level, message, location and highlights.
+    LEVELS = [:note, :warning, :error, :fatal].freeze  # The only values #initialize accepts for level.
+    attr_reader :level, :message
+    attr_reader :location, :highlights
+    def initialize(level, message, location, highlights=[])  # Raises ArgumentError for a level outside LEVELS.
+      unless LEVELS.include?(level)
+        raise ArgumentError,
+              "Diagnostic#level must be one of #{LEVELS.join(', ')}; " \
+              "#{level.inspect} provided."
+      end
+      @level       = level
+      @message     = message.to_s.dup.freeze  # Private frozen copy; the caller's object is left untouched.
+      @location    = location  # Must respond to source_line, column_range, size and to_s (used by #render).
+      @highlights  = highlights.dup.freeze  # Frozen shallow copy of the highlight list.
+      freeze  # The diagnostic itself cannot be reassigned after construction.
+    end
+    def render  # Returns three strings: header, source line, marker line.
+      source_line    = @location.source_line
+      highlight_line = ' ' * source_line.length  # Blank canvas as wide as the source line.
+      @highlights.each do |hilight|
+        range = hilight.column_range
+        highlight_line[range] = '~' * hilight.size  # Underline each highlight with tildes.
+      end
+      range = @location.column_range
+      highlight_line[range] = '^' * @location.size  # Written last, so carets overwrite overlapping tildes.
+      [
+        "#{@location.to_s}: #{@level}: #{@message}",
+        source_line,
+        highlight_line,
+      ]
+    end
+  end
+end

data/lib/parser/diagnostic/engine.rb ADDED

@@ -0,0 +1,44 @@
+module Parser
+  class Diagnostic::Engine  # Forwards diagnostics to an optional consumer and raises on fatal ones.
+    attr_accessor :consumer  # Object responding to #call(diagnostic), or nil for no reporting.
+    attr_accessor :all_errors_are_fatal  # When true, :error diagnostics raise like :fatal ones.
+    attr_accessor :ignore_warnings  # When true, :warning diagnostics are not passed to the consumer.
+    def initialize(consumer=nil)
+      @consumer             = consumer
+      @all_errors_are_fatal = false
+      @ignore_warnings      = false
+    end
+    def process(diagnostic)  # Reports, then possibly raises; returns self otherwise.
+      if ignore?(diagnostic)
+        # do nothing
+      elsif @consumer
+        @consumer.call(diagnostic)
+      end
+      if raise?(diagnostic)  # Checked after reporting, so the consumer sees the diagnostic before the raise.
+        raise Parser::SyntaxError, diagnostic.message  # Parser::SyntaxError is not defined in this file.
+      end
+      self
+    end
+    protected
+    def ignore?(diagnostic)  # True only for warnings while ignore_warnings is set.
+      @ignore_warnings &&
+            diagnostic.level == :warning
+    end
+    def raise?(diagnostic)  # :fatal always raises; :error raises only with all_errors_are_fatal.
+      (@all_errors_are_fatal &&
+          diagnostic.level == :error) ||
+        diagnostic.level == :fatal
+    end
+  end
+end

data/lib/parser/lexer.rl CHANGED

@@ -3,6 +3,9 @@
 #
 # === BEFORE YOU START ===
 #
+# Read the Ruby Hacking Guide chapter 11, available in English at
+# http://whitequark.org/blog/2013/04/01/ruby-hacking-guide-ch-11-finite-state-lexer/
+#
 # Remember two things about Ragel scanners:
 #
 #   1) Longest match wins.
@@ -38,6 +41,11 @@
 #       emit($whatever)
 #       fnext $next_state; fbreak;
 #
+#    If you perform `fgoto` in an action which neither emits a token nor
+#    rewinds the stream pointer, the parser's side-effectful,
+#    context-sensitive lookahead actions will break in a hard-to-detect
+#    and hard-to-debug way.
+#
 #  * If an action does not emit a token:
 #
 #       fgoto $next_state;
@@ -56,6 +64,8 @@
 #    `c_lparen = '('` and a lexer action `e_lparen | c_lparen`, the result
 #    _will_ invoke the action `act`.
 #
+#    e_something stands for "something with **e**mbedded action".
+#
 #  * EOF is explicit and is matched by `c_eof`. If you want to introspect
 #    the state of the lexer, add this rule to the state:
 #
@@ -66,49 +76,53 @@
 #       NoMethodError: undefined method `ord' for nil:NilClass
 #
-require 'parser/lexer_literal'
-require 'parser/syntax_error'
 class Parser::Lexer
   %% write data nofinal;
   # %
-  attr_reader   :source
+  attr_reader   :source_buffer
+  attr_accessor :diagnostics
   attr_accessor :static_env
-  attr_reader   :location, :comments
+  attr_accessor :cond, :cmdarg
+  attr_reader   :comments
   def initialize(version)
-    @version = version
+    @version    = version
+    @static_env = nil
     reset
   end
   def reset(reset_state=true)
+    # Ragel-related variables:
     if reset_state
       # Unit tests set state prior to resetting lexer.
-      @cs  = self.class.lex_en_line_begin
+      @cs     = self.class.lex_en_line_begin
+      @cond   = StackState.new('cond')
+      @cmdarg = StackState.new('cmdarg')
     end
-    # Ragel-internal variables:
-    @p     = 0   # stream position (saved manually in #advance)
-    @ts    = nil # token start
-    @te    = nil # token end
-    @act   = 0   # next action
+    @p             = 0   # stream position (saved manually in #advance)
+    @ts            = nil # token start
+    @te            = nil # token end
+    @act           = 0   # next action
-    @stack = []  # state stack
-    @top   = 0   # state stack top pointer
+    @stack         = []  # state stack
+    @top           = 0   # state stack top pointer
+    # Lexer state:
     @token_queue   = []
     @literal_stack = []
-    @newlines      = [0] # sorted set of \n positions
-    @newline_s     = nil # location of last encountered newline
-    @location      = nil # location of last #advance'd token
     @comments      = ""  # collected comments
+    @newline_s     = nil # location of last encountered newline
     @num_base      = nil # last numeric base
     @num_digits_s  = nil # starting position of numeric digits
@@ -125,15 +139,21 @@ class Parser::Lexer
     @lambda_stack  = []
   end
-  def source=(source)
-    # Heredoc processing coupled with weird newline quirks
-    # require three '\0' (EOF) chars to be appended; after
-    # `p = @heredoc_s`, if `p` points at EOF, the FSM could
-    # not bail out early enough and will crash.
-    #
-    # Patches accepted.
-    #
-    @source = source.gsub(/\r\n/, "\n") + "\0\0\0"
+  def source_buffer=(source_buffer)
+    @source_buffer = source_buffer
+    if @source_buffer
+      # Heredoc processing coupled with weird newline quirks
+      # require three '\0' (EOF) chars to be appended; after
+      # `p = @heredoc_s`, if `p` points at EOF, the FSM could
+      # not bail out early enough and will crash.
+      #
+      # Patches accepted.
+      #
+      @source = @source_buffer.source.gsub(/\r\n/, "\n") + "\0\0\0"
+    else
+      @source = nil
+    end
   end
   LEX_STATES = {
@@ -159,7 +179,7 @@ class Parser::Lexer
   # Return next token: [type, value].
   def advance
     if @token_queue.any?
-      return with_location(@token_queue.shift)
+      return @token_queue.shift
     end
     # Ugly, but dependent on Ragel output. Consider refactoring it somehow.
@@ -183,24 +203,14 @@ class Parser::Lexer
     @p = p
     if @token_queue.any?
-      with_location(@token_queue.shift)
+      @token_queue.shift
     elsif @cs == self.class.lex_error
-      with_location([ false, '$undefined', p, p + 1 ])
+      [ false, [ '$error', range(p - 1, p) ] ]
     else
-      with_location([ false, '$end',       p, p + 1 ])
+      [ false, [ '$eof',   range(p - 1, p) ] ]
     end
   end
-  # Like #advance, but also pretty-print the token and its position
-  # in the stream to `stdout`.
-  def advance_and_decorate
-    type, val = advance
-    puts decorate(location, "\e[0;32m#{type} #{val.inspect}\e[0m")
-    [type, val]
-  end
   # Return the current collected comment block and clear the storage.
   def clear_comments
     comments  = @comments
@@ -209,103 +219,42 @@ class Parser::Lexer
     comments
   end
-  # Lex `str` for the Ruby version `version` with initial state `state`.
-  #
-  # The tokens displayed by this function are not the same as tokens
-  # consumed by parser, because the parser manipulates lexer state on
-  # its own.
-  def self.do(source, state=nil, version=19)
-    lex = new(version)
-    lex.source = source
-    lex.state  = state if state
-    loop do
-      type, val = lex.advance_and_decorate
-      break if !type
-    end
-    puts "Lex state: #{lex.state}"
-  end
-  # Used by LexerLiteral to emit tokens for string content.
-  def emit(type, value = tok, s = @ts, e = @te)
-    if s.nil? || e.nil?
-      raise "broken #emit invocation in #{caller[0]}"
-    end
-    @token_queue << [ type, value, s, e ]
-  end
-  def emit_table(table, s = @ts, e = @te)
-    token = tok(s, e)
-    emit(table[token], token, s, e)
-  end
-  # shim
-  def lineno
-    @location[0] + 1
-  end
   protected
   def eof_char?(char)
     [0x04, 0x1a, 0x00].include? char.ord
   end
-  def ruby18?
-    @version == 18
+  def version?(*versions)
+    versions.include?(@version)
   end
-  def ruby19?
-    @version == 19
+  def stack_pop
+    @top -= 1
+    @stack[@top]
   end
   def tok(s = @ts, e = @te)
     @source[s...e]
   end
-  def record_newline(p)
-    @newlines = (@newlines + [p]).uniq.sort
-  end
-  def dissect_location(start, finish)
-    line_number    = @newlines.rindex { |nl| start >= nl }
-    line_first_col = @newlines[line_number]
-    start_col   = start  - line_first_col
-    finish_col  = finish - line_first_col
-    [ line_number, start_col, finish_col ]
+  def range(s = @ts, e = @te)
+    Parser::Source::Range.new(@source_buffer, s, e - 1)
   end
-  def with_location(item)
-    type, value, start, finish = *item
-    @location = dissect_location(start, finish)
-    [ type, value ]
+  def emit(type, value = tok, s = @ts, e = @te)
+    @token_queue << [ type, [ value, range(s, e) ] ]
   end
-  def decorate(location, message="")
-    line_number, from, to = location
-    line = @source.lines.drop(line_number).first
-    line[from...to] = "\e[4m#{line[from...to]}\e[0m"
-    tail_len   = to - from - 1
-    tail       = "~" * (tail_len >= 0 ? tail_len : 0)
-    decoration =  "#{" " * from}\e[1;31m^#{tail}\e[0m #{message}"
-    [ line, decoration ]
-  end
+  def emit_table(table, s = @ts, e = @te)
+    value = tok(s, e)
-  def warning(message, start = @ts, finish = @te)
-    $stderr.puts "warning: #{message}"
-    $stderr.puts decorate(dissect_location(start, finish))
+    emit(table[value], value, s, e)
   end
-  def error(message)
-    raise Parser::SyntaxError, message
+  def diagnostic(type, message, location=range, highlights=[])
+    @diagnostics.process(
+        Parser::Diagnostic.new(type, message, location, highlights))
   end
   #
@@ -313,10 +262,10 @@ class Parser::Lexer
   #
   def push_literal(*args)
-    new_literal = Parser::LexerLiteral.new(self, *args)
+    new_literal = Literal.new(self, *args)
     @literal_stack.push(new_literal)
-    if    new_literal.type == :tWORDS_BEG
+    if new_literal.type == :tWORDS_BEG
       self.class.lex_en_interp_words
     elsif new_literal.type == :tQWORDS_BEG
       self.class.lex_en_plain_words
@@ -328,7 +277,7 @@ class Parser::Lexer
   end
   def literal
-    @literal_stack[-1]
+    @literal_stack.last
   end
   def pop_literal
@@ -361,7 +310,6 @@ class Parser::Lexer
     '=>'  => :tASSOC,   '::'  => :tCOLON2,  '===' => :tEQQ,
     '<=>' => :tCMP,     '[]'  => :tAREF,    '[]=' => :tASET,
     '{'   => :tLCURLY,  '}'   => :tRCURLY,  '`'   => :tBACK_REF2,
-    'do'  => :kDO
   }
   PUNCTUATION_BEGIN = {
@@ -407,7 +355,6 @@ class Parser::Lexer
     #
     # This action is embedded directly into c_nl, as it is idempotent and
     # there are no cases when we need to skip it.
-    record_newline(p + 1)
     @newline_s = p
   }
@@ -514,8 +461,8 @@ class Parser::Lexer
   # Ruby accepts (and fails on) variables with leading digit
   # in literal context, but not in unquoted symbol body.
-  class_var_v    = '@@' [0-9]? bareword;
-  instance_var_v = '@' [0-9]? bareword;
+  class_var_v    = '@@' c_alnum+;
+  instance_var_v = '@' c_alnum+;
   #
   # === ESCAPE SEQUENCE PARSING ===
@@ -538,7 +485,12 @@ class Parser::Lexer
       codepoint = codepoint_str.to_i(16)
       if codepoint >= 0x110000
-        @escape = lambda { error "invalid Unicode codepoint (too large)" }
+        @escape = lambda do
+          # TODO better location reporting
+          diagnostic :error, Parser::ERRORS[:unicode_point_too_large],
+                     range(@escape_s, p)
+        end
         break
       end
@@ -551,30 +503,32 @@ class Parser::Lexer
       'a' => "\a", 'b'  => "\b", 'e'  => "\e", 'f' => "\f",
       'n' => "\n", 'r'  => "\r", 's'  => "\s", 't' => "\t",
       'v' => "\v", '\\' => "\\"
-    }.fetch(@source[p - 1], @source[p - 1])
+    }.fetch(@source[p - 1].chr, @source[p - 1].chr)
   }
   action invalid_complex_escape {
-    @escape = lambda { error "invalid escape character syntax" }
+    @escape = lambda do
+      diagnostic :error, Parser::ERRORS[:invalid_escape]
+    end
   }
   action slash_c_char {
-    @escape = (@escape.ord & 0x9f).chr
+    @escape = (@escape[0].ord & 0x9f).chr
   }
   action slash_m_char {
-    @escape = (@escape.ord | 0x80).chr
+    @escape = (@escape[0].ord | 0x80).chr
   }
   maybe_escaped_char = (
         '\\' c_any      %unescape_char
-    | ( c_any - [\\] )  % { @escape = @source[p - 1] }
+    | ( c_any - [\\] )  % { @escape = @source[p - 1].chr }
   );
   maybe_escaped_ctrl_char = ( # why?!
         '\\' c_any      %unescape_char %slash_c_char
     |   '?'             % { @escape = "\x7f" }
-    | ( c_any - [\\?] ) % { @escape = @source[p - 1] } %slash_c_char
+    | ( c_any - [\\?] ) % { @escape = @source[p - 1].chr } %slash_c_char
   );
   escape = (
@@ -592,7 +546,12 @@ class Parser::Lexer
       # %q[\x]
     | 'x' ( c_any - xdigit )
-      % { @escape = lambda { error "invalid hex escape" } }
+      % {
+        @escape = lambda do
+          diagnostic :error, Parser::ERRORS[:invalid_hex_escape],
+                     range(@escape_s - 1, p + 2)
+        end
+      }
       # %q[\u123] %q[\u{12]
     | 'u' ( c_any{0,4}  -
@@ -602,7 +561,12 @@ class Parser::Lexer
             | '{' xdigit{2} [ \t}] # \u{12. \u{12} are valid
             )
           )
-      % { @escape = lambda { error "invalid Unicode escape" } }
+      % {
+        @escape = lambda do
+          diagnostic :error, Parser::ERRORS[:invalid_unicode_escape],
+                     range(@escape_s - 1, p)
+        end
+      }
       # \u{123 456}
     | 'u{' ( xdigit{1,6} [ \t] )*
@@ -611,7 +575,12 @@ class Parser::Lexer
       | ( xdigit* ( c_any - xdigit - '}' )+ '}'
         | ( c_any - '}' )* c_eof
         | xdigit{7,}
-        ) % { @escape = lambda { error "unterminated Unicode escape" } }
+        ) % {
+          @escape = lambda do
+            diagnostic :fatal, Parser::ERRORS[:unterminated_unicode],
+                       range(p - 1, p)
+          end
+        }
       )
       # \C-\a \cx
@@ -635,7 +604,10 @@ class Parser::Lexer
     | ( c_any - [0-7xuCMc] ) %unescape_char
-    | c_eof % { error "escape sequence meets end of file" }
+    | c_eof % {
+      diagnostic :fatal, Parser::ERRORS[:escape_eof],
+                 range(p - 1, p)
+    }
   );
   # Use rules in form of `e_bs escape' when you need to parse a sequence.
@@ -666,7 +638,7 @@ class Parser::Lexer
   # of positions in the input stream, namely @heredoc_e
   # (HEREDOC declaration End) and @herebody_s (HEREdoc BODY line Start).
   #
-  # @heredoc_e is simply contained inside the corresponding LexerLiteral, and
+  # @heredoc_e is simply contained inside the corresponding Literal, and
   # when the heredoc is closed, the lexing is restarted from that position.
   #
   # @herebody_s is quite more complex. First, @herebody_s changes after each
@@ -683,14 +655,14 @@ class Parser::Lexer
     # After every heredoc was parsed, @herebody_s contains the
     # position of next token after all heredocs.
     if @herebody_s
-      p = @herebody_s
+      p = @herebody_s - 1
       @herebody_s = nil
     end
   };
   action extend_string {
     if literal.nest_and_try_closing tok, @ts, @te
-      fgoto *pop_literal;
+      fnext *pop_literal; fbreak;
     else
       literal.extend_string tok, @ts, @te
     end
@@ -701,10 +673,10 @@ class Parser::Lexer
       # If the literal is actually closed by the backslash,
       # rewind the input prior to consuming the escape sequence.
       p = @escape_s - 1
-      fgoto *pop_literal;
+      fnext *pop_literal; fbreak;
     else
       # Get the first character after the backslash.
-      escaped_char = @source[@escape_s]
+      escaped_char = @source[@escape_s].chr
       if literal.munge_escape? escaped_char
         # If this particular literal uses this character as an opening
@@ -765,11 +737,12 @@ class Parser::Lexer
     end
     if is_eof
-      error "unterminated string meets end of file"
+      diagnostic :fatal, Parser::ERRORS[:string_eof],
+                 range(literal.str_s, literal.str_s + 1)
     end
     # A literal newline is appended if the heredoc was _not_ closed
-    # this time. See also LexerLiteral#nest_and_try_closing for rationale of
+    # this time. See also Literal#nest_and_try_closing for rationale of
     # calling #flush_string here.
     literal.extend_string tok, @ts, @te
     literal.flush_string
@@ -782,8 +755,7 @@ class Parser::Lexer
   # Interpolations with immediate variable names simply call into
   # the corresponding machine.
-  interp_var =
-      '#' ( global_var | class_var_v | instance_var_v );
+  interp_var = '#' ( global_var | class_var_v | instance_var_v );
   action extend_interp_var {
     literal.flush_string
@@ -808,6 +780,8 @@ class Parser::Lexer
   interp_code = '#{';
   e_lbrace = '{' % {
+    @cond.push(false); @cmdarg.push(false)
     if literal
       literal.start_interp_brace
     end
@@ -827,7 +801,7 @@ class Parser::Lexer
         end
         fhold;
-        fnext *@stack.pop;
+        fnext *stack_pop;
         fbreak;
       end
     end
@@ -872,6 +846,7 @@ class Parser::Lexer
   *|;
   plain_string := |*
+      '\\' c_nl   => extend_string_eol;
       e_bs c_any  => extend_string_escaped;
       c_eol       => extend_string_eol;
       c_any       => extend_string;
@@ -882,11 +857,12 @@ class Parser::Lexer
       => {
         unknown_options = tok.scan(/[^imxouesn]/)
         if unknown_options.any?
-          error "unknown regexp options: #{unknown_options.join}"
+          message = Parser::ERRORS[:regexp_options] % { :options => unknown_options.join }
+          diagnostic :error, message
         end
         emit(:tREGEXP_OPT)
-        fgoto expr_end;
+        fnext expr_end; fbreak;
       };
       any
@@ -904,11 +880,17 @@ class Parser::Lexer
   # The default longest-match scanning does not work here due
   # to sheer ambiguity.
+  ambiguous_fid_suffix =       # actual    parsed
+      [?!]  %{ tm = p }      | # a?        a?
+      '!='  %{ tm = p - 2 }    # a!=b      a != b
+  ;
   ambiguous_ident_suffix =     # actual    parsed
-      [?!=] %{ tm = p } |      # a?        a?
-      '=='  %{ tm = p - 2 } |  # a==b      a == b
-      '=~'  %{ tm = p - 2 } |  # a=~b      a =~ b
-      '=>'  %{ tm = p - 2 } |  # a=>b      a => b
+      ambiguous_fid_suffix   |
+      '='   %{ tm = p }      | # a=        a=
+      '=='  %{ tm = p - 2 }  | # a==b      a == b
+      '=~'  %{ tm = p - 2 }  | # a=~b      a =~ b
+      '=>'  %{ tm = p - 2 }  | # a=>b      a => b
       '===' %{ tm = p - 3 }    # a===b     a === b
   ;
@@ -922,15 +904,24 @@ class Parser::Lexer
       '::'  %{ tm = p - 2 }    # A::B      A :: B
   ;
+  # Resolving kDO/kDO_COND/kDO_BLOCK ambiguity requires embedding
+  # @cond/@cmdarg-related code into e_lbrack, e_lparen and e_lbrace.
+  e_lbrack = '[' % {
+    @cond.push(false); @cmdarg.push(false)
+  };
   # Ruby 1.9 lambdas require parentheses counting in order to
   # emit correct opening kDO/tLBRACE.
   e_lparen = '(' % {
-      @paren_nest += 1
+    @cond.push(false); @cmdarg.push(false)
+    @paren_nest += 1
   };
   e_rparen = ')' % {
-      @paren_nest -= 1
+    @paren_nest -= 1
   };
   # Variable lexing code is accessed from both expressions and
@@ -940,30 +931,36 @@ class Parser::Lexer
       global_var
       => {
         if    tok =~ /^\$([1-9][0-9]*)$/
-          emit(:tNTH_REF, $1.to_i)
+          emit(:tNTH_REF, tok(@ts + 1).to_i)
         elsif tok =~ /^\$([&`'+])$/
-          emit(:tBACK_REF, $1.to_sym)
+          emit(:tBACK_REF)
         else
           emit(:tGVAR)
         end
-        fnext *@stack.pop; fbreak;
+        fnext *stack_pop; fbreak;
       };
       class_var_v
       => {
-        error "`#{tok}' is not allowed as a class variable name" if tok =~ /^@@[0-9]/
+        if tok =~ /^@@[0-9]/
+          message = Parser::ERRORS[:cvar_name] % { :name => tok }
+          diagnostic :error, message
+        end
         emit(:tCVAR)
-        fnext *@stack.pop; fbreak;
+        fnext *stack_pop; fbreak;
       };
       instance_var_v
       => {
-        error "`#{tok}' is not allowed as an instance variable name" if tok =~ /^@[0-9]/
+        if tok =~ /^@[0-9]/
+          message = Parser::ERRORS[:ivar_name] % { :name => tok }
+          diagnostic :error, message
+        end
         emit(:tIVAR)
-        fnext *@stack.pop; fbreak;
+        fnext *stack_pop; fbreak;
       };
   *|;
@@ -996,11 +993,11 @@ class Parser::Lexer
            fnext expr_end; fbreak; };
       ':'
-      => { fhold; fgoto expr_end; };
+      => { fhold; fgoto expr_beg; };
       global_var
-      => { emit(:tGVAR)
-           fbreak; };
+      => { p = @ts - 1
+           fcall expr_variable; };
       c_space_nl+;
@@ -1015,12 +1012,16 @@ class Parser::Lexer
   # Transitions to `expr_arg` afterwards.
   #
   expr_dot := |*
-      bareword
+      constant
+      => { emit(:tCONSTANT)
+           fnext expr_arg; fbreak; };
+      call_or_var
       => { emit(:tIDENTIFIER)
            fnext expr_arg; fbreak; };
-      bareword ambiguous_ident_suffix
-      => { emit(:tIDENTIFIER, tok(@ts, tm), @ts, tm)
+      call_or_var ambiguous_ident_suffix
+      => { emit(:tFID, tok(@ts, tm), @ts, tm)
            fnext expr_arg; p = tm - 1; fbreak; };
       operator_fname      |
@@ -1031,6 +1032,8 @@ class Parser::Lexer
       c_space_nl+;
+      '#' c_line* c_nl;
       c_any
       => { fhold; fgoto expr_end; };
@@ -1059,8 +1062,8 @@ class Parser::Lexer
       # meth [...]
       # Array argument. Compare with indexing `meth[...]`.
-      c_space+ '['
-      => { emit(:tLBRACK, '[', @te - 1, @te);
+      c_space+ e_lbrack
+      => { emit(:tLBRACK, '[', @te - 1, @te)
            fnext expr_beg; fbreak; };
       # cmd {}
@@ -1076,12 +1079,6 @@ class Parser::Lexer
         end
       };
-      # a.b
-      # Dot-call.
-      '.' | '::'
-      => { emit_table(PUNCTUATION);
-           fnext expr_dot; fbreak; };
       #
       # AMBIGUOUS TOKENS RESOLVED VIA EXPR_BEG
       #
@@ -1091,11 +1088,22 @@ class Parser::Lexer
       c_space+ '?'
       => { fhold; fgoto expr_beg; };
+               # a %{1}, a %[1] (but not "a %=1=" or "a % foo")
+      c_space+ ( '%' [^= ]
+               # a /foo/ (but not "a / foo" or "a /=foo")
+               | '/' ( c_any - c_space_nl - '=' )
+               # a <<HEREDOC
+               | '<<'
+               )
+      => { fhold; fhold; fgoto expr_beg; };
       # x +1
       # Ambiguous unary operator or regexp literal.
       c_space+ [+\-/]
       => {
-        warning "ambiguous first argument; put parentheses or even spaces", @te - 1, @te
+        diagnostic :warning, Parser::ERRORS[:ambiguous_literal],
+                   range(@te - 1, @te)
         fhold; fhold; fgoto expr_beg;
       };
@@ -1103,11 +1111,23 @@ class Parser::Lexer
       # Ambiguous splat or block-pass.
       c_space+ [*&]
       => {
-        what = tok(@te - 1, @te)
-        warning "`#{what}' interpreted as argument prefix", @te - 1, @te
+        message = Parser::ERRORS[:ambiguous_prefix] % { :prefix => tok(@te - 1, @te) }
+        diagnostic :warning, message,
+                   range(@te - 1, @te)
         fhold; fgoto expr_beg;
       };
+      # x ::Foo
+      # Ambiguous toplevel constant access.
+      c_space+ '::'
+      => { fhold; fhold; fgoto expr_beg; };
+      # x:b
+      # Symbol.
+      c_space* ':'
+      => { fhold; fgoto expr_beg; };
       #
       # AMBIGUOUS TOKENS RESOLVED VIA EXPR_END
       #
@@ -1129,7 +1149,7 @@ class Parser::Lexer
         fgoto expr_end;
       };
-      c_space* c_nl
+      c_space* ( '#' c_line* )? c_nl
       => { fhold; fgoto expr_end; };
       c_any
@@ -1152,8 +1172,7 @@ class Parser::Lexer
   # `{` as `tLBRACE_ARG`.
   #
   # The default post-`expr_endarg` state is `expr_end`, so this state also handles
-  # `do` (as `kDO_BLOCK` in `expr_beg`). (I have no clue why the parser cannot
-  # just handle `kDO`.)
+  # `do` (as `kDO_BLOCK` in `expr_beg`).
   expr_endarg := |*
       e_lbrace
       => { emit(:tLBRACE_ARG)
@@ -1183,6 +1202,8 @@ class Parser::Lexer
       c_space+;
+      '#' c_line*;
       c_nl
       => { fhold; fgoto expr_end; };
@@ -1221,39 +1242,32 @@ class Parser::Lexer
       # STRING AND REGEXP LITERALS
       #
-      # a / 42
-      # a % 42
-      # a %= 42 (disambiguation with %=string=)
-      [/%] c_space_nl | '%=' # /
-      => {
-        fhold; fhold;
-        fgoto expr_end;
-      };
       # /regexp/oui
-      '/'
+      # /=/ (disambiguation with /=)
+      '/' c_any
       => {
-        type, delimiter = tok, tok
-        fgoto *push_literal(type, delimiter, @ts);
+        type = delimiter = tok[0].chr
+        fhold; fgoto *push_literal(type, delimiter, @ts);
       };
       # %<string>
       '%' ( c_any - [A-Za-z] )
       => {
-        type, delimiter = tok[0], tok[-1]
+        type, delimiter = tok[0].chr, tok[-1].chr
         fgoto *push_literal(type, delimiter, @ts);
       };
       # %w(we are the people)
       '%' [A-Za-z]+ c_any
       => {
-        type, delimiter = tok[0..-2], tok[-1]
+        type, delimiter = tok[0..-2], tok[-1].chr
         fgoto *push_literal(type, delimiter, @ts);
       };
       '%' c_eof
       => {
-        error "unterminated string meets end of file"
+        diagnostic :fatal, Parser::ERRORS[:string_eof],
+                   range(@ts, @ts + 1)
       };
       # Heredoc start.
@@ -1280,6 +1294,31 @@ class Parser::Lexer
         p = @herebody_s - 1
       };
+      #
+      # SYMBOL LITERALS
+      #
+      # :"bar", :'baz'
+      ':' ['"] # '
+      => {
+        type, delimiter = tok, tok[-1].chr
+        fgoto *push_literal(type, delimiter, @ts);
+      };
+      ':' bareword ambiguous_symbol_suffix
+      => {
+        emit(:tSYMBOL, tok(@ts + 1, tm), @ts, tm)
+        p = tm - 1
+        fnext expr_end; fbreak;
+      };
+      ':' ( bareword | global_var | class_var | instance_var |
+            operator_fname | operator_arithmetic | operator_rest )
+      => {
+        emit(:tSYMBOL, tok(@ts + 1), @ts)
+        fnext expr_end; fbreak;
+      };
       #
       # AMBIGUOUS TERNARY OPERATOR
       #
@@ -1293,20 +1332,22 @@ class Parser::Lexer
         value = @escape || tok(@ts + 1)
-        if ruby18?
-          emit(:tINTEGER, value.ord)
+        if version?(18)
+          emit(:tINTEGER, value[0].ord)
         else
           emit(:tSTRING, value)
         end
-        fbreak;
+        fnext expr_end; fbreak;
       };
       '?' c_space_nl
       => {
         escape = { " "  => '\s', "\r" => '\r', "\n" => '\n', "\t" => '\t',
                    "\v" => '\v', "\f" => '\f' }[tok[@ts + 1]]
-        warning "invalid character syntax; use ?#{escape}", @ts
+        message = Parser::ERRORS[:invalid_escape_use] % { :escape => escape }
+        diagnostic :warning, message,
+                   range(@ts, @ts + 1)
         p = @ts - 1
         fgoto expr_end;
@@ -1314,7 +1355,8 @@ class Parser::Lexer
       '?' c_eof
       => {
-        error "incomplete character syntax"
+        diagnostic :fatal, Parser::ERRORS[:incomplete_escape],
+                   range(@ts, @ts + 1)
       };
       # f ?aa : b: Disambiguate with a character literal.
@@ -1328,15 +1370,20 @@ class Parser::Lexer
       # KEYWORDS AND PUNCTUATION
       #
-      # a(+b)
-      punctuation_begin |
+      # a([1, 2])
+      e_lbrack    |
       # a({b=>c})
-      e_lbrace          |
+      e_lbrace    |
       # a()
       e_lparen
       => { emit_table(PUNCTUATION_BEGIN)
            fbreak; };
+      # a(+b)
+      punctuation_begin
+      => { emit_table(PUNCTUATION_BEGIN)
+           fbreak; };
       # rescue Exception => e: Block rescue.
       # Special because it should transition to expr_mid.
       'rescue'
@@ -1356,7 +1403,7 @@ class Parser::Lexer
       => {
         fhold;
-        if ruby18?
+        if version?(18)
           emit(:tIDENTIFIER, tok(@ts, @te - 2), @ts, @te - 2)
           fhold; # continue as a symbol
         else
@@ -1383,14 +1430,32 @@ class Parser::Lexer
       => {
         emit(:tIDENTIFIER)
-        if @static_env && @static_env.declared?(tok.to_sym)
-          fgoto expr_end;
+        if @static_env && @static_env.declared?(tok)
+          fnext expr_end; fbreak;
         else
-          fgoto expr_arg;
+          fnext expr_arg; fbreak;
         end
       };
-      c_space_nl+;
+      #
+      # WHITESPACE
+      #
+      c_space_nl;
+      '\\\n';
+      '#' c_line* c_eol
+      => { @comments << tok
+           fhold; };
+      c_nl '=begin' ( c_space | c_eol )
+      => { p = @ts - 1
+           fgoto line_begin; };
+      #
+      # DEFAULT TRANSITION
+      #
       # The following rules match most binary and all unary operators.
       # Rules for binary operators provide better error reporting.
@@ -1439,11 +1504,21 @@ class Parser::Lexer
           if tok == '{'
             emit(:tLAMBEG)
-          else
+          else # 'do'
             emit(:kDO_LAMBDA)
           end
         else
-          emit_table(PUNCTUATION)
+          if tok == '{'
+            emit_table(PUNCTUATION)
+          else # 'do'
+            if @cond.active?
+              emit(:kDO_COND)
+            elsif @cmdarg.active?
+              emit(:kDO_BLOCK)
+            else
+              emit(:kDO)
+            end
+          end
         end
         fnext expr_value; fbreak;
@@ -1457,7 +1532,7 @@ class Parser::Lexer
       => { emit_table(KEYWORDS)
            fnext expr_fname; fbreak; };
-      'class' c_space_nl '<<'
+      'class' c_space_nl* '<<'
       => { emit(:kCLASS, 'class', @ts, @ts + 5)
            emit(:tLSHFT, '<<',    @te - 2, @te)
            fnext expr_beg; fbreak; };
@@ -1480,13 +1555,23 @@ class Parser::Lexer
       => {
         emit_table(KEYWORDS)
-        if ruby18? && tok == 'not'
+        if version?(18) && tok == 'not'
           fnext expr_beg; fbreak;
         else
           fnext expr_arg; fbreak;
         end
       };
+      '__ENCODING__'
+      => {
+        if version?(18)
+          emit(:tIDENTIFIER)
+        else
+          emit_table(KEYWORDS)
+        end
+        fbreak;
+      };
       keyword_with_end
       => { emit_table(KEYWORDS)
            fbreak; };
@@ -1503,7 +1588,8 @@ class Parser::Lexer
                ( digit+ '_' )* digit* '_'?
       | '0' [Bb]  %{ @num_base = 2;  @num_digits_s = p }
                ( [01]+ '_' )* [01]* '_'?
-      | [1-9]     %{ @num_base = 10; @num_digits_s = @ts }
+      | [1-9] digit*
+                  %{ @num_base = 10; @num_digits_s = @ts }
                ( '_' digit+ )* digit* '_'?
       | '0'       %{ @num_base = 8;  @num_digits_s = @ts }
                ( '_' digit+ )* digit* '_'?
@@ -1512,14 +1598,17 @@ class Parser::Lexer
         digits = tok(@num_digits_s)
         if digits.end_with? '_'
-          error "trailing `_' in number"
-        elsif digits.empty? && @num_base == 8 && ruby18?
+          diagnostic :error, Parser::ERRORS[:trailing_underscore],
+                     range(@te - 1, @te)
+        elsif digits.empty? && @num_base == 8 && version?(18)
           # 1.8 did not raise an error on 0o.
           digits = "0"
         elsif digits.empty?
-          error "numeric literal without digits"
-        elsif @num_base == 8 && digits =~ /[89]/
-          error "invalid octal digit"
+          diagnostic :error, Parser::ERRORS[:empty_numeric]
+        elsif @num_base == 8 && (invalid_idx = digits.index(/[89]/))
+          invalid_s = @num_digits_s + invalid_idx
+          diagnostic :error, Parser::ERRORS[:invalid_octal],
+                     range(invalid_s, invalid_s + 1)
         end
         emit(:tINTEGER, digits.to_i(@num_base))
@@ -1537,7 +1626,7 @@ class Parser::Lexer
       )
       => {
         if tok.start_with? '.'
-          error "no .<digit> floating literal anymore; put 0 before dot"
+          diagnostic :error, Parser::ERRORS[:no_dot_digit_literal]
         elsif tok =~ /^[eE]/
           # The rule above allows to specify floats as just `e10', which is
           # certainly not a float. Send a patch if you can do this better.
@@ -1550,25 +1639,16 @@ class Parser::Lexer
       };
       #
-      # SYMBOL LITERALS
+      # STRING AND XSTRING LITERALS
       #
-      # `echo foo` | :"bar" | :'baz'
-      '`' | ':'? ['"] # '
+      # `echo foo`, "bar", 'baz'
+      '`' | ['"] # '
       => {
-        type, delimiter = tok, tok[-1]
+        type, delimiter = tok, tok[-1].chr
         fgoto *push_literal(type, delimiter, @ts);
       };
-      ':' bareword ambiguous_symbol_suffix
-      => { emit(:tSYMBOL, tok(@ts + 1, tm))
-           p = tm - 1; fbreak; };
-      ':' ( bareword | global_var | class_var | instance_var |
-            operator_fname | operator_arithmetic | operator_rest )
-      => { emit(:tSYMBOL, tok(@ts + 1))
-           fbreak; };
       #
       # CONSTANTS AND VARIABLES
       #
@@ -1578,7 +1658,7 @@ class Parser::Lexer
            fbreak; };
       constant ambiguous_const_suffix
-      => { emit(:tCONSTANT, tok(@ts, tm))
+      => { emit(:tCONSTANT, tok(@ts, tm), @ts, tm)
            p = tm - 1; fbreak; };
       global_var | class_var_v | instance_var_v
@@ -1588,7 +1668,7 @@ class Parser::Lexer
       # METHOD CALLS
       #
-      '.'
+      '.' | '::'
       => { emit_table(PUNCTUATION)
            fnext expr_dot; fbreak; };
@@ -1596,8 +1676,9 @@ class Parser::Lexer
       => { emit(:tIDENTIFIER)
            fnext expr_arg; fbreak; };
-      call_or_var [?!]
-      => { emit(:tFID)
+      call_or_var ambiguous_fid_suffix
+      => { emit(:tFID, tok(@ts, tm), @ts, tm)
+           p = tm - 1
            fnext expr_arg; fbreak; };
       #
@@ -1613,6 +1694,7 @@ class Parser::Lexer
       e_rbrace | e_rparen | ']'
       => { emit_table(PUNCTUATION)
+           @cond.lexpop; @cmdarg.lexpop
            fbreak; };
       operator_arithmetic '='
@@ -1623,6 +1705,10 @@ class Parser::Lexer
       => { emit_table(PUNCTUATION)
            fnext expr_value; fbreak; };
+      e_lbrack
+      => { emit_table(PUNCTUATION)
+           fnext expr_beg; fbreak; };
       punctuation_end
       => { emit_table(PUNCTUATION)
            fnext expr_beg; fbreak; };
@@ -1632,11 +1718,16 @@ class Parser::Lexer
       #
       '\\' e_heredoc_nl;
-      '\\' ( any - c_nl ) {
-        error "bare backslash only allowed before newline"
+      '\\' c_line {
+        diagnostic :error, Parser::ERRORS[:bare_backslash],
+                   range(@ts, @ts + 1)
+        fhold;
       };
-      '#' ( c_any - c_nl )*
+      c_space+;
+      '#' c_line*
       => { @comments << tok(@ts, @te + 1) };
       e_heredoc_nl
@@ -1646,11 +1737,10 @@ class Parser::Lexer
       => { emit_table(PUNCTUATION)
            fnext expr_value; fbreak; };
-      c_space+;
       c_any
       => {
-        error "unexpected #{tok.inspect}"
+        message = Parser::ERRORS[:unexpected] % { :character => tok.inspect }
+        diagnostic :fatal, message
       };
       c_eof => do_eof;
@@ -1681,10 +1771,10 @@ class Parser::Lexer
       c_line* c_nl
       => { @comments << tok };
-      any
+      c_eof
       => {
-        @comments = ""
-        error "embedded document meats end of file (and they embark on a romantic journey)"
+        # TODO better location information here
+        diagnostic :fatal, Parser::ERRORS[:embedded_document], range(p - 1, p)
       };
   *|;