RubyGems - parser - Versions diffs - 2.0.0.pre2 → 2.0.0.pre3 - Mend

parser 2.0.0.pre2 → 2.0.0.pre3

Files changed (38) hide show

checksums.yaml +4 -4
data/.yardopts +2 -2
data/CHANGELOG.md +55 -0
data/Gemfile +0 -2
data/README.md +58 -4
data/lib/gauntlet_parser.rb +121 -0
data/lib/parser.rb +31 -24
data/lib/parser/ast/node.rb +6 -4
data/lib/parser/ast/processor.rb +3 -0
data/lib/parser/base.rb +18 -17
data/lib/parser/builders/default.rb +61 -9
data/lib/parser/compatibility/ruby1_8.rb +7 -0
data/lib/parser/diagnostic.rb +18 -5
data/lib/parser/diagnostic/engine.rb +12 -11
data/lib/parser/lexer.rl +288 -133
data/lib/parser/lexer/explanation.rb +1 -1
data/lib/parser/lexer/literal.rb +49 -17
data/lib/parser/rewriter.rb +2 -0
data/lib/parser/ruby18.y +1 -17
data/lib/parser/ruby19.y +7 -18
data/lib/parser/ruby20.y +9 -28
data/lib/parser/ruby21.y +11 -34
data/lib/parser/runner.rb +6 -1
data/lib/parser/source/buffer.rb +44 -21
data/lib/parser/source/comment.rb +35 -0
data/lib/parser/source/comment/associator.rb +3 -0
data/lib/parser/source/map.rb +2 -4
data/lib/parser/source/range.rb +7 -0
data/lib/parser/source/rewriter.rb +3 -0
data/lib/parser/source/rewriter/action.rb +3 -0
data/lib/parser/syntax_error.rb +7 -2
data/lib/parser/version.rb +1 -1
data/parser.gemspec +2 -0
data/test/parse_helper.rb +5 -3
data/test/test_encoding.rb +29 -0
data/test/test_lexer.rb +780 -514
data/test/test_parser.rb +185 -11
metadata +17 -2

data/lib/parser/builders/default.rb CHANGED

@@ -1,13 +1,35 @@
 module Parser
+  ##
+  # Default AST builder. Uses {AST::Node}s.
+  #
   class Builders::Default
+    ##
+    # @api private
     attr_accessor :parser
+    ##
+    # If set to true, `__FILE__` and `__LINE__` are transformed to
+    # literal nodes. For example, `s(:str, "lib/foo.rb")` and `s(:int, 10)`.
+    #
+    # If set to false, `__FILE__` and `__LINE__` are emitted as-is,
+    # i.e. as `s(:__FILE__)` and `s(:__LINE__)` nodes.
+    #
+    # Source maps are identical in both cases.
+    #
+    # @return [TrueClass|FalseClass]
     attr_accessor :emit_file_line_as_literals
+    ##
+    # Initializes attributes:
+    #
+    #   * `emit_file_line_as_literals`: `true`
     def initialize
       @emit_file_line_as_literals = true
     end
+    # @!parse private
     #
     # Literals
     #
@@ -359,6 +381,10 @@ module Parser
       end
     end
+    def const_op_assignable(node)
+      node.updated(:casgn)
+    end
     def assign(lhs, eql_t, rhs)
       (lhs << rhs).updated(nil, nil,
         :location => lhs.loc.
@@ -462,8 +488,9 @@ module Parser
     # Formal arguments
     #
-    def args(begin_t, args, end_t)
-      n(:args, [ *check_duplicate_args(args) ],
+    def args(begin_t, args, end_t, check_args=true)
+      args = check_duplicate_args(args) if check_args
+      n(:args, args,
         collection_map(begin_t, args, end_t))
     end
@@ -636,8 +663,24 @@ module Parser
             receiver.children.count == 2 &&
             receiver.children.first.type == :str
-        regexp_str, _regopt = *receiver
-        regexp_body, = *regexp_str
+        str_node, opt_node = *receiver
+        regexp_body, = *str_node
+        *regexp_opt  = *opt_node
+        if defined?(Encoding)
+          regexp_body = case
+          when regexp_opt.include?(:u)
+            regexp_body.encode(Encoding::UTF_8)
+          when regexp_opt.include?(:e)
+            regexp_body.encode(Encoding::EUC_JP)
+          when regexp_opt.include?(:s)
+            regexp_body.encode(Encoding::WINDOWS_31J)
+          when regexp_opt.include?(:n)
+            regexp_body.encode(Encoding::BINARY)
+          else
+            regexp_body
+          end
+        end
         Regexp.new(regexp_body).names.each do |name|
           @parser.static_env.declare(name)
@@ -880,10 +923,15 @@ module Parser
         when :erange then :eflipflop
         end
-        cond.updated(type, [
-          check_condition(lhs),
-          check_condition(rhs)
-        ])
+        if [:and, :or].include?(cond.type) &&
+               @parser.version == 18
+          cond
+        else
+          cond.updated(type, [
+            check_condition(lhs),
+            check_condition(rhs)
+          ])
+        end
       when :regexp
         n(:match_current_line, [ cond ], nil)
@@ -1230,7 +1278,11 @@ module Parser
       end
       if else_t
-        end_l = else_e.loc.expression
+        if else_e.nil?
+          end_l = loc(else_t)
+        else
+          end_l = else_e.loc.expression
+        end
       elsif !body_es.last.nil?
         end_l = body_es.last.loc.expression
       else

data/lib/parser/compatibility/ruby1_8.rb CHANGED

@@ -1,3 +1,10 @@
+##
+# @api public
+#
+# This monkeypatch extends Ruby 1.8 {String#%} with the ability
+# to substitute named references, e.g.
+# `"foo: %{bar}" % { :bar => 10 } # => "foo: 10"`.
+#
 class String
   alias original_percent %

data/lib/parser/diagnostic.rb CHANGED

@@ -1,17 +1,22 @@
 module Parser
   ##
+  # @api public
+  #
   # @!attribute [r] level
-  #  @return [Symbol]
+  #  @see LEVELS
+  #  @return [Symbol] diagnostic level
   #
   # @!attribute [r] message
-  #  @return [String]
+  #  @return [String] error message
   #
   # @!attribute [r] location
-  #  @return [Parser::Source::Map]
+  #  Main error-related source range.
+  #  @return [Parser::Source::Range]
   #
   # @!attribute [r] highlights
-  #  @return [Array]
+  #  Supplementary error-related source ranges.
+  #  @return [Array<Parser::Source::Range>]
   #
   class Diagnostic
     ##
@@ -46,7 +51,15 @@ module Parser
     end
     ##
-    # Renders the diagnostic message as an array of three lines.
+    # Renders the diagnostic message as a clang-like diagnostic.
+    #
+    # @example
+    #  diagnostic.render # =>
+    #  # [
+    #  #   "(fragment:0):1:5: error: unexpected token $end",
+    #  #   "foo +",
+    #  #   "    ^"
+    #  # ]
     #
     # @return [Array<String>]
     #

data/lib/parser/diagnostic/engine.rb CHANGED

@@ -4,22 +4,23 @@ module Parser
   # {Parser::Diagnostic::Engine} provides a basic API for dealing with
   # diagnostics by delegating them to registered consumers.
   #
-  # Basic usage is as following:
+  # @example
+  #  buffer      = Parser::Source::Buffer.new(__FILE__)
+  #  buffer.code = 'foobar'
   #
-  #     buffer      = Parser::Source::Buffer.new(__FILE__)
-  #     buffer.code = 'foobar'
+  #  consumer = lambda do |diagnostic|
+  #    puts diagnostic.message
+  #  end
   #
-  #     consumer = lambda do |diagnostic|
-  #       puts diagnostic.message
-  #     end
+  #  engine     = Parser::Diagnostic::Engine.new(consumer)
+  #  diagnostic = Parser::Diagnostic.new(:warning, 'warning!', buffer, 1..2)
   #
-  #     engine     = Parser::Diagnostic::Engine.new(consumer)
-  #     diagnostic = Parser::Diagnostic.new(:warning, 'warning!', buffer, 1..2)
+  #  engine.process(diagnostic) # => "warning!"
   #
-  #     engine.process(diagnostic) # => "warning!"
+  # @api public
   #
   # @!attribute [rw] consumer
-  #  @return [#call]
+  #  @return [#call(Diagnostic)]
   #
   # @!attribute [rw] all_errors_are_fatal
   #  When set to `true` any error that is encountered will result in
@@ -37,7 +38,7 @@ module Parser
     attr_accessor :ignore_warnings
     ##
-    # @param [#call] consumer
+    # @param [#call(Diagnostic)] consumer
     #
     def initialize(consumer=nil)
       @consumer             = consumer

data/lib/parser/lexer.rl CHANGED

@@ -82,6 +82,7 @@ class Parser::Lexer
   # %
   attr_reader   :source_buffer
+  attr_reader   :encoding
   attr_accessor :diagnostics
   attr_accessor :static_env
@@ -110,6 +111,10 @@ class Parser::Lexer
       @cmdarg = StackState.new('cmdarg')
     end
+    @source        = nil # source string
+    @source_pts    = nil # @source as a codepoint array
+    @encoding      = nil # target encoding for output strings
     @p             = 0   # stream position (saved manually in #advance)
     @ts            = nil # token start
     @te            = nil # token end
@@ -141,27 +146,51 @@ class Parser::Lexer
     # encountered after a matching closing parenthesis.
     @paren_nest    = 0
     @lambda_stack  = []
+    # If the lexer is in `command state' (aka expr_value)
+    # at the entry to #advance, it will transition to expr_cmdarg
+    # instead of expr_arg at certain points.
+    @command_state = false
   end
   def source_buffer=(source_buffer)
     @source_buffer = source_buffer
     if @source_buffer
-      # Heredoc processing coupled with weird newline quirks
-      # require three '\0' (EOF) chars to be appended; after
-      # `p = @heredoc_s`, if `p` points at EOF, the FSM could
-      # not bail out early enough and will crash.
-      #
-      # Patches accepted.
-      #
-      @source = @source_buffer.source + "\0\0\0"
+      @source = @source_buffer.source + "\0"
-      if @source.length > 0 && @source[0].ord == 0xfeff
+      if defined?(Encoding) && @source.encoding == Encoding::UTF_8
+        @source_pts = @source.unpack('U*')
+      else
+        @source_pts = @source.unpack('C*')
+      end
+      if defined?(Encoding)
+        @encoding = @source.encoding
+      end
+      if @source_pts.size > 1_000_000 && @source.respond_to?(:encode)
+        # A heuristic: if the buffer is larger than 1M, then
+        # store it in UTF-32 and convert the tokens as they're
+        # going out. If it's smaller, the conversion overhead
+        # dominates runtime and this stops being beneficial.
+        #
+        # This is not really a good heuristic, as the result
+        # heavily depends on token/character ratio. If it's low,
+        # say the gem consists mostly of long identifiers and
+        # symbols, then storing the source in UTF-8 would be faster.
+        #
+        # Patches accepted.
+        @source = @source.encode(Encoding::UTF_32LE)
+      end
+      if @source_pts[0] == 0xfeff
         # Skip byte order mark.
         @p = 1
       end
     else
-      @source = nil
+      @source     = nil
+      @source_pts = nil
     end
   end
@@ -173,9 +202,15 @@ class Parser::Lexer
     :expr_beg    => lex_en_expr_beg,
     :expr_mid    => lex_en_expr_mid,
     :expr_arg    => lex_en_expr_arg,
+    :expr_cmdarg => lex_en_expr_cmdarg,
     :expr_end    => lex_en_expr_end,
     :expr_endarg => lex_en_expr_endarg,
     :expr_endfn  => lex_en_expr_endfn,
+    :interp_string => lex_en_interp_string,
+    :interp_words  => lex_en_interp_words,
+    :plain_string  => lex_en_plain_string,
+    :plain_words   => lex_en_plain_string,
   }
   def state
@@ -204,8 +239,12 @@ class Parser::Lexer
     _lex_trans_actions      = self.class.send :_lex_trans_actions
     _lex_to_state_actions   = self.class.send :_lex_to_state_actions
     _lex_from_state_actions = self.class.send :_lex_from_state_actions
+    _lex_eof_trans          = self.class.send :_lex_eof_trans
+    p, pe, eof = @p, @source.length + 1, @source.length + 1
-    p, pe, eof = @p, @source.length + 1, nil
+    @command_state = (@cs == self.class.lex_en_expr_value ||
+                      @cs == self.class.lex_en_line_begin)
     %% write exec;
     # %
@@ -223,8 +262,8 @@ class Parser::Lexer
   protected
-  def eof_char?(char)
-    [0x04, 0x1a, 0x00].include? char.ord
+  def eof_codepoint?(point)
+    [0x04, 0x1a, 0x00].include? point
   end
   def version?(*versions)
@@ -236,8 +275,22 @@ class Parser::Lexer
     @stack[@top]
   end
-  def tok(s = @ts, e = @te)
-    @source[s...e]
+  if "".respond_to?(:encode)
+    def encode_escape(ord)
+      ord.chr.force_encoding(@encoding)
+    end
+    def tok(s = @ts, e = @te)
+      @source[s...e].encode(@encoding)
+    end
+  else
+    def encode_escape(ord)
+      ord.chr
+    end
+    def tok(s = @ts, e = @te)
+      @source[s...e]
+    end
   end
   def range(s = @ts, e = @te)
@@ -260,6 +313,24 @@ class Parser::Lexer
     emit(table[value], value, s, e)
   end
+  def emit_do(do_block=false)
+    if @cond.active?
+      emit(:kDO_COND)
+    elsif @cmdarg.active? || do_block
+      emit(:kDO_BLOCK)
+    else
+      emit(:kDO)
+    end
+  end
+  def arg_or_cmdarg
+    if @command_state
+      self.class.lex_en_expr_cmdarg
+    else
+      self.class.lex_en_expr_arg
+    end
+  end
   def emit_comment(s = @ts, e = @te)
     if @comments
       @comments.push(Parser::Source::Comment.new(range(s, e)))
@@ -351,9 +422,9 @@ class Parser::Lexer
   }
   KEYWORDS_BEGIN = {
-    'if'     => :kIF,          'unless' => :kUNLESS,
-    'while'  => :kWHILE,       'until'  => :kUNTIL,
-    'rescue' => :kRESCUE
+    'if'     => :kIF,          'unless'   => :kUNLESS,
+    'while'  => :kWHILE,       'until'    => :kUNTIL,
+    'rescue' => :kRESCUE,      'defined?' => :kDEFINED,
   }
   %w(class module def undef begin end then elsif else ensure case when
@@ -366,7 +437,7 @@ class Parser::Lexer
   # %
   access @;
-  getkey @source[p].ord;
+  getkey (@source_pts[p] || 0);
   # === CHARACTER CLASSES ===
   #
@@ -384,13 +455,16 @@ class Parser::Lexer
     @newline_s = p
   }
-  c_nl       = '\r'? '\n' $ do_nl;
+  c_nl       = '\n' $ do_nl;
   c_space    = [ \t\r\f\v];
   c_space_nl = c_space | c_nl;
-  c_eof      = 0x04 | 0x1a | 0; # ^D, ^Z, EOF
+  c_eof      = 0x04 | 0x1a | 0 | zlen; # ^D, ^Z, \0, EOF
   c_eol      = c_nl | c_eof;
-  c_any      = any - c_eof - zlen;
-  c_line     = c_any - c_nl;
+  c_any      = any - c_eof;
+  c_nl_zlen  = c_nl | zlen;
+  c_line     = any - c_nl_zlen;
   c_unicode  = c_any - 0x00..0x7f;
   c_upper    = [A-Z];
@@ -403,7 +477,7 @@ class Parser::Lexer
     # This allows to feed the lexer more data if needed; this is only used
     # in tests.
     #
-    # Note that this action is not embedded into e_eof like e_nl and e_bs
+    # Note that this action is not embedded into e_eof like e_heredoc_nl and e_bs
     # below. This is due to the fact that scanner state at EOF is observed
     # by tests, and encapsulating it in a rule would break the introspection.
     fhold; fbreak;
@@ -524,7 +598,7 @@ class Parser::Lexer
         break
       end
-      @escape += codepoint.chr(Encoding::UTF_8)
+      @escape     += codepoint.chr(Encoding::UTF_8)
       codepoint_s += codepoint_str.length + 1
     end
   }
@@ -544,11 +618,11 @@ class Parser::Lexer
   }
   action slash_c_char {
-    @escape = (@escape[0].ord & 0x9f).chr
+    @escape = encode_escape(@escape[0].ord & 0x9f)
   }
   action slash_m_char {
-    @escape = (@escape[0].ord | 0x80).chr
+    @escape = encode_escape(@escape[0].ord | 0x80)
   }
   maybe_escaped_char = (
@@ -565,11 +639,11 @@ class Parser::Lexer
   escape = (
       # \377
       [0-7]{1,3}
-      % { @escape = tok(@escape_s, p).to_i(8).chr }
+      % { @escape = encode_escape(tok(@escape_s, p).to_i(8) % 0x100) }
       # \xff
     | ( 'x' xdigit{1,2}
-        % { @escape = tok(@escape_s + 1, p).to_i(16).chr }
+        % { @escape = encode_escape(tok(@escape_s + 1, p).to_i(16)) }
       # \u263a
       | 'u' xdigit{4}
         % { @escape = tok(@escape_s + 1, p).to_i(16).chr(Encoding::UTF_8) }
@@ -586,10 +660,10 @@ class Parser::Lexer
       # %q[\u123] %q[\u{12]
     | 'u' ( c_any{0,4}  -
-            xdigit{4}   -          # \u1234 is valid
-            ( '{' xdigit{1,3}      # \u{1 \u{12 \u{123 are valid
-            | '{' xdigit [ \t}]    # \u{1. \u{1} are valid
-            | '{' xdigit{2} [ \t}] # \u{12. \u{12} are valid
+            xdigit{4}   -           # \u1234 is valid
+            ( '{' xdigit{1,3}       # \u{1 \u{12 \u{123 are valid
+            | '{' xdigit [ \t}] any # \u{1. \u{1} are valid
+            | '{' xdigit{2} [ \t}]  # \u{12. \u{12} are valid
             )
           )
       % {
@@ -631,7 +705,7 @@ class Parser::Lexer
     | 'C' c_any %invalid_complex_escape
     | 'M' c_any %invalid_complex_escape
-    | ( 'M-\\C' | 'C-\\M' | 'cM' ) c_any %invalid_complex_escape
+    | ( 'M-\\C' | 'C-\\M' ) c_any %invalid_complex_escape
     | ( c_any - [0-7xuCMc] ) %unescape_char
@@ -692,10 +766,12 @@ class Parser::Lexer
   };
   action extend_string {
-    if !literal.heredoc? && literal.nest_and_try_closing(tok, @ts, @te)
+    string = @source[@ts...@te]
+    if !literal.heredoc? && literal.nest_and_try_closing(string, @ts, @te)
       fnext *pop_literal; fbreak;
     else
-      literal.extend_string(tok, @ts, @te)
+      literal.extend_string(string, @ts, @te)
     end
   }
@@ -748,35 +824,56 @@ class Parser::Lexer
   # As heredoc closing line can immediately precede EOF, this action
   # has to handle such case specially.
   action extend_string_eol {
-    is_eof = eof_char? @source[p]
+    if @te == pe
+      diagnostic :fatal, Parser::ERRORS[:string_eof],
+                 range(literal.str_s, literal.str_s + 1)
+    end
     if literal.heredoc?
+      line = tok(@herebody_s, @ts).gsub(/\r+$/, '')
       # Try ending the heredoc with the complete most recently
       # scanned line. @herebody_s always refers to the start of such line.
-      if literal.nest_and_try_closing(tok(@herebody_s, @ts),
-                                      @herebody_s, @ts)
+      if literal.nest_and_try_closing(line, @herebody_s, @ts)
         # Adjust @herebody_s to point to the next line.
         @herebody_s = @te
         # Continue regular lexing after the heredoc reference (<<END).
         p = literal.heredoc_e - 1
-        fgoto *pop_literal;
+        fnext *pop_literal; fbreak;
       else
         # Ditto.
         @herebody_s = @te
       end
-    end
+    else
+      # Try ending the literal with a newline.
+      if literal.nest_and_try_closing(tok, @ts, @te)
+        fnext *pop_literal; fbreak;
+      end
-    if is_eof
-      diagnostic :fatal, Parser::ERRORS[:string_eof],
-                 range(literal.str_s, literal.str_s + 1)
+      if @herebody_s
+        # This is a regular literal intertwined with a heredoc. Like:
+        #
+        #     p <<-foo+"1
+        #     bar
+        #     foo
+        #     2"
+        #
+        # which, incidentally, evaluates to "bar\n12".
+        p = @herebody_s - 1
+        @herebody_s = nil
+      end
     end
-    # A literal newline is appended if the heredoc was _not_ closed
-    # this time. See also Literal#nest_and_try_closing for rationale of
-    # calling #flush_string here.
-    literal.extend_string tok, @ts, @te
-    literal.flush_string
+    if literal.words? && !eof_codepoint?(@source_pts[p])
+      literal.extend_space @ts, @te
+    else
+      # A literal newline is appended if the heredoc was _not_ closed
+      # this time (see fbreak above). See also Literal#nest_and_try_closing
+      # for rationale of calling #flush_string here.
+      literal.extend_string tok, @ts, @te
+      literal.flush_string
+    end
   }
   action extend_string_space {
@@ -850,11 +947,13 @@ class Parser::Lexer
     emit(:tSTRING_DBEG, '#{')
-    literal.saved_herebody_s = @herebody_s
-    @herebody_s = nil
+    if literal.heredoc?
+      literal.saved_herebody_s = @herebody_s
+      @herebody_s = nil
+    end
     literal.start_interp_brace
-    fcall expr_beg;
+    fcall expr_value;
   }
   # Actual string parsers are simply combined from the primitives defined
@@ -864,7 +963,7 @@ class Parser::Lexer
       interp_code => extend_interp_code;
       interp_var  => extend_interp_var;
       e_bs escape => extend_string_escaped;
-      c_space_nl+ => extend_string_space;
+      c_space+    => extend_string_space;
       c_eol       => extend_string_eol;
       c_any       => extend_string;
   *|;
@@ -879,7 +978,7 @@ class Parser::Lexer
   plain_words := |*
       e_bs c_any  => extend_string_escaped;
-      c_space_nl+ => extend_string_space;
+      c_space+    => extend_string_space;
       c_eol       => extend_string_eol;
       c_any       => extend_string;
   *|;
@@ -930,7 +1029,10 @@ class Parser::Lexer
     ;
   w_comment =
-      '#' %{ @sharp_s = p - 1 } c_line* %{ emit_comment(@sharp_s, p) }
+      '#'     %{ @sharp_s = p - 1 }
+      # The (p == pe) condition compensates for added "\0" and
+      # the way Ragel handles EOF.
+      c_line* %{ emit_comment(@sharp_s, p == pe ? p - 2 : p) }
     ;
   w_space_comment =
@@ -1014,15 +1116,14 @@ class Parser::Lexer
     @paren_nest -= 1
   };
-  # Ruby >=1.9.2 is context-sensitive wrt/ local identifiers.
+  # Ruby is context-sensitive wrt/ local identifiers.
   action local_ident {
     emit(:tIDENTIFIER)
-    if !version?(18) &&
-          !@static_env.nil? && @static_env.declared?(tok)
+    if !@static_env.nil? && @static_env.declared?(tok)
       fnext expr_end; fbreak;
     else
-      fnext expr_arg; fbreak;
+      fnext *arg_or_cmdarg; fbreak;
     end
   }
@@ -1140,15 +1241,15 @@ class Parser::Lexer
   expr_dot := |*
       constant
       => { emit(:tCONSTANT)
-           fnext expr_arg; fbreak; };
+           fnext *arg_or_cmdarg; fbreak; };
       call_or_var
       => { emit(:tIDENTIFIER)
-           fnext expr_arg; fbreak; };
+           fnext *arg_or_cmdarg; fbreak; };
-      call_or_var ambiguous_ident_suffix
+      bareword ambiguous_fid_suffix
       => { emit(:tFID, tok(@ts, tm), @ts, tm)
-           fnext expr_arg; p = tm - 1; fbreak; };
+           fnext *arg_or_cmdarg; p = tm - 1; fbreak; };
       # See the comment in `expr_fname`.
       operator_fname      |
@@ -1176,8 +1277,15 @@ class Parser::Lexer
       # cmd (1 + 2)
       # See below the rationale about expr_endarg.
       w_space+ e_lparen
-      => { emit(:tLPAREN_ARG, '(', @te - 1, @te)
-           fnext expr_beg; fbreak; };
+      => {
+        if version?(18)
+          emit(:tLPAREN2, '(', @te - 1, @te)
+          fnext expr_value; fbreak;
+        else
+          emit(:tLPAREN_ARG, '(', @te - 1, @te)
+          fnext expr_beg; fbreak;
+        end
+      };
       # meth(1 + 2)
       # Regular method call.
@@ -1210,13 +1318,12 @@ class Parser::Lexer
       # a ?b
       # Character literal.
-      w_space+ '?'
+      w_space* '?'
       => { fhold; fgoto expr_beg; };
                # a %{1}, a %[1] (but not "a %=1=" or "a % foo")
-      w_space+ ( '%' [^= ]
                # a /foo/ (but not "a / foo" or "a /=foo")
-               | '/' ( c_any - c_space_nl - '=' )
+      w_space+ ( [%/] ( c_any - c_space_nl - '=' ) # /
                # a <<HEREDOC
                | '<<'
                )
@@ -1263,14 +1370,14 @@ class Parser::Lexer
       # a ? b
       # Ternary operator.
-      w_space+ '?' c_space_nl
-      => { fhold; fhold; fgoto expr_end; };
+      w_space+ %{ tm = p } '?' c_space_nl
+      => { p = tm - 1; fgoto expr_end; };
       # x + 1: Binary operator or operator-assignment.
       w_space* operator_arithmetic
                   ( '=' | c_space_nl )?    |
       # x rescue y: Modifier keyword.
-      w_space+ keyword_modifier            |
+      w_space* keyword_modifier            |
       # Miscellanea.
       w_space* punctuation_end
       => {
@@ -1292,6 +1399,43 @@ class Parser::Lexer
       c_eof => do_eof;
   *|;
+  # The previous token was an identifier which was seen while in the
+  # command mode (that is, the state at the beginning of #advance was
+  # expr_value). This state is very similar to expr_arg, but disambiguates
+  # two very rare and specific conditions:
+  #   * In 1.8 mode, "foo (lambda do end)".
+  #   * In 1.9+ mode, "f x: -> do foo do end end".
+  expr_cmdarg := |*
+      w_space+ e_lparen
+      => {
+        emit(:tLPAREN_ARG, '(', @te - 1, @te)
+        if version?(18)
+          fnext expr_value; fbreak;
+        else
+          fnext expr_beg; fbreak;
+        end
+      };
+      w_space* 'do'
+      => {
+        if @cond.active?
+          emit(:kDO_COND, 'do', @te - 2, @te)
+        else
+          emit(:kDO, 'do', @te - 2, @te)
+        end
+        fnext expr_value; fbreak;
+      };
+      c_any             |
+      # Disambiguate with the `do' rule above.
+      w_space* bareword |
+      w_space* label
+      => { p = @ts - 1
+           fgoto expr_arg; };
+      c_eof => do_eof;
+  *|;
   # The rationale for this state is pretty complex. Normally, if an argument
   # is passed to a command and then there is a block (tLCURLY...tRCURLY),
   # the block is attached to the innermost argument (`f` in `m f {}`), or it
@@ -1313,8 +1457,8 @@ class Parser::Lexer
            fnext expr_value; };
       'do'
-      => { emit(:kDO_BLOCK)
-           fnext expr_value; };
+      => { emit_do(true)
+           fnext expr_value; fbreak; };
       w_space_comment;
@@ -1334,6 +1478,9 @@ class Parser::Lexer
       => { emit_table(KEYWORDS)
            fnext expr_beg; fbreak; };
+      bareword
+      => { p = @ts - 1; fgoto expr_beg; };
       w_space_comment;
       w_newline
@@ -1383,7 +1530,7 @@ class Parser::Lexer
       };
       # %<string>
-      '%' ( c_any - [A-Za-z] )
+      '%' ( any - [A-Za-z] )
       => {
         type, delimiter = tok[0].chr, tok[-1].chr
         fgoto *push_literal(type, delimiter, @ts);
@@ -1517,8 +1664,9 @@ class Parser::Lexer
       # rescue Exception => e: Block rescue.
       # Special because it should transition to expr_mid.
-      'rescue'
-      => { emit_table(KEYWORDS_BEGIN)
+      'rescue' %{ tm = p } '=>'?
+      => { emit_table(KEYWORDS_BEGIN, @ts, tm)
+           p = tm - 1
            fnext expr_mid; fbreak; };
       # if a: Statement if.
@@ -1535,8 +1683,17 @@ class Parser::Lexer
         fhold;
         if version?(18)
-          emit(:tIDENTIFIER, tok(@ts, @te - 2), @ts, @te - 2)
+          ident = tok(@ts, @te - 2)
+          emit((tok[0] =~ /[A-Z]/) ? :tCONSTANT : :tIDENTIFIER,
+               ident, @ts, @te - 2)
           fhold; # continue as a symbol
+          if !@static_env.nil? && @static_env.declared?(ident)
+            fnext expr_end;
+          else
+            fnext *arg_or_cmdarg;
+          end
         else
           emit(:tLABEL, tok(@ts, @te - 2), @ts, @te - 1)
         end
@@ -1557,7 +1714,8 @@ class Parser::Lexer
       # a = 42;     a [42]: Indexing.
       # def a; end; a [42]: Array argument.
-      call_or_var => local_ident;
+      call_or_var
+      => local_ident;
       #
       # WHITESPACE
@@ -1565,7 +1723,7 @@ class Parser::Lexer
       w_any;
-      e_heredoc_nl '=begin' ( c_space | c_eol )
+      e_heredoc_nl '=begin' ( c_space | c_nl_zlen )
       => { p = @ts - 1
            fgoto line_begin; };
@@ -1594,6 +1752,9 @@ class Parser::Lexer
       w_space_comment;
+      w_newline
+      => { fgoto line_begin; };
       c_any
       => { fhold; fgoto expr_beg; };
@@ -1627,13 +1788,7 @@ class Parser::Lexer
           if tok == '{'
             emit_table(PUNCTUATION)
           else # 'do'
-            if @cond.active?
-              emit(:kDO_COND)
-            elsif @cmdarg.active?
-              emit(:kDO_BLOCK)
-            else
-              emit(:kDO)
-            end
+            emit_do
           end
         end
@@ -1682,6 +1837,12 @@ class Parser::Lexer
       => {
         if version?(18)
           emit(:tIDENTIFIER)
+          if !@static_env.nil? && @static_env.declared?(tok)
+            fnext expr_end;
+          else
+            fnext *arg_or_cmdarg;
+          end
         else
           emit_table(KEYWORDS)
         end
@@ -1707,20 +1868,16 @@ class Parser::Lexer
       | [1-9] digit*
                   %{ @num_base = 10; @num_digits_s = @ts }
                ( '_' digit+ )* digit* '_'?
-      | '0'       %{ @num_base = 8;  @num_digits_s = @ts }
+      | '0' digit*
+                  %{ @num_base = 8;  @num_digits_s = @ts }
                ( '_' digit+ )* digit* '_'?
-      ) %{ tm = p } c_alpha?
+      )
       => {
-        unless (char = tok(tm, @te)).empty?
-          diagnostic :fatal, Parser::ERRORS[:unexpected] % { :character => char },
-                     range(tm, tm + 1)
-        end
-        digits = tok(@num_digits_s, tm)
+        digits = tok(@num_digits_s)
         if digits.end_with? '_'
-          diagnostic :error, Parser::ERRORS[:trailing_underscore],
-                     range(tm - 1, tm)
+          diagnostic :error, Parser::ERRORS[:trailing_in_number] % { :character => '_' },
+                     range(@te - 1, @te)
         elsif digits.empty? && @num_base == 8 && version?(18)
           # 1.8 did not raise an error on 0o.
           digits = "0"
@@ -1732,39 +1889,27 @@ class Parser::Lexer
                      range(invalid_s, invalid_s + 1)
         end
-        emit(:tINTEGER, digits.to_i(@num_base), @ts, tm)
-        p = tm - 1
+        emit(:tINTEGER, digits.to_i(@num_base))
         fbreak;
       };
-      # Floating point literals cannot start with 0 except when a dot
-      # follows immediately, probably to avoid confusion with octal literals.
-      ( [1-9] [0-9]* ( '_' digit+ )* |
-        '0'
-      )?
-      (
-          '.' ( digit+ '_' )* digit+ |
-        ( '.' ( digit+ '_' )* digit+ )? [eE] [+\-]? ( digit+ '_' )* digit+
-      ) %{ tm = p } c_alpha?
+      '.' ( digit+ '_' )* digit+
       => {
-        unless (char = tok(tm, @te)).empty?
-          diagnostic :fatal, Parser::ERRORS[:unexpected] % { :character => char },
-                     range(tm, tm + 1)
-        end
-        digits = tok(@ts, tm)
+        diagnostic :error, Parser::ERRORS[:no_dot_digit_literal]
+      };
-        if digits.start_with? '.'
-          diagnostic :error, Parser::ERRORS[:no_dot_digit_literal]
-        elsif digits =~ /^[eE]/
-          # The rule above allows to specify floats as just `e10', which is
-          # certainly not a float. Send a patch if you can do this better.
-          emit(:tIDENTIFIER, digits, @ts, tm)
-          fbreak;
+      (
+        ( [1-9] [0-9]* ( '_' digit+ )* | '0' )
+        ( '.' ( digit+ '_' )* digit+ )?
+        ( [eE] [+\-]? ( digit+ '_' )* digit* )?
+      )
+      => {
+        if tok.end_with? 'e'
+          diagnostic :error, Parser::ERRORS[:trailing_in_number] % { :character => 'e' },
+                     range(@te - 1, @te)
         end
-        emit(:tFLOAT, digits.to_f, @ts, tm)
-        p = tm - 1
+        emit(:tFLOAT, tok.to_f)
         fbreak;
       };
@@ -1785,7 +1930,7 @@ class Parser::Lexer
       constant
       => { emit(:tCONSTANT)
-           fnext expr_arg; fbreak; };
+           fnext *arg_or_cmdarg; fbreak; };
       constant ambiguous_const_suffix
       => { emit(:tCONSTANT, tok(@ts, tm), @ts, tm)
@@ -1802,9 +1947,10 @@ class Parser::Lexer
       => { emit_table(PUNCTUATION)
            fnext expr_dot; fbreak; };
-      call_or_var => local_ident;
+      call_or_var
+      => local_ident;
-      call_or_var ambiguous_fid_suffix
+      bareword ambiguous_fid_suffix
       => { emit(:tFID, tok(@ts, tm), @ts, tm)
            p = tm - 1
            fnext expr_arg; fbreak; };
@@ -1821,9 +1967,18 @@ class Parser::Lexer
            fnext expr_beg; fbreak; };
       e_rbrace | e_rparen | ']'
-      => { emit_table(PUNCTUATION)
-           @cond.lexpop; @cmdarg.lexpop
-           fbreak; };
+      => {
+        emit_table(PUNCTUATION)
+        @cond.lexpop; @cmdarg.lexpop
+        if %w"} ]".include?(tok)
+          fnext expr_endarg;
+        else # )
+          # fnext expr_endfn; ?
+        end
+        fbreak;
+      };
       operator_arithmetic '='
       => { emit(:tOP_ASGN, tok(@ts, @te - 1))
@@ -1887,15 +2042,15 @@ class Parser::Lexer
   #
   line_comment := |*
-      '=end' c_line* c_nl?
+      '=end' c_line* c_nl_zlen
       => {
         emit_comment(@eq_begin_s, @te)
         fgoto line_begin;
       };
-      c_any;
+      c_line* c_nl;
-      c_eof
+      c_line* zlen
       => {
         diagnostic :fatal, Parser::ERRORS[:embedded_document],
                    range(@eq_begin_s, @eq_begin_s + '=begin'.length)
@@ -1905,11 +2060,11 @@ class Parser::Lexer
   line_begin := |*
       w_any;
-      '=begin' ( c_space | c_eol )
+      '=begin' ( c_space | c_nl_zlen )
       => { @eq_begin_s = @ts
            fgoto line_comment; };
-      '__END__' c_eol
+      '__END__' c_nl_zlen
       => { p = pe - 1 };
       c_any