parser 2.0.0.pre2 → 2.0.0.pre3

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,13 +1,35 @@
1
1
  module Parser
2
2
 
3
+ ##
4
+ # Default AST builder. Uses {AST::Node}s.
5
+ #
3
6
  class Builders::Default
7
+ ##
8
+ # @api private
4
9
  attr_accessor :parser
10
+
11
+ ##
12
+ # If set to true, `__FILE__` and `__LINE__` are transformed to
13
+ # literal nodes. For example, `s(:str, "lib/foo.rb")` and `s(:int, 10)`.
14
+ #
15
+ # If set to false, `__FILE__` and `__LINE__` are emitted as-is,
16
+ # i.e. as `s(:__FILE__)` and `s(:__LINE__)` nodes.
17
+ #
18
+ # Source maps are identical in both cases.
19
+ #
20
+ # @return [TrueClass|FalseClass]
5
21
  attr_accessor :emit_file_line_as_literals
6
22
 
23
+ ##
24
+ # Initializes attributes:
25
+ #
26
+ # * `emit_file_line_as_literals`: `true`
7
27
  def initialize
8
28
  @emit_file_line_as_literals = true
9
29
  end
10
30
 
31
+ # @!parse private
32
+
11
33
  #
12
34
  # Literals
13
35
  #
@@ -359,6 +381,10 @@ module Parser
359
381
  end
360
382
  end
361
383
 
384
+ def const_op_assignable(node)
385
+ node.updated(:casgn)
386
+ end
387
+
362
388
  def assign(lhs, eql_t, rhs)
363
389
  (lhs << rhs).updated(nil, nil,
364
390
  :location => lhs.loc.
@@ -462,8 +488,9 @@ module Parser
462
488
  # Formal arguments
463
489
  #
464
490
 
465
- def args(begin_t, args, end_t)
466
- n(:args, [ *check_duplicate_args(args) ],
491
+ def args(begin_t, args, end_t, check_args=true)
492
+ args = check_duplicate_args(args) if check_args
493
+ n(:args, args,
467
494
  collection_map(begin_t, args, end_t))
468
495
  end
469
496
 
@@ -636,8 +663,24 @@ module Parser
636
663
  receiver.children.count == 2 &&
637
664
  receiver.children.first.type == :str
638
665
 
639
- regexp_str, _regopt = *receiver
640
- regexp_body, = *regexp_str
666
+ str_node, opt_node = *receiver
667
+ regexp_body, = *str_node
668
+ *regexp_opt = *opt_node
669
+
670
+ if defined?(Encoding)
671
+ regexp_body = case
672
+ when regexp_opt.include?(:u)
673
+ regexp_body.encode(Encoding::UTF_8)
674
+ when regexp_opt.include?(:e)
675
+ regexp_body.encode(Encoding::EUC_JP)
676
+ when regexp_opt.include?(:s)
677
+ regexp_body.encode(Encoding::WINDOWS_31J)
678
+ when regexp_opt.include?(:n)
679
+ regexp_body.encode(Encoding::BINARY)
680
+ else
681
+ regexp_body
682
+ end
683
+ end
641
684
 
642
685
  Regexp.new(regexp_body).names.each do |name|
643
686
  @parser.static_env.declare(name)
@@ -880,10 +923,15 @@ module Parser
880
923
  when :erange then :eflipflop
881
924
  end
882
925
 
883
- cond.updated(type, [
884
- check_condition(lhs),
885
- check_condition(rhs)
886
- ])
926
+ if [:and, :or].include?(cond.type) &&
927
+ @parser.version == 18
928
+ cond
929
+ else
930
+ cond.updated(type, [
931
+ check_condition(lhs),
932
+ check_condition(rhs)
933
+ ])
934
+ end
887
935
 
888
936
  when :regexp
889
937
  n(:match_current_line, [ cond ], nil)
@@ -1230,7 +1278,11 @@ module Parser
1230
1278
  end
1231
1279
 
1232
1280
  if else_t
1233
- end_l = else_e.loc.expression
1281
+ if else_e.nil?
1282
+ end_l = loc(else_t)
1283
+ else
1284
+ end_l = else_e.loc.expression
1285
+ end
1234
1286
  elsif !body_es.last.nil?
1235
1287
  end_l = body_es.last.loc.expression
1236
1288
  else
@@ -1,3 +1,10 @@
1
+ ##
2
+ # @api public
3
+ #
4
+ # This monkeypatch extends Ruby 1.8 {String#%} with an ability
5
+ # to replace named capture groups, i.e.
6
+ # `"foo: %{bar}" % { :bar => 10 } # => "foo: 10"`.
7
+ #
1
8
  class String
2
9
  alias original_percent %
3
10
 
@@ -1,17 +1,22 @@
1
1
  module Parser
2
2
 
3
3
  ##
4
+ # @api public
5
+ #
4
6
  # @!attribute [r] level
5
- # @return [Symbol]
7
+ # @see LEVELS
8
+ # @return [Symbol] diagnostic level
6
9
  #
7
10
  # @!attribute [r] message
8
- # @return [String]
11
+ # @return [String] error message
9
12
  #
10
13
  # @!attribute [r] location
11
- # @return [Parser::Source::Map]
14
+ # Main error-related source range.
15
+ # @return [Parser::Source::Range]
12
16
  #
13
17
  # @!attribute [r] highlights
14
- # @return [Array]
18
+ # Supplementary error-related source ranges.
19
+ # @return [Array<Parser::Source::Range>]
15
20
  #
16
21
  class Diagnostic
17
22
  ##
@@ -46,7 +51,15 @@ module Parser
46
51
  end
47
52
 
48
53
  ##
49
- # Renders the diagnostic message as an array of three lines.
54
+ # Renders the diagnostic message as a clang-like diagnostic.
55
+ #
56
+ # @example
57
+ # diagnostic.render # =>
58
+ # # [
59
+ # # "(fragment:0):1:5: error: unexpected token $end",
60
+ # # "foo +",
61
+ # # " ^"
62
+ # # ]
50
63
  #
51
64
  # @return [Array<String>]
52
65
  #
@@ -4,22 +4,23 @@ module Parser
4
4
  # {Parser::Diagnostic::Engine} provides a basic API for dealing with
5
5
  # diagnostics by delegating them to registered consumers.
6
6
  #
7
- # Basic usage is as following:
7
+ # @example
8
+ # buffer = Parser::Source::Buffer.new(__FILE__)
9
+ # buffer.code = 'foobar'
8
10
  #
9
- # buffer = Parser::Source::Buffer.new(__FILE__)
10
- # buffer.code = 'foobar'
11
+ # consumer = lambda do |diagnostic|
12
+ # puts diagnostic.message
13
+ # end
11
14
  #
12
- # consumer = lambda do |diagnostic|
13
- # puts diagnostic.message
14
- # end
15
+ # engine = Parser::Diagnostic::Engine.new(consumer)
16
+ # diagnostic = Parser::Diagnostic.new(:warning, 'warning!', buffer, 1..2)
15
17
  #
16
- # engine = Parser::Diagnostic::Engine.new(consumer)
17
- # diagnostic = Parser::Diagnostic.new(:warning, 'warning!', buffer, 1..2)
18
+ # engine.process(diagnostic) # => "warning!"
18
19
  #
19
- # engine.process(diagnostic) # => "warning!"
20
+ # @api public
20
21
  #
21
22
  # @!attribute [rw] consumer
22
- # @return [#call]
23
+ # @return [#call(Diagnostic)]
23
24
  #
24
25
  # @!attribute [rw] all_errors_are_fatal
25
26
  # When set to `true` any error that is encountered will result in
@@ -37,7 +38,7 @@ module Parser
37
38
  attr_accessor :ignore_warnings
38
39
 
39
40
  ##
40
- # @param [#call] consumer
41
+ # @param [#call(Diagnostic)] consumer
41
42
  #
42
43
  def initialize(consumer=nil)
43
44
  @consumer = consumer
@@ -82,6 +82,7 @@ class Parser::Lexer
82
82
  # %
83
83
 
84
84
  attr_reader :source_buffer
85
+ attr_reader :encoding
85
86
 
86
87
  attr_accessor :diagnostics
87
88
  attr_accessor :static_env
@@ -110,6 +111,10 @@ class Parser::Lexer
110
111
  @cmdarg = StackState.new('cmdarg')
111
112
  end
112
113
 
114
+ @source = nil # source string
115
+ @source_pts = nil # @source as a codepoint array
116
+ @encoding = nil # target encoding for output strings
117
+
113
118
  @p = 0 # stream position (saved manually in #advance)
114
119
  @ts = nil # token start
115
120
  @te = nil # token end
@@ -141,27 +146,51 @@ class Parser::Lexer
141
146
  # encountered after a matching closing parenthesis.
142
147
  @paren_nest = 0
143
148
  @lambda_stack = []
149
+
150
+ # If the lexer is in `command state' (aka expr_value)
151
+ # at the entry to #advance, it will transition to expr_cmdarg
152
+ # instead of expr_arg at certain points.
153
+ @command_state = false
144
154
  end
145
155
 
146
156
  def source_buffer=(source_buffer)
147
157
  @source_buffer = source_buffer
148
158
 
149
159
  if @source_buffer
150
- # Heredoc processing coupled with weird newline quirks
151
- # require three '\0' (EOF) chars to be appended; after
152
- # `p = @heredoc_s`, if `p` points at EOF, the FSM could
153
- # not bail out early enough and will crash.
154
- #
155
- # Patches accepted.
156
- #
157
- @source = @source_buffer.source + "\0\0\0"
160
+ @source = @source_buffer.source + "\0"
158
161
 
159
- if @source.length > 0 && @source[0].ord == 0xfeff
162
+ if defined?(Encoding) && @source.encoding == Encoding::UTF_8
163
+ @source_pts = @source.unpack('U*')
164
+ else
165
+ @source_pts = @source.unpack('C*')
166
+ end
167
+
168
+ if defined?(Encoding)
169
+ @encoding = @source.encoding
170
+ end
171
+
172
+ if @source_pts.size > 1_000_000 && @source.respond_to?(:encode)
173
+ # A heuristic: if the buffer is larger than 1M, then
174
+ # store it in UTF-32 and convert the tokens as they're
175
+ # going out. If it's smaller, the conversion overhead
176
+ # dominates runtime and this stops being beneficial.
177
+ #
178
+ # This is not really a good heuristic, as the result
179
+ # heavily depends on token/character ratio. If it's low,
180
+ # say the gem consists mostly of long identifiers and
181
+ # symbols, then storing the source in UTF-8 would be faster.
182
+ #
183
+ # Patches accepted.
184
+ @source = @source.encode(Encoding::UTF_32LE)
185
+ end
186
+
187
+ if @source_pts[0] == 0xfeff
160
188
  # Skip byte order mark.
161
189
  @p = 1
162
190
  end
163
191
  else
164
- @source = nil
192
+ @source = nil
193
+ @source_pts = nil
165
194
  end
166
195
  end
167
196
 
@@ -173,9 +202,15 @@ class Parser::Lexer
173
202
  :expr_beg => lex_en_expr_beg,
174
203
  :expr_mid => lex_en_expr_mid,
175
204
  :expr_arg => lex_en_expr_arg,
205
+ :expr_cmdarg => lex_en_expr_cmdarg,
176
206
  :expr_end => lex_en_expr_end,
177
207
  :expr_endarg => lex_en_expr_endarg,
178
208
  :expr_endfn => lex_en_expr_endfn,
209
+
210
+ :interp_string => lex_en_interp_string,
211
+ :interp_words => lex_en_interp_words,
212
+ :plain_string => lex_en_plain_string,
213
+ :plain_words => lex_en_plain_string,
179
214
  }
180
215
 
181
216
  def state
@@ -204,8 +239,12 @@ class Parser::Lexer
204
239
  _lex_trans_actions = self.class.send :_lex_trans_actions
205
240
  _lex_to_state_actions = self.class.send :_lex_to_state_actions
206
241
  _lex_from_state_actions = self.class.send :_lex_from_state_actions
242
+ _lex_eof_trans = self.class.send :_lex_eof_trans
243
+
244
+ p, pe, eof = @p, @source.length + 1, @source.length + 1
207
245
 
208
- p, pe, eof = @p, @source.length + 1, nil
246
+ @command_state = (@cs == self.class.lex_en_expr_value ||
247
+ @cs == self.class.lex_en_line_begin)
209
248
 
210
249
  %% write exec;
211
250
  # %
@@ -223,8 +262,8 @@ class Parser::Lexer
223
262
 
224
263
  protected
225
264
 
226
- def eof_char?(char)
227
- [0x04, 0x1a, 0x00].include? char.ord
265
+ def eof_codepoint?(point)
266
+ [0x04, 0x1a, 0x00].include? point
228
267
  end
229
268
 
230
269
  def version?(*versions)
@@ -236,8 +275,22 @@ class Parser::Lexer
236
275
  @stack[@top]
237
276
  end
238
277
 
239
- def tok(s = @ts, e = @te)
240
- @source[s...e]
278
+ if "".respond_to?(:encode)
279
+ def encode_escape(ord)
280
+ ord.chr.force_encoding(@encoding)
281
+ end
282
+
283
+ def tok(s = @ts, e = @te)
284
+ @source[s...e].encode(@encoding)
285
+ end
286
+ else
287
+ def encode_escape(ord)
288
+ ord.chr
289
+ end
290
+
291
+ def tok(s = @ts, e = @te)
292
+ @source[s...e]
293
+ end
241
294
  end
242
295
 
243
296
  def range(s = @ts, e = @te)
@@ -260,6 +313,24 @@ class Parser::Lexer
260
313
  emit(table[value], value, s, e)
261
314
  end
262
315
 
316
+ def emit_do(do_block=false)
317
+ if @cond.active?
318
+ emit(:kDO_COND)
319
+ elsif @cmdarg.active? || do_block
320
+ emit(:kDO_BLOCK)
321
+ else
322
+ emit(:kDO)
323
+ end
324
+ end
325
+
326
+ def arg_or_cmdarg
327
+ if @command_state
328
+ self.class.lex_en_expr_cmdarg
329
+ else
330
+ self.class.lex_en_expr_arg
331
+ end
332
+ end
333
+
263
334
  def emit_comment(s = @ts, e = @te)
264
335
  if @comments
265
336
  @comments.push(Parser::Source::Comment.new(range(s, e)))
@@ -351,9 +422,9 @@ class Parser::Lexer
351
422
  }
352
423
 
353
424
  KEYWORDS_BEGIN = {
354
- 'if' => :kIF, 'unless' => :kUNLESS,
355
- 'while' => :kWHILE, 'until' => :kUNTIL,
356
- 'rescue' => :kRESCUE
425
+ 'if' => :kIF, 'unless' => :kUNLESS,
426
+ 'while' => :kWHILE, 'until' => :kUNTIL,
427
+ 'rescue' => :kRESCUE, 'defined?' => :kDEFINED,
357
428
  }
358
429
 
359
430
  %w(class module def undef begin end then elsif else ensure case when
@@ -366,7 +437,7 @@ class Parser::Lexer
366
437
  # %
367
438
 
368
439
  access @;
369
- getkey @source[p].ord;
440
+ getkey (@source_pts[p] || 0);
370
441
 
371
442
  # === CHARACTER CLASSES ===
372
443
  #
@@ -384,13 +455,16 @@ class Parser::Lexer
384
455
  @newline_s = p
385
456
  }
386
457
 
387
- c_nl = '\r'? '\n' $ do_nl;
458
+ c_nl = '\n' $ do_nl;
388
459
  c_space = [ \t\r\f\v];
389
460
  c_space_nl = c_space | c_nl;
390
- c_eof = 0x04 | 0x1a | 0; # ^D, ^Z, EOF
461
+
462
+ c_eof = 0x04 | 0x1a | 0 | zlen; # ^D, ^Z, \0, EOF
391
463
  c_eol = c_nl | c_eof;
392
- c_any = any - c_eof - zlen;
393
- c_line = c_any - c_nl;
464
+ c_any = any - c_eof;
465
+
466
+ c_nl_zlen = c_nl | zlen;
467
+ c_line = any - c_nl_zlen;
394
468
 
395
469
  c_unicode = c_any - 0x00..0x7f;
396
470
  c_upper = [A-Z];
@@ -403,7 +477,7 @@ class Parser::Lexer
403
477
  # This allows feeding the lexer more data if needed; this is only used
404
478
  # in tests.
405
479
  #
406
- # Note that this action is not embedded into e_eof like e_nl and e_bs
480
+ # Note that this action is not embedded into e_eof like e_heredoc_nl and e_bs
407
481
  # below. This is due to the fact that scanner state at EOF is observed
408
482
  # by tests, and encapsulating it in a rule would break the introspection.
409
483
  fhold; fbreak;
@@ -524,7 +598,7 @@ class Parser::Lexer
524
598
  break
525
599
  end
526
600
 
527
- @escape += codepoint.chr(Encoding::UTF_8)
601
+ @escape += codepoint.chr(Encoding::UTF_8)
528
602
  codepoint_s += codepoint_str.length + 1
529
603
  end
530
604
  }
@@ -544,11 +618,11 @@ class Parser::Lexer
544
618
  }
545
619
 
546
620
  action slash_c_char {
547
- @escape = (@escape[0].ord & 0x9f).chr
621
+ @escape = encode_escape(@escape[0].ord & 0x9f)
548
622
  }
549
623
 
550
624
  action slash_m_char {
551
- @escape = (@escape[0].ord | 0x80).chr
625
+ @escape = encode_escape(@escape[0].ord | 0x80)
552
626
  }
553
627
 
554
628
  maybe_escaped_char = (
@@ -565,11 +639,11 @@ class Parser::Lexer
565
639
  escape = (
566
640
  # \377
567
641
  [0-7]{1,3}
568
- % { @escape = tok(@escape_s, p).to_i(8).chr }
642
+ % { @escape = encode_escape(tok(@escape_s, p).to_i(8) % 0x100) }
569
643
 
570
644
  # \xff
571
645
  | ( 'x' xdigit{1,2}
572
- % { @escape = tok(@escape_s + 1, p).to_i(16).chr }
646
+ % { @escape = encode_escape(tok(@escape_s + 1, p).to_i(16)) }
573
647
  # \u263a
574
648
  | 'u' xdigit{4}
575
649
  % { @escape = tok(@escape_s + 1, p).to_i(16).chr(Encoding::UTF_8) }
@@ -586,10 +660,10 @@ class Parser::Lexer
586
660
 
587
661
  # %q[\u123] %q[\u{12]
588
662
  | 'u' ( c_any{0,4} -
589
- xdigit{4} - # \u1234 is valid
590
- ( '{' xdigit{1,3} # \u{1 \u{12 \u{123 are valid
591
- | '{' xdigit [ \t}] # \u{1. \u{1} are valid
592
- | '{' xdigit{2} [ \t}] # \u{12. \u{12} are valid
663
+ xdigit{4} - # \u1234 is valid
664
+ ( '{' xdigit{1,3} # \u{1 \u{12 \u{123 are valid
665
+ | '{' xdigit [ \t}] any # \u{1. \u{1} are valid
666
+ | '{' xdigit{2} [ \t}] # \u{12. \u{12} are valid
593
667
  )
594
668
  )
595
669
  % {
@@ -631,7 +705,7 @@ class Parser::Lexer
631
705
 
632
706
  | 'C' c_any %invalid_complex_escape
633
707
  | 'M' c_any %invalid_complex_escape
634
- | ( 'M-\\C' | 'C-\\M' | 'cM' ) c_any %invalid_complex_escape
708
+ | ( 'M-\\C' | 'C-\\M' ) c_any %invalid_complex_escape
635
709
 
636
710
  | ( c_any - [0-7xuCMc] ) %unescape_char
637
711
 
@@ -692,10 +766,12 @@ class Parser::Lexer
692
766
  };
693
767
 
694
768
  action extend_string {
695
- if !literal.heredoc? && literal.nest_and_try_closing(tok, @ts, @te)
769
+ string = @source[@ts...@te]
770
+
771
+ if !literal.heredoc? && literal.nest_and_try_closing(string, @ts, @te)
696
772
  fnext *pop_literal; fbreak;
697
773
  else
698
- literal.extend_string(tok, @ts, @te)
774
+ literal.extend_string(string, @ts, @te)
699
775
  end
700
776
  }
701
777
 
@@ -748,35 +824,56 @@ class Parser::Lexer
748
824
  # As heredoc closing line can immediately precede EOF, this action
749
825
  # has to handle such case specially.
750
826
  action extend_string_eol {
751
- is_eof = eof_char? @source[p]
827
+ if @te == pe
828
+ diagnostic :fatal, Parser::ERRORS[:string_eof],
829
+ range(literal.str_s, literal.str_s + 1)
830
+ end
752
831
 
753
832
  if literal.heredoc?
833
+ line = tok(@herebody_s, @ts).gsub(/\r+$/, '')
834
+
754
835
  # Try ending the heredoc with the complete most recently
755
836
  # scanned line. @herebody_s always refers to the start of such line.
756
- if literal.nest_and_try_closing(tok(@herebody_s, @ts),
757
- @herebody_s, @ts)
837
+ if literal.nest_and_try_closing(line, @herebody_s, @ts)
758
838
  # Adjust @herebody_s to point to the next line.
759
839
  @herebody_s = @te
760
840
 
761
841
  # Continue regular lexing after the heredoc reference (<<END).
762
842
  p = literal.heredoc_e - 1
763
- fgoto *pop_literal;
843
+ fnext *pop_literal; fbreak;
764
844
  else
765
845
  # Ditto.
766
846
  @herebody_s = @te
767
847
  end
768
- end
848
+ else
849
+ # Try ending the literal with a newline.
850
+ if literal.nest_and_try_closing(tok, @ts, @te)
851
+ fnext *pop_literal; fbreak;
852
+ end
769
853
 
770
- if is_eof
771
- diagnostic :fatal, Parser::ERRORS[:string_eof],
772
- range(literal.str_s, literal.str_s + 1)
854
+ if @herebody_s
855
+ # This is a regular literal intertwined with a heredoc. Like:
856
+ #
857
+ # p <<-foo+"1
858
+ # bar
859
+ # foo
860
+ # 2"
861
+ #
862
+ # which, incidentally, evaluates to "bar\n12".
863
+ p = @herebody_s - 1
864
+ @herebody_s = nil
865
+ end
773
866
  end
774
867
 
775
- # A literal newline is appended if the heredoc was _not_ closed
776
- # this time. See also Literal#nest_and_try_closing for rationale of
777
- # calling #flush_string here.
778
- literal.extend_string tok, @ts, @te
779
- literal.flush_string
868
+ if literal.words? && !eof_codepoint?(@source_pts[p])
869
+ literal.extend_space @ts, @te
870
+ else
871
+ # A literal newline is appended if the heredoc was _not_ closed
872
+ # this time (see fbreak above). See also Literal#nest_and_try_closing
873
+ # for rationale of calling #flush_string here.
874
+ literal.extend_string tok, @ts, @te
875
+ literal.flush_string
876
+ end
780
877
  }
781
878
 
782
879
  action extend_string_space {
@@ -850,11 +947,13 @@ class Parser::Lexer
850
947
 
851
948
  emit(:tSTRING_DBEG, '#{')
852
949
 
853
- literal.saved_herebody_s = @herebody_s
854
- @herebody_s = nil
950
+ if literal.heredoc?
951
+ literal.saved_herebody_s = @herebody_s
952
+ @herebody_s = nil
953
+ end
855
954
 
856
955
  literal.start_interp_brace
857
- fcall expr_beg;
956
+ fcall expr_value;
858
957
  }
859
958
 
860
959
  # Actual string parsers are simply combined from the primitives defined
@@ -864,7 +963,7 @@ class Parser::Lexer
864
963
  interp_code => extend_interp_code;
865
964
  interp_var => extend_interp_var;
866
965
  e_bs escape => extend_string_escaped;
867
- c_space_nl+ => extend_string_space;
966
+ c_space+ => extend_string_space;
868
967
  c_eol => extend_string_eol;
869
968
  c_any => extend_string;
870
969
  *|;
@@ -879,7 +978,7 @@ class Parser::Lexer
879
978
 
880
979
  plain_words := |*
881
980
  e_bs c_any => extend_string_escaped;
882
- c_space_nl+ => extend_string_space;
981
+ c_space+ => extend_string_space;
883
982
  c_eol => extend_string_eol;
884
983
  c_any => extend_string;
885
984
  *|;
@@ -930,7 +1029,10 @@ class Parser::Lexer
930
1029
  ;
931
1030
 
932
1031
  w_comment =
933
- '#' %{ @sharp_s = p - 1 } c_line* %{ emit_comment(@sharp_s, p) }
1032
+ '#' %{ @sharp_s = p - 1 }
1033
+ # The (p == pe) condition compensates for added "\0" and
1034
+ # the way Ragel handles EOF.
1035
+ c_line* %{ emit_comment(@sharp_s, p == pe ? p - 2 : p) }
934
1036
  ;
935
1037
 
936
1038
  w_space_comment =
@@ -1014,15 +1116,14 @@ class Parser::Lexer
1014
1116
  @paren_nest -= 1
1015
1117
  };
1016
1118
 
1017
- # Ruby >=1.9.2 is context-sensitive wrt/ local identifiers.
1119
+ # Ruby is context-sensitive wrt/ local identifiers.
1018
1120
  action local_ident {
1019
1121
  emit(:tIDENTIFIER)
1020
1122
 
1021
- if !version?(18) &&
1022
- !@static_env.nil? && @static_env.declared?(tok)
1123
+ if !@static_env.nil? && @static_env.declared?(tok)
1023
1124
  fnext expr_end; fbreak;
1024
1125
  else
1025
- fnext expr_arg; fbreak;
1126
+ fnext *arg_or_cmdarg; fbreak;
1026
1127
  end
1027
1128
  }
1028
1129
 
@@ -1140,15 +1241,15 @@ class Parser::Lexer
1140
1241
  expr_dot := |*
1141
1242
  constant
1142
1243
  => { emit(:tCONSTANT)
1143
- fnext expr_arg; fbreak; };
1244
+ fnext *arg_or_cmdarg; fbreak; };
1144
1245
 
1145
1246
  call_or_var
1146
1247
  => { emit(:tIDENTIFIER)
1147
- fnext expr_arg; fbreak; };
1248
+ fnext *arg_or_cmdarg; fbreak; };
1148
1249
 
1149
- call_or_var ambiguous_ident_suffix
1250
+ bareword ambiguous_fid_suffix
1150
1251
  => { emit(:tFID, tok(@ts, tm), @ts, tm)
1151
- fnext expr_arg; p = tm - 1; fbreak; };
1252
+ fnext *arg_or_cmdarg; p = tm - 1; fbreak; };
1152
1253
 
1153
1254
  # See the comment in `expr_fname`.
1154
1255
  operator_fname |
@@ -1176,8 +1277,15 @@ class Parser::Lexer
1176
1277
  # cmd (1 + 2)
1177
1278
  # See below the rationale about expr_endarg.
1178
1279
  w_space+ e_lparen
1179
- => { emit(:tLPAREN_ARG, '(', @te - 1, @te)
1180
- fnext expr_beg; fbreak; };
1280
+ => {
1281
+ if version?(18)
1282
+ emit(:tLPAREN2, '(', @te - 1, @te)
1283
+ fnext expr_value; fbreak;
1284
+ else
1285
+ emit(:tLPAREN_ARG, '(', @te - 1, @te)
1286
+ fnext expr_beg; fbreak;
1287
+ end
1288
+ };
1181
1289
 
1182
1290
  # meth(1 + 2)
1183
1291
  # Regular method call.
@@ -1210,13 +1318,12 @@ class Parser::Lexer
1210
1318
 
1211
1319
  # a ?b
1212
1320
  # Character literal.
1213
- w_space+ '?'
1321
+ w_space* '?'
1214
1322
  => { fhold; fgoto expr_beg; };
1215
1323
 
1216
1324
  # a %{1}, a %[1] (but not "a %=1=" or "a % foo")
1217
- w_space+ ( '%' [^= ]
1218
1325
  # a /foo/ (but not "a / foo" or "a /=foo")
1219
- | '/' ( c_any - c_space_nl - '=' )
1326
+ w_space+ ( [%/] ( c_any - c_space_nl - '=' ) # /
1220
1327
  # a <<HEREDOC
1221
1328
  | '<<'
1222
1329
  )
@@ -1263,14 +1370,14 @@ class Parser::Lexer
1263
1370
 
1264
1371
  # a ? b
1265
1372
  # Ternary operator.
1266
- w_space+ '?' c_space_nl
1267
- => { fhold; fhold; fgoto expr_end; };
1373
+ w_space+ %{ tm = p } '?' c_space_nl
1374
+ => { p = tm - 1; fgoto expr_end; };
1268
1375
 
1269
1376
  # x + 1: Binary operator or operator-assignment.
1270
1377
  w_space* operator_arithmetic
1271
1378
  ( '=' | c_space_nl )? |
1272
1379
  # x rescue y: Modifier keyword.
1273
- w_space+ keyword_modifier |
1380
+ w_space* keyword_modifier |
1274
1381
  # Miscellanea.
1275
1382
  w_space* punctuation_end
1276
1383
  => {
@@ -1292,6 +1399,43 @@ class Parser::Lexer
1292
1399
  c_eof => do_eof;
1293
1400
  *|;
1294
1401
 
1402
+ # The previous token was an identifier which was seen while in the
1403
+ # command mode (that is, the state at the beginning of #advance was
1404
+ # expr_value). This state is very similar to expr_arg, but disambiguates
1405
+ # two very rare and specific condition:
1406
+ # * In 1.8 mode, "foo (lambda do end)".
1407
+ # * In 1.9+ mode, "f x: -> do foo do end end".
1408
+ expr_cmdarg := |*
1409
+ w_space+ e_lparen
1410
+ => {
1411
+ emit(:tLPAREN_ARG, '(', @te - 1, @te)
1412
+ if version?(18)
1413
+ fnext expr_value; fbreak;
1414
+ else
1415
+ fnext expr_beg; fbreak;
1416
+ end
1417
+ };
1418
+
1419
+ w_space* 'do'
1420
+ => {
1421
+ if @cond.active?
1422
+ emit(:kDO_COND, 'do', @te - 2, @te)
1423
+ else
1424
+ emit(:kDO, 'do', @te - 2, @te)
1425
+ end
1426
+ fnext expr_value; fbreak;
1427
+ };
1428
+
1429
+ c_any |
1430
+ # Disambiguate with the `do' rule above.
1431
+ w_space* bareword |
1432
+ w_space* label
1433
+ => { p = @ts - 1
1434
+ fgoto expr_arg; };
1435
+
1436
+ c_eof => do_eof;
1437
+ *|;
1438
+
1295
1439
  # The rationale for this state is pretty complex. Normally, if an argument
1296
1440
  # is passed to a command and then there is a block (tLCURLY...tRCURLY),
1297
1441
  # the block is attached to the innermost argument (`f` in `m f {}`), or it
@@ -1313,8 +1457,8 @@ class Parser::Lexer
1313
1457
  fnext expr_value; };
1314
1458
 
1315
1459
  'do'
1316
- => { emit(:kDO_BLOCK)
1317
- fnext expr_value; };
1460
+ => { emit_do(true)
1461
+ fnext expr_value; fbreak; };
1318
1462
 
1319
1463
  w_space_comment;
1320
1464
 
@@ -1334,6 +1478,9 @@ class Parser::Lexer
1334
1478
  => { emit_table(KEYWORDS)
1335
1479
  fnext expr_beg; fbreak; };
1336
1480
 
1481
+ bareword
1482
+ => { p = @ts - 1; fgoto expr_beg; };
1483
+
1337
1484
  w_space_comment;
1338
1485
 
1339
1486
  w_newline
@@ -1383,7 +1530,7 @@ class Parser::Lexer
1383
1530
  };
1384
1531
 
1385
1532
  # %<string>
1386
- '%' ( c_any - [A-Za-z] )
1533
+ '%' ( any - [A-Za-z] )
1387
1534
  => {
1388
1535
  type, delimiter = tok[0].chr, tok[-1].chr
1389
1536
  fgoto *push_literal(type, delimiter, @ts);
@@ -1517,8 +1664,9 @@ class Parser::Lexer
1517
1664
 
1518
1665
  # rescue Exception => e: Block rescue.
1519
1666
  # Special because it should transition to expr_mid.
1520
- 'rescue'
1521
- => { emit_table(KEYWORDS_BEGIN)
1667
+ 'rescue' %{ tm = p } '=>'?
1668
+ => { emit_table(KEYWORDS_BEGIN, @ts, tm)
1669
+ p = tm - 1
1522
1670
  fnext expr_mid; fbreak; };
1523
1671
 
1524
1672
  # if a: Statement if.
@@ -1535,8 +1683,17 @@ class Parser::Lexer
1535
1683
  fhold;
1536
1684
 
1537
1685
  if version?(18)
1538
- emit(:tIDENTIFIER, tok(@ts, @te - 2), @ts, @te - 2)
1686
+ ident = tok(@ts, @te - 2)
1687
+
1688
+ emit((tok[0] =~ /[A-Z]/) ? :tCONSTANT : :tIDENTIFIER,
1689
+ ident, @ts, @te - 2)
1539
1690
  fhold; # continue as a symbol
1691
+
1692
+ if !@static_env.nil? && @static_env.declared?(ident)
1693
+ fnext expr_end;
1694
+ else
1695
+ fnext *arg_or_cmdarg;
1696
+ end
1540
1697
  else
1541
1698
  emit(:tLABEL, tok(@ts, @te - 2), @ts, @te - 1)
1542
1699
  end
@@ -1557,7 +1714,8 @@ class Parser::Lexer
1557
1714
 
1558
1715
  # a = 42; a [42]: Indexing.
1559
1716
  # def a; end; a [42]: Array argument.
1560
- call_or_var => local_ident;
1717
+ call_or_var
1718
+ => local_ident;
1561
1719
 
1562
1720
  #
1563
1721
  # WHITESPACE
@@ -1565,7 +1723,7 @@ class Parser::Lexer
1565
1723
 
1566
1724
  w_any;
1567
1725
 
1568
- e_heredoc_nl '=begin' ( c_space | c_eol )
1726
+ e_heredoc_nl '=begin' ( c_space | c_nl_zlen )
1569
1727
  => { p = @ts - 1
1570
1728
  fgoto line_begin; };
1571
1729
 
@@ -1594,6 +1752,9 @@ class Parser::Lexer
1594
1752
 
1595
1753
  w_space_comment;
1596
1754
 
1755
+ w_newline
1756
+ => { fgoto line_begin; };
1757
+
1597
1758
  c_any
1598
1759
  => { fhold; fgoto expr_beg; };
1599
1760
 
@@ -1627,13 +1788,7 @@ class Parser::Lexer
1627
1788
  if tok == '{'
1628
1789
  emit_table(PUNCTUATION)
1629
1790
  else # 'do'
1630
- if @cond.active?
1631
- emit(:kDO_COND)
1632
- elsif @cmdarg.active?
1633
- emit(:kDO_BLOCK)
1634
- else
1635
- emit(:kDO)
1636
- end
1791
+ emit_do
1637
1792
  end
1638
1793
  end
1639
1794
 
@@ -1682,6 +1837,12 @@ class Parser::Lexer
1682
1837
  => {
1683
1838
  if version?(18)
1684
1839
  emit(:tIDENTIFIER)
1840
+
1841
+ if !@static_env.nil? && @static_env.declared?(tok)
1842
+ fnext expr_end;
1843
+ else
1844
+ fnext *arg_or_cmdarg;
1845
+ end
1685
1846
  else
1686
1847
  emit_table(KEYWORDS)
1687
1848
  end
@@ -1707,20 +1868,16 @@ class Parser::Lexer
1707
1868
  | [1-9] digit*
1708
1869
  %{ @num_base = 10; @num_digits_s = @ts }
1709
1870
  ( '_' digit+ )* digit* '_'?
1710
- | '0' %{ @num_base = 8; @num_digits_s = @ts }
1871
+ | '0' digit*
1872
+ %{ @num_base = 8; @num_digits_s = @ts }
1711
1873
  ( '_' digit+ )* digit* '_'?
1712
- ) %{ tm = p } c_alpha?
1874
+ )
1713
1875
  => {
1714
- unless (char = tok(tm, @te)).empty?
1715
- diagnostic :fatal, Parser::ERRORS[:unexpected] % { :character => char },
1716
- range(tm, tm + 1)
1717
- end
1718
-
1719
- digits = tok(@num_digits_s, tm)
1876
+ digits = tok(@num_digits_s)
1720
1877
 
1721
1878
  if digits.end_with? '_'
1722
- diagnostic :error, Parser::ERRORS[:trailing_underscore],
1723
- range(tm - 1, tm)
1879
+ diagnostic :error, Parser::ERRORS[:trailing_in_number] % { :character => '_' },
1880
+ range(@te - 1, @te)
1724
1881
  elsif digits.empty? && @num_base == 8 && version?(18)
1725
1882
  # 1.8 did not raise an error on 0o.
1726
1883
  digits = "0"
@@ -1732,39 +1889,27 @@ class Parser::Lexer
1732
1889
  range(invalid_s, invalid_s + 1)
1733
1890
  end
1734
1891
 
1735
- emit(:tINTEGER, digits.to_i(@num_base), @ts, tm)
1736
- p = tm - 1
1892
+ emit(:tINTEGER, digits.to_i(@num_base))
1737
1893
  fbreak;
1738
1894
  };
1739
1895
 
1740
- # Floating point literals cannot start with 0 except when a dot
1741
- # follows immediately, probably to avoid confusion with octal literals.
1742
- ( [1-9] [0-9]* ( '_' digit+ )* |
1743
- '0'
1744
- )?
1745
- (
1746
- '.' ( digit+ '_' )* digit+ |
1747
- ( '.' ( digit+ '_' )* digit+ )? [eE] [+\-]? ( digit+ '_' )* digit+
1748
- ) %{ tm = p } c_alpha?
1896
+ '.' ( digit+ '_' )* digit+
1749
1897
  => {
1750
- unless (char = tok(tm, @te)).empty?
1751
- diagnostic :fatal, Parser::ERRORS[:unexpected] % { :character => char },
1752
- range(tm, tm + 1)
1753
- end
1754
-
1755
- digits = tok(@ts, tm)
1898
+ diagnostic :error, Parser::ERRORS[:no_dot_digit_literal]
1899
+ };
1756
1900
 
1757
- if digits.start_with? '.'
1758
- diagnostic :error, Parser::ERRORS[:no_dot_digit_literal]
1759
- elsif digits =~ /^[eE]/
1760
- # The rule above allows to specify floats as just `e10', which is
1761
- # certainly not a float. Send a patch if you can do this better.
1762
- emit(:tIDENTIFIER, digits, @ts, tm)
1763
- fbreak;
1901
+ (
1902
+ ( [1-9] [0-9]* ( '_' digit+ )* | '0' )
1903
+ ( '.' ( digit+ '_' )* digit+ )?
1904
+ ( [eE] [+\-]? ( digit+ '_' )* digit* )?
1905
+ )
1906
+ => {
1907
+ if tok.end_with? 'e'
1908
+ diagnostic :error, Parser::ERRORS[:trailing_in_number] % { :character => 'e' },
1909
+ range(@te - 1, @te)
1764
1910
  end
1765
1911
 
1766
- emit(:tFLOAT, digits.to_f, @ts, tm)
1767
- p = tm - 1
1912
+ emit(:tFLOAT, tok.to_f)
1768
1913
  fbreak;
1769
1914
  };
1770
1915
 
@@ -1785,7 +1930,7 @@ class Parser::Lexer
1785
1930
 
1786
1931
  constant
1787
1932
  => { emit(:tCONSTANT)
1788
- fnext expr_arg; fbreak; };
1933
+ fnext *arg_or_cmdarg; fbreak; };
1789
1934
 
1790
1935
  constant ambiguous_const_suffix
1791
1936
  => { emit(:tCONSTANT, tok(@ts, tm), @ts, tm)
@@ -1802,9 +1947,10 @@ class Parser::Lexer
1802
1947
  => { emit_table(PUNCTUATION)
1803
1948
  fnext expr_dot; fbreak; };
1804
1949
 
1805
- call_or_var => local_ident;
1950
+ call_or_var
1951
+ => local_ident;
1806
1952
 
1807
- call_or_var ambiguous_fid_suffix
1953
+ bareword ambiguous_fid_suffix
1808
1954
  => { emit(:tFID, tok(@ts, tm), @ts, tm)
1809
1955
  p = tm - 1
1810
1956
  fnext expr_arg; fbreak; };
@@ -1821,9 +1967,18 @@ class Parser::Lexer
1821
1967
  fnext expr_beg; fbreak; };
1822
1968
 
1823
1969
  e_rbrace | e_rparen | ']'
1824
- => { emit_table(PUNCTUATION)
1825
- @cond.lexpop; @cmdarg.lexpop
1826
- fbreak; };
1970
+ => {
1971
+ emit_table(PUNCTUATION)
1972
+ @cond.lexpop; @cmdarg.lexpop
1973
+
1974
+ if %w"} ]".include?(tok)
1975
+ fnext expr_endarg;
1976
+ else # )
1977
+ # fnext expr_endfn; ?
1978
+ end
1979
+
1980
+ fbreak;
1981
+ };
1827
1982
 
1828
1983
  operator_arithmetic '='
1829
1984
  => { emit(:tOP_ASGN, tok(@ts, @te - 1))
@@ -1887,15 +2042,15 @@ class Parser::Lexer
1887
2042
  #
1888
2043
 
1889
2044
  line_comment := |*
1890
- '=end' c_line* c_nl?
2045
+ '=end' c_line* c_nl_zlen
1891
2046
  => {
1892
2047
  emit_comment(@eq_begin_s, @te)
1893
2048
  fgoto line_begin;
1894
2049
  };
1895
2050
 
1896
- c_any;
2051
+ c_line* c_nl;
1897
2052
 
1898
- c_eof
2053
+ c_line* zlen
1899
2054
  => {
1900
2055
  diagnostic :fatal, Parser::ERRORS[:embedded_document],
1901
2056
  range(@eq_begin_s, @eq_begin_s + '=begin'.length)
@@ -1905,11 +2060,11 @@ class Parser::Lexer
1905
2060
  line_begin := |*
1906
2061
  w_any;
1907
2062
 
1908
- '=begin' ( c_space | c_eol )
2063
+ '=begin' ( c_space | c_nl_zlen )
1909
2064
  => { @eq_begin_s = @ts
1910
2065
  fgoto line_comment; };
1911
2066
 
1912
- '__END__' c_eol
2067
+ '__END__' c_nl_zlen
1913
2068
  => { p = pe - 1 };
1914
2069
 
1915
2070
  c_any