parser 2.0.0.pre2 → 2.0.0.pre3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,13 +1,35 @@
1
1
  module Parser
2
2
 
3
+ ##
4
+ # Default AST builder. Uses {AST::Node}s.
5
+ #
3
6
  class Builders::Default
7
+ ##
8
+ # @api private
4
9
  attr_accessor :parser
10
+
11
+ ##
12
+ # If set to true, `__FILE__` and `__LINE__` are transformed to
13
+ # literal nodes. For example, `s(:str, "lib/foo.rb")` and `s(:int, 10)`.
14
+ #
15
+ # If set to false, `__FILE__` and `__LINE__` are emitted as-is,
16
+ # i.e. as `s(:__FILE__)` and `s(:__LINE__)` nodes.
17
+ #
18
+ # Source maps are identical in both cases.
19
+ #
20
+ # @return [TrueClass|FalseClass]
5
21
  attr_accessor :emit_file_line_as_literals
6
22
 
23
+ ##
24
+ # Initializes attributes:
25
+ #
26
+ # * `emit_file_line_as_literals`: `true`
7
27
  def initialize
8
28
  @emit_file_line_as_literals = true
9
29
  end
10
30
 
31
+ # @!parse private
32
+
11
33
  #
12
34
  # Literals
13
35
  #
@@ -359,6 +381,10 @@ module Parser
359
381
  end
360
382
  end
361
383
 
384
+ def const_op_assignable(node)
385
+ node.updated(:casgn)
386
+ end
387
+
362
388
  def assign(lhs, eql_t, rhs)
363
389
  (lhs << rhs).updated(nil, nil,
364
390
  :location => lhs.loc.
@@ -462,8 +488,9 @@ module Parser
462
488
  # Formal arguments
463
489
  #
464
490
 
465
- def args(begin_t, args, end_t)
466
- n(:args, [ *check_duplicate_args(args) ],
491
+ def args(begin_t, args, end_t, check_args=true)
492
+ args = check_duplicate_args(args) if check_args
493
+ n(:args, args,
467
494
  collection_map(begin_t, args, end_t))
468
495
  end
469
496
 
@@ -636,8 +663,24 @@ module Parser
636
663
  receiver.children.count == 2 &&
637
664
  receiver.children.first.type == :str
638
665
 
639
- regexp_str, _regopt = *receiver
640
- regexp_body, = *regexp_str
666
+ str_node, opt_node = *receiver
667
+ regexp_body, = *str_node
668
+ *regexp_opt = *opt_node
669
+
670
+ if defined?(Encoding)
671
+ regexp_body = case
672
+ when regexp_opt.include?(:u)
673
+ regexp_body.encode(Encoding::UTF_8)
674
+ when regexp_opt.include?(:e)
675
+ regexp_body.encode(Encoding::EUC_JP)
676
+ when regexp_opt.include?(:s)
677
+ regexp_body.encode(Encoding::WINDOWS_31J)
678
+ when regexp_opt.include?(:n)
679
+ regexp_body.encode(Encoding::BINARY)
680
+ else
681
+ regexp_body
682
+ end
683
+ end
641
684
 
642
685
  Regexp.new(regexp_body).names.each do |name|
643
686
  @parser.static_env.declare(name)
@@ -880,10 +923,15 @@ module Parser
880
923
  when :erange then :eflipflop
881
924
  end
882
925
 
883
- cond.updated(type, [
884
- check_condition(lhs),
885
- check_condition(rhs)
886
- ])
926
+ if [:and, :or].include?(cond.type) &&
927
+ @parser.version == 18
928
+ cond
929
+ else
930
+ cond.updated(type, [
931
+ check_condition(lhs),
932
+ check_condition(rhs)
933
+ ])
934
+ end
887
935
 
888
936
  when :regexp
889
937
  n(:match_current_line, [ cond ], nil)
@@ -1230,7 +1278,11 @@ module Parser
1230
1278
  end
1231
1279
 
1232
1280
  if else_t
1233
- end_l = else_e.loc.expression
1281
+ if else_e.nil?
1282
+ end_l = loc(else_t)
1283
+ else
1284
+ end_l = else_e.loc.expression
1285
+ end
1234
1286
  elsif !body_es.last.nil?
1235
1287
  end_l = body_es.last.loc.expression
1236
1288
  else
@@ -1,3 +1,10 @@
1
+ ##
2
+ # @api public
3
+ #
4
+ # This monkeypatch extends Ruby 1.8 {String#%} with the ability
5
+ # to substitute named references, e.g.
6
+ # `"foo: %{bar}" % { :bar => 10 } # => "foo: 10"`.
7
+ #
1
8
  class String
2
9
  alias original_percent %
3
10
 
@@ -1,17 +1,22 @@
1
1
  module Parser
2
2
 
3
3
  ##
4
+ # @api public
5
+ #
4
6
  # @!attribute [r] level
5
- # @return [Symbol]
7
+ # @see LEVELS
8
+ # @return [Symbol] diagnostic level
6
9
  #
7
10
  # @!attribute [r] message
8
- # @return [String]
11
+ # @return [String] error message
9
12
  #
10
13
  # @!attribute [r] location
11
- # @return [Parser::Source::Map]
14
+ # Main error-related source range.
15
+ # @return [Parser::Source::Range]
12
16
  #
13
17
  # @!attribute [r] highlights
14
- # @return [Array]
18
+ # Supplementary error-related source ranges.
19
+ # @return [Array<Parser::Source::Range>]
15
20
  #
16
21
  class Diagnostic
17
22
  ##
@@ -46,7 +51,15 @@ module Parser
46
51
  end
47
52
 
48
53
  ##
49
- # Renders the diagnostic message as an array of three lines.
54
+ # Renders the diagnostic message as a clang-like diagnostic.
55
+ #
56
+ # @example
57
+ # diagnostic.render # =>
58
+ # # [
59
+ # # "(fragment:0):1:5: error: unexpected token $end",
60
+ # # "foo +",
61
+ # # " ^"
62
+ # # ]
50
63
  #
51
64
  # @return [Array<String>]
52
65
  #
@@ -4,22 +4,23 @@ module Parser
4
4
  # {Parser::Diagnostic::Engine} provides a basic API for dealing with
5
5
  # diagnostics by delegating them to registered consumers.
6
6
  #
7
- # Basic usage is as following:
7
+ # @example
8
+ # buffer = Parser::Source::Buffer.new(__FILE__)
9
+ # buffer.code = 'foobar'
8
10
  #
9
- # buffer = Parser::Source::Buffer.new(__FILE__)
10
- # buffer.code = 'foobar'
11
+ # consumer = lambda do |diagnostic|
12
+ # puts diagnostic.message
13
+ # end
11
14
  #
12
- # consumer = lambda do |diagnostic|
13
- # puts diagnostic.message
14
- # end
15
+ # engine = Parser::Diagnostic::Engine.new(consumer)
16
+ # diagnostic = Parser::Diagnostic.new(:warning, 'warning!', buffer, 1..2)
15
17
  #
16
- # engine = Parser::Diagnostic::Engine.new(consumer)
17
- # diagnostic = Parser::Diagnostic.new(:warning, 'warning!', buffer, 1..2)
18
+ # engine.process(diagnostic) # => "warning!"
18
19
  #
19
- # engine.process(diagnostic) # => "warning!"
20
+ # @api public
20
21
  #
21
22
  # @!attribute [rw] consumer
22
- # @return [#call]
23
+ # @return [#call(Diagnostic)]
23
24
  #
24
25
  # @!attribute [rw] all_errors_are_fatal
25
26
  # When set to `true` any error that is encountered will result in
@@ -37,7 +38,7 @@ module Parser
37
38
  attr_accessor :ignore_warnings
38
39
 
39
40
  ##
40
- # @param [#call] consumer
41
+ # @param [#call(Diagnostic)] consumer
41
42
  #
42
43
  def initialize(consumer=nil)
43
44
  @consumer = consumer
@@ -82,6 +82,7 @@ class Parser::Lexer
82
82
  # %
83
83
 
84
84
  attr_reader :source_buffer
85
+ attr_reader :encoding
85
86
 
86
87
  attr_accessor :diagnostics
87
88
  attr_accessor :static_env
@@ -110,6 +111,10 @@ class Parser::Lexer
110
111
  @cmdarg = StackState.new('cmdarg')
111
112
  end
112
113
 
114
+ @source = nil # source string
115
+ @source_pts = nil # @source as a codepoint array
116
+ @encoding = nil # target encoding for output strings
117
+
113
118
  @p = 0 # stream position (saved manually in #advance)
114
119
  @ts = nil # token start
115
120
  @te = nil # token end
@@ -141,27 +146,51 @@ class Parser::Lexer
141
146
  # encountered after a matching closing parenthesis.
142
147
  @paren_nest = 0
143
148
  @lambda_stack = []
149
+
150
+ # If the lexer is in `command state' (aka expr_value)
151
+ # at the entry to #advance, it will transition to expr_cmdarg
152
+ # instead of expr_arg at certain points.
153
+ @command_state = false
144
154
  end
145
155
 
146
156
  def source_buffer=(source_buffer)
147
157
  @source_buffer = source_buffer
148
158
 
149
159
  if @source_buffer
150
- # Heredoc processing coupled with weird newline quirks
151
- # require three '\0' (EOF) chars to be appended; after
152
- # `p = @heredoc_s`, if `p` points at EOF, the FSM could
153
- # not bail out early enough and will crash.
154
- #
155
- # Patches accepted.
156
- #
157
- @source = @source_buffer.source + "\0\0\0"
160
+ @source = @source_buffer.source + "\0"
158
161
 
159
- if @source.length > 0 && @source[0].ord == 0xfeff
162
+ if defined?(Encoding) && @source.encoding == Encoding::UTF_8
163
+ @source_pts = @source.unpack('U*')
164
+ else
165
+ @source_pts = @source.unpack('C*')
166
+ end
167
+
168
+ if defined?(Encoding)
169
+ @encoding = @source.encoding
170
+ end
171
+
172
+ if @source_pts.size > 1_000_000 && @source.respond_to?(:encode)
173
+ # A heuristic: if the buffer is larger than 1M, then
174
+ # store it in UTF-32 and convert the tokens as they're
175
+ # going out. If it's smaller, the conversion overhead
176
+ # dominates runtime and this stops being beneficial.
177
+ #
178
+ # This is not really a good heuristic, as the result
179
+ # heavily depends on token/character ratio. If it's low,
180
+ # say the gem consists mostly of long identifiers and
181
+ # symbols, then storing the source in UTF-8 would be faster.
182
+ #
183
+ # Patches accepted.
184
+ @source = @source.encode(Encoding::UTF_32LE)
185
+ end
186
+
187
+ if @source_pts[0] == 0xfeff
160
188
  # Skip byte order mark.
161
189
  @p = 1
162
190
  end
163
191
  else
164
- @source = nil
192
+ @source = nil
193
+ @source_pts = nil
165
194
  end
166
195
  end
167
196
 
@@ -173,9 +202,15 @@ class Parser::Lexer
173
202
  :expr_beg => lex_en_expr_beg,
174
203
  :expr_mid => lex_en_expr_mid,
175
204
  :expr_arg => lex_en_expr_arg,
205
+ :expr_cmdarg => lex_en_expr_cmdarg,
176
206
  :expr_end => lex_en_expr_end,
177
207
  :expr_endarg => lex_en_expr_endarg,
178
208
  :expr_endfn => lex_en_expr_endfn,
209
+
210
+ :interp_string => lex_en_interp_string,
211
+ :interp_words => lex_en_interp_words,
212
+ :plain_string => lex_en_plain_string,
213
+ :plain_words => lex_en_plain_string,
179
214
  }
180
215
 
181
216
  def state
@@ -204,8 +239,12 @@ class Parser::Lexer
204
239
  _lex_trans_actions = self.class.send :_lex_trans_actions
205
240
  _lex_to_state_actions = self.class.send :_lex_to_state_actions
206
241
  _lex_from_state_actions = self.class.send :_lex_from_state_actions
242
+ _lex_eof_trans = self.class.send :_lex_eof_trans
243
+
244
+ p, pe, eof = @p, @source.length + 1, @source.length + 1
207
245
 
208
- p, pe, eof = @p, @source.length + 1, nil
246
+ @command_state = (@cs == self.class.lex_en_expr_value ||
247
+ @cs == self.class.lex_en_line_begin)
209
248
 
210
249
  %% write exec;
211
250
  # %
@@ -223,8 +262,8 @@ class Parser::Lexer
223
262
 
224
263
  protected
225
264
 
226
- def eof_char?(char)
227
- [0x04, 0x1a, 0x00].include? char.ord
265
+ def eof_codepoint?(point)
266
+ [0x04, 0x1a, 0x00].include? point
228
267
  end
229
268
 
230
269
  def version?(*versions)
@@ -236,8 +275,22 @@ class Parser::Lexer
236
275
  @stack[@top]
237
276
  end
238
277
 
239
- def tok(s = @ts, e = @te)
240
- @source[s...e]
278
+ if "".respond_to?(:encode)
279
+ def encode_escape(ord)
280
+ ord.chr.force_encoding(@encoding)
281
+ end
282
+
283
+ def tok(s = @ts, e = @te)
284
+ @source[s...e].encode(@encoding)
285
+ end
286
+ else
287
+ def encode_escape(ord)
288
+ ord.chr
289
+ end
290
+
291
+ def tok(s = @ts, e = @te)
292
+ @source[s...e]
293
+ end
241
294
  end
242
295
 
243
296
  def range(s = @ts, e = @te)
@@ -260,6 +313,24 @@ class Parser::Lexer
260
313
  emit(table[value], value, s, e)
261
314
  end
262
315
 
316
+ def emit_do(do_block=false)
317
+ if @cond.active?
318
+ emit(:kDO_COND)
319
+ elsif @cmdarg.active? || do_block
320
+ emit(:kDO_BLOCK)
321
+ else
322
+ emit(:kDO)
323
+ end
324
+ end
325
+
326
+ def arg_or_cmdarg
327
+ if @command_state
328
+ self.class.lex_en_expr_cmdarg
329
+ else
330
+ self.class.lex_en_expr_arg
331
+ end
332
+ end
333
+
263
334
  def emit_comment(s = @ts, e = @te)
264
335
  if @comments
265
336
  @comments.push(Parser::Source::Comment.new(range(s, e)))
@@ -351,9 +422,9 @@ class Parser::Lexer
351
422
  }
352
423
 
353
424
  KEYWORDS_BEGIN = {
354
- 'if' => :kIF, 'unless' => :kUNLESS,
355
- 'while' => :kWHILE, 'until' => :kUNTIL,
356
- 'rescue' => :kRESCUE
425
+ 'if' => :kIF, 'unless' => :kUNLESS,
426
+ 'while' => :kWHILE, 'until' => :kUNTIL,
427
+ 'rescue' => :kRESCUE, 'defined?' => :kDEFINED,
357
428
  }
358
429
 
359
430
  %w(class module def undef begin end then elsif else ensure case when
@@ -366,7 +437,7 @@ class Parser::Lexer
366
437
  # %
367
438
 
368
439
  access @;
369
- getkey @source[p].ord;
440
+ getkey (@source_pts[p] || 0);
370
441
 
371
442
  # === CHARACTER CLASSES ===
372
443
  #
@@ -384,13 +455,16 @@ class Parser::Lexer
384
455
  @newline_s = p
385
456
  }
386
457
 
387
- c_nl = '\r'? '\n' $ do_nl;
458
+ c_nl = '\n' $ do_nl;
388
459
  c_space = [ \t\r\f\v];
389
460
  c_space_nl = c_space | c_nl;
390
- c_eof = 0x04 | 0x1a | 0; # ^D, ^Z, EOF
461
+
462
+ c_eof = 0x04 | 0x1a | 0 | zlen; # ^D, ^Z, \0, EOF
391
463
  c_eol = c_nl | c_eof;
392
- c_any = any - c_eof - zlen;
393
- c_line = c_any - c_nl;
464
+ c_any = any - c_eof;
465
+
466
+ c_nl_zlen = c_nl | zlen;
467
+ c_line = any - c_nl_zlen;
394
468
 
395
469
  c_unicode = c_any - 0x00..0x7f;
396
470
  c_upper = [A-Z];
@@ -403,7 +477,7 @@ class Parser::Lexer
403
477
  # This allows feeding the lexer more data if needed; this is only used
404
478
  # in tests.
405
479
  #
406
- # Note that this action is not embedded into e_eof like e_nl and e_bs
480
+ # Note that this action is not embedded into e_eof like e_heredoc_nl and e_bs
407
481
  # below. This is due to the fact that scanner state at EOF is observed
408
482
  # by tests, and encapsulating it in a rule would break the introspection.
409
483
  fhold; fbreak;
@@ -524,7 +598,7 @@ class Parser::Lexer
524
598
  break
525
599
  end
526
600
 
527
- @escape += codepoint.chr(Encoding::UTF_8)
601
+ @escape += codepoint.chr(Encoding::UTF_8)
528
602
  codepoint_s += codepoint_str.length + 1
529
603
  end
530
604
  }
@@ -544,11 +618,11 @@ class Parser::Lexer
544
618
  }
545
619
 
546
620
  action slash_c_char {
547
- @escape = (@escape[0].ord & 0x9f).chr
621
+ @escape = encode_escape(@escape[0].ord & 0x9f)
548
622
  }
549
623
 
550
624
  action slash_m_char {
551
- @escape = (@escape[0].ord | 0x80).chr
625
+ @escape = encode_escape(@escape[0].ord | 0x80)
552
626
  }
553
627
 
554
628
  maybe_escaped_char = (
@@ -565,11 +639,11 @@ class Parser::Lexer
565
639
  escape = (
566
640
  # \377
567
641
  [0-7]{1,3}
568
- % { @escape = tok(@escape_s, p).to_i(8).chr }
642
+ % { @escape = encode_escape(tok(@escape_s, p).to_i(8) % 0x100) }
569
643
 
570
644
  # \xff
571
645
  | ( 'x' xdigit{1,2}
572
- % { @escape = tok(@escape_s + 1, p).to_i(16).chr }
646
+ % { @escape = encode_escape(tok(@escape_s + 1, p).to_i(16)) }
573
647
  # \u263a
574
648
  | 'u' xdigit{4}
575
649
  % { @escape = tok(@escape_s + 1, p).to_i(16).chr(Encoding::UTF_8) }
@@ -586,10 +660,10 @@ class Parser::Lexer
586
660
 
587
661
  # %q[\u123] %q[\u{12]
588
662
  | 'u' ( c_any{0,4} -
589
- xdigit{4} - # \u1234 is valid
590
- ( '{' xdigit{1,3} # \u{1 \u{12 \u{123 are valid
591
- | '{' xdigit [ \t}] # \u{1. \u{1} are valid
592
- | '{' xdigit{2} [ \t}] # \u{12. \u{12} are valid
663
+ xdigit{4} - # \u1234 is valid
664
+ ( '{' xdigit{1,3} # \u{1 \u{12 \u{123 are valid
665
+ | '{' xdigit [ \t}] any # \u{1. \u{1} are valid
666
+ | '{' xdigit{2} [ \t}] # \u{12. \u{12} are valid
593
667
  )
594
668
  )
595
669
  % {
@@ -631,7 +705,7 @@ class Parser::Lexer
631
705
 
632
706
  | 'C' c_any %invalid_complex_escape
633
707
  | 'M' c_any %invalid_complex_escape
634
- | ( 'M-\\C' | 'C-\\M' | 'cM' ) c_any %invalid_complex_escape
708
+ | ( 'M-\\C' | 'C-\\M' ) c_any %invalid_complex_escape
635
709
 
636
710
  | ( c_any - [0-7xuCMc] ) %unescape_char
637
711
 
@@ -692,10 +766,12 @@ class Parser::Lexer
692
766
  };
693
767
 
694
768
  action extend_string {
695
- if !literal.heredoc? && literal.nest_and_try_closing(tok, @ts, @te)
769
+ string = @source[@ts...@te]
770
+
771
+ if !literal.heredoc? && literal.nest_and_try_closing(string, @ts, @te)
696
772
  fnext *pop_literal; fbreak;
697
773
  else
698
- literal.extend_string(tok, @ts, @te)
774
+ literal.extend_string(string, @ts, @te)
699
775
  end
700
776
  }
701
777
 
@@ -748,35 +824,56 @@ class Parser::Lexer
748
824
  # As heredoc closing line can immediately precede EOF, this action
749
825
  # has to handle such case specially.
750
826
  action extend_string_eol {
751
- is_eof = eof_char? @source[p]
827
+ if @te == pe
828
+ diagnostic :fatal, Parser::ERRORS[:string_eof],
829
+ range(literal.str_s, literal.str_s + 1)
830
+ end
752
831
 
753
832
  if literal.heredoc?
833
+ line = tok(@herebody_s, @ts).gsub(/\r+$/, '')
834
+
754
835
  # Try ending the heredoc with the complete most recently
755
836
  # scanned line. @herebody_s always refers to the start of such line.
756
- if literal.nest_and_try_closing(tok(@herebody_s, @ts),
757
- @herebody_s, @ts)
837
+ if literal.nest_and_try_closing(line, @herebody_s, @ts)
758
838
  # Adjust @herebody_s to point to the next line.
759
839
  @herebody_s = @te
760
840
 
761
841
  # Continue regular lexing after the heredoc reference (<<END).
762
842
  p = literal.heredoc_e - 1
763
- fgoto *pop_literal;
843
+ fnext *pop_literal; fbreak;
764
844
  else
765
845
  # Ditto.
766
846
  @herebody_s = @te
767
847
  end
768
- end
848
+ else
849
+ # Try ending the literal with a newline.
850
+ if literal.nest_and_try_closing(tok, @ts, @te)
851
+ fnext *pop_literal; fbreak;
852
+ end
769
853
 
770
- if is_eof
771
- diagnostic :fatal, Parser::ERRORS[:string_eof],
772
- range(literal.str_s, literal.str_s + 1)
854
+ if @herebody_s
855
+ # This is a regular literal intertwined with a heredoc. Like:
856
+ #
857
+ # p <<-foo+"1
858
+ # bar
859
+ # foo
860
+ # 2"
861
+ #
862
+ # which, incidentally, evaluates to "bar\n12".
863
+ p = @herebody_s - 1
864
+ @herebody_s = nil
865
+ end
773
866
  end
774
867
 
775
- # A literal newline is appended if the heredoc was _not_ closed
776
- # this time. See also Literal#nest_and_try_closing for rationale of
777
- # calling #flush_string here.
778
- literal.extend_string tok, @ts, @te
779
- literal.flush_string
868
+ if literal.words? && !eof_codepoint?(@source_pts[p])
869
+ literal.extend_space @ts, @te
870
+ else
871
+ # A literal newline is appended if the heredoc was _not_ closed
872
+ # this time (see fbreak above). See also Literal#nest_and_try_closing
873
+ # for rationale of calling #flush_string here.
874
+ literal.extend_string tok, @ts, @te
875
+ literal.flush_string
876
+ end
780
877
  }
781
878
 
782
879
  action extend_string_space {
@@ -850,11 +947,13 @@ class Parser::Lexer
850
947
 
851
948
  emit(:tSTRING_DBEG, '#{')
852
949
 
853
- literal.saved_herebody_s = @herebody_s
854
- @herebody_s = nil
950
+ if literal.heredoc?
951
+ literal.saved_herebody_s = @herebody_s
952
+ @herebody_s = nil
953
+ end
855
954
 
856
955
  literal.start_interp_brace
857
- fcall expr_beg;
956
+ fcall expr_value;
858
957
  }
859
958
 
860
959
  # Actual string parsers are simply combined from the primitives defined
@@ -864,7 +963,7 @@ class Parser::Lexer
864
963
  interp_code => extend_interp_code;
865
964
  interp_var => extend_interp_var;
866
965
  e_bs escape => extend_string_escaped;
867
- c_space_nl+ => extend_string_space;
966
+ c_space+ => extend_string_space;
868
967
  c_eol => extend_string_eol;
869
968
  c_any => extend_string;
870
969
  *|;
@@ -879,7 +978,7 @@ class Parser::Lexer
879
978
 
880
979
  plain_words := |*
881
980
  e_bs c_any => extend_string_escaped;
882
- c_space_nl+ => extend_string_space;
981
+ c_space+ => extend_string_space;
883
982
  c_eol => extend_string_eol;
884
983
  c_any => extend_string;
885
984
  *|;
@@ -930,7 +1029,10 @@ class Parser::Lexer
930
1029
  ;
931
1030
 
932
1031
  w_comment =
933
- '#' %{ @sharp_s = p - 1 } c_line* %{ emit_comment(@sharp_s, p) }
1032
+ '#' %{ @sharp_s = p - 1 }
1033
+ # The (p == pe) condition compensates for added "\0" and
1034
+ # the way Ragel handles EOF.
1035
+ c_line* %{ emit_comment(@sharp_s, p == pe ? p - 2 : p) }
934
1036
  ;
935
1037
 
936
1038
  w_space_comment =
@@ -1014,15 +1116,14 @@ class Parser::Lexer
1014
1116
  @paren_nest -= 1
1015
1117
  };
1016
1118
 
1017
- # Ruby >=1.9.2 is context-sensitive wrt/ local identifiers.
1119
+ # Ruby is context-sensitive with respect to local identifiers.
1018
1120
  action local_ident {
1019
1121
  emit(:tIDENTIFIER)
1020
1122
 
1021
- if !version?(18) &&
1022
- !@static_env.nil? && @static_env.declared?(tok)
1123
+ if !@static_env.nil? && @static_env.declared?(tok)
1023
1124
  fnext expr_end; fbreak;
1024
1125
  else
1025
- fnext expr_arg; fbreak;
1126
+ fnext *arg_or_cmdarg; fbreak;
1026
1127
  end
1027
1128
  }
1028
1129
 
@@ -1140,15 +1241,15 @@ class Parser::Lexer
1140
1241
  expr_dot := |*
1141
1242
  constant
1142
1243
  => { emit(:tCONSTANT)
1143
- fnext expr_arg; fbreak; };
1244
+ fnext *arg_or_cmdarg; fbreak; };
1144
1245
 
1145
1246
  call_or_var
1146
1247
  => { emit(:tIDENTIFIER)
1147
- fnext expr_arg; fbreak; };
1248
+ fnext *arg_or_cmdarg; fbreak; };
1148
1249
 
1149
- call_or_var ambiguous_ident_suffix
1250
+ bareword ambiguous_fid_suffix
1150
1251
  => { emit(:tFID, tok(@ts, tm), @ts, tm)
1151
- fnext expr_arg; p = tm - 1; fbreak; };
1252
+ fnext *arg_or_cmdarg; p = tm - 1; fbreak; };
1152
1253
 
1153
1254
  # See the comment in `expr_fname`.
1154
1255
  operator_fname |
@@ -1176,8 +1277,15 @@ class Parser::Lexer
1176
1277
  # cmd (1 + 2)
1177
1278
  # See below the rationale about expr_endarg.
1178
1279
  w_space+ e_lparen
1179
- => { emit(:tLPAREN_ARG, '(', @te - 1, @te)
1180
- fnext expr_beg; fbreak; };
1280
+ => {
1281
+ if version?(18)
1282
+ emit(:tLPAREN2, '(', @te - 1, @te)
1283
+ fnext expr_value; fbreak;
1284
+ else
1285
+ emit(:tLPAREN_ARG, '(', @te - 1, @te)
1286
+ fnext expr_beg; fbreak;
1287
+ end
1288
+ };
1181
1289
 
1182
1290
  # meth(1 + 2)
1183
1291
  # Regular method call.
@@ -1210,13 +1318,12 @@ class Parser::Lexer
1210
1318
 
1211
1319
  # a ?b
1212
1320
  # Character literal.
1213
- w_space+ '?'
1321
+ w_space* '?'
1214
1322
  => { fhold; fgoto expr_beg; };
1215
1323
 
1216
1324
  # a %{1}, a %[1] (but not "a %=1=" or "a % foo")
1217
- w_space+ ( '%' [^= ]
1218
1325
  # a /foo/ (but not "a / foo" or "a /=foo")
1219
- | '/' ( c_any - c_space_nl - '=' )
1326
+ w_space+ ( [%/] ( c_any - c_space_nl - '=' ) # /
1220
1327
  # a <<HEREDOC
1221
1328
  | '<<'
1222
1329
  )
@@ -1263,14 +1370,14 @@ class Parser::Lexer
1263
1370
 
1264
1371
  # a ? b
1265
1372
  # Ternary operator.
1266
- w_space+ '?' c_space_nl
1267
- => { fhold; fhold; fgoto expr_end; };
1373
+ w_space+ %{ tm = p } '?' c_space_nl
1374
+ => { p = tm - 1; fgoto expr_end; };
1268
1375
 
1269
1376
  # x + 1: Binary operator or operator-assignment.
1270
1377
  w_space* operator_arithmetic
1271
1378
  ( '=' | c_space_nl )? |
1272
1379
  # x rescue y: Modifier keyword.
1273
- w_space+ keyword_modifier |
1380
+ w_space* keyword_modifier |
1274
1381
  # Miscellanea.
1275
1382
  w_space* punctuation_end
1276
1383
  => {
@@ -1292,6 +1399,43 @@ class Parser::Lexer
1292
1399
  c_eof => do_eof;
1293
1400
  *|;
1294
1401
 
1402
+ # The previous token was an identifier which was seen while in the
1403
+ # command mode (that is, the state at the beginning of #advance was
1404
+ # expr_value). This state is very similar to expr_arg, but disambiguates
1405
+ # two very rare and specific conditions:
1406
+ # * In 1.8 mode, "foo (lambda do end)".
1407
+ # * In 1.9+ mode, "f x: -> do foo do end end".
1408
+ expr_cmdarg := |*
1409
+ w_space+ e_lparen
1410
+ => {
1411
+ emit(:tLPAREN_ARG, '(', @te - 1, @te)
1412
+ if version?(18)
1413
+ fnext expr_value; fbreak;
1414
+ else
1415
+ fnext expr_beg; fbreak;
1416
+ end
1417
+ };
1418
+
1419
+ w_space* 'do'
1420
+ => {
1421
+ if @cond.active?
1422
+ emit(:kDO_COND, 'do', @te - 2, @te)
1423
+ else
1424
+ emit(:kDO, 'do', @te - 2, @te)
1425
+ end
1426
+ fnext expr_value; fbreak;
1427
+ };
1428
+
1429
+ c_any |
1430
+ # Disambiguate with the `do' rule above.
1431
+ w_space* bareword |
1432
+ w_space* label
1433
+ => { p = @ts - 1
1434
+ fgoto expr_arg; };
1435
+
1436
+ c_eof => do_eof;
1437
+ *|;
1438
+
1295
1439
  # The rationale for this state is pretty complex. Normally, if an argument
1296
1440
  # is passed to a command and then there is a block (tLCURLY...tRCURLY),
1297
1441
  # the block is attached to the innermost argument (`f` in `m f {}`), or it
@@ -1313,8 +1457,8 @@ class Parser::Lexer
1313
1457
  fnext expr_value; };
1314
1458
 
1315
1459
  'do'
1316
- => { emit(:kDO_BLOCK)
1317
- fnext expr_value; };
1460
+ => { emit_do(true)
1461
+ fnext expr_value; fbreak; };
1318
1462
 
1319
1463
  w_space_comment;
1320
1464
 
@@ -1334,6 +1478,9 @@ class Parser::Lexer
1334
1478
  => { emit_table(KEYWORDS)
1335
1479
  fnext expr_beg; fbreak; };
1336
1480
 
1481
+ bareword
1482
+ => { p = @ts - 1; fgoto expr_beg; };
1483
+
1337
1484
  w_space_comment;
1338
1485
 
1339
1486
  w_newline
@@ -1383,7 +1530,7 @@ class Parser::Lexer
1383
1530
  };
1384
1531
 
1385
1532
  # %<string>
1386
- '%' ( c_any - [A-Za-z] )
1533
+ '%' ( any - [A-Za-z] )
1387
1534
  => {
1388
1535
  type, delimiter = tok[0].chr, tok[-1].chr
1389
1536
  fgoto *push_literal(type, delimiter, @ts);
@@ -1517,8 +1664,9 @@ class Parser::Lexer
1517
1664
 
1518
1665
  # rescue Exception => e: Block rescue.
1519
1666
  # Special because it should transition to expr_mid.
1520
- 'rescue'
1521
- => { emit_table(KEYWORDS_BEGIN)
1667
+ 'rescue' %{ tm = p } '=>'?
1668
+ => { emit_table(KEYWORDS_BEGIN, @ts, tm)
1669
+ p = tm - 1
1522
1670
  fnext expr_mid; fbreak; };
1523
1671
 
1524
1672
  # if a: Statement if.
@@ -1535,8 +1683,17 @@ class Parser::Lexer
1535
1683
  fhold;
1536
1684
 
1537
1685
  if version?(18)
1538
- emit(:tIDENTIFIER, tok(@ts, @te - 2), @ts, @te - 2)
1686
+ ident = tok(@ts, @te - 2)
1687
+
1688
+ emit((tok[0] =~ /[A-Z]/) ? :tCONSTANT : :tIDENTIFIER,
1689
+ ident, @ts, @te - 2)
1539
1690
  fhold; # continue as a symbol
1691
+
1692
+ if !@static_env.nil? && @static_env.declared?(ident)
1693
+ fnext expr_end;
1694
+ else
1695
+ fnext *arg_or_cmdarg;
1696
+ end
1540
1697
  else
1541
1698
  emit(:tLABEL, tok(@ts, @te - 2), @ts, @te - 1)
1542
1699
  end
@@ -1557,7 +1714,8 @@ class Parser::Lexer
1557
1714
 
1558
1715
  # a = 42; a [42]: Indexing.
1559
1716
  # def a; end; a [42]: Array argument.
1560
- call_or_var => local_ident;
1717
+ call_or_var
1718
+ => local_ident;
1561
1719
 
1562
1720
  #
1563
1721
  # WHITESPACE
@@ -1565,7 +1723,7 @@ class Parser::Lexer
1565
1723
 
1566
1724
  w_any;
1567
1725
 
1568
- e_heredoc_nl '=begin' ( c_space | c_eol )
1726
+ e_heredoc_nl '=begin' ( c_space | c_nl_zlen )
1569
1727
  => { p = @ts - 1
1570
1728
  fgoto line_begin; };
1571
1729
 
@@ -1594,6 +1752,9 @@ class Parser::Lexer
1594
1752
 
1595
1753
  w_space_comment;
1596
1754
 
1755
+ w_newline
1756
+ => { fgoto line_begin; };
1757
+
1597
1758
  c_any
1598
1759
  => { fhold; fgoto expr_beg; };
1599
1760
 
@@ -1627,13 +1788,7 @@ class Parser::Lexer
1627
1788
  if tok == '{'
1628
1789
  emit_table(PUNCTUATION)
1629
1790
  else # 'do'
1630
- if @cond.active?
1631
- emit(:kDO_COND)
1632
- elsif @cmdarg.active?
1633
- emit(:kDO_BLOCK)
1634
- else
1635
- emit(:kDO)
1636
- end
1791
+ emit_do
1637
1792
  end
1638
1793
  end
1639
1794
 
@@ -1682,6 +1837,12 @@ class Parser::Lexer
1682
1837
  => {
1683
1838
  if version?(18)
1684
1839
  emit(:tIDENTIFIER)
1840
+
1841
+ if !@static_env.nil? && @static_env.declared?(tok)
1842
+ fnext expr_end;
1843
+ else
1844
+ fnext *arg_or_cmdarg;
1845
+ end
1685
1846
  else
1686
1847
  emit_table(KEYWORDS)
1687
1848
  end
@@ -1707,20 +1868,16 @@ class Parser::Lexer
1707
1868
  | [1-9] digit*
1708
1869
  %{ @num_base = 10; @num_digits_s = @ts }
1709
1870
  ( '_' digit+ )* digit* '_'?
1710
- | '0' %{ @num_base = 8; @num_digits_s = @ts }
1871
+ | '0' digit*
1872
+ %{ @num_base = 8; @num_digits_s = @ts }
1711
1873
  ( '_' digit+ )* digit* '_'?
1712
- ) %{ tm = p } c_alpha?
1874
+ )
1713
1875
  => {
1714
- unless (char = tok(tm, @te)).empty?
1715
- diagnostic :fatal, Parser::ERRORS[:unexpected] % { :character => char },
1716
- range(tm, tm + 1)
1717
- end
1718
-
1719
- digits = tok(@num_digits_s, tm)
1876
+ digits = tok(@num_digits_s)
1720
1877
 
1721
1878
  if digits.end_with? '_'
1722
- diagnostic :error, Parser::ERRORS[:trailing_underscore],
1723
- range(tm - 1, tm)
1879
+ diagnostic :error, Parser::ERRORS[:trailing_in_number] % { :character => '_' },
1880
+ range(@te - 1, @te)
1724
1881
  elsif digits.empty? && @num_base == 8 && version?(18)
1725
1882
  # 1.8 did not raise an error on 0o.
1726
1883
  digits = "0"
@@ -1732,39 +1889,27 @@ class Parser::Lexer
1732
1889
  range(invalid_s, invalid_s + 1)
1733
1890
  end
1734
1891
 
1735
- emit(:tINTEGER, digits.to_i(@num_base), @ts, tm)
1736
- p = tm - 1
1892
+ emit(:tINTEGER, digits.to_i(@num_base))
1737
1893
  fbreak;
1738
1894
  };
1739
1895
 
1740
- # Floating point literals cannot start with 0 except when a dot
1741
- # follows immediately, probably to avoid confusion with octal literals.
1742
- ( [1-9] [0-9]* ( '_' digit+ )* |
1743
- '0'
1744
- )?
1745
- (
1746
- '.' ( digit+ '_' )* digit+ |
1747
- ( '.' ( digit+ '_' )* digit+ )? [eE] [+\-]? ( digit+ '_' )* digit+
1748
- ) %{ tm = p } c_alpha?
1896
+ '.' ( digit+ '_' )* digit+
1749
1897
  => {
1750
- unless (char = tok(tm, @te)).empty?
1751
- diagnostic :fatal, Parser::ERRORS[:unexpected] % { :character => char },
1752
- range(tm, tm + 1)
1753
- end
1754
-
1755
- digits = tok(@ts, tm)
1898
+ diagnostic :error, Parser::ERRORS[:no_dot_digit_literal]
1899
+ };
1756
1900
 
1757
- if digits.start_with? '.'
1758
- diagnostic :error, Parser::ERRORS[:no_dot_digit_literal]
1759
- elsif digits =~ /^[eE]/
1760
- # The rule above allows to specify floats as just `e10', which is
1761
- # certainly not a float. Send a patch if you can do this better.
1762
- emit(:tIDENTIFIER, digits, @ts, tm)
1763
- fbreak;
1901
+ (
1902
+ ( [1-9] [0-9]* ( '_' digit+ )* | '0' )
1903
+ ( '.' ( digit+ '_' )* digit+ )?
1904
+ ( [eE] [+\-]? ( digit+ '_' )* digit* )?
1905
+ )
1906
+ => {
1907
+ if tok.end_with? 'e'
1908
+ diagnostic :error, Parser::ERRORS[:trailing_in_number] % { :character => 'e' },
1909
+ range(@te - 1, @te)
1764
1910
  end
1765
1911
 
1766
- emit(:tFLOAT, digits.to_f, @ts, tm)
1767
- p = tm - 1
1912
+ emit(:tFLOAT, tok.to_f)
1768
1913
  fbreak;
1769
1914
  };
1770
1915
 
@@ -1785,7 +1930,7 @@ class Parser::Lexer
1785
1930
 
1786
1931
  constant
1787
1932
  => { emit(:tCONSTANT)
1788
- fnext expr_arg; fbreak; };
1933
+ fnext *arg_or_cmdarg; fbreak; };
1789
1934
 
1790
1935
  constant ambiguous_const_suffix
1791
1936
  => { emit(:tCONSTANT, tok(@ts, tm), @ts, tm)
@@ -1802,9 +1947,10 @@ class Parser::Lexer
1802
1947
  => { emit_table(PUNCTUATION)
1803
1948
  fnext expr_dot; fbreak; };
1804
1949
 
1805
- call_or_var => local_ident;
1950
+ call_or_var
1951
+ => local_ident;
1806
1952
 
1807
- call_or_var ambiguous_fid_suffix
1953
+ bareword ambiguous_fid_suffix
1808
1954
  => { emit(:tFID, tok(@ts, tm), @ts, tm)
1809
1955
  p = tm - 1
1810
1956
  fnext expr_arg; fbreak; };
@@ -1821,9 +1967,18 @@ class Parser::Lexer
1821
1967
  fnext expr_beg; fbreak; };
1822
1968
 
1823
1969
  e_rbrace | e_rparen | ']'
1824
- => { emit_table(PUNCTUATION)
1825
- @cond.lexpop; @cmdarg.lexpop
1826
- fbreak; };
1970
+ => {
1971
+ emit_table(PUNCTUATION)
1972
+ @cond.lexpop; @cmdarg.lexpop
1973
+
1974
+ if %w"} ]".include?(tok)
1975
+ fnext expr_endarg;
1976
+ else # )
1977
+ # fnext expr_endfn; ?
1978
+ end
1979
+
1980
+ fbreak;
1981
+ };
1827
1982
 
1828
1983
  operator_arithmetic '='
1829
1984
  => { emit(:tOP_ASGN, tok(@ts, @te - 1))
@@ -1887,15 +2042,15 @@ class Parser::Lexer
1887
2042
  #
1888
2043
 
1889
2044
  line_comment := |*
1890
- '=end' c_line* c_nl?
2045
+ '=end' c_line* c_nl_zlen
1891
2046
  => {
1892
2047
  emit_comment(@eq_begin_s, @te)
1893
2048
  fgoto line_begin;
1894
2049
  };
1895
2050
 
1896
- c_any;
2051
+ c_line* c_nl;
1897
2052
 
1898
- c_eof
2053
+ c_line* zlen
1899
2054
  => {
1900
2055
  diagnostic :fatal, Parser::ERRORS[:embedded_document],
1901
2056
  range(@eq_begin_s, @eq_begin_s + '=begin'.length)
@@ -1905,11 +2060,11 @@ class Parser::Lexer
1905
2060
  line_begin := |*
1906
2061
  w_any;
1907
2062
 
1908
- '=begin' ( c_space | c_eol )
2063
+ '=begin' ( c_space | c_nl_zlen )
1909
2064
  => { @eq_begin_s = @ts
1910
2065
  fgoto line_comment; };
1911
2066
 
1912
- '__END__' c_eol
2067
+ '__END__' c_nl_zlen
1913
2068
  => { p = pe - 1 };
1914
2069
 
1915
2070
  c_any