ruby_parser 3.1.3 → 3.2.0

Sign up to get free protection for your applications and to get access to all the features.
data/lib/ruby_lexer.rb CHANGED
@@ -1,4 +1,4 @@
1
- # encoding: US-ASCII
1
+ # encoding: UTF-8
2
2
 
3
3
  class RubyLexer
4
4
 
@@ -6,20 +6,20 @@ class RubyLexer
6
6
  RUBY19 = "".respond_to? :encoding
7
7
 
8
8
  IDENT_CHAR_RE = if RUBY19 then
9
- /[\w\u0080-\uFFFF]/u
9
+ /[\w\u0080-\u{10ffff}]/u
10
10
  else
11
- /[\w\x80-\xFF]/
11
+ /[\w\x80-\xFF]/n
12
12
  end
13
13
 
14
- IDENT_RE = /^#{IDENT_CHAR_RE}+/
14
+ IDENT_RE = /^#{IDENT_CHAR_RE}+/o
15
15
 
16
16
  attr_accessor :command_start
17
17
  attr_accessor :cmdarg
18
18
  attr_accessor :cond
19
19
  attr_accessor :tern # TODO: rename ternary damnit... wtf
20
- attr_accessor :nest
20
+ attr_accessor :string_nest
21
21
 
22
- ESC_RE = /\\((?>[0-7]{1,3}|x[0-9a-fA-F]{1,2}|M-[^\\]|(C-|c)[^\\]|[^0-7xMCc]))/
22
+ ESC_RE = /\\((?>[0-7]{1,3}|x[0-9a-fA-F]{1,2}|M-[^\\]|(C-|c)[^\\]|[^0-7xMCc]))/u
23
23
  # :startdoc:
24
24
 
25
25
  ##
@@ -51,6 +51,9 @@ class RubyLexer
51
51
  attr_accessor :warnings
52
52
 
53
53
  attr_accessor :space_seen
54
+ attr_accessor :paren_nest
55
+ attr_accessor :brace_nest
56
+ attr_accessor :lpar_beg
54
57
 
55
58
  EOF = :eof_haha!
56
59
 
@@ -75,6 +78,7 @@ class RubyLexer
75
78
  TOKENS = {
76
79
  "!" => :tBANG,
77
80
  "!=" => :tNEQ,
81
+ # "!@" => :tUBANG,
78
82
  "!~" => :tNMATCH,
79
83
  "," => :tCOMMA,
80
84
  ".." => :tDOT2,
@@ -129,9 +133,9 @@ class RubyLexer
129
133
  def heredoc here # 63 lines
130
134
  _, eos, func, last_line = here
131
135
 
132
- indent = (func & STR_FUNC_INDENT) != 0
136
+ indent = (func & STR_FUNC_INDENT) != 0 ? "[ \t]*" : nil
133
137
  expand = (func & STR_FUNC_EXPAND) != 0
134
- eos_re = indent ? /[ \t]*#{eos}(\r?\n|\z)/ : /#{eos}(\r?\n|\z)/
138
+ eos_re = /#{indent}#{Regexp.escape eos}(\r*\n|\z)/
135
139
  err_msg = "can't match #{eos_re.inspect} anywhere in "
136
140
 
137
141
  rb_compile_error err_msg if
@@ -207,7 +211,7 @@ class RubyLexer
207
211
  string_buffer << src[3]
208
212
  when src.scan(/-?([\'\"\`])(?!\1*\Z)/) then
209
213
  rb_compile_error "unterminated here document identifier"
210
- when src.scan(/(-?)(\w+)/) then
214
+ when src.scan(/(-?)(#{IDENT_CHAR_RE}+)/) then
211
215
  term = '"'
212
216
  func |= STR_DQUOTE
213
217
  unless src[1].empty? then
@@ -243,10 +247,14 @@ class RubyLexer
243
247
 
244
248
  def initialize v = 18
245
249
  self.version = v
246
- self.cond = RubyParserStuff::StackState.new(:cond)
250
+ self.cond = RubyParserStuff::StackState.new(:cond)
247
251
  self.cmdarg = RubyParserStuff::StackState.new(:cmdarg)
248
- self.tern = RubyParserStuff::StackState.new(:tern)
249
- self.nest = 0
252
+ self.tern = RubyParserStuff::StackState.new(:tern)
253
+ self.string_nest = 0
254
+ self.paren_nest = 0
255
+ self.brace_nest = 0
256
+ self.lpar_beg = nil
257
+
250
258
  @comments = []
251
259
 
252
260
  reset
@@ -254,8 +262,6 @@ class RubyLexer
254
262
 
255
263
  def int_with_base base
256
264
  rb_compile_error "Invalid numeric format" if src.matched =~ /__/
257
- rb_compile_error "numeric literal without digits" if
258
- ruby19 and src.matched =~ /0o/i
259
265
 
260
266
  self.yacc_value = src.matched.to_i(base)
261
267
  return :tINTEGER
@@ -349,6 +355,10 @@ class RubyLexer
349
355
  when 's' then
350
356
  self.lex_state = :expr_fname
351
357
  [:tSYMBEG, STR_SSYM]
358
+ when 'I' then
359
+ [:tSYMBOLS_BEG, STR_DQUOTE | STR_FUNC_QWORDS]
360
+ when 'i' then
361
+ [:tQSYMBOLS_BEG, STR_SQUOTE | STR_FUNC_QWORDS]
352
362
  end
353
363
 
354
364
  rb_compile_error "Bad %string type. Expected [Qq\Wwxrs], found '#{c}'." if
@@ -365,7 +375,7 @@ class RubyLexer
365
375
  space = false # FIX: remove these
366
376
  func = string_type
367
377
  paren = open
368
- term_re = Regexp.escape term
378
+ term_re = @@regexp_cache[term]
369
379
 
370
380
  qwords = (func & STR_FUNC_QWORDS) != 0
371
381
  regexp = (func & STR_FUNC_REGEXP) != 0
@@ -378,9 +388,9 @@ class RubyLexer
378
388
 
379
389
  space = true if qwords and src.scan(/\s+/)
380
390
 
381
- if self.nest == 0 && src.scan(/#{term_re}/) then
391
+ if self.string_nest == 0 && src.scan(/#{term_re}/) then
382
392
  if qwords then
383
- quote[1] = nil
393
+ quote[1] = nil # TODO: make struct
384
394
  return :tSPACE
385
395
  elsif regexp then
386
396
  self.yacc_value = self.regx_options
@@ -447,7 +457,7 @@ class RubyLexer
447
457
  when src.scan(/s/) then # space
448
458
  " "
449
459
  when src.scan(/[0-7]{1,3}/) then # octal constant
450
- src.matched.to_i(8).chr
460
+ (src.matched.to_i(8) & 0xFF).chr
451
461
  when src.scan(/x([0-9a-fA-F]{1,2})/) then # hex constant
452
462
  src[1].to_i(16).chr
453
463
  when src.check(/M-\\[\\MCc]/) then
@@ -470,6 +480,8 @@ class RubyLexer
470
480
  c = src[2]
471
481
  c[0] = (c[0].ord & 0x9f).chr
472
482
  c
483
+ when src.scan(/^[89]/i) then # bad octal or hex... MRI ignores them :(
484
+ src.matched
473
485
  when src.scan(/[McCx0-9]/) || src.eos? then
474
486
  rb_compile_error("Invalid escape character syntax")
475
487
  else
@@ -535,6 +547,9 @@ class RubyLexer
535
547
  end
536
548
  end
537
549
 
550
+ @@regexp_cache = Hash.new { |h,k| h[k] = Regexp.new(Regexp.escape(k)) }
551
+ @@regexp_cache[nil] = nil
552
+
538
553
  def tokadd_string(func, term, paren) # 105 lines
539
554
  qwords = (func & STR_FUNC_QWORDS) != 0
540
555
  escape = (func & STR_FUNC_ESCAPE) != 0
@@ -542,24 +557,27 @@ class RubyLexer
542
557
  regexp = (func & STR_FUNC_REGEXP) != 0
543
558
  symbol = (func & STR_FUNC_SYMBOL) != 0
544
559
 
545
- paren_re = paren.nil? ? nil : Regexp.new(Regexp.escape(paren))
546
- term_re = Regexp.new(Regexp.escape(term))
560
+ paren_re = @@regexp_cache[paren]
561
+ term_re = @@regexp_cache[term]
547
562
 
548
563
  until src.eos? do
549
564
  c = nil
550
565
  handled = true
566
+
551
567
  case
552
- when self.nest == 0 && src.scan(term_re) then
553
- src.pos -= 1
554
- break
555
568
  when paren_re && src.scan(paren_re) then
556
- self.nest += 1
569
+ self.string_nest += 1
557
570
  when src.scan(term_re) then
558
- self.nest -= 1
559
- when qwords && src.scan(/\s/) then
571
+ if self.string_nest == 0 then
572
+ src.pos -= 1
573
+ break
574
+ else
575
+ self.string_nest -= 1
576
+ end
577
+ when expand && src.scan(/#(?=[\$\@\{])/) then
560
578
  src.pos -= 1
561
579
  break
562
- when expand && src.scan(/#(?=[\$\@\{])/) then
580
+ when qwords && src.scan(/\s/) then
563
581
  src.pos -= 1
564
582
  break
565
583
  when expand && src.scan(/#(?!\n)/) then
@@ -589,13 +607,12 @@ class RubyLexer
589
607
  end
590
608
  else
591
609
  handled = false
592
- end
610
+ end # inner /\\/ case
593
611
  else
594
612
  handled = false
595
- end # case
613
+ end # top case
596
614
 
597
615
  unless handled then
598
-
599
616
  t = Regexp.escape term
600
617
  x = Regexp.escape(paren) if paren && paren != "\000"
601
618
  re = if qwords then
@@ -617,7 +634,6 @@ class RubyLexer
617
634
  c ||= src.matched
618
635
  c = RubyLexer::EOF if src.eos?
619
636
 
620
-
621
637
  return c
622
638
  end
623
639
 
@@ -642,20 +658,24 @@ class RubyLexer
642
658
 
643
659
  return r if r
644
660
 
645
- case s
646
- when /^[0-7]{1,3}/ then
647
- $&.to_i(8).chr
648
- when /^x([0-9a-fA-F]{1,2})/ then
649
- $1.to_i(16).chr
650
- when /^M-(.)/ then
651
- ($1[0].ord | 0x80).chr
652
- when /^(C-|c)(.)/ then
653
- ($2[0].ord & 0x9f).chr
654
- when /^[McCx0-9]/ then
655
- rb_compile_error("Invalid escape character syntax")
656
- else
657
- s
658
- end
661
+ x = case s
662
+ when /^[0-7]{1,3}/ then
663
+ ($&.to_i(8) & 0xFF).chr
664
+ when /^x([0-9a-fA-F]{1,2})/ then
665
+ $1.to_i(16).chr
666
+ when /^M-(.)/ then
667
+ ($1[0].ord | 0x80).chr
668
+ when /^(C-|c)(.)/ then
669
+ ($2[0].ord & 0x9f).chr
670
+ when /^[89a-f]/i then # bad octal or hex... ignore? that's what MRI does :(
671
+ s
672
+ when /^[McCx0-9]/ then
673
+ rb_compile_error("Invalid escape character syntax")
674
+ else
675
+ s
676
+ end
677
+ x.force_encoding "UTF-8" if RUBY19
678
+ x
659
679
  end
660
680
 
661
681
  def warning s
@@ -704,8 +724,8 @@ class RubyLexer
704
724
  # Replace a string of newlines with a single one
705
725
  src.scan(/\n+/)
706
726
 
707
- next if in_lex_state?(:expr_beg, :expr_fname, :expr_dot, :expr_class,
708
- :expr_value)
727
+ next if in_lex_state?(:expr_beg, :expr_value, :expr_class,
728
+ :expr_fname, :expr_dot)
709
729
 
710
730
  if src.scan(/([\ \t\r\f\v]*)\./) then
711
731
  self.space_seen = true unless src[1].empty?
@@ -718,10 +738,22 @@ class RubyLexer
718
738
  self.lex_state = :expr_beg
719
739
  return :tNL
720
740
  elsif src.scan(/[\]\)\}]/) then
741
+ if src.matched == "}" then
742
+ self.brace_nest -= 1
743
+ else
744
+ self.paren_nest -= 1
745
+ end
746
+
721
747
  cond.lexpop
722
748
  cmdarg.lexpop
723
749
  tern.lexpop
724
- self.lex_state = :expr_end
750
+
751
+ self.lex_state = if src.matched == ")" then
752
+ :expr_endfn
753
+ else
754
+ :expr_endarg
755
+ end
756
+
725
757
  self.yacc_value = src.matched
726
758
  result = {
727
759
  ")" => :tRPAREN,
@@ -729,6 +761,25 @@ class RubyLexer
729
761
  "}" => :tRCURLY
730
762
  }[src.matched]
731
763
  return result
764
+ elsif src.scan(/\!/) then
765
+ if in_lex_state?(:expr_fname, :expr_dot) then
766
+ self.lex_state = :expr_arg
767
+
768
+ if src.scan(/@/) then
769
+ self.yacc_value = "!@"
770
+ return :tUBANG
771
+ end
772
+ else
773
+ self.lex_state = :expr_beg
774
+ end
775
+
776
+ if src.scan(/[=~]/) then
777
+ self.yacc_value = "!#{src.matched}"
778
+ else
779
+ self.yacc_value = "!"
780
+ end
781
+
782
+ return TOKENS[self.yacc_value]
732
783
  elsif src.scan(/\.\.\.?|,|![=~]?/) then
733
784
  self.lex_state = :expr_beg
734
785
  tok = self.yacc_value = src.matched
@@ -748,6 +799,8 @@ class RubyLexer
748
799
  yylex_paren19
749
800
  end
750
801
 
802
+ self.paren_nest += 1
803
+
751
804
  self.expr_beg_push "("
752
805
 
753
806
  return result
@@ -778,7 +831,7 @@ class RubyLexer
778
831
  self.lex_strterm = [:strterm, STR_DQUOTE, '"', "\0"] # TODO: question this
779
832
  self.yacc_value = "\""
780
833
  return :tSTRING_BEG
781
- elsif src.scan(/\@\@?\w+/) then
834
+ elsif src.scan(/\@\@?#{IDENT_CHAR_RE}+/o) then
782
835
  self.token = src.matched
783
836
 
784
837
  rb_compile_error "`#{token}` is not allowed as a variable name" if
@@ -822,15 +875,19 @@ class RubyLexer
822
875
  elsif src.check(/[0-9]/) then
823
876
  return parse_number
824
877
  elsif src.scan(/\[/) then
878
+ self.paren_nest += 1
879
+
825
880
  result = src.matched
826
881
 
827
882
  if in_lex_state? :expr_fname, :expr_dot then
828
883
  self.lex_state = :expr_arg
829
884
  case
830
885
  when src.scan(/\]\=/) then
886
+ self.paren_nest -= 1 # HACK? I dunno, or bug in MRI
831
887
  self.yacc_value = "[]="
832
888
  return :tASET
833
889
  when src.scan(/\]/) then
890
+ self.paren_nest -= 1 # HACK? I dunno, or bug in MRI
834
891
  self.yacc_value = "[]"
835
892
  return :tAREF
836
893
  else
@@ -850,7 +907,7 @@ class RubyLexer
850
907
 
851
908
  return result
852
909
  elsif src.scan(/\'(\\.|[^\'])*\'/) then
853
- self.yacc_value = src.matched[1..-2].gsub(/\\\\/, "\\").gsub(/\\'/, "'")
910
+ self.yacc_value = src.matched[1..-2].gsub(/\\\\/, "\\").gsub(/\\'/, "'") # "
854
911
  self.lex_state = :expr_end
855
912
  return :tSTRING
856
913
  elsif src.check(/\|/) then
@@ -872,13 +929,17 @@ class RubyLexer
872
929
  return :tPIPE
873
930
  end
874
931
  elsif src.scan(/\{/) then
875
- if defined?(@hack_expects_lambda) && @hack_expects_lambda
876
- @hack_expects_lambda = false
877
- self.lex_state = :expr_beg
932
+ self.brace_nest += 1
933
+ if lpar_beg && lpar_beg == paren_nest then
934
+ self.lpar_beg = nil
935
+ self.paren_nest -= 1
936
+
937
+ expr_beg_push "{"
938
+
878
939
  return :tLAMBEG
879
940
  end
880
941
 
881
- result = if is_arg? || in_lex_state?(:expr_end) then
942
+ result = if is_arg? || in_lex_state?(:expr_end, :expr_endfn) then
882
943
  :tLCURLY # block (primary)
883
944
  elsif in_lex_state?(:expr_endarg) then
884
945
  :tLBRACE_ARG # block (expr)
@@ -892,8 +953,7 @@ class RubyLexer
892
953
 
893
954
  return result
894
955
  elsif src.scan(/->/) then
895
- @hack_expects_lambda = true
896
- self.lex_state = :expr_arg
956
+ self.lex_state = :expr_endfn
897
957
  return :tLAMBDA
898
958
  elsif src.scan(/[+-]/) then
899
959
  sign = src.matched
@@ -920,8 +980,7 @@ class RubyLexer
920
980
  return :tOP_ASGN
921
981
  end
922
982
 
923
- if (is_beg? ||
924
- (is_arg? && space_seen && !src.check(/\s/))) then
983
+ if (is_beg? || (is_arg? && space_seen && !src.check(/\s/))) then
925
984
  if is_arg? then
926
985
  arg_ambiguous
927
986
  end
@@ -949,25 +1008,35 @@ class RubyLexer
949
1008
  self.yacc_value = "**"
950
1009
  return :tOP_ASGN
951
1010
  elsif src.scan(/\*\*/) then
1011
+ result = if is_space_arg? src.check(/./m) then
1012
+ warning "`**' interpreted as argument prefix"
1013
+ :tDSTAR
1014
+ elsif is_beg? then
1015
+ :tDSTAR
1016
+ else
1017
+ # TODO: warn_balanced("**", "argument prefix");
1018
+ :tPOW
1019
+ end
952
1020
  self.yacc_value = "**"
953
1021
  self.fix_arg_lex_state
954
- return :tPOW
1022
+ return result
955
1023
  elsif src.scan(/\*\=/) then
956
1024
  self.lex_state = :expr_beg
957
1025
  self.yacc_value = "*"
958
1026
  return :tOP_ASGN
959
1027
  elsif src.scan(/\*/) then
960
- result = if is_arg? && space_seen && src.check(/\S/) then
1028
+ result = if is_space_arg? src.check(/./m) then
961
1029
  warning("`*' interpreted as argument prefix")
962
1030
  :tSTAR
963
1031
  elsif is_beg? then
964
1032
  :tSTAR
965
1033
  else
966
- :tSTAR2
1034
+ # TODO: warn_balanced("*", "argument prefix");
1035
+ :tSTAR2 # TODO: rename
967
1036
  end
1037
+
968
1038
  self.yacc_value = "*"
969
1039
  self.fix_arg_lex_state
970
-
971
1040
  return result
972
1041
  end
973
1042
  elsif src.check(/\</) then
@@ -985,8 +1054,8 @@ class RubyLexer
985
1054
  self.yacc_value = "\<\<"
986
1055
  return :tOP_ASGN
987
1056
  elsif src.scan(/\<\</) then
988
- if (! in_lex_state?(:expr_end, :expr_dot,
989
- :expr_endarg, :expr_class) &&
1057
+ if (!in_lex_state?(:expr_dot, :expr_class) &&
1058
+ !is_end? &&
990
1059
  (!is_arg? || space_seen)) then
991
1060
  tok = self.heredoc_identifier
992
1061
  return tok if tok
@@ -1278,18 +1347,30 @@ class RubyLexer
1278
1347
  result
1279
1348
  end
1280
1349
 
1281
- def is_end?
1282
- in_lex_state? :expr_end, :expr_endarg, :expr_endfn
1350
+ def yylex_paren19
1351
+ if is_beg? then
1352
+ :tLPAREN
1353
+ elsif is_space_arg? then
1354
+ :tLPAREN_ARG
1355
+ else
1356
+ :tLPAREN2 # plain '(' in parse.y
1357
+ end
1283
1358
  end
1284
1359
 
1285
1360
  def is_arg?
1286
1361
  in_lex_state? :expr_arg, :expr_cmdarg
1287
1362
  end
1288
1363
 
1364
+ def is_end?
1365
+ in_lex_state? :expr_end, :expr_endarg, :expr_endfn
1366
+ end
1367
+
1289
1368
  def is_beg?
1290
- in_lex_state? :expr_beg, :expr_mid, :expr_value, :expr_class
1369
+ in_lex_state? :expr_beg, :expr_value, :expr_mid, :expr_class
1291
1370
  end
1292
1371
 
1372
+ # TODO #define IS_AFTER_OPERATOR() IS_lex_state(EXPR_FNAME | EXPR_DOT)
1373
+
1293
1374
  def is_space_arg? c = "x"
1294
1375
  is_arg? and space_seen and c !~ /\s/
1295
1376
  end
@@ -1298,23 +1379,7 @@ class RubyLexer
1298
1379
  (in_lex_state?(:expr_beg) && !command_state) || is_arg?
1299
1380
  end
1300
1381
 
1301
- def yylex_paren19 # TODO: move or remove
1302
- result =
1303
- if is_beg? then
1304
- :tLPAREN
1305
- elsif is_space_arg? then
1306
- :tLPAREN_ARG
1307
- else
1308
- :tLPAREN2 # plain '(' in parse.y
1309
- end
1310
-
1311
- # paren_nest++; # TODO
1312
-
1313
- result
1314
- end
1315
-
1316
1382
  def process_token(command_state)
1317
-
1318
1383
  token << src.matched if token =~ IDENT_RE && src.scan(/[\!\?](?!=)/)
1319
1384
 
1320
1385
  result = nil
@@ -1379,22 +1444,23 @@ class RubyLexer
1379
1444
  return keyword.id0
1380
1445
  end
1381
1446
 
1447
+ self.command_start = true if lex_state == :expr_beg
1448
+
1382
1449
  if keyword.id0 == :kDO then
1383
- self.command_start = true
1450
+ if lpar_beg && lpar_beg == paren_nest then
1451
+ self.lpar_beg = nil
1452
+ self.paren_nest -= 1
1384
1453
 
1385
- if defined?(@hack_expects_lambda) && @hack_expects_lambda
1386
- @hack_expects_lambda = false
1387
1454
  return :kDO_LAMBDA
1388
1455
  end
1389
1456
 
1390
1457
  return :kDO_COND if cond.is_in_state
1391
1458
  return :kDO_BLOCK if cmdarg.is_in_state && state != :expr_cmdarg
1392
- return :kDO_BLOCK if state == :expr_endarg
1393
-
1459
+ return :kDO_BLOCK if [:expr_beg, :expr_endarg].include? state
1394
1460
  return :kDO
1395
1461
  end
1396
1462
 
1397
- return keyword.id0 if state == :expr_beg or state == :expr_value
1463
+ return keyword.id0 if [:expr_beg, :expr_value].include? state
1398
1464
 
1399
1465
  self.lex_state = :expr_beg if keyword.id0 != keyword.id1
1400
1466
 
@@ -1406,13 +1472,13 @@ class RubyLexer
1406
1472
  # if (mb == ENC_CODERANGE_7BIT && lex_state != EXPR_DOT) {
1407
1473
 
1408
1474
  self.lex_state =
1409
- if is_beg? || in_lex_state?(:expr_dot) || is_arg? then
1475
+ if is_beg? || is_arg? || in_lex_state?(:expr_dot) then
1410
1476
  if command_state then
1411
1477
  :expr_cmdarg
1412
1478
  else
1413
1479
  :expr_arg
1414
1480
  end
1415
- elsif ruby19 && in_lex_state?(:expr_fname) then
1481
+ elsif !ruby18 && in_lex_state?(:expr_fname) then
1416
1482
  :expr_endfn
1417
1483
  else
1418
1484
  :expr_end
@@ -1422,8 +1488,10 @@ class RubyLexer
1422
1488
 
1423
1489
  self.yacc_value = token
1424
1490
 
1425
- self.lex_state = :expr_end if
1426
- last_state != :expr_dot && self.parser.env[token.to_sym] == :lvar
1491
+ if (![:expr_dot, :expr_fname].include?(last_state) &&
1492
+ self.parser.env[token.to_sym] == :lvar) then
1493
+ self.lex_state = :expr_end
1494
+ end
1427
1495
 
1428
1496
  return result
1429
1497
  end