ruby_parser 3.1.3 → 3.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/ruby_lexer.rb CHANGED
@@ -1,4 +1,4 @@
1
- # encoding: US-ASCII
1
+ # encoding: UTF-8
2
2
 
3
3
  class RubyLexer
4
4
 
@@ -6,20 +6,20 @@ class RubyLexer
6
6
  RUBY19 = "".respond_to? :encoding
7
7
 
8
8
  IDENT_CHAR_RE = if RUBY19 then
9
- /[\w\u0080-\uFFFF]/u
9
+ /[\w\u0080-\u{10ffff}]/u
10
10
  else
11
- /[\w\x80-\xFF]/
11
+ /[\w\x80-\xFF]/n
12
12
  end
13
13
 
14
- IDENT_RE = /^#{IDENT_CHAR_RE}+/
14
+ IDENT_RE = /^#{IDENT_CHAR_RE}+/o
15
15
 
16
16
  attr_accessor :command_start
17
17
  attr_accessor :cmdarg
18
18
  attr_accessor :cond
19
19
  attr_accessor :tern # TODO: rename ternary damnit... wtf
20
- attr_accessor :nest
20
+ attr_accessor :string_nest
21
21
 
22
- ESC_RE = /\\((?>[0-7]{1,3}|x[0-9a-fA-F]{1,2}|M-[^\\]|(C-|c)[^\\]|[^0-7xMCc]))/
22
+ ESC_RE = /\\((?>[0-7]{1,3}|x[0-9a-fA-F]{1,2}|M-[^\\]|(C-|c)[^\\]|[^0-7xMCc]))/u
23
23
  # :startdoc:
24
24
 
25
25
  ##
@@ -51,6 +51,9 @@ class RubyLexer
51
51
  attr_accessor :warnings
52
52
 
53
53
  attr_accessor :space_seen
54
+ attr_accessor :paren_nest
55
+ attr_accessor :brace_nest
56
+ attr_accessor :lpar_beg
54
57
 
55
58
  EOF = :eof_haha!
56
59
 
@@ -75,6 +78,7 @@ class RubyLexer
75
78
  TOKENS = {
76
79
  "!" => :tBANG,
77
80
  "!=" => :tNEQ,
81
+ # "!@" => :tUBANG,
78
82
  "!~" => :tNMATCH,
79
83
  "," => :tCOMMA,
80
84
  ".." => :tDOT2,
@@ -129,9 +133,9 @@ class RubyLexer
129
133
  def heredoc here # 63 lines
130
134
  _, eos, func, last_line = here
131
135
 
132
- indent = (func & STR_FUNC_INDENT) != 0
136
+ indent = (func & STR_FUNC_INDENT) != 0 ? "[ \t]*" : nil
133
137
  expand = (func & STR_FUNC_EXPAND) != 0
134
- eos_re = indent ? /[ \t]*#{eos}(\r?\n|\z)/ : /#{eos}(\r?\n|\z)/
138
+ eos_re = /#{indent}#{Regexp.escape eos}(\r*\n|\z)/
135
139
  err_msg = "can't match #{eos_re.inspect} anywhere in "
136
140
 
137
141
  rb_compile_error err_msg if
@@ -207,7 +211,7 @@ class RubyLexer
207
211
  string_buffer << src[3]
208
212
  when src.scan(/-?([\'\"\`])(?!\1*\Z)/) then
209
213
  rb_compile_error "unterminated here document identifier"
210
- when src.scan(/(-?)(\w+)/) then
214
+ when src.scan(/(-?)(#{IDENT_CHAR_RE}+)/) then
211
215
  term = '"'
212
216
  func |= STR_DQUOTE
213
217
  unless src[1].empty? then
@@ -243,10 +247,14 @@ class RubyLexer
243
247
 
244
248
  def initialize v = 18
245
249
  self.version = v
246
- self.cond = RubyParserStuff::StackState.new(:cond)
250
+ self.cond = RubyParserStuff::StackState.new(:cond)
247
251
  self.cmdarg = RubyParserStuff::StackState.new(:cmdarg)
248
- self.tern = RubyParserStuff::StackState.new(:tern)
249
- self.nest = 0
252
+ self.tern = RubyParserStuff::StackState.new(:tern)
253
+ self.string_nest = 0
254
+ self.paren_nest = 0
255
+ self.brace_nest = 0
256
+ self.lpar_beg = nil
257
+
250
258
  @comments = []
251
259
 
252
260
  reset
@@ -254,8 +262,6 @@ class RubyLexer
254
262
 
255
263
  def int_with_base base
256
264
  rb_compile_error "Invalid numeric format" if src.matched =~ /__/
257
- rb_compile_error "numeric literal without digits" if
258
- ruby19 and src.matched =~ /0o/i
259
265
 
260
266
  self.yacc_value = src.matched.to_i(base)
261
267
  return :tINTEGER
@@ -349,6 +355,10 @@ class RubyLexer
349
355
  when 's' then
350
356
  self.lex_state = :expr_fname
351
357
  [:tSYMBEG, STR_SSYM]
358
+ when 'I' then
359
+ [:tSYMBOLS_BEG, STR_DQUOTE | STR_FUNC_QWORDS]
360
+ when 'i' then
361
+ [:tQSYMBOLS_BEG, STR_SQUOTE | STR_FUNC_QWORDS]
352
362
  end
353
363
 
354
364
  rb_compile_error "Bad %string type. Expected [Qq\Wwxrs], found '#{c}'." if
@@ -365,7 +375,7 @@ class RubyLexer
365
375
  space = false # FIX: remove these
366
376
  func = string_type
367
377
  paren = open
368
- term_re = Regexp.escape term
378
+ term_re = @@regexp_cache[term]
369
379
 
370
380
  qwords = (func & STR_FUNC_QWORDS) != 0
371
381
  regexp = (func & STR_FUNC_REGEXP) != 0
@@ -378,9 +388,9 @@ class RubyLexer
378
388
 
379
389
  space = true if qwords and src.scan(/\s+/)
380
390
 
381
- if self.nest == 0 && src.scan(/#{term_re}/) then
391
+ if self.string_nest == 0 && src.scan(/#{term_re}/) then
382
392
  if qwords then
383
- quote[1] = nil
393
+ quote[1] = nil # TODO: make struct
384
394
  return :tSPACE
385
395
  elsif regexp then
386
396
  self.yacc_value = self.regx_options
@@ -447,7 +457,7 @@ class RubyLexer
447
457
  when src.scan(/s/) then # space
448
458
  " "
449
459
  when src.scan(/[0-7]{1,3}/) then # octal constant
450
- src.matched.to_i(8).chr
460
+ (src.matched.to_i(8) & 0xFF).chr
451
461
  when src.scan(/x([0-9a-fA-F]{1,2})/) then # hex constant
452
462
  src[1].to_i(16).chr
453
463
  when src.check(/M-\\[\\MCc]/) then
@@ -470,6 +480,8 @@ class RubyLexer
470
480
  c = src[2]
471
481
  c[0] = (c[0].ord & 0x9f).chr
472
482
  c
483
+ when src.scan(/^[89]/i) then # bad octal or hex... MRI ignores them :(
484
+ src.matched
473
485
  when src.scan(/[McCx0-9]/) || src.eos? then
474
486
  rb_compile_error("Invalid escape character syntax")
475
487
  else
@@ -535,6 +547,9 @@ class RubyLexer
535
547
  end
536
548
  end
537
549
 
550
+ @@regexp_cache = Hash.new { |h,k| h[k] = Regexp.new(Regexp.escape(k)) }
551
+ @@regexp_cache[nil] = nil
552
+
538
553
  def tokadd_string(func, term, paren) # 105 lines
539
554
  qwords = (func & STR_FUNC_QWORDS) != 0
540
555
  escape = (func & STR_FUNC_ESCAPE) != 0
@@ -542,24 +557,27 @@ class RubyLexer
542
557
  regexp = (func & STR_FUNC_REGEXP) != 0
543
558
  symbol = (func & STR_FUNC_SYMBOL) != 0
544
559
 
545
- paren_re = paren.nil? ? nil : Regexp.new(Regexp.escape(paren))
546
- term_re = Regexp.new(Regexp.escape(term))
560
+ paren_re = @@regexp_cache[paren]
561
+ term_re = @@regexp_cache[term]
547
562
 
548
563
  until src.eos? do
549
564
  c = nil
550
565
  handled = true
566
+
551
567
  case
552
- when self.nest == 0 && src.scan(term_re) then
553
- src.pos -= 1
554
- break
555
568
  when paren_re && src.scan(paren_re) then
556
- self.nest += 1
569
+ self.string_nest += 1
557
570
  when src.scan(term_re) then
558
- self.nest -= 1
559
- when qwords && src.scan(/\s/) then
571
+ if self.string_nest == 0 then
572
+ src.pos -= 1
573
+ break
574
+ else
575
+ self.string_nest -= 1
576
+ end
577
+ when expand && src.scan(/#(?=[\$\@\{])/) then
560
578
  src.pos -= 1
561
579
  break
562
- when expand && src.scan(/#(?=[\$\@\{])/) then
580
+ when qwords && src.scan(/\s/) then
563
581
  src.pos -= 1
564
582
  break
565
583
  when expand && src.scan(/#(?!\n)/) then
@@ -589,13 +607,12 @@ class RubyLexer
589
607
  end
590
608
  else
591
609
  handled = false
592
- end
610
+ end # inner /\\/ case
593
611
  else
594
612
  handled = false
595
- end # case
613
+ end # top case
596
614
 
597
615
  unless handled then
598
-
599
616
  t = Regexp.escape term
600
617
  x = Regexp.escape(paren) if paren && paren != "\000"
601
618
  re = if qwords then
@@ -617,7 +634,6 @@ class RubyLexer
617
634
  c ||= src.matched
618
635
  c = RubyLexer::EOF if src.eos?
619
636
 
620
-
621
637
  return c
622
638
  end
623
639
 
@@ -642,20 +658,24 @@ class RubyLexer
642
658
 
643
659
  return r if r
644
660
 
645
- case s
646
- when /^[0-7]{1,3}/ then
647
- $&.to_i(8).chr
648
- when /^x([0-9a-fA-F]{1,2})/ then
649
- $1.to_i(16).chr
650
- when /^M-(.)/ then
651
- ($1[0].ord | 0x80).chr
652
- when /^(C-|c)(.)/ then
653
- ($2[0].ord & 0x9f).chr
654
- when /^[McCx0-9]/ then
655
- rb_compile_error("Invalid escape character syntax")
656
- else
657
- s
658
- end
661
+ x = case s
662
+ when /^[0-7]{1,3}/ then
663
+ ($&.to_i(8) & 0xFF).chr
664
+ when /^x([0-9a-fA-F]{1,2})/ then
665
+ $1.to_i(16).chr
666
+ when /^M-(.)/ then
667
+ ($1[0].ord | 0x80).chr
668
+ when /^(C-|c)(.)/ then
669
+ ($2[0].ord & 0x9f).chr
670
+ when /^[89a-f]/i then # bad octal or hex... ignore? that's what MRI does :(
671
+ s
672
+ when /^[McCx0-9]/ then
673
+ rb_compile_error("Invalid escape character syntax")
674
+ else
675
+ s
676
+ end
677
+ x.force_encoding "UTF-8" if RUBY19
678
+ x
659
679
  end
660
680
 
661
681
  def warning s
@@ -704,8 +724,8 @@ class RubyLexer
704
724
  # Replace a string of newlines with a single one
705
725
  src.scan(/\n+/)
706
726
 
707
- next if in_lex_state?(:expr_beg, :expr_fname, :expr_dot, :expr_class,
708
- :expr_value)
727
+ next if in_lex_state?(:expr_beg, :expr_value, :expr_class,
728
+ :expr_fname, :expr_dot)
709
729
 
710
730
  if src.scan(/([\ \t\r\f\v]*)\./) then
711
731
  self.space_seen = true unless src[1].empty?
@@ -718,10 +738,22 @@ class RubyLexer
718
738
  self.lex_state = :expr_beg
719
739
  return :tNL
720
740
  elsif src.scan(/[\]\)\}]/) then
741
+ if src.matched == "}" then
742
+ self.brace_nest -= 1
743
+ else
744
+ self.paren_nest -= 1
745
+ end
746
+
721
747
  cond.lexpop
722
748
  cmdarg.lexpop
723
749
  tern.lexpop
724
- self.lex_state = :expr_end
750
+
751
+ self.lex_state = if src.matched == ")" then
752
+ :expr_endfn
753
+ else
754
+ :expr_endarg
755
+ end
756
+
725
757
  self.yacc_value = src.matched
726
758
  result = {
727
759
  ")" => :tRPAREN,
@@ -729,6 +761,25 @@ class RubyLexer
729
761
  "}" => :tRCURLY
730
762
  }[src.matched]
731
763
  return result
764
+ elsif src.scan(/\!/) then
765
+ if in_lex_state?(:expr_fname, :expr_dot) then
766
+ self.lex_state = :expr_arg
767
+
768
+ if src.scan(/@/) then
769
+ self.yacc_value = "!@"
770
+ return :tUBANG
771
+ end
772
+ else
773
+ self.lex_state = :expr_beg
774
+ end
775
+
776
+ if src.scan(/[=~]/) then
777
+ self.yacc_value = "!#{src.matched}"
778
+ else
779
+ self.yacc_value = "!"
780
+ end
781
+
782
+ return TOKENS[self.yacc_value]
732
783
  elsif src.scan(/\.\.\.?|,|![=~]?/) then
733
784
  self.lex_state = :expr_beg
734
785
  tok = self.yacc_value = src.matched
@@ -748,6 +799,8 @@ class RubyLexer
748
799
  yylex_paren19
749
800
  end
750
801
 
802
+ self.paren_nest += 1
803
+
751
804
  self.expr_beg_push "("
752
805
 
753
806
  return result
@@ -778,7 +831,7 @@ class RubyLexer
778
831
  self.lex_strterm = [:strterm, STR_DQUOTE, '"', "\0"] # TODO: question this
779
832
  self.yacc_value = "\""
780
833
  return :tSTRING_BEG
781
- elsif src.scan(/\@\@?\w+/) then
834
+ elsif src.scan(/\@\@?#{IDENT_CHAR_RE}+/o) then
782
835
  self.token = src.matched
783
836
 
784
837
  rb_compile_error "`#{token}` is not allowed as a variable name" if
@@ -822,15 +875,19 @@ class RubyLexer
822
875
  elsif src.check(/[0-9]/) then
823
876
  return parse_number
824
877
  elsif src.scan(/\[/) then
878
+ self.paren_nest += 1
879
+
825
880
  result = src.matched
826
881
 
827
882
  if in_lex_state? :expr_fname, :expr_dot then
828
883
  self.lex_state = :expr_arg
829
884
  case
830
885
  when src.scan(/\]\=/) then
886
+ self.paren_nest -= 1 # HACK? I dunno, or bug in MRI
831
887
  self.yacc_value = "[]="
832
888
  return :tASET
833
889
  when src.scan(/\]/) then
890
+ self.paren_nest -= 1 # HACK? I dunno, or bug in MRI
834
891
  self.yacc_value = "[]"
835
892
  return :tAREF
836
893
  else
@@ -850,7 +907,7 @@ class RubyLexer
850
907
 
851
908
  return result
852
909
  elsif src.scan(/\'(\\.|[^\'])*\'/) then
853
- self.yacc_value = src.matched[1..-2].gsub(/\\\\/, "\\").gsub(/\\'/, "'")
910
+ self.yacc_value = src.matched[1..-2].gsub(/\\\\/, "\\").gsub(/\\'/, "'") # "
854
911
  self.lex_state = :expr_end
855
912
  return :tSTRING
856
913
  elsif src.check(/\|/) then
@@ -872,13 +929,17 @@ class RubyLexer
872
929
  return :tPIPE
873
930
  end
874
931
  elsif src.scan(/\{/) then
875
- if defined?(@hack_expects_lambda) && @hack_expects_lambda
876
- @hack_expects_lambda = false
877
- self.lex_state = :expr_beg
932
+ self.brace_nest += 1
933
+ if lpar_beg && lpar_beg == paren_nest then
934
+ self.lpar_beg = nil
935
+ self.paren_nest -= 1
936
+
937
+ expr_beg_push "{"
938
+
878
939
  return :tLAMBEG
879
940
  end
880
941
 
881
- result = if is_arg? || in_lex_state?(:expr_end) then
942
+ result = if is_arg? || in_lex_state?(:expr_end, :expr_endfn) then
882
943
  :tLCURLY # block (primary)
883
944
  elsif in_lex_state?(:expr_endarg) then
884
945
  :tLBRACE_ARG # block (expr)
@@ -892,8 +953,7 @@ class RubyLexer
892
953
 
893
954
  return result
894
955
  elsif src.scan(/->/) then
895
- @hack_expects_lambda = true
896
- self.lex_state = :expr_arg
956
+ self.lex_state = :expr_endfn
897
957
  return :tLAMBDA
898
958
  elsif src.scan(/[+-]/) then
899
959
  sign = src.matched
@@ -920,8 +980,7 @@ class RubyLexer
920
980
  return :tOP_ASGN
921
981
  end
922
982
 
923
- if (is_beg? ||
924
- (is_arg? && space_seen && !src.check(/\s/))) then
983
+ if (is_beg? || (is_arg? && space_seen && !src.check(/\s/))) then
925
984
  if is_arg? then
926
985
  arg_ambiguous
927
986
  end
@@ -949,25 +1008,35 @@ class RubyLexer
949
1008
  self.yacc_value = "**"
950
1009
  return :tOP_ASGN
951
1010
  elsif src.scan(/\*\*/) then
1011
+ result = if is_space_arg? src.check(/./m) then
1012
+ warning "`**' interpreted as argument prefix"
1013
+ :tDSTAR
1014
+ elsif is_beg? then
1015
+ :tDSTAR
1016
+ else
1017
+ # TODO: warn_balanced("**", "argument prefix");
1018
+ :tPOW
1019
+ end
952
1020
  self.yacc_value = "**"
953
1021
  self.fix_arg_lex_state
954
- return :tPOW
1022
+ return result
955
1023
  elsif src.scan(/\*\=/) then
956
1024
  self.lex_state = :expr_beg
957
1025
  self.yacc_value = "*"
958
1026
  return :tOP_ASGN
959
1027
  elsif src.scan(/\*/) then
960
- result = if is_arg? && space_seen && src.check(/\S/) then
1028
+ result = if is_space_arg? src.check(/./m) then
961
1029
  warning("`*' interpreted as argument prefix")
962
1030
  :tSTAR
963
1031
  elsif is_beg? then
964
1032
  :tSTAR
965
1033
  else
966
- :tSTAR2
1034
+ # TODO: warn_balanced("*", "argument prefix");
1035
+ :tSTAR2 # TODO: rename
967
1036
  end
1037
+
968
1038
  self.yacc_value = "*"
969
1039
  self.fix_arg_lex_state
970
-
971
1040
  return result
972
1041
  end
973
1042
  elsif src.check(/\</) then
@@ -985,8 +1054,8 @@ class RubyLexer
985
1054
  self.yacc_value = "\<\<"
986
1055
  return :tOP_ASGN
987
1056
  elsif src.scan(/\<\</) then
988
- if (! in_lex_state?(:expr_end, :expr_dot,
989
- :expr_endarg, :expr_class) &&
1057
+ if (!in_lex_state?(:expr_dot, :expr_class) &&
1058
+ !is_end? &&
990
1059
  (!is_arg? || space_seen)) then
991
1060
  tok = self.heredoc_identifier
992
1061
  return tok if tok
@@ -1278,18 +1347,30 @@ class RubyLexer
1278
1347
  result
1279
1348
  end
1280
1349
 
1281
- def is_end?
1282
- in_lex_state? :expr_end, :expr_endarg, :expr_endfn
1350
+ def yylex_paren19
1351
+ if is_beg? then
1352
+ :tLPAREN
1353
+ elsif is_space_arg? then
1354
+ :tLPAREN_ARG
1355
+ else
1356
+ :tLPAREN2 # plain '(' in parse.y
1357
+ end
1283
1358
  end
1284
1359
 
1285
1360
  def is_arg?
1286
1361
  in_lex_state? :expr_arg, :expr_cmdarg
1287
1362
  end
1288
1363
 
1364
+ def is_end?
1365
+ in_lex_state? :expr_end, :expr_endarg, :expr_endfn
1366
+ end
1367
+
1289
1368
  def is_beg?
1290
- in_lex_state? :expr_beg, :expr_mid, :expr_value, :expr_class
1369
+ in_lex_state? :expr_beg, :expr_value, :expr_mid, :expr_class
1291
1370
  end
1292
1371
 
1372
+ # TODO #define IS_AFTER_OPERATOR() IS_lex_state(EXPR_FNAME | EXPR_DOT)
1373
+
1293
1374
  def is_space_arg? c = "x"
1294
1375
  is_arg? and space_seen and c !~ /\s/
1295
1376
  end
@@ -1298,23 +1379,7 @@ class RubyLexer
1298
1379
  (in_lex_state?(:expr_beg) && !command_state) || is_arg?
1299
1380
  end
1300
1381
 
1301
- def yylex_paren19 # TODO: move or remove
1302
- result =
1303
- if is_beg? then
1304
- :tLPAREN
1305
- elsif is_space_arg? then
1306
- :tLPAREN_ARG
1307
- else
1308
- :tLPAREN2 # plain '(' in parse.y
1309
- end
1310
-
1311
- # paren_nest++; # TODO
1312
-
1313
- result
1314
- end
1315
-
1316
1382
  def process_token(command_state)
1317
-
1318
1383
  token << src.matched if token =~ IDENT_RE && src.scan(/[\!\?](?!=)/)
1319
1384
 
1320
1385
  result = nil
@@ -1379,22 +1444,23 @@ class RubyLexer
1379
1444
  return keyword.id0
1380
1445
  end
1381
1446
 
1447
+ self.command_start = true if lex_state == :expr_beg
1448
+
1382
1449
  if keyword.id0 == :kDO then
1383
- self.command_start = true
1450
+ if lpar_beg && lpar_beg == paren_nest then
1451
+ self.lpar_beg = nil
1452
+ self.paren_nest -= 1
1384
1453
 
1385
- if defined?(@hack_expects_lambda) && @hack_expects_lambda
1386
- @hack_expects_lambda = false
1387
1454
  return :kDO_LAMBDA
1388
1455
  end
1389
1456
 
1390
1457
  return :kDO_COND if cond.is_in_state
1391
1458
  return :kDO_BLOCK if cmdarg.is_in_state && state != :expr_cmdarg
1392
- return :kDO_BLOCK if state == :expr_endarg
1393
-
1459
+ return :kDO_BLOCK if [:expr_beg, :expr_endarg].include? state
1394
1460
  return :kDO
1395
1461
  end
1396
1462
 
1397
- return keyword.id0 if state == :expr_beg or state == :expr_value
1463
+ return keyword.id0 if [:expr_beg, :expr_value].include? state
1398
1464
 
1399
1465
  self.lex_state = :expr_beg if keyword.id0 != keyword.id1
1400
1466
 
@@ -1406,13 +1472,13 @@ class RubyLexer
1406
1472
  # if (mb == ENC_CODERANGE_7BIT && lex_state != EXPR_DOT) {
1407
1473
 
1408
1474
  self.lex_state =
1409
- if is_beg? || in_lex_state?(:expr_dot) || is_arg? then
1475
+ if is_beg? || is_arg? || in_lex_state?(:expr_dot) then
1410
1476
  if command_state then
1411
1477
  :expr_cmdarg
1412
1478
  else
1413
1479
  :expr_arg
1414
1480
  end
1415
- elsif ruby19 && in_lex_state?(:expr_fname) then
1481
+ elsif !ruby18 && in_lex_state?(:expr_fname) then
1416
1482
  :expr_endfn
1417
1483
  else
1418
1484
  :expr_end
@@ -1422,8 +1488,10 @@ class RubyLexer
1422
1488
 
1423
1489
  self.yacc_value = token
1424
1490
 
1425
- self.lex_state = :expr_end if
1426
- last_state != :expr_dot && self.parser.env[token.to_sym] == :lvar
1491
+ if (![:expr_dot, :expr_fname].include?(last_state) &&
1492
+ self.parser.env[token.to_sym] == :lvar) then
1493
+ self.lex_state = :expr_end
1494
+ end
1427
1495
 
1428
1496
  return result
1429
1497
  end