yarp 0.7.0 → 0.9.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -208,18 +208,9 @@ module YARP
208
208
  end
209
209
  end
210
210
 
211
- # It is extremely non obvious which state the parser is in when comments get
212
- # dispatched. Because of this we don't both comparing state when comparing
213
- # against other comment tokens.
214
- class CommentToken < Token
215
- def ==(other)
216
- self[0...-1] == other[0...-1]
217
- end
218
- end
219
-
220
- # Heredoc end tokens are emitted in an odd order, so we don't compare the
221
- # state on them.
222
- class HeredocEndToken < Token
211
+ # Tokens whose lexer state should be ignored when comparing.
212
+ # Used for :on_comment, :on_heredoc_end, and :on_embexpr_end.
213
+ class IgnoreStateToken < Token
223
214
  def ==(other)
224
215
  self[0...-1] == other[0...-1]
225
216
  end
@@ -252,6 +243,23 @@ module YARP
252
243
  end
253
244
  end
254
245
 
246
+ # If we have an identifier that follows a method name like:
247
+ #
248
+ # def foo bar
249
+ #
250
+ # then Ripper will mark bar as END|LABEL if there is a local in a parent
251
+ # scope named bar because it hasn't pushed the local table yet. We do this
252
+ # more accurately, so we need to allow comparing against both END and
253
+ # END|LABEL.
254
+ class ParamToken < Token
255
+ def ==(other)
256
+ (self[0...-1] == other[0...-1]) && (
257
+ (other[3] == Ripper::EXPR_END) ||
258
+ (other[3] == Ripper::EXPR_END | Ripper::EXPR_LABEL)
259
+ )
260
+ end
261
+ end
262
+
255
263
  # A heredoc in this case is a list of tokens that belong to the body of the
256
264
  # heredoc that should be appended onto the list of tokens when the heredoc
257
265
  # closes.
@@ -558,18 +566,45 @@ module YARP
558
566
  result_value = result.value
559
567
  previous_state = nil
560
568
 
561
- # If there's a UTF-8 byte-order mark as the start of the file, then ripper
562
- # sets every token's on the first line back by 6 bytes. It also keeps the
563
- # byte order mark in the first token's value. This is weird, and I don't
564
- # want to mirror that in our parser. So instead, we'll match up the values
565
- # here, and then match up the locations as we process the tokens.
566
- bom = source.bytes[0..2] == [0xEF, 0xBB, 0xBF]
567
- result_value[0][0].value.prepend("\xEF\xBB\xBF") if bom
569
+ # In previous versions of Ruby, Ripper wouldn't flush the BOM before the
570
+ # first token, so we had to have a hack in place to account for that. This
571
+ # checks for that behavior.
572
+ bom_flushed = Ripper.lex("\xEF\xBB\xBF# test")[0][0][1] == 0
573
+ bom = source.byteslice(0..2) == "\xEF\xBB\xBF"
568
574
 
569
575
  result_value.each_with_index do |(token, lex_state), index|
570
576
  lineno = token.location.start_line
571
577
  column = token.location.start_column
572
- column -= index == 0 ? 6 : 3 if bom && lineno == 1
578
+
579
+ # If there's a UTF-8 byte-order mark at the start of the file, then for
580
+ # certain tokens ripper sets the first token back by 3 bytes. It also
581
+ # keeps the byte order mark in the first token's value. This is weird,
582
+ # and I don't want to mirror that in our parser. So instead, we'll match
583
+ # up the columns and values here.
584
+ if bom && lineno == 1
585
+ column -= 3
586
+
587
+ if index == 0 && column == 0 && !bom_flushed
588
+ flushed =
589
+ case token.type
590
+ when :BACK_REFERENCE, :INSTANCE_VARIABLE, :CLASS_VARIABLE,
591
+ :GLOBAL_VARIABLE, :NUMBERED_REFERENCE, :PERCENT_LOWER_I,
592
+ :PERCENT_LOWER_X, :PERCENT_LOWER_W, :PERCENT_UPPER_I,
593
+ :PERCENT_UPPER_W, :STRING_BEGIN
594
+ true
595
+ when :REGEXP_BEGIN, :SYMBOL_BEGIN
596
+ token.value.start_with?("%")
597
+ else
598
+ false
599
+ end
600
+
601
+ unless flushed
602
+ column -= 3
603
+ value = token.value
604
+ value.prepend(String.new("\xEF\xBB\xBF", encoding: value.encoding))
605
+ end
606
+ end
607
+ end
573
608
 
574
609
  event = RIPPER.fetch(token.type)
575
610
  value = token.value
@@ -580,13 +615,23 @@ module YARP
580
615
  when :on___end__
581
616
  EndContentToken.new([[lineno, column], event, value, lex_state])
582
617
  when :on_comment
583
- CommentToken.new([[lineno, column], event, value, lex_state])
618
+ IgnoreStateToken.new([[lineno, column], event, value, lex_state])
584
619
  when :on_heredoc_end
585
620
  # Heredoc end tokens can be emitted in an odd order, so we don't
586
621
  # want to bother comparing the state on them.
587
- HeredocEndToken.new([[lineno, column], event, value, lex_state])
588
- when :on_embexpr_end, :on_ident
589
- if lex_state == Ripper::EXPR_END | Ripper::EXPR_LABEL
622
+ IgnoreStateToken.new([[lineno, column], event, value, lex_state])
623
+ when :on_ident
624
+ if lex_state == Ripper::EXPR_END
625
+ # If we have an identifier that follows a method name like:
626
+ #
627
+ # def foo bar
628
+ #
629
+ # then Ripper will mark bar as END|LABEL if there is a local in a
630
+ # parent scope named bar because it hasn't pushed the local table
631
+ # yet. We do this more accurately, so we need to allow comparing
632
+ # against both END and END|LABEL.
633
+ ParamToken.new([[lineno, column], event, value, lex_state])
634
+ elsif lex_state == Ripper::EXPR_END | Ripper::EXPR_LABEL
590
635
  # In the event that we're comparing identifiers, we're going to
591
636
  # allow a little divergence. Ripper doesn't account for local
592
637
  # variables introduced through named captures in regexes, and we
@@ -595,6 +640,8 @@ module YARP
595
640
  else
596
641
  Token.new([[lineno, column], event, value, lex_state])
597
642
  end
643
+ when :on_embexpr_end
644
+ IgnoreStateToken.new([[lineno, column], event, value, lex_state])
598
645
  when :on_ignored_nl
599
646
  # Ignored newlines can occasionally have a LABEL state attached to
600
647
  # them which doesn't actually impact anything. We don't mirror that
@@ -629,6 +676,26 @@ module YARP
629
676
  previous_state
630
677
  end
631
678
 
679
+ Token.new([[lineno, column], event, value, lex_state])
680
+ when :on_eof
681
+ previous_token = result_value[index - 1][0]
682
+
683
+ # If we're at the end of the file and the previous token was a
684
+ # comment and there is still whitespace after the comment, then
685
+ # Ripper will append an on_nl token (even though there isn't
686
+ # necessarily a newline). We mirror that here.
687
+ start_offset = previous_token.location.end_offset
688
+ end_offset = token.location.start_offset
689
+
690
+ if previous_token.type == :COMMENT && start_offset < end_offset
691
+ if bom
692
+ start_offset += 3
693
+ end_offset += 3
694
+ end
695
+
696
+ tokens << Token.new([[lineno, 0], :on_nl, source.byteslice(start_offset...end_offset), lex_state])
697
+ end
698
+
632
699
  Token.new([[lineno, column], event, value, lex_state])
633
700
  else
634
701
  Token.new([[lineno, column], event, value, lex_state])
@@ -713,7 +780,8 @@ module YARP
713
780
  end
714
781
  end
715
782
 
716
- tokens.reject! { |t| t.event == :on_eof }
783
+ # Drop the EOF token from the list
784
+ tokens = tokens[0...-1]
717
785
 
718
786
  # We sort by location to compare against Ripper's output
719
787
  tokens.sort_by!(&:location)