prism 1.7.0 → 1.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +31 -1
  3. data/Makefile +7 -1
  4. data/config.yml +4 -4
  5. data/docs/releasing.md +2 -4
  6. data/docs/ripper_translation.md +8 -17
  7. data/docs/ruby_api.md +1 -0
  8. data/ext/prism/extension.h +1 -1
  9. data/include/prism/ast.h +4 -4
  10. data/include/prism/version.h +2 -2
  11. data/lib/prism/compiler.rb +152 -152
  12. data/lib/prism/lex_compat.rb +133 -150
  13. data/lib/prism/node.rb +1131 -20
  14. data/lib/prism/parse_result.rb +9 -0
  15. data/lib/prism/serialize.rb +1 -1
  16. data/lib/prism/translation/parser_current.rb +1 -1
  17. data/lib/prism/translation/parser_versions.rb +36 -0
  18. data/lib/prism/translation/ripper/filter.rb +53 -0
  19. data/lib/prism/translation/ripper/lexer.rb +135 -0
  20. data/lib/prism/translation/ripper.rb +84 -38
  21. data/lib/prism/translation/ruby_parser.rb +1 -1
  22. data/lib/prism/translation.rb +5 -5
  23. data/lib/prism/visitor.rb +152 -152
  24. data/lib/prism.rb +1 -14
  25. data/prism.gemspec +5 -11
  26. data/rbi/prism/node.rbi +3 -0
  27. data/rbi/prism/translation/parser_versions.rbi +23 -0
  28. data/rbi/prism.rbi +0 -3
  29. data/sig/prism/node.rbs +4 -0
  30. data/sig/prism/parse_result.rbs +1 -0
  31. data/sig/prism.rbs +54 -40
  32. data/src/prism.c +48 -27
  33. metadata +5 -11
  34. data/lib/prism/translation/parser33.rb +0 -13
  35. data/lib/prism/translation/parser34.rb +0 -13
  36. data/lib/prism/translation/parser35.rb +0 -8
  37. data/lib/prism/translation/parser40.rb +0 -13
  38. data/lib/prism/translation/parser41.rb +0 -13
  39. data/rbi/prism/translation/parser33.rbi +0 -6
  40. data/rbi/prism/translation/parser34.rbi +0 -6
  41. data/rbi/prism/translation/parser35.rbi +0 -4
  42. data/rbi/prism/translation/parser40.rbi +0 -6
  43. data/rbi/prism/translation/parser41.rbi +0 -6
@@ -1,9 +1,6 @@
1
1
  # frozen_string_literal: true
2
2
  # :markup: markdown
3
3
 
4
- require "delegate"
5
- require "ripper"
6
-
7
4
  module Prism
8
5
  # This class is responsible for lexing the source using prism and then
9
6
  # converting those tokens to be compatible with Ripper. In the vast majority
@@ -202,87 +199,51 @@ module Prism
202
199
  # When we produce tokens, we produce the same arrays that Ripper does.
203
200
  # However, we add a couple of convenience methods onto them to make them a
204
201
  # little easier to work with. We delegate all other methods to the array.
205
- class Token < SimpleDelegator
206
- # @dynamic initialize, each, []
202
+ class Token < BasicObject
203
+ # Create a new token object with the given ripper-compatible array.
204
+ def initialize(array)
205
+ @array = array
206
+ end
207
207
 
208
208
  # The location of the token in the source.
209
209
  def location
210
- self[0]
210
+ @array[0]
211
211
  end
212
212
 
213
213
  # The type of the token.
214
214
  def event
215
- self[1]
215
+ @array[1]
216
216
  end
217
217
 
218
218
  # The slice of the source that this token represents.
219
219
  def value
220
- self[2]
220
+ @array[2]
221
221
  end
222
222
 
223
223
  # The state of the lexer when this token was produced.
224
224
  def state
225
- self[3]
225
+ @array[3]
226
226
  end
227
- end
228
227
 
229
- # Ripper doesn't include the rest of the token in the event, so we need to
230
- # trim it down to just the content on the first line when comparing.
231
- class EndContentToken < Token
228
+ # We want to pretend that this is just an Array.
232
229
  def ==(other) # :nodoc:
233
- [self[0], self[1], self[2][0..self[2].index("\n")], self[3]] == other
230
+ @array == other
234
231
  end
235
- end
236
-
237
- # Tokens where state should be ignored
238
- # used for :on_comment, :on_heredoc_end, :on_embexpr_end
239
- class IgnoreStateToken < Token
240
- def ==(other) # :nodoc:
241
- self[0...-1] == other[0...-1]
242
- end
243
- end
244
232
 
245
- # Ident tokens for the most part are exactly the same, except sometimes we
246
- # know an ident is a local when ripper doesn't (when they are introduced
247
- # through named captures in regular expressions). In that case we don't
248
- # compare the state.
249
- class IdentToken < Token
250
- def ==(other) # :nodoc:
251
- (self[0...-1] == other[0...-1]) && (
252
- (other[3] == Ripper::EXPR_LABEL | Ripper::EXPR_END) ||
253
- (other[3] & Ripper::EXPR_ARG_ANY != 0)
254
- )
233
+ def respond_to_missing?(name, include_private = false) # :nodoc:
234
+ @array.respond_to?(name, include_private)
255
235
  end
256
- end
257
-
258
- # Ignored newlines can occasionally have a LABEL state attached to them, so
259
- # we compare the state differently here.
260
- class IgnoredNewlineToken < Token
261
- def ==(other) # :nodoc:
262
- return false unless self[0...-1] == other[0...-1]
263
236
 
264
- if self[3] == Ripper::EXPR_ARG | Ripper::EXPR_LABELED
265
- other[3] & Ripper::EXPR_ARG | Ripper::EXPR_LABELED != 0
266
- else
267
- self[3] == other[3]
268
- end
237
+ def method_missing(name, ...) # :nodoc:
238
+ @array.send(name, ...)
269
239
  end
270
240
  end
271
241
 
272
- # If we have an identifier that follows a method name like:
273
- #
274
- # def foo bar
275
- #
276
- # then Ripper will mark bar as END|LABEL if there is a local in a parent
277
- # scope named bar because it hasn't pushed the local table yet. We do this
278
- # more accurately, so we need to allow comparing against both END and
279
- # END|LABEL.
280
- class ParamToken < Token
242
+ # Tokens where state should be ignored
243
+ # used for :on_sp, :on_comment, :on_heredoc_end, :on_embexpr_end
244
+ class IgnoreStateToken < Token
281
245
  def ==(other) # :nodoc:
282
- (self[0...-1] == other[0...-1]) && (
283
- (other[3] == Ripper::EXPR_END) ||
284
- (other[3] == Ripper::EXPR_END | Ripper::EXPR_LABEL)
285
- )
246
+ self[0...-1] == other[0...-1]
286
247
  end
287
248
  end
288
249
 
@@ -615,10 +576,15 @@ module Prism
615
576
 
616
577
  private_constant :Heredoc
617
578
 
618
- attr_reader :source, :options
579
+ # In previous versions of Ruby, Ripper wouldn't flush the bom before the
580
+ # first token, so we had to have a hack in place to account for that.
581
+ BOM_FLUSHED = RUBY_VERSION >= "3.3.0"
582
+ private_constant :BOM_FLUSHED
619
583
 
620
- def initialize(source, **options)
621
- @source = source
584
+ attr_reader :options
585
+
586
+ def initialize(code, **options)
587
+ @code = code
622
588
  @options = options
623
589
  end
624
590
 
@@ -628,16 +594,14 @@ module Prism
628
594
  state = :default
629
595
  heredoc_stack = [[]] #: Array[Array[Heredoc::PlainHeredoc | Heredoc::DashHeredoc | Heredoc::DedentingHeredoc]]
630
596
 
631
- result = Prism.lex(source, **options)
597
+ result = Prism.lex(@code, **options)
598
+ source = result.source
632
599
  result_value = result.value
633
- previous_state = nil #: Ripper::Lexer::State?
600
+ previous_state = nil #: State?
634
601
  last_heredoc_end = nil #: Integer?
602
+ eof_token = nil
635
603
 
636
- # In previous versions of Ruby, Ripper wouldn't flush the bom before the
637
- # first token, so we had to have a hack in place to account for that. This
638
- # checks for that behavior.
639
- bom_flushed = Ripper.lex("\xEF\xBB\xBF# test")[0][0][1] == 0
640
- bom = source.byteslice(0..2) == "\xEF\xBB\xBF"
604
+ bom = source.slice(0, 3) == "\xEF\xBB\xBF"
641
605
 
642
606
  result_value.each_with_index do |(token, lex_state), index|
643
607
  lineno = token.location.start_line
@@ -651,7 +615,7 @@ module Prism
651
615
  if bom && lineno == 1
652
616
  column -= 3
653
617
 
654
- if index == 0 && column == 0 && !bom_flushed
618
+ if index == 0 && column == 0 && !BOM_FLUSHED
655
619
  flushed =
656
620
  case token.type
657
621
  when :BACK_REFERENCE, :INSTANCE_VARIABLE, :CLASS_VARIABLE,
@@ -675,12 +639,15 @@ module Prism
675
639
 
676
640
  event = RIPPER.fetch(token.type)
677
641
  value = token.value
678
- lex_state = Ripper::Lexer::State.new(lex_state)
642
+ lex_state = Translation::Ripper::Lexer::State.cached(lex_state)
679
643
 
680
644
  token =
681
645
  case event
682
646
  when :on___end__
683
- EndContentToken.new([[lineno, column], event, value, lex_state])
647
+ # Ripper doesn't include the rest of the token in the event, so we need to
648
+ # trim it down to just the content on the first line.
649
+ value = value[0..value.index("\n")]
650
+ Token.new([[lineno, column], event, value, lex_state])
684
651
  when :on_comment
685
652
  IgnoreStateToken.new([[lineno, column], event, value, lex_state])
686
653
  when :on_heredoc_end
@@ -688,33 +655,18 @@ module Prism
688
655
  # want to bother comparing the state on them.
689
656
  last_heredoc_end = token.location.end_offset
690
657
  IgnoreStateToken.new([[lineno, column], event, value, lex_state])
691
- when :on_ident
692
- if lex_state == Ripper::EXPR_END
693
- # If we have an identifier that follows a method name like:
694
- #
695
- # def foo bar
696
- #
697
- # then Ripper will mark bar as END|LABEL if there is a local in a
698
- # parent scope named bar because it hasn't pushed the local table
699
- # yet. We do this more accurately, so we need to allow comparing
700
- # against both END and END|LABEL.
701
- ParamToken.new([[lineno, column], event, value, lex_state])
702
- elsif lex_state == Ripper::EXPR_END | Ripper::EXPR_LABEL
703
- # In the event that we're comparing identifiers, we're going to
704
- # allow a little divergence. Ripper doesn't account for local
705
- # variables introduced through named captures in regexes, and we
706
- # do, which accounts for this difference.
707
- IdentToken.new([[lineno, column], event, value, lex_state])
708
- else
709
- Token.new([[lineno, column], event, value, lex_state])
710
- end
711
658
  when :on_embexpr_end
712
659
  IgnoreStateToken.new([[lineno, column], event, value, lex_state])
713
- when :on_ignored_nl
714
- # Ignored newlines can occasionally have a LABEL state attached to
715
- # them which doesn't actually impact anything. We don't mirror that
716
- # state so we ignored it.
717
- IgnoredNewlineToken.new([[lineno, column], event, value, lex_state])
660
+ when :on_words_sep
661
+ # Ripper emits one token each per line.
662
+ value.each_line.with_index do |line, index|
663
+ if index > 0
664
+ lineno += 1
665
+ column = 0
666
+ end
667
+ tokens << Token.new([[lineno, column], event, line, lex_state])
668
+ end
669
+ tokens.pop
718
670
  when :on_regexp_end
719
671
  # On regex end, Ripper scans and then sets end state, so the ripper
720
672
  # lexed output is begin, when it should be end. prism sets lex state
@@ -739,13 +691,14 @@ module Prism
739
691
  counter += { on_embexpr_beg: -1, on_embexpr_end: 1 }[current_event] || 0
740
692
  end
741
693
 
742
- Ripper::Lexer::State.new(result_value[current_index][1])
694
+ Translation::Ripper::Lexer::State.cached(result_value[current_index][1])
743
695
  else
744
696
  previous_state
745
697
  end
746
698
 
747
699
  Token.new([[lineno, column], event, value, lex_state])
748
700
  when :on_eof
701
+ eof_token = token
749
702
  previous_token = result_value[index - 1][0]
750
703
 
751
704
  # If we're at the end of the file and the previous token was a
@@ -768,7 +721,7 @@ module Prism
768
721
  end_offset += 3
769
722
  end
770
723
 
771
- tokens << Token.new([[lineno, 0], :on_nl, source.byteslice(start_offset...end_offset), lex_state])
724
+ tokens << Token.new([[lineno, 0], :on_nl, source.slice(start_offset, end_offset - start_offset), lex_state])
772
725
  end
773
726
  end
774
727
 
@@ -859,70 +812,100 @@ module Prism
859
812
  # Drop the EOF token from the list
860
813
  tokens = tokens[0...-1]
861
814
 
862
- # We sort by location to compare against Ripper's output
863
- tokens.sort_by!(&:location)
864
-
865
- Result.new(tokens, result.comments, result.magic_comments, result.data_loc, result.errors, result.warnings, Source.for(source))
866
- end
867
- end
868
-
869
- private_constant :LexCompat
815
+ # We sort by location because Ripper.lex sorts.
816
+ # Manually implemented instead of `sort_by!(&:location)` for performance.
817
+ tokens.sort_by! do |token|
818
+ line, column = token.location
819
+ source.byte_offset(line, column)
820
+ end
870
821
 
871
- # This is a class that wraps the Ripper lexer to produce almost exactly the
872
- # same tokens.
873
- class LexRipper # :nodoc:
874
- attr_reader :source
822
+ # Add :on_sp tokens
823
+ tokens = add_on_sp_tokens(tokens, source, result.data_loc, bom, eof_token)
875
824
 
876
- def initialize(source)
877
- @source = source
825
+ Result.new(tokens, result.comments, result.magic_comments, result.data_loc, result.errors, result.warnings, source)
878
826
  end
879
827
 
880
- def result
881
- previous = [] #: [[Integer, Integer], Symbol, String, untyped] | []
882
- results = [] #: Array[[[Integer, Integer], Symbol, String, untyped]]
883
-
884
- lex(source).each do |token|
885
- case token[1]
886
- when :on_sp
887
- # skip
888
- when :on_tstring_content
889
- if previous[1] == :on_tstring_content && (token[2].start_with?("\#$") || token[2].start_with?("\#@"))
890
- previous[2] << token[2]
828
+ def add_on_sp_tokens(tokens, source, data_loc, bom, eof_token)
829
+ new_tokens = []
830
+
831
+ prev_token_state = Translation::Ripper::Lexer::State.cached(Translation::Ripper::EXPR_BEG)
832
+ prev_token_end = bom ? 3 : 0
833
+
834
+ tokens.each do |token|
835
+ line, column = token.location
836
+ start_offset = source.byte_offset(line, column)
837
+
838
+ # Ripper reports columns on line 1 without counting the BOM, so we
839
+ # adjust to get the real offset
840
+ start_offset += 3 if line == 1 && bom
841
+
842
+ if start_offset > prev_token_end
843
+ sp_value = source.slice(prev_token_end, start_offset - prev_token_end)
844
+ sp_line = source.line(prev_token_end)
845
+ sp_column = source.column(prev_token_end)
846
+ # Ripper reports columns on line 1 without counting the BOM
847
+ sp_column -= 3 if sp_line == 1 && bom
848
+ continuation_index = sp_value.byteindex("\\")
849
+
850
+ # ripper emits up to three :on_sp tokens when line continuations are used
851
+ if continuation_index
852
+ next_whitespace_index = continuation_index + 1
853
+ next_whitespace_index += 1 if sp_value.byteslice(next_whitespace_index) == "\r"
854
+ next_whitespace_index += 1
855
+ first_whitespace = sp_value[0...continuation_index]
856
+ continuation = sp_value[continuation_index...next_whitespace_index]
857
+ second_whitespace = sp_value[next_whitespace_index..]
858
+
859
+ new_tokens << IgnoreStateToken.new([
860
+ [sp_line, sp_column],
861
+ :on_sp,
862
+ first_whitespace,
863
+ prev_token_state
864
+ ]) unless first_whitespace.empty?
865
+
866
+ new_tokens << IgnoreStateToken.new([
867
+ [sp_line, sp_column + continuation_index],
868
+ :on_sp,
869
+ continuation,
870
+ prev_token_state
871
+ ])
872
+
873
+ new_tokens << IgnoreStateToken.new([
874
+ [sp_line + 1, 0],
875
+ :on_sp,
876
+ second_whitespace,
877
+ prev_token_state
878
+ ]) unless second_whitespace.empty?
891
879
  else
892
- results << token
893
- previous = token
880
+ new_tokens << IgnoreStateToken.new([
881
+ [sp_line, sp_column],
882
+ :on_sp,
883
+ sp_value,
884
+ prev_token_state
885
+ ])
894
886
  end
895
- when :on_words_sep
896
- if previous[1] == :on_words_sep
897
- previous[2] << token[2]
898
- else
899
- results << token
900
- previous = token
901
- end
902
- else
903
- results << token
904
- previous = token
905
887
  end
906
- end
907
-
908
- results
909
- end
910
-
911
- private
912
888
 
913
- if Ripper.method(:lex).parameters.assoc(:keyrest)
914
- def lex(source)
915
- Ripper.lex(source, raise_errors: true)
889
+ new_tokens << token
890
+ prev_token_state = token.state
891
+ prev_token_end = start_offset + token.value.bytesize
916
892
  end
917
- else
918
- def lex(source)
919
- ripper = Ripper::Lexer.new(source)
920
- ripper.lex.tap do |result|
921
- raise SyntaxError, ripper.errors.map(&:message).join(' ;') if ripper.errors.any?
893
+
894
+ unless data_loc # no trailing :on_sp with __END__ as it is always preceded by :on_nl
895
+ end_offset = eof_token.location.end_offset
896
+ if prev_token_end < end_offset
897
+ new_tokens << IgnoreStateToken.new([
898
+ [source.line(prev_token_end), source.column(prev_token_end)],
899
+ :on_sp,
900
+ source.slice(prev_token_end, end_offset - prev_token_end),
901
+ prev_token_state
902
+ ])
922
903
  end
923
904
  end
905
+
906
+ new_tokens
924
907
  end
925
908
  end
926
909
 
927
- private_constant :LexRipper
910
+ private_constant :LexCompat
928
911
  end