yarp 0.8.0 → 0.9.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,267 @@
1
+ # frozen_string_literal: true
2
+
3
module YARP
  # A MutationVisitor subclass that rewrites compound assignment nodes
  # (`&&=`, `||=` and binary-operator forms such as `+=`) into the longer
  # expressions they abbreviate.
  #
  # Every visit method builds and returns a new node; the node it receives is
  # only read from. Locations on the new nodes are taken from the original
  # node (name_loc, operator_loc, location), so nothing here re-measures the
  # source text.
  #
  # NOTE(review): node.value is passed through as-is and is not visited by
  # these methods, so whether compound assignments nested inside the value get
  # desugared depends on code outside this class - confirm.
  class DesugarVisitor < MutationVisitor
    # Rewrites
    #
    #     @@foo &&= bar
    #
    # as
    #
    #     @@foo && @@foo = bar
    def visit_class_variable_and_write_node(node)
      read = ClassVariableReadNode.new(node.name_loc)
      write = ClassVariableWriteNode.new(node.name_loc, node.value, node.operator_loc, node.location)

      AndNode.new(read, write, node.operator_loc, node.location)
    end

    # Rewrites
    #
    #     @@foo ||= bar
    #
    # as
    #
    #     @@foo || @@foo = bar
    def visit_class_variable_or_write_node(node)
      read = ClassVariableReadNode.new(node.name_loc)
      write = ClassVariableWriteNode.new(node.name_loc, node.value, node.operator_loc, node.location)

      OrNode.new(read, write, node.operator_loc, node.location)
    end

    # Rewrites
    #
    #     @@foo += bar
    #
    # as
    #
    #     @@foo = @@foo + bar
    def visit_class_variable_operator_write_node(node)
      desugar_operator_write_node(node, ClassVariableWriteNode, ClassVariableReadNode)
    end

    # Rewrites
    #
    #     Foo &&= bar
    #
    # as
    #
    #     Foo && Foo = bar
    def visit_constant_and_write_node(node)
      read = ConstantReadNode.new(node.name_loc)
      write = ConstantWriteNode.new(node.name_loc, node.value, node.operator_loc, node.location)

      AndNode.new(read, write, node.operator_loc, node.location)
    end

    # Rewrites
    #
    #     Foo ||= bar
    #
    # as
    #
    #     Foo || Foo = bar
    def visit_constant_or_write_node(node)
      read = ConstantReadNode.new(node.name_loc)
      write = ConstantWriteNode.new(node.name_loc, node.value, node.operator_loc, node.location)

      OrNode.new(read, write, node.operator_loc, node.location)
    end

    # Rewrites
    #
    #     Foo += bar
    #
    # as
    #
    #     Foo = Foo + bar
    def visit_constant_operator_write_node(node)
      desugar_operator_write_node(node, ConstantWriteNode, ConstantReadNode)
    end

    # Rewrites
    #
    #     Foo::Bar &&= baz
    #
    # as
    #
    #     Foo::Bar && Foo::Bar = baz
    #
    # The original node's target is used both as the left-hand side of the
    # AndNode and as the target of the new write node.
    def visit_constant_path_and_write_node(node)
      read = node.target
      write = ConstantPathWriteNode.new(node.target, node.value, node.operator_loc, node.location)

      AndNode.new(read, write, node.operator_loc, node.location)
    end

    # Rewrites
    #
    #     Foo::Bar ||= baz
    #
    # as
    #
    #     Foo::Bar || Foo::Bar = baz
    #
    # The original node's target is used both as the left-hand side of the
    # OrNode and as the target of the new write node.
    def visit_constant_path_or_write_node(node)
      read = node.target
      write = ConstantPathWriteNode.new(node.target, node.value, node.operator_loc, node.location)

      OrNode.new(read, write, node.operator_loc, node.location)
    end

    # Rewrites
    #
    #     Foo::Bar += baz
    #
    # as
    #
    #     Foo::Bar = Foo::Bar + baz
    #
    # Constant paths already carry a target node, so that node stands in for
    # both the write target and the left-hand side of the operator call.
    def visit_constant_path_operator_write_node(node)
      write_target = node.target
      call = desugar_operator_call_node(node, node.target)

      ConstantPathWriteNode.new(write_target, call, desugar_assignment_loc(node), node.location)
    end

    # Rewrites
    #
    #     $foo &&= bar
    #
    # as
    #
    #     $foo && $foo = bar
    def visit_global_variable_and_write_node(node)
      read = GlobalVariableReadNode.new(node.name_loc)
      write = GlobalVariableWriteNode.new(node.name_loc, node.value, node.operator_loc, node.location)

      AndNode.new(read, write, node.operator_loc, node.location)
    end

    # Rewrites
    #
    #     $foo ||= bar
    #
    # as
    #
    #     $foo || $foo = bar
    def visit_global_variable_or_write_node(node)
      read = GlobalVariableReadNode.new(node.name_loc)
      write = GlobalVariableWriteNode.new(node.name_loc, node.value, node.operator_loc, node.location)

      OrNode.new(read, write, node.operator_loc, node.location)
    end

    # Rewrites
    #
    #     $foo += bar
    #
    # as
    #
    #     $foo = $foo + bar
    def visit_global_variable_operator_write_node(node)
      desugar_operator_write_node(node, GlobalVariableWriteNode, GlobalVariableReadNode)
    end

    # Rewrites
    #
    #     @foo &&= bar
    #
    # as
    #
    #     @foo && @foo = bar
    def visit_instance_variable_and_write_node(node)
      read = InstanceVariableReadNode.new(node.name_loc)
      write = InstanceVariableWriteNode.new(node.name_loc, node.value, node.operator_loc, node.location)

      AndNode.new(read, write, node.operator_loc, node.location)
    end

    # Rewrites
    #
    #     @foo ||= bar
    #
    # as
    #
    #     @foo || @foo = bar
    def visit_instance_variable_or_write_node(node)
      read = InstanceVariableReadNode.new(node.name_loc)
      write = InstanceVariableWriteNode.new(node.name_loc, node.value, node.operator_loc, node.location)

      OrNode.new(read, write, node.operator_loc, node.location)
    end

    # Rewrites
    #
    #     @foo += bar
    #
    # as
    #
    #     @foo = @foo + bar
    def visit_instance_variable_operator_write_node(node)
      desugar_operator_write_node(node, InstanceVariableWriteNode, InstanceVariableReadNode)
    end

    # Rewrites
    #
    #     foo &&= bar
    #
    # as
    #
    #     foo && foo = bar
    #
    # Local variable nodes additionally carry the original node's constant_id
    # and depth, which are copied onto both the read and the write node.
    def visit_local_variable_and_write_node(node)
      read = LocalVariableReadNode.new(node.constant_id, node.depth, node.name_loc)
      write = LocalVariableWriteNode.new(
        node.constant_id,
        node.depth,
        node.name_loc,
        node.value,
        node.operator_loc,
        node.location
      )

      AndNode.new(read, write, node.operator_loc, node.location)
    end

    # Rewrites
    #
    #     foo ||= bar
    #
    # as
    #
    #     foo || foo = bar
    #
    # Local variable nodes additionally carry the original node's constant_id
    # and depth, which are copied onto both the read and the write node.
    def visit_local_variable_or_write_node(node)
      read = LocalVariableReadNode.new(node.constant_id, node.depth, node.name_loc)
      write = LocalVariableWriteNode.new(
        node.constant_id,
        node.depth,
        node.name_loc,
        node.value,
        node.operator_loc,
        node.location
      )

      OrNode.new(read, write, node.operator_loc, node.location)
    end

    # Rewrites
    #
    #     foo += bar
    #
    # as
    #
    #     foo = foo + bar
    #
    # The constant_id and depth of the original node are handed to the shared
    # helper so that they lead the argument lists of the read and write nodes.
    def visit_local_variable_operator_write_node(node)
      leading_arguments = [node.constant_id, node.depth]

      desugar_operator_write_node(
        node,
        LocalVariableWriteNode,
        LocalVariableReadNode,
        arguments: leading_arguments
      )
    end

    private

    # Shared rewrite of `x += y` into `x = x + y`.
    #
    # node        - the operator write node being replaced
    # write_class - node class instantiated for the outer `x = ...`
    # read_class  - node class instantiated for the inner read of `x`
    # arguments   - extra values placed before name_loc in the constructor
    #               calls of both read_class and write_class
    #
    # Returns the new write_class instance.
    def desugar_operator_write_node(node, write_class, read_class, arguments: [])
      read = read_class.new(*arguments, node.name_loc)
      call = desugar_operator_call_node(node, read)

      write_class.new(*arguments, node.name_loc, call, desugar_assignment_loc(node), node.location)
    end

    # Builds the CallNode for the `x + y` half of a desugared operator write.
    # read_node becomes the first constructor argument and node.value is
    # wrapped in a single-element ArgumentsNode located at the value itself.
    def desugar_operator_call_node(node, read_node)
      # Same operator location, one unit shorter: it no longer covers the
      # final character of the compound operator.
      message_loc = node.operator_loc.copy(length: node.operator_loc.length - 1)
      call_arguments = ArgumentsNode.new([node.value], node.value.location)
      # The operator text with a trailing "=" removed, e.g. "+=" gives "+".
      name = node.operator_loc.slice.chomp("=")

      # NOTE(review): the nil and 0 slots are presumably the call operator,
      # opening/closing locations, block and flags - confirm against CallNode.
      CallNode.new(read_node, nil, message_loc, nil, call_arguments, nil, nil, 0, name, node.location)
    end

    # Location of length one that ends where the compound operator ends, used
    # as the operator location of the desugared write node.
    def desugar_assignment_loc(node)
      node.operator_loc.copy(start_offset: node.operator_loc.end_offset - 1, length: 1)
    end
  end
end
data/lib/yarp/ffi.rb CHANGED
@@ -70,7 +70,8 @@ module YARP
70
70
  "yarp.h",
71
71
  "yp_version",
72
72
  "yp_parse_serialize",
73
- "yp_lex_serialize"
73
+ "yp_lex_serialize",
74
+ "yp_parse_lex_serialize"
74
75
  )
75
76
 
76
77
  load_exported_functions_from(
@@ -223,4 +224,29 @@ module YARP
223
224
  parse(string.read, filepath)
224
225
  end
225
226
  end
227
+
228
# Counterpart of YARP.parse_lex that goes through the serialization API: the
# source is handed to yp_parse_lex_serialize and the serialized output is
# loaded back into Ruby objects.
#
# code     - the source text to parse and lex
# filepath - optional path; when given it is packed into a metadata string
#            that is passed along to the native call
#
# Returns a ParseResult built from the `[node, tokens]` pair together with
# the comments, errors, warnings and source.
def self.parse_lex(code, filepath = nil)
  LibRubyParser::YPBuffer.with do |buffer|
    # Layout: path byte length ("L"), the path bytes ("A*"), then a 0 ("L").
    # NOTE(review): the meaning of the trailing 0 is not visible here -
    # confirm against the native metadata format.
    metadata = filepath ? [filepath.bytesize, filepath.b, 0].pack("LA*L") : nil
    LibRubyParser.yp_parse_lex_serialize(code, code.bytesize, buffer.pointer, metadata)

    source = Source.new(code)
    loader = Serialize::Loader.new(source, buffer.read)

    # Tokens are loaded first and the nodes afterwards.
    tokens = loader.load_tokens
    node, comments, errors, warnings = loader.load_nodes

    # Each entry is destructured and only its first element, the token, has
    # its value re-tagged with the encoding reported by the loader.
    tokens.each do |(token, _)|
      token.value.force_encoding(loader.encoding)
    end

    ParseResult.new([node, tokens], comments, errors, warnings, source)
  end
end
245
+
246
# Counterpart of YARP.parse_lex_file that goes through the serialization API.
# The file at filepath is opened through LibRubyParser::YPString, and what it
# reads is passed to parse_lex along with the same path.
#
# Returns whatever parse_lex returns for that content.
def self.parse_lex_file(filepath)
  LibRubyParser::YPString.with(filepath) { |contents| parse_lex(contents.read, filepath) }
end
226
252
  end
@@ -208,18 +208,9 @@ module YARP
208
208
  end
209
209
  end
210
210
 
211
- # It is extremely non obvious which state the parser is in when comments get
212
- # dispatched. Because of this we don't both comparing state when comparing
213
- # against other comment tokens.
214
- class CommentToken < Token
215
- def ==(other)
216
- self[0...-1] == other[0...-1]
217
- end
218
- end
219
-
220
- # Heredoc end tokens are emitted in an odd order, so we don't compare the
221
- # state on them.
222
- class HeredocEndToken < Token
211
+ # Tokens where state should be ignored
212
+ # used for :on_comment, :on_heredoc_end, :on_embexpr_end
213
+ class IgnoreStateToken < Token
223
214
  def ==(other)
224
215
  self[0...-1] == other[0...-1]
225
216
  end
@@ -252,6 +243,23 @@ module YARP
252
243
  end
253
244
  end
254
245
 
246
# Token for an identifier that directly follows a method name, as in:
#
#     def foo bar
#
# When a parent scope already has a local named bar, Ripper reports bar with
# the state END|LABEL because it has not pushed the new local table yet. This
# lexer is more accurate at that point, so the comparison has to accept a
# token in either the END or the END|LABEL state.
class ParamToken < Token
  # Everything except the last element must match, and the state stored at
  # index 3 of the other token must be EXPR_END or EXPR_END | EXPR_LABEL.
  def ==(other)
    same_prefix = self[0...-1] == other[0...-1]
    return same_prefix unless same_prefix

    other_state = other[3]
    other_state == Ripper::EXPR_END || other_state == (Ripper::EXPR_END | Ripper::EXPR_LABEL)
  end
end
262
+
255
263
  # A heredoc in this case is a list of tokens that belong to the body of the
256
264
  # heredoc that should be appended onto the list of tokens when the heredoc
257
265
  # closes.
@@ -558,18 +566,45 @@ module YARP
558
566
  result_value = result.value
559
567
  previous_state = nil
560
568
 
561
- # If there's a UTF-8 byte-order mark as the start of the file, then ripper
562
- # sets every token's on the first line back by 6 bytes. It also keeps the
563
- # byte order mark in the first token's value. This is weird, and I don't
564
- # want to mirror that in our parser. So instead, we'll match up the values
565
- # here, and then match up the locations as we process the tokens.
566
- bom = source.bytes[0..2] == [0xEF, 0xBB, 0xBF]
567
- result_value[0][0].value.prepend("\xEF\xBB\xBF") if bom
569
+ # In previous versions of Ruby, Ripper wouldn't flush the bom before the
570
+ # first token, so we had to have a hack in place to account for that. This
571
+ # checks for that behavior.
572
+ bom_flushed = Ripper.lex("\xEF\xBB\xBF# test")[0][0][1] == 0
573
+ bom = source.byteslice(0..2) == "\xEF\xBB\xBF"
568
574
 
569
575
  result_value.each_with_index do |(token, lex_state), index|
570
576
  lineno = token.location.start_line
571
577
  column = token.location.start_column
572
- column -= index == 0 ? 6 : 3 if bom && lineno == 1
578
+
579
+ # If there's a UTF-8 byte-order mark at the start of the file, then for
580
+ # certain tokens ripper sets the first token back by 3 bytes. It also
581
+ # keeps the byte order mark in the first token's value. This is weird,
582
+ # and I don't want to mirror that in our parser. So instead, we'll match
583
+ # up the columns and values here.
584
+ if bom && lineno == 1
585
+ column -= 3
586
+
587
+ if index == 0 && column == 0 && !bom_flushed
588
+ flushed =
589
+ case token.type
590
+ when :BACK_REFERENCE, :INSTANCE_VARIABLE, :CLASS_VARIABLE,
591
+ :GLOBAL_VARIABLE, :NUMBERED_REFERENCE, :PERCENT_LOWER_I,
592
+ :PERCENT_LOWER_X, :PERCENT_LOWER_W, :PERCENT_UPPER_I,
593
+ :PERCENT_UPPER_W, :STRING_BEGIN
594
+ true
595
+ when :REGEXP_BEGIN, :SYMBOL_BEGIN
596
+ token.value.start_with?("%")
597
+ else
598
+ false
599
+ end
600
+
601
+ unless flushed
602
+ column -= 3
603
+ value = token.value
604
+ value.prepend(String.new("\xEF\xBB\xBF", encoding: value.encoding))
605
+ end
606
+ end
607
+ end
573
608
 
574
609
  event = RIPPER.fetch(token.type)
575
610
  value = token.value
@@ -580,13 +615,23 @@ module YARP
580
615
  when :on___end__
581
616
  EndContentToken.new([[lineno, column], event, value, lex_state])
582
617
  when :on_comment
583
- CommentToken.new([[lineno, column], event, value, lex_state])
618
+ IgnoreStateToken.new([[lineno, column], event, value, lex_state])
584
619
  when :on_heredoc_end
585
620
  # Heredoc end tokens can be emitted in an odd order, so we don't
586
621
  # want to bother comparing the state on them.
587
- HeredocEndToken.new([[lineno, column], event, value, lex_state])
588
- when :on_embexpr_end, :on_ident
589
- if lex_state == Ripper::EXPR_END | Ripper::EXPR_LABEL
622
+ IgnoreStateToken.new([[lineno, column], event, value, lex_state])
623
+ when :on_ident
624
+ if lex_state == Ripper::EXPR_END
625
+ # If we have an identifier that follows a method name like:
626
+ #
627
+ # def foo bar
628
+ #
629
+ # then Ripper will mark bar as END|LABEL if there is a local in a
630
+ # parent scope named bar because it hasn't pushed the local table
631
+ # yet. We do this more accurately, so we need to allow comparing
632
+ # against both END and END|LABEL.
633
+ ParamToken.new([[lineno, column], event, value, lex_state])
634
+ elsif lex_state == Ripper::EXPR_END | Ripper::EXPR_LABEL
590
635
  # In the event that we're comparing identifiers, we're going to
591
636
  # allow a little divergence. Ripper doesn't account for local
592
637
  # variables introduced through named captures in regexes, and we
@@ -595,6 +640,8 @@ module YARP
595
640
  else
596
641
  Token.new([[lineno, column], event, value, lex_state])
597
642
  end
643
+ when :on_embexpr_end
644
+ IgnoreStateToken.new([[lineno, column], event, value, lex_state])
598
645
  when :on_ignored_nl
599
646
  # Ignored newlines can occasionally have a LABEL state attached to
600
647
  # them which doesn't actually impact anything. We don't mirror that
@@ -629,6 +676,26 @@ module YARP
629
676
  previous_state
630
677
  end
631
678
 
679
+ Token.new([[lineno, column], event, value, lex_state])
680
+ when :on_eof
681
+ previous_token = result_value[index - 1][0]
682
+
683
+ # If we're at the end of the file and the previous token was a
684
+ # comment and there is still whitespace after the comment, then
685
+ # Ripper will append an on_nl token (even though there isn't
686
+ # necessarily a newline). We mirror that here.
687
+ start_offset = previous_token.location.end_offset
688
+ end_offset = token.location.start_offset
689
+
690
+ if previous_token.type == :COMMENT && start_offset < end_offset
691
+ if bom
692
+ start_offset += 3
693
+ end_offset += 3
694
+ end
695
+
696
+ tokens << Token.new([[lineno, 0], :on_nl, source.byteslice(start_offset...end_offset), lex_state])
697
+ end
698
+
632
699
  Token.new([[lineno, column], event, value, lex_state])
633
700
  else
634
701
  Token.new([[lineno, column], event, value, lex_state])
@@ -713,7 +780,8 @@ module YARP
713
780
  end
714
781
  end
715
782
 
716
- tokens.reject! { |t| t.event == :on_eof }
783
+ # Drop the EOF token from the list
784
+ tokens = tokens[0...-1]
717
785
 
718
786
  # We sort by location to compare against Ripper's output
719
787
  tokens.sort_by!(&:location)