yarp 0.8.0 → 0.10.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (63) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +48 -1
  3. data/Makefile +5 -1
  4. data/README.md +4 -3
  5. data/config.yml +461 -150
  6. data/docs/configuration.md +1 -0
  7. data/docs/encoding.md +5 -5
  8. data/docs/ruby_api.md +2 -0
  9. data/docs/serialization.md +3 -3
  10. data/docs/testing.md +2 -2
  11. data/ext/yarp/api_node.c +810 -199
  12. data/ext/yarp/extension.c +94 -31
  13. data/ext/yarp/extension.h +2 -2
  14. data/include/yarp/ast.h +653 -150
  15. data/include/yarp/defines.h +2 -1
  16. data/include/yarp/diagnostic.h +3 -3
  17. data/include/yarp/enc/yp_encoding.h +10 -10
  18. data/include/yarp/node.h +10 -0
  19. data/include/yarp/parser.h +19 -19
  20. data/include/yarp/regexp.h +1 -1
  21. data/include/yarp/unescape.h +7 -5
  22. data/include/yarp/util/yp_buffer.h +3 -0
  23. data/include/yarp/util/yp_char.h +16 -16
  24. data/include/yarp/util/yp_constant_pool.h +2 -2
  25. data/include/yarp/util/yp_newline_list.h +7 -4
  26. data/include/yarp/util/yp_string.h +4 -4
  27. data/include/yarp/util/yp_string_list.h +0 -3
  28. data/include/yarp/util/yp_strpbrk.h +1 -1
  29. data/include/yarp/version.h +2 -2
  30. data/include/yarp.h +14 -3
  31. data/lib/yarp/desugar_visitor.rb +204 -0
  32. data/lib/yarp/ffi.rb +27 -1
  33. data/lib/yarp/lex_compat.rb +93 -25
  34. data/lib/yarp/mutation_visitor.rb +683 -0
  35. data/lib/yarp/node.rb +3121 -597
  36. data/lib/yarp/serialize.rb +198 -126
  37. data/lib/yarp.rb +53 -7
  38. data/src/diagnostic.c +1 -1
  39. data/src/enc/yp_big5.c +15 -42
  40. data/src/enc/yp_euc_jp.c +16 -43
  41. data/src/enc/yp_gbk.c +19 -46
  42. data/src/enc/yp_shift_jis.c +16 -43
  43. data/src/enc/yp_tables.c +36 -38
  44. data/src/enc/yp_unicode.c +20 -25
  45. data/src/enc/yp_windows_31j.c +16 -43
  46. data/src/node.c +1444 -836
  47. data/src/prettyprint.c +324 -103
  48. data/src/regexp.c +21 -21
  49. data/src/serialize.c +429 -276
  50. data/src/token_type.c +2 -2
  51. data/src/unescape.c +184 -136
  52. data/src/util/yp_buffer.c +7 -2
  53. data/src/util/yp_char.c +34 -34
  54. data/src/util/yp_constant_pool.c +4 -4
  55. data/src/util/yp_memchr.c +1 -1
  56. data/src/util/yp_newline_list.c +14 -3
  57. data/src/util/yp_string.c +22 -20
  58. data/src/util/yp_string_list.c +0 -6
  59. data/src/util/yp_strncasecmp.c +3 -6
  60. data/src/util/yp_strpbrk.c +8 -8
  61. data/src/yarp.c +1504 -615
  62. data/yarp.gemspec +3 -1
  63. metadata +4 -2
@@ -0,0 +1,204 @@
1
+ # frozen_string_literal: true
2
+
3
+ module YARP
4
+ class DesugarVisitor < MutationVisitor
5
+ # @@foo &&= bar
6
+ #
7
+ # becomes
8
+ #
9
+ # @@foo && @@foo = bar
10
+ def visit_class_variable_and_write_node(node)
11
+ desugar_and_write_node(node, ClassVariableReadNode, ClassVariableWriteNode, arguments: [node.name])
12
+ end
13
+
14
+ # @@foo ||= bar
15
+ #
16
+ # becomes
17
+ #
18
+ # defined?(@@foo) ? @@foo : @@foo = bar
19
+ def visit_class_variable_or_write_node(node)
20
+ desugar_or_write_defined_node(node, ClassVariableReadNode, ClassVariableWriteNode, arguments: [node.name])
21
+ end
22
+
23
+ # @@foo += bar
24
+ #
25
+ # becomes
26
+ #
27
+ # @@foo = @@foo + bar
28
+ def visit_class_variable_operator_write_node(node)
29
+ desugar_operator_write_node(node, ClassVariableReadNode, ClassVariableWriteNode, arguments: [node.name])
30
+ end
31
+
32
+ # Foo &&= bar
33
+ #
34
+ # becomes
35
+ #
36
+ # Foo && Foo = bar
37
+ def visit_constant_and_write_node(node)
38
+ desugar_and_write_node(node, ConstantReadNode, ConstantWriteNode)
39
+ end
40
+
41
+ # Foo ||= bar
42
+ #
43
+ # becomes
44
+ #
45
+ # defined?(Foo) ? Foo : Foo = bar
46
+ def visit_constant_or_write_node(node)
47
+ desugar_or_write_defined_node(node, ConstantReadNode, ConstantWriteNode)
48
+ end
49
+
50
+ # Foo += bar
51
+ #
52
+ # becomes
53
+ #
54
+ # Foo = Foo + bar
55
+ def visit_constant_operator_write_node(node)
56
+ desugar_operator_write_node(node, ConstantReadNode, ConstantWriteNode)
57
+ end
58
+
59
+ # $foo &&= bar
60
+ #
61
+ # becomes
62
+ #
63
+ # $foo && $foo = bar
64
+ def visit_global_variable_and_write_node(node)
65
+ desugar_and_write_node(node, GlobalVariableReadNode, GlobalVariableWriteNode)
66
+ end
67
+
68
+ # $foo ||= bar
69
+ #
70
+ # becomes
71
+ #
72
+ # defined?($foo) ? $foo : $foo = bar
73
+ def visit_global_variable_or_write_node(node)
74
+ desugar_or_write_defined_node(node, GlobalVariableReadNode, GlobalVariableWriteNode)
75
+ end
76
+
77
+ # $foo += bar
78
+ #
79
+ # becomes
80
+ #
81
+ # $foo = $foo + bar
82
+ def visit_global_variable_operator_write_node(node)
83
+ desugar_operator_write_node(node, GlobalVariableReadNode, GlobalVariableWriteNode)
84
+ end
85
+
86
+ # @foo &&= bar
87
+ #
88
+ # becomes
89
+ #
90
+ # @foo && @foo = bar
91
+ def visit_instance_variable_and_write_node(node)
92
+ desugar_and_write_node(node, InstanceVariableReadNode, InstanceVariableWriteNode, arguments: [node.name])
93
+ end
94
+
95
+ # @foo ||= bar
96
+ #
97
+ # becomes
98
+ #
99
+ # @foo || @foo = bar
100
+ def visit_instance_variable_or_write_node(node)
101
+ desugar_or_write_node(node, InstanceVariableReadNode, InstanceVariableWriteNode, arguments: [node.name])
102
+ end
103
+
104
+ # @foo += bar
105
+ #
106
+ # becomes
107
+ #
108
+ # @foo = @foo + bar
109
+ def visit_instance_variable_operator_write_node(node)
110
+ desugar_operator_write_node(node, InstanceVariableReadNode, InstanceVariableWriteNode, arguments: [node.name])
111
+ end
112
+
113
+ # foo &&= bar
114
+ #
115
+ # becomes
116
+ #
117
+ # foo && foo = bar
118
+ def visit_local_variable_and_write_node(node)
119
+ desugar_and_write_node(node, LocalVariableReadNode, LocalVariableWriteNode, arguments: [node.name, node.depth])
120
+ end
121
+
122
+ # foo ||= bar
123
+ #
124
+ # becomes
125
+ #
126
+ # foo || foo = bar
127
+ def visit_local_variable_or_write_node(node)
128
+ desugar_or_write_node(node, LocalVariableReadNode, LocalVariableWriteNode, arguments: [node.name, node.depth])
129
+ end
130
+
131
+ # foo += bar
132
+ #
133
+ # becomes
134
+ #
135
+ # foo = foo + bar
136
+ def visit_local_variable_operator_write_node(node)
137
+ desugar_operator_write_node(node, LocalVariableReadNode, LocalVariableWriteNode, arguments: [node.name, node.depth])
138
+ end
139
+
140
+ private
141
+
142
+ # Desugar `x &&= y` to `x && x = y`
143
+ def desugar_and_write_node(node, read_class, write_class, arguments: [])
144
+ AndNode.new(
145
+ read_class.new(*arguments, node.name_loc),
146
+ write_class.new(*arguments, node.name_loc, node.value, node.operator_loc, node.location),
147
+ node.operator_loc,
148
+ node.location
149
+ )
150
+ end
151
+
152
+ # Desugar `x += y` to `x = x + y`
153
+ def desugar_operator_write_node(node, read_class, write_class, arguments: [])
154
+ write_class.new(
155
+ *arguments,
156
+ node.name_loc,
157
+ CallNode.new(
158
+ read_class.new(*arguments, node.name_loc),
159
+ nil,
160
+ node.operator_loc.copy(length: node.operator_loc.length - 1),
161
+ nil,
162
+ ArgumentsNode.new([node.value], node.value.location),
163
+ nil,
164
+ nil,
165
+ 0,
166
+ node.operator_loc.slice.chomp("="),
167
+ node.location
168
+ ),
169
+ node.operator_loc.copy(start_offset: node.operator_loc.end_offset - 1, length: 1),
170
+ node.location
171
+ )
172
+ end
173
+
174
+ # Desugar `x ||= y` to `x || x = y`
175
+ def desugar_or_write_node(node, read_class, write_class, arguments: [])
176
+ OrNode.new(
177
+ read_class.new(*arguments, node.name_loc),
178
+ write_class.new(*arguments, node.name_loc, node.value, node.operator_loc, node.location),
179
+ node.operator_loc,
180
+ node.location
181
+ )
182
+ end
183
+
184
+ # Desugar `x ||= y` to `defined?(x) ? x : x = y`
185
+ def desugar_or_write_defined_node(node, read_class, write_class, arguments: [])
186
+ IfNode.new(
187
+ node.operator_loc,
188
+ DefinedNode.new(nil, read_class.new(*arguments, node.name_loc), nil, node.operator_loc, node.name_loc),
189
+ StatementsNode.new([read_class.new(*arguments, node.name_loc)], node.location),
190
+ ElseNode.new(
191
+ node.operator_loc,
192
+ StatementsNode.new(
193
+ [write_class.new(*arguments, node.name_loc, node.value, node.operator_loc, node.location)],
194
+ node.location
195
+ ),
196
+ node.operator_loc,
197
+ node.location
198
+ ),
199
+ node.operator_loc,
200
+ node.location
201
+ )
202
+ end
203
+ end
204
+ end
data/lib/yarp/ffi.rb CHANGED
@@ -70,7 +70,8 @@ module YARP
70
70
  "yarp.h",
71
71
  "yp_version",
72
72
  "yp_parse_serialize",
73
- "yp_lex_serialize"
73
+ "yp_lex_serialize",
74
+ "yp_parse_lex_serialize"
74
75
  )
75
76
 
76
77
  load_exported_functions_from(
@@ -223,4 +224,29 @@ module YARP
223
224
  parse(string.read, filepath)
224
225
  end
225
226
  end
227
+
228
+ # Mirror the YARP.parse_lex API by using the serialization API.
229
+ def self.parse_lex(code, filepath = nil)
230
+ LibRubyParser::YPBuffer.with do |buffer|
231
+ metadata = [filepath.bytesize, filepath.b, 0].pack("LA*L") if filepath
232
+ LibRubyParser.yp_parse_lex_serialize(code, code.bytesize, buffer.pointer, metadata)
233
+
234
+ source = Source.new(code)
235
+ loader = Serialize::Loader.new(source, buffer.read)
236
+
237
+ tokens = loader.load_tokens
238
+ node, comments, errors, warnings = loader.load_nodes
239
+
240
+ tokens.each { |token,| token.value.force_encoding(loader.encoding) }
241
+
242
+ ParseResult.new([node, tokens], comments, errors, warnings, source)
243
+ end
244
+ end
245
+
246
+ # Mirror the YARP.parse_lex_file API by using the serialization API.
247
+ def self.parse_lex_file(filepath)
248
+ LibRubyParser::YPString.with(filepath) do |string|
249
+ parse_lex(string.read, filepath)
250
+ end
251
+ end
226
252
  end
@@ -208,18 +208,9 @@ module YARP
208
208
  end
209
209
  end
210
210
 
211
- # It is extremely non obvious which state the parser is in when comments get
212
- # dispatched. Because of this we don't bother comparing state when comparing
213
- # against other comment tokens.
214
- class CommentToken < Token
215
- def ==(other)
216
- self[0...-1] == other[0...-1]
217
- end
218
- end
219
-
220
- # Heredoc end tokens are emitted in an odd order, so we don't compare the
221
- # state on them.
222
- class HeredocEndToken < Token
211
+ # Tokens whose lex state should be ignored when comparing;
212
+ # used for :on_comment, :on_heredoc_end, and :on_embexpr_end.
213
+ class IgnoreStateToken < Token
223
214
  def ==(other)
224
215
  self[0...-1] == other[0...-1]
225
216
  end
@@ -252,6 +243,23 @@ module YARP
252
243
  end
253
244
  end
254
245
 
246
+ # If we have an identifier that follows a method name like:
247
+ #
248
+ # def foo bar
249
+ #
250
+ # then Ripper will mark bar as END|LABEL if there is a local in a parent
251
+ # scope named bar because it hasn't pushed the local table yet. We do this
252
+ # more accurately, so we need to allow comparing against both END and
253
+ # END|LABEL.
254
+ class ParamToken < Token
255
+ def ==(other)
256
+ (self[0...-1] == other[0...-1]) && (
257
+ (other[3] == Ripper::EXPR_END) ||
258
+ (other[3] == Ripper::EXPR_END | Ripper::EXPR_LABEL)
259
+ )
260
+ end
261
+ end
262
+
255
263
  # A heredoc in this case is a list of tokens that belong to the body of the
256
264
  # heredoc that should be appended onto the list of tokens when the heredoc
257
265
  # closes.
@@ -558,18 +566,45 @@ module YARP
558
566
  result_value = result.value
559
567
  previous_state = nil
560
568
 
561
- # If there's a UTF-8 byte-order mark at the start of the file, then ripper
562
- # sets every token on the first line back by 6 bytes. It also keeps the
563
- # byte order mark in the first token's value. This is weird, and I don't
564
- # want to mirror that in our parser. So instead, we'll match up the values
565
- # here, and then match up the locations as we process the tokens.
566
- bom = source.bytes[0..2] == [0xEF, 0xBB, 0xBF]
567
- result_value[0][0].value.prepend("\xEF\xBB\xBF") if bom
569
+ # In previous versions of Ruby, Ripper wouldn't flush the bom before the
570
+ # first token, so we had to have a hack in place to account for that. This
571
+ # checks for that behavior.
572
+ bom_flushed = Ripper.lex("\xEF\xBB\xBF# test")[0][0][1] == 0
573
+ bom = source.byteslice(0..2) == "\xEF\xBB\xBF"
568
574
 
569
575
  result_value.each_with_index do |(token, lex_state), index|
570
576
  lineno = token.location.start_line
571
577
  column = token.location.start_column
572
- column -= index == 0 ? 6 : 3 if bom && lineno == 1
578
+
579
+ # If there's a UTF-8 byte-order mark as the start of the file, then for
580
+ # certain tokens ripper sets the first token back by 3 bytes. It also
581
+ # keeps the byte order mark in the first token's value. This is weird,
582
+ # and I don't want to mirror that in our parser. So instead, we'll match
583
+ # up the columns and values here.
584
+ if bom && lineno == 1
585
+ column -= 3
586
+
587
+ if index == 0 && column == 0 && !bom_flushed
588
+ flushed =
589
+ case token.type
590
+ when :BACK_REFERENCE, :INSTANCE_VARIABLE, :CLASS_VARIABLE,
591
+ :GLOBAL_VARIABLE, :NUMBERED_REFERENCE, :PERCENT_LOWER_I,
592
+ :PERCENT_LOWER_X, :PERCENT_LOWER_W, :PERCENT_UPPER_I,
593
+ :PERCENT_UPPER_W, :STRING_BEGIN
594
+ true
595
+ when :REGEXP_BEGIN, :SYMBOL_BEGIN
596
+ token.value.start_with?("%")
597
+ else
598
+ false
599
+ end
600
+
601
+ unless flushed
602
+ column -= 3
603
+ value = token.value
604
+ value.prepend(String.new("\xEF\xBB\xBF", encoding: value.encoding))
605
+ end
606
+ end
607
+ end
573
608
 
574
609
  event = RIPPER.fetch(token.type)
575
610
  value = token.value
@@ -580,13 +615,23 @@ module YARP
580
615
  when :on___end__
581
616
  EndContentToken.new([[lineno, column], event, value, lex_state])
582
617
  when :on_comment
583
- CommentToken.new([[lineno, column], event, value, lex_state])
618
+ IgnoreStateToken.new([[lineno, column], event, value, lex_state])
584
619
  when :on_heredoc_end
585
620
  # Heredoc end tokens can be emitted in an odd order, so we don't
586
621
  # want to bother comparing the state on them.
587
- HeredocEndToken.new([[lineno, column], event, value, lex_state])
588
- when :on_embexpr_end, :on_ident
589
- if lex_state == Ripper::EXPR_END | Ripper::EXPR_LABEL
622
+ IgnoreStateToken.new([[lineno, column], event, value, lex_state])
623
+ when :on_ident
624
+ if lex_state == Ripper::EXPR_END
625
+ # If we have an identifier that follows a method name like:
626
+ #
627
+ # def foo bar
628
+ #
629
+ # then Ripper will mark bar as END|LABEL if there is a local in a
630
+ # parent scope named bar because it hasn't pushed the local table
631
+ # yet. We do this more accurately, so we need to allow comparing
632
+ # against both END and END|LABEL.
633
+ ParamToken.new([[lineno, column], event, value, lex_state])
634
+ elsif lex_state == Ripper::EXPR_END | Ripper::EXPR_LABEL
590
635
  # In the event that we're comparing identifiers, we're going to
591
636
  # allow a little divergence. Ripper doesn't account for local
592
637
  # variables introduced through named captures in regexes, and we
@@ -595,6 +640,8 @@ module YARP
595
640
  else
596
641
  Token.new([[lineno, column], event, value, lex_state])
597
642
  end
643
+ when :on_embexpr_end
644
+ IgnoreStateToken.new([[lineno, column], event, value, lex_state])
598
645
  when :on_ignored_nl
599
646
  # Ignored newlines can occasionally have a LABEL state attached to
600
647
  # them which doesn't actually impact anything. We don't mirror that
@@ -629,6 +676,26 @@ module YARP
629
676
  previous_state
630
677
  end
631
678
 
679
+ Token.new([[lineno, column], event, value, lex_state])
680
+ when :on_eof
681
+ previous_token = result_value[index - 1][0]
682
+
683
+ # If we're at the end of the file and the previous token was a
684
+ # comment and there is still whitespace after the comment, then
685
+ # Ripper will append an on_nl token (even though there isn't
686
+ # necessarily a newline). We mirror that here.
687
+ start_offset = previous_token.location.end_offset
688
+ end_offset = token.location.start_offset
689
+
690
+ if previous_token.type == :COMMENT && start_offset < end_offset
691
+ if bom
692
+ start_offset += 3
693
+ end_offset += 3
694
+ end
695
+
696
+ tokens << Token.new([[lineno, 0], :on_nl, source.byteslice(start_offset...end_offset), lex_state])
697
+ end
698
+
632
699
  Token.new([[lineno, column], event, value, lex_state])
633
700
  else
634
701
  Token.new([[lineno, column], event, value, lex_state])
@@ -713,7 +780,8 @@ module YARP
713
780
  end
714
781
  end
715
782
 
716
- tokens.reject! { |t| t.event == :on_eof }
783
+ # Drop the EOF token from the list
784
+ tokens = tokens[0...-1]
717
785
 
718
786
  # We sort by location to compare against Ripper's output
719
787
  tokens.sort_by!(&:location)