yarp 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CODE_OF_CONDUCT.md +76 -0
- data/CONTRIBUTING.md +51 -0
- data/LICENSE.md +7 -0
- data/Makefile.in +79 -0
- data/README.md +86 -0
- data/config.h.in +25 -0
- data/config.yml +2147 -0
- data/configure +4487 -0
- data/docs/build_system.md +85 -0
- data/docs/building.md +26 -0
- data/docs/configuration.md +56 -0
- data/docs/design.md +53 -0
- data/docs/encoding.md +116 -0
- data/docs/extension.md +20 -0
- data/docs/fuzzing.md +93 -0
- data/docs/heredocs.md +36 -0
- data/docs/mapping.md +117 -0
- data/docs/ripper.md +36 -0
- data/docs/serialization.md +130 -0
- data/docs/testing.md +55 -0
- data/ext/yarp/api_node.c +3680 -0
- data/ext/yarp/api_pack.c +256 -0
- data/ext/yarp/extconf.rb +131 -0
- data/ext/yarp/extension.c +547 -0
- data/ext/yarp/extension.h +18 -0
- data/include/yarp/ast.h +1412 -0
- data/include/yarp/defines.h +54 -0
- data/include/yarp/diagnostic.h +24 -0
- data/include/yarp/enc/yp_encoding.h +94 -0
- data/include/yarp/node.h +36 -0
- data/include/yarp/pack.h +141 -0
- data/include/yarp/parser.h +389 -0
- data/include/yarp/regexp.h +19 -0
- data/include/yarp/unescape.h +42 -0
- data/include/yarp/util/yp_buffer.h +39 -0
- data/include/yarp/util/yp_char.h +75 -0
- data/include/yarp/util/yp_constant_pool.h +64 -0
- data/include/yarp/util/yp_list.h +67 -0
- data/include/yarp/util/yp_memchr.h +14 -0
- data/include/yarp/util/yp_newline_list.h +54 -0
- data/include/yarp/util/yp_state_stack.h +24 -0
- data/include/yarp/util/yp_string.h +57 -0
- data/include/yarp/util/yp_string_list.h +28 -0
- data/include/yarp/util/yp_strpbrk.h +29 -0
- data/include/yarp/version.h +5 -0
- data/include/yarp.h +69 -0
- data/lib/yarp/lex_compat.rb +759 -0
- data/lib/yarp/node.rb +7428 -0
- data/lib/yarp/pack.rb +185 -0
- data/lib/yarp/ripper_compat.rb +174 -0
- data/lib/yarp/serialize.rb +389 -0
- data/lib/yarp.rb +330 -0
- data/src/diagnostic.c +25 -0
- data/src/enc/yp_big5.c +79 -0
- data/src/enc/yp_euc_jp.c +85 -0
- data/src/enc/yp_gbk.c +88 -0
- data/src/enc/yp_shift_jis.c +83 -0
- data/src/enc/yp_tables.c +509 -0
- data/src/enc/yp_unicode.c +2320 -0
- data/src/enc/yp_windows_31j.c +83 -0
- data/src/node.c +2011 -0
- data/src/pack.c +493 -0
- data/src/prettyprint.c +1782 -0
- data/src/regexp.c +580 -0
- data/src/serialize.c +1576 -0
- data/src/token_type.c +347 -0
- data/src/unescape.c +576 -0
- data/src/util/yp_buffer.c +78 -0
- data/src/util/yp_char.c +229 -0
- data/src/util/yp_constant_pool.c +147 -0
- data/src/util/yp_list.c +50 -0
- data/src/util/yp_memchr.c +31 -0
- data/src/util/yp_newline_list.c +119 -0
- data/src/util/yp_state_stack.c +25 -0
- data/src/util/yp_string.c +207 -0
- data/src/util/yp_string_list.c +32 -0
- data/src/util/yp_strncasecmp.c +20 -0
- data/src/util/yp_strpbrk.c +66 -0
- data/src/yarp.c +13211 -0
- data/yarp.gemspec +100 -0
- metadata +125 -0
@@ -0,0 +1,759 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "delegate"
|
4
|
+
|
5
|
+
module YARP
|
6
|
+
# This class is responsible for lexing the source using YARP and then
|
7
|
+
# converting those tokens to be compatible with Ripper. In the vast majority
|
8
|
+
# of cases, this is a one-to-one mapping of the token type. Everything else
|
9
|
+
# generally lines up. However, there are a few cases that require special
|
10
|
+
# handling.
|
11
|
+
class LexCompat
|
12
|
+
# This is a mapping of YARP token types to Ripper token types. This is a
# many-to-one mapping because we split up our token types, whereas Ripper
# tends to group them.
#
# Keys are YARP token type names (as symbols); values are the Ripper
# scanner event names that `result` emits for them. The table is frozen
# because it is a pure lookup table and must never be mutated at runtime.
RIPPER = {
  AMPERSAND: :on_op,
  AMPERSAND_AMPERSAND: :on_op,
  AMPERSAND_AMPERSAND_EQUAL: :on_op,
  AMPERSAND_DOT: :on_op,
  AMPERSAND_EQUAL: :on_op,
  BACK_REFERENCE: :on_backref,
  BACKTICK: :on_backtick,
  BANG: :on_op,
  BANG_EQUAL: :on_op,
  BANG_TILDE: :on_op,
  BRACE_LEFT: :on_lbrace,
  BRACE_RIGHT: :on_rbrace,
  BRACKET_LEFT: :on_lbracket,
  BRACKET_LEFT_ARRAY: :on_lbracket,
  BRACKET_LEFT_RIGHT: :on_op,
  BRACKET_LEFT_RIGHT_EQUAL: :on_op,
  BRACKET_RIGHT: :on_rbracket,
  CARET: :on_op,
  CARET_EQUAL: :on_op,
  CHARACTER_LITERAL: :on_CHAR,
  CLASS_VARIABLE: :on_cvar,
  COLON: :on_op,
  COLON_COLON: :on_op,
  COMMA: :on_comma,
  COMMENT: :on_comment,
  CONSTANT: :on_const,
  DOT: :on_period,
  DOT_DOT: :on_op,
  DOT_DOT_DOT: :on_op,
  EMBDOC_BEGIN: :on_embdoc_beg,
  EMBDOC_END: :on_embdoc_end,
  EMBDOC_LINE: :on_embdoc,
  EMBEXPR_BEGIN: :on_embexpr_beg,
  EMBEXPR_END: :on_embexpr_end,
  EMBVAR: :on_embvar,
  EOF: :on_eof,
  EQUAL: :on_op,
  EQUAL_EQUAL: :on_op,
  EQUAL_EQUAL_EQUAL: :on_op,
  EQUAL_GREATER: :on_op,
  EQUAL_TILDE: :on_op,
  FLOAT: :on_float,
  FLOAT_IMAGINARY: :on_imaginary,
  FLOAT_RATIONAL: :on_rational,
  FLOAT_RATIONAL_IMAGINARY: :on_imaginary,
  GREATER: :on_op,
  GREATER_EQUAL: :on_op,
  GREATER_GREATER: :on_op,
  GREATER_GREATER_EQUAL: :on_op,
  GLOBAL_VARIABLE: :on_gvar,
  HEREDOC_END: :on_heredoc_end,
  HEREDOC_START: :on_heredoc_beg,
  IDENTIFIER: :on_ident,
  IGNORED_NEWLINE: :on_ignored_nl,
  INTEGER: :on_int,
  INTEGER_IMAGINARY: :on_imaginary,
  INTEGER_RATIONAL: :on_rational,
  INTEGER_RATIONAL_IMAGINARY: :on_imaginary,
  INSTANCE_VARIABLE: :on_ivar,
  INVALID: :INVALID,
  # All of the keyword variants (modifier forms, looping forms, upcased
  # BEGIN/END) collapse to Ripper's single :on_kw event.
  KEYWORD___ENCODING__: :on_kw,
  KEYWORD___LINE__: :on_kw,
  KEYWORD___FILE__: :on_kw,
  KEYWORD_ALIAS: :on_kw,
  KEYWORD_AND: :on_kw,
  KEYWORD_BEGIN: :on_kw,
  KEYWORD_BEGIN_UPCASE: :on_kw,
  KEYWORD_BREAK: :on_kw,
  KEYWORD_CASE: :on_kw,
  KEYWORD_CLASS: :on_kw,
  KEYWORD_DEF: :on_kw,
  KEYWORD_DEFINED: :on_kw,
  KEYWORD_DO: :on_kw,
  KEYWORD_DO_LOOP: :on_kw,
  KEYWORD_ELSE: :on_kw,
  KEYWORD_ELSIF: :on_kw,
  KEYWORD_END: :on_kw,
  KEYWORD_END_UPCASE: :on_kw,
  KEYWORD_ENSURE: :on_kw,
  KEYWORD_FALSE: :on_kw,
  KEYWORD_FOR: :on_kw,
  KEYWORD_IF: :on_kw,
  KEYWORD_IF_MODIFIER: :on_kw,
  KEYWORD_IN: :on_kw,
  KEYWORD_MODULE: :on_kw,
  KEYWORD_NEXT: :on_kw,
  KEYWORD_NIL: :on_kw,
  KEYWORD_NOT: :on_kw,
  KEYWORD_OR: :on_kw,
  KEYWORD_REDO: :on_kw,
  KEYWORD_RESCUE: :on_kw,
  KEYWORD_RESCUE_MODIFIER: :on_kw,
  KEYWORD_RETRY: :on_kw,
  KEYWORD_RETURN: :on_kw,
  KEYWORD_SELF: :on_kw,
  KEYWORD_SUPER: :on_kw,
  KEYWORD_THEN: :on_kw,
  KEYWORD_TRUE: :on_kw,
  KEYWORD_UNDEF: :on_kw,
  KEYWORD_UNLESS: :on_kw,
  KEYWORD_UNLESS_MODIFIER: :on_kw,
  KEYWORD_UNTIL: :on_kw,
  KEYWORD_UNTIL_MODIFIER: :on_kw,
  KEYWORD_WHEN: :on_kw,
  KEYWORD_WHILE: :on_kw,
  KEYWORD_WHILE_MODIFIER: :on_kw,
  KEYWORD_YIELD: :on_kw,
  LABEL: :on_label,
  LABEL_END: :on_label_end,
  LAMBDA_BEGIN: :on_tlambeg,
  LESS: :on_op,
  LESS_EQUAL: :on_op,
  LESS_EQUAL_GREATER: :on_op,
  LESS_LESS: :on_op,
  LESS_LESS_EQUAL: :on_op,
  MINUS: :on_op,
  MINUS_EQUAL: :on_op,
  MINUS_GREATER: :on_tlambda,
  NEWLINE: :on_nl,
  NUMBERED_REFERENCE: :on_backref,
  PARENTHESIS_LEFT: :on_lparen,
  PARENTHESIS_LEFT_PARENTHESES: :on_lparen,
  PARENTHESIS_RIGHT: :on_rparen,
  PERCENT: :on_op,
  PERCENT_EQUAL: :on_op,
  PERCENT_LOWER_I: :on_qsymbols_beg,
  PERCENT_LOWER_W: :on_qwords_beg,
  PERCENT_LOWER_X: :on_backtick,
  PERCENT_UPPER_I: :on_symbols_beg,
  PERCENT_UPPER_W: :on_words_beg,
  PIPE: :on_op,
  PIPE_EQUAL: :on_op,
  PIPE_PIPE: :on_op,
  PIPE_PIPE_EQUAL: :on_op,
  PLUS: :on_op,
  PLUS_EQUAL: :on_op,
  QUESTION_MARK: :on_op,
  RATIONAL_FLOAT: :on_rational,
  RATIONAL_INTEGER: :on_rational,
  REGEXP_BEGIN: :on_regexp_beg,
  REGEXP_END: :on_regexp_end,
  SEMICOLON: :on_semicolon,
  SLASH: :on_op,
  SLASH_EQUAL: :on_op,
  STAR: :on_op,
  STAR_EQUAL: :on_op,
  STAR_STAR: :on_op,
  STAR_STAR_EQUAL: :on_op,
  STRING_BEGIN: :on_tstring_beg,
  STRING_CONTENT: :on_tstring_content,
  STRING_END: :on_tstring_end,
  SYMBOL_BEGIN: :on_symbeg,
  TILDE: :on_op,
  # The U-prefixed types are YARP's unary/ambiguous variants; Ripper does
  # not distinguish them from the binary forms, so they all map to :on_op.
  UAMPERSAND: :on_op,
  UCOLON_COLON: :on_op,
  UDOT_DOT: :on_op,
  UDOT_DOT_DOT: :on_op,
  UMINUS: :on_op,
  UMINUS_NUM: :on_op,
  UPLUS: :on_op,
  USTAR: :on_op,
  USTAR_STAR: :on_op,
  WORDS_SEP: :on_words_sep,
  "__END__": :on___end__
}.freeze
|
181
|
+
|
182
|
+
# When we produce tokens, we produce the same arrays that Ripper does.
# However, we add a couple of convenience methods onto them to make them a
# little easier to work with. We delegate all other methods to the array.
#
# A token is a four-element array: [[lineno, column], event, value, state].
class Token < SimpleDelegator
  # The [lineno, column] pair marking where the token starts.
  def location
    self[0]
  end

  # The Ripper scanner event name, e.g. :on_ident.
  def event
    self[1]
  end

  # The source text covered by the token.
  def value
    self[2]
  end

  # The lexer state after this token was lexed.
  def state
    self[3]
  end
end

# Ripper doesn't include the rest of the token in the event, so we need to
# trim it down to just the content on the first line when comparing.
class EndContentToken < Token
  def ==(other)
    [self[0], self[1], self[2][0..self[2].index("\n")], self[3]] == other
  end
end

# It is extremely non obvious which state the parser is in when comments get
# dispatched. Because of this we don't bother comparing state when comparing
# against other comment tokens.
class CommentToken < Token
  def ==(other)
    self[0...-1] == other[0...-1]
  end
end

# Heredoc end tokens are emitted in an odd order, so we don't compare the
# state on them.
class HeredocEndToken < Token
  def ==(other)
    self[0...-1] == other[0...-1]
  end
end

# Ident tokens for the most part are exactly the same, except sometimes we
# know an ident is a local when ripper doesn't (when they are introduced
# through named captures in regular expressions). In that case we don't
# compare the state.
class IdentToken < Token
  def ==(other)
    (self[0...-1] == other[0...-1]) && (
      (other[3] == Ripper::EXPR_LABEL | Ripper::EXPR_END) ||
      (other[3] & Ripper::EXPR_ARG_ANY != 0)
    )
  end
end

# Ignored newlines can occasionally have a LABEL state attached to them, so
# we compare the state differently here.
class IgnoredNewlineToken < Token
  def ==(other)
    return false unless self[0...-1] == other[0...-1]

    # Tokens are four-element arrays ([location, event, value, state]), so
    # the state lives at index 3. The previous implementation read index 4,
    # which is always nil and made the state comparison vacuously true.
    if self[3] == Ripper::EXPR_ARG | Ripper::EXPR_LABELED
      # When our state is ARG|LABELED, accept the other token if its state
      # shares any of those bits. The parentheses matter here: `&` and `|`
      # bind tighter than `>`, so the previous unparenthesized expression
      # `other[4] & EXPR_ARG | EXPR_LABELED > 0` always compared against a
      # non-zero constant. anybits? avoids the precedence trap entirely.
      other[3].anybits?(Ripper::EXPR_ARG | Ripper::EXPR_LABELED)
    else
      self[3] == other[3]
    end
  end
end
|
254
|
+
|
255
|
+
# A heredoc in this case is a list of tokens that belong to the body of the
|
256
|
+
# heredoc that should be appended onto the list of tokens when the heredoc
|
257
|
+
# closes.
|
258
|
+
module Heredoc
|
259
|
+
# Heredocs that are no dash or tilde heredocs are just a list of tokens.
# We need to keep them around so that we can insert them in the correct
# order back into the token stream and set the state of the last token to
# the state that the heredoc was opened in.
class PlainHeredoc
  # The tokens collected for this heredoc's body, in arrival order.
  attr_reader :tokens

  def initialize
    @tokens = []
  end

  # Append a token to the heredoc body.
  def <<(token)
    @tokens.push(token)
  end

  # Plain heredocs need no post-processing, so this is simply the
  # collected token list.
  def to_a
    @tokens
  end
end
|
278
|
+
|
279
|
+
# Dash heredocs are a little more complicated. They are a list of tokens
# that need to be split on "\\\n" to mimic Ripper's behavior. We also need
# to keep track of the state that the heredoc was opened in.
class DashHeredoc
  # split is true when the heredoc is interpolating (not single-quoted),
  # in which case string content gets split on escaped newlines.
  attr_reader :split, :tokens

  def initialize(split)
    @split = split
    @tokens = []
  end

  # Append a token to the heredoc body.
  def <<(token)
    tokens << token
  end

  # Replay the collected tokens, splitting plain string content on escaped
  # newlines (when splitting is enabled and we are outside interpolation).
  def to_a
    depth = 0
    results = []

    tokens.each do |token|
      event = token.event

      if event == :on_embexpr_beg
        depth += 1
        results << token
      elsif event == :on_embexpr_end
        depth -= 1
        results << token
      elsif event == :on_tstring_content && depth == 0 && split
        lineno, column = token[0]

        # Split on "\\\n" to mimic Ripper's behavior. Use a lookbehind
        # to keep the delimiter in the result.
        token.value.split(/(?<=[^\\]\\\n)|(?<=[^\\]\\\r\n)/).each_with_index do |value, index|
          column = 0 if index > 0
          results << Token.new([[lineno, column], :on_tstring_content, value, token.state])
          lineno += value.count("\n")
        end
      else
        # String content inside interpolation, non-splitting heredocs, and
        # every other event pass through untouched.
        results << token
      end
    end

    results
  end
end
|
330
|
+
|
331
|
+
# Heredocs that are dedenting heredocs are a little more complicated.
# Ripper outputs on_ignored_sp tokens for the whitespace that is being
# removed from the output. YARP only modifies the node itself and keeps
# the token the same. This simplifies YARP, but makes comparing against
# Ripper much harder because there is a length mismatch.
#
# Fortunately, we already have to pull out the heredoc tokens in order to
# insert them into the stream in the correct order. As such, we can do
# some extra manipulation on the tokens to make them match Ripper's
# output by mirroring the dedent logic that Ripper uses.
class DedentingHeredoc
  # A tab advances the measured indentation to the next multiple of this
  # width in the calculations below. NOTE(review): assumed to mirror
  # Ripper's tab-stop handling — confirm against CRuby's dedent logic.
  TAB_WIDTH = 8

  # tokens: collected body tokens; dedent: minimum common leading
  # whitespace seen so far (nil until a dedentable line appears);
  # dedent_next: whether the next string content starts a fresh line;
  # embexpr_balance: nesting depth of interpolation/nested heredocs.
  attr_reader :tokens, :dedent_next, :dedent, :embexpr_balance

  def initialize
    @tokens = []
    @dedent_next = true
    @dedent = nil
    @embexpr_balance = 0
  end

  # As tokens are coming in, we track the minimum amount of common leading
  # whitespace on plain string content tokens. This allows us to later
  # remove that amount of whitespace from the beginning of each line.
  def <<(token)
    case token.event
    when :on_embexpr_beg, :on_heredoc_beg
      # Entering interpolation or a nested heredoc suspends dedent
      # tracking until the matching end token.
      @embexpr_balance += 1
    when :on_embexpr_end, :on_heredoc_end
      @embexpr_balance -= 1
    when :on_tstring_content
      if embexpr_balance == 0
        # Split into physical lines (keeping the trailing newlines) and
        # measure the leading whitespace of each dedentable line.
        token.value.split(/(?<=\n)/).each_with_index do |line, index|
          # Entirely blank lines do not participate in the dedent
          # calculation.
          next if line.strip.empty? && line.end_with?("\n")
          # Only the continuation of a fresh line (or subsequent split
          # lines) can contribute to the common leading whitespace.
          next if !(dedent_next || index > 0)

          leading = line[/\A(\s*)\n?/, 1]
          next_dedent = 0

          leading.each_char do |char|
            if char == "\t"
              # Tabs jump to the next tab stop rather than counting as a
              # single column.
              next_dedent = next_dedent - (next_dedent % TAB_WIDTH) + TAB_WIDTH
            else
              next_dedent += 1
            end
          end

          # Keep the minimum indentation seen across all lines; compact
          # handles the initial nil.
          @dedent = [dedent, next_dedent].compact.min
        end
      end
    end

    # The next token starts a dedentable line only if this one was plain
    # string content outside of any interpolation.
    @dedent_next = token.event == :on_tstring_content && embexpr_balance == 0
    tokens << token
  end

  # Replay the collected tokens, splitting string content into per-line
  # tokens and inserting on_ignored_sp tokens for the removed indentation,
  # mirroring Ripper's output shape.
  def to_a
    # If every line in the heredoc is blank, we still need to split up the
    # string content token into multiple tokens.
    if dedent.nil?
      results = []
      embexpr_balance = 0

      tokens.each do |token|
        case token.event
        when :on_embexpr_beg, :on_heredoc_beg
          embexpr_balance += 1
          results << token
        when :on_embexpr_end, :on_heredoc_end
          embexpr_balance -= 1
          results << token
        when :on_tstring_content
          if embexpr_balance == 0
            lineno = token[0][0]
            column = token[0][1]

            token.value.split(/(?<=\n)/).each_with_index do |value, index|
              column = 0 if index > 0
              results << Token.new([[lineno, column], :on_tstring_content, value, token.state])
              lineno += 1
            end
          else
            results << token
          end
        else
          results << token
        end
      end

      return results
    end

    # Otherwise, we're going to run through each token in the list and
    # insert on_ignored_sp tokens for the amount of dedent that we need to
    # perform. We also need to remove the dedent from the beginning of
    # each line of plain string content tokens.
    results = []
    dedent_next = true
    embexpr_balance = 0

    tokens.each do |token|
      # Notice that the structure of this conditional largely matches the
      # whitespace calculation we performed above. This is because
      # checking if the subsequent token needs to be dedented is common to
      # both the dedent calculation and the ignored_sp insertion.
      case token.event
      when :on_embexpr_beg
        embexpr_balance += 1
        results << token
      when :on_embexpr_end
        embexpr_balance -= 1
        results << token
      when :on_tstring_content
        if embexpr_balance == 0
          # Here we're going to split the string on newlines, but maintain
          # the newlines in the resulting array. We'll do that with a look
          # behind assertion.
          splits = token.value.split(/(?<=\n)/)
          index = 0

          while index < splits.length
            line = splits[index]
            lineno = token[0][0] + index
            column = token[0][1]

            # Blank lines do not count toward common leading whitespace
            # calculation and do not need to be dedented.
            if dedent_next || index > 0
              column = 0
            end

            # If the dedent is 0 and we're not supposed to dedent the next
            # line or this line doesn't start with whitespace, then we
            # should concatenate the rest of the string to match ripper.
            if dedent == 0 && (!dedent_next || !line.start_with?(/\s/))
              line = splits[index..].join
              index = splits.length
            end

            # If we are supposed to dedent this line or if this is not the
            # first line of the string and this line isn't entirely blank,
            # then we need to insert an on_ignored_sp token and remove the
            # dedent from the beginning of the line.
            if (dedent > 0) && (dedent_next || index > 0)
              deleting = 0
              deleted_chars = []

              # Gather up all of the characters that we're going to
              # delete, stopping when you hit a character that would put
              # you over the dedent amount.
              line.each_char.with_index do |char, i|
                case char
                when "\r"
                  # A \r immediately before \n belongs to the line ending,
                  # not the indentation.
                  if line.chars[i + 1] == "\n"
                    break
                  end
                when "\n"
                  break
                when "\t"
                  deleting = deleting - (deleting % TAB_WIDTH) + TAB_WIDTH
                else
                  deleting += 1
                end

                break if deleting > dedent
                deleted_chars << char
              end

              # If we have something to delete, then delete it from the
              # string and insert an on_ignored_sp token.
              if deleted_chars.any?
                ignored = deleted_chars.join
                line.delete_prefix!(ignored)

                results << Token.new([[lineno, 0], :on_ignored_sp, ignored, token[3]])
                column = ignored.length
              end
            end

            results << Token.new([[lineno, column], token[1], line, token[3]]) unless line.empty?
            index += 1
          end
        else
          results << token
        end
      else
        results << token
      end

      dedent_next =
        ((token.event == :on_tstring_content) || (token.event == :on_heredoc_end)) &&
        embexpr_balance == 0
    end

    results
  end
end
|
529
|
+
|
530
|
+
# Here we will split between the two types of heredocs and return the
# object that will store their tokens.
def self.build(opening)
  # The third character of the opening token's value distinguishes the
  # heredoc flavor: "<<~" dedents, "<<-" allows an indented terminator,
  # and plain "<<" heredocs need no processing at all.
  marker = opening.value[2]

  if marker == "~"
    DedentingHeredoc.new
  elsif marker == "-"
    # Single-quoted dash heredocs do not interpolate, so their content is
    # not split on escaped newlines.
    DashHeredoc.new(opening.value[3] != "'")
  else
    PlainHeredoc.new
  end
end
|
542
|
+
end
|
543
|
+
|
544
|
+
# The source string being lexed and the filepath reported alongside it.
attr_reader :source, :filepath

def initialize(source, filepath = "")
  # Callers may explicitly pass nil for the filepath; normalize it so the
  # reader and downstream calls always see a String.
  @filepath = filepath || ""
  @source = source
end
|
550
|
+
|
551
|
+
# Lexes the source with YARP, converts every token into a Ripper-style
# token array via the RIPPER mapping, and reorders heredoc bodies so the
# stream matches the order Ripper would produce. Returns a ParseResult
# wrapping the token list. NOTE(review): assumes ParseResult accepts
# (value, comments, errors, warnings, extra) — confirm against yarp.rb.
def result
  tokens = []

  # state tracks where we are relative to a heredoc declaration; the
  # heredoc stack collects heredoc bodies so they can be flushed back into
  # the stream at the point Ripper would emit them.
  state = :default
  heredoc_stack = [[]]

  result = YARP.lex(source, @filepath)
  result_value = result.value
  previous_state = nil

  # If there's a UTF-8 byte-order mark as the start of the file, then ripper
  # sets every token's on the first line back by 6 bytes. It also keeps the
  # byte order mark in the first token's value. This is weird, and I don't
  # want to mirror that in our parser. So instead, we'll match up the values
  # here, and then match up the locations as we process the tokens.
  bom = source.bytes[0..2] == [0xEF, 0xBB, 0xBF]
  result_value[0][0].value.prepend("\xEF\xBB\xBF") if bom

  result_value.each_with_index do |(token, lex_state), index|
    lineno = token.location.start_line
    column = token.location.start_column
    # Mirror Ripper's BOM column math: the very first token shifts back by
    # 6, every other token on line 1 shifts back by 3.
    column -= index == 0 ? 6 : 3 if bom && lineno == 1

    event = RIPPER.fetch(token.type)
    value = token.value
    lex_state = Ripper::Lexer::State.new(lex_state)

    # Wrap the raw array in the Token subclass that knows how to compare
    # itself against Ripper's output for this event.
    token =
      case event
      when :on___end__
        EndContentToken.new([[lineno, column], event, value, lex_state])
      when :on_comment
        CommentToken.new([[lineno, column], event, value, lex_state])
      when :on_heredoc_end
        # Heredoc end tokens can be emitted in an odd order, so we don't
        # want to bother comparing the state on them.
        HeredocEndToken.new([[lineno, column], event, value, lex_state])
      when :on_embexpr_end, :on_ident
        if lex_state == Ripper::EXPR_END | Ripper::EXPR_LABEL
          # In the event that we're comparing identifiers, we're going to
          # allow a little divergence. Ripper doesn't account for local
          # variables introduced through named captures in regexes, and we
          # do, which accounts for this difference.
          IdentToken.new([[lineno, column], event, value, lex_state])
        else
          Token.new([[lineno, column], event, value, lex_state])
        end
      when :on_ignored_nl
        # Ignored newlines can occasionally have a LABEL state attached to
        # them which doesn't actually impact anything. We don't mirror that
        # state so we ignored it.
        IgnoredNewlineToken.new([[lineno, column], event, value, lex_state])
      when :on_regexp_end
        # On regex end, Ripper scans and then sets end state, so the ripper
        # lexed output is begin, when it should be end. YARP sets lex state
        # correctly to end state, but we want to be able to compare against
        # Ripper's lexed state. So here, if it's a regexp end token, we
        # output the state as the previous state, solely for the sake of
        # comparison.
        previous_token = result_value[index - 1][0]
        lex_state =
          if RIPPER.fetch(previous_token.type) == :on_embexpr_end
            # If the previous token is embexpr_end, then we have to do even
            # more processing. The end of an embedded expression sets the
            # state to the state that it had at the beginning of the
            # embedded expression. So we have to go and find that state and
            # set it here.
            counter = 1
            current_index = index - 1

            # Walk backwards, balancing embexpr begin/end pairs, until we
            # find the matching embexpr_beg for the embexpr_end we saw.
            until counter == 0
              current_index -= 1
              current_event = RIPPER.fetch(result_value[current_index][0].type)
              counter += { on_embexpr_beg: -1, on_embexpr_end: 1 }[current_event] || 0
            end

            Ripper::Lexer::State.new(result_value[current_index][1])
          else
            previous_state
          end

        Token.new([[lineno, column], event, value, lex_state])
      else
        Token.new([[lineno, column], event, value, lex_state])
      end

    previous_state = lex_state

    # The order in which tokens appear in our lexer is different from the
    # order that they appear in Ripper. When we hit the declaration of a
    # heredoc in YARP, we skip forward and lex the rest of the content of
    # the heredoc before going back and lexing at the end of the heredoc
    # identifier.
    #
    # To match up to ripper, we keep a small state variable around here to
    # track whether we're in the middle of a heredoc or not. In this way we
    # can shuffle around the token to match Ripper's output.
    case state
    when :default
      tokens << token

      if event == :on_heredoc_beg
        state = :heredoc_opened
        heredoc_stack.last << Heredoc.build(token)
      end
    when :heredoc_opened
      heredoc_stack.last.last << token

      case event
      when :on_heredoc_beg
        # A heredoc opened inside another heredoc's body starts a new
        # group on the stack.
        heredoc_stack << [Heredoc.build(token)]
      when :on_heredoc_end
        state = :heredoc_closed
      end
    when :heredoc_closed
      if %i[on_nl on_ignored_nl on_comment].include?(event) || (event == :on_tstring_content && value.end_with?("\n"))
        if heredoc_stack.size > 1
          # An inner heredoc group has finished: flush its tokens into the
          # enclosing group rather than the output stream.
          flushing = heredoc_stack.pop
          heredoc_stack.last.last << token

          flushing.each do |heredoc|
            heredoc.to_a.each do |flushed_token|
              heredoc_stack.last.last << flushed_token
            end
          end

          state = :heredoc_opened
          next
        end
      elsif event == :on_heredoc_beg
        # Another heredoc opens on the same line; keep collecting.
        tokens << token
        state = :heredoc_opened
        heredoc_stack.last << Heredoc.build(token)
        next
      elsif heredoc_stack.size > 1
        heredoc_stack[-2].last << token
        next
      end

      # The line that declared the heredoc(s) has ended: emit the stored
      # heredoc bodies, then the current token.
      heredoc_stack.last.each do |heredoc|
        tokens.concat(heredoc.to_a)
      end

      heredoc_stack.last.clear
      state = :default

      tokens << token
    end
  end

  # Drop the EOF token before comparison.
  tokens.reject! { |t| t.event == :on_eof }

  # We sort by location to compare against Ripper's output
  tokens.sort_by!(&:location)

  # Sanity check: every YARP token except EOF should have survived the
  # heredoc shuffling above.
  if result_value.size - 1 > tokens.size
    raise StandardError, "Lost tokens when performing lex_compat"
  end

  ParseResult.new(tokens, result.comments, result.errors, result.warnings, [])
end
|
712
|
+
end
|
713
|
+
|
714
|
+
# The constant that wraps the behavior of the lexer to match Ripper's output
|
715
|
+
# is an implementation detail, so we don't want it to be public.
|
716
|
+
private_constant :LexCompat
|
717
|
+
|
718
|
+
# Returns an array of tokens that closely resembles that of the Ripper lexer.
# The only difference is that since we don't keep track of lexer state in the
# same way, it's going to always return the NONE state.
def self.lex_compat(source, filepath = "")
  compat = LexCompat.new(source, filepath)
  compat.result
end
|
724
|
+
|
725
|
+
# This lexes with the Ripper lex. It drops any space events but otherwise
# returns the same tokens. Raises SyntaxError if the syntax in source is
# invalid.
def self.lex_ripper(source)
  previous = []

  Ripper.lex(source, raise_errors: true).each_with_object([]) do |token, results|
    event = token[1]

    # Space tokens are dropped entirely.
    next if event == :on_sp

    # Ripper splits interpolated globals/ivars out of string content and
    # emits consecutive words_sep tokens; fold both back into the
    # preceding token to normalize the stream.
    merge_into_previous =
      (event == :on_tstring_content && previous[1] == :on_tstring_content &&
        (token[2].start_with?("\#$") || token[2].start_with?("\#@"))) ||
      (event == :on_words_sep && previous[1] == :on_words_sep)

    if merge_into_previous
      previous[2] << token[2]
    else
      results << token
      previous = token
    end
  end
end
|
759
|
+
end
|