prism 0.13.0
- checksums.yaml +7 -0
- data/CHANGELOG.md +172 -0
- data/CODE_OF_CONDUCT.md +76 -0
- data/CONTRIBUTING.md +62 -0
- data/LICENSE.md +7 -0
- data/Makefile +84 -0
- data/README.md +89 -0
- data/config.yml +2481 -0
- data/docs/build_system.md +74 -0
- data/docs/building.md +22 -0
- data/docs/configuration.md +60 -0
- data/docs/design.md +53 -0
- data/docs/encoding.md +117 -0
- data/docs/fuzzing.md +93 -0
- data/docs/heredocs.md +36 -0
- data/docs/mapping.md +117 -0
- data/docs/ripper.md +36 -0
- data/docs/ruby_api.md +25 -0
- data/docs/serialization.md +181 -0
- data/docs/testing.md +55 -0
- data/ext/prism/api_node.c +4725 -0
- data/ext/prism/api_pack.c +256 -0
- data/ext/prism/extconf.rb +136 -0
- data/ext/prism/extension.c +626 -0
- data/ext/prism/extension.h +18 -0
- data/include/prism/ast.h +1932 -0
- data/include/prism/defines.h +45 -0
- data/include/prism/diagnostic.h +231 -0
- data/include/prism/enc/pm_encoding.h +95 -0
- data/include/prism/node.h +41 -0
- data/include/prism/pack.h +141 -0
- data/include/prism/parser.h +418 -0
- data/include/prism/regexp.h +19 -0
- data/include/prism/unescape.h +48 -0
- data/include/prism/util/pm_buffer.h +51 -0
- data/include/prism/util/pm_char.h +91 -0
- data/include/prism/util/pm_constant_pool.h +78 -0
- data/include/prism/util/pm_list.h +67 -0
- data/include/prism/util/pm_memchr.h +14 -0
- data/include/prism/util/pm_newline_list.h +61 -0
- data/include/prism/util/pm_state_stack.h +24 -0
- data/include/prism/util/pm_string.h +61 -0
- data/include/prism/util/pm_string_list.h +25 -0
- data/include/prism/util/pm_strpbrk.h +29 -0
- data/include/prism/version.h +4 -0
- data/include/prism.h +82 -0
- data/lib/prism/compiler.rb +465 -0
- data/lib/prism/debug.rb +157 -0
- data/lib/prism/desugar_compiler.rb +206 -0
- data/lib/prism/dispatcher.rb +2051 -0
- data/lib/prism/dsl.rb +750 -0
- data/lib/prism/ffi.rb +251 -0
- data/lib/prism/lex_compat.rb +838 -0
- data/lib/prism/mutation_compiler.rb +718 -0
- data/lib/prism/node.rb +14540 -0
- data/lib/prism/node_ext.rb +55 -0
- data/lib/prism/node_inspector.rb +68 -0
- data/lib/prism/pack.rb +185 -0
- data/lib/prism/parse_result/comments.rb +172 -0
- data/lib/prism/parse_result/newlines.rb +60 -0
- data/lib/prism/parse_result.rb +266 -0
- data/lib/prism/pattern.rb +239 -0
- data/lib/prism/ripper_compat.rb +174 -0
- data/lib/prism/serialize.rb +662 -0
- data/lib/prism/visitor.rb +470 -0
- data/lib/prism.rb +64 -0
- data/prism.gemspec +113 -0
- data/src/diagnostic.c +287 -0
- data/src/enc/pm_big5.c +52 -0
- data/src/enc/pm_euc_jp.c +58 -0
- data/src/enc/pm_gbk.c +61 -0
- data/src/enc/pm_shift_jis.c +56 -0
- data/src/enc/pm_tables.c +507 -0
- data/src/enc/pm_unicode.c +2324 -0
- data/src/enc/pm_windows_31j.c +56 -0
- data/src/node.c +2633 -0
- data/src/pack.c +493 -0
- data/src/prettyprint.c +2136 -0
- data/src/prism.c +14587 -0
- data/src/regexp.c +580 -0
- data/src/serialize.c +1899 -0
- data/src/token_type.c +349 -0
- data/src/unescape.c +637 -0
- data/src/util/pm_buffer.c +103 -0
- data/src/util/pm_char.c +272 -0
- data/src/util/pm_constant_pool.c +252 -0
- data/src/util/pm_list.c +41 -0
- data/src/util/pm_memchr.c +33 -0
- data/src/util/pm_newline_list.c +134 -0
- data/src/util/pm_state_stack.c +19 -0
- data/src/util/pm_string.c +200 -0
- data/src/util/pm_string_list.c +29 -0
- data/src/util/pm_strncasecmp.c +17 -0
- data/src/util/pm_strpbrk.c +66 -0
- metadata +138 -0
data/lib/prism/lex_compat.rb
@@ -0,0 +1,838 @@
# frozen_string_literal: true

require "delegate"

module Prism
  # This class is responsible for lexing the source using prism and then
  # converting those tokens to be compatible with Ripper. In the vast majority
  # of cases, this is a one-to-one mapping of the token type. Everything else
  # generally lines up. However, there are a few cases that require special
  # handling.
  class LexCompat
    # This is a mapping of prism token types to Ripper token types. This is a
    # many-to-one mapping because we split up our token types, whereas Ripper
    # tends to group them.
    RIPPER = {
      AMPERSAND: :on_op,
      AMPERSAND_AMPERSAND: :on_op,
      AMPERSAND_AMPERSAND_EQUAL: :on_op,
      AMPERSAND_DOT: :on_op,
      AMPERSAND_EQUAL: :on_op,
      BACK_REFERENCE: :on_backref,
      BACKTICK: :on_backtick,
      BANG: :on_op,
      BANG_EQUAL: :on_op,
      BANG_TILDE: :on_op,
      BRACE_LEFT: :on_lbrace,
      BRACE_RIGHT: :on_rbrace,
      BRACKET_LEFT: :on_lbracket,
      BRACKET_LEFT_ARRAY: :on_lbracket,
      BRACKET_LEFT_RIGHT: :on_op,
      BRACKET_LEFT_RIGHT_EQUAL: :on_op,
      BRACKET_RIGHT: :on_rbracket,
      CARET: :on_op,
      CARET_EQUAL: :on_op,
      CHARACTER_LITERAL: :on_CHAR,
      CLASS_VARIABLE: :on_cvar,
      COLON: :on_op,
      COLON_COLON: :on_op,
      COMMA: :on_comma,
      COMMENT: :on_comment,
      CONSTANT: :on_const,
      DOT: :on_period,
      DOT_DOT: :on_op,
      DOT_DOT_DOT: :on_op,
      EMBDOC_BEGIN: :on_embdoc_beg,
      EMBDOC_END: :on_embdoc_end,
      EMBDOC_LINE: :on_embdoc,
      EMBEXPR_BEGIN: :on_embexpr_beg,
      EMBEXPR_END: :on_embexpr_end,
      EMBVAR: :on_embvar,
      EOF: :on_eof,
      EQUAL: :on_op,
      EQUAL_EQUAL: :on_op,
      EQUAL_EQUAL_EQUAL: :on_op,
      EQUAL_GREATER: :on_op,
      EQUAL_TILDE: :on_op,
      FLOAT: :on_float,
      FLOAT_IMAGINARY: :on_imaginary,
      FLOAT_RATIONAL: :on_rational,
      FLOAT_RATIONAL_IMAGINARY: :on_imaginary,
      GREATER: :on_op,
      GREATER_EQUAL: :on_op,
      GREATER_GREATER: :on_op,
      GREATER_GREATER_EQUAL: :on_op,
      GLOBAL_VARIABLE: :on_gvar,
      HEREDOC_END: :on_heredoc_end,
      HEREDOC_START: :on_heredoc_beg,
      IDENTIFIER: :on_ident,
      IGNORED_NEWLINE: :on_ignored_nl,
      INTEGER: :on_int,
      INTEGER_IMAGINARY: :on_imaginary,
      INTEGER_RATIONAL: :on_rational,
      INTEGER_RATIONAL_IMAGINARY: :on_imaginary,
      INSTANCE_VARIABLE: :on_ivar,
      INVALID: :INVALID,
      KEYWORD___ENCODING__: :on_kw,
      KEYWORD___LINE__: :on_kw,
      KEYWORD___FILE__: :on_kw,
      KEYWORD_ALIAS: :on_kw,
      KEYWORD_AND: :on_kw,
      KEYWORD_BEGIN: :on_kw,
      KEYWORD_BEGIN_UPCASE: :on_kw,
      KEYWORD_BREAK: :on_kw,
      KEYWORD_CASE: :on_kw,
      KEYWORD_CLASS: :on_kw,
      KEYWORD_DEF: :on_kw,
      KEYWORD_DEFINED: :on_kw,
      KEYWORD_DO: :on_kw,
      KEYWORD_DO_LOOP: :on_kw,
      KEYWORD_ELSE: :on_kw,
      KEYWORD_ELSIF: :on_kw,
      KEYWORD_END: :on_kw,
      KEYWORD_END_UPCASE: :on_kw,
      KEYWORD_ENSURE: :on_kw,
      KEYWORD_FALSE: :on_kw,
      KEYWORD_FOR: :on_kw,
      KEYWORD_IF: :on_kw,
      KEYWORD_IF_MODIFIER: :on_kw,
      KEYWORD_IN: :on_kw,
      KEYWORD_MODULE: :on_kw,
      KEYWORD_NEXT: :on_kw,
      KEYWORD_NIL: :on_kw,
      KEYWORD_NOT: :on_kw,
      KEYWORD_OR: :on_kw,
      KEYWORD_REDO: :on_kw,
      KEYWORD_RESCUE: :on_kw,
      KEYWORD_RESCUE_MODIFIER: :on_kw,
      KEYWORD_RETRY: :on_kw,
      KEYWORD_RETURN: :on_kw,
      KEYWORD_SELF: :on_kw,
      KEYWORD_SUPER: :on_kw,
      KEYWORD_THEN: :on_kw,
      KEYWORD_TRUE: :on_kw,
      KEYWORD_UNDEF: :on_kw,
      KEYWORD_UNLESS: :on_kw,
      KEYWORD_UNLESS_MODIFIER: :on_kw,
      KEYWORD_UNTIL: :on_kw,
      KEYWORD_UNTIL_MODIFIER: :on_kw,
      KEYWORD_WHEN: :on_kw,
      KEYWORD_WHILE: :on_kw,
      KEYWORD_WHILE_MODIFIER: :on_kw,
      KEYWORD_YIELD: :on_kw,
      LABEL: :on_label,
      LABEL_END: :on_label_end,
      LAMBDA_BEGIN: :on_tlambeg,
      LESS: :on_op,
      LESS_EQUAL: :on_op,
      LESS_EQUAL_GREATER: :on_op,
      LESS_LESS: :on_op,
      LESS_LESS_EQUAL: :on_op,
      METHOD_NAME: :on_ident,
      MINUS: :on_op,
      MINUS_EQUAL: :on_op,
      MINUS_GREATER: :on_tlambda,
      NEWLINE: :on_nl,
      NUMBERED_REFERENCE: :on_backref,
      PARENTHESIS_LEFT: :on_lparen,
      PARENTHESIS_LEFT_PARENTHESES: :on_lparen,
      PARENTHESIS_RIGHT: :on_rparen,
      PERCENT: :on_op,
      PERCENT_EQUAL: :on_op,
      PERCENT_LOWER_I: :on_qsymbols_beg,
      PERCENT_LOWER_W: :on_qwords_beg,
      PERCENT_LOWER_X: :on_backtick,
      PERCENT_UPPER_I: :on_symbols_beg,
      PERCENT_UPPER_W: :on_words_beg,
      PIPE: :on_op,
      PIPE_EQUAL: :on_op,
      PIPE_PIPE: :on_op,
      PIPE_PIPE_EQUAL: :on_op,
      PLUS: :on_op,
      PLUS_EQUAL: :on_op,
      QUESTION_MARK: :on_op,
      RATIONAL_FLOAT: :on_rational,
      RATIONAL_INTEGER: :on_rational,
      REGEXP_BEGIN: :on_regexp_beg,
      REGEXP_END: :on_regexp_end,
      SEMICOLON: :on_semicolon,
      SLASH: :on_op,
      SLASH_EQUAL: :on_op,
      STAR: :on_op,
      STAR_EQUAL: :on_op,
      STAR_STAR: :on_op,
      STAR_STAR_EQUAL: :on_op,
      STRING_BEGIN: :on_tstring_beg,
      STRING_CONTENT: :on_tstring_content,
      STRING_END: :on_tstring_end,
      SYMBOL_BEGIN: :on_symbeg,
      TILDE: :on_op,
      UAMPERSAND: :on_op,
      UCOLON_COLON: :on_op,
      UDOT_DOT: :on_op,
      UDOT_DOT_DOT: :on_op,
      UMINUS: :on_op,
      UMINUS_NUM: :on_op,
      UPLUS: :on_op,
      USTAR: :on_op,
      USTAR_STAR: :on_op,
      WORDS_SEP: :on_words_sep,
      "__END__": :on___end__
    }.freeze

    # When we produce tokens, we produce the same arrays that Ripper does.
    # However, we add a couple of convenience methods onto them to make them a
    # little easier to work with. We delegate all other methods to the array.
    class Token < SimpleDelegator
      def location
        self[0]
      end

      def event
        self[1]
      end

      def value
        self[2]
      end

      def state
        self[3]
      end
    end

    # Ripper doesn't include the rest of the token in the event, so we need to
    # trim it down to just the content on the first line when comparing.
    class EndContentToken < Token
      def ==(other)
        [self[0], self[1], self[2][0..self[2].index("\n")], self[3]] == other
      end
    end

    # Tokens where state should be ignored
    # used for :on_comment, :on_heredoc_end, :on_embexpr_end
    class IgnoreStateToken < Token
      def ==(other)
        self[0...-1] == other[0...-1]
      end
    end

    # Ident tokens for the most part are exactly the same, except sometimes we
    # know an ident is a local when ripper doesn't (when they are introduced
    # through named captures in regular expressions). In that case we don't
    # compare the state.
    class IdentToken < Token
      def ==(other)
        (self[0...-1] == other[0...-1]) && (
          (other[3] == Ripper::EXPR_LABEL | Ripper::EXPR_END) ||
          (other[3] & Ripper::EXPR_ARG_ANY != 0)
        )
      end
    end

    # Ignored newlines can occasionally have a LABEL state attached to them, so
    # we compare the state differently here.
    class IgnoredNewlineToken < Token
      def ==(other)
        return false unless self[0...-1] == other[0...-1]

        if self[4] == Ripper::EXPR_ARG | Ripper::EXPR_LABELED
          other[4] & Ripper::EXPR_ARG | Ripper::EXPR_LABELED > 0
        else
          self[4] == other[4]
        end
      end
    end

    # If we have an identifier that follows a method name like:
    #
    #     def foo bar
    #
    # then Ripper will mark bar as END|LABEL if there is a local in a parent
    # scope named bar because it hasn't pushed the local table yet. We do this
    # more accurately, so we need to allow comparing against both END and
    # END|LABEL.
    class ParamToken < Token
      def ==(other)
        (self[0...-1] == other[0...-1]) && (
          (other[3] == Ripper::EXPR_END) ||
          (other[3] == Ripper::EXPR_END | Ripper::EXPR_LABEL)
        )
      end
    end

    # A heredoc in this case is a list of tokens that belong to the body of the
    # heredoc that should be appended onto the list of tokens when the heredoc
    # closes.
    module Heredoc
      # Heredocs that are no dash or tilde heredocs are just a list of tokens.
      # We need to keep them around so that we can insert them in the correct
      # order back into the token stream and set the state of the last token to
      # the state that the heredoc was opened in.
      class PlainHeredoc
        attr_reader :tokens

        def initialize
          @tokens = []
        end

        def <<(token)
          tokens << token
        end

        def to_a
          tokens
        end
      end

      # Dash heredocs are a little more complicated. They are a list of tokens
      # that need to be split on "\\\n" to mimic Ripper's behavior. We also need
      # to keep track of the state that the heredoc was opened in.
      class DashHeredoc
        attr_reader :split, :tokens

        def initialize(split)
          @split = split
          @tokens = []
        end

        def <<(token)
          tokens << token
        end

        def to_a
          embexpr_balance = 0

          tokens.each_with_object([]) do |token, results|
            case token.event
            when :on_embexpr_beg
              embexpr_balance += 1
              results << token
            when :on_embexpr_end
              embexpr_balance -= 1
              results << token
            when :on_tstring_content
              if embexpr_balance == 0
                lineno = token[0][0]
                column = token[0][1]

                if split
                  # Split on "\\\n" to mimic Ripper's behavior. Use a lookbehind
                  # to keep the delimiter in the result.
                  token.value.split(/(?<=[^\\]\\\n)|(?<=[^\\]\\\r\n)/).each_with_index do |value, index|
                    column = 0 if index > 0
                    results << Token.new([[lineno, column], :on_tstring_content, value, token.state])
                    lineno += value.count("\n")
                  end
                else
                  results << token
                end
              else
                results << token
              end
            else
              results << token
            end
          end
        end
      end

      # Heredocs that are dedenting heredocs are a little more complicated.
      # Ripper outputs on_ignored_sp tokens for the whitespace that is being
      # removed from the output. prism only modifies the node itself and keeps
      # the token the same. This simplifies prism, but makes comparing against
      # Ripper much harder because there is a length mismatch.
      #
      # Fortunately, we already have to pull out the heredoc tokens in order to
      # insert them into the stream in the correct order. As such, we can do
      # some extra manipulation on the tokens to make them match Ripper's
      # output by mirroring the dedent logic that Ripper uses.
      class DedentingHeredoc
        TAB_WIDTH = 8

        attr_reader :tokens, :dedent_next, :dedent, :embexpr_balance

        def initialize
          @tokens = []
          @dedent_next = true
          @dedent = nil
          @embexpr_balance = 0
        end

        # As tokens are coming in, we track the minimum amount of common leading
        # whitespace on plain string content tokens. This allows us to later
        # remove that amount of whitespace from the beginning of each line.
        def <<(token)
          case token.event
          when :on_embexpr_beg, :on_heredoc_beg
            @embexpr_balance += 1
          when :on_embexpr_end, :on_heredoc_end
            @embexpr_balance -= 1
          when :on_tstring_content
            if embexpr_balance == 0
              token.value.split(/(?<=\n)/).each_with_index do |line, index|
                next if line.strip.empty? && line.end_with?("\n")
                next if !(dedent_next || index > 0)

                leading = line[/\A(\s*)\n?/, 1]
                next_dedent = 0

                leading.each_char do |char|
                  if char == "\t"
                    next_dedent = next_dedent - (next_dedent % TAB_WIDTH) + TAB_WIDTH
                  else
                    next_dedent += 1
                  end
                end

                @dedent = [dedent, next_dedent].compact.min
              end
            end
          end

          @dedent_next = token.event == :on_tstring_content && embexpr_balance == 0
          tokens << token
        end

        def to_a
          # If every line in the heredoc is blank, we still need to split up the
          # string content token into multiple tokens.
          if dedent.nil?
            results = []
            embexpr_balance = 0

            tokens.each do |token|
              case token.event
              when :on_embexpr_beg, :on_heredoc_beg
                embexpr_balance += 1
                results << token
              when :on_embexpr_end, :on_heredoc_end
                embexpr_balance -= 1
                results << token
              when :on_tstring_content
                if embexpr_balance == 0
                  lineno = token[0][0]
                  column = token[0][1]

                  token.value.split(/(?<=\n)/).each_with_index do |value, index|
                    column = 0 if index > 0
                    results << Token.new([[lineno, column], :on_tstring_content, value, token.state])
                    lineno += 1
                  end
                else
                  results << token
                end
              else
                results << token
              end
            end

            return results
          end

          # Otherwise, we're going to run through each token in the list and
          # insert on_ignored_sp tokens for the amount of dedent that we need to
          # perform. We also need to remove the dedent from the beginning of
          # each line of plain string content tokens.
          results = []
          dedent_next = true
          embexpr_balance = 0

          tokens.each do |token|
            # Notice that the structure of this conditional largely matches the
            # whitespace calculation we performed above. This is because
            # checking if the subsequent token needs to be dedented is common to
            # both the dedent calculation and the ignored_sp insertion.
            case token.event
            when :on_embexpr_beg
              embexpr_balance += 1
              results << token
            when :on_embexpr_end
              embexpr_balance -= 1
              results << token
            when :on_tstring_content
              if embexpr_balance == 0
                # Here we're going to split the string on newlines, but maintain
                # the newlines in the resulting array. We'll do that with a look
                # behind assertion.
                splits = token.value.split(/(?<=\n)/)
                index = 0

                while index < splits.length
                  line = splits[index]
                  lineno = token[0][0] + index
                  column = token[0][1]

                  # Blank lines do not count toward common leading whitespace
                  # calculation and do not need to be dedented.
                  if dedent_next || index > 0
                    column = 0
                  end

                  # If the dedent is 0 and we're not supposed to dedent the next
                  # line or this line doesn't start with whitespace, then we
                  # should concatenate the rest of the string to match ripper.
                  if dedent == 0 && (!dedent_next || !line.start_with?(/\s/))
                    line = splits[index..].join
                    index = splits.length
                  end

                  # If we are supposed to dedent this line or if this is not the
                  # first line of the string and this line isn't entirely blank,
                  # then we need to insert an on_ignored_sp token and remove the
                  # dedent from the beginning of the line.
                  if (dedent > 0) && (dedent_next || index > 0)
                    deleting = 0
                    deleted_chars = []

                    # Gather up all of the characters that we're going to
                    # delete, stopping when you hit a character that would put
                    # you over the dedent amount.
                    line.each_char.with_index do |char, i|
                      case char
                      when "\r"
                        if line.chars[i + 1] == "\n"
                          break
                        end
                      when "\n"
                        break
                      when "\t"
                        deleting = deleting - (deleting % TAB_WIDTH) + TAB_WIDTH
                      else
                        deleting += 1
                      end

                      break if deleting > dedent
                      deleted_chars << char
                    end

                    # If we have something to delete, then delete it from the
                    # string and insert an on_ignored_sp token.
                    if deleted_chars.any?
                      ignored = deleted_chars.join
                      line.delete_prefix!(ignored)

                      results << Token.new([[lineno, 0], :on_ignored_sp, ignored, token[3]])
                      column = ignored.length
                    end
                  end

                  results << Token.new([[lineno, column], token[1], line, token[3]]) unless line.empty?
                  index += 1
                end
              else
                results << token
              end
            else
              results << token
            end

            dedent_next =
              ((token.event == :on_tstring_content) || (token.event == :on_heredoc_end)) &&
              embexpr_balance == 0
          end

          results
        end
      end

      # Here we will split between the two types of heredocs and return the
      # object that will store their tokens.
      def self.build(opening)
        case opening.value[2]
        when "~"
          DedentingHeredoc.new
        when "-"
          DashHeredoc.new(opening.value[3] != "'")
        else
          PlainHeredoc.new
        end
      end
    end

    attr_reader :source, :filepath

    def initialize(source, filepath = "")
      @source = source
      @filepath = filepath || ""
    end

    def result
      tokens = []

      state = :default
      heredoc_stack = [[]]

      result = Prism.lex(source, @filepath)
      result_value = result.value
      previous_state = nil

      # In previous versions of Ruby, Ripper wouldn't flush the bom before the
      # first token, so we had to have a hack in place to account for that. This
      # checks for that behavior.
      bom_flushed = Ripper.lex("\xEF\xBB\xBF# test")[0][0][1] == 0
      bom = source.byteslice(0..2) == "\xEF\xBB\xBF"

      result_value.each_with_index do |(token, lex_state), index|
        lineno = token.location.start_line
        column = token.location.start_column

        # If there's a UTF-8 byte-order mark as the start of the file, then for
        # certain tokens ripper sets the first token back by 3 bytes. It also
        # keeps the byte order mark in the first token's value. This is weird,
        # and I don't want to mirror that in our parser. So instead, we'll match
        # up the columns and values here.
        if bom && lineno == 1
          column -= 3

          if index == 0 && column == 0 && !bom_flushed
            flushed =
              case token.type
              when :BACK_REFERENCE, :INSTANCE_VARIABLE, :CLASS_VARIABLE,
                   :GLOBAL_VARIABLE, :NUMBERED_REFERENCE, :PERCENT_LOWER_I,
                   :PERCENT_LOWER_X, :PERCENT_LOWER_W, :PERCENT_UPPER_I,
                   :PERCENT_UPPER_W, :STRING_BEGIN
                true
              when :REGEXP_BEGIN, :SYMBOL_BEGIN
                token.value.start_with?("%")
              else
                false
              end

            unless flushed
              column -= 3
              value = token.value
              value.prepend(String.new("\xEF\xBB\xBF", encoding: value.encoding))
            end
          end
        end

        event = RIPPER.fetch(token.type)
        value = token.value
        lex_state = Ripper::Lexer::State.new(lex_state)

        token =
          case event
          when :on___end__
            EndContentToken.new([[lineno, column], event, value, lex_state])
          when :on_comment
            IgnoreStateToken.new([[lineno, column], event, value, lex_state])
          when :on_heredoc_end
            # Heredoc end tokens can be emitted in an odd order, so we don't
            # want to bother comparing the state on them.
            IgnoreStateToken.new([[lineno, column], event, value, lex_state])
          when :on_ident
            if lex_state == Ripper::EXPR_END
              # If we have an identifier that follows a method name like:
              #
              #     def foo bar
              #
              # then Ripper will mark bar as END|LABEL if there is a local in a
              # parent scope named bar because it hasn't pushed the local table
              # yet. We do this more accurately, so we need to allow comparing
              # against both END and END|LABEL.
              ParamToken.new([[lineno, column], event, value, lex_state])
            elsif lex_state == Ripper::EXPR_END | Ripper::EXPR_LABEL
              # In the event that we're comparing identifiers, we're going to
              # allow a little divergence. Ripper doesn't account for local
              # variables introduced through named captures in regexes, and we
              # do, which accounts for this difference.
              IdentToken.new([[lineno, column], event, value, lex_state])
            else
              Token.new([[lineno, column], event, value, lex_state])
            end
          when :on_embexpr_end
            IgnoreStateToken.new([[lineno, column], event, value, lex_state])
          when :on_ignored_nl
            # Ignored newlines can occasionally have a LABEL state attached to
            # them which doesn't actually impact anything. We don't mirror that
            # state so we ignored it.
            IgnoredNewlineToken.new([[lineno, column], event, value, lex_state])
          when :on_regexp_end
            # On regex end, Ripper scans and then sets end state, so the ripper
            # lexed output is begin, when it should be end. prism sets lex state
            # correctly to end state, but we want to be able to compare against
            # Ripper's lexed state. So here, if it's a regexp end token, we
            # output the state as the previous state, solely for the sake of
            # comparison.
            previous_token = result_value[index - 1][0]
            lex_state =
              if RIPPER.fetch(previous_token.type) == :on_embexpr_end
                # If the previous token is embexpr_end, then we have to do even
                # more processing. The end of an embedded expression sets the
                # state to the state that it had at the beginning of the
                # embedded expression. So we have to go and find that state and
                # set it here.
                counter = 1
                current_index = index - 1

                until counter == 0
                  current_index -= 1
                  current_event = RIPPER.fetch(result_value[current_index][0].type)
                  counter += { on_embexpr_beg: -1, on_embexpr_end: 1 }[current_event] || 0
                end

                Ripper::Lexer::State.new(result_value[current_index][1])
              else
                previous_state
              end

            Token.new([[lineno, column], event, value, lex_state])
          when :on_eof
            previous_token = result_value[index - 1][0]

            # If we're at the end of the file and the previous token was a
            # comment and there is still whitespace after the comment, then
            # Ripper will append a on_nl token (even though there isn't
            # necessarily a newline). We mirror that here.
            start_offset = previous_token.location.end_offset
            end_offset = token.location.start_offset

            if previous_token.type == :COMMENT && start_offset < end_offset
              if bom
                start_offset += 3
                end_offset += 3
              end

              tokens << Token.new([[lineno, 0], :on_nl, source.byteslice(start_offset...end_offset), lex_state])
            end

            Token.new([[lineno, column], event, value, lex_state])
          else
            Token.new([[lineno, column], event, value, lex_state])
          end

        previous_state = lex_state

        # The order in which tokens appear in our lexer is different from the
        # order that they appear in Ripper. When we hit the declaration of a
        # heredoc in prism, we skip forward and lex the rest of the content of
        # the heredoc before going back and lexing at the end of the heredoc
        # identifier.
        #
        # To match up to ripper, we keep a small state variable around here to
        # track whether we're in the middle of a heredoc or not. In this way we
        # can shuffle around the token to match Ripper's output.
        case state
        when :default
          # The default state is when there are no heredocs at all. In this
          # state we can append the token to the list of tokens and move on.
          tokens << token

          # If we get the declaration of a heredoc, then we open a new heredoc
          # and move into the heredoc_opened state.
          if event == :on_heredoc_beg
            state = :heredoc_opened
            heredoc_stack.last << Heredoc.build(token)
          end
        when :heredoc_opened
          # The heredoc_opened state is when we've seen the declaration of a
          # heredoc and are now lexing the body of the heredoc. In this state we
          # push tokens onto the most recently created heredoc.
          heredoc_stack.last.last << token

          case event
          when :on_heredoc_beg
            # If we receive a heredoc declaration while lexing the body of a
            # heredoc, this means we have nested heredocs. In this case we'll
            # push a new heredoc onto the stack and stay in the heredoc_opened
            # state since we're now lexing the body of the new heredoc.
            heredoc_stack << [Heredoc.build(token)]
          when :on_heredoc_end
            # If we receive the end of a heredoc, then we're done lexing the
            # body of the heredoc. In this case we now have a completed heredoc
            # but need to wait for the next newline to push it into the token
            # stream.
            state = :heredoc_closed
          end
        when :heredoc_closed
          if %i[on_nl on_ignored_nl on_comment].include?(event) || (event == :on_tstring_content && value.end_with?("\n"))
            if heredoc_stack.size > 1
              flushing = heredoc_stack.pop
              heredoc_stack.last.last << token

              flushing.each do |heredoc|
                heredoc.to_a.each do |flushed_token|
                  heredoc_stack.last.last << flushed_token
                end
              end

              state = :heredoc_opened
              next
            end
          elsif event == :on_heredoc_beg
            tokens << token
            state = :heredoc_opened
            heredoc_stack.last << Heredoc.build(token)
            next
          elsif heredoc_stack.size > 1
            heredoc_stack[-2].last << token
            next
          end

          heredoc_stack.last.each do |heredoc|
            tokens.concat(heredoc.to_a)
          end

          heredoc_stack.last.clear
          state = :default

          tokens << token
        end
      end

      # Drop the EOF token from the list
      tokens = tokens[0...-1]

      # We sort by location to compare against Ripper's output
      tokens.sort_by!(&:location)

      if result_value.size - 1 > tokens.size
        raise StandardError, "Lost tokens when performing lex_compat"
      end

      ParseResult.new(tokens, result.comments, result.errors, result.warnings, [])
    end
  end

  # This is a class that wraps the Ripper lexer to produce almost exactly the
  # same tokens.
  class LexRipper
    attr_reader :source

    def initialize(source)
      @source = source
    end

    def result
      previous = []
      results = []

      Ripper.lex(source, raise_errors: true).each do |token|
        case token[1]
        when :on_sp
          # skip
        when :on_tstring_content
          if previous[1] == :on_tstring_content && (token[2].start_with?("\#$") || token[2].start_with?("\#@"))
            previous[2] << token[2]
          else
            results << token
            previous = token
          end
        when :on_words_sep
          if previous[1] == :on_words_sep
            previous[2] << token[2]
          else
            results << token
            previous = token
          end
        else
          results << token
          previous = token
        end
      end

      results
    end
  end
end
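For orientation, a minimal sketch of how the class above can be exercised follows. The sample source string is illustrative, and it assumes that requiring "prism" makes Prism::LexCompat available; everything else (LexCompat.new(source).result, the Token accessors, and the ParseResult#value token list) comes straight from the file shown in this diff. LexCompat lexes with prism and reshapes each token into Ripper's [[line, column], event, value, state] array, so its output can be lined up against Ripper.lex.

require "prism"
require "ripper"

source = "foo = [1, 2]\nfoo.map { |i| i * 2 }\n"

# Lex with prism and convert the tokens into Ripper-shaped arrays.
compat_tokens = Prism::LexCompat.new(source).result.value

# Lex the same source with Ripper, skipping the whitespace tokens that
# LexCompat never emits (mirroring what LexRipper does above).
ripper_tokens = Ripper.lex(source).reject { |token| token[1] == :on_sp }

compat_tokens.zip(ripper_tokens).each do |prism_token, ripper_token|
  # Token delegates to the underlying array, so #event, #value, and == all work.
  puts format("%-12s %-24s match=%s", prism_token.event, prism_token.value.inspect, prism_token == ripper_token)
end

The specialized Token subclasses defined above (EndContentToken, IgnoreStateToken, IdentToken, IgnoredNewlineToken, ParamToken) are what keep that equality check permissive in the few places where the two lexers legitimately disagree.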