jruby-prism-parser 0.24.0-java → 1.4.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/BSDmakefile +58 -0
- data/CHANGELOG.md +269 -1
- data/CONTRIBUTING.md +0 -4
- data/Makefile +25 -18
- data/README.md +57 -6
- data/config.yml +1724 -140
- data/docs/build_system.md +39 -11
- data/docs/configuration.md +4 -0
- data/docs/cruby_compilation.md +1 -1
- data/docs/fuzzing.md +1 -1
- data/docs/parser_translation.md +14 -9
- data/docs/parsing_rules.md +4 -1
- data/docs/releasing.md +8 -10
- data/docs/relocation.md +34 -0
- data/docs/ripper_translation.md +72 -0
- data/docs/ruby_api.md +2 -1
- data/docs/serialization.md +29 -5
- data/ext/prism/api_node.c +3395 -1999
- data/ext/prism/api_pack.c +9 -0
- data/ext/prism/extconf.rb +55 -34
- data/ext/prism/extension.c +597 -346
- data/ext/prism/extension.h +6 -5
- data/include/prism/ast.h +2612 -455
- data/include/prism/defines.h +160 -2
- data/include/prism/diagnostic.h +188 -76
- data/include/prism/encoding.h +22 -4
- data/include/prism/node.h +89 -17
- data/include/prism/options.h +224 -12
- data/include/prism/pack.h +11 -0
- data/include/prism/parser.h +267 -66
- data/include/prism/prettyprint.h +8 -0
- data/include/prism/regexp.h +18 -8
- data/include/prism/static_literals.h +121 -0
- data/include/prism/util/pm_buffer.h +75 -2
- data/include/prism/util/pm_char.h +1 -2
- data/include/prism/util/pm_constant_pool.h +18 -9
- data/include/prism/util/pm_integer.h +126 -0
- data/include/prism/util/pm_list.h +1 -1
- data/include/prism/util/pm_newline_list.h +19 -0
- data/include/prism/util/pm_string.h +48 -8
- data/include/prism/version.h +3 -3
- data/include/prism.h +99 -5
- data/jruby-prism.jar +0 -0
- data/lib/prism/compiler.rb +11 -1
- data/lib/prism/desugar_compiler.rb +113 -74
- data/lib/prism/dispatcher.rb +45 -1
- data/lib/prism/dot_visitor.rb +201 -77
- data/lib/prism/dsl.rb +673 -461
- data/lib/prism/ffi.rb +233 -45
- data/lib/prism/inspect_visitor.rb +2389 -0
- data/lib/prism/lex_compat.rb +35 -16
- data/lib/prism/mutation_compiler.rb +24 -8
- data/lib/prism/node.rb +7731 -8460
- data/lib/prism/node_ext.rb +328 -32
- data/lib/prism/pack.rb +4 -0
- data/lib/prism/parse_result/comments.rb +34 -24
- data/lib/prism/parse_result/errors.rb +65 -0
- data/lib/prism/parse_result/newlines.rb +102 -12
- data/lib/prism/parse_result.rb +448 -44
- data/lib/prism/pattern.rb +28 -10
- data/lib/prism/polyfill/append_as_bytes.rb +15 -0
- data/lib/prism/polyfill/byteindex.rb +13 -0
- data/lib/prism/polyfill/unpack1.rb +14 -0
- data/lib/prism/reflection.rb +413 -0
- data/lib/prism/relocation.rb +504 -0
- data/lib/prism/serialize.rb +1940 -1198
- data/lib/prism/string_query.rb +30 -0
- data/lib/prism/translation/parser/builder.rb +61 -0
- data/lib/prism/translation/parser/compiler.rb +569 -195
- data/lib/prism/translation/parser/lexer.rb +516 -39
- data/lib/prism/translation/parser.rb +177 -12
- data/lib/prism/translation/parser33.rb +1 -1
- data/lib/prism/translation/parser34.rb +1 -1
- data/lib/prism/translation/parser35.rb +12 -0
- data/lib/prism/translation/ripper/sexp.rb +125 -0
- data/lib/prism/translation/ripper/shim.rb +5 -0
- data/lib/prism/translation/ripper.rb +3224 -462
- data/lib/prism/translation/ruby_parser.rb +194 -69
- data/lib/prism/translation.rb +4 -1
- data/lib/prism/version.rb +1 -1
- data/lib/prism/visitor.rb +13 -0
- data/lib/prism.rb +17 -27
- data/prism.gemspec +57 -17
- data/rbi/prism/compiler.rbi +12 -0
- data/rbi/prism/dsl.rbi +524 -0
- data/rbi/prism/inspect_visitor.rbi +12 -0
- data/rbi/prism/node.rbi +8722 -0
- data/rbi/prism/node_ext.rbi +107 -0
- data/rbi/prism/parse_result.rbi +404 -0
- data/rbi/prism/reflection.rbi +58 -0
- data/rbi/prism/string_query.rbi +12 -0
- data/rbi/prism/translation/parser.rbi +11 -0
- data/rbi/prism/translation/parser33.rbi +6 -0
- data/rbi/prism/translation/parser34.rbi +6 -0
- data/rbi/prism/translation/parser35.rbi +6 -0
- data/rbi/prism/translation/ripper.rbi +15 -0
- data/rbi/prism/visitor.rbi +473 -0
- data/rbi/prism.rbi +44 -7745
- data/sig/prism/compiler.rbs +9 -0
- data/sig/prism/dispatcher.rbs +16 -0
- data/sig/prism/dot_visitor.rbs +6 -0
- data/sig/prism/dsl.rbs +351 -0
- data/sig/prism/inspect_visitor.rbs +22 -0
- data/sig/prism/lex_compat.rbs +10 -0
- data/sig/prism/mutation_compiler.rbs +159 -0
- data/sig/prism/node.rbs +3614 -0
- data/sig/prism/node_ext.rbs +82 -0
- data/sig/prism/pack.rbs +43 -0
- data/sig/prism/parse_result.rbs +192 -0
- data/sig/prism/pattern.rbs +13 -0
- data/sig/prism/reflection.rbs +50 -0
- data/sig/prism/relocation.rbs +185 -0
- data/sig/prism/serialize.rbs +8 -0
- data/sig/prism/string_query.rbs +11 -0
- data/sig/prism/visitor.rbs +169 -0
- data/sig/prism.rbs +248 -4767
- data/src/diagnostic.c +672 -230
- data/src/encoding.c +211 -108
- data/src/node.c +7541 -1653
- data/src/options.c +135 -20
- data/src/pack.c +33 -17
- data/src/prettyprint.c +1543 -1485
- data/src/prism.c +7813 -3050
- data/src/regexp.c +225 -73
- data/src/serialize.c +101 -77
- data/src/static_literals.c +617 -0
- data/src/token_type.c +14 -13
- data/src/util/pm_buffer.c +187 -20
- data/src/util/pm_char.c +5 -5
- data/src/util/pm_constant_pool.c +39 -19
- data/src/util/pm_integer.c +670 -0
- data/src/util/pm_list.c +1 -1
- data/src/util/pm_newline_list.c +43 -5
- data/src/util/pm_string.c +213 -33
- data/src/util/pm_strncasecmp.c +13 -1
- data/src/util/pm_strpbrk.c +32 -6
- metadata +55 -19
- data/docs/ripper.md +0 -36
- data/include/prism/util/pm_state_stack.h +0 -42
- data/include/prism/util/pm_string_list.h +0 -44
- data/lib/prism/debug.rb +0 -206
- data/lib/prism/node_inspector.rb +0 -68
- data/lib/prism/translation/parser/rubocop.rb +0 -45
- data/rbi/prism_static.rbi +0 -207
- data/sig/prism_static.rbs +0 -201
- data/src/util/pm_state_stack.c +0 -25
- data/src/util/pm_string_list.c +0 -28
@@ -1,21 +1,25 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
+
require "strscan"
|
4
|
+
require_relative "../../polyfill/append_as_bytes"
|
5
|
+
|
3
6
|
module Prism
|
4
7
|
module Translation
|
5
8
|
class Parser
|
6
9
|
# Accepts a list of prism tokens and converts them into the expected
|
7
10
|
# format for the parser gem.
|
8
11
|
class Lexer
|
12
|
+
# These tokens are always skipped
|
13
|
+
TYPES_ALWAYS_SKIP = Set.new(%i[IGNORED_NEWLINE __END__ EOF])
|
14
|
+
private_constant :TYPES_ALWAYS_SKIP
|
15
|
+
|
9
16
|
# The direct translating of types between the two lexers.
|
10
17
|
TYPES = {
|
11
18
|
# These tokens should never appear in the output of the lexer.
|
12
|
-
EOF: nil,
|
13
19
|
MISSING: nil,
|
14
20
|
NOT_PROVIDED: nil,
|
15
|
-
IGNORED_NEWLINE: nil,
|
16
21
|
EMBDOC_END: nil,
|
17
22
|
EMBDOC_LINE: nil,
|
18
|
-
__END__: nil,
|
19
23
|
|
20
24
|
# These tokens have more or less direct mappings.
|
21
25
|
AMPERSAND: :tAMPER2,
|
@@ -134,7 +138,7 @@ module Prism
|
|
134
138
|
MINUS_GREATER: :tLAMBDA,
|
135
139
|
NEWLINE: :tNL,
|
136
140
|
NUMBERED_REFERENCE: :tNTH_REF,
|
137
|
-
PARENTHESIS_LEFT: :
|
141
|
+
PARENTHESIS_LEFT: :tLPAREN2,
|
138
142
|
PARENTHESIS_LEFT_PARENTHESES: :tLPAREN_ARG,
|
139
143
|
PARENTHESIS_RIGHT: :tRPAREN,
|
140
144
|
PERCENT: :tPERCENT,
|
@@ -167,22 +171,54 @@ module Prism
|
|
167
171
|
TILDE: :tTILDE,
|
168
172
|
UAMPERSAND: :tAMPER,
|
169
173
|
UCOLON_COLON: :tCOLON3,
|
170
|
-
UDOT_DOT: :
|
174
|
+
UDOT_DOT: :tBDOT2,
|
171
175
|
UDOT_DOT_DOT: :tBDOT3,
|
172
176
|
UMINUS: :tUMINUS,
|
173
177
|
UMINUS_NUM: :tUNARY_NUM,
|
174
178
|
UPLUS: :tUPLUS,
|
175
179
|
USTAR: :tSTAR,
|
176
|
-
USTAR_STAR: :
|
180
|
+
USTAR_STAR: :tDSTAR,
|
177
181
|
WORDS_SEP: :tSPACE
|
178
182
|
}
|
179
183
|
|
180
|
-
|
184
|
+
# These constants represent flags in our lex state. We really, really
|
185
|
+
# don't want to be using them and we really, really don't want to be
|
186
|
+
# exposing them as part of our public API. Unfortunately, we don't have
|
187
|
+
# another way of matching the exact tokens that the parser gem expects
|
188
|
+
# without them. We should find another way to do this, but in the
|
189
|
+
# meantime we'll hide them from the documentation and mark them as
|
190
|
+
# private constants.
|
191
|
+
EXPR_BEG = 0x1 # :nodoc:
|
192
|
+
EXPR_LABEL = 0x400 # :nodoc:
|
193
|
+
|
194
|
+
# It is used to determine whether `do` is of the token type `kDO` or `kDO_LAMBDA`.
|
195
|
+
#
|
196
|
+
# NOTE: In edge cases like `-> (foo = -> (bar) {}) do end`, please note that `kDO` is still returned
|
197
|
+
# instead of `kDO_LAMBDA`, which is expected: https://github.com/ruby/prism/pull/3046
|
198
|
+
LAMBDA_TOKEN_TYPES = Set.new([:kDO_LAMBDA, :tLAMBDA, :tLAMBEG])
|
199
|
+
|
200
|
+
# The `PARENTHESIS_LEFT` token in Prism is classified as either `tLPAREN` or `tLPAREN2` in the Parser gem.
|
201
|
+
# The following token types are listed as those classified as `tLPAREN`.
|
202
|
+
LPAREN_CONVERSION_TOKEN_TYPES = Set.new([
|
203
|
+
:kBREAK, :kCASE, :tDIVIDE, :kFOR, :kIF, :kNEXT, :kRETURN, :kUNTIL, :kWHILE, :tAMPER, :tANDOP, :tBANG, :tCOMMA, :tDOT2, :tDOT3,
|
204
|
+
:tEQL, :tLPAREN, :tLPAREN2, :tLPAREN_ARG, :tLSHFT, :tNL, :tOP_ASGN, :tOROP, :tPIPE, :tSEMI, :tSTRING_DBEG, :tUMINUS, :tUPLUS
|
205
|
+
])
|
206
|
+
|
207
|
+
# Types of tokens that are allowed to continue a method call with comments in-between.
|
208
|
+
# For these, the parser gem doesn't emit a newline token after the last comment.
|
209
|
+
COMMENT_CONTINUATION_TYPES = Set.new([:COMMENT, :AMPERSAND_DOT, :DOT])
|
210
|
+
private_constant :COMMENT_CONTINUATION_TYPES
|
211
|
+
|
212
|
+
# Heredocs are complex and require us to keep track of a bit of info to refer to later
|
213
|
+
HeredocData = Struct.new(:identifier, :common_whitespace, keyword_init: true)
|
214
|
+
|
215
|
+
private_constant :TYPES, :EXPR_BEG, :EXPR_LABEL, :LAMBDA_TOKEN_TYPES, :LPAREN_CONVERSION_TOKEN_TYPES, :HeredocData
|
181
216
|
|
182
217
|
# The Parser::Source::Buffer that the tokens were lexed from.
|
183
218
|
attr_reader :source_buffer
|
184
219
|
|
185
|
-
# An array of prism tokens
|
220
|
+
# An array of tuples that contain prism tokens and their associated lex
|
221
|
+
# state when they were lexed.
|
186
222
|
attr_reader :lexed
|
187
223
|
|
188
224
|
# A hash that maps offsets in bytes to offsets in characters.
|
@@ -202,102 +238,293 @@ module Prism
|
|
202
238
|
# Convert the prism tokens into the expected format for the parser gem.
|
203
239
|
def to_a
|
204
240
|
tokens = []
|
241
|
+
|
205
242
|
index = 0
|
243
|
+
length = lexed.length
|
206
244
|
|
207
|
-
|
208
|
-
|
245
|
+
heredoc_stack = []
|
246
|
+
quote_stack = []
|
247
|
+
|
248
|
+
# The parser gem emits the newline tokens for comments out of order. This saves
|
249
|
+
# that token location to emit at a later time to properly line everything up.
|
250
|
+
# https://github.com/whitequark/parser/issues/1025
|
251
|
+
comment_newline_location = nil
|
252
|
+
|
253
|
+
while index < length
|
254
|
+
token, state = lexed[index]
|
209
255
|
index += 1
|
210
|
-
next if
|
256
|
+
next if TYPES_ALWAYS_SKIP.include?(token.type)
|
211
257
|
|
212
258
|
type = TYPES.fetch(token.type)
|
213
259
|
value = token.value
|
214
|
-
location =
|
260
|
+
location = range(token.location.start_offset, token.location.end_offset)
|
215
261
|
|
216
262
|
case type
|
263
|
+
when :kDO
|
264
|
+
nearest_lambda_token = tokens.reverse_each.find do |token|
|
265
|
+
LAMBDA_TOKEN_TYPES.include?(token.first)
|
266
|
+
end
|
267
|
+
|
268
|
+
if nearest_lambda_token&.first == :tLAMBDA
|
269
|
+
type = :kDO_LAMBDA
|
270
|
+
end
|
217
271
|
when :tCHARACTER
|
218
272
|
value.delete_prefix!("?")
|
273
|
+
# Character literals behave similar to double-quoted strings. We can use the same escaping mechanism.
|
274
|
+
value = unescape_string(value, "?")
|
219
275
|
when :tCOMMENT
|
220
276
|
if token.type == :EMBDOC_BEGIN
|
221
|
-
|
277
|
+
|
278
|
+
while !((next_token = lexed[index][0]) && next_token.type == :EMBDOC_END) && (index < length - 1)
|
222
279
|
value += next_token.value
|
223
280
|
index += 1
|
224
281
|
end
|
225
282
|
|
226
283
|
value += next_token.value
|
227
|
-
location =
|
284
|
+
location = range(token.location.start_offset, lexed[index][0].location.end_offset)
|
228
285
|
index += 1
|
229
286
|
else
|
230
|
-
value.chomp
|
231
|
-
location =
|
287
|
+
is_at_eol = value.chomp!.nil?
|
288
|
+
location = range(token.location.start_offset, token.location.end_offset + (is_at_eol ? 0 : -1))
|
289
|
+
|
290
|
+
prev_token = lexed[index - 2][0] if index - 2 >= 0
|
291
|
+
next_token = lexed[index][0]
|
292
|
+
|
293
|
+
is_inline_comment = prev_token&.location&.start_line == token.location.start_line
|
294
|
+
if is_inline_comment && !is_at_eol && !COMMENT_CONTINUATION_TYPES.include?(next_token&.type)
|
295
|
+
tokens << [:tCOMMENT, [value, location]]
|
296
|
+
|
297
|
+
nl_location = range(token.location.end_offset - 1, token.location.end_offset)
|
298
|
+
tokens << [:tNL, [nil, nl_location]]
|
299
|
+
next
|
300
|
+
elsif is_inline_comment && next_token&.type == :COMMENT
|
301
|
+
comment_newline_location = range(token.location.end_offset - 1, token.location.end_offset)
|
302
|
+
elsif comment_newline_location && !COMMENT_CONTINUATION_TYPES.include?(next_token&.type)
|
303
|
+
tokens << [:tCOMMENT, [value, location]]
|
304
|
+
tokens << [:tNL, [nil, comment_newline_location]]
|
305
|
+
comment_newline_location = nil
|
306
|
+
next
|
307
|
+
end
|
232
308
|
end
|
233
309
|
when :tNL
|
310
|
+
next_token = next_token = lexed[index][0]
|
311
|
+
# Newlines after comments are emitted out of order.
|
312
|
+
if next_token&.type == :COMMENT
|
313
|
+
comment_newline_location = location
|
314
|
+
next
|
315
|
+
end
|
316
|
+
|
234
317
|
value = nil
|
235
318
|
when :tFLOAT
|
236
|
-
value =
|
319
|
+
value = parse_float(value)
|
237
320
|
when :tIMAGINARY
|
238
321
|
value = parse_complex(value)
|
239
322
|
when :tINTEGER
|
240
323
|
if value.start_with?("+")
|
241
|
-
tokens << [:tUNARY_NUM, ["+",
|
242
|
-
location =
|
324
|
+
tokens << [:tUNARY_NUM, ["+", range(token.location.start_offset, token.location.start_offset + 1)]]
|
325
|
+
location = range(token.location.start_offset + 1, token.location.end_offset)
|
243
326
|
end
|
244
327
|
|
245
|
-
value =
|
328
|
+
value = parse_integer(value)
|
246
329
|
when :tLABEL
|
247
330
|
value.chomp!(":")
|
248
331
|
when :tLABEL_END
|
249
332
|
value.chomp!(":")
|
333
|
+
when :tLCURLY
|
334
|
+
type = :tLBRACE if state == EXPR_BEG | EXPR_LABEL
|
335
|
+
when :tLPAREN2
|
336
|
+
type = :tLPAREN if tokens.empty? || LPAREN_CONVERSION_TOKEN_TYPES.include?(tokens.dig(-1, 0))
|
250
337
|
when :tNTH_REF
|
251
|
-
value =
|
338
|
+
value = parse_integer(value.delete_prefix("$"))
|
252
339
|
when :tOP_ASGN
|
253
340
|
value.chomp!("=")
|
254
341
|
when :tRATIONAL
|
255
342
|
value = parse_rational(value)
|
256
343
|
when :tSPACE
|
344
|
+
location = range(token.location.start_offset, token.location.start_offset + percent_array_leading_whitespace(value))
|
257
345
|
value = nil
|
258
346
|
when :tSTRING_BEG
|
259
|
-
|
347
|
+
next_token = lexed[index][0]
|
348
|
+
next_next_token = lexed[index + 1][0]
|
349
|
+
basic_quotes = value == '"' || value == "'"
|
350
|
+
|
351
|
+
if basic_quotes && next_token&.type == :STRING_END
|
260
352
|
next_location = token.location.join(next_token.location)
|
261
353
|
type = :tSTRING
|
262
354
|
value = ""
|
263
|
-
location =
|
355
|
+
location = range(next_location.start_offset, next_location.end_offset)
|
264
356
|
index += 1
|
265
|
-
elsif
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
357
|
+
elsif value.start_with?("'", '"', "%")
|
358
|
+
if next_token&.type == :STRING_CONTENT && next_next_token&.type == :STRING_END
|
359
|
+
string_value = next_token.value
|
360
|
+
if simplify_string?(string_value, value)
|
361
|
+
next_location = token.location.join(next_next_token.location)
|
362
|
+
if percent_array?(value)
|
363
|
+
value = percent_array_unescape(string_value)
|
364
|
+
else
|
365
|
+
value = unescape_string(string_value, value)
|
366
|
+
end
|
367
|
+
type = :tSTRING
|
368
|
+
location = range(next_location.start_offset, next_location.end_offset)
|
369
|
+
index += 2
|
370
|
+
tokens << [type, [value, location]]
|
371
|
+
|
372
|
+
next
|
373
|
+
end
|
374
|
+
end
|
375
|
+
|
376
|
+
quote_stack.push(value)
|
377
|
+
elsif token.type == :HEREDOC_START
|
272
378
|
quote = value[2] == "-" || value[2] == "~" ? value[3] : value[2]
|
273
|
-
|
379
|
+
heredoc_type = value[2] == "-" || value[2] == "~" ? value[2] : ""
|
380
|
+
heredoc = HeredocData.new(
|
381
|
+
identifier: value.match(/<<[-~]?["'`]?(?<heredoc_identifier>.*?)["'`]?\z/)[:heredoc_identifier],
|
382
|
+
common_whitespace: 0,
|
383
|
+
)
|
384
|
+
|
385
|
+
if quote == "`"
|
386
|
+
type = :tXSTRING_BEG
|
387
|
+
end
|
388
|
+
|
389
|
+
# The parser gem trims whitespace from squiggly heredocs. We must record
|
390
|
+
# the most common whitespace to later remove.
|
391
|
+
if heredoc_type == "~" || heredoc_type == "`"
|
392
|
+
heredoc.common_whitespace = calculate_heredoc_whitespace(index)
|
393
|
+
end
|
394
|
+
|
395
|
+
if quote == "'" || quote == '"' || quote == "`"
|
396
|
+
value = "<<#{quote}"
|
397
|
+
else
|
398
|
+
value = '<<"'
|
399
|
+
end
|
400
|
+
|
401
|
+
heredoc_stack.push(heredoc)
|
402
|
+
quote_stack.push(value)
|
403
|
+
end
|
404
|
+
when :tSTRING_CONTENT
|
405
|
+
is_percent_array = percent_array?(quote_stack.last)
|
406
|
+
|
407
|
+
if (lines = token.value.lines).one?
|
408
|
+
# Prism usually emits a single token for strings with line continuations.
|
409
|
+
# For squiggly heredocs they are not joined so we do that manually here.
|
410
|
+
current_string = +""
|
411
|
+
current_length = 0
|
412
|
+
start_offset = token.location.start_offset
|
413
|
+
while token.type == :STRING_CONTENT
|
414
|
+
current_length += token.value.bytesize
|
415
|
+
# Heredoc interpolation can have multiple STRING_CONTENT nodes on the same line.
|
416
|
+
is_first_token_on_line = lexed[index - 1] && token.location.start_line != lexed[index - 2][0].location&.start_line
|
417
|
+
# The parser gem only removes indentation when the heredoc is not nested
|
418
|
+
not_nested = heredoc_stack.size == 1
|
419
|
+
if is_percent_array
|
420
|
+
value = percent_array_unescape(token.value)
|
421
|
+
elsif is_first_token_on_line && not_nested && (current_heredoc = heredoc_stack.last).common_whitespace > 0
|
422
|
+
value = trim_heredoc_whitespace(token.value, current_heredoc)
|
423
|
+
end
|
424
|
+
|
425
|
+
current_string << unescape_string(value, quote_stack.last)
|
426
|
+
if (backslash_count = token.value[/(\\{1,})\n/, 1]&.length).nil? || backslash_count.even? || !interpolation?(quote_stack.last)
|
427
|
+
tokens << [:tSTRING_CONTENT, [current_string, range(start_offset, start_offset + current_length)]]
|
428
|
+
break
|
429
|
+
end
|
430
|
+
token = lexed[index][0]
|
431
|
+
index += 1
|
432
|
+
end
|
433
|
+
else
|
434
|
+
# When the parser gem encounters a line continuation inside of a multiline string,
|
435
|
+
# it emits a single string node. The backslash (and remaining newline) is removed.
|
436
|
+
current_line = +""
|
437
|
+
adjustment = 0
|
438
|
+
start_offset = token.location.start_offset
|
439
|
+
emit = false
|
440
|
+
|
441
|
+
lines.each.with_index do |line, index|
|
442
|
+
chomped_line = line.chomp
|
443
|
+
backslash_count = chomped_line[/\\{1,}\z/]&.length || 0
|
444
|
+
is_interpolation = interpolation?(quote_stack.last)
|
445
|
+
|
446
|
+
if backslash_count.odd? && (is_interpolation || is_percent_array)
|
447
|
+
if is_percent_array
|
448
|
+
current_line << percent_array_unescape(line)
|
449
|
+
adjustment += 1
|
450
|
+
else
|
451
|
+
chomped_line.delete_suffix!("\\")
|
452
|
+
current_line << chomped_line
|
453
|
+
adjustment += 2
|
454
|
+
end
|
455
|
+
# If the string ends with a line continuation emit the remainder
|
456
|
+
emit = index == lines.count - 1
|
457
|
+
else
|
458
|
+
current_line << line
|
459
|
+
emit = true
|
460
|
+
end
|
461
|
+
|
462
|
+
if emit
|
463
|
+
end_offset = start_offset + current_line.bytesize + adjustment
|
464
|
+
tokens << [:tSTRING_CONTENT, [unescape_string(current_line, quote_stack.last), range(start_offset, end_offset)]]
|
465
|
+
start_offset = end_offset
|
466
|
+
current_line = +""
|
467
|
+
adjustment = 0
|
468
|
+
end
|
469
|
+
end
|
274
470
|
end
|
471
|
+
next
|
275
472
|
when :tSTRING_DVAR
|
276
473
|
value = nil
|
277
474
|
when :tSTRING_END
|
278
|
-
if token.type == :
|
475
|
+
if token.type == :HEREDOC_END && value.end_with?("\n")
|
476
|
+
newline_length = value.end_with?("\r\n") ? 2 : 1
|
477
|
+
value = heredoc_stack.pop.identifier
|
478
|
+
location = range(token.location.start_offset, token.location.end_offset - newline_length)
|
479
|
+
elsif token.type == :REGEXP_END
|
279
480
|
value = value[0]
|
280
|
-
location =
|
481
|
+
location = range(token.location.start_offset, token.location.start_offset + 1)
|
482
|
+
end
|
483
|
+
|
484
|
+
if percent_array?(quote_stack.pop)
|
485
|
+
prev_token = lexed[index - 2][0] if index - 2 >= 0
|
486
|
+
empty = %i[PERCENT_LOWER_I PERCENT_LOWER_W PERCENT_UPPER_I PERCENT_UPPER_W].include?(prev_token&.type)
|
487
|
+
ends_with_whitespace = prev_token&.type == :WORDS_SEP
|
488
|
+
# parser always emits a space token after content in a percent array, even if no actual whitespace is present.
|
489
|
+
if !empty && !ends_with_whitespace
|
490
|
+
tokens << [:tSPACE, [nil, range(token.location.start_offset, token.location.start_offset)]]
|
491
|
+
end
|
281
492
|
end
|
282
493
|
when :tSYMBEG
|
283
|
-
if (next_token = lexed[index]) && next_token.type != :STRING_CONTENT && next_token.type != :EMBEXPR_BEGIN && next_token.type != :EMBVAR
|
494
|
+
if (next_token = lexed[index][0]) && next_token.type != :STRING_CONTENT && next_token.type != :EMBEXPR_BEGIN && next_token.type != :EMBVAR && next_token.type != :STRING_END
|
284
495
|
next_location = token.location.join(next_token.location)
|
285
496
|
type = :tSYMBOL
|
286
497
|
value = next_token.value
|
287
498
|
value = { "~@" => "~", "!@" => "!" }.fetch(value, value)
|
288
|
-
location =
|
499
|
+
location = range(next_location.start_offset, next_location.end_offset)
|
289
500
|
index += 1
|
501
|
+
else
|
502
|
+
quote_stack.push(value)
|
290
503
|
end
|
291
504
|
when :tFID
|
292
|
-
if tokens
|
505
|
+
if !tokens.empty? && tokens.dig(-1, 0) == :kDEF
|
293
506
|
type = :tIDENTIFIER
|
294
507
|
end
|
508
|
+
when :tXSTRING_BEG
|
509
|
+
if (next_token = lexed[index][0]) && !%i[STRING_CONTENT STRING_END EMBEXPR_BEGIN].include?(next_token.type)
|
510
|
+
# self.`()
|
511
|
+
type = :tBACK_REF2
|
512
|
+
end
|
513
|
+
quote_stack.push(value)
|
514
|
+
when :tSYMBOLS_BEG, :tQSYMBOLS_BEG, :tWORDS_BEG, :tQWORDS_BEG
|
515
|
+
if (next_token = lexed[index][0]) && next_token.type == :WORDS_SEP
|
516
|
+
index += 1
|
517
|
+
end
|
518
|
+
|
519
|
+
quote_stack.push(value)
|
520
|
+
when :tREGEXP_BEG
|
521
|
+
quote_stack.push(value)
|
295
522
|
end
|
296
523
|
|
297
524
|
tokens << [type, [value, location]]
|
298
525
|
|
299
526
|
if token.type == :REGEXP_END
|
300
|
-
tokens << [:tREGEXP_OPT, [token.value[1..],
|
527
|
+
tokens << [:tREGEXP_OPT, [token.value[1..], range(token.location.start_offset + 1, token.location.end_offset)]]
|
301
528
|
end
|
302
529
|
end
|
303
530
|
|
@@ -306,6 +533,25 @@ module Prism
|
|
306
533
|
|
307
534
|
private
|
308
535
|
|
536
|
+
# Creates a new parser range, taking prism's byte offsets into account
|
537
|
+
def range(start_offset, end_offset)
|
538
|
+
Range.new(source_buffer, offset_cache[start_offset], offset_cache[end_offset])
|
539
|
+
end
|
540
|
+
|
541
|
+
# Parse an integer from the string representation.
|
542
|
+
def parse_integer(value)
|
543
|
+
Integer(value)
|
544
|
+
rescue ArgumentError
|
545
|
+
0
|
546
|
+
end
|
547
|
+
|
548
|
+
# Parse a float from the string representation.
|
549
|
+
def parse_float(value)
|
550
|
+
Float(value)
|
551
|
+
rescue ArgumentError
|
552
|
+
0.0
|
553
|
+
end
|
554
|
+
|
309
555
|
# Parse a complex from the string representation.
|
310
556
|
def parse_complex(value)
|
311
557
|
value.chomp!("i")
|
@@ -313,10 +559,12 @@ module Prism
|
|
313
559
|
if value.end_with?("r")
|
314
560
|
Complex(0, parse_rational(value))
|
315
561
|
elsif value.start_with?(/0[BbOoDdXx]/)
|
316
|
-
Complex(0,
|
562
|
+
Complex(0, parse_integer(value))
|
317
563
|
else
|
318
564
|
Complex(0, value)
|
319
565
|
end
|
566
|
+
rescue ArgumentError
|
567
|
+
0i
|
320
568
|
end
|
321
569
|
|
322
570
|
# Parse a rational from the string representation.
|
@@ -324,10 +572,239 @@ module Prism
|
|
324
572
|
value.chomp!("r")
|
325
573
|
|
326
574
|
if value.start_with?(/0[BbOoDdXx]/)
|
327
|
-
Rational(
|
575
|
+
Rational(parse_integer(value))
|
328
576
|
else
|
329
577
|
Rational(value)
|
330
578
|
end
|
579
|
+
rescue ArgumentError
|
580
|
+
0r
|
581
|
+
end
|
582
|
+
|
583
|
+
# Wonky heredoc tab/spaces rules.
|
584
|
+
# https://github.com/ruby/prism/blob/v1.3.0/src/prism.c#L10548-L10558
|
585
|
+
def calculate_heredoc_whitespace(heredoc_token_index)
|
586
|
+
next_token_index = heredoc_token_index
|
587
|
+
nesting_level = 0
|
588
|
+
previous_line = -1
|
589
|
+
result = Float::MAX
|
590
|
+
|
591
|
+
while (lexed[next_token_index] && next_token = lexed[next_token_index][0])
|
592
|
+
next_token_index += 1
|
593
|
+
next_next_token = lexed[next_token_index] && lexed[next_token_index][0]
|
594
|
+
first_token_on_line = next_token.location.start_column == 0
|
595
|
+
|
596
|
+
# String content inside nested heredocs and interpolation is ignored
|
597
|
+
if next_token.type == :HEREDOC_START || next_token.type == :EMBEXPR_BEGIN
|
598
|
+
# When interpolation is the first token of a line there is no string
|
599
|
+
# content to check against. There will be no common whitespace.
|
600
|
+
if nesting_level == 0 && first_token_on_line
|
601
|
+
result = 0
|
602
|
+
end
|
603
|
+
nesting_level += 1
|
604
|
+
elsif next_token.type == :HEREDOC_END || next_token.type == :EMBEXPR_END
|
605
|
+
nesting_level -= 1
|
606
|
+
# When we encountered the matching heredoc end, we can exit
|
607
|
+
break if nesting_level == -1
|
608
|
+
elsif next_token.type == :STRING_CONTENT && nesting_level == 0 && first_token_on_line
|
609
|
+
common_whitespace = 0
|
610
|
+
next_token.value[/^\s*/].each_char do |char|
|
611
|
+
if char == "\t"
|
612
|
+
common_whitespace = (common_whitespace / 8 + 1) * 8;
|
613
|
+
else
|
614
|
+
common_whitespace += 1
|
615
|
+
end
|
616
|
+
end
|
617
|
+
|
618
|
+
is_first_token_on_line = next_token.location.start_line != previous_line
|
619
|
+
# Whitespace is significant if followed by interpolation
|
620
|
+
whitespace_only = common_whitespace == next_token.value.length && next_next_token&.location&.start_line != next_token.location.start_line
|
621
|
+
if is_first_token_on_line && !whitespace_only && common_whitespace < result
|
622
|
+
result = common_whitespace
|
623
|
+
previous_line = next_token.location.start_line
|
624
|
+
end
|
625
|
+
end
|
626
|
+
end
|
627
|
+
result
|
628
|
+
end
|
629
|
+
|
630
|
+
# Wonky heredoc tab/spaces rules.
|
631
|
+
# https://github.com/ruby/prism/blob/v1.3.0/src/prism.c#L16528-L16545
|
632
|
+
def trim_heredoc_whitespace(string, heredoc)
|
633
|
+
trimmed_whitespace = 0
|
634
|
+
trimmed_characters = 0
|
635
|
+
while (string[trimmed_characters] == "\t" || string[trimmed_characters] == " ") && trimmed_whitespace < heredoc.common_whitespace
|
636
|
+
if string[trimmed_characters] == "\t"
|
637
|
+
trimmed_whitespace = (trimmed_whitespace / 8 + 1) * 8;
|
638
|
+
break if trimmed_whitespace > heredoc.common_whitespace
|
639
|
+
else
|
640
|
+
trimmed_whitespace += 1
|
641
|
+
end
|
642
|
+
trimmed_characters += 1
|
643
|
+
end
|
644
|
+
|
645
|
+
string[trimmed_characters..]
|
646
|
+
end
|
647
|
+
|
648
|
+
# Escape sequences that have special meaning and should appear unescaped in the resulting string.
|
649
|
+
ESCAPES = {
|
650
|
+
"a" => "\a", "b" => "\b", "e" => "\e", "f" => "\f",
|
651
|
+
"n" => "\n", "r" => "\r", "s" => "\s", "t" => "\t",
|
652
|
+
"v" => "\v", "\\" => "\\"
|
653
|
+
}.freeze
|
654
|
+
private_constant :ESCAPES
|
655
|
+
|
656
|
+
# When one of these delimiters is encountered, then the other
|
657
|
+
# one is allowed to be escaped as well.
|
658
|
+
DELIMITER_SYMETRY = { "[" => "]", "(" => ")", "{" => "}", "<" => ">" }.freeze
|
659
|
+
private_constant :DELIMITER_SYMETRY
|
660
|
+
|
661
|
+
|
662
|
+
# https://github.com/whitequark/parser/blob/v3.3.6.0/lib/parser/lexer-strings.rl#L14
|
663
|
+
REGEXP_META_CHARACTERS = ["\\", "$", "(", ")", "*", "+", ".", "<", ">", "?", "[", "]", "^", "{", "|", "}"]
|
664
|
+
private_constant :REGEXP_META_CHARACTERS
|
665
|
+
|
666
|
+
# Apply Ruby string escaping rules
|
667
|
+
def unescape_string(string, quote)
|
668
|
+
# In single-quoted heredocs, everything is taken literally.
|
669
|
+
return string if quote == "<<'"
|
670
|
+
|
671
|
+
# OPTIMIZATION: Assume that few strings need escaping to speed up the common case.
|
672
|
+
return string unless string.include?("\\")
|
673
|
+
|
674
|
+
# Enclosing character for the string. `"` for `"foo"`, `{` for `%w{foo}`, etc.
|
675
|
+
delimiter = quote[-1]
|
676
|
+
|
677
|
+
if regexp?(quote)
|
678
|
+
# Escapes are handled similarly to single-quoted heredocs. The only character that is
|
679
|
+
# allowed to be escaped is the delimiter, except when that also has special meaning
|
680
|
+
# in the regexp. Since all the symmetric delimiters have special meaning, they don't need
|
681
|
+
# to be considered separately.
|
682
|
+
if REGEXP_META_CHARACTERS.include?(delimiter)
|
683
|
+
string
|
684
|
+
else
|
685
|
+
# There can never be an even amount of backslashes. It would be a syntax error.
|
686
|
+
string.gsub(/\\(#{Regexp.escape(delimiter)})/, '\1')
|
687
|
+
end
|
688
|
+
elsif interpolation?(quote)
|
689
|
+
# Appending individual escape sequences may force the string out of its intended
|
690
|
+
# encoding. Start out with binary and force it back later.
|
691
|
+
result = "".b
|
692
|
+
|
693
|
+
scanner = StringScanner.new(string)
|
694
|
+
while (skipped = scanner.skip_until(/\\/))
|
695
|
+
# Append what was just skipped over, excluding the found backslash.
|
696
|
+
result.append_as_bytes(string.byteslice(scanner.pos - skipped, skipped - 1))
|
697
|
+
escape_read(result, scanner, false, false)
|
698
|
+
end
|
699
|
+
|
700
|
+
# Add remaining chars
|
701
|
+
result.append_as_bytes(string.byteslice(scanner.pos..))
|
702
|
+
result.force_encoding(source_buffer.source.encoding)
|
703
|
+
else
|
704
|
+
delimiters = Regexp.escape("#{delimiter}#{DELIMITER_SYMETRY[delimiter]}")
|
705
|
+
string.gsub(/\\([\\#{delimiters}])/, '\1')
|
706
|
+
end
|
707
|
+
end
|
708
|
+
|
709
|
+
# Certain strings are merged into a single string token.
|
710
|
+
def simplify_string?(value, quote)
|
711
|
+
case quote
|
712
|
+
when "'"
|
713
|
+
# Only simplify 'foo'
|
714
|
+
!value.include?("\n")
|
715
|
+
when '"'
|
716
|
+
# Simplify when every line ends with a line continuation, or it is the last line
|
717
|
+
value.lines.all? do |line|
|
718
|
+
!line.end_with?("\n") || line[/(\\*)$/, 1]&.length&.odd?
|
719
|
+
end
|
720
|
+
else
|
721
|
+
# %q and similar are never simplified
|
722
|
+
false
|
723
|
+
end
|
724
|
+
end
|
725
|
+
|
726
|
+
# Escape a byte value, given the control and meta flags.
|
727
|
+
def escape_build(value, control, meta)
|
728
|
+
value &= 0x9f if control
|
729
|
+
value |= 0x80 if meta
|
730
|
+
value
|
731
|
+
end
|
732
|
+
|
733
|
+
# Read an escape out of the string scanner, given the control and meta
|
734
|
+
# flags, and push the unescaped value into the result.
|
735
|
+
def escape_read(result, scanner, control, meta)
|
736
|
+
if scanner.skip("\n")
|
737
|
+
# Line continuation
|
738
|
+
elsif (value = ESCAPES[scanner.peek(1)])
|
739
|
+
# Simple single-character escape sequences like \n
|
740
|
+
result.append_as_bytes(value)
|
741
|
+
scanner.pos += 1
|
742
|
+
elsif (value = scanner.scan(/[0-7]{1,3}/))
|
743
|
+
# \nnn
|
744
|
+
result.append_as_bytes(escape_build(value.to_i(8), control, meta))
|
745
|
+
elsif (value = scanner.scan(/x[0-9a-fA-F]{1,2}/))
|
746
|
+
# \xnn
|
747
|
+
result.append_as_bytes(escape_build(value[1..].to_i(16), control, meta))
|
748
|
+
elsif (value = scanner.scan(/u[0-9a-fA-F]{4}/))
|
749
|
+
# \unnnn
|
750
|
+
result.append_as_bytes(value[1..].hex.chr(Encoding::UTF_8))
|
751
|
+
elsif scanner.skip("u{}")
|
752
|
+
# https://github.com/whitequark/parser/issues/856
|
753
|
+
elsif (value = scanner.scan(/u{.*?}/))
|
754
|
+
# \u{nnnn ...}
|
755
|
+
value[2..-2].split.each do |unicode|
|
756
|
+
result.append_as_bytes(unicode.hex.chr(Encoding::UTF_8))
|
757
|
+
end
|
758
|
+
elsif (value = scanner.scan(/c\\?(?=[[:print:]])|C-\\?(?=[[:print:]])/))
|
759
|
+
# \cx or \C-x where x is an ASCII printable character
|
760
|
+
escape_read(result, scanner, true, meta)
|
761
|
+
elsif (value = scanner.scan(/M-\\?(?=[[:print:]])/))
|
762
|
+
# \M-x where x is an ASCII printable character
|
763
|
+
escape_read(result, scanner, control, true)
|
764
|
+
elsif (byte = scanner.get_byte)
|
765
|
+
# Something else after an escape.
|
766
|
+
if control && byte == "?"
|
767
|
+
result.append_as_bytes(escape_build(0x7f, false, meta))
|
768
|
+
else
|
769
|
+
result.append_as_bytes(escape_build(byte.ord, control, meta))
|
770
|
+
end
|
771
|
+
end
|
772
|
+
end
|
773
|
+
|
774
|
+
# In a percent array, certain whitespace can be preceded with a backslash,
|
775
|
+
# causing the following characters to be part of the previous element.
|
776
|
+
def percent_array_unescape(string)
|
777
|
+
string.gsub(/(\\)+[ \f\n\r\t\v]/) do |full_match|
|
778
|
+
full_match.delete_prefix!("\\") if Regexp.last_match[1].length.odd?
|
779
|
+
full_match
|
780
|
+
end
|
781
|
+
end
|
782
|
+
|
783
|
+
# For %-arrays whitespace, the parser gem only considers whitespace before the newline.
|
784
|
+
def percent_array_leading_whitespace(string)
|
785
|
+
return 1 if string.start_with?("\n")
|
786
|
+
|
787
|
+
leading_whitespace = 0
|
788
|
+
string.each_char do |c|
|
789
|
+
break if c == "\n"
|
790
|
+
leading_whitespace += 1
|
791
|
+
end
|
792
|
+
leading_whitespace
|
793
|
+
end
|
794
|
+
|
795
|
+
# Determine if characters preceded by a backslash should be escaped or not
|
796
|
+
def interpolation?(quote)
|
797
|
+
!quote.end_with?("'") && !quote.start_with?("%q", "%w", "%i", "%s")
|
798
|
+
end
|
799
|
+
|
800
|
+
# Regexp allow interpolation but are handled differently during unescaping
|
801
|
+
def regexp?(quote)
|
802
|
+
quote == "/" || quote.start_with?("%r")
|
803
|
+
end
|
804
|
+
|
805
|
+
# Determine if the string is part of a %-style array.
|
806
|
+
def percent_array?(quote)
|
807
|
+
quote.start_with?("%w", "%W", "%i", "%I")
|
331
808
|
end
|
332
809
|
end
|
333
810
|
end
|