prism 1.3.0 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +24 -1
- data/config.yml +9 -0
- data/docs/releasing.md +1 -1
- data/docs/ruby_api.md +1 -1
- data/ext/prism/api_node.c +1814 -1303
- data/ext/prism/extension.c +230 -109
- data/ext/prism/extension.h +4 -4
- data/include/prism/ast.h +16 -0
- data/include/prism/defines.h +4 -1
- data/include/prism/options.h +47 -1
- data/include/prism/util/pm_buffer.h +10 -0
- data/include/prism/version.h +2 -2
- data/include/prism.h +4 -4
- data/lib/prism/dot_visitor.rb +16 -0
- data/lib/prism/dsl.rb +10 -2
- data/lib/prism/ffi.rb +45 -27
- data/lib/prism/inspect_visitor.rb +2 -1
- data/lib/prism/node.rb +48 -10
- data/lib/prism/parse_result/newlines.rb +1 -1
- data/lib/prism/parse_result.rb +52 -0
- data/lib/prism/polyfill/append_as_bytes.rb +15 -0
- data/lib/prism/reflection.rb +2 -2
- data/lib/prism/serialize.rb +1252 -765
- data/lib/prism/translation/parser/builder.rb +61 -0
- data/lib/prism/translation/parser/compiler.rb +192 -136
- data/lib/prism/translation/parser/lexer.rb +435 -61
- data/lib/prism/translation/parser.rb +51 -3
- data/lib/prism/translation/parser35.rb +12 -0
- data/lib/prism/translation/ripper.rb +13 -3
- data/lib/prism/translation/ruby_parser.rb +5 -4
- data/lib/prism/translation.rb +1 -0
- data/lib/prism.rb +3 -3
- data/prism.gemspec +5 -1
- data/rbi/prism/dsl.rbi +6 -3
- data/rbi/prism/node.rbi +22 -7
- data/rbi/prism/parse_result.rbi +17 -0
- data/rbi/prism/translation/parser35.rbi +6 -0
- data/rbi/prism.rbi +39 -36
- data/sig/prism/dsl.rbs +4 -2
- data/sig/prism/node.rbs +17 -7
- data/sig/prism/parse_result.rbs +10 -0
- data/sig/prism/serialize.rbs +4 -2
- data/sig/prism.rbs +22 -1
- data/src/diagnostic.c +2 -2
- data/src/node.c +21 -0
- data/src/options.c +31 -0
- data/src/prettyprint.c +30 -0
- data/src/prism.c +374 -118
- data/src/serialize.c +6 -0
- data/src/util/pm_buffer.c +40 -0
- data/src/util/pm_constant_pool.c +6 -2
- data/src/util/pm_strncasecmp.c +13 -1
- metadata +7 -7
@@ -1,21 +1,25 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
+
require "strscan"
|
4
|
+
require_relative "../../polyfill/append_as_bytes"
|
5
|
+
|
3
6
|
module Prism
|
4
7
|
module Translation
|
5
8
|
class Parser
|
6
9
|
# Accepts a list of prism tokens and converts them into the expected
|
7
10
|
# format for the parser gem.
|
8
11
|
class Lexer
|
12
|
+
# These tokens are always skipped
|
13
|
+
TYPES_ALWAYS_SKIP = Set.new(%i[IGNORED_NEWLINE __END__ EOF])
|
14
|
+
private_constant :TYPES_ALWAYS_SKIP
|
15
|
+
|
9
16
|
# The direct translating of types between the two lexers.
|
10
17
|
TYPES = {
|
11
18
|
# These tokens should never appear in the output of the lexer.
|
12
|
-
EOF: nil,
|
13
19
|
MISSING: nil,
|
14
20
|
NOT_PROVIDED: nil,
|
15
|
-
IGNORED_NEWLINE: nil,
|
16
21
|
EMBDOC_END: nil,
|
17
22
|
EMBDOC_LINE: nil,
|
18
|
-
__END__: nil,
|
19
23
|
|
20
24
|
# These tokens have more or less direct mappings.
|
21
25
|
AMPERSAND: :tAMPER2,
|
@@ -191,16 +195,24 @@ module Prism
|
|
191
195
|
#
|
192
196
|
# NOTE: In edge cases like `-> (foo = -> (bar) {}) do end`, please note that `kDO` is still returned
|
193
197
|
# instead of `kDO_LAMBDA`, which is expected: https://github.com/ruby/prism/pull/3046
|
194
|
-
LAMBDA_TOKEN_TYPES = [:kDO_LAMBDA, :tLAMBDA, :tLAMBEG]
|
198
|
+
LAMBDA_TOKEN_TYPES = Set.new([:kDO_LAMBDA, :tLAMBDA, :tLAMBEG])
|
195
199
|
|
196
200
|
# The `PARENTHESIS_LEFT` token in Prism is classified as either `tLPAREN` or `tLPAREN2` in the Parser gem.
|
197
201
|
# The following token types are listed as those classified as `tLPAREN`.
|
198
|
-
LPAREN_CONVERSION_TOKEN_TYPES = [
|
202
|
+
LPAREN_CONVERSION_TOKEN_TYPES = Set.new([
|
199
203
|
:kBREAK, :kCASE, :tDIVIDE, :kFOR, :kIF, :kNEXT, :kRETURN, :kUNTIL, :kWHILE, :tAMPER, :tANDOP, :tBANG, :tCOMMA, :tDOT2, :tDOT3,
|
200
|
-
:tEQL, :tLPAREN, :tLPAREN2, :tLSHFT, :tNL, :tOP_ASGN, :tOROP, :tPIPE, :tSEMI, :tSTRING_DBEG, :tUMINUS, :tUPLUS
|
201
|
-
]
|
204
|
+
:tEQL, :tLPAREN, :tLPAREN2, :tLPAREN_ARG, :tLSHFT, :tNL, :tOP_ASGN, :tOROP, :tPIPE, :tSEMI, :tSTRING_DBEG, :tUMINUS, :tUPLUS
|
205
|
+
])
|
206
|
+
|
207
|
+
# Types of tokens that are allowed to continue a method call with comments in-between.
|
208
|
+
# For these, the parser gem doesn't emit a newline token after the last comment.
|
209
|
+
COMMENT_CONTINUATION_TYPES = Set.new([:COMMENT, :AMPERSAND_DOT, :DOT])
|
210
|
+
private_constant :COMMENT_CONTINUATION_TYPES
|
211
|
+
|
212
|
+
# Heredocs are complex and require us to keep track of a bit of info to refer to later
|
213
|
+
HeredocData = Struct.new(:identifier, :common_whitespace, keyword_init: true)
|
202
214
|
|
203
|
-
private_constant :TYPES, :EXPR_BEG, :EXPR_LABEL, :LAMBDA_TOKEN_TYPES, :LPAREN_CONVERSION_TOKEN_TYPES
|
215
|
+
private_constant :TYPES, :EXPR_BEG, :EXPR_LABEL, :LAMBDA_TOKEN_TYPES, :LPAREN_CONVERSION_TOKEN_TYPES, :HeredocData
|
204
216
|
|
205
217
|
# The Parser::Source::Buffer that the tokens were lexed from.
|
206
218
|
attr_reader :source_buffer
|
@@ -230,46 +242,78 @@ module Prism
|
|
230
242
|
index = 0
|
231
243
|
length = lexed.length
|
232
244
|
|
233
|
-
|
245
|
+
heredoc_stack = []
|
246
|
+
quote_stack = []
|
247
|
+
|
248
|
+
# The parser gem emits the newline tokens for comments out of order. This saves
|
249
|
+
# that token location to emit at a later time to properly line everything up.
|
250
|
+
# https://github.com/whitequark/parser/issues/1025
|
251
|
+
comment_newline_location = nil
|
234
252
|
|
235
253
|
while index < length
|
236
254
|
token, state = lexed[index]
|
237
255
|
index += 1
|
238
|
-
next if
|
256
|
+
next if TYPES_ALWAYS_SKIP.include?(token.type)
|
239
257
|
|
240
258
|
type = TYPES.fetch(token.type)
|
241
259
|
value = token.value
|
242
|
-
location =
|
260
|
+
location = range(token.location.start_offset, token.location.end_offset)
|
243
261
|
|
244
262
|
case type
|
245
263
|
when :kDO
|
246
|
-
|
247
|
-
|
264
|
+
nearest_lambda_token = tokens.reverse_each.find do |token|
|
265
|
+
LAMBDA_TOKEN_TYPES.include?(token.first)
|
266
|
+
end
|
248
267
|
|
249
|
-
if
|
268
|
+
if nearest_lambda_token&.first == :tLAMBDA
|
250
269
|
type = :kDO_LAMBDA
|
251
270
|
end
|
252
271
|
when :tCHARACTER
|
253
272
|
value.delete_prefix!("?")
|
273
|
+
# Character literals behave similarly to double-quoted strings. We can use the same escaping mechanism.
|
274
|
+
value = unescape_string(value, "?")
|
254
275
|
when :tCOMMENT
|
255
276
|
if token.type == :EMBDOC_BEGIN
|
256
|
-
start_index = index
|
257
277
|
|
258
278
|
while !((next_token = lexed[index][0]) && next_token.type == :EMBDOC_END) && (index < length - 1)
|
259
279
|
value += next_token.value
|
260
280
|
index += 1
|
261
281
|
end
|
262
282
|
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
index += 1
|
267
|
-
end
|
283
|
+
value += next_token.value
|
284
|
+
location = range(token.location.start_offset, lexed[index][0].location.end_offset)
|
285
|
+
index += 1
|
268
286
|
else
|
269
|
-
value.chomp
|
270
|
-
location =
|
287
|
+
is_at_eol = value.chomp!.nil?
|
288
|
+
location = range(token.location.start_offset, token.location.end_offset + (is_at_eol ? 0 : -1))
|
289
|
+
|
290
|
+
prev_token = lexed[index - 2][0] if index - 2 >= 0
|
291
|
+
next_token = lexed[index][0]
|
292
|
+
|
293
|
+
is_inline_comment = prev_token&.location&.start_line == token.location.start_line
|
294
|
+
if is_inline_comment && !is_at_eol && !COMMENT_CONTINUATION_TYPES.include?(next_token&.type)
|
295
|
+
tokens << [:tCOMMENT, [value, location]]
|
296
|
+
|
297
|
+
nl_location = range(token.location.end_offset - 1, token.location.end_offset)
|
298
|
+
tokens << [:tNL, [nil, nl_location]]
|
299
|
+
next
|
300
|
+
elsif is_inline_comment && next_token&.type == :COMMENT
|
301
|
+
comment_newline_location = range(token.location.end_offset - 1, token.location.end_offset)
|
302
|
+
elsif comment_newline_location && !COMMENT_CONTINUATION_TYPES.include?(next_token&.type)
|
303
|
+
tokens << [:tCOMMENT, [value, location]]
|
304
|
+
tokens << [:tNL, [nil, comment_newline_location]]
|
305
|
+
comment_newline_location = nil
|
306
|
+
next
|
307
|
+
end
|
271
308
|
end
|
272
309
|
when :tNL
|
310
|
+
next_token = next_token = lexed[index][0]
|
311
|
+
# Newlines after comments are emitted out of order.
|
312
|
+
if next_token&.type == :COMMENT
|
313
|
+
comment_newline_location = location
|
314
|
+
next
|
315
|
+
end
|
316
|
+
|
273
317
|
value = nil
|
274
318
|
when :tFLOAT
|
275
319
|
value = parse_float(value)
|
@@ -277,8 +321,8 @@ module Prism
|
|
277
321
|
value = parse_complex(value)
|
278
322
|
when :tINTEGER
|
279
323
|
if value.start_with?("+")
|
280
|
-
tokens << [:tUNARY_NUM, ["+",
|
281
|
-
location =
|
324
|
+
tokens << [:tUNARY_NUM, ["+", range(token.location.start_offset, token.location.start_offset + 1)]]
|
325
|
+
location = range(token.location.start_offset + 1, token.location.end_offset)
|
282
326
|
end
|
283
327
|
|
284
328
|
value = parse_integer(value)
|
@@ -297,68 +341,154 @@ module Prism
|
|
297
341
|
when :tRATIONAL
|
298
342
|
value = parse_rational(value)
|
299
343
|
when :tSPACE
|
344
|
+
location = range(token.location.start_offset, token.location.start_offset + percent_array_leading_whitespace(value))
|
300
345
|
value = nil
|
301
346
|
when :tSTRING_BEG
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
347
|
+
next_token = lexed[index][0]
|
348
|
+
next_next_token = lexed[index + 1][0]
|
349
|
+
basic_quotes = value == '"' || value == "'"
|
350
|
+
|
351
|
+
if basic_quotes && next_token&.type == :STRING_END
|
306
352
|
next_location = token.location.join(next_token.location)
|
307
353
|
type = :tSTRING
|
308
354
|
value = ""
|
309
|
-
location =
|
355
|
+
location = range(next_location.start_offset, next_location.end_offset)
|
310
356
|
index += 1
|
311
|
-
elsif
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
357
|
+
elsif value.start_with?("'", '"', "%")
|
358
|
+
if next_token&.type == :STRING_CONTENT && next_next_token&.type == :STRING_END
|
359
|
+
string_value = next_token.value
|
360
|
+
if simplify_string?(string_value, value)
|
361
|
+
next_location = token.location.join(next_next_token.location)
|
362
|
+
if percent_array?(value)
|
363
|
+
value = percent_array_unescape(string_value)
|
364
|
+
else
|
365
|
+
value = unescape_string(string_value, value)
|
366
|
+
end
|
367
|
+
type = :tSTRING
|
368
|
+
location = range(next_location.start_offset, next_location.end_offset)
|
369
|
+
index += 2
|
370
|
+
tokens << [type, [value, location]]
|
371
|
+
|
372
|
+
next
|
373
|
+
end
|
374
|
+
end
|
375
|
+
|
376
|
+
quote_stack.push(value)
|
377
|
+
elsif token.type == :HEREDOC_START
|
318
378
|
quote = value[2] == "-" || value[2] == "~" ? value[3] : value[2]
|
379
|
+
heredoc_type = value[2] == "-" || value[2] == "~" ? value[2] : ""
|
380
|
+
heredoc = HeredocData.new(
|
381
|
+
identifier: value.match(/<<[-~]?["'`]?(?<heredoc_identifier>.*?)["'`]?\z/)[:heredoc_identifier],
|
382
|
+
common_whitespace: 0,
|
383
|
+
)
|
384
|
+
|
319
385
|
if quote == "`"
|
320
386
|
type = :tXSTRING_BEG
|
321
|
-
|
387
|
+
end
|
388
|
+
|
389
|
+
# The parser gem trims whitespace from squiggly heredocs. We must record
|
390
|
+
# the most common whitespace to later remove.
|
391
|
+
if heredoc_type == "~" || heredoc_type == "`"
|
392
|
+
heredoc.common_whitespace = calculate_heredoc_whitespace(index)
|
393
|
+
end
|
394
|
+
|
395
|
+
if quote == "'" || quote == '"' || quote == "`"
|
396
|
+
value = "<<#{quote}"
|
322
397
|
else
|
323
|
-
value = "
|
398
|
+
value = '<<"'
|
324
399
|
end
|
400
|
+
|
401
|
+
heredoc_stack.push(heredoc)
|
402
|
+
quote_stack.push(value)
|
325
403
|
end
|
326
404
|
when :tSTRING_CONTENT
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
405
|
+
is_percent_array = percent_array?(quote_stack.last)
|
406
|
+
|
407
|
+
if (lines = token.value.lines).one?
|
408
|
+
# Prism usually emits a single token for strings with line continuations.
|
409
|
+
# For squiggly heredocs they are not joined so we do that manually here.
|
410
|
+
current_string = +""
|
411
|
+
current_length = 0
|
412
|
+
start_offset = token.location.start_offset
|
413
|
+
while token.type == :STRING_CONTENT
|
414
|
+
current_length += token.value.bytesize
|
415
|
+
# Heredoc interpolation can have multiple STRING_CONTENT nodes on the same line.
|
416
|
+
is_first_token_on_line = lexed[index - 1] && token.location.start_line != lexed[index - 2][0].location&.start_line
|
417
|
+
# The parser gem only removes indentation when the heredoc is not nested
|
418
|
+
not_nested = heredoc_stack.size == 1
|
419
|
+
if is_percent_array
|
420
|
+
value = percent_array_unescape(token.value)
|
421
|
+
elsif is_first_token_on_line && not_nested && (current_heredoc = heredoc_stack.last).common_whitespace > 0
|
422
|
+
value = trim_heredoc_whitespace(token.value, current_heredoc)
|
423
|
+
end
|
424
|
+
|
425
|
+
current_string << unescape_string(value, quote_stack.last)
|
426
|
+
if (backslash_count = token.value[/(\\{1,})\n/, 1]&.length).nil? || backslash_count.even? || !interpolation?(quote_stack.last)
|
427
|
+
tokens << [:tSTRING_CONTENT, [current_string, range(start_offset, start_offset + current_length)]]
|
428
|
+
break
|
429
|
+
end
|
430
|
+
token = lexed[index][0]
|
431
|
+
index += 1
|
432
|
+
end
|
433
|
+
else
|
434
|
+
# When the parser gem encounters a line continuation inside of a multiline string,
|
435
|
+
# it emits a single string node. The backslash (and remaining newline) is removed.
|
436
|
+
current_line = +""
|
437
|
+
adjustment = 0
|
438
|
+
start_offset = token.location.start_offset
|
439
|
+
emit = false
|
440
|
+
|
441
|
+
lines.each.with_index do |line, index|
|
331
442
|
chomped_line = line.chomp
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
443
|
+
backslash_count = chomped_line[/\\{1,}\z/]&.length || 0
|
444
|
+
is_interpolation = interpolation?(quote_stack.last)
|
445
|
+
|
446
|
+
if backslash_count.odd? && (is_interpolation || is_percent_array)
|
447
|
+
if is_percent_array
|
448
|
+
current_line << percent_array_unescape(line)
|
449
|
+
adjustment += 1
|
338
450
|
else
|
339
|
-
|
451
|
+
chomped_line.delete_suffix!("\\")
|
452
|
+
current_line << chomped_line
|
453
|
+
adjustment += 2
|
340
454
|
end
|
455
|
+
# If the string ends with a line continuation emit the remainder
|
456
|
+
emit = index == lines.count - 1
|
341
457
|
else
|
342
|
-
|
343
|
-
|
458
|
+
current_line << line
|
459
|
+
emit = true
|
344
460
|
end
|
345
461
|
|
346
|
-
|
347
|
-
|
348
|
-
|
462
|
+
if emit
|
463
|
+
end_offset = start_offset + current_line.bytesize + adjustment
|
464
|
+
tokens << [:tSTRING_CONTENT, [unescape_string(current_line, quote_stack.last), range(start_offset, end_offset)]]
|
465
|
+
start_offset = end_offset
|
466
|
+
current_line = +""
|
467
|
+
adjustment = 0
|
468
|
+
end
|
349
469
|
end
|
350
|
-
next
|
351
470
|
end
|
471
|
+
next
|
352
472
|
when :tSTRING_DVAR
|
353
473
|
value = nil
|
354
474
|
when :tSTRING_END
|
355
475
|
if token.type == :HEREDOC_END && value.end_with?("\n")
|
356
476
|
newline_length = value.end_with?("\r\n") ? 2 : 1
|
357
|
-
value =
|
358
|
-
location =
|
477
|
+
value = heredoc_stack.pop.identifier
|
478
|
+
location = range(token.location.start_offset, token.location.end_offset - newline_length)
|
359
479
|
elsif token.type == :REGEXP_END
|
360
480
|
value = value[0]
|
361
|
-
location =
|
481
|
+
location = range(token.location.start_offset, token.location.start_offset + 1)
|
482
|
+
end
|
483
|
+
|
484
|
+
if percent_array?(quote_stack.pop)
|
485
|
+
prev_token = lexed[index - 2][0] if index - 2 >= 0
|
486
|
+
empty = %i[PERCENT_LOWER_I PERCENT_LOWER_W PERCENT_UPPER_I PERCENT_UPPER_W].include?(prev_token&.type)
|
487
|
+
ends_with_whitespace = prev_token&.type == :WORDS_SEP
|
488
|
+
# parser always emits a space token after content in a percent array, even if no actual whitespace is present.
|
489
|
+
if !empty && !ends_with_whitespace
|
490
|
+
tokens << [:tSPACE, [nil, range(token.location.start_offset, token.location.start_offset)]]
|
491
|
+
end
|
362
492
|
end
|
363
493
|
when :tSYMBEG
|
364
494
|
if (next_token = lexed[index][0]) && next_token.type != :STRING_CONTENT && next_token.type != :EMBEXPR_BEGIN && next_token.type != :EMBVAR && next_token.type != :STRING_END
|
@@ -366,23 +496,35 @@ module Prism
|
|
366
496
|
type = :tSYMBOL
|
367
497
|
value = next_token.value
|
368
498
|
value = { "~@" => "~", "!@" => "!" }.fetch(value, value)
|
369
|
-
location =
|
499
|
+
location = range(next_location.start_offset, next_location.end_offset)
|
370
500
|
index += 1
|
501
|
+
else
|
502
|
+
quote_stack.push(value)
|
371
503
|
end
|
372
504
|
when :tFID
|
373
505
|
if !tokens.empty? && tokens.dig(-1, 0) == :kDEF
|
374
506
|
type = :tIDENTIFIER
|
375
507
|
end
|
376
508
|
when :tXSTRING_BEG
|
377
|
-
if (next_token = lexed[index][0]) &&
|
509
|
+
if (next_token = lexed[index][0]) && !%i[STRING_CONTENT STRING_END EMBEXPR_BEGIN].include?(next_token.type)
|
510
|
+
# self.`()
|
378
511
|
type = :tBACK_REF2
|
379
512
|
end
|
513
|
+
quote_stack.push(value)
|
514
|
+
when :tSYMBOLS_BEG, :tQSYMBOLS_BEG, :tWORDS_BEG, :tQWORDS_BEG
|
515
|
+
if (next_token = lexed[index][0]) && next_token.type == :WORDS_SEP
|
516
|
+
index += 1
|
517
|
+
end
|
518
|
+
|
519
|
+
quote_stack.push(value)
|
520
|
+
when :tREGEXP_BEG
|
521
|
+
quote_stack.push(value)
|
380
522
|
end
|
381
523
|
|
382
524
|
tokens << [type, [value, location]]
|
383
525
|
|
384
526
|
if token.type == :REGEXP_END
|
385
|
-
tokens << [:tREGEXP_OPT, [token.value[1..],
|
527
|
+
tokens << [:tREGEXP_OPT, [token.value[1..], range(token.location.start_offset + 1, token.location.end_offset)]]
|
386
528
|
end
|
387
529
|
end
|
388
530
|
|
@@ -391,6 +533,11 @@ module Prism
|
|
391
533
|
|
392
534
|
private
|
393
535
|
|
536
|
+
# Creates a new parser range, taking prism's byte offsets into account
|
537
|
+
def range(start_offset, end_offset)
|
538
|
+
Range.new(source_buffer, offset_cache[start_offset], offset_cache[end_offset])
|
539
|
+
end
|
540
|
+
|
394
541
|
# Parse an integer from the string representation.
|
395
542
|
def parse_integer(value)
|
396
543
|
Integer(value)
|
@@ -432,6 +579,233 @@ module Prism
|
|
432
579
|
rescue ArgumentError
|
433
580
|
0r
|
434
581
|
end
|
582
|
+
|
583
|
+
# Wonky heredoc tab/spaces rules.
|
584
|
+
# https://github.com/ruby/prism/blob/v1.3.0/src/prism.c#L10548-L10558
|
585
|
+
def calculate_heredoc_whitespace(heredoc_token_index)
|
586
|
+
next_token_index = heredoc_token_index
|
587
|
+
nesting_level = 0
|
588
|
+
previous_line = -1
|
589
|
+
result = Float::MAX
|
590
|
+
|
591
|
+
while (lexed[next_token_index] && next_token = lexed[next_token_index][0])
|
592
|
+
next_token_index += 1
|
593
|
+
next_next_token = lexed[next_token_index] && lexed[next_token_index][0]
|
594
|
+
first_token_on_line = next_token.location.start_column == 0
|
595
|
+
|
596
|
+
# String content inside nested heredocs and interpolation is ignored
|
597
|
+
if next_token.type == :HEREDOC_START || next_token.type == :EMBEXPR_BEGIN
|
598
|
+
# When interpolation is the first token of a line there is no string
|
599
|
+
# content to check against. There will be no common whitespace.
|
600
|
+
if nesting_level == 0 && first_token_on_line
|
601
|
+
result = 0
|
602
|
+
end
|
603
|
+
nesting_level += 1
|
604
|
+
elsif next_token.type == :HEREDOC_END || next_token.type == :EMBEXPR_END
|
605
|
+
nesting_level -= 1
|
606
|
+
# When we encountered the matching heredoc end, we can exit
|
607
|
+
break if nesting_level == -1
|
608
|
+
elsif next_token.type == :STRING_CONTENT && nesting_level == 0 && first_token_on_line
|
609
|
+
common_whitespace = 0
|
610
|
+
next_token.value[/^\s*/].each_char do |char|
|
611
|
+
if char == "\t"
|
612
|
+
common_whitespace = (common_whitespace / 8 + 1) * 8;
|
613
|
+
else
|
614
|
+
common_whitespace += 1
|
615
|
+
end
|
616
|
+
end
|
617
|
+
|
618
|
+
is_first_token_on_line = next_token.location.start_line != previous_line
|
619
|
+
# Whitespace is significant if followed by interpolation
|
620
|
+
whitespace_only = common_whitespace == next_token.value.length && next_next_token&.location&.start_line != next_token.location.start_line
|
621
|
+
if is_first_token_on_line && !whitespace_only && common_whitespace < result
|
622
|
+
result = common_whitespace
|
623
|
+
previous_line = next_token.location.start_line
|
624
|
+
end
|
625
|
+
end
|
626
|
+
end
|
627
|
+
result
|
628
|
+
end
|
629
|
+
|
630
|
+
# Wonky heredoc tab/spaces rules.
|
631
|
+
# https://github.com/ruby/prism/blob/v1.3.0/src/prism.c#L16528-L16545
|
632
|
+
def trim_heredoc_whitespace(string, heredoc)
|
633
|
+
trimmed_whitespace = 0
|
634
|
+
trimmed_characters = 0
|
635
|
+
while (string[trimmed_characters] == "\t" || string[trimmed_characters] == " ") && trimmed_whitespace < heredoc.common_whitespace
|
636
|
+
if string[trimmed_characters] == "\t"
|
637
|
+
trimmed_whitespace = (trimmed_whitespace / 8 + 1) * 8;
|
638
|
+
break if trimmed_whitespace > heredoc.common_whitespace
|
639
|
+
else
|
640
|
+
trimmed_whitespace += 1
|
641
|
+
end
|
642
|
+
trimmed_characters += 1
|
643
|
+
end
|
644
|
+
|
645
|
+
string[trimmed_characters..]
|
646
|
+
end
|
647
|
+
|
648
|
+
# Escape sequences that have special meaning and should appear unescaped in the resulting string.
|
649
|
+
ESCAPES = {
|
650
|
+
"a" => "\a", "b" => "\b", "e" => "\e", "f" => "\f",
|
651
|
+
"n" => "\n", "r" => "\r", "s" => "\s", "t" => "\t",
|
652
|
+
"v" => "\v", "\\" => "\\"
|
653
|
+
}.freeze
|
654
|
+
private_constant :ESCAPES
|
655
|
+
|
656
|
+
# When one of these delimiters is encountered, then the other
|
657
|
+
# one is allowed to be escaped as well.
|
658
|
+
DELIMITER_SYMETRY = { "[" => "]", "(" => ")", "{" => "}", "<" => ">" }.freeze
|
659
|
+
private_constant :DELIMITER_SYMETRY
|
660
|
+
|
661
|
+
|
662
|
+
# https://github.com/whitequark/parser/blob/v3.3.6.0/lib/parser/lexer-strings.rl#L14
|
663
|
+
REGEXP_META_CHARACTERS = ["\\", "$", "(", ")", "*", "+", ".", "<", ">", "?", "[", "]", "^", "{", "|", "}"]
|
664
|
+
private_constant :REGEXP_META_CHARACTERS
|
665
|
+
|
666
|
+
# Apply Ruby string escaping rules
|
667
|
+
def unescape_string(string, quote)
|
668
|
+
# In single-quoted heredocs, everything is taken literally.
|
669
|
+
return string if quote == "<<'"
|
670
|
+
|
671
|
+
# OPTIMIZATION: Assume that few strings need escaping to speed up the common case.
|
672
|
+
return string unless string.include?("\\")
|
673
|
+
|
674
|
+
# Enclosing character for the string. `"` for `"foo"`, `{` for `%w{foo}`, etc.
|
675
|
+
delimiter = quote[-1]
|
676
|
+
|
677
|
+
if regexp?(quote)
|
678
|
+
# Should be handled similarly to single-quoted heredocs. The only character that is
|
679
|
+
# allowed to be escaped is the delimiter, except when that also has special meaning
|
680
|
+
# in the regexp. Since all the symmetry delimiters have special meaning, they don't need
|
681
|
+
# to be considered separately.
|
682
|
+
if REGEXP_META_CHARACTERS.include?(delimiter)
|
683
|
+
string
|
684
|
+
else
|
685
|
+
# There can never be an even amount of backslashes. It would be a syntax error.
|
686
|
+
string.gsub(/\\(#{Regexp.escape(delimiter)})/, '\1')
|
687
|
+
end
|
688
|
+
elsif interpolation?(quote)
|
689
|
+
# Appending individual escape sequences may force the string out of its intended
|
690
|
+
# encoding. Start out with binary and force it back later.
|
691
|
+
result = "".b
|
692
|
+
|
693
|
+
scanner = StringScanner.new(string)
|
694
|
+
while (skipped = scanner.skip_until(/\\/))
|
695
|
+
# Append what was just skipped over, excluding the found backslash.
|
696
|
+
result.append_as_bytes(string.byteslice(scanner.pos - skipped, skipped - 1))
|
697
|
+
escape_read(result, scanner, false, false)
|
698
|
+
end
|
699
|
+
|
700
|
+
# Add remaining chars
|
701
|
+
result.append_as_bytes(string.byteslice(scanner.pos..))
|
702
|
+
result.force_encoding(source_buffer.source.encoding)
|
703
|
+
else
|
704
|
+
delimiters = Regexp.escape("#{delimiter}#{DELIMITER_SYMETRY[delimiter]}")
|
705
|
+
string.gsub(/\\([\\#{delimiters}])/, '\1')
|
706
|
+
end
|
707
|
+
end
|
708
|
+
|
709
|
+
# Certain strings are merged into a single string token.
|
710
|
+
def simplify_string?(value, quote)
|
711
|
+
case quote
|
712
|
+
when "'"
|
713
|
+
# Only simplify 'foo'
|
714
|
+
!value.include?("\n")
|
715
|
+
when '"'
|
716
|
+
# Simplify when every line ends with a line continuation, or it is the last line
|
717
|
+
value.lines.all? do |line|
|
718
|
+
!line.end_with?("\n") || line[/(\\*)$/, 1]&.length&.odd?
|
719
|
+
end
|
720
|
+
else
|
721
|
+
# %q and similar are never simplified
|
722
|
+
false
|
723
|
+
end
|
724
|
+
end
|
725
|
+
|
726
|
+
# Escape a byte value, given the control and meta flags.
|
727
|
+
def escape_build(value, control, meta)
|
728
|
+
value &= 0x9f if control
|
729
|
+
value |= 0x80 if meta
|
730
|
+
value
|
731
|
+
end
|
732
|
+
|
733
|
+
# Read an escape out of the string scanner, given the control and meta
|
734
|
+
# flags, and push the unescaped value into the result.
|
735
|
+
def escape_read(result, scanner, control, meta)
|
736
|
+
if scanner.skip("\n")
|
737
|
+
# Line continuation
|
738
|
+
elsif (value = ESCAPES[scanner.peek(1)])
|
739
|
+
# Simple single-character escape sequences like \n
|
740
|
+
result.append_as_bytes(value)
|
741
|
+
scanner.pos += 1
|
742
|
+
elsif (value = scanner.scan(/[0-7]{1,3}/))
|
743
|
+
# \nnn
|
744
|
+
result.append_as_bytes(escape_build(value.to_i(8), control, meta))
|
745
|
+
elsif (value = scanner.scan(/x[0-9a-fA-F]{1,2}/))
|
746
|
+
# \xnn
|
747
|
+
result.append_as_bytes(escape_build(value[1..].to_i(16), control, meta))
|
748
|
+
elsif (value = scanner.scan(/u[0-9a-fA-F]{4}/))
|
749
|
+
# \unnnn
|
750
|
+
result.append_as_bytes(value[1..].hex.chr(Encoding::UTF_8))
|
751
|
+
elsif scanner.skip("u{}")
|
752
|
+
# https://github.com/whitequark/parser/issues/856
|
753
|
+
elsif (value = scanner.scan(/u{.*?}/))
|
754
|
+
# \u{nnnn ...}
|
755
|
+
value[2..-2].split.each do |unicode|
|
756
|
+
result.append_as_bytes(unicode.hex.chr(Encoding::UTF_8))
|
757
|
+
end
|
758
|
+
elsif (value = scanner.scan(/c\\?(?=[[:print:]])|C-\\?(?=[[:print:]])/))
|
759
|
+
# \cx or \C-x where x is an ASCII printable character
|
760
|
+
escape_read(result, scanner, true, meta)
|
761
|
+
elsif (value = scanner.scan(/M-\\?(?=[[:print:]])/))
|
762
|
+
# \M-x where x is an ASCII printable character
|
763
|
+
escape_read(result, scanner, control, true)
|
764
|
+
elsif (byte = scanner.get_byte)
|
765
|
+
# Something else after an escape.
|
766
|
+
if control && byte == "?"
|
767
|
+
result.append_as_bytes(escape_build(0x7f, false, meta))
|
768
|
+
else
|
769
|
+
result.append_as_bytes(escape_build(byte.ord, control, meta))
|
770
|
+
end
|
771
|
+
end
|
772
|
+
end
|
773
|
+
|
774
|
+
# In a percent array, certain whitespace can be preceded with a backslash,
|
775
|
+
# causing the following characters to be part of the previous element.
|
776
|
+
def percent_array_unescape(string)
|
777
|
+
string.gsub(/(\\)+[ \f\n\r\t\v]/) do |full_match|
|
778
|
+
full_match.delete_prefix!("\\") if Regexp.last_match[1].length.odd?
|
779
|
+
full_match
|
780
|
+
end
|
781
|
+
end
|
782
|
+
|
783
|
+
# For %-arrays whitespace, the parser gem only considers whitespace before the newline.
|
784
|
+
def percent_array_leading_whitespace(string)
|
785
|
+
return 1 if string.start_with?("\n")
|
786
|
+
|
787
|
+
leading_whitespace = 0
|
788
|
+
string.each_char do |c|
|
789
|
+
break if c == "\n"
|
790
|
+
leading_whitespace += 1
|
791
|
+
end
|
792
|
+
leading_whitespace
|
793
|
+
end
|
794
|
+
|
795
|
+
# Determine if characters preceded by a backslash should be escaped or not
|
796
|
+
def interpolation?(quote)
|
797
|
+
!quote.end_with?("'") && !quote.start_with?("%q", "%w", "%i", "%s")
|
798
|
+
end
|
799
|
+
|
800
|
+
# Regexps allow interpolation but are handled differently during unescaping
|
801
|
+
def regexp?(quote)
|
802
|
+
quote == "/" || quote.start_with?("%r")
|
803
|
+
end
|
804
|
+
|
805
|
+
# Determine if the string is part of a %-style array.
|
806
|
+
def percent_array?(quote)
|
807
|
+
quote.start_with?("%w", "%W", "%i", "%I")
|
808
|
+
end
|
435
809
|
end
|
436
810
|
end
|
437
811
|
end
|