prism 1.3.0 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +46 -1
- data/Makefile +2 -1
- data/README.md +1 -0
- data/config.yml +273 -37
- data/docs/parser_translation.md +8 -23
- data/docs/releasing.md +1 -1
- data/docs/ripper_translation.md +1 -1
- data/docs/ruby_api.md +1 -1
- data/ext/prism/api_node.c +1816 -1303
- data/ext/prism/extension.c +244 -110
- data/ext/prism/extension.h +4 -4
- data/include/prism/ast.h +291 -49
- data/include/prism/defines.h +4 -1
- data/include/prism/diagnostic.h +4 -0
- data/include/prism/options.h +89 -3
- data/include/prism/regexp.h +2 -2
- data/include/prism/util/pm_buffer.h +18 -0
- data/include/prism/util/pm_integer.h +4 -0
- data/include/prism/util/pm_list.h +6 -0
- data/include/prism/util/pm_string.h +12 -2
- data/include/prism/version.h +2 -2
- data/include/prism.h +41 -16
- data/lib/prism/compiler.rb +456 -151
- data/lib/prism/desugar_compiler.rb +1 -0
- data/lib/prism/dispatcher.rb +16 -0
- data/lib/prism/dot_visitor.rb +21 -1
- data/lib/prism/dsl.rb +13 -2
- data/lib/prism/ffi.rb +62 -34
- data/lib/prism/inspect_visitor.rb +5 -1
- data/lib/prism/lex_compat.rb +1 -0
- data/lib/prism/mutation_compiler.rb +3 -0
- data/lib/prism/node.rb +554 -345
- data/lib/prism/node_ext.rb +4 -1
- data/lib/prism/pack.rb +2 -0
- data/lib/prism/parse_result/comments.rb +1 -0
- data/lib/prism/parse_result/errors.rb +1 -0
- data/lib/prism/parse_result/newlines.rb +2 -1
- data/lib/prism/parse_result.rb +53 -0
- data/lib/prism/pattern.rb +1 -0
- data/lib/prism/polyfill/append_as_bytes.rb +15 -0
- data/lib/prism/polyfill/scan_byte.rb +14 -0
- data/lib/prism/polyfill/warn.rb +42 -0
- data/lib/prism/reflection.rb +5 -2
- data/lib/prism/relocation.rb +1 -0
- data/lib/prism/serialize.rb +1275 -783
- data/lib/prism/string_query.rb +1 -0
- data/lib/prism/translation/parser/builder.rb +62 -0
- data/lib/prism/translation/parser/compiler.rb +230 -152
- data/lib/prism/translation/parser/lexer.rb +446 -64
- data/lib/prism/translation/parser.rb +64 -4
- data/lib/prism/translation/parser33.rb +1 -0
- data/lib/prism/translation/parser34.rb +1 -0
- data/lib/prism/translation/parser35.rb +13 -0
- data/lib/prism/translation/parser_current.rb +24 -0
- data/lib/prism/translation/ripper/sexp.rb +1 -0
- data/lib/prism/translation/ripper.rb +30 -4
- data/lib/prism/translation/ruby_parser.rb +291 -7
- data/lib/prism/translation.rb +3 -0
- data/lib/prism/visitor.rb +457 -152
- data/lib/prism.rb +5 -3
- data/prism.gemspec +9 -1
- data/rbi/prism/dsl.rbi +9 -6
- data/rbi/prism/node.rbi +43 -16
- data/rbi/prism/parse_result.rbi +17 -0
- data/rbi/prism/translation/parser35.rbi +6 -0
- data/rbi/prism.rbi +39 -36
- data/sig/prism/dispatcher.rbs +3 -0
- data/sig/prism/dsl.rbs +7 -5
- data/sig/prism/node.rbs +461 -37
- data/sig/prism/node_ext.rbs +84 -17
- data/sig/prism/parse_result/comments.rbs +38 -0
- data/sig/prism/parse_result.rbs +14 -0
- data/sig/prism/reflection.rbs +1 -1
- data/sig/prism/serialize.rbs +4 -2
- data/sig/prism.rbs +22 -1
- data/src/diagnostic.c +9 -3
- data/src/node.c +23 -0
- data/src/options.c +33 -2
- data/src/prettyprint.c +32 -0
- data/src/prism.c +620 -242
- data/src/serialize.c +8 -0
- data/src/token_type.c +36 -34
- data/src/util/pm_buffer.c +40 -0
- data/src/util/pm_constant_pool.c +6 -2
- data/src/util/pm_strncasecmp.c +13 -1
- metadata +11 -7
@@ -1,4 +1,9 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
|
+
# :markup: markdown
|
3
|
+
|
4
|
+
require "strscan"
|
5
|
+
require_relative "../../polyfill/append_as_bytes"
|
6
|
+
require_relative "../../polyfill/scan_byte"
|
2
7
|
|
3
8
|
module Prism
|
4
9
|
module Translation
|
@@ -6,16 +11,17 @@ module Prism
|
|
6
11
|
# Accepts a list of prism tokens and converts them into the expected
|
7
12
|
# format for the parser gem.
|
8
13
|
class Lexer
|
14
|
+
# These tokens are always skipped
|
15
|
+
TYPES_ALWAYS_SKIP = Set.new(%i[IGNORED_NEWLINE __END__ EOF])
|
16
|
+
private_constant :TYPES_ALWAYS_SKIP
|
17
|
+
|
9
18
|
# The direct translating of types between the two lexers.
|
10
19
|
TYPES = {
|
11
20
|
# These tokens should never appear in the output of the lexer.
|
12
|
-
EOF: nil,
|
13
21
|
MISSING: nil,
|
14
22
|
NOT_PROVIDED: nil,
|
15
|
-
IGNORED_NEWLINE: nil,
|
16
23
|
EMBDOC_END: nil,
|
17
24
|
EMBDOC_LINE: nil,
|
18
|
-
__END__: nil,
|
19
25
|
|
20
26
|
# These tokens have more or less direct mappings.
|
21
27
|
AMPERSAND: :tAMPER2,
|
@@ -191,16 +197,24 @@ module Prism
|
|
191
197
|
#
|
192
198
|
# NOTE: In edge cases like `-> (foo = -> (bar) {}) do end`, please note that `kDO` is still returned
|
193
199
|
# instead of `kDO_LAMBDA`, which is expected: https://github.com/ruby/prism/pull/3046
|
194
|
-
LAMBDA_TOKEN_TYPES = [:kDO_LAMBDA, :tLAMBDA, :tLAMBEG]
|
200
|
+
LAMBDA_TOKEN_TYPES = Set.new([:kDO_LAMBDA, :tLAMBDA, :tLAMBEG])
|
195
201
|
|
196
202
|
# The `PARENTHESIS_LEFT` token in Prism is classified as either `tLPAREN` or `tLPAREN2` in the Parser gem.
|
197
203
|
# The following token types are listed as those classified as `tLPAREN`.
|
198
|
-
LPAREN_CONVERSION_TOKEN_TYPES = [
|
199
|
-
:kBREAK, :kCASE, :tDIVIDE, :kFOR, :kIF, :kNEXT, :kRETURN, :kUNTIL, :kWHILE, :tAMPER, :tANDOP, :tBANG, :tCOMMA, :tDOT2, :tDOT3,
|
200
|
-
:tEQL, :tLPAREN, :tLPAREN2, :tLSHFT, :tNL, :tOP_ASGN, :tOROP, :tPIPE, :tSEMI, :tSTRING_DBEG, :tUMINUS, :tUPLUS
|
201
|
-
]
|
204
|
+
LPAREN_CONVERSION_TOKEN_TYPES = Set.new([
|
205
|
+
:kBREAK, :tCARET, :kCASE, :tDIVIDE, :kFOR, :kIF, :kNEXT, :kRETURN, :kUNTIL, :kWHILE, :tAMPER, :tANDOP, :tBANG, :tCOMMA, :tDOT2, :tDOT3,
|
206
|
+
:tEQL, :tLPAREN, :tLPAREN2, :tLPAREN_ARG, :tLSHFT, :tNL, :tOP_ASGN, :tOROP, :tPIPE, :tSEMI, :tSTRING_DBEG, :tUMINUS, :tUPLUS, :tLCURLY
|
207
|
+
])
|
208
|
+
|
209
|
+
# Types of tokens that are allowed to continue a method call with comments in-between.
|
210
|
+
# For these, the parser gem doesn't emit a newline token after the last comment.
|
211
|
+
COMMENT_CONTINUATION_TYPES = Set.new([:COMMENT, :AMPERSAND_DOT, :DOT])
|
212
|
+
private_constant :COMMENT_CONTINUATION_TYPES
|
213
|
+
|
214
|
+
# Heredocs are complex and require us to keep track of a bit of info to refer to later
|
215
|
+
HeredocData = Struct.new(:identifier, :common_whitespace, keyword_init: true)
|
202
216
|
|
203
|
-
private_constant :TYPES, :EXPR_BEG, :EXPR_LABEL, :LAMBDA_TOKEN_TYPES, :LPAREN_CONVERSION_TOKEN_TYPES
|
217
|
+
private_constant :TYPES, :EXPR_BEG, :EXPR_LABEL, :LAMBDA_TOKEN_TYPES, :LPAREN_CONVERSION_TOKEN_TYPES, :HeredocData
|
204
218
|
|
205
219
|
# The Parser::Source::Buffer that the tokens were lexed from.
|
206
220
|
attr_reader :source_buffer
|
@@ -230,46 +244,78 @@ module Prism
|
|
230
244
|
index = 0
|
231
245
|
length = lexed.length
|
232
246
|
|
233
|
-
|
247
|
+
heredoc_stack = []
|
248
|
+
quote_stack = []
|
249
|
+
|
250
|
+
# The parser gem emits the newline tokens for comments out of order. This saves
|
251
|
+
# that token location to emit at a later time to properly line everything up.
|
252
|
+
# https://github.com/whitequark/parser/issues/1025
|
253
|
+
comment_newline_location = nil
|
234
254
|
|
235
255
|
while index < length
|
236
256
|
token, state = lexed[index]
|
237
257
|
index += 1
|
238
|
-
next if
|
258
|
+
next if TYPES_ALWAYS_SKIP.include?(token.type)
|
239
259
|
|
240
260
|
type = TYPES.fetch(token.type)
|
241
261
|
value = token.value
|
242
|
-
location =
|
262
|
+
location = range(token.location.start_offset, token.location.end_offset)
|
243
263
|
|
244
264
|
case type
|
245
265
|
when :kDO
|
246
|
-
|
247
|
-
|
266
|
+
nearest_lambda_token = tokens.reverse_each.find do |token|
|
267
|
+
LAMBDA_TOKEN_TYPES.include?(token.first)
|
268
|
+
end
|
248
269
|
|
249
|
-
if
|
270
|
+
if nearest_lambda_token&.first == :tLAMBDA
|
250
271
|
type = :kDO_LAMBDA
|
251
272
|
end
|
252
273
|
when :tCHARACTER
|
253
274
|
value.delete_prefix!("?")
|
275
|
+
# Character literals behave similar to double-quoted strings. We can use the same escaping mechanism.
|
276
|
+
value = unescape_string(value, "?")
|
254
277
|
when :tCOMMENT
|
255
278
|
if token.type == :EMBDOC_BEGIN
|
256
|
-
start_index = index
|
257
279
|
|
258
|
-
while !((next_token = lexed[index]
|
280
|
+
while !((next_token = lexed[index]&.first) && next_token.type == :EMBDOC_END) && (index < length - 1)
|
259
281
|
value += next_token.value
|
260
282
|
index += 1
|
261
283
|
end
|
262
284
|
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
index += 1
|
267
|
-
end
|
285
|
+
value += next_token.value
|
286
|
+
location = range(token.location.start_offset, next_token.location.end_offset)
|
287
|
+
index += 1
|
268
288
|
else
|
269
|
-
value.chomp
|
270
|
-
location =
|
289
|
+
is_at_eol = value.chomp!.nil?
|
290
|
+
location = range(token.location.start_offset, token.location.end_offset + (is_at_eol ? 0 : -1))
|
291
|
+
|
292
|
+
prev_token, _ = lexed[index - 2] if index - 2 >= 0
|
293
|
+
next_token, _ = lexed[index]
|
294
|
+
|
295
|
+
is_inline_comment = prev_token&.location&.start_line == token.location.start_line
|
296
|
+
if is_inline_comment && !is_at_eol && !COMMENT_CONTINUATION_TYPES.include?(next_token&.type)
|
297
|
+
tokens << [:tCOMMENT, [value, location]]
|
298
|
+
|
299
|
+
nl_location = range(token.location.end_offset - 1, token.location.end_offset)
|
300
|
+
tokens << [:tNL, [nil, nl_location]]
|
301
|
+
next
|
302
|
+
elsif is_inline_comment && next_token&.type == :COMMENT
|
303
|
+
comment_newline_location = range(token.location.end_offset - 1, token.location.end_offset)
|
304
|
+
elsif comment_newline_location && !COMMENT_CONTINUATION_TYPES.include?(next_token&.type)
|
305
|
+
tokens << [:tCOMMENT, [value, location]]
|
306
|
+
tokens << [:tNL, [nil, comment_newline_location]]
|
307
|
+
comment_newline_location = nil
|
308
|
+
next
|
309
|
+
end
|
271
310
|
end
|
272
311
|
when :tNL
|
312
|
+
next_token, _ = lexed[index]
|
313
|
+
# Newlines after comments are emitted out of order.
|
314
|
+
if next_token&.type == :COMMENT
|
315
|
+
comment_newline_location = location
|
316
|
+
next
|
317
|
+
end
|
318
|
+
|
273
319
|
value = nil
|
274
320
|
when :tFLOAT
|
275
321
|
value = parse_float(value)
|
@@ -277,8 +323,8 @@ module Prism
|
|
277
323
|
value = parse_complex(value)
|
278
324
|
when :tINTEGER
|
279
325
|
if value.start_with?("+")
|
280
|
-
tokens << [:tUNARY_NUM, ["+",
|
281
|
-
location =
|
326
|
+
tokens << [:tUNARY_NUM, ["+", range(token.location.start_offset, token.location.start_offset + 1)]]
|
327
|
+
location = range(token.location.start_offset + 1, token.location.end_offset)
|
282
328
|
end
|
283
329
|
|
284
330
|
value = parse_integer(value)
|
@@ -297,92 +343,196 @@ module Prism
|
|
297
343
|
when :tRATIONAL
|
298
344
|
value = parse_rational(value)
|
299
345
|
when :tSPACE
|
346
|
+
location = range(token.location.start_offset, token.location.start_offset + percent_array_leading_whitespace(value))
|
300
347
|
value = nil
|
301
348
|
when :tSTRING_BEG
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
349
|
+
next_token, _ = lexed[index]
|
350
|
+
next_next_token, _ = lexed[index + 1]
|
351
|
+
basic_quotes = value == '"' || value == "'"
|
352
|
+
|
353
|
+
if basic_quotes && next_token&.type == :STRING_END
|
306
354
|
next_location = token.location.join(next_token.location)
|
307
355
|
type = :tSTRING
|
308
356
|
value = ""
|
309
|
-
location =
|
357
|
+
location = range(next_location.start_offset, next_location.end_offset)
|
310
358
|
index += 1
|
311
|
-
elsif
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
359
|
+
elsif value.start_with?("'", '"', "%")
|
360
|
+
if next_token&.type == :STRING_CONTENT && next_next_token&.type == :STRING_END
|
361
|
+
string_value = next_token.value
|
362
|
+
if simplify_string?(string_value, value)
|
363
|
+
next_location = token.location.join(next_next_token.location)
|
364
|
+
if percent_array?(value)
|
365
|
+
value = percent_array_unescape(string_value)
|
366
|
+
else
|
367
|
+
value = unescape_string(string_value, value)
|
368
|
+
end
|
369
|
+
type = :tSTRING
|
370
|
+
location = range(next_location.start_offset, next_location.end_offset)
|
371
|
+
index += 2
|
372
|
+
tokens << [type, [value, location]]
|
373
|
+
|
374
|
+
next
|
375
|
+
end
|
376
|
+
end
|
377
|
+
|
378
|
+
quote_stack.push(value)
|
379
|
+
elsif token.type == :HEREDOC_START
|
318
380
|
quote = value[2] == "-" || value[2] == "~" ? value[3] : value[2]
|
381
|
+
heredoc_type = value[2] == "-" || value[2] == "~" ? value[2] : ""
|
382
|
+
heredoc = HeredocData.new(
|
383
|
+
identifier: value.match(/<<[-~]?["'`]?(?<heredoc_identifier>.*?)["'`]?\z/)[:heredoc_identifier],
|
384
|
+
common_whitespace: 0,
|
385
|
+
)
|
386
|
+
|
319
387
|
if quote == "`"
|
320
388
|
type = :tXSTRING_BEG
|
321
|
-
|
389
|
+
end
|
390
|
+
|
391
|
+
# The parser gem trims whitespace from squiggly heredocs. We must record
|
392
|
+
# the most common whitespace to later remove.
|
393
|
+
if heredoc_type == "~" || heredoc_type == "`"
|
394
|
+
heredoc.common_whitespace = calculate_heredoc_whitespace(index)
|
395
|
+
end
|
396
|
+
|
397
|
+
if quote == "'" || quote == '"' || quote == "`"
|
398
|
+
value = "<<#{quote}"
|
322
399
|
else
|
323
|
-
value = "
|
400
|
+
value = '<<"'
|
324
401
|
end
|
402
|
+
|
403
|
+
heredoc_stack.push(heredoc)
|
404
|
+
quote_stack.push(value)
|
325
405
|
end
|
326
406
|
when :tSTRING_CONTENT
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
407
|
+
is_percent_array = percent_array?(quote_stack.last)
|
408
|
+
|
409
|
+
if (lines = token.value.lines).one?
|
410
|
+
# Prism usually emits a single token for strings with line continuations.
|
411
|
+
# For squiggly heredocs they are not joined so we do that manually here.
|
412
|
+
current_string = +""
|
413
|
+
current_length = 0
|
414
|
+
start_offset = token.location.start_offset
|
415
|
+
while token.type == :STRING_CONTENT
|
416
|
+
current_length += token.value.bytesize
|
417
|
+
# Heredoc interpolation can have multiple STRING_CONTENT nodes on the same line.
|
418
|
+
prev_token, _ = lexed[index - 2] if index - 2 >= 0
|
419
|
+
is_first_token_on_line = prev_token && token.location.start_line != prev_token.location.start_line
|
420
|
+
# The parser gem only removes indentation when the heredoc is not nested
|
421
|
+
not_nested = heredoc_stack.size == 1
|
422
|
+
if is_percent_array
|
423
|
+
value = percent_array_unescape(token.value)
|
424
|
+
elsif is_first_token_on_line && not_nested && (current_heredoc = heredoc_stack.last).common_whitespace > 0
|
425
|
+
value = trim_heredoc_whitespace(token.value, current_heredoc)
|
426
|
+
end
|
427
|
+
|
428
|
+
current_string << unescape_string(value, quote_stack.last)
|
429
|
+
relevant_backslash_count = if quote_stack.last.start_with?("%W", "%I")
|
430
|
+
0 # the last backslash escapes the newline
|
431
|
+
else
|
432
|
+
token.value[/(\\{1,})\n/, 1]&.length || 0
|
433
|
+
end
|
434
|
+
if relevant_backslash_count.even? || !interpolation?(quote_stack.last)
|
435
|
+
tokens << [:tSTRING_CONTENT, [current_string, range(start_offset, start_offset + current_length)]]
|
436
|
+
break
|
437
|
+
end
|
438
|
+
token, _ = lexed[index]
|
439
|
+
index += 1
|
440
|
+
end
|
441
|
+
else
|
442
|
+
# When the parser gem encounters a line continuation inside of a multiline string,
|
443
|
+
# it emits a single string node. The backslash (and remaining newline) is removed.
|
444
|
+
current_line = +""
|
445
|
+
adjustment = 0
|
446
|
+
start_offset = token.location.start_offset
|
447
|
+
emit = false
|
448
|
+
|
449
|
+
lines.each.with_index do |line, index|
|
331
450
|
chomped_line = line.chomp
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
451
|
+
backslash_count = chomped_line[/\\{1,}\z/]&.length || 0
|
452
|
+
is_interpolation = interpolation?(quote_stack.last)
|
453
|
+
|
454
|
+
if backslash_count.odd? && (is_interpolation || is_percent_array)
|
455
|
+
if is_percent_array
|
456
|
+
current_line << percent_array_unescape(line)
|
457
|
+
adjustment += 1
|
338
458
|
else
|
339
|
-
|
459
|
+
chomped_line.delete_suffix!("\\")
|
460
|
+
current_line << chomped_line
|
461
|
+
adjustment += 2
|
340
462
|
end
|
463
|
+
# If the string ends with a line continuation emit the remainder
|
464
|
+
emit = index == lines.count - 1
|
341
465
|
else
|
342
|
-
|
343
|
-
|
466
|
+
current_line << line
|
467
|
+
emit = true
|
344
468
|
end
|
345
469
|
|
346
|
-
|
347
|
-
|
348
|
-
|
470
|
+
if emit
|
471
|
+
end_offset = start_offset + current_line.bytesize + adjustment
|
472
|
+
tokens << [:tSTRING_CONTENT, [unescape_string(current_line, quote_stack.last), range(start_offset, end_offset)]]
|
473
|
+
start_offset = end_offset
|
474
|
+
current_line = +""
|
475
|
+
adjustment = 0
|
476
|
+
end
|
349
477
|
end
|
350
|
-
next
|
351
478
|
end
|
479
|
+
next
|
352
480
|
when :tSTRING_DVAR
|
353
481
|
value = nil
|
354
482
|
when :tSTRING_END
|
355
483
|
if token.type == :HEREDOC_END && value.end_with?("\n")
|
356
484
|
newline_length = value.end_with?("\r\n") ? 2 : 1
|
357
|
-
value =
|
358
|
-
location =
|
485
|
+
value = heredoc_stack.pop.identifier
|
486
|
+
location = range(token.location.start_offset, token.location.end_offset - newline_length)
|
359
487
|
elsif token.type == :REGEXP_END
|
360
488
|
value = value[0]
|
361
|
-
location =
|
489
|
+
location = range(token.location.start_offset, token.location.start_offset + 1)
|
490
|
+
end
|
491
|
+
|
492
|
+
if percent_array?(quote_stack.pop)
|
493
|
+
prev_token, _ = lexed[index - 2] if index - 2 >= 0
|
494
|
+
empty = %i[PERCENT_LOWER_I PERCENT_LOWER_W PERCENT_UPPER_I PERCENT_UPPER_W].include?(prev_token&.type)
|
495
|
+
ends_with_whitespace = prev_token&.type == :WORDS_SEP
|
496
|
+
# parser always emits a space token after content in a percent array, even if no actual whitespace is present.
|
497
|
+
if !empty && !ends_with_whitespace
|
498
|
+
tokens << [:tSPACE, [nil, range(token.location.start_offset, token.location.start_offset)]]
|
499
|
+
end
|
362
500
|
end
|
363
501
|
when :tSYMBEG
|
364
|
-
if (next_token = lexed[index]
|
502
|
+
if (next_token = lexed[index]&.first) && next_token.type != :STRING_CONTENT && next_token.type != :EMBEXPR_BEGIN && next_token.type != :EMBVAR && next_token.type != :STRING_END
|
365
503
|
next_location = token.location.join(next_token.location)
|
366
504
|
type = :tSYMBOL
|
367
505
|
value = next_token.value
|
368
506
|
value = { "~@" => "~", "!@" => "!" }.fetch(value, value)
|
369
|
-
location =
|
507
|
+
location = range(next_location.start_offset, next_location.end_offset)
|
370
508
|
index += 1
|
509
|
+
else
|
510
|
+
quote_stack.push(value)
|
371
511
|
end
|
372
512
|
when :tFID
|
373
513
|
if !tokens.empty? && tokens.dig(-1, 0) == :kDEF
|
374
514
|
type = :tIDENTIFIER
|
375
515
|
end
|
376
516
|
when :tXSTRING_BEG
|
377
|
-
if (next_token = lexed[index]
|
517
|
+
if (next_token = lexed[index]&.first) && !%i[STRING_CONTENT STRING_END EMBEXPR_BEGIN].include?(next_token.type)
|
518
|
+
# self.`()
|
378
519
|
type = :tBACK_REF2
|
379
520
|
end
|
521
|
+
quote_stack.push(value)
|
522
|
+
when :tSYMBOLS_BEG, :tQSYMBOLS_BEG, :tWORDS_BEG, :tQWORDS_BEG
|
523
|
+
if (next_token = lexed[index]&.first) && next_token.type == :WORDS_SEP
|
524
|
+
index += 1
|
525
|
+
end
|
526
|
+
|
527
|
+
quote_stack.push(value)
|
528
|
+
when :tREGEXP_BEG
|
529
|
+
quote_stack.push(value)
|
380
530
|
end
|
381
531
|
|
382
532
|
tokens << [type, [value, location]]
|
383
533
|
|
384
534
|
if token.type == :REGEXP_END
|
385
|
-
tokens << [:tREGEXP_OPT, [token.value[1..],
|
535
|
+
tokens << [:tREGEXP_OPT, [token.value[1..], range(token.location.start_offset + 1, token.location.end_offset)]]
|
386
536
|
end
|
387
537
|
end
|
388
538
|
|
@@ -391,6 +541,11 @@ module Prism
|
|
391
541
|
|
392
542
|
private
|
393
543
|
|
544
|
+
# Creates a new parser-gem range, taking Prism's byte offsets into account.
#
# Both offsets are looked up in `offset_cache` before the range is built on
# `source_buffer`.
# NOTE(review): `offset_cache` presumably maps byte offsets to character
# offsets, and `Range` is presumably the parser gem's source range class —
# confirm against the part of the file not shown here.
def range(start_offset, end_offset)
  Range.new(source_buffer, offset_cache[start_offset], offset_cache[end_offset])
end
|
548
|
+
|
394
549
|
# Parse an integer from the string representation.
|
395
550
|
def parse_integer(value)
|
396
551
|
Integer(value)
|
@@ -432,6 +587,233 @@ module Prism
|
|
432
587
|
rescue ArgumentError
|
433
588
|
0r
|
434
589
|
end
|
590
|
+
|
591
|
+
# Computes the indentation width shared by the lines of a squiggly heredoc,
# following the wonky heredoc tab/spaces rules:
# https://github.com/ruby/prism/blob/v1.3.0/src/prism.c#L10548-L10558
#
# Scanning walks `lexed` starting at `heredoc_token_index` and stops at the
# HEREDOC_END that closes the heredoc being measured. A tab advances the
# width to the next multiple of 8, every other whitespace character counts 1.
#
# Returns the smallest width seen, 0 when a line starts with interpolation or
# a nested heredoc, or Float::MAX when no line contributed a width.
def calculate_heredoc_whitespace(heredoc_token_index)
  next_token_index = heredoc_token_index
  nesting_level = 0
  previous_line = -1
  result = Float::MAX

  while (next_token = lexed[next_token_index]&.first)
    next_token_index += 1
    next_next_token, _ = lexed[next_token_index]
    first_token_on_line = next_token.location.start_column == 0

    case next_token.type
    when :HEREDOC_START, :EMBEXPR_BEGIN
      # String content inside nested heredocs and interpolation is ignored.
      # When interpolation is the first token of a line there is no string
      # content to check against. There will be no common whitespace.
      result = 0 if nesting_level == 0 && first_token_on_line
      nesting_level += 1
    when :HEREDOC_END, :EMBEXPR_END
      nesting_level -= 1
      # When we encountered the matching heredoc end, we can exit
      break if nesting_level == -1
    when :STRING_CONTENT
      next unless nesting_level == 0 && first_token_on_line

      leading = next_token.value[/^\s*/]
      common_whitespace = 0
      leading.each_char do |char|
        if char == "\t"
          common_whitespace = (common_whitespace / 8 + 1) * 8
        else
          common_whitespace += 1
        end
      end

      is_first_token_on_line = next_token.location.start_line != previous_line
      # Whitespace is significant if followed by interpolation. The token is
      # whitespace-only when the leading whitespace spans the whole value;
      # this is a character count, not the tab-expanded column width.
      whitespace_only = leading.length == next_token.value.length &&
        next_next_token&.location&.start_line != next_token.location.start_line

      if is_first_token_on_line && !whitespace_only && common_whitespace < result
        result = common_whitespace
        previous_line = next_token.location.start_line
      end
    end
  end

  result
end
|
637
|
+
|
638
|
+
# Removes up to `heredoc.common_whitespace` columns of leading spaces/tabs
# from `string`, following the wonky heredoc tab/spaces rules:
# https://github.com/ruby/prism/blob/v1.3.0/src/prism.c#L16528-L16545
#
# A tab advances to the next multiple of 8 columns and is kept when removing
# it would trim more than the common width. Returns the remaining string.
def trim_heredoc_whitespace(string, heredoc)
  trimmed_whitespace = 0
  trimmed_characters = 0

  while trimmed_whitespace < heredoc.common_whitespace
    case string[trimmed_characters]
    when "\t"
      trimmed_whitespace = (trimmed_whitespace / 8 + 1) * 8
      # This tab overshoots the common indentation, so it stays.
      break if trimmed_whitespace > heredoc.common_whitespace
    when " "
      trimmed_whitespace += 1
    else
      # First non-indentation character (or end of string).
      break
    end

    trimmed_characters += 1
  end

  string[trimmed_characters..]
end
|
655
|
+
|
656
|
+
# Escape sequences that have a special meaning and should appear unescaped in
# the resulting string. Keys are the character following the backslash.
ESCAPES = {
  "a" => "\a", "b" => "\b", "e" => "\e", "f" => "\f",
  "n" => "\n", "r" => "\r", "s" => "\s", "t" => "\t",
  "v" => "\v", "\\" => "\\"
}.freeze
|
662
|
+
private_constant :ESCAPES
|
663
|
+
|
664
|
+
# Maps an opening delimiter to its closing counterpart. When one of these
# delimiters is encountered, then the other one is allowed to be escaped as
# well.
DELIMITER_SYMETRY = { "[" => "]", "(" => ")", "{" => "}", "<" => ">" }.freeze
|
667
|
+
private_constant :DELIMITER_SYMETRY
|
668
|
+
|
669
|
+
|
670
|
+
# Characters with a special meaning inside a regexp, see
# https://github.com/whitequark/parser/blob/v3.3.6.0/lib/parser/lexer-strings.rl#L14
# Frozen like the other lookup constants so it cannot be mutated by accident.
REGEXP_META_CHARACTERS = ["\\", "$", "(", ")", "*", "+", ".", "<", ">", "?", "[", "]", "^", "{", "|", "}"].freeze
|
672
|
+
private_constant :REGEXP_META_CHARACTERS
|
673
|
+
|
674
|
+
# Apply Ruby string escaping rules to `string`.
#
# `quote` is the text that opened the literal (`"`, `'`, `%w{`, `/`, `<<'`,
# ...); it selects which escapes are processed. Returns `string` itself when
# nothing needs unescaping, otherwise a new string.
def unescape_string(string, quote)
  # In single-quoted heredocs, everything is taken literally.
  return string if quote == "<<'"

  # OPTIMIZATION: Assume that few strings need escaping to speed up the common case.
  return string unless string.include?("\\")

  # Enclosing character for the string. `"` for `"foo"`, `{` for `%w{foo}`, etc.
  delimiter = quote[-1]

  if regexp?(quote)
    # Handled similarly to single-quoted heredocs: the only character that is
    # allowed to be escaped is the delimiter, except when that also has special
    # meaning in the regexp. Since all the symmetric delimiters have special
    # meaning, they don't need to be considered separately.
    if REGEXP_META_CHARACTERS.include?(delimiter)
      string
    else
      # There can never be an even amount of backslashes. It would be a syntax error.
      string.gsub(/\\(#{Regexp.escape(delimiter)})/, '\1')
    end
  elsif interpolation?(quote)
    # Appending individual escape sequences may force the string out of its intended
    # encoding. Start out with binary and force it back later.
    result = "".b

    scanner = StringScanner.new(string)
    while (skipped = scanner.skip_until(/\\/))
      # Append what was just skipped over, excluding the found backslash.
      result.append_as_bytes(string.byteslice(scanner.pos - skipped, skipped - 1))
      escape_read(result, scanner, false, false)
    end

    # Add remaining chars
    result.append_as_bytes(string.byteslice(scanner.pos..))
    result.force_encoding(source_buffer.source.encoding)
  else
    # Only the backslash and the delimiter (plus its closing counterpart,
    # if it has one) can be escaped here.
    delimiters = Regexp.escape("#{delimiter}#{DELIMITER_SYMETRY[delimiter]}")
    string.gsub(/\\([\\#{delimiters}])/, '\1')
  end
end
|
716
|
+
|
717
|
+
# Certain strings are merged into a single string token. Returns whether the
# content `value` of a literal opened with `quote` qualifies.
def simplify_string?(value, quote)
  case quote
  when "'"
    # Only simplify 'foo'
    !value.include?("\n")
  when '"'
    # Simplify when every line ends with a line continuation, or it is the last line
    value.lines.all? do |line|
      # The pattern always matches (possibly with zero backslashes), so no nil check is needed.
      !line.end_with?("\n") || line[/\\*$/].length.odd?
    end
  else
    # %q and similar are never simplified
    false
  end
end
|
733
|
+
|
734
|
+
# Escape a byte value, given the control and meta flags.
#
# `control` masks the value with 0x9f, `meta` sets the high bit (0x80).
# Returns the resulting Integer.
def escape_build(value, control, meta)
  value &= 0x9f if control
  value |= 0x80 if meta
  value
end
|
740
|
+
|
741
|
+
# Read an escape out of the string scanner, given the control and meta
|
742
|
+
# flags, and push the unescaped value into the result.
|
743
|
+
def escape_read(result, scanner, control, meta)
|
744
|
+
if scanner.skip("\n")
|
745
|
+
# Line continuation
|
746
|
+
elsif (value = ESCAPES[scanner.peek(1)])
|
747
|
+
# Simple single-character escape sequences like \n
|
748
|
+
result.append_as_bytes(value)
|
749
|
+
scanner.pos += 1
|
750
|
+
elsif (value = scanner.scan(/[0-7]{1,3}/))
|
751
|
+
# \nnn
|
752
|
+
result.append_as_bytes(escape_build(value.to_i(8), control, meta))
|
753
|
+
elsif (value = scanner.scan(/x[0-9a-fA-F]{1,2}/))
|
754
|
+
# \xnn
|
755
|
+
result.append_as_bytes(escape_build(value[1..].to_i(16), control, meta))
|
756
|
+
elsif (value = scanner.scan(/u[0-9a-fA-F]{4}/))
|
757
|
+
# \unnnn
|
758
|
+
result.append_as_bytes(value[1..].hex.chr(Encoding::UTF_8))
|
759
|
+
elsif scanner.skip("u{}")
|
760
|
+
# https://github.com/whitequark/parser/issues/856
|
761
|
+
elsif (value = scanner.scan(/u{.*?}/))
|
762
|
+
# \u{nnnn ...}
|
763
|
+
value[2..-2].split.each do |unicode|
|
764
|
+
result.append_as_bytes(unicode.hex.chr(Encoding::UTF_8))
|
765
|
+
end
|
766
|
+
elsif (value = scanner.scan(/c\\?(?=[[:print:]])|C-\\?(?=[[:print:]])/))
|
767
|
+
# \cx or \C-x where x is an ASCII printable character
|
768
|
+
escape_read(result, scanner, true, meta)
|
769
|
+
elsif (value = scanner.scan(/M-\\?(?=[[:print:]])/))
|
770
|
+
# \M-x where x is an ASCII printable character
|
771
|
+
escape_read(result, scanner, control, true)
|
772
|
+
elsif (byte = scanner.scan_byte)
|
773
|
+
# Something else after an escape.
|
774
|
+
if control && byte == 0x3f # ASCII '?'
|
775
|
+
result.append_as_bytes(escape_build(0x7f, false, meta))
|
776
|
+
else
|
777
|
+
result.append_as_bytes(escape_build(byte, control, meta))
|
778
|
+
end
|
779
|
+
end
|
780
|
+
end
|
781
|
+
|
782
|
+
# In a percent array, certain whitespace can be preceded with a backslash,
# causing the following characters to be part of the previous element.
#
# Removes one backslash in front of such whitespace when it is preceded by an
# odd number of backslashes; an even run is left untouched. The whole run is
# captured so that its length is actually counted.
def percent_array_unescape(string)
  string.gsub(/(\\+)[ \f\n\r\t\v]/) do |full_match|
    Regexp.last_match(1).length.odd? ? full_match.delete_prefix("\\") : full_match
  end
end
|
790
|
+
|
791
|
+
# For %-arrays whitespace, the parser gem only considers whitespace before the newline.
#
# Returns the number of characters before the first newline in `string`
# (the whole length when there is none), or 1 when `string` starts with a
# newline.
def percent_array_leading_whitespace(string)
  return 1 if string.start_with?("\n")

  string.index("\n") || string.length
end
|
802
|
+
|
803
|
+
# Determine if characters preceded by a backslash should be escaped or not.
#
# False for quotes ending in `'` and for %q, %w, %i and %s literals; true for
# everything else.
def interpolation?(quote)
  !quote.end_with?("'") && !quote.start_with?("%q", "%w", "%i", "%s")
end
|
807
|
+
|
808
|
+
# Regexps allow interpolation but are handled differently during unescaping.
# True when `quote` is `/` or starts with `%r`.
def regexp?(quote)
  quote == "/" || quote.start_with?("%r")
end
|
812
|
+
|
813
|
+
# Determine if the string is part of a %-style array, i.e. whether `quote`
# starts with %w, %W, %i or %I.
def percent_array?(quote)
  quote.start_with?("%w", "%W", "%i", "%I")
end
|
435
817
|
end
|
436
818
|
end
|
437
819
|
end
|