prism 1.3.0 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +24 -1
  3. data/config.yml +9 -0
  4. data/docs/releasing.md +1 -1
  5. data/docs/ruby_api.md +1 -1
  6. data/ext/prism/api_node.c +1814 -1303
  7. data/ext/prism/extension.c +230 -109
  8. data/ext/prism/extension.h +4 -4
  9. data/include/prism/ast.h +16 -0
  10. data/include/prism/defines.h +4 -1
  11. data/include/prism/options.h +47 -1
  12. data/include/prism/util/pm_buffer.h +10 -0
  13. data/include/prism/version.h +2 -2
  14. data/include/prism.h +4 -4
  15. data/lib/prism/dot_visitor.rb +16 -0
  16. data/lib/prism/dsl.rb +10 -2
  17. data/lib/prism/ffi.rb +45 -27
  18. data/lib/prism/inspect_visitor.rb +2 -1
  19. data/lib/prism/node.rb +48 -10
  20. data/lib/prism/parse_result/newlines.rb +1 -1
  21. data/lib/prism/parse_result.rb +52 -0
  22. data/lib/prism/polyfill/append_as_bytes.rb +15 -0
  23. data/lib/prism/reflection.rb +2 -2
  24. data/lib/prism/serialize.rb +1252 -765
  25. data/lib/prism/translation/parser/builder.rb +61 -0
  26. data/lib/prism/translation/parser/compiler.rb +192 -136
  27. data/lib/prism/translation/parser/lexer.rb +435 -61
  28. data/lib/prism/translation/parser.rb +51 -3
  29. data/lib/prism/translation/parser35.rb +12 -0
  30. data/lib/prism/translation/ripper.rb +13 -3
  31. data/lib/prism/translation/ruby_parser.rb +5 -4
  32. data/lib/prism/translation.rb +1 -0
  33. data/lib/prism.rb +3 -3
  34. data/prism.gemspec +5 -1
  35. data/rbi/prism/dsl.rbi +6 -3
  36. data/rbi/prism/node.rbi +22 -7
  37. data/rbi/prism/parse_result.rbi +17 -0
  38. data/rbi/prism/translation/parser35.rbi +6 -0
  39. data/rbi/prism.rbi +39 -36
  40. data/sig/prism/dsl.rbs +4 -2
  41. data/sig/prism/node.rbs +17 -7
  42. data/sig/prism/parse_result.rbs +10 -0
  43. data/sig/prism/serialize.rbs +4 -2
  44. data/sig/prism.rbs +22 -1
  45. data/src/diagnostic.c +2 -2
  46. data/src/node.c +21 -0
  47. data/src/options.c +31 -0
  48. data/src/prettyprint.c +30 -0
  49. data/src/prism.c +374 -118
  50. data/src/serialize.c +6 -0
  51. data/src/util/pm_buffer.c +40 -0
  52. data/src/util/pm_constant_pool.c +6 -2
  53. data/src/util/pm_strncasecmp.c +13 -1
  54. metadata +7 -7
@@ -1,21 +1,25 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require "strscan"
4
+ require_relative "../../polyfill/append_as_bytes"
5
+
3
6
  module Prism
4
7
  module Translation
5
8
  class Parser
6
9
  # Accepts a list of prism tokens and converts them into the expected
7
10
  # format for the parser gem.
8
11
  class Lexer
12
+ # These tokens are always skipped
13
+ TYPES_ALWAYS_SKIP = Set.new(%i[IGNORED_NEWLINE __END__ EOF])
14
+ private_constant :TYPES_ALWAYS_SKIP
15
+
9
16
  # The direct translating of types between the two lexers.
10
17
  TYPES = {
11
18
  # These tokens should never appear in the output of the lexer.
12
- EOF: nil,
13
19
  MISSING: nil,
14
20
  NOT_PROVIDED: nil,
15
- IGNORED_NEWLINE: nil,
16
21
  EMBDOC_END: nil,
17
22
  EMBDOC_LINE: nil,
18
- __END__: nil,
19
23
 
20
24
  # These tokens have more or less direct mappings.
21
25
  AMPERSAND: :tAMPER2,
@@ -191,16 +195,24 @@ module Prism
191
195
  #
192
196
  # NOTE: In edge cases like `-> (foo = -> (bar) {}) do end`, please note that `kDO` is still returned
193
197
  # instead of `kDO_LAMBDA`, which is expected: https://github.com/ruby/prism/pull/3046
194
- LAMBDA_TOKEN_TYPES = [:kDO_LAMBDA, :tLAMBDA, :tLAMBEG]
198
+ LAMBDA_TOKEN_TYPES = Set.new([:kDO_LAMBDA, :tLAMBDA, :tLAMBEG])
195
199
 
196
200
  # The `PARENTHESIS_LEFT` token in Prism is classified as either `tLPAREN` or `tLPAREN2` in the Parser gem.
197
201
  # The following token types are listed as those classified as `tLPAREN`.
198
- LPAREN_CONVERSION_TOKEN_TYPES = [
202
+ LPAREN_CONVERSION_TOKEN_TYPES = Set.new([
199
203
  :kBREAK, :kCASE, :tDIVIDE, :kFOR, :kIF, :kNEXT, :kRETURN, :kUNTIL, :kWHILE, :tAMPER, :tANDOP, :tBANG, :tCOMMA, :tDOT2, :tDOT3,
200
- :tEQL, :tLPAREN, :tLPAREN2, :tLSHFT, :tNL, :tOP_ASGN, :tOROP, :tPIPE, :tSEMI, :tSTRING_DBEG, :tUMINUS, :tUPLUS
201
- ]
204
+ :tEQL, :tLPAREN, :tLPAREN2, :tLPAREN_ARG, :tLSHFT, :tNL, :tOP_ASGN, :tOROP, :tPIPE, :tSEMI, :tSTRING_DBEG, :tUMINUS, :tUPLUS
205
+ ])
206
+
207
+ # Types of tokens that are allowed to continue a method call with comments in-between.
208
+ # For these, the parser gem doesn't emit a newline token after the last comment.
209
+ COMMENT_CONTINUATION_TYPES = Set.new([:COMMENT, :AMPERSAND_DOT, :DOT])
210
+ private_constant :COMMENT_CONTINUATION_TYPES
211
+
212
+ # Heredocs are complex and require us to keep track of a bit of info to refer to later
213
+ HeredocData = Struct.new(:identifier, :common_whitespace, keyword_init: true)
202
214
 
203
- private_constant :TYPES, :EXPR_BEG, :EXPR_LABEL, :LAMBDA_TOKEN_TYPES, :LPAREN_CONVERSION_TOKEN_TYPES
215
+ private_constant :TYPES, :EXPR_BEG, :EXPR_LABEL, :LAMBDA_TOKEN_TYPES, :LPAREN_CONVERSION_TOKEN_TYPES, :HeredocData
204
216
 
205
217
  # The Parser::Source::Buffer that the tokens were lexed from.
206
218
  attr_reader :source_buffer
@@ -230,46 +242,78 @@ module Prism
230
242
  index = 0
231
243
  length = lexed.length
232
244
 
233
- heredoc_identifier_stack = []
245
+ heredoc_stack = []
246
+ quote_stack = []
247
+
248
+ # The parser gem emits the newline tokens for comments out of order. This saves
249
+ # that token location to emit at a later time to properly line everything up.
250
+ # https://github.com/whitequark/parser/issues/1025
251
+ comment_newline_location = nil
234
252
 
235
253
  while index < length
236
254
  token, state = lexed[index]
237
255
  index += 1
238
- next if %i[IGNORED_NEWLINE __END__ EOF].include?(token.type)
256
+ next if TYPES_ALWAYS_SKIP.include?(token.type)
239
257
 
240
258
  type = TYPES.fetch(token.type)
241
259
  value = token.value
242
- location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.end_offset])
260
+ location = range(token.location.start_offset, token.location.end_offset)
243
261
 
244
262
  case type
245
263
  when :kDO
246
- types = tokens.map(&:first)
247
- nearest_lambda_token_type = types.reverse.find { |type| LAMBDA_TOKEN_TYPES.include?(type) }
264
+ nearest_lambda_token = tokens.reverse_each.find do |token|
265
+ LAMBDA_TOKEN_TYPES.include?(token.first)
266
+ end
248
267
 
249
- if nearest_lambda_token_type == :tLAMBDA
268
+ if nearest_lambda_token&.first == :tLAMBDA
250
269
  type = :kDO_LAMBDA
251
270
  end
252
271
  when :tCHARACTER
253
272
  value.delete_prefix!("?")
273
+ # Character literals behave similarly to double-quoted strings. We can use the same escaping mechanism.
274
+ value = unescape_string(value, "?")
254
275
  when :tCOMMENT
255
276
  if token.type == :EMBDOC_BEGIN
256
- start_index = index
257
277
 
258
278
  while !((next_token = lexed[index][0]) && next_token.type == :EMBDOC_END) && (index < length - 1)
259
279
  value += next_token.value
260
280
  index += 1
261
281
  end
262
282
 
263
- if start_index != index
264
- value += next_token.value
265
- location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[lexed[index][0].location.end_offset])
266
- index += 1
267
- end
283
+ value += next_token.value
284
+ location = range(token.location.start_offset, lexed[index][0].location.end_offset)
285
+ index += 1
268
286
  else
269
- value.chomp!
270
- location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.end_offset - 1])
287
+ is_at_eol = value.chomp!.nil?
288
+ location = range(token.location.start_offset, token.location.end_offset + (is_at_eol ? 0 : -1))
289
+
290
+ prev_token = lexed[index - 2][0] if index - 2 >= 0
291
+ next_token = lexed[index][0]
292
+
293
+ is_inline_comment = prev_token&.location&.start_line == token.location.start_line
294
+ if is_inline_comment && !is_at_eol && !COMMENT_CONTINUATION_TYPES.include?(next_token&.type)
295
+ tokens << [:tCOMMENT, [value, location]]
296
+
297
+ nl_location = range(token.location.end_offset - 1, token.location.end_offset)
298
+ tokens << [:tNL, [nil, nl_location]]
299
+ next
300
+ elsif is_inline_comment && next_token&.type == :COMMENT
301
+ comment_newline_location = range(token.location.end_offset - 1, token.location.end_offset)
302
+ elsif comment_newline_location && !COMMENT_CONTINUATION_TYPES.include?(next_token&.type)
303
+ tokens << [:tCOMMENT, [value, location]]
304
+ tokens << [:tNL, [nil, comment_newline_location]]
305
+ comment_newline_location = nil
306
+ next
307
+ end
271
308
  end
272
309
  when :tNL
310
+ next_token = next_token = lexed[index][0]
311
+ # Newlines after comments are emitted out of order.
312
+ if next_token&.type == :COMMENT
313
+ comment_newline_location = location
314
+ next
315
+ end
316
+
273
317
  value = nil
274
318
  when :tFLOAT
275
319
  value = parse_float(value)
@@ -277,8 +321,8 @@ module Prism
277
321
  value = parse_complex(value)
278
322
  when :tINTEGER
279
323
  if value.start_with?("+")
280
- tokens << [:tUNARY_NUM, ["+", Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.start_offset + 1])]]
281
- location = Range.new(source_buffer, offset_cache[token.location.start_offset + 1], offset_cache[token.location.end_offset])
324
+ tokens << [:tUNARY_NUM, ["+", range(token.location.start_offset, token.location.start_offset + 1)]]
325
+ location = range(token.location.start_offset + 1, token.location.end_offset)
282
326
  end
283
327
 
284
328
  value = parse_integer(value)
@@ -297,68 +341,154 @@ module Prism
297
341
  when :tRATIONAL
298
342
  value = parse_rational(value)
299
343
  when :tSPACE
344
+ location = range(token.location.start_offset, token.location.start_offset + percent_array_leading_whitespace(value))
300
345
  value = nil
301
346
  when :tSTRING_BEG
302
- if token.type == :HEREDOC_START
303
- heredoc_identifier_stack.push(value.match(/<<[-~]?["'`]?(?<heredoc_identifier>.*?)["'`]?\z/)[:heredoc_identifier])
304
- end
305
- if ["\"", "'"].include?(value) && (next_token = lexed[index][0]) && next_token.type == :STRING_END
347
+ next_token = lexed[index][0]
348
+ next_next_token = lexed[index + 1][0]
349
+ basic_quotes = value == '"' || value == "'"
350
+
351
+ if basic_quotes && next_token&.type == :STRING_END
306
352
  next_location = token.location.join(next_token.location)
307
353
  type = :tSTRING
308
354
  value = ""
309
- location = Range.new(source_buffer, offset_cache[next_location.start_offset], offset_cache[next_location.end_offset])
355
+ location = range(next_location.start_offset, next_location.end_offset)
310
356
  index += 1
311
- elsif ["\"", "'"].include?(value) && (next_token = lexed[index][0]) && next_token.type == :STRING_CONTENT && next_token.value.lines.count <= 1 && (next_next_token = lexed[index + 1][0]) && next_next_token.type == :STRING_END
312
- next_location = token.location.join(next_next_token.location)
313
- type = :tSTRING
314
- value = next_token.value.gsub("\\\\", "\\")
315
- location = Range.new(source_buffer, offset_cache[next_location.start_offset], offset_cache[next_location.end_offset])
316
- index += 2
317
- elsif value.start_with?("<<")
357
+ elsif value.start_with?("'", '"', "%")
358
+ if next_token&.type == :STRING_CONTENT && next_next_token&.type == :STRING_END
359
+ string_value = next_token.value
360
+ if simplify_string?(string_value, value)
361
+ next_location = token.location.join(next_next_token.location)
362
+ if percent_array?(value)
363
+ value = percent_array_unescape(string_value)
364
+ else
365
+ value = unescape_string(string_value, value)
366
+ end
367
+ type = :tSTRING
368
+ location = range(next_location.start_offset, next_location.end_offset)
369
+ index += 2
370
+ tokens << [type, [value, location]]
371
+
372
+ next
373
+ end
374
+ end
375
+
376
+ quote_stack.push(value)
377
+ elsif token.type == :HEREDOC_START
318
378
  quote = value[2] == "-" || value[2] == "~" ? value[3] : value[2]
379
+ heredoc_type = value[2] == "-" || value[2] == "~" ? value[2] : ""
380
+ heredoc = HeredocData.new(
381
+ identifier: value.match(/<<[-~]?["'`]?(?<heredoc_identifier>.*?)["'`]?\z/)[:heredoc_identifier],
382
+ common_whitespace: 0,
383
+ )
384
+
319
385
  if quote == "`"
320
386
  type = :tXSTRING_BEG
321
- value = "<<`"
387
+ end
388
+
389
+ # The parser gem trims whitespace from squiggly heredocs. We must record
390
+ # the most common whitespace to later remove.
391
+ if heredoc_type == "~" || heredoc_type == "`"
392
+ heredoc.common_whitespace = calculate_heredoc_whitespace(index)
393
+ end
394
+
395
+ if quote == "'" || quote == '"' || quote == "`"
396
+ value = "<<#{quote}"
322
397
  else
323
- value = "<<#{quote == "'" || quote == "\"" ? quote : "\""}"
398
+ value = '<<"'
324
399
  end
400
+
401
+ heredoc_stack.push(heredoc)
402
+ quote_stack.push(value)
325
403
  end
326
404
  when :tSTRING_CONTENT
327
- unless (lines = token.value.lines).one?
328
- start_offset = offset_cache[token.location.start_offset]
329
- lines.map do |line|
330
- newline = line.end_with?("\r\n") ? "\r\n" : "\n"
405
+ is_percent_array = percent_array?(quote_stack.last)
406
+
407
+ if (lines = token.value.lines).one?
408
+ # Prism usually emits a single token for strings with line continuations.
409
+ # For squiggly heredocs they are not joined so we do that manually here.
410
+ current_string = +""
411
+ current_length = 0
412
+ start_offset = token.location.start_offset
413
+ while token.type == :STRING_CONTENT
414
+ current_length += token.value.bytesize
415
+ # Heredoc interpolation can have multiple STRING_CONTENT nodes on the same line.
416
+ is_first_token_on_line = lexed[index - 1] && token.location.start_line != lexed[index - 2][0].location&.start_line
417
+ # The parser gem only removes indentation when the heredoc is not nested
418
+ not_nested = heredoc_stack.size == 1
419
+ if is_percent_array
420
+ value = percent_array_unescape(token.value)
421
+ elsif is_first_token_on_line && not_nested && (current_heredoc = heredoc_stack.last).common_whitespace > 0
422
+ value = trim_heredoc_whitespace(token.value, current_heredoc)
423
+ end
424
+
425
+ current_string << unescape_string(value, quote_stack.last)
426
+ if (backslash_count = token.value[/(\\{1,})\n/, 1]&.length).nil? || backslash_count.even? || !interpolation?(quote_stack.last)
427
+ tokens << [:tSTRING_CONTENT, [current_string, range(start_offset, start_offset + current_length)]]
428
+ break
429
+ end
430
+ token = lexed[index][0]
431
+ index += 1
432
+ end
433
+ else
434
+ # When the parser gem encounters a line continuation inside of a multiline string,
435
+ # it emits a single string node. The backslash (and remaining newline) is removed.
436
+ current_line = +""
437
+ adjustment = 0
438
+ start_offset = token.location.start_offset
439
+ emit = false
440
+
441
+ lines.each.with_index do |line, index|
331
442
  chomped_line = line.chomp
332
- if match = chomped_line.match(/(?<backslashes>\\+)\z/)
333
- adjustment = match[:backslashes].size / 2
334
- adjusted_line = chomped_line.delete_suffix("\\" * adjustment)
335
- if match[:backslashes].size.odd?
336
- adjusted_line.delete_suffix!("\\")
337
- adjustment += 2
443
+ backslash_count = chomped_line[/\\{1,}\z/]&.length || 0
444
+ is_interpolation = interpolation?(quote_stack.last)
445
+
446
+ if backslash_count.odd? && (is_interpolation || is_percent_array)
447
+ if is_percent_array
448
+ current_line << percent_array_unescape(line)
449
+ adjustment += 1
338
450
  else
339
- adjusted_line << newline
451
+ chomped_line.delete_suffix!("\\")
452
+ current_line << chomped_line
453
+ adjustment += 2
340
454
  end
455
+ # If the string ends with a line continuation emit the remainder
456
+ emit = index == lines.count - 1
341
457
  else
342
- adjusted_line = line
343
- adjustment = 0
458
+ current_line << line
459
+ emit = true
344
460
  end
345
461
 
346
- end_offset = start_offset + adjusted_line.length + adjustment
347
- tokens << [:tSTRING_CONTENT, [adjusted_line, Range.new(source_buffer, offset_cache[start_offset], offset_cache[end_offset])]]
348
- start_offset = end_offset
462
+ if emit
463
+ end_offset = start_offset + current_line.bytesize + adjustment
464
+ tokens << [:tSTRING_CONTENT, [unescape_string(current_line, quote_stack.last), range(start_offset, end_offset)]]
465
+ start_offset = end_offset
466
+ current_line = +""
467
+ adjustment = 0
468
+ end
349
469
  end
350
- next
351
470
  end
471
+ next
352
472
  when :tSTRING_DVAR
353
473
  value = nil
354
474
  when :tSTRING_END
355
475
  if token.type == :HEREDOC_END && value.end_with?("\n")
356
476
  newline_length = value.end_with?("\r\n") ? 2 : 1
357
- value = heredoc_identifier_stack.pop
358
- location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.end_offset - newline_length])
477
+ value = heredoc_stack.pop.identifier
478
+ location = range(token.location.start_offset, token.location.end_offset - newline_length)
359
479
  elsif token.type == :REGEXP_END
360
480
  value = value[0]
361
- location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.start_offset + 1])
481
+ location = range(token.location.start_offset, token.location.start_offset + 1)
482
+ end
483
+
484
+ if percent_array?(quote_stack.pop)
485
+ prev_token = lexed[index - 2][0] if index - 2 >= 0
486
+ empty = %i[PERCENT_LOWER_I PERCENT_LOWER_W PERCENT_UPPER_I PERCENT_UPPER_W].include?(prev_token&.type)
487
+ ends_with_whitespace = prev_token&.type == :WORDS_SEP
488
+ # parser always emits a space token after content in a percent array, even if no actual whitespace is present.
489
+ if !empty && !ends_with_whitespace
490
+ tokens << [:tSPACE, [nil, range(token.location.start_offset, token.location.start_offset)]]
491
+ end
362
492
  end
363
493
  when :tSYMBEG
364
494
  if (next_token = lexed[index][0]) && next_token.type != :STRING_CONTENT && next_token.type != :EMBEXPR_BEGIN && next_token.type != :EMBVAR && next_token.type != :STRING_END
@@ -366,23 +496,35 @@ module Prism
366
496
  type = :tSYMBOL
367
497
  value = next_token.value
368
498
  value = { "~@" => "~", "!@" => "!" }.fetch(value, value)
369
- location = Range.new(source_buffer, offset_cache[next_location.start_offset], offset_cache[next_location.end_offset])
499
+ location = range(next_location.start_offset, next_location.end_offset)
370
500
  index += 1
501
+ else
502
+ quote_stack.push(value)
371
503
  end
372
504
  when :tFID
373
505
  if !tokens.empty? && tokens.dig(-1, 0) == :kDEF
374
506
  type = :tIDENTIFIER
375
507
  end
376
508
  when :tXSTRING_BEG
377
- if (next_token = lexed[index][0]) && next_token.type != :STRING_CONTENT && next_token.type != :STRING_END
509
+ if (next_token = lexed[index][0]) && !%i[STRING_CONTENT STRING_END EMBEXPR_BEGIN].include?(next_token.type)
510
+ # self.`()
378
511
  type = :tBACK_REF2
379
512
  end
513
+ quote_stack.push(value)
514
+ when :tSYMBOLS_BEG, :tQSYMBOLS_BEG, :tWORDS_BEG, :tQWORDS_BEG
515
+ if (next_token = lexed[index][0]) && next_token.type == :WORDS_SEP
516
+ index += 1
517
+ end
518
+
519
+ quote_stack.push(value)
520
+ when :tREGEXP_BEG
521
+ quote_stack.push(value)
380
522
  end
381
523
 
382
524
  tokens << [type, [value, location]]
383
525
 
384
526
  if token.type == :REGEXP_END
385
- tokens << [:tREGEXP_OPT, [token.value[1..], Range.new(source_buffer, offset_cache[token.location.start_offset + 1], offset_cache[token.location.end_offset])]]
527
+ tokens << [:tREGEXP_OPT, [token.value[1..], range(token.location.start_offset + 1, token.location.end_offset)]]
386
528
  end
387
529
  end
388
530
 
@@ -391,6 +533,11 @@ module Prism
391
533
 
392
534
  private
393
535
 
536
+ # Creates a new parser range, taking prism's byte offsets into account
537
+ def range(start_offset, end_offset)
538
+ Range.new(source_buffer, offset_cache[start_offset], offset_cache[end_offset])
539
+ end
540
+
394
541
  # Parse an integer from the string representation.
395
542
  def parse_integer(value)
396
543
  Integer(value)
@@ -432,6 +579,233 @@ module Prism
432
579
  rescue ArgumentError
433
580
  0r
434
581
  end
582
+
583
+ # Wonky heredoc tab/spaces rules.
584
+ # https://github.com/ruby/prism/blob/v1.3.0/src/prism.c#L10548-L10558
585
+ def calculate_heredoc_whitespace(heredoc_token_index)
586
+ next_token_index = heredoc_token_index
587
+ nesting_level = 0
588
+ previous_line = -1
589
+ result = Float::MAX
590
+
591
+ while (lexed[next_token_index] && next_token = lexed[next_token_index][0])
592
+ next_token_index += 1
593
+ next_next_token = lexed[next_token_index] && lexed[next_token_index][0]
594
+ first_token_on_line = next_token.location.start_column == 0
595
+
596
+ # String content inside nested heredocs and interpolation is ignored
597
+ if next_token.type == :HEREDOC_START || next_token.type == :EMBEXPR_BEGIN
598
+ # When interpolation is the first token of a line there is no string
599
+ # content to check against. There will be no common whitespace.
600
+ if nesting_level == 0 && first_token_on_line
601
+ result = 0
602
+ end
603
+ nesting_level += 1
604
+ elsif next_token.type == :HEREDOC_END || next_token.type == :EMBEXPR_END
605
+ nesting_level -= 1
606
+ # When we encountered the matching heredoc end, we can exit
607
+ break if nesting_level == -1
608
+ elsif next_token.type == :STRING_CONTENT && nesting_level == 0 && first_token_on_line
609
+ common_whitespace = 0
610
+ next_token.value[/^\s*/].each_char do |char|
611
+ if char == "\t"
612
+ common_whitespace = (common_whitespace / 8 + 1) * 8;
613
+ else
614
+ common_whitespace += 1
615
+ end
616
+ end
617
+
618
+ is_first_token_on_line = next_token.location.start_line != previous_line
619
+ # Whitespace is significant if followed by interpolation
620
+ whitespace_only = common_whitespace == next_token.value.length && next_next_token&.location&.start_line != next_token.location.start_line
621
+ if is_first_token_on_line && !whitespace_only && common_whitespace < result
622
+ result = common_whitespace
623
+ previous_line = next_token.location.start_line
624
+ end
625
+ end
626
+ end
627
+ result
628
+ end
629
+
630
+ # Wonky heredoc tab/spaces rules.
631
+ # https://github.com/ruby/prism/blob/v1.3.0/src/prism.c#L16528-L16545
632
+ def trim_heredoc_whitespace(string, heredoc)
633
+ trimmed_whitespace = 0
634
+ trimmed_characters = 0
635
+ while (string[trimmed_characters] == "\t" || string[trimmed_characters] == " ") && trimmed_whitespace < heredoc.common_whitespace
636
+ if string[trimmed_characters] == "\t"
637
+ trimmed_whitespace = (trimmed_whitespace / 8 + 1) * 8;
638
+ break if trimmed_whitespace > heredoc.common_whitespace
639
+ else
640
+ trimmed_whitespace += 1
641
+ end
642
+ trimmed_characters += 1
643
+ end
644
+
645
+ string[trimmed_characters..]
646
+ end
647
+
648
+ # Escape sequences that have special meaning and should appear unescaped in the resulting string.
649
+ ESCAPES = {
650
+ "a" => "\a", "b" => "\b", "e" => "\e", "f" => "\f",
651
+ "n" => "\n", "r" => "\r", "s" => "\s", "t" => "\t",
652
+ "v" => "\v", "\\" => "\\"
653
+ }.freeze
654
+ private_constant :ESCAPES
655
+
656
+ # When one of these delimiters is encountered, then the other
657
+ # one is allowed to be escaped as well.
658
+ DELIMITER_SYMETRY = { "[" => "]", "(" => ")", "{" => "}", "<" => ">" }.freeze
659
+ private_constant :DELIMITER_SYMETRY
660
+
661
+
662
+ # https://github.com/whitequark/parser/blob/v3.3.6.0/lib/parser/lexer-strings.rl#L14
663
+ REGEXP_META_CHARACTERS = ["\\", "$", "(", ")", "*", "+", ".", "<", ">", "?", "[", "]", "^", "{", "|", "}"]
664
+ private_constant :REGEXP_META_CHARACTERS
665
+
666
+ # Apply Ruby string escaping rules
667
+ def unescape_string(string, quote)
668
+ # In single-quoted heredocs, everything is taken literally.
669
+ return string if quote == "<<'"
670
+
671
+ # OPTIMIZATION: Assume that few strings need escaping to speed up the common case.
672
+ return string unless string.include?("\\")
673
+
674
+ # Enclosing character for the string. `"` for `"foo"`, `{` for `%w{foo}`, etc.
675
+ delimiter = quote[-1]
676
+
677
+ if regexp?(quote)
678
+ # Should be handled similarly to single-quoted heredocs. The only character that is
679
+ # allowed to be escaped is the delimiter, except when that also has special meaning
680
+ # in the regexp. Since all the symmetric delimiters have special meaning, they don't need
681
+ # to be considered separately.
682
+ if REGEXP_META_CHARACTERS.include?(delimiter)
683
+ string
684
+ else
685
+ # There can never be an even amount of backslashes. It would be a syntax error.
686
+ string.gsub(/\\(#{Regexp.escape(delimiter)})/, '\1')
687
+ end
688
+ elsif interpolation?(quote)
689
+ # Appending individual escape sequences may force the string out of its intended
690
+ # encoding. Start out with binary and force it back later.
691
+ result = "".b
692
+
693
+ scanner = StringScanner.new(string)
694
+ while (skipped = scanner.skip_until(/\\/))
695
+ # Append what was just skipped over, excluding the found backslash.
696
+ result.append_as_bytes(string.byteslice(scanner.pos - skipped, skipped - 1))
697
+ escape_read(result, scanner, false, false)
698
+ end
699
+
700
+ # Add remaining chars
701
+ result.append_as_bytes(string.byteslice(scanner.pos..))
702
+ result.force_encoding(source_buffer.source.encoding)
703
+ else
704
+ delimiters = Regexp.escape("#{delimiter}#{DELIMITER_SYMETRY[delimiter]}")
705
+ string.gsub(/\\([\\#{delimiters}])/, '\1')
706
+ end
707
+ end
708
+
709
+ # Certain strings are merged into a single string token.
710
+ def simplify_string?(value, quote)
711
+ case quote
712
+ when "'"
713
+ # Only simplify 'foo'
714
+ !value.include?("\n")
715
+ when '"'
716
+ # Simplify when every line ends with a line continuation, or it is the last line
717
+ value.lines.all? do |line|
718
+ !line.end_with?("\n") || line[/(\\*)$/, 1]&.length&.odd?
719
+ end
720
+ else
721
+ # %q and similar are never simplified
722
+ false
723
+ end
724
+ end
725
+
726
+ # Escape a byte value, given the control and meta flags.
727
+ def escape_build(value, control, meta)
728
+ value &= 0x9f if control
729
+ value |= 0x80 if meta
730
+ value
731
+ end
732
+
733
+ # Read an escape out of the string scanner, given the control and meta
734
+ # flags, and push the unescaped value into the result.
735
+ def escape_read(result, scanner, control, meta)
736
+ if scanner.skip("\n")
737
+ # Line continuation
738
+ elsif (value = ESCAPES[scanner.peek(1)])
739
+ # Simple single-character escape sequences like \n
740
+ result.append_as_bytes(value)
741
+ scanner.pos += 1
742
+ elsif (value = scanner.scan(/[0-7]{1,3}/))
743
+ # \nnn
744
+ result.append_as_bytes(escape_build(value.to_i(8), control, meta))
745
+ elsif (value = scanner.scan(/x[0-9a-fA-F]{1,2}/))
746
+ # \xnn
747
+ result.append_as_bytes(escape_build(value[1..].to_i(16), control, meta))
748
+ elsif (value = scanner.scan(/u[0-9a-fA-F]{4}/))
749
+ # \unnnn
750
+ result.append_as_bytes(value[1..].hex.chr(Encoding::UTF_8))
751
+ elsif scanner.skip("u{}")
752
+ # https://github.com/whitequark/parser/issues/856
753
+ elsif (value = scanner.scan(/u{.*?}/))
754
+ # \u{nnnn ...}
755
+ value[2..-2].split.each do |unicode|
756
+ result.append_as_bytes(unicode.hex.chr(Encoding::UTF_8))
757
+ end
758
+ elsif (value = scanner.scan(/c\\?(?=[[:print:]])|C-\\?(?=[[:print:]])/))
759
+ # \cx or \C-x where x is an ASCII printable character
760
+ escape_read(result, scanner, true, meta)
761
+ elsif (value = scanner.scan(/M-\\?(?=[[:print:]])/))
762
+ # \M-x where x is an ASCII printable character
763
+ escape_read(result, scanner, control, true)
764
+ elsif (byte = scanner.get_byte)
765
+ # Something else after an escape.
766
+ if control && byte == "?"
767
+ result.append_as_bytes(escape_build(0x7f, false, meta))
768
+ else
769
+ result.append_as_bytes(escape_build(byte.ord, control, meta))
770
+ end
771
+ end
772
+ end
773
+
774
+ # In a percent array, certain whitespace can be preceded with a backslash,
775
+ # causing the following characters to be part of the previous element.
776
+ def percent_array_unescape(string)
777
+ string.gsub(/(\\)+[ \f\n\r\t\v]/) do |full_match|
778
+ full_match.delete_prefix!("\\") if Regexp.last_match[1].length.odd?
779
+ full_match
780
+ end
781
+ end
782
+
783
+ # For %-arrays whitespace, the parser gem only considers whitespace before the newline.
784
+ def percent_array_leading_whitespace(string)
785
+ return 1 if string.start_with?("\n")
786
+
787
+ leading_whitespace = 0
788
+ string.each_char do |c|
789
+ break if c == "\n"
790
+ leading_whitespace += 1
791
+ end
792
+ leading_whitespace
793
+ end
794
+
795
+ # Determine if characters preceded by a backslash should be escaped or not
796
+ def interpolation?(quote)
797
+ !quote.end_with?("'") && !quote.start_with?("%q", "%w", "%i", "%s")
798
+ end
799
+
800
+ # Regexp allow interpolation but are handled differently during unescaping
801
+ def regexp?(quote)
802
+ quote == "/" || quote.start_with?("%r")
803
+ end
804
+
805
+ # Determine if the string is part of a %-style array.
806
+ def percent_array?(quote)
807
+ quote.start_with?("%w", "%W", "%i", "%I")
808
+ end
435
809
  end
436
810
  end
437
811
  end