prism 1.3.0 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +46 -1
  3. data/Makefile +2 -1
  4. data/README.md +1 -0
  5. data/config.yml +273 -37
  6. data/docs/parser_translation.md +8 -23
  7. data/docs/releasing.md +1 -1
  8. data/docs/ripper_translation.md +1 -1
  9. data/docs/ruby_api.md +1 -1
  10. data/ext/prism/api_node.c +1816 -1303
  11. data/ext/prism/extension.c +244 -110
  12. data/ext/prism/extension.h +4 -4
  13. data/include/prism/ast.h +291 -49
  14. data/include/prism/defines.h +4 -1
  15. data/include/prism/diagnostic.h +4 -0
  16. data/include/prism/options.h +89 -3
  17. data/include/prism/regexp.h +2 -2
  18. data/include/prism/util/pm_buffer.h +18 -0
  19. data/include/prism/util/pm_integer.h +4 -0
  20. data/include/prism/util/pm_list.h +6 -0
  21. data/include/prism/util/pm_string.h +12 -2
  22. data/include/prism/version.h +2 -2
  23. data/include/prism.h +41 -16
  24. data/lib/prism/compiler.rb +456 -151
  25. data/lib/prism/desugar_compiler.rb +1 -0
  26. data/lib/prism/dispatcher.rb +16 -0
  27. data/lib/prism/dot_visitor.rb +21 -1
  28. data/lib/prism/dsl.rb +13 -2
  29. data/lib/prism/ffi.rb +62 -34
  30. data/lib/prism/inspect_visitor.rb +5 -1
  31. data/lib/prism/lex_compat.rb +1 -0
  32. data/lib/prism/mutation_compiler.rb +3 -0
  33. data/lib/prism/node.rb +554 -345
  34. data/lib/prism/node_ext.rb +4 -1
  35. data/lib/prism/pack.rb +2 -0
  36. data/lib/prism/parse_result/comments.rb +1 -0
  37. data/lib/prism/parse_result/errors.rb +1 -0
  38. data/lib/prism/parse_result/newlines.rb +2 -1
  39. data/lib/prism/parse_result.rb +53 -0
  40. data/lib/prism/pattern.rb +1 -0
  41. data/lib/prism/polyfill/append_as_bytes.rb +15 -0
  42. data/lib/prism/polyfill/scan_byte.rb +14 -0
  43. data/lib/prism/polyfill/warn.rb +42 -0
  44. data/lib/prism/reflection.rb +5 -2
  45. data/lib/prism/relocation.rb +1 -0
  46. data/lib/prism/serialize.rb +1275 -783
  47. data/lib/prism/string_query.rb +1 -0
  48. data/lib/prism/translation/parser/builder.rb +62 -0
  49. data/lib/prism/translation/parser/compiler.rb +230 -152
  50. data/lib/prism/translation/parser/lexer.rb +446 -64
  51. data/lib/prism/translation/parser.rb +64 -4
  52. data/lib/prism/translation/parser33.rb +1 -0
  53. data/lib/prism/translation/parser34.rb +1 -0
  54. data/lib/prism/translation/parser35.rb +13 -0
  55. data/lib/prism/translation/parser_current.rb +24 -0
  56. data/lib/prism/translation/ripper/sexp.rb +1 -0
  57. data/lib/prism/translation/ripper.rb +30 -4
  58. data/lib/prism/translation/ruby_parser.rb +291 -7
  59. data/lib/prism/translation.rb +3 -0
  60. data/lib/prism/visitor.rb +457 -152
  61. data/lib/prism.rb +5 -3
  62. data/prism.gemspec +9 -1
  63. data/rbi/prism/dsl.rbi +9 -6
  64. data/rbi/prism/node.rbi +43 -16
  65. data/rbi/prism/parse_result.rbi +17 -0
  66. data/rbi/prism/translation/parser35.rbi +6 -0
  67. data/rbi/prism.rbi +39 -36
  68. data/sig/prism/dispatcher.rbs +3 -0
  69. data/sig/prism/dsl.rbs +7 -5
  70. data/sig/prism/node.rbs +461 -37
  71. data/sig/prism/node_ext.rbs +84 -17
  72. data/sig/prism/parse_result/comments.rbs +38 -0
  73. data/sig/prism/parse_result.rbs +14 -0
  74. data/sig/prism/reflection.rbs +1 -1
  75. data/sig/prism/serialize.rbs +4 -2
  76. data/sig/prism.rbs +22 -1
  77. data/src/diagnostic.c +9 -3
  78. data/src/node.c +23 -0
  79. data/src/options.c +33 -2
  80. data/src/prettyprint.c +32 -0
  81. data/src/prism.c +620 -242
  82. data/src/serialize.c +8 -0
  83. data/src/token_type.c +36 -34
  84. data/src/util/pm_buffer.c +40 -0
  85. data/src/util/pm_constant_pool.c +6 -2
  86. data/src/util/pm_strncasecmp.c +13 -1
  87. metadata +11 -7
@@ -1,4 +1,9 @@
1
1
  # frozen_string_literal: true
2
+ # :markup: markdown
3
+
4
+ require "strscan"
5
+ require_relative "../../polyfill/append_as_bytes"
6
+ require_relative "../../polyfill/scan_byte"
2
7
 
3
8
  module Prism
4
9
  module Translation
@@ -6,16 +11,17 @@ module Prism
6
11
  # Accepts a list of prism tokens and converts them into the expected
7
12
  # format for the parser gem.
8
13
  class Lexer
14
+ # These tokens are always skipped
15
+ TYPES_ALWAYS_SKIP = Set.new(%i[IGNORED_NEWLINE __END__ EOF])
16
+ private_constant :TYPES_ALWAYS_SKIP
17
+
9
18
  # The direct translating of types between the two lexers.
10
19
  TYPES = {
11
20
  # These tokens should never appear in the output of the lexer.
12
- EOF: nil,
13
21
  MISSING: nil,
14
22
  NOT_PROVIDED: nil,
15
- IGNORED_NEWLINE: nil,
16
23
  EMBDOC_END: nil,
17
24
  EMBDOC_LINE: nil,
18
- __END__: nil,
19
25
 
20
26
  # These tokens have more or less direct mappings.
21
27
  AMPERSAND: :tAMPER2,
@@ -191,16 +197,24 @@ module Prism
191
197
  #
192
198
  # NOTE: In edge cases like `-> (foo = -> (bar) {}) do end`, please note that `kDO` is still returned
193
199
  # instead of `kDO_LAMBDA`, which is expected: https://github.com/ruby/prism/pull/3046
194
- LAMBDA_TOKEN_TYPES = [:kDO_LAMBDA, :tLAMBDA, :tLAMBEG]
200
+ LAMBDA_TOKEN_TYPES = Set.new([:kDO_LAMBDA, :tLAMBDA, :tLAMBEG])
195
201
 
196
202
  # The `PARENTHESIS_LEFT` token in Prism is classified as either `tLPAREN` or `tLPAREN2` in the Parser gem.
197
203
  # The following token types are listed as those classified as `tLPAREN`.
198
- LPAREN_CONVERSION_TOKEN_TYPES = [
199
- :kBREAK, :kCASE, :tDIVIDE, :kFOR, :kIF, :kNEXT, :kRETURN, :kUNTIL, :kWHILE, :tAMPER, :tANDOP, :tBANG, :tCOMMA, :tDOT2, :tDOT3,
200
- :tEQL, :tLPAREN, :tLPAREN2, :tLSHFT, :tNL, :tOP_ASGN, :tOROP, :tPIPE, :tSEMI, :tSTRING_DBEG, :tUMINUS, :tUPLUS
201
- ]
204
+ LPAREN_CONVERSION_TOKEN_TYPES = Set.new([
205
+ :kBREAK, :tCARET, :kCASE, :tDIVIDE, :kFOR, :kIF, :kNEXT, :kRETURN, :kUNTIL, :kWHILE, :tAMPER, :tANDOP, :tBANG, :tCOMMA, :tDOT2, :tDOT3,
206
+ :tEQL, :tLPAREN, :tLPAREN2, :tLPAREN_ARG, :tLSHFT, :tNL, :tOP_ASGN, :tOROP, :tPIPE, :tSEMI, :tSTRING_DBEG, :tUMINUS, :tUPLUS, :tLCURLY
207
+ ])
208
+
209
+ # Types of tokens that are allowed to continue a method call with comments in-between.
210
+ # For these, the parser gem doesn't emit a newline token after the last comment.
211
+ COMMENT_CONTINUATION_TYPES = Set.new([:COMMENT, :AMPERSAND_DOT, :DOT])
212
+ private_constant :COMMENT_CONTINUATION_TYPES
213
+
214
+ # Heredocs are complex and require us to keep track of a bit of info to refer to later
215
+ HeredocData = Struct.new(:identifier, :common_whitespace, keyword_init: true)
202
216
 
203
- private_constant :TYPES, :EXPR_BEG, :EXPR_LABEL, :LAMBDA_TOKEN_TYPES, :LPAREN_CONVERSION_TOKEN_TYPES
217
+ private_constant :TYPES, :EXPR_BEG, :EXPR_LABEL, :LAMBDA_TOKEN_TYPES, :LPAREN_CONVERSION_TOKEN_TYPES, :HeredocData
204
218
 
205
219
  # The Parser::Source::Buffer that the tokens were lexed from.
206
220
  attr_reader :source_buffer
@@ -230,46 +244,78 @@ module Prism
230
244
  index = 0
231
245
  length = lexed.length
232
246
 
233
- heredoc_identifier_stack = []
247
+ heredoc_stack = []
248
+ quote_stack = []
249
+
250
+ # The parser gem emits the newline tokens for comments out of order. This saves
251
+ # that token location to emit at a later time to properly line everything up.
252
+ # https://github.com/whitequark/parser/issues/1025
253
+ comment_newline_location = nil
234
254
 
235
255
  while index < length
236
256
  token, state = lexed[index]
237
257
  index += 1
238
- next if %i[IGNORED_NEWLINE __END__ EOF].include?(token.type)
258
+ next if TYPES_ALWAYS_SKIP.include?(token.type)
239
259
 
240
260
  type = TYPES.fetch(token.type)
241
261
  value = token.value
242
- location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.end_offset])
262
+ location = range(token.location.start_offset, token.location.end_offset)
243
263
 
244
264
  case type
245
265
  when :kDO
246
- types = tokens.map(&:first)
247
- nearest_lambda_token_type = types.reverse.find { |type| LAMBDA_TOKEN_TYPES.include?(type) }
266
+ nearest_lambda_token = tokens.reverse_each.find do |token|
267
+ LAMBDA_TOKEN_TYPES.include?(token.first)
268
+ end
248
269
 
249
- if nearest_lambda_token_type == :tLAMBDA
270
+ if nearest_lambda_token&.first == :tLAMBDA
250
271
  type = :kDO_LAMBDA
251
272
  end
252
273
  when :tCHARACTER
253
274
  value.delete_prefix!("?")
275
+ # Character literals behave similar to double-quoted strings. We can use the same escaping mechanism.
276
+ value = unescape_string(value, "?")
254
277
  when :tCOMMENT
255
278
  if token.type == :EMBDOC_BEGIN
256
- start_index = index
257
279
 
258
- while !((next_token = lexed[index][0]) && next_token.type == :EMBDOC_END) && (index < length - 1)
280
+ while !((next_token = lexed[index]&.first) && next_token.type == :EMBDOC_END) && (index < length - 1)
259
281
  value += next_token.value
260
282
  index += 1
261
283
  end
262
284
 
263
- if start_index != index
264
- value += next_token.value
265
- location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[lexed[index][0].location.end_offset])
266
- index += 1
267
- end
285
+ value += next_token.value
286
+ location = range(token.location.start_offset, next_token.location.end_offset)
287
+ index += 1
268
288
  else
269
- value.chomp!
270
- location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.end_offset - 1])
289
+ is_at_eol = value.chomp!.nil?
290
+ location = range(token.location.start_offset, token.location.end_offset + (is_at_eol ? 0 : -1))
291
+
292
+ prev_token, _ = lexed[index - 2] if index - 2 >= 0
293
+ next_token, _ = lexed[index]
294
+
295
+ is_inline_comment = prev_token&.location&.start_line == token.location.start_line
296
+ if is_inline_comment && !is_at_eol && !COMMENT_CONTINUATION_TYPES.include?(next_token&.type)
297
+ tokens << [:tCOMMENT, [value, location]]
298
+
299
+ nl_location = range(token.location.end_offset - 1, token.location.end_offset)
300
+ tokens << [:tNL, [nil, nl_location]]
301
+ next
302
+ elsif is_inline_comment && next_token&.type == :COMMENT
303
+ comment_newline_location = range(token.location.end_offset - 1, token.location.end_offset)
304
+ elsif comment_newline_location && !COMMENT_CONTINUATION_TYPES.include?(next_token&.type)
305
+ tokens << [:tCOMMENT, [value, location]]
306
+ tokens << [:tNL, [nil, comment_newline_location]]
307
+ comment_newline_location = nil
308
+ next
309
+ end
271
310
  end
272
311
  when :tNL
312
+ next_token, _ = lexed[index]
313
+ # Newlines after comments are emitted out of order.
314
+ if next_token&.type == :COMMENT
315
+ comment_newline_location = location
316
+ next
317
+ end
318
+
273
319
  value = nil
274
320
  when :tFLOAT
275
321
  value = parse_float(value)
@@ -277,8 +323,8 @@ module Prism
277
323
  value = parse_complex(value)
278
324
  when :tINTEGER
279
325
  if value.start_with?("+")
280
- tokens << [:tUNARY_NUM, ["+", Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.start_offset + 1])]]
281
- location = Range.new(source_buffer, offset_cache[token.location.start_offset + 1], offset_cache[token.location.end_offset])
326
+ tokens << [:tUNARY_NUM, ["+", range(token.location.start_offset, token.location.start_offset + 1)]]
327
+ location = range(token.location.start_offset + 1, token.location.end_offset)
282
328
  end
283
329
 
284
330
  value = parse_integer(value)
@@ -297,92 +343,196 @@ module Prism
297
343
  when :tRATIONAL
298
344
  value = parse_rational(value)
299
345
  when :tSPACE
346
+ location = range(token.location.start_offset, token.location.start_offset + percent_array_leading_whitespace(value))
300
347
  value = nil
301
348
  when :tSTRING_BEG
302
- if token.type == :HEREDOC_START
303
- heredoc_identifier_stack.push(value.match(/<<[-~]?["'`]?(?<heredoc_identifier>.*?)["'`]?\z/)[:heredoc_identifier])
304
- end
305
- if ["\"", "'"].include?(value) && (next_token = lexed[index][0]) && next_token.type == :STRING_END
349
+ next_token, _ = lexed[index]
350
+ next_next_token, _ = lexed[index + 1]
351
+ basic_quotes = value == '"' || value == "'"
352
+
353
+ if basic_quotes && next_token&.type == :STRING_END
306
354
  next_location = token.location.join(next_token.location)
307
355
  type = :tSTRING
308
356
  value = ""
309
- location = Range.new(source_buffer, offset_cache[next_location.start_offset], offset_cache[next_location.end_offset])
357
+ location = range(next_location.start_offset, next_location.end_offset)
310
358
  index += 1
311
- elsif ["\"", "'"].include?(value) && (next_token = lexed[index][0]) && next_token.type == :STRING_CONTENT && next_token.value.lines.count <= 1 && (next_next_token = lexed[index + 1][0]) && next_next_token.type == :STRING_END
312
- next_location = token.location.join(next_next_token.location)
313
- type = :tSTRING
314
- value = next_token.value.gsub("\\\\", "\\")
315
- location = Range.new(source_buffer, offset_cache[next_location.start_offset], offset_cache[next_location.end_offset])
316
- index += 2
317
- elsif value.start_with?("<<")
359
+ elsif value.start_with?("'", '"', "%")
360
+ if next_token&.type == :STRING_CONTENT && next_next_token&.type == :STRING_END
361
+ string_value = next_token.value
362
+ if simplify_string?(string_value, value)
363
+ next_location = token.location.join(next_next_token.location)
364
+ if percent_array?(value)
365
+ value = percent_array_unescape(string_value)
366
+ else
367
+ value = unescape_string(string_value, value)
368
+ end
369
+ type = :tSTRING
370
+ location = range(next_location.start_offset, next_location.end_offset)
371
+ index += 2
372
+ tokens << [type, [value, location]]
373
+
374
+ next
375
+ end
376
+ end
377
+
378
+ quote_stack.push(value)
379
+ elsif token.type == :HEREDOC_START
318
380
  quote = value[2] == "-" || value[2] == "~" ? value[3] : value[2]
381
+ heredoc_type = value[2] == "-" || value[2] == "~" ? value[2] : ""
382
+ heredoc = HeredocData.new(
383
+ identifier: value.match(/<<[-~]?["'`]?(?<heredoc_identifier>.*?)["'`]?\z/)[:heredoc_identifier],
384
+ common_whitespace: 0,
385
+ )
386
+
319
387
  if quote == "`"
320
388
  type = :tXSTRING_BEG
321
- value = "<<`"
389
+ end
390
+
391
+ # The parser gem trims whitespace from squiggly heredocs. We must record
392
+ # the most common whitespace to later remove.
393
+ if heredoc_type == "~" || heredoc_type == "`"
394
+ heredoc.common_whitespace = calculate_heredoc_whitespace(index)
395
+ end
396
+
397
+ if quote == "'" || quote == '"' || quote == "`"
398
+ value = "<<#{quote}"
322
399
  else
323
- value = "<<#{quote == "'" || quote == "\"" ? quote : "\""}"
400
+ value = '<<"'
324
401
  end
402
+
403
+ heredoc_stack.push(heredoc)
404
+ quote_stack.push(value)
325
405
  end
326
406
  when :tSTRING_CONTENT
327
- unless (lines = token.value.lines).one?
328
- start_offset = offset_cache[token.location.start_offset]
329
- lines.map do |line|
330
- newline = line.end_with?("\r\n") ? "\r\n" : "\n"
407
+ is_percent_array = percent_array?(quote_stack.last)
408
+
409
+ if (lines = token.value.lines).one?
410
+ # Prism usually emits a single token for strings with line continuations.
411
+ # For squiggly heredocs they are not joined so we do that manually here.
412
+ current_string = +""
413
+ current_length = 0
414
+ start_offset = token.location.start_offset
415
+ while token.type == :STRING_CONTENT
416
+ current_length += token.value.bytesize
417
+ # Heredoc interpolation can have multiple STRING_CONTENT nodes on the same line.
418
+ prev_token, _ = lexed[index - 2] if index - 2 >= 0
419
+ is_first_token_on_line = prev_token && token.location.start_line != prev_token.location.start_line
420
+ # The parser gem only removes indentation when the heredoc is not nested
421
+ not_nested = heredoc_stack.size == 1
422
+ if is_percent_array
423
+ value = percent_array_unescape(token.value)
424
+ elsif is_first_token_on_line && not_nested && (current_heredoc = heredoc_stack.last).common_whitespace > 0
425
+ value = trim_heredoc_whitespace(token.value, current_heredoc)
426
+ end
427
+
428
+ current_string << unescape_string(value, quote_stack.last)
429
+ relevant_backslash_count = if quote_stack.last.start_with?("%W", "%I")
430
+ 0 # the last backslash escapes the newline
431
+ else
432
+ token.value[/(\\{1,})\n/, 1]&.length || 0
433
+ end
434
+ if relevant_backslash_count.even? || !interpolation?(quote_stack.last)
435
+ tokens << [:tSTRING_CONTENT, [current_string, range(start_offset, start_offset + current_length)]]
436
+ break
437
+ end
438
+ token, _ = lexed[index]
439
+ index += 1
440
+ end
441
+ else
442
+ # When the parser gem encounters a line continuation inside of a multiline string,
443
+ # it emits a single string node. The backslash (and remaining newline) is removed.
444
+ current_line = +""
445
+ adjustment = 0
446
+ start_offset = token.location.start_offset
447
+ emit = false
448
+
449
+ lines.each.with_index do |line, index|
331
450
  chomped_line = line.chomp
332
- if match = chomped_line.match(/(?<backslashes>\\+)\z/)
333
- adjustment = match[:backslashes].size / 2
334
- adjusted_line = chomped_line.delete_suffix("\\" * adjustment)
335
- if match[:backslashes].size.odd?
336
- adjusted_line.delete_suffix!("\\")
337
- adjustment += 2
451
+ backslash_count = chomped_line[/\\{1,}\z/]&.length || 0
452
+ is_interpolation = interpolation?(quote_stack.last)
453
+
454
+ if backslash_count.odd? && (is_interpolation || is_percent_array)
455
+ if is_percent_array
456
+ current_line << percent_array_unescape(line)
457
+ adjustment += 1
338
458
  else
339
- adjusted_line << newline
459
+ chomped_line.delete_suffix!("\\")
460
+ current_line << chomped_line
461
+ adjustment += 2
340
462
  end
463
+ # If the string ends with a line continuation emit the remainder
464
+ emit = index == lines.count - 1
341
465
  else
342
- adjusted_line = line
343
- adjustment = 0
466
+ current_line << line
467
+ emit = true
344
468
  end
345
469
 
346
- end_offset = start_offset + adjusted_line.length + adjustment
347
- tokens << [:tSTRING_CONTENT, [adjusted_line, Range.new(source_buffer, offset_cache[start_offset], offset_cache[end_offset])]]
348
- start_offset = end_offset
470
+ if emit
471
+ end_offset = start_offset + current_line.bytesize + adjustment
472
+ tokens << [:tSTRING_CONTENT, [unescape_string(current_line, quote_stack.last), range(start_offset, end_offset)]]
473
+ start_offset = end_offset
474
+ current_line = +""
475
+ adjustment = 0
476
+ end
349
477
  end
350
- next
351
478
  end
479
+ next
352
480
  when :tSTRING_DVAR
353
481
  value = nil
354
482
  when :tSTRING_END
355
483
  if token.type == :HEREDOC_END && value.end_with?("\n")
356
484
  newline_length = value.end_with?("\r\n") ? 2 : 1
357
- value = heredoc_identifier_stack.pop
358
- location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.end_offset - newline_length])
485
+ value = heredoc_stack.pop.identifier
486
+ location = range(token.location.start_offset, token.location.end_offset - newline_length)
359
487
  elsif token.type == :REGEXP_END
360
488
  value = value[0]
361
- location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.start_offset + 1])
489
+ location = range(token.location.start_offset, token.location.start_offset + 1)
490
+ end
491
+
492
+ if percent_array?(quote_stack.pop)
493
+ prev_token, _ = lexed[index - 2] if index - 2 >= 0
494
+ empty = %i[PERCENT_LOWER_I PERCENT_LOWER_W PERCENT_UPPER_I PERCENT_UPPER_W].include?(prev_token&.type)
495
+ ends_with_whitespace = prev_token&.type == :WORDS_SEP
496
+ # parser always emits a space token after content in a percent array, even if no actual whitespace is present.
497
+ if !empty && !ends_with_whitespace
498
+ tokens << [:tSPACE, [nil, range(token.location.start_offset, token.location.start_offset)]]
499
+ end
362
500
  end
363
501
  when :tSYMBEG
364
- if (next_token = lexed[index][0]) && next_token.type != :STRING_CONTENT && next_token.type != :EMBEXPR_BEGIN && next_token.type != :EMBVAR && next_token.type != :STRING_END
502
+ if (next_token = lexed[index]&.first) && next_token.type != :STRING_CONTENT && next_token.type != :EMBEXPR_BEGIN && next_token.type != :EMBVAR && next_token.type != :STRING_END
365
503
  next_location = token.location.join(next_token.location)
366
504
  type = :tSYMBOL
367
505
  value = next_token.value
368
506
  value = { "~@" => "~", "!@" => "!" }.fetch(value, value)
369
- location = Range.new(source_buffer, offset_cache[next_location.start_offset], offset_cache[next_location.end_offset])
507
+ location = range(next_location.start_offset, next_location.end_offset)
370
508
  index += 1
509
+ else
510
+ quote_stack.push(value)
371
511
  end
372
512
  when :tFID
373
513
  if !tokens.empty? && tokens.dig(-1, 0) == :kDEF
374
514
  type = :tIDENTIFIER
375
515
  end
376
516
  when :tXSTRING_BEG
377
- if (next_token = lexed[index][0]) && next_token.type != :STRING_CONTENT && next_token.type != :STRING_END
517
+ if (next_token = lexed[index]&.first) && !%i[STRING_CONTENT STRING_END EMBEXPR_BEGIN].include?(next_token.type)
518
+ # self.`()
378
519
  type = :tBACK_REF2
379
520
  end
521
+ quote_stack.push(value)
522
+ when :tSYMBOLS_BEG, :tQSYMBOLS_BEG, :tWORDS_BEG, :tQWORDS_BEG
523
+ if (next_token = lexed[index]&.first) && next_token.type == :WORDS_SEP
524
+ index += 1
525
+ end
526
+
527
+ quote_stack.push(value)
528
+ when :tREGEXP_BEG
529
+ quote_stack.push(value)
380
530
  end
381
531
 
382
532
  tokens << [type, [value, location]]
383
533
 
384
534
  if token.type == :REGEXP_END
385
- tokens << [:tREGEXP_OPT, [token.value[1..], Range.new(source_buffer, offset_cache[token.location.start_offset + 1], offset_cache[token.location.end_offset])]]
535
+ tokens << [:tREGEXP_OPT, [token.value[1..], range(token.location.start_offset + 1, token.location.end_offset)]]
386
536
  end
387
537
  end
388
538
 
@@ -391,6 +541,11 @@ module Prism
391
541
 
392
542
  private
393
543
 
544
+ # Creates a new parser range, taking prisms byte offsets into account
545
+ def range(start_offset, end_offset)
546
+ Range.new(source_buffer, offset_cache[start_offset], offset_cache[end_offset])
547
+ end
548
+
394
549
  # Parse an integer from the string representation.
395
550
  def parse_integer(value)
396
551
  Integer(value)
@@ -432,6 +587,233 @@ module Prism
432
587
  rescue ArgumentError
433
588
  0r
434
589
  end
590
+
591
+ # Wonky heredoc tab/spaces rules.
592
+ # https://github.com/ruby/prism/blob/v1.3.0/src/prism.c#L10548-L10558
593
+ def calculate_heredoc_whitespace(heredoc_token_index)
594
+ next_token_index = heredoc_token_index
595
+ nesting_level = 0
596
+ previous_line = -1
597
+ result = Float::MAX
598
+
599
+ while (next_token = lexed[next_token_index]&.first)
600
+ next_token_index += 1
601
+ next_next_token, _ = lexed[next_token_index]
602
+ first_token_on_line = next_token.location.start_column == 0
603
+
604
+ # String content inside nested heredocs and interpolation is ignored
605
+ if next_token.type == :HEREDOC_START || next_token.type == :EMBEXPR_BEGIN
606
+ # When interpolation is the first token of a line there is no string
607
+ # content to check against. There will be no common whitespace.
608
+ if nesting_level == 0 && first_token_on_line
609
+ result = 0
610
+ end
611
+ nesting_level += 1
612
+ elsif next_token.type == :HEREDOC_END || next_token.type == :EMBEXPR_END
613
+ nesting_level -= 1
614
+ # When we encountered the matching heredoc end, we can exit
615
+ break if nesting_level == -1
616
+ elsif next_token.type == :STRING_CONTENT && nesting_level == 0 && first_token_on_line
617
+ common_whitespace = 0
618
+ next_token.value[/^\s*/].each_char do |char|
619
+ if char == "\t"
620
+ common_whitespace = (common_whitespace / 8 + 1) * 8;
621
+ else
622
+ common_whitespace += 1
623
+ end
624
+ end
625
+
626
+ is_first_token_on_line = next_token.location.start_line != previous_line
627
+ # Whitespace is significant if followed by interpolation
628
+ whitespace_only = common_whitespace == next_token.value.length && next_next_token&.location&.start_line != next_token.location.start_line
629
+ if is_first_token_on_line && !whitespace_only && common_whitespace < result
630
+ result = common_whitespace
631
+ previous_line = next_token.location.start_line
632
+ end
633
+ end
634
+ end
635
+ result
636
+ end
637
+
638
+ # Wonky heredoc tab/spaces rules.
639
+ # https://github.com/ruby/prism/blob/v1.3.0/src/prism.c#L16528-L16545
640
+ def trim_heredoc_whitespace(string, heredoc)
641
+ trimmed_whitespace = 0
642
+ trimmed_characters = 0
643
+ while (string[trimmed_characters] == "\t" || string[trimmed_characters] == " ") && trimmed_whitespace < heredoc.common_whitespace
644
+ if string[trimmed_characters] == "\t"
645
+ trimmed_whitespace = (trimmed_whitespace / 8 + 1) * 8;
646
+ break if trimmed_whitespace > heredoc.common_whitespace
647
+ else
648
+ trimmed_whitespace += 1
649
+ end
650
+ trimmed_characters += 1
651
+ end
652
+
653
+ string[trimmed_characters..]
654
+ end
655
+
656
+ # Escape sequences that have special and should appear unescaped in the resulting string.
657
+ ESCAPES = {
658
+ "a" => "\a", "b" => "\b", "e" => "\e", "f" => "\f",
659
+ "n" => "\n", "r" => "\r", "s" => "\s", "t" => "\t",
660
+ "v" => "\v", "\\" => "\\"
661
+ }.freeze
662
+ private_constant :ESCAPES
663
+
664
+ # When one of these delimiters is encountered, then the other
665
+ # one is allowed to be escaped as well.
666
+ DELIMITER_SYMETRY = { "[" => "]", "(" => ")", "{" => "}", "<" => ">" }.freeze
667
+ private_constant :DELIMITER_SYMETRY
668
+
669
+
670
+ # https://github.com/whitequark/parser/blob/v3.3.6.0/lib/parser/lexer-strings.rl#L14
671
+ REGEXP_META_CHARACTERS = ["\\", "$", "(", ")", "*", "+", ".", "<", ">", "?", "[", "]", "^", "{", "|", "}"]
672
+ private_constant :REGEXP_META_CHARACTERS
673
+
674
+ # Apply Ruby string escaping rules
675
+ def unescape_string(string, quote)
676
+ # In single-quoted heredocs, everything is taken literally.
677
+ return string if quote == "<<'"
678
+
679
+ # OPTIMIZATION: Assume that few strings need escaping to speed up the common case.
680
+ return string unless string.include?("\\")
681
+
682
+ # Enclosing character for the string. `"` for `"foo"`, `{` for `%w{foo}`, etc.
683
+ delimiter = quote[-1]
684
+
685
+ if regexp?(quote)
686
+ # Should be escaped handled to single-quoted heredocs. The only character that is
687
+ # allowed to be escaped is the delimiter, except when that also has special meaning
688
+ # in the regexp. Since all the symetry delimiters have special meaning, they don't need
689
+ # to be considered separately.
690
+ if REGEXP_META_CHARACTERS.include?(delimiter)
691
+ string
692
+ else
693
+ # There can never be an even amount of backslashes. It would be a syntax error.
694
+ string.gsub(/\\(#{Regexp.escape(delimiter)})/, '\1')
695
+ end
696
+ elsif interpolation?(quote)
697
+ # Appending individual escape sequences may force the string out of its intended
698
+ # encoding. Start out with binary and force it back later.
699
+ result = "".b
700
+
701
+ scanner = StringScanner.new(string)
702
+ while (skipped = scanner.skip_until(/\\/))
703
+ # Append what was just skipped over, excluding the found backslash.
704
+ result.append_as_bytes(string.byteslice(scanner.pos - skipped, skipped - 1))
705
+ escape_read(result, scanner, false, false)
706
+ end
707
+
708
+ # Add remaining chars
709
+ result.append_as_bytes(string.byteslice(scanner.pos..))
710
+ result.force_encoding(source_buffer.source.encoding)
711
+ else
712
+ delimiters = Regexp.escape("#{delimiter}#{DELIMITER_SYMETRY[delimiter]}")
713
+ string.gsub(/\\([\\#{delimiters}])/, '\1')
714
+ end
715
+ end
716
+
717
+ # Certain strings are merged into a single string token.
718
+ def simplify_string?(value, quote)
719
+ case quote
720
+ when "'"
721
+ # Only simplify 'foo'
722
+ !value.include?("\n")
723
+ when '"'
724
+ # Simplify when every line ends with a line continuation, or it is the last line
725
+ value.lines.all? do |line|
726
+ !line.end_with?("\n") || line[/(\\*)$/, 1]&.length&.odd?
727
+ end
728
+ else
729
+ # %q and similar are never simplified
730
+ false
731
+ end
732
+ end
733
+
734
+ # Escape a byte value, given the control and meta flags.
735
+ def escape_build(value, control, meta)
736
+ value &= 0x9f if control
737
+ value |= 0x80 if meta
738
+ value
739
+ end
740
+
741
+ # Read an escape out of the string scanner, given the control and meta
742
+ # flags, and push the unescaped value into the result.
743
+ def escape_read(result, scanner, control, meta)
744
+ if scanner.skip("\n")
745
+ # Line continuation
746
+ elsif (value = ESCAPES[scanner.peek(1)])
747
+ # Simple single-character escape sequences like \n
748
+ result.append_as_bytes(value)
749
+ scanner.pos += 1
750
+ elsif (value = scanner.scan(/[0-7]{1,3}/))
751
+ # \nnn
752
+ result.append_as_bytes(escape_build(value.to_i(8), control, meta))
753
+ elsif (value = scanner.scan(/x[0-9a-fA-F]{1,2}/))
754
+ # \xnn
755
+ result.append_as_bytes(escape_build(value[1..].to_i(16), control, meta))
756
+ elsif (value = scanner.scan(/u[0-9a-fA-F]{4}/))
757
+ # \unnnn
758
+ result.append_as_bytes(value[1..].hex.chr(Encoding::UTF_8))
759
+ elsif scanner.skip("u{}")
760
+ # https://github.com/whitequark/parser/issues/856
761
+ elsif (value = scanner.scan(/u{.*?}/))
762
+ # \u{nnnn ...}
763
+ value[2..-2].split.each do |unicode|
764
+ result.append_as_bytes(unicode.hex.chr(Encoding::UTF_8))
765
+ end
766
+ elsif (value = scanner.scan(/c\\?(?=[[:print:]])|C-\\?(?=[[:print:]])/))
767
+ # \cx or \C-x where x is an ASCII printable character
768
+ escape_read(result, scanner, true, meta)
769
+ elsif (value = scanner.scan(/M-\\?(?=[[:print:]])/))
770
+ # \M-x where x is an ASCII printable character
771
+ escape_read(result, scanner, control, true)
772
+ elsif (byte = scanner.scan_byte)
773
+ # Something else after an escape.
774
+ if control && byte == 0x3f # ASCII '?'
775
+ result.append_as_bytes(escape_build(0x7f, false, meta))
776
+ else
777
+ result.append_as_bytes(escape_build(byte, control, meta))
778
+ end
779
+ end
780
+ end
781
+
782
+ # In a percent array, certain whitespace can be preceeded with a backslash,
783
+ # causing the following characters to be part of the previous element.
784
+ def percent_array_unescape(string)
785
+ string.gsub(/(\\)+[ \f\n\r\t\v]/) do |full_match|
786
+ full_match.delete_prefix!("\\") if Regexp.last_match[1].length.odd?
787
+ full_match
788
+ end
789
+ end
790
+
791
+ # For %-arrays whitespace, the parser gem only considers whitespace before the newline.
792
+ def percent_array_leading_whitespace(string)
793
+ return 1 if string.start_with?("\n")
794
+
795
+ leading_whitespace = 0
796
+ string.each_char do |c|
797
+ break if c == "\n"
798
+ leading_whitespace += 1
799
+ end
800
+ leading_whitespace
801
+ end
802
+
803
+ # Determine if characters preceeded by a backslash should be escaped or not
804
+ def interpolation?(quote)
805
+ !quote.end_with?("'") && !quote.start_with?("%q", "%w", "%i", "%s")
806
+ end
807
+
808
+ # Regexp allow interpolation but are handled differently during unescaping
809
+ def regexp?(quote)
810
+ quote == "/" || quote.start_with?("%r")
811
+ end
812
+
813
+ # Determine if the string is part of a %-style array.
814
+ def percent_array?(quote)
815
+ quote.start_with?("%w", "%W", "%i", "%I")
816
+ end
435
817
  end
436
818
  end
437
819
  end