jruby-prism-parser 0.24.0-java → 1.4.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (148) hide show
  1. checksums.yaml +4 -4
  2. data/BSDmakefile +58 -0
  3. data/CHANGELOG.md +269 -1
  4. data/CONTRIBUTING.md +0 -4
  5. data/Makefile +25 -18
  6. data/README.md +57 -6
  7. data/config.yml +1724 -140
  8. data/docs/build_system.md +39 -11
  9. data/docs/configuration.md +4 -0
  10. data/docs/cruby_compilation.md +1 -1
  11. data/docs/fuzzing.md +1 -1
  12. data/docs/parser_translation.md +14 -9
  13. data/docs/parsing_rules.md +4 -1
  14. data/docs/releasing.md +8 -10
  15. data/docs/relocation.md +34 -0
  16. data/docs/ripper_translation.md +72 -0
  17. data/docs/ruby_api.md +2 -1
  18. data/docs/serialization.md +29 -5
  19. data/ext/prism/api_node.c +3395 -1999
  20. data/ext/prism/api_pack.c +9 -0
  21. data/ext/prism/extconf.rb +55 -34
  22. data/ext/prism/extension.c +597 -346
  23. data/ext/prism/extension.h +6 -5
  24. data/include/prism/ast.h +2612 -455
  25. data/include/prism/defines.h +160 -2
  26. data/include/prism/diagnostic.h +188 -76
  27. data/include/prism/encoding.h +22 -4
  28. data/include/prism/node.h +89 -17
  29. data/include/prism/options.h +224 -12
  30. data/include/prism/pack.h +11 -0
  31. data/include/prism/parser.h +267 -66
  32. data/include/prism/prettyprint.h +8 -0
  33. data/include/prism/regexp.h +18 -8
  34. data/include/prism/static_literals.h +121 -0
  35. data/include/prism/util/pm_buffer.h +75 -2
  36. data/include/prism/util/pm_char.h +1 -2
  37. data/include/prism/util/pm_constant_pool.h +18 -9
  38. data/include/prism/util/pm_integer.h +126 -0
  39. data/include/prism/util/pm_list.h +1 -1
  40. data/include/prism/util/pm_newline_list.h +19 -0
  41. data/include/prism/util/pm_string.h +48 -8
  42. data/include/prism/version.h +3 -3
  43. data/include/prism.h +99 -5
  44. data/jruby-prism.jar +0 -0
  45. data/lib/prism/compiler.rb +11 -1
  46. data/lib/prism/desugar_compiler.rb +113 -74
  47. data/lib/prism/dispatcher.rb +45 -1
  48. data/lib/prism/dot_visitor.rb +201 -77
  49. data/lib/prism/dsl.rb +673 -461
  50. data/lib/prism/ffi.rb +233 -45
  51. data/lib/prism/inspect_visitor.rb +2389 -0
  52. data/lib/prism/lex_compat.rb +35 -16
  53. data/lib/prism/mutation_compiler.rb +24 -8
  54. data/lib/prism/node.rb +7731 -8460
  55. data/lib/prism/node_ext.rb +328 -32
  56. data/lib/prism/pack.rb +4 -0
  57. data/lib/prism/parse_result/comments.rb +34 -24
  58. data/lib/prism/parse_result/errors.rb +65 -0
  59. data/lib/prism/parse_result/newlines.rb +102 -12
  60. data/lib/prism/parse_result.rb +448 -44
  61. data/lib/prism/pattern.rb +28 -10
  62. data/lib/prism/polyfill/append_as_bytes.rb +15 -0
  63. data/lib/prism/polyfill/byteindex.rb +13 -0
  64. data/lib/prism/polyfill/unpack1.rb +14 -0
  65. data/lib/prism/reflection.rb +413 -0
  66. data/lib/prism/relocation.rb +504 -0
  67. data/lib/prism/serialize.rb +1940 -1198
  68. data/lib/prism/string_query.rb +30 -0
  69. data/lib/prism/translation/parser/builder.rb +61 -0
  70. data/lib/prism/translation/parser/compiler.rb +569 -195
  71. data/lib/prism/translation/parser/lexer.rb +516 -39
  72. data/lib/prism/translation/parser.rb +177 -12
  73. data/lib/prism/translation/parser33.rb +1 -1
  74. data/lib/prism/translation/parser34.rb +1 -1
  75. data/lib/prism/translation/parser35.rb +12 -0
  76. data/lib/prism/translation/ripper/sexp.rb +125 -0
  77. data/lib/prism/translation/ripper/shim.rb +5 -0
  78. data/lib/prism/translation/ripper.rb +3224 -462
  79. data/lib/prism/translation/ruby_parser.rb +194 -69
  80. data/lib/prism/translation.rb +4 -1
  81. data/lib/prism/version.rb +1 -1
  82. data/lib/prism/visitor.rb +13 -0
  83. data/lib/prism.rb +17 -27
  84. data/prism.gemspec +57 -17
  85. data/rbi/prism/compiler.rbi +12 -0
  86. data/rbi/prism/dsl.rbi +524 -0
  87. data/rbi/prism/inspect_visitor.rbi +12 -0
  88. data/rbi/prism/node.rbi +8722 -0
  89. data/rbi/prism/node_ext.rbi +107 -0
  90. data/rbi/prism/parse_result.rbi +404 -0
  91. data/rbi/prism/reflection.rbi +58 -0
  92. data/rbi/prism/string_query.rbi +12 -0
  93. data/rbi/prism/translation/parser.rbi +11 -0
  94. data/rbi/prism/translation/parser33.rbi +6 -0
  95. data/rbi/prism/translation/parser34.rbi +6 -0
  96. data/rbi/prism/translation/parser35.rbi +6 -0
  97. data/rbi/prism/translation/ripper.rbi +15 -0
  98. data/rbi/prism/visitor.rbi +473 -0
  99. data/rbi/prism.rbi +44 -7745
  100. data/sig/prism/compiler.rbs +9 -0
  101. data/sig/prism/dispatcher.rbs +16 -0
  102. data/sig/prism/dot_visitor.rbs +6 -0
  103. data/sig/prism/dsl.rbs +351 -0
  104. data/sig/prism/inspect_visitor.rbs +22 -0
  105. data/sig/prism/lex_compat.rbs +10 -0
  106. data/sig/prism/mutation_compiler.rbs +159 -0
  107. data/sig/prism/node.rbs +3614 -0
  108. data/sig/prism/node_ext.rbs +82 -0
  109. data/sig/prism/pack.rbs +43 -0
  110. data/sig/prism/parse_result.rbs +192 -0
  111. data/sig/prism/pattern.rbs +13 -0
  112. data/sig/prism/reflection.rbs +50 -0
  113. data/sig/prism/relocation.rbs +185 -0
  114. data/sig/prism/serialize.rbs +8 -0
  115. data/sig/prism/string_query.rbs +11 -0
  116. data/sig/prism/visitor.rbs +169 -0
  117. data/sig/prism.rbs +248 -4767
  118. data/src/diagnostic.c +672 -230
  119. data/src/encoding.c +211 -108
  120. data/src/node.c +7541 -1653
  121. data/src/options.c +135 -20
  122. data/src/pack.c +33 -17
  123. data/src/prettyprint.c +1543 -1485
  124. data/src/prism.c +7813 -3050
  125. data/src/regexp.c +225 -73
  126. data/src/serialize.c +101 -77
  127. data/src/static_literals.c +617 -0
  128. data/src/token_type.c +14 -13
  129. data/src/util/pm_buffer.c +187 -20
  130. data/src/util/pm_char.c +5 -5
  131. data/src/util/pm_constant_pool.c +39 -19
  132. data/src/util/pm_integer.c +670 -0
  133. data/src/util/pm_list.c +1 -1
  134. data/src/util/pm_newline_list.c +43 -5
  135. data/src/util/pm_string.c +213 -33
  136. data/src/util/pm_strncasecmp.c +13 -1
  137. data/src/util/pm_strpbrk.c +32 -6
  138. metadata +55 -19
  139. data/docs/ripper.md +0 -36
  140. data/include/prism/util/pm_state_stack.h +0 -42
  141. data/include/prism/util/pm_string_list.h +0 -44
  142. data/lib/prism/debug.rb +0 -206
  143. data/lib/prism/node_inspector.rb +0 -68
  144. data/lib/prism/translation/parser/rubocop.rb +0 -45
  145. data/rbi/prism_static.rbi +0 -207
  146. data/sig/prism_static.rbs +0 -201
  147. data/src/util/pm_state_stack.c +0 -25
  148. data/src/util/pm_string_list.c +0 -28
@@ -1,21 +1,25 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require "strscan"
4
+ require_relative "../../polyfill/append_as_bytes"
5
+
3
6
  module Prism
4
7
  module Translation
5
8
  class Parser
6
9
  # Accepts a list of prism tokens and converts them into the expected
7
10
  # format for the parser gem.
8
11
  class Lexer
12
+ # These tokens are always skipped
13
+ TYPES_ALWAYS_SKIP = Set.new(%i[IGNORED_NEWLINE __END__ EOF])
14
+ private_constant :TYPES_ALWAYS_SKIP
15
+
9
16
  # The direct translating of types between the two lexers.
10
17
  TYPES = {
11
18
  # These tokens should never appear in the output of the lexer.
12
- EOF: nil,
13
19
  MISSING: nil,
14
20
  NOT_PROVIDED: nil,
15
- IGNORED_NEWLINE: nil,
16
21
  EMBDOC_END: nil,
17
22
  EMBDOC_LINE: nil,
18
- __END__: nil,
19
23
 
20
24
  # These tokens have more or less direct mappings.
21
25
  AMPERSAND: :tAMPER2,
@@ -134,7 +138,7 @@ module Prism
134
138
  MINUS_GREATER: :tLAMBDA,
135
139
  NEWLINE: :tNL,
136
140
  NUMBERED_REFERENCE: :tNTH_REF,
137
- PARENTHESIS_LEFT: :tLPAREN,
141
+ PARENTHESIS_LEFT: :tLPAREN2,
138
142
  PARENTHESIS_LEFT_PARENTHESES: :tLPAREN_ARG,
139
143
  PARENTHESIS_RIGHT: :tRPAREN,
140
144
  PERCENT: :tPERCENT,
@@ -167,22 +171,54 @@ module Prism
167
171
  TILDE: :tTILDE,
168
172
  UAMPERSAND: :tAMPER,
169
173
  UCOLON_COLON: :tCOLON3,
170
- UDOT_DOT: :tDOT2,
174
+ UDOT_DOT: :tBDOT2,
171
175
  UDOT_DOT_DOT: :tBDOT3,
172
176
  UMINUS: :tUMINUS,
173
177
  UMINUS_NUM: :tUNARY_NUM,
174
178
  UPLUS: :tUPLUS,
175
179
  USTAR: :tSTAR,
176
- USTAR_STAR: :tPOW,
180
+ USTAR_STAR: :tDSTAR,
177
181
  WORDS_SEP: :tSPACE
178
182
  }
179
183
 
180
- private_constant :TYPES
184
+ # These constants represent flags in our lex state. We really, really
185
+ # don't want to be using them and we really, really don't want to be
186
+ # exposing them as part of our public API. Unfortunately, we don't have
187
+ # another way of matching the exact tokens that the parser gem expects
188
+ # without them. We should find another way to do this, but in the
189
+ # meantime we'll hide them from the documentation and mark them as
190
+ # private constants.
191
+ EXPR_BEG = 0x1 # :nodoc:
192
+ EXPR_LABEL = 0x400 # :nodoc:
193
+
194
+ # It is used to determine whether `do` is of the token type `kDO` or `kDO_LAMBDA`.
195
+ #
196
+ # NOTE: In edge cases like `-> (foo = -> (bar) {}) do end`, please note that `kDO` is still returned
197
+ # instead of `kDO_LAMBDA`, which is expected: https://github.com/ruby/prism/pull/3046
198
+ LAMBDA_TOKEN_TYPES = Set.new([:kDO_LAMBDA, :tLAMBDA, :tLAMBEG])
199
+
200
+ # The `PARENTHESIS_LEFT` token in Prism is classified as either `tLPAREN` or `tLPAREN2` in the Parser gem.
201
+ # The following token types are listed as those classified as `tLPAREN`.
202
+ LPAREN_CONVERSION_TOKEN_TYPES = Set.new([
203
+ :kBREAK, :kCASE, :tDIVIDE, :kFOR, :kIF, :kNEXT, :kRETURN, :kUNTIL, :kWHILE, :tAMPER, :tANDOP, :tBANG, :tCOMMA, :tDOT2, :tDOT3,
204
+ :tEQL, :tLPAREN, :tLPAREN2, :tLPAREN_ARG, :tLSHFT, :tNL, :tOP_ASGN, :tOROP, :tPIPE, :tSEMI, :tSTRING_DBEG, :tUMINUS, :tUPLUS
205
+ ])
206
+
207
+ # Types of tokens that are allowed to continue a method call with comments in-between.
208
+ # For these, the parser gem doesn't emit a newline token after the last comment.
209
+ COMMENT_CONTINUATION_TYPES = Set.new([:COMMENT, :AMPERSAND_DOT, :DOT])
210
+ private_constant :COMMENT_CONTINUATION_TYPES
211
+
212
+ # Heredocs are complex and require us to keep track of a bit of info to refer to later
213
+ HeredocData = Struct.new(:identifier, :common_whitespace, keyword_init: true)
214
+
215
+ private_constant :TYPES, :EXPR_BEG, :EXPR_LABEL, :LAMBDA_TOKEN_TYPES, :LPAREN_CONVERSION_TOKEN_TYPES, :HeredocData
181
216
 
182
217
  # The Parser::Source::Buffer that the tokens were lexed from.
183
218
  attr_reader :source_buffer
184
219
 
185
- # An array of prism tokens that we lexed.
220
+ # An array of tuples that contain prism tokens and their associated lex
221
+ # state when they were lexed.
186
222
  attr_reader :lexed
187
223
 
188
224
  # A hash that maps offsets in bytes to offsets in characters.
@@ -202,102 +238,293 @@ module Prism
202
238
  # Convert the prism tokens into the expected format for the parser gem.
203
239
  def to_a
204
240
  tokens = []
241
+
205
242
  index = 0
243
+ length = lexed.length
206
244
 
207
- while index < lexed.length
208
- token, = lexed[index]
245
+ heredoc_stack = []
246
+ quote_stack = []
247
+
248
+ # The parser gem emits the newline tokens for comments out of order. This saves
249
+ # that token location to emit at a later time to properly line everything up.
250
+ # https://github.com/whitequark/parser/issues/1025
251
+ comment_newline_location = nil
252
+
253
+ while index < length
254
+ token, state = lexed[index]
209
255
  index += 1
210
- next if token.type == :IGNORED_NEWLINE || token.type == :EOF
256
+ next if TYPES_ALWAYS_SKIP.include?(token.type)
211
257
 
212
258
  type = TYPES.fetch(token.type)
213
259
  value = token.value
214
- location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.end_offset])
260
+ location = range(token.location.start_offset, token.location.end_offset)
215
261
 
216
262
  case type
263
+ when :kDO
264
+ nearest_lambda_token = tokens.reverse_each.find do |token|
265
+ LAMBDA_TOKEN_TYPES.include?(token.first)
266
+ end
267
+
268
+ if nearest_lambda_token&.first == :tLAMBDA
269
+ type = :kDO_LAMBDA
270
+ end
217
271
  when :tCHARACTER
218
272
  value.delete_prefix!("?")
273
+ # Character literals behave similar to double-quoted strings. We can use the same escaping mechanism.
274
+ value = unescape_string(value, "?")
219
275
  when :tCOMMENT
220
276
  if token.type == :EMBDOC_BEGIN
221
- until (next_token = lexed[index]) && next_token.type == :EMBDOC_END
277
+
278
+ while !((next_token = lexed[index][0]) && next_token.type == :EMBDOC_END) && (index < length - 1)
222
279
  value += next_token.value
223
280
  index += 1
224
281
  end
225
282
 
226
283
  value += next_token.value
227
- location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[lexed[index].location.end_offset])
284
+ location = range(token.location.start_offset, lexed[index][0].location.end_offset)
228
285
  index += 1
229
286
  else
230
- value.chomp!
231
- location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.end_offset - 1])
287
+ is_at_eol = value.chomp!.nil?
288
+ location = range(token.location.start_offset, token.location.end_offset + (is_at_eol ? 0 : -1))
289
+
290
+ prev_token = lexed[index - 2][0] if index - 2 >= 0
291
+ next_token = lexed[index][0]
292
+
293
+ is_inline_comment = prev_token&.location&.start_line == token.location.start_line
294
+ if is_inline_comment && !is_at_eol && !COMMENT_CONTINUATION_TYPES.include?(next_token&.type)
295
+ tokens << [:tCOMMENT, [value, location]]
296
+
297
+ nl_location = range(token.location.end_offset - 1, token.location.end_offset)
298
+ tokens << [:tNL, [nil, nl_location]]
299
+ next
300
+ elsif is_inline_comment && next_token&.type == :COMMENT
301
+ comment_newline_location = range(token.location.end_offset - 1, token.location.end_offset)
302
+ elsif comment_newline_location && !COMMENT_CONTINUATION_TYPES.include?(next_token&.type)
303
+ tokens << [:tCOMMENT, [value, location]]
304
+ tokens << [:tNL, [nil, comment_newline_location]]
305
+ comment_newline_location = nil
306
+ next
307
+ end
232
308
  end
233
309
  when :tNL
310
+ next_token = next_token = lexed[index][0]
311
+ # Newlines after comments are emitted out of order.
312
+ if next_token&.type == :COMMENT
313
+ comment_newline_location = location
314
+ next
315
+ end
316
+
234
317
  value = nil
235
318
  when :tFLOAT
236
- value = Float(value)
319
+ value = parse_float(value)
237
320
  when :tIMAGINARY
238
321
  value = parse_complex(value)
239
322
  when :tINTEGER
240
323
  if value.start_with?("+")
241
- tokens << [:tUNARY_NUM, ["+", Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.start_offset + 1])]]
242
- location = Range.new(source_buffer, offset_cache[token.location.start_offset + 1], offset_cache[token.location.end_offset])
324
+ tokens << [:tUNARY_NUM, ["+", range(token.location.start_offset, token.location.start_offset + 1)]]
325
+ location = range(token.location.start_offset + 1, token.location.end_offset)
243
326
  end
244
327
 
245
- value = Integer(value)
328
+ value = parse_integer(value)
246
329
  when :tLABEL
247
330
  value.chomp!(":")
248
331
  when :tLABEL_END
249
332
  value.chomp!(":")
333
+ when :tLCURLY
334
+ type = :tLBRACE if state == EXPR_BEG | EXPR_LABEL
335
+ when :tLPAREN2
336
+ type = :tLPAREN if tokens.empty? || LPAREN_CONVERSION_TOKEN_TYPES.include?(tokens.dig(-1, 0))
250
337
  when :tNTH_REF
251
- value = Integer(value.delete_prefix("$"))
338
+ value = parse_integer(value.delete_prefix("$"))
252
339
  when :tOP_ASGN
253
340
  value.chomp!("=")
254
341
  when :tRATIONAL
255
342
  value = parse_rational(value)
256
343
  when :tSPACE
344
+ location = range(token.location.start_offset, token.location.start_offset + percent_array_leading_whitespace(value))
257
345
  value = nil
258
346
  when :tSTRING_BEG
259
- if ["\"", "'"].include?(value) && (next_token = lexed[index]) && next_token.type == :STRING_END
347
+ next_token = lexed[index][0]
348
+ next_next_token = lexed[index + 1][0]
349
+ basic_quotes = value == '"' || value == "'"
350
+
351
+ if basic_quotes && next_token&.type == :STRING_END
260
352
  next_location = token.location.join(next_token.location)
261
353
  type = :tSTRING
262
354
  value = ""
263
- location = Range.new(source_buffer, offset_cache[next_location.start_offset], offset_cache[next_location.end_offset])
355
+ location = range(next_location.start_offset, next_location.end_offset)
264
356
  index += 1
265
- elsif ["\"", "'"].include?(value) && (next_token = lexed[index]) && next_token.type == :STRING_CONTENT && (next_next_token = lexed[index + 1]) && next_next_token.type == :STRING_END
266
- next_location = token.location.join(next_next_token.location)
267
- type = :tSTRING
268
- value = next_token.value
269
- location = Range.new(source_buffer, offset_cache[next_location.start_offset], offset_cache[next_location.end_offset])
270
- index += 2
271
- elsif value.start_with?("<<")
357
+ elsif value.start_with?("'", '"', "%")
358
+ if next_token&.type == :STRING_CONTENT && next_next_token&.type == :STRING_END
359
+ string_value = next_token.value
360
+ if simplify_string?(string_value, value)
361
+ next_location = token.location.join(next_next_token.location)
362
+ if percent_array?(value)
363
+ value = percent_array_unescape(string_value)
364
+ else
365
+ value = unescape_string(string_value, value)
366
+ end
367
+ type = :tSTRING
368
+ location = range(next_location.start_offset, next_location.end_offset)
369
+ index += 2
370
+ tokens << [type, [value, location]]
371
+
372
+ next
373
+ end
374
+ end
375
+
376
+ quote_stack.push(value)
377
+ elsif token.type == :HEREDOC_START
272
378
  quote = value[2] == "-" || value[2] == "~" ? value[3] : value[2]
273
- value = "<<#{quote == "'" || quote == "\"" ? quote : "\""}"
379
+ heredoc_type = value[2] == "-" || value[2] == "~" ? value[2] : ""
380
+ heredoc = HeredocData.new(
381
+ identifier: value.match(/<<[-~]?["'`]?(?<heredoc_identifier>.*?)["'`]?\z/)[:heredoc_identifier],
382
+ common_whitespace: 0,
383
+ )
384
+
385
+ if quote == "`"
386
+ type = :tXSTRING_BEG
387
+ end
388
+
389
+ # The parser gem trims whitespace from squiggly heredocs. We must record
390
+ # the most common whitespace to later remove.
391
+ if heredoc_type == "~" || heredoc_type == "`"
392
+ heredoc.common_whitespace = calculate_heredoc_whitespace(index)
393
+ end
394
+
395
+ if quote == "'" || quote == '"' || quote == "`"
396
+ value = "<<#{quote}"
397
+ else
398
+ value = '<<"'
399
+ end
400
+
401
+ heredoc_stack.push(heredoc)
402
+ quote_stack.push(value)
403
+ end
404
+ when :tSTRING_CONTENT
405
+ is_percent_array = percent_array?(quote_stack.last)
406
+
407
+ if (lines = token.value.lines).one?
408
+ # Prism usually emits a single token for strings with line continuations.
409
+ # For squiggly heredocs they are not joined so we do that manually here.
410
+ current_string = +""
411
+ current_length = 0
412
+ start_offset = token.location.start_offset
413
+ while token.type == :STRING_CONTENT
414
+ current_length += token.value.bytesize
415
+ # Heredoc interpolation can have multiple STRING_CONTENT nodes on the same line.
416
+ is_first_token_on_line = lexed[index - 1] && token.location.start_line != lexed[index - 2][0].location&.start_line
417
+ # The parser gem only removes indentation when the heredoc is not nested
418
+ not_nested = heredoc_stack.size == 1
419
+ if is_percent_array
420
+ value = percent_array_unescape(token.value)
421
+ elsif is_first_token_on_line && not_nested && (current_heredoc = heredoc_stack.last).common_whitespace > 0
422
+ value = trim_heredoc_whitespace(token.value, current_heredoc)
423
+ end
424
+
425
+ current_string << unescape_string(value, quote_stack.last)
426
+ if (backslash_count = token.value[/(\\{1,})\n/, 1]&.length).nil? || backslash_count.even? || !interpolation?(quote_stack.last)
427
+ tokens << [:tSTRING_CONTENT, [current_string, range(start_offset, start_offset + current_length)]]
428
+ break
429
+ end
430
+ token = lexed[index][0]
431
+ index += 1
432
+ end
433
+ else
434
+ # When the parser gem encounters a line continuation inside of a multiline string,
435
+ # it emits a single string node. The backslash (and remaining newline) is removed.
436
+ current_line = +""
437
+ adjustment = 0
438
+ start_offset = token.location.start_offset
439
+ emit = false
440
+
441
+ lines.each.with_index do |line, index|
442
+ chomped_line = line.chomp
443
+ backslash_count = chomped_line[/\\{1,}\z/]&.length || 0
444
+ is_interpolation = interpolation?(quote_stack.last)
445
+
446
+ if backslash_count.odd? && (is_interpolation || is_percent_array)
447
+ if is_percent_array
448
+ current_line << percent_array_unescape(line)
449
+ adjustment += 1
450
+ else
451
+ chomped_line.delete_suffix!("\\")
452
+ current_line << chomped_line
453
+ adjustment += 2
454
+ end
455
+ # If the string ends with a line continuation emit the remainder
456
+ emit = index == lines.count - 1
457
+ else
458
+ current_line << line
459
+ emit = true
460
+ end
461
+
462
+ if emit
463
+ end_offset = start_offset + current_line.bytesize + adjustment
464
+ tokens << [:tSTRING_CONTENT, [unescape_string(current_line, quote_stack.last), range(start_offset, end_offset)]]
465
+ start_offset = end_offset
466
+ current_line = +""
467
+ adjustment = 0
468
+ end
469
+ end
274
470
  end
471
+ next
275
472
  when :tSTRING_DVAR
276
473
  value = nil
277
474
  when :tSTRING_END
278
- if token.type == :REGEXP_END
475
+ if token.type == :HEREDOC_END && value.end_with?("\n")
476
+ newline_length = value.end_with?("\r\n") ? 2 : 1
477
+ value = heredoc_stack.pop.identifier
478
+ location = range(token.location.start_offset, token.location.end_offset - newline_length)
479
+ elsif token.type == :REGEXP_END
279
480
  value = value[0]
280
- location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.start_offset + 1])
481
+ location = range(token.location.start_offset, token.location.start_offset + 1)
482
+ end
483
+
484
+ if percent_array?(quote_stack.pop)
485
+ prev_token = lexed[index - 2][0] if index - 2 >= 0
486
+ empty = %i[PERCENT_LOWER_I PERCENT_LOWER_W PERCENT_UPPER_I PERCENT_UPPER_W].include?(prev_token&.type)
487
+ ends_with_whitespace = prev_token&.type == :WORDS_SEP
488
+ # parser always emits a space token after content in a percent array, even if no actual whitespace is present.
489
+ if !empty && !ends_with_whitespace
490
+ tokens << [:tSPACE, [nil, range(token.location.start_offset, token.location.start_offset)]]
491
+ end
281
492
  end
282
493
  when :tSYMBEG
283
- if (next_token = lexed[index]) && next_token.type != :STRING_CONTENT && next_token.type != :EMBEXPR_BEGIN && next_token.type != :EMBVAR
494
+ if (next_token = lexed[index][0]) && next_token.type != :STRING_CONTENT && next_token.type != :EMBEXPR_BEGIN && next_token.type != :EMBVAR && next_token.type != :STRING_END
284
495
  next_location = token.location.join(next_token.location)
285
496
  type = :tSYMBOL
286
497
  value = next_token.value
287
498
  value = { "~@" => "~", "!@" => "!" }.fetch(value, value)
288
- location = Range.new(source_buffer, offset_cache[next_location.start_offset], offset_cache[next_location.end_offset])
499
+ location = range(next_location.start_offset, next_location.end_offset)
289
500
  index += 1
501
+ else
502
+ quote_stack.push(value)
290
503
  end
291
504
  when :tFID
292
- if tokens[-1][0] == :kDEF
505
+ if !tokens.empty? && tokens.dig(-1, 0) == :kDEF
293
506
  type = :tIDENTIFIER
294
507
  end
508
+ when :tXSTRING_BEG
509
+ if (next_token = lexed[index][0]) && !%i[STRING_CONTENT STRING_END EMBEXPR_BEGIN].include?(next_token.type)
510
+ # self.`()
511
+ type = :tBACK_REF2
512
+ end
513
+ quote_stack.push(value)
514
+ when :tSYMBOLS_BEG, :tQSYMBOLS_BEG, :tWORDS_BEG, :tQWORDS_BEG
515
+ if (next_token = lexed[index][0]) && next_token.type == :WORDS_SEP
516
+ index += 1
517
+ end
518
+
519
+ quote_stack.push(value)
520
+ when :tREGEXP_BEG
521
+ quote_stack.push(value)
295
522
  end
296
523
 
297
524
  tokens << [type, [value, location]]
298
525
 
299
526
  if token.type == :REGEXP_END
300
- tokens << [:tREGEXP_OPT, [token.value[1..], Range.new(source_buffer, offset_cache[token.location.start_offset + 1], offset_cache[token.location.end_offset])]]
527
+ tokens << [:tREGEXP_OPT, [token.value[1..], range(token.location.start_offset + 1, token.location.end_offset)]]
301
528
  end
302
529
  end
303
530
 
@@ -306,6 +533,25 @@ module Prism
306
533
 
307
534
  private
308
535
 
536
+ # Creates a new parser range, taking prism's byte offsets into account.
537
# Builds a range over `source_buffer` for the span between two prism byte
# offsets. Each endpoint is looked up in `offset_cache` before being handed
# to `Range.new`.
# NOTE(review): `Range` presumably resolves to Parser::Source::Range inside
# this class — confirm against the enclosing file.
def range(start_offset, end_offset)
  buffer = source_buffer
  range_begin = offset_cache[start_offset]
  range_end = offset_cache[end_offset]

  Range.new(buffer, range_begin, range_end)
end
540
+
541
+ # Parse an integer from the string representation.
542
# Converts the source text of an integer literal into an Integer.
#
# Anything Kernel#Integer understands is accepted (underscores, a sign,
# 0x/0b/0o/0d prefixes). Text that is not a valid integer literal yields 0
# instead of raising, so lexing of invalid source can continue.
#
# `exception: false` is used so that the expected "invalid literal" path
# does not go through raise/rescue.
def parse_integer(value)
  Integer(value, exception: false) || 0
end
547
+
548
+ # Parse a float from the string representation.
549
# Converts the source text of a float literal into a Float.
#
# Text that Kernel#Float cannot parse yields 0.0 instead of raising, so
# lexing of invalid source can continue.
#
# `exception: false` is used so that the expected "invalid literal" path
# does not go through raise/rescue.
def parse_float(value)
  Float(value, exception: false) || 0.0
end
554
+
309
555
  # Parse a complex from the string representation.
310
556
  def parse_complex(value)
311
557
  value.chomp!("i")
@@ -313,10 +559,12 @@ module Prism
313
559
  if value.end_with?("r")
314
560
  Complex(0, parse_rational(value))
315
561
  elsif value.start_with?(/0[BbOoDdXx]/)
316
- Complex(0, Integer(value))
562
+ Complex(0, parse_integer(value))
317
563
  else
318
564
  Complex(0, value)
319
565
  end
566
+ rescue ArgumentError
567
+ 0i
320
568
  end
321
569
 
322
570
  # Parse a rational from the string representation.
@@ -324,10 +572,239 @@ module Prism
324
572
  value.chomp!("r")
325
573
 
326
574
  if value.start_with?(/0[BbOoDdXx]/)
327
- Rational(Integer(value))
575
+ Rational(parse_integer(value))
328
576
  else
329
577
  Rational(value)
330
578
  end
579
+ rescue ArgumentError
580
+ 0r
581
+ end
582
+
583
+ # Wonky heredoc tab/spaces rules.
584
+ # https://github.com/ruby/prism/blob/v1.3.0/src/prism.c#L10548-L10558
585
# Scans `lexed` forward from `heredoc_token_index` and returns the smallest
# leading-whitespace width found on the heredoc's own content lines.
#
# Widths are measured in columns: a tab advances to the next multiple of 8,
# any other whitespace character counts as one column. Scanning stops at the
# heredoc end token that closes the heredoc being measured, or when the
# tokens run out. If no line contributes a width, Float::MAX is returned.
def calculate_heredoc_whitespace(heredoc_token_index)
  cursor = heredoc_token_index
  depth = 0
  last_counted_line = -1
  minimum = Float::MAX

  loop do
    current = lexed[cursor] && lexed[cursor][0]
    break unless current

    cursor += 1
    following = lexed[cursor] && lexed[cursor][0]
    at_line_start = current.location.start_column == 0

    case current.type
    when :HEREDOC_START, :EMBEXPR_BEGIN
      # Content inside nested heredocs and interpolation is ignored. When the
      # nested construct opens a line there is no string content to measure,
      # so there is no common whitespace at all.
      minimum = 0 if depth == 0 && at_line_start
      depth += 1
    when :HEREDOC_END, :EMBEXPR_END
      depth -= 1
      # Dropping below zero means this is the end of the heredoc itself.
      break if depth == -1
    when :STRING_CONTENT
      next unless depth == 0 && at_line_start

      width = current.value[/^\s*/].each_char.reduce(0) do |columns, char|
        char == "\t" ? (columns / 8 + 1) * 8 : columns + 1
      end

      line = current.location.start_line
      starts_new_line = line != last_counted_line
      # Whitespace is significant if followed by interpolation on the same line.
      blank = width == current.value.length && following&.location&.start_line != line

      if starts_new_line && !blank && width < minimum
        minimum = width
        last_counted_line = line
      end
    end
  end

  minimum
end
629
+
630
+ # Wonky heredoc tab/spaces rules.
631
+ # https://github.com/ruby/prism/blob/v1.3.0/src/prism.c#L16528-L16545
632
# Returns `string` with leading tabs and spaces removed, up to
# `heredoc.common_whitespace` columns.
#
# A tab advances the column count to the next multiple of 8. A tab that
# would overshoot the limit is kept, and trimming stops there.
def trim_heredoc_whitespace(string, heredoc)
  limit = heredoc.common_whitespace
  columns = 0
  cut = 0

  string.each_char do |char|
    break unless columns < limit

    case char
    when "\t"
      columns = (columns / 8 + 1) * 8
      break if columns > limit
    when " "
      columns += 1
    else
      break
    end

    cut += 1
  end

  string[cut..]
end
647
+
648
+ # Escape sequences that have special meaning and should appear unescaped in the resulting string.
649
+ ESCAPES = {
650
+ "a" => "\a", "b" => "\b", "e" => "\e", "f" => "\f",
651
+ "n" => "\n", "r" => "\r", "s" => "\s", "t" => "\t",
652
+ "v" => "\v", "\\" => "\\"
653
+ }.freeze
654
+ private_constant :ESCAPES
655
+
656
+ # When one of these delimiters is encountered, then the other
657
+ # one is allowed to be escaped as well.
658
+ DELIMITER_SYMETRY = { "[" => "]", "(" => ")", "{" => "}", "<" => ">" }.freeze
659
+ private_constant :DELIMITER_SYMETRY
660
+
661
+
662
+ # https://github.com/whitequark/parser/blob/v3.3.6.0/lib/parser/lexer-strings.rl#L14
663
# Characters that carry special meaning inside a regexp. Frozen like the
# neighbouring lookup constants so the shared array cannot be mutated.
REGEXP_META_CHARACTERS = ["\\", "$", "(", ")", "*", "+", ".", "<", ">", "?", "[", "]", "^", "{", "|", "}"].freeze
private_constant :REGEXP_META_CHARACTERS
665
+
666
+ # Apply Ruby string escaping rules
667
# Applies Ruby's string escaping rules to `string`, whose opening delimiter
# text is `quote` (for example `"`, `'`, `%w{`, `/` or `<<'`).
#
# Returns `string` itself when nothing has to change: single-quoted heredocs
# are literal, and a string without a backslash has nothing to unescape.
def unescape_string(string, quote)
  return string if quote == "<<'" || !string.include?("\\")

  # Enclosing character for the string. `"` for `"foo"`, `{` for `%w{foo}`, etc.
  delimiter = quote[-1]

  if regexp?(quote)
    # In a regexp only the delimiter itself may be unescaped, and only when it
    # has no special meaning in a regexp. All the symmetric delimiters are
    # meta characters, so they need no separate handling.
    return string if REGEXP_META_CHARACTERS.include?(delimiter)

    # There can never be an even amount of backslashes. It would be a syntax error.
    return string.gsub(/\\(#{Regexp.escape(delimiter)})/, '\1')
  end

  unless interpolation?(quote)
    # Non-interpolating strings: only a backslash or the delimiter pair can be escaped.
    delimiters = Regexp.escape("#{delimiter}#{DELIMITER_SYMETRY[delimiter]}")
    return string.gsub(/\\([\\#{delimiters}])/, '\1')
  end

  # Appending individual escape sequences may force the string out of its
  # intended encoding. Collect bytes in a binary string and restore the
  # source encoding at the end.
  unescaped = "".b
  scanner = StringScanner.new(string)

  while (skipped = scanner.skip_until(/\\/))
    # Copy everything before the backslash that was just found, then decode the escape.
    unescaped.append_as_bytes(string.byteslice(scanner.pos - skipped, skipped - 1))
    escape_read(unescaped, scanner, false, false)
  end

  # Whatever follows the last escape is copied as-is.
  unescaped.append_as_bytes(string.byteslice(scanner.pos..))
  unescaped.force_encoding(source_buffer.source.encoding)
end
708
+
709
+ # Certain strings are merged into a single string token.
710
# Decides whether a begin/content/end string triple is merged into a single
# string token, based on the content `value` and the opening `quote`.
#
# * `'` - only when the content has no newline.
# * `"` - only when every line that ends in a newline ends with a line
#   continuation (an odd number of trailing backslashes).
# * anything else (%q and similar) - never.
def simplify_string?(value, quote)
  return !value.include?("\n") if quote == "'"
  return false unless quote == '"'

  value.lines.all? do |line|
    next true unless line.end_with?("\n")

    trailing_backslashes = line[/(\\*)$/, 1]
    trailing_backslashes&.length&.odd?
  end
end
725
+
726
+ # Escape a byte value, given the control and meta flags.
727
# Applies the control and meta modifiers to a byte value and returns the
# result: `control` masks the value with 0x9f, `meta` then sets bit 0x80.
def escape_build(value, control, meta)
  byte = control ? value & 0x9f : value
  meta ? byte | 0x80 : byte
end
732
+
733
+ # Read an escape out of the string scanner, given the control and meta
734
+ # flags, and push the unescaped value into the result.
735
# Reads one escape sequence from `scanner` (positioned just after the
# backslash) and appends the decoded bytes to `result`.
#
# `control` and `meta` carry the \c / \C- and \M- modifiers seen so far; they
# are applied through `escape_build`. The branches are tried in order, and
# each one advances the scanner past what it consumed.
def escape_read(result, scanner, control, meta)
  if scanner.skip("\n")
    # Line continuation: contributes nothing to the result.
  elsif (simple = ESCAPES[scanner.peek(1)])
    # Simple single-character escape sequences like \n
    result.append_as_bytes(simple)
    scanner.pos += 1
  elsif (octal = scanner.scan(/[0-7]{1,3}/))
    # \nnn
    result.append_as_bytes(escape_build(octal.to_i(8), control, meta))
  elsif (hex = scanner.scan(/x[0-9a-fA-F]{1,2}/))
    # \xnn
    result.append_as_bytes(escape_build(hex[1..].to_i(16), control, meta))
  elsif (unicode = scanner.scan(/u[0-9a-fA-F]{4}/))
    # \unnnn
    result.append_as_bytes(unicode[1..].hex.chr(Encoding::UTF_8))
  elsif scanner.skip("u{}")
    # Empty \u{} produces nothing.
    # https://github.com/whitequark/parser/issues/856
  elsif (braced = scanner.scan(/u{.*?}/))
    # \u{nnnn ...}: one or more whitespace-separated code points.
    braced[2..-2].split.each do |codepoint|
      result.append_as_bytes(codepoint.hex.chr(Encoding::UTF_8))
    end
  elsif scanner.scan(/c\\?(?=[[:print:]])|C-\\?(?=[[:print:]])/)
    # \cx or \C-x: decode what follows with the control flag set.
    escape_read(result, scanner, true, meta)
  elsif scanner.scan(/M-\\?(?=[[:print:]])/)
    # \M-x: decode what follows with the meta flag set.
    escape_read(result, scanner, control, true)
  elsif (raw = scanner.get_byte)
    # Any other byte after the backslash. Under the control flag, "?" is
    # encoded as 0x7f with only the meta flag applied.
    code, apply_control = (control && raw == "?") ? [0x7f, false] : [raw.ord, control]
    result.append_as_bytes(escape_build(code, apply_control, meta))
  end
end
773
+
774
+ # In a percent array, certain whitespace can be preceded with a backslash,
775
+ # causing the following characters to be part of the previous element.
776
# Removes the escaping backslash from escaped whitespace inside a %-array
# element.
#
# For every run of backslashes directly followed by a whitespace character,
# one backslash is dropped when the run has an odd length (the last backslash
# escapes the whitespace). Runs of even length are left untouched.
#
# The whole run is captured with `(\\+)`. A quantified group such as `(\\)+`
# only retains its last repetition, so its length would always be 1 and the
# parity check could never see an even run.
def percent_array_unescape(string)
  string.gsub(/(\\+)([ \f\n\r\t\v])/) do
    backslashes = Regexp.last_match(1)
    whitespace = Regexp.last_match(2)

    backslashes = backslashes[1..] if backslashes.length.odd?
    "#{backslashes}#{whitespace}"
  end
end
782
+
783
+ # For %-arrays whitespace, the parser gem only considers whitespace before the newline.
784
# Returns how many characters of `string` come before its first newline.
# A string that starts with a newline counts as 1, and a string without any
# newline counts in full.
def percent_array_leading_whitespace(string)
  newline_at = string.index("\n")
  return string.length if newline_at.nil?

  newline_at.zero? ? 1 : newline_at
end
794
+
795
+ # Determine if characters preceded by a backslash should be escaped or not
796
# True when `quote` opens a literal that is neither single-quoted (ending in
# `'`) nor one of the %q, %w, %i, %s forms.
def interpolation?(quote)
  return false if quote.end_with?("'")

  !quote.start_with?("%q", "%w", "%i", "%s")
end
799
+
800
+ # Regexp allow interpolation but are handled differently during unescaping
801
# True when `quote` opens a regexp literal: either a bare `/` or a `%r` form.
def regexp?(quote)
  return true if quote == "/"

  quote.start_with?("%r")
end
804
+
805
+ # Determine if the string is part of a %-style array.
806
# True when `quote` opens one of the %-style array literals: %w, %W, %i or %I.
def percent_array?(quote)
  %w[%w %W %i %I].any? { |opener| quote.start_with?(opener) }
end
332
809
  end
333
810
  end