ed-precompiled_prism 1.5.2-arm64-darwin

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (159) hide show
  1. checksums.yaml +7 -0
  2. data/BSDmakefile +58 -0
  3. data/CHANGELOG.md +723 -0
  4. data/CODE_OF_CONDUCT.md +76 -0
  5. data/CONTRIBUTING.md +58 -0
  6. data/LICENSE.md +7 -0
  7. data/Makefile +110 -0
  8. data/README.md +143 -0
  9. data/config.yml +4714 -0
  10. data/docs/build_system.md +119 -0
  11. data/docs/configuration.md +68 -0
  12. data/docs/cruby_compilation.md +27 -0
  13. data/docs/design.md +53 -0
  14. data/docs/encoding.md +121 -0
  15. data/docs/fuzzing.md +88 -0
  16. data/docs/heredocs.md +36 -0
  17. data/docs/javascript.md +118 -0
  18. data/docs/local_variable_depth.md +229 -0
  19. data/docs/mapping.md +117 -0
  20. data/docs/parser_translation.md +24 -0
  21. data/docs/parsing_rules.md +22 -0
  22. data/docs/releasing.md +98 -0
  23. data/docs/relocation.md +34 -0
  24. data/docs/ripper_translation.md +72 -0
  25. data/docs/ruby_api.md +44 -0
  26. data/docs/ruby_parser_translation.md +19 -0
  27. data/docs/serialization.md +233 -0
  28. data/docs/testing.md +55 -0
  29. data/ext/prism/api_node.c +6941 -0
  30. data/ext/prism/api_pack.c +276 -0
  31. data/ext/prism/extconf.rb +127 -0
  32. data/ext/prism/extension.c +1419 -0
  33. data/ext/prism/extension.h +19 -0
  34. data/include/prism/ast.h +8220 -0
  35. data/include/prism/defines.h +260 -0
  36. data/include/prism/diagnostic.h +456 -0
  37. data/include/prism/encoding.h +283 -0
  38. data/include/prism/node.h +129 -0
  39. data/include/prism/options.h +482 -0
  40. data/include/prism/pack.h +163 -0
  41. data/include/prism/parser.h +933 -0
  42. data/include/prism/prettyprint.h +34 -0
  43. data/include/prism/regexp.h +43 -0
  44. data/include/prism/static_literals.h +121 -0
  45. data/include/prism/util/pm_buffer.h +236 -0
  46. data/include/prism/util/pm_char.h +204 -0
  47. data/include/prism/util/pm_constant_pool.h +218 -0
  48. data/include/prism/util/pm_integer.h +130 -0
  49. data/include/prism/util/pm_list.h +103 -0
  50. data/include/prism/util/pm_memchr.h +29 -0
  51. data/include/prism/util/pm_newline_list.h +113 -0
  52. data/include/prism/util/pm_string.h +200 -0
  53. data/include/prism/util/pm_strncasecmp.h +32 -0
  54. data/include/prism/util/pm_strpbrk.h +46 -0
  55. data/include/prism/version.h +29 -0
  56. data/include/prism.h +408 -0
  57. data/lib/prism/3.0/prism.bundle +0 -0
  58. data/lib/prism/3.1/prism.bundle +0 -0
  59. data/lib/prism/3.2/prism.bundle +0 -0
  60. data/lib/prism/3.3/prism.bundle +0 -0
  61. data/lib/prism/3.4/prism.bundle +0 -0
  62. data/lib/prism/compiler.rb +801 -0
  63. data/lib/prism/desugar_compiler.rb +392 -0
  64. data/lib/prism/dispatcher.rb +2210 -0
  65. data/lib/prism/dot_visitor.rb +4762 -0
  66. data/lib/prism/dsl.rb +1003 -0
  67. data/lib/prism/ffi.rb +570 -0
  68. data/lib/prism/inspect_visitor.rb +2392 -0
  69. data/lib/prism/lex_compat.rb +928 -0
  70. data/lib/prism/mutation_compiler.rb +772 -0
  71. data/lib/prism/node.rb +18816 -0
  72. data/lib/prism/node_ext.rb +511 -0
  73. data/lib/prism/pack.rb +230 -0
  74. data/lib/prism/parse_result/comments.rb +188 -0
  75. data/lib/prism/parse_result/errors.rb +66 -0
  76. data/lib/prism/parse_result/newlines.rb +155 -0
  77. data/lib/prism/parse_result.rb +911 -0
  78. data/lib/prism/pattern.rb +269 -0
  79. data/lib/prism/polyfill/append_as_bytes.rb +15 -0
  80. data/lib/prism/polyfill/byteindex.rb +13 -0
  81. data/lib/prism/polyfill/scan_byte.rb +14 -0
  82. data/lib/prism/polyfill/unpack1.rb +14 -0
  83. data/lib/prism/polyfill/warn.rb +36 -0
  84. data/lib/prism/reflection.rb +416 -0
  85. data/lib/prism/relocation.rb +505 -0
  86. data/lib/prism/serialize.rb +2398 -0
  87. data/lib/prism/string_query.rb +31 -0
  88. data/lib/prism/translation/parser/builder.rb +62 -0
  89. data/lib/prism/translation/parser/compiler.rb +2234 -0
  90. data/lib/prism/translation/parser/lexer.rb +820 -0
  91. data/lib/prism/translation/parser.rb +374 -0
  92. data/lib/prism/translation/parser33.rb +13 -0
  93. data/lib/prism/translation/parser34.rb +13 -0
  94. data/lib/prism/translation/parser35.rb +13 -0
  95. data/lib/prism/translation/parser_current.rb +24 -0
  96. data/lib/prism/translation/ripper/sexp.rb +126 -0
  97. data/lib/prism/translation/ripper/shim.rb +5 -0
  98. data/lib/prism/translation/ripper.rb +3474 -0
  99. data/lib/prism/translation/ruby_parser.rb +1929 -0
  100. data/lib/prism/translation.rb +16 -0
  101. data/lib/prism/visitor.rb +813 -0
  102. data/lib/prism.rb +97 -0
  103. data/prism.gemspec +174 -0
  104. data/rbi/prism/compiler.rbi +12 -0
  105. data/rbi/prism/dsl.rbi +524 -0
  106. data/rbi/prism/inspect_visitor.rbi +12 -0
  107. data/rbi/prism/node.rbi +8734 -0
  108. data/rbi/prism/node_ext.rbi +107 -0
  109. data/rbi/prism/parse_result.rbi +404 -0
  110. data/rbi/prism/reflection.rbi +58 -0
  111. data/rbi/prism/string_query.rbi +12 -0
  112. data/rbi/prism/translation/parser.rbi +11 -0
  113. data/rbi/prism/translation/parser33.rbi +6 -0
  114. data/rbi/prism/translation/parser34.rbi +6 -0
  115. data/rbi/prism/translation/parser35.rbi +6 -0
  116. data/rbi/prism/translation/ripper.rbi +15 -0
  117. data/rbi/prism/visitor.rbi +473 -0
  118. data/rbi/prism.rbi +66 -0
  119. data/sig/prism/compiler.rbs +9 -0
  120. data/sig/prism/dispatcher.rbs +19 -0
  121. data/sig/prism/dot_visitor.rbs +6 -0
  122. data/sig/prism/dsl.rbs +351 -0
  123. data/sig/prism/inspect_visitor.rbs +22 -0
  124. data/sig/prism/lex_compat.rbs +10 -0
  125. data/sig/prism/mutation_compiler.rbs +159 -0
  126. data/sig/prism/node.rbs +4028 -0
  127. data/sig/prism/node_ext.rbs +149 -0
  128. data/sig/prism/pack.rbs +43 -0
  129. data/sig/prism/parse_result/comments.rbs +38 -0
  130. data/sig/prism/parse_result.rbs +196 -0
  131. data/sig/prism/pattern.rbs +13 -0
  132. data/sig/prism/reflection.rbs +50 -0
  133. data/sig/prism/relocation.rbs +185 -0
  134. data/sig/prism/serialize.rbs +8 -0
  135. data/sig/prism/string_query.rbs +11 -0
  136. data/sig/prism/visitor.rbs +169 -0
  137. data/sig/prism.rbs +254 -0
  138. data/src/diagnostic.c +850 -0
  139. data/src/encoding.c +5235 -0
  140. data/src/node.c +8676 -0
  141. data/src/options.c +328 -0
  142. data/src/pack.c +509 -0
  143. data/src/prettyprint.c +8941 -0
  144. data/src/prism.c +23361 -0
  145. data/src/regexp.c +790 -0
  146. data/src/serialize.c +2268 -0
  147. data/src/static_literals.c +617 -0
  148. data/src/token_type.c +703 -0
  149. data/src/util/pm_buffer.c +357 -0
  150. data/src/util/pm_char.c +318 -0
  151. data/src/util/pm_constant_pool.c +342 -0
  152. data/src/util/pm_integer.c +670 -0
  153. data/src/util/pm_list.c +49 -0
  154. data/src/util/pm_memchr.c +35 -0
  155. data/src/util/pm_newline_list.c +125 -0
  156. data/src/util/pm_string.c +381 -0
  157. data/src/util/pm_strncasecmp.c +36 -0
  158. data/src/util/pm_strpbrk.c +206 -0
  159. metadata +202 -0
@@ -0,0 +1,820 @@
1
+ # frozen_string_literal: true
2
+ # :markup: markdown
3
+
4
+ require "strscan"
5
+ require_relative "../../polyfill/append_as_bytes"
6
+ require_relative "../../polyfill/scan_byte"
7
+
8
+ module Prism
9
+ module Translation
10
+ class Parser
11
+ # Accepts a list of prism tokens and converts them into the expected
12
+ # format for the parser gem.
13
+ class Lexer
14
+ # These tokens are always skipped
15
+ TYPES_ALWAYS_SKIP = Set.new(%i[IGNORED_NEWLINE __END__ EOF])
16
+ private_constant :TYPES_ALWAYS_SKIP
17
+
18
+ # The direct translating of types between the two lexers.
19
+ TYPES = {
20
+ # These tokens should never appear in the output of the lexer.
21
+ MISSING: nil,
22
+ NOT_PROVIDED: nil,
23
+ EMBDOC_END: nil,
24
+ EMBDOC_LINE: nil,
25
+
26
+ # These tokens have more or less direct mappings.
27
+ AMPERSAND: :tAMPER2,
28
+ AMPERSAND_AMPERSAND: :tANDOP,
29
+ AMPERSAND_AMPERSAND_EQUAL: :tOP_ASGN,
30
+ AMPERSAND_DOT: :tANDDOT,
31
+ AMPERSAND_EQUAL: :tOP_ASGN,
32
+ BACK_REFERENCE: :tBACK_REF,
33
+ BACKTICK: :tXSTRING_BEG,
34
+ BANG: :tBANG,
35
+ BANG_EQUAL: :tNEQ,
36
+ BANG_TILDE: :tNMATCH,
37
+ BRACE_LEFT: :tLCURLY,
38
+ BRACE_RIGHT: :tRCURLY,
39
+ BRACKET_LEFT: :tLBRACK2,
40
+ BRACKET_LEFT_ARRAY: :tLBRACK,
41
+ BRACKET_LEFT_RIGHT: :tAREF,
42
+ BRACKET_LEFT_RIGHT_EQUAL: :tASET,
43
+ BRACKET_RIGHT: :tRBRACK,
44
+ CARET: :tCARET,
45
+ CARET_EQUAL: :tOP_ASGN,
46
+ CHARACTER_LITERAL: :tCHARACTER,
47
+ CLASS_VARIABLE: :tCVAR,
48
+ COLON: :tCOLON,
49
+ COLON_COLON: :tCOLON2,
50
+ COMMA: :tCOMMA,
51
+ COMMENT: :tCOMMENT,
52
+ CONSTANT: :tCONSTANT,
53
+ DOT: :tDOT,
54
+ DOT_DOT: :tDOT2,
55
+ DOT_DOT_DOT: :tDOT3,
56
+ EMBDOC_BEGIN: :tCOMMENT,
57
+ EMBEXPR_BEGIN: :tSTRING_DBEG,
58
+ EMBEXPR_END: :tSTRING_DEND,
59
+ EMBVAR: :tSTRING_DVAR,
60
+ EQUAL: :tEQL,
61
+ EQUAL_EQUAL: :tEQ,
62
+ EQUAL_EQUAL_EQUAL: :tEQQ,
63
+ EQUAL_GREATER: :tASSOC,
64
+ EQUAL_TILDE: :tMATCH,
65
+ FLOAT: :tFLOAT,
66
+ FLOAT_IMAGINARY: :tIMAGINARY,
67
+ FLOAT_RATIONAL: :tRATIONAL,
68
+ FLOAT_RATIONAL_IMAGINARY: :tIMAGINARY,
69
+ GLOBAL_VARIABLE: :tGVAR,
70
+ GREATER: :tGT,
71
+ GREATER_EQUAL: :tGEQ,
72
+ GREATER_GREATER: :tRSHFT,
73
+ GREATER_GREATER_EQUAL: :tOP_ASGN,
74
+ HEREDOC_START: :tSTRING_BEG,
75
+ HEREDOC_END: :tSTRING_END,
76
+ IDENTIFIER: :tIDENTIFIER,
77
+ INSTANCE_VARIABLE: :tIVAR,
78
+ INTEGER: :tINTEGER,
79
+ INTEGER_IMAGINARY: :tIMAGINARY,
80
+ INTEGER_RATIONAL: :tRATIONAL,
81
+ INTEGER_RATIONAL_IMAGINARY: :tIMAGINARY,
82
+ KEYWORD_ALIAS: :kALIAS,
83
+ KEYWORD_AND: :kAND,
84
+ KEYWORD_BEGIN: :kBEGIN,
85
+ KEYWORD_BEGIN_UPCASE: :klBEGIN,
86
+ KEYWORD_BREAK: :kBREAK,
87
+ KEYWORD_CASE: :kCASE,
88
+ KEYWORD_CLASS: :kCLASS,
89
+ KEYWORD_DEF: :kDEF,
90
+ KEYWORD_DEFINED: :kDEFINED,
91
+ KEYWORD_DO: :kDO,
92
+ KEYWORD_DO_LOOP: :kDO_COND,
93
+ KEYWORD_END: :kEND,
94
+ KEYWORD_END_UPCASE: :klEND,
95
+ KEYWORD_ENSURE: :kENSURE,
96
+ KEYWORD_ELSE: :kELSE,
97
+ KEYWORD_ELSIF: :kELSIF,
98
+ KEYWORD_FALSE: :kFALSE,
99
+ KEYWORD_FOR: :kFOR,
100
+ KEYWORD_IF: :kIF,
101
+ KEYWORD_IF_MODIFIER: :kIF_MOD,
102
+ KEYWORD_IN: :kIN,
103
+ KEYWORD_MODULE: :kMODULE,
104
+ KEYWORD_NEXT: :kNEXT,
105
+ KEYWORD_NIL: :kNIL,
106
+ KEYWORD_NOT: :kNOT,
107
+ KEYWORD_OR: :kOR,
108
+ KEYWORD_REDO: :kREDO,
109
+ KEYWORD_RESCUE: :kRESCUE,
110
+ KEYWORD_RESCUE_MODIFIER: :kRESCUE_MOD,
111
+ KEYWORD_RETRY: :kRETRY,
112
+ KEYWORD_RETURN: :kRETURN,
113
+ KEYWORD_SELF: :kSELF,
114
+ KEYWORD_SUPER: :kSUPER,
115
+ KEYWORD_THEN: :kTHEN,
116
+ KEYWORD_TRUE: :kTRUE,
117
+ KEYWORD_UNDEF: :kUNDEF,
118
+ KEYWORD_UNLESS: :kUNLESS,
119
+ KEYWORD_UNLESS_MODIFIER: :kUNLESS_MOD,
120
+ KEYWORD_UNTIL: :kUNTIL,
121
+ KEYWORD_UNTIL_MODIFIER: :kUNTIL_MOD,
122
+ KEYWORD_WHEN: :kWHEN,
123
+ KEYWORD_WHILE: :kWHILE,
124
+ KEYWORD_WHILE_MODIFIER: :kWHILE_MOD,
125
+ KEYWORD_YIELD: :kYIELD,
126
+ KEYWORD___ENCODING__: :k__ENCODING__,
127
+ KEYWORD___FILE__: :k__FILE__,
128
+ KEYWORD___LINE__: :k__LINE__,
129
+ LABEL: :tLABEL,
130
+ LABEL_END: :tLABEL_END,
131
+ LAMBDA_BEGIN: :tLAMBEG,
132
+ LESS: :tLT,
133
+ LESS_EQUAL: :tLEQ,
134
+ LESS_EQUAL_GREATER: :tCMP,
135
+ LESS_LESS: :tLSHFT,
136
+ LESS_LESS_EQUAL: :tOP_ASGN,
137
+ METHOD_NAME: :tFID,
138
+ MINUS: :tMINUS,
139
+ MINUS_EQUAL: :tOP_ASGN,
140
+ MINUS_GREATER: :tLAMBDA,
141
+ NEWLINE: :tNL,
142
+ NUMBERED_REFERENCE: :tNTH_REF,
143
+ PARENTHESIS_LEFT: :tLPAREN2,
144
+ PARENTHESIS_LEFT_PARENTHESES: :tLPAREN_ARG,
145
+ PARENTHESIS_RIGHT: :tRPAREN,
146
+ PERCENT: :tPERCENT,
147
+ PERCENT_EQUAL: :tOP_ASGN,
148
+ PERCENT_LOWER_I: :tQSYMBOLS_BEG,
149
+ PERCENT_LOWER_W: :tQWORDS_BEG,
150
+ PERCENT_UPPER_I: :tSYMBOLS_BEG,
151
+ PERCENT_UPPER_W: :tWORDS_BEG,
152
+ PERCENT_LOWER_X: :tXSTRING_BEG,
153
+ PLUS: :tPLUS,
154
+ PLUS_EQUAL: :tOP_ASGN,
155
+ PIPE_EQUAL: :tOP_ASGN,
156
+ PIPE: :tPIPE,
157
+ PIPE_PIPE: :tOROP,
158
+ PIPE_PIPE_EQUAL: :tOP_ASGN,
159
+ QUESTION_MARK: :tEH,
160
+ REGEXP_BEGIN: :tREGEXP_BEG,
161
+ REGEXP_END: :tSTRING_END,
162
+ SEMICOLON: :tSEMI,
163
+ SLASH: :tDIVIDE,
164
+ SLASH_EQUAL: :tOP_ASGN,
165
+ STAR: :tSTAR2,
166
+ STAR_EQUAL: :tOP_ASGN,
167
+ STAR_STAR: :tPOW,
168
+ STAR_STAR_EQUAL: :tOP_ASGN,
169
+ STRING_BEGIN: :tSTRING_BEG,
170
+ STRING_CONTENT: :tSTRING_CONTENT,
171
+ STRING_END: :tSTRING_END,
172
+ SYMBOL_BEGIN: :tSYMBEG,
173
+ TILDE: :tTILDE,
174
+ UAMPERSAND: :tAMPER,
175
+ UCOLON_COLON: :tCOLON3,
176
+ UDOT_DOT: :tBDOT2,
177
+ UDOT_DOT_DOT: :tBDOT3,
178
+ UMINUS: :tUMINUS,
179
+ UMINUS_NUM: :tUNARY_NUM,
180
+ UPLUS: :tUPLUS,
181
+ USTAR: :tSTAR,
182
+ USTAR_STAR: :tDSTAR,
183
+ WORDS_SEP: :tSPACE
184
+ }
185
+
186
+ # These constants represent flags in our lex state. We really, really
187
+ # don't want to be using them and we really, really don't want to be
188
+ # exposing them as part of our public API. Unfortunately, we don't have
189
+ # another way of matching the exact tokens that the parser gem expects
190
+ # without them. We should find another way to do this, but in the
191
+ # meantime we'll hide them from the documentation and mark them as
192
+ # private constants.
193
+ EXPR_BEG = 0x1 # :nodoc:
194
+ EXPR_LABEL = 0x400 # :nodoc:
195
+
196
+ # It is used to determine whether `do` is of the token type `kDO` or `kDO_LAMBDA`.
197
+ #
198
+ # NOTE: In edge cases like `-> (foo = -> (bar) {}) do end`, please note that `kDO` is still returned
199
+ # instead of `kDO_LAMBDA`, which is expected: https://github.com/ruby/prism/pull/3046
200
+ LAMBDA_TOKEN_TYPES = Set.new([:kDO_LAMBDA, :tLAMBDA, :tLAMBEG])
201
+
202
+ # The `PARENTHESIS_LEFT` token in Prism is classified as either `tLPAREN` or `tLPAREN2` in the Parser gem.
203
+ # The following token types are listed as those classified as `tLPAREN`.
204
+ LPAREN_CONVERSION_TOKEN_TYPES = Set.new([
205
+ :kBREAK, :tCARET, :kCASE, :tDIVIDE, :kFOR, :kIF, :kNEXT, :kRETURN, :kUNTIL, :kWHILE, :tAMPER, :tANDOP, :tBANG, :tCOMMA, :tDOT2, :tDOT3,
206
+ :tEQL, :tLPAREN, :tLPAREN2, :tLPAREN_ARG, :tLSHFT, :tNL, :tOP_ASGN, :tOROP, :tPIPE, :tSEMI, :tSTRING_DBEG, :tUMINUS, :tUPLUS, :tLCURLY
207
+ ])
208
+
209
+ # Types of tokens that are allowed to continue a method call with comments in-between.
210
+ # For these, the parser gem doesn't emit a newline token after the last comment.
211
+ COMMENT_CONTINUATION_TYPES = Set.new([:COMMENT, :AMPERSAND_DOT, :DOT])
212
+ private_constant :COMMENT_CONTINUATION_TYPES
213
+
214
+ # Heredocs are complex and require us to keep track of a bit of info to refer to later
215
+ HeredocData = Struct.new(:identifier, :common_whitespace, keyword_init: true)
216
+
217
+ private_constant :TYPES, :EXPR_BEG, :EXPR_LABEL, :LAMBDA_TOKEN_TYPES, :LPAREN_CONVERSION_TOKEN_TYPES, :HeredocData
218
+
219
+ # The Parser::Source::Buffer that the tokens were lexed from.
220
+ attr_reader :source_buffer
221
+
222
+ # An array of tuples that contain prism tokens and their associated lex
223
+ # state when they were lexed.
224
+ attr_reader :lexed
225
+
226
+ # A hash that maps offsets in bytes to offsets in characters.
227
+ attr_reader :offset_cache
228
+
229
+ # Initialize the lexer with the given source buffer, prism tokens, and
230
+ # offset cache.
231
# Stores the three collaborators unchanged; nothing is copied or validated.
#
# source_buffer - exposed through the `source_buffer` reader.
# lexed         - exposed through the `lexed` reader.
# offset_cache  - exposed through the `offset_cache` reader.
def initialize(source_buffer, lexed, offset_cache)
  @source_buffer, @lexed, @offset_cache = source_buffer, lexed, offset_cache
end
236
+
237
+ Range = ::Parser::Source::Range # :nodoc:
238
+ private_constant :Range
239
+
240
+ # Convert the prism tokens into the expected format for the parser gem.
241
+ def to_a
# Iterates over `lexed` (pairs of prism token and lex state) and returns an
# array of entries shaped `[type, [value, location]]`. Tokens whose type is
# in TYPES_ALWAYS_SKIP are dropped; every other type is translated through
# TYPES and then refined by the `case` below. Several branches look ahead in
# `lexed` and advance `index` to merge multiple prism tokens into one entry,
# and several edit the token's value string in place (`chomp!`,
# `delete_prefix!`).
242
+ tokens = []
243
+
244
+ index = 0
245
+ length = lexed.length
246
+
247
# heredoc_stack holds a HeredocData per open heredoc; quote_stack holds the
# opening delimiter of each open string-like literal and is read when
# content is unescaped.
+ heredoc_stack = []
248
+ quote_stack = []
249
+
250
+ # The parser gem emits the newline tokens for comments out of order. This saves
251
+ # that token location to emit at a later time to properly line everything up.
252
+ # https://github.com/whitequark/parser/issues/1025
253
+ comment_newline_location = nil
254
+
255
+ while index < length
256
+ token, state = lexed[index]
257
+ index += 1
258
+ next if TYPES_ALWAYS_SKIP.include?(token.type)
259
+
260
# fetch raises KeyError when the prism type has no entry in TYPES.
+ type = TYPES.fetch(token.type)
261
+ value = token.value
262
+ location = range(token.location.start_offset, token.location.end_offset)
263
+
264
+ case type
265
+ when :kDO
266
# The block parameter `token` is local to the block and shadows the outer
# `token`. The most recently emitted lambda-related entry decides whether
# this `do` becomes kDO_LAMBDA.
+ nearest_lambda_token = tokens.reverse_each.find do |token|
267
+ LAMBDA_TOKEN_TYPES.include?(token.first)
268
+ end
269
+
270
+ if nearest_lambda_token&.first == :tLAMBDA
271
+ type = :kDO_LAMBDA
272
+ end
273
+ when :tCHARACTER
274
+ value.delete_prefix!("?")
275
+ # Character literals behave similar to double-quoted strings. We can use the same escaping mechanism.
276
+ value = unescape_string(value, "?")
277
+ when :tCOMMENT
278
+ if token.type == :EMBDOC_BEGIN
279
+
280
# Append the value of each following token until EMBDOC_END (or the
# second-to-last index) is reached; the token that stopped the loop is
# appended right after it.
# NOTE(review): next_token is dereferenced without a nil check, presumably
# because EMBDOC_BEGIN is never the final token - confirm.
+ while !((next_token = lexed[index]&.first) && next_token.type == :EMBDOC_END) && (index < length - 1)
281
+ value += next_token.value
282
+ index += 1
283
+ end
284
+
285
+ value += next_token.value
286
+ location = range(token.location.start_offset, next_token.location.end_offset)
287
+ index += 1
288
+ else
289
# chomp! returns nil when there was no trailing newline to strip, so
# is_at_eol is true only for a comment that ends without a newline.
+ is_at_eol = value.chomp!.nil?
290
+ location = range(token.location.start_offset, token.location.end_offset + (is_at_eol ? 0 : -1))
291
+
292
+ prev_token, _ = lexed[index - 2] if index - 2 >= 0
293
+ next_token, _ = lexed[index]
294
+
295
+ is_inline_comment = prev_token&.location&.start_line == token.location.start_line
296
+ if is_inline_comment && !is_at_eol && !COMMENT_CONTINUATION_TYPES.include?(next_token&.type)
297
+ tokens << [:tCOMMENT, [value, location]]
298
+
299
+ nl_location = range(token.location.end_offset - 1, token.location.end_offset)
300
+ tokens << [:tNL, [nil, nl_location]]
301
+ next
302
+ elsif is_inline_comment && next_token&.type == :COMMENT
303
+ comment_newline_location = range(token.location.end_offset - 1, token.location.end_offset)
304
+ elsif comment_newline_location && !COMMENT_CONTINUATION_TYPES.include?(next_token&.type)
305
+ tokens << [:tCOMMENT, [value, location]]
306
+ tokens << [:tNL, [nil, comment_newline_location]]
307
+ comment_newline_location = nil
308
+ next
309
+ end
310
+ end
311
+ when :tNL
312
+ next_token, _ = lexed[index]
313
+ # Newlines after comments are emitted out of order.
314
+ if next_token&.type == :COMMENT
315
+ comment_newline_location = location
316
+ next
317
+ end
318
+
319
+ value = nil
320
+ when :tFLOAT
321
+ value = parse_float(value)
322
+ when :tIMAGINARY
323
+ value = parse_complex(value)
324
+ when :tINTEGER
325
# A leading "+" becomes its own tUNARY_NUM entry and the integer's range
# starts one offset later; the value is still parsed with the sign attached.
+ if value.start_with?("+")
326
+ tokens << [:tUNARY_NUM, ["+", range(token.location.start_offset, token.location.start_offset + 1)]]
327
+ location = range(token.location.start_offset + 1, token.location.end_offset)
328
+ end
329
+
330
+ value = parse_integer(value)
331
+ when :tLABEL
332
+ value.chomp!(":")
333
+ when :tLABEL_END
334
+ value.chomp!(":")
335
+ when :tLCURLY
336
# `|` binds tighter than `==`, so this compares state with the combined
# EXPR_BEG | EXPR_LABEL flags.
+ type = :tLBRACE if state == EXPR_BEG | EXPR_LABEL
337
+ when :tLPAREN2
338
+ type = :tLPAREN if tokens.empty? || LPAREN_CONVERSION_TOKEN_TYPES.include?(tokens.dig(-1, 0))
339
+ when :tNTH_REF
340
+ value = parse_integer(value.delete_prefix("$"))
341
+ when :tOP_ASGN
342
+ value.chomp!("=")
343
+ when :tRATIONAL
344
+ value = parse_rational(value)
345
+ when :tSPACE
346
+ location = range(token.location.start_offset, token.location.start_offset + percent_array_leading_whitespace(value))
347
+ value = nil
348
+ when :tSTRING_BEG
349
+ next_token, _ = lexed[index]
350
+ next_next_token, _ = lexed[index + 1]
351
+ basic_quotes = value == '"' || value == "'"
352
+
353
+ if basic_quotes && next_token&.type == :STRING_END
354
+ next_location = token.location.join(next_token.location)
355
+ type = :tSTRING
356
+ value = ""
357
+ location = range(next_location.start_offset, next_location.end_offset)
358
+ index += 1
359
+ elsif value.start_with?("'", '"', "%")
360
+ if next_token&.type == :STRING_CONTENT && next_next_token&.type == :STRING_END
361
+ string_value = next_token.value
362
+ if simplify_string?(string_value, value)
363
+ next_location = token.location.join(next_next_token.location)
364
+ if percent_array?(value)
365
+ value = percent_array_unescape(string_value)
366
+ else
367
+ value = unescape_string(string_value, value)
368
+ end
369
+ type = :tSTRING
370
+ location = range(next_location.start_offset, next_location.end_offset)
371
+ index += 2
372
+ tokens << [type, [value, location]]
373
+
374
+ next
375
+ end
376
+ end
377
+
378
+ quote_stack.push(value)
379
+ elsif token.type == :HEREDOC_START
380
+ quote = value[2] == "-" || value[2] == "~" ? value[3] : value[2]
381
+ heredoc_type = value[2] == "-" || value[2] == "~" ? value[2] : ""
382
+ heredoc = HeredocData.new(
383
+ identifier: value.match(/<<[-~]?["'`]?(?<heredoc_identifier>.*?)["'`]?\z/)[:heredoc_identifier],
384
+ common_whitespace: 0,
385
+ )
386
+
387
+ if quote == "`"
388
+ type = :tXSTRING_BEG
389
+ end
390
+
391
+ # The parser gem trims whitespace from squiggly heredocs. We must record
392
+ # the most common whitespace to later remove.
393
# NOTE(review): heredoc_type is assigned only "-", "~" or "" above, so the
# comparison with "`" below cannot be true.
+ if heredoc_type == "~" || heredoc_type == "`"
394
+ heredoc.common_whitespace = calculate_heredoc_whitespace(index)
395
+ end
396
+
397
+ if quote == "'" || quote == '"' || quote == "`"
398
+ value = "<<#{quote}"
399
+ else
400
+ value = '<<"'
401
+ end
402
+
403
+ heredoc_stack.push(heredoc)
404
+ quote_stack.push(value)
405
+ end
406
+ when :tSTRING_CONTENT
407
# This branch appends its own tSTRING_CONTENT entries and always finishes
# with `next`, bypassing the generic append after the `case`.
+ is_percent_array = percent_array?(quote_stack.last)
408
+
409
+ if (lines = token.value.lines).one?
410
+ # Prism usually emits a single token for strings with line continuations.
411
+ # For squiggly heredocs they are not joined so we do that manually here.
412
+ current_string = +""
413
+ current_length = 0
414
+ start_offset = token.location.start_offset
415
+ while token.type == :STRING_CONTENT
416
+ current_length += token.value.bytesize
417
+ # Heredoc interpolation can have multiple STRING_CONTENT nodes on the same line.
418
+ prev_token, _ = lexed[index - 2] if index - 2 >= 0
419
+ is_first_token_on_line = prev_token && token.location.start_line != prev_token.location.start_line
420
+ # The parser gem only removes indentation when the heredoc is not nested
421
+ not_nested = heredoc_stack.size == 1
422
+ if is_percent_array
423
+ value = percent_array_unescape(token.value)
424
+ elsif is_first_token_on_line && not_nested && (current_heredoc = heredoc_stack.last).common_whitespace > 0
425
+ value = trim_heredoc_whitespace(token.value, current_heredoc)
426
+ end
427
+
428
# NOTE(review): `value` is reassigned only in the two branches above; on a
# later iteration where neither applies it still holds the earlier value
# rather than the current token's - confirm this is intended.
+ current_string << unescape_string(value, quote_stack.last)
429
+ relevant_backslash_count = if quote_stack.last.start_with?("%W", "%I")
430
+ 0 # the last backslash escapes the newline
431
+ else
432
+ token.value[/(\\{1,})\n/, 1]&.length || 0
433
+ end
434
+ if relevant_backslash_count.even? || !interpolation?(quote_stack.last)
435
+ tokens << [:tSTRING_CONTENT, [current_string, range(start_offset, start_offset + current_length)]]
436
+ break
437
+ end
438
+ token, _ = lexed[index]
439
+ index += 1
440
+ end
441
+ else
442
+ # When the parser gem encounters a line continuation inside of a multiline string,
443
+ # it emits a single string node. The backslash (and remaining newline) is removed.
444
+ current_line = +""
445
+ adjustment = 0
446
+ start_offset = token.location.start_offset
447
+ emit = false
448
+
449
# The block-local `index` here is the position within `lines`, not the
# outer token index.
+ lines.each.with_index do |line, index|
450
+ chomped_line = line.chomp
451
+ backslash_count = chomped_line[/\\{1,}\z/]&.length || 0
452
+ is_interpolation = interpolation?(quote_stack.last)
453
+
454
+ if backslash_count.odd? && (is_interpolation || is_percent_array)
455
+ if is_percent_array
456
+ current_line << percent_array_unescape(line)
457
+ adjustment += 1
458
+ else
459
+ chomped_line.delete_suffix!("\\")
460
+ current_line << chomped_line
461
+ adjustment += 2
462
+ end
463
+ # If the string ends with a line continuation emit the remainder
464
+ emit = index == lines.count - 1
465
+ else
466
+ current_line << line
467
+ emit = true
468
+ end
469
+
470
+ if emit
471
+ end_offset = start_offset + current_line.bytesize + adjustment
472
+ tokens << [:tSTRING_CONTENT, [unescape_string(current_line, quote_stack.last), range(start_offset, end_offset)]]
473
+ start_offset = end_offset
474
+ current_line = +""
475
+ adjustment = 0
476
+ end
477
+ end
478
+ end
479
+ next
480
+ when :tSTRING_DVAR
481
+ value = nil
482
+ when :tSTRING_END
483
+ if token.type == :HEREDOC_END && value.end_with?("\n")
484
+ newline_length = value.end_with?("\r\n") ? 2 : 1
485
+ value = heredoc_stack.pop.identifier
486
+ location = range(token.location.start_offset, token.location.end_offset - newline_length)
487
+ elsif token.type == :REGEXP_END
488
# Keep only the closing delimiter here; the characters after it are emitted
# as tREGEXP_OPT once the tSTRING_END entry has been appended.
+ value = value[0]
489
+ location = range(token.location.start_offset, token.location.start_offset + 1)
490
+ end
491
+
492
+ if percent_array?(quote_stack.pop)
493
+ prev_token, _ = lexed[index - 2] if index - 2 >= 0
494
+ empty = %i[PERCENT_LOWER_I PERCENT_LOWER_W PERCENT_UPPER_I PERCENT_UPPER_W].include?(prev_token&.type)
495
+ ends_with_whitespace = prev_token&.type == :WORDS_SEP
496
+ # parser always emits a space token after content in a percent array, even if no actual whitespace is present.
497
+ if !empty && !ends_with_whitespace
498
+ tokens << [:tSPACE, [nil, range(token.location.start_offset, token.location.start_offset)]]
499
+ end
500
+ end
501
+ when :tSYMBEG
502
+ if (next_token = lexed[index]&.first) && next_token.type != :STRING_CONTENT && next_token.type != :EMBEXPR_BEGIN && next_token.type != :EMBVAR && next_token.type != :STRING_END
503
+ next_location = token.location.join(next_token.location)
504
+ type = :tSYMBOL
505
+ value = next_token.value
506
+ value = { "~@" => "~", "!@" => "!" }.fetch(value, value)
507
+ location = range(next_location.start_offset, next_location.end_offset)
508
+ index += 1
509
+ else
510
+ quote_stack.push(value)
511
+ end
512
+ when :tFID
513
+ if !tokens.empty? && tokens.dig(-1, 0) == :kDEF
514
+ type = :tIDENTIFIER
515
+ end
516
+ when :tXSTRING_BEG
517
+ if (next_token = lexed[index]&.first) && !%i[STRING_CONTENT STRING_END EMBEXPR_BEGIN].include?(next_token.type)
518
+ # self.`()
519
+ type = :tBACK_REF2
520
+ end
521
+ quote_stack.push(value)
522
+ when :tSYMBOLS_BEG, :tQSYMBOLS_BEG, :tWORDS_BEG, :tQWORDS_BEG
523
+ if (next_token = lexed[index]&.first) && next_token.type == :WORDS_SEP
524
+ index += 1
525
+ end
526
+
527
+ quote_stack.push(value)
528
+ when :tREGEXP_BEG
529
+ quote_stack.push(value)
530
+ end
531
+
532
+ tokens << [type, [value, location]]
533
+
534
+ if token.type == :REGEXP_END
535
+ tokens << [:tREGEXP_OPT, [token.value[1..], range(token.location.start_offset + 1, token.location.end_offset)]]
536
+ end
537
+ end
538
+
539
+ tokens
540
+ end
541
+
542
+ private
543
+
544
+ # Creates a new parser range, taking prism's byte offsets into account
545
# Builds a Range over `source_buffer`. Both offsets are translated through
# `offset_cache` before the range is constructed.
def range(start_offset, end_offset)
  begin_pos = offset_cache[start_offset]
  end_pos = offset_cache[end_offset]

  Range.new(source_buffer, begin_pos, end_pos)
end
548
+
549
+ # Parse an integer from the string representation.
550
# Converts the textual form of an integer literal with Kernel#Integer, so
# radix prefixes (0x, 0b, 0o, 0d), a sign and underscores are understood.
#
# Returns 0 when the text is not a valid integer. `exception: false` is used
# so that malformed input is handled as a normal result instead of through
# raise/rescue.
def parse_integer(value)
  Integer(value, exception: false) || 0
end
555
+
556
+ # Parse a float from the string representation.
557
# Converts the textual form of a float literal with Kernel#Float.
#
# Returns 0.0 when the text is not a valid float. `exception: false` is used
# so that malformed input is handled as a normal result instead of through
# raise/rescue.
def parse_float(value)
  Float(value, exception: false) || 0.0
end
562
+
563
+ # Parse a complex from the string representation.
564
# Converts the textual form of an imaginary literal into a Complex whose
# real part is 0.
#
# The trailing "i" is removed from a copy, so the caller's string is left
# untouched and frozen strings are accepted. What remains is treated as a
# rational (trailing "r"), a prefixed integer (0x, 0b, 0o, 0d) or is handed
# to Kernel#Complex as-is.
#
# Returns 0i when the conversion raises ArgumentError.
def parse_complex(value)
  number = value.chomp("i")

  if number.end_with?("r")
    Complex(0, parse_rational(number))
  elsif number.start_with?(/0[BbOoDdXx]/)
    Complex(0, parse_integer(number))
  else
    Complex(0, number)
  end
rescue ArgumentError
  0i
end
577
+
578
+ # Parse a rational from the string representation.
579
# Converts the textual form of a rational literal into a Rational.
#
# The trailing "r" is removed from a copy, so the caller's string is left
# untouched and frozen strings are accepted. Text starting with a radix
# prefix (0x, 0b, 0o, 0d) goes through parse_integer first; anything else is
# handed to Kernel#Rational.
#
# Returns 0r when the conversion raises ArgumentError.
def parse_rational(value)
  number = value.chomp("r")

  if number.start_with?(/0[BbOoDdXx]/)
    Rational(parse_integer(number))
  else
    Rational(number)
  end
rescue ArgumentError
  0r
end
590
+
591
+ # Wonky heredoc tab/spaces rules.
592
+ # https://github.com/ruby/prism/blob/v1.3.0/src/prism.c#L10548-L10558
593
# Scans `lexed` forward from heredoc_token_index and returns the smallest
# leading-whitespace width found on the heredoc's own content lines.
#
# Only STRING_CONTENT tokens that start in column 0 at nesting depth 0 are
# measured. A tab advances the width to the next multiple of 8, any other
# whitespace character counts as 1. HEREDOC_START/EMBEXPR_BEGIN raise the
# depth (and force the result to 0 when they open a line at depth 0);
# HEREDOC_END/EMBEXPR_END lower it, and the scan stops once the depth
# reaches -1.
#
# Returns Float::MAX when no token lowered the result.
def calculate_heredoc_whitespace(heredoc_token_index)
  cursor = heredoc_token_index
  depth = 0
  last_counted_line = -1
  smallest = Float::MAX

  while (current = lexed[cursor]&.first)
    cursor += 1
    following, _ = lexed[cursor]
    at_line_start = current.location.start_column == 0

    case current.type
    when :HEREDOC_START, :EMBEXPR_BEGIN
      # Interpolation or a nested heredoc opening a line leaves no string
      # content to measure, so there is no common whitespace.
      smallest = 0 if depth == 0 && at_line_start
      depth += 1
    when :HEREDOC_END, :EMBEXPR_END
      depth -= 1
      # The heredoc being measured has been closed.
      break if depth == -1
    when :STRING_CONTENT
      # Content inside nested heredocs or interpolation is ignored.
      next unless depth == 0 && at_line_start

      width = current.value[/^\s*/].each_char.reduce(0) do |columns, char|
        char == "\t" ? (columns / 8 + 1) * 8 : columns + 1
      end

      new_line = current.location.start_line != last_counted_line
      # A whitespace-only token still counts when more tokens follow on the
      # same line.
      blank = width == current.value.length && following&.location&.start_line != current.location.start_line
      if new_line && !blank && width < smallest
        smallest = width
        last_counted_line = current.location.start_line
      end
    end
  end

  smallest
end
637
+
638
+ # Wonky heredoc tab/spaces rules.
639
+ # https://github.com/ruby/prism/blob/v1.3.0/src/prism.c#L16528-L16545
640
# Returns `string` without its leading indentation, removing at most
# heredoc.common_whitespace columns.
#
# Only spaces and tabs are removed. A space is one column wide; a tab
# advances to the next multiple of 8 and is kept when consuming it would
# overshoot the limit.
def trim_heredoc_whitespace(string, heredoc)
  limit = heredoc.common_whitespace
  width = 0
  cut = 0

  while width < limit
    case string[cut]
    when "\t"
      width = (width / 8 + 1) * 8
      break if width > limit
    when " "
      width += 1
    else
      break
    end

    cut += 1
  end

  string[cut..]
end
655
+
656
+ # Escape sequences that have special meaning and should appear unescaped in the resulting string.
657
+ ESCAPES = {
658
+ "a" => "\a", "b" => "\b", "e" => "\e", "f" => "\f",
659
+ "n" => "\n", "r" => "\r", "s" => "\s", "t" => "\t",
660
+ "v" => "\v", "\\" => "\\"
661
+ }.freeze
662
+ private_constant :ESCAPES
663
+
664
+ # When one of these delimiters is encountered, then the other
665
+ # one is allowed to be escaped as well.
666
+ DELIMITER_SYMETRY = { "[" => "]", "(" => ")", "{" => "}", "<" => ">" }.freeze
667
+ private_constant :DELIMITER_SYMETRY
668
+
669
+
670
+ # https://github.com/whitequark/parser/blob/v3.3.6.0/lib/parser/lexer-strings.rl#L14
671
+ REGEXP_META_CHARACTERS = ["\\", "$", "(", ")", "*", "+", ".", "<", ">", "?", "[", "]", "^", "{", "|", "}"]
672
+ private_constant :REGEXP_META_CHARACTERS
673
+
674
+ # Apply Ruby string escaping rules
675
# Resolves backslash escapes in `string` according to the literal it came
# from; `quote` is the literal's opening delimiter and its last character is
# treated as the enclosing delimiter.
#
# The input is returned untouched for single-quoted heredocs ("<<'") and for
# any string that contains no backslash.
def unescape_string(string, quote)
  # Fast path: a literal heredoc, or nothing that could be an escape.
  return string if quote == "<<'" || !string.include?("\\")

  # Enclosing character of the literal: `"` for `"foo"`, `{` for `%w{foo}`.
  delimiter = quote[-1]

  if regexp?(quote)
    # Regexp bodies are kept verbatim except for an escaped delimiter, and
    # even that stays escaped when the delimiter is itself a regexp
    # metacharacter (which covers every paired delimiter).
    return string if REGEXP_META_CHARACTERS.include?(delimiter)

    # The delimiter cannot be preceded by an even run of backslashes; that
    # would have been a syntax error.
    string.gsub(/\\(#{Regexp.escape(delimiter)})/, '\1')
  elsif interpolation?(quote)
    # Collect the output as binary so that individual escape bytes cannot
    # change the encoding midway; the encoding is restored at the end.
    buffer = "".b
    scanner = StringScanner.new(string)

    while (skipped = scanner.skip_until(/\\/))
      # Copy the text before the backslash, then decode the escape itself.
      buffer.append_as_bytes(string.byteslice(scanner.pos - skipped, skipped - 1))
      escape_read(buffer, scanner, false, false)
    end

    # Whatever follows the final escape.
    buffer.append_as_bytes(string.byteslice(scanner.pos..))
    buffer.force_encoding(source_buffer.source.encoding)
  else
    # Without interpolation only the backslash and the delimiter (plus its
    # paired counterpart, if it has one) can be escaped.
    closing = DELIMITER_SYMETRY[delimiter]
    escapable = Regexp.escape("#{delimiter}#{closing}")
    string.gsub(/\\([\\#{escapable}])/, '\1')
  end
end
716
+
717
+ # Certain strings are merged into a single string token.
718
# value - the string contents.
# quote - the opening quote token.
#
# Returns true when the literal should be emitted as one plain string token.
def simplify_string?(value, quote)
  # Only simplify 'foo' when it stays on a single line.
  return !value.include?("\n") if quote == "'"

  # %q and similar are never simplified.
  return false unless quote == '"'

  # A double-quoted string qualifies when no line ends in a newline that is
  # not a line continuation, i.e. every newline is preceded by an odd
  # number of backslashes.
  value.each_line.none? do |line|
    line.end_with?("\n") && !line[/(\\*)$/, 1]&.length&.odd?
  end
end
733
+
734
+ # Escape a byte value, given the control and meta flags.
735
# value   - the integer byte value to transform.
# control - when truthy, the value is masked with 0x9f.
# meta    - when truthy, the 0x80 bit is set (after the control mask).
#
# Returns the resulting integer.
def escape_build(value, control, meta)
  masked = control ? value & 0x9f : value
  meta ? masked | 0x80 : masked
end
740
+
741
+ # Read an escape out of the string scanner, given the control and meta
742
+ # flags, and push the unescaped value into the result.
743
+ def escape_read(result, scanner, control, meta)
744
+ if scanner.skip("\n")
745
+ # Line continuation
746
+ elsif (value = ESCAPES[scanner.peek(1)])
747
+ # Simple single-character escape sequences like \n
748
+ result.append_as_bytes(value)
749
+ scanner.pos += 1
750
+ elsif (value = scanner.scan(/[0-7]{1,3}/))
751
+ # \nnn
752
+ result.append_as_bytes(escape_build(value.to_i(8), control, meta))
753
+ elsif (value = scanner.scan(/x[0-9a-fA-F]{1,2}/))
754
+ # \xnn
755
+ result.append_as_bytes(escape_build(value[1..].to_i(16), control, meta))
756
+ elsif (value = scanner.scan(/u[0-9a-fA-F]{4}/))
757
+ # \unnnn
758
+ result.append_as_bytes(value[1..].hex.chr(Encoding::UTF_8))
759
+ elsif scanner.skip("u{}")
760
+ # https://github.com/whitequark/parser/issues/856
761
+ elsif (value = scanner.scan(/u{.*?}/))
762
+ # \u{nnnn ...}
763
+ value[2..-2].split.each do |unicode|
764
+ result.append_as_bytes(unicode.hex.chr(Encoding::UTF_8))
765
+ end
766
+ elsif (value = scanner.scan(/c\\?(?=[[:print:]])|C-\\?(?=[[:print:]])/))
767
+ # \cx or \C-x where x is an ASCII printable character
768
+ escape_read(result, scanner, true, meta)
769
+ elsif (value = scanner.scan(/M-\\?(?=[[:print:]])/))
770
+ # \M-x where x is an ASCII printable character
771
+ escape_read(result, scanner, control, true)
772
+ elsif (byte = scanner.scan_byte)
773
+ # Something else after an escape.
774
+ if control && byte == 0x3f # ASCII '?'
775
+ result.append_as_bytes(escape_build(0x7f, false, meta))
776
+ else
777
+ result.append_as_bytes(escape_build(byte, control, meta))
778
+ end
779
+ end
780
+ end
781
+
782
+ # In a percent array, certain whitespace can be preceded with a backslash,
783
+ # causing the following characters to be part of the previous element.
784
# Removes the backslash that escapes a whitespace character.
#
# A run of backslashes directly followed by whitespace loses its first
# backslash when the run has odd length; that backslash escapes the
# whitespace. Runs of even length are left untouched.
#
# The whole run must be captured as one group: a repeated single-character
# group only remembers its last repetition, so its length would always be 1.
#
# Returns a new string.
def percent_array_unescape(string)
  string.gsub(/(\\+)[ \f\n\r\t\v]/) do |full_match|
    if Regexp.last_match(1).length.odd?
      full_match.delete_prefix("\\")
    else
      full_match
    end
  end
end
790
+
791
+ # For %-arrays whitespace, the parser gem only considers whitespace before the newline.
792
# Returns 1 when `string` starts with a newline, otherwise the number of
# characters that precede the first newline (or the whole length when there
# is none).
def percent_array_leading_whitespace(string)
  return 1 if string.start_with?("\n")

  string.each_char.take_while { |char| char != "\n" }.length
end
802
+
803
+ # Determine if characters preceded by a backslash should be escaped or not
804
# Returns false for quotes ending in `'` and for quotes starting with
# %q, %w, %i or %s; true for everything else.
def interpolation?(quote)
  return false if quote.end_with?("'")

  !quote.start_with?("%q", "%w", "%i", "%s")
end
807
+
808
+ # Regexps allow interpolation but are handled differently during unescaping
809
# Returns true when `quote` is a plain `/` or starts with `%r`.
def regexp?(quote)
  return true if quote == "/"

  quote.start_with?("%r")
end
812
+
813
+ # Determine if the string is part of a %-style array.
814
# Returns true when `quote` starts with one of %w, %W, %i or %I.
def percent_array?(quote)
  ["%w", "%W", "%i", "%I"].any? { |prefix| quote.start_with?(prefix) }
end
817
+ end
818
+ end
819
+ end
820
+ end