jruby-prism-parser 0.23.0.pre.SNAPSHOT-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110)
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +401 -0
  3. data/CODE_OF_CONDUCT.md +76 -0
  4. data/CONTRIBUTING.md +62 -0
  5. data/LICENSE.md +7 -0
  6. data/Makefile +101 -0
  7. data/README.md +98 -0
  8. data/config.yml +2902 -0
  9. data/docs/build_system.md +91 -0
  10. data/docs/configuration.md +64 -0
  11. data/docs/cruby_compilation.md +27 -0
  12. data/docs/design.md +53 -0
  13. data/docs/encoding.md +121 -0
  14. data/docs/fuzzing.md +88 -0
  15. data/docs/heredocs.md +36 -0
  16. data/docs/javascript.md +118 -0
  17. data/docs/local_variable_depth.md +229 -0
  18. data/docs/mapping.md +117 -0
  19. data/docs/parser_translation.md +34 -0
  20. data/docs/parsing_rules.md +19 -0
  21. data/docs/releasing.md +98 -0
  22. data/docs/ripper.md +36 -0
  23. data/docs/ruby_api.md +43 -0
  24. data/docs/ruby_parser_translation.md +19 -0
  25. data/docs/serialization.md +209 -0
  26. data/docs/testing.md +55 -0
  27. data/ext/prism/api_node.c +5098 -0
  28. data/ext/prism/api_pack.c +267 -0
  29. data/ext/prism/extconf.rb +110 -0
  30. data/ext/prism/extension.c +1155 -0
  31. data/ext/prism/extension.h +18 -0
  32. data/include/prism/ast.h +5807 -0
  33. data/include/prism/defines.h +102 -0
  34. data/include/prism/diagnostic.h +339 -0
  35. data/include/prism/encoding.h +265 -0
  36. data/include/prism/node.h +57 -0
  37. data/include/prism/options.h +230 -0
  38. data/include/prism/pack.h +152 -0
  39. data/include/prism/parser.h +732 -0
  40. data/include/prism/prettyprint.h +26 -0
  41. data/include/prism/regexp.h +33 -0
  42. data/include/prism/util/pm_buffer.h +155 -0
  43. data/include/prism/util/pm_char.h +205 -0
  44. data/include/prism/util/pm_constant_pool.h +209 -0
  45. data/include/prism/util/pm_list.h +97 -0
  46. data/include/prism/util/pm_memchr.h +29 -0
  47. data/include/prism/util/pm_newline_list.h +93 -0
  48. data/include/prism/util/pm_state_stack.h +42 -0
  49. data/include/prism/util/pm_string.h +150 -0
  50. data/include/prism/util/pm_string_list.h +44 -0
  51. data/include/prism/util/pm_strncasecmp.h +32 -0
  52. data/include/prism/util/pm_strpbrk.h +46 -0
  53. data/include/prism/version.h +29 -0
  54. data/include/prism.h +289 -0
  55. data/jruby-prism.jar +0 -0
  56. data/lib/prism/compiler.rb +486 -0
  57. data/lib/prism/debug.rb +206 -0
  58. data/lib/prism/desugar_compiler.rb +207 -0
  59. data/lib/prism/dispatcher.rb +2150 -0
  60. data/lib/prism/dot_visitor.rb +4634 -0
  61. data/lib/prism/dsl.rb +785 -0
  62. data/lib/prism/ffi.rb +346 -0
  63. data/lib/prism/lex_compat.rb +908 -0
  64. data/lib/prism/mutation_compiler.rb +753 -0
  65. data/lib/prism/node.rb +17864 -0
  66. data/lib/prism/node_ext.rb +212 -0
  67. data/lib/prism/node_inspector.rb +68 -0
  68. data/lib/prism/pack.rb +224 -0
  69. data/lib/prism/parse_result/comments.rb +177 -0
  70. data/lib/prism/parse_result/newlines.rb +64 -0
  71. data/lib/prism/parse_result.rb +498 -0
  72. data/lib/prism/pattern.rb +250 -0
  73. data/lib/prism/serialize.rb +1354 -0
  74. data/lib/prism/translation/parser/compiler.rb +1838 -0
  75. data/lib/prism/translation/parser/lexer.rb +335 -0
  76. data/lib/prism/translation/parser/rubocop.rb +37 -0
  77. data/lib/prism/translation/parser.rb +178 -0
  78. data/lib/prism/translation/ripper.rb +577 -0
  79. data/lib/prism/translation/ruby_parser.rb +1521 -0
  80. data/lib/prism/translation.rb +11 -0
  81. data/lib/prism/version.rb +3 -0
  82. data/lib/prism/visitor.rb +495 -0
  83. data/lib/prism.rb +99 -0
  84. data/prism.gemspec +135 -0
  85. data/rbi/prism.rbi +7767 -0
  86. data/rbi/prism_static.rbi +207 -0
  87. data/sig/prism.rbs +4773 -0
  88. data/sig/prism_static.rbs +201 -0
  89. data/src/diagnostic.c +400 -0
  90. data/src/encoding.c +5132 -0
  91. data/src/node.c +2786 -0
  92. data/src/options.c +213 -0
  93. data/src/pack.c +493 -0
  94. data/src/prettyprint.c +8881 -0
  95. data/src/prism.c +18406 -0
  96. data/src/regexp.c +638 -0
  97. data/src/serialize.c +1554 -0
  98. data/src/token_type.c +700 -0
  99. data/src/util/pm_buffer.c +190 -0
  100. data/src/util/pm_char.c +318 -0
  101. data/src/util/pm_constant_pool.c +322 -0
  102. data/src/util/pm_list.c +49 -0
  103. data/src/util/pm_memchr.c +35 -0
  104. data/src/util/pm_newline_list.c +84 -0
  105. data/src/util/pm_state_stack.c +25 -0
  106. data/src/util/pm_string.c +203 -0
  107. data/src/util/pm_string_list.c +28 -0
  108. data/src/util/pm_strncasecmp.c +24 -0
  109. data/src/util/pm_strpbrk.c +180 -0
  110. metadata +156 -0
@@ -0,0 +1,908 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "delegate"
4
+ require "ripper"
5
+
6
+ module Prism
7
+ # This class is responsible for lexing the source using prism and then
8
+ # converting those tokens to be compatible with Ripper. In the vast majority
9
+ # of cases, this is a one-to-one mapping of the token type. Everything else
10
+ # generally lines up. However, there are a few cases that require special
11
+ # handling.
12
+ class LexCompat # :nodoc:
13
+ # This is a mapping of prism token types to Ripper token types. This is a
14
+ # many-to-one mapping because we split up our token types, whereas Ripper
15
+ # tends to group them.
16
+ RIPPER = {
17
+ AMPERSAND: :on_op,
18
+ AMPERSAND_AMPERSAND: :on_op,
19
+ AMPERSAND_AMPERSAND_EQUAL: :on_op,
20
+ AMPERSAND_DOT: :on_op,
21
+ AMPERSAND_EQUAL: :on_op,
22
+ BACK_REFERENCE: :on_backref,
23
+ BACKTICK: :on_backtick,
24
+ BANG: :on_op,
25
+ BANG_EQUAL: :on_op,
26
+ BANG_TILDE: :on_op,
27
+ BRACE_LEFT: :on_lbrace,
28
+ BRACE_RIGHT: :on_rbrace,
29
+ BRACKET_LEFT: :on_lbracket,
30
+ BRACKET_LEFT_ARRAY: :on_lbracket,
31
+ BRACKET_LEFT_RIGHT: :on_op,
32
+ BRACKET_LEFT_RIGHT_EQUAL: :on_op,
33
+ BRACKET_RIGHT: :on_rbracket,
34
+ CARET: :on_op,
35
+ CARET_EQUAL: :on_op,
36
+ CHARACTER_LITERAL: :on_CHAR,
37
+ CLASS_VARIABLE: :on_cvar,
38
+ COLON: :on_op,
39
+ COLON_COLON: :on_op,
40
+ COMMA: :on_comma,
41
+ COMMENT: :on_comment,
42
+ CONSTANT: :on_const,
43
+ DOT: :on_period,
44
+ DOT_DOT: :on_op,
45
+ DOT_DOT_DOT: :on_op,
46
+ EMBDOC_BEGIN: :on_embdoc_beg,
47
+ EMBDOC_END: :on_embdoc_end,
48
+ EMBDOC_LINE: :on_embdoc,
49
+ EMBEXPR_BEGIN: :on_embexpr_beg,
50
+ EMBEXPR_END: :on_embexpr_end,
51
+ EMBVAR: :on_embvar,
52
+ EOF: :on_eof,
53
+ EQUAL: :on_op,
54
+ EQUAL_EQUAL: :on_op,
55
+ EQUAL_EQUAL_EQUAL: :on_op,
56
+ EQUAL_GREATER: :on_op,
57
+ EQUAL_TILDE: :on_op,
58
+ FLOAT: :on_float,
59
+ FLOAT_IMAGINARY: :on_imaginary,
60
+ FLOAT_RATIONAL: :on_rational,
61
+ FLOAT_RATIONAL_IMAGINARY: :on_imaginary,
62
+ GREATER: :on_op,
63
+ GREATER_EQUAL: :on_op,
64
+ GREATER_GREATER: :on_op,
65
+ GREATER_GREATER_EQUAL: :on_op,
66
+ GLOBAL_VARIABLE: :on_gvar,
67
+ HEREDOC_END: :on_heredoc_end,
68
+ HEREDOC_START: :on_heredoc_beg,
69
+ IDENTIFIER: :on_ident,
70
+ IGNORED_NEWLINE: :on_ignored_nl,
71
+ INTEGER: :on_int,
72
+ INTEGER_IMAGINARY: :on_imaginary,
73
+ INTEGER_RATIONAL: :on_rational,
74
+ INTEGER_RATIONAL_IMAGINARY: :on_imaginary,
75
+ INSTANCE_VARIABLE: :on_ivar,
76
+ INVALID: :INVALID,
77
+ KEYWORD___ENCODING__: :on_kw,
78
+ KEYWORD___LINE__: :on_kw,
79
+ KEYWORD___FILE__: :on_kw,
80
+ KEYWORD_ALIAS: :on_kw,
81
+ KEYWORD_AND: :on_kw,
82
+ KEYWORD_BEGIN: :on_kw,
83
+ KEYWORD_BEGIN_UPCASE: :on_kw,
84
+ KEYWORD_BREAK: :on_kw,
85
+ KEYWORD_CASE: :on_kw,
86
+ KEYWORD_CLASS: :on_kw,
87
+ KEYWORD_DEF: :on_kw,
88
+ KEYWORD_DEFINED: :on_kw,
89
+ KEYWORD_DO: :on_kw,
90
+ KEYWORD_DO_LOOP: :on_kw,
91
+ KEYWORD_ELSE: :on_kw,
92
+ KEYWORD_ELSIF: :on_kw,
93
+ KEYWORD_END: :on_kw,
94
+ KEYWORD_END_UPCASE: :on_kw,
95
+ KEYWORD_ENSURE: :on_kw,
96
+ KEYWORD_FALSE: :on_kw,
97
+ KEYWORD_FOR: :on_kw,
98
+ KEYWORD_IF: :on_kw,
99
+ KEYWORD_IF_MODIFIER: :on_kw,
100
+ KEYWORD_IN: :on_kw,
101
+ KEYWORD_MODULE: :on_kw,
102
+ KEYWORD_NEXT: :on_kw,
103
+ KEYWORD_NIL: :on_kw,
104
+ KEYWORD_NOT: :on_kw,
105
+ KEYWORD_OR: :on_kw,
106
+ KEYWORD_REDO: :on_kw,
107
+ KEYWORD_RESCUE: :on_kw,
108
+ KEYWORD_RESCUE_MODIFIER: :on_kw,
109
+ KEYWORD_RETRY: :on_kw,
110
+ KEYWORD_RETURN: :on_kw,
111
+ KEYWORD_SELF: :on_kw,
112
+ KEYWORD_SUPER: :on_kw,
113
+ KEYWORD_THEN: :on_kw,
114
+ KEYWORD_TRUE: :on_kw,
115
+ KEYWORD_UNDEF: :on_kw,
116
+ KEYWORD_UNLESS: :on_kw,
117
+ KEYWORD_UNLESS_MODIFIER: :on_kw,
118
+ KEYWORD_UNTIL: :on_kw,
119
+ KEYWORD_UNTIL_MODIFIER: :on_kw,
120
+ KEYWORD_WHEN: :on_kw,
121
+ KEYWORD_WHILE: :on_kw,
122
+ KEYWORD_WHILE_MODIFIER: :on_kw,
123
+ KEYWORD_YIELD: :on_kw,
124
+ LABEL: :on_label,
125
+ LABEL_END: :on_label_end,
126
+ LAMBDA_BEGIN: :on_tlambeg,
127
+ LESS: :on_op,
128
+ LESS_EQUAL: :on_op,
129
+ LESS_EQUAL_GREATER: :on_op,
130
+ LESS_LESS: :on_op,
131
+ LESS_LESS_EQUAL: :on_op,
132
+ METHOD_NAME: :on_ident,
133
+ MINUS: :on_op,
134
+ MINUS_EQUAL: :on_op,
135
+ MINUS_GREATER: :on_tlambda,
136
+ NEWLINE: :on_nl,
137
+ NUMBERED_REFERENCE: :on_backref,
138
+ PARENTHESIS_LEFT: :on_lparen,
139
+ PARENTHESIS_LEFT_PARENTHESES: :on_lparen,
140
+ PARENTHESIS_RIGHT: :on_rparen,
141
+ PERCENT: :on_op,
142
+ PERCENT_EQUAL: :on_op,
143
+ PERCENT_LOWER_I: :on_qsymbols_beg,
144
+ PERCENT_LOWER_W: :on_qwords_beg,
145
+ PERCENT_LOWER_X: :on_backtick,
146
+ PERCENT_UPPER_I: :on_symbols_beg,
147
+ PERCENT_UPPER_W: :on_words_beg,
148
+ PIPE: :on_op,
149
+ PIPE_EQUAL: :on_op,
150
+ PIPE_PIPE: :on_op,
151
+ PIPE_PIPE_EQUAL: :on_op,
152
+ PLUS: :on_op,
153
+ PLUS_EQUAL: :on_op,
154
+ QUESTION_MARK: :on_op,
155
+ RATIONAL_FLOAT: :on_rational,
156
+ RATIONAL_INTEGER: :on_rational,
157
+ REGEXP_BEGIN: :on_regexp_beg,
158
+ REGEXP_END: :on_regexp_end,
159
+ SEMICOLON: :on_semicolon,
160
+ SLASH: :on_op,
161
+ SLASH_EQUAL: :on_op,
162
+ STAR: :on_op,
163
+ STAR_EQUAL: :on_op,
164
+ STAR_STAR: :on_op,
165
+ STAR_STAR_EQUAL: :on_op,
166
+ STRING_BEGIN: :on_tstring_beg,
167
+ STRING_CONTENT: :on_tstring_content,
168
+ STRING_END: :on_tstring_end,
169
+ SYMBOL_BEGIN: :on_symbeg,
170
+ TILDE: :on_op,
171
+ UAMPERSAND: :on_op,
172
+ UCOLON_COLON: :on_op,
173
+ UDOT_DOT: :on_op,
174
+ UDOT_DOT_DOT: :on_op,
175
+ UMINUS: :on_op,
176
+ UMINUS_NUM: :on_op,
177
+ UPLUS: :on_op,
178
+ USTAR: :on_op,
179
+ USTAR_STAR: :on_op,
180
+ WORDS_SEP: :on_words_sep,
181
+ "__END__": :on___end__
182
+ }.freeze
183
+
184
+ # When we produce tokens, we produce the same arrays that Ripper does.
185
+ # However, we add a couple of convenience methods onto them to make them a
186
+ # little easier to work with. We delegate all other methods to the array.
187
+ class Token < SimpleDelegator
188
+ # The location of the token in the source.
189
+ def location
190
+ self[0]
191
+ end
192
+
193
+ # The type of the token.
194
+ def event
195
+ self[1]
196
+ end
197
+
198
+ # The slice of the source that this token represents.
199
+ def value
200
+ self[2]
201
+ end
202
+
203
+ # The state of the lexer when this token was produced.
204
+ def state
205
+ self[3]
206
+ end
207
+ end
208
+
209
+ # Ripper doesn't include the rest of the token in the event, so we need to
210
+ # trim it down to just the content on the first line when comparing.
211
+ class EndContentToken < Token
212
+ def ==(other) # :nodoc:
213
+ [self[0], self[1], self[2][0..self[2].index("\n")], self[3]] == other
214
+ end
215
+ end
216
+
217
+ # Tokens where state should be ignored
218
+ # used for :on_comment, :on_heredoc_end, :on_embexpr_end
219
+ class IgnoreStateToken < Token
220
+ def ==(other) # :nodoc:
221
+ self[0...-1] == other[0...-1]
222
+ end
223
+ end
224
+
225
+ # Ident tokens for the most part are exactly the same, except sometimes we
226
+ # know an ident is a local when ripper doesn't (when they are introduced
227
+ # through named captures in regular expressions). In that case we don't
228
+ # compare the state.
229
+ class IdentToken < Token
230
+ def ==(other) # :nodoc:
231
+ (self[0...-1] == other[0...-1]) && (
232
+ (other[3] == Ripper::EXPR_LABEL | Ripper::EXPR_END) ||
233
+ (other[3] & Ripper::EXPR_ARG_ANY != 0)
234
+ )
235
+ end
236
+ end
237
+
238
+ # Ignored newlines can occasionally have a LABEL state attached to them, so
239
+ # we compare the state differently here.
240
+ class IgnoredNewlineToken < Token
241
+ def ==(other) # :nodoc:
242
+ return false unless self[0...-1] == other[0...-1]
243
+
244
+ if self[4] == Ripper::EXPR_ARG | Ripper::EXPR_LABELED
245
+ other[4] & Ripper::EXPR_ARG | Ripper::EXPR_LABELED > 0
246
+ else
247
+ self[4] == other[4]
248
+ end
249
+ end
250
+ end
251
+
252
+ # If we have an identifier that follows a method name like:
253
+ #
254
+ # def foo bar
255
+ #
256
+ # then Ripper will mark bar as END|LABEL if there is a local in a parent
257
+ # scope named bar because it hasn't pushed the local table yet. We do this
258
+ # more accurately, so we need to allow comparing against both END and
259
+ # END|LABEL.
260
+ class ParamToken < Token
261
+ def ==(other) # :nodoc:
262
+ (self[0...-1] == other[0...-1]) && (
263
+ (other[3] == Ripper::EXPR_END) ||
264
+ (other[3] == Ripper::EXPR_END | Ripper::EXPR_LABEL)
265
+ )
266
+ end
267
+ end
268
+
269
+ # A heredoc in this case is a list of tokens that belong to the body of the
270
+ # heredoc that should be appended onto the list of tokens when the heredoc
271
+ # closes.
272
+ module Heredoc # :nodoc:
273
+ # Heredocs that are no dash or tilde heredocs are just a list of tokens.
274
+ # We need to keep them around so that we can insert them in the correct
275
+ # order back into the token stream and set the state of the last token to
276
+ # the state that the heredoc was opened in.
277
+ class PlainHeredoc # :nodoc:
278
+ attr_reader :tokens
279
+
280
+ def initialize
281
+ @tokens = []
282
+ end
283
+
284
+ def <<(token)
285
+ tokens << token
286
+ end
287
+
288
+ def to_a
289
+ tokens
290
+ end
291
+ end
292
+
293
+ # Dash heredocs are a little more complicated. They are a list of tokens
294
+ # that need to be split on "\\\n" to mimic Ripper's behavior. We also need
295
+ # to keep track of the state that the heredoc was opened in.
296
+ class DashHeredoc # :nodoc:
297
+ attr_reader :split, :tokens
298
+
299
+ def initialize(split)
300
+ @split = split
301
+ @tokens = []
302
+ end
303
+
304
+ def <<(token)
305
+ tokens << token
306
+ end
307
+
308
+ def to_a
309
+ embexpr_balance = 0
310
+
311
+ tokens.each_with_object([]) do |token, results|
312
+ case token.event
313
+ when :on_embexpr_beg
314
+ embexpr_balance += 1
315
+ results << token
316
+ when :on_embexpr_end
317
+ embexpr_balance -= 1
318
+ results << token
319
+ when :on_tstring_content
320
+ if embexpr_balance == 0
321
+ lineno = token[0][0]
322
+ column = token[0][1]
323
+
324
+ if split
325
+ # Split on "\\\n" to mimic Ripper's behavior. Use a lookbehind
326
+ # to keep the delimiter in the result.
327
+ token.value.split(/(?<=[^\\]\\\n)|(?<=[^\\]\\\r\n)/).each_with_index do |value, index|
328
+ column = 0 if index > 0
329
+ results << Token.new([[lineno, column], :on_tstring_content, value, token.state])
330
+ lineno += value.count("\n")
331
+ end
332
+ else
333
+ results << token
334
+ end
335
+ else
336
+ results << token
337
+ end
338
+ else
339
+ results << token
340
+ end
341
+ end
342
+ end
343
+ end
344
+
345
+ # Heredocs that are dedenting heredocs are a little more complicated.
346
+ # Ripper outputs on_ignored_sp tokens for the whitespace that is being
347
+ # removed from the output. prism only modifies the node itself and keeps
348
+ # the token the same. This simplifies prism, but makes comparing against
349
+ # Ripper much harder because there is a length mismatch.
350
+ #
351
+ # Fortunately, we already have to pull out the heredoc tokens in order to
352
+ # insert them into the stream in the correct order. As such, we can do
353
+ # some extra manipulation on the tokens to make them match Ripper's
354
+ # output by mirroring the dedent logic that Ripper uses.
355
+ class DedentingHeredoc # :nodoc:
356
+ TAB_WIDTH = 8
357
+
358
+ attr_reader :tokens, :dedent_next, :dedent, :embexpr_balance
359
+
360
+ def initialize
361
+ @tokens = []
362
+ @dedent_next = true
363
+ @dedent = nil
364
+ @embexpr_balance = 0
365
+ @ended_on_newline = false
366
+ end
367
+
368
+ # As tokens are coming in, we track the minimum amount of common leading
369
+ # whitespace on plain string content tokens. This allows us to later
370
+ # remove that amount of whitespace from the beginning of each line.
371
+ def <<(token)
372
+ case token.event
373
+ when :on_embexpr_beg, :on_heredoc_beg
374
+ @embexpr_balance += 1
375
+ @dedent = 0 if @dedent_next && @ended_on_newline
376
+ when :on_embexpr_end, :on_heredoc_end
377
+ @embexpr_balance -= 1
378
+ when :on_tstring_content
379
+ if embexpr_balance == 0
380
+ line = token.value
381
+
382
+ if dedent_next && !(line.strip.empty? && line.end_with?("\n"))
383
+ leading = line[/\A(\s*)\n?/, 1]
384
+ next_dedent = 0
385
+
386
+ leading.each_char do |char|
387
+ if char == "\t"
388
+ next_dedent = next_dedent - (next_dedent % TAB_WIDTH) + TAB_WIDTH
389
+ else
390
+ next_dedent += 1
391
+ end
392
+ end
393
+
394
+ @dedent = [dedent, next_dedent].compact.min
395
+ @dedent_next = true
396
+ @ended_on_newline = line.end_with?("\n")
397
+ tokens << token
398
+ return
399
+ end
400
+ end
401
+ end
402
+
403
+ @dedent_next = token.event == :on_tstring_content && embexpr_balance == 0
404
+ @ended_on_newline = false
405
+ tokens << token
406
+ end
407
+
408
+ def to_a
409
+ # If every line in the heredoc is blank, we still need to split up the
410
+ # string content token into multiple tokens.
411
+ if dedent.nil?
412
+ results = []
413
+ embexpr_balance = 0
414
+
415
+ tokens.each do |token|
416
+ case token.event
417
+ when :on_embexpr_beg, :on_heredoc_beg
418
+ embexpr_balance += 1
419
+ results << token
420
+ when :on_embexpr_end, :on_heredoc_end
421
+ embexpr_balance -= 1
422
+ results << token
423
+ when :on_tstring_content
424
+ if embexpr_balance == 0
425
+ lineno = token[0][0]
426
+ column = token[0][1]
427
+
428
+ token.value.split(/(?<=\n)/).each_with_index do |value, index|
429
+ column = 0 if index > 0
430
+ results << Token.new([[lineno, column], :on_tstring_content, value, token.state])
431
+ lineno += 1
432
+ end
433
+ else
434
+ results << token
435
+ end
436
+ else
437
+ results << token
438
+ end
439
+ end
440
+
441
+ return results
442
+ end
443
+
444
+ # If the minimum common whitespace is 0, then we need to concatenate
445
+ # string nodes together that are immediately adjacent.
446
+ if dedent == 0
447
+ results = []
448
+ embexpr_balance = 0
449
+
450
+ index = 0
451
+ max_index = tokens.length
452
+
453
+ while index < max_index
454
+ token = tokens[index]
455
+ results << token
456
+ index += 1
457
+
458
+ case token.event
459
+ when :on_embexpr_beg, :on_heredoc_beg
460
+ embexpr_balance += 1
461
+ when :on_embexpr_end, :on_heredoc_end
462
+ embexpr_balance -= 1
463
+ when :on_tstring_content
464
+ if embexpr_balance == 0
465
+ while index < max_index && tokens[index].event == :on_tstring_content
466
+ token.value << tokens[index].value
467
+ index += 1
468
+ end
469
+ end
470
+ end
471
+ end
472
+
473
+ return results
474
+ end
475
+
476
+ # Otherwise, we're going to run through each token in the list and
477
+ # insert on_ignored_sp tokens for the amount of dedent that we need to
478
+ # perform. We also need to remove the dedent from the beginning of
479
+ # each line of plain string content tokens.
480
+ results = []
481
+ dedent_next = true
482
+ embexpr_balance = 0
483
+
484
+ tokens.each do |token|
485
+ # Notice that the structure of this conditional largely matches the
486
+ # whitespace calculation we performed above. This is because
487
+ # checking if the subsequent token needs to be dedented is common to
488
+ # both the dedent calculation and the ignored_sp insertion.
489
+ case token.event
490
+ when :on_embexpr_beg
491
+ embexpr_balance += 1
492
+ results << token
493
+ when :on_embexpr_end
494
+ embexpr_balance -= 1
495
+ results << token
496
+ when :on_tstring_content
497
+ if embexpr_balance == 0
498
+ # Here we're going to split the string on newlines, but maintain
499
+ # the newlines in the resulting array. We'll do that with a look
500
+ # behind assertion.
501
+ splits = token.value.split(/(?<=\n)/)
502
+ index = 0
503
+
504
+ while index < splits.length
505
+ line = splits[index]
506
+ lineno = token[0][0] + index
507
+ column = token[0][1]
508
+
509
+ # Blank lines do not count toward common leading whitespace
510
+ # calculation and do not need to be dedented.
511
+ if dedent_next || index > 0
512
+ column = 0
513
+ end
514
+
515
+ # If the dedent is 0 and we're not supposed to dedent the next
516
+ # line or this line doesn't start with whitespace, then we
517
+ # should concatenate the rest of the string to match ripper.
518
+ if dedent == 0 && (!dedent_next || !line.start_with?(/\s/))
519
+ line = splits[index..].join
520
+ index = splits.length
521
+ end
522
+
523
+ # If we are supposed to dedent this line or if this is not the
524
+ # first line of the string and this line isn't entirely blank,
525
+ # then we need to insert an on_ignored_sp token and remove the
526
+ # dedent from the beginning of the line.
527
+ if (dedent > 0) && (dedent_next || index > 0)
528
+ deleting = 0
529
+ deleted_chars = []
530
+
531
+ # Gather up all of the characters that we're going to
532
+ # delete, stopping when you hit a character that would put
533
+ # you over the dedent amount.
534
+ line.each_char.with_index do |char, i|
535
+ case char
536
+ when "\r"
537
+ if line[i + 1] == "\n"
538
+ break
539
+ end
540
+ when "\n"
541
+ break
542
+ when "\t"
543
+ deleting = deleting - (deleting % TAB_WIDTH) + TAB_WIDTH
544
+ else
545
+ deleting += 1
546
+ end
547
+
548
+ break if deleting > dedent
549
+ deleted_chars << char
550
+ end
551
+
552
+ # If we have something to delete, then delete it from the
553
+ # string and insert an on_ignored_sp token.
554
+ if deleted_chars.any?
555
+ ignored = deleted_chars.join
556
+ line.delete_prefix!(ignored)
557
+
558
+ results << Token.new([[lineno, 0], :on_ignored_sp, ignored, token[3]])
559
+ column = ignored.length
560
+ end
561
+ end
562
+
563
+ results << Token.new([[lineno, column], token[1], line, token[3]]) unless line.empty?
564
+ index += 1
565
+ end
566
+ else
567
+ results << token
568
+ end
569
+ else
570
+ results << token
571
+ end
572
+
573
+ dedent_next =
574
+ ((token.event == :on_tstring_content) || (token.event == :on_heredoc_end)) &&
575
+ embexpr_balance == 0
576
+ end
577
+
578
+ results
579
+ end
580
+ end
581
+
582
+ # Here we will split between the two types of heredocs and return the
583
+ # object that will store their tokens.
584
+ def self.build(opening)
585
+ case opening.value[2]
586
+ when "~"
587
+ DedentingHeredoc.new
588
+ when "-"
589
+ DashHeredoc.new(opening.value[3] != "'")
590
+ else
591
+ PlainHeredoc.new
592
+ end
593
+ end
594
+ end
595
+
596
+ private_constant :Heredoc
597
+
598
+ attr_reader :source, :options
599
+
600
+ def initialize(source, **options)
601
+ @source = source
602
+ @options = options
603
+ end
604
+
605
+ def result
606
+ tokens = []
607
+
608
+ state = :default
609
+ heredoc_stack = [[]]
610
+
611
+ result = Prism.lex(source, **options)
612
+ result_value = result.value
613
+ previous_state = nil
614
+ last_heredoc_end = nil
615
+
616
+ # In previous versions of Ruby, Ripper wouldn't flush the bom before the
617
+ # first token, so we had to have a hack in place to account for that. This
618
+ # checks for that behavior.
619
+ bom_flushed = Ripper.lex("\xEF\xBB\xBF# test")[0][0][1] == 0
620
+ bom = source.byteslice(0..2) == "\xEF\xBB\xBF"
621
+
622
+ result_value.each_with_index do |(token, lex_state), index|
623
+ lineno = token.location.start_line
624
+ column = token.location.start_column
625
+
626
+ # If there's a UTF-8 byte-order mark as the start of the file, then for
627
+ # certain tokens ripper sets the first token back by 3 bytes. It also
628
+ # keeps the byte order mark in the first token's value. This is weird,
629
+ # and I don't want to mirror that in our parser. So instead, we'll match
630
+ # up the columns and values here.
631
+ if bom && lineno == 1
632
+ column -= 3
633
+
634
+ if index == 0 && column == 0 && !bom_flushed
635
+ flushed =
636
+ case token.type
637
+ when :BACK_REFERENCE, :INSTANCE_VARIABLE, :CLASS_VARIABLE,
638
+ :GLOBAL_VARIABLE, :NUMBERED_REFERENCE, :PERCENT_LOWER_I,
639
+ :PERCENT_LOWER_X, :PERCENT_LOWER_W, :PERCENT_UPPER_I,
640
+ :PERCENT_UPPER_W, :STRING_BEGIN
641
+ true
642
+ when :REGEXP_BEGIN, :SYMBOL_BEGIN
643
+ token.value.start_with?("%")
644
+ else
645
+ false
646
+ end
647
+
648
+ unless flushed
649
+ column -= 3
650
+ value = token.value
651
+ value.prepend(String.new("\xEF\xBB\xBF", encoding: value.encoding))
652
+ end
653
+ end
654
+ end
655
+
656
+ event = RIPPER.fetch(token.type)
657
+ value = token.value
658
+ lex_state = Ripper::Lexer::State.new(lex_state)
659
+
660
+ token =
661
+ case event
662
+ when :on___end__
663
+ EndContentToken.new([[lineno, column], event, value, lex_state])
664
+ when :on_comment
665
+ IgnoreStateToken.new([[lineno, column], event, value, lex_state])
666
+ when :on_heredoc_end
667
+ # Heredoc end tokens can be emitted in an odd order, so we don't
668
+ # want to bother comparing the state on them.
669
+ last_heredoc_end = token.location.end_offset
670
+ IgnoreStateToken.new([[lineno, column], event, value, lex_state])
671
+ when :on_ident
672
+ if lex_state == Ripper::EXPR_END
673
+ # If we have an identifier that follows a method name like:
674
+ #
675
+ # def foo bar
676
+ #
677
+ # then Ripper will mark bar as END|LABEL if there is a local in a
678
+ # parent scope named bar because it hasn't pushed the local table
679
+ # yet. We do this more accurately, so we need to allow comparing
680
+ # against both END and END|LABEL.
681
+ ParamToken.new([[lineno, column], event, value, lex_state])
682
+ elsif lex_state == Ripper::EXPR_END | Ripper::EXPR_LABEL
683
+ # In the event that we're comparing identifiers, we're going to
684
+ # allow a little divergence. Ripper doesn't account for local
685
+ # variables introduced through named captures in regexes, and we
686
+ # do, which accounts for this difference.
687
+ IdentToken.new([[lineno, column], event, value, lex_state])
688
+ else
689
+ Token.new([[lineno, column], event, value, lex_state])
690
+ end
691
+ when :on_embexpr_end
692
+ IgnoreStateToken.new([[lineno, column], event, value, lex_state])
693
+ when :on_ignored_nl
694
+ # Ignored newlines can occasionally have a LABEL state attached to
695
+ # them which doesn't actually impact anything. We don't mirror that
696
+ # state so we ignored it.
697
+ IgnoredNewlineToken.new([[lineno, column], event, value, lex_state])
698
+ when :on_regexp_end
699
+ # On regex end, Ripper scans and then sets end state, so the ripper
700
+ # lexed output is begin, when it should be end. prism sets lex state
701
+ # correctly to end state, but we want to be able to compare against
702
+ # Ripper's lexed state. So here, if it's a regexp end token, we
703
+ # output the state as the previous state, solely for the sake of
704
+ # comparison.
705
+ previous_token = result_value[index - 1][0]
706
+ lex_state =
707
+ if RIPPER.fetch(previous_token.type) == :on_embexpr_end
708
+ # If the previous token is embexpr_end, then we have to do even
709
+ # more processing. The end of an embedded expression sets the
710
+ # state to the state that it had at the beginning of the
711
+ # embedded expression. So we have to go and find that state and
712
+ # set it here.
713
+ counter = 1
714
+ current_index = index - 1
715
+
716
+ until counter == 0
717
+ current_index -= 1
718
+ current_event = RIPPER.fetch(result_value[current_index][0].type)
719
+ counter += { on_embexpr_beg: -1, on_embexpr_end: 1 }[current_event] || 0
720
+ end
721
+
722
+ Ripper::Lexer::State.new(result_value[current_index][1])
723
+ else
724
+ previous_state
725
+ end
726
+
727
+ Token.new([[lineno, column], event, value, lex_state])
728
+ when :on_eof
729
+ previous_token = result_value[index - 1][0]
730
+
731
+ # If we're at the end of the file and the previous token was a
732
+ # comment and there is still whitespace after the comment, then
733
+ # Ripper will append a on_nl token (even though there isn't
734
+ # necessarily a newline). We mirror that here.
735
+ if previous_token.type == :COMMENT
736
+ # If the comment is at the start of a heredoc: <<HEREDOC # comment
737
+ # then the comment's end_offset is up near the heredoc_beg.
738
+ # This is not the correct offset to use for figuring out if
739
+ # there is trailing whitespace after the last token.
740
+ # Use the greater offset of the two to determine the start of
741
+ # the trailing whitespace.
742
+ start_offset = [previous_token.location.end_offset, last_heredoc_end].compact.max
743
+ end_offset = token.location.start_offset
744
+
745
+ if start_offset < end_offset
746
+ if bom
747
+ start_offset += 3
748
+ end_offset += 3
749
+ end
750
+
751
+ tokens << Token.new([[lineno, 0], :on_nl, source.byteslice(start_offset...end_offset), lex_state])
752
+ end
753
+ end
754
+
755
+ Token.new([[lineno, column], event, value, lex_state])
756
+ else
757
+ Token.new([[lineno, column], event, value, lex_state])
758
+ end
759
+
760
+ previous_state = lex_state
761
+
762
+ # The order in which tokens appear in our lexer is different from the
763
+ # order that they appear in Ripper. When we hit the declaration of a
764
+ # heredoc in prism, we skip forward and lex the rest of the content of
765
+ # the heredoc before going back and lexing at the end of the heredoc
766
+ # identifier.
767
+ #
768
+ # To match up to ripper, we keep a small state variable around here to
769
+ # track whether we're in the middle of a heredoc or not. In this way we
770
+ # can shuffle around the token to match Ripper's output.
771
+ case state
772
+ when :default
773
+ # The default state is when there are no heredocs at all. In this
774
+ # state we can append the token to the list of tokens and move on.
775
+ tokens << token
776
+
777
+ # If we get the declaration of a heredoc, then we open a new heredoc
778
+ # and move into the heredoc_opened state.
779
+ if event == :on_heredoc_beg
780
+ state = :heredoc_opened
781
+ heredoc_stack.last << Heredoc.build(token)
782
+ end
783
+ when :heredoc_opened
784
+ # The heredoc_opened state is when we've seen the declaration of a
785
+ # heredoc and are now lexing the body of the heredoc. In this state we
786
+ # push tokens onto the most recently created heredoc.
787
+ heredoc_stack.last.last << token
788
+
789
+ case event
790
+ when :on_heredoc_beg
791
+ # If we receive a heredoc declaration while lexing the body of a
792
+ # heredoc, this means we have nested heredocs. In this case we'll
793
+ # push a new heredoc onto the stack and stay in the heredoc_opened
794
+ # state since we're now lexing the body of the new heredoc.
795
+ heredoc_stack << [Heredoc.build(token)]
796
+ when :on_heredoc_end
797
+ # If we receive the end of a heredoc, then we're done lexing the
798
+ # body of the heredoc. In this case we now have a completed heredoc
799
+ # but need to wait for the next newline to push it into the token
800
+ # stream.
801
+ state = :heredoc_closed
802
+ end
803
+ when :heredoc_closed
804
+ if %i[on_nl on_ignored_nl on_comment].include?(event) || (event == :on_tstring_content && value.end_with?("\n"))
805
+ if heredoc_stack.size > 1
806
+ flushing = heredoc_stack.pop
807
+ heredoc_stack.last.last << token
808
+
809
+ flushing.each do |heredoc|
810
+ heredoc.to_a.each do |flushed_token|
811
+ heredoc_stack.last.last << flushed_token
812
+ end
813
+ end
814
+
815
+ state = :heredoc_opened
816
+ next
817
+ end
818
+ elsif event == :on_heredoc_beg
819
+ tokens << token
820
+ state = :heredoc_opened
821
+ heredoc_stack.last << Heredoc.build(token)
822
+ next
823
+ elsif heredoc_stack.size > 1
824
+ heredoc_stack[-2].last << token
825
+ next
826
+ end
827
+
828
+ heredoc_stack.last.each do |heredoc|
829
+ tokens.concat(heredoc.to_a)
830
+ end
831
+
832
+ heredoc_stack.last.clear
833
+ state = :default
834
+
835
+ tokens << token
836
+ end
837
+ end
838
+
839
+ # Drop the EOF token from the list
840
+ tokens = tokens[0...-1]
841
+
842
+ # We sort by location to compare against Ripper's output
843
+ tokens.sort_by!(&:location)
844
+
845
+ ParseResult.new(tokens, result.comments, result.magic_comments, result.data_loc, result.errors, result.warnings, [])
846
+ end
847
+ end
848
+
849
+ private_constant :LexCompat
850
+
851
+ # This is a class that wraps the Ripper lexer to produce almost exactly the
852
+ # same tokens.
853
+ class LexRipper # :nodoc:
854
+ attr_reader :source
855
+
856
+ def initialize(source)
857
+ @source = source
858
+ end
859
+
860
+ def result
861
+ previous = []
862
+ results = []
863
+
864
+ lex(source).each do |token|
865
+ case token[1]
866
+ when :on_sp
867
+ # skip
868
+ when :on_tstring_content
869
+ if previous[1] == :on_tstring_content && (token[2].start_with?("\#$") || token[2].start_with?("\#@"))
870
+ previous[2] << token[2]
871
+ else
872
+ results << token
873
+ previous = token
874
+ end
875
+ when :on_words_sep
876
+ if previous[1] == :on_words_sep
877
+ previous[2] << token[2]
878
+ else
879
+ results << token
880
+ previous = token
881
+ end
882
+ else
883
+ results << token
884
+ previous = token
885
+ end
886
+ end
887
+
888
+ results
889
+ end
890
+
891
+ private
892
+
893
+ if Ripper.method(:lex).parameters.assoc(:keyrest)
894
+ def lex(source)
895
+ Ripper.lex(source, raise_errors: true)
896
+ end
897
+ else
898
+ def lex(source)
899
+ ripper = Ripper::Lexer.new(source)
900
+ ripper.lex.tap do |result|
901
+ raise SyntaxError, ripper.errors.map(&:message).join(' ;') if ripper.errors.any?
902
+ end
903
+ end
904
+ end
905
+ end
906
+
907
+ private_constant :LexRipper
908
+ end