jruby-prism-parser 0.23.0.pre.SNAPSHOT-java

Sign up to get free protection for your applications and to get access to all the features.
Files changed (110) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +401 -0
  3. data/CODE_OF_CONDUCT.md +76 -0
  4. data/CONTRIBUTING.md +62 -0
  5. data/LICENSE.md +7 -0
  6. data/Makefile +101 -0
  7. data/README.md +98 -0
  8. data/config.yml +2902 -0
  9. data/docs/build_system.md +91 -0
  10. data/docs/configuration.md +64 -0
  11. data/docs/cruby_compilation.md +27 -0
  12. data/docs/design.md +53 -0
  13. data/docs/encoding.md +121 -0
  14. data/docs/fuzzing.md +88 -0
  15. data/docs/heredocs.md +36 -0
  16. data/docs/javascript.md +118 -0
  17. data/docs/local_variable_depth.md +229 -0
  18. data/docs/mapping.md +117 -0
  19. data/docs/parser_translation.md +34 -0
  20. data/docs/parsing_rules.md +19 -0
  21. data/docs/releasing.md +98 -0
  22. data/docs/ripper.md +36 -0
  23. data/docs/ruby_api.md +43 -0
  24. data/docs/ruby_parser_translation.md +19 -0
  25. data/docs/serialization.md +209 -0
  26. data/docs/testing.md +55 -0
  27. data/ext/prism/api_node.c +5098 -0
  28. data/ext/prism/api_pack.c +267 -0
  29. data/ext/prism/extconf.rb +110 -0
  30. data/ext/prism/extension.c +1155 -0
  31. data/ext/prism/extension.h +18 -0
  32. data/include/prism/ast.h +5807 -0
  33. data/include/prism/defines.h +102 -0
  34. data/include/prism/diagnostic.h +339 -0
  35. data/include/prism/encoding.h +265 -0
  36. data/include/prism/node.h +57 -0
  37. data/include/prism/options.h +230 -0
  38. data/include/prism/pack.h +152 -0
  39. data/include/prism/parser.h +732 -0
  40. data/include/prism/prettyprint.h +26 -0
  41. data/include/prism/regexp.h +33 -0
  42. data/include/prism/util/pm_buffer.h +155 -0
  43. data/include/prism/util/pm_char.h +205 -0
  44. data/include/prism/util/pm_constant_pool.h +209 -0
  45. data/include/prism/util/pm_list.h +97 -0
  46. data/include/prism/util/pm_memchr.h +29 -0
  47. data/include/prism/util/pm_newline_list.h +93 -0
  48. data/include/prism/util/pm_state_stack.h +42 -0
  49. data/include/prism/util/pm_string.h +150 -0
  50. data/include/prism/util/pm_string_list.h +44 -0
  51. data/include/prism/util/pm_strncasecmp.h +32 -0
  52. data/include/prism/util/pm_strpbrk.h +46 -0
  53. data/include/prism/version.h +29 -0
  54. data/include/prism.h +289 -0
  55. data/jruby-prism.jar +0 -0
  56. data/lib/prism/compiler.rb +486 -0
  57. data/lib/prism/debug.rb +206 -0
  58. data/lib/prism/desugar_compiler.rb +207 -0
  59. data/lib/prism/dispatcher.rb +2150 -0
  60. data/lib/prism/dot_visitor.rb +4634 -0
  61. data/lib/prism/dsl.rb +785 -0
  62. data/lib/prism/ffi.rb +346 -0
  63. data/lib/prism/lex_compat.rb +908 -0
  64. data/lib/prism/mutation_compiler.rb +753 -0
  65. data/lib/prism/node.rb +17864 -0
  66. data/lib/prism/node_ext.rb +212 -0
  67. data/lib/prism/node_inspector.rb +68 -0
  68. data/lib/prism/pack.rb +224 -0
  69. data/lib/prism/parse_result/comments.rb +177 -0
  70. data/lib/prism/parse_result/newlines.rb +64 -0
  71. data/lib/prism/parse_result.rb +498 -0
  72. data/lib/prism/pattern.rb +250 -0
  73. data/lib/prism/serialize.rb +1354 -0
  74. data/lib/prism/translation/parser/compiler.rb +1838 -0
  75. data/lib/prism/translation/parser/lexer.rb +335 -0
  76. data/lib/prism/translation/parser/rubocop.rb +37 -0
  77. data/lib/prism/translation/parser.rb +178 -0
  78. data/lib/prism/translation/ripper.rb +577 -0
  79. data/lib/prism/translation/ruby_parser.rb +1521 -0
  80. data/lib/prism/translation.rb +11 -0
  81. data/lib/prism/version.rb +3 -0
  82. data/lib/prism/visitor.rb +495 -0
  83. data/lib/prism.rb +99 -0
  84. data/prism.gemspec +135 -0
  85. data/rbi/prism.rbi +7767 -0
  86. data/rbi/prism_static.rbi +207 -0
  87. data/sig/prism.rbs +4773 -0
  88. data/sig/prism_static.rbs +201 -0
  89. data/src/diagnostic.c +400 -0
  90. data/src/encoding.c +5132 -0
  91. data/src/node.c +2786 -0
  92. data/src/options.c +213 -0
  93. data/src/pack.c +493 -0
  94. data/src/prettyprint.c +8881 -0
  95. data/src/prism.c +18406 -0
  96. data/src/regexp.c +638 -0
  97. data/src/serialize.c +1554 -0
  98. data/src/token_type.c +700 -0
  99. data/src/util/pm_buffer.c +190 -0
  100. data/src/util/pm_char.c +318 -0
  101. data/src/util/pm_constant_pool.c +322 -0
  102. data/src/util/pm_list.c +49 -0
  103. data/src/util/pm_memchr.c +35 -0
  104. data/src/util/pm_newline_list.c +84 -0
  105. data/src/util/pm_state_stack.c +25 -0
  106. data/src/util/pm_string.c +203 -0
  107. data/src/util/pm_string_list.c +28 -0
  108. data/src/util/pm_strncasecmp.c +24 -0
  109. data/src/util/pm_strpbrk.c +180 -0
  110. metadata +156 -0
@@ -0,0 +1,908 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "delegate"
4
+ require "ripper"
5
+
6
+ module Prism
7
+ # This class is responsible for lexing the source using prism and then
8
+ # converting those tokens to be compatible with Ripper. In the vast majority
9
+ # of cases, this is a one-to-one mapping of the token type. Everything else
10
+ # generally lines up. However, there are a few cases that require special
11
+ # handling.
12
+ class LexCompat # :nodoc:
13
    # This is a mapping of prism token types to Ripper token types. This is a
    # many-to-one mapping because we split up our token types, whereas Ripper
    # tends to group them.
    RIPPER = {
      AMPERSAND: :on_op,
      AMPERSAND_AMPERSAND: :on_op,
      AMPERSAND_AMPERSAND_EQUAL: :on_op,
      AMPERSAND_DOT: :on_op,
      AMPERSAND_EQUAL: :on_op,
      BACK_REFERENCE: :on_backref,
      BACKTICK: :on_backtick,
      BANG: :on_op,
      BANG_EQUAL: :on_op,
      BANG_TILDE: :on_op,
      BRACE_LEFT: :on_lbrace,
      BRACE_RIGHT: :on_rbrace,
      BRACKET_LEFT: :on_lbracket,
      BRACKET_LEFT_ARRAY: :on_lbracket,
      BRACKET_LEFT_RIGHT: :on_op,
      BRACKET_LEFT_RIGHT_EQUAL: :on_op,
      BRACKET_RIGHT: :on_rbracket,
      CARET: :on_op,
      CARET_EQUAL: :on_op,
      CHARACTER_LITERAL: :on_CHAR,
      CLASS_VARIABLE: :on_cvar,
      COLON: :on_op,
      COLON_COLON: :on_op,
      COMMA: :on_comma,
      COMMENT: :on_comment,
      CONSTANT: :on_const,
      DOT: :on_period,
      DOT_DOT: :on_op,
      DOT_DOT_DOT: :on_op,
      EMBDOC_BEGIN: :on_embdoc_beg,
      EMBDOC_END: :on_embdoc_end,
      EMBDOC_LINE: :on_embdoc,
      EMBEXPR_BEGIN: :on_embexpr_beg,
      EMBEXPR_END: :on_embexpr_end,
      EMBVAR: :on_embvar,
      EOF: :on_eof,
      EQUAL: :on_op,
      EQUAL_EQUAL: :on_op,
      EQUAL_EQUAL_EQUAL: :on_op,
      EQUAL_GREATER: :on_op,
      EQUAL_TILDE: :on_op,
      FLOAT: :on_float,
      FLOAT_IMAGINARY: :on_imaginary,
      FLOAT_RATIONAL: :on_rational,
      FLOAT_RATIONAL_IMAGINARY: :on_imaginary,
      GREATER: :on_op,
      GREATER_EQUAL: :on_op,
      GREATER_GREATER: :on_op,
      GREATER_GREATER_EQUAL: :on_op,
      GLOBAL_VARIABLE: :on_gvar,
      HEREDOC_END: :on_heredoc_end,
      HEREDOC_START: :on_heredoc_beg,
      IDENTIFIER: :on_ident,
      IGNORED_NEWLINE: :on_ignored_nl,
      INTEGER: :on_int,
      INTEGER_IMAGINARY: :on_imaginary,
      INTEGER_RATIONAL: :on_rational,
      INTEGER_RATIONAL_IMAGINARY: :on_imaginary,
      INSTANCE_VARIABLE: :on_ivar,
      INVALID: :INVALID,
      # All of prism's fine-grained keyword token types collapse into
      # Ripper's single :on_kw event.
      KEYWORD___ENCODING__: :on_kw,
      KEYWORD___LINE__: :on_kw,
      KEYWORD___FILE__: :on_kw,
      KEYWORD_ALIAS: :on_kw,
      KEYWORD_AND: :on_kw,
      KEYWORD_BEGIN: :on_kw,
      KEYWORD_BEGIN_UPCASE: :on_kw,
      KEYWORD_BREAK: :on_kw,
      KEYWORD_CASE: :on_kw,
      KEYWORD_CLASS: :on_kw,
      KEYWORD_DEF: :on_kw,
      KEYWORD_DEFINED: :on_kw,
      KEYWORD_DO: :on_kw,
      KEYWORD_DO_LOOP: :on_kw,
      KEYWORD_ELSE: :on_kw,
      KEYWORD_ELSIF: :on_kw,
      KEYWORD_END: :on_kw,
      KEYWORD_END_UPCASE: :on_kw,
      KEYWORD_ENSURE: :on_kw,
      KEYWORD_FALSE: :on_kw,
      KEYWORD_FOR: :on_kw,
      KEYWORD_IF: :on_kw,
      KEYWORD_IF_MODIFIER: :on_kw,
      KEYWORD_IN: :on_kw,
      KEYWORD_MODULE: :on_kw,
      KEYWORD_NEXT: :on_kw,
      KEYWORD_NIL: :on_kw,
      KEYWORD_NOT: :on_kw,
      KEYWORD_OR: :on_kw,
      KEYWORD_REDO: :on_kw,
      KEYWORD_RESCUE: :on_kw,
      KEYWORD_RESCUE_MODIFIER: :on_kw,
      KEYWORD_RETRY: :on_kw,
      KEYWORD_RETURN: :on_kw,
      KEYWORD_SELF: :on_kw,
      KEYWORD_SUPER: :on_kw,
      KEYWORD_THEN: :on_kw,
      KEYWORD_TRUE: :on_kw,
      KEYWORD_UNDEF: :on_kw,
      KEYWORD_UNLESS: :on_kw,
      KEYWORD_UNLESS_MODIFIER: :on_kw,
      KEYWORD_UNTIL: :on_kw,
      KEYWORD_UNTIL_MODIFIER: :on_kw,
      KEYWORD_WHEN: :on_kw,
      KEYWORD_WHILE: :on_kw,
      KEYWORD_WHILE_MODIFIER: :on_kw,
      KEYWORD_YIELD: :on_kw,
      LABEL: :on_label,
      LABEL_END: :on_label_end,
      LAMBDA_BEGIN: :on_tlambeg,
      LESS: :on_op,
      LESS_EQUAL: :on_op,
      LESS_EQUAL_GREATER: :on_op,
      LESS_LESS: :on_op,
      LESS_LESS_EQUAL: :on_op,
      METHOD_NAME: :on_ident,
      MINUS: :on_op,
      MINUS_EQUAL: :on_op,
      MINUS_GREATER: :on_tlambda,
      NEWLINE: :on_nl,
      NUMBERED_REFERENCE: :on_backref,
      PARENTHESIS_LEFT: :on_lparen,
      PARENTHESIS_LEFT_PARENTHESES: :on_lparen,
      PARENTHESIS_RIGHT: :on_rparen,
      PERCENT: :on_op,
      PERCENT_EQUAL: :on_op,
      PERCENT_LOWER_I: :on_qsymbols_beg,
      PERCENT_LOWER_W: :on_qwords_beg,
      PERCENT_LOWER_X: :on_backtick,
      PERCENT_UPPER_I: :on_symbols_beg,
      PERCENT_UPPER_W: :on_words_beg,
      PIPE: :on_op,
      PIPE_EQUAL: :on_op,
      PIPE_PIPE: :on_op,
      PIPE_PIPE_EQUAL: :on_op,
      PLUS: :on_op,
      PLUS_EQUAL: :on_op,
      QUESTION_MARK: :on_op,
      RATIONAL_FLOAT: :on_rational,
      RATIONAL_INTEGER: :on_rational,
      REGEXP_BEGIN: :on_regexp_beg,
      REGEXP_END: :on_regexp_end,
      SEMICOLON: :on_semicolon,
      SLASH: :on_op,
      SLASH_EQUAL: :on_op,
      STAR: :on_op,
      STAR_EQUAL: :on_op,
      STAR_STAR: :on_op,
      STAR_STAR_EQUAL: :on_op,
      STRING_BEGIN: :on_tstring_beg,
      STRING_CONTENT: :on_tstring_content,
      STRING_END: :on_tstring_end,
      SYMBOL_BEGIN: :on_symbeg,
      TILDE: :on_op,
      UAMPERSAND: :on_op,
      UCOLON_COLON: :on_op,
      UDOT_DOT: :on_op,
      UDOT_DOT_DOT: :on_op,
      UMINUS: :on_op,
      UMINUS_NUM: :on_op,
      UPLUS: :on_op,
      USTAR: :on_op,
      USTAR_STAR: :on_op,
      WORDS_SEP: :on_words_sep,
      "__END__": :on___end__
    }.freeze
183
+
184
+ # When we produce tokens, we produce the same arrays that Ripper does.
185
+ # However, we add a couple of convenience methods onto them to make them a
186
+ # little easier to work with. We delegate all other methods to the array.
187
+ class Token < SimpleDelegator
188
+ # The location of the token in the source.
189
+ def location
190
+ self[0]
191
+ end
192
+
193
+ # The type of the token.
194
+ def event
195
+ self[1]
196
+ end
197
+
198
+ # The slice of the source that this token represents.
199
+ def value
200
+ self[2]
201
+ end
202
+
203
+ # The state of the lexer when this token was produced.
204
+ def state
205
+ self[3]
206
+ end
207
+ end
208
+
209
    # Ripper doesn't include the rest of the token in the event, so we need to
    # trim it down to just the content on the first line when comparing.
    class EndContentToken < Token
      def ==(other) # :nodoc:
        # Trim our value down to everything up to and including the first
        # newline before comparing, since Ripper's __END__ token stops at the
        # end of the line that contains the marker.
        [self[0], self[1], self[2][0..self[2].index("\n")], self[3]] == other
      end
    end
216
+
217
+ # Tokens where state should be ignored
218
+ # used for :on_comment, :on_heredoc_end, :on_embexpr_end
219
+ class IgnoreStateToken < Token
220
+ def ==(other) # :nodoc:
221
+ self[0...-1] == other[0...-1]
222
+ end
223
+ end
224
+
225
    # Ident tokens for the most part are exactly the same, except sometimes we
    # know an ident is a local when ripper doesn't (when they are introduced
    # through named captures in regular expressions). In that case we don't
    # compare the state.
    class IdentToken < Token
      def ==(other) # :nodoc:
        # Everything except the state must match exactly; the state is then
        # accepted if it is either LABEL|END or any of the ARG states. Note
        # that | binds tighter than == in Ruby, so the first comparison is
        # against (Ripper::EXPR_LABEL | Ripper::EXPR_END).
        (self[0...-1] == other[0...-1]) && (
          (other[3] == Ripper::EXPR_LABEL | Ripper::EXPR_END) ||
          (other[3] & Ripper::EXPR_ARG_ANY != 0)
        )
      end
    end
237
+
238
    # Ignored newlines can occasionally have a LABEL state attached to them, so
    # we compare the state differently here.
    class IgnoredNewlineToken < Token
      def ==(other) # :nodoc:
        # Everything except the state must match exactly.
        return false unless self[0...-1] == other[0...-1]

        # NOTE(review): tokens built by LexCompat#result have exactly four
        # elements (location, event, value, state), so index 4 is out of
        # range and self[4]/other[4] both evaluate to nil. As written, the
        # first branch is never taken and the else branch compares
        # nil == nil, which effectively ignores the state entirely. Confirm
        # whether index 3 was intended here.
        if self[4] == Ripper::EXPR_ARG | Ripper::EXPR_LABELED
          other[4] & Ripper::EXPR_ARG | Ripper::EXPR_LABELED > 0
        else
          self[4] == other[4]
        end
      end
    end
251
+
252
    # If we have an identifier that follows a method name like:
    #
    #     def foo bar
    #
    # then Ripper will mark bar as END|LABEL if there is a local in a parent
    # scope named bar because it hasn't pushed the local table yet. We do this
    # more accurately, so we need to allow comparing against both END and
    # END|LABEL.
    class ParamToken < Token
      def ==(other) # :nodoc:
        # Everything except the state must match exactly; the state may then
        # be either END or END|LABEL (| binds tighter than ==, so the second
        # comparison is against the combined flag value).
        (self[0...-1] == other[0...-1]) && (
          (other[3] == Ripper::EXPR_END) ||
          (other[3] == Ripper::EXPR_END | Ripper::EXPR_LABEL)
        )
      end
    end
268
+
269
+ # A heredoc in this case is a list of tokens that belong to the body of the
270
+ # heredoc that should be appended onto the list of tokens when the heredoc
271
+ # closes.
272
+ module Heredoc # :nodoc:
273
+ # Heredocs that are no dash or tilde heredocs are just a list of tokens.
274
+ # We need to keep them around so that we can insert them in the correct
275
+ # order back into the token stream and set the state of the last token to
276
+ # the state that the heredoc was opened in.
277
+ class PlainHeredoc # :nodoc:
278
+ attr_reader :tokens
279
+
280
+ def initialize
281
+ @tokens = []
282
+ end
283
+
284
+ def <<(token)
285
+ tokens << token
286
+ end
287
+
288
+ def to_a
289
+ tokens
290
+ end
291
+ end
292
+
293
      # Dash heredocs are a little more complicated. They are a list of tokens
      # that need to be split on "\\\n" to mimic Ripper's behavior. We also need
      # to keep track of the state that the heredoc was opened in.
      class DashHeredoc # :nodoc:
        # split is true when the opening delimiter was not single-quoted, in
        # which case escaped line continuations must be split apart.
        attr_reader :split, :tokens

        def initialize(split)
          @split = split
          @tokens = []
        end

        # Append a token to the body of this heredoc.
        def <<(token)
          tokens << token
        end

        # Convert the accumulated tokens into Ripper-compatible tokens,
        # splitting plain string content on escaped newlines when necessary.
        def to_a
          # Track embedded-expression nesting so that only top-level string
          # content is rewritten.
          embexpr_balance = 0

          tokens.each_with_object([]) do |token, results|
            case token.event
            when :on_embexpr_beg
              embexpr_balance += 1
              results << token
            when :on_embexpr_end
              embexpr_balance -= 1
              results << token
            when :on_tstring_content
              if embexpr_balance == 0
                lineno = token[0][0]
                column = token[0][1]

                if split
                  # Split on "\\\n" to mimic Ripper's behavior. Use a lookbehind
                  # to keep the delimiter in the result.
                  token.value.split(/(?<=[^\\]\\\n)|(?<=[^\\]\\\r\n)/).each_with_index do |value, index|
                    # Only the first fragment keeps the original column; the
                    # rest start at the beginning of their line.
                    column = 0 if index > 0
                    results << Token.new([[lineno, column], :on_tstring_content, value, token.state])
                    lineno += value.count("\n")
                  end
                else
                  results << token
                end
              else
                results << token
              end
            else
              results << token
            end
          end
        end
      end
344
+
345
      # Heredocs that are dedenting heredocs are a little more complicated.
      # Ripper outputs on_ignored_sp tokens for the whitespace that is being
      # removed from the output. prism only modifies the node itself and keeps
      # the token the same. This simplifies prism, but makes comparing against
      # Ripper much harder because there is a length mismatch.
      #
      # Fortunately, we already have to pull out the heredoc tokens in order to
      # insert them into the stream in the correct order. As such, we can do
      # some extra manipulation on the tokens to make them match Ripper's
      # output by mirroring the dedent logic that Ripper uses.
      class DedentingHeredoc # :nodoc:
        # A tab counts as up to this many columns when measuring leading
        # whitespace, matching Ripper's dedent calculation.
        TAB_WIDTH = 8

        attr_reader :tokens, :dedent_next, :dedent, :embexpr_balance

        def initialize
          @tokens = []
          @dedent_next = true
          # Minimum common leading whitespace seen so far; nil until the first
          # measurable line is seen.
          @dedent = nil
          @embexpr_balance = 0
          @ended_on_newline = false
        end

        # As tokens are coming in, we track the minimum amount of common leading
        # whitespace on plain string content tokens. This allows us to later
        # remove that amount of whitespace from the beginning of each line.
        def <<(token)
          case token.event
          when :on_embexpr_beg, :on_heredoc_beg
            @embexpr_balance += 1
            # An interpolation at the very start of a line contributes zero
            # leading whitespace, which pins the common dedent to 0.
            @dedent = 0 if @dedent_next && @ended_on_newline
          when :on_embexpr_end, :on_heredoc_end
            @embexpr_balance -= 1
          when :on_tstring_content
            if embexpr_balance == 0
              line = token.value

              # Blank lines (whitespace followed by a newline) do not count
              # toward the common leading whitespace calculation.
              if dedent_next && !(line.strip.empty? && line.end_with?("\n"))
                leading = line[/\A(\s*)\n?/, 1]
                next_dedent = 0

                leading.each_char do |char|
                  if char == "\t"
                    # Tabs advance to the next TAB_WIDTH column boundary.
                    next_dedent = next_dedent - (next_dedent % TAB_WIDTH) + TAB_WIDTH
                  else
                    next_dedent += 1
                  end
                end

                @dedent = [dedent, next_dedent].compact.min
                @dedent_next = true
                @ended_on_newline = line.end_with?("\n")
                tokens << token
                return
              end
            end
          end

          # The next token needs dedenting only if this one was top-level
          # string content (i.e. it may have ended a line).
          @dedent_next = token.event == :on_tstring_content && embexpr_balance == 0
          @ended_on_newline = false
          tokens << token
        end

        # Produce the Ripper-compatible token list, splitting string content
        # per line and inserting on_ignored_sp tokens for dedented whitespace.
        def to_a
          # If every line in the heredoc is blank, we still need to split up the
          # string content token into multiple tokens.
          if dedent.nil?
            results = []
            embexpr_balance = 0

            tokens.each do |token|
              case token.event
              when :on_embexpr_beg, :on_heredoc_beg
                embexpr_balance += 1
                results << token
              when :on_embexpr_end, :on_heredoc_end
                embexpr_balance -= 1
                results << token
              when :on_tstring_content
                if embexpr_balance == 0
                  lineno = token[0][0]
                  column = token[0][1]

                  # Split while keeping the newlines via a lookbehind.
                  token.value.split(/(?<=\n)/).each_with_index do |value, index|
                    column = 0 if index > 0
                    results << Token.new([[lineno, column], :on_tstring_content, value, token.state])
                    lineno += 1
                  end
                else
                  results << token
                end
              else
                results << token
              end
            end

            return results
          end

          # If the minimum common whitespace is 0, then we need to concatenate
          # string nodes together that are immediately adjacent.
          if dedent == 0
            results = []
            embexpr_balance = 0

            index = 0
            max_index = tokens.length

            while index < max_index
              token = tokens[index]
              results << token
              index += 1

              case token.event
              when :on_embexpr_beg, :on_heredoc_beg
                embexpr_balance += 1
              when :on_embexpr_end, :on_heredoc_end
                embexpr_balance -= 1
              when :on_tstring_content
                if embexpr_balance == 0
                  # Merge runs of adjacent string content into one token.
                  while index < max_index && tokens[index].event == :on_tstring_content
                    token.value << tokens[index].value
                    index += 1
                  end
                end
              end
            end

            return results
          end

          # Otherwise, we're going to run through each token in the list and
          # insert on_ignored_sp tokens for the amount of dedent that we need to
          # perform. We also need to remove the dedent from the beginning of
          # each line of plain string content tokens.
          results = []
          dedent_next = true
          embexpr_balance = 0

          tokens.each do |token|
            # Notice that the structure of this conditional largely matches the
            # whitespace calculation we performed above. This is because
            # checking if the subsequent token needs to be dedented is common to
            # both the dedent calculation and the ignored_sp insertion.
            case token.event
            when :on_embexpr_beg
              embexpr_balance += 1
              results << token
            when :on_embexpr_end
              embexpr_balance -= 1
              results << token
            when :on_tstring_content
              if embexpr_balance == 0
                # Here we're going to split the string on newlines, but maintain
                # the newlines in the resulting array. We'll do that with a look
                # behind assertion.
                splits = token.value.split(/(?<=\n)/)
                index = 0

                while index < splits.length
                  line = splits[index]
                  lineno = token[0][0] + index
                  column = token[0][1]

                  # Blank lines do not count toward common leading whitespace
                  # calculation and do not need to be dedented.
                  if dedent_next || index > 0
                    column = 0
                  end

                  # If the dedent is 0 and we're not supposed to dedent the next
                  # line or this line doesn't start with whitespace, then we
                  # should concatenate the rest of the string to match ripper.
                  # NOTE(review): the dedent == 0 case returns earlier above,
                  # so this branch appears unreachable here — confirm whether
                  # it is intentionally kept for safety.
                  if dedent == 0 && (!dedent_next || !line.start_with?(/\s/))
                    line = splits[index..].join
                    index = splits.length
                  end

                  # If we are supposed to dedent this line or if this is not the
                  # first line of the string and this line isn't entirely blank,
                  # then we need to insert an on_ignored_sp token and remove the
                  # dedent from the beginning of the line.
                  if (dedent > 0) && (dedent_next || index > 0)
                    deleting = 0
                    deleted_chars = []

                    # Gather up all of the characters that we're going to
                    # delete, stopping when you hit a character that would put
                    # you over the dedent amount.
                    line.each_char.with_index do |char, i|
                      case char
                      when "\r"
                        # A \r immediately before \n is part of the line
                        # terminator and must not be deleted.
                        if line[i + 1] == "\n"
                          break
                        end
                      when "\n"
                        break
                      when "\t"
                        deleting = deleting - (deleting % TAB_WIDTH) + TAB_WIDTH
                      else
                        deleting += 1
                      end

                      break if deleting > dedent
                      deleted_chars << char
                    end

                    # If we have something to delete, then delete it from the
                    # string and insert an on_ignored_sp token.
                    if deleted_chars.any?
                      ignored = deleted_chars.join
                      line.delete_prefix!(ignored)

                      results << Token.new([[lineno, 0], :on_ignored_sp, ignored, token[3]])
                      column = ignored.length
                    end
                  end

                  results << Token.new([[lineno, column], token[1], line, token[3]]) unless line.empty?
                  index += 1
                end
              else
                results << token
              end
            else
              results << token
            end

            dedent_next =
              ((token.event == :on_tstring_content) || (token.event == :on_heredoc_end)) &&
              embexpr_balance == 0
          end

          results
        end
      end
581
+
582
+ # Here we will split between the two types of heredocs and return the
583
+ # object that will store their tokens.
584
+ def self.build(opening)
585
+ case opening.value[2]
586
+ when "~"
587
+ DedentingHeredoc.new
588
+ when "-"
589
+ DashHeredoc.new(opening.value[3] != "'")
590
+ else
591
+ PlainHeredoc.new
592
+ end
593
+ end
594
+ end
595
+
596
+ private_constant :Heredoc
597
+
598
    # The source string being lexed and the options forwarded to Prism.lex.
    attr_reader :source, :options

    # Accepts the source to lex and any keyword options accepted by Prism.lex.
    def initialize(source, **options)
      @source = source
      @options = options
    end
604
+
605
    # Lex the source with prism, convert each token into its Ripper-compatible
    # form, and reorder heredoc bodies to match Ripper's token ordering.
    # Returns a ParseResult whose value is the list of Ripper-style tokens.
    def result
      tokens = []

      # Heredoc shuffling state machine: :default, :heredoc_opened, or
      # :heredoc_closed (see the comments further down).
      state = :default
      heredoc_stack = [[]]

      result = Prism.lex(source, **options)
      result_value = result.value
      previous_state = nil
      last_heredoc_end = nil

      # In previous versions of Ruby, Ripper wouldn't flush the bom before the
      # first token, so we had to have a hack in place to account for that. This
      # checks for that behavior.
      bom_flushed = Ripper.lex("\xEF\xBB\xBF# test")[0][0][1] == 0
      bom = source.byteslice(0..2) == "\xEF\xBB\xBF"

      result_value.each_with_index do |(token, lex_state), index|
        lineno = token.location.start_line
        column = token.location.start_column

        # If there's a UTF-8 byte-order mark as the start of the file, then for
        # certain tokens ripper sets the first token back by 3 bytes. It also
        # keeps the byte order mark in the first token's value. This is weird,
        # and I don't want to mirror that in our parser. So instead, we'll match
        # up the columns and values here.
        if bom && lineno == 1
          column -= 3

          if index == 0 && column == 0 && !bom_flushed
            flushed =
              case token.type
              when :BACK_REFERENCE, :INSTANCE_VARIABLE, :CLASS_VARIABLE,
                   :GLOBAL_VARIABLE, :NUMBERED_REFERENCE, :PERCENT_LOWER_I,
                   :PERCENT_LOWER_X, :PERCENT_LOWER_W, :PERCENT_UPPER_I,
                   :PERCENT_UPPER_W, :STRING_BEGIN
                true
              when :REGEXP_BEGIN, :SYMBOL_BEGIN
                token.value.start_with?("%")
              else
                false
              end

            unless flushed
              column -= 3
              value = token.value
              value.prepend(String.new("\xEF\xBB\xBF", encoding: value.encoding))
            end
          end
        end

        event = RIPPER.fetch(token.type)
        value = token.value
        lex_state = Ripper::Lexer::State.new(lex_state)

        token =
          case event
          when :on___end__
            EndContentToken.new([[lineno, column], event, value, lex_state])
          when :on_comment
            IgnoreStateToken.new([[lineno, column], event, value, lex_state])
          when :on_heredoc_end
            # Heredoc end tokens can be emitted in an odd order, so we don't
            # want to bother comparing the state on them.
            last_heredoc_end = token.location.end_offset
            IgnoreStateToken.new([[lineno, column], event, value, lex_state])
          when :on_ident
            if lex_state == Ripper::EXPR_END
              # If we have an identifier that follows a method name like:
              #
              #     def foo bar
              #
              # then Ripper will mark bar as END|LABEL if there is a local in a
              # parent scope named bar because it hasn't pushed the local table
              # yet. We do this more accurately, so we need to allow comparing
              # against both END and END|LABEL.
              ParamToken.new([[lineno, column], event, value, lex_state])
            elsif lex_state == Ripper::EXPR_END | Ripper::EXPR_LABEL
              # In the event that we're comparing identifiers, we're going to
              # allow a little divergence. Ripper doesn't account for local
              # variables introduced through named captures in regexes, and we
              # do, which accounts for this difference.
              IdentToken.new([[lineno, column], event, value, lex_state])
            else
              Token.new([[lineno, column], event, value, lex_state])
            end
          when :on_embexpr_end
            IgnoreStateToken.new([[lineno, column], event, value, lex_state])
          when :on_ignored_nl
            # Ignored newlines can occasionally have a LABEL state attached to
            # them which doesn't actually impact anything. We don't mirror that
            # state so we ignore it.
            IgnoredNewlineToken.new([[lineno, column], event, value, lex_state])
          when :on_regexp_end
            # On regex end, Ripper scans and then sets end state, so the ripper
            # lexed output is begin, when it should be end. prism sets lex state
            # correctly to end state, but we want to be able to compare against
            # Ripper's lexed state. So here, if it's a regexp end token, we
            # output the state as the previous state, solely for the sake of
            # comparison.
            previous_token = result_value[index - 1][0]
            lex_state =
              if RIPPER.fetch(previous_token.type) == :on_embexpr_end
                # If the previous token is embexpr_end, then we have to do even
                # more processing. The end of an embedded expression sets the
                # state to the state that it had at the beginning of the
                # embedded expression. So we have to go and find that state and
                # set it here.
                counter = 1
                current_index = index - 1

                until counter == 0
                  current_index -= 1
                  current_event = RIPPER.fetch(result_value[current_index][0].type)
                  counter += { on_embexpr_beg: -1, on_embexpr_end: 1 }[current_event] || 0
                end

                Ripper::Lexer::State.new(result_value[current_index][1])
              else
                previous_state
              end

            Token.new([[lineno, column], event, value, lex_state])
          when :on_eof
            previous_token = result_value[index - 1][0]

            # If we're at the end of the file and the previous token was a
            # comment and there is still whitespace after the comment, then
            # Ripper will append a on_nl token (even though there isn't
            # necessarily a newline). We mirror that here.
            if previous_token.type == :COMMENT
              # If the comment is at the start of a heredoc: <<HEREDOC # comment
              # then the comment's end_offset is up near the heredoc_beg.
              # This is not the correct offset to use for figuring out if
              # there is trailing whitespace after the last token.
              # Use the greater offset of the two to determine the start of
              # the trailing whitespace.
              start_offset = [previous_token.location.end_offset, last_heredoc_end].compact.max
              end_offset = token.location.start_offset

              if start_offset < end_offset
                if bom
                  start_offset += 3
                  end_offset += 3
                end

                tokens << Token.new([[lineno, 0], :on_nl, source.byteslice(start_offset...end_offset), lex_state])
              end
            end

            Token.new([[lineno, column], event, value, lex_state])
          else
            Token.new([[lineno, column], event, value, lex_state])
          end

        previous_state = lex_state

        # The order in which tokens appear in our lexer is different from the
        # order that they appear in Ripper. When we hit the declaration of a
        # heredoc in prism, we skip forward and lex the rest of the content of
        # the heredoc before going back and lexing at the end of the heredoc
        # identifier.
        #
        # To match up to ripper, we keep a small state variable around here to
        # track whether we're in the middle of a heredoc or not. In this way we
        # can shuffle around the token to match Ripper's output.
        case state
        when :default
          # The default state is when there are no heredocs at all. In this
          # state we can append the token to the list of tokens and move on.
          tokens << token

          # If we get the declaration of a heredoc, then we open a new heredoc
          # and move into the heredoc_opened state.
          if event == :on_heredoc_beg
            state = :heredoc_opened
            heredoc_stack.last << Heredoc.build(token)
          end
        when :heredoc_opened
          # The heredoc_opened state is when we've seen the declaration of a
          # heredoc and are now lexing the body of the heredoc. In this state we
          # push tokens onto the most recently created heredoc.
          heredoc_stack.last.last << token

          case event
          when :on_heredoc_beg
            # If we receive a heredoc declaration while lexing the body of a
            # heredoc, this means we have nested heredocs. In this case we'll
            # push a new heredoc onto the stack and stay in the heredoc_opened
            # state since we're now lexing the body of the new heredoc.
            heredoc_stack << [Heredoc.build(token)]
          when :on_heredoc_end
            # If we receive the end of a heredoc, then we're done lexing the
            # body of the heredoc. In this case we now have a completed heredoc
            # but need to wait for the next newline to push it into the token
            # stream.
            state = :heredoc_closed
          end
        when :heredoc_closed
          if %i[on_nl on_ignored_nl on_comment].include?(event) || (event == :on_tstring_content && value.end_with?("\n"))
            if heredoc_stack.size > 1
              # A nested heredoc just finished: flush its tokens into the
              # parent heredoc and resume lexing the parent's body.
              flushing = heredoc_stack.pop
              heredoc_stack.last.last << token

              flushing.each do |heredoc|
                heredoc.to_a.each do |flushed_token|
                  heredoc_stack.last.last << flushed_token
                end
              end

              state = :heredoc_opened
              next
            end
          elsif event == :on_heredoc_beg
            tokens << token
            state = :heredoc_opened
            heredoc_stack.last << Heredoc.build(token)
            next
          elsif heredoc_stack.size > 1
            heredoc_stack[-2].last << token
            next
          end

          # The newline after the heredoc declaration line has arrived, so the
          # completed heredoc bodies can now be flushed into the token stream.
          heredoc_stack.last.each do |heredoc|
            tokens.concat(heredoc.to_a)
          end

          heredoc_stack.last.clear
          state = :default

          tokens << token
        end
      end

      # Drop the EOF token from the list
      tokens = tokens[0...-1]

      # We sort by location to compare against Ripper's output
      tokens.sort_by!(&:location)

      ParseResult.new(tokens, result.comments, result.magic_comments, result.data_loc, result.errors, result.warnings, [])
    end
847
+ end
848
+
849
+ private_constant :LexCompat
850
+
851
+ # This is a class that wraps the Ripper lexer to produce almost exactly the
852
+ # same tokens.
853
+ class LexRipper # :nodoc:
854
+ attr_reader :source
855
+
856
+ def initialize(source)
857
+ @source = source
858
+ end
859
+
860
+ def result
861
+ previous = []
862
+ results = []
863
+
864
+ lex(source).each do |token|
865
+ case token[1]
866
+ when :on_sp
867
+ # skip
868
+ when :on_tstring_content
869
+ if previous[1] == :on_tstring_content && (token[2].start_with?("\#$") || token[2].start_with?("\#@"))
870
+ previous[2] << token[2]
871
+ else
872
+ results << token
873
+ previous = token
874
+ end
875
+ when :on_words_sep
876
+ if previous[1] == :on_words_sep
877
+ previous[2] << token[2]
878
+ else
879
+ results << token
880
+ previous = token
881
+ end
882
+ else
883
+ results << token
884
+ previous = token
885
+ end
886
+ end
887
+
888
+ results
889
+ end
890
+
891
+ private
892
+
893
+ if Ripper.method(:lex).parameters.assoc(:keyrest)
894
+ def lex(source)
895
+ Ripper.lex(source, raise_errors: true)
896
+ end
897
+ else
898
+ def lex(source)
899
+ ripper = Ripper::Lexer.new(source)
900
+ ripper.lex.tap do |result|
901
+ raise SyntaxError, ripper.errors.map(&:message).join(' ;') if ripper.errors.any?
902
+ end
903
+ end
904
+ end
905
+ end
906
+
907
+ private_constant :LexRipper
908
+ end