prism 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95)
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +172 -0
  3. data/CODE_OF_CONDUCT.md +76 -0
  4. data/CONTRIBUTING.md +62 -0
  5. data/LICENSE.md +7 -0
  6. data/Makefile +84 -0
  7. data/README.md +89 -0
  8. data/config.yml +2481 -0
  9. data/docs/build_system.md +74 -0
  10. data/docs/building.md +22 -0
  11. data/docs/configuration.md +60 -0
  12. data/docs/design.md +53 -0
  13. data/docs/encoding.md +117 -0
  14. data/docs/fuzzing.md +93 -0
  15. data/docs/heredocs.md +36 -0
  16. data/docs/mapping.md +117 -0
  17. data/docs/ripper.md +36 -0
  18. data/docs/ruby_api.md +25 -0
  19. data/docs/serialization.md +181 -0
  20. data/docs/testing.md +55 -0
  21. data/ext/prism/api_node.c +4725 -0
  22. data/ext/prism/api_pack.c +256 -0
  23. data/ext/prism/extconf.rb +136 -0
  24. data/ext/prism/extension.c +626 -0
  25. data/ext/prism/extension.h +18 -0
  26. data/include/prism/ast.h +1932 -0
  27. data/include/prism/defines.h +45 -0
  28. data/include/prism/diagnostic.h +231 -0
  29. data/include/prism/enc/pm_encoding.h +95 -0
  30. data/include/prism/node.h +41 -0
  31. data/include/prism/pack.h +141 -0
  32. data/include/prism/parser.h +418 -0
  33. data/include/prism/regexp.h +19 -0
  34. data/include/prism/unescape.h +48 -0
  35. data/include/prism/util/pm_buffer.h +51 -0
  36. data/include/prism/util/pm_char.h +91 -0
  37. data/include/prism/util/pm_constant_pool.h +78 -0
  38. data/include/prism/util/pm_list.h +67 -0
  39. data/include/prism/util/pm_memchr.h +14 -0
  40. data/include/prism/util/pm_newline_list.h +61 -0
  41. data/include/prism/util/pm_state_stack.h +24 -0
  42. data/include/prism/util/pm_string.h +61 -0
  43. data/include/prism/util/pm_string_list.h +25 -0
  44. data/include/prism/util/pm_strpbrk.h +29 -0
  45. data/include/prism/version.h +4 -0
  46. data/include/prism.h +82 -0
  47. data/lib/prism/compiler.rb +465 -0
  48. data/lib/prism/debug.rb +157 -0
  49. data/lib/prism/desugar_compiler.rb +206 -0
  50. data/lib/prism/dispatcher.rb +2051 -0
  51. data/lib/prism/dsl.rb +750 -0
  52. data/lib/prism/ffi.rb +251 -0
  53. data/lib/prism/lex_compat.rb +838 -0
  54. data/lib/prism/mutation_compiler.rb +718 -0
  55. data/lib/prism/node.rb +14540 -0
  56. data/lib/prism/node_ext.rb +55 -0
  57. data/lib/prism/node_inspector.rb +68 -0
  58. data/lib/prism/pack.rb +185 -0
  59. data/lib/prism/parse_result/comments.rb +172 -0
  60. data/lib/prism/parse_result/newlines.rb +60 -0
  61. data/lib/prism/parse_result.rb +266 -0
  62. data/lib/prism/pattern.rb +239 -0
  63. data/lib/prism/ripper_compat.rb +174 -0
  64. data/lib/prism/serialize.rb +662 -0
  65. data/lib/prism/visitor.rb +470 -0
  66. data/lib/prism.rb +64 -0
  67. data/prism.gemspec +113 -0
  68. data/src/diagnostic.c +287 -0
  69. data/src/enc/pm_big5.c +52 -0
  70. data/src/enc/pm_euc_jp.c +58 -0
  71. data/src/enc/pm_gbk.c +61 -0
  72. data/src/enc/pm_shift_jis.c +56 -0
  73. data/src/enc/pm_tables.c +507 -0
  74. data/src/enc/pm_unicode.c +2324 -0
  75. data/src/enc/pm_windows_31j.c +56 -0
  76. data/src/node.c +2633 -0
  77. data/src/pack.c +493 -0
  78. data/src/prettyprint.c +2136 -0
  79. data/src/prism.c +14587 -0
  80. data/src/regexp.c +580 -0
  81. data/src/serialize.c +1899 -0
  82. data/src/token_type.c +349 -0
  83. data/src/unescape.c +637 -0
  84. data/src/util/pm_buffer.c +103 -0
  85. data/src/util/pm_char.c +272 -0
  86. data/src/util/pm_constant_pool.c +252 -0
  87. data/src/util/pm_list.c +41 -0
  88. data/src/util/pm_memchr.c +33 -0
  89. data/src/util/pm_newline_list.c +134 -0
  90. data/src/util/pm_state_stack.c +19 -0
  91. data/src/util/pm_string.c +200 -0
  92. data/src/util/pm_string_list.c +29 -0
  93. data/src/util/pm_strncasecmp.c +17 -0
  94. data/src/util/pm_strpbrk.c +66 -0
  95. metadata +138 -0
data/lib/prism/lex_compat.rb
@@ -0,0 +1,838 @@
+# frozen_string_literal: true
+
+require "delegate"
+
+module Prism
+  # This class is responsible for lexing the source using prism and then
+  # converting those tokens to be compatible with Ripper. In the vast majority
+  # of cases, this is a one-to-one mapping of the token type. Everything else
+  # generally lines up. However, there are a few cases that require special
+  # handling.
+  class LexCompat
+    # This is a mapping of prism token types to Ripper token types. This is a
+    # many-to-one mapping because we split up our token types, whereas Ripper
+    # tends to group them.
+    RIPPER = {
+      AMPERSAND: :on_op,
+      AMPERSAND_AMPERSAND: :on_op,
+      AMPERSAND_AMPERSAND_EQUAL: :on_op,
+      AMPERSAND_DOT: :on_op,
+      AMPERSAND_EQUAL: :on_op,
+      BACK_REFERENCE: :on_backref,
+      BACKTICK: :on_backtick,
+      BANG: :on_op,
+      BANG_EQUAL: :on_op,
+      BANG_TILDE: :on_op,
+      BRACE_LEFT: :on_lbrace,
+      BRACE_RIGHT: :on_rbrace,
+      BRACKET_LEFT: :on_lbracket,
+      BRACKET_LEFT_ARRAY: :on_lbracket,
+      BRACKET_LEFT_RIGHT: :on_op,
+      BRACKET_LEFT_RIGHT_EQUAL: :on_op,
+      BRACKET_RIGHT: :on_rbracket,
+      CARET: :on_op,
+      CARET_EQUAL: :on_op,
+      CHARACTER_LITERAL: :on_CHAR,
+      CLASS_VARIABLE: :on_cvar,
+      COLON: :on_op,
+      COLON_COLON: :on_op,
+      COMMA: :on_comma,
+      COMMENT: :on_comment,
+      CONSTANT: :on_const,
+      DOT: :on_period,
+      DOT_DOT: :on_op,
+      DOT_DOT_DOT: :on_op,
+      EMBDOC_BEGIN: :on_embdoc_beg,
+      EMBDOC_END: :on_embdoc_end,
+      EMBDOC_LINE: :on_embdoc,
+      EMBEXPR_BEGIN: :on_embexpr_beg,
+      EMBEXPR_END: :on_embexpr_end,
+      EMBVAR: :on_embvar,
+      EOF: :on_eof,
+      EQUAL: :on_op,
+      EQUAL_EQUAL: :on_op,
+      EQUAL_EQUAL_EQUAL: :on_op,
+      EQUAL_GREATER: :on_op,
+      EQUAL_TILDE: :on_op,
+      FLOAT: :on_float,
+      FLOAT_IMAGINARY: :on_imaginary,
+      FLOAT_RATIONAL: :on_rational,
+      FLOAT_RATIONAL_IMAGINARY: :on_imaginary,
+      GREATER: :on_op,
+      GREATER_EQUAL: :on_op,
+      GREATER_GREATER: :on_op,
+      GREATER_GREATER_EQUAL: :on_op,
+      GLOBAL_VARIABLE: :on_gvar,
+      HEREDOC_END: :on_heredoc_end,
+      HEREDOC_START: :on_heredoc_beg,
+      IDENTIFIER: :on_ident,
+      IGNORED_NEWLINE: :on_ignored_nl,
+      INTEGER: :on_int,
+      INTEGER_IMAGINARY: :on_imaginary,
+      INTEGER_RATIONAL: :on_rational,
+      INTEGER_RATIONAL_IMAGINARY: :on_imaginary,
+      INSTANCE_VARIABLE: :on_ivar,
+      INVALID: :INVALID,
+      KEYWORD___ENCODING__: :on_kw,
+      KEYWORD___LINE__: :on_kw,
+      KEYWORD___FILE__: :on_kw,
+      KEYWORD_ALIAS: :on_kw,
+      KEYWORD_AND: :on_kw,
+      KEYWORD_BEGIN: :on_kw,
+      KEYWORD_BEGIN_UPCASE: :on_kw,
+      KEYWORD_BREAK: :on_kw,
+      KEYWORD_CASE: :on_kw,
+      KEYWORD_CLASS: :on_kw,
+      KEYWORD_DEF: :on_kw,
+      KEYWORD_DEFINED: :on_kw,
+      KEYWORD_DO: :on_kw,
+      KEYWORD_DO_LOOP: :on_kw,
+      KEYWORD_ELSE: :on_kw,
+      KEYWORD_ELSIF: :on_kw,
+      KEYWORD_END: :on_kw,
+      KEYWORD_END_UPCASE: :on_kw,
+      KEYWORD_ENSURE: :on_kw,
+      KEYWORD_FALSE: :on_kw,
+      KEYWORD_FOR: :on_kw,
+      KEYWORD_IF: :on_kw,
+      KEYWORD_IF_MODIFIER: :on_kw,
+      KEYWORD_IN: :on_kw,
+      KEYWORD_MODULE: :on_kw,
+      KEYWORD_NEXT: :on_kw,
+      KEYWORD_NIL: :on_kw,
+      KEYWORD_NOT: :on_kw,
+      KEYWORD_OR: :on_kw,
+      KEYWORD_REDO: :on_kw,
+      KEYWORD_RESCUE: :on_kw,
+      KEYWORD_RESCUE_MODIFIER: :on_kw,
+      KEYWORD_RETRY: :on_kw,
+      KEYWORD_RETURN: :on_kw,
+      KEYWORD_SELF: :on_kw,
+      KEYWORD_SUPER: :on_kw,
+      KEYWORD_THEN: :on_kw,
+      KEYWORD_TRUE: :on_kw,
+      KEYWORD_UNDEF: :on_kw,
+      KEYWORD_UNLESS: :on_kw,
+      KEYWORD_UNLESS_MODIFIER: :on_kw,
+      KEYWORD_UNTIL: :on_kw,
+      KEYWORD_UNTIL_MODIFIER: :on_kw,
+      KEYWORD_WHEN: :on_kw,
+      KEYWORD_WHILE: :on_kw,
+      KEYWORD_WHILE_MODIFIER: :on_kw,
+      KEYWORD_YIELD: :on_kw,
+      LABEL: :on_label,
+      LABEL_END: :on_label_end,
+      LAMBDA_BEGIN: :on_tlambeg,
+      LESS: :on_op,
+      LESS_EQUAL: :on_op,
+      LESS_EQUAL_GREATER: :on_op,
+      LESS_LESS: :on_op,
+      LESS_LESS_EQUAL: :on_op,
+      METHOD_NAME: :on_ident,
+      MINUS: :on_op,
+      MINUS_EQUAL: :on_op,
+      MINUS_GREATER: :on_tlambda,
+      NEWLINE: :on_nl,
+      NUMBERED_REFERENCE: :on_backref,
+      PARENTHESIS_LEFT: :on_lparen,
+      PARENTHESIS_LEFT_PARENTHESES: :on_lparen,
+      PARENTHESIS_RIGHT: :on_rparen,
+      PERCENT: :on_op,
+      PERCENT_EQUAL: :on_op,
+      PERCENT_LOWER_I: :on_qsymbols_beg,
+      PERCENT_LOWER_W: :on_qwords_beg,
+      PERCENT_LOWER_X: :on_backtick,
+      PERCENT_UPPER_I: :on_symbols_beg,
+      PERCENT_UPPER_W: :on_words_beg,
+      PIPE: :on_op,
+      PIPE_EQUAL: :on_op,
+      PIPE_PIPE: :on_op,
+      PIPE_PIPE_EQUAL: :on_op,
+      PLUS: :on_op,
+      PLUS_EQUAL: :on_op,
+      QUESTION_MARK: :on_op,
+      RATIONAL_FLOAT: :on_rational,
+      RATIONAL_INTEGER: :on_rational,
+      REGEXP_BEGIN: :on_regexp_beg,
+      REGEXP_END: :on_regexp_end,
+      SEMICOLON: :on_semicolon,
+      SLASH: :on_op,
+      SLASH_EQUAL: :on_op,
+      STAR: :on_op,
+      STAR_EQUAL: :on_op,
+      STAR_STAR: :on_op,
+      STAR_STAR_EQUAL: :on_op,
+      STRING_BEGIN: :on_tstring_beg,
+      STRING_CONTENT: :on_tstring_content,
+      STRING_END: :on_tstring_end,
+      SYMBOL_BEGIN: :on_symbeg,
+      TILDE: :on_op,
+      UAMPERSAND: :on_op,
+      UCOLON_COLON: :on_op,
+      UDOT_DOT: :on_op,
+      UDOT_DOT_DOT: :on_op,
+      UMINUS: :on_op,
+      UMINUS_NUM: :on_op,
+      UPLUS: :on_op,
+      USTAR: :on_op,
+      USTAR_STAR: :on_op,
+      WORDS_SEP: :on_words_sep,
+      "__END__": :on___end__
+    }.freeze
+
+    # When we produce tokens, we produce the same arrays that Ripper does.
+    # However, we add a couple of convenience methods onto them to make them a
+    # little easier to work with. We delegate all other methods to the array.
+    class Token < SimpleDelegator
+      def location
+        self[0]
+      end
+
+      def event
+        self[1]
+      end
+
+      def value
+        self[2]
+      end
+
+      def state
+        self[3]
+      end
+    end
+
+    # Ripper doesn't include the rest of the token in the event, so we need to
+    # trim it down to just the content on the first line when comparing.
+    class EndContentToken < Token
+      def ==(other)
+        [self[0], self[1], self[2][0..self[2].index("\n")], self[3]] == other
+      end
+    end
+
+    # Tokens where state should be ignored
+    # used for :on_comment, :on_heredoc_end, :on_embexpr_end
+    class IgnoreStateToken < Token
+      def ==(other)
+        self[0...-1] == other[0...-1]
+      end
+    end
+
+    # Ident tokens for the most part are exactly the same, except sometimes we
+    # know an ident is a local when ripper doesn't (when they are introduced
+    # through named captures in regular expressions). In that case we don't
+    # compare the state.
+    class IdentToken < Token
+      def ==(other)
+        (self[0...-1] == other[0...-1]) && (
+          (other[3] == Ripper::EXPR_LABEL | Ripper::EXPR_END) ||
+          (other[3] & Ripper::EXPR_ARG_ANY != 0)
+        )
+      end
+    end
+
+    # Ignored newlines can occasionally have a LABEL state attached to them, so
+    # we compare the state differently here.
+    class IgnoredNewlineToken < Token
+      def ==(other)
+        return false unless self[0...-1] == other[0...-1]
+
+        if self[4] == Ripper::EXPR_ARG | Ripper::EXPR_LABELED
+          other[4] & Ripper::EXPR_ARG | Ripper::EXPR_LABELED > 0
+        else
+          self[4] == other[4]
+        end
+      end
+    end
+
+    # If we have an identifier that follows a method name like:
+    #
+    #     def foo bar
+    #
+    # then Ripper will mark bar as END|LABEL if there is a local in a parent
+    # scope named bar because it hasn't pushed the local table yet. We do this
+    # more accurately, so we need to allow comparing against both END and
+    # END|LABEL.
+    class ParamToken < Token
+      def ==(other)
+        (self[0...-1] == other[0...-1]) && (
+          (other[3] == Ripper::EXPR_END) ||
+          (other[3] == Ripper::EXPR_END | Ripper::EXPR_LABEL)
+        )
+      end
+    end
+
+    # A heredoc in this case is a list of tokens that belong to the body of the
+    # heredoc that should be appended onto the list of tokens when the heredoc
+    # closes.
+    module Heredoc
+      # Heredocs that are no dash or tilde heredocs are just a list of tokens.
+      # We need to keep them around so that we can insert them in the correct
+      # order back into the token stream and set the state of the last token to
+      # the state that the heredoc was opened in.
+      class PlainHeredoc
+        attr_reader :tokens
+
+        def initialize
+          @tokens = []
+        end
+
+        def <<(token)
+          tokens << token
+        end
+
+        def to_a
+          tokens
+        end
+      end
+
+      # Dash heredocs are a little more complicated. They are a list of tokens
+      # that need to be split on "\\\n" to mimic Ripper's behavior. We also need
+      # to keep track of the state that the heredoc was opened in.
+      class DashHeredoc
+        attr_reader :split, :tokens
+
+        def initialize(split)
+          @split = split
+          @tokens = []
+        end
+
+        def <<(token)
+          tokens << token
+        end
+
+        def to_a
+          embexpr_balance = 0
+
+          tokens.each_with_object([]) do |token, results|
+            case token.event
+            when :on_embexpr_beg
+              embexpr_balance += 1
+              results << token
+            when :on_embexpr_end
+              embexpr_balance -= 1
+              results << token
+            when :on_tstring_content
+              if embexpr_balance == 0
+                lineno = token[0][0]
+                column = token[0][1]
+
+                if split
+                  # Split on "\\\n" to mimic Ripper's behavior. Use a lookbehind
+                  # to keep the delimiter in the result.
+                  token.value.split(/(?<=[^\\]\\\n)|(?<=[^\\]\\\r\n)/).each_with_index do |value, index|
+                    column = 0 if index > 0
+                    results << Token.new([[lineno, column], :on_tstring_content, value, token.state])
+                    lineno += value.count("\n")
+                  end
+                else
+                  results << token
+                end
+              else
+                results << token
+              end
+            else
+              results << token
+            end
+          end
+        end
+      end
+
+      # Heredocs that are dedenting heredocs are a little more complicated.
+      # Ripper outputs on_ignored_sp tokens for the whitespace that is being
+      # removed from the output. prism only modifies the node itself and keeps
+      # the token the same. This simplifies prism, but makes comparing against
+      # Ripper much harder because there is a length mismatch.
+      #
+      # Fortunately, we already have to pull out the heredoc tokens in order to
+      # insert them into the stream in the correct order. As such, we can do
+      # some extra manipulation on the tokens to make them match Ripper's
+      # output by mirroring the dedent logic that Ripper uses.
+      class DedentingHeredoc
+        TAB_WIDTH = 8
+
+        attr_reader :tokens, :dedent_next, :dedent, :embexpr_balance
+
+        def initialize
+          @tokens = []
+          @dedent_next = true
+          @dedent = nil
+          @embexpr_balance = 0
+        end
+
+        # As tokens are coming in, we track the minimum amount of common leading
+        # whitespace on plain string content tokens. This allows us to later
+        # remove that amount of whitespace from the beginning of each line.
+        def <<(token)
+          case token.event
+          when :on_embexpr_beg, :on_heredoc_beg
+            @embexpr_balance += 1
+          when :on_embexpr_end, :on_heredoc_end
+            @embexpr_balance -= 1
+          when :on_tstring_content
+            if embexpr_balance == 0
+              token.value.split(/(?<=\n)/).each_with_index do |line, index|
+                next if line.strip.empty? && line.end_with?("\n")
+                next if !(dedent_next || index > 0)
+
+                leading = line[/\A(\s*)\n?/, 1]
+                next_dedent = 0
+
+                leading.each_char do |char|
+                  if char == "\t"
+                    next_dedent = next_dedent - (next_dedent % TAB_WIDTH) + TAB_WIDTH
+                  else
+                    next_dedent += 1
+                  end
+                end
+
+                @dedent = [dedent, next_dedent].compact.min
+              end
+            end
+          end
+
+          @dedent_next = token.event == :on_tstring_content && embexpr_balance == 0
+          tokens << token
+        end
+
+        def to_a
+          # If every line in the heredoc is blank, we still need to split up the
+          # string content token into multiple tokens.
+          if dedent.nil?
+            results = []
+            embexpr_balance = 0
+
+            tokens.each do |token|
+              case token.event
+              when :on_embexpr_beg, :on_heredoc_beg
+                embexpr_balance += 1
+                results << token
+              when :on_embexpr_end, :on_heredoc_end
+                embexpr_balance -= 1
+                results << token
+              when :on_tstring_content
+                if embexpr_balance == 0
+                  lineno = token[0][0]
+                  column = token[0][1]
+
+                  token.value.split(/(?<=\n)/).each_with_index do |value, index|
+                    column = 0 if index > 0
+                    results << Token.new([[lineno, column], :on_tstring_content, value, token.state])
+                    lineno += 1
+                  end
+                else
+                  results << token
+                end
+              else
+                results << token
+              end
+            end
+
+            return results
+          end
+
+          # Otherwise, we're going to run through each token in the list and
+          # insert on_ignored_sp tokens for the amount of dedent that we need to
+          # perform. We also need to remove the dedent from the beginning of
+          # each line of plain string content tokens.
+          results = []
+          dedent_next = true
+          embexpr_balance = 0
+
+          tokens.each do |token|
+            # Notice that the structure of this conditional largely matches the
+            # whitespace calculation we performed above. This is because
+            # checking if the subsequent token needs to be dedented is common to
+            # both the dedent calculation and the ignored_sp insertion.
+            case token.event
+            when :on_embexpr_beg
+              embexpr_balance += 1
+              results << token
+            when :on_embexpr_end
+              embexpr_balance -= 1
+              results << token
+            when :on_tstring_content
+              if embexpr_balance == 0
+                # Here we're going to split the string on newlines, but maintain
+                # the newlines in the resulting array. We'll do that with a look
+                # behind assertion.
+                splits = token.value.split(/(?<=\n)/)
+                index = 0
+
+                while index < splits.length
+                  line = splits[index]
+                  lineno = token[0][0] + index
+                  column = token[0][1]
+
+                  # Blank lines do not count toward common leading whitespace
+                  # calculation and do not need to be dedented.
+                  if dedent_next || index > 0
+                    column = 0
+                  end
+
+                  # If the dedent is 0 and we're not supposed to dedent the next
+                  # line or this line doesn't start with whitespace, then we
+                  # should concatenate the rest of the string to match ripper.
+                  if dedent == 0 && (!dedent_next || !line.start_with?(/\s/))
+                    line = splits[index..].join
+                    index = splits.length
+                  end
+
+                  # If we are supposed to dedent this line or if this is not the
+                  # first line of the string and this line isn't entirely blank,
+                  # then we need to insert an on_ignored_sp token and remove the
+                  # dedent from the beginning of the line.
+                  if (dedent > 0) && (dedent_next || index > 0)
+                    deleting = 0
+                    deleted_chars = []
+
+                    # Gather up all of the characters that we're going to
+                    # delete, stopping when you hit a character that would put
+                    # you over the dedent amount.
+                    line.each_char.with_index do |char, i|
+                      case char
+                      when "\r"
+                        if line.chars[i + 1] == "\n"
+                          break
+                        end
+                      when "\n"
+                        break
+                      when "\t"
+                        deleting = deleting - (deleting % TAB_WIDTH) + TAB_WIDTH
+                      else
+                        deleting += 1
+                      end
+
+                      break if deleting > dedent
+                      deleted_chars << char
+                    end
+
+                    # If we have something to delete, then delete it from the
+                    # string and insert an on_ignored_sp token.
+                    if deleted_chars.any?
+                      ignored = deleted_chars.join
+                      line.delete_prefix!(ignored)
+
+                      results << Token.new([[lineno, 0], :on_ignored_sp, ignored, token[3]])
+                      column = ignored.length
+                    end
+                  end
+
+                  results << Token.new([[lineno, column], token[1], line, token[3]]) unless line.empty?
+                  index += 1
+                end
+              else
+                results << token
+              end
+            else
+              results << token
+            end
+
+            dedent_next =
+              ((token.event == :on_tstring_content) || (token.event == :on_heredoc_end)) &&
+              embexpr_balance == 0
+          end
+
+          results
+        end
+      end
+
+      # Here we will split between the two types of heredocs and return the
+      # object that will store their tokens.
+      def self.build(opening)
+        case opening.value[2]
+        when "~"
+          DedentingHeredoc.new
+        when "-"
+          DashHeredoc.new(opening.value[3] != "'")
+        else
+          PlainHeredoc.new
+        end
+      end
+    end
+
+    attr_reader :source, :filepath
+
+    def initialize(source, filepath = "")
+      @source = source
+      @filepath = filepath || ""
+    end
+
+    def result
+      tokens = []
+
+      state = :default
+      heredoc_stack = [[]]
+
+      result = Prism.lex(source, @filepath)
+      result_value = result.value
+      previous_state = nil
+
+      # In previous versions of Ruby, Ripper wouldn't flush the bom before the
+      # first token, so we had to have a hack in place to account for that. This
+      # checks for that behavior.
+      bom_flushed = Ripper.lex("\xEF\xBB\xBF# test")[0][0][1] == 0
+      bom = source.byteslice(0..2) == "\xEF\xBB\xBF"
+
+      result_value.each_with_index do |(token, lex_state), index|
+        lineno = token.location.start_line
+        column = token.location.start_column
+
+        # If there's a UTF-8 byte-order mark as the start of the file, then for
+        # certain tokens ripper sets the first token back by 3 bytes. It also
+        # keeps the byte order mark in the first token's value. This is weird,
+        # and I don't want to mirror that in our parser. So instead, we'll match
+        # up the columns and values here.
+        if bom && lineno == 1
+          column -= 3
+
+          if index == 0 && column == 0 && !bom_flushed
+            flushed =
+              case token.type
+              when :BACK_REFERENCE, :INSTANCE_VARIABLE, :CLASS_VARIABLE,
+                   :GLOBAL_VARIABLE, :NUMBERED_REFERENCE, :PERCENT_LOWER_I,
+                   :PERCENT_LOWER_X, :PERCENT_LOWER_W, :PERCENT_UPPER_I,
+                   :PERCENT_UPPER_W, :STRING_BEGIN
+                true
+              when :REGEXP_BEGIN, :SYMBOL_BEGIN
+                token.value.start_with?("%")
+              else
+                false
+              end
+
+            unless flushed
+              column -= 3
+              value = token.value
+              value.prepend(String.new("\xEF\xBB\xBF", encoding: value.encoding))
+            end
+          end
+        end
+
+        event = RIPPER.fetch(token.type)
+        value = token.value
+        lex_state = Ripper::Lexer::State.new(lex_state)
+
+        token =
+          case event
+          when :on___end__
+            EndContentToken.new([[lineno, column], event, value, lex_state])
+          when :on_comment
+            IgnoreStateToken.new([[lineno, column], event, value, lex_state])
+          when :on_heredoc_end
+            # Heredoc end tokens can be emitted in an odd order, so we don't
+            # want to bother comparing the state on them.
+            IgnoreStateToken.new([[lineno, column], event, value, lex_state])
+          when :on_ident
+            if lex_state == Ripper::EXPR_END
+              # If we have an identifier that follows a method name like:
+              #
+              #     def foo bar
+              #
+              # then Ripper will mark bar as END|LABEL if there is a local in a
+              # parent scope named bar because it hasn't pushed the local table
+              # yet. We do this more accurately, so we need to allow comparing
+              # against both END and END|LABEL.
+              ParamToken.new([[lineno, column], event, value, lex_state])
+            elsif lex_state == Ripper::EXPR_END | Ripper::EXPR_LABEL
+              # In the event that we're comparing identifiers, we're going to
+              # allow a little divergence. Ripper doesn't account for local
+              # variables introduced through named captures in regexes, and we
+              # do, which accounts for this difference.
+              IdentToken.new([[lineno, column], event, value, lex_state])
+            else
+              Token.new([[lineno, column], event, value, lex_state])
+            end
+          when :on_embexpr_end
+            IgnoreStateToken.new([[lineno, column], event, value, lex_state])
+          when :on_ignored_nl
+            # Ignored newlines can occasionally have a LABEL state attached to
+            # them which doesn't actually impact anything. We don't mirror that
+            # state so we ignored it.
+            IgnoredNewlineToken.new([[lineno, column], event, value, lex_state])
+          when :on_regexp_end
+            # On regex end, Ripper scans and then sets end state, so the ripper
+            # lexed output is begin, when it should be end. prism sets lex state
+            # correctly to end state, but we want to be able to compare against
+            # Ripper's lexed state. So here, if it's a regexp end token, we
+            # output the state as the previous state, solely for the sake of
+            # comparison.
+            previous_token = result_value[index - 1][0]
+            lex_state =
+              if RIPPER.fetch(previous_token.type) == :on_embexpr_end
+                # If the previous token is embexpr_end, then we have to do even
+                # more processing. The end of an embedded expression sets the
+                # state to the state that it had at the beginning of the
+                # embedded expression. So we have to go and find that state and
+                # set it here.
+                counter = 1
+                current_index = index - 1
+
+                until counter == 0
+                  current_index -= 1
+                  current_event = RIPPER.fetch(result_value[current_index][0].type)
+                  counter += { on_embexpr_beg: -1, on_embexpr_end: 1 }[current_event] || 0
+                end
+
+                Ripper::Lexer::State.new(result_value[current_index][1])
+              else
+                previous_state
+              end
+
+            Token.new([[lineno, column], event, value, lex_state])
+          when :on_eof
+            previous_token = result_value[index - 1][0]
+
+            # If we're at the end of the file and the previous token was a
+            # comment and there is still whitespace after the comment, then
+            # Ripper will append a on_nl token (even though there isn't
+            # necessarily a newline). We mirror that here.
+            start_offset = previous_token.location.end_offset
+            end_offset = token.location.start_offset
+
+            if previous_token.type == :COMMENT && start_offset < end_offset
+              if bom
+                start_offset += 3
+                end_offset += 3
+              end
+
+              tokens << Token.new([[lineno, 0], :on_nl, source.byteslice(start_offset...end_offset), lex_state])
+            end
+
+            Token.new([[lineno, column], event, value, lex_state])
+          else
+            Token.new([[lineno, column], event, value, lex_state])
+          end
+
+        previous_state = lex_state
+
+        # The order in which tokens appear in our lexer is different from the
+        # order that they appear in Ripper. When we hit the declaration of a
+        # heredoc in prism, we skip forward and lex the rest of the content of
+        # the heredoc before going back and lexing at the end of the heredoc
+        # identifier.
+        #
+        # To match up to ripper, we keep a small state variable around here to
+        # track whether we're in the middle of a heredoc or not. In this way we
+        # can shuffle around the token to match Ripper's output.
+        case state
+        when :default
+          # The default state is when there are no heredocs at all. In this
+          # state we can append the token to the list of tokens and move on.
+          tokens << token
+
+          # If we get the declaration of a heredoc, then we open a new heredoc
+          # and move into the heredoc_opened state.
+          if event == :on_heredoc_beg
+            state = :heredoc_opened
+            heredoc_stack.last << Heredoc.build(token)
+          end
+        when :heredoc_opened
+          # The heredoc_opened state is when we've seen the declaration of a
+          # heredoc and are now lexing the body of the heredoc. In this state we
+          # push tokens onto the most recently created heredoc.
+          heredoc_stack.last.last << token
+
+          case event
+          when :on_heredoc_beg
+            # If we receive a heredoc declaration while lexing the body of a
+            # heredoc, this means we have nested heredocs. In this case we'll
+            # push a new heredoc onto the stack and stay in the heredoc_opened
+            # state since we're now lexing the body of the new heredoc.
+            heredoc_stack << [Heredoc.build(token)]
+          when :on_heredoc_end
+            # If we receive the end of a heredoc, then we're done lexing the
+            # body of the heredoc. In this case we now have a completed heredoc
+            # but need to wait for the next newline to push it into the token
+            # stream.
+            state = :heredoc_closed
+          end
+        when :heredoc_closed
+          if %i[on_nl on_ignored_nl on_comment].include?(event) || (event == :on_tstring_content && value.end_with?("\n"))
+            if heredoc_stack.size > 1
+              flushing = heredoc_stack.pop
+              heredoc_stack.last.last << token
+
+              flushing.each do |heredoc|
+                heredoc.to_a.each do |flushed_token|
+                  heredoc_stack.last.last << flushed_token
+                end
+              end
+
+              state = :heredoc_opened
+              next
+            end
+          elsif event == :on_heredoc_beg
+            tokens << token
+            state = :heredoc_opened
+            heredoc_stack.last << Heredoc.build(token)
+            next
+          elsif heredoc_stack.size > 1
+            heredoc_stack[-2].last << token
+            next
+          end
+
+          heredoc_stack.last.each do |heredoc|
+            tokens.concat(heredoc.to_a)
+          end
+
+          heredoc_stack.last.clear
+          state = :default
+
+          tokens << token
+        end
+      end
+
+      # Drop the EOF token from the list
+      tokens = tokens[0...-1]
+
+      # We sort by location to compare against Ripper's output
+      tokens.sort_by!(&:location)
+
+      if result_value.size - 1 > tokens.size
+        raise StandardError, "Lost tokens when performing lex_compat"
+      end
+
+      ParseResult.new(tokens, result.comments, result.errors, result.warnings, [])
+    end
+  end
+
+  # This is a class that wraps the Ripper lexer to produce almost exactly the
+  # same tokens.
+  class LexRipper
+    attr_reader :source
+
+    def initialize(source)
+      @source = source
+    end
+
+    def result
+      previous = []
+      results = []
+
+      Ripper.lex(source, raise_errors: true).each do |token|
+        case token[1]
+        when :on_sp
+          # skip
+        when :on_tstring_content
+          if previous[1] == :on_tstring_content && (token[2].start_with?("\#$") || token[2].start_with?("\#@"))
+            previous[2] << token[2]
+          else
+            results << token
+            previous = token
+          end
+        when :on_words_sep
+          if previous[1] == :on_words_sep
+            previous[2] << token[2]
+          else
+            results << token
+            previous = token
+          end
+        else
+          results << token
+          previous = token
+        end
+      end
+
+      results
+    end
+  end
+end
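For context, the sketch below shows one way the compatibility layer in this file can be exercised; it is not part of the packaged files. It lexes the same snippet with Prism::LexCompat and with the Prism::LexRipper wrapper defined above, then compares the Ripper-shaped token arrays. It assumes ParseResult#value holds the token list (as it does elsewhere in the gem) and that the two token streams line up one-to-one, which the == overrides on the Token subclasses are designed to tolerate.

# Example only, not part of prism 0.13.0.
require "ripper"
require "prism"

# A dedenting heredoc with an embedded expression exercises the trickier paths
# (DedentingHeredoc, on_ignored_sp insertion, embexpr balancing).
source = <<~'RUBY'
  foo = <<~HEREDOC
    #{1 + 1}
      indented
  HEREDOC
RUBY

prism_tokens  = Prism::LexCompat.new(source).result.value
ripper_tokens = Prism::LexRipper.new(source).result

prism_tokens.zip(ripper_tokens).each do |ours, theirs|
  # Token#== is overridden per token class to ignore known state differences.
  puts "mismatch: #{ours.inspect} vs #{theirs.inspect}" unless ours == theirs
end

This mirrors how the gem's own lexer tests compare output: per-token equality rather than strict array equality, so the deliberate divergences documented in the comments above (heredoc end states, named-capture locals, regexp end states) do not register as failures.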