prism 0.13.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (95) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +172 -0
  3. data/CODE_OF_CONDUCT.md +76 -0
  4. data/CONTRIBUTING.md +62 -0
  5. data/LICENSE.md +7 -0
  6. data/Makefile +84 -0
  7. data/README.md +89 -0
  8. data/config.yml +2481 -0
  9. data/docs/build_system.md +74 -0
  10. data/docs/building.md +22 -0
  11. data/docs/configuration.md +60 -0
  12. data/docs/design.md +53 -0
  13. data/docs/encoding.md +117 -0
  14. data/docs/fuzzing.md +93 -0
  15. data/docs/heredocs.md +36 -0
  16. data/docs/mapping.md +117 -0
  17. data/docs/ripper.md +36 -0
  18. data/docs/ruby_api.md +25 -0
  19. data/docs/serialization.md +181 -0
  20. data/docs/testing.md +55 -0
  21. data/ext/prism/api_node.c +4725 -0
  22. data/ext/prism/api_pack.c +256 -0
  23. data/ext/prism/extconf.rb +136 -0
  24. data/ext/prism/extension.c +626 -0
  25. data/ext/prism/extension.h +18 -0
  26. data/include/prism/ast.h +1932 -0
  27. data/include/prism/defines.h +45 -0
  28. data/include/prism/diagnostic.h +231 -0
  29. data/include/prism/enc/pm_encoding.h +95 -0
  30. data/include/prism/node.h +41 -0
  31. data/include/prism/pack.h +141 -0
  32. data/include/prism/parser.h +418 -0
  33. data/include/prism/regexp.h +19 -0
  34. data/include/prism/unescape.h +48 -0
  35. data/include/prism/util/pm_buffer.h +51 -0
  36. data/include/prism/util/pm_char.h +91 -0
  37. data/include/prism/util/pm_constant_pool.h +78 -0
  38. data/include/prism/util/pm_list.h +67 -0
  39. data/include/prism/util/pm_memchr.h +14 -0
  40. data/include/prism/util/pm_newline_list.h +61 -0
  41. data/include/prism/util/pm_state_stack.h +24 -0
  42. data/include/prism/util/pm_string.h +61 -0
  43. data/include/prism/util/pm_string_list.h +25 -0
  44. data/include/prism/util/pm_strpbrk.h +29 -0
  45. data/include/prism/version.h +4 -0
  46. data/include/prism.h +82 -0
  47. data/lib/prism/compiler.rb +465 -0
  48. data/lib/prism/debug.rb +157 -0
  49. data/lib/prism/desugar_compiler.rb +206 -0
  50. data/lib/prism/dispatcher.rb +2051 -0
  51. data/lib/prism/dsl.rb +750 -0
  52. data/lib/prism/ffi.rb +251 -0
  53. data/lib/prism/lex_compat.rb +838 -0
  54. data/lib/prism/mutation_compiler.rb +718 -0
  55. data/lib/prism/node.rb +14540 -0
  56. data/lib/prism/node_ext.rb +55 -0
  57. data/lib/prism/node_inspector.rb +68 -0
  58. data/lib/prism/pack.rb +185 -0
  59. data/lib/prism/parse_result/comments.rb +172 -0
  60. data/lib/prism/parse_result/newlines.rb +60 -0
  61. data/lib/prism/parse_result.rb +266 -0
  62. data/lib/prism/pattern.rb +239 -0
  63. data/lib/prism/ripper_compat.rb +174 -0
  64. data/lib/prism/serialize.rb +662 -0
  65. data/lib/prism/visitor.rb +470 -0
  66. data/lib/prism.rb +64 -0
  67. data/prism.gemspec +113 -0
  68. data/src/diagnostic.c +287 -0
  69. data/src/enc/pm_big5.c +52 -0
  70. data/src/enc/pm_euc_jp.c +58 -0
  71. data/src/enc/pm_gbk.c +61 -0
  72. data/src/enc/pm_shift_jis.c +56 -0
  73. data/src/enc/pm_tables.c +507 -0
  74. data/src/enc/pm_unicode.c +2324 -0
  75. data/src/enc/pm_windows_31j.c +56 -0
  76. data/src/node.c +2633 -0
  77. data/src/pack.c +493 -0
  78. data/src/prettyprint.c +2136 -0
  79. data/src/prism.c +14587 -0
  80. data/src/regexp.c +580 -0
  81. data/src/serialize.c +1899 -0
  82. data/src/token_type.c +349 -0
  83. data/src/unescape.c +637 -0
  84. data/src/util/pm_buffer.c +103 -0
  85. data/src/util/pm_char.c +272 -0
  86. data/src/util/pm_constant_pool.c +252 -0
  87. data/src/util/pm_list.c +41 -0
  88. data/src/util/pm_memchr.c +33 -0
  89. data/src/util/pm_newline_list.c +134 -0
  90. data/src/util/pm_state_stack.c +19 -0
  91. data/src/util/pm_string.c +200 -0
  92. data/src/util/pm_string_list.c +29 -0
  93. data/src/util/pm_strncasecmp.c +17 -0
  94. data/src/util/pm_strpbrk.c +66 -0
  95. metadata +138 -0
# frozen_string_literal: true

require "delegate"

module Prism
  # This class is responsible for lexing the source using prism and then
  # converting those tokens to be compatible with Ripper. In the vast majority
  # of cases, this is a one-to-one mapping of the token type. Everything else
  # generally lines up. However, there are a few cases that require special
  # handling.
  class LexCompat
    # This is a mapping of prism token types to Ripper token types. This is a
    # many-to-one mapping because we split up our token types, whereas Ripper
    # tends to group them (e.g. most operators collapse to :on_op and all
    # keywords collapse to :on_kw).
    RIPPER = {
      AMPERSAND: :on_op,
      AMPERSAND_AMPERSAND: :on_op,
      AMPERSAND_AMPERSAND_EQUAL: :on_op,
      AMPERSAND_DOT: :on_op,
      AMPERSAND_EQUAL: :on_op,
      BACK_REFERENCE: :on_backref,
      BACKTICK: :on_backtick,
      BANG: :on_op,
      BANG_EQUAL: :on_op,
      BANG_TILDE: :on_op,
      BRACE_LEFT: :on_lbrace,
      BRACE_RIGHT: :on_rbrace,
      BRACKET_LEFT: :on_lbracket,
      BRACKET_LEFT_ARRAY: :on_lbracket,
      BRACKET_LEFT_RIGHT: :on_op,
      BRACKET_LEFT_RIGHT_EQUAL: :on_op,
      BRACKET_RIGHT: :on_rbracket,
      CARET: :on_op,
      CARET_EQUAL: :on_op,
      CHARACTER_LITERAL: :on_CHAR,
      CLASS_VARIABLE: :on_cvar,
      COLON: :on_op,
      COLON_COLON: :on_op,
      COMMA: :on_comma,
      COMMENT: :on_comment,
      CONSTANT: :on_const,
      DOT: :on_period,
      DOT_DOT: :on_op,
      DOT_DOT_DOT: :on_op,
      EMBDOC_BEGIN: :on_embdoc_beg,
      EMBDOC_END: :on_embdoc_end,
      EMBDOC_LINE: :on_embdoc,
      EMBEXPR_BEGIN: :on_embexpr_beg,
      EMBEXPR_END: :on_embexpr_end,
      EMBVAR: :on_embvar,
      EOF: :on_eof,
      EQUAL: :on_op,
      EQUAL_EQUAL: :on_op,
      EQUAL_EQUAL_EQUAL: :on_op,
      EQUAL_GREATER: :on_op,
      EQUAL_TILDE: :on_op,
      FLOAT: :on_float,
      FLOAT_IMAGINARY: :on_imaginary,
      FLOAT_RATIONAL: :on_rational,
      FLOAT_RATIONAL_IMAGINARY: :on_imaginary,
      GREATER: :on_op,
      GREATER_EQUAL: :on_op,
      GREATER_GREATER: :on_op,
      GREATER_GREATER_EQUAL: :on_op,
      GLOBAL_VARIABLE: :on_gvar,
      HEREDOC_END: :on_heredoc_end,
      HEREDOC_START: :on_heredoc_beg,
      IDENTIFIER: :on_ident,
      IGNORED_NEWLINE: :on_ignored_nl,
      INTEGER: :on_int,
      INTEGER_IMAGINARY: :on_imaginary,
      INTEGER_RATIONAL: :on_rational,
      INTEGER_RATIONAL_IMAGINARY: :on_imaginary,
      INSTANCE_VARIABLE: :on_ivar,
      INVALID: :INVALID,
      KEYWORD___ENCODING__: :on_kw,
      KEYWORD___LINE__: :on_kw,
      KEYWORD___FILE__: :on_kw,
      KEYWORD_ALIAS: :on_kw,
      KEYWORD_AND: :on_kw,
      KEYWORD_BEGIN: :on_kw,
      KEYWORD_BEGIN_UPCASE: :on_kw,
      KEYWORD_BREAK: :on_kw,
      KEYWORD_CASE: :on_kw,
      KEYWORD_CLASS: :on_kw,
      KEYWORD_DEF: :on_kw,
      KEYWORD_DEFINED: :on_kw,
      KEYWORD_DO: :on_kw,
      KEYWORD_DO_LOOP: :on_kw,
      KEYWORD_ELSE: :on_kw,
      KEYWORD_ELSIF: :on_kw,
      KEYWORD_END: :on_kw,
      KEYWORD_END_UPCASE: :on_kw,
      KEYWORD_ENSURE: :on_kw,
      KEYWORD_FALSE: :on_kw,
      KEYWORD_FOR: :on_kw,
      KEYWORD_IF: :on_kw,
      KEYWORD_IF_MODIFIER: :on_kw,
      KEYWORD_IN: :on_kw,
      KEYWORD_MODULE: :on_kw,
      KEYWORD_NEXT: :on_kw,
      KEYWORD_NIL: :on_kw,
      KEYWORD_NOT: :on_kw,
      KEYWORD_OR: :on_kw,
      KEYWORD_REDO: :on_kw,
      KEYWORD_RESCUE: :on_kw,
      KEYWORD_RESCUE_MODIFIER: :on_kw,
      KEYWORD_RETRY: :on_kw,
      KEYWORD_RETURN: :on_kw,
      KEYWORD_SELF: :on_kw,
      KEYWORD_SUPER: :on_kw,
      KEYWORD_THEN: :on_kw,
      KEYWORD_TRUE: :on_kw,
      KEYWORD_UNDEF: :on_kw,
      KEYWORD_UNLESS: :on_kw,
      KEYWORD_UNLESS_MODIFIER: :on_kw,
      KEYWORD_UNTIL: :on_kw,
      KEYWORD_UNTIL_MODIFIER: :on_kw,
      KEYWORD_WHEN: :on_kw,
      KEYWORD_WHILE: :on_kw,
      KEYWORD_WHILE_MODIFIER: :on_kw,
      KEYWORD_YIELD: :on_kw,
      LABEL: :on_label,
      LABEL_END: :on_label_end,
      LAMBDA_BEGIN: :on_tlambeg,
      LESS: :on_op,
      LESS_EQUAL: :on_op,
      LESS_EQUAL_GREATER: :on_op,
      LESS_LESS: :on_op,
      LESS_LESS_EQUAL: :on_op,
      METHOD_NAME: :on_ident,
      MINUS: :on_op,
      MINUS_EQUAL: :on_op,
      MINUS_GREATER: :on_tlambda,
      NEWLINE: :on_nl,
      NUMBERED_REFERENCE: :on_backref,
      PARENTHESIS_LEFT: :on_lparen,
      PARENTHESIS_LEFT_PARENTHESES: :on_lparen,
      PARENTHESIS_RIGHT: :on_rparen,
      PERCENT: :on_op,
      PERCENT_EQUAL: :on_op,
      PERCENT_LOWER_I: :on_qsymbols_beg,
      PERCENT_LOWER_W: :on_qwords_beg,
      PERCENT_LOWER_X: :on_backtick,
      PERCENT_UPPER_I: :on_symbols_beg,
      PERCENT_UPPER_W: :on_words_beg,
      PIPE: :on_op,
      PIPE_EQUAL: :on_op,
      PIPE_PIPE: :on_op,
      PIPE_PIPE_EQUAL: :on_op,
      PLUS: :on_op,
      PLUS_EQUAL: :on_op,
      QUESTION_MARK: :on_op,
      RATIONAL_FLOAT: :on_rational,
      RATIONAL_INTEGER: :on_rational,
      REGEXP_BEGIN: :on_regexp_beg,
      REGEXP_END: :on_regexp_end,
      SEMICOLON: :on_semicolon,
      SLASH: :on_op,
      SLASH_EQUAL: :on_op,
      STAR: :on_op,
      STAR_EQUAL: :on_op,
      STAR_STAR: :on_op,
      STAR_STAR_EQUAL: :on_op,
      STRING_BEGIN: :on_tstring_beg,
      STRING_CONTENT: :on_tstring_content,
      STRING_END: :on_tstring_end,
      SYMBOL_BEGIN: :on_symbeg,
      TILDE: :on_op,
      UAMPERSAND: :on_op,
      UCOLON_COLON: :on_op,
      UDOT_DOT: :on_op,
      UDOT_DOT_DOT: :on_op,
      UMINUS: :on_op,
      UMINUS_NUM: :on_op,
      UPLUS: :on_op,
      USTAR: :on_op,
      USTAR_STAR: :on_op,
      WORDS_SEP: :on_words_sep,
      "__END__": :on___end__
    }.freeze
182
+
183
+ # When we produce tokens, we produce the same arrays that Ripper does.
184
+ # However, we add a couple of convenience methods onto them to make them a
185
+ # little easier to work with. We delegate all other methods to the array.
186
+ class Token < SimpleDelegator
187
+ def location
188
+ self[0]
189
+ end
190
+
191
+ def event
192
+ self[1]
193
+ end
194
+
195
+ def value
196
+ self[2]
197
+ end
198
+
199
+ def state
200
+ self[3]
201
+ end
202
+ end
203
+
204
+ # Ripper doesn't include the rest of the token in the event, so we need to
205
+ # trim it down to just the content on the first line when comparing.
206
+ class EndContentToken < Token
207
+ def ==(other)
208
+ [self[0], self[1], self[2][0..self[2].index("\n")], self[3]] == other
209
+ end
210
+ end
211
+
212
+ # Tokens where state should be ignored
213
+ # used for :on_comment, :on_heredoc_end, :on_embexpr_end
214
+ class IgnoreStateToken < Token
215
+ def ==(other)
216
+ self[0...-1] == other[0...-1]
217
+ end
218
+ end
219
+
220
    # Ident tokens for the most part are exactly the same, except sometimes we
    # know an ident is a local when ripper doesn't (when they are introduced
    # through named captures in regular expressions). In that case we don't
    # compare the state.
    class IdentToken < Token
      # Everything but the state must match; the other token's state must then
      # either be exactly LABEL|END or include one of the ARG states. Note that
      # `|` binds tighter than `==` in Ruby, so the first comparison is against
      # the combined LABEL|END bitmask.
      def ==(other)
        (self[0...-1] == other[0...-1]) && (
          (other[3] == Ripper::EXPR_LABEL | Ripper::EXPR_END) ||
          (other[3] & Ripper::EXPR_ARG_ANY != 0)
        )
      end
    end
232
+
233
    # Ignored newlines can occasionally have a LABEL state attached to them, so
    # we compare the state differently here.
    class IgnoredNewlineToken < Token
      def ==(other)
        return false unless self[0...-1] == other[0...-1]

        # NOTE(review): tokens in this file are built as 4-element arrays
        # ([location, event, value, state]), so self[4]/other[4] appear to be
        # nil here and the else branch would compare nil == nil. If the state
        # (index 3) was intended, these should read self[3]/other[3] — confirm
        # against upstream before changing.
        if self[4] == Ripper::EXPR_ARG | Ripper::EXPR_LABELED
          other[4] & Ripper::EXPR_ARG | Ripper::EXPR_LABELED > 0
        else
          self[4] == other[4]
        end
      end
    end
246
+
247
+ # If we have an identifier that follows a method name like:
248
+ #
249
+ # def foo bar
250
+ #
251
+ # then Ripper will mark bar as END|LABEL if there is a local in a parent
252
+ # scope named bar because it hasn't pushed the local table yet. We do this
253
+ # more accurately, so we need to allow comparing against both END and
254
+ # END|LABEL.
255
+ class ParamToken < Token
256
+ def ==(other)
257
+ (self[0...-1] == other[0...-1]) && (
258
+ (other[3] == Ripper::EXPR_END) ||
259
+ (other[3] == Ripper::EXPR_END | Ripper::EXPR_LABEL)
260
+ )
261
+ end
262
+ end
263
+
264
+ # A heredoc in this case is a list of tokens that belong to the body of the
265
+ # heredoc that should be appended onto the list of tokens when the heredoc
266
+ # closes.
267
+ module Heredoc
268
+ # Heredocs that are no dash or tilde heredocs are just a list of tokens.
269
+ # We need to keep them around so that we can insert them in the correct
270
+ # order back into the token stream and set the state of the last token to
271
+ # the state that the heredoc was opened in.
272
+ class PlainHeredoc
273
+ attr_reader :tokens
274
+
275
+ def initialize
276
+ @tokens = []
277
+ end
278
+
279
+ def <<(token)
280
+ tokens << token
281
+ end
282
+
283
+ def to_a
284
+ tokens
285
+ end
286
+ end
287
+
288
      # Dash heredocs are a little more complicated. They are a list of tokens
      # that need to be split on "\\\n" to mimic Ripper's behavior. We also need
      # to keep track of the state that the heredoc was opened in.
      class DashHeredoc
        # split - whether string content should be split on escaped newlines
        #         (false for single-quoted heredocs, where no escapes apply)
        # tokens - the tokens collected for the body of this heredoc
        attr_reader :split, :tokens

        def initialize(split)
          @split = split
          @tokens = []
        end

        # Append a token to the heredoc body.
        def <<(token)
          tokens << token
        end

        # Return the tokens, with plain string content (content outside of any
        # #{} interpolation) re-split the way Ripper splits it.
        def to_a
          # Tracks nesting of #{} so we only touch top-level string content.
          embexpr_balance = 0

          tokens.each_with_object([]) do |token, results|
            case token.event
            when :on_embexpr_beg
              embexpr_balance += 1
              results << token
            when :on_embexpr_end
              embexpr_balance -= 1
              results << token
            when :on_tstring_content
              if embexpr_balance == 0
                lineno = token[0][0]
                column = token[0][1]

                if split
                  # Split on "\\\n" to mimic Ripper's behavior. Use a lookbehind
                  # to keep the delimiter in the result. The [^\\] prefix avoids
                  # splitting on an escaped backslash followed by a newline.
                  token.value.split(/(?<=[^\\]\\\n)|(?<=[^\\]\\\r\n)/).each_with_index do |value, index|
                    # Only the first piece keeps the original column; later
                    # pieces start at the beginning of their line.
                    column = 0 if index > 0
                    results << Token.new([[lineno, column], :on_tstring_content, value, token.state])
                    lineno += value.count("\n")
                  end
                else
                  results << token
                end
              else
                results << token
              end
            else
              results << token
            end
          end
        end
      end
339
+
340
      # Heredocs that are dedenting heredocs are a little more complicated.
      # Ripper outputs on_ignored_sp tokens for the whitespace that is being
      # removed from the output. prism only modifies the node itself and keeps
      # the token the same. This simplifies prism, but makes comparing against
      # Ripper much harder because there is a length mismatch.
      #
      # Fortunately, we already have to pull out the heredoc tokens in order to
      # insert them into the stream in the correct order. As such, we can do
      # some extra manipulation on the tokens to make them match Ripper's
      # output by mirroring the dedent logic that Ripper uses.
      class DedentingHeredoc
        # Number of columns a tab advances to when computing leading whitespace.
        TAB_WIDTH = 8

        # tokens - collected body tokens
        # dedent_next - whether the next string content token starts a line
        # dedent - minimum common leading whitespace seen so far (nil until the
        #          first measurable line)
        # embexpr_balance - nesting depth of #{} / nested heredocs
        attr_reader :tokens, :dedent_next, :dedent, :embexpr_balance

        def initialize
          @tokens = []
          @dedent_next = true
          @dedent = nil
          @embexpr_balance = 0
        end

        # As tokens are coming in, we track the minimum amount of common leading
        # whitespace on plain string content tokens. This allows us to later
        # remove that amount of whitespace from the beginning of each line.
        def <<(token)
          case token.event
          when :on_embexpr_beg, :on_heredoc_beg
            @embexpr_balance += 1
          when :on_embexpr_end, :on_heredoc_end
            @embexpr_balance -= 1
          when :on_tstring_content
            if embexpr_balance == 0
              # Split into physical lines (keeping the trailing newline) and
              # measure the leading whitespace of each line that counts.
              token.value.split(/(?<=\n)/).each_with_index do |line, index|
                # Entirely-blank lines do not participate in the calculation.
                next if line.strip.empty? && line.end_with?("\n")
                # Only lines that start fresh (first after a newline) count.
                next if !(dedent_next || index > 0)

                leading = line[/\A(\s*)\n?/, 1]
                next_dedent = 0

                # Tabs advance to the next TAB_WIDTH column boundary.
                leading.each_char do |char|
                  if char == "\t"
                    next_dedent = next_dedent - (next_dedent % TAB_WIDTH) + TAB_WIDTH
                  else
                    next_dedent += 1
                  end
                end

                @dedent = [dedent, next_dedent].compact.min
              end
            end
          end

          # The next token starts a fresh line only if this one was top-level
          # string content (which always ends at a line boundary here).
          @dedent_next = token.event == :on_tstring_content && embexpr_balance == 0
          tokens << token
        end

        def to_a
          # If every line in the heredoc is blank, we still need to split up the
          # string content token into multiple tokens.
          if dedent.nil?
            results = []
            embexpr_balance = 0

            tokens.each do |token|
              case token.event
              when :on_embexpr_beg, :on_heredoc_beg
                embexpr_balance += 1
                results << token
              when :on_embexpr_end, :on_heredoc_end
                embexpr_balance -= 1
                results << token
              when :on_tstring_content
                if embexpr_balance == 0
                  lineno = token[0][0]
                  column = token[0][1]

                  # One token per physical line, matching Ripper's output.
                  token.value.split(/(?<=\n)/).each_with_index do |value, index|
                    column = 0 if index > 0
                    results << Token.new([[lineno, column], :on_tstring_content, value, token.state])
                    lineno += 1
                  end
                else
                  results << token
                end
              else
                results << token
              end
            end

            return results
          end

          # Otherwise, we're going to run through each token in the list and
          # insert on_ignored_sp tokens for the amount of dedent that we need to
          # perform. We also need to remove the dedent from the beginning of
          # each line of plain string content tokens.
          results = []
          dedent_next = true
          embexpr_balance = 0

          tokens.each do |token|
            # Notice that the structure of this conditional largely matches the
            # whitespace calculation we performed above. This is because
            # checking if the subsequent token needs to be dedented is common to
            # both the dedent calculation and the ignored_sp insertion.
            case token.event
            when :on_embexpr_beg
              embexpr_balance += 1
              results << token
            when :on_embexpr_end
              embexpr_balance -= 1
              results << token
            when :on_tstring_content
              if embexpr_balance == 0
                # Here we're going to split the string on newlines, but maintain
                # the newlines in the resulting array. We'll do that with a look
                # behind assertion.
                splits = token.value.split(/(?<=\n)/)
                index = 0

                while index < splits.length
                  line = splits[index]
                  lineno = token[0][0] + index
                  column = token[0][1]

                  # Blank lines do not count toward common leading whitespace
                  # calculation and do not need to be dedented.
                  if dedent_next || index > 0
                    column = 0
                  end

                  # If the dedent is 0 and we're not supposed to dedent the next
                  # line or this line doesn't start with whitespace, then we
                  # should concatenate the rest of the string to match ripper.
                  if dedent == 0 && (!dedent_next || !line.start_with?(/\s/))
                    line = splits[index..].join
                    index = splits.length
                  end

                  # If we are supposed to dedent this line or if this is not the
                  # first line of the string and this line isn't entirely blank,
                  # then we need to insert an on_ignored_sp token and remove the
                  # dedent from the beginning of the line.
                  if (dedent > 0) && (dedent_next || index > 0)
                    deleting = 0
                    deleted_chars = []

                    # Gather up all of the characters that we're going to
                    # delete, stopping when you hit a character that would put
                    # you over the dedent amount.
                    line.each_char.with_index do |char, i|
                      case char
                      when "\r"
                        # A "\r\n" pair ends the measurable whitespace.
                        if line.chars[i + 1] == "\n"
                          break
                        end
                      when "\n"
                        break
                      when "\t"
                        deleting = deleting - (deleting % TAB_WIDTH) + TAB_WIDTH
                      else
                        deleting += 1
                      end

                      break if deleting > dedent
                      deleted_chars << char
                    end

                    # If we have something to delete, then delete it from the
                    # string and insert an on_ignored_sp token.
                    if deleted_chars.any?
                      ignored = deleted_chars.join
                      line.delete_prefix!(ignored)

                      results << Token.new([[lineno, 0], :on_ignored_sp, ignored, token[3]])
                      column = ignored.length
                    end
                  end

                  results << Token.new([[lineno, column], token[1], line, token[3]]) unless line.empty?
                  index += 1
                end
              else
                results << token
              end
            else
              results << token
            end

            dedent_next =
              ((token.event == :on_tstring_content) || (token.event == :on_heredoc_end)) &&
              embexpr_balance == 0
          end

          results
        end
      end
538
+
539
+ # Here we will split between the two types of heredocs and return the
540
+ # object that will store their tokens.
541
+ def self.build(opening)
542
+ case opening.value[2]
543
+ when "~"
544
+ DedentingHeredoc.new
545
+ when "-"
546
+ DashHeredoc.new(opening.value[3] != "'")
547
+ else
548
+ PlainHeredoc.new
549
+ end
550
+ end
551
+ end
552
+
553
    # source - the Ruby source code being lexed
    # filepath - the path reported to Prism.lex for this source
    attr_reader :source, :filepath

    # filepath defaults to "" and an explicit nil is normalized to "" as well.
    def initialize(source, filepath = "")
      @source = source
      @filepath = filepath || ""
    end
559
+
560
    # Lex the source with prism and return a ParseResult whose value is a list
    # of Ripper-compatible token arrays, reordered and adjusted so that they
    # can be compared directly against Ripper.lex output.
    def result
      tokens = []

      # State machine for shuffling heredoc bodies back into Ripper's order:
      # :default, :heredoc_opened, or :heredoc_closed.
      state = :default
      heredoc_stack = [[]]

      result = Prism.lex(source, @filepath)
      result_value = result.value
      previous_state = nil

      # In previous versions of Ruby, Ripper wouldn't flush the bom before the
      # first token, so we had to have a hack in place to account for that. This
      # checks for that behavior.
      bom_flushed = Ripper.lex("\xEF\xBB\xBF# test")[0][0][1] == 0
      bom = source.byteslice(0..2) == "\xEF\xBB\xBF"

      result_value.each_with_index do |(token, lex_state), index|
        lineno = token.location.start_line
        column = token.location.start_column

        # If there's a UTF-8 byte-order mark as the start of the file, then for
        # certain tokens ripper sets the first token back by 3 bytes. It also
        # keeps the byte order mark in the first token's value. This is weird,
        # and I don't want to mirror that in our parser. So instead, we'll match
        # up the columns and values here.
        if bom && lineno == 1
          column -= 3

          if index == 0 && column == 0 && !bom_flushed
            flushed =
              case token.type
              when :BACK_REFERENCE, :INSTANCE_VARIABLE, :CLASS_VARIABLE,
                   :GLOBAL_VARIABLE, :NUMBERED_REFERENCE, :PERCENT_LOWER_I,
                   :PERCENT_LOWER_X, :PERCENT_LOWER_W, :PERCENT_UPPER_I,
                   :PERCENT_UPPER_W, :STRING_BEGIN
                true
              when :REGEXP_BEGIN, :SYMBOL_BEGIN
                token.value.start_with?("%")
              else
                false
              end

            unless flushed
              column -= 3
              value = token.value
              # Re-attach the BOM bytes to the first token's value to match
              # Ripper's unflushed behavior.
              value.prepend(String.new("\xEF\xBB\xBF", encoding: value.encoding))
            end
          end
        end

        event = RIPPER.fetch(token.type)
        value = token.value
        lex_state = Ripper::Lexer::State.new(lex_state)

        token =
          case event
          when :on___end__
            EndContentToken.new([[lineno, column], event, value, lex_state])
          when :on_comment
            IgnoreStateToken.new([[lineno, column], event, value, lex_state])
          when :on_heredoc_end
            # Heredoc end tokens can be emitted in an odd order, so we don't
            # want to bother comparing the state on them.
            IgnoreStateToken.new([[lineno, column], event, value, lex_state])
          when :on_ident
            if lex_state == Ripper::EXPR_END
              # If we have an identifier that follows a method name like:
              #
              #     def foo bar
              #
              # then Ripper will mark bar as END|LABEL if there is a local in a
              # parent scope named bar because it hasn't pushed the local table
              # yet. We do this more accurately, so we need to allow comparing
              # against both END and END|LABEL.
              ParamToken.new([[lineno, column], event, value, lex_state])
            elsif lex_state == Ripper::EXPR_END | Ripper::EXPR_LABEL
              # In the event that we're comparing identifiers, we're going to
              # allow a little divergence. Ripper doesn't account for local
              # variables introduced through named captures in regexes, and we
              # do, which accounts for this difference.
              IdentToken.new([[lineno, column], event, value, lex_state])
            else
              Token.new([[lineno, column], event, value, lex_state])
            end
          when :on_embexpr_end
            IgnoreStateToken.new([[lineno, column], event, value, lex_state])
          when :on_ignored_nl
            # Ignored newlines can occasionally have a LABEL state attached to
            # them which doesn't actually impact anything. We don't mirror that
            # state so we ignored it.
            IgnoredNewlineToken.new([[lineno, column], event, value, lex_state])
          when :on_regexp_end
            # On regex end, Ripper scans and then sets end state, so the ripper
            # lexed output is begin, when it should be end. prism sets lex state
            # correctly to end state, but we want to be able to compare against
            # Ripper's lexed state. So here, if it's a regexp end token, we
            # output the state as the previous state, solely for the sake of
            # comparison.
            previous_token = result_value[index - 1][0]
            lex_state =
              if RIPPER.fetch(previous_token.type) == :on_embexpr_end
                # If the previous token is embexpr_end, then we have to do even
                # more processing. The end of an embedded expression sets the
                # state to the state that it had at the beginning of the
                # embedded expression. So we have to go and find that state and
                # set it here.
                counter = 1
                current_index = index - 1

                # Walk backwards to the matching embexpr_beg, balancing nested
                # embedded expressions along the way.
                until counter == 0
                  current_index -= 1
                  current_event = RIPPER.fetch(result_value[current_index][0].type)
                  counter += { on_embexpr_beg: -1, on_embexpr_end: 1 }[current_event] || 0
                end

                Ripper::Lexer::State.new(result_value[current_index][1])
              else
                previous_state
              end

            Token.new([[lineno, column], event, value, lex_state])
          when :on_eof
            previous_token = result_value[index - 1][0]

            # If we're at the end of the file and the previous token was a
            # comment and there is still whitespace after the comment, then
            # Ripper will append a on_nl token (even though there isn't
            # necessarily a newline). We mirror that here.
            start_offset = previous_token.location.end_offset
            end_offset = token.location.start_offset

            if previous_token.type == :COMMENT && start_offset < end_offset
              if bom
                start_offset += 3
                end_offset += 3
              end

              tokens << Token.new([[lineno, 0], :on_nl, source.byteslice(start_offset...end_offset), lex_state])
            end

            Token.new([[lineno, column], event, value, lex_state])
          else
            Token.new([[lineno, column], event, value, lex_state])
          end

        previous_state = lex_state

        # The order in which tokens appear in our lexer is different from the
        # order that they appear in Ripper. When we hit the declaration of a
        # heredoc in prism, we skip forward and lex the rest of the content of
        # the heredoc before going back and lexing at the end of the heredoc
        # identifier.
        #
        # To match up to ripper, we keep a small state variable around here to
        # track whether we're in the middle of a heredoc or not. In this way we
        # can shuffle around the token to match Ripper's output.
        case state
        when :default
          # The default state is when there are no heredocs at all. In this
          # state we can append the token to the list of tokens and move on.
          tokens << token

          # If we get the declaration of a heredoc, then we open a new heredoc
          # and move into the heredoc_opened state.
          if event == :on_heredoc_beg
            state = :heredoc_opened
            heredoc_stack.last << Heredoc.build(token)
          end
        when :heredoc_opened
          # The heredoc_opened state is when we've seen the declaration of a
          # heredoc and are now lexing the body of the heredoc. In this state we
          # push tokens onto the most recently created heredoc.
          heredoc_stack.last.last << token

          case event
          when :on_heredoc_beg
            # If we receive a heredoc declaration while lexing the body of a
            # heredoc, this means we have nested heredocs. In this case we'll
            # push a new heredoc onto the stack and stay in the heredoc_opened
            # state since we're now lexing the body of the new heredoc.
            heredoc_stack << [Heredoc.build(token)]
          when :on_heredoc_end
            # If we receive the end of a heredoc, then we're done lexing the
            # body of the heredoc. In this case we now have a completed heredoc
            # but need to wait for the next newline to push it into the token
            # stream.
            state = :heredoc_closed
          end
        when :heredoc_closed
          if %i[on_nl on_ignored_nl on_comment].include?(event) || (event == :on_tstring_content && value.end_with?("\n"))
            if heredoc_stack.size > 1
              # Flush a nested heredoc level back into its parent before
              # returning to lexing that parent's body.
              flushing = heredoc_stack.pop
              heredoc_stack.last.last << token

              flushing.each do |heredoc|
                heredoc.to_a.each do |flushed_token|
                  heredoc_stack.last.last << flushed_token
                end
              end

              state = :heredoc_opened
              next
            end
          elsif event == :on_heredoc_beg
            # Another heredoc opens immediately after the previous one closed;
            # emit the declaration and start collecting its body.
            tokens << token
            state = :heredoc_opened
            heredoc_stack.last << Heredoc.build(token)
            next
          elsif heredoc_stack.size > 1
            # Still inside an outer heredoc; route the token to it.
            heredoc_stack[-2].last << token
            next
          end

          # Flush the completed heredoc bodies into the output stream and
          # return to the default state.
          heredoc_stack.last.each do |heredoc|
            tokens.concat(heredoc.to_a)
          end

          heredoc_stack.last.clear
          state = :default

          tokens << token
        end
      end

      # Drop the EOF token from the list
      tokens = tokens[0...-1]

      # We sort by location to compare against Ripper's output
      tokens.sort_by!(&:location)

      # Sanity check: shuffling heredocs around must never lose tokens.
      if result_value.size - 1 > tokens.size
        raise StandardError, "Lost tokens when performing lex_compat"
      end

      ParseResult.new(tokens, result.comments, result.errors, result.warnings, [])
    end
796
+ end
797
+
798
+ # This is a class that wraps the Ripper lexer to produce almost exactly the
799
+ # same tokens.
800
+ class LexRipper
801
+ attr_reader :source
802
+
803
+ def initialize(source)
804
+ @source = source
805
+ end
806
+
807
+ def result
808
+ previous = []
809
+ results = []
810
+
811
+ Ripper.lex(source, raise_errors: true).each do |token|
812
+ case token[1]
813
+ when :on_sp
814
+ # skip
815
+ when :on_tstring_content
816
+ if previous[1] == :on_tstring_content && (token[2].start_with?("\#$") || token[2].start_with?("\#@"))
817
+ previous[2] << token[2]
818
+ else
819
+ results << token
820
+ previous = token
821
+ end
822
+ when :on_words_sep
823
+ if previous[1] == :on_words_sep
824
+ previous[2] << token[2]
825
+ else
826
+ results << token
827
+ previous = token
828
+ end
829
+ else
830
+ results << token
831
+ previous = token
832
+ end
833
+ end
834
+
835
+ results
836
+ end
837
+ end
838
+ end