yarp 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. checksums.yaml +7 -0
  2. data/CODE_OF_CONDUCT.md +76 -0
  3. data/CONTRIBUTING.md +51 -0
  4. data/LICENSE.md +7 -0
  5. data/Makefile.in +79 -0
  6. data/README.md +86 -0
  7. data/config.h.in +25 -0
  8. data/config.yml +2147 -0
  9. data/configure +4487 -0
  10. data/docs/build_system.md +85 -0
  11. data/docs/building.md +26 -0
  12. data/docs/configuration.md +56 -0
  13. data/docs/design.md +53 -0
  14. data/docs/encoding.md +116 -0
  15. data/docs/extension.md +20 -0
  16. data/docs/fuzzing.md +93 -0
  17. data/docs/heredocs.md +36 -0
  18. data/docs/mapping.md +117 -0
  19. data/docs/ripper.md +36 -0
  20. data/docs/serialization.md +130 -0
  21. data/docs/testing.md +55 -0
  22. data/ext/yarp/api_node.c +3680 -0
  23. data/ext/yarp/api_pack.c +256 -0
  24. data/ext/yarp/extconf.rb +131 -0
  25. data/ext/yarp/extension.c +547 -0
  26. data/ext/yarp/extension.h +18 -0
  27. data/include/yarp/ast.h +1412 -0
  28. data/include/yarp/defines.h +54 -0
  29. data/include/yarp/diagnostic.h +24 -0
  30. data/include/yarp/enc/yp_encoding.h +94 -0
  31. data/include/yarp/node.h +36 -0
  32. data/include/yarp/pack.h +141 -0
  33. data/include/yarp/parser.h +389 -0
  34. data/include/yarp/regexp.h +19 -0
  35. data/include/yarp/unescape.h +42 -0
  36. data/include/yarp/util/yp_buffer.h +39 -0
  37. data/include/yarp/util/yp_char.h +75 -0
  38. data/include/yarp/util/yp_constant_pool.h +64 -0
  39. data/include/yarp/util/yp_list.h +67 -0
  40. data/include/yarp/util/yp_memchr.h +14 -0
  41. data/include/yarp/util/yp_newline_list.h +54 -0
  42. data/include/yarp/util/yp_state_stack.h +24 -0
  43. data/include/yarp/util/yp_string.h +57 -0
  44. data/include/yarp/util/yp_string_list.h +28 -0
  45. data/include/yarp/util/yp_strpbrk.h +29 -0
  46. data/include/yarp/version.h +5 -0
  47. data/include/yarp.h +69 -0
  48. data/lib/yarp/lex_compat.rb +759 -0
  49. data/lib/yarp/node.rb +7428 -0
  50. data/lib/yarp/pack.rb +185 -0
  51. data/lib/yarp/ripper_compat.rb +174 -0
  52. data/lib/yarp/serialize.rb +389 -0
  53. data/lib/yarp.rb +330 -0
  54. data/src/diagnostic.c +25 -0
  55. data/src/enc/yp_big5.c +79 -0
  56. data/src/enc/yp_euc_jp.c +85 -0
  57. data/src/enc/yp_gbk.c +88 -0
  58. data/src/enc/yp_shift_jis.c +83 -0
  59. data/src/enc/yp_tables.c +509 -0
  60. data/src/enc/yp_unicode.c +2320 -0
  61. data/src/enc/yp_windows_31j.c +83 -0
  62. data/src/node.c +2011 -0
  63. data/src/pack.c +493 -0
  64. data/src/prettyprint.c +1782 -0
  65. data/src/regexp.c +580 -0
  66. data/src/serialize.c +1576 -0
  67. data/src/token_type.c +347 -0
  68. data/src/unescape.c +576 -0
  69. data/src/util/yp_buffer.c +78 -0
  70. data/src/util/yp_char.c +229 -0
  71. data/src/util/yp_constant_pool.c +147 -0
  72. data/src/util/yp_list.c +50 -0
  73. data/src/util/yp_memchr.c +31 -0
  74. data/src/util/yp_newline_list.c +119 -0
  75. data/src/util/yp_state_stack.c +25 -0
  76. data/src/util/yp_string.c +207 -0
  77. data/src/util/yp_string_list.c +32 -0
  78. data/src/util/yp_strncasecmp.c +20 -0
  79. data/src/util/yp_strpbrk.c +66 -0
  80. data/src/yarp.c +13211 -0
  81. data/yarp.gemspec +100 -0
  82. metadata +125 -0
data/lib/yarp/lex_compat.rb
@@ -0,0 +1,759 @@
+# frozen_string_literal: true
+
+require "delegate"
+
+module YARP
+  # This class is responsible for lexing the source using YARP and then
+  # converting those tokens to be compatible with Ripper. In the vast majority
+  # of cases, this is a one-to-one mapping of the token type. Everything else
+  # generally lines up. However, there are a few cases that require special
+  # handling.
+  class LexCompat
+    # This is a mapping of YARP token types to Ripper token types. This is a
+    # many-to-one mapping because we split up our token types, whereas Ripper
+    # tends to group them.
+    RIPPER = {
+      AMPERSAND: :on_op,
+      AMPERSAND_AMPERSAND: :on_op,
+      AMPERSAND_AMPERSAND_EQUAL: :on_op,
+      AMPERSAND_DOT: :on_op,
+      AMPERSAND_EQUAL: :on_op,
+      BACK_REFERENCE: :on_backref,
+      BACKTICK: :on_backtick,
+      BANG: :on_op,
+      BANG_EQUAL: :on_op,
+      BANG_TILDE: :on_op,
+      BRACE_LEFT: :on_lbrace,
+      BRACE_RIGHT: :on_rbrace,
+      BRACKET_LEFT: :on_lbracket,
+      BRACKET_LEFT_ARRAY: :on_lbracket,
+      BRACKET_LEFT_RIGHT: :on_op,
+      BRACKET_LEFT_RIGHT_EQUAL: :on_op,
+      BRACKET_RIGHT: :on_rbracket,
+      CARET: :on_op,
+      CARET_EQUAL: :on_op,
+      CHARACTER_LITERAL: :on_CHAR,
+      CLASS_VARIABLE: :on_cvar,
+      COLON: :on_op,
+      COLON_COLON: :on_op,
+      COMMA: :on_comma,
+      COMMENT: :on_comment,
+      CONSTANT: :on_const,
+      DOT: :on_period,
+      DOT_DOT: :on_op,
+      DOT_DOT_DOT: :on_op,
+      EMBDOC_BEGIN: :on_embdoc_beg,
+      EMBDOC_END: :on_embdoc_end,
+      EMBDOC_LINE: :on_embdoc,
+      EMBEXPR_BEGIN: :on_embexpr_beg,
+      EMBEXPR_END: :on_embexpr_end,
+      EMBVAR: :on_embvar,
+      EOF: :on_eof,
+      EQUAL: :on_op,
+      EQUAL_EQUAL: :on_op,
+      EQUAL_EQUAL_EQUAL: :on_op,
+      EQUAL_GREATER: :on_op,
+      EQUAL_TILDE: :on_op,
+      FLOAT: :on_float,
+      FLOAT_IMAGINARY: :on_imaginary,
+      FLOAT_RATIONAL: :on_rational,
+      FLOAT_RATIONAL_IMAGINARY: :on_imaginary,
+      GREATER: :on_op,
+      GREATER_EQUAL: :on_op,
+      GREATER_GREATER: :on_op,
+      GREATER_GREATER_EQUAL: :on_op,
+      GLOBAL_VARIABLE: :on_gvar,
+      HEREDOC_END: :on_heredoc_end,
+      HEREDOC_START: :on_heredoc_beg,
+      IDENTIFIER: :on_ident,
+      IGNORED_NEWLINE: :on_ignored_nl,
+      INTEGER: :on_int,
+      INTEGER_IMAGINARY: :on_imaginary,
+      INTEGER_RATIONAL: :on_rational,
+      INTEGER_RATIONAL_IMAGINARY: :on_imaginary,
+      INSTANCE_VARIABLE: :on_ivar,
+      INVALID: :INVALID,
+      KEYWORD___ENCODING__: :on_kw,
+      KEYWORD___LINE__: :on_kw,
+      KEYWORD___FILE__: :on_kw,
+      KEYWORD_ALIAS: :on_kw,
+      KEYWORD_AND: :on_kw,
+      KEYWORD_BEGIN: :on_kw,
+      KEYWORD_BEGIN_UPCASE: :on_kw,
+      KEYWORD_BREAK: :on_kw,
+      KEYWORD_CASE: :on_kw,
+      KEYWORD_CLASS: :on_kw,
+      KEYWORD_DEF: :on_kw,
+      KEYWORD_DEFINED: :on_kw,
+      KEYWORD_DO: :on_kw,
+      KEYWORD_DO_LOOP: :on_kw,
+      KEYWORD_ELSE: :on_kw,
+      KEYWORD_ELSIF: :on_kw,
+      KEYWORD_END: :on_kw,
+      KEYWORD_END_UPCASE: :on_kw,
+      KEYWORD_ENSURE: :on_kw,
+      KEYWORD_FALSE: :on_kw,
+      KEYWORD_FOR: :on_kw,
+      KEYWORD_IF: :on_kw,
+      KEYWORD_IF_MODIFIER: :on_kw,
+      KEYWORD_IN: :on_kw,
+      KEYWORD_MODULE: :on_kw,
+      KEYWORD_NEXT: :on_kw,
+      KEYWORD_NIL: :on_kw,
+      KEYWORD_NOT: :on_kw,
+      KEYWORD_OR: :on_kw,
+      KEYWORD_REDO: :on_kw,
+      KEYWORD_RESCUE: :on_kw,
+      KEYWORD_RESCUE_MODIFIER: :on_kw,
+      KEYWORD_RETRY: :on_kw,
+      KEYWORD_RETURN: :on_kw,
+      KEYWORD_SELF: :on_kw,
+      KEYWORD_SUPER: :on_kw,
+      KEYWORD_THEN: :on_kw,
+      KEYWORD_TRUE: :on_kw,
+      KEYWORD_UNDEF: :on_kw,
+      KEYWORD_UNLESS: :on_kw,
+      KEYWORD_UNLESS_MODIFIER: :on_kw,
+      KEYWORD_UNTIL: :on_kw,
+      KEYWORD_UNTIL_MODIFIER: :on_kw,
+      KEYWORD_WHEN: :on_kw,
+      KEYWORD_WHILE: :on_kw,
+      KEYWORD_WHILE_MODIFIER: :on_kw,
+      KEYWORD_YIELD: :on_kw,
+      LABEL: :on_label,
+      LABEL_END: :on_label_end,
+      LAMBDA_BEGIN: :on_tlambeg,
+      LESS: :on_op,
+      LESS_EQUAL: :on_op,
+      LESS_EQUAL_GREATER: :on_op,
+      LESS_LESS: :on_op,
+      LESS_LESS_EQUAL: :on_op,
+      MINUS: :on_op,
+      MINUS_EQUAL: :on_op,
+      MINUS_GREATER: :on_tlambda,
+      NEWLINE: :on_nl,
+      NUMBERED_REFERENCE: :on_backref,
+      PARENTHESIS_LEFT: :on_lparen,
+      PARENTHESIS_LEFT_PARENTHESES: :on_lparen,
+      PARENTHESIS_RIGHT: :on_rparen,
+      PERCENT: :on_op,
+      PERCENT_EQUAL: :on_op,
+      PERCENT_LOWER_I: :on_qsymbols_beg,
+      PERCENT_LOWER_W: :on_qwords_beg,
+      PERCENT_LOWER_X: :on_backtick,
+      PERCENT_UPPER_I: :on_symbols_beg,
+      PERCENT_UPPER_W: :on_words_beg,
+      PIPE: :on_op,
+      PIPE_EQUAL: :on_op,
+      PIPE_PIPE: :on_op,
+      PIPE_PIPE_EQUAL: :on_op,
+      PLUS: :on_op,
+      PLUS_EQUAL: :on_op,
+      QUESTION_MARK: :on_op,
+      RATIONAL_FLOAT: :on_rational,
+      RATIONAL_INTEGER: :on_rational,
+      REGEXP_BEGIN: :on_regexp_beg,
+      REGEXP_END: :on_regexp_end,
+      SEMICOLON: :on_semicolon,
+      SLASH: :on_op,
+      SLASH_EQUAL: :on_op,
+      STAR: :on_op,
+      STAR_EQUAL: :on_op,
+      STAR_STAR: :on_op,
+      STAR_STAR_EQUAL: :on_op,
+      STRING_BEGIN: :on_tstring_beg,
+      STRING_CONTENT: :on_tstring_content,
+      STRING_END: :on_tstring_end,
+      SYMBOL_BEGIN: :on_symbeg,
+      TILDE: :on_op,
+      UAMPERSAND: :on_op,
+      UCOLON_COLON: :on_op,
+      UDOT_DOT: :on_op,
+      UDOT_DOT_DOT: :on_op,
+      UMINUS: :on_op,
+      UMINUS_NUM: :on_op,
+      UPLUS: :on_op,
+      USTAR: :on_op,
+      USTAR_STAR: :on_op,
+      WORDS_SEP: :on_words_sep,
+      "__END__": :on___end__
+    }.freeze
+
+    # When we produce tokens, we produce the same arrays that Ripper does.
+    # However, we add a couple of convenience methods onto them to make them a
+    # little easier to work with. We delegate all other methods to the array.
+    class Token < SimpleDelegator
+      def location
+        self[0]
+      end
+
+      def event
+        self[1]
+      end
+
+      def value
+        self[2]
+      end
+
+      def state
+        self[3]
+      end
+    end
+
+    # Ripper doesn't include the rest of the token in the event, so we need to
+    # trim it down to just the content on the first line when comparing.
+    class EndContentToken < Token
+      def ==(other)
+        [self[0], self[1], self[2][0..self[2].index("\n")], self[3]] == other
+      end
+    end
+
+    # It is extremely non-obvious which state the parser is in when comments get
+    # dispatched. Because of this we don't bother comparing state when comparing
+    # against other comment tokens.
+    class CommentToken < Token
+      def ==(other)
+        self[0...-1] == other[0...-1]
+      end
+    end
+
+    # Heredoc end tokens are emitted in an odd order, so we don't compare the
+    # state on them.
+    class HeredocEndToken < Token
+      def ==(other)
+        self[0...-1] == other[0...-1]
+      end
+    end
+
+    # Ident tokens for the most part are exactly the same, except sometimes we
+    # know an ident is a local when ripper doesn't (when they are introduced
+    # through named captures in regular expressions). In that case we don't
+    # compare the state.
+    class IdentToken < Token
+      def ==(other)
+        (self[0...-1] == other[0...-1]) && (
+          (other[3] == Ripper::EXPR_LABEL | Ripper::EXPR_END) ||
+          (other[3] & Ripper::EXPR_ARG_ANY != 0)
+        )
+      end
+    end
+
+    # Ignored newlines can occasionally have a LABEL state attached to them, so
+    # we compare the state differently here.
+    class IgnoredNewlineToken < Token
+      def ==(other)
+        return false unless self[0...-1] == other[0...-1]
+
+        if self[4] == Ripper::EXPR_ARG | Ripper::EXPR_LABELED
+          other[4] & Ripper::EXPR_ARG | Ripper::EXPR_LABELED > 0
+        else
+          self[4] == other[4]
+        end
+      end
+    end
+
+    # A heredoc in this case is a list of tokens that belong to the body of the
+    # heredoc that should be appended onto the list of tokens when the heredoc
+    # closes.
+    module Heredoc
+      # Heredocs that are not dash or tilde heredocs are just a list of tokens.
+      # We need to keep them around so that we can insert them in the correct
+      # order back into the token stream and set the state of the last token to
+      # the state that the heredoc was opened in.
+      class PlainHeredoc
+        attr_reader :tokens
+
+        def initialize
+          @tokens = []
+        end
+
+        def <<(token)
+          tokens << token
+        end
+
+        def to_a
+          tokens
+        end
+      end
+
+      # Dash heredocs are a little more complicated. They are a list of tokens
+      # that need to be split on "\\\n" to mimic Ripper's behavior. We also need
+      # to keep track of the state that the heredoc was opened in.
+      class DashHeredoc
+        attr_reader :split, :tokens
+
+        def initialize(split)
+          @split = split
+          @tokens = []
+        end
+
+        def <<(token)
+          tokens << token
+        end
+
+        def to_a
+          embexpr_balance = 0
+
+          tokens.each_with_object([]) do |token, results|
+            case token.event
+            when :on_embexpr_beg
+              embexpr_balance += 1
+              results << token
+            when :on_embexpr_end
+              embexpr_balance -= 1
+              results << token
+            when :on_tstring_content
+              if embexpr_balance == 0
+                lineno = token[0][0]
+                column = token[0][1]
+
+                if split
+                  # Split on "\\\n" to mimic Ripper's behavior. Use a lookbehind
+                  # to keep the delimiter in the result.
+                  token.value.split(/(?<=[^\\]\\\n)|(?<=[^\\]\\\r\n)/).each_with_index do |value, index|
+                    column = 0 if index > 0
+                    results << Token.new([[lineno, column], :on_tstring_content, value, token.state])
+                    lineno += value.count("\n")
+                  end
+                else
+                  results << token
+                end
+              else
+                results << token
+              end
+            else
+              results << token
+            end
+          end
+        end
+      end
+
+      # Heredocs that are dedenting heredocs are a little more complicated.
+      # Ripper outputs on_ignored_sp tokens for the whitespace that is being
+      # removed from the output. YARP only modifies the node itself and keeps
+      # the token the same. This simplifies YARP, but makes comparing against
+      # Ripper much harder because there is a length mismatch.
+      #
+      # Fortunately, we already have to pull out the heredoc tokens in order to
+      # insert them into the stream in the correct order. As such, we can do
+      # some extra manipulation on the tokens to make them match Ripper's
+      # output by mirroring the dedent logic that Ripper uses.
+      class DedentingHeredoc
+        TAB_WIDTH = 8
+
+        attr_reader :tokens, :dedent_next, :dedent, :embexpr_balance
+
+        def initialize
+          @tokens = []
+          @dedent_next = true
+          @dedent = nil
+          @embexpr_balance = 0
+        end
+
+        # As tokens are coming in, we track the minimum amount of common leading
+        # whitespace on plain string content tokens. This allows us to later
+        # remove that amount of whitespace from the beginning of each line.
+        def <<(token)
+          case token.event
+          when :on_embexpr_beg, :on_heredoc_beg
+            @embexpr_balance += 1
+          when :on_embexpr_end, :on_heredoc_end
+            @embexpr_balance -= 1
+          when :on_tstring_content
+            if embexpr_balance == 0
+              token.value.split(/(?<=\n)/).each_with_index do |line, index|
+                next if line.strip.empty? && line.end_with?("\n")
+                next if !(dedent_next || index > 0)
+
+                leading = line[/\A(\s*)\n?/, 1]
+                next_dedent = 0
+
+                leading.each_char do |char|
+                  if char == "\t"
+                    next_dedent = next_dedent - (next_dedent % TAB_WIDTH) + TAB_WIDTH
+                  else
+                    next_dedent += 1
+                  end
+                end
+
+                @dedent = [dedent, next_dedent].compact.min
+              end
+            end
+          end
+
+          @dedent_next = token.event == :on_tstring_content && embexpr_balance == 0
+          tokens << token
+        end
+
+        def to_a
+          # If every line in the heredoc is blank, we still need to split up the
+          # string content token into multiple tokens.
+          if dedent.nil?
+            results = []
+            embexpr_balance = 0
+
+            tokens.each do |token|
+              case token.event
+              when :on_embexpr_beg, :on_heredoc_beg
+                embexpr_balance += 1
+                results << token
+              when :on_embexpr_end, :on_heredoc_end
+                embexpr_balance -= 1
+                results << token
+              when :on_tstring_content
+                if embexpr_balance == 0
+                  lineno = token[0][0]
+                  column = token[0][1]
+
+                  token.value.split(/(?<=\n)/).each_with_index do |value, index|
+                    column = 0 if index > 0
+                    results << Token.new([[lineno, column], :on_tstring_content, value, token.state])
+                    lineno += 1
+                  end
+                else
+                  results << token
+                end
+              else
+                results << token
+              end
+            end
+
+            return results
+          end
+
+          # Otherwise, we're going to run through each token in the list and
+          # insert on_ignored_sp tokens for the amount of dedent that we need to
+          # perform. We also need to remove the dedent from the beginning of
+          # each line of plain string content tokens.
+          results = []
+          dedent_next = true
+          embexpr_balance = 0
+
+          tokens.each do |token|
+            # Notice that the structure of this conditional largely matches the
+            # whitespace calculation we performed above. This is because
+            # checking if the subsequent token needs to be dedented is common to
+            # both the dedent calculation and the ignored_sp insertion.
+            case token.event
+            when :on_embexpr_beg
+              embexpr_balance += 1
+              results << token
+            when :on_embexpr_end
+              embexpr_balance -= 1
+              results << token
+            when :on_tstring_content
+              if embexpr_balance == 0
+                # Here we're going to split the string on newlines, but maintain
+                # the newlines in the resulting array. We'll do that with a look
+                # behind assertion.
+                splits = token.value.split(/(?<=\n)/)
+                index = 0
+
+                while index < splits.length
+                  line = splits[index]
+                  lineno = token[0][0] + index
+                  column = token[0][1]
+
+                  # Blank lines do not count toward common leading whitespace
+                  # calculation and do not need to be dedented.
+                  if dedent_next || index > 0
+                    column = 0
+                  end
+
+                  # If the dedent is 0 and we're not supposed to dedent the next
+                  # line or this line doesn't start with whitespace, then we
+                  # should concatenate the rest of the string to match ripper.
+                  if dedent == 0 && (!dedent_next || !line.start_with?(/\s/))
+                    line = splits[index..].join
+                    index = splits.length
+                  end
+
+                  # If we are supposed to dedent this line or if this is not the
+                  # first line of the string and this line isn't entirely blank,
+                  # then we need to insert an on_ignored_sp token and remove the
+                  # dedent from the beginning of the line.
+                  if (dedent > 0) && (dedent_next || index > 0)
+                    deleting = 0
+                    deleted_chars = []
+
+                    # Gather up all of the characters that we're going to
+                    # delete, stopping when you hit a character that would put
+                    # you over the dedent amount.
+                    line.each_char.with_index do |char, i|
+                      case char
+                      when "\r"
+                        if line.chars[i + 1] == "\n"
+                          break
+                        end
+                      when "\n"
+                        break
+                      when "\t"
+                        deleting = deleting - (deleting % TAB_WIDTH) + TAB_WIDTH
+                      else
+                        deleting += 1
+                      end
+
+                      break if deleting > dedent
+                      deleted_chars << char
+                    end
+
+                    # If we have something to delete, then delete it from the
+                    # string and insert an on_ignored_sp token.
+                    if deleted_chars.any?
+                      ignored = deleted_chars.join
+                      line.delete_prefix!(ignored)
+
+                      results << Token.new([[lineno, 0], :on_ignored_sp, ignored, token[3]])
+                      column = ignored.length
+                    end
+                  end
+
+                  results << Token.new([[lineno, column], token[1], line, token[3]]) unless line.empty?
+                  index += 1
+                end
+              else
+                results << token
+              end
+            else
+              results << token
+            end
+
+            dedent_next =
+              ((token.event == :on_tstring_content) || (token.event == :on_heredoc_end)) &&
+              embexpr_balance == 0
+          end
+
+          results
+        end
+      end
+
+      # Here we will split between the two types of heredocs and return the
+      # object that will store their tokens.
+      def self.build(opening)
+        case opening.value[2]
+        when "~"
+          DedentingHeredoc.new
+        when "-"
+          DashHeredoc.new(opening.value[3] != "'")
+        else
+          PlainHeredoc.new
+        end
+      end
+    end
+
+    attr_reader :source, :filepath
+
+    def initialize(source, filepath = "")
+      @source = source
+      @filepath = filepath || ""
+    end
+
+    def result
+      tokens = []
+
+      state = :default
+      heredoc_stack = [[]]
+
+      result = YARP.lex(source, @filepath)
+      result_value = result.value
+      previous_state = nil
+
+      # If there's a UTF-8 byte-order mark at the start of the file, then ripper
+      # sets every token's column on the first line back by 6 bytes. It also
+      # keeps the byte order mark in the first token's value. This is weird, and
+      # I don't want to mirror that in our parser. So instead, we'll match up
+      # the values here, and then match up the locations as we process the
+      # tokens.
+      bom = source.bytes[0..2] == [0xEF, 0xBB, 0xBF]
+      result_value[0][0].value.prepend("\xEF\xBB\xBF") if bom
+
+      result_value.each_with_index do |(token, lex_state), index|
+        lineno = token.location.start_line
+        column = token.location.start_column
+        column -= index == 0 ? 6 : 3 if bom && lineno == 1
+
+        event = RIPPER.fetch(token.type)
+        value = token.value
+        lex_state = Ripper::Lexer::State.new(lex_state)
+
+        token =
+          case event
+          when :on___end__
+            EndContentToken.new([[lineno, column], event, value, lex_state])
+          when :on_comment
+            CommentToken.new([[lineno, column], event, value, lex_state])
+          when :on_heredoc_end
+            # Heredoc end tokens can be emitted in an odd order, so we don't
+            # want to bother comparing the state on them.
+            HeredocEndToken.new([[lineno, column], event, value, lex_state])
+          when :on_embexpr_end, :on_ident
+            if lex_state == Ripper::EXPR_END | Ripper::EXPR_LABEL
+              # In the event that we're comparing identifiers, we're going to
+              # allow a little divergence. Ripper doesn't account for local
+              # variables introduced through named captures in regexes, and we
+              # do, which accounts for this difference.
+              IdentToken.new([[lineno, column], event, value, lex_state])
+            else
+              Token.new([[lineno, column], event, value, lex_state])
+            end
+          when :on_ignored_nl
+            # Ignored newlines can occasionally have a LABEL state attached to
+            # them which doesn't actually impact anything. We don't mirror that
+            # state so we ignore it.
+            IgnoredNewlineToken.new([[lineno, column], event, value, lex_state])
+          when :on_regexp_end
+            # On regex end, Ripper scans and then sets end state, so the ripper
+            # lexed output is begin, when it should be end. YARP sets lex state
+            # correctly to end state, but we want to be able to compare against
+            # Ripper's lexed state. So here, if it's a regexp end token, we
+            # output the state as the previous state, solely for the sake of
+            # comparison.
+            previous_token = result_value[index - 1][0]
+            lex_state =
+              if RIPPER.fetch(previous_token.type) == :on_embexpr_end
+                # If the previous token is embexpr_end, then we have to do even
+                # more processing. The end of an embedded expression sets the
+                # state to the state that it had at the beginning of the
+                # embedded expression. So we have to go and find that state and
+                # set it here.
+                counter = 1
+                current_index = index - 1
+
+                until counter == 0
+                  current_index -= 1
+                  current_event = RIPPER.fetch(result_value[current_index][0].type)
+                  counter += { on_embexpr_beg: -1, on_embexpr_end: 1 }[current_event] || 0
+                end
+
+                Ripper::Lexer::State.new(result_value[current_index][1])
+              else
+                previous_state
+              end
+
+            Token.new([[lineno, column], event, value, lex_state])
+          else
+            Token.new([[lineno, column], event, value, lex_state])
+          end
+
+        previous_state = lex_state
+
+        # The order in which tokens appear in our lexer is different from the
+        # order that they appear in Ripper. When we hit the declaration of a
+        # heredoc in YARP, we skip forward and lex the rest of the content of
+        # the heredoc before going back and lexing at the end of the heredoc
+        # identifier.
+        #
+        # To match up to ripper, we keep a small state variable around here to
+        # track whether we're in the middle of a heredoc or not. In this way we
+        # can shuffle around the token to match Ripper's output.
+        case state
+        when :default
+          tokens << token
+
+          if event == :on_heredoc_beg
+            state = :heredoc_opened
+            heredoc_stack.last << Heredoc.build(token)
+          end
+        when :heredoc_opened
+          heredoc_stack.last.last << token
+
+          case event
+          when :on_heredoc_beg
+            heredoc_stack << [Heredoc.build(token)]
+          when :on_heredoc_end
+            state = :heredoc_closed
+          end
+        when :heredoc_closed
+          if %i[on_nl on_ignored_nl on_comment].include?(event) || (event == :on_tstring_content && value.end_with?("\n"))
+            if heredoc_stack.size > 1
+              flushing = heredoc_stack.pop
+              heredoc_stack.last.last << token
+
+              flushing.each do |heredoc|
+                heredoc.to_a.each do |flushed_token|
+                  heredoc_stack.last.last << flushed_token
+                end
+              end
+
+              state = :heredoc_opened
+              next
+            end
+          elsif event == :on_heredoc_beg
+            tokens << token
+            state = :heredoc_opened
+            heredoc_stack.last << Heredoc.build(token)
+            next
+          elsif heredoc_stack.size > 1
+            heredoc_stack[-2].last << token
+            next
+          end
+
+          heredoc_stack.last.each do |heredoc|
+            tokens.concat(heredoc.to_a)
+          end
+
+          heredoc_stack.last.clear
+          state = :default
+
+          tokens << token
+        end
+      end
+
+      tokens.reject! { |t| t.event == :on_eof }
+
+      # We sort by location to compare against Ripper's output
+      tokens.sort_by!(&:location)
+
+      if result_value.size - 1 > tokens.size
+        raise StandardError, "Lost tokens when performing lex_compat"
+      end
+
+      ParseResult.new(tokens, result.comments, result.errors, result.warnings, [])
+    end
+  end
+
+  # The constant that wraps the behavior of the lexer to match Ripper's output
+  # is an implementation detail, so we don't want it to be public.
+  private_constant :LexCompat
+
+  # Returns an array of tokens that closely resembles that of the Ripper lexer.
+  # The only difference is that since we don't keep track of lexer state in the
+  # same way, it's going to always return the NONE state.
+  def self.lex_compat(source, filepath = "")
+    LexCompat.new(source, filepath).result
+  end
+
+  # This lexes with the Ripper lex. It drops any space events but otherwise
+  # returns the same tokens. Raises SyntaxError if the syntax in source is
+  # invalid.
+  def self.lex_ripper(source)
+    previous = []
+    results = []
+
+    Ripper.lex(source, raise_errors: true).each do |token|
+      case token[1]
+      when :on_sp
+        # skip
+      when :on_tstring_content
+        if previous[1] == :on_tstring_content &&
+            (token[2].start_with?("\#$") || token[2].start_with?("\#@"))
+          previous[2] << token[2]
+        else
+          results << token
+          previous = token
+        end
+      when :on_words_sep
+        if previous[1] == :on_words_sep
+          previous[2] << token[2]
+        else
+          results << token
+          previous = token
+        end
+      else
+        results << token
+        previous = token
+      end
+    end
+
+    results
+  end
+end
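
For context on how this compatibility layer is typically exercised, the sketch below lexes a small heredoc snippet with both of the public entry points defined in this file and compares the two token streams pairwise. It is a minimal illustration only, not part of the package: the sample source string and variable names are made up for the example, and it assumes the yarp gem from this release plus the standard library ripper are available to require. The relaxed == definitions on the Token subclasses above are what make the comparison tolerant of the known state differences.

require "ripper"
require "yarp"

# A source snippet that exercises the heredoc handling described above.
# The outer heredoc is single-quoted so the inner interpolation stays literal.
source = <<~'RUBY'
  foo = <<~HEREDOC
    bar #{1 + 2}
  HEREDOC
RUBY

# Both calls yield Ripper-shaped tokens: [[lineno, column], event, value, state].
yarp_tokens = YARP.lex_compat(source).value
ripper_tokens = YARP.lex_ripper(source)

# Compare pairwise; any divergence is reported with its location.
yarp_tokens.zip(ripper_tokens).each do |yarp_token, ripper_token|
  puts "mismatch at #{yarp_token.location.inspect}" unless yarp_token == ripper_token
end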