yarp 0.6.0

Files changed (82)
  1. checksums.yaml +7 -0
  2. data/CODE_OF_CONDUCT.md +76 -0
  3. data/CONTRIBUTING.md +51 -0
  4. data/LICENSE.md +7 -0
  5. data/Makefile.in +79 -0
  6. data/README.md +86 -0
  7. data/config.h.in +25 -0
  8. data/config.yml +2147 -0
  9. data/configure +4487 -0
  10. data/docs/build_system.md +85 -0
  11. data/docs/building.md +26 -0
  12. data/docs/configuration.md +56 -0
  13. data/docs/design.md +53 -0
  14. data/docs/encoding.md +116 -0
  15. data/docs/extension.md +20 -0
  16. data/docs/fuzzing.md +93 -0
  17. data/docs/heredocs.md +36 -0
  18. data/docs/mapping.md +117 -0
  19. data/docs/ripper.md +36 -0
  20. data/docs/serialization.md +130 -0
  21. data/docs/testing.md +55 -0
  22. data/ext/yarp/api_node.c +3680 -0
  23. data/ext/yarp/api_pack.c +256 -0
  24. data/ext/yarp/extconf.rb +131 -0
  25. data/ext/yarp/extension.c +547 -0
  26. data/ext/yarp/extension.h +18 -0
  27. data/include/yarp/ast.h +1412 -0
  28. data/include/yarp/defines.h +54 -0
  29. data/include/yarp/diagnostic.h +24 -0
  30. data/include/yarp/enc/yp_encoding.h +94 -0
  31. data/include/yarp/node.h +36 -0
  32. data/include/yarp/pack.h +141 -0
  33. data/include/yarp/parser.h +389 -0
  34. data/include/yarp/regexp.h +19 -0
  35. data/include/yarp/unescape.h +42 -0
  36. data/include/yarp/util/yp_buffer.h +39 -0
  37. data/include/yarp/util/yp_char.h +75 -0
  38. data/include/yarp/util/yp_constant_pool.h +64 -0
  39. data/include/yarp/util/yp_list.h +67 -0
  40. data/include/yarp/util/yp_memchr.h +14 -0
  41. data/include/yarp/util/yp_newline_list.h +54 -0
  42. data/include/yarp/util/yp_state_stack.h +24 -0
  43. data/include/yarp/util/yp_string.h +57 -0
  44. data/include/yarp/util/yp_string_list.h +28 -0
  45. data/include/yarp/util/yp_strpbrk.h +29 -0
  46. data/include/yarp/version.h +5 -0
  47. data/include/yarp.h +69 -0
  48. data/lib/yarp/lex_compat.rb +759 -0
  49. data/lib/yarp/node.rb +7428 -0
  50. data/lib/yarp/pack.rb +185 -0
  51. data/lib/yarp/ripper_compat.rb +174 -0
  52. data/lib/yarp/serialize.rb +389 -0
  53. data/lib/yarp.rb +330 -0
  54. data/src/diagnostic.c +25 -0
  55. data/src/enc/yp_big5.c +79 -0
  56. data/src/enc/yp_euc_jp.c +85 -0
  57. data/src/enc/yp_gbk.c +88 -0
  58. data/src/enc/yp_shift_jis.c +83 -0
  59. data/src/enc/yp_tables.c +509 -0
  60. data/src/enc/yp_unicode.c +2320 -0
  61. data/src/enc/yp_windows_31j.c +83 -0
  62. data/src/node.c +2011 -0
  63. data/src/pack.c +493 -0
  64. data/src/prettyprint.c +1782 -0
  65. data/src/regexp.c +580 -0
  66. data/src/serialize.c +1576 -0
  67. data/src/token_type.c +347 -0
  68. data/src/unescape.c +576 -0
  69. data/src/util/yp_buffer.c +78 -0
  70. data/src/util/yp_char.c +229 -0
  71. data/src/util/yp_constant_pool.c +147 -0
  72. data/src/util/yp_list.c +50 -0
  73. data/src/util/yp_memchr.c +31 -0
  74. data/src/util/yp_newline_list.c +119 -0
  75. data/src/util/yp_state_stack.c +25 -0
  76. data/src/util/yp_string.c +207 -0
  77. data/src/util/yp_string_list.c +32 -0
  78. data/src/util/yp_strncasecmp.c +20 -0
  79. data/src/util/yp_strpbrk.c +66 -0
  80. data/src/yarp.c +13211 -0
  81. data/yarp.gemspec +100 -0
  82. metadata +125 -0
data/lib/yarp/lex_compat.rb
@@ -0,0 +1,759 @@
+# frozen_string_literal: true
+
+require "delegate"
+
+module YARP
+  # This class is responsible for lexing the source using YARP and then
+  # converting those tokens to be compatible with Ripper. In the vast majority
+  # of cases, this is a one-to-one mapping of the token type. Everything else
+  # generally lines up. However, there are a few cases that require special
+  # handling.
+  class LexCompat
+    # This is a mapping of YARP token types to Ripper token types. This is a
+    # many-to-one mapping because we split up our token types, whereas Ripper
+    # tends to group them.
+    RIPPER = {
+      AMPERSAND: :on_op,
+      AMPERSAND_AMPERSAND: :on_op,
+      AMPERSAND_AMPERSAND_EQUAL: :on_op,
+      AMPERSAND_DOT: :on_op,
+      AMPERSAND_EQUAL: :on_op,
+      BACK_REFERENCE: :on_backref,
+      BACKTICK: :on_backtick,
+      BANG: :on_op,
+      BANG_EQUAL: :on_op,
+      BANG_TILDE: :on_op,
+      BRACE_LEFT: :on_lbrace,
+      BRACE_RIGHT: :on_rbrace,
+      BRACKET_LEFT: :on_lbracket,
+      BRACKET_LEFT_ARRAY: :on_lbracket,
+      BRACKET_LEFT_RIGHT: :on_op,
+      BRACKET_LEFT_RIGHT_EQUAL: :on_op,
+      BRACKET_RIGHT: :on_rbracket,
+      CARET: :on_op,
+      CARET_EQUAL: :on_op,
+      CHARACTER_LITERAL: :on_CHAR,
+      CLASS_VARIABLE: :on_cvar,
+      COLON: :on_op,
+      COLON_COLON: :on_op,
+      COMMA: :on_comma,
+      COMMENT: :on_comment,
+      CONSTANT: :on_const,
+      DOT: :on_period,
+      DOT_DOT: :on_op,
+      DOT_DOT_DOT: :on_op,
+      EMBDOC_BEGIN: :on_embdoc_beg,
+      EMBDOC_END: :on_embdoc_end,
+      EMBDOC_LINE: :on_embdoc,
+      EMBEXPR_BEGIN: :on_embexpr_beg,
+      EMBEXPR_END: :on_embexpr_end,
+      EMBVAR: :on_embvar,
+      EOF: :on_eof,
+      EQUAL: :on_op,
+      EQUAL_EQUAL: :on_op,
+      EQUAL_EQUAL_EQUAL: :on_op,
+      EQUAL_GREATER: :on_op,
+      EQUAL_TILDE: :on_op,
+      FLOAT: :on_float,
+      FLOAT_IMAGINARY: :on_imaginary,
+      FLOAT_RATIONAL: :on_rational,
+      FLOAT_RATIONAL_IMAGINARY: :on_imaginary,
+      GREATER: :on_op,
+      GREATER_EQUAL: :on_op,
+      GREATER_GREATER: :on_op,
+      GREATER_GREATER_EQUAL: :on_op,
+      GLOBAL_VARIABLE: :on_gvar,
+      HEREDOC_END: :on_heredoc_end,
+      HEREDOC_START: :on_heredoc_beg,
+      IDENTIFIER: :on_ident,
+      IGNORED_NEWLINE: :on_ignored_nl,
+      INTEGER: :on_int,
+      INTEGER_IMAGINARY: :on_imaginary,
+      INTEGER_RATIONAL: :on_rational,
+      INTEGER_RATIONAL_IMAGINARY: :on_imaginary,
+      INSTANCE_VARIABLE: :on_ivar,
+      INVALID: :INVALID,
+      KEYWORD___ENCODING__: :on_kw,
+      KEYWORD___LINE__: :on_kw,
+      KEYWORD___FILE__: :on_kw,
+      KEYWORD_ALIAS: :on_kw,
+      KEYWORD_AND: :on_kw,
+      KEYWORD_BEGIN: :on_kw,
+      KEYWORD_BEGIN_UPCASE: :on_kw,
+      KEYWORD_BREAK: :on_kw,
+      KEYWORD_CASE: :on_kw,
+      KEYWORD_CLASS: :on_kw,
+      KEYWORD_DEF: :on_kw,
+      KEYWORD_DEFINED: :on_kw,
+      KEYWORD_DO: :on_kw,
+      KEYWORD_DO_LOOP: :on_kw,
+      KEYWORD_ELSE: :on_kw,
+      KEYWORD_ELSIF: :on_kw,
+      KEYWORD_END: :on_kw,
+      KEYWORD_END_UPCASE: :on_kw,
+      KEYWORD_ENSURE: :on_kw,
+      KEYWORD_FALSE: :on_kw,
+      KEYWORD_FOR: :on_kw,
+      KEYWORD_IF: :on_kw,
+      KEYWORD_IF_MODIFIER: :on_kw,
+      KEYWORD_IN: :on_kw,
+      KEYWORD_MODULE: :on_kw,
+      KEYWORD_NEXT: :on_kw,
+      KEYWORD_NIL: :on_kw,
+      KEYWORD_NOT: :on_kw,
+      KEYWORD_OR: :on_kw,
+      KEYWORD_REDO: :on_kw,
+      KEYWORD_RESCUE: :on_kw,
+      KEYWORD_RESCUE_MODIFIER: :on_kw,
+      KEYWORD_RETRY: :on_kw,
+      KEYWORD_RETURN: :on_kw,
+      KEYWORD_SELF: :on_kw,
+      KEYWORD_SUPER: :on_kw,
+      KEYWORD_THEN: :on_kw,
+      KEYWORD_TRUE: :on_kw,
+      KEYWORD_UNDEF: :on_kw,
+      KEYWORD_UNLESS: :on_kw,
+      KEYWORD_UNLESS_MODIFIER: :on_kw,
+      KEYWORD_UNTIL: :on_kw,
+      KEYWORD_UNTIL_MODIFIER: :on_kw,
+      KEYWORD_WHEN: :on_kw,
+      KEYWORD_WHILE: :on_kw,
+      KEYWORD_WHILE_MODIFIER: :on_kw,
+      KEYWORD_YIELD: :on_kw,
+      LABEL: :on_label,
+      LABEL_END: :on_label_end,
+      LAMBDA_BEGIN: :on_tlambeg,
+      LESS: :on_op,
+      LESS_EQUAL: :on_op,
+      LESS_EQUAL_GREATER: :on_op,
+      LESS_LESS: :on_op,
+      LESS_LESS_EQUAL: :on_op,
+      MINUS: :on_op,
+      MINUS_EQUAL: :on_op,
+      MINUS_GREATER: :on_tlambda,
+      NEWLINE: :on_nl,
+      NUMBERED_REFERENCE: :on_backref,
+      PARENTHESIS_LEFT: :on_lparen,
+      PARENTHESIS_LEFT_PARENTHESES: :on_lparen,
+      PARENTHESIS_RIGHT: :on_rparen,
+      PERCENT: :on_op,
+      PERCENT_EQUAL: :on_op,
+      PERCENT_LOWER_I: :on_qsymbols_beg,
+      PERCENT_LOWER_W: :on_qwords_beg,
+      PERCENT_LOWER_X: :on_backtick,
+      PERCENT_UPPER_I: :on_symbols_beg,
+      PERCENT_UPPER_W: :on_words_beg,
+      PIPE: :on_op,
+      PIPE_EQUAL: :on_op,
+      PIPE_PIPE: :on_op,
+      PIPE_PIPE_EQUAL: :on_op,
+      PLUS: :on_op,
+      PLUS_EQUAL: :on_op,
+      QUESTION_MARK: :on_op,
+      RATIONAL_FLOAT: :on_rational,
+      RATIONAL_INTEGER: :on_rational,
+      REGEXP_BEGIN: :on_regexp_beg,
+      REGEXP_END: :on_regexp_end,
+      SEMICOLON: :on_semicolon,
+      SLASH: :on_op,
+      SLASH_EQUAL: :on_op,
+      STAR: :on_op,
+      STAR_EQUAL: :on_op,
+      STAR_STAR: :on_op,
+      STAR_STAR_EQUAL: :on_op,
+      STRING_BEGIN: :on_tstring_beg,
+      STRING_CONTENT: :on_tstring_content,
+      STRING_END: :on_tstring_end,
+      SYMBOL_BEGIN: :on_symbeg,
+      TILDE: :on_op,
+      UAMPERSAND: :on_op,
+      UCOLON_COLON: :on_op,
+      UDOT_DOT: :on_op,
+      UDOT_DOT_DOT: :on_op,
+      UMINUS: :on_op,
+      UMINUS_NUM: :on_op,
+      UPLUS: :on_op,
+      USTAR: :on_op,
+      USTAR_STAR: :on_op,
+      WORDS_SEP: :on_words_sep,
+      "__END__": :on___end__
+    }.freeze
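
The many-to-one shape described above means that several distinct YARP token types collapse onto a single Ripper event. A quick sketch of the lookup, for reference only (RIPPER is a private detail of LexCompat):

    RIPPER.fetch(:PLUS)        # => :on_op
    RIPPER.fetch(:PLUS_EQUAL)  # => :on_op
    RIPPER.fetch(:STAR_STAR)   # => :on_op
    RIPPER.fetch(:CONSTANT)    # => :on_const
    RIPPER.fetch(:IDENTIFIER)  # => :on_ident
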
+
+    # When we produce tokens, we produce the same arrays that Ripper does.
+    # However, we add a couple of convenience methods onto them to make them a
+    # little easier to work with. We delegate all other methods to the array.
+    class Token < SimpleDelegator
+      def location
+        self[0]
+      end
+
+      def event
+        self[1]
+      end
+
+      def value
+        self[2]
+      end
+
+      def state
+        self[3]
+      end
+    end
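
Because Token delegates everything else to the wrapped array, a token can be read either as a Ripper-style tuple or through the named readers above. A minimal illustration (Token is internal to LexCompat, and the state value here is arbitrary):

    token = Token.new([[1, 0], :on_ident, "foo", Ripper::EXPR_CMDARG])
    token.event                                               # => :on_ident
    token.value                                               # => "foo"
    token == [[1, 0], :on_ident, "foo", Ripper::EXPR_CMDARG]  # => true
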
+
+    # Ripper doesn't include the rest of the token in the event, so we need to
+    # trim it down to just the content on the first line when comparing.
+    class EndContentToken < Token
+      def ==(other)
+        [self[0], self[1], self[2][0..self[2].index("\n")], self[3]] == other
+      end
+    end
+
+    # It is extremely non-obvious which state the parser is in when comments get
+    # dispatched. Because of this we don't bother comparing state when comparing
+    # against other comment tokens.
+    class CommentToken < Token
+      def ==(other)
+        self[0...-1] == other[0...-1]
+      end
+    end
+
+    # Heredoc end tokens are emitted in an odd order, so we don't compare the
+    # state on them.
+    class HeredocEndToken < Token
+      def ==(other)
+        self[0...-1] == other[0...-1]
+      end
+    end
+
+    # Ident tokens for the most part are exactly the same, except sometimes we
+    # know an ident is a local when ripper doesn't (when they are introduced
+    # through named captures in regular expressions). In that case we don't
+    # compare the state.
+    class IdentToken < Token
+      def ==(other)
+        (self[0...-1] == other[0...-1]) && (
+          (other[3] == Ripper::EXPR_LABEL | Ripper::EXPR_END) ||
+          (other[3] & Ripper::EXPR_ARG_ANY != 0)
+        )
+      end
+    end
+
+    # Ignored newlines can occasionally have a LABEL state attached to them, so
+    # we compare the state differently here.
+    class IgnoredNewlineToken < Token
+      def ==(other)
+        return false unless self[0...-1] == other[0...-1]
+
+        if self[4] == Ripper::EXPR_ARG | Ripper::EXPR_LABELED
+          other[4] & Ripper::EXPR_ARG | Ripper::EXPR_LABELED > 0
+        else
+          self[4] == other[4]
+        end
+      end
+    end
+
+    # A heredoc in this case is a list of tokens that belong to the body of the
+    # heredoc that should be appended onto the list of tokens when the heredoc
+    # closes.
+    module Heredoc
+      # Heredocs that are not dash or tilde heredocs are just a list of tokens.
+      # We need to keep them around so that we can insert them in the correct
+      # order back into the token stream and set the state of the last token to
+      # the state that the heredoc was opened in.
+      class PlainHeredoc
+        attr_reader :tokens
+
+        def initialize
+          @tokens = []
+        end
+
+        def <<(token)
+          tokens << token
+        end
+
+        def to_a
+          tokens
+        end
+      end
+
+      # Dash heredocs are a little more complicated. They are a list of tokens
+      # that need to be split on "\\\n" to mimic Ripper's behavior. We also need
+      # to keep track of the state that the heredoc was opened in.
+      class DashHeredoc
+        attr_reader :split, :tokens
+
+        def initialize(split)
+          @split = split
+          @tokens = []
+        end
+
+        def <<(token)
+          tokens << token
+        end
+
+        def to_a
+          embexpr_balance = 0
+
+          tokens.each_with_object([]) do |token, results|
+            case token.event
+            when :on_embexpr_beg
+              embexpr_balance += 1
+              results << token
+            when :on_embexpr_end
+              embexpr_balance -= 1
+              results << token
+            when :on_tstring_content
+              if embexpr_balance == 0
+                lineno = token[0][0]
+                column = token[0][1]
+
+                if split
+                  # Split on "\\\n" to mimic Ripper's behavior. Use a lookbehind
+                  # to keep the delimiter in the result.
+                  token.value.split(/(?<=[^\\]\\\n)|(?<=[^\\]\\\r\n)/).each_with_index do |value, index|
+                    column = 0 if index > 0
+                    results << Token.new([[lineno, column], :on_tstring_content, value, token.state])
+                    lineno += value.count("\n")
+                  end
+                else
+                  results << token
+                end
+              else
+                results << token
+              end
+            else
+              results << token
+            end
+          end
+        end
+      end
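
The split that DashHeredoc#to_a performs keeps the escaped newline attached to the chunk before it, which is how Ripper divides the string content. For example, with an illustrative piece of heredoc content:

    "foo \\\n  bar\n".split(/(?<=[^\\]\\\n)|(?<=[^\\]\\\r\n)/)
    # => ["foo \\\n", "  bar\n"]
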
+
+      # Heredocs that are dedenting heredocs are a little more complicated.
+      # Ripper outputs on_ignored_sp tokens for the whitespace that is being
+      # removed from the output. YARP only modifies the node itself and keeps
+      # the token the same. This simplifies YARP, but makes comparing against
+      # Ripper much harder because there is a length mismatch.
+      #
+      # Fortunately, we already have to pull out the heredoc tokens in order to
+      # insert them into the stream in the correct order. As such, we can do
+      # some extra manipulation on the tokens to make them match Ripper's
+      # output by mirroring the dedent logic that Ripper uses.
+      class DedentingHeredoc
+        TAB_WIDTH = 8
+
+        attr_reader :tokens, :dedent_next, :dedent, :embexpr_balance
+
+        def initialize
+          @tokens = []
+          @dedent_next = true
+          @dedent = nil
+          @embexpr_balance = 0
+        end
+
+        # As tokens are coming in, we track the minimum amount of common leading
+        # whitespace on plain string content tokens. This allows us to later
+        # remove that amount of whitespace from the beginning of each line.
+        def <<(token)
+          case token.event
+          when :on_embexpr_beg, :on_heredoc_beg
+            @embexpr_balance += 1
+          when :on_embexpr_end, :on_heredoc_end
+            @embexpr_balance -= 1
+          when :on_tstring_content
+            if embexpr_balance == 0
+              token.value.split(/(?<=\n)/).each_with_index do |line, index|
+                next if line.strip.empty? && line.end_with?("\n")
+                next if !(dedent_next || index > 0)
+
+                leading = line[/\A(\s*)\n?/, 1]
+                next_dedent = 0
+
+                leading.each_char do |char|
+                  if char == "\t"
+                    next_dedent = next_dedent - (next_dedent % TAB_WIDTH) + TAB_WIDTH
+                  else
+                    next_dedent += 1
+                  end
+                end
+
+                @dedent = [dedent, next_dedent].compact.min
+              end
+            end
+          end
+
+          @dedent_next = token.event == :on_tstring_content && embexpr_balance == 0
+          tokens << token
+        end
+
+        def to_a
+          # If every line in the heredoc is blank, we still need to split up the
+          # string content token into multiple tokens.
+          if dedent.nil?
+            results = []
+            embexpr_balance = 0
+
+            tokens.each do |token|
+              case token.event
+              when :on_embexpr_beg, :on_heredoc_beg
+                embexpr_balance += 1
+                results << token
+              when :on_embexpr_end, :on_heredoc_end
+                embexpr_balance -= 1
+                results << token
+              when :on_tstring_content
+                if embexpr_balance == 0
+                  lineno = token[0][0]
+                  column = token[0][1]
+
+                  token.value.split(/(?<=\n)/).each_with_index do |value, index|
+                    column = 0 if index > 0
+                    results << Token.new([[lineno, column], :on_tstring_content, value, token.state])
+                    lineno += 1
+                  end
+                else
+                  results << token
+                end
+              else
+                results << token
+              end
+            end
+
+            return results
+          end
+
+          # Otherwise, we're going to run through each token in the list and
+          # insert on_ignored_sp tokens for the amount of dedent that we need to
+          # perform. We also need to remove the dedent from the beginning of
+          # each line of plain string content tokens.
+          results = []
+          dedent_next = true
+          embexpr_balance = 0
+
+          tokens.each do |token|
+            # Notice that the structure of this conditional largely matches the
+            # whitespace calculation we performed above. This is because
+            # checking if the subsequent token needs to be dedented is common to
+            # both the dedent calculation and the ignored_sp insertion.
+            case token.event
+            when :on_embexpr_beg
+              embexpr_balance += 1
+              results << token
+            when :on_embexpr_end
+              embexpr_balance -= 1
+              results << token
+            when :on_tstring_content
+              if embexpr_balance == 0
+                # Here we're going to split the string on newlines, but maintain
+                # the newlines in the resulting array. We'll do that with a look
+                # behind assertion.
+                splits = token.value.split(/(?<=\n)/)
+                index = 0
+
+                while index < splits.length
+                  line = splits[index]
+                  lineno = token[0][0] + index
+                  column = token[0][1]
+
+                  # Blank lines do not count toward common leading whitespace
+                  # calculation and do not need to be dedented.
+                  if dedent_next || index > 0
+                    column = 0
+                  end
+
+                  # If the dedent is 0 and we're not supposed to dedent the next
+                  # line or this line doesn't start with whitespace, then we
+                  # should concatenate the rest of the string to match ripper.
+                  if dedent == 0 && (!dedent_next || !line.start_with?(/\s/))
+                    line = splits[index..].join
+                    index = splits.length
+                  end
+
+                  # If we are supposed to dedent this line or if this is not the
+                  # first line of the string and this line isn't entirely blank,
+                  # then we need to insert an on_ignored_sp token and remove the
+                  # dedent from the beginning of the line.
+                  if (dedent > 0) && (dedent_next || index > 0)
+                    deleting = 0
+                    deleted_chars = []
+
+                    # Gather up all of the characters that we're going to
+                    # delete, stopping when you hit a character that would put
+                    # you over the dedent amount.
+                    line.each_char.with_index do |char, i|
+                      case char
+                      when "\r"
+                        if line.chars[i + 1] == "\n"
+                          break
+                        end
+                      when "\n"
+                        break
+                      when "\t"
+                        deleting = deleting - (deleting % TAB_WIDTH) + TAB_WIDTH
+                      else
+                        deleting += 1
+                      end
+
+                      break if deleting > dedent
+                      deleted_chars << char
+                    end
+
+                    # If we have something to delete, then delete it from the
+                    # string and insert an on_ignored_sp token.
+                    if deleted_chars.any?
+                      ignored = deleted_chars.join
+                      line.delete_prefix!(ignored)
+
+                      results << Token.new([[lineno, 0], :on_ignored_sp, ignored, token[3]])
+                      column = ignored.length
+                    end
+                  end
+
+                  results << Token.new([[lineno, column], token[1], line, token[3]]) unless line.empty?
+                  index += 1
+                end
+              else
+                results << token
+              end
+            else
+              results << token
+            end
+
+            dedent_next =
+              ((token.event == :on_tstring_content) || (token.event == :on_heredoc_end)) &&
+              embexpr_balance == 0
+          end
+
+          results
+        end
+      end
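
The dedent that DedentingHeredoc tracks is the smallest visual width of the leading whitespace across the heredoc's lines, with tabs advancing to the next multiple of TAB_WIDTH. A standalone sketch of that width calculation (the width helper is hypothetical, not part of this file):

    TAB_WIDTH = 8

    def width(leading)
      leading.each_char.inject(0) do |dedent, char|
        char == "\t" ? dedent - (dedent % TAB_WIDTH) + TAB_WIDTH : dedent + 1
      end
    end

    width("    ")                        # => 4
    width("  \t")                        # => 8 (the tab jumps to the next multiple of 8)
    [width("    "), width("  \t")].min   # => 4, the dedent applied to the heredoc body
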
+
+      # Here we will split between the two types of heredocs and return the
+      # object that will store their tokens.
+      def self.build(opening)
+        case opening.value[2]
+        when "~"
+          DedentingHeredoc.new
+        when "-"
+          DashHeredoc.new(opening.value[3] != "'")
+        else
+          PlainHeredoc.new
+        end
+      end
+    end
+
+    attr_reader :source, :filepath
+
+    def initialize(source, filepath = "")
+      @source = source
+      @filepath = filepath || ""
+    end
+
+    def result
+      tokens = []
+
+      state = :default
+      heredoc_stack = [[]]
+
+      result = YARP.lex(source, @filepath)
+      result_value = result.value
+      previous_state = nil
+
+      # If there's a UTF-8 byte-order mark at the start of the file, then ripper
+      # sets every token's column on the first line back by 6 bytes. It also
+      # keeps the byte order mark in the first token's value. This is weird, and
+      # I don't want to mirror that in our parser. So instead, we'll match up
+      # the values here, and then match up the locations as we process the tokens.
+      bom = source.bytes[0..2] == [0xEF, 0xBB, 0xBF]
+      result_value[0][0].value.prepend("\xEF\xBB\xBF") if bom
+
+      result_value.each_with_index do |(token, lex_state), index|
+        lineno = token.location.start_line
+        column = token.location.start_column
+        column -= index == 0 ? 6 : 3 if bom && lineno == 1
+
+        event = RIPPER.fetch(token.type)
+        value = token.value
+        lex_state = Ripper::Lexer::State.new(lex_state)
+
+        token =
+          case event
+          when :on___end__
+            EndContentToken.new([[lineno, column], event, value, lex_state])
+          when :on_comment
+            CommentToken.new([[lineno, column], event, value, lex_state])
+          when :on_heredoc_end
+            # Heredoc end tokens can be emitted in an odd order, so we don't
+            # want to bother comparing the state on them.
+            HeredocEndToken.new([[lineno, column], event, value, lex_state])
+          when :on_embexpr_end, :on_ident
+            if lex_state == Ripper::EXPR_END | Ripper::EXPR_LABEL
+              # In the event that we're comparing identifiers, we're going to
+              # allow a little divergence. Ripper doesn't account for local
+              # variables introduced through named captures in regexes, and we
+              # do, which accounts for this difference.
+              IdentToken.new([[lineno, column], event, value, lex_state])
+            else
+              Token.new([[lineno, column], event, value, lex_state])
+            end
+          when :on_ignored_nl
+            # Ignored newlines can occasionally have a LABEL state attached to
+            # them which doesn't actually impact anything. We don't mirror that
+            # state so we ignore it.
+            IgnoredNewlineToken.new([[lineno, column], event, value, lex_state])
+          when :on_regexp_end
+            # On regex end, Ripper scans and then sets end state, so the ripper
+            # lexed output is begin, when it should be end. YARP sets lex state
+            # correctly to end state, but we want to be able to compare against
+            # Ripper's lexed state. So here, if it's a regexp end token, we
+            # output the state as the previous state, solely for the sake of
+            # comparison.
+            previous_token = result_value[index - 1][0]
+            lex_state =
+              if RIPPER.fetch(previous_token.type) == :on_embexpr_end
+                # If the previous token is embexpr_end, then we have to do even
+                # more processing. The end of an embedded expression sets the
+                # state to the state that it had at the beginning of the
+                # embedded expression. So we have to go and find that state and
+                # set it here.
+                counter = 1
+                current_index = index - 1
+
+                until counter == 0
+                  current_index -= 1
+                  current_event = RIPPER.fetch(result_value[current_index][0].type)
+                  counter += { on_embexpr_beg: -1, on_embexpr_end: 1 }[current_event] || 0
+                end
+
+                Ripper::Lexer::State.new(result_value[current_index][1])
+              else
+                previous_state
+              end
+
+            Token.new([[lineno, column], event, value, lex_state])
+          else
+            Token.new([[lineno, column], event, value, lex_state])
+          end
+
+        previous_state = lex_state
+
+        # The order in which tokens appear in our lexer is different from the
+        # order that they appear in Ripper. When we hit the declaration of a
+        # heredoc in YARP, we skip forward and lex the rest of the content of
+        # the heredoc before going back and lexing at the end of the heredoc
+        # identifier.
+        #
+        # To match up to ripper, we keep a small state variable around here to
+        # track whether we're in the middle of a heredoc or not. In this way we
+        # can shuffle around the token to match Ripper's output.
+        case state
+        when :default
+          tokens << token
+
+          if event == :on_heredoc_beg
+            state = :heredoc_opened
+            heredoc_stack.last << Heredoc.build(token)
+          end
+        when :heredoc_opened
+          heredoc_stack.last.last << token
+
+          case event
+          when :on_heredoc_beg
+            heredoc_stack << [Heredoc.build(token)]
+          when :on_heredoc_end
+            state = :heredoc_closed
+          end
+        when :heredoc_closed
+          if %i[on_nl on_ignored_nl on_comment].include?(event) || (event == :on_tstring_content && value.end_with?("\n"))
+            if heredoc_stack.size > 1
+              flushing = heredoc_stack.pop
+              heredoc_stack.last.last << token
+
+              flushing.each do |heredoc|
+                heredoc.to_a.each do |flushed_token|
+                  heredoc_stack.last.last << flushed_token
+                end
+              end
+
+              state = :heredoc_opened
+              next
+            end
+          elsif event == :on_heredoc_beg
+            tokens << token
+            state = :heredoc_opened
+            heredoc_stack.last << Heredoc.build(token)
+            next
+          elsif heredoc_stack.size > 1
+            heredoc_stack[-2].last << token
+            next
+          end
+
+          heredoc_stack.last.each do |heredoc|
+            tokens.concat(heredoc.to_a)
+          end
+
+          heredoc_stack.last.clear
+          state = :default
+
+          tokens << token
+        end
+      end
+
+      tokens.reject! { |t| t.event == :on_eof }
+
+      # We sort by location to compare against Ripper's output.
+      tokens.sort_by!(&:location)
+
+      if result_value.size - 1 > tokens.size
+        raise StandardError, "Lost tokens when performing lex_compat"
+      end
+
+      ParseResult.new(tokens, result.comments, result.errors, result.warnings, [])
+    end
+  end
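
The reordering performed in #result exists because YARP and Ripper emit heredoc bodies at different points in the token stream, as the comment inside the method explains. One way to observe the raw difference, assuming the yarp gem and Ripper are available (exact token names vary by version):

    require "yarp"
    require "ripper"

    source = <<~RUBY
      foo = <<~HEREDOC
        bar
      HEREDOC
    RUBY

    pp YARP.lex(source, "").value.map { |token, _| token.type }
    pp Ripper.lex(source).map { |_, event, _, _| event }
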
+
+  # The constant that wraps the behavior of the lexer to match Ripper's output
+  # is an implementation detail, so we don't want it to be public.
+  private_constant :LexCompat
+
+  # Returns an array of tokens that closely resembles that of the Ripper lexer.
+  # The only difference is that since we don't keep track of lexer state in the
+  # same way, it's going to always return the NONE state.
+  def self.lex_compat(source, filepath = "")
+    LexCompat.new(source, filepath).result
+  end
+
+  # This lexes with the Ripper lex. It drops any space events but otherwise
+  # returns the same tokens. Raises SyntaxError if the syntax in source is
+  # invalid.
+  def self.lex_ripper(source)
+    previous = []
+    results = []
+
+    Ripper.lex(source, raise_errors: true).each do |token|
+      case token[1]
+      when :on_sp
+        # skip
+      when :on_tstring_content
+        if previous[1] == :on_tstring_content &&
+            (token[2].start_with?("\#$") || token[2].start_with?("\#@"))
+          previous[2] << token[2]
+        else
+          results << token
+          previous = token
+        end
+      when :on_words_sep
+        if previous[1] == :on_words_sep
+          previous[2] << token[2]
+        else
+          results << token
+          previous = token
+        end
+      else
+        results << token
+        previous = token
+      end
+    end
+
+    results
+  end
+end
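
Taken together, YARP.lex_compat and YARP.lex_ripper are the two halves of the comparison this file supports: the former massages YARP's tokens into Ripper's shape, the latter normalizes Ripper's own output. A minimal sketch of how they might be lined up (it assumes the yarp gem is installed, uses a placeholder path, and reads the token list from ParseResult#value as constructed above):

    require "yarp"
    require "ripper"

    source = File.read("example.rb") # placeholder: any Ruby file to check

    yarp_tokens   = YARP.lex_compat(source).value # Ripper-shaped [[line, col], event, value, state] tuples
    ripper_tokens = YARP.lex_ripper(source)       # Ripper.lex output with :on_sp events dropped

    yarp_tokens.zip(ripper_tokens).each do |yarp_token, ripper_token|
      warn "mismatch: #{yarp_token.inspect} != #{ripper_token.inspect}" unless yarp_token == ripper_token
    end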