ruby_parser 3.17.0 → 3.18.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,638 @@
1
+ # frozen_string_literal: true
2
+
3
+ class RubyLexer
4
+ def eat_whitespace
5
+ r = scan(/\s+/)
6
+ self.lineno += r.count("\n") if r
7
+
8
+ r += eat_whitespace if eos? && in_heredoc? # forces heredoc pop
9
+
10
+ r
11
+ end
12
+
13
+ def heredoc here # ../compare/parse30.y:7678
14
+ _, term, func, _indent_max, _lineno, range = here
15
+
16
+ start_line = lineno
17
+ eos = term # HACK
18
+ indent = func =~ STR_FUNC_INDENT
19
+
20
+ self.string_buffer = []
21
+
22
+ last_line = self.ss_string[range] if range
23
+ eol = last_line && last_line.end_with?("\r\n") ? "\r\n" : "\n" # HACK
24
+
25
+ expand = func =~ STR_FUNC_EXPAND
26
+
27
+ # TODO? p->heredoc_line_indent == -1
28
+
29
+ indent_re = indent ? "[ \t]*" : nil
30
+ eos_re = /#{indent_re}#{Regexp.escape eos}(?=\r?\n|\z)/
31
+ err_msg = "can't match #{eos_re.inspect} anywhere in "
32
+
33
+ maybe_pop_stack
34
+ rb_compile_error err_msg if end_of_stream?
35
+
36
+ if beginning_of_line? && scan(eos_re) then
37
+ scan(/\r?\n|\z/)
38
+ self.lineno += 1 if matched =~ /\n/
39
+
40
+ heredoc_restore
41
+
42
+ self.lex_strterm = nil
43
+ self.lex_state = EXPR_END
44
+
45
+ return :tSTRING_END, [term, func, range]
46
+ end
47
+
48
+ if expand then
49
+ case
50
+ when scan(/#(?=\$(-.|[a-zA-Z_0-9~\*\$\?!@\/\\;,\.=:<>\"\&\`\'+]))/) then
51
+ # TODO: !ISASCII
52
+ # ?! see parser_peek_variable_name
53
+ return :tSTRING_DVAR, matched
54
+ when scan(/#(?=\@\@?[a-zA-Z_])/) then
55
+ # TODO: !ISASCII
56
+ return :tSTRING_DVAR, matched
57
+ when scan(/#[{]/) then
58
+ self.command_start = true
59
+ return :tSTRING_DBEG, matched
60
+ when scan(/#/) then
61
+ string_buffer << "#"
62
+ end
63
+
64
+ begin
65
+ # NOTE: this visibly diverges from the C code but uses tokadd_string
66
+ # to stay clean.
67
+
68
+ str = tokadd_string func, eol, nil
69
+ rb_compile_error err_msg if str == RubyLexer::EOF
70
+
71
+ if str != eol then
72
+ str = string_buffer.join
73
+ string_buffer.clear
74
+ return result nil, :tSTRING_CONTENT, str, start_line
75
+ else
76
+ string_buffer << scan(/\r?\n/)
77
+ self.lineno += 1 # TODO: try to remove most scan(/\n/) and friends
78
+ end
79
+ end until check eos_re
80
+ else
81
+ until check(eos_re) do
82
+ string_buffer << scan(/.*(\r?\n|\z)/)
83
+ self.lineno += 1
84
+ rb_compile_error err_msg if end_of_stream?
85
+ end
86
+ end
87
+
88
+ string_content = begin
89
+ s = string_buffer.join
90
+ s.b.force_encoding Encoding::UTF_8
91
+ s
92
+ end
93
+ string_buffer.clear
94
+
95
+ result nil, :tSTRING_CONTENT, string_content, start_line
96
+ end
97
+
98
+ def heredoc_identifier # ../compare/parse30.y:7354
99
+ token = :tSTRING_BEG
100
+ func = STR_FUNC_BORING
101
+ term = nil
102
+ indent = nil
103
+ quote = nil
104
+ char_pos = nil
105
+ byte_pos = nil
106
+
107
+ heredoc_indent_mods = "-"
108
+ heredoc_indent_mods += '\~' if ruby23plus?
109
+
110
+ case
111
+ when scan(/([#{heredoc_indent_mods}]?)([\'\"\`])(.*?)\2/) then
112
+ mods, quote, term = match[1], match[2], match[3]
113
+ char_pos = ss.charpos
114
+ byte_pos = ss.pos
115
+
116
+ func |= STR_FUNC_INDENT unless mods.empty?
117
+ func |= STR_FUNC_DEDENT if mods == "~"
118
+ func |= case quote
119
+ when "\'" then
120
+ STR_SQUOTE
121
+ when '"' then
122
+ STR_DQUOTE
123
+ when "`" then
124
+ token = :tXSTRING_BEG
125
+ STR_XQUOTE
126
+ else
127
+ debug 1
128
+ end
129
+ when scan(/[#{heredoc_indent_mods}]?([\'\"\`])(?!\1*\Z)/) then
130
+ rb_compile_error "unterminated here document identifier"
131
+ when scan(/([#{heredoc_indent_mods}]?)(#{IDENT_CHAR}+)/) then
132
+ mods, term = match[1], match[2]
133
+ quote = '"'
134
+ char_pos = ss.charpos
135
+ byte_pos = ss.pos
136
+
137
+ func |= STR_FUNC_INDENT unless mods.empty?
138
+ func |= STR_FUNC_DEDENT if mods == "~"
139
+ func |= STR_DQUOTE
140
+ else
141
+ return
142
+ end
143
+
144
+ old_lineno = self.lineno
145
+ rest_of_line = scan(/.*(?:\r?\n|\z)/)
146
+ self.lineno += rest_of_line.count "\n"
147
+
148
+ char_pos_end = ss.charpos - 1
149
+
150
+ range = nil
151
+ range = char_pos..char_pos_end unless rest_of_line.empty?
152
+
153
+ self.lex_strterm = [:heredoc, term, func, indent, old_lineno, range, byte_pos]
154
+
155
+ result nil, token, quote, old_lineno
156
+ end
157
+
158
+ def heredoc_restore # ../compare/parse30.y:7438
159
+ _, _term, _func, _indent, lineno, range, bytepos = lex_strterm
160
+
161
+ new_ss = ss.class.new self.ss_string[0..range.max]
162
+ new_ss.pos = bytepos
163
+
164
+ lineno_push lineno
165
+ ss_push new_ss
166
+
167
+ nil
168
+ end
169
+
170
+ def newtok
171
+ string_buffer.clear
172
+ end
173
+
174
+ def nextc
175
+ # TODO:
176
+ # if (UNLIKELY((p->lex.pcur == p->lex.pend) || p->eofp || RTEST(p->lex.nextline))) {
177
+ # if (nextline(p)) return -1;
178
+ # }
179
+
180
+ maybe_pop_stack
181
+
182
+ c = ss.getch
183
+
184
+ if c == "\n" then
185
+ ss.unscan
186
+ c = nil
187
+ end
188
+
189
+ c
190
+ end
191
+
192
+ def parse_string quote # ../compare/parse30.y:7273
193
+ _, func, term, paren = quote
194
+
195
+ qwords = func =~ STR_FUNC_QWORDS
196
+ regexp = func =~ STR_FUNC_REGEXP
197
+ expand = func =~ STR_FUNC_EXPAND
198
+ list = func =~ STR_FUNC_LIST
199
+ termx = func =~ STR_FUNC_TERM # TODO: document wtf this means
200
+
201
+ space = false
202
+ term_re = regexp_cache[term]
203
+
204
+ if termx then
205
+ # self.nextc if qwords # delayed term
206
+
207
+ self.lex_strterm = nil
208
+
209
+ return result EXPR_END, regexp ? :tREGEXP_END : :tSTRING_END, term
210
+ end
211
+
212
+ space = true if qwords and eat_whitespace
213
+
214
+ if list then
215
+ debug 4
216
+ # quote[1] -= STR_FUNC_LIST
217
+ # space = true
218
+ end
219
+
220
+ # TODO: move to quote.nest!
221
+ if string_nest == 0 && scan(term_re) then
222
+ if qwords then
223
+ quote[1] |= STR_FUNC_TERM
224
+
225
+ return :tSPACE, matched
226
+ end
227
+
228
+ return string_term func
229
+ end
230
+
231
+ return result nil, :tSPACE, " " if space
232
+
233
+ newtok
234
+
235
+ if expand && check(/#/) then
236
+ t = self.scan_variable_name
237
+ return t if t
238
+
239
+ tokadd "#"
240
+ end
241
+
242
+ # TODO: add string_nest, enc, base_enc ?
243
+ lineno = self.lineno
244
+ if tokadd_string(func, term, paren) == RubyLexer::EOF then
245
+ if qwords then
246
+ rb_compile_error "unterminated list meets end of file"
247
+ end
248
+
249
+ if regexp then
250
+ rb_compile_error "unterminated regexp meets end of file"
251
+ else
252
+ rb_compile_error "unterminated string meets end of file"
253
+ end
254
+ end
255
+
256
+ result nil, :tSTRING_CONTENT, string_buffer.join, lineno
257
+ end
258
+
259
+ # called from process_percent
260
+ def process_percent_quote # ../compare/parse30.y:8645
261
+ c = getch # type %<type><term>...<term>
262
+
263
+ long_hand = !!(c =~ /[QqWwIixrs]/)
264
+
265
+ if end_of_stream? || c !~ /\p{Alnum}/ then
266
+ term = c # TODO? PERCENT_END[c] || c
267
+
268
+ debug 2 if c && c !~ /\p{ASCII}/
269
+ c = "Q"
270
+ else
271
+ term = getch
272
+
273
+ debug 3 if term =~ /\p{Alnum}|\P{ASCII}/
274
+ end
275
+
276
+ if end_of_stream? or c == RubyLexer::EOF or term == RubyLexer::EOF then
277
+ rb_compile_error "unterminated quoted string meets end of file"
278
+ end
279
+
280
+ # "\0" is special to indicate beg=nnd and that no nesting?
281
+ paren = term
282
+ term = PERCENT_END[term]
283
+ term, paren = paren, "\0" if term.nil? # TODO: "\0" -> nil
284
+
285
+ text = long_hand ? "%#{c}#{paren}" : "%#{term}"
286
+
287
+ current_line = self.lineno
288
+
289
+ token_type, string_type =
290
+ case c
291
+ when "Q" then
292
+ [:tSTRING_BEG, STR_DQUOTE]
293
+ when "q" then
294
+ [:tSTRING_BEG, STR_SQUOTE]
295
+ when "W" then
296
+ eat_whitespace
297
+ [:tWORDS_BEG, STR_DQUOTE | STR_FUNC_QWORDS]
298
+ when "w" then
299
+ eat_whitespace
300
+ [:tQWORDS_BEG, STR_SQUOTE | STR_FUNC_QWORDS]
301
+ when "I" then
302
+ eat_whitespace
303
+ [:tSYMBOLS_BEG, STR_DQUOTE | STR_FUNC_QWORDS]
304
+ when "i" then
305
+ eat_whitespace
306
+ [:tQSYMBOLS_BEG, STR_SQUOTE | STR_FUNC_QWORDS]
307
+ when "x" then
308
+ [:tXSTRING_BEG, STR_XQUOTE]
309
+ when "r" then
310
+ [:tREGEXP_BEG, STR_REGEXP]
311
+ when "s" then
312
+ self.lex_state = EXPR_FNAME
313
+ [:tSYMBEG, STR_SSYM]
314
+ else
315
+ rb_compile_error "unknown type of %string. Expected [QqWwIixrs], found '#{c}'."
316
+ end
317
+
318
+ string string_type, term, paren
319
+
320
+ result nil, token_type, text, current_line
321
+ end
322
+
323
+ def process_string_or_heredoc # ../compare/parse30.y:9075
324
+ if lex_strterm[0] == :heredoc then
325
+ self.heredoc lex_strterm
326
+ else
327
+ self.parse_string lex_strterm
328
+ end
329
+ end
330
+
331
+ def read_escape flags = nil # ../compare/parse30.y:6712
332
+ case
333
+ when scan(/\\/) then # Backslash
334
+ '\\'
335
+ when scan(/n/) then # newline
336
+ "\n"
337
+ when scan(/t/) then # horizontal tab
338
+ "\t"
339
+ when scan(/r/) then # carriage-return
340
+ "\r"
341
+ when scan(/f/) then # form-feed
342
+ "\f"
343
+ when scan(/v/) then # vertical tab
344
+ "\13"
345
+ when scan(/a/) then # alarm(bell)
346
+ "\007"
347
+ when scan(/e/) then # escape
348
+ "\033"
349
+ when scan(/[0-7]{1,3}/) then # octal constant
350
+ (matched.to_i(8) & 0xFF).chr.force_encoding Encoding::UTF_8
351
+ when scan(/x([0-9a-fA-F]{1,2})/) then # hex constant
352
+ # TODO: force encode everything to UTF-8?
353
+ match[1].to_i(16).chr.force_encoding Encoding::UTF_8
354
+ when scan(/b/) then # backspace
355
+ "\010"
356
+ when scan(/s/) then # space
357
+ " "
358
+ when check(/M-\\u/) then
359
+ debug 5
360
+ when scan(/M-\\(?=.)/) then
361
+ c = read_escape
362
+ c[0] = (c[0].ord | 0x80).chr
363
+ c
364
+ when scan(/M-(\p{ASCII})/) then
365
+ # TODO: ISCNTRL(c) -> goto eof
366
+ c = match[1]
367
+ c[0] = (c[0].ord | 0x80).chr
368
+ c
369
+ when check(/(C-|c)\\u/) then
370
+ debug 6
371
+ when scan(/(C-|c)\\?\?/) then
372
+ 127.chr
373
+ when scan(/(C-|c)\\/) then
374
+ c = read_escape
375
+ c[0] = (c[0].ord & 0x9f).chr
376
+ c
377
+ when scan(/(?:C-|c)(.)/) then
378
+ c = match[1]
379
+ c[0] = (c[0].ord & 0x9f).chr
380
+ c
381
+ when scan(/^[89]/i) then # bad octal or hex... MRI ignores them :(
382
+ matched
383
+ when scan(/u(\h{4})/) then
384
+ [match[1].to_i(16)].pack("U")
385
+ when scan(/u(\h{1,3})/) then
386
+ debug 7
387
+ rb_compile_error "Invalid escape character syntax"
388
+ when scan(/u\{(\h+(?: +\h+)*)\}/) then
389
+ match[1].split.map { |s| s.to_i(16) }.pack("U*")
390
+ when scan(/[McCx0-9]/) || end_of_stream? then
391
+ rb_compile_error("Invalid escape character syntax")
392
+ else
393
+ getch
394
+ end.dup
395
+ end
396
+
397
+ def regx_options # ../compare/parse30.y:6914
398
+ newtok
399
+
400
+ options = scan(/\p{Alpha}+/) || ""
401
+
402
+ rb_compile_error("unknown regexp options: %s" % [options]) if
403
+ options =~ /[^ixmonesu]/
404
+
405
+ options
406
+ end
407
+
408
+ def scan_variable_name # ../compare/parse30.y:7208
409
+ case
410
+ when scan(/#(?=\$(-.|[a-zA-Z_0-9~\*\$\?!@\/\\;,\.=:<>\"\&\`\'+]))/) then
411
+ # TODO: !ISASCII
412
+ return :tSTRING_DVAR, matched
413
+ when scan(/#(?=\@\@?[a-zA-Z_])/) then
414
+ # TODO: !ISASCII
415
+ return :tSTRING_DVAR, matched
416
+ when scan(/#[{]/) then
417
+ self.command_start = true
418
+ return :tSTRING_DBEG, matched
419
+ when scan(/#/) then
420
+ # do nothing but swallow
421
+ end
422
+
423
+ # if scan(/\P{ASCII}|_|\p{Alpha}/) then # TODO: fold into above DVAR cases
424
+ # # if (!ISASCII(c) || c == '_' || ISALPHA(c))
425
+ # # return tSTRING_DVAR;
426
+ # end
427
+
428
+ nil
429
+ end
430
+
431
+ def string type, beg, nnd = nil
432
+ # label = (IS_LABEL_POSSIBLE() ? str_label : 0);
433
+ # p->lex.strterm = NEW_STRTERM(str_dquote | label, '"', 0);
434
+ # p->lex.ptok = p->lex.pcur-1;
435
+
436
+ type |= STR_FUNC_LABEL if is_label_possible?
437
+ self.lex_strterm = [:strterm, type, beg, nnd || "\0"]
438
+ end
439
+
440
+ def string_term func # ../compare/parse30.y:7254
441
+ self.lex_strterm = nil
442
+
443
+ return result EXPR_END, :tREGEXP_END, self.regx_options if
444
+ func =~ STR_FUNC_REGEXP
445
+
446
+ if func =~ STR_FUNC_LABEL && is_label_suffix? then
447
+ self.getch
448
+ self.lex_state = EXPR_BEG|EXPR_LABEL
449
+
450
+ return :tLABEL_END, string_buffer.join
451
+ end
452
+
453
+ self.lex_state = EXPR_END
454
+
455
+ return :tSTRING_END, [self.matched, func]
456
+ end
457
+
458
+ def tokadd c # ../compare/parse30.y:6548
459
+ string_buffer << c
460
+ end
461
+
462
+ def tokadd_escape # ../compare/parse30.y:6840
463
+ case
464
+ when scan(/\\\n/) then
465
+ # just ignore
466
+ when scan(/\\([0-7]{1,3}|x[0-9a-fA-F]{1,2})/) then
467
+ tokadd matched
468
+ when scan(/\\([MC]-|c)(?=\\)/) then
469
+ tokadd matched
470
+ self.tokadd_escape
471
+ when scan(/\\([MC]-|c)(.)/) then
472
+ tokadd matched
473
+
474
+ self.tokadd_escape if check(/\\/) # recurse if continued!
475
+ when scan(/\\[McCx]/) then # all unprocessed branches from above have failed
476
+ rb_compile_error "Invalid escape character syntax"
477
+ when scan(/\\(.)/m) then
478
+ chr, = match[1]
479
+
480
+ tokadd "\\"
481
+ tokadd chr
482
+ else
483
+ rb_compile_error "Invalid escape character syntax: %p" % [self.rest.lines.first]
484
+ end
485
+ end
486
+
487
+   def tokadd_string func, term, paren # ../compare/parse30.y:7020
      # Accumulates literal content into the token buffer (via tokadd) until
      # one of: the terminator at nesting depth 0, the start of an
      # interpolation (expanding literals), unescaped whitespace (word lists),
      # or end of stream.
      #
      # func::  string flags, tested against the STR_FUNC_* constants
      # term::  terminator; "\n" also accepts "\r\n"
      # paren:: opening delimiter that increases string_nest; no nesting
      #         pattern is built when it is "\0"
      #
      # Returns the scanner's last +matched+ text, or RubyLexer::EOF when there
      # is none and the stream is exhausted (nil otherwise). Callers compare
      # the result against EOF and, in +heredoc+, against the line ending.
488
+     qwords = func =~ STR_FUNC_QWORDS
489
+     escape = func =~ STR_FUNC_ESCAPE
490
+     expand = func =~ STR_FUNC_EXPAND
491
+     regexp = func =~ STR_FUNC_REGEXP
492
+
493
+     paren_re = regexp_cache[paren] if paren != "\0"
494
+     term_re = if term == "\n"
495
+                 /\r?\n/
496
+               else
497
+                 regexp_cache[term]
498
+               end
499
+
500
+     until end_of_stream? do
          # NOTE: branches that finish with `next` have already added their text;
          # every other branch falls through to `tokadd self.matched` at the
          # bottom of the loop, so the last scan in the branch decides what is
          # appended.
501
+       case
502
+       when paren_re && scan(paren_re) then
503
+         self.string_nest += 1
504
+       when scan(term_re) then
505
+         if self.string_nest == 0 then
              # back up one position so the caller can scan the terminator itself
506
+           self.pos -= 1 # TODO: ss.unscan 665 errors #$ HACK: why do we depend on this so hard?
507
+           break # leave eos loop, go parse term in caller (heredoc or parse_string)
508
+         else
              # a nested closer is ordinary content
509
+           self.lineno += matched.count("\n")
510
+           self.string_nest -= 1
511
+         end
512
+
513
+       when expand && check(/#[\$\@\{]/) then
514
+         # do nothing since we used `check`
515
+         break # leave eos loop
516
+       when check(/\\/) then
            # a backslash is next: decide how the escape is copied or decoded
517
+         case
518
+         when scan(/\\\n/) then
519
+           self.lineno += 1
520
+           case
521
+           when qwords then
522
+             tokadd "\n"
523
+             next
524
+           when expand then
525
+             next if func !~ STR_FUNC_INDENT
526
+
527
+             if term == "\n" then
528
+               unscan # rollback
529
+               scan(/\\/) # and split
530
+               scan(/\n/) # this is `matched`
531
+               break
532
+             end
533
+
534
+             tokadd "\\"
535
+             debug 9
536
+           else
537
+             unscan # rollback
538
+             scan(/\\/) # this is `matched`
539
+           end
540
+         when check(/\\\\/) then
541
+           tokadd '\\' if escape
542
+           nextc # ignore 1st \\
543
+           nextc # for tokadd ss.matched, below
544
+         when scan(/\\u/) then
545
+           unless expand then
546
+             tokadd "\\"
547
+             next
548
+           end
549
+
550
+           tokadd_utf8 term, func, regexp
551
+
552
+           next
553
+         else
554
+           scan(/\\/) # eat it, we know it's there
555
+
556
+           return RubyLexer::EOF if end_of_stream?
557
+
558
+           if scan(/\P{ASCII}/) then
559
+             tokadd "\\" unless expand
560
+             tokadd self.matched
561
+             next
562
+           end
563
+
564
+           case
565
+           when regexp then
566
+             if term !~ SIMPLE_RE_META && scan(term_re) then
567
+               tokadd matched
568
+               next
569
+             end
570
+
                # un-eat the backslash so tokadd_escape sees the whole escape
571
+             self.pos -= 1 # TODO: ss.unscan 15 errors
572
+             # HACK? decide whether to eat the \\ above
                # NOTE: `&&` binds tighter than `=`, so _esc receives the combined
                # boolean rather than tokadd_escape's own result
573
+             if _esc = tokadd_escape && end_of_stream? then
574
+               debug 10
575
+             end
576
+
577
+             next # C's continue = Ruby's next
578
+           when expand then
579
+             tokadd "\\" if escape
580
+             tokadd read_escape
581
+             next
582
+           when qwords && scan(/\s/) then
583
+             # ignore backslashed spaces in %w
584
+           when !check(term_re) && !(paren_re && check(paren_re)) then
585
+             tokadd "\\"
586
+             next
587
+           else
588
+             getch # slurp it too for matched below
589
+           end
590
+         end # inner case for /\\/
591
+
592
+       when scan(/\P{ASCII}/) then
593
+         # not currently checking encoding stuff -- drops to tokadd below
594
+       when qwords && check(/\s/) then
595
+         break # leave eos loop
596
+       else
            # plain content: take the longest run free of the terminator, the
            # opening delimiter, "#", "\\" and (in word lists) whitespace; fall
            # back to a single character when that run is empty
597
+         t = Regexp.escape term == "\n" ? "\r\n" : term
598
+         x = Regexp.escape paren if paren && paren != "\000"
599
+         q = "\\s" if qwords
600
+         re = /[^#{t}#{x}\#\\#{q}]+/
601
+
602
+         scan re or getch
603
+         self.lineno += matched.count "\n" if matched
604
+       end # big case
605
+
606
+       tokadd self.matched
607
+     end # until end_of_stream?
608
+
609
+     if self.matched then
610
+       self.matched
611
+     elsif end_of_stream? then
612
+       RubyLexer::EOF
613
+     end
614
+   end # tokadd_string
615
+
616
+ def tokadd_utf8 term, func, regexp_literal # ../compare/parse30.y:6646
617
+ tokadd "\\u" if regexp_literal
618
+
619
+ case
620
+ when scan(/\h{4}/) then
621
+ codepoint = [matched.to_i(16)].pack("U")
622
+
623
+ tokadd regexp_literal ? matched : codepoint
624
+ when scan(/\{\s*(\h{1,6}(?:\s+\h{1,6})*)\s*\}/) then
625
+ codepoints = match[1].split.map { |s| s.to_i 16 }.pack("U")
626
+
627
+ if regexp_literal then
628
+ tokadd "{"
629
+ tokadd match[1].split.join(" ")
630
+ tokadd "}"
631
+ else
632
+ tokadd codepoints
633
+ end
634
+ else
635
+ rb_compile_error "unterminated Unicode escape"
636
+ end
637
+ end
638
+ end