@ast-grep/lang-ruby 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/scanner.c ADDED
@@ -0,0 +1,1107 @@
1
+ #include "tree_sitter/alloc.h"
2
+ #include "tree_sitter/array.h"
3
+ #include "tree_sitter/parser.h"
4
+
5
+ #include <string.h>
6
+ #include <wctype.h>
7
+
8
// Kinds of external token this scanner can report.
//
// The enumerators are used both as indices into the `valid_symbols` array and
// as values stored in `lexer->result_symbol`, so their order must not change.
// NOTE(review): the order presumably mirrors the grammar's `externals` list —
// confirm against grammar.js before adding or moving entries.
typedef enum {
    // Statement separators
    LINE_BREAK,
    NO_LINE_BREAK,

    // Delimited literals
    SIMPLE_SYMBOL,
    STRING_START,
    SYMBOL_START,
    SUBSHELL_START,
    REGEX_START,
    STRING_ARRAY_START,
    SYMBOL_ARRAY_START,
    HEREDOC_BODY_START,
    STRING_CONTENT,
    HEREDOC_CONTENT,
    STRING_END,
    HEREDOC_BODY_END,
    HEREDOC_START,

    // Whitespace-sensitive tokens
    FORWARD_SLASH,
    BLOCK_AMPERSAND,
    SPLAT_STAR,
    UNARY_MINUS,
    UNARY_MINUS_NUM,
    BINARY_MINUS,
    BINARY_STAR,
    SINGLETON_CLASS_LEFT_ANGLE_LEFT_ANGLE,
    HASH_KEY_SYMBOL,
    IDENTIFIER_SUFFIX,
    CONSTANT_SUFFIX,
    HASH_SPLAT_STAR_STAR,
    BINARY_STAR_STAR,
    ELEMENT_REFERENCE_BRACKET,
    SHORT_INTERPOLATION,

    // Sentinel: stored in `result_symbol` to mean "no token chosen yet".
    NONE
} TokenType;
46
+
47
+ typedef Array(char) String;
48
+
49
+ typedef struct {
50
+ TokenType type;
51
+ int32_t open_delimiter;
52
+ int32_t close_delimiter;
53
+ int32_t nesting_depth;
54
+ bool allows_interpolation;
55
+ } Literal;
56
+
57
+ typedef struct {
58
+ String word;
59
+ bool end_word_indentation_allowed;
60
+ bool allows_interpolation;
61
+ bool started;
62
+ } Heredoc;
63
+
64
+ typedef struct {
65
+ bool has_leading_whitespace;
66
+ Array(Literal) literal_stack;
67
+ Array(Heredoc) open_heredocs;
68
+ } Scanner;
69
+
70
// Characters that can never be part of an identifier. is_iden_char() treats
// every character that is absent from this table as an identifier character.
const char NON_IDENTIFIER_CHARS[] = {
    '\0', '\n', '\r', '\t', ' ', ':',
    ';', '`', '"', '\'', '@', '$',
    '#', '.', ',', '|', '^', '&',
    '<', '=', '>', '+', '-', '*',
    '/', '\\', '%', '?', '!', '~',
    '(', ')', '[', ']', '{', '}',
};
74
+
75
+ static inline void skip(Scanner *scanner, TSLexer *lexer) {
76
+ scanner->has_leading_whitespace = true;
77
+ lexer->advance(lexer, true);
78
+ }
79
+
80
+ static inline void advance(TSLexer *lexer) { lexer->advance(lexer, false); }
81
+
82
+ static inline void reset(Scanner *scanner) {
83
+ array_delete(&scanner->literal_stack);
84
+ for (uint32_t i = 0; i < scanner->open_heredocs.size; i++) {
85
+ array_delete(&array_get(&scanner->open_heredocs, i)->word);
86
+ }
87
+ array_delete(&scanner->open_heredocs);
88
+ }
89
+
90
+ static inline unsigned serialize(Scanner *scanner, char *buffer) {
91
+ unsigned size = 0;
92
+
93
+ if (scanner->literal_stack.size * 5 + 2 >= TREE_SITTER_SERIALIZATION_BUFFER_SIZE) {
94
+ return 0;
95
+ }
96
+
97
+ buffer[size++] = (char)scanner->literal_stack.size;
98
+ for (uint32_t i = 0; i < scanner->literal_stack.size; i++) {
99
+ Literal *literal = array_get(&scanner->literal_stack, i);
100
+ buffer[size++] = literal->type;
101
+ buffer[size++] = (char)literal->open_delimiter;
102
+ buffer[size++] = (char)literal->close_delimiter;
103
+ buffer[size++] = (char)literal->nesting_depth;
104
+ buffer[size++] = (char)literal->allows_interpolation;
105
+ }
106
+
107
+ buffer[size++] = (char)scanner->open_heredocs.size;
108
+ for (uint32_t i = 0; i < scanner->open_heredocs.size; i++) {
109
+ Heredoc *heredoc = array_get(&scanner->open_heredocs, i);
110
+ if (size + 2 + heredoc->word.size >= TREE_SITTER_SERIALIZATION_BUFFER_SIZE) {
111
+ return 0;
112
+ }
113
+ buffer[size++] = (char)heredoc->end_word_indentation_allowed;
114
+ buffer[size++] = (char)heredoc->allows_interpolation;
115
+ buffer[size++] = (char)heredoc->started;
116
+ buffer[size++] = (char)heredoc->word.size;
117
+ memcpy(&buffer[size], heredoc->word.contents, heredoc->word.size);
118
+ size += heredoc->word.size;
119
+ }
120
+
121
+ return size;
122
+ }
123
+
124
+ static inline void deserialize(Scanner *scanner, const char *buffer, unsigned length) {
125
+ unsigned size = 0;
126
+ scanner->has_leading_whitespace = false;
127
+ reset(scanner);
128
+
129
+ if (length == 0) {
130
+ return;
131
+ }
132
+
133
+ uint8_t literal_depth = buffer[size++];
134
+ for (unsigned j = 0; j < literal_depth; j++) {
135
+ Literal literal = {0};
136
+ literal.type = (TokenType)(buffer[size++]);
137
+ literal.open_delimiter = (unsigned char)buffer[size++];
138
+ literal.close_delimiter = (unsigned char)buffer[size++];
139
+ literal.nesting_depth = (unsigned char)buffer[size++];
140
+ literal.allows_interpolation = buffer[size++];
141
+ array_push(&scanner->literal_stack, literal);
142
+ }
143
+
144
+ uint8_t open_heredoc_count = buffer[size++];
145
+ for (unsigned j = 0; j < open_heredoc_count; j++) {
146
+ Heredoc heredoc = {0};
147
+ heredoc.end_word_indentation_allowed = buffer[size++];
148
+ heredoc.allows_interpolation = buffer[size++];
149
+ heredoc.started = buffer[size++];
150
+
151
+ heredoc.word = (String)array_new();
152
+ uint8_t word_length = buffer[size++];
153
+ array_reserve(&heredoc.word, word_length);
154
+ memcpy(heredoc.word.contents, &buffer[size], word_length);
155
+ heredoc.word.size = word_length;
156
+ size += word_length;
157
+ array_push(&scanner->open_heredocs, heredoc);
158
+ }
159
+
160
+ assert(size == length);
161
+ }
162
+
163
+ static inline bool scan_whitespace(Scanner *scanner, TSLexer *lexer, const bool *valid_symbols) {
164
+ bool heredoc_body_start_is_valid = scanner->open_heredocs.size > 0 && !scanner->open_heredocs.contents[0].started &&
165
+ valid_symbols[HEREDOC_BODY_START];
166
+ bool crossed_newline = false;
167
+
168
+ for (;;) {
169
+ if (!valid_symbols[NO_LINE_BREAK] && valid_symbols[LINE_BREAK] && lexer->is_at_included_range_start(lexer)) {
170
+ lexer->mark_end(lexer);
171
+ lexer->result_symbol = LINE_BREAK;
172
+ return true;
173
+ }
174
+
175
+ switch (lexer->lookahead) {
176
+ case ' ':
177
+ case '\t':
178
+ skip(scanner, lexer);
179
+ break;
180
+ case '\r':
181
+ if (heredoc_body_start_is_valid) {
182
+ lexer->result_symbol = HEREDOC_BODY_START;
183
+ scanner->open_heredocs.contents[0].started = true;
184
+ return true;
185
+ } else {
186
+ skip(scanner, lexer);
187
+ break;
188
+ }
189
+ case '\n':
190
+ if (heredoc_body_start_is_valid) {
191
+ lexer->result_symbol = HEREDOC_BODY_START;
192
+ scanner->open_heredocs.contents[0].started = true;
193
+ return true;
194
+ } else if (!valid_symbols[NO_LINE_BREAK] && valid_symbols[LINE_BREAK] && !crossed_newline) {
195
+ lexer->mark_end(lexer);
196
+ advance(lexer);
197
+ crossed_newline = true;
198
+ } else {
199
+ skip(scanner, lexer);
200
+ }
201
+ break;
202
+ case '\\':
203
+ advance(lexer);
204
+ if (lexer->lookahead == '\r') {
205
+ skip(scanner, lexer);
206
+ }
207
+ if (iswspace(lexer->lookahead)) {
208
+ skip(scanner, lexer);
209
+ } else {
210
+ return false;
211
+ }
212
+ break;
213
+ default:
214
+ if (crossed_newline) {
215
+ if (lexer->lookahead != '.' && lexer->lookahead != '&' && lexer->lookahead != '#') {
216
+ lexer->result_symbol = LINE_BREAK;
217
+ } else if (lexer->lookahead == '.') {
218
+ // Don't return LINE_BREAK for the call operator (`.`) but do return one for range
219
+ // operators
220
+ // (`..` and `...`)
221
+ advance(lexer);
222
+ if (!lexer->eof(lexer) && lexer->lookahead == '.') {
223
+ lexer->result_symbol = LINE_BREAK;
224
+ } else {
225
+ return false;
226
+ }
227
+ }
228
+ }
229
+ return true;
230
+ }
231
+ }
232
+ }
233
+
234
+ static inline bool scan_operator(TSLexer *lexer) {
235
+ switch (lexer->lookahead) {
236
+ // <, <=, <<, <=>
237
+ case '<':
238
+ advance(lexer);
239
+ if (lexer->lookahead == '<') {
240
+ advance(lexer);
241
+ } else if (lexer->lookahead == '=') {
242
+ advance(lexer);
243
+ if (lexer->lookahead == '>') {
244
+ advance(lexer);
245
+ }
246
+ }
247
+ return true;
248
+
249
+ // >, >=, >>
250
+ case '>':
251
+ advance(lexer);
252
+ if (lexer->lookahead == '>' || lexer->lookahead == '=') {
253
+ advance(lexer);
254
+ }
255
+ return true;
256
+
257
+ // ==, ===, =~
258
+ case '=':
259
+ advance(lexer);
260
+ if (lexer->lookahead == '~') {
261
+ advance(lexer);
262
+ return true;
263
+ }
264
+ if (lexer->lookahead == '=') {
265
+ advance(lexer);
266
+ if (lexer->lookahead == '=') {
267
+ advance(lexer);
268
+ }
269
+ return true;
270
+ }
271
+ return false;
272
+
273
+ // +, -, ~, +@, -@, ~@
274
+ case '+':
275
+ case '-':
276
+ case '~':
277
+ advance(lexer);
278
+ if (lexer->lookahead == '@') {
279
+ advance(lexer);
280
+ }
281
+ return true;
282
+
283
+ // ..
284
+ case '.':
285
+ advance(lexer);
286
+ if (lexer->lookahead == '.') {
287
+ advance(lexer);
288
+ return true;
289
+ }
290
+ return false;
291
+
292
+ // &, ^, |, /, %`
293
+ case '&':
294
+ case '^':
295
+ case '|':
296
+ case '/':
297
+ case '%':
298
+ case '`':
299
+ advance(lexer);
300
+ return true;
301
+
302
+ // !, !=, !~
303
+ case '!':
304
+ advance(lexer);
305
+ if (lexer->lookahead == '=' || lexer->lookahead == '~') {
306
+ advance(lexer);
307
+ }
308
+ return true;
309
+
310
+ // *, **
311
+ case '*':
312
+ advance(lexer);
313
+ if (lexer->lookahead == '*') {
314
+ advance(lexer);
315
+ }
316
+ return true;
317
+
318
+ // [], []=
319
+ case '[':
320
+ advance(lexer);
321
+ if (lexer->lookahead == ']') {
322
+ advance(lexer);
323
+ } else {
324
+ return false;
325
+ }
326
+ if (lexer->lookahead == '=') {
327
+ advance(lexer);
328
+ }
329
+ return true;
330
+
331
+ default:
332
+ return false;
333
+ }
334
+ }
335
+
336
+ static inline bool is_iden_char(char c) {
337
+ return memchr(&NON_IDENTIFIER_CHARS, c, sizeof(NON_IDENTIFIER_CHARS)) == NULL;
338
+ }
339
+
340
+ static inline bool scan_symbol_identifier(TSLexer *lexer) {
341
+ if (lexer->lookahead == '@') {
342
+ advance(lexer);
343
+ if (lexer->lookahead == '@') {
344
+ advance(lexer);
345
+ }
346
+ } else if (lexer->lookahead == '$') {
347
+ advance(lexer);
348
+ }
349
+
350
+ if (is_iden_char((char)lexer->lookahead)) {
351
+ advance(lexer);
352
+ } else if (!scan_operator(lexer)) {
353
+ return false;
354
+ }
355
+
356
+ while (is_iden_char((char)lexer->lookahead)) {
357
+ advance(lexer);
358
+ }
359
+
360
+ if (lexer->lookahead == '?' || lexer->lookahead == '!') {
361
+ advance(lexer);
362
+ }
363
+
364
+ if (lexer->lookahead == '=') {
365
+ lexer->mark_end(lexer);
366
+ advance(lexer);
367
+ if (lexer->lookahead != '>') {
368
+ lexer->mark_end(lexer);
369
+ }
370
+ }
371
+
372
+ return true;
373
+ }
374
+
375
+ static inline bool scan_open_delimiter(Scanner *scanner, TSLexer *lexer, Literal *literal, const bool *valid_symbols) {
376
+ switch (lexer->lookahead) {
377
+ case '"':
378
+ literal->type = STRING_START;
379
+ literal->open_delimiter = literal->close_delimiter = lexer->lookahead;
380
+ literal->allows_interpolation = true;
381
+ advance(lexer);
382
+ return true;
383
+
384
+ case '\'':
385
+ literal->type = STRING_START;
386
+ literal->open_delimiter = literal->close_delimiter = lexer->lookahead;
387
+ literal->allows_interpolation = false;
388
+ advance(lexer);
389
+ return true;
390
+
391
+ case '`':
392
+ if (!valid_symbols[SUBSHELL_START]) {
393
+ return false;
394
+ }
395
+ literal->type = SUBSHELL_START;
396
+ literal->open_delimiter = literal->close_delimiter = lexer->lookahead;
397
+ literal->allows_interpolation = true;
398
+ advance(lexer);
399
+ return true;
400
+
401
+ case '/':
402
+ if (!valid_symbols[REGEX_START]) {
403
+ return false;
404
+ }
405
+ literal->type = REGEX_START;
406
+ literal->open_delimiter = literal->close_delimiter = lexer->lookahead;
407
+ literal->allows_interpolation = true;
408
+ advance(lexer);
409
+ if (valid_symbols[FORWARD_SLASH]) {
410
+ if (!scanner->has_leading_whitespace) {
411
+ return false;
412
+ }
413
+ if (lexer->lookahead == ' ' || lexer->lookahead == '\t' || lexer->lookahead == '\n' ||
414
+ lexer->lookahead == '\r') {
415
+ return false;
416
+ }
417
+ if (lexer->lookahead == '=') {
418
+ return false;
419
+ }
420
+ }
421
+ return true;
422
+
423
+ case '%':
424
+ advance(lexer);
425
+
426
+ switch (lexer->lookahead) {
427
+ case 's':
428
+ if (!valid_symbols[SIMPLE_SYMBOL]) {
429
+ return false;
430
+ }
431
+ literal->type = SYMBOL_START;
432
+ literal->allows_interpolation = false;
433
+ advance(lexer);
434
+ break;
435
+
436
+ case 'r':
437
+ if (!valid_symbols[REGEX_START]) {
438
+ return false;
439
+ }
440
+ literal->type = REGEX_START;
441
+ literal->allows_interpolation = true;
442
+ advance(lexer);
443
+ break;
444
+
445
+ case 'x':
446
+ if (!valid_symbols[SUBSHELL_START]) {
447
+ return false;
448
+ }
449
+ literal->type = SUBSHELL_START;
450
+ literal->allows_interpolation = true;
451
+ advance(lexer);
452
+ break;
453
+
454
+ case 'q':
455
+ if (!valid_symbols[STRING_START]) {
456
+ return false;
457
+ }
458
+ literal->type = STRING_START;
459
+ literal->allows_interpolation = false;
460
+ advance(lexer);
461
+ break;
462
+
463
+ case 'Q':
464
+ if (!valid_symbols[STRING_START]) {
465
+ return false;
466
+ }
467
+ literal->type = STRING_START;
468
+ literal->allows_interpolation = true;
469
+ advance(lexer);
470
+ break;
471
+
472
+ case 'w':
473
+ if (!valid_symbols[STRING_ARRAY_START]) {
474
+ return false;
475
+ }
476
+ literal->type = STRING_ARRAY_START;
477
+ literal->allows_interpolation = false;
478
+ advance(lexer);
479
+ break;
480
+
481
+ case 'i':
482
+ if (!valid_symbols[SYMBOL_ARRAY_START]) {
483
+ return false;
484
+ }
485
+ literal->type = SYMBOL_ARRAY_START;
486
+ literal->allows_interpolation = false;
487
+ advance(lexer);
488
+ break;
489
+
490
+ case 'W':
491
+ if (!valid_symbols[STRING_ARRAY_START]) {
492
+ return false;
493
+ }
494
+ literal->type = STRING_ARRAY_START;
495
+ literal->allows_interpolation = true;
496
+ advance(lexer);
497
+ break;
498
+
499
+ case 'I':
500
+ if (!valid_symbols[SYMBOL_ARRAY_START]) {
501
+ return false;
502
+ }
503
+ literal->type = SYMBOL_ARRAY_START;
504
+ literal->allows_interpolation = true;
505
+ advance(lexer);
506
+ break;
507
+
508
+ default:
509
+ if (!valid_symbols[STRING_START]) {
510
+ return false;
511
+ }
512
+ literal->type = STRING_START;
513
+ literal->allows_interpolation = true;
514
+ break;
515
+ }
516
+
517
+ switch (lexer->lookahead) {
518
+ case '(':
519
+ literal->open_delimiter = '(';
520
+ literal->close_delimiter = ')';
521
+ break;
522
+
523
+ case '[':
524
+ literal->open_delimiter = '[';
525
+ literal->close_delimiter = ']';
526
+ break;
527
+
528
+ case '{':
529
+ literal->open_delimiter = '{';
530
+ literal->close_delimiter = '}';
531
+ break;
532
+
533
+ case '<':
534
+ literal->open_delimiter = '<';
535
+ literal->close_delimiter = '>';
536
+ break;
537
+
538
+ case '\r':
539
+ case '\n':
540
+ case ' ':
541
+ case '\t':
542
+ // If the `/` operator is valid, then so is the `%` operator, which means
543
+ // that a `%` followed by whitespace should be considered an operator,
544
+ // not a percent string.
545
+ if (valid_symbols[FORWARD_SLASH]) {
546
+ return false;
547
+ }
548
+ break;
549
+
550
+ case '|':
551
+ case '!':
552
+ case '#':
553
+ case '/':
554
+ case '\\':
555
+ case '@':
556
+ case '$':
557
+ case '%':
558
+ case '^':
559
+ case '&':
560
+ case '*':
561
+ case ')':
562
+ case ']':
563
+ case '}':
564
+ case '>':
565
+ // TODO: Implement %= as external rule and re-enable = as a valid
566
+ // unbalanced delimiter. That will be necessary due to ambiguity
567
+ // between &= assignment operator and %=...= as string
568
+ // content delimiter.
569
+ // case '=':
570
+ case '+':
571
+ case '-':
572
+ case '~':
573
+ case '`':
574
+ case ',':
575
+ case '.':
576
+ case '?':
577
+ case ':':
578
+ case ';':
579
+ case '_':
580
+ case '"':
581
+ case '\'':
582
+ literal->open_delimiter = lexer->lookahead;
583
+ literal->close_delimiter = lexer->lookahead;
584
+ break;
585
+ default:
586
+ return false;
587
+ }
588
+
589
+ advance(lexer);
590
+ return true;
591
+
592
+ default:
593
+ return false;
594
+ }
595
+ }
596
+
597
+ static inline void scan_heredoc_word(TSLexer *lexer, Heredoc *heredoc) {
598
+ String word = array_new();
599
+ int32_t quote = 0;
600
+
601
+ switch (lexer->lookahead) {
602
+ case '\'':
603
+ case '"':
604
+ case '`':
605
+ quote = lexer->lookahead;
606
+ advance(lexer);
607
+ while (lexer->lookahead != quote && !lexer->eof(lexer)) {
608
+ array_push(&word, lexer->lookahead);
609
+ advance(lexer);
610
+ }
611
+ advance(lexer);
612
+ break;
613
+
614
+ default:
615
+ if (iswalnum(lexer->lookahead) || lexer->lookahead == '_') {
616
+ array_push(&word, lexer->lookahead);
617
+ advance(lexer);
618
+ while (iswalnum(lexer->lookahead) || lexer->lookahead == '_') {
619
+ array_push(&word, lexer->lookahead);
620
+ advance(lexer);
621
+ }
622
+ }
623
+ break;
624
+ }
625
+
626
+ heredoc->word = word;
627
+ heredoc->allows_interpolation = quote != '\'';
628
+ }
629
+
630
+ static inline bool scan_short_interpolation(TSLexer *lexer, const bool has_content, const TSSymbol content_symbol) {
631
+ char start = (char)lexer->lookahead;
632
+ if (start == '@' || start == '$') {
633
+ if (has_content) {
634
+ lexer->result_symbol = content_symbol;
635
+ return true;
636
+ }
637
+ lexer->mark_end(lexer);
638
+ advance(lexer);
639
+ bool is_short_interpolation = false;
640
+ if (start == '$') {
641
+ if (strchr("!@&`'+~=/\\,;.<>*$?:\"", lexer->lookahead) != NULL) {
642
+ is_short_interpolation = true;
643
+ } else {
644
+ if (lexer->lookahead == '-') {
645
+ advance(lexer);
646
+ is_short_interpolation = iswalpha(lexer->lookahead) || lexer->lookahead == '_';
647
+ } else {
648
+ is_short_interpolation = iswalnum(lexer->lookahead) || lexer->lookahead == '_';
649
+ }
650
+ }
651
+ }
652
+ if (start == '@') {
653
+ if (lexer->lookahead == '@') {
654
+ advance(lexer);
655
+ }
656
+ is_short_interpolation = is_iden_char((char)lexer->lookahead) && !iswdigit(lexer->lookahead);
657
+ }
658
+
659
+ if (is_short_interpolation) {
660
+ lexer->result_symbol = SHORT_INTERPOLATION;
661
+ return true;
662
+ }
663
+ }
664
+ return false;
665
+ }
666
+
667
+ static inline bool scan_heredoc_content(Scanner *scanner, TSLexer *lexer) {
668
+ Heredoc *heredoc = array_get(&scanner->open_heredocs, 0);
669
+ size_t position_in_word = 0;
670
+ bool look_for_heredoc_end = true;
671
+ bool has_content = false;
672
+
673
+ for (;;) {
674
+ if (position_in_word == heredoc->word.size) {
675
+ if (!has_content) {
676
+ lexer->mark_end(lexer);
677
+ }
678
+ while (lexer->lookahead == ' ' || lexer->lookahead == '\t') {
679
+ advance(lexer);
680
+ }
681
+ if (lexer->lookahead == '\n' || lexer->lookahead == '\r') {
682
+ if (has_content) {
683
+ lexer->result_symbol = HEREDOC_CONTENT;
684
+ } else {
685
+ array_delete(&heredoc->word);
686
+ array_erase(&scanner->open_heredocs, 0);
687
+ lexer->result_symbol = HEREDOC_BODY_END;
688
+ }
689
+ return true;
690
+ }
691
+ has_content = true;
692
+ position_in_word = 0;
693
+ }
694
+
695
+ if (lexer->eof(lexer)) {
696
+ lexer->mark_end(lexer);
697
+ if (has_content) {
698
+ lexer->result_symbol = HEREDOC_CONTENT;
699
+ } else {
700
+ array_delete(&heredoc->word);
701
+ array_erase(&scanner->open_heredocs, 0);
702
+ lexer->result_symbol = HEREDOC_BODY_END;
703
+ }
704
+ return true;
705
+ }
706
+
707
+ if (lexer->lookahead == *array_get(&heredoc->word, position_in_word) && look_for_heredoc_end) {
708
+ advance(lexer);
709
+ position_in_word++;
710
+ } else {
711
+ position_in_word = 0;
712
+ look_for_heredoc_end = false;
713
+
714
+ if (heredoc->allows_interpolation && lexer->lookahead == '\\') {
715
+ if (has_content) {
716
+ lexer->result_symbol = HEREDOC_CONTENT;
717
+ return true;
718
+ }
719
+ return false;
720
+ }
721
+
722
+ if (heredoc->allows_interpolation && lexer->lookahead == '#') {
723
+ lexer->mark_end(lexer);
724
+ advance(lexer);
725
+ if (lexer->lookahead == '{') {
726
+ if (has_content) {
727
+ lexer->result_symbol = HEREDOC_CONTENT;
728
+ return true;
729
+ }
730
+ return false;
731
+ }
732
+ if (scan_short_interpolation(lexer, has_content, HEREDOC_CONTENT)) {
733
+ return true;
734
+ }
735
+ } else if (lexer->lookahead == '\r' || lexer->lookahead == '\n') {
736
+ if (lexer->lookahead == '\r') {
737
+ advance(lexer);
738
+ if (lexer->lookahead == '\n') {
739
+ advance(lexer);
740
+ }
741
+ } else {
742
+ advance(lexer);
743
+ }
744
+ has_content = true;
745
+ look_for_heredoc_end = true;
746
+ while (lexer->lookahead == ' ' || lexer->lookahead == '\t') {
747
+ advance(lexer);
748
+ if (!heredoc->end_word_indentation_allowed) {
749
+ look_for_heredoc_end = false;
750
+ }
751
+ }
752
+ lexer->mark_end(lexer);
753
+ } else {
754
+ has_content = true;
755
+ advance(lexer);
756
+ lexer->mark_end(lexer);
757
+ }
758
+ }
759
+ }
760
+ }
761
+
762
+ static inline bool scan_literal_content(Scanner *scanner, TSLexer *lexer) {
763
+ Literal *literal = array_back(&scanner->literal_stack);
764
+ bool has_content = false;
765
+ bool stop_on_space = literal->type == SYMBOL_ARRAY_START || literal->type == STRING_ARRAY_START;
766
+
767
+ for (;;) {
768
+ if (stop_on_space && iswspace(lexer->lookahead)) {
769
+ if (has_content) {
770
+ lexer->mark_end(lexer);
771
+ lexer->result_symbol = STRING_CONTENT;
772
+ return true;
773
+ }
774
+ return false;
775
+ }
776
+ if (lexer->lookahead == literal->close_delimiter) {
777
+ lexer->mark_end(lexer);
778
+ if (literal->nesting_depth == 1) {
779
+ if (has_content) {
780
+ lexer->result_symbol = STRING_CONTENT;
781
+ } else {
782
+ advance(lexer);
783
+ if (literal->type == REGEX_START) {
784
+ while (iswlower(lexer->lookahead)) {
785
+ advance(lexer);
786
+ }
787
+ }
788
+ array_pop(&scanner->literal_stack);
789
+ lexer->result_symbol = STRING_END;
790
+ lexer->mark_end(lexer);
791
+ }
792
+ return true;
793
+ }
794
+ literal->nesting_depth--;
795
+ advance(lexer);
796
+
797
+ } else if (lexer->lookahead == literal->open_delimiter) {
798
+ literal->nesting_depth++;
799
+ advance(lexer);
800
+ } else if (literal->allows_interpolation && lexer->lookahead == '#') {
801
+ lexer->mark_end(lexer);
802
+ advance(lexer);
803
+ if (lexer->lookahead == '{') {
804
+ if (has_content) {
805
+ lexer->result_symbol = STRING_CONTENT;
806
+ return true;
807
+ }
808
+ return false;
809
+ }
810
+ if (scan_short_interpolation(lexer, has_content, STRING_CONTENT)) {
811
+ return true;
812
+ }
813
+ } else if (lexer->lookahead == '\\') {
814
+ if (literal->allows_interpolation) {
815
+ if (has_content) {
816
+ lexer->mark_end(lexer);
817
+ lexer->result_symbol = STRING_CONTENT;
818
+ return true;
819
+ }
820
+ return false;
821
+ }
822
+ advance(lexer);
823
+ advance(lexer);
824
+
825
+ } else if (lexer->eof(lexer)) {
826
+ advance(lexer);
827
+ lexer->mark_end(lexer);
828
+ return false;
829
+ } else {
830
+ advance(lexer);
831
+ }
832
+
833
+ has_content = true;
834
+ }
835
+ }
836
+
837
+ static inline bool scan(Scanner *scanner, TSLexer *lexer, const bool *valid_symbols) {
838
+ scanner->has_leading_whitespace = false;
839
+
840
+ // Contents of literals, which match any character except for some close delimiter
841
+ if (!valid_symbols[STRING_START]) {
842
+ if ((valid_symbols[STRING_CONTENT] || valid_symbols[STRING_END]) && scanner->literal_stack.size > 0) {
843
+ return scan_literal_content(scanner, lexer);
844
+ }
845
+ if ((valid_symbols[HEREDOC_CONTENT] || valid_symbols[HEREDOC_BODY_END]) && scanner->open_heredocs.size > 0) {
846
+ return scan_heredoc_content(scanner, lexer);
847
+ }
848
+ }
849
+
850
+ // Whitespace
851
+ lexer->result_symbol = NONE;
852
+ if (!scan_whitespace(scanner, lexer, valid_symbols)) {
853
+ return false;
854
+ }
855
+ if (lexer->result_symbol != NONE) {
856
+ return true;
857
+ }
858
+
859
+ switch (lexer->lookahead) {
860
+ case '&':
861
+ if (valid_symbols[BLOCK_AMPERSAND]) {
862
+ advance(lexer);
863
+ if (lexer->lookahead != '&' && lexer->lookahead != '.' && lexer->lookahead != '=' &&
864
+ !iswspace(lexer->lookahead)) {
865
+ lexer->result_symbol = BLOCK_AMPERSAND;
866
+ return true;
867
+ }
868
+ return false;
869
+ }
870
+ break;
871
+
872
+ case '<':
873
+ if (valid_symbols[SINGLETON_CLASS_LEFT_ANGLE_LEFT_ANGLE]) {
874
+ advance(lexer);
875
+ if (lexer->lookahead == '<') {
876
+ advance(lexer);
877
+ lexer->result_symbol = SINGLETON_CLASS_LEFT_ANGLE_LEFT_ANGLE;
878
+ return true;
879
+ }
880
+ return false;
881
+ }
882
+ break;
883
+
884
+ case '*':
885
+ if (valid_symbols[SPLAT_STAR] || valid_symbols[BINARY_STAR] || valid_symbols[HASH_SPLAT_STAR_STAR] ||
886
+ valid_symbols[BINARY_STAR_STAR]) {
887
+ advance(lexer);
888
+ if (lexer->lookahead == '=') {
889
+ return false;
890
+ }
891
+ if (lexer->lookahead == '*') {
892
+ if (valid_symbols[HASH_SPLAT_STAR_STAR] || valid_symbols[BINARY_STAR_STAR]) {
893
+ advance(lexer);
894
+ if (lexer->lookahead == '=') {
895
+ return false;
896
+ }
897
+ if (valid_symbols[BINARY_STAR_STAR] && !scanner->has_leading_whitespace) {
898
+ lexer->result_symbol = BINARY_STAR_STAR;
899
+ return true;
900
+ }
901
+ if (valid_symbols[HASH_SPLAT_STAR_STAR] && !iswspace(lexer->lookahead)) {
902
+ lexer->result_symbol = HASH_SPLAT_STAR_STAR;
903
+ return true;
904
+ }
905
+ if (valid_symbols[BINARY_STAR_STAR]) {
906
+ lexer->result_symbol = BINARY_STAR_STAR;
907
+ return true;
908
+ }
909
+ if (valid_symbols[HASH_SPLAT_STAR_STAR]) {
910
+ lexer->result_symbol = HASH_SPLAT_STAR_STAR;
911
+ return true;
912
+ }
913
+ return false;
914
+ }
915
+ return false;
916
+ }
917
+ if (valid_symbols[BINARY_STAR] && !scanner->has_leading_whitespace) {
918
+ lexer->result_symbol = BINARY_STAR;
919
+ return true;
920
+ }
921
+ if (valid_symbols[SPLAT_STAR] && !iswspace(lexer->lookahead)) {
922
+ lexer->result_symbol = SPLAT_STAR;
923
+ return true;
924
+ }
925
+ if (valid_symbols[BINARY_STAR]) {
926
+ lexer->result_symbol = BINARY_STAR;
927
+ return true;
928
+ }
929
+ if (valid_symbols[SPLAT_STAR]) {
930
+ lexer->result_symbol = SPLAT_STAR;
931
+ return true;
932
+ }
933
+ return false;
934
+ }
935
+ break;
936
+
937
+ case '-':
938
+ if (valid_symbols[UNARY_MINUS] || valid_symbols[UNARY_MINUS_NUM] || valid_symbols[BINARY_MINUS]) {
939
+ advance(lexer);
940
+ if (lexer->lookahead != '=' && lexer->lookahead != '>') {
941
+ if (valid_symbols[UNARY_MINUS_NUM] &&
942
+ (!valid_symbols[BINARY_STAR] || scanner->has_leading_whitespace) &&
943
+ iswdigit(lexer->lookahead)) {
944
+ lexer->result_symbol = UNARY_MINUS_NUM;
945
+ return true;
946
+ }
947
+ if (valid_symbols[UNARY_MINUS] && scanner->has_leading_whitespace && !iswspace(lexer->lookahead)) {
948
+ lexer->result_symbol = UNARY_MINUS;
949
+ } else if (valid_symbols[BINARY_MINUS]) {
950
+ lexer->result_symbol = BINARY_MINUS;
951
+ } else {
952
+ lexer->result_symbol = UNARY_MINUS;
953
+ }
954
+ return true;
955
+ }
956
+ return false;
957
+ }
958
+ break;
959
+
960
+ case ':':
961
+ if (valid_symbols[SYMBOL_START]) {
962
+ Literal literal = {0};
963
+ literal.type = SYMBOL_START;
964
+ literal.nesting_depth = 1;
965
+ advance(lexer);
966
+
967
+ switch (lexer->lookahead) {
968
+ case '"':
969
+ advance(lexer);
970
+ literal.open_delimiter = '"';
971
+ literal.close_delimiter = '"';
972
+ literal.allows_interpolation = true;
973
+ array_push(&scanner->literal_stack, literal);
974
+ lexer->result_symbol = SYMBOL_START;
975
+ return true;
976
+
977
+ case '\'':
978
+ advance(lexer);
979
+ literal.open_delimiter = '\'';
980
+ literal.close_delimiter = '\'';
981
+ literal.allows_interpolation = false;
982
+ array_push(&scanner->literal_stack, literal);
983
+ lexer->result_symbol = SYMBOL_START;
984
+ return true;
985
+
986
+ default:
987
+ if (scan_symbol_identifier(lexer)) {
988
+ lexer->result_symbol = SIMPLE_SYMBOL;
989
+ return true;
990
+ }
991
+ }
992
+
993
+ return false;
994
+ }
995
+ break;
996
+
997
+ case '[':
998
+ // Treat a square bracket as an element reference if either:
999
+ // * the bracket is not preceded by any whitespace
1000
+ // * an arbitrary expression is not valid at the current position.
1001
+ if (valid_symbols[ELEMENT_REFERENCE_BRACKET] &&
1002
+ (!scanner->has_leading_whitespace || !valid_symbols[STRING_START])) {
1003
+ advance(lexer);
1004
+ lexer->result_symbol = ELEMENT_REFERENCE_BRACKET;
1005
+ return true;
1006
+ }
1007
+ break;
1008
+
1009
+ default:
1010
+ break;
1011
+ }
1012
+
1013
+ // Open delimiters for literals
1014
+ if (((valid_symbols[HASH_KEY_SYMBOL] || valid_symbols[IDENTIFIER_SUFFIX]) &&
1015
+ (iswalpha(lexer->lookahead) || lexer->lookahead == '_')) ||
1016
+ (valid_symbols[CONSTANT_SUFFIX] && iswupper(lexer->lookahead))) {
1017
+ TokenType validIdentifierSymbol = iswupper(lexer->lookahead) ? CONSTANT_SUFFIX : IDENTIFIER_SUFFIX;
1018
+ while (iswalnum(lexer->lookahead) || lexer->lookahead == '_') {
1019
+ advance(lexer);
1020
+ }
1021
+
1022
+ if (valid_symbols[HASH_KEY_SYMBOL] && lexer->lookahead == ':') {
1023
+ lexer->mark_end(lexer);
1024
+ advance(lexer);
1025
+ if (lexer->lookahead != ':') {
1026
+ lexer->result_symbol = HASH_KEY_SYMBOL;
1027
+ return true;
1028
+ }
1029
+ } else if (valid_symbols[validIdentifierSymbol] && lexer->lookahead == '!') {
1030
+ advance(lexer);
1031
+ if (lexer->lookahead != '=') {
1032
+ lexer->result_symbol = validIdentifierSymbol;
1033
+ return true;
1034
+ }
1035
+ }
1036
+
1037
+ return false;
1038
+ }
1039
+
1040
+ // Open delimiters for literals
1041
+ if (valid_symbols[STRING_START]) {
1042
+ Literal literal = {0};
1043
+ literal.nesting_depth = 1;
1044
+
1045
+ if (lexer->lookahead == '<') {
1046
+ advance(lexer);
1047
+ if (lexer->lookahead != '<') {
1048
+ return false;
1049
+ }
1050
+ advance(lexer);
1051
+
1052
+ Heredoc heredoc = {0};
1053
+ if (lexer->lookahead == '-' || lexer->lookahead == '~') {
1054
+ advance(lexer);
1055
+ heredoc.end_word_indentation_allowed = true;
1056
+ }
1057
+
1058
+ scan_heredoc_word(lexer, &heredoc);
1059
+ if (heredoc.word.size == 0) {
1060
+ array_delete(&heredoc.word);
1061
+ return false;
1062
+ }
1063
+ array_push(&scanner->open_heredocs, heredoc);
1064
+ lexer->result_symbol = HEREDOC_START;
1065
+ return true;
1066
+ }
1067
+
1068
+ if (scan_open_delimiter(scanner, lexer, &literal, valid_symbols)) {
1069
+ array_push(&scanner->literal_stack, literal);
1070
+ lexer->result_symbol = literal.type;
1071
+ return true;
1072
+ }
1073
+ return false;
1074
+ }
1075
+
1076
+ return false;
1077
+ }
1078
+
1079
+ void *tree_sitter_ruby_external_scanner_create() {
1080
+ Scanner *scanner = (Scanner *)ts_calloc(1, sizeof(Scanner));
1081
+ return scanner;
1082
+ }
1083
+
1084
+ bool tree_sitter_ruby_external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols) {
1085
+ Scanner *scanner = (Scanner *)payload;
1086
+ return scan(scanner, lexer, valid_symbols);
1087
+ }
1088
+
1089
+ unsigned tree_sitter_ruby_external_scanner_serialize(void *payload, char *buffer) {
1090
+ Scanner *scanner = (Scanner *)payload;
1091
+ return serialize(scanner, buffer);
1092
+ }
1093
+
1094
+ void tree_sitter_ruby_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) {
1095
+ Scanner *scanner = (Scanner *)payload;
1096
+ deserialize(scanner, buffer, length);
1097
+ }
1098
+
1099
+ void tree_sitter_ruby_external_scanner_destroy(void *payload) {
1100
+ Scanner *scanner = (Scanner *)payload;
1101
+ for (uint32_t i = 0; i < scanner->open_heredocs.size; i++) {
1102
+ array_delete(&array_get(&scanner->open_heredocs, i)->word);
1103
+ }
1104
+ array_delete(&scanner->open_heredocs);
1105
+ array_delete(&scanner->literal_stack);
1106
+ ts_free(scanner);
1107
+ }