tree-sitter-zsh 0.31.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/scanner.c ADDED
@@ -0,0 +1,2416 @@
1
+ #include "tree_sitter/array.h"
2
+ #include "tree_sitter/parser.h"
3
+
4
+ #include <assert.h>
5
+ #include <ctype.h>
6
+ #include <stdint.h>
7
+ #include <stdio.h>
8
+ #include <string.h>
9
+ #include <wctype.h>
10
+
11
+ #define DEBUG 0
12
+
13
/*
 * Token kinds this scanner can emit.
 *
 * The numeric value of each enumerator is its position in this list: the
 * scanner indexes `valid_symbols[]` with these values and stores them in
 * `lexer->result_symbol`.  Do not reorder entries.
 * NOTE(review): presumably the order must mirror the grammar's `externals`
 * list -- confirm against grammar.js before adding or moving an entry.
 */
enum TokenType {
    HEREDOC_START = 0,
    SIMPLE_HEREDOC_BODY,
    HEREDOC_BODY_BEGINNING,
    HEREDOC_CONTENT,
    HEREDOC_END,
    FILE_DESCRIPTOR,
    EMPTY_VALUE,
    CONCAT,
    VARIABLE_NAME,
    SIMPLE_VARIABLE_NAME,
    SPECIAL_VARIABLE_NAME,
    TEST_OPERATOR,
    REGEX,
    REGEX_NO_SLASH,
    REGEX_NO_SPACE,
    EXPANSION_WORD,
    EXTGLOB_PATTERN,
    /* Consumes spaces, only if $ alone for strings / commands */
    RAW_DOLLAR,
    /* Consumes spaces */
    BARE_DOLLAR,
    /* Just determines if immediate $ is present */
    PEEK_BARE_DOLLAR,
    BRACE_START,
    BRACE_EXPR_START,
    IMMEDIATE_DOUBLE_HASH,
    ARRAY_STAR_TOKEN,
    ARRAY_AT_TOKEN,
    CLOSING_BRACE,
    CLOSING_BRACKET,
    CLOSING_PAREN,
    CLOSING_DOUBLE_PAREN,
    HEREDOC_ARROW,
    HEREDOC_ARROW_DASH,
    /* #pattern */
    HASH_PATTERN,
    /* ##pattern */
    DOUBLE_HASH_PATTERN,
    /* implicit / etc */
    ENTER_PATTERN,
    /* After pattern operators, before pattern content */
    PATTERN_START,
    /* After # operators, before pattern content */
    PATTERN_SUFFIX_START,
    NEWLINE,
    OPENING_PAREN,
    DOUBLE_OPENING_PAREN,
    OPENING_BRACKET,
    /* [[ */
    TEST_COMMAND_START,
    /* ]] */
    TEST_COMMAND_END,
    ESAC,
    ZSH_EXTENDED_GLOB_FLAGS,
    DOUBLE_QUOTE,
    BACKTICK,
    ERROR_RECOVERY,
};
62
+
63
#if DEBUG
/*
 * Printable names for `enum TokenType`, in the same order as the enum, so
 * that `TokenNames[token]` yields the token's name.  Only compiled into DEBUG
 * builds, where it is used for trace output on stderr.
 *
 * Declared `static` and fully `const`: the table is private to this
 * translation unit and read-only, and an external symbol with such a generic
 * name could clash when several scanners are linked into one binary.
 */
static const char *const TokenNames[] = {
    "HEREDOC_START",
    "SIMPLE_HEREDOC_BODY",
    "HEREDOC_BODY_BEGINNING",
    "HEREDOC_CONTENT",
    "HEREDOC_END",
    "FILE_DESCRIPTOR",
    "EMPTY_VALUE",
    "CONCAT",
    "VARIABLE_NAME",
    "SIMPLE_VARIABLE_NAME",
    "SPECIAL_VARIABLE_NAME",
    "TEST_OPERATOR",
    "REGEX",
    "REGEX_NO_SLASH",
    "REGEX_NO_SPACE",
    "EXPANSION_WORD",
    "EXTGLOB_PATTERN",
    "RAW_DOLLAR",
    "BARE_DOLLAR",
    "PEEK_BARE_DOLLAR",
    "BRACE_START",
    "BRACE_EXPR_START",
    "IMMEDIATE_DOUBLE_HASH",
    "ARRAY_STAR_TOKEN",
    "ARRAY_AT_TOKEN",
    "CLOSING_BRACE",
    "CLOSING_BRACKET",
    "CLOSING_PAREN",
    "CLOSING_DOUBLE_PAREN",
    "HEREDOC_ARROW",
    "HEREDOC_ARROW_DASH",
    "HASH_PATTERN",        // #pattern
    "DOUBLE_HASH_PATTERN", // ##pattern
    "ENTER_PATTERN",
    "PATTERN_START",
    "PATTERN_SUFFIX_START",
    "NEWLINE",
    "OPENING_PAREN",
    "DOUBLE_OPENING_PAREN",
    "OPENING_BRACKET",
    "TEST_COMMAND_START",
    "TEST_COMMAND_END",
    "ESAC",
    "ZSH_EXTENDED_GLOB_FLAGS",
    "DOUBLE_QUOTE",
    "BACKTICK",
    "ERROR_RECOVERY",
};
#endif
114
+
115
+ typedef Array(char) String; // Growable char buffer built on the tree-sitter Array macro; holds heredoc delimiters and the per-line scratch word
116
+
117
/*
 * Kinds of lexical context the scanner can be nested inside.  Values are
 * pushed on and popped from `Scanner.context_stack`; each is written as a
 * single byte when the scanner state is serialized, and indexes
 * `ContextNames` in debug output, so the numeric values are spelled out.
 */
typedef enum {
    /* No enclosing context (empty stack). */
    CTX_NONE = 0,
    /* ${...} */
    CTX_PARAMETER = 1,
    /* $((...)) */
    CTX_ARITHMETIC = 2,
    /* $(...) */
    CTX_COMMAND = 3,
    /* [[ ... ]] */
    CTX_TEST = 4,
    /* {a..b} and {a..b..c} */
    CTX_BRACE_EXPANSION = 5,
    /* ${var%pattern} and ${var#pattern} - suffix/prefix removal */
    CTX_PARAMETER_PATTERN_SUFFIX = 6,
    /* ${var/pattern/replacement} - substitution */
    CTX_PARAMETER_PATTERN_SUBSTITUTE = 7,
    /* "..." string context */
    CTX_STRING = 8,
    /* "{ x; y; z; }" */
    CTX_COMPOUND = 9,
    /* `a b c` */
    CTX_BACKTICK = 10
} context_type_t;
133
+
134
/*
 * Printable names for `context_type_t`, indexed by the enumerator's numeric
 * value (CTX_NONE == 0 ... CTX_BACKTICK == 10).  In the visible code it is
 * only read by DEBUG trace output.
 *
 * Declared `static` and fully `const`: the table is private to this
 * translation unit and read-only, and an external symbol with such a generic
 * name could clash when several scanners are linked into one binary.
 */
static const char *const ContextNames[] = {
    "CTX_NONE",
    "CTX_PARAMETER",                    // ${...}
    "CTX_ARITHMETIC",                   // $((...))
    "CTX_COMMAND",                      // $(...)
    "CTX_TEST",                         // [[ ... ]]
    "CTX_BRACE_EXPANSION",              // {a..b} and {a..b..c}
    "CTX_PARAMETER_PATTERN_SUFFIX",     // ${var%pattern} and ${var#pattern} -
                                        // suffix/prefix removal
    "CTX_PARAMETER_PATTERN_SUBSTITUTE", // ${var/pattern/replacement} -
                                        // substitution
    "CTX_STRING",                       // "..." string context
    "CTX_COMPOUND",                     // "{ x; y; z; }"
    "CTX_BACKTICK"                      // `a b c`
};
149
+
150
// State tracked for one pending here-document.
+ typedef struct {
150
+ bool is_raw; // set by scan_heredoc_start when the delimiter word begins with ', " or a backslash; '$' is then consumed as plain content
151
+ bool started; // set once a "middle" body token has been emitted for this heredoc
152
+ bool allows_indent; // when true, whitespace following each newline is consumed before the delimiter check
153
+ String delimiter; // unquoted delimiter word, stored with a trailing '\0'
154
+ String current_leading_word; // scratch buffer: the start of the current line, compared against `delimiter`
155
+ } Heredoc;
157
+
158
/*
 * Brace initializer for an empty Heredoc: all flags false, both strings
 * empty.  Intended use: `Heredoc h = heredoc_new();`
 *
 * The expansion deliberately carries no trailing semicolon; the statement
 * that uses the macro supplies it.  (A semicolon inside the macro turns every
 * use into `= {...};;`, i.e. an extra empty statement.)
 */
#define heredoc_new()                        \
    {                                        \
        .is_raw = false,                     \
        .started = false,                    \
        .allows_indent = false,              \
        .delimiter = array_new(),            \
        .current_leading_word = array_new(), \
    }
166
+
167
// Persistent scanner state. Every field except the heredocs' scratch word is written by serialize() and restored by deserialize().
+ typedef struct {
167
+ uint8_t last_glob_paren_depth; // NOTE(review): consumer not visible in this chunk; stored as a single byte when serialized
168
+ bool ext_was_in_double_quote; // NOTE(review): presumably extglob-scanning state -- confirm against the code that sets it
169
+ bool ext_saw_outside_quote; // NOTE(review): presumably extglob-scanning state -- confirm against the code that sets it
170
+ Array(context_type_t) context_stack; // Proper context stack
171
+ bool just_returned_variable_name; // Track if we just returned VARIABLE_NAME
172
+ bool just_returned_bare_dollar; // Track if we just returned BARE_DOLLAR
173
+ bool just_exited_string; // Track if we just exited a string context
174
+ bool just_newline; // Track if we just handled newline
175
+ Array(Heredoc) heredocs; // pending here-documents; the last element is the one whose body is scanned
176
+ } Scanner;
178
+
179
+ static inline void advance(TSLexer *lexer) { lexer->advance(lexer, false); }
180
+
181
+ // Context management functions using proper stack
182
+ static inline context_type_t get_current_context(Scanner *scanner) {
183
+ if (scanner->context_stack.size == 0) {
184
+ return CTX_NONE;
185
+ }
186
+ return *array_back(&scanner->context_stack);
187
+ }
188
+
189
+ static inline bool in_parameter_expansion(Scanner *scanner) {
190
+ context_type_t ctx = get_current_context(scanner);
191
+ return ctx == CTX_PARAMETER || ctx == CTX_PARAMETER_PATTERN_SUFFIX ||
192
+ ctx == CTX_PARAMETER_PATTERN_SUBSTITUTE;
193
+ }
194
+
195
+ // Helper to determine if we should stop at pattern operators
196
+ static inline bool should_stop_at_pattern_operators(Scanner *scanner) {
197
+ context_type_t ctx = get_current_context(scanner);
198
+ return ctx == CTX_PARAMETER || ctx == CTX_PARAMETER_PATTERN_SUFFIX ||
199
+ ctx == CTX_PARAMETER_PATTERN_SUBSTITUTE;
200
+ }
201
+
202
+ static inline bool should_stop_at_pattern_slash(Scanner *scanner) {
203
+ context_type_t ctx = get_current_context(scanner);
204
+ return ctx == CTX_PARAMETER_PATTERN_SUBSTITUTE;
205
+ }
206
+
207
+ // Helper to check if we're in parameter expansion context (for tokenization
208
+ // decisions)
209
+ static inline bool in_parameter_expansion_context(Scanner *scanner) {
210
+ return in_parameter_expansion(scanner);
211
+ }
212
+
213
+ // Helper to check if we should break on '/' in EXPANSION_WORD
214
+ static inline bool should_break_on_slash(Scanner *scanner) {
215
+ context_type_t ctx = get_current_context(scanner);
216
+ return ctx == CTX_PARAMETER_PATTERN_SUBSTITUTE;
217
+ }
218
+ static inline void enter_context(Scanner *scanner, context_type_t context) {
219
+ #if DEBUG
220
+ fprintf(stderr, "DEBUG: Entering context %s\n", ContextNames[context]);
221
+ for (int i = 0; i < scanner->context_stack.size; ++i) {
222
+ fprintf(stderr, " DEBUG: context_stack %d= %s\n", i,
223
+ ContextNames[*array_get(&scanner->context_stack, i)]);
224
+ }
225
+ #endif
226
+ array_push(&scanner->context_stack, context);
227
+ }
228
+
229
+ static inline void exit_context(Scanner *scanner,
230
+ context_type_t expected_context) {
231
+ if (scanner->context_stack.size > 0) {
232
+ context_type_t current = *array_back(&scanner->context_stack);
233
+ // Verify we're exiting the expected context (for debugging)
234
+ if (current == expected_context) {
235
+ #if DEBUG
236
+ fprintf(stderr, "DEBUG: Exiting matching context %s\n",
237
+ ContextNames[current]);
238
+ #endif
239
+ array_pop(&scanner->context_stack);
240
+ } else {
241
+ #if DEBUG
242
+ fprintf(stderr,
243
+ "DEBUG: Exiting mismatching context %s, wanted %s\n",
244
+ ContextNames[current], ContextNames[expected_context]);
245
+ #endif
246
+ // Gracefully handle mismatched contexts by popping anyway
247
+ array_pop(&scanner->context_stack);
248
+ }
249
+ #if DEBUG
250
+ for (int i = 0; i < scanner->context_stack.size; ++i) {
251
+ fprintf(stderr, " DEBUG: context_stack %d= %s\n", i,
252
+ ContextNames[*array_get(&scanner->context_stack, i)]);
253
+ }
254
+ #endif
255
+ }
256
+ }
257
+
258
+ // Helper functions for checking contexts
259
+ static inline bool in_expansion_context(Scanner *scanner) {
260
+ context_type_t ctx = get_current_context(scanner);
261
+ return ctx == CTX_PARAMETER || ctx == CTX_ARITHMETIC || ctx == CTX_COMMAND;
262
+ }
263
+
264
+ static inline bool in_pattern_context(Scanner *scanner) {
265
+ context_type_t ctx = get_current_context(scanner);
266
+ return ctx == CTX_PARAMETER_PATTERN_SUFFIX ||
267
+ ctx == CTX_PARAMETER_PATTERN_SUBSTITUTE;
268
+ }
269
+
270
+ static inline bool in_test_command(Scanner *scanner) {
271
+ context_type_t ctx = get_current_context(scanner);
272
+ return ctx == CTX_TEST;
273
+ }
274
+
275
+ static inline void skip(TSLexer *lexer) { lexer->advance(lexer, true); }
276
+
277
+ static inline void skip_ws(TSLexer *lexer) {
278
+ while (iswspace(lexer->lookahead) && lexer->lookahead != '\n' &&
279
+ !lexer->eof(lexer)) {
280
+ #if DEBUG
281
+ fprintf(stderr, "WARNING skip_ws skipping space");
282
+ #endif
283
+ skip(lexer);
284
+ }
285
+ }
286
+ static inline void skip_wsnl(TSLexer *lexer) {
287
+ while (iswspace(lexer->lookahead) && !lexer->eof(lexer)) {
288
+ #if DEBUG
289
+ fprintf(stderr, "WARNING skip_wsnl skipping space");
290
+ #endif
291
+ skip(lexer);
292
+ }
293
+ }
294
+
295
+ static inline bool in_error_recovery(const bool *valid_symbols) {
296
+ return valid_symbols[ERROR_RECOVERY];
297
+ }
298
+
299
+ static inline void reset_string(String *string) {
300
+ if (string->size > 0) {
301
+ memset(string->contents, 0, string->size);
302
+ array_clear(string);
303
+ }
304
+ }
305
+
306
+ static inline void reset_heredoc(Heredoc *heredoc) {
307
+ heredoc->is_raw = false;
308
+ heredoc->started = false;
309
+ heredoc->allows_indent = false;
310
+ reset_string(&heredoc->delimiter);
311
+ }
312
+
313
+ static inline void reset(Scanner *scanner) {
314
+ #if DEBUG
315
+ fprintf(stderr, "DEBUG: Reset called - heredocs.size before=%u %u\n",
316
+ scanner->heredocs.size, scanner->context_stack.size);
317
+ #endif
318
+ scanner->last_glob_paren_depth = 0;
319
+ scanner->ext_was_in_double_quote = false;
320
+ scanner->ext_saw_outside_quote = false;
321
+ scanner->context_stack.size = 0; // Clear context stack
322
+ scanner->just_returned_variable_name = false;
323
+ scanner->just_returned_bare_dollar = false;
324
+ scanner->just_exited_string = false;
325
+ scanner->just_newline = false;
326
+ for (uint32_t i = 0; i < scanner->heredocs.size; i++) {
327
+ reset_heredoc(array_get(&scanner->heredocs, i));
328
+ }
329
+ #if DEBUG
330
+ fprintf(stderr, "DEBUG: Reset done - heredocs.size after=%u %u\n",
331
+ scanner->heredocs.size, scanner->context_stack.size);
332
+ #endif
333
+ }
334
+
335
+ static unsigned serialize(Scanner *scanner, char *buffer) {
336
+ uint32_t size = 0;
337
+
338
+ buffer[size++] = (char)scanner->last_glob_paren_depth;
339
+ buffer[size++] = (char)scanner->ext_was_in_double_quote;
340
+ buffer[size++] = (char)scanner->ext_saw_outside_quote;
341
+ buffer[size++] = (char)scanner->context_stack.size;
342
+ buffer[size++] = (char)scanner->heredocs.size;
343
+ buffer[size++] = (char)scanner->just_returned_variable_name;
344
+ buffer[size++] = (char)scanner->just_returned_bare_dollar;
345
+ buffer[size++] = (char)scanner->just_exited_string;
346
+ buffer[size++] = (char)scanner->just_newline;
347
+
348
+ // Serialize context stack
349
+ for (uint32_t i = 0; i < scanner->context_stack.size; i++) {
350
+ if (size >= TREE_SITTER_SERIALIZATION_BUFFER_SIZE) {
351
+ return 0;
352
+ }
353
+ context_type_t *ctx = array_get(&scanner->context_stack, i);
354
+ buffer[size++] = (char)*ctx;
355
+ }
356
+
357
+ for (uint32_t i = 0; i < scanner->heredocs.size; i++) {
358
+ Heredoc *heredoc = array_get(&scanner->heredocs, i);
359
+ if (size + 3 + sizeof(uint32_t) + heredoc->delimiter.size >=
360
+ TREE_SITTER_SERIALIZATION_BUFFER_SIZE) {
361
+ return 0;
362
+ }
363
+
364
+ buffer[size++] = (char)heredoc->is_raw;
365
+ buffer[size++] = (char)heredoc->started;
366
+ buffer[size++] = (char)heredoc->allows_indent;
367
+
368
+ memcpy(&buffer[size], &heredoc->delimiter.size, sizeof(uint32_t));
369
+ size += sizeof(uint32_t);
370
+ if (heredoc->delimiter.size > 0) {
371
+ memcpy(&buffer[size], heredoc->delimiter.contents,
372
+ heredoc->delimiter.size);
373
+ size += heredoc->delimiter.size;
374
+ }
375
+ }
376
+ return size;
377
+ }
378
+
379
+ static void deserialize(Scanner *scanner, const char *buffer, unsigned length) {
380
+ #if DEBUG
381
+ fprintf(stderr,
382
+ "DEBUG: Deserialize called - length=%u, before heredocs.size =%u "
383
+ "ctx_stack=%u\n",
384
+ length, scanner->heredocs.size, scanner->context_stack.size);
385
+ #endif
386
+ if (length == 0) {
387
+ reset(scanner);
388
+ } else {
389
+ uint32_t size = 0;
390
+ scanner->last_glob_paren_depth = buffer[size++];
391
+ scanner->ext_was_in_double_quote = buffer[size++];
392
+ scanner->ext_saw_outside_quote = buffer[size++];
393
+ uint32_t context_stack_size = (unsigned char)buffer[size++];
394
+ uint32_t heredoc_count = (unsigned char)buffer[size++];
395
+ #if DEBUG
396
+ fprintf(stderr,
397
+ "DEBUG: Deserialize - heredoc_count=%u context_stack_size=%u\n",
398
+ heredoc_count, context_stack_size);
399
+ #endif
400
+ scanner->just_returned_variable_name = buffer[size++];
401
+ scanner->just_returned_bare_dollar = buffer[size++];
402
+ scanner->just_exited_string = buffer[size++];
403
+ scanner->just_newline = buffer[size++];
404
+
405
+ // Deserialize context stack
406
+ scanner->context_stack.size = 0;
407
+ for (uint32_t i = 0; i < context_stack_size; i++) {
408
+ if (size >= length)
409
+ break;
410
+ context_type_t ctx = (context_type_t)buffer[size++];
411
+ array_push(&scanner->context_stack, ctx);
412
+ }
413
+
414
+ for (uint32_t i = 0; i < heredoc_count; i++) {
415
+ Heredoc *heredoc = NULL;
416
+ if (i < scanner->heredocs.size) {
417
+ heredoc = array_get(&scanner->heredocs, i);
418
+ } else {
419
+ Heredoc new_heredoc = heredoc_new();
420
+ array_push(&scanner->heredocs, new_heredoc);
421
+ heredoc = array_back(&scanner->heredocs);
422
+ }
423
+
424
+ heredoc->is_raw = buffer[size++];
425
+ heredoc->started = buffer[size++];
426
+ heredoc->allows_indent = buffer[size++];
427
+
428
+ memcpy(&heredoc->delimiter.size, &buffer[size], sizeof(uint32_t));
429
+ size += sizeof(uint32_t);
430
+ array_reserve(&heredoc->delimiter, heredoc->delimiter.size);
431
+
432
+ if (heredoc->delimiter.size > 0) {
433
+ memcpy(heredoc->delimiter.contents, &buffer[size],
434
+ heredoc->delimiter.size);
435
+ size += heredoc->delimiter.size;
436
+ }
437
+ }
438
+ assert(size == length);
439
+ }
440
+ #if DEBUG
441
+ fprintf(stderr, "DEBUG: Deserialize done - heredocs.size after=%u %u\n",
442
+ scanner->heredocs.size, scanner->context_stack.size);
443
+ #endif
444
+ }
445
+
446
+ /**
447
+ * Consume a "word" in POSIX parlance, and returns it unquoted.
448
+ *
449
+ * This is an approximate implementation that doesn't deal with any
450
+ * POSIX-mandated substitution, and assumes the default value for
451
+ * IFS.
452
+ */
453
+ static bool advance_word(TSLexer *lexer, String *unquoted_word) {
454
+ bool empty = true;
455
+
456
+ int32_t quote = 0;
457
+ if (lexer->lookahead == '\'' || lexer->lookahead == '"') {
458
+ quote = lexer->lookahead;
459
+ advance(lexer);
460
+ }
461
+
462
+ while (lexer->lookahead &&
463
+ !(quote ? lexer->lookahead == quote || lexer->lookahead == '\r' ||
464
+ lexer->lookahead == '\n'
465
+ : iswspace(lexer->lookahead))) {
466
+ if (lexer->lookahead == '\\') {
467
+ advance(lexer);
468
+ if (!lexer->lookahead) {
469
+ return false;
470
+ }
471
+ }
472
+ empty = false;
473
+ array_push(unquoted_word, lexer->lookahead);
474
+ advance(lexer);
475
+ }
476
+ array_push(unquoted_word, '\0');
477
+
478
+ if (quote && lexer->lookahead == quote) {
479
+ advance(lexer);
480
+ }
481
+
482
+ return !empty;
483
+ }
484
+
485
+ static inline bool scan_raw_dollar(TSLexer *lexer, const bool *valid_symbols) {
486
+ skip_ws(lexer);
487
+
488
+ if (lexer->lookahead == '$') {
489
+ advance(lexer);
490
+ lexer->result_symbol = BARE_DOLLAR;
491
+ lexer->mark_end(lexer);
492
+ return iswspace(lexer->lookahead) || lexer->eof(lexer) ||
493
+ lexer->lookahead == '\"';
494
+ }
495
+
496
+ return false;
497
+ }
498
+
499
+ static bool scan_heredoc_start(Heredoc *heredoc, TSLexer *lexer) {
500
+ while (iswspace(lexer->lookahead)) {
501
+ skip(lexer);
502
+ }
503
+
504
+ lexer->result_symbol = HEREDOC_START;
505
+ heredoc->is_raw = lexer->lookahead == '\'' || lexer->lookahead == '"' ||
506
+ lexer->lookahead == '\\';
507
+
508
+ bool found_delimiter = advance_word(lexer, &heredoc->delimiter);
509
+ if (!found_delimiter) {
510
+ reset_string(&heredoc->delimiter);
511
+ return false;
512
+ }
513
+ return found_delimiter;
514
+ }
515
+
516
+ static bool scan_heredoc_end_identifier(Heredoc *heredoc, TSLexer *lexer) {
517
+ reset_string(&heredoc->current_leading_word);
518
+ // Scan the first 'n' characters on this line, to see if they match the
519
+ // heredoc delimiter
520
+ int32_t size = 0;
521
+ if (heredoc->delimiter.size > 0) {
522
+ while (lexer->lookahead != '\0' && lexer->lookahead != '\n' &&
523
+ (int32_t)*array_get(&heredoc->delimiter, size) ==
524
+ lexer->lookahead &&
525
+ heredoc->current_leading_word.size < heredoc->delimiter.size) {
526
+ array_push(&heredoc->current_leading_word, lexer->lookahead);
527
+ advance(lexer);
528
+ size++;
529
+ }
530
+ }
531
+ array_push(&heredoc->current_leading_word, '\0');
532
+ return heredoc->delimiter.size == 0
533
+ ? false
534
+ : strcmp(heredoc->current_leading_word.contents,
535
+ heredoc->delimiter.contents) == 0;
536
+ }
537
+
538
// Scan body text of the innermost pending heredoc (the last entry of scanner->heredocs).
//
// middle_type: symbol used for a piece of body that stops before the end of
//              the heredoc (in front of a '$' expansion, or once the heredoc
//              has `started`).
// end_type:    symbol used when the piece reaches the delimiter line or the
//              end of input.
//
// Returns true with lexer->result_symbol set when a token was produced and
// false when nothing can be emitted at this position. The token's extent is
// whatever the most recent lexer->mark_end() call recorded.
+ static bool scan_heredoc_content(Scanner *scanner, TSLexer *lexer,
539
+ enum TokenType middle_type,
540
+ enum TokenType end_type) {
541
+ bool did_advance = false; // becomes true once any character has been consumed or skipped in this call
542
+ Heredoc *heredoc = array_back(&scanner->heredocs);
543
+
544
+ for (;;) {
545
+ switch (lexer->lookahead) {
546
+ case '\0': { // NUL lookahead: a token is produced only at real end of input and only if something was consumed
547
+ if (lexer->eof(lexer) && did_advance) {
548
+ reset_heredoc(heredoc); // input ended before the delimiter was seen: clear the heredoc and emit the end token
549
+ lexer->result_symbol = end_type;
550
+ return true;
551
+ }
552
+ return false;
553
+ }
554
+
555
+ case '\\': { // backslash: consume it together with the character that follows, without inspecting that character
556
+ did_advance = true;
557
+ advance(lexer);
558
+ advance(lexer);
559
+ break;
560
+ }
561
+
562
+ case '$': {
563
+ if (heredoc->is_raw) { // raw heredoc: '$' is ordinary content
564
+ did_advance = true;
565
+ advance(lexer);
566
+ break;
567
+ }
568
+ if (did_advance) {
569
+ lexer->mark_end(lexer); // end the token in front of the '$' in case an expansion starts here
570
+ lexer->result_symbol = middle_type;
571
+ heredoc->started = true;
572
+ advance(lexer);
573
+ if (iswalpha(lexer->lookahead) || lexer->lookahead == '{' ||
574
+ lexer->lookahead == '(') {
575
+ return true; // '$' followed by a letter, '{' or '(': stop so the expansion can be parsed separately
576
+ }
577
+ break; // otherwise keep scanning; the '$' has been consumed as content
578
+ }
579
+ if (middle_type == HEREDOC_BODY_BEGINNING &&
580
+ lexer->get_column(lexer) == 0) {
581
+ lexer->mark_end(lexer); // nothing consumed yet and '$' sits in column 0: emit the body-beginning token here
582
+ lexer->result_symbol = middle_type;
583
+ heredoc->started = true;
584
+ return true;
585
+ }
586
+ return false;
587
+ }
588
+
589
+ case '\n': {
590
+ if (!did_advance) { // a newline seen before any content is skipped rather than made part of the token
591
+ skip(lexer);
592
+ } else {
593
+ advance(lexer);
594
+ }
595
+ did_advance = true;
596
+ if (heredoc->allows_indent) { // indent-tolerant heredoc: consume whitespace before testing for the delimiter
597
+ while (iswspace(lexer->lookahead)) {
598
+ advance(lexer);
599
+ }
600
+ }
601
+ lexer->result_symbol = heredoc->started ? middle_type : end_type;
602
+ lexer->mark_end(lexer); // token ends here, in front of a possible delimiter
603
+ if (scan_heredoc_end_identifier(heredoc, lexer)) {
604
+ if (lexer->result_symbol == HEREDOC_END) { // the heredoc is removed from the pending list only when HEREDOC_END itself is emitted
605
+ array_pop(&scanner->heredocs);
606
+ }
607
+ return true;
608
+ }
609
+ break;
610
+ }
611
+
612
+ default: {
613
+ if (lexer->get_column(lexer) == 0) { // at the start of a line: this line may be the delimiter
614
+ // an alternative is to check the starting column of the
615
+ // heredoc body and track that statefully
616
+ while (iswspace(lexer->lookahead)) {
617
+ if (did_advance) {
618
+ advance(lexer);
619
+ } else {
620
+ skip(lexer);
621
+ }
622
+ }
623
+ if (end_type != SIMPLE_HEREDOC_BODY) {
624
+ lexer->result_symbol = middle_type;
625
+ if (scan_heredoc_end_identifier(heredoc, lexer)) {
626
+ return true;
627
+ }
628
+ }
629
+ if (end_type == SIMPLE_HEREDOC_BODY) {
630
+ lexer->result_symbol = end_type;
631
+ lexer->mark_end(lexer); // only the simple-body path marks the token end in front of the delimiter
632
+ if (scan_heredoc_end_identifier(heredoc, lexer)) {
633
+ return true;
634
+ }
635
+ }
636
+ }
637
+ did_advance = true; // ordinary content character: consume it and continue
638
+ advance(lexer);
639
+ break;
640
+ }
641
+ }
642
+ }
643
+ }
644
+
645
+ static bool scan(Scanner *scanner, TSLexer *lexer, const bool *valid_symbols) {
646
+ #if DEBUG
647
+ fprintf(stderr, "SCANNER: invoked lookahead='%c'\n", lexer->lookahead);
648
+ for (int i = 0; i <= ERROR_RECOVERY; i++) {
649
+ if (valid_symbols[i]) {
650
+ fprintf(stderr, "SCANNER: valid symbol: %s\n", TokenNames[i]);
651
+ }
652
+ }
653
+ #endif
654
+
655
+ // Clear flag at start and capture its previous value
656
+ bool was_just_variable_name = scanner->just_returned_variable_name;
657
+ scanner->just_returned_variable_name = false;
658
+
659
+ bool was_just_bare_dollar = scanner->just_returned_bare_dollar;
660
+ scanner->just_returned_bare_dollar = false;
661
+
662
+ // Clear string exit flag at start and capture its previous value
663
+ bool was_just_exited_string = scanner->just_exited_string;
664
+ scanner->just_exited_string = false;
665
+
666
+ // FIXME: newline handling and exited string handling should go
667
+ bool was_just_newline = scanner->just_newline;
668
+ scanner->just_newline = false;
669
+
670
+ if (valid_symbols[CONCAT] && !in_error_recovery(valid_symbols)) {
671
+ context_type_t ctx = get_current_context(scanner);
672
+ #if DEBUG
673
+ fprintf(stderr,
674
+ "SCANNER: CONCAT handler lookeahead=%c "
675
+ "was_just_exited_string=%d was_just_newline=%d\n",
676
+ lexer->lookahead, was_just_exited_string, was_just_newline);
677
+ #endif
678
+
679
+ if (!(lexer->lookahead == 0 || iswspace(lexer->lookahead) ||
680
+ lexer->lookahead == '>' || lexer->lookahead == '<' ||
681
+ (lexer->lookahead == ')' &&
682
+ (valid_symbols[CLOSING_PAREN] ||
683
+ valid_symbols[CLOSING_DOUBLE_PAREN])) ||
684
+ lexer->lookahead == '(' || lexer->lookahead == ';' ||
685
+ lexer->lookahead == '&' || lexer->lookahead == '|' ||
686
+ lexer->lookahead == '{' ||
687
+ // prevent concat over newline after string ends
688
+ (was_just_exited_string && lexer->lookahead == '\n') ||
689
+ (lexer->lookahead == '"' && ctx == CTX_STRING) ||
690
+ (was_just_newline) ||
691
+ // Prevent recursion on / pattern
692
+ (lexer->lookahead == '/' &&
693
+ ctx == CTX_PARAMETER_PATTERN_SUBSTITUTE) ||
694
+ (lexer->lookahead == '}' && in_parameter_expansion(scanner)) ||
695
+ // Split subscript out
696
+ (lexer->lookahead == ']' && valid_symbols[CLOSING_BRACKET]) ||
697
+ (lexer->lookahead == '[' &&
698
+ was_just_variable_name) || // Suppress CONCAT after $var when [
699
+ (lexer->lookahead == ':' &&
700
+ was_just_variable_name) || // Suppress CONCAT after $var when :
701
+ (lexer->lookahead == '`' && ctx == CTX_BACKTICK)
702
+ )) {
703
+ // follows
704
+ #if DEBUG
705
+ fprintf(stderr, "SCANNER: CONCAT\n");
706
+ #endif
707
+
708
+ // So for a`b`, we want to return a concat. We check if the
709
+ // 2nd backtick has whitespace after it, and if it does we
710
+ // return concat.
711
+ if (lexer->lookahead == '`' && ctx != CTX_BACKTICK) {
712
+ lexer->mark_end(lexer);
713
+ advance(lexer);
714
+ bool was_escape = false;
715
+ while ((lexer->lookahead != '`' || was_escape) && !lexer->eof(lexer)) {
716
+ advance(lexer);
717
+ was_escape = false;
718
+ if (lexer->lookahead == '\\') {
719
+ was_escape = true;
720
+ }
721
+ }
722
+ if (lexer->eof(lexer)) {
723
+ return false;
724
+ }
725
+ if (lexer->lookahead == '`') {
726
+ advance(lexer);
727
+ }
728
+ if ((iswspace(lexer->lookahead) &&
729
+ lexer->lookahead != '\n' // HACK
730
+ ) ||
731
+ lexer->eof(lexer)) {
732
+ lexer->result_symbol = CONCAT;
733
+ return true;
734
+ }
735
+ }
736
+ // strings w/ expansions that contains escaped quotes or
737
+ // backslashes need this to return a concat
738
+ if (lexer->lookahead == '\\') {
739
+ lexer->mark_end(lexer);
740
+ advance(lexer);
741
+ if (lexer->lookahead == '"' || lexer->lookahead == '\'' ||
742
+ lexer->lookahead == '\\') {
743
+ lexer->result_symbol = CONCAT;
744
+ return true;
745
+ }
746
+ if (lexer->eof(lexer)) {
747
+ return false;
748
+ }
749
+ } else {
750
+ lexer->mark_end(lexer);
751
+ lexer->result_symbol = CONCAT;
752
+ return true;
753
+ }
754
+ }
755
+ }
756
+
757
+ // Handle string context tracking
758
+ if (valid_symbols[DOUBLE_QUOTE]) {
759
+ if (get_current_context(scanner) != CTX_STRING) {
760
+ skip_ws(lexer);
761
+
762
+ if (lexer->lookahead == '"') {
763
+ // Entering a string context
764
+ enter_context(scanner, CTX_STRING);
765
+ #if DEBUG
766
+ fprintf(stderr, "SCANNER: Entering string context\n");
767
+ #endif
768
+ advance(lexer);
769
+ lexer->mark_end(lexer);
770
+ lexer->result_symbol = DOUBLE_QUOTE;
771
+ return true;
772
+ }
773
+ } else if (lexer->lookahead == '"') {
774
+ // Exiting a string context
775
+ exit_context(scanner, CTX_STRING);
776
+ // Set the flag to indicate we just exited a string
777
+ was_just_exited_string = scanner->just_exited_string = true;
778
+
779
+ #if DEBUG
780
+ fprintf(stderr, "SCANNER: Exiting string context\n");
781
+ #endif
782
+
783
+ advance(lexer);
784
+ lexer->mark_end(lexer);
785
+ lexer->result_symbol = DOUBLE_QUOTE;
786
+ return true;
787
+ }
788
+ }
789
+
790
+ // Handle string context tracking
791
+ if (valid_symbols[BACKTICK]) {
792
+ if (get_current_context(scanner) != CTX_BACKTICK) {
793
+ skip_ws(lexer);
794
+
795
+ if (lexer->lookahead == '`') {
796
+ // Entering a string context
797
+ enter_context(scanner, CTX_BACKTICK);
798
+ #if DEBUG
799
+ fprintf(stderr, "SCANNER: Entering backtick context\n");
800
+ #endif
801
+ advance(lexer);
802
+ lexer->mark_end(lexer);
803
+ lexer->result_symbol = BACKTICK;
804
+ return true;
805
+ }
806
+ } else if (lexer->lookahead == '`') {
807
+ // Exiting a string context
808
+ exit_context(scanner, CTX_BACKTICK);
809
+
810
+ #if DEBUG
811
+ fprintf(stderr, "SCANNER: Exiting backtick context\n");
812
+ #endif
813
+
814
+ advance(lexer);
815
+ lexer->mark_end(lexer);
816
+ lexer->result_symbol = BACKTICK;
817
+ return true;
818
+ }
819
+ }
820
+
821
+ #if DEBUG
822
+ fprintf(stderr,
823
+ "DEBUG: scan() start - was_just_bare_dollar=%s, lookahead='%c'\n",
824
+ was_just_bare_dollar ? "true" : "false", lexer->lookahead);
825
+ #endif
826
+
827
+ // Resolve and absorb newlines when requested
828
+ if (valid_symbols[NEWLINE] && !in_error_recovery(valid_symbols)) {
829
+ #if DEBUG
830
+ fprintf(stderr, "SCANNER: NEWLINE handler, lookahead='%c'\n",
831
+ lexer->lookahead);
832
+ #endif
833
+ skip_ws(lexer);
834
+ if (lexer->lookahead == '\n') {
835
+ while (iswspace(lexer->lookahead)) {
836
+ skip(lexer);
837
+ }
838
+ was_just_newline = scanner->just_newline = true;
839
+ lexer->mark_end(lexer);
840
+ lexer->result_symbol = NEWLINE;
841
+ return true;
842
+ }
843
+ else if (lexer->lookahead == '\\') {
844
+ lexer->mark_end(lexer);
845
+ skip(lexer);
846
+ if (lexer->lookahead == '\n') {
847
+ // Just ignore the newline
848
+ skip(lexer);
849
+ skip_ws(lexer);
850
+ } else {
851
+ // we consumed things we should not have
852
+ lexer->mark_end(lexer);
853
+ }
854
+ }
855
+ }
856
+
857
+ // Dedicated context-aware brace handler - handles closing braces for
858
+ // different contexts
859
+ if (valid_symbols[CLOSING_BRACE] && !in_error_recovery(valid_symbols)) {
860
+ context_type_t active = get_current_context(scanner);
861
+
862
+ skip_wsnl(lexer);
863
+ if (lexer->lookahead == '}') {
864
+ #if DEBUG
865
+ fprintf(stderr,
866
+ "SCANNER: Detected } closing brace, active "
867
+ "context=%d\n",
868
+ active);
869
+ #endif
870
+ if (active == CTX_PARAMETER ||
871
+ active == CTX_PARAMETER_PATTERN_SUFFIX ||
872
+ active == CTX_PARAMETER_PATTERN_SUBSTITUTE) {
873
+ #if DEBUG
874
+ fprintf(stderr,
875
+ "SCANNER: Exiting parameter expansion context on }\n");
876
+ #endif
877
+ exit_context(scanner, active);
878
+ lexer->result_symbol = CLOSING_BRACE;
879
+ advance(lexer);
880
+ lexer->mark_end(lexer);
881
+ return true;
882
+ } else if (active == CTX_BRACE_EXPANSION) {
883
+ #if DEBUG
884
+ fprintf(stderr,
885
+ "SCANNER: Exiting brace expression context on }\n");
886
+ #endif
887
+ exit_context(scanner, active);
888
+ lexer->result_symbol = CLOSING_BRACE;
889
+ advance(lexer);
890
+ lexer->mark_end(lexer);
891
+ return true;
892
+ } else if (active == CTX_COMPOUND) {
893
+ #if DEBUG
894
+ fprintf(stderr,
895
+ "SCANNER: Exiting compound expression context on }\n");
896
+ #endif
897
+ exit_context(scanner, active);
898
+ lexer->result_symbol = CLOSING_BRACE;
899
+ advance(lexer);
900
+ lexer->mark_end(lexer);
901
+ return true;
902
+ }
903
+
904
+ // Note: CTX_BRACE_EXPANSION closing braces are handled by grammar
905
+ // as token.immediate('}')
906
+ }
907
+ }
908
+
909
+ // Handle BARE_DOLLAR for parameter expansion: $ followed by {
910
+ if (valid_symbols[BARE_DOLLAR] && !in_error_recovery(valid_symbols)) {
911
+ #if DEBUG
912
+ fprintf(stderr,
913
+ "SCANNER: Entering BARE_DOLLAR handler, lookahead='%c'\n",
914
+ lexer->lookahead);
915
+ #endif
916
+
917
+ // Only skip whitespace if we're starting with whitespace
918
+ // This preserves whitespace significance for concatenation
919
+ #if DEBUG
920
+ fprintf(stderr,
921
+ "SCANNER: BARE_DOLLAR whitespace check: "
922
+ "valid_symbols[CONCAT]=%d, lookahead='%c'\n",
923
+ valid_symbols[CONCAT], lexer->lookahead);
924
+ #endif
925
+ if (!valid_symbols[CONCAT] &&
926
+ (lexer->lookahead == ' ' || lexer->lookahead == '\t')) {
927
+ #if DEBUG
928
+ fprintf(stderr, "SCANNER: BARE_DOLLAR skipping whitespace\n");
929
+ #endif
930
+ while ((lexer->lookahead == ' ' || lexer->lookahead == '\t') &&
931
+ !lexer->eof(lexer)) {
932
+ skip(lexer);
933
+ }
934
+ }
935
+
936
+ if (lexer->lookahead == '$') {
937
+ #if DEBUG
938
+ fprintf(stderr, "SCANNER: Found $ character\n");
939
+ #endif
940
+ advance(lexer);
941
+ if (lexer->lookahead != '\"') {
942
+ lexer->mark_end(lexer);
943
+ lexer->result_symbol = BARE_DOLLAR;
944
+ was_just_bare_dollar = scanner->just_returned_bare_dollar =
945
+ true;
946
+ return true;
947
+ }
948
+ #if DEBUG
949
+ fprintf(stderr, "SCANNER: Not ${...} pattern, returning false\n");
950
+ #endif
951
+ }
952
+ #if DEBUG
953
+ fprintf(stderr, "SCANNER: No $ character found, continuing\n");
954
+ #endif
955
+ }
956
+
957
+ // Must be after BARE_DOLLAR to avoid conflict
958
+ // Handle PEEK_BARE_DOLLAR for concatenation: check if next non-whitespace
959
+ // token is $ without consuming
960
+ if (valid_symbols[PEEK_BARE_DOLLAR] && !in_error_recovery(valid_symbols)) {
961
+ #if DEBUG
962
+ fprintf(stderr,
963
+ "SCANNER: Entering PEEK_BARE_DOLLAR handler, lookahead='%c'\n",
964
+ lexer->lookahead);
965
+ #endif
966
+
967
+ if (lexer->lookahead == '$') {
968
+ #if DEBUG
969
+ fprintf(stderr, "SCANNER: PEEK found $ character, returning "
970
+ "PEEK_BARE_DOLLAR\n");
971
+ #endif
972
+ lexer->result_symbol = PEEK_BARE_DOLLAR;
973
+ return true;
974
+ }
975
+
976
+ #if DEBUG
977
+ fprintf(stderr, "SCANNER: PEEK did not find $ character\n");
978
+ #endif
979
+ }
980
+
981
+ // Handle BRACE_START - if we're in parameter expansion context, this is
982
+ // part of ${
983
+ if (valid_symbols[BRACE_START] && !in_error_recovery(valid_symbols)) {
984
+ if (lexer->lookahead == '{') {
985
+ if (was_just_bare_dollar) {
986
+ advance(lexer);
987
+ was_just_bare_dollar = scanner->just_returned_bare_dollar =
988
+ false; // Reset flag
989
+ lexer->result_symbol = BRACE_START;
990
+ lexer->mark_end(lexer);
991
+ // This is ${...} - increment expansion depth
992
+ enter_context(scanner, CTX_PARAMETER);
993
+ return true;
994
+ }
995
+ }
996
+ // If not after $, we may need to consume spaces
997
+ skip_ws(lexer);
998
+ if (lexer->lookahead == '{') {
999
+ advance(lexer);
1000
+ lexer->result_symbol = BRACE_START;
1001
+ lexer->mark_end(lexer);
1002
+ // This is a { not directly after $ - enter compound context
1003
+ enter_context(scanner, CTX_COMPOUND);
1004
+ return true;
1005
+ }
1006
+ }
1007
+
1008
+ // Handle OPENING_PAREN / DOUBLE_OPENING_PAREN (with or without a preceding BARE_DOLLAR) and ZSH_EXTENDED_GLOB_FLAGS
1009
+ if ((valid_symbols[OPENING_PAREN] || valid_symbols[DOUBLE_OPENING_PAREN] ||
1010
+ valid_symbols[ZSH_EXTENDED_GLOB_FLAGS]) &&
1011
+ !in_error_recovery(valid_symbols)) {
1012
+ skip_ws(lexer);
1013
+ if (lexer->lookahead == '(') {
1014
+ advance(lexer);
1015
+ lexer->mark_end(lexer);
1016
+
1017
+ if (was_just_bare_dollar) {
1018
+ #if DEBUG
1019
+ fprintf(stderr, "SCANNER: Detected OPENING_PAREN after "
1020
+ "BARE_DOLLAR\n");
1021
+ #endif
1022
+ if (lexer->lookahead == '(' &&
1023
+ valid_symbols[DOUBLE_OPENING_PAREN]) {
1024
+ advance(lexer);
1025
+ lexer->mark_end(lexer);
1026
+ // This is $((...)) - increment arithmetic depth
1027
+ was_just_bare_dollar = scanner->just_returned_bare_dollar =
1028
+ false; // Reset flag
1029
+ enter_context(scanner, CTX_ARITHMETIC);
1030
+ lexer->result_symbol = DOUBLE_OPENING_PAREN;
1031
+ return true;
1032
+ } else if (valid_symbols[OPENING_PAREN]) {
1033
+ // This is $(...) - increment command_depth
1034
+ was_just_bare_dollar = scanner->just_returned_bare_dollar =
1035
+ false; // Reset flag
1036
+ enter_context(scanner, CTX_COMMAND);
1037
+ lexer->result_symbol = OPENING_PAREN;
1038
+ return true;
1039
+ }
1040
+ } else if (lexer->lookahead == '(' &&
1041
+ valid_symbols[DOUBLE_OPENING_PAREN]) {
1042
+ advance(lexer);
1043
+ lexer->mark_end(lexer);
1044
+ // This is ((...)) - increment arithmetic depth
1045
+ enter_context(scanner, CTX_ARITHMETIC);
1046
+ lexer->result_symbol = DOUBLE_OPENING_PAREN;
1047
+ return true;
1048
+ } else if (valid_symbols[OPENING_PAREN] ||
1049
+ valid_symbols[ZSH_EXTENDED_GLOB_FLAGS]) {
1050
+ // Handle ZSH_EXTENDED_GLOB_FLAGS - (#flags) patterns
1051
+ if (lexer->lookahead == '#' &&
1052
+ valid_symbols[ZSH_EXTENDED_GLOB_FLAGS]) {
1053
+ advance(lexer);
1054
+
1055
+ // Check for valid flag characters
1056
+ bool found_flags = false;
1057
+ while (
1058
+ lexer->lookahead &&
1059
+ (iswalnum(lexer->lookahead) ||
1060
+ lexer->lookahead == '.' || lexer->lookahead == 'i' ||
1061
+ lexer->lookahead == 'q' || lexer->lookahead == 'b' ||
1062
+ lexer->lookahead == 'm' || lexer->lookahead == 'n' ||
1063
+ lexer->lookahead == 's' || lexer->lookahead == 'B' ||
1064
+ lexer->lookahead == 'I' || lexer->lookahead == 'N' ||
1065
+ lexer->lookahead == 'U' || lexer->lookahead == 'X' ||
1066
+ lexer->lookahead == 'c' || lexer->lookahead == 'e' ||
1067
+ lexer->lookahead == 'l' || lexer->lookahead == 'f' ||
1068
+ lexer->lookahead == 'a' || lexer->lookahead == 'C' ||
1069
+ lexer->lookahead == 'o')) {
1070
+ found_flags = true;
1071
+ advance(lexer);
1072
+ }
1073
+
1074
+ if (found_flags && lexer->lookahead == ')') {
1075
+ advance(lexer);
1076
+ lexer->mark_end(lexer);
1077
+ lexer->result_symbol = ZSH_EXTENDED_GLOB_FLAGS;
1078
+ return true;
1079
+ }
1080
+
1081
+ // If we get here, it's not a valid glob flags pattern
1082
+ // Reset and let other tokens handle it
1083
+ return false;
1084
+ }
1085
+ if (valid_symbols[OPENING_PAREN]) {
1086
+ #if DEBUG
1087
+ fprintf(stderr, "SCANNER: Detected OPENING_PAREN NOT AFTER "
1088
+ "BARE_DOLLAR\n");
1089
+ #endif
1090
+ was_just_bare_dollar = scanner->just_returned_bare_dollar =
1091
+ false; // Reset flag
1092
+ lexer->mark_end(lexer);
1093
+ enter_context(scanner, CTX_COMMAND);
1094
+ lexer->result_symbol = OPENING_PAREN;
1095
+ return true;
1096
+ }
1097
+ }
1098
+ }
1099
+ }
1100
+
1101
+ if ((valid_symbols[OPENING_BRACKET] || valid_symbols[TEST_COMMAND_START]) &&
1102
+ !in_error_recovery(valid_symbols)) {
1103
+ #if DEBUG
1104
+ fprintf(stderr,
1105
+ "DEBUG: CHECKING TEST_COMMAND_START=%d OPENING_BRACKET=%d "
1106
+ "lookahead=%c\n",
1107
+ valid_symbols[TEST_COMMAND_START],
1108
+ valid_symbols[OPENING_BRACKET], lexer->lookahead);
1109
+ #endif
1110
+ skip_wsnl(lexer);
1111
+ #if DEBUG
1112
+ fprintf(stderr,
1113
+ "DEBUG: CHECKING TEST_COMMAND_START=%d OPENING_BRACKET=%d "
1114
+ "lookahead=%c\n",
1115
+ valid_symbols[TEST_COMMAND_START],
1116
+ valid_symbols[OPENING_BRACKET], lexer->lookahead);
1117
+ #endif
1118
+ if (lexer->lookahead == '[') {
1119
+ advance(lexer);
1120
+
1121
+ if (lexer->lookahead == '[' && valid_symbols[TEST_COMMAND_START]) {
1122
+ advance(lexer);
1123
+ was_just_bare_dollar = scanner->just_returned_bare_dollar =
1124
+ false; // Reset flag
1125
+ lexer->result_symbol = TEST_COMMAND_START;
1126
+ lexer->mark_end(lexer);
1127
+
1128
+ #if DEBUG
1129
+ fprintf(stderr, "DEBUG: Detected TEST_COMMAND_START [[\n");
1130
+ #endif
1131
+ // Enter test command context
1132
+ enter_context(scanner, CTX_TEST);
1133
+ return true;
1134
+ } else if (was_just_bare_dollar && valid_symbols[OPENING_BRACKET]) {
1135
+ // This is $[
1136
+ was_just_bare_dollar = scanner->just_returned_bare_dollar =
1137
+ false; // Reset flag
1138
+ lexer->result_symbol = OPENING_BRACKET;
1139
+ lexer->mark_end(lexer);
1140
+
1141
+ #if DEBUG
1142
+ fprintf(stderr, "DEBUG: Detected OPENING_BRACKET $[\n");
1143
+ #endif
1144
+ enter_context(scanner, CTX_ARITHMETIC);
1145
+ return true;
1146
+ } else if (valid_symbols[OPENING_BRACKET]) {
1147
+ // This is single [
1148
+ was_just_bare_dollar = scanner->just_returned_bare_dollar =
1149
+ false; // Reset flag
1150
+ lexer->result_symbol = OPENING_BRACKET;
1151
+ lexer->mark_end(lexer);
1152
+
1153
+ #if DEBUG
1154
+ fprintf(stderr, "DEBUG: Detected OPENING_BRACKET [\n");
1155
+ #endif
1156
+ return true;
1157
+ }
1158
+ }
1159
+ }
1160
+
1161
+ // Handle TEST_COMMAND_END ]] and single CLOSING_BRACKET ]
1162
+ if ((valid_symbols[TEST_COMMAND_END] || valid_symbols[CLOSING_BRACKET]) &&
1163
+ !in_error_recovery(valid_symbols)) {
1164
+ skip_ws(lexer);
1165
+ if (lexer->lookahead == ']') {
1166
+ advance(lexer);
1167
+ if (lexer->lookahead == ']' && valid_symbols[TEST_COMMAND_END]) {
1168
+ advance(lexer);
1169
+ lexer->result_symbol = TEST_COMMAND_END;
1170
+ lexer->mark_end(lexer);
1171
+
1172
+ #if DEBUG
1173
+ fprintf(stderr, "DEBUG: Detected TEST_COMMAND_END ]]\n");
1174
+ #endif
1175
+ // Exit test command context
1176
+ exit_context(scanner, CTX_TEST);
1177
+ return true;
1178
+ } else if (valid_symbols[CLOSING_BRACKET]) {
1179
+ // This is single ]
1180
+ was_just_bare_dollar = scanner->just_returned_bare_dollar =
1181
+ false; // Reset flag
1182
+ lexer->result_symbol = CLOSING_BRACKET;
1183
+ lexer->mark_end(lexer);
1184
+
1185
+ #if DEBUG
1186
+ fprintf(stderr, "DEBUG: Detected CLOSING_BRACKET ]\n");
1187
+ #endif
1188
+ return true;
1189
+ }
1190
+ // If only one ], don't consume it - let normal parsing handle it
1191
+ return false;
1192
+ }
1193
+ }
1194
+
1195
+ if ((valid_symbols[CLOSING_PAREN] || valid_symbols[CLOSING_DOUBLE_PAREN]) &&
1196
+ !in_error_recovery(valid_symbols)) {
1197
+ skip_ws(lexer);
1198
+ if (lexer->lookahead == ')') {
1199
+ advance(lexer);
1200
+
1201
+ if (lexer->lookahead == ')' &&
1202
+ valid_symbols[CLOSING_DOUBLE_PAREN]) {
1203
+ advance(lexer);
1204
+ lexer->result_symbol = CLOSING_DOUBLE_PAREN;
1205
+ lexer->mark_end(lexer);
1206
+
1207
+ // Exit arithmetic context
1208
+ exit_context(scanner, CTX_ARITHMETIC);
1209
+ return true;
1210
+ } else if (valid_symbols[CLOSING_PAREN]) {
1211
+ // This is single )
1212
+ lexer->result_symbol = CLOSING_PAREN;
1213
+ lexer->mark_end(lexer);
1214
+
1215
+ // Exit relevant context
1216
+
1217
+ if (get_current_context(scanner) == CTX_COMMAND) {
1218
+ exit_context(scanner, CTX_COMMAND);
1219
+ } else if (get_current_context(scanner) == CTX_ARITHMETIC) {
1220
+ exit_context(scanner, CTX_ARITHMETIC);
1221
+ }
1222
+ return true;
1223
+ }
1224
+ // If only one ), don't consume it - let normal parsing handle it
1225
+ return false;
1226
+ }
1227
+ }
1228
+
1229
+ // Handle PATTERN_START - emitted after pattern operators in parameter
1230
+ // expansions
1231
+ if (valid_symbols[PATTERN_START] && !in_error_recovery(valid_symbols)) {
1232
+ if (get_current_context(scanner) == CTX_PARAMETER &&
1233
+ lexer->lookahead !=
1234
+ '}') { // Don't emit if expansion is about to end
1235
+
1236
+ // Determine pattern context based on what type of pattern we're
1237
+ // entering
1238
+ #if DEBUG
1239
+ fprintf(stderr,
1240
+ "DEBUG: PATTERN_START emitting for substitution, "
1241
+ "lookahead='%c'\n",
1242
+ lexer->lookahead);
1243
+ #endif
1244
+ exit_context(scanner, CTX_PARAMETER);
1245
+ enter_context(scanner, CTX_PARAMETER_PATTERN_SUBSTITUTE);
1246
+ #if DEBUG
1247
+ fprintf(stderr, "DEBUG: Context after transition: %d\n",
1248
+ get_current_context(scanner));
1249
+ #endif
1250
+ lexer->result_symbol = PATTERN_START;
1251
+ lexer->mark_end(lexer);
1252
+ return true;
1253
+ }
1254
+ }
1255
+
1256
+ // Handle PATTERN_SUFFIX_START - emitted after pattern operators in
1257
+ // parameter expansions
1258
+ if (valid_symbols[PATTERN_SUFFIX_START] &&
1259
+ !in_error_recovery(valid_symbols)) {
1260
+ if (get_current_context(scanner) == CTX_PARAMETER &&
1261
+ lexer->lookahead !=
1262
+ '}') { // Don't emit if expansion is about to end
1263
+
1264
+ // Determine pattern context based on what type of pattern we're
1265
+ // entering % # patterns are suffix/prefix removal
1266
+ #if DEBUG
1267
+ fprintf(stderr,
1268
+ "DEBUG: PATTERN_START emitting for suffix/prefix, "
1269
+ "lookahead='%c'\n",
1270
+ lexer->lookahead);
1271
+ #endif
1272
+
1273
+ exit_context(scanner, CTX_PARAMETER);
1274
+ enter_context(scanner, CTX_PARAMETER_PATTERN_SUFFIX);
1275
+ #if DEBUG
1276
+ fprintf(stderr, "DEBUG: Context after transition: %d\n",
1277
+ get_current_context(scanner));
1278
+ #endif
1279
+ lexer->result_symbol = PATTERN_SUFFIX_START;
1280
+ lexer->mark_end(lexer);
1281
+ return true;
1282
+ }
1283
+ }
1284
+
1285
+ // Handle hash operations in parameter expansion context
1286
+ if (in_parameter_expansion(scanner) && lexer->lookahead == '#' &&
1287
+ (valid_symbols[HASH_PATTERN] || valid_symbols[DOUBLE_HASH_PATTERN]) &&
1288
+ !in_error_recovery(valid_symbols)) {
1289
+ #if DEBUG
1290
+ fprintf(stderr, "SCANNER: Hash operation detected\n");
1291
+ #endif
1292
+ advance(lexer); // consume first #
1293
+
1294
+ if (lexer->lookahead == '#') {
1295
+ // Double hash: ##pattern
1296
+ if (valid_symbols[DOUBLE_HASH_PATTERN]) {
1297
+ #if DEBUG
1298
+ fprintf(stderr, "SCANNER: Returning DOUBLE_HASH_PATTERN\n");
1299
+ #endif
1300
+ advance(lexer); // consume second #
1301
+ lexer->result_symbol = DOUBLE_HASH_PATTERN;
1302
+ lexer->mark_end(lexer);
1303
+ return true;
1304
+ }
1305
+ } else {
1306
+ // Single hash: #pattern
1307
+ if (valid_symbols[HASH_PATTERN]) {
1308
+ #if DEBUG
1309
+ fprintf(stderr, "SCANNER: Returning HASH_PATTERN\n");
1310
+ #endif
1311
+ lexer->result_symbol = HASH_PATTERN;
1312
+ lexer->mark_end(lexer);
1313
+ return true;
1314
+ }
1315
+ }
1316
+ #if DEBUG
1317
+ fprintf(stderr, "SCANNER: Hash operation not matched\n");
1318
+ #endif
1319
+ return false;
1320
+ }
1321
+
1322
+ // Array operators: ${var[*]} and ${var[@]}
1323
+ if ((valid_symbols[ARRAY_STAR_TOKEN] || valid_symbols[ARRAY_AT_TOKEN]) &&
1324
+ !in_error_recovery(valid_symbols)) {
1325
+ if (lexer->lookahead == '*' && valid_symbols[ARRAY_STAR_TOKEN] &&
1326
+ !valid_symbols[REGEX] && !valid_symbols[REGEX_NO_SLASH] &&
1327
+ !valid_symbols[REGEX_NO_SPACE]) {
1328
+ lexer->result_symbol = ARRAY_STAR_TOKEN;
1329
+ advance(lexer);
1330
+ lexer->mark_end(lexer);
1331
+ return true;
1332
+ }
1333
+ if (lexer->lookahead == '@' && valid_symbols[ARRAY_AT_TOKEN]) {
1334
+ lexer->result_symbol = ARRAY_AT_TOKEN;
1335
+ advance(lexer);
1336
+ lexer->mark_end(lexer);
1337
+ return true;
1338
+ }
1339
+ }
1340
+
1341
+ if (valid_symbols[EMPTY_VALUE]) {
1342
+ if (iswspace(lexer->lookahead) || lexer->eof(lexer) ||
1343
+ lexer->lookahead == ';' || lexer->lookahead == '&' ||
1344
+ lexer->lookahead == '}') {
1345
+ lexer->mark_end(lexer);
1346
+ lexer->result_symbol = EMPTY_VALUE;
1347
+ return true;
1348
+ }
1349
+ }
1350
+
1351
+ if ((valid_symbols[HEREDOC_BODY_BEGINNING] ||
1352
+ valid_symbols[SIMPLE_HEREDOC_BODY]) &&
1353
+ scanner->heredocs.size > 0 &&
1354
+ !array_back(&scanner->heredocs)->started &&
1355
+ !in_error_recovery(valid_symbols)) {
1356
+ return scan_heredoc_content(scanner, lexer, HEREDOC_BODY_BEGINNING,
1357
+ SIMPLE_HEREDOC_BODY);
1358
+ }
1359
+
1360
+ if (valid_symbols[HEREDOC_END] && scanner->heredocs.size > 0) {
1361
+ Heredoc *heredoc = array_back(&scanner->heredocs);
1362
+ if (scan_heredoc_end_identifier(heredoc, lexer)) {
1363
+ array_delete(&heredoc->current_leading_word);
1364
+ array_delete(&heredoc->delimiter);
1365
+ array_pop(&scanner->heredocs);
1366
+ lexer->result_symbol = HEREDOC_END;
1367
+ return true;
1368
+ }
1369
+ }
1370
+
1371
+ if (valid_symbols[HEREDOC_CONTENT] && scanner->heredocs.size > 0 &&
1372
+ array_back(&scanner->heredocs)->started &&
1373
+ !in_error_recovery(valid_symbols)) {
1374
+ return scan_heredoc_content(scanner, lexer, HEREDOC_CONTENT,
1375
+ HEREDOC_END);
1376
+ }
1377
+
1378
+ if (valid_symbols[HEREDOC_START] && !in_error_recovery(valid_symbols) &&
1379
+ scanner->heredocs.size > 0) {
1380
+ #if DEBUG
1381
+ fprintf(stderr,
1382
+ "DEBUG: HEREDOC_START check - heredocs.size=%u, "
1383
+ "in_error_recovery=%s\n",
1384
+ scanner->heredocs.size,
1385
+ in_error_recovery(valid_symbols) ? "true" : "false");
1386
+ #endif
1387
+ return scan_heredoc_start(array_back(&scanner->heredocs), lexer);
1388
+ }
1389
+
1390
+ if (valid_symbols[TEST_OPERATOR] && !valid_symbols[EXPANSION_WORD]) {
1391
+ skip_ws(lexer);
1392
+ if (lexer->lookahead == '\\') {
1393
+ if (valid_symbols[EXTGLOB_PATTERN]) {
1394
+ goto extglob_pattern;
1395
+ }
1396
+ if (valid_symbols[REGEX_NO_SPACE]) {
1397
+ goto regex;
1398
+ }
1399
+ skip(lexer);
1400
+
1401
+ if (lexer->eof(lexer)) {
1402
+ return false;
1403
+ }
1404
+
1405
+ if (lexer->lookahead == '\r') {
1406
+ skip(lexer);
1407
+ if (lexer->lookahead == '\n') {
1408
+ skip(lexer);
1409
+ }
1410
+ } else if (lexer->lookahead == '\n') {
1411
+ skip(lexer);
1412
+ } else {
1413
+ return false;
1414
+ }
1415
+
1416
+ while (iswspace(lexer->lookahead)) {
1417
+ skip(lexer);
1418
+ }
1419
+ }
1420
+
1421
+ if (lexer->lookahead == '\n' && !valid_symbols[NEWLINE]) {
1422
+ skip(lexer);
1423
+
1424
+ while (iswspace(lexer->lookahead)) {
1425
+ skip(lexer);
1426
+ }
1427
+ }
1428
+
1429
+ if (lexer->lookahead == '-') {
1430
+ advance(lexer);
1431
+
1432
+ bool advanced_once = false;
1433
+ while (iswalpha(lexer->lookahead)) {
1434
+ advanced_once = true;
1435
+ advance(lexer);
1436
+ }
1437
+
1438
+ if (iswspace(lexer->lookahead) && advanced_once) {
1439
+ lexer->mark_end(lexer);
1440
+ advance(lexer);
1441
+ context_type_t ctx = get_current_context(scanner);
1442
+ if (lexer->lookahead == '}' &&
1443
+ (ctx == CTX_PARAMETER ||
1444
+ ctx == CTX_PARAMETER_PATTERN_SUFFIX ||
1445
+ ctx == CTX_PARAMETER_PATTERN_SUBSTITUTE)) {
1446
+ if (valid_symbols[EXPANSION_WORD]) {
1447
+ lexer->mark_end(lexer);
1448
+ lexer->result_symbol = EXPANSION_WORD;
1449
+ return true;
1450
+ }
1451
+ return false;
1452
+ }
1453
+ lexer->result_symbol = TEST_OPERATOR;
1454
+ return true;
1455
+ }
1456
+ if (iswspace(lexer->lookahead) && valid_symbols[EXTGLOB_PATTERN]) {
1457
+ lexer->result_symbol = EXTGLOB_PATTERN;
1458
+ return true;
1459
+ }
1460
+ }
1461
+
1462
+ if (valid_symbols[RAW_DOLLAR] && !in_error_recovery(valid_symbols) &&
1463
+ scan_raw_dollar(lexer, valid_symbols)) {
1464
+ return true;
1465
+ }
1466
+ }
1467
+
1468
+ if (valid_symbols[SIMPLE_VARIABLE_NAME] &&
1469
+ !in_error_recovery(valid_symbols)) {
1470
+ bool in_param_expand = in_parameter_expansion_context(scanner);
1471
+
1472
+ #if DEBUG
1473
+ fprintf(stderr, "SCANNER: trying SIMPLE_VARIABLE_NAME\n");
1474
+ #endif
1475
+
1476
+ skip_ws(lexer);
1477
+ if (iswalpha(lexer->lookahead) || lexer->lookahead == '_') {
1478
+ int consumed = 0;
1479
+ while (iswalnum(lexer->lookahead) || lexer->lookahead == '_') {
1480
+ advance(lexer);
1481
+ consumed++;
1482
+ }
1483
+
1484
+ if (consumed > 0) {
1485
+ lexer->mark_end(lexer);
1486
+ was_just_bare_dollar = scanner->just_returned_bare_dollar =
1487
+ false;
1488
+ scanner->just_returned_variable_name = true;
1489
+ lexer->result_symbol = SIMPLE_VARIABLE_NAME;
1490
+ #if DEBUG
1491
+ fprintf(stderr, "SCANNER: SIMPLE_VARIABLE_NAME found\n");
1492
+ #endif
1493
+ return true;
1494
+ }
1495
+ }
1496
+ }
1497
+
1498
+ if (valid_symbols[SPECIAL_VARIABLE_NAME] &&
1499
+ !in_error_recovery(valid_symbols)) {
1500
+ // '*', '@', '?', '-', '$', '_', or any digit; '!' and '#' only outside parameter expansion
1501
+ skip_ws(lexer);
1502
+ bool in_param_expand = in_parameter_expansion_context(scanner);
1503
+ #if DEBUG
1504
+ if (in_param_expand) {
1505
+ if (lexer->lookahead == '!' || lexer->lookahead == '#')
1506
+ fprintf(stderr, "SCANNER: skipping flag chars as part of "
1507
+ "SPECIAL_VARIABLE_NAME\n");
1508
+ }
1509
+ #endif
1510
+ if (lexer->lookahead == '*' || lexer->lookahead == '@' ||
1511
+ lexer->lookahead == '?' || lexer->lookahead == '-' ||
1512
+ (lexer->lookahead == '!' && !in_param_expand) ||
1513
+ (lexer->lookahead == '#' && !in_param_expand) ||
1514
+ lexer->lookahead == '$' || lexer->lookahead == '_' ||
1515
+ iswdigit(lexer->lookahead)) {
1516
+ advance(lexer);
1517
+ lexer->mark_end(lexer);
1518
+ was_just_bare_dollar = scanner->just_returned_bare_dollar = false;
1519
+ was_just_variable_name = scanner->just_returned_variable_name =
1520
+ true;
1521
+ lexer->result_symbol = SPECIAL_VARIABLE_NAME;
1522
+ return true;
1523
+ }
1524
+ }
1525
+
1526
+ if ((valid_symbols[VARIABLE_NAME] || valid_symbols[FILE_DESCRIPTOR] ||
1527
+ valid_symbols[HEREDOC_ARROW]) &&
1528
+ !valid_symbols[REGEX_NO_SLASH] && !in_error_recovery(valid_symbols)) {
1529
+ for (;;) {
1530
+ if ((lexer->lookahead == ' ' || lexer->lookahead == '\t' ||
1531
+ lexer->lookahead == '\r' ||
1532
+ (lexer->lookahead == '\n' && !valid_symbols[NEWLINE])) &&
1533
+ !valid_symbols[EXPANSION_WORD] && !valid_symbols[CONCAT]) {
1534
+ #if DEBUG
1535
+ fprintf(stderr, "SCANNER: VARIABLE_NAME skipped ws\n");
1536
+ #endif
1537
+ // Only skip whitespace if CONCAT is not valid
1538
+ skip(lexer);
1539
+ } else if (lexer->lookahead == '\\') {
1540
+ skip(lexer);
1541
+
1542
+ if (lexer->eof(lexer)) {
1543
+ lexer->mark_end(lexer);
1544
+ was_just_bare_dollar = scanner->just_returned_bare_dollar =
1545
+ false; // Reset flag
1546
+ #if DEBUG
1547
+ fprintf(stderr, "SCANNER: VARIABLE_NAME after \\\n");
1548
+ #endif
1549
+ lexer->result_symbol = VARIABLE_NAME;
1550
+ scanner->just_returned_variable_name = true;
1551
+ return true;
1552
+ }
1553
+
1554
+ if (lexer->lookahead == '\r') {
1555
+ skip(lexer);
1556
+ }
1557
+ if (lexer->lookahead == '\n') {
1558
+ skip(lexer);
1559
+ } else {
1560
+ if (lexer->lookahead == '\\' &&
1561
+ valid_symbols[EXPANSION_WORD]) {
1562
+ goto expansion_word;
1563
+ }
1564
+ return false;
1565
+ }
1566
+ } else {
1567
+ break;
1568
+ }
1569
+ }
1570
+
1571
+ // no '*', '@', '?', '-', '0', '_', '#'
1572
+ if (!valid_symbols[EXPANSION_WORD] &&
1573
+ (lexer->lookahead == '*' || lexer->lookahead == '@' ||
1574
+ lexer->lookahead == '?' || lexer->lookahead == '-' ||
1575
+ lexer->lookahead == '0' || lexer->lookahead == '_' ||
1576
+ lexer->lookahead == '#')) {
1577
+ lexer->mark_end(lexer);
1578
+ advance(lexer);
1579
+ if (lexer->lookahead == '=' || lexer->lookahead == '[' ||
1580
+ lexer->lookahead == ':' || lexer->lookahead == '-' ||
1581
+ lexer->lookahead == '%' || lexer->lookahead == '/') {
1582
+ return false;
1583
+ }
1584
+ if (valid_symbols[EXTGLOB_PATTERN] && iswspace(lexer->lookahead)) {
1585
+ lexer->mark_end(lexer);
1586
+ lexer->result_symbol = EXTGLOB_PATTERN;
1587
+ return true;
1588
+ }
1589
+ }
1590
+
1591
+ if (valid_symbols[HEREDOC_ARROW] && lexer->lookahead == '<') {
1592
+ advance(lexer);
1593
+ if (lexer->lookahead == '<') {
1594
+ advance(lexer);
1595
+ if (lexer->lookahead == '-') {
1596
+ advance(lexer);
1597
+ Heredoc heredoc = heredoc_new();
1598
+ heredoc.allows_indent = true;
1599
+ array_push(&scanner->heredocs, heredoc);
1600
+ #if DEBUG
1601
+ fprintf(stderr,
1602
+ "DEBUG: HEREDOC_ARROW_DASH - added heredoc, size "
1603
+ "now=%u\n",
1604
+ scanner->heredocs.size);
1605
+ #endif
1606
+ lexer->result_symbol = HEREDOC_ARROW_DASH;
1607
+ } else if (lexer->lookahead == '<' || lexer->lookahead == '=') {
1608
+ return false;
1609
+ } else {
1610
+ Heredoc heredoc = heredoc_new();
1611
+ array_push(&scanner->heredocs, heredoc);
1612
+ #if DEBUG
1613
+ fprintf(
1614
+ stderr,
1615
+ "DEBUG: HEREDOC_ARROW - added heredoc, size now=%u\n",
1616
+ scanner->heredocs.size);
1617
+ #endif
1618
+ lexer->result_symbol = HEREDOC_ARROW;
1619
+ }
1620
+ return true;
1621
+ }
1622
+ return false;
1623
+ }
1624
+
1625
+ bool is_number = true;
1626
+ if (iswdigit(lexer->lookahead)) {
1627
+ advance(lexer);
1628
+ } else if (iswalpha(lexer->lookahead) || lexer->lookahead == '_') {
1629
+ is_number = false;
1630
+ advance(lexer);
1631
+ } else {
1632
+ if (lexer->lookahead == '{') {
1633
+ goto brace_start;
1634
+ }
1635
+ if (valid_symbols[EXPANSION_WORD]) {
1636
+ goto expansion_word;
1637
+ }
1638
+ if (valid_symbols[EXTGLOB_PATTERN]) {
1639
+ goto extglob_pattern;
1640
+ }
1641
+ return false;
1642
+ }
1643
+
1644
+ for (;;) {
1645
+ if (iswdigit(lexer->lookahead)) {
1646
+ advance(lexer);
1647
+ } else if (iswalpha(lexer->lookahead) || lexer->lookahead == '_') {
1648
+ is_number = false;
1649
+ advance(lexer);
1650
+ } else {
1651
+ break;
1652
+ }
1653
+ }
1654
+
1655
+ if (is_number && valid_symbols[FILE_DESCRIPTOR] &&
1656
+ (lexer->lookahead == '>' || lexer->lookahead == '<')) {
1657
+ lexer->result_symbol = FILE_DESCRIPTOR;
1658
+ return true;
1659
+ }
1660
+
1661
+ if (valid_symbols[VARIABLE_NAME]) {
1662
+ if (lexer->lookahead == '+') {
1663
+ lexer->mark_end(lexer);
1664
+ advance(lexer);
1665
+ context_type_t ctx = get_current_context(scanner);
1666
+ if (lexer->lookahead == '=' || lexer->lookahead == ':' ||
1667
+ (ctx == CTX_PARAMETER ||
1668
+ ctx == CTX_PARAMETER_PATTERN_SUFFIX ||
1669
+ ctx == CTX_PARAMETER_PATTERN_SUBSTITUTE)) {
1670
+ #if DEBUG
1671
+ fprintf(stderr,
1672
+ "SCANNER: VARIABLE_NAME after + operator\n");
1673
+ #endif
1674
+ lexer->result_symbol = VARIABLE_NAME;
1675
+ was_just_bare_dollar = scanner->just_returned_bare_dollar =
1676
+ false;
1677
+ scanner->just_returned_variable_name = true;
1678
+ return true;
1679
+ }
1680
+ return false;
1681
+ }
1682
+ if (lexer->lookahead == '/') {
1683
+ return false;
1684
+ }
1685
+ context_type_t ctx = get_current_context(scanner);
1686
+ if (lexer->lookahead == '=' || lexer->lookahead == '[' ||
1687
+ lexer->lookahead == '%' ||
1688
+ (lexer->lookahead == '#' && !is_number) ||
1689
+ lexer->lookahead == '@' ||
1690
+ (lexer->lookahead == '-' &&
1691
+ (ctx == CTX_PARAMETER || ctx == CTX_PARAMETER_PATTERN_SUFFIX ||
1692
+ ctx == CTX_PARAMETER_PATTERN_SUBSTITUTE))) {
1693
+ lexer->mark_end(lexer);
1694
+ #if DEBUG
1695
+ fprintf(stderr, "SCANNER: VARIABLE_NAME after =\n");
1696
+ #endif
1697
+ lexer->result_symbol = VARIABLE_NAME;
1698
+ was_just_bare_dollar = scanner->just_returned_bare_dollar =
1699
+ false;
1700
+ scanner->just_returned_variable_name = true;
1701
+ return true;
1702
+ }
1703
+
1704
+ if (lexer->lookahead == '?') {
1705
+ lexer->mark_end(lexer);
1706
+ advance(lexer);
1707
+ lexer->result_symbol = VARIABLE_NAME;
1708
+ was_just_bare_dollar = scanner->just_returned_bare_dollar =
1709
+ false;
1710
+ #if DEBUG
1711
+ fprintf(stderr, "SCANNER: VARIABLE_NAME after ?\n");
1712
+ #endif
1713
+ scanner->just_returned_variable_name = true;
1714
+ return iswalpha(lexer->lookahead);
1715
+ }
1716
+ }
1717
+
1718
+ #if DEBUG
1719
+ fprintf(stderr, "DEBUG: expansion word not valud returning false\n");
1720
+ #endif
1721
+ return false;
1722
+ }
1723
+
1724
+ if (valid_symbols[BARE_DOLLAR] && !in_error_recovery(valid_symbols) &&
1725
+ scan_raw_dollar(lexer, valid_symbols)) {
1726
+ return true;
1727
+ }
1728
+
1729
+ regex:
1730
+ if ((valid_symbols[REGEX] || valid_symbols[REGEX_NO_SLASH] ||
1731
+ valid_symbols[REGEX_NO_SPACE]) &&
1732
+ !in_error_recovery(valid_symbols)) {
1733
+ if (valid_symbols[REGEX] || valid_symbols[REGEX_NO_SPACE]) {
1734
+ while (iswspace(lexer->lookahead)) {
1735
+ skip(lexer);
1736
+ }
1737
+ }
1738
+
1739
+ if ((lexer->lookahead != '"' && lexer->lookahead != '\'') ||
1740
+ ((lexer->lookahead == '$' || lexer->lookahead == '\'') &&
1741
+ valid_symbols[REGEX_NO_SLASH]) ||
1742
+ (lexer->lookahead == '\'' && valid_symbols[REGEX_NO_SPACE])) {
1743
+ typedef struct {
1744
+ bool done;
1745
+ bool advanced_once;
1746
+ bool found_non_alnumdollarunderdash;
1747
+ bool last_was_escape;
1748
+ bool in_single_quote;
1749
+ uint32_t paren_depth;
1750
+ uint32_t bracket_depth;
1751
+ uint32_t brace_depth;
1752
+ } State;
1753
+
1754
+ if (lexer->lookahead == '$' && valid_symbols[REGEX_NO_SLASH]) {
1755
+ lexer->mark_end(lexer);
1756
+ advance(lexer);
1757
+ if (lexer->lookahead == '(') {
1758
+ return false;
1759
+ }
1760
+ }
1761
+
1762
+ lexer->mark_end(lexer);
1763
+
1764
+ State state = {false, false, false, false, false, 0, 0, 0};
1765
+ while (!state.done) {
1766
+ if (state.in_single_quote) {
1767
+ if (lexer->lookahead == '\'') {
1768
+ state.in_single_quote = false;
1769
+ advance(lexer);
1770
+ lexer->mark_end(lexer);
1771
+
1772
+ // Track entering parameter expansion context
1773
+ enter_context(scanner, CTX_PARAMETER);
1774
+ }
1775
+ }
1776
+ switch (lexer->lookahead) {
1777
+ case '\\':
1778
+ state.last_was_escape = true;
1779
+ break;
1780
+ case '\0':
1781
+ return false;
1782
+ case '(':
1783
+ state.paren_depth++;
1784
+ state.last_was_escape = false;
1785
+ break;
1786
+ case '[':
1787
+ state.bracket_depth++;
1788
+ state.last_was_escape = false;
1789
+ break;
1790
+ case '{':
1791
+ if (!state.last_was_escape) {
1792
+ state.brace_depth++;
1793
+ }
1794
+ state.last_was_escape = false;
1795
+ break;
1796
+ case ')':
1797
+ if (state.paren_depth == 0) {
1798
+ state.done = true;
1799
+ }
1800
+ state.paren_depth--;
1801
+ state.last_was_escape = false;
1802
+ break;
1803
+ case ']':
1804
+ if (state.bracket_depth == 0) {
1805
+ state.done = true;
1806
+ }
1807
+ state.bracket_depth--;
1808
+ state.last_was_escape = false;
1809
+ break;
1810
+ case '}':
1811
+ if (state.brace_depth == 0) {
1812
+ state.done = true;
1813
+ }
1814
+ state.brace_depth--;
1815
+ state.last_was_escape = false;
1816
+ break;
1817
+ case '\'':
1818
+ // Enter or exit a single-quoted string.
1819
+ state.in_single_quote = !state.in_single_quote;
1820
+ advance(lexer);
1821
+ state.advanced_once = true;
1822
+ state.last_was_escape = false;
1823
+ continue;
1824
+ default:
1825
+ state.last_was_escape = false;
1826
+ break;
1827
+ }
1828
+
1829
+ if (!state.done) {
1830
+ if (valid_symbols[REGEX]) {
1831
+ bool was_space = !state.in_single_quote &&
1832
+ iswspace(lexer->lookahead);
1833
+ advance(lexer);
1834
+ state.advanced_once = true;
1835
+ if (!was_space || state.paren_depth > 0) {
1836
+ lexer->mark_end(lexer);
1837
+ }
1838
+ } else if (valid_symbols[REGEX_NO_SLASH]) {
1839
+ if (lexer->lookahead == '/') {
1840
+ lexer->mark_end(lexer);
1841
+ lexer->result_symbol = REGEX_NO_SLASH;
1842
+ return state.advanced_once;
1843
+ }
1844
+ if (lexer->lookahead == '\\') {
1845
+ advance(lexer);
1846
+ state.advanced_once = true;
1847
+ if (!lexer->eof(lexer) && lexer->lookahead != '[' &&
1848
+ lexer->lookahead != '/') {
1849
+ advance(lexer);
1850
+ lexer->mark_end(lexer);
1851
+ }
1852
+ } else {
1853
+ bool was_space = !state.in_single_quote &&
1854
+ iswspace(lexer->lookahead);
1855
+ advance(lexer);
1856
+ state.advanced_once = true;
1857
+ if (!was_space) {
1858
+ lexer->mark_end(lexer);
1859
+ }
1860
+ }
1861
+ } else if (valid_symbols[REGEX_NO_SPACE]) {
1862
+ if (lexer->lookahead == '\\') {
1863
+ state.found_non_alnumdollarunderdash = true;
1864
+ advance(lexer);
1865
+ if (!lexer->eof(lexer)) {
1866
+ advance(lexer);
1867
+ }
1868
+ } else if (lexer->lookahead == '$') {
1869
+ lexer->mark_end(lexer);
1870
+ advance(lexer);
1871
+ // do not parse a command
1872
+ // substitution
1873
+ if (lexer->lookahead == '(') {
1874
+ return false;
1875
+ }
1876
+ // end $ always means regex, e.g.
1877
+ // 99999999$
1878
+ if (iswspace(lexer->lookahead)) {
1879
+ lexer->result_symbol = REGEX_NO_SPACE;
1880
+ lexer->mark_end(lexer);
1881
+ return true;
1882
+ }
1883
+ } else {
1884
+ bool was_space = !state.in_single_quote &&
1885
+ iswspace(lexer->lookahead);
1886
+ if (was_space && state.paren_depth == 0) {
1887
+ lexer->mark_end(lexer);
1888
+ lexer->result_symbol = REGEX_NO_SPACE;
1889
+ return state.found_non_alnumdollarunderdash;
1890
+ }
1891
+ if (!iswalnum(lexer->lookahead) &&
1892
+ lexer->lookahead != '$' &&
1893
+ lexer->lookahead != '-' &&
1894
+ lexer->lookahead != '_') {
1895
+ state.found_non_alnumdollarunderdash = true;
1896
+ }
1897
+ advance(lexer);
1898
+ }
1899
+ }
1900
+ }
1901
+ }
1902
+
1903
+ lexer->result_symbol =
1904
+ valid_symbols[REGEX_NO_SLASH] ? REGEX_NO_SLASH
1905
+ : valid_symbols[REGEX_NO_SPACE] ? REGEX_NO_SPACE
1906
+ : REGEX;
1907
+ if (valid_symbols[REGEX] && !state.advanced_once) {
1908
+ #if DEBUG
1909
+ fprintf(stderr, "DEBUG: regex not valid returning false\n");
1910
+ #endif
1911
+ return false;
1912
+ }
1913
+ return true;
1914
+ }
1915
+ }
1916
+
1917
+ extglob_pattern:
1918
+ if (valid_symbols[EXTGLOB_PATTERN] && !in_error_recovery(valid_symbols) &&
1919
+ !valid_symbols[REGEX] && !valid_symbols[REGEX_NO_SLASH] &&
1920
+ !valid_symbols[REGEX_NO_SPACE] &&
1921
+ !in_parameter_expansion_context(
1922
+ scanner) // Don't generate EXTGLOB_PATTERN inside ${...}
1923
+ ) {
1924
+ // first skip ws, then check for ? * + @ !
1925
+ while (iswspace(lexer->lookahead)) {
1926
+ skip(lexer);
1927
+ }
1928
+
1929
+ if (lexer->lookahead == '?' || lexer->lookahead == '*' ||
1930
+ lexer->lookahead == '+' || lexer->lookahead == '@' ||
1931
+ lexer->lookahead == '!' || lexer->lookahead == '-' ||
1932
+ lexer->lookahead == ')' || lexer->lookahead == '\\' ||
1933
+ lexer->lookahead == '.' || lexer->lookahead == '[' ||
1934
+ (iswalpha(lexer->lookahead))) {
1935
+ if (lexer->lookahead == '\\') {
1936
+ advance(lexer);
1937
+ if ((iswspace(lexer->lookahead) || lexer->lookahead == '"') &&
1938
+ lexer->lookahead != '\r' && lexer->lookahead != '\n') {
1939
+ advance(lexer);
1940
+ } else {
1941
+ return false;
1942
+ }
1943
+ }
1944
+
1945
+ if (lexer->lookahead == ')' &&
1946
+ scanner->last_glob_paren_depth == 0) {
1947
+ lexer->mark_end(lexer);
1948
+ advance(lexer);
1949
+
1950
+ if (iswspace(lexer->lookahead)) {
1951
+ return false;
1952
+ }
1953
+ }
1954
+
1955
+ lexer->mark_end(lexer);
1956
+ bool was_non_alpha = !iswalpha(lexer->lookahead);
1957
+ if (lexer->lookahead != '[') {
1958
+ // no esac
1959
+ if (lexer->lookahead == 'e') {
1960
+ lexer->mark_end(lexer);
1961
+ advance(lexer);
1962
+ if (lexer->lookahead == 's') {
1963
+ advance(lexer);
1964
+ if (lexer->lookahead == 'a') {
1965
+ advance(lexer);
1966
+ if (lexer->lookahead == 'c') {
1967
+ advance(lexer);
1968
+ if (iswspace(lexer->lookahead)) {
1969
+ return false;
1970
+ }
1971
+ }
1972
+ }
1973
+ }
1974
+ } else {
1975
+ advance(lexer);
1976
+ }
1977
+ }
1978
+
1979
+ // -\w is just a word, find something else special
1980
+ if (lexer->lookahead == '-') {
1981
+ lexer->mark_end(lexer);
1982
+ advance(lexer);
1983
+ while (iswalnum(lexer->lookahead)) {
1984
+ advance(lexer);
1985
+ }
1986
+
1987
+ if (lexer->lookahead == ')' || lexer->lookahead == '\\' ||
1988
+ lexer->lookahead == '.') {
1989
+ return false;
1990
+ }
1991
+ lexer->mark_end(lexer);
1992
+ }
1993
+
1994
+ // case item -) or *)
1995
+ if (lexer->lookahead == ')' &&
1996
+ scanner->last_glob_paren_depth == 0) {
1997
+ lexer->mark_end(lexer);
1998
+ advance(lexer);
1999
+ if (iswspace(lexer->lookahead)) {
2000
+ lexer->result_symbol = EXTGLOB_PATTERN;
2001
+ return was_non_alpha;
2002
+ }
2003
+ }
2004
+
2005
+ if (iswspace(lexer->lookahead)) {
2006
+ lexer->mark_end(lexer);
2007
+ lexer->result_symbol = EXTGLOB_PATTERN;
2008
+ scanner->last_glob_paren_depth = 0;
2009
+ return true;
2010
+ }
2011
+
2012
+ if (lexer->lookahead == '$') {
2013
+ lexer->mark_end(lexer);
2014
+ advance(lexer);
2015
+ if (lexer->lookahead == '{' || lexer->lookahead == '(') {
2016
+ lexer->result_symbol = EXTGLOB_PATTERN;
2017
+ return true;
2018
+ }
2019
+ }
2020
+
2021
+ if (lexer->lookahead == '|') {
2022
+ lexer->mark_end(lexer);
2023
+ advance(lexer);
2024
+ lexer->result_symbol = EXTGLOB_PATTERN;
2025
+ return true;
2026
+ }
2027
+
2028
+ if (!iswalnum(lexer->lookahead) && lexer->lookahead != '(' &&
2029
+ lexer->lookahead != '"' && lexer->lookahead != '[' &&
2030
+ lexer->lookahead != '?' && lexer->lookahead != '/' &&
2031
+ lexer->lookahead != '\\' && lexer->lookahead != '_' &&
2032
+ lexer->lookahead != '*') {
2033
+ return false;
2034
+ }
2035
+
2036
+ typedef struct {
2037
+ bool done;
2038
+ bool saw_non_alphadot;
2039
+ uint32_t paren_depth;
2040
+ uint32_t bracket_depth;
2041
+ uint32_t brace_depth;
2042
+ } State;
2043
+
2044
+ State state = {false, was_non_alpha, scanner->last_glob_paren_depth,
2045
+ 0, 0};
2046
+ while (!state.done) {
2047
+ switch (lexer->lookahead) {
2048
+ case '\0':
2049
+ return false;
2050
+ case '(':
2051
+ state.paren_depth++;
2052
+ break;
2053
+ case '[':
2054
+ state.bracket_depth++;
2055
+ break;
2056
+ case '{':
2057
+ state.brace_depth++;
2058
+ break;
2059
+ case ')':
2060
+ if (state.paren_depth == 0) {
2061
+ state.done = true;
2062
+ }
2063
+ state.paren_depth--;
2064
+ break;
2065
+ case ']':
2066
+ if (state.bracket_depth == 0) {
2067
+ state.done = true;
2068
+ }
2069
+ state.bracket_depth--;
2070
+ break;
2071
+ case '}':
2072
+ if (state.brace_depth == 0) {
2073
+ state.done = true;
2074
+ }
2075
+ state.brace_depth--;
2076
+ break;
2077
+ }
2078
+
2079
+ if (lexer->lookahead == '|') {
2080
+ lexer->mark_end(lexer);
2081
+ advance(lexer);
2082
+ if (state.paren_depth == 0 && state.bracket_depth == 0 &&
2083
+ state.brace_depth == 0) {
2084
+ lexer->result_symbol = EXTGLOB_PATTERN;
2085
+ return true;
2086
+ }
2087
+ }
2088
+
2089
+ if (!state.done) {
2090
+ bool was_space = iswspace(lexer->lookahead);
2091
+ if (lexer->lookahead == '$') {
2092
+ lexer->mark_end(lexer);
2093
+ if (!iswalpha(lexer->lookahead) &&
2094
+ lexer->lookahead != '.' &&
2095
+ lexer->lookahead != '\\') {
2096
+ state.saw_non_alphadot = true;
2097
+ }
2098
+ advance(lexer);
2099
+ if (lexer->lookahead == '(' ||
2100
+ lexer->lookahead == '{') {
2101
+ lexer->result_symbol = EXTGLOB_PATTERN;
2102
+ scanner->last_glob_paren_depth = state.paren_depth;
2103
+ return state.saw_non_alphadot;
2104
+ }
2105
+ }
2106
+ if (was_space) {
2107
+ lexer->mark_end(lexer);
2108
+ lexer->result_symbol = EXTGLOB_PATTERN;
2109
+ scanner->last_glob_paren_depth = 0;
2110
+ return state.saw_non_alphadot;
2111
+ }
2112
+ if (lexer->lookahead == '"') {
2113
+ lexer->mark_end(lexer);
2114
+ lexer->result_symbol = EXTGLOB_PATTERN;
2115
+ scanner->last_glob_paren_depth = 0;
2116
+ return state.saw_non_alphadot;
2117
+ }
2118
+ if (lexer->lookahead == '\\') {
2119
+ if (!iswalpha(lexer->lookahead) &&
2120
+ lexer->lookahead != '.' &&
2121
+ lexer->lookahead != '\\') {
2122
+ state.saw_non_alphadot = true;
2123
+ }
2124
+ advance(lexer);
2125
+ if (iswspace(lexer->lookahead) ||
2126
+ lexer->lookahead == '"') {
2127
+ advance(lexer);
2128
+ }
2129
+ } else {
2130
+ if (!iswalpha(lexer->lookahead) &&
2131
+ lexer->lookahead != '.' &&
2132
+ lexer->lookahead != '\\') {
2133
+ state.saw_non_alphadot = true;
2134
+ }
2135
+ advance(lexer);
2136
+ }
2137
+ if (!was_space) {
2138
+ lexer->mark_end(lexer);
2139
+ }
2140
+ }
2141
+ }
2142
+
2143
+ lexer->result_symbol = EXTGLOB_PATTERN;
2144
+ scanner->last_glob_paren_depth = 0;
2145
+ return state.saw_non_alphadot;
2146
+ }
2147
+ scanner->last_glob_paren_depth = 0;
2148
+ #if DEBUG
2149
+ fprintf(stderr, "DEBUG: EXTGLOB not valid returning false\n");
2150
+ #endif
2151
+ return false;
2152
+ }
2153
+
2154
+ expansion_word:
2155
+ if (valid_symbols[EXPANSION_WORD]) {
2156
+ #if DEBUG
2157
+ fprintf(stderr,
2158
+ "DEBUG: EXPANSION_WORD handler called, context=%d, "
2159
+ "lookahead='%c'\n",
2160
+ get_current_context(scanner), lexer->lookahead);
2161
+ #endif
2162
+ // If we just returned a variable name and encounter # or %,
2163
+ // don't consume them as expansion word - let them be operator tokens
2164
+ if (was_just_variable_name &&
2165
+ (lexer->lookahead == '#' || lexer->lookahead == '%')) {
2166
+ #if DEBUG
2167
+ fprintf(stderr, "DEBUG: EXPANSION_WORD early return due to "
2168
+ "variable_name + operator\n");
2169
+ #endif
2170
+ return false;
2171
+ }
2172
+ bool advanced_once = false;
2173
+ bool advance_once_space = false;
2174
+ for (;;) {
2175
+ if (lexer->lookahead == '\"') {
2176
+ return false;
2177
+ }
2178
+ if (lexer->lookahead == '$') {
2179
+ lexer->mark_end(lexer);
2180
+ advance(lexer);
2181
+ if (lexer->lookahead == '{' || lexer->lookahead == '(' ||
2182
+ lexer->lookahead == '\'' || iswalnum(lexer->lookahead)) {
2183
+ lexer->result_symbol = EXPANSION_WORD;
2184
+ return true;
2185
+ }
2186
+ advanced_once = true;
2187
+ }
2188
+
2189
+ if (lexer->lookahead == '/' &&
2190
+ should_stop_at_pattern_slash(scanner)) {
2191
+ lexer->mark_end(lexer);
2192
+ lexer->result_symbol = EXPANSION_WORD;
2193
+ return true;
2194
+ }
2195
+
2196
+ if (lexer->lookahead == '}' && in_parameter_expansion(scanner)) {
2197
+ // Track exiting parameter expansion context
2198
+ lexer->mark_end(lexer);
2199
+ lexer->result_symbol = EXPANSION_WORD;
2200
+ return true;
2201
+ }
2202
+
2203
+ if (lexer->lookahead == '(' &&
2204
+ !(advanced_once || advance_once_space)) {
2205
+ lexer->mark_end(lexer);
2206
+ advance(lexer);
2207
+ while (lexer->lookahead != ')' && !lexer->eof(lexer)) {
2208
+ // if we find a $( or ${ assume this is valid and is
2209
+ // a garbage concatenation of some weird word + an
2210
+ // expansion
2211
+ // I wonder where this can fail
2212
+ if (lexer->lookahead == '$') {
2213
+ lexer->mark_end(lexer);
2214
+ advance(lexer);
2215
+ if (lexer->lookahead == '{' ||
2216
+ lexer->lookahead == '(' ||
2217
+ lexer->lookahead == '\'' ||
2218
+ iswalnum(lexer->lookahead)) {
2219
+ lexer->result_symbol = EXPANSION_WORD;
2220
+ return true;
2221
+ }
2222
+ advanced_once = true;
2223
+ } else {
2224
+
2225
+ // In parameter expansion, handle subscript boundaries
2226
+ // and operators properly
2227
+ if (should_stop_at_pattern_operators(scanner)) {
2228
+ if (lexer->lookahead == ']') {
2229
+ // Stop at ] to let it be handled as subscript
2230
+ // terminator
2231
+ lexer->mark_end(lexer);
2232
+ lexer->result_symbol = EXPANSION_WORD;
2233
+ return true;
2234
+ }
2235
+ if (lexer->lookahead == '#' ||
2236
+ lexer->lookahead == '%') {
2237
+ // Stop at operators to let them be handled
2238
+ // separately
2239
+ lexer->mark_end(lexer);
2240
+ lexer->result_symbol = EXPANSION_WORD;
2241
+ return true;
2242
+ }
2243
+ if (lexer->lookahead == ':') {
2244
+ // Stop at colon to let it be handled separately
2245
+ // for colon-based operations
2246
+ lexer->mark_end(lexer);
2247
+ lexer->result_symbol = EXPANSION_WORD;
2248
+ return true;
2249
+ }
2250
+ }
2251
+ advanced_once =
2252
+ advanced_once || !iswspace(lexer->lookahead);
2253
+ advance_once_space =
2254
+ advance_once_space || iswspace(lexer->lookahead);
2255
+ advance(lexer);
2256
+ }
2257
+ }
2258
+ lexer->mark_end(lexer);
2259
+ if (lexer->lookahead == ')') {
2260
+ advanced_once = true;
2261
+ advance(lexer);
2262
+ lexer->mark_end(lexer);
2263
+ } else {
2264
+ return false;
2265
+ }
2266
+ }
2267
+
2268
+ if (lexer->lookahead == '\'') {
2269
+ return false;
2270
+ }
2271
+
2272
+ if (lexer->eof(lexer)) {
2273
+ return false;
2274
+ }
2275
+
2276
+ // In parameter expansion, handle subscript boundaries and operators
2277
+ // properly
2278
+ if (should_stop_at_pattern_operators(scanner)) {
2279
+ #if DEBUG
2280
+ fprintf(stderr,
2281
+ "DEBUG: EXPANSION_WORD checking pattern operators, "
2282
+ "lookahead='%c'\n",
2283
+ lexer->lookahead);
2284
+ #endif
2285
+ if (lexer->lookahead == ']') {
2286
+ // Stop at ] to let it be handled as subscript terminator
2287
+ #if DEBUG
2288
+ fprintf(stderr, "DEBUG: EXPANSION_WORD stopping at ]\n");
2289
+ #endif
2290
+ lexer->mark_end(lexer);
2291
+ lexer->result_symbol = EXPANSION_WORD;
2292
+ return true;
2293
+ }
2294
+ if (lexer->lookahead == '#' || lexer->lookahead == '%' ||
2295
+ lexer->lookahead == '/') {
2296
+ context_type_t ctx = get_current_context(scanner);
2297
+ if (lexer->lookahead == '/' &&
2298
+ ctx == CTX_PARAMETER_PATTERN_SUBSTITUTE &&
2299
+ // should_stop_at_pattern_operators(scanner) &&
2300
+ !advanced_once) {
2301
+ // Stop at operators to let them be handled separately
2302
+ #if DEBUG
2303
+ fprintf(
2304
+ stderr,
2305
+ "DEBUG: EXPANSION_WORD stopping at operator '%c'\n",
2306
+ lexer->lookahead);
2307
+ #endif
2308
+ lexer->mark_end(lexer);
2309
+ lexer->result_symbol = EXPANSION_WORD;
2310
+ return true;
2311
+ }
2312
+ }
2313
+ }
2314
+
2315
+ advanced_once = advanced_once || !iswspace(lexer->lookahead);
2316
+ advance_once_space =
2317
+ advance_once_space || iswspace(lexer->lookahead);
2318
+ advance(lexer);
2319
+ }
2320
+ } else {
2321
+ #if DEBUG
2322
+ fprintf(stderr, "DEBUG: EXPANSION_WORD not valid, skipping\n");
2323
+ #endif
2324
+ }
2325
+
2326
+ // This handles ranges in braces
2327
+ brace_start:
2328
+ if (valid_symbols[BRACE_EXPR_START] && !in_error_recovery(valid_symbols)) {
2329
+ skip_ws(lexer);
2330
+
2331
+ if (lexer->lookahead == '{') {
2332
+ advance(lexer);
2333
+ lexer->mark_end(lexer);
2334
+
2335
+ // Don't enter context - brace expressions are handled by grammar
2336
+ // The grammar will handle the entire {1..10} pattern itself
2337
+
2338
+ while (isdigit(lexer->lookahead)) {
2339
+ advance(lexer);
2340
+ }
2341
+
2342
+ if (lexer->lookahead != '.') {
2343
+ return false;
2344
+ }
2345
+ advance(lexer);
2346
+
2347
+ if (lexer->lookahead != '.') {
2348
+ return false;
2349
+ }
2350
+ advance(lexer);
2351
+
2352
+ while (isdigit(lexer->lookahead)) {
2353
+ advance(lexer);
2354
+ }
2355
+
2356
+ if (lexer->lookahead != '}') {
2357
+ return false;
2358
+ }
2359
+
2360
+ enter_context(scanner, CTX_BRACE_EXPANSION);
2361
+ lexer->result_symbol = BRACE_EXPR_START;
2362
+ return true;
2363
+ }
2364
+ }
2365
+
2366
+ #if DEBUG
2367
+ fprintf(stderr, "SCANNER: scan returning false\n");
2368
+ #endif
2369
+
2370
+ return false;
2371
+ }
2372
+
2373
+ void *tree_sitter_zsh_external_scanner_create() {
2374
+ Scanner *scanner = calloc(1, sizeof(Scanner));
2375
+ UINT32_MAX; // Initialize to invalid position
2376
+ array_init(&scanner->heredocs);
2377
+ array_init(&scanner->context_stack);
2378
+ #if DEBUG
2379
+ fprintf(
2380
+ stderr,
2381
+ "DEBUG: Scanner created - heredocs.size=%u, context_stack.size=%u\n",
2382
+ scanner->heredocs.size, scanner->context_stack.size);
2383
+ #endif
2384
+ return scanner;
2385
+ }
2386
+
2387
+ bool tree_sitter_zsh_external_scanner_scan(void *payload, TSLexer *lexer,
2388
+ const bool *valid_symbols) {
2389
+ Scanner *scanner = (Scanner *)payload;
2390
+ return scan(scanner, lexer, valid_symbols);
2391
+ }
2392
+
2393
+ unsigned tree_sitter_zsh_external_scanner_serialize(void *payload,
2394
+ char *state) {
2395
+ Scanner *scanner = (Scanner *)payload;
2396
+ return serialize(scanner, state);
2397
+ }
2398
+
2399
+ void tree_sitter_zsh_external_scanner_deserialize(void *payload,
2400
+ const char *state,
2401
+ unsigned length) {
2402
+ Scanner *scanner = (Scanner *)payload;
2403
+ deserialize(scanner, state, length);
2404
+ }
2405
+
2406
+ void tree_sitter_zsh_external_scanner_destroy(void *payload) {
2407
+ Scanner *scanner = (Scanner *)payload;
2408
+ for (size_t i = 0; i < scanner->heredocs.size; i++) {
2409
+ Heredoc *heredoc = array_get(&scanner->heredocs, i);
2410
+ array_delete(&heredoc->current_leading_word);
2411
+ array_delete(&heredoc->delimiter);
2412
+ }
2413
+ array_delete(&scanner->heredocs);
2414
+ array_delete(&scanner->context_stack);
2415
+ free(scanner);
2416
+ }