@ast-grep/lang-haskell 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/scanner.c ADDED
@@ -0,0 +1,3471 @@
1
+ /**
2
+ * The scanner is an extension to the built-in lexer that handles cases that are hard or impossible to express with the
3
+ * high-level grammar rules.
4
+ * Since Haskell is indentation sensitive and uses parse errors to end layouts, this component has many
5
+ * responsibilities.
6
+ *
7
+ * tree-sitter runs the scanner at every position repeatedly until it fails, after which the built-in lexer consumes one
8
+ * token.
9
+ * When the scanner succeeds, it returns the index of a symbol in the `externals` array in `grammar/externals.js`, which
10
+ * is then processed like other grammar symbols, except that it terminates any conflict branches in which the symbol
11
+ * isn't valid.
12
+ * The scanner's state is persisted and passed into the next run, but it is discarded when the scanner fails, i.e. when
13
+ * it yields control back to the built-in lexer.
14
+ *
15
+ * The high-level workflow of the scanner consists of three distinct modes.
16
+ * When the first character after whitespace is a newline, the scanner starts newline lookahead, otherwise it processes
17
+ * an interior position.
18
+ * If the state indicates that the previous run performed newline lookahead, it enters newline processing mode.
19
+ *
20
+ * In interior mode, a single lexing pass is performed.
21
+ *
22
+ * Such a pass consists of two steps:
23
+ *
24
+ * In the first step, the scanner identifies the immediate token by branching on the first character after whitespace
25
+ * and examining different conditions to select one of the variants of the enum `Lexed`, which enumerates all known,
26
+ * interesting, situations.
27
+ * The position of the lexer may be advanced in the process to look at subsequent characters.
28
+ * To avoid having to arrange different parts of the logic according to how many characters have been consumed,
29
+ * lookahead is written to an array in the transient state on demand, so that each component can specify the index
30
+ * relative to the position at the beginning of the run (modulo whitespace).
31
+ * The entry point for this step is the function `lex`.
32
+ *
33
+ * The second step is different for each mode.
34
+ * In interior mode, the `Lexed` token determines which symbol to return to the grammar based on the current state, like
35
+ * layout contexts and valid symbols.
36
+ * Most symbols do not contain any text, but only act as conditions in the grammar, but for symbolic operators, CPP,
37
+ * comments, pragmas, and quasiquotes, the lexer is advanced to the end of the token and `mark_end` is called to
38
+ * communicate the range to tree-sitter.
39
+ *
40
+ * In newline lookahead mode, the scanner performs repeated lexing passes until it encounters a `Lexed` token that is
41
+ * not CPP or a comment.
42
+ * In the second step of each pass, the token determines whether to terminate and/or which flags to set in the state to
43
+ * guide processing in the next run.
44
+ * If the lookahead loop has only made a single lexing pass that did not consume any characters of the following token
45
+ * (because the first character did not match any of the conditions for lexing that require more lookahead), the scanner
46
+ * switches to newline processing mode directly; otherwise it terminates the run after storing the newline information
47
+ * in the persistent state.
48
+ * This is possible by succeeding with the symbol `UPDATE`, which is mapped to newline in `externals`.
49
+ * tree-sitter does not create a node in the parse tree for this symbol if `mark_end` wasn't called after consuming
50
+ * lookahead, and immediately calls the scanner again at the same position.
51
+ *
52
+ * In either case, the scanner ends up in newline processing mode, in which it performs a series of highly
53
+ * order-sensitive steps based on the data collected in lookahead mode, potentially returning multiple symbols in
54
+ * successive runs until none of the newline-related conditions match.
55
+ * This procedure ensures that nested layouts are terminated at the earliest position instead of extending over all
56
+ * subsequent (top-level) whitespace, comments and CPP up to the next layout element.
57
+ * Only when all layouts are terminated will the scanner process the final `Lexed` token that it stored in the state in
58
+ * lookahead mode, using the same logic as in interior mode, and update the state to disable newline processing for the
59
+ * next run.
60
+ */
61
+
62
+ #include "tree_sitter/alloc.h"
63
+ #include "tree_sitter/array.h"
64
+ #include "tree_sitter/parser.h"
65
+
66
+ #include "unicode.h"
67
+ #include <assert.h>
68
+ #include <stdbool.h>
69
+ #include <string.h>
70
+
71
// The character the lexer is currently positioned before.
#define PEEK env->lexer->lookahead

#if defined(TREE_SITTER_DEBUG)

#include <locale.h>

// In debug builds, every position change and `mark_end` call is routed through a tracing wrapper.
#define S_ADVANCE advance_debug(env)
#define S_SKIP skip_debug(env)
#define MARK(s) mark_debug(env, s)
#define dbg(...) do { fprintf(stderr, __VA_ARGS__); } while (0)

#else

// Step one character to the right (see `advance`).
#define S_ADVANCE advance(env)

// Step one character to the right and tell tree-sitter to treat the consumed character as whitespace.
#define S_SKIP env->lexer->advance(env->lexer, true)

/**
 * Declare the current position to be the end of the symbol that may be detected, so that the next run starts after
 * this character if the scanner succeeds.
 *
 * Useful when the validity of the detected symbol depends on the text following it.
 * The argument is only used for tracing in debug builds and is ignored here.
 */
#define MARK(s) env->lexer->mark_end(env->lexer)

// Logging is compiled out in regular builds.
#define dbg(...) do {} while (0)

#endif
101
+
102
// Short circuit a parse step: If the argument expression evaluates to 0, continue; otherwise return its result from
// the enclosing function.
// This is used with enums, so casting to unsigned should not cause problems.
// The argument is parenthesized so that the cast applies to the whole expression rather than only to its first
// operand (without the parentheses, `SEQ(a > b)` would expand to `(unsigned) a > b`).
#define SEQ(expr) do { unsigned res = (unsigned) (expr); if (res) return res; } while (0)
105
+
106
+ // --------------------------------------------------------------------------------------------------------
107
+ // Symbols
108
+ // --------------------------------------------------------------------------------------------------------
109
+
110
/**
 * Mirror of the `externals` list in `grammar/externals.js`; the order of the constants must match that list exactly.
 * For each scanner run, tree-sitter supplies an array of booleans indexed by these constants, in which an entry is
 * `true` if the corresponding symbol is valid at the current parser position.
 */
typedef enum {
  // Index 0; `set_result_symbol` interprets it as "no symbol was produced".
  FAIL = 0,
  SEMICOLON,
  START,
  START_DO,
  START_CASE,
  START_IF,
  START_LET,
  START_QUOTE,
  START_EXPLICIT,
  END,
  END_EXPLICIT,
  START_BRACE,
  END_BRACE,
  START_TEXP,
  END_TEXP,
  WHERE,
  IN,
  ARROW,
  BAR,
  DERIVING,
  COMMENT,
  HADDOCK,
  CPP,
  PRAGMA,
  QQ_START,
  QQ_BODY,
  SPLICE,
  QUAL_DOT,
  TIGHT_DOT,
  PREFIX_DOT,
  DOTDOT,
  TIGHT_AT,
  PREFIX_AT,
  TIGHT_BANG,
  PREFIX_BANG,
  TIGHT_TILDE,
  PREFIX_TILDE,
  PREFIX_PERCENT,
  QUALIFIED_OP,
  LEFT_SECTION_OP,
  NO_SECTION_OP,
  MINUS,
  CONTEXT,
  INFIX,
  DATA_INFIX,
  TYPE_INSTANCE,
  VARSYM,
  CONSYM,
  // Returned (via `update_state`) to persist state changes and have the scanner run again.
  UPDATE,
} Symbol;
166
+
167
+ #ifdef TREE_SITTER_DEBUG
168
+
169
/**
 * Human-readable names for debug output, indexed by `Symbol`.
 * Every entry must sit at the index of the enum constant it describes, so the two lists have to be kept in sync.
 */
static const char *sym_names[] = {
  "fail",
  "semicolon",
  "start",
  "start_do",
  "start_case",
  "start_if",
  "start_let",
  "start_quote",
  "start_explicit",
  "end",
  "end_explicit",
  "start_brace",
  "end_brace",
  "start_texp",
  "end_texp",
  "where",
  "in",
  "arrow",
  "bar",
  "deriving",
  "comment",
  "haddock",
  "cpp",
  "pragma",
  "qq_start",
  "qq_body",
  "splice",
  // QUAL_DOT, TIGHT_DOT, PREFIX_DOT: the labels follow the names of the enum constants.
  "qual_dot",
  "tight_dot",
  "prefix_dot",
  "dotdot",
  "tight_at",
  "prefix_at",
  "tight_bang",
  "prefix_bang",
  "tight_tilde",
  "prefix_tilde",
  "prefix_percent",
  "qualified_op",
  "left_section_op",
  "no_section_op",
  "minus",
  "context",
  "infix",
  "data_infix",
  "type_instance",
  "varsym",
  "consym",
  "update",
};
220
+
221
+ #endif
222
+
223
+ // --------------------------------------------------------------------------------------------------------
224
+ // Data
225
+ // --------------------------------------------------------------------------------------------------------
226
+
227
+ #ifdef TREE_SITTER_DEBUG
228
+
229
+ typedef Array(int32_t) ParseLine;
230
+
231
+ /**
232
+ * A vector of lines, persisted across runs, for visualizing the current lexer position and scanner lookahead.
233
+ */
234
+ typedef Array(ParseLine) ParseLines;
235
+
236
/**
 * Bookkeeping about `mark_end` calls and about how far the lexer has moved within a single run.
 * A fresh value is created for every run (see `debug_new`) and thrown away afterwards.
 */
typedef struct {
  // Column of the most recent `mark_end` call, or -1 if there was none in this run.
  int marked;
  unsigned marked_line;
  // Column at which the run started.
  unsigned start_col;
  unsigned start_line;
  unsigned end_col;
  // Description passed to `MARK` by the code that called `mark_end`.
  const char *marked_by;
} Debug;
248
+
249
+ Debug debug_new(TSLexer *l) {
250
+ return (Debug) {
251
+ .marked = -1,
252
+ .marked_line = 0,
253
+ .start_col = l->get_column(l),
254
+ .start_line = 0,
255
+ .end_col = 0,
256
+ .marked_by = "",
257
+ };
258
+ }
259
+
260
+ #endif
261
+
262
/**
 * The kinds of layout context that the scanner distinguishes because they need special treatment.
 * The order is significant: `context_names` is indexed by these constants.
 */
typedef enum {
  DeclLayout = 0,
  DoLayout,
  CaseLayout,
  LetLayout,
  QuoteLayout,
  MultiWayIfLayout,
  Braces,
  TExp,
  ModuleHeader,
  NoContext,
} ContextSort;
277
+
278
+ #ifdef TREE_SITTER_DEBUG
279
+
280
/**
 * Human-readable names for debug output, indexed by `ContextSort`.
 * The entries must appear in the same order as the enum constants: `QuoteLayout` precedes `MultiWayIfLayout`.
 */
static char const *context_names[] = {
  "decls",
  "do",
  "case",
  "let",
  "quote",
  "multi_way_if",
  "braces",
  "texp",
  "module_header",
  "none",
};
292
+
293
+ #endif
294
+
295
+ /**
296
+ * The persistent state maintains a stack of layout contexts.
297
+ * New entries are created when a layout symbol is valid at the current position, and they are removed when the indent
298
+ * of a line satisfies conditions that depend on the current context sort, or when certain tokens (like `else`) occur.
299
+ */
300
+ typedef struct {
301
+ ContextSort sort;
302
+ uint32_t indent;
303
+ } Context;
304
+
305
/**
 * The lookahead tokens that carry special meaning for the scanner.
 * The order is significant: `token_names` is indexed by these constants.
 */
typedef enum {
  LNothing = 0,
  LEof,
  LWhere,
  LIn,
  LThen,
  LElse,
  LDeriving,
  LModule,
  LUpper,
  LTick,
  LSymop,
  LSymopSpecial,
  LDotDot,
  LDotId,
  LDotSymop,
  LDotOpen,
  LDollar,
  LBang,
  LTilde,
  LAt,
  LPercent,
  LHash,
  LBar,
  LArrow,
  LCArrow,
  LTexpCloser,
  LQuoteClose,
  LPragma,
  LBlockComment,
  LLineComment,
  LBraceClose,
  LBraceOpen,
  LBracketOpen,
  LUnboxedClose,
  LSemi,
  LCppElse,
  LCpp,
} Lexed;
347
+
348
+ #ifdef TREE_SITTER_DEBUG
349
+
350
/**
 * Human-readable names for debug output, indexed by `Lexed`.
 * The entries are listed in the order of the enum constants.
 */
static const char *token_names[] = {
  "nothing", "eof",
  "where", "in", "then", "else", "deriving", "module",
  "upper", "tick",
  "symop", "symop-special",
  "dot-dot", "dot-id", "dot-symop", "dot-open",
  "dollar", "bang", "tilde", "at", "percent", "hash", "bar",
  "arrow", "ctr",
  "texp-closer", "quote-close",
  "pragma", "block-comment", "line-comment",
  "brace-close", "brace-open", "bracket-open", "unboxed-close",
  "semi",
  "cpp-else", "cpp",
};
389
+
390
+ #endif
391
+
392
/**
 * The phase of newline handling the scanner is in.
 *
 * - `NInactive`: no newline handling is in progress; processing resets the state to this value when it is complete.
 * - `NInit`: set during newline lookahead.
 * - `NProcess`: set once lookahead has finished.
 * - `NResume`: forces newline lookahead mode at the start of a run even if there is no newline.
 *   This is used for the beginning of the file and after pragmas (see `pragma`).
 */
typedef enum {
  NInactive = 0,
  NInit,
  NProcess,
  NResume,
} NewlineState;
405
+
406
+ /**
407
+ * The two newline modes need to operate across multiple scanner runs and adapt their behavior to the context
408
+ * established by previous runs, encoded by this persistent state.
409
+ */
410
+ typedef struct {
411
+ NewlineState state;
412
+ // The final token encountered after skipping comments and CPP.
413
+ Lexed end;
414
+ // The indent of `end`, used to decide layout actions before parsing intermediate extras.
415
+ uint32_t indent;
416
+ // When there is no token after extras, we shouldn't start layouts.
417
+ bool eof;
418
+ // Prohibit layout semicolons in future runs.
419
+ bool no_semi;
420
+ // Prohibit layout semicolons in future runs, but can be relaxed by some actions.
421
+ // See `explicit_semicolon`.
422
+ bool skip_semi;
423
+ // Lookahead has advanced into `end`, so the scanner has to be restarted before processing the newline result.
424
+ bool unsafe;
425
+ } Newline;
426
+
427
+ /**
428
+ * The vector for the layout context stack.
429
+ */
430
+ typedef Array(Context) Contexts;
431
+
432
+ /**
433
+ * Whenever the lexer is advanced over non-(leading-)whitespace, the consumed character is appended to this vector.
434
+ * This avoids having to ensure that different components that need to examine multiple lookahead characters have to be
435
+ * run in the correct order.
436
+ * Instead, we refer to lookahead by the character's index using the interface described in the section 'Lookahead'.
437
+ *
438
+ * For example, the functions `peek0`, `char0`, `char1` operate on the first/second character relative to the start of
439
+ * the scanner run, and the implementation advances the lexer position when it is necessary.
440
+ *
441
+ * The field `offset` can be used to reset relative indexing to the current lexer position.
442
+ * This is used, for example, in `newline_lookahead`, to perform repeated lexing passes, since `lex` uses the lookahead
443
+ * interface.
444
+ * After processing a `Lexed` token, `newline_lookahead` continues seeking ahead after comments and CPP, and when it
445
+ * encounters the next token, it calls `reset_lookahead` to set `offset` to the current position, ensuring that `lex`
446
+ * can use `char0` to test the following character.
447
+ *
448
+ * The terminology for advancing is:
449
+ * - "Advance before character C at index N" means "`lexer->lookahead` returns C, but 'Lookahead' does not contain C and
450
+ * has size N"
451
+ * - "Advance over character C at index N" means "`lexer->lookahead` returns the character following C, 'Lookahead'
452
+ * contains C and has size N+1" (or "advance before N+1")
453
+ * - If the size of 'Lookahead' is already larger than N, and therefore C can be read from the vector, the
454
+ * postconditions may not hold (when independent steps access lookahead at different indexes)
455
+ *
456
+ * Example:
457
+ *
458
+ * Assume we are parsing the following line, and the scanner is called right after the `a` in the right-hand side:
459
+ *
460
+ * > calc a b = a Library.Math.** b
461
+ * ^ (lexer position: before the character above the ^, `lexer->lookahead` returns the space)
462
+ * || 0/0 (content of `data` between bars, empty; `len` after bars, `offset` after slash)
463
+ *
464
+ * 'Lookahead' is initialized with `len = 0` and `offset = 0` (in this description, `data` and `len` denote the struct fields `contents` and `size`).
465
+ *
466
+ * The full lookahead string (stored in tree-sitter's internals) at this position is ` Library.Math.** b`, and all
467
+ * _absolute_ indexes point into that string.
468
+ * Since tree-sitter only exposes the "next" character at a time, indexing requires advancing the lexer and copying
469
+ * characters to 'Lookahead' on demand.
470
+ *
471
+ * An initial `skip_space` advances over the space between `a` and `Lib`, which does not update 'Lookahead'.
472
+ *
473
+ * > calc a b = a Library.Math.** b
474
+ * ^
475
+ * || 0/0
476
+ *
477
+ * The uppercase character in `Lib` triggers the detection of qualified operators in `qualified_op`, which repeatedly
478
+ * lexes module segments and dots.
479
+ *
480
+ * The module segment step starts (in `conid`) by checking that the next character is upper case using `peek0` (short
481
+ * for `peek(0)`), which accesses the _first_ lookahead character – but _first_ is always relative to the current
482
+ * `offset`.
483
+ * We call the relative index `rel` and the absolute one `abs = offset + rel`.
484
+ * Before `Lib`, this translates to `abs = rel = 0`.
485
+ *
486
+ * `peek` checks if 'Lookahead' already contains the character for this index (`abs < len`), so it can directly return
487
+ * the value at `data[abs]`, which fails, since the vector is empty.
488
+ * Instead, it will fetch the character directly from the tree-sitter lexer.
489
+ * The lexer provides one character of lookahead outside of 'Lookahead', which is enough for this case.
490
+ * `peek` is a conservative action, so it will not copy the character to 'Lookahead', and leave the lexer position
491
+ * unchanged.
492
+ *
493
+ * `L` is upper case, so `qualified_op` switches to the next phase: Advancing to the end of the module segment, which
494
+ * amounts to advancing before the first character that is not an identifier character:
495
+ *
496
+ * > advance_while(1, is_inner_id_char)
497
+ *
498
+ * This function applies the specified predicate to the character at the specified index.
499
+ * If that returns `true`, it advances over the character and increments the index.
500
+ * These steps are repeated until the predicate is `false`.
501
+ * The index is returned, pointing to the character after the module segment.
502
+ *
503
+ * `peek0` doesn't modify lookahead, so the next character is still `L`.
504
+ * We don't need to validate it again, so the starting index specified to `advance_while` is `1`.
505
+ *
506
+ * Let's look at the steps performed by this function in detail.
507
+ * It starts by accessing the character at the initial index, calling `peek(1)`.
508
+ * As for the `L` check, this calculates `abs = offset + rel = 0 + 1` and determines that it is not smaller than `len`,
509
+ * again.
510
+ * However, this time the requested character is the _second_ lookahead character, so `peek` calls `advance_before(1)`,
511
+ * which calls `advance` as many times as needed to access the character via `lexer->lookahead`, which is
512
+ * `offset + n - len` times, so _once_ in this case.
513
+ * The result is that `L` is copied to 'Lookahead' and `lexer->advance` is invoked one time, resulting in this new
514
+ * state:
515
+ *
516
+ * > calc a b = a Library.Math.** b
517
+ * ^
518
+ * || 1/0
519
+ *
520
+ * Now `lexer->lookahead` returns `i`, which `conid` successfully validates as an "inner ID character", so it increments
521
+ * the index to 2.
522
+ * `peek(2)` performs the exact same steps as `peek(1)`, as do all subsequent steps until `peek(7)` returns `.`, which
523
+ * fails the predicate, terminating the loop without advancing and returning 7 from `conid`, with the final state:
524
+ *
525
+ * > calc a b = a Library.Math.** b
526
+ * ^
527
+ * || 7/0
528
+ *
529
+ * `qualified_op` now examines the returned index:
530
+ * If it is 0, the first character was not upper case and there is no module segment at this position, so lexing fails
531
+ * and the scanner returns control to tree-sitter.
532
+ * Otherwise, it calls `char_at(7, '.')` to require that the character after the module segment is a dot, with the same
533
+ * consequences.
534
+ *
535
+ * Since our test code meets these conditions, `qualified_op` continues with `reset_lookahead_to(8)`.
536
+ * This sets `offset` to 8, causing all future lookahead actions that use relative indexes to operate on characters
537
+ * _after_ this new offset.
538
+ * Here this is the first character after the dot, `M`.
539
+ * Note that modifying the offset does not advance the lexer right away, so the lexer position will remain at 7:
540
+ *
541
+ * > calc a b = a Library.Math.** b
542
+ * ^ (zero-based index 7)
543
+ * || 7/8
544
+ *
545
+ * After a dot, `qualified_op` decides what to do next by determining whether what follows is a symbolic operator by
546
+ * calling `symop_lookahead`, which uses the same predicate-based function as before, `advance_while(0, symop_char)`.
547
+ * When that function calls `peek(0)`, the absolute index `offset + 0 = 8` is requested, which is not available, so the
548
+ * lexer is advanced once:
549
+ *
550
+ * > calc a b = a Library.Math.** b
551
+ * ^
552
+ * || 8/8
553
+ *
554
+ * Note that `len == 8` means there are eight characters in 'Lookahead', up to and including the dot, while the index
555
+ * `offset == 8` refers to the _ninth_ character, `M`.
556
+ *
557
+ * `M` is not a symop character, so `qualified_op` restarts the loop and parses the next module segment.
558
+ * The process is identical to the previous iteration except for the value of `offset`, which causes all steps that
559
+ * examine relative lookahead with `peek0` and `peek_at` to add 8 to each index.
560
+ *
561
+ * Once the second dot is parsed, the symop test will succeed after advancing over both asterisks, which satisfies the
562
+ * termination condition in `qualified_op`, and the scanner run finishes with the final state:
563
+ *
564
+ * > calc a b = a Library.Math.** b
565
+ * ^
566
+ * || 15/13
567
+ */
568
typedef struct {
  // The characters that the lexer has been advanced over, in order.
  // The first three fields are accessed through tree-sitter's array macros (see `advance`), so they presumably have
  // to keep the names and order of tree-sitter's `Array` type (verify against `tree_sitter/array.h`).
  int32_t *contents;
  // Number of characters stored in `contents`.
  uint32_t size;
  // Number of characters that fit into the memory allocated for `contents`.
  uint32_t capacity;
  // Absolute index that relative lookahead indexes (as used by `peek`, `char0` etc.) are added to.
  uint32_t offset;
} Lookahead;
574
+
575
+ /**
576
+ * The state that is persisted across scanner runs.
577
+ *
578
+ * Although 'Lookahead' is always reset when starting a new run, storing it in the state avoids having to allocate and
579
+ * free the array repeatedly.
580
+ * Instead we just reset the `len` attribute to 0 and reuse the previous memory.
581
+ *
582
+ * REVIEW: Can tree-sitter run the scanner concurrently on multiple nodes in the same file in some situations?
583
+ */
584
+ typedef struct {
585
+ Contexts contexts;
586
+ Newline newline;
587
+ Lookahead lookahead;
588
+ #ifdef TREE_SITTER_DEBUG
589
+ ParseLines parse;
590
+ #endif
591
+ } State;
592
+
593
+ /**
594
+ * Transient state and stuff provided by tree-sitter.
595
+ */
596
+ typedef struct {
597
+ TSLexer *lexer;
598
+ const bool *symbols;
599
+ uint32_t symop;
600
+ State *state;
601
+ #ifdef TREE_SITTER_DEBUG
602
+ Debug debug;
603
+ #endif
604
+ } Env;
605
+
606
+ static Env env_new(TSLexer *l, const bool * symbols, State *state) {
607
+ return (Env) {
608
+ .lexer = l,
609
+ .symbols = symbols,
610
+ .symop = 0,
611
+ .state = state,
612
+ #ifdef TREE_SITTER_DEBUG
613
+ .debug = debug_new(l),
614
+ #endif
615
+ };
616
+ }
617
+
618
+ static void reset_newline(Env *env) { memset(&env->state->newline, 0, sizeof(Newline)); }
619
+
620
+ static bool newline_active(Env *env) { return env->state->newline.state == NInit || env->state->newline.state == NProcess; }
621
+
622
+ static bool newline_init(Env *env) { return env->state->newline.state == NInit; }
623
+
624
+ // --------------------------------------------------------------------------------------------------------
625
+ // Lexer interaction
626
+ // --------------------------------------------------------------------------------------------------------
627
+
628
+ static bool is_eof(Env *env) { return env->lexer->eof(env->lexer); }
629
+
630
+ static bool not_eof(Env *env) { return !(is_eof(env)); }
631
+
632
+ /**
633
+ * The parser's position in the current line.
634
+ * Note: This is expensive to use.
635
+ */
636
+ static uint32_t column(Env *env) {
637
+ return is_eof(env) ? 0 : env->lexer->get_column(env->lexer);
638
+ }
639
+
640
+ /**
641
+ * tree-sitter's lexer interface maintains a current position that determines the lookahead character and the range of
642
+ * text that is associated with the symbol selected by the scanner, if `mark_end` is called.
643
+ *
644
+ * It's not possible to read earlier characters once the lexer has advanced over them, so this function appends the
645
+ * lookahead character to the array `lookahead` in the `State`.
646
+ *
647
+ * Don't add zeroes to the lookahead buffer when hitting EOF – it causes `no_lookahead` to report false negatives.
648
+ */
649
+ static void advance(Env *env) {
650
+ if (not_eof(env)) {
651
+ array_push(&env->state->lookahead, PEEK);
652
+ env->lexer->advance(env->lexer, false);
653
+ }
654
+ }
655
+
656
+ static bool set_result_symbol(Env *env, Symbol result) {
657
+ if (result != FAIL) {
658
+ env->lexer->result_symbol = (TSSymbol) result;
659
+ return true;
660
+ }
661
+ return false;
662
+ }
663
+
664
+ #ifdef TREE_SITTER_DEBUG
665
+
666
+ static void mark_debug(Env *env, const char *restrict marked_by) {
667
+ dbg("mark: %s\n", marked_by);
668
+ env->debug.marked = (int) column(env);
669
+ env->debug.marked_line = 0;
670
+ env->debug.marked_by = marked_by;
671
+ env->lexer->mark_end(env->lexer);
672
+ }
673
+
674
+ static void append_parse_buffer(Env *env);
675
+
676
+ static void advance_debug(Env *env) {
677
+ append_parse_buffer(env);
678
+ advance(env);
679
+ }
680
+
681
+ static void skip_debug(Env *env) {
682
+ append_parse_buffer(env);
683
+ env->lexer->advance(env->lexer, true);
684
+ }
685
+
686
+ #endif
687
+
688
+ /**
689
+ * `inline` has a noticeable impact, reaching parity with a macro.
690
+ */
691
+ static inline bool valid(Env *env, Symbol s) { return env->symbols[s]; }
692
+
693
+ // --------------------------------------------------------------------------------------------------------
694
+ // Symbol constructors
695
+ // --------------------------------------------------------------------------------------------------------
696
+
697
+ static Symbol finish(Symbol s, const char *restrict desc) {
698
+ // Suppress unused param warning
699
+ (void) desc;
700
+ dbg("finish: %s\n", desc);
701
+ return s;
702
+ }
703
+
704
+ static Symbol finish_if_valid(Env *env, Symbol s, const char *restrict desc) {
705
+ if (valid(env, s)) return finish(s, desc);
706
+ return FAIL;
707
+ }
708
+
709
+ static Symbol finish_marked(Env *env, Symbol s, const char *restrict desc) {
710
+ (void) desc;
711
+ MARK(desc);
712
+ return s;
713
+ }
714
+
715
+ static Symbol update_state(const char *restrict desc) {
716
+ return finish(UPDATE, desc);
717
+ }
718
+
719
+ // --------------------------------------------------------------------------------------------------------
720
+ // Lookahead
721
+ // --------------------------------------------------------------------------------------------------------
722
+
723
+ /**
724
+ * Ensure that at least `abs + 1` characters are present in the lookahead buffer by calling `advance` `len - abs + 1`
725
+ * times.
726
+ */
727
+ static void advance_over_abs(Env *env, uint32_t abs) {
728
+ for (uint32_t i = env->state->lookahead.size; i <= abs; i++) S_ADVANCE;
729
+ }
730
+
731
+ /**
732
+ * Ensure that at least `rel` characters after and including the current `offset` are present in the lookahead buffer by
733
+ * calling `advance` as often as the difference between the desired index (`offset + rel`) and one less than the current
734
+ * buffer size.
735
+ *
736
+ * Note: The character at the offset is included in the range, so that when `len == offset == rel == 0`, this function
737
+ * advances once, over the character at index 0.
738
+ */
739
+ static void advance_over(Env *env, uint32_t rel) {
740
+ advance_over_abs(env, env->state->lookahead.offset + rel);
741
+ }
742
+
743
+ /**
744
+ * Skip whitespace relative to `offset`, but keep characters that have already been copied to the buffer.
745
+ *
746
+ * Example:
747
+ *
748
+ * > a = b
749
+ * ^
750
+ *
751
+ * Assume step A sets `offset` to 1, pointing to the first space.
752
+ * Step B calls `peek1`, to look at the `=`. This needs to advance over the space, which is copied to the lookahead
753
+ * buffer, causing `lexer->lookahead` to return `=`.
754
+ * Step C then calls `peek0`, sees that it is a space, and requests that it be skipped. Since it is already in the
755
+ * buffer, calling `lexer-advance` would skip the wrong character.
756
+ *
757
+ * Hence, this function only skips indexes larger than the lookahead buffer's `len`.
758
+ *
759
+ * Additionally, if `offset` has been set to a position outside of the buffer, all characters up to that index are
760
+ * copied to the buffer beforehand.
761
+ */
762
+ static void skip_over(Env *env, uint32_t rel) {
763
+ Lookahead *l = &env->state->lookahead;
764
+ // Subtraction is safe because the condition establishes that `offset` is at least 1
765
+ if (l->offset > l->size) advance_over_abs(env, l->offset - 1);
766
+ uint32_t abs = l->offset + rel;
767
+ for (uint32_t i = env->state->lookahead.size; i <= abs; i++) S_SKIP;
768
+ }
769
+
770
+ /**
771
+ * Ensure that the lookahead buffer is large enough to allow reading the `n`th character.
772
+ * Since `lexer->lookahead` points at the character after the buffer, it must have `offset + n - 1` elements.
773
+ */
774
+ static void advance_before(Env *env, uint32_t rel) {
775
+ uint32_t abs = env->state->lookahead.offset + rel;
776
+ if (abs > 0) advance_over_abs(env, abs - 1);
777
+ }
778
+
779
+ /**
780
+ * Return the lookahead character with index `n`.
781
+ * If the index is larger than the lookahead buffer, return 0.
782
+ *
783
+ * Unsafe insofar as that it does not advance if the index points outside of the lookahead buffer.
784
+ * This may happen in regular operation when a tool like `seq` attempts to look beyond EOF.
785
+ */
786
+ static int32_t unsafe_peek_abs(Env *env, uint32_t abs) {
787
+ return
788
+ abs < env->state->lookahead.size ?
789
+ env->state->lookahead.contents[abs] :
790
+ 0;
791
+ }
792
+
793
+ /**
794
+ * Return the lookahead character with index `offset + n`.
795
+ * See `unsafe_peek_abs`.
796
+ */
797
+ static int32_t unsafe_peek(Env *env, uint32_t rel) {
798
+ return unsafe_peek_abs(env, env->state->lookahead.offset + rel);
799
+ }
800
+
801
+ #ifdef TREE_SITTER_DEBUG
802
+
803
+ static void debug_peek(Env *env, uint32_t rel) {
804
+ uint32_t abs = env->state->lookahead.offset + rel;
805
+ dbg("peek ");
806
+ if (env->state->lookahead.offset > 0) dbg("%u->", env->state->lookahead.offset);
807
+ dbg("%u", rel);
808
+ if (abs < env->state->lookahead.size)
809
+ dbg(" cached | len: %u", env->state->lookahead.size);
810
+ else if (abs > env->state->lookahead.size)
811
+ dbg(" advance | len: %u", env->state->lookahead.size);
812
+ dbg("\n");
813
+ }
814
+
815
+ #endif
816
+
817
+ /**
818
+ * Return the lookahead character with index `offset + rel`.
819
+ * If the character is not accessible, advance the position until it is.
820
+ *
821
+ * This "peeks" insofar as it doesn't advance over the requested character – `peek(0)` is equivalent to
822
+ * `lexer->lookahead` if `offset == 0`.
823
+ */
824
+ static int32_t peek(Env *env, uint32_t rel) {
825
+ #ifdef TREE_SITTER_DEBUG
826
+ debug_peek(env, rel);
827
+ #endif
828
+ if (env->state->lookahead.offset + rel < env->state->lookahead.size) return unsafe_peek(env, rel);
829
+ else {
830
+ advance_before(env, rel);
831
+ return PEEK;
832
+ }
833
+ }
834
+
835
+ /**
836
+ * Return the first lookahead character after the `offset` without advancing the position.
837
+ */
838
+ static int32_t peek0(Env *env) { return peek(env, 0); }
839
+
840
+ /**
841
+ * Return the second lookahead character after the `offset` without advancing the position further than the first
842
+ * character.
843
+ */
844
+ static int32_t peek1(Env *env) { return peek(env, 1); }
845
+
846
+ /**
847
+ * Return the third lookahead character after the `offset` without advancing the position further than the second
848
+ * character.
849
+ */
850
+ static int32_t peek2(Env *env) { return peek(env, 2); }
851
+
852
+ /**
853
+ * Test the lookahead character at index `offset + n` for equality.
854
+ */
855
+ static bool char_at(Env *env, uint32_t n, int32_t c) {
856
+ return peek(env, n) == c;
857
+ }
858
+
859
+ /**
860
+ * Test the lookahead character at index `offset` for equality.
861
+ */
862
+ static bool char0(Env *env, int32_t c) {
863
+ return char_at(env, 0, c);
864
+ }
865
+
866
+ /**
867
+ * Test the lookahead character at index `offset + 1` for equality.
868
+ */
869
+ static bool char1(Env *env, int32_t c) {
870
+ return char_at(env, 1, c);
871
+ }
872
+
873
+ /**
874
+ * Test the lookahead character at index `offset + 2` for equality.
875
+ */
876
+ static bool char2(Env *env, int32_t c) {
877
+ return char_at(env, 2, c);
878
+ }
879
+
880
+ /**
881
+ * Set the offset to `index`, so that the indexes in future calls to lookahead functions like `char0` are interpreted
882
+ * relative to this new value.
883
+ *
884
+ * Resets `symop` for soundness, even though no rule would continue after advancing over symbolic characters.
885
+ *
886
+ * See 'Lookahead' for an example.
887
+ */
888
+ static void reset_lookahead_abs(Env *env, uint32_t abs) {
889
+ dbg("reset: %u\n", abs);
890
+ env->state->lookahead.offset = abs;
891
+ env->symop = 0;
892
+ }
893
+
894
+ static void reset_lookahead_to(Env *env, uint32_t rel) {
895
+ reset_lookahead_abs(env, env->state->lookahead.offset + rel);
896
+ }
897
+
898
+ /**
899
+ * Move `offset` to the end of the consumed lookahead, causing `peek`, `char0` etc. to operate on characters following
900
+ * the current position at the time this function is executed.
901
+ */
902
+ static void reset_lookahead(Env *env) {
903
+ reset_lookahead_abs(env, env->state->lookahead.size);
904
+ }
905
+
906
+ /**
907
+ * Return whether the lookahead position has been advanced since starting the run, not considering skipped characters
908
+ * (which are usually whitespace).
909
+ * This is important to decide whether the scanner has to be restarted to emit certain symbols.
910
+ *
911
+ * For example, before starting layouts and generating layout semicolons after newlines, we skip whitespace and mark, so
912
+ * that subsequent symbols start at their non-whitespace boundary instead of before the newline(s).
913
+ * When newline lookahead mode finishes, it can continue directly with this step _only if_ no non-whitespace characters
914
+ * were consumed, otherwise they would be included in the semicolon symbol.
915
+ * We also cannot unconditionally mark after whitespace in newline lookahead mode since there are several potential
916
+ * symbols that can be emitted before skipped whitespace is marked, like layout end, which should not extend beyond
917
+ * newlines.
918
+ */
919
+ static bool no_lookahead(Env *env) {
920
+ return env->state->lookahead.size == 0;
921
+ }
922
+
923
+ /**
924
+ * Return the column of the first lookahead character of the current run.
925
+ * This is needed for starting layouts in interior mode, since we don't count positions across interior runs.
926
+ */
927
+ static uint32_t start_column(Env *env) {
928
+ return column(env) - env->state->lookahead.size;
929
+ }
930
+
931
+ /**
932
+ * Increment `i` while the predicate is true for the lookahead character at that index (relative to `offset`), advancing
933
+ * the position when `i` points beyond the end of the lookahead buffer.
934
+ * Return the index after the last matching character.
935
+ */
936
+ static uint32_t advance_while(Env *env, uint32_t i, bool (*pred)(int32_t)) {
937
+ while (pred(peek(env, i))) { i++; }
938
+ return i;
939
+ }
940
+
941
+ /**
942
+ * Same as `advance_while`, using "not equal to `c`" for the predicate.
943
+ * Stops at EOF.
944
+ */
945
+ static uint32_t advance_until_char(Env *env, uint32_t i, int32_t c) {
946
+ while (not_eof(env) && !char_at(env, i, c)) { i++; }
947
+ return i;
948
+ }
949
+
950
+ // --------------------------------------------------------------------------------------------------------
951
+ // Context manipulation and conditions
952
+ // --------------------------------------------------------------------------------------------------------
953
+
954
+ static bool has_contexts(Env *env) { return env->state->contexts.size != 0; }
955
+
956
+ /**
957
+ * Push a layout context onto the stack.
958
+ */
959
+ static void push_context(Env *env, ContextSort sort, uint32_t indent) {
960
+ dbg("push: %s %d\n", context_names[sort], indent);
961
+ Context ctx = (Context) {.sort = sort, .indent = indent};
962
+ array_push(&env->state->contexts, ctx);
963
+ }
964
+
965
+ /**
966
+ * Remove a layout context from the stack.
967
+ */
968
+ static void pop(Env *env) {
969
+ if (has_contexts(env)) {
970
+ dbg("pop: %s\n", context_names[array_back(&env->state->contexts)->sort]);
971
+ array_pop(&env->state->contexts);
972
+ }
973
+ }
974
+
975
+ static ContextSort current_context(Env *env) {
976
+ return has_contexts(env) ? array_back(&env->state->contexts)->sort : NoContext;
977
+ }
978
+
979
+ static bool is_layout_context(Env *env) {
980
+ return current_context(env) < Braces;
981
+ }
982
+
983
+ /**
984
+ * Decide whether the current context requires generation of layout semicolons.
985
+ * This is true for all layout contexts except for multi-way if, since that uses `|` to start layout elements.
986
+ */
987
+ static bool is_semicolon_context(Env *env) {
988
+ return current_context(env) < MultiWayIfLayout;
989
+ }
990
+
991
+ /**
992
+ * Return the indent of the innermost layout context.
993
+ * If there are non-layout contexts at the top of the stack, search downwards.
994
+ */
995
+ static uint32_t current_indent(Env *env) {
996
+ for (int32_t i = (int32_t) env->state->contexts.size - 1; i >= 0; i--) {
997
+ Context *cur = array_get(&env->state->contexts, i);
998
+ if (cur->sort < Braces) return cur->indent;
999
+ }
1000
+ return 0;
1001
+ }
1002
+
1003
+ static bool indent_less(Env *env, uint32_t indent) {
1004
+ return is_layout_context(env) && indent < current_indent(env);
1005
+ }
1006
+
1007
+ static bool indent_lesseq(Env *env, uint32_t indent) {
1008
+ return is_layout_context(env) && indent <= current_indent(env);
1009
+ }
1010
+
1011
+ static bool top_layout(Env *env) {
1012
+ return env->state->contexts.size == 1;
1013
+ }
1014
+
1015
+ static bool in_module_header(Env *env) {
1016
+ return current_context(env) == ModuleHeader;
1017
+ }
1018
+
1019
+ /**
1020
+ * Return the appropriate symbol to close the given context, or FAIL if it can't be closed.
1021
+ */
1022
+ static Symbol context_end_sym(ContextSort s) {
1023
+ switch (s) {
1024
+ case TExp:
1025
+ return END_TEXP;
1026
+ case Braces:
1027
+ return END_BRACE;
1028
+ default:
1029
+ return s < Braces ? END : FAIL;
1030
+ }
1031
+ }
1032
+
1033
+ // --------------------------------------------------------------------------------------------------------
1034
+ // Character and lookahead conditions
1035
+ // --------------------------------------------------------------------------------------------------------
1036
+
1037
// Case labels for the characters treated as newlines: line feed, carriage return and form feed.
// The last label deliberately lacks its colon so that the use site reads `NEWLINE_CASES:`.
#define NEWLINE_CASES \
  case '\n': \
  case '\r': \
  case '\f'

/**
 * Whether `c` is one of the characters listed in `NEWLINE_CASES`.
 */
static bool is_newline(int32_t c) {
  bool result = false;
  switch (c) {
    NEWLINE_CASES:
      result = true;
      break;
    default:
      break;
  }
  return result;
}
1051
+
1052
/**
 * Whether `c` is an underscore or is accepted by `is_varid_start_char`.
 */
static bool varid_start_char(const int32_t c) {
  if (c == '_') return true;
  return is_varid_start_char(c);
}
1053
+
1054
// TODO This should be combined with is_inner_id_char and made more explicit about when which char can occur.
// For example, lex_symop uses this to decide about prefix dot being a field selector, where single quotes aren't valid.
/**
 * Whether `c` is an underscore, a single quote, or is accepted by `is_identifier_char`.
 */
static bool is_id_char(const int32_t c) {
  switch (c) {
    case '_':
    case '\'':
      return true;
    default:
      return is_identifier_char(c);
  }
}
1059
+
1060
// TODO hashes only work at the end of identifiers
/**
 * Whether `c` is accepted by `is_id_char` or is a hash.
 */
static bool is_inner_id_char(const int32_t c) {
  if (is_id_char(c)) return true;
  return c == '#';
}
1064
+
1065
/**
 * Whether `c` is accepted by `is_id_char` or is a dot.
 */
static bool quoter_char(const int32_t c) {
  if (is_id_char(c)) return true;
  return c == '.';
}
1066
+
1067
/**
 * Whether `c` is one of the characters that `symop_char` excludes from symbolic operators:
 * parentheses, comma, semicolon, brackets, backtick, braces, double quote, single quote and underscore.
 */
static bool reserved_symbolic(const int32_t c) {
  static const int32_t reserved[] = {'(', ')', ',', ';', '[', ']', '`', '{', '}', '"', '\'', '_'};
  for (size_t i = 0; i < sizeof reserved / sizeof reserved[0]; i++) {
    if (reserved[i] == c) return true;
  }
  return false;
}
1085
+
1086
/**
 * Whether `c` is accepted by `is_symop_char` and is not one of the `reserved_symbolic` characters.
 */
static bool symop_char(const int32_t c) {
  if (!is_symop_char(c)) return false;
  return !reserved_symbolic(c);
}
1089
+
1090
+ /**
1091
+ * Advance the position to the first character that's not valid for a symbolic operator, and return that position.
1092
+ * If the function has been called before, directly return the cached position.
1093
+ *
1094
+ * This consumes the entire symop, since the field denotes the length of the string and therefore the last (failing)
1095
+ * peek is _beyond_ the end, consuming the last valid char.
1096
+ */
1097
+ static uint32_t symop_lookahead(Env *env) {
1098
+ if (env->symop == 0) {
1099
+ env->symop = advance_while(env, 0, symop_char);
1100
+ if (env->symop > 0)
1101
+ dbg("symop: %d, %.*ls\n", env->symop, env->symop, env->state->lookahead.contents + env->state->lookahead.offset);
1102
+ }
1103
+ return env->symop;
1104
+ }
1105
+
1106
+ static bool is_symop(Env *env) {
1107
+ return symop_lookahead(env) > 0;
1108
+ }
1109
+
1110
+ /**
1111
+ * The parser calls `scan` with all symbols declared as valid directly after it encountered an error.
1112
+ * The symbol `FAIL` is not used in the grammar, so it can only be valid in this error case.
1113
+ */
1114
+
1115
+ static bool after_error(Env *env) { return valid(env, FAIL); }
1116
+
1117
+ // --------------------------------------------------------------------------------------------------------
1118
+ // Debug printing
1119
+ // --------------------------------------------------------------------------------------------------------
1120
+
1121
+ #ifdef TREE_SITTER_DEBUG
1122
+
1123
+ static void push_parse_buffer_line(Env *env) {
1124
+ ParseLine new_line = array_new();
1125
+ array_reserve(&new_line, 1);
1126
+ array_push(&env->state->parse, new_line);
1127
+ }
1128
+
1129
+ static ParseLine *ensure_parse_buffer(Env *env) {
1130
+ ParseLines *buffer = &env->state->parse;
1131
+ if (buffer->size == 0) push_parse_buffer_line(env);
1132
+ if (is_newline(PEEK)) push_parse_buffer_line(env);
1133
+ return array_back(buffer);
1134
+ }
1135
+
1136
+ static void append_parse_buffer(Env *env) {
1137
+ ParseLine *current_line = ensure_parse_buffer(env);
1138
+ if (is_newline(PEEK)) {
1139
+ env->debug.marked_line++;
1140
+ env->debug.start_line++;
1141
+ }
1142
+ else if (column(env) >= current_line->size) array_push(current_line, PEEK);
1143
+ }
1144
+
1145
+ static void fill_parse_buffer(Env *env) {
1146
+ env->debug.end_col = column(env);
1147
+ while (!(is_newline(PEEK) || is_eof(env))) S_ADVANCE;
1148
+ }
1149
+
1150
+ static bool seq(Env *env, const char *restrict s);
1151
+
1152
+ static void print_lookahead(Env *env) {
1153
+ dbg("lookahead: %.*ls\n", env->state->lookahead.size, env->state->lookahead.contents);
1154
+ }
1155
+
1156
// Printable replacements for whitespace characters in debug output.
static const char * space = "<space>";
static const char * newline_char = "\\n";

/**
 * Return a printable replacement for whitespace characters: `newline_char` for the `NEWLINE_CASES` characters, `space`
 * for space, tab and vertical tab.
 * Returns NULL for any other character.
 */
static const char * show_char(int32_t c) {
  const char * shown = NULL;
  switch (c) {
    NEWLINE_CASES:
      shown = newline_char;
      break;
    case ' ':
    case '\t':
    case '\v':
      shown = space;
      break;
    default:
      break;
  }
  return shown;
}
1171
+
1172
+ static void print_lookahead_chars_from(Env *env, uint32_t start) {
1173
+ if (start < env->state->lookahead.size) {
1174
+ dbg("lookahead from %d: ", start);
1175
+ for (; start < env->state->lookahead.size; start++) {
1176
+ int32_t c = env->state->lookahead.contents[start];
1177
+ const char * s = show_char(c);
1178
+ if (s == NULL) dbg("%lc", c);
1179
+ else dbg("%s", s);
1180
+ }
1181
+ dbg("\n");
1182
+ }
1183
+ else
1184
+ dbg("print_lookahead_chars_from: Too large (%d / %d)", start, env->state->lookahead.size);
1185
+ }
1186
+
1187
+ static void debug_contexts(Env *env) {
1188
+ if (env->state->contexts.size == 0) dbg("empty");
1189
+ bool empty = true;
1190
+ for (size_t i = 0; i < env->state->contexts.size; i++) {
1191
+ if (!empty) dbg("-");
1192
+ Context ctx = *array_get(&env->state->contexts, i);
1193
+ if (ctx.sort == ModuleHeader) dbg("pre");
1194
+ else if (ctx.sort == Braces) dbg("brace");
1195
+ else if (ctx.sort == TExp) dbg("texp");
1196
+ else {
1197
+ if (ctx.sort == DoLayout) dbg("do ");
1198
+ else if (ctx.sort == LetLayout) dbg("let ");
1199
+ else if (ctx.sort == CaseLayout) dbg("case ");
1200
+ else if (ctx.sort == MultiWayIfLayout) dbg("if ");
1201
+ else if (ctx.sort == QuoteLayout) dbg("quote ");
1202
+ dbg("%d", ctx.indent);
1203
+ }
1204
+ empty = false;
1205
+ }
1206
+ }
1207
+
1208
+ void debug_newline(Env *env) {
1209
+ switch (env->state->newline.state) {
1210
+ case NInactive:
1211
+ dbg("no");
1212
+ break;
1213
+ case NInit:
1214
+ dbg("init");
1215
+ break;
1216
+ case NProcess:
1217
+ dbg("process");
1218
+ break;
1219
+ case NResume:
1220
+ dbg("resume");
1221
+ break;
1222
+ }
1223
+ if (env->state->newline.state != NInactive) dbg(" %d %s", env->state->newline.indent, token_names[env->state->newline.end]);
1224
+ if (env->state->newline.eof) dbg(" [eof]");
1225
+ if (env->state->newline.no_semi) dbg(" [no_semi]");
1226
+ if (env->state->newline.skip_semi) dbg(" [skip_semi]");
1227
+ if (env->state->newline.unsafe) dbg(" [unsafe]");
1228
+ }
1229
+
1230
+ /**
1231
+ * Produce a comma-separated string of valid symbols.
1232
+ */
1233
+ static void debug_valid(Env *env, const bool *syms) {
1234
+ if (after_error(env)) {
1235
+ dbg("all");
1236
+ return;
1237
+ }
1238
+ bool fst = true;
1239
+ for (Symbol i = FAIL; i <= UPDATE; i++) {
1240
+ if (syms[i]) {
1241
+ if (!fst) dbg(",");
1242
+ dbg("%s", sym_names[i]);
1243
+ fst = false;
1244
+ }
1245
+ }
1246
+ }
1247
+
1248
+ static bool debug_init(Env *env) {
1249
+ setlocale(LC_ALL, "C.UTF-8");
1250
+ dbg("\n");
1251
+ dbg("state:\n syms = ");
1252
+ debug_valid(env, env->symbols);
1253
+ dbg("\n contexts = ");
1254
+ debug_contexts(env);
1255
+ dbg("\n newline = ");
1256
+ debug_newline(env);
1257
+ dbg("\n");
1258
+ return false;
1259
+ }
1260
+
1261
/**
 * Log the escape sequence `ESC [ <code> m`; an empty `code` yields `ESC [ m`.
 */
void sgr(const char *restrict code) {
  dbg(
    "\x1b[%sm",
    code
  );
}
1264
+
1265
/**
 * Pass the code `3<c>` to `sgr`.
 *
 * The code is formatted with `snprintf` into a buffer large enough for any `unsigned` value, so that values of `c` with
 * more than one digit cannot overflow the buffer.
 */
void color(unsigned c) {
  char code[16];
  snprintf(code, sizeof code, "3%u", c);
  sgr(code);
}
1270
+
1271
/**
 * Log a legend that shows each label in the color it is assigned here: `before` (4), `marked` (2), `advanced` (3)
 * and `lookahead` (5), followed by an `sgr` call with an empty code.
 */
void palette(void) {
  color(4);
  dbg("before");
  color(2);
  dbg(" marked");
  color(3);
  dbg(" advanced");
  color(5);
  dbg(" lookahead");
  sgr("");
  dbg("\n");
}
1283
+
1284
+ static bool debug_parse_metadata = false;
1285
+
1286
+ static void dump_parse_metadata(Env *env) {
1287
+ Debug *debug = &env->debug;
1288
+ dbg(
1289
+ "lines: %d | start_line: %d | start_col: %d | marked_line: %d | marked: %d | end_col: %d | persist lines: %d\n",
1290
+ env->state->parse.size,
1291
+ debug->start_line,
1292
+ debug->start_col,
1293
+ debug->marked_line,
1294
+ debug->marked,
1295
+ debug->end_col,
1296
+ env->state->parse.size - debug->marked_line
1297
+ );
1298
+ }
1299
+
1300
+ /**
1301
+ * Note: We're printing individual characters here instead of using a format with precision like `%.*ls` and slicing
1302
+ * the buffer, because:
1303
+ * - The buffer contains wide characters, but `fprintf` counts bytes
1304
+ * - `fwprintf` counts wide characters, but can't be interleaved with `fprintf`, so we'd have to use that function, and
1305
+ * therefore wide literals, everywhere, which is tedious
1306
+ */
1307
+ void debug_parse(Env *env) {
1308
+ Debug *debug = &env->debug;
1309
+ ParseLines *buffer = &env->state->parse;
1310
+ uint32_t lines = buffer->size;
1311
+ dbg("-----------------------\n");
1312
+ // For investigating mistakes in the debugging code.
1313
+ if (debug_parse_metadata) dump_parse_metadata(env);
1314
+ if (lines > 0) {
1315
+ color(4);
1316
+ for (uint32_t i = 0; i < lines; i++) {
1317
+ ParseLine *line = array_get(buffer, i);
1318
+ int32_t *buf = line->contents;
1319
+ if (line->contents == NULL) break;
1320
+ uint32_t pos = 0;
1321
+
1322
+ if (debug->start_line == lines - 1 - i) {
1323
+ while (pos < debug->start_col) { dbg("%lc", buf[pos]); pos++; }
1324
+ color(2);
1325
+ }
1326
+
1327
+ if (debug->marked >= 0 && debug->marked_line == lines - 1 - i) {
1328
+ while ((int) pos < debug->marked) { dbg("%lc", buf[pos]); pos++; }
1329
+ color(3);
1330
+ }
1331
+
1332
+ if (i == lines - 1) {
1333
+ while (pos < debug->end_col) { dbg("%lc", buf[pos]); pos++; }
1334
+ color(5);
1335
+ }
1336
+
1337
+ while (pos < line->size) { dbg("%lc", buf[pos]); pos++; }
1338
+
1339
+ dbg("\n");
1340
+ }
1341
+ sgr("");
1342
+ }
1343
+ dbg("-----------------------\n");
1344
+ }
1345
+
1346
+ static unsigned serialize_parse_lines(char *cursor, ParseLines *parse, unsigned to_copy) {
1347
+ for (unsigned i = 0; i < parse->size; i++) {
1348
+ ParseLine *line = array_get(parse, i);
1349
+ unsigned line_size = line->size * sizeof(uint32_t);
1350
+ to_copy += line_size + sizeof(uint32_t);
1351
+ if (to_copy > TREE_SITTER_SERIALIZATION_BUFFER_SIZE) return 0;
1352
+ *((uint32_t *) cursor) = line->size;
1353
+ cursor += sizeof(line->size);
1354
+ memcpy(cursor, line->contents, line_size);
1355
+ cursor += line_size;
1356
+ }
1357
+ return to_copy;
1358
+ }
1359
+
1360
+ static void deserialize_parse_lines(const char *cursor, ParseLines *parse, uint32_t size) {
1361
+ // Ensure ParseLines has room for at _least_ as many lines as the new state
1362
+ array_reserve(parse, size);
1363
+ for (unsigned i = 0; i < size; i++) {
1364
+ if (i >= parse->size) { array_push(parse, (ParseLine)array_new()); }
1365
+ ParseLine *line = &parse->contents[i];
1366
+ uint32_t line_len = *((uint32_t *) cursor);
1367
+ cursor += sizeof(uint32_t);
1368
+ array_reserve(line, line_len);
1369
+ line->size = line_len;
1370
+ unsigned line_size = line->size * sizeof(uint32_t);
1371
+ memcpy(line->contents, cursor, line_size);
1372
+ cursor += line_size;
1373
+ }
1374
+ // Free the excessive lines in the previous since we can't check in the next round whether there was a line in
1375
+ // a slot before and reuse the pointer.
1376
+ // This only happens when we didn't push any lines above, which would reset parse->len to len.
1377
+ for (unsigned i = parse->size; i > size; i--) { array_delete(array_get(parse, i - 1)); }
1378
+ // Truncate ParseLines in case the new state has fewer lines
1379
+ parse->size = size;
1380
+ }
1381
+
1382
+ void debug_finish(Env *env, Symbol result) {
1383
+ dbg("result: ");
1384
+ if (result) dbg("%s, ", sym_names[result]);
1385
+ else dbg("<skipped>, ");
1386
+ if (env->debug.marked == -1) dbg("%d", column(env));
1387
+ else dbg("%s@%d", env->debug.marked_by, env->debug.marked);
1388
+ dbg("\n\n");
1389
+ fill_parse_buffer(env);
1390
+ debug_parse(env);
1391
+ env->state->parse.size -= env->debug.marked_line;
1392
+ }
1393
+
1394
+ #endif
1395
+
1396
+ // --------------------------------------------------------------------------------------------------------
1397
+ // Lookahead
1398
+ // --------------------------------------------------------------------------------------------------------
1399
+
1400
+ /**
1401
+ * Check if lookahead contains the string `s` starting at position `offset + start`.
1402
+ * This advances only over matching characters.
1403
+ */
1404
+ static bool seq_from(Env *env, const char *restrict s, uint32_t start) {
1405
+ uint32_t len = (uint32_t) strlen(s);
1406
+ for (uint32_t i = 0; i < len; i++) {
1407
+ int32_t c = s[i];
1408
+ int32_t c2 = peek(env, start + i);
1409
+ if (c != c2) return false;
1410
+ }
1411
+ peek(env, start + len);
1412
+ return true;
1413
+ }
1414
+
1415
+ /**
1416
+ * Check if lookahead contains the string `s` starting at position `offset`.
1417
+ */
1418
+ static bool seq(Env *env, const char *restrict s) {
1419
+ return seq_from(env, s, 0);
1420
+ }
1421
+
1422
+ /**
1423
+ * Advance until the next newline or EOF, used to consume the body of a comment.
1424
+ */
1425
+ static void take_line(Env *env) {
1426
+ while (not_eof(env) && !is_newline(PEEK)) S_ADVANCE;
1427
+ }
1428
+
1429
/**
 * Whether `c` is a space or a horizontal tab.
 */
static bool is_space_or_tab(int32_t c) {
  switch (c) {
    case ' ':
    case '\t':
      return true;
    default:
      return false;
  }
}
1432
+
1433
+ /**
1434
+ * Advance until the next newline or EOF, used to consume the body of a cpp directive.
1435
+ * Escaped newlines are treated as line continuations, which allow spaces and tabs between backslash and newline.
1436
+ */
1437
+ static void take_line_escaped_newline(Env *env) {
1438
+ for (;;) {
1439
+ while (not_eof(env) && !is_newline(PEEK) && PEEK != '\\') S_ADVANCE;
1440
+ if (PEEK == '\\') {
1441
+ S_ADVANCE;
1442
+ if (is_space_or_tab(PEEK)) {
1443
+ while (is_space_or_tab(PEEK)) S_ADVANCE;
1444
+ if (is_newline(PEEK)) S_ADVANCE;
1445
+ }
1446
+ else S_ADVANCE;
1447
+ }
1448
+ else return;
1449
+ }
1450
+ }
1451
+
1452
+ /**
1453
+ * Skip the lexer until the following character is neither space nor tab.
1454
+ * Return whether any characters were skipped.
1455
+ */
1456
+ static bool skip_space(Env *env) {
1457
+ if (!is_space_char(PEEK)) return false;
1458
+ S_SKIP;
1459
+ while(is_space_char(PEEK)) S_SKIP;
1460
+ return true;
1461
+ }
1462
+
1463
+ /**
1464
+ * Skip the lexer until the following character is not a newline.
1465
+ * Return whether any characters were skipped.
1466
+ */
1467
+ static bool skip_newlines(Env *env) {
1468
+ if (!is_newline(PEEK)) return false;
1469
+ S_SKIP;
1470
+ while(is_newline(PEEK)) S_SKIP;
1471
+ return true;
1472
+ }
1473
+
1474
/**
 * The kind of whitespace that `skip_whitespace` saw last.
 */
typedef enum {
  // Nothing was skipped.
  NoSpace = 0,
  // The last skipped run consisted of space characters.
  Indented = 1,
  // The last skipped run consisted of newlines.
  BOL = 2,
} Space;
1479
+
1480
+ /**
1481
+ * Alternate between skipping space and newlines, and return which was seen last.
1482
+ * This does not use the lookahead buffer, but directly accesses the lexer.
1483
+ * Only to be used when it is certain that no whitespace has been copied to the buffer by previous steps, and that no
1484
+ * previous characters should be included in the range of non-zero-width symbol.
1485
+ */
1486
+ static Space skip_whitespace(Env *env) {
1487
+ Space space = NoSpace;
1488
+ while (true) {
1489
+ if (skip_space(env)) space = Indented;
1490
+ else if (skip_newlines(env)) space = BOL;
1491
+ else return space;
1492
+ };
1493
+ }
1494
+
1495
+ /**
1496
+ * Advance the lexer until the following character is neither space nor tab, starting at position `offset + start`, and
1497
+ * return the index of the next character.
1498
+ */
1499
+ static uint32_t take_space_from(Env *env, uint32_t start) {
1500
+ return advance_while(env, start, is_space_char);
1501
+ }
1502
+
1503
/**
 * Whether `c` can follow a keyword like `module` without turning it into an identifier, i.e. `c` is not an inner
 * identifier character.
 */
static bool token_end(int32_t c) {
  if (is_inner_id_char(c)) return false;
  return true;
}
1508
+
1509
+ /**
1510
+ * Check if lookahead contains the string `s` starting at position `offset + start`, followed by a non-id character.
1511
+ * See `seq`.
1512
+ */
1513
+ static bool token_from(Env *env, const char *restrict s, uint32_t start) {
1514
+ return seq_from(env, s, start) && token_end(peek(env, start + (uint32_t) strlen(s)));
1515
+ }
1516
+
1517
+ /**
1518
+ * `token_from` at the current offset.
1519
+ */
1520
+ static bool token(Env *env, const char *restrict s) {
1521
+ return seq(env, s) && token_end(peek(env, (uint32_t) strlen(s)));
1522
+ }
1523
+
1524
+ /**
1525
+ * Check if lookahead contains any of the strings in `tokens` starting at position `offset + start`, followed by a
1526
+ * non-id character.
1527
+ */
1528
+ static bool any_token_from(Env *env, size_t n, const char * tokens[], uint32_t start) {
1529
+ for (size_t i = 0; i < n; i++) {
1530
+ if (token_from(env, tokens[i], start)) return true;
1531
+ }
1532
+ return false;
1533
+ }
1534
+
1535
+ static bool match_symop(Env *env, const char *restrict target) {
1536
+ return symop_lookahead(env) == strlen(target) && seq(env, target);
1537
+ }
1538
+
1539
+ static bool uninitialized(Env *env) { return !has_contexts(env); }
1540
+
1541
+ static uint32_t conid(Env *env) {
1542
+ if (!is_conid_start_char(peek0(env))) return 0;
1543
+ return advance_while(env, 1, is_inner_id_char);
1544
+ }
1545
+
1546
/**
 * Result of `qualified_name`.
 */
typedef enum {
  // No qualified name was found.
  NoQualifiedName = 0,
  // A qualifier followed by a name accepted by the caller's predicate.
  QualifiedTarget = 1,
  // A qualifier followed by a conid that is not itself followed by a dot.
  QualifiedConid = 2,
} QualifiedName;
1551
+
1552
+ static QualifiedName qualified_name(Env *env, bool (*name)(Env *)) {
1553
+ bool qualified = false;
1554
+ while (true) {
1555
+ uint32_t end = conid(env);
1556
+ if (end == 0) break;
1557
+ if (!char_at(env, end, '.')) {
1558
+ if (qualified) return QualifiedConid;
1559
+ else break;
1560
+ }
1561
+ qualified = true;
1562
+ reset_lookahead_to(env, end + 1);
1563
+ if (name(env)) return true;
1564
+ }
1565
+ return NoQualifiedName;
1566
+ }
1567
+
1568
+ /**
1569
+ * Use the lookahead buffer to determine whether a character is escaped, by counting the number of backslashes.
1570
+ */
1571
+ static bool odd_backslashes_before(Env *env, int32_t index) {
1572
+ bool odd = false;
1573
+ while (index >= 0 && peek(env, (uint32_t) index) == '\\') {
1574
+ odd = !odd;
1575
+ index--;
1576
+ }
1577
+ return odd;
1578
+ }
1579
+
1580
+ /**
1581
+ * Advance before the next unescaped double quote.
1582
+ */
1583
+ static uint32_t take_string_literal(Env *env) {
1584
+ uint32_t end = 1;
1585
+ while (true) {
1586
+ end = advance_until_char(env, end, '"') + 1;
1587
+ if (is_eof(env) || !odd_backslashes_before(env, (int) end - 2)) return end;
1588
+ }
1589
+ }
1590
+
1591
+ /**
1592
+ * Advance before the single quote that validly ends a character literal.
1593
+ * If there is none, return 1.
1594
+ * Either the first character is a backslash, or the second character is a single quote.
1595
+ *
1596
+ * A single quote followed by backslash is a char unless it was part of a varid, in which case the backslash is a
1597
+ * lambda.
1598
+ * The caller must make sure to lex varids beforehand.
1599
+ */
1600
+ static uint32_t take_char_literal(Env *env) {
1601
+ if (char1(env, '\\')) return advance_until_char(env, 2, '\'') + 2;
1602
+ else return char_at(env, 2, '\'') ? 3 : 1;
1603
+ }
1604
+
1605
+ // --------------------------------------------------------------------------------------------------------
1606
+ // Lookahead: CPP
1607
+ // --------------------------------------------------------------------------------------------------------
1608
+
1609
/**
 * Classification of a CPP directive, as determined by `cpp_directive`.
 */
typedef enum {
  // Not a directive.
  CppNothing = 0,
  // Start of a conditional.
  CppStart = 1,
  // Alternative branch of a conditional.
  CppElse = 2,
  // End of a conditional.
  CppEnd = 3,
  // Any other directive.
  CppOther = 4,
} CppDirective;
1616
+
1617
+ static const char *cpp_tokens_start[3] = {
1618
+ "if",
1619
+ "ifdef",
1620
+ "ifndef",
1621
+ };
1622
+
1623
+ static bool cpp_cond_start(Env *env, uint32_t start) {
1624
+ return any_token_from(env, 3, cpp_tokens_start, start);
1625
+ }
1626
+
1627
+ static const char *cpp_tokens_else[4] = {
1628
+ "else",
1629
+ "elif",
1630
+ "elifdef",
1631
+ "elifndef",
1632
+ };
1633
+
1634
+ static bool cpp_cond_else(Env *env, uint32_t start) {
1635
+ return any_token_from(env, 4, cpp_tokens_else, start);
1636
+ }
1637
+
1638
+ static bool cpp_cond_end(Env *env, uint32_t start) { return token_from(env, "endif", start); }
1639
+
1640
+ static const char *cpp_tokens_other[7] = {
1641
+ "define",
1642
+ "undef",
1643
+ "include",
1644
+ "pragma",
1645
+ "error",
1646
+ "warning",
1647
+ "line",
1648
+ };
1649
+
1650
+ static bool cpp_directive_other(Env *env, uint32_t start) {
1651
+ return
1652
+ any_token_from(env, 7, cpp_tokens_other, start)
1653
+ ||
1654
+ // A hash followed by nothing but whitespace is CPP.
1655
+ // If non-whitespace follows whitespace, it is a parse error, unless we're in a brace layout; then it is a varsym.
1656
+ // Complete overkill to parse this, but eh!
1657
+ is_newline(peek(env, start))
1658
+ ||
1659
+ // shebang for scripts
1660
+ (char1(env, '!') && uninitialized(env))
1661
+ ;
1662
+ }
1663
+
1664
+ /**
1665
+ * If the first character at `offset` is a hash, skip space and try all tokens that start a CPP directive.
1666
+ * Return the matching variant of the enum `CppDirective`.
1667
+ */
1668
+ static CppDirective cpp_directive(Env *env) {
1669
+ if (!char0(env, '#')) return CppNothing;
1670
+ uint32_t start = take_space_from(env, 1);
1671
+ if (cpp_cond_start(env, start)) return CppStart;
1672
+ else if (cpp_cond_else(env, start)) return CppElse;
1673
+ else if (cpp_cond_end(env, start)) return CppEnd;
1674
+ else if (cpp_directive_other(env, start)) return CppOther;
1675
+ else return CppNothing;
1676
+ }
1677
+
1678
+ // --------------------------------------------------------------------------------------------------------
1679
+ // Starting layouts
1680
+ // --------------------------------------------------------------------------------------------------------
1681
+
1682
+ /**
1683
+ * Opening and closing braces are always followed by a command (`grammar/util.js`), so this can unconditionally push a
1684
+ * context.
1685
+ * See `grammar/externals.js` for more.
1686
+ *
1687
+ * Note: This is not related to regular brace layouts, which are handled by `start_layout`!
1688
+ * Aside from layouts, braces are also used for records and inferred type variables, where indentation is also ignored!
1689
+ * Therefore, we add a context to skip steps like semicolon generation.
1690
+ *
1691
+ * Check out some examples in the tests:
1692
+ * - data: record zero indent
1693
+ * - type decl: inferred quantifier at column 0
1694
+ */
1695
+ static Symbol start_brace(Env *env) {
1696
+ if (valid(env, START_BRACE)) {
1697
+ push_context(env, Braces, 0);
1698
+ return finish(START_BRACE, "start_brace");
1699
+ }
1700
+ return FAIL;
1701
+ }
1702
+
1703
+ /**
1704
+ * See `start_brace`.
1705
+ */
1706
+ static Symbol end_brace(Env *env) {
1707
+ if (valid(env, END_BRACE) && current_context(env) == Braces) {
1708
+ pop(env);
1709
+ return finish(END_BRACE, "end_brace");
1710
+ }
1711
+ return FAIL;
1712
+ }
1713
+
1714
+ /**
1715
+ * Return the first valid layout start symbol.
1716
+ */
1717
+ static Symbol valid_layout_start_sym(Env *env) {
1718
+ for (Symbol i = START; i < END; i++) {
1719
+ if (valid(env, i)) return i;
1720
+ }
1721
+ return FAIL;
1722
+ }
1723
+
1724
+ /**
1725
+ * Map `Symbol` to `ContextSort`.
1726
+ */
1727
+ static ContextSort layout_sort(Symbol s) {
1728
+ switch (s) {
1729
+ case START_DO:
1730
+ return DoLayout;
1731
+ case START_CASE:
1732
+ return CaseLayout;
1733
+ case START_IF:
1734
+ return MultiWayIfLayout;
1735
+ case START_LET:
1736
+ return LetLayout;
1737
+ case START_QUOTE:
1738
+ return QuoteLayout;
1739
+ default:
1740
+ return DeclLayout;
1741
+ }
1742
+ }
1743
+
1744
+ typedef struct {
1745
+ Symbol sym;
1746
+ ContextSort sort;
1747
+ } StartLayout;
1748
+
1749
+ /**
1750
+ * Determine whether the layout sort corresponding to the potentially valid symbol can start at this position.
1751
+ * If the context stack is `uninitialized`, the first layout is added by `process_token_init`.
1752
+ * In newline processing mode, brace layouts cannot be started because there may be comments before the brace that need
1753
+ * to be emitted first.
1754
+ * Regular `if/then/else` conditionals are always valid at the same position as multi-way if layouts.
1755
+ * If we were to unconditionally start a layout when START_IF is valid, it would never be possible to parse the former,
1756
+ * so this skips that layout sort unless the `Lexed` token is `LBar`.
1757
+ */
1758
+ static StartLayout valid_layout_start(Env *env, Lexed next) {
1759
+ StartLayout start = {.sym = valid_layout_start_sym(env), .sort = NoContext};
1760
+ if (uninitialized(env) || start.sym == FAIL) return start;
1761
+ ContextSort sort = layout_sort(start.sym);
1762
+ switch (next) {
1763
+ case LBar:
1764
+ break;
1765
+ case LBraceOpen:
1766
+ if (newline_active(env)) return start;
1767
+ sort = Braces;
1768
+ start.sym = START_EXPLICIT;
1769
+ break;
1770
+ default:
1771
+ if (sort == MultiWayIfLayout) return start;
1772
+ break;
1773
+ }
1774
+ start.sort = sort;
1775
+ return start;
1776
+ }
1777
+
1778
+ /**
1779
+ * If the current context is a brace layout, any indent is legal for a new layout.
1780
+ * Otherwise, compare with the indent of the current context.
1781
+ * Since starting layouts is allowed in tuple expressions, we look at the last real indent, skipping over `TExp`s, using
1782
+ * 0 if none exists (which should never be the case).
1783
+ *
1784
+ * According to the docs for `NondecreasingIndentation`, the rule is that a nested context may start at the same column
1785
+ * _if the enclosing context is a do expression_.
1786
+ * From experimental evidence, it appears though that this is the other way round – a do expression within, say, a case
1787
+ * alt can start at the same level as the case layout.
1788
+ */
1789
+ static bool indent_can_start_layout(Env *env, ContextSort sort, uint32_t indent) {
1790
+ if (current_context(env) == Braces) return true;
1791
+ uint32_t cur = current_indent(env);
1792
+ return (indent > cur || (indent == cur && sort == DoLayout));
1793
+ }
1794
+
1795
+ /**
1796
+ * Start the given layout if the position allows it:
1797
+ *
1798
+ * - If the current context is `ModuleHeader`, the layout must be the `where` after `module`, so any indent is valid.
1799
+ *
1800
+ * - If the new layout is a brace layout, legal indent is technically required, but we can be lenient since there's no
1801
+ * other way to interpret an opening brace after a layout opener.
1802
+ * However, we need to mark to include the brace in the range to create a terminal (see `grammar/externals.js` for
1803
+ * why).
1804
+ *
1805
+ * - Otherwise, examine indent.
1806
+ */
1807
+ static Symbol start_layout(Env *env, const StartLayout start, uint32_t indent, const char * restrict desc) {
1808
+ if (in_module_header(env)) pop(env);
1809
+ else if (start.sort == Braces) MARK("start_layout brace");
1810
+ else if (!indent_can_start_layout(env, start.sort, indent)) return FAIL;
1811
+ push_context(env, start.sort, indent);
1812
+ return finish(start.sym, desc);
1813
+ }
1814
+
1815
+ /**
1816
+ * The indent of a layout started at an interior token can only be determined by calling `get_column`.
1817
+ * This is an expensive operation, but hopefully it is rare enough to not make a substantial dent.
1818
+ * Because we might have advanced beyond the layout's start position to check conditions, we need to subtract the length
1819
+ * of the lookahead buffer from the current column.
1820
+ * Whitespace is skipped, and not added to the buffer, so the resulting position is after whitespace.
1821
+ */
1822
+ static Symbol start_layout_interior(Env *env, Lexed next) {
1823
+ StartLayout start = valid_layout_start(env, next);
1824
+ if (start.sort == NoContext) return FAIL;
1825
+ return start_layout(env, start, start_column(env), "interior");
1826
+ }
1827
+
1828
+ /**
1829
+ * The indent of a layout started at the beginning of a line is determined by `newline_lookahead`, so this does not have
1830
+ * to compute it.
1831
+ */
1832
+ static Symbol start_layout_newline(Env *env) {
1833
+ StartLayout start = valid_layout_start(env, env->state->newline.end);
1834
+ if (start.sort == NoContext) return FAIL;
1835
+ Symbol result = start_layout(env, start, env->state->newline.indent, "newline");
1836
+ if (result != FAIL) env->state->newline.no_semi = true;
1837
+ return result;
1838
+ }
1839
+
1840
+ /**
1841
+ * See `token_end_layout_texp`.
1842
+ */
1843
+ static Symbol texp_context(Env *env) {
1844
+ if (valid(env, START_TEXP)) {
1845
+ push_context(env, TExp, 0);
1846
+ return finish(START_TEXP, "texp_context");
1847
+ }
1848
+ else if (valid(env, END_TEXP) && current_context(env) == TExp) {
1849
+ pop(env);
1850
+ return finish(END_TEXP, "texp_context");
1851
+ }
1852
+ else return FAIL;
1853
+ }
1854
+
1855
+ // --------------------------------------------------------------------------------------------------------
1856
+ // Ending layouts
1857
+ // --------------------------------------------------------------------------------------------------------
1858
+
1859
+ /**
1860
+ * Separated this from `end_layout` because it caused some weird performance glitches.
1861
+ */
1862
+ static Symbol end_layout_unchecked(Env *env, const char *restrict desc) {
1863
+ pop(env);
1864
+ return finish(END, desc);
1865
+ }
1866
+
1867
+ /**
1868
+ * If a layout end is valid at this position, pop a context and succeed with layout end.
1869
+ */
1870
+ static Symbol end_layout(Env *env, const char *restrict desc) {
1871
+ if (valid(env, END)) return end_layout_unchecked(env, desc);
1872
+ else return FAIL;
1873
+ }
1874
+
1875
+ /**
1876
+ * Explicit brace layouts need a dedicated symbol, see `_cmd_layout_start_explicit` for an explanation.
1877
+ * Includes the brace in the range.
1878
+ */
1879
+ static Symbol end_layout_brace(Env *env) {
1880
+ if (valid(env, END_EXPLICIT) && current_context(env) == Braces) {
1881
+ advance_over(env, 0);
1882
+ MARK("end_layout_brace");
1883
+ pop(env);
1884
+ return finish(END_EXPLICIT, "brace");
1885
+ }
1886
+ else return FAIL;
1887
+ }
1888
+
1889
+ /**
1890
+ * End a layout based on indent decrease.
1891
+ *
1892
+ * If the indent of the current line is smaller than the indent of the current layout, we end the layout in most cases.
1893
+ * Exceptions are:
1894
+ *
1895
+ * - Brace layouts
1896
+ * - The top-level layout, which should only be ended at the end of file.
1897
+ * For leniency, we change the current indent to the smaller value.
1898
+ */
1899
+ static Symbol end_layout_indent(Env *env) {
1900
+ if (valid(env, END) && indent_less(env, env->state->newline.indent)) {
1901
+ if (top_layout(env)) {
1902
+ array_back(&env->state->contexts)->indent = env->state->newline.indent;
1903
+ return update_state("end top layout");
1904
+ }
1905
+ else {
1906
+ env->state->newline.skip_semi = false;
1907
+ return end_layout_unchecked(env, "indent");
1908
+ }
1909
+ }
1910
+ return FAIL;
1911
+ }
1912
+
1913
+ /**
1914
+ * An expression layout may be closed by an infix operator when it is not valid at that position:
1915
+ *
1916
+ * a :: IO Int
1917
+ * a = do a <- pure 5
1918
+ * pure a
1919
+ * >>= pure
1920
+ *
1921
+ * In this situation, the indent of the operator causes a semicolon to be generated, which leads to varsym being invalid
1922
+ * lookahead.
1923
+ * The layout is closed and the entire `do` block becomes the left operand of the `>>=`.
1924
+ * The same applies for `infix` id operators.
1925
+ *
1926
+ * It doesn't apply to multi-way if layouts, because those don't use semicolons.
1927
+ */
1928
+ static Symbol end_layout_infix(Env *env) {
1929
+ if (!valid(env, VARSYM) && !valid(env, CONSYM)) return end_layout(env, "symop invalid");
1930
+ return FAIL;
1931
+ }
1932
+
1933
+ /**
1934
+ * A case alt can have a `where` clause attached to it, so a case layout is ended by a `where` only if its indent is
1935
+ * equal to or smaller than the layout indent.
1936
+ *
1937
+ * A `do` or `if` cannot have a `where`, so they are always terminated.
1938
+ *
1939
+ * It would be tempting to leave it at that, but there can be multiple successive `where` clauses.
1940
+ * If a `case` is followed by two of them (greater indent), the first one would attach to the last alt.
1941
+ * The second one would have to close the `case` layout and attach to the next higher allowed place (e.g. the enclosing
1942
+ * function decl), but if its indent is greater, this cannot be detected here – it would just seem like a `where`
1943
+ * attaching to an alt, since we don't keep track of the number of `where`s encountered (and we couldn't, since we're
1944
+ * dealing with layouts, not case alts).
1945
+ *
1946
+ * By tracking the validity of `where` symbols, we can simplify the condition for `do` and `if`: End any layout when
1947
+ * `where` is parsed, but invalid.
1948
+ */
1949
+ static Symbol end_layout_where(Env *env) {
1950
+ if (valid(env, END) && !valid(env, WHERE) && is_layout_context(env)) return end_layout(env, "where");
1951
+ return FAIL;
1952
+ }
1953
+
1954
+ /**
1955
+ * Ending layouts with `in` heavily relies on parse errors in GHC, so this is difficult.
1956
+ * The heuristic here is that if `in` is not valid (i.e. a parse error), we pop any layout.
1957
+ *
1958
+ * Take the example of some inline layouts in a let decl:
1959
+ * `let a = case a of a -> do a in a`
1960
+ * The `do` and `case` layouts have valid `END` symbols at the `in`, but `in` itself is not valid as long as the `case`
1961
+ * hasn't reduced, so we pop until we find `IN`.
1962
+ *
1963
+ * This isn't enough though, since `let` also opened a layout that ends here, so we have to test for that explicitly.
1964
+ *
1965
+ * Note that this doesn't allow the `in` of a nested `let` to close the outer `let`, since the `END` for that isn't
1966
+ * valid before the inner `let` has reduced.
1967
+ */
1968
+ static Symbol end_layout_in(Env *env) {
1969
+ if (valid(env, END) && (!valid(env, IN) || current_context(env) == LetLayout)) return end_layout(env, "in");
1970
+ return FAIL;
1971
+ }
1972
+
1973
+ /**
1974
+ * For GADT constructor layouts.
1975
+ */
1976
+ static Symbol end_layout_deriving(Env *env) {
1977
+ if (valid(env, END) && !valid(env, DERIVING) && !top_layout(env) && current_context(env) == DeclLayout)
1978
+ return end_layout(env, "deriving");
1979
+ return FAIL;
1980
+ }
1981
+
1982
+ /**
1983
+ * Return `true` if there is a `TExp` context on the stack and only layouts above it.
1984
+ */
1985
+ static bool layouts_in_texp(Env *env) {
1986
+ if (is_layout_context(env) && (env->state->contexts.size > 1)) {
1987
+ for (int32_t i = (int32_t) env->state->contexts.size - 2; i >= 0; i--) {
1988
+ Context *cur = array_get(&env->state->contexts, i);
1989
+ if (cur->sort == TExp || cur->sort == Braces) return true;
1990
+ else if (cur->sort > Braces) break;
1991
+ }
1992
+ }
1993
+ return false;
1994
+ }
1995
+
1996
+ /**
1997
+ * Tuple expressions are constructs that syntactically delimit their contents in an unambiguous way that makes parsing
1998
+ * a lot easier.
1999
+ * In GHC, this concept is used to classify productions like view patterns and annotated expressions.
2000
+ * For us, unfortunately, it also means that there are significantly more circumstances in which layouts can be ended by
2001
+ * parse errors.
2002
+ *
2003
+ * In practice, it means that expression layouts can be closed by commas, vertical bars and closing brackets and parens
2004
+ * when they are elements in a list or tuple-like construct:
2005
+ *
2006
+ * (case a of a -> a, do a; a, if | a -> a | a -> a)
2007
+ * [case a of a -> a | a <- a]
2008
+ *
2009
+ * We encode this as a special context sort, `TExp`, that is pushed and popped at opening and closing brackets.
2010
+ *
2011
+ * Some other constructs, like guards, have similar characteristics, so we use the same mechanism for them:
2012
+ *
2013
+ * > a = case a of
2014
+ * > a | let a = a -> a
2015
+ *
2016
+ * Here the let layout must be ended by parse error, so we start a tuple expression at the bar and end it at the arrow.
2017
+ */
2018
+ static Symbol token_end_layout_texp(Env *env) {
2019
+ return (valid(env, END) && layouts_in_texp(env)) ? end_layout(env, "texp") : FAIL;
2020
+ }
2021
+
2022
+ static Symbol force_end_context(Env *env) {
2023
+ for (int32_t i = (int32_t) env->state->contexts.size - 1; i >= 0; i--) {
2024
+ ContextSort ctx = array_get(&env->state->contexts, i)->sort;
2025
+ Symbol s = context_end_sym(ctx);
2026
+ pop(env);
2027
+ if (s != FAIL && valid(env, s)) return finish(s, "force_end_context");
2028
+ }
2029
+ return FAIL;
2030
+ }
2031
+
2032
+ // --------------------------------------------------------------------------------------------------------
2033
+ // Operators
2034
+ // --------------------------------------------------------------------------------------------------------
2035
+
2036
+ /**
2037
+ * Opening tokens are a class of characters that may immediately follow prefix operators like bang pattern `!` or type
2038
+ * application `@`.
2039
+ */
2040
+ static bool opening_token(Env *env, uint32_t i) {
2041
+ int32_t c = peek(env, i);
2042
+ switch (c) {
2043
+ case 0x27e6: // ⟦
2044
+ case 0x2987: // ⦇
2045
+ case '(':
2046
+ case '[':
2047
+ case '"':
2048
+ return true;
2049
+ case '{':
2050
+ return peek(env, i + 1) != '-';
2051
+ default:
2052
+ // Includes single quote
2053
+ return is_id_char(c);
2054
+ }
2055
+ }
2056
+
2057
/**
 * Test for reserved operators of two characters.
 * Returns `false` for `=>`, `<-` and `::`, and `true` for every other combination.
 */
static bool valid_symop_two_chars(int32_t first_char, int32_t second_char) {
  bool reserved =
    (first_char == '=' && second_char == '>') ||
    (first_char == '<' && second_char == '-') ||
    (first_char == ':' && second_char == ':');
  return !reserved;
}
2072
+
2073
+ /**
2074
+ * If a prefix operator is not followed by an opening token, it may still be a valid varsym.
2075
+ */
2076
+ static Lexed lex_prefix(Env *env, Lexed t) {
2077
+ return opening_token(env, 1) ? t : LSymop;
2078
+ }
2079
+
2080
+ /**
2081
+ * If a splice operator is not followed by an opening token, it may still be a valid varsym.
2082
+ * We only allow variables and parenthesized expressions for performance reasons, though.
2083
+ */
2084
+ static Lexed lex_splice(int32_t c) {
2085
+ return varid_start_char(c) || c == '(' ? LDollar : LSymop;
2086
+ }
2087
+
2088
+ /**
2089
+ * Lex special occurrences of symbolic operator characters, or declare a valid operator.
2090
+ *
2091
+ * For the dot:
2092
+ *
2093
+ * - Two dots: `..`: Only used for arithmetic sequences (`[a..10]`).
2094
+ * These conflict with record field projection (`[a.b, c]`) and infix operators (`[a..+b]`), and it's too hard to
2095
+ * disambiguate them without this special rule.
2096
+ *
2097
+ * - Tight dot `a.b.c`: A regular tight op, but it has to get a separate symbol from qualified module dots since those
2098
+ * can be followed by symops.
2099
+ *
2100
+ * - Prefix dot `(.a)`: A regular prefix op, for record dot field selectors.
2101
+ *
2102
+ * - Qualified dot `A.B.c`, `A.B.C`, `A.B.+`: Used primarily for qualified modules, but needs to be accepted for field
2103
+ * selectors as well due to ambiguity.
2104
+ * This is not a regular tight op since it needs to allow symops and conid.
2105
+ */
2106
+ static Lexed lex_symop(Env *env) {
2107
+ uint32_t len = symop_lookahead(env);
2108
+ if (len == 0) return LNothing;
2109
+ int32_t c1 = unsafe_peek(env, 0);
2110
+ if (len == 1) {
2111
+ switch (c1) {
2112
+ case '?':
2113
+ // A `?` can be the head of an implicit parameter, if followed by a varid.
2114
+ return varid_start_char(peek1(env)) ? LNothing : LSymop;
2115
+ case '#':
2116
+ return char1(env, ')') ? LUnboxedClose : LHash;
2117
+ case '|':
2118
+ return char1(env, ']') ? LQuoteClose : LBar;
2119
+ case '!':
2120
+ return lex_prefix(env, LBang);
2121
+ case '~':
2122
+ return lex_prefix(env, LTilde);
2123
+ case '@':
2124
+ return lex_prefix(env, LAt);
2125
+ case '%':
2126
+ return lex_prefix(env, LPercent);
2127
+ case '$':
2128
+ return lex_splice(peek1(env));
2129
+ case '.':
2130
+ if (is_id_char(peek1(env))) return LDotId;
2131
+ else if (opening_token(env, 1)) return LDotOpen;
2132
+ else return LSymop;
2133
+ case 0x2192: // →
2134
+ case 0x22b8: // ⊸
2135
+ return LArrow;
2136
+ case 0x21d2: // ⇒
2137
+ return LCArrow;
2138
+ case '=':
2139
+ case 0x27e7: // ⟧
2140
+ case 0x2988: // ⦈
2141
+ return LTexpCloser;
2142
+ case '*':
2143
+ case '-':
2144
+ return LSymopSpecial;
2145
+ case '\\':
2146
+ case 0x2190: // ←
2147
+ case 0x2200: // ∀
2148
+ case 0x2237: // ∷
2149
+ case 0x2605: // ★
2150
+ case 0x27e6: // ⟦
2151
+ case 0x2919: // ⤙
2152
+ case 0x291a: // ⤚
2153
+ case 0x291b: // ⤛
2154
+ case 0x291c: // ⤜
2155
+ case 0x2987: // ⦇
2156
+ return LNothing;
2157
+ }
2158
+ }
2159
+ else if (len == 2) {
2160
+ if (seq(env, "->")) return LArrow;
2161
+ if (seq(env, "=>")) return LCArrow;
2162
+ int32_t c2 = unsafe_peek(env, 1);
2163
+ switch (c1) {
2164
+ case '$':
2165
+ if (c2 == '$') return lex_splice(peek2(env));
2166
+ break;
2167
+ case '|':
2168
+ if (c2 == '|' && char2(env, ']')) return LQuoteClose;
2169
+ break;
2170
+ case '.':
2171
+ if (c2 == '.') return LDotDot;
2172
+ else return LDotSymop;
2173
+ break;
2174
+ case '#':
2175
+ // Unboxed unit `(##)` and unboxed sum with missing space `(#| Int #)`
2176
+ if (c2 == '#' || c2 == '|') return LSymopSpecial;
2177
+ break;
2178
+ default:
2179
+ if (!valid_symop_two_chars(c1, c2)) return LNothing;
2180
+ break;
2181
+ }
2182
+ }
2183
+ else switch (c1) {
2184
+ case '-':
2185
+ if (seq(env, "->.")) return LArrow;
2186
+ break;
2187
+ case '.':
2188
+ return LDotSymop;
2189
+ }
2190
+ return LSymop;
2191
+ }
2192
+
2193
+ /**
2194
+ * If the next character after whitespace starting from `start` is a closing parenthesis, finish with `LEFT_SECTION_OP`.
2195
+ * This is called after a previous step has already lexed a valid infix operator (symbolic or ticked varid).
2196
+ *
2197
+ * Left section operators must be detected here to disambiguate from infix expressions:
2198
+ *
2199
+ * > f = (1 - 2 +)
2200
+ *
2201
+ * When lookahead is `+`, the parser must decide whether to reduce `1 - 2` to `infix` because it is the operand of a
2202
+ * section, or to shift into another `infix`.
2203
+ * With a single lookahead token, this is not decidable.
2204
+ *
2205
+ * Note: The obvious solution would be to make `infix` left-associative, so it would always reduce.
2206
+ * Unfortunately, this doesn't work for minus, due to apparently unsurmountable problems caused by the
2207
+ * apply/infix/negation conflict.
2208
+ *
2209
+ * Note: This will fail if there are extras between the operator and the parenthesis (and the ticks and the varid).
2210
+ *
2211
+ * Note: If the operator isn't followed by a parenthesis, it will be parsed as an infix operator in the next step, since
2212
+ * those are always valid when left sections are (except for qualified symops).
2213
+ * However, this function advances over whitespace to find the paren, so if the next step marks and finishes, it will
2214
+ * either:
2215
+ * - Include the whitespace in its range, if this consumes it
2216
+ * - Have a zero-width range, if this skips whitespace
2217
+ * To mitigate this, we introduce the auxiliary symbol `NO_SECTION_OP`, which is (optionally) valid before infix
2218
+ * operators, but not before section operators.
2219
+ * When this function finds any whitespace before the parenthesis, it will finish with that symbol, so that
2220
+ * `LEFT_SECTION_OP` won't be valid in the next run, but all other infix operators are.
2221
+ */
2222
+ static Symbol left_section_op(Env *env, uint32_t start) {
2223
+ if (valid(env, LEFT_SECTION_OP)) {
2224
+ advance_before(env, start);
2225
+ Space space = skip_whitespace(env);
2226
+ if (char_at(env, start, ')')) return finish(LEFT_SECTION_OP, "left section");
2227
+ if (space != NoSpace) return finish_if_valid(env, NO_SECTION_OP, "left section");
2228
+ }
2229
+ return FAIL;
2230
+ }
2231
+
2232
+ /**
2233
+ * Specialization of `left_section_op` for ticked infix identifiers.
2234
+ */
2235
+ static Symbol left_section_ticked(Env *env) {
2236
+ if (valid(env, LEFT_SECTION_OP)) {
2237
+ uint32_t end_tick = advance_until_char(env, 1, '`');
2238
+ // Could be EOF
2239
+ if (char_at(env, end_tick, '`')) {
2240
+ return left_section_op(env, end_tick + 1);
2241
+ }
2242
+ }
2243
+ return FAIL;
2244
+ }
2245
+
2246
+ /**
2247
+ * This calls `symop_lookahead` to ensure that the position has advanced beyond the end of the symop, which is necessary
2248
+ * because newline lookahead may have validated the symop in a previous run.
2249
+ * This marks the range to emit a terminal.
2250
+ */
2251
+ static Symbol finish_symop(Env *env, Symbol s) {
2252
+ if (valid(env, s) || valid(env, LEFT_SECTION_OP)) {
2253
+ uint32_t after_symop = symop_lookahead(env);
2254
+ SEQ(left_section_op(env, after_symop));
2255
+ MARK("symop");
2256
+ return s;
2257
+ }
2258
+ return FAIL;
2259
+ }
2260
+
2261
+ /**
2262
+ * Tight ops like `dot.syntax` require that no initial whitespace was skipped.
2263
+ */
2264
+ static Symbol tight_op(Env *env, bool whitespace, Symbol s) {
2265
+ if (!whitespace) return finish_if_valid(env, s, "tight_op");
2266
+ else return FAIL;
2267
+ }
2268
+
2269
+ /**
2270
+ * Used for situations where the operator is followed by an opening token, and so can be a prefix op if it is preceded
2271
+ * by whitespace; but is no valid tight op and therefore becomes a regular operator if not preceded by whitespace or the
2272
+ * symbol is not valid.
2273
+ *
2274
+ * Only used for `%` (modifier).
2275
+ */
2276
+ static Symbol prefix_or_varsym(Env *env, bool whitespace, Symbol s) {
2277
+ if (whitespace) SEQ(finish_if_valid(env, s, "prefix_or_varsym"));
2278
+ return finish_symop(env, VARSYM);
2279
+ }
2280
+
2281
+ /**
2282
+ * Used for situations where the operator is followed by an opening token, and so can be a tight op if it is not
2283
+ * preceded by whitespace; but is no valid prefix op and therefore becomes a regular operator if preceded by whitespace
2284
+ * or the symbol is not valid.
2285
+ *
2286
+ * Only used for `.`, when a projection selector `(.fieldname)` is not valid at this position, so the dot becomes the
2287
+ * composition operator.
2288
+ */
2289
+ static Symbol tight_or_varsym(Env *env, bool whitespace, Symbol s) {
2290
+ SEQ(tight_op(env, whitespace, s));
2291
+ return finish_symop(env, VARSYM);
2292
+ }
2293
+
2294
+ /**
2295
+ * Used for situations where the operator is followed by an opening token, and so can be a tight op if it is not
2296
+ * preceded by whitespace or a prefix op if it is.
2297
+ *
2298
+ * If neither of those symbols is valid, fall back to a regular operator.
2299
+ *
2300
+ * Used for `!`, `~` and `@`.
2301
+ */
2302
+ static Symbol infix_or_varsym(Env *env, bool whitespace, Symbol prefix, Symbol tight) {
2303
+ SEQ(finish_if_valid(env, whitespace ? prefix : tight, "infix_or_varsym"));
2304
+ return finish_symop(env, VARSYM);
2305
+ }
2306
+
2307
+ static Symbol qualified_op(Env *env) {
2308
+ if (qualified_name(env, is_symop) == QualifiedTarget) {
2309
+ SEQ(left_section_op(env, symop_lookahead(env)));
2310
+ return QUALIFIED_OP;
2311
+ }
2312
+ return FAIL;
2313
+ }
2314
+
2315
+ // --------------------------------------------------------------------------------------------------------
2316
+ // Token lookahead
2317
+ // --------------------------------------------------------------------------------------------------------
2318
+
2319
+ /**
2320
+ * Detect the start of a quasiquote: An opening bracket followed by an optional varid and a vertical bar, all without
2321
+ * whitespace in between.
2322
+ */
2323
+ static bool is_qq_start(Env *env) {
2324
+ uint32_t end = advance_while(env, 1, quoter_char);
2325
+ return char_at(env, end, '|');
2326
+ }
2327
+
2328
+ /**
2329
+ * An end token is a keyword like `else` or `deriving` that can end a layout without newline or indent.
2330
+ */
2331
+ static Lexed try_end_token(Env *env, const char * restrict target, Lexed match) {
2332
+ if (token(env, target)) return match;
2333
+ else return LNothing;
2334
+ }
2335
+
2336
+ /**
2337
+ * Check that a symop consists only of minuses after the second character.
2338
+ */
2339
+ static bool only_minus(Env *env) {
2340
+ uint32_t i = 2;
2341
+ while (peek(env, i) == '-') i++;
2342
+ return !symop_char(peek(env, i));
2343
+ }
2344
+
2345
+ /**
2346
+ * Check that a symop consists only of minuses, making it a comment herald.
2347
+ */
2348
+ static bool line_comment_herald(Env *env) {
2349
+ return seq(env, "--") && only_minus(env);
2350
+ }
2351
+
2352
+ static Lexed lex_cpp(Env *env) {
2353
+ switch(cpp_directive(env)) {
2354
+ case CppElse:
2355
+ return LCppElse;
2356
+ case CppNothing:
2357
+ return LNothing;
2358
+ default:
2359
+ return LCpp;
2360
+ }
2361
+ }
2362
+
2363
+ /**
2364
+ * Lex pragmas, comments and CPP.
2365
+ */
2366
+ static Lexed lex_extras(Env *env, bool bol) {
2367
+ switch (peek0(env)) {
2368
+ case '{':
2369
+ if (char1(env, '-')) return char2(env, '#') ? LPragma : LBlockComment;
2370
+ break;
2371
+ case '#':
2372
+ if (bol) return lex_cpp(env);
2373
+ break;
2374
+ case '-':
2375
+ if (line_comment_herald(env)) return LLineComment;
2376
+ break;
2377
+ default:
2378
+ break;
2379
+ }
2380
+ return LNothing;
2381
+ }
2382
+
2383
+ /**
2384
+ * The main lexing entry point, branching on the first character, then advancing as far as necessary to identify all
2385
+ * interesting tokens.
2386
+ */
2387
+ static Lexed lex(Env *env, bool bol) {
2388
+ SEQ(lex_extras(env, bol));
2389
+ if (symop_char(peek0(env))) SEQ(lex_symop(env));
2390
+ else switch (peek0(env)) {
2391
+ case 'w':
2392
+ return try_end_token(env, "where", LWhere);
2393
+ case 'i':
2394
+ return try_end_token(env, "in", LIn);
2395
+ case 't':
2396
+ return try_end_token(env, "then", LThen);
2397
+ case 'e':
2398
+ return try_end_token(env, "else", LElse);
2399
+ case 'd':
2400
+ return try_end_token(env, "deriving", LDeriving);
2401
+ case 'm':
2402
+ if ((uninitialized(env) || in_module_header(env)) && token(env, "module")) return LModule;
2403
+ break;
2404
+ case '{':
2405
+ return LBraceOpen;
2406
+ case '}':
2407
+ return LBraceClose;
2408
+ case ';':
2409
+ return LSemi;
2410
+ case '`':
2411
+ return LTick;
2412
+ case '[':
2413
+ if (valid(env, QQ_START) && is_qq_start(env)) return LBracketOpen;
2414
+ break;
2415
+ case ']':
2416
+ case ')':
2417
+ case ',':
2418
+ return LTexpCloser;
2419
+ default:
2420
+ if (is_conid_start_char(peek0(env))) return LUpper;
2421
+ break;
2422
+ }
2423
+ return LNothing;
2424
+ }
2425
+
2426
+ // --------------------------------------------------------------------------------------------------------
2427
+ // CPP
2428
+ // --------------------------------------------------------------------------------------------------------
2429
+
2430
+ /**
2431
+ * This tests for `#endif` directly after taking a line, so it only matches it at the first column.
2432
+ * Int finishes right before the `#endif`, so that pragma is parsed by `cpp_consume` in the next round.
2433
+ */
2434
+ static Symbol cpp_else(Env *env, bool emit) {
2435
+ uint32_t nesting = 1;
2436
+ do {
2437
+ take_line_escaped_newline(env);
2438
+ if (emit) MARK("cpp_else");
2439
+ S_ADVANCE;
2440
+ reset_lookahead(env);
2441
+ switch (cpp_directive(env)) {
2442
+ case CppStart:
2443
+ nesting++;
2444
+ break;
2445
+ case CppEnd:
2446
+ nesting--;
2447
+ break;
2448
+ default:
2449
+ break;
2450
+ }
2451
+ }
2452
+ while (not_eof(env) && nesting > 0);
2453
+ if (emit) return finish(CPP, "cpp-else");
2454
+ else return FAIL;
2455
+ }
2456
+
2457
+ static Symbol cpp_line(Env *env) {
2458
+ take_line_escaped_newline(env);
2459
+ return finish_marked(env, CPP, "cpp");
2460
+ }
2461
+
2462
+ // --------------------------------------------------------------------------------------------------------
2463
+ // Comments
2464
+ // --------------------------------------------------------------------------------------------------------
2465
+
2466
+ /**
2467
+ * Distinguish between haddocks and plain comments by matching on the first non-whitespace character.
2468
+ */
2469
+ static Symbol comment_type(Env *env) {
2470
+ uint32_t i = 2;
2471
+ while (peek(env, i) == '-') i++;
2472
+ while (not_eof(env)) {
2473
+ int32_t c = peek(env, i++);
2474
+ if (c == '|' || c == '^') return HADDOCK;
2475
+ else if (!is_space_char(c)) break;
2476
+ }
2477
+ return COMMENT;
2478
+ }
2479
+
2480
+ /**
2481
+ * Inline comments extend over all consecutive lines that start with comments.
2482
+ * Could be improved by requiring equal indent.
2483
+ */
2484
+ static Symbol inline_comment(Env *env) {
2485
+ Symbol sym = comment_type(env);
2486
+ do {
2487
+ take_line(env);
2488
+ MARK("inline comment");
2489
+ S_ADVANCE;
2490
+ reset_lookahead(env);
2491
+ } while (line_comment_herald(env));
2492
+ return sym;
2493
+ }
2494
+
2495
+ static uint32_t consume_block_comment(Env *env, uint32_t col) {
2496
+ uint32_t level = 0;
2497
+ for (;;) {
2498
+ if (is_eof(env)) return col;
2499
+ col++;
2500
+ switch (PEEK) {
2501
+ case '{':
2502
+ S_ADVANCE;
2503
+ if (PEEK == '-') {
2504
+ S_ADVANCE;
2505
+ col++;
2506
+ level++;
2507
+ }
2508
+ break;
2509
+ case '-':
2510
+ S_ADVANCE;
2511
+ if (PEEK == '}') {
2512
+ S_ADVANCE;
2513
+ col++;
2514
+ if (level == 0) return col;
2515
+ level--;
2516
+ }
2517
+ break;
2518
+ NEWLINE_CASES:
2519
+ S_ADVANCE;
2520
+ col = 0;
2521
+ break;
2522
+ case '\t':
2523
+ S_ADVANCE;
2524
+ col += 7;
2525
+ break;
2526
+ default:
2527
+ S_ADVANCE;
2528
+ break;
2529
+ }
2530
+ }
2531
+ }
2532
+
2533
+ /**
2534
+ * Since {- -} comments can be nested arbitrarily, this has to keep track of how many have been opened, so that the
2535
+ * outermost comment isn't closed prematurely.
2536
+ */
2537
+ static Symbol block_comment(Env *env) {
2538
+ Symbol sym = comment_type(env);
2539
+ consume_block_comment(env, env->state->lookahead.size);
2540
+ return finish_marked(env, sym, "block_comment");
2541
+ }
2542
+
2543
+ // --------------------------------------------------------------------------------------------------------
2544
+ // Pragma
2545
+ // --------------------------------------------------------------------------------------------------------
2546
+
2547
+ static bool consume_pragma(Env *env) {
2548
+ if (seq(env, "{-#")) {
2549
+ while (!seq(env, "#-}") && not_eof(env)) {
2550
+ reset_lookahead(env);
2551
+ advance_over(env, 0);
2552
+ }
2553
+ return true;
2554
+ }
2555
+ return false;
2556
+ }
2557
+
2558
+ /**
2559
+ * Since pragmas can occur anywhere, like comments, but contrarily determine indentation when occurring at the beginning
2560
+ * of a line in layouts, this sets `NResume` to continue newline processing with the indent of the pragma.
2561
+ *
2562
+ * If the pragma is followed by newline, this only ensures that no semicolon is emitted (since this rule is run before
2563
+ * `semicolon` and `NResume` restarts lookahead).
2564
+ *
2565
+ * Otherwise it ensures that the following token is treated as a layout element with the correct indent.
2566
+ */
2567
+ static Symbol pragma(Env *env) {
2568
+ if (consume_pragma(env)) {
2569
+ MARK("pragma");
2570
+ if (env->state->newline.state != NInactive) env->state->newline.state = NResume;
2571
+ return finish(PRAGMA, "newline");
2572
+ }
2573
+ return FAIL;
2574
+ }
2575
+
2576
+ // --------------------------------------------------------------------------------------------------------
2577
+ // Quasiquote
2578
+ // --------------------------------------------------------------------------------------------------------
2579
+
2580
+ static Symbol qq_body(Env *env) {
2581
+ for (;;) {
2582
+ if (is_eof(env)) return finish(QQ_BODY, "qq_body");
2583
+ else if (PEEK == 0x27e7) {
2584
+ return finish_marked(env, QQ_BODY, "qq_body");
2585
+ }
2586
+ else if (PEEK == '|') {
2587
+ MARK("qq_body");
2588
+ S_ADVANCE;
2589
+ if (PEEK == ']') {
2590
+ return finish(QQ_BODY, "qq_body");
2591
+ }
2592
+ } else S_ADVANCE;
2593
+ }
2594
+ }
2595
+
2596
+ // --------------------------------------------------------------------------------------------------------
2597
+ // Semicolon
2598
+ // --------------------------------------------------------------------------------------------------------
2599
+
2600
+ /**
2601
+ * When encountering explicit semicolons, we want to ensure that a subsequent newline doesn't trigger a layout
2602
+ * semicolon, so we set `skip_semi`.
2603
+ * If the next symbol is not a newline (and not another semicolon), the scanner will immediately end up in
2604
+ * `resolve_semicolon`, matching the condition, where we unset the flag to avoid a mid-line semicolon from influencing
2605
+ * an unrelated newline.
2606
+ *
2607
+ * Take this example:
2608
+ *
2609
+ * > a = 1;;
2610
+ * > b = 2
2611
+ * > ;;c = 3
2612
+ *
2613
+ * At the first semicolon, `explicit_semicolon` is called (conditioned on `LSemi` in `process_token_interior`) and
2614
+ * SEMICOLON is valid, so the flag is set.
2615
+ * The scanner will be called again immediately without advancing, and first enter `resolve_semicolon`, which does
2616
+ * nothing because the next token is still `LSemi`.
2617
+ * Next it will enter `explicit_semicolon` again.
2618
+ * SEMICOLON is valid, but since the flag is set we fall through and defer to internal lexing.
2619
+ * The grammar advances into `semi` (in `util.js`), which causes SEMICOLON to become invalid.
2620
+ * The scanner is executed before the second semicolon, where both functions skip again, this time additionally because
2621
+ * SEMICOLON is now invalid.
2622
+ *
2623
+ * In the next scan, the newline branch is taken in `scan`, so this function is not executed again.
2624
+ * Newline lookahead finds the next line to begin at column 0, which would usually trigger a layout semicolon in
2625
+ * `semicolon`, but that is inhibited by `skip_semi`, so the scan only skips whitespace and resets the newline state,
2626
+ * which unsets `skip_semi` again.
2627
+ * In the following scan, the conditions for both functions are unfulfilled, so parsing continues regularly until the
2628
+ * next newline.
2629
+ *
2630
+ * Newline lookahead now encounters the third semicolon on the next line and sets `no_semi`, which supersedes
2631
+ * `skip_semi` and prohibits layout semicolon irreversibly, so the explicit semicolons are parsed by the grammar.
2632
+ *
2633
+ * Now consider an inline semicolon:
2634
+ *
2635
+ * > f = let
2636
+ * > a = 1; b = 2
2637
+ * > c = 3; {- x -}
2638
+ * > d = 4
2639
+ * > in c
2640
+ *
2641
+ * When the semicolon is lexed, `explicit_semicolon` sets `skip_semi`.
2642
+ * If we would not reset it until the newline, no layout semicolon would be generated before `c`, resulting in a parse
2643
+ * error at `=`.
2644
+ * Therefore, `resolve_semicolon` unsets `skip_semi` when lexing `b`, triggered by `skip_semi` being set and the next
2645
+ * token not being `LSemi`.
2646
+ *
2647
+ * The semicolon after `c = 3` is followed by a comment, so it is unclear if there is going to be another layout element
2648
+ * in the same line.
2649
+ * If there is none, the situation is the same as in the first example's first line; if another layout element were to
2650
+ * follow, `skip_semi` would need to be reset like in this example's first line.
2651
+ * Therefore, `resolve_semicolon` also keeps the flag as it is in this case.
2652
+ */
2653
+ static Symbol explicit_semicolon(Env *env) {
2654
+ if (valid(env, SEMICOLON) && !env->state->newline.skip_semi) {
2655
+ env->state->newline.skip_semi = true;
2656
+ return update_state("explicit semicolon enable");
2657
+ }
2658
+ return FAIL;
2659
+ }
2660
+
2661
+ static Symbol resolve_semicolon(Env *env, Lexed next) {
2662
+ if (env->state->newline.skip_semi) {
2663
+ switch(next) {
2664
+ case LLineComment:
2665
+ case LBlockComment:
2666
+ case LPragma:
2667
+ case LSemi:
2668
+ break;
2669
+ default:
2670
+ env->state->newline.skip_semi = false;
2671
+ return update_state("explicit semicolon disable");
2672
+ }
2673
+ }
2674
+ return FAIL;
2675
+ }
2676
+
2677
+ /**
2678
+ * Generate a layout semicolon after a newline if the indent is less or equal to the current layout's indent, unless:
2679
+ *
2680
+ * - The current context doesn't use layout semicolons, which is the case for explicit brace layouts, tuple expressions,
2681
+ * the module header and multi-way if layouts.
2682
+ *
2683
+ * - `no_semi` was set because newline lookahead found an explicit semicolon in the next line, or this function was
2684
+ * executed before for the same newline.
2685
+ *
2686
+ * - `skip_semi` was set because the previous line ended with an explicit semicolon.
2687
+ */
2688
+ static Symbol semicolon(Env *env) {
2689
+ if (
2690
+ is_semicolon_context(env)
2691
+ &&
2692
+ !(env->state->newline.no_semi || env->state->newline.skip_semi)
2693
+ &&
2694
+ indent_lesseq(env, env->state->newline.indent)
2695
+ ) {
2696
+ env->state->newline.no_semi = true;
2697
+ return finish(SEMICOLON, "newline");
2698
+ }
2699
+ else return FAIL;
2700
+ }
2701
+
2702
+ // --------------------------------------------------------------------------------------------------------
2703
+ // High-level `Lexed` dispatch
2704
+ // --------------------------------------------------------------------------------------------------------
2705
+
2706
+ /**
2707
+ * Process a `Lexed` token if it results in a layout end or an extra.
2708
+ *
2709
+ * This is called by `newline_post` before marking, so the actions must not fail after advancing.
2710
+ */
2711
+ static Symbol process_token_safe(Env *env, Lexed next) {
2712
+ switch (next) {
2713
+ case LWhere:
2714
+ return end_layout_where(env);
2715
+ case LIn:
2716
+ return end_layout_in(env);
2717
+ case LThen:
2718
+ case LElse:
2719
+ return end_layout(env, "then/else");
2720
+ case LDeriving:
2721
+ return end_layout_deriving(env);
2722
+ case LBar:
2723
+ if (!valid(env, BAR)) return end_layout(env, "bar");
2724
+ break;
2725
+ case LPragma:
2726
+ return pragma(env);
2727
+ case LBlockComment:
2728
+ return block_comment(env);
2729
+ case LLineComment:
2730
+ return inline_comment(env);
2731
+ case LCppElse:
2732
+ return cpp_else(env, true);
2733
+ case LCpp:
2734
+ return cpp_line(env);
2735
+ case LSymop:
2736
+ case LTick:
2737
+ case LHash:
2738
+ return end_layout_infix(env);
2739
+ case LUnboxedClose:
2740
+ SEQ(token_end_layout_texp(env));
2741
+ return end_layout_infix(env);
2742
+ case LArrow:
2743
+ if (!valid(env, ARROW)) return token_end_layout_texp(env);
2744
+ break;
2745
+ case LTexpCloser:
2746
+ return token_end_layout_texp(env);
2747
+ case LQuoteClose:
2748
+ return end_layout(env, "quote bracket");
2749
+ break;
2750
+ default:
2751
+ break;
2752
+ }
2753
+ return FAIL;
2754
+ }
2755
+
2756
+ /**
2757
+ * Process a `Lexed` token if it results in a symbolic operator.
2758
+ */
2759
+ static Symbol process_token_symop(Env *env, bool whitespace, Lexed next) {
2760
+ switch (next) {
2761
+ case LDotDot:
2762
+ SEQ(finish_if_valid(env, DOTDOT, "symop"));
2763
+ return tight_op(env, whitespace, QUAL_DOT);
2764
+ case LDotId:
2765
+ SEQ(finish_if_valid(env, whitespace ? PREFIX_DOT : TIGHT_DOT, "symop"));
2766
+ return tight_op(env, whitespace, QUAL_DOT);
2767
+ case LDotSymop:
2768
+ return tight_or_varsym(env, whitespace, QUAL_DOT);
2769
+ case LDotOpen:
2770
+ return prefix_or_varsym(env, whitespace, PREFIX_DOT);
2771
+ case LBang:
2772
+ return infix_or_varsym(env, whitespace, PREFIX_BANG, TIGHT_BANG);
2773
+ case LTilde:
2774
+ return infix_or_varsym(env, whitespace, PREFIX_TILDE, TIGHT_TILDE);
2775
+ case LAt:
2776
+ return infix_or_varsym(env, whitespace, PREFIX_AT, TIGHT_AT);
2777
+ case LPercent:
2778
+ return prefix_or_varsym(env, whitespace, PREFIX_PERCENT);
2779
+ case LSymop:
2780
+ if (char0(env, ':')) return finish_symop(env, CONSYM);
2781
+ else return finish_symop(env, VARSYM);
2782
+ // The following are handled here despite not being purely symop tokens because `process_token_symop` is executed
2783
+ // last, and these handlers all have potentially quite far lookahead and can fail.
2784
+ case LSymopSpecial:
2785
+ SEQ(left_section_op(env, symop_lookahead(env)));
2786
+ if (valid(env, MINUS) && match_symop(env, "-")) return finish(MINUS, "symop");
2787
+ break;
2788
+ case LUnboxedClose:
2789
+ case LHash:
2790
+ return left_section_op(env, symop_lookahead(env));
2791
+ case LTick:
2792
+ return left_section_ticked(env);
2793
+ case LUpper:
2794
+ if (valid(env, QUALIFIED_OP) || valid(env, LEFT_SECTION_OP)) SEQ(qualified_op(env));
2795
+ break;
2796
+ default:
2797
+ break;
2798
+ }
2799
+ return FAIL;
2800
+ }
2801
+
2802
+ static Symbol process_token_splice(Env *env, Lexed next) {
2803
+ switch (next) {
2804
+ case LDollar:
2805
+ return finish_if_valid(env, SPLICE, "symop");
2806
+ default:
2807
+ break;
2808
+ }
2809
+ return FAIL;
2810
+ }
2811
+
2812
+ /**
2813
+ * Process a `Lexed` token for an interior position.
2814
+ */
2815
+ static Symbol process_token_interior(Env *env, Lexed next) {
2816
+ switch (next) {
2817
+ case LBraceClose:
2818
+ SEQ(end_layout_brace(env));
2819
+ return token_end_layout_texp(env);
2820
+ // Skip layout start
2821
+ case LModule:
2822
+ return FAIL;
2823
+ case LSemi:
2824
+ return explicit_semicolon(env);
2825
+ case LBracketOpen:
2826
+ return finish(QQ_START, "qq_start");
2827
+ default:
2828
+ break;
2829
+ }
2830
+ SEQ(process_token_safe(env, next));
2831
+ return start_layout_interior(env, next);
2832
+ }
2833
+
2834
+ /**
2835
+ * Process a `Lexed` token to initialize the context stack.
2836
+ */
2837
+ static Symbol process_token_init(Env *env, uint32_t indent, Lexed next) {
2838
+ switch (next) {
2839
+ case LModule:
2840
+ push_context(env, ModuleHeader, 0);
2841
+ return update_state("init");
2842
+ case LBraceOpen:
2843
+ advance_over(env, 0);
2844
+ MARK("init brace");
2845
+ push_context(env, Braces, indent);
2846
+ return finish(START_EXPLICIT, "init");
2847
+ default:
2848
+ push_context(env, DeclLayout, indent);
2849
+ return finish(START, "init");
2850
+ }
2851
+ }
2852
+
2853
+ // --------------------------------------------------------------------------------------------------------
2854
+ // Newline actions
2855
+ // --------------------------------------------------------------------------------------------------------
2856
+
2857
+ /**
2858
+ * `NoSpace` + `newline_init()` means that we're at the very beginning of the file, where we start in `NResume` mode
2859
+ * without a newline character that can tell us where we are.
2860
+ */
2861
+ static Symbol newline_extras(Env *env, Space space) {
2862
+ bool bol = space == BOL || (space == NoSpace && newline_init(env));
2863
+ Lexed next = lex_extras(env, bol);
2864
+ dbg("newline extras token: %s\n", token_names[next]);
2865
+ return process_token_safe(env, next);
2866
+ }
2867
+
2868
+ // Don't finish newline processing before pragmas – they are indicators of layout indent, but since they are extras,
2869
+ // they cannot consume a semicolon, so when there's a pragma on a line of its own, we would get two semicolons if we
2870
+ // finished here.
2871
+ // It's guaranteed that the newline state was committed at least once because `newline_lookahead` sets `unsafe` when
2872
+ // finding a pragma.
2873
+ static Symbol newline_process(Env *env) {
2874
+ dbg("newline post\n");
2875
+ uint32_t indent = env->state->newline.indent;
2876
+ Lexed end = env->state->newline.end;
2877
+ SEQ(end_layout_indent(env));
2878
+ SEQ(process_token_safe(env, end));
2879
+ Space space = skip_whitespace(env);
2880
+ MARK("newline_post");
2881
+ if (env->state->newline.unsafe) SEQ(newline_extras(env, space));
2882
+ if (!env->state->newline.eof) SEQ(start_layout_newline(env));
2883
+ // TODO it is only necessary to run this late because of very few situations, like nondecreasing indent.
2884
+ // But it has the consequence that whitespace is included in the parent in nested layouts.
2885
+ // Maybe there's a way to run it before and after `start_layout_newline` with conditions.
2886
+ SEQ(semicolon(env));
2887
+ reset_newline(env);
2888
+ if (uninitialized(env)) SEQ(process_token_init(env, indent, end));
2889
+ else {
2890
+ SEQ(process_token_symop(env, true, end));
2891
+ SEQ(process_token_splice(env, end));
2892
+ }
2893
+ return update_state("newline final");
2894
+ }
2895
+
2896
+ static Symbol newline_post(Env *env) {
2897
+ Symbol res = newline_process(env);
2898
+ if (newline_init(env)) env->state->newline.state = NProcess;
2899
+ return res;
2900
+ }
2901
+
2902
+ /**
2903
+ * Repeatedly lex lookahead until encountering something that is neither a comment nor CPP, skipping whitespace and
2904
+ * newlines in between.
2905
+ */
2906
+ static void newline_lookahead(Env *env, Newline *newline) {
2907
+ for (;;) {
2908
+ // Using `peek0` to look for whitespace requires the lookahead buffer to have been reset immediately before this
2909
+ // statement – so before the call to this function or at the end of the for loop body.
2910
+ // The reason this isn't using `lexer->lookahead` is that the function may be called at an interior position, to
2911
+ // skip extras.
2912
+ switch (peek0(env)) {
2913
+ NEWLINE_CASES:
2914
+ skip_over(env, 0);
2915
+ newline->indent = 0;
2916
+ break;
2917
+ case '\t':
2918
+ skip_over(env, 0);
2919
+ newline->indent += 8;
2920
+ break;
2921
+ default:
2922
+ if (is_space_char(peek0(env))) {
2923
+ skip_over(env, 0);
2924
+ newline->indent++;
2925
+ break;
2926
+ }
2927
+ newline->end = lex(env, newline->indent == 0);
2928
+ dbg("newline token: %s, %lc\n", token_names[newline->end], peek0(env));
2929
+ // Newlines without extras are only safe if `lex` didn't advance the lexer over non-whitespace.
2930
+ newline->unsafe |= !no_lookahead(env);
2931
+ switch (newline->end) {
2932
+ case LEof:
2933
+ newline->indent = 0;
2934
+ newline->eof = true;
2935
+ return;
2936
+ // If/then blocks can have semicolons, but don't have a layout.
2937
+ // Allowing layout semicolons costs 100kB.
2938
+ case LThen:
2939
+ case LElse:
2940
+ case LSemi:
2941
+ newline->no_semi = true;
2942
+ return;
2943
+ case LBlockComment:
2944
+ newline->indent = consume_block_comment(env, newline->indent + 2);
2945
+ break;
2946
+ case LLineComment:
2947
+ newline->indent = 0;
2948
+ take_line(env);
2949
+ break;
2950
+ case LCppElse:
2951
+ cpp_else(env, false);
2952
+ take_line_escaped_newline(env);
2953
+ break;
2954
+ case LCpp:
2955
+ take_line_escaped_newline(env);
2956
+ break;
2957
+ default:
2958
+ return;
2959
+ }
2960
+ }
2961
+ reset_lookahead(env);
2962
+ }
2963
+ }
2964
+
2965
+ /**
2966
+ * Perform newline lookahead, then either finish the run if the position was advanced into the next token, or directly
2967
+ * start newline processing if not.
2968
+ */
2969
+ static Symbol newline_start(Env *env) {
2970
+ dbg("newline lookahead\n");
2971
+ env->state->newline.state = NInit;
2972
+ newline_lookahead(env, &env->state->newline);
2973
+ if (env->state->newline.unsafe) return update_state("newline lookahead");
2974
+ else return newline_post(env);
2975
+ }
2976
+
2977
+ /**
2978
+ * Perform newline lookahead with preset indent, used at the beginning of a file and after pragmas.
2979
+ */
2980
+ static Symbol newline_resume(Env *env) {
2981
+ dbg("newline resume\n");
2982
+ uint32_t indent = env->state->newline.indent;
2983
+ // Skip space between the pragma end and the next token, which might be the first real token (or another pragma or
2984
+ // comment, or newline).
2985
+ // We don't want to count the space as indent.
2986
+ skip_space(env);
2987
+ reset_newline(env);
2988
+ env->state->newline.indent = indent;
2989
+ return newline_start(env);
2990
+ }
2991
+
2992
+ // --------------------------------------------------------------------------------------------------------
2993
+ // Constraints
2994
+ // --------------------------------------------------------------------------------------------------------
2995
+
2996
+ /**
2997
+ * The following mechanism avoids the conflict between types and classes.
2998
+ * Consider this situation:
2999
+ *
3000
+ * > data A = B b % C => D d :+ E
3001
+ * > data E = F f => G g
3002
+ *
3003
+ * After the `=`, a diverse set of constructs are valid.
3004
+ *
3005
+ * - Data constructor
3006
+ * - Infix `D d :+ E` -> `(type/name) (type/variable) (constructor_operator) (type/name)`
3007
+ * - Prefix `G g` -> `(name) (type/variable)`
3008
+ * - Context
3009
+ * - Infix `B b % C` -> `(type/name) (type/variable) (operator) (type/name)`
3010
+ * - Prefix `F f` -> `(constraint/name) (type/variable)`
3011
+ *
3012
+ * Each of these starts with a `(name)` with different reduction rules that can only be resolved when the arrow or a
3013
+ * data constructor-ending token is encountered.
3014
+ * The conflict between `D` and `G` is an additional hurdle that is not addressed here.
3015
+ *
3016
+ * Constraint lookahead scans ahead until it finds `=>` or a clear rejection criterion like `=` or (layout) semicolon,
3017
+ * emitting `_cond_context` to unlock the rules `_qtype_context`, `context` and `_ctr_context`.
3018
+ *
3019
+ * However, even the two context variants conflict, since infix classes have types in their operands, while a prefix
3020
+ * constraint starts with a class name.
3021
+ * To mitigate this, constraint lookahead additionally emits `_cond_infix` when it encounters an infix operator.
3022
+ * This symbol is only emitted when `_cond_context` is not valid (because it was parsed right before) or because no `=>`
3023
+ * is encountered afterwards (because the current position is in parentheses).
3024
+ * This only works because infix classes are localized within contexts – disambiguating all infix types like this is
3025
+ * impossible without completely restructuring the grammar.
3026
+ *
3027
+ * Note that this problem could easily be avoided by parsing all contexts as types, accepting that queries for class
3028
+ * names would be more verbose and couldn't match more complex constraints.
3029
+ * Furthermore, a much simpler fix would be a runtime conflict, which has the potential to result in randomly incorrect
3030
+ * parse trees.
3031
+ *
3032
+ * Similarly to contexts, data constructor heads have infix type-related conflicts that aren't as severe but can easily
3033
+ * piggyback on this mechanism, so they are included.
3034
+ *
3035
+ * Lastly, associated type families and instances conflict because they can both be heralded by `type` alone, so the
3036
+ * decision to reduce to type head or instance head nodes is informed by the presence of `::` or `=` without `|`.
3037
+ */
3038
+
3039
/**
 * Result of a single step of constraint lookahead.
 *
 * The order of the enumerators must match `ctr_result_names`.
 */
typedef enum {
  CtrUndecided = 0, // Nothing conclusive yet, continue searching.
  CtrImpossible,    // Clear evidence found that no context or infix class is ahead.
  CtrArrowFound,    // The context arrow `=>` was found.
  CtrInfixFound,    // An infix operator was found.
  CtrEqualsFound,   // An `=` was found.
  CtrBarFound,      // A `|` was found.
} CtrResult;
3056
+
3057
#ifdef TREE_SITTER_DEBUG

/**
 * Debug names for `CtrResult`, indexed by the enumerator value.
 */
static const char *ctr_result_names[] = {
  "undecided",  // CtrUndecided
  "impossible", // CtrImpossible
  "arrow",      // CtrArrowFound
  "infix",      // CtrInfixFound
  "equals",     // CtrEqualsFound
  "bar",        // CtrBarFound
};

#endif
3069
+
3070
/**
 * State that constraint lookahead carries across its iterations.
 */
typedef struct {
  /**
   * The amount of characters to skip after an iteration.
   * For example, after lexing a `conid` the next token can be lexed at the end of the identifier.
   */
  uint32_t reset;
  /**
   * The number of nested brackets.
   * When this is nonzero, end tokens are not treated as pertaining to the current expression.
   */
  uint32_t brackets;
  /** A context arrow was found. */
  bool context;
  /** An infix operator was found. */
  bool infix;
  /** The infix operator that was found starts with `:` or a backtick. */
  bool data_infix;
  /** An `=` was found; cleared again when a `|` is found. */
  bool type_instance;
} CtrState;
3087
+
3088
+ /**
3089
+ * Increment the bracket count.
3090
+ */
3091
+ static CtrResult ctr_bracket_open(CtrState *state) {
3092
+ state->brackets++;
3093
+ state->reset = 1;
3094
+ return CtrUndecided;
3095
+ }
3096
+
3097
+ /**
3098
+ * Decrement the bracket count.
3099
+ * If the count was zero already, parsing started inside of brackets that are closed here, so lookahead is terminated.
3100
+ */
3101
+ static CtrResult ctr_bracket_close(CtrState *state) {
3102
+ if (state->brackets == 0) return CtrImpossible;
3103
+ state->brackets--;
3104
+ state->reset = 1;
3105
+ return CtrUndecided;
3106
+ }
3107
+
3108
+ /**
3109
+ * If the given token is ahead, terminate lookahead unsuccessfully.
3110
+ */
3111
+ static CtrResult ctr_stop_on_token(Env *env, const char * restrict target) {
3112
+ return token(env, target) ? CtrImpossible : CtrUndecided;
3113
+ }
3114
+
3115
+ /**
3116
+ * Check if the lexed token is `=>` or an infix operator.
3117
+ *
3118
+ * This is performed only when the current position is not in a bracketed expression, i.e. at top level relative to the
3119
+ * initial lexer position.
3120
+ * Otherwise the token belongs to a later, nested expression.
3121
+ *
3122
+ * Certain tokens are proof that no context can start at the current position, like `::` or `forall`, so lookahead is
3123
+ * terminated.
3124
+ * It is still possible that an infix class can be parsed, for example in this type when starting at the at `C` and
3125
+ * terminating at `::`:
3126
+ * > `a :: (C + D :: Constraint) => E`
3127
+ */
3128
+ static CtrResult ctr_top(Env *env, Lexed next) {
3129
+ switch (next) {
3130
+ case LCArrow:
3131
+ return CtrArrowFound;
3132
+ case LSymop:
3133
+ case LSymopSpecial:
3134
+ case LTilde:
3135
+ case LTick:
3136
+ return CtrInfixFound;
3137
+ case LBar:
3138
+ return CtrBarFound;
3139
+ case LArrow:
3140
+ case LWhere:
3141
+ case LDotDot:
3142
+ case LSemi:
3143
+ break;
3144
+ case LTexpCloser:
3145
+ switch (peek0(env)) {
3146
+ case '=':
3147
+ return CtrEqualsFound;
3148
+ default:
3149
+ break;
3150
+ }
3151
+ break;
3152
+ default:
3153
+ switch (peek0(env)) {
3154
+ // Symop is processed in `ctr_lookahead_step`, so `=` and `::` can not be a prefix
3155
+ case '=':
3156
+ return CtrEqualsFound;
3157
+ case 0x2200: // ∀
3158
+ break;
3159
+ case ':':
3160
+ if (char1(env, ':')) break;
3161
+ return CtrUndecided;
3162
+ case 'f':
3163
+ SEQ(ctr_stop_on_token(env, "forall"));
3164
+ return ctr_stop_on_token(env, "family");
3165
+ case 'i':
3166
+ return ctr_stop_on_token(env, "instance");
3167
+ default:
3168
+ return CtrUndecided;
3169
+ }
3170
+ }
3171
+ return CtrImpossible;
3172
+ }
3173
+
3174
+ /**
3175
+ * Process a lexed token for constraint lookahead:
3176
+ * - Update bracket nesting count
3177
+ * - Advance over pragmas, strings, chars and conids
3178
+ * - Set the reset index for certain tokens
3179
+ *
3180
+ * If the token wasn't identified to be irrelevant for the lookahead result, and the current bracket nesting level is
3181
+ * zero, call `ctr_top`.
3182
+ */
3183
+ static CtrResult ctr_lookahead_step(Env *env, CtrState *state, Lexed next) {
3184
+ state->reset = 1;
3185
+ switch (next) {
3186
+ case LBraceClose:
3187
+ return ctr_bracket_close(state);
3188
+ case LUnboxedClose:
3189
+ SEQ(ctr_bracket_close(state));
3190
+ state->reset = 2;
3191
+ return CtrUndecided;
3192
+ case LBraceOpen:
3193
+ return ctr_bracket_open(state);
3194
+ case LSymopSpecial:
3195
+ case LSymop:
3196
+ state->reset = symop_lookahead(env);
3197
+ break;
3198
+ case LUpper:
3199
+ state->reset = conid(env);
3200
+ return CtrUndecided;
3201
+ case LDotId:
3202
+ return CtrUndecided;
3203
+ case LPragma:
3204
+ if (consume_pragma(env)) state->reset = 3;
3205
+ return CtrUndecided;
3206
+ case LTexpCloser:
3207
+ case LNothing:
3208
+ switch (peek0(env)) {
3209
+ case ')':
3210
+ case ']':
3211
+ return ctr_bracket_close(state);
3212
+ case '(':
3213
+ case '[':
3214
+ return ctr_bracket_open(state);
3215
+ case '"':
3216
+ state->reset = take_string_literal(env);
3217
+ return CtrUndecided;
3218
+ case '\'':
3219
+ state->reset = take_char_literal(env);
3220
+ return CtrUndecided;
3221
+ default:
3222
+ if (varid_start_char(peek0(env))) state->reset = advance_while(env, 1, is_id_char);
3223
+ break;
3224
+ }
3225
+ default:
3226
+ break;
3227
+ }
3228
+ if (state->brackets != 0) return CtrUndecided;
3229
+ return ctr_top(env, next);
3230
+ }
3231
+
3232
+ /**
3233
+ * Main loop for context lookahead.
3234
+ *
3235
+ * Perform newline lookahead and terminate if the end of the current layout element is encountered.
3236
+ * Otherwise use the new end token to detect a context arrow or infix operator.
3237
+ * If no termination criterion is fulfilled, reset lookahead and repeat.
3238
+ *
3239
+ * Newline lookahead skips over extras.
3240
+ *
3241
+ * A context arrow is always a termination criterion; an infix operator only if CONTEXT isn't valid.
3242
+ */
3243
+ static Symbol constraint_lookahead(Env *env) {
3244
+ dbg("type lookahead\n");
3245
+ CtrState state = {.reset = 0};
3246
+ bool done = false;
3247
+ while (!done && not_eof(env)) {
3248
+ // Setting indent to 99999 only to not trigger the following termination condition when no newline was advanced over
3249
+ Newline newline = {.state = 0, .indent = 99999};
3250
+ newline_lookahead(env, &newline);
3251
+ if (newline.indent <= current_indent(env) && current_context(env) != Braces) break;
3252
+ CtrResult result = ctr_lookahead_step(env, &state, newline.end);
3253
+ dbg("type: %lc, %s\n", peek0(env), ctr_result_names[result]);
3254
+ switch (result) {
3255
+ case CtrArrowFound:
3256
+ state.context = true;
3257
+ done = true;
3258
+ break;
3259
+ case CtrInfixFound:
3260
+ if (char0(env, ':') || char0(env, '`')) state.data_infix = true;
3261
+ state.infix = true;
3262
+ // Context has precedence, e.g. `instance a + a => A` finds `+` first and would treat that as the class name of
3263
+ // the head, then failing on the right operand.
3264
+ done = !valid(env, CONTEXT);
3265
+ break;
3266
+ case CtrEqualsFound:
3267
+ done = !valid(env, TYPE_INSTANCE);
3268
+ state.type_instance = true;
3269
+ break;
3270
+ case CtrBarFound:
3271
+ done = true;
3272
+ state.type_instance = false;
3273
+ break;
3274
+ case CtrImpossible:
3275
+ done = true;
3276
+ case CtrUndecided:
3277
+ break;
3278
+ }
3279
+ reset_lookahead_to(env, state.reset);
3280
+ state.reset = 0;
3281
+ }
3282
+ if (state.context) SEQ(finish_if_valid(env, CONTEXT, "ctr"));
3283
+ if (state.infix) SEQ(finish_if_valid(env, INFIX, "ctr"));
3284
+ if (state.data_infix) SEQ(finish_if_valid(env, DATA_INFIX, "ctr"));
3285
+ if (state.type_instance) SEQ(finish_if_valid(env, TYPE_INSTANCE, "ctr"));
3286
+ return FAIL;
3287
+ }
3288
+
3289
+ // --------------------------------------------------------------------------------------------------------
3290
+ // Actions that are executed for interior positions
3291
+ // --------------------------------------------------------------------------------------------------------
3292
+
3293
+ static Symbol process_token_constraint(Env *env) {
3294
+ if (
3295
+ valid(env, CONTEXT)
3296
+ ||
3297
+ valid(env, INFIX)
3298
+ ||
3299
+ valid(env, DATA_INFIX)
3300
+ ||
3301
+ valid(env, TYPE_INSTANCE)
3302
+ )
3303
+ return constraint_lookahead(env);
3304
+ return FAIL;
3305
+ }
3306
+
3307
+ static Symbol interior(Env *env, bool whitespace) {
3308
+ Lexed next = lex(env, false);
3309
+ dbg("interior, column %d, ws %d, token %s\n", column(env), whitespace, token_names[next]);
3310
+ SEQ(resolve_semicolon(env, next));
3311
+ SEQ(process_token_interior(env, next));
3312
+ SEQ(process_token_symop(env, whitespace, next));
3313
+ SEQ(process_token_constraint(env));
3314
+ SEQ(process_token_splice(env, next));
3315
+ return FAIL;
3316
+ }
3317
+
3318
+ // --------------------------------------------------------------------------------------------------------
3319
+ // Initial actions
3320
+ // --------------------------------------------------------------------------------------------------------
3321
+
3322
+ /**
3323
+ * These are conditioned only on symbols and don't advance, except for `qq_body`, which cannot fail.
3324
+ */
3325
+ static Symbol pre_ws_commands(Env *env) {
3326
+ SEQ(texp_context(env));
3327
+ SEQ(start_brace(env));
3328
+ SEQ(end_brace(env));
3329
+ // Leading whitespace must be included in the node.
3330
+ if (valid(env, QQ_BODY)) return qq_body(env);
3331
+ if (newline_active(env)) SEQ(newline_post(env));
3332
+ else if (env->state->newline.state == NResume) SEQ(newline_resume(env));
3333
+ return FAIL;
3334
+ }
3335
+
3336
+ static Symbol scan_main(Env *env) {
3337
+ MARK("main");
3338
+ SEQ(pre_ws_commands(env));
3339
+ bool whitespace = skip_space(env);
3340
+ if (is_newline(PEEK)) return newline_start(env);
3341
+ else if (not_eof(env)) return interior(env, whitespace);
3342
+ return FAIL;
3343
+ }
3344
+
3345
#ifdef TREE_SITTER_DEBUG

/**
 * Debug wrapper around `scan_main`.
 *
 * If `debug_init` reports true, the run ends with a state update and `scan_main` is not called.
 * Otherwise the result of `scan_main` is passed to `debug_finish` and returned.
 */
static Symbol scan_debug(Env *env) {
  if (debug_init(env)) return update_state("debug init parse buffer");
  Symbol outcome = scan_main(env);
  debug_finish(env, outcome);
  return outcome;
}

#endif
3355
+
3356
+ static bool process_result(Env *env, Symbol result) {
3357
+ if (result == FAIL && is_eof(env) && no_lookahead(env)) {
3358
+ MARK("eof whitespace");
3359
+ // Inlined `end_layout` because of perf glitch
3360
+ if (valid(env, END)) result = end_layout_unchecked(env, "eof");
3361
+ else if (valid(env, SEMICOLON)) result = finish(SEMICOLON, "eof");
3362
+ else {
3363
+ result = force_end_context(env);
3364
+ if (result == FAIL) {
3365
+ dbg("eof | context cap: %d | lookahead cap: %d | parse cap: %d\n",
3366
+ env->state->contexts.capacity, env->state->lookahead.capacity, env->state->parse.capacity);}
3367
+ }
3368
+ }
3369
+ return set_result_symbol(env, result);
3370
+ }
3371
+
3372
+
3373
+ static bool scan(Env *env) {
3374
+ if(after_error(env)) { dbg("error recovery\n"); return false; }
3375
+ #ifdef TREE_SITTER_DEBUG
3376
+ Symbol result = scan_debug(env);
3377
+ #else
3378
+ Symbol result = scan_main(env);
3379
+ #endif
3380
+ return process_result(env, result);
3381
+ }
3382
+
3383
+ // --------------------------------------------------------------------------------------------------------
3384
+ // API
3385
+ // --------------------------------------------------------------------------------------------------------
3386
+
3387
+ typedef struct {
3388
+ unsigned contexts;
3389
+ Newline newline;
3390
+ #ifdef TREE_SITTER_DEBUG
3391
+ unsigned parse;
3392
+ #endif
3393
+ } Persist;
3394
+
3395
+ /**
3396
+ * This function allocates the persistent state of the parser that is passed into the other API functions.
3397
+ */
3398
+ void *tree_sitter_haskell_external_scanner_create() {
3399
+ State *state = ts_calloc(1, sizeof(State));
3400
+ array_reserve(&state->contexts, 8);
3401
+ array_reserve(&state->lookahead, 8);
3402
+ #ifdef TREE_SITTER_DEBUG
3403
+ array_reserve(&state->parse, 20);
3404
+ #endif
3405
+ return state;
3406
+ }
3407
+
3408
+ /**
3409
+ * Main logic entry point.
3410
+ * Since the state is a singular vector, it can just be cast and used directly.
3411
+ */
3412
+ bool tree_sitter_haskell_external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols) {
3413
+ Env env = env_new(lexer, valid_symbols, (State*) payload);
3414
+ return scan(&env);
3415
+ }
3416
+
3417
+ unsigned tree_sitter_haskell_external_scanner_serialize(void *payload, char *buffer) {
3418
+ State *state = (State *) payload;
3419
+ Persist persist = {.contexts = state->contexts.size, .newline = state->newline};
3420
+ #ifdef TREE_SITTER_DEBUG
3421
+ persist.parse = state->parse.size;
3422
+ #endif
3423
+ unsigned contexts_size = persist.contexts * sizeof(Context);
3424
+ memcpy(buffer, &persist, sizeof(Persist));
3425
+ unsigned to_copy = sizeof(Persist) + contexts_size;
3426
+ if (to_copy > TREE_SITTER_SERIALIZATION_BUFFER_SIZE) return 0;
3427
+ memcpy(buffer + sizeof(Persist), state->contexts.contents, contexts_size);
3428
+ #ifdef TREE_SITTER_DEBUG
3429
+ to_copy = serialize_parse_lines(buffer + sizeof(Persist) + contexts_size, &state->parse, to_copy);
3430
+ #endif
3431
+ return to_copy;
3432
+ }
3433
+
3434
+ void tree_sitter_haskell_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) {
3435
+ State *state = (State *) payload;
3436
+ Persist p;
3437
+ Persist *persist;
3438
+ if (length > 0)
3439
+ persist = (Persist *) buffer;
3440
+ else {
3441
+ p = (Persist) {.contexts = 0};
3442
+ persist = &p;
3443
+ persist->newline.state = NResume;
3444
+ }
3445
+ unsigned contexts_size = persist->contexts * sizeof(Context);
3446
+ state->newline = persist->newline;
3447
+ array_reserve(&state->contexts, persist->contexts);
3448
+ state->contexts.size = persist->contexts;
3449
+ if (length > 0)
3450
+ memcpy(state->contexts.contents, buffer + sizeof(Persist), contexts_size);
3451
+ state->lookahead.size = 0;
3452
+ state->lookahead.offset = 0;
3453
+ array_reserve(&state->lookahead, 8);
3454
+ #ifdef TREE_SITTER_DEBUG
3455
+ if (length > 0)
3456
+ deserialize_parse_lines(buffer + sizeof(Persist) + contexts_size, &state->parse, persist->parse);
3457
+ #endif
3458
+ }
3459
+
3460
+ void tree_sitter_haskell_external_scanner_destroy(void *payload) {
3461
+ State *state = (State*) payload;
3462
+ #ifdef TREE_SITTER_DEBUG
3463
+ palette();
3464
+ ParseLines *parse = &state->parse;
3465
+ for (unsigned i = 0; i < parse->size; i++) array_delete(array_get(parse, i));
3466
+ array_delete(parse);
3467
+ #endif
3468
+ array_delete(&state->contexts);
3469
+ array_delete(&state->lookahead);
3470
+ ts_free(state);
3471
+ }