@ast-grep/lang-haskell 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +24 -0
- package/index.d.ts +10 -0
- package/index.js +9 -0
- package/package.json +46 -0
- package/postinstall.js +4 -0
- package/prebuilds/prebuild-Linux-X64/parser.so +0 -0
- package/prebuilds/prebuild-Windows-X64/parser.so +0 -0
- package/prebuilds/prebuild-macOS-ARM64/parser.so +0 -0
- package/src/grammar.json +13267 -0
- package/src/node-types.json +6412 -0
- package/src/parser.c +677128 -0
- package/src/scanner.c +3471 -0
- package/src/tree_sitter/alloc.h +54 -0
- package/src/tree_sitter/array.h +290 -0
- package/src/tree_sitter/parser.h +266 -0
- package/src/unicode.h +2504 -0
- package/type.d.ts +5899 -0
package/src/scanner.c
ADDED
|
@@ -0,0 +1,3471 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* The scanner is an extension to the built-in lexer that handles cases that are hard or impossible to express with the
|
|
3
|
+
* high-level grammar rules.
|
|
4
|
+
* Since Haskell is indentation sensitive and uses parse errors to end layouts, this component has many
|
|
5
|
+
* responsibilities.
|
|
6
|
+
*
|
|
7
|
+
* tree-sitter runs the scanner at every position repeatedly until it fails, after which the built-in lexer consumes one
|
|
8
|
+
* token.
|
|
9
|
+
* When the scanner succeeds, it returns the index of a symbol in the `externals` array in `grammar/externals.js`, which
|
|
10
|
+
* is then processed like other grammar symbols, except that it terminates any conflict branches in which the symbol
|
|
11
|
+
* isn't valid.
|
|
12
|
+
* The scanner's state is persisted and passed into the next run, but it is discarded when the scanner fails, i.e. when
|
|
13
|
+
* it yields control back to the built-in lexer.
|
|
14
|
+
*
|
|
15
|
+
* The high-level workflow of the scanner consists of three distinct modes.
|
|
16
|
+
* When the first character after whitespace is a newline, the scanner starts newline lookahead, otherwise it processes
|
|
17
|
+
* an interior position.
|
|
18
|
+
* If the state indicates that the previous run performed newline lookahead, it enters newline processing mode.
|
|
19
|
+
*
|
|
20
|
+
* In interior mode, a single lexing pass is performed.
|
|
21
|
+
*
|
|
22
|
+
* Such a pass consists of two steps:
|
|
23
|
+
*
|
|
24
|
+
* In the first step, the scanner identifies the immediate token by branching on the first character after whitespace
|
|
25
|
+
* and examining different conditions to select one of the variants of the enum `Lexed`, which enumerates all known,
|
|
26
|
+
* interesting, situations.
|
|
27
|
+
* The position of the lexer may be advanced in the process to look at subsequent characters.
|
|
28
|
+
* To avoid having to arrange different parts of the logic according to how many characters have been consumed,
|
|
29
|
+
* lookahead is written to an array in the transient state on demand, so that each component can specify the index
|
|
30
|
+
* relative to the position at the beginning of the run (modulo whitespace).
|
|
31
|
+
* The entry point for this step is the function `lex`.
|
|
32
|
+
*
|
|
33
|
+
* The second step is different for each mode.
|
|
34
|
+
* In interior mode, the `Lexed` token determines which symbol to return to the grammar based on the current state, like
|
|
35
|
+
* layout contexts and valid symbols.
|
|
36
|
+
* Most symbols do not contain any text, but only act as conditions in the grammar, but for symbolic operators, CPP,
|
|
37
|
+
* comments, pragmas, and quasiquotes, the lexer is advanced to the end of the token and `mark_end` is called to
|
|
38
|
+
* communicate the range to tree-sitter.
|
|
39
|
+
*
|
|
40
|
+
* In newline lookahead mode, the scanner performs repeated lexing passes until it encounters a `Lexed` token that is
|
|
41
|
+
* not CPP or a comment.
|
|
42
|
+
* In the second step of each pass, the token determines whether to terminate and/or which flags to set in the state to
|
|
43
|
+
* guide processing in the next run.
|
|
44
|
+
* If the lookahead loop has only made a single lexing pass that did not consume any characters of the following token
|
|
45
|
+
* (because the first character did not match any of the conditions for lexing that require more lookahead), the scanner
|
|
46
|
+
* switches to newline processing mode directly; otherwise it terminates the run after storing the newline information
|
|
47
|
+
* in the persistent state.
|
|
48
|
+
* This is possible by succeeding with the symbol `UPDATE`, which is mapped to newline in `externals`.
|
|
49
|
+
* tree-sitter does not create a node in the parse tree for this symbol if `mark_end` wasn't called after consuming
|
|
50
|
+
* lookahead, and immediately calls the scanner again at the same position.
|
|
51
|
+
*
|
|
52
|
+
* In either case, the scanner ends up in newline processing mode, in which it performs a series of highly
|
|
53
|
+
* order-sensitive steps based on the data collected in lookahead mode, potentially returning multiple symbols in
|
|
54
|
+
* successive runs until none of the newline-related conditions match.
|
|
55
|
+
* This procedure ensures that nested layouts are terminated at the earliest position instead of extending over all
|
|
56
|
+
* subsequent (top-level) whitespace, comments and CPP up to the next layout element.
|
|
57
|
+
* Only when all layouts are terminated will the scanner process the final `Lexed` token that it stored in the state in
|
|
58
|
+
* lookahead mode, using the same logic as in interior mode, and update the state to disable newline processing for the
|
|
59
|
+
* next run.
|
|
60
|
+
*/
|
|
61
|
+
|
|
62
|
+
#include "tree_sitter/alloc.h"
|
|
63
|
+
#include "tree_sitter/array.h"
|
|
64
|
+
#include "tree_sitter/parser.h"
|
|
65
|
+
|
|
66
|
+
#include "unicode.h"
|
|
67
|
+
#include <assert.h>
|
|
68
|
+
#include <stdbool.h>
|
|
69
|
+
#include <string.h>
|
|
70
|
+
|
|
71
|
+
// The lexer's current lookahead character.
// Expands to a member access on a variable named `env`, which must be in scope at the point of use.
#define PEEK env->lexer->lookahead
|
|
72
|
+
|
|
73
|
+
#ifdef TREE_SITTER_DEBUG

#include <locale.h>
// `dbg` below expands to `fprintf(stderr, ...)`; include the declaring header explicitly rather than relying on
// another header to pull it in.
#include <stdio.h>

// Debug builds route the lexer operations through wrappers that record additional information, and `dbg` prints to
// stderr.
// All of these macros expect a variable named `env` to be in scope.
#define S_ADVANCE advance_debug(env)
#define S_SKIP skip_debug(env)
#define MARK(s) mark_debug(env, s)
#define dbg(...) do { fprintf(stderr, __VA_ARGS__); } while (0)

#else

// Move the parser position one character to the right.
#define S_ADVANCE advance(env)

// Move the parser position one character to the right, treating the consumed character as whitespace.
#define S_SKIP env->lexer->advance(env->lexer, true)

/**
 * Instruct the lexer that the current position is the end of the potentially detected symbol, causing the next run to
 * be started after this character in the success case.
 *
 * This is useful if the validity of the detected symbol depends on what follows.
 *
 * The argument is only used in debug builds; here it is discarded.
 */
#define MARK(s) env->lexer->mark_end(env->lexer)

// Logging is compiled out entirely; the arguments are not evaluated.
#define dbg(...) do {} while (0)

#endif
|
|
101
|
+
|
|
102
|
+
// Short circuit a parse step: If the argument expression evaluates to 0, continue; otherwise return its result from
// the enclosing function.
// This is used with enums, so casting to unsigned should not cause problems.
// The argument is parenthesized so that the cast applies to the whole expression instead of only its first operand
// (e.g. `SEQ(a < b)` must not become `(unsigned) a < b`).
#define SEQ(expr) do { unsigned res = (unsigned) (expr); if (res) return res; } while (0)
|
|
105
|
+
|
|
106
|
+
// --------------------------------------------------------------------------------------------------------
|
|
107
|
+
// Symbols
|
|
108
|
+
// --------------------------------------------------------------------------------------------------------
|
|
109
|
+
|
|
110
|
+
/**
 * Mirror of the symbols in `externals` in `grammar/externals.js`.
 *
 * tree-sitter hands the scanner an array of booleans in which the entry at a symbol's index is `true` when that symbol
 * is valid at the current parser position, so the order of the enumerators below is significant.
 * The trailing comments give each enumerator's index.
 */
typedef enum {
  FAIL = 0,        //  0
  SEMICOLON,       //  1
  START,           //  2
  START_DO,        //  3
  START_CASE,      //  4
  START_IF,        //  5
  START_LET,       //  6
  START_QUOTE,     //  7
  START_EXPLICIT,  //  8
  END,             //  9
  END_EXPLICIT,    // 10
  START_BRACE,     // 11
  END_BRACE,       // 12
  START_TEXP,      // 13
  END_TEXP,        // 14
  WHERE,           // 15
  IN,              // 16
  ARROW,           // 17
  BAR,             // 18
  DERIVING,        // 19
  COMMENT,         // 20
  HADDOCK,         // 21
  CPP,             // 22
  PRAGMA,          // 23
  QQ_START,        // 24
  QQ_BODY,         // 25
  SPLICE,          // 26
  QUAL_DOT,        // 27
  TIGHT_DOT,       // 28
  PREFIX_DOT,      // 29
  DOTDOT,          // 30
  TIGHT_AT,        // 31
  PREFIX_AT,       // 32
  TIGHT_BANG,      // 33
  PREFIX_BANG,     // 34
  TIGHT_TILDE,     // 35
  PREFIX_TILDE,    // 36
  PREFIX_PERCENT,  // 37
  QUALIFIED_OP,    // 38
  LEFT_SECTION_OP, // 39
  NO_SECTION_OP,   // 40
  MINUS,           // 41
  CONTEXT,         // 42
  INFIX,           // 43
  DATA_INFIX,      // 44
  TYPE_INSTANCE,   // 45
  VARSYM,          // 46
  CONSYM,          // 47
  UPDATE,          // 48
} Symbol;
|
|
166
|
+
|
|
167
|
+
#ifdef TREE_SITTER_DEBUG
|
|
168
|
+
|
|
169
|
+
/**
 * Printable names for `Symbol`, indexed by the enumerator's value (debug builds only).
 *
 * The entries are positional, so they must stay in exactly the same order as the enum; the comment in front of each
 * string names the enumerator it belongs to.
 */
static const char *sym_names[] = {
  /* FAIL            */ "fail",
  /* SEMICOLON       */ "semicolon",
  /* START           */ "start",
  /* START_DO        */ "start_do",
  /* START_CASE      */ "start_case",
  /* START_IF        */ "start_if",
  /* START_LET       */ "start_let",
  /* START_QUOTE     */ "start_quote",
  /* START_EXPLICIT  */ "start_explicit",
  /* END             */ "end",
  /* END_EXPLICIT    */ "end_explicit",
  /* START_BRACE     */ "start_brace",
  /* END_BRACE       */ "end_brace",
  /* START_TEXP      */ "start_texp",
  /* END_TEXP        */ "end_texp",
  /* WHERE           */ "where",
  /* IN              */ "in",
  /* ARROW           */ "arrow",
  /* BAR             */ "bar",
  /* DERIVING        */ "deriving",
  /* COMMENT         */ "comment",
  /* HADDOCK         */ "haddock",
  /* CPP             */ "cpp",
  /* PRAGMA          */ "pragma",
  /* QQ_START        */ "qq_start",
  /* QQ_BODY         */ "qq_body",
  /* SPLICE          */ "splice",
  // These two previously read "tight_dot" and "proj_dot", which mislabeled QUAL_DOT and TIGHT_DOT in debug output.
  /* QUAL_DOT        */ "qual_dot",
  /* TIGHT_DOT       */ "tight_dot",
  /* PREFIX_DOT      */ "prefix_dot",
  /* DOTDOT          */ "dotdot",
  /* TIGHT_AT        */ "tight_at",
  /* PREFIX_AT       */ "prefix_at",
  /* TIGHT_BANG      */ "tight_bang",
  /* PREFIX_BANG     */ "prefix_bang",
  /* TIGHT_TILDE     */ "tight_tilde",
  /* PREFIX_TILDE    */ "prefix_tilde",
  /* PREFIX_PERCENT  */ "prefix_percent",
  /* QUALIFIED_OP    */ "qualified_op",
  /* LEFT_SECTION_OP */ "left_section_op",
  /* NO_SECTION_OP   */ "no_section_op",
  /* MINUS           */ "minus",
  /* CONTEXT         */ "context",
  /* INFIX           */ "infix",
  /* DATA_INFIX      */ "data_infix",
  /* TYPE_INSTANCE   */ "type_instance",
  /* VARSYM          */ "varsym",
  /* CONSYM          */ "consym",
  /* UPDATE          */ "update",
};
|
|
220
|
+
|
|
221
|
+
#endif
|
|
222
|
+
|
|
223
|
+
// --------------------------------------------------------------------------------------------------------
|
|
224
|
+
// Data
|
|
225
|
+
// --------------------------------------------------------------------------------------------------------
|
|
226
|
+
|
|
227
|
+
#ifdef TREE_SITTER_DEBUG
|
|
228
|
+
|
|
229
|
+
// One buffered line of characters, stored as int32_t elements.
// Only compiled when TREE_SITTER_DEBUG is defined.
typedef Array(int32_t) ParseLine;
|
|
230
|
+
|
|
231
|
+
/**
|
|
232
|
+
* A vector of lines, persisted across runs, for visualizing the current lexer position and scanner lookahead.
|
|
233
|
+
*/
|
|
234
|
+
// Each element is itself a `ParseLine`, i.e. this is an array of arrays.
typedef Array(ParseLine) ParseLines;
|
|
235
|
+
|
|
236
|
+
/**
 * Bookkeeping about `mark_end` calls and lexer progress within a single scanner run.
 * A fresh value is created for every run (see `debug_new`) and thrown away afterwards.
 */
typedef struct {
  // Column at which `mark_end` was last called; `debug_new` starts it at -1.
  int marked;
  // Initialized to 0 by `debug_new` and reset to 0 by `mark_debug`.
  unsigned marked_line;
  // Lexer column at the time the value was created.
  unsigned start_col;
  // Initialized to 0 by `debug_new`.
  unsigned start_line;
  // Initialized to 0 by `debug_new`.
  unsigned end_col;
  // Description passed to the most recent `MARK`; the empty string initially.
  const char *marked_by;
} Debug;
|
|
248
|
+
|
|
249
|
+
/**
 * Create the per-run debug record.
 *
 * `marked` starts at -1 (no `mark_end` call yet), `start_col` is the lexer's current column, and every other field is
 * zero or the empty string.
 *
 * Declared `static` like every other helper in this file, so that the symbol is not exported in debug builds.
 */
static Debug debug_new(TSLexer *l) {
  return (Debug) {
    .marked = -1,
    .marked_line = 0,
    .start_col = l->get_column(l),
    .start_line = 0,
    .end_col = 0,
    .marked_by = "",
  };
}
|
|
259
|
+
|
|
260
|
+
#endif
|
|
261
|
+
|
|
262
|
+
/**
 * The kinds of layout context that need special treatment.
 *
 * The enumerator values index `context_names` in debug builds, so the order matters.
 */
typedef enum {
  DeclLayout = 0,   // 0
  DoLayout,         // 1
  CaseLayout,       // 2
  LetLayout,        // 3
  QuoteLayout,      // 4
  MultiWayIfLayout, // 5
  Braces,           // 6
  TExp,             // 7
  ModuleHeader,     // 8
  NoContext,        // 9
} ContextSort;
|
|
277
|
+
|
|
278
|
+
#ifdef TREE_SITTER_DEBUG
|
|
279
|
+
|
|
280
|
+
/**
 * Printable names for `ContextSort`, indexed by the enumerator's value (debug builds only).
 *
 * The entries are positional and must follow the enum's order.
 * "quote" and "multi_way_if" were previously swapped relative to `QuoteLayout` and `MultiWayIfLayout`, so debug output
 * named those two contexts incorrectly.
 */
static char const *context_names[] = {
  /* DeclLayout       */ "decls",
  /* DoLayout         */ "do",
  /* CaseLayout       */ "case",
  /* LetLayout        */ "let",
  /* QuoteLayout      */ "quote",
  /* MultiWayIfLayout */ "multi_way_if",
  /* Braces           */ "braces",
  /* TExp             */ "texp",
  /* ModuleHeader     */ "module_header",
  /* NoContext        */ "none",
};
|
|
292
|
+
|
|
293
|
+
#endif
|
|
294
|
+
|
|
295
|
+
/**
|
|
296
|
+
* The persistent state maintains a stack of layout contexts.
|
|
297
|
+
* New entries are created when a layout symbol is valid at the current position, and they are removed when the indent
|
|
298
|
+
* of a line satisfies conditions that depend on the current context sort, or when certain tokens (like `else`) occur.
|
|
299
|
+
*/
|
|
300
|
+
typedef struct {
|
|
301
|
+
ContextSort sort;
|
|
302
|
+
uint32_t indent;
|
|
303
|
+
} Context;
|
|
304
|
+
|
|
305
|
+
/**
 * The lookahead tokens that carry special meaning for the scanner.
 *
 * The enumerator values index `token_names` in debug builds, so the order matters.
 * The trailing comments give each enumerator's index.
 */
typedef enum {
  LNothing = 0,  //  0
  LEof,          //  1
  LWhere,        //  2
  LIn,           //  3
  LThen,         //  4
  LElse,         //  5
  LDeriving,     //  6
  LModule,       //  7
  LUpper,        //  8
  LTick,         //  9
  LSymop,        // 10
  LSymopSpecial, // 11
  LDotDot,       // 12
  LDotId,        // 13
  LDotSymop,     // 14
  LDotOpen,      // 15
  LDollar,       // 16
  LBang,         // 17
  LTilde,        // 18
  LAt,           // 19
  LPercent,      // 20
  LHash,         // 21
  LBar,          // 22
  LArrow,        // 23
  LCArrow,       // 24
  LTexpCloser,   // 25
  LQuoteClose,   // 26
  LPragma,       // 27
  LBlockComment, // 28
  LLineComment,  // 29
  LBraceClose,   // 30
  LBraceOpen,    // 31
  LBracketOpen,  // 32
  LUnboxedClose, // 33
  LSemi,         // 34
  LCppElse,      // 35
  LCpp,          // 36
} Lexed;
|
|
347
|
+
|
|
348
|
+
#ifdef TREE_SITTER_DEBUG
|
|
349
|
+
|
|
350
|
+
/**
 * Printable names for `Lexed`, indexed by the enumerator's value (debug builds only).
 *
 * The entries are positional and must follow the enum's order; the comment in front of each string names the
 * enumerator it belongs to.
 */
static const char *token_names[] = {
  /* LNothing      */ "nothing",
  /* LEof          */ "eof",
  /* LWhere        */ "where",
  /* LIn           */ "in",
  /* LThen         */ "then",
  /* LElse         */ "else",
  /* LDeriving     */ "deriving",
  /* LModule       */ "module",
  /* LUpper        */ "upper",
  /* LTick         */ "tick",
  /* LSymop        */ "symop",
  /* LSymopSpecial */ "symop-special",
  /* LDotDot       */ "dot-dot",
  /* LDotId        */ "dot-id",
  /* LDotSymop     */ "dot-symop",
  /* LDotOpen      */ "dot-open",
  /* LDollar       */ "dollar",
  /* LBang         */ "bang",
  /* LTilde        */ "tilde",
  /* LAt           */ "at",
  /* LPercent      */ "percent",
  /* LHash         */ "hash",
  /* LBar          */ "bar",
  /* LArrow        */ "arrow",
  /* LCArrow       */ "ctr",
  /* LTexpCloser   */ "texp-closer",
  /* LQuoteClose   */ "quote-close",
  /* LPragma       */ "pragma",
  /* LBlockComment */ "block-comment",
  /* LLineComment  */ "line-comment",
  /* LBraceClose   */ "brace-close",
  /* LBraceOpen    */ "brace-open",
  /* LBracketOpen  */ "bracket-open",
  /* LUnboxedClose */ "unboxed-close",
  /* LSemi         */ "semi",
  /* LCppElse      */ "cpp-else",
  /* LCpp          */ "cpp",
};
|
|
389
|
+
|
|
390
|
+
#endif
|
|
391
|
+
|
|
392
|
+
/**
 * The current newline mode.
 */
typedef enum {
  // No newline handling in progress; the state is reset to this after processing is complete.
  // This is also the value produced by zeroing the `Newline` struct.
  NInactive = 0,
  // Set during newline lookahead.
  NInit,
  // Set when lookahead has finished.
  NProcess,
  // Special variant that forces newline lookahead mode when a run starts without requiring a newline.
  // This is used for the beginning of the file and after pragmas (see `pragma`).
  NResume,
} NewlineState;
|
|
405
|
+
|
|
406
|
+
/**
|
|
407
|
+
* The two newline modes need to operate across multiple scanner runs and adapt their behavior to the context
|
|
408
|
+
* established by previous runs, encoded by this persistent state.
|
|
409
|
+
*/
|
|
410
|
+
typedef struct {
|
|
411
|
+
NewlineState state;
|
|
412
|
+
// The final token encountered after skipping comments and CPP.
|
|
413
|
+
Lexed end;
|
|
414
|
+
// The indent of `end`, used to decide layout actions before parsing intermediate extras.
|
|
415
|
+
uint32_t indent;
|
|
416
|
+
// When there is no token after extras, we shouldn't start layouts.
|
|
417
|
+
bool eof;
|
|
418
|
+
// Prohibit layout semicolons in future runs.
|
|
419
|
+
bool no_semi;
|
|
420
|
+
// Prohibit layout semicolons in future runs, but can be relaxed by some actions.
|
|
421
|
+
// See `explicit_semicolon`.
|
|
422
|
+
bool skip_semi;
|
|
423
|
+
// Lookahead has advanced into `end`, so the scanner has to be restarted before processing the newline result.
|
|
424
|
+
bool unsafe;
|
|
425
|
+
} Newline;
|
|
426
|
+
|
|
427
|
+
/**
|
|
428
|
+
* The vector for the layout context stack.
|
|
429
|
+
*/
|
|
430
|
+
// Elements are `Context` values, i.e. a sort paired with an indent.
typedef Array(Context) Contexts;
|
|
431
|
+
|
|
432
|
+
/**
|
|
433
|
+
* Whenever the lexer is advanced over non-(leading-)whitespace, the consumed character is appended to this vector.
|
|
434
|
+
* This avoids having to ensure that different components that need to examine multiple lookahead characters have to be
|
|
435
|
+
* run in the correct order.
|
|
436
|
+
* Instead, we refer to lookahead by the character's index using the interface described in the section 'Lookahead'.
|
|
437
|
+
*
|
|
438
|
+
* For example, the functions `peek0`, `char0`, `char1` operate on the first/second character relative to the start of
|
|
439
|
+
* the scanner run, and the implementation advances the lexer position when it is necessary.
|
|
440
|
+
*
|
|
441
|
+
* The field `offset` can be used to reset relative indexing to the current lexer position.
|
|
442
|
+
* This is used, for example, in `newline_lookahead`, to perform repeated lexing passes, since `lex` uses the lookahead
|
|
443
|
+
* interface.
|
|
444
|
+
* After processing a `Lexed` token, `newline_lookahead` continues seeking ahead after comments and CPP, and when it
|
|
445
|
+
* encounters the next token, it calls `reset_lookahead` to set `offset` to the current position, ensuring that `lex`
|
|
446
|
+
* can use `char0` to test the following character.
|
|
447
|
+
*
|
|
448
|
+
* The terminology for advancing is:
|
|
449
|
+
* - "Advance before character C at index N" means "`lexer->lookahead` returns C, but 'Lookahead' does not contain C and
|
|
450
|
+
* has size N"
|
|
451
|
+
* - "Advance over character C at index N" means "`lexer->lookahead` returns the character following C, 'Lookahead'
|
|
452
|
+
* contains C and has size N+1" (or "advance before N+1")
|
|
453
|
+
* - If the size of 'Lookahead' is already larger than N, and therefore C can be read from the vector, the
|
|
454
|
+
* postconditions may not hold (when independent steps access lookahead at different indexes)
|
|
455
|
+
*
|
|
456
|
+
* Example:
|
|
457
|
+
*
|
|
458
|
+
* Assume we are parsing the following line, and the scanner is called right after the `a` in the right-hand side:
|
|
459
|
+
*
|
|
460
|
+
* > calc a b = a Library.Math.** b
|
|
461
|
+
* ^ (lexer position: before the character above the ^, `lexer->lookahead` returns the space)
|
|
462
|
+
* || 0/0 (content of `data` between bars, empty; `len` after bars, `offset` after slash)
|
|
463
|
+
*
|
|
464
|
+
* 'Lookahead' is initialized with `len = 0` and `offset = 0`.
|
|
465
|
+
*
|
|
466
|
+
* The full lookahead string (stored in tree-sitter's internals) at this position is ` Library.Math.** b`, and all
|
|
467
|
+
* _absolute_ indexes point into that string.
|
|
468
|
+
* Since tree-sitter only exposes the "next" character at a time, indexing requires advancing the lexer and copying
|
|
469
|
+
* characters to 'Lookahead' on demand.
|
|
470
|
+
*
|
|
471
|
+
* An initial `skip_space` advances over the space between `a` and `Lib`, which does not update 'Lookahead'.
|
|
472
|
+
*
|
|
473
|
+
* > calc a b = a Library.Math.** b
|
|
474
|
+
* ^
|
|
475
|
+
* || 0/0
|
|
476
|
+
*
|
|
477
|
+
* The uppercase character in `Lib` triggers the detection of qualified operators in `qualified_op`, which repeatedly
|
|
478
|
+
* lexes module segments and dots.
|
|
479
|
+
*
|
|
480
|
+
* The module segment step starts (in `conid`) by checking that the next character is upper case using `peek0` (short
|
|
481
|
+
* for `peek(0)`), which accesses the _first_ lookahead character – but _first_ is always relative to the current
|
|
482
|
+
* `offset`.
|
|
483
|
+
* We call the relative index `rel` and the absolute one `abs = offset + rel`.
|
|
484
|
+
* Before `Lib`, this translates to `abs = rel = 0`.
|
|
485
|
+
*
|
|
486
|
+
* `peek` checks if 'Lookahead' already contains the character for this index (`abs < len`), so it can directly return
|
|
487
|
+
* the value at `data[abs]`, which fails, since the vector is empty.
|
|
488
|
+
* Instead, it will fetch the character directly from the tree-sitter lexer.
|
|
489
|
+
* The lexer provides one character of lookahead outside of 'Lookahead', which is enough for this case.
|
|
490
|
+
* `peek` is a conservative action, so it will not copy the character to 'Lookahead', and leave the lexer position
|
|
491
|
+
* unchanged.
|
|
492
|
+
*
|
|
493
|
+
* `L` is upper case, so `qualified_op` switches to the next phase: Advancing to the end of the module segment, which
|
|
494
|
+
* amounts to advancing before the first character that is not an identifier character:
|
|
495
|
+
*
|
|
496
|
+
* > advance_while(1, is_inner_id_char)
|
|
497
|
+
*
|
|
498
|
+
* This function applies the specified predicate to the character at the specified index.
|
|
499
|
+
* If that returns `true`, it advances over the character and increments the index.
|
|
500
|
+
* These steps are repeated until the predicate is `false`.
|
|
501
|
+
* The index is returned, pointing to the character after the module segment.
|
|
502
|
+
*
|
|
503
|
+
* `peek0` doesn't modify lookahead, so the next character is still `L`.
|
|
504
|
+
* We don't need to validate it again, so the starting index specified to `advance_while` is `1`.
|
|
505
|
+
*
|
|
506
|
+
* Let's look at the steps performed by this function in detail.
|
|
507
|
+
* It starts by accessing the character at the initial index, calling `peek(1)`.
|
|
508
|
+
* As for the `L` check, this calculates `abs = offset + rel = 0 + 1` and determines that it is not smaller than `len`,
|
|
509
|
+
* again.
|
|
510
|
+
* However, this time the requested character is the _second_ lookahead character, so `peek` calls `advance_before(1)`,
|
|
511
|
+
* which calls `advance` as many times as needed to access the character via `lexer->lookahead`, which is
|
|
512
|
+
* `offset + n - len` times, so _once_ in this case.
|
|
513
|
+
* The result is that `L` is copied to 'Lookahead' and `lexer->advance` is invoked one time, resulting in this new
|
|
514
|
+
* state:
|
|
515
|
+
*
|
|
516
|
+
* > calc a b = a Library.Math.** b
|
|
517
|
+
* ^
|
|
518
|
+
* || 1/0
|
|
519
|
+
*
|
|
520
|
+
* Now `lexer->lookahead` returns `i`, which `conid` successfully validates as an "inner ID character", so it increments
|
|
521
|
+
* the index to 2.
|
|
522
|
+
* `peek(2)` performs the exact same steps as `peek(1)`, as do all subsequent steps until `peek(7)` returns `.`, which
|
|
523
|
+
* fails the predicate, terminating the loop without advancing and returning 7 from `conid`, with the final state:
|
|
524
|
+
*
|
|
525
|
+
* > calc a b = a Library.Math.** b
|
|
526
|
+
* ^
|
|
527
|
+
* || 7/0
|
|
528
|
+
*
|
|
529
|
+
* `qualified_op` now examines the returned index:
|
|
530
|
+
* If it is 0, the first character was not upper case and there is no module segment at this position, so lexing fails
|
|
531
|
+
* and the scanner returns control to tree-sitter.
|
|
532
|
+
* Otherwise, it calls `char_at(7, '.')` to require that the character after the module segment is a dot, with the same
|
|
533
|
+
* consequences.
|
|
534
|
+
*
|
|
535
|
+
* Since our test code meets these conditions, `qualified_op` continues with `reset_lookahead_to(8)`.
|
|
536
|
+
* This sets `offset` to 8, causing all future lookahead actions that use relative indexes to operate on characters
|
|
537
|
+
* _after_ this new offset.
|
|
538
|
+
* Here this is the first character after the dot, `M`.
|
|
539
|
+
* Note that modifying the offset does not advance the lexer right away, so the lexer position will remain at 7:
|
|
540
|
+
*
|
|
541
|
+
* > calc a b = a Library.Math.** b
|
|
542
|
+
* ^ (zero-based index 7)
|
|
543
|
+
* || 7/8
|
|
544
|
+
*
|
|
545
|
+
* After a dot, `qualified_op` decides what to do next by determining whether what follows is a symbolic operator by
|
|
546
|
+
* calling `symop_lookahead`, which uses the same predicate-based function as before, `advance_while(0, symop_char)`.
|
|
547
|
+
* When that function calls `peek(0)`, the absolute index `offset + 0 = 8` is requested, which is not available, so the
|
|
548
|
+
* lexer is advanced once:
|
|
549
|
+
*
|
|
550
|
+
* > calc a b = a Library.Math.** b
|
|
551
|
+
* ^
|
|
552
|
+
* || 8/8
|
|
553
|
+
*
|
|
554
|
+
* Note that `len == 8` means there are eight characters in 'Lookahead', up to and including the dot, while the index
|
|
555
|
+
* `offset == 8` refers to the _ninth_ character, `M`.
|
|
556
|
+
*
|
|
557
|
+
* `M` is not a symop character, so `qualified_op` restarts the loop and parses the next module segment.
|
|
558
|
+
* The process is identical to the previous iteration except for the value of `offset`, which causes all steps that
|
|
559
|
+
* examine relative lookahead with `peek0` and `peek_at` to add 8 to each index.
|
|
560
|
+
*
|
|
561
|
+
* Once the second dot is parsed, the symop test will succeed after advancing over both asterisks, which satisfies the
|
|
562
|
+
* termination condition in `qualified_op`, and the scanner run finishes with the final state:
|
|
563
|
+
*
|
|
564
|
+
* > calc a b = a Library.Math.** b
|
|
565
|
+
* ^
|
|
566
|
+
* || 15/13
|
|
567
|
+
*/
|
|
568
|
+
typedef struct {
  // Characters the lexer has been advanced over in this run (`data` in the walkthrough above).
  int32_t *contents;
  // Number of characters currently stored in `contents` (`len` in the walkthrough above).
  uint32_t size;
  // Capacity field of the array; together with `contents` and `size` it lets the struct be used with `array_push`.
  uint32_t capacity;
  // Base that is added to every relative lookahead index: `abs = offset + rel`.
  uint32_t offset;
} Lookahead;
|
|
574
|
+
|
|
575
|
+
/**
|
|
576
|
+
* The state that is persisted across scanner runs.
|
|
577
|
+
*
|
|
578
|
+
* Although 'Lookahead' is always reset when starting a new run, storing it in the state avoids having to allocate and
|
|
579
|
+
* free the array repeatedly.
|
|
580
|
+
* Instead we just reset the `len` attribute to 0 and reuse the previous memory.
|
|
581
|
+
*
|
|
582
|
+
* REVIEW: Can tree-sitter run the scanner concurrently on multiple nodes in the same file in some situations?
|
|
583
|
+
*/
|
|
584
|
+
typedef struct {
|
|
585
|
+
Contexts contexts;
|
|
586
|
+
Newline newline;
|
|
587
|
+
Lookahead lookahead;
|
|
588
|
+
#ifdef TREE_SITTER_DEBUG
|
|
589
|
+
ParseLines parse;
|
|
590
|
+
#endif
|
|
591
|
+
} State;
|
|
592
|
+
|
|
593
|
+
/**
|
|
594
|
+
* Transient state and stuff provided by tree-sitter.
|
|
595
|
+
*/
|
|
596
|
+
typedef struct {
|
|
597
|
+
TSLexer *lexer;
|
|
598
|
+
const bool *symbols;
|
|
599
|
+
uint32_t symop;
|
|
600
|
+
State *state;
|
|
601
|
+
#ifdef TREE_SITTER_DEBUG
|
|
602
|
+
Debug debug;
|
|
603
|
+
#endif
|
|
604
|
+
} Env;
|
|
605
|
+
|
|
606
|
+
/**
 * Assemble the transient environment for one scanner run from the lexer, the valid-symbol flags and the persistent
 * state.
 * `symop` starts at 0; in debug builds a fresh `Debug` record is attached as well.
 */
static Env env_new(TSLexer *l, const bool * symbols, State *state) {
  Env fresh = {
    .lexer = l,
    .symbols = symbols,
    .symop = 0,
    .state = state,
#ifdef TREE_SITTER_DEBUG
    .debug = debug_new(l),
#endif
  };
  return fresh;
}
|
|
617
|
+
|
|
618
|
+
// Zero the whole persistent newline record, which also sets its mode back to `NInactive`.
static void reset_newline(Env *env) {
  Newline *record = &env->state->newline;
  memset(record, 0, sizeof(Newline));
}
|
|
619
|
+
|
|
620
|
+
// Whether the scanner is in one of the two newline modes, lookahead (`NInit`) or processing (`NProcess`).
static bool newline_active(Env *env) {
  switch (env->state->newline.state) {
    case NInit:
    case NProcess:
      return true;
    default:
      return false;
  }
}
|
|
621
|
+
|
|
622
|
+
// Whether the newline mode is `NInit`, i.e. newline lookahead.
static bool newline_init(Env *env) {
  const Newline *record = &env->state->newline;
  return record->state == NInit;
}
|
|
623
|
+
|
|
624
|
+
// --------------------------------------------------------------------------------------------------------
|
|
625
|
+
// Lexer interaction
|
|
626
|
+
// --------------------------------------------------------------------------------------------------------
|
|
627
|
+
|
|
628
|
+
// Ask the lexer whether the end of the input has been reached.
static bool is_eof(Env *env) {
  TSLexer *lexer = env->lexer;
  return lexer->eof(lexer);
}
|
|
629
|
+
|
|
630
|
+
// Negation of `is_eof`: true while there is still input left.
static bool not_eof(Env *env) {
  bool at_end = is_eof(env);
  return !at_end;
}
|
|
631
|
+
|
|
632
|
+
/**
|
|
633
|
+
* The parser's position in the current line.
|
|
634
|
+
* Note: This is expensive to use.
|
|
635
|
+
*/
|
|
636
|
+
static uint32_t column(Env *env) {
|
|
637
|
+
return is_eof(env) ? 0 : env->lexer->get_column(env->lexer);
|
|
638
|
+
}
|
|
639
|
+
|
|
640
|
+
/**
|
|
641
|
+
* tree-sitter's lexer interface maintains a current position that determines the lookahead character and the range of
|
|
642
|
+
* text that is associated with the symbol selected by the scanner, if `mark_end` is called.
|
|
643
|
+
*
|
|
644
|
+
* It's not possible to read earlier characters once the lexer has advanced over them, so this function appends the
|
|
645
|
+
* lookahead character to the array `lookahead` in the `State`.
|
|
646
|
+
*
|
|
647
|
+
* Don't add zeroes to the lookahead buffer when hitting EOF – it causes `no_lookahead` to report false negatives.
|
|
648
|
+
*/
|
|
649
|
+
static void advance(Env *env) {
|
|
650
|
+
if (not_eof(env)) {
|
|
651
|
+
array_push(&env->state->lookahead, PEEK);
|
|
652
|
+
env->lexer->advance(env->lexer, false);
|
|
653
|
+
}
|
|
654
|
+
}
|
|
655
|
+
|
|
656
|
+
/**
 * Report the outcome of a run to tree-sitter.
 *
 * For any symbol other than `FAIL`, the symbol is written to the lexer's `result_symbol` and `true` is returned.
 * For `FAIL`, the lexer is left untouched and `false` is returned.
 */
static bool set_result_symbol(Env *env, Symbol result) {
  if (result == FAIL) return false;
  env->lexer->result_symbol = (TSSymbol) result;
  return true;
}
|
|
663
|
+
|
|
664
|
+
#ifdef TREE_SITTER_DEBUG
|
|
665
|
+
|
|
666
|
+
/**
 * Debug variant of `MARK`: log the caller-supplied description, record the current column and the description in the
 * run's `Debug` data, and then call the lexer's `mark_end`.
 */
static void mark_debug(Env *env, const char *restrict marked_by) {
  dbg("mark: %s\n", marked_by);
  Debug *info = &env->debug;
  info->marked = (int) column(env);
  info->marked_line = 0;
  info->marked_by = marked_by;
  env->lexer->mark_end(env->lexer);
}
|
|
673
|
+
|
|
674
|
+
// Forward declaration, so that `advance_debug` and `skip_debug` can call it before its definition.
static void append_parse_buffer(Env *env);
|
|
675
|
+
|
|
676
|
+
/**
 * Debug variant of `S_ADVANCE`: call `append_parse_buffer` first, then perform the regular `advance`.
 */
static void advance_debug(Env *env) {
  append_parse_buffer(env);

  advance(env);
}
|
|
680
|
+
|
|
681
|
+
/**
 * Debug variant of `S_SKIP`: call `append_parse_buffer` first, then advance the lexer with the skip flag set, so that
 * the consumed character is treated as whitespace.
 */
static void skip_debug(Env *env) {
  append_parse_buffer(env);

  TSLexer *lexer = env->lexer;
  lexer->advance(lexer, true);
}
|
|
685
|
+
|
|
686
|
+
#endif
|
|
687
|
+
|
|
688
|
+
/**
|
|
689
|
+
* `inline` has a noticeable impact, reaching parity with a macro.
|
|
690
|
+
*/
|
|
691
|
+
static inline bool valid(Env *env, Symbol s) { return env->symbols[s]; }
|
|
692
|
+
|
|
693
|
+
// --------------------------------------------------------------------------------------------------------
|
|
694
|
+
// Symbol constructors
|
|
695
|
+
// --------------------------------------------------------------------------------------------------------
|
|
696
|
+
|
|
697
|
+
/**
 * Return `s` unchanged, logging `desc` via `dbg`.
 * Outside of debug builds `dbg` expands to nothing, which is why `desc` is explicitly marked as used.
 */
static Symbol finish(Symbol s, const char *restrict desc) {
  dbg("finish: %s\n", desc);
  // Suppress unused param warning
  (void) desc;
  return s;
}
|
|
703
|
+
|
|
704
|
+
/**
 * Like `finish`, but only when `s` is valid at the current position; otherwise the result is `FAIL`.
 */
static Symbol finish_if_valid(Env *env, Symbol s, const char *restrict desc) {
  return valid(env, s) ? finish(s, desc) : FAIL;
}
|
|
708
|
+
|
|
709
|
+
/**
 * Mark the current lexer position as the end of the symbol via `MARK`, then return `s` unchanged.
 * `MARK` only uses its argument in debug builds, hence the explicit use of `desc`.
 */
static Symbol finish_marked(Env *env, Symbol s, const char *restrict desc) {
  // Suppress unused param warning
  (void) desc;

  MARK(desc);
  return s;
}
|
|
714
|
+
|
|
715
|
+
static Symbol update_state(const char *restrict desc) {
|
|
716
|
+
return finish(UPDATE, desc);
|
|
717
|
+
}
|
|
718
|
+
|
|
719
|
+
// --------------------------------------------------------------------------------------------------------
|
|
720
|
+
// Lookahead
|
|
721
|
+
// --------------------------------------------------------------------------------------------------------
|
|
722
|
+
|
|
723
|
+
/**
|
|
724
|
+
* Ensure that at least `abs + 1` characters are present in the lookahead buffer by calling `advance` `abs - len + 1`
|
|
725
|
+
* times.
|
|
726
|
+
*/
|
|
727
|
+
static void advance_over_abs(Env *env, uint32_t abs) {
  // Start counting at the current buffer size and advance once for each index up to and including `abs`.
  uint32_t next = env->state->lookahead.size;
  while (next <= abs) {
    S_ADVANCE;
    next++;
  }
}
|
|
730
|
+
|
|
731
|
+
/**
|
|
732
|
+
* Ensure that at least `rel` characters after and including the current `offset` are present in the lookahead buffer by
|
|
733
|
+
* calling `advance` as often as the difference between the desired index (`offset + rel`) and one less than the current
|
|
734
|
+
* buffer size.
|
|
735
|
+
*
|
|
736
|
+
* Note: The character at the offset is included in the range, so that when `len == offset == rel == 0`, this function
|
|
737
|
+
* advances once, over the character at index 0.
|
|
738
|
+
*/
|
|
739
|
+
static void advance_over(Env *env, uint32_t rel) {
  // Translate the offset-relative index into an absolute buffer index.
  uint32_t target = env->state->lookahead.offset + rel;
  advance_over_abs(env, target);
}
|
|
742
|
+
|
|
743
|
+
/**
|
|
744
|
+
* Skip whitespace relative to `offset`, but keep characters that have already been copied to the buffer.
|
|
745
|
+
*
|
|
746
|
+
* Example:
|
|
747
|
+
*
|
|
748
|
+
* > a = b
|
|
749
|
+
* ^
|
|
750
|
+
*
|
|
751
|
+
* Assume step A sets `offset` to 1, pointing to the first space.
|
|
752
|
+
* Step B calls `peek1`, to look at the `=`. This needs to advance over the space, which is copied to the lookahead
|
|
753
|
+
* buffer, causing `lexer->lookahead` to return `=`.
|
|
754
|
+
* Step C then calls `peek0`, sees that it is a space, and requests that it be skipped. Since it is already in the
|
|
755
|
+
* buffer, calling `lexer->advance` would skip the wrong character.
|
|
756
|
+
*
|
|
757
|
+
* Hence, this function only skips indexes at or beyond the lookahead buffer's `len`.
|
|
758
|
+
*
|
|
759
|
+
* Additionally, if `offset` has been set to a position outside of the buffer, all characters up to that index are
|
|
760
|
+
* copied to the buffer beforehand.
|
|
761
|
+
*/
|
|
762
|
+
static void skip_over(Env *env, uint32_t rel) {
  Lookahead *buf = &env->state->lookahead;
  // `offset > size` implies `offset >= 1`, so the subtraction cannot wrap.
  if (buf->offset > buf->size) {
    advance_over_abs(env, buf->offset - 1);
  }
  uint32_t last = buf->offset + rel;
  // Skip once for each index from the buffer size (read after the catch-up above) through `last`.
  uint32_t idx = buf->size;
  while (idx <= last) {
    S_SKIP;
    idx++;
  }
}
|
|
769
|
+
|
|
770
|
+
/**
|
|
771
|
+
* Ensure that the lookahead buffer is large enough to allow reading the `n`th character.
|
|
772
|
+
* Since `lexer->lookahead` points at the character after the buffer, it must have `offset + n - 1` elements.
|
|
773
|
+
*/
|
|
774
|
+
static void advance_before(Env *env, uint32_t rel) {
  uint32_t target = env->state->lookahead.offset + rel;
  // Nothing precedes index 0, so there is nothing to advance over.
  if (target == 0) return;
  advance_over_abs(env, target - 1);
}
|
|
778
|
+
|
|
779
|
+
/**
|
|
780
|
+
* Return the lookahead character with index `n`.
|
|
781
|
+
* If the index is larger than the lookahead buffer, return 0.
|
|
782
|
+
*
|
|
783
|
+
* Unsafe insofar as it does not advance if the index points outside of the lookahead buffer.
|
|
784
|
+
* This may happen in regular operation when a tool like `seq` attempts to look beyond EOF.
|
|
785
|
+
*/
|
|
786
|
+
static int32_t unsafe_peek_abs(Env *env, uint32_t abs) {
  Lookahead *buf = &env->state->lookahead;
  // Out-of-range indexes yield 0 instead of triggering an advance.
  if (abs >= buf->size) return 0;
  return buf->contents[abs];
}
|
|
792
|
+
|
|
793
|
+
/**
|
|
794
|
+
* Return the lookahead character with index `offset + n`.
|
|
795
|
+
* See `unsafe_peek_abs`.
|
|
796
|
+
*/
|
|
797
|
+
static int32_t unsafe_peek(Env *env, uint32_t rel) {
  uint32_t abs = env->state->lookahead.offset + rel;
  return unsafe_peek_abs(env, abs);
}
|
|
800
|
+
|
|
801
|
+
#ifdef TREE_SITTER_DEBUG
|
|
802
|
+
|
|
803
|
+
/**
 * Log a peek request: the current offset (if nonzero), the relative index, and whether the absolute index lies inside
 * the buffer ("cached") or beyond its size ("advance"). Nothing extra is logged when the index equals the size.
 */
static void debug_peek(Env *env, uint32_t rel) {
  Lookahead *buf = &env->state->lookahead;
  uint32_t abs = buf->offset + rel;
  dbg("peek ");
  if (buf->offset > 0) {
    dbg("%u->", buf->offset);
  }
  dbg("%u", rel);
  if (abs < buf->size) {
    dbg(" cached | len: %u", buf->size);
  }
  else if (abs > buf->size) {
    dbg(" advance | len: %u", buf->size);
  }
  dbg("\n");
}
|
|
814
|
+
|
|
815
|
+
#endif
|
|
816
|
+
|
|
817
|
+
/**
|
|
818
|
+
* Return the lookahead character with index `offset + rel`.
|
|
819
|
+
* If the character is not accessible, advance the position until it is.
|
|
820
|
+
*
|
|
821
|
+
* This "peeks" insofar as it doesn't advance over the requested character – `peek(0)` is equivalent to
|
|
822
|
+
* `lexer->lookahead` if `offset == 0`.
|
|
823
|
+
*/
|
|
824
|
+
static int32_t peek(Env *env, uint32_t rel) {
#ifdef TREE_SITTER_DEBUG
  debug_peek(env, rel);
#endif
  Lookahead *buf = &env->state->lookahead;
  bool cached = buf->offset + rel < buf->size;
  if (!cached) {
    // Advance over everything before the requested index, then read it through `PEEK`.
    advance_before(env, rel);
    return PEEK;
  }
  return unsafe_peek(env, rel);
}
|
|
834
|
+
|
|
835
|
+
/**
|
|
836
|
+
* Return the first lookahead character after the `offset` without advancing the position.
|
|
837
|
+
*/
|
|
838
|
+
static int32_t peek0(Env *env) {
  const uint32_t first = 0;
  return peek(env, first);
}
|
|
839
|
+
|
|
840
|
+
/**
|
|
841
|
+
* Return the second lookahead character after the `offset` without advancing the position further than the first
|
|
842
|
+
* character.
|
|
843
|
+
*/
|
|
844
|
+
static int32_t peek1(Env *env) {
  const uint32_t second = 1;
  return peek(env, second);
}
|
|
845
|
+
|
|
846
|
+
/**
|
|
847
|
+
* Return the third lookahead character after the `offset` without advancing the position further than the second
|
|
848
|
+
* character.
|
|
849
|
+
*/
|
|
850
|
+
static int32_t peek2(Env *env) {
  const uint32_t third = 2;
  return peek(env, third);
}
|
|
851
|
+
|
|
852
|
+
/**
|
|
853
|
+
* Test the lookahead character at index `offset + n` for equality.
|
|
854
|
+
*/
|
|
855
|
+
static bool char_at(Env *env, uint32_t n, int32_t c) {
  int32_t actual = peek(env, n);
  return actual == c;
}
|
|
858
|
+
|
|
859
|
+
/**
|
|
860
|
+
* Test the lookahead character at index `offset` for equality.
|
|
861
|
+
*/
|
|
862
|
+
static bool char0(Env *env, int32_t c) {
  bool matches = char_at(env, 0, c);
  return matches;
}
|
|
865
|
+
|
|
866
|
+
/**
|
|
867
|
+
* Test the lookahead character at index `offset + 1` for equality.
|
|
868
|
+
*/
|
|
869
|
+
static bool char1(Env *env, int32_t c) {
  bool matches = char_at(env, 1, c);
  return matches;
}
|
|
872
|
+
|
|
873
|
+
/**
|
|
874
|
+
* Test the lookahead character at index `offset + 2` for equality.
|
|
875
|
+
*/
|
|
876
|
+
static bool char2(Env *env, int32_t c) {
  bool matches = char_at(env, 2, c);
  return matches;
}
|
|
879
|
+
|
|
880
|
+
/**
|
|
881
|
+
* Set the offset to `index`, so that the indexes in future calls to lookahead functions like `char0` are interpreted
|
|
882
|
+
* relative to this new value.
|
|
883
|
+
*
|
|
884
|
+
* Resets `symop` for soundness, even though no rule would continue after advancing over symbolic characters.
|
|
885
|
+
*
|
|
886
|
+
* See 'Lookahead' for an example.
|
|
887
|
+
*/
|
|
888
|
+
static void reset_lookahead_abs(Env *env, uint32_t abs) {
  dbg("reset: %u\n", abs);
  // The cached symop length is cleared whenever the offset moves.
  env->symop = 0;
  env->state->lookahead.offset = abs;
}
|
|
893
|
+
|
|
894
|
+
/**
 * Move `offset` forward by `rel`, i.e. reset it to the absolute index `offset + rel`.
 */
static void reset_lookahead_to(Env *env, uint32_t rel) {
  uint32_t target = env->state->lookahead.offset + rel;
  reset_lookahead_abs(env, target);
}
|
|
897
|
+
|
|
898
|
+
/**
|
|
899
|
+
* Move `offset` to the end of the consumed lookahead, causing `peek`, `char0` etc. to operate on characters following
|
|
900
|
+
* the current position at the time this function is executed.
|
|
901
|
+
*/
|
|
902
|
+
static void reset_lookahead(Env *env) {
  // The buffer size is the index just past the last consumed character.
  uint32_t end = env->state->lookahead.size;
  reset_lookahead_abs(env, end);
}
|
|
905
|
+
|
|
906
|
+
/**
|
|
907
|
+
* Return whether the lookahead position has been advanced since starting the run, not considering skipped characters
|
|
908
|
+
* (which are usually whitespace).
|
|
909
|
+
* This is important to decide whether the scanner has to be restarted to emit certain symbols.
|
|
910
|
+
*
|
|
911
|
+
* For example, before starting layouts and generating layout semicolons after newlines, we skip whitespace and mark, so
|
|
912
|
+
* that subsequent symbols start at their non-whitespace boundary instead of before the newline(s).
|
|
913
|
+
* When newline lookahead mode finishes, it can continue directly with this step _only if_ no non-whitespace characters
|
|
914
|
+
* were consumed, otherwise they would be included in the semicolon symbol.
|
|
915
|
+
* We also cannot unconditionally mark after whitespace in newline lookahead mode since there are several potential
|
|
916
|
+
* symbols that can be emitted before skipped whitespace is marked, like layout end, which should not extend beyond
|
|
917
|
+
* newlines.
|
|
918
|
+
*/
|
|
919
|
+
static bool no_lookahead(Env *env) {
  uint32_t buffered = env->state->lookahead.size;
  return buffered == 0;
}
|
|
922
|
+
|
|
923
|
+
/**
|
|
924
|
+
* Return the column of the first lookahead character of the current run.
|
|
925
|
+
* This is needed for starting layouts in interior mode, since we don't count positions across interior runs.
|
|
926
|
+
*/
|
|
927
|
+
static uint32_t start_column(Env *env) {
  // Subtract the number of characters buffered in this run from the current column.
  uint32_t buffered = env->state->lookahead.size;
  return column(env) - buffered;
}
|
|
930
|
+
|
|
931
|
+
/**
|
|
932
|
+
* Increment `i` while the predicate is true for the lookahead character at that index (relative to `offset`), advancing
|
|
933
|
+
* the position when `i` points beyond the end of the lookahead buffer.
|
|
934
|
+
* Return the index after the last matching character.
|
|
935
|
+
*/
|
|
936
|
+
static uint32_t advance_while(Env *env, uint32_t i, bool (*pred)(int32_t)) {
  for (;;) {
    // Stop at the first character that fails the predicate; `i` is then one past the last match.
    if (!pred(peek(env, i))) return i;
    i++;
  }
}
|
|
940
|
+
|
|
941
|
+
/**
|
|
942
|
+
* Same as `advance_while`, using "not equal to `c`" for the predicate.
|
|
943
|
+
* Stops at EOF.
|
|
944
|
+
*/
|
|
945
|
+
static uint32_t advance_until_char(Env *env, uint32_t i, int32_t c) {
  // The EOF check comes first so that no peek happens once the input is exhausted.
  for (; not_eof(env) && !char_at(env, i, c); i++) {}
  return i;
}
|
|
949
|
+
|
|
950
|
+
// --------------------------------------------------------------------------------------------------------
|
|
951
|
+
// Context manipulation and conditions
|
|
952
|
+
// --------------------------------------------------------------------------------------------------------
|
|
953
|
+
|
|
954
|
+
/**
 * Return whether the context stack contains at least one entry.
 */
static bool has_contexts(Env *env) {
  return env->state->contexts.size > 0;
}
|
|
955
|
+
|
|
956
|
+
/**
|
|
957
|
+
* Push a layout context onto the stack.
|
|
958
|
+
*/
|
|
959
|
+
static void push_context(Env *env, ContextSort sort, uint32_t indent) {
  dbg("push: %s %d\n", context_names[sort], indent);
  Context entry = {.sort = sort, .indent = indent};
  array_push(&env->state->contexts, entry);
}
|
|
964
|
+
|
|
965
|
+
/**
|
|
966
|
+
* Remove a layout context from the stack.
|
|
967
|
+
*/
|
|
968
|
+
static void pop(Env *env) {
  // Popping an empty stack is a no-op.
  if (!has_contexts(env)) return;
  dbg("pop: %s\n", context_names[array_back(&env->state->contexts)->sort]);
  array_pop(&env->state->contexts);
}
|
|
974
|
+
|
|
975
|
+
/**
 * Return the sort of the context on top of the stack, or `NoContext` if the stack is empty.
 */
static ContextSort current_context(Env *env) {
  if (!has_contexts(env)) return NoContext;
  return array_back(&env->state->contexts)->sort;
}
|
|
978
|
+
|
|
979
|
+
/**
 * Return whether the current context's sort orders before `Braces`.
 */
static bool is_layout_context(Env *env) {
  ContextSort sort = current_context(env);
  return sort < Braces;
}
|
|
982
|
+
|
|
983
|
+
/**
|
|
984
|
+
* Decide whether the current context requires generation of layout semicolons.
|
|
985
|
+
* This is true for all layout contexts except for multi-way if, since that uses `|` to start layout elements.
|
|
986
|
+
*/
|
|
987
|
+
static bool is_semicolon_context(Env *env) {
  // Sorts ordered before `MultiWayIfLayout` get layout semicolons.
  ContextSort sort = current_context(env);
  return sort < MultiWayIfLayout;
}
|
|
990
|
+
|
|
991
|
+
/**
|
|
992
|
+
* Return the indent of the innermost layout context.
|
|
993
|
+
* If there are non-layout contexts at the top of the stack, search downwards.
|
|
994
|
+
*/
|
|
995
|
+
static uint32_t current_indent(Env *env) {
  // Walk from the top of the stack downwards until a sort ordered before `Braces` is found.
  int32_t idx = (int32_t) env->state->contexts.size;
  while (--idx >= 0) {
    Context *ctx = array_get(&env->state->contexts, idx);
    if (ctx->sort < Braces) return ctx->indent;
  }
  return 0;
}
|
|
1002
|
+
|
|
1003
|
+
/**
 * Return whether the current context is a layout and `indent` is strictly smaller than the current layout indent.
 */
static bool indent_less(Env *env, uint32_t indent) {
  if (!is_layout_context(env)) return false;
  return indent < current_indent(env);
}
|
|
1006
|
+
|
|
1007
|
+
/**
 * Return whether the current context is a layout and `indent` is smaller than or equal to the current layout indent.
 */
static bool indent_lesseq(Env *env, uint32_t indent) {
  if (!is_layout_context(env)) return false;
  return indent <= current_indent(env);
}
|
|
1010
|
+
|
|
1011
|
+
/**
 * Return whether exactly one context is on the stack.
 */
static bool top_layout(Env *env) {
  uint32_t depth = env->state->contexts.size;
  return depth == 1;
}
|
|
1014
|
+
|
|
1015
|
+
/**
 * Return whether the context on top of the stack is `ModuleHeader`.
 */
static bool in_module_header(Env *env) {
  ContextSort sort = current_context(env);
  return sort == ModuleHeader;
}
|
|
1018
|
+
|
|
1019
|
+
/**
|
|
1020
|
+
* Return the appropriate symbol to close the given context, or FAIL if it can't be closed.
|
|
1021
|
+
*/
|
|
1022
|
+
static Symbol context_end_sym(ContextSort s) {
  if (s == TExp) return END_TEXP;
  if (s == Braces) return END_BRACE;
  // Sorts ordered before `Braces` close with `END`; anything else cannot be closed.
  return s < Braces ? END : FAIL;
}
|
|
1032
|
+
|
|
1033
|
+
// --------------------------------------------------------------------------------------------------------
|
|
1034
|
+
// Character and lookahead conditions
|
|
1035
|
+
// --------------------------------------------------------------------------------------------------------
|
|
1036
|
+
|
|
1037
|
+
// Case labels for the characters treated as newlines. The final label deliberately has no trailing colon, so that use
// sites read `NEWLINE_CASES:`.
#define NEWLINE_CASES case '\n': case '\r': case '\f'
|
|
1041
|
+
|
|
1042
|
+
|
|
1043
|
+
/**
 * Return whether `c` is one of the characters listed in `NEWLINE_CASES`.
 */
static bool is_newline(int32_t c) {
  bool result = false;
  switch (c) {
    NEWLINE_CASES:
      result = true;
      break;
    default:
      break;
  }
  return result;
}
|
|
1051
|
+
|
|
1052
|
+
/**
 * Return whether `c` is an underscore or satisfies `is_varid_start_char`.
 */
static bool varid_start_char(const int32_t c) {
  if (c == '_') return true;
  return is_varid_start_char(c);
}
|
|
1053
|
+
|
|
1054
|
+
// TODO This should be combined with is_inner_id_char and made more explicit about when which char can occur.
|
|
1055
|
+
// For example, lex_symop uses this to decide about prefix dot being a field selector, where single quotes aren't valid.
|
|
1056
|
+
static bool is_id_char(const int32_t c) {
  switch (c) {
    // Underscore and single quote are accepted in addition to whatever `is_identifier_char` accepts.
    case '_':
    case '\'':
      return true;
    default:
      return is_identifier_char(c);
  }
}
|
|
1059
|
+
|
|
1060
|
+
// TODO hashes only work at the end of identifiers
|
|
1061
|
+
static bool is_inner_id_char(const int32_t c) {
  if (is_id_char(c)) return true;
  return c == '#';
}
|
|
1064
|
+
|
|
1065
|
+
/**
 * Return whether `c` satisfies `is_id_char` or is a dot.
 */
static bool quoter_char(const int32_t c) {
  if (is_id_char(c)) return true;
  return c == '.';
}
|
|
1066
|
+
|
|
1067
|
+
/**
 * Return whether `c` is one of the characters `symop_char` excludes from symbolic operators:
 * `( ) , ; [ ] { }`, backtick, double quote, single quote and underscore.
 */
static bool reserved_symbolic(const int32_t c) {
  return c == '(' || c == ')' || c == ',' || c == ';' || c == '[' || c == ']' || c == '`' || c == '{' || c == '}' ||
         c == '"' || c == '\'' || c == '_';
}
|
|
1085
|
+
|
|
1086
|
+
/**
 * Return whether `c` satisfies `is_symop_char` and is not rejected by `reserved_symbolic`.
 */
static bool symop_char(const int32_t c) {
  if (!is_symop_char(c)) return false;
  return !reserved_symbolic(c);
}
|
|
1089
|
+
|
|
1090
|
+
/**
|
|
1091
|
+
* Advance the position to the first character that's not valid for a symbolic operator, and return that position.
|
|
1092
|
+
* If the function has been called before, directly return the cached position.
|
|
1093
|
+
*
|
|
1094
|
+
* This consumes the entire symop, since the field denotes the length of the string and therefore the last (failing)
|
|
1095
|
+
* peek is _beyond_ the end, consuming the last valid char.
|
|
1096
|
+
*/
|
|
1097
|
+
static uint32_t symop_lookahead(Env *env) {
  // A nonzero cached length means the scan has already happened.
  if (env->symop != 0) return env->symop;
  env->symop = advance_while(env, 0, symop_char);
  if (env->symop > 0) {
    dbg("symop: %d, %.*ls\n", env->symop, env->symop, env->state->lookahead.contents + env->state->lookahead.offset);
  }
  return env->symop;
}
|
|
1105
|
+
|
|
1106
|
+
/**
 * Return whether `symop_lookahead` finds at least one symbolic operator character.
 */
static bool is_symop(Env *env) {
  uint32_t length = symop_lookahead(env);
  return length > 0;
}
|
|
1109
|
+
|
|
1110
|
+
/**
|
|
1111
|
+
* The parser calls `scan` with all symbols declared as valid directly after it encountered an error.
|
|
1112
|
+
* The symbol `FAIL` is not used in the grammar, so it can only be valid in this error case.
|
|
1113
|
+
*/
|
|
1114
|
+
|
|
1115
|
+
static bool after_error(Env *env) {
  bool fail_is_valid = valid(env, FAIL);
  return fail_is_valid;
}
|
|
1116
|
+
|
|
1117
|
+
// --------------------------------------------------------------------------------------------------------
|
|
1118
|
+
// Debug printing
|
|
1119
|
+
// --------------------------------------------------------------------------------------------------------
|
|
1120
|
+
|
|
1121
|
+
#ifdef TREE_SITTER_DEBUG
|
|
1122
|
+
|
|
1123
|
+
/**
 * Append a fresh line, with capacity reserved for one character, to the debug parse buffer.
 */
static void push_parse_buffer_line(Env *env) {
  ParseLine fresh = array_new();
  array_reserve(&fresh, 1);
  array_push(&env->state->parse, fresh);
}
|
|
1128
|
+
|
|
1129
|
+
/**
 * Return the last line of the debug parse buffer, first pushing a line if the buffer is empty and another one if the
 * current character is a newline.
 */
static ParseLine *ensure_parse_buffer(Env *env) {
  ParseLines *lines = &env->state->parse;
  if (lines->size == 0) {
    push_parse_buffer_line(env);
  }
  if (is_newline(PEEK)) {
    push_parse_buffer_line(env);
  }
  return array_back(lines);
}
|
|
1135
|
+
|
|
1136
|
+
/**
 * Record the current character in the debug parse buffer.
 * On a newline, only the `marked_line` and `start_line` counters are incremented; otherwise the character is appended
 * to the last line when the current column is at or beyond that line's length.
 */
static void append_parse_buffer(Env *env) {
  ParseLine *line = ensure_parse_buffer(env);
  if (!is_newline(PEEK)) {
    if (column(env) >= line->size) array_push(line, PEEK);
    return;
  }
  env->debug.marked_line++;
  env->debug.start_line++;
}
|
|
1144
|
+
|
|
1145
|
+
/**
 * Store the current column as `end_col`, then advance until a newline or EOF is reached.
 */
static void fill_parse_buffer(Env *env) {
  env->debug.end_col = column(env);
  while (!is_newline(PEEK) && !is_eof(env)) {
    S_ADVANCE;
  }
}
|
|
1149
|
+
|
|
1150
|
+
static bool seq(Env *env, const char *restrict s);
|
|
1151
|
+
|
|
1152
|
+
/**
 * Log the contents of the lookahead buffer, using its size as the precision.
 */
static void print_lookahead(Env *env) {
  Lookahead *buf = &env->state->lookahead;
  dbg("lookahead: %.*ls\n", buf->size, buf->contents);
}
|
|
1155
|
+
|
|
1156
|
+
// Printable stand-ins returned by `show_char`.
static const char * space = "<space>";
static const char * newline_char = "\\n";

/**
 * Return a printable stand-in for newline characters and for space, tab and vertical tab, or NULL for every other
 * character.
 */
static const char * show_char(int32_t c) {
  if (is_newline(c)) return newline_char;
  if (c == ' ' || c == '\t' || c == '\v') return space;
  return NULL;
}
|
|
1171
|
+
|
|
1172
|
+
/**
 * Log the lookahead buffer from absolute index `start` to its end, substituting the `show_char` stand-ins where they
 * exist. Logs a "Too large" message instead when `start` is not inside the buffer.
 */
static void print_lookahead_chars_from(Env *env, uint32_t start) {
  Lookahead *buf = &env->state->lookahead;
  if (start >= buf->size) {
    dbg("print_lookahead_chars_from: Too large (%d / %d)", start, buf->size);
    return;
  }
  dbg("lookahead from %d: ", start);
  for (; start < buf->size; start++) {
    int32_t c = buf->contents[start];
    const char * shown = show_char(c);
    if (shown != NULL) {
      dbg("%s", shown);
    }
    else {
      dbg("%lc", c);
    }
  }
  dbg("\n");
}
|
|
1186
|
+
|
|
1187
|
+
/**
 * Log the context stack from bottom to top, separated by dashes.
 * `ModuleHeader`, `Braces` and `TExp` are logged by name only; every other sort is logged as its indent, preceded by a
 * short label for the sorts that have one.
 */
static void debug_contexts(Env *env) {
  if (env->state->contexts.size == 0) {
    dbg("empty");
  }
  for (size_t i = 0; i < env->state->contexts.size; i++) {
    if (i > 0) {
      dbg("-");
    }
    Context ctx = *array_get(&env->state->contexts, i);
    bool with_indent = true;
    switch (ctx.sort) {
      case ModuleHeader:
        dbg("pre");
        with_indent = false;
        break;
      case Braces:
        dbg("brace");
        with_indent = false;
        break;
      case TExp:
        dbg("texp");
        with_indent = false;
        break;
      case DoLayout:
        dbg("do ");
        break;
      case LetLayout:
        dbg("let ");
        break;
      case CaseLayout:
        dbg("case ");
        break;
      case MultiWayIfLayout:
        dbg("if ");
        break;
      case QuoteLayout:
        dbg("quote ");
        break;
      default:
        break;
    }
    if (with_indent) {
      dbg("%d", ctx.indent);
    }
  }
}
|
|
1207
|
+
|
|
1208
|
+
/**
 * Log the newline lookahead state: its mode, then (unless inactive) the indent and end token name, then one tag for
 * each flag that is set.
 */
void debug_newline(Env *env) {
  if (env->state->newline.state == NInactive) {
    dbg("no");
  }
  else if (env->state->newline.state == NInit) {
    dbg("init");
  }
  else if (env->state->newline.state == NProcess) {
    dbg("process");
  }
  else if (env->state->newline.state == NResume) {
    dbg("resume");
  }
  if (env->state->newline.state != NInactive) {
    dbg(" %d %s", env->state->newline.indent, token_names[env->state->newline.end]);
  }
  if (env->state->newline.eof) {
    dbg(" [eof]");
  }
  if (env->state->newline.no_semi) {
    dbg(" [no_semi]");
  }
  if (env->state->newline.skip_semi) {
    dbg(" [skip_semi]");
  }
  if (env->state->newline.unsafe) {
    dbg(" [unsafe]");
  }
}
|
|
1229
|
+
|
|
1230
|
+
/**
|
|
1231
|
+
* Produce a comma-separated string of valid symbols.
|
|
1232
|
+
*/
|
|
1233
|
+
static void debug_valid(Env *env, const bool *syms) {
  // After an error every symbol is valid, so abbreviate.
  if (after_error(env)) {
    dbg("all");
    return;
  }
  bool need_comma = false;
  for (Symbol i = FAIL; i <= UPDATE; i++) {
    if (!syms[i]) continue;
    if (need_comma) {
      dbg(",");
    }
    dbg("%s", sym_names[i]);
    need_comma = true;
  }
}
|
|
1247
|
+
|
|
1248
|
+
/**
 * Set the locale to "C.UTF-8" and log the scanner state at the start of a run: valid symbols, context stack and
 * newline state. Always returns false.
 */
static bool debug_init(Env *env) {
  setlocale(LC_ALL, "C.UTF-8");

  dbg("\n");
  dbg("state:\n syms = ");
  debug_valid(env, env->symbols);

  dbg("\n contexts = ");
  debug_contexts(env);

  dbg("\n newline = ");
  debug_newline(env);

  dbg("\n");
  return false;
}
|
|
1260
|
+
|
|
1261
|
+
/**
 * Log the escape sequence `ESC [ <code> m` with the given code string.
 */
void sgr(const char *restrict code) {
  const char *restrict params = code;
  dbg("\x1b[%sm", params);
}
|
|
1264
|
+
|
|
1265
|
+
/**
 * Emit the escape sequence with code `3<c>` via `sgr`.
 *
 * The buffer is large enough for any `unsigned` value and the write is bounded, so a `c` with more than one digit
 * can no longer overflow it.
 */
void color(unsigned c) {
  char code[16];
  snprintf(code, sizeof code, "3%u", c);
  sgr(code);
}
|
|
1270
|
+
|
|
1271
|
+
/**
 * Log a legend for the colors used by `debug_parse`, one colored word per segment, then reset with an empty code.
 *
 * Declared with `(void)` so that the declaration is a proper prototype rather than an unspecified-parameter list.
 */
void palette(void) {
  color(4);
  dbg("before");
  color(2);
  dbg(" marked");
  color(3);
  dbg(" advanced");
  color(5);
  dbg(" lookahead");
  sgr("");
  dbg("\n");
}
|
|
1283
|
+
|
|
1284
|
+
// When true, `debug_parse` additionally calls `dump_parse_metadata`.
static bool debug_parse_metadata = false;
|
|
1285
|
+
|
|
1286
|
+
/**
 * Log the bookkeeping values of the debug parse buffer on a single line.
 */
static void dump_parse_metadata(Env *env) {
  Debug *d = &env->debug;
  ParseLines *parse = &env->state->parse;
  dbg(
    "lines: %d | start_line: %d | start_col: %d | marked_line: %d | marked: %d | end_col: %d | persist lines: %d\n",
    parse->size,
    d->start_line,
    d->start_col,
    d->marked_line,
    d->marked,
    d->end_col,
    parse->size - d->marked_line
  );
}
|
|
1299
|
+
|
|
1300
|
+
/**
|
|
1301
|
+
* Note: We're printing individual characters here instead of using a format with precision like `%.*ls` and slicing
|
|
1302
|
+
* the buffer, because:
|
|
1303
|
+
* - The buffer contains wide characters, but `fprintf` counts bytes
|
|
1304
|
+
* - `fwprintf` counts wide characters, but can't be interleaved with `fprintf`, so we'd have to use that function, and
|
|
1305
|
+
* therefore wide literals, everywhere, which is tedious
|
|
1306
|
+
*/
|
|
1307
|
+
void debug_parse(Env *env) {
  Debug *debug = &env->debug;
  ParseLines *buffer = &env->state->parse;
  uint32_t lines = buffer->size;
  dbg("-----------------------\n");
  // For investigating mistakes in the debugging code.
  if (debug_parse_metadata) dump_parse_metadata(env);
  if (lines > 0) {
    color(4);
    for (uint32_t i = 0; i < lines; i++) {
      ParseLine *line = array_get(buffer, i);
      int32_t *buf = line->contents;
      if (line->contents == NULL) break;
      uint32_t pos = 0;

      // Each segment loop below is additionally bounded by `line->size`, so that a recorded column beyond the
      // buffered characters of this line cannot cause a read past the end of `buf`.

      // Segment before the start of the run, on the line where the run started.
      if (debug->start_line == lines - 1 - i) {
        while (pos < line->size && pos < debug->start_col) { dbg("%lc", buf[pos]); pos++; }
        color(2);
      }

      // Segment up to the marked position, on the line where the mark was set.
      if (debug->marked >= 0 && debug->marked_line == lines - 1 - i) {
        while (pos < line->size && (int) pos < debug->marked) { dbg("%lc", buf[pos]); pos++; }
        color(3);
      }

      // Segment up to the end column, on the last line.
      if (i == lines - 1) {
        while (pos < line->size && pos < debug->end_col) { dbg("%lc", buf[pos]); pos++; }
        color(5);
      }

      // Remainder of the line in whichever color is active.
      while (pos < line->size) { dbg("%lc", buf[pos]); pos++; }

      dbg("\n");
    }
    sgr("");
  }
  dbg("-----------------------\n");
}
|
|
1345
|
+
|
|
1346
|
+
/**
 * Write every line of `parse` to `cursor` as a `uint32_t` length followed by that many characters.
 *
 * `to_copy` is the number of bytes already accounted for by the caller. Returns the updated total, or 0 as soon as the
 * total would exceed `TREE_SITTER_SERIALIZATION_BUFFER_SIZE`.
 */
static unsigned serialize_parse_lines(char *cursor, ParseLines *parse, unsigned to_copy) {
  for (unsigned i = 0; i < parse->size; i++) {
    ParseLine *line = array_get(parse, i);
    uint32_t line_len = line->size;
    unsigned line_size = line_len * sizeof(uint32_t);
    to_copy += line_size + sizeof(uint32_t);
    if (to_copy > TREE_SITTER_SERIALIZATION_BUFFER_SIZE) return 0;
    // `cursor` is a byte pointer with no alignment guarantee, so store the length bytewise instead of through a
    // `uint32_t *` cast.
    memcpy(cursor, &line_len, sizeof line_len);
    cursor += sizeof line_len;
    // Skip the copy for empty lines so that `memcpy` is never handed a possibly-NULL `contents`.
    if (line_size > 0) memcpy(cursor, line->contents, line_size);
    cursor += line_size;
  }
  return to_copy;
}
|
|
1359
|
+
|
|
1360
|
+
/**
 * Restore `size` lines into `parse` from `cursor`, where each line is stored as a `uint32_t` length followed by that
 * many characters. Existing line slots are reused; surplus lines from the previous state are freed.
 */
static void deserialize_parse_lines(const char *cursor, ParseLines *parse, uint32_t size) {
  // Ensure ParseLines has room for at _least_ as many lines as the new state
  array_reserve(parse, size);
  for (unsigned i = 0; i < size; i++) {
    if (i >= parse->size) { array_push(parse, (ParseLine)array_new()); }
    ParseLine *line = &parse->contents[i];
    // `cursor` is a const byte pointer with no alignment guarantee, so load the length bytewise instead of through a
    // `uint32_t *` cast.
    uint32_t line_len;
    memcpy(&line_len, cursor, sizeof line_len);
    cursor += sizeof line_len;
    array_reserve(line, line_len);
    line->size = line_len;
    unsigned line_size = line_len * sizeof(uint32_t);
    // Skip the copy for empty lines so that `memcpy` is never handed a possibly-NULL `contents`.
    if (line_size > 0) memcpy(line->contents, cursor, line_size);
    cursor += line_size;
  }
  // Free the excess lines of the previous state, since we can't check in the next round whether there was a line in
  // a slot before and reuse the pointer.
  // This only runs when the previous state had more lines than the new one.
  for (unsigned i = parse->size; i > size; i--) { array_delete(array_get(parse, i - 1)); }
  // Truncate ParseLines in case the new state has fewer lines
  parse->size = size;
}
|
|
1381
|
+
|
|
1382
|
+
/**
 * Log the result of a scanner run and the position it ended at, print the debug parse buffer, and shrink the buffer
 * by `marked_line` lines.
 */
void debug_finish(Env *env, Symbol result) {
  dbg("result: ");
  if (!result) {
    dbg("<skipped>, ");
  }
  else {
    dbg("%s, ", sym_names[result]);
  }
  // A `marked` value of -1 means no mark was recorded; log the current column instead.
  if (env->debug.marked != -1) {
    dbg("%s@%d", env->debug.marked_by, env->debug.marked);
  }
  else {
    dbg("%d", column(env));
  }
  dbg("\n\n");
  fill_parse_buffer(env);
  debug_parse(env);
  env->state->parse.size -= env->debug.marked_line;
}
|
|
1393
|
+
|
|
1394
|
+
#endif
|
|
1395
|
+
|
|
1396
|
+
// --------------------------------------------------------------------------------------------------------
|
|
1397
|
+
// Lookahead
|
|
1398
|
+
// --------------------------------------------------------------------------------------------------------
|
|
1399
|
+
|
|
1400
|
+
/**
|
|
1401
|
+
* Check if lookahead contains the string `s` starting at position `offset + start`.
|
|
1402
|
+
* This advances only over matching characters.
|
|
1403
|
+
*/
|
|
1404
|
+
static bool seq_from(Env *env, const char *restrict s, uint32_t start) {
  uint32_t len = (uint32_t) strlen(s);
  uint32_t i = 0;
  while (i < len) {
    int32_t expected = s[i];
    int32_t actual = peek(env, start + i);
    if (actual != expected) return false;
    i++;
  }
  // After a full match, also peek at the index just past the string.
  peek(env, start + len);
  return true;
}
|
|
1414
|
+
|
|
1415
|
+
/**
|
|
1416
|
+
* Check if lookahead contains the string `s` starting at position `offset`.
|
|
1417
|
+
*/
|
|
1418
|
+
static bool seq(Env *env, const char *restrict s) {
  const uint32_t at_offset = 0;
  return seq_from(env, s, at_offset);
}
|
|
1421
|
+
|
|
1422
|
+
/**
|
|
1423
|
+
* Advance until the next newline or EOF, used to consume the body of a comment.
|
|
1424
|
+
*/
|
|
1425
|
+
static void take_line(Env *env) {
  for (;;) {
    if (!not_eof(env) || is_newline(PEEK)) return;
    S_ADVANCE;
  }
}
|
|
1428
|
+
|
|
1429
|
+
/**
 * Return whether `c` is a space or a horizontal tab.
 */
static bool is_space_or_tab(int32_t c) {
  switch (c) {
    case ' ':
    case '\t':
      return true;
    default:
      return false;
  }
}
|
|
1432
|
+
|
|
1433
|
+
/**
|
|
1434
|
+
* Advance until the next newline or EOF, used to consume the body of a cpp directive.
|
|
1435
|
+
* Escaped newlines are treated as line continuations, which allow spaces and tabs between backslash and newline.
|
|
1436
|
+
*/
|
|
1437
|
+
static void take_line_escaped_newline(Env *env) {
  for (;;) {
    // Consume up to the next newline, backslash or EOF.
    while (not_eof(env) && !is_newline(PEEK) && PEEK != '\\') S_ADVANCE;
    // Newline or EOF without a preceding backslash ends the line.
    if (PEEK != '\\') return;
    S_ADVANCE;
    bool had_space = is_space_or_tab(PEEK);
    while (is_space_or_tab(PEEK)) S_ADVANCE;
    if (is_newline(PEEK)) {
      // Treat CR LF as a single escaped newline; otherwise the LF left behind would end the line and the continuation
      // would be lost.
      bool carriage_return = PEEK == '\r';
      S_ADVANCE;
      if (carriage_return && PEEK == '\n') S_ADVANCE;
    }
    // A backslash directly followed by some other character escapes just that character.
    else if (!had_space) S_ADVANCE;
  }
}
|
|
1451
|
+
|
|
1452
|
+
/**
|
|
1453
|
+
* Skip the lexer until the following character is neither space nor tab.
|
|
1454
|
+
* Return whether any characters were skipped.
|
|
1455
|
+
*/
|
|
1456
|
+
static bool skip_space(Env *env) {
  bool skipped = false;
  while (is_space_char(PEEK)) {
    S_SKIP;
    skipped = true;
  }
  return skipped;
}
|
|
1462
|
+
|
|
1463
|
+
/**
|
|
1464
|
+
* Skip the lexer until the following character is not a newline.
|
|
1465
|
+
* Return whether any characters were skipped.
|
|
1466
|
+
*/
|
|
1467
|
+
static bool skip_newlines(Env *env) {
  bool skipped = false;
  while (is_newline(PEEK)) {
    S_SKIP;
    skipped = true;
  }
  return skipped;
}
|
|
1473
|
+
|
|
1474
|
+
/**
 * The kind of whitespace that `skip_whitespace` saw last.
 */
typedef enum {
  // Nothing was skipped.
  NoSpace = 0,
  // The last thing skipped was a run of space characters.
  Indented = 1,
  // The last thing skipped was a run of newlines.
  BOL = 2,
} Space;
|
|
1479
|
+
|
|
1480
|
+
/**
|
|
1481
|
+
* Alternate between skipping space and newlines, and return which was seen last.
|
|
1482
|
+
* This does not use the lookahead buffer, but directly accesses the lexer.
|
|
1483
|
+
* Only to be used when it is certain that no whitespace has been copied to the buffer by previous steps, and that no
|
|
1484
|
+
* previous characters should be included in the range of a non-zero-width symbol.
|
|
1485
|
+
*/
|
|
1486
|
+
static Space skip_whitespace(Env *env) {
|
|
1487
|
+
Space space = NoSpace;
|
|
1488
|
+
while (true) {
|
|
1489
|
+
if (skip_space(env)) space = Indented;
|
|
1490
|
+
else if (skip_newlines(env)) space = BOL;
|
|
1491
|
+
else return space;
|
|
1492
|
+
};
|
|
1493
|
+
}
|
|
1494
|
+
|
|
1495
|
+
/**
|
|
1496
|
+
* Advance the lexer until the following character is neither space nor tab, starting at position `offset + start`, and
|
|
1497
|
+
* return the index of the next character.
|
|
1498
|
+
*/
|
|
1499
|
+
static uint32_t take_space_from(Env *env, uint32_t start) {
|
|
1500
|
+
return advance_while(env, start, is_space_char);
|
|
1501
|
+
}
|
|
1502
|
+
|
|
1503
|
+
/**
 * Decide whether `c` may directly follow a keyword like `module` without turning it into an identifier.
 * That is the case for every character that `is_inner_id_char` rejects.
 */
static bool token_end(int32_t c) {
  if (is_inner_id_char(c)) return false;
  return true;
}
|
|
1508
|
+
|
|
1509
|
+
/**
|
|
1510
|
+
* Check if lookahead contains the string `s` starting at position `offset + start`, followed by a non-id character.
|
|
1511
|
+
* See `seq`.
|
|
1512
|
+
*/
|
|
1513
|
+
static bool token_from(Env *env, const char *restrict s, uint32_t start) {
|
|
1514
|
+
return seq_from(env, s, start) && token_end(peek(env, start + (uint32_t) strlen(s)));
|
|
1515
|
+
}
|
|
1516
|
+
|
|
1517
|
+
/**
|
|
1518
|
+
* `token_from` at the current offset.
|
|
1519
|
+
*/
|
|
1520
|
+
static bool token(Env *env, const char *restrict s) {
|
|
1521
|
+
return seq(env, s) && token_end(peek(env, (uint32_t) strlen(s)));
|
|
1522
|
+
}
|
|
1523
|
+
|
|
1524
|
+
/**
|
|
1525
|
+
* Check if lookahead contains any of the strings in `tokens` starting at position `offset + start`, followed by a
|
|
1526
|
+
* non-id character.
|
|
1527
|
+
*/
|
|
1528
|
+
static bool any_token_from(Env *env, size_t n, const char * tokens[], uint32_t start) {
|
|
1529
|
+
for (size_t i = 0; i < n; i++) {
|
|
1530
|
+
if (token_from(env, tokens[i], start)) return true;
|
|
1531
|
+
}
|
|
1532
|
+
return false;
|
|
1533
|
+
}
|
|
1534
|
+
|
|
1535
|
+
/**
 * Check whether the symbolic operator at the current position is exactly `target`: the length reported by
 * `symop_lookahead` must equal the length of `target`, and the lookahead must match `target` (see `seq`).
 */
static bool match_symop(Env *env, const char *restrict target) {
  if (symop_lookahead(env) != strlen(target)) return false;
  return seq(env, target);
}
|
|
1538
|
+
|
|
1539
|
+
/**
 * Return `true` when `has_contexts` reports that there are no contexts.
 */
static bool uninitialized(Env *env) {
  if (has_contexts(env)) return false;
  return true;
}
|
|
1540
|
+
|
|
1541
|
+
/**
 * Scan a constructor-style identifier at the start of the lookahead.
 * Returns 0 if the first character is rejected by `is_conid_start_char`; otherwise returns the result of advancing
 * from index 1 for as long as `is_inner_id_char` holds.
 */
static uint32_t conid(Env *env) {
  return is_conid_start_char(peek0(env)) ? advance_while(env, 1, is_inner_id_char) : 0;
}
|
|
1545
|
+
|
|
1546
|
+
/**
 * Result of `qualified_name`.
 */
typedef enum {
  // No qualified name was found.
  NoQualifiedName,
  // At least one `Conid.` prefix was found, followed by a name accepted by the callback.
  QualifiedTarget,
  // At least one `Conid.` prefix was found, followed by a conid that isn't followed by a dot.
  QualifiedConid,
} QualifiedName;
|
|
1551
|
+
|
|
1552
|
+
/**
 * Scan a chain of `Conid.` qualifiers and classify what follows it.
 *
 * Each iteration lexes a conid with `conid`.
 * If it is followed by a dot, `reset_lookahead_to` is called with the index after the dot and the callback `name` is
 * tried at that position.
 *
 * Returns:
 * - `QualifiedTarget` if `name` succeeds after at least one `Conid.` prefix.
 * - `QualifiedConid` if at least one `Conid.` prefix was seen and it is followed by a conid without a trailing dot.
 * - `NoQualifiedName` otherwise.
 */
static QualifiedName qualified_name(Env *env, bool (*name)(Env *)) {
  bool qualified = false;
  while (true) {
    uint32_t end = conid(env);
    if (end == 0) break;
    if (!char_at(env, end, '.')) {
      if (qualified) return QualifiedConid;
      else break;
    }
    qualified = true;
    reset_lookahead_to(env, end + 1);
    // Return the enum constant rather than a `bool` literal, so the result does not silently depend on
    // `QualifiedTarget` having the ordinal 1.
    if (name(env)) return QualifiedTarget;
  }
  return NoQualifiedName;
}
|
|
1567
|
+
|
|
1568
|
+
/**
|
|
1569
|
+
* Use the lookahead buffer to determine whether a character is escaped, by counting the number of backslashes.
|
|
1570
|
+
*/
|
|
1571
|
+
static bool odd_backslashes_before(Env *env, int32_t index) {
|
|
1572
|
+
bool odd = false;
|
|
1573
|
+
while (index >= 0 && peek(env, (uint32_t) index) == '\\') {
|
|
1574
|
+
odd = !odd;
|
|
1575
|
+
index--;
|
|
1576
|
+
}
|
|
1577
|
+
return odd;
|
|
1578
|
+
}
|
|
1579
|
+
|
|
1580
|
+
/**
|
|
1581
|
+
* Advance before the next unescaped double quote.
|
|
1582
|
+
*/
|
|
1583
|
+
static uint32_t take_string_literal(Env *env) {
|
|
1584
|
+
uint32_t end = 1;
|
|
1585
|
+
while (true) {
|
|
1586
|
+
end = advance_until_char(env, end, '"') + 1;
|
|
1587
|
+
if (is_eof(env) || !odd_backslashes_before(env, (int) end - 2)) return end;
|
|
1588
|
+
}
|
|
1589
|
+
}
|
|
1590
|
+
|
|
1591
|
+
/**
|
|
1592
|
+
* Advance before the single quote that validly ends a character literal.
|
|
1593
|
+
* If there is none, return 1.
|
|
1594
|
+
* Either the first character is a backslash, or the second character is a single quote.
|
|
1595
|
+
*
|
|
1596
|
+
* A single quote followed by backslash is a char unless it was part of a varid, in which case the backslash is a
|
|
1597
|
+
* lambda.
|
|
1598
|
+
* The caller must make sure to lex varids beforehand.
|
|
1599
|
+
*/
|
|
1600
|
+
static uint32_t take_char_literal(Env *env) {
|
|
1601
|
+
if (char1(env, '\\')) return advance_until_char(env, 2, '\'') + 2;
|
|
1602
|
+
else return char_at(env, 2, '\'') ? 3 : 1;
|
|
1603
|
+
}
|
|
1604
|
+
|
|
1605
|
+
// --------------------------------------------------------------------------------------------------------
|
|
1606
|
+
// Lookahead: CPP
|
|
1607
|
+
// --------------------------------------------------------------------------------------------------------
|
|
1608
|
+
|
|
1609
|
+
/**
 * Classification of a potential CPP directive, as computed by `cpp_directive`.
 */
typedef enum {
  // Not a CPP directive.
  CppNothing,
  // Start of a conditional: `#if`, `#ifdef`, `#ifndef`.
  CppStart,
  // Alternative branch of a conditional: `#else`, `#elif`, `#elifdef`, `#elifndef`.
  CppElse,
  // End of a conditional: `#endif`.
  CppEnd,
  // Any other directive accepted by `cpp_directive_other`.
  CppOther,
} CppDirective;
|
|
1616
|
+
|
|
1617
|
+
/**
 * Names of the CPP directives that open a conditional.
 */
static const char *cpp_tokens_start[3] = { "if", "ifdef", "ifndef" };
|
|
1622
|
+
|
|
1623
|
+
/**
 * Check whether one of the names in `cpp_tokens_start` is found at position `offset + start`, followed by a non-id
 * character.
 */
static bool cpp_cond_start(Env *env, uint32_t start) {
  // Derive the count from the array so that it cannot drift when directive names are added or removed.
  size_t count = sizeof cpp_tokens_start / sizeof cpp_tokens_start[0];
  return any_token_from(env, count, cpp_tokens_start, start);
}
|
|
1626
|
+
|
|
1627
|
+
/**
 * Names of the CPP directives that start an alternative branch of a conditional.
 */
static const char *cpp_tokens_else[4] = { "else", "elif", "elifdef", "elifndef" };
|
|
1633
|
+
|
|
1634
|
+
/**
 * Check whether one of the names in `cpp_tokens_else` is found at position `offset + start`, followed by a non-id
 * character.
 */
static bool cpp_cond_else(Env *env, uint32_t start) {
  // Derive the count from the array so that it cannot drift when directive names are added or removed.
  size_t count = sizeof cpp_tokens_else / sizeof cpp_tokens_else[0];
  return any_token_from(env, count, cpp_tokens_else, start);
}
|
|
1637
|
+
|
|
1638
|
+
/**
 * Check whether `endif` is found at position `offset + start`, followed by a non-id character.
 */
static bool cpp_cond_end(Env *env, uint32_t start) {
  return token_from(env, "endif", start);
}
|
|
1639
|
+
|
|
1640
|
+
/**
 * Names of the CPP directives that are not part of a conditional.
 */
static const char *cpp_tokens_other[7] = { "define", "undef", "include", "pragma", "error", "warning", "line" };
|
|
1649
|
+
|
|
1650
|
+
/**
 * Check for a CPP directive other than the conditional ones, with the directive name expected at position
 * `offset + start`.
 *
 * This matches when one of the following holds:
 *
 * - One of the names in `cpp_tokens_other` is found at `start`, followed by a non-id character.
 *
 * - The character at `start` is a newline: A hash followed by nothing but whitespace is CPP.
 *   If non-whitespace follows whitespace, it is a parse error, unless we're in a brace layout; then it is a varsym.
 *   Complete overkill to parse this, but eh!
 *
 * - The character at index 1 is `!` and the scanner is still `uninitialized`: a shebang for scripts.
 */
static bool cpp_directive_other(Env *env, uint32_t start) {
  // Derive the count from the array so that it cannot drift when directive names are added or removed.
  size_t count = sizeof cpp_tokens_other / sizeof cpp_tokens_other[0];
  if (any_token_from(env, count, cpp_tokens_other, start)) return true;
  if (is_newline(peek(env, start))) return true;
  return char1(env, '!') && uninitialized(env);
}
|
|
1663
|
+
|
|
1664
|
+
/**
|
|
1665
|
+
* If the first character at `offset` is a hash, skip space and try all tokens that start a CPP directive.
|
|
1666
|
+
* Return the matching variant of the enum `CppDirective`.
|
|
1667
|
+
*/
|
|
1668
|
+
static CppDirective cpp_directive(Env *env) {
|
|
1669
|
+
if (!char0(env, '#')) return CppNothing;
|
|
1670
|
+
uint32_t start = take_space_from(env, 1);
|
|
1671
|
+
if (cpp_cond_start(env, start)) return CppStart;
|
|
1672
|
+
else if (cpp_cond_else(env, start)) return CppElse;
|
|
1673
|
+
else if (cpp_cond_end(env, start)) return CppEnd;
|
|
1674
|
+
else if (cpp_directive_other(env, start)) return CppOther;
|
|
1675
|
+
else return CppNothing;
|
|
1676
|
+
}
|
|
1677
|
+
|
|
1678
|
+
// --------------------------------------------------------------------------------------------------------
|
|
1679
|
+
// Starting layouts
|
|
1680
|
+
// --------------------------------------------------------------------------------------------------------
|
|
1681
|
+
|
|
1682
|
+
/**
|
|
1683
|
+
* Opening and closing braces are always followed by a command (`grammar/util.js`), so this can unconditionally push a
|
|
1684
|
+
* context.
|
|
1685
|
+
* See `grammar/externals.js` for more.
|
|
1686
|
+
*
|
|
1687
|
+
* Note: This is not related to regular brace layouts, which are handled by `start_layout`!
|
|
1688
|
+
* Aside from layouts, braces are also used for records and inferred type variables, where indentation is also ignored!
|
|
1689
|
+
* Therefore, we add a context to skip steps like semicolon generation.
|
|
1690
|
+
*
|
|
1691
|
+
* Check out some examples in the tests:
|
|
1692
|
+
* - data: record zero indent
|
|
1693
|
+
* - type decl: inferred quantifier at column 0
|
|
1694
|
+
*/
|
|
1695
|
+
static Symbol start_brace(Env *env) {
|
|
1696
|
+
if (valid(env, START_BRACE)) {
|
|
1697
|
+
push_context(env, Braces, 0);
|
|
1698
|
+
return finish(START_BRACE, "start_brace");
|
|
1699
|
+
}
|
|
1700
|
+
return FAIL;
|
|
1701
|
+
}
|
|
1702
|
+
|
|
1703
|
+
/**
|
|
1704
|
+
* See `start_brace`.
|
|
1705
|
+
*/
|
|
1706
|
+
static Symbol end_brace(Env *env) {
|
|
1707
|
+
if (valid(env, END_BRACE) && current_context(env) == Braces) {
|
|
1708
|
+
pop(env);
|
|
1709
|
+
return finish(END_BRACE, "end_brace");
|
|
1710
|
+
}
|
|
1711
|
+
return FAIL;
|
|
1712
|
+
}
|
|
1713
|
+
|
|
1714
|
+
/**
|
|
1715
|
+
* Return the first valid layout start symbol.
|
|
1716
|
+
*/
|
|
1717
|
+
static Symbol valid_layout_start_sym(Env *env) {
|
|
1718
|
+
for (Symbol i = START; i < END; i++) {
|
|
1719
|
+
if (valid(env, i)) return i;
|
|
1720
|
+
}
|
|
1721
|
+
return FAIL;
|
|
1722
|
+
}
|
|
1723
|
+
|
|
1724
|
+
/**
|
|
1725
|
+
* Map `Symbol` to `ContextSort`.
|
|
1726
|
+
*/
|
|
1727
|
+
static ContextSort layout_sort(Symbol s) {
|
|
1728
|
+
switch (s) {
|
|
1729
|
+
case START_DO:
|
|
1730
|
+
return DoLayout;
|
|
1731
|
+
case START_CASE:
|
|
1732
|
+
return CaseLayout;
|
|
1733
|
+
case START_IF:
|
|
1734
|
+
return MultiWayIfLayout;
|
|
1735
|
+
case START_LET:
|
|
1736
|
+
return LetLayout;
|
|
1737
|
+
case START_QUOTE:
|
|
1738
|
+
return QuoteLayout;
|
|
1739
|
+
default:
|
|
1740
|
+
return DeclLayout;
|
|
1741
|
+
}
|
|
1742
|
+
}
|
|
1743
|
+
|
|
1744
|
+
typedef struct {
|
|
1745
|
+
Symbol sym;
|
|
1746
|
+
ContextSort sort;
|
|
1747
|
+
} StartLayout;
|
|
1748
|
+
|
|
1749
|
+
/**
|
|
1750
|
+
* Determine whether the layout sort corresponding to the potentially valid symbol can start at this position.
|
|
1751
|
+
* If the context stack is `uninitialized`, the first layout is added by `process_token_init`.
|
|
1752
|
+
* In newline processing mode, brace layouts cannot be started because there may be comments before the brace that need
|
|
1753
|
+
* to be emitted first.
|
|
1754
|
+
* Regular `if/then/else` conditionals are always valid at the same position as multi-way if layouts.
|
|
1755
|
+
* If we were to unconditionally start a layout when START_IF is valid, it would never be possible to parse the former,
|
|
1756
|
+
* so this skips that layout sort unless the `Lexed` token is `LBar`.
|
|
1757
|
+
*/
|
|
1758
|
+
static StartLayout valid_layout_start(Env *env, Lexed next) {
|
|
1759
|
+
StartLayout start = {.sym = valid_layout_start_sym(env), .sort = NoContext};
|
|
1760
|
+
if (uninitialized(env) || start.sym == FAIL) return start;
|
|
1761
|
+
ContextSort sort = layout_sort(start.sym);
|
|
1762
|
+
switch (next) {
|
|
1763
|
+
case LBar:
|
|
1764
|
+
break;
|
|
1765
|
+
case LBraceOpen:
|
|
1766
|
+
if (newline_active(env)) return start;
|
|
1767
|
+
sort = Braces;
|
|
1768
|
+
start.sym = START_EXPLICIT;
|
|
1769
|
+
break;
|
|
1770
|
+
default:
|
|
1771
|
+
if (sort == MultiWayIfLayout) return start;
|
|
1772
|
+
break;
|
|
1773
|
+
}
|
|
1774
|
+
start.sort = sort;
|
|
1775
|
+
return start;
|
|
1776
|
+
}
|
|
1777
|
+
|
|
1778
|
+
/**
|
|
1779
|
+
* If the current context is a brace layout, any indent is legal for a new layout.
|
|
1780
|
+
* Otherwise, compare with the indent of the current context.
|
|
1781
|
+
* Since starting layouts is allowed in tuple expressions, we look at the last real indent, skipping over `TExp`s, using
|
|
1782
|
+
* 0 if none exists (which should never be the case).
|
|
1783
|
+
*
|
|
1784
|
+
* According to the docs for `NondecreasingIndentation`, the rule is that a nested context may start at the same column
|
|
1785
|
+
* _if the enclosing context is a do expression_.
|
|
1786
|
+
* From experimental evidence, it appears though that this is the other way round – a do expression within, say, a case
|
|
1787
|
+
* alt can start at the same level as the case layout.
|
|
1788
|
+
*/
|
|
1789
|
+
static bool indent_can_start_layout(Env *env, ContextSort sort, uint32_t indent) {
|
|
1790
|
+
if (current_context(env) == Braces) return true;
|
|
1791
|
+
uint32_t cur = current_indent(env);
|
|
1792
|
+
return (indent > cur || (indent == cur && sort == DoLayout));
|
|
1793
|
+
}
|
|
1794
|
+
|
|
1795
|
+
/**
|
|
1796
|
+
* Start the given layout if the position allows it:
|
|
1797
|
+
*
|
|
1798
|
+
* - If the current context is `ModuleHeader`, the layout must be the `where` after `module`, so any indent is valid.
|
|
1799
|
+
|
|
1800
|
+
* - If the new layout is a brace layout, legal indent is technically required, but we can be lenient since there's no
|
|
1801
|
+
* other way to interpret an opening brace after a layout opener.
|
|
1802
|
+
* However, we need to mark to include the brace in the range to create a terminal (see `grammar/externals.js` for
|
|
1803
|
+
* why).
|
|
1804
|
+
*
|
|
1805
|
+
* - Otherwise, examine indent.
|
|
1806
|
+
*/
|
|
1807
|
+
static Symbol start_layout(Env *env, const StartLayout start, uint32_t indent, const char * restrict desc) {
|
|
1808
|
+
if (in_module_header(env)) pop(env);
|
|
1809
|
+
else if (start.sort == Braces) MARK("start_layout brace");
|
|
1810
|
+
else if (!indent_can_start_layout(env, start.sort, indent)) return FAIL;
|
|
1811
|
+
push_context(env, start.sort, indent);
|
|
1812
|
+
return finish(start.sym, desc);
|
|
1813
|
+
}
|
|
1814
|
+
|
|
1815
|
+
/**
|
|
1816
|
+
* The indent of a layout started at an interior token can only be determined by calling `get_column`.
|
|
1817
|
+
* This is an expensive operation, but hopefully it is rare enough to not make a substantial dent.
|
|
1818
|
+
* Because we might have advanced beyond the layout's start position to check conditions, we need to subtract the length
|
|
1819
|
+
* of the lookahead buffer from the current column.
|
|
1820
|
+
* Whitespace is skipped, and not added to the buffer, so the resulting position is after whitespace.
|
|
1821
|
+
*/
|
|
1822
|
+
static Symbol start_layout_interior(Env *env, Lexed next) {
|
|
1823
|
+
StartLayout start = valid_layout_start(env, next);
|
|
1824
|
+
if (start.sort == NoContext) return FAIL;
|
|
1825
|
+
return start_layout(env, start, start_column(env), "interior");
|
|
1826
|
+
}
|
|
1827
|
+
|
|
1828
|
+
/**
|
|
1829
|
+
* The indent of a layout started at the beginning of a line is determined by `newline_lookahead`, so this does not have
|
|
1830
|
+
* to compute it.
|
|
1831
|
+
*/
|
|
1832
|
+
static Symbol start_layout_newline(Env *env) {
|
|
1833
|
+
StartLayout start = valid_layout_start(env, env->state->newline.end);
|
|
1834
|
+
if (start.sort == NoContext) return FAIL;
|
|
1835
|
+
Symbol result = start_layout(env, start, env->state->newline.indent, "newline");
|
|
1836
|
+
if (result != FAIL) env->state->newline.no_semi = true;
|
|
1837
|
+
return result;
|
|
1838
|
+
}
|
|
1839
|
+
|
|
1840
|
+
/**
|
|
1841
|
+
* See `token_end_layout_texp`.
|
|
1842
|
+
*/
|
|
1843
|
+
static Symbol texp_context(Env *env) {
|
|
1844
|
+
if (valid(env, START_TEXP)) {
|
|
1845
|
+
push_context(env, TExp, 0);
|
|
1846
|
+
return finish(START_TEXP, "texp_context");
|
|
1847
|
+
}
|
|
1848
|
+
else if (valid(env, END_TEXP) && current_context(env) == TExp) {
|
|
1849
|
+
pop(env);
|
|
1850
|
+
return finish(END_TEXP, "texp_context");
|
|
1851
|
+
}
|
|
1852
|
+
else return FAIL;
|
|
1853
|
+
}
|
|
1854
|
+
|
|
1855
|
+
// --------------------------------------------------------------------------------------------------------
|
|
1856
|
+
// Ending layouts
|
|
1857
|
+
// --------------------------------------------------------------------------------------------------------
|
|
1858
|
+
|
|
1859
|
+
/**
|
|
1860
|
+
* Separated this from `end_layout` because it caused some weird performance glitches.
|
|
1861
|
+
*/
|
|
1862
|
+
static Symbol end_layout_unchecked(Env *env, const char *restrict desc) {
|
|
1863
|
+
pop(env);
|
|
1864
|
+
return finish(END, desc);
|
|
1865
|
+
}
|
|
1866
|
+
|
|
1867
|
+
/**
|
|
1868
|
+
* If a layout end is valid at this position, pop a context and succeed with layout end.
|
|
1869
|
+
*/
|
|
1870
|
+
static Symbol end_layout(Env *env, const char *restrict desc) {
|
|
1871
|
+
if (valid(env, END)) return end_layout_unchecked(env, desc);
|
|
1872
|
+
else return FAIL;
|
|
1873
|
+
}
|
|
1874
|
+
|
|
1875
|
+
/**
|
|
1876
|
+
* Explicit brace layouts need a dedicated symbol, see `_cmd_layout_start_explicit` for an explanation.
|
|
1877
|
+
* Includes the brace in the range.
|
|
1878
|
+
*/
|
|
1879
|
+
static Symbol end_layout_brace(Env *env) {
|
|
1880
|
+
if (valid(env, END_EXPLICIT) && current_context(env) == Braces) {
|
|
1881
|
+
advance_over(env, 0);
|
|
1882
|
+
MARK("end_layout_brace");
|
|
1883
|
+
pop(env);
|
|
1884
|
+
return finish(END_EXPLICIT, "brace");
|
|
1885
|
+
}
|
|
1886
|
+
else return FAIL;
|
|
1887
|
+
}
|
|
1888
|
+
|
|
1889
|
+
/**
|
|
1890
|
+
* End a layout based on indent decrease.
|
|
1891
|
+
*
|
|
1892
|
+
* If the indent of the current line is smaller than the indent of the current layout, we end the layout in most cases.
|
|
1893
|
+
* Exceptions are:
|
|
1894
|
+
*
|
|
1895
|
+
* - Brace layouts
|
|
1896
|
+
* - The top-level layout, which should only be ended at the end of file.
|
|
1897
|
+
* For leniency, we change the current indent to the smaller value.
|
|
1898
|
+
*/
|
|
1899
|
+
static Symbol end_layout_indent(Env *env) {
|
|
1900
|
+
if (valid(env, END) && indent_less(env, env->state->newline.indent)) {
|
|
1901
|
+
if (top_layout(env)) {
|
|
1902
|
+
array_back(&env->state->contexts)->indent = env->state->newline.indent;
|
|
1903
|
+
return update_state("end top layout");
|
|
1904
|
+
}
|
|
1905
|
+
else {
|
|
1906
|
+
env->state->newline.skip_semi = false;
|
|
1907
|
+
return end_layout_unchecked(env, "indent");
|
|
1908
|
+
}
|
|
1909
|
+
}
|
|
1910
|
+
return FAIL;
|
|
1911
|
+
}
|
|
1912
|
+
|
|
1913
|
+
/**
|
|
1914
|
+
* An expression layout may be closed by an infix operator when it is not valid at that position:
|
|
1915
|
+
*
|
|
1916
|
+
* a :: IO Int
|
|
1917
|
+
* a = do a <- pure 5
|
|
1918
|
+
* pure a
|
|
1919
|
+
* >>= pure
|
|
1920
|
+
*
|
|
1921
|
+
* In this situation, the indent of the operator causes a semicolon to be generated, which leads to varsym being invalid
|
|
1922
|
+
* lookahead.
|
|
1923
|
+
* The layout is closed and the entire `do` block becomes the left operand of the `>>=`.
|
|
1924
|
+
* The same applies for `infix` id operators.
|
|
1925
|
+
*
|
|
1926
|
+
* It doesn't apply to multi-way if layouts, because those don't use semicolons.
|
|
1927
|
+
*/
|
|
1928
|
+
static Symbol end_layout_infix(Env *env) {
|
|
1929
|
+
if (!valid(env, VARSYM) && !valid(env, CONSYM)) return end_layout(env, "symop invalid");
|
|
1930
|
+
return FAIL;
|
|
1931
|
+
}
|
|
1932
|
+
|
|
1933
|
+
/**
|
|
1934
|
+
* A case alt can have a `where` clause attached to it, so a case layout is ended by a `where` only if its indent is
|
|
1935
|
+
* equal to or smaller than the layout indent.
|
|
1936
|
+
*
|
|
1937
|
+
* A `do` or `if` cannot have a `where`, so they are always terminated.
|
|
1938
|
+
*
|
|
1939
|
+
* It would be tempting to leave it at that, but there can be multiple successive `where` clauses.
|
|
1940
|
+
* If a `case` is followed by two of them (greater indent), the first one would attach to the last alt.
|
|
1941
|
+
* The second one would have to close the `case` layout and attach to the next higher allowed place (e.g. the enclosing
|
|
1942
|
+
* function decl), but if its indent is greater, this cannot be detected here – it would just seem like a `where`
|
|
1943
|
+
* attaching to an alt, since we don't keep track of the number of `where`s encountered (and we couldn't, since we're
|
|
1944
|
+
* dealing with layouts, not case alts).
|
|
1945
|
+
*
|
|
1946
|
+
* By tracking the validity of `where` symbols, we can simplify the condition for `do` and `if`: End any layout when
|
|
1947
|
+
* `where` is parsed, but invalid.
|
|
1948
|
+
*/
|
|
1949
|
+
static Symbol end_layout_where(Env *env) {
|
|
1950
|
+
if (valid(env, END) && !valid(env, WHERE) && is_layout_context(env)) return end_layout(env, "where");
|
|
1951
|
+
return FAIL;
|
|
1952
|
+
}
|
|
1953
|
+
|
|
1954
|
+
/**
|
|
1955
|
+
* Ending layouts with `in` heavily relies on parse errors in GHC, so this is difficult.
|
|
1956
|
+
* The heuristic here is that if `in` is not valid (i.e. a parse error), we pop any layout.
|
|
1957
|
+
*
|
|
1958
|
+
* Take the example of some inline layouts in a let decl:
|
|
1959
|
+
* `let a = case a of a -> do a in a`
|
|
1960
|
+
* The `do` and `case` layouts have valid `END` symbols at the `in`, but `in` itself is not valid as long as the `case`
|
|
1961
|
+
* hasn't reduced, so we pop until we find `IN`.
|
|
1962
|
+
*
|
|
1963
|
+
* This isn't enough though, since `let` also opened a layout that ends here, so we have to test for that explicitly.
|
|
1964
|
+
*
|
|
1965
|
+
* Note that this doesn't allow the `in` of a nested `let` to close the outer `let`, since the `END` for that isn't
|
|
1966
|
+
* valid before the inner `let` has reduced.
|
|
1967
|
+
*/
|
|
1968
|
+
static Symbol end_layout_in(Env *env) {
|
|
1969
|
+
if (valid(env, END) && (!valid(env, IN) || current_context(env) == LetLayout)) return end_layout(env, "in");
|
|
1970
|
+
return FAIL;
|
|
1971
|
+
}
|
|
1972
|
+
|
|
1973
|
+
/**
|
|
1974
|
+
* For GADT constructor layouts.
|
|
1975
|
+
*/
|
|
1976
|
+
static Symbol end_layout_deriving(Env *env) {
|
|
1977
|
+
if (valid(env, END) && !valid(env, DERIVING) && !top_layout(env) && current_context(env) == DeclLayout)
|
|
1978
|
+
return end_layout(env, "deriving");
|
|
1979
|
+
return FAIL;
|
|
1980
|
+
}
|
|
1981
|
+
|
|
1982
|
+
/**
|
|
1983
|
+
* Return `true` if there is a `TExp` context on the stack and only layouts above it.
|
|
1984
|
+
*/
|
|
1985
|
+
static bool layouts_in_texp(Env *env) {
|
|
1986
|
+
if (is_layout_context(env) && (env->state->contexts.size > 1)) {
|
|
1987
|
+
for (int32_t i = (int32_t) env->state->contexts.size - 2; i >= 0; i--) {
|
|
1988
|
+
Context *cur = array_get(&env->state->contexts, i);
|
|
1989
|
+
if (cur->sort == TExp || cur->sort == Braces) return true;
|
|
1990
|
+
else if (cur->sort > Braces) break;
|
|
1991
|
+
}
|
|
1992
|
+
}
|
|
1993
|
+
return false;
|
|
1994
|
+
}
|
|
1995
|
+
|
|
1996
|
+
/**
|
|
1997
|
+
* Tuple expressions are constructs that syntactically delimit their contents in an unambiguous way that makes parsing
|
|
1998
|
+
* a lot easier.
|
|
1999
|
+
* In GHC, this concept is used to classify productions like view patterns and annotated expressions.
|
|
2000
|
+
* For us, unfortunately, it also means that there are significantly more circumstances in which layouts can be ended by
|
|
2001
|
+
* parse errors.
|
|
2002
|
+
*
|
|
2003
|
+
* In practice, it means that expression layouts can be closed by commas, vertical bars and closing brackets and parens
|
|
2004
|
+
* when they are elements in a list or tuple-like construct:
|
|
2005
|
+
*
|
|
2006
|
+
* (case a of a -> a, do a; a, if | a -> a | a -> a)
|
|
2007
|
+
* [case a of a -> a | a <- a]
|
|
2008
|
+
*
|
|
2009
|
+
* We encode this as a special context sort, `TExp`, that is pushed and popped at opening and closing brackets.
|
|
2010
|
+
*
|
|
2011
|
+
* Some other constructs, like guards, have similar characteristics, so we use the same mechanism for them:
|
|
2012
|
+
*
|
|
2013
|
+
* > a = case a of
|
|
2014
|
+
* > a | let a = a -> a
|
|
2015
|
+
*
|
|
2016
|
+
* Here the let layout must be ended by parse error, so we start a tuple expression at the bar and end it at the arrow.
|
|
2017
|
+
*/
|
|
2018
|
+
static Symbol token_end_layout_texp(Env *env) {
|
|
2019
|
+
return (valid(env, END) && layouts_in_texp(env)) ? end_layout(env, "texp") : FAIL;
|
|
2020
|
+
}
|
|
2021
|
+
|
|
2022
|
+
/**
 * Pop contexts from the top of the stack until one is found whose end symbol (see `context_end_sym`) is valid, and
 * finish with that symbol.
 * Contexts without an end symbol, or whose end symbol is not valid, are popped as well.
 * Returns `FAIL` if no context with a valid end symbol is found.
 */
static Symbol force_end_context(Env *env) {
  int32_t i = (int32_t) env->state->contexts.size;
  while (--i >= 0) {
    // Read the sort before popping, since the pop removes the context.
    ContextSort sort = array_get(&env->state->contexts, i)->sort;
    Symbol end_sym = context_end_sym(sort);
    pop(env);
    if (end_sym != FAIL && valid(env, end_sym)) return finish(end_sym, "force_end_context");
  }
  return FAIL;
}
|
|
2031
|
+
|
|
2032
|
+
// --------------------------------------------------------------------------------------------------------
|
|
2033
|
+
// Operators
|
|
2034
|
+
// --------------------------------------------------------------------------------------------------------
|
|
2035
|
+
|
|
2036
|
+
/**
|
|
2037
|
+
* Opening tokens are a class of characters that may immediately follow prefix operators like bang pattern `!` or type
|
|
2038
|
+
* application `@`.
|
|
2039
|
+
*/
|
|
2040
|
+
static bool opening_token(Env *env, uint32_t i) {
|
|
2041
|
+
int32_t c = peek(env, i);
|
|
2042
|
+
switch (c) {
|
|
2043
|
+
case 0x27e6: // ⟦
|
|
2044
|
+
case 0x2987: // ⦇
|
|
2045
|
+
case '(':
|
|
2046
|
+
case '[':
|
|
2047
|
+
case '"':
|
|
2048
|
+
return true;
|
|
2049
|
+
case '{':
|
|
2050
|
+
return peek(env, i + 1) != '-';
|
|
2051
|
+
default:
|
|
2052
|
+
// Includes single quote
|
|
2053
|
+
return is_id_char(c);
|
|
2054
|
+
}
|
|
2055
|
+
}
|
|
2056
|
+
|
|
2057
|
+
/**
 * Test for reserved operators of two characters.
 * Returns `false` for `=>`, `<-` and `::`, and `true` for every other pair of characters.
 */
static bool valid_symop_two_chars(int32_t first_char, int32_t second_char) {
  bool reserved =
    (first_char == '=' && second_char == '>') ||
    (first_char == '<' && second_char == '-') ||
    (first_char == ':' && second_char == ':');
  return !reserved;
}
|
|
2072
|
+
|
|
2073
|
+
/**
|
|
2074
|
+
* If a prefix operator is not followed by an opening token, it may still be a valid varsym.
|
|
2075
|
+
*/
|
|
2076
|
+
static Lexed lex_prefix(Env *env, Lexed t) {
|
|
2077
|
+
return opening_token(env, 1) ? t : LSymop;
|
|
2078
|
+
}
|
|
2079
|
+
|
|
2080
|
+
/**
|
|
2081
|
+
* If a splice operator is not followed by an opening token, it may still be a valid varsym.
|
|
2082
|
+
* We only allow variables and parenthesized expressions for performance reasons, though.
|
|
2083
|
+
*/
|
|
2084
|
+
static Lexed lex_splice(int32_t c) {
|
|
2085
|
+
return varid_start_char(c) || c == '(' ? LDollar : LSymop;
|
|
2086
|
+
}
|
|
2087
|
+
|
|
2088
|
+
/**
|
|
2089
|
+
* Lex special occurrences of symbolic operator characters, or declare a valid operator.
|
|
2090
|
+
*
|
|
2091
|
+
* For the dot:
|
|
2092
|
+
*
|
|
2093
|
+
* - Two dots: `..`: Only used for arithmetic sequences (`[a..10]`).
|
|
2094
|
+
* These conflict with record field projection (`[a.b, c]`) and infix operators (`[a..+b]`), and it's too hard to
|
|
2095
|
+
* disambiguate them without this special rule.
|
|
2096
|
+
*
|
|
2097
|
+
* - Tight dot `a.b.c`: A regular tight op, but it has to get a separate symbol from qualified module dots since those
|
|
2098
|
+
* can be followed by symops.
|
|
2099
|
+
*
|
|
2100
|
+
* - Prefix dot `(.a)`: A regular prefix op, for record dot field selectors.
|
|
2101
|
+
*
|
|
2102
|
+
* - Qualified dot `A.B.c`, `A.B.C`, `A.B.+`: Used primarily for qualified modules, but needs to be accepted for field
|
|
2103
|
+
* selectors as well due to ambiguity.
|
|
2104
|
+
* This is not a regular tight op since it needs to allow symops and conid.
|
|
2105
|
+
*/
|
|
2106
|
+
static Lexed lex_symop(Env *env) {
|
|
2107
|
+
uint32_t len = symop_lookahead(env);
|
|
2108
|
+
if (len == 0) return LNothing;
|
|
2109
|
+
int32_t c1 = unsafe_peek(env, 0);
|
|
2110
|
+
if (len == 1) {
|
|
2111
|
+
switch (c1) {
|
|
2112
|
+
case '?':
|
|
2113
|
+
// A `?` can be the head of an implicit parameter, if followed by a varid.
|
|
2114
|
+
return varid_start_char(peek1(env)) ? LNothing : LSymop;
|
|
2115
|
+
case '#':
|
|
2116
|
+
return char1(env, ')') ? LUnboxedClose : LHash;
|
|
2117
|
+
case '|':
|
|
2118
|
+
return char1(env, ']') ? LQuoteClose : LBar;
|
|
2119
|
+
case '!':
|
|
2120
|
+
return lex_prefix(env, LBang);
|
|
2121
|
+
case '~':
|
|
2122
|
+
return lex_prefix(env, LTilde);
|
|
2123
|
+
case '@':
|
|
2124
|
+
return lex_prefix(env, LAt);
|
|
2125
|
+
case '%':
|
|
2126
|
+
return lex_prefix(env, LPercent);
|
|
2127
|
+
case '$':
|
|
2128
|
+
return lex_splice(peek1(env));
|
|
2129
|
+
case '.':
|
|
2130
|
+
if (is_id_char(peek1(env))) return LDotId;
|
|
2131
|
+
else if (opening_token(env, 1)) return LDotOpen;
|
|
2132
|
+
else return LSymop;
|
|
2133
|
+
case 0x2192: // →
|
|
2134
|
+
case 0x22b8: // ⊸
|
|
2135
|
+
return LArrow;
|
|
2136
|
+
case 0x21d2: // ⇒
|
|
2137
|
+
return LCArrow;
|
|
2138
|
+
case '=':
|
|
2139
|
+
case 0x27e7: // ⟧
|
|
2140
|
+
case 0x2988: // ⦈
|
|
2141
|
+
return LTexpCloser;
|
|
2142
|
+
case '*':
|
|
2143
|
+
case '-':
|
|
2144
|
+
return LSymopSpecial;
|
|
2145
|
+
case '\\':
|
|
2146
|
+
case 0x2190: // ←
|
|
2147
|
+
case 0x2200: // ∀
|
|
2148
|
+
case 0x2237: // ∷
|
|
2149
|
+
case 0x2605: // ★
|
|
2150
|
+
case 0x27e6: // ⟦
|
|
2151
|
+
case 0x2919: // ⤙
|
|
2152
|
+
case 0x291a: // ⤚
|
|
2153
|
+
case 0x291b: // ⤛
|
|
2154
|
+
case 0x291c: // ⤜
|
|
2155
|
+
case 0x2987: // ⦇
|
|
2156
|
+
return LNothing;
|
|
2157
|
+
}
|
|
2158
|
+
}
|
|
2159
|
+
else if (len == 2) {
|
|
2160
|
+
if (seq(env, "->")) return LArrow;
|
|
2161
|
+
if (seq(env, "=>")) return LCArrow;
|
|
2162
|
+
int32_t c2 = unsafe_peek(env, 1);
|
|
2163
|
+
switch (c1) {
|
|
2164
|
+
case '$':
|
|
2165
|
+
if (c2 == '$') return lex_splice(peek2(env));
|
|
2166
|
+
break;
|
|
2167
|
+
case '|':
|
|
2168
|
+
if (c2 == '|' && char2(env, ']')) return LQuoteClose;
|
|
2169
|
+
break;
|
|
2170
|
+
case '.':
|
|
2171
|
+
if (c2 == '.') return LDotDot;
|
|
2172
|
+
else return LDotSymop;
|
|
2173
|
+
break;
|
|
2174
|
+
case '#':
|
|
2175
|
+
// Unboxed unit `(##)` and unboxed sum with missing space `(#| Int #)`
|
|
2176
|
+
if (c2 == '#' || c2 == '|') return LSymopSpecial;
|
|
2177
|
+
break;
|
|
2178
|
+
default:
|
|
2179
|
+
if (!valid_symop_two_chars(c1, c2)) return LNothing;
|
|
2180
|
+
break;
|
|
2181
|
+
}
|
|
2182
|
+
}
|
|
2183
|
+
else switch (c1) {
|
|
2184
|
+
case '-':
|
|
2185
|
+
if (seq(env, "->.")) return LArrow;
|
|
2186
|
+
break;
|
|
2187
|
+
case '.':
|
|
2188
|
+
return LDotSymop;
|
|
2189
|
+
}
|
|
2190
|
+
return LSymop;
|
|
2191
|
+
}
|
|
2192
|
+
|
|
2193
|
+
/**
|
|
2194
|
+
* If the next character after whitespace starting from `start` is a closing parenthesis, finish with `LEFT_SECTION_OP`.
|
|
2195
|
+
* This is called after a previous step has already lexed a valid infix operator (symbolic or ticked varid).
|
|
2196
|
+
*
|
|
2197
|
+
* Left section operators must be detected here to disambiguate from infix expressions:
|
|
2198
|
+
*
|
|
2199
|
+
* > f = (1 - 2 +)
|
|
2200
|
+
*
|
|
2201
|
+
* When lookahead is `+`, the parser must decide whether to reduce `1 - 2` to `infix` because it is the operand of a
|
|
2202
|
+
* section, or to shift into another `infix`.
|
|
2203
|
+
* With a single lookahead token, this is not decidable.
|
|
2204
|
+
*
|
|
2205
|
+
* Note: The obvious solution would be to make `infix` left-associative, so it would always reduce.
|
|
2206
|
+
* Unfortunately, this doesn't work for minus, due to apparently unsurmountable problems caused by the
|
|
2207
|
+
* apply/infix/negation conflict.
|
|
2208
|
+
*
|
|
2209
|
+
* Note: This will fail if there are extras between the operator and the parenthesis (and the ticks and the varid).
|
|
2210
|
+
*
|
|
2211
|
+
* Note: If the operator isn't followed by a parenthesis, it will be parsed as an infix operator in the next step, since
|
|
2212
|
+
* those are always valid when left sections are (except for qualified symops).
|
|
2213
|
+
* However, this function advances over whitespace to find the paren, so if the next step marks and finishes, it will
|
|
2214
|
+
* either:
|
|
2215
|
+
* - Include the whitespace in its range, if this consumes it
|
|
2216
|
+
* - Have a zero-width range, if this skips whitespace
|
|
2217
|
+
* To mitigate this, we introduce the auxiliary symbol `NO_SECTION_OP`, which is (optionally) valid before infix
|
|
2218
|
+
* operators, but not before section operators.
|
|
2219
|
+
* When this function finds any whitespace before the parenthesis, it will finish with that symbol, so that
|
|
2220
|
+
* `LEFT_SECTION_OP` won't be valid in the next run, but all other infix operators are.
|
|
2221
|
+
*/
|
|
2222
|
+
static Symbol left_section_op(Env *env, uint32_t start) {
  // Nothing to decide unless the parser can accept a left section operator at this position.
  if (!valid(env, LEFT_SECTION_OP)) return FAIL;
  advance_before(env, start);
  Space skipped = skip_whitespace(env);
  // A closing parenthesis after the operator makes this a left section.
  if (char_at(env, start, ')')) {
    return finish(LEFT_SECTION_OP, "left section");
  }
  // Without skipped whitespace there is nothing to protect the next run from, so yield to it.
  if (skipped == NoSpace) return FAIL;
  // Whitespace was skipped but no parenthesis followed: emit the auxiliary symbol (if valid) so that the operator is
  // lexed as a regular infix operator in the next run.
  return finish_if_valid(env, NO_SECTION_OP, "left section");
}
|
|
2231
|
+
|
|
2232
|
+
/**
|
|
2233
|
+
* Specialization of `left_section_op` for ticked infix identifiers.
|
|
2234
|
+
*/
|
|
2235
|
+
static Symbol left_section_ticked(Env *env) {
|
|
2236
|
+
if (valid(env, LEFT_SECTION_OP)) {
|
|
2237
|
+
uint32_t end_tick = advance_until_char(env, 1, '`');
|
|
2238
|
+
// Could be EOF
|
|
2239
|
+
if (char_at(env, end_tick, '`')) {
|
|
2240
|
+
return left_section_op(env, end_tick + 1);
|
|
2241
|
+
}
|
|
2242
|
+
}
|
|
2243
|
+
return FAIL;
|
|
2244
|
+
}
|
|
2245
|
+
|
|
2246
|
+
/**
|
|
2247
|
+
* This calls `symop_lookahead` to ensure that the position has advanced beyond the end of the symop, which is necessary
|
|
2248
|
+
* because newline lookahead may have validated the symop in a previous run.
|
|
2249
|
+
* This marks the range to emit a terminal.
|
|
2250
|
+
*/
|
|
2251
|
+
static Symbol finish_symop(Env *env, Symbol s) {
|
|
2252
|
+
if (valid(env, s) || valid(env, LEFT_SECTION_OP)) {
|
|
2253
|
+
uint32_t after_symop = symop_lookahead(env);
|
|
2254
|
+
SEQ(left_section_op(env, after_symop));
|
|
2255
|
+
MARK("symop");
|
|
2256
|
+
return s;
|
|
2257
|
+
}
|
|
2258
|
+
return FAIL;
|
|
2259
|
+
}
|
|
2260
|
+
|
|
2261
|
+
/**
|
|
2262
|
+
* Tight ops like `dot.syntax` require that no initial whitespace was skipped.
|
|
2263
|
+
*/
|
|
2264
|
+
static Symbol tight_op(Env *env, bool whitespace, Symbol s) {
|
|
2265
|
+
if (!whitespace) return finish_if_valid(env, s, "tight_op");
|
|
2266
|
+
else return FAIL;
|
|
2267
|
+
}
|
|
2268
|
+
|
|
2269
|
+
/**
|
|
2270
|
+
* Used for situations where the operator is followed by an opening token, and so can be a prefix op if it is preceded
|
|
2271
|
+
* by whitespace; but is no valid tight op and therefore becomes a regular operator if not preceded by whitespace or the
|
|
2272
|
+
* symbol is not valid.
|
|
2273
|
+
*
|
|
2274
|
+
* Only used for `%` (modifier).
|
|
2275
|
+
*/
|
|
2276
|
+
static Symbol prefix_or_varsym(Env *env, bool whitespace, Symbol s) {
|
|
2277
|
+
if (whitespace) SEQ(finish_if_valid(env, s, "prefix_or_varsym"));
|
|
2278
|
+
return finish_symop(env, VARSYM);
|
|
2279
|
+
}
|
|
2280
|
+
|
|
2281
|
+
/**
|
|
2282
|
+
* Used for situations where the operator is followed by an opening token, and so can be a tight op if it is not
|
|
2283
|
+
* preceded by whitespace; but is no valid prefix op and therefore becomes a regular operator if preceded by whitespace
|
|
2284
|
+
* or the symbol is not valid.
|
|
2285
|
+
*
|
|
2286
|
+
* Only used for `.`, when a projection selector `(.fieldname)` is not valid at this position, so the dot becomes the
|
|
2287
|
+
* composition operator.
|
|
2288
|
+
*/
|
|
2289
|
+
static Symbol tight_or_varsym(Env *env, bool whitespace, Symbol s) {
|
|
2290
|
+
SEQ(tight_op(env, whitespace, s));
|
|
2291
|
+
return finish_symop(env, VARSYM);
|
|
2292
|
+
}
|
|
2293
|
+
|
|
2294
|
+
/**
|
|
2295
|
+
* Used for situations where the operator is followed by an opening token, and so can be a tight op if it is not
|
|
2296
|
+
* preceded by whitespace or a prefix op if it is.
|
|
2297
|
+
*
|
|
2298
|
+
* If neither of those symbols is valid, fall back to a regular operator.
|
|
2299
|
+
*
|
|
2300
|
+
* Used for `!`, `~` and `@`.
|
|
2301
|
+
*/
|
|
2302
|
+
static Symbol infix_or_varsym(Env *env, bool whitespace, Symbol prefix, Symbol tight) {
|
|
2303
|
+
SEQ(finish_if_valid(env, whitespace ? prefix : tight, "infix_or_varsym"));
|
|
2304
|
+
return finish_symop(env, VARSYM);
|
|
2305
|
+
}
|
|
2306
|
+
|
|
2307
|
+
/**
 * Lex a qualified symbolic operator.
 *
 * Succeeds only if `qualified_name` (called with the `is_symop` predicate) reports `QualifiedTarget`.
 * `left_section_op` is consulted first, so a qualified operator in a left section is finished by it; otherwise the
 * result is `QUALIFIED_OP`.
 */
static Symbol qualified_op(Env *env) {
  if (qualified_name(env, is_symop) != QualifiedTarget) return FAIL;
  uint32_t symop_end = symop_lookahead(env);
  SEQ(left_section_op(env, symop_end));
  return QUALIFIED_OP;
}
|
|
2314
|
+
|
|
2315
|
+
// --------------------------------------------------------------------------------------------------------
|
|
2316
|
+
// Token lookahead
|
|
2317
|
+
// --------------------------------------------------------------------------------------------------------
|
|
2318
|
+
|
|
2319
|
+
/**
|
|
2320
|
+
* Detect the start of a quasiquote: An opening bracket followed by an optional varid and a vertical bar, all without
|
|
2321
|
+
* whitespace in between.
|
|
2322
|
+
*/
|
|
2323
|
+
static bool is_qq_start(Env *env) {
|
|
2324
|
+
uint32_t end = advance_while(env, 1, quoter_char);
|
|
2325
|
+
return char_at(env, end, '|');
|
|
2326
|
+
}
|
|
2327
|
+
|
|
2328
|
+
/**
|
|
2329
|
+
* An end token is a keyword like `else` or `deriving` that can end a layout without newline or indent.
|
|
2330
|
+
*/
|
|
2331
|
+
static Lexed try_end_token(Env *env, const char * restrict target, Lexed match) {
|
|
2332
|
+
if (token(env, target)) return match;
|
|
2333
|
+
else return LNothing;
|
|
2334
|
+
}
|
|
2335
|
+
|
|
2336
|
+
/**
|
|
2337
|
+
* Check that a symop consists only of minuses after the second character.
|
|
2338
|
+
*/
|
|
2339
|
+
static bool only_minus(Env *env) {
|
|
2340
|
+
uint32_t i = 2;
|
|
2341
|
+
while (peek(env, i) == '-') i++;
|
|
2342
|
+
return !symop_char(peek(env, i));
|
|
2343
|
+
}
|
|
2344
|
+
|
|
2345
|
+
/**
|
|
2346
|
+
* Check that a symop consists only of minuses, making it a comment herald.
|
|
2347
|
+
*/
|
|
2348
|
+
static bool line_comment_herald(Env *env) {
|
|
2349
|
+
return seq(env, "--") && only_minus(env);
|
|
2350
|
+
}
|
|
2351
|
+
|
|
2352
|
+
/**
 * Translate the result of `cpp_directive` into a `Lexed` token.
 *
 * `CppNothing` maps to `LNothing`, `CppElse` to `LCppElse`, and every other directive to `LCpp`.
 */
static Lexed lex_cpp(Env *env) {
  switch (cpp_directive(env)) {
    case CppNothing:
      return LNothing;
    case CppElse:
      return LCppElse;
    default:
      return LCpp;
  }
}
|
|
2362
|
+
|
|
2363
|
+
/**
|
|
2364
|
+
* Lex pragmas, comments and CPP.
|
|
2365
|
+
*/
|
|
2366
|
+
static Lexed lex_extras(Env *env, bool bol) {
|
|
2367
|
+
switch (peek0(env)) {
|
|
2368
|
+
case '{':
|
|
2369
|
+
if (char1(env, '-')) return char2(env, '#') ? LPragma : LBlockComment;
|
|
2370
|
+
break;
|
|
2371
|
+
case '#':
|
|
2372
|
+
if (bol) return lex_cpp(env);
|
|
2373
|
+
break;
|
|
2374
|
+
case '-':
|
|
2375
|
+
if (line_comment_herald(env)) return LLineComment;
|
|
2376
|
+
break;
|
|
2377
|
+
default:
|
|
2378
|
+
break;
|
|
2379
|
+
}
|
|
2380
|
+
return LNothing;
|
|
2381
|
+
}
|
|
2382
|
+
|
|
2383
|
+
/**
|
|
2384
|
+
* The main lexing entry point, branching on the first character, then advancing as far as necessary to identify all
|
|
2385
|
+
* interesting tokens.
|
|
2386
|
+
*/
|
|
2387
|
+
static Lexed lex(Env *env, bool bol) {
|
|
2388
|
+
SEQ(lex_extras(env, bol));
|
|
2389
|
+
if (symop_char(peek0(env))) SEQ(lex_symop(env));
|
|
2390
|
+
else switch (peek0(env)) {
|
|
2391
|
+
case 'w':
|
|
2392
|
+
return try_end_token(env, "where", LWhere);
|
|
2393
|
+
case 'i':
|
|
2394
|
+
return try_end_token(env, "in", LIn);
|
|
2395
|
+
case 't':
|
|
2396
|
+
return try_end_token(env, "then", LThen);
|
|
2397
|
+
case 'e':
|
|
2398
|
+
return try_end_token(env, "else", LElse);
|
|
2399
|
+
case 'd':
|
|
2400
|
+
return try_end_token(env, "deriving", LDeriving);
|
|
2401
|
+
case 'm':
|
|
2402
|
+
if ((uninitialized(env) || in_module_header(env)) && token(env, "module")) return LModule;
|
|
2403
|
+
break;
|
|
2404
|
+
case '{':
|
|
2405
|
+
return LBraceOpen;
|
|
2406
|
+
case '}':
|
|
2407
|
+
return LBraceClose;
|
|
2408
|
+
case ';':
|
|
2409
|
+
return LSemi;
|
|
2410
|
+
case '`':
|
|
2411
|
+
return LTick;
|
|
2412
|
+
case '[':
|
|
2413
|
+
if (valid(env, QQ_START) && is_qq_start(env)) return LBracketOpen;
|
|
2414
|
+
break;
|
|
2415
|
+
case ']':
|
|
2416
|
+
case ')':
|
|
2417
|
+
case ',':
|
|
2418
|
+
return LTexpCloser;
|
|
2419
|
+
default:
|
|
2420
|
+
if (is_conid_start_char(peek0(env))) return LUpper;
|
|
2421
|
+
break;
|
|
2422
|
+
}
|
|
2423
|
+
return LNothing;
|
|
2424
|
+
}
|
|
2425
|
+
|
|
2426
|
+
// --------------------------------------------------------------------------------------------------------
|
|
2427
|
+
// CPP
|
|
2428
|
+
// --------------------------------------------------------------------------------------------------------
|
|
2429
|
+
|
|
2430
|
+
/**
|
|
2431
|
+
* This tests for `#endif` directly after taking a line, so it only matches it at the first column.
|
|
2432
|
+
* Int finishes right before the `#endif`, so that pragma is parsed by `cpp_consume` in the next round.
|
|
2433
|
+
*/
|
|
2434
|
+
static Symbol cpp_else(Env *env, bool emit) {
|
|
2435
|
+
uint32_t nesting = 1;
|
|
2436
|
+
do {
|
|
2437
|
+
take_line_escaped_newline(env);
|
|
2438
|
+
if (emit) MARK("cpp_else");
|
|
2439
|
+
S_ADVANCE;
|
|
2440
|
+
reset_lookahead(env);
|
|
2441
|
+
switch (cpp_directive(env)) {
|
|
2442
|
+
case CppStart:
|
|
2443
|
+
nesting++;
|
|
2444
|
+
break;
|
|
2445
|
+
case CppEnd:
|
|
2446
|
+
nesting--;
|
|
2447
|
+
break;
|
|
2448
|
+
default:
|
|
2449
|
+
break;
|
|
2450
|
+
}
|
|
2451
|
+
}
|
|
2452
|
+
while (not_eof(env) && nesting > 0);
|
|
2453
|
+
if (emit) return finish(CPP, "cpp-else");
|
|
2454
|
+
else return FAIL;
|
|
2455
|
+
}
|
|
2456
|
+
|
|
2457
|
+
/**
 * Consume a single CPP directive, including continuation lines ending in an escaped newline, and finish with `CPP`.
 */
static Symbol cpp_line(Env *env) {
  take_line_escaped_newline(env);
  return finish_marked(env, CPP, "cpp");
}
|
|
2461
|
+
|
|
2462
|
+
// --------------------------------------------------------------------------------------------------------
|
|
2463
|
+
// Comments
|
|
2464
|
+
// --------------------------------------------------------------------------------------------------------
|
|
2465
|
+
|
|
2466
|
+
/**
|
|
2467
|
+
* Distinguish between haddocks and plain comments by matching on the first non-whitespace character.
|
|
2468
|
+
*/
|
|
2469
|
+
static Symbol comment_type(Env *env) {
|
|
2470
|
+
uint32_t i = 2;
|
|
2471
|
+
while (peek(env, i) == '-') i++;
|
|
2472
|
+
while (not_eof(env)) {
|
|
2473
|
+
int32_t c = peek(env, i++);
|
|
2474
|
+
if (c == '|' || c == '^') return HADDOCK;
|
|
2475
|
+
else if (!is_space_char(c)) break;
|
|
2476
|
+
}
|
|
2477
|
+
return COMMENT;
|
|
2478
|
+
}
|
|
2479
|
+
|
|
2480
|
+
/**
|
|
2481
|
+
* Inline comments extend over all consecutive lines that start with comments.
|
|
2482
|
+
* Could be improved by requiring equal indent.
|
|
2483
|
+
*/
|
|
2484
|
+
static Symbol inline_comment(Env *env) {
|
|
2485
|
+
Symbol sym = comment_type(env);
|
|
2486
|
+
do {
|
|
2487
|
+
take_line(env);
|
|
2488
|
+
MARK("inline comment");
|
|
2489
|
+
S_ADVANCE;
|
|
2490
|
+
reset_lookahead(env);
|
|
2491
|
+
} while (line_comment_herald(env));
|
|
2492
|
+
return sym;
|
|
2493
|
+
}
|
|
2494
|
+
|
|
2495
|
+
/**
 * Advance through a block comment, honoring nested `{- -}` pairs, while tracking the column.
 *
 * `col` is the column at the current position; the return value is the column after the closing `-}` of the outermost
 * comment, or the column reached at EOF.
 * A newline resets the column to 0 and a tab counts as 8 columns.
 *
 * NOTE(review): the nesting level starts at 0, so this appears to expect that the opening `{-` has already been
 * consumed - confirm against the callers.
 */
static uint32_t consume_block_comment(Env *env, uint32_t col) {
  uint32_t depth = 0;
  while (!is_eof(env)) {
    // Every iteration consumes at least one character.
    col++;
    switch (PEEK) {
      case '{':
        S_ADVANCE;
        if (PEEK != '-') break;
        // Nested comment opener.
        S_ADVANCE;
        col++;
        depth++;
        break;
      case '-':
        S_ADVANCE;
        if (PEEK != '}') break;
        S_ADVANCE;
        col++;
        // The closer of the outermost comment ends the scan.
        if (depth == 0) return col;
        depth--;
        break;
      NEWLINE_CASES:
        S_ADVANCE;
        col = 0;
        break;
      case '\t':
        S_ADVANCE;
        // One column was already added above.
        col += 7;
        break;
      default:
        S_ADVANCE;
        break;
    }
  }
  return col;
}
|
|
2532
|
+
|
|
2533
|
+
/**
|
|
2534
|
+
* Since {- -} comments can be nested arbitrarily, this has to keep track of how many have been opened, so that the
|
|
2535
|
+
* outermost comment isn't closed prematurely.
|
|
2536
|
+
*/
|
|
2537
|
+
static Symbol block_comment(Env *env) {
|
|
2538
|
+
Symbol sym = comment_type(env);
|
|
2539
|
+
consume_block_comment(env, env->state->lookahead.size);
|
|
2540
|
+
return finish_marked(env, sym, "block_comment");
|
|
2541
|
+
}
|
|
2542
|
+
|
|
2543
|
+
// --------------------------------------------------------------------------------------------------------
|
|
2544
|
+
// Pragma
|
|
2545
|
+
// --------------------------------------------------------------------------------------------------------
|
|
2546
|
+
|
|
2547
|
+
/**
 * Consume a pragma if the current position starts with `{-#`.
 *
 * Advances one character at a time until `#-}` is matched or EOF is reached.
 * Returns whether a pragma opener was found; an unterminated pragma still counts.
 */
static bool consume_pragma(Env *env) {
  if (!seq(env, "{-#")) return false;
  for (;;) {
    if (seq(env, "#-}")) break;
    if (!not_eof(env)) break;
    reset_lookahead(env);
    advance_over(env, 0);
  }
  return true;
}
|
|
2557
|
+
|
|
2558
|
+
/**
|
|
2559
|
+
* Since pragmas can occur anywhere, like comments, but contrarily determine indentation when occurring at the beginning
|
|
2560
|
+
* of a line in layouts, this sets `NResume` to continue newline processing with the indent of the pragma.
|
|
2561
|
+
*
|
|
2562
|
+
* If the pragma is followed by newline, this only ensures that no semicolon is emitted (since this rule is run before
|
|
2563
|
+
* `semicolon` and `NResume` restarts lookahead).
|
|
2564
|
+
*
|
|
2565
|
+
* Otherwise it ensures that the following token is treated as a layout element with the correct indent.
|
|
2566
|
+
*/
|
|
2567
|
+
static Symbol pragma(Env *env) {
  if (!consume_pragma(env)) return FAIL;
  MARK("pragma");
  // While newline processing is active, continue it after the pragma with the pragma's indent.
  if (env->state->newline.state != NInactive) {
    env->state->newline.state = NResume;
  }
  return finish(PRAGMA, "newline");
}
|
|
2575
|
+
|
|
2576
|
+
// --------------------------------------------------------------------------------------------------------
|
|
2577
|
+
// Quasiquote
|
|
2578
|
+
// --------------------------------------------------------------------------------------------------------
|
|
2579
|
+
|
|
2580
|
+
/**
 * Consume the body of a quasiquote and finish with `QQ_BODY`.
 *
 * The body ends before `|]`, before the character U+27E7 (presumably the Unicode variant of the closing bracket), or
 * at EOF.
 * The range is marked before each `|`, so that the terminator is not part of the token.
 */
static Symbol qq_body(Env *env) {
  for (;;) {
    if (is_eof(env)) return finish(QQ_BODY, "qq_body");
    if (PEEK == 0x27e7) return finish_marked(env, QQ_BODY, "qq_body");
    if (PEEK != '|') {
      S_ADVANCE;
      continue;
    }
    // Mark before the bar in case it starts the terminator.
    MARK("qq_body");
    S_ADVANCE;
    if (PEEK == ']') return finish(QQ_BODY, "qq_body");
  }
}
|
|
2595
|
+
|
|
2596
|
+
// --------------------------------------------------------------------------------------------------------
|
|
2597
|
+
// Semicolon
|
|
2598
|
+
// --------------------------------------------------------------------------------------------------------
|
|
2599
|
+
|
|
2600
|
+
/**
|
|
2601
|
+
* When encountering explicit semicolons, we want to ensure that a subsequent newline doesn't trigger a layout
|
|
2602
|
+
* semicolon, so we set `skip_semi`.
|
|
2603
|
+
* If the next symbol is not a newline (and not another semicolon), the scanner will immediately end up in
|
|
2604
|
+
* `resolve_semicolon`, matching the condition, where we unset the flag to avoid a mid-line semicolon from influencing
|
|
2605
|
+
* an unrelated newline.
|
|
2606
|
+
*
|
|
2607
|
+
* Take this example:
|
|
2608
|
+
*
|
|
2609
|
+
* > a = 1;;
|
|
2610
|
+
* > b = 2
|
|
2611
|
+
* > ;;c = 3
|
|
2612
|
+
*
|
|
2613
|
+
* At the first semicolon, `explicit_semicolon` is called (conditioned on `LSemi` in `process_token_interior`) and
|
|
2614
|
+
* SEMICOLON is valid, so the flag is set.
|
|
2615
|
+
* The scanner will be called again immediately without advancing, and first enter `resolve_semicolon`, which does
|
|
2616
|
+
* nothing because the next token is still `LSemi`.
|
|
2617
|
+
* Next it will enter `explicit_semicolon` again.
|
|
2618
|
+
* SEMICOLON is valid, but since the flag is set we fall through and defer to internal lexing.
|
|
2619
|
+
* The grammar advances into `semi` (in `util.js`), which causes SEMICOLON to become invalid.
|
|
2620
|
+
* The scanner is executed before the second semicolon, where both functions skip again, this time additionally because
|
|
2621
|
+
* SEMICOLON is now invalid.
|
|
2622
|
+
*
|
|
2623
|
+
* In the next scan, the newline branch is taken in `scan`, so this function is not executed again.
|
|
2624
|
+
* Newline lookahead finds the next line to begin at column 0, which would usually trigger a layout semicolon in
|
|
2625
|
+
* `semicolon`, but that is inhibited by `skip_semi`, so the scan only skips whitespace and resets the newline state,
|
|
2626
|
+
* which unsets `skip_semi` again.
|
|
2627
|
+
* In the following scan, the conditions for both functions are unfulfilled, so parsing continues regularly until the
|
|
2628
|
+
* next newline.
|
|
2629
|
+
*
|
|
2630
|
+
* Newline lookahead now encounters the third semicolon on the next line and sets `no_semi`, which supersedes
|
|
2631
|
+
* `skip_semi` and prohibits layout semicolon irreversibly, so the explicit semicolons are parsed by the grammar.
|
|
2632
|
+
*
|
|
2633
|
+
* Now consider an inline semicolon:
|
|
2634
|
+
*
|
|
2635
|
+
* > f = let
|
|
2636
|
+
* > a = 1; b = 2
|
|
2637
|
+
* > c = 3; {- x -}
|
|
2638
|
+
* > d = 4
|
|
2639
|
+
* > in c
|
|
2640
|
+
*
|
|
2641
|
+
* When the semicolon is lexed, `explicit_semicolon` sets `skip_semi`.
|
|
2642
|
+
* If we would not reset it until the newline, no layout semicolon would be generated before `c`, resulting in a parse
|
|
2643
|
+
* error at `=`.
|
|
2644
|
+
* Therefore, `resolve_semicolon` unsets `skip_semi` when lexing `b`, triggered by `skip_semi` being set and the next
|
|
2645
|
+
* token not being `LSemi`.
|
|
2646
|
+
*
|
|
2647
|
+
* The semicolon after `c = 3` is followed by a comment, so it is unclear if there is going to be another layout element
|
|
2648
|
+
* in the same line.
|
|
2649
|
+
* If there is none, the situation is the same as in the first example's first line; if another layout element were to
|
|
2650
|
+
* follow, `skip_semi` would need to be reset like in this example's first line.
|
|
2651
|
+
* Therefore, `resolve_semicolon` also keeps the flag as it is in this case.
|
|
2652
|
+
*/
|
|
2653
|
+
static Symbol explicit_semicolon(Env *env) {
  // Only act once per semicolon: when the flag is already set, defer to the internal lexer.
  if (!valid(env, SEMICOLON) || env->state->newline.skip_semi) return FAIL;
  env->state->newline.skip_semi = true;
  return update_state("explicit semicolon enable");
}
|
|
2660
|
+
|
|
2661
|
+
/**
 * Unset `skip_semi` when an explicit semicolon turns out to be followed by another token on the same line.
 *
 * The flag is kept when `next` is a comment, a pragma or another semicolon.
 * Returns `FAIL` when the flag is not set or is kept.
 */
static Symbol resolve_semicolon(Env *env, Lexed next) {
  if (!env->state->newline.skip_semi) return FAIL;
  switch (next) {
    case LLineComment:
    case LBlockComment:
    case LPragma:
    case LSemi:
      return FAIL;
    default:
      break;
  }
  env->state->newline.skip_semi = false;
  return update_state("explicit semicolon disable");
}
|
|
2676
|
+
|
|
2677
|
+
/**
|
|
2678
|
+
* Generate a layout semicolon after a newline if the indent is less or equal to the current layout's indent, unless:
|
|
2679
|
+
*
|
|
2680
|
+
* - The current context doesn't use layout semicolons, which is the case for explicit brace layouts, tuple expressions,
|
|
2681
|
+
* the module header and multi-way if layouts.
|
|
2682
|
+
*
|
|
2683
|
+
* - `no_semi` was set because newline lookahead found an explicit semicolon in the next line, or this function was
|
|
2684
|
+
* executed before for the same newline.
|
|
2685
|
+
*
|
|
2686
|
+
* - `skip_semi` was set because the previous line ended with an explicit semicolon.
|
|
2687
|
+
*/
|
|
2688
|
+
static Symbol semicolon(Env *env) {
  // The current context must use layout semicolons at all.
  if (!is_semicolon_context(env)) return FAIL;
  // Either flag inhibits a layout semicolon for this newline.
  if (env->state->newline.no_semi || env->state->newline.skip_semi) return FAIL;
  if (!indent_lesseq(env, env->state->newline.indent)) return FAIL;
  // Make sure that only one semicolon is generated per newline.
  env->state->newline.no_semi = true;
  return finish(SEMICOLON, "newline");
}
|
|
2701
|
+
|
|
2702
|
+
// --------------------------------------------------------------------------------------------------------
|
|
2703
|
+
// High-level `Lexed` dispatch
|
|
2704
|
+
// --------------------------------------------------------------------------------------------------------
|
|
2705
|
+
|
|
2706
|
+
/**
|
|
2707
|
+
* Process a `Lexed` token if it results in a layout end or an extra.
|
|
2708
|
+
*
|
|
2709
|
+
* This is called by `newline_post` before marking, so the actions must not fail after advancing.
|
|
2710
|
+
*/
|
|
2711
|
+
static Symbol process_token_safe(Env *env, Lexed next) {
|
|
2712
|
+
switch (next) {
|
|
2713
|
+
case LWhere:
|
|
2714
|
+
return end_layout_where(env);
|
|
2715
|
+
case LIn:
|
|
2716
|
+
return end_layout_in(env);
|
|
2717
|
+
case LThen:
|
|
2718
|
+
case LElse:
|
|
2719
|
+
return end_layout(env, "then/else");
|
|
2720
|
+
case LDeriving:
|
|
2721
|
+
return end_layout_deriving(env);
|
|
2722
|
+
case LBar:
|
|
2723
|
+
if (!valid(env, BAR)) return end_layout(env, "bar");
|
|
2724
|
+
break;
|
|
2725
|
+
case LPragma:
|
|
2726
|
+
return pragma(env);
|
|
2727
|
+
case LBlockComment:
|
|
2728
|
+
return block_comment(env);
|
|
2729
|
+
case LLineComment:
|
|
2730
|
+
return inline_comment(env);
|
|
2731
|
+
case LCppElse:
|
|
2732
|
+
return cpp_else(env, true);
|
|
2733
|
+
case LCpp:
|
|
2734
|
+
return cpp_line(env);
|
|
2735
|
+
case LSymop:
|
|
2736
|
+
case LTick:
|
|
2737
|
+
case LHash:
|
|
2738
|
+
return end_layout_infix(env);
|
|
2739
|
+
case LUnboxedClose:
|
|
2740
|
+
SEQ(token_end_layout_texp(env));
|
|
2741
|
+
return end_layout_infix(env);
|
|
2742
|
+
case LArrow:
|
|
2743
|
+
if (!valid(env, ARROW)) return token_end_layout_texp(env);
|
|
2744
|
+
break;
|
|
2745
|
+
case LTexpCloser:
|
|
2746
|
+
return token_end_layout_texp(env);
|
|
2747
|
+
case LQuoteClose:
|
|
2748
|
+
return end_layout(env, "quote bracket");
|
|
2749
|
+
break;
|
|
2750
|
+
default:
|
|
2751
|
+
break;
|
|
2752
|
+
}
|
|
2753
|
+
return FAIL;
|
|
2754
|
+
}
|
|
2755
|
+
|
|
2756
|
+
/**
|
|
2757
|
+
* Process a `Lexed` token if it results in a symbolic operator.
|
|
2758
|
+
*/
|
|
2759
|
+
static Symbol process_token_symop(Env *env, bool whitespace, Lexed next) {
|
|
2760
|
+
switch (next) {
|
|
2761
|
+
case LDotDot:
|
|
2762
|
+
SEQ(finish_if_valid(env, DOTDOT, "symop"));
|
|
2763
|
+
return tight_op(env, whitespace, QUAL_DOT);
|
|
2764
|
+
case LDotId:
|
|
2765
|
+
SEQ(finish_if_valid(env, whitespace ? PREFIX_DOT : TIGHT_DOT, "symop"));
|
|
2766
|
+
return tight_op(env, whitespace, QUAL_DOT);
|
|
2767
|
+
case LDotSymop:
|
|
2768
|
+
return tight_or_varsym(env, whitespace, QUAL_DOT);
|
|
2769
|
+
case LDotOpen:
|
|
2770
|
+
return prefix_or_varsym(env, whitespace, PREFIX_DOT);
|
|
2771
|
+
case LBang:
|
|
2772
|
+
return infix_or_varsym(env, whitespace, PREFIX_BANG, TIGHT_BANG);
|
|
2773
|
+
case LTilde:
|
|
2774
|
+
return infix_or_varsym(env, whitespace, PREFIX_TILDE, TIGHT_TILDE);
|
|
2775
|
+
case LAt:
|
|
2776
|
+
return infix_or_varsym(env, whitespace, PREFIX_AT, TIGHT_AT);
|
|
2777
|
+
case LPercent:
|
|
2778
|
+
return prefix_or_varsym(env, whitespace, PREFIX_PERCENT);
|
|
2779
|
+
case LSymop:
|
|
2780
|
+
if (char0(env, ':')) return finish_symop(env, CONSYM);
|
|
2781
|
+
else return finish_symop(env, VARSYM);
|
|
2782
|
+
// The following are handled here despite not being purely symop tokens because `process_token_symop` is executed
|
|
2783
|
+
// last, and these handlers all have potentially quite far lookahead and can fail.
|
|
2784
|
+
case LSymopSpecial:
|
|
2785
|
+
SEQ(left_section_op(env, symop_lookahead(env)));
|
|
2786
|
+
if (valid(env, MINUS) && match_symop(env, "-")) return finish(MINUS, "symop");
|
|
2787
|
+
break;
|
|
2788
|
+
case LUnboxedClose:
|
|
2789
|
+
case LHash:
|
|
2790
|
+
return left_section_op(env, symop_lookahead(env));
|
|
2791
|
+
case LTick:
|
|
2792
|
+
return left_section_ticked(env);
|
|
2793
|
+
case LUpper:
|
|
2794
|
+
if (valid(env, QUALIFIED_OP) || valid(env, LEFT_SECTION_OP)) SEQ(qualified_op(env));
|
|
2795
|
+
break;
|
|
2796
|
+
default:
|
|
2797
|
+
break;
|
|
2798
|
+
}
|
|
2799
|
+
return FAIL;
|
|
2800
|
+
}
|
|
2801
|
+
|
|
2802
|
+
/**
 * Process a `Lexed` token if it results in a splice: `LDollar` finishes with `SPLICE` when that symbol is valid.
 * Every other token results in `FAIL`.
 */
static Symbol process_token_splice(Env *env, Lexed next) {
  if (next == LDollar) return finish_if_valid(env, SPLICE, "symop");
  return FAIL;
}
|
|
2811
|
+
|
|
2812
|
+
/**
|
|
2813
|
+
* Process a `Lexed` token for an interior position.
|
|
2814
|
+
*/
|
|
2815
|
+
static Symbol process_token_interior(Env *env, Lexed next) {
|
|
2816
|
+
switch (next) {
|
|
2817
|
+
case LBraceClose:
|
|
2818
|
+
SEQ(end_layout_brace(env));
|
|
2819
|
+
return token_end_layout_texp(env);
|
|
2820
|
+
// Skip layout start
|
|
2821
|
+
case LModule:
|
|
2822
|
+
return FAIL;
|
|
2823
|
+
case LSemi:
|
|
2824
|
+
return explicit_semicolon(env);
|
|
2825
|
+
case LBracketOpen:
|
|
2826
|
+
return finish(QQ_START, "qq_start");
|
|
2827
|
+
default:
|
|
2828
|
+
break;
|
|
2829
|
+
}
|
|
2830
|
+
SEQ(process_token_safe(env, next));
|
|
2831
|
+
return start_layout_interior(env, next);
|
|
2832
|
+
}
|
|
2833
|
+
|
|
2834
|
+
/**
|
|
2835
|
+
* Process a `Lexed` token to initialize the context stack.
|
|
2836
|
+
*/
|
|
2837
|
+
static Symbol process_token_init(Env *env, uint32_t indent, Lexed next) {
|
|
2838
|
+
switch (next) {
|
|
2839
|
+
case LModule:
|
|
2840
|
+
push_context(env, ModuleHeader, 0);
|
|
2841
|
+
return update_state("init");
|
|
2842
|
+
case LBraceOpen:
|
|
2843
|
+
advance_over(env, 0);
|
|
2844
|
+
MARK("init brace");
|
|
2845
|
+
push_context(env, Braces, indent);
|
|
2846
|
+
return finish(START_EXPLICIT, "init");
|
|
2847
|
+
default:
|
|
2848
|
+
push_context(env, DeclLayout, indent);
|
|
2849
|
+
return finish(START, "init");
|
|
2850
|
+
}
|
|
2851
|
+
}
|
|
2852
|
+
|
|
2853
|
+
// --------------------------------------------------------------------------------------------------------
|
|
2854
|
+
// Newline actions
|
|
2855
|
+
// --------------------------------------------------------------------------------------------------------
|
|
2856
|
+
|
|
2857
|
+
/**
|
|
2858
|
+
* `NoSpace` + `newline_init()` means that we're at the very beginning of the file, where we start in `NResume` mode
|
|
2859
|
+
* without a newline character that can tell us where we are.
|
|
2860
|
+
*/
|
|
2861
|
+
static Symbol newline_extras(Env *env, Space space) {
|
|
2862
|
+
bool bol = space == BOL || (space == NoSpace && newline_init(env));
|
|
2863
|
+
Lexed next = lex_extras(env, bol);
|
|
2864
|
+
dbg("newline extras token: %s\n", token_names[next]);
|
|
2865
|
+
return process_token_safe(env, next);
|
|
2866
|
+
}
|
|
2867
|
+
|
|
2868
|
+
// Don't finish newline processing before pragmas – they are indicators of layout indent, but since they are extras,
|
|
2869
|
+
// they cannot consume a semicolon, so when there's a pragma on a line of its own, we would get two semicolons if we
|
|
2870
|
+
// finished here.
|
|
2871
|
+
// It's guaranteed that the newline state was committed at least once because `newline_lookahead` sets `unsafe` when
|
|
2872
|
+
// finding a pragma.
|
|
2873
|
+
static Symbol newline_process(Env *env) {
  dbg("newline post\n");
  // Saved up front because `reset_newline` is called before the last uses below.
  uint32_t saved_indent = env->state->newline.indent;
  Lexed saved_end = env->state->newline.end;
  SEQ(end_layout_indent(env));
  SEQ(process_token_safe(env, saved_end));
  Space space = skip_whitespace(env);
  MARK("newline_post");
  if (env->state->newline.unsafe) {
    SEQ(newline_extras(env, space));
  }
  if (!env->state->newline.eof) {
    SEQ(start_layout_newline(env));
  }
  // TODO it is only necessary to run this late because of very few situations, like nondecreasing indent.
  // But it has the consequence that whitespace is included in the parent in nested layouts.
  // Maybe there's a way to run it before and after `start_layout_newline` with conditions.
  SEQ(semicolon(env));
  reset_newline(env);
  if (uninitialized(env)) {
    SEQ(process_token_init(env, saved_indent, saved_end));
  } else {
    SEQ(process_token_symop(env, true, saved_end));
    SEQ(process_token_splice(env, saved_end));
  }
  return update_state("newline final");
}
|
|
2895
|
+
|
|
2896
|
+
/**
 * Run `newline_process` and, if `newline_init` still holds afterwards, set the newline state to `NProcess`.
 * The result of `newline_process` is returned unchanged.
 */
static Symbol newline_post(Env *env) {
  Symbol result = newline_process(env);
  if (newline_init(env)) {
    env->state->newline.state = NProcess;
  }
  return result;
}
|
|
2901
|
+
|
|
2902
|
+
/**
|
|
2903
|
+
* Repeatedly lex lookahead until encountering something that is neither a comment nor CPP, skipping whitespace and
|
|
2904
|
+
* newlines in between.
|
|
2905
|
+
*/
|
|
2906
|
+
static void newline_lookahead(Env *env, Newline *newline) {
|
|
2907
|
+
for (;;) {
|
|
2908
|
+
// Using `peek0` to look for whitespace requires the lookahead buffer to have been reset immediately before this
|
|
2909
|
+
// statement – so before the call to this function or at the end of the for loop body.
|
|
2910
|
+
// The reason this isn't using `lexer->lookahead` is that the function may be called at an interior position, to
|
|
2911
|
+
// skip extras.
|
|
2912
|
+
switch (peek0(env)) {
|
|
2913
|
+
NEWLINE_CASES:
|
|
2914
|
+
skip_over(env, 0);
|
|
2915
|
+
newline->indent = 0;
|
|
2916
|
+
break;
|
|
2917
|
+
case '\t':
|
|
2918
|
+
skip_over(env, 0);
|
|
2919
|
+
newline->indent += 8;
|
|
2920
|
+
break;
|
|
2921
|
+
default:
|
|
2922
|
+
if (is_space_char(peek0(env))) {
|
|
2923
|
+
skip_over(env, 0);
|
|
2924
|
+
newline->indent++;
|
|
2925
|
+
break;
|
|
2926
|
+
}
|
|
2927
|
+
newline->end = lex(env, newline->indent == 0);
|
|
2928
|
+
dbg("newline token: %s, %lc\n", token_names[newline->end], peek0(env));
|
|
2929
|
+
// Newlines without extras are only safe if `lex` didn't advance the lexer over non-whitespace.
|
|
2930
|
+
newline->unsafe |= !no_lookahead(env);
|
|
2931
|
+
switch (newline->end) {
|
|
2932
|
+
case LEof:
|
|
2933
|
+
newline->indent = 0;
|
|
2934
|
+
newline->eof = true;
|
|
2935
|
+
return;
|
|
2936
|
+
// If/then blocks can have semicolons, but don't have a layout.
|
|
2937
|
+
// Allowing layout semicolons costs 100kB.
|
|
2938
|
+
case LThen:
|
|
2939
|
+
case LElse:
|
|
2940
|
+
case LSemi:
|
|
2941
|
+
newline->no_semi = true;
|
|
2942
|
+
return;
|
|
2943
|
+
case LBlockComment:
|
|
2944
|
+
newline->indent = consume_block_comment(env, newline->indent + 2);
|
|
2945
|
+
break;
|
|
2946
|
+
case LLineComment:
|
|
2947
|
+
newline->indent = 0;
|
|
2948
|
+
take_line(env);
|
|
2949
|
+
break;
|
|
2950
|
+
case LCppElse:
|
|
2951
|
+
cpp_else(env, false);
|
|
2952
|
+
take_line_escaped_newline(env);
|
|
2953
|
+
break;
|
|
2954
|
+
case LCpp:
|
|
2955
|
+
take_line_escaped_newline(env);
|
|
2956
|
+
break;
|
|
2957
|
+
default:
|
|
2958
|
+
return;
|
|
2959
|
+
}
|
|
2960
|
+
}
|
|
2961
|
+
reset_lookahead(env);
|
|
2962
|
+
}
|
|
2963
|
+
}
|
|
2964
|
+
|
|
2965
|
+
/**
|
|
2966
|
+
* Perform newline lookahead, then either finish the run if the position was advanced into the next token, or directly
|
|
2967
|
+
* start newline processing if not.
|
|
2968
|
+
*/
|
|
2969
|
+
static Symbol newline_start(Env *env) {
|
|
2970
|
+
dbg("newline lookahead\n");
|
|
2971
|
+
env->state->newline.state = NInit;
|
|
2972
|
+
newline_lookahead(env, &env->state->newline);
|
|
2973
|
+
if (env->state->newline.unsafe) return update_state("newline lookahead");
|
|
2974
|
+
else return newline_post(env);
|
|
2975
|
+
}
|
|
2976
|
+
|
|
2977
|
+
/**
|
|
2978
|
+
* Perform newline lookahead with preset indent, used at the beginning of a file and after pragmas.
|
|
2979
|
+
*/
|
|
2980
|
+
static Symbol newline_resume(Env *env) {
|
|
2981
|
+
dbg("newline resume\n");
|
|
2982
|
+
uint32_t indent = env->state->newline.indent;
|
|
2983
|
+
// Skip space between the pragma end and the next token, which might be the first real token (or another pragma or
|
|
2984
|
+
// comment, or newline).
|
|
2985
|
+
// We don't want to count the space as indent.
|
|
2986
|
+
skip_space(env);
|
|
2987
|
+
reset_newline(env);
|
|
2988
|
+
env->state->newline.indent = indent;
|
|
2989
|
+
return newline_start(env);
|
|
2990
|
+
}
|
|
2991
|
+
|
|
2992
|
+
// --------------------------------------------------------------------------------------------------------
|
|
2993
|
+
// Constraints
|
|
2994
|
+
// --------------------------------------------------------------------------------------------------------
|
|
2995
|
+
|
|
2996
|
+
/**
|
|
2997
|
+
* The following mechanism avoids the conflict between types and classes.
|
|
2998
|
+
* Consider this situation:
|
|
2999
|
+
*
|
|
3000
|
+
* > data A = B b % C => D d :+ E
|
|
3001
|
+
* > data E = F f => G g
|
|
3002
|
+
*
|
|
3003
|
+
* After the `=`, a diverse set of constructs are valid.
|
|
3004
|
+
*
|
|
3005
|
+
* - Data constructor
|
|
3006
|
+
* - Infix `D d :+ E` -> `(type/name) (type/variable) (constructor_operator) (type/name)`
|
|
3007
|
+
* - Prefix `G g` -> `(name) (type/variable)`
|
|
3008
|
+
* - Context
|
|
3009
|
+
* - Infix `B b % C` -> `(type/name) (type/variable) (operator) (type/name)`
|
|
3010
|
+
* - Prefix `F f` -> `(constraint/name) (type/variable)`
|
|
3011
|
+
*
|
|
3012
|
+
* Each of these starts with a `(name)` with different reduction rules that can only be resolved when the arrow or a
|
|
3013
|
+
* data constructor-ending token is encountered.
|
|
3014
|
+
* The conflict between `D` and `G` is an additional hurdle that is not addressed here.
|
|
3015
|
+
*
|
|
3016
|
+
* Constraint lookahead scans ahead until it finds `=>` or a clear rejection criterion like `=` or (layout) semicolon,
|
|
3017
|
+
* emitting `_cond_context` to unlock the rules `_qtype_context`, `context` and `_ctr_context`.
|
|
3018
|
+
*
|
|
3019
|
+
* However, even the two context variants conflict, since infix classes have types in their operands, while a prefix
|
|
3020
|
+
* constraint starts with a class name.
|
|
3021
|
+
* To mitigate this, constraint lookahead additionally emits `_cond_infix` when it encounters an infix operator.
|
|
3022
|
+
 * This symbol is only emitted when `_cond_context` is not valid (because it was parsed right before) or when no `=>`
 * is encountered afterwards (because the current position is in parentheses).
|
|
3024
|
+
* This only works because infix classes are localized within contexts – disambiguating all infix types like this is
|
|
3025
|
+
* impossible without completely restructuring the grammar.
|
|
3026
|
+
*
|
|
3027
|
+
* Note that this problem could easily be avoided by parsing all contexts as types, accepting that queries for class
|
|
3028
|
+
* names would be more verbose and couldn't match more complex constraints.
|
|
3029
|
+
* Furthermore, a much simpler fix would be a runtime conflict, which has the potential to result in randomly incorrect
|
|
3030
|
+
* parse trees.
|
|
3031
|
+
*
|
|
3032
|
+
* Similarly to contexts, data constructor heads have infix type-related conflicts that aren't as severe but can easily
|
|
3033
|
+
* piggyback on this mechanism, so they are included.
|
|
3034
|
+
*
|
|
3035
|
+
* Lastly, associated type families and instances conflict because they can both be heralded by `type` alone, so the
|
|
3036
|
+
* decision to reduce to type head or instance head nodes is informed by the presence of `::` or `=` without `|`.
|
|
3037
|
+
*/
|
|
3038
|
+
|
|
3039
|
+
/**
 * Outcome of a single step of constraint lookahead.
 *
 * The numeric values double as indexes into the debug name table, so their order must not change.
 */
typedef enum {
  CtrUndecided = 0,   // Nothing conclusive yet; keep scanning.
  CtrImpossible = 1,  // Clear evidence that neither a context nor an infix class is ahead.
  CtrArrowFound = 2,  // The context arrow `=>` was encountered.
  CtrInfixFound = 3,  // An infix operator was encountered.
  CtrEqualsFound = 4, // An `=` was encountered.
  CtrBarFound = 5,    // A `|` was encountered.
} CtrResult;
|
|
3056
|
+
|
|
3057
|
+
#ifdef TREE_SITTER_DEBUG
|
|
3058
|
+
|
|
3059
|
+
// Human-readable names for debug output, indexed by the numeric value of a `CtrResult`.
static const char *ctr_result_names[] = {
  [0] = "undecided",
  [1] = "impossible",
  [2] = "arrow",
  [3] = "infix",
  [4] = "equals",
  [5] = "bar",
};
|
|
3067
|
+
|
|
3068
|
+
#endif
|
|
3069
|
+
|
|
3070
|
+
/**
 * Bookkeeping carried across the iterations of constraint lookahead.
 */
typedef struct {
  // How many characters to skip once an iteration is done.
  // E.g. when a `conid` was lexed, the following token can be lexed right after the identifier.
  uint32_t reset;
  // Current bracket nesting depth.
  // While nonzero, end tokens are not attributed to the expression lookahead started in.
  uint32_t brackets;
  // Set once a context arrow was seen.
  bool context;
  // Set once an infix operator was seen.
  bool infix;
  // Set when the infix operator that was seen starts with `:` or a backtick.
  bool data_infix;
  // Set when an `=` was seen; cleared again when a `|` is seen.
  bool type_instance;
} CtrState;
|
|
3087
|
+
|
|
3088
|
+
/**
|
|
3089
|
+
* Increment the bracket count.
|
|
3090
|
+
*/
|
|
3091
|
+
static CtrResult ctr_bracket_open(CtrState *state) {
|
|
3092
|
+
state->brackets++;
|
|
3093
|
+
state->reset = 1;
|
|
3094
|
+
return CtrUndecided;
|
|
3095
|
+
}
|
|
3096
|
+
|
|
3097
|
+
/**
|
|
3098
|
+
* Decrement the bracket count.
|
|
3099
|
+
* If the count was zero already, parsing started inside of brackets that are closed here, so lookahead is terminated.
|
|
3100
|
+
*/
|
|
3101
|
+
static CtrResult ctr_bracket_close(CtrState *state) {
|
|
3102
|
+
if (state->brackets == 0) return CtrImpossible;
|
|
3103
|
+
state->brackets--;
|
|
3104
|
+
state->reset = 1;
|
|
3105
|
+
return CtrUndecided;
|
|
3106
|
+
}
|
|
3107
|
+
|
|
3108
|
+
/**
|
|
3109
|
+
* If the given token is ahead, terminate lookahead unsuccessfully.
|
|
3110
|
+
*/
|
|
3111
|
+
static CtrResult ctr_stop_on_token(Env *env, const char * restrict target) {
|
|
3112
|
+
return token(env, target) ? CtrImpossible : CtrUndecided;
|
|
3113
|
+
}
|
|
3114
|
+
|
|
3115
|
+
/**
|
|
3116
|
+
* Check if the lexed token is `=>` or an infix operator.
|
|
3117
|
+
*
|
|
3118
|
+
* This is performed only when the current position is not in a bracketed expression, i.e. at top level relative to the
|
|
3119
|
+
* initial lexer position.
|
|
3120
|
+
* Otherwise the token belongs to a later, nested expression.
|
|
3121
|
+
*
|
|
3122
|
+
* Certain tokens are proof that no context can start at the current position, like `::` or `forall`, so lookahead is
|
|
3123
|
+
* terminated.
|
|
3124
|
+
* It is still possible that an infix class can be parsed, for example in this type when starting at the at `C` and
|
|
3125
|
+
* terminating at `::`:
|
|
3126
|
+
* > `a :: (C + D :: Constraint) => E`
|
|
3127
|
+
*/
|
|
3128
|
+
static CtrResult ctr_top(Env *env, Lexed next) {
|
|
3129
|
+
switch (next) {
|
|
3130
|
+
case LCArrow:
|
|
3131
|
+
return CtrArrowFound;
|
|
3132
|
+
case LSymop:
|
|
3133
|
+
case LSymopSpecial:
|
|
3134
|
+
case LTilde:
|
|
3135
|
+
case LTick:
|
|
3136
|
+
return CtrInfixFound;
|
|
3137
|
+
case LBar:
|
|
3138
|
+
return CtrBarFound;
|
|
3139
|
+
case LArrow:
|
|
3140
|
+
case LWhere:
|
|
3141
|
+
case LDotDot:
|
|
3142
|
+
case LSemi:
|
|
3143
|
+
break;
|
|
3144
|
+
case LTexpCloser:
|
|
3145
|
+
switch (peek0(env)) {
|
|
3146
|
+
case '=':
|
|
3147
|
+
return CtrEqualsFound;
|
|
3148
|
+
default:
|
|
3149
|
+
break;
|
|
3150
|
+
}
|
|
3151
|
+
break;
|
|
3152
|
+
default:
|
|
3153
|
+
switch (peek0(env)) {
|
|
3154
|
+
// Symop is processed in `ctr_lookahead_step`, so `=` and `::` can not be a prefix
|
|
3155
|
+
case '=':
|
|
3156
|
+
return CtrEqualsFound;
|
|
3157
|
+
case 0x2200: // ∀
|
|
3158
|
+
break;
|
|
3159
|
+
case ':':
|
|
3160
|
+
if (char1(env, ':')) break;
|
|
3161
|
+
return CtrUndecided;
|
|
3162
|
+
case 'f':
|
|
3163
|
+
SEQ(ctr_stop_on_token(env, "forall"));
|
|
3164
|
+
return ctr_stop_on_token(env, "family");
|
|
3165
|
+
case 'i':
|
|
3166
|
+
return ctr_stop_on_token(env, "instance");
|
|
3167
|
+
default:
|
|
3168
|
+
return CtrUndecided;
|
|
3169
|
+
}
|
|
3170
|
+
}
|
|
3171
|
+
return CtrImpossible;
|
|
3172
|
+
}
|
|
3173
|
+
|
|
3174
|
+
/**
|
|
3175
|
+
* Process a lexed token for constraint lookahead:
|
|
3176
|
+
* - Update bracket nesting count
|
|
3177
|
+
* - Advance over pragmas, strings, chars and conids
|
|
3178
|
+
* - Set the reset index for certain tokens
|
|
3179
|
+
*
|
|
3180
|
+
* If the token wasn't identified to be irrelevant for the lookahead result, and the current bracket nesting level is
|
|
3181
|
+
* zero, call `ctr_top`.
|
|
3182
|
+
*/
|
|
3183
|
+
static CtrResult ctr_lookahead_step(Env *env, CtrState *state, Lexed next) {
|
|
3184
|
+
state->reset = 1;
|
|
3185
|
+
switch (next) {
|
|
3186
|
+
case LBraceClose:
|
|
3187
|
+
return ctr_bracket_close(state);
|
|
3188
|
+
case LUnboxedClose:
|
|
3189
|
+
SEQ(ctr_bracket_close(state));
|
|
3190
|
+
state->reset = 2;
|
|
3191
|
+
return CtrUndecided;
|
|
3192
|
+
case LBraceOpen:
|
|
3193
|
+
return ctr_bracket_open(state);
|
|
3194
|
+
case LSymopSpecial:
|
|
3195
|
+
case LSymop:
|
|
3196
|
+
state->reset = symop_lookahead(env);
|
|
3197
|
+
break;
|
|
3198
|
+
case LUpper:
|
|
3199
|
+
state->reset = conid(env);
|
|
3200
|
+
return CtrUndecided;
|
|
3201
|
+
case LDotId:
|
|
3202
|
+
return CtrUndecided;
|
|
3203
|
+
case LPragma:
|
|
3204
|
+
if (consume_pragma(env)) state->reset = 3;
|
|
3205
|
+
return CtrUndecided;
|
|
3206
|
+
case LTexpCloser:
|
|
3207
|
+
case LNothing:
|
|
3208
|
+
switch (peek0(env)) {
|
|
3209
|
+
case ')':
|
|
3210
|
+
case ']':
|
|
3211
|
+
return ctr_bracket_close(state);
|
|
3212
|
+
case '(':
|
|
3213
|
+
case '[':
|
|
3214
|
+
return ctr_bracket_open(state);
|
|
3215
|
+
case '"':
|
|
3216
|
+
state->reset = take_string_literal(env);
|
|
3217
|
+
return CtrUndecided;
|
|
3218
|
+
case '\'':
|
|
3219
|
+
state->reset = take_char_literal(env);
|
|
3220
|
+
return CtrUndecided;
|
|
3221
|
+
default:
|
|
3222
|
+
if (varid_start_char(peek0(env))) state->reset = advance_while(env, 1, is_id_char);
|
|
3223
|
+
break;
|
|
3224
|
+
}
|
|
3225
|
+
default:
|
|
3226
|
+
break;
|
|
3227
|
+
}
|
|
3228
|
+
if (state->brackets != 0) return CtrUndecided;
|
|
3229
|
+
return ctr_top(env, next);
|
|
3230
|
+
}
|
|
3231
|
+
|
|
3232
|
+
/**
|
|
3233
|
+
* Main loop for context lookahead.
|
|
3234
|
+
*
|
|
3235
|
+
* Perform newline lookahead and terminate if the end of the current layout element is encountered.
|
|
3236
|
+
* Otherwise use the new end token to detect a context arrow or infix operator.
|
|
3237
|
+
* If no termination criterion is fulfilled, reset lookahead and repeat.
|
|
3238
|
+
*
|
|
3239
|
+
* Newline lookahead skips over extras.
|
|
3240
|
+
*
|
|
3241
|
+
* A context arrow is always a termination criterion; an infix operator only if CONTEXT isn't valid.
|
|
3242
|
+
*/
|
|
3243
|
+
static Symbol constraint_lookahead(Env *env) {
|
|
3244
|
+
dbg("type lookahead\n");
|
|
3245
|
+
CtrState state = {.reset = 0};
|
|
3246
|
+
bool done = false;
|
|
3247
|
+
while (!done && not_eof(env)) {
|
|
3248
|
+
// Setting indent to 99999 only to not trigger the following termination condition when no newline was advanced over
|
|
3249
|
+
Newline newline = {.state = 0, .indent = 99999};
|
|
3250
|
+
newline_lookahead(env, &newline);
|
|
3251
|
+
if (newline.indent <= current_indent(env) && current_context(env) != Braces) break;
|
|
3252
|
+
CtrResult result = ctr_lookahead_step(env, &state, newline.end);
|
|
3253
|
+
dbg("type: %lc, %s\n", peek0(env), ctr_result_names[result]);
|
|
3254
|
+
switch (result) {
|
|
3255
|
+
case CtrArrowFound:
|
|
3256
|
+
state.context = true;
|
|
3257
|
+
done = true;
|
|
3258
|
+
break;
|
|
3259
|
+
case CtrInfixFound:
|
|
3260
|
+
if (char0(env, ':') || char0(env, '`')) state.data_infix = true;
|
|
3261
|
+
state.infix = true;
|
|
3262
|
+
// Context has precedence, e.g. `instance a + a => A` finds `+` first and would treat that as the class name of
|
|
3263
|
+
// the head, then failing on the right operand.
|
|
3264
|
+
done = !valid(env, CONTEXT);
|
|
3265
|
+
break;
|
|
3266
|
+
case CtrEqualsFound:
|
|
3267
|
+
done = !valid(env, TYPE_INSTANCE);
|
|
3268
|
+
state.type_instance = true;
|
|
3269
|
+
break;
|
|
3270
|
+
case CtrBarFound:
|
|
3271
|
+
done = true;
|
|
3272
|
+
state.type_instance = false;
|
|
3273
|
+
break;
|
|
3274
|
+
case CtrImpossible:
|
|
3275
|
+
done = true;
|
|
3276
|
+
case CtrUndecided:
|
|
3277
|
+
break;
|
|
3278
|
+
}
|
|
3279
|
+
reset_lookahead_to(env, state.reset);
|
|
3280
|
+
state.reset = 0;
|
|
3281
|
+
}
|
|
3282
|
+
if (state.context) SEQ(finish_if_valid(env, CONTEXT, "ctr"));
|
|
3283
|
+
if (state.infix) SEQ(finish_if_valid(env, INFIX, "ctr"));
|
|
3284
|
+
if (state.data_infix) SEQ(finish_if_valid(env, DATA_INFIX, "ctr"));
|
|
3285
|
+
if (state.type_instance) SEQ(finish_if_valid(env, TYPE_INSTANCE, "ctr"));
|
|
3286
|
+
return FAIL;
|
|
3287
|
+
}
|
|
3288
|
+
|
|
3289
|
+
// --------------------------------------------------------------------------------------------------------
|
|
3290
|
+
// Actions that are executed for interior positions
|
|
3291
|
+
// --------------------------------------------------------------------------------------------------------
|
|
3292
|
+
|
|
3293
|
+
/**
 * Run constraint lookahead, but only if at least one of the symbols it can emit is valid.
 */
static Symbol process_token_constraint(Env *env) {
  bool wanted =
    valid(env, CONTEXT) || valid(env, INFIX) || valid(env, DATA_INFIX) || valid(env, TYPE_INSTANCE);
  if (!wanted) {
    return FAIL;
  }
  return constraint_lookahead(env);
}
|
|
3306
|
+
|
|
3307
|
+
/**
 * Scan at a position that is neither a newline nor the end of the file.
 *
 * Lexes the token ahead and hands it to each processing stage in turn; the first stage that produces a symbol ends
 * the run.
 * `whitespace` tells whether space was skipped before this position.
 */
static Symbol interior(Env *env, bool whitespace) {
  Lexed tok = lex(env, false);
  dbg("interior, column %d, ws %d, token %s\n", column(env), whitespace, token_names[tok]);
  SEQ(resolve_semicolon(env, tok));
  SEQ(process_token_interior(env, tok));
  SEQ(process_token_symop(env, whitespace, tok));
  SEQ(process_token_constraint(env));
  SEQ(process_token_splice(env, tok));
  return FAIL;
}
|
|
3317
|
+
|
|
3318
|
+
// --------------------------------------------------------------------------------------------------------
|
|
3319
|
+
// Initial actions
|
|
3320
|
+
// --------------------------------------------------------------------------------------------------------
|
|
3321
|
+
|
|
3322
|
+
/**
|
|
3323
|
+
* These are conditioned only on symbols and don't advance, except for `qq_body`, which cannot fail.
|
|
3324
|
+
*/
|
|
3325
|
+
static Symbol pre_ws_commands(Env *env) {
|
|
3326
|
+
SEQ(texp_context(env));
|
|
3327
|
+
SEQ(start_brace(env));
|
|
3328
|
+
SEQ(end_brace(env));
|
|
3329
|
+
// Leading whitespace must be included in the node.
|
|
3330
|
+
if (valid(env, QQ_BODY)) return qq_body(env);
|
|
3331
|
+
if (newline_active(env)) SEQ(newline_post(env));
|
|
3332
|
+
else if (env->state->newline.state == NResume) SEQ(newline_resume(env));
|
|
3333
|
+
return FAIL;
|
|
3334
|
+
}
|
|
3335
|
+
|
|
3336
|
+
/**
 * Top-level dispatch of a scanner run: pre-whitespace actions first, then skip space and continue with either
 * newline handling or interior token handling.
 */
static Symbol scan_main(Env *env) {
  MARK("main");
  SEQ(pre_ws_commands(env));
  bool skipped = skip_space(env);
  if (is_newline(PEEK)) {
    return newline_start(env);
  }
  if (not_eof(env)) {
    return interior(env, skipped);
  }
  return FAIL;
}
|
|
3344
|
+
|
|
3345
|
+
#ifdef TREE_SITTER_DEBUG
|
|
3346
|
+
|
|
3347
|
+
/**
 * Debug variant of the scan entry: wraps `scan_main` with `debug_init` and `debug_finish`.
 * When `debug_init` reports true, the run ends with a state update instead of scanning.
 */
static Symbol scan_debug(Env *env) {
  if (debug_init(env)) {
    return update_state("debug init parse buffer");
  }
  Symbol outcome = scan_main(env);
  debug_finish(env, outcome);
  return outcome;
}
|
|
3353
|
+
|
|
3354
|
+
#endif
|
|
3355
|
+
|
|
3356
|
+
/**
 * Post-process the symbol produced by a scanner run and store it via `set_result_symbol`.
 *
 * A failed run at the end of the file that consumed no lookahead gets a last chance to close things: a layout end
 * if END is valid, else a semicolon if SEMICOLON is valid, else whatever `force_end_context` yields.
 */
static bool process_result(Env *env, Symbol result) {
  if (result != FAIL || !is_eof(env) || !no_lookahead(env)) {
    return set_result_symbol(env, result);
  }
  MARK("eof whitespace");
  // `end_layout` is inlined here because of a perf glitch
  if (valid(env, END)) {
    result = end_layout_unchecked(env, "eof");
  } else if (valid(env, SEMICOLON)) {
    result = finish(SEMICOLON, "eof");
  } else {
    result = force_end_context(env);
    if (result == FAIL) {
      dbg("eof | context cap: %d | lookahead cap: %d | parse cap: %d\n",
          env->state->contexts.capacity, env->state->lookahead.capacity, env->state->parse.capacity);
    }
  }
  return set_result_symbol(env, result);
}
|
|
3371
|
+
|
|
3372
|
+
|
|
3373
|
+
/**
 * Run the scanner once, unless `after_error` reports true, in which case the run is declined.
 * Debug builds route through `scan_debug`, all others call `scan_main` directly.
 */
static bool scan(Env *env) {
  if (after_error(env)) {
    dbg("error recovery\n");
    return false;
  }
  Symbol outcome;
#ifdef TREE_SITTER_DEBUG
  outcome = scan_debug(env);
#else
  outcome = scan_main(env);
#endif
  return process_result(env, outcome);
}
|
|
3382
|
+
|
|
3383
|
+
// --------------------------------------------------------------------------------------------------------
|
|
3384
|
+
// API
|
|
3385
|
+
// --------------------------------------------------------------------------------------------------------
|
|
3386
|
+
|
|
3387
|
+
typedef struct {
|
|
3388
|
+
unsigned contexts;
|
|
3389
|
+
Newline newline;
|
|
3390
|
+
#ifdef TREE_SITTER_DEBUG
|
|
3391
|
+
unsigned parse;
|
|
3392
|
+
#endif
|
|
3393
|
+
} Persist;
|
|
3394
|
+
|
|
3395
|
+
/**
|
|
3396
|
+
* This function allocates the persistent state of the parser that is passed into the other API functions.
|
|
3397
|
+
*/
|
|
3398
|
+
void *tree_sitter_haskell_external_scanner_create() {
|
|
3399
|
+
State *state = ts_calloc(1, sizeof(State));
|
|
3400
|
+
array_reserve(&state->contexts, 8);
|
|
3401
|
+
array_reserve(&state->lookahead, 8);
|
|
3402
|
+
#ifdef TREE_SITTER_DEBUG
|
|
3403
|
+
array_reserve(&state->parse, 20);
|
|
3404
|
+
#endif
|
|
3405
|
+
return state;
|
|
3406
|
+
}
|
|
3407
|
+
|
|
3408
|
+
/**
|
|
3409
|
+
* Main logic entry point.
|
|
3410
|
+
* Since the state is a singular vector, it can just be cast and used directly.
|
|
3411
|
+
*/
|
|
3412
|
+
bool tree_sitter_haskell_external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols) {
|
|
3413
|
+
Env env = env_new(lexer, valid_symbols, (State*) payload);
|
|
3414
|
+
return scan(&env);
|
|
3415
|
+
}
|
|
3416
|
+
|
|
3417
|
+
unsigned tree_sitter_haskell_external_scanner_serialize(void *payload, char *buffer) {
|
|
3418
|
+
State *state = (State *) payload;
|
|
3419
|
+
Persist persist = {.contexts = state->contexts.size, .newline = state->newline};
|
|
3420
|
+
#ifdef TREE_SITTER_DEBUG
|
|
3421
|
+
persist.parse = state->parse.size;
|
|
3422
|
+
#endif
|
|
3423
|
+
unsigned contexts_size = persist.contexts * sizeof(Context);
|
|
3424
|
+
memcpy(buffer, &persist, sizeof(Persist));
|
|
3425
|
+
unsigned to_copy = sizeof(Persist) + contexts_size;
|
|
3426
|
+
if (to_copy > TREE_SITTER_SERIALIZATION_BUFFER_SIZE) return 0;
|
|
3427
|
+
memcpy(buffer + sizeof(Persist), state->contexts.contents, contexts_size);
|
|
3428
|
+
#ifdef TREE_SITTER_DEBUG
|
|
3429
|
+
to_copy = serialize_parse_lines(buffer + sizeof(Persist) + contexts_size, &state->parse, to_copy);
|
|
3430
|
+
#endif
|
|
3431
|
+
return to_copy;
|
|
3432
|
+
}
|
|
3433
|
+
|
|
3434
|
+
void tree_sitter_haskell_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) {
|
|
3435
|
+
State *state = (State *) payload;
|
|
3436
|
+
Persist p;
|
|
3437
|
+
Persist *persist;
|
|
3438
|
+
if (length > 0)
|
|
3439
|
+
persist = (Persist *) buffer;
|
|
3440
|
+
else {
|
|
3441
|
+
p = (Persist) {.contexts = 0};
|
|
3442
|
+
persist = &p;
|
|
3443
|
+
persist->newline.state = NResume;
|
|
3444
|
+
}
|
|
3445
|
+
unsigned contexts_size = persist->contexts * sizeof(Context);
|
|
3446
|
+
state->newline = persist->newline;
|
|
3447
|
+
array_reserve(&state->contexts, persist->contexts);
|
|
3448
|
+
state->contexts.size = persist->contexts;
|
|
3449
|
+
if (length > 0)
|
|
3450
|
+
memcpy(state->contexts.contents, buffer + sizeof(Persist), contexts_size);
|
|
3451
|
+
state->lookahead.size = 0;
|
|
3452
|
+
state->lookahead.offset = 0;
|
|
3453
|
+
array_reserve(&state->lookahead, 8);
|
|
3454
|
+
#ifdef TREE_SITTER_DEBUG
|
|
3455
|
+
if (length > 0)
|
|
3456
|
+
deserialize_parse_lines(buffer + sizeof(Persist) + contexts_size, &state->parse, persist->parse);
|
|
3457
|
+
#endif
|
|
3458
|
+
}
|
|
3459
|
+
|
|
3460
|
+
void tree_sitter_haskell_external_scanner_destroy(void *payload) {
|
|
3461
|
+
State *state = (State*) payload;
|
|
3462
|
+
#ifdef TREE_SITTER_DEBUG
|
|
3463
|
+
palette();
|
|
3464
|
+
ParseLines *parse = &state->parse;
|
|
3465
|
+
for (unsigned i = 0; i < parse->size; i++) array_delete(array_get(parse, i));
|
|
3466
|
+
array_delete(parse);
|
|
3467
|
+
#endif
|
|
3468
|
+
array_delete(&state->contexts);
|
|
3469
|
+
array_delete(&state->lookahead);
|
|
3470
|
+
ts_free(state);
|
|
3471
|
+
}
|