@rejot-dev/tree-sitter-thalo 0.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +377 -0
- package/binding.gyp +35 -0
- package/bindings/node/binding.cc +19 -0
- package/bindings/node/binding_test.js +11 -0
- package/bindings/node/index.d.ts +60 -0
- package/bindings/node/index.js +38 -0
- package/grammar.js +303 -0
- package/package.json +45 -0
- package/queries/highlights.scm +71 -0
- package/src/grammar.json +1467 -0
- package/src/node-types.json +1132 -0
- package/src/parser.c +5742 -0
- package/src/scanner.c +419 -0
- package/src/tree_sitter/alloc.h +54 -0
- package/src/tree_sitter/array.h +291 -0
- package/src/tree_sitter/parser.h +286 -0
- package/tree-sitter-thalo.wasm +0 -0
package/src/scanner.c
ADDED
|
@@ -0,0 +1,419 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @file scanner.c
|
|
3
|
+
* @brief External scanner for tree-sitter-thalo parser
|
|
4
|
+
*
|
|
5
|
+
* This file implements an external scanner for the thalo language parser.
|
|
6
|
+
* It handles indentation-sensitive parsing for metadata and content blocks.
|
|
7
|
+
*
|
|
8
|
+
* Token types:
|
|
9
|
+
* - INDENT: Start of an indented line (newline + proper indentation consumed)
|
|
10
|
+
* - CONTENT_BLANK: A blank line within content (may have trailing whitespace)
|
|
11
|
+
* - ERROR_SENTINEL: Marker to detect error recovery mode
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
#include "tree_sitter/alloc.h"
|
|
15
|
+
#include "tree_sitter/array.h"
|
|
16
|
+
#include "tree_sitter/parser.h"
|
|
17
|
+
|
|
18
|
+
#include <stdio.h>
|
|
19
|
+
|
|
20
|
+
// Debug mode - set DEBUG_SCANNER to 1 (here, or with -DDEBUG_SCANNER=1 on the
// compiler command line) to trace scanner decisions on stderr.
#ifndef DEBUG_SCANNER
#define DEBUG_SCANNER 0
#endif

#if DEBUG_SCANNER
#define DEBUG_LOG(...) fprintf(stderr, __VA_ARGS__)
#else
// Expand to a no-op *expression* rather than to nothing, so the macro stays
// valid wherever the enabled variant is valid (sole body of an if/else,
// operand of a comma expression) and never leaves an empty statement behind.
#define DEBUG_LOG(...) ((void)0)
#endif
|
|
28
|
+
|
|
29
|
+
/**
|
|
30
|
+
* @brief Token types that the external scanner can produce
|
|
31
|
+
*
|
|
32
|
+
* These must match the order in the grammar's externals array.
|
|
33
|
+
*/
|
|
34
|
+
enum TokenType
|
|
35
|
+
{
|
|
36
|
+
INDENT, // Newline followed by valid indentation (at least 1 space or tab, see has_valid_indent)
|
|
37
|
+
CONTENT_BLANK, // Blank line in content (newline, optionally with whitespace-only line)
|
|
38
|
+
ERROR_SENTINEL, // Sentinel for error recovery detection
|
|
39
|
+
};
|
|
40
|
+
|
|
41
|
+
/**
|
|
42
|
+
* @brief Scanner state
|
|
43
|
+
*
|
|
44
|
+
* Currently stateless since we don't track indent levels across parses.
|
|
45
|
+
* Tree-sitter handles the grammar-level block structure.
|
|
46
|
+
*/
|
|
47
|
+
typedef struct
|
|
48
|
+
{
|
|
49
|
+
// Placeholder member, reserved for future use if we need state.
// It is never read or written by the scanner functions in this file.
|
|
50
|
+
uint8_t _reserved;
|
|
51
|
+
} Scanner;
|
|
52
|
+
|
|
53
|
+
/**
|
|
54
|
+
* @brief Advance the lexer to the next character (include in parse result)
|
|
55
|
+
*/
|
|
56
|
+
static inline void advance(TSLexer *lexer)
|
|
57
|
+
{
|
|
58
|
+
lexer->advance(lexer, false);
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
/**
 * @brief Tell whether a code point is a line terminator character
 *
 * @param c Code point to classify.
 * @return true for line feed ('\n') or carriage return ('\r'), false otherwise.
 */
static inline bool is_newline(int32_t c)
{
  switch (c)
  {
  case '\n':
  case '\r':
    return true;
  default:
    return false;
  }
}
|
|
68
|
+
|
|
69
|
+
/**
 * @brief Tell whether a code point is horizontal whitespace
 *
 * @param c Code point to classify.
 * @return true for a space (' ') or a tab ('\t'), false otherwise.
 */
static inline bool is_hspace(int32_t c)
{
  if (c == ' ')
  {
    return true;
  }
  return c == '\t';
}
|
|
76
|
+
|
|
77
|
+
/**
|
|
78
|
+
* @brief Check if we're in error recovery mode
|
|
79
|
+
*
|
|
80
|
+
* During error recovery, all symbols are marked valid. We detect this
|
|
81
|
+
* by checking if the error sentinel is valid.
|
|
82
|
+
*/
|
|
83
|
+
static bool in_error_recovery(const bool *valid_symbols)
|
|
84
|
+
{
|
|
85
|
+
return valid_symbols[ERROR_SENTINEL];
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
/**
 * @brief Decide whether a measured indentation qualifies as "indented"
 *
 * A line counts as indented when a tab was seen in its leading whitespace
 * or when at least one whitespace character was counted.
 *
 * @param indent  Number of leading whitespace characters counted.
 * @param has_tab Whether any of those characters was a tab.
 * @return true if the line is considered indented.
 */
static bool has_valid_indent(int indent, bool has_tab)
{
  if (has_tab)
  {
    return true;
  }
  return indent > 0;
}
|
|
97
|
+
|
|
98
|
+
/**
|
|
99
|
+
* @brief Consume a newline sequence (\n or \r\n)
|
|
100
|
+
*/
|
|
101
|
+
static void consume_newline(TSLexer *lexer)
|
|
102
|
+
{
|
|
103
|
+
bool was_cr = lexer->lookahead == '\r';
|
|
104
|
+
advance(lexer);
|
|
105
|
+
if (was_cr && lexer->lookahead == '\n')
|
|
106
|
+
{
|
|
107
|
+
advance(lexer);
|
|
108
|
+
}
|
|
109
|
+
}
|
|
110
|
+
|
|
111
|
+
/**
|
|
112
|
+
* @brief Count indentation (spaces/tabs) and advance past it
|
|
113
|
+
*
|
|
114
|
+
* Returns the indent count and sets has_tab if a tab was found.
|
|
115
|
+
*/
|
|
116
|
+
static int consume_indentation(TSLexer *lexer, bool *has_tab)
|
|
117
|
+
{
|
|
118
|
+
int indent = 0;
|
|
119
|
+
*has_tab = false;
|
|
120
|
+
|
|
121
|
+
while (is_hspace(lexer->lookahead))
|
|
122
|
+
{
|
|
123
|
+
if (lexer->lookahead == '\t')
|
|
124
|
+
{
|
|
125
|
+
*has_tab = true;
|
|
126
|
+
}
|
|
127
|
+
indent++;
|
|
128
|
+
advance(lexer);
|
|
129
|
+
}
|
|
130
|
+
|
|
131
|
+
return indent;
|
|
132
|
+
}
|
|
133
|
+
|
|
134
|
+
/**
|
|
135
|
+
* @brief Skip to end of line (past comment content)
|
|
136
|
+
*/
|
|
137
|
+
static void skip_to_eol(TSLexer *lexer)
|
|
138
|
+
{
|
|
139
|
+
while (!is_newline(lexer->lookahead) && !lexer->eof(lexer))
|
|
140
|
+
{
|
|
141
|
+
advance(lexer);
|
|
142
|
+
}
|
|
143
|
+
}
|
|
144
|
+
|
|
145
|
+
/**
|
|
146
|
+
* @brief Look ahead past comment(s) to see if indented content follows
|
|
147
|
+
*
|
|
148
|
+
* Called when we're at '//' (already advanced past first '/').
|
|
149
|
+
* Returns true if there's indented content after the comment(s).
|
|
150
|
+
*/
|
|
151
|
+
static bool look_ahead_for_indented_content(TSLexer *lexer)
|
|
152
|
+
{
|
|
153
|
+
// Skip rest of current comment line
|
|
154
|
+
skip_to_eol(lexer);
|
|
155
|
+
|
|
156
|
+
while (!lexer->eof(lexer))
|
|
157
|
+
{
|
|
158
|
+
if (!is_newline(lexer->lookahead))
|
|
159
|
+
break;
|
|
160
|
+
|
|
161
|
+
consume_newline(lexer);
|
|
162
|
+
|
|
163
|
+
bool next_has_tab = false;
|
|
164
|
+
int next_indent = consume_indentation(lexer, &next_has_tab);
|
|
165
|
+
|
|
166
|
+
if (is_newline(lexer->lookahead) || lexer->eof(lexer))
|
|
167
|
+
{
|
|
168
|
+
// Blank line, continue looking
|
|
169
|
+
continue;
|
|
170
|
+
}
|
|
171
|
+
|
|
172
|
+
// Check if this is another unindented comment
|
|
173
|
+
if (next_indent == 0 && !next_has_tab && lexer->lookahead == '/')
|
|
174
|
+
{
|
|
175
|
+
advance(lexer);
|
|
176
|
+
if (lexer->lookahead == '/')
|
|
177
|
+
{
|
|
178
|
+
// Another unindented comment, skip it and continue
|
|
179
|
+
skip_to_eol(lexer);
|
|
180
|
+
continue;
|
|
181
|
+
}
|
|
182
|
+
// Single slash at start of line - not a comment, not indented
|
|
183
|
+
return false;
|
|
184
|
+
}
|
|
185
|
+
|
|
186
|
+
// Found non-blank, non-comment content
|
|
187
|
+
return has_valid_indent(next_indent, next_has_tab);
|
|
188
|
+
}
|
|
189
|
+
|
|
190
|
+
// EOF without finding indented content
|
|
191
|
+
return false;
|
|
192
|
+
}
|
|
193
|
+
|
|
194
|
+
/**
|
|
195
|
+
* @brief Skip horizontal whitespace without including it in the token
|
|
196
|
+
*
|
|
197
|
+
* Tree-sitter extras are not consumed before external scanners are called,
|
|
198
|
+
* so we need to manually skip trailing whitespace to find the newline.
|
|
199
|
+
*/
|
|
200
|
+
static void skip_hspace(TSLexer *lexer)
|
|
201
|
+
{
|
|
202
|
+
while (is_hspace(lexer->lookahead))
|
|
203
|
+
{
|
|
204
|
+
lexer->advance(lexer, true); // true = skip (don't include in token)
|
|
205
|
+
}
|
|
206
|
+
}
|
|
207
|
+
|
|
208
|
+
/**
|
|
209
|
+
* @brief Unified newline scanner
|
|
210
|
+
*
|
|
211
|
+
* This function handles both INDENT and CONTENT_BLANK in a single pass
|
|
212
|
+
* to avoid advancing the lexer before knowing what token to produce.
|
|
213
|
+
*
|
|
214
|
+
* Algorithm:
|
|
215
|
+
* 0. Skip any trailing horizontal whitespace (extras aren't auto-consumed for externals)
|
|
216
|
+
* 1. Consume the initial newline
|
|
217
|
+
* 2. Count indentation on the current line
|
|
218
|
+
* 3. If we have valid indent and content: return INDENT
|
|
219
|
+
* 4. If unindented comment and indented content follows: return INDENT
|
|
220
|
+
* 5. If we're at end of line (blank line): look ahead for content
|
|
221
|
+
* - If indented content follows: return CONTENT_BLANK
|
|
222
|
+
* - Otherwise: return false (let grammar handle the newline)
|
|
223
|
+
*/
|
|
224
|
+
static bool scan_newline(TSLexer *lexer, const bool *valid_symbols)
|
|
225
|
+
{
|
|
226
|
+
// Skip any trailing whitespace before the newline
|
|
227
|
+
// (tree-sitter extras are not automatically consumed for external scanners)
|
|
228
|
+
skip_hspace(lexer);
|
|
229
|
+
|
|
230
|
+
// Must now be at a newline
|
|
231
|
+
if (!is_newline(lexer->lookahead))
|
|
232
|
+
{
|
|
233
|
+
return false;
|
|
234
|
+
}
|
|
235
|
+
|
|
236
|
+
// Consume the newline
|
|
237
|
+
consume_newline(lexer);
|
|
238
|
+
|
|
239
|
+
// Count indentation on this line
|
|
240
|
+
bool has_tab = false;
|
|
241
|
+
int indent = consume_indentation(lexer, &has_tab);
|
|
242
|
+
|
|
243
|
+
// Check what's on this line
|
|
244
|
+
bool at_eol = is_newline(lexer->lookahead) || lexer->eof(lexer);
|
|
245
|
+
bool valid_indent = has_valid_indent(indent, has_tab);
|
|
246
|
+
|
|
247
|
+
DEBUG_LOG("[SCANNER] line: indent=%d, has_tab=%d, at_eol=%d, valid_indent=%d, lookahead='%c'(%d)\n",
|
|
248
|
+
indent, has_tab, at_eol, valid_indent,
|
|
249
|
+
lexer->lookahead > 31 && lexer->lookahead < 127
|
|
250
|
+
? (char)lexer->lookahead
|
|
251
|
+
: '?',
|
|
252
|
+
lexer->lookahead);
|
|
253
|
+
|
|
254
|
+
// Case 1: Valid indented line with content (including comments) -> INDENT
|
|
255
|
+
if (!at_eol && valid_indent && valid_symbols[INDENT])
|
|
256
|
+
{
|
|
257
|
+
lexer->mark_end(lexer);
|
|
258
|
+
lexer->result_symbol = INDENT;
|
|
259
|
+
DEBUG_LOG("[SCANNER] -> INDENT\n");
|
|
260
|
+
return true;
|
|
261
|
+
}
|
|
262
|
+
|
|
263
|
+
// Case 2: Unindented comment line - check if followed by indented content
|
|
264
|
+
// This allows comments at column 0 to be part of an entry if more metadata follows
|
|
265
|
+
if (!at_eol && indent == 0 && !has_tab && valid_symbols[INDENT] && lexer->lookahead == '/')
|
|
266
|
+
{
|
|
267
|
+
// Mark end here (token is just the newline, no indentation)
|
|
268
|
+
lexer->mark_end(lexer);
|
|
269
|
+
|
|
270
|
+
// Check if this is a comment and if indented content follows
|
|
271
|
+
advance(lexer); // Consume first '/'
|
|
272
|
+
if (lexer->lookahead == '/')
|
|
273
|
+
{
|
|
274
|
+
// It's a comment, look ahead past it
|
|
275
|
+
if (look_ahead_for_indented_content(lexer))
|
|
276
|
+
{
|
|
277
|
+
lexer->result_symbol = INDENT;
|
|
278
|
+
DEBUG_LOG("[SCANNER] -> INDENT (unindented comment with indented content after)\n");
|
|
279
|
+
return true;
|
|
280
|
+
}
|
|
281
|
+
}
|
|
282
|
+
// Not a comment or no indented content follows - don't match
|
|
283
|
+
DEBUG_LOG("[SCANNER] -> no match (unindented comment without indented content after)\n");
|
|
284
|
+
return false;
|
|
285
|
+
}
|
|
286
|
+
|
|
287
|
+
// Case 3: Blank line (or whitespace-only line)
|
|
288
|
+
// Only match if content follows AND CONTENT_BLANK is valid
|
|
289
|
+
if (at_eol && valid_symbols[CONTENT_BLANK])
|
|
290
|
+
{
|
|
291
|
+
// Mark the end after this blank line
|
|
292
|
+
lexer->mark_end(lexer);
|
|
293
|
+
|
|
294
|
+
// Look ahead to see if indented content follows
|
|
295
|
+
while (is_newline(lexer->lookahead))
|
|
296
|
+
{
|
|
297
|
+
consume_newline(lexer);
|
|
298
|
+
|
|
299
|
+
// Count indent on this next line
|
|
300
|
+
bool next_has_tab = false;
|
|
301
|
+
int next_indent = consume_indentation(lexer, &next_has_tab);
|
|
302
|
+
|
|
303
|
+
// Check what's on this line
|
|
304
|
+
if (!is_newline(lexer->lookahead) && !lexer->eof(lexer))
|
|
305
|
+
{
|
|
306
|
+
// Found a line with content
|
|
307
|
+
if (has_valid_indent(next_indent, next_has_tab))
|
|
308
|
+
{
|
|
309
|
+
// Indented content follows - match CONTENT_BLANK
|
|
310
|
+
lexer->result_symbol = CONTENT_BLANK;
|
|
311
|
+
DEBUG_LOG("[SCANNER] -> CONTENT_BLANK (indented content follows)\n");
|
|
312
|
+
return true;
|
|
313
|
+
}
|
|
314
|
+
else
|
|
315
|
+
{
|
|
316
|
+
// Unindented content (new entry) - don't match
|
|
317
|
+
DEBUG_LOG("[SCANNER] -> no match (unindented content follows)\n");
|
|
318
|
+
return false;
|
|
319
|
+
}
|
|
320
|
+
}
|
|
321
|
+
// Another blank line - continue looking
|
|
322
|
+
}
|
|
323
|
+
|
|
324
|
+
// Reached EOF without finding indented content
|
|
325
|
+
DEBUG_LOG("[SCANNER] -> no match (EOF, no content follows)\n");
|
|
326
|
+
return false;
|
|
327
|
+
}
|
|
328
|
+
|
|
329
|
+
// No match - not a valid indent and not a blank line
|
|
330
|
+
DEBUG_LOG("[SCANNER] -> no match (at_eol=%d, valid_indent=%d)\n", at_eol, valid_indent);
|
|
331
|
+
return false;
|
|
332
|
+
}
|
|
333
|
+
|
|
334
|
+
/**
|
|
335
|
+
* @brief Main scanning function
|
|
336
|
+
*
|
|
337
|
+
* Attempts to recognize external tokens based on what's valid at this position.
|
|
338
|
+
*/
|
|
339
|
+
static bool scan(Scanner *scanner, TSLexer *lexer, const bool *valid_symbols)
|
|
340
|
+
{
|
|
341
|
+
(void)scanner; // Currently unused
|
|
342
|
+
|
|
343
|
+
DEBUG_LOG("[SCANNER] called: lookahead='%c'(%d) valid=[%d,%d,%d]\n",
|
|
344
|
+
lexer->lookahead > 31 && lexer->lookahead < 127
|
|
345
|
+
? (char)lexer->lookahead
|
|
346
|
+
: '?',
|
|
347
|
+
lexer->lookahead, valid_symbols[INDENT], valid_symbols[CONTENT_BLANK],
|
|
348
|
+
valid_symbols[ERROR_SENTINEL]);
|
|
349
|
+
|
|
350
|
+
// Don't produce tokens during error recovery
|
|
351
|
+
if (in_error_recovery(valid_symbols))
|
|
352
|
+
{
|
|
353
|
+
DEBUG_LOG("[SCANNER] error recovery mode, returning false\n");
|
|
354
|
+
return false;
|
|
355
|
+
}
|
|
356
|
+
|
|
357
|
+
// Only scan if we might want INDENT or CONTENT_BLANK
|
|
358
|
+
if (valid_symbols[INDENT] || valid_symbols[CONTENT_BLANK])
|
|
359
|
+
{
|
|
360
|
+
return scan_newline(lexer, valid_symbols);
|
|
361
|
+
}
|
|
362
|
+
|
|
363
|
+
return false;
|
|
364
|
+
}
|
|
365
|
+
|
|
366
|
+
/**
|
|
367
|
+
* @brief Create a new scanner instance
|
|
368
|
+
*/
|
|
369
|
+
void *tree_sitter_thalo_external_scanner_create(void)
|
|
370
|
+
{
|
|
371
|
+
Scanner *scanner = ts_calloc(1, sizeof(Scanner));
|
|
372
|
+
return scanner;
|
|
373
|
+
}
|
|
374
|
+
|
|
375
|
+
/**
 * @brief Release a scanner instance
 *
 * @param payload Pointer previously returned by
 *                tree_sitter_thalo_external_scanner_create().
 */
void tree_sitter_thalo_external_scanner_destroy(void *payload)
{
  ts_free(payload);
}
|
|
383
|
+
|
|
384
|
+
/**
 * @brief Serialize scanner state
 *
 * The scanner keeps no state, so nothing is written to the buffer.
 *
 * @param payload Scanner instance (unused).
 * @param buffer  Destination buffer (unused, left untouched).
 * @return Number of bytes written, always 0.
 */
unsigned tree_sitter_thalo_external_scanner_serialize(void *payload, char *buffer)
{
  (void)buffer;
  (void)payload;
  return 0U;
}
|
|
396
|
+
|
|
397
|
+
/**
 * @brief Deserialize scanner state
 *
 * The scanner keeps no state, so the serialized bytes are ignored.
 *
 * @param payload Scanner instance (unused).
 * @param buffer  Serialized bytes (unused).
 * @param length  Number of bytes in buffer (unused).
 */
void tree_sitter_thalo_external_scanner_deserialize(void *payload, const char *buffer, unsigned length)
{
  (void)length;
  (void)buffer;
  (void)payload;
}
|
|
410
|
+
|
|
411
|
+
/**
|
|
412
|
+
* @brief Main entry point for token scanning
|
|
413
|
+
*/
|
|
414
|
+
bool tree_sitter_thalo_external_scanner_scan(void *payload, TSLexer *lexer,
|
|
415
|
+
const bool *valid_symbols)
|
|
416
|
+
{
|
|
417
|
+
Scanner *scanner = (Scanner *)payload;
|
|
418
|
+
return scan(scanner, lexer, valid_symbols);
|
|
419
|
+
}
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
#ifndef TREE_SITTER_ALLOC_H_
#define TREE_SITTER_ALLOC_H_

/*
 * Allocation macros: ts_malloc, ts_calloc, ts_realloc and ts_free.
 *
 * Each macro is only defined here if it has not been defined already, so a
 * client can supply its own replacement before including this header.
 */

#if defined(__cplusplus)
extern "C" {
#endif

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#if defined(TREE_SITTER_REUSE_ALLOCATOR)

/* Allocator hooks: declared here, defined elsewhere. */
extern void *(*ts_current_malloc)(size_t size);
extern void *(*ts_current_calloc)(size_t count, size_t size);
extern void *(*ts_current_realloc)(void *ptr, size_t size);
extern void (*ts_current_free)(void *ptr);

#if !defined(ts_malloc)
#define ts_malloc ts_current_malloc
#endif
#if !defined(ts_calloc)
#define ts_calloc ts_current_calloc
#endif
#if !defined(ts_realloc)
#define ts_realloc ts_current_realloc
#endif
#if !defined(ts_free)
#define ts_free ts_current_free
#endif

#else /* !TREE_SITTER_REUSE_ALLOCATOR */

/* Default: map straight onto the C standard library allocator. */
#if !defined(ts_malloc)
#define ts_malloc malloc
#endif
#if !defined(ts_calloc)
#define ts_calloc calloc
#endif
#if !defined(ts_realloc)
#define ts_realloc realloc
#endif
#if !defined(ts_free)
#define ts_free free
#endif

#endif /* TREE_SITTER_REUSE_ALLOCATOR */

#if defined(__cplusplus)
}
#endif

#endif // TREE_SITTER_ALLOC_H_
|