@rejot-dev/tree-sitter-thalo 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/scanner.c ADDED
@@ -0,0 +1,419 @@
1
+ /**
2
+ * @file scanner.c
3
+ * @brief External scanner for tree-sitter-thalo parser
4
+ *
5
+ * This file implements an external scanner for the thalo language parser.
6
+ * It handles indentation-sensitive parsing for metadata and content blocks.
7
+ *
8
+ * Token types:
9
+ * - INDENT: Start of an indented line (newline + proper indentation consumed)
10
+ * - CONTENT_BLANK: A blank line within content (may have trailing whitespace)
11
+ * - ERROR_SENTINEL: Marker to detect error recovery mode
12
+ */
13
+
14
+ #include "tree_sitter/alloc.h"
15
+ #include "tree_sitter/array.h"
16
+ #include "tree_sitter/parser.h"
17
+
18
+ #include <stdio.h>
19
+
20
// Debug tracing switch. Tracing is off by default; enable it either by
// changing the default below or by defining DEBUG_SCANNER=1 on the compiler
// command line (the #ifndef guard keeps that from clashing with the default).
#ifndef DEBUG_SCANNER
#define DEBUG_SCANNER 0
#endif

#if DEBUG_SCANNER
// Tracing enabled: forward the printf-style arguments to stderr.
#define DEBUG_LOG(...) fprintf(stderr, __VA_ARGS__)
#else
// Tracing disabled: expand to a no-op expression (rather than to nothing) so
// that `DEBUG_LOG(...);` is always a complete statement, including when it is
// the sole body of an unbraced if/else.
#define DEBUG_LOG(...) ((void)0)
#endif
28
+
29
/**
 * @brief External token kinds emitted by this scanner
 *
 * The enumerators are used as indices into the valid_symbols array handed to
 * the scanner, so their order must stay in sync with the grammar's externals
 * array.
 */
enum TokenType
{
  // A line break whose following line is accepted as indented (see
  // has_valid_indent), or a line break in front of an unindented comment
  // that is itself followed by indented content
  INDENT = 0,
  // A blank (or whitespace-only) line that is followed by indented content
  CONTENT_BLANK = 1,
  // Never produced; only inspected to detect error recovery
  ERROR_SENTINEL = 2,
};
40
+
41
/**
 * @brief Per-parser scanner state
 *
 * The scanner keeps no information between calls; the single byte below is
 * only a placeholder so the struct is not empty.
 */
typedef struct
{
  uint8_t _reserved; // placeholder, never read or written by the scanner
} Scanner;
52
+
53
+ /**
54
+ * @brief Advance the lexer to the next character (include in parse result)
55
+ */
56
+ static inline void advance(TSLexer *lexer)
57
+ {
58
+ lexer->advance(lexer, false);
59
+ }
60
+
61
/**
 * @brief Tell whether a code point is a line-break character
 *
 * @param c Code point to classify
 * @return true for line feed or carriage return, false for anything else
 */
static inline bool is_newline(int32_t c)
{
  switch (c)
  {
  case '\n':
  case '\r':
    return true;
  default:
    return false;
  }
}
68
+
69
/**
 * @brief Tell whether a code point is horizontal whitespace
 *
 * @param c Code point to classify
 * @return true for a space or a tab, false for anything else
 */
static inline bool is_hspace(int32_t c)
{
  switch (c)
  {
  case ' ':
  case '\t':
    return true;
  default:
    return false;
  }
}
76
+
77
+ /**
78
+ * @brief Check if we're in error recovery mode
79
+ *
80
+ * During error recovery, all symbols are marked valid. We detect this
81
+ * by checking if the error sentinel is valid.
82
+ */
83
+ static bool in_error_recovery(const bool *valid_symbols)
84
+ {
85
+ return valid_symbols[ERROR_SENTINEL];
86
+ }
87
+
88
/**
 * @brief Decide whether measured indentation counts as "indented"
 *
 * @param indent  Number of leading whitespace characters on the line
 * @param has_tab Whether a tab was seen among them
 * @return true if a tab was seen or at least one whitespace character was
 *         counted; false otherwise
 */
static bool has_valid_indent(int indent, bool has_tab)
{
  if (has_tab)
  {
    return true;
  }
  return indent > 0;
}
97
+
98
+ /**
99
+ * @brief Consume a newline sequence (\n or \r\n)
100
+ */
101
+ static void consume_newline(TSLexer *lexer)
102
+ {
103
+ bool was_cr = lexer->lookahead == '\r';
104
+ advance(lexer);
105
+ if (was_cr && lexer->lookahead == '\n')
106
+ {
107
+ advance(lexer);
108
+ }
109
+ }
110
+
111
+ /**
112
+ * @brief Count indentation (spaces/tabs) and advance past it
113
+ *
114
+ * Returns the indent count and sets has_tab if a tab was found.
115
+ */
116
+ static int consume_indentation(TSLexer *lexer, bool *has_tab)
117
+ {
118
+ int indent = 0;
119
+ *has_tab = false;
120
+
121
+ while (is_hspace(lexer->lookahead))
122
+ {
123
+ if (lexer->lookahead == '\t')
124
+ {
125
+ *has_tab = true;
126
+ }
127
+ indent++;
128
+ advance(lexer);
129
+ }
130
+
131
+ return indent;
132
+ }
133
+
134
+ /**
135
+ * @brief Skip to end of line (past comment content)
136
+ */
137
+ static void skip_to_eol(TSLexer *lexer)
138
+ {
139
+ while (!is_newline(lexer->lookahead) && !lexer->eof(lexer))
140
+ {
141
+ advance(lexer);
142
+ }
143
+ }
144
+
145
+ /**
146
+ * @brief Look ahead past comment(s) to see if indented content follows
147
+ *
148
+ * Called when we're at '//' (already advanced past first '/').
149
+ * Returns true if there's indented content after the comment(s).
150
+ */
151
+ static bool look_ahead_for_indented_content(TSLexer *lexer)
152
+ {
153
+ // Skip rest of current comment line
154
+ skip_to_eol(lexer);
155
+
156
+ while (!lexer->eof(lexer))
157
+ {
158
+ if (!is_newline(lexer->lookahead))
159
+ break;
160
+
161
+ consume_newline(lexer);
162
+
163
+ bool next_has_tab = false;
164
+ int next_indent = consume_indentation(lexer, &next_has_tab);
165
+
166
+ if (is_newline(lexer->lookahead) || lexer->eof(lexer))
167
+ {
168
+ // Blank line, continue looking
169
+ continue;
170
+ }
171
+
172
+ // Check if this is another unindented comment
173
+ if (next_indent == 0 && !next_has_tab && lexer->lookahead == '/')
174
+ {
175
+ advance(lexer);
176
+ if (lexer->lookahead == '/')
177
+ {
178
+ // Another unindented comment, skip it and continue
179
+ skip_to_eol(lexer);
180
+ continue;
181
+ }
182
+ // Single slash at start of line - not a comment, not indented
183
+ return false;
184
+ }
185
+
186
+ // Found non-blank, non-comment content
187
+ return has_valid_indent(next_indent, next_has_tab);
188
+ }
189
+
190
+ // EOF without finding indented content
191
+ return false;
192
+ }
193
+
194
+ /**
195
+ * @brief Skip horizontal whitespace without including it in the token
196
+ *
197
+ * Tree-sitter extras are not consumed before external scanners are called,
198
+ * so we need to manually skip trailing whitespace to find the newline.
199
+ */
200
+ static void skip_hspace(TSLexer *lexer)
201
+ {
202
+ while (is_hspace(lexer->lookahead))
203
+ {
204
+ lexer->advance(lexer, true); // true = skip (don't include in token)
205
+ }
206
+ }
207
+
208
+ /**
209
+ * @brief Unified newline scanner
210
+ *
211
+ * This function handles both INDENT and CONTENT_BLANK in a single pass
212
+ * to avoid advancing the lexer before knowing what token to produce.
213
+ *
214
+ * Algorithm:
215
+ * 0. Skip any trailing horizontal whitespace (extras aren't auto-consumed for externals)
216
+ * 1. Consume the initial newline
217
+ * 2. Count indentation on the current line
218
+ * 3. If we have valid indent and content: return INDENT
219
+ * 4. If unindented comment and indented content follows: return INDENT
220
+ * 5. If we're at end of line (blank line): look ahead for content
221
+ * - If indented content follows: return CONTENT_BLANK
222
+ * - Otherwise: return false (let grammar handle the newline)
223
+ */
224
+ static bool scan_newline(TSLexer *lexer, const bool *valid_symbols)
225
+ {
226
+ // Skip any trailing whitespace before the newline
227
+ // (tree-sitter extras are not automatically consumed for external scanners)
228
+ skip_hspace(lexer);
229
+
230
+ // Must now be at a newline
231
+ if (!is_newline(lexer->lookahead))
232
+ {
233
+ return false;
234
+ }
235
+
236
+ // Consume the newline
237
+ consume_newline(lexer);
238
+
239
+ // Count indentation on this line
240
+ bool has_tab = false;
241
+ int indent = consume_indentation(lexer, &has_tab);
242
+
243
+ // Check what's on this line
244
+ bool at_eol = is_newline(lexer->lookahead) || lexer->eof(lexer);
245
+ bool valid_indent = has_valid_indent(indent, has_tab);
246
+
247
+ DEBUG_LOG("[SCANNER] line: indent=%d, has_tab=%d, at_eol=%d, valid_indent=%d, lookahead='%c'(%d)\n",
248
+ indent, has_tab, at_eol, valid_indent,
249
+ lexer->lookahead > 31 && lexer->lookahead < 127
250
+ ? (char)lexer->lookahead
251
+ : '?',
252
+ lexer->lookahead);
253
+
254
+ // Case 1: Valid indented line with content (including comments) -> INDENT
255
+ if (!at_eol && valid_indent && valid_symbols[INDENT])
256
+ {
257
+ lexer->mark_end(lexer);
258
+ lexer->result_symbol = INDENT;
259
+ DEBUG_LOG("[SCANNER] -> INDENT\n");
260
+ return true;
261
+ }
262
+
263
+ // Case 2: Unindented comment line - check if followed by indented content
264
+ // This allows comments at column 0 to be part of an entry if more metadata follows
265
+ if (!at_eol && indent == 0 && !has_tab && valid_symbols[INDENT] && lexer->lookahead == '/')
266
+ {
267
+ // Mark end here (token is just the newline, no indentation)
268
+ lexer->mark_end(lexer);
269
+
270
+ // Check if this is a comment and if indented content follows
271
+ advance(lexer); // Consume first '/'
272
+ if (lexer->lookahead == '/')
273
+ {
274
+ // It's a comment, look ahead past it
275
+ if (look_ahead_for_indented_content(lexer))
276
+ {
277
+ lexer->result_symbol = INDENT;
278
+ DEBUG_LOG("[SCANNER] -> INDENT (unindented comment with indented content after)\n");
279
+ return true;
280
+ }
281
+ }
282
+ // Not a comment or no indented content follows - don't match
283
+ DEBUG_LOG("[SCANNER] -> no match (unindented comment without indented content after)\n");
284
+ return false;
285
+ }
286
+
287
+ // Case 3: Blank line (or whitespace-only line)
288
+ // Only match if content follows AND CONTENT_BLANK is valid
289
+ if (at_eol && valid_symbols[CONTENT_BLANK])
290
+ {
291
+ // Mark the end after this blank line
292
+ lexer->mark_end(lexer);
293
+
294
+ // Look ahead to see if indented content follows
295
+ while (is_newline(lexer->lookahead))
296
+ {
297
+ consume_newline(lexer);
298
+
299
+ // Count indent on this next line
300
+ bool next_has_tab = false;
301
+ int next_indent = consume_indentation(lexer, &next_has_tab);
302
+
303
+ // Check what's on this line
304
+ if (!is_newline(lexer->lookahead) && !lexer->eof(lexer))
305
+ {
306
+ // Found a line with content
307
+ if (has_valid_indent(next_indent, next_has_tab))
308
+ {
309
+ // Indented content follows - match CONTENT_BLANK
310
+ lexer->result_symbol = CONTENT_BLANK;
311
+ DEBUG_LOG("[SCANNER] -> CONTENT_BLANK (indented content follows)\n");
312
+ return true;
313
+ }
314
+ else
315
+ {
316
+ // Unindented content (new entry) - don't match
317
+ DEBUG_LOG("[SCANNER] -> no match (unindented content follows)\n");
318
+ return false;
319
+ }
320
+ }
321
+ // Another blank line - continue looking
322
+ }
323
+
324
+ // Reached EOF without finding indented content
325
+ DEBUG_LOG("[SCANNER] -> no match (EOF, no content follows)\n");
326
+ return false;
327
+ }
328
+
329
+ // No match - not a valid indent and not a blank line
330
+ DEBUG_LOG("[SCANNER] -> no match (at_eol=%d, valid_indent=%d)\n", at_eol, valid_indent);
331
+ return false;
332
+ }
333
+
334
+ /**
335
+ * @brief Main scanning function
336
+ *
337
+ * Attempts to recognize external tokens based on what's valid at this position.
338
+ */
339
+ static bool scan(Scanner *scanner, TSLexer *lexer, const bool *valid_symbols)
340
+ {
341
+ (void)scanner; // Currently unused
342
+
343
+ DEBUG_LOG("[SCANNER] called: lookahead='%c'(%d) valid=[%d,%d,%d]\n",
344
+ lexer->lookahead > 31 && lexer->lookahead < 127
345
+ ? (char)lexer->lookahead
346
+ : '?',
347
+ lexer->lookahead, valid_symbols[INDENT], valid_symbols[CONTENT_BLANK],
348
+ valid_symbols[ERROR_SENTINEL]);
349
+
350
+ // Don't produce tokens during error recovery
351
+ if (in_error_recovery(valid_symbols))
352
+ {
353
+ DEBUG_LOG("[SCANNER] error recovery mode, returning false\n");
354
+ return false;
355
+ }
356
+
357
+ // Only scan if we might want INDENT or CONTENT_BLANK
358
+ if (valid_symbols[INDENT] || valid_symbols[CONTENT_BLANK])
359
+ {
360
+ return scan_newline(lexer, valid_symbols);
361
+ }
362
+
363
+ return false;
364
+ }
365
+
366
+ /**
367
+ * @brief Create a new scanner instance
368
+ */
369
+ void *tree_sitter_thalo_external_scanner_create(void)
370
+ {
371
+ Scanner *scanner = ts_calloc(1, sizeof(Scanner));
372
+ return scanner;
373
+ }
374
+
375
/**
 * @brief Release a scanner instance
 *
 * @param payload Pointer previously returned by
 *                tree_sitter_thalo_external_scanner_create()
 */
void tree_sitter_thalo_external_scanner_destroy(void *payload)
{
  ts_free(payload);
}
383
+
384
/**
 * @brief Serialize scanner state into the parser-provided buffer
 *
 * The scanner has no state, so the buffer is left untouched.
 *
 * @param payload Scanner instance (unused)
 * @param buffer  Destination buffer (unused)
 * @return number of bytes written, always 0
 */
unsigned tree_sitter_thalo_external_scanner_serialize(void *payload,
                                                      char *buffer)
{
  const unsigned bytes_written = 0;

  (void)buffer;
  (void)payload;
  return bytes_written;
}
396
+
397
/**
 * @brief Restore scanner state from a previously serialized buffer
 *
 * The scanner has no state, so all arguments are ignored.
 *
 * @param payload Scanner instance (unused)
 * @param buffer  Serialized bytes (unused)
 * @param length  Number of bytes in buffer (unused)
 */
void tree_sitter_thalo_external_scanner_deserialize(void *payload,
                                                    const char *buffer,
                                                    unsigned length)
{
  (void)length;
  (void)buffer;
  (void)payload;
}
410
+
411
+ /**
412
+ * @brief Main entry point for token scanning
413
+ */
414
+ bool tree_sitter_thalo_external_scanner_scan(void *payload, TSLexer *lexer,
415
+ const bool *valid_symbols)
416
+ {
417
+ Scanner *scanner = (Scanner *)payload;
418
+ return scan(scanner, lexer, valid_symbols);
419
+ }
@@ -0,0 +1,54 @@
1
#ifndef TREE_SITTER_ALLOC_H_
#define TREE_SITTER_ALLOC_H_

#ifdef __cplusplus
extern "C" {
#endif

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

// Allocation hooks.
//
// Each ts_* name may be pre-defined by the client. When it is not, it maps to
// the ts_current_* function pointer if TREE_SITTER_REUSE_ALLOCATOR is
// defined, and to the matching C library function otherwise.

#ifdef TREE_SITTER_REUSE_ALLOCATOR
extern void *(*ts_current_malloc)(size_t size);
extern void *(*ts_current_calloc)(size_t count, size_t size);
extern void *(*ts_current_realloc)(void *ptr, size_t size);
extern void (*ts_current_free)(void *ptr);
#endif

#ifndef ts_malloc
#ifdef TREE_SITTER_REUSE_ALLOCATOR
#define ts_malloc ts_current_malloc
#else
#define ts_malloc malloc
#endif
#endif

#ifndef ts_calloc
#ifdef TREE_SITTER_REUSE_ALLOCATOR
#define ts_calloc ts_current_calloc
#else
#define ts_calloc calloc
#endif
#endif

#ifndef ts_realloc
#ifdef TREE_SITTER_REUSE_ALLOCATOR
#define ts_realloc ts_current_realloc
#else
#define ts_realloc realloc
#endif
#endif

#ifndef ts_free
#ifdef TREE_SITTER_REUSE_ALLOCATOR
#define ts_free ts_current_free
#else
#define ts_free free
#endif
#endif

#ifdef __cplusplus
}
#endif

#endif // TREE_SITTER_ALLOC_H_