@nvl/tree-sitter-sveltex 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/scanner.c ADDED
@@ -0,0 +1,847 @@
1
+ // External scanner for the SvelTeX (`.sveltex`) tree-sitter grammar.
2
+ //
3
+ // `grammar.js` parses only the `.sveltex` top-level structure and leaves the
4
+ // embedded languages to injections. The constructs below cannot be expressed
5
+ // with the LR core and are resolved here instead:
6
+ //
7
+ // * `_frontmatter_start` — a `---` / `+++` line opening frontmatter,
8
+ // * `_frontmatter_end` — a `---` / `+++` line closing frontmatter,
9
+ // * `_frontmatter_body` — the lines between the two fences,
10
+ // * `_markdown_chunk` — a maximal run of ordinary Markdown content
11
+ // that stops right before the next
12
+ // `.sveltex`-special construct (or EOF),
13
+ // * `_verbatim_tex_content` — the body of a `<tex>/<latex>/<tikz>` env,
14
+ // * `_verbatim_plain_content`— the body of a `<verb>/<verbatim>` env:
15
+ // everything up to the matching `</tag>`,
16
+ // * `_inline_math_content` — the body of `$ ... $`,
17
+ // * `_display_math_content` — the body of `$$ ... $$`.
18
+ //
19
+ // The scanner is stateless between tokens (no `serialize`/`deserialize`
20
+ // payload), which keeps it trivially correct under tree-sitter's speculative
21
+ // parsing: every decision is recomputed from the input. tree-sitter only
22
+ // marks `_frontmatter_start` valid in the document's initial parse state, so
23
+ // the scanner need not separately verify that it sits at byte offset 0.
24
+
25
+ #include "tree_sitter/parser.h"
26
+
27
+ #include <stdbool.h>
28
+ #include <string.h>
29
+
30
// External token identifiers. The values are positional: each one must equal
// the index of the corresponding entry in the `externals` array of
// `grammar.js`, so the order below must never change independently of it.
enum TokenType {
    FRONTMATTER_START = 0,
    FRONTMATTER_END = 1,
    FRONTMATTER_BODY = 2,
    VERBATIM_TEX_CONTENT = 3,
    VERBATIM_PLAIN_CONTENT = 4,
    INLINE_MATH_CONTENT = 5,
    DISPLAY_MATH_CONTENT = 6,
    MARKDOWN_CHUNK = 7,
    ERROR_SENTINEL = 8,
};
42
+
43
// Names of the verbatim environments the scanner recognises. The list mirrors
// `TEX_VERBATIM_TAGS` / `PLAIN_VERBATIM_TAGS` in `grammar.js` and must be kept
// in step with them. Comparison against this table is case-sensitive, so the
// usual capitalised spellings are listed explicitly.
static const char *const VERBATIM_TAGS[] = {
    "tex",
    "latex",
    "tikz",
    "TeX",
    "LaTeX",
    "TikZ",
    "verb",
    "verbatim",
    "Verb",
    "Verbatim",
};
// Number of entries in `VERBATIM_TAGS`.
static const unsigned VERBATIM_TAG_COUNT =
    (unsigned)(sizeof VERBATIM_TAGS / sizeof *VERBATIM_TAGS);
52
+
53
+ // ── Low-level helpers ────────────────────────────────────────────────────
54
+
55
+ static inline bool is_eof(TSLexer *lexer) { return lexer->eof(lexer); }
56
+
57
+ static inline void advance(TSLexer *lexer) {
58
+ lexer->advance(lexer, false);
59
+ }
60
+
61
// Reports whether `c` may appear in a SvelTeX tag name after its first
// character. Tag names match `[a-zA-Z][-.:0-9_a-zA-Z]*` (see the VS Code
// extension's settings docs); this predicate accepts the trailing class:
// ASCII letters, ASCII digits, and `-`, `.`, `:`, `_`.
static inline bool is_tag_name_char(int32_t c) {
    if (c >= 'a' && c <= 'z') return true;
    if (c >= 'A' && c <= 'Z') return true;
    if (c >= '0' && c <= '9') return true;
    switch (c) {
    case '-':
    case '.':
    case ':':
    case '_':
        return true;
    default:
        return false;
    }
}
68
+
69
// Reports whether `c` is horizontal whitespace: a space or a tab.
static inline bool is_space_or_tab(int32_t c) {
    switch (c) {
    case ' ':
    case '\t':
        return true;
    default:
        return false;
    }
}
72
+
73
+ // Consumes a `\n` or `\r\n` line ending. Returns whether one was consumed.
74
+ static bool consume_line_ending(TSLexer *lexer) {
75
+ if (lexer->lookahead == '\n') {
76
+ advance(lexer);
77
+ return true;
78
+ }
79
+ if (lexer->lookahead == '\r') {
80
+ advance(lexer);
81
+ if (lexer->lookahead == '\n') advance(lexer);
82
+ return true;
83
+ }
84
+ return false;
85
+ }
86
+
87
+ // ── Verbatim-environment look-ahead ──────────────────────────────────────
88
+ //
89
+ // At a `<`, decide whether what follows opens a verbatim environment. The
90
+ // lexer's lookahead is the `<`.
91
+ //
92
+ // Returns the length of the matched tag name (>0) on success and consumes the
93
+ // `<` and the tag name, or returns 0 on failure (having consumed only scratch
94
+ // input). The caller must `mark_end` before calling so a failed probe is
95
+ // discarded.
96
+ static unsigned probe_verbatim_open(TSLexer *lexer, char *out_name,
97
+ unsigned out_cap) {
98
+ advance(lexer); // consume '<'
99
+ if (lexer->lookahead == '/') return 0; // a closing tag, not an opening
100
+
101
+ unsigned len = 0;
102
+ while (len + 1 < out_cap && is_tag_name_char(lexer->lookahead)) {
103
+ out_name[len++] = (char)lexer->lookahead;
104
+ advance(lexer);
105
+ }
106
+ out_name[len] = '\0';
107
+ if (len == 0) return 0;
108
+
109
+ // The char after the name must be whitespace or `>` for this to be a tag.
110
+ int32_t after = lexer->lookahead;
111
+ bool ok = after == '>' || after == ' ' || after == '\t' ||
112
+ after == '\r' || after == '\n';
113
+ if (!ok) return 0;
114
+
115
+ for (unsigned i = 0; i < VERBATIM_TAG_COUNT; i++) {
116
+ if (strcmp(out_name, VERBATIM_TAGS[i]) == 0) return len;
117
+ }
118
+ return 0;
119
+ }
120
+
121
+ // Returns whether `name` is one of the configured verbatim tags.
122
+ static bool is_verbatim_tag(const char *name) {
123
+ for (unsigned i = 0; i < VERBATIM_TAG_COUNT; i++) {
124
+ if (strcmp(name, VERBATIM_TAGS[i]) == 0) return true;
125
+ }
126
+ return false;
127
+ }
128
+
129
+ // ── Frontmatter ──────────────────────────────────────────────────────────
130
+ //
131
+ // A frontmatter fence is a line consisting solely of `---` or `+++` (the
132
+ // `---` form may carry a trailing language keyword on the opening fence; the
133
+ // keyword itself is a separate immediate token in `grammar.js`). The scanner
134
+ // emits three tokens:
135
+ //
136
+ // * FRONTMATTER_START — the opening fence (just the `---`/`+++`),
137
+ // * FRONTMATTER_BODY — every line up to, but excluding, the closing fence,
138
+ // * FRONTMATTER_END — the closing fence line, line ending included.
139
+
140
+ // Scans the opening `---` / `+++`. Only the three fence characters are
141
+ // consumed; a trailing `yaml`/`toml`/`json` keyword (if any) is left for the
142
+ // grammar's immediate token.
143
+ static bool scan_frontmatter_start(TSLexer *lexer) {
144
+ int32_t fence = lexer->lookahead;
145
+ if (fence != '-' && fence != '+') return false;
146
+ for (int i = 0; i < 3; i++) {
147
+ if (lexer->lookahead != fence) return false;
148
+ advance(lexer);
149
+ }
150
+ lexer->result_symbol = FRONTMATTER_START;
151
+ lexer->mark_end(lexer);
152
+ return true;
153
+ }
154
+
155
+ // Returns whether the current line (lexer at its first column) consists
156
+ // solely of a frontmatter language keyword (`yaml`/`toml`/`json`) followed by
157
+ // optional spaces/tabs and a line ending. This is the keyword that may follow
158
+ // the *opening* `---` fence; the body scanner must decline it so the grammar's
159
+ // dedicated `frontmatter_language` token can match it instead. Consumes only
160
+ // scratch input; the caller controls `mark_end`.
161
+ static bool line_is_language_keyword(TSLexer *lexer) {
162
+ static const char *const KEYWORDS[] = {"yaml", "toml", "json"};
163
+ char word[8];
164
+ unsigned len = 0;
165
+ while (len + 1 < sizeof(word) && lexer->lookahead >= 'a' &&
166
+ lexer->lookahead <= 'z') {
167
+ word[len++] = (char)lexer->lookahead;
168
+ advance(lexer);
169
+ }
170
+ word[len] = '\0';
171
+ bool match = false;
172
+ for (unsigned i = 0; i < 3; i++) {
173
+ if (strcmp(word, KEYWORDS[i]) == 0) {
174
+ match = true;
175
+ break;
176
+ }
177
+ }
178
+ if (!match) return false;
179
+ while (is_space_or_tab(lexer->lookahead)) advance(lexer);
180
+ return lexer->lookahead == '\n' || lexer->lookahead == '\r' ||
181
+ is_eof(lexer);
182
+ }
183
+
184
+ // Returns whether the current line (lexer at its first column) is a closing
185
+ // frontmatter fence: exactly `---` or `+++` optionally followed by spaces or
186
+ // tabs, then a line ending or EOF. Does not consume input the caller keeps;
187
+ // the caller controls `mark_end`.
188
+ static bool line_is_fence(TSLexer *lexer) {
189
+ int32_t fence = lexer->lookahead;
190
+ if (fence != '-' && fence != '+') return false;
191
+ for (int i = 0; i < 3; i++) {
192
+ if (lexer->lookahead != fence) return false;
193
+ advance(lexer);
194
+ }
195
+ while (is_space_or_tab(lexer->lookahead)) advance(lexer);
196
+ return lexer->lookahead == '\n' || lexer->lookahead == '\r' ||
197
+ is_eof(lexer);
198
+ }
199
+
200
+ // Scans the closing fence line, line ending included so the frontmatter node
201
+ // ends cleanly on a line boundary. The lexer starts at the fence's first
202
+ // column (the body scanner stops there).
203
+ static bool scan_frontmatter_end(TSLexer *lexer) {
204
+ int32_t fence = lexer->lookahead;
205
+ if (fence != '-' && fence != '+') return false;
206
+ for (int i = 0; i < 3; i++) {
207
+ if (lexer->lookahead != fence) return false;
208
+ advance(lexer);
209
+ }
210
+ while (is_space_or_tab(lexer->lookahead)) advance(lexer);
211
+ if (!is_eof(lexer) && !consume_line_ending(lexer)) return false;
212
+ lexer->result_symbol = FRONTMATTER_END;
213
+ lexer->mark_end(lexer);
214
+ return true;
215
+ }
216
+
217
+ // Scans the body: every line up to, but excluding, the closing fence. An
218
+ // unterminated frontmatter block consumes to EOF and still yields a body so a
219
+ // partial document parses into a stable tree.
220
+ //
221
+ // The token end is `mark_end`-ed at the start of each line *before* probing
222
+ // it for a fence: if the line is the closing fence, that marked position
223
+ // (line start) is exactly where the body must end, and `line_is_fence`'s own
224
+ // advances past the fence are left as scratch. On EOF the end is re-marked at
225
+ // the true cursor.
226
+ static bool scan_frontmatter_body(TSLexer *lexer) {
227
+ lexer->result_symbol = FRONTMATTER_BODY;
228
+ bool consumed = false;
229
+ bool first_line = true;
230
+
231
+ for (;;) {
232
+ if (is_eof(lexer)) {
233
+ // End the body at the current cursor (the marked end still sits
234
+ // at the start of the last consumed line otherwise).
235
+ lexer->mark_end(lexer);
236
+ break;
237
+ }
238
+ // At the start of a line: mark here, then classify the line. The
239
+ // first character is enough to pick the right probe — `-`/`+` can
240
+ // only begin a closing fence, `a`–`z` can only begin the opening
241
+ // fence's language keyword — and the probes are mutually exclusive,
242
+ // so exactly one runs and the line-start `mark_end` stays valid for a
243
+ // fence stop.
244
+ lexer->mark_end(lexer);
245
+ int32_t first = lexer->lookahead;
246
+ if (first == '-' || first == '+') {
247
+ if (line_is_fence(lexer)) break;
248
+ } else if (first_line && first >= 'a' && first <= 'z') {
249
+ // The very first body line might be the opening fence's language
250
+ // keyword (`---toml`). Decline so the grammar's dedicated
251
+ // `frontmatter_language` immediate token can match it.
252
+ if (line_is_language_keyword(lexer)) return false;
253
+ }
254
+ first_line = false;
255
+ // An ordinary body line — consume the rest of it (a probe above may
256
+ // have advanced the cursor partway through) plus its line ending.
257
+ while (!is_eof(lexer) && lexer->lookahead != '\n' &&
258
+ lexer->lookahead != '\r') {
259
+ advance(lexer);
260
+ }
261
+ consume_line_ending(lexer);
262
+ consumed = true;
263
+ }
264
+
265
+ return consumed;
266
+ }
267
+
268
+ // ── Markdown code skipping ───────────────────────────────────────────────
269
+ //
270
+ // SvelTeX treats fenced code blocks and inline code spans as opaque: a `$`,
271
+ // `<tag>` or `\(` inside them is *not* a math/verbatim delimiter (a `$state`
272
+ // rune inside a ```svelte block is the canonical example). The Markdown-chunk
273
+ // scanner must therefore skip over code the same way SvelTeX's escaper does,
274
+ // so those characters stay inside the chunk and reach the `markdown` grammar
275
+ // (which highlights the code and injects the code language) rather than being
276
+ // mis-tokenised as `.sveltex` constructs.
277
+ //
278
+ // These helpers are called from `scan_markdown_chunk` with all of their
279
+ // consumed input *kept* (they are not scratch): the chunk legitimately
280
+ // contains the code.
281
+
282
+ // Consumes a fenced code block whose `open_len`-long opening fence (of
283
+ // character `tick`, ` ` ` ` or `~`) has *already* been consumed by the
284
+ // caller, but whose info-string line has not. On return the rest of the
285
+ // opening line, the content, and the closing fence (or EOF) are all consumed.
286
+ //
287
+ // The closing fence is a line whose first non-indent run is at least
288
+ // `open_len` characters of the same `tick`.
289
+ static void skip_fenced_code_block_after_open(TSLexer *lexer, int32_t tick,
290
+ unsigned open_len) {
291
+ // Consume the rest of the opening fence line (the info string).
292
+ while (!is_eof(lexer) && lexer->lookahead != '\n' &&
293
+ lexer->lookahead != '\r') {
294
+ advance(lexer);
295
+ }
296
+ if (!consume_line_ending(lexer)) return;
297
+
298
+ for (;;) {
299
+ if (is_eof(lexer)) return;
300
+ // Skip up to three leading spaces of indent.
301
+ unsigned indent = 0;
302
+ while (indent < 3 && lexer->lookahead == ' ') {
303
+ advance(lexer);
304
+ indent++;
305
+ }
306
+ if (lexer->lookahead == tick) {
307
+ unsigned close_len = 0;
308
+ while (lexer->lookahead == tick) {
309
+ advance(lexer);
310
+ close_len++;
311
+ }
312
+ if (close_len >= open_len) {
313
+ // A closing fence — consume the rest of its line and stop.
314
+ while (!is_eof(lexer) && lexer->lookahead != '\n' &&
315
+ lexer->lookahead != '\r') {
316
+ advance(lexer);
317
+ }
318
+ consume_line_ending(lexer);
319
+ return;
320
+ }
321
+ }
322
+ // Not a closing fence — consume the rest of the line.
323
+ while (!is_eof(lexer) && lexer->lookahead != '\n' &&
324
+ lexer->lookahead != '\r') {
325
+ advance(lexer);
326
+ }
327
+ if (!consume_line_ending(lexer)) return;
328
+ }
329
+ }
330
+
331
+ // Consumes an inline code span whose `open_len`-long opening backtick run has
332
+ // *already* been consumed by the caller. On return the content and the
333
+ // matching closing run are consumed. A code span is closed by a backtick run
334
+ // of exactly `open_len`. If none is found before EOF the cursor stops at EOF,
335
+ // so the backticks simply degrade to literal text inside the chunk.
336
+ static void skip_inline_code_span_after_open(TSLexer *lexer,
337
+ unsigned open_len) {
338
+ for (;;) {
339
+ if (is_eof(lexer)) return;
340
+ if (lexer->lookahead == '`') {
341
+ unsigned run = 0;
342
+ while (lexer->lookahead == '`') {
343
+ advance(lexer);
344
+ run++;
345
+ }
346
+ if (run == open_len) return; // matched closing run
347
+ // A run of a different length is part of the span content.
348
+ continue;
349
+ }
350
+ advance(lexer);
351
+ }
352
+ }
353
+
354
+ // ── Svelte `<script>` / `<style>` and mustache-tag skipping ──────────────
355
+ //
356
+ // SvelTeX's escaper also treats `<script>` / `<style>` blocks and `{ … }`
357
+ // mustache tags as opaque (see `docs/.../escaping.md`): a `$` inside them is
358
+ // JS/CSS, not math (`import x from '$lib/…'` is the canonical example). The
359
+ // Markdown-chunk scanner skips them so the embedded grammars — not the
360
+ // `.sveltex` math rules — handle their contents.
361
+
362
// Compares an already-read tag `name` with `keyword`, ignoring the ASCII case
// of `name` only: upper-case letters in `name` are folded to lower case, while
// `keyword` is compared as written (callers pass it in lower case). Returns
// true when both strings have the same length and every folded character
// matches.
static bool eq_keyword_ci(const char *name, const char *keyword) {
    unsigned i = 0;
    while (name[i] != '\0' && keyword[i] != '\0') {
        int folded = name[i];
        if (folded >= 'A' && folded <= 'Z') {
            folded += 'a' - 'A';
        }
        if (folded != keyword[i]) {
            return false;
        }
        i++;
    }
    // Equal only if both strings ended at the same position.
    return name[i] == '\0' && keyword[i] == '\0';
}
372
+
373
+ // Case-insensitively compares the upcoming input to `keyword`, consuming the
374
+ // characters as it goes. Returns whether they all matched. The caller has
375
+ // already consumed `<`; this is used to recognise `script` / `style`.
376
+ static bool match_keyword_ci(TSLexer *lexer, const char *keyword) {
377
+ for (const char *k = keyword; *k; k++) {
378
+ int32_t c = lexer->lookahead;
379
+ int32_t lower = (c >= 'A' && c <= 'Z') ? c + 32 : c;
380
+ if (lower != (int32_t)*k) return false;
381
+ advance(lexer);
382
+ }
383
+ return true;
384
+ }
385
+
386
+ // Consumes a `<script …>…</script>` or `<style …>…</style>` element whose
387
+ // opening `<` *and* tag name have already been consumed by the caller (the
388
+ // caller passes which `tag` was matched). The rest of the opening tag, the
389
+ // body and the matching `</tag>` (or EOF) are consumed. A self-closing
390
+ // `<script/>` is handled too.
391
+ static void skip_script_or_style_after_name(TSLexer *lexer,
392
+ const char *tag) {
393
+ // Consume the remainder of the opening tag, up to and including `>`.
394
+ for (;;) {
395
+ if (is_eof(lexer)) return;
396
+ int32_t c = lexer->lookahead;
397
+ if (c == '>') {
398
+ advance(lexer);
399
+ break;
400
+ }
401
+ if (c == '/') {
402
+ advance(lexer);
403
+ if (lexer->lookahead == '>') {
404
+ advance(lexer);
405
+ return; // self-closing `<script/>` — no body
406
+ }
407
+ continue;
408
+ }
409
+ advance(lexer);
410
+ }
411
+ // Consume the body up to the matching `</tag>` (case-insensitive).
412
+ for (;;) {
413
+ if (is_eof(lexer)) return;
414
+ if (lexer->lookahead == '<') {
415
+ advance(lexer);
416
+ if (lexer->lookahead == '/') {
417
+ advance(lexer);
418
+ if (match_keyword_ci(lexer, tag)) {
419
+ // Skip optional whitespace then require `>`.
420
+ while (lexer->lookahead == ' ' ||
421
+ lexer->lookahead == '\t' ||
422
+ lexer->lookahead == '\r' ||
423
+ lexer->lookahead == '\n') {
424
+ advance(lexer);
425
+ }
426
+ if (lexer->lookahead == '>') {
427
+ advance(lexer);
428
+ return;
429
+ }
430
+ }
431
+ }
432
+ continue;
433
+ }
434
+ advance(lexer);
435
+ }
436
+ }
437
+
438
+ // Consumes a balanced `{ … }` mustache tag whose opening `{` has *already*
439
+ // been consumed by the caller. Nested braces are tracked so a `{ {x} }`
440
+ // expression is consumed as a whole; an unbalanced tag consumes to EOF.
441
+ // Strings and template literals inside the expression are skipped so a `}`
442
+ // (or a `$`) inside a string literal does not end the tag prematurely.
443
+ static void skip_mustache_after_open(TSLexer *lexer) {
444
+ unsigned depth = 1;
445
+ for (;;) {
446
+ if (is_eof(lexer)) return;
447
+ int32_t c = lexer->lookahead;
448
+ if (c == '{') {
449
+ depth++;
450
+ advance(lexer);
451
+ continue;
452
+ }
453
+ if (c == '}') {
454
+ depth--;
455
+ advance(lexer);
456
+ if (depth == 0) return;
457
+ continue;
458
+ }
459
+ if (c == '\'' || c == '"' || c == '`') {
460
+ // Skip a string / template literal verbatim.
461
+ int32_t quote = c;
462
+ advance(lexer);
463
+ for (;;) {
464
+ if (is_eof(lexer)) return;
465
+ int32_t s = lexer->lookahead;
466
+ if (s == '\\') {
467
+ advance(lexer);
468
+ if (!is_eof(lexer)) advance(lexer);
469
+ continue;
470
+ }
471
+ if (s == quote) {
472
+ advance(lexer);
473
+ break;
474
+ }
475
+ advance(lexer);
476
+ }
477
+ continue;
478
+ }
479
+ advance(lexer);
480
+ }
481
+ }
482
+
483
+ // ── `_markdown_chunk` ────────────────────────────────────────────────────
484
+ //
485
+ // Consume a maximal run of ordinary content. The run stops just before the
486
+ // next `.sveltex`-special construct:
487
+ // * a verbatim opening tag `<tag …>` for a configured tag,
488
+ // * a `$` (single- or double-dollar math fence),
489
+ // * `\(` or `\[` (escaped-delimiter math).
490
+ //
491
+ // Fenced code blocks and inline code spans are skipped over wholesale (see
492
+ // the helpers above), so delimiter-like characters inside code never end the
493
+ // run.
494
+ //
495
+ // An empty result would loop forever, so it fails the token instead (which
496
+ // only happens on an empty document or when the cursor already sits on a
497
+ // boundary — both handled by the surrounding grammar).
498
+ //
499
+ // Whenever the run stops *at a boundary*, the token must end at that boundary
500
+ // — not at the cursor, which the look-ahead probes may have advanced past as
501
+ // scratch. Each boundary branch therefore `mark_end`s at the boundary and
502
+ // returns directly; only the EOF and end-of-loop paths `mark_end` at the
503
+ // cursor.
504
+ static bool scan_markdown_chunk(TSLexer *lexer) {
505
+ lexer->result_symbol = MARKDOWN_CHUNK;
506
+ bool consumed = false;
507
+ // Whether the cursor is at the first column of a line (modulo indent).
508
+ bool at_line_start = true;
509
+
510
+ for (;;) {
511
+ if (is_eof(lexer)) break;
512
+
513
+ int32_t here = lexer->lookahead;
514
+
515
+ // Fenced code block: a ``` or ~~~ run at the start of a line. Skipped
516
+ // wholesale so its contents never end the Markdown run.
517
+ if (at_line_start && (here == '`' || here == '~')) {
518
+ // A fence needs at least three of the same character.
519
+ // `skip_fenced_code_block` handles indent itself, but here the
520
+ // cursor is already past any indent (see the space branch below).
521
+ // Probe the run length without losing the position: only commit
522
+ // to a fenced block for a 3+ run, otherwise treat ``` as inline.
523
+ if (here == '`') {
524
+ // Could be a fenced block (3+) or an inline span (any run).
525
+ // `skip_inline_code_span` handles a 1–2 run; a 3+ run at line
526
+ // start is a fenced block. Distinguish by counting first.
527
+ lexer->mark_end(lexer); // boundary fallback (unused on skip)
528
+ unsigned run = 0;
529
+ // The probe advances the cursor; those advances are kept text
530
+ // regardless of the branch taken, so `consumed` is set.
531
+ while (lexer->lookahead == '`') {
532
+ advance(lexer);
533
+ run++;
534
+ }
535
+ consumed = true;
536
+ if (run >= 3) {
537
+ // Fenced block: consume content until the closing fence.
538
+ // The opening run is already consumed; resume from there.
539
+ skip_fenced_code_block_after_open(lexer, '`', run);
540
+ } else {
541
+ // Inline span opened by a 1–2 run: find the closing run.
542
+ skip_inline_code_span_after_open(lexer, run);
543
+ }
544
+ at_line_start = false;
545
+ continue;
546
+ }
547
+ // `~` only ever begins a fenced block (no inline `~` spans).
548
+ unsigned run = 0;
549
+ while (lexer->lookahead == '~') {
550
+ advance(lexer);
551
+ run++;
552
+ }
553
+ consumed = true;
554
+ if (run >= 3) {
555
+ skip_fenced_code_block_after_open(lexer, '~', run);
556
+ }
557
+ at_line_start = false;
558
+ continue;
559
+ }
560
+
561
+ if (here == '`') {
562
+ // An inline code span not at line start.
563
+ unsigned run = 0;
564
+ while (lexer->lookahead == '`') {
565
+ advance(lexer);
566
+ run++;
567
+ }
568
+ consumed = true;
569
+ skip_inline_code_span_after_open(lexer, run);
570
+ at_line_start = false;
571
+ continue;
572
+ }
573
+
574
+ if (here == '$') {
575
+ // A `$` always ends the Markdown run; the math rules decide
576
+ // whether it is inline or display. The cursor is exactly at the
577
+ // boundary, so mark and stop here.
578
+ lexer->mark_end(lexer);
579
+ return consumed;
580
+ }
581
+
582
+ if (here == '<') {
583
+ // Probe the tag. `mark_end` first so the boundary (the `<`) is
584
+ // the token end if this turns out to be a verbatim environment.
585
+ lexer->mark_end(lexer);
586
+ char name[32];
587
+ unsigned len = probe_verbatim_open(lexer, name, sizeof(name));
588
+ if (len > 0) {
589
+ // A verbatim environment starts here — the chunk ends at the
590
+ // already-marked `<`.
591
+ return consumed;
592
+ }
593
+ // Not verbatim. `probe_verbatim_open` left the (lower-case-able)
594
+ // tag name in `name`; a `<script>` / `<style>` element is opaque
595
+ // to SvelTeX, so skip the whole element — the cursor is already
596
+ // positioned right after the tag name.
597
+ if (eq_keyword_ci(name, "script")) {
598
+ skip_script_or_style_after_name(lexer, "script");
599
+ consumed = true;
600
+ at_line_start = false;
601
+ continue;
602
+ }
603
+ if (eq_keyword_ci(name, "style")) {
604
+ skip_script_or_style_after_name(lexer, "style");
605
+ consumed = true;
606
+ at_line_start = false;
607
+ continue;
608
+ }
609
+ // Any other `<` is ordinary Markdown/HTML/Svelte text. The probe
610
+ // consumed the `<` (and any partial name) as scratch; that scratch
611
+ // is now kept text, so the next loop iteration's `mark_end` (or
612
+ // the final one) includes it.
613
+ consumed = true;
614
+ at_line_start = false;
615
+ continue;
616
+ }
617
+
618
+ if (here == '{') {
619
+ // A Svelte mustache tag / logic-block delimiter. SvelTeX escapes
620
+ // `{ … }`, so a `$` inside it is JS, not math — skip the balanced
621
+ // tag wholesale.
622
+ advance(lexer); // consume '{'
623
+ skip_mustache_after_open(lexer);
624
+ consumed = true;
625
+ at_line_start = false;
626
+ continue;
627
+ }
628
+
629
+ if (here == '\\') {
630
+ // Could be `\(` / `\[` math, or an ordinary backslash escape.
631
+ lexer->mark_end(lexer);
632
+ advance(lexer); // scratch: consume '\'
633
+ int32_t n = lexer->lookahead;
634
+ if (n == '(' || n == '[') {
635
+ // Escaped-delimiter math starts here — body ends at the `\`.
636
+ return consumed;
637
+ }
638
+ // Ordinary escape — keep the `\` and the next char.
639
+ consumed = true;
640
+ at_line_start = false;
641
+ if (!is_eof(lexer)) advance(lexer);
642
+ continue;
643
+ }
644
+
645
+ // Ordinary character. A newline puts the cursor at a line start;
646
+ // leading spaces/tabs keep it there (so an indented fence still
647
+ // counts as a fence opener).
648
+ if (here == '\n' || here == '\r') {
649
+ at_line_start = true;
650
+ } else if (here != ' ' && here != '\t') {
651
+ at_line_start = false;
652
+ }
653
+ advance(lexer);
654
+ consumed = true;
655
+ }
656
+
657
+ // Reached EOF: the whole remaining input is Markdown.
658
+ if (consumed) {
659
+ lexer->mark_end(lexer);
660
+ return true;
661
+ }
662
+ return false;
663
+ }
664
+
665
+ // ── Verbatim body scanners ───────────────────────────────────────────────
666
+ //
667
+ // Consume everything up to (but excluding) the matching `</tag>`. The closing
668
+ // tag is matched by the LR grammar. An unterminated environment consumes to
669
+ // EOF and still yields a (non-empty) body so the partial tree is stable.
670
+ //
671
+ // `lexer` starts right after the opening tag's `>`.
672
+ static bool scan_verbatim_body(TSLexer *lexer, enum TokenType result) {
673
+ lexer->result_symbol = result;
674
+ bool consumed = false;
675
+
676
+ for (;;) {
677
+ if (is_eof(lexer)) break;
678
+
679
+ if (lexer->lookahead == '<') {
680
+ // Mark the end *before* the `<` so a found `</tag>` is excluded
681
+ // from the body.
682
+ lexer->mark_end(lexer);
683
+ advance(lexer);
684
+ if (lexer->lookahead == '/') {
685
+ advance(lexer);
686
+ char name[32];
687
+ unsigned len = 0;
688
+ while (len + 1 < sizeof(name) &&
689
+ is_tag_name_char(lexer->lookahead)) {
690
+ name[len++] = (char)lexer->lookahead;
691
+ advance(lexer);
692
+ }
693
+ name[len] = '\0';
694
+ if (len > 0 && lexer->lookahead == '>' &&
695
+ is_verbatim_tag(name)) {
696
+ // A real `</tag>` — stop, body excludes it.
697
+ if (consumed) return true;
698
+ // Zero-width body: let the grammar's `optional` body
699
+ // handle it by failing this token.
700
+ return false;
701
+ }
702
+ }
703
+ // A `<` that is not a verbatim close tag: part of the body.
704
+ consumed = true;
705
+ continue;
706
+ }
707
+
708
+ advance(lexer);
709
+ consumed = true;
710
+ }
711
+
712
+ // Reached EOF without a close tag.
713
+ if (consumed) {
714
+ lexer->mark_end(lexer);
715
+ return true;
716
+ }
717
+ return false;
718
+ }
719
+
720
+ // ── Dollar-math body scanners ────────────────────────────────────────────
721
+ //
722
+ // Consume the body of `$ … $` or `$$ … $$`. The lexer starts right after the
723
+ // opening fence. The body excludes the closing fence, which the LR grammar
724
+ // matches. A `\$` is an escaped dollar and does not close the math.
725
+ static bool scan_math_body(TSLexer *lexer, enum TokenType result,
726
+ bool display) {
727
+ lexer->result_symbol = result;
728
+ bool consumed = false;
729
+
730
+ for (;;) {
731
+ if (is_eof(lexer)) break;
732
+
733
+ if (lexer->lookahead == '\\') {
734
+ // Escape: keep the backslash and the next char verbatim.
735
+ advance(lexer);
736
+ consumed = true;
737
+ if (!is_eof(lexer)) advance(lexer);
738
+ continue;
739
+ }
740
+
741
+ if (lexer->lookahead == '$') {
742
+ // Potential closing fence — exclude it from the body.
743
+ lexer->mark_end(lexer);
744
+ advance(lexer);
745
+ bool second_dollar = (lexer->lookahead == '$');
746
+ if (display) {
747
+ // `$$` closes display math; a lone `$` inside is body.
748
+ if (second_dollar) {
749
+ return consumed; // empty body -> grammar `optional`
750
+ }
751
+ } else {
752
+ // A single `$` closes inline math. Even if it is the first
753
+ // `$` of a `$$`, the body still ends here.
754
+ return consumed;
755
+ }
756
+ // A lone `$` in display math: part of the body.
757
+ consumed = true;
758
+ continue;
759
+ }
760
+
761
+ advance(lexer);
762
+ consumed = true;
763
+ }
764
+
765
+ if (consumed) {
766
+ lexer->mark_end(lexer);
767
+ return true;
768
+ }
769
+ return false;
770
+ }
771
+
772
+ // ── tree-sitter entry points ─────────────────────────────────────────────
773
+
774
// Creates the scanner payload. The scanner keeps no state between tokens, so
// there is nothing to allocate and the payload is NULL.
void *tree_sitter_sveltex_external_scanner_create(void) {
    return NULL;
}
775
+
776
// Releases the scanner payload. Nothing was allocated, so this does nothing;
// `payload` is ignored.
void tree_sitter_sveltex_external_scanner_destroy(void *payload) {
    (void)payload;  // intentionally unused
}
779
+
780
// Serialises the scanner state into `buffer`. The scanner is stateless, so
// nothing is written and the reported length is 0. Both arguments are
// ignored.
unsigned tree_sitter_sveltex_external_scanner_serialize(void *payload,
                                                        char *buffer) {
    (void)buffer;
    (void)payload;
    const unsigned bytes_written = 0;  // stateless
    return bytes_written;
}
786
+
787
// Restores scanner state from `buffer`. There is no state to restore, so all
// three arguments are ignored and nothing happens.
void tree_sitter_sveltex_external_scanner_deserialize(void *payload,
                                                      const char *buffer,
                                                      unsigned length) {
    (void)length;
    (void)buffer;
    (void)payload;
}
794
+
795
+ bool tree_sitter_sveltex_external_scanner_scan(void *payload, TSLexer *lexer,
796
+ const bool *valid_symbols) {
797
+ (void)payload;
798
+
799
+ // tree-sitter sets the error-sentinel slot while recovering from a parse
800
+ // error. The scanner has nothing useful to contribute then; declining
801
+ // lets the LR error recovery proceed.
802
+ if (valid_symbols[ERROR_SENTINEL]) {
803
+ return false;
804
+ }
805
+
806
+ // Frontmatter fences. `_frontmatter_start` is only valid in the document's
807
+ // initial state, so emitting it whenever it is valid and the input looks
808
+ // right is correct.
809
+ if (valid_symbols[FRONTMATTER_START] &&
810
+ (lexer->lookahead == '-' || lexer->lookahead == '+')) {
811
+ if (scan_frontmatter_start(lexer)) return true;
812
+ }
813
+ if (valid_symbols[FRONTMATTER_END] &&
814
+ (lexer->lookahead == '-' || lexer->lookahead == '+')) {
815
+ if (scan_frontmatter_end(lexer)) return true;
816
+ }
817
+ if (valid_symbols[FRONTMATTER_BODY]) {
818
+ if (scan_frontmatter_body(lexer)) return true;
819
+ // No body (closing fence immediately follows the opening fence); fall
820
+ // through so `_frontmatter_end` can be tried at the same position.
821
+ if (valid_symbols[FRONTMATTER_END] &&
822
+ (lexer->lookahead == '-' || lexer->lookahead == '+')) {
823
+ if (scan_frontmatter_end(lexer)) return true;
824
+ }
825
+ }
826
+
827
+ // Body tokens are mutually exclusive with `_markdown_chunk` at any given
828
+ // position, so the order of these checks does not matter for correctness —
829
+ // tree-sitter only marks the symbols valid in the current parse state.
830
+ if (valid_symbols[VERBATIM_TEX_CONTENT]) {
831
+ return scan_verbatim_body(lexer, VERBATIM_TEX_CONTENT);
832
+ }
833
+ if (valid_symbols[VERBATIM_PLAIN_CONTENT]) {
834
+ return scan_verbatim_body(lexer, VERBATIM_PLAIN_CONTENT);
835
+ }
836
+ if (valid_symbols[DISPLAY_MATH_CONTENT]) {
837
+ return scan_math_body(lexer, DISPLAY_MATH_CONTENT, true);
838
+ }
839
+ if (valid_symbols[INLINE_MATH_CONTENT]) {
840
+ return scan_math_body(lexer, INLINE_MATH_CONTENT, false);
841
+ }
842
+ if (valid_symbols[MARKDOWN_CHUNK]) {
843
+ return scan_markdown_chunk(lexer);
844
+ }
845
+
846
+ return false;
847
+ }