@bearcove/monaco-lang-styx 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,496 @@
1
+ import type * as monaco from 'monaco-editor';
2
+
3
// ---------------------------------------------------------------------------
// Styx tokenizer state
// ---------------------------------------------------------------------------

/** Kind of container the tokenizer can be nested in. */
type ContextType = 'object' | 'sequence';

/** Which half of an object entry the next atom is expected to fill. */
type EntryPhase = 'key' | 'value';

/** Bookkeeping for a heredoc whose body spans the following lines. */
interface HeredocState {
  /** Delimiter word captured from the `<<DELIM` opener. */
  delimiter: string;
  /** Optional language hint captured from `<<DELIM,lang`; `null` when absent. */
  language: string | null;
  /**
   * Intended to hold the indentation of the closing delimiter.
   * NOTE(review): nothing in this file assigns a non-null value — confirm whether it is still needed.
   */
  indentation: string | null;
}

/** Line-to-line tokenizer state handed back and forth with Monaco. */
interface StyxState extends monaco.languages.IState {
  /** Open containers, outermost first; each entry is 'object' or 'sequence'. */
  contextStack: ContextType[];
  /** Current entry phase (only meaningful while the innermost context is an object). */
  entryPhase: EntryPhase;
  /** Set while the tokenizer is inside heredoc content, otherwise `null`. */
  heredoc: HeredocState | null;
  /** Number of `#` characters of the raw string being continued, otherwise `null`. */
  rawStringHashes: number | null;
  /** True while a quoted string is being continued from an earlier line. */
  inString: boolean;
  /** Whether the string being continued started in key position. */
  stringIsKey: boolean;
}
27
+
28
/**
 * Builds the tokenizer state used for the first line of a document.
 *
 * The state starts inside one implicit root object, expecting a key, with no
 * heredoc, raw string or quoted string in progress.
 *
 * @returns A fresh state carrying its own `clone` and `equals` members.
 */
function createInitialState(): StyxState {
  // Both members are written against `this`, so every copy can reuse the very
  // same two function objects.
  function clone(this: StyxState): StyxState {
    const heredocCopy = this.heredoc ? { ...this.heredoc } : null;
    return {
      contextStack: this.contextStack.slice(),
      entryPhase: this.entryPhase,
      heredoc: heredocCopy,
      rawStringHashes: this.rawStringHashes,
      inString: this.inString,
      stringIsKey: this.stringIsKey,
      clone: this.clone,
      equals: this.equals,
    };
  }

  function equals(this: StyxState, other: monaco.languages.IState): boolean {
    const that = other as StyxState;

    // Compare the context stacks element by element.
    if (this.contextStack.length !== that.contextStack.length) {
      return false;
    }
    for (let i = 0; i < this.contextStack.length; i++) {
      if (this.contextStack[i] !== that.contextStack[i]) {
        return false;
      }
    }

    if (this.entryPhase !== that.entryPhase) {
      return false;
    }
    // Heredoc records are small plain objects; compare them by serialized form.
    if (JSON.stringify(this.heredoc) !== JSON.stringify(that.heredoc)) {
      return false;
    }
    if (this.rawStringHashes !== that.rawStringHashes) {
      return false;
    }
    if (this.inString !== that.inString) {
      return false;
    }
    return this.stringIsKey === that.stringIsKey;
  }

  return {
    contextStack: ['object'], // document is an implicit root object
    entryPhase: 'key',
    heredoc: null,
    rawStringHashes: null,
    inString: false,
    stringIsKey: false,
    clone,
    equals,
  };
}
62
+
63
/**
 * Returns the innermost open context.
 *
 * @param state Tokenizer state whose `contextStack` is inspected.
 * @returns The last stack entry, or 'object' when the stack is empty.
 */
function currentContext(state: StyxState): ContextType {
  const { contextStack } = state;
  const innermost = contextStack[contextStack.length - 1];
  return innermost || 'object';
}

/**
 * Tells whether the innermost open context is an object.
 *
 * @param state Tokenizer state to inspect.
 * @returns `true` when `currentContext(state)` is 'object'.
 */
function isInObjectContext(state: StyxState): boolean {
  const innermost = currentContext(state);
  return innermost === 'object';
}
70
+
71
/**
 * Scope names emitted by the tokenizer and handed to Monaco.
 * Grouped by role; the values are the strings Monaco themes match against.
 */
const TOKEN = {
  // Trivia
  WHITE: 'white',
  COMMENT: 'comment',
  COMMENT_DOC: 'comment.doc',

  // Atoms in key position
  KEY: 'key',
  STRING_KEY: 'string.key',
  TAG_KEY: 'tag.key',

  // Atoms in value position
  VALUE: 'value',
  TAG: 'tag',
  STRING: 'string',
  STRING_HEREDOC: 'string.heredoc',
  STRING_ESCAPE: 'string.escape',

  // Punctuation
  DELIMITER_CURLY: 'delimiter.curly',
  DELIMITER_PAREN: 'delimiter.parenthesis',
  DELIMITER_COMMA: 'delimiter.comma',

  // Anything the tokenizer does not recognise
  INVALID: 'invalid',
};
89
+
90
/** One token of a line, in the shape the tokenizer returns to Monaco. */
interface Token {
  /** Zero-based offset in the line at which this token begins. */
  startIndex: number;
  /** Scope name for the token (one of the `TOKEN` values, or an embedded language's token type). */
  scopes: string;
}
94
+
95
// ---------------------------------------------------------------------------
// Regex patterns. All are anchored with `^`, so they are meant to be applied
// to the remainder of the line starting at the current position.
// ---------------------------------------------------------------------------

/** A run of spaces and tabs. */
const WHITESPACE = /^[ \t]+/;

/** `///` doc comment running to the end of the line. */
const DOC_COMMENT = /^\/\/\/.*/;

/** `//` comment running to the end of the line (also matches `///`, so test DOC_COMMENT first). */
const LINE_COMMENT = /^\/\/.*/;

/** `@name` tag: a letter or underscore, then letters, digits, `_` or `-`. */
const TAG_IDENT = /^@[A-Za-z_][A-Za-z0-9_-]*/;

/** A lone `@` that is not the start of a tag identifier. */
const UNIT = /^@(?![A-Za-z_])/;

/** Heredoc opener `<<DELIM` with an optional `,lang` hint; group 1 is the delimiter, group 2 the language. */
const HEREDOC_START = /^<<([A-Z][A-Z0-9_]*)(?:,([a-z][a-z0-9_.-]*))?/;

/** Raw string opener `r#"`, `r##"`, ...; group 1 holds the `#` run. */
const RAW_STRING_START = /^r(#+)"/;

// Bare scalar: the first char must not be in the forbidden set; later chars
// additionally allow `@` and `=`, but still not `>`.
const BARE_FIRST_CHAR = /^[^\s{}()\,\"=@>\r\n]/;
const BARE_CONT_CHAR = /^[^\s{}()\,\">\r\n]/;
106
+
107
/**
 * Monaco tokens provider for the Styx language.
 *
 * Tokenization is line based and context aware: the state carried between
 * lines records the stack of open objects/sequences, whether the next atom is
 * a key or a value, and any heredoc, raw string or quoted string that is still
 * open. Heredoc bodies with a language hint are optionally delegated to
 * Monaco's tokenizer for that language.
 */
export class StyxTokensProvider implements monaco.languages.TokensProvider {
  private monacoEditor: typeof monaco.editor | undefined;

  /**
   * @param monacoEditor Optional monaco.editor reference for embedded language tokenization.
   *                     If not provided, heredocs will be styled as plain heredoc strings.
   */
  constructor(monacoEditor?: typeof monaco.editor) {
    this.monacoEditor = monacoEditor;
  }

  /** @returns The state for the first line: an implicit root object expecting a key. */
  getInitialState(): monaco.languages.IState {
    return createInitialState();
  }

  /**
   * Tokenizes one line.
   *
   * @param line Text of the line, without its line terminator.
   * @param inputState State produced by the previous line (or the initial state); it is cloned, never mutated.
   * @returns The tokens of the line and the state to feed into the next line.
   */
  tokenize(line: string, inputState: monaco.languages.IState): monaco.languages.ILineTokens {
    const state = (inputState as StyxState).clone() as StyxState;
    const tokens: Token[] = [];
    let pos = 0;

    const addToken = (start: number, type: string) => {
      tokens.push({ startIndex: start, scopes: type });
    };

    // Scope for a bare scalar or tag: key-flavoured only in object context while a key is expected.
    const atomType = (isTag: boolean): string => {
      const expectsKey = isInObjectContext(state) && state.entryPhase === 'key';
      if (expectsKey) {
        return isTag ? TOKEN.TAG_KEY : TOKEN.KEY;
      }
      return isTag ? TOKEN.TAG : TOKEN.VALUE;
    };

    // After an atom in object context the phase flips key <-> value.
    // In sequence context the phase is left alone.
    const afterAtom = () => {
      if (isInObjectContext(state)) {
        state.entryPhase = state.entryPhase === 'key' ? 'value' : 'key';
      }
    };

    // Leaves the innermost container. The implicit root is never popped, so a
    // stray closer cannot empty the stack. If the parent is an object that was
    // waiting for a value, the container just closed was that value.
    const closeContext = () => {
      if (state.contextStack.length > 1) {
        state.contextStack.pop();
      }
      if (isInObjectContext(state) && state.entryPhase === 'value') {
        state.entryPhase = 'key';
      }
    };

    // --- Heredoc content -------------------------------------------------
    if (state.heredoc) {
      // The closing line is the delimiter alone, optionally surrounded by whitespace.
      if (line.trim() === state.heredoc.delimiter) {
        addToken(0, TOKEN.STRING_HEREDOC);
        state.heredoc = null;
        afterAtom();
        return { tokens, endState: state };
      }

      // Content line: try the embedded language's tokenizer when a hint and an editor are available.
      const lang = state.heredoc.language;
      if (lang && this.monacoEditor) {
        try {
          const embeddedTokens = this.monacoEditor.tokenize(line, lang);
          if (embeddedTokens.length > 0 && embeddedTokens[0].length > 0) {
            for (const token of embeddedTokens[0]) {
              tokens.push({
                startIndex: token.offset,
                scopes: token.type,
              });
            }
            return { tokens, endState: state };
          }
        } catch {
          // Best effort: language not available, fall back to heredoc style below.
        }
      }

      addToken(0, TOKEN.STRING_HEREDOC);
      return { tokens, endState: state };
    }

    // --- Continued quoted string ----------------------------------------
    if (state.inString) {
      const tokenType = state.stringIsKey ? TOKEN.STRING_KEY : TOKEN.STRING;
      const scan = this.scanQuotedString(line, pos, tokenType, tokens, true);
      pos = scan.pos;
      if (!scan.closed) {
        // String continues to next line (invalid in Styx, but highlight gracefully).
        if (tokens.length === 0) {
          addToken(0, tokenType);
        }
        return { tokens, endState: state };
      }
      state.inString = false;
      afterAtom();
    }

    // --- Continued raw string -------------------------------------------
    if (state.rawStringHashes !== null) {
      const tokenType = state.stringIsKey ? TOKEN.STRING_KEY : TOKEN.STRING;
      const closePattern = '"' + '#'.repeat(state.rawStringHashes);
      const closeIdx = line.indexOf(closePattern, pos);
      addToken(pos, tokenType);
      if (closeIdx < 0) {
        // No closing on this line (including an empty line): stay inside the
        // raw string and keep the entry phase untouched.
        return { tokens, endState: state };
      }
      pos = closeIdx + closePattern.length;
      state.rawStringHashes = null;
      afterAtom();
    }

    // --- Main tokenization loop -----------------------------------------
    while (pos < line.length) {
      const rest = line.slice(pos);
      let match: RegExpMatchArray | null;

      // Whitespace
      if ((match = rest.match(WHITESPACE))) {
        addToken(pos, TOKEN.WHITE);
        pos += match[0].length;
        continue;
      }

      // Doc comment (must be tried before the plain line comment)
      if ((match = rest.match(DOC_COMMENT))) {
        addToken(pos, TOKEN.COMMENT_DOC);
        pos += match[0].length;
        continue;
      }

      // Line comment
      if ((match = rest.match(LINE_COMMENT))) {
        addToken(pos, TOKEN.COMMENT);
        pos += match[0].length;
        continue;
      }

      // Opening brace - starts object context, whose first atom is a key
      if (rest[0] === '{') {
        addToken(pos, TOKEN.DELIMITER_CURLY);
        pos++;
        state.contextStack.push('object');
        state.entryPhase = 'key';
        continue;
      }

      // Closing brace - ends object context
      if (rest[0] === '}') {
        addToken(pos, TOKEN.DELIMITER_CURLY);
        pos++;
        closeContext();
        continue;
      }

      // Opening paren - starts sequence context
      if (rest[0] === '(') {
        addToken(pos, TOKEN.DELIMITER_PAREN);
        pos++;
        state.contextStack.push('sequence');
        continue;
      }

      // Closing paren - ends sequence context
      if (rest[0] === ')') {
        addToken(pos, TOKEN.DELIMITER_PAREN);
        pos++;
        closeContext();
        continue;
      }

      // Comma - ends the current entry in object context; the phase is not touched in a sequence
      if (rest[0] === ',') {
        addToken(pos, TOKEN.DELIMITER_COMMA);
        pos++;
        if (isInObjectContext(state)) {
          state.entryPhase = 'key';
        }
        continue;
      }

      // Heredoc opener - the body starts on the next line
      if ((match = rest.match(HEREDOC_START))) {
        addToken(pos, TOKEN.STRING_HEREDOC);
        state.heredoc = {
          delimiter: match[1],
          language: match[2] || null,
          indentation: null,
        };
        return { tokens, endState: state };
      }

      // Raw string
      if ((match = rest.match(RAW_STRING_START))) {
        const hashes = match[1].length;
        const isKey = isInObjectContext(state) && state.entryPhase === 'key';
        addToken(pos, isKey ? TOKEN.STRING_KEY : TOKEN.STRING);
        pos += match[0].length;

        const closePattern = '"' + '#'.repeat(hashes);
        const closeIdx = line.indexOf(closePattern, pos);
        if (closeIdx < 0) {
          // Continues to next line
          state.rawStringHashes = hashes;
          state.stringIsKey = isKey;
          return { tokens, endState: state };
        }
        pos = closeIdx + closePattern.length;
        afterAtom();
        continue;
      }

      // Quoted string
      if (rest[0] === '"') {
        const isKey = isInObjectContext(state) && state.entryPhase === 'key';
        const tokenType = isKey ? TOKEN.STRING_KEY : TOKEN.STRING;
        addToken(pos, tokenType);

        const scan = this.scanQuotedString(line, pos + 1, tokenType, tokens, false);
        pos = scan.pos;
        if (!scan.closed) {
          // Unclosed string, continues to next line. Return right away so the
          // end-of-line phase reset below does not run while the atom is open.
          state.inString = true;
          state.stringIsKey = isKey;
          return { tokens, endState: state };
        }
        afterAtom();
        continue;
      }

      // Unit (@)
      if ((match = rest.match(UNIT))) {
        addToken(pos, atomType(true));
        pos += match[0].length;
        afterAtom();
        continue;
      }

      // Tag with identifier
      if ((match = rest.match(TAG_IDENT))) {
        addToken(pos, atomType(true));
        pos += match[0].length;

        // A tag immediately followed (no whitespace) by a payload forms one
        // atom together with that payload, so the phase is advanced by the
        // payload's own handling rather than here.
        const afterTag = line.slice(pos);
        const hasImmediatePayload =
          afterTag[0] === '{' ||
          afterTag[0] === '(' ||
          afterTag[0] === '"' ||
          RAW_STRING_START.test(afterTag) ||
          HEREDOC_START.test(afterTag);
        if (!hasImmediatePayload) {
          // Tag with no immediate payload (tagged unit or standalone tag)
          afterAtom();
        }
        continue;
      }

      // Bare scalar
      if (BARE_FIRST_CHAR.test(rest)) {
        const startPos = pos;
        pos++;
        // BARE_CONT_CHAR checks a single character, so test it directly
        // instead of slicing the remainder of the line on every step.
        while (pos < line.length && BARE_CONT_CHAR.test(line.charAt(pos))) {
          pos++;
        }
        addToken(startPos, atomType(false));
        afterAtom();
        continue;
      }

      // Unknown character - mark as invalid
      addToken(pos, TOKEN.INVALID);
      pos++;
    }

    // End of line - in object context, newline ends the entry
    if (isInObjectContext(state)) {
      state.entryPhase = 'key';
    }

    return { tokens, endState: state };
  }

  /**
   * Scans the inside of a quoted string and appends its tokens.
   *
   * Each backslash followed by a character becomes a `string.escape` token;
   * the text following an escape gets a fresh `tokenType` token.
   *
   * @param line Full text of the line.
   * @param start Offset of the first character inside the string.
   * @param tokenType Scope for the ordinary string text.
   * @param tokens Token list to append to.
   * @param needsToken Whether the first ordinary character still needs a
   *                   `tokenType` token (true when continuing from a previous line).
   * @returns The offset just past the last consumed character, and whether a
   *          closing quote was consumed.
   */
  private scanQuotedString(
    line: string,
    start: number,
    tokenType: string,
    tokens: Token[],
    needsToken: boolean,
  ): { pos: number; closed: boolean } {
    let pos = start;
    let pending = needsToken;

    while (pos < line.length) {
      const ch = line[pos];

      // Escape sequence: a backslash that has a following character on this line.
      if (ch === '\\' && pos + 1 < line.length) {
        tokens.push({ startIndex: pos, scopes: TOKEN.STRING_ESCAPE });
        pos += 2;
        pending = true;
        continue;
      }

      if (pending) {
        tokens.push({ startIndex: pos, scopes: tokenType });
        pending = false;
      }
      pos++;
      if (ch === '"') {
        return { pos, closed: true };
      }
    }

    return { pos, closed: false };
  }
}