sommark 3.3.4 → 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. package/README.md +98 -82
  2. package/assets/logo.json +28 -0
  3. package/assets/smark.logo.png +0 -0
  4. package/assets/smark.logo.svg +21 -0
  5. package/cli/cli.mjs +7 -17
  6. package/cli/commands/build.js +24 -4
  7. package/cli/commands/color.js +22 -26
  8. package/cli/commands/help.js +10 -10
  9. package/cli/commands/init.js +20 -31
  10. package/cli/commands/print.js +18 -16
  11. package/cli/commands/show.js +4 -0
  12. package/cli/commands/version.js +6 -0
  13. package/cli/constants.js +9 -5
  14. package/cli/helpers/config.js +11 -0
  15. package/cli/helpers/file.js +17 -6
  16. package/cli/helpers/transpile.js +7 -12
  17. package/core/errors.js +49 -25
  18. package/core/formats.js +7 -3
  19. package/core/formatter.js +215 -0
  20. package/core/helpers/config-loader.js +29 -74
  21. package/core/labels.js +21 -9
  22. package/core/lexer.js +491 -212
  23. package/core/modules.js +164 -0
  24. package/core/parser.js +516 -389
  25. package/core/tokenTypes.js +36 -1
  26. package/core/transpiler.js +237 -154
  27. package/core/validator.js +79 -0
  28. package/formatter/mark.js +203 -43
  29. package/formatter/tag.js +202 -32
  30. package/grammar.ebnf +57 -50
  31. package/helpers/colorize.js +26 -13
  32. package/helpers/escapeHTML.js +13 -6
  33. package/helpers/kebabize.js +6 -0
  34. package/helpers/peek.js +9 -0
  35. package/helpers/removeChar.js +26 -13
  36. package/helpers/safeDataParser.js +114 -0
  37. package/helpers/utils.js +140 -158
  38. package/index.js +198 -188
  39. package/mappers/languages/html.js +105 -213
  40. package/mappers/languages/json.js +122 -171
  41. package/mappers/languages/markdown.js +355 -108
  42. package/mappers/languages/mdx.js +76 -120
  43. package/mappers/languages/xml.js +114 -0
  44. package/mappers/mapper.js +152 -123
  45. package/mappers/shared/index.js +22 -0
  46. package/package.json +26 -6
  47. package/SOMMARK-SPEC.md +0 -481
  48. package/cli/commands/list.js +0 -124
  49. package/constants/html_tags.js +0 -146
  50. package/core/pluginManager.js +0 -149
  51. package/core/plugins/comment-remover.js +0 -47
  52. package/core/plugins/module-system.js +0 -176
  53. package/core/plugins/raw-content-plugin.js +0 -78
  54. package/core/plugins/rules-validation-plugin.js +0 -231
  55. package/core/plugins/sommark-format.js +0 -244
  56. package/coverage_test.js +0 -21
  57. package/debug.js +0 -15
  58. package/helpers/camelize.js +0 -2
  59. package/helpers/defaultTheme.js +0 -3
  60. package/test_format_fix.js +0 -42
  61. package/v3-todo.smark +0 -73
package/core/lexer.js CHANGED
@@ -5,247 +5,526 @@ import { lexerError } from "./errors.js";
5
5
 
6
6
  /**
7
7
  * SomMark Lexer
8
+ *
9
+ * Transforms a raw SomMark source string into a stream of tokens.
10
+ * It uses a state-machine approach to handle complex contexts like At-Block bodies,
11
+ * quoted values, and hierarchical headers.
12
+ *
13
+ * @param {string} src - The raw SomMark source code.
14
+ * @param {string} [filename="anonymous"] - Source filename for error reporting.
15
+ * @returns {Array<Object>} Array of token objects.
8
16
  */
17
+ function lexer(src, filename = "anonymous") {
18
+ if (!src || typeof src !== "string") return [];
19
+ const tokens = [];
9
20
 
10
- // ========================================================================== //
11
- // Helper Functions //
12
- // ========================================================================== //
21
+ let prev_type = "";
22
+ let last_non_junk_type = ""; // Tracks the last real token for context guessing
23
+ let i = 0;
24
+ let line = 0, character = 0;
13
25
 
14
- const atBlockEndRegex = new RegExp(`^@_\\s*${end_keyword}\\s*_@`);
26
+ // State Variables
27
+ let isInAtBlockBody = false;
28
+ let isInQuote = false;
29
+ let isInHeader = false; // Tracks if we are in a structural header context
30
+ let isInInlineHead = false; // Specific for (key:val) after ->
31
+ let parenDepth = 0; // To track balanced parentheses in inlines
32
+ let delimiterStack = []; // To track block nesting for body mode
15
33
 
16
- // Checks if we reached the end of an At-Block
17
- function isAtBlockEnd(input, index) {
18
- const slice = typeof input === "string" ? input.slice(index, index + 100) : input.slice(index, index + 100).join("");
19
- return atBlockEndRegex.test(slice);
20
- }
34
+ /**
35
+ * Adds a token to the stream and updates the scanner's position tracking.
36
+ *
37
+ * @param {string} type - The type of token (from TOKEN_TYPES).
38
+ * @param {string} value - The literal text content of the token.
39
+ */
40
+ function addToken(type, value) {
41
+ const start = { line, character };
21
42
 
22
- // Collects characters inside a quoted string
23
- function concatQuote(input, index) {
24
- let text = "\"";
25
- for (let i = index + 1; i < input.length; i++) {
26
- const char = input[i];
27
- if (char === "\\" && peek(input, i, 1) === "\"") {
28
- text += "\\\"";
29
- i++;
30
- continue;
43
+ // Update position
44
+ const parts = value.split("\n");
45
+ if (parts.length > 1) {
46
+ line += parts.length - 1;
47
+ character = parts[parts.length - 1].length;
48
+ } else {
49
+ character += value.length;
50
+ }
51
+
52
+ const end = { line, character };
53
+ tokens.push({
54
+ type,
55
+ value,
56
+ source: filename,
57
+ range: { start, end },
58
+ depth: delimiterStack.length
59
+ });
60
+
61
+ prev_type = type;
62
+ if (type !== TOKEN_TYPES.WHITESPACE && type !== TOKEN_TYPES.COMMENT) {
63
+ if (type !== TOKEN_TYPES.TEXT || value.trim() !== "") {
64
+ last_non_junk_type = type;
65
+ }
31
66
  }
32
- text += char;
33
- if (char === "\"") return text;
34
67
  }
35
- lexerError(["[Lexer Error]: Unclosed quote"]);
36
- return text;
37
- }
38
68
 
39
- // Collects plain text until a special character is found
40
- function concatText(input, index, isInHeader, isInAtBlockBody, isLiberalValue = false) {
41
- let text = "";
42
- if (index >= input.length) return text;
43
- for (let i = index; i < input.length; i++) {
44
- const char = input[i];
45
- const stopConditions = [
46
- ["[", !isInAtBlockBody],
47
- ["(", !isInAtBlockBody],
48
- ["#", !isInAtBlockBody && !isLiberalValue],
49
- ["=", isInHeader && !isInAtBlockBody],
50
- ["\"", isInHeader],
51
- ["]", isInHeader],
52
- [")", isInHeader],
53
- ["-", peek(input, i, 1) === ">" && (isInHeader || true)],
54
- ["@", peek(input, i, 1) === "_" && (!isInAtBlockBody || isAtBlockEnd(input, i))],
55
- ["_", peek(input, i, 1) === "@" && isInHeader],
56
- ["\\", true],
57
- [":", isInHeader && !isInAtBlockBody],
58
- [";", isInHeader],
59
- [",", isInHeader]
60
- ];
61
- let shouldStop = false;
62
- for (const [stopChar, conditionMet] of stopConditions) {
63
- if (conditionMet && input.substring(i, i + stopChar.length) === stopChar) {
64
- shouldStop = true;
65
- break;
66
- }
67
- }
68
- if (shouldStop) break;
69
- text += char;
69
+ /**
70
+ * Looks ahead to find the next structural character, skipping whitespace and comments.
71
+ * Used for context-guessing (e.g., distinguishing KEY from VALUE).
72
+ *
73
+ * @param {number} start - Index to start peeking from.
74
+ * @returns {string|null} The next structural character or null if EOF.
75
+ */
76
+ function peekStructural(start) {
77
+ let j = start;
78
+ while (j < src.length) {
79
+ const c = src[j];
80
+ if (c === " " || c === "\t" || c === "\n" || c === "\r") {
81
+ j++;
82
+ continue;
83
+ }
84
+ if (c === "#") {
85
+ while (j < src.length && src[j] !== "\n") j++;
86
+ continue;
87
+ }
88
+ if (c === "\\") {
89
+ // Escape sequence: jump over the backslash and the escaped char
90
+ j += 2;
91
+ continue;
92
+ }
93
+ return c;
94
+ }
95
+ return null;
70
96
  }
71
- return text;
72
- }
73
97
 
74
- // Handles backslash escapes in the text
75
- function concatEscape(input, index) {
76
- if (index >= input.length) return "";
77
- const nextChar = peek(input, index, 1);
78
- const WHITESPACES = [" ", "\t", "\n", "\r", "\v", "\f"];
79
- if (WHITESPACES.includes(nextChar)) lexerError(["[Lexer Error]: Invalid escape sequence (escaped whitespace)"]);
80
- if (input[index] === "\\" && nextChar !== null) return "\\" + nextChar;
81
- lexerError(["[Lexer Error]: Invalid escape sequence"]);
82
- return "";
83
- }
98
+ while (i < src.length) {
99
+ // --- PHASE 1: AT-BLOCK BODY MODE ---
100
+ // In this mode, we consume everything as raw text until we hit the @_ marker.
101
+ if (isInAtBlockBody) {
102
+ if (src[i] === "@" && src[i + 1] === "_") {
103
+ isInAtBlockBody = false;
104
+ } else {
105
+ let body = "";
106
+ while (i < src.length) {
107
+ // Handle escapes in At-Block Body
108
+ if (src[i] === "\\" && i + 1 < src.length) {
109
+ body += src[i + 1];
110
+ i += 2;
111
+ continue;
112
+ }
113
+ // Stop at end marker
114
+ if (src[i] === "@" && src[i + 1] === "_") {
115
+ break;
116
+ }
117
+ body += src[i];
118
+ i++;
119
+ }
120
+ if (body.length > 0) {
121
+ addToken(TOKEN_TYPES.TEXT, body);
122
+ }
123
+ continue;
124
+ }
125
+ }
126
+ const char = src[i];
127
+ const next = src[i + 1];
84
128
 
85
- // ========================================================================== //
86
- // Main Lexer Function //
87
- // ========================================================================== //
129
+ // --- PHASE 2: QUOTE MODE ---
130
+ // Handles balanced strings and allows prefix layers (js{}, p{}) inside them.
131
+ if (isInQuote) {
132
+ let quoteValue = "";
133
+ const quoteChar = tokens[tokens.length - 1].value;
134
+ while (i < src.length) {
135
+ if (src[i] === "\\" && i + 1 < src.length) {
136
+ // Inside quotes, we split escapes if we want to match reliability tests
137
+ if (quoteValue.length > 0) addToken(TOKEN_TYPES.VALUE, quoteValue);
138
+ addToken(TOKEN_TYPES.ESCAPE, "\\" + src[i + 1]);
139
+ quoteValue = "";
140
+ i += 2;
141
+ continue;
142
+ }
143
+
144
+ // Support Prefix Layers inside quotes!
145
+ if ((src[i] === "j" && src[i+1] === "s" && src[i+2] === "{") || (src[i] === "p" && src[i+1] === "{")) {
146
+ const isJS = (src[i] === "j");
147
+ if (quoteValue.length > 0) {
148
+ addToken(TOKEN_TYPES.VALUE, quoteValue);
149
+ quoteValue = "";
150
+ }
151
+
152
+ let braceDepth = 1;
153
+ let prefixValue = isJS ? "js{" : "p{";
154
+ i += isJS ? 3 : 2;
88
155
 
89
- function lexer(src) {
90
- if (!src || typeof src !== "string") return [];
91
- const tokens = [];
92
- let isInHeader = false, isInAtBlockBody = false;
93
- let line = 0, character = 0, depth_stack = [];
94
-
95
- // ========================================================================== //
96
- // Token Creation Helpers //
97
- // ========================================================================== //
98
-
99
- function addToken(type, value, rawValue) {
100
- if (typeof rawValue === "string" && typeof value === "string" && rawValue !== value) {
101
- const offset = rawValue.indexOf(value);
102
- if (offset !== -1) {
103
- advance(rawValue.slice(0, offset));
104
- const startPos = { line, character }; advance(value);
105
- const endPos = { line, character };
106
- tokens.push({ type, value, range: { start: startPos, end: endPos }, depth: depth_stack.length });
107
- advance(rawValue.slice(offset + value.length));
108
- return;
109
- }
110
- }
111
- const startPos = { line, character }; advance(rawValue || value);
112
- const endPos = { line, character };
113
- tokens.push({ type, value, range: { start: startPos, end: endPos }, depth: depth_stack.length });
114
- }
156
+ let internalString = null;
157
+ while (i < src.length && braceDepth > 0) {
158
+ const c = src[i];
159
+ const n = src[i + 1];
160
+ if (internalString) {
161
+ if (c === "\\" && (n === internalString || n === "\\")) {
162
+ prefixValue += c + n;
163
+ i += 2;
164
+ continue;
165
+ }
166
+ if (c === internalString) internalString = null;
167
+ } else {
168
+ if (c === "\"" || c === "'") internalString = c;
169
+ else if (c === "{") braceDepth++;
170
+ else if (c === "}") braceDepth--;
171
+ }
172
+ prefixValue += c;
173
+ i++;
174
+ }
175
+ addToken(isJS ? TOKEN_TYPES.PREFIX_JS : TOKEN_TYPES.PREFIX_P, prefixValue);
176
+ continue;
177
+ }
115
178
 
116
- function advance(text) {
117
- const newlines = (text.match(/\n/g) || []).length;
118
- if (newlines > 0) { line += newlines; character = text.split("\n").pop().length; }
119
- else character += text.length;
120
- }
179
+ if (src[i] === quoteChar) {
180
+ // Guess role based on next structural character
181
+ let nextStructural = peekStructural(i + 1);
182
+ let tokenType = (isInHeader || isInInlineHead) && (nextStructural === ":" || nextStructural === "=")
183
+ ? TOKEN_TYPES.KEY
184
+ : TOKEN_TYPES.VALUE;
121
185
 
122
- function validateIdentifier(id, charPos) {
123
- if (!/^[a-zA-Z0-9\-_$]+$/.test(id.trim())) {
124
- lexerError([`[Lexer Error]: Invalid Identifier: '${id.trim()}' at line ${line + 1}, col ${charPos || character}`]);
186
+ if (quoteValue.length > 0) addToken(tokenType, quoteValue);
187
+ addToken(TOKEN_TYPES.QUOTE, quoteChar);
188
+ isInQuote = false;
189
+ i++;
190
+ break;
191
+ }
192
+ quoteValue += src[i];
193
+ i++;
194
+ }
195
+ if (!isInQuote) continue;
125
196
  }
126
- }
127
197
 
128
- // ========================================================================== //
129
- // Main Tokenization Loop //
130
- // ========================================================================== //
131
-
132
- for (let i = 0; i < src.length; i++) {
133
- const char = src[i];
134
- const next = peek(src, i, 1);
198
+ // --- PHASE 3: STRUCTURAL PARSING ---
199
+ // Handles markers, whitespace, and structural symbols.
135
200
 
136
- // ========================================================================== //
137
- // Look back at previous tokens to determine current context //
138
- // ========================================================================== //
139
- let prev_type = "", prev_prev_type = "", count = 0;
140
- for (let j = tokens.length - 1; j >= 0; j--) {
141
- const t = tokens[j];
142
- if (t.type !== TOKEN_TYPES.TEXT && t.type !== TOKEN_TYPES.COMMENT) {
143
- if (count === 0) prev_type = t.type;
144
- else if (count === 1) prev_prev_type = t.type;
145
- count++; if (count >= 2) break;
146
- }
147
- }
148
-
149
- // ========================================================================== //
150
- // Check for structural characters ([ ], ( ), @_, _@) //
151
- // ========================================================================== //
152
-
153
- if (char === "[" && !isInAtBlockBody) {
154
- let idPeek = ""; for (let j = i + 1; j < src.length && !/[=\]:#]/.test(src[j]); j++) idPeek += src[j];
155
- if (idPeek.trim() !== end_keyword) depth_stack.push("B");
156
- addToken(TOKEN_TYPES.OPEN_BRACKET, char); isInHeader = true;
157
- } else if (char === "]" && isInHeader) {
158
- addToken(TOKEN_TYPES.CLOSE_BRACKET, char); isInHeader = false;
159
- // Reliable depth pop on [end]
160
- for (let j = tokens.length - 1; j >= 0; j--) {
161
- const t = tokens[j];
162
- if (t.type === TOKEN_TYPES.IDENTIFIER || t.type === TOKEN_TYPES.END_KEYWORD) {
163
- if (t.type === TOKEN_TYPES.END_KEYWORD || t.value.trim() === end_keyword) depth_stack.pop();
164
- break;
201
+ // WHITESPACE
202
+ if (char === "\n") {
203
+ addToken(TOKEN_TYPES.WHITESPACE, char);
204
+ i++;
205
+ continue;
206
+ }
207
+
208
+ if (char === " " || char === "\t" || char === "\r") {
209
+ let ws = "";
210
+ while (i < src.length && (src[i] === " " || src[i] === "\t" || src[i] === "\r")) {
211
+ ws += src[i];
212
+ i++;
213
+ }
214
+ addToken(TOKEN_TYPES.WHITESPACE, ws);
215
+ continue;
216
+ }
217
+
218
+ // COMMENTS
219
+ if (char === "#") {
220
+ let comm = "";
221
+ while (i < src.length && src[i] !== "\n") {
222
+ comm += src[i];
223
+ i++;
224
+ }
225
+ addToken(TOKEN_TYPES.COMMENT, comm);
226
+ continue;
227
+ }
228
+
229
+ // ESCAPE CHARACTER (Sequence-based)
230
+ if (char === "\\") {
231
+ const seq = i + 1 < src.length ? "\\" + src[i + 1] : "\\";
232
+ addToken(TOKEN_TYPES.ESCAPE, seq);
233
+ i += seq.length;
234
+ continue;
235
+ }
236
+
237
+ // PREFIX LAYERS (js{...} or p{...})
238
+ if ((char === "j" && next === "s" && src[i+2] === "{") || (char === "p" && next === "{")) {
239
+ const isJS = (char === "j");
240
+ const isP = (char === "p");
241
+
242
+ // Context Check
243
+ const top = (delimiterStack.length > 0) ? delimiterStack[delimiterStack.length - 1] : null;
244
+ const isInBlockHeader = isInHeader && top === "[";
245
+ const isInNormalText = !isInHeader && !isInInlineHead && !isInAtBlockBody && parenDepth === 0;
246
+
247
+ let allowed = false;
248
+ if (isJS && isInBlockHeader) allowed = true;
249
+ if (isP && (isInBlockHeader || isInNormalText)) allowed = true;
250
+
251
+ if (allowed) {
252
+ let braceDepth = 1;
253
+ let prefixValue = isJS ? "js{" : "p{";
254
+ i += isJS ? 3 : 2;
255
+
256
+ let inString = null; // Track if we are inside " " or ' '
257
+ while (i < src.length && braceDepth > 0) {
258
+ const c = src[i];
259
+ const n = src[i + 1];
260
+
261
+ if (inString) {
262
+ if (c === "\\" && (n === inString || n === "\\")) {
263
+ prefixValue += c + n;
264
+ i += 2;
265
+ continue;
266
+ }
267
+ if (c === inString) inString = null;
268
+ } else {
269
+ if (c === "\"" || c === "'") inString = c;
270
+ else if (c === "{") braceDepth++;
271
+ else if (c === "}") braceDepth--;
272
+ }
273
+ prefixValue += c;
274
+ i++;
165
275
  }
276
+ addToken(isJS ? TOKEN_TYPES.PREFIX_JS : TOKEN_TYPES.PREFIX_P, prefixValue);
277
+ continue;
166
278
  }
167
- } else if (char === "(" && !isInAtBlockBody) {
168
- addToken(TOKEN_TYPES.OPEN_PAREN, char); isInHeader = true;
169
- } else if (char === ")" && isInHeader) {
170
- addToken(TOKEN_TYPES.CLOSE_PAREN, char); isInHeader = false;
171
- } else if (char === "@" && next === "_" && (!isInAtBlockBody || isAtBlockEnd(src, i))) {
172
- let idPeek = ""; for (let j = i + 2; j < src.length && !/[_@:#]/.test(src[j]); j++) idPeek += src[j];
173
- if (idPeek.trim() !== end_keyword) depth_stack.push("A");
174
- addToken(TOKEN_TYPES.OPEN_AT, "@_"); i++; isInHeader = true;
175
- } else if (char === "_" && next === "@" && (isInHeader || isInAtBlockBody)) {
176
- addToken(TOKEN_TYPES.CLOSE_AT, "_@"); i++;
177
- for (let j = tokens.length - 1; j >= 0; j--) {
178
- const t = tokens[j];
179
- if (t.type === TOKEN_TYPES.IDENTIFIER || t.type === TOKEN_TYPES.END_KEYWORD) {
180
- if (t.type === TOKEN_TYPES.END_KEYWORD || t.value.trim() === end_keyword) depth_stack.pop();
181
- break;
279
+ // If not allowed, it will fall through to normal word scanning
280
+ }
281
+
282
+ // MULTI-CHAR MARKERS
283
+ if (char === "@" && next === "_") {
284
+ addToken(TOKEN_TYPES.OPEN_AT, "@_");
285
+ i += 2;
286
+ if (!isInAtBlockBody) delimiterStack.push("@");
287
+ isInHeader = true; // At-Blocks start with a header part
288
+ continue;
289
+ }
290
+ if (char === "-" && next === ">") {
291
+ if (isInAtBlockBody || (parenDepth > 0 && !isInInlineHead)) {
292
+ addToken(TOKEN_TYPES.TEXT, "-");
293
+ i++; // Swallowed one char
294
+ } else {
295
+ addToken(TOKEN_TYPES.THIN_ARROW, "->");
296
+ i += 2;
297
+ isInInlineHead = true; // The following ( ) will be structural
298
+ }
299
+ continue;
300
+ }
301
+
302
+ // SINGLE-CHAR MARKERS
303
+ if (char === "[") {
304
+ if (isInAtBlockBody || (parenDepth > 0 && !isInInlineHead)) {
305
+ addToken(TOKEN_TYPES.TEXT, "[");
306
+ } else {
307
+ addToken(TOKEN_TYPES.OPEN_BRACKET, "[");
308
+ delimiterStack.push("[");
309
+ isInHeader = true;
310
+ }
311
+ i++;
312
+ continue;
313
+ }
314
+ if (char === "_" && next === "@") {
315
+ if (isInAtBlockBody || (parenDepth > 0 && !isInInlineHead)) {
316
+ addToken(TOKEN_TYPES.TEXT, "_@");
317
+ } else {
318
+ const lastRealType = last_non_junk_type;
319
+ addToken(TOKEN_TYPES.CLOSE_AT, "_@");
320
+ const top = delimiterStack[delimiterStack.length - 1];
321
+ if (top === "@") {
322
+ if (lastRealType === TOKEN_TYPES.END_KEYWORD) {
323
+ delimiterStack.pop();
324
+ isInAtBlockBody = false;
325
+ isInHeader = false;
326
+ }
182
327
  }
183
328
  }
184
- isInHeader = true; isInAtBlockBody = false;
185
- } else if (char === ";" && isInHeader) {
186
- addToken(TOKEN_TYPES.SEMICOLON, char); isInHeader = false; isInAtBlockBody = true;
187
- } else if (char === "=" && isInHeader && !isInAtBlockBody) {
188
- addToken(TOKEN_TYPES.EQUAL, char);
189
- } else if (char === ":" && isInHeader && !isInAtBlockBody && (prev_type === TOKEN_TYPES.IDENTIFIER || prev_type === TOKEN_TYPES.CLOSE_AT)) {
190
- addToken(TOKEN_TYPES.COLON, char);
191
- } else if (char === "," && isInHeader) {
192
- addToken(TOKEN_TYPES.COMMA, char);
193
- } else if (char === "-" && next === ">" && (isInHeader || prev_type === TOKEN_TYPES.CLOSE_PAREN)) {
194
- addToken(TOKEN_TYPES.THIN_ARROW, "->"); i++;
195
- } else if (char === "\"" && isInHeader) {
196
- const quote = concatQuote(src, i); addToken(TOKEN_TYPES.VALUE, quote); i += quote.length - 1;
197
- } else if (char === "\\") {
198
- const esc = concatEscape(src, i); addToken(TOKEN_TYPES.ESCAPE, esc); i += esc.length - 1;
199
- } else if (char === "#" && !isInAtBlockBody) {
200
- let comm = ""; for (; i < src.length && src[i] !== "\n"; i++) comm += src[i];
201
- addToken(TOKEN_TYPES.COMMENT, comm, comm); i--;
202
- } else if (char === "\n" && !isInAtBlockBody) {
203
- advance(char);
204
- } else {
205
- // ========================================================================== //
206
- // Capture plain text or Identifier values //
207
- // ========================================================================== //
208
- const isValueContext = (prev_type === TOKEN_TYPES.COLON || prev_type === TOKEN_TYPES.EQUAL);
209
- const context = concatText(src, i, isInHeader, isInAtBlockBody, isValueContext);
210
- if (context.length > 0) {
211
- if (isInHeader) {
212
- const trimmed = context.trim();
213
- if ((prev_type === TOKEN_TYPES.OPEN_BRACKET || prev_type === TOKEN_TYPES.OPEN_AT) && trimmed === end_keyword) {
214
- addToken(TOKEN_TYPES.END_KEYWORD, trimmed, context);
215
- } else if (trimmed.length > 0) {
216
- let isNextColon = false;
217
- for (let j = i + context.length; j < src.length; j++) {
218
- const c = src[j];
219
- if (c === " " || c === "\t" || c === "\n") continue;
220
- if (c === ":") isNextColon = true;
221
- break;
222
- }
223
-
224
- const isBlockStart = (prev_type === TOKEN_TYPES.OPEN_BRACKET || prev_type === TOKEN_TYPES.OPEN_AT);
225
- const isMapperHead = (prev_type === TOKEN_TYPES.OPEN_PAREN && prev_prev_type === TOKEN_TYPES.THIN_ARROW);
226
- const isMandatoryId = (isNextColon || prev_type === TOKEN_TYPES.THIN_ARROW);
227
-
228
- if (isBlockStart || isMapperHead || isMandatoryId) {
229
- validateIdentifier(trimmed, character + context.indexOf(trimmed));
230
- addToken(TOKEN_TYPES.IDENTIFIER, trimmed, context);
231
- } else {
232
- addToken(TOKEN_TYPES.VALUE, trimmed, context);
329
+ i += 2;
330
+ continue;
331
+ }
332
+ if (char === "]") {
333
+ if (isInAtBlockBody || (parenDepth > 0 && !isInInlineHead)) {
334
+ addToken(TOKEN_TYPES.TEXT, "]");
335
+ } else {
336
+ addToken(TOKEN_TYPES.CLOSE_BRACKET, "]");
337
+ isInHeader = false;
338
+ }
339
+ i++;
340
+ continue;
341
+ }
342
+ if (char === "(") {
343
+ if (isInAtBlockBody || (parenDepth > 0 && !isInInlineHead)) {
344
+ addToken(TOKEN_TYPES.TEXT, "(");
345
+ parenDepth++;
346
+ } else {
347
+ addToken(TOKEN_TYPES.OPEN_PAREN, "(");
348
+ parenDepth++;
349
+ }
350
+ i++;
351
+ continue;
352
+ }
353
+ if (char === ")") {
354
+ if (isInAtBlockBody || (parenDepth > 1 && !isInInlineHead)) {
355
+ addToken(TOKEN_TYPES.TEXT, ")");
356
+ parenDepth--;
357
+ } else if (parenDepth > 0) {
358
+ // This ends the content part if depth drops to 0
359
+ parenDepth--;
360
+ if (parenDepth === 0) {
361
+ addToken(TOKEN_TYPES.CLOSE_PAREN, ")");
362
+ if (isInInlineHead) isInInlineHead = false;
363
+ } else {
364
+ addToken(TOKEN_TYPES.TEXT, ")");
365
+ }
366
+ } else {
367
+ addToken(TOKEN_TYPES.TEXT, ")");
368
+ }
369
+ i++;
370
+ continue;
371
+ }
372
+ if (char === ":") {
373
+ if (isInAtBlockBody || (parenDepth > 0 && !isInInlineHead)) {
374
+ addToken(TOKEN_TYPES.TEXT, ":");
375
+ } else {
376
+ const allowed = [TOKEN_TYPES.IDENTIFIER, TOKEN_TYPES.KEY, TOKEN_TYPES.CLOSE_AT, TOKEN_TYPES.VALUE, TOKEN_TYPES.ESCAPE, TOKEN_TYPES.QUOTE, TOKEN_TYPES.PREFIX_JS, TOKEN_TYPES.PREFIX_P, TOKEN_TYPES.IMPORT, TOKEN_TYPES.USE_MODULE, TOKEN_TYPES.END_KEYWORD, TOKEN_TYPES.TEXT];
377
+ if (allowed.includes(last_non_junk_type)) {
378
+ addToken(TOKEN_TYPES.COLON, ":");
379
+ isInHeader = true;
380
+ } else {
381
+ addToken(TOKEN_TYPES.TEXT, ":");
382
+ }
383
+ }
384
+ i++;
385
+ continue;
386
+ }
387
+ if (char === "=") {
388
+ if (isInAtBlockBody || (parenDepth > 0 && !isInInlineHead)) {
389
+ addToken(TOKEN_TYPES.TEXT, "=");
390
+ } else {
391
+ const allowed = [TOKEN_TYPES.IDENTIFIER, TOKEN_TYPES.KEY, TOKEN_TYPES.ESCAPE, TOKEN_TYPES.QUOTE, TOKEN_TYPES.PREFIX_JS, TOKEN_TYPES.PREFIX_P, TOKEN_TYPES.IMPORT, TOKEN_TYPES.USE_MODULE, TOKEN_TYPES.END_KEYWORD, TOKEN_TYPES.TEXT];
392
+ if (allowed.includes(last_non_junk_type)) {
393
+ addToken(TOKEN_TYPES.EQUAL, "=");
394
+ } else {
395
+ addToken(TOKEN_TYPES.TEXT, "=");
396
+ }
397
+ }
398
+ i++;
399
+ continue;
400
+ }
401
+ if (char === ",") {
402
+ if (isInAtBlockBody || (parenDepth > 0 && !isInInlineHead)) {
403
+ addToken(TOKEN_TYPES.TEXT, ",");
404
+ } else {
405
+ const allowed = [TOKEN_TYPES.VALUE, TOKEN_TYPES.IDENTIFIER, TOKEN_TYPES.QUOTE, TOKEN_TYPES.ESCAPE, TOKEN_TYPES.PREFIX_JS, TOKEN_TYPES.PREFIX_P, TOKEN_TYPES.IMPORT, TOKEN_TYPES.USE_MODULE, TOKEN_TYPES.END_KEYWORD, TOKEN_TYPES.TEXT];
406
+ if (allowed.includes(last_non_junk_type)) {
407
+ addToken(TOKEN_TYPES.COMMA, ",");
408
+ } else {
409
+ addToken(TOKEN_TYPES.TEXT, ",");
410
+ }
411
+ }
412
+ i++;
413
+ continue;
414
+ }
415
+ if (char === ";") {
416
+ if (isInAtBlockBody || (parenDepth > 0 && !isInInlineHead)) {
417
+ addToken(TOKEN_TYPES.TEXT, ";");
418
+ } else {
419
+ const allowed = [TOKEN_TYPES.IDENTIFIER, TOKEN_TYPES.VALUE, TOKEN_TYPES.CLOSE_AT, TOKEN_TYPES.CLOSE_PAREN, TOKEN_TYPES.ESCAPE, TOKEN_TYPES.QUOTE, TOKEN_TYPES.PREFIX_JS, TOKEN_TYPES.PREFIX_P, TOKEN_TYPES.IMPORT, TOKEN_TYPES.USE_MODULE, TOKEN_TYPES.END_KEYWORD, TOKEN_TYPES.TEXT];
420
+ if (allowed.includes(last_non_junk_type)) {
421
+ addToken(TOKEN_TYPES.SEMICOLON, ";");
422
+ isInHeader = false; // Semicolon ends the At-Block header
423
+ // Trigger body mode for At-Blocks
424
+ if (delimiterStack.length > 0) {
425
+ const top = delimiterStack[delimiterStack.length - 1];
426
+ if (top === "@") {
427
+ isInAtBlockBody = true;
233
428
  }
234
- } else {
235
- advance(context);
236
429
  }
237
430
  } else {
238
- addToken(TOKEN_TYPES.TEXT, context);
431
+ addToken(TOKEN_TYPES.TEXT, ";");
432
+ }
433
+ }
434
+ i++;
435
+ continue;
436
+ }
437
+ if (char === "\"" || char === "'") {
438
+ const valTriggers = [TOKEN_TYPES.COLON, TOKEN_TYPES.EQUAL, TOKEN_TYPES.COMMA, TOKEN_TYPES.ESCAPE, TOKEN_TYPES.OPEN_BRACKET, TOKEN_TYPES.OPEN_AT];
439
+ const wasValueTrigger = valTriggers.includes(last_non_junk_type);
440
+ addToken(TOKEN_TYPES.QUOTE, char);
441
+ i++;
442
+ // Enable quote mode
443
+ // NOTE: We allow quotes basically anywhere in headers as values/keys
444
+ if (isInHeader || wasValueTrigger) {
445
+ isInQuote = true;
446
+ }
447
+ continue;
448
+ }
449
+
450
+ // --- PHASE 4: WORD / TEXT SCANNING ---
451
+ // This is the "Fallback" mode where we scan for identifiers, keys, or values.
452
+ // It uses lookahead and context variables to guess the role of a word.
453
+ let word = "";
454
+ // Only Blocks ([ ]) allow ':' in their main identifier.
455
+ // At-Blocks (@_) and Inlines (->( )) do NOT allow ':' in the ID.
456
+ const isStartOfBlockId = (last_non_junk_type === TOKEN_TYPES.OPEN_BRACKET);
457
+
458
+ let stopChars = "[](){}:=;,@_>\"'#\\ \t\n\r";
459
+ if (isStartOfBlockId || (parenDepth > 0 && !isInInlineHead)) {
460
+ stopChars = stopChars.replace(":", "");
461
+ }
462
+ if (!isInHeader && !isInInlineHead) {
463
+ stopChars = "[]@_()\\#\n\r"; // In normal text, stop at markers, comments and newlines
464
+ }
465
+
466
+ while (i < src.length && !stopChars.includes(src[i])) {
467
+ // Lookahead for -> marker in normal text
468
+ if (!isInHeader && src[i] === "-" && src[i+1] === ">") break;
469
+
470
+ // Stop if we hit an ALLOWED prefix trigger
471
+ if ((src[i] === "p" && src[i+1] === "{")) {
472
+ const top = (delimiterStack.length > 0) ? delimiterStack[delimiterStack.length - 1] : null;
473
+ const isInBlockHeader = isInHeader && top === "[";
474
+ const isInNormalText = !isInHeader && !isInInlineHead && !isInAtBlockBody && parenDepth === 0;
475
+ if (isInBlockHeader || isInNormalText) break;
476
+ }
477
+ if (src[i] === "j" && src[i+1] === "s" && src[i+2] === "{") {
478
+ const top = (delimiterStack.length > 0) ? delimiterStack[delimiterStack.length - 1] : null;
479
+ if (isInHeader && top === "[") break;
480
+ }
481
+ word += src[i];
482
+ i++;
483
+ }
484
+
485
+ if (word.length > 0) {
486
+ // Guess role based on context
487
+ if (parenDepth > 0 && !isInInlineHead) {
488
+ // Inside Inline Content (raw text)
489
+ addToken(TOKEN_TYPES.TEXT, word);
490
+ } else if (isInHeader || isInInlineHead) {
491
+ // Inside a structural header context
492
+ const isMainIdentifier = (
493
+ last_non_junk_type === TOKEN_TYPES.OPEN_BRACKET ||
494
+ last_non_junk_type === TOKEN_TYPES.OPEN_AT ||
495
+ (last_non_junk_type === TOKEN_TYPES.OPEN_PAREN && isInInlineHead)
496
+ );
497
+
498
+ if (isMainIdentifier) {
499
+ if (word === end_keyword) {
500
+ addToken(TOKEN_TYPES.END_KEYWORD, word);
501
+ if (delimiterStack[delimiterStack.length - 1] === "[") delimiterStack.pop();
502
+ }
503
+ else if (word === "import") addToken(TOKEN_TYPES.IMPORT, word);
504
+ else if (word === "$use-module") addToken(TOKEN_TYPES.USE_MODULE, word);
505
+ else addToken(TOKEN_TYPES.IDENTIFIER, word);
506
+ } else {
507
+ // Use lookahead to distinguish KEY from VALUE
508
+ const p = peekStructural(i);
509
+ if (p === ":") {
510
+ addToken(TOKEN_TYPES.KEY, word);
511
+ } else {
512
+ addToken(TOKEN_TYPES.VALUE, word);
513
+ }
239
514
  }
240
- i += context.length - 1;
241
515
  } else {
242
- addToken(TOKEN_TYPES.TEXT, char);
516
+ // Normal text
517
+ addToken(TOKEN_TYPES.TEXT, word);
518
+ }
519
+ } else {
520
+ // Fallback for any unhandled characters
521
+ if (i < src.length) {
522
+ addToken(TOKEN_TYPES.TEXT, src[i]);
523
+ i++;
243
524
  }
244
525
  }
245
526
  }
246
- // ========================================================================== //
247
- // Finalize with End-of-File token //
248
- // ========================================================================== //
527
+
249
528
  addToken(TOKEN_TYPES.EOF, "");
250
529
  return tokens;
251
530
  }