@wdprlib/parser 3.1.1 → 3.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124) hide show
  1. package/dist/index.cjs +312 -121
  2. package/dist/index.js +289 -98
  3. package/package.json +5 -3
  4. package/src/index.ts +163 -0
  5. package/src/lexer/index.ts +20 -0
  6. package/src/lexer/lexer.ts +687 -0
  7. package/src/lexer/tokens.ts +141 -0
  8. package/src/parser/constants.ts +173 -0
  9. package/src/parser/depth.ts +251 -0
  10. package/src/parser/index.ts +18 -0
  11. package/src/parser/parse.ts +315 -0
  12. package/src/parser/postprocess/divAdjacentParagraph.ts +76 -0
  13. package/src/parser/postprocess/index.ts +15 -0
  14. package/src/parser/postprocess/spanStrip.ts +697 -0
  15. package/src/parser/preprocess/expr.ts +265 -0
  16. package/src/parser/preprocess/index.ts +38 -0
  17. package/src/parser/preprocess/typography.ts +67 -0
  18. package/src/parser/preprocess/utils.ts +250 -0
  19. package/src/parser/preprocess/whitespace.ts +111 -0
  20. package/src/parser/rules/block/align.ts +282 -0
  21. package/src/parser/rules/block/bibliography.ts +359 -0
  22. package/src/parser/rules/block/block-list.ts +689 -0
  23. package/src/parser/rules/block/blockquote.ts +238 -0
  24. package/src/parser/rules/block/center.ts +87 -0
  25. package/src/parser/rules/block/clear-float.ts +75 -0
  26. package/src/parser/rules/block/code.ts +187 -0
  27. package/src/parser/rules/block/collapsible.ts +337 -0
  28. package/src/parser/rules/block/comment.ts +73 -0
  29. package/src/parser/rules/block/content-separator.ts +79 -0
  30. package/src/parser/rules/block/definition-list.ts +270 -0
  31. package/src/parser/rules/block/div.ts +400 -0
  32. package/src/parser/rules/block/embed-block.ts +153 -0
  33. package/src/parser/rules/block/footnoteblock.ts +200 -0
  34. package/src/parser/rules/block/heading.ts +142 -0
  35. package/src/parser/rules/block/horizontal-rule.ts +61 -0
  36. package/src/parser/rules/block/html.ts +222 -0
  37. package/src/parser/rules/block/iframe.ts +239 -0
  38. package/src/parser/rules/block/iftags.ts +150 -0
  39. package/src/parser/rules/block/include.ts +179 -0
  40. package/src/parser/rules/block/index.ts +127 -0
  41. package/src/parser/rules/block/list.ts +244 -0
  42. package/src/parser/rules/block/math.ts +183 -0
  43. package/src/parser/rules/block/module/backlinks/index.ts +31 -0
  44. package/src/parser/rules/block/module/backlinks/types.ts +21 -0
  45. package/src/parser/rules/block/module/categories/index.ts +34 -0
  46. package/src/parser/rules/block/module/categories/types.ts +21 -0
  47. package/src/parser/rules/block/module/css/index.ts +37 -0
  48. package/src/parser/rules/block/module/iftags/condition.ts +109 -0
  49. package/src/parser/rules/block/module/iftags/index.ts +26 -0
  50. package/src/parser/rules/block/module/iftags/preprocess.ts +140 -0
  51. package/src/parser/rules/block/module/iftags/resolve.ts +73 -0
  52. package/src/parser/rules/block/module/iftags/types.ts +63 -0
  53. package/src/parser/rules/block/module/include/index.ts +20 -0
  54. package/src/parser/rules/block/module/include/resolve.ts +556 -0
  55. package/src/parser/rules/block/module/index.ts +122 -0
  56. package/src/parser/rules/block/module/join/index.ts +34 -0
  57. package/src/parser/rules/block/module/join/types.ts +23 -0
  58. package/src/parser/rules/block/module/listpages/compiler.ts +453 -0
  59. package/src/parser/rules/block/module/listpages/extract.ts +410 -0
  60. package/src/parser/rules/block/module/listpages/index.ts +83 -0
  61. package/src/parser/rules/block/module/listpages/normalize.ts +390 -0
  62. package/src/parser/rules/block/module/listpages/parser.ts +106 -0
  63. package/src/parser/rules/block/module/listpages/resolve.ts +130 -0
  64. package/src/parser/rules/block/module/listpages/types.ts +513 -0
  65. package/src/parser/rules/block/module/listpages/url-resolver.ts +186 -0
  66. package/src/parser/rules/block/module/listusers/compiler.ts +77 -0
  67. package/src/parser/rules/block/module/listusers/extract.ts +45 -0
  68. package/src/parser/rules/block/module/listusers/index.ts +36 -0
  69. package/src/parser/rules/block/module/listusers/parser.ts +54 -0
  70. package/src/parser/rules/block/module/listusers/resolve.ts +58 -0
  71. package/src/parser/rules/block/module/listusers/types.ts +93 -0
  72. package/src/parser/rules/block/module/mapping.ts +61 -0
  73. package/src/parser/rules/block/module/page-tree/index.ts +38 -0
  74. package/src/parser/rules/block/module/page-tree/types.ts +29 -0
  75. package/src/parser/rules/block/module/rate/index.ts +28 -0
  76. package/src/parser/rules/block/module/rate/types.ts +19 -0
  77. package/src/parser/rules/block/module/resolve.ts +411 -0
  78. package/src/parser/rules/block/module/types-common.ts +59 -0
  79. package/src/parser/rules/block/module/types.ts +61 -0
  80. package/src/parser/rules/block/module/utils.ts +43 -0
  81. package/src/parser/rules/block/module/walk.ts +380 -0
  82. package/src/parser/rules/block/module.ts +164 -0
  83. package/src/parser/rules/block/orphan-li.ts +177 -0
  84. package/src/parser/rules/block/paragraph.ts +157 -0
  85. package/src/parser/rules/block/table-block.ts +726 -0
  86. package/src/parser/rules/block/table.ts +441 -0
  87. package/src/parser/rules/block/tabview.ts +331 -0
  88. package/src/parser/rules/block/toc.ts +129 -0
  89. package/src/parser/rules/block/utils.ts +615 -0
  90. package/src/parser/rules/index.ts +49 -0
  91. package/src/parser/rules/inline/anchor-name.ts +154 -0
  92. package/src/parser/rules/inline/anchor.ts +327 -0
  93. package/src/parser/rules/inline/bibcite.ts +153 -0
  94. package/src/parser/rules/inline/bold.ts +86 -0
  95. package/src/parser/rules/inline/color.ts +140 -0
  96. package/src/parser/rules/inline/comment.ts +90 -0
  97. package/src/parser/rules/inline/equation-ref.ts +115 -0
  98. package/src/parser/rules/inline/expr.ts +526 -0
  99. package/src/parser/rules/inline/footnote.ts +223 -0
  100. package/src/parser/rules/inline/guillemet.ts +64 -0
  101. package/src/parser/rules/inline/html.ts +132 -0
  102. package/src/parser/rules/inline/image.ts +328 -0
  103. package/src/parser/rules/inline/index.ts +150 -0
  104. package/src/parser/rules/inline/italic.ts +74 -0
  105. package/src/parser/rules/inline/line-break.ts +326 -0
  106. package/src/parser/rules/inline/link-anchor.ts +147 -0
  107. package/src/parser/rules/inline/link-single.ts +164 -0
  108. package/src/parser/rules/inline/link-star.ts +134 -0
  109. package/src/parser/rules/inline/link-triple.ts +267 -0
  110. package/src/parser/rules/inline/math-inline.ts +126 -0
  111. package/src/parser/rules/inline/monospace.ts +78 -0
  112. package/src/parser/rules/inline/raw.ts +262 -0
  113. package/src/parser/rules/inline/size.ts +244 -0
  114. package/src/parser/rules/inline/span.ts +424 -0
  115. package/src/parser/rules/inline/strikethrough.ts +115 -0
  116. package/src/parser/rules/inline/subscript.ts +84 -0
  117. package/src/parser/rules/inline/superscript.ts +84 -0
  118. package/src/parser/rules/inline/text.ts +84 -0
  119. package/src/parser/rules/inline/underline.ts +127 -0
  120. package/src/parser/rules/inline/user.ts +147 -0
  121. package/src/parser/rules/inline/utils.ts +344 -0
  122. package/src/parser/rules/types.ts +252 -0
  123. package/src/parser/rules/utils.ts +155 -0
  124. package/src/parser/toc.ts +130 -0
@@ -0,0 +1,20 @@
1
+ /**
2
+ *
3
+ * Lexer (tokenizer) for Wikidot markup.
4
+ *
5
+ * The lexer converts preprocessed wikitext into a flat sequence of tokens
6
+ * that the parser consumes. Each token has a type (e.g., `HEADING_MARKER`,
7
+ * `BOLD`, `TEXT`) and a string value. The lexer keeps only lightweight scanning
8
+ * state and does not build any tree structure; that is the parser's responsibility.
9
+ *
10
+ * The main entry points are:
11
+ * - `tokenize()` - convenience function that tokenizes a string in one call
12
+ * - `Lexer` class - for more control over tokenization options
13
+ *
14
+ * @module
15
+ */
16
+
17
+ export type { TokenType, Token } from "./tokens";
18
+ export { createToken } from "./tokens";
19
+ export type { LexerOptions } from "./lexer";
20
+ export { Lexer, tokenize } from "./lexer";
@@ -0,0 +1,687 @@
1
+ import { createPoint, createPosition } from "@wdprlib/ast";
2
+ import { createToken, type Token, type TokenType } from "./tokens";
3
+
4
+ /**
5
+ * Configuration for the {@link Lexer}.
6
+ *
7
+ * @group Lexer
8
+ */
9
+ export interface LexerOptions {
10
+ /**
11
+ * When `true` (default), every token carries accurate line/column/offset
12
+ * data. Set to `false` to skip position tracking for faster tokenisation
13
+ * when source-map information is not needed; each token then carries an all-zero placeholder position.
14
+ */
15
+ trackPositions?: boolean;
16
+ }
17
+
18
+ /**
19
+ * Internal mutable state carried through a single tokenisation pass.
20
+ */
21
+ interface LexerState {
22
+ source: string; // Full input text being tokenised.
23
+ pos: number; // Index into `source` of the next character to consume.
24
+ line: number; // Current line; starts at 1 and is incremented after each "\n" consumed.
25
+ column: number; // Current column; starts at 1 and is reset to 1 after each "\n" consumed.
26
+ lineStart: boolean; // True until a character other than space/tab is consumed on the current line.
27
+ tokens: Token[]; // Tokens emitted so far, in source order.
28
+ }
29
+
30
/**
 * Converts a Wikidot markup source string into a flat array of {@link Token}s.
 *
 * The lexer is single-pass and greedy: at every position it tries the
 * longest multi-character pattern first (e.g. `[[[` before `[[`, `--]`
 * before `--`). Constructs that only have meaning at the start of a line
 * (headings, list markers, blockquote markers, horizontal rules, clear
 * floats) are disambiguated with the `lineStart` state flag.
 *
 * For convenience, use the standalone {@link tokenize} function instead
 * of constructing a `Lexer` directly.
 *
 * @group Lexer
 */
export class Lexer {
  // ---------------------------------------------------------------------
  // Ordered literal tables. Within a table the FIRST matching entry wins,
  // so longer literals must precede their prefixes. The tables are consulted
  // by `scanToken` in a fixed sequence, interleaved with the context-
  // sensitive rules; do not merge or reorder them without checking that
  // sequence.
  // ---------------------------------------------------------------------

  /** Openers that must win over the plain `[[` block opener. */
  private static readonly BRACKET_OPENERS: ReadonlyArray<readonly [string, TokenType]> = [
    ["[!--", "COMMENT_OPEN"],
    ["[[[", "LINK_OPEN"],
    ["[[/", "BLOCK_END_OPEN"],
  ];

  /** Raw, monospace and bold markers. */
  private static readonly RAW_MONO_BOLD: ReadonlyArray<readonly [string, TokenType]> = [
    ["@@", "RAW_OPEN"],
    ["@<", "RAW_BLOCK_OPEN"],
    [">@", "RAW_BLOCK_CLOSE"],
    ["{{", "MONO_MARKER"],
    ["}}", "MONO_CLOSE"],
    ["**", "BOLD_MARKER"],
  ];

  /** Tried after the line-start `----` rule, so `--` here is never a rule. */
  private static readonly DASHES_AND_ANGLES: ReadonlyArray<readonly [string, TokenType]> = [
    ["--]", "COMMENT_CLOSE"],
    ["--", "STRIKE_MARKER"],
    ["<<", "LEFT_DOUBLE_ANGLE"],
  ];

  /** Inline formatting markers and table cell markers. */
  private static readonly INLINE_AND_TABLE: ReadonlyArray<readonly [string, TokenType]> = [
    ["-", "TEXT"], // a lone hyphen (any `--` was consumed earlier)
    ["__", "UNDERLINE_MARKER"],
    ["_", "UNDERSCORE"],
    ["^^", "SUPER_MARKER"],
    [",,", "SUB_MARKER"],
    ["//", "ITALIC_MARKER"],
    ["||~", "TABLE_HEADER"],
    ["||<", "TABLE_LEFT"],
    ["||=", "TABLE_CENTER"],
    ["||>", "TABLE_RIGHT"],
    ["||", "TABLE_MARKER"],
  ];

  /** Single-bracket forms and attribute punctuation. */
  private static readonly BRACKETS_AND_PUNCT: ReadonlyArray<readonly [string, TokenType]> = [
    ["[#", "BRACKET_ANCHOR"],
    ["[*", "BRACKET_STAR"],
    ["[", "BRACKET_OPEN"],
    ["]", "BRACKET_CLOSE"],
    ["|", "PIPE"],
    ["=", "EQUALS"],
  ];

  /** Remaining single-character tokens. */
  private static readonly SINGLE_CHARS: ReadonlyArray<readonly [string, TokenType]> = [
    [":", "COLON"],
    ["/", "SLASH"],
    ["*", "STAR"],
    ["#", "HASH"],
    ["@", "AT"],
    ["&", "AMPERSAND"],
    ["\\", "BACKSLASH"],
  ];

  private state: LexerState;
  private options: Required<LexerOptions>;

  /** Offsets of `]]` that must be split into `]` + `]` (invalid anchor names). */
  private splitBlockClosePositions: Set<number> = new Set();

  /**
   * Nesting depth of block-opener context (between `[[` / `[[/` and the
   * matching `]]`). Used to scope `QUOTED_STRING` recognition so that
   * `"` after `=` only becomes a quoted attribute value while block
   * attributes are being scanned; otherwise an inline `=` followed by `"`
   * (e.g. `[[footnote]]="[[/footnote]]`) would swallow content up to the
   * next `"` or newline.
   */
  private blockOpenerDepth = 0;

  /**
   * @param source - Text to tokenise
   * @param options - Lexer configuration; `trackPositions` defaults to `true`
   */
  constructor(source: string, options: LexerOptions = {}) {
    this.options = {
      trackPositions: options.trackPositions ?? true,
    };
    this.state = {
      source,
      pos: 0,
      line: 1,
      column: 1,
      lineStart: true,
      tokens: [],
    };
  }

  /**
   * Tokenize the entire source.
   *
   * Idempotent: once the source has been fully scanned, further calls
   * return the same array instead of appending another `EOF` token.
   *
   * @returns All tokens in source order, terminated by a single `EOF` token
   */
  tokenize(): Token[] {
    const tokens = this.state.tokens;
    if (tokens[tokens.length - 1]?.type === "EOF") {
      return tokens;
    }

    while (!this.isAtEnd()) {
      this.scanToken();
    }

    this.addToken("EOF", "");
    return tokens;
  }

  /** True once every character of the source has been consumed. */
  private isAtEnd(): boolean {
    return this.state.pos >= this.state.source.length;
  }

  /** Character at the current position, or `""` when at end of source. */
  private current(): string {
    return this.state.source[this.state.pos] ?? "";
  }

  /**
   * Check whether the `[[# ` at the current position is followed by an
   * invalid anchor name that is closed by `]]` on the same line.
   *
   * Valid:   `[[# valid-name]]` where the name matches `[-_A-Za-z0-9.%]+`
   * Invalid: `[[# name with spaces]]` or `[[# name$special]]`
   *
   * @returns The offset of the closing `]]` when the name is invalid, so the
   *   lexer can emit tokens that let the inner `[# text]` be parsed as a
   *   described link; `null` in every other case (not `[[# `, valid name,
   *   newline or end of input before `]]`).
   */
  private findInvalidAnchorNameEnd(): number | null {
    // Must start with `[[#` immediately followed by a space.
    if (!this.match("[[# ")) {
      return null;
    }

    const src = this.state.source;

    // Skip any further spaces after `[[# `.
    let i = this.state.pos + 4;
    while (i < src.length && src[i] === " ") {
      i++;
    }

    let sawInvalidChar = false;
    for (; i < src.length; i++) {
      const ch = src[i]!;
      if (ch === "\n") {
        return null;
      }
      if (ch === "]" && src[i + 1] === "]") {
        // Reached the closer: only report it when the name was invalid.
        return sawInvalidChar ? i : null;
      }
      const isNameChar =
        this.isAlphanumeric(ch) || ch === "-" || ch === "_" || ch === "." || ch === "%";
      if (!isNameChar) {
        sawInvalidChar = true;
      }
    }

    return null;
  }

  /** True when the source contains `pattern` starting at the current position. */
  private match(pattern: string): boolean {
    const { source, pos } = this.state;
    for (let i = 0; i < pattern.length; i++) {
      if (source[pos + i] !== pattern[i]) {
        return false;
      }
    }
    return true;
  }

  /**
   * Consume up to `n` characters (fewer if the source ends first) and keep
   * line, column and `lineStart` in sync.
   *
   * `lineStart` becomes true after a newline and stays true across leading
   * spaces/tabs, so indented line-start markers are still recognised.
   *
   * @returns The consumed text
   */
  private advance(n = 1): string {
    const st = this.state;
    let consumed = "";
    for (let i = 0; i < n && !this.isAtEnd(); i++) {
      const ch = this.current();
      consumed += ch;
      st.pos++;

      if (ch === "\n") {
        st.line++;
        st.column = 1;
        st.lineStart = true;
        continue;
      }

      st.column++;
      if (ch !== " " && ch !== "\t") {
        st.lineStart = false;
      }
    }
    return consumed;
  }

  /** Type of the most recent token that is not `WHITESPACE`, or `null` if none. */
  private lastNonWhitespaceTokenType(): TokenType | null {
    const tokens = this.state.tokens;
    for (let i = tokens.length - 1; i >= 0; i--) {
      const type = tokens[i]!.type;
      if (type !== "WHITESPACE") {
        return type;
      }
    }
    return null;
  }

  /**
   * Build the source span of a token whose text `value` ends at the
   * current scan position.
   *
   * For single-line values the start is simply `value.length` columns to
   * the left. When `value` contains line breaks (the `NEWLINE` token), the
   * scan position has already moved to a later line, so the start line and
   * column are recovered from the source instead of from the current
   * line/column counters.
   */
  private spanEndingHere(value: string): ReturnType<typeof createPosition> {
    const { source, pos, line, column } = this.state;
    const startOffset = pos - value.length;

    let startLine = line;
    let startColumn = column - value.length;

    if (value.includes("\n")) {
      let breaks = 0;
      for (let i = 0; i < value.length; i++) {
        if (value[i] === "\n") {
          breaks++;
        }
      }
      startLine = line - breaks;
      // Column is 1-based: distance from the previous line break (or from
      // a virtual break at offset -1 when the token is on the first line).
      const previousBreak = startOffset > 0 ? source.lastIndexOf("\n", startOffset - 1) : -1;
      startColumn = startOffset - previousBreak;
    }

    return createPosition(
      createPoint(startLine, startColumn, startOffset),
      createPoint(line, column, pos),
    );
  }

  /**
   * Append a token whose text has just been consumed.
   *
   * Must be called immediately after advancing past `value`, because the
   * token's span is derived from the current scan position. With
   * `trackPositions` disabled an all-zero placeholder position is stored.
   *
   * @param type - Token type to emit
   * @param value - Exact source text of the token (empty for `EOF`)
   */
  private addToken(type: TokenType, value: string): void {
    const position = this.options.trackPositions
      ? this.spanEndingHere(value)
      : createPosition(createPoint(0, 0, 0), createPoint(0, 0, 0));

    const tokens = this.state.tokens;
    // A token is "at line start" when it is the first token overall or
    // directly follows a NEWLINE token.
    const lineStart = tokens.length === 0 || tokens[tokens.length - 1]?.type === "NEWLINE";

    tokens.push(createToken(type, value, position, lineStart));

    // Track block-opener nesting so `"` after `=` is only recognised as a
    // quoted attribute value while we are actually inside `[[ ... ]]`.
    if (type === "BLOCK_OPEN" || type === "BLOCK_END_OPEN") {
      this.blockOpenerDepth++;
    } else if (type === "BLOCK_CLOSE" && this.blockOpenerDepth > 0) {
      this.blockOpenerDepth--;
    }
  }

  /** Consume `literal` (already known to match) and emit it as one token. */
  private emit(type: TokenType, literal: string): void {
    this.advance(literal.length);
    this.addToken(type, literal);
  }

  /**
   * Emit the first literal of `table` that matches at the current position.
   *
   * @returns `true` when a token was emitted
   */
  private emitFirstMatch(table: ReadonlyArray<readonly [string, TokenType]>): boolean {
    for (const [literal, type] of table) {
      if (this.match(literal)) {
        this.emit(type, literal);
        return true;
      }
    }
    return false;
  }

  /** Consume a maximal run of the single character `ch` and return it. */
  private consumeRun(ch: string): string {
    let run = "";
    while (this.current() === ch) {
      run += this.advance();
    }
    return run;
  }

  /**
   * Scan exactly one token at the current position (two for a split `]]`).
   *
   * The order of the checks below is significant: longer or more specific
   * patterns are always tried before the patterns they contain.
   */
  private scanToken(): void {
    const char = this.current();
    const isLineStart = this.state.lineStart;

    // Newline
    if (char === "\n") {
      this.emit("NEWLINE", "\n");
      return;
    }

    // Whitespace (non-newline): one token per run of spaces/tabs
    if (char === " " || char === "\t") {
      let ws = "";
      while (!this.isAtEnd() && (this.current() === " " || this.current() === "\t")) {
        ws += this.advance();
      }
      this.addToken("WHITESPACE", ws);
      return;
    }

    // [!--  [[[  [[/   (all must win over the plain `[[` below)
    if (this.emitFirstMatch(Lexer.BRACKET_OPENERS)) {
      return;
    }

    // Block open [[
    if (this.match("[[")) {
      // Wikidot's anchor syntax only allows [-_A-Za-z0-9.%] after `[[# `.
      // For an invalid name, emit a literal "[" so the inner `[# text]` is
      // parsed as a described anchor link, and remember the closing `]]`
      // so it is later split into `]` (BRACKET_CLOSE) + `]` (TEXT).
      const invalidEnd = this.findInvalidAnchorNameEnd();
      if (invalidEnd !== null) {
        this.splitBlockClosePositions.add(invalidEnd);
        this.emit("TEXT", "[");
      } else {
        this.emit("BLOCK_OPEN", "[[");
      }
      return;
    }

    // Link close ]]] (must be checked before ]])
    if (this.match("]]]")) {
      this.emit("LINK_CLOSE", "]]]");
      return;
    }

    // Block close ]]
    if (this.match("]]")) {
      // `delete` reports whether this offset was registered for splitting.
      if (this.splitBlockClosePositions.delete(this.state.pos)) {
        this.emit("BRACKET_CLOSE", "]");
        this.emit("TEXT", "]");
      } else {
        this.emit("BLOCK_CLOSE", "]]");
      }
      return;
    }

    // @@  @<  >@  {{  }}  **
    if (this.emitFirstMatch(Lexer.RAW_MONO_BOLD)) {
      return;
    }

    // Horizontal rule: 4+ hyphens at line start (must be checked before --)
    if (isLineStart && this.match("----")) {
      this.addToken("HR_MARKER", this.consumeRun("-"));
      return;
    }

    // --]  --  <<
    if (this.emitFirstMatch(Lexer.DASHES_AND_ANGLES)) {
      return;
    }

    // Clear float: 4+ tildes at line start, optionally followed by < or >
    if (isLineStart && this.match("~~~~")) {
      const tildes = this.consumeRun("~");
      if (this.current() === "<") {
        this.advance();
        this.addToken("CLEAR_FLOAT_LEFT", `${tildes}<`);
      } else if (this.current() === ">") {
        this.advance();
        this.addToken("CLEAR_FLOAT_RIGHT", `${tildes}>`);
      } else {
        this.addToken("CLEAR_FLOAT", `${tildes}`);
      }
      return;
    }

    // -  __  _  ^^  ,,  //  ||~  ||<  ||=  ||>  ||
    if (this.emitFirstMatch(Lexer.INLINE_AND_TABLE)) {
      return;
    }

    // Heading: run of + at line start
    if (isLineStart && char === "+") {
      this.addToken("HEADING_MARKER", this.consumeRun("+"));
      return;
    }

    // List bullet * (at line start)
    if (isLineStart && char === "*") {
      this.emit("LIST_BULLET", "*");
      return;
    }

    // Color marker ## (must be checked before LIST_NUMBER)
    if (this.match("##")) {
      this.emit("COLOR_MARKER", "##");
      return;
    }

    // List number # (at line start)
    if (isLineStart && char === "#") {
      this.emit("LIST_NUMBER", "#");
      return;
    }

    // > : blockquote marker at line start, guillemet or plain text otherwise
    if (char === ">") {
      if (isLineStart) {
        // All consecutive > form a single blockquote marker.
        this.addToken("BLOCKQUOTE_MARKER", this.consumeRun(">"));
      } else if (this.match(">>")) {
        this.emit("RIGHT_DOUBLE_ANGLE", ">>");
      } else {
        this.emit("TEXT", ">");
      }
      return;
    }

    // [#  [*  [  ]  |  =
    if (this.emitFirstMatch(Lexer.BRACKETS_AND_PUNCT)) {
      return;
    }

    // Quoted string: only directly after EQUALS inside a `[[ ... ]]` opener.
    // Anywhere else `"` is an ordinary text character (typographic quote);
    // without the depth gate, `[[footnote]]="[[/footnote]]` would have its
    // closing tag swallowed by the quoted string.
    if (char === '"') {
      if (this.blockOpenerDepth > 0 && this.lastNonWhitespaceTokenType() === "EQUALS") {
        let quoted = this.advance(); // opening "
        while (!this.isAtEnd() && this.current() !== '"' && this.current() !== "\n") {
          quoted += this.advance();
        }
        if (this.current() === '"') {
          quoted += this.advance(); // closing " (absent if the line or input ends first)
        }
        this.addToken("QUOTED_STRING", quoted);
        return;
      }
      this.emit("TEXT", '"');
      return;
    }

    // :  /  *  #  @  &  \
    if (this.emitFirstMatch(Lexer.SINGLE_CHARS)) {
      return;
    }

    // Backslash line break marker (U+E000, inserted by preprocessing)
    if (char.charCodeAt(0) === 0xe000) {
      this.emit("BACKSLASH_BREAK", char);
      return;
    }

    // Identifier: maximal ASCII alphanumeric run
    if (this.isAlphanumeric(char)) {
      let ident = "";
      while (!this.isAtEnd() && this.isAlphanumeric(this.current())) {
        ident += this.advance();
      }
      this.addToken("IDENTIFIER", ident);
      return;
    }

    // Default: any other single character is plain text
    this.emit("TEXT", char);
  }

  /**
   * True when `char` starts with an ASCII letter or digit (the characters
   * allowed in `IDENTIFIER` tokens). An empty string yields `false`.
   */
  private isAlphanumeric(char: string): boolean {
    const code = char.charCodeAt(0);
    return (
      (code >= 48 && code <= 57) || // 0-9
      (code >= 65 && code <= 90) || // A-Z
      (code >= 97 && code <= 122) // a-z
    );
  }
}
673
+
674
/**
 * One-shot helper: run a fresh {@link Lexer} over `source` and return
 * everything it produces.
 *
 * Equivalent to `new Lexer(source, options).tokenize()`.
 *
 * @param source - Raw Wikidot markup
 * @param options - Optional lexer configuration
 * @returns The flat token sequence for `source`, terminated by an `EOF` token
 *
 * @group Lexer
 */
export function tokenize(source: string, options?: LexerOptions): Token[] {
  const lexer = new Lexer(source, options);
  return lexer.tokenize();
}