@wdprlib/parser 3.1.1 → 3.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124) hide show
  1. package/dist/index.cjs +312 -121
  2. package/dist/index.js +289 -98
  3. package/package.json +5 -3
  4. package/src/index.ts +163 -0
  5. package/src/lexer/index.ts +20 -0
  6. package/src/lexer/lexer.ts +687 -0
  7. package/src/lexer/tokens.ts +141 -0
  8. package/src/parser/constants.ts +173 -0
  9. package/src/parser/depth.ts +251 -0
  10. package/src/parser/index.ts +18 -0
  11. package/src/parser/parse.ts +315 -0
  12. package/src/parser/postprocess/divAdjacentParagraph.ts +76 -0
  13. package/src/parser/postprocess/index.ts +15 -0
  14. package/src/parser/postprocess/spanStrip.ts +697 -0
  15. package/src/parser/preprocess/expr.ts +265 -0
  16. package/src/parser/preprocess/index.ts +38 -0
  17. package/src/parser/preprocess/typography.ts +67 -0
  18. package/src/parser/preprocess/utils.ts +250 -0
  19. package/src/parser/preprocess/whitespace.ts +111 -0
  20. package/src/parser/rules/block/align.ts +282 -0
  21. package/src/parser/rules/block/bibliography.ts +359 -0
  22. package/src/parser/rules/block/block-list.ts +689 -0
  23. package/src/parser/rules/block/blockquote.ts +238 -0
  24. package/src/parser/rules/block/center.ts +87 -0
  25. package/src/parser/rules/block/clear-float.ts +75 -0
  26. package/src/parser/rules/block/code.ts +187 -0
  27. package/src/parser/rules/block/collapsible.ts +337 -0
  28. package/src/parser/rules/block/comment.ts +73 -0
  29. package/src/parser/rules/block/content-separator.ts +79 -0
  30. package/src/parser/rules/block/definition-list.ts +270 -0
  31. package/src/parser/rules/block/div.ts +400 -0
  32. package/src/parser/rules/block/embed-block.ts +153 -0
  33. package/src/parser/rules/block/footnoteblock.ts +200 -0
  34. package/src/parser/rules/block/heading.ts +142 -0
  35. package/src/parser/rules/block/horizontal-rule.ts +61 -0
  36. package/src/parser/rules/block/html.ts +222 -0
  37. package/src/parser/rules/block/iframe.ts +239 -0
  38. package/src/parser/rules/block/iftags.ts +150 -0
  39. package/src/parser/rules/block/include.ts +179 -0
  40. package/src/parser/rules/block/index.ts +127 -0
  41. package/src/parser/rules/block/list.ts +244 -0
  42. package/src/parser/rules/block/math.ts +183 -0
  43. package/src/parser/rules/block/module/backlinks/index.ts +31 -0
  44. package/src/parser/rules/block/module/backlinks/types.ts +21 -0
  45. package/src/parser/rules/block/module/categories/index.ts +34 -0
  46. package/src/parser/rules/block/module/categories/types.ts +21 -0
  47. package/src/parser/rules/block/module/css/index.ts +37 -0
  48. package/src/parser/rules/block/module/iftags/condition.ts +109 -0
  49. package/src/parser/rules/block/module/iftags/index.ts +26 -0
  50. package/src/parser/rules/block/module/iftags/preprocess.ts +140 -0
  51. package/src/parser/rules/block/module/iftags/resolve.ts +73 -0
  52. package/src/parser/rules/block/module/iftags/types.ts +63 -0
  53. package/src/parser/rules/block/module/include/index.ts +20 -0
  54. package/src/parser/rules/block/module/include/resolve.ts +556 -0
  55. package/src/parser/rules/block/module/index.ts +122 -0
  56. package/src/parser/rules/block/module/join/index.ts +34 -0
  57. package/src/parser/rules/block/module/join/types.ts +23 -0
  58. package/src/parser/rules/block/module/listpages/compiler.ts +453 -0
  59. package/src/parser/rules/block/module/listpages/extract.ts +410 -0
  60. package/src/parser/rules/block/module/listpages/index.ts +83 -0
  61. package/src/parser/rules/block/module/listpages/normalize.ts +390 -0
  62. package/src/parser/rules/block/module/listpages/parser.ts +106 -0
  63. package/src/parser/rules/block/module/listpages/resolve.ts +130 -0
  64. package/src/parser/rules/block/module/listpages/types.ts +513 -0
  65. package/src/parser/rules/block/module/listpages/url-resolver.ts +186 -0
  66. package/src/parser/rules/block/module/listusers/compiler.ts +77 -0
  67. package/src/parser/rules/block/module/listusers/extract.ts +45 -0
  68. package/src/parser/rules/block/module/listusers/index.ts +36 -0
  69. package/src/parser/rules/block/module/listusers/parser.ts +54 -0
  70. package/src/parser/rules/block/module/listusers/resolve.ts +58 -0
  71. package/src/parser/rules/block/module/listusers/types.ts +93 -0
  72. package/src/parser/rules/block/module/mapping.ts +61 -0
  73. package/src/parser/rules/block/module/page-tree/index.ts +38 -0
  74. package/src/parser/rules/block/module/page-tree/types.ts +29 -0
  75. package/src/parser/rules/block/module/rate/index.ts +28 -0
  76. package/src/parser/rules/block/module/rate/types.ts +19 -0
  77. package/src/parser/rules/block/module/resolve.ts +411 -0
  78. package/src/parser/rules/block/module/types-common.ts +59 -0
  79. package/src/parser/rules/block/module/types.ts +61 -0
  80. package/src/parser/rules/block/module/utils.ts +43 -0
  81. package/src/parser/rules/block/module/walk.ts +380 -0
  82. package/src/parser/rules/block/module.ts +164 -0
  83. package/src/parser/rules/block/orphan-li.ts +177 -0
  84. package/src/parser/rules/block/paragraph.ts +157 -0
  85. package/src/parser/rules/block/table-block.ts +726 -0
  86. package/src/parser/rules/block/table.ts +441 -0
  87. package/src/parser/rules/block/tabview.ts +331 -0
  88. package/src/parser/rules/block/toc.ts +129 -0
  89. package/src/parser/rules/block/utils.ts +615 -0
  90. package/src/parser/rules/index.ts +49 -0
  91. package/src/parser/rules/inline/anchor-name.ts +154 -0
  92. package/src/parser/rules/inline/anchor.ts +327 -0
  93. package/src/parser/rules/inline/bibcite.ts +153 -0
  94. package/src/parser/rules/inline/bold.ts +86 -0
  95. package/src/parser/rules/inline/color.ts +140 -0
  96. package/src/parser/rules/inline/comment.ts +90 -0
  97. package/src/parser/rules/inline/equation-ref.ts +115 -0
  98. package/src/parser/rules/inline/expr.ts +526 -0
  99. package/src/parser/rules/inline/footnote.ts +223 -0
  100. package/src/parser/rules/inline/guillemet.ts +64 -0
  101. package/src/parser/rules/inline/html.ts +132 -0
  102. package/src/parser/rules/inline/image.ts +328 -0
  103. package/src/parser/rules/inline/index.ts +150 -0
  104. package/src/parser/rules/inline/italic.ts +74 -0
  105. package/src/parser/rules/inline/line-break.ts +326 -0
  106. package/src/parser/rules/inline/link-anchor.ts +147 -0
  107. package/src/parser/rules/inline/link-single.ts +164 -0
  108. package/src/parser/rules/inline/link-star.ts +134 -0
  109. package/src/parser/rules/inline/link-triple.ts +267 -0
  110. package/src/parser/rules/inline/math-inline.ts +126 -0
  111. package/src/parser/rules/inline/monospace.ts +78 -0
  112. package/src/parser/rules/inline/raw.ts +262 -0
  113. package/src/parser/rules/inline/size.ts +244 -0
  114. package/src/parser/rules/inline/span.ts +424 -0
  115. package/src/parser/rules/inline/strikethrough.ts +115 -0
  116. package/src/parser/rules/inline/subscript.ts +84 -0
  117. package/src/parser/rules/inline/superscript.ts +84 -0
  118. package/src/parser/rules/inline/text.ts +84 -0
  119. package/src/parser/rules/inline/underline.ts +127 -0
  120. package/src/parser/rules/inline/user.ts +147 -0
  121. package/src/parser/rules/inline/utils.ts +344 -0
  122. package/src/parser/rules/types.ts +252 -0
  123. package/src/parser/rules/utils.ts +155 -0
  124. package/src/parser/toc.ts +130 -0
@@ -0,0 +1,265 @@
1
+ /**
2
+ *
3
+ * Text-level expansion of `[[#if ...]]`, `[[#ifexpr ...]]`, and
4
+ * `[[#expr ...]]` directives that sit *inside* another block's opener.
5
+ *
6
+ * The inline rules in `rules/inline/expr.ts` parse these forms as regular
7
+ * inline elements, but that only works when the directive appears in
8
+ * parseable inline text. When one is embedded inside a block opener's
9
+ * attribute string, e.g.
10
+ *
11
+ * ```wikitext
12
+ * [[div class="x [[#if 1 | a | b ]]"]]
13
+ * [[li class="[[#if 1 | folded | unfolded ]] [[#ifexpr 1>0 | hot | cold ]]"]]
14
+ * [[div col="[[#expr 1+1]]"]]
15
+ * ```
16
+ *
17
+ * the lexer cannot recover a well-formed opener from the input. The
18
+ * embedded directive has to collapse to a plain string before the parser
19
+ * sees the outer tag.
20
+ *
21
+ * This pass only resolves directives whose `[[#` sits inside an unclosed
22
+ * `[[` (depth > 0). Top-level directives are left untouched so the inline
23
+ * parser / AST renderer keeps its full evaluator + element support.
24
+ *
25
+ * Truthiness rules match the inline `ifRule` / `ifExprRule`: an empty
26
+ * string, `"0"`, `"false"`, `"null"` (case-insensitive) are falsy.
27
+ *
28
+ * @module
29
+ */
30
+
31
+ import { evaluateExpression, formatExprValue, isTruthy } from "@wdprlib/ast";
32
+ import {
33
+ computeBracketDepths,
34
+ makeUniqueSentinels,
35
+ maskRawRegions,
36
+ restorePlaceholders,
37
+ } from "./utils";
38
+
39
/**
 * Collapse opener-embedded `[[#if]]` / `[[#ifexpr]]` / `[[#expr]]`
 * directives (those at bracket depth > 0) into plain text.
 *
 * Raw regions are masked first so directives inside them are never
 * evaluated, then the masked text is reduced innermost-first, and finally
 * the raw regions are spliced back. Directives at the top level, as well
 * as unmatched or malformed ones, come back unchanged.
 *
 * @param source - Wikitext to scan.
 * @returns The text with every resolvable opener-embedded directive replaced.
 */
export function preprocessExpr(source: string): string {
  // Fast path: no directive prefix anywhere means nothing to do.
  if (source.indexOf("[[#") === -1) {
    return source;
  }

  const sentinels = makeUniqueSentinels(source);
  const maskedInput = maskRawRegions(source, sentinels);
  const collapsed = reduceExpr(maskedInput.masked);
  return restorePlaceholders(collapsed, maskedInput.placeholders, sentinels);
}
54
+
55
/**
 * Legacy name kept for callers written against the earlier `[[#if]]`-only
 * pass. It is the very same function as {@link preprocessExpr}, so it also
 * resolves `[[#ifexpr]]` and `[[#expr]]` found in opener context.
 */
export const preprocessIf: typeof preprocessExpr = preprocessExpr;
62
+
63
/**
 * Apply {@link expandInnermost} repeatedly until the text stops changing.
 *
 * The number of passes is capped at `source.length + 1` so the loop is
 * guaranteed to terminate; whatever text has been produced by then is
 * returned.
 */
function reduceExpr(source: string): string {
  let text = source;
  let passesLeft = source.length + 1;
  while (passesLeft > 0) {
    const expanded = expandInnermost(text);
    if (expanded === text) break;
    text = expanded;
    passesLeft--;
  }
  return text;
}
73
+
74
/**
 * Perform one left-to-right pass over `source`, replacing every innermost
 * `[[#if]]` / `[[#ifexpr]]` / `[[#expr]]` directive that starts at bracket
 * depth > 0 with its evaluated text.
 *
 * @returns The rewritten text, or the very same `source` string when no
 *   directive was replaced (callers rely on identity to detect a fixpoint).
 */
function expandInnermost(source: string): string {
  const depths = computeBracketDepths(source);
  const pieces: string[] = [];
  // Start of the untouched text that has not been copied into `pieces` yet.
  let pendingFrom = 0;
  let cursor = 0;

  while (cursor < source.length) {
    const kind = matchDirectiveKind(source, cursor);
    const match =
      kind !== null && depths[cursor]! > 0
        ? tryParseInnermostDirective(source, cursor, kind)
        : null;

    if (kind === null || match === null) {
      cursor++;
      continue;
    }

    // Flush the literal text before the directive, then its replacement.
    pieces.push(source.slice(pendingFrom, cursor), evaluateDirective(kind, match));
    cursor = match.end;
    pendingFrom = cursor;
  }

  if (pieces.length === 0) return source;
  pieces.push(source.slice(pendingFrom));
  return pieces.join("");
}
103
+
104
type DirectiveKind = "if" | "ifexpr" | "expr";

/**
 * Keywords in probing order. `ifexpr` has to come before `if`, otherwise
 * the shorter keyword would be tried against an `ifexpr` opening first.
 */
const DIRECTIVE_KEYWORDS: readonly DirectiveKind[] = ["ifexpr", "if", "expr"];

/**
 * Identify the `[[#keyword` directive that begins at offset `i`.
 *
 * A keyword only counts when the character right after it is not an
 * identifier character, so longer identifiers that merely start with a
 * keyword are rejected.
 *
 * @returns The directive kind, or `null` when no directive starts at `i`.
 */
function matchDirectiveKind(source: string, i: number): DirectiveKind | null {
  if (!source.startsWith("[[#", i)) return null;

  const keywordAt = i + 3;
  for (const keyword of DIRECTIVE_KEYWORDS) {
    const following = source[keywordAt + keyword.length];
    if (source.startsWith(keyword, keywordAt) && !isIdentChar(following)) {
      return keyword;
    }
  }
  return null;
}
122
+
123
/** Raw pieces of one successfully scanned `[[#kind ...]]` directive. */
interface DirectiveMatch {
  /** Offset of the first character after the directive's closing `]]`. */
  end: number;
  /** Trimmed text between the keyword and the first top-level `|` (or the closing `]]` when there is no `|`). */
  head: string;
  /** Trimmed `then` branch; `""` when the directive has no `|`. */
  thenText: string;
  /** Trimmed `else` branch; `""` when fewer than two `|` are present. */
  elseText: string;
  /** `true` when at least one top-level `|` was found. */
  hasPipe: boolean;
}
135
+
136
/**
 * Scan one `[[#kind ...]]` directive whose `[[#` sits at `start`.
 *
 * The scan tracks nested `[[ ... ]]` blocks and `[[[ ... ]]]` links so that
 * only top-level `|` characters split the directive and only the matching
 * top-level `]]` closes it.
 *
 * @returns The raw (trimmed, unevaluated) parts of the directive, or `null`
 *   when
 *   - another `[[#if` / `[[#ifexpr` / `[[#expr` opening appears before the
 *     close (the caller must resolve that inner one first),
 *   - no closing `]]` is found, or
 *   - an `if` / `ifexpr` directive carries no `|` at all.
 */
function tryParseInnermostDirective(
  source: string,
  start: number,
  kind: DirectiveKind,
): DirectiveMatch | null {
  // Each keyword's length equals the length of its kind name.
  let cursor = start + "[[#".length + kind.length;
  // A separator after the keyword is optional (`[[#expr(1+1)]]` is
  // accepted), so only skip whitespace that happens to be there.
  while (cursor < source.length && isWhitespace(source[cursor])) cursor++;

  const bodyStart = cursor;
  const separators: number[] = [];
  let nestedBlocks = 0;
  let openLinks = 0;
  let closeAt = -1;

  while (closeAt === -1 && cursor < source.length) {
    // Any directive opening inside the body means this one is not innermost.
    if (matchDirectiveKind(source, cursor) !== null) return null;

    if (source.startsWith("[[[", cursor)) {
      openLinks++;
      cursor += 3;
    } else if (openLinks > 0) {
      // Inside a triple link everything is opaque except its terminator.
      if (source.startsWith("]]]", cursor)) {
        openLinks--;
        cursor += 3;
      } else {
        cursor++;
      }
    } else if (source.startsWith("[[", cursor)) {
      nestedBlocks++;
      cursor += 2;
    } else if (source.startsWith("]]", cursor)) {
      if (nestedBlocks === 0) {
        closeAt = cursor;
      } else {
        nestedBlocks--;
        cursor += 2;
      }
    } else {
      if (source[cursor] === "|" && nestedBlocks === 0) separators.push(cursor);
      cursor++;
    }
  }

  if (closeAt === -1) return null;

  const hasPipe = separators.length > 0;
  // Only `[[#expr]]` may omit the `|`; a pipe-less conditional is malformed
  // and is left in place rather than being dropped silently.
  if (!hasPipe && kind !== "expr") return null;

  const part = (from: number, to: number): string => source.slice(from, to).trim();
  const [firstPipe, secondPipe] = separators;
  const end = closeAt + 2;

  if (firstPipe === undefined) {
    return { end, head: part(bodyStart, closeAt), thenText: "", elseText: "", hasPipe };
  }
  if (secondPipe === undefined) {
    return {
      end,
      head: part(bodyStart, firstPipe),
      thenText: part(firstPipe + 1, closeAt),
      elseText: "",
      hasPipe,
    };
  }
  return {
    end,
    head: part(bodyStart, firstPipe),
    thenText: part(firstPipe + 1, secondPipe),
    elseText: part(secondPipe + 1, closeAt),
    hasPipe,
  };
}
233
+
234
/**
 * Turn a scanned directive into the text that replaces it.
 *
 * - `expr`: the formatted value on success, `""` for the
 *   `"empty expression"` error, `"ERROR"` for any other failure.
 * - `if`: `thenText` when `head` is truthy per `isTruthy`, else `elseText`.
 * - `ifexpr`: `"ERROR"` when evaluation fails; otherwise `thenText` when the
 *   value is neither `0` nor `NaN`, else `elseText`.
 *
 * A conditional without any `|` yields `""`.
 */
function evaluateDirective(kind: DirectiveKind, m: DirectiveMatch): string {
  switch (kind) {
    case "expr": {
      const outcome = evaluateExpression(m.head);
      if (outcome.success) return formatExprValue(outcome.value);
      // An empty expression collapses to an empty attribute value instead
      // of the literal "ERROR" placeholder.
      return outcome.error === "empty expression" ? "" : "ERROR";
    }
    case "if": {
      if (!m.hasPipe) return "";
      return isTruthy(m.head) ? m.thenText : m.elseText;
    }
    default: {
      // ifexpr: every evaluation error, empty expression included, keeps
      // the placeholder so a malformed conditional is not swallowed.
      if (!m.hasPipe) return "";
      const outcome = evaluateExpression(m.head);
      if (!outcome.success) return "ERROR";
      const holds = outcome.value !== 0 && !Number.isNaN(outcome.value);
      return holds ? m.thenText : m.elseText;
    }
  }
}
257
+
258
/** True only for a single space, tab, line feed, or carriage return. */
function isWhitespace(ch: string | undefined): boolean {
  switch (ch) {
    case " ":
    case "\t":
    case "\n":
    case "\r":
      return true;
    default:
      return false;
  }
}
261
+
262
/**
 * True when `ch` is non-empty and contains an ASCII letter, digit, `_`,
 * or `-`. `undefined` and `""` are never identifier characters.
 */
function isIdentChar(ch: string | undefined): boolean {
  return typeof ch === "string" && ch.length > 0 && /[a-z0-9_-]/i.test(ch);
}
@@ -0,0 +1,38 @@
1
+ /**
2
+ *
3
+ * Preprocessing pipeline that transforms raw wikitext before tokenization.
4
+ *
5
+ * Wikidot applies two categories of text substitutions before the main parser
6
+ * sees the input. This module orchestrates those substitutions in the correct
7
+ * order: whitespace normalization first (to establish consistent line structure),
8
+ * then typographic transformations (to convert ASCII quote/ellipsis patterns
9
+ * into Unicode equivalents).
10
+ *
11
+ * The preprocessing step is essential because the lexer and parser assume
12
+ * normalized input (Unix newlines, no tabs, consistent whitespace).
13
+ *
14
+ * @module
15
+ */
16
+
17
+ import { substitute as whitespaceSubstitute } from "./whitespace";
18
+ import { substitute as typographySubstitute } from "./typography";
19
+
20
+ export { substitute as whitespace } from "./whitespace";
21
+ export { substitute as typography } from "./typography";
22
+
23
/**
 * Run the preprocessing pipeline on raw wikitext.
 *
 * The two stages run in a fixed order:
 *   1. whitespace normalization
 *   2. typographic substitution, applied to the normalized text
 *
 * @param text - Raw wikitext input
 * @returns The text after both stages have been applied
 */
export function preprocess(text: string): string {
  const normalized = whitespaceSubstitute(text);
  return typographySubstitute(normalized);
}
@@ -0,0 +1,67 @@
1
+ /**
2
+ *
3
+ * Typographic preprocessing for Wikidot markup.
4
+ *
5
+ * Wikidot converts certain ASCII character sequences into their Unicode
6
+ * typographic equivalents before parsing. This module handles the following
7
+ * conversions:
8
+ *
9
+ * - ` `` ... '' ` becomes left/right double curly quotes (U+201C / U+201D)
10
+ * - ` ,, ... '' ` becomes low-9 double quote + right double quote (U+201E / U+201D)
11
+ * - `` ` ... ' `` becomes left/right single curly quotes (U+2018 / U+2019)
12
+ * - `...` (three dots) and `. . .` (spaced dots) become an ellipsis (U+2026)
13
+ *
14
+ * Em dash conversion (`--` to U+2014) is intentionally NOT handled here.
15
+ * It is performed in the parser instead, because the `--` sequence also appears
16
+ * in HTML comment markers (`[!--` and `--]`), and converting it during
17
+ * preprocessing would break comment detection.
18
+ *
19
+ * @module
20
+ */
21
+
22
/** Left single quotation mark (U+2018). */
const LEFT_SINGLE_QUOTE = "\u2018";
/** Right single quotation mark (U+2019). */
const RIGHT_SINGLE_QUOTE = "\u2019";
/** Left double quotation mark (U+201C). */
const LEFT_DOUBLE_QUOTE = "\u201c";
/** Right double quotation mark (U+201D). */
const RIGHT_DOUBLE_QUOTE = "\u201d";
/** Double low-9 quotation mark (U+201E). */
const LOW_DOUBLE_QUOTE = "\u201e";
/** Horizontal ellipsis (U+2026). */
const ELLIPSIS = "\u2026";

/**
 * Ordered `[pattern, replacement]` rules. Order is significant because the
 * backtick and apostrophe characters are shared between the double-quote
 * and single-quote patterns: the double-quote forms must be consumed first.
 */
const TYPOGRAPHY_RULES: ReadonlyArray<readonly [RegExp, string]> = [
  // Two backticks ... two apostrophes -> left/right double quotes
  [/``(.*?)''/g, `${LEFT_DOUBLE_QUOTE}$1${RIGHT_DOUBLE_QUOTE}`],
  // Two commas ... two apostrophes -> low-9 double quote / right double quote
  [/,,(.*?)''/g, `${LOW_DOUBLE_QUOTE}$1${RIGHT_DOUBLE_QUOTE}`],
  // One backtick ... one apostrophe -> left/right single quotes
  [/`(.*?)'/g, `${LEFT_SINGLE_QUOTE}$1${RIGHT_SINGLE_QUOTE}`],
  // Exactly three consecutive dots (no dot on either side) -> ellipsis
  [/(?<![.])\.\.\.(?![.])/g, ELLIPSIS],
  // Three dots separated by single spaces -> ellipsis
  [/(?<![.])\. \. \.(?![.])/g, ELLIPSIS],
];

/**
 * Replace ASCII typography patterns with their Unicode equivalents.
 *
 * Rules are applied one after another in the order: double quotes, low
 * double quotes, single quotes, continuous ellipsis, spaced ellipsis.
 *
 * @param text - Text to transform
 * @returns The text after every rule has been applied
 */
export function substitute(text: string): string {
  return TYPOGRAPHY_RULES.reduce(
    (current, [pattern, replacement]) => current.replace(pattern, replacement),
    text,
  );
}
@@ -0,0 +1,250 @@
1
+ /**
2
+ *
3
+ * Shared helpers for text-level preprocess passes that run before
4
+ * tokenization (e.g. `[[iftags]]` collapse, opener-embedded `[[#if]]`
5
+ * collapse).
6
+ *
7
+ * Each pass needs to:
8
+ * - mask raw regions (`[[code]]`, `[[html]]`, `@@..@@`, `@<..>@`) so a
9
+ * pattern they enclose is not transformed
10
+ * - know the bracket-opener depth at every offset so it can distinguish
11
+ * directives at the top level from ones nested inside another block's
12
+ * opener attribute string
13
+ *
14
+ * The depth tracking mirrors the lexer's `blockOpenerDepth`:
15
+ * - `[[` increments, `]]` decrements (clamped at 0)
16
+ * - `[[[ ... ]]]` triple links do not affect block depth
17
+ * - quoted attribute values (`= "..."`) are skipped to the next `"` /
18
+ * newline, matching the lexer's `QUOTED_STRING` recognition
19
+ * - newlines reset depth to 0 (block openers are single-line constructs)
20
+ *
21
+ * @module
22
+ */
23
+
24
const BASE_PLACEHOLDER_OPEN = "\uE000";
const BASE_PLACEHOLDER_CLOSE = "\uE001";

const RAW_BLOCK_OPEN_PATTERN = /\[\[\s*(code|html)\b[^\]]*\]\]/iy;

/** Pair of marker strings that bracket a raw-region placeholder. */
export interface Sentinels {
  open: string;
  close: string;
}

/**
 * Pick an open/close marker pair such that neither marker occurs anywhere
 * in `source`.
 *
 * Both markers are runs of a single base character and always have the same
 * length; that length grows one character at a time until neither run
 * appears in `source`. Placeholders have the shape `<open><digits><close>`,
 * so absent markers keep the restore pass from mistaking content for a
 * placeholder.
 */
export function makeUniqueSentinels(source: string): Sentinels {
  let width = 1;
  for (;;) {
    const open = BASE_PLACEHOLDER_OPEN.repeat(width);
    const close = BASE_PLACEHOLDER_CLOSE.repeat(width);
    if (!source.includes(open) && !source.includes(close)) {
      return { open, close };
    }
    width++;
  }
}
50
+
51
/**
 * Replace every raw region of `source` with a placeholder token so later
 * scanning passes never look inside it. The original text of each region is
 * stored in `placeholders`, in order of appearance, for
 * {@link restorePlaceholders} to put back.
 *
 * Recognised raw regions:
 * - `[[code ...]] ... [[/code]]`; without a closing tag the region runs to
 *   the end of the input.
 * - `[[html ...]] ... [[/html]]`; masked only when the closing tag exists.
 * - `@< ... >@` where `>@` occurs before the next newline.
 * - `@@ ... @@` where the closing `@@` occurs before the next newline.
 *
 * Anything else, including unclosed `@@` / `@<` and `[!-- ... --]`
 * comments, is copied through unchanged.
 */
export function maskRawRegions(
  source: string,
  sentinels: Sentinels,
): { masked: string; placeholders: string[] } {
  const placeholders: string[] = [];
  const pieces: string[] = [];
  let cursor = 0;

  while (cursor < source.length) {
    const regionEnd = rawRegionEnd(source, cursor);
    if (regionEnd === -1) {
      pieces.push(source[cursor]!);
      cursor++;
      continue;
    }
    pieces.push(pushPlaceholder(placeholders, source.slice(cursor, regionEnd), sentinels));
    cursor = regionEnd;
  }

  return { masked: pieces.join(""), placeholders };
}

/** End offset (exclusive) of the raw region starting at `at`, or -1 if none starts there. */
function rawRegionEnd(source: string, at: number): number {
  if (source.startsWith("[[", at)) {
    // Sticky pattern: only matches an opener that begins exactly at `at`.
    RAW_BLOCK_OPEN_PATTERN.lastIndex = at;
    const opener = RAW_BLOCK_OPEN_PATTERN.exec(source);
    if (opener !== null) {
      const tag = opener[1]!.toLowerCase();
      const closer = new RegExp(`\\[\\[\\/\\s*${tag}\\s*\\]\\]`, "ig");
      closer.lastIndex = at + opener[0].length;
      const closing = closer.exec(source);
      if (closing !== null) return closing.index + closing[0].length;
      // Only an unclosed code block swallows the rest of the input.
      if (tag === "code") return source.length;
    }
    return -1;
  }
  if (source.startsWith("@<", at)) return sameLineRawEnd(source, at, ">@");
  if (source.startsWith("@@", at)) return sameLineRawEnd(source, at, "@@");
  return -1;
}

/**
 * End offset (exclusive) of a two-character-delimited inline raw span that
 * opens at `at`, or -1 when `closer` is missing or lies past the next newline.
 */
function sameLineRawEnd(source: string, at: number, closer: string): number {
  const closeAt = source.indexOf(closer, at + 2);
  if (closeAt === -1) return -1;
  const lineEnd = source.indexOf("\n", at + 2);
  return lineEnd === -1 || closeAt < lineEnd ? closeAt + 2 : -1;
}
132
+
133
/**
 * Append `text` to `placeholders` and return the token that stands in for
 * it: the open sentinel, the entry's index, then the close sentinel.
 */
function pushPlaceholder(placeholders: string[], text: string, sentinels: Sentinels): string {
  // `push` returns the new length, so the stored entry's index is one less.
  const slot = placeholders.push(text) - 1;
  return sentinels.open + String(slot) + sentinels.close;
}
138
+
139
/** Backslash-escape every regex metacharacter in `str`. */
function escapeRegex(str: string): string {
  return str.replace(/[.*+?^${}()|[\]\\]/g, (meta) => "\\" + meta);
}

/**
 * Undo {@link maskRawRegions}: every `<open><digits><close>` token in
 * `source` is replaced by the stored text at that index. A token whose
 * index has no stored entry is replaced by the empty string.
 */
export function restorePlaceholders(
  source: string,
  placeholders: string[],
  sentinels: Sentinels,
): string {
  const openPart = escapeRegex(sentinels.open);
  const closePart = escapeRegex(sentinels.close);
  const token = new RegExp(openPart + "(\\d+)" + closePart, "g");
  // A replacer function is used so `$` sequences in the stored text are
  // inserted literally.
  return source.replace(token, (_whole, digits: string) => {
    const original = placeholders[Number(digits)];
    return original ?? "";
  });
}
155
+
156
/**
 * Compute the unmatched-`[[` depth at each character offset of `source`.
 *
 * Rules applied while walking the text:
 * - `[[` increments the depth, `]]` decrements it (never below 0).
 * - `[[[ ... ]]]` triple links are skipped as a unit and do not change depth.
 * - Inside an opener (depth > 0), a `"` that follows `=` (optionally
 *   separated by spaces/tabs) starts a quoted value that is skipped up to
 *   the closing `"`.
 * - A newline resets the depth to 0. This also holds when a quoted value is
 *   unterminated: the skip stops in front of the newline, so the reset still
 *   happens and the next line does not inherit the previous line's opener
 *   context.
 *
 * @param source - Text to analyse.
 * @returns An `Int32Array` of length `source.length + 1`; `depths[k]` is the
 *   depth immediately before the character at offset `k` is consumed, and
 *   the last entry is the depth at end of input.
 */
export function computeBracketDepths(source: string): Int32Array {
  const n = source.length;
  const depths = new Int32Array(n + 1);
  let depth = 0;
  let i = 0;
  while (i < n) {
    depths[i] = depth;
    const c = source.charCodeAt(i);
    const c1 = i + 1 < n ? source.charCodeAt(i + 1) : -1;
    const c2 = i + 2 < n ? source.charCodeAt(i + 2) : -1;

    if (depth > 0 && c === 0x22 /* " */ && precededByEqualsAttr(source, i)) {
      const end = findQuoteEnd(source, i + 1);
      // When the value is unterminated, `end` is the newline itself. Do not
      // consume it here: the newline branch below must run to reset depth.
      const stoppedAtNewline = source.charCodeAt(end) === 0x0a;
      const next = stoppedAtNewline ? end : end + 1;
      depths.fill(depth, i, next);
      i = next;
      continue;
    }

    if (c === 0x5b /* [ */ && c1 === 0x5b && c2 === 0x5b) {
      const end = findTripleLinkEnd(source, i + 3);
      depths.fill(depth, i, end + 1);
      i = end + 1;
      continue;
    }

    if (c === 0x5b && c1 === 0x5b) {
      depth++;
      depths[i + 1] = depth;
      i += 2;
      continue;
    }

    if (c === 0x5d /* ] */ && c1 === 0x5d) {
      depth = Math.max(0, depth - 1);
      depths[i + 1] = depth;
      i += 2;
      continue;
    }

    if (c === 0x0a /* \n */) {
      // Block openers are single-line; reset depth at line boundaries so
      // an unterminated `[[xxx` does not keep subsequent directives
      // inside its (imaginary) opener context.
      depth = 0;
    }

    i++;
  }
  depths[n] = depth;
  return depths;
}

/**
 * True when the nearest character before offset `i` that is not a space or
 * tab is `=`, i.e. the character at `i` opens an attribute value.
 */
function precededByEqualsAttr(s: string, i: number): boolean {
  for (let j = i - 1; j >= 0; j--) {
    const ch = s.charCodeAt(j);
    if (ch === 0x20 /* space */ || ch === 0x09 /* tab */) continue;
    return ch === 0x3d; /* = */
  }
  return false;
}

/**
 * Offset of the first `"` or newline at or after `from`; the last offset of
 * `s` when neither occurs.
 */
function findQuoteEnd(s: string, from: number): number {
  for (let i = from; i < s.length; i++) {
    const ch = s.charCodeAt(i);
    if (ch === 0x22 /* " */ || ch === 0x0a /* \n */) return i;
  }
  return s.length - 1;
}

/**
 * Offset of the last `]` of the first `]]]` at or after `from`. The search
 * gives up at a blank line (two consecutive newlines), returning the offset
 * of the first of them, and returns the last offset of `s` when neither is
 * found.
 */
function findTripleLinkEnd(s: string, from: number): number {
  for (let i = from; i < s.length; i++) {
    if (
      s.charCodeAt(i) === 0x5d &&
      i + 2 < s.length &&
      s.charCodeAt(i + 1) === 0x5d &&
      s.charCodeAt(i + 2) === 0x5d
    ) {
      return i + 2;
    }
    if (s.charCodeAt(i) === 0x0a && i + 1 < s.length && s.charCodeAt(i + 1) === 0x0a) {
      return i;
    }
  }
  return s.length - 1;
}