@wdprlib/parser 3.1.1 → 3.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +312 -121
- package/dist/index.js +289 -98
- package/package.json +5 -3
- package/src/index.ts +163 -0
- package/src/lexer/index.ts +20 -0
- package/src/lexer/lexer.ts +687 -0
- package/src/lexer/tokens.ts +141 -0
- package/src/parser/constants.ts +173 -0
- package/src/parser/depth.ts +251 -0
- package/src/parser/index.ts +18 -0
- package/src/parser/parse.ts +315 -0
- package/src/parser/postprocess/divAdjacentParagraph.ts +76 -0
- package/src/parser/postprocess/index.ts +15 -0
- package/src/parser/postprocess/spanStrip.ts +697 -0
- package/src/parser/preprocess/expr.ts +265 -0
- package/src/parser/preprocess/index.ts +38 -0
- package/src/parser/preprocess/typography.ts +67 -0
- package/src/parser/preprocess/utils.ts +250 -0
- package/src/parser/preprocess/whitespace.ts +111 -0
- package/src/parser/rules/block/align.ts +282 -0
- package/src/parser/rules/block/bibliography.ts +359 -0
- package/src/parser/rules/block/block-list.ts +689 -0
- package/src/parser/rules/block/blockquote.ts +238 -0
- package/src/parser/rules/block/center.ts +87 -0
- package/src/parser/rules/block/clear-float.ts +75 -0
- package/src/parser/rules/block/code.ts +187 -0
- package/src/parser/rules/block/collapsible.ts +337 -0
- package/src/parser/rules/block/comment.ts +73 -0
- package/src/parser/rules/block/content-separator.ts +79 -0
- package/src/parser/rules/block/definition-list.ts +270 -0
- package/src/parser/rules/block/div.ts +400 -0
- package/src/parser/rules/block/embed-block.ts +153 -0
- package/src/parser/rules/block/footnoteblock.ts +200 -0
- package/src/parser/rules/block/heading.ts +142 -0
- package/src/parser/rules/block/horizontal-rule.ts +61 -0
- package/src/parser/rules/block/html.ts +222 -0
- package/src/parser/rules/block/iframe.ts +239 -0
- package/src/parser/rules/block/iftags.ts +150 -0
- package/src/parser/rules/block/include.ts +179 -0
- package/src/parser/rules/block/index.ts +127 -0
- package/src/parser/rules/block/list.ts +244 -0
- package/src/parser/rules/block/math.ts +183 -0
- package/src/parser/rules/block/module/backlinks/index.ts +31 -0
- package/src/parser/rules/block/module/backlinks/types.ts +21 -0
- package/src/parser/rules/block/module/categories/index.ts +34 -0
- package/src/parser/rules/block/module/categories/types.ts +21 -0
- package/src/parser/rules/block/module/css/index.ts +37 -0
- package/src/parser/rules/block/module/iftags/condition.ts +109 -0
- package/src/parser/rules/block/module/iftags/index.ts +26 -0
- package/src/parser/rules/block/module/iftags/preprocess.ts +140 -0
- package/src/parser/rules/block/module/iftags/resolve.ts +73 -0
- package/src/parser/rules/block/module/iftags/types.ts +63 -0
- package/src/parser/rules/block/module/include/index.ts +20 -0
- package/src/parser/rules/block/module/include/resolve.ts +556 -0
- package/src/parser/rules/block/module/index.ts +122 -0
- package/src/parser/rules/block/module/join/index.ts +34 -0
- package/src/parser/rules/block/module/join/types.ts +23 -0
- package/src/parser/rules/block/module/listpages/compiler.ts +453 -0
- package/src/parser/rules/block/module/listpages/extract.ts +410 -0
- package/src/parser/rules/block/module/listpages/index.ts +83 -0
- package/src/parser/rules/block/module/listpages/normalize.ts +390 -0
- package/src/parser/rules/block/module/listpages/parser.ts +106 -0
- package/src/parser/rules/block/module/listpages/resolve.ts +130 -0
- package/src/parser/rules/block/module/listpages/types.ts +513 -0
- package/src/parser/rules/block/module/listpages/url-resolver.ts +186 -0
- package/src/parser/rules/block/module/listusers/compiler.ts +77 -0
- package/src/parser/rules/block/module/listusers/extract.ts +45 -0
- package/src/parser/rules/block/module/listusers/index.ts +36 -0
- package/src/parser/rules/block/module/listusers/parser.ts +54 -0
- package/src/parser/rules/block/module/listusers/resolve.ts +58 -0
- package/src/parser/rules/block/module/listusers/types.ts +93 -0
- package/src/parser/rules/block/module/mapping.ts +61 -0
- package/src/parser/rules/block/module/page-tree/index.ts +38 -0
- package/src/parser/rules/block/module/page-tree/types.ts +29 -0
- package/src/parser/rules/block/module/rate/index.ts +28 -0
- package/src/parser/rules/block/module/rate/types.ts +19 -0
- package/src/parser/rules/block/module/resolve.ts +411 -0
- package/src/parser/rules/block/module/types-common.ts +59 -0
- package/src/parser/rules/block/module/types.ts +61 -0
- package/src/parser/rules/block/module/utils.ts +43 -0
- package/src/parser/rules/block/module/walk.ts +380 -0
- package/src/parser/rules/block/module.ts +164 -0
- package/src/parser/rules/block/orphan-li.ts +177 -0
- package/src/parser/rules/block/paragraph.ts +157 -0
- package/src/parser/rules/block/table-block.ts +726 -0
- package/src/parser/rules/block/table.ts +441 -0
- package/src/parser/rules/block/tabview.ts +331 -0
- package/src/parser/rules/block/toc.ts +129 -0
- package/src/parser/rules/block/utils.ts +615 -0
- package/src/parser/rules/index.ts +49 -0
- package/src/parser/rules/inline/anchor-name.ts +154 -0
- package/src/parser/rules/inline/anchor.ts +327 -0
- package/src/parser/rules/inline/bibcite.ts +153 -0
- package/src/parser/rules/inline/bold.ts +86 -0
- package/src/parser/rules/inline/color.ts +140 -0
- package/src/parser/rules/inline/comment.ts +90 -0
- package/src/parser/rules/inline/equation-ref.ts +115 -0
- package/src/parser/rules/inline/expr.ts +526 -0
- package/src/parser/rules/inline/footnote.ts +223 -0
- package/src/parser/rules/inline/guillemet.ts +64 -0
- package/src/parser/rules/inline/html.ts +132 -0
- package/src/parser/rules/inline/image.ts +328 -0
- package/src/parser/rules/inline/index.ts +150 -0
- package/src/parser/rules/inline/italic.ts +74 -0
- package/src/parser/rules/inline/line-break.ts +326 -0
- package/src/parser/rules/inline/link-anchor.ts +147 -0
- package/src/parser/rules/inline/link-single.ts +164 -0
- package/src/parser/rules/inline/link-star.ts +134 -0
- package/src/parser/rules/inline/link-triple.ts +267 -0
- package/src/parser/rules/inline/math-inline.ts +126 -0
- package/src/parser/rules/inline/monospace.ts +78 -0
- package/src/parser/rules/inline/raw.ts +262 -0
- package/src/parser/rules/inline/size.ts +244 -0
- package/src/parser/rules/inline/span.ts +424 -0
- package/src/parser/rules/inline/strikethrough.ts +115 -0
- package/src/parser/rules/inline/subscript.ts +84 -0
- package/src/parser/rules/inline/superscript.ts +84 -0
- package/src/parser/rules/inline/text.ts +84 -0
- package/src/parser/rules/inline/underline.ts +127 -0
- package/src/parser/rules/inline/user.ts +147 -0
- package/src/parser/rules/inline/utils.ts +344 -0
- package/src/parser/rules/types.ts +252 -0
- package/src/parser/rules/utils.ts +155 -0
- package/src/parser/toc.ts +130 -0
|
@@ -0,0 +1,265 @@
|
|
|
1
|
+
/**
|
|
2
|
+
*
|
|
3
|
+
* Text-level expansion of `[[#if ...]]`, `[[#ifexpr ...]]`, and
|
|
4
|
+
* `[[#expr ...]]` directives that sit *inside* another block's opener.
|
|
5
|
+
*
|
|
6
|
+
* The inline rules in `rules/inline/expr.ts` parse these forms as regular
|
|
7
|
+
* inline elements, but that only works when the directive appears in
|
|
8
|
+
* parseable inline text. When one is embedded inside a block opener's
|
|
9
|
+
* attribute string, e.g.
|
|
10
|
+
*
|
|
11
|
+
* ```wikitext
|
|
12
|
+
* [[div class="x [[#if 1 | a | b ]]"]]
|
|
13
|
+
* [[li class="[[#if 1 | folded | unfolded ]] [[#ifexpr 1>0 | hot | cold ]]"]]
|
|
14
|
+
* [[div col="[[#expr 1+1]]"]]
|
|
15
|
+
* ```
|
|
16
|
+
*
|
|
17
|
+
* the lexer cannot recover a well-formed opener from the input. The
|
|
18
|
+
* embedded directive has to collapse to a plain string before the parser
|
|
19
|
+
* sees the outer tag.
|
|
20
|
+
*
|
|
21
|
+
* This pass only resolves directives whose `[[#` sits inside an unclosed
|
|
22
|
+
* `[[` (depth > 0). Top-level directives are left untouched so the inline
|
|
23
|
+
* parser / AST renderer keeps its full evaluator + element support.
|
|
24
|
+
*
|
|
25
|
+
* Truthiness rules match the inline `ifRule` / `ifExprRule`: an empty
|
|
26
|
+
* string, `"0"`, `"false"`, `"null"` (case-insensitive) are falsy.
|
|
27
|
+
*
|
|
28
|
+
* @module
|
|
29
|
+
*/
|
|
30
|
+
|
|
31
|
+
import { evaluateExpression, formatExprValue, isTruthy } from "@wdprlib/ast";
|
|
32
|
+
import {
|
|
33
|
+
computeBracketDepths,
|
|
34
|
+
makeUniqueSentinels,
|
|
35
|
+
maskRawRegions,
|
|
36
|
+
restorePlaceholders,
|
|
37
|
+
} from "./utils";
|
|
38
|
+
|
|
39
|
+
/**
 * Collapse every `[[#if]]` / `[[#ifexpr]]` / `[[#expr]]` directive that
 * is nested inside another block's opener (bracket depth > 0) into its
 * evaluated text.
 *
 * Raw regions are masked before the reduction and restored afterwards,
 * so directives they enclose are never rewritten. Directives at the top
 * level, as well as unmatched or malformed ones, are returned untouched
 * for the inline parser to handle.
 *
 * @param source - Wikitext to scan
 * @returns The text with opener-embedded directives resolved
 */
export function preprocessExpr(source: string): string {
  // Fast path: without a `[[#` there is nothing this pass could resolve.
  if (source.indexOf("[[#") === -1) {
    return source;
  }

  const sentinels = makeUniqueSentinels(source);
  const raw = maskRawRegions(source, sentinels);
  return restorePlaceholders(reduceExpr(raw.masked), raw.placeholders, sentinels);
}
|
|
54
|
+
|
|
55
|
+
/**
 * Legacy name for {@link preprocessExpr}, kept so external callers
 * written against the earlier `[[#if]]`-only pass keep working.
 *
 * This is the very same function object, so it also resolves
 * `[[#ifexpr]]` and `[[#expr]]` directives in opener context.
 */
export const preprocessIf: (source: string) => string = preprocessExpr;
|
|
62
|
+
|
|
63
|
+
/**
 * Apply {@link expandInnermost} repeatedly until the text stops changing.
 *
 * The number of passes is capped at `source.length + 1`, so the loop
 * always terminates even if a pass keeps producing new text.
 */
function reduceExpr(source: string): string {
  const passLimit = source.length + 1;
  let text = source;
  let passes = 0;

  while (passes < passLimit) {
    const expanded = expandInnermost(text);
    if (expanded === text) break; // fixed point reached
    text = expanded;
    passes += 1;
  }

  return text;
}
|
|
73
|
+
|
|
74
|
+
/**
 * Perform one left-to-right pass over `source`, replacing each innermost
 * `[[#if]]` / `[[#ifexpr]]` / `[[#expr]]` directive that starts at a
 * bracket depth greater than zero with its evaluated string.
 *
 * Depths are computed once, up front, on the text as passed in.
 *
 * @returns The rewritten text, or `source` itself (same reference) when
 *   nothing was replaced, which lets the caller detect a fixed point by
 *   identity comparison.
 */
function expandInnermost(source: string): string {
  const depths = computeBracketDepths(source);
  const pieces: string[] = [];
  let copiedUpTo = 0; // start of the not-yet-copied literal text
  let cursor = 0;

  while (cursor < source.length) {
    const kind = matchDirectiveKind(source, cursor);
    const match =
      kind !== null && depths[cursor]! > 0
        ? tryParseInnermostDirective(source, cursor, kind)
        : null;

    if (kind === null || match === null) {
      // Not a resolvable directive here; this character stays literal.
      cursor++;
      continue;
    }

    pieces.push(source.slice(copiedUpTo, cursor), evaluateDirective(kind, match));
    cursor = match.end;
    copiedUpTo = cursor;
  }

  if (pieces.length === 0) return source;
  pieces.push(source.slice(copiedUpTo));
  return pieces.join("");
}
|
|
103
|
+
|
|
104
|
+
/** Keyword of a `[[#...]]` directive handled by this pass. */
type DirectiveKind = "if" | "ifexpr" | "expr";
|
|
105
|
+
|
|
106
|
+
/**
 * Identify the directive opening at offset `i`.
 *
 * A match needs the literal `[[#`, one of the known keywords, and a
 * following character that is not an identifier character (so `[[#iffy`
 * is not read as `if`).
 *
 * @returns The directive kind, or `null` when no directive starts at `i`
 */
function matchDirectiveKind(source: string, i: number): DirectiveKind | null {
  if (!source.startsWith("[[#", i)) return null;

  const keywordStart = i + 3;
  // Longest keyword first: `if` is a prefix of `ifexpr`, so `ifexpr` has
  // to be tried before it.
  const keywords: DirectiveKind[] = ["ifexpr", "if", "expr"];
  for (const keyword of keywords) {
    const following = source[keywordStart + keyword.length];
    if (source.startsWith(keyword, keywordStart) && !isIdentChar(following)) {
      return keyword;
    }
  }
  return null;
}
|
|
122
|
+
|
|
123
|
+
/** Raw pieces of a successfully parsed `[[#kind ...]]` directive. */
interface DirectiveMatch {
  /** Offset of the first character after the closing `]]`. */
  end: number;
  /**
   * Trimmed condition / expression text: everything from after the
   * keyword up to the first top-level `|`, or up to `]]` when there is
   * no pipe.
   */
  head: string;
  /** Trimmed `then` branch; empty when the directive has no `|`. */
  thenText: string;
  /** Trimmed `else` branch; empty when fewer than two `|` were found. */
  elseText: string;
  /** True when at least one top-level `|` was found. */
  hasPipe: boolean;
}
|
|
135
|
+
|
|
136
|
+
/**
 * Parse the `[[#kind ...]]` directive that opens at `start`.
 *
 * The body is scanned for the matching `]]` while tracking nested
 * `[[ ... ]]` blocks and `[[[ ... ]]]` links; only `|` characters outside
 * both count as branch separators. Text after a second separator,
 * including any further `|`, belongs to the `else` branch.
 *
 * @param source - Text being scanned
 * @param start - Offset of the directive's opening `[[#`
 * @param kind - Directive kind already detected at `start`
 * @returns The raw, trimmed pieces of the directive, or `null` when
 *   - another `[[#if` / `[[#ifexpr` / `[[#expr` opens inside the body
 *     (the caller resolves that inner directive first),
 *   - no closing `]]` is found, or
 *   - an `if` / `ifexpr` directive has no `|` at all.
 */
function tryParseInnermostDirective(
  source: string,
  start: number,
  kind: DirectiveKind,
): DirectiveMatch | null {
  // `kind` is spelled exactly like its keyword, so its length is the
  // keyword length. Skip "[[#", the keyword, then optional whitespace;
  // no separator is required (`[[#expr(1+1)]]` is accepted).
  let cursor = start + "[[#".length + kind.length;
  while (cursor < source.length && isWhitespace(source[cursor])) cursor++;
  const headStart = cursor;

  const pipeOffsets: number[] = [];
  let nestedBlocks = 0;
  let openLinks = 0;
  let closeStart = -1;

  while (cursor < source.length && closeStart === -1) {
    // Any directive opening inside the body means this one is not the
    // innermost; bail out so the inner one is reduced first.
    if (matchDirectiveKind(source, cursor) !== null) return null;

    if (source.startsWith("[[[", cursor)) {
      openLinks++;
      cursor += 3;
    } else if (openLinks > 0) {
      // Inside a triple link everything is opaque until its `]]]`.
      if (source.startsWith("]]]", cursor)) {
        openLinks--;
        cursor += 3;
      } else {
        cursor++;
      }
    } else if (source.startsWith("[[", cursor)) {
      nestedBlocks++;
      cursor += 2;
    } else if (source.startsWith("]]", cursor)) {
      if (nestedBlocks === 0) {
        closeStart = cursor; // this `]]` closes the directive itself
      } else {
        nestedBlocks--;
        cursor += 2;
      }
    } else {
      // No link is open on this branch, so block nesting alone decides
      // whether a pipe is a top-level separator.
      if (source[cursor] === "|" && nestedBlocks === 0) pipeOffsets.push(cursor);
      cursor++;
    }
  }

  if (closeStart === -1) return null;

  const hasPipe = pipeOffsets.length > 0;
  // `if` / `ifexpr` need a `then` branch; a pipe-less one is malformed
  // and is left in the text instead of being silently dropped.
  if (!hasPipe && kind !== "expr") return null;

  const cut = (from: number, to: number): string => source.slice(from, to).trim();
  const end = closeStart + 2;
  const [firstPipe, secondPipe] = pipeOffsets;

  if (firstPipe === undefined) {
    return { end, head: cut(headStart, closeStart), thenText: "", elseText: "", hasPipe };
  }
  if (secondPipe === undefined) {
    return {
      end,
      head: cut(headStart, firstPipe),
      thenText: cut(firstPipe + 1, closeStart),
      elseText: "",
      hasPipe,
    };
  }
  return {
    end,
    head: cut(headStart, firstPipe),
    thenText: cut(firstPipe + 1, secondPipe),
    elseText: cut(secondPipe + 1, closeStart),
    hasPipe,
  };
}
|
|
233
|
+
|
|
234
|
+
/**
 * Turn a parsed directive into the text that replaces it.
 *
 * - `expr`: the formatted value of the expression; an empty expression
 *   yields `""`, any other evaluation failure yields `"ERROR"`.
 * - `if`: `thenText` when `head` is truthy according to `isTruthy`,
 *   otherwise `elseText`.
 * - `ifexpr`: `"ERROR"` when evaluation fails (an empty expression
 *   included); otherwise `thenText` when the value is neither `0` nor
 *   `NaN`, else `elseText`.
 *
 * @param kind - Directive kind
 * @param m - Raw pieces produced by the directive parser
 * @returns Replacement text for the whole directive
 */
function evaluateDirective(kind: DirectiveKind, m: DirectiveMatch): string {
  // Defensive guard: a conditional without any `|` has no branch to
  // select, so it collapses to nothing.
  if (kind !== "expr" && !m.hasPipe) return "";

  if (kind === "if") {
    return isTruthy(m.head) ? m.thenText : m.elseText;
  }

  const outcome = evaluateExpression(m.head);

  if (kind === "expr") {
    if (outcome.success) return formatExprValue(outcome.value);
    // An empty `[[#expr ]]` collapses to an empty attribute value rather
    // than the literal "ERROR" placeholder.
    return outcome.error === "empty expression" ? "" : "ERROR";
  }

  // ifexpr: every failure keeps the placeholder so a malformed
  // conditional is not silently swallowed.
  if (!outcome.success) return "ERROR";
  const holds = outcome.value !== 0 && !Number.isNaN(outcome.value);
  return holds ? m.thenText : m.elseText;
}
|
|
257
|
+
|
|
258
|
+
/** True for a space, tab, line feed or carriage return; false otherwise (including `undefined`). */
function isWhitespace(ch: string | undefined): boolean {
  switch (ch) {
    case " ":
    case "\t":
    case "\n":
    case "\r":
      return true;
    default:
      return false;
  }
}
|
|
261
|
+
|
|
262
|
+
/**
 * True when `ch` contains an ASCII letter, digit, `_` or `-`.
 * `undefined` and the empty string are never identifier characters.
 */
function isIdentChar(ch: string | undefined): boolean {
  return ch !== undefined && ch.length > 0 && /[a-z0-9_-]/i.test(ch);
}
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
/**
|
|
2
|
+
*
|
|
3
|
+
* Preprocessing pipeline that transforms raw wikitext before tokenization.
|
|
4
|
+
*
|
|
5
|
+
* Wikidot applies two categories of text substitutions before the main parser
|
|
6
|
+
* sees the input. This module orchestrates those substitutions in the correct
|
|
7
|
+
* order: whitespace normalization first (to establish consistent line structure),
|
|
8
|
+
* then typographic transformations (to convert ASCII quote/ellipsis patterns
|
|
9
|
+
* into Unicode equivalents).
|
|
10
|
+
*
|
|
11
|
+
* The preprocessing step is essential because the lexer and parser assume
|
|
12
|
+
* normalized input (Unix newlines, no tabs, consistent whitespace).
|
|
13
|
+
*
|
|
14
|
+
* @module
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
import { substitute as whitespaceSubstitute } from "./whitespace";
|
|
18
|
+
import { substitute as typographySubstitute } from "./typography";
|
|
19
|
+
|
|
20
|
+
export { substitute as whitespace } from "./whitespace";
|
|
21
|
+
export { substitute as typography } from "./typography";
|
|
22
|
+
|
|
23
|
+
/**
 * Run the full preprocessing pipeline on raw wikitext.
 *
 * The passes run in a fixed order:
 * 1. Whitespace normalization (DOS/Mac newlines, tabs, leading spaces, etc.)
 * 2. Typographic substitutions (curly quotes, ellipsis)
 *
 * @param text - Raw wikitext input
 * @returns Preprocessed text ready for tokenization
 */
export function preprocess(text: string): string {
  const normalized = whitespaceSubstitute(text);
  return typographySubstitute(normalized);
}
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
/**
|
|
2
|
+
*
|
|
3
|
+
* Typographic preprocessing for Wikidot markup.
|
|
4
|
+
*
|
|
5
|
+
* Wikidot converts certain ASCII character sequences into their Unicode
|
|
6
|
+
* typographic equivalents before parsing. This module handles the following
|
|
7
|
+
* conversions:
|
|
8
|
+
*
|
|
9
|
+
* - ` `` ... '' ` becomes left/right double curly quotes (U+201C / U+201D)
|
|
10
|
+
* - ` ,, ... '' ` becomes low-9 double quote + right double quote (U+201E / U+201D)
|
|
11
|
+
* - `` ` ... ' `` becomes left/right single curly quotes (U+2018 / U+2019)
|
|
12
|
+
* - `...` (three dots) and `. . .` (spaced dots) become an ellipsis (U+2026)
|
|
13
|
+
*
|
|
14
|
+
* Em dash conversion (`--` to U+2014) is intentionally NOT handled here.
|
|
15
|
+
* It is performed in the parser instead, because the `--` sequence also appears
|
|
16
|
+
* in HTML comment markers (`[!--` and `--]`), and converting it during
|
|
17
|
+
* preprocessing would break comment detection.
|
|
18
|
+
*
|
|
19
|
+
* @module
|
|
20
|
+
*/
|
|
21
|
+
|
|
22
|
+
/** U+2018 LEFT SINGLE QUOTATION MARK (‘). */
const LEFT_SINGLE_QUOTE = "\u2018";
/** U+2019 RIGHT SINGLE QUOTATION MARK (’). */
const RIGHT_SINGLE_QUOTE = "\u2019";
/** U+201C LEFT DOUBLE QUOTATION MARK (“). */
const LEFT_DOUBLE_QUOTE = "\u201c";
/** U+201D RIGHT DOUBLE QUOTATION MARK (”). */
const RIGHT_DOUBLE_QUOTE = "\u201d";
/** U+201E DOUBLE LOW-9 QUOTATION MARK („), used in German/Polish typography. */
const LOW_DOUBLE_QUOTE = "\u201e";
/** U+2026 HORIZONTAL ELLIPSIS (…). */
const ELLIPSIS = "\u2026";
|
|
34
|
+
|
|
35
|
+
/**
 * Replace ASCII typography patterns with their Unicode equivalents.
 *
 * The replacements run in this order, which matters because backtick and
 * apostrophe are shared between the single- and double-quote patterns:
 * 1. two backticks ... two apostrophes -> left/right double quotes
 * 2. two commas ... two apostrophes -> low-9 double quote / right double quote
 * 3. backtick ... apostrophe -> left/right single quotes
 * 4. exactly three dots -> ellipsis
 * 5. `. . .` (dots separated by single spaces) -> ellipsis
 *
 * Quote bodies are matched lazily with `.`, so a pair never spans a line
 * break. The ellipsis patterns refuse to match when another dot touches
 * either end.
 *
 * @param text - Text to transform
 * @returns Text with the patterns above replaced
 */
export function substitute(text: string): string {
  return text
    .replace(/``(.*?)''/g, `${LEFT_DOUBLE_QUOTE}$1${RIGHT_DOUBLE_QUOTE}`)
    .replace(/,,(.*?)''/g, `${LOW_DOUBLE_QUOTE}$1${RIGHT_DOUBLE_QUOTE}`)
    .replace(/`(.*?)'/g, `${LEFT_SINGLE_QUOTE}$1${RIGHT_SINGLE_QUOTE}`)
    .replace(/(?<![.])\.\.\.(?![.])/g, ELLIPSIS)
    .replace(/(?<![.])\. \. \.(?![.])/g, ELLIPSIS);
}
|
|
@@ -0,0 +1,250 @@
|
|
|
1
|
+
/**
|
|
2
|
+
*
|
|
3
|
+
* Shared helpers for text-level preprocess passes that run before
|
|
4
|
+
* tokenization (e.g. `[[iftags]]` collapse, opener-embedded `[[#if]]`
|
|
5
|
+
* collapse).
|
|
6
|
+
*
|
|
7
|
+
* Each pass needs to:
|
|
8
|
+
* - mask raw regions (`[[code]]`, `[[html]]`, `@@..@@`, `@<..>@`) so a
|
|
9
|
+
* pattern they enclose is not transformed
|
|
10
|
+
* - know the bracket-opener depth at every offset so it can distinguish
|
|
11
|
+
* directives at the top level from ones nested inside another block's
|
|
12
|
+
* opener attribute string
|
|
13
|
+
*
|
|
14
|
+
* The depth tracking mirrors the lexer's `blockOpenerDepth`:
|
|
15
|
+
* - `[[` increments, `]]` decrements (clamped at 0)
|
|
16
|
+
* - `[[[ ... ]]]` triple links do not affect block depth
|
|
17
|
+
* - quoted attribute values (`= "..."`) are skipped to the next `"` /
|
|
18
|
+
* newline, matching the lexer's `QUOTED_STRING` recognition
|
|
19
|
+
* - newlines reset depth to 0 (block openers are single-line constructs)
|
|
20
|
+
*
|
|
21
|
+
* @module
|
|
22
|
+
*/
|
|
23
|
+
|
|
24
|
+
/** Base character for the opening sentinel (U+E000, a Private Use Area code point). */
const BASE_PLACEHOLDER_OPEN = "\uE000";
/** Base character for the closing sentinel (U+E001, a Private Use Area code point). */
const BASE_PLACEHOLDER_CLOSE = "\uE001";

/**
 * Matches a `[[code ...]]` or `[[html ...]]` opener, case-insensitively,
 * capturing the block name. The pattern is sticky: it only matches at
 * `lastIndex`, which callers must set before each `exec`.
 */
const RAW_BLOCK_OPEN_PATTERN = /\[\[\s*(code|html)\b[^\]]*\]\]/iy;
|
|
28
|
+
|
|
29
|
+
/**
 * Pair of delimiter strings that bracket the numeric index of a masked
 * raw region inside the masked text (`<open><index><close>`).
 */
export interface Sentinels {
  /** Delimiter placed before the placeholder index. */
  open: string;
  /** Delimiter placed after the placeholder index. */
  close: string;
}
|
|
34
|
+
|
|
35
|
+
/**
 * Pick sentinel strings that do not occur anywhere in `source`.
 *
 * Placeholders spliced into the masked text look like
 * `<open><digits><close>`, so neither delimiter may already be present
 * in the content. Both delimiters start as a single base character and
 * are lengthened together, one repetition at a time, until neither is
 * found in `source`.
 *
 * @param source - Text the sentinels must be absent from
 * @returns Open/close delimiters of equal repetition count
 */
export function makeUniqueSentinels(source: string): Sentinels {
  for (let width = 1; ; width++) {
    const open = BASE_PLACEHOLDER_OPEN.repeat(width);
    const close = BASE_PLACEHOLDER_CLOSE.repeat(width);
    if (!source.includes(open) && !source.includes(close)) {
      return { open, close };
    }
  }
}
|
|
50
|
+
|
|
51
|
+
/**
 * Replace every raw region of `source` with a placeholder token so that
 * later text-level passes cannot rewrite what the region contains. The
 * original substrings are collected, in order, in `placeholders` and can
 * be spliced back with {@link restorePlaceholders}.
 *
 * Regions recognised, checked in this order at each offset:
 * - `[[code ...]] ... [[/code]]`; when the closing tag is missing, the
 *   region runs to the end of the input.
 * - `[[html ...]] ... [[/html]]`; masked only when the closing tag
 *   exists, so an unclosed `[[html]]` cannot hide a later directive.
 * - `@< ... >@`, only when `>@` occurs before the next newline.
 * - `@@ ... @@`, only when the closing `@@` occurs before the next newline.
 *
 * Unclosed `@@` / `@<` stay in place as ordinary text. Comments
 * (`[!-- ... --]`) are deliberately not masked: Wikidot's legacy
 * Text_Wiki evaluates `[[iftags]]` before comments, and masking them here
 * would invert that order.
 *
 * @param source - Text to mask
 * @param sentinels - Delimiters used to build the placeholder tokens
 * @returns The masked text and the list of original region substrings
 */
export function maskRawRegions(
  source: string,
  sentinels: Sentinels,
): { masked: string; placeholders: string[] } {
  const placeholders: string[] = [];
  const out: string[] = [];
  let cursor = 0;

  // Swap source[cursor, regionEnd) for a placeholder and jump past it.
  const mask = (regionEnd: number): void => {
    out.push(pushPlaceholder(placeholders, source.slice(cursor, regionEnd), sentinels));
    cursor = regionEnd;
  };

  // End offset of a two-character-delimited inline raw region opening at
  // `cursor`, or -1 when `closer` is absent or only appears after the
  // next newline.
  const inlineRawEnd = (closer: string): number => {
    const close = source.indexOf(closer, cursor + 2);
    if (close === -1) return -1;
    const newline = source.indexOf("\n", cursor + 2);
    return newline === -1 || close < newline ? close + 2 : -1;
  };

  while (cursor < source.length) {
    if (source.startsWith("[[", cursor)) {
      RAW_BLOCK_OPEN_PATTERN.lastIndex = cursor;
      const opener = RAW_BLOCK_OPEN_PATTERN.exec(source);
      if (opener) {
        const name = opener[1]!.toLowerCase();
        const closePattern = new RegExp(`\\[\\[\\/\\s*${name}\\s*\\]\\]`, "ig");
        closePattern.lastIndex = cursor + opener[0].length;
        const closer = closePattern.exec(source);
        if (closer) {
          mask(closer.index + closer[0].length);
          continue;
        }
        if (name === "code") {
          // Unclosed code block: everything up to EOF is raw.
          mask(source.length);
          continue;
        }
      }
    }

    if (source.startsWith("@<", cursor)) {
      const regionEnd = inlineRawEnd(">@");
      if (regionEnd !== -1) {
        mask(regionEnd);
        continue;
      }
    }

    if (source.startsWith("@@", cursor)) {
      const regionEnd = inlineRawEnd("@@");
      if (regionEnd !== -1) {
        mask(regionEnd);
        continue;
      }
    }

    out.push(source[cursor]!);
    cursor++;
  }

  return { masked: out.join(""), placeholders };
}
|
|
132
|
+
|
|
133
|
+
/**
 * Append `text` to `placeholders` and return the token that stands in
 * for it: the open sentinel, the entry's index, then the close sentinel.
 */
function pushPlaceholder(placeholders: string[], text: string, sentinels: Sentinels): string {
  // `push` returns the new length, so the new entry sits at length - 1.
  const slot = placeholders.push(text) - 1;
  return sentinels.open + String(slot) + sentinels.close;
}
|
|
138
|
+
|
|
139
|
+
/** Backslash-escape every regular-expression metacharacter in `str` so it matches literally. */
function escapeRegex(str: string): string {
  const metacharacters = /[.*+?^${}()|[\]\\]/g;
  return str.replace(metacharacters, "\\$&");
}
|
|
142
|
+
|
|
143
|
+
/**
 * Inverse of {@link maskRawRegions}: replace every
 * `<open><digits><close>` token in `source` with the stored original at
 * that index.
 *
 * A token whose index has no entry in `placeholders` is replaced by the
 * empty string.
 *
 * @param source - Masked text
 * @param placeholders - Original substrings, indexed by placeholder number
 * @param sentinels - The delimiters that were used for masking
 * @returns Text with the original raw regions spliced back in
 */
export function restorePlaceholders(
  source: string,
  placeholders: string[],
  sentinels: Sentinels,
): string {
  const openPart = escapeRegex(sentinels.open);
  const closePart = escapeRegex(sentinels.close);
  const token = new RegExp(`${openPart}(\\d+)${closePart}`, "g");

  // A replacer function (not a string) keeps `$` sequences inside the
  // restored text from being interpreted as replacement patterns.
  return source.replace(token, (_whole, digits: string) => {
    const original = placeholders[Number(digits)];
    return original ?? "";
  });
}
|
|
155
|
+
|
|
156
|
+
/**
 * Compute the unmatched-`[[` depth at each character offset of `source`.
 *
 * Rules applied while scanning left to right:
 * - `[[` increments the depth and `]]` decrements it (never below 0).
 * - `[[[ ... ]]]` triple links are skipped as a unit and leave the depth
 *   unchanged.
 * - Inside an opener (depth > 0), a `"` that follows `=` (spaces/tabs
 *   allowed in between) starts a quoted attribute value; everything up
 *   to the closing `"` keeps the current depth, so brackets inside the
 *   value are not counted.
 * - A newline resets the depth to 0. This also holds when a quoted value
 *   is unterminated and runs into the end of the line: the newline is
 *   handed back to the main loop so the following line starts at depth 0
 *   instead of inheriting the unfinished opener.
 *
 * @param source - Text to analyse
 * @returns `Int32Array` of length `source.length + 1`; `depths[k]` is the
 *   depth immediately before the character at offset `k` is consumed,
 *   and the final entry is the depth at the end of the input.
 */
export function computeBracketDepths(source: string): Int32Array {
  const n = source.length;
  const depths = new Int32Array(n + 1);
  let depth = 0;
  let i = 0;

  while (i < n) {
    depths[i] = depth;
    const c = source.charCodeAt(i);
    const c1 = i + 1 < n ? source.charCodeAt(i + 1) : -1;
    const c2 = i + 2 < n ? source.charCodeAt(i + 2) : -1;

    if (depth > 0 && c === 0x22 /* " */ && precededByEqualsAttr(source, i)) {
      const end = findQuoteEnd(source, i + 1);
      depths.fill(depth, i, end + 1);
      // When the scan stopped on a newline the value was unterminated.
      // Resume ON the newline (not after it) so the reset below runs.
      i = source.charCodeAt(end) === 0x0a /* \n */ ? end : end + 1;
      continue;
    }

    if (c === 0x5b /* [ */ && c1 === 0x5b && c2 === 0x5b) {
      // Triple link: opaque to block-depth tracking.
      const end = findTripleLinkEnd(source, i + 3);
      depths.fill(depth, i, end + 1);
      i = end + 1;
      continue;
    }

    if (c === 0x5b /* [ */ && c1 === 0x5b) {
      depth++;
      depths[i + 1] = depth;
      i += 2;
      continue;
    }

    if (c === 0x5d /* ] */ && c1 === 0x5d) {
      depth = Math.max(0, depth - 1);
      depths[i + 1] = depth;
      i += 2;
      continue;
    }

    if (c === 0x0a /* \n */) {
      // Block openers are single-line; reset depth at line boundaries so
      // an unterminated `[[xxx` does not keep subsequent directives
      // inside its (imaginary) opener context.
      depth = 0;
    }

    i++;
  }

  depths[n] = depth;
  return depths;
}

/**
 * True when the nearest character before offset `i`, ignoring spaces and
 * tabs, is `=`. False when only spaces/tabs (or nothing) precede `i`.
 */
function precededByEqualsAttr(s: string, i: number): boolean {
  let j = i - 1;
  while (j >= 0) {
    const ch = s.charCodeAt(j);
    if (ch === 0x20 /* space */ || ch === 0x09 /* tab */) {
      j--;
      continue;
    }
    return ch === 0x3d; /* = */
  }
  return false;
}

/**
 * Find where a quoted value that starts scanning at `from` ends.
 *
 * @returns The index of the first `"` or newline at or after `from`, or
 *   `s.length - 1` when neither occurs.
 */
function findQuoteEnd(s: string, from: number): number {
  for (let i = from; i < s.length; i++) {
    const ch = s.charCodeAt(i);
    if (ch === 0x22 /* " */ || ch === 0x0a /* \n */) return i;
  }
  return s.length - 1;
}

/**
 * Find where a triple link whose body starts at `from` ends.
 *
 * @returns The index of the last `]` of the first `]]]` found; or the
 *   index of the first newline of a blank-line break (`\n\n`) if that
 *   comes first; or `s.length - 1` when neither occurs.
 */
function findTripleLinkEnd(s: string, from: number): number {
  for (let i = from; i < s.length; i++) {
    if (
      s.charCodeAt(i) === 0x5d &&
      i + 2 < s.length &&
      s.charCodeAt(i + 1) === 0x5d &&
      s.charCodeAt(i + 2) === 0x5d
    ) {
      return i + 2;
    }
    if (s.charCodeAt(i) === 0x0a && i + 1 < s.length && s.charCodeAt(i + 1) === 0x0a) {
      return i;
    }
  }
  return s.length - 1;
}
|