@wdprlib/parser 3.1.1 → 3.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +312 -121
- package/dist/index.js +289 -98
- package/package.json +5 -3
- package/src/index.ts +163 -0
- package/src/lexer/index.ts +20 -0
- package/src/lexer/lexer.ts +687 -0
- package/src/lexer/tokens.ts +141 -0
- package/src/parser/constants.ts +173 -0
- package/src/parser/depth.ts +251 -0
- package/src/parser/index.ts +18 -0
- package/src/parser/parse.ts +315 -0
- package/src/parser/postprocess/divAdjacentParagraph.ts +76 -0
- package/src/parser/postprocess/index.ts +15 -0
- package/src/parser/postprocess/spanStrip.ts +697 -0
- package/src/parser/preprocess/expr.ts +265 -0
- package/src/parser/preprocess/index.ts +38 -0
- package/src/parser/preprocess/typography.ts +67 -0
- package/src/parser/preprocess/utils.ts +250 -0
- package/src/parser/preprocess/whitespace.ts +111 -0
- package/src/parser/rules/block/align.ts +282 -0
- package/src/parser/rules/block/bibliography.ts +359 -0
- package/src/parser/rules/block/block-list.ts +689 -0
- package/src/parser/rules/block/blockquote.ts +238 -0
- package/src/parser/rules/block/center.ts +87 -0
- package/src/parser/rules/block/clear-float.ts +75 -0
- package/src/parser/rules/block/code.ts +187 -0
- package/src/parser/rules/block/collapsible.ts +337 -0
- package/src/parser/rules/block/comment.ts +73 -0
- package/src/parser/rules/block/content-separator.ts +79 -0
- package/src/parser/rules/block/definition-list.ts +270 -0
- package/src/parser/rules/block/div.ts +400 -0
- package/src/parser/rules/block/embed-block.ts +153 -0
- package/src/parser/rules/block/footnoteblock.ts +200 -0
- package/src/parser/rules/block/heading.ts +142 -0
- package/src/parser/rules/block/horizontal-rule.ts +61 -0
- package/src/parser/rules/block/html.ts +222 -0
- package/src/parser/rules/block/iframe.ts +239 -0
- package/src/parser/rules/block/iftags.ts +150 -0
- package/src/parser/rules/block/include.ts +179 -0
- package/src/parser/rules/block/index.ts +127 -0
- package/src/parser/rules/block/list.ts +244 -0
- package/src/parser/rules/block/math.ts +183 -0
- package/src/parser/rules/block/module/backlinks/index.ts +31 -0
- package/src/parser/rules/block/module/backlinks/types.ts +21 -0
- package/src/parser/rules/block/module/categories/index.ts +34 -0
- package/src/parser/rules/block/module/categories/types.ts +21 -0
- package/src/parser/rules/block/module/css/index.ts +37 -0
- package/src/parser/rules/block/module/iftags/condition.ts +109 -0
- package/src/parser/rules/block/module/iftags/index.ts +26 -0
- package/src/parser/rules/block/module/iftags/preprocess.ts +140 -0
- package/src/parser/rules/block/module/iftags/resolve.ts +73 -0
- package/src/parser/rules/block/module/iftags/types.ts +63 -0
- package/src/parser/rules/block/module/include/index.ts +20 -0
- package/src/parser/rules/block/module/include/resolve.ts +556 -0
- package/src/parser/rules/block/module/index.ts +122 -0
- package/src/parser/rules/block/module/join/index.ts +34 -0
- package/src/parser/rules/block/module/join/types.ts +23 -0
- package/src/parser/rules/block/module/listpages/compiler.ts +453 -0
- package/src/parser/rules/block/module/listpages/extract.ts +410 -0
- package/src/parser/rules/block/module/listpages/index.ts +83 -0
- package/src/parser/rules/block/module/listpages/normalize.ts +390 -0
- package/src/parser/rules/block/module/listpages/parser.ts +106 -0
- package/src/parser/rules/block/module/listpages/resolve.ts +130 -0
- package/src/parser/rules/block/module/listpages/types.ts +513 -0
- package/src/parser/rules/block/module/listpages/url-resolver.ts +186 -0
- package/src/parser/rules/block/module/listusers/compiler.ts +77 -0
- package/src/parser/rules/block/module/listusers/extract.ts +45 -0
- package/src/parser/rules/block/module/listusers/index.ts +36 -0
- package/src/parser/rules/block/module/listusers/parser.ts +54 -0
- package/src/parser/rules/block/module/listusers/resolve.ts +58 -0
- package/src/parser/rules/block/module/listusers/types.ts +93 -0
- package/src/parser/rules/block/module/mapping.ts +61 -0
- package/src/parser/rules/block/module/page-tree/index.ts +38 -0
- package/src/parser/rules/block/module/page-tree/types.ts +29 -0
- package/src/parser/rules/block/module/rate/index.ts +28 -0
- package/src/parser/rules/block/module/rate/types.ts +19 -0
- package/src/parser/rules/block/module/resolve.ts +411 -0
- package/src/parser/rules/block/module/types-common.ts +59 -0
- package/src/parser/rules/block/module/types.ts +61 -0
- package/src/parser/rules/block/module/utils.ts +43 -0
- package/src/parser/rules/block/module/walk.ts +380 -0
- package/src/parser/rules/block/module.ts +164 -0
- package/src/parser/rules/block/orphan-li.ts +177 -0
- package/src/parser/rules/block/paragraph.ts +157 -0
- package/src/parser/rules/block/table-block.ts +726 -0
- package/src/parser/rules/block/table.ts +441 -0
- package/src/parser/rules/block/tabview.ts +331 -0
- package/src/parser/rules/block/toc.ts +129 -0
- package/src/parser/rules/block/utils.ts +615 -0
- package/src/parser/rules/index.ts +49 -0
- package/src/parser/rules/inline/anchor-name.ts +154 -0
- package/src/parser/rules/inline/anchor.ts +327 -0
- package/src/parser/rules/inline/bibcite.ts +153 -0
- package/src/parser/rules/inline/bold.ts +86 -0
- package/src/parser/rules/inline/color.ts +140 -0
- package/src/parser/rules/inline/comment.ts +90 -0
- package/src/parser/rules/inline/equation-ref.ts +115 -0
- package/src/parser/rules/inline/expr.ts +526 -0
- package/src/parser/rules/inline/footnote.ts +223 -0
- package/src/parser/rules/inline/guillemet.ts +64 -0
- package/src/parser/rules/inline/html.ts +132 -0
- package/src/parser/rules/inline/image.ts +328 -0
- package/src/parser/rules/inline/index.ts +150 -0
- package/src/parser/rules/inline/italic.ts +74 -0
- package/src/parser/rules/inline/line-break.ts +326 -0
- package/src/parser/rules/inline/link-anchor.ts +147 -0
- package/src/parser/rules/inline/link-single.ts +164 -0
- package/src/parser/rules/inline/link-star.ts +134 -0
- package/src/parser/rules/inline/link-triple.ts +267 -0
- package/src/parser/rules/inline/math-inline.ts +126 -0
- package/src/parser/rules/inline/monospace.ts +78 -0
- package/src/parser/rules/inline/raw.ts +262 -0
- package/src/parser/rules/inline/size.ts +244 -0
- package/src/parser/rules/inline/span.ts +424 -0
- package/src/parser/rules/inline/strikethrough.ts +115 -0
- package/src/parser/rules/inline/subscript.ts +84 -0
- package/src/parser/rules/inline/superscript.ts +84 -0
- package/src/parser/rules/inline/text.ts +84 -0
- package/src/parser/rules/inline/underline.ts +127 -0
- package/src/parser/rules/inline/user.ts +147 -0
- package/src/parser/rules/inline/utils.ts +344 -0
- package/src/parser/rules/types.ts +252 -0
- package/src/parser/rules/utils.ts +155 -0
- package/src/parser/toc.ts +130 -0
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
/**
|
|
2
|
+
*
|
|
3
|
+
* Lexer (tokenizer) for Wikidot markup.
|
|
4
|
+
*
|
|
5
|
+
* The lexer converts preprocessed wikitext into a flat sequence of tokens
|
|
6
|
+
* that the parser consumes. Each token has a type (e.g., `HEADING_MARKER`,
|
|
7
|
+
* `BOLD_MARKER`, `TEXT`) and a string value. The lexer keeps only light scanning state (line start, block-opener depth) and does
|
|
8
|
+
* not build any tree structure; that is the parser's responsibility.
|
|
9
|
+
*
|
|
10
|
+
* The main entry points are:
|
|
11
|
+
* - `tokenize()` - convenience function that tokenizes a string in one call
|
|
12
|
+
* - `Lexer` class - for more control over tokenization options
|
|
13
|
+
*
|
|
14
|
+
* @module
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
export type { TokenType, Token } from "./tokens";
|
|
18
|
+
export { createToken } from "./tokens";
|
|
19
|
+
export type { LexerOptions } from "./lexer";
|
|
20
|
+
export { Lexer, tokenize } from "./lexer";
|
|
@@ -0,0 +1,687 @@
|
|
|
1
|
+
import { createPoint, createPosition } from "@wdprlib/ast";
|
|
2
|
+
import { createToken, type Token, type TokenType } from "./tokens";
|
|
3
|
+
|
|
4
|
+
/**
 * Options accepted by the {@link Lexer} constructor and by {@link tokenize}.
 *
 * @group Lexer
 */
export interface LexerOptions {
  /**
   * Whether tokens should carry real line/column/offset information.
   *
   * Defaults to `true`. Pass `false` when source positions are not needed;
   * every token then receives an all-zero placeholder position instead.
   */
  trackPositions?: boolean;
}
|
|
17
|
+
|
|
18
|
+
/**
 * Mutable bookkeeping for one tokenisation pass; owned by a single
 * {@link Lexer} instance and never exposed to callers.
 */
interface LexerState {
  /** The complete input text being tokenised. */
  source: string;
  /** Offset (in UTF-16 code units) of the next unread character. */
  pos: number;
  /** Line counter; starts at 1 and is incremented after each `\n`. */
  line: number;
  /** Column counter; reset to 1 after each `\n`, incremented otherwise. */
  column: number;
  /** `true` while only spaces/tabs have been read since the last `\n`. */
  lineStart: boolean;
  /** Tokens emitted so far, in source order. */
  tokens: Token[];
}
|
|
29
|
+
|
|
30
|
+
/**
 * Converts a Wikidot markup source string into a flat array of {@link Token}s.
 *
 * The lexer is single-pass and greedy: at every position it tries the
 * candidate patterns in a fixed priority order, longest first where two
 * patterns share a prefix (e.g. `[[[` before `[[`, `--]` before `--`).
 * Constructs that are only meaningful at the start of a line (headings,
 * list markers, blockquote markers, horizontal rules, clear-floats) are
 * gated on the `lineStart` state flag.
 *
 * For convenience, use the standalone {@link tokenize} function instead
 * of constructing a `Lexer` directly.
 *
 * @group Lexer
 */
export class Lexer {
  /** Cursor, line/column counters and the token list being built. */
  private state: LexerState;

  /** Caller options with every default filled in. */
  private options: Required<LexerOptions>;

  /**
   * Source offsets of `]]` sequences that must be emitted as `]` + `]`
   * because the `[[# ...` before them contains an invalid anchor name.
   */
  private splitBlockClosePositions: Set<number> = new Set();

  /**
   * Nesting depth of block-opener context (between `[[` / `[[/` and the
   * matching `]]`). Used to scope `QUOTED_STRING` recognition so that
   * `"` after `=` only becomes a quoted attribute value while block
   * attributes are being read; otherwise an inline `=` followed by `"`
   * (e.g. inside `[[footnote]]="[[/footnote]]`) would swallow content up
   * to the next `"` or newline.
   */
  private blockOpenerDepth = 0;

  /**
   * @param source - Text to tokenise
   * @param options - Optional configuration; `trackPositions` defaults to `true`
   */
  constructor(source: string, options: LexerOptions = {}) {
    this.options = { trackPositions: options.trackPositions ?? true };
    this.state = {
      source,
      pos: 0,
      line: 1,
      column: 1,
      lineStart: true,
      tokens: [],
    };
  }

  /**
   * Tokenise the remaining source and append a final `EOF` token.
   *
   * @returns The lexer's internal token array (not a copy)
   */
  tokenize(): Token[] {
    while (!this.isAtEnd()) {
      this.scanToken();
    }
    this.addToken("EOF", "");
    return this.state.tokens;
  }

  /** `true` once the cursor has moved past the last character. */
  private isAtEnd(): boolean {
    return this.state.pos >= this.state.source.length;
  }

  /** Character under the cursor, or `""` when at the end of the source. */
  private current(): string {
    return this.state.source[this.state.pos] ?? "";
  }

  /**
   * Detect `[[# name]]` whose name contains characters outside
   * `[-_A-Za-z0-9.%]` (for example spaces or `$`).
   *
   * Requirements checked, in order: the cursor is on `[[#` followed by at
   * least one space; the text up to the closing `]]` stays on one line; at
   * least one character after the leading spaces is not a valid anchor-name
   * character.
   *
   * @returns Offset of the closing `]]` when the name is invalid, so the
   *   caller can split the brackets and let the inner `[# text]` be parsed
   *   as a described link; `null` in every other case.
   */
  private findInvalidAnchorNameEnd(): number | null {
    const { source, pos } = this.state;

    if (!source.startsWith("[[# ", pos)) {
      return null;
    }

    // Skip the run of spaces that follows `#`.
    let cursor = pos + 4;
    while (cursor < source.length && source[cursor] === " ") {
      cursor++;
    }

    let sawInvalidChar = false;
    for (; cursor < source.length; cursor++) {
      const ch = source[cursor]!;
      if (ch === "\n") {
        return null;
      }
      if (ch === "]" && source[cursor + 1] === "]") {
        // Reached the closing `]]`: only report it if the name was invalid.
        return sawInvalidChar ? cursor : null;
      }
      if (!this.isAnchorNameChar(ch)) {
        sawInvalidChar = true;
      }
    }

    // No closing `]]` on this line.
    return null;
  }

  /** `true` when `ch` is allowed in an anchor name: `[-_A-Za-z0-9.%]`. */
  private isAnchorNameChar(ch: string): boolean {
    return this.isAlphanumeric(ch) || ch === "-" || ch === "_" || ch === "." || ch === "%";
  }

  /** `true` when the source contains `pattern` exactly at the cursor. */
  private match(pattern: string): boolean {
    return this.state.source.startsWith(pattern, this.state.pos);
  }

  /**
   * Consume up to `n` characters (fewer if the source ends first) and keep
   * the line/column/lineStart bookkeeping in sync.
   *
   * @returns The characters that were consumed
   */
  private advance(n = 1): string {
    const state = this.state;
    let consumed = "";

    for (let step = 0; step < n && !this.isAtEnd(); step++) {
      const ch = this.current();
      consumed += ch;
      state.pos++;

      if (ch === "\n") {
        state.line++;
        state.column = 1;
        state.lineStart = true;
        continue;
      }

      state.column++;
      // Leading spaces/tabs do not end the "line start" region.
      if (ch !== " " && ch !== "\t") {
        state.lineStart = false;
      }
    }

    return consumed;
  }

  /**
   * If the source matches `pattern` at the cursor, consume it and emit a
   * token of `type` whose value is `pattern`.
   *
   * @returns `true` when a token was emitted
   */
  private take(pattern: string, type: TokenType): boolean {
    if (!this.match(pattern)) {
      return false;
    }
    this.advance(pattern.length);
    this.addToken(type, pattern);
    return true;
  }

  /** Consume and return the maximal run of the single character `ch`. */
  private takeRunOf(ch: string): string {
    let run = "";
    while (this.current() === ch) {
      run += this.advance();
    }
    return run;
  }

  /** Type of the most recent token that is not `WHITESPACE`, or `null`. */
  private lastNonWhitespaceTokenType(): TokenType | null {
    const { tokens } = this.state;
    for (let i = tokens.length - 1; i >= 0; i--) {
      const type = tokens[i]!.type;
      if (type !== "WHITESPACE") {
        return type;
      }
    }
    return null;
  }

  /**
   * Build the position for a token whose text `value` has just been
   * consumed (the cursor sits immediately after it).
   *
   * With `trackPositions` disabled an all-zero placeholder is returned and
   * no per-token arithmetic is done.
   *
   * When `value` contains a newline (the `NEWLINE` token), `advance()` has
   * already moved the counters to the following line, so the start point
   * cannot be derived by subtracting the length from the current column.
   * It is recomputed from the source so that it stays on the line where
   * the token actually begins.
   */
  private tokenPosition(value: string) {
    if (!this.options.trackPositions) {
      return createPosition(createPoint(0, 0, 0), createPoint(0, 0, 0));
    }

    const { source, pos, line, column } = this.state;
    const end = createPoint(line, column, pos);
    const startOffset = pos - value.length;

    if (!value.includes("\n")) {
      return createPosition(createPoint(line, column - value.length, startOffset), end);
    }

    const linesCrossed = value.split("\n").length - 1;
    // `lastIndexOf` clamps a negative start index to 0, so guard offset 0.
    const previousBreak = startOffset > 0 ? source.lastIndexOf("\n", startOffset - 1) : -1;
    const start = createPoint(line - linesCrossed, startOffset - previousBreak, startOffset);
    return createPosition(start, end);
  }

  /**
   * Append a token for text that has already been consumed.
   *
   * The token's `lineStart` flag is `true` when it is the first token or
   * directly follows a `NEWLINE` token. Also maintains
   * {@link Lexer.blockOpenerDepth}.
   *
   * @param type - Token type to emit
   * @param value - Source text of the token (must end at the cursor)
   */
  private addToken(type: TokenType, value: string): void {
    const { tokens } = this.state;
    const followsNewline = tokens.length === 0 || tokens[tokens.length - 1]?.type === "NEWLINE";

    tokens.push(createToken(type, value, this.tokenPosition(value), followsNewline));

    // Track block-opener nesting so `"` after `=` is only recognised as a
    // quoted attribute value while we are actually inside `[[ ... ]]`.
    if (type === "BLOCK_OPEN" || type === "BLOCK_END_OPEN") {
      this.blockOpenerDepth++;
    } else if (type === "BLOCK_CLOSE" && this.blockOpenerDepth > 0) {
      this.blockOpenerDepth--;
    }
  }

  /**
   * Scan one token (two in the split-`]]` case) starting at the cursor.
   *
   * The order of the checks below is significant: longer patterns must be
   * tried before their prefixes, and line-start-only constructs must be
   * tried before the inline meaning of the same character.
   */
  private scanToken(): void {
    const char = this.current();
    // Captured up front: advancing changes `state.lineStart`.
    const isLineStart = this.state.lineStart;

    // Newline
    if (char === "\n") {
      this.advance();
      this.addToken("NEWLINE", "\n");
      return;
    }

    // Run of spaces/tabs
    if (char === " " || char === "\t") {
      let blanks = "";
      while (!this.isAtEnd() && (this.current() === " " || this.current() === "\t")) {
        blanks += this.advance();
      }
      this.addToken("WHITESPACE", blanks);
      return;
    }

    // `[!--` before `[[[`, `[[[` and `[[/` before `[[`.
    if (
      this.take("[!--", "COMMENT_OPEN") ||
      this.take("[[[", "LINK_OPEN") ||
      this.take("[[/", "BLOCK_END_OPEN")
    ) {
      return;
    }

    // Block open `[[`
    if (this.match("[[")) {
      // Wikidot's anchor syntax only allows [-_A-Za-z0-9.%] after `[[# `.
      // For an invalid name, emit the first `[` as TEXT so the inner
      // `[# text]` is parsed as a described anchor link, and remember the
      // closing `]]` so it is split into `]` (BRACKET_CLOSE) + `]` (TEXT).
      const invalidEnd = this.findInvalidAnchorNameEnd();
      if (invalidEnd !== null) {
        this.splitBlockClosePositions.add(invalidEnd);
        this.advance(1);
        this.addToken("TEXT", "[");
        return;
      }
      this.take("[[", "BLOCK_OPEN");
      return;
    }

    // Link close `]]]` before `]]`
    if (this.take("]]]", "LINK_CLOSE")) {
      return;
    }

    // Block close `]]`
    if (this.match("]]")) {
      // `delete` reports whether this offset was marked for splitting.
      if (this.splitBlockClosePositions.delete(this.state.pos)) {
        this.advance(1);
        this.addToken("BRACKET_CLOSE", "]");
        this.advance(1);
        this.addToken("TEXT", "]");
        return;
      }
      this.take("]]", "BLOCK_CLOSE");
      return;
    }

    // Raw markers, monospace, bold
    if (
      this.take("@@", "RAW_OPEN") ||
      this.take("@<", "RAW_BLOCK_OPEN") ||
      this.take(">@", "RAW_BLOCK_CLOSE") ||
      this.take("{{", "MONO_MARKER") ||
      this.take("}}", "MONO_CLOSE") ||
      this.take("**", "BOLD_MARKER")
    ) {
      return;
    }

    // Horizontal rule: 4+ hyphens at line start (before `--`)
    if (isLineStart && this.match("----")) {
      this.addToken("HR_MARKER", this.takeRunOf("-"));
      return;
    }

    // `--]` before `--`; guillemet `<<`
    if (
      this.take("--]", "COMMENT_CLOSE") ||
      this.take("--", "STRIKE_MARKER") ||
      this.take("<<", "LEFT_DOUBLE_ANGLE")
    ) {
      return;
    }

    // Clear float: 4+ tildes at line start, optionally followed by `<` or `>`
    if (isLineStart && this.match("~~~~")) {
      const tildes = this.takeRunOf("~");
      if (this.current() === "<") {
        this.advance();
        this.addToken("CLEAR_FLOAT_LEFT", `${tildes}<`);
        return;
      }
      if (this.current() === ">") {
        this.advance();
        this.addToken("CLEAR_FLOAT_RIGHT", `${tildes}>`);
        return;
      }
      this.addToken("CLEAR_FLOAT", `${tildes}`);
      return;
    }

    // Lone hyphen, underline/underscore, super/subscript, italic, table cells
    if (
      this.take("-", "TEXT") ||
      this.take("__", "UNDERLINE_MARKER") ||
      this.take("_", "UNDERSCORE") ||
      this.take("^^", "SUPER_MARKER") ||
      this.take(",,", "SUB_MARKER") ||
      this.take("//", "ITALIC_MARKER") ||
      this.take("||~", "TABLE_HEADER") ||
      this.take("||<", "TABLE_LEFT") ||
      this.take("||=", "TABLE_CENTER") ||
      this.take("||>", "TABLE_RIGHT") ||
      this.take("||", "TABLE_MARKER")
    ) {
      return;
    }

    // Heading: run of `+` at line start
    if (isLineStart && char === "+") {
      this.addToken("HEADING_MARKER", this.takeRunOf("+"));
      return;
    }

    // List bullet `*` at line start (`**` was already taken as bold above)
    if (isLineStart && this.take("*", "LIST_BULLET")) {
      return;
    }

    // Colour marker `##` before the numbered-list `#`
    if (this.take("##", "COLOR_MARKER")) {
      return;
    }

    // Numbered list `#` at line start
    if (isLineStart && this.take("#", "LIST_NUMBER")) {
      return;
    }

    // `>`: blockquote marker at line start, otherwise guillemet or plain text
    if (char === ">") {
      if (isLineStart) {
        this.addToken("BLOCKQUOTE_MARKER", this.takeRunOf(">"));
        return;
      }
      if (!this.take(">>", "RIGHT_DOUBLE_ANGLE")) {
        this.take(">", "TEXT");
      }
      return;
    }

    // Bracket forms and simple single-character tokens
    if (
      this.take("[#", "BRACKET_ANCHOR") ||
      this.take("[*", "BRACKET_STAR") ||
      this.take("[", "BRACKET_OPEN") ||
      this.take("]", "BRACKET_CLOSE") ||
      this.take("|", "PIPE") ||
      this.take("=", "EQUALS")
    ) {
      return;
    }

    // `"` starts a quoted attribute value only inside a `[[...]]` opener and
    // directly after `=` (ignoring whitespace). Anywhere else it is plain
    // text; without the depth gate an inline `="` would swallow everything
    // up to the next `"` or newline, including closing tags.
    if (char === '"') {
      if (this.blockOpenerDepth > 0 && this.lastNonWhitespaceTokenType() === "EQUALS") {
        let quoted = this.advance(); // opening "
        while (!this.isAtEnd() && this.current() !== '"' && this.current() !== "\n") {
          quoted += this.advance();
        }
        if (this.current() === '"') {
          quoted += this.advance(); // closing "
        }
        this.addToken("QUOTED_STRING", quoted);
        return;
      }
      this.take('"', "TEXT");
      return;
    }

    if (
      this.take(":", "COLON") ||
      this.take("/", "SLASH") ||
      this.take("*", "STAR") ||
      this.take("#", "HASH") ||
      this.take("@", "AT") ||
      this.take("&", "AMPERSAND") ||
      this.take("\\", "BACKSLASH")
    ) {
      return;
    }

    // Backslash line-break marker (U+E000, inserted by the preprocessor)
    if (char.charCodeAt(0) === 0xe000) {
      this.advance();
      this.addToken("BACKSLASH_BREAK", char);
      return;
    }

    // Identifier: maximal run of ASCII letters and digits
    if (this.isAlphanumeric(char)) {
      let ident = "";
      while (!this.isAtEnd() && this.isAlphanumeric(this.current())) {
        ident += this.advance();
      }
      this.addToken("IDENTIFIER", ident);
      return;
    }

    // Anything else: a single character of text
    const text = this.advance();
    this.addToken("TEXT", text);
  }

  /**
   * `true` when the first code unit of `char` is an ASCII digit or letter
   * (`0-9`, `A-Z`, `a-z`). Returns `false` for the empty string.
   */
  private isAlphanumeric(char: string): boolean {
    const code = char.charCodeAt(0);
    const isDigit = code >= 48 && code <= 57;
    const isUpper = code >= 65 && code <= 90;
    const isLower = code >= 97 && code <= 122;
    return isDigit || isUpper || isLower;
  }
}
|
|
673
|
+
|
|
674
|
+
/**
 * One-shot helper: build a {@link Lexer} for `source` and run it to
 * completion.
 *
 * Equivalent to `new Lexer(source, options).tokenize()`.
 *
 * @param source - Wikidot markup to tokenise
 * @param options - Optional lexer configuration
 * @returns The flat token array, whose last element is the `EOF` token
 *
 * @group Lexer
 */
export function tokenize(source: string, options?: LexerOptions): Token[] {
  const lexer = new Lexer(source, options);
  return lexer.tokenize();
}
|