@wdprlib/parser 3.1.2 → 3.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +295 -118
- package/dist/index.js +272 -95
- package/package.json +5 -3
- package/src/index.ts +163 -0
- package/src/lexer/index.ts +20 -0
- package/src/lexer/lexer.ts +687 -0
- package/src/lexer/tokens.ts +141 -0
- package/src/parser/constants.ts +173 -0
- package/src/parser/depth.ts +251 -0
- package/src/parser/index.ts +18 -0
- package/src/parser/parse.ts +315 -0
- package/src/parser/postprocess/divAdjacentParagraph.ts +76 -0
- package/src/parser/postprocess/index.ts +15 -0
- package/src/parser/postprocess/spanStrip.ts +697 -0
- package/src/parser/preprocess/expr.ts +265 -0
- package/src/parser/preprocess/index.ts +38 -0
- package/src/parser/preprocess/typography.ts +67 -0
- package/src/parser/preprocess/utils.ts +250 -0
- package/src/parser/preprocess/whitespace.ts +111 -0
- package/src/parser/rules/block/align.ts +282 -0
- package/src/parser/rules/block/bibliography.ts +359 -0
- package/src/parser/rules/block/block-list.ts +689 -0
- package/src/parser/rules/block/blockquote.ts +238 -0
- package/src/parser/rules/block/center.ts +87 -0
- package/src/parser/rules/block/clear-float.ts +75 -0
- package/src/parser/rules/block/code.ts +187 -0
- package/src/parser/rules/block/collapsible.ts +337 -0
- package/src/parser/rules/block/comment.ts +73 -0
- package/src/parser/rules/block/content-separator.ts +79 -0
- package/src/parser/rules/block/definition-list.ts +270 -0
- package/src/parser/rules/block/div.ts +400 -0
- package/src/parser/rules/block/embed-block.ts +153 -0
- package/src/parser/rules/block/footnoteblock.ts +200 -0
- package/src/parser/rules/block/heading.ts +142 -0
- package/src/parser/rules/block/horizontal-rule.ts +61 -0
- package/src/parser/rules/block/html.ts +222 -0
- package/src/parser/rules/block/iframe.ts +239 -0
- package/src/parser/rules/block/iftags.ts +150 -0
- package/src/parser/rules/block/include.ts +179 -0
- package/src/parser/rules/block/index.ts +127 -0
- package/src/parser/rules/block/list.ts +244 -0
- package/src/parser/rules/block/math.ts +183 -0
- package/src/parser/rules/block/module/backlinks/index.ts +31 -0
- package/src/parser/rules/block/module/backlinks/types.ts +21 -0
- package/src/parser/rules/block/module/categories/index.ts +34 -0
- package/src/parser/rules/block/module/categories/types.ts +21 -0
- package/src/parser/rules/block/module/css/index.ts +37 -0
- package/src/parser/rules/block/module/iftags/condition.ts +109 -0
- package/src/parser/rules/block/module/iftags/index.ts +26 -0
- package/src/parser/rules/block/module/iftags/preprocess.ts +140 -0
- package/src/parser/rules/block/module/iftags/resolve.ts +73 -0
- package/src/parser/rules/block/module/iftags/types.ts +63 -0
- package/src/parser/rules/block/module/include/index.ts +20 -0
- package/src/parser/rules/block/module/include/resolve.ts +556 -0
- package/src/parser/rules/block/module/index.ts +122 -0
- package/src/parser/rules/block/module/join/index.ts +34 -0
- package/src/parser/rules/block/module/join/types.ts +23 -0
- package/src/parser/rules/block/module/listpages/compiler.ts +453 -0
- package/src/parser/rules/block/module/listpages/extract.ts +410 -0
- package/src/parser/rules/block/module/listpages/index.ts +83 -0
- package/src/parser/rules/block/module/listpages/normalize.ts +390 -0
- package/src/parser/rules/block/module/listpages/parser.ts +106 -0
- package/src/parser/rules/block/module/listpages/resolve.ts +130 -0
- package/src/parser/rules/block/module/listpages/types.ts +513 -0
- package/src/parser/rules/block/module/listpages/url-resolver.ts +186 -0
- package/src/parser/rules/block/module/listusers/compiler.ts +77 -0
- package/src/parser/rules/block/module/listusers/extract.ts +45 -0
- package/src/parser/rules/block/module/listusers/index.ts +36 -0
- package/src/parser/rules/block/module/listusers/parser.ts +54 -0
- package/src/parser/rules/block/module/listusers/resolve.ts +58 -0
- package/src/parser/rules/block/module/listusers/types.ts +93 -0
- package/src/parser/rules/block/module/mapping.ts +61 -0
- package/src/parser/rules/block/module/page-tree/index.ts +38 -0
- package/src/parser/rules/block/module/page-tree/types.ts +29 -0
- package/src/parser/rules/block/module/rate/index.ts +28 -0
- package/src/parser/rules/block/module/rate/types.ts +19 -0
- package/src/parser/rules/block/module/resolve.ts +411 -0
- package/src/parser/rules/block/module/types-common.ts +59 -0
- package/src/parser/rules/block/module/types.ts +61 -0
- package/src/parser/rules/block/module/utils.ts +43 -0
- package/src/parser/rules/block/module/walk.ts +380 -0
- package/src/parser/rules/block/module.ts +164 -0
- package/src/parser/rules/block/orphan-li.ts +177 -0
- package/src/parser/rules/block/paragraph.ts +157 -0
- package/src/parser/rules/block/table-block.ts +726 -0
- package/src/parser/rules/block/table.ts +441 -0
- package/src/parser/rules/block/tabview.ts +331 -0
- package/src/parser/rules/block/toc.ts +129 -0
- package/src/parser/rules/block/utils.ts +615 -0
- package/src/parser/rules/index.ts +49 -0
- package/src/parser/rules/inline/anchor-name.ts +154 -0
- package/src/parser/rules/inline/anchor.ts +327 -0
- package/src/parser/rules/inline/bibcite.ts +153 -0
- package/src/parser/rules/inline/bold.ts +86 -0
- package/src/parser/rules/inline/color.ts +140 -0
- package/src/parser/rules/inline/comment.ts +90 -0
- package/src/parser/rules/inline/equation-ref.ts +115 -0
- package/src/parser/rules/inline/expr.ts +526 -0
- package/src/parser/rules/inline/footnote.ts +223 -0
- package/src/parser/rules/inline/guillemet.ts +64 -0
- package/src/parser/rules/inline/html.ts +132 -0
- package/src/parser/rules/inline/image.ts +328 -0
- package/src/parser/rules/inline/index.ts +150 -0
- package/src/parser/rules/inline/italic.ts +74 -0
- package/src/parser/rules/inline/line-break.ts +326 -0
- package/src/parser/rules/inline/link-anchor.ts +147 -0
- package/src/parser/rules/inline/link-single.ts +164 -0
- package/src/parser/rules/inline/link-star.ts +134 -0
- package/src/parser/rules/inline/link-triple.ts +267 -0
- package/src/parser/rules/inline/math-inline.ts +126 -0
- package/src/parser/rules/inline/monospace.ts +78 -0
- package/src/parser/rules/inline/raw.ts +262 -0
- package/src/parser/rules/inline/size.ts +244 -0
- package/src/parser/rules/inline/span.ts +424 -0
- package/src/parser/rules/inline/strikethrough.ts +115 -0
- package/src/parser/rules/inline/subscript.ts +84 -0
- package/src/parser/rules/inline/superscript.ts +84 -0
- package/src/parser/rules/inline/text.ts +84 -0
- package/src/parser/rules/inline/underline.ts +127 -0
- package/src/parser/rules/inline/user.ts +147 -0
- package/src/parser/rules/inline/utils.ts +344 -0
- package/src/parser/rules/types.ts +252 -0
- package/src/parser/rules/utils.ts +155 -0
- package/src/parser/toc.ts +130 -0
|
@@ -0,0 +1,615 @@
|
|
|
1
|
+
/**
|
|
2
|
+
*
|
|
3
|
+
* Shared utilities used by block-level parser rules.
|
|
4
|
+
*
|
|
5
|
+
* This module provides the core building blocks that most block rules
|
|
6
|
+
* depend on:
|
|
7
|
+
*
|
|
8
|
+
* - {@link canApplyBlockRule} -- fast pre-check for whether a rule's start
|
|
9
|
+
* tokens match the current token.
|
|
10
|
+
* - {@link parseBlocksUntil} -- the main block-level content parser that
|
|
11
|
+
* iterates rules until a close condition is met (used by div, collapsible,
|
|
12
|
+
* tabview, iftags, align, etc.).
|
|
13
|
+
* - {@link parseInlineContentUntil} -- similar to `parseBlocksUntil` but
|
|
14
|
+
* without paragraph wrapping, used for `div_` paragraph-strip mode.
|
|
15
|
+
* - {@link parseAttributes} / {@link parseAttributesRaw} -- attribute
|
|
16
|
+
* parsers for block opening tags (with and without safety filtering).
|
|
17
|
+
* - {@link createBlockEndCondition} -- factory for close-condition predicates.
|
|
18
|
+
*
|
|
19
|
+
* Re-exports {@link filterUnsafeAttributes} and {@link parseBlockName} from
|
|
20
|
+
* the shared `../utils` module for backward compatibility.
|
|
21
|
+
*
|
|
22
|
+
* @module
|
|
23
|
+
*/
|
|
24
|
+
import type { Token } from "../../../lexer";
|
|
25
|
+
import type { Element } from "@wdprlib/ast";
|
|
26
|
+
import type { ParseContext, BlockRule } from "../types";
|
|
27
|
+
import { KNOWN_BLOCK_NAMES } from "../../constants";
|
|
28
|
+
import { canApplyInlineRule } from "../inline/utils";
|
|
29
|
+
import { filterUnsafeAttributes, parseBlockName } from "../utils";
|
|
30
|
+
|
|
31
|
+
/**
 * Decides whether a `[[` / `[[/` token should be treated as inline content
 * rather than as the boundary of a paragraph / inline run.
 *
 * Used by paragraph-strip mode (`div_`) so that it classifies block names the
 * same way regular paragraph parsing (`parseInlineUntil`) does.
 *
 * Outcomes:
 * - token at `pos` is not BLOCK_OPEN / BLOCK_END_OPEN -> `false`
 * - no block name can be parsed after it -> `true`, unless the next token is
 *   EQUALS (the `[[=]]` / `[[==]]` align markers), which is a real boundary
 * - the name is listed in `ctx.scope.excludedBlockNames` -> `true`
 * - otherwise -> `true` only when the name is not in `KNOWN_BLOCK_NAMES`
 *
 * @param ctx - Parse context supplying the token stream and current scope.
 * @param pos - Index of the token to classify.
 * @returns `true` if the token must NOT end the surrounding inline run.
 */
function isNonBoundaryBlockToken(ctx: ParseContext, pos: number): boolean {
  const openerType = ctx.tokens[pos]?.type;
  const isBlockOpener = openerType === "BLOCK_OPEN" || openerType === "BLOCK_END_OPEN";
  if (!isBlockOpener) {
    return false;
  }

  const parsedName = parseBlockName(ctx, pos + 1);
  if (parsedName === null) {
    // Align markers tokenize as EQUALS instead of a name and are genuine
    // boundaries; any other nameless `[[` is handled as inline text.
    return ctx.tokens[pos + 1]?.type !== "EQUALS";
  }

  const { name } = parsedName;
  const excludedInScope = ctx.scope.excludedBlockNames?.has(name) ?? false;
  return excludedInScope || !KNOWN_BLOCK_NAMES.has(name);
}
|
|
57
|
+
|
|
58
|
+
// Re-export for backwards compatibility
|
|
59
|
+
export { filterUnsafeAttributes, parseBlockName } from "../utils";
|
|
60
|
+
|
|
61
|
+
/**
 * Outcome of one block-content parsing pass, as returned by
 * {@link parseBlocksUntil} and {@link parseInlineContentUntil}.
 */
export interface BlockParseResult {
  /** AST elements produced by the pass, in the order they were parsed. */
  elements: Element[];
  /** Number of tokens advanced over, counted from the starting position. */
  consumed: number;
}
|
|
70
|
+
|
|
71
|
+
/**
 * Cheap pre-check that tells whether a block rule is worth attempting on the
 * current token.
 *
 * The rule qualifies when both of the following hold:
 * 1. It does not require a line start, or the token is flagged `lineStart`.
 * 2. Its `startTokens` list is empty (universal fallback) or contains the
 *    token's type.
 *
 * @param rule - Block rule under consideration.
 * @param token - Token at the current parse position.
 * @returns `true` when `rule.parse` may be tried on this token.
 */
export function canApplyBlockRule(rule: BlockRule, token: Token): boolean {
  const lineStartSatisfied = !rule.requiresLineStart || token.lineStart;
  if (!lineStartSatisfied) {
    return false;
  }

  const { startTokens } = rule;
  // An empty list marks a fallback rule that accepts every token type.
  return startTokens.length === 0 || startTokens.includes(token.type);
}
|
|
92
|
+
|
|
93
|
+
/**
 * Parses a run of block-level elements, stopping as soon as the supplied
 * close condition reports `true`.
 *
 * Container blocks (div, collapsible, tabview, iftags, align, ...) use this
 * to parse their bodies. At each position the function:
 *
 * 1. stops at the end of the stream, at EOF, or when `closeCondition`
 *    matches (the closing tag is left for the caller to consume);
 * 2. silently skips WHITESPACE and NEWLINE tokens;
 * 3. tries the block rules in order and takes the first success;
 * 4. otherwise runs the fallback (paragraph) rule, and if that fails or
 *    yields no elements, skips one token so the loop always advances.
 *
 * Every rule is invoked with a context whose scope has
 * `blockCloseCondition` set to `closeCondition`, so that nested parsers can
 * respect the enclosing block's boundary. The scope's `excludedBlockNames`
 * is overwritten on every call with this call's set, or with `undefined`
 * when no set is given.
 *
 * @param ctx - Parse context positioned at the start of the body.
 * @param closeCondition - Predicate that signals the end of the block body.
 * @param options - Optional settings.
 * @param options.excludedBlockNames - Names of block rules to leave out of
 *   rule dispatch. The set is also exposed as `ctx.scope.excludedBlockNames`
 *   so that `BLOCK_OPEN` / `BLOCK_END_OPEN` tokens for these names do not
 *   trigger paragraph breaks.
 * @returns Parsed elements and total tokens consumed.
 */
export function parseBlocksUntil(
  ctx: ParseContext,
  closeCondition: (ctx: ParseContext) => boolean,
  options?: { excludedBlockNames?: ReadonlySet<string> },
): BlockParseResult {
  const excludedNames = options?.excludedBlockNames;
  const activeRules = excludedNames
    ? ctx.blockRules.filter((rule) => !excludedNames.has(rule.name))
    : ctx.blockRules;
  const fallbackRule = ctx.blockFallbackRule;

  const elements: Element[] = [];
  const startPos = ctx.pos;
  let cursor = startPos;

  while (cursor < ctx.tokens.length) {
    const token = ctx.tokens[cursor];
    if (!token || token.type === "EOF") {
      break;
    }

    // The close tag itself is not consumed here; the caller handles it.
    if (closeCondition({ ...ctx, pos: cursor })) {
      break;
    }

    // Whitespace and newlines between blocks carry no content.
    if (token.type === "WHITESPACE" || token.type === "NEWLINE") {
      cursor++;
      continue;
    }

    // Context handed to rules: filtered rule list plus the close condition
    // and excluded names for nested parsers.
    const blockCtx: ParseContext = {
      ...ctx,
      pos: cursor,
      blockRules: activeRules,
      scope: {
        ...ctx.scope,
        blockCloseCondition: closeCondition,
        excludedBlockNames: excludedNames,
      },
    };

    let handled = false;
    for (const rule of activeRules) {
      if (!canApplyBlockRule(rule, token)) {
        continue;
      }
      const attempt = rule.parse(blockCtx);
      if (!attempt.success) {
        continue;
      }
      elements.push(...attempt.elements);
      cursor += attempt.consumed;
      handled = true;
      break;
    }
    if (handled) {
      continue;
    }

    // Nothing matched: fall back to the paragraph rule.
    const fallback = fallbackRule.parse(blockCtx);
    if (fallback.success && fallback.elements.length > 0) {
      elements.push(...fallback.elements);
      cursor += fallback.consumed;
    } else {
      // Drop the token so the loop cannot spin on the same position.
      cursor++;
    }
  }

  return { elements, consumed: cursor - startPos };
}
|
|
207
|
+
|
|
208
|
+
/**
 * Parses mixed inline and block content up to a close condition WITHOUT
 * wrapping anything in paragraphs.
 *
 * This implements `div_` (paragraph-strip mode):
 *
 * - A run of one or more NEWLINE tokens collapses into a single
 *   `line-break` element.
 * - No `line-break` is emitted when the run is followed by the end of the
 *   stream / EOF, or by a BLOCK_OPEN / BLOCK_END_OPEN that is a real block
 *   boundary. Excluded or unknown block names are treated as inline and
 *   still get the `line-break`.
 * - WHITESPACE flagged as `lineStart` is dropped; other whitespace is parsed
 *   like any other token.
 * - Block rules are tried before inline rules and their elements are placed
 *   directly into the same element stream.
 * - A token no rule accepts becomes a `text` element holding its raw value.
 * - Trailing `line-break` elements are removed from the result.
 *
 * @param ctx - Parse context positioned at the start of the body.
 * @param closeCondition - Predicate that signals the end of the content.
 * @returns Parsed elements and total tokens consumed.
 */
export function parseInlineContentUntil(
  ctx: ParseContext,
  closeCondition: (ctx: ParseContext) => boolean,
): BlockParseResult {
  const { tokens, blockRules, inlineRules } = ctx;

  const elements: Element[] = [];
  const startPos = ctx.pos;
  let cursor = startPos;

  while (cursor < tokens.length) {
    const token = tokens[cursor];
    if (!token || token.type === "EOF") {
      break;
    }

    if (closeCondition({ ...ctx, pos: cursor })) {
      break;
    }

    // Indentation at the start of a line is dropped; spaces between words
    // are not flagged `lineStart` and are kept.
    if (token.type === "WHITESPACE" && token.lineStart) {
      cursor++;
      continue;
    }

    if (token.type === "NEWLINE") {
      // Swallow the whole run of newlines: blank lines collapse into one break.
      do {
        cursor++;
      } while (tokens[cursor]?.type === "NEWLINE");

      const following = tokens[cursor];
      if (!following || following.type === "EOF") {
        continue;
      }

      const opensBlock = following.type === "BLOCK_OPEN" || following.type === "BLOCK_END_OPEN";
      if (opensBlock) {
        const peekCtx: ParseContext = { ...ctx, pos: cursor };
        if (!isNonBoundaryBlockToken(peekCtx, cursor)) {
          // The newline merely separates text from a real block element.
          continue;
        }
        // Inline-treated `[[name]]`: still emit the line-break below.
      }

      elements.push({ element: "line-break" });
      continue;
    }

    let handled = false;

    // Nested blocks (div, collapsible, ...) are mixed into the inline stream.
    const blockCtx: ParseContext = { ...ctx, pos: cursor };
    for (const rule of blockRules) {
      if (!canApplyBlockRule(rule, token)) {
        continue;
      }
      const attempt = rule.parse(blockCtx);
      if (!attempt.success) {
        continue;
      }
      elements.push(...attempt.elements);
      cursor += attempt.consumed;
      handled = true;
      break;
    }
    if (handled) {
      continue;
    }

    const inlineCtx: ParseContext = { ...ctx, pos: cursor };
    for (const rule of inlineRules) {
      if (!canApplyInlineRule(rule, token)) {
        continue;
      }
      const attempt = rule.parse(inlineCtx);
      if (!attempt.success) {
        continue;
      }
      elements.push(...attempt.elements);
      cursor += attempt.consumed;
      handled = true;
      break;
    }
    if (handled) {
      continue;
    }

    // No rule accepted the token: keep its raw text.
    elements.push({ element: "text", data: token.value });
    cursor++;
  }

  // Strip trailing line-breaks by truncating the array in place.
  let keep = elements.length;
  while (keep > 0 && elements[keep - 1]?.element === "line-break") {
    keep--;
  }
  elements.length = keep;

  return { elements, consumed: cursor - startPos };
}
|
|
341
|
+
|
|
342
|
+
/**
 * Reads HTML-style attributes from a block opening tag and returns them
 * after passing them through {@link filterUnsafeAttributes}.
 *
 * Recognised forms:
 * - `name="value"` -- QUOTED_STRING value; surrounding double quotes removed
 * - `name=value` -- a single TEXT / IDENTIFIER token as the value
 * - `name` -- boolean attribute, stored as the string `"true"`
 * - hyphenated names such as `data-paragraph` or `aria-label`, assembled
 *   from name / TEXT `-` / name token sequences
 *
 * Names are lowercased (Wikidot is case-insensitive). A `name=` that is not
 * followed by a usable value token stores nothing for that name. WHITESPACE
 * and any unrecognised token are skipped. Scanning ends at BLOCK_CLOSE,
 * NEWLINE, EOF, or the end of the token stream.
 *
 * @param ctx - Parse context.
 * @param startPos - Token index to begin scanning.
 * @returns Parsed (filtered) attributes and total tokens consumed.
 */
export function parseAttributes(
  ctx: ParseContext,
  startPos: number,
): { attrs: Record<string, string>; consumed: number } {
  const { tokens } = ctx;
  const collected: Record<string, string> = {};
  let cursor = startPos;

  const isSingleHyphen = (index: number): boolean =>
    tokens[index]?.type === "TEXT" && tokens[index]?.value === "-";
  const isWordToken = (index: number): boolean => {
    const kind = tokens[index]?.type;
    return kind === "IDENTIFIER" || kind === "TEXT";
  };

  while (cursor < tokens.length) {
    const token = tokens[cursor];
    if (
      !token ||
      token.type === "BLOCK_CLOSE" ||
      token.type === "NEWLINE" ||
      token.type === "EOF"
    ) {
      break;
    }

    // Whitespace and anything that cannot start a name is skipped.
    if (token.type !== "TEXT" && token.type !== "IDENTIFIER") {
      cursor++;
      continue;
    }

    // e.g. "data-paragraph" arrives as IDENTIFIER "data", TEXT "-",
    // IDENTIFIER "paragraph"; glue the pieces back together.
    let name = token.value;
    cursor++;
    while (isSingleHyphen(cursor) && isWordToken(cursor + 1)) {
      name += `-${tokens[cursor + 1]?.value ?? ""}`;
      cursor += 2;
    }
    const key = name.toLowerCase();

    if (tokens[cursor]?.type !== "EQUALS") {
      // Boolean attribute
      collected[key] = "true";
      continue;
    }
    cursor++; // the "=" token

    const valueToken = tokens[cursor];
    if (valueToken?.type === "QUOTED_STRING") {
      const raw = valueToken.value;
      const isWrapped = raw.startsWith('"') && raw.endsWith('"');
      collected[key] = isWrapped ? raw.slice(1, -1) : raw;
      cursor++;
    } else if (valueToken?.type === "TEXT" || valueToken?.type === "IDENTIFIER") {
      collected[key] = valueToken.value;
      cursor++;
    }
    // Any other token after "=": nothing is stored, and the token is
    // re-examined by the next loop iteration.
  }

  return { attrs: filterUnsafeAttributes(collected), consumed: cursor - startPos };
}
|
|
447
|
+
|
|
448
|
+
/**
 * Reads attributes from a block opening tag and returns them WITHOUT safety
 * filtering.
 *
 * Intended for block-specific parameters (like `type` on `[[code]]`) that
 * are not emitted as HTML attributes and therefore do not need XSS
 * protection.
 *
 * Value handling matches {@link parseAttributes}: `name="value"`,
 * `name=value`, and bare `name` (stored as `"true"`); names are lowercased.
 * Hyphen handling is broader: a hyphen is either a TEXT `-` token or a
 * STRIKE_MARKER (`--`), runs of consecutive hyphens are accepted (as in
 * `data--something`), and hyphens with no name part after them are still
 * consumed.
 *
 * @param ctx - Parse context.
 * @param startPos - Token index to begin scanning.
 * @param hyphenatedNames - When `true` (default), hyphens and the segments
 *                          after them become part of the attribute name.
 *                          When `false`, those tokens are still consumed but
 *                          only the first segment is used as the name, so
 *                          `data-src` yields `data` instead of two separate
 *                          attributes.
 * @returns Parsed (unfiltered) attributes and total tokens consumed.
 */
export function parseAttributesRaw(
  ctx: ParseContext,
  startPos: number,
  hyphenatedNames = true,
): { attrs: Record<string, string>; consumed: number } {
  const { tokens } = ctx;
  const attrs: Record<string, string> = {};
  let cursor = startPos;

  const isHyphenAt = (index: number): boolean => {
    const candidate = tokens[index];
    return (
      candidate?.type === "STRIKE_MARKER" ||
      (candidate?.type === "TEXT" && candidate.value === "-")
    );
  };
  const isNamePartAt = (index: number): boolean => {
    const kind = tokens[index]?.type;
    return kind === "TEXT" || kind === "IDENTIFIER";
  };

  while (cursor < tokens.length) {
    const token = tokens[cursor];
    if (
      !token ||
      token.type === "BLOCK_CLOSE" ||
      token.type === "NEWLINE" ||
      token.type === "EOF"
    ) {
      break;
    }

    // Whitespace and tokens that cannot start a name are skipped.
    if (token.type !== "TEXT" && token.type !== "IDENTIFIER") {
      cursor++;
      continue;
    }

    const head = token.value;
    let suffix = "";
    cursor++;

    // Walk over "<hyphens><name part>" groups; the text is always gathered
    // but only used when hyphenatedNames is enabled.
    while (isHyphenAt(cursor)) {
      do {
        suffix += tokens[cursor]?.value ?? "-";
        cursor++;
      } while (isHyphenAt(cursor));

      if (!isNamePartAt(cursor)) {
        // Hyphens with nothing after them end the name.
        break;
      }
      suffix += tokens[cursor]?.value ?? "";
      cursor++;
    }

    const key = (hyphenatedNames ? head + suffix : head).toLowerCase();

    if (tokens[cursor]?.type !== "EQUALS") {
      attrs[key] = "true";
      continue;
    }
    cursor++; // the "=" token

    const valueToken = tokens[cursor];
    if (valueToken?.type === "QUOTED_STRING") {
      const raw = valueToken.value;
      const isWrapped = raw.startsWith('"') && raw.endsWith('"');
      attrs[key] = isWrapped ? raw.slice(1, -1) : raw;
      cursor++;
    } else if (valueToken?.type === "TEXT" || valueToken?.type === "IDENTIFIER") {
      attrs[key] = valueToken.value;
      cursor++;
    }
    // Any other token after "=": nothing is stored, and the token is
    // re-examined by the next loop iteration.
  }

  return { attrs, consumed: cursor - startPos };
}
|
|
567
|
+
|
|
568
|
+
/**
 * Creates a reusable matcher for block end tags (`[[/name]]`) covering one
 * or more block names.
 *
 * The returned function inspects the tokens at `ctx.pos` and reports both
 * whether an end tag for one of `blockNames` starts there and how many
 * tokens that tag occupies: the `[[/` opener, the name, the closing `]]`
 * when present, and a NEWLINE that directly follows the `]]`.
 *
 * A NEWLINE is only counted when it immediately follows an actual `]]`.
 * If the `]]` is missing, `consumed` covers just the opener and the name, so
 * the count never spans an unrelated token.
 *
 * NOTE: the returned function yields an object, which is always truthy. It
 * must be adapted, e.g. `(c) => condition(c).matched`, before being passed
 * where a boolean predicate is expected, such as the `closeCondition`
 * parameter of {@link parseBlocksUntil}.
 *
 * @param blockNames - Array of block names to match (e.g. `["div"]`).
 * @returns A function returning `{ matched, consumed }` for the position in
 *          the context it is given; `consumed` is `0` when nothing matched.
 */
export function createBlockEndCondition(
  blockNames: string[],
): (ctx: ParseContext) => { matched: boolean; consumed: number } {
  return (ctx: ParseContext) => {
    if (ctx.tokens[ctx.pos]?.type !== "BLOCK_END_OPEN") {
      return { matched: false, consumed: 0 };
    }

    const nameResult = parseBlockName(ctx, ctx.pos + 1);
    if (!nameResult || !blockNames.includes(nameResult.name)) {
      return { matched: false, consumed: 0 };
    }

    // `end` is the index just past the tokens that belong to the end tag.
    let end = ctx.pos + 1 + nameResult.consumed; // [[/ + name

    if (ctx.tokens[end]?.type === "BLOCK_CLOSE") {
      end++; // ]]

      // The optional trailing newline belongs to the tag only when it sits
      // directly behind the closing `]]`.
      if (ctx.tokens[end]?.type === "NEWLINE") {
        end++;
      }
    }

    return { matched: true, consumed: end - ctx.pos };
  };
}
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
/**
|
|
2
|
+
*
|
|
3
|
+
* Aggregated exports for all parser rules (block and inline).
|
|
4
|
+
*
|
|
5
|
+
* The parser uses a rule-based architecture where each syntactic construct
|
|
6
|
+
* (heading, list, bold, link, etc.) is defined as a separate rule. Rules are
|
|
7
|
+
* organized into two categories:
|
|
8
|
+
*
|
|
9
|
+
* - **Block rules**: Match constructs that occupy one or more full lines
|
|
10
|
+
* (headings, lists, blockquotes, horizontal rules, paragraphs, etc.)
|
|
11
|
+
* - **Inline rules**: Match constructs within a line of text
|
|
12
|
+
* (bold, italic, links, raw/code spans, etc.)
|
|
13
|
+
*
|
|
14
|
+
* Each category also has a fallback rule that is used when no other rule matches,
|
|
15
|
+
* ensuring that all input is consumed.
|
|
16
|
+
*
|
|
17
|
+
* @module
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
// Types
|
|
21
|
+
export type { ParseContext, ScopeContext, RuleResult, BlockRule, InlineRule } from "./types";
|
|
22
|
+
export {
|
|
23
|
+
currentToken,
|
|
24
|
+
peekToken,
|
|
25
|
+
checkToken,
|
|
26
|
+
isAtEnd,
|
|
27
|
+
hasClosingMarkerBeforeNewline,
|
|
28
|
+
} from "./types";
|
|
29
|
+
|
|
30
|
+
// Block rules
|
|
31
|
+
export { blockRules, blockFallbackRule } from "./block";
|
|
32
|
+
export { headingRule, horizontalRuleRule, listRule, blockquoteRule, paragraphRule } from "./block"; // TODO: verify that this export list is complete
|
|
33
|
+
|
|
34
|
+
// Inline rules
|
|
35
|
+
export { inlineRules, inlineFallbackRule } from "./inline";
|
|
36
|
+
export {
|
|
37
|
+
boldRule,
|
|
38
|
+
italicRule,
|
|
39
|
+
underlineRule,
|
|
40
|
+
strikethroughRule,
|
|
41
|
+
superscriptRule,
|
|
42
|
+
subscriptRule,
|
|
43
|
+
monospaceRule,
|
|
44
|
+
linkTripleRule,
|
|
45
|
+
linkSingleRule,
|
|
46
|
+
linkAnchorRule,
|
|
47
|
+
rawRule,
|
|
48
|
+
textRule,
|
|
49
|
+
} from "./inline"; // TODO: verify that this export list is complete
|