@wdprlib/parser 3.1.2 → 3.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +295 -118
- package/dist/index.js +272 -95
- package/package.json +5 -3
- package/src/index.ts +163 -0
- package/src/lexer/index.ts +20 -0
- package/src/lexer/lexer.ts +687 -0
- package/src/lexer/tokens.ts +141 -0
- package/src/parser/constants.ts +173 -0
- package/src/parser/depth.ts +251 -0
- package/src/parser/index.ts +18 -0
- package/src/parser/parse.ts +315 -0
- package/src/parser/postprocess/divAdjacentParagraph.ts +76 -0
- package/src/parser/postprocess/index.ts +15 -0
- package/src/parser/postprocess/spanStrip.ts +697 -0
- package/src/parser/preprocess/expr.ts +265 -0
- package/src/parser/preprocess/index.ts +38 -0
- package/src/parser/preprocess/typography.ts +67 -0
- package/src/parser/preprocess/utils.ts +250 -0
- package/src/parser/preprocess/whitespace.ts +111 -0
- package/src/parser/rules/block/align.ts +282 -0
- package/src/parser/rules/block/bibliography.ts +359 -0
- package/src/parser/rules/block/block-list.ts +689 -0
- package/src/parser/rules/block/blockquote.ts +238 -0
- package/src/parser/rules/block/center.ts +87 -0
- package/src/parser/rules/block/clear-float.ts +75 -0
- package/src/parser/rules/block/code.ts +187 -0
- package/src/parser/rules/block/collapsible.ts +337 -0
- package/src/parser/rules/block/comment.ts +73 -0
- package/src/parser/rules/block/content-separator.ts +79 -0
- package/src/parser/rules/block/definition-list.ts +270 -0
- package/src/parser/rules/block/div.ts +400 -0
- package/src/parser/rules/block/embed-block.ts +153 -0
- package/src/parser/rules/block/footnoteblock.ts +200 -0
- package/src/parser/rules/block/heading.ts +142 -0
- package/src/parser/rules/block/horizontal-rule.ts +61 -0
- package/src/parser/rules/block/html.ts +222 -0
- package/src/parser/rules/block/iframe.ts +239 -0
- package/src/parser/rules/block/iftags.ts +150 -0
- package/src/parser/rules/block/include.ts +179 -0
- package/src/parser/rules/block/index.ts +127 -0
- package/src/parser/rules/block/list.ts +244 -0
- package/src/parser/rules/block/math.ts +183 -0
- package/src/parser/rules/block/module/backlinks/index.ts +31 -0
- package/src/parser/rules/block/module/backlinks/types.ts +21 -0
- package/src/parser/rules/block/module/categories/index.ts +34 -0
- package/src/parser/rules/block/module/categories/types.ts +21 -0
- package/src/parser/rules/block/module/css/index.ts +37 -0
- package/src/parser/rules/block/module/iftags/condition.ts +109 -0
- package/src/parser/rules/block/module/iftags/index.ts +26 -0
- package/src/parser/rules/block/module/iftags/preprocess.ts +140 -0
- package/src/parser/rules/block/module/iftags/resolve.ts +73 -0
- package/src/parser/rules/block/module/iftags/types.ts +63 -0
- package/src/parser/rules/block/module/include/index.ts +20 -0
- package/src/parser/rules/block/module/include/resolve.ts +556 -0
- package/src/parser/rules/block/module/index.ts +122 -0
- package/src/parser/rules/block/module/join/index.ts +34 -0
- package/src/parser/rules/block/module/join/types.ts +23 -0
- package/src/parser/rules/block/module/listpages/compiler.ts +453 -0
- package/src/parser/rules/block/module/listpages/extract.ts +410 -0
- package/src/parser/rules/block/module/listpages/index.ts +83 -0
- package/src/parser/rules/block/module/listpages/normalize.ts +390 -0
- package/src/parser/rules/block/module/listpages/parser.ts +106 -0
- package/src/parser/rules/block/module/listpages/resolve.ts +130 -0
- package/src/parser/rules/block/module/listpages/types.ts +513 -0
- package/src/parser/rules/block/module/listpages/url-resolver.ts +186 -0
- package/src/parser/rules/block/module/listusers/compiler.ts +77 -0
- package/src/parser/rules/block/module/listusers/extract.ts +45 -0
- package/src/parser/rules/block/module/listusers/index.ts +36 -0
- package/src/parser/rules/block/module/listusers/parser.ts +54 -0
- package/src/parser/rules/block/module/listusers/resolve.ts +58 -0
- package/src/parser/rules/block/module/listusers/types.ts +93 -0
- package/src/parser/rules/block/module/mapping.ts +61 -0
- package/src/parser/rules/block/module/page-tree/index.ts +38 -0
- package/src/parser/rules/block/module/page-tree/types.ts +29 -0
- package/src/parser/rules/block/module/rate/index.ts +28 -0
- package/src/parser/rules/block/module/rate/types.ts +19 -0
- package/src/parser/rules/block/module/resolve.ts +411 -0
- package/src/parser/rules/block/module/types-common.ts +59 -0
- package/src/parser/rules/block/module/types.ts +61 -0
- package/src/parser/rules/block/module/utils.ts +43 -0
- package/src/parser/rules/block/module/walk.ts +380 -0
- package/src/parser/rules/block/module.ts +164 -0
- package/src/parser/rules/block/orphan-li.ts +177 -0
- package/src/parser/rules/block/paragraph.ts +157 -0
- package/src/parser/rules/block/table-block.ts +726 -0
- package/src/parser/rules/block/table.ts +441 -0
- package/src/parser/rules/block/tabview.ts +331 -0
- package/src/parser/rules/block/toc.ts +129 -0
- package/src/parser/rules/block/utils.ts +615 -0
- package/src/parser/rules/index.ts +49 -0
- package/src/parser/rules/inline/anchor-name.ts +154 -0
- package/src/parser/rules/inline/anchor.ts +327 -0
- package/src/parser/rules/inline/bibcite.ts +153 -0
- package/src/parser/rules/inline/bold.ts +86 -0
- package/src/parser/rules/inline/color.ts +140 -0
- package/src/parser/rules/inline/comment.ts +90 -0
- package/src/parser/rules/inline/equation-ref.ts +115 -0
- package/src/parser/rules/inline/expr.ts +526 -0
- package/src/parser/rules/inline/footnote.ts +223 -0
- package/src/parser/rules/inline/guillemet.ts +64 -0
- package/src/parser/rules/inline/html.ts +132 -0
- package/src/parser/rules/inline/image.ts +328 -0
- package/src/parser/rules/inline/index.ts +150 -0
- package/src/parser/rules/inline/italic.ts +74 -0
- package/src/parser/rules/inline/line-break.ts +326 -0
- package/src/parser/rules/inline/link-anchor.ts +147 -0
- package/src/parser/rules/inline/link-single.ts +164 -0
- package/src/parser/rules/inline/link-star.ts +134 -0
- package/src/parser/rules/inline/link-triple.ts +267 -0
- package/src/parser/rules/inline/math-inline.ts +126 -0
- package/src/parser/rules/inline/monospace.ts +78 -0
- package/src/parser/rules/inline/raw.ts +262 -0
- package/src/parser/rules/inline/size.ts +244 -0
- package/src/parser/rules/inline/span.ts +424 -0
- package/src/parser/rules/inline/strikethrough.ts +115 -0
- package/src/parser/rules/inline/subscript.ts +84 -0
- package/src/parser/rules/inline/superscript.ts +84 -0
- package/src/parser/rules/inline/text.ts +84 -0
- package/src/parser/rules/inline/underline.ts +127 -0
- package/src/parser/rules/inline/user.ts +147 -0
- package/src/parser/rules/inline/utils.ts +344 -0
- package/src/parser/rules/types.ts +252 -0
- package/src/parser/rules/utils.ts +155 -0
- package/src/parser/toc.ts +130 -0
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
/**
|
|
2
|
+
*
|
|
3
|
+
* Parses the Wikidot named anchor syntax: `[[# name]]`.
|
|
4
|
+
*
|
|
5
|
+
* A named anchor creates an invisible anchor target (`<a id="name">`)
|
|
6
|
+
* that can be referenced by page-internal links such as `[#name Label]`
|
|
7
|
+
* or triple-bracket anchor links like `[[[#name]]]`.
|
|
8
|
+
*
|
|
9
|
+
* The anchor name must consist exclusively of the characters
|
|
10
|
+
* `[-_A-Za-z0-9.%]` (matching the original Wikidot regex
|
|
11
|
+
* `/(\[\[# )([-_A-Za-z0-9.%]+?)(\]\])/i`).
|
|
12
|
+
*
|
|
13
|
+
* A whitespace gap is required between the `#` and the name
|
|
14
|
+
* (`[[# myAnchor]]` is valid; `[[#myAnchor]]` is not).
|
|
15
|
+
*
|
|
16
|
+
* Produces an `"anchor-name"` AST element whose `data` field contains
|
|
17
|
+
* the raw anchor name string.
|
|
18
|
+
*
|
|
19
|
+
* @module
|
|
20
|
+
*/
|
|
21
|
+
import type { Element } from "@wdprlib/ast";
|
|
22
|
+
import type { InlineRule, ParseContext, RuleResult } from "../types";
|
|
23
|
+
import { currentToken } from "../types";
|
|
24
|
+
|
|
25
|
+
/**
 * Reports whether `char` is exactly one character from the anchor-name
 * alphabet: ASCII letters, ASCII digits, `-`, `_`, `.` or `%`.
 *
 * The pattern is anchored at both ends, so the empty string and any
 * multi-character string are rejected.
 *
 * @param char - Candidate character (expected to be a single character)
 * @returns `true` when `char` is one allowed anchor-name character
 */
function isValidAnchorChar(char: string): boolean {
  const singleAllowedChar = /^[-_A-Za-z0-9.%]$/;
  return singleAllowedChar.test(char);
}
|
|
37
|
+
|
|
38
|
+
/**
 * Inline rule that recognises a named anchor target, `[[# name]]`.
 *
 * The rule is tried on `BLOCK_OPEN` tokens and walks the token stream in
 * this order:
 * 1. `[[`, followed by any number of whitespace tokens
 * 2. a hash marker: a `HASH` token, or a `TEXT` token whose value is `"#"`
 * 3. one or more whitespace tokens (at least one is mandatory)
 * 4. a run of tokens whose values consist solely of anchor-name characters
 * 5. the closing `]]` (`BLOCK_CLOSE`)
 *
 * The rule fails when any step is not satisfied, including when the
 * collected name is empty.
 */
export const anchorNameRule: InlineRule = {
  name: "anchorName",
  startTokens: ["BLOCK_OPEN"],

  /**
   * Tries to match `[[# name]]` starting at `ctx.pos`.
   *
   * @param ctx - Parse context holding the token stream and current position
   * @returns On success, one `"anchor-name"` element whose `data` is the
   *   collected name plus the number of tokens consumed; otherwise
   *   `{ success: false }`
   */
  parse(ctx: ParseContext): RuleResult<Element> {
    if (currentToken(ctx).type !== "BLOCK_OPEN") {
      return { success: false };
    }

    const { tokens } = ctx;

    // Returns the index of the first non-whitespace token at or after `from`.
    const skipWhitespace = (from: number): number => {
      let index = from;
      while (tokens[index]?.type === "WHITESPACE") {
        index += 1;
      }
      return index;
    };

    // A single cursor is enough: the consumed count is always its distance
    // from the rule's starting position.
    let cursor = skipWhitespace(ctx.pos + 1);

    // The hash marker may be lexed either as HASH or as a plain "#" TEXT token.
    const marker = tokens[cursor];
    const isHashMarker =
      marker != null &&
      (marker.type === "HASH" || (marker.type === "TEXT" && marker.value === "#"));
    if (!isHashMarker) {
      return { success: false };
    }
    cursor += 1;

    // At least one whitespace token must separate "#" from the name.
    if (tokens[cursor]?.type !== "WHITESPACE") {
      return { success: false };
    }
    cursor = skipWhitespace(cursor + 1);

    // Accumulate the name from consecutive tokens made only of valid characters.
    let name = "";
    for (; cursor < tokens.length; cursor += 1) {
      const part = tokens[cursor];
      if (!part) {
        break;
      }
      const endsName =
        part.type === "BLOCK_CLOSE" || part.type === "NEWLINE" || part.type === "EOF";
      if (endsName) {
        break;
      }
      if (![...part.value].every((ch) => isValidAnchorChar(ch))) {
        break;
      }
      name += part.value;
    }

    if (name === "") {
      return { success: false };
    }

    // Whatever stopped the name run must be the closing "]]".
    if (tokens[cursor]?.type !== "BLOCK_CLOSE") {
      return { success: false };
    }
    cursor += 1;

    return {
      success: true,
      elements: [{ element: "anchor-name", data: name }],
      consumed: cursor - ctx.pos,
    };
  },
};
|
|
@@ -0,0 +1,327 @@
|
|
|
1
|
+
/**
|
|
2
|
+
*
|
|
3
|
+
* Parses the Wikidot anchor inline block syntax: `[[a]]...[[/a]]`.
|
|
4
|
+
*
|
|
5
|
+
* An anchor wraps inline content in an HTML `<a>` element, allowing
|
|
6
|
+
* href, target, and other HTML attributes to be specified.
|
|
7
|
+
*
|
|
8
|
+
* Wikidot syntax variants:
|
|
9
|
+
* - `[[a href="url"]]text[[/a]]` -- basic anchor with href
|
|
10
|
+
* - `[[a_ href="url"]]text[[/a]]` -- paragraph strip mode (trailing underscore)
|
|
11
|
+
*
|
|
12
|
+
* Paragraph strip mode (`[[a_]]`) suppresses newlines within the anchor
|
|
13
|
+
* body and strips at most one trailing newline after the closing tag
|
|
14
|
+
* (preserving double newlines as paragraph breaks). This prevents
|
|
15
|
+
* unwanted `<br>` elements when consecutive anchor blocks are placed on
|
|
16
|
+
* separate lines.
|
|
17
|
+
*
|
|
18
|
+
* The `target` attribute is extracted and mapped to a semantic enum value
|
|
19
|
+
* (`"new-tab"`, `"parent"`, `"top"`, `"same"`), while the remaining
|
|
20
|
+
* attributes (including `href`) are passed through after URL sanitization.
|
|
21
|
+
*
|
|
22
|
+
* @module
|
|
23
|
+
*/
|
|
24
|
+
import type { Element } from "@wdprlib/ast";
|
|
25
|
+
import type { InlineRule, ParseContext, RuleResult } from "../types";
|
|
26
|
+
import { currentToken } from "../types";
|
|
27
|
+
import { inlineRules } from "../index";
|
|
28
|
+
import { sanitizeUrl as braintreeSanitizeUrl } from "@braintree/sanitize-url";
|
|
29
|
+
import { parseAttributes } from "../block/utils";
|
|
30
|
+
import { canApplyInlineRule } from "./utils";
|
|
31
|
+
|
|
32
|
+
/**
 * Rejects URLs that use a dangerous scheme; safe URLs are returned untouched.
 *
 * Two checks run in sequence:
 * 1. A copy of the URL with all whitespace and C0 control characters removed,
 *    lower-cased, is tested for a `javascript:`, `data:` or `vbscript:`
 *    prefix. Stripping first defeats spellings such as `"java script:"`.
 * 2. The unmodified URL is passed to `@braintree/sanitize-url`; a result of
 *    `"about:blank"` is treated as a rejection.
 *
 * The input string itself (not a normalised form) is returned when both
 * checks pass, so the caller's URL is never rewritten.
 *
 * @param url - Raw URL string taken from markup
 * @returns `url` unchanged when accepted, otherwise `"#invalid-url"`
 */
function sanitizeUrl(url: string): string {
  const rejected = "#invalid-url";

  // Collapse whitespace/control characters so split-up scheme names are caught.
  const compact = url.replace(/[\s\u0000-\u001f]/g, "").toLowerCase();

  const usesDangerousScheme = ["javascript:", "data:", "vbscript:"].some((scheme) =>
    compact.startsWith(scheme),
  );
  if (usesDangerousScheme) {
    return rejected;
  }

  // Second opinion from the library; only its verdict is used, not its output.
  return braintreeSanitizeUrl(url) === "about:blank" ? rejected : url;
}
|
|
68
|
+
|
|
69
|
+
/**
 * Reads a block name, with an optional `_` suffix, from the token stream.
 *
 * Leading whitespace tokens are skipped. The next token must be `TEXT` or
 * `IDENTIFIER`; its value is lower-cased and becomes the name. If an
 * `UNDERSCORE` token follows immediately, it is consumed as well, `"_"` is
 * appended to the name, and `score` is set.
 *
 * This helper does not check that the name is `a` or `anchor`; callers
 * compare the returned name themselves.
 *
 * @param ctx - Parse context providing the token stream
 * @param startPos - Index of the first token to examine
 * @returns The lower-cased name (including a trailing `_` when present),
 *   the `score` flag, and the number of tokens consumed (whitespace
 *   included); `null` when no `TEXT`/`IDENTIFIER` token is found
 */
function parseAnchorBlockName(
  ctx: ParseContext,
  startPos: number,
): { name: string; score: boolean; consumed: number } | null {
  const { tokens } = ctx;

  let cursor = startPos;
  while (tokens[cursor]?.type === "WHITESPACE") {
    cursor += 1;
  }

  const nameToken = tokens[cursor];
  if (!nameToken) {
    return null;
  }
  if (nameToken.type !== "TEXT" && nameToken.type !== "IDENTIFIER") {
    return null;
  }
  cursor += 1;

  // A directly following underscore switches on paragraph-strip mode.
  const score = tokens[cursor]?.type === "UNDERSCORE";
  if (score) {
    cursor += 1;
  }

  const lowered = nameToken.value.toLowerCase();
  return {
    name: score ? `${lowered}_` : lowered,
    score,
    consumed: cursor - startPos,
  };
}
|
|
116
|
+
|
|
117
|
+
/**
 * Inline rule for the `[[a ...]]...[[/a]]` block (also spelled `anchor`).
 *
 * The rule is tried on `BLOCK_OPEN` tokens. It reads the block name, hands
 * the following tokens to `parseAttributes`, requires `]]`, and then parses
 * the body with the shared inline rules until a closing `[[/a` or
 * `[[/anchor` is reached. The result is an `"anchor"` element carrying the
 * body elements, a semantic `target` value and the remaining attributes.
 *
 * Behaviour worth knowing:
 * - With no closing tag, an `unclosed-block` warning is pushed onto
 *   `ctx.diagnostics` and the rule fails.
 * - With the `_` suffix (`[[a_ ...]]`), newlines in the body are dropped
 *   instead of becoming `line-break` elements, leading and trailing
 *   `line-break` children are trimmed, and one newline after the closing
 *   tag is swallowed unless it is followed by another newline.
 * - `target` is removed from the attribute map and mapped to `"new-tab"`,
 *   `"parent"`, `"top"` or `"same"` (`null` for any other value).
 * - A non-empty `href` attribute is passed through `sanitizeUrl`.
 */
export const anchorRule: InlineRule = {
  name: "anchor",
  startTokens: ["BLOCK_OPEN"],

  /**
   * Tries to match an anchor block starting at `ctx.pos`.
   *
   * @param ctx - Parse context holding the token stream and current position
   * @returns On success, one `"anchor"` element plus the number of tokens
   *   consumed; otherwise `{ success: false }`
   */
  parse(ctx: ParseContext): RuleResult<Element> {
    const openToken = currentToken(ctx);
    if (openToken.type !== "BLOCK_OPEN") {
      return { success: false };
    }

    const { tokens } = ctx;

    // True for "a" / "anchor", with or without the trailing underscore.
    const isAnchorName = (rawName: string): boolean => {
      const base = rawName.replace(/_$/, "");
      return base === "a" || base === "anchor";
    };

    // One cursor tracks progress; consumed is its distance from ctx.pos.
    let cursor = ctx.pos + 1;

    const nameResult = parseAnchorBlockName(ctx, cursor);
    if (!nameResult || !isAnchorName(nameResult.name)) {
      return { success: false };
    }
    const stripParagraphs = nameResult.score;
    cursor += nameResult.consumed;

    const attrResult = parseAttributes(ctx, cursor);
    cursor += attrResult.consumed;

    // The opening tag must end with "]]".
    if (tokens[cursor]?.type !== "BLOCK_CLOSE") {
      return { success: false };
    }
    cursor += 1;

    const body: Element[] = [];
    let closed = false;

    while (cursor < tokens.length) {
      const token = tokens[cursor];
      if (!token || token.type === "EOF") {
        break;
      }

      if (token.type === "BLOCK_END_OPEN") {
        const closer = parseAnchorBlockName(ctx, cursor + 1);
        if (closer && isAnchorName(closer.name)) {
          // Step over "[[/" and the name.
          cursor += 1 + closer.consumed;
          // NOTE(review): the closing tag is accepted even when "]]" does not
          // follow; confirm this leniency is intended.
          if (tokens[cursor]?.type === "BLOCK_CLOSE") {
            cursor += 1;
          }
          closed = true;

          // In strip mode, swallow a single newline after the closing tag,
          // but leave a blank line (two newlines) alone.
          const swallowTrailingNewline =
            stripParagraphs &&
            tokens[cursor]?.type === "NEWLINE" &&
            tokens[cursor + 1]?.type !== "NEWLINE";
          if (swallowTrailingNewline) {
            cursor += 1;
          }
          break;
        }
      }

      if (token.type === "NEWLINE") {
        cursor += 1;
        if (!stripParagraphs) {
          body.push({ element: "line-break" });
          // Drop indentation at the start of the next line.
          while (tokens[cursor]?.type === "WHITESPACE" && tokens[cursor]?.lineStart) {
            cursor += 1;
          }
        }
        continue;
      }

      if (token.type === "WHITESPACE" && token.lineStart) {
        cursor += 1;
        continue;
      }

      // Offer the token to each applicable inline rule; the first success wins.
      const inlineCtx: ParseContext = { ...ctx, pos: cursor };
      let handled = false;
      for (const rule of inlineRules) {
        if (!canApplyInlineRule(rule, token)) {
          continue;
        }
        const attempt = rule.parse(inlineCtx);
        if (attempt.success) {
          body.push(...attempt.elements);
          cursor += attempt.consumed;
          handled = true;
          break;
        }
      }

      // No rule matched: keep the token as literal text.
      if (!handled) {
        body.push({ element: "text", data: token.value });
        cursor += 1;
      }
    }

    if (!closed) {
      ctx.diagnostics.push({
        severity: "warning",
        code: "unclosed-block",
        message: `Missing closing tag [[/a]] for [[${nameResult.name}]]`,
        position: openToken.position,
      });
      return { success: false };
    }

    // In strip mode, trim line-break elements from both ends of the body.
    let children = body;
    if (stripParagraphs) {
      let first = 0;
      while (first < body.length && body[first]?.element === "line-break") {
        first += 1;
      }
      let end = body.length;
      while (end > first && body[end - 1]?.element === "line-break") {
        end -= 1;
      }
      children = body.slice(first, end);
    }

    // Pull `target` out of the attribute map; `href` and the rest stay.
    const { target: _rawTarget, ...cleanAttrs } = attrResult.attrs;

    let target: "new-tab" | "parent" | "top" | "same" | null;
    switch (attrResult.attrs.target) {
      case "_blank":
        target = "new-tab";
        break;
      case "_parent":
        target = "parent";
        break;
      case "_top":
        target = "top";
        break;
      case "_self":
        target = "same";
        break;
      default:
        target = null;
    }

    // Neutralise dangerous href schemes.
    if (cleanAttrs.href) {
      cleanAttrs.href = sanitizeUrl(cleanAttrs.href);
    }

    return {
      success: true,
      elements: [
        {
          element: "anchor",
          data: {
            target,
            attributes: cleanAttrs,
            elements: children,
          },
        },
      ],
      consumed: cursor - ctx.pos,
    };
  },
};
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
/**
|
|
2
|
+
*
|
|
3
|
+
* Parses the Wikidot bibliography citation syntax: `((bibcite label))`.
|
|
4
|
+
*
|
|
5
|
+
* A bibcite creates a numbered inline reference (similar to footnotes)
|
|
6
|
+
* that links to a corresponding entry in a `[[bibliography]]` block
|
|
7
|
+
* elsewhere on the page. The `label` string is used to match the
|
|
8
|
+
* citation with its bibliography entry.
|
|
9
|
+
*
|
|
10
|
+
* Unlike most inline blocks that start with `[[`, bibcite uses double
|
|
11
|
+
* parentheses `((...))` as delimiters. The keyword `bibcite` must
|
|
12
|
+
* appear (case-insensitive) between the opening `((` and the label.
|
|
13
|
+
*
|
|
14
|
+
* Produces a `"bibliography-cite"` AST element. The label is also
|
|
15
|
+
* pushed into `ctx.bibcites` so the renderer can later resolve
|
|
16
|
+
* citation numbers.
|
|
17
|
+
*
|
|
18
|
+
* Wikidot syntax examples:
|
|
19
|
+
* - `((bibcite author2024))` -- cite with label "author2024"
|
|
20
|
+
* - `((bibcite my-source))` -- cite with label "my-source"
|
|
21
|
+
*
|
|
22
|
+
* @module
|
|
23
|
+
*/
|
|
24
|
+
import type { Element } from "@wdprlib/ast";
|
|
25
|
+
import type { InlineRule, ParseContext, RuleResult } from "../types";
|
|
26
|
+
import { currentToken } from "../types";
|
|
27
|
+
|
|
28
|
+
/**
 * Inline rule for bibliography citations written as `((bibcite label))`.
 *
 * The rule is tried on `TEXT` tokens and only proceeds when the current and
 * next tokens are both `TEXT` tokens with value `"("`. After optional
 * whitespace it requires an `IDENTIFIER` equal to `bibcite`
 * (case-insensitive), then optional whitespace, then a label that starts
 * with an `IDENTIFIER` or `TEXT` token. Token values are concatenated into
 * the label until two consecutive `")"` `TEXT` tokens are found.
 *
 * The rule fails when a `NEWLINE` or `EOF` token is reached before `))`,
 * when the stream ends without `))`, or when the trimmed label is empty.
 *
 * Side effect: on success the trimmed label is appended to `ctx.bibcites`.
 */
export const bibciteRule: InlineRule = {
  name: "bibcite",
  startTokens: ["TEXT"],

  /**
   * Tries to match `((bibcite label))` starting at `ctx.pos`.
   *
   * @param ctx - Parse context holding the token stream and current position
   * @returns On success, one `"bibliography-cite"` element plus the number
   *   of tokens consumed; otherwise `{ success: false }`
   */
  parse(ctx: ParseContext): RuleResult<Element> {
    const { tokens } = ctx;

    // True when the token at `index` is a TEXT token with exactly `text`.
    const isTextAt = (index: number, text: string): boolean => {
      const candidate = tokens[index];
      return candidate != null && candidate.type === "TEXT" && candidate.value === text;
    };

    // Returns the index of the first non-whitespace token at or after `from`.
    const skipWhitespace = (from: number): number => {
      let index = from;
      while (tokens[index]?.type === "WHITESPACE") {
        index += 1;
      }
      return index;
    };

    // Opening delimiter: two "(" TEXT tokens in a row.
    const first = currentToken(ctx);
    if (first.type !== "TEXT" || first.value !== "(") {
      return { success: false };
    }
    if (!isTextAt(ctx.pos + 1, "(")) {
      return { success: false };
    }

    // Keyword, matched case-insensitively.
    let cursor = skipWhitespace(ctx.pos + 2);
    const keyword = tokens[cursor];
    const isBibcite =
      keyword != null &&
      keyword.type === "IDENTIFIER" &&
      keyword.value.toLowerCase() === "bibcite";
    if (!isBibcite) {
      return { success: false };
    }
    cursor = skipWhitespace(cursor + 1);

    // The label has to begin with an IDENTIFIER or TEXT token.
    const labelStart = tokens[cursor];
    if (!labelStart) {
      return { success: false };
    }
    if (labelStart.type !== "IDENTIFIER" && labelStart.type !== "TEXT") {
      return { success: false };
    }

    // Gather token values until the closing "))" shows up.
    let label = "";
    let closed = false;
    while (cursor < tokens.length) {
      const piece = tokens[cursor];
      if (!piece) {
        break;
      }
      if (isTextAt(cursor, ")") && isTextAt(cursor + 1, ")")) {
        closed = true;
        break;
      }
      // A citation may not span lines or run off the end of input.
      if (piece.type === "NEWLINE" || piece.type === "EOF") {
        return { success: false };
      }
      label += piece.value;
      cursor += 1;
    }

    if (!closed) {
      return { success: false };
    }

    label = label.trim();
    if (!label) {
      return { success: false };
    }

    // Record the label on the context for later resolution.
    ctx.bibcites.push(label);

    return {
      success: true,
      elements: [
        {
          element: "bibliography-cite",
          data: {
            label,
            brackets: false, // Wikidot adds brackets in output but they're not in the AST
          },
        },
      ],
      // Everything up to the cursor, plus the two closing ")" tokens.
      consumed: cursor - ctx.pos + 2,
    };
  },
};
|