@markuplint/markdown-parser 5.0.0-alpha.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/ARCHITECTURE.md +190 -0
- package/CHANGELOG.md +10 -0
- package/LICENSE +21 -0
- package/README.ja.md +47 -0
- package/README.md +47 -0
- package/lib/index.d.ts +6 -0
- package/lib/index.js +6 -0
- package/lib/markdown-aware-parser.d.ts +179 -0
- package/lib/markdown-aware-parser.js +529 -0
- package/lib/parser.d.ts +40 -0
- package/lib/parser.js +91 -0
- package/package.json +39 -0
- package/src/index.spec.ts +747 -0
- package/src/index.ts +7 -0
- package/src/markdown-aware-parser.ts +656 -0
- package/src/parser.ts +109 -0
- package/tsconfig.build.json +9 -0
- package/tsconfig.build.tsbuildinfo +1 -0
- package/tsconfig.json +17 -0
|
@@ -0,0 +1,529 @@
|
|
|
1
|
+
import { Parser, getNamespace } from '@markuplint/parser-utils';
|
|
2
|
+
/**
|
|
3
|
+
* Abstract base class for parsers that handle Markdown content.
|
|
4
|
+
*
|
|
5
|
+
* Provides shared logic for converting mdast nodes (headings, links, images,
|
|
6
|
+
* lists, code, tables, etc.) into markuplint's AST. Both MarkdownParser and
|
|
7
|
+
* MDXParser extend this class to avoid code duplication.
|
|
8
|
+
*/
|
|
9
|
+
export class MarkdownAwareParser extends Parser {
|
|
10
|
+
/**
|
|
11
|
+
* Stores link/image reference definitions (`[id]: url "title"`)
|
|
12
|
+
* extracted during tokenization for resolving linkReference/imageReference nodes.
|
|
13
|
+
*/
|
|
14
|
+
definitions = new Map();
|
|
15
|
+
/**
|
|
16
|
+
* Offsets of table rows that are header rows (first row of each table).
|
|
17
|
+
* Set by visitTableElement, read by nodeizeMarkdownNode for tableRow dispatch.
|
|
18
|
+
*/
|
|
19
|
+
#headerRowOffsets = new Set();
|
|
20
|
+
/**
|
|
21
|
+
* Current cell element name ('th' or 'td').
|
|
22
|
+
* Set by tableRow processing, read by tableCell processing.
|
|
23
|
+
* Reset to 'td' after each row.
|
|
24
|
+
*/
|
|
25
|
+
#currentCellName = 'td';
|
|
26
|
+
constructor(options) {
|
|
27
|
+
super(options);
|
|
28
|
+
}
|
|
29
|
+
/**
|
|
30
|
+
* Resets mutable state accumulated during a previous `parse()` call.
|
|
31
|
+
*
|
|
32
|
+
* Must be called at the beginning of every `tokenize()` invocation to
|
|
33
|
+
* prevent definitions, header-row offsets, and cell-name state from
|
|
34
|
+
* leaking across successive `parse()` calls on the same parser instance.
|
|
35
|
+
*/
|
|
36
|
+
resetMarkdownState() {
|
|
37
|
+
this.definitions.clear();
|
|
38
|
+
this.#headerRowOffsets.clear();
|
|
39
|
+
this.#currentCellName = 'td';
|
|
40
|
+
}
|
|
41
|
+
/**
|
|
42
|
+
* Adjusts the flattened node list for Markdown output.
|
|
43
|
+
*
|
|
44
|
+
* Disables whitespace and invalid-node exposure because Markdown
|
|
45
|
+
* generates only synthetic elements with no real HTML whitespace tokens.
|
|
46
|
+
*
|
|
47
|
+
* @param nodeList - The flattened node tree produced by the base class.
|
|
48
|
+
* @returns The adjusted node list.
|
|
49
|
+
*/
|
|
50
|
+
afterFlattenNodes(nodeList) {
|
|
51
|
+
return super.afterFlattenNodes(nodeList, {
|
|
52
|
+
exposeWhiteSpace: false,
|
|
53
|
+
exposeInvalidNode: false,
|
|
54
|
+
});
|
|
55
|
+
}
|
|
56
|
+
/**
|
|
57
|
+
* Creates a synthetic HTML attribute token for Markdown-derived elements.
|
|
58
|
+
*
|
|
59
|
+
* The attribute positions point to the element's own token range because
|
|
60
|
+
* Markdown syntax does not have discrete attribute source positions.
|
|
61
|
+
*
|
|
62
|
+
* @param name - The attribute name (e.g., `"href"`, `"alt"`).
|
|
63
|
+
* @param value - The attribute value extracted from Markdown syntax.
|
|
64
|
+
* @param token - The source token whose position is reused for the attribute.
|
|
65
|
+
* @returns A fully-formed HTML attribute node.
|
|
66
|
+
*/
|
|
67
|
+
createSyntheticAttr(name, value, token) {
|
|
68
|
+
const emptyToken = this.createToken('', token.offset, token.line, token.col);
|
|
69
|
+
const nameToken = this.createToken(name, token.offset, token.line, token.col);
|
|
70
|
+
const valueToken = this.createToken(value, token.offset, token.line, token.col);
|
|
71
|
+
const attrToken = this.createToken(`${name}="${value}"`, token.offset, token.line, token.col);
|
|
72
|
+
return {
|
|
73
|
+
...attrToken,
|
|
74
|
+
type: 'attr',
|
|
75
|
+
nodeName: name,
|
|
76
|
+
spacesBeforeName: emptyToken,
|
|
77
|
+
name: nameToken,
|
|
78
|
+
spacesBeforeEqual: emptyToken,
|
|
79
|
+
equal: this.createToken('=', token.offset, token.line, token.col),
|
|
80
|
+
spacesAfterEqual: emptyToken,
|
|
81
|
+
startQuote: this.createToken('"', token.offset, token.line, token.col),
|
|
82
|
+
value: valueToken,
|
|
83
|
+
endQuote: this.createToken('"', token.offset, token.line, token.col),
|
|
84
|
+
isDuplicatable: false,
|
|
85
|
+
};
|
|
86
|
+
}
|
|
87
|
+
/**
|
|
88
|
+
* Builds a generic HTML element node from a Markdown construct.
|
|
89
|
+
*
|
|
90
|
+
* @param token - The source token covering the entire construct.
|
|
91
|
+
* @param nodeName - The HTML element name (e.g., `"p"`, `"h1"`, `"li"`).
|
|
92
|
+
* @param childNodes - The mdast children to recurse into.
|
|
93
|
+
* @param depth - Current nesting depth in the AST.
|
|
94
|
+
* @param parentNode - Parent AST node, or `null` for top-level nodes.
|
|
95
|
+
* @param attributes - Optional pre-built attributes to attach.
|
|
96
|
+
* @returns The element node followed by its descendants.
|
|
97
|
+
*/
|
|
98
|
+
visitMarkdownElement(token, nodeName,
|
|
99
|
+
// eslint-disable-next-line @typescript-eslint/prefer-readonly-parameter-types
|
|
100
|
+
childNodes, depth, parentNode, attributes = []) {
|
|
101
|
+
const startTag = {
|
|
102
|
+
...token,
|
|
103
|
+
...this.createToken(token),
|
|
104
|
+
attributes: [...attributes],
|
|
105
|
+
type: 'starttag',
|
|
106
|
+
elementType: this.detectElementType(nodeName),
|
|
107
|
+
namespace: getNamespace(nodeName, parentNode),
|
|
108
|
+
childNodes: [],
|
|
109
|
+
blockBehavior: null,
|
|
110
|
+
depth,
|
|
111
|
+
parentNode,
|
|
112
|
+
pairNode: null,
|
|
113
|
+
tagOpenChar: '',
|
|
114
|
+
tagCloseChar: '',
|
|
115
|
+
isGhost: false,
|
|
116
|
+
isFragment: false,
|
|
117
|
+
nodeName,
|
|
118
|
+
};
|
|
119
|
+
// Safe cast: childNodes are always subtypes of RootContent (= MdastNode)
|
|
120
|
+
const siblings = this.visitChildren([...childNodes], startTag);
|
|
121
|
+
return [startTag, ...siblings];
|
|
122
|
+
}
|
|
123
|
+
/**
|
|
124
|
+
* Builds an `<a>` element with `href` (and optionally `title`) attributes.
|
|
125
|
+
*
|
|
126
|
+
* @param originNode - The mdast `link` node.
|
|
127
|
+
* @param token - The source token covering the link.
|
|
128
|
+
* @param depth - Current nesting depth.
|
|
129
|
+
* @param parentNode - Parent AST node, or `null` for top-level.
|
|
130
|
+
* @returns The `<a>` element node and its descendants.
|
|
131
|
+
*/
|
|
132
|
+
visitLinkElement(
|
|
133
|
+
// eslint-disable-next-line @typescript-eslint/prefer-readonly-parameter-types
|
|
134
|
+
originNode, token, depth, parentNode) {
|
|
135
|
+
const attrs = [this.createSyntheticAttr('href', originNode.url, token)];
|
|
136
|
+
if (originNode.title != null) {
|
|
137
|
+
attrs.push(this.createSyntheticAttr('title', originNode.title, token));
|
|
138
|
+
}
|
|
139
|
+
return this.visitMarkdownElement(token, 'a', originNode.children, depth, parentNode, attrs);
|
|
140
|
+
}
|
|
141
|
+
/**
|
|
142
|
+
* Builds an `<img>` element with `src`, `alt`, and optionally `title` attributes.
|
|
143
|
+
*
|
|
144
|
+
* @param originNode - The mdast `image` node.
|
|
145
|
+
* @param token - The source token covering the image.
|
|
146
|
+
* @param depth - Current nesting depth.
|
|
147
|
+
* @param parentNode - Parent AST node, or `null` for top-level.
|
|
148
|
+
* @returns The `<img>` element node.
|
|
149
|
+
*/
|
|
150
|
+
visitImageElement(
|
|
151
|
+
// eslint-disable-next-line @typescript-eslint/prefer-readonly-parameter-types
|
|
152
|
+
originNode, token, depth, parentNode) {
|
|
153
|
+
const attrs = [
|
|
154
|
+
this.createSyntheticAttr('src', originNode.url, token),
|
|
155
|
+
this.createSyntheticAttr('alt', originNode.alt ?? '', token),
|
|
156
|
+
];
|
|
157
|
+
if (originNode.title != null) {
|
|
158
|
+
attrs.push(this.createSyntheticAttr('title', originNode.title, token));
|
|
159
|
+
}
|
|
160
|
+
return this.visitMarkdownElement(token, 'img', [], depth, parentNode, attrs);
|
|
161
|
+
}
|
|
162
|
+
/**
|
|
163
|
+
* Builds a `<ul>` or `<ol>` element. Adds a `start` attribute when the
|
|
164
|
+
* ordered list begins at a number other than 1.
|
|
165
|
+
*
|
|
166
|
+
* @param originNode - The mdast `list` node.
|
|
167
|
+
* @param token - The source token covering the list.
|
|
168
|
+
* @param depth - Current nesting depth.
|
|
169
|
+
* @param parentNode - Parent AST node, or `null` for top-level.
|
|
170
|
+
* @returns The list element node and its descendants.
|
|
171
|
+
*/
|
|
172
|
+
visitListElement(
|
|
173
|
+
// eslint-disable-next-line @typescript-eslint/prefer-readonly-parameter-types
|
|
174
|
+
originNode, token, depth, parentNode) {
|
|
175
|
+
const nodeName = originNode.ordered ? 'ol' : 'ul';
|
|
176
|
+
const attrs = [];
|
|
177
|
+
if (originNode.ordered && originNode.start != null && originNode.start !== 1) {
|
|
178
|
+
attrs.push(this.createSyntheticAttr('start', String(originNode.start), token));
|
|
179
|
+
}
|
|
180
|
+
return this.visitMarkdownElement(token, nodeName, originNode.children, depth, parentNode, attrs);
|
|
181
|
+
}
|
|
182
|
+
/**
|
|
183
|
+
* Builds a `<code>` element for inline code spans (backtick-delimited).
|
|
184
|
+
*
|
|
185
|
+
* @param originNode - The mdast `inlineCode` node.
|
|
186
|
+
* @param token - The source token covering the code span.
|
|
187
|
+
* @param offset - Start offset in the original source.
|
|
188
|
+
* @param endOffset - End offset in the original source.
|
|
189
|
+
* @param depth - Current nesting depth.
|
|
190
|
+
* @param parentNode - Parent AST node, or `null` for top-level.
|
|
191
|
+
* @returns The `<code>` element node (with a text child when content is found).
|
|
192
|
+
*/
|
|
193
|
+
visitInlineCode(
|
|
194
|
+
// eslint-disable-next-line @typescript-eslint/prefer-readonly-parameter-types
|
|
195
|
+
originNode, token, offset, endOffset, depth, parentNode) {
|
|
196
|
+
const startTag = {
|
|
197
|
+
...token,
|
|
198
|
+
...this.createToken(token),
|
|
199
|
+
attributes: [],
|
|
200
|
+
type: 'starttag',
|
|
201
|
+
elementType: this.detectElementType('code'),
|
|
202
|
+
namespace: getNamespace('code', parentNode),
|
|
203
|
+
childNodes: [],
|
|
204
|
+
blockBehavior: null,
|
|
205
|
+
depth,
|
|
206
|
+
parentNode,
|
|
207
|
+
pairNode: null,
|
|
208
|
+
tagOpenChar: '',
|
|
209
|
+
tagCloseChar: '',
|
|
210
|
+
isGhost: false,
|
|
211
|
+
isFragment: false,
|
|
212
|
+
nodeName: 'code',
|
|
213
|
+
};
|
|
214
|
+
const raw = this.rawCode.slice(offset, endOffset);
|
|
215
|
+
const valueStart = raw.indexOf(originNode.value);
|
|
216
|
+
// Defensive guard: if value cannot be found in raw source (e.g., whitespace-only code spans) or is empty
|
|
217
|
+
if (valueStart === -1 || originNode.value.length === 0) {
|
|
218
|
+
return [startTag];
|
|
219
|
+
}
|
|
220
|
+
const valueOffset = offset + valueStart;
|
|
221
|
+
const valueEndOffset = valueOffset + originNode.value.length;
|
|
222
|
+
const textToken = this.sliceFragment(valueOffset, valueEndOffset);
|
|
223
|
+
const textNode = {
|
|
224
|
+
...textToken,
|
|
225
|
+
...this.createToken(textToken),
|
|
226
|
+
type: 'text',
|
|
227
|
+
depth: depth + 1,
|
|
228
|
+
nodeName: '#text',
|
|
229
|
+
parentNode: startTag,
|
|
230
|
+
};
|
|
231
|
+
this.appendChild(startTag, textNode);
|
|
232
|
+
return [startTag];
|
|
233
|
+
}
|
|
234
|
+
/**
|
|
235
|
+
* Builds a `<pre><code>` structure for fenced code blocks.
|
|
236
|
+
* When a language is specified, adds `class="language-{lang}"` to the `<code>` element.
|
|
237
|
+
*
|
|
238
|
+
* @param originNode - The mdast `code` node.
|
|
239
|
+
* @param token - The source token covering the fenced block.
|
|
240
|
+
* @param depth - Current nesting depth.
|
|
241
|
+
* @param parentNode - Parent AST node, or `null` for top-level.
|
|
242
|
+
* @returns The `<pre>` and `<code>` element nodes.
|
|
243
|
+
*/
|
|
244
|
+
visitCodeBlock(
|
|
245
|
+
// eslint-disable-next-line @typescript-eslint/prefer-readonly-parameter-types
|
|
246
|
+
originNode, token, depth, parentNode) {
|
|
247
|
+
// Build <pre> element
|
|
248
|
+
const preTag = {
|
|
249
|
+
...token,
|
|
250
|
+
...this.createToken(token),
|
|
251
|
+
attributes: [],
|
|
252
|
+
type: 'starttag',
|
|
253
|
+
elementType: this.detectElementType('pre'),
|
|
254
|
+
namespace: getNamespace('pre', parentNode),
|
|
255
|
+
childNodes: [],
|
|
256
|
+
blockBehavior: null,
|
|
257
|
+
depth,
|
|
258
|
+
parentNode,
|
|
259
|
+
pairNode: null,
|
|
260
|
+
tagOpenChar: '',
|
|
261
|
+
tagCloseChar: '',
|
|
262
|
+
isGhost: false,
|
|
263
|
+
isFragment: false,
|
|
264
|
+
nodeName: 'pre',
|
|
265
|
+
};
|
|
266
|
+
// Build <code> element as child of <pre>
|
|
267
|
+
const codeAttrs = [];
|
|
268
|
+
if (originNode.lang) {
|
|
269
|
+
codeAttrs.push(this.createSyntheticAttr('class', `language-${originNode.lang}`, token));
|
|
270
|
+
}
|
|
271
|
+
const codeTag = {
|
|
272
|
+
...token,
|
|
273
|
+
...this.createToken(token),
|
|
274
|
+
attributes: codeAttrs,
|
|
275
|
+
type: 'starttag',
|
|
276
|
+
elementType: this.detectElementType('code'),
|
|
277
|
+
namespace: getNamespace('code', preTag),
|
|
278
|
+
childNodes: [],
|
|
279
|
+
blockBehavior: null,
|
|
280
|
+
depth: depth + 1,
|
|
281
|
+
parentNode: preTag,
|
|
282
|
+
pairNode: null,
|
|
283
|
+
tagOpenChar: '',
|
|
284
|
+
tagCloseChar: '',
|
|
285
|
+
isGhost: false,
|
|
286
|
+
isFragment: false,
|
|
287
|
+
nodeName: 'code',
|
|
288
|
+
};
|
|
289
|
+
// Add code content as text node if present
|
|
290
|
+
if (originNode.value.length > 0) {
|
|
291
|
+
const position = originNode.position;
|
|
292
|
+
if (position) {
|
|
293
|
+
const rawContent = this.rawCode.slice(position.start.offset ?? 0, position.end.offset ?? 0);
|
|
294
|
+
const valueStart = rawContent.indexOf(originNode.value);
|
|
295
|
+
if (valueStart !== -1) {
|
|
296
|
+
const valueOffset = (position.start.offset ?? 0) + valueStart;
|
|
297
|
+
const valueEndOffset = valueOffset + originNode.value.length;
|
|
298
|
+
const textToken = this.sliceFragment(valueOffset, valueEndOffset);
|
|
299
|
+
const textNode = {
|
|
300
|
+
...textToken,
|
|
301
|
+
...this.createToken(textToken),
|
|
302
|
+
type: 'text',
|
|
303
|
+
depth: depth + 2,
|
|
304
|
+
nodeName: '#text',
|
|
305
|
+
parentNode: codeTag,
|
|
306
|
+
};
|
|
307
|
+
this.appendChild(codeTag, textNode);
|
|
308
|
+
}
|
|
309
|
+
}
|
|
310
|
+
}
|
|
311
|
+
this.appendChild(preTag, codeTag);
|
|
312
|
+
return [preTag, codeTag];
|
|
313
|
+
}
|
|
314
|
+
/**
|
|
315
|
+
* Builds a `<table>` element from a GFM table node.
|
|
316
|
+
* Marks the first row's offset as a header row so that its cells become `<th>`.
|
|
317
|
+
*
|
|
318
|
+
* @param originNode - The mdast `table` node (GFM extension).
|
|
319
|
+
* @param token - The source token covering the table.
|
|
320
|
+
* @param depth - Current nesting depth.
|
|
321
|
+
* @param parentNode - Parent AST node, or `null` for top-level.
|
|
322
|
+
* @returns The `<table>` element node and its descendants.
|
|
323
|
+
*/
|
|
324
|
+
visitTableElement(
|
|
325
|
+
// eslint-disable-next-line @typescript-eslint/prefer-readonly-parameter-types
|
|
326
|
+
originNode, token, depth, parentNode) {
|
|
327
|
+
const firstRow = originNode.children[0];
|
|
328
|
+
if (firstRow?.position?.start.offset != null) {
|
|
329
|
+
this.#headerRowOffsets.add(firstRow.position.start.offset);
|
|
330
|
+
}
|
|
331
|
+
return this.visitMarkdownElement(token, 'table', originNode.children, depth, parentNode);
|
|
332
|
+
}
|
|
333
|
+
/**
|
|
334
|
+
* Dispatches a single mdast node to the appropriate visit method.
|
|
335
|
+
*
|
|
336
|
+
* @param originNode - The mdast node to convert.
|
|
337
|
+
* @param token - The source token covering the node's range.
|
|
338
|
+
* @param offset - Start offset in the original source.
|
|
339
|
+
* @param endOffset - End offset in the original source.
|
|
340
|
+
* @param depth - Current nesting depth.
|
|
341
|
+
* @param parentNode - Parent AST node, or `null` for top-level nodes.
|
|
342
|
+
* @returns An array of AST nodes for recognized Markdown constructs,
|
|
343
|
+
* or `null` when the node type is not handled here (the caller is
|
|
344
|
+
* responsible for handling it — typically `text`, `html`, or
|
|
345
|
+
* parser-specific node types).
|
|
346
|
+
*/
|
|
347
|
+
nodeizeMarkdownNode(
|
|
348
|
+
// eslint-disable-next-line @typescript-eslint/prefer-readonly-parameter-types
|
|
349
|
+
originNode, token, offset, endOffset, depth, parentNode) {
|
|
350
|
+
switch (originNode.type) {
|
|
351
|
+
case 'heading': {
|
|
352
|
+
const nodeName = `h${originNode.depth}`;
|
|
353
|
+
return this.visitMarkdownElement(token, nodeName, originNode.children, depth, parentNode);
|
|
354
|
+
}
|
|
355
|
+
case 'paragraph': {
|
|
356
|
+
return this.visitMarkdownElement(token, 'p', originNode.children, depth, parentNode);
|
|
357
|
+
}
|
|
358
|
+
case 'emphasis': {
|
|
359
|
+
return this.visitMarkdownElement(token, 'em', originNode.children, depth, parentNode);
|
|
360
|
+
}
|
|
361
|
+
case 'strong': {
|
|
362
|
+
return this.visitMarkdownElement(token, 'strong', originNode.children, depth, parentNode);
|
|
363
|
+
}
|
|
364
|
+
case 'link': {
|
|
365
|
+
return this.visitLinkElement(originNode, token, depth, parentNode);
|
|
366
|
+
}
|
|
367
|
+
case 'image': {
|
|
368
|
+
return this.visitImageElement(originNode, token, depth, parentNode);
|
|
369
|
+
}
|
|
370
|
+
case 'list': {
|
|
371
|
+
return this.visitListElement(originNode, token, depth, parentNode);
|
|
372
|
+
}
|
|
373
|
+
case 'listItem': {
|
|
374
|
+
return this.visitMarkdownElement(token, 'li', originNode.children, depth, parentNode);
|
|
375
|
+
}
|
|
376
|
+
case 'blockquote': {
|
|
377
|
+
return this.visitMarkdownElement(token, 'blockquote', originNode.children, depth, parentNode);
|
|
378
|
+
}
|
|
379
|
+
case 'thematicBreak': {
|
|
380
|
+
return this.visitMarkdownElement(token, 'hr', [], depth, parentNode);
|
|
381
|
+
}
|
|
382
|
+
case 'break': {
|
|
383
|
+
return this.visitMarkdownElement(token, 'br', [], depth, parentNode);
|
|
384
|
+
}
|
|
385
|
+
case 'inlineCode': {
|
|
386
|
+
return this.visitInlineCode(originNode, token, offset, endOffset, depth, parentNode);
|
|
387
|
+
}
|
|
388
|
+
case 'code': {
|
|
389
|
+
return this.visitCodeBlock(originNode, token, depth, parentNode);
|
|
390
|
+
}
|
|
391
|
+
case 'linkReference': {
|
|
392
|
+
return this.visitLinkReference(originNode, token, depth, parentNode);
|
|
393
|
+
}
|
|
394
|
+
case 'imageReference': {
|
|
395
|
+
return this.visitImageReference(originNode, token, depth, parentNode);
|
|
396
|
+
}
|
|
397
|
+
case 'table': {
|
|
398
|
+
return this.visitTableElement(originNode, token, depth, parentNode);
|
|
399
|
+
}
|
|
400
|
+
case 'tableRow': {
|
|
401
|
+
const isHeader = this.#headerRowOffsets.delete(offset);
|
|
402
|
+
if (isHeader) {
|
|
403
|
+
this.#currentCellName = 'th';
|
|
404
|
+
}
|
|
405
|
+
// tableRow.children is TableCell[] — safely widens to MdastNode[]
|
|
406
|
+
const result = this.visitMarkdownElement(token, 'tr', originNode.children, depth, parentNode);
|
|
407
|
+
this.#currentCellName = 'td';
|
|
408
|
+
return result;
|
|
409
|
+
}
|
|
410
|
+
case 'tableCell': {
|
|
411
|
+
return this.visitMarkdownElement(token, this.#currentCellName, originNode.children, depth, parentNode);
|
|
412
|
+
}
|
|
413
|
+
case 'delete': {
|
|
414
|
+
return this.visitMarkdownElement(token, 'del', originNode.children, depth, parentNode);
|
|
415
|
+
}
|
|
416
|
+
case 'yaml':
|
|
417
|
+
case 'definition':
|
|
418
|
+
case 'footnoteReference':
|
|
419
|
+
case 'footnoteDefinition': {
|
|
420
|
+
return this.visitPsBlock({
|
|
421
|
+
...token,
|
|
422
|
+
depth,
|
|
423
|
+
parentNode,
|
|
424
|
+
nodeName: originNode.type,
|
|
425
|
+
isFragment: false,
|
|
426
|
+
});
|
|
427
|
+
}
|
|
428
|
+
case 'text': {
|
|
429
|
+
// Caller handles text nodes directly
|
|
430
|
+
return null;
|
|
431
|
+
}
|
|
432
|
+
default: {
|
|
433
|
+
// null = the caller is responsible for handling this node type
|
|
434
|
+
return null;
|
|
435
|
+
}
|
|
436
|
+
}
|
|
437
|
+
}
|
|
438
|
+
/**
|
|
439
|
+
* Resolves a linkReference using collected definitions, producing an `<a>` element.
|
|
440
|
+
* Falls back to a psblock when the definition is not found.
|
|
441
|
+
*/
|
|
442
|
+
visitLinkReference(
|
|
443
|
+
// eslint-disable-next-line @typescript-eslint/prefer-readonly-parameter-types
|
|
444
|
+
originNode, token, depth, parentNode) {
|
|
445
|
+
const def = this.definitions.get(originNode.identifier);
|
|
446
|
+
if (!def) {
|
|
447
|
+
return this.visitPsBlock({
|
|
448
|
+
...token,
|
|
449
|
+
depth,
|
|
450
|
+
parentNode,
|
|
451
|
+
nodeName: 'linkReference',
|
|
452
|
+
isFragment: false,
|
|
453
|
+
});
|
|
454
|
+
}
|
|
455
|
+
const attrs = [this.createSyntheticAttr('href', def.url, token)];
|
|
456
|
+
if (def.title != null) {
|
|
457
|
+
attrs.push(this.createSyntheticAttr('title', def.title, token));
|
|
458
|
+
}
|
|
459
|
+
return this.visitMarkdownElement(token, 'a', originNode.children, depth, parentNode, attrs);
|
|
460
|
+
}
|
|
461
|
+
/**
|
|
462
|
+
* Resolves an imageReference using collected definitions, producing an `<img>` element.
|
|
463
|
+
* Falls back to a psblock when the definition is not found.
|
|
464
|
+
*/
|
|
465
|
+
visitImageReference(
|
|
466
|
+
// eslint-disable-next-line @typescript-eslint/prefer-readonly-parameter-types
|
|
467
|
+
originNode, token, depth, parentNode) {
|
|
468
|
+
const def = this.definitions.get(originNode.identifier);
|
|
469
|
+
if (!def) {
|
|
470
|
+
return this.visitPsBlock({
|
|
471
|
+
...token,
|
|
472
|
+
depth,
|
|
473
|
+
parentNode,
|
|
474
|
+
nodeName: 'imageReference',
|
|
475
|
+
isFragment: false,
|
|
476
|
+
});
|
|
477
|
+
}
|
|
478
|
+
const attrs = [
|
|
479
|
+
this.createSyntheticAttr('src', def.url, token),
|
|
480
|
+
this.createSyntheticAttr('alt', originNode.alt ?? '', token),
|
|
481
|
+
];
|
|
482
|
+
if (def.title != null) {
|
|
483
|
+
attrs.push(this.createSyntheticAttr('title', def.title, token));
|
|
484
|
+
}
|
|
485
|
+
return this.visitMarkdownElement(token, 'img', [], depth, parentNode, attrs);
|
|
486
|
+
}
|
|
487
|
+
/**
|
|
488
|
+
* Extracts definition nodes from mdast children and populates `this.definitions`.
|
|
489
|
+
*
|
|
490
|
+
* Per CommonMark spec, the first definition for a given identifier takes
|
|
491
|
+
* precedence. remark-parse emits all definition nodes in source order, so
|
|
492
|
+
* we skip duplicates via `Map.has` to honour the first-wins rule.
|
|
493
|
+
*
|
|
494
|
+
* @param children - The root-level mdast children to scan for `definition` nodes.
|
|
495
|
+
*/
|
|
496
|
+
// eslint-disable-next-line @typescript-eslint/prefer-readonly-parameter-types
|
|
497
|
+
collectDefinitions(children) {
|
|
498
|
+
for (const child of children) {
|
|
499
|
+
if (child.type === 'definition' && !this.definitions.has(child.identifier)) {
|
|
500
|
+
this.definitions.set(child.identifier, child);
|
|
501
|
+
}
|
|
502
|
+
}
|
|
503
|
+
}
|
|
504
|
+
}
|
|
505
|
+
/**
 * Computes the 1-based line number and 1-based column for a given offset.
 *
 * Only `\n` starts a new line; every other character (including `\r`)
 * advances the column by one.
 *
 * The offset is clamped to `[0, source.length]`, so an out-of-range value
 * resolves to the start or end of the source instead of reporting a
 * column past the end (or never terminating for `Infinity`). A fractional
 * offset is rounded up and `NaN` is treated as 0.
 *
 * Kept as a standalone utility rather than relying on parser-utils
 * internals.
 *
 * @param source - The full source string.
 * @param offset - The 0-based character offset to resolve.
 * @returns An object with 1-based `line` and `col` values.
 */
export function getLineAndColumn(source, offset) {
    // `|| 0` maps NaN (and -0) to 0 before clamping.
    const end = Math.min(Math.max(Math.ceil(offset) || 0, 0), source.length);
    let line = 1;
    // Index of the first character of the line that contains `end`.
    let lineStart = 0;
    // Jump from newline to newline instead of inspecting every character.
    let newlineAt = source.indexOf('\n');
    while (newlineAt !== -1 && newlineAt < end) {
        line += 1;
        lineStart = newlineAt + 1;
        newlineAt = source.indexOf('\n', lineStart);
    }
    return { line, col: end - lineStart + 1 };
}
|
package/lib/parser.d.ts
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
import type { MLASTNodeTreeItem, MLASTParentNode } from '@markuplint/ml-ast';
|
|
2
|
+
import type { RootContent } from 'mdast';
|
|
3
|
+
import { MarkdownAwareParser } from './markdown-aware-parser.js';
|
|
4
|
+
type MdastNode = RootContent;
|
|
5
|
+
/**
 * Markdown parser that exposes Markdown constructs as HTML element AST nodes.
 *
 * The source is parsed into an mdast with remark-parse; headings,
 * paragraphs, lists, links and similar constructs are then mapped to the
 * corresponding HTML element nodes. Raw HTML regions are parsed via
 * HtmlParser.
 */
declare class MarkdownParser extends MarkdownAwareParser {
    #private;
    /**
     * Parses the raw Markdown source into mdast nodes.
     *
     * Resets parser state, parses via remark (with the GFM and
     * frontmatter plugins), and collects link/image reference
     * definitions for later resolution.
     *
     * @returns The mdast children and the fragment flag.
     */
    tokenize(): {
        ast: RootContent[];
        isFragment: boolean;
    };
    /**
     * Converts one mdast node into markuplint AST nodes.
     *
     * Common Markdown constructs are delegated to the shared
     * `nodeizeMarkdownNode()`; `html` regions go through HtmlParser and
     * `text` nodes are handled directly.
     *
     * @param originNode - The mdast node to convert.
     * @param parentNode - Parent AST node, or `null` for top-level.
     * @param depth - Current nesting depth.
     * @returns An array of markuplint AST nodes.
     */
    nodeize(originNode: MdastNode, parentNode: MLASTParentNode | null, depth: number): readonly MLASTNodeTreeItem[];
}
|
|
39
|
+
export declare const parser: MarkdownParser;
|
|
40
|
+
export {};
|
package/lib/parser.js
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
import { HtmlParser } from '@markuplint/html-parser';
|
|
2
|
+
import remarkFrontmatter from 'remark-frontmatter';
|
|
3
|
+
import remarkGfm from 'remark-gfm';
|
|
4
|
+
import remarkParse from 'remark-parse';
|
|
5
|
+
import { unified } from 'unified';
|
|
6
|
+
import { MarkdownAwareParser, getLineAndColumn } from './markdown-aware-parser.js';
|
|
7
|
+
/**
 * Markdown parser that exposes Markdown constructs as HTML element AST nodes.
 *
 * The source is parsed into an mdast with remark-parse; headings,
 * paragraphs, lists, links and similar constructs are then mapped to the
 * corresponding HTML element nodes. Raw HTML regions are parsed via
 * HtmlParser.
 */
class MarkdownParser extends MarkdownAwareParser {
    /** HtmlParser instance used for every raw HTML region. */
    #htmlParser = new HtmlParser();
    /**
     * Parses the raw Markdown source into mdast nodes.
     *
     * Resets the shared Markdown state first, then parses `this.rawCode`
     * with remark (GFM and YAML frontmatter enabled) and registers the
     * link/image reference definitions found in the result.
     *
     * @returns The mdast children as `ast`, with `isFragment` set to `true`.
     */
    tokenize() {
        this.resetMarkdownState();
        const { children } = unified()
            .use(remarkParse)
            .use(remarkGfm)
            .use(remarkFrontmatter, ['yaml'])
            .parse(this.rawCode);
        this.collectDefinitions(children);
        return { ast: children, isFragment: true };
    }
    /**
     * Converts one mdast node into markuplint AST nodes.
     *
     * The shared `nodeizeMarkdownNode()` gets the first chance. When it
     * returns `null`, `html` nodes are parsed as HTML, `text` nodes become
     * text nodes, and any other type becomes a psblock named after the
     * mdast node type. A node without position information yields no nodes.
     *
     * @param originNode - The mdast node to convert.
     * @param parentNode - Parent AST node, or `null` for top-level.
     * @param depth - Current nesting depth.
     * @returns An array of markuplint AST nodes.
     */
    // eslint-disable-next-line @typescript-eslint/prefer-readonly-parameter-types
    nodeize(originNode, parentNode, depth) {
        const { position } = originNode;
        if (!position) {
            return [];
        }
        const offset = position.start.offset ?? 0;
        // A missing end offset collapses the range to an empty slice at `offset`.
        const endOffset = position.end.offset ?? offset;
        const token = this.sliceFragment(offset, endOffset);
        // Try common Markdown node handling first
        const shared = this.nodeizeMarkdownNode(originNode, token, offset, endOffset, depth, parentNode);
        if (shared !== null) {
            return shared;
        }
        const { type } = originNode;
        if (type === 'html') {
            return this.#parseHtmlRegion(originNode, offset);
        }
        if (type === 'text') {
            return this.visitText({ ...token, depth, parentNode });
        }
        return this.visitPsBlock({
            ...token,
            depth,
            parentNode,
            nodeName: type,
            isFragment: false,
        });
    }
    /**
     * Parses a raw HTML region with HtmlParser.
     *
     * The region's offset and its 1-based line/column in the full source
     * are passed as parse offsets.
     *
     * @param originNode - The mdast `html` node whose `value` is parsed.
     * @param offset - Start offset of the region in the full source.
     * @returns A copy of the parsed document's `nodeList`.
     */
    #parseHtmlRegion(originNode, offset) {
        const { line: offsetLine, col: offsetColumn } = getLineAndColumn(this.rawCode, offset);
        const { nodeList } = this.#htmlParser.parse(originNode.value, {
            offsetOffset: offset,
            offsetLine,
            offsetColumn,
        });
        return [...nodeList];
    }
}
|
|
91
|
+
export const parser = new MarkdownParser();
|