@mdream/js 0.17.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE.md +9 -0
- package/README.md +135 -0
- package/bin/mdream.mjs +2 -0
- package/dist/_chunks/const.mjs +137 -0
- package/dist/_chunks/index.d.mts +14 -0
- package/dist/_chunks/minimal.d.mts +10 -0
- package/dist/_chunks/parse.mjs +1201 -0
- package/dist/_chunks/plugins.mjs +791 -0
- package/dist/_chunks/resolve-plugins.mjs +302 -0
- package/dist/_chunks/src.mjs +344 -0
- package/dist/_chunks/types.d.mts +390 -0
- package/dist/cli.d.mts +1 -0
- package/dist/cli.mjs +27 -0
- package/dist/index.d.mts +4 -0
- package/dist/index.mjs +7 -0
- package/dist/llms-txt.d.mts +89 -0
- package/dist/llms-txt.mjs +347 -0
- package/dist/negotiate.d.mts +26 -0
- package/dist/negotiate.mjs +92 -0
- package/dist/parse.d.mts +57 -0
- package/dist/parse.mjs +3 -0
- package/dist/plugins.d.mts +93 -0
- package/dist/plugins.mjs +3 -0
- package/dist/preset/minimal.d.mts +2 -0
- package/dist/preset/minimal.mjs +34 -0
- package/dist/splitter.d.mts +21 -0
- package/dist/splitter.mjs +215 -0
- package/package.json +93 -0
|
@@ -0,0 +1,302 @@
|
|
|
1
|
+
import { n as DEFAULT_BLOCK_SPACING, o as NO_SPACING } from "./const.mjs";
|
|
2
|
+
import { r as parseHtmlStream } from "./parse.mjs";
|
|
3
|
+
import { a as extractionCollectorPlugin, i as filterPlugin, n as isolateMainPlugin, r as frontmatterPlugin, t as tailwindPlugin } from "./plugins.mjs";
|
|
4
|
+
//#region src/plugin-processor.ts
|
|
5
|
+
/**
 * Runs every plugin hook that applies to a single node event, then forwards
 * the event to `processEvent` unless a plugin asked for it to be skipped.
 * Shared logic between markdown-processor.ts and stream.ts.
 *
 * Hook order: `processAttributes` (element enter events only),
 * `beforeNodeProcess` (every event), then `onNodeEnter` / `onNodeExit` for
 * element nodes or `processTextNode` for entering text nodes.
 *
 * @param event - The node event to process (`event.type === 0` is an enter
 *   event; `event.node.type` is 1 for elements and 2 for text nodes)
 * @param plugins - Array of plugins to apply (may be empty or nullish)
 * @param state - The current runtime state, handed to every hook
 * @param processEvent - Callback invoked with the event after plugin processing
 * @returns true if the event was skipped (callback not invoked), false otherwise
 */
function processPluginsForEvent(event, plugins, state, processEvent) {
  if (plugins?.length) {
    if (event.node.type === 1 && event.type === 0) {
      const element = event.node;
      for (const plugin of plugins) {
        if (plugin.processAttributes) plugin.processAttributes(element, state);
      }
    }

    let shouldSkip = false;
    for (const plugin of plugins) {
      const res = plugin.beforeNodeProcess?.(event, state);
      // `typeof null === "object"`, so a hook returning null must be excluded
      // explicitly or reading `res.skip` would throw.
      // NOTE(review): the last hook that returns an object decides the
      // outcome, so a later plugin can undo an earlier `skip: true`.
      if (res !== null && typeof res === "object") shouldSkip = res.skip;
    }
    if (shouldSkip) return true;

    if (event.node.type === 1) {
      const element = event.node;
      const fn = event.type === 0 ? "onNodeEnter" : "onNodeExit";
      const pluginOutputs = [];
      for (const plugin of plugins) {
        if (!plugin[fn]) continue;
        const result = plugin[fn](element, state);
        if (result) pluginOutputs.push(result);
      }
      // Append to (never replace) output already attached to the element.
      if (pluginOutputs.length > 0) {
        element.pluginOutput = [...element.pluginOutput || [], ...pluginOutputs];
      }
    } else if (event.node.type === 2 && event.type === 0) {
      const textNode = event.node;
      for (const plugin of plugins) {
        if (!plugin.processTextNode) continue;
        const result = plugin.processTextNode(textNode, state);
        if (result) {
          if (result.skip) return true;
          // Later plugins see the text as rewritten by earlier ones.
          textNode.value = result.content;
        }
      }
    }
  }
  processEvent(event);
  return false;
}
|
|
50
|
+
//#endregion
|
|
51
|
+
//#region src/markdown-processor.ts
|
|
52
|
+
// Characters after which no separating space is inserted.
const NO_SPACE_AFTER = new Set(["[", "(", ">", "*", "_", "`"]);
// Characters before which no separating space is inserted.
const NO_SPACE_BEFORE = new Set(["]", ")", "<", ".", ",", "!", "?", ":", ";", "*", "_", "`"]);

/**
 * Determines if spacing is needed between two characters.
 *
 * @param lastChar - Last character already written to the output
 * @param firstChar - First character of the fragment about to be written
 * @param state - Optional runtime state; only `state.depthMap[28]` is read
 * @returns true when a single space should be inserted between the two
 */
function needsSpacing(lastChar, firstChar, state) {
  // Existing whitespace on either side already separates the fragments.
  // The tab is spelled as an escape so it cannot be confused with a space.
  if (lastChar === " " || lastChar === "\n" || lastChar === "\t") return false;
  if (firstChar === " " || firstChar === "\n" || firstChar === "\t") return false;

  // A "|" followed by "<" still gets a space while depthMap[28] is non-zero,
  // even though "<" is otherwise in the no-space-before set.
  // NOTE(review): tag id 28 is presumably table-related — confirm against the tag constants.
  if (lastChar === "|" && firstChar === "<" && state && (state.depthMap[28] || 0) > 0) return true;

  if (NO_SPACE_AFTER.has(lastChar) || NO_SPACE_BEFORE.has(firstChar)) return false;
  return true;
}
|
|
84
|
+
/**
 * Decides whether a text node needs a leading space before it is appended.
 *
 * @param lastChar - Last character currently in the output ("" when empty)
 * @param lastNode - Node handled by the previous event, if any
 * @param textNode - Text node about to be written; only `value[0]` is read
 * @returns true when a space should be prepended to the text
 */
function shouldAddSpacingBeforeText(lastChar, lastNode, textNode) {
  // Nothing written yet, or the output already ends in a separator/opener.
  if (!lastChar) return false;
  if (["\n", " ", "[", ">"].includes(lastChar)) return false;

  // Text directly after an inline element keeps its own spacing.
  if (lastNode?.tagHandler?.isInline) return false;

  // Leading whitespace, punctuation and closing/emphasis markers attach to
  // whatever came before them.
  const attachesToPrevious = [" ", ".", ",", "!", "?", ":", ";", "_", "*", "`", ")", "]"];
  const leading = textNode.value[0];
  return !attachesToPrevious.includes(leading);
}
|
|
95
|
+
/**
 * Picks the newline configuration for an element node.
 *
 * Returns NO_SPACING when the node sits inside (but is not itself) a tag with
 * id 25 or 22, or when an ancestor's tag handler collapses inner whitespace.
 * Otherwise returns the node's own `tagHandler.spacing`, falling back to
 * DEFAULT_BLOCK_SPACING.
 *
 * @param node - Element node; reads `tagId`, `depthMap`, `parent`, `tagHandler`
 * @returns The spacing configuration (indexed by event type by the caller)
 */
function calculateNewLineConfig(node) {
  const { tagId, depthMap } = node;

  // NOTE(review): ids 25 and 22 are opaque here — confirm which tags they are.
  const nestedIn25 = tagId !== 25 && (depthMap[25] || 0) > 0;
  const nestedIn22 = tagId !== 22 && (depthMap[22] || 0) > 0;
  if (nestedIn25 || nestedIn22) return NO_SPACING;

  const isBlockElement = tagId !== void 0 && (tagId >= 7 && tagId <= 12 || tagId === 35 || tagId === 36);

  for (let ancestor = node.parent; ancestor; ancestor = ancestor.parent) {
    if (!ancestor.tagHandler?.collapsesInnerWhiteSpace) continue;
    // A collapsing ancestor with tag id 37 does not suppress spacing for the
    // block-level ids above; keep looking further up the tree.
    if (isBlockElement && ancestor.tagId === 37) continue;
    return NO_SPACING;
  }

  return node.tagHandler?.spacing || DEFAULT_BLOCK_SPACING;
}
|
|
117
|
+
/**
 * Builds a markdown processor: a closure over a shared `state` that turns DOM
 * node events into markdown fragments collected in `state.buffer`.
 *
 * @param options - Conversion options, stored on `state.options`
 * @param resolvedPlugins - Plugins applied to every event in `processHtml`
 * @param tagOverrideHandlers - Passed through to the HTML parser untouched
 * @returns `{ processEvent, processHtml, getMarkdown, getMarkdownChunk, state }`
 */
function createMarkdownProcessor(options = {}, resolvedPlugins = [], tagOverrideHandlers) {
  const state = {
    options,
    buffer: [],
    depthMap: new Uint8Array(108)
  };
  // Length of the (left-trimmed) output already handed out by getMarkdownChunk.
  let lastYieldedLength = 0;

  /** Appends each non-empty fragment to the buffer and records it as the latest content. */
  function pushFragments(fragments) {
    for (const fragment of fragments) {
      if (!fragment) continue;
      state.buffer.push(fragment);
      state.lastContentCache = fragment;
    }
  }

  /**
   * Consumes one DOM event and appends the markdown it produces.
   * Text nodes are written on enter; element nodes contribute plugin output or
   * their tag handler's enter/exit output plus the configured newlines.
   */
  function processEvent(event) {
    const { type: eventType, node } = event;
    const previousNode = state.lastNode;
    state.lastNode = event.node;
    state.depth = node.depth;

    // Inspect the last two characters already written to the buffer.
    const buff = state.buffer;
    const tail = buff.at(-1);
    const lastChar = tail?.charAt(tail.length - 1) || "";
    let secondLastChar;
    if (tail && tail.length > 1) {
      secondLastChar = tail.charAt(tail.length - 2);
    } else if (buff.length > 1) {
      const beforeTail = buff[buff.length - 2];
      if (beforeTail) secondLastChar = beforeTail.charAt(beforeTail.length - 1);
    }

    // --- text nodes ---
    if (node.type === 2 && eventType === 0) {
      if (node.value) {
        if (node.excludedFromMarkdown) return;
        // A lone space right after a newline carries no information.
        if (node.value === " " && lastChar === "\n") return;
        if (shouldAddSpacingBeforeText(lastChar, previousNode, node)) node.value = ` ${node.value}`;
        state.buffer.push(node.value);
        state.lastContentCache = node.value;
      }
      state.lastTextNode = node;
      return;
    }
    if (node.type !== 1) return;

    // --- element nodes ---
    const output = [];
    // Plugin output takes precedence; it is consumed once and then cleared.
    if (node.pluginOutput?.length) {
      output.push(...node.pluginOutput);
      node.pluginOutput = [];
    }
    const lastFragment = state.lastContentCache;
    const lastNewLines = (lastChar === "\n" ? 1 : 0) + (secondLastChar === "\n" ? 1 : 0);

    const hookName = eventType === 0 ? "enter" : "exit";
    const handler = node.tagHandler;
    if (!output.length && handler?.[hookName]) {
      const rendered = handler[hookName]({ node, state });
      if (rendered) output.push(rendered);
    }

    // Only add the newlines that are still missing from the end of the buffer.
    const configuredNewLines = calculateNewLineConfig(node)[eventType] || 0;
    const newLines = Math.max(0, configuredNewLines - lastNewLines);

    if (newLines > 0) {
      // Nothing written yet: emit the content without leading newlines.
      if (!buff.length) {
        pushFragments(output);
        return;
      }
      const newlinesStr = "\n".repeat(newLines);
      // Drop a single trailing space so lines do not end in whitespace.
      if (lastChar === " " && buff.length) {
        const current = buff.at(-1);
        buff[buff.length - 1] = current.substring(0, current.length - 1);
      }
      if (eventType === 0) output.unshift(newlinesStr);
      else output.push(newlinesStr);
    } else if (
      lastFragment
      && state.lastTextNode?.containsWhitespace
      && !!node.parent
      && "value" in state.lastTextNode
      && typeof state.lastTextNode.value === "string"
    ) {
      if (!node.parent.depthMap[34] || node.parent.tagId === 34) {
        const isInlineElement = node.tagHandler?.isInline;
        const collapsesWhiteSpace = node.tagHandler?.collapsesInnerWhiteSpace;
        const hasSpacing = node.tagHandler?.spacing && Array.isArray(node.tagHandler.spacing);

        const eligible = !isInlineElement || eventType === 1;
        const blockWithNewLines = !isInlineElement && !collapsesWhiteSpace && configuredNewLines > 0;
        const collapsingEnter = collapsesWhiteSpace && eventType === 0;
        const spacedEnter = hasSpacing && eventType === 0;

        if (eligible && !blockWithNewLines && !collapsingEnter && !spacedEnter) {
          // Trim trailing whitespace of the last written fragment, but only
          // when it is still the final buffer entry.
          const trimmed = lastFragment.trimEnd();
          if (trimmed.length < lastFragment.length) {
            if (buff.length && buff.at(-1) === lastFragment) buff[buff.length - 1] = trimmed;
          }
        }
        state.lastTextNode = void 0;
      }
    }

    // Separate the new output from the previous content when required.
    const leadingChar = output[0]?.[0];
    if (leadingChar && eventType === 0 && lastChar && needsSpacing(lastChar, output[0][0], state)) {
      state.buffer.push(" ");
      state.lastContentCache = " ";
    }
    pushFragments(output);
  }

  /**
   * Parses an HTML string and feeds every resulting event through the plugins
   * and then into `processEvent`. The parser shares `state.depthMap`.
   */
  function processHtml(html) {
    const parserState = {
      depthMap: state.depthMap,
      depth: 0,
      resolvedPlugins,
      tagOverrideHandlers
    };
    parseHtmlStream(html, parserState, (event) => {
      processPluginsForEvent(event, resolvedPlugins, state, processEvent);
    });
  }

  /**
   * Returns the complete markdown, trimmed on both ends, and empties the buffer.
   */
  function getMarkdown() {
    const markdown = state.buffer.join("").trim();
    state.buffer.length = 0;
    return markdown;
  }

  /**
   * Returns the markdown produced since the previous call (for streaming).
   * Leading whitespace of the whole document is never emitted.
   */
  function getMarkdownChunk() {
    const soFar = state.buffer.join("").trimStart();
    const fresh = soFar.slice(lastYieldedLength);
    lastYieldedLength = soFar.length;
    return fresh;
  }

  return {
    processEvent,
    processHtml,
    getMarkdown,
    getMarkdownChunk,
    state
  };
}
|
|
253
|
+
//#endregion
|
|
254
|
+
//#region src/resolve-plugins.ts
|
|
255
|
+
/**
 * Normalizes the `frontmatter` plugin option into `{ config, callback }`.
 *
 * - a function is treated as the extraction callback, with an empty config
 * - an object is used as the config and its `onExtract` as the callback
 * - anything else (e.g. `true`, `null`) yields an empty config and no callback
 *
 * @param opt - The raw `plugins.frontmatter` option
 * @returns An object with `config` and, where available, `callback`
 */
function resolveFrontmatterOpt(opt) {
  if (typeof opt === "function") {
    return {
      config: {},
      callback: opt
    };
  }
  // `typeof null === "object"`; without the null check `opt.onExtract` throws.
  if (opt !== null && typeof opt === "object") {
    return {
      config: opt,
      callback: opt.onExtract
    };
  }
  return { config: {} };
}
|
|
266
|
+
/**
 * Turns the declarative `options.plugins` config into a flat plugin array and
 * appends any imperative hook plugins after the built-in ones.
 *
 * Built-ins are added in a fixed order: frontmatter, isolateMain, tailwind,
 * filter, extraction.
 *
 * @param options - Conversion options; only `options.plugins` is read
 * @param hooks - Optional extra plugins appended after the built-ins
 * @returns `{ plugins, callExtractionHandlers, getFrontmatter, frontmatterCallback }`;
 *   the last three stay undefined unless the matching built-in is enabled
 */
function resolvePlugins(options, hooks) {
  const resolved = {
    plugins: [],
    callExtractionHandlers: void 0,
    getFrontmatter: void 0,
    frontmatterCallback: void 0
  };
  const builtins = options.plugins;

  if (builtins) {
    if (builtins.frontmatter) {
      const { config: fmConfig, callback } = resolveFrontmatterOpt(builtins.frontmatter);
      const fmPlugin = frontmatterPlugin(fmConfig);
      resolved.plugins.push(fmPlugin);
      resolved.getFrontmatter = fmPlugin.getFrontmatter;
      resolved.frontmatterCallback = callback;
    }
    if (builtins.isolateMain) resolved.plugins.push(isolateMainPlugin());
    if (builtins.tailwind) resolved.plugins.push(tailwindPlugin());
    if (builtins.filter) resolved.plugins.push(filterPlugin(builtins.filter));
    if (builtins.extraction) {
      const collector = extractionCollectorPlugin(builtins.extraction);
      resolved.plugins.push(collector.plugin);
      resolved.callExtractionHandlers = collector.callHandlers;
    }
  }

  if (hooks) resolved.plugins.push(...hooks);
  return resolved;
}
|
|
301
|
+
//#endregion
|
|
302
|
+
export { createMarkdownProcessor as n, processPluginsForEvent as r, resolvePlugins as t };
|
|
@@ -0,0 +1,344 @@
|
|
|
1
|
+
import { i as buildTagOverrideHandlers, r as parseHtmlStream } from "./parse.mjs";
|
|
2
|
+
import { n as createMarkdownProcessor, r as processPluginsForEvent, t as resolvePlugins } from "./resolve-plugins.mjs";
|
|
3
|
+
//#region src/clean.ts
|
|
4
|
+
/**
 * Expands the `clean` option into an options object.
 *
 * @param clean - `true` (enable everything), `false` (enable nothing) or an
 *   options object, which is returned as-is
 * @returns The clean-up flags to apply
 */
function resolveClean(clean) {
  switch (clean) {
    case true:
      return {
        urls: true,
        fragments: true,
        emptyLinks: true,
        redundantLinks: true,
        selfLinkHeadings: true,
        emptyImages: true,
        emptyLinkText: true
      };
    case false:
      return {};
    default:
      return clean;
  }
}
|
|
17
|
+
/**
 * Attempts to read a markdown link `[text](url)` whose `[` is at `start`.
 * Brackets inside the text and parentheses inside the URL may nest as long as
 * they are balanced (e.g. `javascript:void(0)`).
 *
 * @param md - The markdown source
 * @param start - Index of the opening `[`
 * @returns `{ text, url, end }` where `end` is the index just past the closing
 *   `)`, or null when no complete link starts at `start`
 */
function parseLink(md, start) {
  // Scans from `from` (already one level deep) and returns the index just past
  // the matching closer, or -1 when the input ends first.
  const skipBalanced = (from, openCode, closeCode) => {
    let level = 1;
    let pos = from;
    for (; pos < md.length && level > 0; pos++) {
      const code = md.charCodeAt(pos);
      if (code === openCode) level++;
      else if (code === closeCode) level--;
    }
    return level === 0 ? pos : -1;
  };

  const afterText = skipBalanced(start + 1, 91, 93); // "[" ... "]"
  if (afterText === -1) return null;
  // The "(" must follow the "]" immediately.
  if (md.charCodeAt(afterText) !== 40) return null;

  const urlStart = afterText + 1;
  const afterUrl = skipBalanced(urlStart, 40, 41); // "(" ... ")"
  if (afterUrl === -1) return null;

  return {
    text: md.slice(start + 1, afterText - 1),
    url: md.slice(urlStart, afterUrl - 1),
    end: afterUrl
  };
}
|
|
52
|
+
/**
 * Converts heading text to a slug: ASCII letters are lower-cased, digits and
 * underscores are kept, runs of spaces/tabs/hyphens become a single "-", and
 * every other character is dropped. The slug never starts or ends with "-".
 *
 * @param text - Heading text
 * @returns The slug (possibly empty)
 */
function slugify(text) {
  const pieces = [];
  let endsWithDash = false;

  for (const ch of text) {
    if (/^[a-z0-9_]$/.test(ch)) {
      pieces.push(ch);
      endsWithDash = false;
    } else if (/^[A-Z]$/.test(ch)) {
      pieces.push(String.fromCharCode(ch.charCodeAt(0) + 32));
      endsWithDash = false;
    } else if (ch === " " || ch === "\t" || ch === "-") {
      // Collapse separator runs and never start the slug with a dash.
      if (!endsWithDash && pieces.length > 0) {
        pieces.push("-");
        endsWithDash = true;
      }
    }
    // Anything else is dropped without affecting the dash state.
  }

  if (endsWithDash) pieces.pop();
  return pieces.join("");
}
|
|
79
|
+
/**
 * Removes inline markdown formatting from heading text before slugging:
 * links are replaced by their text and the markers `*`, `_`, `` ` `` and `~`
 * are deleted. The result is trimmed.
 */
function stripHeadingFormatting(text) {
  const markers = "*_`~";
  const kept = [];

  for (let pos = 0; pos < text.length; ) {
    const ch = text[pos];
    if (ch === "[") {
      const link = parseLink(text, pos);
      if (link) {
        kept.push(link.text);
        pos = link.end;
        continue;
      }
    }
    if (!markers.includes(ch)) kept.push(ch);
    pos++;
  }

  return kept.join("").trim();
}
|
|
103
|
+
/**
 * Collects the slugs of every ATX heading (`#` to `######` followed by a
 * space at the start of a line) found in the markdown.
 *
 * @param md - The markdown source
 * @returns A Set of heading slugs
 */
function collectHeadingSlugs(md) {
  const slugs = /* @__PURE__ */ new Set();

  for (const line of md.split("\n")) {
    // One to six hashes, then a space; seven or more hashes never match
    // because the seventh character would have to be the space.
    const marker = /^#{1,6} /.exec(line);
    if (!marker) continue;
    const cleaned = stripHeadingFormatting(line.slice(marker[0].length));
    if (cleaned) slugs.add(slugify(cleaned));
  }

  return slugs;
}
|
|
128
|
+
/**
 * Replaces fragment links (`[text](#target)`) by their plain text when
 * `#target` does not match the slug of any heading in the document.
 * The target is the part of the URL after `#` up to the first space.
 *
 * @param md - The markdown source
 * @returns The markdown with dangling fragment links unwrapped
 */
function cleanFragments(md) {
  const knownSlugs = collectHeadingSlugs(md);
  const pieces = [];

  for (let pos = 0; pos < md.length; ) {
    const link = md[pos] === "[" ? parseLink(md, pos) : null;
    // Only "#something" URLs are considered; a bare "#" is left alone here.
    if (link && link.url.length > 1 && link.url[0] === "#") {
      const [target] = link.url.slice(1).split(" ");
      if (!knownSlugs.has(target)) {
        pieces.push(link.text);
        pos = link.end;
        continue;
      }
    }
    pieces.push(md[pos]);
    pos++;
  }

  return pieces.join("");
}
|
|
150
|
+
/**
 * Unwraps links that lead nowhere useful: a bare `#` URL or a URL using the
 * `javascript:`, `data:` or `vbscript:` scheme. The link is replaced by its
 * text.
 *
 * The scheme comparison ignores case and leading whitespace, because URL
 * schemes are case-insensitive and `[x]( JavaScript:...)` would otherwise
 * slip through.
 *
 * @param md - The markdown source
 * @returns The markdown with such links replaced by their text
 */
function cleanEmptyLinks(md) {
  const blockedSchemes = ["javascript:", "data:", "vbscript:"];
  const len = md.length;
  let result = "";
  let i = 0;
  while (i < len) {
    if (md.charCodeAt(i) === 91) {
      const link = parseLink(md, i);
      if (link) {
        const url = link.url;
        const normalized = url.trimStart().toLowerCase();
        if (url === "#" || blockedSchemes.some((scheme) => normalized.startsWith(scheme))) {
          result += link.text;
          i = link.end;
          continue;
        }
      }
    }
    result += md[i];
    i++;
  }
  return result;
}
|
|
171
|
+
/**
 * Replaces links whose text is identical to their URL (`[x](x)`) by the
 * plain text.
 *
 * @param md - The markdown source
 * @returns The markdown with redundant links unwrapped
 */
function cleanRedundantLinks(md) {
  const pieces = [];

  for (let pos = 0; pos < md.length; ) {
    const link = md[pos] === "[" ? parseLink(md, pos) : null;
    if (link && link.text === link.url) {
      pieces.push(link.text);
      pos = link.end;
    } else {
      pieces.push(md[pos]);
      pos++;
    }
  }

  return pieces.join("");
}
|
|
189
|
+
/**
 * Unwraps headings whose content starts with a fragment link, e.g.
 * `## [Title](#title)` becomes `## Title`. Only a link that directly follows
 * the `#`-marker and its space is considered.
 *
 * @param md - The markdown source
 * @returns The markdown with self-linking headings unwrapped
 */
function cleanSelfLinkHeadings(md) {
  const total = md.length;
  const pieces = [];
  let pos = 0;

  while (pos < total) {
    const atLineStart = pos === 0 || md[pos - 1] === "\n";
    if (atLineStart) {
      // Measure the run of "#" characters that opens the line.
      let cursor = pos;
      while (cursor < total && md[cursor] === "#") cursor++;
      const level = cursor - pos;

      if (level >= 1 && level <= 6 && md[cursor] === " " && md[cursor + 1] === "[") {
        const textStart = cursor + 1;
        const link = parseLink(md, textStart);
        if (link && link.url[0] === "#") {
          // Keep the "## " prefix, swap the link for its text.
          pieces.push(md.slice(pos, textStart), link.text);
          pos = link.end;
          continue;
        }
      }
    }
    pieces.push(md[pos]);
    pos++;
  }

  return pieces.join("");
}
|
|
219
|
+
/**
 * Removes images whose alt text is empty or whitespace-only
 * (`![](url)`, `![ ](url)`). Parentheses inside the URL may nest.
 *
 * An image is only removed when its URL part is properly closed. If the
 * closing `)` is missing the text is left untouched, so a stray `![](` can
 * never swallow the remainder of the document.
 *
 * @param md - The markdown source
 * @returns The markdown without empty-alt images
 */
function cleanEmptyImages(md) {
  const len = md.length;

  // Given the index just past "![", returns the index just past the image's
  // closing ")" when this is a complete empty-alt image, otherwise -1.
  const emptyImageEnd = (altStart) => {
    const altEnd = md.indexOf("]", altStart);
    if (altEnd === -1) return -1;
    if (md.slice(altStart, altEnd).trim().length !== 0) return -1;
    if (md.charCodeAt(altEnd + 1) !== 40) return -1;

    let parenDepth = 1;
    let j = altEnd + 2;
    while (j < len && parenDepth > 0) {
      const c = md.charCodeAt(j);
      if (c === 40) parenDepth++;
      else if (c === 41) parenDepth--;
      j++;
    }
    // Unbalanced parentheses: not a complete image, keep the text.
    return parenDepth === 0 ? j : -1;
  };

  let result = "";
  let i = 0;
  while (i < len) {
    if (md.charCodeAt(i) === 33 && md.charCodeAt(i + 1) === 91) {
      const end = emptyImageEnd(i + 2);
      if (end !== -1) {
        i = end;
        continue;
      }
    }
    result += md[i];
    i++;
  }
  return result;
}
|
|
250
|
+
/**
 * Removes links whose text is empty or whitespace-only (`[](url)`,
 * `[ ](url)`); nothing is left in their place.
 *
 * @param md - The markdown source
 * @returns The markdown without text-less links
 */
function cleanEmptyLinkText(md) {
  const pieces = [];

  for (let pos = 0; pos < md.length; ) {
    const link = md[pos] === "[" ? parseLink(md, pos) : null;
    if (link && link.text.trim().length === 0) {
      pos = link.end;
    } else {
      pieces.push(md[pos]);
      pos++;
    }
  }

  return pieces.join("");
}
|
|
267
|
+
/**
 * Runs the enabled clean-up passes over the markdown in a fixed order:
 * empty images, empty links, empty link text, redundant links, self-linking
 * headings, and finally dangling fragments.
 *
 * @param md - The markdown source
 * @param opts - Flags selecting which passes run
 * @returns The cleaned markdown
 */
function applyClean(md, opts) {
  // Each pass is wrapped in a thunk so it is only looked up when enabled.
  const passes = [
    ["emptyImages", (text) => cleanEmptyImages(text)],
    ["emptyLinks", (text) => cleanEmptyLinks(text)],
    ["emptyLinkText", (text) => cleanEmptyLinkText(text)],
    ["redundantLinks", (text) => cleanRedundantLinks(text)],
    ["selfLinkHeadings", (text) => cleanSelfLinkHeadings(text)],
    ["fragments", (text) => cleanFragments(text)]
  ];
  return passes.reduce((text, [flag, run]) => opts[flag] ? run(text) : text, md);
}
|
|
276
|
+
//#endregion
|
|
277
|
+
//#region src/stream.ts
|
|
278
|
+
/**
 * Creates a markdown stream from an HTML stream.
 *
 * Each chunk read from the stream is decoded (unless it is already a string),
 * parsed together with whatever the parser left over from the previous chunk,
 * and the markdown generated so far is yielded.
 *
 * @param htmlStream - ReadableStream of HTML content (as Uint8Array or string)
 * @param options - Configuration options for conversion
 * @param resolvedPlugins - Pre-resolved plugin instances
 * @param tagOverrideHandlers - Tag override handlers from declarative config
 * @returns An async generator yielding markdown chunks
 * @throws Error when `htmlStream` is falsy
 */
async function* streamHtmlToMarkdown$1(htmlStream, options = {}, resolvedPlugins = [], tagOverrideHandlers) {
  if (!htmlStream) throw new Error("Invalid HTML stream provided");
  const decoder = new TextDecoder();
  const reader = htmlStream.getReader();
  const processor = createMarkdownProcessor(options, resolvedPlugins, tagOverrideHandlers);
  // NOTE(review): the parser gets its own depthMap here, whereas
  // processor.processHtml shares processor.state.depthMap — confirm intended.
  const parseState = {
    depthMap: new Uint8Array(1024),
    depth: 0,
    resolvedPlugins,
    tagOverrideHandlers
  };
  const onEvent = (event) => {
    processPluginsForEvent(event, resolvedPlugins, processor.state, processor.processEvent);
  };
  let remainingHtml = "";
  try {
    while (true) {
      const { done, value } = await reader.read();
      if (done) break;
      // `stream: true` keeps an incomplete multi-byte sequence buffered in
      // the decoder until the next chunk arrives.
      const text = typeof value === "string" ? value : decoder.decode(value, { stream: true });
      remainingHtml = parseHtmlStream(`${remainingHtml}${text}`, parseState, onEvent);
      const chunk = processor.getMarkdownChunk();
      if (chunk) yield chunk;
    }
    // Flush the decoder BEFORE the final parse and keep what it returns, so
    // bytes still buffered from a truncated sequence are not silently lost.
    remainingHtml += decoder.decode();
    if (remainingHtml) parseHtmlStream(remainingHtml, parseState, onEvent);
    const finalChunk = processor.getMarkdownChunk();
    if (finalChunk) yield finalChunk;
  } finally {
    // Always hand the stream back, including when the consumer stops early.
    reader.releaseLock();
  }
}
|
|
318
|
+
//#endregion
|
|
319
|
+
//#region src/index.ts
|
|
320
|
+
/**
 * Returns `options.hooks` when it is a non-empty array, otherwise undefined.
 *
 * @param options - Conversion options
 * @returns The hook plugins, or undefined when none were supplied
 */
function resolveHooks(options) {
  const { hooks } = options;
  if (hooks?.length) return hooks;
  return void 0;
}
|
|
323
|
+
/**
 * Converts an HTML string to markdown in one pass.
 *
 * After parsing, the frontmatter callback (if both a getter and a callback
 * were resolved and frontmatter was found) and the extraction handlers are
 * invoked, in that order, before the markdown is returned.
 *
 * @param html - HTML source
 * @param options - Conversion options
 * @param hooks - Optional extra plugins
 * @returns The generated markdown
 */
function convert(html, options, hooks) {
  const resolved = resolvePlugins(options, hooks);
  const overrides = options.plugins?.tagOverrides;
  const overrideHandlers = overrides ? buildTagOverrideHandlers(options.plugins.tagOverrides) : void 0;
  const processor = createMarkdownProcessor(options, resolved.plugins, overrideHandlers);

  processor.processHtml(html);

  const { getFrontmatter, frontmatterCallback, callExtractionHandlers } = resolved;
  if (getFrontmatter && frontmatterCallback) {
    const frontmatter = getFrontmatter();
    if (frontmatter) frontmatterCallback(frontmatter);
  }
  callExtractionHandlers?.();

  return processor.getMarkdown();
}
|
|
334
|
+
/**
 * Converts HTML to markdown and, when `options.clean` is truthy, runs the
 * selected clean-up passes over the result.
 *
 * @param html - HTML source
 * @param options - Conversion options
 * @returns The generated (and optionally cleaned) markdown
 */
function htmlToMarkdown(html, options = {}) {
  const markdown = convert(html, options, resolveHooks(options));
  return options.clean ? applyClean(markdown, resolveClean(options.clean)) : markdown;
}
|
|
339
|
+
/**
 * Public streaming entry point: resolves plugins and tag overrides from
 * `options`, then delegates to the internal streaming generator.
 *
 * @param htmlStream - ReadableStream of HTML content
 * @param options - Conversion options
 * @returns An async generator yielding markdown chunks
 */
function streamHtmlToMarkdown(htmlStream, options = {}) {
  const hooks = resolveHooks(options);
  const { plugins } = resolvePlugins(options, hooks);
  const overrides = options.plugins?.tagOverrides;
  const overrideHandlers = overrides ? buildTagOverrideHandlers(options.plugins.tagOverrides) : void 0;
  return streamHtmlToMarkdown$1(htmlStream, options, plugins, overrideHandlers);
}
|
|
343
|
+
//#endregion
|
|
344
|
+
export { streamHtmlToMarkdown as n, htmlToMarkdown as t };
|