mdream 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/_chunks/minimal-Ru8PBNVI.mjs +40 -0
- package/dist/_chunks/{plugin-DCJFRZej.mjs → plugin-Bqz9GKOA.mjs} +1 -1
- package/dist/_chunks/plugin-D45YAMmt.d.mts +12 -0
- package/dist/_chunks/plugins-D305pIpW.mjs +844 -0
- package/dist/_chunks/{stream-BeojJNLt.mjs → stream-IeCVDuTy.mjs} +53 -35
- package/dist/_chunks/{types-BHoibuoP.d.mts → types-D9VKEbix.d.mts} +29 -8
- package/dist/cli.mjs +14 -8
- package/dist/index.d.mts +75 -3
- package/dist/index.mjs +2 -2
- package/dist/plugins.d.mts +2 -2
- package/dist/plugins.mjs +2 -3
- package/dist/preset/minimal.d.mts +1 -1
- package/dist/preset/minimal.mjs +3 -39
- package/package.json +3 -17
- package/README.md +0 -252
- package/dist/_chunks/index-VTwTBxk0.d.mts +0 -58
- package/dist/_chunks/plugins-DGakgpSl.mjs +0 -582
- package/dist/_chunks/readability-BfCjcbbx.mjs +0 -271
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { BLOCKQUOTE_SPACING, DEFAULT_BLOCK_SPACING, ELEMENT_NODE, HTML_ENTITIES, LIST_ITEM_SPACING, MARKDOWN_CODE_BLOCK, MARKDOWN_EMPHASIS, MARKDOWN_HORIZONTAL_RULE, MARKDOWN_INLINE_CODE, MARKDOWN_STRIKETHROUGH, MARKDOWN_STRONG, MAX_TAG_ID, NO_SPACING, NodeEventEnter, NodeEventExit, TABLE_ROW_SPACING, TAG_A, TAG_ABBR, TAG_ADDRESS, TAG_AREA, TAG_ASIDE, TAG_AUDIO, TAG_B, TAG_BASE, TAG_BDO, TAG_BLOCKQUOTE, TAG_BODY, TAG_BR, TAG_BUTTON, TAG_CANVAS, TAG_CENTER, TAG_CITE, TAG_CODE, TAG_COL, TAG_DD, TAG_DEL, TAG_DETAILS, TAG_DFN, TAG_DIALOG, TAG_DIV, TAG_DL, TAG_DT, TAG_EM, TAG_EMBED, TAG_FIELDSET, TAG_FOOTER, TAG_FORM, TAG_H1, TAG_H2, TAG_H3, TAG_H4, TAG_H5, TAG_H6, TAG_HEAD, TAG_HR, TAG_I, TAG_IFRAME, TAG_IMG, TAG_INPUT, TAG_INS, TAG_KBD, TAG_KEYGEN, TAG_LABEL, TAG_LEGEND, TAG_LI, TAG_LINK, TAG_MAP, TAG_MARK, TAG_META, TAG_METER, TAG_NAV, TAG_NOFRAMES, TAG_NOSCRIPT, TAG_OL, TAG_OPTION, TAG_P, TAG_PARAM, TAG_PLAINTEXT, TAG_PRE, TAG_PROGRESS, TAG_Q, TAG_RP, TAG_RT, TAG_RUBY, TAG_SAMP, TAG_SCRIPT, TAG_SELECT, TAG_SMALL, TAG_SOURCE, TAG_SPAN, TAG_STRONG, TAG_STYLE, TAG_SUB, TAG_SUMMARY, TAG_SUP, TAG_SVG, TAG_TABLE, TAG_TBODY, TAG_TD, TAG_TEMPLATE, TAG_TEXTAREA, TAG_TFOOT, TAG_TH, TAG_THEAD, TAG_TIME, TAG_TITLE, TAG_TR, TAG_TRACK, TAG_U, TAG_UL, TAG_VAR, TAG_VIDEO, TAG_WBR, TAG_XMP, TEXT_NODE, TagIdMap
|
|
1
|
+
import { BLOCKQUOTE_SPACING, DEFAULT_BLOCK_SPACING, ELEMENT_NODE, HTML_ENTITIES, LIST_ITEM_SPACING, MARKDOWN_CODE_BLOCK, MARKDOWN_EMPHASIS, MARKDOWN_HORIZONTAL_RULE, MARKDOWN_INLINE_CODE, MARKDOWN_STRIKETHROUGH, MARKDOWN_STRONG, MAX_TAG_ID, NO_SPACING, NodeEventEnter, NodeEventExit, TABLE_ROW_SPACING, TAG_A, TAG_ABBR, TAG_ADDRESS, TAG_AREA, TAG_ASIDE, TAG_AUDIO, TAG_B, TAG_BASE, TAG_BDO, TAG_BLOCKQUOTE, TAG_BODY, TAG_BR, TAG_BUTTON, TAG_CANVAS, TAG_CENTER, TAG_CITE, TAG_CODE, TAG_COL, TAG_DD, TAG_DEL, TAG_DETAILS, TAG_DFN, TAG_DIALOG, TAG_DIV, TAG_DL, TAG_DT, TAG_EM, TAG_EMBED, TAG_FIELDSET, TAG_FOOTER, TAG_FORM, TAG_H1, TAG_H2, TAG_H3, TAG_H4, TAG_H5, TAG_H6, TAG_HEAD, TAG_HR, TAG_I, TAG_IFRAME, TAG_IMG, TAG_INPUT, TAG_INS, TAG_KBD, TAG_KEYGEN, TAG_LABEL, TAG_LEGEND, TAG_LI, TAG_LINK, TAG_MAP, TAG_MARK, TAG_META, TAG_METER, TAG_NAV, TAG_NOFRAMES, TAG_NOSCRIPT, TAG_OL, TAG_OPTION, TAG_P, TAG_PARAM, TAG_PLAINTEXT, TAG_PRE, TAG_PROGRESS, TAG_Q, TAG_RP, TAG_RT, TAG_RUBY, TAG_SAMP, TAG_SCRIPT, TAG_SELECT, TAG_SMALL, TAG_SOURCE, TAG_SPAN, TAG_STRONG, TAG_STYLE, TAG_SUB, TAG_SUMMARY, TAG_SUP, TAG_SVG, TAG_TABLE, TAG_TBODY, TAG_TD, TAG_TEMPLATE, TAG_TEXTAREA, TAG_TFOOT, TAG_TH, TAG_THEAD, TAG_TIME, TAG_TITLE, TAG_TR, TAG_TRACK, TAG_U, TAG_UL, TAG_VAR, TAG_VIDEO, TAG_WBR, TAG_XMP, TEXT_NODE, TagIdMap, assembleBufferedContent, collectNodeContent } from "./plugin-Bqz9GKOA.mjs";
|
|
2
2
|
|
|
3
3
|
//#region src/tags.ts
|
|
4
4
|
function resolveUrl(url, origin) {
|
|
@@ -1146,6 +1146,51 @@ function parseAttributes(attrStr) {
|
|
|
1146
1146
|
return result;
|
|
1147
1147
|
}
|
|
1148
1148
|
|
|
1149
|
+
//#endregion
|
|
1150
|
+
//#region src/plugin-processor.ts
|
|
1151
|
+
/**
|
|
1152
|
+
* Processes plugins for a given node event
|
|
1153
|
+
* Shared logic between markdown-processor.ts and stream.ts
|
|
1154
|
+
*
|
|
1155
|
+
* @param event - The node event to process
|
|
1156
|
+
* @param plugins - Array of plugins to apply
|
|
1157
|
+
* @param state - The current runtime state
|
|
1158
|
+
* @param processEvent - Callback to process the event after plugin processing
|
|
1159
|
+
* @returns true if the event should be skipped, false to continue processing
|
|
1160
|
+
*/
|
|
1161
|
+
function processPluginsForEvent(event, plugins, state, processEvent) {
|
|
1162
|
+
if (plugins?.length) {
|
|
1163
|
+
for (const plugin of plugins) {
|
|
1164
|
+
const res = plugin.beforeNodeProcess?.(event, state);
|
|
1165
|
+
if (typeof res === "object" && res.skip) return true;
|
|
1166
|
+
}
|
|
1167
|
+
if (event.node.type === ELEMENT_NODE) {
|
|
1168
|
+
const element = event.node;
|
|
1169
|
+
if (event.type === NodeEventEnter) {
|
|
1170
|
+
for (const plugin of plugins) if (plugin.processAttributes) plugin.processAttributes(element, state);
|
|
1171
|
+
}
|
|
1172
|
+
const fn = event.type === NodeEventEnter ? "onNodeEnter" : "onNodeExit";
|
|
1173
|
+
const pluginOutputs = [];
|
|
1174
|
+
for (const plugin of plugins) if (plugin[fn]) {
|
|
1175
|
+
const result = plugin[fn](element, state);
|
|
1176
|
+
if (result) pluginOutputs.push(result);
|
|
1177
|
+
}
|
|
1178
|
+
if (pluginOutputs.length > 0) element.pluginOutput = (element.pluginOutput || []).concat(pluginOutputs);
|
|
1179
|
+
} else if (event.node.type === TEXT_NODE && event.type === NodeEventEnter) {
|
|
1180
|
+
const textNode = event.node;
|
|
1181
|
+
for (const plugin of plugins) if (plugin.processTextNode) {
|
|
1182
|
+
const result = plugin.processTextNode(textNode, state);
|
|
1183
|
+
if (result) {
|
|
1184
|
+
if (result.skip) return true;
|
|
1185
|
+
textNode.value = result.content;
|
|
1186
|
+
}
|
|
1187
|
+
}
|
|
1188
|
+
}
|
|
1189
|
+
}
|
|
1190
|
+
processEvent(event);
|
|
1191
|
+
return false;
|
|
1192
|
+
}
|
|
1193
|
+
|
|
1149
1194
|
//#endregion
|
|
1150
1195
|
//#region src/markdown-processor.ts
|
|
1151
1196
|
/**
|
|
@@ -1184,7 +1229,7 @@ function needsSpacing(lastChar, firstChar, state) {
|
|
|
1184
1229
|
* Determines if spacing should be added before text content
|
|
1185
1230
|
*/
|
|
1186
1231
|
function shouldAddSpacingBeforeText(lastChar, lastNode, textNode) {
|
|
1187
|
-
return lastChar && lastChar !== "\n" && lastChar !== " " && lastChar !== "[" && lastChar !== ">" && !lastNode?.tagHandler?.isInline && textNode.value[0] !== " ";
|
|
1232
|
+
return !!lastChar && lastChar !== "\n" && lastChar !== " " && lastChar !== "[" && lastChar !== ">" && !lastNode?.tagHandler?.isInline && textNode.value[0] !== " ";
|
|
1188
1233
|
}
|
|
1189
1234
|
/**
|
|
1190
1235
|
* Calculate newline configuration based on tag handler spacing config
|
|
@@ -1303,35 +1348,7 @@ function createMarkdownProcessor(options = {}) {
|
|
|
1303
1348
|
plugins: state.options?.plugins || []
|
|
1304
1349
|
};
|
|
1305
1350
|
parseHtmlStream(html, parseState, (event) => {
|
|
1306
|
-
|
|
1307
|
-
for (const plugin of state.options.plugins) {
|
|
1308
|
-
const res = plugin.beforeNodeProcess?.(event, state);
|
|
1309
|
-
if (typeof res === "object" && res.skip) return;
|
|
1310
|
-
}
|
|
1311
|
-
if (event.node.type === ELEMENT_NODE) {
|
|
1312
|
-
const element = event.node;
|
|
1313
|
-
if (event.type === NodeEventEnter) {
|
|
1314
|
-
for (const plugin of state.options.plugins) if (plugin.processAttributes) plugin.processAttributes(element, state);
|
|
1315
|
-
}
|
|
1316
|
-
const fn = event.type === NodeEventEnter ? "onNodeEnter" : "onNodeExit";
|
|
1317
|
-
const pluginOutputs = [];
|
|
1318
|
-
for (const plugin of state.options.plugins) if (plugin[fn]) {
|
|
1319
|
-
const result = plugin[fn](element, state);
|
|
1320
|
-
if (result) pluginOutputs.push(result);
|
|
1321
|
-
}
|
|
1322
|
-
if (pluginOutputs.length > 0) element.pluginOutput = (element.pluginOutput || []).concat(pluginOutputs);
|
|
1323
|
-
} else if (event.node.type === TEXT_NODE && event.type === NodeEventEnter) {
|
|
1324
|
-
const textNode = event.node;
|
|
1325
|
-
for (const plugin of state.options.plugins) if (plugin.processTextNode) {
|
|
1326
|
-
const result = plugin.processTextNode(textNode, state);
|
|
1327
|
-
if (result) {
|
|
1328
|
-
if (result.skip) return;
|
|
1329
|
-
if (result.content) textNode.value = result.content;
|
|
1330
|
-
}
|
|
1331
|
-
}
|
|
1332
|
-
}
|
|
1333
|
-
}
|
|
1334
|
-
processEvent(event);
|
|
1351
|
+
processPluginsForEvent(event, state.options?.plugins, state, processEvent);
|
|
1335
1352
|
});
|
|
1336
1353
|
}
|
|
1337
1354
|
/**
|
|
@@ -1359,7 +1376,8 @@ function createMarkdownProcessor(options = {}) {
|
|
|
1359
1376
|
processEvent,
|
|
1360
1377
|
processHtml,
|
|
1361
1378
|
getMarkdown,
|
|
1362
|
-
getMarkdownChunk
|
|
1379
|
+
getMarkdownChunk,
|
|
1380
|
+
state
|
|
1363
1381
|
};
|
|
1364
1382
|
}
|
|
1365
1383
|
const MarkdownProcessor = createMarkdownProcessor;
|
|
@@ -1389,13 +1407,13 @@ async function* streamHtmlToMarkdown(htmlStream, options = {}) {
|
|
|
1389
1407
|
if (done) break;
|
|
1390
1408
|
const htmlContent = `${remainingHtml}${typeof value === "string" ? value : decoder.decode(value, { stream: true })}`;
|
|
1391
1409
|
remainingHtml = parseHtmlStream(htmlContent, parseState, (event) => {
|
|
1392
|
-
processor.processEvent
|
|
1410
|
+
processPluginsForEvent(event, options.plugins, processor.state, processor.processEvent);
|
|
1393
1411
|
});
|
|
1394
1412
|
const chunk = processor.getMarkdownChunk();
|
|
1395
1413
|
if (chunk) yield chunk;
|
|
1396
1414
|
}
|
|
1397
1415
|
if (remainingHtml) parseHtmlStream(remainingHtml, parseState, (event) => {
|
|
1398
|
-
processor.processEvent
|
|
1416
|
+
processPluginsForEvent(event, options.plugins, processor.state, processor.processEvent);
|
|
1399
1417
|
});
|
|
1400
1418
|
const finalChunk = processor.getMarkdownChunk();
|
|
1401
1419
|
if (finalChunk) yield finalChunk;
|
|
@@ -1406,4 +1424,4 @@ async function* streamHtmlToMarkdown(htmlStream, options = {}) {
|
|
|
1406
1424
|
}
|
|
1407
1425
|
|
|
1408
1426
|
//#endregion
|
|
1409
|
-
export { MarkdownProcessor
|
|
1427
|
+
export { MarkdownProcessor, createMarkdownProcessor, parseHtml, streamHtmlToMarkdown };
|
|
@@ -37,12 +37,12 @@ interface Plugin {
|
|
|
37
37
|
* Process a text node before it's added to the output
|
|
38
38
|
* @param node - The text node to process
|
|
39
39
|
* @param state - The current runtime state
|
|
40
|
-
* @returns
|
|
40
|
+
* @returns Result with content and skip flag, or undefined for no transformation
|
|
41
41
|
*/
|
|
42
|
-
processTextNode?: (node: TextNode, state: MdreamRuntimeState) =>
|
|
42
|
+
processTextNode?: (node: TextNode, state: MdreamRuntimeState) => {
|
|
43
43
|
content: string;
|
|
44
44
|
skip: boolean;
|
|
45
|
-
};
|
|
45
|
+
} | undefined;
|
|
46
46
|
}
|
|
47
47
|
/**
|
|
48
48
|
* Plugin creation options for controlling plugin behavior
|
|
@@ -78,7 +78,7 @@ interface ElementNode extends Node {
|
|
|
78
78
|
/** HTML attributes (for ELEMENT_NODE) */
|
|
79
79
|
attributes: Record<string, string>;
|
|
80
80
|
/** Custom data added by plugins */
|
|
81
|
-
context?:
|
|
81
|
+
context?: PluginContext;
|
|
82
82
|
/** ID of the tag for fast handler lookup */
|
|
83
83
|
tagId?: number;
|
|
84
84
|
/** Map of tag names to their nesting count (using Uint8Array for performance) */
|
|
@@ -90,7 +90,7 @@ interface TextNode extends Node {
|
|
|
90
90
|
/** Text content (for TEXT_NODE) */
|
|
91
91
|
value: string;
|
|
92
92
|
/** Custom data added by plugins */
|
|
93
|
-
context?:
|
|
93
|
+
context?: PluginContext;
|
|
94
94
|
/** Whether this text node should be excluded from markdown output (for script/style elements) */
|
|
95
95
|
excludedFromMarkdown?: boolean;
|
|
96
96
|
}
|
|
@@ -117,7 +117,7 @@ interface Node {
|
|
|
117
117
|
/** Parent node */
|
|
118
118
|
parent?: ElementNode | null;
|
|
119
119
|
/** Custom data added by plugins */
|
|
120
|
-
context?:
|
|
120
|
+
context?: PluginContext;
|
|
121
121
|
/** Region ID for buffer region tracking */
|
|
122
122
|
regionId?: number;
|
|
123
123
|
}
|
|
@@ -190,7 +190,7 @@ interface MdreamRuntimeState extends Partial<MdreamProcessingState> {
|
|
|
190
190
|
lastContentCache?: string;
|
|
191
191
|
/** Reference to the last processed node */
|
|
192
192
|
lastNode?: Node;
|
|
193
|
-
context?:
|
|
193
|
+
context?: PluginContext;
|
|
194
194
|
}
|
|
195
195
|
type NodeEventEnter = 0;
|
|
196
196
|
type NodeEventExit = 1;
|
|
@@ -230,5 +230,26 @@ interface TagHandler {
|
|
|
230
230
|
spacing?: readonly [number, number];
|
|
231
231
|
excludesTextNodes?: boolean;
|
|
232
232
|
}
|
|
233
|
+
interface ReadabilityContext {
|
|
234
|
+
score?: number;
|
|
235
|
+
tagCount?: number;
|
|
236
|
+
linkTextLength?: number;
|
|
237
|
+
textLength?: number;
|
|
238
|
+
isHighLinkDensity?: boolean;
|
|
239
|
+
}
|
|
240
|
+
interface TailwindContext {
|
|
241
|
+
hidden?: boolean;
|
|
242
|
+
prefix?: string;
|
|
243
|
+
suffix?: string;
|
|
244
|
+
}
|
|
245
|
+
interface PluginContext {
|
|
246
|
+
score?: number;
|
|
247
|
+
tagCount?: number;
|
|
248
|
+
linkTextLength?: number;
|
|
249
|
+
textLength?: number;
|
|
250
|
+
isHighLinkDensity?: boolean;
|
|
251
|
+
tailwind?: TailwindContext;
|
|
252
|
+
[key: string]: unknown;
|
|
253
|
+
}
|
|
233
254
|
//#endregion
|
|
234
|
-
export { BufferRegion, ELEMENT_NODE as ELEMENT_NODE$1, ElementNode, ExtractedElement, HTMLToMarkdownOptions, HandlerContext, MdreamProcessingState, MdreamRuntimeState, Node, NodeEvent, Plugin, PluginCreationOptions, TEXT_NODE as TEXT_NODE$1, TagHandler, TextNode, extractionPlugin as extractionPlugin$1 };
|
|
255
|
+
export { BufferRegion, ELEMENT_NODE as ELEMENT_NODE$1, ElementNode, ExtractedElement, HTMLToMarkdownOptions, HandlerContext, MdreamProcessingState, MdreamRuntimeState, Node, NodeEvent, Plugin, PluginContext, PluginCreationOptions, ReadabilityContext, TEXT_NODE as TEXT_NODE$1, TagHandler, TailwindContext, TextNode, extractionPlugin as extractionPlugin$1 };
|
package/dist/cli.mjs
CHANGED
|
@@ -1,23 +1,29 @@
|
|
|
1
|
-
import "./_chunks/plugin-
|
|
2
|
-
import { streamHtmlToMarkdown
|
|
3
|
-
import
|
|
1
|
+
import "./_chunks/plugin-Bqz9GKOA.mjs";
|
|
2
|
+
import { streamHtmlToMarkdown } from "./_chunks/stream-IeCVDuTy.mjs";
|
|
3
|
+
import "./_chunks/plugins-D305pIpW.mjs";
|
|
4
|
+
import { withMinimalPreset } from "./_chunks/minimal-Ru8PBNVI.mjs";
|
|
5
|
+
import { readFileSync } from "node:fs";
|
|
6
|
+
import { dirname, join } from "node:path";
|
|
4
7
|
import { Readable } from "node:stream";
|
|
8
|
+
import { fileURLToPath } from "node:url";
|
|
5
9
|
import { cac } from "cac";
|
|
6
10
|
|
|
7
11
|
//#region src/cli.ts
|
|
8
12
|
async function streamingConvert(options = {}) {
|
|
9
13
|
const outputStream = process.stdout;
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
conversionOptions.plugins.push(readabilityPlugin());
|
|
13
|
-
conversionOptions.plugins.push(frontmatterPlugin());
|
|
14
|
+
let conversionOptions = { origin: options.origin };
|
|
15
|
+
if (options.preset === "minimal") conversionOptions = withMinimalPreset(conversionOptions);
|
|
14
16
|
const markdownGenerator = streamHtmlToMarkdown(Readable.toWeb(process.stdin), conversionOptions);
|
|
15
17
|
for await (const markdownChunk of markdownGenerator) if (markdownChunk && markdownChunk.length > 0) outputStream.write(markdownChunk);
|
|
16
18
|
}
|
|
19
|
+
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
20
|
+
const packageJsonPath = join(__dirname, "..", "package.json");
|
|
21
|
+
const packageJson = JSON.parse(readFileSync(packageJsonPath, "utf-8"));
|
|
22
|
+
const version = packageJson.version;
|
|
17
23
|
const cli = cac();
|
|
18
24
|
cli.command("[options]", "Convert HTML from stdin to Markdown on stdout").option("--origin <url>", "Origin URL for resolving relative image paths").option("--preset <preset>", "Conversion presets: minimal").action(async (_, opts) => {
|
|
19
25
|
await streamingConvert(opts);
|
|
20
26
|
});
|
|
21
|
-
cli.help().version(
|
|
27
|
+
cli.help().version(version).parse();
|
|
22
28
|
|
|
23
29
|
//#endregion
|
package/dist/index.d.mts
CHANGED
|
@@ -1,3 +1,75 @@
|
|
|
1
|
-
import { BufferRegion, ELEMENT_NODE$1 as ELEMENT_NODE, ElementNode, ExtractedElement, HTMLToMarkdownOptions, HandlerContext, MdreamProcessingState, MdreamRuntimeState, Node, NodeEvent, Plugin, PluginCreationOptions, TEXT_NODE$1 as TEXT_NODE, TagHandler, TextNode } from "./_chunks/types-
|
|
2
|
-
import {
|
|
3
|
-
|
|
1
|
+
import { BufferRegion, ELEMENT_NODE$1 as ELEMENT_NODE, ElementNode, ExtractedElement, HTMLToMarkdownOptions, HandlerContext, MdreamProcessingState, MdreamRuntimeState, Node, NodeEvent, Plugin, PluginContext, PluginCreationOptions, ReadabilityContext, TEXT_NODE$1 as TEXT_NODE, TagHandler, TailwindContext, TextNode } from "./_chunks/types-D9VKEbix.mjs";
|
|
2
|
+
import { createPlugin$1 as createPlugin } from "./_chunks/plugin-D45YAMmt.mjs";
|
|
3
|
+
import { ReadableStream } from "node:stream/web";
|
|
4
|
+
|
|
5
|
+
//#region src/const.d.ts
|
|
6
|
+
|
|
7
|
+
declare const TagIdMap: Record<string, number>;
|
|
8
|
+
//#endregion
|
|
9
|
+
//#region src/markdown-processor.d.ts
|
|
10
|
+
interface MarkdownState {
|
|
11
|
+
/** Configuration options for conversion */
|
|
12
|
+
options?: HTMLToMarkdownOptions;
|
|
13
|
+
/** Map of region IDs to buffer regions for O(1) lookups */
|
|
14
|
+
regionToggles: Map<number, boolean>;
|
|
15
|
+
/** Content buffers for regions */
|
|
16
|
+
regionContentBuffers: Map<number, string[]>;
|
|
17
|
+
/** Performance cache for last content to avoid iteration */
|
|
18
|
+
lastContentCache?: string;
|
|
19
|
+
/** Reference to the last processed node */
|
|
20
|
+
lastNode?: ElementNode | TextNode;
|
|
21
|
+
/** Reference to the last processed text node - for context tracking */
|
|
22
|
+
lastTextNode?: TextNode;
|
|
23
|
+
/** Table processing state - specialized for Markdown tables */
|
|
24
|
+
tableRenderedTable?: boolean;
|
|
25
|
+
tableCurrentRowCells?: number;
|
|
26
|
+
tableColumnAlignments?: string[];
|
|
27
|
+
/** Map of tag names to their current nesting depth */
|
|
28
|
+
depthMap: Uint8Array;
|
|
29
|
+
/** Current depth for plugin access */
|
|
30
|
+
depth?: number;
|
|
31
|
+
/** Context for additional data */
|
|
32
|
+
context?: PluginContext;
|
|
33
|
+
}
|
|
34
|
+
/**
|
|
35
|
+
* Creates a markdown processor that consumes DOM events and generates markdown
|
|
36
|
+
*/
|
|
37
|
+
declare function createMarkdownProcessor(options?: HTMLToMarkdownOptions): {
|
|
38
|
+
processEvent: (event: NodeEvent) => void;
|
|
39
|
+
processHtml: (html: string) => void;
|
|
40
|
+
getMarkdown: () => string;
|
|
41
|
+
getMarkdownChunk: () => string;
|
|
42
|
+
state: MarkdownState;
|
|
43
|
+
};
|
|
44
|
+
declare const MarkdownProcessor: typeof createMarkdownProcessor;
|
|
45
|
+
//#endregion
|
|
46
|
+
//#region src/parse.d.ts
|
|
47
|
+
interface ParseOptions {
|
|
48
|
+
plugins?: Plugin[];
|
|
49
|
+
}
|
|
50
|
+
interface ParseResult {
|
|
51
|
+
events: NodeEvent[];
|
|
52
|
+
remainingHtml: string;
|
|
53
|
+
}
|
|
54
|
+
/**
|
|
55
|
+
* Pure HTML parser that emits DOM events
|
|
56
|
+
* Completely decoupled from markdown generation
|
|
57
|
+
*/
|
|
58
|
+
declare function parseHtml(html: string, options?: ParseOptions): ParseResult;
|
|
59
|
+
/**
|
|
60
|
+
* Streaming HTML parser - calls onEvent for each DOM event
|
|
61
|
+
*/
|
|
62
|
+
//#endregion
|
|
63
|
+
//#region src/stream.d.ts
|
|
64
|
+
/**
|
|
65
|
+
* Creates a markdown stream from an HTML stream
|
|
66
|
+
* @param htmlStream - ReadableStream of HTML content (as Uint8Array or string)
|
|
67
|
+
* @param options - Configuration options for conversion
|
|
68
|
+
* @returns An async generator yielding markdown chunks
|
|
69
|
+
*/
|
|
70
|
+
declare function streamHtmlToMarkdown(htmlStream: ReadableStream | null, options?: HTMLToMarkdownOptions): AsyncIterable<string>;
|
|
71
|
+
//#endregion
|
|
72
|
+
//#region src/index.d.ts
|
|
73
|
+
declare function htmlToMarkdown(html: string, options?: HTMLToMarkdownOptions): string;
|
|
74
|
+
//#endregion
|
|
75
|
+
export { BufferRegion, ELEMENT_NODE, ElementNode, ExtractedElement, HTMLToMarkdownOptions, HandlerContext, MarkdownProcessor, MdreamProcessingState, MdreamRuntimeState, Node, NodeEvent, Plugin, PluginContext, PluginCreationOptions, ReadabilityContext, TEXT_NODE, TagHandler, TagIdMap, TailwindContext, TextNode, createPlugin, htmlToMarkdown, parseHtml, streamHtmlToMarkdown };
|
package/dist/index.mjs
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { TagIdMap
|
|
2
|
-
import { MarkdownProcessor
|
|
1
|
+
import { TagIdMap, createPlugin } from "./_chunks/plugin-Bqz9GKOA.mjs";
|
|
2
|
+
import { MarkdownProcessor, createMarkdownProcessor, parseHtml, streamHtmlToMarkdown } from "./_chunks/stream-IeCVDuTy.mjs";
|
|
3
3
|
|
|
4
4
|
//#region src/index.ts
|
|
5
5
|
function htmlToMarkdown(html, options = {}) {
|
package/dist/plugins.d.mts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { Plugin, extractionPlugin$1 as extractionPlugin } from "./_chunks/types-
|
|
2
|
-
import { createPlugin } from "./_chunks/
|
|
1
|
+
import { Plugin, extractionPlugin$1 as extractionPlugin } from "./_chunks/types-D9VKEbix.mjs";
|
|
2
|
+
import { createPlugin$1 as createPlugin } from "./_chunks/plugin-D45YAMmt.mjs";
|
|
3
3
|
|
|
4
4
|
//#region src/plugins/filter.d.ts
|
|
5
5
|
|
package/dist/plugins.mjs
CHANGED
|
@@ -1,5 +1,4 @@
|
|
|
1
|
-
import { createPlugin
|
|
2
|
-
import { frontmatterPlugin, readabilityPlugin } from "./_chunks/
|
|
3
|
-
import { extractionPlugin, filterPlugin, isolateMainPlugin, tailwindPlugin } from "./_chunks/plugins-DGakgpSl.mjs";
|
|
1
|
+
import { createPlugin } from "./_chunks/plugin-Bqz9GKOA.mjs";
|
|
2
|
+
import { extractionPlugin, filterPlugin, frontmatterPlugin, isolateMainPlugin, readabilityPlugin, tailwindPlugin } from "./_chunks/plugins-D305pIpW.mjs";
|
|
4
3
|
|
|
5
4
|
export { createPlugin, extractionPlugin, filterPlugin, frontmatterPlugin, isolateMainPlugin, readabilityPlugin, tailwindPlugin };
|
package/dist/preset/minimal.mjs
CHANGED
|
@@ -1,41 +1,5 @@
|
|
|
1
|
-
import
|
|
2
|
-
import
|
|
3
|
-
import {
|
|
1
|
+
import "../_chunks/plugin-Bqz9GKOA.mjs";
|
|
2
|
+
import "../_chunks/plugins-D305pIpW.mjs";
|
|
3
|
+
import { withMinimalPreset } from "../_chunks/minimal-Ru8PBNVI.mjs";
|
|
4
4
|
|
|
5
|
-
//#region src/preset/minimal.ts
|
|
6
|
-
/**
|
|
7
|
-
* Creates a configurable minimal preset with advanced options
|
|
8
|
-
*
|
|
9
|
-
* @param options HTML to Markdown options
|
|
10
|
-
* @returns HTML to Markdown options with configured plugins
|
|
11
|
-
*/
|
|
12
|
-
function withMinimalPreset(options = {}) {
|
|
13
|
-
const plugins = [
|
|
14
|
-
isolateMainPlugin(),
|
|
15
|
-
frontmatterPlugin(),
|
|
16
|
-
tailwindPlugin(),
|
|
17
|
-
filterPlugin({ exclude: [
|
|
18
|
-
TAG_FORM,
|
|
19
|
-
TAG_FIELDSET,
|
|
20
|
-
TAG_OBJECT,
|
|
21
|
-
TAG_EMBED,
|
|
22
|
-
TAG_FIGURE,
|
|
23
|
-
TAG_FOOTER,
|
|
24
|
-
TAG_ASIDE,
|
|
25
|
-
TAG_IFRAME,
|
|
26
|
-
TAG_INPUT,
|
|
27
|
-
TAG_TEXTAREA,
|
|
28
|
-
TAG_SELECT,
|
|
29
|
-
TAG_BUTTON,
|
|
30
|
-
TAG_NAV
|
|
31
|
-
] })
|
|
32
|
-
];
|
|
33
|
-
if (options.plugins) plugins.push(...options.plugins);
|
|
34
|
-
return {
|
|
35
|
-
...options,
|
|
36
|
-
plugins
|
|
37
|
-
};
|
|
38
|
-
}
|
|
39
|
-
|
|
40
|
-
//#endregion
|
|
41
5
|
export { withMinimalPreset };
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "mdream",
|
|
3
3
|
"type": "module",
|
|
4
|
-
"version": "0.
|
|
4
|
+
"version": "0.4.0",
|
|
5
5
|
"description": "Ultra-performant JavaScript HTML to Markdown converter optimized for LLMs.",
|
|
6
6
|
"author": {
|
|
7
7
|
"name": "Harlan Wilton",
|
|
@@ -28,19 +28,6 @@
|
|
|
28
28
|
"dependencies": {
|
|
29
29
|
"cac": "^6.7.14"
|
|
30
30
|
},
|
|
31
|
-
"devDependencies": {
|
|
32
|
-
"@antfu/eslint-config": "^4.16.2",
|
|
33
|
-
"@types/node": "^24.0.10",
|
|
34
|
-
"bumpp": "^10.2.0",
|
|
35
|
-
"crawlee": "^3.13.9",
|
|
36
|
-
"eslint": "^9.30.1",
|
|
37
|
-
"llm-cost": "^1.0.5",
|
|
38
|
-
"obuild": "^0.2.1",
|
|
39
|
-
"playwright": "^1.53.2",
|
|
40
|
-
"typescript": "5.8.3",
|
|
41
|
-
"unbuild": "^3.5.0",
|
|
42
|
-
"vitest": "^3.2.4"
|
|
43
|
-
},
|
|
44
31
|
"scripts": {
|
|
45
32
|
"flame": "pnpm build && unbuild bench/bundle && clinic flame -- node bench/bundle/dist/string.mjs 10",
|
|
46
33
|
"bench:build": "pnpm build && unbuild bench/bundle",
|
|
@@ -55,9 +42,8 @@
|
|
|
55
42
|
"test:wiki:file": "pnpm build && cat test/fixtures/wikipedia-largest.html | node ./bin/mdream.mjs --origin https://en.wikipedia.org | tee test/wiki-markdown.md",
|
|
56
43
|
"test:wiki-small:file": "cat test/fixtures/wikipedia-small.html | node ./bin/mdream.mjs --origin https://en.wikipedia.org | tee test/wiki-markdown.md",
|
|
57
44
|
"build": "obuild",
|
|
58
|
-
"typecheck": "tsc --noEmit
|
|
45
|
+
"typecheck": "tsc --noEmit",
|
|
59
46
|
"dev:prepare": "obuild --stub",
|
|
60
|
-
"test": "vitest test"
|
|
61
|
-
"release": "pnpm build && bumpp && pnpm -r publish"
|
|
47
|
+
"test": "vitest test"
|
|
62
48
|
}
|
|
63
49
|
}
|