mdream 0.12.3 → 0.13.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/_chunks/{plugin-Bqz9GKOA.mjs → const-BOAJ1T5c.mjs} +1 -12
- package/dist/_chunks/{extraction-BSOWm6fo.mjs → extraction-BPaDGYvv.mjs} +1 -1
- package/dist/_chunks/{llms-txt-CQ4yFagU.mjs → llms-txt-XvDQwYbj.mjs} +2 -2
- package/dist/_chunks/{src-DBqiXz8C.mjs → markdown-processor-BWVPNlvD.mjs} +2 -51
- package/dist/_chunks/{minimal-DSW9dhXV.mjs → minimal-co1tIZYm.mjs} +2 -2
- package/dist/_chunks/{plugin-BUiqQb0v.d.mts → plugin-CgnpSqtP.d.mts} +1 -1
- package/dist/_chunks/plugin-DrovQriD.mjs +12 -0
- package/dist/_chunks/{plugins-TeB1_RYL.mjs → plugins-C5_irVJs.mjs} +3 -2
- package/dist/_chunks/src-D-NT7shY.mjs +52 -0
- package/dist/_chunks/{types-B94khc0C.d.mts → types-DqiI86yW.d.mts} +66 -1
- package/dist/cli.mjs +8 -6
- package/dist/index.d.mts +3 -3
- package/dist/index.mjs +4 -2
- package/dist/llms-txt.mjs +6 -4
- package/dist/plugins.d.mts +2 -2
- package/dist/plugins.mjs +4 -3
- package/dist/preset/minimal.d.mts +1 -1
- package/dist/preset/minimal.mjs +5 -4
- package/dist/splitter.d.mts +11 -0
- package/dist/splitter.mjs +202 -0
- package/package.json +9 -1
|
@@ -285,15 +285,4 @@ const LIST_ITEM_SPACING = [1, 0];
|
|
|
285
285
|
const TABLE_ROW_SPACING = [0, 1];
|
|
286
286
|
|
|
287
287
|
//#endregion
|
|
288
|
-
|
|
289
|
-
/**
 * Identity helper for declaring a plugin.
 *
 * The argument is handed back untouched; the function exists so that plugin
 * objects are written through a single, type-inferring entry point.
 *
 * @param plugin - The plugin definition object
 * @returns The very same `plugin` object that was passed in
 */
function createPlugin(plugin) {
  return plugin;
}
|
|
297
|
-
|
|
298
|
-
//#endregion
|
|
299
|
-
export { BLOCKQUOTE_SPACING, DEFAULT_BLOCK_SPACING, ELEMENT_NODE, HTML_ENTITIES, LIST_ITEM_SPACING, MARKDOWN_CODE_BLOCK, MARKDOWN_EMPHASIS, MARKDOWN_HORIZONTAL_RULE, MARKDOWN_INLINE_CODE, MARKDOWN_STRIKETHROUGH, MARKDOWN_STRONG, MAX_TAG_ID, NO_SPACING, NodeEventEnter, NodeEventExit, TABLE_ROW_SPACING, TAG_A, TAG_ABBR, TAG_ADDRESS, TAG_AREA, TAG_ARTICLE, TAG_ASIDE, TAG_AUDIO, TAG_B, TAG_BASE, TAG_BDO, TAG_BLOCKQUOTE, TAG_BODY, TAG_BR, TAG_BUTTON, TAG_CANVAS, TAG_CAPTION, TAG_CENTER, TAG_CITE, TAG_CODE, TAG_COL, TAG_DD, TAG_DEL, TAG_DETAILS, TAG_DFN, TAG_DIALOG, TAG_DIV, TAG_DL, TAG_DT, TAG_EM, TAG_EMBED, TAG_FIELDSET, TAG_FIGCAPTION, TAG_FIGURE, TAG_FOOTER, TAG_FORM, TAG_H1, TAG_H2, TAG_H3, TAG_H4, TAG_H5, TAG_H6, TAG_HEAD, TAG_HEADER, TAG_HR, TAG_HTML, TAG_I, TAG_IFRAME, TAG_IMG, TAG_INPUT, TAG_INS, TAG_KBD, TAG_KEYGEN, TAG_LABEL, TAG_LEGEND, TAG_LI, TAG_LINK, TAG_MAIN, TAG_MAP, TAG_MARK, TAG_META, TAG_METER, TAG_NAV, TAG_NOFRAMES, TAG_NOSCRIPT, TAG_OBJECT, TAG_OL, TAG_OPTION, TAG_P, TAG_PARAM, TAG_PLAINTEXT, TAG_PRE, TAG_PROGRESS, TAG_Q, TAG_RP, TAG_RT, TAG_RUBY, TAG_SAMP, TAG_SCRIPT, TAG_SECTION, TAG_SELECT, TAG_SMALL, TAG_SOURCE, TAG_SPAN, TAG_STRONG, TAG_STYLE, TAG_SUB, TAG_SUMMARY, TAG_SUP, TAG_SVG, TAG_TABLE, TAG_TBODY, TAG_TD, TAG_TEMPLATE, TAG_TEXTAREA, TAG_TFOOT, TAG_TH, TAG_THEAD, TAG_TIME, TAG_TITLE, TAG_TR, TAG_TRACK, TAG_U, TAG_UL, TAG_VAR, TAG_VIDEO, TAG_WBR, TAG_XMP, TEXT_NODE, TagIdMap, assembleBufferedContent, collectNodeContent, createBufferRegion, createPlugin };
|
|
288
|
+
export { BLOCKQUOTE_SPACING, DEFAULT_BLOCK_SPACING, ELEMENT_NODE, HTML_ENTITIES, LIST_ITEM_SPACING, MARKDOWN_CODE_BLOCK, MARKDOWN_EMPHASIS, MARKDOWN_HORIZONTAL_RULE, MARKDOWN_INLINE_CODE, MARKDOWN_STRIKETHROUGH, MARKDOWN_STRONG, MAX_TAG_ID, NO_SPACING, NodeEventEnter, NodeEventExit, TABLE_ROW_SPACING, TAG_A, TAG_ABBR, TAG_ADDRESS, TAG_AREA, TAG_ARTICLE, TAG_ASIDE, TAG_AUDIO, TAG_B, TAG_BASE, TAG_BDO, TAG_BLOCKQUOTE, TAG_BODY, TAG_BR, TAG_BUTTON, TAG_CANVAS, TAG_CAPTION, TAG_CENTER, TAG_CITE, TAG_CODE, TAG_COL, TAG_DD, TAG_DEL, TAG_DETAILS, TAG_DFN, TAG_DIALOG, TAG_DIV, TAG_DL, TAG_DT, TAG_EM, TAG_EMBED, TAG_FIELDSET, TAG_FIGCAPTION, TAG_FIGURE, TAG_FOOTER, TAG_FORM, TAG_H1, TAG_H2, TAG_H3, TAG_H4, TAG_H5, TAG_H6, TAG_HEAD, TAG_HEADER, TAG_HR, TAG_HTML, TAG_I, TAG_IFRAME, TAG_IMG, TAG_INPUT, TAG_INS, TAG_KBD, TAG_KEYGEN, TAG_LABEL, TAG_LEGEND, TAG_LI, TAG_LINK, TAG_MAIN, TAG_MAP, TAG_MARK, TAG_META, TAG_METER, TAG_NAV, TAG_NOFRAMES, TAG_NOSCRIPT, TAG_OBJECT, TAG_OL, TAG_OPTION, TAG_P, TAG_PARAM, TAG_PLAINTEXT, TAG_PRE, TAG_PROGRESS, TAG_Q, TAG_RP, TAG_RT, TAG_RUBY, TAG_SAMP, TAG_SCRIPT, TAG_SECTION, TAG_SELECT, TAG_SMALL, TAG_SOURCE, TAG_SPAN, TAG_STRONG, TAG_STYLE, TAG_SUB, TAG_SUMMARY, TAG_SUP, TAG_SVG, TAG_TABLE, TAG_TBODY, TAG_TD, TAG_TEMPLATE, TAG_TEXTAREA, TAG_TFOOT, TAG_TH, TAG_THEAD, TAG_TIME, TAG_TITLE, TAG_TR, TAG_TRACK, TAG_U, TAG_UL, TAG_VAR, TAG_VIDEO, TAG_WBR, TAG_XMP, TEXT_NODE, TagIdMap, assembleBufferedContent, collectNodeContent, createBufferRegion };
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { htmlToMarkdown } from "./src-DBqiXz8C.mjs";
|
|
2
|
-
import { extractionPlugin } from "./extraction-BSOWm6fo.mjs";
|
|
1
|
+
import { htmlToMarkdown } from "./src-D-NT7shY.mjs";
|
|
2
|
+
import { extractionPlugin } from "./extraction-BPaDGYvv.mjs";
|
|
3
3
|
import { readFile } from "node:fs/promises";
|
|
4
4
|
import { basename, dirname, relative, sep } from "pathe";
|
|
5
5
|
import { glob } from "tinyglobby";
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { BLOCKQUOTE_SPACING, DEFAULT_BLOCK_SPACING, ELEMENT_NODE, HTML_ENTITIES, LIST_ITEM_SPACING, MARKDOWN_CODE_BLOCK, MARKDOWN_EMPHASIS, MARKDOWN_HORIZONTAL_RULE, MARKDOWN_INLINE_CODE, MARKDOWN_STRIKETHROUGH, MARKDOWN_STRONG, MAX_TAG_ID, NO_SPACING, NodeEventEnter, NodeEventExit, TABLE_ROW_SPACING, TAG_A, TAG_ABBR, TAG_ADDRESS, TAG_AREA, TAG_ASIDE, TAG_AUDIO, TAG_B, TAG_BASE, TAG_BDO, TAG_BLOCKQUOTE, TAG_BODY, TAG_BR, TAG_BUTTON, TAG_CANVAS, TAG_CENTER, TAG_CITE, TAG_CODE, TAG_COL, TAG_DD, TAG_DEL, TAG_DETAILS, TAG_DFN, TAG_DIALOG, TAG_DIV, TAG_DL, TAG_DT, TAG_EM, TAG_EMBED, TAG_FIELDSET, TAG_FOOTER, TAG_FORM, TAG_H1, TAG_H2, TAG_H3, TAG_H4, TAG_H5, TAG_H6, TAG_HEAD, TAG_HR, TAG_I, TAG_IFRAME, TAG_IMG, TAG_INPUT, TAG_INS, TAG_KBD, TAG_KEYGEN, TAG_LABEL, TAG_LEGEND, TAG_LI, TAG_LINK, TAG_MAP, TAG_MARK, TAG_META, TAG_METER, TAG_NAV, TAG_NOFRAMES, TAG_NOSCRIPT, TAG_OL, TAG_OPTION, TAG_P, TAG_PARAM, TAG_PLAINTEXT, TAG_PRE, TAG_PROGRESS, TAG_Q, TAG_RP, TAG_RT, TAG_RUBY, TAG_SAMP, TAG_SCRIPT, TAG_SELECT, TAG_SMALL, TAG_SOURCE, TAG_SPAN, TAG_STRONG, TAG_STYLE, TAG_SUB, TAG_SUMMARY, TAG_SUP, TAG_SVG, TAG_TABLE, TAG_TBODY, TAG_TD, TAG_TEMPLATE, TAG_TEXTAREA, TAG_TFOOT, TAG_TH, TAG_THEAD, TAG_TIME, TAG_TITLE, TAG_TR, TAG_TRACK, TAG_U, TAG_UL, TAG_VAR, TAG_VIDEO, TAG_WBR, TAG_XMP, TEXT_NODE, TagIdMap, assembleBufferedContent, collectNodeContent } from "./
|
|
1
|
+
import { BLOCKQUOTE_SPACING, DEFAULT_BLOCK_SPACING, ELEMENT_NODE, HTML_ENTITIES, LIST_ITEM_SPACING, MARKDOWN_CODE_BLOCK, MARKDOWN_EMPHASIS, MARKDOWN_HORIZONTAL_RULE, MARKDOWN_INLINE_CODE, MARKDOWN_STRIKETHROUGH, MARKDOWN_STRONG, MAX_TAG_ID, NO_SPACING, NodeEventEnter, NodeEventExit, TABLE_ROW_SPACING, TAG_A, TAG_ABBR, TAG_ADDRESS, TAG_AREA, TAG_ASIDE, TAG_AUDIO, TAG_B, TAG_BASE, TAG_BDO, TAG_BLOCKQUOTE, TAG_BODY, TAG_BR, TAG_BUTTON, TAG_CANVAS, TAG_CENTER, TAG_CITE, TAG_CODE, TAG_COL, TAG_DD, TAG_DEL, TAG_DETAILS, TAG_DFN, TAG_DIALOG, TAG_DIV, TAG_DL, TAG_DT, TAG_EM, TAG_EMBED, TAG_FIELDSET, TAG_FOOTER, TAG_FORM, TAG_H1, TAG_H2, TAG_H3, TAG_H4, TAG_H5, TAG_H6, TAG_HEAD, TAG_HR, TAG_I, TAG_IFRAME, TAG_IMG, TAG_INPUT, TAG_INS, TAG_KBD, TAG_KEYGEN, TAG_LABEL, TAG_LEGEND, TAG_LI, TAG_LINK, TAG_MAP, TAG_MARK, TAG_META, TAG_METER, TAG_NAV, TAG_NOFRAMES, TAG_NOSCRIPT, TAG_OL, TAG_OPTION, TAG_P, TAG_PARAM, TAG_PLAINTEXT, TAG_PRE, TAG_PROGRESS, TAG_Q, TAG_RP, TAG_RT, TAG_RUBY, TAG_SAMP, TAG_SCRIPT, TAG_SELECT, TAG_SMALL, TAG_SOURCE, TAG_SPAN, TAG_STRONG, TAG_STYLE, TAG_SUB, TAG_SUMMARY, TAG_SUP, TAG_SVG, TAG_TABLE, TAG_TBODY, TAG_TD, TAG_TEMPLATE, TAG_TEXTAREA, TAG_TFOOT, TAG_TH, TAG_THEAD, TAG_TIME, TAG_TITLE, TAG_TR, TAG_TRACK, TAG_U, TAG_UL, TAG_VAR, TAG_VIDEO, TAG_WBR, TAG_XMP, TEXT_NODE, TagIdMap, assembleBufferedContent, collectNodeContent } from "./const-BOAJ1T5c.mjs";
|
|
2
2
|
|
|
3
3
|
//#region src/tags.ts
|
|
4
4
|
function resolveUrl(url, origin) {
|
|
@@ -1386,53 +1386,4 @@ function createMarkdownProcessor(options = {}) {
|
|
|
1386
1386
|
const MarkdownProcessor = createMarkdownProcessor;
|
|
1387
1387
|
|
|
1388
1388
|
//#endregion
|
|
1389
|
-
|
|
1390
|
-
/**
 * Creates a markdown stream from an HTML stream
 *
 * Reads the stream chunk by chunk, feeds each piece (prefixed with whatever
 * the parser left unconsumed from the previous piece) to `parseHtmlStream`,
 * and yields every non-empty markdown chunk the processor has produced.
 *
 * @param htmlStream - ReadableStream of HTML content (as Uint8Array or string)
 * @param options - Configuration options for conversion
 * @returns An async generator yielding markdown chunks
 * @throws Error("Invalid HTML stream provided") when `htmlStream` is falsy
 */
async function* streamHtmlToMarkdown(htmlStream, options = {}) {
  if (!htmlStream) throw new Error("Invalid HTML stream provided");
  const decoder = new TextDecoder();
  const reader = htmlStream.getReader();
  try {
    // Everything after getReader() lives inside the try so the lock is
    // released even when processor construction throws.
    const processor = createMarkdownProcessor(options);
    const parseState = {
      depthMap: new Uint8Array(1024),
      depth: 0,
      plugins: options.plugins || []
    };
    const forwardEvent = (event) => {
      processPluginsForEvent(event, options.plugins, processor.state, processor.processEvent);
    };
    let remainingHtml = "";
    while (true) {
      const { done, value } = await reader.read();
      if (done) break;
      // String chunks pass through; byte chunks are decoded in streaming mode
      // so multi-byte sequences split across chunks are reassembled.
      const text = typeof value === "string" ? value : decoder.decode(value, { stream: true });
      remainingHtml = parseHtmlStream(`${remainingHtml}${text}`, parseState, forwardEvent);
      const chunk = processor.getMarkdownChunk();
      if (chunk) yield chunk;
    }
    // Flush the decoder: bytes of an incomplete trailing sequence are still
    // buffered inside it and would otherwise be dropped silently.
    const tail = decoder.decode();
    if (tail) remainingHtml = `${remainingHtml}${tail}`;
    if (remainingHtml) parseHtmlStream(remainingHtml, parseState, forwardEvent);
    const finalChunk = processor.getMarkdownChunk();
    if (finalChunk) yield finalChunk;
  } finally {
    reader.releaseLock();
  }
}
|
|
1428
|
-
|
|
1429
|
-
//#endregion
|
|
1430
|
-
//#region src/index.ts
|
|
1431
|
-
/**
 * Convert a complete HTML string to markdown in one synchronous call.
 *
 * @param html - HTML source to convert
 * @param options - Conversion options, forwarded to `createMarkdownProcessor`
 * @returns Whatever the processor's `getMarkdown()` reports after processing
 */
function htmlToMarkdown(html, options = {}) {
  const converter = createMarkdownProcessor(options);
  converter.processHtml(html);
  const markdown = converter.getMarkdown();
  return markdown;
}
|
|
1436
|
-
|
|
1437
|
-
//#endregion
|
|
1438
|
-
export { MarkdownProcessor, htmlToMarkdown, parseHtml, streamHtmlToMarkdown };
|
|
1389
|
+
export { MarkdownProcessor, createMarkdownProcessor, parseHtml, parseHtmlStream, processPluginsForEvent };
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { TAG_ASIDE, TAG_BUTTON, TAG_EMBED, TAG_FIELDSET, TAG_FIGURE, TAG_FOOTER, TAG_FORM, TAG_IFRAME, TAG_INPUT, TAG_NAV, TAG_OBJECT, TAG_SELECT, TAG_TEXTAREA } from "./
|
|
2
|
-
import { filterPlugin, frontmatterPlugin, isolateMainPlugin, tailwindPlugin } from "./plugins-
|
|
1
|
+
import { TAG_ASIDE, TAG_BUTTON, TAG_EMBED, TAG_FIELDSET, TAG_FIGURE, TAG_FOOTER, TAG_FORM, TAG_IFRAME, TAG_INPUT, TAG_NAV, TAG_OBJECT, TAG_SELECT, TAG_TEXTAREA } from "./const-BOAJ1T5c.mjs";
|
|
2
|
+
import { filterPlugin, frontmatterPlugin, isolateMainPlugin, tailwindPlugin } from "./plugins-C5_irVJs.mjs";
|
|
3
3
|
|
|
4
4
|
//#region src/preset/minimal.ts
|
|
5
5
|
/**
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
//#region src/pluggable/plugin.ts
|
|
2
|
+
/**
 * Identity helper for declaring a plugin.
 *
 * The argument is handed back untouched; the function exists so that plugin
 * objects are written through a single, type-inferring entry point.
 *
 * @param plugin - The plugin definition object
 * @returns The very same `plugin` object that was passed in
 */
function createPlugin(plugin) {
  return plugin;
}
|
|
10
|
+
|
|
11
|
+
//#endregion
|
|
12
|
+
export { createPlugin };
|
|
@@ -1,5 +1,6 @@
|
|
|
1
|
-
import { ELEMENT_NODE, TAG_A, TAG_ADDRESS, TAG_ARTICLE, TAG_ASIDE, TAG_AUDIO, TAG_B, TAG_BLOCKQUOTE, TAG_BODY, TAG_BR, TAG_BUTTON, TAG_CAPTION, TAG_CODE, TAG_DD, TAG_DETAILS, TAG_DIV, TAG_DL, TAG_DT, TAG_EM, TAG_EMBED, TAG_FIELDSET, TAG_FIGCAPTION, TAG_FIGURE, TAG_FOOTER, TAG_FORM, TAG_H1, TAG_H2, TAG_H3, TAG_H4, TAG_H5, TAG_H6, TAG_HEAD, TAG_HEADER, TAG_HR, TAG_HTML, TAG_I, TAG_IFRAME, TAG_IMG, TAG_INPUT, TAG_LI, TAG_MAIN, TAG_META, TAG_NAV, TAG_OBJECT, TAG_OL, TAG_P, TAG_PRE, TAG_SCRIPT, TAG_SECTION, TAG_SELECT, TAG_SPAN, TAG_STRONG, TAG_STYLE, TAG_SUMMARY, TAG_SVG, TAG_TABLE, TAG_TBODY, TAG_TD, TAG_TEXTAREA, TAG_TFOOT, TAG_TH, TAG_THEAD, TAG_TITLE, TAG_TR, TAG_UL, TAG_VIDEO, TEXT_NODE, collectNodeContent, createBufferRegion
|
|
2
|
-
import {
|
|
1
|
+
import { ELEMENT_NODE, TAG_A, TAG_ADDRESS, TAG_ARTICLE, TAG_ASIDE, TAG_AUDIO, TAG_B, TAG_BLOCKQUOTE, TAG_BODY, TAG_BR, TAG_BUTTON, TAG_CAPTION, TAG_CODE, TAG_DD, TAG_DETAILS, TAG_DIV, TAG_DL, TAG_DT, TAG_EM, TAG_EMBED, TAG_FIELDSET, TAG_FIGCAPTION, TAG_FIGURE, TAG_FOOTER, TAG_FORM, TAG_H1, TAG_H2, TAG_H3, TAG_H4, TAG_H5, TAG_H6, TAG_HEAD, TAG_HEADER, TAG_HR, TAG_HTML, TAG_I, TAG_IFRAME, TAG_IMG, TAG_INPUT, TAG_LI, TAG_MAIN, TAG_META, TAG_NAV, TAG_OBJECT, TAG_OL, TAG_P, TAG_PRE, TAG_SCRIPT, TAG_SECTION, TAG_SELECT, TAG_SPAN, TAG_STRONG, TAG_STYLE, TAG_SUMMARY, TAG_SVG, TAG_TABLE, TAG_TBODY, TAG_TD, TAG_TEXTAREA, TAG_TFOOT, TAG_TH, TAG_THEAD, TAG_TITLE, TAG_TR, TAG_UL, TAG_VIDEO, TEXT_NODE, collectNodeContent, createBufferRegion } from "./const-BOAJ1T5c.mjs";
|
|
2
|
+
import { createPlugin } from "./plugin-DrovQriD.mjs";
|
|
3
|
+
import { parseSelector } from "./extraction-BPaDGYvv.mjs";
|
|
3
4
|
|
|
4
5
|
//#region src/plugins/filter.ts
|
|
5
6
|
/**
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
import { createMarkdownProcessor, parseHtmlStream, processPluginsForEvent } from "./markdown-processor-BWVPNlvD.mjs";
|
|
2
|
+
|
|
3
|
+
//#region src/stream.ts
|
|
4
|
+
/**
 * Creates a markdown stream from an HTML stream
 *
 * Reads the stream chunk by chunk, feeds each piece (prefixed with whatever
 * the parser left unconsumed from the previous piece) to `parseHtmlStream`,
 * and yields every non-empty markdown chunk the processor has produced.
 *
 * @param htmlStream - ReadableStream of HTML content (as Uint8Array or string)
 * @param options - Configuration options for conversion
 * @returns An async generator yielding markdown chunks
 * @throws Error("Invalid HTML stream provided") when `htmlStream` is falsy
 */
async function* streamHtmlToMarkdown(htmlStream, options = {}) {
  if (!htmlStream) throw new Error("Invalid HTML stream provided");
  const decoder = new TextDecoder();
  const reader = htmlStream.getReader();
  try {
    // Everything after getReader() lives inside the try so the lock is
    // released even when processor construction throws.
    const processor = createMarkdownProcessor(options);
    const parseState = {
      depthMap: new Uint8Array(1024),
      depth: 0,
      plugins: options.plugins || []
    };
    const forwardEvent = (event) => {
      processPluginsForEvent(event, options.plugins, processor.state, processor.processEvent);
    };
    let remainingHtml = "";
    while (true) {
      const { done, value } = await reader.read();
      if (done) break;
      // String chunks pass through; byte chunks are decoded in streaming mode
      // so multi-byte sequences split across chunks are reassembled.
      const text = typeof value === "string" ? value : decoder.decode(value, { stream: true });
      remainingHtml = parseHtmlStream(`${remainingHtml}${text}`, parseState, forwardEvent);
      const chunk = processor.getMarkdownChunk();
      if (chunk) yield chunk;
    }
    // Flush the decoder: bytes of an incomplete trailing sequence are still
    // buffered inside it and would otherwise be dropped silently.
    const tail = decoder.decode();
    if (tail) remainingHtml = `${remainingHtml}${tail}`;
    if (remainingHtml) parseHtmlStream(remainingHtml, parseState, forwardEvent);
    const finalChunk = processor.getMarkdownChunk();
    if (finalChunk) yield finalChunk;
  } finally {
    reader.releaseLock();
  }
}
|
|
42
|
+
|
|
43
|
+
//#endregion
|
|
44
|
+
//#region src/index.ts
|
|
45
|
+
/**
 * Convert a complete HTML string to markdown in one synchronous call.
 *
 * @param html - HTML source to convert
 * @param options - Conversion options, forwarded to `createMarkdownProcessor`
 * @returns Whatever the processor's `getMarkdown()` reports after processing
 */
function htmlToMarkdown(html, options = {}) {
  const converter = createMarkdownProcessor(options);
  converter.processHtml(html);
  const markdown = converter.getMarkdown();
  return markdown;
}
|
|
50
|
+
|
|
51
|
+
//#endregion
|
|
52
|
+
export { htmlToMarkdown, streamHtmlToMarkdown };
|
|
@@ -251,5 +251,70 @@ interface PluginContext {
|
|
|
251
251
|
tailwind?: TailwindContext;
|
|
252
252
|
[key: string]: unknown;
|
|
253
253
|
}
|
|
254
|
+
/**
 * Markdown chunk with content and metadata
 * Compatible with LangChain Document structure
 */
interface MarkdownChunk {
  /** The markdown content of the chunk */
  content: string;
  /** Metadata extracted during chunking */
  metadata: {
    /**
     * Header hierarchy at this chunk position.
     * The splitter in this package writes keys of the form `h<level>`
     * (for example `h2`) mapped to the trimmed text of that header.
     */
    headers?: Record<string, string>;
    /**
     * Code block language if chunk is/contains code.
     * The splitter takes it from a `language-*` class on a `<code>` element
     * nested in `<pre>`; only the first language seen for a chunk is kept.
     */
    code?: string;
    /**
     * Line number range in original document.
     * NOTE(review): the splitter starts counting at 1 and advances by the
     * number of "\n" characters in the generated markdown of each chunk —
     * confirm that "original document" refers to the markdown output.
     */
    loc?: {
      lines: {
        from: number;
        to: number;
      };
    };
  };
}
|
|
276
|
+
/**
 * Options for HTML to Markdown chunking
 * Extends HTMLToMarkdownOptions with chunking-specific settings
 *
 * NOTE(review): of the inherited HTMLToMarkdownOptions fields, the splitter
 * implementation visible in this package forwards only `origin` and
 * `plugins` to the markdown processor.
 */
interface SplitterOptions extends HTMLToMarkdownOptions {
  /**
   * Header tag IDs to split on (TAG_H1, TAG_H2, etc.)
   * A new chunk is started when a header level listed here is encountered
   * for the second and subsequent times.
   * @example [TAG_H1, TAG_H2]
   * @default [TAG_H2, TAG_H3, TAG_H4, TAG_H5, TAG_H6]
   */
  headersToSplitOn?: number[];
  /**
   * Return each line as individual chunk
   * When enabled, size-based splitting is skipped and every non-blank line
   * of each chunk is returned as its own chunk.
   * @default false
   */
  returnEachLine?: boolean;
  /**
   * Strip headers from chunk content
   * Removes lines that start with 1-6 `#` characters followed by whitespace.
   * NOTE(review): not applied when `returnEachLine` is true — the splitter
   * returns the per-line chunks before this step runs.
   * @default true
   */
  stripHeaders?: boolean;
  /**
   * Maximum chunk size
   * Compared against `lengthFunction` applied to the pending chunk text.
   * @default 1000
   */
  chunkSize?: number;
  /**
   * Overlap between chunks for context preservation
   * Must be smaller than `chunkSize`; otherwise the splitter throws.
   * @default 200
   */
  chunkOverlap?: number;
  /**
   * Function to measure chunk length (default: character count)
   * Can be replaced with token counter for LLM applications
   * @default (text) => text.length
   */
  lengthFunction?: (text: string) => number;
  /**
   * Keep separators in the split chunks
   * NOTE(review): the splitter implementation visible in this package
   * normalizes this option but never reads it afterwards — confirm whether
   * it currently has any effect.
   * @default false
   */
  keepSeparator?: boolean;
}
|
|
254
319
|
//#endregion
|
|
255
|
-
export { BufferRegion, ELEMENT_NODE as ELEMENT_NODE$1, ElementNode, type ExtractedElement, HTMLToMarkdownOptions, HandlerContext, MdreamProcessingState, MdreamRuntimeState, Node, NodeEvent, Plugin, PluginContext, PluginCreationOptions, ReadabilityContext, TEXT_NODE as TEXT_NODE$1, TagHandler, TailwindContext, TextNode, extractionPlugin as extractionPlugin$1 };
|
|
320
|
+
export { BufferRegion, ELEMENT_NODE as ELEMENT_NODE$1, ElementNode, type ExtractedElement, HTMLToMarkdownOptions, HandlerContext, MarkdownChunk, MdreamProcessingState, MdreamRuntimeState, Node, NodeEvent, Plugin, PluginContext, PluginCreationOptions, ReadabilityContext, SplitterOptions, TEXT_NODE as TEXT_NODE$1, TagHandler, TailwindContext, TextNode, extractionPlugin as extractionPlugin$1 };
|
package/dist/cli.mjs
CHANGED
|
@@ -1,9 +1,11 @@
|
|
|
1
|
-
import "./_chunks/
|
|
2
|
-
import
|
|
3
|
-
import "./_chunks/
|
|
4
|
-
import {
|
|
5
|
-
import "./_chunks/
|
|
6
|
-
import {
|
|
1
|
+
import "./_chunks/const-BOAJ1T5c.mjs";
|
|
2
|
+
import "./_chunks/markdown-processor-BWVPNlvD.mjs";
|
|
3
|
+
import "./_chunks/plugin-DrovQriD.mjs";
|
|
4
|
+
import { streamHtmlToMarkdown } from "./_chunks/src-D-NT7shY.mjs";
|
|
5
|
+
import "./_chunks/extraction-BPaDGYvv.mjs";
|
|
6
|
+
import { generateLlmsTxtArtifacts } from "./_chunks/llms-txt-XvDQwYbj.mjs";
|
|
7
|
+
import "./_chunks/plugins-C5_irVJs.mjs";
|
|
8
|
+
import { withMinimalPreset } from "./_chunks/minimal-co1tIZYm.mjs";
|
|
7
9
|
import { readFileSync } from "node:fs";
|
|
8
10
|
import { mkdir, writeFile } from "node:fs/promises";
|
|
9
11
|
import { Readable } from "node:stream";
|
package/dist/index.d.mts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { BufferRegion, ELEMENT_NODE$1 as ELEMENT_NODE, ElementNode, ExtractedElement, HTMLToMarkdownOptions, HandlerContext, MdreamProcessingState, MdreamRuntimeState, Node, NodeEvent, Plugin, PluginContext, PluginCreationOptions, ReadabilityContext, TEXT_NODE$1 as TEXT_NODE, TagHandler, TailwindContext, TextNode } from "./_chunks/types-
|
|
2
|
-
import { createPlugin$1 as createPlugin } from "./_chunks/plugin-
|
|
1
|
+
import { BufferRegion, ELEMENT_NODE$1 as ELEMENT_NODE, ElementNode, ExtractedElement, HTMLToMarkdownOptions, HandlerContext, MarkdownChunk, MdreamProcessingState, MdreamRuntimeState, Node, NodeEvent, Plugin, PluginContext, PluginCreationOptions, ReadabilityContext, SplitterOptions, TEXT_NODE$1 as TEXT_NODE, TagHandler, TailwindContext, TextNode } from "./_chunks/types-DqiI86yW.mjs";
|
|
2
|
+
import { createPlugin$1 as createPlugin } from "./_chunks/plugin-CgnpSqtP.mjs";
|
|
3
3
|
import { ReadableStream } from "node:stream/web";
|
|
4
4
|
|
|
5
5
|
//#region src/const.d.ts
|
|
@@ -181,4 +181,4 @@ declare function streamHtmlToMarkdown(htmlStream: ReadableStream | null, options
|
|
|
181
181
|
//#region src/index.d.ts
|
|
182
182
|
/**
 * Convert a complete HTML string to Markdown in a single synchronous call.
 *
 * @param html - HTML source to convert
 * @param options - Conversion options; defaults to an empty object when omitted
 * @returns The generated Markdown
 */
declare function htmlToMarkdown(html: string, options?: HTMLToMarkdownOptions): string;
|
|
183
183
|
//#endregion
|
|
184
|
-
export { BufferRegion, ELEMENT_NODE, ElementNode, ExtractedElement, HTMLToMarkdownOptions, HandlerContext, MarkdownProcessor, MdreamProcessingState, MdreamRuntimeState, Node, NodeEvent, Plugin, PluginContext, PluginCreationOptions, ReadabilityContext, TEXT_NODE, TagHandler, TagIdMap, TailwindContext, TextNode, createPlugin, htmlToMarkdown, parseHtml, streamHtmlToMarkdown };
|
|
184
|
+
export { BufferRegion, ELEMENT_NODE, ElementNode, ExtractedElement, HTMLToMarkdownOptions, HandlerContext, MarkdownChunk, MarkdownProcessor, MdreamProcessingState, MdreamRuntimeState, Node, NodeEvent, Plugin, PluginContext, PluginCreationOptions, ReadabilityContext, SplitterOptions, TEXT_NODE, TagHandler, TagIdMap, TailwindContext, TextNode, createPlugin, htmlToMarkdown, parseHtml, streamHtmlToMarkdown };
|
package/dist/index.mjs
CHANGED
|
@@ -1,4 +1,6 @@
|
|
|
1
|
-
import { TagIdMap
|
|
2
|
-
import { MarkdownProcessor,
|
|
1
|
+
import { TagIdMap } from "./_chunks/const-BOAJ1T5c.mjs";
|
|
2
|
+
import { MarkdownProcessor, parseHtml } from "./_chunks/markdown-processor-BWVPNlvD.mjs";
|
|
3
|
+
import { createPlugin } from "./_chunks/plugin-DrovQriD.mjs";
|
|
4
|
+
import { htmlToMarkdown, streamHtmlToMarkdown } from "./_chunks/src-D-NT7shY.mjs";
|
|
3
5
|
|
|
4
6
|
export { MarkdownProcessor, TagIdMap, createPlugin, htmlToMarkdown, parseHtml, streamHtmlToMarkdown };
|
package/dist/llms-txt.mjs
CHANGED
|
@@ -1,6 +1,8 @@
|
|
|
1
|
-
import "./_chunks/
|
|
2
|
-
import "./_chunks/
|
|
3
|
-
import "./_chunks/
|
|
4
|
-
import
|
|
1
|
+
import "./_chunks/const-BOAJ1T5c.mjs";
|
|
2
|
+
import "./_chunks/markdown-processor-BWVPNlvD.mjs";
|
|
3
|
+
import "./_chunks/plugin-DrovQriD.mjs";
|
|
4
|
+
import "./_chunks/src-D-NT7shY.mjs";
|
|
5
|
+
import "./_chunks/extraction-BPaDGYvv.mjs";
|
|
6
|
+
import { generateLlmsTxtArtifacts } from "./_chunks/llms-txt-XvDQwYbj.mjs";
|
|
5
7
|
|
|
6
8
|
export { generateLlmsTxtArtifacts };
|
package/dist/plugins.d.mts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { Plugin, extractionPlugin$1 as extractionPlugin } from "./_chunks/types-
|
|
2
|
-
import { createPlugin$1 as createPlugin } from "./_chunks/plugin-
|
|
1
|
+
import { Plugin, extractionPlugin$1 as extractionPlugin } from "./_chunks/types-DqiI86yW.mjs";
|
|
2
|
+
import { createPlugin$1 as createPlugin } from "./_chunks/plugin-CgnpSqtP.mjs";
|
|
3
3
|
|
|
4
4
|
//#region src/plugins/filter.d.ts
|
|
5
5
|
|
package/dist/plugins.mjs
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
|
-
import
|
|
2
|
-
import {
|
|
3
|
-
import {
|
|
1
|
+
import "./_chunks/const-BOAJ1T5c.mjs";
|
|
2
|
+
import { createPlugin } from "./_chunks/plugin-DrovQriD.mjs";
|
|
3
|
+
import { extractionPlugin } from "./_chunks/extraction-BPaDGYvv.mjs";
|
|
4
|
+
import { filterPlugin, frontmatterPlugin, isolateMainPlugin, readabilityPlugin, tailwindPlugin } from "./_chunks/plugins-C5_irVJs.mjs";
|
|
4
5
|
|
|
5
6
|
export { createPlugin, extractionPlugin, filterPlugin, frontmatterPlugin, isolateMainPlugin, readabilityPlugin, tailwindPlugin };
|
package/dist/preset/minimal.mjs
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
|
-
import "../_chunks/
|
|
2
|
-
import "../_chunks/
|
|
3
|
-
import "../_chunks/
|
|
4
|
-
import
|
|
1
|
+
import "../_chunks/const-BOAJ1T5c.mjs";
|
|
2
|
+
import "../_chunks/plugin-DrovQriD.mjs";
|
|
3
|
+
import "../_chunks/extraction-BPaDGYvv.mjs";
|
|
4
|
+
import "../_chunks/plugins-C5_irVJs.mjs";
|
|
5
|
+
import { withMinimalPreset } from "../_chunks/minimal-co1tIZYm.mjs";
|
|
5
6
|
|
|
6
7
|
export { withMinimalPreset };
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import { MarkdownChunk, SplitterOptions } from "./_chunks/types-DqiI86yW.mjs";
|
|
2
|
+
|
|
3
|
+
//#region src/splitter.d.ts
|
|
4
|
+
|
|
5
|
+
/**
 * Convert HTML to Markdown and split into chunks in single pass
 * Chunks are created during HTML event processing
 *
 * @param html - HTML source to convert and split
 * @param options - Splitter settings; every field is optional and falls back
 *   to the defaults documented on SplitterOptions
 * @returns The non-empty chunks in document order
 * @throws Error "chunkOverlap must be less than chunkSize" when the effective
 *   `chunkOverlap` is greater than or equal to the effective `chunkSize`
 */
declare function htmlToMarkdownSplitChunks(html: string, options?: SplitterOptions): MarkdownChunk[];
|
|
10
|
+
//#endregion
|
|
11
|
+
export { type MarkdownChunk, type SplitterOptions, htmlToMarkdownSplitChunks };
|
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
import { ELEMENT_NODE, NodeEventEnter, NodeEventExit, TAG_CODE, TAG_H1, TAG_H2, TAG_H3, TAG_H4, TAG_H5, TAG_H6, TAG_HR, TAG_PRE, TEXT_NODE } from "./_chunks/const-BOAJ1T5c.mjs";
|
|
2
|
+
import { createMarkdownProcessor, parseHtmlStream, processPluginsForEvent } from "./_chunks/markdown-processor-BWVPNlvD.mjs";
|
|
3
|
+
|
|
4
|
+
//#region src/splitter.ts
|
|
5
|
+
// Header tag ids used as chunk boundaries when the caller does not supply
// `headersToSplitOn`: h2 through h6 (h1 is deliberately not in the list).
const DEFAULT_HEADERS_TO_SPLIT_ON = [TAG_H2, TAG_H3, TAG_H4, TAG_H5, TAG_H6];
|
|
12
|
+
/**
 * Normalize user-supplied splitter options into a fully populated object.
 *
 * Each setting falls back to its default only when the supplied value is
 * `null` or `undefined`; `origin` is copied through as given.
 *
 * @param options - Partial splitter options
 * @returns A new object containing exactly the settings the splitter reads
 */
function createOptions(options) {
  const {
    headersToSplitOn,
    returnEachLine,
    stripHeaders,
    chunkSize,
    chunkOverlap,
    lengthFunction,
    keepSeparator,
    origin,
    plugins
  } = options;
  // Default measure: plain character count.
  const countCharacters = (text) => text.length;
  return {
    headersToSplitOn: headersToSplitOn ?? DEFAULT_HEADERS_TO_SPLIT_ON,
    returnEachLine: returnEachLine ?? false,
    stripHeaders: stripHeaders ?? true,
    chunkSize: chunkSize ?? 1000,
    chunkOverlap: chunkOverlap ?? 200,
    lengthFunction: lengthFunction ?? countCharacters,
    keepSeparator: keepSeparator ?? false,
    origin,
    plugins: plugins ?? []
  };
}
|
|
25
|
+
/**
 * Extract the code language from an element's `class` attribute.
 *
 * Looks for the first class token containing `language-` and returns the
 * text that follows that marker (e.g. `language-js` -> `js`).
 *
 * @param node - Element node; `node.attributes.class` is read when present
 * @returns The language name, or an empty string when no class token
 *   carries a `language-` marker
 */
function getCodeLanguage(node) {
  const className = node.attributes?.class;
  if (!className) return "";
  // Class lists may be separated by any whitespace (tabs, newlines), not only
  // single spaces; splitting on " " alone would glue neighbouring class names
  // onto the language.
  for (const token of className.split(/\s+/)) {
    const language = token.split("language-")[1];
    if (language) return language;
  }
  return "";
}
|
|
31
|
+
/**
 * Tell whether a header tag id is configured as a chunk boundary.
 *
 * @param tagId - Tag id of the header element being entered
 * @param options - Normalized splitter options (reads `headersToSplitOn`)
 * @returns `true` when `tagId` is listed in `options.headersToSplitOn`
 */
function shouldSplitOnHeader(tagId, options) {
  const { headersToSplitOn: splitLevels } = options;
  return splitLevels.includes(tagId);
}
|
|
34
|
+
/**
 * Get current markdown content WITHOUT clearing buffers
 *
 * Concatenates, in the iteration order of `state.regionContentBuffers`, the
 * fragments of every region whose entry in `state.regionToggles` is truthy,
 * then strips leading whitespace from the result. Neither map is modified.
 *
 * @param state - Processor state; reads `regionContentBuffers` (region id ->
 *   iterable of string fragments) and `regionToggles` (region id -> include flag)
 * @returns The markdown accumulated so far
 */
function getCurrentMarkdown(state) {
  const fragments = [];
  for (const [regionId, content] of state.regionContentBuffers.entries()) {
    if (!state.regionToggles.get(regionId)) continue;
    // Append one fragment at a time: `push(...content)` would pass every
    // fragment as a call argument and throws RangeError on very large buffers.
    for (const fragment of content) fragments.push(fragment);
  }
  return fragments.join("").trimStart();
}
|
|
45
|
+
/**
 * Convert HTML to Markdown and split into chunks in single pass
 * Chunks are created during HTML event processing
 *
 * Chunk boundaries are produced by: repeated occurrences of a configured
 * header level, `<hr>` elements, and (unless `returnEachLine` is set) the
 * pending text exceeding `chunkSize` as measured by `lengthFunction`.
 *
 * @param html - HTML source to convert and split
 * @param options - Splitter options; missing values are filled by `createOptions`
 * @returns Array of chunks (`{ content, metadata }`) with non-empty content;
 *   per-line chunks when `returnEachLine` is enabled
 * @throws Error when `chunkOverlap` is greater than or equal to `chunkSize`
 */
function htmlToMarkdownSplitChunks(html, options = {}) {
  const opts = createOptions(options);
  if (opts.chunkOverlap >= opts.chunkSize) throw new Error("chunkOverlap must be less than chunkSize");
  const processor = createMarkdownProcessor({
    origin: opts.origin,
    plugins: opts.plugins
  });
  const chunks = [];
  // Header tag id -> trimmed header text for the headers currently in effect.
  const headerHierarchy = new Map();
  // Split-header levels already encountered; the first occurrence of a level
  // does not trigger a flush, later ones do.
  const seenSplitHeaders = new Set();
  let currentChunkCodeLanguage = "";
  let collectingHeaderText = false;
  let currentHeaderTagId = null;
  let currentHeaderText = "";
  let lineNumber = 1;
  // Offset into the accumulated markdown where the next chunk starts
  // (moved back by the overlap after each emitted chunk).
  let lastChunkEndPosition = 0;
  // Offset where the most recent emitted chunk ended (no overlap applied).
  let lastSplitPosition = 0;
  // Emit the markdown between lastChunkEndPosition and endPosition (or the
  // end of the accumulated markdown) as a chunk, then advance the bookkeeping.
  function flushChunk(endPosition) {
    const currentMd = getCurrentMarkdown(processor.state);
    const chunkEnd = endPosition ?? currentMd.length;
    const chunkContent = currentMd.slice(lastChunkEndPosition, chunkEnd);
    // Whitespace-only span: skip it without emitting a chunk.
    if (!chunkContent.trim()) {
      lastChunkEndPosition = chunkEnd;
      return;
    }
    const chunk = {
      content: chunkContent.trimEnd(),
      metadata: { loc: { lines: {
        from: lineNumber,
        to: lineNumber + (chunkContent.match(/\n/g) || []).length
      } } }
    };
    if (headerHierarchy.size > 0) {
      chunk.metadata.headers = {};
      for (const [tagId, text] of headerHierarchy.entries()) {
        // Key is "h" + offset of the tag id from TAG_H1, plus one.
        // NOTE(review): yields h1..h6 only if the header tag ids are consecutive — confirm.
        const level = `h${tagId - TAG_H1 + 1}`;
        chunk.metadata.headers[level] = text;
      }
    }
    if (currentChunkCodeLanguage) chunk.metadata.code = currentChunkCodeLanguage;
    chunks.push(chunk);
    currentChunkCodeLanguage = "";
    lastSplitPosition = chunkEnd;
    if (opts.chunkOverlap > 0) {
      // Overlap is capped at one character less than the chunk just emitted,
      // so the next chunk always starts after the start of this one.
      const maxOverlap = Math.max(0, chunkContent.length - 1);
      const actualOverlap = Math.min(opts.chunkOverlap, maxOverlap);
      lastChunkEndPosition = chunkEnd - actualOverlap;
    } else lastChunkEndPosition = chunkEnd;
    lineNumber += (chunkContent.match(/\n/g) || []).length;
  }
  const parseState = {
    // Shares the processor's depthMap with the parser.
    depthMap: processor.state.depthMap,
    depth: 0,
    plugins: opts.plugins
  };
  parseHtmlStream(html, parseState, (event) => {
    const { type: eventType, node } = event;
    if (node.type === ELEMENT_NODE) {
      const element = node;
      const tagId = element.tagId;
      if (tagId && tagId >= TAG_H1 && tagId <= TAG_H6) {
        if (eventType === NodeEventEnter) {
          // Start collecting the text of this header.
          collectingHeaderText = true;
          currentHeaderTagId = tagId;
          currentHeaderText = "";
          if (shouldSplitOnHeader(tagId, opts)) {
            if (seenSplitHeaders.has(tagId)) {
              // Close the chunk that precedes this header, then drop the
              // recorded headers from this tag id up to TAG_H6.
              flushChunk();
              for (let i = tagId; i <= TAG_H6; i++) headerHierarchy.delete(i);
            }
            seenSplitHeaders.add(tagId);
          }
        } else if (eventType === NodeEventExit && currentHeaderTagId === tagId) {
          headerHierarchy.set(tagId, currentHeaderText.trim());
          collectingHeaderText = false;
          currentHeaderTagId = null;
        }
      }
      // <code> with a <pre> ancestor count above zero: remember the first
      // language seen for the chunk being built.
      if (tagId === TAG_CODE && element.depthMap[TAG_PRE] > 0) {
        if (eventType === NodeEventEnter) {
          const lang = getCodeLanguage(element);
          if (lang && !currentChunkCodeLanguage) currentChunkCodeLanguage = lang;
        }
      }
      // Entering an <hr> closes the current chunk.
      if (tagId === TAG_HR && eventType === NodeEventEnter) flushChunk();
    }
    if (collectingHeaderText && node.type === TEXT_NODE) {
      const textNode = node;
      currentHeaderText += textNode.value;
    }
    // Let plugins and the processor handle the event; this is what appends
    // to the markdown read back by getCurrentMarkdown.
    processPluginsForEvent(event, opts.plugins, processor.state, processor.processEvent);
    if (!opts.returnEachLine) {
      const currentMd = getCurrentMarkdown(processor.state);
      const currentChunkSize = opts.lengthFunction(currentMd.slice(lastChunkEndPosition));
      if (currentChunkSize > opts.chunkSize) {
        // NOTE(review): chunkSize is added to a character offset here even
        // when lengthFunction measures something else (e.g. tokens) — confirm
        // this is intended.
        const idealSplitPos = lastChunkEndPosition + opts.chunkSize;
        // Tried in this order; the first usable separator wins.
        const separators = [
          "\n\n",
          "```\n",
          "\n",
          " "
        ];
        let splitPosition = -1;
        for (const sep of separators) {
          // Last occurrence of the separator starting at or before the ideal position.
          const idx = currentMd.lastIndexOf(sep, idealSplitPos);
          const candidateSplitPos = idx + sep.length;
          if (idx >= 0) {
            // Count ``` markers before the candidate; an odd count is taken
            // to mean the candidate sits inside a fenced code block, so this
            // separator is skipped.
            const beforeSplit = currentMd.slice(0, candidateSplitPos);
            let backtickCount = 0;
            let pos = 0;
            while ((pos = beforeSplit.indexOf("```", pos)) !== -1) {
              backtickCount++;
              pos += 3;
            }
            if (backtickCount % 2 === 1) continue;
          }
          // Only accept a split strictly past the end of the previous chunk.
          if (idx >= 0 && candidateSplitPos > lastSplitPosition) {
            splitPosition = candidateSplitPos;
            break;
          }
        }
        // No usable separator: flush everything accumulated so far.
        if (splitPosition === -1 || splitPosition <= lastChunkEndPosition) splitPosition = currentMd.length;
        flushChunk(splitPosition);
      }
    }
  });
  // Emit whatever remains after the last event.
  flushChunk();
  if (opts.returnEachLine && chunks.length > 0) {
    const lineChunks = [];
    for (const chunk of chunks) {
      const lines = chunk.content.split("\n");
      const chunkStartLine = chunk.metadata.loc?.lines.from || 1;
      for (let i = 0; i < lines.length; i++) {
        const line = lines[i];
        // Blank lines are dropped; each remaining line inherits the chunk's
        // metadata with a single-line location.
        if (line.trim()) lineChunks.push({
          content: line,
          metadata: {
            ...chunk.metadata,
            loc: { lines: {
              from: chunkStartLine + i,
              to: chunkStartLine + i
            } }
          }
        });
      }
    }
    // Returned before the stripHeaders step below, so header lines are kept
    // in per-line output.
    return lineChunks;
  }
  // Drop every line that starts with 1-6 "#" followed by whitespace.
  // NOTE(review): this also matches such lines inside fenced code blocks — confirm.
  if (opts.stripHeaders) for (const chunk of chunks) chunk.content = chunk.content.split("\n").filter((line) => !line.match(/^#{1,6}\s+/)).join("\n").trim();
  return chunks.filter((chunk) => chunk.content.length > 0);
}
|
|
200
|
+
|
|
201
|
+
//#endregion
|
|
202
|
+
export { htmlToMarkdownSplitChunks };
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "mdream",
|
|
3
3
|
"type": "module",
|
|
4
|
-
"version": "0.12.3",
|
|
4
|
+
"version": "0.13.1",
|
|
5
5
|
"description": "Ultra-performant HTML to Markdown Convertor Optimized for LLMs and llm.txt artifacts.",
|
|
6
6
|
"author": {
|
|
7
7
|
"name": "Harlan Wilton",
|
|
@@ -54,6 +54,14 @@
|
|
|
54
54
|
"default": "./dist/preset/minimal.mjs"
|
|
55
55
|
},
|
|
56
56
|
"default": "./dist/preset/minimal.mjs"
|
|
57
|
+
},
|
|
58
|
+
"./splitter": {
|
|
59
|
+
"types": "./dist/splitter.d.mts",
|
|
60
|
+
"import": {
|
|
61
|
+
"types": "./dist/splitter.d.mts",
|
|
62
|
+
"default": "./dist/splitter.mjs"
|
|
63
|
+
},
|
|
64
|
+
"default": "./dist/splitter.mjs"
|
|
57
65
|
}
|
|
58
66
|
},
|
|
59
67
|
"main": "./dist/index.mjs",
|