mdream 0.12.3 → 0.13.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -285,15 +285,4 @@ const LIST_ITEM_SPACING = [1, 0];
285
285
  const TABLE_ROW_SPACING = [0, 1];
286
286
 
287
287
  //#endregion
288
- //#region src/pluggable/plugin.ts
289
- /**
290
- * Create a plugin that implements the Plugin interface with improved type inference
291
- *
292
- * @returns A complete plugin implementation
293
- */
294
- function createPlugin(plugin) {
295
- return plugin;
296
- }
297
-
298
- //#endregion
299
- export { BLOCKQUOTE_SPACING, DEFAULT_BLOCK_SPACING, ELEMENT_NODE, HTML_ENTITIES, LIST_ITEM_SPACING, MARKDOWN_CODE_BLOCK, MARKDOWN_EMPHASIS, MARKDOWN_HORIZONTAL_RULE, MARKDOWN_INLINE_CODE, MARKDOWN_STRIKETHROUGH, MARKDOWN_STRONG, MAX_TAG_ID, NO_SPACING, NodeEventEnter, NodeEventExit, TABLE_ROW_SPACING, TAG_A, TAG_ABBR, TAG_ADDRESS, TAG_AREA, TAG_ARTICLE, TAG_ASIDE, TAG_AUDIO, TAG_B, TAG_BASE, TAG_BDO, TAG_BLOCKQUOTE, TAG_BODY, TAG_BR, TAG_BUTTON, TAG_CANVAS, TAG_CAPTION, TAG_CENTER, TAG_CITE, TAG_CODE, TAG_COL, TAG_DD, TAG_DEL, TAG_DETAILS, TAG_DFN, TAG_DIALOG, TAG_DIV, TAG_DL, TAG_DT, TAG_EM, TAG_EMBED, TAG_FIELDSET, TAG_FIGCAPTION, TAG_FIGURE, TAG_FOOTER, TAG_FORM, TAG_H1, TAG_H2, TAG_H3, TAG_H4, TAG_H5, TAG_H6, TAG_HEAD, TAG_HEADER, TAG_HR, TAG_HTML, TAG_I, TAG_IFRAME, TAG_IMG, TAG_INPUT, TAG_INS, TAG_KBD, TAG_KEYGEN, TAG_LABEL, TAG_LEGEND, TAG_LI, TAG_LINK, TAG_MAIN, TAG_MAP, TAG_MARK, TAG_META, TAG_METER, TAG_NAV, TAG_NOFRAMES, TAG_NOSCRIPT, TAG_OBJECT, TAG_OL, TAG_OPTION, TAG_P, TAG_PARAM, TAG_PLAINTEXT, TAG_PRE, TAG_PROGRESS, TAG_Q, TAG_RP, TAG_RT, TAG_RUBY, TAG_SAMP, TAG_SCRIPT, TAG_SECTION, TAG_SELECT, TAG_SMALL, TAG_SOURCE, TAG_SPAN, TAG_STRONG, TAG_STYLE, TAG_SUB, TAG_SUMMARY, TAG_SUP, TAG_SVG, TAG_TABLE, TAG_TBODY, TAG_TD, TAG_TEMPLATE, TAG_TEXTAREA, TAG_TFOOT, TAG_TH, TAG_THEAD, TAG_TIME, TAG_TITLE, TAG_TR, TAG_TRACK, TAG_U, TAG_UL, TAG_VAR, TAG_VIDEO, TAG_WBR, TAG_XMP, TEXT_NODE, TagIdMap, assembleBufferedContent, collectNodeContent, createBufferRegion, createPlugin };
288
+ export { BLOCKQUOTE_SPACING, DEFAULT_BLOCK_SPACING, ELEMENT_NODE, HTML_ENTITIES, LIST_ITEM_SPACING, MARKDOWN_CODE_BLOCK, MARKDOWN_EMPHASIS, MARKDOWN_HORIZONTAL_RULE, MARKDOWN_INLINE_CODE, MARKDOWN_STRIKETHROUGH, MARKDOWN_STRONG, MAX_TAG_ID, NO_SPACING, NodeEventEnter, NodeEventExit, TABLE_ROW_SPACING, TAG_A, TAG_ABBR, TAG_ADDRESS, TAG_AREA, TAG_ARTICLE, TAG_ASIDE, TAG_AUDIO, TAG_B, TAG_BASE, TAG_BDO, TAG_BLOCKQUOTE, TAG_BODY, TAG_BR, TAG_BUTTON, TAG_CANVAS, TAG_CAPTION, TAG_CENTER, TAG_CITE, TAG_CODE, TAG_COL, TAG_DD, TAG_DEL, TAG_DETAILS, TAG_DFN, TAG_DIALOG, TAG_DIV, TAG_DL, TAG_DT, TAG_EM, TAG_EMBED, TAG_FIELDSET, TAG_FIGCAPTION, TAG_FIGURE, TAG_FOOTER, TAG_FORM, TAG_H1, TAG_H2, TAG_H3, TAG_H4, TAG_H5, TAG_H6, TAG_HEAD, TAG_HEADER, TAG_HR, TAG_HTML, TAG_I, TAG_IFRAME, TAG_IMG, TAG_INPUT, TAG_INS, TAG_KBD, TAG_KEYGEN, TAG_LABEL, TAG_LEGEND, TAG_LI, TAG_LINK, TAG_MAIN, TAG_MAP, TAG_MARK, TAG_META, TAG_METER, TAG_NAV, TAG_NOFRAMES, TAG_NOSCRIPT, TAG_OBJECT, TAG_OL, TAG_OPTION, TAG_P, TAG_PARAM, TAG_PLAINTEXT, TAG_PRE, TAG_PROGRESS, TAG_Q, TAG_RP, TAG_RT, TAG_RUBY, TAG_SAMP, TAG_SCRIPT, TAG_SECTION, TAG_SELECT, TAG_SMALL, TAG_SOURCE, TAG_SPAN, TAG_STRONG, TAG_STYLE, TAG_SUB, TAG_SUMMARY, TAG_SUP, TAG_SVG, TAG_TABLE, TAG_TBODY, TAG_TD, TAG_TEMPLATE, TAG_TEXTAREA, TAG_TFOOT, TAG_TH, TAG_THEAD, TAG_TIME, TAG_TITLE, TAG_TR, TAG_TRACK, TAG_U, TAG_UL, TAG_VAR, TAG_VIDEO, TAG_WBR, TAG_XMP, TEXT_NODE, TagIdMap, assembleBufferedContent, collectNodeContent, createBufferRegion };
@@ -1,4 +1,4 @@
1
- import { createPlugin } from "./plugin-Bqz9GKOA.mjs";
1
+ import { createPlugin } from "./plugin-DrovQriD.mjs";
2
2
 
3
3
  //#region src/libs/query-selector.ts
4
4
  /**
@@ -1,5 +1,5 @@
1
- import { htmlToMarkdown } from "./src-DBqiXz8C.mjs";
2
- import { extractionPlugin } from "./extraction-BSOWm6fo.mjs";
1
+ import { htmlToMarkdown } from "./src-D-NT7shY.mjs";
2
+ import { extractionPlugin } from "./extraction-BPaDGYvv.mjs";
3
3
  import { readFile } from "node:fs/promises";
4
4
  import { basename, dirname, relative, sep } from "pathe";
5
5
  import { glob } from "tinyglobby";
@@ -1,4 +1,4 @@
1
- import { BLOCKQUOTE_SPACING, DEFAULT_BLOCK_SPACING, ELEMENT_NODE, HTML_ENTITIES, LIST_ITEM_SPACING, MARKDOWN_CODE_BLOCK, MARKDOWN_EMPHASIS, MARKDOWN_HORIZONTAL_RULE, MARKDOWN_INLINE_CODE, MARKDOWN_STRIKETHROUGH, MARKDOWN_STRONG, MAX_TAG_ID, NO_SPACING, NodeEventEnter, NodeEventExit, TABLE_ROW_SPACING, TAG_A, TAG_ABBR, TAG_ADDRESS, TAG_AREA, TAG_ASIDE, TAG_AUDIO, TAG_B, TAG_BASE, TAG_BDO, TAG_BLOCKQUOTE, TAG_BODY, TAG_BR, TAG_BUTTON, TAG_CANVAS, TAG_CENTER, TAG_CITE, TAG_CODE, TAG_COL, TAG_DD, TAG_DEL, TAG_DETAILS, TAG_DFN, TAG_DIALOG, TAG_DIV, TAG_DL, TAG_DT, TAG_EM, TAG_EMBED, TAG_FIELDSET, TAG_FOOTER, TAG_FORM, TAG_H1, TAG_H2, TAG_H3, TAG_H4, TAG_H5, TAG_H6, TAG_HEAD, TAG_HR, TAG_I, TAG_IFRAME, TAG_IMG, TAG_INPUT, TAG_INS, TAG_KBD, TAG_KEYGEN, TAG_LABEL, TAG_LEGEND, TAG_LI, TAG_LINK, TAG_MAP, TAG_MARK, TAG_META, TAG_METER, TAG_NAV, TAG_NOFRAMES, TAG_NOSCRIPT, TAG_OL, TAG_OPTION, TAG_P, TAG_PARAM, TAG_PLAINTEXT, TAG_PRE, TAG_PROGRESS, TAG_Q, TAG_RP, TAG_RT, TAG_RUBY, TAG_SAMP, TAG_SCRIPT, TAG_SELECT, TAG_SMALL, TAG_SOURCE, TAG_SPAN, TAG_STRONG, TAG_STYLE, TAG_SUB, TAG_SUMMARY, TAG_SUP, TAG_SVG, TAG_TABLE, TAG_TBODY, TAG_TD, TAG_TEMPLATE, TAG_TEXTAREA, TAG_TFOOT, TAG_TH, TAG_THEAD, TAG_TIME, TAG_TITLE, TAG_TR, TAG_TRACK, TAG_U, TAG_UL, TAG_VAR, TAG_VIDEO, TAG_WBR, TAG_XMP, TEXT_NODE, TagIdMap, assembleBufferedContent, collectNodeContent } from "./plugin-Bqz9GKOA.mjs";
1
+ import { BLOCKQUOTE_SPACING, DEFAULT_BLOCK_SPACING, ELEMENT_NODE, HTML_ENTITIES, LIST_ITEM_SPACING, MARKDOWN_CODE_BLOCK, MARKDOWN_EMPHASIS, MARKDOWN_HORIZONTAL_RULE, MARKDOWN_INLINE_CODE, MARKDOWN_STRIKETHROUGH, MARKDOWN_STRONG, MAX_TAG_ID, NO_SPACING, NodeEventEnter, NodeEventExit, TABLE_ROW_SPACING, TAG_A, TAG_ABBR, TAG_ADDRESS, TAG_AREA, TAG_ASIDE, TAG_AUDIO, TAG_B, TAG_BASE, TAG_BDO, TAG_BLOCKQUOTE, TAG_BODY, TAG_BR, TAG_BUTTON, TAG_CANVAS, TAG_CENTER, TAG_CITE, TAG_CODE, TAG_COL, TAG_DD, TAG_DEL, TAG_DETAILS, TAG_DFN, TAG_DIALOG, TAG_DIV, TAG_DL, TAG_DT, TAG_EM, TAG_EMBED, TAG_FIELDSET, TAG_FOOTER, TAG_FORM, TAG_H1, TAG_H2, TAG_H3, TAG_H4, TAG_H5, TAG_H6, TAG_HEAD, TAG_HR, TAG_I, TAG_IFRAME, TAG_IMG, TAG_INPUT, TAG_INS, TAG_KBD, TAG_KEYGEN, TAG_LABEL, TAG_LEGEND, TAG_LI, TAG_LINK, TAG_MAP, TAG_MARK, TAG_META, TAG_METER, TAG_NAV, TAG_NOFRAMES, TAG_NOSCRIPT, TAG_OL, TAG_OPTION, TAG_P, TAG_PARAM, TAG_PLAINTEXT, TAG_PRE, TAG_PROGRESS, TAG_Q, TAG_RP, TAG_RT, TAG_RUBY, TAG_SAMP, TAG_SCRIPT, TAG_SELECT, TAG_SMALL, TAG_SOURCE, TAG_SPAN, TAG_STRONG, TAG_STYLE, TAG_SUB, TAG_SUMMARY, TAG_SUP, TAG_SVG, TAG_TABLE, TAG_TBODY, TAG_TD, TAG_TEMPLATE, TAG_TEXTAREA, TAG_TFOOT, TAG_TH, TAG_THEAD, TAG_TIME, TAG_TITLE, TAG_TR, TAG_TRACK, TAG_U, TAG_UL, TAG_VAR, TAG_VIDEO, TAG_WBR, TAG_XMP, TEXT_NODE, TagIdMap, assembleBufferedContent, collectNodeContent } from "./const-BOAJ1T5c.mjs";
2
2
 
3
3
  //#region src/tags.ts
4
4
  function resolveUrl(url, origin) {
@@ -1386,53 +1386,4 @@ function createMarkdownProcessor(options = {}) {
1386
1386
  const MarkdownProcessor = createMarkdownProcessor;
1387
1387
 
1388
1388
  //#endregion
1389
- //#region src/stream.ts
1390
- /**
1391
- * Creates a markdown stream from an HTML stream
1392
- * @param htmlStream - ReadableStream of HTML content (as Uint8Array or string)
1393
- * @param options - Configuration options for conversion
1394
- * @returns An async generator yielding markdown chunks
1395
- */
1396
- async function* streamHtmlToMarkdown(htmlStream, options = {}) {
1397
- if (!htmlStream) throw new Error("Invalid HTML stream provided");
1398
- const decoder = new TextDecoder();
1399
- const reader = htmlStream.getReader();
1400
- const processor = createMarkdownProcessor(options);
1401
- const parseState = {
1402
- depthMap: new Uint8Array(1024),
1403
- depth: 0,
1404
- plugins: options.plugins || []
1405
- };
1406
- let remainingHtml = "";
1407
- try {
1408
- while (true) {
1409
- const { done, value } = await reader.read();
1410
- if (done) break;
1411
- const htmlContent = `${remainingHtml}${typeof value === "string" ? value : decoder.decode(value, { stream: true })}`;
1412
- remainingHtml = parseHtmlStream(htmlContent, parseState, (event) => {
1413
- processPluginsForEvent(event, options.plugins, processor.state, processor.processEvent);
1414
- });
1415
- const chunk = processor.getMarkdownChunk();
1416
- if (chunk) yield chunk;
1417
- }
1418
- if (remainingHtml) parseHtmlStream(remainingHtml, parseState, (event) => {
1419
- processPluginsForEvent(event, options.plugins, processor.state, processor.processEvent);
1420
- });
1421
- const finalChunk = processor.getMarkdownChunk();
1422
- if (finalChunk) yield finalChunk;
1423
- } finally {
1424
- if (remainingHtml) decoder.decode(new Uint8Array(0), { stream: false });
1425
- reader.releaseLock();
1426
- }
1427
- }
1428
-
1429
- //#endregion
1430
- //#region src/index.ts
1431
- function htmlToMarkdown(html, options = {}) {
1432
- const processor = createMarkdownProcessor(options);
1433
- processor.processHtml(html);
1434
- return processor.getMarkdown();
1435
- }
1436
-
1437
- //#endregion
1438
- export { MarkdownProcessor, htmlToMarkdown, parseHtml, streamHtmlToMarkdown };
1389
+ export { MarkdownProcessor, createMarkdownProcessor, parseHtml, parseHtmlStream, processPluginsForEvent };
@@ -1,5 +1,5 @@
1
- import { TAG_ASIDE, TAG_BUTTON, TAG_EMBED, TAG_FIELDSET, TAG_FIGURE, TAG_FOOTER, TAG_FORM, TAG_IFRAME, TAG_INPUT, TAG_NAV, TAG_OBJECT, TAG_SELECT, TAG_TEXTAREA } from "./plugin-Bqz9GKOA.mjs";
2
- import { filterPlugin, frontmatterPlugin, isolateMainPlugin, tailwindPlugin } from "./plugins-TeB1_RYL.mjs";
1
+ import { TAG_ASIDE, TAG_BUTTON, TAG_EMBED, TAG_FIELDSET, TAG_FIGURE, TAG_FOOTER, TAG_FORM, TAG_IFRAME, TAG_INPUT, TAG_NAV, TAG_OBJECT, TAG_SELECT, TAG_TEXTAREA } from "./const-BOAJ1T5c.mjs";
2
+ import { filterPlugin, frontmatterPlugin, isolateMainPlugin, tailwindPlugin } from "./plugins-C5_irVJs.mjs";
3
3
 
4
4
  //#region src/preset/minimal.ts
5
5
  /**
@@ -1,4 +1,4 @@
1
- import { Plugin } from "./types-B94khc0C.mjs";
1
+ import { Plugin } from "./types-DqiI86yW.mjs";
2
2
 
3
3
  //#region src/pluggable/plugin.d.ts
4
4
 
@@ -0,0 +1,12 @@
1
+ //#region src/pluggable/plugin.ts
2
+ /**
3
+ * Create a plugin that implements the Plugin interface with improved type inference
4
+ *
5
+ * @returns A complete plugin implementation
6
+ */
7
+ function createPlugin(plugin) {
8
+ return plugin;
9
+ }
10
+
11
+ //#endregion
12
+ export { createPlugin };
@@ -1,5 +1,6 @@
1
- import { ELEMENT_NODE, TAG_A, TAG_ADDRESS, TAG_ARTICLE, TAG_ASIDE, TAG_AUDIO, TAG_B, TAG_BLOCKQUOTE, TAG_BODY, TAG_BR, TAG_BUTTON, TAG_CAPTION, TAG_CODE, TAG_DD, TAG_DETAILS, TAG_DIV, TAG_DL, TAG_DT, TAG_EM, TAG_EMBED, TAG_FIELDSET, TAG_FIGCAPTION, TAG_FIGURE, TAG_FOOTER, TAG_FORM, TAG_H1, TAG_H2, TAG_H3, TAG_H4, TAG_H5, TAG_H6, TAG_HEAD, TAG_HEADER, TAG_HR, TAG_HTML, TAG_I, TAG_IFRAME, TAG_IMG, TAG_INPUT, TAG_LI, TAG_MAIN, TAG_META, TAG_NAV, TAG_OBJECT, TAG_OL, TAG_P, TAG_PRE, TAG_SCRIPT, TAG_SECTION, TAG_SELECT, TAG_SPAN, TAG_STRONG, TAG_STYLE, TAG_SUMMARY, TAG_SVG, TAG_TABLE, TAG_TBODY, TAG_TD, TAG_TEXTAREA, TAG_TFOOT, TAG_TH, TAG_THEAD, TAG_TITLE, TAG_TR, TAG_UL, TAG_VIDEO, TEXT_NODE, collectNodeContent, createBufferRegion, createPlugin } from "./plugin-Bqz9GKOA.mjs";
2
- import { parseSelector } from "./extraction-BSOWm6fo.mjs";
1
+ import { ELEMENT_NODE, TAG_A, TAG_ADDRESS, TAG_ARTICLE, TAG_ASIDE, TAG_AUDIO, TAG_B, TAG_BLOCKQUOTE, TAG_BODY, TAG_BR, TAG_BUTTON, TAG_CAPTION, TAG_CODE, TAG_DD, TAG_DETAILS, TAG_DIV, TAG_DL, TAG_DT, TAG_EM, TAG_EMBED, TAG_FIELDSET, TAG_FIGCAPTION, TAG_FIGURE, TAG_FOOTER, TAG_FORM, TAG_H1, TAG_H2, TAG_H3, TAG_H4, TAG_H5, TAG_H6, TAG_HEAD, TAG_HEADER, TAG_HR, TAG_HTML, TAG_I, TAG_IFRAME, TAG_IMG, TAG_INPUT, TAG_LI, TAG_MAIN, TAG_META, TAG_NAV, TAG_OBJECT, TAG_OL, TAG_P, TAG_PRE, TAG_SCRIPT, TAG_SECTION, TAG_SELECT, TAG_SPAN, TAG_STRONG, TAG_STYLE, TAG_SUMMARY, TAG_SVG, TAG_TABLE, TAG_TBODY, TAG_TD, TAG_TEXTAREA, TAG_TFOOT, TAG_TH, TAG_THEAD, TAG_TITLE, TAG_TR, TAG_UL, TAG_VIDEO, TEXT_NODE, collectNodeContent, createBufferRegion } from "./const-BOAJ1T5c.mjs";
2
+ import { createPlugin } from "./plugin-DrovQriD.mjs";
3
+ import { parseSelector } from "./extraction-BPaDGYvv.mjs";
3
4
 
4
5
  //#region src/plugins/filter.ts
5
6
  /**
@@ -0,0 +1,52 @@
1
+ import { createMarkdownProcessor, parseHtmlStream, processPluginsForEvent } from "./markdown-processor-BWVPNlvD.mjs";
2
+
3
+ //#region src/stream.ts
4
+ /**
5
+ * Creates a markdown stream from an HTML stream
6
+ * @param htmlStream - ReadableStream of HTML content (as Uint8Array or string)
7
+ * @param options - Configuration options for conversion
8
+ * @returns An async generator yielding markdown chunks
9
+ */
10
+ async function* streamHtmlToMarkdown(htmlStream, options = {}) {
11
+ if (!htmlStream) throw new Error("Invalid HTML stream provided");
12
+ const decoder = new TextDecoder();
13
+ const reader = htmlStream.getReader();
14
+ const processor = createMarkdownProcessor(options);
15
+ const parseState = {
16
+ depthMap: new Uint8Array(1024),
17
+ depth: 0,
18
+ plugins: options.plugins || []
19
+ };
20
+ let remainingHtml = "";
21
+ try {
22
+ while (true) {
23
+ const { done, value } = await reader.read();
24
+ if (done) break;
25
+ const htmlContent = `${remainingHtml}${typeof value === "string" ? value : decoder.decode(value, { stream: true })}`;
26
+ remainingHtml = parseHtmlStream(htmlContent, parseState, (event) => {
27
+ processPluginsForEvent(event, options.plugins, processor.state, processor.processEvent);
28
+ });
29
+ const chunk = processor.getMarkdownChunk();
30
+ if (chunk) yield chunk;
31
+ }
32
+ if (remainingHtml) parseHtmlStream(remainingHtml, parseState, (event) => {
33
+ processPluginsForEvent(event, options.plugins, processor.state, processor.processEvent);
34
+ });
35
+ const finalChunk = processor.getMarkdownChunk();
36
+ if (finalChunk) yield finalChunk;
37
+ } finally {
38
+ if (remainingHtml) decoder.decode(new Uint8Array(0), { stream: false });
39
+ reader.releaseLock();
40
+ }
41
+ }
42
+
43
+ //#endregion
44
+ //#region src/index.ts
45
+ function htmlToMarkdown(html, options = {}) {
46
+ const processor = createMarkdownProcessor(options);
47
+ processor.processHtml(html);
48
+ return processor.getMarkdown();
49
+ }
50
+
51
+ //#endregion
52
+ export { htmlToMarkdown, streamHtmlToMarkdown };
@@ -251,5 +251,70 @@ interface PluginContext {
251
251
  tailwind?: TailwindContext;
252
252
  [key: string]: unknown;
253
253
  }
254
+ /**
255
+ * Markdown chunk with content and metadata
256
+ * Compatible with LangChain Document structure
257
+ */
258
+ interface MarkdownChunk {
259
+ /** The markdown content of the chunk */
260
+ content: string;
261
+ /** Metadata extracted during chunking */
262
+ metadata: {
263
+ /** Header hierarchy at this chunk position */
264
+ headers?: Record<string, string>;
265
+ /** Code block language if chunk is/contains code */
266
+ code?: string;
267
+ /** Line number range in original document */
268
+ loc?: {
269
+ lines: {
270
+ from: number;
271
+ to: number;
272
+ };
273
+ };
274
+ };
275
+ }
276
+ /**
277
+ * Options for HTML to Markdown chunking
278
+ * Extends HTMLToMarkdownOptions with chunking-specific settings
279
+ */
280
+ interface SplitterOptions extends HTMLToMarkdownOptions {
281
+ /**
282
+ * Header tag IDs to split on (TAG_H1, TAG_H2, etc.)
283
+ * @example [TAG_H1, TAG_H2]
284
+ * @default [TAG_H2, TAG_H3, TAG_H4, TAG_H5, TAG_H6]
285
+ */
286
+ headersToSplitOn?: number[];
287
+ /**
288
+ * Return each line as individual chunk
289
+ * @default false
290
+ */
291
+ returnEachLine?: boolean;
292
+ /**
293
+ * Strip headers from chunk content
294
+ * @default true
295
+ */
296
+ stripHeaders?: boolean;
297
+ /**
298
+ * Maximum chunk size
299
+ * @default 1000
300
+ */
301
+ chunkSize?: number;
302
+ /**
303
+ * Overlap between chunks for context preservation
304
+ * @default 200
305
+ */
306
+ chunkOverlap?: number;
307
+ /**
308
+ * Function to measure chunk length (default: character count)
309
+ * Can be replaced with token counter for LLM applications
310
+ * @default (text) => text.length
311
+ */
312
+ lengthFunction?: (text: string) => number;
313
+ /**
314
+ * Keep separators in the split chunks
315
+ * @default false
316
+ */
317
+ keepSeparator?: boolean;
318
+ }
254
319
  //#endregion
255
- export { BufferRegion, ELEMENT_NODE as ELEMENT_NODE$1, ElementNode, type ExtractedElement, HTMLToMarkdownOptions, HandlerContext, MdreamProcessingState, MdreamRuntimeState, Node, NodeEvent, Plugin, PluginContext, PluginCreationOptions, ReadabilityContext, TEXT_NODE as TEXT_NODE$1, TagHandler, TailwindContext, TextNode, extractionPlugin as extractionPlugin$1 };
320
+ export { BufferRegion, ELEMENT_NODE as ELEMENT_NODE$1, ElementNode, type ExtractedElement, HTMLToMarkdownOptions, HandlerContext, MarkdownChunk, MdreamProcessingState, MdreamRuntimeState, Node, NodeEvent, Plugin, PluginContext, PluginCreationOptions, ReadabilityContext, SplitterOptions, TEXT_NODE as TEXT_NODE$1, TagHandler, TailwindContext, TextNode, extractionPlugin as extractionPlugin$1 };
package/dist/cli.mjs CHANGED
@@ -1,9 +1,11 @@
1
- import "./_chunks/plugin-Bqz9GKOA.mjs";
2
- import { streamHtmlToMarkdown } from "./_chunks/src-DBqiXz8C.mjs";
3
- import "./_chunks/extraction-BSOWm6fo.mjs";
4
- import { generateLlmsTxtArtifacts } from "./_chunks/llms-txt-CQ4yFagU.mjs";
5
- import "./_chunks/plugins-TeB1_RYL.mjs";
6
- import { withMinimalPreset } from "./_chunks/minimal-DSW9dhXV.mjs";
1
+ import "./_chunks/const-BOAJ1T5c.mjs";
2
+ import "./_chunks/markdown-processor-BWVPNlvD.mjs";
3
+ import "./_chunks/plugin-DrovQriD.mjs";
4
+ import { streamHtmlToMarkdown } from "./_chunks/src-D-NT7shY.mjs";
5
+ import "./_chunks/extraction-BPaDGYvv.mjs";
6
+ import { generateLlmsTxtArtifacts } from "./_chunks/llms-txt-XvDQwYbj.mjs";
7
+ import "./_chunks/plugins-C5_irVJs.mjs";
8
+ import { withMinimalPreset } from "./_chunks/minimal-co1tIZYm.mjs";
7
9
  import { readFileSync } from "node:fs";
8
10
  import { mkdir, writeFile } from "node:fs/promises";
9
11
  import { Readable } from "node:stream";
package/dist/index.d.mts CHANGED
@@ -1,5 +1,5 @@
1
- import { BufferRegion, ELEMENT_NODE$1 as ELEMENT_NODE, ElementNode, ExtractedElement, HTMLToMarkdownOptions, HandlerContext, MdreamProcessingState, MdreamRuntimeState, Node, NodeEvent, Plugin, PluginContext, PluginCreationOptions, ReadabilityContext, TEXT_NODE$1 as TEXT_NODE, TagHandler, TailwindContext, TextNode } from "./_chunks/types-B94khc0C.mjs";
2
- import { createPlugin$1 as createPlugin } from "./_chunks/plugin-BUiqQb0v.mjs";
1
+ import { BufferRegion, ELEMENT_NODE$1 as ELEMENT_NODE, ElementNode, ExtractedElement, HTMLToMarkdownOptions, HandlerContext, MarkdownChunk, MdreamProcessingState, MdreamRuntimeState, Node, NodeEvent, Plugin, PluginContext, PluginCreationOptions, ReadabilityContext, SplitterOptions, TEXT_NODE$1 as TEXT_NODE, TagHandler, TailwindContext, TextNode } from "./_chunks/types-DqiI86yW.mjs";
2
+ import { createPlugin$1 as createPlugin } from "./_chunks/plugin-CgnpSqtP.mjs";
3
3
  import { ReadableStream } from "node:stream/web";
4
4
 
5
5
  //#region src/const.d.ts
@@ -181,4 +181,4 @@ declare function streamHtmlToMarkdown(htmlStream: ReadableStream | null, options
181
181
  //#region src/index.d.ts
182
182
  declare function htmlToMarkdown(html: string, options?: HTMLToMarkdownOptions): string;
183
183
  //#endregion
184
- export { BufferRegion, ELEMENT_NODE, ElementNode, ExtractedElement, HTMLToMarkdownOptions, HandlerContext, MarkdownProcessor, MdreamProcessingState, MdreamRuntimeState, Node, NodeEvent, Plugin, PluginContext, PluginCreationOptions, ReadabilityContext, TEXT_NODE, TagHandler, TagIdMap, TailwindContext, TextNode, createPlugin, htmlToMarkdown, parseHtml, streamHtmlToMarkdown };
184
+ export { BufferRegion, ELEMENT_NODE, ElementNode, ExtractedElement, HTMLToMarkdownOptions, HandlerContext, MarkdownChunk, MarkdownProcessor, MdreamProcessingState, MdreamRuntimeState, Node, NodeEvent, Plugin, PluginContext, PluginCreationOptions, ReadabilityContext, SplitterOptions, TEXT_NODE, TagHandler, TagIdMap, TailwindContext, TextNode, createPlugin, htmlToMarkdown, parseHtml, streamHtmlToMarkdown };
package/dist/index.mjs CHANGED
@@ -1,4 +1,6 @@
1
- import { TagIdMap, createPlugin } from "./_chunks/plugin-Bqz9GKOA.mjs";
2
- import { MarkdownProcessor, htmlToMarkdown, parseHtml, streamHtmlToMarkdown } from "./_chunks/src-DBqiXz8C.mjs";
1
+ import { TagIdMap } from "./_chunks/const-BOAJ1T5c.mjs";
2
+ import { MarkdownProcessor, parseHtml } from "./_chunks/markdown-processor-BWVPNlvD.mjs";
3
+ import { createPlugin } from "./_chunks/plugin-DrovQriD.mjs";
4
+ import { htmlToMarkdown, streamHtmlToMarkdown } from "./_chunks/src-D-NT7shY.mjs";
3
5
 
4
6
  export { MarkdownProcessor, TagIdMap, createPlugin, htmlToMarkdown, parseHtml, streamHtmlToMarkdown };
package/dist/llms-txt.mjs CHANGED
@@ -1,6 +1,8 @@
1
- import "./_chunks/plugin-Bqz9GKOA.mjs";
2
- import "./_chunks/src-DBqiXz8C.mjs";
3
- import "./_chunks/extraction-BSOWm6fo.mjs";
4
- import { generateLlmsTxtArtifacts } from "./_chunks/llms-txt-CQ4yFagU.mjs";
1
+ import "./_chunks/const-BOAJ1T5c.mjs";
2
+ import "./_chunks/markdown-processor-BWVPNlvD.mjs";
3
+ import "./_chunks/plugin-DrovQriD.mjs";
4
+ import "./_chunks/src-D-NT7shY.mjs";
5
+ import "./_chunks/extraction-BPaDGYvv.mjs";
6
+ import { generateLlmsTxtArtifacts } from "./_chunks/llms-txt-XvDQwYbj.mjs";
5
7
 
6
8
  export { generateLlmsTxtArtifacts };
@@ -1,5 +1,5 @@
1
- import { Plugin, extractionPlugin$1 as extractionPlugin } from "./_chunks/types-B94khc0C.mjs";
2
- import { createPlugin$1 as createPlugin } from "./_chunks/plugin-BUiqQb0v.mjs";
1
+ import { Plugin, extractionPlugin$1 as extractionPlugin } from "./_chunks/types-DqiI86yW.mjs";
2
+ import { createPlugin$1 as createPlugin } from "./_chunks/plugin-CgnpSqtP.mjs";
3
3
 
4
4
  //#region src/plugins/filter.d.ts
5
5
 
package/dist/plugins.mjs CHANGED
@@ -1,5 +1,6 @@
1
- import { createPlugin } from "./_chunks/plugin-Bqz9GKOA.mjs";
2
- import { extractionPlugin } from "./_chunks/extraction-BSOWm6fo.mjs";
3
- import { filterPlugin, frontmatterPlugin, isolateMainPlugin, readabilityPlugin, tailwindPlugin } from "./_chunks/plugins-TeB1_RYL.mjs";
1
+ import "./_chunks/const-BOAJ1T5c.mjs";
2
+ import { createPlugin } from "./_chunks/plugin-DrovQriD.mjs";
3
+ import { extractionPlugin } from "./_chunks/extraction-BPaDGYvv.mjs";
4
+ import { filterPlugin, frontmatterPlugin, isolateMainPlugin, readabilityPlugin, tailwindPlugin } from "./_chunks/plugins-C5_irVJs.mjs";
4
5
 
5
6
  export { createPlugin, extractionPlugin, filterPlugin, frontmatterPlugin, isolateMainPlugin, readabilityPlugin, tailwindPlugin };
@@ -1,4 +1,4 @@
1
- import { HTMLToMarkdownOptions } from "../_chunks/types-B94khc0C.mjs";
1
+ import { HTMLToMarkdownOptions } from "../_chunks/types-DqiI86yW.mjs";
2
2
 
3
3
  //#region src/preset/minimal.d.ts
4
4
 
@@ -1,6 +1,7 @@
1
- import "../_chunks/plugin-Bqz9GKOA.mjs";
2
- import "../_chunks/extraction-BSOWm6fo.mjs";
3
- import "../_chunks/plugins-TeB1_RYL.mjs";
4
- import { withMinimalPreset } from "../_chunks/minimal-DSW9dhXV.mjs";
1
+ import "../_chunks/const-BOAJ1T5c.mjs";
2
+ import "../_chunks/plugin-DrovQriD.mjs";
3
+ import "../_chunks/extraction-BPaDGYvv.mjs";
4
+ import "../_chunks/plugins-C5_irVJs.mjs";
5
+ import { withMinimalPreset } from "../_chunks/minimal-co1tIZYm.mjs";
5
6
 
6
7
  export { withMinimalPreset };
@@ -0,0 +1,11 @@
1
+ import { MarkdownChunk, SplitterOptions } from "./_chunks/types-DqiI86yW.mjs";
2
+
3
+ //#region src/splitter.d.ts
4
+
5
+ /**
6
+ * Convert HTML to Markdown and split into chunks in single pass
7
+ * Chunks are created during HTML event processing
8
+ */
9
+ declare function htmlToMarkdownSplitChunks(html: string, options?: SplitterOptions): MarkdownChunk[];
10
+ //#endregion
11
+ export { type MarkdownChunk, type SplitterOptions, htmlToMarkdownSplitChunks };
@@ -0,0 +1,202 @@
1
+ import { ELEMENT_NODE, NodeEventEnter, NodeEventExit, TAG_CODE, TAG_H1, TAG_H2, TAG_H3, TAG_H4, TAG_H5, TAG_H6, TAG_HR, TAG_PRE, TEXT_NODE } from "./_chunks/const-BOAJ1T5c.mjs";
2
+ import { createMarkdownProcessor, parseHtmlStream, processPluginsForEvent } from "./_chunks/markdown-processor-BWVPNlvD.mjs";
3
+
4
+ //#region src/splitter.ts
5
+ const DEFAULT_HEADERS_TO_SPLIT_ON = [
6
+ TAG_H2,
7
+ TAG_H3,
8
+ TAG_H4,
9
+ TAG_H5,
10
+ TAG_H6
11
+ ];
12
+ function createOptions(options) {
13
+ return {
14
+ headersToSplitOn: options.headersToSplitOn ?? DEFAULT_HEADERS_TO_SPLIT_ON,
15
+ returnEachLine: options.returnEachLine ?? false,
16
+ stripHeaders: options.stripHeaders ?? true,
17
+ chunkSize: options.chunkSize ?? 1e3,
18
+ chunkOverlap: options.chunkOverlap ?? 200,
19
+ lengthFunction: options.lengthFunction ?? ((text) => text.length),
20
+ keepSeparator: options.keepSeparator ?? false,
21
+ origin: options.origin,
22
+ plugins: options.plugins ?? []
23
+ };
24
+ }
25
+ function getCodeLanguage(node) {
26
+ const className = node.attributes?.class;
27
+ if (!className) return "";
28
+ const langParts = className.split(" ").map((c) => c.split("language-")[1]).filter(Boolean);
29
+ return langParts.length > 0 ? langParts[0].trim() : "";
30
+ }
31
+ function shouldSplitOnHeader(tagId, options) {
32
+ return options.headersToSplitOn.includes(tagId);
33
+ }
34
+ /**
35
+ * Get current markdown content WITHOUT clearing buffers
36
+ */
37
+ function getCurrentMarkdown(state) {
38
+ const fragments = [];
39
+ for (const [regionId, content] of state.regionContentBuffers.entries()) {
40
+ const include = state.regionToggles.get(regionId);
41
+ if (include) fragments.push(...content);
42
+ }
43
+ return fragments.join("").trimStart();
44
+ }
45
+ /**
46
+ * Convert HTML to Markdown and split into chunks in single pass
47
+ * Chunks are created during HTML event processing
48
+ */
49
+ function htmlToMarkdownSplitChunks(html, options = {}) {
50
+ const opts = createOptions(options);
51
+ if (opts.chunkOverlap >= opts.chunkSize) throw new Error("chunkOverlap must be less than chunkSize");
52
+ const processor = createMarkdownProcessor({
53
+ origin: opts.origin,
54
+ plugins: opts.plugins
55
+ });
56
+ const chunks = [];
57
+ const headerHierarchy = new Map();
58
+ const seenSplitHeaders = new Set();
59
+ let currentChunkCodeLanguage = "";
60
+ let collectingHeaderText = false;
61
+ let currentHeaderTagId = null;
62
+ let currentHeaderText = "";
63
+ let lineNumber = 1;
64
+ let lastChunkEndPosition = 0;
65
+ let lastSplitPosition = 0;
66
+ function flushChunk(endPosition) {
67
+ const currentMd = getCurrentMarkdown(processor.state);
68
+ const chunkEnd = endPosition ?? currentMd.length;
69
+ const chunkContent = currentMd.slice(lastChunkEndPosition, chunkEnd);
70
+ if (!chunkContent.trim()) {
71
+ lastChunkEndPosition = chunkEnd;
72
+ return;
73
+ }
74
+ const chunk = {
75
+ content: chunkContent.trimEnd(),
76
+ metadata: { loc: { lines: {
77
+ from: lineNumber,
78
+ to: lineNumber + (chunkContent.match(/\n/g) || []).length
79
+ } } }
80
+ };
81
+ if (headerHierarchy.size > 0) {
82
+ chunk.metadata.headers = {};
83
+ for (const [tagId, text] of headerHierarchy.entries()) {
84
+ const level = `h${tagId - TAG_H1 + 1}`;
85
+ chunk.metadata.headers[level] = text;
86
+ }
87
+ }
88
+ if (currentChunkCodeLanguage) chunk.metadata.code = currentChunkCodeLanguage;
89
+ chunks.push(chunk);
90
+ currentChunkCodeLanguage = "";
91
+ lastSplitPosition = chunkEnd;
92
+ if (opts.chunkOverlap > 0) {
93
+ const maxOverlap = Math.max(0, chunkContent.length - 1);
94
+ const actualOverlap = Math.min(opts.chunkOverlap, maxOverlap);
95
+ lastChunkEndPosition = chunkEnd - actualOverlap;
96
+ } else lastChunkEndPosition = chunkEnd;
97
+ lineNumber += (chunkContent.match(/\n/g) || []).length;
98
+ }
99
+ const parseState = {
100
+ depthMap: processor.state.depthMap,
101
+ depth: 0,
102
+ plugins: opts.plugins
103
+ };
104
+ parseHtmlStream(html, parseState, (event) => {
105
+ const { type: eventType, node } = event;
106
+ if (node.type === ELEMENT_NODE) {
107
+ const element = node;
108
+ const tagId = element.tagId;
109
+ if (tagId && tagId >= TAG_H1 && tagId <= TAG_H6) {
110
+ if (eventType === NodeEventEnter) {
111
+ collectingHeaderText = true;
112
+ currentHeaderTagId = tagId;
113
+ currentHeaderText = "";
114
+ if (shouldSplitOnHeader(tagId, opts)) {
115
+ if (seenSplitHeaders.has(tagId)) {
116
+ flushChunk();
117
+ for (let i = tagId; i <= TAG_H6; i++) headerHierarchy.delete(i);
118
+ }
119
+ seenSplitHeaders.add(tagId);
120
+ }
121
+ } else if (eventType === NodeEventExit && currentHeaderTagId === tagId) {
122
+ headerHierarchy.set(tagId, currentHeaderText.trim());
123
+ collectingHeaderText = false;
124
+ currentHeaderTagId = null;
125
+ }
126
+ }
127
+ if (tagId === TAG_CODE && element.depthMap[TAG_PRE] > 0) {
128
+ if (eventType === NodeEventEnter) {
129
+ const lang = getCodeLanguage(element);
130
+ if (lang && !currentChunkCodeLanguage) currentChunkCodeLanguage = lang;
131
+ }
132
+ }
133
+ if (tagId === TAG_HR && eventType === NodeEventEnter) flushChunk();
134
+ }
135
+ if (collectingHeaderText && node.type === TEXT_NODE) {
136
+ const textNode = node;
137
+ currentHeaderText += textNode.value;
138
+ }
139
+ processPluginsForEvent(event, opts.plugins, processor.state, processor.processEvent);
140
+ if (!opts.returnEachLine) {
141
+ const currentMd = getCurrentMarkdown(processor.state);
142
+ const currentChunkSize = opts.lengthFunction(currentMd.slice(lastChunkEndPosition));
143
+ if (currentChunkSize > opts.chunkSize) {
144
+ const idealSplitPos = lastChunkEndPosition + opts.chunkSize;
145
+ const separators = [
146
+ "\n\n",
147
+ "```\n",
148
+ "\n",
149
+ " "
150
+ ];
151
+ let splitPosition = -1;
152
+ for (const sep of separators) {
153
+ const idx = currentMd.lastIndexOf(sep, idealSplitPos);
154
+ const candidateSplitPos = idx + sep.length;
155
+ if (idx >= 0) {
156
+ const beforeSplit = currentMd.slice(0, candidateSplitPos);
157
+ let backtickCount = 0;
158
+ let pos = 0;
159
+ while ((pos = beforeSplit.indexOf("```", pos)) !== -1) {
160
+ backtickCount++;
161
+ pos += 3;
162
+ }
163
+ if (backtickCount % 2 === 1) continue;
164
+ }
165
+ if (idx >= 0 && candidateSplitPos > lastSplitPosition) {
166
+ splitPosition = candidateSplitPos;
167
+ break;
168
+ }
169
+ }
170
+ if (splitPosition === -1 || splitPosition <= lastChunkEndPosition) splitPosition = currentMd.length;
171
+ flushChunk(splitPosition);
172
+ }
173
+ }
174
+ });
175
+ flushChunk();
176
+ if (opts.returnEachLine && chunks.length > 0) {
177
+ const lineChunks = [];
178
+ for (const chunk of chunks) {
179
+ const lines = chunk.content.split("\n");
180
+ const chunkStartLine = chunk.metadata.loc?.lines.from || 1;
181
+ for (let i = 0; i < lines.length; i++) {
182
+ const line = lines[i];
183
+ if (line.trim()) lineChunks.push({
184
+ content: line,
185
+ metadata: {
186
+ ...chunk.metadata,
187
+ loc: { lines: {
188
+ from: chunkStartLine + i,
189
+ to: chunkStartLine + i
190
+ } }
191
+ }
192
+ });
193
+ }
194
+ }
195
+ return lineChunks;
196
+ }
197
+ if (opts.stripHeaders) for (const chunk of chunks) chunk.content = chunk.content.split("\n").filter((line) => !line.match(/^#{1,6}\s+/)).join("\n").trim();
198
+ return chunks.filter((chunk) => chunk.content.length > 0);
199
+ }
200
+
201
+ //#endregion
202
+ export { htmlToMarkdownSplitChunks };
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "mdream",
3
3
  "type": "module",
4
- "version": "0.12.3",
4
+ "version": "0.13.1",
5
5
  "description": "Ultra-performant HTML to Markdown Convertor Optimized for LLMs and llm.txt artifacts.",
6
6
  "author": {
7
7
  "name": "Harlan Wilton",
@@ -54,6 +54,14 @@
54
54
  "default": "./dist/preset/minimal.mjs"
55
55
  },
56
56
  "default": "./dist/preset/minimal.mjs"
57
+ },
58
+ "./splitter": {
59
+ "types": "./dist/splitter.d.mts",
60
+ "import": {
61
+ "types": "./dist/splitter.d.mts",
62
+ "default": "./dist/splitter.mjs"
63
+ },
64
+ "default": "./dist/splitter.mjs"
57
65
  }
58
66
  },
59
67
  "main": "./dist/index.mjs",