npm - mdream - Versions diffs - 0.13.3 → 0.15.0 - Mend

mdream 0.13.3 → 0.15.0

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (24)

package/README.md +111 -0
package/dist/_chunks/{const-BOAJ1T5c.mjs → const-Bf_XN9U9.mjs} +2 -5
package/dist/_chunks/{extraction-BPaDGYvv.mjs → extraction-BA9MDtq3.mjs} +4 -6
package/dist/_chunks/{llms-txt-DC12yO2l.mjs → llms-txt-T79S7X24.mjs} +123 -35
package/dist/_chunks/{markdown-processor-f7XT0--8.mjs → markdown-processor-D26Uo5td.mjs} +35 -64
package/dist/_chunks/{minimal-co1tIZYm.mjs → minimal-BiDhcwif.mjs} +3 -3
package/dist/_chunks/{plugin-DrovQriD.mjs → plugin-CjWWQTuL.mjs} +1 -1
package/dist/_chunks/{plugin-CgnpSqtP.d.mts → plugin-D5soyEXm.d.mts} +2 -2
package/dist/_chunks/{plugins-C5_irVJs.mjs → plugins-DJnqR2fA.mjs} +23 -41
package/dist/_chunks/{src-C3QpB75q.mjs → src-BJpipdul.mjs} +3 -4
package/dist/_chunks/{types-DqiI86yW.d.mts → types-CT4ZxeOH.d.mts} +1 -1
package/dist/cli.mjs +12 -18
package/dist/iife.js +8 -18
package/dist/index.d.mts +2 -5
package/dist/index.mjs +4 -4
package/dist/llms-txt.d.mts +45 -1
package/dist/llms-txt.mjs +7 -7
package/dist/plugins.d.mts +2 -2
package/dist/plugins.mjs +4 -4
package/dist/preset/minimal.d.mts +1 -1
package/dist/preset/minimal.mjs +5 -5
package/dist/splitter.d.mts +7 -2
package/dist/splitter.mjs +45 -34
package/package.json +1 -1

package/dist/splitter.mjs CHANGED Viewed

@@ -1,5 +1,5 @@
-import { ELEMENT_NODE, NodeEventEnter, NodeEventExit, TAG_CODE, TAG_H1, TAG_H2, TAG_H3, TAG_H4, TAG_H5, TAG_H6, TAG_HR, TAG_PRE, TEXT_NODE } from "./_chunks/const-BOAJ1T5c.mjs";
-import { createMarkdownProcessor, parseHtmlStream, processPluginsForEvent } from "./_chunks/markdown-processor-f7XT0--8.mjs";
+import { $ as TAG_H2, F as TAG_CODE, Nt as TAG_PRE, Q as TAG_H1, et as TAG_H3, gn as TEXT_NODE, h as NodeEventExit, m as NodeEventEnter, nt as TAG_H5, ot as TAG_HR, r as ELEMENT_NODE, rt as TAG_H6, tt as TAG_H4 } from "./_chunks/const-Bf_XN9U9.mjs";
+import { a as parseHtmlStream, n as createMarkdownProcessor, r as processPluginsForEvent } from "./_chunks/markdown-processor-D26Uo5td.mjs";
 //#region src/splitter.ts
 const DEFAULT_HEADERS_TO_SPLIT_ON = [
@@ -36,26 +36,22 @@ function shouldSplitOnHeader(tagId, options) {
 */
 function getCurrentMarkdown(state) {
 	const fragments = [];
-	for (const [regionId, content] of state.regionContentBuffers.entries()) {
-		const include = state.regionToggles.get(regionId);
-		if (include) fragments.push(...content);
-	}
+	for (const [regionId, content] of state.regionContentBuffers.entries()) if (state.regionToggles.get(regionId)) fragments.push(...content);
 	return fragments.join("").trimStart();
 }
 /**
 * Convert HTML to Markdown and split into chunks in single pass
-* Chunks are created during HTML event processing
+* Yields chunks during HTML event processing for better memory efficiency
 */
-function htmlToMarkdownSplitChunks(html, options = {}) {
+function* htmlToMarkdownSplitChunksStream(html, options = {}) {
 	const opts = createOptions(options);
 	if (opts.chunkOverlap >= opts.chunkSize) throw new Error("chunkOverlap must be less than chunkSize");
 	const processor = createMarkdownProcessor({
 		origin: opts.origin,
 		plugins: opts.plugins
 	});
-	const chunks = [];
-	const headerHierarchy = new Map();
-	const seenSplitHeaders = new Set();
+	const headerHierarchy = /* @__PURE__ */ new Map();
+	const seenSplitHeaders = /* @__PURE__ */ new Set();
 	let currentChunkCodeLanguage = "";
 	let collectingHeaderText = false;
 	let currentHeaderTagId = null;
@@ -63,19 +59,27 @@ function htmlToMarkdownSplitChunks(html, options = {}) {
 	let lineNumber = 1;
 	let lastChunkEndPosition = 0;
 	let lastSplitPosition = 0;
-	function flushChunk(endPosition, applyOverlap = false) {
+	function* flushChunk(endPosition, applyOverlap = false) {
 		const currentMd = getCurrentMarkdown(processor.state);
 		const chunkEnd = endPosition ?? currentMd.length;
-		const chunkContent = currentMd.slice(lastChunkEndPosition, chunkEnd);
-		if (!chunkContent.trim()) {
+		const originalChunkContent = currentMd.slice(lastChunkEndPosition, chunkEnd);
+		if (!originalChunkContent.trim()) {
 			lastChunkEndPosition = chunkEnd;
 			return;
 		}
+		let chunkContent = originalChunkContent;
+		if (opts.stripHeaders) {
+			chunkContent = chunkContent.split("\n").filter((line) => !line.match(/^#{1,6}\s+/)).join("\n").trim();
+			if (!chunkContent) {
+				lastChunkEndPosition = chunkEnd;
+				return;
+			}
+		}
 		const chunk = {
 			content: chunkContent.trimEnd(),
 			metadata: { loc: { lines: {
 				from: lineNumber,
-				to: lineNumber + (chunkContent.match(/\n/g) || []).length
+				to: lineNumber + (originalChunkContent.match(/\n/g) || []).length
 			} } }
 		};
 		if (headerHierarchy.size > 0) {
@@ -86,22 +90,25 @@ function htmlToMarkdownSplitChunks(html, options = {}) {
 			}
 		}
 		if (currentChunkCodeLanguage) chunk.metadata.code = currentChunkCodeLanguage;
-		chunks.push(chunk);
+		yield chunk;
 		currentChunkCodeLanguage = "";
 		lastSplitPosition = chunkEnd;
 		if (applyOverlap && opts.chunkOverlap > 0) {
-			const maxOverlap = Math.max(0, chunkContent.length - 1);
-			const actualOverlap = Math.min(opts.chunkOverlap, maxOverlap);
-			lastChunkEndPosition = chunkEnd - actualOverlap;
+			const maxOverlap = Math.max(0, originalChunkContent.length - 1);
+			lastChunkEndPosition = chunkEnd - Math.min(opts.chunkOverlap, maxOverlap);
 		} else lastChunkEndPosition = chunkEnd;
-		lineNumber += (chunkContent.match(/\n/g) || []).length;
+		lineNumber += (originalChunkContent.match(/\n/g) || []).length;
 	}
 	const parseState = {
 		depthMap: processor.state.depthMap,
 		depth: 0,
 		plugins: opts.plugins
 	};
+	const eventBuffer = [];
 	parseHtmlStream(html, parseState, (event) => {
+		eventBuffer.push(event);
+	});
+	for (const event of eventBuffer) {
 		const { type: eventType, node } = event;
 		if (node.type === ELEMENT_NODE) {
 			const element = node;
@@ -113,7 +120,7 @@ function htmlToMarkdownSplitChunks(html, options = {}) {
 					currentHeaderText = "";
 					if (shouldSplitOnHeader(tagId, opts)) {
 						if (seenSplitHeaders.has(tagId)) {
-							flushChunk();
+							yield* flushChunk();
 							for (let i = tagId; i <= TAG_H6; i++) headerHierarchy.delete(i);
 						}
 						seenSplitHeaders.add(tagId);
@@ -130,17 +137,13 @@ function htmlToMarkdownSplitChunks(html, options = {}) {
 					if (lang && !currentChunkCodeLanguage) currentChunkCodeLanguage = lang;
 				}
 			}
-			if (tagId === TAG_HR && eventType === NodeEventEnter) flushChunk();
-		}
-		if (collectingHeaderText && node.type === TEXT_NODE) {
-			const textNode = node;
-			currentHeaderText += textNode.value;
+			if (tagId === TAG_HR && eventType === NodeEventEnter) yield* flushChunk();
 		}
+		if (collectingHeaderText && node.type === TEXT_NODE) currentHeaderText += node.value;
 		processPluginsForEvent(event, opts.plugins, processor.state, processor.processEvent);
 		if (!opts.returnEachLine) {
 			const currentMd = getCurrentMarkdown(processor.state);
-			const currentChunkSize = opts.lengthFunction(currentMd.slice(lastChunkEndPosition));
-			if (currentChunkSize > opts.chunkSize) {
+			if (opts.lengthFunction(currentMd.slice(lastChunkEndPosition)) > opts.chunkSize) {
 				const idealSplitPos = lastChunkEndPosition + opts.chunkSize;
 				const separators = [
 					"\n\n",
@@ -168,11 +171,20 @@ function htmlToMarkdownSplitChunks(html, options = {}) {
 					}
 				}
 				if (splitPosition === -1 || splitPosition <= lastChunkEndPosition) splitPosition = currentMd.length;
-				flushChunk(splitPosition, true);
+				yield* flushChunk(splitPosition, true);
 			}
 		}
-	});
-	flushChunk();
+	}
+	yield* flushChunk();
+}
+/**
+* Convert HTML to Markdown and split into chunks in single pass
+* Chunks are created during HTML event processing
+*/
+function htmlToMarkdownSplitChunks(html, options = {}) {
+	const opts = createOptions(options);
+	const chunks = [];
+	for (const chunk of htmlToMarkdownSplitChunksStream(html, options)) chunks.push(chunk);
 	if (opts.returnEachLine && chunks.length > 0) {
 		const lineChunks = [];
 		for (const chunk of chunks) {
@@ -194,9 +206,8 @@ function htmlToMarkdownSplitChunks(html, options = {}) {
 		}
 		return lineChunks;
 	}
-	if (opts.stripHeaders) for (const chunk of chunks) chunk.content = chunk.content.split("\n").filter((line) => !line.match(/^#{1,6}\s+/)).join("\n").trim();
-	return chunks.filter((chunk) => chunk.content.length > 0);
+	return chunks;
 }
 //#endregion
-export { htmlToMarkdownSplitChunks };
+export { htmlToMarkdownSplitChunks, htmlToMarkdownSplitChunksStream };

package/package.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "name": "mdream",
   "type": "module",
-  "version": "0.13.3",
+  "version": "0.15.0",
   "description": "Ultra-performant HTML to Markdown Convertor Optimized for LLMs and llm.txt artifacts.",
   "author": {
     "name": "Harlan Wilton",