mdream 0.16.0 → 0.17.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +0 -9
- package/dist/_chunks/const.mjs +110 -228
- package/dist/_chunks/extraction.mjs +24 -1
- package/dist/_chunks/markdown-processor.mjs +256 -165
- package/dist/_chunks/plugin.mjs +7 -0
- package/dist/_chunks/{tailwind.mjs → plugins.mjs} +109 -26
- package/dist/_chunks/{stream.mjs → src.mjs} +16 -1
- package/dist/cli.mjs +7 -1
- package/dist/iife.js +3 -3
- package/dist/index.mjs +3 -8
- package/dist/llms-txt.mjs +91 -5
- package/dist/negotiate.d.mts +26 -0
- package/dist/negotiate.mjs +92 -0
- package/dist/plugins.mjs +2 -1
- package/dist/preset/minimal.mjs +28 -18
- package/dist/splitter.mjs +34 -19
- package/package.json +10 -2
package/dist/llms-txt.mjs
CHANGED
|
@@ -1,10 +1,19 @@
|
|
|
1
|
+
import "./_chunks/const.mjs";
|
|
1
2
|
import "./_chunks/markdown-processor.mjs";
|
|
2
|
-
import "./_chunks/
|
|
3
|
-
import { htmlToMarkdown } from "./
|
|
3
|
+
import "./_chunks/plugin.mjs";
|
|
4
|
+
import { t as htmlToMarkdown } from "./_chunks/src.mjs";
|
|
4
5
|
import { t as extractionPlugin } from "./_chunks/extraction.mjs";
|
|
5
6
|
import { mkdir, open, readFile } from "node:fs/promises";
|
|
6
7
|
import { basename, dirname, join, relative, sep } from "pathe";
|
|
7
8
|
import { glob } from "tinyglobby";
|
|
9
|
+
//#region src/llms-txt.ts
|
|
10
|
+
const FRONTMATTER_RE = /^---\n([\s\S]*?)\n---\n([\s\S]*)$/;
|
|
11
|
+
const ANCHOR_INVALID_CHARS_RE = /[^a-z0-9]/g;
|
|
12
|
+
const LEADING_SLASH_RE = /^\//;
|
|
13
|
+
const TRAILING_SLASH_RE = /\/$/;
|
|
14
|
+
/**
|
|
15
|
+
* Extract metadata from HTML content using mdream's extraction plugin
|
|
16
|
+
*/
|
|
8
17
|
function extractMetadata(html, url) {
|
|
9
18
|
let title = "";
|
|
10
19
|
let description = "";
|
|
@@ -40,6 +49,9 @@ function extractMetadata(html, url) {
|
|
|
40
49
|
author: author || void 0
|
|
41
50
|
};
|
|
42
51
|
}
|
|
52
|
+
/**
|
|
53
|
+
* Convert file path to URL path
|
|
54
|
+
*/
|
|
43
55
|
function pathToUrl(filePath, baseDir) {
|
|
44
56
|
let url = relative(baseDir, filePath);
|
|
45
57
|
url = url.split(sep).join("/");
|
|
@@ -49,6 +61,9 @@ function pathToUrl(filePath, baseDir) {
|
|
|
49
61
|
if (!url.startsWith("/")) url = `/${url}`;
|
|
50
62
|
return url;
|
|
51
63
|
}
|
|
64
|
+
/**
|
|
65
|
+
* Process HTML files from glob patterns
|
|
66
|
+
*/
|
|
52
67
|
async function processHtmlFiles(patterns, origin) {
|
|
53
68
|
const allPatterns = Array.isArray(patterns) ? patterns : [patterns];
|
|
54
69
|
const allFiles = [];
|
|
@@ -76,6 +91,9 @@ async function processHtmlFiles(patterns, origin) {
|
|
|
76
91
|
}
|
|
77
92
|
return results;
|
|
78
93
|
}
|
|
94
|
+
/**
|
|
95
|
+
* Generate llms.txt content
|
|
96
|
+
*/
|
|
79
97
|
function generateLlmsTxtContent(files, options) {
|
|
80
98
|
const { siteName = "Site", description, origin = "", sections, notes } = options;
|
|
81
99
|
let content = `# ${siteName}\n\n`;
|
|
@@ -99,8 +117,11 @@ function generateLlmsTxtContent(files, options) {
|
|
|
99
117
|
if (notes) content += `\n${formatNotes(notes)}`;
|
|
100
118
|
return content;
|
|
101
119
|
}
|
|
120
|
+
/**
|
|
121
|
+
* Parse frontmatter from markdown content
|
|
122
|
+
*/
|
|
102
123
|
function parseFrontmatter(content) {
|
|
103
|
-
const match = content.match(
|
|
124
|
+
const match = content.match(FRONTMATTER_RE);
|
|
104
125
|
if (!match) return {
|
|
105
126
|
frontmatter: null,
|
|
106
127
|
body: content
|
|
@@ -121,11 +142,17 @@ function parseFrontmatter(content) {
|
|
|
121
142
|
body
|
|
122
143
|
};
|
|
123
144
|
}
|
|
145
|
+
/**
|
|
146
|
+
* Serialize frontmatter object to YAML-like format
|
|
147
|
+
*/
|
|
124
148
|
function serializeFrontmatter(data) {
|
|
125
149
|
const lines = [];
|
|
126
150
|
for (const [key, value] of Object.entries(data)) if (value !== void 0 && value !== null) lines.push(`${key}: ${String(value)}`);
|
|
127
151
|
return lines.join("\n");
|
|
128
152
|
}
|
|
153
|
+
/**
|
|
154
|
+
* Generate llms-full.txt content with complete page content
|
|
155
|
+
*/
|
|
129
156
|
function generateLlmsFullTxtContent(files, options) {
|
|
130
157
|
const { siteName = "Site", description, origin = "", sections, notes } = options;
|
|
131
158
|
let content = `# ${siteName}\n\n`;
|
|
@@ -135,7 +162,7 @@ function generateLlmsFullTxtContent(files, options) {
|
|
|
135
162
|
if (files.length > 0) {
|
|
136
163
|
content += `## Table of Contents\n\n`;
|
|
137
164
|
for (const file of files) {
|
|
138
|
-
const anchor = file.title.toLowerCase().replace(
|
|
165
|
+
const anchor = file.title.toLowerCase().replace(ANCHOR_INVALID_CHARS_RE, "-");
|
|
139
166
|
content += `- [${file.title}](#${anchor})\n`;
|
|
140
167
|
}
|
|
141
168
|
content += `\n---\n\n`;
|
|
@@ -166,10 +193,13 @@ function generateLlmsFullTxtContent(files, options) {
|
|
|
166
193
|
if (notes) content += `\n${formatNotes(notes)}`;
|
|
167
194
|
return content;
|
|
168
195
|
}
|
|
196
|
+
/**
|
|
197
|
+
* Generate individual markdown files structure
|
|
198
|
+
*/
|
|
169
199
|
function generateMarkdownFilesContent(files) {
|
|
170
200
|
const markdownFiles = [];
|
|
171
201
|
for (const file of files) {
|
|
172
|
-
const mdPath = `md/${file.url === "/" ? "index" : file.url.replace(
|
|
202
|
+
const mdPath = `md/${file.url === "/" ? "index" : file.url.replace(LEADING_SLASH_RE, "").replace(TRAILING_SLASH_RE, "")}.md`;
|
|
173
203
|
markdownFiles.push({
|
|
174
204
|
path: mdPath,
|
|
175
205
|
content: file.content
|
|
@@ -177,6 +207,9 @@ function generateMarkdownFilesContent(files) {
|
|
|
177
207
|
}
|
|
178
208
|
return markdownFiles;
|
|
179
209
|
}
|
|
210
|
+
/**
|
|
211
|
+
* Main function to process files and generate llms.txt artifacts
|
|
212
|
+
*/
|
|
180
213
|
async function generateLlmsTxtArtifacts(options) {
|
|
181
214
|
let files;
|
|
182
215
|
if (options.files) files = options.files;
|
|
@@ -194,6 +227,9 @@ async function generateLlmsTxtArtifacts(options) {
|
|
|
194
227
|
processedFiles: files
|
|
195
228
|
};
|
|
196
229
|
}
|
|
230
|
+
/**
|
|
231
|
+
* Format a section with title, description, and links
|
|
232
|
+
*/
|
|
197
233
|
function formatSection(section) {
|
|
198
234
|
let content = `## ${section.title}\n\n`;
|
|
199
235
|
if (section.description) {
|
|
@@ -209,18 +245,67 @@ function formatSection(section) {
|
|
|
209
245
|
}
|
|
210
246
|
return content;
|
|
211
247
|
}
|
|
248
|
+
/**
|
|
249
|
+
* Format notes section
|
|
250
|
+
*/
|
|
212
251
|
function formatNotes(notes) {
|
|
213
252
|
const noteLines = Array.isArray(notes) ? notes : [notes];
|
|
214
253
|
let content = "";
|
|
215
254
|
for (const note of noteLines) content += `${note}\n\n`;
|
|
216
255
|
return content;
|
|
217
256
|
}
|
|
257
|
+
/**
|
|
258
|
+
* Create a WritableStream that generates llms.txt artifacts by streaming pages to disk
|
|
259
|
+
*
|
|
260
|
+
* Writes llms.txt (and optionally llms-full.txt) incrementally as pages are written,
|
|
261
|
+
* never keeping full content in memory. Creates outputDir recursively if needed.
|
|
262
|
+
*
|
|
263
|
+
* @example
|
|
264
|
+
* ```typescript
|
|
265
|
+
* const stream = createLlmsTxtStream({
|
|
266
|
+
* siteName: 'My Docs',
|
|
267
|
+
* description: 'Documentation site',
|
|
268
|
+
* origin: 'https://example.com',
|
|
269
|
+
* generateFull: true,
|
|
270
|
+
* outputDir: './dist',
|
|
271
|
+
* sections: [
|
|
272
|
+
* {
|
|
273
|
+
* title: 'Getting Started',
|
|
274
|
+
* description: 'Quick start guide',
|
|
275
|
+
* links: [
|
|
276
|
+
* { title: 'Installation', href: '/install', description: 'How to install' },
|
|
277
|
+
* { title: 'Quick Start', href: '/quickstart' },
|
|
278
|
+
* ],
|
|
279
|
+
* },
|
|
280
|
+
* ],
|
|
281
|
+
* notes: ['Generated by mdream', 'Last updated: 2024'],
|
|
282
|
+
* })
|
|
283
|
+
*
|
|
284
|
+
* const writer = stream.getWriter()
|
|
285
|
+
* await writer.write({
|
|
286
|
+
* title: 'Home',
|
|
287
|
+
* content: '# Welcome\n\nHome page content.',
|
|
288
|
+
* url: '/',
|
|
289
|
+
* })
|
|
290
|
+
* await writer.close()
|
|
291
|
+
* ```
|
|
292
|
+
*
|
|
293
|
+
* @param options - Configuration options
|
|
294
|
+
* @returns WritableStream that accepts ProcessedFile objects
|
|
295
|
+
*/
|
|
296
|
+
/**
|
|
297
|
+
* Get group prefix for a URL (up to 2 segments)
|
|
298
|
+
*/
|
|
218
299
|
/**
 * Get group prefix for a URL (up to 2 segments)
 */
function getGroupPrefix(url, depth) {
	const parts = url.split("/").filter(Boolean);
	switch (parts.length) {
		case 0: return "/";
		case 1: return `/${parts[0]}`;
		default: return depth === 1 ? `/${parts[0]}` : `/${parts[0]}/${parts[1]}`;
	}
}
|
|
305
|
+
/**
|
|
306
|
+
* Sort pages by URL path in hierarchical order (directory tree structure)
|
|
307
|
+
* Groups by up to 2 segments, with root-level pages without nesting grouped together
|
|
308
|
+
*/
|
|
224
309
|
function sortPagesByPath(pages) {
|
|
225
310
|
const twoSegmentCount = /* @__PURE__ */ new Map();
|
|
226
311
|
for (const page of pages) {
|
|
@@ -380,4 +465,5 @@ function createLlmsTxtStream(options = {}) {
|
|
|
380
465
|
}
|
|
381
466
|
});
|
|
382
467
|
}
|
|
468
|
+
//#endregion
|
|
383
469
|
export { createLlmsTxtStream, generateLlmsTxtArtifacts };
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
//#region src/negotiate.d.ts
|
|
2
|
+
interface AcceptEntry {
|
|
3
|
+
type: string;
|
|
4
|
+
q: number;
|
|
5
|
+
position: number;
|
|
6
|
+
}
|
|
7
|
+
/**
|
|
8
|
+
* Parse an HTTP Accept header into an ordered list of media types with quality values.
|
|
9
|
+
* Supports quality weights (q=0.9) and preserves original position for tie-breaking.
|
|
10
|
+
*/
|
|
11
|
+
declare function parseAcceptHeader(accept: string): AcceptEntry[];
|
|
12
|
+
/**
|
|
13
|
+
* Determine if a client prefers markdown over HTML using proper content negotiation.
|
|
14
|
+
*
|
|
15
|
+
* Uses Accept header quality weights and position ordering:
|
|
16
|
+
* - If text/markdown or text/plain has higher quality than text/html → markdown
|
|
17
|
+
* - If same quality, earlier position in Accept header wins
|
|
18
|
+
* - Bare wildcard does NOT trigger markdown (prevents breaking OG crawlers)
|
|
19
|
+
* - sec-fetch-dest: document always returns false (browser navigation)
|
|
20
|
+
*
|
|
21
|
+
* @param acceptHeader - The HTTP Accept header value
|
|
22
|
+
* @param secFetchDest - The Sec-Fetch-Dest header value
|
|
23
|
+
*/
|
|
24
|
+
declare function shouldServeMarkdown(acceptHeader?: string, secFetchDest?: string): boolean;
|
|
25
|
+
//#endregion
|
|
26
|
+
export { parseAcceptHeader, shouldServeMarkdown };
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
//#region src/negotiate.ts
|
|
2
|
+
/**
|
|
3
|
+
* Parse an HTTP Accept header into an ordered list of media types with quality values.
|
|
4
|
+
* Supports quality weights (q=0.9) and preserves original position for tie-breaking.
|
|
5
|
+
*/
|
|
6
|
+
/**
 * Parse an HTTP Accept header into an ordered list of media types with quality values.
 * Supports quality weights (q=0.9) and preserves original position for tie-breaking.
 */
function parseAcceptHeader(accept) {
	if (!accept) return [];
	const parts = accept.split(",");
	const entries = [];
	// `position` is the comma-separated index, counting empty segments too.
	for (let position = 0; position < parts.length; position++) {
		const raw = parts[position].trim();
		if (raw === "") continue;
		const sep = raw.indexOf(";");
		const type = sep === -1 ? raw : raw.slice(0, sep).trim();
		let q = 1;
		if (sep !== -1) {
			const params = raw.slice(sep + 1);
			const qAt = params.indexOf("q=");
			if (qAt !== -1) {
				// Consume the q value up to the next ";" or " ".
				let end = qAt + 2;
				while (end < params.length && params[end] !== ";" && params[end] !== " ") end++;
				// Unparseable q values (NaN) fall back to 0.
				q = Number(params.slice(qAt + 2, end)) || 0;
			}
		}
		entries.push({
			type,
			q,
			position
		});
	}
	return entries;
}
|
|
36
|
+
/**
|
|
37
|
+
* Determine if a client prefers markdown over HTML using proper content negotiation.
|
|
38
|
+
*
|
|
39
|
+
* Uses Accept header quality weights and position ordering:
|
|
40
|
+
* - If text/markdown or text/plain has higher quality than text/html → markdown
|
|
41
|
+
* - If same quality, earlier position in Accept header wins
|
|
42
|
+
* - Bare wildcard does NOT trigger markdown (prevents breaking OG crawlers)
|
|
43
|
+
* - sec-fetch-dest: document always returns false (browser navigation)
|
|
44
|
+
*
|
|
45
|
+
* @param acceptHeader - The HTTP Accept header value
|
|
46
|
+
* @param secFetchDest - The Sec-Fetch-Dest header value
|
|
47
|
+
*/
|
|
48
|
+
/**
 * Determine if a client prefers markdown over HTML using proper content negotiation.
 *
 * Uses Accept header quality weights and position ordering:
 * - If text/markdown or text/plain has higher quality than text/html → markdown
 * - If same quality, earlier position in Accept header wins
 * - Bare wildcard does NOT trigger markdown (prevents breaking OG crawlers)
 * - sec-fetch-dest: document always returns false (browser navigation)
 *
 * @param acceptHeader - The HTTP Accept header value
 * @param secFetchDest - The Sec-Fetch-Dest header value
 */
function shouldServeMarkdown(acceptHeader, secFetchDest) {
	// Browser navigations always get HTML.
	if (secFetchDest === "document") return false;
	if (!acceptHeader) return false;
	const tokens = acceptHeader.split(",");
	let mdQ = -1;
	let mdPos = -1;
	let htmlQ = -1;
	let htmlPos = -1;
	for (let pos = 0; pos < tokens.length; pos++) {
		const token = tokens[pos].trim();
		if (token === "") continue;
		const sep = token.indexOf(";");
		const type = sep === -1 ? token : token.slice(0, sep).trim();
		let q = 1;
		if (sep !== -1) {
			const params = token.slice(sep + 1);
			const qAt = params.indexOf("q=");
			if (qAt !== -1) {
				// Consume the q value up to the next ";" or " "; NaN falls back to 0.
				let end = qAt + 2;
				while (end < params.length && params[end] !== ";" && params[end] !== " ") end++;
				q = Number(params.slice(qAt + 2, end)) || 0;
			}
		}
		if (type === "text/markdown" || type === "text/plain") {
			// Keep the best-quality markdown entry; earliest position wins ties.
			if (q > mdQ || (q === mdQ && (mdPos === -1 || pos < mdPos))) {
				mdQ = q;
				mdPos = pos;
			}
		} else if (type === "text/html") {
			htmlQ = q;
			htmlPos = pos;
		}
	}
	if (mdPos === -1) return false;
	if (htmlPos === -1) return true;
	return mdQ > htmlQ || (mdQ === htmlQ && mdPos < htmlPos);
}
|
|
91
|
+
//#endregion
|
|
92
|
+
export { parseAcceptHeader, shouldServeMarkdown };
|
package/dist/plugins.mjs
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
|
+
import "./_chunks/const.mjs";
|
|
1
2
|
import { t as createPlugin } from "./_chunks/plugin.mjs";
|
|
2
3
|
import { t as extractionPlugin } from "./_chunks/extraction.mjs";
|
|
3
|
-
import { i as filterPlugin, n as isolateMainPlugin, r as frontmatterPlugin, t as tailwindPlugin } from "./_chunks/
|
|
4
|
+
import { i as filterPlugin, n as isolateMainPlugin, r as frontmatterPlugin, t as tailwindPlugin } from "./_chunks/plugins.mjs";
|
|
4
5
|
export { createPlugin, extractionPlugin, filterPlugin, frontmatterPlugin, isolateMainPlugin, tailwindPlugin };
|
package/dist/preset/minimal.mjs
CHANGED
|
@@ -1,30 +1,40 @@
|
|
|
1
|
-
import
|
|
2
|
-
import
|
|
1
|
+
import "../_chunks/const.mjs";
|
|
2
|
+
import "../_chunks/plugin.mjs";
|
|
3
|
+
import "../_chunks/extraction.mjs";
|
|
4
|
+
import { i as filterPlugin, n as isolateMainPlugin, r as frontmatterPlugin, t as tailwindPlugin } from "../_chunks/plugins.mjs";
|
|
5
|
+
//#region src/preset/minimal.ts
|
|
6
|
+
/**
|
|
7
|
+
* Creates a configurable minimal preset with advanced options
|
|
8
|
+
*
|
|
9
|
+
* @param options HTML to Markdown options
|
|
10
|
+
* @returns HTML to Markdown options with configured plugins
|
|
11
|
+
*/
|
|
3
12
|
/**
 * Creates a configurable minimal preset with advanced options
 *
 * @param options HTML to Markdown options
 * @returns HTML to Markdown options with configured plugins
 */
function withMinimalPreset(options = {}) {
	// Tag ids excluded from output by the filter plugin (build-time constants).
	const filter = filterPlugin({ exclude: [
		40,
		68,
		103,
		58,
		47,
		88,
		73,
		59,
		66,
		65,
		43,
		41
	] });
	// User plugins run after the preset plugins; the filter always runs last.
	const plugins = [
		frontmatterPlugin(),
		isolateMainPlugin(),
		tailwindPlugin(),
		...options.plugins || [],
		filter
	];
	return {
		...options,
		plugins
	};
}
|
|
39
|
+
//#endregion
|
|
30
40
|
export { withMinimalPreset };
|
package/dist/splitter.mjs
CHANGED
|
@@ -1,11 +1,14 @@
|
|
|
1
|
-
import
|
|
1
|
+
import "./_chunks/const.mjs";
|
|
2
2
|
import { a as parseHtmlStream, n as createMarkdownProcessor, r as processPluginsForEvent } from "./_chunks/markdown-processor.mjs";
|
|
3
|
+
//#region src/splitter.ts
|
|
4
|
+
const MARKDOWN_HEADER_LINE_RE = /^#{1,6}\s+/;
|
|
5
|
+
const NEWLINE_RE = /\n/g;
|
|
3
6
|
const DEFAULT_HEADERS_TO_SPLIT_ON = [
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
7
|
+
8,
|
|
8
|
+
9,
|
|
9
|
+
10,
|
|
10
|
+
11,
|
|
11
|
+
12
|
|
9
12
|
];
|
|
10
13
|
function createOptions(options) {
|
|
11
14
|
return {
|
|
@@ -29,9 +32,16 @@ function getCodeLanguage(node) {
|
|
|
29
32
|
/**
 * Whether a header tag id is configured as a chunk-split boundary.
 */
function shouldSplitOnHeader(tagId, options) {
	const { headersToSplitOn: splitTargets } = options;
	return splitTargets.includes(tagId);
}
|
|
35
|
+
/**
|
|
36
|
+
* Get current markdown content WITHOUT clearing buffers
|
|
37
|
+
*/
|
|
32
38
|
/**
 * Get current markdown content WITHOUT clearing buffers
 */
function getCurrentMarkdown(state) {
	const joined = state.buffer.join("");
	return joined.trimStart();
}
|
|
41
|
+
/**
|
|
42
|
+
* Convert HTML to Markdown and split into chunks in single pass
|
|
43
|
+
* Yields chunks during HTML event processing for better memory efficiency
|
|
44
|
+
*/
|
|
35
45
|
function* htmlToMarkdownSplitChunksStream(html, options = {}) {
|
|
36
46
|
const opts = createOptions(options);
|
|
37
47
|
if (opts.chunkOverlap >= opts.chunkSize) throw new Error("chunkOverlap must be less than chunkSize");
|
|
@@ -58,7 +68,7 @@ function* htmlToMarkdownSplitChunksStream(html, options = {}) {
|
|
|
58
68
|
}
|
|
59
69
|
let chunkContent = originalChunkContent;
|
|
60
70
|
if (opts.stripHeaders) {
|
|
61
|
-
chunkContent = chunkContent.split("\n").filter((line) => !
|
|
71
|
+
chunkContent = chunkContent.split("\n").filter((line) => !MARKDOWN_HEADER_LINE_RE.test(line)).join("\n").trim();
|
|
62
72
|
if (!chunkContent) {
|
|
63
73
|
lastChunkEndPosition = chunkEnd;
|
|
64
74
|
return;
|
|
@@ -68,13 +78,13 @@ function* htmlToMarkdownSplitChunksStream(html, options = {}) {
|
|
|
68
78
|
content: chunkContent.trimEnd(),
|
|
69
79
|
metadata: { loc: { lines: {
|
|
70
80
|
from: lineNumber,
|
|
71
|
-
to: lineNumber + (originalChunkContent.match(
|
|
81
|
+
to: lineNumber + (originalChunkContent.match(NEWLINE_RE) || []).length
|
|
72
82
|
} } }
|
|
73
83
|
};
|
|
74
84
|
if (headerHierarchy.size > 0) {
|
|
75
85
|
chunk.metadata.headers = {};
|
|
76
86
|
for (const [tagId, text] of headerHierarchy.entries()) {
|
|
77
|
-
const level = `h${tagId -
|
|
87
|
+
const level = `h${tagId - 7 + 1}`;
|
|
78
88
|
chunk.metadata.headers[level] = text;
|
|
79
89
|
}
|
|
80
90
|
}
|
|
@@ -86,7 +96,7 @@ function* htmlToMarkdownSplitChunksStream(html, options = {}) {
|
|
|
86
96
|
const maxOverlap = Math.max(0, originalChunkContent.length - 1);
|
|
87
97
|
lastChunkEndPosition = chunkEnd - Math.min(opts.chunkOverlap, maxOverlap);
|
|
88
98
|
} else lastChunkEndPosition = chunkEnd;
|
|
89
|
-
lineNumber += (originalChunkContent.match(
|
|
99
|
+
lineNumber += (originalChunkContent.match(NEWLINE_RE) || []).length;
|
|
90
100
|
}
|
|
91
101
|
const parseState = {
|
|
92
102
|
depthMap: processor.state.depthMap,
|
|
@@ -99,36 +109,36 @@ function* htmlToMarkdownSplitChunksStream(html, options = {}) {
|
|
|
99
109
|
});
|
|
100
110
|
for (const event of eventBuffer) {
|
|
101
111
|
const { type: eventType, node } = event;
|
|
102
|
-
if (node.type ===
|
|
112
|
+
if (node.type === 1) {
|
|
103
113
|
const element = node;
|
|
104
114
|
const tagId = element.tagId;
|
|
105
|
-
if (tagId && tagId >=
|
|
106
|
-
if (eventType ===
|
|
115
|
+
if (tagId && tagId >= 7 && tagId <= 12) {
|
|
116
|
+
if (eventType === 0) {
|
|
107
117
|
collectingHeaderText = true;
|
|
108
118
|
currentHeaderTagId = tagId;
|
|
109
119
|
currentHeaderText = "";
|
|
110
120
|
if (shouldSplitOnHeader(tagId, opts)) {
|
|
111
121
|
if (seenSplitHeaders.has(tagId)) {
|
|
112
122
|
yield* flushChunk();
|
|
113
|
-
for (let i = tagId; i <=
|
|
123
|
+
for (let i = tagId; i <= 12; i++) headerHierarchy.delete(i);
|
|
114
124
|
}
|
|
115
125
|
seenSplitHeaders.add(tagId);
|
|
116
126
|
}
|
|
117
|
-
} else if (eventType ===
|
|
127
|
+
} else if (eventType === 1 && currentHeaderTagId === tagId) {
|
|
118
128
|
headerHierarchy.set(tagId, currentHeaderText.trim());
|
|
119
129
|
collectingHeaderText = false;
|
|
120
130
|
currentHeaderTagId = null;
|
|
121
131
|
}
|
|
122
132
|
}
|
|
123
|
-
if (tagId ===
|
|
124
|
-
if (eventType ===
|
|
133
|
+
if (tagId === 23 && element.depthMap[34] > 0) {
|
|
134
|
+
if (eventType === 0) {
|
|
125
135
|
const lang = getCodeLanguage(element);
|
|
126
136
|
if (lang && !currentChunkCodeLanguage) currentChunkCodeLanguage = lang;
|
|
127
137
|
}
|
|
128
138
|
}
|
|
129
|
-
if (tagId ===
|
|
139
|
+
if (tagId === 13 && eventType === 0) yield* flushChunk();
|
|
130
140
|
}
|
|
131
|
-
if (collectingHeaderText && node.type ===
|
|
141
|
+
if (collectingHeaderText && node.type === 2) currentHeaderText += node.value;
|
|
132
142
|
processPluginsForEvent(event, opts.plugins, processor.state, processor.processEvent);
|
|
133
143
|
if (!opts.returnEachLine) {
|
|
134
144
|
const currentMd = getCurrentMarkdown(processor.state);
|
|
@@ -166,6 +176,10 @@ function* htmlToMarkdownSplitChunksStream(html, options = {}) {
|
|
|
166
176
|
}
|
|
167
177
|
yield* flushChunk();
|
|
168
178
|
}
|
|
179
|
+
/**
|
|
180
|
+
* Convert HTML to Markdown and split into chunks in single pass
|
|
181
|
+
* Chunks are created during HTML event processing
|
|
182
|
+
*/
|
|
169
183
|
function htmlToMarkdownSplitChunks(html, options = {}) {
|
|
170
184
|
const opts = createOptions(options);
|
|
171
185
|
const chunks = [];
|
|
@@ -193,4 +207,5 @@ function htmlToMarkdownSplitChunks(html, options = {}) {
|
|
|
193
207
|
}
|
|
194
208
|
return chunks;
|
|
195
209
|
}
|
|
210
|
+
//#endregion
|
|
196
211
|
export { htmlToMarkdownSplitChunks, htmlToMarkdownSplitChunksStream };
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "mdream",
|
|
3
3
|
"type": "module",
|
|
4
|
-
"version": "0.
|
|
4
|
+
"version": "0.17.0",
|
|
5
5
|
"description": "Ultra-performant HTML to Markdown Convertor Optimized for LLMs and llm.txt artifacts.",
|
|
6
6
|
"author": {
|
|
7
7
|
"name": "Harlan Wilton",
|
|
@@ -39,6 +39,14 @@
|
|
|
39
39
|
},
|
|
40
40
|
"default": "./dist/cli.mjs"
|
|
41
41
|
},
|
|
42
|
+
"./negotiate": {
|
|
43
|
+
"types": "./dist/negotiate.d.mts",
|
|
44
|
+
"import": {
|
|
45
|
+
"types": "./dist/negotiate.d.mts",
|
|
46
|
+
"default": "./dist/negotiate.mjs"
|
|
47
|
+
},
|
|
48
|
+
"default": "./dist/negotiate.mjs"
|
|
49
|
+
},
|
|
42
50
|
"./plugins": {
|
|
43
51
|
"types": "./dist/plugins.d.mts",
|
|
44
52
|
"import": {
|
|
@@ -77,7 +85,7 @@
|
|
|
77
85
|
],
|
|
78
86
|
"browser": "./dist/iife.js",
|
|
79
87
|
"dependencies": {
|
|
80
|
-
"cac": "^
|
|
88
|
+
"cac": "^7.0.0",
|
|
81
89
|
"pathe": "^2.0.3",
|
|
82
90
|
"tinyglobby": "^0.2.15"
|
|
83
91
|
},
|