mdream 0.15.0 → 0.15.2

package/README.md CHANGED
@@ -422,6 +422,102 @@ const chunks = htmlToMarkdownSplitChunks(html, withMinimalPreset({
  }))
  ```

+ ## llms.txt Generation
+
+ Generate [llms.txt](https://llmstxt.org) files from HTML content for improved LLM discoverability. Mdream provides both streaming and batch APIs for creating llms.txt artifacts.
+
+ ### createLlmsTxtStream
+
+ Stream llms.txt generation without keeping full content in memory:
+
+ ```ts
+ import { createLlmsTxtStream } from 'mdream'
+
+ const stream = createLlmsTxtStream({
+   siteName: 'My Docs',
+   description: 'Documentation site',
+   origin: 'https://example.com',
+   outputDir: './dist',
+   generateFull: true, // Also generate llms-full.txt
+   sections: [
+     {
+       title: 'Getting Started',
+       description: 'Quick start guide',
+       links: [
+         { title: 'Installation', href: '/install', description: 'How to install' },
+         { title: 'Quick Start', href: '/quickstart' },
+       ],
+     },
+   ],
+   notes: ['Generated by mdream', 'Last updated: 2024'],
+ })
+
+ const writer = stream.getWriter()
+ await writer.write({
+   title: 'Home',
+   content: '# Welcome\n\nHome page content.',
+   url: '/',
+   metadata: {
+     description: 'Welcome page',
+   },
+ })
+ await writer.close()
+ ```
+
+ This creates:
+ - `llms.txt` - Links to all pages with metadata
+ - `llms-full.txt` - Complete content with frontmatter (if `generateFull: true`)
+
+ ### generateLlmsTxtArtifacts
+
+ Process HTML files or ProcessedFile objects:
+
+ ```ts
+ import { generateLlmsTxtArtifacts } from 'mdream'
+
+ const result = await generateLlmsTxtArtifacts({
+   patterns: '**/*.html', // Glob pattern for HTML files
+   siteName: 'My Site',
+   origin: 'https://example.com',
+   generateFull: true,
+   sections: [
+     {
+       title: 'Resources',
+       links: [
+         { title: 'Docs', href: '/docs' },
+       ],
+     },
+   ],
+   notes: 'Footer notes',
+ })
+
+ console.log(result.llmsTxt) // llms.txt content
+ console.log(result.llmsFullTxt) // llms-full.txt content
+ console.log(result.processedFiles) // Array of processed files
+ ```
+
+ ### Structure
+
+ llms.txt follows this structure:
+
+ ```markdown
+ # Site Name
+
+ > Site description
+
+ ## Custom Section
+
+ Section description
+
+ - [Link Title](url): Optional description
+
+ ## Pages
+
+ - [Page Title](url): Page description
+
+ Custom notes
+ ```
+
  ## Credits

  - [ultrahtml](https://github.com/natemoo-re/ultrahtml): HTML parsing inspiration
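The README examples above feed `generateLlmsTxtArtifacts` a glob pattern; the `files` option on `LlmsTxtArtifactsOptions` (see the type changes further down in this diff) accepts already-processed pages instead. A minimal sketch, assuming the markdown content was produced elsewhere:

```ts
import { generateLlmsTxtArtifacts } from 'mdream'

// ProcessedFile objects passed directly instead of scanning a glob.
const result = await generateLlmsTxtArtifacts({
  files: [
    {
      title: 'Home',
      url: '/',
      content: '# Welcome\n\nHome page content.',
      metadata: { description: 'Welcome page' },
    },
    {
      title: 'Installation',
      url: '/install',
      content: '# Installation\n\nHow to install.',
    },
  ],
  siteName: 'My Docs',
  origin: 'https://example.com',
  notes: 'Generated by mdream',
})

console.log(result.llmsTxt)
```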
@@ -89,9 +89,11 @@ async function processHtmlFiles(patterns, origin) {
   * Generate llms.txt content
   */
  function generateLlmsTxtContent(files, options) {
-   const { siteName = "Site", description, origin = "" } = options;
+   const { siteName = "Site", description, origin = "", sections, notes } = options;
    let content = `# ${siteName}\n\n`;
    if (description) content += `> ${description}\n\n`;
+   if (origin) content += `Canonical Origin: ${origin}\n\n`;
+   if (sections) for (const section of sections) content += formatSection(section);
    if (files.length > 0) {
      content += `## Pages\n\n`;
      for (const file of files) {
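To make the new header concrete: for `siteName: 'My Docs'`, `description: 'Documentation site'`, `origin: 'https://example.com'`, the preamble built above works out to the string sketched below (illustrative only, reconstructed from the template strings in this hunk):

```ts
// Derived from the template strings above (illustrative only).
const expectedHeader
  = '# My Docs\n\n'
  + '> Documentation site\n\n'
  + 'Canonical Origin: https://example.com\n\n'
// ...followed by any configured sections and then the `## Pages` list.
```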
@@ -101,11 +103,12 @@ function generateLlmsTxtContent(files, options) {
          const relativePath = relative(options.outputDir, file.filePath);
          content += `- [${file.title}](${relativePath})${descText}\n`;
        } else {
-         const url = file.url.startsWith("http://") || file.url.startsWith("https://") ? file.url : origin + file.url;
+         const url = file.url.startsWith("http://") || file.url.startsWith("https://") ? file.url : origin ? origin + file.url : file.url;
          content += `- [${file.title}](${url})${descText}\n`;
        }
      }
    }
+   if (notes) content += `\n${formatNotes(notes)}`;
    return content;
  }
  /**
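The widened ternary resolves page URLs as sketched below. `resolvePageUrl` is a hypothetical helper written only to illustrate the rule; the module keeps it inline:

```ts
// Hypothetical helper mirroring the inline ternary above.
function resolvePageUrl(url: string, origin = ''): string {
  if (url.startsWith('http://') || url.startsWith('https://'))
    return url // absolute URLs pass through untouched
  return origin ? origin + url : url // relative URLs get the origin prefix when one is configured
}

resolvePageUrl('/docs', 'https://example.com') // 'https://example.com/docs'
resolvePageUrl('https://other.site/page', 'https://example.com') // 'https://other.site/page'
resolvePageUrl('/docs') // '/docs'
```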
@@ -145,9 +148,11 @@ function serializeFrontmatter(data) {
   * Generate llms-full.txt content with complete page content
   */
  function generateLlmsFullTxtContent(files, options) {
-   const { siteName = "Site", description, origin = "" } = options;
+   const { siteName = "Site", description, origin = "", sections, notes } = options;
    let content = `# ${siteName}\n\n`;
    if (description) content += `> ${description}\n\n`;
+   if (origin) content += `Canonical Origin: ${origin}\n\n`;
+   if (sections) for (const section of sections) content += formatSection(section);
    if (files.length > 0) {
      content += `## Table of Contents\n\n`;
      for (const file of files) {
@@ -179,6 +184,7 @@ function generateLlmsFullTxtContent(files, options) {
        content += `---\n${frontmatterString}\n---\n\n${contentBody}\n\n---\n\n`;
      }
    }
+   if (notes) content += `\n${formatNotes(notes)}`;
    return content;
  }
  /**
@@ -216,6 +222,33 @@ async function generateLlmsTxtArtifacts(options) {
    };
  }
  /**
+  * Format a section with title, description, and links
+  */
+ function formatSection(section) {
+   let content = `## ${section.title}\n\n`;
+   if (section.description) {
+     const descriptions = Array.isArray(section.description) ? section.description : [section.description];
+     for (const desc of descriptions) content += `${desc}\n\n`;
+   }
+   if (section.links?.length) {
+     for (const link of section.links) {
+       const desc = link.description ? `: ${link.description}` : "";
+       content += `- [${link.title}](${link.href})${desc}\n`;
+     }
+     content += "\n";
+   }
+   return content;
+ }
+ /**
+  * Format notes section
+  */
+ function formatNotes(notes) {
+   const noteLines = Array.isArray(notes) ? notes : [notes];
+   let content = "";
+   for (const note of noteLines) content += `${note}\n\n`;
+   return content;
+ }
+ /**
   * Create a WritableStream that generates llms.txt artifacts by streaming pages to disk
   *
   * Writes llms.txt (and optionally llms-full.txt) incrementally as pages are written,
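Traced through the new `formatSection` helper, the 'Getting Started' section from the README example renders as shown in the comments below (illustrative only):

```ts
const section = {
  title: 'Getting Started',
  description: 'Quick start guide',
  links: [
    { title: 'Installation', href: '/install', description: 'How to install' },
    { title: 'Quick Start', href: '/quickstart' },
  ],
}

// formatSection(section) returns:
//
// ## Getting Started
//
// Quick start guide
//
// - [Installation](/install): How to install
// - [Quick Start](/quickstart)
//
// formatNotes(['Generated by mdream', 'Last updated: 2024']) similarly
// emits each note as its own paragraph.
```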
@@ -229,6 +262,17 @@ async function generateLlmsTxtArtifacts(options) {
   *   origin: 'https://example.com',
   *   generateFull: true,
   *   outputDir: './dist',
+  *   sections: [
+  *     {
+  *       title: 'Getting Started',
+  *       description: 'Quick start guide',
+  *       links: [
+  *         { title: 'Installation', href: '/install', description: 'How to install' },
+  *         { title: 'Quick Start', href: '/quickstart' },
+  *       ],
+  *     },
+  *   ],
+  *   notes: ['Generated by mdream', 'Last updated: 2024'],
   * })
   *
   * const writer = stream.getWriter()
@@ -243,37 +287,76 @@ async function generateLlmsTxtArtifacts(options) {
   * @param options - Configuration options
   * @returns WritableStream that accepts ProcessedFile objects
   */
+ /**
+  * Get the group key for a URL (up to 2 segments deep)
+  */
+ /**
+  * Sort pages by URL path in hierarchical order (directory tree structure)
+  * Groups by first segment, with root-level pages without nesting grouped together
+  */
+ function sortPagesByPath(pages) {
+   const segmentHasNested = /* @__PURE__ */ new Map();
+   for (const page of pages) {
+     const segments = page.url.split("/").filter(Boolean);
+     const firstSegment = segments.length > 0 ? segments[0] : "";
+     if (!segmentHasNested.has(firstSegment)) segmentHasNested.set(firstSegment, false);
+     if (segments.length > 1) segmentHasNested.set(firstSegment, true);
+   }
+   return pages.sort((a, b) => {
+     const segmentsA = a.url.split("/").filter(Boolean);
+     const segmentsB = b.url.split("/").filter(Boolean);
+     const firstSegmentA = segmentsA.length > 0 ? segmentsA[0] : "";
+     const firstSegmentB = segmentsB.length > 0 ? segmentsB[0] : "";
+     const isRootLevelA = segmentsA.length <= 1;
+     const isRootLevelB = segmentsB.length <= 1;
+     const hasNestedA = segmentHasNested.get(firstSegmentA);
+     const hasNestedB = segmentHasNested.get(firstSegmentB);
+     const groupKeyA = isRootLevelA && !hasNestedA ? "" : firstSegmentA;
+     const groupKeyB = isRootLevelB && !hasNestedB ? "" : firstSegmentB;
+     if (groupKeyA === "" && groupKeyB !== "") return -1;
+     if (groupKeyA !== "" && groupKeyB === "") return 1;
+     if (groupKeyA !== groupKeyB) return groupKeyA.localeCompare(groupKeyB);
+     if (segmentsA.length === 0) return -1;
+     if (segmentsB.length === 0) return 1;
+     const minLen = Math.min(segmentsA.length, segmentsB.length);
+     for (let i = 0; i < minLen; i++) {
+       const cmp = segmentsA[i].localeCompare(segmentsB[i]);
+       if (cmp !== 0) return cmp;
+     }
+     return segmentsA.length - segmentsB.length;
+   });
+ }
  function createLlmsTxtStream(options = {}) {
-   const { siteName = "Site", description, origin = "", generateFull, outputDir = process.cwd() } = options;
+   const { siteName = "Site", description, origin = "", generateFull, outputDir = process.cwd(), sections, notes } = options;
    let llmsTxtHandle;
    let llmsFullTxtHandle;
+   const bufferedPages = [];
    return new WritableStream({
      async start() {
        await mkdir(outputDir, { recursive: true });
        llmsTxtHandle = await open(join(outputDir, "llms.txt"), "w");
        let header = `# ${siteName}\n\n`;
        if (description) header += `> ${description}\n\n`;
-       header += `## Pages\n\n`;
+       if (origin) header += `Canonical Origin: ${origin}\n\n`;
+       if (sections) for (const section of sections) header += formatSection(section);
        await llmsTxtHandle.write(header);
        if (generateFull) {
          llmsFullTxtHandle = await open(join(outputDir, "llms-full.txt"), "w");
          let fullHeader = `# ${siteName}\n\n`;
          if (description) fullHeader += `> ${description}\n\n`;
+         if (origin) fullHeader += `Canonical Origin: ${origin}\n\n`;
+         if (sections) for (const section of sections) fullHeader += formatSection(section);
          await llmsFullTxtHandle.write(fullHeader);
        }
      },
      async write(file) {
        const desc = file.metadata?.description;
-       const descText = desc ? `: ${desc.substring(0, 100)}${desc.length > 100 ? "..." : ""}` : "";
-       let chunk = "";
-       if (file.filePath && file.filePath.endsWith(".md")) {
-         const relativePath = relative(outputDir, file.filePath);
-         chunk = `- [${file.title}](${relativePath})${descText}\n`;
-       } else {
-         const url = file.url.startsWith("http://") || file.url.startsWith("https://") ? file.url : origin + file.url;
-         chunk = `- [${file.title}](${url})${descText}\n`;
-       }
-       await llmsTxtHandle?.write(chunk);
+       bufferedPages.push({
+         url: file.url,
+         title: file.title,
+         description: desc,
+         filePath: file.filePath
+       });
        if (generateFull && llmsFullTxtHandle) {
          const url = file.url.startsWith("http://") || file.url.startsWith("https://") ? file.url : origin ? origin + file.url : file.url;
          const { frontmatter, body } = parseFrontmatter(file.content);
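To make the new ordering concrete, here is what `sortPagesByPath` does with a handful of URLs; the result in the comment is traced through the comparator above (the helper is module-internal, so this is illustration only):

```ts
const pages = [
  { url: '/guide/install', title: 'Install' },
  { url: '/', title: 'Home' },
  { url: '/about', title: 'About' },
  { url: '/guide', title: 'Guide' },
  { url: '/api/core', title: 'Core API' },
]

// sortPagesByPath(pages) puts root-level pages whose first segment has no
// nested pages first, then sorts the remaining first-segment groups
// alphabetically and shallow-before-deep within each group:
// '/', '/about', '/api/core', '/guide', '/guide/install'
```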
@@ -299,6 +382,50 @@ function createLlmsTxtStream(options = {}) {
        }
      },
      async close() {
+       const sortedPages = sortPagesByPath(bufferedPages);
+       const segmentHasNested = /* @__PURE__ */ new Map();
+       for (const page of sortedPages) {
+         const segments = page.url.split("/").filter(Boolean);
+         const firstSegment = segments.length > 0 ? segments[0] : "";
+         if (!segmentHasNested.has(firstSegment)) segmentHasNested.set(firstSegment, false);
+         if (segments.length > 1) segmentHasNested.set(firstSegment, true);
+       }
+       await llmsTxtHandle?.write(`## Pages\n\n`);
+       let currentGroup = "";
+       let segmentGroupIndex = 0;
+       let urlsInCurrentGroup = 0;
+       for (let i = 0; i < sortedPages.length; i++) {
+         const page = sortedPages[i];
+         const segments = page.url.split("/").filter(Boolean);
+         const firstSegment = segments.length > 0 ? segments[0] : "";
+         const isRootLevel = segments.length <= 1;
+         const hasNested = segmentHasNested.get(firstSegment);
+         const groupKey = isRootLevel && !hasNested ? "" : firstSegment;
+         if (groupKey !== currentGroup) {
+           if (urlsInCurrentGroup > 0) {
+             if (segmentGroupIndex === 0 || segmentGroupIndex >= 1 && segmentGroupIndex <= 2 && urlsInCurrentGroup > 1) await llmsTxtHandle?.write("\n");
+           }
+           currentGroup = groupKey;
+           segmentGroupIndex++;
+           urlsInCurrentGroup = 0;
+         }
+         urlsInCurrentGroup++;
+         const descText = page.description ? `: ${page.description.substring(0, 160)}${page.description.length > 160 ? "..." : ""}` : "";
+         let chunk = "";
+         if (page.filePath && page.filePath.endsWith(".md")) {
+           const relativePath = relative(outputDir, page.filePath);
+           chunk = `- [${page.title}](${relativePath})${descText}\n`;
+         } else {
+           const url = page.url.startsWith("http://") || page.url.startsWith("https://") ? page.url : origin ? origin + page.url : page.url;
+           chunk = `- [${page.title}](${url})${descText}\n`;
+         }
+         await llmsTxtHandle?.write(chunk);
+       }
+       if (notes) {
+         const notesContent = formatNotes(notes);
+         await llmsTxtHandle?.write(`\n${notesContent}`);
+         if (generateFull && llmsFullTxtHandle) await llmsFullTxtHandle.write(`\n${notesContent}`);
+       }
        await llmsTxtHandle?.close();
        await llmsFullTxtHandle?.close();
      },
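Because `write()` now only buffers link metadata, the `## Pages` list is emitted in sorted, grouped order when the writer is closed, regardless of write order. A usage sketch (page data invented for illustration):

```ts
import { createLlmsTxtStream } from 'mdream'

const stream = createLlmsTxtStream({
  siteName: 'My Docs',
  origin: 'https://example.com',
  outputDir: './dist',
})

const writer = stream.getWriter()
// Write order no longer dictates link order in llms.txt.
await writer.write({ title: 'Install', url: '/guide/install', content: '# Install' })
await writer.write({ title: 'Home', url: '/', content: '# Home' })
await writer.write({ title: 'Guide', url: '/guide', content: '# Guide' })
await writer.close() // links are sorted, grouped by first path segment, then flushed
```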
package/dist/cli.mjs CHANGED
@@ -1,9 +1,6 @@
- import "./_chunks/const-Bf_XN9U9.mjs";
  import "./_chunks/markdown-processor-D26Uo5td.mjs";
- import "./_chunks/plugin-CjWWQTuL.mjs";
  import { n as streamHtmlToMarkdown } from "./_chunks/src-BJpipdul.mjs";
- import "./_chunks/extraction-BA9MDtq3.mjs";
- import { n as generateLlmsTxtArtifacts } from "./_chunks/llms-txt-T79S7X24.mjs";
+ import { n as generateLlmsTxtArtifacts } from "./_chunks/llms-txt-Czb_M48B.mjs";
  import "./_chunks/plugins-DJnqR2fA.mjs";
  import { t as withMinimalPreset } from "./_chunks/minimal-BiDhcwif.mjs";
  import { readFileSync } from "node:fs";
@@ -1,4 +1,26 @@
  //#region src/llms-txt.d.ts
+ /**
+  * Link in llms.txt section
+  */
+ interface LlmsTxtLink {
+   /** The title of the link */
+   title: string;
+   /** The description of the link */
+   description?: string;
+   /** The href of the link */
+   href: string;
+ }
+ /**
+  * Section in llms.txt
+  */
+ interface LlmsTxtSection {
+   /** The title of the section */
+   title: string;
+   /** The description of the section (can be array for multiple paragraphs) */
+   description?: string | string[];
+   /** The links of the section */
+   links?: LlmsTxtLink[];
+ }
  interface LlmsTxtArtifactsOptions {
    patterns?: string | string[];
    files?: ProcessedFile[];
@@ -8,6 +30,10 @@ interface LlmsTxtArtifactsOptions {
    generateFull?: boolean;
    generateMarkdown?: boolean;
    outputDir?: string;
+   /** The sections to write before pages */
+   sections?: LlmsTxtSection[];
+   /** Notes to write at the end */
+   notes?: string | string[];
  }
  interface ProcessedFile {
    filePath?: string;
@@ -48,35 +74,11 @@ interface CreateLlmsTxtStreamOptions extends Omit<LlmsTxtArtifactsOptions, 'patt
    origin?: string;
    /** Generate llms-full.txt with complete page content (defaults to false) */
    generateFull?: boolean;
+   /** The sections to write before pages */
+   sections?: LlmsTxtSection[];
+   /** Notes to write at the end */
+   notes?: string | string[];
  }
- /**
-  * Create a WritableStream that generates llms.txt artifacts by streaming pages to disk
-  *
-  * Writes llms.txt (and optionally llms-full.txt) incrementally as pages are written,
-  * never keeping full content in memory. Creates outputDir recursively if needed.
-  *
-  * @example
-  * ```typescript
-  * const stream = createLlmsTxtStream({
-  *   siteName: 'My Docs',
-  *   description: 'Documentation site',
-  *   origin: 'https://example.com',
-  *   generateFull: true,
-  *   outputDir: './dist',
-  * })
-  *
-  * const writer = stream.getWriter()
-  * await writer.write({
-  *   title: 'Home',
-  *   content: '# Welcome\n\nHome page content.',
-  *   url: '/',
-  * })
-  * await writer.close()
-  * ```
-  *
-  * @param options - Configuration options
-  * @returns WritableStream that accepts ProcessedFile objects
-  */
  declare function createLlmsTxtStream(options?: CreateLlmsTxtStreamOptions): WritableStream<ProcessedFile>;
  //#endregion
- export { CreateLlmsTxtStreamOptions, LlmsTxtArtifactsOptions, LlmsTxtArtifactsResult, ProcessedFile, createLlmsTxtStream, generateLlmsTxtArtifacts };
+ export { CreateLlmsTxtStreamOptions, LlmsTxtArtifactsOptions, LlmsTxtArtifactsResult, LlmsTxtLink, LlmsTxtSection, ProcessedFile, createLlmsTxtStream, generateLlmsTxtArtifacts };
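With `LlmsTxtSection` and `LlmsTxtLink` now exported, the `sections` option can be typed explicitly. A sketch; the exact import specifier (`mdream` vs the `mdream/llms-txt` subpath) depends on how the package re-exports these types, so treat it as an assumption:

```ts
import type { LlmsTxtSection } from 'mdream/llms-txt' // assumed subpath export

const sections: LlmsTxtSection[] = [
  {
    title: 'Getting Started',
    // description accepts a string or an array of paragraphs
    description: ['Quick start guide', 'Covers installation and first use.'],
    links: [
      { title: 'Installation', href: '/install', description: 'How to install' },
      { title: 'Quick Start', href: '/quickstart' },
    ],
  },
]
```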
package/dist/llms-txt.mjs CHANGED
@@ -1,8 +1,5 @@
- import "./_chunks/const-Bf_XN9U9.mjs";
  import "./_chunks/markdown-processor-D26Uo5td.mjs";
- import "./_chunks/plugin-CjWWQTuL.mjs";
  import "./_chunks/src-BJpipdul.mjs";
- import "./_chunks/extraction-BA9MDtq3.mjs";
- import { n as generateLlmsTxtArtifacts, t as createLlmsTxtStream } from "./_chunks/llms-txt-T79S7X24.mjs";
+ import { n as generateLlmsTxtArtifacts, t as createLlmsTxtStream } from "./_chunks/llms-txt-Czb_M48B.mjs";

  export { createLlmsTxtStream, generateLlmsTxtArtifacts };
package/dist/plugins.mjs CHANGED
@@ -1,4 +1,3 @@
- import "./_chunks/const-Bf_XN9U9.mjs";
  import { t as createPlugin } from "./_chunks/plugin-CjWWQTuL.mjs";
  import { t as extractionPlugin } from "./_chunks/extraction-BA9MDtq3.mjs";
  import { a as filterPlugin, i as frontmatterPlugin, n as readabilityPlugin, r as isolateMainPlugin, t as tailwindPlugin } from "./_chunks/plugins-DJnqR2fA.mjs";
@@ -1,6 +1,3 @@
- import "../_chunks/const-Bf_XN9U9.mjs";
- import "../_chunks/plugin-CjWWQTuL.mjs";
- import "../_chunks/extraction-BA9MDtq3.mjs";
  import "../_chunks/plugins-DJnqR2fA.mjs";
  import { t as withMinimalPreset } from "../_chunks/minimal-BiDhcwif.mjs";

package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
    "name": "mdream",
    "type": "module",
-   "version": "0.15.0",
+   "version": "0.15.2",
    "description": "Ultra-performant HTML to Markdown Convertor Optimized for LLMs and llm.txt artifacts.",
    "author": {
      "name": "Harlan Wilton",