mdream 0.14.0 → 0.15.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -422,6 +422,102 @@ const chunks = htmlToMarkdownSplitChunks(html, withMinimalPreset({
422
422
  }))
423
423
  ```
424
424
 
425
+ ## llms.txt Generation
426
+
427
+ Generate [llms.txt](https://llmstxt.org) files from HTML content for improved LLM discoverability. Mdream provides both streaming and batch APIs for creating llms.txt artifacts.
428
+
429
+ ### createLlmsTxtStream
430
+
431
+ Stream llms.txt generation without keeping full content in memory:
432
+
433
+ ```ts
434
+ import { createLlmsTxtStream } from 'mdream'
435
+
436
+ const stream = createLlmsTxtStream({
437
+ siteName: 'My Docs',
438
+ description: 'Documentation site',
439
+ origin: 'https://example.com',
440
+ outputDir: './dist',
441
+ generateFull: true, // Also generate llms-full.txt
442
+ sections: [
443
+ {
444
+ title: 'Getting Started',
445
+ description: 'Quick start guide',
446
+ links: [
447
+ { title: 'Installation', href: '/install', description: 'How to install' },
448
+ { title: 'Quick Start', href: '/quickstart' },
449
+ ],
450
+ },
451
+ ],
452
+ notes: ['Generated by mdream', 'Last updated: 2024'],
453
+ })
454
+
455
+ const writer = stream.getWriter()
456
+ await writer.write({
457
+ title: 'Home',
458
+ content: '# Welcome\n\nHome page content.',
459
+ url: '/',
460
+ metadata: {
461
+ description: 'Welcome page',
462
+ },
463
+ })
464
+ await writer.close()
465
+ ```
466
+
467
+ This creates:
468
+ - `llms.txt` - Links to all pages with metadata
469
+ - `llms-full.txt` - Complete content with frontmatter (if `generateFull: true`)
470
+
471
+ ### generateLlmsTxtArtifacts
472
+
473
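+ Generate the artifacts in one batch, either from HTML files matched by glob `patterns` or from pre-built `ProcessedFile` objects passed as `files`: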
474
+
475
+ ```ts
476
+ import { generateLlmsTxtArtifacts } from 'mdream'
477
+
478
+ const result = await generateLlmsTxtArtifacts({
479
+ patterns: '**/*.html', // Glob pattern for HTML files
480
+ siteName: 'My Site',
481
+ origin: 'https://example.com',
482
+ generateFull: true,
483
+ sections: [
484
+ {
485
+ title: 'Resources',
486
+ links: [
487
+ { title: 'Docs', href: '/docs' },
488
+ ],
489
+ },
490
+ ],
491
+ notes: 'Footer notes',
492
+ })
493
+
494
+ console.log(result.llmsTxt) // llms.txt content
495
+ console.log(result.llmsFullTxt) // llms-full.txt content
496
+ console.log(result.processedFiles) // Array of processed files
497
+ ```
498
+
499
+ ### Structure
500
+
501
+ llms.txt follows this structure:
502
+
503
+ ```markdown
504
+ # Site Name
505
+
506
+ > Site description
507
+
508
+ ## Custom Section
509
+
510
+ Section description
511
+
512
+ - [Link Title](url): Optional description
513
+
514
+ ## Pages
515
+
516
+ - [Page Title](url): Page description
517
+
518
+ Custom notes
519
+ ```
520
+
425
521
  ## Credits
426
522
 
427
523
  - [ultrahtml](https://github.com/natemoo-re/ultrahtml): HTML parsing inspiration
@@ -1,7 +1,7 @@
1
1
  import { t as htmlToMarkdown } from "./src-BJpipdul.mjs";
2
2
  import { t as extractionPlugin } from "./extraction-BA9MDtq3.mjs";
3
- import { readFile } from "node:fs/promises";
4
- import { basename, dirname, relative, sep } from "pathe";
3
+ import { mkdir, open, readFile } from "node:fs/promises";
4
+ import { basename, dirname, join, relative, sep } from "pathe";
5
5
  import { glob } from "tinyglobby";
6
6
 
7
7
  //#region src/llms-txt.ts
@@ -89,9 +89,10 @@ async function processHtmlFiles(patterns, origin) {
89
89
  * Generate llms.txt content
90
90
  */
91
91
  function generateLlmsTxtContent(files, options) {
92
- const { siteName = "Site", description, origin = "" } = options;
92
+ const { siteName = "Site", description, origin = "", sections, notes } = options;
93
93
  let content = `# ${siteName}\n\n`;
94
94
  if (description) content += `> ${description}\n\n`;
95
+ if (sections) for (const section of sections) content += formatSection(section);
95
96
  if (files.length > 0) {
96
97
  content += `## Pages\n\n`;
97
98
  for (const file of files) {
@@ -106,6 +107,7 @@ function generateLlmsTxtContent(files, options) {
106
107
  }
107
108
  }
108
109
  }
110
+ if (notes) content += `\n${formatNotes(notes)}`;
109
111
  return content;
110
112
  }
111
113
  /**
@@ -145,9 +147,10 @@ function serializeFrontmatter(data) {
145
147
  * Generate llms-full.txt content with complete page content
146
148
  */
147
149
  function generateLlmsFullTxtContent(files, options) {
148
- const { siteName = "Site", description, origin = "" } = options;
150
+ const { siteName = "Site", description, origin = "", sections, notes } = options;
149
151
  let content = `# ${siteName}\n\n`;
150
152
  if (description) content += `> ${description}\n\n`;
153
+ if (sections) for (const section of sections) content += formatSection(section);
151
154
  if (files.length > 0) {
152
155
  content += `## Table of Contents\n\n`;
153
156
  for (const file of files) {
@@ -179,6 +182,7 @@ function generateLlmsFullTxtContent(files, options) {
179
182
  content += `---\n${frontmatterString}\n---\n\n${contentBody}\n\n---\n\n`;
180
183
  }
181
184
  }
185
+ if (notes) content += `\n${formatNotes(notes)}`;
182
186
  return content;
183
187
  }
184
188
  /**
@@ -215,6 +219,144 @@ async function generateLlmsTxtArtifacts(options) {
215
219
  processedFiles: files
216
220
  };
217
221
  }
222
/**
 * Render one custom llms.txt section as markdown.
 *
 * The output starts with a `## <title>` heading. Each description paragraph is
 * followed by a blank line, and links are emitted as a bullet list
 * (`- [title](href): description`) closed by a blank line. The description and
 * the link list are skipped when they are absent or empty.
 *
 * @param section - Section with a `title`, an optional `description` (a string or
 *   an array of strings) and an optional `links` array
 * @returns The markdown text for the section
 */
function formatSection(section) {
  const parts = [`## ${section.title}\n\n`];
  const { description, links } = section;

  if (description) {
    // A single string is treated as a one-paragraph description.
    const paragraphs = Array.isArray(description) ? description : [description];
    for (const paragraph of paragraphs) parts.push(`${paragraph}\n\n`);
  }

  if (links?.length) {
    for (const link of links) {
      const suffix = link.description ? `: ${link.description}` : "";
      parts.push(`- [${link.title}](${link.href})${suffix}\n`);
    }
    // Blank line separating the list from whatever follows.
    parts.push("\n");
  }

  return parts.join("");
}
240
/**
 * Render trailing notes as markdown paragraphs.
 *
 * @param notes - A single note or an array of notes
 * @returns Every note followed by a blank line, concatenated in order
 */
function formatNotes(notes) {
  // Normalise the single-note form so both shapes share one code path.
  const entries = Array.isArray(notes) ? notes : [notes];
  const paragraphs = [];
  for (const entry of entries) paragraphs.push(`${entry}\n\n`);
  return paragraphs.join("");
}
249
/**
 * Create a WritableStream that generates llms.txt artifacts by streaming pages to disk
 *
 * Writes llms.txt (and optionally llms-full.txt) incrementally as pages are written,
 * never keeping full content in memory. Creates outputDir recursively if needed.
 *
 * The underlying file handles are released when the stream is closed or aborted, and
 * also when `start()` or `write()` fails: once the stream has errored, the Streams
 * machinery no longer calls `close()`/`abort()` on the sink, so those paths clean up
 * after themselves.
 *
 * @example
 * ```typescript
 * const stream = createLlmsTxtStream({
 *   siteName: 'My Docs',
 *   description: 'Documentation site',
 *   origin: 'https://example.com',
 *   generateFull: true,
 *   outputDir: './dist',
 *   sections: [
 *     {
 *       title: 'Getting Started',
 *       description: 'Quick start guide',
 *       links: [
 *         { title: 'Installation', href: '/install', description: 'How to install' },
 *         { title: 'Quick Start', href: '/quickstart' },
 *       ],
 *     },
 *   ],
 *   notes: ['Generated by mdream', 'Last updated: 2024'],
 * })
 *
 * const writer = stream.getWriter()
 * await writer.write({
 *   title: 'Home',
 *   content: '# Welcome\n\nHome page content.',
 *   url: '/',
 * })
 * await writer.close()
 * ```
 *
 * @param options - Configuration options
 * @returns WritableStream that accepts ProcessedFile objects
 */
function createLlmsTxtStream(options = {}) {
  const { siteName = "Site", description, origin = "", generateFull, outputDir = process.cwd(), sections, notes } = options;
  let llmsTxtHandle;
  let llmsFullTxtHandle;

  /** Close every open handle; both are always attempted, the first failure is rethrown. */
  const closeHandles = async () => {
    const handles = [llmsTxtHandle, llmsFullTxtHandle];
    // Forget the handles first so a repeated cleanup never closes them twice.
    llmsTxtHandle = undefined;
    llmsFullTxtHandle = undefined;
    const results = await Promise.allSettled(handles.map((handle) => handle?.close()));
    const failure = results.find((result) => result.status === "rejected");
    if (failure) throw failure.reason;
  };

  /** Release the handles after a failure and rethrow the original error. */
  const failWith = async (error) => {
    // A secondary close error would only mask the error that actually broke the stream.
    await closeHandles().catch(() => {});
    throw error;
  };

  /** Absolute URLs pass through untouched; anything else gets the origin prepended. */
  const resolveUrl = (url) => {
    if (url.startsWith("http://") || url.startsWith("https://")) return url;
    return origin ? origin + url : url;
  };

  return new WritableStream({
    async start() {
      try {
        await mkdir(outputDir, { recursive: true });
        // Title, description and custom sections are shared by both files.
        let header = `# ${siteName}\n\n`;
        if (description) header += `> ${description}\n\n`;
        if (sections) for (const section of sections) header += formatSection(section);
        llmsTxtHandle = await open(join(outputDir, "llms.txt"), "w");
        await llmsTxtHandle.write(`${header}## Pages\n\n`);
        if (generateFull) {
          llmsFullTxtHandle = await open(join(outputDir, "llms-full.txt"), "w");
          await llmsFullTxtHandle.write(header);
        }
      } catch (error) {
        await failWith(error);
      }
    },
    async write(file) {
      try {
        const desc = file.metadata?.description;
        // Page descriptions are capped at 100 characters in the link list.
        const descText = desc ? `: ${desc.substring(0, 100)}${desc.length > 100 ? "..." : ""}` : "";
        // Markdown files on disk are linked relative to outputDir, everything else by URL.
        const target = file.filePath && file.filePath.endsWith(".md") ? relative(outputDir, file.filePath) : resolveUrl(file.url);
        await llmsTxtHandle?.write(`- [${file.title}](${target})${descText}\n`);
        if (generateFull && llmsFullTxtHandle) {
          const { frontmatter, body } = parseFrontmatter(file.content);
          const metadata = {
            title: file.title,
            url: resolveUrl(file.url)
          };
          if (file.filePath) metadata.file = relative(outputDir, file.filePath);
          if (file.metadata) {
            if (file.metadata.description) metadata.description = file.metadata.description;
            if (file.metadata.keywords) metadata.keywords = file.metadata.keywords;
            if (file.metadata.author) metadata.author = file.metadata.author;
          }
          // Generated metadata wins over keys already present in the page frontmatter.
          const frontmatterString = serializeFrontmatter(frontmatter ? {
            ...frontmatter,
            ...metadata
          } : metadata);
          let contentBody = frontmatter ? body : file.content;
          // Drop a leading line that merely repeats the title already in the frontmatter.
          const titleLine = contentBody.trim().split("\n")[0];
          if (titleLine === file.title || titleLine === `# ${file.title}`) contentBody = contentBody.trim().split("\n").slice(1).join("\n").trimStart();
          await llmsFullTxtHandle.write(`---\n${frontmatterString}\n---\n\n${contentBody}\n\n---\n\n`);
        }
      } catch (error) {
        await failWith(error);
      }
    },
    async close() {
      try {
        if (notes) {
          const notesContent = formatNotes(notes);
          await llmsTxtHandle?.write(`\n${notesContent}`);
          if (generateFull && llmsFullTxtHandle) await llmsFullTxtHandle.write(`\n${notesContent}`);
        }
      } finally {
        // Release the handles even when writing the notes failed.
        await closeHandles();
      }
    },
    async abort() {
      await closeHandles();
    }
  });
}
218
360
 
219
361
  //#endregion
220
- export { generateLlmsTxtArtifacts as t };
362
+ export { generateLlmsTxtArtifacts as n, createLlmsTxtStream as t };
package/dist/cli.mjs CHANGED
@@ -1,9 +1,6 @@
1
- import "./_chunks/const-Bf_XN9U9.mjs";
2
1
  import "./_chunks/markdown-processor-D26Uo5td.mjs";
3
- import "./_chunks/plugin-CjWWQTuL.mjs";
4
2
  import { n as streamHtmlToMarkdown } from "./_chunks/src-BJpipdul.mjs";
5
- import "./_chunks/extraction-BA9MDtq3.mjs";
6
- import { t as generateLlmsTxtArtifacts } from "./_chunks/llms-txt-D7Hduhij.mjs";
3
+ import { n as generateLlmsTxtArtifacts } from "./_chunks/llms-txt-BXtLmgK6.mjs";
7
4
  import "./_chunks/plugins-DJnqR2fA.mjs";
8
5
  import { t as withMinimalPreset } from "./_chunks/minimal-BiDhcwif.mjs";
9
6
  import { readFileSync } from "node:fs";
@@ -1,4 +1,26 @@
1
1
  //#region src/llms-txt.d.ts
2
+ /**
3
+ * Link in llms.txt section, rendered as a `- [title](href): description` list item
4
+ */
5
+ interface LlmsTxtLink {
6
+ /** The title of the link (used as the markdown link text) */
7
+ title: string;
8
+ /** The description of the link (appended after `: ` when provided) */
9
+ description?: string;
10
+ /** The href of the link (written as-is as the markdown link target) */
11
+ href: string;
12
+ }
13
+ /**
14
+ * Section in llms.txt, rendered as a `## title` heading followed by its description and links
15
+ */
16
+ interface LlmsTxtSection {
17
+ /** The title of the section (rendered as a level-2 heading) */
18
+ title: string;
19
+ /** The description of the section (can be array for multiple paragraphs; each entry is followed by a blank line) */
20
+ description?: string | string[];
21
+ /** The links of the section (rendered as a bullet list; omitted when empty) */
22
+ links?: LlmsTxtLink[];
23
+ }
2
24
  interface LlmsTxtArtifactsOptions {
3
25
  patterns?: string | string[];
4
26
  files?: ProcessedFile[];
@@ -8,6 +30,10 @@ interface LlmsTxtArtifactsOptions {
8
30
  generateFull?: boolean;
9
31
  generateMarkdown?: boolean;
10
32
  outputDir?: string;
33
+ /** The sections to write before pages */
34
+ sections?: LlmsTxtSection[];
35
+ /** Notes to write at the end */
36
+ notes?: string | string[];
11
37
  }
12
38
  interface ProcessedFile {
13
39
  filePath?: string;
@@ -34,5 +60,64 @@ interface LlmsTxtArtifactsResult {
34
60
  * Main function to process files and generate llms.txt artifacts
35
61
  */
36
62
  declare function generateLlmsTxtArtifacts(options: LlmsTxtArtifactsOptions): Promise<LlmsTxtArtifactsResult>;
63
+ /**
64
+ * Options for creating an llms.txt stream
65
+ */
66
+ interface CreateLlmsTxtStreamOptions extends Omit<LlmsTxtArtifactsOptions, 'patterns' | 'files' | 'outputDir' | 'generateMarkdown'> {
67
+ /** Directory that llms.txt (and llms-full.txt) are written to; created recursively if missing (defaults to process.cwd()) */
68
+ outputDir?: string;
69
+ /** Site name for the header (defaults to 'Site') */
70
+ siteName?: string;
71
+ /** Site description for the header (rendered as a `>` blockquote) */
72
+ description?: string;
73
+ /** Origin URL to prepend to page URLs that do not start with http:// or https:// */
74
+ origin?: string;
75
+ /** Generate llms-full.txt with complete page content (defaults to false) */
76
+ generateFull?: boolean;
77
+ /** The sections to write after the header and before the pages */
78
+ sections?: LlmsTxtSection[];
79
+ /** Notes to write at the end of each generated file when the stream is closed */
80
+ notes?: string | string[];
81
+ }
82
+ /**
83
+ * Create a WritableStream that generates llms.txt artifacts by streaming pages to disk
84
+ *
85
+ * Writes llms.txt (and optionally llms-full.txt) incrementally as pages are written,
86
+ * never keeping full content in memory. Creates outputDir recursively if needed.
87
+ *
88
+ * @example
89
+ * ```typescript
90
+ * const stream = createLlmsTxtStream({
91
+ * siteName: 'My Docs',
92
+ * description: 'Documentation site',
93
+ * origin: 'https://example.com',
94
+ * generateFull: true,
95
+ * outputDir: './dist',
96
+ * sections: [
97
+ * {
98
+ * title: 'Getting Started',
99
+ * description: 'Quick start guide',
100
+ * links: [
101
+ * { title: 'Installation', href: '/install', description: 'How to install' },
102
+ * { title: 'Quick Start', href: '/quickstart' },
103
+ * ],
104
+ * },
105
+ * ],
106
+ * notes: ['Generated by mdream', 'Last updated: 2024'],
107
+ * })
108
+ *
109
+ * const writer = stream.getWriter()
110
+ * await writer.write({
111
+ * title: 'Home',
112
+ * content: '# Welcome\n\nHome page content.',
113
+ * url: '/',
114
+ * })
115
+ * await writer.close()
116
+ * ```
117
+ *
118
+ * @param options - Configuration options
119
+ * @returns WritableStream that accepts ProcessedFile objects
120
+ */
121
+ declare function createLlmsTxtStream(options?: CreateLlmsTxtStreamOptions): WritableStream<ProcessedFile>;
37
122
  //#endregion
38
- export { LlmsTxtArtifactsOptions, LlmsTxtArtifactsResult, ProcessedFile, generateLlmsTxtArtifacts };
123
+ export { CreateLlmsTxtStreamOptions, LlmsTxtArtifactsOptions, LlmsTxtArtifactsResult, LlmsTxtLink, LlmsTxtSection, ProcessedFile, createLlmsTxtStream, generateLlmsTxtArtifacts };
package/dist/llms-txt.mjs CHANGED
@@ -1,8 +1,5 @@
1
- import "./_chunks/const-Bf_XN9U9.mjs";
2
1
  import "./_chunks/markdown-processor-D26Uo5td.mjs";
3
- import "./_chunks/plugin-CjWWQTuL.mjs";
4
2
  import "./_chunks/src-BJpipdul.mjs";
5
- import "./_chunks/extraction-BA9MDtq3.mjs";
6
- import { t as generateLlmsTxtArtifacts } from "./_chunks/llms-txt-D7Hduhij.mjs";
3
+ import { n as generateLlmsTxtArtifacts, t as createLlmsTxtStream } from "./_chunks/llms-txt-BXtLmgK6.mjs";
7
4
 
8
- export { generateLlmsTxtArtifacts };
5
+ export { createLlmsTxtStream, generateLlmsTxtArtifacts };
package/dist/plugins.mjs CHANGED
@@ -1,4 +1,3 @@
1
- import "./_chunks/const-Bf_XN9U9.mjs";
2
1
  import { t as createPlugin } from "./_chunks/plugin-CjWWQTuL.mjs";
3
2
  import { t as extractionPlugin } from "./_chunks/extraction-BA9MDtq3.mjs";
4
3
  import { a as filterPlugin, i as frontmatterPlugin, n as readabilityPlugin, r as isolateMainPlugin, t as tailwindPlugin } from "./_chunks/plugins-DJnqR2fA.mjs";
@@ -1,6 +1,3 @@
1
- import "../_chunks/const-Bf_XN9U9.mjs";
2
- import "../_chunks/plugin-CjWWQTuL.mjs";
3
- import "../_chunks/extraction-BA9MDtq3.mjs";
4
1
  import "../_chunks/plugins-DJnqR2fA.mjs";
5
2
  import { t as withMinimalPreset } from "../_chunks/minimal-BiDhcwif.mjs";
6
3
 
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "mdream",
3
3
  "type": "module",
4
- "version": "0.14.0",
4
+ "version": "0.15.1",
5
5
  "description": "Ultra-performant HTML to Markdown Convertor Optimized for LLMs and llm.txt artifacts.",
6
6
  "author": {
7
7
  "name": "Harlan Wilton",