mdream 0.15.0 → 0.15.2

package/README.md CHANGED
@@ -422,6 +422,102 @@ const chunks = htmlToMarkdownSplitChunks(html, withMinimalPreset({
  }))
  ```

+ ## llms.txt Generation
+
+ Generate [llms.txt](https://llmstxt.org) files from HTML content for improved LLM discoverability. Mdream provides both streaming and batch APIs for creating llms.txt artifacts.
+
+ ### createLlmsTxtStream
+
+ Stream llms.txt generation without keeping full content in memory:
+
+ ```ts
+ import { createLlmsTxtStream } from 'mdream'
+
+ const stream = createLlmsTxtStream({
+   siteName: 'My Docs',
+   description: 'Documentation site',
+   origin: 'https://example.com',
+   outputDir: './dist',
+   generateFull: true, // Also generate llms-full.txt
+   sections: [
+     {
+       title: 'Getting Started',
+       description: 'Quick start guide',
+       links: [
+         { title: 'Installation', href: '/install', description: 'How to install' },
+         { title: 'Quick Start', href: '/quickstart' },
+       ],
+     },
+   ],
+   notes: ['Generated by mdream', 'Last updated: 2024'],
+ })
+
+ const writer = stream.getWriter()
+ await writer.write({
+   title: 'Home',
+   content: '# Welcome\n\nHome page content.',
+   url: '/',
+   metadata: {
+     description: 'Welcome page',
+   },
+ })
+ await writer.close()
+ ```
+
+ This creates:
+ - `llms.txt` - Links to all pages with metadata
+ - `llms-full.txt` - Complete content with frontmatter (if `generateFull: true`)
+
+ ### generateLlmsTxtArtifacts
+
+ Process HTML files or ProcessedFile objects:
+
+ ```ts
+ import { generateLlmsTxtArtifacts } from 'mdream'
+
+ const result = await generateLlmsTxtArtifacts({
+   patterns: '**/*.html', // Glob pattern for HTML files
+   siteName: 'My Site',
+   origin: 'https://example.com',
+   generateFull: true,
+   sections: [
+     {
+       title: 'Resources',
+       links: [
+         { title: 'Docs', href: '/docs' },
+       ],
+     },
+   ],
+   notes: 'Footer notes',
+ })
+
+ console.log(result.llmsTxt) // llms.txt content
+ console.log(result.llmsFullTxt) // llms-full.txt content
+ console.log(result.processedFiles) // Array of processed files
+ ```
+
+ ### Structure
+
+ llms.txt follows this structure:
+
+ ```markdown
+ # Site Name
+
+ > Site description
+
+ ## Custom Section
+
+ Section description
+
+ - [Link Title](url): Optional description
+
+ ## Pages
+
+ - [Page Title](url): Page description
+
+ Custom notes
+ ```
+
  ## Credits

  - [ultrahtml](https://github.com/natemoo-re/ultrahtml): HTML parsing inspiration
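The README examples above feed `generateLlmsTxtArtifacts` a glob pattern; the `files` option on `LlmsTxtArtifactsOptions` (see the type changes further down in this diff) accepts already-processed pages instead. A minimal sketch, assuming the markdown content was produced elsewhere:

```ts
import { generateLlmsTxtArtifacts } from 'mdream'

// ProcessedFile objects passed directly instead of scanning a glob.
const result = await generateLlmsTxtArtifacts({
  files: [
    {
      title: 'Home',
      url: '/',
      content: '# Welcome\n\nHome page content.',
      metadata: { description: 'Welcome page' },
    },
    {
      title: 'Installation',
      url: '/install',
      content: '# Installation\n\nHow to install.',
    },
  ],
  siteName: 'My Docs',
  origin: 'https://example.com',
  notes: 'Generated by mdream',
})

console.log(result.llmsTxt)
```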
@@ -89,9 +89,11 @@ async function processHtmlFiles(patterns, origin) {
   * Generate llms.txt content
   */
  function generateLlmsTxtContent(files, options) {
-   const { siteName = "Site", description, origin = "" } = options;
+   const { siteName = "Site", description, origin = "", sections, notes } = options;
    let content = `# ${siteName}\n\n`;
    if (description) content += `> ${description}\n\n`;
+   if (origin) content += `Canonical Origin: ${origin}\n\n`;
+   if (sections) for (const section of sections) content += formatSection(section);
    if (files.length > 0) {
      content += `## Pages\n\n`;
      for (const file of files) {
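To make the new header concrete: for `siteName: 'My Docs'`, `description: 'Documentation site'`, `origin: 'https://example.com'`, the preamble built above works out to the string sketched below (illustrative only, reconstructed from the template strings in this hunk):

```ts
// Derived from the template strings above (illustrative only).
const expectedHeader
  = '# My Docs\n\n'
  + '> Documentation site\n\n'
  + 'Canonical Origin: https://example.com\n\n'
// ...followed by any configured sections and then the `## Pages` list.
```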
@@ -101,11 +103,12 @@ function generateLlmsTxtContent(files, options) {
          const relativePath = relative(options.outputDir, file.filePath);
          content += `- [${file.title}](${relativePath})${descText}\n`;
        } else {
-         const url = file.url.startsWith("http://") || file.url.startsWith("https://") ? file.url : origin + file.url;
+         const url = file.url.startsWith("http://") || file.url.startsWith("https://") ? file.url : origin ? origin + file.url : file.url;
          content += `- [${file.title}](${url})${descText}\n`;
        }
      }
    }
+   if (notes) content += `\n${formatNotes(notes)}`;
    return content;
  }
  /**
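The widened ternary resolves page URLs as sketched below. `resolvePageUrl` is a hypothetical helper written only to illustrate the rule; the module keeps it inline:

```ts
// Hypothetical helper mirroring the inline ternary above.
function resolvePageUrl(url: string, origin = ''): string {
  if (url.startsWith('http://') || url.startsWith('https://'))
    return url // absolute URLs pass through untouched
  return origin ? origin + url : url // relative URLs get the origin prefix when one is configured
}

resolvePageUrl('/docs', 'https://example.com') // 'https://example.com/docs'
resolvePageUrl('https://other.site/page', 'https://example.com') // 'https://other.site/page'
resolvePageUrl('/docs') // '/docs'
```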
@@ -145,9 +148,11 @@ function serializeFrontmatter(data) {
   * Generate llms-full.txt content with complete page content
   */
  function generateLlmsFullTxtContent(files, options) {
-   const { siteName = "Site", description, origin = "" } = options;
+   const { siteName = "Site", description, origin = "", sections, notes } = options;
    let content = `# ${siteName}\n\n`;
    if (description) content += `> ${description}\n\n`;
+   if (origin) content += `Canonical Origin: ${origin}\n\n`;
+   if (sections) for (const section of sections) content += formatSection(section);
    if (files.length > 0) {
      content += `## Table of Contents\n\n`;
      for (const file of files) {
@@ -179,6 +184,7 @@ function generateLlmsFullTxtContent(files, options) {
        content += `---\n${frontmatterString}\n---\n\n${contentBody}\n\n---\n\n`;
      }
    }
+   if (notes) content += `\n${formatNotes(notes)}`;
    return content;
  }
  /**
@@ -216,6 +222,33 @@ async function generateLlmsTxtArtifacts(options) {
    };
  }
  /**
+  * Format a section with title, description, and links
+  */
+ function formatSection(section) {
+   let content = `## ${section.title}\n\n`;
+   if (section.description) {
+     const descriptions = Array.isArray(section.description) ? section.description : [section.description];
+     for (const desc of descriptions) content += `${desc}\n\n`;
+   }
+   if (section.links?.length) {
+     for (const link of section.links) {
+       const desc = link.description ? `: ${link.description}` : "";
+       content += `- [${link.title}](${link.href})${desc}\n`;
+     }
+     content += "\n";
+   }
+   return content;
+ }
+ /**
+  * Format notes section
+  */
+ function formatNotes(notes) {
+   const noteLines = Array.isArray(notes) ? notes : [notes];
+   let content = "";
+   for (const note of noteLines) content += `${note}\n\n`;
+   return content;
+ }
+ /**
   * Create a WritableStream that generates llms.txt artifacts by streaming pages to disk
   *
   * Writes llms.txt (and optionally llms-full.txt) incrementally as pages are written,
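Traced through the new `formatSection` helper, the 'Getting Started' section from the README example renders as shown in the comments below (illustrative only):

```ts
const section = {
  title: 'Getting Started',
  description: 'Quick start guide',
  links: [
    { title: 'Installation', href: '/install', description: 'How to install' },
    { title: 'Quick Start', href: '/quickstart' },
  ],
}

// formatSection(section) returns:
//
// ## Getting Started
//
// Quick start guide
//
// - [Installation](/install): How to install
// - [Quick Start](/quickstart)
//
// formatNotes(['Generated by mdream', 'Last updated: 2024']) similarly
// emits each note as its own paragraph.
```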
@@ -229,6 +262,17 @@ async function generateLlmsTxtArtifacts(options) {
   *   origin: 'https://example.com',
   *   generateFull: true,
   *   outputDir: './dist',
+  *   sections: [
+  *     {
+  *       title: 'Getting Started',
+  *       description: 'Quick start guide',
+  *       links: [
+  *         { title: 'Installation', href: '/install', description: 'How to install' },
+  *         { title: 'Quick Start', href: '/quickstart' },
+  *       ],
+  *     },
+  *   ],
+  *   notes: ['Generated by mdream', 'Last updated: 2024'],
   * })
   *
   * const writer = stream.getWriter()
@@ -243,37 +287,76 @@ async function generateLlmsTxtArtifacts(options) {
   * @param options - Configuration options
   * @returns WritableStream that accepts ProcessedFile objects
   */
+ /**
+  * Get the group key for a URL (up to 2 segments deep)
+  */
+ /**
+  * Sort pages by URL path in hierarchical order (directory tree structure)
+  * Groups by first segment, with root-level pages without nesting grouped together
+  */
+ function sortPagesByPath(pages) {
+   const segmentHasNested = /* @__PURE__ */ new Map();
+   for (const page of pages) {
+     const segments = page.url.split("/").filter(Boolean);
+     const firstSegment = segments.length > 0 ? segments[0] : "";
+     if (!segmentHasNested.has(firstSegment)) segmentHasNested.set(firstSegment, false);
+     if (segments.length > 1) segmentHasNested.set(firstSegment, true);
+   }
+   return pages.sort((a, b) => {
+     const segmentsA = a.url.split("/").filter(Boolean);
+     const segmentsB = b.url.split("/").filter(Boolean);
+     const firstSegmentA = segmentsA.length > 0 ? segmentsA[0] : "";
+     const firstSegmentB = segmentsB.length > 0 ? segmentsB[0] : "";
+     const isRootLevelA = segmentsA.length <= 1;
+     const isRootLevelB = segmentsB.length <= 1;
+     const hasNestedA = segmentHasNested.get(firstSegmentA);
+     const hasNestedB = segmentHasNested.get(firstSegmentB);
+     const groupKeyA = isRootLevelA && !hasNestedA ? "" : firstSegmentA;
+     const groupKeyB = isRootLevelB && !hasNestedB ? "" : firstSegmentB;
+     if (groupKeyA === "" && groupKeyB !== "") return -1;
+     if (groupKeyA !== "" && groupKeyB === "") return 1;
+     if (groupKeyA !== groupKeyB) return groupKeyA.localeCompare(groupKeyB);
+     if (segmentsA.length === 0) return -1;
+     if (segmentsB.length === 0) return 1;
+     const minLen = Math.min(segmentsA.length, segmentsB.length);
+     for (let i = 0; i < minLen; i++) {
+       const cmp = segmentsA[i].localeCompare(segmentsB[i]);
+       if (cmp !== 0) return cmp;
+     }
+     return segmentsA.length - segmentsB.length;
+   });
+ }
  function createLlmsTxtStream(options = {}) {
-   const { siteName = "Site", description, origin = "", generateFull, outputDir = process.cwd() } = options;
+   const { siteName = "Site", description, origin = "", generateFull, outputDir = process.cwd(), sections, notes } = options;
    let llmsTxtHandle;
    let llmsFullTxtHandle;
+   const bufferedPages = [];
    return new WritableStream({
      async start() {
        await mkdir(outputDir, { recursive: true });
        llmsTxtHandle = await open(join(outputDir, "llms.txt"), "w");
        let header = `# ${siteName}\n\n`;
        if (description) header += `> ${description}\n\n`;
-       header += `## Pages\n\n`;
+       if (origin) header += `Canonical Origin: ${origin}\n\n`;
+       if (sections) for (const section of sections) header += formatSection(section);
        await llmsTxtHandle.write(header);
        if (generateFull) {
          llmsFullTxtHandle = await open(join(outputDir, "llms-full.txt"), "w");
          let fullHeader = `# ${siteName}\n\n`;
          if (description) fullHeader += `> ${description}\n\n`;
+         if (origin) fullHeader += `Canonical Origin: ${origin}\n\n`;
+         if (sections) for (const section of sections) fullHeader += formatSection(section);
          await llmsFullTxtHandle.write(fullHeader);
        }
      },
      async write(file) {
        const desc = file.metadata?.description;
-       const descText = desc ? `: ${desc.substring(0, 100)}${desc.length > 100 ? "..." : ""}` : "";
-       let chunk = "";
-       if (file.filePath && file.filePath.endsWith(".md")) {
-         const relativePath = relative(outputDir, file.filePath);
-         chunk = `- [${file.title}](${relativePath})${descText}\n`;
-       } else {
-         const url = file.url.startsWith("http://") || file.url.startsWith("https://") ? file.url : origin + file.url;
-         chunk = `- [${file.title}](${url})${descText}\n`;
-       }
-       await llmsTxtHandle?.write(chunk);
+       bufferedPages.push({
+         url: file.url,
+         title: file.title,
+         description: desc,
+         filePath: file.filePath
+       });
        if (generateFull && llmsFullTxtHandle) {
          const url = file.url.startsWith("http://") || file.url.startsWith("https://") ? file.url : origin ? origin + file.url : file.url;
          const { frontmatter, body } = parseFrontmatter(file.content);
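To make the new ordering concrete, here is what `sortPagesByPath` does with a handful of URLs; the result in the comment is traced through the comparator above (the helper is module-internal, so this is illustration only):

```ts
const pages = [
  { url: '/guide/install', title: 'Install' },
  { url: '/', title: 'Home' },
  { url: '/about', title: 'About' },
  { url: '/guide', title: 'Guide' },
  { url: '/api/core', title: 'Core API' },
]

// sortPagesByPath(pages) puts root-level pages whose first segment has no
// nested pages first, then sorts the remaining first-segment groups
// alphabetically and shallow-before-deep within each group:
// '/', '/about', '/api/core', '/guide', '/guide/install'
```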
@@ -299,6 +382,50 @@ function createLlmsTxtStream(options = {}) {
        }
      },
      async close() {
+       const sortedPages = sortPagesByPath(bufferedPages);
+       const segmentHasNested = /* @__PURE__ */ new Map();
+       for (const page of sortedPages) {
+         const segments = page.url.split("/").filter(Boolean);
+         const firstSegment = segments.length > 0 ? segments[0] : "";
+         if (!segmentHasNested.has(firstSegment)) segmentHasNested.set(firstSegment, false);
+         if (segments.length > 1) segmentHasNested.set(firstSegment, true);
+       }
+       await llmsTxtHandle?.write(`## Pages\n\n`);
+       let currentGroup = "";
+       let segmentGroupIndex = 0;
+       let urlsInCurrentGroup = 0;
+       for (let i = 0; i < sortedPages.length; i++) {
+         const page = sortedPages[i];
+         const segments = page.url.split("/").filter(Boolean);
+         const firstSegment = segments.length > 0 ? segments[0] : "";
+         const isRootLevel = segments.length <= 1;
+         const hasNested = segmentHasNested.get(firstSegment);
+         const groupKey = isRootLevel && !hasNested ? "" : firstSegment;
+         if (groupKey !== currentGroup) {
+           if (urlsInCurrentGroup > 0) {
+             if (segmentGroupIndex === 0 || segmentGroupIndex >= 1 && segmentGroupIndex <= 2 && urlsInCurrentGroup > 1) await llmsTxtHandle?.write("\n");
+           }
+           currentGroup = groupKey;
+           segmentGroupIndex++;
+           urlsInCurrentGroup = 0;
+         }
+         urlsInCurrentGroup++;
+         const descText = page.description ? `: ${page.description.substring(0, 160)}${page.description.length > 160 ? "..." : ""}` : "";
+         let chunk = "";
+         if (page.filePath && page.filePath.endsWith(".md")) {
+           const relativePath = relative(outputDir, page.filePath);
+           chunk = `- [${page.title}](${relativePath})${descText}\n`;
+         } else {
+           const url = page.url.startsWith("http://") || page.url.startsWith("https://") ? page.url : origin ? origin + page.url : page.url;
+           chunk = `- [${page.title}](${url})${descText}\n`;
+         }
+         await llmsTxtHandle?.write(chunk);
+       }
+       if (notes) {
+         const notesContent = formatNotes(notes);
+         await llmsTxtHandle?.write(`\n${notesContent}`);
+         if (generateFull && llmsFullTxtHandle) await llmsFullTxtHandle.write(`\n${notesContent}`);
+       }
        await llmsTxtHandle?.close();
        await llmsFullTxtHandle?.close();
      },
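Because `write()` now only buffers link metadata, the `## Pages` list is emitted in sorted, grouped order when the writer is closed, regardless of write order. A usage sketch (page data invented for illustration):

```ts
import { createLlmsTxtStream } from 'mdream'

const stream = createLlmsTxtStream({
  siteName: 'My Docs',
  origin: 'https://example.com',
  outputDir: './dist',
})

const writer = stream.getWriter()
// Write order no longer dictates link order in llms.txt.
await writer.write({ title: 'Install', url: '/guide/install', content: '# Install' })
await writer.write({ title: 'Home', url: '/', content: '# Home' })
await writer.write({ title: 'Guide', url: '/guide', content: '# Guide' })
await writer.close() // links are sorted, grouped by first path segment, then flushed
```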
package/dist/cli.mjs CHANGED
@@ -1,9 +1,6 @@
- import "./_chunks/const-Bf_XN9U9.mjs";
  import "./_chunks/markdown-processor-D26Uo5td.mjs";
- import "./_chunks/plugin-CjWWQTuL.mjs";
  import { n as streamHtmlToMarkdown } from "./_chunks/src-BJpipdul.mjs";
- import "./_chunks/extraction-BA9MDtq3.mjs";
- import { n as generateLlmsTxtArtifacts } from "./_chunks/llms-txt-T79S7X24.mjs";
+ import { n as generateLlmsTxtArtifacts } from "./_chunks/llms-txt-Czb_M48B.mjs";
  import "./_chunks/plugins-DJnqR2fA.mjs";
  import { t as withMinimalPreset } from "./_chunks/minimal-BiDhcwif.mjs";
  import { readFileSync } from "node:fs";
@@ -1,4 +1,26 @@
  //#region src/llms-txt.d.ts
+ /**
+  * Link in llms.txt section
+  */
+ interface LlmsTxtLink {
+   /** The title of the link */
+   title: string;
+   /** The description of the link */
+   description?: string;
+   /** The href of the link */
+   href: string;
+ }
+ /**
+  * Section in llms.txt
+  */
+ interface LlmsTxtSection {
+   /** The title of the section */
+   title: string;
+   /** The description of the section (can be array for multiple paragraphs) */
+   description?: string | string[];
+   /** The links of the section */
+   links?: LlmsTxtLink[];
+ }
  interface LlmsTxtArtifactsOptions {
    patterns?: string | string[];
    files?: ProcessedFile[];
@@ -8,6 +30,10 @@ interface LlmsTxtArtifactsOptions {
    generateFull?: boolean;
    generateMarkdown?: boolean;
    outputDir?: string;
+   /** The sections to write before pages */
+   sections?: LlmsTxtSection[];
+   /** Notes to write at the end */
+   notes?: string | string[];
  }
  interface ProcessedFile {
    filePath?: string;
@@ -48,35 +74,11 @@ interface CreateLlmsTxtStreamOptions extends Omit<LlmsTxtArtifactsOptions, 'patt
    origin?: string;
    /** Generate llms-full.txt with complete page content (defaults to false) */
    generateFull?: boolean;
+   /** The sections to write before pages */
+   sections?: LlmsTxtSection[];
+   /** Notes to write at the end */
+   notes?: string | string[];
  }
- /**
-  * Create a WritableStream that generates llms.txt artifacts by streaming pages to disk
-  *
-  * Writes llms.txt (and optionally llms-full.txt) incrementally as pages are written,
-  * never keeping full content in memory. Creates outputDir recursively if needed.
-  *
-  * @example
-  * ```typescript
-  * const stream = createLlmsTxtStream({
-  *   siteName: 'My Docs',
-  *   description: 'Documentation site',
-  *   origin: 'https://example.com',
-  *   generateFull: true,
-  *   outputDir: './dist',
-  * })
-  *
-  * const writer = stream.getWriter()
-  * await writer.write({
-  *   title: 'Home',
-  *   content: '# Welcome\n\nHome page content.',
-  *   url: '/',
-  * })
-  * await writer.close()
-  * ```
-  *
-  * @param options - Configuration options
-  * @returns WritableStream that accepts ProcessedFile objects
-  */
  declare function createLlmsTxtStream(options?: CreateLlmsTxtStreamOptions): WritableStream<ProcessedFile>;
  //#endregion
- export { CreateLlmsTxtStreamOptions, LlmsTxtArtifactsOptions, LlmsTxtArtifactsResult, ProcessedFile, createLlmsTxtStream, generateLlmsTxtArtifacts };
+ export { CreateLlmsTxtStreamOptions, LlmsTxtArtifactsOptions, LlmsTxtArtifactsResult, LlmsTxtLink, LlmsTxtSection, ProcessedFile, createLlmsTxtStream, generateLlmsTxtArtifacts };
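With `LlmsTxtSection` and `LlmsTxtLink` now exported, the `sections` option can be typed explicitly. A sketch; the exact import specifier (`mdream` vs the `mdream/llms-txt` subpath) depends on how the package re-exports these types, so treat it as an assumption:

```ts
import type { LlmsTxtSection } from 'mdream/llms-txt' // assumed subpath export

const sections: LlmsTxtSection[] = [
  {
    title: 'Getting Started',
    // description accepts a string or an array of paragraphs
    description: ['Quick start guide', 'Covers installation and first use.'],
    links: [
      { title: 'Installation', href: '/install', description: 'How to install' },
      { title: 'Quick Start', href: '/quickstart' },
    ],
  },
]
```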
package/dist/llms-txt.mjs CHANGED
@@ -1,8 +1,5 @@
- import "./_chunks/const-Bf_XN9U9.mjs";
  import "./_chunks/markdown-processor-D26Uo5td.mjs";
- import "./_chunks/plugin-CjWWQTuL.mjs";
  import "./_chunks/src-BJpipdul.mjs";
- import "./_chunks/extraction-BA9MDtq3.mjs";
- import { n as generateLlmsTxtArtifacts, t as createLlmsTxtStream } from "./_chunks/llms-txt-T79S7X24.mjs";
+ import { n as generateLlmsTxtArtifacts, t as createLlmsTxtStream } from "./_chunks/llms-txt-Czb_M48B.mjs";

  export { createLlmsTxtStream, generateLlmsTxtArtifacts };
package/dist/plugins.mjs CHANGED
@@ -1,4 +1,3 @@
- import "./_chunks/const-Bf_XN9U9.mjs";
  import { t as createPlugin } from "./_chunks/plugin-CjWWQTuL.mjs";
  import { t as extractionPlugin } from "./_chunks/extraction-BA9MDtq3.mjs";
  import { a as filterPlugin, i as frontmatterPlugin, n as readabilityPlugin, r as isolateMainPlugin, t as tailwindPlugin } from "./_chunks/plugins-DJnqR2fA.mjs";
@@ -1,6 +1,3 @@
- import "../_chunks/const-Bf_XN9U9.mjs";
- import "../_chunks/plugin-CjWWQTuL.mjs";
- import "../_chunks/extraction-BA9MDtq3.mjs";
  import "../_chunks/plugins-DJnqR2fA.mjs";
  import { t as withMinimalPreset } from "../_chunks/minimal-BiDhcwif.mjs";

package/package.json CHANGED
@@ -1,7 +1,7 @@
  {
    "name": "mdream",
    "type": "module",
-   "version": "0.15.0",
+   "version": "0.15.2",
    "description": "Ultra-performant HTML to Markdown Convertor Optimized for LLMs and llm.txt artifacts.",
    "author": {
      "name": "Harlan Wilton",