mdream 0.15.2 → 0.16.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,440 +0,0 @@
1
- import { t as htmlToMarkdown } from "./src-BJpipdul.mjs";
2
- import { t as extractionPlugin } from "./extraction-BA9MDtq3.mjs";
3
- import { mkdir, open, readFile } from "node:fs/promises";
4
- import { basename, dirname, join, relative, sep } from "pathe";
5
- import { glob } from "tinyglobby";
6
-
7
- //#region src/llms-txt.ts
8
/**
 * Collect page metadata (title, description, keywords, author) from HTML.
 *
 * The document is run through `htmlToMarkdown` with an extraction plugin whose
 * selector callbacks record values as matching elements are reported. For each
 * field the first non-empty value wins: `<title>` and `og:title` both feed
 * `title`, and the `description` / `og:description` meta tags both feed
 * `description`.
 *
 * @param html - HTML source to inspect.
 * @param url - Passed to the converter as its `origin` option.
 * @returns Object with `title`, `description`, `keywords` and `author`; a field
 *   that was never found is `undefined`.
 */
function extractMetadata(html, url) {
	const found = {
		title: "",
		description: "",
		keywords: "",
		author: ""
	};
	// Builds a callback that stores the element's `content` attribute into
	// `field`, unless that field already holds a value.
	const fromContentAttribute = (field) => (element) => {
		const value = element.attributes?.content;
		if (!found[field] && value) found[field] = value.trim();
	};
	const selectors = {
		"title": (element) => {
			const text = element.textContent;
			if (!found.title && text) found.title = text.trim();
		},
		"meta[name=\"description\"]": fromContentAttribute("description"),
		"meta[property=\"og:description\"]": fromContentAttribute("description"),
		"meta[name=\"keywords\"]": fromContentAttribute("keywords"),
		"meta[name=\"author\"]": fromContentAttribute("author"),
		"meta[property=\"og:title\"]": fromContentAttribute("title")
	};
	// The markdown result is discarded; the call is made only so the selector
	// callbacks above run.
	htmlToMarkdown(html, {
		plugins: [extractionPlugin(selectors)],
		origin: url
	});
	return {
		title: found.title || void 0,
		description: found.description || void 0,
		keywords: found.keywords || void 0,
		author: found.author || void 0
	};
}
46
/**
 * Turn a file path into a site-relative URL path.
 *
 * The path is taken relative to `baseDir` and path separators become `/`.
 * A trailing `.html` is removed, then a trailing `/index`; a bare `index`
 * maps to `/`. The result always starts with `/`.
 *
 * @param filePath - Path of the file.
 * @param baseDir - Directory the URL is computed relative to.
 * @returns URL path such as `/`, `/docs` or `/docs/guide`.
 */
function pathToUrl(filePath, baseDir) {
	const htmlExtension = ".html";
	const indexSuffix = "/index";
	let route = relative(baseDir, filePath).split(sep).join("/");
	if (route.endsWith(htmlExtension)) route = route.slice(0, -htmlExtension.length);
	if (route.endsWith(indexSuffix)) route = route.slice(0, -indexSuffix.length);
	if (route === "index") return "/";
	return route.startsWith("/") ? route : `/${route}`;
}
58
/**
 * Process HTML files from glob patterns.
 *
 * Expands every pattern, de-duplicates the matches (first-seen order), then
 * reads each file and converts it to markdown together with its extracted
 * metadata. Page URLs are computed relative to the deepest directory shared by
 * ALL matched files. Using only the first match's directory would give files
 * outside it a `/../…` URL whenever that first match happened to be nested.
 *
 * A file that fails to read or convert is logged with `console.error` and
 * skipped; the remaining files are still processed.
 *
 * @param patterns - A glob pattern or an array of glob patterns.
 * @param origin - Optional origin forwarded to the HTML-to-markdown converter;
 *   metadata extraction falls back to the file path when it is not given.
 * @returns Promise of an array of `{ filePath, title, content, url, metadata }`.
 */
async function processHtmlFiles(patterns, origin) {
	const patternList = Array.isArray(patterns) ? patterns : [patterns];
	const uniqueFiles = /* @__PURE__ */ new Set();
	for (const pattern of patternList) for (const match of await glob(pattern)) uniqueFiles.add(match);
	// Narrow the directory segments down to the prefix shared by every file.
	let baseSegments = null;
	for (const filePath of uniqueFiles) {
		const segments = dirname(filePath).split(sep);
		if (baseSegments === null) {
			baseSegments = segments;
			continue;
		}
		let shared = 0;
		while (shared < baseSegments.length && shared < segments.length && baseSegments[shared] === segments[shared]) shared++;
		baseSegments = baseSegments.slice(0, shared);
	}
	// An empty join with segments left means only the filesystem root is shared.
	let baseDir = ".";
	if (baseSegments !== null && baseSegments.length > 0) baseDir = baseSegments.join(sep) || sep;
	const results = [];
	for (const filePath of uniqueFiles) try {
		const html = await readFile(filePath, "utf-8");
		const metadata = extractMetadata(html, origin || filePath);
		const content = htmlToMarkdown(html, { origin });
		results.push({
			filePath,
			// Fall back to the file name when the document has no title.
			title: metadata?.title || basename(filePath, ".html"),
			content,
			url: pathToUrl(filePath, baseDir),
			metadata
		});
	} catch (error) {
		console.error(`Error processing ${filePath}:`, error);
	}
	return results;
}
88
/**
 * Generate llms.txt content.
 *
 * Output layout: `# siteName`, optional `> description`, optional
 * `Canonical Origin:` line, any custom sections, a `## Pages` list with one
 * bullet per file, then optional notes. Page descriptions are truncated to 100
 * characters with a trailing `...`.
 *
 * Links: a file whose `filePath` ends in `.md` is linked by its path relative
 * to `options.outputDir` (when set). Every other file is linked by URL;
 * relative URLs are joined to `origin` with exactly one `/`, so an origin with
 * a trailing slash or a URL without a leading slash still yields a valid link.
 *
 * @param files - Entries with `title`, `url` and optional `filePath` / `metadata`.
 * @param options - `siteName`, `description`, `origin`, `sections`, `notes`, `outputDir`.
 * @returns The llms.txt document as a string.
 */
function generateLlmsTxtContent(files, options) {
	const { siteName = "Site", description, origin = "", sections, notes } = options;
	// Absolute http(s) URLs pass through; without an origin the URL is used as-is.
	const toAbsoluteUrl = (pageUrl) => {
		if (!origin || pageUrl.startsWith("http://") || pageUrl.startsWith("https://")) return pageUrl;
		return `${origin.replace(/\/+$/, "")}/${pageUrl.replace(/^\/+/, "")}`;
	};
	let content = `# ${siteName}\n\n`;
	if (description) content += `> ${description}\n\n`;
	if (origin) content += `Canonical Origin: ${origin}\n\n`;
	if (sections) for (const section of sections) content += formatSection(section);
	if (files.length > 0) {
		content += `## Pages\n\n`;
		for (const file of files) {
			const desc = file.metadata?.description;
			const descText = desc ? `: ${desc.substring(0, 100)}${desc.length > 100 ? "..." : ""}` : "";
			const isLocalMarkdown = file.filePath && options.outputDir && file.filePath.endsWith(".md");
			const target = isLocalMarkdown ? relative(options.outputDir, file.filePath) : toAbsoluteUrl(file.url);
			content += `- [${file.title}](${target})${descText}\n`;
		}
	}
	if (notes) content += `\n${formatNotes(notes)}`;
	return content;
}
114
/**
 * Parse frontmatter from markdown content.
 *
 * Frontmatter is a block at the very start of the content delimited by `---`
 * lines. Each line inside it is split at its FIRST colon into a trimmed key
 * and a trimmed value, so values may themselves contain colons (e.g. URLs).
 * Lines with no colon, or with a colon in column 0, are ignored. Values are
 * kept as plain strings; no YAML typing or unquoting is performed.
 *
 * Both LF and CRLF line endings are accepted, and the closing `---` may be the
 * last thing in the input without a trailing newline.
 *
 * @param content - Markdown text, possibly starting with frontmatter.
 * @returns `{ frontmatter, body }`. `frontmatter` is `null` and `body` is the
 *   untouched input when no frontmatter block is present.
 */
function parseFrontmatter(content) {
	const match = content.match(/^---\r?\n([\s\S]*?)\r?\n---(?:\r?\n|$)([\s\S]*)$/);
	if (!match) return {
		frontmatter: null,
		body: content
	};
	const [, frontmatterContent, body] = match;
	const frontmatter = {};
	for (const line of frontmatterContent.split(/\r?\n/)) {
		const colonIndex = line.indexOf(":");
		if (colonIndex > 0) {
			const key = line.substring(0, colonIndex).trim();
			frontmatter[key] = line.substring(colonIndex + 1).trim();
		}
	}
	return {
		frontmatter,
		body
	};
}
139
/**
 * Serialize a frontmatter object to a YAML-like `key: value` listing.
 *
 * Entries whose value is `undefined` or `null` are skipped; every other value
 * is stringified with `String()`. Line breaks inside a value (with their
 * surrounding whitespace) are collapsed to a single space so each entry stays
 * on one line. Otherwise a multi-line value would spill into lines the
 * line-based frontmatter parser drops or misreads, and a stray `---` line
 * could end the block early.
 *
 * @param data - Plain object of frontmatter fields.
 * @returns Entries joined by `\n`, without surrounding `---` delimiters.
 */
function serializeFrontmatter(data) {
	const lines = [];
	for (const [key, value] of Object.entries(data)) {
		if (value === void 0 || value === null) continue;
		lines.push(`${key}: ${String(value).replace(/\s*[\r\n]+\s*/g, " ")}`);
	}
	return lines.join("\n");
}
147
/**
 * Generate llms-full.txt content with complete page content.
 *
 * Output layout: the same header as llms.txt (`# siteName`, optional
 * description, canonical origin and custom sections), a `## Table of Contents`
 * list, then one block per file made of a `---`-delimited frontmatter section
 * followed by the page body and a `---` separator. Optional notes come last.
 *
 * Per-page frontmatter merges any frontmatter already present in the page
 * content with generated fields (`title`, `url`, `file`, `description`,
 * `keywords`, `author`); generated fields win on conflict. If the body's first
 * line is just the page title (bare or as a `# ` heading) that line is removed
 * so the title is not repeated.
 *
 * Relative page URLs are joined to `origin` with exactly one `/`, so an origin
 * with a trailing slash or a URL without a leading slash still yields a valid
 * link.
 *
 * @param files - Entries with `title`, `url`, `content` and optional `filePath` / `metadata`.
 * @param options - `siteName`, `description`, `origin`, `sections`, `notes`, `outputDir`.
 * @returns The llms-full.txt document as a string.
 */
function generateLlmsFullTxtContent(files, options) {
	const { siteName = "Site", description, origin = "", sections, notes } = options;
	// Absolute http(s) URLs pass through; without an origin the URL is used as-is.
	const toAbsoluteUrl = (pageUrl) => {
		if (!origin || pageUrl.startsWith("http://") || pageUrl.startsWith("https://")) return pageUrl;
		return `${origin.replace(/\/+$/, "")}/${pageUrl.replace(/^\/+/, "")}`;
	};
	let content = `# ${siteName}\n\n`;
	if (description) content += `> ${description}\n\n`;
	if (origin) content += `Canonical Origin: ${origin}\n\n`;
	if (sections) for (const section of sections) content += formatSection(section);
	if (files.length > 0) {
		content += `## Table of Contents\n\n`;
		for (const file of files) {
			// Anchor slug: lowercase title with every non-alphanumeric replaced by "-".
			const anchor = file.title.toLowerCase().replace(/[^a-z0-9]/g, "-");
			content += `- [${file.title}](#${anchor})\n`;
		}
		content += `\n---\n\n`;
		for (const file of files) {
			const { frontmatter, body } = parseFrontmatter(file.content);
			const metadata = {
				title: file.title,
				url: toAbsoluteUrl(file.url)
			};
			if (file.filePath) metadata.file = options.outputDir ? relative(options.outputDir, file.filePath) : file.filePath;
			if (file.metadata) {
				if (file.metadata.description) metadata.description = file.metadata.description;
				if (file.metadata.keywords) metadata.keywords = file.metadata.keywords;
				if (file.metadata.author) metadata.author = file.metadata.author;
			}
			// Generated fields override same-named fields from the page's own frontmatter.
			const frontmatterString = serializeFrontmatter(frontmatter ? {
				...frontmatter,
				...metadata
			} : metadata);
			let contentBody = frontmatter ? body : file.content;
			const bodyLines = contentBody.trim().split("\n");
			const titleLine = bodyLines[0];
			if (titleLine === file.title || titleLine === `# ${file.title}`) contentBody = bodyLines.slice(1).join("\n").trimStart();
			content += `---\n${frontmatterString}\n---\n\n${contentBody}\n\n---\n\n`;
		}
	}
	if (notes) content += `\n${formatNotes(notes)}`;
	return content;
}
190
/**
 * Build the list of per-page markdown files to emit.
 *
 * Each page maps to `md/<slug>.md`, where the slug is the page URL with one
 * leading and one trailing slash removed; the root URL `/` becomes `index`.
 *
 * @param files - Entries with `url` and `content`.
 * @returns Array of `{ path, content }` in the same order as `files`.
 */
function generateMarkdownFilesContent(files) {
	const outputs = [];
	for (const { url, content } of files) {
		let slug = "index";
		if (url !== "/") slug = url.replace(/^\//, "").replace(/\/$/, "");
		outputs.push({
			path: `md/${slug}.md`,
			content
		});
	}
	return outputs;
}
204
/**
 * Entry point: produce the llms.txt artifacts for a set of pages.
 *
 * Pages come from `options.files` when provided, otherwise from processing the
 * HTML files matched by `options.patterns`. llms.txt content is always
 * generated; llms-full.txt and the per-page markdown list are generated only
 * when `options.generateFull` / `options.generateMarkdown` are set.
 *
 * @param options - Generation options; must carry either `files` or `patterns`.
 * @returns Promise of `{ llmsTxt, llmsFullTxt, markdownFiles, processedFiles }`;
 *   the optional artifacts are `undefined` when not requested.
 * @throws Error when neither `files` nor `patterns` is provided.
 */
async function generateLlmsTxtArtifacts(options) {
	const { files: providedFiles, patterns } = options;
	if (!providedFiles && !patterns) throw new Error("Either patterns or files must be provided");
	const processedFiles = providedFiles ? providedFiles : await processHtmlFiles(patterns, options.origin);
	const llmsTxt = generateLlmsTxtContent(processedFiles, options);
	const llmsFullTxt = options.generateFull ? generateLlmsFullTxtContent(processedFiles, options) : void 0;
	const markdownFiles = options.generateMarkdown ? generateMarkdownFilesContent(processedFiles) : void 0;
	return {
		llmsTxt,
		llmsFullTxt,
		markdownFiles,
		processedFiles
	};
}
224
/**
 * Render one custom section as markdown.
 *
 * Produces a `## title` heading, then each description paragraph followed by a
 * blank line, then a bullet list of links (`- [title](href): description`)
 * closed by a blank line. The description may be a single string or an array
 * of strings; the description and link parts are omitted when absent or empty.
 *
 * @param section - `{ title, description?, links? }`.
 * @returns Markdown text for the section.
 */
function formatSection(section) {
	const { description, links } = section;
	const parts = [`## ${section.title}\n\n`];
	if (description) {
		const paragraphs = Array.isArray(description) ? description : [description];
		for (const paragraph of paragraphs) parts.push(`${paragraph}\n\n`);
	}
	if (links?.length) {
		for (const link of links) {
			const suffix = link.description ? `: ${link.description}` : "";
			parts.push(`- [${link.title}](${link.href})${suffix}\n`);
		}
		parts.push("\n");
	}
	return parts.join("");
}
242
/**
 * Render the notes block.
 *
 * Accepts a single note or an array of notes and emits each one followed by a
 * blank line.
 *
 * @param notes - One note or an array of notes.
 * @returns The notes as text, or an empty string for an empty array.
 */
function formatNotes(notes) {
	const entries = Array.isArray(notes) ? notes : [notes];
	return entries.map((entry) => `${entry}\n\n`).join("");
}
251
- /**
252
- * Create a WritableStream that generates llms.txt artifacts by streaming pages to disk
253
- *
254
- * Writes llms.txt (and optionally llms-full.txt) incrementally as pages are written,
255
- * never keeping full content in memory. Creates outputDir recursively if needed.
256
- *
257
- * @example
258
- * ```typescript
259
- * const stream = createLlmsTxtStream({
260
- * siteName: 'My Docs',
261
- * description: 'Documentation site',
262
- * origin: 'https://example.com',
263
- * generateFull: true,
264
- * outputDir: './dist',
265
- * sections: [
266
- * {
267
- * title: 'Getting Started',
268
- * description: 'Quick start guide',
269
- * links: [
270
- * { title: 'Installation', href: '/install', description: 'How to install' },
271
- * { title: 'Quick Start', href: '/quickstart' },
272
- * ],
273
- * },
274
- * ],
275
- * notes: ['Generated by mdream', 'Last updated: 2024'],
276
- * })
277
- *
278
- * const writer = stream.getWriter()
279
- * await writer.write({
280
- * title: 'Home',
281
- * content: '# Welcome\n\nHome page content.',
282
- * url: '/',
283
- * })
284
- * await writer.close()
285
- * ```
286
- *
287
- * @param options - Configuration options
288
- * @returns WritableStream that accepts ProcessedFile objects
289
- */
290
- /**
291
- * Get the group key for a URL (up to 2 segments deep)
292
- */
293
/**
 * Sort pages by URL path in hierarchical order (directory tree structure).
 *
 * Pages are grouped by their first path segment. A page with at most one
 * segment whose first segment has no nested pages belongs to the root group
 * (key `""`), which sorts before every named group. Named groups are ordered
 * with `localeCompare`. Inside a group the root page `/` comes first, then
 * pages are compared segment by segment, with a shorter path sorting before a
 * longer path that shares its prefix.
 *
 * The array is sorted IN PLACE and the same array is returned. Each URL is
 * split once up front rather than on every comparison, and two root pages
 * compare as equal so the comparator stays consistent in both directions.
 *
 * @param pages - Objects with a `url` string.
 * @returns The same `pages` array, sorted.
 */
function sortPagesByPath(pages) {
	const segmentsByPage = /* @__PURE__ */ new Map();
	const segmentHasNested = /* @__PURE__ */ new Map();
	for (const page of pages) {
		const segments = page.url.split("/").filter(Boolean);
		segmentsByPage.set(page, segments);
		const firstSegment = segments.length > 0 ? segments[0] : "";
		if (!segmentHasNested.has(firstSegment)) segmentHasNested.set(firstSegment, false);
		if (segments.length > 1) segmentHasNested.set(firstSegment, true);
	}
	// Root-level pages whose segment has no children share the "" group.
	const groupKeyOf = (segments) => {
		const firstSegment = segments.length > 0 ? segments[0] : "";
		return segments.length <= 1 && !segmentHasNested.get(firstSegment) ? "" : firstSegment;
	};
	return pages.sort((a, b) => {
		const segmentsA = segmentsByPage.get(a);
		const segmentsB = segmentsByPage.get(b);
		const groupKeyA = groupKeyOf(segmentsA);
		const groupKeyB = groupKeyOf(segmentsB);
		if (groupKeyA !== groupKeyB) {
			if (groupKeyA === "") return -1;
			if (groupKeyB === "") return 1;
			return groupKeyA.localeCompare(groupKeyB);
		}
		// "/" leads its group; two "/" pages tie (0) instead of both claiming -1.
		if (segmentsA.length === 0 || segmentsB.length === 0) return segmentsA.length - segmentsB.length;
		const minLen = Math.min(segmentsA.length, segmentsB.length);
		for (let i = 0; i < minLen; i++) {
			const cmp = segmentsA[i].localeCompare(segmentsB[i]);
			if (cmp !== 0) return cmp;
		}
		return segmentsA.length - segmentsB.length;
	});
}
329
/**
 * Create a WritableStream that writes llms.txt (and, with `generateFull`,
 * llms-full.txt) into `outputDir`.
 *
 * - `start` creates `outputDir` recursively, opens the output file(s) and
 *   writes the header (site name, description, canonical origin, sections).
 * - `write` appends the page's frontmatter + body to llms-full.txt right away
 *   and buffers only the page's url/title/description/filePath for the index.
 * - `close` sorts the buffered pages by path, writes the `## Pages` list and
 *   the notes, then closes the file(s).
 *
 * File handles are released on every exit path: when `start` fails part-way,
 * when a write inside `close` fails, and on `abort`, where both handles are
 * closed even if closing the first one fails. Relative page URLs are joined to
 * `origin` with exactly one `/`.
 *
 * @param options - `siteName`, `description`, `origin`, `generateFull`,
 *   `outputDir` (defaults to `process.cwd()`), `sections`, `notes`.
 * @returns WritableStream accepting `{ title, content, url, filePath?, metadata? }` chunks.
 */
function createLlmsTxtStream(options = {}) {
	const { siteName = "Site", description, origin = "", generateFull, outputDir = process.cwd(), sections, notes } = options;
	let llmsTxtHandle;
	let llmsFullTxtHandle;
	const bufferedPages = [];
	// Absolute http(s) URLs pass through; without an origin the URL is used as-is.
	const toAbsoluteUrl = (pageUrl) => {
		if (!origin || pageUrl.startsWith("http://") || pageUrl.startsWith("https://")) return pageUrl;
		return `${origin.replace(/\/+$/, "")}/${pageUrl.replace(/^\/+/, "")}`;
	};
	// Attempt to close both handles, then surface the first close failure (if any).
	const closeHandles = async () => {
		const results = await Promise.allSettled([llmsTxtHandle?.close(), llmsFullTxtHandle?.close()]);
		const failure = results.find((result) => result.status === "rejected");
		if (failure) throw failure.reason;
	};
	return new WritableStream({
		async start() {
			try {
				await mkdir(outputDir, { recursive: true });
				llmsTxtHandle = await open(join(outputDir, "llms.txt"), "w");
				// Both files start with the same header.
				let header = `# ${siteName}\n\n`;
				if (description) header += `> ${description}\n\n`;
				if (origin) header += `Canonical Origin: ${origin}\n\n`;
				if (sections) for (const section of sections) header += formatSection(section);
				await llmsTxtHandle.write(header);
				if (generateFull) {
					llmsFullTxtHandle = await open(join(outputDir, "llms-full.txt"), "w");
					await llmsFullTxtHandle.write(header);
				}
			} catch (error) {
				// Release anything already opened, but report the original failure.
				await closeHandles().catch(() => {});
				throw error;
			}
		},
		async write(file) {
			bufferedPages.push({
				url: file.url,
				title: file.title,
				description: file.metadata?.description,
				filePath: file.filePath
			});
			if (generateFull && llmsFullTxtHandle) {
				const { frontmatter, body } = parseFrontmatter(file.content);
				const metadata = {
					title: file.title,
					url: toAbsoluteUrl(file.url)
				};
				if (file.filePath) metadata.file = relative(outputDir, file.filePath);
				if (file.metadata) {
					if (file.metadata.description) metadata.description = file.metadata.description;
					if (file.metadata.keywords) metadata.keywords = file.metadata.keywords;
					if (file.metadata.author) metadata.author = file.metadata.author;
				}
				// Generated fields override same-named fields from the page's own frontmatter.
				const frontmatterString = serializeFrontmatter(frontmatter ? {
					...frontmatter,
					...metadata
				} : metadata);
				let contentBody = frontmatter ? body : file.content;
				const bodyLines = contentBody.trim().split("\n");
				const titleLine = bodyLines[0];
				// Drop a leading line that merely repeats the page title.
				if (titleLine === file.title || titleLine === `# ${file.title}`) contentBody = bodyLines.slice(1).join("\n").trimStart();
				await llmsFullTxtHandle.write(`---\n${frontmatterString}\n---\n\n${contentBody}\n\n---\n\n`);
			}
		},
		async close() {
			try {
				const sortedPages = sortPagesByPath(bufferedPages);
				// Record, per first path segment, whether any page lives below it.
				const segmentHasNested = /* @__PURE__ */ new Map();
				for (const page of sortedPages) {
					const segments = page.url.split("/").filter(Boolean);
					const firstSegment = segments.length > 0 ? segments[0] : "";
					if (!segmentHasNested.has(firstSegment)) segmentHasNested.set(firstSegment, false);
					if (segments.length > 1) segmentHasNested.set(firstSegment, true);
				}
				await llmsTxtHandle?.write(`## Pages\n\n`);
				let currentGroup = "";
				let segmentGroupIndex = 0;
				let urlsInCurrentGroup = 0;
				for (const page of sortedPages) {
					const segments = page.url.split("/").filter(Boolean);
					const firstSegment = segments.length > 0 ? segments[0] : "";
					const isRootLevel = segments.length <= 1;
					const hasNested = segmentHasNested.get(firstSegment);
					const groupKey = isRootLevel && !hasNested ? "" : firstSegment;
					if (groupKey !== currentGroup) {
						// A blank line separates the finished group from the next one only
						// after the root group, or after the first two named groups when
						// they listed more than one URL.
						if (urlsInCurrentGroup > 0) {
							if (segmentGroupIndex === 0 || segmentGroupIndex >= 1 && segmentGroupIndex <= 2 && urlsInCurrentGroup > 1) await llmsTxtHandle?.write("\n");
						}
						currentGroup = groupKey;
						segmentGroupIndex++;
						urlsInCurrentGroup = 0;
					}
					urlsInCurrentGroup++;
					const descText = page.description ? `: ${page.description.substring(0, 160)}${page.description.length > 160 ? "..." : ""}` : "";
					// Generated markdown files are linked by path relative to outputDir.
					const target = page.filePath && page.filePath.endsWith(".md") ? relative(outputDir, page.filePath) : toAbsoluteUrl(page.url);
					await llmsTxtHandle?.write(`- [${page.title}](${target})${descText}\n`);
				}
				if (notes) {
					const notesContent = formatNotes(notes);
					await llmsTxtHandle?.write(`\n${notesContent}`);
					if (generateFull && llmsFullTxtHandle) await llmsFullTxtHandle.write(`\n${notesContent}`);
				}
			} finally {
				// Runs whether or not the writes above succeeded.
				await closeHandles();
			}
		},
		async abort(reason) {
			await closeHandles();
		}
	});
}
438
-
439
- //#endregion
440
- export { generateLlmsTxtArtifacts as n, createLlmsTxtStream as t };
@@ -1,40 +0,0 @@
1
- import { A as TAG_BUTTON, Dt as TAG_OBJECT, Ht as TAG_SELECT, K as TAG_EMBED, S as TAG_ASIDE, X as TAG_FOOTER, Y as TAG_FIGURE, Z as TAG_FORM, dt as TAG_INPUT, lt as TAG_IFRAME, nn as TAG_TEXTAREA, q as TAG_FIELDSET, wt as TAG_NAV } from "./const-Bf_XN9U9.mjs";
2
- import { a as filterPlugin, i as frontmatterPlugin, r as isolateMainPlugin, t as tailwindPlugin } from "./plugins-DJnqR2fA.mjs";
3
-
4
- //#region src/preset/minimal.ts
5
/**
 * Build HTML-to-Markdown options preconfigured with the minimal preset.
 *
 * The preset installs, in order: the frontmatter plugin, the isolate-main
 * plugin, the tailwind plugin, and a filter plugin excluding form, fieldset,
 * object, embed, figure, footer, aside, iframe, input, textarea, select,
 * button and nav tags. Plugins supplied via `options.plugins` are appended
 * after the preset plugins.
 *
 * @param options HTML to Markdown options
 * @returns A copy of `options` whose `plugins` is the combined plugin list
 */
function withMinimalPreset(options = {}) {
	const excludedTags = [
		TAG_FORM,
		TAG_FIELDSET,
		TAG_OBJECT,
		TAG_EMBED,
		TAG_FIGURE,
		TAG_FOOTER,
		TAG_ASIDE,
		TAG_IFRAME,
		TAG_INPUT,
		TAG_TEXTAREA,
		TAG_SELECT,
		TAG_BUTTON,
		TAG_NAV
	];
	const presetPlugins = [
		frontmatterPlugin(),
		isolateMainPlugin(),
		tailwindPlugin(),
		filterPlugin({ exclude: excludedTags })
	];
	const plugins = options.plugins ? [...presetPlugins, ...options.plugins] : presetPlugins;
	return {
		...options,
		plugins
	};
}
38
-
39
- //#endregion
40
- export { withMinimalPreset as t };
@@ -1,12 +0,0 @@
1
- //#region src/pluggable/plugin.ts
2
/**
 * Identity helper for declaring a plugin.
 *
 * Performs no work at runtime: the object passed in is handed back unchanged
 * (same reference). Its purpose is to give plugin definitions better type
 * inference at the call site.
 *
 * @param plugin The plugin definition
 * @returns The same plugin object that was passed in
 */
function createPlugin(plugin) {
	const definition = plugin;
	return definition;
}
10
-
11
- //#endregion
12
- export { createPlugin as t };