mdream 0.15.3 → 0.17.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +20 -10
- package/dist/_chunks/const.mjs +109 -268
- package/dist/_chunks/extraction.mjs +3 -5
- package/dist/_chunks/markdown-processor.mjs +195 -186
- package/dist/_chunks/plugin.d.mts +1 -2
- package/dist/_chunks/plugin.mjs +1 -2
- package/dist/_chunks/plugins.mjs +38 -215
- package/dist/_chunks/src.mjs +1 -4
- package/dist/_chunks/types.d.mts +6 -22
- package/dist/cli.mjs +6 -5
- package/dist/iife.js +8 -8
- package/dist/index.d.mts +4 -7
- package/dist/index.mjs +2 -3
- package/dist/llms-txt.mjs +468 -4
- package/dist/negotiate.d.mts +26 -0
- package/dist/negotiate.mjs +92 -0
- package/dist/plugins.d.mts +4 -15
- package/dist/plugins.mjs +3 -3
- package/dist/preset/minimal.d.mts +1 -2
- package/dist/preset/minimal.mjs +40 -4
- package/dist/splitter.d.mts +1 -2
- package/dist/splitter.mjs +26 -28
- package/package.json +10 -2
- package/dist/_chunks/llms-txt.mjs +0 -464
- package/dist/_chunks/minimal.mjs +0 -40
package/dist/preset/minimal.mjs
CHANGED
|
@@ -1,4 +1,40 @@
|
|
|
1
|
-
import "../_chunks/
|
|
2
|
-
import
|
|
3
|
-
|
|
4
|
-
|
|
1
|
+
import "../_chunks/const.mjs";
|
|
2
|
+
import "../_chunks/plugin.mjs";
|
|
3
|
+
import "../_chunks/extraction.mjs";
|
|
4
|
+
import { i as filterPlugin, n as isolateMainPlugin, r as frontmatterPlugin, t as tailwindPlugin } from "../_chunks/plugins.mjs";
|
|
5
|
+
//#region src/preset/minimal.ts
|
|
6
|
+
/**
|
|
7
|
+
* Creates a configurable minimal preset with advanced options
|
|
8
|
+
*
|
|
9
|
+
* @param options HTML to Markdown options
|
|
10
|
+
* @returns HTML to Markdown options with configured plugins
|
|
11
|
+
*/
|
|
12
|
+
function withMinimalPreset(options = {}) {
|
|
13
|
+
const filter = filterPlugin({ exclude: [
|
|
14
|
+
40,
|
|
15
|
+
68,
|
|
16
|
+
103,
|
|
17
|
+
58,
|
|
18
|
+
47,
|
|
19
|
+
88,
|
|
20
|
+
73,
|
|
21
|
+
59,
|
|
22
|
+
66,
|
|
23
|
+
65,
|
|
24
|
+
43,
|
|
25
|
+
41
|
|
26
|
+
] });
|
|
27
|
+
const plugins = [
|
|
28
|
+
frontmatterPlugin(),
|
|
29
|
+
isolateMainPlugin(),
|
|
30
|
+
tailwindPlugin(),
|
|
31
|
+
...options.plugins || [],
|
|
32
|
+
filter
|
|
33
|
+
];
|
|
34
|
+
return {
|
|
35
|
+
...options,
|
|
36
|
+
plugins
|
|
37
|
+
};
|
|
38
|
+
}
|
|
39
|
+
//#endregion
|
|
40
|
+
export { withMinimalPreset };
|
package/dist/splitter.d.mts
CHANGED
|
@@ -1,7 +1,6 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { a as MarkdownChunk, m as SplitterOptions } from "./_chunks/types.mjs";
|
|
2
2
|
|
|
3
3
|
//#region src/splitter.d.ts
|
|
4
|
-
|
|
5
4
|
/**
|
|
6
5
|
* Convert HTML to Markdown and split into chunks in single pass
|
|
7
6
|
* Yields chunks during HTML event processing for better memory efficiency
|
package/dist/splitter.mjs
CHANGED
|
@@ -1,13 +1,14 @@
|
|
|
1
|
-
import
|
|
1
|
+
import "./_chunks/const.mjs";
|
|
2
2
|
import { a as parseHtmlStream, n as createMarkdownProcessor, r as processPluginsForEvent } from "./_chunks/markdown-processor.mjs";
|
|
3
|
-
|
|
4
3
|
//#region src/splitter.ts
|
|
4
|
+
const MARKDOWN_HEADER_LINE_RE = /^#{1,6}\s+/;
|
|
5
|
+
const NEWLINE_RE = /\n/g;
|
|
5
6
|
const DEFAULT_HEADERS_TO_SPLIT_ON = [
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
7
|
+
8,
|
|
8
|
+
9,
|
|
9
|
+
10,
|
|
10
|
+
11,
|
|
11
|
+
12
|
|
11
12
|
];
|
|
12
13
|
function createOptions(options) {
|
|
13
14
|
return {
|
|
@@ -35,9 +36,7 @@ function shouldSplitOnHeader(tagId, options) {
|
|
|
35
36
|
* Get current markdown content WITHOUT clearing buffers
|
|
36
37
|
*/
|
|
37
38
|
function getCurrentMarkdown(state) {
|
|
38
|
-
|
|
39
|
-
for (const [regionId, content] of state.regionContentBuffers.entries()) if (state.regionToggles.get(regionId)) fragments.push(...content);
|
|
40
|
-
return fragments.join("").trimStart();
|
|
39
|
+
return state.buffer.join("").trimStart();
|
|
41
40
|
}
|
|
42
41
|
/**
|
|
43
42
|
* Convert HTML to Markdown and split into chunks in single pass
|
|
@@ -69,7 +68,7 @@ function* htmlToMarkdownSplitChunksStream(html, options = {}) {
|
|
|
69
68
|
}
|
|
70
69
|
let chunkContent = originalChunkContent;
|
|
71
70
|
if (opts.stripHeaders) {
|
|
72
|
-
chunkContent = chunkContent.split("\n").filter((line) => !
|
|
71
|
+
chunkContent = chunkContent.split("\n").filter((line) => !MARKDOWN_HEADER_LINE_RE.test(line)).join("\n").trim();
|
|
73
72
|
if (!chunkContent) {
|
|
74
73
|
lastChunkEndPosition = chunkEnd;
|
|
75
74
|
return;
|
|
@@ -79,13 +78,13 @@ function* htmlToMarkdownSplitChunksStream(html, options = {}) {
|
|
|
79
78
|
content: chunkContent.trimEnd(),
|
|
80
79
|
metadata: { loc: { lines: {
|
|
81
80
|
from: lineNumber,
|
|
82
|
-
to: lineNumber + (originalChunkContent.match(
|
|
81
|
+
to: lineNumber + (originalChunkContent.match(NEWLINE_RE) || []).length
|
|
83
82
|
} } }
|
|
84
83
|
};
|
|
85
84
|
if (headerHierarchy.size > 0) {
|
|
86
85
|
chunk.metadata.headers = {};
|
|
87
86
|
for (const [tagId, text] of headerHierarchy.entries()) {
|
|
88
|
-
const level = `h${tagId -
|
|
87
|
+
const level = `h${tagId - 7 + 1}`;
|
|
89
88
|
chunk.metadata.headers[level] = text;
|
|
90
89
|
}
|
|
91
90
|
}
|
|
@@ -97,7 +96,7 @@ function* htmlToMarkdownSplitChunksStream(html, options = {}) {
|
|
|
97
96
|
const maxOverlap = Math.max(0, originalChunkContent.length - 1);
|
|
98
97
|
lastChunkEndPosition = chunkEnd - Math.min(opts.chunkOverlap, maxOverlap);
|
|
99
98
|
} else lastChunkEndPosition = chunkEnd;
|
|
100
|
-
lineNumber += (originalChunkContent.match(
|
|
99
|
+
lineNumber += (originalChunkContent.match(NEWLINE_RE) || []).length;
|
|
101
100
|
}
|
|
102
101
|
const parseState = {
|
|
103
102
|
depthMap: processor.state.depthMap,
|
|
@@ -110,36 +109,36 @@ function* htmlToMarkdownSplitChunksStream(html, options = {}) {
|
|
|
110
109
|
});
|
|
111
110
|
for (const event of eventBuffer) {
|
|
112
111
|
const { type: eventType, node } = event;
|
|
113
|
-
if (node.type ===
|
|
112
|
+
if (node.type === 1) {
|
|
114
113
|
const element = node;
|
|
115
114
|
const tagId = element.tagId;
|
|
116
|
-
if (tagId && tagId >=
|
|
117
|
-
if (eventType ===
|
|
115
|
+
if (tagId && tagId >= 7 && tagId <= 12) {
|
|
116
|
+
if (eventType === 0) {
|
|
118
117
|
collectingHeaderText = true;
|
|
119
118
|
currentHeaderTagId = tagId;
|
|
120
119
|
currentHeaderText = "";
|
|
121
120
|
if (shouldSplitOnHeader(tagId, opts)) {
|
|
122
121
|
if (seenSplitHeaders.has(tagId)) {
|
|
123
122
|
yield* flushChunk();
|
|
124
|
-
for (let i = tagId; i <=
|
|
123
|
+
for (let i = tagId; i <= 12; i++) headerHierarchy.delete(i);
|
|
125
124
|
}
|
|
126
125
|
seenSplitHeaders.add(tagId);
|
|
127
126
|
}
|
|
128
|
-
} else if (eventType ===
|
|
127
|
+
} else if (eventType === 1 && currentHeaderTagId === tagId) {
|
|
129
128
|
headerHierarchy.set(tagId, currentHeaderText.trim());
|
|
130
129
|
collectingHeaderText = false;
|
|
131
130
|
currentHeaderTagId = null;
|
|
132
131
|
}
|
|
133
132
|
}
|
|
134
|
-
if (tagId ===
|
|
135
|
-
if (eventType ===
|
|
133
|
+
if (tagId === 23 && element.depthMap[34] > 0) {
|
|
134
|
+
if (eventType === 0) {
|
|
136
135
|
const lang = getCodeLanguage(element);
|
|
137
136
|
if (lang && !currentChunkCodeLanguage) currentChunkCodeLanguage = lang;
|
|
138
137
|
}
|
|
139
138
|
}
|
|
140
|
-
if (tagId ===
|
|
139
|
+
if (tagId === 13 && eventType === 0) yield* flushChunk();
|
|
141
140
|
}
|
|
142
|
-
if (collectingHeaderText && node.type ===
|
|
141
|
+
if (collectingHeaderText && node.type === 2) currentHeaderText += node.value;
|
|
143
142
|
processPluginsForEvent(event, opts.plugins, processor.state, processor.processEvent);
|
|
144
143
|
if (!opts.returnEachLine) {
|
|
145
144
|
const currentMd = getCurrentMarkdown(processor.state);
|
|
@@ -158,10 +157,10 @@ function* htmlToMarkdownSplitChunksStream(html, options = {}) {
|
|
|
158
157
|
if (idx >= 0) {
|
|
159
158
|
const beforeSplit = currentMd.slice(0, candidateSplitPos);
|
|
160
159
|
let backtickCount = 0;
|
|
161
|
-
let pos = 0;
|
|
162
|
-
while (
|
|
160
|
+
let pos = beforeSplit.indexOf("```", 0);
|
|
161
|
+
while (pos !== -1) {
|
|
163
162
|
backtickCount++;
|
|
164
|
-
pos
|
|
163
|
+
pos = beforeSplit.indexOf("```", pos + 3);
|
|
165
164
|
}
|
|
166
165
|
if (backtickCount % 2 === 1) continue;
|
|
167
166
|
}
|
|
@@ -208,6 +207,5 @@ function htmlToMarkdownSplitChunks(html, options = {}) {
|
|
|
208
207
|
}
|
|
209
208
|
return chunks;
|
|
210
209
|
}
|
|
211
|
-
|
|
212
210
|
//#endregion
|
|
213
|
-
export { htmlToMarkdownSplitChunks, htmlToMarkdownSplitChunksStream };
|
|
211
|
+
export { htmlToMarkdownSplitChunks, htmlToMarkdownSplitChunksStream };
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "mdream",
|
|
3
3
|
"type": "module",
|
|
4
|
-
"version": "0.
|
|
4
|
+
"version": "0.17.0",
|
|
5
5
|
"description": "Ultra-performant HTML to Markdown Convertor Optimized for LLMs and llm.txt artifacts.",
|
|
6
6
|
"author": {
|
|
7
7
|
"name": "Harlan Wilton",
|
|
@@ -39,6 +39,14 @@
|
|
|
39
39
|
},
|
|
40
40
|
"default": "./dist/cli.mjs"
|
|
41
41
|
},
|
|
42
|
+
"./negotiate": {
|
|
43
|
+
"types": "./dist/negotiate.d.mts",
|
|
44
|
+
"import": {
|
|
45
|
+
"types": "./dist/negotiate.d.mts",
|
|
46
|
+
"default": "./dist/negotiate.mjs"
|
|
47
|
+
},
|
|
48
|
+
"default": "./dist/negotiate.mjs"
|
|
49
|
+
},
|
|
42
50
|
"./plugins": {
|
|
43
51
|
"types": "./dist/plugins.d.mts",
|
|
44
52
|
"import": {
|
|
@@ -77,7 +85,7 @@
|
|
|
77
85
|
],
|
|
78
86
|
"browser": "./dist/iife.js",
|
|
79
87
|
"dependencies": {
|
|
80
|
-
"cac": "^
|
|
88
|
+
"cac": "^7.0.0",
|
|
81
89
|
"pathe": "^2.0.3",
|
|
82
90
|
"tinyglobby": "^0.2.15"
|
|
83
91
|
},
|
|
@@ -1,464 +0,0 @@
|
|
|
1
|
-
import { t as htmlToMarkdown } from "./src.mjs";
|
|
2
|
-
import { t as extractionPlugin } from "./extraction.mjs";
|
|
3
|
-
import { mkdir, open, readFile } from "node:fs/promises";
|
|
4
|
-
import { basename, dirname, join, relative, sep } from "pathe";
|
|
5
|
-
import { glob } from "tinyglobby";
|
|
6
|
-
|
|
7
|
-
//#region src/llms-txt.ts
|
|
8
|
-
/**
|
|
9
|
-
* Extract metadata from HTML content using mdream's extraction plugin
|
|
10
|
-
*/
|
|
11
|
-
function extractMetadata(html, url) {
|
|
12
|
-
let title = "";
|
|
13
|
-
let description = "";
|
|
14
|
-
let keywords = "";
|
|
15
|
-
let author = "";
|
|
16
|
-
htmlToMarkdown(html, {
|
|
17
|
-
plugins: [extractionPlugin({
|
|
18
|
-
"title": (element) => {
|
|
19
|
-
if (!title && element.textContent) title = element.textContent.trim();
|
|
20
|
-
},
|
|
21
|
-
"meta[name=\"description\"]": (element) => {
|
|
22
|
-
if (!description && element.attributes?.content) description = element.attributes.content.trim();
|
|
23
|
-
},
|
|
24
|
-
"meta[property=\"og:description\"]": (element) => {
|
|
25
|
-
if (!description && element.attributes?.content) description = element.attributes.content.trim();
|
|
26
|
-
},
|
|
27
|
-
"meta[name=\"keywords\"]": (element) => {
|
|
28
|
-
if (!keywords && element.attributes?.content) keywords = element.attributes.content.trim();
|
|
29
|
-
},
|
|
30
|
-
"meta[name=\"author\"]": (element) => {
|
|
31
|
-
if (!author && element.attributes?.content) author = element.attributes.content.trim();
|
|
32
|
-
},
|
|
33
|
-
"meta[property=\"og:title\"]": (element) => {
|
|
34
|
-
if (!title && element.attributes?.content) title = element.attributes.content.trim();
|
|
35
|
-
}
|
|
36
|
-
})],
|
|
37
|
-
origin: url
|
|
38
|
-
});
|
|
39
|
-
return {
|
|
40
|
-
title: title || void 0,
|
|
41
|
-
description: description || void 0,
|
|
42
|
-
keywords: keywords || void 0,
|
|
43
|
-
author: author || void 0
|
|
44
|
-
};
|
|
45
|
-
}
|
|
46
|
-
/**
|
|
47
|
-
* Convert file path to URL path
|
|
48
|
-
*/
|
|
49
|
-
function pathToUrl(filePath, baseDir) {
|
|
50
|
-
let url = relative(baseDir, filePath);
|
|
51
|
-
url = url.split(sep).join("/");
|
|
52
|
-
if (url.endsWith(".html")) url = url.slice(0, -5);
|
|
53
|
-
if (url.endsWith("/index")) url = url.slice(0, -6);
|
|
54
|
-
if (url === "index") return "/";
|
|
55
|
-
if (!url.startsWith("/")) url = `/${url}`;
|
|
56
|
-
return url;
|
|
57
|
-
}
|
|
58
|
-
/**
|
|
59
|
-
* Process HTML files from glob patterns
|
|
60
|
-
*/
|
|
61
|
-
async function processHtmlFiles(patterns, origin) {
|
|
62
|
-
const allPatterns = Array.isArray(patterns) ? patterns : [patterns];
|
|
63
|
-
const allFiles = [];
|
|
64
|
-
for (const pattern of allPatterns) {
|
|
65
|
-
const files = await glob(pattern);
|
|
66
|
-
allFiles.push(...files);
|
|
67
|
-
}
|
|
68
|
-
const uniqueFiles = [...new Set(allFiles)];
|
|
69
|
-
const results = [];
|
|
70
|
-
const baseDir = uniqueFiles.length > 0 ? dirname(uniqueFiles[0]) : ".";
|
|
71
|
-
for (const filePath of uniqueFiles) try {
|
|
72
|
-
const html = await readFile(filePath, "utf-8");
|
|
73
|
-
const metadata = extractMetadata(html, origin || filePath);
|
|
74
|
-
const content = htmlToMarkdown(html, { origin });
|
|
75
|
-
const url = pathToUrl(filePath, baseDir);
|
|
76
|
-
results.push({
|
|
77
|
-
filePath,
|
|
78
|
-
title: metadata?.title || basename(filePath, ".html"),
|
|
79
|
-
content,
|
|
80
|
-
url,
|
|
81
|
-
metadata
|
|
82
|
-
});
|
|
83
|
-
} catch (error) {
|
|
84
|
-
console.error(`Error processing ${filePath}:`, error);
|
|
85
|
-
}
|
|
86
|
-
return results;
|
|
87
|
-
}
|
|
88
|
-
/**
|
|
89
|
-
* Generate llms.txt content
|
|
90
|
-
*/
|
|
91
|
-
function generateLlmsTxtContent(files, options) {
|
|
92
|
-
const { siteName = "Site", description, origin = "", sections, notes } = options;
|
|
93
|
-
let content = `# ${siteName}\n\n`;
|
|
94
|
-
if (description) content += `> ${description}\n\n`;
|
|
95
|
-
if (origin) content += `Canonical Origin: ${origin}\n\n`;
|
|
96
|
-
if (sections) for (const section of sections) content += formatSection(section);
|
|
97
|
-
if (files.length > 0) {
|
|
98
|
-
content += `## Pages\n\n`;
|
|
99
|
-
for (const file of files) {
|
|
100
|
-
const desc = file.metadata?.description;
|
|
101
|
-
const descText = desc ? `: ${desc.substring(0, 100)}${desc.length > 100 ? "..." : ""}` : "";
|
|
102
|
-
if (file.filePath && options.outputDir && file.filePath.endsWith(".md")) {
|
|
103
|
-
const relativePath = relative(options.outputDir, file.filePath);
|
|
104
|
-
content += `- [${file.title}](${relativePath})${descText}\n`;
|
|
105
|
-
} else {
|
|
106
|
-
const url = file.url.startsWith("http://") || file.url.startsWith("https://") ? file.url : origin ? origin + file.url : file.url;
|
|
107
|
-
content += `- [${file.title}](${url})${descText}\n`;
|
|
108
|
-
}
|
|
109
|
-
}
|
|
110
|
-
}
|
|
111
|
-
if (notes) content += `\n${formatNotes(notes)}`;
|
|
112
|
-
return content;
|
|
113
|
-
}
|
|
114
|
-
/**
|
|
115
|
-
* Parse frontmatter from markdown content
|
|
116
|
-
*/
|
|
117
|
-
function parseFrontmatter(content) {
|
|
118
|
-
const match = content.match(/^---\n([\s\S]*?)\n---\n([\s\S]*)$/);
|
|
119
|
-
if (!match) return {
|
|
120
|
-
frontmatter: null,
|
|
121
|
-
body: content
|
|
122
|
-
};
|
|
123
|
-
const frontmatterContent = match[1];
|
|
124
|
-
const body = match[2];
|
|
125
|
-
const frontmatter = {};
|
|
126
|
-
const lines = frontmatterContent.split("\n");
|
|
127
|
-
for (const line of lines) {
|
|
128
|
-
const colonIndex = line.indexOf(":");
|
|
129
|
-
if (colonIndex > 0) {
|
|
130
|
-
const key = line.substring(0, colonIndex).trim();
|
|
131
|
-
frontmatter[key] = line.substring(colonIndex + 1).trim();
|
|
132
|
-
}
|
|
133
|
-
}
|
|
134
|
-
return {
|
|
135
|
-
frontmatter,
|
|
136
|
-
body
|
|
137
|
-
};
|
|
138
|
-
}
|
|
139
|
-
/**
|
|
140
|
-
* Serialize frontmatter object to YAML-like format
|
|
141
|
-
*/
|
|
142
|
-
function serializeFrontmatter(data) {
|
|
143
|
-
const lines = [];
|
|
144
|
-
for (const [key, value] of Object.entries(data)) if (value !== void 0 && value !== null) lines.push(`${key}: ${String(value)}`);
|
|
145
|
-
return lines.join("\n");
|
|
146
|
-
}
|
|
147
|
-
/**
|
|
148
|
-
* Generate llms-full.txt content with complete page content
|
|
149
|
-
*/
|
|
150
|
-
function generateLlmsFullTxtContent(files, options) {
|
|
151
|
-
const { siteName = "Site", description, origin = "", sections, notes } = options;
|
|
152
|
-
let content = `# ${siteName}\n\n`;
|
|
153
|
-
if (description) content += `> ${description}\n\n`;
|
|
154
|
-
if (origin) content += `Canonical Origin: ${origin}\n\n`;
|
|
155
|
-
if (sections) for (const section of sections) content += formatSection(section);
|
|
156
|
-
if (files.length > 0) {
|
|
157
|
-
content += `## Table of Contents\n\n`;
|
|
158
|
-
for (const file of files) {
|
|
159
|
-
const anchor = file.title.toLowerCase().replace(/[^a-z0-9]/g, "-");
|
|
160
|
-
content += `- [${file.title}](#${anchor})\n`;
|
|
161
|
-
}
|
|
162
|
-
content += `\n---\n\n`;
|
|
163
|
-
for (const file of files) {
|
|
164
|
-
const url = file.url.startsWith("http://") || file.url.startsWith("https://") ? file.url : origin ? origin + file.url : file.url;
|
|
165
|
-
const { frontmatter, body } = parseFrontmatter(file.content);
|
|
166
|
-
const metadata = {
|
|
167
|
-
title: file.title,
|
|
168
|
-
url
|
|
169
|
-
};
|
|
170
|
-
if (file.filePath && options.outputDir) metadata.file = relative(options.outputDir, file.filePath);
|
|
171
|
-
else if (file.filePath) metadata.file = file.filePath;
|
|
172
|
-
if (file.metadata) {
|
|
173
|
-
if (file.metadata.description) metadata.description = file.metadata.description;
|
|
174
|
-
if (file.metadata.keywords) metadata.keywords = file.metadata.keywords;
|
|
175
|
-
if (file.metadata.author) metadata.author = file.metadata.author;
|
|
176
|
-
}
|
|
177
|
-
const frontmatterString = serializeFrontmatter(frontmatter ? {
|
|
178
|
-
...frontmatter,
|
|
179
|
-
...metadata
|
|
180
|
-
} : metadata);
|
|
181
|
-
let contentBody = frontmatter ? body : file.content;
|
|
182
|
-
const titleLine = contentBody.trim().split("\n")[0];
|
|
183
|
-
if (titleLine === file.title || titleLine === `# ${file.title}`) contentBody = contentBody.trim().split("\n").slice(1).join("\n").trimStart();
|
|
184
|
-
content += `---\n${frontmatterString}\n---\n\n${contentBody}\n\n---\n\n`;
|
|
185
|
-
}
|
|
186
|
-
}
|
|
187
|
-
if (notes) content += `\n${formatNotes(notes)}`;
|
|
188
|
-
return content;
|
|
189
|
-
}
|
|
190
|
-
/**
|
|
191
|
-
* Generate individual markdown files structure
|
|
192
|
-
*/
|
|
193
|
-
function generateMarkdownFilesContent(files) {
|
|
194
|
-
const markdownFiles = [];
|
|
195
|
-
for (const file of files) {
|
|
196
|
-
const mdPath = `md/${file.url === "/" ? "index" : file.url.replace(/^\//, "").replace(/\/$/, "")}.md`;
|
|
197
|
-
markdownFiles.push({
|
|
198
|
-
path: mdPath,
|
|
199
|
-
content: file.content
|
|
200
|
-
});
|
|
201
|
-
}
|
|
202
|
-
return markdownFiles;
|
|
203
|
-
}
|
|
204
|
-
/**
|
|
205
|
-
* Main function to process files and generate llms.txt artifacts
|
|
206
|
-
*/
|
|
207
|
-
async function generateLlmsTxtArtifacts(options) {
|
|
208
|
-
let files;
|
|
209
|
-
if (options.files) files = options.files;
|
|
210
|
-
else if (options.patterns) files = await processHtmlFiles(options.patterns, options.origin);
|
|
211
|
-
else throw new Error("Either patterns or files must be provided");
|
|
212
|
-
const llmsTxt = generateLlmsTxtContent(files, options);
|
|
213
|
-
let llmsFullTxt;
|
|
214
|
-
if (options.generateFull) llmsFullTxt = generateLlmsFullTxtContent(files, options);
|
|
215
|
-
let markdownFiles;
|
|
216
|
-
if (options.generateMarkdown) markdownFiles = generateMarkdownFilesContent(files);
|
|
217
|
-
return {
|
|
218
|
-
llmsTxt,
|
|
219
|
-
llmsFullTxt,
|
|
220
|
-
markdownFiles,
|
|
221
|
-
processedFiles: files
|
|
222
|
-
};
|
|
223
|
-
}
|
|
224
|
-
/**
|
|
225
|
-
* Format a section with title, description, and links
|
|
226
|
-
*/
|
|
227
|
-
function formatSection(section) {
|
|
228
|
-
let content = `## ${section.title}\n\n`;
|
|
229
|
-
if (section.description) {
|
|
230
|
-
const descriptions = Array.isArray(section.description) ? section.description : [section.description];
|
|
231
|
-
for (const desc of descriptions) content += `${desc}\n\n`;
|
|
232
|
-
}
|
|
233
|
-
if (section.links?.length) {
|
|
234
|
-
for (const link of section.links) {
|
|
235
|
-
const desc = link.description ? `: ${link.description}` : "";
|
|
236
|
-
content += `- [${link.title}](${link.href})${desc}\n`;
|
|
237
|
-
}
|
|
238
|
-
content += "\n";
|
|
239
|
-
}
|
|
240
|
-
return content;
|
|
241
|
-
}
|
|
242
|
-
/**
|
|
243
|
-
* Format notes section
|
|
244
|
-
*/
|
|
245
|
-
function formatNotes(notes) {
|
|
246
|
-
const noteLines = Array.isArray(notes) ? notes : [notes];
|
|
247
|
-
let content = "";
|
|
248
|
-
for (const note of noteLines) content += `${note}\n\n`;
|
|
249
|
-
return content;
|
|
250
|
-
}
|
|
251
|
-
/**
|
|
252
|
-
* Create a WritableStream that generates llms.txt artifacts by streaming pages to disk
|
|
253
|
-
*
|
|
254
|
-
* Writes llms.txt (and optionally llms-full.txt) incrementally as pages are written,
|
|
255
|
-
* never keeping full content in memory. Creates outputDir recursively if needed.
|
|
256
|
-
*
|
|
257
|
-
* @example
|
|
258
|
-
* ```typescript
|
|
259
|
-
* const stream = createLlmsTxtStream({
|
|
260
|
-
* siteName: 'My Docs',
|
|
261
|
-
* description: 'Documentation site',
|
|
262
|
-
* origin: 'https://example.com',
|
|
263
|
-
* generateFull: true,
|
|
264
|
-
* outputDir: './dist',
|
|
265
|
-
* sections: [
|
|
266
|
-
* {
|
|
267
|
-
* title: 'Getting Started',
|
|
268
|
-
* description: 'Quick start guide',
|
|
269
|
-
* links: [
|
|
270
|
-
* { title: 'Installation', href: '/install', description: 'How to install' },
|
|
271
|
-
* { title: 'Quick Start', href: '/quickstart' },
|
|
272
|
-
* ],
|
|
273
|
-
* },
|
|
274
|
-
* ],
|
|
275
|
-
* notes: ['Generated by mdream', 'Last updated: 2024'],
|
|
276
|
-
* })
|
|
277
|
-
*
|
|
278
|
-
* const writer = stream.getWriter()
|
|
279
|
-
* await writer.write({
|
|
280
|
-
* title: 'Home',
|
|
281
|
-
* content: '# Welcome\n\nHome page content.',
|
|
282
|
-
* url: '/',
|
|
283
|
-
* })
|
|
284
|
-
* await writer.close()
|
|
285
|
-
* ```
|
|
286
|
-
*
|
|
287
|
-
* @param options - Configuration options
|
|
288
|
-
* @returns WritableStream that accepts ProcessedFile objects
|
|
289
|
-
*/
|
|
290
|
-
/**
|
|
291
|
-
* Get group prefix for a URL (up to 2 segments)
|
|
292
|
-
*/
|
|
293
|
-
function getGroupPrefix(url, depth) {
|
|
294
|
-
const segments = url.split("/").filter(Boolean);
|
|
295
|
-
if (segments.length === 0) return "/";
|
|
296
|
-
if (depth === 1 || segments.length === 1) return `/${segments[0]}`;
|
|
297
|
-
return `/${segments[0]}/${segments[1]}`;
|
|
298
|
-
}
|
|
299
|
-
/**
|
|
300
|
-
* Sort pages by URL path in hierarchical order (directory tree structure)
|
|
301
|
-
* Groups by up to 2 segments, with root-level pages without nesting grouped together
|
|
302
|
-
*/
|
|
303
|
-
function sortPagesByPath(pages) {
|
|
304
|
-
const twoSegmentCount = /* @__PURE__ */ new Map();
|
|
305
|
-
for (const page of pages) {
|
|
306
|
-
const prefix = getGroupPrefix(page.url, 2);
|
|
307
|
-
twoSegmentCount.set(prefix, (twoSegmentCount.get(prefix) || 0) + 1);
|
|
308
|
-
}
|
|
309
|
-
const segmentHasNested = /* @__PURE__ */ new Map();
|
|
310
|
-
for (const page of pages) {
|
|
311
|
-
const segments = page.url.split("/").filter(Boolean);
|
|
312
|
-
const firstSegment = segments.length > 0 ? segments[0] : "";
|
|
313
|
-
if (!segmentHasNested.has(firstSegment)) segmentHasNested.set(firstSegment, false);
|
|
314
|
-
if (segments.length > 1) segmentHasNested.set(firstSegment, true);
|
|
315
|
-
}
|
|
316
|
-
return pages.sort((a, b) => {
|
|
317
|
-
const segmentsA = a.url.split("/").filter(Boolean);
|
|
318
|
-
const segmentsB = b.url.split("/").filter(Boolean);
|
|
319
|
-
const firstSegmentA = segmentsA.length > 0 ? segmentsA[0] : "";
|
|
320
|
-
const firstSegmentB = segmentsB.length > 0 ? segmentsB[0] : "";
|
|
321
|
-
const twoSegPrefixA = getGroupPrefix(a.url, 2);
|
|
322
|
-
const twoSegPrefixB = getGroupPrefix(b.url, 2);
|
|
323
|
-
const twoSegCountA = twoSegmentCount.get(twoSegPrefixA) || 0;
|
|
324
|
-
const twoSegCountB = twoSegmentCount.get(twoSegPrefixB) || 0;
|
|
325
|
-
let groupKeyA = twoSegCountA > 1 ? twoSegPrefixA : `/${firstSegmentA}`;
|
|
326
|
-
let groupKeyB = twoSegCountB > 1 ? twoSegPrefixB : `/${firstSegmentB}`;
|
|
327
|
-
const isRootLevelA = segmentsA.length <= 1;
|
|
328
|
-
const isRootLevelB = segmentsB.length <= 1;
|
|
329
|
-
const hasNestedA = segmentHasNested.get(firstSegmentA);
|
|
330
|
-
const hasNestedB = segmentHasNested.get(firstSegmentB);
|
|
331
|
-
if (isRootLevelA && !hasNestedA) groupKeyA = "";
|
|
332
|
-
if (isRootLevelB && !hasNestedB) groupKeyB = "";
|
|
333
|
-
if (groupKeyA === "" && groupKeyB !== "") return -1;
|
|
334
|
-
if (groupKeyA !== "" && groupKeyB === "") return 1;
|
|
335
|
-
if (groupKeyA !== groupKeyB) return groupKeyA.localeCompare(groupKeyB);
|
|
336
|
-
if (segmentsA.length === 0) return -1;
|
|
337
|
-
if (segmentsB.length === 0) return 1;
|
|
338
|
-
const minLen = Math.min(segmentsA.length, segmentsB.length);
|
|
339
|
-
for (let i = 0; i < minLen; i++) {
|
|
340
|
-
const cmp = segmentsA[i].localeCompare(segmentsB[i]);
|
|
341
|
-
if (cmp !== 0) return cmp;
|
|
342
|
-
}
|
|
343
|
-
return segmentsA.length - segmentsB.length;
|
|
344
|
-
});
|
|
345
|
-
}
|
|
346
|
-
function createLlmsTxtStream(options = {}) {
|
|
347
|
-
const { siteName = "Site", description, origin = "", generateFull, outputDir = process.cwd(), sections, notes } = options;
|
|
348
|
-
let llmsTxtHandle;
|
|
349
|
-
let llmsFullTxtHandle;
|
|
350
|
-
const bufferedPages = [];
|
|
351
|
-
return new WritableStream({
|
|
352
|
-
async start() {
|
|
353
|
-
await mkdir(outputDir, { recursive: true });
|
|
354
|
-
llmsTxtHandle = await open(join(outputDir, "llms.txt"), "w");
|
|
355
|
-
let header = `# ${siteName}\n\n`;
|
|
356
|
-
if (description) header += `> ${description}\n\n`;
|
|
357
|
-
if (origin) header += `Canonical Origin: ${origin}\n\n`;
|
|
358
|
-
if (sections) for (const section of sections) header += formatSection(section);
|
|
359
|
-
await llmsTxtHandle.write(header);
|
|
360
|
-
if (generateFull) {
|
|
361
|
-
llmsFullTxtHandle = await open(join(outputDir, "llms-full.txt"), "w");
|
|
362
|
-
let fullHeader = `# ${siteName}\n\n`;
|
|
363
|
-
if (description) fullHeader += `> ${description}\n\n`;
|
|
364
|
-
if (origin) fullHeader += `Canonical Origin: ${origin}\n\n`;
|
|
365
|
-
if (sections) for (const section of sections) fullHeader += formatSection(section);
|
|
366
|
-
await llmsFullTxtHandle.write(fullHeader);
|
|
367
|
-
}
|
|
368
|
-
},
|
|
369
|
-
async write(file) {
|
|
370
|
-
const desc = file.metadata?.description;
|
|
371
|
-
bufferedPages.push({
|
|
372
|
-
url: file.url,
|
|
373
|
-
title: file.title,
|
|
374
|
-
description: desc,
|
|
375
|
-
filePath: file.filePath
|
|
376
|
-
});
|
|
377
|
-
if (generateFull && llmsFullTxtHandle) {
|
|
378
|
-
const url = file.url.startsWith("http://") || file.url.startsWith("https://") ? file.url : origin ? origin + file.url : file.url;
|
|
379
|
-
const { frontmatter, body } = parseFrontmatter(file.content);
|
|
380
|
-
const metadata = {
|
|
381
|
-
title: file.title,
|
|
382
|
-
url
|
|
383
|
-
};
|
|
384
|
-
if (file.filePath) metadata.file = relative(outputDir, file.filePath);
|
|
385
|
-
if (file.metadata) {
|
|
386
|
-
if (file.metadata.description) metadata.description = file.metadata.description;
|
|
387
|
-
if (file.metadata.keywords) metadata.keywords = file.metadata.keywords;
|
|
388
|
-
if (file.metadata.author) metadata.author = file.metadata.author;
|
|
389
|
-
}
|
|
390
|
-
const frontmatterString = serializeFrontmatter(frontmatter ? {
|
|
391
|
-
...frontmatter,
|
|
392
|
-
...metadata
|
|
393
|
-
} : metadata);
|
|
394
|
-
let contentBody = frontmatter ? body : file.content;
|
|
395
|
-
const titleLine = contentBody.trim().split("\n")[0];
|
|
396
|
-
if (titleLine === file.title || titleLine === `# ${file.title}`) contentBody = contentBody.trim().split("\n").slice(1).join("\n").trimStart();
|
|
397
|
-
const fullChunk = `---\n${frontmatterString}\n---\n\n${contentBody}\n\n---\n\n`;
|
|
398
|
-
await llmsFullTxtHandle.write(fullChunk);
|
|
399
|
-
}
|
|
400
|
-
},
|
|
401
|
-
async close() {
|
|
402
|
-
const sortedPages = sortPagesByPath(bufferedPages);
|
|
403
|
-
const twoSegmentCount = /* @__PURE__ */ new Map();
|
|
404
|
-
for (const page of sortedPages) {
|
|
405
|
-
const prefix = getGroupPrefix(page.url, 2);
|
|
406
|
-
twoSegmentCount.set(prefix, (twoSegmentCount.get(prefix) || 0) + 1);
|
|
407
|
-
}
|
|
408
|
-
const segmentHasNested = /* @__PURE__ */ new Map();
|
|
409
|
-
for (const page of sortedPages) {
|
|
410
|
-
const segments = page.url.split("/").filter(Boolean);
|
|
411
|
-
const firstSegment = segments.length > 0 ? segments[0] : "";
|
|
412
|
-
if (!segmentHasNested.has(firstSegment)) segmentHasNested.set(firstSegment, false);
|
|
413
|
-
if (segments.length > 1) segmentHasNested.set(firstSegment, true);
|
|
414
|
-
}
|
|
415
|
-
await llmsTxtHandle?.write(`## Pages\n\n`);
|
|
416
|
-
let currentGroup = "";
|
|
417
|
-
let segmentGroupIndex = 0;
|
|
418
|
-
let urlsInCurrentGroup = 0;
|
|
419
|
-
for (let i = 0; i < sortedPages.length; i++) {
|
|
420
|
-
const page = sortedPages[i];
|
|
421
|
-
const segments = page.url.split("/").filter(Boolean);
|
|
422
|
-
const firstSegment = segments.length > 0 ? segments[0] : "";
|
|
423
|
-
const twoSegPrefix = getGroupPrefix(page.url, 2);
|
|
424
|
-
let groupKey = (twoSegmentCount.get(twoSegPrefix) || 0) > 1 ? twoSegPrefix : `/${firstSegment}`;
|
|
425
|
-
const isRootLevel = segments.length <= 1;
|
|
426
|
-
const hasNested = segmentHasNested.get(firstSegment);
|
|
427
|
-
if (isRootLevel && !hasNested) groupKey = "";
|
|
428
|
-
if (groupKey !== currentGroup) {
|
|
429
|
-
if (urlsInCurrentGroup > 0) {
|
|
430
|
-
if (segmentGroupIndex === 0 || segmentGroupIndex >= 1 && segmentGroupIndex <= 2 && urlsInCurrentGroup > 1) await llmsTxtHandle?.write("\n");
|
|
431
|
-
}
|
|
432
|
-
currentGroup = groupKey;
|
|
433
|
-
segmentGroupIndex++;
|
|
434
|
-
urlsInCurrentGroup = 0;
|
|
435
|
-
}
|
|
436
|
-
urlsInCurrentGroup++;
|
|
437
|
-
const descText = page.description ? `: ${page.description.substring(0, 160)}${page.description.length > 160 ? "..." : ""}` : "";
|
|
438
|
-
let chunk = "";
|
|
439
|
-
if (page.filePath && page.filePath.endsWith(".md")) {
|
|
440
|
-
const relativePath = relative(outputDir, page.filePath);
|
|
441
|
-
chunk = `- [${page.title}](${relativePath})${descText}\n`;
|
|
442
|
-
} else {
|
|
443
|
-
const url = page.url.startsWith("http://") || page.url.startsWith("https://") ? page.url : origin ? origin + page.url : page.url;
|
|
444
|
-
chunk = `- [${page.title}](${url})${descText}\n`;
|
|
445
|
-
}
|
|
446
|
-
await llmsTxtHandle?.write(chunk);
|
|
447
|
-
}
|
|
448
|
-
if (notes) {
|
|
449
|
-
const notesContent = formatNotes(notes);
|
|
450
|
-
await llmsTxtHandle?.write(`\n${notesContent}`);
|
|
451
|
-
if (generateFull && llmsFullTxtHandle) await llmsFullTxtHandle.write(`\n${notesContent}`);
|
|
452
|
-
}
|
|
453
|
-
await llmsTxtHandle?.close();
|
|
454
|
-
await llmsFullTxtHandle?.close();
|
|
455
|
-
},
|
|
456
|
-
async abort(reason) {
|
|
457
|
-
await llmsTxtHandle?.close();
|
|
458
|
-
await llmsFullTxtHandle?.close();
|
|
459
|
-
}
|
|
460
|
-
});
|
|
461
|
-
}
|
|
462
|
-
|
|
463
|
-
//#endregion
|
|
464
|
-
export { generateLlmsTxtArtifacts as n, createLlmsTxtStream as t };
|