mdream 0.15.2 → 0.16.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +20 -1
- package/dist/_chunks/{const-Bf_XN9U9.mjs → const.mjs} +1 -42
- package/dist/_chunks/{extraction-BA9MDtq3.mjs → extraction.mjs} +2 -27
- package/dist/_chunks/{markdown-processor-D26Uo5td.mjs → markdown-processor.mjs} +22 -104
- package/dist/_chunks/{plugin-D5soyEXm.d.mts → plugin.d.mts} +1 -2
- package/dist/_chunks/plugin.mjs +4 -0
- package/dist/_chunks/{src-BJpipdul.mjs → stream.mjs} +2 -20
- package/dist/_chunks/{plugins-DJnqR2fA.mjs → tailwind.mjs} +15 -275
- package/dist/_chunks/{types-CT4ZxeOH.d.mts → types.d.mts} +6 -22
- package/dist/cli.mjs +5 -9
- package/dist/iife.js +8 -8
- package/dist/index.d.mts +5 -8
- package/dist/index.mjs +10 -6
- package/dist/llms-txt.mjs +383 -5
- package/dist/plugins.d.mts +5 -16
- package/dist/plugins.mjs +4 -5
- package/dist/preset/minimal.d.mts +1 -2
- package/dist/preset/minimal.mjs +30 -4
- package/dist/splitter.d.mts +1 -2
- package/dist/splitter.mjs +7 -24
- package/package.json +1 -1
- package/dist/_chunks/llms-txt-Czb_M48B.mjs +0 -440
- package/dist/_chunks/minimal-BiDhcwif.mjs +0 -40
- package/dist/_chunks/plugin-CjWWQTuL.mjs +0 -12
package/dist/llms-txt.mjs
CHANGED
|
@@ -1,5 +1,383 @@
|
|
|
1
|
-
import "./_chunks/markdown-processor
|
|
2
|
-
import "./_chunks/
|
|
3
|
-
import {
|
|
4
|
-
|
|
5
|
-
|
|
1
|
+
import "./_chunks/markdown-processor.mjs";
|
|
2
|
+
import "./_chunks/stream.mjs";
|
|
3
|
+
import { htmlToMarkdown } from "./index.mjs";
|
|
4
|
+
import { t as extractionPlugin } from "./_chunks/extraction.mjs";
|
|
5
|
+
import { mkdir, open, readFile } from "node:fs/promises";
|
|
6
|
+
import { basename, dirname, join, relative, sep } from "pathe";
|
|
7
|
+
import { glob } from "tinyglobby";
|
|
8
|
+
/**
 * Runs the HTML through the markdown pipeline solely to harvest page
 * metadata (title, description, keywords, author) via extraction callbacks.
 * For each field, only the first non-empty match wins.
 * Missing fields come back as `undefined` rather than empty strings.
 */
function extractMetadata(html, url) {
	const fields = { title: "", description: "", keywords: "", author: "" };
	// Builds a handler that fills `name` from a meta tag's content attribute,
	// keeping whichever value was seen first.
	const fromContent = (name) => (element) => {
		const value = element.attributes?.content;
		if (!fields[name] && value) fields[name] = value.trim();
	};
	htmlToMarkdown(html, {
		plugins: [extractionPlugin({
			"title": (element) => {
				if (!fields.title && element.textContent) fields.title = element.textContent.trim();
			},
			"meta[name=\"description\"]": fromContent("description"),
			"meta[property=\"og:description\"]": fromContent("description"),
			"meta[name=\"keywords\"]": fromContent("keywords"),
			"meta[name=\"author\"]": fromContent("author"),
			"meta[property=\"og:title\"]": fromContent("title")
		})],
		origin: url
	});
	return {
		title: fields.title || void 0,
		description: fields.description || void 0,
		keywords: fields.keywords || void 0,
		author: fields.author || void 0
	};
}
|
|
43
|
+
/**
 * Maps an on-disk HTML file path to a site-relative URL:
 * strips the base directory, normalizes separators to `/`, drops the
 * `.html` extension and trailing `/index`, and guarantees a leading `/`.
 */
function pathToUrl(filePath, baseDir) {
	let url = relative(baseDir, filePath).split(sep).join("/");
	if (url.endsWith(".html")) url = url.slice(0, -".html".length);
	if (url.endsWith("/index")) url = url.slice(0, -"/index".length);
	// A bare top-level index page maps to the site root.
	if (url === "index") return "/";
	return url.startsWith("/") ? url : `/${url}`;
}
|
|
52
|
+
/**
 * Globs for HTML files matching `patterns`, converts each to markdown and
 * extracts its metadata. Per-file failures are logged and skipped so one
 * bad file does not abort the batch (best-effort by design).
 * NOTE(review): baseDir is derived from the first matched file only —
 * files in sibling directories may produce `..`-style URLs; confirm intent.
 */
async function processHtmlFiles(patterns, origin) {
	const patternList = Array.isArray(patterns) ? patterns : [patterns];
	const collected = [];
	// Patterns are expanded sequentially so the resulting order is stable.
	for (const pattern of patternList) collected.push(...await glob(pattern));
	const uniqueFiles = [...new Set(collected)];
	const baseDir = uniqueFiles.length > 0 ? dirname(uniqueFiles[0]) : ".";
	const results = [];
	for (const filePath of uniqueFiles) {
		try {
			const html = await readFile(filePath, "utf-8");
			// Metadata extraction runs its own conversion pass; fall back to
			// the file name when no <title>/og:title was found.
			const metadata = extractMetadata(html, origin || filePath);
			results.push({
				filePath,
				title: metadata?.title || basename(filePath, ".html"),
				content: htmlToMarkdown(html, { origin }),
				url: pathToUrl(filePath, baseDir),
				metadata
			});
		} catch (error) {
			console.error(`Error processing ${filePath}:`, error);
		}
	}
	return results;
}
|
|
79
|
+
/**
 * Builds the llms.txt index document: site header, optional description,
 * canonical origin, custom sections, then one bullet link per page
 * (description truncated to 100 chars), and optional trailing notes.
 */
function generateLlmsTxtContent(files, options) {
	const { siteName = "Site", description, origin = "", sections, notes } = options;
	const parts = [`# ${siteName}\n\n`];
	if (description) parts.push(`> ${description}\n\n`);
	if (origin) parts.push(`Canonical Origin: ${origin}\n\n`);
	if (sections) for (const section of sections) parts.push(formatSection(section));
	if (files.length > 0) {
		parts.push(`## Pages\n\n`);
		for (const file of files) {
			const desc = file.metadata?.description;
			const descText = desc ? `: ${desc.substring(0, 100)}${desc.length > 100 ? "..." : ""}` : "";
			let target;
			if (file.filePath && options.outputDir && file.filePath.endsWith(".md")) {
				// Pre-rendered markdown files are linked by output-relative path.
				target = relative(options.outputDir, file.filePath);
			} else if (file.url.startsWith("http://") || file.url.startsWith("https://")) {
				// Absolute URLs pass through untouched.
				target = file.url;
			} else {
				// Site-relative URLs are prefixed with the origin when known.
				target = origin ? origin + file.url : file.url;
			}
			parts.push(`- [${file.title}](${target})${descText}\n`);
		}
	}
	if (notes) parts.push(`\n${formatNotes(notes)}`);
	return parts.join("");
}
|
|
102
|
+
/**
 * Splits a markdown document into YAML-style frontmatter and body.
 *
 * Fix: the original regex only matched LF (`\n`) line endings, so documents
 * with Windows CRLF endings silently parsed as having no frontmatter.
 * Now accepts `\r?\n` everywhere; LF input behaves exactly as before.
 *
 * @param {string} content - Full document text.
 * @returns {{frontmatter: Record<string,string>|null, body: string}}
 *   `frontmatter` is null when no leading `---` block is present, in which
 *   case `body` is the untouched input.
 */
function parseFrontmatter(content) {
	const match = content.match(/^---\r?\n([\s\S]*?)\r?\n---\r?\n([\s\S]*)$/);
	if (!match) return {
		frontmatter: null,
		body: content
	};
	const frontmatter = {};
	// Simple `key: value` parsing — no nested YAML, quoting, or lists;
	// lines without a colon (or starting with one) are ignored.
	for (const line of match[1].split(/\r?\n/)) {
		const colonIndex = line.indexOf(":");
		if (colonIndex > 0) {
			const key = line.substring(0, colonIndex).trim();
			frontmatter[key] = line.substring(colonIndex + 1).trim();
		}
	}
	return {
		frontmatter,
		body: match[2]
	};
}
|
|
124
|
+
/**
 * Serializes a flat object to `key: value` lines, one per entry,
 * skipping null/undefined values. Inverse-ish of parseFrontmatter.
 */
function serializeFrontmatter(data) {
	return Object.entries(data)
		.filter(([, value]) => value !== void 0 && value !== null)
		.map(([key, value]) => `${key}: ${String(value)}`)
		.join("\n");
}
|
|
129
|
+
/**
 * Builds the llms-full.txt document: site header, optional description and
 * canonical origin, custom sections, a table of contents, then every page's
 * full markdown body wrapped in frontmatter and separated by `---` rules.
 * The exact order of string concatenation defines the output format.
 */
function generateLlmsFullTxtContent(files, options) {
	const { siteName = "Site", description, origin = "", sections, notes } = options;
	let content = `# ${siteName}\n\n`;
	if (description) content += `> ${description}\n\n`;
	if (origin) content += `Canonical Origin: ${origin}\n\n`;
	if (sections) for (const section of sections) content += formatSection(section);
	if (files.length > 0) {
		content += `## Table of Contents\n\n`;
		for (const file of files) {
			// Each non-alphanumeric character becomes a dash (not collapsed) —
			// presumably intended to match markdown heading anchors; verify
			// against the renderer that consumes this output.
			const anchor = file.title.toLowerCase().replace(/[^a-z0-9]/g, "-");
			content += `- [${file.title}](#${anchor})\n`;
		}
		content += `\n---\n\n`;
		for (const file of files) {
			// Absolute URLs pass through; relative ones get the origin prefix.
			const url = file.url.startsWith("http://") || file.url.startsWith("https://") ? file.url : origin ? origin + file.url : file.url;
			const { frontmatter, body } = parseFrontmatter(file.content);
			const metadata = {
				title: file.title,
				url
			};
			// Record the source file: output-relative when outputDir is known,
			// otherwise the raw path.
			if (file.filePath && options.outputDir) metadata.file = relative(options.outputDir, file.filePath);
			else if (file.filePath) metadata.file = file.filePath;
			if (file.metadata) {
				if (file.metadata.description) metadata.description = file.metadata.description;
				if (file.metadata.keywords) metadata.keywords = file.metadata.keywords;
				if (file.metadata.author) metadata.author = file.metadata.author;
			}
			// Merge: generated metadata keys override any same-named keys
			// already present in the page's own frontmatter.
			const frontmatterString = serializeFrontmatter(frontmatter ? {
				...frontmatter,
				...metadata
			} : metadata);
			let contentBody = frontmatter ? body : file.content;
			// Drop a redundant first line that merely repeats the title
			// (bare or as an H1) — the title already lives in frontmatter.
			const titleLine = contentBody.trim().split("\n")[0];
			if (titleLine === file.title || titleLine === `# ${file.title}`) contentBody = contentBody.trim().split("\n").slice(1).join("\n").trimStart();
			content += `---\n${frontmatterString}\n---\n\n${contentBody}\n\n---\n\n`;
		}
	}
	if (notes) content += `\n${formatNotes(notes)}`;
	return content;
}
|
|
169
|
+
/**
 * Maps each processed page to an output descriptor under `md/`:
 * the root URL becomes `md/index.md`, every other URL has its leading
 * and trailing slashes stripped and `.md` appended.
 */
function generateMarkdownFilesContent(files) {
	return files.map((file) => {
		const slug = file.url === "/" ? "index" : file.url.replace(/^\//, "").replace(/\/$/, "");
		return {
			path: `md/${slug}.md`,
			content: file.content
		};
	});
}
|
|
180
|
+
/**
 * Orchestrates artifact generation: resolves the page list (either given
 * directly via `options.files` or discovered via `options.patterns`), then
 * produces llms.txt plus — when requested — llms-full.txt and per-page
 * markdown descriptors.
 * @throws {Error} when neither `files` nor `patterns` is supplied.
 */
async function generateLlmsTxtArtifacts(options) {
	let files;
	if (options.files) files = options.files;
	else if (options.patterns) files = await processHtmlFiles(options.patterns, options.origin);
	else throw new Error("Either patterns or files must be provided");
	return {
		llmsTxt: generateLlmsTxtContent(files, options),
		llmsFullTxt: options.generateFull ? generateLlmsFullTxtContent(files, options) : void 0,
		markdownFiles: options.generateMarkdown ? generateMarkdownFilesContent(files) : void 0,
		processedFiles: files
	};
}
|
|
197
|
+
/**
 * Renders one custom section as markdown: an H2 title, optional description
 * paragraph(s), and an optional bullet list of links (each with an optional
 * `: description` suffix) followed by a blank line.
 */
function formatSection(section) {
	const parts = [`## ${section.title}\n\n`];
	if (section.description) {
		const descriptions = Array.isArray(section.description) ? section.description : [section.description];
		parts.push(...descriptions.map((desc) => `${desc}\n\n`));
	}
	if (section.links?.length) {
		for (const link of section.links) {
			const suffix = link.description ? `: ${link.description}` : "";
			parts.push(`- [${link.title}](${link.href})${suffix}\n`);
		}
		parts.push("\n");
	}
	return parts.join("");
}
|
|
212
|
+
/**
 * Renders notes (a string or array of strings) as paragraphs,
 * each followed by a blank line.
 */
function formatNotes(notes) {
	const noteLines = Array.isArray(notes) ? notes : [notes];
	return noteLines.map((note) => `${note}\n\n`).join("");
}
|
|
218
|
+
/**
 * Returns the grouping prefix for a URL: "/" for the root, the first path
 * segment when `depth` is 1 (or only one segment exists), otherwise the
 * first two segments.
 */
function getGroupPrefix(url, depth) {
	const segments = url.split("/").filter(Boolean);
	switch (segments.length) {
		case 0:
			return "/";
		case 1:
			return `/${segments[0]}`;
		default:
			return depth === 1 ? `/${segments[0]}` : `/${segments[0]}/${segments[1]}`;
	}
}
|
|
224
|
+
/**
 * Sorts pages (in place, via Array#sort) so related URLs cluster together
 * in the generated index. Pages are grouped by a two-segment prefix when
 * more than one page shares it, otherwise by their first segment; root-level
 * pages whose first segment has no nested children sort before everything
 * else. Ties within a group break segment-by-segment lexicographically.
 * NOTE: mutates and returns the same `pages` array.
 */
function sortPagesByPath(pages) {
	// Pass 1: how many pages share each two-segment prefix (e.g. "/docs/guide").
	const twoSegmentCount = /* @__PURE__ */ new Map();
	for (const page of pages) {
		const prefix = getGroupPrefix(page.url, 2);
		twoSegmentCount.set(prefix, (twoSegmentCount.get(prefix) || 0) + 1);
	}
	// Pass 2: does any page nest below each first segment?
	const segmentHasNested = /* @__PURE__ */ new Map();
	for (const page of pages) {
		const segments = page.url.split("/").filter(Boolean);
		const firstSegment = segments.length > 0 ? segments[0] : "";
		if (!segmentHasNested.has(firstSegment)) segmentHasNested.set(firstSegment, false);
		if (segments.length > 1) segmentHasNested.set(firstSegment, true);
	}
	return pages.sort((a, b) => {
		const segmentsA = a.url.split("/").filter(Boolean);
		const segmentsB = b.url.split("/").filter(Boolean);
		const firstSegmentA = segmentsA.length > 0 ? segmentsA[0] : "";
		const firstSegmentB = segmentsB.length > 0 ? segmentsB[0] : "";
		const twoSegPrefixA = getGroupPrefix(a.url, 2);
		const twoSegPrefixB = getGroupPrefix(b.url, 2);
		const twoSegCountA = twoSegmentCount.get(twoSegPrefixA) || 0;
		const twoSegCountB = twoSegmentCount.get(twoSegPrefixB) || 0;
		// A two-segment prefix only forms a group when shared by >1 page;
		// otherwise fall back to the coarser one-segment group.
		let groupKeyA = twoSegCountA > 1 ? twoSegPrefixA : `/${firstSegmentA}`;
		let groupKeyB = twoSegCountB > 1 ? twoSegPrefixB : `/${firstSegmentB}`;
		const isRootLevelA = segmentsA.length <= 1;
		const isRootLevelB = segmentsB.length <= 1;
		const hasNestedA = segmentHasNested.get(firstSegmentA);
		const hasNestedB = segmentHasNested.get(firstSegmentB);
		// Root-level pages with no nested siblings form the empty group,
		// which always sorts first.
		if (isRootLevelA && !hasNestedA) groupKeyA = "";
		if (isRootLevelB && !hasNestedB) groupKeyB = "";
		if (groupKeyA === "" && groupKeyB !== "") return -1;
		if (groupKeyA !== "" && groupKeyB === "") return 1;
		if (groupKeyA !== groupKeyB) return groupKeyA.localeCompare(groupKeyB);
		// Same group: the bare root URL ("/") comes first.
		if (segmentsA.length === 0) return -1;
		if (segmentsB.length === 0) return 1;
		// Segment-wise lexicographic comparison; shorter URL wins a full tie.
		const minLen = Math.min(segmentsA.length, segmentsB.length);
		for (let i = 0; i < minLen; i++) {
			const cmp = segmentsA[i].localeCompare(segmentsB[i]);
			if (cmp !== 0) return cmp;
		}
		return segmentsA.length - segmentsB.length;
	});
}
|
|
267
|
+
/**
 * Creates a WritableStream that incrementally builds llms.txt (and, when
 * `generateFull` is set, llms-full.txt) on disk as page objects are written.
 * llms-full.txt entries stream out immediately per page; the llms.txt page
 * index is buffered in memory and emitted sorted/grouped on close().
 * Files are opened in start() and closed in close()/abort().
 */
function createLlmsTxtStream(options = {}) {
	const { siteName = "Site", description, origin = "", generateFull, outputDir = process.cwd(), sections, notes } = options;
	let llmsTxtHandle;
	let llmsFullTxtHandle;
	// Accumulates page link data until close(), when the index is sorted.
	const bufferedPages = [];
	return new WritableStream({
		async start() {
			await mkdir(outputDir, { recursive: true });
			llmsTxtHandle = await open(join(outputDir, "llms.txt"), "w");
			// Same header shape as generateLlmsTxtContent: title, description,
			// canonical origin, custom sections.
			let header = `# ${siteName}\n\n`;
			if (description) header += `> ${description}\n\n`;
			if (origin) header += `Canonical Origin: ${origin}\n\n`;
			if (sections) for (const section of sections) header += formatSection(section);
			await llmsTxtHandle.write(header);
			if (generateFull) {
				llmsFullTxtHandle = await open(join(outputDir, "llms-full.txt"), "w");
				let fullHeader = `# ${siteName}\n\n`;
				if (description) fullHeader += `> ${description}\n\n`;
				if (origin) fullHeader += `Canonical Origin: ${origin}\n\n`;
				if (sections) for (const section of sections) fullHeader += formatSection(section);
				await llmsFullTxtHandle.write(fullHeader);
			}
		},
		async write(file) {
			const desc = file.metadata?.description;
			// Buffer only what the close() index needs, not the full content.
			bufferedPages.push({
				url: file.url,
				title: file.title,
				description: desc,
				filePath: file.filePath
			});
			if (generateFull && llmsFullTxtHandle) {
				// Absolute URLs pass through; relative ones get the origin prefix.
				const url = file.url.startsWith("http://") || file.url.startsWith("https://") ? file.url : origin ? origin + file.url : file.url;
				const { frontmatter, body } = parseFrontmatter(file.content);
				const metadata = {
					title: file.title,
					url
				};
				if (file.filePath) metadata.file = relative(outputDir, file.filePath);
				if (file.metadata) {
					if (file.metadata.description) metadata.description = file.metadata.description;
					if (file.metadata.keywords) metadata.keywords = file.metadata.keywords;
					if (file.metadata.author) metadata.author = file.metadata.author;
				}
				// Generated metadata keys override same-named keys from the
				// page's own frontmatter.
				const frontmatterString = serializeFrontmatter(frontmatter ? {
					...frontmatter,
					...metadata
				} : metadata);
				let contentBody = frontmatter ? body : file.content;
				// Drop a redundant first line that merely repeats the title
				// (bare or as an H1) — the title already lives in frontmatter.
				const titleLine = contentBody.trim().split("\n")[0];
				if (titleLine === file.title || titleLine === `# ${file.title}`) contentBody = contentBody.trim().split("\n").slice(1).join("\n").trimStart();
				const fullChunk = `---\n${frontmatterString}\n---\n\n${contentBody}\n\n---\n\n`;
				await llmsFullTxtHandle.write(fullChunk);
			}
		},
		async close() {
			const sortedPages = sortPagesByPath(bufferedPages);
			// Recompute the same grouping stats sortPagesByPath uses, so the
			// emitted blank-line separators line up with the sort groups.
			const twoSegmentCount = /* @__PURE__ */ new Map();
			for (const page of sortedPages) {
				const prefix = getGroupPrefix(page.url, 2);
				twoSegmentCount.set(prefix, (twoSegmentCount.get(prefix) || 0) + 1);
			}
			const segmentHasNested = /* @__PURE__ */ new Map();
			for (const page of sortedPages) {
				const segments = page.url.split("/").filter(Boolean);
				const firstSegment = segments.length > 0 ? segments[0] : "";
				if (!segmentHasNested.has(firstSegment)) segmentHasNested.set(firstSegment, false);
				if (segments.length > 1) segmentHasNested.set(firstSegment, true);
			}
			await llmsTxtHandle?.write(`## Pages\n\n`);
			let currentGroup = "";
			let segmentGroupIndex = 0;
			let urlsInCurrentGroup = 0;
			for (let i = 0; i < sortedPages.length; i++) {
				const page = sortedPages[i];
				const segments = page.url.split("/").filter(Boolean);
				const firstSegment = segments.length > 0 ? segments[0] : "";
				const twoSegPrefix = getGroupPrefix(page.url, 2);
				// Two-segment group only when shared by >1 page, else first segment;
				// root-level pages with no nested siblings form the empty group.
				let groupKey = (twoSegmentCount.get(twoSegPrefix) || 0) > 1 ? twoSegPrefix : `/${firstSegment}`;
				const isRootLevel = segments.length <= 1;
				const hasNested = segmentHasNested.get(firstSegment);
				if (isRootLevel && !hasNested) groupKey = "";
				if (groupKey !== currentGroup) {
					// Separate groups with a blank line; the index/size condition
					// limits separators for early, single-entry groups.
					if (urlsInCurrentGroup > 0) {
						if (segmentGroupIndex === 0 || segmentGroupIndex >= 1 && segmentGroupIndex <= 2 && urlsInCurrentGroup > 1) await llmsTxtHandle?.write("\n");
					}
					currentGroup = groupKey;
					segmentGroupIndex++;
					urlsInCurrentGroup = 0;
				}
				urlsInCurrentGroup++;
				// Descriptions truncated at 160 chars for the streamed index
				// (the batch generator uses 100 — presumably intentional; verify).
				const descText = page.description ? `: ${page.description.substring(0, 160)}${page.description.length > 160 ? "..." : ""}` : "";
				let chunk = "";
				if (page.filePath && page.filePath.endsWith(".md")) {
					// Pre-rendered markdown files link by output-relative path.
					const relativePath = relative(outputDir, page.filePath);
					chunk = `- [${page.title}](${relativePath})${descText}\n`;
				} else {
					const url = page.url.startsWith("http://") || page.url.startsWith("https://") ? page.url : origin ? origin + page.url : page.url;
					chunk = `- [${page.title}](${url})${descText}\n`;
				}
				await llmsTxtHandle?.write(chunk);
			}
			if (notes) {
				const notesContent = formatNotes(notes);
				await llmsTxtHandle?.write(`\n${notesContent}`);
				if (generateFull && llmsFullTxtHandle) await llmsFullTxtHandle.write(`\n${notesContent}`);
			}
			await llmsTxtHandle?.close();
			await llmsFullTxtHandle?.close();
		},
		async abort(_reason) {
			// Best-effort cleanup: release both file handles on abort.
			await llmsTxtHandle?.close();
			await llmsFullTxtHandle?.close();
		}
	});
}
|
|
383
|
+
export { createLlmsTxtStream, generateLlmsTxtArtifacts };
|
package/dist/plugins.d.mts
CHANGED
|
@@ -1,8 +1,7 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import { t as createPlugin } from "./_chunks/plugin
|
|
1
|
+
import { b as extractionPlugin, u as Plugin } from "./_chunks/types.mjs";
|
|
2
|
+
import { t as createPlugin } from "./_chunks/plugin.mjs";
|
|
3
3
|
|
|
4
4
|
//#region src/plugins/filter.d.ts
|
|
5
|
-
|
|
6
5
|
/**
|
|
7
6
|
* Plugin that filters nodes based on CSS selectors.
|
|
8
7
|
* Allows including or excluding nodes based on selectors.
|
|
@@ -16,11 +15,8 @@ import { t as createPlugin } from "./_chunks/plugin-D5soyEXm.mjs";
|
|
|
16
15
|
* withQuerySelectorPlugin({ exclude: ['nav', '#sidebar', '.footer'] })
|
|
17
16
|
*/
|
|
18
17
|
declare function filterPlugin(options?: {
|
|
19
|
-
/** CSS selectors (or Tag Ids) for elements to include (all others will be excluded) */
|
|
20
|
-
|
|
21
|
-
/** CSS selectors (or Tag Ids) for elements to exclude */
|
|
22
|
-
exclude?: (string | number)[];
|
|
23
|
-
/** Whether to also process the children of matching elements */
|
|
18
|
+
/** CSS selectors (or Tag Ids) for elements to include (all others will be excluded) */include?: (string | number)[]; /** CSS selectors (or Tag Ids) for elements to exclude */
|
|
19
|
+
exclude?: (string | number)[]; /** Whether to also process the children of matching elements */
|
|
24
20
|
processChildren?: boolean;
|
|
25
21
|
keepAbsolute?: boolean;
|
|
26
22
|
}): Plugin;
|
|
@@ -72,17 +68,10 @@ declare function frontmatterPlugin(options?: FrontmatterPluginOptions): Plugin;
|
|
|
72
68
|
*/
|
|
73
69
|
declare function isolateMainPlugin(): Plugin;
|
|
74
70
|
//#endregion
|
|
75
|
-
//#region src/plugins/readability.d.ts
|
|
76
|
-
/**
|
|
77
|
-
* Creates a plugin that implements readability.js style heuristics for content quality assessment
|
|
78
|
-
* Controls content inclusion/exclusion using buffer regions
|
|
79
|
-
*/
|
|
80
|
-
declare function readabilityPlugin(): Plugin;
|
|
81
|
-
//#endregion
|
|
82
71
|
//#region src/plugins/tailwind.d.ts
|
|
83
72
|
/**
|
|
84
73
|
* Creates a plugin that adds Tailwind class processing
|
|
85
74
|
*/
|
|
86
75
|
declare function tailwindPlugin(): Plugin;
|
|
87
76
|
//#endregion
|
|
88
|
-
export { createPlugin, extractionPlugin, filterPlugin, frontmatterPlugin, isolateMainPlugin,
|
|
77
|
+
export { createPlugin, extractionPlugin, filterPlugin, frontmatterPlugin, isolateMainPlugin, tailwindPlugin };
|
package/dist/plugins.mjs
CHANGED
|
@@ -1,5 +1,4 @@
|
|
|
1
|
-
import { t as createPlugin } from "./_chunks/plugin
|
|
2
|
-
import { t as extractionPlugin } from "./_chunks/extraction
|
|
3
|
-
import {
|
|
4
|
-
|
|
5
|
-
export { createPlugin, extractionPlugin, filterPlugin, frontmatterPlugin, isolateMainPlugin, readabilityPlugin, tailwindPlugin };
|
|
1
|
+
import { t as createPlugin } from "./_chunks/plugin.mjs";
|
|
2
|
+
import { t as extractionPlugin } from "./_chunks/extraction.mjs";
|
|
3
|
+
import { i as filterPlugin, n as isolateMainPlugin, r as frontmatterPlugin, t as tailwindPlugin } from "./_chunks/tailwind.mjs";
|
|
4
|
+
export { createPlugin, extractionPlugin, filterPlugin, frontmatterPlugin, isolateMainPlugin, tailwindPlugin };
|
package/dist/preset/minimal.mjs
CHANGED
|
@@ -1,4 +1,30 @@
|
|
|
1
|
-
import "../_chunks/
|
|
2
|
-
import { t as
|
|
3
|
-
|
|
4
|
-
|
|
1
|
+
import { Ct as TAG_OBJECT, G as TAG_FIELDSET, J as TAG_FORM, K as TAG_FIGURE, Lt as TAG_SELECT, W as TAG_EMBED, Zt as TAG_TEXTAREA, at as TAG_IFRAME, bt as TAG_NAV, k as TAG_BUTTON, q as TAG_FOOTER, st as TAG_INPUT, x as TAG_ASIDE } from "../_chunks/const.mjs";
|
|
2
|
+
import { i as filterPlugin, n as isolateMainPlugin, r as frontmatterPlugin, t as tailwindPlugin } from "../_chunks/tailwind.mjs";
|
|
3
|
+
/**
 * Wraps conversion options with the "minimal" preset plugin chain:
 * frontmatter, main-content isolation, Tailwind class handling, and a
 * filter that strips interactive/structural noise tags. Any user-supplied
 * plugins run after the preset ones.
 */
function withMinimalPreset(options = {}) {
	// Tags excluded from output: forms/controls, embeds, and page chrome.
	const excludedTags = [
		TAG_FORM,
		TAG_FIELDSET,
		TAG_OBJECT,
		TAG_EMBED,
		TAG_FIGURE,
		TAG_FOOTER,
		TAG_ASIDE,
		TAG_IFRAME,
		TAG_INPUT,
		TAG_TEXTAREA,
		TAG_SELECT,
		TAG_BUTTON,
		TAG_NAV
	];
	return {
		...options,
		plugins: [
			frontmatterPlugin(),
			isolateMainPlugin(),
			tailwindPlugin(),
			filterPlugin({ exclude: excludedTags }),
			...options.plugins ?? []
		]
	};
}
|
|
30
|
+
export { withMinimalPreset };
|
package/dist/splitter.d.mts
CHANGED
|
@@ -1,7 +1,6 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { a as MarkdownChunk, m as SplitterOptions } from "./_chunks/types.mjs";
|
|
2
2
|
|
|
3
3
|
//#region src/splitter.d.ts
|
|
4
|
-
|
|
5
4
|
/**
|
|
6
5
|
* Convert HTML to Markdown and split into chunks in single pass
|
|
7
6
|
* Yields chunks during HTML event processing for better memory efficiency
|
package/dist/splitter.mjs
CHANGED
|
@@ -1,7 +1,5 @@
|
|
|
1
|
-
import { $ as
|
|
2
|
-
import { a as parseHtmlStream, n as createMarkdownProcessor, r as processPluginsForEvent } from "./_chunks/markdown-processor
|
|
3
|
-
|
|
4
|
-
//#region src/splitter.ts
|
|
1
|
+
import { $ as TAG_H5, N as TAG_CODE, Q as TAG_H4, X as TAG_H2, Y as TAG_H1, Z as TAG_H3, dn as TEXT_NODE, et as TAG_H6, h as NodeEventExit, kt as TAG_PRE, m as NodeEventEnter, r as ELEMENT_NODE, rt as TAG_HR } from "./_chunks/const.mjs";
|
|
2
|
+
import { a as parseHtmlStream, n as createMarkdownProcessor, r as processPluginsForEvent } from "./_chunks/markdown-processor.mjs";
|
|
5
3
|
const DEFAULT_HEADERS_TO_SPLIT_ON = [
|
|
6
4
|
TAG_H2,
|
|
7
5
|
TAG_H3,
|
|
@@ -31,18 +29,9 @@ function getCodeLanguage(node) {
|
|
|
31
29
|
function shouldSplitOnHeader(tagId, options) {
|
|
32
30
|
return options.headersToSplitOn.includes(tagId);
|
|
33
31
|
}
|
|
34
|
-
/**
|
|
35
|
-
* Get current markdown content WITHOUT clearing buffers
|
|
36
|
-
*/
|
|
37
32
|
function getCurrentMarkdown(state) {
|
|
38
|
-
|
|
39
|
-
for (const [regionId, content] of state.regionContentBuffers.entries()) if (state.regionToggles.get(regionId)) fragments.push(...content);
|
|
40
|
-
return fragments.join("").trimStart();
|
|
33
|
+
return state.buffer.join("").trimStart();
|
|
41
34
|
}
|
|
42
|
-
/**
|
|
43
|
-
* Convert HTML to Markdown and split into chunks in single pass
|
|
44
|
-
* Yields chunks during HTML event processing for better memory efficiency
|
|
45
|
-
*/
|
|
46
35
|
function* htmlToMarkdownSplitChunksStream(html, options = {}) {
|
|
47
36
|
const opts = createOptions(options);
|
|
48
37
|
if (opts.chunkOverlap >= opts.chunkSize) throw new Error("chunkOverlap must be less than chunkSize");
|
|
@@ -158,10 +147,10 @@ function* htmlToMarkdownSplitChunksStream(html, options = {}) {
|
|
|
158
147
|
if (idx >= 0) {
|
|
159
148
|
const beforeSplit = currentMd.slice(0, candidateSplitPos);
|
|
160
149
|
let backtickCount = 0;
|
|
161
|
-
let pos = 0;
|
|
162
|
-
while (
|
|
150
|
+
let pos = beforeSplit.indexOf("```", 0);
|
|
151
|
+
while (pos !== -1) {
|
|
163
152
|
backtickCount++;
|
|
164
|
-
pos
|
|
153
|
+
pos = beforeSplit.indexOf("```", pos + 3);
|
|
165
154
|
}
|
|
166
155
|
if (backtickCount % 2 === 1) continue;
|
|
167
156
|
}
|
|
@@ -177,10 +166,6 @@ function* htmlToMarkdownSplitChunksStream(html, options = {}) {
|
|
|
177
166
|
}
|
|
178
167
|
yield* flushChunk();
|
|
179
168
|
}
|
|
180
|
-
/**
|
|
181
|
-
* Convert HTML to Markdown and split into chunks in single pass
|
|
182
|
-
* Chunks are created during HTML event processing
|
|
183
|
-
*/
|
|
184
169
|
function htmlToMarkdownSplitChunks(html, options = {}) {
|
|
185
170
|
const opts = createOptions(options);
|
|
186
171
|
const chunks = [];
|
|
@@ -208,6 +193,4 @@ function htmlToMarkdownSplitChunks(html, options = {}) {
|
|
|
208
193
|
}
|
|
209
194
|
return chunks;
|
|
210
195
|
}
|
|
211
|
-
|
|
212
|
-
//#endregion
|
|
213
|
-
export { htmlToMarkdownSplitChunks, htmlToMarkdownSplitChunksStream };
|
|
196
|
+
export { htmlToMarkdownSplitChunks, htmlToMarkdownSplitChunksStream };
|