mdream 0.15.3 → 0.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.mts CHANGED
@@ -1,9 +1,8 @@
1
- import { _ as TagHandler, a as HandlerContext, b as ExtractedElement, c as MdreamRuntimeState, d as Plugin, f as PluginContext, g as TEXT_NODE, h as SplitterOptions, i as HTMLToMarkdownOptions, l as Node, m as ReadabilityContext, n as ELEMENT_NODE, o as MarkdownChunk, p as PluginCreationOptions, r as ElementNode, s as MdreamProcessingState, t as BufferRegion, u as NodeEvent, v as TailwindContext, y as TextNode } from "./_chunks/types.mjs";
1
+ import { _ as TailwindContext, a as MarkdownChunk, c as Node, d as PluginContext, f as PluginCreationOptions, g as TagHandler, h as TEXT_NODE, i as HandlerContext, l as NodeEvent, m as SplitterOptions, n as ElementNode, o as MdreamProcessingState, p as ReadabilityContext, r as HTMLToMarkdownOptions, s as MdreamRuntimeState, t as ELEMENT_NODE, u as Plugin, v as TextNode, y as ExtractedElement } from "./_chunks/types.mjs";
2
2
  import { t as createPlugin } from "./_chunks/plugin.mjs";
3
3
  import { ReadableStream } from "node:stream/web";
4
4
 
5
5
  //#region src/const.d.ts
6
-
7
6
  declare const TagIdMap: {
8
7
  readonly html: 0;
9
8
  readonly head: 1;
@@ -119,10 +118,8 @@ declare const TagIdMap: {
119
118
  interface MarkdownState {
120
119
  /** Configuration options for conversion */
121
120
  options?: HTMLToMarkdownOptions;
122
- /** Map of region IDs to buffer regions for O(1) lookups */
123
- regionToggles: Map<number, boolean>;
124
- /** Content buffers for regions */
125
- regionContentBuffers: Map<number, string[]>;
121
+ /** Content buffer for markdown output */
122
+ buffer: string[];
126
123
  /** Performance cache for last content to avoid iteration */
127
124
  lastContentCache?: string;
128
125
  /** Reference to the last processed node */
@@ -178,4 +175,4 @@ declare function streamHtmlToMarkdown(htmlStream: ReadableStream | null, options
178
175
  //#region src/index.d.ts
179
176
  declare function htmlToMarkdown(html: string, options?: HTMLToMarkdownOptions): string;
180
177
  //#endregion
181
- export { BufferRegion, ELEMENT_NODE, ElementNode, ExtractedElement, HTMLToMarkdownOptions, HandlerContext, MarkdownChunk, MarkdownProcessor, MdreamProcessingState, MdreamRuntimeState, Node, NodeEvent, Plugin, PluginContext, PluginCreationOptions, ReadabilityContext, SplitterOptions, TEXT_NODE, TagHandler, TagIdMap, TailwindContext, TextNode, createPlugin, htmlToMarkdown, parseHtml, streamHtmlToMarkdown };
178
+ export { ELEMENT_NODE, ElementNode, ExtractedElement, HTMLToMarkdownOptions, HandlerContext, MarkdownChunk, MarkdownProcessor, MdreamProcessingState, MdreamRuntimeState, Node, NodeEvent, Plugin, PluginContext, PluginCreationOptions, ReadabilityContext, SplitterOptions, TEXT_NODE, TagHandler, TagIdMap, TailwindContext, TextNode, createPlugin, htmlToMarkdown, parseHtml, streamHtmlToMarkdown };
package/dist/index.mjs CHANGED
@@ -1,6 +1,5 @@
1
- import { _n as TagIdMap } from "./_chunks/const.mjs";
1
+ import { s as TagIdMap } from "./_chunks/const.mjs";
2
2
  import { i as parseHtml, t as MarkdownProcessor } from "./_chunks/markdown-processor.mjs";
3
3
  import { t as createPlugin } from "./_chunks/plugin.mjs";
4
4
  import { n as streamHtmlToMarkdown, t as htmlToMarkdown } from "./_chunks/src.mjs";
5
-
6
- export { MarkdownProcessor, TagIdMap, createPlugin, htmlToMarkdown, parseHtml, streamHtmlToMarkdown };
5
+ export { MarkdownProcessor, TagIdMap, createPlugin, htmlToMarkdown, parseHtml, streamHtmlToMarkdown };
package/dist/llms-txt.mjs CHANGED
@@ -1,5 +1,469 @@
1
+ import "./_chunks/const.mjs";
1
2
  import "./_chunks/markdown-processor.mjs";
2
- import "./_chunks/src.mjs";
3
- import { n as generateLlmsTxtArtifacts, t as createLlmsTxtStream } from "./_chunks/llms-txt.mjs";
4
-
5
- export { createLlmsTxtStream, generateLlmsTxtArtifacts };
3
+ import "./_chunks/plugin.mjs";
4
+ import { t as htmlToMarkdown } from "./_chunks/src.mjs";
5
+ import { t as extractionPlugin } from "./_chunks/extraction.mjs";
6
+ import { mkdir, open, readFile } from "node:fs/promises";
7
+ import { basename, dirname, join, relative, sep } from "pathe";
8
+ import { glob } from "tinyglobby";
9
+ //#region src/llms-txt.ts
10
+ const FRONTMATTER_RE = /^---\n([\s\S]*?)\n---\n([\s\S]*)$/; // leading "---" fenced block: group 1 = frontmatter text, group 2 = everything after the closing fence
11
+ const ANCHOR_INVALID_CHARS_RE = /[^a-z0-9]/g; // every character other than a-z or 0-9 (global); applied to lower-cased titles to build anchors
12
+ const LEADING_SLASH_RE = /^\//; // one "/" at the very start of the string
13
+ const TRAILING_SLASH_RE = /\/$/; // one "/" at the very end of the string
/**
 * Collect title, description, keywords and author from an HTML document.
 *
 * The document is run through `htmlToMarkdown` with an extraction plugin attached;
 * the markdown output is discarded and only the values captured by the selector
 * callbacks are kept. A field is only written while it is still empty, so for each
 * field the earliest matching element that yields a value wins.
 *
 * @param html - Raw HTML source of the page
 * @param url - Passed to the converter as its `origin` option
 * @returns `{ title, description, keywords, author }`; fields that were never set are `undefined`
 */
function extractMetadata(html, url) {
  const found = {
    title: "",
    description: "",
    keywords: "",
    author: ""
  };
  // Builds a callback that copies the trimmed `content` attribute into `field`
  // unless that field already holds a value.
  const fromContentAttr = (field) => (element) => {
    if (!found[field] && element.attributes?.content) found[field] = element.attributes.content.trim();
  };
  const selectors = {
    "title": (element) => {
      if (!found.title && element.textContent) found.title = element.textContent.trim();
    },
    "meta[name=\"description\"]": fromContentAttr("description"),
    "meta[property=\"og:description\"]": fromContentAttr("description"),
    "meta[name=\"keywords\"]": fromContentAttr("keywords"),
    "meta[name=\"author\"]": fromContentAttr("author"),
    "meta[property=\"og:title\"]": fromContentAttr("title")
  };
  htmlToMarkdown(html, {
    plugins: [extractionPlugin(selectors)],
    origin: url
  });
  return {
    title: found.title || void 0,
    description: found.description || void 0,
    keywords: found.keywords || void 0,
    author: found.author || void 0
  };
}
/**
 * Turn a file path into a site-relative URL path.
 *
 * The path is made relative to `baseDir`, separators become "/", a trailing
 * ".html" and a trailing "/index" are dropped, and the result always starts
 * with "/". A bare "index" maps to "/".
 *
 * @param filePath - Path of the HTML file
 * @param baseDir - Directory the URL should be relative to
 * @returns URL path such as "/docs/guide"
 */
function pathToUrl(filePath, baseDir) {
  const HTML_EXT = ".html";
  const INDEX_SUFFIX = "/index";
  let route = relative(baseDir, filePath).split(sep).join("/");
  if (route.endsWith(HTML_EXT)) route = route.slice(0, -HTML_EXT.length);
  if (route.endsWith(INDEX_SUFFIX)) route = route.slice(0, -INDEX_SUFFIX.length);
  if (route === "index") return "/";
  return route.startsWith("/") ? route : `/${route}`;
}
/**
 * Read every HTML file matched by the glob pattern(s) and convert it to markdown.
 *
 * URLs are derived relative to the deepest directory shared by ALL matched files.
 * (Using the directory of whichever file the glob happened to return first would
 * give files outside that directory URLs starting with "/..".)
 *
 * A file that cannot be read or converted is logged with `console.error` and
 * skipped; the remaining files are still processed.
 *
 * @param patterns - A glob pattern or an array of glob patterns
 * @param origin - Origin passed to the converter; also used for metadata extraction
 * @returns Array of `{ filePath, title, content, url, metadata }`, one per processed file
 */
async function processHtmlFiles(patterns, origin) {
  const allPatterns = Array.isArray(patterns) ? patterns : [patterns];
  const allFiles = [];
  for (const pattern of allPatterns) {
    const files = await glob(pattern);
    allFiles.push(...files);
  }
  const uniqueFiles = [...new Set(allFiles)];
  // Deepest directory that is an ancestor of every matched file.
  let baseDir = ".";
  if (uniqueFiles.length > 0) {
    let shared = dirname(uniqueFiles[0]).split(sep);
    for (const filePath of uniqueFiles) {
      const dirSegments = dirname(filePath).split(sep);
      let depth = 0;
      while (depth < shared.length && depth < dirSegments.length && shared[depth] === dirSegments[depth]) depth++;
      shared = shared.slice(0, depth);
    }
    // A lone empty segment is what remains of an absolute path's root.
    if (shared.length === 1 && shared[0] === "") baseDir = sep;
    else if (shared.length > 0) baseDir = shared.join(sep);
  }
  const results = [];
  for (const filePath of uniqueFiles) try {
    const html = await readFile(filePath, "utf-8");
    const metadata = extractMetadata(html, origin || filePath);
    const content = htmlToMarkdown(html, { origin });
    const url = pathToUrl(filePath, baseDir);
    results.push({
      filePath,
      title: metadata?.title || basename(filePath, ".html"),
      content,
      url,
      metadata
    });
  } catch (error) {
    // Best effort: report the failing file and continue with the rest.
    console.error(`Error processing ${filePath}:`, error);
  }
  return results;
}
/**
 * Build the text of an `llms.txt` index.
 *
 * Layout: `# siteName`, optional `> description`, optional `Canonical Origin:` line,
 * the configured sections, a `## Pages` list (only when there are files) and the notes.
 * Page descriptions are cut to 100 characters followed by "...".
 *
 * Links: a page whose `filePath` ends in ".md" (and when `options.outputDir` is set)
 * links to that file relative to `outputDir`; every other page links to its URL.
 * Non-absolute URLs are joined onto `origin` with exactly one "/" between them, so a
 * trailing slash on the origin or a missing leading slash on the URL no longer
 * produces "https://x.com//path" or "https://x.compath".
 *
 * @param files - Pages with `title`, `url` and optional `filePath` / `metadata`
 * @param options - `siteName`, `description`, `origin`, `sections`, `notes`, `outputDir`
 * @returns The complete llms.txt content
 */
function generateLlmsTxtContent(files, options) {
  const { siteName = "Site", description, origin = "", sections, notes } = options;
  const originBase = origin.replace(/\/+$/, "");
  const toAbsoluteUrl = (url) => {
    if (!origin || url.startsWith("http://") || url.startsWith("https://")) return url;
    return url.startsWith("/") ? originBase + url : `${originBase}/${url}`;
  };
  let content = `# ${siteName}\n\n`;
  if (description) content += `> ${description}\n\n`;
  if (origin) content += `Canonical Origin: ${origin}\n\n`;
  if (sections) for (const section of sections) content += formatSection(section);
  if (files.length > 0) {
    content += `## Pages\n\n`;
    for (const file of files) {
      const desc = file.metadata?.description;
      const descText = desc ? `: ${desc.substring(0, 100)}${desc.length > 100 ? "..." : ""}` : "";
      const isLocalMarkdown = file.filePath && options.outputDir && file.filePath.endsWith(".md");
      const target = isLocalMarkdown ? relative(options.outputDir, file.filePath) : toAbsoluteUrl(file.url);
      content += `- [${file.title}](${target})${descText}\n`;
    }
  }
  if (notes) content += `\n${formatNotes(notes)}`;
  return content;
}
/**
 * Split a markdown document into its leading frontmatter and the remaining body.
 *
 * Frontmatter is a block fenced by `---` lines at the very start of the document.
 * Each `key: value` line becomes one entry (split at the first colon, both sides
 * trimmed); lines without a colon, or starting with one, are ignored.
 * Both LF and CRLF line endings are accepted.
 *
 * @param content - Markdown text, possibly starting with a frontmatter block
 * @returns `{ frontmatter, body }`; `frontmatter` is `null` and `body` is the
 *          untouched input when no frontmatter block is present
 */
function parseFrontmatter(content) {
  // `\r?\n` instead of `\n`: a document saved with Windows line endings would
  // otherwise never match and its frontmatter would be treated as body text.
  const match = /^---\r?\n([\s\S]*?)\r?\n---\r?\n([\s\S]*)$/.exec(content);
  if (!match) return {
    frontmatter: null,
    body: content
  };
  const frontmatter = {};
  for (const line of match[1].split(/\r?\n/)) {
    const colonIndex = line.indexOf(":");
    // > 0 so that a line starting with ":" (empty key) is skipped.
    if (colonIndex > 0) {
      const key = line.substring(0, colonIndex).trim();
      frontmatter[key] = line.substring(colonIndex + 1).trim();
    }
  }
  return {
    frontmatter,
    body: match[2]
  };
}
/**
 * Render an object as `key: value` lines, one per entry, joined with "\n".
 * Entries whose value is `undefined` or `null` are left out; every other value
 * is converted with `String()`.
 *
 * @param data - Flat object to serialize
 * @returns The serialized lines (no surrounding `---` fences)
 */
function serializeFrontmatter(data) {
  return Object.entries(data)
    .filter(([, value]) => value !== void 0 && value !== null)
    .map(([key, value]) => `${key}: ${String(value)}`)
    .join("\n");
}
/**
 * Build the text of `llms-full.txt`: the same header as llms.txt, a table of
 * contents, and then every page's full content preceded by a frontmatter block.
 *
 * For each page the frontmatter already present in its content (if any) is merged
 * with `title`, `url`, `file`, `description`, `keywords` and `author`; the generated
 * values win on key conflicts. A first body line equal to the title (plain or as
 * `# title`) is dropped so the title is not repeated.
 *
 * Non-absolute URLs are joined onto `origin` with exactly one "/" between them, so a
 * trailing slash on the origin or a missing leading slash on the URL cannot produce
 * a malformed link.
 *
 * @param files - Pages with `title`, `url`, `content` and optional `filePath` / `metadata`
 * @param options - `siteName`, `description`, `origin`, `sections`, `notes`, `outputDir`
 * @returns The complete llms-full.txt content
 */
function generateLlmsFullTxtContent(files, options) {
  const { siteName = "Site", description, origin = "", sections, notes } = options;
  const originBase = origin.replace(/\/+$/, "");
  const toAbsoluteUrl = (url) => {
    if (!origin || url.startsWith("http://") || url.startsWith("https://")) return url;
    return url.startsWith("/") ? originBase + url : `${originBase}/${url}`;
  };
  let content = `# ${siteName}\n\n`;
  if (description) content += `> ${description}\n\n`;
  if (origin) content += `Canonical Origin: ${origin}\n\n`;
  if (sections) for (const section of sections) content += formatSection(section);
  if (files.length > 0) {
    content += `## Table of Contents\n\n`;
    for (const file of files) {
      const anchor = file.title.toLowerCase().replace(ANCHOR_INVALID_CHARS_RE, "-");
      content += `- [${file.title}](#${anchor})\n`;
    }
    content += `\n---\n\n`;
    for (const file of files) {
      const url = toAbsoluteUrl(file.url);
      const { frontmatter, body } = parseFrontmatter(file.content);
      const metadata = {
        title: file.title,
        url
      };
      if (file.filePath && options.outputDir) metadata.file = relative(options.outputDir, file.filePath);
      else if (file.filePath) metadata.file = file.filePath;
      if (file.metadata) {
        if (file.metadata.description) metadata.description = file.metadata.description;
        if (file.metadata.keywords) metadata.keywords = file.metadata.keywords;
        if (file.metadata.author) metadata.author = file.metadata.author;
      }
      // Spread order matters: generated metadata overrides keys from the page's own frontmatter.
      const frontmatterString = serializeFrontmatter(frontmatter ? {
        ...frontmatter,
        ...metadata
      } : metadata);
      let contentBody = frontmatter ? body : file.content;
      // Trim and split once; reused for the title check and for dropping the title line.
      const bodyLines = contentBody.trim().split("\n");
      const titleLine = bodyLines[0];
      if (titleLine === file.title || titleLine === `# ${file.title}`) contentBody = bodyLines.slice(1).join("\n").trimStart();
      content += `---\n${frontmatterString}\n---\n\n${contentBody}\n\n---\n\n`;
    }
  }
  if (notes) content += `\n${formatNotes(notes)}`;
  return content;
}
/**
 * Map each page to the markdown file that should be written for it.
 *
 * The path is `md/<url>.md`, where one leading and one trailing "/" are removed
 * from the URL; the root URL "/" becomes `md/index.md`.
 *
 * @param files - Pages with `url` and `content`
 * @returns Array of `{ path, content }`
 */
function generateMarkdownFilesContent(files) {
  return Array.from(files, (file) => {
    let slug = "index";
    if (file.url !== "/") {
      slug = file.url;
      if (slug.startsWith("/")) slug = slug.slice(1);
      if (slug.endsWith("/")) slug = slug.slice(0, -1);
    }
    return {
      path: `md/${slug}.md`,
      content: file.content
    };
  });
}
/**
 * Entry point: produce the llms.txt artifacts for a set of pages.
 *
 * Pages come from `options.files` when given, otherwise they are loaded from the
 * HTML files matching `options.patterns`. `llms.txt` content is always produced;
 * `llms-full.txt` content and the per-page markdown files are produced only when
 * `options.generateFull` / `options.generateMarkdown` are set.
 *
 * @param options - Generation options (`files` or `patterns` is required)
 * @returns `{ llmsTxt, llmsFullTxt, markdownFiles, processedFiles }`
 * @throws Error when neither `files` nor `patterns` is provided
 */
async function generateLlmsTxtArtifacts(options) {
  const { files: providedFiles, patterns } = options;
  if (!providedFiles && !patterns) throw new Error("Either patterns or files must be provided");
  const files = providedFiles ? providedFiles : await processHtmlFiles(patterns, options.origin);
  return {
    llmsTxt: generateLlmsTxtContent(files, options),
    llmsFullTxt: options.generateFull ? generateLlmsFullTxtContent(files, options) : void 0,
    markdownFiles: options.generateMarkdown ? generateMarkdownFilesContent(files) : void 0,
    processedFiles: files
  };
}
/**
 * Render one section as markdown: a `## title` heading, each description
 * paragraph followed by a blank line, then a bullet list of links.
 * The link list (and the blank line after it) is emitted only when there is at
 * least one link.
 *
 * @param section - `{ title, description?, links? }`; `description` may be a string or an array of strings
 * @returns Markdown text for the section
 */
function formatSection(section) {
  const pieces = [`## ${section.title}\n\n`];
  if (section.description) {
    const paragraphs = Array.isArray(section.description) ? section.description : [section.description];
    for (const paragraph of paragraphs) pieces.push(`${paragraph}\n\n`);
  }
  const links = section.links;
  if (links?.length) {
    for (const link of links) {
      const suffix = link.description ? `: ${link.description}` : "";
      pieces.push(`- [${link.title}](${link.href})${suffix}\n`);
    }
    pieces.push("\n");
  }
  return pieces.join("");
}
/**
 * Render the notes block: every note on its own line followed by a blank line.
 *
 * @param notes - A single note or an array of notes
 * @returns The concatenated notes text
 */
function formatNotes(notes) {
  const items = Array.isArray(notes) ? notes : [notes];
  return items.map((note) => `${note}\n\n`).join("");
}
257
+ /**
258
+ * Create a WritableStream that generates llms.txt artifacts by streaming pages to disk
259
+ *
260
+ * Writes llms.txt (and optionally llms-full.txt) incrementally as pages are written,
261
+ * never keeping full content in memory. Creates outputDir recursively if needed.
262
+ *
263
+ * @example
264
+ * ```typescript
265
+ * const stream = createLlmsTxtStream({
266
+ * siteName: 'My Docs',
267
+ * description: 'Documentation site',
268
+ * origin: 'https://example.com',
269
+ * generateFull: true,
270
+ * outputDir: './dist',
271
+ * sections: [
272
+ * {
273
+ * title: 'Getting Started',
274
+ * description: 'Quick start guide',
275
+ * links: [
276
+ * { title: 'Installation', href: '/install', description: 'How to install' },
277
+ * { title: 'Quick Start', href: '/quickstart' },
278
+ * ],
279
+ * },
280
+ * ],
281
+ * notes: ['Generated by mdream', 'Last updated: 2024'],
282
+ * })
283
+ *
284
+ * const writer = stream.getWriter()
285
+ * await writer.write({
286
+ * title: 'Home',
287
+ * content: '# Welcome\n\nHome page content.',
288
+ * url: '/',
289
+ * })
290
+ * await writer.close()
291
+ * ```
292
+ *
293
+ * @param options - Configuration options
294
+ * @returns WritableStream that accepts ProcessedFile objects
295
+ */
/**
 * Return the grouping prefix of a URL path: its first path segment, or its first
 * two segments when `depth` is not 1 and the path has more than one segment.
 * A path without segments yields "/".
 *
 * @param url - URL path such as "/docs/api/intro"
 * @param depth - 1 limits the prefix to a single segment
 * @returns Prefix such as "/docs" or "/docs/api"
 */
function getGroupPrefix(url, depth) {
  const parts = url.split("/").filter((part) => part.length > 0);
  const [head, next] = parts;
  if (head === undefined) return "/";
  const singleSegment = depth === 1 || parts.length === 1;
  return singleSegment ? `/${head}` : `/${head}/${next}`;
}
/**
 * Return the pages ordered by URL path so that related pages sit together.
 *
 * Grouping rules:
 * - A page with at most one path segment whose first segment has no nested pages
 *   belongs to the "root" group, which is listed first.
 * - Otherwise the group is the two-segment prefix when more than one page shares
 *   it, else the first segment.
 * - Groups are ordered with `localeCompare`; inside a group the site root comes
 *   first, then pages are compared segment by segment, shorter paths first on a tie.
 *
 * The input array is left untouched (a new array is returned), each URL is split
 * only once instead of on every comparison, and two root pages compare as equal so
 * the comparator is consistent.
 *
 * @param pages - Objects with a `url` path
 * @returns New array containing the same page objects in sorted order
 */
function sortPagesByPath(pages) {
  // Same rule as getGroupPrefix(url, 2), applied to already-split segments.
  const twoSegmentPrefix = (segments) => {
    if (segments.length === 0) return "/";
    if (segments.length === 1) return `/${segments[0]}`;
    return `/${segments[0]}/${segments[1]}`;
  };
  const twoSegmentCount = /* @__PURE__ */ new Map();
  const segmentHasNested = /* @__PURE__ */ new Map();
  // Pass 1: split every URL once and gather the counts the grouping depends on.
  const decorated = pages.map((page) => {
    const segments = page.url.split("/").filter(Boolean);
    const prefix = twoSegmentPrefix(segments);
    const first = segments.length > 0 ? segments[0] : "";
    twoSegmentCount.set(prefix, (twoSegmentCount.get(prefix) || 0) + 1);
    if (segments.length > 1) segmentHasNested.set(first, true);
    else if (!segmentHasNested.has(first)) segmentHasNested.set(first, false);
    return { page, segments, prefix, first };
  });
  // Pass 2: the group key needs the complete counts, so it is resolved afterwards.
  const keyed = decorated.map((entry) => {
    const isRootLevel = entry.segments.length <= 1;
    let groupKey = (twoSegmentCount.get(entry.prefix) || 0) > 1 ? entry.prefix : `/${entry.first}`;
    if (isRootLevel && !segmentHasNested.get(entry.first)) groupKey = "";
    return { ...entry, groupKey };
  });
  keyed.sort((a, b) => {
    if (a.groupKey !== b.groupKey) {
      // The root group ("") always precedes named groups.
      if (a.groupKey === "") return -1;
      if (b.groupKey === "") return 1;
      return a.groupKey.localeCompare(b.groupKey);
    }
    const aIsRoot = a.segments.length === 0;
    const bIsRoot = b.segments.length === 0;
    if (aIsRoot || bIsRoot) {
      if (aIsRoot && bIsRoot) return 0;
      return aIsRoot ? -1 : 1;
    }
    const minLen = Math.min(a.segments.length, b.segments.length);
    for (let i = 0; i < minLen; i++) {
      const cmp = a.segments[i].localeCompare(b.segments[i]);
      if (cmp !== 0) return cmp;
    }
    return a.segments.length - b.segments.length;
  });
  return keyed.map((entry) => entry.page);
}
/**
 * Create a WritableStream that writes `llms.txt` (and, when `generateFull` is set,
 * `llms-full.txt`) into `outputDir`.
 *
 * - `start` creates `outputDir` recursively, opens the file(s) and writes the header
 *   (site name, description, canonical origin, sections).
 * - `write` remembers each page's url / title / description / filePath and, for the
 *   full file, appends the page's frontmatter and body straight away.
 * - `close` sorts the remembered pages, writes the grouped `## Pages` list, appends
 *   the notes and closes the file(s).
 * - `abort` closes the file(s).
 *
 * Open file handles are released when `start`, `write` or `close` fails, not only on
 * a clean close. Non-absolute page URLs are joined onto `origin` with exactly one "/".
 *
 * @param options - `siteName`, `description`, `origin`, `generateFull`,
 *                  `outputDir` (defaults to `process.cwd()`), `sections`, `notes`
 * @returns WritableStream accepting page objects (`title`, `content`, `url`,
 *          optional `filePath` and `metadata`)
 */
function createLlmsTxtStream(options = {}) {
  const { siteName = "Site", description, origin = "", generateFull, outputDir = process.cwd(), sections, notes } = options;
  let llmsTxtHandle;
  let llmsFullTxtHandle;
  const bufferedPages = [];
  const originBase = origin.replace(/\/+$/, "");
  const toAbsoluteUrl = (url) => {
    if (!origin || url.startsWith("http://") || url.startsWith("https://")) return url;
    return url.startsWith("/") ? originBase + url : `${originBase}/${url}`;
  };
  // Closes whichever files are currently open and forgets them, so a second call is
  // a no-op. Every handle gets its close attempt even if an earlier one fails.
  const closeHandles = async () => {
    const handles = [llmsTxtHandle, llmsFullTxtHandle].filter(Boolean);
    llmsTxtHandle = void 0;
    llmsFullTxtHandle = void 0;
    return Promise.allSettled(handles.map((handle) => handle.close()));
  };
  return new WritableStream({
    async start() {
      await mkdir(outputDir, { recursive: true });
      try {
        llmsTxtHandle = await open(join(outputDir, "llms.txt"), "w");
        // Both files start with the same header.
        let header = `# ${siteName}\n\n`;
        if (description) header += `> ${description}\n\n`;
        if (origin) header += `Canonical Origin: ${origin}\n\n`;
        if (sections) for (const section of sections) header += formatSection(section);
        await llmsTxtHandle.write(header);
        if (generateFull) {
          llmsFullTxtHandle = await open(join(outputDir, "llms-full.txt"), "w");
          await llmsFullTxtHandle.write(header);
        }
      } catch (error) {
        // Do not leave llms.txt open when a later step of start fails.
        await closeHandles();
        throw error;
      }
    },
    async write(file) {
      bufferedPages.push({
        url: file.url,
        title: file.title,
        description: file.metadata?.description,
        filePath: file.filePath
      });
      if (!generateFull || !llmsFullTxtHandle) return;
      try {
        const url = toAbsoluteUrl(file.url);
        const { frontmatter, body } = parseFrontmatter(file.content);
        const metadata = {
          title: file.title,
          url
        };
        if (file.filePath) metadata.file = relative(outputDir, file.filePath);
        if (file.metadata) {
          if (file.metadata.description) metadata.description = file.metadata.description;
          if (file.metadata.keywords) metadata.keywords = file.metadata.keywords;
          if (file.metadata.author) metadata.author = file.metadata.author;
        }
        // Generated metadata overrides keys from the page's own frontmatter.
        const frontmatterString = serializeFrontmatter(frontmatter ? {
          ...frontmatter,
          ...metadata
        } : metadata);
        let contentBody = frontmatter ? body : file.content;
        const bodyLines = contentBody.trim().split("\n");
        const titleLine = bodyLines[0];
        // Drop a first line that merely repeats the title.
        if (titleLine === file.title || titleLine === `# ${file.title}`) contentBody = bodyLines.slice(1).join("\n").trimStart();
        await llmsFullTxtHandle.write(`---\n${frontmatterString}\n---\n\n${contentBody}\n\n---\n\n`);
      } catch (error) {
        // A failed write errors the stream, after which close() is never called.
        await closeHandles();
        throw error;
      }
    },
    async close() {
      try {
        const sortedPages = sortPagesByPath(bufferedPages);
        // Recompute the same grouping inputs the sort used, to detect group changes below.
        const twoSegmentCount = /* @__PURE__ */ new Map();
        const segmentHasNested = /* @__PURE__ */ new Map();
        for (const page of sortedPages) {
          const prefix = getGroupPrefix(page.url, 2);
          twoSegmentCount.set(prefix, (twoSegmentCount.get(prefix) || 0) + 1);
          const segments = page.url.split("/").filter(Boolean);
          const firstSegment = segments.length > 0 ? segments[0] : "";
          if (!segmentHasNested.has(firstSegment)) segmentHasNested.set(firstSegment, false);
          if (segments.length > 1) segmentHasNested.set(firstSegment, true);
        }
        await llmsTxtHandle?.write(`## Pages\n\n`);
        let currentGroup = "";
        let segmentGroupIndex = 0;
        let urlsInCurrentGroup = 0;
        for (let i = 0; i < sortedPages.length; i++) {
          const page = sortedPages[i];
          const segments = page.url.split("/").filter(Boolean);
          const firstSegment = segments.length > 0 ? segments[0] : "";
          const twoSegPrefix = getGroupPrefix(page.url, 2);
          let groupKey = (twoSegmentCount.get(twoSegPrefix) || 0) > 1 ? twoSegPrefix : `/${firstSegment}`;
          const isRootLevel = segments.length <= 1;
          const hasNested = segmentHasNested.get(firstSegment);
          if (isRootLevel && !hasNested) groupKey = "";
          if (groupKey !== currentGroup) {
            // Condition kept exactly as written: a blank separator line is emitted
            // only when leaving one of the first groups that held more than one URL.
            if (urlsInCurrentGroup > 0) {
              if (segmentGroupIndex === 0 || segmentGroupIndex >= 1 && segmentGroupIndex <= 2 && urlsInCurrentGroup > 1) await llmsTxtHandle?.write("\n");
            }
            currentGroup = groupKey;
            segmentGroupIndex++;
            urlsInCurrentGroup = 0;
          }
          urlsInCurrentGroup++;
          const descText = page.description ? `: ${page.description.substring(0, 160)}${page.description.length > 160 ? "..." : ""}` : "";
          // Pages backed by a markdown file link to that file; all others link to their URL.
          const target = page.filePath && page.filePath.endsWith(".md") ? relative(outputDir, page.filePath) : toAbsoluteUrl(page.url);
          await llmsTxtHandle?.write(`- [${page.title}](${target})${descText}\n`);
        }
        if (notes) {
          const notesContent = formatNotes(notes);
          await llmsTxtHandle?.write(`\n${notesContent}`);
          if (generateFull && llmsFullTxtHandle) await llmsFullTxtHandle.write(`\n${notesContent}`);
        }
      } catch (error) {
        await closeHandles();
        throw error;
      }
      // Clean finish: still surface a failure to close (data may not have been flushed).
      const results = await closeHandles();
      for (const result of results) if (result.status === "rejected") throw result.reason;
    },
    async abort(_reason) {
      await closeHandles();
    }
  });
}
468
+ //#endregion
469
+ export { createLlmsTxtStream, generateLlmsTxtArtifacts };
@@ -0,0 +1,26 @@
1
+ //#region src/negotiate.d.ts
2
+ interface AcceptEntry { // one comma-separated entry of a parsed Accept header
3
+ type: string; // media type text before any ";" parameters, trimmed
4
+ q: number; // quality weight from the q= parameter; 1 when the parameter is absent
5
+ position: number; // zero-based index of the entry within the header, kept for tie-breaking
6
+ }
7
+ /**
8
+ * Parse an HTTP Accept header into an ordered list of media types with quality values.
9
+ * Supports quality weights (q=0.9) and preserves original position for tie-breaking.
10
+ */
11
+ declare function parseAcceptHeader(accept: string): AcceptEntry[];
12
+ /**
13
+ * Determine if a client prefers markdown over HTML using proper content negotiation.
14
+ *
15
+ * Uses Accept header quality weights and position ordering:
16
+ * - If text/markdown or text/plain has higher quality than text/html → markdown
17
+ * - If same quality, earlier position in Accept header wins
18
+ * - Bare wildcard does NOT trigger markdown (prevents breaking OG crawlers)
19
+ * - sec-fetch-dest: document always returns false (browser navigation)
20
+ *
21
+ * @param acceptHeader - The HTTP Accept header value
22
+ * @param secFetchDest - The Sec-Fetch-Dest header value
23
+ */
24
+ declare function shouldServeMarkdown(acceptHeader?: string, secFetchDest?: string): boolean;
25
+ //#endregion
26
+ export { parseAcceptHeader, shouldServeMarkdown };
@@ -0,0 +1,92 @@
1
+ //#region src/negotiate.ts
/**
 * Parse an HTTP Accept header into a list of media types with quality values.
 *
 * Entries are returned in header order. `position` is the entry's index in the
 * comma-separated header, so callers can break quality ties by original order.
 *
 * The quality is read from the parameter whose NAME is exactly `q`
 * (case-insensitive). A substring search for "q=" would also match parameters
 * such as `freq=…` and would miss `Q=…`. A missing `q` means 1; a `q` that is
 * not a number means 0.
 *
 * NOTE(review): quoted parameter values containing "," or ";" are not handled.
 *
 * @param accept - Raw Accept header value
 * @returns Array of `{ type, q, position }`; empty for an empty header
 */
function parseAcceptHeader(accept) {
  if (!accept) return [];
  const entries = [];
  const parts = accept.split(",");
  for (let i = 0; i < parts.length; i++) {
    const part = parts[i].trim();
    if (!part) continue;
    const [rawType, ...params] = part.split(";");
    let q = 1;
    for (const param of params) {
      const eqIdx = param.indexOf("=");
      if (eqIdx === -1) continue;
      if (param.slice(0, eqIdx).trim().toLowerCase() !== "q") continue;
      q = +param.slice(eqIdx + 1).trim() || 0;
      break;
    }
    entries.push({
      type: rawType.trim(),
      q,
      position: i
    });
  }
  return entries;
}
/**
 * Decide whether a client prefers markdown over HTML.
 *
 * Rules:
 * - `Sec-Fetch-Dest: document` (browser navigation) never gets markdown.
 * - Only `text/markdown` and `text/plain` count as a markdown preference; wildcards
 *   such as a bare any-type range do not.
 * - A markdown type with `q=0` is ignored: a zero weight means "not acceptable".
 * - Markdown wins when its best quality is higher than that of `text/html`, or when
 *   the qualities are equal and it appears earlier in the header.
 * - Markdown also wins when `text/html` is not listed at all.
 *
 * Media types are compared case-insensitively, and the quality is read from the
 * parameter whose name is exactly `q` (so `freq=…` is not mistaken for it and
 * `Q=…` is honoured).
 *
 * @param acceptHeader - The HTTP Accept header value
 * @param secFetchDest - The Sec-Fetch-Dest header value
 * @returns `true` when markdown should be served
 */
function shouldServeMarkdown(acceptHeader, secFetchDest) {
  if (secFetchDest === "document") return false;
  if (!acceptHeader) return false;
  let bestMdQ = -1;
  let bestMdPos = -1;
  let htmlQ = -1;
  let htmlPos = -1;
  const parts = acceptHeader.split(",");
  for (let i = 0; i < parts.length; i++) {
    const part = parts[i].trim();
    if (!part) continue;
    const [rawType, ...params] = part.split(";");
    const type = rawType.trim().toLowerCase();
    const isMarkdown = type === "text/markdown" || type === "text/plain";
    if (!isMarkdown && type !== "text/html") continue;
    let q = 1;
    for (const param of params) {
      const eqIdx = param.indexOf("=");
      if (eqIdx === -1) continue;
      if (param.slice(0, eqIdx).trim().toLowerCase() !== "q") continue;
      q = +param.slice(eqIdx + 1).trim() || 0;
      break;
    }
    if (isMarkdown) {
      // Strictly greater: on equal quality the earlier entry is kept.
      if (q > 0 && q > bestMdQ) {
        bestMdQ = q;
        bestMdPos = i;
      }
    } else {
      htmlQ = q;
      htmlPos = i;
    }
  }
  if (bestMdPos === -1) return false;
  if (htmlPos === -1) return true;
  if (bestMdQ !== htmlQ) return bestMdQ > htmlQ;
  return bestMdPos < htmlPos;
}
91
+ //#endregion
92
+ export { parseAcceptHeader, shouldServeMarkdown };
@@ -1,8 +1,7 @@
1
- import { d as Plugin, x as extractionPlugin } from "./_chunks/types.mjs";
1
+ import { b as extractionPlugin, u as Plugin } from "./_chunks/types.mjs";
2
2
  import { t as createPlugin } from "./_chunks/plugin.mjs";
3
3
 
4
4
  //#region src/plugins/filter.d.ts
5
-
6
5
  /**
7
6
  * Plugin that filters nodes based on CSS selectors.
8
7
  * Allows including or excluding nodes based on selectors.
@@ -16,11 +15,8 @@ import { t as createPlugin } from "./_chunks/plugin.mjs";
16
15
  * filterPlugin({ exclude: ['nav', '#sidebar', '.footer'] })
17
16
  */
18
17
  declare function filterPlugin(options?: {
19
- /** CSS selectors (or Tag Ids) for elements to include (all others will be excluded) */
20
- include?: (string | number)[];
21
- /** CSS selectors (or Tag Ids) for elements to exclude */
22
- exclude?: (string | number)[];
23
- /** Whether to also process the children of matching elements */
18
+ /** CSS selectors (or Tag Ids) for elements to include (all others will be excluded) */include?: (string | number)[]; /** CSS selectors (or Tag Ids) for elements to exclude */
19
+ exclude?: (string | number)[]; /** Whether to also process the children of matching elements */
24
20
  processChildren?: boolean;
25
21
  keepAbsolute?: boolean;
26
22
  }): Plugin;
@@ -72,17 +68,10 @@ declare function frontmatterPlugin(options?: FrontmatterPluginOptions): Plugin;
72
68
  */
73
69
  declare function isolateMainPlugin(): Plugin;
74
70
  //#endregion
75
- //#region src/plugins/readability.d.ts
76
- /**
77
- * Creates a plugin that implements readability.js style heuristics for content quality assessment
78
- * Controls content inclusion/exclusion using buffer regions
79
- */
80
- declare function readabilityPlugin(): Plugin;
81
- //#endregion
82
71
  //#region src/plugins/tailwind.d.ts
83
72
  /**
84
73
  * Creates a plugin that adds Tailwind class processing
85
74
  */
86
75
  declare function tailwindPlugin(): Plugin;
87
76
  //#endregion
88
- export { createPlugin, extractionPlugin, filterPlugin, frontmatterPlugin, isolateMainPlugin, readabilityPlugin, tailwindPlugin };
77
+ export { createPlugin, extractionPlugin, filterPlugin, frontmatterPlugin, isolateMainPlugin, tailwindPlugin };
package/dist/plugins.mjs CHANGED
@@ -1,5 +1,5 @@
1
+ import "./_chunks/const.mjs";
1
2
  import { t as createPlugin } from "./_chunks/plugin.mjs";
2
3
  import { t as extractionPlugin } from "./_chunks/extraction.mjs";
3
- import { a as filterPlugin, i as frontmatterPlugin, n as readabilityPlugin, r as isolateMainPlugin, t as tailwindPlugin } from "./_chunks/plugins.mjs";
4
-
5
- export { createPlugin, extractionPlugin, filterPlugin, frontmatterPlugin, isolateMainPlugin, readabilityPlugin, tailwindPlugin };
4
+ import { i as filterPlugin, n as isolateMainPlugin, r as frontmatterPlugin, t as tailwindPlugin } from "./_chunks/plugins.mjs";
5
+ export { createPlugin, extractionPlugin, filterPlugin, frontmatterPlugin, isolateMainPlugin, tailwindPlugin };