@agent-seo/core 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js ADDED
@@ -0,0 +1,867 @@
1
+ // src/detect.ts
2
/**
 * Known AI crawler / agent signatures.
 *
 * Each entry is `{ pattern, info }` where `pattern` is a case-insensitive
 * RegExp and `info` is `{ name, operator, purpose, rendersJs }`.
 * The table below is written as compact rows and expanded into that shape.
 */
var AI_BOT_REGISTRY = [
  // [pattern source, name, operator, purpose, rendersJs]
  // === OpenAI ===
  ["GPTBot", "GPTBot", "OpenAI", "training", false],
  ["OAI-SearchBot", "OAI-SearchBot", "OpenAI", "search", false],
  ["ChatGPT-User", "ChatGPT-User", "OpenAI", "agent-browsing", true],
  // === Anthropic ===
  ["ClaudeBot", "ClaudeBot", "Anthropic", "training", false],
  ["Claude-User", "Claude-User", "Anthropic", "agent-browsing", true],
  ["Claude-SearchBot", "Claude-SearchBot", "Anthropic", "search", false],
  ["anthropic-ai", "anthropic-ai", "Anthropic", "training", false],
  // === Perplexity ===
  ["PerplexityBot", "PerplexityBot", "Perplexity", "search", false],
  ["Perplexity-User", "Perplexity-User", "Perplexity", "agent-browsing", true],
  // === Google ===
  ["Google-Extended", "Google-Extended", "Google", "training", true],
  // === Apple ===
  ["Applebot-Extended", "Applebot-Extended", "Apple", "training", true],
  // === Meta ===
  ["meta-externalagent", "Meta-ExternalAgent", "Meta", "training", false],
  ["FacebookBot", "FacebookBot", "Meta", "search", false],
  // === Common Crawl ===
  ["CCBot", "CCBot", "Common Crawl", "training", false],
  // === Cohere ===
  ["cohere-ai", "cohere-ai", "Cohere", "training", false],
  // === Amazon ===
  ["Amazonbot", "Amazonbot", "Amazon", "search", false],
  // === Bytedance ===
  ["Bytespider", "Bytespider", "ByteDance", "training", false],
  // === You.com ===
  ["YouBot", "YouBot", "You.com", "search", false],
  // === DeepSeek ===
  ["Deepseek", "DeepSeekBot", "DeepSeek", "training", false]
].map(([source, name, operator, purpose, rendersJs]) => ({
  pattern: new RegExp(source, "i"),
  info: { name, operator, purpose, rendersJs }
}));
92
// Each registry entry paired with the lowercase literal token derived from its
// pattern; `token` is null when the pattern is not a plain literal.
var TOKEN_REGISTRY = AI_BOT_REGISTRY.map(function pairWithToken(entry) {
  const token = regexToToken(entry.pattern);
  return { entry, token };
});
96
/**
 * Classify a request from its User-Agent and Accept headers.
 *
 * @param {string} [userAgent] - Raw User-Agent header value.
 * @param {string} [acceptHeader] - Raw Accept header value.
 * @returns {{isAIBot: boolean, bot: object|null, wantsMarkdown: boolean}}
 *   `bot` is the `info` object of the first matching registry entry.
 */
function detectAgent(userAgent, acceptHeader) {
  const wantsMarkdown = Boolean(acceptHeader) && /text\/markdown/i.test(acceptHeader);
  const noMatch = { isAIBot: false, bot: null, wantsMarkdown };
  if (!userAgent) return noMatch;

  const loweredUa = userAgent.toLowerCase();
  // Entries with a literal token use a cheap substring check; only entries
  // without one fall back to running their regular expression.
  const hit = TOKEN_REGISTRY.find(({ entry, token }) =>
    token ? loweredUa.includes(token) : entry.pattern.test(userAgent)
  );
  return hit ? { isAIBot: true, bot: hit.entry.info, wantsMarkdown } : noMatch;
}
115
/**
 * Decide whether a request should receive the Markdown representation.
 *
 * @param {string} [userAgent] - Raw User-Agent header value.
 * @param {string} [acceptHeader] - Raw Accept header value.
 * @returns {boolean} True when the agent is a known AI bot or asked for Markdown.
 */
function shouldServeMarkdown(userAgent, acceptHeader) {
  const { isAIBot, wantsMarkdown } = detectAgent(userAgent, acceptHeader);
  return isAIBot || wantsMarkdown;
}
119
/**
 * Derive a lowercase substring token from a pattern whose source consists
 * solely of letters, digits and hyphens.
 *
 * @param {RegExp} pattern
 * @returns {string|null} The lowercased source, or null for any other pattern.
 */
function regexToToken(pattern) {
  const { source } = pattern;
  const isPlainLiteral = /^[A-Za-z0-9-]+$/.test(source);
  return isPlainLiteral ? source.toLowerCase() : null;
}
124
+
125
+ // src/transform.ts
126
+ import { JSDOM as JSDOM2 } from "jsdom";
127
+ import { Readability, isProbablyReaderable } from "@mozilla/readability";
128
+
129
+ // src/sanitize.ts
130
+ import { JSDOM } from "jsdom";
131
// Selectors for non-content elements that are always removed first.
var DEFAULT_STRIP_TAGS = [
  "script",
  "style",
  "noscript",
  "iframe",
  "svg",
  "canvas",
  "video",
  "audio",
  "map",
  "object",
  "embed",
  "applet",
  'link[rel="stylesheet"]',
  "meta"
];

// Selectors for page chrome and boilerplate removed after the tag pass.
var DEFAULT_STRIP_SELECTORS = [
  // Navigation & chrome
  "nav",
  "header:not(article header)",
  "footer:not(article footer)",
  '[role="navigation"]',
  '[role="banner"]',
  '[role="contentinfo"]',
  '[role="complementary"]',
  "aside",
  // Ads, cookies, popups
  ".advertisement",
  ".ad",
  ".ads",
  // Match only class tokens that START with "ad-" / "ad_". A bare substring
  // match (`[class*="ad-"]`) also hits "lead-", "head-", "download-",
  // "thread_" and similar, which strips legitimate content.
  '[class^="ad-"]',
  '[class*=" ad-"]',
  '[class^="ad_"]',
  '[class*=" ad_"]',
  ".cookie-banner",
  ".cookie-consent",
  '[class*="cookie"]',
  ".popup",
  ".modal",
  '[class*="popup"]',
  '[class*="modal"]',
  ".overlay",
  // Social & sharing
  ".social-share",
  ".share-buttons",
  '[class*="social"]',
  '[class*="share"]',
  ".follow-us",
  // Comments & forms (not the content)
  ".comments",
  "#comments",
  ".comment-form",
  'form:not([class*="search"])',
  // Related content / sidebar noise
  ".related-posts",
  ".recommended",
  ".sidebar",
  ".widget",
  '[class*="related"]',
  '[class*="sidebar"]',
  '[class*="widget"]',
  ".newsletter",
  ".subscribe",
  '[class*="newsletter"]',
  ".cta",
  '[class*="cta"]',
  // Visual-only elements
  ".breadcrumb",
  ".breadcrumbs",
  ".pagination",
  ".skip-link",
  '[aria-hidden="true"]',
  // JS framework artifacts
  "[data-reactroot] > noscript",
  ".hydration-overlay"
];
205
/**
 * Strip non-content markup from an HTML string and return the cleaned body HTML.
 *
 * Only the elements matched by `preserveSelectors` are themselves protected
 * from the selector-based passes; a preserved element nested inside a removed
 * ancestor goes away with that ancestor. The later density / empty-element /
 * attribute passes receive only the document and do not consult the preserve set.
 *
 * @param {string} html - HTML to clean.
 * @param {{stripSelectors?: string[], preserveSelectors?: string[]}} [options]
 * @returns {string} `innerHTML` of the cleaned body, or "" when there is none.
 */
function sanitizeHtml(html, options = {}) {
  const { stripSelectors = [], preserveSelectors = [] } = options;
  const dom = new JSDOM(html);
  try {
    const document = dom.window.document;

    // Best-effort: a selector the engine rejects is skipped rather than
    // failing the whole sanitisation.
    const forEachMatch = (selector, visit) => {
      try {
        document.querySelectorAll(selector).forEach(visit);
      } catch {
        // ignore selectors that cannot be evaluated
      }
    };

    const preserveSet = /* @__PURE__ */ new Set();
    for (const selector of preserveSelectors) {
      forEachMatch(selector, (el) => preserveSet.add(el));
    }

    const removeUnlessPreserved = (el) => {
      if (!preserveSet.has(el)) el.remove();
    };
    // Same order as before: built-in tags, built-in selectors, caller selectors.
    const allStripSelectors = [...DEFAULT_STRIP_TAGS, ...DEFAULT_STRIP_SELECTORS, ...stripSelectors];
    for (const selector of allStripSelectors) {
      forEachMatch(selector, removeUnlessPreserved);
    }

    stripLowDensityElements(document);
    removeEmptyElements(document);
    cleanAttributes(document);
    return document.body?.innerHTML || "";
  } finally {
    // Always release the JSDOM window, even when a cleanup pass throws.
    dom.window.close();
  }
}
240
/**
 * Remove `div`, `section` and `span` elements that contain more than 10
 * descendant elements but fewer than 50 characters of trimmed text.
 *
 * @param {Document} document - Document to mutate in place.
 */
function stripLowDensityElements(document) {
  document.querySelectorAll("div, section, span").forEach((node) => {
    const descendantCount = node.querySelectorAll("*").length;
    if (descendantCount <= 10) return;
    const visibleText = (node.textContent || "").trim();
    if (visibleText.length < 50) node.remove();
  });
}
250
/**
 * Remove `div`, `span`, `p`, `section` and `article` elements that have no
 * non-whitespace text and contain no `img`, `table`, `pre` or `code` descendant.
 *
 * @param {Document} document - Document to mutate in place.
 */
function removeEmptyElements(document) {
  document.querySelectorAll("div, span, p, section, article").forEach((node) => {
    const text = node.textContent || "";
    if (text.trim()) return;
    if (node.querySelector("img, table, pre, code")) return;
    node.remove();
  });
}
258
/**
 * Drop every attribute that is not on the allow-list. The `class` attribute
 * is additionally kept on `CODE` elements.
 *
 * @param {Document} document - Document to mutate in place.
 */
function cleanAttributes(document) {
  const allowed = /* @__PURE__ */ new Set([
    "href",
    "src",
    "alt",
    "title",
    "colspan",
    "rowspan",
    "scope",
    "headers",
    "lang",
    "dir",
    "type"
  ]);
  for (const node of document.querySelectorAll("*")) {
    const isCode = node.tagName === "CODE";
    // Snapshot the attribute list because it changes while attributes are removed.
    for (const { name } of Array.from(node.attributes)) {
      const keep = allowed.has(name) || (name === "class" && isCode);
      if (!keep) node.removeAttribute(name);
    }
  }
}
271
+
272
+ // src/markdown.ts
273
+ import TurndownService from "turndown";
274
+ import { gfm } from "turndown-plugin-gfm";
275
/**
 * Convert an HTML fragment to GitHub-flavoured Markdown.
 *
 * @param {string} html - HTML to convert.
 * @param {{url?: string, customRules?: Array<{name: string, filter: *, replacement: Function}>}} [options]
 *   `url` is the base used to absolutise relative links and image sources;
 *   `customRules` are registered after the built-in rules.
 * @returns {string} Markdown with runs of 3+ newlines collapsed and ends trimmed.
 */
function htmlToMarkdown(html, options = {}) {
  const { url, customRules = [] } = options;
  const turndown = new TurndownService({
    headingStyle: "atx",
    codeBlockStyle: "fenced",
    bulletListMarker: "-",
    emDelimiter: "*",
    strongDelimiter: "**",
    linkStyle: "inlined",
    hr: "---"
  });
  turndown.use(gfm);

  // Resolve a reference against the page URL, keeping the raw value when it
  // cannot be parsed.
  const resolveAgainstBase = (ref) => {
    try {
      return new URL(ref, url).href;
    } catch {
      return ref;
    }
  };

  turndown.addRule("fencedCodeBlock", {
    filter: (node) => {
      return node.nodeName === "PRE" && node.firstChild !== null && node.firstChild.nodeName === "CODE";
    },
    replacement: (_content, node) => {
      const codeEl = node.firstChild;
      const className = codeEl?.getAttribute?.("class") || "";
      const langMatch = className.match(
        /(?:language-|lang-|hljs\s+|highlight-)([a-zA-Z0-9_+-]+)/
      );
      const lang = langMatch ? langMatch[1] : "";
      const code = (codeEl?.textContent || "").replace(/\n+$/, "");
      // The fence must be longer than any backtick run that starts a line of
      // the code, otherwise that line would close the block early.
      let fenceSize = 3;
      for (const run of code.match(/^`{3,}/gm) ?? []) {
        if (run.length >= fenceSize) fenceSize = run.length + 1;
      }
      const fence = "`".repeat(fenceSize);
      return `\n\n${fence}${lang}\n${code}\n${fence}\n\n`;
    }
  });

  turndown.addRule("meaningfulImages", {
    filter: (node) => node.nodeName === "IMG",
    replacement: (_content, node) => {
      const alt = node.getAttribute("alt")?.trim();
      const src = node.getAttribute("src")?.trim();
      // Images without alt text are dropped from the output.
      if (!alt) return "";
      let resolvedSrc = src || "";
      if (url && src && !src.startsWith("http") && !src.startsWith("data:")) {
        resolvedSrc = resolveAgainstBase(src);
      }
      return `![${alt}](${resolvedSrc})`;
    }
  });

  turndown.addRule("resolveLinks", {
    filter: "a",
    replacement: (content, node) => {
      const href = node.getAttribute("href");
      if (!href || !content.trim()) return content;
      if (href.startsWith("#")) return content;
      // Compare the scheme case-insensitively and without embedded
      // whitespace/control characters, so "JavaScript:" or " javascript:" are
      // rejected like the plain lowercase form.
      const normalizedHref = href.replace(/[\u0000-\u0020]+/g, "").toLowerCase();
      if (normalizedHref.startsWith("javascript:") || normalizedHref.startsWith("data:text/html")) {
        return "";
      }
      let resolvedHref = href;
      if (url && !href.startsWith("http") && !href.startsWith("mailto:")) {
        resolvedHref = resolveAgainstBase(href);
      }
      const title = node.getAttribute("title");
      if (!title) return `[${content}](${resolvedHref})`;
      // Escape quotes so the title cannot terminate the quoted link title.
      const safeTitle = title.replace(/"/g, '\\"');
      return `[${content}](${resolvedHref} "${safeTitle}")`;
    }
  });

  for (const rule of customRules) {
    turndown.addRule(rule.name, {
      filter: rule.filter,
      replacement: rule.replacement
    });
  }

  const markdown = turndown.turndown(html);
  return markdown.replace(/\n{3,}/g, "\n\n").trim();
}
356
+
357
+ // src/json-ld.ts
358
/**
 * Collect JSON-LD objects from every `<script type="application/ld+json">`
 * element in the document.
 *
 * A top-level array is flattened, an object carrying an `@graph` array
 * contributes the graph's members, and any other object is added as-is.
 * Scripts whose content is not valid JSON, or is not an object/array, are skipped.
 *
 * @param {Document} document
 * @returns {Array} JSON-LD blocks in document order.
 */
function extractJsonLdBlocks(document) {
  const results = [];
  const collect = (value) => {
    if (Array.isArray(value)) {
      value.forEach(collect);
      return;
    }
    // Primitives and null carry no JSON-LD data.
    if (value === null || typeof value !== "object") return;
    if (Array.isArray(value["@graph"])) {
      results.push(...value["@graph"]);
    } else {
      results.push(value);
    }
  };
  const scripts = document.querySelectorAll('script[type="application/ld+json"]');
  for (const script of scripts) {
    try {
      collect(JSON.parse(script.textContent || ""));
    } catch {
      // Malformed JSON in one script must not discard the others.
    }
  }
  return results;
}
374
+
375
+ // src/tokens.ts
376
/**
 * Rough token estimate: one token per four characters, rounded up.
 *
 * @param {string} text - Text to measure; `null`/`undefined` count as empty.
 * @returns {number} Estimated token count.
 */
function estimateTokens(text) {
  if (text == null) return 0;
  return Math.ceil(String(text).length / 4);
}
379
+
380
+ // src/frontmatter.ts
381
/**
 * Build a YAML frontmatter block from page metadata.
 *
 * Fields are emitted in a fixed order and only when present. `schema`,
 * `author`, `datePublished` and `dateModified` come from the first JSON-LD
 * block. `author` may be an object with `name`, a plain string, or an array of
 * either (the first element is used).
 *
 * @param {{title?: string, description?: string, url?: string, lang?: string,
 *   lastModified?: string, jsonLd?: Array}} input
 * @returns {string} Frontmatter delimited by `---` lines, without a trailing newline.
 */
function buildFrontmatter(input) {
  const lines = ["---"];
  // Only strings and numbers are serialisable scalars here; anything else is
  // skipped instead of throwing inside escapeYaml.
  const addField = (key, value) => {
    const text = typeof value === "number" ? String(value) : value;
    if (typeof text === "string" && text) {
      lines.push(`${key}: "${escapeYaml(text)}"`);
    }
  };

  addField("title", input.title);
  addField("description", input.description);
  addField("url", input.url);
  addField("lang", input.lang);
  addField("lastModified", input.lastModified);

  if (input.jsonLd?.length) {
    const primary = input.jsonLd[0];

    const primaryType = primary?.["@type"];
    const typeStr = Array.isArray(primaryType) ? primaryType[0] : primaryType;
    if (typeof typeStr === "string") addField("schema", typeStr);

    const author = primary?.author;
    const firstAuthor = Array.isArray(author) ? author[0] : author;
    addField("author", typeof firstAuthor === "string" ? firstAuthor : firstAuthor?.name);

    addField("datePublished", primary?.datePublished);
    addField("dateModified", primary?.dateModified);
  }

  lines.push("---");
  return lines.join("\n");
}
411
/**
 * Escape a value for use inside a YAML double-quoted scalar.
 *
 * Backslashes and double quotes are escaped; line breaks (`\r\n`, `\r`, `\n`)
 * become a single space so the value stays on one line.
 *
 * @param {*} str - Value to escape; non-strings are stringified, null/undefined become "".
 * @returns {string}
 */
function escapeYaml(str) {
  return String(str ?? "")
    .replace(/\\/g, "\\\\")
    .replace(/"/g, '\\"')
    .replace(/\r\n|\r|\n/g, " ");
}
414
+
415
+ // src/transform.ts
416
/**
 * Convert a full HTML page into Markdown plus extracted metadata.
 *
 * Steps: parse, read metadata and JSON-LD, pick the main content (Readability
 * when the page looks readerable, otherwise the whole body), sanitise, convert
 * to Markdown, optionally prepend frontmatter and truncate to a token budget.
 *
 * @param {string} html - Source HTML.
 * @param {object} [options]
 * @param {string} [options.url] - Page URL, used as the parse base and for link resolution.
 * @param {number} [options.tokenBudget] - Truncate when the estimate exceeds this.
 * @param {boolean} [options.extractJsonLd=true]
 * @param {string[]} [options.stripSelectors]
 * @param {string[]} [options.preserveSelectors]
 * @param {boolean} [options.frontmatter=true]
 * @param {Array} [options.turndownRules]
 * @returns {Promise<{markdown: string, tokenEstimate: number, title: string,
 *   description: string, jsonLd: Array, canonicalUrl: string|null,
 *   lastModified: string|null, lang: string|null}>}
 */
async function transform(html, options = {}) {
  const {
    url,
    tokenBudget,
    extractJsonLd = true,
    stripSelectors = [],
    preserveSelectors = [],
    frontmatter = true,
    turndownRules = []
  } = options;
  const dom = new JSDOM2(html, { url: url || "https://localhost" });
  try {
    const document = dom.window.document;
    const metaContent = (selector) => document.querySelector(selector)?.getAttribute("content");

    const title =
      document.querySelector("title")?.textContent?.trim() ||
      document.querySelector("h1")?.textContent?.trim() ||
      "";
    const description = metaContent('meta[name="description"]')?.trim() || "";
    const canonicalUrl = document.querySelector('link[rel="canonical"]')?.getAttribute("href") || null;
    const lang = document.documentElement.getAttribute("lang") || null;
    const lastModified =
      metaContent('meta[property="article:modified_time"]') ||
      metaContent('meta[name="last-modified"]') ||
      null;
    // Metadata and JSON-LD are read before Readability.parse(), which runs on
    // this same document.
    const jsonLd = extractJsonLd ? extractJsonLdBlocks(document) : [];

    let contentHtml;
    if (isProbablyReaderable(document)) {
      const article = new Readability(document, { charThreshold: 100 }).parse();
      contentHtml = article?.content || document.body?.innerHTML || html;
    } else {
      contentHtml = document.body?.innerHTML || html;
    }

    const cleanHtml = sanitizeHtml(contentHtml, { stripSelectors, preserveSelectors });
    let markdown = htmlToMarkdown(cleanHtml, { url, customRules: turndownRules });
    if (frontmatter) {
      const fm = buildFrontmatter({ title, description, url, lang, lastModified, jsonLd });
      markdown = fm + "\n\n" + markdown;
    }

    let tokenEstimate = estimateTokens(markdown);
    if (tokenBudget && tokenEstimate > tokenBudget) {
      markdown = truncateToTokenBudget(markdown, tokenBudget);
      tokenEstimate = estimateTokens(markdown);
    }

    return {
      markdown,
      tokenEstimate,
      title,
      description,
      jsonLd,
      canonicalUrl,
      lastModified,
      lang
    };
  } finally {
    // Release the JSDOM window even when extraction or conversion throws.
    dom.window.close();
  }
}
468
/**
 * Cut Markdown at a line boundary so the kept lines fit the token budget.
 *
 * Whenever anything is dropped, a truncation notice is appended. If the cut
 * falls inside a fenced code block, the fence is closed first so the notice
 * is not swallowed by the code block. The closing fence and the notice are
 * added on top of the budgeted lines.
 *
 * @param {string} markdown
 * @param {number} budget - Maximum estimated tokens for the kept lines.
 * @returns {string} The original text when it fits, otherwise the truncated text.
 */
function truncateToTokenBudget(markdown, budget) {
  const kept = [];
  let usedTokens = 0;
  let truncated = false;
  let openFence = null; // fence marker of the code block currently open, if any

  for (const line of markdown.split("\n")) {
    const lineTokens = estimateTokens(line);
    if (usedTokens + lineTokens > budget) {
      truncated = true;
      break;
    }
    kept.push(line);
    usedTokens += lineTokens;

    const fence = line.match(/^\s{0,3}(`{3,}|~{3,})/);
    if (fence) {
      if (openFence === null) {
        openFence = fence[1];
      } else if (fence[1][0] === openFence[0] && fence[1].length >= openFence.length) {
        openFence = null;
      }
    }
  }

  if (!truncated) return kept.join("\n");
  if (openFence !== null) kept.push(openFence);
  kept.push("\n*[Content truncated for token budget]*\n");
  return kept.join("\n");
}
486
+
487
+ // src/llms-txt.ts
488
/**
 * Generate the `llms.txt` index and the `llms-full.txt` concatenated document.
 *
 * @param {{siteName: string, siteDescription?: string, baseUrl: string,
 *   markdownExtension?: string}} options
 * @param {Array<{path: string, title: string, description?: string, section?: string}>} routes
 * @param {Map<string, string>} [fullTextContents] - Markdown keyed by route path.
 * @returns {{llmsTxt: string, llmsFullTxt: string, routeCount: number}}
 */
function generateLlmsTxt(options, routes, fullTextContents) {
  const { siteName, siteDescription, baseUrl, markdownExtension = ".md" } = options;
  // Route paths start with "/", so a trailing slash on the base would double up.
  const origin = String(baseUrl ?? "").replace(/\/+$/, "");
  // Directory-style paths ("/", "/blog/") map to an index document, matching
  // buildAlternateLinkHeader; otherwise "/" would become "/.md".
  const markdownUrl = (path) =>
    path.endsWith("/")
      ? `${origin}${path}index${markdownExtension}`
      : `${origin}${path}${markdownExtension}`;

  const header = [`# ${siteName}`, ""];
  // The description blockquote is omitted when there is nothing to show.
  if (siteDescription) header.push(`> ${siteDescription}`, "");

  // Group routes by section, preserving first-seen section order.
  const sections = /* @__PURE__ */ new Map();
  for (const route of routes) {
    const section = route.section || "Pages";
    if (!sections.has(section)) sections.set(section, []);
    sections.get(section).push(route);
  }

  const lines = [...header];
  for (const [section, sectionRoutes] of sections) {
    lines.push(`## ${section}`, "");
    for (const route of sectionRoutes) {
      const desc = route.description ? `: ${route.description}` : "";
      lines.push(`- [${route.title}](${markdownUrl(route.path)})${desc}`);
    }
    lines.push("");
  }
  const llmsTxt = lines.join("\n").trim() + "\n";

  const fullLines = [...header];
  if (fullTextContents) {
    for (const route of routes) {
      const content = fullTextContents.get(route.path);
      if (!content) continue;
      fullLines.push("---", "", `## ${route.title}`, `Source: ${origin}${route.path}`, "", content, "");
    }
  }
  const llmsFullTxt = fullLines.join("\n").trim() + "\n";

  return {
    llmsTxt,
    llmsFullTxt,
    routeCount: routes.length
  };
}
538
+
539
+ // src/discover.ts
540
import { readFileSync, existsSync, readdirSync, lstatSync, realpathSync } from "fs";
import { join, relative, sep } from "path";
542
// File names that mark a directory as a page.
var PAGE_FILE_PATTERNS = ["tsx", "ts", "jsx", "js", "mdx", "md"].map((ext) => `page.${ext}`);

// Directory names that are never descended into while scanning for pages.
var SKIP_DIRS = /* @__PURE__ */ new Set(
  [
    // tooling / build output
    ["node_modules", ".next", ".git", "dist", ".turbo"],
    // private (underscore-prefixed) helper folders
    ["_components", "_lib", "_utils", "_hooks", "_actions"],
    // conventional helper folders
    ["components", "lib", "utils", "hooks", "actions"],
    // API routes are not pages
    ["api"]
  ].flat()
);
569
/**
 * Discover page routes under a Next.js `app` directory.
 *
 * @param {string} appDir - Directory to scan.
 * @param {{exclude?: string[], sectionStrategy?: string|Function, defaultSection?: string}} [options]
 * @returns {Array<{path: string, title: string, description?: string, section: string}>}
 *   Routes sorted by path with "/" first; empty when the directory is missing
 *   or its real path cannot be resolved.
 */
function discoverNextRoutes(appDir, options = {}) {
  const {
    exclude = ["/api"],
    sectionStrategy = "directory",
    defaultSection = "Pages"
  } = options;
  const found = [];
  if (!existsSync(appDir)) return found;

  let resolvedRoot;
  try {
    resolvedRoot = realpathSync(appDir);
  } catch {
    return found;
  }

  scanAppDir(appDir, appDir, resolvedRoot, found, exclude, sectionStrategy, defaultSection);

  const homeFirst = (left, right) => {
    if (left.path === "/") return -1;
    if (right.path === "/") return 1;
    return left.path.localeCompare(right.path);
  };
  return found.sort(homeFirst);
}
593
/**
 * Recursively walk `currentDir` and append a route for every page file found.
 *
 * Symlinks are skipped, as are directories that resolve outside `rootReal`,
 * start with "_", "." or "@", or are listed in SKIP_DIRS. Route groups
 * ("(name)" segments) are removed from the URL and catch-all segments are
 * not emitted.
 *
 * @param {string} rootDir - Scan root as given by the caller.
 * @param {string} currentDir - Directory being scanned.
 * @param {string} rootReal - Real path of the scan root.
 * @param {Array} routes - Output array, appended to in place.
 * @param {string[]} exclude - Exclusion patterns for `shouldExclude`.
 * @param {string|Function} sectionStrategy
 * @param {string} defaultSection
 */
function scanAppDir(rootDir, currentDir, rootReal, routes, exclude, sectionStrategy, defaultSection) {
  const entries = readdirSync(currentDir);
  for (const entry of entries) {
    const fullPath = join(currentDir, entry);
    let stat;
    try {
      stat = lstatSync(fullPath);
    } catch {
      continue;
    }
    if (stat.isSymbolicLink()) continue;

    if (stat.isDirectory()) {
      let realDir;
      try {
        realDir = realpathSync(fullPath);
      } catch {
        continue;
      }
      if (!isWithinRoot(realDir, rootReal)) continue;
      const baseName = entry.toLowerCase();
      // "@slot" folders are Next.js parallel-route slots, which are not URL
      // segments, so descending into them would emit bogus "/@slot/..." paths.
      if (
        baseName.startsWith("_") ||
        baseName.startsWith(".") ||
        baseName.startsWith("@") ||
        SKIP_DIRS.has(baseName)
      ) {
        continue;
      }
      scanAppDir(rootDir, fullPath, rootReal, routes, exclude, sectionStrategy, defaultSection);
      continue;
    }

    if (!PAGE_FILE_PATTERNS.includes(entry)) continue;

    // join() normalises paths ("./app" + "blog" -> "app/blog"), so slicing
    // currentDir by rootDir.length can cut at the wrong offset. relative()
    // yields the correct sub-path however the root was spelled.
    let urlPath = "/" + relative(rootDir, currentDir).replace(/\\/g, "/");
    urlPath = urlPath.replace(/\/\([^)]+\)/g, "");
    if (urlPath.includes("[...") || urlPath.includes("[[...")) continue;
    if (urlPath === "") urlPath = "/";
    if (!urlPath.startsWith("/")) urlPath = "/" + urlPath;
    if (shouldExclude(urlPath, exclude)) continue;
    if (urlPath === "/llms.txt" || urlPath === "/llms-full.txt") continue;

    const { title, description } = extractMetadataFromFile(fullPath);
    const section = deriveSection(urlPath, sectionStrategy, defaultSection);
    routes.push({
      path: urlPath,
      title: title || pathToTitle(urlPath),
      description: description || void 0,
      section
    });
  }
}
645
/**
 * Read a source file and extract its exported metadata title/description.
 *
 * @param {string} filePath
 * @returns {{title: string, description: string}} Empty strings when the file
 *   cannot be read or extraction throws.
 */
function extractMetadataFromFile(filePath) {
  const empty = { title: "", description: "" };
  try {
    return extractMetadataFromSource(readFileSync(filePath, "utf-8"));
  } catch {
    return empty;
  }
}
653
/**
 * Extract `title` and `description` string literals from an
 * `export const metadata = { ... }` object in page source code.
 *
 * Only plain string / template literals are recognised; computed values and
 * object-valued titles yield "".
 * NOTE(review): the first `title:` / `description:` key anywhere in the object
 * wins, so a nested key (e.g. inside `openGraph`) that appears before the
 * top-level one is picked up instead.
 *
 * @param {string} source - File contents.
 * @returns {{title: string, description: string}}
 */
function extractMetadataFromSource(source) {
  const empty = { title: "", description: "" };
  const metadataMatch = source.match(
    /export\s+const\s+metadata[\s:]*(?:Metadata\s*)?=\s*\{/
  );
  if (!metadataMatch) return empty;

  // Index of the opening "{" that ends the match.
  const startIdx = metadataMatch.index + metadataMatch[0].length - 1;
  const objectStr = extractBalancedBraces(source, startIdx);
  if (!objectStr) return empty;

  // \b keeps "subtitle:" / "ogDescription:" from matching as the key, and the
  // optional quote accepts quoted property names.
  const readStringProperty = (key) => {
    const match = objectStr.match(
      new RegExp(`\\b${key}["']?\\s*:\\s*(?:'([^']*)'|"([^"]*)"|\`([^\`]*)\`)`)
    );
    if (!match) return "";
    return sanitizeMetadataValue(match[1] || match[2] || match[3] || "");
  };

  return {
    title: readStringProperty("title"),
    description: readStringProperty("description")
  };
}
681
/**
 * Return the substring from the `{` at `start` through its matching `}`.
 *
 * Braces inside string/template literals and inside `//` or block comments
 * are ignored, so a value such as `"a } b"` does not end the object early.
 * Template literals are treated as opaque up to the closing backtick.
 *
 * @param {string} source
 * @param {number} start - Index that must point at a `{`.
 * @returns {string|null} The balanced slice, or null when `start` is not a
 *   `{` or no matching `}` exists.
 */
function extractBalancedBraces(source, start) {
  if (source[start] !== "{") return null;
  let depth = 0;
  let quote = null; // delimiter of the string literal currently open, if any
  for (let i = start; i < source.length; i++) {
    const ch = source[i];

    if (quote !== null) {
      if (ch === "\\") i++; // skip the escaped character
      else if (ch === quote) quote = null;
      continue;
    }
    if (ch === '"' || ch === "'" || ch === "`") {
      quote = ch;
      continue;
    }
    if (ch === "/" && source[i + 1] === "/") {
      const eol = source.indexOf("\n", i);
      if (eol === -1) return null;
      i = eol;
      continue;
    }
    if (ch === "/" && source[i + 1] === "*") {
      const end = source.indexOf("*/", i + 2);
      if (end === -1) return null;
      i = end + 1;
      continue;
    }

    if (ch === "{") {
      depth++;
    } else if (ch === "}") {
      depth--;
      if (depth === 0) return source.substring(start, i + 1);
    }
  }
  return null;
}
693
/**
 * Derive the section label for a route.
 *
 * A function strategy is called with the path and its result returned.
 * Otherwise the first path segment is title-cased (hyphens become spaces);
 * the root path and dynamic first segments fall back to `defaultSection`.
 *
 * @param {string} urlPath
 * @param {string|Function} strategy
 * @param {string} defaultSection
 * @returns {string}
 */
function deriveSection(urlPath, strategy, defaultSection) {
  if (typeof strategy === "function") return strategy(urlPath);

  const [head] = urlPath.split("/").filter(Boolean);
  if (head === undefined || head.startsWith("[")) return defaultSection;

  const words = [];
  for (const word of head.split("-")) {
    words.push(word.charAt(0).toUpperCase() + word.slice(1));
  }
  return words.join(" ");
}
703
/**
 * Produce a fallback title from a URL path.
 *
 * "/" becomes "Home"; a dynamic last segment loses its surrounding brackets;
 * any other last segment is title-cased with hyphens turned into spaces.
 *
 * @param {string} urlPath
 * @returns {string}
 */
function pathToTitle(urlPath) {
  if (urlPath === "/") return "Home";

  const segments = urlPath.split("/").filter(Boolean);
  const tail = segments.pop() || "";
  if (tail.startsWith("[")) {
    return tail.replace(/^\[|\]$/g, "");
  }

  const capitalize = (word) => word.charAt(0).toUpperCase() + word.slice(1);
  return tail.split("-").map(capitalize).join(" ");
}
711
/**
 * Test a URL path against exclusion patterns.
 *
 * - Patterns ending in "/*" or "/**" match their prefix and everything below it.
 * - Patterns starting with "_" match a path segment beginning with that text
 *   at the root, or an exact segment deeper in the path.
 * - Any other pattern matches the exact path and everything below it.
 *
 * @param {string} urlPath
 * @param {string[]} patterns
 * @returns {boolean} True when any pattern matches.
 */
function shouldExclude(urlPath, patterns) {
  const isAtOrBelow = (prefix) => urlPath === prefix || urlPath.startsWith(prefix + "/");

  const matches = (pattern) => {
    if (pattern.endsWith("/**") || pattern.endsWith("/*")) {
      return isAtOrBelow(pattern.replace(/\/\*\*?$/, ""));
    }
    if (pattern.startsWith("_")) {
      return urlPath.startsWith("/" + pattern) || urlPath.includes("/" + pattern + "/");
    }
    return isAtOrBelow(pattern);
  };

  return [...patterns].some(matches);
}
725
/**
 * Discover routes from `.html` files under a directory of static output.
 *
 * `index.html` maps to its directory path (with a trailing slash) and other
 * files drop the `.html` suffix. The title comes from each file's `<title>`
 * element, falling back to a title derived from the path.
 *
 * @param {string} dir - Directory to scan.
 * @param {{exclude?: string[], sectionStrategy?: string|Function, defaultSection?: string}} [options]
 * @returns {Array<{path: string, title: string, section: string}>}
 *   Routes sorted by path with "/" first; empty when the directory is missing
 *   or its real path cannot be resolved.
 */
function discoverFilesystemRoutes(dir, options = {}) {
  const {
    exclude = [],
    sectionStrategy = "directory",
    defaultSection = "Pages"
  } = options;
  if (!existsSync(dir)) {
    return [];
  }
  let rootReal;
  try {
    rootReal = realpathSync(dir);
  } catch {
    return [];
  }

  const routes = [];
  for (const filePath of findHtmlFiles(dir, rootReal)) {
    // File paths come from join(), which normalises them ("./out" -> "out/..."),
    // so slicing by dir.length can cut at the wrong offset; relative() does not
    // depend on how `dir` was spelled.
    let urlPath = ("/" + relative(dir, filePath).replace(/\\/g, "/"))
      .replace(/\/index\.html$/, "/")
      .replace(/\.html$/, "");
    if (urlPath === "") urlPath = "/";
    if (!urlPath.startsWith("/")) urlPath = "/" + urlPath;
    if (shouldExclude(urlPath, exclude)) continue;

    const section = deriveSection(urlPath, sectionStrategy, defaultSection);
    let title = "";
    try {
      const html = readFileSync(filePath, "utf-8");
      const titleMatch = html.match(/<title[^>]*>([^<]+)<\/title>/i);
      title = sanitizeMetadataValue(titleMatch?.[1]?.trim() || "");
    } catch {
      // Unreadable file: fall back to the path-derived title below.
    }
    routes.push({
      path: urlPath,
      title: title || pathToTitle(urlPath),
      section
    });
  }

  routes.sort((a, b) => {
    if (a.path === "/") return -1;
    if (b.path === "/") return 1;
    return a.path.localeCompare(b.path);
  });
  return routes;
}
769
/**
 * Recursively list `.html` files under `dir`.
 *
 * Symlinks are skipped, as are a fixed set of tooling directories and any
 * directory whose real path falls outside `rootReal`.
 *
 * @param {string} dir - Directory to walk.
 * @param {string} rootReal - Real path of the scan root.
 * @returns {string[]} Paths of the files found; empty when `dir` does not exist.
 */
function findHtmlFiles(dir, rootReal) {
  const ignoredDirs = ["node_modules", ".next", ".git", "dist", ".turbo"];
  if (!existsSync(dir)) return [];

  const found = [];
  for (const name of readdirSync(dir)) {
    const candidate = join(dir, name);
    let info;
    try {
      info = lstatSync(candidate);
    } catch {
      continue;
    }
    if (info.isSymbolicLink()) continue;

    if (!info.isDirectory()) {
      if (name.endsWith(".html")) found.push(candidate);
      continue;
    }

    if (ignoredDirs.includes(name)) continue;
    let resolved;
    try {
      resolved = realpathSync(candidate);
    } catch {
      continue;
    }
    if (isWithinRoot(resolved, rootReal)) {
      found.push(...findHtmlFiles(candidate, rootReal));
    }
  }
  return found;
}
799
/**
 * Check that a resolved path is the root itself or lies beneath it.
 * The root is compared with a trailing separator so that a sibling such as
 * "/srv/app-evil" does not count as inside "/srv/app".
 *
 * @param {string} realPath - Resolved path to test.
 * @param {string} rootReal - Resolved root path.
 * @returns {boolean}
 */
function isWithinRoot(realPath, rootReal) {
  if (realPath === rootReal) return true;
  const rootWithSep = rootReal.endsWith(sep) ? rootReal : `${rootReal}${sep}`;
  return realPath.startsWith(rootWithSep);
}
804
/**
 * Normalise a metadata string: control characters become spaces, angle
 * brackets are removed, whitespace runs collapse to one space, the result is
 * trimmed and capped at 200 characters.
 *
 * @param {string} value
 * @returns {string} The cleaned value, or "" for a falsy input.
 */
function sanitizeMetadataValue(value) {
  if (!value) return "";
  const maxLength = 200;
  let cleaned = value.replace(/[\u0000-\u001F\u007F]/g, " ");
  cleaned = cleaned.replace(/[<>]/g, "");
  cleaned = cleaned.replace(/\s+/g, " ").trim();
  return cleaned.slice(0, maxLength);
}
809
+
810
+ // src/headers.ts
811
/**
 * Build the response headers for a Markdown representation.
 *
 * A `Content-Signal` part is emitted as "=yes" unless its flag is explicitly
 * `false`; when every flag is `false` the header is omitted entirely.
 *
 * @param {{tokenEstimate: number}} result - Transform result; only `tokenEstimate` is read.
 * @param {{contentSignal?: {aiTrain?: boolean, search?: boolean, aiInput?: boolean}}} [options]
 *   May be omitted, in which case all signals default to "yes".
 * @param {string} [originalPath] - Currently unused.
 * @returns {Object<string, string>} Header name to value.
 */
function buildMarkdownHeaders(result, options = {}, originalPath) {
  const headers = {
    "Content-Type": "text/markdown; charset=utf-8",
    "Content-Disposition": "inline",
    "Vary": "Accept, User-Agent",
    "X-Markdown-Tokens": String(result.tokenEstimate)
  };
  // `options` may also be passed as null; treat that like "no options".
  const signal = options?.contentSignal ?? { aiTrain: true, search: true, aiInput: true };
  const signalParts = [];
  if (signal.aiTrain !== false) signalParts.push("ai-train=yes");
  if (signal.search !== false) signalParts.push("search=yes");
  if (signal.aiInput !== false) signalParts.push("ai-input=yes");
  if (signalParts.length > 0) {
    headers["Content-Signal"] = signalParts.join(", ");
  }
  headers["X-Robots-Tag"] = "all";
  return headers;
}
829
/**
 * Build a `Link` header value pointing at the Markdown alternate of a path.
 * Paths ending in "/" map to an `index` document in that directory.
 *
 * @param {string} path - Request path.
 * @param {string} [ext=".md"] - Extension of the Markdown alternate.
 * @returns {string}
 */
function buildAlternateLinkHeader(path, ext = ".md") {
  const target = path.endsWith("/") ? path + "index" + ext : path + ext;
  return "<" + target + '>; rel="alternate"; type="text/markdown"';
}
833
+
834
+ // src/cache.ts
835
+ import { LRUCache } from "lru-cache";
836
/**
 * Create a small cache facade backed by an `LRUCache` instance.
 *
 * @param {{maxEntries?: number, ttl?: number}} [options]
 *   `maxEntries` (default 100) is passed as `max`; `ttl` (default 3e5) is
 *   passed through unchanged.
 * @returns {{get: Function, set: Function, has: Function, clear: Function, size: Function}}
 */
function createCache(options = {}) {
  const { maxEntries = 100, ttl = 3e5 } = options;
  const store = new LRUCache({ max: maxEntries, ttl });
  return {
    get(key) {
      return store.get(key);
    },
    set(key, value) {
      return store.set(key, value);
    },
    has(key) {
      return store.has(key);
    },
    clear() {
      return store.clear();
    },
    size() {
      return store.size;
    }
  };
}
850
+ export {
851
+ AI_BOT_REGISTRY,
852
+ buildAlternateLinkHeader,
853
+ buildMarkdownHeaders,
854
+ createCache,
855
+ detectAgent,
856
+ discoverFilesystemRoutes,
857
+ discoverNextRoutes,
858
+ estimateTokens,
859
+ extractJsonLdBlocks,
860
+ extractMetadataFromSource,
861
+ generateLlmsTxt,
862
+ htmlToMarkdown,
863
+ sanitizeHtml,
864
+ shouldServeMarkdown,
865
+ transform
866
+ };
867
+ //# sourceMappingURL=index.js.map