@agent-seo/core 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs ADDED
@@ -0,0 +1,918 @@
1
+ "use strict";
2
// Bundler-generated CommonJS/ESM interop helpers. The first six lines are
// short aliases for the Object reflection functions used by the helpers below.
+ var __create = Object.create;
3
+ var __defProp = Object.defineProperty;
4
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
5
+ var __getOwnPropNames = Object.getOwnPropertyNames;
6
+ var __getProtoOf = Object.getPrototypeOf;
7
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
8
// __export: defines every key of `all` on `target` as an enumerable property
// whose getter is the function stored under that key.
+ var __export = (target, all) => {
9
+ for (var name in all)
10
+ __defProp(target, name, { get: all[name], enumerable: true });
11
+ };
12
// __copyProps: for each own property name of `from` (object or function) that
// `to` does not already own and that is not `except`, defines a getter on `to`
// that reads through to `from`. `desc` is only used as a scratch variable; the
// copied property is enumerable unless the source descriptor says otherwise.
+ var __copyProps = (to, from, except, desc) => {
13
+ if (from && typeof from === "object" || typeof from === "function") {
14
+ for (let key of __getOwnPropNames(from))
15
+ if (!__hasOwnProp.call(to, key) && key !== except)
16
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
17
+ }
18
+ return to;
19
+ };
20
// __toESM: creates an object whose prototype is the prototype of `mod` (or a
// plain object when `mod` is null/undefined), optionally adds a `default`
// property pointing at `mod`, then copies the properties of `mod` onto it.
+ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
21
+ // If the importer is in node compatibility mode or this is not an ESM
22
+ // file that has been converted to a CommonJS file using a Babel-
23
+ // compatible transform (i.e. "__esModule" has not been set), then set
24
+ // "default" to the CommonJS "module.exports" for node compatibility.
25
+ isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
26
+ mod
27
+ ));
28
// __toCommonJS: copies the properties of `mod` onto a fresh object that
// carries a non-enumerable `__esModule: true` marker.
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
29
+
30
+ // src/index.ts
31
+ var index_exports = {};
32
+ __export(index_exports, {
33
+ AI_BOT_REGISTRY: () => AI_BOT_REGISTRY,
34
+ buildAlternateLinkHeader: () => buildAlternateLinkHeader,
35
+ buildMarkdownHeaders: () => buildMarkdownHeaders,
36
+ createCache: () => createCache,
37
+ detectAgent: () => detectAgent,
38
+ discoverFilesystemRoutes: () => discoverFilesystemRoutes,
39
+ discoverNextRoutes: () => discoverNextRoutes,
40
+ estimateTokens: () => estimateTokens,
41
+ extractJsonLdBlocks: () => extractJsonLdBlocks,
42
+ extractMetadataFromSource: () => extractMetadataFromSource,
43
+ generateLlmsTxt: () => generateLlmsTxt,
44
+ htmlToMarkdown: () => htmlToMarkdown,
45
+ sanitizeHtml: () => sanitizeHtml,
46
+ shouldServeMarkdown: () => shouldServeMarkdown,
47
+ transform: () => transform
48
+ });
49
+ module.exports = __toCommonJS(index_exports);
50
+
51
+ // src/detect.ts
52
/**
 * Known AI crawler / agent user-agents.
 *
 * Each entry pairs a case-insensitive `pattern` with an `info` record
 * (`name`, `operator`, `purpose`, `rendersJs`). The table is written as
 * compact tuples and expanded into `{ pattern, info }` objects below.
 */
var AI_BOT_REGISTRY = [
  // === OpenAI ===
  [/GPTBot/i, "GPTBot", "OpenAI", "training", false],
  [/OAI-SearchBot/i, "OAI-SearchBot", "OpenAI", "search", false],
  [/ChatGPT-User/i, "ChatGPT-User", "OpenAI", "agent-browsing", true],
  // === Anthropic ===
  [/ClaudeBot/i, "ClaudeBot", "Anthropic", "training", false],
  [/Claude-User/i, "Claude-User", "Anthropic", "agent-browsing", true],
  [/Claude-SearchBot/i, "Claude-SearchBot", "Anthropic", "search", false],
  [/anthropic-ai/i, "anthropic-ai", "Anthropic", "training", false],
  // === Perplexity ===
  [/PerplexityBot/i, "PerplexityBot", "Perplexity", "search", false],
  [/Perplexity-User/i, "Perplexity-User", "Perplexity", "agent-browsing", true],
  // === Google ===
  [/Google-Extended/i, "Google-Extended", "Google", "training", true],
  // === Apple ===
  [/Applebot-Extended/i, "Applebot-Extended", "Apple", "training", true],
  // === Meta ===
  [/meta-externalagent/i, "Meta-ExternalAgent", "Meta", "training", false],
  [/FacebookBot/i, "FacebookBot", "Meta", "search", false],
  // === Common Crawl ===
  [/CCBot/i, "CCBot", "Common Crawl", "training", false],
  // === Cohere ===
  [/cohere-ai/i, "cohere-ai", "Cohere", "training", false],
  // === Amazon ===
  [/Amazonbot/i, "Amazonbot", "Amazon", "search", false],
  // === Bytedance ===
  [/Bytespider/i, "Bytespider", "ByteDance", "training", false],
  // === You.com ===
  [/YouBot/i, "YouBot", "You.com", "search", false],
  // === DeepSeek ===
  [/Deepseek/i, "DeepSeekBot", "DeepSeek", "training", false]
].map(([pattern, name, operator, purpose, rendersJs]) => ({
  pattern,
  info: { name, operator, purpose, rendersJs }
}));
142
// Per-entry cache of the plain substring token derived from an entry's
// pattern (null when the pattern is not a simple literal). It is keyed by the
// entry object and filled lazily, so entries added to the exported
// AI_BOT_REGISTRY after module load are honoured as well.
var TOKEN_CACHE = /* @__PURE__ */ new WeakMap();

// Returns the cached lowercase token for a registry entry, computing it once.
function tokenForEntry(entry) {
  if (!TOKEN_CACHE.has(entry)) {
    TOKEN_CACHE.set(entry, regexToToken(entry.pattern));
  }
  return TOKEN_CACHE.get(entry);
}

// True when an Accept header lists text/markdown with a non-zero quality.
function acceptsMarkdown(acceptHeader) {
  const header = Array.isArray(acceptHeader) ? acceptHeader.join(",") : acceptHeader;
  if (typeof header !== "string" || header === "") return false;
  return header.split(",").some((range) => {
    const [mediaType, ...params] = range.split(";").map((part) => part.trim().toLowerCase());
    if (mediaType !== "text/markdown") return false;
    const quality = params.find((param) => param.startsWith("q="));
    // "q=0" explicitly marks the type as not acceptable.
    return quality === undefined || Number.parseFloat(quality.slice(2)) !== 0;
  });
}

/**
 * Classifies a request by its User-Agent and Accept headers.
 *
 * @param {string} [userAgent] - Raw User-Agent header value.
 * @param {string} [acceptHeader] - Raw Accept header value.
 * @returns {{ isAIBot: boolean, bot: object | null, wantsMarkdown: boolean }}
 *   `bot` is the `info` record of the first matching registry entry.
 */
function detectAgent(userAgent, acceptHeader) {
  const wantsMarkdown = acceptsMarkdown(acceptHeader);
  if (typeof userAgent !== "string" || userAgent === "") {
    return { isAIBot: false, bot: null, wantsMarkdown };
  }
  const ua = userAgent.toLowerCase();
  for (const entry of AI_BOT_REGISTRY) {
    const token = tokenForEntry(entry);
    if (token) {
      // Fast path: simple literal patterns are matched by substring.
      if (ua.includes(token)) {
        return { isAIBot: true, bot: entry.info, wantsMarkdown };
      }
      continue;
    }
    // Global/sticky patterns remember lastIndex between calls; start fresh.
    entry.pattern.lastIndex = 0;
    if (entry.pattern.test(userAgent)) {
      return { isAIBot: true, bot: entry.info, wantsMarkdown };
    }
  }
  return { isAIBot: false, bot: null, wantsMarkdown };
}
165
/**
 * Decides whether a request should get the markdown representation.
 *
 * @param {string} [userAgent] - User-Agent header value.
 * @param {string} [acceptHeader] - Accept header value.
 * @returns {boolean} True when detectAgent reports an AI bot or a markdown
 *   preference.
 */
function shouldServeMarkdown(userAgent, acceptHeader) {
  const { isAIBot, wantsMarkdown } = detectAgent(userAgent, acceptHeader);
  return isAIBot || wantsMarkdown;
}
169
/**
 * Derives a lowercase substring token from a simple literal pattern.
 *
 * A token is only produced when the pattern is case-insensitive and its
 * source consists solely of letters, digits and hyphens. Callers compare the
 * token against a lowercased string, so producing one for a case-sensitive
 * pattern would silently turn it into a case-insensitive match.
 *
 * @param {RegExp} pattern - Pattern to inspect.
 * @returns {string | null} Lowercased pattern source, or null when the
 *   pattern must be evaluated as a regular expression.
 */
function regexToToken(pattern) {
  if (!pattern.ignoreCase) return null;
  const { source } = pattern;
  return /^[A-Za-z0-9-]+$/.test(source) ? source.toLowerCase() : null;
}
174
+
175
+ // src/transform.ts
176
+ var import_jsdom2 = require("jsdom");
177
+ var import_readability = require("@mozilla/readability");
178
+
179
+ // src/sanitize.ts
180
+ var import_jsdom = require("jsdom");
181
// Selectors for elements that are always removed before conversion: plain tag
// names first, then the stylesheet link selector and meta tags.
var DEFAULT_STRIP_TAGS = [
  ..."script style noscript iframe svg canvas video audio map object embed applet".split(" "),
  'link[rel="stylesheet"]',
  "meta"
];
197
// CSS selectors for page chrome and noise that is removed by default.
var DEFAULT_STRIP_SELECTORS = [
  // Navigation & chrome
  "nav",
  "header:not(article header)",
  "footer:not(article footer)",
  '[role="navigation"]',
  '[role="banner"]',
  '[role="contentinfo"]',
  '[role="complementary"]',
  "aside",
  // Ads, cookies, popups
  ".advertisement",
  ".ad",
  ".ads",
  // Match "ad-"/"ad_" only at the start of a class name. A plain substring
  // match also hits unrelated classes such as "lead-", "head-", "download-"
  // or "read_more" and strips real content.
  '[class^="ad-"]',
  '[class*=" ad-"]',
  '[class^="ad_"]',
  '[class*=" ad_"]',
  ".cookie-banner",
  ".cookie-consent",
  '[class*="cookie"]',
  ".popup",
  ".modal",
  '[class*="popup"]',
  '[class*="modal"]',
  ".overlay",
  // Social & sharing
  ".social-share",
  ".share-buttons",
  '[class*="social"]',
  '[class*="share"]',
  ".follow-us",
  // Comments & forms (not the content)
  ".comments",
  "#comments",
  ".comment-form",
  'form:not([class*="search"])',
  // Related content / sidebar noise
  ".related-posts",
  ".recommended",
  ".sidebar",
  ".widget",
  '[class*="related"]',
  '[class*="sidebar"]',
  '[class*="widget"]',
  ".newsletter",
  ".subscribe",
  '[class*="newsletter"]',
  ".cta",
  '[class*="cta"]',
  // Visual-only elements
  ".breadcrumb",
  ".breadcrumbs",
  ".pagination",
  ".skip-link",
  '[aria-hidden="true"]',
  // JS framework artifacts
  "[data-reactroot] > noscript",
  ".hydration-overlay"
];
255
/**
 * Removes non-content markup from an HTML string.
 *
 * Elements matched by the default tag and selector lists, plus
 * `options.stripSelectors`, are removed. An element is kept when it matches
 * `options.preserveSelectors` or contains an element that does, so a
 * preserved node is not lost because one of its ancestors was stripped.
 * Invalid selectors are skipped.
 *
 * @param {string} html - HTML to sanitize.
 * @param {{ stripSelectors?: string[], preserveSelectors?: string[] }} [options]
 * @returns {string} The sanitized `body` inner HTML ("" when there is none).
 */
function sanitizeHtml(html, options = {}) {
  const { stripSelectors = [], preserveSelectors = [] } = options;
  const dom = new import_jsdom.JSDOM(html);
  try {
    const document = dom.window.document;
    const preserved = /* @__PURE__ */ new Set();
    for (const selector of preserveSelectors) {
      try {
        document.querySelectorAll(selector).forEach((el) => preserved.add(el));
      } catch {
        // Invalid selector: nothing to preserve for it.
      }
    }
    const isProtected = (el) => {
      if (preserved.has(el)) return true;
      for (const keep of preserved) {
        if (el.contains(keep)) return true;
      }
      return false;
    };
    const selectors = [...DEFAULT_STRIP_TAGS, ...DEFAULT_STRIP_SELECTORS, ...stripSelectors];
    for (const selector of selectors) {
      try {
        document.querySelectorAll(selector).forEach((el) => {
          if (!isProtected(el)) el.remove();
        });
      } catch {
        // Selector not supported by the DOM implementation: skip it.
      }
    }
    stripLowDensityElements(document);
    removeEmptyElements(document);
    cleanAttributes(document);
    return document.body?.innerHTML || "";
  } finally {
    // Always release the JSDOM window, even when a pass above throws.
    dom.window.close();
  }
}
290
/**
 * Drops div/section/span containers that hold many elements but little text.
 *
 * A container is removed when it has more than 10 descendant elements and
 * fewer than 50 characters of trimmed text.
 *
 * @param {Document} document - Document to mutate in place.
 */
function stripLowDensityElements(document) {
  const isLowDensity = (node) => {
    const visibleChars = (node.textContent || "").trim().length;
    const descendantCount = node.querySelectorAll("*").length;
    return !(descendantCount <= 10 || visibleChars >= 50);
  };
  Array.from(document.querySelectorAll("div, section, span")).forEach((node) => {
    if (isLowDensity(node)) node.remove();
  });
}
300
/**
 * Removes div/span/p/section/article elements that have no text and contain
 * no img, table, pre or code element.
 *
 * @param {Document} document - Document to mutate in place.
 */
function removeEmptyElements(document) {
  const hasText = (node) => (node.textContent || "").trim() !== "";
  for (const node of document.querySelectorAll("div, span, p, section, article")) {
    if (hasText(node)) continue;
    if (node.querySelector("img, table, pre, code")) continue;
    node.remove();
  }
}
308
/**
 * Strips every attribute that is not on the allow-list from all elements.
 * The `class` attribute is additionally kept on CODE elements.
 *
 * @param {Document} document - Document to mutate in place.
 */
function cleanAttributes(document) {
  const allowed = /* @__PURE__ */ new Set(["href", "src", "alt", "title", "colspan", "rowspan", "scope", "headers", "lang", "dir", "type"]);
  document.querySelectorAll("*").forEach((node) => {
    const isCode = node.tagName === "CODE";
    // Snapshot the names first: removing attributes changes the live list.
    const names = Array.from(node.attributes, (attr) => attr.name);
    for (const name of names) {
      const keep = allowed.has(name) || (name === "class" && isCode);
      if (!keep) node.removeAttribute(name);
    }
  });
}
321
+
322
+ // src/markdown.ts
323
+ var import_turndown = __toESM(require("turndown"), 1);
324
+ var import_turndown_plugin_gfm = require("turndown-plugin-gfm");
325
/**
 * Converts (already sanitized) HTML to GitHub-flavoured markdown.
 *
 * Custom rules installed here:
 *  - `fencedCodeBlock`: `<pre><code>` becomes a fenced block whose fence is
 *    longer than any backtick run inside the code, with the language taken
 *    from the code element's class.
 *  - `meaningfulImages`: images without alt text are dropped; relative
 *    sources are resolved against `options.url`.
 *  - `resolveLinks`: script-capable links are dropped, in-page anchors are
 *    reduced to their text, relative hrefs are resolved against `options.url`.
 *
 * @param {string} html - HTML fragment or document.
 * @param {{ url?: string, customRules?: Array<{ name: string, filter: *, replacement: Function }> }} [options]
 * @returns {string} Markdown with runs of blank lines collapsed and trimmed.
 */
function htmlToMarkdown(html, options = {}) {
  const { url, customRules = [] } = options;
  const hasScheme = (value) => /^[a-z][a-z0-9+.-]*:/i.test(value);
  // Resolves a relative reference against the page URL. Absolute references
  // (anything carrying a scheme) and unresolvable values are returned as-is.
  const absolutize = (raw) => {
    if (!url || !raw || hasScheme(raw)) return raw;
    try {
      return new URL(raw, url).href;
    } catch {
      return raw;
    }
  };
  const turndown = new import_turndown.default({
    headingStyle: "atx",
    codeBlockStyle: "fenced",
    bulletListMarker: "-",
    emDelimiter: "*",
    strongDelimiter: "**",
    linkStyle: "inlined",
    hr: "---"
  });
  turndown.use(import_turndown_plugin_gfm.gfm);
  turndown.addRule("fencedCodeBlock", {
    filter: (node) => {
      return node.nodeName === "PRE" && node.firstChild !== null && node.firstChild.nodeName === "CODE";
    },
    replacement: (_content, node) => {
      const codeEl = node.firstChild;
      const className = codeEl?.getAttribute?.("class") || "";
      const langMatch = className.match(
        /(?:language-|lang-|hljs\s+|highlight-)([a-zA-Z0-9_+-]+)/
      );
      const lang = langMatch ? langMatch[1] : "";
      const code = (codeEl?.textContent || "").replace(/\n+$/, "");
      // The fence must be longer than any backtick run in the code, otherwise
      // code that itself contains ``` would end the block early.
      const longestRun = (code.match(/`+/g) || []).reduce((max, run) => Math.max(max, run.length), 0);
      const fence = "`".repeat(Math.max(3, longestRun + 1));
      return `\n\n${fence}${lang}\n${code}\n${fence}\n\n`;
    }
  });
  turndown.addRule("meaningfulImages", {
    filter: (node) => node.nodeName === "IMG",
    replacement: (_content, node) => {
      const alt = node.getAttribute("alt")?.trim();
      const src = node.getAttribute("src")?.trim();
      // Images without alt text carry no information in a text rendering.
      if (!alt) return "";
      return `![${alt}](${absolutize(src) || ""})`;
    }
  });
  turndown.addRule("resolveLinks", {
    filter: "a",
    replacement: (content, node) => {
      const href = node.getAttribute("href");
      if (!href || !content.trim()) return content;
      if (href.startsWith("#")) return content;
      // Browsers ignore whitespace/control characters and letter case when
      // parsing the scheme, so normalise before checking for script URLs.
      const compact = href.replace(/[\u0000-\u0020]/g, "").toLowerCase();
      if (compact.startsWith("javascript:") || compact.startsWith("vbscript:") || compact.startsWith("data:text/html")) {
        return "";
      }
      const resolvedHref = absolutize(href);
      const title = node.getAttribute("title");
      if (!title) return `[${content}](${resolvedHref})`;
      const safeTitle = title.replace(/\\/g, "\\\\").replace(/"/g, '\\"');
      return `[${content}](${resolvedHref} "${safeTitle}")`;
    }
  });
  for (const rule of customRules) {
    turndown.addRule(rule.name, {
      filter: rule.filter,
      replacement: rule.replacement
    });
  }
  return turndown.turndown(html).replace(/\n{3,}/g, "\n\n").trim();
}
406
+
407
+ // src/json-ld.ts
408
/**
 * Collects the JSON-LD objects embedded in a document.
 *
 * Every `<script type="application/ld+json">` is parsed; scripts that fail to
 * parse are skipped. A top-level array contributes each of its members, an
 * object with an `@graph` array contributes the graph members, and any other
 * object contributes itself. Values that are not objects (null, numbers,
 * strings) are ignored.
 *
 * @param {Document} document - Document to read from.
 * @returns {object[]} Flat list of JSON-LD objects in document order.
 */
function extractJsonLdBlocks(document) {
  const results = [];
  const isObject = (value) => value !== null && typeof value === "object";
  const collect = (value) => {
    if (Array.isArray(value)) {
      value.forEach(collect);
      return;
    }
    if (!isObject(value)) return;
    if (Array.isArray(value["@graph"])) {
      results.push(...value["@graph"].filter(isObject));
    } else {
      results.push(value);
    }
  };
  for (const script of document.querySelectorAll('script[type="application/ld+json"]')) {
    let data;
    try {
      data = JSON.parse(script.textContent || "");
    } catch {
      // Malformed JSON-LD is common in the wild; skip this script.
      continue;
    }
    collect(data);
  }
  return results;
}
424
+
425
+ // src/tokens.ts
426
/**
 * Rough token estimate: one token per four UTF-16 code units, rounded up.
 *
 * @param {string} text - Text to measure.
 * @returns {number} Estimated token count; 0 for empty or non-string input.
 */
function estimateTokens(text) {
  if (typeof text !== "string") return 0;
  return Math.ceil(text.length / 4);
}
429
+
430
+ // src/frontmatter.ts
431
// Normalises a frontmatter value to text: strings pass through, finite
// numbers are stringified, everything else yields "".
function toFrontmatterText(value) {
  if (typeof value === "string") return value;
  if (typeof value === "number" && Number.isFinite(value)) return String(value);
  return "";
}

// Extracts an author name from the JSON-LD shapes seen in practice: a plain
// string, an object with `name`, or an array of either (first entry wins).
function jsonLdAuthorName(author) {
  const first = Array.isArray(author) ? author[0] : author;
  if (typeof first === "string") return first;
  const name = first?.name;
  return Array.isArray(name) ? name[0] : name;
}

/**
 * Builds a YAML frontmatter block from page metadata.
 *
 * Fields are emitted in a fixed order (title, description, url, lang,
 * lastModified, then schema/author/datePublished/dateModified taken from the
 * first JSON-LD block) and only when they have a non-empty value. JSON-LD
 * values that are not strings or numbers are skipped.
 *
 * @param {{ title?: string, description?: string, url?: string, lang?: string,
 *   lastModified?: string, jsonLd?: object[] }} input
 * @returns {string} Frontmatter including the opening and closing `---`.
 */
function buildFrontmatter(input) {
  const lines = ["---"];
  const addField = (key, value) => {
    const text = toFrontmatterText(value);
    if (text) lines.push(`${key}: "${escapeYaml(text)}"`);
  };
  addField("title", input.title);
  addField("description", input.description);
  addField("url", input.url);
  addField("lang", input.lang);
  addField("lastModified", input.lastModified);
  const primary = input.jsonLd?.[0];
  if (primary) {
    const primaryType = primary["@type"];
    const typeStr = Array.isArray(primaryType) ? primaryType[0] : primaryType;
    if (typeof typeStr === "string") addField("schema", typeStr);
    addField("author", jsonLdAuthorName(primary.author));
    addField("datePublished", primary.datePublished);
    addField("dateModified", primary.dateModified);
  }
  lines.push("---");
  return lines.join("\n");
}

/**
 * Escapes a string for use inside a double-quoted YAML scalar: backslashes
 * and double quotes are escaped, and line breaks (LF, CR, CRLF) become a
 * space so the value stays on one line.
 *
 * @param {string} str - Raw value.
 * @returns {string} Escaped single-line value.
 */
function escapeYaml(str) {
  return str.replace(/\\/g, "\\\\").replace(/"/g, '\\"').replace(/\r\n|\r|\n/g, " ");
}
464
+
465
+ // src/transform.ts
466
/**
 * Converts an HTML page into markdown plus extracted metadata.
 *
 * Steps: parse with JSDOM, read title/description/canonical/lang/last-modified
 * and JSON-LD, pick the main content (Readability when the page looks
 * readerable, otherwise the body), sanitize it, convert it to markdown,
 * optionally prepend frontmatter, and optionally truncate to a token budget.
 *
 * @param {string} html - Full HTML document.
 * @param {object} [options]
 * @param {string} [options.url] - Page URL used to resolve relative links.
 * @param {number} [options.tokenBudget] - Truncate when the estimate exceeds it.
 * @param {boolean} [options.extractJsonLd=true]
 * @param {string[]} [options.stripSelectors]
 * @param {string[]} [options.preserveSelectors]
 * @param {boolean} [options.frontmatter=true]
 * @param {Array} [options.turndownRules]
 * @returns {Promise<{ markdown: string, tokenEstimate: number, title: string,
 *   description: string, jsonLd: object[], canonicalUrl: string | null,
 *   lastModified: string | null, lang: string | null }>}
 */
async function transform(html, options = {}) {
  const {
    url,
    tokenBudget,
    extractJsonLd = true,
    stripSelectors = [],
    preserveSelectors = [],
    frontmatter = true,
    turndownRules = []
  } = options;
  const dom = new import_jsdom2.JSDOM(html, { url: url || "https://localhost" });
  try {
    const document = dom.window.document;
    const title = document.querySelector("title")?.textContent?.trim() || document.querySelector("h1")?.textContent?.trim() || "";
    const description = document.querySelector('meta[name="description"]')?.getAttribute("content")?.trim() || "";
    const canonicalUrl = document.querySelector('link[rel="canonical"]')?.getAttribute("href") || null;
    const lang = document.documentElement.getAttribute("lang") || null;
    const lastModified = document.querySelector('meta[property="article:modified_time"]')?.getAttribute("content") || document.querySelector('meta[name="last-modified"]')?.getAttribute("content") || null;
    const jsonLd = extractJsonLd ? extractJsonLdBlocks(document) : [];
    let contentHtml;
    if ((0, import_readability.isProbablyReaderable)(document)) {
      // Readability rewrites the document it is given. Hand it a clone so the
      // body fallback below still sees the page as it was parsed.
      const reader = new import_readability.Readability(document.cloneNode(true), { charThreshold: 100 });
      const article = reader.parse();
      contentHtml = article?.content || document.body?.innerHTML || html;
    } else {
      contentHtml = document.body?.innerHTML || html;
    }
    const cleanHtml = sanitizeHtml(contentHtml, {
      stripSelectors,
      preserveSelectors
    });
    let markdown = htmlToMarkdown(cleanHtml, { url, customRules: turndownRules });
    if (frontmatter) {
      const fm = buildFrontmatter({ title, description, url, lang, lastModified, jsonLd });
      markdown = `${fm}\n\n${markdown}`;
    }
    let tokenEstimate = estimateTokens(markdown);
    if (tokenBudget && tokenEstimate > tokenBudget) {
      markdown = truncateToTokenBudget(markdown, tokenBudget);
      tokenEstimate = estimateTokens(markdown);
    }
    return {
      markdown,
      tokenEstimate,
      title,
      description,
      jsonLd,
      canonicalUrl,
      lastModified,
      lang
    };
  } finally {
    // Release the JSDOM window even when a step above throws.
    dom.window.close();
  }
}
518
/**
 * Cuts markdown at a line boundary so it fits a token budget.
 *
 * Text that already fits is returned unchanged. Otherwise whole lines are
 * kept while they fit into the budget minus the room reserved for the
 * truncation notice, and the notice is always appended. Each line is costed
 * together with its newline so the joined result matches the estimate. When
 * the first line that does not fit is a heading it is still emitted before
 * the notice, so readers can see which section was cut; that heading is the
 * only part that may push the result past the budget.
 *
 * @param {string} markdown - Markdown to shorten.
 * @param {number} budget - Maximum number of estimated tokens.
 * @returns {string} The original text, or a truncated copy ending with the notice.
 */
function truncateToTokenBudget(markdown, budget) {
  if (estimateTokens(markdown) <= budget) return markdown;
  const notice = "\n*[Content truncated for token budget]*\n";
  const room = budget - estimateTokens(notice);
  const kept = [];
  let usedTokens = 0;
  for (const line of markdown.split("\n")) {
    const cost = estimateTokens(`${line}\n`);
    if (usedTokens + cost > room) {
      if (/^#{1,6}\s/.test(line)) kept.push(line);
      break;
    }
    kept.push(line);
    usedTokens += cost;
  }
  kept.push(notice);
  return kept.join("\n");
}
536
+
537
+ // src/llms-txt.ts
538
/**
 * Renders `llms.txt` (an index of markdown URLs grouped by section) and
 * `llms-full.txt` (the full text of every route that has content).
 *
 * Trailing slashes on `baseUrl` are ignored. A route path ending in "/" maps
 * to `<path>index<ext>`, the same rule buildAlternateLinkHeader uses. The
 * description blockquote is omitted when no site description is given.
 *
 * @param {{ siteName: string, siteDescription?: string, baseUrl?: string,
 *   markdownExtension?: string }} options
 * @param {Array<{ path: string, title: string, description?: string, section?: string }>} routes
 * @param {Map<string, string>} [fullTextContents] - Markdown keyed by route path.
 * @returns {{ llmsTxt: string, llmsFullTxt: string, routeCount: number }}
 */
function generateLlmsTxt(options, routes, fullTextContents) {
  const { siteName, siteDescription, baseUrl, markdownExtension = ".md" } = options;
  const origin = String(baseUrl ?? "").replace(/\/+$/, "");
  const markdownUrl = (path) => path.endsWith("/") ? `${origin}${path}index${markdownExtension}` : `${origin}${path}${markdownExtension}`;
  const header = [`# ${siteName}`, ""];
  if (siteDescription) header.push(`> ${siteDescription}`, "");

  const sections = /* @__PURE__ */ new Map();
  for (const route of routes) {
    const section = route.section || "Pages";
    if (!sections.has(section)) sections.set(section, []);
    sections.get(section).push(route);
  }
  const lines = [...header];
  for (const [section, sectionRoutes] of sections) {
    lines.push(`## ${section}`, "");
    for (const route of sectionRoutes) {
      const desc = route.description ? `: ${route.description}` : "";
      lines.push(`- [${route.title}](${markdownUrl(route.path)})${desc}`);
    }
    lines.push("");
  }
  const llmsTxt = `${lines.join("\n").trim()}\n`;

  const fullLines = [...header];
  if (fullTextContents) {
    for (const route of routes) {
      const content = fullTextContents.get(route.path);
      if (!content) continue;
      fullLines.push("---", "", `## ${route.title}`, `Source: ${origin}${route.path}`, "", content, "");
    }
  }
  const llmsFullTxt = `${fullLines.join("\n").trim()}\n`;
  return {
    llmsTxt,
    llmsFullTxt,
    routeCount: routes.length
  };
}
588
+
589
+ // src/discover.ts
590
+ var import_node_fs = require("fs");
591
+ var import_node_path = require("path");
592
// File names that mark a directory as a page: `page` plus each supported
// extension, in the order listed.
var PAGE_FILE_PATTERNS = ["tsx", "ts", "jsx", "js", "mdx", "md"].map((extension) => `page.${extension}`);
600
// Directory names that are never scanned for pages: tooling output, shared
// code folders (with and without a leading underscore) and `api`.
var SKIP_DIRS = /* @__PURE__ */ (() => {
  const tooling = ["node_modules", ".next", ".git", "dist", ".turbo"];
  const sharedCode = ["components", "lib", "utils", "hooks", "actions"];
  return new Set([
    ...tooling,
    ...sharedCode.map((name) => `_${name}`),
    ...sharedCode,
    // API routes are not pages
    "api"
  ]);
})();
619
/**
 * Lists the page routes found under an app directory.
 *
 * @param {string} appDir - Directory to scan.
 * @param {{ exclude?: string[], sectionStrategy?: string | Function,
 *   defaultSection?: string }} [options] - Defaults: exclude `["/api"]`,
 *   sectionStrategy `"directory"`, defaultSection `"Pages"`.
 * @returns {Array<object>} Routes sorted by path with "/" first; empty when
 *   the directory does not exist or its real path cannot be resolved.
 */
function discoverNextRoutes(appDir, options = {}) {
  const {
    exclude: excludePatterns = ["/api"],
    sectionStrategy: strategy = "directory",
    defaultSection: fallbackSection = "Pages"
  } = options;
  const found = [];
  if (!(0, import_node_fs.existsSync)(appDir)) {
    return [];
  }
  let resolvedRoot;
  try {
    resolvedRoot = (0, import_node_fs.realpathSync)(appDir);
  } catch {
    return found;
  }
  scanAppDir(appDir, appDir, resolvedRoot, found, excludePatterns, strategy, fallbackSection);
  const rootFirstByPath = (left, right) => {
    if (left.path === "/") return -1;
    return right.path === "/" ? 1 : left.path.localeCompare(right.path);
  };
  found.sort(rootFirstByPath);
  return found;
}
643
/**
 * Recursively walks an app directory and appends a route for every page file.
 *
 * Skipped: symlinks, directories resolving outside the root, directories
 * starting with "_" or ".", names in SKIP_DIRS, parallel-route slots
 * ("@name") and intercepting-route folders ("(.)name", "(..)name", ...),
 * which do not correspond to URLs of their own. Route groups ("(name)") are
 * removed from the URL, catch-all segments are ignored, and directories that
 * cannot be read are skipped instead of aborting the scan.
 *
 * @param {string} rootDir - Directory the scan started from.
 * @param {string} currentDir - Directory being scanned.
 * @param {string} rootReal - Real path of the root, for containment checks.
 * @param {Array<object>} routes - Output array, appended to in place.
 * @param {string[]} exclude - Exclusion patterns passed to shouldExclude.
 * @param {string | Function} sectionStrategy - Passed to deriveSection.
 * @param {string} defaultSection - Passed to deriveSection.
 */
function scanAppDir(rootDir, currentDir, rootReal, routes, exclude, sectionStrategy, defaultSection) {
  let entries;
  try {
    entries = (0, import_node_fs.readdirSync)(currentDir);
  } catch {
    // Unreadable or vanished directory: treat it as empty.
    return;
  }
  for (const entry of entries) {
    const fullPath = (0, import_node_path.join)(currentDir, entry);
    let stat;
    try {
      stat = (0, import_node_fs.lstatSync)(fullPath);
    } catch {
      continue;
    }
    if (stat.isSymbolicLink()) continue;
    if (stat.isDirectory()) {
      let realDir;
      try {
        realDir = (0, import_node_fs.realpathSync)(fullPath);
      } catch {
        continue;
      }
      if (!isWithinRoot(realDir, rootReal)) continue;
      const baseName = entry.toLowerCase();
      const isPrivate = baseName.startsWith("_") || baseName.startsWith(".");
      const isParallelSlot = baseName.startsWith("@");
      const isInterceptor = /^\(\.{1,3}\)/.test(baseName);
      if (isPrivate || isParallelSlot || isInterceptor || SKIP_DIRS.has(baseName)) continue;
      scanAppDir(
        rootDir,
        fullPath,
        rootReal,
        routes,
        exclude,
        sectionStrategy,
        defaultSection
      );
      continue;
    }
    if (!PAGE_FILE_PATTERNS.includes(entry)) continue;
    const relativePath = currentDir.substring(rootDir.length);
    let urlPath = relativePath.replace(/\\/g, "/");
    // Drop route groups, but only when the parentheses span a whole segment.
    urlPath = urlPath.replace(/\/\([^/)]+\)(?=\/|$)/g, "");
    if (urlPath.includes("[...")) continue;
    if (urlPath === "") urlPath = "/";
    if (!urlPath.startsWith("/")) urlPath = `/${urlPath}`;
    if (shouldExclude(urlPath, exclude)) continue;
    if (urlPath === "/llms.txt" || urlPath === "/llms-full.txt") continue;
    const { title, description } = extractMetadataFromFile(fullPath);
    const section = deriveSection(urlPath, sectionStrategy, defaultSection);
    routes.push({
      path: urlPath,
      title: title || pathToTitle(urlPath),
      description: description || void 0,
      section
    });
  }
}
695
/**
 * Reads a source file and extracts its metadata title and description.
 *
 * @param {string} filePath - File to read as UTF-8.
 * @returns {{ title: string, description: string }} Empty strings when the
 *   file cannot be read or extraction throws.
 */
function extractMetadataFromFile(filePath) {
  const readSource = () => (0, import_node_fs.readFileSync)(filePath, "utf-8");
  try {
    return extractMetadataFromSource(readSource());
  } catch {
    return { title: "", description: "" };
  }
}
703
// Brace nesting depth at `end`, ignoring braces inside string literals and
// comments.
function braceDepthBefore(text, end) {
  let depth = 0;
  let quote = null;
  for (let i = 0; i < end; i++) {
    const ch = text[i];
    if (quote !== null) {
      if (ch === "\\") i++;
      else if (ch === quote) quote = null;
    } else if (ch === "/" && text[i + 1] === "/") {
      const eol = text.indexOf("\n", i);
      i = eol === -1 ? end : eol;
    } else if (ch === "/" && text[i + 1] === "*") {
      const close = text.indexOf("*/", i + 2);
      i = close === -1 ? end : close + 1;
    } else if (ch === "'" || ch === '"' || ch === "`") {
      quote = ch;
    } else if (ch === "{") {
      depth++;
    } else if (ch === "}") {
      depth--;
    }
  }
  return depth;
}

// Reads the string literal assigned to `key` in an object literal's source.
// A top-level property wins; a nested one (e.g. openGraph.title) is only used
// when the key does not appear at the top level.
function readStringProperty(objectStr, key) {
  const pattern = new RegExp(
    `(?<![\\w$])${key}["']?\\s*:\\s*(?:'([^']*)'|"([^"]*)"|\`([^\`]*)\`)`,
    "g"
  );
  let nested = null;
  for (const match of objectStr.matchAll(pattern)) {
    const value = match[1] || match[2] || match[3] || "";
    if (braceDepthBefore(objectStr, match.index) === 1) return value;
    if (nested === null) nested = value;
  }
  return nested ?? "";
}

/**
 * Extracts `title` and `description` string literals from an
 * `export const metadata = { ... }` declaration in page source code.
 *
 * Keys are matched as whole identifiers, so `subtitle:` is not mistaken for
 * `title:`. Values that are not plain string literals are not extracted.
 *
 * @param {string} source - Source text of a page module.
 * @returns {{ title: string, description: string }} Sanitized values; empty
 *   strings when nothing is found.
 */
function extractMetadataFromSource(source) {
  const metadataMatch = source.match(
    /export\s+const\s+metadata[\s:]*(?:Metadata\s*)?=\s*\{/
  );
  if (!metadataMatch) {
    return { title: "", description: "" };
  }
  // Index of the opening brace of the object literal.
  const startIdx = metadataMatch.index + metadataMatch[0].length - 1;
  const objectStr = extractBalancedBraces(source, startIdx);
  if (!objectStr) {
    return { title: "", description: "" };
  }
  return {
    title: sanitizeMetadataValue(readStringProperty(objectStr, "title")),
    description: sanitizeMetadataValue(readStringProperty(objectStr, "description"))
  };
}
731
/**
 * Returns the brace-delimited block that starts at `start`.
 *
 * Braces inside string literals (single, double, template) and inside line or
 * block comments are not counted, so a value such as `"}"` does not end the
 * block early.
 *
 * @param {string} source - Text to scan.
 * @param {number} start - Index that must hold the opening "{".
 * @returns {string | null} The block including both braces, or null when
 *   `start` is not a "{" or the block is never closed.
 */
function extractBalancedBraces(source, start) {
  if (source[start] !== "{") return null;
  let depth = 0;
  let quote = null;
  for (let i = start; i < source.length; i++) {
    const ch = source[i];
    if (quote !== null) {
      // Inside a string: skip escaped characters, look for the closing quote.
      if (ch === "\\") i++;
      else if (ch === quote) quote = null;
      continue;
    }
    if (ch === "/" && source[i + 1] === "/") {
      const eol = source.indexOf("\n", i);
      if (eol === -1) return null;
      i = eol;
      continue;
    }
    if (ch === "/" && source[i + 1] === "*") {
      const close = source.indexOf("*/", i + 2);
      if (close === -1) return null;
      i = close + 1;
      continue;
    }
    if (ch === "'" || ch === '"' || ch === "`") {
      quote = ch;
    } else if (ch === "{") {
      depth++;
    } else if (ch === "}") {
      depth--;
      if (depth === 0) return source.substring(start, i + 1);
    }
  }
  return null;
}
743
/**
 * Chooses the section name for a route.
 *
 * A function strategy is called with the path and its result returned as is.
 * Otherwise the first path segment is title-cased (hyphens become spaces);
 * the default section is used for the root path and for paths whose first
 * segment starts with "[".
 *
 * @param {string} urlPath - Route path.
 * @param {string | Function} strategy - Strategy name or custom function.
 * @param {string} defaultSection - Fallback section name.
 * @returns {*} Section name.
 */
function deriveSection(urlPath, strategy, defaultSection) {
  if (typeof strategy === "function") {
    return strategy(urlPath);
  }
  const [head] = urlPath.split("/").filter(Boolean);
  if (head === undefined || head.startsWith("[")) {
    return defaultSection;
  }
  const words = [];
  for (const word of head.split("-")) {
    words.push(word.charAt(0).toUpperCase() + word.slice(1));
  }
  return words.join(" ");
}
753
/**
 * Derives a human-readable title from a route path.
 *
 * "/" becomes "Home". A last segment starting with "[" has its surrounding
 * brackets removed. Any other last segment is title-cased with hyphens turned
 * into spaces.
 *
 * @param {string} urlPath - Route path.
 * @returns {string} Title derived from the last path segment.
 */
function pathToTitle(urlPath) {
  if (urlPath === "/") return "Home";
  const parts = urlPath.split("/").filter(Boolean);
  const tail = parts.length > 0 ? parts[parts.length - 1] : "";
  if (tail.startsWith("[")) {
    return tail.replace(/^\[|\]$/g, "");
  }
  const capitalize = (word) => word.charAt(0).toUpperCase() + word.slice(1);
  return tail.split("-").map(capitalize).join(" ");
}
761
/**
 * Tests a route path against a list of exclusion patterns.
 *
 * Pattern forms:
 *  - ending in "/*" or "/**": matches the prefix itself and everything below;
 *  - starting with "_": matches when the path starts with "/" + pattern or
 *    contains it as a full inner segment;
 *  - anything else: matches the exact path and everything below it.
 *
 * @param {string} urlPath - Route path to test.
 * @param {Iterable<string>} patterns - Exclusion patterns.
 * @returns {boolean} True when any pattern matches.
 */
function shouldExclude(urlPath, patterns) {
  const matches = (pattern) => {
    if (pattern.endsWith("/**") || pattern.endsWith("/*")) {
      const prefix = pattern.replace(/\/\*\*?$/, "");
      return urlPath === prefix || urlPath.startsWith(prefix + "/");
    }
    if (pattern.startsWith("_")) {
      return urlPath.startsWith("/" + pattern) || urlPath.includes("/" + pattern + "/");
    }
    return urlPath === pattern || urlPath.startsWith(pattern + "/");
  };
  for (const pattern of patterns) {
    if (matches(pattern)) return true;
  }
  return false;
}
775
// Decodes numeric character references and the handful of named entities
// that commonly appear in titles; unknown entities are left untouched.
function decodeHtmlEntities(text) {
  const named = /* @__PURE__ */ new Map([["amp", "&"], ["lt", "<"], ["gt", ">"], ["quot", '"'], ["apos", "'"], ["nbsp", " "]]);
  return text.replace(/&(#x[0-9a-f]+|#\d+|[a-z]+);/gi, (whole, body) => {
    if (body[0] !== "#") return named.get(body.toLowerCase()) ?? whole;
    const isHex = body[1] === "x" || body[1] === "X";
    const code = isHex ? Number.parseInt(body.slice(2), 16) : Number.parseInt(body.slice(1), 10);
    return code > 0 && code <= 0x10ffff ? String.fromCodePoint(code) : whole;
  });
}

// Returns the content of the first <meta name="description"> tag, or "".
function readMetaDescription(html) {
  for (const [tag] of html.matchAll(/<meta\b[^>]*>/gi)) {
    if (!/\bname\s*=\s*["']?description["']?(?=[\s/>])/i.test(tag)) continue;
    const content = tag.match(/\bcontent\s*=\s*(?:"([^"]*)"|'([^']*)')/i);
    if (content) return (content[1] ?? content[2]).trim();
  }
  return "";
}

/**
 * Lists routes for the `.html` files found under a directory.
 *
 * `index.html` maps to its directory path (with a trailing slash) and other
 * files to their path without the extension. The title comes from the
 * `<title>` element with HTML entities decoded, falling back to a title
 * derived from the path. A `description` is added when the file has a meta
 * description.
 *
 * @param {string} dir - Directory to scan.
 * @param {{ exclude?: string[], sectionStrategy?: string | Function,
 *   defaultSection?: string }} [options]
 * @returns {Array<{ path: string, title: string, section: *, description?: string }>}
 *   Routes sorted by path with "/" first; empty when the directory is missing.
 */
function discoverFilesystemRoutes(dir, options = {}) {
  const {
    exclude = [],
    sectionStrategy = "directory",
    defaultSection = "Pages"
  } = options;
  if (!(0, import_node_fs.existsSync)(dir)) {
    return [];
  }
  let rootReal;
  try {
    rootReal = (0, import_node_fs.realpathSync)(dir);
  } catch {
    return [];
  }
  const routes = [];
  for (const filePath of findHtmlFiles(dir, rootReal)) {
    const relativePath = filePath.substring(dir.length);
    let urlPath = relativePath.replace(/\\/g, "/").replace(/\/index\.html$/, "/").replace(/\.html$/, "");
    if (urlPath === "") urlPath = "/";
    if (!urlPath.startsWith("/")) urlPath = `/${urlPath}`;
    if (shouldExclude(urlPath, exclude)) continue;
    const section = deriveSection(urlPath, sectionStrategy, defaultSection);
    let title = "";
    let description = "";
    try {
      const html = (0, import_node_fs.readFileSync)(filePath, "utf-8");
      const rawTitle = html.match(/<title[^>]*>([^<]+)<\/title>/i)?.[1]?.trim() || "";
      // Decode before sanitizing so decoded angle brackets are still removed.
      title = sanitizeMetadataValue(decodeHtmlEntities(rawTitle));
      description = sanitizeMetadataValue(decodeHtmlEntities(readMetaDescription(html)));
    } catch {
      // Unreadable file: keep the route and fall back to a path-derived title.
    }
    const route = {
      path: urlPath,
      title: title || pathToTitle(urlPath),
      section
    };
    if (description) route.description = description;
    routes.push(route);
  }
  routes.sort((a, b) => {
    if (a.path === "/") return -1;
    if (b.path === "/") return 1;
    return a.path.localeCompare(b.path);
  });
  return routes;
}
819
/**
 * Recursively collects the `.html` files below a directory.
 *
 * Symlinks, tooling directories (node_modules, .next, .git, dist, .turbo) and
 * directories resolving outside the root are skipped. A path that is missing,
 * is not a directory, or cannot be read yields no files instead of throwing.
 *
 * @param {string} dir - Directory to scan.
 * @param {string} rootReal - Real path of the scan root, for containment checks.
 * @returns {string[]} Paths of the HTML files found, in directory-listing order.
 */
function findHtmlFiles(dir, rootReal) {
  const results = [];
  const skipped = /* @__PURE__ */ new Set(["node_modules", ".next", ".git", "dist", ".turbo"]);
  let entries;
  try {
    entries = (0, import_node_fs.readdirSync)(dir);
  } catch {
    return results;
  }
  for (const entry of entries) {
    const fullPath = (0, import_node_path.join)(dir, entry);
    let stat;
    try {
      stat = (0, import_node_fs.lstatSync)(fullPath);
    } catch {
      continue;
    }
    if (stat.isSymbolicLink()) continue;
    if (stat.isDirectory()) {
      if (skipped.has(entry)) continue;
      let realDir;
      try {
        realDir = (0, import_node_fs.realpathSync)(fullPath);
      } catch {
        continue;
      }
      if (!isWithinRoot(realDir, rootReal)) continue;
      results.push(...findHtmlFiles(fullPath, rootReal));
    } else if (entry.endsWith(".html")) {
      results.push(fullPath);
    }
  }
  return results;
}
849
/**
 * Checks that a resolved path is the root itself or lies below it.
 *
 * @param {string} realPath - Resolved path to test.
 * @param {string} rootReal - Resolved root directory.
 * @returns {boolean} True when `realPath` equals the root or starts with the
 *   root followed by the platform path separator.
 */
function isWithinRoot(realPath, rootReal) {
  if (realPath === rootReal) return true;
  const { sep } = import_node_path;
  let prefix = rootReal;
  if (!prefix.endsWith(sep)) prefix += sep;
  return realPath.startsWith(prefix);
}
854
/**
 * Normalises a metadata string for single-line output.
 *
 * Control characters become spaces, angle brackets are removed, whitespace is
 * collapsed and trimmed, and the result is limited to 200 UTF-16 code units.
 * The cut never leaves half of a surrogate pair behind.
 *
 * @param {string} value - Raw metadata value.
 * @returns {string} Sanitized value; "" for empty or non-string input.
 */
function sanitizeMetadataValue(value) {
  if (typeof value !== "string" || value === "") return "";
  const sanitized = value.replace(/[\u0000-\u001F\u007F]/g, " ").replace(/[<>]/g, "").replace(/\s+/g, " ").trim();
  if (sanitized.length <= 200) return sanitized;
  const clipped = sanitized.slice(0, 200);
  // A trailing high surrogate means the cut fell inside an astral character.
  return /[\uD800-\uDBFF]$/.test(clipped) ? clipped.slice(0, -1) : clipped;
}
859
+
860
+ // src/headers.ts
861
/**
 * Builds the HTTP response headers for a markdown representation.
 *
 * Each content signal (`ai-train`, `search`, `ai-input`) is always stated:
 * "yes" by default and "no" when the corresponding option is explicitly
 * `false`, so an opt-out is sent to the client rather than silently omitted.
 *
 * @param {{ tokenEstimate: number }} result - Transform result.
 * @param {{ contentSignal?: { aiTrain?: boolean, search?: boolean, aiInput?: boolean } }} [options]
 * @param {string} [originalPath] - Accepted for API compatibility; not used.
 * @returns {Record<string, string>} Header name to value.
 */
function buildMarkdownHeaders(result, options = {}, originalPath) {
  const signal = options?.contentSignal ?? { aiTrain: true, search: true, aiInput: true };
  const yesNo = (flag) => (flag === false ? "no" : "yes");
  return {
    "Content-Type": "text/markdown; charset=utf-8",
    "Content-Disposition": "inline",
    "Vary": "Accept, User-Agent",
    "X-Markdown-Tokens": String(result.tokenEstimate),
    "Content-Signal": [
      `ai-train=${yesNo(signal.aiTrain)}`,
      `search=${yesNo(signal.search)}`,
      `ai-input=${yesNo(signal.aiInput)}`
    ].join(", "),
    "X-Robots-Tag": "all"
  };
}
879
/**
 * Builds a `Link` header value advertising the markdown alternate of a page.
 *
 * The extension is appended to the path component only: a query string is
 * kept after it and a fragment is dropped. Paths that are empty or end in "/"
 * map to `index<ext>` inside that directory.
 *
 * @param {string} path - Request path, optionally with query/fragment.
 * @param {string} [ext=".md"] - Markdown file extension.
 * @returns {string} Header value, e.g. `</docs.md>; rel="alternate"; type="text/markdown"`.
 */
function buildAlternateLinkHeader(path, ext = ".md") {
  const cut = path.search(/[?#]/);
  const pathname = cut === -1 ? path : path.slice(0, cut);
  const rest = cut === -1 ? "" : path.slice(cut);
  const hashAt = rest.indexOf("#");
  const query = hashAt === -1 ? rest : rest.slice(0, hashAt);
  const directory = pathname === "" ? "/" : pathname;
  const mdPath = directory.endsWith("/") ? `${directory}index${ext}` : `${directory}${ext}`;
  return `<${mdPath}${query}>; rel="alternate"; type="text/markdown"`;
}
883
+
884
+ // src/cache.ts
885
+ var import_lru_cache = require("lru-cache");
886
/**
 * Creates a small cache facade over an LRU cache.
 *
 * @param {{ maxEntries?: number, ttl?: number }} [options] - `maxEntries`
 *   defaults to 100 and `ttl` to 3e5; both are passed to the LRU cache as
 *   `max` and `ttl`.
 * @returns {{ get: Function, set: Function, has: Function, clear: Function,
 *   size: Function }} Thin wrappers that delegate to the underlying cache.
 */
function createCache(options = {}) {
  const { maxEntries: max = 100, ttl = 3e5 } = options;
  const store = new import_lru_cache.LRUCache({ max, ttl });
  return {
    get(key) {
      return store.get(key);
    },
    set(key, value) {
      return store.set(key, value);
    },
    has(key) {
      return store.has(key);
    },
    clear() {
      return store.clear();
    },
    size() {
      return store.size;
    }
  };
}
900
+ // Annotate the CommonJS export names for ESM import in node:
901
+ 0 && (module.exports = {
902
+ AI_BOT_REGISTRY,
903
+ buildAlternateLinkHeader,
904
+ buildMarkdownHeaders,
905
+ createCache,
906
+ detectAgent,
907
+ discoverFilesystemRoutes,
908
+ discoverNextRoutes,
909
+ estimateTokens,
910
+ extractJsonLdBlocks,
911
+ extractMetadataFromSource,
912
+ generateLlmsTxt,
913
+ htmlToMarkdown,
914
+ sanitizeHtml,
915
+ shouldServeMarkdown,
916
+ transform
917
+ });
918
+ //# sourceMappingURL=index.cjs.map