@vercel/agent-readability 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs ADDED
@@ -0,0 +1,175 @@
1
+ "use strict";
2
// Module-interop helpers used to expose the bundled exports through `module.exports`.
// Short aliases for the built-in reflection functions used by the helpers below.
+ var __defProp = Object.defineProperty;
3
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
4
+ var __getOwnPropNames = Object.getOwnPropertyNames;
5
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
6
// __export(target, all): for every key in `all`, defines an enumerable getter on
// `target` that uses `all[name]` as the getter function.
+ var __export = (target, all) => {
7
+ for (var name in all)
8
+ __defProp(target, name, { get: all[name], enumerable: true });
9
+ };
10
// __copyProps(to, from, except, desc): when `from` is an object or a function,
// mirrors each of its own property names onto `to` as a getter that reads
// `from[key]`, skipping keys `to` already owns and the key equal to `except`.
// Returns `to` in every case.
+ var __copyProps = (to, from, except, desc) => {
11
+ if (from && typeof from === "object" || typeof from === "function") {
12
+ for (let key of __getOwnPropNames(from))
13
+ if (!__hasOwnProp.call(to, key) && key !== except)
14
// The copy is enumerable when the source has no descriptor for the key, or
// when that descriptor is itself enumerable.
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
15
+ }
16
+ return to;
17
+ };
18
// __toCommonJS(mod): builds a fresh object carrying `__esModule: true` and copies
// the properties of `mod` onto it via __copyProps.
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
19
+
20
+ // src/index.ts
21
// Export table for the package entry point: each public name is registered as a
// getter thunk on `src_exports`, so the binding is read only when accessed.
+ var src_exports = {};
22
+ __export(src_exports, {
23
+ AI_AGENT_UA_PATTERNS: () => AI_AGENT_UA_PATTERNS,
24
+ BOT_LIKE_REGEX: () => BOT_LIKE_REGEX,
25
+ SIGNATURE_AGENT_DOMAINS: () => SIGNATURE_AGENT_DOMAINS,
26
+ TRADITIONAL_BOT_PATTERNS: () => TRADITIONAL_BOT_PATTERNS,
27
+ acceptsMarkdown: () => acceptsMarkdown,
28
+ generateNotFoundMarkdown: () => generateNotFoundMarkdown,
29
+ isAIAgent: () => isAIAgent,
30
+ shouldServeMarkdown: () => shouldServeMarkdown
31
+ });
32
// Publish the table as the CommonJS module object (flagged with `__esModule`).
+ module.exports = __toCommonJS(src_exports);
33
+
34
+ // src/patterns.ts
35
// Layer 1 of detection: known AI-agent User-Agent substrings.
// Entries are lowercase because isAIAgent lower-cases the User-Agent and tests
// each entry with String.prototype.includes (substring match, not whole-token).
+ var AI_AGENT_UA_PATTERNS = [
36
+ // Anthropic — https://support.claude.com/en/articles/8896518
37
+ "claudebot",
38
+ "claude-searchbot",
39
+ "claude-user",
40
+ "anthropic-ai",
41
+ "claude-web",
42
+ // OpenAI — https://platform.openai.com/docs/bots
43
+ "chatgpt",
44
+ "gptbot",
45
+ "oai-searchbot",
46
+ "openai",
47
+ // Google AI
48
+ "gemini",
49
+ "bard",
50
+ "google-cloudvertexbot",
51
+ "google-extended",
52
+ // Meta
53
+ "meta-externalagent",
54
+ "meta-externalfetcher",
55
+ "meta-webindexer",
56
+ // Search/Research AI
57
+ "perplexity",
58
+ "youbot",
59
+ "you.com",
60
+ "deepseekbot",
61
+ // Coding assistants
62
+ "cursor",
63
+ "github-copilot",
64
+ "codeium",
65
+ "tabnine",
66
+ "sourcegraph",
67
+ // Other AI agents / data scrapers
68
+ "cohere-ai",
69
+ "bytespider",
70
+ "amazonbot",
71
+ "ai2bot",
72
+ "diffbot",
73
+ "omgili",
74
// NOTE(review): already covered by the "omgili" substring entry above.
+ "omgilibot"
75
+ ];
76
// Layer 2 of detection: lowercase domains searched for (as substrings) in the
// lower-cased Signature-Agent request header.
+ var SIGNATURE_AGENT_DOMAINS = ["chatgpt.com"];
77
// Layer 3 exclusion list: lowercase User-Agent substrings of search-engine
// crawlers, social-preview fetchers and monitoring tools. A User-Agent that
// contains any of these is not reported by the heuristic layer of isAIAgent.
+ var TRADITIONAL_BOT_PATTERNS = [
78
+ "googlebot",
79
+ "bingbot",
80
+ "yandexbot",
81
+ "baiduspider",
82
+ "duckduckbot",
83
+ "slurp",
84
+ "msnbot",
85
+ "facebot",
86
+ "twitterbot",
87
+ "linkedinbot",
88
+ "whatsapp",
89
+ "telegrambot",
90
+ "pingdom",
91
+ "uptimerobot",
92
+ "newrelic",
93
+ "datadog",
94
+ "statuspage",
95
+ "site24x7",
96
+ "applebot"
97
+ ];
98
// Broad, case-insensitive test for bot-like User-Agent strings; used only by the
// Layer 3 heuristic. No word boundaries, so keywords also match inside compound names.
+ var BOT_LIKE_REGEX = /bot|agent|fetch|crawl|spider|search/i;
99
+
100
+ // src/detection.ts
101
/**
 * Classifies a request as coming from an AI agent by inspecting its headers.
 *
 * Checks run in order and stop at the first hit:
 *   1. "ua-match"        – the lower-cased User-Agent contains an entry of
 *                          AI_AGENT_UA_PATTERNS.
 *   2. "signature-agent" – the lower-cased Signature-Agent header contains an
 *                          entry of SIGNATURE_AGENT_DOMAINS.
 *   3. "heuristic"       – there is no sec-fetch-mode header, the User-Agent
 *                          matches BOT_LIKE_REGEX, and it is not one of the
 *                          TRADITIONAL_BOT_PATTERNS.
 *
 * @param request - Any object exposing `headers.get(name)`.
 * @returns `{ detected: true, method }` for a hit, otherwise
 *   `{ detected: false, method: null }`.
 */
function isAIAgent(request) {
  const ua = (request.headers.get("user-agent") ?? "").toLowerCase();
  const uaContains = (needle) => ua.includes(needle);

  // Layer 1: known AI user agents.
  if (ua !== "" && AI_AGENT_UA_PATTERNS.some(uaContains)) {
    return { detected: true, method: "ua-match" };
  }

  // Layer 2: Signature-Agent header naming a known AI service.
  const signature = request.headers.get("signature-agent")?.toLowerCase();
  if (signature && SIGNATURE_AGENT_DOMAINS.some((domain) => signature.includes(domain))) {
    return { detected: true, method: "signature-agent" };
  }

  // Layer 3: bot-like user agent without the sec-fetch-mode header.
  const hasFetchMode = Boolean(request.headers.get("sec-fetch-mode"));
  const looksAutomated = !hasFetchMode && ua !== "" && BOT_LIKE_REGEX.test(ua);
  if (looksAutomated && !TRADITIONAL_BOT_PATTERNS.some(uaContains)) {
    return { detected: true, method: "heuristic" };
  }

  return { detected: false, method: null };
}
123
+
124
+ // src/negotiation.ts
125
// Media types looked for when the caller does not supply `options.mediaTypes`.
var DEFAULT_MARKDOWN_TYPES = ["text/markdown", "text/x-markdown"];
/**
 * Reports whether the request's Accept header asks for markdown.
 *
 * The header is split into comma-separated media ranges. A range counts when
 * its media type (the part before any `;` parameters) contains one of the
 * wanted types and the range is not explicitly refused with a zero quality
 * value (`q=0`). Both the header and the wanted types are lower-cased before
 * comparison, so custom `mediaTypes` may be written in any casing.
 *
 * @param request - Object exposing `headers.get(name)`; only "accept" is read.
 * @param options - Optional `{ mediaTypes }` overriding the default list
 *   (`text/markdown`, `text/x-markdown`).
 * @returns `true` when at least one non-refused media range matches; `false`
 *   when the header is missing, empty, or nothing matches.
 */
function acceptsMarkdown(request, options) {
  const accept = request.headers.get("accept");
  if (!accept) return false;
  const wanted = (options?.mediaTypes ?? DEFAULT_MARKDOWN_TYPES).map((type) => type.trim().toLowerCase());
  return accept.toLowerCase().split(",").some((range) => {
    const [mediaType, ...params] = range.split(";").map((part) => part.trim());
    if (!wanted.some((type) => mediaType.includes(type))) return false;
    const quality = params.find((param) => /^q\s*=/.test(param));
    if (quality === undefined) return true;
    const weight = Number.parseFloat(quality.slice(quality.indexOf("=") + 1));
    // An unparsable weight is treated as acceptable; only an explicit zero refuses.
    return Number.isNaN(weight) || weight > 0;
  });
}
133
/**
 * Decides in one call whether a markdown response should be served.
 *
 * Agent detection runs first; the Accept header is consulted only when no
 * agent was detected.
 *
 * @param request - Any object exposing `headers.get(name)`.
 * @param options - Forwarded unchanged to `acceptsMarkdown`.
 * @returns `{ serve, reason, detection }` where `reason` is "agent",
 *   "accept-header", or `null` (with `serve: false`), and `detection` is the
 *   result returned by `isAIAgent`.
 */
function shouldServeMarkdown(request, options) {
  const detection = isAIAgent(request);
  let reason = null;
  if (detection.detected) {
    reason = "agent";
  } else if (acceptsMarkdown(request, options)) {
    reason = "accept-header";
  }
  return { serve: reason !== null, reason, detection };
}
143
+
144
+ // src/not-found.ts
145
/**
 * Builds the markdown body returned for a missing page, pointing the reader at
 * the sitemap and the documentation index.
 *
 * The original source comments recommend sending this body with a 200 status.
 *
 * @param path - The requested path. It is request-controlled, so backticks and
 *   control characters (including newlines) are percent-encoded before the path
 *   is placed inside the inline code span; otherwise they could close the span
 *   and inject arbitrary markdown. Ordinary paths are emitted unchanged.
 * @param options - Optional `sitemapUrl` (default "/sitemap.md"), `indexUrl`
 *   (default "/llms.txt") and `baseUrl` (default "").
 * @returns The markdown document as a string, ending with a newline.
 */
function generateNotFoundMarkdown(path, options) {
  const sitemap = options?.sitemapUrl ?? "/sitemap.md";
  const index = options?.indexUrl ?? "/llms.txt";
  const base = options?.baseUrl ?? "";
  const safePath = String(path).replace(/[`\u0000-\u001f\u007f]/g, (ch) => encodeURIComponent(ch));
  // Joins `base` and `url` with exactly one slash. URLs that already carry a
  // scheme (or are protocol-relative) are returned untouched, as is every URL
  // when no base is configured.
  const toHref = (url) => {
    if (base === "" || /^(?:[a-z][a-z0-9+.-]*:|\/\/)/i.test(url)) return url;
    return `${base.replace(/\/+$/, "")}/${url.replace(/^\/+/, "")}`;
  };
  return `# Page Not Found

The URL \`${safePath}\` does not exist.

## How to find the correct page

1. **Browse the sitemap**: [${sitemap}](${toHref(sitemap)}) - A structured index of all pages
2. **Browse the full index**: [${index}](${toHref(index)}) - Complete documentation index

## Tips for requesting documentation

- For markdown responses, append \`.md\` to URLs (e.g., \`/docs/functions.md\`)
- Use \`Accept: text/markdown\` header for content negotiation
`;
}
164
+ // Annotate the CommonJS export names for ESM import in node:
165
+ 0 && (module.exports = {
166
+ AI_AGENT_UA_PATTERNS,
167
+ BOT_LIKE_REGEX,
168
+ SIGNATURE_AGENT_DOMAINS,
169
+ TRADITIONAL_BOT_PATTERNS,
170
+ acceptsMarkdown,
171
+ generateNotFoundMarkdown,
172
+ isAIAgent,
173
+ shouldServeMarkdown
174
+ });
175
+ //# sourceMappingURL=index.cjs.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../src/index.ts","../src/patterns.ts","../src/detection.ts","../src/negotiation.ts","../src/not-found.ts"],"sourcesContent":["export { isAIAgent } from \"./detection\";\nexport { acceptsMarkdown, shouldServeMarkdown } from \"./negotiation\";\nexport { generateNotFoundMarkdown } from \"./not-found\";\nexport {\n\tAI_AGENT_UA_PATTERNS,\n\tBOT_LIKE_REGEX,\n\tSIGNATURE_AGENT_DOMAINS,\n\tTRADITIONAL_BOT_PATTERNS,\n} from \"./patterns\";\nexport type {\n\tDetectionMethod,\n\tDetectionResult,\n\tMinimalRequest,\n} from \"./types\";\nexport type { NotFoundOptions } from \"./not-found\";\nexport type { AcceptMarkdownOptions, ShouldServeMarkdownResult } from \"./negotiation\";\n","/**\n * Layer 1: Known AI agent UA substrings (lowercase).\n * Curated from https://bots.fyi/?tags=ai_assistant + official vendor docs.\n * Last reviewed: 2026-03-20\n */\nexport const AI_AGENT_UA_PATTERNS: readonly string[] = [\n\t// Anthropic — https://support.claude.com/en/articles/8896518\n\t\"claudebot\",\n\t\"claude-searchbot\",\n\t\"claude-user\",\n\t\"anthropic-ai\",\n\t\"claude-web\",\n\n\t// OpenAI — https://platform.openai.com/docs/bots\n\t\"chatgpt\",\n\t\"gptbot\",\n\t\"oai-searchbot\",\n\t\"openai\",\n\n\t// Google AI\n\t\"gemini\",\n\t\"bard\",\n\t\"google-cloudvertexbot\",\n\t\"google-extended\",\n\n\t// Meta\n\t\"meta-externalagent\",\n\t\"meta-externalfetcher\",\n\t\"meta-webindexer\",\n\n\t// Search/Research AI\n\t\"perplexity\",\n\t\"youbot\",\n\t\"you.com\",\n\t\"deepseekbot\",\n\n\t// Coding assistants\n\t\"cursor\",\n\t\"github-copilot\",\n\t\"codeium\",\n\t\"tabnine\",\n\t\"sourcegraph\",\n\n\t// Other AI agents / data scrapers\n\t\"cohere-ai\",\n\t\"bytespider\",\n\t\"amazonbot\",\n\t\"ai2bot\",\n\t\"diffbot\",\n\t\"omgili\",\n\t\"omgilibot\",\n];\n\n/**\n * Layer 2: Known AI service URLs in Signature-Agent header (RFC 9421).\n */\nexport const SIGNATURE_AGENT_DOMAINS: readonly string[] = [\"chatgpt.com\"];\n\n/**\n * Layer 3: Traditional bot 
exclusion list. Bots that should NOT trigger the\n * heuristic layer (search engine crawlers, social previews, monitoring tools).\n */\nexport const TRADITIONAL_BOT_PATTERNS: readonly string[] = [\n\t\"googlebot\",\n\t\"bingbot\",\n\t\"yandexbot\",\n\t\"baiduspider\",\n\t\"duckduckbot\",\n\t\"slurp\",\n\t\"msnbot\",\n\t\"facebot\",\n\t\"twitterbot\",\n\t\"linkedinbot\",\n\t\"whatsapp\",\n\t\"telegrambot\",\n\t\"pingdom\",\n\t\"uptimerobot\",\n\t\"newrelic\",\n\t\"datadog\",\n\t\"statuspage\",\n\t\"site24x7\",\n\t\"applebot\",\n];\n\n/**\n * Broad regex for bot-like UA strings (used only in Layer 3 heuristic).\n * No word boundaries — keywords commonly appear in compound names.\n */\nexport const BOT_LIKE_REGEX: RegExp = /bot|agent|fetch|crawl|spider|search/i;\n","import {\n\tAI_AGENT_UA_PATTERNS,\n\tBOT_LIKE_REGEX,\n\tSIGNATURE_AGENT_DOMAINS,\n\tTRADITIONAL_BOT_PATTERNS,\n} from \"./patterns\";\nimport type { DetectionResult, MinimalRequest } from \"./types\";\n\n/**\n * Detects AI agents from HTTP request headers.\n *\n * Three detection layers (checked in order):\n * 1. Known UA patterns (definitive)\n * 2. Signature-Agent header (definitive, RFC 9421)\n * 3. Missing sec-fetch-mode heuristic (catches unknown bots)\n *\n * Optimizes for recall over precision: serving markdown to a non-AI bot\n * is low-harm; missing an AI agent means a worse experience.\n */\nexport function isAIAgent(request: MinimalRequest): DetectionResult {\n\tconst userAgent = request.headers.get(\"user-agent\");\n\tconst lowerUA = userAgent?.toLowerCase() ?? 
\"\";\n\n\t// Layer 1: Known UA pattern match\n\tif (lowerUA && AI_AGENT_UA_PATTERNS.some((pattern) => lowerUA.includes(pattern))) {\n\t\treturn { detected: true, method: \"ua-match\" };\n\t}\n\n\t// Layer 2: Signature-Agent header (RFC 9421, used by ChatGPT agent)\n\tconst signatureAgent = request.headers.get(\"signature-agent\");\n\tif (signatureAgent) {\n\t\tconst lowerSig = signatureAgent.toLowerCase();\n\t\tif (SIGNATURE_AGENT_DOMAINS.some((domain) => lowerSig.includes(domain))) {\n\t\t\treturn { detected: true, method: \"signature-agent\" };\n\t\t}\n\t}\n\n\t// Layer 3: Missing browser fingerprint heuristic\n\t// Real browsers (Chrome 76+, Firefox 90+, Safari 16.4+) send sec-fetch-mode\n\t// on navigation requests. Its absence signals a programmatic client.\n\tconst secFetchMode = request.headers.get(\"sec-fetch-mode\");\n\tif (!secFetchMode && lowerUA && BOT_LIKE_REGEX.test(lowerUA)) {\n\t\tconst isTraditionalBot = TRADITIONAL_BOT_PATTERNS.some((pattern) => lowerUA.includes(pattern));\n\t\tif (!isTraditionalBot) {\n\t\t\treturn { detected: true, method: \"heuristic\" };\n\t\t}\n\t}\n\n\treturn { detected: false, method: null };\n}\n","import { isAIAgent } from \"./detection\";\nimport type { DetectionResult, MinimalRequest } from \"./types\";\n\nconst DEFAULT_MARKDOWN_TYPES = [\"text/markdown\", \"text/x-markdown\"];\n\nexport interface AcceptMarkdownOptions {\n\tmediaTypes?: string[];\n}\n\n/**\n * Check if the request prefers markdown via the Accept header.\n */\nexport function acceptsMarkdown(request: MinimalRequest, options?: AcceptMarkdownOptions): boolean {\n\tconst accept = request.headers.get(\"accept\");\n\tif (!accept) return false;\n\n\tconst types = options?.mediaTypes ?? 
DEFAULT_MARKDOWN_TYPES;\n\tconst lowerAccept = accept.toLowerCase();\n\treturn types.some((type) => lowerAccept.includes(type));\n}\n\nexport interface ShouldServeMarkdownResult {\n\tserve: boolean;\n\treason: \"agent\" | \"accept-header\" | null;\n\tdetection: DetectionResult;\n}\n\n/**\n * Combines agent detection and content negotiation into one call.\n * Returns whether to serve markdown and why.\n */\nexport function shouldServeMarkdown(\n\trequest: MinimalRequest,\n\toptions?: AcceptMarkdownOptions,\n): ShouldServeMarkdownResult {\n\tconst detection = isAIAgent(request);\n\tif (detection.detected) {\n\t\treturn { serve: true, reason: \"agent\", detection };\n\t}\n\n\tif (acceptsMarkdown(request, options)) {\n\t\treturn { serve: true, reason: \"accept-header\", detection };\n\t}\n\n\treturn { serve: false, reason: null, detection };\n}\n","export interface NotFoundOptions {\n\tsitemapUrl?: string;\n\tindexUrl?: string;\n\tbaseUrl?: string;\n}\n\n/**\n * Generates a markdown body for missing pages with links to discovery endpoints.\n * Return with a 200 status (agents discard 404 response bodies).\n */\nexport function generateNotFoundMarkdown(path: string, options?: NotFoundOptions): string {\n\tconst sitemap = options?.sitemapUrl ?? \"/sitemap.md\";\n\tconst index = options?.indexUrl ?? \"/llms.txt\";\n\tconst base = options?.baseUrl ?? \"\";\n\n\treturn `# Page Not Found\n\nThe URL \\`${path}\\` does not exist.\n\n## How to find the correct page\n\n1. **Browse the sitemap**: [${sitemap}](${base}${sitemap}) - A structured index of all pages\n2. 
**Browse the full index**: [${index}](${base}${index}) - Complete documentation index\n\n## Tips for requesting documentation\n\n- For markdown responses, append \\`.md\\` to URLs (e.g., \\`/docs/functions.md\\`)\n- Use \\`Accept: text/markdown\\` header for content negotiation\n`;\n}\n"],"mappings":";;;;;;;;;;;;;;;;;;;;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;;;ACKO,IAAM,uBAA0C;AAAA;AAAA,EAEtD;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA;AAAA,EAGA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA;AAAA,EAGA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA;AAAA,EAGA;AAAA,EACA;AAAA,EACA;AAAA;AAAA,EAGA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA;AAAA,EAGA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA;AAAA,EAGA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACD;AAKO,IAAM,0BAA6C,CAAC,aAAa;AAMjE,IAAM,2BAA8C;AAAA,EAC1D;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACD;AAMO,IAAM,iBAAyB;;;ACrE/B,SAAS,UAAU,SAA0C;AACnE,QAAM,YAAY,QAAQ,QAAQ,IAAI,YAAY;AAClD,QAAM,UAAU,WAAW,YAAY,KAAK;AAG5C,MAAI,WAAW,qBAAqB,KAAK,CAAC,YAAY,QAAQ,SAAS,OAAO,CAAC,GAAG;AACjF,WAAO,EAAE,UAAU,MAAM,QAAQ,WAAW;AAAA,EAC7C;AAGA,QAAM,iBAAiB,QAAQ,QAAQ,IAAI,iBAAiB;AAC5D,MAAI,gBAAgB;AACnB,UAAM,WAAW,eAAe,YAAY;AAC5C,QAAI,wBAAwB,KAAK,CAAC,WAAW,SAAS,SAAS,MAAM,CAAC,GAAG;AACxE,aAAO,EAAE,UAAU,MAAM,QAAQ,kBAAkB;AAAA,IACpD;AAAA,EACD;AAKA,QAAM,eAAe,QAAQ,QAAQ,IAAI,gBAAgB;AACzD,MAAI,CAAC,gBAAgB,WAAW,eAAe,KAAK,OAAO,GAAG;AAC7D,UAAM,mBAAmB,yBAAyB,KAAK,CAAC,YAAY,QAAQ,SAAS,OAAO,CAAC;AAC7F,QAAI,CAAC,kBAAkB;AACtB,aAAO,EAAE,UAAU,MAAM,QAAQ,YAAY;AAAA,IAC9C;AAAA,EACD;AAEA,SAAO,EAAE,UAAU,OAAO,QAAQ,KAAK;AACxC;;;AC9CA,IAAM,yBAAyB,CAAC,iBAAiB,iBAAiB;AAS3D,SAAS,gBAAgB,SAAyB,SAA0C;AAClG,QAAM,SAAS,QAAQ,QAAQ,IAAI,QAAQ;AAC3C,MAAI,CAAC,OAAQ,QAAO;AAEpB,QAAM,QAAQ,SAAS,cAAc;AACrC,QAAM,cAAc,OAAO,YAAY;AACvC,SAAO,MAAM,KAAK,CAAC,SAAS,YAAY,SAAS,IAAI,CAAC;AACvD;AAYO,SAAS,oBACf,SACA,SAC4B;AAC5B,QAAM,YAAY,UAAU,OAAO;AACnC,MAAI,UAAU,UAA
U;AACvB,WAAO,EAAE,OAAO,MAAM,QAAQ,SAAS,UAAU;AAAA,EAClD;AAEA,MAAI,gBAAgB,SAAS,OAAO,GAAG;AACtC,WAAO,EAAE,OAAO,MAAM,QAAQ,iBAAiB,UAAU;AAAA,EAC1D;AAEA,SAAO,EAAE,OAAO,OAAO,QAAQ,MAAM,UAAU;AAChD;;;ACnCO,SAAS,yBAAyB,MAAc,SAAmC;AACzF,QAAM,UAAU,SAAS,cAAc;AACvC,QAAM,QAAQ,SAAS,YAAY;AACnC,QAAM,OAAO,SAAS,WAAW;AAEjC,SAAO;AAAA;AAAA,YAEI,IAAI;AAAA;AAAA;AAAA;AAAA,8BAIc,OAAO,KAAK,IAAI,GAAG,OAAO;AAAA,iCACvB,KAAK,KAAK,IAAI,GAAG,KAAK;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAOvD;","names":[]}
@@ -0,0 +1,82 @@
1
/**
 * Which detection layer of `isAIAgent` produced a positive result:
 * - "ua-match": the User-Agent contained a known AI-agent substring.
 * - "signature-agent": the Signature-Agent header contained a known domain.
 * - "heuristic": no sec-fetch-mode header, a bot-like User-Agent, and not a
 *   traditional bot.
 */
+ type DetectionMethod = "ua-match" | "signature-agent" | "heuristic";
2
/**
 * Result of `isAIAgent`, discriminated on `detected`: a positive result always
 * carries the method that fired; a negative result has `method: null`.
 */
+ type DetectionResult = {
3
+ detected: true;
4
+ method: DetectionMethod;
5
+ } | {
6
+ detected: false;
7
+ method: null;
8
+ };
9
+ /**
10
+ * Minimal request interface. Works with NextRequest, Request, or any object
11
+ * that has a headers.get() method.
12
+ */
13
+ interface MinimalRequest {
14
+ headers: {
15
+ get(name: string): string | null;
16
+ };
17
+ }
18
+
19
+ /**
20
+ * Detects AI agents from HTTP request headers.
21
+ *
22
+ * Three detection layers (checked in order):
23
+ * 1. Known UA patterns (definitive)
24
+ * 2. Signature-Agent header (definitive, RFC 9421)
25
+ * 3. Missing sec-fetch-mode heuristic (catches unknown bots)
26
+ *
27
+ * Optimizes for recall over precision: serving markdown to a non-AI bot
28
+ * is low-harm; missing an AI agent means a worse experience.
29
+ */
30
+ declare function isAIAgent(request: MinimalRequest): DetectionResult;
31
+
32
/** Options for `acceptsMarkdown` (also accepted by `shouldServeMarkdown`). */
+ interface AcceptMarkdownOptions {
33
/** Media types to look for in the Accept header; when omitted, `text/markdown` and `text/x-markdown` are used. */
+ mediaTypes?: string[];
34
+ }
35
+ /**
36
+ * Check if the request prefers markdown via the Accept header.
37
+ */
38
+ declare function acceptsMarkdown(request: MinimalRequest, options?: AcceptMarkdownOptions): boolean;
39
/** Outcome of `shouldServeMarkdown`. */
+ interface ShouldServeMarkdownResult {
40
/** `true` when an agent was detected or the Accept header matched. */
+ serve: boolean;
41
/** Why markdown is served: "agent" takes precedence over "accept-header"; `null` when `serve` is `false`. */
+ reason: "agent" | "accept-header" | null;
42
/** The `isAIAgent` result for the request, included whatever the decision was. */
+ detection: DetectionResult;
43
+ }
44
+ /**
45
+ * Combines agent detection and content negotiation into one call.
46
+ * Returns whether to serve markdown and why.
47
+ */
48
+ declare function shouldServeMarkdown(request: MinimalRequest, options?: AcceptMarkdownOptions): ShouldServeMarkdownResult;
49
+
50
/** Options for `generateNotFoundMarkdown`. */
+ interface NotFoundOptions {
51
/** Link to the sitemap; defaults to "/sitemap.md". */
+ sitemapUrl?: string;
52
/** Link to the full documentation index; defaults to "/llms.txt". */
+ indexUrl?: string;
53
/** Base URL placed in front of the sitemap and index link targets; defaults to "". */
+ baseUrl?: string;
54
+ }
55
+ /**
56
+ * Generates a markdown body for missing pages with links to discovery endpoints.
57
+ * Return with a 200 status (agents discard 404 response bodies).
58
+ */
59
+ declare function generateNotFoundMarkdown(path: string, options?: NotFoundOptions): string;
60
+
61
+ /**
62
+ * Layer 1: Known AI agent UA substrings (lowercase).
63
+ * Curated from https://bots.fyi/?tags=ai_assistant + official vendor docs.
64
+ * Last reviewed: 2026-03-20
65
+ */
66
+ declare const AI_AGENT_UA_PATTERNS: readonly string[];
67
+ /**
68
+ * Layer 2: Known AI service URLs in Signature-Agent header (RFC 9421).
69
+ */
70
+ declare const SIGNATURE_AGENT_DOMAINS: readonly string[];
71
+ /**
72
+ * Layer 3: Traditional bot exclusion list. Bots that should NOT trigger the
73
+ * heuristic layer (search engine crawlers, social previews, monitoring tools).
74
+ */
75
+ declare const TRADITIONAL_BOT_PATTERNS: readonly string[];
76
+ /**
77
+ * Broad regex for bot-like UA strings (used only in Layer 3 heuristic).
78
+ * No word boundaries — keywords commonly appear in compound names.
79
+ */
80
+ declare const BOT_LIKE_REGEX: RegExp;
81
+
82
+ export { AI_AGENT_UA_PATTERNS, type AcceptMarkdownOptions, BOT_LIKE_REGEX, type DetectionMethod, type DetectionResult, type MinimalRequest, type NotFoundOptions, SIGNATURE_AGENT_DOMAINS, type ShouldServeMarkdownResult, TRADITIONAL_BOT_PATTERNS, acceptsMarkdown, generateNotFoundMarkdown, isAIAgent, shouldServeMarkdown };
@@ -0,0 +1,82 @@
1
/**
 * Which detection layer of `isAIAgent` produced a positive result:
 * - "ua-match": the User-Agent contained a known AI-agent substring.
 * - "signature-agent": the Signature-Agent header contained a known domain.
 * - "heuristic": no sec-fetch-mode header, a bot-like User-Agent, and not a
 *   traditional bot.
 */
+ type DetectionMethod = "ua-match" | "signature-agent" | "heuristic";
2
/**
 * Result of `isAIAgent`, discriminated on `detected`: a positive result always
 * carries the method that fired; a negative result has `method: null`.
 */
+ type DetectionResult = {
3
+ detected: true;
4
+ method: DetectionMethod;
5
+ } | {
6
+ detected: false;
7
+ method: null;
8
+ };
9
+ /**
10
+ * Minimal request interface. Works with NextRequest, Request, or any object
11
+ * that has a headers.get() method.
12
+ */
13
+ interface MinimalRequest {
14
+ headers: {
15
+ get(name: string): string | null;
16
+ };
17
+ }
18
+
19
+ /**
20
+ * Detects AI agents from HTTP request headers.
21
+ *
22
+ * Three detection layers (checked in order):
23
+ * 1. Known UA patterns (definitive)
24
+ * 2. Signature-Agent header (definitive, RFC 9421)
25
+ * 3. Missing sec-fetch-mode heuristic (catches unknown bots)
26
+ *
27
+ * Optimizes for recall over precision: serving markdown to a non-AI bot
28
+ * is low-harm; missing an AI agent means a worse experience.
29
+ */
30
+ declare function isAIAgent(request: MinimalRequest): DetectionResult;
31
+
32
/** Options for `acceptsMarkdown` (also accepted by `shouldServeMarkdown`). */
+ interface AcceptMarkdownOptions {
33
/** Media types to look for in the Accept header; when omitted, `text/markdown` and `text/x-markdown` are used. */
+ mediaTypes?: string[];
34
+ }
35
+ /**
36
+ * Check if the request prefers markdown via the Accept header.
37
+ */
38
+ declare function acceptsMarkdown(request: MinimalRequest, options?: AcceptMarkdownOptions): boolean;
39
/** Outcome of `shouldServeMarkdown`. */
+ interface ShouldServeMarkdownResult {
40
/** `true` when an agent was detected or the Accept header matched. */
+ serve: boolean;
41
/** Why markdown is served: "agent" takes precedence over "accept-header"; `null` when `serve` is `false`. */
+ reason: "agent" | "accept-header" | null;
42
/** The `isAIAgent` result for the request, included whatever the decision was. */
+ detection: DetectionResult;
43
+ }
44
+ /**
45
+ * Combines agent detection and content negotiation into one call.
46
+ * Returns whether to serve markdown and why.
47
+ */
48
+ declare function shouldServeMarkdown(request: MinimalRequest, options?: AcceptMarkdownOptions): ShouldServeMarkdownResult;
49
+
50
/** Options for `generateNotFoundMarkdown`. */
+ interface NotFoundOptions {
51
/** Link to the sitemap; defaults to "/sitemap.md". */
+ sitemapUrl?: string;
52
/** Link to the full documentation index; defaults to "/llms.txt". */
+ indexUrl?: string;
53
/** Base URL placed in front of the sitemap and index link targets; defaults to "". */
+ baseUrl?: string;
54
+ }
55
+ /**
56
+ * Generates a markdown body for missing pages with links to discovery endpoints.
57
+ * Return with a 200 status (agents discard 404 response bodies).
58
+ */
59
+ declare function generateNotFoundMarkdown(path: string, options?: NotFoundOptions): string;
60
+
61
+ /**
62
+ * Layer 1: Known AI agent UA substrings (lowercase).
63
+ * Curated from https://bots.fyi/?tags=ai_assistant + official vendor docs.
64
+ * Last reviewed: 2026-03-20
65
+ */
66
+ declare const AI_AGENT_UA_PATTERNS: readonly string[];
67
+ /**
68
+ * Layer 2: Known AI service URLs in Signature-Agent header (RFC 9421).
69
+ */
70
+ declare const SIGNATURE_AGENT_DOMAINS: readonly string[];
71
+ /**
72
+ * Layer 3: Traditional bot exclusion list. Bots that should NOT trigger the
73
+ * heuristic layer (search engine crawlers, social previews, monitoring tools).
74
+ */
75
+ declare const TRADITIONAL_BOT_PATTERNS: readonly string[];
76
+ /**
77
+ * Broad regex for bot-like UA strings (used only in Layer 3 heuristic).
78
+ * No word boundaries — keywords commonly appear in compound names.
79
+ */
80
+ declare const BOT_LIKE_REGEX: RegExp;
81
+
82
+ export { AI_AGENT_UA_PATTERNS, type AcceptMarkdownOptions, BOT_LIKE_REGEX, type DetectionMethod, type DetectionResult, type MinimalRequest, type NotFoundOptions, SIGNATURE_AGENT_DOMAINS, type ShouldServeMarkdownResult, TRADITIONAL_BOT_PATTERNS, acceptsMarkdown, generateNotFoundMarkdown, isAIAgent, shouldServeMarkdown };
package/dist/index.js ADDED
@@ -0,0 +1,141 @@
1
+ // src/patterns.ts
2
// Layer 1 of detection: known AI-agent User-Agent substrings.
// Entries are lowercase because isAIAgent lower-cases the User-Agent and tests
// each entry with String.prototype.includes (substring match, not whole-token).
+ var AI_AGENT_UA_PATTERNS = [
3
+ // Anthropic — https://support.claude.com/en/articles/8896518
4
+ "claudebot",
5
+ "claude-searchbot",
6
+ "claude-user",
7
+ "anthropic-ai",
8
+ "claude-web",
9
+ // OpenAI — https://platform.openai.com/docs/bots
10
+ "chatgpt",
11
+ "gptbot",
12
+ "oai-searchbot",
13
+ "openai",
14
+ // Google AI
15
+ "gemini",
16
+ "bard",
17
+ "google-cloudvertexbot",
18
+ "google-extended",
19
+ // Meta
20
+ "meta-externalagent",
21
+ "meta-externalfetcher",
22
+ "meta-webindexer",
23
+ // Search/Research AI
24
+ "perplexity",
25
+ "youbot",
26
+ "you.com",
27
+ "deepseekbot",
28
+ // Coding assistants
29
+ "cursor",
30
+ "github-copilot",
31
+ "codeium",
32
+ "tabnine",
33
+ "sourcegraph",
34
+ // Other AI agents / data scrapers
35
+ "cohere-ai",
36
+ "bytespider",
37
+ "amazonbot",
38
+ "ai2bot",
39
+ "diffbot",
40
+ "omgili",
41
// NOTE(review): already covered by the "omgili" substring entry above.
+ "omgilibot"
42
+ ];
43
// Layer 2 of detection: lowercase domains searched for (as substrings) in the
// lower-cased Signature-Agent request header.
+ var SIGNATURE_AGENT_DOMAINS = ["chatgpt.com"];
44
// Layer 3 exclusion list: lowercase User-Agent substrings of search-engine
// crawlers, social-preview fetchers and monitoring tools. A User-Agent that
// contains any of these is not reported by the heuristic layer of isAIAgent.
+ var TRADITIONAL_BOT_PATTERNS = [
45
+ "googlebot",
46
+ "bingbot",
47
+ "yandexbot",
48
+ "baiduspider",
49
+ "duckduckbot",
50
+ "slurp",
51
+ "msnbot",
52
+ "facebot",
53
+ "twitterbot",
54
+ "linkedinbot",
55
+ "whatsapp",
56
+ "telegrambot",
57
+ "pingdom",
58
+ "uptimerobot",
59
+ "newrelic",
60
+ "datadog",
61
+ "statuspage",
62
+ "site24x7",
63
+ "applebot"
64
+ ];
65
// Broad, case-insensitive test for bot-like User-Agent strings; used only by the
// Layer 3 heuristic. No word boundaries, so keywords also match inside compound names.
+ var BOT_LIKE_REGEX = /bot|agent|fetch|crawl|spider|search/i;
66
+
67
+ // src/detection.ts
68
/**
 * Classifies a request as coming from an AI agent by inspecting its headers.
 *
 * Checks run in order and stop at the first hit:
 *   1. "ua-match"        – the lower-cased User-Agent contains an entry of
 *                          AI_AGENT_UA_PATTERNS.
 *   2. "signature-agent" – the lower-cased Signature-Agent header contains an
 *                          entry of SIGNATURE_AGENT_DOMAINS.
 *   3. "heuristic"       – there is no sec-fetch-mode header, the User-Agent
 *                          matches BOT_LIKE_REGEX, and it is not one of the
 *                          TRADITIONAL_BOT_PATTERNS.
 *
 * @param request - Any object exposing `headers.get(name)`.
 * @returns `{ detected: true, method }` for a hit, otherwise
 *   `{ detected: false, method: null }`.
 */
function isAIAgent(request) {
  const ua = (request.headers.get("user-agent") ?? "").toLowerCase();
  const uaContains = (needle) => ua.includes(needle);

  // Layer 1: known AI user agents.
  if (ua !== "" && AI_AGENT_UA_PATTERNS.some(uaContains)) {
    return { detected: true, method: "ua-match" };
  }

  // Layer 2: Signature-Agent header naming a known AI service.
  const signature = request.headers.get("signature-agent")?.toLowerCase();
  if (signature && SIGNATURE_AGENT_DOMAINS.some((domain) => signature.includes(domain))) {
    return { detected: true, method: "signature-agent" };
  }

  // Layer 3: bot-like user agent without the sec-fetch-mode header.
  const hasFetchMode = Boolean(request.headers.get("sec-fetch-mode"));
  const looksAutomated = !hasFetchMode && ua !== "" && BOT_LIKE_REGEX.test(ua);
  if (looksAutomated && !TRADITIONAL_BOT_PATTERNS.some(uaContains)) {
    return { detected: true, method: "heuristic" };
  }

  return { detected: false, method: null };
}
90
+
91
+ // src/negotiation.ts
92
// Media types looked for when the caller does not supply `options.mediaTypes`.
var DEFAULT_MARKDOWN_TYPES = ["text/markdown", "text/x-markdown"];
/**
 * Reports whether the request's Accept header asks for markdown.
 *
 * The header is split into comma-separated media ranges. A range counts when
 * its media type (the part before any `;` parameters) contains one of the
 * wanted types and the range is not explicitly refused with a zero quality
 * value (`q=0`). Both the header and the wanted types are lower-cased before
 * comparison, so custom `mediaTypes` may be written in any casing.
 *
 * @param request - Object exposing `headers.get(name)`; only "accept" is read.
 * @param options - Optional `{ mediaTypes }` overriding the default list
 *   (`text/markdown`, `text/x-markdown`).
 * @returns `true` when at least one non-refused media range matches; `false`
 *   when the header is missing, empty, or nothing matches.
 */
function acceptsMarkdown(request, options) {
  const accept = request.headers.get("accept");
  if (!accept) return false;
  const wanted = (options?.mediaTypes ?? DEFAULT_MARKDOWN_TYPES).map((type) => type.trim().toLowerCase());
  return accept.toLowerCase().split(",").some((range) => {
    const [mediaType, ...params] = range.split(";").map((part) => part.trim());
    if (!wanted.some((type) => mediaType.includes(type))) return false;
    const quality = params.find((param) => /^q\s*=/.test(param));
    if (quality === undefined) return true;
    const weight = Number.parseFloat(quality.slice(quality.indexOf("=") + 1));
    // An unparsable weight is treated as acceptable; only an explicit zero refuses.
    return Number.isNaN(weight) || weight > 0;
  });
}
100
/**
 * Decides in one call whether a markdown response should be served.
 *
 * Agent detection runs first; the Accept header is consulted only when no
 * agent was detected.
 *
 * @param request - Any object exposing `headers.get(name)`.
 * @param options - Forwarded unchanged to `acceptsMarkdown`.
 * @returns `{ serve, reason, detection }` where `reason` is "agent",
 *   "accept-header", or `null` (with `serve: false`), and `detection` is the
 *   result returned by `isAIAgent`.
 */
function shouldServeMarkdown(request, options) {
  const detection = isAIAgent(request);
  let reason = null;
  if (detection.detected) {
    reason = "agent";
  } else if (acceptsMarkdown(request, options)) {
    reason = "accept-header";
  }
  return { serve: reason !== null, reason, detection };
}
110
+
111
+ // src/not-found.ts
112
/**
 * Builds the markdown body returned for a missing page, pointing the reader at
 * the sitemap and the documentation index.
 *
 * The original source comments recommend sending this body with a 200 status.
 *
 * @param path - The requested path. It is request-controlled, so backticks and
 *   control characters (including newlines) are percent-encoded before the path
 *   is placed inside the inline code span; otherwise they could close the span
 *   and inject arbitrary markdown. Ordinary paths are emitted unchanged.
 * @param options - Optional `sitemapUrl` (default "/sitemap.md"), `indexUrl`
 *   (default "/llms.txt") and `baseUrl` (default "").
 * @returns The markdown document as a string, ending with a newline.
 */
function generateNotFoundMarkdown(path, options) {
  const sitemap = options?.sitemapUrl ?? "/sitemap.md";
  const index = options?.indexUrl ?? "/llms.txt";
  const base = options?.baseUrl ?? "";
  const safePath = String(path).replace(/[`\u0000-\u001f\u007f]/g, (ch) => encodeURIComponent(ch));
  // Joins `base` and `url` with exactly one slash. URLs that already carry a
  // scheme (or are protocol-relative) are returned untouched, as is every URL
  // when no base is configured.
  const toHref = (url) => {
    if (base === "" || /^(?:[a-z][a-z0-9+.-]*:|\/\/)/i.test(url)) return url;
    return `${base.replace(/\/+$/, "")}/${url.replace(/^\/+/, "")}`;
  };
  return `# Page Not Found

The URL \`${safePath}\` does not exist.

## How to find the correct page

1. **Browse the sitemap**: [${sitemap}](${toHref(sitemap)}) - A structured index of all pages
2. **Browse the full index**: [${index}](${toHref(index)}) - Complete documentation index

## Tips for requesting documentation

- For markdown responses, append \`.md\` to URLs (e.g., \`/docs/functions.md\`)
- Use \`Accept: text/markdown\` header for content negotiation
`;
}
131
+ export {
132
+ AI_AGENT_UA_PATTERNS,
133
+ BOT_LIKE_REGEX,
134
+ SIGNATURE_AGENT_DOMAINS,
135
+ TRADITIONAL_BOT_PATTERNS,
136
+ acceptsMarkdown,
137
+ generateNotFoundMarkdown,
138
+ isAIAgent,
139
+ shouldServeMarkdown
140
+ };
141
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../src/patterns.ts","../src/detection.ts","../src/negotiation.ts","../src/not-found.ts"],"sourcesContent":["/**\n * Layer 1: Known AI agent UA substrings (lowercase).\n * Curated from https://bots.fyi/?tags=ai_assistant + official vendor docs.\n * Last reviewed: 2026-03-20\n */\nexport const AI_AGENT_UA_PATTERNS: readonly string[] = [\n\t// Anthropic — https://support.claude.com/en/articles/8896518\n\t\"claudebot\",\n\t\"claude-searchbot\",\n\t\"claude-user\",\n\t\"anthropic-ai\",\n\t\"claude-web\",\n\n\t// OpenAI — https://platform.openai.com/docs/bots\n\t\"chatgpt\",\n\t\"gptbot\",\n\t\"oai-searchbot\",\n\t\"openai\",\n\n\t// Google AI\n\t\"gemini\",\n\t\"bard\",\n\t\"google-cloudvertexbot\",\n\t\"google-extended\",\n\n\t// Meta\n\t\"meta-externalagent\",\n\t\"meta-externalfetcher\",\n\t\"meta-webindexer\",\n\n\t// Search/Research AI\n\t\"perplexity\",\n\t\"youbot\",\n\t\"you.com\",\n\t\"deepseekbot\",\n\n\t// Coding assistants\n\t\"cursor\",\n\t\"github-copilot\",\n\t\"codeium\",\n\t\"tabnine\",\n\t\"sourcegraph\",\n\n\t// Other AI agents / data scrapers\n\t\"cohere-ai\",\n\t\"bytespider\",\n\t\"amazonbot\",\n\t\"ai2bot\",\n\t\"diffbot\",\n\t\"omgili\",\n\t\"omgilibot\",\n];\n\n/**\n * Layer 2: Known AI service URLs in Signature-Agent header (RFC 9421).\n */\nexport const SIGNATURE_AGENT_DOMAINS: readonly string[] = [\"chatgpt.com\"];\n\n/**\n * Layer 3: Traditional bot exclusion list. 
Bots that should NOT trigger the\n * heuristic layer (search engine crawlers, social previews, monitoring tools).\n */\nexport const TRADITIONAL_BOT_PATTERNS: readonly string[] = [\n\t\"googlebot\",\n\t\"bingbot\",\n\t\"yandexbot\",\n\t\"baiduspider\",\n\t\"duckduckbot\",\n\t\"slurp\",\n\t\"msnbot\",\n\t\"facebot\",\n\t\"twitterbot\",\n\t\"linkedinbot\",\n\t\"whatsapp\",\n\t\"telegrambot\",\n\t\"pingdom\",\n\t\"uptimerobot\",\n\t\"newrelic\",\n\t\"datadog\",\n\t\"statuspage\",\n\t\"site24x7\",\n\t\"applebot\",\n];\n\n/**\n * Broad regex for bot-like UA strings (used only in Layer 3 heuristic).\n * No word boundaries — keywords commonly appear in compound names.\n */\nexport const BOT_LIKE_REGEX: RegExp = /bot|agent|fetch|crawl|spider|search/i;\n","import {\n\tAI_AGENT_UA_PATTERNS,\n\tBOT_LIKE_REGEX,\n\tSIGNATURE_AGENT_DOMAINS,\n\tTRADITIONAL_BOT_PATTERNS,\n} from \"./patterns\";\nimport type { DetectionResult, MinimalRequest } from \"./types\";\n\n/**\n * Detects AI agents from HTTP request headers.\n *\n * Three detection layers (checked in order):\n * 1. Known UA patterns (definitive)\n * 2. Signature-Agent header (definitive, RFC 9421)\n * 3. Missing sec-fetch-mode heuristic (catches unknown bots)\n *\n * Optimizes for recall over precision: serving markdown to a non-AI bot\n * is low-harm; missing an AI agent means a worse experience.\n */\nexport function isAIAgent(request: MinimalRequest): DetectionResult {\n\tconst userAgent = request.headers.get(\"user-agent\");\n\tconst lowerUA = userAgent?.toLowerCase() ?? 
\"\";\n\n\t// Layer 1: Known UA pattern match\n\tif (lowerUA && AI_AGENT_UA_PATTERNS.some((pattern) => lowerUA.includes(pattern))) {\n\t\treturn { detected: true, method: \"ua-match\" };\n\t}\n\n\t// Layer 2: Signature-Agent header (RFC 9421, used by ChatGPT agent)\n\tconst signatureAgent = request.headers.get(\"signature-agent\");\n\tif (signatureAgent) {\n\t\tconst lowerSig = signatureAgent.toLowerCase();\n\t\tif (SIGNATURE_AGENT_DOMAINS.some((domain) => lowerSig.includes(domain))) {\n\t\t\treturn { detected: true, method: \"signature-agent\" };\n\t\t}\n\t}\n\n\t// Layer 3: Missing browser fingerprint heuristic\n\t// Real browsers (Chrome 76+, Firefox 90+, Safari 16.4+) send sec-fetch-mode\n\t// on navigation requests. Its absence signals a programmatic client.\n\tconst secFetchMode = request.headers.get(\"sec-fetch-mode\");\n\tif (!secFetchMode && lowerUA && BOT_LIKE_REGEX.test(lowerUA)) {\n\t\tconst isTraditionalBot = TRADITIONAL_BOT_PATTERNS.some((pattern) => lowerUA.includes(pattern));\n\t\tif (!isTraditionalBot) {\n\t\t\treturn { detected: true, method: \"heuristic\" };\n\t\t}\n\t}\n\n\treturn { detected: false, method: null };\n}\n","import { isAIAgent } from \"./detection\";\nimport type { DetectionResult, MinimalRequest } from \"./types\";\n\nconst DEFAULT_MARKDOWN_TYPES = [\"text/markdown\", \"text/x-markdown\"];\n\nexport interface AcceptMarkdownOptions {\n\tmediaTypes?: string[];\n}\n\n/**\n * Check if the request prefers markdown via the Accept header.\n */\nexport function acceptsMarkdown(request: MinimalRequest, options?: AcceptMarkdownOptions): boolean {\n\tconst accept = request.headers.get(\"accept\");\n\tif (!accept) return false;\n\n\tconst types = options?.mediaTypes ?? 
DEFAULT_MARKDOWN_TYPES;\n\tconst lowerAccept = accept.toLowerCase();\n\treturn types.some((type) => lowerAccept.includes(type));\n}\n\nexport interface ShouldServeMarkdownResult {\n\tserve: boolean;\n\treason: \"agent\" | \"accept-header\" | null;\n\tdetection: DetectionResult;\n}\n\n/**\n * Combines agent detection and content negotiation into one call.\n * Returns whether to serve markdown and why.\n */\nexport function shouldServeMarkdown(\n\trequest: MinimalRequest,\n\toptions?: AcceptMarkdownOptions,\n): ShouldServeMarkdownResult {\n\tconst detection = isAIAgent(request);\n\tif (detection.detected) {\n\t\treturn { serve: true, reason: \"agent\", detection };\n\t}\n\n\tif (acceptsMarkdown(request, options)) {\n\t\treturn { serve: true, reason: \"accept-header\", detection };\n\t}\n\n\treturn { serve: false, reason: null, detection };\n}\n","export interface NotFoundOptions {\n\tsitemapUrl?: string;\n\tindexUrl?: string;\n\tbaseUrl?: string;\n}\n\n/**\n * Generates a markdown body for missing pages with links to discovery endpoints.\n * Return with a 200 status (agents discard 404 response bodies).\n */\nexport function generateNotFoundMarkdown(path: string, options?: NotFoundOptions): string {\n\tconst sitemap = options?.sitemapUrl ?? \"/sitemap.md\";\n\tconst index = options?.indexUrl ?? \"/llms.txt\";\n\tconst base = options?.baseUrl ?? \"\";\n\n\treturn `# Page Not Found\n\nThe URL \\`${path}\\` does not exist.\n\n## How to find the correct page\n\n1. **Browse the sitemap**: [${sitemap}](${base}${sitemap}) - A structured index of all pages\n2. 
**Browse the full index**: [${index}](${base}${index}) - Complete documentation index\n\n## Tips for requesting documentation\n\n- For markdown responses, append \\`.md\\` to URLs (e.g., \\`/docs/functions.md\\`)\n- Use \\`Accept: text/markdown\\` header for content negotiation\n`;\n}\n"],"mappings":";AAKO,IAAM,uBAA0C;AAAA;AAAA,EAEtD;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA;AAAA,EAGA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA;AAAA,EAGA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA;AAAA,EAGA;AAAA,EACA;AAAA,EACA;AAAA;AAAA,EAGA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA;AAAA,EAGA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA;AAAA,EAGA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACD;AAKO,IAAM,0BAA6C,CAAC,aAAa;AAMjE,IAAM,2BAA8C;AAAA,EAC1D;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACD;AAMO,IAAM,iBAAyB;;;ACrE/B,SAAS,UAAU,SAA0C;AACnE,QAAM,YAAY,QAAQ,QAAQ,IAAI,YAAY;AAClD,QAAM,UAAU,WAAW,YAAY,KAAK;AAG5C,MAAI,WAAW,qBAAqB,KAAK,CAAC,YAAY,QAAQ,SAAS,OAAO,CAAC,GAAG;AACjF,WAAO,EAAE,UAAU,MAAM,QAAQ,WAAW;AAAA,EAC7C;AAGA,QAAM,iBAAiB,QAAQ,QAAQ,IAAI,iBAAiB;AAC5D,MAAI,gBAAgB;AACnB,UAAM,WAAW,eAAe,YAAY;AAC5C,QAAI,wBAAwB,KAAK,CAAC,WAAW,SAAS,SAAS,MAAM,CAAC,GAAG;AACxE,aAAO,EAAE,UAAU,MAAM,QAAQ,kBAAkB;AAAA,IACpD;AAAA,EACD;AAKA,QAAM,eAAe,QAAQ,QAAQ,IAAI,gBAAgB;AACzD,MAAI,CAAC,gBAAgB,WAAW,eAAe,KAAK,OAAO,GAAG;AAC7D,UAAM,mBAAmB,yBAAyB,KAAK,CAAC,YAAY,QAAQ,SAAS,OAAO,CAAC;AAC7F,QAAI,CAAC,kBAAkB;AACtB,aAAO,EAAE,UAAU,MAAM,QAAQ,YAAY;AAAA,IAC9C;AAAA,EACD;AAEA,SAAO,EAAE,UAAU,OAAO,QAAQ,KAAK;AACxC;;;AC9CA,IAAM,yBAAyB,CAAC,iBAAiB,iBAAiB;AAS3D,SAAS,gBAAgB,SAAyB,SAA0C;AAClG,QAAM,SAAS,QAAQ,QAAQ,IAAI,QAAQ;AAC3C,MAAI,CAAC,OAAQ,QAAO;AAEpB,QAAM,QAAQ,SAAS,cAAc;AACrC,QAAM,cAAc,OAAO,YAAY;AACvC,SAAO,MAAM,KAAK,CAAC,SAAS,YAAY,SAAS,IAAI,CAAC;AACvD;AAYO,SAAS,oBACf,SACA,SAC4B;AAC5B,QAAM,YAAY,UAAU,OAAO;AACnC,MAAI,UAAU,UAAU;AACvB,WAAO,EAAE,OAAO,MAAM,QAAQ,SAAS,UAAU;AAAA,EAClD;AAEA,MAAI,gBAAgB,SAAS,OAAO,
GAAG;AACtC,WAAO,EAAE,OAAO,MAAM,QAAQ,iBAAiB,UAAU;AAAA,EAC1D;AAEA,SAAO,EAAE,OAAO,OAAO,QAAQ,MAAM,UAAU;AAChD;;;ACnCO,SAAS,yBAAyB,MAAc,SAAmC;AACzF,QAAM,UAAU,SAAS,cAAc;AACvC,QAAM,QAAQ,SAAS,YAAY;AACnC,QAAM,OAAO,SAAS,WAAW;AAEjC,SAAO;AAAA;AAAA,YAEI,IAAI;AAAA;AAAA;AAAA;AAAA,8BAIc,OAAO,KAAK,IAAI,GAAG,OAAO;AAAA,iCACvB,KAAK,KAAK,IAAI,GAAG,KAAK;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAOvD;","names":[]}