@agent-seo/core 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/dist/edge-AywqjCEh.d.cts +198 -0
- package/dist/edge-AywqjCEh.d.ts +198 -0
- package/dist/edge.cjs +247 -0
- package/dist/edge.cjs.map +1 -0
- package/dist/edge.d.cts +1 -0
- package/dist/edge.d.ts +1 -0
- package/dist/edge.js +214 -0
- package/dist/edge.js.map +1 -0
- package/dist/index.cjs +918 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +108 -0
- package/dist/index.d.ts +108 -0
- package/dist/index.js +867 -0
- package/dist/index.js.map +1 -0
- package/package.json +75 -0
package/LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2026 Pontiggia
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
package/dist/edge-AywqjCEh.d.cts
ADDED
@@ -0,0 +1,198 @@
+type BotPurpose = 'training' | 'search' | 'agent-browsing' | 'unknown';
+interface BotInfo {
+    /** The canonical name of the bot (e.g., "GPTBot", "ClaudeBot") */
+    name: string;
+    /** The organization operating the bot */
+    operator: string;
+    /** What the bot is doing: training data collection, search indexing, or live agent browsing */
+    purpose: BotPurpose;
+    /** Whether this bot renders JavaScript */
+    rendersJs: boolean;
+}
+interface AIRequestContext {
+    /** Whether the request comes from a known AI bot */
+    isAIBot: boolean;
+    /** Details about the detected bot, or null if not an AI bot */
+    bot: BotInfo | null;
+    /** Whether the client explicitly requested Markdown via Accept header */
+    wantsMarkdown: boolean;
+}
+interface TransformOptions {
+    /** URL of the page being transformed (used for resolving relative links) */
+    url?: string;
+    /** Maximum token budget. If exceeded, content is truncated intelligently. Default: no limit */
+    tokenBudget?: number;
+    /** Extract JSON-LD blocks from the HTML. Default: true */
+    extractJsonLd?: boolean;
+    /** Additional CSS selectors to strip (merged with defaults) */
+    stripSelectors?: string[];
+    /** CSS selectors to preserve even if they'd normally be stripped */
+    preserveSelectors?: string[];
+    /** Whether to include a YAML frontmatter block with metadata. Default: true */
+    frontmatter?: boolean;
+    /** Custom Turndown rules to add */
+    turndownRules?: TurndownRule[];
+}
+interface TurndownRule {
+    name: string;
+    filter: string | string[] | ((node: Node) => boolean);
+    replacement: (content: string, node: Node) => string;
+}
+interface TransformResult {
+    /** The clean Markdown output */
+    markdown: string;
+    /** Estimated token count (chars / 4 heuristic) */
+    tokenEstimate: number;
+    /** Page title extracted from <title> or <h1> */
+    title: string;
+    /** Meta description */
+    description: string;
+    /** Extracted JSON-LD objects from the page */
+    jsonLd: Record<string, unknown>[];
+    /** Canonical URL if found */
+    canonicalUrl: string | null;
+    /** ISO date string of last modification if available */
+    lastModified: string | null;
+    /** Language of the document */
+    lang: string | null;
+}
+interface LlmsTxtRoute {
+    /** The URL path (e.g., "/docs/getting-started") */
+    path: string;
+    /** Human-readable title for this page */
+    title: string;
+    /** One-line description */
+    description?: string;
+    /** Section grouping (e.g., "Documentation", "API Reference", "Blog") */
+    section?: string;
+}
+interface LlmsTxtOptions {
+    /** Site name (used as H1 in llms.txt) */
+    siteName: string;
+    /** One-paragraph site description (used as blockquote) */
+    siteDescription: string;
+    /** The base URL of the site (e.g., "https://example.com") */
+    baseUrl: string;
+    /**
+     * Routes to include in llms.txt.
+     * - `LlmsTxtRoute[]`: explicit list of routes
+     * - Omit to require routes to be passed to `generateLlmsTxt()` directly
+     */
+    routes?: LlmsTxtRoute[];
+    /** Sections to always exclude from llms.txt (e.g., ["/admin", "/internal"]) */
+    excludePatterns?: string[];
+    /** File extension for Markdown alternates. Default: ".md" */
+    markdownExtension?: string;
+}
+interface LlmsTxtResult {
+    /** The generated llms.txt content */
+    llmsTxt: string;
+    /** The generated llms-full.txt content (all pages concatenated) */
+    llmsFullTxt: string;
+    /** Number of routes included */
+    routeCount: number;
+}
+interface AgentSeoHeaders {
+    'Content-Type': string;
+    'Content-Disposition'?: string;
+    Vary: string;
+    'X-Markdown-Tokens': string;
+    'X-Robots-Tag'?: string;
+    'Content-Signal'?: string;
+    Link?: string;
+}
+interface AgentSeoOptions {
+    /** Site name for llms.txt generation */
+    siteName: string;
+    /** Brief site description for LLMs */
+    siteDescription: string;
+    /** Base URL of the site (e.g., "https://example.com") */
+    baseUrl: string;
+    /**
+     * Which paths to transform. Default: all paths.
+     * Use glob patterns: ["/docs/**", "/blog/**"]
+     * Or a function: (path: string) => boolean
+     */
+    include?: string[] | ((path: string) => boolean);
+    /**
+     * Which paths to never transform.
+     * Default: ["/api/**", "/_next/**", "/static/**", "/assets/**"]
+     */
+    exclude?: string[];
+    /** Transform pipeline options */
+    transform?: Omit<TransformOptions, 'url'>;
+    /**
+     * llms.txt generation options.
+     * Routes can be provided explicitly via `llmsTxt.routes`.
+     * For Next.js, use `appDir` on `createLlmsTxtHandler` for automatic route discovery.
+     */
+    llmsTxt?: Partial<LlmsTxtOptions>;
+    /** Cache options */
+    cache?: {
+        /** Enable caching. Default: true */
+        enabled?: boolean;
+        /** Cache TTL in milliseconds. Default: 300_000 (5 minutes) */
+        ttl?: number;
+        /** Max cache entries. Default: 100 */
+        maxEntries?: number;
+    };
+    /** Content-Signal header values. Default: all true */
+    contentSignal?: {
+        aiTrain?: boolean;
+        search?: boolean;
+        aiInput?: boolean;
+    };
+}
+
+interface BotEntry {
+    /** Regex pattern to match against User-Agent string */
+    pattern: RegExp;
+    /** Bot metadata */
+    info: BotInfo;
+}
+declare const AI_BOT_REGISTRY: BotEntry[];
+/**
+ * Detect if an incoming request is from an AI bot.
+ *
+ * @param userAgent - The User-Agent header value
+ * @param acceptHeader - The Accept header value (optional)
+ * @returns AIRequestContext with bot detection results
+ */
+declare function detectAgent(userAgent: string | null | undefined, acceptHeader?: string | null): AIRequestContext;
+/**
+ * Check if a request should receive Markdown.
+ * Returns true if: the request is from a known AI bot OR the Accept header requests text/markdown.
+ */
+declare function shouldServeMarkdown(userAgent: string | null | undefined, acceptHeader?: string | null): boolean;
+
+/**
+ * Estimate token count using the chars/4 heuristic.
+ * This is the same heuristic used by Cloudflare's X-Markdown-Tokens header.
+ * Accurate to within ~10% for English text across GPT and Claude tokenizers.
+ */
+declare function estimateTokens(text: string): number;
+
+/**
+ * Build the response headers for a Markdown response.
+ */
+declare function buildMarkdownHeaders(result: TransformResult, options: Pick<AgentSeoOptions, 'contentSignal'>, originalPath?: string): AgentSeoHeaders;
+/**
+ * Build a Link header pointing to the Markdown alternate.
+ * This is injected into ALL HTML responses (not just Markdown ones)
+ * so crawlers can discover the alternate representation.
+ */
+declare function buildAlternateLinkHeader(path: string, ext?: string): string;
+
+/**
+ * Generate llms.txt and llms-full.txt content from route data.
+ *
+ * llms.txt format (per spec at llmstxt.org):
+ *   # Site Name
+ *   > Site description blockquote
+ *
+ *   ## Section Name
+ *   - [Page Title](url): Description
+ */
+declare function generateLlmsTxt(options: LlmsTxtOptions, routes: LlmsTxtRoute[], fullTextContents?: Map<string, string>): LlmsTxtResult;
+
+export { type AIRequestContext as A, type BotInfo as B, type LlmsTxtRoute as L, type TransformOptions as T, type TransformResult as a, type TurndownRule as b, AI_BOT_REGISTRY as c, type AgentSeoHeaders as d, type AgentSeoOptions as e, type BotPurpose as f, type LlmsTxtOptions as g, type LlmsTxtResult as h, buildAlternateLinkHeader as i, buildMarkdownHeaders as j, detectAgent as k, estimateTokens as l, generateLlmsTxt as m, shouldServeMarkdown as s };
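A minimal usage sketch of the detection API declared above, in TypeScript. The user-agent and Accept strings are illustrative, and the `@agent-seo/core/edge` import path is an assumption inferred from the dist/edge.* bundles; the actual export map is in package.json, not shown in this diff.

// Assumed subpath export; see package.json for the real export map.
import { detectAgent, shouldServeMarkdown, estimateTokens } from '@agent-seo/core/edge';

// A crawler whose UA matches an AI_BOT_REGISTRY pattern is classified with a purpose.
const ctx = detectAgent('Mozilla/5.0 (compatible; GPTBot/1.0)', 'text/html');
// ctx.isAIBot === true
// ctx.bot => { name: 'GPTBot', operator: 'OpenAI', purpose: 'training', rendersJs: false }
// ctx.wantsMarkdown === false (Accept did not include text/markdown)

// A non-bot client can still opt into Markdown via the Accept header.
shouldServeMarkdown('Mozilla/5.0', 'text/markdown'); // true

// chars/4 heuristic, rounded up: 14 characters -> 4 tokens.
estimateTokens('# Hello world\n'); // 4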
package/dist/edge-AywqjCEh.d.ts
ADDED
(identical to package/dist/edge-AywqjCEh.d.cts above; the same 198 added lines)
package/dist/edge.cjs
ADDED
@@ -0,0 +1,247 @@
+"use strict";
+var __defProp = Object.defineProperty;
+var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
+var __getOwnPropNames = Object.getOwnPropertyNames;
+var __hasOwnProp = Object.prototype.hasOwnProperty;
+var __export = (target, all) => {
+  for (var name in all)
+    __defProp(target, name, { get: all[name], enumerable: true });
+};
+var __copyProps = (to, from, except, desc) => {
+  if (from && typeof from === "object" || typeof from === "function") {
+    for (let key of __getOwnPropNames(from))
+      if (!__hasOwnProp.call(to, key) && key !== except)
+        __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
+  }
+  return to;
+};
+var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
+
+// src/edge.ts
+var edge_exports = {};
+__export(edge_exports, {
+  AI_BOT_REGISTRY: () => AI_BOT_REGISTRY,
+  buildAlternateLinkHeader: () => buildAlternateLinkHeader,
+  buildMarkdownHeaders: () => buildMarkdownHeaders,
+  detectAgent: () => detectAgent,
+  estimateTokens: () => estimateTokens,
+  generateLlmsTxt: () => generateLlmsTxt,
+  shouldServeMarkdown: () => shouldServeMarkdown
+});
+module.exports = __toCommonJS(edge_exports);
+
+// src/detect.ts
+var AI_BOT_REGISTRY = [
+  // === OpenAI ===
+  {
+    pattern: /GPTBot/i,
+    info: { name: "GPTBot", operator: "OpenAI", purpose: "training", rendersJs: false }
+  },
+  {
+    pattern: /OAI-SearchBot/i,
+    info: { name: "OAI-SearchBot", operator: "OpenAI", purpose: "search", rendersJs: false }
+  },
+  {
+    pattern: /ChatGPT-User/i,
+    info: { name: "ChatGPT-User", operator: "OpenAI", purpose: "agent-browsing", rendersJs: true }
+  },
+  // === Anthropic ===
+  {
+    pattern: /ClaudeBot/i,
+    info: { name: "ClaudeBot", operator: "Anthropic", purpose: "training", rendersJs: false }
+  },
+  {
+    pattern: /Claude-User/i,
+    info: { name: "Claude-User", operator: "Anthropic", purpose: "agent-browsing", rendersJs: true }
+  },
+  {
+    pattern: /Claude-SearchBot/i,
+    info: { name: "Claude-SearchBot", operator: "Anthropic", purpose: "search", rendersJs: false }
+  },
+  {
+    pattern: /anthropic-ai/i,
+    info: { name: "anthropic-ai", operator: "Anthropic", purpose: "training", rendersJs: false }
+  },
+  // === Perplexity ===
+  {
+    pattern: /PerplexityBot/i,
+    info: { name: "PerplexityBot", operator: "Perplexity", purpose: "search", rendersJs: false }
+  },
+  {
+    pattern: /Perplexity-User/i,
+    info: { name: "Perplexity-User", operator: "Perplexity", purpose: "agent-browsing", rendersJs: true }
+  },
+  // === Google ===
+  {
+    pattern: /Google-Extended/i,
+    info: { name: "Google-Extended", operator: "Google", purpose: "training", rendersJs: true }
+  },
+  // === Apple ===
+  {
+    pattern: /Applebot-Extended/i,
+    info: { name: "Applebot-Extended", operator: "Apple", purpose: "training", rendersJs: true }
+  },
+  // === Meta ===
+  {
+    pattern: /meta-externalagent/i,
+    info: { name: "Meta-ExternalAgent", operator: "Meta", purpose: "training", rendersJs: false }
+  },
+  {
+    pattern: /FacebookBot/i,
+    info: { name: "FacebookBot", operator: "Meta", purpose: "search", rendersJs: false }
+  },
+  // === Common Crawl ===
+  {
+    pattern: /CCBot/i,
+    info: { name: "CCBot", operator: "Common Crawl", purpose: "training", rendersJs: false }
+  },
+  // === Cohere ===
+  {
+    pattern: /cohere-ai/i,
+    info: { name: "cohere-ai", operator: "Cohere", purpose: "training", rendersJs: false }
+  },
+  // === Amazon ===
+  {
+    pattern: /Amazonbot/i,
+    info: { name: "Amazonbot", operator: "Amazon", purpose: "search", rendersJs: false }
+  },
+  // === Bytedance ===
+  {
+    pattern: /Bytespider/i,
+    info: { name: "Bytespider", operator: "ByteDance", purpose: "training", rendersJs: false }
+  },
+  // === You.com ===
+  {
+    pattern: /YouBot/i,
+    info: { name: "YouBot", operator: "You.com", purpose: "search", rendersJs: false }
+  },
+  // === DeepSeek ===
+  {
+    pattern: /Deepseek/i,
+    info: { name: "DeepSeekBot", operator: "DeepSeek", purpose: "training", rendersJs: false }
+  }
+];
+var TOKEN_REGISTRY = AI_BOT_REGISTRY.map((entry) => ({
+  entry,
+  token: regexToToken(entry.pattern)
+}));
+function detectAgent(userAgent, acceptHeader) {
+  const wantsMarkdown = acceptHeader ? /text\/markdown/i.test(acceptHeader) : false;
+  if (!userAgent) {
+    return { isAIBot: false, bot: null, wantsMarkdown };
+  }
+  const ua = userAgent.toLowerCase();
+  for (const { entry, token } of TOKEN_REGISTRY) {
+    if (token) {
+      if (ua.includes(token)) {
+        return { isAIBot: true, bot: entry.info, wantsMarkdown };
+      }
+      continue;
+    }
+    if (entry.pattern.test(userAgent)) {
+      return { isAIBot: true, bot: entry.info, wantsMarkdown };
+    }
+  }
+  return { isAIBot: false, bot: null, wantsMarkdown };
+}
+function shouldServeMarkdown(userAgent, acceptHeader) {
+  const ctx = detectAgent(userAgent, acceptHeader);
+  return ctx.isAIBot || ctx.wantsMarkdown;
+}
+function regexToToken(pattern) {
+  const source = pattern.source;
+  if (/^[A-Za-z0-9-]+$/.test(source)) return source.toLowerCase();
+  return null;
+}
+
+// src/tokens.ts
+function estimateTokens(text) {
+  return Math.ceil(text.length / 4);
+}
+
+// src/headers.ts
+function buildMarkdownHeaders(result, options, originalPath) {
+  const headers = {
+    "Content-Type": "text/markdown; charset=utf-8",
+    "Content-Disposition": "inline",
+    "Vary": "Accept, User-Agent",
+    "X-Markdown-Tokens": String(result.tokenEstimate)
+  };
+  const signal = options.contentSignal ?? { aiTrain: true, search: true, aiInput: true };
+  const signalParts = [];
+  if (signal.aiTrain !== false) signalParts.push("ai-train=yes");
+  if (signal.search !== false) signalParts.push("search=yes");
+  if (signal.aiInput !== false) signalParts.push("ai-input=yes");
+  if (signalParts.length > 0) {
+    headers["Content-Signal"] = signalParts.join(", ");
+  }
+  headers["X-Robots-Tag"] = "all";
+  return headers;
+}
+function buildAlternateLinkHeader(path, ext = ".md") {
+  const mdPath = path.endsWith("/") ? `${path}index${ext}` : `${path}${ext}`;
+  return `<${mdPath}>; rel="alternate"; type="text/markdown"`;
+}
+
+// src/llms-txt.ts
+function generateLlmsTxt(options, routes, fullTextContents) {
+  const { siteName, siteDescription, baseUrl, markdownExtension = ".md" } = options;
+  const sections = /* @__PURE__ */ new Map();
+  for (const route of routes) {
+    const section = route.section || "Pages";
+    if (!sections.has(section)) sections.set(section, []);
+    sections.get(section).push(route);
+  }
+  const lines = [];
+  lines.push(`# ${siteName}`);
+  lines.push("");
+  lines.push(`> ${siteDescription}`);
+  lines.push("");
+  for (const [section, sectionRoutes] of sections) {
+    lines.push(`## ${section}`);
+    lines.push("");
+    for (const route of sectionRoutes) {
+      const url = `${baseUrl}${route.path}${markdownExtension}`;
+      const desc = route.description ? `: ${route.description}` : "";
+      lines.push(`- [${route.title}](${url})${desc}`);
+    }
+    lines.push("");
+  }
+  const llmsTxt = lines.join("\n").trim() + "\n";
+  const fullLines = [];
+  fullLines.push(`# ${siteName}`);
+  fullLines.push("");
+  fullLines.push(`> ${siteDescription}`);
+  fullLines.push("");
+  if (fullTextContents) {
+    for (const route of routes) {
+      const content = fullTextContents.get(route.path);
+      if (content) {
+        fullLines.push(`---`);
+        fullLines.push("");
+        fullLines.push(`## ${route.title}`);
+        fullLines.push(`Source: ${baseUrl}${route.path}`);
+        fullLines.push("");
+        fullLines.push(content);
+        fullLines.push("");
+      }
+    }
+  }
+  const llmsFullTxt = fullLines.join("\n").trim() + "\n";
+  return {
+    llmsTxt,
+    llmsFullTxt,
+    routeCount: routes.length
+  };
+}
+// Annotate the CommonJS export names for ESM import in node:
+0 && (module.exports = {
+  AI_BOT_REGISTRY,
+  buildAlternateLinkHeader,
+  buildMarkdownHeaders,
+  detectAgent,
+  estimateTokens,
+  generateLlmsTxt,
+  shouldServeMarkdown
+});
+//# sourceMappingURL=edge.cjs.map
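A short TypeScript sketch exercising the implementation above. The site name, routes, and token count are made-up examples; the import path is the same assumed `./edge` subpath export as before.

import { generateLlmsTxt, buildMarkdownHeaders, buildAlternateLinkHeader } from '@agent-seo/core/edge';

// Routes are grouped by `section` (defaulting to "Pages") and linked to their .md alternates.
const { llmsTxt } = generateLlmsTxt(
  { siteName: 'Example Docs', siteDescription: 'Docs for Example.', baseUrl: 'https://example.com' },
  [
    { path: '/docs/getting-started', title: 'Getting Started', description: 'Install and configure', section: 'Documentation' },
    { path: '/blog/launch', title: 'Launch Post', section: 'Blog' }
  ]
);
// llmsTxt:
//   # Example Docs
//
//   > Docs for Example.
//
//   ## Documentation
//
//   - [Getting Started](https://example.com/docs/getting-started.md): Install and configure
//
//   ## Blog
//
//   - [Launch Post](https://example.com/blog/launch.md)

// Only result.tokenEstimate and options.contentSignal are read when building headers.
// Signals are opt-out: aiTrain: false drops "ai-train=yes" from Content-Signal.
const headers = buildMarkdownHeaders(
  { markdown: '', tokenEstimate: 812, title: '', description: '', jsonLd: [], canonicalUrl: null, lastModified: null, lang: null },
  { contentSignal: { aiTrain: false } }
);
// headers['Content-Signal'] === 'search=yes, ai-input=yes'
// headers['X-Markdown-Tokens'] === '812'

// Trailing-slash paths map to index.md; everything else just gets the extension appended.
buildAlternateLinkHeader('/docs/getting-started');
// '</docs/getting-started.md>; rel="alternate"; type="text/markdown"'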