@casoon/astro-crawler-policy 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,109 @@
1
+ import { presetDefaults } from './presets.js';
2
+ import { defaultRegistry } from './registry.js';
3
/**
 * Merges two option objects one nested level deep.
 *
 * Top-level scalars from `override` win; the well-known nested sections
 * (contentSignals, bots, groups, output, audit) are shallow-merged key by
 * key, while `rules` and `sitemaps` are replaced wholesale when overridden.
 * `override` may be undefined, in which case `base` is effectively cloned.
 */
function mergeOptions(base, override) {
    // Shallow-merge a single named section of the two option objects.
    const mergeSection = (key) => ({ ...base[key], ...override?.[key] });
    const merged = { ...base, ...override };
    merged.contentSignals = mergeSection('contentSignals');
    merged.bots = mergeSection('bots');
    merged.groups = mergeSection('groups');
    merged.output = mergeSection('output');
    merged.audit = mergeSection('audit');
    // Arrays are not merged element-wise: an override replaces the base list.
    merged.rules = override?.rules ?? base.rules;
    merged.sitemaps = override?.sitemaps ?? base.sitemaps;
    return merged;
}
31
/**
 * Picks the default action for a registry bot from the configured group
 * policies, checked in priority order:
 *   1. explicitly unverified bots → unknownAi group,
 *   2. search crawlers → searchEngines group,
 *   3. verified AI crawlers → verifiedAi group,
 *   4. 'unknown-ai' category → unknownAi group,
 *   5. the bot's own defaultAction, falling back to 'inherit'.
 */
function classifyDefaultAction(bot, options) {
    const groups = options.groups ?? {};
    const aiCategories = ['ai-search', 'ai-input', 'ai-training'];
    if (bot.verified === false && groups.unknownAi) {
        return groups.unknownAi;
    }
    if (bot.categories.includes('search') && groups.searchEngines) {
        return groups.searchEngines;
    }
    const servesAi = bot.categories.some((category) => aiCategories.includes(category));
    if (bot.verified && servesAi && groups.verifiedAi) {
        return groups.verifiedAi;
    }
    if (bot.categories.includes('unknown-ai') && groups.unknownAi) {
        return groups.unknownAi;
    }
    return bot.defaultAction ?? 'inherit';
}
48
/**
 * Normalizes raw robots rules so downstream code can assume uniform shapes:
 * `userAgent` is always an array, `allow`/`disallow` are always arrays
 * (possibly empty). Other rule properties pass through untouched.
 */
function normalizeRules(rules) {
    const normalized = [];
    for (const rule of rules ?? []) {
        const userAgent = Array.isArray(rule.userAgent) ? rule.userAgent : [rule.userAgent];
        normalized.push({
            ...rule,
            userAgent,
            allow: rule.allow ?? [],
            disallow: rule.disallow ?? []
        });
    }
    return normalized;
}
56
/**
 * Resolves sitemap entries to absolute URLs where possible and removes
 * duplicates (first occurrence wins). Entries already starting with
 * http(s):// are kept as-is; relative entries are resolved against `site`
 * when one is known, otherwise passed through unchanged.
 */
function normalizeSitemaps(site, sitemaps) {
    const resolved = (sitemaps ?? []).map((entry) => {
        if (/^https?:\/\//.test(entry)) {
            return entry;
        }
        // NOTE(review): throws on an unparseable `site` value, matching the
        // original behavior — callers are expected to pass a valid URL.
        return site ? new URL(entry, site).toString() : entry;
    });
    return [...new Set(resolved)];
}
72
/**
 * Resolves user options, preset defaults, and per-environment overrides into
 * a fully-resolved policy consumed by the renderers.
 *
 * Layering order: preset defaults < user options < env overrides. Bots are
 * expanded into one rule per user-agent string; bots whose resolved action
 * is 'inherit' emit no rules at all.
 *
 * @throws Error when options.preset names an unknown preset.
 */
export function compilePolicy(input) {
    const { options: baseOptions, environment } = input;
    const registry = [...(input.registry ?? defaultRegistry), ...(baseOptions.extraBots ?? [])];
    // Fail fast on preset typos before any merging happens.
    if (baseOptions.preset !== undefined && !Object.prototype.hasOwnProperty.call(presetDefaults, baseOptions.preset)) {
        throw new Error(`astro-crawler-policy: Unknown preset "${baseOptions.preset}". Valid presets are: ${Object.keys(presetDefaults).join(', ')}.`);
    }
    const basePreset = baseOptions.preset ?? 'citationFriendly';
    const withPreset = mergeOptions(presetDefaults[basePreset] ?? {}, baseOptions);
    const envOverride = environment ? withPreset.env?.[environment] : undefined;
    const withEnv = mergeOptions(withPreset, envOverride);
    const botRules = [];
    for (const bot of registry) {
        // Per-bot override beats the group-derived default action.
        const action = withEnv.bots?.[bot.id] ?? classifyDefaultAction(bot, withEnv);
        if (action === 'inherit') {
            continue;
        }
        for (const userAgent of bot.userAgents) {
            botRules.push({
                id: bot.id,
                userAgent,
                provider: bot.provider,
                action,
                categories: bot.categories,
                verified: bot.verified ?? false
            });
        }
    }
    return {
        preset: withEnv.preset ?? basePreset,
        mergeStrategy: withEnv.mergeStrategy ?? 'prepend',
        contentSignals: withEnv.contentSignals ?? {},
        rules: normalizeRules(withEnv.rules),
        botRules,
        sitemaps: normalizeSitemaps(input.site, withEnv.sitemaps),
        host: withEnv.host
    };
}
@@ -0,0 +1,11 @@
1
+ import { compilePolicy } from './compile.js';
2
+ import type { AstroCrawlerPolicyOptions } from './types.js';
3
+ export declare function generateRobotsTxt(input: {
4
+ options: AstroCrawlerPolicyOptions;
5
+ existingRobotsTxt?: string | null;
6
+ environment?: string;
7
+ site?: string;
8
+ }): {
9
+ content: string;
10
+ policy: ReturnType<typeof compilePolicy>;
11
+ };
@@ -0,0 +1,13 @@
1
+ import { compilePolicy } from './compile.js';
2
+ import { mergeRobotsTxt } from './merge.js';
3
+ import { renderRobotsTxt } from './render.js';
4
/**
 * Compiles the policy and produces the final robots.txt content, merging it
 * with any pre-existing file content according to the resolved strategy.
 *
 * @returns `{ content, policy }` — the merged text plus the resolved policy.
 */
export function generateRobotsTxt(input) {
    const { options, environment, site, existingRobotsTxt } = input;
    const policy = compilePolicy({ options, environment, site });
    const rendered = renderRobotsTxt(policy);
    return {
        content: mergeRobotsTxt(rendered, existingRobotsTxt ?? null, policy.mergeStrategy),
        policy
    };
}
@@ -0,0 +1,9 @@
1
+ export { auditPolicy } from './audit.js';
2
+ export { compilePolicy } from './compile.js';
3
+ export { generateRobotsTxt } from './generate.js';
4
+ export { defaultRegistry, REGISTRY_VERSION } from './registry.js';
5
+ export { mergeRobotsTxt } from './merge.js';
6
+ export { presetDefaults } from './presets.js';
7
+ export { renderLlmsTxt, renderRobotsTxt } from './render.js';
8
+ export type { AstroCrawlerPolicyOptions, AuditIssue, BotAction, BotCategory, ContentSignals, MergeStrategy, Preset, RegistryBot, ResolvedPolicy, RobotsRule } from './types.js';
9
+ export { default } from './integration.js';
package/dist/index.js ADDED
@@ -0,0 +1,8 @@
1
+ export { auditPolicy } from './audit.js';
2
+ export { compilePolicy } from './compile.js';
3
+ export { generateRobotsTxt } from './generate.js';
4
+ export { defaultRegistry, REGISTRY_VERSION } from './registry.js';
5
+ export { mergeRobotsTxt } from './merge.js';
6
+ export { presetDefaults } from './presets.js';
7
+ export { renderLlmsTxt, renderRobotsTxt } from './render.js';
8
+ export { default } from './integration.js';
@@ -0,0 +1,11 @@
1
+ import type { AstroBuildDoneContextLike, AstroConfigLike, AstroCrawlerPolicyOptions, AstroLoggerLike } from './types.js';
2
+ export default function crawlerPolicy(options?: AstroCrawlerPolicyOptions): {
3
+ name: string;
4
+ hooks: {
5
+ 'astro:config:setup': ({ config: astroConfig, logger }: {
6
+ config: AstroConfigLike;
7
+ logger?: AstroLoggerLike;
8
+ }) => void;
9
+ 'astro:build:done': ({ dir, logger }: AstroBuildDoneContextLike) => Promise<void>;
10
+ };
11
+ };
@@ -0,0 +1,96 @@
1
+ import { mkdir, readFile, writeFile } from 'node:fs/promises';
2
+ import { dirname, join } from 'node:path';
3
+ import { fileURLToPath } from 'node:url';
4
+ import { auditPolicy } from './audit.js';
5
+ import { generateRobotsTxt } from './generate.js';
6
+ import { defaultRegistry, REGISTRY_VERSION } from './registry.js';
7
+ import { renderLlmsTxt } from './render.js';
8
+ function logIssue(logger, issue) {
9
+ if (issue.level === 'error') {
10
+ logger?.error?.(issue.message);
11
+ return;
12
+ }
13
+ if (issue.level === 'warn') {
14
+ logger?.warn?.(issue.message);
15
+ return;
16
+ }
17
+ logger?.info?.(issue.message);
18
+ }
19
+ async function readOptionalFile(pathname) {
20
+ try {
21
+ return await readFile(pathname, 'utf8');
22
+ }
23
+ catch {
24
+ return null;
25
+ }
26
+ }
27
/**
 * Astro integration entry point. Registers two hooks: one that captures the
 * Astro config, and one that, after the build, writes robots.txt (and
 * optionally llms.txt) into the build output directory, logging any audit
 * issues found in the resolved policy.
 *
 * NOTE(review): environment detection prefers CONTEXT and
 * DEPLOYMENT_ENVIRONMENT (host-provided) over NODE_ENV, defaulting to
 * 'production'.
 */
export default function crawlerPolicy(options = {}) {
    let astroConfig;
    const onConfigSetup = ({ config, logger }) => {
        astroConfig = config;
        logger?.info?.('Initializing astro-crawler-policy');
    };
    const onBuildDone = async ({ dir, logger }) => {
        if (options.output?.robotsTxt === false) {
            logger?.info?.('Skipping robots.txt generation because output.robotsTxt is false');
            return;
        }
        const publicDir = astroConfig?.publicDir
            ? fileURLToPath(astroConfig.publicDir)
            : join(process.cwd(), 'public');
        const site = astroConfig?.site ? String(astroConfig.site) : undefined;
        const environment = process.env.CONTEXT ??
            process.env.DEPLOYMENT_ENVIRONMENT ??
            process.env.NODE_ENV ??
            'production';
        // Merge with a hand-written robots.txt from the public/ source dir,
        // if one exists.
        const existingRobotsTxt = await readOptionalFile(join(publicDir, 'robots.txt'));
        const { content, policy } = generateRobotsTxt({ options, existingRobotsTxt, environment, site });
        const outputPath = join(fileURLToPath(dir), 'robots.txt');
        await mkdir(dirname(outputPath), { recursive: true });
        await writeFile(outputPath, content, 'utf8');
        if (options.debug) {
            const extras = options.extraBots?.length ? ` + ${options.extraBots.length} extra bot(s)` : '';
            logger?.info?.(`[debug] registry version: ${REGISTRY_VERSION}${extras}`);
            logger?.info?.(`[debug] environment: ${environment}`);
            logger?.info?.(`[debug] preset: ${policy.preset}`);
            const signals = Object.entries(policy.contentSignals)
                .filter(([, value]) => value !== undefined)
                .map(([key, value]) => `${key}=${value ? 'yes' : 'no'}`)
                .join(', ');
            if (signals) {
                logger?.info?.(`[debug] content signals: ${signals}`);
            }
            for (const rule of policy.botRules) {
                logger?.info?.(`[debug] bot: ${rule.userAgent} → ${rule.action}`);
            }
            for (const sitemap of policy.sitemaps) {
                logger?.info?.(`[debug] sitemap: ${sitemap}`);
            }
        }
        const issues = auditPolicy(policy, {
            site,
            environment,
            registry: [...defaultRegistry, ...(options.extraBots ?? [])],
            rawOptions: options,
            warnOnMissingSitemap: options.audit?.warnOnMissingSitemap,
            warnOnConflicts: options.audit?.warnOnConflicts
        });
        for (const issue of issues) {
            logIssue(logger, issue);
        }
        logger?.info?.(`Generated robots.txt at ${outputPath}`);
        // llms.txt is opt-in only.
        if (options.output?.llmsTxt === true) {
            const llmsPath = join(fileURLToPath(dir), 'llms.txt');
            await writeFile(llmsPath, renderLlmsTxt(policy, site), 'utf8');
            logger?.info?.(`Generated llms.txt at ${llmsPath}`);
        }
    };
    return {
        name: '@casoon/astro-crawler-policy',
        hooks: {
            'astro:config:setup': onConfigSetup,
            'astro:build:done': onBuildDone
        }
    };
}
@@ -0,0 +1,2 @@
1
+ import type { MergeStrategy } from './types.js';
2
+ export declare function mergeRobotsTxt(generated: string, existing: string | null, strategy: MergeStrategy): string;
package/dist/merge.js ADDED
@@ -0,0 +1,13 @@
1
/**
 * Combines the generated robots.txt with a pre-existing file.
 *
 * @param generated robots.txt text produced by the renderer.
 * @param existing  current robots.txt content, or null when none exists.
 * @param strategy  'replace' discards the existing file, 'prepend' puts the
 *                  generated block first, 'append' puts it last.
 * @returns merged content, always ending in a trailing newline.
 */
export function mergeRobotsTxt(generated, existing, strategy) {
    // Normalize to a trailing newline without duplicating one.
    const withNewline = (text) => (text.endsWith('\n') ? text : `${text}\n`);
    // Nothing to merge with: null, empty, or whitespace-only existing file.
    if (!existing?.trim()) {
        return withNewline(generated);
    }
    switch (strategy) {
        case 'replace':
            return withNewline(generated);
        case 'append':
            return `${existing.trim()}\n\n${generated.trim()}\n`;
        case 'prepend':
        default:
            // Fix: the original switch had no default, so an unrecognized
            // strategy (possible from untyped JS callers) silently returned
            // undefined. Fall back to 'prepend', the library default.
            return `${generated.trim()}\n\n${existing.trim()}\n`;
    }
}
@@ -0,0 +1,2 @@
1
+ import type { AstroCrawlerPolicyOptions, Preset } from './types.js';
2
+ export declare const presetDefaults: Record<Preset, Partial<AstroCrawlerPolicyOptions>>;
@@ -0,0 +1,73 @@
1
/**
 * Built-in policy presets. Each entry supplies partial options that user
 * configuration is merged on top of. Ordered from search-only through fully
 * open to full lockdown.
 */
export const presetDefaults = {
    // Search engines only: no AI consumption of any kind.
    seoOnly: {
        contentSignals: { search: true, aiInput: false, aiTrain: false },
        groups: { searchEngines: 'allow', verifiedAi: 'disallow', unknownAi: 'disallow' }
    },
    // Default preset: search plus verified AI citation, training blocked.
    citationFriendly: {
        contentSignals: { search: true, aiInput: true, aiTrain: false },
        groups: { searchEngines: 'allow', verifiedAi: 'allow', unknownAi: 'disallow' }
    },
    // Fully open, including AI training crawlers.
    openToAi: {
        contentSignals: { search: true, aiInput: true, aiTrain: true },
        groups: { searchEngines: 'allow', verifiedAi: 'allow', unknownAi: 'allow' }
    },
    // Like citationFriendly, plus explicit blocks for known training bots.
    blockTraining: {
        contentSignals: { search: true, aiInput: true, aiTrain: false },
        groups: { searchEngines: 'allow', verifiedAi: 'allow', unknownAi: 'disallow' },
        bots: { GPTBot: 'disallow', 'Google-Extended': 'disallow', CCBot: 'disallow' }
    },
    // Deny everything, including search, via a catch-all disallow rule.
    lockdown: {
        contentSignals: { search: false, aiInput: false, aiTrain: false },
        groups: { searchEngines: 'disallow', verifiedAi: 'disallow', unknownAi: 'disallow' },
        rules: [{ userAgent: '*', disallow: ['/'] }]
    }
};
@@ -0,0 +1,4 @@
1
+ import type { RegistryBot } from './types.js';
2
+ /** ISO date of the last registry update. Shown in debug output. */
3
+ export declare const REGISTRY_VERSION = "2026-04-09";
4
+ export declare const defaultRegistry: RegistryBot[];
@@ -0,0 +1,95 @@
1
/** ISO date of the last registry update. Shown in debug output. */
export const REGISTRY_VERSION = '2026-04-09';
// Builds a registry entry. Every built-in bot is verified and is matched by
// a single user-agent token identical to its id.
const bot = (id, provider, categories) => ({
    id,
    provider,
    userAgents: [id],
    categories,
    verified: true
});
/** Built-in crawler registry: operator and purpose of each known bot. */
export const defaultRegistry = [
    bot('GPTBot', 'OpenAI', ['ai-training']),
    bot('OAI-SearchBot', 'OpenAI', ['ai-search', 'ai-input']),
    bot('ClaudeBot', 'Anthropic', ['ai-input', 'ai-training']),
    bot('claude-web', 'Anthropic', ['ai-input']),
    bot('Google-Extended', 'Google', ['ai-training']),
    bot('CCBot', 'Common Crawl', ['ai-training']),
    bot('PerplexityBot', 'Perplexity', ['ai-search', 'ai-input']),
    bot('Bytespider', 'ByteDance', ['ai-training']),
    bot('meta-externalagent', 'Meta', ['ai-input', 'ai-training']),
    bot('Amazonbot', 'Amazon', ['ai-search', 'ai-input']),
    bot('Applebot-Extended', 'Apple', ['ai-training']),
    bot('Googlebot', 'Google', ['search']),
    bot('Bingbot', 'Microsoft', ['search'])
];
@@ -0,0 +1,3 @@
1
+ import type { ResolvedPolicy } from './types.js';
2
+ export declare function renderLlmsTxt(policy: ResolvedPolicy, site?: string): string;
3
+ export declare function renderRobotsTxt(policy: ResolvedPolicy): string;
package/dist/render.js ADDED
@@ -0,0 +1,134 @@
1
/**
 * Renders an llms.txt summary of the resolved policy: content signals,
 * allowed/blocked AI systems (deduplicated by bot id), and sitemaps.
 * Markdown-flavored, ending in exactly one trailing newline.
 */
export function renderLlmsTxt(policy, site) {
    let domain = 'this site';
    if (site) {
        try {
            domain = new URL(site).hostname;
        }
        catch {
            // Not a parseable URL — fall back to the raw string.
            domain = site;
        }
    }
    const out = [
        `# ${domain}`,
        '',
        `> AI content access policy for ${domain}.`,
        `> Generated by astro-crawler-policy (preset: ${policy.preset}).`,
        '',
        '## Content Policy',
        ''
    ];
    const { search, aiInput, aiTrain } = policy.contentSignals;
    if (search !== undefined) {
        out.push(`- Search indexing: ${search ? 'allowed' : 'not allowed'}`);
    }
    if (aiInput !== undefined) {
        out.push(`- AI citation and summarization: ${aiInput ? 'allowed' : 'not allowed'}`);
    }
    if (aiTrain !== undefined) {
        out.push(`- AI training data collection: ${aiTrain ? 'allowed' : 'not allowed'}`);
    }
    out.push('');
    // A bot can expand to several user agents; list each id only once,
    // bucketed by its resolved action.
    const seen = new Set();
    const buckets = { allow: [], block: [] };
    for (const bot of policy.botRules) {
        if (seen.has(bot.id)) {
            continue;
        }
        seen.add(bot.id);
        buckets[bot.action === 'allow' ? 'allow' : 'block'].push(bot);
    }
    const section = (title, bots) => {
        out.push(title, '');
        for (const bot of bots) {
            out.push(`- ${bot.id} (${bot.provider})`);
        }
        out.push('');
    };
    if (buckets.allow.length > 0 || buckets.block.length > 0) {
        out.push('## AI Systems', '');
        if (buckets.allow.length > 0) {
            section('### Allowed', buckets.allow);
        }
        if (buckets.block.length > 0) {
            section('### Blocked', buckets.block);
        }
    }
    if (policy.sitemaps.length > 0) {
        out.push('## Sitemaps', '');
        for (const sitemap of policy.sitemaps) {
            out.push(`- ${sitemap}`);
        }
        out.push('');
    }
    return out.join('\n').trimEnd() + '\n';
}
69
// Emits a single "Content-signal:" line advertising the configured
// search / ai-input / ai-train preferences; returns [] when none are set.
function renderContentSignals(contentSignals) {
    const entries = [
        ['search', contentSignals.search],
        ['ai-input', contentSignals.aiInput],
        ['ai-train', contentSignals.aiTrain]
    ];
    const parts = entries
        .filter(([, value]) => value !== undefined)
        .map(([name, value]) => `${name}=${value ? 'yes' : 'no'}`);
    return parts.length ? [`Content-signal: ${parts.join(', ')}`] : [];
}
82
// Renders one robots.txt rule group: user-agent lines, an optional comment
// line, then Allow/Disallow paths and an optional Crawl-delay.
function renderRule(rule) {
    const lines = rule.userAgent.map((agent) => `User-agent: ${agent}`);
    if (rule.comment) {
        lines.push(`# ${rule.comment}`);
    }
    lines.push(...(rule.allow ?? []).map((path) => `Allow: ${path}`));
    lines.push(...(rule.disallow ?? []).map((path) => `Disallow: ${path}`));
    if (rule.crawlDelay !== undefined) {
        lines.push(`Crawl-delay: ${rule.crawlDelay}`);
    }
    return lines;
}
101
/**
 * Renders the final robots.txt: header comments, then the wildcard group
 * (with content signals attached to it), any remaining custom rule groups,
 * one group per resolved bot user agent, and finally Host/Sitemap lines.
 */
export function renderRobotsTxt(policy) {
    const lines = [
        '# Generated by astro-crawler-policy',
        `# preset: ${policy.preset}`,
        ''
    ];
    // The wildcard group always leads so the content signals sit with it;
    // fall back to allow-all when the user configured no '*' rule.
    const fallback = { userAgent: ['*'], allow: ['/'], disallow: [] };
    const wildcardRule = policy.rules.find((rule) => rule.userAgent.includes('*')) ?? fallback;
    lines.push(...renderRule(wildcardRule), ...renderContentSignals(policy.contentSignals), '');
    for (const rule of policy.rules) {
        if (rule !== wildcardRule) {
            lines.push(...renderRule(rule), '');
        }
    }
    // One compact allow-all / deny-all group per resolved bot user agent.
    for (const { userAgent, action } of policy.botRules) {
        lines.push(`User-agent: ${userAgent}`, action === 'allow' ? 'Allow: /' : 'Disallow: /', '');
    }
    if (policy.host) {
        lines.push(`Host: ${policy.host}`);
    }
    lines.push(...policy.sitemaps.map((sitemap) => `Sitemap: ${sitemap}`));
    return `${lines.join('\n').trim()}\n`;
}