@casoon/astro-crawler-policy 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +501 -0
- package/dist/audit.d.ts +9 -0
- package/dist/audit.js +113 -0
- package/dist/compile.d.ts +2 -0
- package/dist/compile.js +109 -0
- package/dist/generate.d.ts +11 -0
- package/dist/generate.js +13 -0
- package/dist/index.d.ts +9 -0
- package/dist/index.js +8 -0
- package/dist/integration.d.ts +11 -0
- package/dist/integration.js +96 -0
- package/dist/merge.d.ts +2 -0
- package/dist/merge.js +13 -0
- package/dist/presets.d.ts +2 -0
- package/dist/presets.js +73 -0
- package/dist/registry.d.ts +4 -0
- package/dist/registry.js +95 -0
- package/dist/render.d.ts +3 -0
- package/dist/render.js +134 -0
- package/dist/types.d.ts +94 -0
- package/dist/types.js +1 -0
- package/package.json +68 -0
package/dist/compile.js
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
import { presetDefaults } from './presets.js';
|
|
2
|
+
import { defaultRegistry } from './registry.js';
|
|
3
|
+
/**
 * Layers `override` on top of `base`.
 *
 * Top-level keys are shallow-merged with `override` winning. The nested
 * `contentSignals`, `bots`, `groups`, `output` and `audit` objects are merged
 * one level deep, and are always present in the result (possibly empty).
 * `rules` and `sitemaps` are replaced wholesale when the override supplies
 * them, otherwise the base value is kept.
 *
 * @param base - Lower-priority options; must be an object.
 * @param override - Higher-priority options; may be undefined.
 * @returns A new options object; neither input is mutated.
 */
function mergeOptions(base, override) {
    const merged = { ...base, ...override };
    // These sections are combined key-by-key rather than replaced.
    for (const section of ['contentSignals', 'bots', 'groups', 'output', 'audit']) {
        merged[section] = { ...base[section], ...override?.[section] };
    }
    // List-valued options are not concatenated: the override replaces the base.
    merged.rules = override?.rules ?? base.rules;
    merged.sitemaps = override?.sitemaps ?? base.sitemaps;
    return merged;
}
|
|
31
|
+
/**
 * Picks the action for a bot that has no explicit per-bot override.
 *
 * Checks run in this order, and the first one with a configured (truthy)
 * group action wins:
 *   1. bot explicitly marked `verified: false`  -> `groups.unknownAi`
 *   2. bot has the `search` category            -> `groups.searchEngines`
 *   3. verified bot with any AI category        -> `groups.verifiedAi`
 *   4. bot has the `unknown-ai` category        -> `groups.unknownAi`
 * Otherwise the bot's own `defaultAction` is used, falling back to `'inherit'`.
 *
 * @param bot - Registry entry; `categories` must be an array.
 * @param options - Merged options; `groups` is optional.
 * @returns The resolved action string.
 */
function classifyDefaultAction(bot, options) {
    // Looked up lazily so `options` is only touched when a check needs it.
    const groupAction = (name) => options.groups?.[name];
    const inCategory = (name) => bot.categories.includes(name);
    const aiCategories = ['ai-search', 'ai-input', 'ai-training'];

    if (bot.verified === false && groupAction('unknownAi')) {
        return options.groups.unknownAi;
    }
    if (inCategory('search') && groupAction('searchEngines')) {
        return options.groups.searchEngines;
    }
    const isVerifiedAi = bot.verified && bot.categories.some((category) => aiCategories.includes(category));
    if (isVerifiedAi && groupAction('verifiedAi')) {
        return options.groups.verifiedAi;
    }
    if (inCategory('unknown-ai') && groupAction('unknownAi')) {
        return options.groups.unknownAi;
    }
    return bot.defaultAction ?? 'inherit';
}
|
|
48
|
+
/**
 * Brings user-supplied rules into a uniform shape: `userAgent` is always an
 * array, and `allow` / `disallow` default to empty arrays. All other rule
 * properties are copied through unchanged.
 *
 * @param rules - Rule list, or undefined/null for "no rules".
 * @returns A new array of normalized rule objects.
 */
function normalizeRules(rules) {
    const asList = (value) => (Array.isArray(value) ? value : [value]);
    const source = rules ?? [];
    return source.map((rule) => {
        const normalized = { ...rule };
        normalized.userAgent = asList(rule.userAgent);
        normalized.allow = rule.allow ?? [];
        normalized.disallow = rule.disallow ?? [];
        return normalized;
    });
}
|
|
56
|
+
/**
 * Resolves sitemap entries to their final form and removes duplicates.
 *
 * Entries starting with `http://` or `https://` are kept verbatim. Any other
 * entry is resolved against `site` when a site is configured, and kept
 * verbatim when it is not. The first occurrence of each value keeps its
 * position.
 *
 * @param site - Base URL used to resolve relative entries; may be undefined.
 * @param sitemaps - Sitemap entries, or undefined/null for none.
 * @returns De-duplicated list of sitemap strings.
 */
function normalizeSitemaps(site, sitemaps) {
    const isAbsolute = (value) => /^https?:\/\//.test(value);
    const resolved = (sitemaps ?? []).map((entry) => (!isAbsolute(entry) && site ? new URL(entry, site).toString() : entry));
    return [...new Set(resolved)];
}
|
|
72
|
+
/**
 * Resolves user options into the final policy consumed by the renderers.
 *
 * Layering order (later wins): preset defaults -> user options -> the
 * `env[environment]` override, when an environment is given.
 *
 * NOTE(review): a `preset` set inside an env override changes the reported
 * `preset` name, but only the top-level `options.preset` selects which preset
 * defaults are merged (and only it is validated) — confirm this is intended.
 *
 * @param input - `{ options, environment?, site?, registry? }`.
 * @returns `{ preset, mergeStrategy, contentSignals, rules, botRules, sitemaps, host }`.
 * @throws Error when `options.preset` is set but is not a key of `presetDefaults`.
 */
export function compilePolicy(input) {
    const userOptions = input.options;
    const knownBots = [...(input.registry ?? defaultRegistry), ...(userOptions.extraBots ?? [])];
    const environment = input.environment;

    const requestedPreset = userOptions.preset;
    const isKnownPreset = Object.prototype.hasOwnProperty.call(presetDefaults, requestedPreset);
    if (requestedPreset !== undefined && !isKnownPreset) {
        throw new Error(`astro-crawler-policy: Unknown preset "${requestedPreset}". Valid presets are: ${Object.keys(presetDefaults).join(', ')}.`);
    }
    const fallbackPreset = requestedPreset ?? 'citationFriendly';

    const layered = mergeOptions(presetDefaults[fallbackPreset] ?? {}, userOptions);
    const envOverride = environment ? layered.env?.[environment] : undefined;
    const effective = mergeOptions(layered, envOverride);

    const rules = normalizeRules(effective.rules);

    // One entry per user-agent string; bots resolving to 'inherit' emit nothing.
    const botRules = [];
    for (const bot of knownBots) {
        const action = effective.bots?.[bot.id] ?? classifyDefaultAction(bot, effective);
        if (action === 'inherit') {
            continue;
        }
        for (const userAgent of bot.userAgents) {
            botRules.push({
                id: bot.id,
                userAgent,
                provider: bot.provider,
                action,
                categories: bot.categories,
                verified: bot.verified ?? false
            });
        }
    }

    return {
        preset: effective.preset ?? fallbackPreset,
        mergeStrategy: effective.mergeStrategy ?? 'prepend',
        contentSignals: effective.contentSignals ?? {},
        rules,
        botRules,
        sitemaps: normalizeSitemaps(input.site, effective.sitemaps),
        host: effective.host
    };
}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import { compilePolicy } from './compile.js';
|
|
2
|
+
import type { AstroCrawlerPolicyOptions } from './types.js';
|
|
3
|
+
/**
 * Compiles the crawler policy from `options`, renders it as robots.txt text and
 * merges it with `existingRobotsTxt` using the policy's merge strategy.
 *
 * @param input.options - Integration options the policy is compiled from.
 * @param input.existingRobotsTxt - Contents of a pre-existing robots.txt; `null`/omitted means none.
 * @param input.environment - Name of the environment override to apply, if any.
 * @param input.site - Site URL forwarded to the policy compiler.
 * @returns The final robots.txt `content` and the compiled `policy`.
 */
export declare function generateRobotsTxt(input: {
|
|
4
|
+
options: AstroCrawlerPolicyOptions;
|
|
5
|
+
existingRobotsTxt?: string | null;
|
|
6
|
+
environment?: string;
|
|
7
|
+
site?: string;
|
|
8
|
+
}): {
|
|
9
|
+
content: string;
|
|
10
|
+
policy: ReturnType<typeof compilePolicy>;
|
|
11
|
+
};
|
package/dist/generate.js
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import { compilePolicy } from './compile.js';
|
|
2
|
+
import { mergeRobotsTxt } from './merge.js';
|
|
3
|
+
import { renderRobotsTxt } from './render.js';
|
|
4
|
+
/**
 * Builds the final robots.txt text for the given options.
 *
 * The policy is compiled, rendered, and then merged with any existing
 * robots.txt according to the policy's merge strategy.
 *
 * @param input - `{ options, existingRobotsTxt?, environment?, site? }`.
 * @returns `{ content, policy }` — the merged text and the compiled policy.
 */
export function generateRobotsTxt(input) {
    const { options, environment, site } = input;
    const policy = compilePolicy({ options, environment, site });
    const rendered = renderRobotsTxt(policy);
    // A missing existing file is passed on as null.
    const existing = input.existingRobotsTxt ?? null;
    return {
        content: mergeRobotsTxt(rendered, existing, policy.mergeStrategy),
        policy
    };
}
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
export { auditPolicy } from './audit.js';
|
|
2
|
+
export { compilePolicy } from './compile.js';
|
|
3
|
+
export { generateRobotsTxt } from './generate.js';
|
|
4
|
+
export { defaultRegistry, REGISTRY_VERSION } from './registry.js';
|
|
5
|
+
export { mergeRobotsTxt } from './merge.js';
|
|
6
|
+
export { presetDefaults } from './presets.js';
|
|
7
|
+
export { renderLlmsTxt, renderRobotsTxt } from './render.js';
|
|
8
|
+
export type { AstroCrawlerPolicyOptions, AuditIssue, BotAction, BotCategory, ContentSignals, MergeStrategy, Preset, RegistryBot, ResolvedPolicy, RobotsRule } from './types.js';
|
|
9
|
+
export { default } from './integration.js';
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
export { auditPolicy } from './audit.js';
|
|
2
|
+
export { compilePolicy } from './compile.js';
|
|
3
|
+
export { generateRobotsTxt } from './generate.js';
|
|
4
|
+
export { defaultRegistry, REGISTRY_VERSION } from './registry.js';
|
|
5
|
+
export { mergeRobotsTxt } from './merge.js';
|
|
6
|
+
export { presetDefaults } from './presets.js';
|
|
7
|
+
export { renderLlmsTxt, renderRobotsTxt } from './render.js';
|
|
8
|
+
export { default } from './integration.js';
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import type { AstroBuildDoneContextLike, AstroConfigLike, AstroCrawlerPolicyOptions, AstroLoggerLike } from './types.js';
|
|
2
|
+
/**
 * Creates the Astro integration object for this package.
 *
 * The returned object exposes two hooks: `astro:config:setup`, which receives
 * the Astro config and an optional logger, and `astro:build:done`, which runs
 * asynchronously with the build output directory and generates robots.txt
 * (and, when enabled, llms.txt).
 *
 * @param options - Crawler policy options; optional.
 * @returns An object with the integration `name` and its `hooks`.
 */
export default function crawlerPolicy(options?: AstroCrawlerPolicyOptions): {
|
|
3
|
+
name: string;
|
|
4
|
+
hooks: {
|
|
5
|
+
'astro:config:setup': ({ config: astroConfig, logger }: {
|
|
6
|
+
config: AstroConfigLike;
|
|
7
|
+
logger?: AstroLoggerLike;
|
|
8
|
+
}) => void;
|
|
9
|
+
'astro:build:done': ({ dir, logger }: AstroBuildDoneContextLike) => Promise<void>;
|
|
10
|
+
};
|
|
11
|
+
};
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
import { mkdir, readFile, writeFile } from 'node:fs/promises';
|
|
2
|
+
import { dirname, join } from 'node:path';
|
|
3
|
+
import { fileURLToPath } from 'node:url';
|
|
4
|
+
import { auditPolicy } from './audit.js';
|
|
5
|
+
import { generateRobotsTxt } from './generate.js';
|
|
6
|
+
import { defaultRegistry, REGISTRY_VERSION } from './registry.js';
|
|
7
|
+
import { renderLlmsTxt } from './render.js';
|
|
8
|
+
function logIssue(logger, issue) {
|
|
9
|
+
if (issue.level === 'error') {
|
|
10
|
+
logger?.error?.(issue.message);
|
|
11
|
+
return;
|
|
12
|
+
}
|
|
13
|
+
if (issue.level === 'warn') {
|
|
14
|
+
logger?.warn?.(issue.message);
|
|
15
|
+
return;
|
|
16
|
+
}
|
|
17
|
+
logger?.info?.(issue.message);
|
|
18
|
+
}
|
|
19
|
+
/**
 * Reads a UTF-8 text file that is allowed to be absent.
 *
 * @param pathname - Path of the file to read.
 * @returns The file contents, or `null` when the read fails for any reason.
 */
async function readOptionalFile(pathname) {
    // Any read failure is treated the same as "file not present".
    return readFile(pathname, 'utf8').catch(() => null);
}
|
|
27
|
+
/**
 * Emits the `[debug]` log lines describing the resolved policy: registry
 * version, environment, preset, content signals, bot rules and sitemaps.
 */
function logDebugSummary(logger, options, environment, policy) {
    const extraBotCount = options.extraBots?.length;
    logger?.info?.(`[debug] registry version: ${REGISTRY_VERSION}${extraBotCount ? ` + ${extraBotCount} extra bot(s)` : ''}`);
    logger?.info?.(`[debug] environment: ${environment}`);
    logger?.info?.(`[debug] preset: ${policy.preset}`);
    const signals = Object.entries(policy.contentSignals)
        .filter(([, value]) => value !== undefined)
        .map(([key, value]) => `${key}=${value ? 'yes' : 'no'}`)
        .join(', ');
    if (signals) {
        logger?.info?.(`[debug] content signals: ${signals}`);
    }
    for (const rule of policy.botRules) {
        logger?.info?.(`[debug] bot: ${rule.userAgent} → ${rule.action}`);
    }
    for (const sitemap of policy.sitemaps) {
        logger?.info?.(`[debug] sitemap: ${sitemap}`);
    }
}
/**
 * Astro integration that writes robots.txt (and optionally llms.txt) into the
 * build output directory once the build has finished.
 *
 * `output.robotsTxt === false` disables robots.txt and `output.llmsTxt === true`
 * enables llms.txt; the two switches are independent, so llms.txt is still
 * written when robots.txt is disabled. The environment name is taken from
 * `CONTEXT`, `DEPLOYMENT_ENVIRONMENT` or `NODE_ENV` (first one set), defaulting
 * to `'production'`.
 *
 * @param options - Crawler policy options; defaults to `{}`.
 * @returns Integration object with `name` and the two Astro hooks.
 */
export default function crawlerPolicy(options = {}) {
    let config;
    return {
        name: '@casoon/astro-crawler-policy',
        hooks: {
            'astro:config:setup': ({ config: astroConfig, logger }) => {
                // Kept for the build hook, which needs `site` and `publicDir`.
                config = astroConfig;
                logger?.info?.('Initializing astro-crawler-policy');
            },
            'astro:build:done': async ({ dir, logger }) => {
                const writeRobotsTxt = options.output?.robotsTxt !== false;
                const writeLlmsTxt = options.output?.llmsTxt === true;
                if (!writeRobotsTxt) {
                    logger?.info?.('Skipping robots.txt generation because output.robotsTxt is false');
                    if (!writeLlmsTxt) {
                        return;
                    }
                }
                const site = config?.site ? String(config.site) : undefined;
                const environment = process.env.CONTEXT ??
                    process.env.DEPLOYMENT_ENVIRONMENT ??
                    process.env.NODE_ENV ??
                    'production';
                // The existing public/robots.txt only matters when we are going to merge into it.
                let existingRobotsTxt = null;
                if (writeRobotsTxt) {
                    const publicDir = config?.publicDir ? fileURLToPath(config.publicDir) : join(process.cwd(), 'public');
                    existingRobotsTxt = await readOptionalFile(join(publicDir, 'robots.txt'));
                }
                // Always compiled: llms.txt is rendered from the same policy.
                const { content, policy } = generateRobotsTxt({
                    options,
                    existingRobotsTxt,
                    environment,
                    site
                });
                const outputPath = join(fileURLToPath(dir), 'robots.txt');
                await mkdir(dirname(outputPath), { recursive: true });
                if (writeRobotsTxt) {
                    await writeFile(outputPath, content, 'utf8');
                }
                if (options.debug) {
                    logDebugSummary(logger, options, environment, policy);
                }
                if (writeRobotsTxt) {
                    // Audit against the same bot list the policy was compiled from.
                    const registry = [...defaultRegistry, ...(options.extraBots ?? [])];
                    const issues = auditPolicy(policy, {
                        site,
                        environment,
                        registry,
                        rawOptions: options,
                        warnOnMissingSitemap: options.audit?.warnOnMissingSitemap,
                        warnOnConflicts: options.audit?.warnOnConflicts
                    });
                    for (const issue of issues) {
                        logIssue(logger, issue);
                    }
                    logger?.info?.(`Generated robots.txt at ${outputPath}`);
                }
                if (writeLlmsTxt) {
                    const llmsPath = join(fileURLToPath(dir), 'llms.txt');
                    await writeFile(llmsPath, renderLlmsTxt(policy, site), 'utf8');
                    logger?.info?.(`Generated llms.txt at ${llmsPath}`);
                }
            }
        }
    };
}
|
package/dist/merge.d.ts
ADDED
package/dist/merge.js
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
/**
 * Combines generated robots.txt text with an existing robots.txt.
 *
 * When there is no existing content (null, undefined, or whitespace only) the
 * generated text is returned as-is with a trailing newline ensured. Otherwise:
 *   - `'replace'`  returns only the generated text,
 *   - `'prepend'`  puts the generated text before the existing text,
 *   - `'append'`   puts the generated text after the existing text.
 * For `prepend`/`append` both parts are trimmed and separated by a blank line.
 *
 * @param generated - Freshly rendered robots.txt text.
 * @param existing - Existing robots.txt text, or null/undefined.
 * @param strategy - `'replace' | 'prepend' | 'append'`; a missing strategy is
 *   treated as `'prepend'`.
 * @returns The merged text, always ending in a newline.
 * @throws Error when existing content is present and `strategy` is not one of
 *   the known values (previously this silently returned `undefined`).
 */
export function mergeRobotsTxt(generated, existing, strategy) {
    const withTrailingNewline = (text) => (text.endsWith('\n') ? text : `${text}\n`);
    if (!existing?.trim()) {
        return withTrailingNewline(generated);
    }
    switch (strategy ?? 'prepend') {
        case 'replace':
            return withTrailingNewline(generated);
        case 'prepend':
            return `${generated.trim()}\n\n${existing.trim()}\n`;
        case 'append':
            return `${existing.trim()}\n\n${generated.trim()}\n`;
        default:
            throw new Error(`astro-crawler-policy: Unknown mergeStrategy "${strategy}". Valid strategies are: replace, prepend, append.`);
    }
}
|
package/dist/presets.js
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
/**
 * Built-in option presets, keyed by preset name.
 *
 * Each preset supplies `contentSignals` (search / aiInput / aiTrain flags) and
 * `groups` (actions for search engines, verified AI bots and unknown AI bots).
 * `blockTraining` additionally disallows three specific bots, and `lockdown`
 * adds a wildcard rule disallowing everything.
 */
export const presetDefaults = (() => {
    // Small factories so every preset gets its own fresh nested objects.
    const signals = (search, aiInput, aiTrain) => ({ search, aiInput, aiTrain });
    const groups = (searchEngines, verifiedAi, unknownAi) => ({ searchEngines, verifiedAi, unknownAi });
    return {
        seoOnly: {
            contentSignals: signals(true, false, false),
            groups: groups('allow', 'disallow', 'disallow')
        },
        citationFriendly: {
            contentSignals: signals(true, true, false),
            groups: groups('allow', 'allow', 'disallow')
        },
        openToAi: {
            contentSignals: signals(true, true, true),
            groups: groups('allow', 'allow', 'allow')
        },
        blockTraining: {
            contentSignals: signals(true, true, false),
            groups: groups('allow', 'allow', 'disallow'),
            bots: {
                GPTBot: 'disallow',
                'Google-Extended': 'disallow',
                CCBot: 'disallow'
            }
        },
        lockdown: {
            contentSignals: signals(false, false, false),
            groups: groups('disallow', 'disallow', 'disallow'),
            rules: [{ userAgent: '*', disallow: ['/'] }]
        }
    };
})();
|
package/dist/registry.js
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
/** ISO date of the last registry update. Shown in debug output. */
export const REGISTRY_VERSION = '2026-04-09';
/**
 * Built-in list of known crawlers.
 *
 * Every entry is marked `verified: true` and uses its `id` as its only
 * user-agent string, so the table below lists just id, provider and categories.
 */
export const defaultRegistry = [
    ['GPTBot', 'OpenAI', ['ai-training']],
    ['OAI-SearchBot', 'OpenAI', ['ai-search', 'ai-input']],
    ['ClaudeBot', 'Anthropic', ['ai-input', 'ai-training']],
    ['claude-web', 'Anthropic', ['ai-input']],
    ['Google-Extended', 'Google', ['ai-training']],
    ['CCBot', 'Common Crawl', ['ai-training']],
    ['PerplexityBot', 'Perplexity', ['ai-search', 'ai-input']],
    ['Bytespider', 'ByteDance', ['ai-training']],
    ['meta-externalagent', 'Meta', ['ai-input', 'ai-training']],
    ['Amazonbot', 'Amazon', ['ai-search', 'ai-input']],
    ['Applebot-Extended', 'Apple', ['ai-training']],
    ['Googlebot', 'Google', ['search']],
    ['Bingbot', 'Microsoft', ['search']]
].map(([id, provider, categories]) => ({
    id,
    provider,
    userAgents: [id],
    categories,
    verified: true
}));
|
package/dist/render.d.ts
ADDED
package/dist/render.js
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
/**
 * Renders an llms.txt document (Markdown) summarising the policy.
 *
 * Sections, in order: a title and blurb naming the site and preset, a
 * "Content Policy" list of the defined content signals, an "AI Systems"
 * section listing bots as Allowed / Blocked (one line per bot id, first rule
 * wins; any action other than `'allow'` counts as blocked), and a "Sitemaps"
 * list. Empty sections are omitted.
 *
 * @param policy - Compiled policy (`preset`, `contentSignals`, `botRules`, `sitemaps`).
 * @param site - Site URL; its hostname is used as the title. If it cannot be
 *   parsed as a URL it is used verbatim; if absent, "this site" is used.
 * @returns The document text, ending in exactly one newline.
 */
export function renderLlmsTxt(policy, site) {
    const resolveDomain = () => {
        if (!site) {
            return 'this site';
        }
        try {
            return new URL(site).hostname;
        }
        catch {
            return site;
        }
    };
    const domain = resolveDomain();
    const describe = (flag) => (flag ? 'allowed' : 'not allowed');

    const out = [
        `# ${domain}`,
        '',
        `> AI content access policy for ${domain}.`,
        `> Generated by astro-crawler-policy (preset: ${policy.preset}).`,
        '',
        '## Content Policy',
        ''
    ];

    const { search, aiInput, aiTrain } = policy.contentSignals;
    const signalLabels = [
        ['Search indexing', search],
        ['AI citation and summarization', aiInput],
        ['AI training data collection', aiTrain]
    ];
    for (const [label, flag] of signalLabels) {
        if (flag !== undefined) {
            out.push(`- ${label}: ${describe(flag)}`);
        }
    }
    out.push('');

    // A bot with several user agents produces several rules; list it once.
    const firstRuleById = new Map();
    for (const rule of policy.botRules) {
        if (!firstRuleById.has(rule.id)) {
            firstRuleById.set(rule.id, rule);
        }
    }
    const uniqueBots = [...firstRuleById.values()];
    if (uniqueBots.length > 0) {
        out.push('## AI Systems', '');
        const sections = [
            ['### Allowed', uniqueBots.filter((bot) => bot.action === 'allow')],
            ['### Blocked', uniqueBots.filter((bot) => bot.action !== 'allow')]
        ];
        for (const [heading, bots] of sections) {
            if (bots.length === 0) {
                continue;
            }
            out.push(heading, '', ...bots.map((bot) => `- ${bot.id} (${bot.provider})`), '');
        }
    }

    if (policy.sitemaps.length > 0) {
        out.push('## Sitemaps', '', ...policy.sitemaps.map((sitemap) => `- ${sitemap}`), '');
    }
    return out.join('\n').trimEnd() + '\n';
}
|
|
69
|
+
/**
 * Renders the `Content-signal:` line for robots.txt.
 *
 * Only signals that are defined are included, in the fixed order
 * search, ai-input, ai-train, each as `name=yes` or `name=no`.
 *
 * @param contentSignals - `{ search?, aiInput?, aiTrain? }`.
 * @returns A one-element array with the line, or an empty array when no
 *   signal is defined.
 */
function renderContentSignals(contentSignals) {
    const directives = [
        ['search', 'search'],
        ['ai-input', 'aiInput'],
        ['ai-train', 'aiTrain']
    ];
    const parts = directives
        .filter(([, key]) => contentSignals[key] !== undefined)
        .map(([label, key]) => `${label}=${contentSignals[key] ? 'yes' : 'no'}`);
    if (parts.length === 0) {
        return [];
    }
    return [`Content-signal: ${parts.join(', ')}`];
}
|
|
82
|
+
/**
 * Renders one rule group as robots.txt lines.
 *
 * Output order: one `User-agent:` line per agent, the optional comment, the
 * `Allow:` lines, the `Disallow:` lines, then `Crawl-delay:` when defined.
 *
 * A comment containing line breaks is emitted with `#` in front of every
 * line; previously only the first line was commented and the remaining lines
 * ended up in the file as live directives.
 *
 * @param rule - `{ userAgent: string[], comment?, allow?, disallow?, crawlDelay? }`.
 * @returns The lines for this group, without a trailing blank line.
 */
function renderRule(rule) {
    const lines = [];
    for (const userAgent of rule.userAgent) {
        lines.push(`User-agent: ${userAgent}`);
    }
    if (rule.comment) {
        for (const commentLine of String(rule.comment).split(/\r\n|\r|\n/)) {
            lines.push(`# ${commentLine}`);
        }
    }
    for (const allow of rule.allow ?? []) {
        lines.push(`Allow: ${allow}`);
    }
    for (const disallow of rule.disallow ?? []) {
        lines.push(`Disallow: ${disallow}`);
    }
    // Explicit undefined check so a delay of 0 is still rendered.
    if (rule.crawlDelay !== undefined) {
        lines.push(`Crawl-delay: ${rule.crawlDelay}`);
    }
    return lines;
}
|
|
101
|
+
/**
 * Renders the compiled policy as robots.txt text.
 *
 * Layout: a generated-by header, the wildcard (`*`) group followed by the
 * `Content-signal:` line, the remaining custom rules, one group per bot rule
 * (`Allow: /` for `'allow'`, `Disallow: /` for anything else), then the
 * optional `Host:` line and the `Sitemap:` lines. When no custom rule targets
 * `*`, a default group allowing `/` is used.
 *
 * @param policy - Compiled policy (`preset`, `rules`, `contentSignals`, `botRules`, `host`, `sitemaps`).
 * @returns The robots.txt text, trimmed and ending in a single newline.
 */
export function renderRobotsTxt(policy) {
    const out = ['# Generated by astro-crawler-policy', `# preset: ${policy.preset}`, ''];

    const defaultWildcard = { userAgent: ['*'], allow: ['/'], disallow: [] };
    const wildcard = policy.rules.find((rule) => rule.userAgent.includes('*')) ?? defaultWildcard;
    out.push(...renderRule(wildcard), ...renderContentSignals(policy.contentSignals), '');

    // The wildcard group has already been emitted above; skip it here.
    for (const rule of policy.rules.filter((candidate) => candidate !== wildcard)) {
        out.push(...renderRule(rule), '');
    }
    for (const { userAgent, action } of policy.botRules) {
        out.push(`User-agent: ${userAgent}`, action === 'allow' ? 'Allow: /' : 'Disallow: /', '');
    }
    if (policy.host) {
        out.push(`Host: ${policy.host}`);
    }
    out.push(...policy.sitemaps.map((sitemap) => `Sitemap: ${sitemap}`));
    return `${out.join('\n').trim()}\n`;
}
|