@casoon/astro-crawler-policy 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +14 -6
- package/dist/presets.js +12 -1
- package/dist/render.js +17 -2
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -53,7 +53,9 @@ Presets are the primary way to express intent. Each preset sets default content
|
|
|
53
53
|
| `blockTraining` | allow | allow | disallow | disallow |
|
|
54
54
|
| `lockdown` | disallow | disallow | disallow | disallow |
|
|
55
55
|
|
|
56
|
-
`
|
|
56
|
+
`citationFriendly` allows bots that do citation or summarization but blocks bots whose only purpose is training data collection (GPTBot, Google-Extended, CCBot, Bytespider, Applebot-Extended). Bots with mixed roles like ClaudeBot are allowed.
|
|
57
|
+
|
|
58
|
+
`blockTraining` goes further and blocks every bot with any training category, including mixed bots like ClaudeBot and meta-externalagent.
|
|
57
59
|
|
|
58
60
|
`lockdown` adds a global `User-agent: * / Disallow: /` rule, overriding everything.
|
|
59
61
|
|
|
@@ -62,7 +64,9 @@ Presets are the primary way to express intent. Each preset sets default content
|
|
|
62
64
|
Content signals are non-standard directives appended to the wildcard `User-agent: *` block:
|
|
63
65
|
|
|
64
66
|
```
|
|
65
|
-
|
|
67
|
+
User-agent: *
|
|
68
|
+
Content-Signal: search=yes, ai-input=yes, ai-train=no
|
|
69
|
+
Allow: /
|
|
66
70
|
```
|
|
67
71
|
|
|
68
72
|
They communicate intent to crawlers that support them. The three signals map to:
|
|
@@ -73,7 +77,7 @@ They communicate intent to crawlers that support them. The three signals map to:
|
|
|
73
77
|
| `aiInput` | Using content as input for AI responses (citation, summarization) |
|
|
74
78
|
| `aiTrain` | Using content as AI training data |
|
|
75
79
|
|
|
76
|
-
|
|
80
|
+
The directive name and signal keys follow the [contentsignals.org](https://contentsignals.org) specification (proposed IETF aipref standard). Google Search Console may flag them as unrecognised directives — the audit system emits an `info` message when they are present.
|
|
77
81
|
|
|
78
82
|
Each preset sets default values for all three signals. You can override them individually:
|
|
79
83
|
|
|
@@ -379,8 +383,8 @@ crawlerPolicy({
|
|
|
379
383
|
# preset: citationFriendly
|
|
380
384
|
|
|
381
385
|
User-agent: *
|
|
386
|
+
Content-Signal: search=yes, ai-input=yes, ai-train=no
|
|
382
387
|
Allow: /
|
|
383
|
-
Content-signal: search=yes, ai-input=yes, ai-train=no
|
|
384
388
|
|
|
385
389
|
User-agent: GPTBot
|
|
386
390
|
Disallow: /
|
|
@@ -435,8 +439,8 @@ crawlerPolicy({ preset: 'seoOnly' })
|
|
|
435
439
|
# preset: seoOnly
|
|
436
440
|
|
|
437
441
|
User-agent: *
|
|
442
|
+
Content-Signal: search=yes, ai-input=no, ai-train=no
|
|
438
443
|
Allow: /
|
|
439
|
-
Content-signal: search=yes, ai-input=no, ai-train=no
|
|
440
444
|
|
|
441
445
|
User-agent: GPTBot
|
|
442
446
|
Disallow: /
|
|
@@ -496,6 +500,10 @@ When `CONTEXT=staging` or `NODE_ENV=staging`:
|
|
|
496
500
|
# preset: lockdown
|
|
497
501
|
|
|
498
502
|
User-agent: *
|
|
503
|
+
Content-Signal: search=no, ai-input=no, ai-train=no
|
|
499
504
|
Disallow: /
|
|
500
|
-
Content-signal: search=no, ai-input=no, ai-train=no
|
|
501
505
|
```
|
|
506
|
+
|
|
507
|
+
---
|
|
508
|
+
|
|
509
|
+
> This tool only works for crawlers and AI bots that actually respect robots.txt. Respect, however, is rare these days.
|
package/dist/presets.js
CHANGED
|
@@ -21,6 +21,13 @@ export const presetDefaults = {
|
|
|
21
21
|
searchEngines: 'allow',
|
|
22
22
|
verifiedAi: 'allow',
|
|
23
23
|
unknownAi: 'disallow'
|
|
24
|
+
},
|
|
25
|
+
bots: {
|
|
26
|
+
GPTBot: 'disallow',
|
|
27
|
+
'Google-Extended': 'disallow',
|
|
28
|
+
CCBot: 'disallow',
|
|
29
|
+
Bytespider: 'disallow',
|
|
30
|
+
'Applebot-Extended': 'disallow'
|
|
24
31
|
}
|
|
25
32
|
},
|
|
26
33
|
openToAi: {
|
|
@@ -48,8 +55,12 @@ export const presetDefaults = {
|
|
|
48
55
|
},
|
|
49
56
|
bots: {
|
|
50
57
|
GPTBot: 'disallow',
|
|
58
|
+
ClaudeBot: 'disallow',
|
|
51
59
|
'Google-Extended': 'disallow',
|
|
52
|
-
CCBot: 'disallow'
|
|
60
|
+
CCBot: 'disallow',
|
|
61
|
+
Bytespider: 'disallow',
|
|
62
|
+
'meta-externalagent': 'disallow',
|
|
63
|
+
'Applebot-Extended': 'disallow'
|
|
53
64
|
}
|
|
54
65
|
},
|
|
55
66
|
lockdown: {
|
package/dist/render.js
CHANGED
|
@@ -77,7 +77,7 @@ function renderContentSignals(contentSignals) {
|
|
|
77
77
|
if (contentSignals.aiTrain !== undefined) {
|
|
78
78
|
parts.push(`ai-train=${contentSignals.aiTrain ? 'yes' : 'no'}`);
|
|
79
79
|
}
|
|
80
|
-
return parts.length ? [`Content-signal: ${parts.join(', ')}`] : [];
|
|
80
|
+
return parts.length ? [`Content-Signal: ${parts.join(', ')}`] : [];
|
|
81
81
|
}
|
|
82
82
|
function renderRule(rule) {
|
|
83
83
|
const lines = [];
|
|
@@ -109,8 +109,23 @@ export function renderRobotsTxt(policy) {
|
|
|
109
109
|
allow: ['/'],
|
|
110
110
|
disallow: []
|
|
111
111
|
};
|
|
112
|
-
|
|
112
|
+
// Render wildcard block with Content-Signal before Allow/Disallow (per spec)
|
|
113
|
+
for (const userAgent of wildcardRule.userAgent) {
|
|
114
|
+
lines.push(`User-agent: ${userAgent}`);
|
|
115
|
+
}
|
|
116
|
+
if (wildcardRule.comment) {
|
|
117
|
+
lines.push(`# ${wildcardRule.comment}`);
|
|
118
|
+
}
|
|
113
119
|
lines.push(...renderContentSignals(policy.contentSignals));
|
|
120
|
+
for (const allow of wildcardRule.allow ?? []) {
|
|
121
|
+
lines.push(`Allow: ${allow}`);
|
|
122
|
+
}
|
|
123
|
+
for (const disallow of wildcardRule.disallow ?? []) {
|
|
124
|
+
lines.push(`Disallow: ${disallow}`);
|
|
125
|
+
}
|
|
126
|
+
if (wildcardRule.crawlDelay !== undefined) {
|
|
127
|
+
lines.push(`Crawl-delay: ${wildcardRule.crawlDelay}`);
|
|
128
|
+
}
|
|
114
129
|
lines.push('');
|
|
115
130
|
for (const rule of policy.rules) {
|
|
116
131
|
if (rule === wildcardRule) {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@casoon/astro-crawler-policy",
|
|
3
|
-
"version": "0.1.0",
|
|
3
|
+
"version": "0.1.2",
|
|
4
4
|
"description": "Policy-first crawler control for Astro — generates robots.txt and llms.txt with presets, per-bot rules, AI crawler registry, and build-time audits.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./dist/index.js",
|