@casoon/astro-crawler-policy 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -53,7 +53,9 @@ Presets are the primary way to express intent. Each preset sets default content
53
53
  | `blockTraining` | allow | allow | disallow | disallow |
54
54
  | `lockdown` | disallow | disallow | disallow | disallow |
55
55
 
56
- `blockTraining` additionally adds explicit `Disallow` rules for GPTBot, Google-Extended, and CCBot on top of the group-level setting.
56
+ `citationFriendly` allows bots that do citation or summarization but blocks bots whose only purpose is training data collection (GPTBot, Google-Extended, CCBot, Bytespider, Applebot-Extended). Bots with mixed roles like ClaudeBot are allowed.
57
+
58
+ `blockTraining` goes further and blocks every bot with any training category, including mixed bots like ClaudeBot and meta-externalagent.
57
59
 
58
60
  `lockdown` adds a global `User-agent: * / Disallow: /` rule, overriding everything.
59
61
 
@@ -62,7 +64,9 @@ Presets are the primary way to express intent. Each preset sets default content
62
64
  Content signals are non-standard directives emitted inside the wildcard `User-agent: *` block, before its `Allow`/`Disallow` rules:
63
65
 
64
66
  ```
65
- Content-signal: search=yes, ai-input=yes, ai-train=no
67
+ User-agent: *
68
+ Content-Signal: search=yes, ai-input=yes, ai-train=no
69
+ Allow: /
66
70
  ```
67
71
 
68
72
  They communicate intent to crawlers that support them. The three signals map to:
@@ -73,7 +77,7 @@ They communicate intent to crawlers that support them. The three signals map to:
73
77
  | `aiInput` | Using content as input for AI responses (citation, summarization) |
74
78
  | `aiTrain` | Using content as AI training data |
75
79
 
76
- Content signals are not yet a web standard. Google Search Console may flag them as unrecognised directives — the audit system emits an `info` message when they are present.
80
+ The directive name and signal keys follow the [contentsignals.org](https://contentsignals.org) specification (proposed IETF aipref standard). Google Search Console may flag them as unrecognised directives — the audit system emits an `info` message when they are present.
77
81
 
78
82
  Each preset sets default values for all three signals. You can override them individually:
79
83
 
@@ -379,8 +383,8 @@ crawlerPolicy({
379
383
  # preset: citationFriendly
380
384
 
381
385
  User-agent: *
386
+ Content-Signal: search=yes, ai-input=yes, ai-train=no
382
387
  Allow: /
383
- Content-signal: search=yes, ai-input=yes, ai-train=no
384
388
 
385
389
  User-agent: GPTBot
386
390
  Disallow: /
@@ -435,8 +439,8 @@ crawlerPolicy({ preset: 'seoOnly' })
435
439
  # preset: seoOnly
436
440
 
437
441
  User-agent: *
442
+ Content-Signal: search=yes, ai-input=no, ai-train=no
438
443
  Allow: /
439
- Content-signal: search=yes, ai-input=no, ai-train=no
440
444
 
441
445
  User-agent: GPTBot
442
446
  Disallow: /
@@ -496,6 +500,10 @@ When `CONTEXT=staging` or `NODE_ENV=staging`:
496
500
  # preset: lockdown
497
501
 
498
502
  User-agent: *
503
+ Content-Signal: search=no, ai-input=no, ai-train=no
499
504
  Disallow: /
500
- Content-signal: search=no, ai-input=no, ai-train=no
501
505
  ```
506
+
507
+ ---
508
+
509
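+ > This tool only affects crawlers and AI bots that actually honor robots.txt. The file is advisory, not an access control, and many bots ignore it — block unwanted traffic at the server or CDN level if you need enforcement.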
package/dist/presets.js CHANGED
@@ -21,6 +21,13 @@ export const presetDefaults = {
21
21
  searchEngines: 'allow',
22
22
  verifiedAi: 'allow',
23
23
  unknownAi: 'disallow'
24
+ },
25
+ bots: {
26
+ GPTBot: 'disallow',
27
+ 'Google-Extended': 'disallow',
28
+ CCBot: 'disallow',
29
+ Bytespider: 'disallow',
30
+ 'Applebot-Extended': 'disallow'
24
31
  }
25
32
  },
26
33
  openToAi: {
@@ -48,8 +55,12 @@ export const presetDefaults = {
48
55
  },
49
56
  bots: {
50
57
  GPTBot: 'disallow',
58
+ ClaudeBot: 'disallow',
51
59
  'Google-Extended': 'disallow',
52
- CCBot: 'disallow'
60
+ CCBot: 'disallow',
61
+ Bytespider: 'disallow',
62
+ 'meta-externalagent': 'disallow',
63
+ 'Applebot-Extended': 'disallow'
53
64
  }
54
65
  },
55
66
  lockdown: {
package/dist/render.js CHANGED
@@ -77,7 +77,7 @@ function renderContentSignals(contentSignals) {
77
77
  if (contentSignals.aiTrain !== undefined) {
78
78
  parts.push(`ai-train=${contentSignals.aiTrain ? 'yes' : 'no'}`);
79
79
  }
80
- return parts.length ? [`Content-signal: ${parts.join(', ')}`] : [];
80
+ return parts.length ? [`Content-Signal: ${parts.join(', ')}`] : [];
81
81
  }
82
82
  function renderRule(rule) {
83
83
  const lines = [];
@@ -109,8 +109,23 @@ export function renderRobotsTxt(policy) {
109
109
  allow: ['/'],
110
110
  disallow: []
111
111
  };
112
- lines.push(...renderRule(wildcardRule));
112
+ // Render wildcard block with Content-Signal before Allow/Disallow (per spec)
113
+ for (const userAgent of wildcardRule.userAgent) {
114
+ lines.push(`User-agent: ${userAgent}`);
115
+ }
116
+ if (wildcardRule.comment) {
117
+ lines.push(`# ${wildcardRule.comment}`);
118
+ }
113
119
  lines.push(...renderContentSignals(policy.contentSignals));
120
+ for (const allow of wildcardRule.allow ?? []) {
121
+ lines.push(`Allow: ${allow}`);
122
+ }
123
+ for (const disallow of wildcardRule.disallow ?? []) {
124
+ lines.push(`Disallow: ${disallow}`);
125
+ }
126
+ if (wildcardRule.crawlDelay !== undefined) {
127
+ lines.push(`Crawl-delay: ${wildcardRule.crawlDelay}`);
128
+ }
114
129
  lines.push('');
115
130
  for (const rule of policy.rules) {
116
131
  if (rule === wildcardRule) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@casoon/astro-crawler-policy",
3
- "version": "0.1.0",
3
+ "version": "0.1.2",
4
4
  "description": "Policy-first crawler control for Astro — generates robots.txt and llms.txt with presets, per-bot rules, AI crawler registry, and build-time audits.",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",