@aiacta-org/ai-attribution-lint 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +167 -0
- package/package.json +17 -0
- package/src/cli.js +23 -0
- package/src/fetcher.js +27 -0
- package/src/index.js +16 -0
- package/src/parser.js +72 -0
- package/src/rules/contact.js +9 -0
- package/src/rules/index.js +11 -0
- package/src/rules/purpose.js +21 -0
- package/src/rules/reward-tier.js +11 -0
- package/src/rules/robots-txt-conflict.js +73 -0
- package/src/rules/schema-version.js +11 -0
- package/src/rules/spdx-license.js +11 -0
- package/src/rules/webhook-reachability.js +23 -0
- package/src/runner.js +24 -0
- package/tests/parser.test.js +14 -0
- package/tests/robots-conflict.test.js +17 -0
- package/tests/rules.test.js +17 -0
package/README.md
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
# @aiacta-org/ai-attribution-lint
|
|
2
|
+
|
|
3
|
+
> CLI validator for `ai-attribution.txt` files — the AIACTA open standard for AI content attribution (Proposal 4, §5.7).
|
|
4
|
+
|
|
5
|
+
[npm](https://www.npmjs.com/package/@aiacta-org/ai-attribution-lint)
|
|
6
|
+
[License: Apache-2.0](../../LICENSE)
|
|
7
|
+
[Proposal 4](../../docs/proposals/proposal-4-ai-attribution-txt.md)
|
|
8
|
+
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
## What is this?
|
|
12
|
+
|
|
13
|
+
`@aiacta-org/ai-attribution-lint` validates your `ai-attribution.txt` file — the plain-text file publishers place on their website to declare their preferences to AI systems. It checks syntax, required fields, valid values, SPDX licence identifiers, and webhook reachability.
|
|
14
|
+
|
|
15
|
+
Think of it as the `eslint` for the AIACTA standard.
|
|
16
|
+
|
|
17
|
+
---
|
|
18
|
+
|
|
19
|
+
## Install
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
# Run once without installing (recommended for quick checks)
|
|
23
|
+
npx @aiacta-org/ai-attribution-lint https://yourdomain.com
|
|
24
|
+
|
|
25
|
+
# Install globally
|
|
26
|
+
npm install -g @aiacta-org/ai-attribution-lint
|
|
27
|
+
|
|
28
|
+
# Install as a dev dependency in a project
|
|
29
|
+
npm install --save-dev @aiacta-org/ai-attribution-lint
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
---
|
|
33
|
+
|
|
34
|
+
## Usage
|
|
35
|
+
|
|
36
|
+
### Check a live website
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
npx @aiacta-org/ai-attribution-lint https://yourdomain.com
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
The linter automatically fetches `/.well-known/ai-attribution.txt` (and falls back to `/ai-attribution.txt`).
|
|
43
|
+
|
|
44
|
+
### Check a local file
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
npx @aiacta-org/ai-attribution-lint ./ai-attribution.txt
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
### JSON output (for CI pipelines)
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
npx @aiacta-org/ai-attribution-lint https://yourdomain.com --json
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
Output format:
|
|
57
|
+
```json
|
|
58
|
+
{
|
|
59
|
+
"errors": [],
|
|
60
|
+
"warnings": [
|
|
61
|
+
"Schema-Version is missing — defaults to 1.0"
|
|
62
|
+
],
|
|
63
|
+
"info": []
|
|
64
|
+
}
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
Exit codes:
|
|
68
|
+
- `0` — Passed (no errors)
|
|
69
|
+
- `1` — Failed (one or more errors, or warnings in `--strict` mode)
|
|
70
|
+
- `2` — Could not fetch or parse the file
|
|
71
|
+
|
|
72
|
+
### Strict mode (treat warnings as errors)
|
|
73
|
+
|
|
74
|
+
```bash
|
|
75
|
+
npx @aiacta-org/ai-attribution-lint https://yourdomain.com --strict
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
---
|
|
79
|
+
|
|
80
|
+
## What it checks
|
|
81
|
+
|
|
82
|
+
| Rule | Severity |
|
|
83
|
+
|------|----------|
|
|
84
|
+
| `Schema-Version` is present and is `"1.0"` | Warning if missing |
|
|
85
|
+
| `Contact` field is present | Warning if missing |
|
|
86
|
+
| `Allow-Purpose` / `Disallow-Purpose` values are valid enum values (`training`, `rag`, `index`, `quality-eval`) | Error |
|
|
87
|
+
| `Content-License` is a valid SPDX identifier or `All-Rights-Reserved` | Error |
|
|
88
|
+
| `Citation-Webhook` URL is reachable (HTTP HEAD request) | Warning if unreachable |
|
|
89
|
+
| `Reward-Tier` is a valid enum value | Error |
|
|
90
|
+
| `robots.txt` conflicts with `Allow-Purpose` | Warning |
|
|
91
|
+
| Unknown fields are silently ignored (forward-compatibility per §5.6) | — |
|
|
92
|
+
|
|
93
|
+
---
|
|
94
|
+
|
|
95
|
+
## Use in CI/CD
|
|
96
|
+
|
|
97
|
+
Add to your GitHub Actions workflow to validate your `ai-attribution.txt` on every deploy:
|
|
98
|
+
|
|
99
|
+
```yaml
|
|
100
|
+
- name: Validate ai-attribution.txt
|
|
101
|
+
run: npx @aiacta-org/ai-attribution-lint https://yourdomain.com --json
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
Or validate a local file during build:
|
|
105
|
+
|
|
106
|
+
```yaml
|
|
107
|
+
- name: Validate ai-attribution.txt
|
|
108
|
+
run: npx @aiacta-org/ai-attribution-lint ./public/.well-known/ai-attribution.txt
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
---
|
|
112
|
+
|
|
113
|
+
## Node.js API
|
|
114
|
+
|
|
115
|
+
```javascript
|
|
116
|
+
const { lint } = require('@aiacta-org/ai-attribution-lint');
|
|
117
|
+
|
|
118
|
+
const result = await lint('https://yourdomain.com');
|
|
119
|
+
// or: await lint('./ai-attribution.txt')
|
|
120
|
+
|
|
121
|
+
console.log(result.errors); // string[] — blocking issues
|
|
122
|
+
console.log(result.warnings); // string[] — advisory issues
|
|
123
|
+
console.log(result.info); // string[] — informational notes
|
|
124
|
+
|
|
125
|
+
if (result.errors.length > 0) {
|
|
126
|
+
process.exit(1);
|
|
127
|
+
}
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
---
|
|
131
|
+
|
|
132
|
+
## Example ai-attribution.txt
|
|
133
|
+
|
|
134
|
+
```
|
|
135
|
+
Schema-Version: 1.0
|
|
136
|
+
Contact: ai-licensing@yourdomain.com
|
|
137
|
+
Preferred-Attribution: Your Publication Name (yourdomain.com)
|
|
138
|
+
Allow-Purpose: rag
|
|
139
|
+
Allow-Purpose: index
|
|
140
|
+
Disallow-Purpose: training
|
|
141
|
+
Require-Citation: true
|
|
142
|
+
Require-Source-Link: true
|
|
143
|
+
Citation-Format: title-and-url
|
|
144
|
+
Citation-Webhook: https://yourdomain.com/webhooks/ai-citations
|
|
145
|
+
Recrawl-After: 24h
|
|
146
|
+
Reward-Tier: standard
|
|
147
|
+
Content-License: All-Rights-Reserved
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
Place this file at `https://yourdomain.com/.well-known/ai-attribution.txt`.
|
|
151
|
+
|
|
152
|
+
---
|
|
153
|
+
|
|
154
|
+
## Related packages
|
|
155
|
+
|
|
156
|
+
| Package | Purpose |
|
|
157
|
+
|---------|---------|
|
|
158
|
+
| [`@aiacta-org/ai-citation-sdk`](https://www.npmjs.com/package/@aiacta-org/ai-citation-sdk) | Receive and verify citation webhook events |
|
|
159
|
+
| [`@aiacta-org/crawl-manifest-client`](https://www.npmjs.com/package/@aiacta-org/crawl-manifest-client) | Query AI providers' crawl history for your domain |
|
|
160
|
+
|
|
161
|
+
---
|
|
162
|
+
|
|
163
|
+
## License & Copyright
|
|
164
|
+
|
|
165
|
+
Copyright © 2026 Eric Michel, PhD. Licensed under the [Apache License 2.0](../../LICENSE).
|
|
166
|
+
|
|
167
|
+
AIACTA™ is part of the [AIACTA open standard](https://github.com/aiacta-org/aiacta).
|
package/package.json
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@aiacta-org/ai-attribution-lint",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "CLI validator for ai-attribution.txt files (AIACTA Proposal 4 §5.7)",
|
|
5
|
+
"author": "Eric Michel",
|
|
6
|
+
"license": "Apache-2.0",
|
|
7
|
+
"bin": { "ai-attribution-lint": "./src/cli.js" },
|
|
8
|
+
"main": "./src/index.js",
|
|
9
|
+
"scripts": { "test": "jest" },
|
|
10
|
+
"dependencies": {
|
|
11
|
+
"axios": "^1.6.0",
|
|
12
|
+
"spdx-license-ids": "^3.0.0",
|
|
13
|
+
"chalk": "^5.0.0",
|
|
14
|
+
"yargs": "^17.0.0"
|
|
15
|
+
},
|
|
16
|
+
"devDependencies": { "jest": "^29.0.0" }
|
|
17
|
+
}
|
package/src/cli.js
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
#!/usr/bin/env node
/**
 * ai-attribution-lint CLI entry point (§5.7)
 * Usage: npx ai-attribution-lint <url|file>
 *
 * Exit codes: 0 = passed, 1 = errors (or warnings with --strict),
 * 2 = could not fetch or parse the file.
 */
'use strict';
const yargs = require('yargs');
const { lint } = require('./index');

const argv = yargs
  .usage('Usage: $0 <url|path>')
  .option('json', { type: 'boolean', description: 'Output results as JSON' })
  .option('strict', { type: 'boolean', description: 'Exit 1 on warnings' })
  .demandCommand(1)
  .help()
  .argv;

lint(argv._[0], { json: argv.json, strict: argv.strict })
  .then(result => {
    const failed = result.errors.length > 0 || (argv.strict && result.warnings.length > 0);
    if (argv.json) {
      console.log(JSON.stringify(result, null, 2));
    } else {
      // Human-readable report. Previously nothing at all was printed without
      // --json, leaving users with only the exit code to interpret.
      for (const e of result.errors) console.error(`error:   ${e}`);
      for (const w of result.warnings) console.error(`warning: ${w}`);
      for (const i of result.info) console.log(`info:    ${i}`);
      console.log(failed ? 'FAIL' : 'PASS');
    }
    process.exit(failed ? 1 : 0);
  })
  .catch(err => { console.error(err.message); process.exit(2); });
|
package/src/fetcher.js
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
/**
 * Fetches ai-attribution.txt from a URL or reads from a local file path.
 * Respects Cache-Control / Expires headers as per §5.2.
 */
'use strict';
const axios = require('axios');
const fs = require('fs');

/**
 * Resolve the file for a target and return its raw text content.
 *
 * @param {string} target http(s) URL of the site, or a local filesystem path
 * @returns {Promise<string>} Raw file content
 * @throws {Error} When neither well-known nor root URL yields the file,
 *                 or when the local file cannot be read.
 */
async function fetchContent(target) {
  if (target.startsWith('http://') || target.startsWith('https://')) {
    // Try well-known location first (RFC 8615), fall back to root (§5.2).
    // Resolve against the URL rather than string-appending, so a target
    // like https://example.com/some/page still probes
    // https://example.com/.well-known/ai-attribution.txt at the site root.
    const urls = [
      new URL('/.well-known/ai-attribution.txt', target).toString(),
      new URL('/ai-attribution.txt', target).toString(),
    ];
    for (const url of urls) {
      try {
        const res = await axios.get(url, { timeout: 10_000, responseType: 'text' });
        if (res.status === 200) return res.data;
      } catch (_) { /* try next candidate location */ }
    }
    throw new Error(`Could not fetch ai-attribution.txt from ${target}`);
  }
  return fs.readFileSync(target, 'utf-8');
}

module.exports = { fetchContent };
|
package/src/index.js
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
/**
 * ai-attribution-lint — public API
 * Returns { errors: [], warnings: [], info: [] } for a given URL or file path.
 */
'use strict';
const { fetchContent } = require('./fetcher');
const { parse } = require('./parser');
const { runRules } = require('./runner');

/**
 * Lint an ai-attribution.txt served at a URL or stored at a local path.
 *
 * @param {string} target http(s) URL or filesystem path
 * @param {object} [opts] Options forwarded to the rule runner
 * @returns {Promise<{errors: string[], warnings: string[], info: string[]}>}
 */
async function lint(target, opts = {}) {
  const content = await fetchContent(target);
  return runRules(parse(content), target, opts);
}

module.exports = { lint };
|
package/src/parser.js
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
/**
 * Parses the key: value line format of ai-attribution.txt (§5.3).
 *
 * Rules:
 * - Lines starting with # are comments; blank lines are skipped.
 * - Field names are case-insensitive (normalised to title-case key names).
 * - Multi-value fields (Contact, Allow-Purpose, Disallow-Purpose) accumulate
 *   into arrays; all other fields take the last occurrence.
 * - Unknown fields are silently ignored for forward-compatibility (§5.6).
 */
'use strict';

// Fields that may appear multiple times and accumulate as arrays (§5.3)
const MULTI_VALUE_FIELDS = new Set(['Contact', 'Allow-Purpose', 'Disallow-Purpose']);

// Canonical field names — normalises alternate casings to the spec names
const CANONICAL = {
  'schema-version': 'Schema-Version',
  'contact': 'Contact',
  'preferred-attribution': 'Preferred-Attribution',
  'canonical-author': 'Canonical-Author',
  'allow-purpose': 'Allow-Purpose',
  'disallow-purpose': 'Disallow-Purpose',
  'require-citation': 'Require-Citation',
  'require-source-link': 'Require-Source-Link',
  'citation-format': 'Citation-Format',
  'allow-utm-append': 'Allow-UTM-Append',
  'preferred-utm-source': 'Preferred-UTM-Source',
  'citation-webhook': 'Citation-Webhook',
  'recrawl-after': 'Recrawl-After',
  'licensing-contact': 'Licensing-Contact',
  'licensing-url': 'Licensing-URL',
  'reward-tier': 'Reward-Tier',
  'content-license': 'Content-License',
};

/**
 * Parse raw ai-attribution.txt content into a structured object.
 *
 * @param {string} raw Raw file content
 * @returns {object} Parsed key/value map; multi-value fields are arrays
 */
function parse(raw) {
  const fields = {};
  for (const rawLine of raw.split('\n')) {
    const line = rawLine.trim();

    // Skip blanks and # comments
    if (line === '' || line.startsWith('#')) continue;

    // A field line must contain a colon separating key from value
    const sep = line.indexOf(':');
    if (sep === -1) continue;

    const value = line.slice(sep + 1).trim();
    if (!value) continue; // empty values carry no information

    // Normalise known keys to their canonical spelling; leave unknowns as-is
    const rawKey = line.slice(0, sep).trim();
    const key = CANONICAL[rawKey.toLowerCase()] || rawKey;

    if (MULTI_VALUE_FIELDS.has(key)) {
      if (!Array.isArray(fields[key])) fields[key] = [];
      fields[key].push(value);
    } else {
      fields[key] = value; // last occurrence wins
    }
  }
  return fields;
}
|
|
71
|
+
|
|
72
|
+
// Expose the field tables alongside parse() so rules and tests can share them
module.exports = { parse, MULTI_VALUE_FIELDS, CANONICAL };
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
/** Rule: At least one Contact field should be present (§5.4). */
|
|
2
|
+
'use strict';
|
|
3
|
+
module.exports = function ruleContact(parsed) {
|
|
4
|
+
const warnings = [];
|
|
5
|
+
if (!parsed['Contact'] || parsed['Contact'].length === 0) {
|
|
6
|
+
warnings.push('No Contact field found; publishers should provide a licensing contact');
|
|
7
|
+
}
|
|
8
|
+
return { errors: [], warnings };
|
|
9
|
+
};
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
/** Aggregates all validation rules. Order matters — later rules may depend on earlier parsing. */
'use strict';
// Each entry is a rule: (parsed, target) => { errors, warnings, info? }.
// Some rules are async (webhook-reachability and robots-txt-conflict perform
// network requests); the runner awaits every rule uniformly.
module.exports = [
  require('./schema-version'),
  require('./contact'),
  require('./purpose'),
  require('./spdx-license'),
  require('./webhook-reachability'),
  require('./reward-tier'),
  require('./robots-txt-conflict'),
];
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
/** Rule: Allow-Purpose and Disallow-Purpose must use valid enum values (§5.4). */
|
|
2
|
+
'use strict';
|
|
3
|
+
const VALID = new Set(['training', 'rag', 'index', 'quality-eval']);
|
|
4
|
+
function validatePurposeField(values, fieldName) {
|
|
5
|
+
const errors = [];
|
|
6
|
+
for (const v of (values || [])) {
|
|
7
|
+
if (!VALID.has(v.toLowerCase())) {
|
|
8
|
+
errors.push(`Invalid ${fieldName} value "${v}"; allowed: ${[...VALID].join(', ')}`);
|
|
9
|
+
}
|
|
10
|
+
}
|
|
11
|
+
return errors;
|
|
12
|
+
}
|
|
13
|
+
module.exports = function rulePurpose(parsed) {
|
|
14
|
+
return {
|
|
15
|
+
errors: [
|
|
16
|
+
...validatePurposeField(parsed['Allow-Purpose'], 'Allow-Purpose'),
|
|
17
|
+
...validatePurposeField(parsed['Disallow-Purpose'], 'Disallow-Purpose'),
|
|
18
|
+
],
|
|
19
|
+
warnings: [],
|
|
20
|
+
};
|
|
21
|
+
};
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
/** Rule: Reward-Tier must be a valid enum value (§5.4). */
|
|
2
|
+
'use strict';
|
|
3
|
+
const VALID = new Set(['standard', 'premium', 'licensing-only', 'none']);
|
|
4
|
+
module.exports = function ruleRewardTier(parsed) {
|
|
5
|
+
const errors = [];
|
|
6
|
+
const tier = parsed['Reward-Tier'];
|
|
7
|
+
if (tier && !VALID.has(tier.toLowerCase())) {
|
|
8
|
+
errors.push(`Invalid Reward-Tier "${tier}"; allowed: ${[...VALID].join(', ')}`);
|
|
9
|
+
}
|
|
10
|
+
return { errors, warnings: [] };
|
|
11
|
+
};
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Rule: robots.txt conflict checker (§5.5).
|
|
3
|
+
*
|
|
4
|
+
* Verifies that ai-attribution.txt Allow-Purpose directives do not conflict
|
|
5
|
+
* with Disallow rules in the site's robots.txt, since robots.txt Disallow
|
|
6
|
+
* takes precedence over ai-attribution.txt Allow-Purpose.
|
|
7
|
+
*
|
|
8
|
+
* Issues a WARNING (not error) because robots.txt may be intentionally
|
|
9
|
+
* restrictive while ai-attribution.txt is aspirational.
|
|
10
|
+
*/
|
|
11
|
+
'use strict';
|
|
12
|
+
const axios = require('axios');
|
|
13
|
+
const { URL } = require('url');
|
|
14
|
+
|
|
15
|
+
const KNOWN_AI_BOTS = ['GPTBot', 'ClaudeBot', 'Google-Extended', 'PerplexityBot', 'Grok-Bot'];
|
|
16
|
+
|
|
17
|
+
/**
|
|
18
|
+
* Minimally parses a robots.txt to extract Disallow rules for known AI bots.
|
|
19
|
+
* Not a full RFC 9309 parser — only checks direct bot-name matches.
|
|
20
|
+
* @param {string} robotsTxt
|
|
21
|
+
* @returns {Set<string>} Set of disallowed path prefixes for AI bots
|
|
22
|
+
*/
|
|
23
|
+
function parseRobotsForAiBots(robotsTxt) {
|
|
24
|
+
const disallowed = new Set();
|
|
25
|
+
const lines = robotsTxt.split('\n').map(l => l.trim());
|
|
26
|
+
let inAiBlock = false;
|
|
27
|
+
|
|
28
|
+
for (const line of lines) {
|
|
29
|
+
if (line.startsWith('#') || !line) { inAiBlock = false; continue; }
|
|
30
|
+
if (line.toLowerCase().startsWith('user-agent:')) {
|
|
31
|
+
const ua = line.slice('user-agent:'.length).trim();
|
|
32
|
+
inAiBlock = ua === '*' || KNOWN_AI_BOTS.some(b => ua.toLowerCase().includes(b.toLowerCase()));
|
|
33
|
+
continue;
|
|
34
|
+
}
|
|
35
|
+
if (inAiBlock && line.toLowerCase().startsWith('disallow:')) {
|
|
36
|
+
const path = line.slice('disallow:'.length).trim();
|
|
37
|
+
if (path) disallowed.add(path);
|
|
38
|
+
}
|
|
39
|
+
}
|
|
40
|
+
return disallowed;
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
module.exports = async function ruleRobotsTxtConflict(parsed, target) {
|
|
44
|
+
const warnings = [];
|
|
45
|
+
const info = [];
|
|
46
|
+
const allowPurpose = parsed['Allow-Purpose'] || [];
|
|
47
|
+
|
|
48
|
+
if (allowPurpose.length === 0) return { errors: [], warnings, info };
|
|
49
|
+
if (!target || !target.startsWith('http')) return { errors: [], warnings, info };
|
|
50
|
+
|
|
51
|
+
try {
|
|
52
|
+
const origin = new URL(target.replace(/\/?\.well-known.*$/, '').replace(/\/ai-attribution\.txt$/, '')).origin;
|
|
53
|
+
const robotsUrl = `${origin}/robots.txt`;
|
|
54
|
+
const res = await axios.get(robotsUrl, { timeout: 8_000, responseType: 'text' });
|
|
55
|
+
const disallowed = parseRobotsForAiBots(res.data);
|
|
56
|
+
|
|
57
|
+
if (disallowed.has('/') || disallowed.has('/*')) {
|
|
58
|
+
warnings.push(
|
|
59
|
+
`robots.txt Disallow: / blocks all AI bots, overriding Allow-Purpose: ${allowPurpose.join(', ')} (§5.5). ` +
|
|
60
|
+
`AI systems will respect robots.txt first.`
|
|
61
|
+
);
|
|
62
|
+
} else if (disallowed.size > 0) {
|
|
63
|
+
info.push(
|
|
64
|
+
`robots.txt restricts AI bots from ${disallowed.size} path(s): ${[...disallowed].slice(0,3).join(', ')}` +
|
|
65
|
+
`${disallowed.size > 3 ? '...' : ''}. These restrictions override ai-attribution.txt Allow-Purpose (§5.5).`
|
|
66
|
+
);
|
|
67
|
+
}
|
|
68
|
+
} catch (e) {
|
|
69
|
+
info.push(`Could not fetch robots.txt for conflict check: ${e.message}`);
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
return { errors: [], warnings, info };
|
|
73
|
+
};
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
/** Rule: Schema-Version must be present and parseable (§5.4). */
|
|
2
|
+
'use strict';
|
|
3
|
+
module.exports = function ruleSchemaVersion(parsed) {
|
|
4
|
+
const errors = [], warnings = [];
|
|
5
|
+
if (!parsed['Schema-Version']) {
|
|
6
|
+
warnings.push('Schema-Version field missing; assuming 1.0');
|
|
7
|
+
} else if (parsed['Schema-Version'] !== '1.0') {
|
|
8
|
+
warnings.push(`Unknown Schema-Version "${parsed['Schema-Version']}"; parser may not support all fields`);
|
|
9
|
+
}
|
|
10
|
+
return { errors, warnings };
|
|
11
|
+
};
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
/** Rule: Content-License must be a valid SPDX identifier (§5.4, §10.3). */
|
|
2
|
+
'use strict';
|
|
3
|
+
const spdxIds = require('spdx-license-ids');
|
|
4
|
+
module.exports = function ruleSpdxLicense(parsed) {
|
|
5
|
+
const errors = [], warnings = [];
|
|
6
|
+
const lic = parsed['Content-License'];
|
|
7
|
+
if (lic && lic !== 'All-Rights-Reserved' && !spdxIds.includes(lic)) {
|
|
8
|
+
errors.push(`Content-License "${lic}" is not a valid SPDX identifier. See https://spdx.org/licenses/`);
|
|
9
|
+
}
|
|
10
|
+
return { errors, warnings };
|
|
11
|
+
};
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Rule: Citation-Webhook endpoint should be reachable and on the same domain (§6.3).
|
|
3
|
+
* Issues a WARNING (not error) for network failures since CI may run offline.
|
|
4
|
+
*/
|
|
5
|
+
'use strict';
|
|
6
|
+
const axios = require('axios');
|
|
7
|
+
const { URL } = require('url');
|
|
8
|
+
module.exports = async function ruleWebhookReachability(parsed, target) {
|
|
9
|
+
const warnings = [];
|
|
10
|
+
const endpoint = parsed['Citation-Webhook'];
|
|
11
|
+
if (!endpoint) return { errors: [], warnings };
|
|
12
|
+
try {
|
|
13
|
+
const webhookHost = new URL(endpoint).hostname;
|
|
14
|
+
const targetHost = target.startsWith('http') ? new URL(target).hostname : null;
|
|
15
|
+
if (targetHost && webhookHost !== targetHost && !webhookHost.endsWith(`.${targetHost}`)) {
|
|
16
|
+
warnings.push(`Citation-Webhook host "${webhookHost}" differs from target domain "${targetHost}" — DNS verification required (§6.3)`);
|
|
17
|
+
}
|
|
18
|
+
await axios.head(endpoint, { timeout: 5_000 });
|
|
19
|
+
} catch (e) {
|
|
20
|
+
warnings.push(`Citation-Webhook endpoint not reachable: ${endpoint} (${e.message})`);
|
|
21
|
+
}
|
|
22
|
+
return { errors: [], warnings };
|
|
23
|
+
};
|
package/src/runner.js
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
/**
 * Runs all validation rules against the parsed config and returns results.
 *
 * Rules may be sync or async (e.g. webhook-reachability does a network call).
 * Every rule is awaited so async ones run to completion instead of silently
 * returning an unresolved Promise.
 */
'use strict';
const rules = require('./rules');

/**
 * @param {object} parsed Parsed ai-attribution.txt field map
 * @param {string} target URL or local path being linted
 * @param {object} opts Options (currently unused by individual rules)
 * @returns {Promise<{errors: string[], warnings: string[], info: string[]}>}
 */
async function runRules(parsed, target, opts) {
  const collected = { errors: [], warnings: [], info: [] };
  for (const rule of rules) {
    // await normalises sync rules (plain object) and async rules (Promise)
    const findings = await rule(parsed, target);
    collected.errors.push(...(findings.errors || []));
    collected.warnings.push(...(findings.warnings || []));
    collected.info.push(...(findings.info || []));
  }
  return collected;
}

module.exports = { runRules };
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
const { parse } = require('../src/parser');

test('parses basic fields', () => {
  const raw = 'Schema-Version: 1.0\nContact: licensing@example.com\nAllow-Purpose: rag\nAllow-Purpose: index';
  const parsed = parse(raw);
  // Multi-value fields accumulate; scalar fields keep their value
  expect(parsed['Allow-Purpose']).toEqual(['rag', 'index']);
  expect(parsed['Contact']).toEqual(['licensing@example.com']);
  expect(parsed['Schema-Version']).toBe('1.0');
});

test('ignores comments and blank lines', () => {
  const parsed = parse('# comment\n\nSchema-Version: 1.0');
  expect(parsed['Schema-Version']).toBe('1.0');
});
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
/**
 * Tests for the robots.txt conflict rule (§5.5).
 * Both cases exercise the rule's early-return guards, so no robots.txt
 * fetch is attempted and no HTTP mocking is needed.
 */
const ruleRobotsTxtConflict = require('../src/rules/robots-txt-conflict');

// Test with a local file path (no robots.txt fetch attempted)
test('skips robots check for local file paths', async () => {
  const r = await ruleRobotsTxtConflict({ 'Allow-Purpose': ['rag'] }, '/local/path/ai-attribution.txt');
  expect(r.errors).toHaveLength(0);
});

// Without Allow-Purpose there is nothing robots.txt could conflict with
test('skips robots check when no Allow-Purpose set', async () => {
  const r = await ruleRobotsTxtConflict({}, 'https://example.com');
  expect(r.errors).toHaveLength(0);
  expect(r.warnings).toHaveLength(0);
});
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
const ruleSchemaVersion = require('../src/rules/schema-version');
const ruleSpdxLicense = require('../src/rules/spdx-license');

test('schema-version warns when missing', () => {
  const { warnings } = ruleSchemaVersion({});
  expect(warnings.length).toBeGreaterThan(0);
});

test('spdx-license errors on unknown identifier', () => {
  const { errors } = ruleSpdxLicense({ 'Content-License': 'INVALID-ID' });
  expect(errors.length).toBe(1);
});

test('spdx-license passes on All-Rights-Reserved', () => {
  const { errors } = ruleSpdxLicense({ 'Content-License': 'All-Rights-Reserved' });
  expect(errors.length).toBe(0);
});
|