@aiacta-org/crawl-manifest-client 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +193 -0
- package/package.json +10 -0
- package/src/node/content-hash.js +54 -0
- package/src/node/index.js +127 -0
- package/src/python/content_hash.py +37 -0
- package/src/python/crawl_manifest_client.py +44 -0
- package/src/python/requirements.txt +3 -0
- package/tests/client.test.js +23 -0
- package/tests/content-hash.test.js +49 -0
- package/tests/python_client.test.py +46 -0
package/README.md
ADDED
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
# @aiacta-org/crawl-manifest-client
|
|
2
|
+
|
|
3
|
+
> Query any AIACTA-compliant AI provider's Crawl Manifest API — see exactly which of your pages were crawled, when, and for what purpose (Proposal 1, §2.2).
|
|
4
|
+
|
|
5
|
+
[npm: @aiacta-org/crawl-manifest-client](https://www.npmjs.com/package/@aiacta-org/crawl-manifest-client)
|
|
6
|
+
[License: Apache-2.0](../../LICENSE)
|
|
7
|
+
[Spec: Proposal 1 — Crawl Manifests](../../docs/proposals/proposal-1-crawl-manifests.md)
|
|
8
|
+
|
|
9
|
+
Available in **Node.js** and **Python**.
|
|
10
|
+
|
|
11
|
+
---
|
|
12
|
+
|
|
13
|
+
## What is this?
|
|
14
|
+
|
|
15
|
+
The Crawl Manifest API is what AI providers expose so publishers can audit their crawl history — the AI equivalent of Google Search Console. This client handles the API query, automatic pagination, rate-limit backoff, and local caching.
|
|
16
|
+
|
|
17
|
+
---
|
|
18
|
+
|
|
19
|
+
## Install
|
|
20
|
+
|
|
21
|
+
**Node.js**
|
|
22
|
+
```bash
|
|
23
|
+
npm install @aiacta-org/crawl-manifest-client
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
**Python**
|
|
27
|
+
```bash
|
|
28
|
+
pip install crawl-manifest-client
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
---
|
|
32
|
+
|
|
33
|
+
## Quick Start
|
|
34
|
+
|
|
35
|
+
### Node.js
|
|
36
|
+
|
|
37
|
+
```javascript
|
|
38
|
+
const { CrawlManifestClient } = require('@aiacta-org/crawl-manifest-client');
|
|
39
|
+
|
|
40
|
+
const client = new CrawlManifestClient({
|
|
41
|
+
provider: 'anthropic',
|
|
42
|
+
apiKey: process.env.ANTHROPIC_PUBLISHER_KEY,
|
|
43
|
+
});
|
|
44
|
+
|
|
45
|
+
// Fetch all crawl events for your domain — pagination handled automatically
|
|
46
|
+
for await (const entry of client.fetchAll({
|
|
47
|
+
domain: 'yourdomain.com',
|
|
48
|
+
from: '2026-03-01T00:00:00Z',
|
|
49
|
+
to: '2026-03-31T23:59:59Z',
|
|
50
|
+
})) {
|
|
51
|
+
console.log(entry.url);
|
|
52
|
+
console.log('Last crawled:', entry.last_crawled);
|
|
53
|
+
console.log('Purpose:', entry.purpose); // ['rag', 'index', ...]
|
|
54
|
+
console.log('HTTP status:', entry.http_status_at_crawl);
|
|
55
|
+
console.log('Content hash:', entry.content_hash); // sha256:...
|
|
56
|
+
}
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
### Python
|
|
60
|
+
|
|
61
|
+
```python
|
|
62
|
+
from crawl_manifest_client import CrawlManifestClient
|
|
63
|
+
|
|
64
|
+
client = CrawlManifestClient(
|
|
65
|
+
provider='anthropic',
|
|
66
|
+
api_key=os.environ['ANTHROPIC_PUBLISHER_KEY']
|
|
67
|
+
)
|
|
68
|
+
|
|
69
|
+
for entry in client.fetch_all(
|
|
70
|
+
domain='yourdomain.com',
|
|
71
|
+
from_date='2026-03-01T00:00:00Z',
|
|
72
|
+
to_date='2026-03-31T23:59:59Z'
|
|
73
|
+
):
|
|
74
|
+
print(entry['url'], entry['purpose'])
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
---
|
|
78
|
+
|
|
79
|
+
## Filter by purpose
|
|
80
|
+
|
|
81
|
+
```javascript
|
|
82
|
+
// See only pages crawled for model training
|
|
83
|
+
for await (const entry of client.fetchAll({
|
|
84
|
+
domain: 'yourdomain.com',
|
|
85
|
+
from: '2026-01-01T00:00:00Z',
|
|
86
|
+
to: '2026-03-31T00:00:00Z',
|
|
87
|
+
purpose: ['training'],
|
|
88
|
+
})) {
|
|
89
|
+
console.log('Trained on:', entry.url);
|
|
90
|
+
}
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
Valid purpose values: `training` · `rag` · `index` · `quality-eval`
|
|
94
|
+
|
|
95
|
+
---
|
|
96
|
+
|
|
97
|
+
## Each result contains
|
|
98
|
+
|
|
99
|
+
| Field | Type | Description |
|
|
100
|
+
|-------|------|-------------|
|
|
101
|
+
| `url` | string | The full URL that was crawled |
|
|
102
|
+
| `last_crawled` | ISO 8601 | When most recently crawled |
|
|
103
|
+
| `crawl_count_30d` | integer | How many times in the last 30 days |
|
|
104
|
+
| `purpose` | string[] | What the AI used the content for |
|
|
105
|
+
| `http_status_at_crawl` | integer | HTTP status received (200, 404, etc.) |
|
|
106
|
+
| `content_hash` | string | SHA-256 of the page content at crawl time |
|
|
107
|
+
|
|
108
|
+
---
|
|
109
|
+
|
|
110
|
+
## Date range limits
|
|
111
|
+
|
|
112
|
+
The API allows a maximum of **90 days per request** (§2.2). For longer periods, split your query:
|
|
113
|
+
|
|
114
|
+
```javascript
|
|
115
|
+
// Query 6 months across two requests
|
|
116
|
+
const ranges = [
|
|
117
|
+
{ from: '2025-10-01T00:00:00Z', to: '2025-12-30T00:00:00Z' },
|
|
118
|
+
{ from: '2025-12-30T00:00:00Z', to: '2026-03-30T00:00:00Z' },
|
|
119
|
+
];
|
|
120
|
+
for (const range of ranges) {
|
|
121
|
+
for await (const entry of client.fetchAll({ domain: 'yourdomain.com', ...range })) {
|
|
122
|
+
// process entry
|
|
123
|
+
}
|
|
124
|
+
}
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
Requesting more than 90 days throws a `RangeError` before making any API calls.
|
|
128
|
+
|
|
129
|
+
---
|
|
130
|
+
|
|
131
|
+
## Rate limits & caching
|
|
132
|
+
|
|
133
|
+
- The spec allows 60 requests/hour per domain (§2.2)
|
|
134
|
+
- The client warns when you approach the limit
|
|
135
|
+
- Automatically waits on `429` responses, respecting `X-RateLimit-Reset`
|
|
136
|
+
- Caches responses in memory for 1 hour — repeated queries return instantly
|
|
137
|
+
|
|
138
|
+
---
|
|
139
|
+
|
|
140
|
+
## API Reference
|
|
141
|
+
|
|
142
|
+
### `new CrawlManifestClient({ provider, apiKey, baseUrl? })`
|
|
143
|
+
|
|
144
|
+
| Option | Type | Required | Description |
|
|
145
|
+
|--------|------|----------|-------------|
|
|
146
|
+
| `provider` | string | Yes | Provider identifier, e.g. `'anthropic'`, `'openai'`, `'google'` |
|
|
147
|
+
| `apiKey` | string | Yes | Bearer token from the provider's Publisher Portal |
|
|
148
|
+
| `baseUrl` | string | No | Override the API base URL (for testing) |
|
|
149
|
+
|
|
150
|
+
### `client.fetchAll({ domain, from, to, purpose? })`
|
|
151
|
+
|
|
152
|
+
Returns an async generator yielding `CrawlManifestUrl` objects. Handles all pagination automatically.
|
|
153
|
+
|
|
154
|
+
| Option | Type | Required | Description |
|
|
155
|
+
|--------|------|----------|-------------|
|
|
156
|
+
| `domain` | string | Yes | Your domain, e.g. `'yourdomain.com'` |
|
|
157
|
+
| `from` | ISO 8601 | Yes | Start of query window |
|
|
158
|
+
| `to` | ISO 8601 | Yes | End of query window (max 90 days from `from`) |
|
|
159
|
+
| `purpose` | string[] | No | Filter to specific purpose values |
|
|
160
|
+
|
|
161
|
+
---
|
|
162
|
+
|
|
163
|
+
## Verify content integrity
|
|
164
|
+
|
|
165
|
+
```javascript
|
|
166
|
+
const { computeContentHash } = require('@aiacta-org/crawl-manifest-client/src/node/content-hash');
|
|
167
|
+
|
|
168
|
+
const yourPageHtml = fs.readFileSync('./article.html', 'utf-8');
|
|
169
|
+
const yourHash = computeContentHash(yourPageHtml);
|
|
170
|
+
|
|
171
|
+
for await (const entry of client.fetchAll({ domain: 'yourdomain.com', from, to })) {
|
|
172
|
+
if (entry.content_hash !== yourHash) {
|
|
173
|
+
console.warn('Content mismatch on', entry.url, '— AI may have crawled an older version');
|
|
174
|
+
}
|
|
175
|
+
}
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
---
|
|
179
|
+
|
|
180
|
+
## Related packages
|
|
181
|
+
|
|
182
|
+
| Package | Purpose |
|
|
183
|
+
|---------|---------|
|
|
184
|
+
| [`@aiacta-org/ai-attribution-lint`](https://www.npmjs.com/package/@aiacta-org/ai-attribution-lint) | Validate your `ai-attribution.txt` |
|
|
185
|
+
| [`@aiacta-org/ai-citation-sdk`](https://www.npmjs.com/package/@aiacta-org/ai-citation-sdk) | Receive citation webhook events |
|
|
186
|
+
|
|
187
|
+
---
|
|
188
|
+
|
|
189
|
+
## License & Copyright
|
|
190
|
+
|
|
191
|
+
Copyright © 2026 Eric Michel, PhD. Licensed under the [Apache License 2.0](../../LICENSE).
|
|
192
|
+
|
|
193
|
+
Part of the [AIACTA open standard](https://github.com/aiacta-org/aiacta).
|
package/package.json
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@aiacta-org/crawl-manifest-client",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Client library for querying the AIACTA Crawl Manifest API with pagination, rate-limit handling, and local caching (Proposal 1 §2.2)",
|
|
5
|
+
"author": "Eric Michel",
|
|
6
|
+
"main": "./src/node/index.js",
|
|
7
|
+
"scripts": { "test": "jest" },
|
|
8
|
+
"dependencies": { "axios": "^1.6.0", "node-cache": "^5.1.0" },
|
|
9
|
+
"devDependencies": { "jest": "^29.0.0", "nock": "^14.0.11" }
|
|
10
|
+
}
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Content hash utility — §2.3.
|
|
3
|
+
*
|
|
4
|
+
* Spec: "SHA-256 of UTF-8 normalized body text (HTML stripped, whitespace collapsed)"
|
|
5
|
+
*
|
|
6
|
+
* Used by AI providers when building crawl manifest entries.
|
|
7
|
+
* Publishers may cross-check hashes to detect content drift.
|
|
8
|
+
*/
|
|
9
|
+
'use strict';
|
|
10
|
+
const crypto = require('crypto');
|
|
11
|
+
|
|
12
|
+
// Minimal HTML tag stripper (no DOM dependency for broad compatibility).
// <script>/<style> blocks are dropped wholesale (their text is not body
// content); every remaining tag becomes a single space so adjacent words
// do not merge when tags are removed.
function stripHtml(html) {
  const withoutScripts = html.replace(/<script[\s\S]*?<\/script>/gi, ' ');
  const withoutStyles = withoutScripts.replace(/<style[\s\S]*?<\/style>/gi, ' ');
  return withoutStyles.replace(/<[^>]+>/g, ' ');
}
|
|
19
|
+
|
|
20
|
+
// Whitespace normalisation per the spec: every run of whitespace —
// explicitly including NBSP (U+00A0) and zero-width space (U+200B) —
// becomes one ASCII space, and leading/trailing space is trimmed.
function collapseWhitespace(text) {
  const squeezed = text.replace(/[\s\u00A0\u200B]+/g, ' ');
  return squeezed.trim();
}
|
|
24
|
+
|
|
25
|
+
/**
 * Compute the AIACTA content hash for a crawled page (§2.3):
 * SHA-256 over the UTF-8 bytes of the normalised body text
 * (HTML stripped, whitespace collapsed).
 *
 * @param {string} rawHtml The full HTML source of the crawled page
 * @returns {string} "sha256:<hex>" as defined in §2.3
 *
 * @example
 * const { computeContentHash } = require('./content-hash');
 * const hash = computeContentHash('<html><body>hello world</body></html>');
 * // => "sha256:b94d27b99..." (SHA-256 of the normalised text "hello world")
 */
function computeContentHash(rawHtml) {
  const normalised = collapseWhitespace(stripHtml(rawHtml));
  const hasher = crypto.createHash('sha256');
  hasher.update(normalised, 'utf8');
  return `sha256:${hasher.digest('hex')}`;
}
|
|
41
|
+
|
|
42
|
+
/**
 * Check whether a previously recorded content hash still matches the page.
 * Publishers use this to detect unexpected content changes after a crawl.
 *
 * @param {string} rawHtml Current page HTML
 * @param {string} storedHash Previously recorded "sha256:<hex>"
 * @returns {boolean} true when the recomputed hash equals storedHash
 */
function verifyContentHash(rawHtml, storedHash) {
  const current = computeContentHash(rawHtml);
  return current === storedHash;
}
|
|
53
|
+
|
|
54
|
+
module.exports = { computeContentHash, verifyContentHash, stripHtml, collapseWhitespace };
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* crawl-manifest-client — Node.js
|
|
3
|
+
*
|
|
4
|
+
* Queries GET /crawl-manifest/v1 (§2.2) with:
|
|
5
|
+
* - Cursor-based automatic pagination
|
|
6
|
+
* - 90-day max range validation (spec requirement)
|
|
7
|
+
* - Rate-limit backoff (X-RateLimit-Remaining / X-RateLimit-Reset)
|
|
8
|
+
* - In-memory response caching (TTL = 1 hour)
|
|
9
|
+
*/
|
|
10
|
+
'use strict';
|
|
11
|
+
const axios = require('axios');
|
|
12
|
+
const NodeCache = require('node-cache');
|
|
13
|
+
|
|
14
|
+
const MAX_RANGE_DAYS = 90;      // §2.2: max date range 90 days per request
const RATE_LIMIT_RPH = 60;      // §2.2: 60 requests/hour per domain
const CACHE_TTL_MS = 3_600_000; // responses cached for 1 hour (see README)
const MAX_429_RETRIES = 5;      // bound 429 retries instead of looping forever

class CrawlManifestClient {
  /**
   * @param {object} opts
   * @param {string} opts.provider Provider identifier (e.g. 'anthropic')
   * @param {string} opts.apiKey Publisher API key (Bearer token)
   * @param {string} [opts.baseUrl] Override base URL for testing
   */
  constructor({ provider, apiKey, baseUrl }) {
    this.baseUrl = baseUrl || `https://api.${provider}.com/crawl-manifest/v1`;
    this.apiKey = apiKey;
    this._reqLog = []; // sliding window for client-side rate guard
    // Per-instance cache (cacheKey -> { expires, data }). A module-level
    // cache keyed without the base URL would let two clients pointed at
    // different providers serve each other's cached pages.
    this._cache = new Map();
  }

  /**
   * Validates the requested date range (§2.2): both bounds must parse as
   * dates, 'from' must not be after 'to', and the span may not exceed 90 days.
   * @param {string} from ISO 8601
   * @param {string} to ISO 8601
   * @throws {RangeError} on unparseable, inverted, or oversized ranges
   */
  _validateRange(from, to) {
    const fromMs = new Date(from).getTime();
    const toMs = new Date(to).getTime();
    // Invalid dates produce NaN, and every NaN comparison is false — without
    // this explicit check a bad timestamp would silently pass validation.
    if (Number.isNaN(fromMs) || Number.isNaN(toMs)) {
      throw new RangeError(`Invalid ISO 8601 date in range: from=${from} to=${to}`);
    }
    const diffMs = toMs - fromMs;
    if (diffMs < 0) {
      throw new RangeError(`'from' must be before 'to'`);
    }
    const diffDays = diffMs / (1000 * 60 * 60 * 24);
    if (diffDays > MAX_RANGE_DAYS) {
      throw new RangeError(
        `Date range ${diffDays.toFixed(1)} days exceeds the 90-day maximum per §2.2. ` +
        `Split your query into multiple requests.`
      );
    }
  }

  /**
   * Client-side rate guard — warns (does not block) if approaching 60 req/hour.
   */
  _trackRequest() {
    const now = Date.now();
    const hourAgo = now - 3_600_000;
    this._reqLog = this._reqLog.filter((t) => t > hourAgo);
    this._reqLog.push(now);
    if (this._reqLog.length >= RATE_LIMIT_RPH - 5) {
      console.warn(
        `[crawl-manifest-client] Approaching rate limit: ` +
        `${this._reqLog.length}/${RATE_LIMIT_RPH} requests in last hour`
      );
    }
  }

  /**
   * Fetches all crawl manifest records for a domain, handling all pagination.
   * Yields individual CrawlManifestUrl objects.
   *
   * @param {object} params
   * @param {string} params.domain
   * @param {string} params.from ISO 8601 — start of query window
   * @param {string} params.to ISO 8601 — end of query window (max 90d from 'from')
   * @param {string[]} [params.purpose] Filter by purpose values
   * @yields {CrawlManifestUrl}
   */
  async *fetchAll({ domain, from, to, purpose = [] }) {
    this._validateRange(from, to); // throws before any network call
    let cursor = null;
    do {
      const page = await this._fetchPage({ domain, from, to, purpose, cursor });
      for (const url of page.urls) yield url;
      cursor = page.next_cursor || null;
    } while (cursor);
  }

  /**
   * Fetch a single page of results (internal). Serves fresh cache hits,
   * and retries up to MAX_429_RETRIES times on HTTP 429, honouring
   * X-RateLimit-Reset (§2.2).
   * @param {number} [attempt] internal 429-retry counter
   */
  async _fetchPage({ domain, from, to, purpose, cursor }, attempt = 0) {
    // Key includes the base URL so clients for different providers never
    // collide on identical domain/date queries.
    const cacheKey = `${this.baseUrl}|${domain}|${from}|${to}|${purpose.join(',')}|${cursor}`;
    const hit = this._cache.get(cacheKey);
    if (hit && hit.expires > Date.now()) return hit.data;
    this._cache.delete(cacheKey); // drop stale entry, if any

    this._trackRequest();

    const params = { domain, from, to, format: 'json' };
    if (purpose.length) params.purpose = purpose.join(',');
    if (cursor) params.cursor = cursor;

    let response;
    try {
      response = await axios.get(this.baseUrl, {
        params,
        headers: { Authorization: `Bearer ${this.apiKey}` },
        timeout: 15_000,
      });
    } catch (err) {
      if (err.response?.status === 429) {
        if (attempt >= MAX_429_RETRIES) throw err; // give up, surface the 429
        // Respect X-RateLimit-Reset (§2.2). Clamp to [1s, 1h] so a missing
        // or garbage header cannot produce a 0 ms busy-loop (NaN * 1000)
        // or an unbounded stall.
        const raw = Number.parseInt(err.response.headers['x-ratelimit-reset'] ?? '60', 10);
        const resetIn = Number.isFinite(raw) ? Math.min(Math.max(raw, 1), 3600) : 60;
        console.warn(`[crawl-manifest-client] Rate limited. Waiting ${resetIn}s...`);
        await new Promise((resolve) => setTimeout(resolve, resetIn * 1000));
        return this._fetchPage({ domain, from, to, purpose, cursor }, attempt + 1);
      }
      throw err;
    }

    const data = response.data;
    this._cache.set(cacheKey, { expires: Date.now() + CACHE_TTL_MS, data });
    return data;
  }
}
|
|
126
|
+
|
|
127
|
+
module.exports = { CrawlManifestClient, MAX_RANGE_DAYS };
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Content hash utility — §2.3.
|
|
3
|
+
|
|
4
|
+
Spec: SHA-256 of UTF-8 normalized body text (HTML stripped, whitespace collapsed).
|
|
5
|
+
Used by AI providers when building crawl manifest entries.
|
|
6
|
+
"""
|
|
7
|
+
import hashlib
|
|
8
|
+
import re
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def strip_html(html: str) -> str:
    """Remove all HTML tags, scripts, and style blocks.

    ``<script>``/``<style>`` blocks are dropped wholesale (their text is
    not body content); every other tag is replaced by a single space so
    adjacent words do not merge.
    """
    without_scripts = re.sub(r'<script[\s\S]*?<\/script>', ' ', html, flags=re.IGNORECASE)
    without_styles = re.sub(r'<style[\s\S]*?<\/style>', ' ', without_scripts, flags=re.IGNORECASE)
    return re.sub(r'<[^>]+>', ' ', without_styles)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def collapse_whitespace(text: str) -> str:
    """Collapse all whitespace sequences to a single space and trim.

    NBSP (U+00A0) and zero-width space (U+200B) are explicitly included
    so visually-identical markup normalises to the same text.
    """
    squeezed = re.sub(r'[\s\u00A0\u200B]+', ' ', text)
    return squeezed.strip()
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def compute_content_hash(raw_html: str) -> str:
    """
    Compute the AIACTA content hash for a crawled page (§2.3).

    The hash is SHA-256 over the UTF-8 bytes of the normalised body text
    (HTML stripped, whitespace collapsed).

    Returns a string of the form "sha256:<hex>".
    """
    normalised = collapse_whitespace(strip_html(raw_html))
    return 'sha256:' + hashlib.sha256(normalised.encode('utf-8')).hexdigest()
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def verify_content_hash(raw_html: str, stored_hash: str) -> bool:
    """Return True if the current page content matches the stored hash.

    Publishers use this to detect content drift after a recorded crawl.
    """
    current = compute_content_hash(raw_html)
    return current == stored_hash
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"""
|
|
2
|
+
AIACTA Crawl Manifest Client — Python
|
|
3
|
+
Queries GET /crawl-manifest/v1 with pagination, rate-limit backoff, and caching (§2.2).
|
|
4
|
+
"""
|
|
5
|
+
import time
|
|
6
|
+
from typing import Iterator
|
|
7
|
+
import requests
|
|
8
|
+
|
|
9
|
+
class CrawlManifestClient:
    """Client for GET /crawl-manifest/v1 (§2.2).

    Handles cursor pagination, client-side 90-day range validation
    (matching the Node client), bounded 429 backoff honouring
    X-RateLimit-Reset, and a per-instance response cache with a 1-hour
    TTL (the TTL the README documents).
    """

    MAX_RANGE_DAYS = 90       # §2.2: max date range per request
    CACHE_TTL_SECONDS = 3600  # cache responses for 1 hour (see README)
    MAX_429_RETRIES = 5       # bound 429 retries instead of looping forever

    def __init__(self, provider: str, api_key: str, base_url: str = None):
        self.base_url = base_url or f"https://api.{provider}.com/crawl-manifest/v1"
        self.api_key = api_key
        self._cache = {}  # cache_key -> (expires_at_epoch_seconds, data)

    def _validate_range(self, from_dt: str, to_dt: str) -> None:
        """Raise ValueError for unparseable, inverted, or >90-day ranges (§2.2)."""
        from datetime import datetime  # local import: module-level deps unchanged

        def parse(ts: str):
            # fromisoformat() on Python < 3.11 rejects a trailing 'Z'.
            return datetime.fromisoformat(ts.replace('Z', '+00:00'))

        start, end = parse(from_dt), parse(to_dt)  # raises ValueError if malformed
        span_days = (end - start).total_seconds() / 86400
        if span_days < 0:
            raise ValueError("'from_dt' must be before 'to_dt'")
        if span_days > self.MAX_RANGE_DAYS:
            raise ValueError(
                f"Date range {span_days:.1f} days exceeds the 90-day maximum per §2.2. "
                "Split your query into multiple requests."
            )

    def fetch_all(self, domain: str, from_dt: str, to_dt: str, purpose: list[str] = None) -> Iterator[dict]:
        """Generator — yields individual URL records, handles all pagination.

        Raises ValueError (on first iteration) before any API call if the
        date range is invalid.
        """
        self._validate_range(from_dt, to_dt)
        cursor = None
        while True:
            page = self._fetch_page(domain, from_dt, to_dt, purpose or [], cursor)
            yield from page.get("urls", [])
            cursor = page.get("next_cursor")
            if not cursor:
                break

    def _fetch_page(self, domain, from_dt, to_dt, purpose, cursor):
        """Fetch one page, with TTL caching and bounded 429 backoff."""
        params = {"domain": domain, "from": from_dt, "to": to_dt, "format": "json"}
        if purpose: params["purpose"] = ",".join(purpose)
        if cursor: params["cursor"] = cursor

        # Key includes the base URL so one process talking to several
        # providers never serves one provider's pages for another.
        cache_key = self.base_url + "|" + str(sorted(params.items()))
        hit = self._cache.get(cache_key)
        if hit is not None and hit[0] > time.time():
            return hit[1]
        self._cache.pop(cache_key, None)  # drop stale entry, if any

        for attempt in range(self.MAX_429_RETRIES + 1):
            resp = requests.get(self.base_url, params=params,
                                headers={"Authorization": f"Bearer {self.api_key}"}, timeout=15)
            if resp.status_code == 429 and attempt < self.MAX_429_RETRIES:
                # Respect X-RateLimit-Reset (§2.2); clamp to [1s, 1h] so a
                # missing or non-numeric header cannot crash or stall forever.
                try:
                    reset = int(resp.headers.get("X-RateLimit-Reset", 60))
                except (TypeError, ValueError):
                    reset = 60
                time.sleep(min(max(reset, 1), 3600))
                continue
            resp.raise_for_status()  # final 429 surfaces here as HTTPError
            data = resp.json()
            self._cache[cache_key] = (time.time() + self.CACHE_TTL_SECONDS, data)
            return data
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
// Integration-style test for the Node client: nock intercepts the HTTP
// request, so no real network traffic occurs.
const nock = require('nock');
const { CrawlManifestClient } = require('../src/node/index');

const BASE = 'https://api.testprovider.com';
// Minimal single-page manifest fixture (next_cursor: null => no further pages,
// so fetchAll should issue exactly one request).
const FIXTURE = {
  provider: 'testprovider', domain: 'example.com', schema_version: '1.0',
  period: { from: '2026-01-01T00:00:00Z', to: '2026-03-01T00:00:00Z' },
  total_crawled_urls: 1, next_cursor: null,
  urls: [{ url: 'https://example.com/article', last_crawled: '2026-02-01T00:00:00Z',
    crawl_count_30d: 2, purpose: ['rag'], http_status_at_crawl: 200,
    content_hash: 'sha256:abc123' }],
};

test('fetches a single page and yields URLs', async () => {
  // query(true) matches any query string; baseUrl points the client at the mock.
  nock(BASE).get('/crawl-manifest/v1').query(true).reply(200, FIXTURE);
  const client = new CrawlManifestClient({ provider: 'testprovider', apiKey: 'key', baseUrl: `${BASE}/crawl-manifest/v1` });
  const results = [];
  // Drain the async generator; the fixture's date range is 59 days, within the 90-day cap.
  for await (const url of client.fetchAll({ domain: 'example.com', from: '2026-01-01T00:00:00Z', to: '2026-03-01T00:00:00Z' })) {
    results.push(url);
  }
  expect(results).toHaveLength(1);
  expect(results[0].url).toBe('https://example.com/article');
});
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
// Unit tests for the content-hash utility (§2.3): HTML stripping,
// whitespace normalisation, and the "sha256:<hex>" digest format.
const { computeContentHash, verifyContentHash, stripHtml, collapseWhitespace } = require('../src/node/content-hash');

test('strips HTML tags', () => {
  // Tags are replaced by spaces, so words stay separated.
  expect(stripHtml('<p>Hello <b>world</b></p>')).toMatch(/Hello\s+world/);
});

test('removes script and style blocks entirely', () => {
  const html = '<html><head><style>.a{color:red}</style><script>alert(1)</script></head><body>Text</body></html>';
  const text = stripHtml(html);
  expect(text).not.toContain('color:red');
  expect(text).not.toContain('alert(1)');
  expect(text).toContain('Text');
});

test('collapses whitespace', () => {
  expect(collapseWhitespace(' hello world ')).toBe('hello world');
  expect(collapseWhitespace('line1\n\n\nline2')).toBe('line1 line2');
});

test('computeContentHash returns sha256: prefix', () => {
  const hash = computeContentHash('<p>Hello world</p>');
  expect(hash).toMatch(/^sha256:[a-f0-9]{64}$/);
});

test('same content produces same hash', () => {
  const html = '<html><body><p>Consistent content</p></body></html>';
  expect(computeContentHash(html)).toBe(computeContentHash(html));
});

test('different content produces different hash', () => {
  expect(computeContentHash('<p>Hello</p>')).not.toBe(computeContentHash('<p>World</p>'));
});

test('whitespace differences in source do not change hash', () => {
  // NOTE(review): both inputs read identically here — presumably the original
  // fixture varied interior whitespace; confirm against the repository source.
  const h1 = computeContentHash('<p>Hello world</p>');
  const h2 = computeContentHash('<p>Hello world</p>');
  expect(h1).toBe(h2); // collapsed to same normalised text
});

test('verifyContentHash passes for matching content', () => {
  const html = '<article>Some article text</article>';
  const hash = computeContentHash(html);
  expect(verifyContentHash(html, hash)).toBe(true);
});

test('verifyContentHash fails for modified content', () => {
  const hash = computeContentHash('<article>Original</article>');
  expect(verifyContentHash('<article>Modified</article>', hash)).toBe(false);
});
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""Unit tests for the Python crawl manifest client."""
|
|
2
|
+
import unittest
|
|
3
|
+
from unittest.mock import patch, MagicMock
|
|
4
|
+
import sys, os
|
|
5
|
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '../src/python'))
|
|
6
|
+
from crawl_manifest_client import CrawlManifestClient
|
|
7
|
+
|
|
8
|
+
FIXTURE = {
|
|
9
|
+
'provider': 'mock', 'domain': 'example.com', 'schema_version': '1.0',
|
|
10
|
+
'period': {'from': '2026-01-01T00:00:00Z', 'to': '2026-03-01T00:00:00Z'},
|
|
11
|
+
'total_crawled_urls': 1, 'next_cursor': None,
|
|
12
|
+
'urls': [{'url': 'https://example.com/article', 'last_crawled': '2026-02-01T00:00:00Z',
|
|
13
|
+
'crawl_count_30d': 2, 'purpose': ['rag'], 'http_status_at_crawl': 200,
|
|
14
|
+
'content_hash': 'sha256:abc123'}],
|
|
15
|
+
}
|
|
16
|
+
|
|
17
|
+
class TestCrawlManifestClient(unittest.TestCase):
|
|
18
|
+
@patch('crawl_manifest_client.requests.get')
|
|
19
|
+
def test_fetch_all_yields_urls(self, mock_get):
|
|
20
|
+
mock_resp = MagicMock()
|
|
21
|
+
mock_resp.status_code = 200
|
|
22
|
+
mock_resp.json.return_value = FIXTURE
|
|
23
|
+
mock_resp.raise_for_status = MagicMock()
|
|
24
|
+
mock_get.return_value = mock_resp
|
|
25
|
+
|
|
26
|
+
client = CrawlManifestClient(provider='mock', api_key='key')
|
|
27
|
+
results = list(client.fetch_all('example.com', '2026-01-01T00:00:00Z', '2026-03-01T00:00:00Z'))
|
|
28
|
+
self.assertEqual(len(results), 1)
|
|
29
|
+
self.assertEqual(results[0]['url'], 'https://example.com/article')
|
|
30
|
+
|
|
31
|
+
@patch('crawl_manifest_client.requests.get')
|
|
32
|
+
def test_caches_responses(self, mock_get):
|
|
33
|
+
mock_resp = MagicMock()
|
|
34
|
+
mock_resp.status_code = 200
|
|
35
|
+
mock_resp.json.return_value = FIXTURE
|
|
36
|
+
mock_resp.raise_for_status = MagicMock()
|
|
37
|
+
mock_get.return_value = mock_resp
|
|
38
|
+
|
|
39
|
+
client = CrawlManifestClient(provider='mock', api_key='key')
|
|
40
|
+
list(client.fetch_all('example.com', '2026-01-01T00:00:00Z', '2026-03-01T00:00:00Z'))
|
|
41
|
+
list(client.fetch_all('example.com', '2026-01-01T00:00:00Z', '2026-03-01T00:00:00Z'))
|
|
42
|
+
# Should only call the API once due to caching
|
|
43
|
+
self.assertEqual(mock_get.call_count, 1)
|
|
44
|
+
|
|
45
|
+
if __name__ == '__main__':
|
|
46
|
+
unittest.main()
|