@hatk/hatk 0.0.1-alpha.1 → 0.0.1-alpha.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/backfill.d.ts +59 -0
- package/dist/backfill.d.ts.map +1 -1
- package/dist/backfill.js +135 -29
- package/dist/car.d.ts +59 -1
- package/dist/car.d.ts.map +1 -1
- package/dist/car.js +179 -7
- package/dist/cbor.d.ts +37 -0
- package/dist/cbor.d.ts.map +1 -1
- package/dist/cbor.js +36 -3
- package/dist/cid.d.ts +37 -0
- package/dist/cid.d.ts.map +1 -1
- package/dist/cid.js +38 -3
- package/dist/cli.js +62 -5
- package/dist/config.js +1 -1
- package/dist/db.d.ts +1 -0
- package/dist/db.d.ts.map +1 -1
- package/dist/db.js +4 -0
- package/dist/indexer.d.ts +1 -0
- package/dist/indexer.d.ts.map +1 -1
- package/dist/indexer.js +14 -2
- package/dist/main.js +1 -0
- package/dist/mst.d.ts +3 -1
- package/dist/mst.d.ts.map +1 -1
- package/dist/mst.js +6 -8
- package/dist/server.d.ts.map +1 -1
- package/dist/server.js +49 -6
- package/dist/vite-plugin.d.ts.map +1 -1
- package/dist/vite-plugin.js +2 -1
- package/package.json +7 -2
- package/public/robots.txt +2 -0
package/dist/backfill.d.ts
CHANGED
|
@@ -1,11 +1,70 @@
|
|
|
1
1
|
import type { BackfillConfig } from './config.ts';
|
|
2
|
+
/** Options passed to {@link runBackfill}. */
|
|
2
3
|
interface BackfillOpts {
|
|
4
|
+
/** Base URL of the relay or PDS to enumerate repos from (e.g. `wss://bsky.network`). */
|
|
3
5
|
pdsUrl: string;
|
|
6
|
+
/** PLC directory URL used to resolve `did:plc` identifiers (e.g. `https://plc.directory`). */
|
|
4
7
|
plcUrl: string;
|
|
8
|
+
/** AT Protocol collection NSIDs to index (e.g. `app.bsky.feed.post`). */
|
|
5
9
|
collections: Set<string>;
|
|
10
|
+
/** Backfill behavior settings from `config.yaml`. */
|
|
6
11
|
config: BackfillConfig;
|
|
7
12
|
}
|
|
13
|
+
/**
|
|
14
|
+
* Downloads and indexes a single user's repo via `com.atproto.sync.getRepo`.
|
|
15
|
+
*
|
|
16
|
+
* The full flow:
|
|
17
|
+
* 1. Resolve the DID to find the user's PDS endpoint
|
|
18
|
+
* 2. Fetch the repo as a CAR file from the PDS
|
|
19
|
+
* 3. Parse the CAR, decode the commit, and walk the MST (Merkle Search Tree)
|
|
20
|
+
* 4. On a full import (no `since` rev), delete any existing records for this DID so deletions are reflected
|
|
21
|
+
* 5. Bulk-insert all records matching the target collections
|
|
22
|
+
*
|
|
23
|
+
* On failure, applies exponential backoff retry logic. HTTP 4xx errors are
|
|
24
|
+
* treated as permanent failures (repo doesn't exist or is deactivated) and
|
|
25
|
+
* are not retried.
|
|
26
|
+
*
|
|
27
|
+
* @param did - The DID of the repo to backfill (e.g. `did:plc:abc123`)
|
|
28
|
+
* @param collections - Collection NSIDs to index; records in other collections are skipped
|
|
29
|
+
* @param fetchTimeout - Maximum seconds to wait for the CAR download before aborting
|
|
30
|
+
* @returns The number of records successfully indexed
|
|
31
|
+
*
|
|
32
|
+
* @example
|
|
33
|
+
* ```ts
|
|
34
|
+
* const count = await backfillRepo('did:plc:abc123', new Set(['app.bsky.feed.post']), 30)
|
|
35
|
+
* console.log(`Indexed ${count} records`)
|
|
36
|
+
* ```
|
|
37
|
+
*/
|
|
8
38
|
export declare function backfillRepo(did: string, collections: Set<string>, fetchTimeout: number): Promise<number>;
|
|
39
|
+
/**
|
|
40
|
+
* Orchestrates a full backfill run: enumerate repos, filter to pending, download, and index.
|
|
41
|
+
*
|
|
42
|
+
* Operates in one of three modes based on config:
|
|
43
|
+
* - **Pinned repos** — backfill only the DIDs listed in `config.repos`
|
|
44
|
+
* - **Full network** — enumerate every active repo on the relay via `listRepos`
|
|
45
|
+
* - **Collection signal** (default) — use `listReposByCollection` to discover repos that
|
|
46
|
+
* contain records in the configured signal collections, falling back to `listRepos`
|
|
47
|
+
* if the relay doesn't support collection-scoped enumeration
|
|
48
|
+
*
|
|
49
|
+
* After the initial pass, failed repos are retried with exponential backoff
|
|
50
|
+
* (up to `config.maxRetries` attempts). The run emits structured log events for
|
|
51
|
+
* monitoring via the `backfill.run` and `backfill.retry_round` event types.
|
|
52
|
+
*
|
|
53
|
+
* @example
|
|
54
|
+
* ```ts
|
|
55
|
+
* await runBackfill({
|
|
56
|
+
* pdsUrl: 'wss://bsky.network',
|
|
57
|
+
* plcUrl: 'https://plc.directory',
|
|
58
|
+
* collections: new Set(['xyz.statusphere.status']),
|
|
59
|
+
* config: {
|
|
60
|
+
* fullNetwork: false,
|
|
61
|
+
* parallelism: 10,
|
|
62
|
+
* fetchTimeout: 30,
|
|
63
|
+
* maxRetries: 5,
|
|
64
|
+
* },
|
|
65
|
+
* })
|
|
66
|
+
* ```
|
|
67
|
+
*/
|
|
9
68
|
export declare function runBackfill(opts: BackfillOpts): Promise<void>;
|
|
10
69
|
export {};
|
|
11
70
|
//# sourceMappingURL=backfill.d.ts.map
|
package/dist/backfill.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"backfill.d.ts","sourceRoot":"","sources":["../src/backfill.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"backfill.d.ts","sourceRoot":"","sources":["../src/backfill.ts"],"names":[],"mappings":"AAiBA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAA;AAEjD,6CAA6C;AAC7C,UAAU,YAAY;IACpB,wFAAwF;IACxF,MAAM,EAAE,MAAM,CAAA;IACd,8FAA8F;IAC9F,MAAM,EAAE,MAAM,CAAA;IACd,yEAAyE;IACzE,WAAW,EAAE,GAAG,CAAC,MAAM,CAAC,CAAA;IACxB,qDAAqD;IACrD,MAAM,EAAE,cAAc,CAAA;CACvB;AAuGD;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AACH,wBAAsB,YAAY,CAAC,GAAG,EAAE,MAAM,EAAE,WAAW,EAAE,GAAG,CAAC,MAAM,CAAC,EAAE,YAAY,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAwI/G;AAgCD;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA4BG;AACH,wBAAsB,WAAW,CAAC,IAAI,EAAE,YAAY,GAAG,OAAO,CAAC,IAAI,CAAC,CAiInE"}
|
package/dist/backfill.js
CHANGED
|
@@ -1,10 +1,24 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { parseCarStream } from "./car.js";
|
|
2
2
|
import { cborDecode } from "./cbor.js";
|
|
3
3
|
import { walkMst } from "./mst.js";
|
|
4
|
-
import { setRepoStatus, getRepoStatus, getRepoRetryInfo, listRetryEligibleRepos, listPendingRepos, querySQL, runSQL, getSchema, bulkInsertRecords, } from "./db.js";
|
|
4
|
+
import { setRepoStatus, getRepoStatus, getRepoRev, getRepoRetryInfo, listRetryEligibleRepos, listPendingRepos, querySQL, runSQL, getSchema, bulkInsertRecords, } from "./db.js";
|
|
5
5
|
import { emit, timer } from "./logger.js";
|
|
6
|
+
/** In-memory cache of DID → PDS resolution results to avoid redundant lookups. */
|
|
6
7
|
const pdsCache = new Map();
|
|
7
8
|
let plcUrl;
|
|
9
|
+
/**
|
|
10
|
+
* Resolves a DID to its PDS endpoint and handle by fetching the DID document.
|
|
11
|
+
*
|
|
12
|
+
* Supports both `did:web` (fetches `/.well-known/did.json`) and `did:plc`
|
|
13
|
+
* (fetches from the PLC directory). Results are cached for the lifetime of the process.
|
|
14
|
+
*
|
|
15
|
+
* @example
|
|
16
|
+
* ```ts
|
|
17
|
+
* const { pds, handle } = await resolvePds('did:plc:abc123')
|
|
18
|
+
* // pds = "https://puffball.us-east.host.bsky.network"
|
|
19
|
+
* // handle = "alice.bsky.social"
|
|
20
|
+
* ```
|
|
21
|
+
*/
|
|
8
22
|
async function resolvePds(did) {
|
|
9
23
|
const cached = pdsCache.get(did);
|
|
10
24
|
if (cached)
|
|
@@ -33,7 +47,10 @@ async function resolvePds(did) {
|
|
|
33
47
|
pdsCache.set(did, result);
|
|
34
48
|
return result;
|
|
35
49
|
}
|
|
36
|
-
|
|
50
|
+
/**
|
|
51
|
+
* Paginates through all active repos on a relay/PDS using `com.atproto.sync.listRepos`.
|
|
52
|
+
* Yields `{ did, rev }` for each active repo. Skips deactivated repos.
|
|
53
|
+
*/
|
|
37
54
|
async function* listRepos(pdsUrl) {
|
|
38
55
|
let cursor;
|
|
39
56
|
while (true) {
|
|
@@ -53,6 +70,13 @@ async function* listRepos(pdsUrl) {
|
|
|
53
70
|
cursor = data.cursor;
|
|
54
71
|
}
|
|
55
72
|
}
|
|
73
|
+
/**
|
|
74
|
+
* Paginates through repos that contain records in a specific collection using
|
|
75
|
+
* `com.atproto.sync.listReposByCollection`. More efficient than {@link listRepos}
|
|
76
|
+
* when only a few collections are needed, since the relay can filter server-side.
|
|
77
|
+
*
|
|
78
|
+
* Not all relays support this endpoint — callers should fall back to {@link listRepos}.
|
|
79
|
+
*/
|
|
56
80
|
async function* listReposByCollection(pdsUrl, collection) {
|
|
57
81
|
let cursor;
|
|
58
82
|
while (true) {
|
|
@@ -71,7 +95,31 @@ async function* listReposByCollection(pdsUrl, collection) {
|
|
|
71
95
|
cursor = data.cursor;
|
|
72
96
|
}
|
|
73
97
|
}
|
|
74
|
-
|
|
98
|
+
/**
|
|
99
|
+
* Downloads and indexes a single user's repo via `com.atproto.sync.getRepo`.
|
|
100
|
+
*
|
|
101
|
+
* The full flow:
|
|
102
|
+
* 1. Resolve the DID to find the user's PDS endpoint
|
|
103
|
+
* 2. Fetch the repo as a CAR file from the PDS
|
|
104
|
+
* 3. Parse the CAR, decode the commit, and walk the MST (Merkle Search Tree)
|
|
105
|
+
* 4. On a full import (no `since` rev), delete any existing records for this DID so deletions are reflected
|
|
106
|
+
* 5. Bulk-insert all records matching the target collections
|
|
107
|
+
*
|
|
108
|
+
* On failure, applies exponential backoff retry logic. HTTP 4xx errors are
|
|
109
|
+
* treated as permanent failures (repo doesn't exist or is deactivated) and
|
|
110
|
+
* are not retried.
|
|
111
|
+
*
|
|
112
|
+
* @param did - The DID of the repo to backfill (e.g. `did:plc:abc123`)
|
|
113
|
+
* @param collections - Collection NSIDs to index; records in other collections are skipped
|
|
114
|
+
* @param fetchTimeout - Maximum seconds to wait for the CAR download before aborting
|
|
115
|
+
* @returns The number of records successfully indexed
|
|
116
|
+
*
|
|
117
|
+
* @example
|
|
118
|
+
* ```ts
|
|
119
|
+
* const count = await backfillRepo('did:plc:abc123', new Set(['app.bsky.feed.post']), 30)
|
|
120
|
+
* console.log(`Indexed ${count} records`)
|
|
121
|
+
* ```
|
|
122
|
+
*/
|
|
75
123
|
export async function backfillRepo(did, collections, fetchTimeout) {
|
|
76
124
|
const elapsed = timer();
|
|
77
125
|
let count = 0;
|
|
@@ -80,6 +128,7 @@ export async function backfillRepo(did, collections, fetchTimeout) {
|
|
|
80
128
|
let error;
|
|
81
129
|
let resolvedPds;
|
|
82
130
|
let resolvedHandle = null;
|
|
131
|
+
let resolvedSince = null;
|
|
83
132
|
let retryCount;
|
|
84
133
|
let retryAfter;
|
|
85
134
|
const controller = new AbortController();
|
|
@@ -89,17 +138,23 @@ export async function backfillRepo(did, collections, fetchTimeout) {
|
|
|
89
138
|
resolvedPds = pdsUrl;
|
|
90
139
|
resolvedHandle = handle;
|
|
91
140
|
timeout = setTimeout(() => controller.abort(), fetchTimeout * 1000);
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
});
|
|
141
|
+
let lastRev = await getRepoRev(did);
|
|
142
|
+
const baseUrl = `${resolvedPds}/xrpc/com.atproto.sync.getRepo?did=${encodeURIComponent(did)}`;
|
|
143
|
+
let repoUrl = lastRev ? `${baseUrl}&since=${encodeURIComponent(lastRev)}` : baseUrl;
|
|
144
|
+
let res = await fetch(repoUrl, { signal: controller.signal });
|
|
145
|
+
// If the PDS rejected our `since` rev (compacted history), fall back to full import
|
|
146
|
+
if (res.status === 400 && lastRev) {
|
|
147
|
+
lastRev = null;
|
|
148
|
+
res = await fetch(baseUrl, { signal: controller.signal });
|
|
149
|
+
}
|
|
95
150
|
if (!res.ok) {
|
|
96
151
|
const httpErr = new Error(`getRepo failed for ${did}: ${res.status}`);
|
|
97
152
|
httpErr.httpStatus = res.status;
|
|
98
153
|
throw httpErr;
|
|
99
154
|
}
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
155
|
+
resolvedSince = lastRev;
|
|
156
|
+
const { roots, blocks, byteLength } = await parseCarStream(res.body);
|
|
157
|
+
carSizeBytes = byteLength;
|
|
103
158
|
// Decode commit to get MST root
|
|
104
159
|
const rootData = blocks.get(roots[0]);
|
|
105
160
|
if (!rootData)
|
|
@@ -107,7 +162,27 @@ export async function backfillRepo(did, collections, fetchTimeout) {
|
|
|
107
162
|
const { value: commit } = cborDecode(rootData);
|
|
108
163
|
// Walk MST to find all record paths
|
|
109
164
|
const entries = walkMst(blocks, commit.data.$link);
|
|
110
|
-
|
|
165
|
+
// Delete existing records for this DID before re-importing so deletions are reflected
|
|
166
|
+
// Only on full imports (no since) — diff CARs only contain changes
|
|
167
|
+
if (!lastRev) {
|
|
168
|
+
for (const col of collections) {
|
|
169
|
+
const schema = getSchema(col);
|
|
170
|
+
if (!schema)
|
|
171
|
+
continue;
|
|
172
|
+
await runSQL(`DELETE FROM ${schema.tableName} WHERE did = $1`, did);
|
|
173
|
+
for (const child of schema.children) {
|
|
174
|
+
await runSQL(`DELETE FROM ${child.tableName} WHERE parent_did = $1`, did);
|
|
175
|
+
}
|
|
176
|
+
for (const union of schema.unions) {
|
|
177
|
+
for (const branch of union.branches) {
|
|
178
|
+
await runSQL(`DELETE FROM ${branch.tableName} WHERE parent_did = $1`, did);
|
|
179
|
+
}
|
|
180
|
+
}
|
|
181
|
+
}
|
|
182
|
+
}
|
|
183
|
+
// Insert records in chunks to limit memory usage
|
|
184
|
+
const CHUNK_SIZE = 1000;
|
|
185
|
+
let chunk = [];
|
|
111
186
|
for (const entry of entries) {
|
|
112
187
|
const collection = entry.path.split('/')[0];
|
|
113
188
|
if (!collections.has(collection))
|
|
@@ -115,13 +190,18 @@ export async function backfillRepo(did, collections, fetchTimeout) {
|
|
|
115
190
|
const blockData = blocks.get(entry.cid);
|
|
116
191
|
if (!blockData)
|
|
117
192
|
continue;
|
|
193
|
+
blocks.delete(entry.cid); // free block data as we go
|
|
118
194
|
try {
|
|
119
195
|
const { value: record } = cborDecode(blockData);
|
|
120
196
|
if (!record?.$type)
|
|
121
197
|
continue;
|
|
122
198
|
const rkey = entry.path.split('/').slice(1).join('/');
|
|
123
199
|
const uri = `at://${did}/${collection}/${rkey}`;
|
|
124
|
-
|
|
200
|
+
chunk.push({ collection, uri, cid: entry.cid, did, record });
|
|
201
|
+
if (chunk.length >= CHUNK_SIZE) {
|
|
202
|
+
count += await bulkInsertRecords(chunk);
|
|
203
|
+
chunk = [];
|
|
204
|
+
}
|
|
125
205
|
}
|
|
126
206
|
catch (recordErr) {
|
|
127
207
|
emit('backfill', 'record_error', {
|
|
@@ -132,22 +212,9 @@ export async function backfillRepo(did, collections, fetchTimeout) {
|
|
|
132
212
|
});
|
|
133
213
|
}
|
|
134
214
|
}
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
const schema = getSchema(col);
|
|
138
|
-
if (!schema)
|
|
139
|
-
continue;
|
|
140
|
-
await runSQL(`DELETE FROM ${schema.tableName} WHERE did = $1`, did);
|
|
141
|
-
for (const child of schema.children) {
|
|
142
|
-
await runSQL(`DELETE FROM ${child.tableName} WHERE parent_did = $1`, did);
|
|
143
|
-
}
|
|
144
|
-
for (const union of schema.unions) {
|
|
145
|
-
for (const branch of union.branches) {
|
|
146
|
-
await runSQL(`DELETE FROM ${branch.tableName} WHERE parent_did = $1`, did);
|
|
147
|
-
}
|
|
148
|
-
}
|
|
215
|
+
if (chunk.length > 0) {
|
|
216
|
+
count += await bulkInsertRecords(chunk);
|
|
149
217
|
}
|
|
150
|
-
count = await bulkInsertRecords(bulk);
|
|
151
218
|
await setRepoStatus(did, 'active', commit.rev, { handle });
|
|
152
219
|
return count;
|
|
153
220
|
}
|
|
@@ -179,13 +246,24 @@ export async function backfillRepo(did, collections, fetchTimeout) {
|
|
|
179
246
|
error,
|
|
180
247
|
pds_url: resolvedPds,
|
|
181
248
|
car_size_bytes: carSizeBytes,
|
|
249
|
+
import_mode: carSizeBytes !== undefined ? (resolvedSince ? 'diff' : 'full') : undefined,
|
|
250
|
+
since_rev: resolvedSince,
|
|
182
251
|
retry_count: retryCount,
|
|
183
252
|
retry_after: retryAfter,
|
|
184
253
|
permanent_failure: retryCount === 999 ? true : undefined,
|
|
185
254
|
});
|
|
186
255
|
}
|
|
187
256
|
}
|
|
188
|
-
|
|
257
|
+
/**
|
|
258
|
+
* Processes items concurrently with a fixed number of workers.
|
|
259
|
+
* Workers pull from a shared index so the pool stays saturated even when
|
|
260
|
+
* individual items complete at different speeds. Errors from `fn` are
|
|
261
|
+
* swallowed (they're expected to be captured via structured logging).
|
|
262
|
+
*
|
|
263
|
+
* @param items - The work items to process
|
|
264
|
+
* @param parallelism - Maximum number of concurrent workers
|
|
265
|
+
* @param fn - Async function to run for each item
|
|
266
|
+
*/
|
|
189
267
|
async function runWorkerPool(items, parallelism, fn) {
|
|
190
268
|
let index = 0;
|
|
191
269
|
async function worker() {
|
|
@@ -202,7 +280,35 @@ async function runWorkerPool(items, parallelism, fn) {
|
|
|
202
280
|
const workers = Array.from({ length: Math.min(parallelism, items.length) }, () => worker());
|
|
203
281
|
await Promise.all(workers);
|
|
204
282
|
}
|
|
205
|
-
|
|
283
|
+
/**
|
|
284
|
+
* Orchestrates a full backfill run: enumerate repos, filter to pending, download, and index.
|
|
285
|
+
*
|
|
286
|
+
* Operates in one of three modes based on config:
|
|
287
|
+
* - **Pinned repos** — backfill only the DIDs listed in `config.repos`
|
|
288
|
+
* - **Full network** — enumerate every active repo on the relay via `listRepos`
|
|
289
|
+
* - **Collection signal** (default) — use `listReposByCollection` to discover repos that
|
|
290
|
+
* contain records in the configured signal collections, falling back to `listRepos`
|
|
291
|
+
* if the relay doesn't support collection-scoped enumeration
|
|
292
|
+
*
|
|
293
|
+
* After the initial pass, failed repos are retried with exponential backoff
|
|
294
|
+
* (up to `config.maxRetries` attempts). The run emits structured log events for
|
|
295
|
+
* monitoring via the `backfill.run` and `backfill.retry_round` event types.
|
|
296
|
+
*
|
|
297
|
+
* @example
|
|
298
|
+
* ```ts
|
|
299
|
+
* await runBackfill({
|
|
300
|
+
* pdsUrl: 'wss://bsky.network',
|
|
301
|
+
* plcUrl: 'https://plc.directory',
|
|
302
|
+
* collections: new Set(['xyz.statusphere.status']),
|
|
303
|
+
* config: {
|
|
304
|
+
* fullNetwork: false,
|
|
305
|
+
* parallelism: 10,
|
|
306
|
+
* fetchTimeout: 30,
|
|
307
|
+
* maxRetries: 5,
|
|
308
|
+
* },
|
|
309
|
+
* })
|
|
310
|
+
* ```
|
|
311
|
+
*/
|
|
206
312
|
export async function runBackfill(opts) {
|
|
207
313
|
const { pdsUrl, collections, config } = opts;
|
|
208
314
|
plcUrl = opts.plcUrl;
|
package/dist/car.d.ts
CHANGED
|
@@ -1,5 +1,63 @@
|
|
|
1
|
-
|
|
1
|
+
/**
|
|
2
|
+
* CAR (Content Addressable aRchive) parser.
|
|
3
|
+
*
|
|
4
|
+
* CAR files bundle content-addressed blocks into a single binary container.
|
|
5
|
+
* They're used by the AT Protocol sync API (`com.atproto.sync.getRepo`) to
|
|
6
|
+
* deliver entire repos and by commit events to deliver individual changes.
|
|
7
|
+
*
|
|
8
|
+
* Format: `varint(headerLen) | CBOR(header) | block*`
|
|
9
|
+
* Each block: `varint(blockLen) | CID | data`
|
|
10
|
+
*
|
|
11
|
+
* @see https://ipld.io/specs/transport/car/carv1/
|
|
12
|
+
* @module
|
|
13
|
+
*/
|
|
14
|
+
/**
|
|
15
|
+
* A memory-efficient block map that stores byte offsets into the original CAR
|
|
16
|
+
* buffer instead of copying block data. Implements the same `get`/`delete`/`size`
|
|
17
|
+
* interface as `Map<string, Uint8Array>` so it can be used as a drop-in replacement.
|
|
18
|
+
*/
|
|
19
|
+
export declare class LazyBlockMap {
|
|
20
|
+
private offsets;
|
|
21
|
+
private carBytes;
|
|
22
|
+
constructor(carBytes: Uint8Array, offsets: Map<string, [number, number]>);
|
|
23
|
+
get(cid: string): Uint8Array | undefined;
|
|
24
|
+
delete(cid: string): boolean;
|
|
25
|
+
get size(): number;
|
|
26
|
+
[Symbol.iterator](): IterableIterator<[string, Uint8Array]>;
|
|
27
|
+
/** Release the underlying CAR buffer */
|
|
28
|
+
free(): void;
|
|
29
|
+
}
|
|
30
|
+
/**
|
|
31
|
+
* Parses a CARv1 stream incrementally from a `ReadableStream`.
|
|
32
|
+
*
|
|
33
|
+
* Instead of buffering the entire CAR into a single ArrayBuffer, this reads
|
|
34
|
+
* chunks from the stream and parses blocks as they arrive. Each block's data
|
|
35
|
+
* is `.slice()`d into its own small `Uint8Array`, allowing V8 to GC individual
|
|
36
|
+
* blocks as they're consumed during the MST walk.
|
|
37
|
+
*
|
|
38
|
+
* This is critical for backfill where multiple workers download 30-90MB CARs
|
|
39
|
+
* concurrently — buffered downloads cause OOMs because `ArrayBuffer` memory
|
|
40
|
+
* is "external" to V8's heap and not controlled by `--max-old-space-size`.
|
|
41
|
+
*
|
|
42
|
+
* @param body - The response body stream (e.g. `res.body` from `fetch()`)
|
|
43
|
+
* @returns `roots` — root CID strings; `blocks` — map of CID → block data; `byteLength` — total bytes read
|
|
44
|
+
*/
|
|
45
|
+
export declare function parseCarStream(body: ReadableStream<Uint8Array>): Promise<{
|
|
2
46
|
roots: string[];
|
|
3
47
|
blocks: Map<string, Uint8Array>;
|
|
48
|
+
byteLength: number;
|
|
49
|
+
}>;
|
|
50
|
+
/**
|
|
51
|
+
* Parses a CARv1 binary frame into its root CIDs and a lazy block map.
|
|
52
|
+
*
|
|
53
|
+
* The block map stores byte offsets into `carBytes` rather than copying data,
|
|
54
|
+
* reducing heap usage from O(total block bytes) to O(number of blocks * 16 bytes).
|
|
55
|
+
*
|
|
56
|
+
* @param carBytes - Raw CAR file bytes (e.g. from `getRepo` or a firehose commit)
|
|
57
|
+
* @returns `roots` — ordered list of root CID strings; `blocks` — lazy block map
|
|
58
|
+
*/
|
|
59
|
+
export declare function parseCarFrame(carBytes: Uint8Array): {
|
|
60
|
+
roots: string[];
|
|
61
|
+
blocks: LazyBlockMap;
|
|
4
62
|
};
|
|
5
63
|
//# sourceMappingURL=car.d.ts.map
|
package/dist/car.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"car.d.ts","sourceRoot":"","sources":["../src/car.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"car.d.ts","sourceRoot":"","sources":["../src/car.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;GAYG;AAuCH;;;;GAIG;AACH,qBAAa,YAAY;IACvB,OAAO,CAAC,OAAO,CAA+B;IAC9C,OAAO,CAAC,QAAQ,CAAmB;gBAEvB,QAAQ,EAAE,UAAU,EAAE,OAAO,EAAE,GAAG,CAAC,MAAM,EAAE,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IAKxE,GAAG,CAAC,GAAG,EAAE,MAAM,GAAG,UAAU,GAAG,SAAS;IAMxC,MAAM,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO;IAI5B,IAAI,IAAI,IAAI,MAAM,CAEjB;IAEA,CAAC,MAAM,CAAC,QAAQ,CAAC,IAAI,gBAAgB,CAAC,CAAC,MAAM,EAAE,UAAU,CAAC,CAAC;IAO5D,wCAAwC;IACxC,IAAI,IAAI,IAAI;CAIb;AAED;;;;;;;;;;;;;;GAcG;AACH,wBAAsB,cAAc,CAAC,IAAI,EAAE,cAAc,CAAC,UAAU,CAAC,GAAG,OAAO,CAAC;IAC9E,KAAK,EAAE,MAAM,EAAE,CAAA;IACf,MAAM,EAAE,GAAG,CAAC,MAAM,EAAE,UAAU,CAAC,CAAA;IAC/B,UAAU,EAAE,MAAM,CAAA;CACnB,CAAC,CAsGD;AAED;;;;;;;;GAQG;AACH,wBAAgB,aAAa,CAAC,QAAQ,EAAE,UAAU,GAAG;IACnD,KAAK,EAAE,MAAM,EAAE,CAAA;IACf,MAAM,EAAE,YAAY,CAAA;CACrB,CAiCA"}
|
package/dist/car.js
CHANGED
|
@@ -1,7 +1,26 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
1
|
+
/**
|
|
2
|
+
* CAR (Content Addressable aRchive) parser.
|
|
3
|
+
*
|
|
4
|
+
* CAR files bundle content-addressed blocks into a single binary container.
|
|
5
|
+
* They're used by the AT Protocol sync API (`com.atproto.sync.getRepo`) to
|
|
6
|
+
* deliver entire repos and by commit events to deliver individual changes.
|
|
7
|
+
*
|
|
8
|
+
* Format: `varint(headerLen) | CBOR(header) | block*`
|
|
9
|
+
* Each block: `varint(blockLen) | CID | data`
|
|
10
|
+
*
|
|
11
|
+
* @see https://ipld.io/specs/transport/car/carv1/
|
|
12
|
+
* @module
|
|
13
|
+
*/
|
|
3
14
|
import { cborDecode } from "./cbor.js";
|
|
4
15
|
import { cidToString, readVarint } from "./cid.js";
|
|
16
|
+
/**
|
|
17
|
+
* Parses a CID (Content Identifier) from raw bytes at the given offset.
|
|
18
|
+
*
|
|
19
|
+
* Handles both CIDv0 (bare SHA-256 multihash, starts with `0x12`) and
|
|
20
|
+
* CIDv1 (version + codec + multihash with varint-encoded lengths).
|
|
21
|
+
*
|
|
22
|
+
* @returns A tuple of `[cidBytes, nextOffset]`
|
|
23
|
+
*/
|
|
5
24
|
function parseCidFromBytes(bytes, offset) {
|
|
6
25
|
const firstByte = bytes[offset];
|
|
7
26
|
if (firstByte === 0x12) {
|
|
@@ -22,6 +41,160 @@ function parseCidFromBytes(bytes, offset) {
|
|
|
22
41
|
pos = afterDigestLen + digestLen;
|
|
23
42
|
return [bytes.slice(offset, pos), pos];
|
|
24
43
|
}
|
|
44
|
+
/**
|
|
45
|
+
* A memory-efficient block map that stores byte offsets into the original CAR
|
|
46
|
+
* buffer instead of copying block data. Implements the same `get`/`delete`/`size`
|
|
47
|
+
* interface as `Map<string, Uint8Array>` so it can be used as a drop-in replacement.
|
|
48
|
+
*/
|
|
49
|
+
export class LazyBlockMap {
|
|
50
|
+
offsets;
|
|
51
|
+
carBytes;
|
|
52
|
+
constructor(carBytes, offsets) {
|
|
53
|
+
this.carBytes = carBytes;
|
|
54
|
+
this.offsets = offsets;
|
|
55
|
+
}
|
|
56
|
+
get(cid) {
|
|
57
|
+
const range = this.offsets.get(cid);
|
|
58
|
+
if (!range || !this.carBytes)
|
|
59
|
+
return undefined;
|
|
60
|
+
return this.carBytes.subarray(range[0], range[1]);
|
|
61
|
+
}
|
|
62
|
+
delete(cid) {
|
|
63
|
+
return this.offsets.delete(cid);
|
|
64
|
+
}
|
|
65
|
+
get size() {
|
|
66
|
+
return this.offsets.size;
|
|
67
|
+
}
|
|
68
|
+
*[Symbol.iterator]() {
|
|
69
|
+
for (const [cid, range] of this.offsets) {
|
|
70
|
+
if (!this.carBytes)
|
|
71
|
+
return;
|
|
72
|
+
yield [cid, this.carBytes.subarray(range[0], range[1])];
|
|
73
|
+
}
|
|
74
|
+
}
|
|
75
|
+
/** Release the underlying CAR buffer */
|
|
76
|
+
free() {
|
|
77
|
+
this.carBytes = null;
|
|
78
|
+
this.offsets.clear();
|
|
79
|
+
}
|
|
80
|
+
}
|
|
81
|
+
/**
|
|
82
|
+
* Parses a CARv1 stream incrementally from a `ReadableStream`.
|
|
83
|
+
*
|
|
84
|
+
* Instead of buffering the entire CAR into a single ArrayBuffer, this reads
|
|
85
|
+
* chunks from the stream and parses blocks as they arrive. Each block's data
|
|
86
|
+
* is `.slice()`d into its own small `Uint8Array`, allowing V8 to GC individual
|
|
87
|
+
* blocks as they're consumed during the MST walk.
|
|
88
|
+
*
|
|
89
|
+
* This is critical for backfill where multiple workers download 30-90MB CARs
|
|
90
|
+
* concurrently — buffered downloads cause OOMs because `ArrayBuffer` memory
|
|
91
|
+
* is "external" to V8's heap and not controlled by `--max-old-space-size`.
|
|
92
|
+
*
|
|
93
|
+
* @param body - The response body stream (e.g. `res.body` from `fetch()`)
|
|
94
|
+
* @returns `roots` — root CID strings; `blocks` — map of CID → block data; `byteLength` — total bytes read
|
|
95
|
+
*/
|
|
96
|
+
export async function parseCarStream(body) {
|
|
97
|
+
const reader = body.getReader();
|
|
98
|
+
// Growable buffer with position tracking. We reuse a single allocation and
|
|
99
|
+
// compact (shift data to front) when the read position passes the midpoint,
|
|
100
|
+
// avoiding per-chunk allocations and subarray references that pin old memory.
|
|
101
|
+
let buf = new Uint8Array(64 * 1024);
|
|
102
|
+
let pos = 0; // read cursor
|
|
103
|
+
let len = 0; // bytes of valid data in buf
|
|
104
|
+
let byteLength = 0;
|
|
105
|
+
// Ensure at least `need` bytes are available at buf[pos..pos+need)
|
|
106
|
+
async function fill(need) {
|
|
107
|
+
while (len - pos < need) {
|
|
108
|
+
const { done, value } = await reader.read();
|
|
109
|
+
if (done)
|
|
110
|
+
return (len - pos) >= need;
|
|
111
|
+
byteLength += value.length;
|
|
112
|
+
// Compact: shift remaining data to front when read cursor passes midpoint
|
|
113
|
+
if (pos > 0 && pos > buf.length >>> 1) {
|
|
114
|
+
buf.copyWithin(0, pos, len);
|
|
115
|
+
len -= pos;
|
|
116
|
+
pos = 0;
|
|
117
|
+
}
|
|
118
|
+
// Grow if needed
|
|
119
|
+
const required = len + value.length;
|
|
120
|
+
if (required > buf.length) {
|
|
121
|
+
const newBuf = new Uint8Array(Math.max(required, buf.length * 2));
|
|
122
|
+
newBuf.set(buf.subarray(0, len));
|
|
123
|
+
buf = newBuf;
|
|
124
|
+
}
|
|
125
|
+
buf.set(value, len);
|
|
126
|
+
len += value.length;
|
|
127
|
+
}
|
|
128
|
+
return true;
|
|
129
|
+
}
|
|
130
|
+
function consume(n) {
|
|
131
|
+
pos += n;
|
|
132
|
+
}
|
|
133
|
+
// Read a varint starting at buf[pos]
|
|
134
|
+
function readVarintFromBuf() {
|
|
135
|
+
let value = 0;
|
|
136
|
+
let shift = 0;
|
|
137
|
+
let p = pos;
|
|
138
|
+
while (p < len) {
|
|
139
|
+
const byte = buf[p++];
|
|
140
|
+
value |= (byte & 0x7f) << shift;
|
|
141
|
+
if ((byte & 0x80) === 0)
|
|
142
|
+
return [value, p - pos];
|
|
143
|
+
shift += 7;
|
|
144
|
+
if (shift > 35)
|
|
145
|
+
throw new Error('Varint too long');
|
|
146
|
+
}
|
|
147
|
+
throw new Error('Unexpected end of varint');
|
|
148
|
+
}
|
|
149
|
+
// Parse header: varint(headerLen) + CBOR(header)
|
|
150
|
+
if (!(await fill(1)))
|
|
151
|
+
throw new Error('Empty CAR stream');
|
|
152
|
+
// Prefetch up to 10 bytes for the varint; readVarintFromBuf bounds to `len`
|
|
153
|
+
await fill(10);
|
|
154
|
+
const [headerLen, headerVarintSize] = readVarintFromBuf();
|
|
155
|
+
consume(headerVarintSize);
|
|
156
|
+
if (!(await fill(headerLen)))
|
|
157
|
+
throw new Error('Truncated CAR header');
|
|
158
|
+
// .slice() copies out of the reusable buffer
|
|
159
|
+
const headerSlice = buf.slice(pos, pos + headerLen);
|
|
160
|
+
const { value: header } = cborDecode(headerSlice);
|
|
161
|
+
consume(headerLen);
|
|
162
|
+
const roots = (header.roots || []).map((root) => root?.$link ?? cidToString(root));
|
|
163
|
+
// Parse blocks
|
|
164
|
+
const blocks = new Map();
|
|
165
|
+
while (true) {
|
|
166
|
+
if (!(await fill(1)))
|
|
167
|
+
break;
|
|
168
|
+
// Prefetch up to 10 bytes for the varint; readVarintFromBuf bounds to `len`
|
|
169
|
+
await fill(10);
|
|
170
|
+
const [blockLen, blockVarintSize] = readVarintFromBuf();
|
|
171
|
+
consume(blockVarintSize);
|
|
172
|
+
if (blockLen === 0)
|
|
173
|
+
break;
|
|
174
|
+
if (!(await fill(blockLen)))
|
|
175
|
+
throw new Error('Truncated CAR block');
|
|
176
|
+
const [cidBytes, afterCid] = parseCidFromBytes(buf, pos);
|
|
177
|
+
const cid = cidToString(cidBytes);
|
|
178
|
+
const cidLen = afterCid - pos;
|
|
179
|
+
// .slice() creates an independent copy — the buffer can be reused
|
|
180
|
+
const data = buf.slice(afterCid, afterCid + blockLen - cidLen);
|
|
181
|
+
blocks.set(cid, data);
|
|
182
|
+
consume(blockLen);
|
|
183
|
+
}
|
|
184
|
+
reader.releaseLock();
|
|
185
|
+
// Release the internal buffer
|
|
186
|
+
buf = null;
|
|
187
|
+
return { roots, blocks, byteLength };
|
|
188
|
+
}
|
|
189
|
+
/**
|
|
190
|
+
* Parses a CARv1 binary frame into its root CIDs and a lazy block map.
|
|
191
|
+
*
|
|
192
|
+
* The block map stores byte offsets into `carBytes` rather than copying data,
|
|
193
|
+
* reducing heap usage from O(total block bytes) to O(number of blocks * 16 bytes).
|
|
194
|
+
*
|
|
195
|
+
* @param carBytes - Raw CAR file bytes (e.g. from `getRepo` or a firehose commit)
|
|
196
|
+
* @returns `roots` — ordered list of root CID strings; `blocks` — lazy block map
|
|
197
|
+
*/
|
|
25
198
|
export function parseCarFrame(carBytes) {
|
|
26
199
|
let offset = 0;
|
|
27
200
|
// Read header length (varint-prefixed CBOR)
|
|
@@ -34,8 +207,8 @@ export function parseCarFrame(carBytes) {
|
|
|
34
207
|
// Our CBOR decoder converts tag-42 CIDs to { $link: "b..." } objects,
|
|
35
208
|
// so roots may already be decoded strings
|
|
36
209
|
const roots = (header.roots || []).map((root) => root?.$link ?? cidToString(root));
|
|
37
|
-
//
|
|
38
|
-
const
|
|
210
|
+
// Build offset index: CID → [start, end] into carBytes
|
|
211
|
+
const offsets = new Map();
|
|
39
212
|
while (offset < carBytes.length) {
|
|
40
213
|
const [blockLen, afterBlockLen] = readVarint(carBytes, offset);
|
|
41
214
|
offset = afterBlockLen;
|
|
@@ -44,9 +217,8 @@ export function parseCarFrame(carBytes) {
|
|
|
44
217
|
const [cidBytes, afterCid] = parseCidFromBytes(carBytes, offset);
|
|
45
218
|
const cid = cidToString(cidBytes);
|
|
46
219
|
const dataLen = blockLen - (afterCid - offset);
|
|
47
|
-
|
|
48
|
-
blocks.set(cid, data);
|
|
220
|
+
offsets.set(cid, [afterCid, afterCid + dataLen]);
|
|
49
221
|
offset = afterCid + dataLen;
|
|
50
222
|
}
|
|
51
|
-
return { roots, blocks };
|
|
223
|
+
return { roots, blocks: new LazyBlockMap(carBytes, offsets) };
|
|
52
224
|
}
|
package/dist/cbor.d.ts
CHANGED
|
@@ -1,7 +1,44 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Minimal CBOR (RFC 8949) decoder with DAG-CBOR CID support.
|
|
3
|
+
*
|
|
4
|
+
* Returns `{ value, offset }` so callers can decode concatenated CBOR values —
|
|
5
|
+
* the AT Protocol firehose sends frames as two back-to-back CBOR items
|
|
6
|
+
* (header + body).
|
|
7
|
+
*
|
|
8
|
+
* DAG-CBOR tag 42 (CID links) are decoded as `{ $link: "bafy..." }` objects,
|
|
9
|
+
* matching the convention used by the AT Protocol.
|
|
10
|
+
*
|
|
11
|
+
* @see https://www.rfc-editor.org/rfc/rfc8949 — CBOR spec
|
|
12
|
+
* @see https://ipld.io/specs/codecs/dag-cbor/spec/ — DAG-CBOR spec
|
|
13
|
+
* @module
|
|
14
|
+
*/
|
|
1
15
|
interface DecodeResult {
|
|
16
|
+
/** The decoded JavaScript value. */
|
|
2
17
|
value: any;
|
|
18
|
+
/** Byte offset immediately after the decoded value — use as `startOffset` to decode the next item. */
|
|
3
19
|
offset: number;
|
|
4
20
|
}
|
|
21
|
+
/**
|
|
22
|
+
* Decodes a single CBOR value from a byte array.
|
|
23
|
+
*
|
|
24
|
+
* Supports all major types: unsigned/negative integers, byte/text strings,
|
|
25
|
+
* arrays, maps, tags (with special handling for CID tag 42), and simple
|
|
26
|
+
* values (true, false, null).
|
|
27
|
+
*
|
|
28
|
+
* @param bytes - Raw CBOR bytes
|
|
29
|
+
* @param startOffset - Byte position to start decoding from (default `0`)
|
|
30
|
+
* @returns The decoded value and the offset of the next byte after it
|
|
31
|
+
*
|
|
32
|
+
* @example
|
|
33
|
+
* ```ts
|
|
34
|
+
* // Decode a single value
|
|
35
|
+
* const { value } = cborDecode(bytes)
|
|
36
|
+
*
|
|
37
|
+
* // Decode two concatenated values (firehose frame)
|
|
38
|
+
* const { value: header, offset } = cborDecode(frameBytes)
|
|
39
|
+
* const { value: body } = cborDecode(frameBytes, offset)
|
|
40
|
+
* ```
|
|
41
|
+
*/
|
|
5
42
|
export declare function cborDecode(bytes: Uint8Array, startOffset?: number): DecodeResult;
|
|
6
43
|
export {};
|
|
7
44
|
//# sourceMappingURL=cbor.d.ts.map
|
package/dist/cbor.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"cbor.d.ts","sourceRoot":"","sources":["../src/cbor.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"cbor.d.ts","sourceRoot":"","sources":["../src/cbor.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;GAaG;AAOH,UAAU,YAAY;IACpB,oCAAoC;IACpC,KAAK,EAAE,GAAG,CAAA;IACV,sGAAsG;IACtG,MAAM,EAAE,MAAM,CAAA;CACf;AAED;;;;;;;;;;;;;;;;;;;;GAoBG;AACH,wBAAgB,UAAU,CAAC,KAAK,EAAE,UAAU,EAAE,WAAW,SAAI,GAAG,YAAY,CAgF3E"}
|