@hatk/hatk 0.0.1-alpha.0 → 0.0.1-alpha.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,11 +1,70 @@
1
1
  import type { BackfillConfig } from './config.ts';
2
+ /** Options passed to {@link runBackfill}. */
2
3
  interface BackfillOpts {
4
+ /** Base URL of the relay or PDS to enumerate repos from (e.g. `wss://bsky.network`). */
3
5
  pdsUrl: string;
6
+ /** PLC directory URL used to resolve `did:plc` identifiers (e.g. `https://plc.directory`). */
4
7
  plcUrl: string;
8
+ /** AT Protocol collection NSIDs to index (e.g. `app.bsky.feed.post`). */
5
9
  collections: Set<string>;
10
+ /** Backfill behavior settings from `config.yaml`. */
6
11
  config: BackfillConfig;
7
12
  }
13
+ /**
14
+ * Downloads and indexes a single user's repo via `com.atproto.sync.getRepo`.
15
+ *
16
+ * The full flow:
17
+ * 1. Resolve the DID to find the user's PDS endpoint
18
+ * 2. Fetch the repo as a CAR file from the PDS
19
+ * 3. Parse the CAR, decode the commit, and walk the MST (Merkle Search Tree)
20
+ * 4. Delete any existing records for this DID (so deletions are reflected)
21
+ * 5. Bulk-insert all records matching the target collections
22
+ *
23
+ * On failure, applies exponential backoff retry logic. HTTP 4xx errors are
24
+ * treated as permanent failures (repo doesn't exist or is deactivated) and
25
+ * are not retried.
26
+ *
27
+ * @param did - The DID of the repo to backfill (e.g. `did:plc:abc123`)
28
+ * @param collections - Collection NSIDs to index; records in other collections are skipped
29
+ * @param fetchTimeout - Maximum seconds to wait for the CAR download before aborting
30
+ * @returns The number of records successfully indexed
31
+ *
32
+ * @example
33
+ * ```ts
34
+ * const count = await backfillRepo('did:plc:abc123', new Set(['app.bsky.feed.post']), 30)
35
+ * console.log(`Indexed ${count} records`)
36
+ * ```
37
+ */
8
38
  export declare function backfillRepo(did: string, collections: Set<string>, fetchTimeout: number): Promise<number>;
39
+ /**
40
+ * Orchestrates a full backfill run: enumerate repos, filter to pending, download, and index.
41
+ *
42
+ * Operates in one of three modes based on config:
43
+ * - **Pinned repos** — backfill only the DIDs listed in `config.repos`
44
+ * - **Full network** — enumerate every active repo on the relay via `listRepos`
45
+ * - **Collection signal** (default) — use `listReposByCollection` to discover repos that
46
+ * contain records in the configured signal collections, falling back to `listRepos`
47
+ * if the relay doesn't support collection-scoped enumeration
48
+ *
49
+ * After the initial pass, failed repos are retried with exponential backoff
50
+ * (up to `config.maxRetries` attempts). The run emits structured log events for
51
+ * monitoring via the `backfill.run` and `backfill.retry_round` event types.
52
+ *
53
+ * @example
54
+ * ```ts
55
+ * await runBackfill({
56
+ * pdsUrl: 'wss://bsky.network',
57
+ * plcUrl: 'https://plc.directory',
58
+ * collections: new Set(['xyz.statusphere.status']),
59
+ * config: {
60
+ * fullNetwork: false,
61
+ * parallelism: 10,
62
+ * fetchTimeout: 30,
63
+ * maxRetries: 5,
64
+ * },
65
+ * })
66
+ * ```
67
+ */
9
68
  export declare function runBackfill(opts: BackfillOpts): Promise<void>;
10
69
  export {};
11
70
  //# sourceMappingURL=backfill.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"backfill.d.ts","sourceRoot":"","sources":["../src/backfill.ts"],"names":[],"mappings":"AAgBA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAA;AAEjD,UAAU,YAAY;IACpB,MAAM,EAAE,MAAM,CAAA;IACd,MAAM,EAAE,MAAM,CAAA;IACd,WAAW,EAAE,GAAG,CAAC,MAAM,CAAC,CAAA;IACxB,MAAM,EAAE,cAAc,CAAA;CACvB;AA+ED,wBAAsB,YAAY,CAAC,GAAG,EAAE,MAAM,EAAE,WAAW,EAAE,GAAG,CAAC,MAAM,CAAC,EAAE,YAAY,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAiH/G;AAwBD,wBAAsB,WAAW,CAAC,IAAI,EAAE,YAAY,GAAG,OAAO,CAAC,IAAI,CAAC,CAiInE"}
1
+ {"version":3,"file":"backfill.d.ts","sourceRoot":"","sources":["../src/backfill.ts"],"names":[],"mappings":"AAgBA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAA;AAEjD,6CAA6C;AAC7C,UAAU,YAAY;IACpB,wFAAwF;IACxF,MAAM,EAAE,MAAM,CAAA;IACd,8FAA8F;IAC9F,MAAM,EAAE,MAAM,CAAA;IACd,yEAAyE;IACzE,WAAW,EAAE,GAAG,CAAC,MAAM,CAAC,CAAA;IACxB,qDAAqD;IACrD,MAAM,EAAE,cAAc,CAAA;CACvB;AAuGD;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AACH,wBAAsB,YAAY,CAAC,GAAG,EAAE,MAAM,EAAE,WAAW,EAAE,GAAG,CAAC,MAAM,CAAC,EAAE,YAAY,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CA4H/G;AAgCD;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA4BG;AACH,wBAAsB,WAAW,CAAC,IAAI,EAAE,YAAY,GAAG,OAAO,CAAC,IAAI,CAAC,CAiInE"}
package/dist/backfill.js CHANGED
@@ -3,8 +3,22 @@ import { cborDecode } from "./cbor.js";
3
3
  import { walkMst } from "./mst.js";
4
4
  import { setRepoStatus, getRepoStatus, getRepoRetryInfo, listRetryEligibleRepos, listPendingRepos, querySQL, runSQL, getSchema, bulkInsertRecords, } from "./db.js";
5
5
  import { emit, timer } from "./logger.js";
6
+ /** In-memory cache of DID → PDS resolution results to avoid redundant lookups. */
6
7
  const pdsCache = new Map();
7
8
  let plcUrl;
9
+ /**
10
+ * Resolves a DID to its PDS endpoint and handle by fetching the DID document.
11
+ *
12
+ * Supports both `did:web` (fetches `/.well-known/did.json`) and `did:plc`
13
+ * (fetches from the PLC directory). Results are cached for the lifetime of the process.
14
+ *
15
+ * @example
16
+ * ```ts
17
+ * const { pds, handle } = await resolvePds('did:plc:abc123')
18
+ * // pds = "https://puffball.us-east.host.bsky.network"
19
+ * // handle = "alice.bsky.social"
20
+ * ```
21
+ */
8
22
  async function resolvePds(did) {
9
23
  const cached = pdsCache.get(did);
10
24
  if (cached)
@@ -33,7 +47,10 @@ async function resolvePds(did) {
33
47
  pdsCache.set(did, result);
34
48
  return result;
35
49
  }
36
- // --- Repo Enumeration ---
50
+ /**
51
+ * Paginates through all active repos on a relay/PDS using `com.atproto.sync.listRepos`.
52
+ * Yields `{ did, rev }` for each active repo. Skips deactivated repos.
53
+ */
37
54
  async function* listRepos(pdsUrl) {
38
55
  let cursor;
39
56
  while (true) {
@@ -53,6 +70,13 @@ async function* listRepos(pdsUrl) {
53
70
  cursor = data.cursor;
54
71
  }
55
72
  }
73
+ /**
74
+ * Paginates through repos that contain records in a specific collection using
75
+ * `com.atproto.sync.listReposByCollection`. More efficient than {@link listRepos}
76
+ * when only a few collections are needed, since the relay can filter server-side.
77
+ *
78
+ * Not all relays support this endpoint — callers should fall back to {@link listRepos}.
79
+ */
56
80
  async function* listReposByCollection(pdsUrl, collection) {
57
81
  let cursor;
58
82
  while (true) {
@@ -71,7 +95,31 @@ async function* listReposByCollection(pdsUrl, collection) {
71
95
  cursor = data.cursor;
72
96
  }
73
97
  }
74
- // --- Single Repo Backfill ---
98
+ /**
99
+ * Downloads and indexes a single user's repo via `com.atproto.sync.getRepo`.
100
+ *
101
+ * The full flow:
102
+ * 1. Resolve the DID to find the user's PDS endpoint
103
+ * 2. Fetch the repo as a CAR file from the PDS
104
+ * 3. Parse the CAR, decode the commit, and walk the MST (Merkle Search Tree)
105
+ * 4. Delete any existing records for this DID (so deletions are reflected)
106
+ * 5. Bulk-insert all records matching the target collections
107
+ *
108
+ * On failure, applies exponential backoff retry logic. HTTP 4xx errors are
109
+ * treated as permanent failures (repo doesn't exist or is deactivated) and
110
+ * are not retried.
111
+ *
112
+ * @param did - The DID of the repo to backfill (e.g. `did:plc:abc123`)
113
+ * @param collections - Collection NSIDs to index; records in other collections are skipped
114
+ * @param fetchTimeout - Maximum seconds to wait for the CAR download before aborting
115
+ * @returns The number of records successfully indexed
116
+ *
117
+ * @example
118
+ * ```ts
119
+ * const count = await backfillRepo('did:plc:abc123', new Set(['app.bsky.feed.post']), 30)
120
+ * console.log(`Indexed ${count} records`)
121
+ * ```
122
+ */
75
123
  export async function backfillRepo(did, collections, fetchTimeout) {
76
124
  const elapsed = timer();
77
125
  let count = 0;
@@ -99,7 +147,7 @@ export async function backfillRepo(did, collections, fetchTimeout) {
99
147
  }
100
148
  const carBytes = new Uint8Array(await res.arrayBuffer());
101
149
  carSizeBytes = carBytes.length;
102
- const { roots, blocks } = parseCarFrame(carBytes);
150
+ let { roots, blocks } = parseCarFrame(carBytes);
103
151
  // Decode commit to get MST root
104
152
  const rootData = blocks.get(roots[0]);
105
153
  if (!rootData)
@@ -107,7 +155,24 @@ export async function backfillRepo(did, collections, fetchTimeout) {
107
155
  const { value: commit } = cborDecode(rootData);
108
156
  // Walk MST to find all record paths
109
157
  const entries = walkMst(blocks, commit.data.$link);
110
- const bulk = [];
158
+ // Delete existing records for this DID before re-importing so deletions are reflected
159
+ for (const col of collections) {
160
+ const schema = getSchema(col);
161
+ if (!schema)
162
+ continue;
163
+ await runSQL(`DELETE FROM ${schema.tableName} WHERE did = $1`, did);
164
+ for (const child of schema.children) {
165
+ await runSQL(`DELETE FROM ${child.tableName} WHERE parent_did = $1`, did);
166
+ }
167
+ for (const union of schema.unions) {
168
+ for (const branch of union.branches) {
169
+ await runSQL(`DELETE FROM ${branch.tableName} WHERE parent_did = $1`, did);
170
+ }
171
+ }
172
+ }
173
+ // Insert records in chunks to limit memory usage
174
+ const CHUNK_SIZE = 1000;
175
+ let chunk = [];
111
176
  for (const entry of entries) {
112
177
  const collection = entry.path.split('/')[0];
113
178
  if (!collections.has(collection))
@@ -115,13 +180,18 @@ export async function backfillRepo(did, collections, fetchTimeout) {
115
180
  const blockData = blocks.get(entry.cid);
116
181
  if (!blockData)
117
182
  continue;
183
+ blocks.delete(entry.cid); // free block data as we go
118
184
  try {
119
185
  const { value: record } = cborDecode(blockData);
120
186
  if (!record?.$type)
121
187
  continue;
122
188
  const rkey = entry.path.split('/').slice(1).join('/');
123
189
  const uri = `at://${did}/${collection}/${rkey}`;
124
- bulk.push({ collection, uri, cid: entry.cid, did, record });
190
+ chunk.push({ collection, uri, cid: entry.cid, did, record });
191
+ if (chunk.length >= CHUNK_SIZE) {
192
+ count += await bulkInsertRecords(chunk);
193
+ chunk = [];
194
+ }
125
195
  }
126
196
  catch (recordErr) {
127
197
  emit('backfill', 'record_error', {
@@ -132,22 +202,11 @@ export async function backfillRepo(did, collections, fetchTimeout) {
132
202
  });
133
203
  }
134
204
  }
135
- // Delete existing records for this DID before re-importing so deletions are reflected
136
- for (const col of collections) {
137
- const schema = getSchema(col);
138
- if (!schema)
139
- continue;
140
- await runSQL(`DELETE FROM ${schema.tableName} WHERE did = $1`, did);
141
- for (const child of schema.children) {
142
- await runSQL(`DELETE FROM ${child.tableName} WHERE parent_did = $1`, did);
143
- }
144
- for (const union of schema.unions) {
145
- for (const branch of union.branches) {
146
- await runSQL(`DELETE FROM ${branch.tableName} WHERE parent_did = $1`, did);
147
- }
148
- }
205
+ blocks.free();
206
+ blocks = null;
207
+ if (chunk.length > 0) {
208
+ count += await bulkInsertRecords(chunk);
149
209
  }
150
- count = await bulkInsertRecords(bulk);
151
210
  await setRepoStatus(did, 'active', commit.rev, { handle });
152
211
  return count;
153
212
  }
@@ -185,7 +244,16 @@ export async function backfillRepo(did, collections, fetchTimeout) {
185
244
  });
186
245
  }
187
246
  }
188
- // --- Worker Pool ---
247
+ /**
248
+ * Processes items concurrently with a fixed number of workers.
249
+ * Workers pull from a shared index so the pool stays saturated even when
250
+ * individual items complete at different speeds. Errors from `fn` are
251
+ * swallowed (they're expected to be captured via structured logging).
252
+ *
253
+ * @param items - The work items to process
254
+ * @param parallelism - Maximum number of concurrent workers
255
+ * @param fn - Async function to run for each item
256
+ */
189
257
  async function runWorkerPool(items, parallelism, fn) {
190
258
  let index = 0;
191
259
  async function worker() {
@@ -202,7 +270,35 @@ async function runWorkerPool(items, parallelism, fn) {
202
270
  const workers = Array.from({ length: Math.min(parallelism, items.length) }, () => worker());
203
271
  await Promise.all(workers);
204
272
  }
205
- // --- Main Backfill Entry Point ---
273
+ /**
274
+ * Orchestrates a full backfill run: enumerate repos, filter to pending, download, and index.
275
+ *
276
+ * Operates in one of three modes based on config:
277
+ * - **Pinned repos** — backfill only the DIDs listed in `config.repos`
278
+ * - **Full network** — enumerate every active repo on the relay via `listRepos`
279
+ * - **Collection signal** (default) — use `listReposByCollection` to discover repos that
280
+ * contain records in the configured signal collections, falling back to `listRepos`
281
+ * if the relay doesn't support collection-scoped enumeration
282
+ *
283
+ * After the initial pass, failed repos are retried with exponential backoff
284
+ * (up to `config.maxRetries` attempts). The run emits structured log events for
285
+ * monitoring via the `backfill.run` and `backfill.retry_round` event types.
286
+ *
287
+ * @example
288
+ * ```ts
289
+ * await runBackfill({
290
+ * pdsUrl: 'wss://bsky.network',
291
+ * plcUrl: 'https://plc.directory',
292
+ * collections: new Set(['xyz.statusphere.status']),
293
+ * config: {
294
+ * fullNetwork: false,
295
+ * parallelism: 10,
296
+ * fetchTimeout: 30,
297
+ * maxRetries: 5,
298
+ * },
299
+ * })
300
+ * ```
301
+ */
206
302
  export async function runBackfill(opts) {
207
303
  const { pdsUrl, collections, config } = opts;
208
304
  plcUrl = opts.plcUrl;
package/dist/car.d.ts CHANGED
@@ -1,5 +1,43 @@
1
+ /**
2
+ * CAR (Content Addressable aRchive) parser.
3
+ *
4
+ * CAR files bundle content-addressed blocks into a single binary container.
5
+ * They're used by the AT Protocol firehose (`com.atproto.sync.getRepo`) to
6
+ * deliver entire repos and by commit events to deliver individual changes.
7
+ *
8
+ * Format: `varint(headerLen) | CBOR(header) | block*`
9
+ * Each block: `varint(blockLen) | CID | data`
10
+ *
11
+ * @see https://ipld.io/specs/transport/car/carv1/
12
+ * @module
13
+ */
14
+ /**
15
+ * A memory-efficient block map that stores byte offsets into the original CAR
16
+ * buffer instead of copying block data. Implements the same `get`/`delete`/`size`
17
+ * interface as `Map<string, Uint8Array>` so it can be used as a drop-in replacement.
18
+ */
19
+ export declare class LazyBlockMap {
20
+ private offsets;
21
+ private carBytes;
22
+ constructor(carBytes: Uint8Array, offsets: Map<string, [number, number]>);
23
+ get(cid: string): Uint8Array | undefined;
24
+ delete(cid: string): boolean;
25
+ get size(): number;
26
+ [Symbol.iterator](): IterableIterator<[string, Uint8Array]>;
27
+ /** Release the underlying CAR buffer */
28
+ free(): void;
29
+ }
30
+ /**
31
+ * Parses a CARv1 binary frame into its root CIDs and a lazy block map.
32
+ *
33
+ * The block map stores byte offsets into `carBytes` rather than copying data,
34
+ * reducing heap usage from O(total block bytes) to O(number of blocks * 16 bytes).
35
+ *
36
+ * @param carBytes - Raw CAR file bytes (e.g. from `getRepo` or a firehose commit)
37
+ * @returns `roots` — ordered list of root CID strings; `blocks` — lazy block map
38
+ */
1
39
  export declare function parseCarFrame(carBytes: Uint8Array): {
2
40
  roots: string[];
3
- blocks: Map<string, Uint8Array>;
41
+ blocks: LazyBlockMap;
4
42
  };
5
43
  //# sourceMappingURL=car.d.ts.map
package/dist/car.d.ts.map CHANGED
@@ -1 +1 @@
1
- {"version":3,"file":"car.d.ts","sourceRoot":"","sources":["../src/car.ts"],"names":[],"mappings":"AAgCA,wBAAgB,aAAa,CAAC,QAAQ,EAAE,UAAU,GAAG;IACnD,KAAK,EAAE,MAAM,EAAE,CAAA;IACf,MAAM,EAAE,GAAG,CAAC,MAAM,EAAE,UAAU,CAAC,CAAA;CAChC,CAmCA"}
1
+ {"version":3,"file":"car.d.ts","sourceRoot":"","sources":["../src/car.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;GAYG;AAuCH;;;;GAIG;AACH,qBAAa,YAAY;IACvB,OAAO,CAAC,OAAO,CAA+B;IAC9C,OAAO,CAAC,QAAQ,CAAmB;gBAEvB,QAAQ,EAAE,UAAU,EAAE,OAAO,EAAE,GAAG,CAAC,MAAM,EAAE,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IAKxE,GAAG,CAAC,GAAG,EAAE,MAAM,GAAG,UAAU,GAAG,SAAS;IAMxC,MAAM,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO;IAI5B,IAAI,IAAI,IAAI,MAAM,CAEjB;IAEA,CAAC,MAAM,CAAC,QAAQ,CAAC,IAAI,gBAAgB,CAAC,CAAC,MAAM,EAAE,UAAU,CAAC,CAAC;IAO5D,wCAAwC;IACxC,IAAI,IAAI,IAAI;CAIb;AAED;;;;;;;;GAQG;AACH,wBAAgB,aAAa,CAAC,QAAQ,EAAE,UAAU,GAAG;IACnD,KAAK,EAAE,MAAM,EAAE,CAAA;IACf,MAAM,EAAE,YAAY,CAAA;CACrB,CAiCA"}
package/dist/car.js CHANGED
@@ -1,7 +1,26 @@
1
- // CAR (Content Addressable aRchive) parser from scratch
2
- // CAR files bundle content-addressed blocks — used in firehose events
1
+ /**
2
+ * CAR (Content Addressable aRchive) parser.
3
+ *
4
+ * CAR files bundle content-addressed blocks into a single binary container.
5
+ * They're used by the AT Protocol firehose (`com.atproto.sync.getRepo`) to
6
+ * deliver entire repos and by commit events to deliver individual changes.
7
+ *
8
+ * Format: `varint(headerLen) | CBOR(header) | block*`
9
+ * Each block: `varint(blockLen) | CID | data`
10
+ *
11
+ * @see https://ipld.io/specs/transport/car/carv1/
12
+ * @module
13
+ */
3
14
  import { cborDecode } from "./cbor.js";
4
15
  import { cidToString, readVarint } from "./cid.js";
16
+ /**
17
+ * Parses a CID (Content Identifier) from raw bytes at the given offset.
18
+ *
19
+ * Handles both CIDv0 (bare SHA-256 multihash, starts with `0x12`) and
20
+ * CIDv1 (version + codec + multihash with varint-encoded lengths).
21
+ *
22
+ * @returns A tuple of `[cidBytes, nextOffset]`
23
+ */
5
24
  function parseCidFromBytes(bytes, offset) {
6
25
  const firstByte = bytes[offset];
7
26
  if (firstByte === 0x12) {
@@ -22,6 +41,52 @@ function parseCidFromBytes(bytes, offset) {
22
41
  pos = afterDigestLen + digestLen;
23
42
  return [bytes.slice(offset, pos), pos];
24
43
  }
44
+ /**
45
+ * A memory-efficient block map that stores byte offsets into the original CAR
46
+ * buffer instead of copying block data. Implements the same `get`/`delete`/`size`
47
+ * interface as `Map<string, Uint8Array>` so it can be used as a drop-in replacement.
48
+ */
49
+ export class LazyBlockMap {
50
+ offsets;
51
+ carBytes;
52
+ constructor(carBytes, offsets) {
53
+ this.carBytes = carBytes;
54
+ this.offsets = offsets;
55
+ }
56
+ get(cid) {
57
+ const range = this.offsets.get(cid);
58
+ if (!range || !this.carBytes)
59
+ return undefined;
60
+ return this.carBytes.subarray(range[0], range[1]);
61
+ }
62
+ delete(cid) {
63
+ return this.offsets.delete(cid);
64
+ }
65
+ get size() {
66
+ return this.offsets.size;
67
+ }
68
+ *[Symbol.iterator]() {
69
+ for (const [cid, range] of this.offsets) {
70
+ if (!this.carBytes)
71
+ return;
72
+ yield [cid, this.carBytes.subarray(range[0], range[1])];
73
+ }
74
+ }
75
+ /** Release the underlying CAR buffer */
76
+ free() {
77
+ this.carBytes = null;
78
+ this.offsets.clear();
79
+ }
80
+ }
81
+ /**
82
+ * Parses a CARv1 binary frame into its root CIDs and a lazy block map.
83
+ *
84
+ * The block map stores byte offsets into `carBytes` rather than copying data,
85
+ * reducing heap usage from O(total block bytes) to O(number of blocks * 16 bytes).
86
+ *
87
+ * @param carBytes - Raw CAR file bytes (e.g. from `getRepo` or a firehose commit)
88
+ * @returns `roots` — ordered list of root CID strings; `blocks` — lazy block map
89
+ */
25
90
  export function parseCarFrame(carBytes) {
26
91
  let offset = 0;
27
92
  // Read header length (varint-prefixed CBOR)
@@ -34,8 +99,8 @@ export function parseCarFrame(carBytes) {
34
99
  // Our CBOR decoder converts tag-42 CIDs to { $link: "b..." } objects,
35
100
  // so roots may already be decoded strings
36
101
  const roots = (header.roots || []).map((root) => root?.$link ?? cidToString(root));
37
- // Parse blocks: each is varint(len) + CID + data
38
- const blocks = new Map();
102
+ // Build offset index: CID [start, end] into carBytes
103
+ const offsets = new Map();
39
104
  while (offset < carBytes.length) {
40
105
  const [blockLen, afterBlockLen] = readVarint(carBytes, offset);
41
106
  offset = afterBlockLen;
@@ -44,9 +109,8 @@ export function parseCarFrame(carBytes) {
44
109
  const [cidBytes, afterCid] = parseCidFromBytes(carBytes, offset);
45
110
  const cid = cidToString(cidBytes);
46
111
  const dataLen = blockLen - (afterCid - offset);
47
- const data = carBytes.slice(afterCid, afterCid + dataLen);
48
- blocks.set(cid, data);
112
+ offsets.set(cid, [afterCid, afterCid + dataLen]);
49
113
  offset = afterCid + dataLen;
50
114
  }
51
- return { roots, blocks };
115
+ return { roots, blocks: new LazyBlockMap(carBytes, offsets) };
52
116
  }
package/dist/cbor.d.ts CHANGED
@@ -1,7 +1,44 @@
1
+ /**
2
+ * Minimal CBOR (RFC 8949) decoder with DAG-CBOR CID support.
3
+ *
4
+ * Returns `{ value, offset }` so callers can decode concatenated CBOR values —
5
+ * the AT Protocol firehose sends frames as two back-to-back CBOR items
6
+ * (header + body).
7
+ *
8
+ * DAG-CBOR tag 42 (CID links) are decoded as `{ $link: "bafy..." }` objects,
9
+ * matching the convention used by the AT Protocol.
10
+ *
11
+ * @see https://www.rfc-editor.org/rfc/rfc8949 — CBOR spec
12
+ * @see https://ipld.io/specs/codecs/dag-cbor/spec/ — DAG-CBOR spec
13
+ * @module
14
+ */
1
15
  interface DecodeResult {
16
+ /** The decoded JavaScript value. */
2
17
  value: any;
18
+ /** Byte offset immediately after the decoded value — use as `startOffset` to decode the next item. */
3
19
  offset: number;
4
20
  }
21
+ /**
22
+ * Decodes a single CBOR value from a byte array.
23
+ *
24
+ * Supports all major types: unsigned/negative integers, byte/text strings,
25
+ * arrays, maps, tags (with special handling for CID tag 42), and simple
26
+ * values (true, false, null).
27
+ *
28
+ * @param bytes - Raw CBOR bytes
29
+ * @param startOffset - Byte position to start decoding from (default `0`)
30
+ * @returns The decoded value and the offset of the next byte after it
31
+ *
32
+ * @example
33
+ * ```ts
34
+ * // Decode a single value
35
+ * const { value } = cborDecode(bytes)
36
+ *
37
+ * // Decode two concatenated values (firehose frame)
38
+ * const { value: header, offset } = cborDecode(frameBytes)
39
+ * const { value: body } = cborDecode(frameBytes, offset)
40
+ * ```
41
+ */
5
42
  export declare function cborDecode(bytes: Uint8Array, startOffset?: number): DecodeResult;
6
43
  export {};
7
44
  //# sourceMappingURL=cbor.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"cbor.d.ts","sourceRoot":"","sources":["../src/cbor.ts"],"names":[],"mappings":"AAQA,UAAU,YAAY;IACpB,KAAK,EAAE,GAAG,CAAA;IACV,MAAM,EAAE,MAAM,CAAA;CACf;AAED,wBAAgB,UAAU,CAAC,KAAK,EAAE,UAAU,EAAE,WAAW,SAAI,GAAG,YAAY,CAgF3E"}
1
+ {"version":3,"file":"cbor.d.ts","sourceRoot":"","sources":["../src/cbor.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;GAaG;AAOH,UAAU,YAAY;IACpB,oCAAoC;IACpC,KAAK,EAAE,GAAG,CAAA;IACV,sGAAsG;IACtG,MAAM,EAAE,MAAM,CAAA;CACf;AAED;;;;;;;;;;;;;;;;;;;;GAoBG;AACH,wBAAgB,UAAU,CAAC,KAAK,EAAE,UAAU,EAAE,WAAW,SAAI,GAAG,YAAY,CAgF3E"}
package/dist/cbor.js CHANGED
@@ -1,8 +1,41 @@
1
- // CBOR decoder from scratch (RFC 8949)
2
- // Returns { value, offset } so we can split firehose frames
3
- // (two concatenated CBOR values: header + body)
1
+ /**
2
+ * Minimal CBOR (RFC 8949) decoder with DAG-CBOR CID support.
3
+ *
4
+ * Returns `{ value, offset }` so callers can decode concatenated CBOR values —
5
+ * the AT Protocol firehose sends frames as two back-to-back CBOR items
6
+ * (header + body).
7
+ *
8
+ * DAG-CBOR tag 42 (CID links) are decoded as `{ $link: "bafy..." }` objects,
9
+ * matching the convention used by the AT Protocol.
10
+ *
11
+ * @see https://www.rfc-editor.org/rfc/rfc8949 — CBOR spec
12
+ * @see https://ipld.io/specs/codecs/dag-cbor/spec/ — DAG-CBOR spec
13
+ * @module
14
+ */
4
15
  import { cidToString } from "./cid.js";
16
+ /** CBOR tag number for DAG-CBOR CID links. */
5
17
  const CBOR_TAG_CID = 42;
18
+ /**
19
+ * Decodes a single CBOR value from a byte array.
20
+ *
21
+ * Supports all major types: unsigned/negative integers, byte/text strings,
22
+ * arrays, maps, tags (with special handling for CID tag 42), and simple
23
+ * values (true, false, null).
24
+ *
25
+ * @param bytes - Raw CBOR bytes
26
+ * @param startOffset - Byte position to start decoding from (default `0`)
27
+ * @returns The decoded value and the offset of the next byte after it
28
+ *
29
+ * @example
30
+ * ```ts
31
+ * // Decode a single value
32
+ * const { value } = cborDecode(bytes)
33
+ *
34
+ * // Decode two concatenated values (firehose frame)
35
+ * const { value: header, offset } = cborDecode(frameBytes)
36
+ * const { value: body } = cborDecode(frameBytes, offset)
37
+ * ```
38
+ */
6
39
  export function cborDecode(bytes, startOffset = 0) {
7
40
  let offset = startOffset;
8
41
  function read() {
package/dist/cid.d.ts CHANGED
@@ -1,4 +1,41 @@
1
+ /**
2
+ * CID (Content Identifier), base32, and varint primitives.
3
+ *
4
+ * CIDs are self-describing content hashes used throughout the AT Protocol
5
+ * to reference blocks in repos and CAR files. This module provides the
6
+ * low-level encoding needed to convert raw CID bytes into their string
7
+ * representation (base32lower with `b` multibase prefix).
8
+ *
9
+ * @see https://github.com/multiformats/cid
10
+ * @module
11
+ */
12
+ /**
13
+ * Encodes raw bytes as a base32 lowercase string (RFC 4648, no padding).
14
+ *
15
+ * @example
16
+ * ```ts
17
+ * base32Encode(new Uint8Array([0x01, 0x71])) // "afyq"
18
+ * ```
19
+ */
1
20
  export declare function base32Encode(bytes: Uint8Array): string;
21
+ /**
22
+ * Converts raw CID bytes to their multibase-encoded string form (`b` prefix + base32lower).
23
+ *
24
+ * @example
25
+ * ```ts
26
+ * cidToString(cidBytes) // "bafyreig..."
27
+ * ```
28
+ */
2
29
  export declare function cidToString(cidBytes: Uint8Array): string;
30
+ /**
31
+ * Reads an unsigned LEB128 varint from a byte array.
32
+ *
33
+ * Varints are used extensively in CID encoding and CAR framing to represent
34
+ * variable-length integers in a compact form.
35
+ *
36
+ * @param bytes - Source byte array
37
+ * @param offset - Position to start reading from
38
+ * @returns A tuple of `[value, nextOffset]`
39
+ */
3
40
  export declare function readVarint(bytes: Uint8Array, offset: number): [number, number];
4
41
  //# sourceMappingURL=cid.d.ts.map
package/dist/cid.d.ts.map CHANGED
@@ -1 +1 @@
1
- {"version":3,"file":"cid.d.ts","sourceRoot":"","sources":["../src/cid.ts"],"names":[],"mappings":"AAKA,wBAAgB,YAAY,CAAC,KAAK,EAAE,UAAU,GAAG,MAAM,CAmBtD;AAED,wBAAgB,WAAW,CAAC,QAAQ,EAAE,UAAU,GAAG,MAAM,CAGxD;AAED,wBAAgB,UAAU,CAAC,KAAK,EAAE,UAAU,EAAE,MAAM,EAAE,MAAM,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CAc9E"}
1
+ {"version":3,"file":"cid.d.ts","sourceRoot":"","sources":["../src/cid.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAKH;;;;;;;GAOG;AACH,wBAAgB,YAAY,CAAC,KAAK,EAAE,UAAU,GAAG,MAAM,CAmBtD;AAED;;;;;;;GAOG;AACH,wBAAgB,WAAW,CAAC,QAAQ,EAAE,UAAU,GAAG,MAAM,CAExD;AAED;;;;;;;;;GASG;AACH,wBAAgB,UAAU,CAAC,KAAK,EAAE,UAAU,EAAE,MAAM,EAAE,MAAM,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CAc9E"}
package/dist/cid.js CHANGED
@@ -1,6 +1,24 @@
1
- // CID (Content Identifier) + base32 + varint — from scratch
2
- // CIDs are self-describing content hashes used throughout AT Protocol
1
+ /**
2
+ * CID (Content Identifier), base32, and varint primitives.
3
+ *
4
+ * CIDs are self-describing content hashes used throughout the AT Protocol
5
+ * to reference blocks in repos and CAR files. This module provides the
6
+ * low-level encoding needed to convert raw CID bytes into their string
7
+ * representation (base32lower with `b` multibase prefix).
8
+ *
9
+ * @see https://github.com/multiformats/cid
10
+ * @module
11
+ */
12
+ /** RFC 4648 base32 lowercase alphabet (no padding). */
3
13
  const BASE32_ALPHABET = 'abcdefghijklmnopqrstuvwxyz234567';
14
+ /**
15
+ * Encodes raw bytes as a base32 lowercase string (RFC 4648, no padding).
16
+ *
17
+ * @example
18
+ * ```ts
19
+ * base32Encode(new Uint8Array([0x01, 0x71])) // "afyq"
20
+ * ```
21
+ */
4
22
  export function base32Encode(bytes) {
5
23
  let result = '';
6
24
  let bits = 0;
@@ -18,10 +36,27 @@ export function base32Encode(bytes) {
18
36
  }
19
37
  return result;
20
38
  }
39
+ /**
40
+ * Converts raw CID bytes to their multibase-encoded string form (`b` prefix + base32lower).
41
+ *
42
+ * @example
43
+ * ```ts
44
+ * cidToString(cidBytes) // "bafyreig..."
45
+ * ```
46
+ */
21
47
  export function cidToString(cidBytes) {
22
- // base32lower with 'b' multibase prefix
23
48
  return `b${base32Encode(cidBytes)}`;
24
49
  }
50
+ /**
51
+ * Reads an unsigned LEB128 varint from a byte array.
52
+ *
53
+ * Varints are used extensively in CID encoding and CAR framing to represent
54
+ * variable-length integers in a compact form.
55
+ *
56
+ * @param bytes - Source byte array
57
+ * @param offset - Position to start reading from
58
+ * @returns A tuple of `[value, nextOffset]`
59
+ */
25
60
  export function readVarint(bytes, offset) {
26
61
  let value = 0;
27
62
  let shift = 0;
package/dist/cli.js CHANGED
@@ -135,7 +135,7 @@ export default defineQuery('${name}', async (ctx) => {
135
135
  })
136
136
  })
137
137
  `,
138
- label: (name) => `import type { LabelRuleContext } from 'hatk/labels'
138
+ label: (name) => `import type { LabelRuleContext } from '@hatk/hatk/labels'
139
139
 
140
140
  export default {
141
141
  definition: {
@@ -151,7 +151,7 @@ export default {
151
151
  },
152
152
  }
153
153
  `,
154
- og: (name) => `import type { OpengraphContext, OpengraphResult } from 'hatk/opengraph'
154
+ og: (name) => `import type { OpengraphContext, OpengraphResult } from '@hatk/hatk/opengraph'
155
155
 
156
156
  export default {
157
157
  path: '/og/${name}/:id',
@@ -187,7 +187,7 @@ function xrpcImportPath(nsid) {
187
187
  }
188
188
  const testTemplates = {
189
189
  feed: (name) => `import { describe, test, expect, beforeAll, afterAll } from 'vitest'
190
- import { createTestContext } from 'hatk/test'
190
+ import { createTestContext } from '@hatk/hatk/test'
191
191
 
192
192
  let ctx: Awaited<ReturnType<typeof createTestContext>>
193
193
 
@@ -207,7 +207,7 @@ describe('${name} feed', () => {
207
207
  })
208
208
  `,
209
209
  xrpc: (name) => `import { describe, test, expect, beforeAll, afterAll } from 'vitest'
210
- import { createTestContext } from 'hatk/test'
210
+ import { createTestContext } from '@hatk/hatk/test'
211
211
 
212
212
  let ctx: Awaited<ReturnType<typeof createTestContext>>
213
213
 
@@ -591,6 +591,53 @@ backfill:
591
591
  },
592
592
  },
593
593
  }, null, 2) + '\n');
594
+ writeFileSync(join(coreLexDir, 'getPreferences.json'), JSON.stringify({
595
+ lexicon: 1,
596
+ id: 'dev.hatk.getPreferences',
597
+ defs: {
598
+ main: {
599
+ type: 'query',
600
+ description: 'Get all preferences for the authenticated user.',
601
+ output: {
602
+ encoding: 'application/json',
603
+ schema: {
604
+ type: 'object',
605
+ properties: {
606
+ preferences: { type: 'unknown' },
607
+ },
608
+ },
609
+ },
610
+ },
611
+ },
612
+ }, null, 2) + '\n');
613
+ writeFileSync(join(coreLexDir, 'putPreference.json'), JSON.stringify({
614
+ lexicon: 1,
615
+ id: 'dev.hatk.putPreference',
616
+ defs: {
617
+ main: {
618
+ type: 'procedure',
619
+ description: 'Set a single preference by key.',
620
+ input: {
621
+ encoding: 'application/json',
622
+ schema: {
623
+ type: 'object',
624
+ required: ['key', 'value'],
625
+ properties: {
626
+ key: { type: 'string' },
627
+ value: { type: 'unknown' },
628
+ },
629
+ },
630
+ },
631
+ output: {
632
+ encoding: 'application/json',
633
+ schema: {
634
+ type: 'object',
635
+ properties: {},
636
+ },
637
+ },
638
+ },
639
+ },
640
+ }, null, 2) + '\n');
594
641
  writeFileSync(join(coreLexDir, 'getFeed.json'), JSON.stringify({
595
642
  lexicon: 1,
596
643
  id: 'dev.hatk.getFeed',
@@ -611,6 +658,7 @@ backfill:
611
658
  encoding: 'application/json',
612
659
  schema: {
613
660
  type: 'object',
661
+ required: ['items'],
614
662
  properties: {
615
663
  items: { type: 'array', items: { type: 'unknown' } },
616
664
  cursor: { type: 'string' },
@@ -668,6 +716,7 @@ backfill:
668
716
  encoding: 'application/json',
669
717
  schema: {
670
718
  type: 'object',
719
+ required: ['items'],
671
720
  properties: {
672
721
  items: { type: 'array', items: { type: 'unknown' } },
673
722
  cursor: { type: 'string' },
@@ -699,6 +748,7 @@ backfill:
699
748
  encoding: 'application/json',
700
749
  schema: {
701
750
  type: 'object',
751
+ required: ['items'],
702
752
  properties: {
703
753
  items: { type: 'array', items: { type: 'unknown' } },
704
754
  cursor: { type: 'string' },
@@ -793,11 +843,12 @@ public
793
843
  writeFileSync(join(dir, 'Dockerfile'), `FROM node:25-slim
794
844
  WORKDIR /app
795
845
  COPY package.json package-lock.json ./
796
- RUN npm ci --omit=dev
846
+ RUN npm ci
797
847
  COPY . .
798
848
  RUN node_modules/.bin/hatk build
849
+ RUN npm prune --omit=dev
799
850
  EXPOSE 3000
800
- CMD ["node", "--experimental-strip-types", "--no-warnings", "node_modules/hatk/src/main.ts", "config.yaml"]
851
+ CMD ["node", "--max-old-space-size=512", "node_modules/@hatk/hatk/dist/main.js", "config.yaml"]
801
852
  `);
802
853
  const pkgDeps = { '@hatk/oauth-client': '*', hatk: '*' };
803
854
  const pkgDevDeps = {
@@ -807,6 +858,7 @@ CMD ["node", "--experimental-strip-types", "--no-warnings", "node_modules/hatk/s
807
858
  typescript: '^5',
808
859
  vite: '^6',
809
860
  vitest: '^4',
861
+ '@types/node': '^22',
810
862
  };
811
863
  if (withSvelte) {
812
864
  pkgDevDeps['@sveltejs/adapter-static'] = '^3';
@@ -899,7 +951,7 @@ export default {
899
951
  }
900
952
  `);
901
953
  writeFileSync(join(dir, 'vite.config.ts'), `import { sveltekit } from '@sveltejs/kit/vite'
902
- import { hatk } from 'hatk/vite-plugin'
954
+ import { hatk } from '@hatk/hatk/vite-plugin'
903
955
  import { defineConfig } from 'vite'
904
956
 
905
957
  export default defineConfig({
@@ -926,6 +978,7 @@ export default defineConfig({
926
978
  <head>
927
979
  <meta charset="utf-8" />
928
980
  <meta name="viewport" content="width=device-width, initial-scale=1" />
981
+ <meta name="description" content="${name}" />
929
982
  <title>${name}</title>
930
983
  %sveltekit.head%
931
984
  </head>
@@ -1118,10 +1171,10 @@ else if (command === 'generate') {
1118
1171
  // Collect which wrappers are used (only from entries with a main type)
1119
1172
  const usedWrappers = new Set(entries.filter((e) => e.defType).map((e) => wrapperMap[e.defType]));
1120
1173
  let out = '// Auto-generated from lexicons. Do not edit.\n';
1121
- out += `import type { ${[...usedWrappers].sort().join(', ')}, LexServerParams, Checked, Prettify, StrictArg } from 'hatk/lex-types'\n`;
1122
- out += `import type { XrpcContext } from 'hatk/xrpc'\n`;
1123
- out += `import { defineFeed as _defineFeed, type FeedResult, type FeedContext, type HydrateContext } from 'hatk/feeds'\n`;
1124
- out += `import { seed as _seed, type SeedOpts } from 'hatk/seed'\n`;
1174
+ out += `import type { ${[...usedWrappers].sort().join(', ')}, LexServerParams, Checked, Prettify, StrictArg } from '@hatk/hatk/lex-types'\n`;
1175
+ out += `import type { XrpcContext } from '@hatk/hatk/xrpc'\n`;
1176
+ out += `import { defineFeed as _defineFeed, type FeedResult, type FeedContext, type HydrateContext } from '@hatk/hatk/feeds'\n`;
1177
+ out += `import { seed as _seed, type SeedOpts } from '@hatk/hatk/seed'\n`;
1125
1178
  // Emit ALL lexicons as `const ... = {...} as const` (including defs-only)
1126
1179
  out += `\n// ─── Lexicon Definitions ────────────────────────────────────────────\n\n`;
1127
1180
  for (const { nsid } of entries) {
@@ -1276,8 +1329,8 @@ else if (command === 'generate') {
1276
1329
  out += `}\n`;
1277
1330
  // Emit Ctx helper for typesafe XRPC handler contexts
1278
1331
  out += `\n// ─── XRPC Helpers ───────────────────────────────────────────────────\n\n`;
1279
- out += `export type { HydrateContext } from 'hatk/feeds'\n`;
1280
- out += `export { InvalidRequestError, NotFoundError } from 'hatk/xrpc'\n`;
1332
+ out += `export type { HydrateContext } from '@hatk/hatk/feeds'\n`;
1333
+ out += `export { InvalidRequestError, NotFoundError } from '@hatk/hatk/xrpc'\n`;
1281
1334
  out += `export type Ctx<K extends keyof XrpcSchema & keyof Registry> = XrpcContext<\n`;
1282
1335
  out += ` LexServerParams<Registry[K], Registry>,\n`;
1283
1336
  out += ` RecordRegistry,\n`;
@@ -1324,7 +1377,7 @@ else if (command === 'generate') {
1324
1377
  // Patch imports to include LexDef if needed
1325
1378
  if (hasLexDef) {
1326
1379
  usedWrappers.add('LexDef');
1327
- out = out.replace(/import type \{ ([^}]+) \} from 'hatk\/lex-types'/, `import type { ${[...usedWrappers].sort().join(', ')}, LexServerParams, Checked, Prettify, StrictArg } from 'hatk/lex-types'`);
1380
+ out = out.replace(/import type \{ ([^}]+) \} from '@hatk\/hatk\/lex-types'/, `import type { ${[...usedWrappers].sort().join(', ')}, LexServerParams, Checked, Prettify, StrictArg } from '@hatk/hatk/lex-types'`);
1328
1381
  }
1329
1382
  writeFileSync(outPath, out);
1330
1383
  console.log(`Generated ${outPath} with ${entries.length} types: ${entries.map((e) => capitalize(varNames.get(e.nsid))).join(', ')}`);
@@ -1436,8 +1489,12 @@ else if (command === 'dev') {
1436
1489
  }
1437
1490
  else {
1438
1491
  // No frontend — just run the hatk server directly
1439
- const mainPath = resolve(import.meta.dirname, 'main.ts');
1440
- execSync(`npx tsx ${mainPath} config.yaml`, { stdio: 'inherit', cwd: process.cwd() });
1492
+ const mainPath = resolve(import.meta.dirname, 'main.js');
1493
+ execSync(`npx tsx ${mainPath} config.yaml`, {
1494
+ stdio: 'inherit',
1495
+ cwd: process.cwd(),
1496
+ env: { ...process.env, DEV_MODE: '1' },
1497
+ });
1441
1498
  }
1442
1499
  }
1443
1500
  catch (e) {
@@ -1649,7 +1706,7 @@ else if (command === 'schema') {
1649
1706
  }
1650
1707
  else if (command === 'start') {
1651
1708
  try {
1652
- const mainPath = resolve(import.meta.dirname, 'main.ts');
1709
+ const mainPath = resolve(import.meta.dirname, 'main.js');
1653
1710
  execSync(`npx tsx ${mainPath} config.yaml`, { stdio: 'inherit', cwd: process.cwd() });
1654
1711
  }
1655
1712
  catch (e) {
package/dist/config.js CHANGED
@@ -23,7 +23,7 @@ export function loadConfig(configPath) {
23
23
  signalCollections: backfillRaw.signalCollections || undefined,
24
24
  repos: env.BACKFILL_REPOS ? env.BACKFILL_REPOS.split(',').map((s) => s.trim()) : backfillRaw.repos || undefined,
25
25
  fullNetwork: env.BACKFILL_FULL_NETWORK ? env.BACKFILL_FULL_NETWORK === 'true' : backfillRaw.fullNetwork || false,
26
- parallelism: parseInt(env.BACKFILL_PARALLELISM || '') || backfillRaw.parallelism || 5,
26
+ parallelism: parseInt(env.BACKFILL_PARALLELISM || '') || backfillRaw.parallelism || 3,
27
27
  fetchTimeout: parseInt(env.BACKFILL_FETCH_TIMEOUT || '') || backfillRaw.fetchTimeout || 300,
28
28
  maxRetries: parseInt(env.BACKFILL_MAX_RETRIES || '') || backfillRaw.maxRetries || 5,
29
29
  },
package/dist/indexer.d.ts CHANGED
@@ -7,6 +7,7 @@ interface IndexerOpts {
7
7
  cursor?: string | null;
8
8
  fetchTimeout: number;
9
9
  maxRetries: number;
10
+ parallelism?: number;
10
11
  ftsRebuildInterval?: number;
11
12
  }
12
13
  export declare function startIndexer(opts: IndexerOpts): Promise<WebSocket>;
@@ -1 +1 @@
1
- {"version":3,"file":"indexer.d.ts","sourceRoot":"","sources":["../src/indexer.ts"],"names":[],"mappings":"AAkIA,wBAAsB,mBAAmB,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,SAAI,GAAG,OAAO,CAAC,IAAI,CAAC,CAsDjF;AAED,UAAU,WAAW;IACnB,QAAQ,EAAE,MAAM,CAAA;IAChB,WAAW,EAAE,GAAG,CAAC,MAAM,CAAC,CAAA;IACxB,iBAAiB,CAAC,EAAE,GAAG,CAAC,MAAM,CAAC,CAAA;IAC/B,WAAW,CAAC,EAAE,GAAG,CAAC,MAAM,CAAC,CAAA;IACzB,MAAM,CAAC,EAAE,MAAM,GAAG,IAAI,CAAA;IACtB,YAAY,EAAE,MAAM,CAAA;IACpB,UAAU,EAAE,MAAM,CAAA;IAClB,kBAAkB,CAAC,EAAE,MAAM,CAAA;CAC5B;AAyBD,wBAAsB,YAAY,CAAC,IAAI,EAAE,WAAW,GAAG,OAAO,CAAC,SAAS,CAAC,CAkDxE"}
1
+ {"version":3,"file":"indexer.d.ts","sourceRoot":"","sources":["../src/indexer.ts"],"names":[],"mappings":"AAmIA,wBAAsB,mBAAmB,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,SAAI,GAAG,OAAO,CAAC,IAAI,CAAC,CAgEjF;AAED,UAAU,WAAW;IACnB,QAAQ,EAAE,MAAM,CAAA;IAChB,WAAW,EAAE,GAAG,CAAC,MAAM,CAAC,CAAA;IACxB,iBAAiB,CAAC,EAAE,GAAG,CAAC,MAAM,CAAC,CAAA;IAC/B,WAAW,CAAC,EAAE,GAAG,CAAC,MAAM,CAAC,CAAA;IACzB,MAAM,CAAC,EAAE,MAAM,GAAG,IAAI,CAAA;IACtB,YAAY,EAAE,MAAM,CAAA;IACpB,UAAU,EAAE,MAAM,CAAA;IAClB,WAAW,CAAC,EAAE,MAAM,CAAA;IACpB,kBAAkB,CAAC,EAAE,MAAM,CAAA;CAC5B;AAyBD,wBAAsB,YAAY,CAAC,IAAI,EAAE,WAAW,GAAG,OAAO,CAAC,SAAS,CAAC,CAmDxE"}
package/dist/indexer.js CHANGED
@@ -18,7 +18,7 @@ let ftsRebuildInterval = 500;
18
18
  const pendingBuffers = new Map();
19
19
  // Track in-flight backfills to avoid duplicates
20
20
  const backfillInFlight = new Set();
21
- const MAX_CONCURRENT_BACKFILLS = 5;
21
+ const pendingReschedule = new Set();
22
22
  // In-memory cache of repo status to avoid flooding the DB read queue
23
23
  const repoStatusCache = new Map();
24
24
  // Set by startIndexer
@@ -27,6 +27,7 @@ let indexerSignalCollections;
27
27
  let indexerPinnedRepos = null;
28
28
  let indexerFetchTimeout;
29
29
  let indexerMaxRetries;
30
+ let maxConcurrentBackfills = 3;
30
31
  async function flushBuffer() {
31
32
  if (buffer.length === 0)
32
33
  return;
@@ -113,6 +114,16 @@ function bufferWrite(item) {
113
114
  export async function triggerAutoBackfill(did, attempt = 0) {
114
115
  if (backfillInFlight.has(did))
115
116
  return;
117
+ if (backfillInFlight.size >= maxConcurrentBackfills) {
118
+ if (!pendingReschedule.has(did)) {
119
+ pendingReschedule.add(did);
120
+ setTimeout(() => {
121
+ pendingReschedule.delete(did);
122
+ triggerAutoBackfill(did, attempt);
123
+ }, 10_000);
124
+ }
125
+ return;
126
+ }
116
127
  backfillInFlight.add(did);
117
128
  pendingBuffers.set(did, []);
118
129
  if (attempt === 0)
@@ -193,6 +204,7 @@ export async function startIndexer(opts) {
193
204
  indexerPinnedRepos = opts.pinnedRepos || null;
194
205
  indexerFetchTimeout = fetchTimeout;
195
206
  indexerMaxRetries = opts.maxRetries;
207
+ maxConcurrentBackfills = opts.parallelism ?? 3;
196
208
  // Pre-populate repo status cache from DB so non-signal updates
197
209
  // (e.g. profile changes) are processed for already-tracked DIDs
198
210
  if (repoStatusCache.size === 0) {
@@ -264,7 +276,7 @@ function processMessage(bytes, collections) {
264
276
  repoStatusCache.set(did, 'unknown');
265
277
  }
266
278
  if (hasSignalOp && (!indexerPinnedRepos || indexerPinnedRepos.has(did))) {
267
- if (repoStatus === null && backfillInFlight.size < MAX_CONCURRENT_BACKFILLS) {
279
+ if (repoStatus === null && backfillInFlight.size < maxConcurrentBackfills) {
268
280
  repoStatusCache.set(did, 'pending');
269
281
  triggerAutoBackfill(did);
270
282
  }
package/dist/main.js CHANGED
@@ -127,6 +127,7 @@ startIndexer({
127
127
  cursor,
128
128
  fetchTimeout: config.backfill.fetchTimeout,
129
129
  maxRetries: config.backfill.maxRetries,
130
+ parallelism: config.backfill.parallelism,
130
131
  ftsRebuildInterval: config.ftsRebuildInterval,
131
132
  });
132
133
  // 7. Run backfill in background
package/dist/mst.d.ts CHANGED
@@ -2,5 +2,7 @@ export interface MstEntry {
2
2
  path: string;
3
3
  cid: string;
4
4
  }
5
- export declare function walkMst(blocks: Map<string, Uint8Array>, rootCid: string): MstEntry[];
5
+ export declare function walkMst(blocks: {
6
+ get(cid: string): Uint8Array | undefined;
7
+ }, rootCid: string): Generator<MstEntry>;
6
8
  //# sourceMappingURL=mst.d.ts.map
package/dist/mst.d.ts.map CHANGED
@@ -1 +1 @@
1
- {"version":3,"file":"mst.d.ts","sourceRoot":"","sources":["../src/mst.ts"],"names":[],"mappings":"AAEA,MAAM,WAAW,QAAQ;IACvB,IAAI,EAAE,MAAM,CAAA;IACZ,GAAG,EAAE,MAAM,CAAA;CACZ;AAED,wBAAgB,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,MAAM,EAAE,UAAU,CAAC,EAAE,OAAO,EAAE,MAAM,GAAG,QAAQ,EAAE,CAiCpF"}
1
+ {"version":3,"file":"mst.d.ts","sourceRoot":"","sources":["../src/mst.ts"],"names":[],"mappings":"AAEA,MAAM,WAAW,QAAQ;IACvB,IAAI,EAAE,MAAM,CAAA;IACZ,GAAG,EAAE,MAAM,CAAA;CACZ;AAED,wBAAiB,OAAO,CAAC,MAAM,EAAE;IAAE,GAAG,CAAC,GAAG,EAAE,MAAM,GAAG,UAAU,GAAG,SAAS,CAAA;CAAE,EAAE,OAAO,EAAE,MAAM,GAAG,SAAS,CAAC,QAAQ,CAAC,CA8BnH"}
package/dist/mst.js CHANGED
@@ -1,14 +1,13 @@
1
1
  import { cborDecode } from "./cbor.js";
2
- export function walkMst(blocks, rootCid) {
3
- const entries = [];
4
- function visit(cid, prefix) {
2
+ export function* walkMst(blocks, rootCid) {
3
+ function* visit(cid, prefix) {
5
4
  const data = blocks.get(cid);
6
5
  if (!data)
7
6
  return prefix;
8
7
  const { value: node } = cborDecode(data);
9
8
  // Visit left subtree
10
9
  if (node.l?.$link)
11
- visit(node.l.$link, prefix);
10
+ yield* visit(node.l.$link, prefix);
12
11
  let lastKey = prefix;
13
12
  for (const entry of node.e || []) {
14
13
  const keySuffix = entry.k instanceof Uint8Array ? new TextDecoder().decode(entry.k) : entry.k;
@@ -16,15 +15,14 @@ export function walkMst(blocks, rootCid) {
16
15
  const fullKey = lastKey.substring(0, prefixLen) + keySuffix;
17
16
  lastKey = fullKey;
18
17
  if (entry.v?.$link) {
19
- entries.push({ path: fullKey, cid: entry.v.$link });
18
+ yield { path: fullKey, cid: entry.v.$link };
20
19
  }
21
20
  // Visit right subtree
22
21
  if (entry.t?.$link) {
23
- visit(entry.t.$link, lastKey);
22
+ yield* visit(entry.t.$link, lastKey);
24
23
  }
25
24
  }
26
25
  return lastKey;
27
26
  }
28
- visit(rootCid, '');
29
- return entries;
27
+ yield* visit(rootCid, '');
30
28
  }
@@ -1 +1 @@
1
- {"version":3,"file":"server.d.ts","sourceRoot":"","sources":["../src/server.ts"],"names":[],"mappings":"AAAA,OAAO,EAAgB,KAAK,MAAM,EAAE,KAAK,eAAe,EAAE,MAAM,WAAW,CAAA;AAiD3E,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,aAAa,CAAA;AA2B9C,wBAAgB,WAAW,CACzB,IAAI,EAAE,MAAM,EACZ,WAAW,EAAE,MAAM,EAAE,EACrB,SAAS,EAAE,MAAM,GAAG,IAAI,EACxB,KAAK,EAAE,WAAW,GAAG,IAAI,EACzB,MAAM,GAAE,MAAM,EAAO,EACrB,aAAa,CAAC,EAAE,CAAC,GAAG,EAAE,eAAe,KAAK;IAAE,GAAG,EAAE,MAAM,CAAA;CAAE,GAAG,IAAI,GAC/D,MAAM,CAm7BR"}
1
+ {"version":3,"file":"server.d.ts","sourceRoot":"","sources":["../src/server.ts"],"names":[],"mappings":"AAAA,OAAO,EAAgB,KAAK,MAAM,EAAE,KAAK,eAAe,EAAE,MAAM,WAAW,CAAA;AAmD3E,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,aAAa,CAAA;AA2B9C,wBAAgB,WAAW,CACzB,IAAI,EAAE,MAAM,EACZ,WAAW,EAAE,MAAM,EAAE,EACrB,SAAS,EAAE,MAAM,GAAG,IAAI,EACxB,KAAK,EAAE,WAAW,GAAG,IAAI,EACzB,MAAM,GAAE,MAAM,EAAO,EACrB,aAAa,CAAC,EAAE,CAAC,GAAG,EAAE,eAAe,KAAK;IAAE,GAAG,EAAE,MAAM,CAAA;CAAE,GAAG,IAAI,GAC/D,MAAM,CA28BR"}
package/dist/server.js CHANGED
@@ -1,4 +1,6 @@
1
1
  import { createServer } from 'node:http';
2
+ import { gzipSync } from 'node:zlib';
3
+ import { existsSync } from 'node:fs';
2
4
  import { readFile } from 'node:fs/promises';
3
5
  import { join, extname } from 'node:path';
4
6
  import { queryRecords, getRecordByUri, searchRecords, getSchema, reshapeRow, setRepoStatus, getRepoStatus, getRepoRetryInfo, querySQL, insertRecord, deleteRecord, queryLabelsForUris, insertLabels, searchAccounts, listReposPaginated, getCollectionCounts, normalizeValue, getSchemaDump, getPreferences, putPreference, } from "./db.js";
@@ -39,12 +41,13 @@ function readBodyRaw(req) {
39
41
  }
40
42
  export function startServer(port, collections, publicDir, oauth, admins = [], resolveViewer) {
41
43
  const coreXrpc = (method) => `/xrpc/dev.hatk.${method}`;
44
+ const devMode = process.env.DEV_MODE === '1';
42
45
  function requireAdmin(viewer, res) {
43
46
  if (!viewer) {
44
47
  jsonError(res, 401, 'Authentication required');
45
48
  return false;
46
49
  }
47
- if (!admins.includes(viewer.did)) {
50
+ if (!devMode && !admins.includes(viewer.did)) {
48
51
  jsonError(res, 403, 'Admin access required');
49
52
  return false;
50
53
  }
@@ -478,7 +481,14 @@ export function startServer(port, collections, publicDir, oauth, admins = [], re
478
481
  const sizeRows = await querySQL(`SELECT database_size, memory_usage, memory_limit FROM pragma_database_size()`);
479
482
  const dbInfo = sizeRows[0] ?? {};
480
483
  const collectionCounts = await getCollectionCounts();
481
- jsonResponse(res, { repos: counts, duckdb: dbInfo, collections: collectionCounts });
484
+ const mem = process.memoryUsage();
485
+ const node = {
486
+ rss: `${(mem.rss / 1024 / 1024).toFixed(1)} MiB`,
487
+ heapUsed: `${(mem.heapUsed / 1024 / 1024).toFixed(1)} MiB`,
488
+ heapTotal: `${(mem.heapTotal / 1024 / 1024).toFixed(1)} MiB`,
489
+ external: `${(mem.external / 1024 / 1024).toFixed(1)} MiB`,
490
+ };
491
+ jsonResponse(res, { repos: counts, duckdb: dbInfo, node, collections: collectionCounts });
482
492
  return;
483
493
  }
484
494
  // GET /admin/info/:did — repo status info
@@ -864,6 +874,21 @@ export function startServer(port, collections, publicDir, oauth, admins = [], re
864
874
  throw err;
865
875
  }
866
876
  }
877
+ // GET /robots.txt — serve from user's public dir or fall back to hatk default
878
+ if (url.pathname === '/robots.txt') {
879
+ const userRobots = publicDir ? join(publicDir, 'robots.txt') : null;
880
+ const defaultRobots = join(import.meta.dirname, '../public/robots.txt');
881
+ const robotsPath = userRobots && existsSync(userRobots) ? userRobots : defaultRobots;
882
+ try {
883
+ const content = await readFile(robotsPath);
884
+ res.writeHead(200, { 'Content-Type': 'text/plain' });
885
+ res.end(content);
886
+ return;
887
+ }
888
+ catch {
889
+ // fall through
890
+ }
891
+ }
867
892
  // Static file serving
868
893
  if (publicDir) {
869
894
  try {
@@ -912,15 +937,33 @@ export function startServer(port, collections, publicDir, oauth, admins = [], re
912
937
  server.listen(port, () => log(`[server] ${oauth?.issuer || `http://localhost:${port}`}`));
913
938
  return server;
914
939
  }
940
+ function sendJson(res, status, body) {
941
+ const acceptEncoding = res.req?.headers['accept-encoding'] || '';
942
+ if (body.length > 1024 && /\bgzip\b/.test(acceptEncoding)) {
943
+ const compressed = gzipSync(body);
944
+ res.writeHead(status, {
945
+ 'Content-Type': 'application/json',
946
+ 'Content-Encoding': 'gzip',
947
+ Vary: 'Accept-Encoding',
948
+ ...(status === 200 ? { 'Cache-Control': 'no-store' } : {}),
949
+ });
950
+ res.end(compressed);
951
+ }
952
+ else {
953
+ res.writeHead(status, {
954
+ 'Content-Type': 'application/json',
955
+ ...(status === 200 ? { 'Cache-Control': 'no-store' } : {}),
956
+ });
957
+ res.end(body);
958
+ }
959
+ }
915
960
  function jsonResponse(res, data) {
916
- res.writeHead(200, { 'Content-Type': 'application/json', 'Cache-Control': 'no-store' });
917
- res.end(JSON.stringify(data, (_, v) => normalizeValue(v)));
961
+ sendJson(res, 200, Buffer.from(JSON.stringify(data, (_, v) => normalizeValue(v))));
918
962
  }
919
963
  function jsonError(res, status, message) {
920
964
  if (res.headersSent)
921
965
  return;
922
- res.writeHead(status, { 'Content-Type': 'application/json' });
923
- res.end(JSON.stringify({ error: message }));
966
+ sendJson(res, status, Buffer.from(JSON.stringify({ error: message })));
924
967
  }
925
968
  /** Proxy a request to the user's PDS with DPoP + automatic nonce retry + token refresh. */
926
969
  async function proxyToPds(oauthConfig, session, method, pdsUrl, body) {
@@ -1 +1 @@
1
- {"version":3,"file":"vite-plugin.d.ts","sourceRoot":"","sources":["../src/vite-plugin.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,MAAM,CAAA;AAKlC,wBAAgB,IAAI,CAAC,IAAI,CAAC,EAAE;IAAE,IAAI,CAAC,EAAE,MAAM,CAAA;CAAE,GAAG,MAAM,CAsFrD"}
1
+ {"version":3,"file":"vite-plugin.d.ts","sourceRoot":"","sources":["../src/vite-plugin.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,MAAM,CAAA;AAKlC,wBAAgB,IAAI,CAAC,IAAI,CAAC,EAAE;IAAE,IAAI,CAAC,EAAE,MAAM,CAAA;CAAE,GAAG,MAAM,CAuFrD"}
@@ -61,7 +61,7 @@ export function hatk(opts) {
61
61
  };
62
62
  },
63
63
  configureServer(server) {
64
- const mainPath = resolve(import.meta.dirname, 'main.ts');
64
+ const mainPath = resolve(import.meta.dirname, 'main.js');
65
65
  const watchDirs = ['xrpc', 'feeds', 'labels', 'jobs', 'setup', 'lexicons'].filter((d) => existsSync(d));
66
66
  const watchArgs = watchDirs.flatMap((d) => ['--watch-path', d]);
67
67
  serverProcess = spawn('npx', ['tsx', 'watch', ...watchArgs, mainPath, 'config.yaml'], {
@@ -71,6 +71,7 @@ export function hatk(opts) {
71
71
  ...process.env,
72
72
  PORT: String(backendPort),
73
73
  OAUTH_ISSUER: process.env.OAUTH_ISSUER || issuer,
74
+ DEV_MODE: '1',
74
75
  },
75
76
  });
76
77
  server.httpServer?.on('close', () => {
package/package.json CHANGED
@@ -1,9 +1,15 @@
1
1
  {
2
2
  "name": "@hatk/hatk",
3
- "version": "0.0.1-alpha.0",
3
+ "version": "0.0.1-alpha.10",
4
+ "license": "MIT",
4
5
  "bin": {
5
6
  "hatk": "dist/cli.js"
6
7
  },
8
+ "files": [
9
+ "dist",
10
+ "fonts",
11
+ "public"
12
+ ],
7
13
  "type": "module",
8
14
  "exports": {
9
15
  "./feeds": "./dist/feeds.js",
@@ -19,7 +25,6 @@
19
25
  "./test/browser": "./dist/test-browser.js",
20
26
  "./vite-plugin": "./dist/vite-plugin.js"
21
27
  },
22
- "files": ["dist", "fonts", "public"],
23
28
  "scripts": {
24
29
  "build": "tsc -p tsconfig.build.json",
25
30
  "prepublishOnly": "npm run build"
@@ -0,0 +1,2 @@
1
+ User-agent: *
2
+ Allow: /