@hatk/hatk 0.0.1-alpha.5 → 0.0.1-alpha.51

Files changed (168)
  1. package/dist/adapter.d.ts +19 -0
  2. package/dist/adapter.d.ts.map +1 -0
  3. package/dist/adapter.js +107 -0
  4. package/dist/backfill.d.ts +60 -1
  5. package/dist/backfill.d.ts.map +1 -1
  6. package/dist/backfill.js +167 -33
  7. package/dist/car.d.ts +59 -1
  8. package/dist/car.d.ts.map +1 -1
  9. package/dist/car.js +179 -7
  10. package/dist/cbor.d.ts +37 -0
  11. package/dist/cbor.d.ts.map +1 -1
  12. package/dist/cbor.js +36 -3
  13. package/dist/cid.d.ts +37 -0
  14. package/dist/cid.d.ts.map +1 -1
  15. package/dist/cid.js +38 -3
  16. package/dist/cli.js +243 -996
  17. package/dist/config.d.ts +24 -1
  18. package/dist/config.d.ts.map +1 -1
  19. package/dist/config.js +37 -9
  20. package/dist/database/adapter-factory.d.ts +6 -0
  21. package/dist/database/adapter-factory.d.ts.map +1 -0
  22. package/dist/database/adapter-factory.js +20 -0
  23. package/dist/database/adapters/duckdb-search.d.ts +12 -0
  24. package/dist/database/adapters/duckdb-search.d.ts.map +1 -0
  25. package/dist/database/adapters/duckdb-search.js +27 -0
  26. package/dist/database/adapters/duckdb.d.ts +25 -0
  27. package/dist/database/adapters/duckdb.d.ts.map +1 -0
  28. package/dist/database/adapters/duckdb.js +161 -0
  29. package/dist/database/adapters/sqlite-search.d.ts +23 -0
  30. package/dist/database/adapters/sqlite-search.d.ts.map +1 -0
  31. package/dist/database/adapters/sqlite-search.js +74 -0
  32. package/dist/database/adapters/sqlite.d.ts +18 -0
  33. package/dist/database/adapters/sqlite.d.ts.map +1 -0
  34. package/dist/database/adapters/sqlite.js +88 -0
  35. package/dist/{db.d.ts → database/db.d.ts} +56 -6
  36. package/dist/database/db.d.ts.map +1 -0
  37. package/dist/{db.js → database/db.js} +727 -549
  38. package/dist/database/dialect.d.ts +45 -0
  39. package/dist/database/dialect.d.ts.map +1 -0
  40. package/dist/database/dialect.js +72 -0
  41. package/dist/{fts.d.ts → database/fts.d.ts} +7 -0
  42. package/dist/database/fts.d.ts.map +1 -0
  43. package/dist/{fts.js → database/fts.js} +116 -32
  44. package/dist/database/index.d.ts +7 -0
  45. package/dist/database/index.d.ts.map +1 -0
  46. package/dist/database/index.js +6 -0
  47. package/dist/database/ports.d.ts +50 -0
  48. package/dist/database/ports.d.ts.map +1 -0
  49. package/dist/database/ports.js +1 -0
  50. package/dist/{schema.d.ts → database/schema.d.ts} +14 -3
  51. package/dist/database/schema.d.ts.map +1 -0
  52. package/dist/{schema.js → database/schema.js} +81 -41
  53. package/dist/dev-entry.d.ts +8 -0
  54. package/dist/dev-entry.d.ts.map +1 -0
  55. package/dist/dev-entry.js +111 -0
  56. package/dist/feeds.d.ts +12 -8
  57. package/dist/feeds.d.ts.map +1 -1
  58. package/dist/feeds.js +45 -6
  59. package/dist/hooks.d.ts +85 -0
  60. package/dist/hooks.d.ts.map +1 -0
  61. package/dist/hooks.js +161 -0
  62. package/dist/hydrate.d.ts +6 -5
  63. package/dist/hydrate.d.ts.map +1 -1
  64. package/dist/hydrate.js +4 -16
  65. package/dist/indexer.d.ts +22 -0
  66. package/dist/indexer.d.ts.map +1 -1
  67. package/dist/indexer.js +96 -8
  68. package/dist/labels.d.ts +36 -0
  69. package/dist/labels.d.ts.map +1 -1
  70. package/dist/labels.js +71 -6
  71. package/dist/lexicon-resolve.d.ts.map +1 -1
  72. package/dist/lexicon-resolve.js +27 -112
  73. package/dist/lexicons/com/atproto/label/defs.json +75 -0
  74. package/dist/lexicons/com/atproto/moderation/defs.json +30 -0
  75. package/dist/lexicons/com/atproto/repo/strongRef.json +24 -0
  76. package/dist/lexicons/dev/hatk/createRecord.json +40 -0
  77. package/dist/lexicons/dev/hatk/createReport.json +48 -0
  78. package/dist/lexicons/dev/hatk/deleteRecord.json +25 -0
  79. package/dist/lexicons/dev/hatk/describeCollections.json +41 -0
  80. package/dist/lexicons/dev/hatk/describeFeeds.json +29 -0
  81. package/dist/lexicons/dev/hatk/describeLabels.json +45 -0
  82. package/dist/lexicons/dev/hatk/getFeed.json +30 -0
  83. package/dist/lexicons/dev/hatk/getPreferences.json +19 -0
  84. package/dist/lexicons/dev/hatk/getRecord.json +26 -0
  85. package/dist/lexicons/dev/hatk/getRecords.json +32 -0
  86. package/dist/lexicons/dev/hatk/putPreference.json +28 -0
  87. package/dist/lexicons/dev/hatk/putRecord.json +41 -0
  88. package/dist/lexicons/dev/hatk/searchRecords.json +32 -0
  89. package/dist/lexicons/dev/hatk/uploadBlob.json +23 -0
  90. package/dist/logger.d.ts +29 -0
  91. package/dist/logger.d.ts.map +1 -1
  92. package/dist/logger.js +29 -0
  93. package/dist/main.js +136 -67
  94. package/dist/mst.d.ts +18 -1
  95. package/dist/mst.d.ts.map +1 -1
  96. package/dist/mst.js +19 -8
  97. package/dist/oauth/db.d.ts +3 -1
  98. package/dist/oauth/db.d.ts.map +1 -1
  99. package/dist/oauth/db.js +48 -19
  100. package/dist/oauth/server.d.ts +24 -0
  101. package/dist/oauth/server.d.ts.map +1 -1
  102. package/dist/oauth/server.js +198 -22
  103. package/dist/oauth/session.d.ts +11 -0
  104. package/dist/oauth/session.d.ts.map +1 -0
  105. package/dist/oauth/session.js +65 -0
  106. package/dist/opengraph.d.ts +10 -0
  107. package/dist/opengraph.d.ts.map +1 -1
  108. package/dist/opengraph.js +73 -39
  109. package/dist/pds-proxy.d.ts +42 -0
  110. package/dist/pds-proxy.d.ts.map +1 -0
  111. package/dist/pds-proxy.js +207 -0
  112. package/dist/push.d.ts +33 -0
  113. package/dist/push.d.ts.map +1 -0
  114. package/dist/push.js +166 -0
  115. package/dist/renderer.d.ts +27 -0
  116. package/dist/renderer.d.ts.map +1 -0
  117. package/dist/renderer.js +46 -0
  118. package/dist/resolve-hatk.d.ts +6 -0
  119. package/dist/resolve-hatk.d.ts.map +1 -0
  120. package/dist/resolve-hatk.js +20 -0
  121. package/dist/response.d.ts +16 -0
  122. package/dist/response.d.ts.map +1 -0
  123. package/dist/response.js +69 -0
  124. package/dist/scanner.d.ts +21 -0
  125. package/dist/scanner.d.ts.map +1 -0
  126. package/dist/scanner.js +88 -0
  127. package/dist/seed.d.ts +19 -0
  128. package/dist/seed.d.ts.map +1 -1
  129. package/dist/seed.js +43 -4
  130. package/dist/server-init.d.ts +8 -0
  131. package/dist/server-init.d.ts.map +1 -0
  132. package/dist/server-init.js +62 -0
  133. package/dist/server.d.ts +26 -3
  134. package/dist/server.d.ts.map +1 -1
  135. package/dist/server.js +624 -635
  136. package/dist/setup.d.ts +28 -1
  137. package/dist/setup.d.ts.map +1 -1
  138. package/dist/setup.js +50 -3
  139. package/dist/templates/feed.tpl +14 -0
  140. package/dist/templates/hook.tpl +5 -0
  141. package/dist/templates/label.tpl +15 -0
  142. package/dist/templates/og.tpl +17 -0
  143. package/dist/templates/seed.tpl +11 -0
  144. package/dist/templates/setup.tpl +5 -0
  145. package/dist/templates/test-feed.tpl +19 -0
  146. package/dist/templates/test-xrpc.tpl +19 -0
  147. package/dist/templates/xrpc.tpl +41 -0
  148. package/dist/test.d.ts +1 -1
  149. package/dist/test.d.ts.map +1 -1
  150. package/dist/test.js +38 -32
  151. package/dist/views.js +1 -1
  152. package/dist/vite-plugin.d.ts +1 -1
  153. package/dist/vite-plugin.d.ts.map +1 -1
  154. package/dist/vite-plugin.js +254 -66
  155. package/dist/xrpc.d.ts +60 -10
  156. package/dist/xrpc.d.ts.map +1 -1
  157. package/dist/xrpc.js +155 -39
  158. package/package.json +15 -7
  159. package/public/admin.html +133 -54
  160. package/dist/db.d.ts.map +0 -1
  161. package/dist/fts.d.ts.map +0 -1
  162. package/dist/oauth/hooks.d.ts +0 -10
  163. package/dist/oauth/hooks.d.ts.map +0 -1
  164. package/dist/oauth/hooks.js +0 -40
  165. package/dist/schema.d.ts.map +0 -1
  166. package/dist/test-browser.d.ts +0 -14
  167. package/dist/test-browser.d.ts.map +0 -1
  168. package/dist/test-browser.js +0 -26
package/dist/adapter.d.ts ADDED
@@ -0,0 +1,19 @@
+ import { type IncomingMessage, type ServerResponse } from 'node:http';
+ /**
+  * Convert a Node.js IncomingMessage to a Web Standard Request.
+  */
+ export declare function toRequest(req: IncomingMessage, base: string): Request;
+ /**
+  * Pipe a Web Standard Response back to a Node.js ServerResponse.
+  */
+ export declare function sendResponse(res: ServerResponse, response: Response): Promise<void>;
+ /** Routes handled by hatk — everything else can fall through to a framework handler. */
+ export declare const HATK_ROUTES: string[];
+ export declare function isHatkRoute(pathname: string): boolean;
+ /**
+  * Create a Node.js HTTP server from a Web Standard fetch handler.
+  * If a fallback Node middleware is provided, non-hatk routes are sent to it
+  * (e.g. SvelteKit's handler from build/handler.js).
+  */
+ export declare function serve(handler: (request: Request) => Promise<Response>, port: number, base?: string, fallback?: (req: IncomingMessage, res: ServerResponse, next: () => void) => void): import("node:http").Server<typeof IncomingMessage, typeof ServerResponse>;
+ //# sourceMappingURL=adapter.d.ts.map
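The `serve` declaration above is the framework co-hosting entry point: hatk owns the routes in `HATK_ROUTES` and everything else drops to the fallback. A minimal usage sketch; the deep-import path and the `build/handler.js` location are assumptions (the latter borrowed from the doc comment), not documented entry points:

```ts
// Usage sketch only: import paths here are assumptions, not documented entry points.
import { serve } from '@hatk/hatk/dist/adapter.js'
// SvelteKit's generated Node handler (untyped JS), per the doc comment above
import { handler as svelteKit } from './build/handler.js'

const server = serve(
  async (request) => new Response(`hatk handled ${new URL(request.url).pathname}`),
  3000,
  'http://localhost:3000',
  svelteKit, // any route not matched by isHatkRoute() falls through to SvelteKit
)
server.on('listening', () => console.log('listening on :3000'))
```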
package/dist/adapter.d.ts.map ADDED
@@ -0,0 +1 @@
+ {"version":3,"file":"adapter.d.ts","sourceRoot":"","sources":["../src/adapter.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,KAAK,eAAe,EAAE,KAAK,cAAc,EAAgB,MAAM,WAAW,CAAA;AAEnF;;GAEG;AACH,wBAAgB,SAAS,CAAC,GAAG,EAAE,eAAe,EAAE,IAAI,EAAE,MAAM,GAAG,OAAO,CA0BrE;AAED;;GAEG;AACH,wBAAsB,YAAY,CAAC,GAAG,EAAE,cAAc,EAAE,QAAQ,EAAE,QAAQ,GAAG,OAAO,CAAC,IAAI,CAAC,CAuBzF;AAED,wFAAwF;AACxF,eAAO,MAAM,WAAW,UAavB,CAAA;AAED,wBAAgB,WAAW,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAErD;AAED;;;;GAIG;AACH,wBAAgB,KAAK,CACnB,OAAO,EAAE,CAAC,OAAO,EAAE,OAAO,KAAK,OAAO,CAAC,QAAQ,CAAC,EAChD,IAAI,EAAE,MAAM,EACZ,IAAI,CAAC,EAAE,MAAM,EACb,QAAQ,CAAC,EAAE,CAAC,GAAG,EAAE,eAAe,EAAE,GAAG,EAAE,cAAc,EAAE,IAAI,EAAE,MAAM,IAAI,KAAK,IAAI,6EA4BjF"}
package/dist/adapter.js ADDED
@@ -0,0 +1,107 @@
+ import { createServer } from 'node:http';
+ /**
+  * Convert a Node.js IncomingMessage to a Web Standard Request.
+  */
+ export function toRequest(req, base) {
+     const url = new URL(req.url, base);
+     const headers = new Headers();
+     for (const [key, value] of Object.entries(req.headers)) {
+         if (value) {
+             if (Array.isArray(value)) {
+                 for (const v of value)
+                     headers.append(key, v);
+             }
+             else {
+                 headers.set(key, value);
+             }
+         }
+     }
+     const init = {
+         method: req.method,
+         headers,
+     };
+     // GET and HEAD requests cannot have a body
+     if (req.method !== 'GET' && req.method !== 'HEAD') {
+         // @ts-expect-error — Node.js streams are valid body sources
+         init.body = req;
+         init.duplex = 'half';
+     }
+     return new Request(url.href, init);
+ }
+ /**
+  * Pipe a Web Standard Response back to a Node.js ServerResponse.
+  */
+ export async function sendResponse(res, response) {
+     const rawHeaders = [];
+     response.headers.forEach((value, name) => {
+         rawHeaders.push(name, value);
+     });
+     res.writeHead(response.status, rawHeaders);
+     if (!response.body) {
+         res.end();
+         return;
+     }
+     const reader = response.body.getReader();
+     try {
+         while (true) {
+             const { done, value } = await reader.read();
+             if (done)
+                 break;
+             res.write(value);
+         }
+     }
+     finally {
+         reader.releaseLock();
+         res.end();
+     }
+ }
+ /** Routes handled by hatk — everything else can fall through to a framework handler. */
+ export const HATK_ROUTES = [
+     '/xrpc/',
+     '/oauth/',
+     '/oauth-client-metadata.json',
+     '/.well-known/',
+     '/og/',
+     '/admin',
+     '/repos',
+     '/info/',
+     '/_health',
+     '/robots.txt',
+     '/auth/logout',
+     '/__dev/',
+ ];
+ export function isHatkRoute(pathname) {
+     return HATK_ROUTES.some((r) => pathname.startsWith(r) || pathname === r);
+ }
+ /**
+  * Create a Node.js HTTP server from a Web Standard fetch handler.
+  * If a fallback Node middleware is provided, non-hatk routes are sent to it
+  * (e.g. SvelteKit's handler from build/handler.js).
+  */
+ export function serve(handler, port, base, fallback) {
+     const origin = base || `http://localhost:${port}`;
+     const server = createServer(async (req, res) => {
+         try {
+             const url = new URL(req.url, origin);
+             // If we have a fallback (e.g. SvelteKit) and this isn't a hatk route, skip hatk
+             if (fallback && !isHatkRoute(url.pathname)) {
+                 fallback(req, res, () => {
+                     res.writeHead(404);
+                     res.end('Not found');
+                 });
+                 return;
+             }
+             const request = toRequest(req, origin);
+             const response = await handler(request);
+             await sendResponse(res, response);
+         }
+         catch (err) {
+             if (!res.headersSent) {
+                 res.writeHead(500, { 'Content-Type': 'application/json' });
+             }
+             res.end(JSON.stringify({ error: err.message }));
+         }
+     });
+     server.listen(port);
+     return server;
+ }
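One implementation detail worth noting: `toRequest` forwards the Node request stream as the body, and Node's `fetch` implementation (undici) only accepts a stream body on a `Request` when `duplex: 'half'` is set, which is why the adapter adds it. A standalone illustration of that requirement:

```ts
// Node 18+: a Request built from a ReadableStream must declare duplex: 'half'.
const body = new ReadableStream<Uint8Array>({
  start(controller) {
    controller.enqueue(new TextEncoder().encode('{"ok":true}'))
    controller.close()
  },
})
const req = new Request('http://localhost/echo', {
  method: 'POST',
  body,
  duplex: 'half', // omitting this makes the constructor throw in Node
} as RequestInit & { duplex: 'half' }) // the DOM lib types don't know `duplex` yet
console.log(await new Response(req.body).text()) // {"ok":true}
```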
package/dist/backfill.d.ts CHANGED
@@ -1,11 +1,70 @@
  import type { BackfillConfig } from './config.ts';
+ /** Options passed to {@link runBackfill}. */
  interface BackfillOpts {
+     /** Base URL of the relay or PDS to enumerate repos from (e.g. `wss://bsky.network`). */
      pdsUrl: string;
+     /** PLC directory URL used to resolve `did:plc` identifiers (e.g. `https://plc.directory`). */
      plcUrl: string;
+     /** AT Protocol collection NSIDs to index (e.g. `app.bsky.feed.post`). */
      collections: Set<string>;
+     /** Backfill behavior settings from `hatk.config.ts`. */
      config: BackfillConfig;
  }
+ /**
+  * Downloads and indexes a single user's repo via `com.atproto.sync.getRepo`.
+  *
+  * The full flow:
+  * 1. Resolve the DID to find the user's PDS endpoint
+  * 2. Fetch the repo as a CAR file from the PDS
+  * 3. Parse the CAR, decode the commit, and walk the MST (Merkle Search Tree)
+  * 4. Delete any existing records for this DID (so deletions are reflected)
+  * 5. Bulk-insert all records matching the target collections
+  *
+  * On failure, applies exponential backoff retry logic. HTTP 4xx errors are
+  * treated as permanent failures (repo doesn't exist or is deactivated) and
+  * are not retried.
+  *
+  * @param did - The DID of the repo to backfill (e.g. `did:plc:abc123`)
+  * @param collections - Collection NSIDs to index; records in other collections are skipped
+  * @param fetchTimeout - Maximum seconds to wait for the CAR download before aborting
+  * @returns The number of records successfully indexed
+  *
+  * @example
+  * ```ts
+  * const count = await backfillRepo('did:plc:abc123', new Set(['app.bsky.feed.post']), 30)
+  * console.log(`Indexed ${count} records`)
+  * ```
+  */
  export declare function backfillRepo(did: string, collections: Set<string>, fetchTimeout: number): Promise<number>;
- export declare function runBackfill(opts: BackfillOpts): Promise<void>;
+ /**
+  * Orchestrates a full backfill run: enumerate repos, filter to pending, download, and index.
+  *
+  * Operates in one of three modes based on config:
+  * - **Pinned repos** — backfill only the DIDs listed in `config.repos`
+  * - **Full network** — enumerate every active repo on the relay via `listRepos`
+  * - **Collection signal** (default) — use `listReposByCollection` to discover repos that
+  *   contain records in the configured signal collections, falling back to `listRepos`
+  *   if the relay doesn't support collection-scoped enumeration
+  *
+  * After the initial pass, failed repos are retried with exponential backoff
+  * (up to `config.maxRetries` attempts). The run emits structured log events for
+  * monitoring via the `backfill.run` and `backfill.retry_round` event types.
+  *
+  * @example
+  * ```ts
+  * await runBackfill({
+  *   pdsUrl: 'wss://bsky.network',
+  *   plcUrl: 'https://plc.directory',
+  *   collections: new Set(['xyz.statusphere.status']),
+  *   config: {
+  *     fullNetwork: false,
+  *     parallelism: 10,
+  *     fetchTimeout: 30,
+  *     maxRetries: 5,
+  *   },
+  * })
+  * ```
+  */
+ export declare function runBackfill(opts: BackfillOpts): Promise<number>;
  export {};
  //# sourceMappingURL=backfill.d.ts.map
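The declarations mention exponential backoff but not the schedule itself. A purely illustrative sketch of how a `retry_after` timestamp could be derived; the base delay and cap here are invented, not the package's constants (the real values live in `backfill.js`, and `_repos.retry_after` is compared as epoch seconds in the SQL shown later):

```ts
// Illustrative only: baseSeconds and capSeconds are made-up numbers.
function nextRetryAfter(retryCount: number, baseSeconds = 60, capSeconds = 3600): number {
  const delaySeconds = Math.min(baseSeconds * 2 ** retryCount, capSeconds)
  // epoch seconds, matching how _repos.retry_after is queried
  return Math.floor(Date.now() / 1000) + delaySeconds
}
```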
package/dist/backfill.d.ts.map CHANGED
@@ -1 +1 @@
- {"version":3,"file":"backfill.d.ts","sourceRoot":"","sources":["../src/backfill.ts"],"names":[],"mappings":"AAgBA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAA;AAEjD,UAAU,YAAY;IACpB,MAAM,EAAE,MAAM,CAAA;IACd,MAAM,EAAE,MAAM,CAAA;IACd,WAAW,EAAE,GAAG,CAAC,MAAM,CAAC,CAAA;IACxB,MAAM,EAAE,cAAc,CAAA;CACvB;AA+ED,wBAAsB,YAAY,CAAC,GAAG,EAAE,MAAM,EAAE,WAAW,EAAE,GAAG,CAAC,MAAM,CAAC,EAAE,YAAY,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAiH/G;AAwBD,wBAAsB,WAAW,CAAC,IAAI,EAAE,YAAY,GAAG,OAAO,CAAC,IAAI,CAAC,CAiInE"}
+ {"version":3,"file":"backfill.d.ts","sourceRoot":"","sources":["../src/backfill.ts"],"names":[],"mappings":"AAiBA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAA;AAIjD,6CAA6C;AAC7C,UAAU,YAAY;IACpB,wFAAwF;IACxF,MAAM,EAAE,MAAM,CAAA;IACd,8FAA8F;IAC9F,MAAM,EAAE,MAAM,CAAA;IACd,yEAAyE;IACzE,WAAW,EAAE,GAAG,CAAC,MAAM,CAAC,CAAA;IACxB,wDAAwD;IACxD,MAAM,EAAE,cAAc,CAAA;CACvB;AAoGD;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AACH,wBAAsB,YAAY,CAAC,GAAG,EAAE,MAAM,EAAE,WAAW,EAAE,GAAG,CAAC,MAAM,CAAC,EAAE,YAAY,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAkK/G;AA8BD;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA4BG;AACH,wBAAsB,WAAW,CAAC,IAAI,EAAE,YAAY,GAAG,OAAO,CAAC,MAAM,CAAC,CAkIrE"}
package/dist/backfill.js CHANGED
@@ -1,10 +1,26 @@
- import { parseCarFrame } from "./car.js";
+ import { parseCarStream } from "./car.js";
  import { cborDecode } from "./cbor.js";
  import { walkMst } from "./mst.js";
- import { setRepoStatus, getRepoStatus, getRepoRetryInfo, listRetryEligibleRepos, listPendingRepos, querySQL, runSQL, getSchema, bulkInsertRecords, } from "./db.js";
+ import { setRepoStatus, getRepoStatus, getRepoRev, getRepoRetryInfo, listRetryEligibleRepos, listPendingRepos, querySQL, runSQL, getSchema, bulkInsertRecords, } from "./database/db.js";
  import { emit, timer } from "./logger.js";
+ import { validateRecord } from '@bigmoves/lexicon';
+ import { getLexiconArray } from "./database/schema.js";
+ /** In-memory cache of DID → PDS resolution results to avoid redundant lookups. */
  const pdsCache = new Map();
  let plcUrl;
+ /**
+  * Resolves a DID to its PDS endpoint and handle by fetching the DID document.
+  *
+  * Supports both `did:web` (fetches `/.well-known/did.json`) and `did:plc`
+  * (fetches from the PLC directory). Results are cached for the lifetime of the process.
+  *
+  * @example
+  * ```ts
+  * const { pds, handle } = await resolvePds('did:plc:abc123')
+  * // pds = "https://puffball.us-east.host.bsky.network"
+  * // handle = "alice.bsky.social"
+  * ```
+  */
  async function resolvePds(did) {
      const cached = pdsCache.get(did);
      if (cached)
@@ -33,7 +49,10 @@ async function resolvePds(did) {
      pdsCache.set(did, result);
      return result;
  }
- // --- Repo Enumeration ---
+ /**
+  * Paginates through all active repos on a relay/PDS using `com.atproto.sync.listRepos`.
+  * Yields `{ did, rev }` for each active repo. Skips deactivated repos.
+  */
  async function* listRepos(pdsUrl) {
      let cursor;
      while (true) {
@@ -53,6 +72,13 @@ async function* listRepos(pdsUrl) {
          cursor = data.cursor;
      }
  }
+ /**
+  * Paginates through repos that contain records in a specific collection using
+  * `com.atproto.sync.listReposByCollection`. More efficient than {@link listRepos}
+  * when only a few collections are needed, since the relay can filter server-side.
+  *
+  * Not all relays support this endpoint — callers should fall back to {@link listRepos}.
+  */
  async function* listReposByCollection(pdsUrl, collection) {
      let cursor;
      while (true) {
@@ -71,7 +97,31 @@ async function* listReposByCollection(pdsUrl, collection) {
          cursor = data.cursor;
      }
  }
- // --- Single Repo Backfill ---
+ /**
+  * Downloads and indexes a single user's repo via `com.atproto.sync.getRepo`.
+  *
+  * The full flow:
+  * 1. Resolve the DID to find the user's PDS endpoint
+  * 2. Fetch the repo as a CAR file from the PDS
+  * 3. Parse the CAR, decode the commit, and walk the MST (Merkle Search Tree)
+  * 4. Delete any existing records for this DID (so deletions are reflected)
+  * 5. Bulk-insert all records matching the target collections
+  *
+  * On failure, applies exponential backoff retry logic. HTTP 4xx errors are
+  * treated as permanent failures (repo doesn't exist or is deactivated) and
+  * are not retried.
+  *
+  * @param did - The DID of the repo to backfill (e.g. `did:plc:abc123`)
+  * @param collections - Collection NSIDs to index; records in other collections are skipped
+  * @param fetchTimeout - Maximum seconds to wait for the CAR download before aborting
+  * @returns The number of records successfully indexed
+  *
+  * @example
+  * ```ts
+  * const count = await backfillRepo('did:plc:abc123', new Set(['app.bsky.feed.post']), 30)
+  * console.log(`Indexed ${count} records`)
+  * ```
+  */
  export async function backfillRepo(did, collections, fetchTimeout) {
      const elapsed = timer();
      let count = 0;
@@ -80,6 +130,7 @@ export async function backfillRepo(did, collections, fetchTimeout) {
      let error;
      let resolvedPds;
      let resolvedHandle = null;
+     let resolvedSince = null;
      let retryCount;
      let retryAfter;
      const controller = new AbortController();
@@ -89,25 +140,67 @@
          resolvedPds = pdsUrl;
          resolvedHandle = handle;
          timeout = setTimeout(() => controller.abort(), fetchTimeout * 1000);
-         const res = await fetch(`${resolvedPds}/xrpc/com.atproto.sync.getRepo?did=${encodeURIComponent(did)}`, {
-             signal: controller.signal,
-         });
+         let lastRev = await getRepoRev(did);
+         const baseUrl = `${resolvedPds}/xrpc/com.atproto.sync.getRepo?did=${encodeURIComponent(did)}`;
+         let repoUrl = lastRev ? `${baseUrl}&since=${encodeURIComponent(lastRev)}` : baseUrl;
+         let res = await fetch(repoUrl, { signal: controller.signal });
+         // If the PDS rejected our `since` rev (compacted history), fall back to full import
+         if (res.status === 400 && lastRev) {
+             lastRev = null;
+             res = await fetch(baseUrl, { signal: controller.signal });
+         }
          if (!res.ok) {
              const httpErr = new Error(`getRepo failed for ${did}: ${res.status}`);
              httpErr.httpStatus = res.status;
              throw httpErr;
          }
-         const carBytes = new Uint8Array(await res.arrayBuffer());
-         carSizeBytes = carBytes.length;
-         const { roots, blocks } = parseCarFrame(carBytes);
-         // Decode commit to get MST root
-         const rootData = blocks.get(roots[0]);
+         resolvedSince = lastRev;
+         let { roots, blocks, byteLength } = await parseCarStream(res.body);
+         carSizeBytes = byteLength;
+         // Decode commit to get MST root — if the diff CAR is missing the root block,
+         // fall back to a full import (the PDS compacted past our `since` rev)
+         let rootData = blocks.get(roots[0]);
+         if (!rootData && lastRev) {
+             lastRev = null;
+             resolvedSince = null;
+             res = await fetch(baseUrl, { signal: controller.signal });
+             if (!res.ok) {
+                 const httpErr = new Error(`getRepo failed for ${did}: ${res.status}`);
+                 httpErr.httpStatus = res.status;
+                 throw httpErr;
+             }
+             ;
+             ({ roots, blocks, byteLength } = await parseCarStream(res.body));
+             carSizeBytes = byteLength;
+             rootData = blocks.get(roots[0]);
+         }
          if (!rootData)
              throw new Error(`No root block for ${did}`);
          const { value: commit } = cborDecode(rootData);
          // Walk MST to find all record paths
          const entries = walkMst(blocks, commit.data.$link);
-         const bulk = [];
+         // Delete existing records for this DID before re-importing so deletions are reflected
+         // Only on full imports (no since) — diff CARs only contain changes
+         if (!lastRev) {
+             for (const col of collections) {
+                 const schema = getSchema(col);
+                 if (!schema)
+                     continue;
+                 await runSQL(`DELETE FROM ${schema.tableName} WHERE did = $1`, [did]);
+                 for (const child of schema.children) {
+                     await runSQL(`DELETE FROM ${child.tableName} WHERE parent_did = $1`, [did]);
+                 }
+                 for (const union of schema.unions) {
+                     for (const branch of union.branches) {
+                         await runSQL(`DELETE FROM ${branch.tableName} WHERE parent_did = $1`, [did]);
+                     }
+                 }
+             }
+         }
+         // Insert records in chunks to limit memory usage
+         const CHUNK_SIZE = 1000;
+         let chunk = [];
+         const validationSkips = {};
          for (const entry of entries) {
              const collection = entry.path.split('/')[0];
              if (!collections.has(collection))
@@ -115,13 +208,23 @@
              const blockData = blocks.get(entry.cid);
              if (!blockData)
                  continue;
+             blocks.delete(entry.cid); // free block data as we go
              try {
                  const { value: record } = cborDecode(blockData);
                  if (!record?.$type)
                      continue;
                  const rkey = entry.path.split('/').slice(1).join('/');
                  const uri = `at://${did}/${collection}/${rkey}`;
-                 bulk.push({ collection, uri, cid: entry.cid, did, record });
+                 const validationError = validateRecord(getLexiconArray(), collection, record);
+                 if (validationError) {
+                     validationSkips[collection] = (validationSkips[collection] || 0) + 1;
+                     continue;
+                 }
+                 chunk.push({ collection, uri, cid: entry.cid, did, record });
+                 if (chunk.length >= CHUNK_SIZE) {
+                     count += await bulkInsertRecords(chunk);
+                     chunk = [];
+                 }
              }
              catch (recordErr) {
                  emit('backfill', 'record_error', {
@@ -132,22 +235,13 @@
                  });
              }
          }
-         // Delete existing records for this DID before re-importing so deletions are reflected
-         for (const col of collections) {
-             const schema = getSchema(col);
-             if (!schema)
-                 continue;
-             await runSQL(`DELETE FROM ${schema.tableName} WHERE did = $1`, did);
-             for (const child of schema.children) {
-                 await runSQL(`DELETE FROM ${child.tableName} WHERE parent_did = $1`, did);
-             }
-             for (const union of schema.unions) {
-                 for (const branch of union.branches) {
-                     await runSQL(`DELETE FROM ${branch.tableName} WHERE parent_did = $1`, did);
-                 }
-             }
+         if (chunk.length > 0) {
+             count += await bulkInsertRecords(chunk);
+         }
+         const totalSkips = Object.values(validationSkips).reduce((a, b) => a + b, 0);
+         if (totalSkips > 0) {
+             emit('backfill', 'validation_skips', { did, total: totalSkips, by_collection: validationSkips });
          }
-         count = await bulkInsertRecords(bulk);
          await setRepoStatus(did, 'active', commit.rev, { handle });
          return count;
      }
@@ -179,13 +273,24 @@
              error,
              pds_url: resolvedPds,
              car_size_bytes: carSizeBytes,
+             import_mode: carSizeBytes !== undefined ? (resolvedSince ? 'diff' : 'full') : undefined,
+             since_rev: resolvedSince,
              retry_count: retryCount,
              retry_after: retryAfter,
              permanent_failure: retryCount === 999 ? true : undefined,
          });
      }
  }
- // --- Worker Pool ---
+ /**
+  * Processes items concurrently with a fixed number of workers.
+  * Workers pull from a shared index so the pool stays saturated even when
+  * individual items complete at different speeds. Errors from `fn` are
+  * swallowed (they're expected to be captured via structured logging).
+  *
+  * @param items - The work items to process
+  * @param parallelism - Maximum number of concurrent workers
+  * @param fn - Async function to run for each item
+  */
  async function runWorkerPool(items, parallelism, fn) {
      let index = 0;
      async function worker() {
@@ -202,7 +307,35 @@
      const workers = Array.from({ length: Math.min(parallelism, items.length) }, () => worker());
      await Promise.all(workers);
  }
- // --- Main Backfill Entry Point ---
+ /**
+  * Orchestrates a full backfill run: enumerate repos, filter to pending, download, and index.
+  *
+  * Operates in one of three modes based on config:
+  * - **Pinned repos** — backfill only the DIDs listed in `config.repos`
+  * - **Full network** — enumerate every active repo on the relay via `listRepos`
+  * - **Collection signal** (default) — use `listReposByCollection` to discover repos that
+  *   contain records in the configured signal collections, falling back to `listRepos`
+  *   if the relay doesn't support collection-scoped enumeration
+  *
+  * After the initial pass, failed repos are retried with exponential backoff
+  * (up to `config.maxRetries` attempts). The run emits structured log events for
+  * monitoring via the `backfill.run` and `backfill.retry_round` event types.
+  *
+  * @example
+  * ```ts
+  * await runBackfill({
+  *   pdsUrl: 'wss://bsky.network',
+  *   plcUrl: 'https://plc.directory',
+  *   collections: new Set(['xyz.statusphere.status']),
+  *   config: {
+  *     fullNetwork: false,
+  *     parallelism: 10,
+  *     fetchTimeout: 30,
+  *     maxRetries: 5,
+  *   },
+  * })
+  * ```
+  */
  export async function runBackfill(opts) {
      const { pdsUrl, collections, config } = opts;
      plcUrl = opts.plcUrl;
@@ -267,7 +400,7 @@ export async function runBackfill(opts) {
          parallelism: config.parallelism,
          status: 'success',
      });
-     return;
+     return 0;
  }
  // 3. Backfill with worker pool
  let totalRecords = 0;
@@ -291,7 +424,7 @@
      retryRound++;
      // Wait until the earliest retry_after has passed
      const now = Math.floor(Date.now() / 1000);
-     const rows = await querySQL(`SELECT MIN(retry_after) as earliest FROM _repos WHERE status = 'failed' AND retry_after > $1 AND retry_count < $2`, [now, maxRetries]);
+     const rows = (await querySQL(`SELECT MIN(retry_after) as earliest FROM _repos WHERE status = 'failed' AND retry_after > $1 AND retry_count < $2`, [now, maxRetries]));
      const earliest = rows[0]?.earliest ? Number(rows[0].earliest) : 0;
      if (earliest > now) {
          await new Promise((resolve) => setTimeout(resolve, (earliest - now) * 1000));
@@ -325,4 +458,5 @@
      retry_rounds: retryRound,
      status: failedCount > 0 ? 'partial' : 'success',
  });
+ return totalRecords;
  }
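The pool driving `parallelism` appears above only in fragments; here is a typed, self-contained reconstruction of the shared-cursor pattern the doc comment describes (named `workerPool` to avoid confusion with the shipped `runWorkerPool`):

```ts
// Shared-index worker pool: workers claim the next index synchronously before
// awaiting, so the pool stays saturated even when items finish at different speeds.
async function workerPool<T>(
  items: T[],
  parallelism: number,
  fn: (item: T) => Promise<void>,
): Promise<void> {
  let index = 0
  async function worker(): Promise<void> {
    while (index < items.length) {
      const item = items[index++]
      try {
        await fn(item)
      } catch {
        // errors are swallowed, matching the shipped pool; backfillRepo
        // reports failures through structured logs instead
      }
    }
  }
  await Promise.all(Array.from({ length: Math.min(parallelism, items.length) }, () => worker()))
}
```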
package/dist/car.d.ts CHANGED
@@ -1,5 +1,63 @@
- export declare function parseCarFrame(carBytes: Uint8Array): {
+ /**
+  * CAR (Content Addressable aRchive) parser.
+  *
+  * CAR files bundle content-addressed blocks into a single binary container.
+  * They're used by the AT Protocol firehose (`com.atproto.sync.getRepo`) to
+  * deliver entire repos and by commit events to deliver individual changes.
+  *
+  * Format: `varint(headerLen) | CBOR(header) | block*`
+  * Each block: `varint(blockLen) | CID | data`
+  *
+  * @see https://ipld.io/specs/transport/car/carv1/
+  * @module
+  */
+ /**
+  * A memory-efficient block map that stores byte offsets into the original CAR
+  * buffer instead of copying block data. Implements the same `get`/`delete`/`size`
+  * interface as `Map<string, Uint8Array>` so it can be used as a drop-in replacement.
+  */
+ export declare class LazyBlockMap {
+     private offsets;
+     private carBytes;
+     constructor(carBytes: Uint8Array, offsets: Map<string, [number, number]>);
+     get(cid: string): Uint8Array | undefined;
+     delete(cid: string): boolean;
+     get size(): number;
+     [Symbol.iterator](): IterableIterator<[string, Uint8Array]>;
+     /** Release the underlying CAR buffer */
+     free(): void;
+ }
+ /**
+  * Parses a CARv1 stream incrementally from a `ReadableStream`.
+  *
+  * Instead of buffering the entire CAR into a single ArrayBuffer, this reads
+  * chunks from the stream and parses blocks as they arrive. Each block's data
+  * is `.slice()`d into its own small `Uint8Array`, allowing V8 to GC individual
+  * blocks as they're consumed during the MST walk.
+  *
+  * This is critical for backfill where multiple workers download 30-90MB CARs
+  * concurrently — buffered downloads cause OOMs because `ArrayBuffer` memory
+  * is "external" to V8's heap and not controlled by `--max-old-space-size`.
+  *
+  * @param body - The response body stream (e.g. `res.body` from `fetch()`)
+  * @returns `roots` — root CID strings; `blocks` — map of CID → block data; `byteLength` — total bytes read
+  */
+ export declare function parseCarStream(body: ReadableStream<Uint8Array>): Promise<{
      roots: string[];
      blocks: Map<string, Uint8Array>;
+     byteLength: number;
+ }>;
+ /**
+  * Parses a CARv1 binary frame into its root CIDs and a lazy block map.
+  *
+  * The block map stores byte offsets into `carBytes` rather than copying data,
+  * reducing heap usage from O(total block bytes) to O(number of blocks * 16 bytes).
+  *
+  * @param carBytes - Raw CAR file bytes (e.g. from `getRepo` or a firehose commit)
+  * @returns `roots` — ordered list of root CID strings; `blocks` — lazy block map
+  */
+ export declare function parseCarFrame(carBytes: Uint8Array): {
+     roots: string[];
+     blocks: LazyBlockMap;
  };
  //# sourceMappingURL=car.d.ts.map
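The `varint(...)` framing named in the module comment is the standard unsigned LEB128 varint used across IPLD. For reference, a minimal decoder; this illustrates the format and is not the package's internal helper:

```ts
// Unsigned LEB128 varint: 7 value bits per byte, high bit set = more bytes follow.
function readVarint(bytes: Uint8Array, offset: number): { value: number; next: number } {
  let value = 0
  let shift = 0
  let pos = offset
  while (true) {
    const byte = bytes[pos++]
    value += (byte & 0x7f) * 2 ** shift // multiply rather than << so lengths past 31 bits survive
    if ((byte & 0x80) === 0) return { value, next: pos }
    shift += 7
  }
}

// A CARv1 file opens with varint(headerLen) followed by the CBOR header block.
const { value, next } = readVarint(new Uint8Array([0x80, 0x01]), 0)
console.log(value, next) // 128 2
```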
package/dist/car.d.ts.map CHANGED
@@ -1 +1 @@
- {"version":3,"file":"car.d.ts","sourceRoot":"","sources":["../src/car.ts"],"names":[],"mappings":"AAgCA,wBAAgB,aAAa,CAAC,QAAQ,EAAE,UAAU,GAAG;IACnD,KAAK,EAAE,MAAM,EAAE,CAAA;IACf,MAAM,EAAE,GAAG,CAAC,MAAM,EAAE,UAAU,CAAC,CAAA;CAChC,CAmCA"}
+ {"version":3,"file":"car.d.ts","sourceRoot":"","sources":["../src/car.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;GAYG;AAuCH;;;;GAIG;AACH,qBAAa,YAAY;IACvB,OAAO,CAAC,OAAO,CAA+B;IAC9C,OAAO,CAAC,QAAQ,CAAmB;gBAEvB,QAAQ,EAAE,UAAU,EAAE,OAAO,EAAE,GAAG,CAAC,MAAM,EAAE,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IAKxE,GAAG,CAAC,GAAG,EAAE,MAAM,GAAG,UAAU,GAAG,SAAS;IAMxC,MAAM,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO;IAI5B,IAAI,IAAI,IAAI,MAAM,CAEjB;IAEA,CAAC,MAAM,CAAC,QAAQ,CAAC,IAAI,gBAAgB,CAAC,CAAC,MAAM,EAAE,UAAU,CAAC,CAAC;IAO5D,wCAAwC;IACxC,IAAI,IAAI,IAAI;CAIb;AAED;;;;;;;;;;;;;;GAcG;AACH,wBAAsB,cAAc,CAAC,IAAI,EAAE,cAAc,CAAC,UAAU,CAAC,GAAG,OAAO,CAAC;IAC9E,KAAK,EAAE,MAAM,EAAE,CAAA;IACf,MAAM,EAAE,GAAG,CAAC,MAAM,EAAE,UAAU,CAAC,CAAA;IAC/B,UAAU,EAAE,MAAM,CAAA;CACnB,CAAC,CAsGD;AAED;;;;;;;;GAQG;AACH,wBAAgB,aAAa,CAAC,QAAQ,EAAE,UAAU,GAAG;IACnD,KAAK,EAAE,MAAM,EAAE,CAAA;IACf,MAAM,EAAE,YAAY,CAAA;CACrB,CAiCA"}