@hatk/hatk 0.0.1-alpha.6 → 0.0.1-alpha.60

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (163) hide show
  1. package/dist/adapter.d.ts +19 -0
  2. package/dist/adapter.d.ts.map +1 -0
  3. package/dist/adapter.js +108 -0
  4. package/dist/backfill.d.ts +2 -2
  5. package/dist/backfill.d.ts.map +1 -1
  6. package/dist/backfill.js +78 -31
  7. package/dist/car.d.ts +42 -10
  8. package/dist/car.d.ts.map +1 -1
  9. package/dist/car.js +154 -14
  10. package/dist/cli.js +243 -1043
  11. package/dist/config.d.ts +31 -1
  12. package/dist/config.d.ts.map +1 -1
  13. package/dist/config.js +40 -9
  14. package/dist/database/adapter-factory.d.ts +6 -0
  15. package/dist/database/adapter-factory.d.ts.map +1 -0
  16. package/dist/database/adapter-factory.js +20 -0
  17. package/dist/database/adapters/duckdb-search.d.ts +12 -0
  18. package/dist/database/adapters/duckdb-search.d.ts.map +1 -0
  19. package/dist/database/adapters/duckdb-search.js +27 -0
  20. package/dist/database/adapters/duckdb.d.ts +25 -0
  21. package/dist/database/adapters/duckdb.d.ts.map +1 -0
  22. package/dist/database/adapters/duckdb.js +161 -0
  23. package/dist/database/adapters/sqlite-search.d.ts +23 -0
  24. package/dist/database/adapters/sqlite-search.d.ts.map +1 -0
  25. package/dist/database/adapters/sqlite-search.js +74 -0
  26. package/dist/database/adapters/sqlite.d.ts +18 -0
  27. package/dist/database/adapters/sqlite.d.ts.map +1 -0
  28. package/dist/database/adapters/sqlite.js +88 -0
  29. package/dist/{db.d.ts → database/db.d.ts} +57 -6
  30. package/dist/database/db.d.ts.map +1 -0
  31. package/dist/{db.js → database/db.js} +730 -549
  32. package/dist/database/dialect.d.ts +45 -0
  33. package/dist/database/dialect.d.ts.map +1 -0
  34. package/dist/database/dialect.js +72 -0
  35. package/dist/{fts.d.ts → database/fts.d.ts} +7 -0
  36. package/dist/database/fts.d.ts.map +1 -0
  37. package/dist/{fts.js → database/fts.js} +116 -32
  38. package/dist/database/index.d.ts +7 -0
  39. package/dist/database/index.d.ts.map +1 -0
  40. package/dist/database/index.js +6 -0
  41. package/dist/database/ports.d.ts +50 -0
  42. package/dist/database/ports.d.ts.map +1 -0
  43. package/dist/database/ports.js +1 -0
  44. package/dist/{schema.d.ts → database/schema.d.ts} +14 -3
  45. package/dist/database/schema.d.ts.map +1 -0
  46. package/dist/{schema.js → database/schema.js} +81 -41
  47. package/dist/dev-entry.d.ts +8 -0
  48. package/dist/dev-entry.d.ts.map +1 -0
  49. package/dist/dev-entry.js +112 -0
  50. package/dist/feeds.d.ts +12 -8
  51. package/dist/feeds.d.ts.map +1 -1
  52. package/dist/feeds.js +51 -6
  53. package/dist/hooks.d.ts +85 -0
  54. package/dist/hooks.d.ts.map +1 -0
  55. package/dist/hooks.js +161 -0
  56. package/dist/hydrate.d.ts +7 -6
  57. package/dist/hydrate.d.ts.map +1 -1
  58. package/dist/hydrate.js +4 -16
  59. package/dist/indexer.d.ts +22 -0
  60. package/dist/indexer.d.ts.map +1 -1
  61. package/dist/indexer.js +123 -32
  62. package/dist/labels.d.ts +36 -0
  63. package/dist/labels.d.ts.map +1 -1
  64. package/dist/labels.js +71 -6
  65. package/dist/lexicon-resolve.d.ts.map +1 -1
  66. package/dist/lexicon-resolve.js +27 -112
  67. package/dist/lexicons/com/atproto/label/defs.json +75 -0
  68. package/dist/lexicons/com/atproto/moderation/defs.json +30 -0
  69. package/dist/lexicons/com/atproto/repo/strongRef.json +24 -0
  70. package/dist/lexicons/dev/hatk/applyWrites.json +87 -0
  71. package/dist/lexicons/dev/hatk/createRecord.json +40 -0
  72. package/dist/lexicons/dev/hatk/createReport.json +48 -0
  73. package/dist/lexicons/dev/hatk/deleteRecord.json +25 -0
  74. package/dist/lexicons/dev/hatk/describeCollections.json +41 -0
  75. package/dist/lexicons/dev/hatk/describeFeeds.json +29 -0
  76. package/dist/lexicons/dev/hatk/describeLabels.json +45 -0
  77. package/dist/lexicons/dev/hatk/getFeed.json +30 -0
  78. package/dist/lexicons/dev/hatk/getPreferences.json +19 -0
  79. package/dist/lexicons/dev/hatk/getRecord.json +26 -0
  80. package/dist/lexicons/dev/hatk/getRecords.json +32 -0
  81. package/dist/lexicons/dev/hatk/putPreference.json +28 -0
  82. package/dist/lexicons/dev/hatk/putRecord.json +41 -0
  83. package/dist/lexicons/dev/hatk/searchRecords.json +32 -0
  84. package/dist/lexicons/dev/hatk/uploadBlob.json +23 -0
  85. package/dist/logger.d.ts +29 -0
  86. package/dist/logger.d.ts.map +1 -1
  87. package/dist/logger.js +29 -0
  88. package/dist/main.js +137 -67
  89. package/dist/mst.d.ts +18 -1
  90. package/dist/mst.d.ts.map +1 -1
  91. package/dist/mst.js +19 -8
  92. package/dist/oauth/db.d.ts +3 -1
  93. package/dist/oauth/db.d.ts.map +1 -1
  94. package/dist/oauth/db.js +48 -19
  95. package/dist/oauth/server.d.ts +24 -0
  96. package/dist/oauth/server.d.ts.map +1 -1
  97. package/dist/oauth/server.js +198 -22
  98. package/dist/oauth/session.d.ts +11 -0
  99. package/dist/oauth/session.d.ts.map +1 -0
  100. package/dist/oauth/session.js +65 -0
  101. package/dist/opengraph.d.ts +10 -0
  102. package/dist/opengraph.d.ts.map +1 -1
  103. package/dist/opengraph.js +80 -40
  104. package/dist/pds-proxy.d.ts +60 -0
  105. package/dist/pds-proxy.d.ts.map +1 -0
  106. package/dist/pds-proxy.js +277 -0
  107. package/dist/push.d.ts +34 -0
  108. package/dist/push.d.ts.map +1 -0
  109. package/dist/push.js +184 -0
  110. package/dist/renderer.d.ts +27 -0
  111. package/dist/renderer.d.ts.map +1 -0
  112. package/dist/renderer.js +46 -0
  113. package/dist/resolve-hatk.d.ts +6 -0
  114. package/dist/resolve-hatk.d.ts.map +1 -0
  115. package/dist/resolve-hatk.js +20 -0
  116. package/dist/response.d.ts +16 -0
  117. package/dist/response.d.ts.map +1 -0
  118. package/dist/response.js +69 -0
  119. package/dist/scanner.d.ts +21 -0
  120. package/dist/scanner.d.ts.map +1 -0
  121. package/dist/scanner.js +88 -0
  122. package/dist/seed.d.ts +19 -0
  123. package/dist/seed.d.ts.map +1 -1
  124. package/dist/seed.js +43 -4
  125. package/dist/server-init.d.ts +8 -0
  126. package/dist/server-init.d.ts.map +1 -0
  127. package/dist/server-init.js +62 -0
  128. package/dist/server.d.ts +26 -3
  129. package/dist/server.d.ts.map +1 -1
  130. package/dist/server.js +629 -635
  131. package/dist/setup.d.ts +28 -1
  132. package/dist/setup.d.ts.map +1 -1
  133. package/dist/setup.js +50 -3
  134. package/dist/templates/feed.tpl +14 -0
  135. package/dist/templates/hook.tpl +5 -0
  136. package/dist/templates/label.tpl +15 -0
  137. package/dist/templates/og.tpl +17 -0
  138. package/dist/templates/seed.tpl +11 -0
  139. package/dist/templates/setup.tpl +5 -0
  140. package/dist/templates/test-feed.tpl +19 -0
  141. package/dist/templates/test-xrpc.tpl +19 -0
  142. package/dist/templates/xrpc.tpl +41 -0
  143. package/dist/test.d.ts +1 -1
  144. package/dist/test.d.ts.map +1 -1
  145. package/dist/test.js +39 -32
  146. package/dist/views.js +1 -1
  147. package/dist/vite-plugin.d.ts +1 -1
  148. package/dist/vite-plugin.d.ts.map +1 -1
  149. package/dist/vite-plugin.js +254 -66
  150. package/dist/xrpc.d.ts +75 -11
  151. package/dist/xrpc.d.ts.map +1 -1
  152. package/dist/xrpc.js +189 -39
  153. package/package.json +14 -7
  154. package/public/admin.html +133 -54
  155. package/dist/db.d.ts.map +0 -1
  156. package/dist/fts.d.ts.map +0 -1
  157. package/dist/oauth/hooks.d.ts +0 -10
  158. package/dist/oauth/hooks.d.ts.map +0 -1
  159. package/dist/oauth/hooks.js +0 -40
  160. package/dist/schema.d.ts.map +0 -1
  161. package/dist/test-browser.d.ts +0 -14
  162. package/dist/test-browser.d.ts.map +0 -1
  163. package/dist/test-browser.js +0 -26
@@ -0,0 +1,19 @@
1
+ import { type IncomingMessage, type ServerResponse } from 'node:http';
2
+ /**
3
+ * Convert a Node.js IncomingMessage to a Web Standard Request.
4
+ */
5
+ export declare function toRequest(req: IncomingMessage, base: string): Request;
6
+ /**
7
+ * Pipe a Web Standard Response back to a Node.js ServerResponse.
8
+ */
9
+ export declare function sendResponse(res: ServerResponse, response: Response): Promise<void>;
10
+ /** Routes handled by hatk — everything else can fall through to a framework handler. */
11
+ export declare const HATK_ROUTES: string[];
12
+ export declare function isHatkRoute(pathname: string): boolean;
13
+ /**
14
+ * Create a Node.js HTTP server from a Web Standard fetch handler.
15
+ * If a fallback Node middleware is provided, non-hatk routes are sent to it
16
+ * (e.g. SvelteKit's handler from build/handler.js).
17
+ */
18
+ export declare function serve(handler: (request: Request) => Promise<Response>, port: number, base?: string, fallback?: (req: IncomingMessage, res: ServerResponse, next: () => void) => void): import("node:http").Server<typeof IncomingMessage, typeof ServerResponse>;
19
+ //# sourceMappingURL=adapter.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"adapter.d.ts","sourceRoot":"","sources":["../src/adapter.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,KAAK,eAAe,EAAE,KAAK,cAAc,EAAgB,MAAM,WAAW,CAAA;AAEnF;;GAEG;AACH,wBAAgB,SAAS,CAAC,GAAG,EAAE,eAAe,EAAE,IAAI,EAAE,MAAM,GAAG,OAAO,CA0BrE;AAED;;GAEG;AACH,wBAAsB,YAAY,CAAC,GAAG,EAAE,cAAc,EAAE,QAAQ,EAAE,QAAQ,GAAG,OAAO,CAAC,IAAI,CAAC,CAuBzF;AAED,wFAAwF;AACxF,eAAO,MAAM,WAAW,UAcvB,CAAA;AAED,wBAAgB,WAAW,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAErD;AAED;;;;GAIG;AACH,wBAAgB,KAAK,CACnB,OAAO,EAAE,CAAC,OAAO,EAAE,OAAO,KAAK,OAAO,CAAC,QAAQ,CAAC,EAChD,IAAI,EAAE,MAAM,EACZ,IAAI,CAAC,EAAE,MAAM,EACb,QAAQ,CAAC,EAAE,CAAC,GAAG,EAAE,eAAe,EAAE,GAAG,EAAE,cAAc,EAAE,IAAI,EAAE,MAAM,IAAI,KAAK,IAAI,6EA4BjF"}
@@ -0,0 +1,108 @@
1
+ import { createServer } from 'node:http';
2
/**
 * Convert a Node.js IncomingMessage to a Web Standard Request.
 *
 * The returned request URL always carries the origin of `base`. A raw request
 * target can be protocol-relative (`//other.example/x`) or absolute-form
 * (`http://other.example/x`); resolving it naively against `base` would let
 * the client choose the host that downstream handlers see in `request.url`.
 *
 * @param req - Incoming Node.js request; its `url`, `method` and `headers` are read,
 *   and the message itself is used as the streamed body for non-GET/HEAD methods.
 * @param base - Absolute base URL the request target is resolved against.
 * @returns A `Request` with the same method and headers as `req`.
 */
export function toRequest(req, base) {
    const baseUrl = new URL(base);
    let url = new URL(req.url ?? '/', baseUrl);
    if (url.origin !== baseUrl.origin) {
        // Keep only path + query from the client-supplied target and re-anchor
        // them on the configured origin. String concatenation (rather than
        // relative resolution) keeps a path like `//x` from being read as a host.
        const path = url.pathname.startsWith('/') ? url.pathname : `/${url.pathname}`;
        url = new URL(`${baseUrl.origin}${path}${url.search}`);
    }
    const headers = new Headers();
    for (const [key, value] of Object.entries(req.headers)) {
        if (!value)
            continue;
        if (Array.isArray(value)) {
            // Repeated headers arrive as arrays; append preserves every value
            for (const v of value)
                headers.append(key, v);
        }
        else {
            headers.set(key, value);
        }
    }
    const init = {
        method: req.method,
        headers,
    };
    // GET and HEAD requests cannot have a body
    if (req.method !== 'GET' && req.method !== 'HEAD') {
        // The Node.js request stream is passed through as the body source;
        // a streamed request body requires `duplex: 'half'`.
        init.body = req;
        init.duplex = 'half';
    }
    return new Request(url.href, init);
}
31
/**
 * Pipe a Web Standard Response back to a Node.js ServerResponse.
 *
 * Writes the status and headers, then streams the body chunk by chunk. When
 * `res.write()` reports a full buffer the loop pauses until `drain` (or
 * `close`), so a slow client cannot force the whole body into memory. If the
 * client goes away mid-stream the body is cancelled instead of being read to
 * the end.
 *
 * @param res - Node.js response to write to.
 * @param response - Web Standard response to send.
 * @returns Resolves once the body has been written and `res` ended.
 * @throws Re-throws any error raised while reading the body; `res` is still ended.
 */
export async function sendResponse(res, response) {
    const rawHeaders = [];
    response.headers.forEach((value, name) => {
        rawHeaders.push(name, value);
    });
    res.writeHead(response.status, rawHeaders);
    if (!response.body) {
        res.end();
        return;
    }
    const reader = response.body.getReader();
    try {
        while (true) {
            const { done, value } = await reader.read();
            if (done)
                break;
            if (res.destroyed) {
                // Client disconnected: stop the producer rather than draining it
                await reader.cancel();
                break;
            }
            if (!res.write(value))
                await waitForDrainOrClose(res);
        }
    }
    catch (err) {
        // Best-effort cancel so the body source is released; a failure here is
        // deliberately ignored because the original error is the one re-thrown.
        await reader.cancel(err).catch(() => { });
        throw err;
    }
    finally {
        reader.releaseLock();
        if (!res.writableEnded)
            res.end();
    }
}
/** Resolve when `res` can accept more data or has been closed, whichever comes first. */
function waitForDrainOrClose(res) {
    if (res.destroyed)
        return Promise.resolve();
    return new Promise((resolve) => {
        const settle = () => {
            res.off('drain', settle);
            res.off('close', settle);
            resolve();
        };
        res.once('drain', settle);
        res.once('close', settle);
    });
}
58
/**
 * Routes handled by hatk — everything else can fall through to a framework handler.
 *
 * Entries ending in `/` are path prefixes. Entries without a trailing slash
 * name a single path segment and match that path or anything nested below it.
 */
export const HATK_ROUTES = [
    '/xrpc/',
    '/oauth/',
    '/oauth-client-metadata.json',
    '/.well-known/oauth-authorization-server',
    '/.well-known/oauth-protected-resource',
    '/og/',
    '/admin',
    '/repos',
    '/info/',
    '/_health',
    '/robots.txt',
    '/auth/logout',
    '/__dev/',
];
/**
 * Report whether a URL pathname belongs to hatk rather than the fallback framework.
 *
 * Matching respects path-segment boundaries: `/repos` matches `/repos` and
 * `/repos/did:plc:x` but not `/reposts`, so a framework page whose name merely
 * starts with a hatk route is not captured.
 *
 * @param pathname - URL pathname (no query string), e.g. `/xrpc/app.bsky.feed.getFeed`.
 * @returns `true` when the path is served by hatk.
 */
export function isHatkRoute(pathname) {
    return HATK_ROUTES.some((route) => {
        if (route.endsWith('/'))
            return pathname.startsWith(route);
        return pathname === route || pathname.startsWith(`${route}/`);
    });
}
77
/**
 * Create a Node.js HTTP server from a Web Standard fetch handler.
 * If a fallback Node middleware is provided, non-hatk routes are sent to it
 * (e.g. SvelteKit's handler from build/handler.js).
 *
 * @param handler - Fetch-style handler invoked for hatk routes (or for every
 *   route when no fallback is given).
 * @param port - TCP port to listen on.
 * @param base - Origin used to build request URLs; defaults to `http://localhost:<port>`.
 * @param fallback - Optional Node middleware for non-hatk routes. Its `next`
 *   callback answers with a plain 404.
 * @returns The listening `http.Server`.
 */
export function serve(handler, port, base, fallback) {
    const origin = base || `http://localhost:${port}`;
    const server = createServer(async (req, res) => {
        try {
            const url = new URL(req.url, origin);
            // If we have a fallback (e.g. SvelteKit) and this isn't a hatk route, skip hatk
            if (fallback && !isHatkRoute(url.pathname)) {
                fallback(req, res, () => {
                    res.writeHead(404);
                    res.end('Not found');
                });
                return;
            }
            const request = toRequest(req, origin);
            const response = await handler(request);
            await sendResponse(res, response);
        }
        catch (err) {
            // The response may already be finished (sendResponse ends it even
            // when streaming fails); writing again would be a write-after-end.
            if (res.writableEnded || res.destroyed)
                return;
            if (res.headersSent) {
                // Part of another response is already on the wire, so a JSON
                // error cannot be appended; drop the connection so the client
                // sees the failure instead of a corrupt body.
                res.destroy();
                return;
            }
            // Anything can be thrown, not only Error instances
            const message = err instanceof Error ? err.message : String(err);
            res.writeHead(500, { 'Content-Type': 'application/json' });
            res.end(JSON.stringify({ error: message }));
        }
    });
    server.listen(port);
    return server;
}
@@ -7,7 +7,7 @@ interface BackfillOpts {
7
7
  plcUrl: string;
8
8
  /** AT Protocol collection NSIDs to index (e.g. `app.bsky.feed.post`). */
9
9
  collections: Set<string>;
10
- /** Backfill behavior settings from `config.yaml`. */
10
+ /** Backfill behavior settings from `hatk.config.ts`. */
11
11
  config: BackfillConfig;
12
12
  }
13
13
  /**
@@ -65,6 +65,6 @@ export declare function backfillRepo(did: string, collections: Set<string>, fetc
65
65
  * })
66
66
  * ```
67
67
  */
68
- export declare function runBackfill(opts: BackfillOpts): Promise<void>;
68
+ export declare function runBackfill(opts: BackfillOpts): Promise<number>;
69
69
  export {};
70
70
  //# sourceMappingURL=backfill.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"backfill.d.ts","sourceRoot":"","sources":["../src/backfill.ts"],"names":[],"mappings":"AAgBA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAA;AAEjD,6CAA6C;AAC7C,UAAU,YAAY;IACpB,wFAAwF;IACxF,MAAM,EAAE,MAAM,CAAA;IACd,8FAA8F;IAC9F,MAAM,EAAE,MAAM,CAAA;IACd,yEAAyE;IACzE,WAAW,EAAE,GAAG,CAAC,MAAM,CAAC,CAAA;IACxB,qDAAqD;IACrD,MAAM,EAAE,cAAc,CAAA;CACvB;AAuGD;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AACH,wBAAsB,YAAY,CAAC,GAAG,EAAE,MAAM,EAAE,WAAW,EAAE,GAAG,CAAC,MAAM,CAAC,EAAE,YAAY,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAmH/G;AAgCD;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA4BG;AACH,wBAAsB,WAAW,CAAC,IAAI,EAAE,YAAY,GAAG,OAAO,CAAC,IAAI,CAAC,CAiInE"}
1
+ {"version":3,"file":"backfill.d.ts","sourceRoot":"","sources":["../src/backfill.ts"],"names":[],"mappings":"AAiBA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,aAAa,CAAA;AAIjD,6CAA6C;AAC7C,UAAU,YAAY;IACpB,wFAAwF;IACxF,MAAM,EAAE,MAAM,CAAA;IACd,8FAA8F;IAC9F,MAAM,EAAE,MAAM,CAAA;IACd,yEAAyE;IACzE,WAAW,EAAE,GAAG,CAAC,MAAM,CAAC,CAAA;IACxB,wDAAwD;IACxD,MAAM,EAAE,cAAc,CAAA;CACvB;AAoGD;;;;;;;;;;;;;;;;;;;;;;;;GAwBG;AACH,wBAAsB,YAAY,CAAC,GAAG,EAAE,MAAM,EAAE,WAAW,EAAE,GAAG,CAAC,MAAM,CAAC,EAAE,YAAY,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAkK/G;AA8BD;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA4BG;AACH,wBAAsB,WAAW,CAAC,IAAI,EAAE,YAAY,GAAG,OAAO,CAAC,MAAM,CAAC,CAkIrE"}
package/dist/backfill.js CHANGED
@@ -1,8 +1,10 @@
1
- import { parseCarFrame } from "./car.js";
1
+ import { parseCarStream } from "./car.js";
2
2
  import { cborDecode } from "./cbor.js";
3
3
  import { walkMst } from "./mst.js";
4
- import { setRepoStatus, getRepoStatus, getRepoRetryInfo, listRetryEligibleRepos, listPendingRepos, querySQL, runSQL, getSchema, bulkInsertRecords, } from "./db.js";
4
+ import { setRepoStatus, getRepoStatus, getRepoRev, getRepoRetryInfo, listRetryEligibleRepos, listPendingRepos, querySQL, runSQL, getSchema, bulkInsertRecords, } from "./database/db.js";
5
5
  import { emit, timer } from "./logger.js";
6
+ import { validateRecord } from '@bigmoves/lexicon';
7
+ import { getLexiconArray } from "./database/schema.js";
6
8
  /** In-memory cache of DID → PDS resolution results to avoid redundant lookups. */
7
9
  const pdsCache = new Map();
8
10
  let plcUrl;
@@ -128,6 +130,7 @@ export async function backfillRepo(did, collections, fetchTimeout) {
128
130
  let error;
129
131
  let resolvedPds;
130
132
  let resolvedHandle = null;
133
+ let resolvedSince = null;
131
134
  let retryCount;
132
135
  let retryAfter;
133
136
  const controller = new AbortController();
@@ -137,26 +140,67 @@ export async function backfillRepo(did, collections, fetchTimeout) {
137
140
  resolvedPds = pdsUrl;
138
141
  resolvedHandle = handle;
139
142
  timeout = setTimeout(() => controller.abort(), fetchTimeout * 1000);
140
- const res = await fetch(`${resolvedPds}/xrpc/com.atproto.sync.getRepo?did=${encodeURIComponent(did)}`, {
141
- signal: controller.signal,
142
- });
143
+ let lastRev = await getRepoRev(did);
144
+ const baseUrl = `${resolvedPds}/xrpc/com.atproto.sync.getRepo?did=${encodeURIComponent(did)}`;
145
+ let repoUrl = lastRev ? `${baseUrl}&since=${encodeURIComponent(lastRev)}` : baseUrl;
146
+ let res = await fetch(repoUrl, { signal: controller.signal });
147
+ // If the PDS rejected our `since` rev (compacted history), fall back to full import
148
+ if (res.status === 400 && lastRev) {
149
+ lastRev = null;
150
+ res = await fetch(baseUrl, { signal: controller.signal });
151
+ }
143
152
  if (!res.ok) {
144
153
  const httpErr = new Error(`getRepo failed for ${did}: ${res.status}`);
145
154
  httpErr.httpStatus = res.status;
146
155
  throw httpErr;
147
156
  }
148
- let carBytes = new Uint8Array(await res.arrayBuffer());
149
- carSizeBytes = carBytes.length;
150
- let { roots, blocks } = parseCarFrame(carBytes);
151
- carBytes = null; // free CAR bytes before bulk insert
152
- // Decode commit to get MST root
153
- const rootData = blocks.get(roots[0]);
157
+ resolvedSince = lastRev;
158
+ let { roots, blocks, byteLength } = await parseCarStream(res.body);
159
+ carSizeBytes = byteLength;
160
+ // Decode commit to get MST root — if the diff CAR is missing the root block,
161
+ // fall back to a full import (the PDS compacted past our `since` rev)
162
+ let rootData = blocks.get(roots[0]);
163
+ if (!rootData && lastRev) {
164
+ lastRev = null;
165
+ resolvedSince = null;
166
+ res = await fetch(baseUrl, { signal: controller.signal });
167
+ if (!res.ok) {
168
+ const httpErr = new Error(`getRepo failed for ${did}: ${res.status}`);
169
+ httpErr.httpStatus = res.status;
170
+ throw httpErr;
171
+ }
172
+ ;
173
+ ({ roots, blocks, byteLength } = await parseCarStream(res.body));
174
+ carSizeBytes = byteLength;
175
+ rootData = blocks.get(roots[0]);
176
+ }
154
177
  if (!rootData)
155
178
  throw new Error(`No root block for ${did}`);
156
179
  const { value: commit } = cborDecode(rootData);
157
180
  // Walk MST to find all record paths
158
181
  const entries = walkMst(blocks, commit.data.$link);
159
- const bulk = [];
182
+ // Delete existing records for this DID before re-importing so deletions are reflected
183
+ // Only on full imports (no since) — diff CARs only contain changes
184
+ if (!lastRev) {
185
+ for (const col of collections) {
186
+ const schema = getSchema(col);
187
+ if (!schema)
188
+ continue;
189
+ await runSQL(`DELETE FROM ${schema.tableName} WHERE did = $1`, [did]);
190
+ for (const child of schema.children) {
191
+ await runSQL(`DELETE FROM ${child.tableName} WHERE parent_did = $1`, [did]);
192
+ }
193
+ for (const union of schema.unions) {
194
+ for (const branch of union.branches) {
195
+ await runSQL(`DELETE FROM ${branch.tableName} WHERE parent_did = $1`, [did]);
196
+ }
197
+ }
198
+ }
199
+ }
200
+ // Insert records in chunks to limit memory usage
201
+ const CHUNK_SIZE = 1000;
202
+ let chunk = [];
203
+ const validationSkips = {};
160
204
  for (const entry of entries) {
161
205
  const collection = entry.path.split('/')[0];
162
206
  if (!collections.has(collection))
@@ -164,13 +208,23 @@ export async function backfillRepo(did, collections, fetchTimeout) {
164
208
  const blockData = blocks.get(entry.cid);
165
209
  if (!blockData)
166
210
  continue;
211
+ blocks.delete(entry.cid); // free block data as we go
167
212
  try {
168
213
  const { value: record } = cborDecode(blockData);
169
214
  if (!record?.$type)
170
215
  continue;
171
216
  const rkey = entry.path.split('/').slice(1).join('/');
172
217
  const uri = `at://${did}/${collection}/${rkey}`;
173
- bulk.push({ collection, uri, cid: entry.cid, did, record });
218
+ const validationError = validateRecord(getLexiconArray(), collection, record);
219
+ if (validationError) {
220
+ validationSkips[collection] = (validationSkips[collection] || 0) + 1;
221
+ continue;
222
+ }
223
+ chunk.push({ collection, uri, cid: entry.cid, did, record });
224
+ if (chunk.length >= CHUNK_SIZE) {
225
+ count += await bulkInsertRecords(chunk);
226
+ chunk = [];
227
+ }
174
228
  }
175
229
  catch (recordErr) {
176
230
  emit('backfill', 'record_error', {
@@ -181,23 +235,13 @@ export async function backfillRepo(did, collections, fetchTimeout) {
181
235
  });
182
236
  }
183
237
  }
184
- blocks = null; // free block map before bulk insert
185
- // Delete existing records for this DID before re-importing so deletions are reflected
186
- for (const col of collections) {
187
- const schema = getSchema(col);
188
- if (!schema)
189
- continue;
190
- await runSQL(`DELETE FROM ${schema.tableName} WHERE did = $1`, did);
191
- for (const child of schema.children) {
192
- await runSQL(`DELETE FROM ${child.tableName} WHERE parent_did = $1`, did);
193
- }
194
- for (const union of schema.unions) {
195
- for (const branch of union.branches) {
196
- await runSQL(`DELETE FROM ${branch.tableName} WHERE parent_did = $1`, did);
197
- }
198
- }
238
+ if (chunk.length > 0) {
239
+ count += await bulkInsertRecords(chunk);
240
+ }
241
+ const totalSkips = Object.values(validationSkips).reduce((a, b) => a + b, 0);
242
+ if (totalSkips > 0) {
243
+ emit('backfill', 'validation_skips', { did, total: totalSkips, by_collection: validationSkips });
199
244
  }
200
- count = await bulkInsertRecords(bulk);
201
245
  await setRepoStatus(did, 'active', commit.rev, { handle });
202
246
  return count;
203
247
  }
@@ -229,6 +273,8 @@ export async function backfillRepo(did, collections, fetchTimeout) {
229
273
  error,
230
274
  pds_url: resolvedPds,
231
275
  car_size_bytes: carSizeBytes,
276
+ import_mode: carSizeBytes !== undefined ? (resolvedSince ? 'diff' : 'full') : undefined,
277
+ since_rev: resolvedSince,
232
278
  retry_count: retryCount,
233
279
  retry_after: retryAfter,
234
280
  permanent_failure: retryCount === 999 ? true : undefined,
@@ -354,7 +400,7 @@ export async function runBackfill(opts) {
354
400
  parallelism: config.parallelism,
355
401
  status: 'success',
356
402
  });
357
- return;
403
+ return 0;
358
404
  }
359
405
  // 3. Backfill with worker pool
360
406
  let totalRecords = 0;
@@ -378,7 +424,7 @@ export async function runBackfill(opts) {
378
424
  retryRound++;
379
425
  // Wait until the earliest retry_after has passed
380
426
  const now = Math.floor(Date.now() / 1000);
381
- const rows = await querySQL(`SELECT MIN(retry_after) as earliest FROM _repos WHERE status = 'failed' AND retry_after > $1 AND retry_count < $2`, [now, maxRetries]);
427
+ const rows = (await querySQL(`SELECT MIN(retry_after) as earliest FROM _repos WHERE status = 'failed' AND retry_after > $1 AND retry_count < $2`, [now, maxRetries]));
382
428
  const earliest = rows[0]?.earliest ? Number(rows[0].earliest) : 0;
383
429
  if (earliest > now) {
384
430
  await new Promise((resolve) => setTimeout(resolve, (earliest - now) * 1000));
@@ -412,4 +458,5 @@ export async function runBackfill(opts) {
412
458
  retry_rounds: retryRound,
413
459
  status: failedCount > 0 ? 'partial' : 'success',
414
460
  });
461
+ return totalRecords;
415
462
  }
package/dist/car.d.ts CHANGED
@@ -12,20 +12,52 @@
12
12
  * @module
13
13
  */
14
14
  /**
15
- * Parses a CARv1 binary frame into its root CIDs and block map.
15
+ * A memory-efficient block map that stores byte offsets into the original CAR
16
+ * buffer instead of copying block data. Implements the same `get`/`delete`/`size`
17
+ * interface as `Map<string, Uint8Array>` so it can be used as a drop-in replacement.
18
+ */
19
+ export declare class LazyBlockMap {
20
+ private offsets;
21
+ private carBytes;
22
+ constructor(carBytes: Uint8Array, offsets: Map<string, [number, number]>);
23
+ get(cid: string): Uint8Array | undefined;
24
+ delete(cid: string): boolean;
25
+ get size(): number;
26
+ [Symbol.iterator](): IterableIterator<[string, Uint8Array]>;
27
+ /** Release the underlying CAR buffer */
28
+ free(): void;
29
+ }
30
+ /**
31
+ * Parses a CARv1 stream incrementally from a `ReadableStream`.
32
+ *
33
+ * Instead of buffering the entire CAR into a single ArrayBuffer, this reads
34
+ * chunks from the stream and parses blocks as they arrive. Each block's data
35
+ * is `.slice()`d into its own small `Uint8Array`, allowing V8 to GC individual
36
+ * blocks as they're consumed during the MST walk.
37
+ *
38
+ * This is critical for backfill where multiple workers download 30-90MB CARs
39
+ * concurrently — buffered downloads cause OOMs because `ArrayBuffer` memory
40
+ * is "external" to V8's heap and not controlled by `--max-old-space-size`.
41
+ *
42
+ * @param body - The response body stream (e.g. `res.body` from `fetch()`)
43
+ * @returns `roots` — root CID strings; `blocks` — map of CID → block data; `byteLength` — total bytes read
44
+ */
45
+ export declare function parseCarStream(body: ReadableStream<Uint8Array>): Promise<{
46
+ roots: string[];
47
+ blocks: Map<string, Uint8Array>;
48
+ byteLength: number;
49
+ }>;
50
+ /**
51
+ * Parses a CARv1 binary frame into its root CIDs and a lazy block map.
52
+ *
53
+ * The block map stores byte offsets into `carBytes` rather than copying data,
54
+ * reducing heap usage from O(total block bytes) to O(number of blocks * 16 bytes).
16
55
  *
17
56
  * @param carBytes - Raw CAR file bytes (e.g. from `getRepo` or a firehose commit)
18
- * @returns `roots` — ordered list of root CID strings; `blocks` — map of CID string → raw block data
19
- *
20
- * @example
21
- * ```ts
22
- * const car = new Uint8Array(await res.arrayBuffer())
23
- * const { roots, blocks } = parseCarFrame(car)
24
- * const commitData = blocks.get(roots[0])
25
- * ```
57
+ * @returns `roots` — ordered list of root CID strings; `blocks` — lazy block map
26
58
  */
27
59
  export declare function parseCarFrame(carBytes: Uint8Array): {
28
60
  roots: string[];
29
- blocks: Map<string, Uint8Array>;
61
+ blocks: LazyBlockMap;
30
62
  };
31
63
  //# sourceMappingURL=car.d.ts.map
package/dist/car.d.ts.map CHANGED
@@ -1 +1 @@
1
- {"version":3,"file":"car.d.ts","sourceRoot":"","sources":["../src/car.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;GAYG;AAuCH;;;;;;;;;;;;GAYG;AACH,wBAAgB,aAAa,CAAC,QAAQ,EAAE,UAAU,GAAG;IACnD,KAAK,EAAE,MAAM,EAAE,CAAA;IACf,MAAM,EAAE,GAAG,CAAC,MAAM,EAAE,UAAU,CAAC,CAAA;CAChC,CAmCA"}
1
+ {"version":3,"file":"car.d.ts","sourceRoot":"","sources":["../src/car.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;GAYG;AAuCH;;;;GAIG;AACH,qBAAa,YAAY;IACvB,OAAO,CAAC,OAAO,CAA+B;IAC9C,OAAO,CAAC,QAAQ,CAAmB;gBAEvB,QAAQ,EAAE,UAAU,EAAE,OAAO,EAAE,GAAG,CAAC,MAAM,EAAE,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IAKxE,GAAG,CAAC,GAAG,EAAE,MAAM,GAAG,UAAU,GAAG,SAAS;IAMxC,MAAM,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO;IAI5B,IAAI,IAAI,IAAI,MAAM,CAEjB;IAEA,CAAC,MAAM,CAAC,QAAQ,CAAC,IAAI,gBAAgB,CAAC,CAAC,MAAM,EAAE,UAAU,CAAC,CAAC;IAO5D,wCAAwC;IACxC,IAAI,IAAI,IAAI;CAIb;AAED;;;;;;;;;;;;;;GAcG;AACH,wBAAsB,cAAc,CAAC,IAAI,EAAE,cAAc,CAAC,UAAU,CAAC,GAAG,OAAO,CAAC;IAC9E,KAAK,EAAE,MAAM,EAAE,CAAA;IACf,MAAM,EAAE,GAAG,CAAC,MAAM,EAAE,UAAU,CAAC,CAAA;IAC/B,UAAU,EAAE,MAAM,CAAA;CACnB,CAAC,CAsGD;AAED;;;;;;;;GAQG;AACH,wBAAgB,aAAa,CAAC,QAAQ,EAAE,UAAU,GAAG;IACnD,KAAK,EAAE,MAAM,EAAE,CAAA;IACf,MAAM,EAAE,YAAY,CAAA;CACrB,CAiCA"}
package/dist/car.js CHANGED
@@ -42,17 +42,158 @@ function parseCidFromBytes(bytes, offset) {
42
42
  return [bytes.slice(offset, pos), pos];
43
43
  }
44
44
  /**
45
- * Parses a CARv1 binary frame into its root CIDs and block map.
45
+ * A memory-efficient block map that stores byte offsets into the original CAR
46
+ * buffer instead of copying block data. Implements the same `get`/`delete`/`size`
47
+ * interface as `Map<string, Uint8Array>` so it can be used as a drop-in replacement.
48
+ */
49
export class LazyBlockMap {
    offsets;
    carBytes;
    /**
     * @param carBytes - Buffer holding the CAR data that the offsets point into.
     * @param offsets - Map of CID string → `[start, end]` byte range within `carBytes`.
     */
    constructor(carBytes, offsets) {
        this.offsets = offsets;
        this.carBytes = carBytes;
    }
    /** Return a view (not a copy) of the block's bytes, or `undefined` if unknown or freed. */
    get(cid) {
        const source = this.carBytes;
        const span = this.offsets.get(cid);
        if (span && source) {
            const [start, end] = span;
            return source.subarray(start, end);
        }
        return undefined;
    }
    /** Forget a block's offsets; returns whether the CID was present. */
    delete(cid) {
        const removed = this.offsets.delete(cid);
        return removed;
    }
    /** Number of blocks still indexed. */
    get size() {
        const { size } = this.offsets;
        return size;
    }
    /** Yield `[cid, bytes]` pairs as views into the buffer; stops once the buffer is released. */
    *[Symbol.iterator]() {
        for (const entry of this.offsets) {
            const source = this.carBytes;
            if (!source)
                return;
            const [cid, [start, end]] = entry;
            yield [cid, source.subarray(start, end)];
        }
    }
    /** Release the underlying CAR buffer */
    free() {
        this.offsets.clear();
        this.carBytes = null;
    }
}
81
+ /**
82
+ * Parses a CARv1 stream incrementally from a `ReadableStream`.
46
83
  *
47
- * @param carBytes - Raw CAR file bytes (e.g. from `getRepo` or a firehose commit)
48
- * @returns `roots` ordered list of root CID strings; `blocks` map of CID string → raw block data
84
+ * Instead of buffering the entire CAR into a single ArrayBuffer, this reads
85
+ * chunks from the stream and parses blocks as they arrive. Each block's data
86
+ * is `.slice()`d into its own small `Uint8Array`, allowing V8 to GC individual
87
+ * blocks as they're consumed during the MST walk.
49
88
  *
50
- * @example
51
- * ```ts
52
- * const car = new Uint8Array(await res.arrayBuffer())
53
- * const { roots, blocks } = parseCarFrame(car)
54
- * const commitData = blocks.get(roots[0])
55
- * ```
89
+ * This is critical for backfill where multiple workers download 30-90MB CARs
90
+ * concurrently — buffered downloads cause OOMs because `ArrayBuffer` memory
91
+ * is "external" to V8's heap and not controlled by `--max-old-space-size`.
92
+ *
93
+ * @param body - The response body stream (e.g. `res.body` from `fetch()`)
94
+ * @returns `roots` — root CID strings; `blocks` — map of CID → block data; `byteLength` — total bytes read
95
+ */
96
export async function parseCarStream(body) {
    // Reads the CAR from `body` chunk by chunk. Resolves with the root CIDs, a
    // Map of CID string → copied block bytes, and the total number of bytes
    // read. Throws on an empty stream, a truncated header/block, or a malformed
    // varint/block; on any failure the stream is cancelled and the reader lock
    // released so the underlying connection is not left dangling.
    const reader = body.getReader();
    // Growable buffer with position tracking. We reuse a single allocation and
    // compact (shift data to front) when the read position passes the midpoint,
    // avoiding per-chunk allocations and subarray references that pin old memory.
    let buf = new Uint8Array(64 * 1024);
    let pos = 0; // read cursor
    let len = 0; // bytes of valid data in buf
    let byteLength = 0;
    // Ensure at least `need` bytes are available at buf[pos..pos+need).
    // Returns false when the stream ends before that many bytes arrive.
    async function fill(need) {
        while (len - pos < need) {
            const { done, value } = await reader.read();
            if (done)
                return len - pos >= need;
            byteLength += value.length;
            // Compact: shift remaining data to front when read cursor passes midpoint
            if (pos > 0 && pos > buf.length >>> 1) {
                buf.copyWithin(0, pos, len);
                len -= pos;
                pos = 0;
            }
            // Grow if needed
            const required = len + value.length;
            if (required > buf.length) {
                const newBuf = new Uint8Array(Math.max(required, buf.length * 2));
                newBuf.set(buf.subarray(0, len));
                buf = newBuf;
            }
            buf.set(value, len);
            len += value.length;
        }
        return true;
    }
    function consume(n) {
        pos += n;
    }
    // Read an unsigned varint starting at buf[pos]; returns [value, bytesUsed].
    // Uses multiplication instead of `<<`/`|=`: bitwise operators work on 32-bit
    // signed integers, so a 5- or 6-byte varint would wrap or turn negative and
    // a negative length would move the read cursor backwards.
    function readVarintFromBuf() {
        let value = 0;
        let scale = 1;
        let p = pos;
        while (p < len) {
            const byte = buf[p++];
            value += (byte & 0x7f) * scale;
            if ((byte & 0x80) === 0)
                return [value, p - pos];
            scale *= 128;
            // Same limit as before: at most 6 bytes (42 bits), well within safe integers
            if (p - pos > 5)
                throw new Error('Varint too long');
        }
        throw new Error('Unexpected end of varint');
    }
    try {
        // Parse header: varint(headerLen) + CBOR(header)
        if (!(await fill(1)))
            throw new Error('Empty CAR stream');
        // Prefetch up to 10 bytes for the varint; readVarintFromBuf bounds to `len`
        await fill(10);
        const [headerLen, headerVarintSize] = readVarintFromBuf();
        consume(headerVarintSize);
        if (!(await fill(headerLen)))
            throw new Error('Truncated CAR header');
        // .slice() copies out of the reusable buffer
        const headerSlice = buf.slice(pos, pos + headerLen);
        const { value: header } = cborDecode(headerSlice);
        consume(headerLen);
        const roots = (header.roots || []).map((root) => root?.$link ?? cidToString(root));
        // Parse blocks: each is varint(len) + CID + data
        const blocks = new Map();
        while (true) {
            if (!(await fill(1)))
                break;
            // Prefetch up to 10 bytes for the varint; readVarintFromBuf bounds to `len`
            await fill(10);
            const [blockLen, blockVarintSize] = readVarintFromBuf();
            consume(blockVarintSize);
            if (blockLen === 0)
                break;
            if (!(await fill(blockLen)))
                throw new Error('Truncated CAR block');
            const blockEnd = pos + blockLen;
            const [cidBytes, afterCid] = parseCidFromBytes(buf, pos);
            // The CID must fit inside the declared block; otherwise it was
            // parsed from bytes belonging to the next block (or stale buffer data)
            if (afterCid > blockEnd)
                throw new Error('Malformed CAR block: CID exceeds block length');
            const cid = cidToString(cidBytes);
            // .slice() creates an independent copy — the buffer can be reused
            blocks.set(cid, buf.slice(afterCid, blockEnd));
            consume(blockLen);
        }
        return { roots, blocks, byteLength };
    }
    catch (err) {
        // Best-effort: tell the source to stop producing. Cancel can itself
        // reject (e.g. the stream already errored); that is ignored on purpose
        // because the parse/read error below is the one callers need.
        await reader.cancel(err).catch(() => { });
        throw err;
    }
    finally {
        reader.releaseLock();
        // Release the internal buffer
        buf = null;
    }
}
189
+ /**
190
+ * Parses a CARv1 binary frame into its root CIDs and a lazy block map.
191
+ *
192
+ * The block map stores byte offsets into `carBytes` rather than copying data,
193
+ * reducing heap usage from O(total block bytes) to O(number of blocks * 16 bytes).
194
+ *
195
+ * @param carBytes - Raw CAR file bytes (e.g. from `getRepo` or a firehose commit)
196
+ * @returns `roots` — ordered list of root CID strings; `blocks` — lazy block map
56
197
  */
57
198
  export function parseCarFrame(carBytes) {
58
199
  let offset = 0;
@@ -66,8 +207,8 @@ export function parseCarFrame(carBytes) {
66
207
  // Our CBOR decoder converts tag-42 CIDs to { $link: "b..." } objects,
67
208
  // so roots may already be decoded strings
68
209
  const roots = (header.roots || []).map((root) => root?.$link ?? cidToString(root));
69
- // Parse blocks: each is varint(len) + CID + data
70
- const blocks = new Map();
210
+ // Build offset index: CID [start, end] into carBytes
211
+ const offsets = new Map();
71
212
  while (offset < carBytes.length) {
72
213
  const [blockLen, afterBlockLen] = readVarint(carBytes, offset);
73
214
  offset = afterBlockLen;
@@ -76,9 +217,8 @@ export function parseCarFrame(carBytes) {
76
217
  const [cidBytes, afterCid] = parseCidFromBytes(carBytes, offset);
77
218
  const cid = cidToString(cidBytes);
78
219
  const dataLen = blockLen - (afterCid - offset);
79
- const data = carBytes.slice(afterCid, afterCid + dataLen);
80
- blocks.set(cid, data);
220
+ offsets.set(cid, [afterCid, afterCid + dataLen]);
81
221
  offset = afterCid + dataLen;
82
222
  }
83
- return { roots, blocks };
223
+ return { roots, blocks: new LazyBlockMap(carBytes, offsets) };
84
224
  }