@indigoai-us/hq-cloud 5.46.0 → 5.47.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. package/dist/bin/sync-runner.d.ts +12 -0
  2. package/dist/bin/sync-runner.d.ts.map +1 -1
  3. package/dist/bin/sync-runner.js +39 -0
  4. package/dist/bin/sync-runner.js.map +1 -1
  5. package/dist/bin/sync-runner.test.js +27 -1
  6. package/dist/bin/sync-runner.test.js.map +1 -1
  7. package/dist/cli/share.d.ts.map +1 -1
  8. package/dist/cli/share.js +17 -2
  9. package/dist/cli/share.js.map +1 -1
  10. package/dist/cli/share.test.js +2 -0
  11. package/dist/cli/share.test.js.map +1 -1
  12. package/dist/cli/sync-scope.test.js +1 -0
  13. package/dist/cli/sync-scope.test.js.map +1 -1
  14. package/dist/cli/sync.d.ts.map +1 -1
  15. package/dist/cli/sync.js +11 -1
  16. package/dist/cli/sync.js.map +1 -1
  17. package/dist/cli/sync.test.js +1 -0
  18. package/dist/cli/sync.test.js.map +1 -1
  19. package/dist/object-io.d.ts +218 -0
  20. package/dist/object-io.d.ts.map +1 -0
  21. package/dist/object-io.js +588 -0
  22. package/dist/object-io.js.map +1 -0
  23. package/dist/object-io.test.d.ts +11 -0
  24. package/dist/object-io.test.d.ts.map +1 -0
  25. package/dist/object-io.test.js +568 -0
  26. package/dist/object-io.test.js.map +1 -0
  27. package/dist/s3.d.ts +37 -0
  28. package/dist/s3.d.ts.map +1 -1
  29. package/dist/s3.js +207 -198
  30. package/dist/s3.js.map +1 -1
  31. package/dist/vault-client.d.ts +68 -0
  32. package/dist/vault-client.d.ts.map +1 -1
  33. package/dist/vault-client.js +35 -0
  34. package/dist/vault-client.js.map +1 -1
  35. package/package.json +1 -1
  36. package/scripts/presign-transport-e2e.mjs +203 -0
  37. package/scripts/vault-rebaseline.sh +275 -0
  38. package/scripts/vault-rescue.sh +8 -0
  39. package/src/bin/sync-runner.test.ts +41 -0
  40. package/src/bin/sync-runner.ts +52 -0
  41. package/src/cli/share.test.ts +2 -0
  42. package/src/cli/share.ts +29 -2
  43. package/src/cli/sync-scope.test.ts +1 -0
  44. package/src/cli/sync.test.ts +1 -0
  45. package/src/cli/sync.ts +22 -1
  46. package/src/object-io.test.ts +663 -0
  47. package/src/object-io.ts +782 -0
  48. package/src/s3.ts +259 -233
  49. package/src/vault-client.ts +101 -0
@@ -0,0 +1,782 @@
1
+ /**
2
+ * ObjectIO — transport seam for vault object byte/metadata movement.
3
+ *
4
+ * s3.ts holds the *semantics* of sync (symlink-record encoding, mode/mtime
5
+ * stamping, created-at preservation, directory-marker filtering). Those never
6
+ * change. What CAN change is the *wire transport* underneath them:
7
+ *
8
+ * - `S3SdkObjectIO` — the historical path. STS-vended credentials + the AWS
9
+ * S3 SDK talking directly to the per-company bucket. No policy-size
10
+ * ceiling concern for the BYTES, but the STS session policy that grants
11
+ * access has the 2048-char IAM limit that motivated the presigned model.
12
+ *
13
+ * - `PresignObjectIO` — the presigned-URL path. The vault-service decides
14
+ * access as a runtime DDB check (no IAM policy ceiling) and hands back
15
+ * short-lived presigned GET/PUT/DELETE URLs + (for PUT) the exact headers
16
+ * to replay. The client never holds AWS credentials — it just `fetch`es
17
+ * the signed URLs.
18
+ *
19
+ * The seam is a per-EntityContext factory resolved INSIDE s3.ts, so every
20
+ * existing call site (`uploadFile(ctx, …)`, `downloadFile(ctx, …)`, …) keeps
21
+ * its signature. `runRunner` selects the transport once per session via
22
+ * {@link setObjectIOFactory}; absent any selection the default is the S3 SDK,
23
+ * preserving today's behavior for every non-gated caller.
24
+ */
25
+
26
+ import {
27
+ S3Client,
28
+ PutObjectCommand,
29
+ GetObjectCommand,
30
+ ListObjectsV2Command,
31
+ DeleteObjectCommand,
32
+ HeadObjectCommand,
33
+ } from "@aws-sdk/client-s3";
34
+ import type { EntityContext } from "./types.js";
35
+ import type {
36
+ PresignOp,
37
+ PresignKeyInput,
38
+ PresignResultRow,
39
+ VaultListedObject,
40
+ } from "./vault-client.js";
41
+ import { VaultClientError } from "./vault-client.js";
42
+
43
+ /**
44
+ * The slice of {@link VaultClient} the presigned transport needs. Narrowed to
45
+ * just `presign` + `listFiles` so the factory accepts any caller that exposes
46
+ * those two (the real VaultClient, or a stub in tests) without depending on
47
+ * the full 20-method surface.
48
+ */
49
+ export interface PresignTransportClient {
50
+ presign(input: {
51
+ companyUid: string;
52
+ op?: PresignOp;
53
+ expiresIn?: number;
54
+ keys: PresignKeyInput[];
55
+ }): Promise<{ results: PresignResultRow[]; expiresAt: string }>;
56
+ listFiles(
57
+ companyUid: string,
58
+ prefix?: string,
59
+ cursor?: string,
60
+ ): Promise<{
61
+ objects: VaultListedObject[];
62
+ cursor: string | null;
63
+ truncated: boolean;
64
+ }>;
65
+ }
66
+
67
+ // ---------------------------------------------------------------------------
68
+ // Wire-primitive shapes
69
+ // ---------------------------------------------------------------------------
70
+
71
+ export interface PutObjectInput {
72
+ key: string;
73
+ body: Buffer;
74
+ contentType: string;
75
+ /** S3 user metadata (x-amz-meta-*). Lowercased keys by convention. */
76
+ metadata?: Record<string, string>;
77
+ }
78
+
79
+ export interface GetObjectResult {
80
+ body: Buffer;
81
+ /** S3 user metadata (keys lowercased by S3). */
82
+ metadata?: Record<string, string>;
83
+ }
84
+
85
+ export interface ListObjectsInput {
86
+ prefix?: string;
87
+ continuationToken?: string;
88
+ }
89
+
90
+ export interface ListedRemoteObject {
91
+ key: string;
92
+ size: number;
93
+ lastModified: Date;
94
+ etag: string;
95
+ }
96
+
97
+ export interface ListObjectsResult {
98
+ objects: ListedRemoteObject[];
99
+ /** Opaque cursor for the next page; undefined when the listing is exhausted. */
100
+ nextContinuationToken?: string;
101
+ }
102
+
103
+ export interface HeadObjectResult {
104
+ lastModified: Date;
105
+ etag: string;
106
+ size: number;
107
+ metadata?: Record<string, string>;
108
+ }
109
+
110
+ /**
111
+ * The minimal byte/metadata transport s3.ts needs. Deliberately narrow — no
112
+ * symlink, mode, or created-at concepts leak in here; those live one layer up
113
+ * in s3.ts and compose on top of these five primitives.
114
+ */
115
+ export interface ObjectIO {
116
+ putObject(input: PutObjectInput): Promise<{ etag: string }>;
117
+ getObject(key: string): Promise<GetObjectResult>;
118
+ listObjects(input: ListObjectsInput): Promise<ListObjectsResult>;
119
+ deleteObject(key: string): Promise<void>;
120
+ /** Null when the key does not exist (404 / 403-as-absent). */
121
+ headObject(key: string): Promise<HeadObjectResult | null>;
122
+ /**
123
+ * Optional batch pre-mint. Warms an internal URL cache for `keys` under `op`
124
+ * so subsequent per-key get/head (and, when primed, put/delete) calls reuse a
125
+ * pre-signed URL instead of issuing one presign request each. This is what
126
+ * turns an N-file sync from N presign calls into ceil(N/chunk) — the
127
+ * difference between staying under and blowing past the 100-req/hr limit on a
128
+ * bulk pull. The S3 SDK transport has no presign step and omits this (the
129
+ * per-call cost there is the SDK request itself, not a separate presign).
130
+ * Best-effort: a failed chunk or per-key denial simply leaves those keys
131
+ * uncached, and the per-key call falls back to a single presign.
132
+ */
133
+ prime?(op: PresignOp, keys: PresignKeyInput[]): Promise<void>;
134
+ /**
135
+ * True if a live primed PUT URL exists for `key`. Lets uploadFile/uploadSymlink
136
+ * skip recomputing metadata + the created-at HEAD when a `prime("put", …)`
137
+ * pre-pass already signed the metadata into the cached URL (the upload just
138
+ * sends the body, replaying the cached headers). Absent (undefined) on the S3
139
+ * SDK transport → callers take their normal compute-metadata path.
140
+ */
141
+ hasPrimedPut?(key: string): boolean;
142
+ }
143
+
144
+ // ---------------------------------------------------------------------------
145
+ // S3 SDK transport (default)
146
+ // ---------------------------------------------------------------------------
147
+
148
/**
 * Drop at most one leading and one trailing double quote from an ETag.
 * A missing or empty ETag yields the empty string.
 */
function stripQuotes(etag: string | undefined): string {
  if (!etag) {
    return "";
  }
  return etag.replace(/^"|"$/g, "");
}
151
+
152
/**
 * Collect every chunk of an async byte stream into a single Buffer.
 * An absent body produces a zero-length Buffer.
 */
async function drainToBuffer(
  body: AsyncIterable<Uint8Array> | undefined,
): Promise<Buffer> {
  const parts: Buffer[] = [];
  if (body) {
    for await (const piece of body) {
      parts.push(Buffer.from(piece));
    }
  }
  // Buffer.concat([]) is an empty Buffer, so the no-body case needs no special return.
  return Buffer.concat(parts);
}
162
+
163
/**
 * ObjectIO backed by the AWS S3 SDK, authenticated with the credentials
 * carried on the EntityContext. Each instance builds its own S3Client from the
 * context it is given, so whatever credentials the context holds at
 * construction time are the ones used; caching or refreshing them is left to
 * the caller (see the original buildClient note in s3.ts).
 */
export class S3SdkObjectIO implements ObjectIO {
  private readonly client: S3Client;
  private readonly bucket: string;

  constructor(ctx: EntityContext) {
    const { accessKeyId, secretAccessKey, sessionToken } = ctx.credentials;
    this.bucket = ctx.bucketName;
    this.client = new S3Client({
      region: ctx.region,
      credentials: { accessKeyId, secretAccessKey, sessionToken },
    });
  }

  /** Upload `input.body`; user metadata is sent only when at least one entry is present. */
  async putObject(input: PutObjectInput): Promise<{ etag: string }> {
    const { key, body, contentType, metadata } = input;
    const hasMetadata = !!metadata && Object.keys(metadata).length > 0;
    const command = new PutObjectCommand({
      Bucket: this.bucket,
      Key: key,
      Body: body,
      ContentType: contentType,
      ...(hasMetadata ? { Metadata: metadata } : {}),
    });
    const { ETag } = await this.client.send(command);
    return { etag: ETag || "" };
  }

  /** Download an object fully into memory. Throws when the response carries no body. */
  async getObject(key: string): Promise<GetObjectResult> {
    const command = new GetObjectCommand({ Bucket: this.bucket, Key: key });
    const { Body, Metadata } = await this.client.send(command);
    if (!Body) {
      throw new Error(`Empty response for ${key}`);
    }
    return {
      body: await drainToBuffer(Body as AsyncIterable<Uint8Array>),
      metadata: Metadata,
    };
  }

  /** Fetch one listing page; entries without a Key are dropped. */
  async listObjects(input: ListObjectsInput): Promise<ListObjectsResult> {
    const command = new ListObjectsV2Command({
      Bucket: this.bucket,
      Prefix: input.prefix,
      ContinuationToken: input.continuationToken,
    });
    const page = await this.client.send(command);
    const objects: ListedRemoteObject[] = [];
    for (const { Key, Size, LastModified, ETag } of page.Contents || []) {
      if (Key) {
        objects.push({
          key: Key,
          size: Size ?? 0,
          // A missing timestamp falls back to "now".
          lastModified: LastModified || new Date(),
          etag: ETag || "",
        });
      }
    }
    return { objects, nextContinuationToken: page.NextContinuationToken };
  }

  /** Delete a single object. */
  async deleteObject(key: string): Promise<void> {
    const command = new DeleteObjectCommand({ Bucket: this.bucket, Key: key });
    await this.client.send(command);
  }

  /**
   * Fetch object headers. Resolves to null only when the SDK error is named
   * "NotFound"; every other failure is rethrown unchanged.
   */
  async headObject(key: string): Promise<HeadObjectResult | null> {
    let head;
    try {
      head = await this.client.send(
        new HeadObjectCommand({ Bucket: this.bucket, Key: key }),
      );
    } catch (err: unknown) {
      const isMissing =
        typeof err === "object" &&
        err !== null &&
        "name" in err &&
        (err as { name?: string }).name === "NotFound";
      if (isMissing) {
        return null;
      }
      throw err;
    }
    return {
      lastModified: head.LastModified || new Date(),
      etag: head.ETag || "",
      size: head.ContentLength || 0,
      metadata: head.Metadata,
    };
  }
}
265
+
266
+ // ---------------------------------------------------------------------------
267
+ // Presigned-URL transport
268
+ // ---------------------------------------------------------------------------
269
+
270
/**
 * Build a user-metadata record from response headers: every header whose name
 * begins with `x-amz-meta-` is copied with that prefix removed; all other
 * headers are ignored. Header names arrive lowercased from `fetch`, so the
 * resulting keys are lowercase as well.
 */
function metaFromHeaders(headers: Headers): Record<string, string> {
  const PREFIX = "x-amz-meta-";
  const collected: Record<string, string> = {};
  headers.forEach((headerValue, headerName) => {
    if (!headerName.startsWith(PREFIX)) {
      return;
    }
    collected[headerName.slice(PREFIX.length)] = headerValue;
  });
  return collected;
}
285
+
286
/**
 * Return the first presign result row, or throw when it is unusable.
 *
 * @throws Error when `results` has no first row, or when that row carries an
 *   `error` or lacks a `url` (the message includes the row's `code` if set).
 */
function firstRowOrThrow(
  results: PresignResultRow[],
  key: string,
  op: string,
): PresignResultRow {
  const head = results[0];
  if (!head) {
    throw new Error(`presign ${op} returned no row for ${key}`);
  }
  if (!head.error && head.url) {
    return head;
  }
  const reason = head.error ?? "no url";
  const codeSuffix = head.code ? ` (${head.code})` : "";
  throw new Error(`presign ${op} denied for ${key}: ${reason}${codeSuffix}`);
}
304
+
305
/**
 * Create an Error whose `name` is "NotFound", so catch sites that test
 * `err.name === "NotFound"` handle a presigned 404 like an SDK 404.
 */
function notFoundError(key: string): Error {
  const err = new Error(`Not found: ${key}`);
  err.name = "NotFound";
  return err;
}
313
+
314
+ /**
315
+ * Max keys per presign request when priming — the server's hard batch cap
316
+ * (hq-pro files-presign MAX_BATCH_KEYS = 1000). One presign call costs ONE
317
+ * audit row toward the 100/hr limit regardless of how many keys it carries, so
318
+ * filling the batch is strictly better: a 5.5k-file pull is 6 presign calls at
319
+ * 1000/chunk vs 55 at 100. (The sibling list page size is fixed at 1000 by
320
+ * AWS's ListObjectsV2 MaxKeys cap — that one we can't raise.)
321
+ */
322
+ const PRIME_CHUNK = 1000;
323
+ /**
324
+ * Lifetime requested for primed URLs. Generous (30 min) so a whole sync batch
325
+ * completes within one prime, but well inside the presign Lambda role's
326
+ * credential lifetime (a presigned URL cannot outlive the creds that signed
327
+ * it). A batch that somehow outruns this falls back to per-key single presign.
328
+ */
329
+ const PRIME_EXPIRES_IN_SECONDS = 1800;
330
+ /** Concurrent prime chunks in flight (each is one presign HTTP call). */
331
+ const PRIME_CONCURRENCY = 4;
332
+ /** Treat a cached URL within this window of expiry as a miss (re-presign). */
333
+ const CACHE_SAFETY_MS = 60_000;
334
+
335
+ interface CacheEntry {
336
+ url: string;
337
+ headers?: Record<string, string>;
338
+ expiresAtMs: number;
339
+ }
340
+
341
/**
 * Raised when a presign mint is skipped or rejected because the vault rate
 * budget is exhausted. Its `name` is "RateLimited" so callers can distinguish
 * "deferred, retry next sync" from a genuine transfer failure; the key it
 * names was not synced.
 */
export class RateLimitedError extends Error {
  readonly key: string;
  readonly op: PresignOp;

  constructor(key: string, op: PresignOp, options?: { cause?: unknown }) {
    super(`rate limited (100/hr) — ${op} ${key} deferred to next sync`);
    this.key = key;
    this.op = op;
    this.name = "RateLimited";
    const cause = options?.cause;
    if (cause !== undefined) {
      // Assigned by hand rather than via the Error options argument.
      (this as { cause?: unknown }).cause = cause;
    }
  }
}
359
+
360
/**
 * One-way latch recording that the vault rate budget has been exhausted.
 * It starts untripped; {@link trip} sets it and nothing resets it. The
 * presigned transport checks it before minting an uncached URL so that, once
 * a 429 has been seen, further mints fail fast with {@link RateLimitedError}
 * instead of going to the wire, while already-primed URLs keep working.
 */
export class RateLimitBreaker {
  private tripped = false;

  /** Whether {@link trip} has been called on this breaker. */
  isTripped(): boolean {
    return this.tripped;
  }

  /** Latch the breaker; calling it again has no further effect. */
  trip(): void {
    if (!this.tripped) {
      this.tripped = true;
    }
  }
}
381
+
382
/** True only for a VaultClientError whose `statusCode` is 429 (rate budget exhausted). */
function isRateLimit(err: unknown): boolean {
  if (!(err instanceof VaultClientError)) {
    return false;
  }
  return err.statusCode === 429;
}
386
+
387
/**
 * Transport that moves bytes over short-lived presigned URLs minted by the
 * vault-service. Holds no AWS credentials. `companyUid` is the EntityContext's
 * `uid` — the server resolves the per-company bucket from it, so cross-company
 * reach is structurally impossible (same authority model as the list/presign
 * handlers).
 *
 * URL cache: {@link prime} batch-mints URLs into `urlCache` (keyed by op+key)
 * so the per-file get/head calls during a sync reuse them instead of issuing a
 * presign request each. A single instance is shared across all s3.ts calls for
 * one company within a run (see {@link presignObjectIOFactory} memoization), so
 * a prime before the transfer loop warms the cache the loop then drains.
 */
export class PresignObjectIO implements ObjectIO {
  /** Primed URLs keyed by {@link cacheKey}; only {@link prime} writes to it. */
  private readonly urlCache = new Map<string, CacheEntry>();

  constructor(
    private readonly vault: PresignTransportClient,
    private readonly companyUid: string,
    // Shared across a run's per-company instances by the factory; a directly
    // constructed instance gets its own (fine for tests / one-offs).
    private readonly breaker: RateLimitBreaker = new RateLimitBreaker(),
  ) {}

  /** Cache key for an op+key pair: the op immediately followed by the key. */
  private cacheKey(op: PresignOp, key: string): string {
    return `${op}${key}`;
  }

  /** True when a still-usable primed PUT URL is cached for `key`. */
  hasPrimedPut(key: string): boolean {
    return this.cached("put", key) !== undefined;
  }

  /**
   * A live cached URL for op+key, or undefined. An entry within
   * CACHE_SAFETY_MS of its expiry is evicted and reported as a miss.
   */
  private cached(op: PresignOp, key: string): CacheEntry | undefined {
    const hit = this.urlCache.get(this.cacheKey(op, key));
    if (!hit) return undefined;
    if (Date.now() >= hit.expiresAtMs - CACHE_SAFETY_MS) {
      this.urlCache.delete(this.cacheKey(op, key));
      return undefined;
    }
    return hit;
  }

  /**
   * Resolve a presigned URL (+ replay headers) for op+key: cache hit if primed,
   * else a single presign. Throws on per-key denial (matches the SDK path's
   * access error). `extra` carries PUT contentType/metadata on the miss path.
   *
   * @throws RateLimitedError when the breaker is already tripped and the key
   *   is uncached, or when the presign call itself fails with a 429 (which
   *   also trips the breaker).
   */
  private async resolveUrl(
    op: PresignOp,
    key: string,
    extra?: { contentType?: string; metadata?: Record<string, string> },
  ): Promise<{ url: string; headers?: Record<string, string> }> {
    const hit = this.cached(op, key);
    if (hit) return { url: hit.url, headers: hit.headers };
    // Primed URLs still serve once the breaker trips (no wire needed); only an
    // uncached key on an exhausted budget fails fast.
    if (this.breaker.isTripped()) throw new RateLimitedError(key, op);
    let results;
    try {
      ({ results } = await this.vault.presign({
        companyUid: this.companyUid,
        op,
        keys: [{ key, op, ...extra }],
      }));
    } catch (err) {
      if (isRateLimit(err)) {
        this.breaker.trip();
        throw new RateLimitedError(key, op, { cause: err });
      }
      throw err;
    }
    const row = firstRowOrThrow(results, key, op);
    return { url: row.url!, headers: row.headers };
  }

  /**
   * Batch-mint presigned URLs for `keys` under `op` and store them in the URL
   * cache. Keys are split into chunks of PRIME_CHUNK, processed by up to
   * PRIME_CONCURRENCY workers. Best-effort: never rejects because of a failed
   * chunk — a 429 trips the breaker and stops priming, any other chunk failure
   * is skipped, and rows with an error or no URL are simply not cached.
   */
  async prime(op: PresignOp, keys: PresignKeyInput[]): Promise<void> {
    if (keys.length === 0) return;
    const chunks: PresignKeyInput[][] = [];
    for (let i = 0; i < keys.length; i += PRIME_CHUNK) {
      chunks.push(keys.slice(i, i + PRIME_CHUNK));
    }
    // Shared cursor: each worker claims the next unclaimed chunk.
    let next = 0;
    const worker = async (): Promise<void> => {
      while (next < chunks.length) {
        // Budget already exhausted — stop priming; remaining keys defer.
        if (this.breaker.isTripped()) return;
        const chunk = chunks[next++];
        let resp;
        try {
          resp = await this.vault.presign({
            companyUid: this.companyUid,
            op,
            expiresIn: PRIME_EXPIRES_IN_SECONDS,
            keys: chunk.map((k) => ({ ...k, op })),
          });
        } catch (err) {
          if (isRateLimit(err)) {
            // Budget exhausted mid-prime: trip the breaker and stop. Priming
            // on means every remaining chunk + per-file fallback would 429 —
            // the spiral. Tripping makes the transfer loop fail fast instead.
            this.breaker.trip();
            return;
          }
          // A non-429 chunk failure just means those keys aren't cached — the
          // per-key call will single-presign. Never let priming fail the sync.
          continue;
        }
        const now = Date.now();
        for (const row of resp.results) {
          if (row.error || !row.url) continue;
          this.urlCache.set(this.cacheKey(op, row.key), {
            url: row.url,
            headers: row.headers,
            expiresAtMs: now + (row.expiresIn ?? PRIME_EXPIRES_IN_SECONDS) * 1000,
          });
        }
      }
    };
    await Promise.all(
      Array.from({ length: Math.min(PRIME_CONCURRENCY, chunks.length) }, worker),
    );
  }

  /**
   * Upload `input.body` through a presigned PUT URL.
   *
   * @throws Error when the PUT response is not ok (message carries the status
   *   and up to 200 characters of the response body).
   */
  async putObject(input: PutObjectInput): Promise<{ etag: string }> {
    const row = await this.resolveUrl("put", input.key, {
      contentType: input.contentType,
      ...(input.metadata && Object.keys(input.metadata).length > 0
        ? { metadata: input.metadata }
        : {}),
    });
    // The server signs Content-Type, SSE-KMS, and every x-amz-meta-* into the
    // signature and returns them in `headers`; they MUST be replayed verbatim
    // or SigV4 rejects the PUT.
    const res = await fetchWithRetry(
      row.url,
      { method: "PUT", body: input.body, headers: row.headers ?? {} },
      `presigned PUT ${input.key}`,
    );
    if (!res.ok) {
      const detail = await safeText(res);
      throw new Error(
        `presigned PUT failed for ${input.key}: ${res.status} ${detail}`,
      );
    }
    return { etag: stripQuotes(res.headers.get("etag") ?? undefined) };
  }

  /**
   * Download an object through a presigned GET URL.
   *
   * @throws an Error named "NotFound" on a 404; a plain Error on any other
   *   non-ok status.
   */
  async getObject(key: string): Promise<GetObjectResult> {
    const row = await this.resolveUrl("get", key);
    const res = await fetchWithRetry(row.url, { method: "GET" }, `presigned GET ${key}`);
    if (res.status === 404) {
      await cancelBody(res);
      throw notFoundError(key);
    }
    if (!res.ok) {
      const detail = await safeText(res);
      throw new Error(`presigned GET failed for ${key}: ${res.status} ${detail}`);
    }
    const body = Buffer.from(await res.arrayBuffer());
    return { body, metadata: metaFromHeaders(res.headers) };
  }

  /**
   * List one page of objects via the vault's `listFiles`. A missing
   * `lastModified` falls back to "now", a missing `etag` to "", and a null
   * cursor becomes an undefined continuation token.
   */
  async listObjects(input: ListObjectsInput): Promise<ListObjectsResult> {
    const { objects, cursor } = await this.vault.listFiles(
      this.companyUid,
      input.prefix,
      input.continuationToken,
    );
    return {
      objects: objects.map((o) => ({
        key: o.key,
        size: o.size,
        lastModified: o.lastModified ? new Date(o.lastModified) : new Date(),
        etag: o.etag ?? "",
      })),
      nextContinuationToken: cursor ?? undefined,
    };
  }

  /**
   * Delete an object through a presigned DELETE URL.
   *
   * @throws Error on a non-ok status other than 404.
   */
  async deleteObject(key: string): Promise<void> {
    const row = await this.resolveUrl("delete", key);
    const res = await fetchWithRetry(row.url, { method: "DELETE" }, `presigned DELETE ${key}`);
    // S3 DELETE is idempotent — a 204 (deleted) and a 404 (already gone) are
    // both success for the sync engine's purposes.
    if (!res.ok && res.status !== 404) {
      const detail = await safeText(res);
      throw new Error(
        `presigned DELETE failed for ${key}: ${res.status} ${detail}`,
      );
    }
  }

  /**
   * Read an object's headers. Resolves to null when presigning is denied for
   * the key or the fetch answers 404/403.
   *
   * @throws RateLimitedError when the key is uncached and the rate budget is
   *   exhausted; Error on any other non-ok status.
   */
  async headObject(key: string): Promise<HeadObjectResult | null> {
    // The presign endpoint has no HEAD op (get/put/delete only). A presigned
    // GET signs the GET method, so we issue a real GET and read only the
    // response headers, cancelling the body stream before it downloads — the
    // headers (etag, content-length, last-modified, x-amz-meta-*) are all we
    // need and arrive before the body. Cheap for the created-at-preservation
    // and conflict-detection call sites that use headObject. Reuses the GET
    // cache: a prime("get", …) before a pull warms these HEADs for free.
    let url: string;
    const hit = this.cached("get", key);
    if (hit) {
      url = hit.url;
    } else {
      if (this.breaker.isTripped()) throw new RateLimitedError(key, "get");
      let results;
      try {
        ({ results } = await this.vault.presign({
          companyUid: this.companyUid,
          op: "get",
          keys: [{ key, op: "get" }],
        }));
      } catch (err) {
        if (isRateLimit(err)) {
          this.breaker.trip();
          throw new RateLimitedError(key, "get", { cause: err });
        }
        throw err;
      }
      const row = results[0];
      if (!row || row.error || !row.url) {
        // A per-key denial here means the caller can't read the key — treat as
        // absent for HEAD semantics (the SDK path would 403, which callers map
        // to "no usable head"); they all tolerate null.
        return null;
      }
      url = row.url;
    }
    const res = await fetchWithRetry(url, { method: "GET" }, `presigned HEAD ${key}`);
    if (res.status === 404 || res.status === 403) {
      await cancelBody(res);
      return null;
    }
    if (!res.ok) {
      // Read the detail WITHOUT cancelling first: a cancelled body is
      // disturbed, so text() would reject and the detail would always be
      // empty. Reading the text consumes the body, which is all the cleanup
      // this branch needs.
      const detail = await safeText(res);
      throw new Error(`presigned HEAD failed for ${key}: ${res.status} ${detail}`);
    }
    const result: HeadObjectResult = {
      lastModified: parseLastModified(res.headers.get("last-modified")),
      etag: stripQuotes(res.headers.get("etag") ?? undefined),
      size: Number(res.headers.get("content-length") ?? "0"),
      metadata: metaFromHeaders(res.headers),
    };
    // Headers captured — drop the body instead of downloading it.
    await cancelBody(res);
    return result;
  }
}
637
+
638
/**
 * Best-effort read of a response body for error messages: at most the first
 * 200 characters, or "" when the body cannot be read.
 */
async function safeText(res: Response): Promise<string> {
  try {
    const raw = await res.text();
    return raw.slice(0, 200);
  } catch {
    return "";
  }
}
645
+
646
/**
 * Cancel a response's body stream if it has one. Any failure while cancelling
 * is ignored — this is cleanup only and never rejects.
 */
async function cancelBody(res: Response): Promise<void> {
  try {
    const stream = res.body;
    await (stream == null ? undefined : stream.cancel());
  } catch {
    // Best-effort — the socket is released either way once GC'd.
  }
}
653
+
654
/**
 * Parse a Last-Modified header value. A missing, empty, or unparseable value
 * falls back to the current time.
 */
function parseLastModified(value: string | null): Date {
  const parsed = value ? new Date(value) : undefined;
  if (parsed === undefined || Number.isNaN(parsed.getTime())) {
    return new Date();
  }
  return parsed;
}
659
+
660
+ // ---------------------------------------------------------------------------
661
+ // Transient-failure retry for the presigned-URL fetches
662
+ // ---------------------------------------------------------------------------
663
+ //
664
+ // The AWS S3 SDK retries transient errors (5xx, throttling, dropped sockets)
665
+ // automatically with backoff. Moving the byte transfer to `fetch` over a
666
+ // presigned URL dropped that resilience — and at sync scale (thousands of
667
+ // objects per pull) transient S3 5xx (notably 503 SlowDown) are routine, so
668
+ // without retry a large sync sporadically loses files. This restores
669
+ // SDK-parity: retry network errors and transient 5xx with exponential backoff
670
+ // + jitter; 4xx (404/403) are definitive and pass straight through.
671
+
672
+ const FETCH_MAX_RETRIES = 3;
673
+ const FETCH_BASE_DELAY_MS = 400;
674
+
675
/** True for the HTTP statuses treated as retryable: 500, 502, 503 and 504. */
function isTransientStatus(status: number): boolean {
  switch (status) {
    case 500:
    case 502:
    case 503:
    case 504:
      return true;
    default:
      return false;
  }
}
678
+
679
/** Resolve (with no value) after roughly `ms` milliseconds. */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
682
+
683
/**
 * `fetch` with a bounded number of retries. A rejected fetch (network error)
 * or a transient 5xx is retried up to FETCH_MAX_RETRIES times, waiting an
 * exponentially growing delay plus random jitter before each retry. Any
 * non-transient response is returned immediately. If the final attempt still
 * answers with a transient status, that response is returned so the caller's
 * own status handling reports it; if the final attempt rejects, the last
 * recorded error is thrown.
 *
 * @param what label used in the error messages this function creates.
 */
async function fetchWithRetry(
  url: string,
  init: RequestInit,
  what: string,
): Promise<Response> {
  let failure: unknown;
  for (let attempt = 0; attempt <= FETCH_MAX_RETRIES; attempt += 1) {
    if (attempt !== 0) {
      // Delay doubles with every retry; jitter spreads out simultaneous retries.
      const exponential = FETCH_BASE_DELAY_MS * Math.pow(2, attempt - 1);
      const jitter = Math.floor(Math.random() * FETCH_BASE_DELAY_MS);
      await sleep(exponential + jitter);
    }

    let response: Response;
    try {
      response = await fetch(url, init);
    } catch (err) {
      // socket reset / DNS / TLS — retry
      failure = err;
      continue;
    }

    const retriesLeft = attempt < FETCH_MAX_RETRIES;
    if (!(retriesLeft && isTransientStatus(response.status))) {
      // success, a non-transient status, or the last 5xx attempt
      return response;
    }

    // free the socket before backoff
    await cancelBody(response);
    failure = new Error(`${what}: transient ${response.status}`);
  }

  if (failure instanceof Error) {
    throw failure;
  }
  throw new Error(`${what}: failed after ${FETCH_MAX_RETRIES} retries`);
}
721
+
722
+ // ---------------------------------------------------------------------------
723
+ // Factory registry — selected once per session by runRunner
724
+ // ---------------------------------------------------------------------------
725
+
726
+ export type ObjectIOFactory = (ctx: EntityContext) => ObjectIO;
727
+
728
+ const DEFAULT_FACTORY: ObjectIOFactory = (ctx) => new S3SdkObjectIO(ctx);
729
+
730
+ let activeFactory: ObjectIOFactory = DEFAULT_FACTORY;
731
+
732
/**
 * Select the transport factory that {@link resolveObjectIO} will use from now
 * on in this process. `null` restores the default S3 SDK factory. Intended to
 * be called once by `runRunner` after it has resolved the caller's identity
 * and feature-flag gate.
 */
export function setObjectIOFactory(factory: ObjectIOFactory | null): void {
  if (factory == null) {
    activeFactory = DEFAULT_FACTORY;
    return;
  }
  activeFactory = factory;
}
741
+
742
/** Build the ObjectIO for `ctx` by invoking the currently installed factory. */
export function resolveObjectIO(ctx: EntityContext): ObjectIO {
  const transport = activeFactory(ctx);
  return transport;
}
746
+
747
/**
 * Create a factory that serves company vaults (`cmp_*` uids) through the
 * presigned-URL transport, reusing the given vault client and taking the
 * per-company authority from `ctx.uid`. Any other uid gets a fresh
 * S3SdkObjectIO.
 */
export function presignObjectIOFactory(
  vault: PresignTransportClient,
): ObjectIOFactory {
  // One breaker per run, shared across companies: the 100/hr budget is
  // per-user, so a 429 in one company means the whole run should stop minting.
  const sharedBreaker = new RateLimitBreaker();
  // One PresignObjectIO per company for the run, so a prime() and the transfer
  // loop that drains its URL cache share the SAME instance. Safe to memoize:
  // the instance holds only the vault client, the stable companyUid and the
  // breaker — no rotating credentials, unlike S3SdkObjectIO, which is why the
  // S3 SDK transport is intentionally NOT memoized.
  const transports = new Map<string, PresignObjectIO>();

  return (ctx: EntityContext): ObjectIO => {
    const { uid } = ctx;
    // Personal vaults are a PERSON entity (prs_*) accessed via the membership-
    // less vend-self model; the list/presign endpoints are membership-gated and
    // 403 for them. They also have no ACL-scale problem (single owner), so they
    // stay on the S3 SDK (STS) transport. Presign is for company vaults (cmp_*).
    if (!uid.startsWith("cmp_")) {
      return new S3SdkObjectIO(ctx);
    }
    const existing = transports.get(uid);
    if (existing) {
      return existing;
    }
    const created = new PresignObjectIO(vault, uid, sharedBreaker);
    transports.set(uid, created);
    return created;
  };
}