@vellumai/cli 0.6.6 → 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. package/AGENTS.md +8 -2
  2. package/README.md +49 -0
  3. package/package.json +1 -1
  4. package/src/__tests__/assistant-config.test.ts +1 -7
  5. package/src/__tests__/backup.test.ts +475 -0
  6. package/src/__tests__/config-utils.test.ts +146 -0
  7. package/src/__tests__/env-drift.test.ts +10 -32
  8. package/src/__tests__/llm-provider-env-var-parity.test.ts +1 -21
  9. package/src/__tests__/multi-local.test.ts +0 -5
  10. package/src/__tests__/sleep.test.ts +1 -2
  11. package/src/__tests__/teleport.test.ts +988 -1266
  12. package/src/commands/backup.ts +117 -71
  13. package/src/commands/client.ts +10 -9
  14. package/src/commands/env.ts +93 -0
  15. package/src/commands/events.ts +2 -0
  16. package/src/commands/exec.ts +58 -13
  17. package/src/commands/login.ts +77 -12
  18. package/src/commands/logs.ts +2 -7
  19. package/src/commands/ps.ts +144 -25
  20. package/src/commands/restore.ts +26 -47
  21. package/src/commands/sleep.ts +5 -2
  22. package/src/commands/ssh.ts +17 -7
  23. package/src/commands/teleport.ts +462 -584
  24. package/src/commands/terminal.ts +9 -221
  25. package/src/commands/tunnel.ts +2 -7
  26. package/src/commands/upgrade.ts +108 -7
  27. package/src/commands/wake.ts +2 -1
  28. package/src/components/DefaultMainScreen.tsx +328 -154
  29. package/src/index.ts +5 -7
  30. package/src/lib/__tests__/docker.test.ts +50 -74
  31. package/src/lib/__tests__/job-polling.test.ts +278 -0
  32. package/src/lib/__tests__/local-runtime-client.test.ts +480 -0
  33. package/src/lib/__tests__/platform-client-signed-url.test.ts +405 -0
  34. package/src/lib/__tests__/runtime-url.test.ts +87 -0
  35. package/src/lib/__tests__/terminal-session.test.ts +202 -0
  36. package/src/lib/assistant-client.ts +5 -21
  37. package/src/lib/assistant-config.ts +46 -24
  38. package/src/lib/cli-error.ts +1 -0
  39. package/src/lib/client-identity.ts +67 -0
  40. package/src/lib/docker.ts +75 -77
  41. package/src/lib/environments/__tests__/paths.test.ts +2 -0
  42. package/src/lib/environments/resolve.ts +89 -7
  43. package/src/lib/environments/seeds.ts +8 -5
  44. package/src/lib/environments/types.ts +10 -0
  45. package/src/lib/hatch-local.ts +15 -120
  46. package/src/lib/health-check.ts +98 -0
  47. package/src/lib/job-polling.ts +195 -0
  48. package/src/lib/local-runtime-client.ts +231 -0
  49. package/src/lib/local.ts +165 -72
  50. package/src/lib/orphan-detection.ts +2 -35
  51. package/src/lib/platform-client.ts +190 -194
  52. package/src/lib/platform-releases.ts +23 -0
  53. package/src/lib/retire-local.ts +6 -2
  54. package/src/lib/runtime-url.ts +30 -0
  55. package/src/lib/sync-cloud-assistants.ts +126 -0
  56. package/src/lib/terminal-client.ts +6 -1
  57. package/src/lib/terminal-session.ts +536 -0
  58. package/src/lib/tui-log.ts +60 -0
  59. package/src/lib/xdg-log.ts +10 -4
  60. package/src/shared/provider-env-vars.ts +2 -3
  61. package/src/__tests__/orphan-detection.test.ts +0 -214
@@ -28,13 +28,11 @@ function portBlock(base: number): PortMap {
28
28
  * Built-in environment definitions. Mirrors Swift's
29
29
  * `clients/macos/vellum-assistant/App/VellumEnvironment.swift` enum and is
30
30
  * the TS-side source of truth for the set of known environment names.
31
- * Two other TS sites duplicate the name list:
31
+ * One other TS site duplicates the name list:
32
32
  * - `assistant/src/util/platform.ts` (`KNOWN_ENVIRONMENTS`)
33
- * - `clients/chrome-extension/native-host/src/lockfile.ts`
34
- * (`NON_PRODUCTION_ENVIRONMENTS`, excludes `production`)
35
- * Drift between these three sites is caught at test time by
33
+ * Drift between these two sites is caught at test time by
36
34
  * `cli/src/__tests__/env-drift.test.ts`. Fast follow: hoist the shared
37
- * list into a `packages/environments` package so all three sites import
35
+ * list into a `packages/environments` package so both sites import
38
36
  * from one place.
39
37
  *
40
38
  * Custom environments via a user config file are a future phase — see the
@@ -45,10 +43,12 @@ export const SEEDS: Record<string, EnvironmentDefinition> = {
45
43
  production: {
46
44
  name: "production",
47
45
  platformUrl: "https://platform.vellum.ai",
46
+ webUrl: "https://www.vellum.ai",
48
47
  },
49
48
  staging: {
50
49
  name: "staging",
51
50
  platformUrl: "https://staging-platform.vellum.ai",
51
+ webUrl: "https://staging-assistant.vellum.ai",
52
52
  portsOverride: portBlock(17000),
53
53
  },
54
54
  test: {
@@ -56,16 +56,19 @@ export const SEEDS: Record<string, EnvironmentDefinition> = {
56
56
  // Non-functional URL — used only by unit tests for URL resolution, never
57
57
  // hit in production.
58
58
  platformUrl: "https://test-platform.vellum.ai",
59
+ webUrl: "https://dev-assistant.vellum.ai",
59
60
  portsOverride: portBlock(19000),
60
61
  },
61
62
  dev: {
62
63
  name: "dev",
63
64
  platformUrl: "https://dev-platform.vellum.ai",
65
+ webUrl: "https://dev-assistant.vellum.ai",
64
66
  portsOverride: portBlock(18000),
65
67
  },
66
68
  local: {
67
69
  name: "local",
68
70
  platformUrl: "http://localhost:8000",
71
+ webUrl: "http://localhost:3000",
69
72
  // assistantPlatformUrl: "http://host.docker.internal:8000",
70
73
  // ^ uncomment this once dockerized hatch path is live.
71
74
  // The assistant runs in a different network namespace than the host.
@@ -30,6 +30,16 @@ export interface EnvironmentDefinition {
30
30
  name: string;
31
31
  platformUrl: string;
32
32
 
33
+ /**
34
+ * The web app (Next.js) base URL for browser-facing pages like
35
+ * `/account/login`. In production this is separate from the API backend
36
+ * (e.g. `www.vellum.ai` vs `platform.vellum.ai`); locally it's
37
+ * `localhost:3000` vs `localhost:8000`.
38
+ *
39
+ * Mirrors `VellumEnvironment.webURL` on the Swift side.
40
+ */
41
+ webUrl: string;
42
+
33
43
  /**
34
44
  * Override for the platform URL the assistant process itself uses. Only
35
45
  * differs from `platformUrl` when the assistant runs in a different network
@@ -3,7 +3,6 @@ import {
3
3
  lstatSync,
4
4
  mkdirSync,
5
5
  readlinkSync,
6
- rmSync,
7
6
  symlinkSync,
8
7
  unlinkSync,
9
8
  writeFileSync,
@@ -19,15 +18,11 @@ import cliPkg from "../../package.json";
19
18
  import {
20
19
  allocateLocalResources,
21
20
  findAssistantByName,
22
- loadAllAssistants,
23
21
  saveAssistantEntry,
24
22
  setActiveAssistant,
25
23
  syncConfigToLockfile,
26
24
  } from "./assistant-config.js";
27
- import type {
28
- AssistantEntry,
29
- LocalInstanceResources,
30
- } from "./assistant-config.js";
25
+ import type { AssistantEntry } from "./assistant-config.js";
31
26
  import type { Species } from "./constants.js";
32
27
  import { writeInitialConfig } from "./config-utils.js";
33
28
  import {
@@ -37,20 +32,12 @@ import {
37
32
  stopLocalProcesses,
38
33
  } from "./local.js";
39
34
  import { maybeStartNgrokTunnel } from "./ngrok.js";
40
- import { httpHealthCheck } from "./http-client.js";
41
- import { detectOrphanedProcesses } from "./orphan-detection.js";
42
- import { isProcessAlive, stopProcess } from "./process.js";
35
+
43
36
  import { generateInstanceName } from "./random-name.js";
44
37
  import { leaseGuardianToken } from "./guardian-token.js";
45
38
  import { archiveLogFile, resetLogFile } from "./xdg-log.js";
46
39
  import { emitProgress } from "./desktop-progress.js";
47
40
 
48
- const IS_DESKTOP = !!process.env.VELLUM_DESKTOP_APP;
49
-
50
- function desktopLog(msg: string): void {
51
- process.stdout.write(msg + "\n");
52
- }
53
-
54
41
  /**
55
42
  * Attempts to place a symlink at the given path pointing to cliBinary.
56
43
  * Returns true if the symlink was created (or already correct), false on failure.
@@ -153,110 +140,18 @@ export async function hatchLocal(
153
140
  name ?? process.env.VELLUM_ASSISTANT_NAME,
154
141
  );
155
142
 
156
- emitProgress(1, 7, "Preparing workspace...");
157
-
158
- // Clean up stale local state: if daemon/gateway processes are running but
159
- // the lock file has no entries AND the daemon is not healthy, stop them
160
- // before starting fresh. A healthy daemon should be reused, not killed —
161
- // it may have been started intentionally via `vellum wake`.
162
- const vellumDir = join(homedir(), ".vellum");
163
- const existingAssistants = loadAllAssistants();
164
- const localAssistants = existingAssistants.filter((a) => a.cloud === "local");
165
- if (localAssistants.length === 0) {
166
- const daemonPid = isProcessAlive(join(vellumDir, "vellum.pid"));
167
- const gatewayPid = isProcessAlive(join(vellumDir, "gateway.pid"));
168
- if (daemonPid.alive || gatewayPid.alive) {
169
- // Check if the daemon is actually healthy before killing it.
170
- // Default port 7821 is used when there's no lockfile entry.
171
- const defaultPort = parseInt(process.env.RUNTIME_HTTP_PORT || "7821", 10);
172
- const healthy = await httpHealthCheck(defaultPort);
173
- if (!healthy) {
174
- console.log(
175
- "🧹 Cleaning up stale local processes (no lock file entry)...\n",
176
- );
177
- await stopLocalProcesses();
178
- }
179
- }
180
- }
181
-
182
- // On desktop, scan the process table for orphaned vellum processes that
183
- // are not tracked by any PID file or lock file entry and kill them before
184
- // starting new ones. This prevents resource leaks when the desktop app
185
- // crashes or is force-quit without a clean shutdown.
186
- //
187
- // Skip orphan cleanup if the daemon is already healthy on the expected port
188
- // — those processes are intentional (e.g. started via `vellum wake`) and
189
- // startLocalDaemon() will reuse them.
190
- if (IS_DESKTOP) {
191
- const existingResources = findAssistantByName(instanceName);
192
- const expectedPort =
193
- existingResources?.cloud === "local" && existingResources.resources
194
- ? existingResources.resources.daemonPort
195
- : undefined;
196
- const daemonAlreadyHealthy = expectedPort
197
- ? await httpHealthCheck(expectedPort)
198
- : false;
143
+ emitProgress(1, 6, "Allocating resources...");
199
144
 
200
- if (!daemonAlreadyHealthy) {
201
- const orphans = await detectOrphanedProcesses();
202
- if (orphans.length > 0) {
203
- desktopLog(
204
- `🧹 Found ${orphans.length} orphaned process${orphans.length === 1 ? "" : "es"} cleaning up...`,
205
- );
206
- for (const orphan of orphans) {
207
- await stopProcess(
208
- parseInt(orphan.pid, 10),
209
- `${orphan.name} (PID ${orphan.pid})`,
210
- );
211
- }
212
- }
213
- }
214
- }
215
-
216
- emitProgress(2, 7, "Allocating resources...");
217
-
218
- // Reuse existing resources if re-hatching with --name that matches a known
219
- // local assistant, otherwise allocate fresh per-instance ports and directories.
220
- let resources: LocalInstanceResources;
221
- const existingEntry = findAssistantByName(instanceName);
222
- if (existingEntry?.cloud === "local" && existingEntry.resources) {
223
- resources = existingEntry.resources;
224
- } else {
225
- resources = await allocateLocalResources(instanceName);
226
- }
227
-
228
- // Clean up stale workspace data: if the workspace directory already exists for
229
- // this instance but no local lockfile entry owns it, a previous retire failed
230
- // to archive it (or a managed-only retire left local data behind). Remove the
231
- // workspace subtree so the new assistant starts fresh — but preserve the rest
232
- // of .vellum (e.g. protected/, credentials) which may be shared.
233
- if (
234
- !existingEntry ||
235
- (existingEntry.cloud != null && existingEntry.cloud !== "local")
236
- ) {
237
- const instanceWorkspaceDir = join(
238
- resources.instanceDir,
239
- ".vellum",
240
- "workspace",
145
+ const existing = findAssistantByName(instanceName);
146
+ if (existing && (!existing.cloud || existing.cloud === "local")) {
147
+ throw new Error(
148
+ `An assistant named "${instanceName}" is already hatched.\n` +
149
+ `Run \`vellum wake\` to restart it, or \`vellum retire ${instanceName}\` to remove it first.`,
241
150
  );
242
- if (existsSync(instanceWorkspaceDir)) {
243
- const ownedByOther = loadAllAssistants().some((a) => {
244
- if ((a.cloud != null && a.cloud !== "local") || !a.resources)
245
- return false;
246
- return (
247
- join(a.resources.instanceDir, ".vellum", "workspace") ===
248
- instanceWorkspaceDir
249
- );
250
- });
251
- if (!ownedByOther) {
252
- console.log(
253
- `🧹 Removing stale workspace at ${instanceWorkspaceDir} (not owned by any assistant)...\n`,
254
- );
255
- rmSync(instanceWorkspaceDir, { recursive: true, force: true });
256
- }
257
- }
258
151
  }
259
152
 
153
+ const resources = await allocateLocalResources(instanceName);
154
+
260
155
  const logsDir = join(
261
156
  resources.instanceDir,
262
157
  ".vellum",
@@ -275,17 +170,17 @@ export async function hatchLocal(
275
170
  process.env.APP_VERSION = cliPkg.version;
276
171
  }
277
172
 
278
- emitProgress(3, 7, "Writing configuration...");
173
+ emitProgress(2, 6, "Writing configuration...");
279
174
  const defaultWorkspaceConfigPath = writeInitialConfig(configValues);
280
175
 
281
- emitProgress(4, 7, "Starting assistant...");
176
+ emitProgress(3, 6, "Starting assistant...");
282
177
  const signingKey = generateLocalSigningKey();
283
178
  await startLocalDaemon(watch, resources, {
284
179
  defaultWorkspaceConfigPath,
285
180
  signingKey,
286
181
  });
287
182
 
288
- emitProgress(5, 7, "Starting gateway...");
183
+ emitProgress(4, 6, "Starting gateway...");
289
184
  let runtimeUrl = `http://127.0.0.1:${resources.gatewayPort}`;
290
185
  try {
291
186
  runtimeUrl = await startGateway(watch, resources, { signingKey });
@@ -303,7 +198,7 @@ export async function hatchLocal(
303
198
  // instead of hitting /v1/guardian/init itself. Use loopback to satisfy
304
199
  // the daemon's local-only check — the mDNS runtimeUrl resolves to a LAN
305
200
  // IP which the daemon rejects as non-loopback.
306
- emitProgress(6, 7, "Securing connection...");
201
+ emitProgress(5, 6, "Securing connection...");
307
202
  const loopbackUrl = `http://127.0.0.1:${resources.gatewayPort}`;
308
203
  const maxLeaseAttempts = 3;
309
204
  for (let attempt = 1; attempt <= maxLeaseAttempts; attempt++) {
@@ -350,7 +245,7 @@ export async function hatchLocal(
350
245
  writeFileSync(ngrokPidFile, String(ngrokChild.pid));
351
246
  }
352
247
 
353
- emitProgress(7, 7, "Saving configuration...");
248
+ emitProgress(6, 6, "Saving configuration...");
354
249
  saveAssistantEntry(localEntry);
355
250
  setActiveAssistant(instanceName);
356
251
  syncConfigToLockfile();
@@ -71,6 +71,104 @@ export async function checkManagedHealth(
71
71
  }
72
72
  }
73
73
 
74
/**
 * One node in the process listing reported for a managed assistant.
 *
 * Entries are carried in {@link ManagedPsResponse}. The legacy
 * `connection-status` fallback in this file synthesizes a single entry
 * named `"assistant"`.
 */
export interface ManagedProcessEntry {
  /** Label identifying the process. */
  name: string;
  /** Reported state of the process. */
  status: "running" | "not_running" | "unreachable";
  /**
   * Nested entries, when present — presumably sub-processes of this one;
   * TODO confirm against the platform `/ps/` API.
   */
  children?: ManagedProcessEntry[];
  /** Optional free-form detail text (the legacy fallback fills it from `detail`). */
  info?: string;
}
80
+
81
/** Payload resolved by {@link fetchManagedPs} when a process listing is available. */
export interface ManagedPsResponse {
  /** Top-level process entries; each may carry nested `children`. */
  processes: ManagedProcessEntry[];
}
84
+
85
/**
 * Fetches the process listing for a managed assistant.
 *
 * Requests `{runtimeUrl}/v1/assistants/{assistantId}/ps/` first. When that
 * endpoint answers 404 or 405 (platform versions that have not rolled it out
 * yet), falls back to the legacy `connection-status` endpoint and returns its
 * translated result.
 *
 * The request — including reading the response body — is aborted after
 * 5 seconds.
 *
 * @param runtimeUrl - Base URL the assistant endpoints are resolved against.
 * @param assistantId - Assistant identifier; URL-encoded before being placed
 *   in the path.
 * @returns The process listing, or `null` when no platform token is stored,
 *   auth headers cannot be built, the request fails or times out, or the
 *   server answers with any other non-OK status.
 */
export async function fetchManagedPs(
  runtimeUrl: string,
  assistantId: string,
): Promise<ManagedPsResponse | null> {
  const { readPlatformToken, authHeaders } =
    await import("./platform-client.js");
  const token = readPlatformToken();
  if (!token) return null;

  let headers: Record<string, string>;
  try {
    headers = await authHeaders(token, runtimeUrl);
  } catch {
    return null;
  }

  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), 5000);

  // Try the /ps endpoint first; fall back to legacy /connection-status
  // for platform versions that haven't rolled it out yet.
  try {
    const psUrl = `${runtimeUrl}/v1/assistants/${encodeURIComponent(assistantId)}/ps/`;
    const response = await fetch(psUrl, {
      signal: controller.signal,
      headers,
    });

    if (response.ok) {
      return (await response.json()) as ManagedPsResponse;
    }

    // /ps not available — fall back to legacy connection-status
    if (response.status === 404 || response.status === 405) {
      return fetchLegacyConnectionStatus(runtimeUrl, assistantId, headers);
    }

    return null;
  } catch {
    // Best-effort lookup: network errors, aborts and malformed JSON all
    // surface to the caller as "no listing available".
    return null;
  } finally {
    // Disarm the abort timer on every exit path so a rejected fetch does not
    // leave a pending timer behind, and so the timeout stays armed while the
    // response body is still being read.
    clearTimeout(timeoutId);
  }
}
129
+
130
/**
 * Response body of the legacy `connection-status` endpoint, as consumed by
 * {@link fetchLegacyConnectionStatus}.
 */
interface LegacyConnectionStatus {
  /** Connection state string; not read by the translation in this file. */
  state: string;
  /** Drives the translated status: `true` → `"running"`, `false` → `"not_running"`. */
  is_awake: boolean;
  /**
   * Not read in this file — presumably the hosting pod's status;
   * TODO confirm against the platform API.
   */
  pod_status: string | null;
  /** Optional detail text; surfaced as the translated entry's `info`. */
  detail: string | null;
}
136
+
137
/**
 * Queries the legacy `connection-status` endpoint and translates its payload
 * into the process-listing shape.
 *
 * Sends a `POST` to
 * `{runtimeUrl}/v1/assistants/{assistantId}/connection-status/`. The request
 * — including reading the response body — is aborted after 5 seconds.
 *
 * @param runtimeUrl - Base URL the assistant endpoints are resolved against.
 * @param assistantId - Assistant identifier; URL-encoded before being placed
 *   in the path.
 * @param headers - Request headers (the caller passes its auth headers).
 * @returns A listing with a single `"assistant"` entry whose status reflects
 *   `is_awake`, or `null` on a non-OK response, network failure, timeout or
 *   unparsable body.
 */
async function fetchLegacyConnectionStatus(
  runtimeUrl: string,
  assistantId: string,
  headers: Record<string, string>,
): Promise<ManagedPsResponse | null> {
  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), 5000);

  try {
    const url = `${runtimeUrl}/v1/assistants/${encodeURIComponent(assistantId)}/connection-status/`;
    const response = await fetch(url, {
      method: "POST",
      signal: controller.signal,
      headers,
    });
    if (!response.ok) return null;

    const data = (await response.json()) as LegacyConnectionStatus;

    // Translate legacy shape into the ps process tree
    const status: ManagedProcessEntry["status"] = data.is_awake
      ? "running"
      : "not_running";
    return {
      processes: [
        { name: "assistant", status, info: data.detail ?? undefined },
      ],
    };
  } catch {
    // Best-effort lookup: any failure is reported as "no listing available".
    return null;
  } finally {
    // Disarm the abort timer on every exit path (previously it was only
    // cleared when fetch resolved, leaving it pending after a rejection).
    clearTimeout(timeoutId);
  }
}
171
+
74
172
  export async function checkHealth(
75
173
  runtimeUrl: string,
76
174
  bearerToken?: string,
@@ -0,0 +1,195 @@
1
+ import type { UnifiedJobStatus } from "./platform-client.js";
2
+
3
/**
 * Terminal status returned by {@link pollJobUntilDone}: the members of
 * `UnifiedJobStatus` whose `status` is `"complete"` or `"failed"`. Callers
 * decide whether a `failed` result is a fatal error or something to retry.
 */
export type TerminalJobStatus = Extract<
  UnifiedJobStatus,
  { status: "complete" | "failed" }
>;
11
+
12
/** Options accepted by {@link pollJobUntilDone}. */
export interface PollJobUntilDoneOptions {
  /** Async producer that returns the latest job status. */
  poll: () => Promise<UnifiedJobStatus>;
  /** Sleep between successive polls. Defaults to 2_000 ms. */
  intervalMs?: number;
  /** Maximum wall-clock time to wait. Defaults to 60 minutes. */
  timeoutMs?: number;
  /** Human-readable label used in the timeout error message (e.g. "export job"). */
  label: string;
  /**
   * Maximum consecutive transient (retryable) poll errors tolerated before
   * the last error is propagated. Any successful poll resets the counter,
   * so only an unbroken run of transient errors (5xx / network) counts
   * toward the limit. Defaults to 5.
   */
  maxTransientErrors?: number;
  /**
   * Optional async hook invoked when `poll()` throws an error whose message
   * contains a `401` HTTP status. The callback is expected to refresh whatever
   * credential the poll closure reads (e.g. re-lease a guardian token), then
   * return. The polling loop will retry the poll after the callback resolves
   * instead of propagating the 401.
   *
   * Used by long-running migrations where the cached access token may expire
   * mid-poll. Without this hook, 4xx errors (except 429) are permanent and
   * would abandon a migration that's still running on the server.
   */
  refreshOn401?: () => Promise<void>;
  /**
   * Maximum consecutive 401 refreshes tolerated before the last 401 is
   * propagated. Tracked separately from {@link maxTransientErrors} because
   * a persistent 401 after a refresh usually means the underlying credential
   * is revoked, not a transient network issue. Any successful poll resets
   * the counter. Defaults to 3.
   */
  maxAuthRefreshes?: number;
}
47
+
48
// Default pause between successive polls.
const DEFAULT_INTERVAL_MS = 2_000;
// Matches the server-side runtime migration window: the GCS upload PUT and
// the import-URL fetch in assistant/src/runtime/routes/migration-routes.ts
// use AbortSignal.timeout(60 * 60 * 1000), so a shorter CLI poll cap would
// abort a job that's still legitimately in progress on the server.
const DEFAULT_TIMEOUT_MS = 60 * 60 * 1000;
// Default cap on consecutive transient poll errors before the error propagates.
const DEFAULT_MAX_TRANSIENT_ERRORS = 5;
// Default cap on consecutive 401-triggered auth refreshes before the 401 propagates.
const DEFAULT_MAX_AUTH_REFRESHES = 3;
56
+
57
/**
 * Reports whether a thrown value looks like an HTTP 401.
 *
 * Inspects the `Error` message (or the stringified value for non-`Error`
 * throws) and returns `true` when it contains `401` as a standalone token.
 */
function is401Error(err: unknown): boolean {
  const text = err instanceof Error ? err.message : String(err);
  return text.search(/\b401\b/) !== -1;
}
61
+
62
/** Returns a promise that resolves (with no value) once `ms` milliseconds have elapsed. */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
65
+
66
/**
 * Heuristic classification used by {@link pollJobUntilDone} to decide whether
 * to retry a failed poll.
 *
 * - 5xx responses and unclassifiable network-style errors (fetch failed,
 *   ECONNRESET, etc.) are treated as transient.
 * - 4xx responses are treated as permanent, except 429 (rate limited) which is
 *   transient.
 * - "not found" errors (matched case-insensitively) are permanent — they
 *   indicate the job id is wrong and retrying won't help.
 *
 * The poll helpers (`platformPollJobStatus`, `localRuntimePollJobStatus`)
 * raise errors whose message contains the HTTP status (e.g. `"Local job
 * status check failed: 503 Service Unavailable"`), so we parse that out when
 * available and default to "retry" when unsure.
 *
 * @param err - Whatever `poll()` threw; non-`Error` values are stringified.
 * @returns `true` when the poll should be retried, `false` when the error
 *   should propagate immediately.
 */
function isTransientPollError(err: unknown): boolean {
  const msg = err instanceof Error ? err.message : String(err);

  // Case-insensitive so "Job Not Found" is classified like "job not found".
  if (/not found/i.test(msg)) return false;

  // Only a three-digit number that ends at a word boundary counts as a status
  // code. Without the boundary, durations such as "failed after 4500ms" were
  // read as HTTP 450 and wrongly classified as permanent.
  const match = /failed\D*(\d{3})\b/i.exec(msg);
  if (match) {
    const code = Number(match[1]);
    if (code === 429) return true;
    if (code >= 400 && code < 500) return false;
    if (code >= 500) return true;
  }

  // Unclassifiable (e.g. "fetch failed", ECONNRESET) — treat as transient so
  // a single network hiccup doesn't abort a long-running migration.
  return true;
}
99
+
100
/**
 * Poll `options.poll` until it returns a terminal status (`complete` or
 * `failed`), or until `timeoutMs` elapses.
 *
 * On terminal status, returns the status object — including the `failed`
 * case. The caller decides how to treat a failed terminal status (e.g.
 * print the `error` field and exit). Timeouts throw.
 *
 * Transient errors raised by `poll()` (5xx, network hiccups, rate-limits) are
 * retried up to `maxTransientErrors` times before the last error propagates,
 * so a single flaky poll doesn't abort a migration that may still be running.
 * When `refreshOn401` is supplied, a 401 triggers that hook and a retry, up
 * to `maxAuthRefreshes` consecutive times. Any successful poll resets both
 * counters.
 *
 * The wait between polls is capped at the time remaining before the
 * deadline, so the timeout is not overshot by a full `intervalMs`.
 *
 * @param options - Poll producer, label, and optional interval / timeout /
 *   retry limits; see {@link PollJobUntilDoneOptions}.
 * @returns The first `complete` or `failed` status produced by `poll()`.
 * @throws The timeout `Error` once `timeoutMs` has elapsed without a terminal
 *   status; the original poll error when it is permanent or a retry limit is
 *   exceeded; anything thrown by `refreshOn401`.
 */
export async function pollJobUntilDone(
  options: PollJobUntilDoneOptions,
): Promise<TerminalJobStatus> {
  const intervalMs = options.intervalMs ?? DEFAULT_INTERVAL_MS;
  const timeoutMs = options.timeoutMs ?? DEFAULT_TIMEOUT_MS;
  const maxTransientErrors =
    options.maxTransientErrors ?? DEFAULT_MAX_TRANSIENT_ERRORS;
  const maxAuthRefreshes =
    options.maxAuthRefreshes ?? DEFAULT_MAX_AUTH_REFRESHES;
  const deadline = Date.now() + timeoutMs;

  // Single place for the "deadline check, then pause" step that follows every
  // non-terminal poll: throws once the deadline has passed, otherwise sleeps
  // for one interval or the remaining time, whichever is shorter.
  const waitForNextPoll = async (): Promise<void> => {
    const remainingMs = deadline - Date.now();
    if (remainingMs <= 0) {
      throw new Error(
        `Timed out waiting for ${options.label} after ${Math.round(
          timeoutMs / 1000,
        )}s`,
      );
    }
    await sleep(Math.min(intervalMs, remainingMs));
  };

  let consecutiveTransientErrors = 0;
  let consecutiveAuthRefreshes = 0;

  // First poll happens immediately so fast-path completions don't wait
  // one interval before returning.
  while (true) {
    let status: UnifiedJobStatus;
    try {
      status = await options.poll();
      consecutiveTransientErrors = 0;
      consecutiveAuthRefreshes = 0;
    } catch (err) {
      const msg = err instanceof Error ? err.message : String(err);

      // 401 Unauthorized takes precedence over the generic transient
      // classifier: when a refresh callback is registered, a long-running
      // poll loop can re-lease its credential and keep going instead of
      // abandoning a migration that's still running on the server.
      if (options.refreshOn401 && is401Error(err)) {
        consecutiveAuthRefreshes += 1;
        if (consecutiveAuthRefreshes > maxAuthRefreshes) {
          throw err;
        }
        console.warn(
          `${options.label} polling got 401, refreshing auth and retrying... (${msg})`,
        );
        await options.refreshOn401();
      } else {
        if (!isTransientPollError(err)) {
          throw err;
        }
        consecutiveTransientErrors += 1;
        if (consecutiveTransientErrors > maxTransientErrors) {
          throw err;
        }
        console.warn(`${options.label} polling failed, retrying... (${msg})`);
      }

      await waitForNextPoll();
      continue;
    }

    if (status.status === "complete" || status.status === "failed") {
      return status;
    }

    await waitForNextPoll();
  }
}