@vellumai/cli 0.6.6 → 0.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +8 -2
- package/README.md +49 -0
- package/package.json +1 -1
- package/src/__tests__/assistant-config.test.ts +1 -7
- package/src/__tests__/backup.test.ts +475 -0
- package/src/__tests__/config-utils.test.ts +146 -0
- package/src/__tests__/env-drift.test.ts +10 -32
- package/src/__tests__/llm-provider-env-var-parity.test.ts +1 -21
- package/src/__tests__/multi-local.test.ts +0 -5
- package/src/__tests__/sleep.test.ts +1 -2
- package/src/__tests__/teleport.test.ts +988 -1266
- package/src/commands/backup.ts +117 -71
- package/src/commands/client.ts +10 -9
- package/src/commands/env.ts +93 -0
- package/src/commands/events.ts +2 -0
- package/src/commands/exec.ts +58 -13
- package/src/commands/login.ts +77 -12
- package/src/commands/logs.ts +2 -7
- package/src/commands/ps.ts +144 -25
- package/src/commands/restore.ts +26 -47
- package/src/commands/sleep.ts +5 -2
- package/src/commands/ssh.ts +17 -7
- package/src/commands/teleport.ts +462 -584
- package/src/commands/terminal.ts +9 -221
- package/src/commands/tunnel.ts +2 -7
- package/src/commands/upgrade.ts +108 -7
- package/src/commands/wake.ts +2 -1
- package/src/components/DefaultMainScreen.tsx +328 -154
- package/src/index.ts +5 -7
- package/src/lib/__tests__/docker.test.ts +50 -74
- package/src/lib/__tests__/job-polling.test.ts +278 -0
- package/src/lib/__tests__/local-runtime-client.test.ts +480 -0
- package/src/lib/__tests__/platform-client-signed-url.test.ts +405 -0
- package/src/lib/__tests__/runtime-url.test.ts +87 -0
- package/src/lib/__tests__/terminal-session.test.ts +202 -0
- package/src/lib/assistant-client.ts +5 -21
- package/src/lib/assistant-config.ts +46 -24
- package/src/lib/cli-error.ts +1 -0
- package/src/lib/client-identity.ts +67 -0
- package/src/lib/docker.ts +75 -77
- package/src/lib/environments/__tests__/paths.test.ts +2 -0
- package/src/lib/environments/resolve.ts +89 -7
- package/src/lib/environments/seeds.ts +8 -5
- package/src/lib/environments/types.ts +10 -0
- package/src/lib/hatch-local.ts +15 -120
- package/src/lib/health-check.ts +98 -0
- package/src/lib/job-polling.ts +195 -0
- package/src/lib/local-runtime-client.ts +231 -0
- package/src/lib/local.ts +165 -72
- package/src/lib/orphan-detection.ts +2 -35
- package/src/lib/platform-client.ts +190 -194
- package/src/lib/platform-releases.ts +23 -0
- package/src/lib/retire-local.ts +6 -2
- package/src/lib/runtime-url.ts +30 -0
- package/src/lib/sync-cloud-assistants.ts +126 -0
- package/src/lib/terminal-client.ts +6 -1
- package/src/lib/terminal-session.ts +536 -0
- package/src/lib/tui-log.ts +60 -0
- package/src/lib/xdg-log.ts +10 -4
- package/src/shared/provider-env-vars.ts +2 -3
- package/src/__tests__/orphan-detection.test.ts +0 -214
|
@@ -28,13 +28,11 @@ function portBlock(base: number): PortMap {
|
|
|
28
28
|
* Built-in environment definitions. Mirrors Swift's
|
|
29
29
|
* `clients/macos/vellum-assistant/App/VellumEnvironment.swift` enum and is
|
|
30
30
|
* the TS-side source of truth for the set of known environment names.
|
|
31
|
-
*
|
|
31
|
+
* One other TS site duplicates the name list:
|
|
32
32
|
* - `assistant/src/util/platform.ts` (`KNOWN_ENVIRONMENTS`)
|
|
33
|
-
*
|
|
34
|
-
* (`NON_PRODUCTION_ENVIRONMENTS`, excludes `production`)
|
|
35
|
-
* Drift between these three sites is caught at test time by
|
|
33
|
+
* Drift between these two sites is caught at test time by
|
|
36
34
|
* `cli/src/__tests__/env-drift.test.ts`. Fast follow: hoist the shared
|
|
37
|
-
* list into a `packages/environments` package so
|
|
35
|
+
* list into a `packages/environments` package so both sites import
|
|
38
36
|
* from one place.
|
|
39
37
|
*
|
|
40
38
|
* Custom environments via a user config file are a future phase — see the
|
|
@@ -45,10 +43,12 @@ export const SEEDS: Record<string, EnvironmentDefinition> = {
|
|
|
45
43
|
production: {
|
|
46
44
|
name: "production",
|
|
47
45
|
platformUrl: "https://platform.vellum.ai",
|
|
46
|
+
webUrl: "https://www.vellum.ai",
|
|
48
47
|
},
|
|
49
48
|
staging: {
|
|
50
49
|
name: "staging",
|
|
51
50
|
platformUrl: "https://staging-platform.vellum.ai",
|
|
51
|
+
webUrl: "https://staging-assistant.vellum.ai",
|
|
52
52
|
portsOverride: portBlock(17000),
|
|
53
53
|
},
|
|
54
54
|
test: {
|
|
@@ -56,16 +56,19 @@ export const SEEDS: Record<string, EnvironmentDefinition> = {
|
|
|
56
56
|
// Non-functional URL — used only by unit tests for URL resolution, never
|
|
57
57
|
// hit in production.
|
|
58
58
|
platformUrl: "https://test-platform.vellum.ai",
|
|
59
|
+
webUrl: "https://dev-assistant.vellum.ai",
|
|
59
60
|
portsOverride: portBlock(19000),
|
|
60
61
|
},
|
|
61
62
|
dev: {
|
|
62
63
|
name: "dev",
|
|
63
64
|
platformUrl: "https://dev-platform.vellum.ai",
|
|
65
|
+
webUrl: "https://dev-assistant.vellum.ai",
|
|
64
66
|
portsOverride: portBlock(18000),
|
|
65
67
|
},
|
|
66
68
|
local: {
|
|
67
69
|
name: "local",
|
|
68
70
|
platformUrl: "http://localhost:8000",
|
|
71
|
+
webUrl: "http://localhost:3000",
|
|
69
72
|
// assistantPlatformUrl: "http://host.docker.internal:8000",
|
|
70
73
|
// ^ uncomment this once dockerized hatch path is live.
|
|
71
74
|
// The assistant runs in a different network namespace than the host.
|
|
@@ -30,6 +30,16 @@ export interface EnvironmentDefinition {
|
|
|
30
30
|
name: string;
|
|
31
31
|
platformUrl: string;
|
|
32
32
|
|
|
33
|
+
/**
|
|
34
|
+
* The web app (Next.js) base URL for browser-facing pages like
|
|
35
|
+
* `/account/login`. In production this is separate from the API backend
|
|
36
|
+
* (e.g. `www.vellum.ai` vs `platform.vellum.ai`); locally it's
|
|
37
|
+
* `localhost:3000` vs `localhost:8000`.
|
|
38
|
+
*
|
|
39
|
+
* Mirrors `VellumEnvironment.webURL` on the Swift side.
|
|
40
|
+
*/
|
|
41
|
+
webUrl: string;
|
|
42
|
+
|
|
33
43
|
/**
|
|
34
44
|
* Override for the platform URL the assistant process itself uses. Only
|
|
35
45
|
* differs from `platformUrl` when the assistant runs in a different network
|
package/src/lib/hatch-local.ts
CHANGED
|
@@ -3,7 +3,6 @@ import {
|
|
|
3
3
|
lstatSync,
|
|
4
4
|
mkdirSync,
|
|
5
5
|
readlinkSync,
|
|
6
|
-
rmSync,
|
|
7
6
|
symlinkSync,
|
|
8
7
|
unlinkSync,
|
|
9
8
|
writeFileSync,
|
|
@@ -19,15 +18,11 @@ import cliPkg from "../../package.json";
|
|
|
19
18
|
import {
|
|
20
19
|
allocateLocalResources,
|
|
21
20
|
findAssistantByName,
|
|
22
|
-
loadAllAssistants,
|
|
23
21
|
saveAssistantEntry,
|
|
24
22
|
setActiveAssistant,
|
|
25
23
|
syncConfigToLockfile,
|
|
26
24
|
} from "./assistant-config.js";
|
|
27
|
-
import type {
|
|
28
|
-
AssistantEntry,
|
|
29
|
-
LocalInstanceResources,
|
|
30
|
-
} from "./assistant-config.js";
|
|
25
|
+
import type { AssistantEntry } from "./assistant-config.js";
|
|
31
26
|
import type { Species } from "./constants.js";
|
|
32
27
|
import { writeInitialConfig } from "./config-utils.js";
|
|
33
28
|
import {
|
|
@@ -37,20 +32,12 @@ import {
|
|
|
37
32
|
stopLocalProcesses,
|
|
38
33
|
} from "./local.js";
|
|
39
34
|
import { maybeStartNgrokTunnel } from "./ngrok.js";
|
|
40
|
-
|
|
41
|
-
import { detectOrphanedProcesses } from "./orphan-detection.js";
|
|
42
|
-
import { isProcessAlive, stopProcess } from "./process.js";
|
|
35
|
+
|
|
43
36
|
import { generateInstanceName } from "./random-name.js";
|
|
44
37
|
import { leaseGuardianToken } from "./guardian-token.js";
|
|
45
38
|
import { archiveLogFile, resetLogFile } from "./xdg-log.js";
|
|
46
39
|
import { emitProgress } from "./desktop-progress.js";
|
|
47
40
|
|
|
48
|
-
const IS_DESKTOP = !!process.env.VELLUM_DESKTOP_APP;
|
|
49
|
-
|
|
50
|
-
function desktopLog(msg: string): void {
|
|
51
|
-
process.stdout.write(msg + "\n");
|
|
52
|
-
}
|
|
53
|
-
|
|
54
41
|
/**
|
|
55
42
|
* Attempts to place a symlink at the given path pointing to cliBinary.
|
|
56
43
|
* Returns true if the symlink was created (or already correct), false on failure.
|
|
@@ -153,110 +140,18 @@ export async function hatchLocal(
|
|
|
153
140
|
name ?? process.env.VELLUM_ASSISTANT_NAME,
|
|
154
141
|
);
|
|
155
142
|
|
|
156
|
-
emitProgress(1,
|
|
157
|
-
|
|
158
|
-
// Clean up stale local state: if daemon/gateway processes are running but
|
|
159
|
-
// the lock file has no entries AND the daemon is not healthy, stop them
|
|
160
|
-
// before starting fresh. A healthy daemon should be reused, not killed —
|
|
161
|
-
// it may have been started intentionally via `vellum wake`.
|
|
162
|
-
const vellumDir = join(homedir(), ".vellum");
|
|
163
|
-
const existingAssistants = loadAllAssistants();
|
|
164
|
-
const localAssistants = existingAssistants.filter((a) => a.cloud === "local");
|
|
165
|
-
if (localAssistants.length === 0) {
|
|
166
|
-
const daemonPid = isProcessAlive(join(vellumDir, "vellum.pid"));
|
|
167
|
-
const gatewayPid = isProcessAlive(join(vellumDir, "gateway.pid"));
|
|
168
|
-
if (daemonPid.alive || gatewayPid.alive) {
|
|
169
|
-
// Check if the daemon is actually healthy before killing it.
|
|
170
|
-
// Default port 7821 is used when there's no lockfile entry.
|
|
171
|
-
const defaultPort = parseInt(process.env.RUNTIME_HTTP_PORT || "7821", 10);
|
|
172
|
-
const healthy = await httpHealthCheck(defaultPort);
|
|
173
|
-
if (!healthy) {
|
|
174
|
-
console.log(
|
|
175
|
-
"🧹 Cleaning up stale local processes (no lock file entry)...\n",
|
|
176
|
-
);
|
|
177
|
-
await stopLocalProcesses();
|
|
178
|
-
}
|
|
179
|
-
}
|
|
180
|
-
}
|
|
181
|
-
|
|
182
|
-
// On desktop, scan the process table for orphaned vellum processes that
|
|
183
|
-
// are not tracked by any PID file or lock file entry and kill them before
|
|
184
|
-
// starting new ones. This prevents resource leaks when the desktop app
|
|
185
|
-
// crashes or is force-quit without a clean shutdown.
|
|
186
|
-
//
|
|
187
|
-
// Skip orphan cleanup if the daemon is already healthy on the expected port
|
|
188
|
-
// — those processes are intentional (e.g. started via `vellum wake`) and
|
|
189
|
-
// startLocalDaemon() will reuse them.
|
|
190
|
-
if (IS_DESKTOP) {
|
|
191
|
-
const existingResources = findAssistantByName(instanceName);
|
|
192
|
-
const expectedPort =
|
|
193
|
-
existingResources?.cloud === "local" && existingResources.resources
|
|
194
|
-
? existingResources.resources.daemonPort
|
|
195
|
-
: undefined;
|
|
196
|
-
const daemonAlreadyHealthy = expectedPort
|
|
197
|
-
? await httpHealthCheck(expectedPort)
|
|
198
|
-
: false;
|
|
143
|
+
emitProgress(1, 6, "Allocating resources...");
|
|
199
144
|
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
);
|
|
206
|
-
for (const orphan of orphans) {
|
|
207
|
-
await stopProcess(
|
|
208
|
-
parseInt(orphan.pid, 10),
|
|
209
|
-
`${orphan.name} (PID ${orphan.pid})`,
|
|
210
|
-
);
|
|
211
|
-
}
|
|
212
|
-
}
|
|
213
|
-
}
|
|
214
|
-
}
|
|
215
|
-
|
|
216
|
-
emitProgress(2, 7, "Allocating resources...");
|
|
217
|
-
|
|
218
|
-
// Reuse existing resources if re-hatching with --name that matches a known
|
|
219
|
-
// local assistant, otherwise allocate fresh per-instance ports and directories.
|
|
220
|
-
let resources: LocalInstanceResources;
|
|
221
|
-
const existingEntry = findAssistantByName(instanceName);
|
|
222
|
-
if (existingEntry?.cloud === "local" && existingEntry.resources) {
|
|
223
|
-
resources = existingEntry.resources;
|
|
224
|
-
} else {
|
|
225
|
-
resources = await allocateLocalResources(instanceName);
|
|
226
|
-
}
|
|
227
|
-
|
|
228
|
-
// Clean up stale workspace data: if the workspace directory already exists for
|
|
229
|
-
// this instance but no local lockfile entry owns it, a previous retire failed
|
|
230
|
-
// to archive it (or a managed-only retire left local data behind). Remove the
|
|
231
|
-
// workspace subtree so the new assistant starts fresh — but preserve the rest
|
|
232
|
-
// of .vellum (e.g. protected/, credentials) which may be shared.
|
|
233
|
-
if (
|
|
234
|
-
!existingEntry ||
|
|
235
|
-
(existingEntry.cloud != null && existingEntry.cloud !== "local")
|
|
236
|
-
) {
|
|
237
|
-
const instanceWorkspaceDir = join(
|
|
238
|
-
resources.instanceDir,
|
|
239
|
-
".vellum",
|
|
240
|
-
"workspace",
|
|
145
|
+
const existing = findAssistantByName(instanceName);
|
|
146
|
+
if (existing && (!existing.cloud || existing.cloud === "local")) {
|
|
147
|
+
throw new Error(
|
|
148
|
+
`An assistant named "${instanceName}" is already hatched.\n` +
|
|
149
|
+
`Run \`vellum wake\` to restart it, or \`vellum retire ${instanceName}\` to remove it first.`,
|
|
241
150
|
);
|
|
242
|
-
if (existsSync(instanceWorkspaceDir)) {
|
|
243
|
-
const ownedByOther = loadAllAssistants().some((a) => {
|
|
244
|
-
if ((a.cloud != null && a.cloud !== "local") || !a.resources)
|
|
245
|
-
return false;
|
|
246
|
-
return (
|
|
247
|
-
join(a.resources.instanceDir, ".vellum", "workspace") ===
|
|
248
|
-
instanceWorkspaceDir
|
|
249
|
-
);
|
|
250
|
-
});
|
|
251
|
-
if (!ownedByOther) {
|
|
252
|
-
console.log(
|
|
253
|
-
`🧹 Removing stale workspace at ${instanceWorkspaceDir} (not owned by any assistant)...\n`,
|
|
254
|
-
);
|
|
255
|
-
rmSync(instanceWorkspaceDir, { recursive: true, force: true });
|
|
256
|
-
}
|
|
257
|
-
}
|
|
258
151
|
}
|
|
259
152
|
|
|
153
|
+
const resources = await allocateLocalResources(instanceName);
|
|
154
|
+
|
|
260
155
|
const logsDir = join(
|
|
261
156
|
resources.instanceDir,
|
|
262
157
|
".vellum",
|
|
@@ -275,17 +170,17 @@ export async function hatchLocal(
|
|
|
275
170
|
process.env.APP_VERSION = cliPkg.version;
|
|
276
171
|
}
|
|
277
172
|
|
|
278
|
-
emitProgress(
|
|
173
|
+
emitProgress(2, 6, "Writing configuration...");
|
|
279
174
|
const defaultWorkspaceConfigPath = writeInitialConfig(configValues);
|
|
280
175
|
|
|
281
|
-
emitProgress(
|
|
176
|
+
emitProgress(3, 6, "Starting assistant...");
|
|
282
177
|
const signingKey = generateLocalSigningKey();
|
|
283
178
|
await startLocalDaemon(watch, resources, {
|
|
284
179
|
defaultWorkspaceConfigPath,
|
|
285
180
|
signingKey,
|
|
286
181
|
});
|
|
287
182
|
|
|
288
|
-
emitProgress(
|
|
183
|
+
emitProgress(4, 6, "Starting gateway...");
|
|
289
184
|
let runtimeUrl = `http://127.0.0.1:${resources.gatewayPort}`;
|
|
290
185
|
try {
|
|
291
186
|
runtimeUrl = await startGateway(watch, resources, { signingKey });
|
|
@@ -303,7 +198,7 @@ export async function hatchLocal(
|
|
|
303
198
|
// instead of hitting /v1/guardian/init itself. Use loopback to satisfy
|
|
304
199
|
// the daemon's local-only check — the mDNS runtimeUrl resolves to a LAN
|
|
305
200
|
// IP which the daemon rejects as non-loopback.
|
|
306
|
-
emitProgress(
|
|
201
|
+
emitProgress(5, 6, "Securing connection...");
|
|
307
202
|
const loopbackUrl = `http://127.0.0.1:${resources.gatewayPort}`;
|
|
308
203
|
const maxLeaseAttempts = 3;
|
|
309
204
|
for (let attempt = 1; attempt <= maxLeaseAttempts; attempt++) {
|
|
@@ -350,7 +245,7 @@ export async function hatchLocal(
|
|
|
350
245
|
writeFileSync(ngrokPidFile, String(ngrokChild.pid));
|
|
351
246
|
}
|
|
352
247
|
|
|
353
|
-
emitProgress(
|
|
248
|
+
emitProgress(6, 6, "Saving configuration...");
|
|
354
249
|
saveAssistantEntry(localEntry);
|
|
355
250
|
setActiveAssistant(instanceName);
|
|
356
251
|
syncConfigToLockfile();
|
package/src/lib/health-check.ts
CHANGED
|
@@ -71,6 +71,104 @@ export async function checkManagedHealth(
|
|
|
71
71
|
}
|
|
72
72
|
}
|
|
73
73
|
|
|
74
|
+
/**
 * One node in the process tree reported for a managed assistant.
 *
 * Entries nest through `children`, so a whole tree can be expressed with a
 * single root entry.
 */
export interface ManagedProcessEntry {
  /** Display name of the process (the legacy fallback uses `"assistant"`). */
  name: string;
  /** Coarse liveness state of the process. */
  status:
    | "running"
    | "not_running"
    | "unreachable";
  /** Optional free-form detail text for this entry. */
  info?: string;
  /** Optional nested entries belonging to this process. */
  children?: ManagedProcessEntry[];
}
|
|
80
|
+
|
|
81
|
+
/**
 * Payload shape returned by {@link fetchManagedPs}: the top-level process
 * entries for one managed assistant.
 */
export interface ManagedPsResponse {
  /** Top-level process entries; each may carry nested `children`. */
  processes: ManagedProcessEntry[];
}
|
|
84
|
+
|
|
85
|
+
/**
 * Fetch the process tree of a managed assistant from the platform.
 *
 * Tries `GET {runtimeUrl}/v1/assistants/{assistantId}/ps/` first. When that
 * endpoint answers 404 or 405 (platform versions that have not rolled it out
 * yet), falls back to the legacy `connection-status` endpoint and translates
 * its answer into the same shape.
 *
 * @param runtimeUrl Base URL the endpoint paths are appended to.
 * @param assistantId Assistant identifier; URL-encoded before use.
 * @returns The parsed response body (cast, not validated), or `null` when no
 *   platform token is stored, auth headers cannot be built, the request fails
 *   or times out (5 s), or the server answers with any other non-OK status.
 */
export async function fetchManagedPs(
  runtimeUrl: string,
  assistantId: string,
): Promise<ManagedPsResponse | null> {
  const { readPlatformToken, authHeaders } =
    await import("./platform-client.js");
  const token = readPlatformToken();
  if (!token) return null;

  let headers: Record<string, string>;
  try {
    headers = await authHeaders(token, runtimeUrl);
  } catch {
    return null;
  }

  // Try the /ps endpoint first; fall back to legacy /connection-status
  // for platform versions that haven't rolled it out yet.
  let timeoutId: ReturnType<typeof setTimeout> | undefined;
  let useLegacyEndpoint = false;
  try {
    const psUrl = `${runtimeUrl}/v1/assistants/${encodeURIComponent(assistantId)}/ps/`;
    const controller = new AbortController();
    timeoutId = setTimeout(() => controller.abort(), 5000);

    const response = await fetch(psUrl, {
      signal: controller.signal,
      headers,
    });

    if (response.ok) {
      // The abort timer is still armed here, so a stalled body read is
      // bounded by the same 5 s budget as the request itself.
      return (await response.json()) as ManagedPsResponse;
    }

    // /ps not available — fall back to legacy connection-status
    useLegacyEndpoint = response.status === 404 || response.status === 405;
  } catch {
    return null;
  } finally {
    // Always release the timer: a pending timeout after a failed fetch would
    // otherwise keep the process alive until it fires.
    clearTimeout(timeoutId);
  }

  return useLegacyEndpoint
    ? fetchLegacyConnectionStatus(runtimeUrl, assistantId, headers)
    : null;
}
|
|
129
|
+
|
|
130
|
+
/**
 * Response body of the legacy `connection-status` endpoint, as consumed by
 * the fallback path. Only `is_awake` and `detail` are read in this file.
 */
interface LegacyConnectionStatus {
  /** Connection state string as reported by the server. */
  state: string;
  /** Whether the assistant is awake; mapped to "running" / "not_running". */
  is_awake: boolean;
  /** Pod status string, or `null` when the server reports none. */
  pod_status: string | null;
  /** Optional detail text, or `null`; surfaced as the entry's `info`. */
  detail: string | null;
}
|
|
136
|
+
|
|
137
|
+
/**
 * Query the legacy `connection-status` endpoint and translate its answer
 * into the process-tree shape used by the newer `/ps` endpoint.
 *
 * Sends `POST {runtimeUrl}/v1/assistants/{assistantId}/connection-status/`
 * with a 5 s abort timeout.
 *
 * @param runtimeUrl Base URL the endpoint path is appended to.
 * @param assistantId Assistant identifier; URL-encoded before use.
 * @param headers Pre-built request headers (including auth).
 * @returns A single-entry process list named `"assistant"`, or `null` when
 *   the request fails, times out, or returns a non-OK status.
 */
async function fetchLegacyConnectionStatus(
  runtimeUrl: string,
  assistantId: string,
  headers: Record<string, string>,
): Promise<ManagedPsResponse | null> {
  let timeoutId: ReturnType<typeof setTimeout> | undefined;
  try {
    const url = `${runtimeUrl}/v1/assistants/${encodeURIComponent(assistantId)}/connection-status/`;
    const controller = new AbortController();
    timeoutId = setTimeout(() => controller.abort(), 5000);

    const response = await fetch(url, {
      method: "POST",
      signal: controller.signal,
      headers,
    });

    if (!response.ok) return null;

    // Body is read while the abort timer is still armed, so a stalled
    // response body cannot hang past the 5 s budget.
    const data = (await response.json()) as LegacyConnectionStatus;

    // Translate legacy shape into the ps process tree
    const status: ManagedProcessEntry["status"] = data.is_awake
      ? "running"
      : "not_running";
    return {
      processes: [
        { name: "assistant", status, info: data.detail ?? undefined },
      ],
    };
  } catch {
    return null;
  } finally {
    // Always release the timer so a failed request does not leave a pending
    // timeout keeping the process alive.
    clearTimeout(timeoutId);
  }
}
|
|
171
|
+
|
|
74
172
|
export async function checkHealth(
|
|
75
173
|
runtimeUrl: string,
|
|
76
174
|
bearerToken?: string,
|
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
import type { UnifiedJobStatus } from "./platform-client.js";
|
|
2
|
+
|
|
3
|
+
/**
 * The subset of {@link UnifiedJobStatus} whose `status` is terminal
 * (`"complete"` or `"failed"`). This is what {@link pollJobUntilDone}
 * resolves with; the caller decides whether a `failed` result is fatal or
 * something to retry.
 */
export type TerminalJobStatus = Extract<UnifiedJobStatus, { status: "complete" | "failed" }>;
|
|
11
|
+
|
|
12
|
+
/** Configuration accepted by {@link pollJobUntilDone}. */
export interface PollJobUntilDoneOptions {
  /** Human-readable label used in warnings and the timeout error (e.g. "export job"). */
  label: string;

  /** Async producer that returns the latest job status. */
  poll: () => Promise<UnifiedJobStatus>;

  /** Delay between successive polls, in milliseconds. Defaults to 2_000. */
  intervalMs?: number;

  /** Maximum wall-clock time to wait, in milliseconds. Defaults to 60 minutes. */
  timeoutMs?: number;

  /**
   * How many consecutive transient (retryable) poll errors are tolerated
   * before the last error is propagated. Transient means 5xx, 429, or an
   * unclassifiable network-style failure. A successful poll resets the
   * counter. Defaults to 5.
   */
  maxTransientErrors?: number;

  /**
   * Optional async hook invoked when `poll()` throws an error whose message
   * contains a `401` HTTP status. The hook should refresh whatever
   * credential the poll closure reads (e.g. re-lease a guardian token) and
   * then return; the loop retries the poll afterwards instead of
   * propagating the 401.
   *
   * Intended for long-running migrations where a cached access token may
   * expire mid-poll. Without this hook a 401 is classified like any other
   * 4xx (except 429): permanent, so the poll loop gives up on a job that
   * may still be running on the server.
   */
  refreshOn401?: () => Promise<void>;

  /**
   * How many consecutive 401 refreshes are tolerated before the last 401 is
   * propagated. Counted separately from {@link maxTransientErrors} because
   * a 401 that persists after a refresh usually means the credential is
   * revoked rather than a transient network issue. Defaults to 3.
   */
  maxAuthRefreshes?: number;
}
|
|
47
|
+
|
|
48
|
+
/** Default pause between polls: two seconds. */
const DEFAULT_INTERVAL_MS = 2 * 1_000;

// One hour. Per the original note, this matches the server-side runtime
// migration window: the GCS upload PUT and the import-URL fetch in
// assistant/src/runtime/routes/migration-routes.ts use
// AbortSignal.timeout(60 * 60 * 1000), so a shorter CLI poll cap would
// abort a job that's still legitimately in progress on the server.
const DEFAULT_TIMEOUT_MS = 60 * 60 * 1_000;

/** Default cap on consecutive transient poll failures. */
const DEFAULT_MAX_TRANSIENT_ERRORS = 5;

/** Default cap on consecutive 401-triggered credential refreshes. */
const DEFAULT_MAX_AUTH_REFRESHES = 3;
|
|
56
|
+
|
|
57
|
+
/**
 * Report whether a thrown value's message mentions `401` as a standalone
 * token (word boundaries on both sides).
 *
 * Non-`Error` values are stringified before matching.
 */
function is401Error(err: unknown): boolean {
  let text: string;
  if (err instanceof Error) {
    text = err.message;
  } else {
    text = String(err);
  }
  return text.search(/\b401\b/) !== -1;
}
|
|
61
|
+
|
|
62
|
+
/** Resolve after `ms` milliseconds, using a plain `setTimeout`. */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((wake) => {
    setTimeout(wake, ms);
  });
}
|
|
65
|
+
|
|
66
|
+
// Matches "not found" in any letter case (e.g. "Job Not Found").
const NOT_FOUND_RE = /not found/i;

// Captures a standalone three-digit number following "failed". The trailing
// `\b` stops longer numbers or unit-suffixed values ("4500ms", "400ms",
// port numbers) from being misread as HTTP status codes.
const STATUS_AFTER_FAILED_RE = /failed[^\d]*(\d{3})\b/i;

/**
 * Heuristic classification used by {@link pollJobUntilDone} to decide whether
 * to retry a failed poll.
 *
 * - "not found" errors (any letter case) are permanent — they indicate the
 *   job id is wrong and retrying won't help.
 * - 4xx responses are permanent, except 429 (rate limited) which is
 *   transient.
 * - 5xx responses are transient.
 * - Anything unclassifiable (e.g. "fetch failed", ECONNRESET) is transient.
 *
 * The HTTP status is parsed out of the error message, looking for a
 * three-digit number after the word "failed" (e.g. `"Local job status check
 * failed: 503 Service Unavailable"`). When no status can be found the
 * function defaults to "retry".
 *
 * @param err The value thrown by the poll function; non-`Error` values are
 *   stringified.
 * @returns `true` when the poll should be retried, `false` when the error is
 *   permanent.
 */
function isTransientPollError(err: unknown): boolean {
  const msg = err instanceof Error ? err.message : String(err);

  if (NOT_FOUND_RE.test(msg)) return false;

  const match = STATUS_AFTER_FAILED_RE.exec(msg);
  if (match) {
    const code = parseInt(match[1], 10);
    if (code === 429) return true;
    if (code >= 400 && code < 500) return false;
    if (code >= 500) return true;
  }

  // Unclassifiable (e.g. "fetch failed", ECONNRESET) — treat as transient so
  // a single network hiccup doesn't abort a long-running migration.
  return true;
}
|
|
99
|
+
|
|
100
|
+
/** Build the error thrown once the polling deadline has passed. */
function pollTimeoutError(label: string, timeoutMs: number): Error {
  return new Error(
    `Timed out waiting for ${label} after ${Math.round(timeoutMs / 1000)}s`,
  );
}

/**
 * Poll `options.poll` until it returns a terminal status (`complete` or
 * `failed`), or until `timeoutMs` elapses.
 *
 * On terminal status, returns the status object — including the `failed`
 * case. The caller decides how to treat a failed terminal status (e.g.
 * print the `error` field and exit).
 *
 * Error handling for a throwing `poll()`:
 * - If `refreshOn401` is set and the error message mentions 401, the hook is
 *   awaited and the poll retried, up to `maxAuthRefreshes` consecutive
 *   times. An error thrown by the hook itself propagates.
 * - Otherwise transient errors (5xx, network hiccups, rate limits) are
 *   retried up to `maxTransientErrors` consecutive times.
 * - Permanent errors, and the last error once a cap is exceeded, are
 *   rethrown unchanged.
 * A successful poll resets both counters.
 *
 * The deadline is checked after each non-terminal poll or tolerated error,
 * before sleeping, so the final poll may start up to one interval after the
 * deadline.
 *
 * @param options Poll function, label and optional tuning; see
 *   {@link PollJobUntilDoneOptions}.
 * @returns The first terminal status returned by `options.poll`.
 * @throws Error with a "Timed out waiting for …" message when the deadline
 *   passes, or the poll error as described above.
 */
export async function pollJobUntilDone(
  options: PollJobUntilDoneOptions,
): Promise<TerminalJobStatus> {
  const intervalMs = options.intervalMs ?? DEFAULT_INTERVAL_MS;
  const timeoutMs = options.timeoutMs ?? DEFAULT_TIMEOUT_MS;
  const maxTransientErrors =
    options.maxTransientErrors ?? DEFAULT_MAX_TRANSIENT_ERRORS;
  const maxAuthRefreshes =
    options.maxAuthRefreshes ?? DEFAULT_MAX_AUTH_REFRESHES;
  const deadline = Date.now() + timeoutMs;

  let consecutiveTransientErrors = 0;
  let consecutiveAuthRefreshes = 0;

  // Shared tail of every non-terminal iteration: give up if the deadline has
  // passed, otherwise wait one interval before the next poll.
  const pauseBeforeNextPoll = async (): Promise<void> => {
    if (Date.now() >= deadline) {
      throw pollTimeoutError(options.label, timeoutMs);
    }
    await sleep(intervalMs);
  };

  // First poll happens immediately so fast-path completions don't wait
  // one interval before returning.
  while (true) {
    let status: UnifiedJobStatus;
    try {
      status = await options.poll();
      consecutiveTransientErrors = 0;
      consecutiveAuthRefreshes = 0;
    } catch (err) {
      const msg = err instanceof Error ? err.message : String(err);

      // 401 Unauthorized takes precedence over the generic transient
      // classifier: when a refresh callback is registered, a long-running
      // poll loop can re-lease its credential and keep going instead of
      // abandoning a migration that's still running on the server.
      if (options.refreshOn401 && is401Error(err)) {
        consecutiveAuthRefreshes += 1;
        if (consecutiveAuthRefreshes > maxAuthRefreshes) {
          throw err;
        }
        console.warn(
          `${options.label} polling got 401, refreshing auth and retrying... (${msg})`,
        );
        await options.refreshOn401();
      } else {
        if (!isTransientPollError(err)) {
          throw err;
        }
        consecutiveTransientErrors += 1;
        if (consecutiveTransientErrors > maxTransientErrors) {
          throw err;
        }
        console.warn(`${options.label} polling failed, retrying... (${msg})`);
      }

      await pauseBeforeNextPoll();
      continue;
    }

    if (status.status === "complete" || status.status === "failed") {
      return status;
    }

    await pauseBeforeNextPoll();
  }
}
|