@vellumai/cli 0.5.6 → 0.5.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,5 @@
1
1
  import { randomBytes } from "crypto";
2
+ import { join } from "node:path";
2
3
 
3
4
  import cliPkg from "../../package.json";
4
5
 
@@ -11,9 +12,6 @@ import {
11
12
  import type { AssistantEntry } from "../lib/assistant-config";
12
13
  import {
13
14
  captureImageRefs,
14
- clearSigningKeyBootstrapLock,
15
- DOCKERHUB_IMAGES,
16
- DOCKER_READY_TIMEOUT_MS,
17
15
  GATEWAY_INTERNAL_PORT,
18
16
  dockerResourceNames,
19
17
  migrateCesSecurityFiles,
@@ -21,14 +19,37 @@ import {
21
19
  startContainers,
22
20
  stopContainers,
23
21
  } from "../lib/docker";
24
- import type { ServiceName } from "../lib/docker";
22
+ import { resolveImageRefs } from "../lib/platform-releases";
25
23
  import {
26
24
  fetchOrganizationId,
27
25
  getPlatformUrl,
28
26
  readPlatformToken,
29
27
  } from "../lib/platform-client";
30
- import { loadBootstrapSecret, loadGuardianToken } from "../lib/guardian-token";
31
- import { exec, execOutput } from "../lib/step-runner";
28
+ import {
29
+ loadBootstrapSecret,
30
+ saveBootstrapSecret,
31
+ } from "../lib/guardian-token";
32
+ import {
33
+ createBackup,
34
+ pruneOldBackups,
35
+ restoreBackup,
36
+ } from "../lib/backup-ops.js";
37
+ import { emitCliError, categorizeUpgradeError } from "../lib/cli-error.js";
38
+ import { exec } from "../lib/step-runner.js";
39
+ import {
40
+ broadcastUpgradeEvent,
41
+ buildCompleteEvent,
42
+ buildProgressEvent,
43
+ buildStartingEvent,
44
+ buildUpgradeCommitMessage,
45
+ captureContainerEnv,
46
+ CONTAINER_ENV_EXCLUDE_KEYS,
47
+ rollbackMigrations,
48
+ UPGRADE_PROGRESS,
49
+ waitForReady,
50
+ } from "../lib/upgrade-lifecycle.js";
51
+ import { parseVersion } from "../lib/version-compat.js";
52
+ import { commitWorkspaceState } from "../lib/workspace-git.js";
32
53
 
33
54
  interface UpgradeArgs {
34
55
  name: string | null;
@@ -72,6 +93,7 @@ function parseArgs(): UpgradeArgs {
72
93
  const next = args[i + 1];
73
94
  if (!next || next.startsWith("-")) {
74
95
  console.error("Error: --version requires a value");
96
+ emitCliError("UNKNOWN", "--version requires a value");
75
97
  process.exit(1);
76
98
  }
77
99
  version = next;
@@ -80,6 +102,7 @@ function parseArgs(): UpgradeArgs {
80
102
  name = arg;
81
103
  } else {
82
104
  console.error(`Error: Unknown option '${arg}'.`);
105
+ emitCliError("UNKNOWN", `Unknown option '${arg}'`);
83
106
  process.exit(1);
84
107
  }
85
108
  }
@@ -111,6 +134,10 @@ function resolveTargetAssistant(nameArg: string | null): AssistantEntry {
111
134
  const entry = findAssistantByName(nameArg);
112
135
  if (!entry) {
113
136
  console.error(`No assistant found with name '${nameArg}'.`);
137
+ emitCliError(
138
+ "ASSISTANT_NOT_FOUND",
139
+ `No assistant found with name '${nameArg}'.`,
140
+ );
114
141
  process.exit(1);
115
142
  }
116
143
  return entry;
@@ -126,124 +153,32 @@ function resolveTargetAssistant(nameArg: string | null): AssistantEntry {
126
153
  if (all.length === 1) return all[0];
127
154
 
128
155
  if (all.length === 0) {
129
- console.error("No assistants found. Run 'vellum hatch' first.");
156
+ const msg = "No assistants found. Run 'vellum hatch' first.";
157
+ console.error(msg);
158
+ emitCliError("ASSISTANT_NOT_FOUND", msg);
130
159
  } else {
131
- console.error(
132
- "Multiple assistants found. Specify a name or set an active assistant with 'vellum use <name>'.",
133
- );
160
+ const msg =
161
+ "Multiple assistants found. Specify a name or set an active assistant with 'vellum use <name>'.";
162
+ console.error(msg);
163
+ emitCliError("ASSISTANT_NOT_FOUND", msg);
134
164
  }
135
165
  process.exit(1);
136
166
  }
137
167
 
138
- /**
139
- * Capture environment variables from a running Docker container so they
140
- * can be replayed onto the replacement container after upgrade.
141
- */
142
- export async function captureContainerEnv(
143
- containerName: string,
144
- ): Promise<Record<string, string>> {
145
- const captured: Record<string, string> = {};
146
- try {
147
- const raw = await execOutput("docker", [
148
- "inspect",
149
- "--format",
150
- "{{json .Config.Env}}",
151
- containerName,
152
- ]);
153
- const entries = JSON.parse(raw) as string[];
154
- for (const entry of entries) {
155
- const eqIdx = entry.indexOf("=");
156
- if (eqIdx > 0) {
157
- captured[entry.slice(0, eqIdx)] = entry.slice(eqIdx + 1);
158
- }
159
- }
160
- } catch {
161
- // Container may not exist or not be inspectable
162
- }
163
- return captured;
164
- }
165
-
166
- /**
167
- * Poll the gateway `/readyz` endpoint until it returns 200 or the timeout
168
- * elapses. Returns whether the assistant became ready.
169
- */
170
- export async function waitForReady(runtimeUrl: string): Promise<boolean> {
171
- const readyUrl = `${runtimeUrl}/readyz`;
172
- const start = Date.now();
173
-
174
- while (Date.now() - start < DOCKER_READY_TIMEOUT_MS) {
175
- try {
176
- const resp = await fetch(readyUrl, {
177
- signal: AbortSignal.timeout(5000),
178
- });
179
- if (resp.ok) {
180
- const elapsedSec = ((Date.now() - start) / 1000).toFixed(1);
181
- console.log(`Assistant ready after ${elapsedSec}s`);
182
- return true;
183
- }
184
- let detail = "";
185
- try {
186
- const body = await resp.text();
187
- const json = JSON.parse(body);
188
- const parts = [json.status];
189
- if (json.upstream != null) parts.push(`upstream=${json.upstream}`);
190
- detail = ` — ${parts.join(", ")}`;
191
- } catch {
192
- // ignore parse errors
193
- }
194
- console.log(`Readiness check: ${resp.status}${detail} (retrying...)`);
195
- } catch {
196
- // Connection refused / timeout — not up yet
197
- }
198
- await new Promise((r) => setTimeout(r, 1000));
199
- }
200
-
201
- return false;
202
- }
203
-
204
- /**
205
- * Best-effort broadcast of an upgrade lifecycle event to connected clients
206
- * via the gateway's upgrade-broadcast proxy. Uses guardian token auth.
207
- * Failures are logged but never block the upgrade flow.
208
- */
209
- export async function broadcastUpgradeEvent(
210
- gatewayUrl: string,
211
- assistantId: string,
212
- event: Record<string, unknown>,
213
- ): Promise<void> {
214
- try {
215
- const token = loadGuardianToken(assistantId);
216
- const headers: Record<string, string> = {
217
- "Content-Type": "application/json",
218
- };
219
- if (token?.accessToken) {
220
- headers["Authorization"] = `Bearer ${token.accessToken}`;
221
- }
222
- await fetch(`${gatewayUrl}/v1/admin/upgrade-broadcast`, {
223
- method: "POST",
224
- headers,
225
- body: JSON.stringify(event),
226
- signal: AbortSignal.timeout(3000),
227
- });
228
- } catch {
229
- // Best-effort — gateway/daemon may already be shutting down or not yet ready
230
- }
231
- }
232
-
233
168
  async function upgradeDocker(
234
169
  entry: AssistantEntry,
235
170
  version: string | null,
236
171
  ): Promise<void> {
237
172
  const instanceName = entry.assistantId;
238
173
  const res = dockerResourceNames(instanceName);
174
+ const workspaceDir = entry.resources
175
+ ? join(entry.resources.instanceDir, ".vellum", "workspace")
176
+ : null;
239
177
 
240
178
  const versionTag =
241
179
  version ?? (cliPkg.version ? `v${cliPkg.version}` : "latest");
242
- const imageTags: Record<ServiceName, string> = {
243
- assistant: `${DOCKERHUB_IMAGES.assistant}:${versionTag}`,
244
- "credential-executor": `${DOCKERHUB_IMAGES["credential-executor"]}:${versionTag}`,
245
- gateway: `${DOCKERHUB_IMAGES.gateway}:${versionTag}`,
246
- };
180
+ console.log("🔍 Resolving image references...");
181
+ const { imageTags } = await resolveImageRefs(versionTag);
247
182
 
248
183
  console.log(
249
184
  `🔄 Upgrading Docker assistant '${instanceName}' to ${versionTag}...\n`,
@@ -265,6 +200,86 @@ async function upgradeDocker(
265
200
  );
266
201
  }
267
202
 
203
+ // Capture current migration state for rollback targeting.
204
+ // Must happen while daemon is still running (before containers are stopped).
205
+ let preMigrationState: {
206
+ dbVersion?: number;
207
+ lastWorkspaceMigrationId?: string;
208
+ } = {};
209
+ try {
210
+ const healthResp = await fetch(
211
+ `${entry.runtimeUrl}/healthz?include=migrations`,
212
+ {
213
+ signal: AbortSignal.timeout(5000),
214
+ },
215
+ );
216
+ if (healthResp.ok) {
217
+ const health = (await healthResp.json()) as {
218
+ migrations?: { dbVersion?: number; lastWorkspaceMigrationId?: string };
219
+ };
220
+ preMigrationState = health.migrations ?? {};
221
+ }
222
+ } catch {
223
+ // Best-effort — if we can't get migration state, rollback will skip migration reversal
224
+ }
225
+
226
+ // Detect if this upgrade is actually a downgrade (user picked an older
227
+ // version via the version picker). Used after readiness succeeds to align
228
+ // the DB schema with the now-running old daemon.
229
+ const currentVersion = entry.serviceGroupVersion;
230
+ const isDowngrade =
231
+ currentVersion &&
232
+ versionTag &&
233
+ (() => {
234
+ const current = parseVersion(currentVersion);
235
+ const target = parseVersion(versionTag);
236
+ if (!current || !target) return false;
237
+ if (target.major !== current.major) return target.major < current.major;
238
+ if (target.minor !== current.minor) return target.minor < current.minor;
239
+ return target.patch < current.patch;
240
+ })();
241
+
242
+ // For downgrades, fetch the target version's migration ceiling from the
243
+ // releases API. This tells us exactly which DB migration version and
244
+ // workspace migration the target version expects, enabling a precise
245
+ // rollback on the CURRENT (newer) daemon before swapping containers.
246
+ let targetMigrationCeiling: {
247
+ dbVersion?: number;
248
+ workspaceMigrationId?: string;
249
+ } = {};
250
+ if (isDowngrade) {
251
+ try {
252
+ const platformUrl = getPlatformUrl();
253
+ const releasesResp = await fetch(
254
+ `${platformUrl}/v1/releases/?stable=true`,
255
+ { signal: AbortSignal.timeout(10000) },
256
+ );
257
+ if (releasesResp.ok) {
258
+ const releases = (await releasesResp.json()) as Array<{
259
+ version: string;
260
+ db_migration_version?: number | null;
261
+ last_workspace_migration_id?: string;
262
+ }>;
263
+ const normalizedTag = versionTag.replace(/^v/, "");
264
+ const targetRelease = releases.find(
265
+ (r) => r.version?.replace(/^v/, "") === normalizedTag,
266
+ );
267
+ if (
268
+ targetRelease?.db_migration_version != null ||
269
+ targetRelease?.last_workspace_migration_id
270
+ ) {
271
+ targetMigrationCeiling = {
272
+ dbVersion: targetRelease.db_migration_version ?? undefined,
273
+ workspaceMigrationId:
274
+ targetRelease.last_workspace_migration_id || undefined,
275
+ };
276
+ }
277
+ }
278
+ } catch {
279
+ // Best-effort — fall back to rollbackToRegistryCeiling post-swap
280
+ }
281
+ }
282
+
268
283
  // Persist rollback state to lockfile BEFORE any destructive changes.
269
284
  // This enables the `vellum rollback` command to restore the previous version.
270
285
  if (entry.serviceGroupVersion && entry.containerInfo) {
@@ -272,36 +287,80 @@ async function upgradeDocker(
272
287
  ...entry,
273
288
  previousServiceGroupVersion: entry.serviceGroupVersion,
274
289
  previousContainerInfo: { ...entry.containerInfo },
290
+ previousDbMigrationVersion: preMigrationState.dbVersion,
291
+ previousWorkspaceMigrationId: preMigrationState.lastWorkspaceMigrationId,
275
292
  };
276
293
  saveAssistantEntry(rollbackEntry);
277
294
  console.log(` Saved rollback state: ${entry.serviceGroupVersion}\n`);
278
295
  }
279
296
 
297
+ // Record version transition start in workspace git history
298
+ if (workspaceDir) {
299
+ try {
300
+ await commitWorkspaceState(
301
+ workspaceDir,
302
+ buildUpgradeCommitMessage({
303
+ action: "upgrade",
304
+ phase: "starting",
305
+ from: entry.serviceGroupVersion ?? "unknown",
306
+ to: versionTag,
307
+ topology: "docker",
308
+ assistantId: entry.assistantId,
309
+ }),
310
+ );
311
+ } catch (err) {
312
+ console.warn(
313
+ `⚠️ Failed to create pre-upgrade workspace commit: ${err instanceof Error ? err.message : String(err)}`,
314
+ );
315
+ }
316
+ }
317
+
280
318
  console.log("💾 Capturing existing container environment...");
281
319
  const capturedEnv = await captureContainerEnv(res.assistantContainer);
282
320
  console.log(
283
321
  ` Captured ${Object.keys(capturedEnv).length} env var(s) from ${res.assistantContainer}\n`,
284
322
  );
285
323
 
286
- console.log("📦 Pulling new Docker images...");
287
- await exec("docker", ["pull", imageTags.assistant]);
288
- await exec("docker", ["pull", imageTags.gateway]);
289
- await exec("docker", ["pull", imageTags["credential-executor"]]);
290
- console.log("✅ Docker images pulled\n");
291
-
292
324
  // Notify connected clients that an upgrade is about to begin.
325
+ // This must fire BEFORE any progress broadcasts so the UI sets
326
+ // isUpdateInProgress = true and starts displaying status messages.
293
327
  console.log("📢 Notifying connected clients...");
294
- await broadcastUpgradeEvent(entry.runtimeUrl, entry.assistantId, {
295
- type: "starting",
296
- targetVersion: versionTag,
297
- expectedDowntimeSeconds: 60,
298
- });
299
- // Brief pause to allow SSE delivery before containers stop.
328
+ await broadcastUpgradeEvent(
329
+ entry.runtimeUrl,
330
+ entry.assistantId,
331
+ buildStartingEvent(versionTag),
332
+ );
333
+ // Brief pause to allow SSE delivery before progress events.
300
334
  await new Promise((r) => setTimeout(r, 500));
301
335
 
302
- console.log("🛑 Stopping existing containers...");
303
- await stopContainers(res);
304
- console.log("✅ Containers stopped\n");
336
+ await broadcastUpgradeEvent(
337
+ entry.runtimeUrl,
338
+ entry.assistantId,
339
+ buildProgressEvent(UPGRADE_PROGRESS.DOWNLOADING),
340
+ );
341
+ console.log("📦 Pulling new Docker images...");
342
+ const pullImages: Array<[string, string]> = [
343
+ ["assistant", imageTags.assistant],
344
+ ["gateway", imageTags.gateway],
345
+ ["credential-executor", imageTags["credential-executor"]],
346
+ ];
347
+ try {
348
+ for (const [service, image] of pullImages) {
349
+ console.log(` Pulling ${service}: ${image}`);
350
+ await exec("docker", ["pull", image]);
351
+ }
352
+ } catch (pullErr) {
353
+ const detail = pullErr instanceof Error ? pullErr.message : String(pullErr);
354
+ console.error(`\n❌ Failed to pull Docker images: ${detail}`);
355
+ await broadcastUpgradeEvent(
356
+ entry.runtimeUrl,
357
+ entry.assistantId,
358
+ buildCompleteEvent(entry.serviceGroupVersion ?? "unknown", false),
359
+ );
360
+ emitCliError("IMAGE_PULL_FAILED", "Failed to pull Docker images", detail);
361
+ process.exit(1);
362
+ }
363
+ console.log("✅ Docker images pulled\n");
305
364
 
306
365
  // Parse gateway port from entry's runtimeUrl, fall back to default
307
366
  let gatewayPort = GATEWAY_INTERNAL_PORT;
@@ -324,18 +383,91 @@ async function upgradeDocker(
324
383
 
325
384
  // Retrieve or generate a bootstrap secret for the gateway. The secret was
326
385
  // persisted to disk during hatch; older instances won't have one yet.
327
- const bootstrapSecret =
328
- loadBootstrapSecret(instanceName) || randomBytes(32).toString("hex");
386
+ // This runs BEFORE stopping containers so a write failure (disk full,
387
+ // permissions) doesn't leave the assistant offline.
388
+ const loadedSecret = loadBootstrapSecret(instanceName);
389
+ const bootstrapSecret = loadedSecret || randomBytes(32).toString("hex");
390
+ if (!loadedSecret) {
391
+ saveBootstrapSecret(instanceName, bootstrapSecret);
392
+ }
393
+
394
+ // Extract or generate the shared JWT signing key. Pre-env-var instances
395
+ // won't have it in capturedEnv, so generate fresh in that case.
396
+ const signingKey =
397
+ capturedEnv["ACTOR_TOKEN_SIGNING_KEY"] || randomBytes(32).toString("hex");
398
+
399
+ // Create pre-upgrade backup (best-effort, daemon must be running)
400
+ await broadcastUpgradeEvent(
401
+ entry.runtimeUrl,
402
+ entry.assistantId,
403
+ buildProgressEvent(UPGRADE_PROGRESS.BACKING_UP),
404
+ );
405
+ console.log("📦 Creating pre-upgrade backup...");
406
+ const backupPath = await createBackup(entry.runtimeUrl, entry.assistantId, {
407
+ prefix: `${entry.assistantId}-pre-upgrade`,
408
+ description: `Pre-upgrade snapshot before ${entry.serviceGroupVersion ?? "unknown"} → ${versionTag}`,
409
+ });
410
+ if (backupPath) {
411
+ console.log(` Backup saved: ${backupPath}\n`);
412
+ // Clean up old pre-upgrade backups, keep last 3
413
+ pruneOldBackups(entry.assistantId, 3);
414
+ } else {
415
+ console.warn("⚠️ Pre-upgrade backup failed (continuing with upgrade)\n");
416
+ }
417
+
418
+ // Persist the backup path so `vellum rollback` can restore the exact backup
419
+ // created for this upgrade attempt — never a stale backup from a prior cycle.
420
+ // Re-read the entry to pick up the rollback state saved earlier.
421
+ {
422
+ const current = findAssistantByName(entry.assistantId);
423
+ if (current) {
424
+ saveAssistantEntry({
425
+ ...current,
426
+ preUpgradeBackupPath: backupPath ?? undefined,
427
+ });
428
+ }
429
+ }
430
+
431
+ await broadcastUpgradeEvent(
432
+ entry.runtimeUrl,
433
+ entry.assistantId,
434
+ buildProgressEvent(UPGRADE_PROGRESS.INSTALLING),
435
+ );
436
+
437
+ // If we have the target version's migration ceiling, run a PRECISE
438
+ // rollback on the CURRENT (newer) daemon before stopping it. The current
439
+ // daemon has the `down()` code for all migrations it applied, so it can
440
+ // cleanly revert to the target version's ceiling. This is critical for
441
+ // multi-version downgrades where the old daemon wouldn't know about
442
+ // migrations introduced after its release.
443
+ let preSwapRollbackOk = true;
444
+ if (
445
+ isDowngrade &&
446
+ (targetMigrationCeiling.dbVersion !== undefined ||
447
+ targetMigrationCeiling.workspaceMigrationId !== undefined)
448
+ ) {
449
+ console.log("🔄 Reverting database changes for downgrade...");
450
+ await broadcastUpgradeEvent(
451
+ entry.runtimeUrl,
452
+ entry.assistantId,
453
+ buildProgressEvent(UPGRADE_PROGRESS.REVERTING_MIGRATIONS),
454
+ );
455
+ preSwapRollbackOk = await rollbackMigrations(
456
+ entry.runtimeUrl,
457
+ entry.assistantId,
458
+ targetMigrationCeiling.dbVersion,
459
+ targetMigrationCeiling.workspaceMigrationId,
460
+ );
461
+ }
462
+
463
+ console.log("🛑 Stopping existing containers...");
464
+ await stopContainers(res);
465
+ console.log("✅ Containers stopped\n");
329
466
 
330
467
  // Build the set of extra env vars to replay on the new assistant container.
331
468
  // Captured env vars serve as the base; keys already managed by
332
469
  // serviceDockerRunArgs are excluded to avoid duplicates.
333
- const envKeysSetByRunArgs = new Set([
334
- "CES_SERVICE_TOKEN",
335
- "VELLUM_ASSISTANT_NAME",
336
- "RUNTIME_HTTP_HOST",
337
- "PATH",
338
- ]);
470
+ const envKeysSetByRunArgs = new Set(CONTAINER_ENV_EXCLUDE_KEYS);
339
471
  // Only exclude keys that serviceDockerRunArgs will actually set
340
472
  for (const envVar of ["ANTHROPIC_API_KEY", "VELLUM_PLATFORM_URL"]) {
341
473
  if (process.env[envVar]) {
@@ -355,12 +487,10 @@ async function upgradeDocker(
355
487
  console.log("🔄 Migrating credential files to CES security volume...");
356
488
  await migrateCesSecurityFiles(res, (msg) => console.log(msg));
357
489
 
358
- console.log("🔑 Clearing signing key bootstrap lock...");
359
- await clearSigningKeyBootstrapLock(res);
360
-
361
490
  console.log("🚀 Starting upgraded containers...");
362
491
  await startContainers(
363
492
  {
493
+ signingKey,
364
494
  bootstrapSecret,
365
495
  cesServiceToken,
366
496
  extraAssistantEnv,
@@ -392,15 +522,62 @@ async function upgradeDocker(
392
522
  },
393
523
  previousServiceGroupVersion: entry.serviceGroupVersion,
394
524
  previousContainerInfo: entry.containerInfo,
525
+ previousDbMigrationVersion: preMigrationState.dbVersion,
526
+ previousWorkspaceMigrationId: preMigrationState.lastWorkspaceMigrationId,
527
+ // Preserve the backup path so `vellum rollback` can restore it later
528
+ preUpgradeBackupPath: backupPath ?? undefined,
395
529
  };
396
530
  saveAssistantEntry(updatedEntry);
397
531
 
532
+ // After a downgrade, fall back to asking the now-running old daemon
533
+ // to roll back migrations above its own registry ceiling when either:
534
+ // (a) no release metadata was available for a precise pre-swap rollback, or
535
+ // (b) the precise pre-swap rollback failed (timeout, daemon crash, etc.).
536
+ // This is a no-op for multi-version jumps where the old daemon doesn't
537
+ // know about the newer migrations, but correct for single-step rollbacks.
538
+ if (
539
+ isDowngrade &&
540
+ (!preSwapRollbackOk ||
541
+ (targetMigrationCeiling.dbVersion === undefined &&
542
+ targetMigrationCeiling.workspaceMigrationId === undefined))
543
+ ) {
544
+ await rollbackMigrations(
545
+ entry.runtimeUrl,
546
+ entry.assistantId,
547
+ undefined,
548
+ undefined,
549
+ true,
550
+ );
551
+ }
552
+
398
553
  // Notify clients on the new service group that the upgrade succeeded.
399
- await broadcastUpgradeEvent(entry.runtimeUrl, entry.assistantId, {
400
- type: "complete",
401
- installedVersion: versionTag,
402
- success: true,
403
- });
554
+ await broadcastUpgradeEvent(
555
+ entry.runtimeUrl,
556
+ entry.assistantId,
557
+ buildCompleteEvent(versionTag, true),
558
+ );
559
+
560
+ // Record successful upgrade in workspace git history
561
+ if (workspaceDir) {
562
+ try {
563
+ await commitWorkspaceState(
564
+ workspaceDir,
565
+ buildUpgradeCommitMessage({
566
+ action: "upgrade",
567
+ phase: "complete",
568
+ from: entry.serviceGroupVersion ?? "unknown",
569
+ to: versionTag,
570
+ topology: "docker",
571
+ assistantId: entry.assistantId,
572
+ result: "success",
573
+ }),
574
+ );
575
+ } catch (err) {
576
+ console.warn(
577
+ `⚠️ Failed to create post-upgrade workspace commit: ${err instanceof Error ? err.message : String(err)}`,
578
+ );
579
+ }
580
+ }
404
581
 
405
582
  console.log(
406
583
  `\n✅ Docker assistant '${instanceName}' upgraded to ${versionTag}.`,
@@ -409,12 +586,41 @@ async function upgradeDocker(
409
586
  console.error(`\n❌ Containers failed to become ready within the timeout.`);
410
587
 
411
588
  if (previousImageRefs) {
589
+ await broadcastUpgradeEvent(
590
+ entry.runtimeUrl,
591
+ entry.assistantId,
592
+ buildProgressEvent(UPGRADE_PROGRESS.REVERTING),
593
+ );
412
594
  console.log(`\n🔄 Rolling back to previous images...`);
413
595
  try {
596
+ // Attempt to roll back migrations before swapping containers.
597
+ // The new daemon may be partially up — try best-effort.
598
+ if (
599
+ preMigrationState.dbVersion !== undefined ||
600
+ preMigrationState.lastWorkspaceMigrationId !== undefined
601
+ ) {
602
+ console.log("🔄 Reverting database changes...");
603
+ await broadcastUpgradeEvent(
604
+ entry.runtimeUrl,
605
+ entry.assistantId,
606
+ buildProgressEvent(UPGRADE_PROGRESS.REVERTING_MIGRATIONS),
607
+ );
608
+ await rollbackMigrations(
609
+ entry.runtimeUrl,
610
+ entry.assistantId,
611
+ preMigrationState.dbVersion,
612
+ preMigrationState.lastWorkspaceMigrationId,
613
+ );
614
+ }
615
+
414
616
  await stopContainers(res);
415
617
 
618
+ await migrateGatewaySecurityFiles(res, (msg) => console.log(msg));
619
+ await migrateCesSecurityFiles(res, (msg) => console.log(msg));
620
+
416
621
  await startContainers(
417
622
  {
623
+ signingKey,
418
624
  bootstrapSecret,
419
625
  cesServiceToken,
420
626
  extraAssistantEnv,
@@ -428,46 +634,90 @@ async function upgradeDocker(
428
634
 
429
635
  const rollbackReady = await waitForReady(entry.runtimeUrl);
430
636
  if (rollbackReady) {
431
- // Restore previous container info in lockfile after rollback.
432
- // previousImageRefs contains sha256 digests from `docker inspect
433
- // --format {{.Image}}`. The *Image fields should hold
434
- // human-readable image:tag names, so prefer the pre-upgrade
435
- // containerInfo values and store digests in the *Digest fields.
436
- if (previousImageRefs) {
437
- const rolledBackEntry: AssistantEntry = {
438
- ...entry,
439
- containerInfo: {
440
- assistantImage:
441
- entry.containerInfo?.assistantImage ??
442
- previousImageRefs.assistant,
443
- gatewayImage:
444
- entry.containerInfo?.gatewayImage ??
445
- previousImageRefs.gateway,
446
- cesImage:
447
- entry.containerInfo?.cesImage ??
448
- previousImageRefs["credential-executor"],
449
- assistantDigest: previousImageRefs.assistant,
450
- gatewayDigest: previousImageRefs.gateway,
451
- cesDigest: previousImageRefs["credential-executor"],
452
- networkName: res.network,
453
- },
454
- previousServiceGroupVersion: undefined,
455
- previousContainerInfo: undefined,
456
- };
457
- saveAssistantEntry(rolledBackEntry);
637
+ // Restore data from the backup created for THIS upgrade attempt.
638
+ // Only use the specific backupPath — never scan for the latest
639
+ // backup on disk, which could be from a previous upgrade cycle
640
+ // and contain stale data.
641
+ if (backupPath) {
642
+ await broadcastUpgradeEvent(
643
+ entry.runtimeUrl,
644
+ entry.assistantId,
645
+ buildProgressEvent(UPGRADE_PROGRESS.RESTORING),
646
+ );
647
+ console.log(`📦 Restoring data from pre-upgrade backup...`);
648
+ console.log(` Source: ${backupPath}`);
649
+ const restored = await restoreBackup(
650
+ entry.runtimeUrl,
651
+ entry.assistantId,
652
+ backupPath,
653
+ );
654
+ if (restored) {
655
+ console.log(" ✅ Data restored successfully\n");
656
+ } else {
657
+ console.warn(
658
+ " ⚠️ Data restore failed (rollback continues without data restoration)\n",
659
+ );
660
+ }
661
+ } else {
662
+ console.log(
663
+ "ℹ️ No pre-upgrade backup was created for this attempt, skipping data restoration\n",
664
+ );
458
665
  }
459
666
 
667
+ // Capture fresh digests from the now-running rolled-back containers.
668
+ const rollbackDigests = await captureImageRefs(res);
669
+
670
+ // Restore previous container info in lockfile after rollback.
671
+ // The *Image fields hold human-readable image:tag names from the
672
+ // pre-upgrade containerInfo; *Digest fields get fresh values from
673
+ // the running containers (or fall back to previousImageRefs).
674
+ const rolledBackEntry: AssistantEntry = {
675
+ ...entry,
676
+ containerInfo: {
677
+ assistantImage:
678
+ entry.containerInfo?.assistantImage ??
679
+ previousImageRefs.assistant,
680
+ gatewayImage:
681
+ entry.containerInfo?.gatewayImage ?? previousImageRefs.gateway,
682
+ cesImage:
683
+ entry.containerInfo?.cesImage ??
684
+ previousImageRefs["credential-executor"],
685
+ assistantDigest:
686
+ rollbackDigests?.assistant ?? previousImageRefs.assistant,
687
+ gatewayDigest:
688
+ rollbackDigests?.gateway ?? previousImageRefs.gateway,
689
+ cesDigest:
690
+ rollbackDigests?.["credential-executor"] ??
691
+ previousImageRefs["credential-executor"],
692
+ networkName: res.network,
693
+ },
694
+ previousServiceGroupVersion: undefined,
695
+ previousContainerInfo: undefined,
696
+ previousDbMigrationVersion: undefined,
697
+ previousWorkspaceMigrationId: undefined,
698
+ // Clear the backup path — the upgrade that created it just failed
699
+ preUpgradeBackupPath: undefined,
700
+ };
701
+ saveAssistantEntry(rolledBackEntry);
702
+
460
703
  // Notify clients that the upgrade failed and rolled back.
461
- await broadcastUpgradeEvent(entry.runtimeUrl, entry.assistantId, {
462
- type: "complete",
463
- installedVersion: entry.serviceGroupVersion ?? "unknown",
464
- success: false,
465
- rolledBackToVersion: entry.serviceGroupVersion,
466
- });
704
+ await broadcastUpgradeEvent(
705
+ entry.runtimeUrl,
706
+ entry.assistantId,
707
+ buildCompleteEvent(
708
+ entry.serviceGroupVersion ?? "unknown",
709
+ false,
710
+ entry.serviceGroupVersion,
711
+ ),
712
+ );
467
713
 
468
714
  console.log(
469
715
  `\n⚠️ Rolled back to previous version. Upgrade to ${versionTag} failed.`,
470
716
  );
717
+ emitCliError(
718
+ "READINESS_TIMEOUT",
719
+ `Upgrade to ${versionTag} failed: containers did not become ready. Rolled back to previous version.`,
720
+ );
471
721
  } else {
472
722
  console.error(
473
723
  `\n❌ Rollback also failed. Manual intervention required.`,
@@ -475,21 +725,51 @@ async function upgradeDocker(
475
725
  console.log(
476
726
  ` Check logs with: docker logs -f ${res.assistantContainer}`,
477
727
  );
728
+ await broadcastUpgradeEvent(
729
+ entry.runtimeUrl,
730
+ entry.assistantId,
731
+ buildCompleteEvent(entry.serviceGroupVersion ?? "unknown", false),
732
+ );
733
+ emitCliError(
734
+ "ROLLBACK_FAILED",
735
+ "Rollback also failed after readiness timeout. Manual intervention required.",
736
+ );
478
737
  }
479
738
  } catch (rollbackErr) {
480
- console.error(
481
- `\n❌ Rollback failed: ${rollbackErr instanceof Error ? rollbackErr.message : String(rollbackErr)}`,
482
- );
739
+ const rollbackDetail =
740
+ rollbackErr instanceof Error
741
+ ? rollbackErr.message
742
+ : String(rollbackErr);
743
+ console.error(`\n❌ Rollback failed: ${rollbackDetail}`);
483
744
  console.error(` Manual intervention required.`);
484
745
  console.log(
485
746
  ` Check logs with: docker logs -f ${res.assistantContainer}`,
486
747
  );
748
+ await broadcastUpgradeEvent(
749
+ entry.runtimeUrl,
750
+ entry.assistantId,
751
+ buildCompleteEvent(entry.serviceGroupVersion ?? "unknown", false),
752
+ );
753
+ emitCliError(
754
+ "ROLLBACK_FAILED",
755
+ "Auto-rollback failed after readiness timeout. Manual intervention required.",
756
+ rollbackDetail,
757
+ );
487
758
  }
488
759
  } else {
489
760
  console.log(` No previous images available for rollback.`);
490
761
  console.log(
491
762
  ` Check logs with: docker logs -f ${res.assistantContainer}`,
492
763
  );
764
+ await broadcastUpgradeEvent(
765
+ entry.runtimeUrl,
766
+ entry.assistantId,
767
+ buildCompleteEvent(entry.serviceGroupVersion ?? "unknown", false),
768
+ );
769
+ emitCliError(
770
+ "ROLLBACK_NO_STATE",
771
+ "Containers failed to become ready and no previous images available for rollback.",
772
+ );
493
773
  }
494
774
 
495
775
  process.exit(1);
@@ -505,15 +785,41 @@ async function upgradePlatform(
505
785
  entry: AssistantEntry,
506
786
  version: string | null,
507
787
  ): Promise<void> {
788
+ const workspaceDir = entry.resources
789
+ ? join(entry.resources.instanceDir, ".vellum", "workspace")
790
+ : null;
791
+
792
+ // Record version transition start in workspace git history
793
+ if (workspaceDir) {
794
+ try {
795
+ await commitWorkspaceState(
796
+ workspaceDir,
797
+ buildUpgradeCommitMessage({
798
+ action: "upgrade",
799
+ phase: "starting",
800
+ from: entry.serviceGroupVersion ?? "unknown",
801
+ to: version ?? "latest",
802
+ topology: "managed",
803
+ assistantId: entry.assistantId,
804
+ }),
805
+ );
806
+ } catch (err) {
807
+ console.warn(
808
+ `⚠️ Failed to create pre-upgrade workspace commit: ${err instanceof Error ? err.message : String(err)}`,
809
+ );
810
+ }
811
+ }
812
+
508
813
  console.log(
509
814
  `🔄 Upgrading platform-hosted assistant '${entry.assistantId}'...\n`,
510
815
  );
511
816
 
512
817
  const token = readPlatformToken();
513
818
  if (!token) {
514
- console.error(
515
- "Error: Not logged in. Run `vellum login --token <token>` first.",
516
- );
819
+ const msg =
820
+ "Error: Not logged in. Run `vellum login --token <token>` first.";
821
+ console.error(msg);
822
+ emitCliError("AUTH_FAILED", msg);
517
823
  process.exit(1);
518
824
  }
519
825
 
@@ -530,11 +836,11 @@ async function upgradePlatform(
530
836
  // Notify connected clients that an upgrade is about to begin.
531
837
  const targetVersion = version ?? `v${cliPkg.version}`;
532
838
  console.log("📢 Notifying connected clients...");
533
- await broadcastUpgradeEvent(entry.runtimeUrl, entry.assistantId, {
534
- type: "starting",
535
- targetVersion,
536
- expectedDowntimeSeconds: 90,
537
- });
839
+ await broadcastUpgradeEvent(
840
+ entry.runtimeUrl,
841
+ entry.assistantId,
842
+ buildStartingEvent(targetVersion, 90),
843
+ );
538
844
 
539
845
  const response = await fetch(url, {
540
846
  method: "POST",
@@ -551,11 +857,16 @@ async function upgradePlatform(
551
857
  console.error(
552
858
  `Error: Platform upgrade failed (${response.status}): ${text}`,
553
859
  );
554
- await broadcastUpgradeEvent(entry.runtimeUrl, entry.assistantId, {
555
- type: "complete",
556
- installedVersion: entry.serviceGroupVersion ?? "unknown",
557
- success: false,
558
- });
860
+ emitCliError(
861
+ "PLATFORM_API_ERROR",
862
+ `Platform upgrade failed (${response.status})`,
863
+ text,
864
+ );
865
+ await broadcastUpgradeEvent(
866
+ entry.runtimeUrl,
867
+ entry.assistantId,
868
+ buildCompleteEvent(entry.serviceGroupVersion ?? "unknown", false),
869
+ );
559
870
  process.exit(1);
560
871
  }
561
872
 
@@ -568,6 +879,28 @@ async function upgradePlatform(
568
879
  // version-change detection (DaemonConnection.swift) once the new
569
880
  // version actually appears after the platform restarts the service group.
570
881
 
882
+ // Record successful upgrade in workspace git history
883
+ if (workspaceDir) {
884
+ try {
885
+ await commitWorkspaceState(
886
+ workspaceDir,
887
+ buildUpgradeCommitMessage({
888
+ action: "upgrade",
889
+ phase: "complete",
890
+ from: entry.serviceGroupVersion ?? "unknown",
891
+ to: version ?? "latest",
892
+ topology: "managed",
893
+ assistantId: entry.assistantId,
894
+ result: "success",
895
+ }),
896
+ );
897
+ } catch (err) {
898
+ console.warn(
899
+ `⚠️ Failed to create post-upgrade workspace commit: ${err instanceof Error ? err.message : String(err)}`,
900
+ );
901
+ }
902
+ }
903
+
571
904
  console.log(`✅ ${result.detail}`);
572
905
  if (result.version) {
573
906
  console.log(` Version: ${result.version}`);
@@ -579,18 +912,33 @@ export async function upgrade(): Promise<void> {
579
912
  const entry = resolveTargetAssistant(name);
580
913
  const cloud = resolveCloud(entry);
581
914
 
582
- if (cloud === "docker") {
583
- await upgradeDocker(entry, version);
584
- return;
585
- }
915
+ try {
916
+ if (cloud === "docker") {
917
+ await upgradeDocker(entry, version);
918
+ return;
919
+ }
586
920
 
587
- if (cloud === "vellum") {
588
- await upgradePlatform(entry, version);
589
- return;
921
+ if (cloud === "vellum") {
922
+ await upgradePlatform(entry, version);
923
+ return;
924
+ }
925
+ } catch (err) {
926
+ const detail = err instanceof Error ? err.message : String(err);
927
+ console.error(`\n❌ Upgrade failed: ${detail}`);
928
+ // Best-effort: notify connected clients that the upgrade failed.
929
+ // A `starting` event may have been sent inside upgradeDocker/upgradePlatform
930
+ // before the error was thrown, so we must close with `complete`.
931
+ await broadcastUpgradeEvent(
932
+ entry.runtimeUrl,
933
+ entry.assistantId,
934
+ buildCompleteEvent(entry.serviceGroupVersion ?? "unknown", false),
935
+ );
936
+ emitCliError(categorizeUpgradeError(err), "Upgrade failed", detail);
937
+ process.exit(1);
590
938
  }
591
939
 
592
- console.error(
593
- `Error: Upgrade is not supported for '${cloud}' assistants. Only 'docker' and 'vellum' assistants can be upgraded via the CLI.`,
594
- );
940
+ const msg = `Error: Upgrade is not supported for '${cloud}' assistants. Only 'docker' and 'vellum' assistants can be upgraded via the CLI.`;
941
+ console.error(msg);
942
+ emitCliError("UNSUPPORTED_TOPOLOGY", msg);
595
943
  process.exit(1);
596
944
  }