@vellumai/cli 0.5.5 → 0.5.7

This diff shows the changes between two publicly available versions of this package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
@@ -1,4 +1,5 @@
1
1
  import { randomBytes } from "crypto";
2
+ import { join } from "node:path";
2
3
 
3
4
  import cliPkg from "../../package.json";
4
5
 
@@ -11,8 +12,6 @@ import {
11
12
  import type { AssistantEntry } from "../lib/assistant-config";
12
13
  import {
13
14
  captureImageRefs,
14
- DOCKERHUB_IMAGES,
15
- DOCKER_READY_TIMEOUT_MS,
16
15
  GATEWAY_INTERNAL_PORT,
17
16
  dockerResourceNames,
18
17
  migrateCesSecurityFiles,
@@ -20,13 +19,37 @@ import {
20
19
  startContainers,
21
20
  stopContainers,
22
21
  } from "../lib/docker";
23
- import type { ServiceName } from "../lib/docker";
22
+ import { resolveImageRefs } from "../lib/platform-releases";
24
23
  import {
25
24
  fetchOrganizationId,
26
25
  getPlatformUrl,
27
26
  readPlatformToken,
28
27
  } from "../lib/platform-client";
29
- import { exec, execOutput } from "../lib/step-runner";
28
+ import {
29
+ loadBootstrapSecret,
30
+ saveBootstrapSecret,
31
+ } from "../lib/guardian-token";
32
+ import {
33
+ createBackup,
34
+ pruneOldBackups,
35
+ restoreBackup,
36
+ } from "../lib/backup-ops.js";
37
+ import { emitCliError, categorizeUpgradeError } from "../lib/cli-error.js";
38
+ import { exec } from "../lib/step-runner.js";
39
+ import {
40
+ broadcastUpgradeEvent,
41
+ buildCompleteEvent,
42
+ buildProgressEvent,
43
+ buildStartingEvent,
44
+ buildUpgradeCommitMessage,
45
+ captureContainerEnv,
46
+ CONTAINER_ENV_EXCLUDE_KEYS,
47
+ rollbackMigrations,
48
+ UPGRADE_PROGRESS,
49
+ waitForReady,
50
+ } from "../lib/upgrade-lifecycle.js";
51
+ import { parseVersion } from "../lib/version-compat.js";
52
+ import { commitWorkspaceState } from "../lib/workspace-git.js";
30
53
 
31
54
  interface UpgradeArgs {
32
55
  name: string | null;
@@ -70,6 +93,7 @@ function parseArgs(): UpgradeArgs {
70
93
  const next = args[i + 1];
71
94
  if (!next || next.startsWith("-")) {
72
95
  console.error("Error: --version requires a value");
96
+ emitCliError("UNKNOWN", "--version requires a value");
73
97
  process.exit(1);
74
98
  }
75
99
  version = next;
@@ -78,6 +102,7 @@ function parseArgs(): UpgradeArgs {
78
102
  name = arg;
79
103
  } else {
80
104
  console.error(`Error: Unknown option '${arg}'.`);
105
+ emitCliError("UNKNOWN", `Unknown option '${arg}'`);
81
106
  process.exit(1);
82
107
  }
83
108
  }
@@ -109,6 +134,10 @@ function resolveTargetAssistant(nameArg: string | null): AssistantEntry {
109
134
  const entry = findAssistantByName(nameArg);
110
135
  if (!entry) {
111
136
  console.error(`No assistant found with name '${nameArg}'.`);
137
+ emitCliError(
138
+ "ASSISTANT_NOT_FOUND",
139
+ `No assistant found with name '${nameArg}'.`,
140
+ );
112
141
  process.exit(1);
113
142
  }
114
143
  return entry;
@@ -124,95 +153,32 @@ function resolveTargetAssistant(nameArg: string | null): AssistantEntry {
124
153
  if (all.length === 1) return all[0];
125
154
 
126
155
  if (all.length === 0) {
127
- console.error("No assistants found. Run 'vellum hatch' first.");
156
+ const msg = "No assistants found. Run 'vellum hatch' first.";
157
+ console.error(msg);
158
+ emitCliError("ASSISTANT_NOT_FOUND", msg);
128
159
  } else {
129
- console.error(
130
- "Multiple assistants found. Specify a name or set an active assistant with 'vellum use <name>'.",
131
- );
160
+ const msg =
161
+ "Multiple assistants found. Specify a name or set an active assistant with 'vellum use <name>'.";
162
+ console.error(msg);
163
+ emitCliError("ASSISTANT_NOT_FOUND", msg);
132
164
  }
133
165
  process.exit(1);
134
166
  }
135
167
 
136
- /**
137
- * Capture environment variables from a running Docker container so they
138
- * can be replayed onto the replacement container after upgrade.
139
- */
140
- async function captureContainerEnv(
141
- containerName: string,
142
- ): Promise<Record<string, string>> {
143
- const captured: Record<string, string> = {};
144
- try {
145
- const raw = await execOutput("docker", [
146
- "inspect",
147
- "--format",
148
- "{{json .Config.Env}}",
149
- containerName,
150
- ]);
151
- const entries = JSON.parse(raw) as string[];
152
- for (const entry of entries) {
153
- const eqIdx = entry.indexOf("=");
154
- if (eqIdx > 0) {
155
- captured[entry.slice(0, eqIdx)] = entry.slice(eqIdx + 1);
156
- }
157
- }
158
- } catch {
159
- // Container may not exist or not be inspectable
160
- }
161
- return captured;
162
- }
163
-
164
- /**
165
- * Poll the gateway `/readyz` endpoint until it returns 200 or the timeout
166
- * elapses. Returns whether the assistant became ready.
167
- */
168
- async function waitForReady(runtimeUrl: string): Promise<boolean> {
169
- const readyUrl = `${runtimeUrl}/readyz`;
170
- const start = Date.now();
171
-
172
- while (Date.now() - start < DOCKER_READY_TIMEOUT_MS) {
173
- try {
174
- const resp = await fetch(readyUrl, {
175
- signal: AbortSignal.timeout(5000),
176
- });
177
- if (resp.ok) {
178
- const elapsedSec = ((Date.now() - start) / 1000).toFixed(1);
179
- console.log(`Assistant ready after ${elapsedSec}s`);
180
- return true;
181
- }
182
- let detail = "";
183
- try {
184
- const body = await resp.text();
185
- const json = JSON.parse(body);
186
- const parts = [json.status];
187
- if (json.upstream != null) parts.push(`upstream=${json.upstream}`);
188
- detail = ` — ${parts.join(", ")}`;
189
- } catch {
190
- // ignore parse errors
191
- }
192
- console.log(`Readiness check: ${resp.status}${detail} (retrying...)`);
193
- } catch {
194
- // Connection refused / timeout — not up yet
195
- }
196
- await new Promise((r) => setTimeout(r, 1000));
197
- }
198
-
199
- return false;
200
- }
201
-
202
168
  async function upgradeDocker(
203
169
  entry: AssistantEntry,
204
170
  version: string | null,
205
171
  ): Promise<void> {
206
172
  const instanceName = entry.assistantId;
207
173
  const res = dockerResourceNames(instanceName);
174
+ const workspaceDir = entry.resources
175
+ ? join(entry.resources.instanceDir, ".vellum", "workspace")
176
+ : null;
208
177
 
209
178
  const versionTag =
210
179
  version ?? (cliPkg.version ? `v${cliPkg.version}` : "latest");
211
- const imageTags: Record<ServiceName, string> = {
212
- assistant: `${DOCKERHUB_IMAGES.assistant}:${versionTag}`,
213
- "credential-executor": `${DOCKERHUB_IMAGES["credential-executor"]}:${versionTag}`,
214
- gateway: `${DOCKERHUB_IMAGES.gateway}:${versionTag}`,
215
- };
180
+ console.log("🔍 Resolving image references...");
181
+ const { imageTags } = await resolveImageRefs(versionTag);
216
182
 
217
183
  console.log(
218
184
  `🔄 Upgrading Docker assistant '${instanceName}' to ${versionTag}...\n`,
@@ -234,22 +200,168 @@ async function upgradeDocker(
234
200
  );
235
201
  }
236
202
 
203
+ // Capture current migration state for rollback targeting.
204
+ // Must happen while daemon is still running (before containers are stopped).
205
+ let preMigrationState: {
206
+ dbVersion?: number;
207
+ lastWorkspaceMigrationId?: string;
208
+ } = {};
209
+ try {
210
+ const healthResp = await fetch(
211
+ `${entry.runtimeUrl}/healthz?include=migrations`,
212
+ {
213
+ signal: AbortSignal.timeout(5000),
214
+ },
215
+ );
216
+ if (healthResp.ok) {
217
+ const health = (await healthResp.json()) as {
218
+ migrations?: { dbVersion?: number; lastWorkspaceMigrationId?: string };
219
+ };
220
+ preMigrationState = health.migrations ?? {};
221
+ }
222
+ } catch {
223
+ // Best-effort — if we can't get migration state, rollback will skip migration reversal
224
+ }
225
+
226
+ // Detect if this upgrade is actually a downgrade (user picked an older
227
+ // version via the version picker). Used after readiness succeeds to align
228
+ // the DB schema with the now-running old daemon.
229
+ const currentVersion = entry.serviceGroupVersion;
230
+ const isDowngrade =
231
+ currentVersion &&
232
+ versionTag &&
233
+ (() => {
234
+ const current = parseVersion(currentVersion);
235
+ const target = parseVersion(versionTag);
236
+ if (!current || !target) return false;
237
+ if (target.major !== current.major) return target.major < current.major;
238
+ if (target.minor !== current.minor) return target.minor < current.minor;
239
+ return target.patch < current.patch;
240
+ })();
241
+
242
+ // For downgrades, fetch the target version's migration ceiling from the
243
+ // releases API. This tells us exactly which DB migration version and
244
+ // workspace migration the target version expects, enabling a precise
245
+ // rollback on the CURRENT (newer) daemon before swapping containers.
246
+ let targetMigrationCeiling: {
247
+ dbVersion?: number;
248
+ workspaceMigrationId?: string;
249
+ } = {};
250
+ if (isDowngrade) {
251
+ try {
252
+ const platformUrl = getPlatformUrl();
253
+ const releasesResp = await fetch(
254
+ `${platformUrl}/v1/releases/?stable=true`,
255
+ { signal: AbortSignal.timeout(10000) },
256
+ );
257
+ if (releasesResp.ok) {
258
+ const releases = (await releasesResp.json()) as Array<{
259
+ version: string;
260
+ db_migration_version?: number | null;
261
+ last_workspace_migration_id?: string;
262
+ }>;
263
+ const normalizedTag = versionTag.replace(/^v/, "");
264
+ const targetRelease = releases.find(
265
+ (r) => r.version?.replace(/^v/, "") === normalizedTag,
266
+ );
267
+ if (
268
+ targetRelease?.db_migration_version != null ||
269
+ targetRelease?.last_workspace_migration_id
270
+ ) {
271
+ targetMigrationCeiling = {
272
+ dbVersion: targetRelease.db_migration_version ?? undefined,
273
+ workspaceMigrationId:
274
+ targetRelease.last_workspace_migration_id || undefined,
275
+ };
276
+ }
277
+ }
278
+ } catch {
279
+ // Best-effort — fall back to rollbackToRegistryCeiling post-swap
280
+ }
281
+ }
282
+
283
+ // Persist rollback state to lockfile BEFORE any destructive changes.
284
+ // This enables the `vellum rollback` command to restore the previous version.
285
+ if (entry.serviceGroupVersion && entry.containerInfo) {
286
+ const rollbackEntry: AssistantEntry = {
287
+ ...entry,
288
+ previousServiceGroupVersion: entry.serviceGroupVersion,
289
+ previousContainerInfo: { ...entry.containerInfo },
290
+ previousDbMigrationVersion: preMigrationState.dbVersion,
291
+ previousWorkspaceMigrationId: preMigrationState.lastWorkspaceMigrationId,
292
+ };
293
+ saveAssistantEntry(rollbackEntry);
294
+ console.log(` Saved rollback state: ${entry.serviceGroupVersion}\n`);
295
+ }
296
+
297
+ // Record version transition start in workspace git history
298
+ if (workspaceDir) {
299
+ try {
300
+ await commitWorkspaceState(
301
+ workspaceDir,
302
+ buildUpgradeCommitMessage({
303
+ action: "upgrade",
304
+ phase: "starting",
305
+ from: entry.serviceGroupVersion ?? "unknown",
306
+ to: versionTag,
307
+ topology: "docker",
308
+ assistantId: entry.assistantId,
309
+ }),
310
+ );
311
+ } catch (err) {
312
+ console.warn(
313
+ `⚠️ Failed to create pre-upgrade workspace commit: ${err instanceof Error ? err.message : String(err)}`,
314
+ );
315
+ }
316
+ }
317
+
237
318
  console.log("💾 Capturing existing container environment...");
238
319
  const capturedEnv = await captureContainerEnv(res.assistantContainer);
239
320
  console.log(
240
321
  ` Captured ${Object.keys(capturedEnv).length} env var(s) from ${res.assistantContainer}\n`,
241
322
  );
242
323
 
324
+ // Notify connected clients that an upgrade is about to begin.
325
+ // This must fire BEFORE any progress broadcasts so the UI sets
326
+ // isUpdateInProgress = true and starts displaying status messages.
327
+ console.log("📢 Notifying connected clients...");
328
+ await broadcastUpgradeEvent(
329
+ entry.runtimeUrl,
330
+ entry.assistantId,
331
+ buildStartingEvent(versionTag),
332
+ );
333
+ // Brief pause to allow SSE delivery before progress events.
334
+ await new Promise((r) => setTimeout(r, 500));
335
+
336
+ await broadcastUpgradeEvent(
337
+ entry.runtimeUrl,
338
+ entry.assistantId,
339
+ buildProgressEvent(UPGRADE_PROGRESS.DOWNLOADING),
340
+ );
243
341
  console.log("📦 Pulling new Docker images...");
244
- await exec("docker", ["pull", imageTags.assistant]);
245
- await exec("docker", ["pull", imageTags.gateway]);
246
- await exec("docker", ["pull", imageTags["credential-executor"]]);
342
+ const pullImages: Array<[string, string]> = [
343
+ ["assistant", imageTags.assistant],
344
+ ["gateway", imageTags.gateway],
345
+ ["credential-executor", imageTags["credential-executor"]],
346
+ ];
347
+ try {
348
+ for (const [service, image] of pullImages) {
349
+ console.log(` Pulling ${service}: ${image}`);
350
+ await exec("docker", ["pull", image]);
351
+ }
352
+ } catch (pullErr) {
353
+ const detail = pullErr instanceof Error ? pullErr.message : String(pullErr);
354
+ console.error(`\n❌ Failed to pull Docker images: ${detail}`);
355
+ await broadcastUpgradeEvent(
356
+ entry.runtimeUrl,
357
+ entry.assistantId,
358
+ buildCompleteEvent(entry.serviceGroupVersion ?? "unknown", false),
359
+ );
360
+ emitCliError("IMAGE_PULL_FAILED", "Failed to pull Docker images", detail);
361
+ process.exit(1);
362
+ }
247
363
  console.log("✅ Docker images pulled\n");
248
364
 
249
- console.log("🛑 Stopping existing containers...");
250
- await stopContainers(res);
251
- console.log("✅ Containers stopped\n");
252
-
253
365
  // Parse gateway port from entry's runtimeUrl, fall back to default
254
366
  let gatewayPort = GATEWAY_INTERNAL_PORT;
255
367
  try {
@@ -269,15 +381,93 @@ async function upgradeDocker(
269
381
  const cesServiceToken =
270
382
  capturedEnv["CES_SERVICE_TOKEN"] || randomBytes(32).toString("hex");
271
383
 
384
+ // Retrieve or generate a bootstrap secret for the gateway. The secret was
385
+ // persisted to disk during hatch; older instances won't have one yet.
386
+ // This runs BEFORE stopping containers so a write failure (disk full,
387
+ // permissions) doesn't leave the assistant offline.
388
+ const loadedSecret = loadBootstrapSecret(instanceName);
389
+ const bootstrapSecret = loadedSecret || randomBytes(32).toString("hex");
390
+ if (!loadedSecret) {
391
+ saveBootstrapSecret(instanceName, bootstrapSecret);
392
+ }
393
+
394
+ // Extract or generate the shared JWT signing key. Pre-env-var instances
395
+ // won't have it in capturedEnv, so generate fresh in that case.
396
+ const signingKey =
397
+ capturedEnv["ACTOR_TOKEN_SIGNING_KEY"] || randomBytes(32).toString("hex");
398
+
399
+ // Create pre-upgrade backup (best-effort, daemon must be running)
400
+ await broadcastUpgradeEvent(
401
+ entry.runtimeUrl,
402
+ entry.assistantId,
403
+ buildProgressEvent(UPGRADE_PROGRESS.BACKING_UP),
404
+ );
405
+ console.log("📦 Creating pre-upgrade backup...");
406
+ const backupPath = await createBackup(entry.runtimeUrl, entry.assistantId, {
407
+ prefix: `${entry.assistantId}-pre-upgrade`,
408
+ description: `Pre-upgrade snapshot before ${entry.serviceGroupVersion ?? "unknown"} → ${versionTag}`,
409
+ });
410
+ if (backupPath) {
411
+ console.log(` Backup saved: ${backupPath}\n`);
412
+ // Clean up old pre-upgrade backups, keep last 3
413
+ pruneOldBackups(entry.assistantId, 3);
414
+ } else {
415
+ console.warn("⚠️ Pre-upgrade backup failed (continuing with upgrade)\n");
416
+ }
417
+
418
+ // Persist the backup path so `vellum rollback` can restore the exact backup
419
+ // created for this upgrade attempt — never a stale backup from a prior cycle.
420
+ // Re-read the entry to pick up the rollback state saved earlier.
421
+ {
422
+ const current = findAssistantByName(entry.assistantId);
423
+ if (current) {
424
+ saveAssistantEntry({
425
+ ...current,
426
+ preUpgradeBackupPath: backupPath ?? undefined,
427
+ });
428
+ }
429
+ }
430
+
431
+ await broadcastUpgradeEvent(
432
+ entry.runtimeUrl,
433
+ entry.assistantId,
434
+ buildProgressEvent(UPGRADE_PROGRESS.INSTALLING),
435
+ );
436
+
437
+ // If we have the target version's migration ceiling, run a PRECISE
438
+ // rollback on the CURRENT (newer) daemon before stopping it. The current
439
+ // daemon has the `down()` code for all migrations it applied, so it can
440
+ // cleanly revert to the target version's ceiling. This is critical for
441
+ // multi-version downgrades where the old daemon wouldn't know about
442
+ // migrations introduced after its release.
443
+ let preSwapRollbackOk = true;
444
+ if (
445
+ isDowngrade &&
446
+ (targetMigrationCeiling.dbVersion !== undefined ||
447
+ targetMigrationCeiling.workspaceMigrationId !== undefined)
448
+ ) {
449
+ console.log("🔄 Reverting database changes for downgrade...");
450
+ await broadcastUpgradeEvent(
451
+ entry.runtimeUrl,
452
+ entry.assistantId,
453
+ buildProgressEvent(UPGRADE_PROGRESS.REVERTING_MIGRATIONS),
454
+ );
455
+ preSwapRollbackOk = await rollbackMigrations(
456
+ entry.runtimeUrl,
457
+ entry.assistantId,
458
+ targetMigrationCeiling.dbVersion,
459
+ targetMigrationCeiling.workspaceMigrationId,
460
+ );
461
+ }
462
+
463
+ console.log("🛑 Stopping existing containers...");
464
+ await stopContainers(res);
465
+ console.log("✅ Containers stopped\n");
466
+
272
467
  // Build the set of extra env vars to replay on the new assistant container.
273
468
  // Captured env vars serve as the base; keys already managed by
274
469
  // serviceDockerRunArgs are excluded to avoid duplicates.
275
- const envKeysSetByRunArgs = new Set([
276
- "CES_SERVICE_TOKEN",
277
- "VELLUM_ASSISTANT_NAME",
278
- "RUNTIME_HTTP_HOST",
279
- "PATH",
280
- ]);
470
+ const envKeysSetByRunArgs = new Set(CONTAINER_ENV_EXCLUDE_KEYS);
281
471
  // Only exclude keys that serviceDockerRunArgs will actually set
282
472
  for (const envVar of ["ANTHROPIC_API_KEY", "VELLUM_PLATFORM_URL"]) {
283
473
  if (process.env[envVar]) {
@@ -300,6 +490,8 @@ async function upgradeDocker(
300
490
  console.log("🚀 Starting upgraded containers...");
301
491
  await startContainers(
302
492
  {
493
+ signingKey,
494
+ bootstrapSecret,
303
495
  cesServiceToken,
304
496
  extraAssistantEnv,
305
497
  gatewayPort,
@@ -328,9 +520,65 @@ async function upgradeDocker(
328
520
  cesDigest: newDigests?.["credential-executor"],
329
521
  networkName: res.network,
330
522
  },
523
+ previousServiceGroupVersion: entry.serviceGroupVersion,
524
+ previousContainerInfo: entry.containerInfo,
525
+ previousDbMigrationVersion: preMigrationState.dbVersion,
526
+ previousWorkspaceMigrationId: preMigrationState.lastWorkspaceMigrationId,
527
+ // Preserve the backup path so `vellum rollback` can restore it later
528
+ preUpgradeBackupPath: backupPath ?? undefined,
331
529
  };
332
530
  saveAssistantEntry(updatedEntry);
333
531
 
532
+ // After a downgrade, fall back to asking the now-running old daemon
533
+ // to roll back migrations above its own registry ceiling when either:
534
+ // (a) no release metadata was available for a precise pre-swap rollback, or
535
+ // (b) the precise pre-swap rollback failed (timeout, daemon crash, etc.).
536
+ // This is a no-op for multi-version jumps where the old daemon doesn't
537
+ // know about the newer migrations, but correct for single-step rollbacks.
538
+ if (
539
+ isDowngrade &&
540
+ (!preSwapRollbackOk ||
541
+ (targetMigrationCeiling.dbVersion === undefined &&
542
+ targetMigrationCeiling.workspaceMigrationId === undefined))
543
+ ) {
544
+ await rollbackMigrations(
545
+ entry.runtimeUrl,
546
+ entry.assistantId,
547
+ undefined,
548
+ undefined,
549
+ true,
550
+ );
551
+ }
552
+
553
+ // Notify clients on the new service group that the upgrade succeeded.
554
+ await broadcastUpgradeEvent(
555
+ entry.runtimeUrl,
556
+ entry.assistantId,
557
+ buildCompleteEvent(versionTag, true),
558
+ );
559
+
560
+ // Record successful upgrade in workspace git history
561
+ if (workspaceDir) {
562
+ try {
563
+ await commitWorkspaceState(
564
+ workspaceDir,
565
+ buildUpgradeCommitMessage({
566
+ action: "upgrade",
567
+ phase: "complete",
568
+ from: entry.serviceGroupVersion ?? "unknown",
569
+ to: versionTag,
570
+ topology: "docker",
571
+ assistantId: entry.assistantId,
572
+ result: "success",
573
+ }),
574
+ );
575
+ } catch (err) {
576
+ console.warn(
577
+ `⚠️ Failed to create post-upgrade workspace commit: ${err instanceof Error ? err.message : String(err)}`,
578
+ );
579
+ }
580
+ }
581
+
334
582
  console.log(
335
583
  `\n✅ Docker assistant '${instanceName}' upgraded to ${versionTag}.`,
336
584
  );
@@ -338,12 +586,42 @@ async function upgradeDocker(
338
586
  console.error(`\n❌ Containers failed to become ready within the timeout.`);
339
587
 
340
588
  if (previousImageRefs) {
589
+ await broadcastUpgradeEvent(
590
+ entry.runtimeUrl,
591
+ entry.assistantId,
592
+ buildProgressEvent(UPGRADE_PROGRESS.REVERTING),
593
+ );
341
594
  console.log(`\n🔄 Rolling back to previous images...`);
342
595
  try {
596
+ // Attempt to roll back migrations before swapping containers.
597
+ // The new daemon may be partially up — try best-effort.
598
+ if (
599
+ preMigrationState.dbVersion !== undefined ||
600
+ preMigrationState.lastWorkspaceMigrationId !== undefined
601
+ ) {
602
+ console.log("🔄 Reverting database changes...");
603
+ await broadcastUpgradeEvent(
604
+ entry.runtimeUrl,
605
+ entry.assistantId,
606
+ buildProgressEvent(UPGRADE_PROGRESS.REVERTING_MIGRATIONS),
607
+ );
608
+ await rollbackMigrations(
609
+ entry.runtimeUrl,
610
+ entry.assistantId,
611
+ preMigrationState.dbVersion,
612
+ preMigrationState.lastWorkspaceMigrationId,
613
+ );
614
+ }
615
+
343
616
  await stopContainers(res);
344
617
 
618
+ await migrateGatewaySecurityFiles(res, (msg) => console.log(msg));
619
+ await migrateCesSecurityFiles(res, (msg) => console.log(msg));
620
+
345
621
  await startContainers(
346
622
  {
623
+ signingKey,
624
+ bootstrapSecret,
347
625
  cesServiceToken,
348
626
  extraAssistantEnv,
349
627
  gatewayPort,
@@ -356,22 +634,90 @@ async function upgradeDocker(
356
634
 
357
635
  const rollbackReady = await waitForReady(entry.runtimeUrl);
358
636
  if (rollbackReady) {
359
- // Restore previous container info in lockfile after rollback
360
- if (previousImageRefs) {
361
- const rolledBackEntry: AssistantEntry = {
362
- ...entry,
363
- containerInfo: {
364
- assistantImage: previousImageRefs.assistant,
365
- gatewayImage: previousImageRefs.gateway,
366
- cesImage: previousImageRefs["credential-executor"],
367
- networkName: res.network,
368
- },
369
- };
370
- saveAssistantEntry(rolledBackEntry);
637
+ // Restore data from the backup created for THIS upgrade attempt.
638
+ // Only use the specific backupPath — never scan for the latest
639
+ // backup on disk, which could be from a previous upgrade cycle
640
+ // and contain stale data.
641
+ if (backupPath) {
642
+ await broadcastUpgradeEvent(
643
+ entry.runtimeUrl,
644
+ entry.assistantId,
645
+ buildProgressEvent(UPGRADE_PROGRESS.RESTORING),
646
+ );
647
+ console.log(`📦 Restoring data from pre-upgrade backup...`);
648
+ console.log(` Source: ${backupPath}`);
649
+ const restored = await restoreBackup(
650
+ entry.runtimeUrl,
651
+ entry.assistantId,
652
+ backupPath,
653
+ );
654
+ if (restored) {
655
+ console.log(" ✅ Data restored successfully\n");
656
+ } else {
657
+ console.warn(
658
+ " ⚠️ Data restore failed (rollback continues without data restoration)\n",
659
+ );
660
+ }
661
+ } else {
662
+ console.log(
663
+ "ℹ️ No pre-upgrade backup was created for this attempt, skipping data restoration\n",
664
+ );
371
665
  }
666
+
667
+ // Capture fresh digests from the now-running rolled-back containers.
668
+ const rollbackDigests = await captureImageRefs(res);
669
+
670
+ // Restore previous container info in lockfile after rollback.
671
+ // The *Image fields hold human-readable image:tag names from the
672
+ // pre-upgrade containerInfo; *Digest fields get fresh values from
673
+ // the running containers (or fall back to previousImageRefs).
674
+ const rolledBackEntry: AssistantEntry = {
675
+ ...entry,
676
+ containerInfo: {
677
+ assistantImage:
678
+ entry.containerInfo?.assistantImage ??
679
+ previousImageRefs.assistant,
680
+ gatewayImage:
681
+ entry.containerInfo?.gatewayImage ?? previousImageRefs.gateway,
682
+ cesImage:
683
+ entry.containerInfo?.cesImage ??
684
+ previousImageRefs["credential-executor"],
685
+ assistantDigest:
686
+ rollbackDigests?.assistant ?? previousImageRefs.assistant,
687
+ gatewayDigest:
688
+ rollbackDigests?.gateway ?? previousImageRefs.gateway,
689
+ cesDigest:
690
+ rollbackDigests?.["credential-executor"] ??
691
+ previousImageRefs["credential-executor"],
692
+ networkName: res.network,
693
+ },
694
+ previousServiceGroupVersion: undefined,
695
+ previousContainerInfo: undefined,
696
+ previousDbMigrationVersion: undefined,
697
+ previousWorkspaceMigrationId: undefined,
698
+ // Clear the backup path — the upgrade that created it just failed
699
+ preUpgradeBackupPath: undefined,
700
+ };
701
+ saveAssistantEntry(rolledBackEntry);
702
+
703
+ // Notify clients that the upgrade failed and rolled back.
704
+ await broadcastUpgradeEvent(
705
+ entry.runtimeUrl,
706
+ entry.assistantId,
707
+ buildCompleteEvent(
708
+ entry.serviceGroupVersion ?? "unknown",
709
+ false,
710
+ entry.serviceGroupVersion,
711
+ ),
712
+ );
713
+
372
714
  console.log(
373
715
  `\n⚠️ Rolled back to previous version. Upgrade to ${versionTag} failed.`,
374
716
  );
717
+ emitCliError(
718
+ "READINESS_TIMEOUT",
719
+ `Upgrade to ${versionTag} failed: containers did not become ready. Rolled back to previous version.`,
720
+ );
375
721
  } else {
376
722
  console.error(
377
723
  `\n❌ Rollback also failed. Manual intervention required.`,
@@ -379,21 +725,51 @@ async function upgradeDocker(
379
725
  console.log(
380
726
  ` Check logs with: docker logs -f ${res.assistantContainer}`,
381
727
  );
728
+ await broadcastUpgradeEvent(
729
+ entry.runtimeUrl,
730
+ entry.assistantId,
731
+ buildCompleteEvent(entry.serviceGroupVersion ?? "unknown", false),
732
+ );
733
+ emitCliError(
734
+ "ROLLBACK_FAILED",
735
+ "Rollback also failed after readiness timeout. Manual intervention required.",
736
+ );
382
737
  }
383
738
  } catch (rollbackErr) {
384
- console.error(
385
- `\n❌ Rollback failed: ${rollbackErr instanceof Error ? rollbackErr.message : String(rollbackErr)}`,
386
- );
739
+ const rollbackDetail =
740
+ rollbackErr instanceof Error
741
+ ? rollbackErr.message
742
+ : String(rollbackErr);
743
+ console.error(`\n❌ Rollback failed: ${rollbackDetail}`);
387
744
  console.error(` Manual intervention required.`);
388
745
  console.log(
389
746
  ` Check logs with: docker logs -f ${res.assistantContainer}`,
390
747
  );
748
+ await broadcastUpgradeEvent(
749
+ entry.runtimeUrl,
750
+ entry.assistantId,
751
+ buildCompleteEvent(entry.serviceGroupVersion ?? "unknown", false),
752
+ );
753
+ emitCliError(
754
+ "ROLLBACK_FAILED",
755
+ "Auto-rollback failed after readiness timeout. Manual intervention required.",
756
+ rollbackDetail,
757
+ );
391
758
  }
392
759
  } else {
393
760
  console.log(` No previous images available for rollback.`);
394
761
  console.log(
395
762
  ` Check logs with: docker logs -f ${res.assistantContainer}`,
396
763
  );
764
+ await broadcastUpgradeEvent(
765
+ entry.runtimeUrl,
766
+ entry.assistantId,
767
+ buildCompleteEvent(entry.serviceGroupVersion ?? "unknown", false),
768
+ );
769
+ emitCliError(
770
+ "ROLLBACK_NO_STATE",
771
+ "Containers failed to become ready and no previous images available for rollback.",
772
+ );
397
773
  }
398
774
 
399
775
  process.exit(1);
@@ -409,15 +785,41 @@ async function upgradePlatform(
409
785
  entry: AssistantEntry,
410
786
  version: string | null,
411
787
  ): Promise<void> {
788
+ const workspaceDir = entry.resources
789
+ ? join(entry.resources.instanceDir, ".vellum", "workspace")
790
+ : null;
791
+
792
+ // Record version transition start in workspace git history
793
+ if (workspaceDir) {
794
+ try {
795
+ await commitWorkspaceState(
796
+ workspaceDir,
797
+ buildUpgradeCommitMessage({
798
+ action: "upgrade",
799
+ phase: "starting",
800
+ from: entry.serviceGroupVersion ?? "unknown",
801
+ to: version ?? "latest",
802
+ topology: "managed",
803
+ assistantId: entry.assistantId,
804
+ }),
805
+ );
806
+ } catch (err) {
807
+ console.warn(
808
+ `⚠️ Failed to create pre-upgrade workspace commit: ${err instanceof Error ? err.message : String(err)}`,
809
+ );
810
+ }
811
+ }
812
+
412
813
  console.log(
413
814
  `🔄 Upgrading platform-hosted assistant '${entry.assistantId}'...\n`,
414
815
  );
415
816
 
416
817
  const token = readPlatformToken();
417
818
  if (!token) {
418
- console.error(
419
- "Error: Not logged in. Run `vellum login --token <token>` first.",
420
- );
819
+ const msg =
820
+ "Error: Not logged in. Run `vellum login --token <token>` first.";
821
+ console.error(msg);
822
+ emitCliError("AUTH_FAILED", msg);
421
823
  process.exit(1);
422
824
  }
423
825
 
@@ -431,6 +833,15 @@ async function upgradePlatform(
431
833
  body.version = version;
432
834
  }
433
835
 
836
+ // Notify connected clients that an upgrade is about to begin.
837
+ const targetVersion = version ?? `v${cliPkg.version}`;
838
+ console.log("📢 Notifying connected clients...");
839
+ await broadcastUpgradeEvent(
840
+ entry.runtimeUrl,
841
+ entry.assistantId,
842
+ buildStartingEvent(targetVersion, 90),
843
+ );
844
+
434
845
  const response = await fetch(url, {
435
846
  method: "POST",
436
847
  headers: {
@@ -446,10 +857,50 @@ async function upgradePlatform(
446
857
  console.error(
447
858
  `Error: Platform upgrade failed (${response.status}): ${text}`,
448
859
  );
860
+ emitCliError(
861
+ "PLATFORM_API_ERROR",
862
+ `Platform upgrade failed (${response.status})`,
863
+ text,
864
+ );
865
+ await broadcastUpgradeEvent(
866
+ entry.runtimeUrl,
867
+ entry.assistantId,
868
+ buildCompleteEvent(entry.serviceGroupVersion ?? "unknown", false),
869
+ );
449
870
  process.exit(1);
450
871
  }
451
872
 
452
873
  const result = (await response.json()) as UpgradeApiResponse;
874
+
875
+ // NOTE: We intentionally do NOT broadcast a "complete" event here.
876
+ // The platform API returning 200 only means "upgrade request accepted" —
877
+ // the service group has not yet restarted with the new version. The
878
+ // completion signal will come from the client's health-check
879
+ // version-change detection (DaemonConnection.swift) once the new
880
+ // version actually appears after the platform restarts the service group.
881
+
882
+ // Record successful upgrade in workspace git history
883
+ if (workspaceDir) {
884
+ try {
885
+ await commitWorkspaceState(
886
+ workspaceDir,
887
+ buildUpgradeCommitMessage({
888
+ action: "upgrade",
889
+ phase: "complete",
890
+ from: entry.serviceGroupVersion ?? "unknown",
891
+ to: version ?? "latest",
892
+ topology: "managed",
893
+ assistantId: entry.assistantId,
894
+ result: "success",
895
+ }),
896
+ );
897
+ } catch (err) {
898
+ console.warn(
899
+ `⚠️ Failed to create post-upgrade workspace commit: ${err instanceof Error ? err.message : String(err)}`,
900
+ );
901
+ }
902
+ }
903
+
453
904
  console.log(`✅ ${result.detail}`);
454
905
  if (result.version) {
455
906
  console.log(` Version: ${result.version}`);
@@ -461,18 +912,33 @@ export async function upgrade(): Promise<void> {
461
912
  const entry = resolveTargetAssistant(name);
462
913
  const cloud = resolveCloud(entry);
463
914
 
464
- if (cloud === "docker") {
465
- await upgradeDocker(entry, version);
466
- return;
467
- }
915
+ try {
916
+ if (cloud === "docker") {
917
+ await upgradeDocker(entry, version);
918
+ return;
919
+ }
468
920
 
469
- if (cloud === "vellum") {
470
- await upgradePlatform(entry, version);
471
- return;
921
+ if (cloud === "vellum") {
922
+ await upgradePlatform(entry, version);
923
+ return;
924
+ }
925
+ } catch (err) {
926
+ const detail = err instanceof Error ? err.message : String(err);
927
+ console.error(`\n❌ Upgrade failed: ${detail}`);
928
+ // Best-effort: notify connected clients that the upgrade failed.
929
+ // A `starting` event may have been sent inside upgradeDocker/upgradePlatform
930
+ // before the error was thrown, so we must close with `complete`.
931
+ await broadcastUpgradeEvent(
932
+ entry.runtimeUrl,
933
+ entry.assistantId,
934
+ buildCompleteEvent(entry.serviceGroupVersion ?? "unknown", false),
935
+ );
936
+ emitCliError(categorizeUpgradeError(err), "Upgrade failed", detail);
937
+ process.exit(1);
472
938
  }
473
939
 
474
- console.error(
475
- `Error: Upgrade is not supported for '${cloud}' assistants. Only 'docker' and 'vellum' assistants can be upgraded via the CLI.`,
476
- );
940
+ const msg = `Error: Upgrade is not supported for '${cloud}' assistants. Only 'docker' and 'vellum' assistants can be upgraded via the CLI.`;
941
+ console.error(msg);
942
+ emitCliError("UNSUPPORTED_TOPOLOGY", msg);
477
943
  process.exit(1);
478
944
  }