@pleri/olam-cli 0.1.12 → 0.1.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. package/dist/__tests__/image-presence.test.d.ts +2 -0
  2. package/dist/__tests__/image-presence.test.d.ts.map +1 -0
  3. package/dist/__tests__/image-presence.test.js +44 -0
  4. package/dist/__tests__/image-presence.test.js.map +1 -0
  5. package/dist/__tests__/protocol-version.test.d.ts +2 -0
  6. package/dist/__tests__/protocol-version.test.d.ts.map +1 -0
  7. package/dist/__tests__/protocol-version.test.js +170 -0
  8. package/dist/__tests__/protocol-version.test.js.map +1 -0
  9. package/dist/__tests__/registry-allowlist.test.d.ts +2 -0
  10. package/dist/__tests__/registry-allowlist.test.d.ts.map +1 -0
  11. package/dist/__tests__/registry-allowlist.test.js +129 -0
  12. package/dist/__tests__/registry-allowlist.test.js.map +1 -0
  13. package/dist/commands/__tests__/upgrade.all-three.test.d.ts +19 -0
  14. package/dist/commands/__tests__/upgrade.all-three.test.d.ts.map +1 -0
  15. package/dist/commands/__tests__/upgrade.all-three.test.js +92 -0
  16. package/dist/commands/__tests__/upgrade.all-three.test.js.map +1 -0
  17. package/dist/commands/__tests__/upgrade.history.test.d.ts +15 -0
  18. package/dist/commands/__tests__/upgrade.history.test.d.ts.map +1 -0
  19. package/dist/commands/__tests__/upgrade.history.test.js +199 -0
  20. package/dist/commands/__tests__/upgrade.history.test.js.map +1 -0
  21. package/dist/commands/__tests__/upgrade.lock.test.d.ts +15 -0
  22. package/dist/commands/__tests__/upgrade.lock.test.d.ts.map +1 -0
  23. package/dist/commands/__tests__/upgrade.lock.test.js +253 -0
  24. package/dist/commands/__tests__/upgrade.lock.test.js.map +1 -0
  25. package/dist/commands/__tests__/upgrade.olam-tag.test.d.ts +21 -0
  26. package/dist/commands/__tests__/upgrade.olam-tag.test.d.ts.map +1 -0
  27. package/dist/commands/__tests__/upgrade.olam-tag.test.js +127 -0
  28. package/dist/commands/__tests__/upgrade.olam-tag.test.js.map +1 -0
  29. package/dist/commands/__tests__/upgrade.poll.test.d.ts +14 -0
  30. package/dist/commands/__tests__/upgrade.poll.test.d.ts.map +1 -0
  31. package/dist/commands/__tests__/upgrade.poll.test.js +136 -0
  32. package/dist/commands/__tests__/upgrade.poll.test.js.map +1 -0
  33. package/dist/commands/__tests__/upgrade.recreate.test.d.ts +17 -0
  34. package/dist/commands/__tests__/upgrade.recreate.test.d.ts.map +1 -0
  35. package/dist/commands/__tests__/upgrade.recreate.test.js +95 -0
  36. package/dist/commands/__tests__/upgrade.recreate.test.js.map +1 -0
  37. package/dist/commands/__tests__/upgrade.rollback.test.d.ts +12 -0
  38. package/dist/commands/__tests__/upgrade.rollback.test.d.ts.map +1 -0
  39. package/dist/commands/__tests__/upgrade.rollback.test.js +275 -0
  40. package/dist/commands/__tests__/upgrade.rollback.test.js.map +1 -0
  41. package/dist/commands/__tests__/upgrade.sha-capture.test.d.ts +12 -0
  42. package/dist/commands/__tests__/upgrade.sha-capture.test.d.ts.map +1 -0
  43. package/dist/commands/__tests__/upgrade.sha-capture.test.js +63 -0
  44. package/dist/commands/__tests__/upgrade.sha-capture.test.js.map +1 -0
  45. package/dist/commands/__tests__/upgrade.smoke.test.d.ts +19 -0
  46. package/dist/commands/__tests__/upgrade.smoke.test.d.ts.map +1 -0
  47. package/dist/commands/__tests__/upgrade.smoke.test.js +101 -0
  48. package/dist/commands/__tests__/upgrade.smoke.test.js.map +1 -0
  49. package/dist/commands/__tests__/upgrade.swap.test.d.ts +19 -0
  50. package/dist/commands/__tests__/upgrade.swap.test.d.ts.map +1 -0
  51. package/dist/commands/__tests__/upgrade.swap.test.js +333 -0
  52. package/dist/commands/__tests__/upgrade.swap.test.js.map +1 -0
  53. package/dist/commands/create.d.ts.map +1 -1
  54. package/dist/commands/create.js +31 -0
  55. package/dist/commands/create.js.map +1 -1
  56. package/dist/commands/upgrade-history.d.ts +17 -0
  57. package/dist/commands/upgrade-history.d.ts.map +1 -0
  58. package/dist/commands/upgrade-history.js +40 -0
  59. package/dist/commands/upgrade-history.js.map +1 -0
  60. package/dist/commands/upgrade-lock.d.ts +102 -0
  61. package/dist/commands/upgrade-lock.d.ts.map +1 -0
  62. package/dist/commands/upgrade-lock.js +225 -0
  63. package/dist/commands/upgrade-lock.js.map +1 -0
  64. package/dist/commands/upgrade-log.d.ts +86 -0
  65. package/dist/commands/upgrade-log.d.ts.map +1 -0
  66. package/dist/commands/upgrade-log.js +146 -0
  67. package/dist/commands/upgrade-log.js.map +1 -0
  68. package/dist/commands/upgrade.d.ts +265 -0
  69. package/dist/commands/upgrade.d.ts.map +1 -1
  70. package/dist/commands/upgrade.js +840 -10
  71. package/dist/commands/upgrade.js.map +1 -1
  72. package/dist/image-presence.d.ts +40 -0
  73. package/dist/image-presence.d.ts.map +1 -0
  74. package/dist/image-presence.js +39 -0
  75. package/dist/image-presence.js.map +1 -0
  76. package/dist/index.js +1015 -163
  77. package/dist/protocol-version.d.ts +79 -0
  78. package/dist/protocol-version.d.ts.map +1 -0
  79. package/dist/protocol-version.js +133 -0
  80. package/dist/protocol-version.js.map +1 -0
  81. package/dist/registry-allowlist.d.ts +47 -0
  82. package/dist/registry-allowlist.d.ts.map +1 -0
  83. package/dist/registry-allowlist.js +67 -0
  84. package/dist/registry-allowlist.js.map +1 -0
  85. package/package.json +1 -1
@@ -16,6 +16,11 @@ import { spawnSync } from 'node:child_process';
  import pc from 'picocolors';
  import { printError, printSuccess, printInfo, printWarning, printHeader } from '../output.js';
  import { buildComposeEnv, readAuthSecret, runCompose } from './host-cp.js';
+ import { acquireLock, releaseLock, formatRefusalMessage, LOCK_FILE_PATH } from './upgrade-lock.js';
+ import { appendUpgradeLog } from './upgrade-log.js';
+ import { handleHistory, parseHistoryOpts } from './upgrade-history.js';
+ import { AuthContainerController } from '@olam/core/src/auth/index.js';
+ const AUTH_HEALTH_URL = 'http://127.0.0.1:9999/health';
  /**
  * Check whether node_modules is in sync with package-lock.json.
  *
@@ -71,11 +76,23 @@ export function validateRepoRoot(cwd) {
  }
  /** Normalise raw Commander option object into typed opts. */
  export function parseUpgradeOpts(raw) {
+ const rawN = raw.n;
+ const historyN = typeof rawN === 'number'
+ ? rawN
+ : typeof rawN === 'string'
+ ? Number.parseInt(rawN, 10)
+ : 10;
  return {
  yes: raw.yes === true,
  skipImage: raw.skipImage === true,
  skipInstall: raw.skipInstall === true,
  branch: raw.branch ?? null,
+ rollback: raw.rollback === true,
+ force: raw.force === true,
+ noCache: raw.noCache === true,
+ history: raw.history === true,
+ historyN: Number.isFinite(historyN) && historyN > 0 ? historyN : 10,
+ historyJson: raw.json === true,
  };
  }
  /**
@@ -122,6 +139,321 @@ function hasGitUpstream(cwd) {
  });
  return result.status === 0;
  }
+ /**
+ * Capture HEAD SHA via `git rev-parse HEAD`. Returns null on failure.
+ *
+ * Phase 2a — A2: must be invoked AFTER `git pull --ff-only` so the captured
+ * SHA reflects the state we're upgrading TO (not the pre-pull state). The
+ * pull's whole purpose is to advance HEAD; capturing before would refuse the
+ * CLI's own pull as drift at A6's swap-boundary check.
+ *
+ * The returned SHA is sticky for the rest of the run (no per-step re-reads);
+ * A6 / B4 re-read once at the swap boundary to detect operator-driven mid-flight
+ * `git checkout` / `git reset` that happen DURING the build window.
+ */
+ export function captureHeadSha(cwd) {
+ const result = spawnSync('git', ['rev-parse', 'HEAD'], {
+ encoding: 'utf-8',
+ stdio: ['ignore', 'pipe', 'pipe'],
+ cwd,
+ });
+ if (result.status !== 0)
+ return null;
+ const sha = (result.stdout ?? '').trim();
+ // git rev-parse HEAD returns 40-char lowercase hex; defensive validation.
+ if (!/^[0-9a-f]{40}$/.test(sha))
+ return null;
+ return sha;
+ }
+ /** Abbreviate a 40-char SHA to 8 chars for human-readable output. */
+ export function abbreviateSha(sha) {
+ return sha.slice(0, 8);
+ }
+ /**
+ * Check whether a docker image tag exists locally (Phase 2b — B1).
+ *
+ * Uses `docker image inspect` which exits 0 only when ALL specified
+ * images exist locally. Single-image variant for the rollback pre-flight.
+ */
+ export function imageExists(tag) {
+ try {
+ const result = spawnSync('docker', ['image', 'inspect', '--format', '{{.Id}}', tag], {
+ encoding: 'utf-8',
+ stdio: ['ignore', 'pipe', 'ignore'],
+ });
+ return result.status === 0;
+ }
+ catch {
+ return false;
+ }
+ }
+ /**
+ * Pre-flight check for `olam upgrade --rollback` (Phase 2b — B1).
+ *
+ * Verifies that all three `:olam-rollback` tags exist. Returns an error
+ * message naming the missing image(s) when any are absent — typically
+ * the first-upgrade case where no prior canonical existed for one or
+ * more components, leaving the rollback set incoherent (see audit A6-001).
+ *
+ * Returns null when all three are present (rollback is safe to proceed).
+ */
+ export function checkRollbackSetExists(plan) {
+ const missing = plan.filter((p) => !imageExists(p.rollback)).map((p) => p.rollback);
+ if (missing.length === 0)
+ return null;
+ return missing.join(', ');
+ }
+ /**
+ * Run docker create + docker inspect for a single image.
+ *
+ * Returns ok=true when:
+ * - `docker create <image>` exits 0 (image manifest valid, layers downloadable).
+ * - `docker inspect <image> --format '{{.Config.Labels.olam_build_sha}}'`
+ * returns the expected `targetSha`.
+ *
+ * The container created by `docker create` is removed via `docker rm` even on
+ * failure paths (best-effort cleanup; orphans are harmless and pruned by the
+ * daemon's GC eventually).
+ */
+ export function smokeImage(image, targetSha) {
+ // 1. docker create — allocates the container; doesn't start the entrypoint.
+ const createResult = spawnSync('docker', ['create', '--name', `olam-smoke-${Date.now()}`, image], {
+ encoding: 'utf-8',
+ stdio: ['ignore', 'pipe', 'pipe'],
+ });
+ if (createResult.status !== 0) {
+ return {
+ image,
+ ok: false,
+ bakedSha: null,
+ error: `docker create failed: ${(createResult.stderr ?? '').trim()}`,
+ };
+ }
+ const containerId = (createResult.stdout ?? '').trim();
+ // 2. docker inspect — read the OLAM_BUILD_SHA label.
+ const inspectResult = spawnSync('docker', ['inspect', '--format', '{{index .Config.Labels "olam_build_sha"}}', image], {
+ encoding: 'utf-8',
+ stdio: ['ignore', 'pipe', 'pipe'],
+ });
+ // 3. Cleanup — best-effort; ignore exit code.
+ if (containerId.length > 0) {
+ spawnSync('docker', ['rm', '-f', containerId], {
+ encoding: 'utf-8',
+ stdio: ['ignore', 'ignore', 'ignore'],
+ });
+ }
+ if (inspectResult.status !== 0) {
+ return {
+ image,
+ ok: false,
+ bakedSha: null,
+ error: `docker inspect failed: ${(inspectResult.stderr ?? '').trim()}`,
+ };
+ }
+ const bakedSha = (inspectResult.stdout ?? '').trim();
+ // Empty output means the label is absent — that's a build-corrupt signal.
+ if (bakedSha.length === 0) {
+ return {
+ image,
+ ok: false,
+ bakedSha: null,
+ error: 'olam_build_sha label is missing or empty',
+ };
+ }
+ // Allow either full 40-char SHA or "unknown" (build-host-cp.sh writes
+ // "unknown" when git rev-parse fails). Match against targetSha for
+ // success.
+ if (bakedSha !== targetSha) {
+ return {
+ image,
+ ok: false,
+ bakedSha,
+ error: `baked SHA ${abbreviateSha(bakedSha)} ≠ target SHA ${abbreviateSha(targetSha)}`,
+ };
+ }
+ return { image, ok: true, bakedSha };
+ }
+ export const PRODUCTION_SWAP_PLAN = [
+ { transient: 'olam-auth:olam-next', canonical: 'olam-auth:local', rollback: 'olam-auth:olam-rollback' },
+ { transient: 'olam-devbox:olam-next', canonical: 'olam-devbox:latest', rollback: 'olam-devbox:olam-rollback' },
+ { transient: 'olam-host-cp:olam-next', canonical: 'olam-host-cp:latest', rollback: 'olam-host-cp:olam-rollback' },
+ ];
+ /**
+ * Run `docker tag <source> <dest>`. Returns ok=false with stderr trimmed
+ * on failure (e.g. source image absent). No retry — caller decides.
+ *
+ * Per audit A6-003: spawnSync may throw synchronously under fork pressure
+ * (libuv clone(2) failures). The try/catch ensures performAtomicSwap can
+ * always proceed to its summary phase — a thrown exception escaping
+ * dockerTag would leak the upgrade lock and produce no SwapResult, which
+ * confuses both the operator AND Phase 2b's --rollback recovery path.
+ */
+ export function dockerTag(source, dest) {
+ try {
+ const result = spawnSync('docker', ['tag', source, dest], {
+ encoding: 'utf-8',
+ stdio: ['ignore', 'ignore', 'pipe'],
+ });
+ if (result.status === 0 && result.error === undefined)
+ return { ok: true };
+ return {
+ ok: false,
+ error: (result.stderr ?? '').trim() || result.error?.message || 'docker tag failed',
+ };
+ }
+ catch (err) {
+ return {
+ ok: false,
+ error: err instanceof Error ? `spawnSync threw: ${err.message}` : 'spawnSync threw',
+ };
+ }
+ }
+ /**
+ * Atomic-ish 3-image set swap.
+ *
+ * Six sequential `docker tag` ops in two phases:
+ *
+ * Phase 1 (rollback-save):
+ * 1. canonical → :olam-rollback (image 1)
+ * 2. canonical → :olam-rollback (image 2)
+ * 3. canonical → :olam-rollback (image 3)
+ *
+ * Phase 2 (canonical-advance):
+ * 4. :olam-next → canonical (image 1)
+ * 5. :olam-next → canonical (image 2)
+ * 6. :olam-next → canonical (image 3)
+ *
+ * Invariants:
+ *
+ * - **First-upgrade tolerance**: any of steps 1-3 may fail with "no such
+ * image" if the operator has never had a canonical-tagged image. Those
+ * failures are NON-FATAL (recorded in rollbackError but not aborted) —
+ * `:olam-rollback` simply doesn't exist for that image; Phase 2b's
+ * `--rollback` pre-flight detects this and refuses.
+ *
+ * - **Canonical-advance fatality**: any failure in steps 4-6 is fatal.
+ * The swap is partially advanced; canonical tags are now mixed (some
+ * at SHA-Y, some at SHA-X). Operator runs `olam upgrade --rollback`
+ * (Phase 2b) which uses the FULL `:olam-rollback` set written in Phase 1
+ * to restore coherent prior state.
+ *
+ * - **SIGKILL recovery**: if killed during Phase 1, partial `:olam-rollback`
+ * exists but canonical is intact — operator's next `olam upgrade` succeeds
+ * normally (the partial `:olam-rollback` is overwritten by the next
+ * successful run). If killed during Phase 2, canonical is mixed —
+ * operator must `olam upgrade --rollback` to recover.
+ *
+ * The "atomic-ish" qualifier: `docker tag` is per-image atomic (POSIX rename
+ * of a symbolic name), but the SET of 3 canonical tags is updated sequentially
+ * across ~1s wall-clock. Sub-second window is acceptable for solo-dev/dogfood
+ * per the plan's local-dev/dogfood priority axis.
+ */
+ export function performAtomicSwap(plan) {
+ const steps = plan.map((p) => ({
+ image: p.canonical,
+ rollbackSaved: false,
+ canonicalAdvanced: false,
+ }));
+ // Phase 1: preserve previous-good as :olam-rollback (steps 1-3).
+ // Non-fatal failures: missing canonical (first-upgrade) is acceptable.
+ for (let i = 0; i < plan.length; i++) {
+ const p = plan[i];
+ const r = dockerTag(p.canonical, p.rollback);
+ steps[i] = {
+ ...steps[i],
+ rollbackSaved: r.ok,
+ ...(r.error !== undefined && { rollbackError: r.error }),
+ };
+ }
+ // Phase 2: advance canonical to :olam-next (steps 4-6).
+ // FATAL failure: canonical is mixed. Recovery via `olam upgrade --rollback`.
+ let advanceFailed = false;
+ let firstFailureIdx = -1;
+ for (let i = 0; i < plan.length; i++) {
+ const p = plan[i];
+ if (advanceFailed) {
+ // Skip remaining advances after first failure — leaves canonical mixed
+ // but caller's recovery path is `olam upgrade --rollback`, not
+ // continue-and-hope.
+ steps[i] = { ...steps[i], canonicalAdvanced: false };
+ continue;
+ }
+ const r = dockerTag(p.transient, p.canonical);
+ steps[i] = {
+ ...steps[i],
+ canonicalAdvanced: r.ok,
+ ...(r.error !== undefined && { canonicalError: r.error }),
+ };
+ if (!r.ok) {
+ advanceFailed = true;
+ firstFailureIdx = i;
+ }
+ }
+ const allAdvanced = steps.every((s) => s.canonicalAdvanced);
+ const noneAdvanced = steps.every((s) => !s.canonicalAdvanced);
+ const partialAdvance = !allAdvanced && !noneAdvanced;
+ const rollbackCoherent = steps.every((s) => s.rollbackSaved);
+ let summary;
+ if (allAdvanced) {
+ const rollbacks = steps.filter((s) => s.rollbackSaved).length;
+ summary = `Swapped ${plan.length} canonical tags; ${rollbacks} :olam-rollback preserved`;
+ }
+ else if (partialAdvance) {
+ const advanced = steps.filter((s) => s.canonicalAdvanced).length;
+ const failedStep = steps[firstFailureIdx];
+ // Audit A6-001: only recommend --rollback when the rollback set is COHERENT.
+ // Otherwise the operator would either partially restore or hit Phase 2b's
+ // pre-flight refusal — misleading either way.
+ const recoveryHint = rollbackCoherent
+ ? `Run \`olam upgrade --rollback\` to restore coherent prior state.`
+ : `Rollback set INCOHERENT (${steps.filter((s) => s.rollbackSaved).length} of ${plan.length} :olam-rollback tags written). Manual recovery required: inspect images and re-tag canonical from a known-good source.`;
+ summary = `PARTIAL: ${advanced} of ${plan.length} canonical tags advanced before failure on ${failedStep?.image}: ${failedStep?.canonicalError}. ${recoveryHint}`;
+ }
+ else {
+ const failedStep = steps[firstFailureIdx];
+ summary = `Failed on first canonical-advance (${failedStep?.image}): ${failedStep?.canonicalError}. Canonical tags untouched.`;
+ }
+ return {
+ ok: allAdvanced,
+ steps,
+ partialAdvance,
+ rollbackCoherent,
+ summary,
+ };
+ }
+ /**
+ * Inverse of performAtomicSwap — restore canonical from :olam-rollback
+ * (Phase 2b — B1). Three sequential `docker tag` ops:
+ *
+ * docker tag olam-auth:olam-rollback olam-auth:local
+ * docker tag olam-devbox:olam-rollback olam-devbox:latest
+ * docker tag olam-host-cp:olam-rollback olam-host-cp:latest
+ *
+ * No two-phase ceremony — the source `:olam-rollback` set is already a
+ * coherent prior-good captured by a previous successful `olam upgrade`,
+ * so we don't need to preserve current canonical (it's known-broken,
+ * which is why we're rolling back).
+ *
+ * Caller MUST pre-flight via `checkRollbackSetExists()` before invoking.
+ * Behavior on missing source is per-image fatal (returns ok=false +
+ * error naming the missing image).
+ */
+ export function performRollbackSwap(plan) {
+ const results = [];
+ for (const p of plan) {
+ const r = dockerTag(p.rollback, p.canonical);
+ results.push({
+ image: p.canonical,
+ ok: r.ok,
+ ...(r.error !== undefined && { error: r.error }),
+ });
+ }
+ const allOk = results.every((r) => r.ok);
+ const summary = allOk
+ ? `Rolled back ${plan.length} canonical tags from :olam-rollback`
+ : `PARTIAL rollback: ${results.filter((r) => r.ok).length} of ${plan.length} succeeded; failed: ${results.filter((r) => !r.ok).map((r) => r.image).join(', ')}`;
+ return { ok: allOk, results, summary };
+ }
  async function confirm(message) {
  if (!process.stdin.isTTY)
  return true;
@@ -151,6 +483,144 @@ async function waitForHealth(timeoutMs = 10_000) {
  }
  return false;
  }
+ /**
+ * Poll /api/version/status until all three component `.running` SHAs match
+ * `targetSha`, or until `timeoutMs` elapses. Returns the final snapshot.
+ *
+ * Phase 2a — A8: this is the success criterion for the entire upgrade.
+ * After A6's atomic swap + A7's recreate, the new images should report
+ * the new SHA via OLAM_BUILD_SHA baked at build time. Round-trip through
+ * Phase 1's detection path closes the loop.
+ *
+ * Returns:
+ * - { matched: true, snapshot } when all three SHAs equal targetSha within timeout.
+ * - { matched: false, snapshot } when timeout expires; caller decides
+ * whether to warn (recreate succeeded but propagation slow) or error.
+ * - { matched: false, snapshot: null } when /api/version/status is
+ * unreachable for the entire timeout (host-cp didn't come back up).
+ */
+ export async function waitForVersionMatch(targetSha, timeoutMs = 60_000, pollIntervalMs = 1_000) {
+ const deadline = Date.now() + timeoutMs;
+ let lastSnapshot = null;
+ while (Date.now() < deadline) {
+ try {
+ const res = await fetch('http://127.0.0.1:19000/api/version/status', {
+ signal: AbortSignal.timeout(2_000),
+ });
+ if (res.ok) {
+ const snapshot = (await res.json());
+ lastSnapshot = snapshot;
+ if (snapshot.hostCp?.running === targetSha &&
+ snapshot.authService?.running === targetSha &&
+ snapshot.devbox?.running === targetSha) {
+ return { matched: true, snapshot };
+ }
+ }
+ }
+ catch {
+ // host-cp not yet ready or transient network blip
+ }
+ await new Promise((r) => setTimeout(r, pollIntervalMs));
+ }
+ return { matched: false, snapshot: lastSnapshot };
+ }
+ /**
+ * Format a version-snapshot mismatch into a readable per-component diff
+ * (Phase 2a — A8). Used in the timeout-warn path so the operator sees
+ * which component is lagging.
+ */
+ export function formatVersionMismatch(targetSha, snapshot) {
+ if (!snapshot)
+ return 'No /api/version/status response received within timeout.';
+ const lines = [];
+ for (const [name, comp] of [
+ ['host-cp', snapshot.hostCp],
+ ['auth-service', snapshot.authService],
+ ['devbox', snapshot.devbox],
+ ]) {
+ const match = comp?.running === targetSha;
+ lines.push(` ${match ? '✓' : '✗'} ${name}: running=${abbreviateSha(comp?.running ?? 'unknown')} target=${abbreviateSha(targetSha)}`);
+ }
+ return lines.join('\n');
+ }
+ /**
+ * Block until auth-service /health responds or timeout expires (Phase 2a — A7).
+ *
+ * Mirrors auth-upgrade.ts's waitForAuthHealth — kept inline to avoid a
+ * circular dep. When auth-upgrade.ts is refactored later (Phase G+),
+ * extract a shared helper.
+ */
+ async function waitForAuthHealthLocal(timeoutMs = 15_000) {
+ const deadline = Date.now() + timeoutMs;
+ while (Date.now() < deadline) {
+ try {
+ const res = await fetch(AUTH_HEALTH_URL, { signal: AbortSignal.timeout(2000) });
+ if (res.ok)
+ return true;
+ }
+ catch {
+ // not up yet
+ }
+ await new Promise((r) => setTimeout(r, 500));
+ }
+ return false;
+ }
+ /**
+ * Recreate the auth-service container against the freshly-tagged
+ * `olam-auth:local` image (Phase 2a — A7).
+ *
+ * Mirrors auth-upgrade.ts:237-275: docker stop → docker rm →
+ * AuthContainerController.start(). Auth-service is NOT in compose.yaml
+ * (it runs via the controller's docker run with secret injection), so
+ * we cannot reuse `docker compose --force-recreate auth-service` —
+ * compose would fail with "no such service: auth-service" (verified
+ * during pass-2 review F2 audit).
+ *
+ * Errors:
+ * - docker stop / docker rm errors are swallowed (container may not
+ * be running or may not exist; both are recoverable states).
+ * - AuthContainerController.start() throws on real failures (image
+ * missing, port conflict, secret missing); caller catches and
+ * reports.
+ *
+ * Returns ok=true on successful recreate + /health response within 15s.
+ */
+ async function recreateAuthService() {
+ const start = Date.now();
+ try {
+ // Step 1: stop + remove. Errors swallowed — container may be absent/stopped.
+ spawnSync('docker', ['stop', 'olam-auth'], {
+ encoding: 'utf-8',
+ stdio: ['ignore', 'ignore', 'ignore'],
+ });
+ spawnSync('docker', ['rm', 'olam-auth'], {
+ encoding: 'utf-8',
+ stdio: ['ignore', 'ignore', 'ignore'],
+ });
+ // Step 2: start the new container via the controller (handles secret
+ // injection; reads OLAM_AUTH_SECRET from env or ~/.olam/auth-secret).
+ const controller = new AuthContainerController();
+ controller.start();
+ // Step 3: wait for /health.
+ const healthy = await waitForAuthHealthLocal(15_000);
+ const durationMs = Date.now() - start;
+ if (!healthy) {
+ return {
+ ok: false,
+ durationMs,
+ error: 'auth-service /health did not respond within 15s after recreate',
+ };
+ }
+ return { ok: true, durationMs };
+ }
+ catch (err) {
+ return {
+ ok: false,
+ durationMs: Date.now() - start,
+ error: err instanceof Error ? err.message : String(err),
+ };
+ }
+ }
  function readBundleHash(cwd) {
  const indexPath = path.join(cwd, 'packages/control-plane/public/index.html');
  if (!fs.existsSync(indexPath))
@@ -197,6 +667,174 @@ async function handleUpgrade(opts) {
  return;
  }
  }
+ // Phase 2c — C2: --history reads the audit log; no state changes.
+ if (opts.history) {
+ handleHistory(parseHistoryOpts({ n: opts.historyN, json: opts.historyJson }));
+ return;
+ }
+ // Phase 2b — B1: rollback path. Branches at the top so --rollback skips
+ // the entire git-pull/build/swap sequence and only retags + recreates.
+ if (opts.rollback) {
+ return await handleRollback();
+ }
+ // 3b. Acquire CLI lock (Phase 2a — A1). Refuses if a live upgrade is in flight;
+ // auto-recovers stale locks (parse-error / empty / dead-pid / >5 min).
+ const lock = acquireLock();
+ if (!lock.acquired) {
+ printError(formatRefusalMessage(lock, LOCK_FILE_PATH));
+ process.exitCode = 1;
+ return;
+ }
+ // SIGINT / SIGTERM handler — release the lock before terminating so the
+ // operator's next invocation doesn't have to wait for stale-recovery
+ // (audit A1-005). `process.once` so a second Ctrl-C terminates immediately.
+ let signalReleased = false;
+ const releaseOnSignal = (signal) => {
+ if (signalReleased)
+ return;
+ signalReleased = true;
+ try {
+ releaseLock();
+ }
+ catch {
+ // best-effort
+ }
+ // Standard shell exit code for signal-induced termination: 128 + signal-number.
+ process.exit(signal === 'SIGINT' ? 130 : 143);
+ };
+ process.once('SIGINT', releaseOnSignal);
+ process.once('SIGTERM', releaseOnSignal);
+ // Phase 2c — C1: collect a JSONL log row. Mutated as the upgrade progresses;
+ // appended once on the way out (success OR failure) so --history surfaces
+ // every attempt, not just successful ones.
+ const logRow = {
+ started_at: Date.now(),
+ durations_ms: {},
+ sha_target: '',
+ failed_step: null,
+ status: 'failed', // default; flipped to 'success' on clean exit
+ };
+ try {
+ await runUpgradeStepsWithLockHeld(opts, cwd, logRow);
+ if (process.exitCode !== 1)
+ logRow.status = 'success';
+ }
+ finally {
+ const ended_at = Date.now();
+ const row = {
+ ts: new Date(ended_at).toISOString(),
+ started_at: logRow.started_at,
+ ended_at,
+ sha_target: logRow.sha_target,
+ status: logRow.status,
+ failed_step: logRow.failed_step,
+ durations_ms: logRow.durations_ms,
+ };
+ appendUpgradeLog(row);
+ releaseLock();
+ process.removeListener('SIGINT', releaseOnSignal);
+ process.removeListener('SIGTERM', releaseOnSignal);
+ }
+ }
+ /**
+ * Phase 2b — B1: handle `olam upgrade --rollback`.
+ *
+ * Pre-flights all three :olam-rollback tags exist; refuses with exit 1 if
+ * any missing. Else atomically retags :olam-rollback → canonical for all
+ * three images, then recreates host-cp (compose) + auth-service (controller).
+ * No git pull, no build, no smoke — the rollback target is a known-good
+ * image set captured by a previous successful upgrade.
+ *
+ * Acquires the same upgrade lock as the regular path so concurrent
+ * --rollback + --normal-upgrade refuse at the file-mutex layer.
+ */
+ async function handleRollback() {
+ printHeader('olam upgrade --rollback');
+ // 1. Pre-flight — verify rollback set exists.
+ const missing = checkRollbackSetExists(PRODUCTION_SWAP_PLAN);
+ if (missing !== null) {
+ printError(`No rollback-set available — missing :olam-rollback tag(s): ${missing}\n\n` +
+ 'A rollback-set is created by the FIRST successful `olam upgrade`. If this\n' +
+ 'is your first install, run `olam upgrade` to populate the rollback set.\n' +
+ 'If a previous upgrade was incomplete, the rollback set may be partial;\n' +
+ 'manually inspect images with `docker images olam-*:olam-rollback`.');
+ process.exitCode = 1;
+ return;
+ }
+ // 2. Acquire lock (same primitive as the upgrade path).
+ const lock = acquireLock();
+ if (!lock.acquired) {
+ printError(formatRefusalMessage(lock, LOCK_FILE_PATH));
+ process.exitCode = 1;
+ return;
+ }
+ let signalReleased = false;
+ const releaseOnSignal = (signal) => {
+ if (signalReleased)
+ return;
+ signalReleased = true;
+ try {
+ releaseLock();
+ }
+ catch {
+ /* best-effort */
+ }
+ process.exit(signal === 'SIGINT' ? 130 : 143);
+ };
+ process.once('SIGINT', releaseOnSignal);
+ process.once('SIGTERM', releaseOnSignal);
+ try {
+ // 3. Inverse swap: 3 docker tag ops.
+ process.stdout.write(` ${pc.dim('rollback retag (3 ops)'.padEnd(34))}`);
+ const swapStart = Date.now();
+ const swapResult = performRollbackSwap(PRODUCTION_SWAP_PLAN);
+ const swapDur = `${((Date.now() - swapStart) / 1000).toFixed(1)}s`;
+ process.stdout.write(`${swapResult.ok ? pc.green('✓') : pc.red('✗')} ${swapDur}\n`);
+ if (!swapResult.ok) {
+ printError(`Rollback retag failed: ${swapResult.summary}`);
+ process.exitCode = 1;
+ return;
+ }
+ printInfo('Rollback', swapResult.summary);
+ // 4. Recreate containers (host-cp via compose, auth via controller).
+ const cwd = process.cwd();
+ const composeFile = path.join(cwd, 'packages/host-cp/compose.yaml');
+ const authSecret = readAuthSecret();
+ process.stdout.write(` ${pc.dim('docker compose recreate host-cp'.padEnd(34))}`);
+ const composeStart = Date.now();
+ const composeResult = runCompose(['up', '-d', '--force-recreate', 'host-cp'], composeFile, buildComposeEnv(authSecret));
+ const composeDur = `${((Date.now() - composeStart) / 1000).toFixed(1)}s`;
+ process.stdout.write(`${composeResult.ok ? pc.green('✓') : pc.red('✗')} ${composeDur}\n`);
+ if (!composeResult.ok) {
+ printError(`Rollback compose recreate failed:\n${composeResult.stderr}\n` +
+ 'Canonical tags are at :olam-rollback (good); container restart pending. ' +
+ 'Manually: `docker compose -f packages/host-cp/compose.yaml up -d --force-recreate host-cp`.');
+ process.exitCode = 1;
+ return;
+ }
+ process.stdout.write(` ${pc.dim('recreate auth-service'.padEnd(34))}`);
+ const authResult = await recreateAuthService();
+ const authDur = `${(authResult.durationMs / 1000).toFixed(1)}s`;
+ process.stdout.write(`${authResult.ok ? pc.green('✓') : pc.red('✗')} ${authDur}\n`);
+ if (!authResult.ok) {
+ printError(`Auth-service recreate failed: ${authResult.error ?? 'unknown'}`);
+ process.exitCode = 1;
+ return;
+ }
+ process.stdout.write('\n');
+ printSuccess('Rollback complete — canonical tags restored from :olam-rollback');
+ }
+ finally {
+ releaseLock();
+ process.removeListener('SIGINT', releaseOnSignal);
+ process.removeListener('SIGTERM', releaseOnSignal);
+ }
+ }
+ /**
+ * Internal — runs all state-changing upgrade steps inside the lock.
+ * Extracted so handleUpgrade can wrap in try/finally without indenting the body.
+ */
+ async function runUpgradeStepsWithLockHeld(opts, cwd, logRow) {
  // 4a. Branch switch (--branch).
  if (opts.branch !== null) {
  if (isGitDirty(cwd)) {
@@ -241,6 +879,21 @@ async function handleUpgrade(opts) {
  process.exitCode = 1;
  return;
  }
+ // Phase 2a — A2: capture HEAD SHA AFTER pull (sticky for the run).
+ // The pull is what we're upgrading to; capturing before would self-refuse
+ // at A6's swap-boundary drift check. _targetSha is consumed by A6 (atomic
+ // swap) and B4 (drift refusal). Phase 2a A2 just stashes it; A6/B4 land it
+ // load-bearing.
+ const _targetSha = captureHeadSha(cwd);
+ logRow.sha_target = _targetSha ?? '';
+ if (_targetSha === null) {
+ logRow.failed_step = 'capture HEAD SHA';
+ printError('Failed to capture HEAD SHA via `git rev-parse HEAD`. Aborting upgrade.\n' +
+ 'Re-run from a clean git checkout; ensure `git rev-parse HEAD` returns a 40-char SHA.');
+ process.exitCode = 1;
+ return;
+ }
+ printInfo('Target SHA', abbreviateSha(_targetSha));
  // Step c: npm install (skip when in sync or --skip-install).
  const installDecision = shouldSkipInstall(opts, cwd);
  if (installDecision.skip) {
@@ -290,15 +943,133 @@ async function handleUpgrade(opts) {
  printTimings(timings);
  return;
  }
- // Step f: docker image build.
- const buildScript = path.join(cwd, 'packages/adapters/src/docker/build-host-cp.sh');
- const imageResult = runStep('bash build-host-cp.sh', 'bash', [buildScript], { cwd });
- timings.push({ label: 'docker image build', durationMs: imageResult.durationMs });
- if (!imageResult.ok) {
- printError(`Docker image build failed:\n${imageResult.stderr}`);
+ // Note: A4-A8 step durations are captured in `timings`; the per-step
+ // durations_ms snapshot on the log row reflects the full timings array
+ // at the end of the run (logRow.durations_ms is updated below at each
+ // significant boundary so a mid-run failure is recorded with what we know).
+ // Phase 2a — A4: sequential build invocation (auth → devbox → host-cp)
+ // with OLAM_TAG=olam-next so each script tags its image transiently per
+ // A3's retag block. Order is load-bearing: auth first minimises P3's
+ // in-flight 401 window when the recreate (A7) restarts auth before
+ // host-cp. Devbox uses inherit-stdio (live tee) per audit F13 since
+ // its cold-cache build dominates the 12-22 min budget and silent
+ // capture is indistinguishable from a hang.
+ // Phase 2b — B3: --no-cache passes through to all three build scripts.
+ // The build scripts honor DOCKER_BUILD_NO_CACHE via the build-arg env mechanism
+ // documented in their shell. (B3 implementation: forward the env; build
+ // scripts treat unset as default cache enabled.)
+ const olamTagEnv = { OLAM_TAG: 'olam-next' };
+ if (opts.noCache) {
+ olamTagEnv.DOCKER_BUILD_NO_CACHE = '1';
+ }
+ const buildScripts = [
+ { label: 'bash build-auth.sh', relPath: 'packages/adapters/src/docker/build-auth.sh', tee: false },
+ { label: 'bash build-devbox.sh', relPath: 'packages/adapters/src/docker/build-devbox.sh', tee: true },
+ { label: 'bash build-host-cp.sh', relPath: 'packages/adapters/src/docker/build-host-cp.sh', tee: false },
+ ];
+ for (const step of buildScripts) {
+ const scriptPath = path.join(cwd, step.relPath);
+ if (step.tee) {
+ // Live-tee variant: stdio: 'inherit' so docker build's apt/bundle/npm
+ // progress reaches the operator's terminal in real-time. No stdout
+ // capture means we can't include stderr in the failure message —
+ // operator already saw the failure inline.
+ process.stdout.write(` ${pc.dim(step.label.padEnd(34))}\n`);
+ const start = Date.now();
+ const result = spawnSync('bash', [scriptPath], {
+ stdio: 'inherit',
+ cwd,
+ env: { ...process.env, ...olamTagEnv },
+ });
+ const durationMs = Date.now() - start;
+ const ok = result.status === 0 && result.error === undefined;
+ const dur = `${(durationMs / 1000).toFixed(1)}s`;
+ process.stdout.write(` ${pc.dim(step.label.padEnd(34))}${ok ? pc.green('✓') : pc.red('✗')} ${dur}\n`);
+ timings.push({ label: step.label, durationMs });
+ if (!ok) {
+ printError(`${step.label} failed (see output above for details).`);
+ process.exitCode = 1;
+ return;
+ }
+ }
+ else {
+ const result = runStep(step.label, 'bash', [scriptPath], {
+ cwd,
+ env: olamTagEnv,
+ });
+ timings.push({ label: step.label, durationMs: result.durationMs });
+ logRow.durations_ms[step.label] = result.durationMs;
+ if (!result.ok) {
+ logRow.failed_step = step.label;
+ printError(`${step.label} failed:\n${result.stderr.split('\n').slice(-3).join('\n')}`);
+ process.exitCode = 1;
+ return;
+ }
+ }
+ }
+ // Snapshot durations to logRow so a later-step failure preserves what we know.
+ for (const t of timings)
+ logRow.durations_ms[t.label] = t.durationMs;
+ // Phase 2a — A5: smoke each :olam-next image via docker create + inspect.
+ // Catches build-corrupt cases (manifest invalid, OLAM_BUILD_SHA label
+ // missing, baked SHA != target SHA) before A6's atomic swap touches
+ // canonical tags. Sub-second per image; no port bind.
+ const smokeStart = Date.now();
+ process.stdout.write(` ${pc.dim('smoke (docker create + inspect)'.padEnd(34))}`);
+ const smokeImages = [
+ 'olam-auth:olam-next',
+ 'olam-devbox:olam-next',
+ 'olam-host-cp:olam-next',
+ ];
+ const smokeResults = smokeImages.map((img) => smokeImage(img, _targetSha));
+ const smokeFailures = smokeResults.filter((r) => !r.ok);
+ const smokeDurationMs = Date.now() - smokeStart;
+ const smokeDur = `${(smokeDurationMs / 1000).toFixed(1)}s`;
+ process.stdout.write(`${smokeFailures.length === 0 ? pc.green('✓') : pc.red('✗')} ${smokeDur}\n`);
+ timings.push({ label: 'smoke', durationMs: smokeDurationMs });
+ if (smokeFailures.length > 0) {
+ printError(`Smoke failed for ${smokeFailures.length} of ${smokeResults.length} images:\n` +
+ smokeFailures.map((r) => ` - ${r.image}: ${r.error}`).join('\n') +
+ '\nCanonical tags (`:latest`/`:local`) untouched. Investigate the failed image(s),' +
+ ' then re-run `olam upgrade` (--no-cache if cache-poisoning suspected).');
  process.exitCode = 1;
  return;
  }
+ // Phase 2b — B4: SHA drift check at swap boundary.
+ // Re-read HEAD via `git rev-parse HEAD` and compare to A2's captured
+ // _targetSha. If different, refuse the swap unless --force.
+ const swapBoundarySha = captureHeadSha(cwd);
+ if (swapBoundarySha !== null && swapBoundarySha !== _targetSha && !opts.force) {
+ printError(`HEAD drifted during build window:\n` +
+ ` captured (after pull): ${abbreviateSha(_targetSha)}\n` +
+ ` current at swap: ${abbreviateSha(swapBoundarySha)}\n\n` +
+ 'Operator-driven `git checkout` or `git reset` triggered drift.\n' +
+ 'Recovery options:\n' +
+ ' • Re-run `olam upgrade` (will rebuild against current HEAD).\n' +
+ ' • Pass `--force` to swap anyway (canonical advances to the\n' +
+ ' captured-at-pull SHA, NOT current HEAD).');
+ process.exitCode = 1;
+ return;
+ }
+ // Phase 2a — A6: atomic 6-tag swap.
+ // Phase 1 of swap: preserve previous-good as :olam-rollback (3 ops).
+ // Phase 2 of swap: advance canonical to :olam-next (3 ops).
+ // Sub-second wall-clock; SIGKILL during Phase 2 is recoverable via
+ // `olam upgrade --rollback` (Phase 2b) since :olam-rollback is fully
+ // populated before any canonical tag is touched.
+ process.stdout.write(` ${pc.dim('atomic 6-tag swap'.padEnd(34))}`);
+ const swapStart = Date.now();
+ const swapResult = performAtomicSwap(PRODUCTION_SWAP_PLAN);
+ const swapDurationMs = Date.now() - swapStart;
+ const swapDur = `${(swapDurationMs / 1000).toFixed(1)}s`;
+ process.stdout.write(`${swapResult.ok ? pc.green('✓') : pc.red('✗')} ${swapDur}\n`);
+ timings.push({ label: 'atomic swap', durationMs: swapDurationMs });
+ if (!swapResult.ok) {
+ printError(`Atomic swap failed: ${swapResult.summary}`);
+ process.exitCode = 1;
+ return;
+ }
+ printInfo('Swap', swapResult.summary);
  // Step g: docker compose up -d --force-recreate.
  const composeFile = path.join(cwd, 'packages/host-cp/compose.yaml');
  process.stdout.write(` ${pc.dim('docker compose recreate'.padEnd(34))}`);
@@ -310,11 +1081,36 @@ async function handleUpgrade(opts) {
  process.stdout.write(`${composeOk ? pc.green('✓') : pc.red('✗')} ${composeDur}\n`);
  timings.push({ label: 'container recreate', durationMs: composeDurationMs });
  if (!composeOk) {
- printError(`docker compose up --force-recreate failed:\n${composeResult.stderr}`);
+ // Audit A6-002: at this point canonical tags are at NEW SHA but the stack
+ // failed to start. Operator needs to know --rollback is one command away.
+ printError(`docker compose up --force-recreate failed:\n${composeResult.stderr}\n\n` +
+ 'Canonical tags advanced to new SHA but the stack failed to start.\n' +
+ 'Recovery options:\n' +
+ ' • Run `olam upgrade --rollback` to restore the prior :olam-rollback set, then investigate.\n' +
+ ' • Manually `docker logs olam-host-cp` to diagnose; if recoverable, retry recreate without rollback.');
+ process.exitCode = 1;
+ return;
+ }
+ // Phase 2a — A7: recreate auth-service via AuthContainerController.
+ // Auth is NOT in compose.yaml; reusing the auth-upgrade.ts recreate pattern
+ // (docker stop → docker rm → controller.start() → wait /health). The 25s
+ // in-flight 401 window for active world API calls during this recreate is
+ // documented in the operator's confirmation prompt (P3 mitigation).
+ process.stdout.write(` ${pc.dim('recreate auth-service'.padEnd(34))}`);
+ const authResult = await recreateAuthService();
+ const authDur = `${(authResult.durationMs / 1000).toFixed(1)}s`;
+ process.stdout.write(`${authResult.ok ? pc.green('✓') : pc.red('✗')} ${authDur}\n`);
+ timings.push({ label: 'auth recreate', durationMs: authResult.durationMs });
+ if (!authResult.ok) {
+ printError(`Auth-service recreate failed: ${authResult.error ?? 'unknown'}\n\n` +
+ 'Canonical tags advanced to new SHA; host-cp recreated but auth-service is broken.\n' +
+ 'Recovery options:\n' +
+ ' • Run `olam upgrade --rollback` to restore the prior :olam-rollback set + working stack.\n' +
+ ' • Manually: `docker logs olam-auth` to diagnose; `olam auth up` to restart.');
  process.exitCode = 1;
  return;
  }
- // Step h: wait for /health.
+ // Step h: wait for /health (host-cp readiness probe).
  process.stdout.write(` ${pc.dim('waiting for /health'.padEnd(34))}`);
  const healthStart = Date.now();
  const healthy = await waitForHealth(10_000);
@@ -323,7 +1119,31 @@ async function handleUpgrade(opts) {
  process.stdout.write(`${healthy ? pc.green('✓') : pc.yellow('?')} ${healthDur}\n`);
  timings.push({ label: '/health', durationMs: healthDurationMs });
  if (!healthy) {
- printWarning('Host CP started but /health did not respond within 10s. Check: docker logs olam-host-cp');
+ printWarning('Host CP started but /health did not respond within 10s.\n' +
+ ' • Check: docker logs olam-host-cp\n' +
+ ' • If the new SHA is broken: `olam upgrade --rollback` restores the prior set in <30s.');
+ }
+ // Phase 2a — A8: poll /api/version/status until all three SHAs match
+ // captured target. This is the success criterion for the entire upgrade —
+ // round-trips through Phase 1's detection path so the SPA banner clears
+ // automatically once the polling loop succeeds.
+ process.stdout.write(` ${pc.dim('verify /version/status round-trip'.padEnd(34))}`);
+ const versionStart = Date.now();
+ const versionMatch = await waitForVersionMatch(_targetSha, 60_000);
+ const versionDurationMs = Date.now() - versionStart;
+ const versionDur = `${(versionDurationMs / 1000).toFixed(1)}s`;
+ process.stdout.write(`${versionMatch.matched ? pc.green('✓') : pc.yellow('?')} ${versionDur}\n`);
+ timings.push({ label: '/version/status round-trip', durationMs: versionDurationMs });
+ if (!versionMatch.matched) {
+ // Non-fatal — recreate succeeded; SHA propagation may be slow on cold
+ // host-cp boot. Operator gets diagnostic output + can decide whether to
+ // re-run, wait, or roll back.
+ printWarning(`Version round-trip incomplete after ${(versionDurationMs / 1000).toFixed(0)}s:\n` +
+ formatVersionMismatch(_targetSha, versionMatch.snapshot) + '\n' +
+ ' • Banner may still show UPDATE AVAILABLE until host-cp\'s next ' +
+ 'poll cycle (~60s).\n' +
+ ' • If the mismatch persists, `olam upgrade --rollback` restores ' +
+ 'the prior :olam-rollback set.');
  }
  // 5. Summary.
  process.stdout.write('\n');
@@ -345,12 +1165,22 @@ function printTimings(timings) {
  export function registerUpgrade(program) {
  program
  .command('upgrade')
- .description('Self-upgrade the local Olam dev stack (pull + rebuild + restart host-cp)')
+ .description('Self-upgrade the local Olam dev stack (pull + rebuild + restart all three components)')
  .option('-y, --yes', 'Skip the confirmation prompt')
  .option('--skip-image', 'Skip docker image rebuild + container recreate (source rebuild only)')
  .option('--skip-install', 'Skip npm install entirely (use existing node_modules as-is). ' +
  'Useful when a native-module build failure blocks the normal upgrade path.')
  .option('--branch <name>', 'Switch to this branch before pulling (refuses if working tree is dirty)')
+ .option('--rollback', 'Restore canonical tags from the :olam-rollback set (created by the prior successful upgrade).\n' +
+ ' No git pull, no build, no smoke — just retag + recreate.')
+ .option('--force', 'Bypass HEAD-drift refusal at the swap boundary. Swap advances canonical to the\n' +
+ ' captured-at-pull SHA even if current HEAD differs.')
+ .option('--no-cache', 'Pass --no-cache to all three build scripts (DOCKER_BUILD_NO_CACHE=1).\n' +
+ ' Useful when retrying after a cache-poisoning failure.')
+ .option('--history', 'Print the upgrade history (~/.olam/upgrade.log) and exit.\n' +
+ ' No upgrade is performed.')
+ .option('-n <count>', 'Number of history rows to print (default 10)', '10')
+ .option('--json', 'Emit history as JSONL instead of a table')
  .action(async (opts) => {
  await handleUpgrade(parseUpgradeOpts(opts));
  });
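
For orientation, the JSONL row that `appendUpgradeLog` receives in `handleUpgrade`'s `finally` block has roughly the shape sketched below. This is a reading of the diff, not the package's published typings: the field names are taken verbatim from the code above, while the `UpgradeLogRow` interface name and the sample values are illustrative only.

// TypeScript sketch of the row assembled in handleUpgrade's finally block.
// Field names mirror the diff; the interface name and values are hypothetical.
interface UpgradeLogRow {
  ts: string;                           // ISO-8601 timestamp of ended_at
  started_at: number;                   // epoch milliseconds
  ended_at: number;                     // epoch milliseconds
  sha_target: string;                   // HEAD SHA captured after `git pull --ff-only` ('' if capture failed)
  status: 'success' | 'failed';         // defaults to 'failed'; flipped on clean exit
  failed_step: string | null;           // e.g. 'capture HEAD SHA' or 'bash build-devbox.sh'
  durations_ms: Record<string, number>; // per-step timings snapshot
}

const exampleRow: UpgradeLogRow = {
  ts: '2025-01-01T00:12:34.000Z',
  started_at: 1735690000000,
  ended_at: 1735690754000,
  sha_target: '0123456789abcdef0123456789abcdef01234567',
  status: 'success',
  failed_step: null,
  durations_ms: { 'bash build-devbox.sh': 612000, smoke: 800, 'atomic swap': 250 },
};

One row is appended per attempt, success or failure, which is what lets `olam upgrade --history` surface failed runs alongside successful ones.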