@wopr-network/platform-core 1.21.0 → 1.23.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. package/dist/billing/crypto/btc/settler.js +1 -1
  2. package/dist/billing/crypto/btc/watcher.d.ts +5 -3
  3. package/dist/billing/crypto/btc/watcher.js +9 -8
  4. package/dist/billing/crypto/evm/__tests__/eth-checkout.test.d.ts +1 -0
  5. package/dist/billing/crypto/evm/__tests__/eth-checkout.test.js +49 -0
  6. package/dist/billing/crypto/evm/__tests__/eth-settler.test.d.ts +1 -0
  7. package/dist/billing/crypto/evm/__tests__/eth-settler.test.js +80 -0
  8. package/dist/billing/crypto/evm/__tests__/eth-watcher.test.d.ts +1 -0
  9. package/dist/billing/crypto/evm/__tests__/eth-watcher.test.js +134 -0
  10. package/dist/billing/crypto/evm/eth-checkout.d.ts +34 -0
  11. package/dist/billing/crypto/evm/eth-checkout.js +53 -0
  12. package/dist/billing/crypto/evm/eth-settler.d.ts +23 -0
  13. package/dist/billing/crypto/evm/eth-settler.js +52 -0
  14. package/dist/billing/crypto/evm/eth-watcher.d.ts +53 -0
  15. package/dist/billing/crypto/evm/eth-watcher.js +83 -0
  16. package/dist/billing/crypto/evm/index.d.ts +6 -0
  17. package/dist/billing/crypto/evm/index.js +3 -0
  18. package/dist/billing/crypto/evm/settler.js +1 -1
  19. package/dist/fleet/__tests__/rollout-orchestrator.test.d.ts +1 -0
  20. package/dist/fleet/__tests__/rollout-orchestrator.test.js +262 -0
  21. package/dist/fleet/index.d.ts +1 -0
  22. package/dist/fleet/index.js +1 -0
  23. package/dist/fleet/rollout-orchestrator.d.ts +69 -0
  24. package/dist/fleet/rollout-orchestrator.js +204 -0
  25. package/dist/fleet/services.d.ts +6 -0
  26. package/dist/fleet/services.js +22 -0
  27. package/package.json +1 -1
  28. package/src/billing/crypto/btc/settler.ts +1 -1
  29. package/src/billing/crypto/btc/watcher.ts +12 -11
  30. package/src/billing/crypto/evm/__tests__/eth-checkout.test.ts +60 -0
  31. package/src/billing/crypto/evm/__tests__/eth-settler.test.ts +98 -0
  32. package/src/billing/crypto/evm/__tests__/eth-watcher.test.ts +157 -0
  33. package/src/billing/crypto/evm/eth-checkout.ts +84 -0
  34. package/src/billing/crypto/evm/eth-settler.ts +71 -0
  35. package/src/billing/crypto/evm/eth-watcher.ts +129 -0
  36. package/src/billing/crypto/evm/index.ts +6 -0
  37. package/src/billing/crypto/evm/settler.ts +1 -1
  38. package/src/fleet/__tests__/rollout-orchestrator.test.ts +321 -0
  39. package/src/fleet/index.ts +1 -0
  40. package/src/fleet/rollout-orchestrator.ts +262 -0
  41. package/src/fleet/services.ts +28 -0
@@ -28,7 +28,7 @@ export async function settleEvmPayment(deps: EvmSettlerDeps, event: EvmPaymentEv
28
28
 
29
29
  const charge = await chargeStore.getByDepositAddress(event.to.toLowerCase());
30
30
  if (!charge) {
31
- return { handled: false, status: "Settled" };
31
+ return { handled: false, status: "Invalid" };
32
32
  }
33
33
 
34
34
  // Update charge status to Settled.
@@ -0,0 +1,321 @@
1
+ import { beforeEach, describe, expect, it, vi } from "vitest";
2
+ import { RolloutOrchestrator } from "../rollout-orchestrator.js";
3
+ import type { IRolloutStrategy } from "../rollout-strategy.js";
4
+ import type { BotProfile } from "../types.js";
5
+ import type { ContainerUpdater, UpdateResult } from "../updater.js";
6
+ import type { VolumeSnapshotManager } from "../volume-snapshot-manager.js";
7
+
8
/**
 * Build a minimal BotProfile fixture for the given bot id.
 *
 * @param id - Bot id; also used to derive the fixture's `name` (`bot-<id>`).
 * @param volumeName - Optional volume name; left `undefined` when omitted.
 */
function makeProfile(id: string, volumeName?: string): BotProfile {
  const name = `bot-${id}`;
  const profile = {
    id,
    tenantId: "tenant-1",
    name,
    description: "",
    image: "ghcr.io/wopr-network/paperclip:managed",
    env: {},
    restartPolicy: "unless-stopped",
    releaseChannel: "stable",
    updatePolicy: "nightly",
    volumeName,
  } as BotProfile;
  return profile;
}
22
+
23
/**
 * Build a canned UpdateResult for a bot.
 *
 * A failed result is marked as rolled back and carries a fixed
 * "Health check failed" error; a successful one has no error.
 */
function makeResult(botId: string, success: boolean): UpdateResult {
  const failed = !success;
  return {
    botId,
    success,
    previousImage: "old:latest",
    newImage: "new:latest",
    previousDigest: "sha256:old",
    newDigest: "sha256:new",
    rolledBack: failed,
    error: failed ? "Health check failed" : undefined,
  };
}
35
+
36
/**
 * Create a ContainerUpdater test double.
 *
 * `updateBot` resolves to the canned result registered for the bot id in
 * `results`, or to a successful result when none is registered.
 */
function mockUpdater(results: Map<string, UpdateResult>): ContainerUpdater {
  const updateBot = vi.fn(async (botId: string) => {
    const canned = results.get(botId);
    return canned ?? makeResult(botId, true);
  });
  return { updateBot } as unknown as ContainerUpdater;
}
41
+
42
/**
 * Create a VolumeSnapshotManager test double.
 *
 * `snapshot` resolves to a fake snapshot whose id is `<volumeName>-snap`;
 * `restore` and `delete` are no-op mocks that only record their calls.
 */
function mockSnapshotManager(): VolumeSnapshotManager {
  const snapshot = vi.fn(async (volumeName: string) => {
    return {
      id: `${volumeName}-snap`,
      volumeName,
      archivePath: `/backup/${volumeName}-snap.tar`,
      createdAt: new Date(),
      sizeBytes: 1024,
    };
  });
  const restore = vi.fn(async () => {});
  const remove = vi.fn(async () => {});

  return { snapshot, restore, delete: remove } as unknown as VolumeSnapshotManager;
}
55
+
56
/**
 * Create an IRolloutStrategy test double.
 *
 * Defaults: waves of at most two bots, no pause between waves, failures are
 * skipped, two retries, 120 s health-check timeout. Any member can be
 * replaced through `overrides`.
 */
function mockStrategy(overrides: Partial<IRolloutStrategy> = {}): IRolloutStrategy {
  const defaults: IRolloutStrategy = {
    nextBatch: (remaining) => remaining.slice(0, 2),
    pauseDuration: () => 0,
    onBotFailure: () => "skip",
    maxRetries: () => 2,
    healthCheckTimeout: () => 120_000,
  };
  return { ...defaults, ...overrides };
}
66
+
67
/**
 * Behavioural tests for RolloutOrchestrator. Every collaborator (updater,
 * snapshot manager, strategy) is a mock, so each test only observes how the
 * orchestrator sequences calls and aggregates results.
 */
describe("RolloutOrchestrator", () => {
  let updater: ReturnType<typeof mockUpdater>;
  let snapMgr: ReturnType<typeof mockSnapshotManager>;

  beforeEach(() => {
    vi.clearAllMocks();
    updater = mockUpdater(new Map());
    snapMgr = mockSnapshotManager();
  });

  it("processes all bots in batches", async () => {
    const profiles = [makeProfile("b1", "vol-1"), makeProfile("b2", "vol-2"), makeProfile("b3", "vol-3")];
    const strategy = mockStrategy({ nextBatch: (r) => r.slice(0, 2) });

    const orch = new RolloutOrchestrator({
      updater,
      snapshotManager: snapMgr,
      strategy,
      getUpdatableProfiles: async () => profiles,
    });

    const result = await orch.rollout();

    expect(result.totalBots).toBe(3);
    expect(result.succeeded).toBe(3);
    expect(result.failed).toBe(0);
    expect(result.aborted).toBe(false);
    expect(updater.updateBot).toHaveBeenCalledTimes(3);
  });

  it("snapshots volumes before updating", async () => {
    const profiles = [makeProfile("b1", "my-volume")];

    const orch = new RolloutOrchestrator({
      updater,
      snapshotManager: snapMgr,
      strategy: mockStrategy(),
      getUpdatableProfiles: async () => profiles,
    });

    await orch.rollout();

    expect(snapMgr.snapshot).toHaveBeenCalledWith("my-volume");
    // On success, snapshot is cleaned up
    expect(snapMgr.delete).toHaveBeenCalledWith("my-volume-snap");
  });

  it("skips snapshot for bots without volumes", async () => {
    const profiles = [makeProfile("b1")]; // no volumeName

    const orch = new RolloutOrchestrator({
      updater,
      snapshotManager: snapMgr,
      strategy: mockStrategy(),
      getUpdatableProfiles: async () => profiles,
    });

    await orch.rollout();

    expect(snapMgr.snapshot).not.toHaveBeenCalled();
    expect(updater.updateBot).toHaveBeenCalledWith("b1");
  });

  it("restores volumes on update failure", async () => {
    const failResults = new Map([["b1", makeResult("b1", false)]]);
    updater = mockUpdater(failResults);
    const profiles = [makeProfile("b1", "my-volume")];

    const orch = new RolloutOrchestrator({
      updater,
      snapshotManager: snapMgr,
      strategy: mockStrategy(),
      getUpdatableProfiles: async () => profiles,
    });

    const result = await orch.rollout();

    expect(result.failed).toBe(1);
    expect(result.results[0].volumeRestored).toBe(true);
    expect(snapMgr.restore).toHaveBeenCalledWith("my-volume-snap");
    // Snapshot NOT deleted on failure (restored instead)
    expect(snapMgr.delete).not.toHaveBeenCalled();
  });

  it("aborts rollout when strategy says abort", async () => {
    const failResults = new Map([["b1", makeResult("b1", false)]]);
    updater = mockUpdater(failResults);
    const profiles = [makeProfile("b1", "v1"), makeProfile("b2", "v2"), makeProfile("b3", "v3")];
    const strategy = mockStrategy({
      nextBatch: (r) => r.slice(0, 1),
      onBotFailure: () => "abort",
    });

    const orch = new RolloutOrchestrator({
      updater,
      snapshotManager: snapMgr,
      strategy,
      getUpdatableProfiles: async () => profiles,
    });

    const result = await orch.rollout();

    expect(result.aborted).toBe(true);
    expect(result.succeeded).toBe(0);
    expect(result.failed).toBe(1);
    expect(result.skipped).toBe(2); // b2, b3 never processed
    expect(updater.updateBot).toHaveBeenCalledTimes(1);
  });

  it("returns empty result when no bots to update", async () => {
    const orch = new RolloutOrchestrator({
      updater,
      snapshotManager: snapMgr,
      strategy: mockStrategy(),
      getUpdatableProfiles: async () => [],
    });

    const result = await orch.rollout();

    expect(result.totalBots).toBe(0);
    expect(result.results).toHaveLength(0);
  });

  it("rejects concurrent rollouts", async () => {
    const profiles = [makeProfile("b1")];
    // Make updateBot slow
    updater = {
      updateBot: vi.fn(async (botId: string) => {
        await new Promise((r) => setTimeout(r, 100));
        return makeResult(botId, true);
      }),
    } as unknown as ContainerUpdater;

    const orch = new RolloutOrchestrator({
      updater,
      snapshotManager: snapMgr,
      strategy: mockStrategy(),
      getUpdatableProfiles: async () => profiles,
    });

    const [r1, r2] = await Promise.all([orch.rollout(), orch.rollout()]);

    // One succeeds, one is rejected as already running
    const succeeded = [r1, r2].find((r) => r.totalBots > 0);
    const rejected = [r1, r2].find((r) => r.alreadyRunning);
    expect(succeeded).toBeDefined();
    expect(rejected).toBeDefined();
    expect(rejected?.alreadyRunning).toBe(true);
    expect(rejected?.totalBots).toBe(0);
  });

  it("retries failed bots when strategy says retry", async () => {
    let callCount = 0;
    updater = {
      updateBot: vi.fn(async (botId: string) => {
        callCount++;
        // Fail first attempt, succeed on retry
        if (botId === "b1" && callCount === 1) return makeResult("b1", false);
        return makeResult(botId, true);
      }),
    } as unknown as ContainerUpdater;

    const profiles = [makeProfile("b1")];
    const strategy = mockStrategy({
      nextBatch: (r) => r.slice(0, 1),
      onBotFailure: (_botId, _err, attempt) => (attempt < 2 ? "retry" : "skip"),
    });

    const orch = new RolloutOrchestrator({
      updater,
      snapshotManager: snapMgr,
      strategy,
      getUpdatableProfiles: async () => profiles,
    });

    const result = await orch.rollout();

    // b1 failed once, retried, succeeded
    expect(updater.updateBot).toHaveBeenCalledTimes(2);
    expect(result.succeeded).toBe(1);
    expect(result.failed).toBe(1); // first attempt counted as failed
  });

  it("calls onBotUpdated callback for each bot", async () => {
    const profiles = [makeProfile("b1"), makeProfile("b2")];
    const onBotUpdated = vi.fn();

    const orch = new RolloutOrchestrator({
      updater,
      snapshotManager: snapMgr,
      strategy: mockStrategy(),
      getUpdatableProfiles: async () => profiles,
      onBotUpdated,
    });

    await orch.rollout();

    expect(onBotUpdated).toHaveBeenCalledTimes(2);
  });

  it("calls onRolloutComplete callback", async () => {
    const profiles = [makeProfile("b1")];
    const onRolloutComplete = vi.fn();

    const orch = new RolloutOrchestrator({
      updater,
      snapshotManager: snapMgr,
      strategy: mockStrategy(),
      getUpdatableProfiles: async () => profiles,
      onRolloutComplete,
    });

    await orch.rollout();

    expect(onRolloutComplete).toHaveBeenCalledTimes(1);
    expect(onRolloutComplete).toHaveBeenCalledWith(
      expect.objectContaining({ totalBots: 1, succeeded: 1, aborted: false }),
    );
  });

  it("continues on snapshot failure (best-effort)", async () => {
    const profiles = [makeProfile("b1", "my-volume")];
    snapMgr.snapshot = vi.fn().mockRejectedValue(new Error("disk full"));

    const orch = new RolloutOrchestrator({
      updater,
      snapshotManager: snapMgr,
      strategy: mockStrategy(),
      getUpdatableProfiles: async () => profiles,
    });

    const result = await orch.rollout();

    // Update still proceeds despite snapshot failure
    expect(result.succeeded).toBe(1);
    expect(updater.updateBot).toHaveBeenCalledWith("b1");
  });

  it("isRolling reflects rollout state", async () => {
    const profiles = [makeProfile("b1")];

    const orch = new RolloutOrchestrator({
      updater,
      snapshotManager: snapMgr,
      strategy: mockStrategy(),
      getUpdatableProfiles: async () => profiles,
    });

    expect(orch.isRolling).toBe(false);
    const promise = orch.rollout();
    // rollout() raises the flag synchronously, before its first await, and an
    // async function cannot settle within the same tick — so the in-progress
    // state is observable here even with instantly-resolving mocks.
    expect(orch.isRolling).toBe(true);
    await promise;
    expect(orch.isRolling).toBe(false);
  });
});
@@ -1,4 +1,5 @@
1
1
  export * from "./repository-types.js";
2
+ export * from "./rollout-orchestrator.js";
2
3
  export * from "./rollout-strategy.js";
3
4
  export * from "./services.js";
4
5
  export * from "./types.js";
@@ -0,0 +1,262 @@
1
+ /**
2
+ * RolloutOrchestrator — coordinates fleet-wide container updates using
3
+ * pluggable rollout strategies and volume snapshots for nuclear rollback.
4
+ *
5
+ * Sits between ImagePoller (detects new digests) and ContainerUpdater
6
+ * (handles per-bot pull/stop/recreate/health). Adds:
7
+ * - Strategy-driven batching (rolling wave, single bot, immediate)
8
+ * - Pre-update volume snapshots via VolumeSnapshotManager
9
+ * - Volume restore on health check failure (nuclear rollback)
10
+ * - Per-tenant update orchestration
11
+ */
12
+
13
+ import { logger } from "../config/logger.js";
14
+ import type { IRolloutStrategy } from "./rollout-strategy.js";
15
+ import type { BotProfile } from "./types.js";
16
+ import type { ContainerUpdater, UpdateResult } from "./updater.js";
17
+ import type { VolumeSnapshotManager } from "./volume-snapshot-manager.js";
18
+
19
/** Collaborators and optional observers injected into the RolloutOrchestrator. */
export interface RolloutOrchestratorDeps {
  /** Performs the per-bot update; called with the bot's id once per attempt. */
  updater: ContainerUpdater;
  /** Takes the pre-update volume snapshot and later restores or deletes it. */
  snapshotManager: VolumeSnapshotManager;
  /** Decides wave composition, the pause between waves, and the reaction to a failed bot. */
  strategy: IRolloutStrategy;
  /** Resolve the profiles that need updating; called once at the start of each rollout. */
  getUpdatableProfiles: () => Promise<BotProfile[]>;
  /** Optional callback after each bot update attempt (success or failure). */
  onBotUpdated?: (result: BotUpdateResult) => void;
  /**
   * Optional callback when a rollout completes. Not invoked when there was
   * nothing to update or when another rollout was already in progress.
   */
  onRolloutComplete?: (results: RolloutResult) => void;
}
30
+
31
/** Outcome of a single update attempt for one bot, plus the volume-rollback status. */
export interface BotUpdateResult extends UpdateResult {
  /** True when a pre-update snapshot was restored after the attempt failed. */
  volumeRestored: boolean;
}
34
+
35
/** Aggregate outcome of one rollout() call. */
export interface RolloutResult {
  /** Number of profiles resolved for updating when the rollout started. */
  totalBots: number;
  /** Number of entries in `results` that succeeded. */
  succeeded: number;
  /** Number of entries in `results` that failed; a retried bot contributes one per failed attempt. */
  failed: number;
  /** Number of bots left unprocessed when the rollout ended (e.g. after an abort). */
  skipped: number;
  /** True when the strategy stopped the rollout after a bot failure. */
  aborted: boolean;
  /** True when a concurrent rollout was already in progress */
  alreadyRunning: boolean;
  /** One entry per update attempt, in the order the attempts were made. */
  results: BotUpdateResult[];
}
45
+
46
/**
 * Drives one fleet-wide update at a time.
 *
 * The strategy picks each wave; bots inside a wave are updated one after the
 * other. Before a bot with a volume is updated, the volume is snapshotted
 * (best-effort); the snapshot is deleted when the update succeeds and
 * restored when it fails or throws.
 */
export class RolloutOrchestrator {
  private readonly updater: ContainerUpdater;
  private readonly snapshotManager: VolumeSnapshotManager;
  private readonly strategy: IRolloutStrategy;
  private readonly getUpdatableProfiles: () => Promise<BotProfile[]>;
  private readonly onBotUpdated?: (result: BotUpdateResult) => void;
  private readonly onRolloutComplete?: (results: RolloutResult) => void;
  /** Re-entrancy flag: set for the whole duration of rollout(). */
  private rolling = false;

  constructor(deps: RolloutOrchestratorDeps) {
    this.updater = deps.updater;
    this.snapshotManager = deps.snapshotManager;
    this.strategy = deps.strategy;
    this.getUpdatableProfiles = deps.getUpdatableProfiles;
    this.onBotUpdated = deps.onBotUpdated;
    this.onRolloutComplete = deps.onRolloutComplete;
  }

  /** Whether a rollout is currently in progress. */
  get isRolling(): boolean {
    return this.rolling;
  }

  /**
   * Execute a rollout across all updatable bots.
   * Uses the configured strategy for batching, pausing, and failure handling.
   *
   * @returns Aggregate counts plus one entry per update attempt. When another
   *   rollout is already running, an empty result with `alreadyRunning: true`
   *   is returned immediately. `onRolloutComplete` is only invoked when at
   *   least one bot was eligible for updating.
   */
  async rollout(): Promise<RolloutResult> {
    if (this.rolling) {
      logger.warn("Rollout already in progress — skipping");
      return RolloutOrchestrator.emptyResult(true);
    }

    // Set synchronously (before the first await) so an overlapping call sees it.
    this.rolling = true;
    const allResults: BotUpdateResult[] = [];
    // Distinct bots that were attempted at least once; allResults holds one
    // entry per *attempt*, so it over-counts bots that were retried.
    const attemptedIds = new Set<string>();
    let aborted = false;

    try {
      let remaining = await this.getUpdatableProfiles();
      const totalBots = remaining.length;

      if (totalBots === 0) {
        logger.info("Rollout: no bots to update");
        return RolloutOrchestrator.emptyResult(false);
      }

      logger.info(`Rollout starting: ${totalBots} bots to update`);

      while (remaining.length > 0 && !aborted) {
        const batch = this.strategy.nextBatch(remaining);
        if (batch.length === 0) break;

        logger.info(`Rollout wave: ${batch.length} bots (${remaining.length} remaining)`);

        // Process batch — each bot sequentially within a wave for safety
        const retryProfiles: BotProfile[] = [];
        for (const profile of batch) {
          const result = await this.updateBot(profile);
          attemptedIds.add(profile.id);
          allResults.push(result);
          this.onBotUpdated?.(result);

          if (result.success) continue;

          const action = this.handleFailure(profile.id, result, allResults);
          if (action === "abort") {
            aborted = true;
            logger.warn(`Rollout aborted after bot ${profile.id} failure`);
            break;
          }
          if (action === "retry") {
            retryProfiles.push(profile);
          }
          // "skip" → don't re-add, bot is dropped
        }

        // Remove processed bots from remaining, but re-add retries
        const processedIds = new Set(batch.map((b) => b.id));
        remaining = [...remaining.filter((b) => !processedIds.has(b.id)), ...retryProfiles];

        // Pause between waves (unless aborted or done)
        if (remaining.length > 0 && !aborted) {
          const pause = this.strategy.pauseDuration();
          if (pause > 0) {
            logger.info(`Rollout: pausing ${pause}ms before next wave`);
            await sleep(pause);
          }
        }
      }

      const succeeded = allResults.filter((r) => r.success).length;
      const failed = allResults.length - succeeded;
      // Count bots, not attempts: subtracting allResults.length would go
      // negative as soon as any bot was retried.
      const skipped = Math.max(0, totalBots - attemptedIds.size);

      const rolloutResult: RolloutResult = {
        totalBots,
        succeeded,
        failed,
        skipped,
        aborted,
        alreadyRunning: false,
        results: allResults,
      };

      logger.info(`Rollout complete: ${succeeded} succeeded, ${failed} failed, ${skipped} skipped, aborted=${aborted}`);
      this.onRolloutComplete?.(rolloutResult);

      return rolloutResult;
    } finally {
      this.rolling = false;
    }
  }

  /** Result returned when no bot was processed (nothing to do, or a rollout is already running). */
  private static emptyResult(alreadyRunning: boolean): RolloutResult {
    return { totalBots: 0, succeeded: 0, failed: 0, skipped: 0, aborted: false, alreadyRunning, results: [] };
  }

  /**
   * Update a single bot with volume snapshot + nuclear rollback.
   *
   * Never throws: an unexpected error is logged and converted into a failed
   * result whose `error` carries the message.
   */
  private async updateBot(profile: BotProfile): Promise<BotUpdateResult> {
    const snapshotIds: string[] = [];

    try {
      // Step 1: Snapshot volumes before update
      if (profile.volumeName) {
        try {
          const snap = await this.snapshotManager.snapshot(profile.volumeName);
          snapshotIds.push(snap.id);
          logger.info(`Pre-update snapshot for ${profile.id}: ${snap.id}`);
        } catch (err) {
          // Best-effort: a failed snapshot must not block the update itself.
          logger.warn(`Volume snapshot failed for ${profile.id} — proceeding without backup`, { err });
        }
      }

      // Step 2: Delegate to ContainerUpdater
      const result = await this.updater.updateBot(profile.id);

      if (result.success) {
        // Clean up snapshots on success
        await this.cleanupSnapshots(snapshotIds);
        return { ...result, volumeRestored: false };
      }

      // Step 3: Nuclear rollback — restore volumes if update failed
      const volumeRestored = await this.restoreVolumes(profile.id, snapshotIds);
      return { ...result, volumeRestored };
    } catch (err) {
      logger.error(`Unexpected error updating bot ${profile.id}`, { err });

      // Attempt volume restore on unexpected errors too
      const volumeRestored = await this.restoreVolumes(profile.id, snapshotIds);

      return {
        botId: profile.id,
        success: false,
        previousImage: profile.image,
        newImage: profile.image,
        previousDigest: null,
        newDigest: null,
        rolledBack: false,
        volumeRestored,
        error: err instanceof Error ? err.message : String(err),
      };
    }
  }

  /**
   * Handle a bot failure using the strategy's failure policy.
   *
   * The strategy's answer is honoured, except that a "retry" is downgraded to
   * "skip" once the bot has failed more than `strategy.maxRetries()` times —
   * otherwise a strategy that always answers "retry" would loop forever.
   */
  private handleFailure(
    botId: string,
    result: BotUpdateResult,
    allResults: BotUpdateResult[],
  ): "abort" | "skip" | "retry" {
    const error = new Error(result.error ?? "Unknown error");
    // allResults already contains the current attempt, so this is 1 on the first failure.
    const failCount = allResults.filter((r) => r.botId === botId && !r.success).length;
    const action = this.strategy.onBotFailure(botId, error, failCount);

    if (action === "retry" && failCount > this.strategy.maxRetries()) {
      logger.warn(`Bot ${botId} still failing after ${failCount} attempts — retry limit reached, skipping`);
      return "skip";
    }
    return action;
  }

  /**
   * Restore from the first snapshot that restores cleanly.
   *
   * @returns true when a restore succeeded; false when there were no
   *   snapshots or every restore attempt failed.
   */
  private async restoreVolumes(botId: string, snapshotIds: string[]): Promise<boolean> {
    if (snapshotIds.length === 0) return false;

    for (const id of snapshotIds) {
      try {
        await this.snapshotManager.restore(id);
        logger.info(`Volume restored for ${botId} from snapshot ${id}`);
        return true;
      } catch (err) {
        logger.error(`Volume restore failed for ${botId} snapshot ${id}`, { err });
      }
    }
    return false;
  }

  /** Delete the given snapshots; failures are logged and otherwise ignored. */
  private async cleanupSnapshots(snapshotIds: string[]): Promise<void> {
    for (const id of snapshotIds) {
      try {
        await this.snapshotManager.delete(id);
      } catch (err) {
        logger.warn(`Failed to clean up snapshot ${id}`, { err });
      }
    }
  }
}
259
+
260
/** Return a promise that resolves (with no value) after `ms` milliseconds. */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
@@ -32,6 +32,8 @@ import { SystemResourceMonitor } from "../observability/system-resources.js";
32
32
  // Stub re-exports so existing references compile; consumers must call initPlatformServices().
33
33
  // TODO: Replace with proper DI / service-locator pattern in platform-core.
34
34
  import { DrizzleTwoFactorRepository } from "../security/two-factor-repository.js";
35
+ import type { RolloutOrchestrator } from "./rollout-orchestrator.js";
36
+ import type { VolumeSnapshotManager } from "./volume-snapshot-manager.js";
35
37
 
36
38
  // Platform singletons (getAdminAuditLog, getCreditLedger, etc.) are wired by
37
39
  // the consuming application's own composition root (e.g. wopr-platform's
@@ -136,6 +138,8 @@ let _restoreLogStore: IRestoreLogStore | null = null;
136
138
  let _restoreService: RestoreService | null = null;
137
139
  let _backupStatusStore: IBackupStatusStore | null = null;
138
140
  let _snapshotManager: SnapshotManager | null = null;
141
+ let _volumeSnapshotManager: VolumeSnapshotManager | null = null;
142
+ let _rolloutOrchestrator: RolloutOrchestrator | null = null;
139
143
 
140
144
  const S3_BUCKET = process.env.S3_BUCKET || "wopr-backups";
141
145
 
@@ -537,6 +541,28 @@ export function getSnapshotManager(): SnapshotManager {
537
541
  return _snapshotManager;
538
542
  }
539
543
 
544
/**
 * Return the registered VolumeSnapshotManager.
 *
 * @throws Error when no manager has been registered yet.
 */
export function getVolumeSnapshotManager(): VolumeSnapshotManager {
  const manager = _volumeSnapshotManager;
  if (manager) return manager;
  throw new Error("VolumeSnapshotManager not initialized — call setVolumeSnapshotManager() first");
}
550
+
551
/** Register the VolumeSnapshotManager instance handed out by getVolumeSnapshotManager(). */
export function setVolumeSnapshotManager(mgr: VolumeSnapshotManager): void {
  _volumeSnapshotManager = mgr;
}
554
+
555
/**
 * Return the registered RolloutOrchestrator.
 *
 * @throws Error when no orchestrator has been registered yet.
 */
export function getRolloutOrchestrator(): RolloutOrchestrator {
  const orchestrator = _rolloutOrchestrator;
  if (orchestrator) return orchestrator;
  throw new Error("RolloutOrchestrator not initialized — call setRolloutOrchestrator() first");
}
561
+
562
/** Register the RolloutOrchestrator instance handed out by getRolloutOrchestrator(). */
export function setRolloutOrchestrator(orch: RolloutOrchestrator): void {
  _rolloutOrchestrator = orch;
}
565
+
540
566
  export function getRestoreService(): RestoreService {
541
567
  if (!_restoreService) {
542
568
  _restoreService = new RestoreService({
@@ -877,6 +903,8 @@ export function _resetForTest(): void {
877
903
  _restoreService = null;
878
904
  _backupStatusStore = null;
879
905
  _snapshotManager = null;
906
+ _volumeSnapshotManager = null;
907
+ _rolloutOrchestrator = null;
880
908
  _botBilling = null;
881
909
  _phoneNumberRepo = null;
882
910
  _affiliateRepo = null;