@wopr-network/platform-core 1.22.0 → 1.23.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,262 @@
1
+ import { beforeEach, describe, expect, it, vi } from "vitest";
2
+ import { RolloutOrchestrator } from "../rollout-orchestrator.js";
3
/**
 * Build a minimal bot profile fixture for the rollout tests.
 *
 * @param id Bot identifier; also used to derive the profile name (`bot-<id>`).
 * @param volumeName Optional volume name. When omitted the `volumeName` key is
 *   still present on the fixture, with the value `undefined`.
 * @returns A profile object with fixed tenant, image and policy fields.
 */
function makeProfile(id, volumeName) {
    const profile = {
        id: id,
        tenantId: "tenant-1",
        name: "bot-" + id,
        description: "",
        image: "ghcr.io/wopr-network/paperclip:managed",
        env: {},
        restartPolicy: "unless-stopped",
        releaseChannel: "stable",
        updatePolicy: "nightly",
        volumeName: volumeName,
    };
    return profile;
}
17
/**
 * Build a canned updater result for a bot.
 *
 * @param botId Bot the result belongs to.
 * @param success Whether the simulated update succeeded.
 * @returns A result whose `rolledBack` is the inverse of `success` and whose
 *   `error` is "Health check failed" on failure, `undefined` on success.
 */
function makeResult(botId, success) {
    const failed = !success;
    let error;
    if (failed) {
        error = "Health check failed";
    }
    return {
        botId,
        success,
        previousImage: "old:latest",
        newImage: "new:latest",
        previousDigest: "sha256:old",
        newDigest: "sha256:new",
        rolledBack: failed,
        error,
    };
}
29
/**
 * Create a fake updater whose `updateBot` is a vitest mock.
 *
 * @param results Map from bot id to the canned result to return for that bot.
 *   Bots without an entry get a successful result from `makeResult`.
 * @returns An object exposing only the mocked `updateBot` method.
 */
function mockUpdater(results) {
    const updateBot = vi.fn(async (botId) => {
        const canned = results.get(botId);
        return canned ?? makeResult(botId, true);
    });
    return { updateBot };
}
34
/**
 * Create a fake snapshot manager built from vitest mocks.
 *
 * `snapshot(volumeName)` resolves to a record whose id is `<volumeName>-snap`;
 * `restore` and `delete` resolve without doing anything.
 *
 * @returns An object with mocked `snapshot`, `restore` and `delete` methods.
 */
function mockSnapshotManager() {
    const snapshot = vi.fn(async (volumeName) => {
        const id = `${volumeName}-snap`;
        return {
            id,
            volumeName,
            archivePath: `/backup/${id}.tar`,
            createdAt: new Date(),
            sizeBytes: 1024,
        };
    });
    const restore = vi.fn(async () => { });
    const remove = vi.fn(async () => { });
    return { snapshot, restore, delete: remove };
}
47
/**
 * Build a rollout strategy stub with overridable defaults.
 *
 * Defaults: batches of at most two, no pause between waves, "skip" on failure,
 * two retries, and a 120 000 ms health-check timeout.
 *
 * @param overrides Members that replace the corresponding defaults.
 * @returns A new strategy object; the defaults are never shared between calls.
 */
function mockStrategy(overrides = {}) {
    const defaults = {
        nextBatch: (remaining) => remaining.slice(0, 2),
        pauseDuration: () => 0,
        onBotFailure: () => "skip",
        maxRetries: () => 2,
        healthCheckTimeout: () => 120_000,
    };
    return Object.assign({}, defaults, overrides);
}
57
+ describe("RolloutOrchestrator", () => {
58
+ let updater;
59
+ let snapMgr;
60
+ beforeEach(() => {
61
+ vi.clearAllMocks();
62
+ updater = mockUpdater(new Map());
63
+ snapMgr = mockSnapshotManager();
64
+ });
65
+ it("processes all bots in batches", async () => {
66
+ const profiles = [makeProfile("b1", "vol-1"), makeProfile("b2", "vol-2"), makeProfile("b3", "vol-3")];
67
+ const strategy = mockStrategy({ nextBatch: (r) => r.slice(0, 2) });
68
+ const orch = new RolloutOrchestrator({
69
+ updater,
70
+ snapshotManager: snapMgr,
71
+ strategy,
72
+ getUpdatableProfiles: async () => profiles,
73
+ });
74
+ const result = await orch.rollout();
75
+ expect(result.totalBots).toBe(3);
76
+ expect(result.succeeded).toBe(3);
77
+ expect(result.failed).toBe(0);
78
+ expect(result.aborted).toBe(false);
79
+ expect(updater.updateBot).toHaveBeenCalledTimes(3);
80
+ });
81
+ it("snapshots volumes before updating", async () => {
82
+ const profiles = [makeProfile("b1", "my-volume")];
83
+ const orch = new RolloutOrchestrator({
84
+ updater,
85
+ snapshotManager: snapMgr,
86
+ strategy: mockStrategy(),
87
+ getUpdatableProfiles: async () => profiles,
88
+ });
89
+ await orch.rollout();
90
+ expect(snapMgr.snapshot).toHaveBeenCalledWith("my-volume");
91
+ // On success, snapshot is cleaned up
92
+ expect(snapMgr.delete).toHaveBeenCalledWith("my-volume-snap");
93
+ });
94
+ it("skips snapshot for bots without volumes", async () => {
95
+ const profiles = [makeProfile("b1")]; // no volumeName
96
+ const orch = new RolloutOrchestrator({
97
+ updater,
98
+ snapshotManager: snapMgr,
99
+ strategy: mockStrategy(),
100
+ getUpdatableProfiles: async () => profiles,
101
+ });
102
+ await orch.rollout();
103
+ expect(snapMgr.snapshot).not.toHaveBeenCalled();
104
+ expect(updater.updateBot).toHaveBeenCalledWith("b1");
105
+ });
106
+ it("restores volumes on update failure", async () => {
107
+ const failResults = new Map([["b1", makeResult("b1", false)]]);
108
+ updater = mockUpdater(failResults);
109
+ const profiles = [makeProfile("b1", "my-volume")];
110
+ const orch = new RolloutOrchestrator({
111
+ updater,
112
+ snapshotManager: snapMgr,
113
+ strategy: mockStrategy(),
114
+ getUpdatableProfiles: async () => profiles,
115
+ });
116
+ const result = await orch.rollout();
117
+ expect(result.failed).toBe(1);
118
+ expect(result.results[0].volumeRestored).toBe(true);
119
+ expect(snapMgr.restore).toHaveBeenCalledWith("my-volume-snap");
120
+ // Snapshot NOT deleted on failure (restored instead)
121
+ expect(snapMgr.delete).not.toHaveBeenCalled();
122
+ });
123
+ it("aborts rollout when strategy says abort", async () => {
124
+ const failResults = new Map([["b1", makeResult("b1", false)]]);
125
+ updater = mockUpdater(failResults);
126
+ const profiles = [makeProfile("b1", "v1"), makeProfile("b2", "v2"), makeProfile("b3", "v3")];
127
+ const strategy = mockStrategy({
128
+ nextBatch: (r) => r.slice(0, 1),
129
+ onBotFailure: () => "abort",
130
+ });
131
+ const orch = new RolloutOrchestrator({
132
+ updater,
133
+ snapshotManager: snapMgr,
134
+ strategy,
135
+ getUpdatableProfiles: async () => profiles,
136
+ });
137
+ const result = await orch.rollout();
138
+ expect(result.aborted).toBe(true);
139
+ expect(result.succeeded).toBe(0);
140
+ expect(result.failed).toBe(1);
141
+ expect(result.skipped).toBe(2); // b2, b3 never processed
142
+ expect(updater.updateBot).toHaveBeenCalledTimes(1);
143
+ });
144
+ it("returns empty result when no bots to update", async () => {
145
+ const orch = new RolloutOrchestrator({
146
+ updater,
147
+ snapshotManager: snapMgr,
148
+ strategy: mockStrategy(),
149
+ getUpdatableProfiles: async () => [],
150
+ });
151
+ const result = await orch.rollout();
152
+ expect(result.totalBots).toBe(0);
153
+ expect(result.results).toHaveLength(0);
154
+ });
155
+ it("rejects concurrent rollouts", async () => {
156
+ const profiles = [makeProfile("b1")];
157
+ // Make updateBot slow
158
+ updater = {
159
+ updateBot: vi.fn(async (botId) => {
160
+ await new Promise((r) => setTimeout(r, 100));
161
+ return makeResult(botId, true);
162
+ }),
163
+ };
164
+ const orch = new RolloutOrchestrator({
165
+ updater,
166
+ snapshotManager: snapMgr,
167
+ strategy: mockStrategy(),
168
+ getUpdatableProfiles: async () => profiles,
169
+ });
170
+ const [r1, r2] = await Promise.all([orch.rollout(), orch.rollout()]);
171
+ // One succeeds, one is rejected as already running
172
+ const succeeded = [r1, r2].find((r) => r.totalBots > 0);
173
+ const rejected = [r1, r2].find((r) => r.alreadyRunning);
174
+ expect(succeeded).toBeDefined();
175
+ expect(rejected).toBeDefined();
176
+ expect(rejected?.alreadyRunning).toBe(true);
177
+ expect(rejected?.totalBots).toBe(0);
178
+ });
179
+ it("retries failed bots when strategy says retry", async () => {
180
+ let callCount = 0;
181
+ updater = {
182
+ updateBot: vi.fn(async (botId) => {
183
+ callCount++;
184
+ // Fail first attempt, succeed on retry
185
+ if (botId === "b1" && callCount === 1)
186
+ return makeResult("b1", false);
187
+ return makeResult(botId, true);
188
+ }),
189
+ };
190
+ const profiles = [makeProfile("b1")];
191
+ const strategy = mockStrategy({
192
+ nextBatch: (r) => r.slice(0, 1),
193
+ onBotFailure: (_botId, _err, attempt) => (attempt < 2 ? "retry" : "skip"),
194
+ });
195
+ const orch = new RolloutOrchestrator({
196
+ updater,
197
+ snapshotManager: snapMgr,
198
+ strategy,
199
+ getUpdatableProfiles: async () => profiles,
200
+ });
201
+ const result = await orch.rollout();
202
+ // b1 failed once, retried, succeeded
203
+ expect(updater.updateBot).toHaveBeenCalledTimes(2);
204
+ expect(result.succeeded).toBe(1);
205
+ expect(result.failed).toBe(1); // first attempt counted as failed
206
+ });
207
+ it("calls onBotUpdated callback for each bot", async () => {
208
+ const profiles = [makeProfile("b1"), makeProfile("b2")];
209
+ const onBotUpdated = vi.fn();
210
+ const orch = new RolloutOrchestrator({
211
+ updater,
212
+ snapshotManager: snapMgr,
213
+ strategy: mockStrategy(),
214
+ getUpdatableProfiles: async () => profiles,
215
+ onBotUpdated,
216
+ });
217
+ await orch.rollout();
218
+ expect(onBotUpdated).toHaveBeenCalledTimes(2);
219
+ });
220
+ it("calls onRolloutComplete callback", async () => {
221
+ const profiles = [makeProfile("b1")];
222
+ const onRolloutComplete = vi.fn();
223
+ const orch = new RolloutOrchestrator({
224
+ updater,
225
+ snapshotManager: snapMgr,
226
+ strategy: mockStrategy(),
227
+ getUpdatableProfiles: async () => profiles,
228
+ onRolloutComplete,
229
+ });
230
+ await orch.rollout();
231
+ expect(onRolloutComplete).toHaveBeenCalledTimes(1);
232
+ expect(onRolloutComplete).toHaveBeenCalledWith(expect.objectContaining({ totalBots: 1, succeeded: 1, aborted: false }));
233
+ });
234
+ it("continues on snapshot failure (best-effort)", async () => {
235
+ const profiles = [makeProfile("b1", "my-volume")];
236
+ snapMgr.snapshot = vi.fn().mockRejectedValue(new Error("disk full"));
237
+ const orch = new RolloutOrchestrator({
238
+ updater,
239
+ snapshotManager: snapMgr,
240
+ strategy: mockStrategy(),
241
+ getUpdatableProfiles: async () => profiles,
242
+ });
243
+ const result = await orch.rollout();
244
+ // Update still proceeds despite snapshot failure
245
+ expect(result.succeeded).toBe(1);
246
+ expect(updater.updateBot).toHaveBeenCalledWith("b1");
247
+ });
248
it("isRolling reflects rollout state", async () => {
    const profiles = [makeProfile("b1")];
    const orch = new RolloutOrchestrator({
        updater,
        snapshotManager: snapMgr,
        strategy: mockStrategy(),
        getUpdatableProfiles: async () => profiles,
    });
    // Idle before anything has been started.
    expect(orch.isRolling).toBe(false);
    const promise = orch.rollout();
    // rollout() raises its in-progress flag synchronously, before its first
    // await, so the flag is already observable here even with instant mocks.
    expect(orch.isRolling).toBe(true);
    await promise;
    // The flag is cleared once the rollout has settled.
    expect(orch.isRolling).toBe(false);
});
262
+ });
@@ -1,4 +1,5 @@
1
1
  export * from "./repository-types.js";
2
+ export * from "./rollout-orchestrator.js";
2
3
  export * from "./rollout-strategy.js";
3
4
  export * from "./services.js";
4
5
  export * from "./types.js";
@@ -1,4 +1,5 @@
1
1
  export * from "./repository-types.js";
2
+ export * from "./rollout-orchestrator.js";
2
3
  export * from "./rollout-strategy.js";
3
4
  export * from "./services.js";
4
5
  export * from "./types.js";
@@ -0,0 +1,69 @@
1
+ /**
2
+ * RolloutOrchestrator — coordinates fleet-wide container updates using
3
+ * pluggable rollout strategies and volume snapshots for nuclear rollback.
4
+ *
5
+ * Sits between ImagePoller (detects new digests) and ContainerUpdater
6
+ * (handles per-bot pull/stop/recreate/health). Adds:
7
+ * - Strategy-driven batching (rolling wave, single bot, immediate)
8
+ * - Pre-update volume snapshots via VolumeSnapshotManager
9
+ * - Volume restore on health check failure (nuclear rollback)
10
+ * - Per-tenant update orchestration
11
+ */
12
+ import type { IRolloutStrategy } from "./rollout-strategy.js";
13
+ import type { BotProfile } from "./types.js";
14
+ import type { ContainerUpdater, UpdateResult } from "./updater.js";
15
+ import type { VolumeSnapshotManager } from "./volume-snapshot-manager.js";
16
+ export interface RolloutOrchestratorDeps {
17
+ updater: ContainerUpdater;
18
+ snapshotManager: VolumeSnapshotManager;
19
+ strategy: IRolloutStrategy;
20
+ /** Resolve running profiles that need updating for a given image digest */
21
+ getUpdatableProfiles: () => Promise<BotProfile[]>;
22
+ /** Optional callback after each bot update (success or failure) */
23
+ onBotUpdated?: (result: UpdateResult & {
24
+ volumeRestored: boolean;
25
+ }) => void;
26
+ /** Optional callback when a rollout completes */
27
+ onRolloutComplete?: (results: RolloutResult) => void;
28
+ }
29
+ export interface BotUpdateResult extends UpdateResult {
30
+ volumeRestored: boolean;
31
+ }
32
+ export interface RolloutResult {
33
+ totalBots: number;
34
+ succeeded: number;
35
+ failed: number;
36
+ skipped: number;
37
+ aborted: boolean;
38
+ /** True when a concurrent rollout was already in progress */
39
+ alreadyRunning: boolean;
40
+ results: BotUpdateResult[];
41
+ }
42
+ export declare class RolloutOrchestrator {
43
+ private readonly updater;
44
+ private readonly snapshotManager;
45
+ private readonly strategy;
46
+ private readonly getUpdatableProfiles;
47
+ private readonly onBotUpdated?;
48
+ private readonly onRolloutComplete?;
49
+ private rolling;
50
+ constructor(deps: RolloutOrchestratorDeps);
51
+ /** Whether a rollout is currently in progress. */
52
+ get isRolling(): boolean;
53
+ /**
54
+ * Execute a rollout across all updatable bots.
55
+ * Uses the configured strategy for batching, pausing, and failure handling.
56
+ */
57
+ rollout(): Promise<RolloutResult>;
58
+ /**
59
+ * Update a single bot with volume snapshot + nuclear rollback.
60
+ */
61
+ private updateBot;
62
+ /**
63
+ * Handle a bot failure using the strategy's failure policy.
64
+ * Retries the update up to maxRetries before escalating.
65
+ */
66
+ private handleFailure;
67
+ private restoreVolumes;
68
+ private cleanupSnapshots;
69
+ }
@@ -0,0 +1,204 @@
1
+ /**
2
+ * RolloutOrchestrator — coordinates fleet-wide container updates using
3
+ * pluggable rollout strategies and volume snapshots for nuclear rollback.
4
+ *
5
+ * Sits between ImagePoller (detects new digests) and ContainerUpdater
6
+ * (handles per-bot pull/stop/recreate/health). Adds:
7
+ * - Strategy-driven batching (rolling wave, single bot, immediate)
8
+ * - Pre-update volume snapshots via VolumeSnapshotManager
9
+ * - Volume restore on health check failure (nuclear rollback)
10
+ * - Per-tenant update orchestration
11
+ */
12
+ import { logger } from "../config/logger.js";
13
export class RolloutOrchestrator {
    updater;
    snapshotManager;
    strategy;
    getUpdatableProfiles;
    onBotUpdated;
    onRolloutComplete;
    /** Re-entrancy guard: true from the start of rollout() until it settles. */
    rolling = false;
    /**
     * @param deps Collaborators: the per-bot updater, the volume snapshot
     *   manager, the rollout strategy, a resolver for the profiles to update,
     *   and the two optional progress callbacks.
     */
    constructor(deps) {
        this.updater = deps.updater;
        this.snapshotManager = deps.snapshotManager;
        this.strategy = deps.strategy;
        this.getUpdatableProfiles = deps.getUpdatableProfiles;
        this.onBotUpdated = deps.onBotUpdated;
        this.onRolloutComplete = deps.onRolloutComplete;
    }
    /** Whether a rollout is currently in progress. */
    get isRolling() {
        return this.rolling;
    }
    /**
     * Execute a rollout across all updatable bots.
     * Uses the configured strategy for batching, pausing, and failure handling.
     *
     * In the returned summary `succeeded` and `failed` count update attempts
     * (a retried bot contributes one entry per attempt), while `skipped` counts
     * bots that were never attempted at all.
     *
     * @returns The rollout summary. When another rollout is already running the
     *   call returns immediately with `alreadyRunning: true` and zero counts.
     */
    async rollout() {
        if (this.rolling) {
            logger.warn("Rollout already in progress — skipping");
            return { totalBots: 0, succeeded: 0, failed: 0, skipped: 0, aborted: false, alreadyRunning: true, results: [] };
        }
        this.rolling = true;
        const allResults = [];
        // Profiles that received at least one update attempt. Tracked separately
        // from allResults because retries add several results for the same bot.
        const attempted = new Set();
        let aborted = false;
        try {
            let remaining = await this.getUpdatableProfiles();
            const totalBots = remaining.length;
            if (totalBots === 0) {
                logger.info("Rollout: no bots to update");
                return {
                    totalBots: 0,
                    succeeded: 0,
                    failed: 0,
                    skipped: 0,
                    aborted: false,
                    alreadyRunning: false,
                    results: [],
                };
            }
            logger.info(`Rollout starting: ${totalBots} bots to update`);
            while (remaining.length > 0 && !aborted) {
                const batch = this.strategy.nextBatch(remaining);
                // An empty batch would never make progress, so stop instead of spinning.
                if (batch.length === 0)
                    break;
                logger.info(`Rollout wave: ${batch.length} bots (${remaining.length} remaining)`);
                // Process batch — each bot sequentially within a wave for safety
                const retryProfiles = [];
                for (const profile of batch) {
                    if (aborted)
                        break;
                    attempted.add(profile);
                    const result = await this.updateBot(profile);
                    allResults.push(result);
                    this.onBotUpdated?.(result);
                    if (!result.success) {
                        const action = this.handleFailure(profile.id, result, allResults);
                        if (action === "abort") {
                            aborted = true;
                            logger.warn(`Rollout aborted after bot ${profile.id} failure`);
                        }
                        else if (action === "retry") {
                            retryProfiles.push(profile);
                        }
                        // "skip" → don't re-add, bot is dropped
                    }
                }
                // Remove the bots handed out in this wave, then queue the retries
                // at the end so they run in a later wave.
                const processedIds = new Set(batch.map((b) => b.id));
                remaining = [...remaining.filter((b) => !processedIds.has(b.id)), ...retryProfiles];
                // Pause between waves (unless aborted or done)
                if (remaining.length > 0 && !aborted) {
                    const pause = this.strategy.pauseDuration();
                    if (pause > 0) {
                        logger.info(`Rollout: pausing ${pause}ms before next wave`);
                        await sleep(pause);
                    }
                }
            }
            const succeeded = allResults.filter((r) => r.success).length;
            const failed = allResults.filter((r) => !r.success).length;
            // Count bots that were never attempted. Subtracting allResults.length
            // would go negative as soon as any bot is retried.
            const skipped = totalBots - attempted.size;
            const rolloutResult = {
                totalBots,
                succeeded,
                failed,
                skipped,
                aborted,
                alreadyRunning: false,
                results: allResults,
            };
            logger.info(`Rollout complete: ${succeeded} succeeded, ${failed} failed, ${skipped} skipped, aborted=${aborted}`);
            this.onRolloutComplete?.(rolloutResult);
            return rolloutResult;
        }
        finally {
            // Always release the guard, including when a dependency throws.
            this.rolling = false;
        }
    }
    /**
     * Update a single bot with volume snapshot + nuclear rollback.
     *
     * @param profile Profile of the bot to update; `volumeName` (when set) is
     *   snapshotted first and `image` is reported if the update throws.
     * @returns The updater's result extended with `volumeRestored`. Never
     *   rejects: unexpected errors are converted into a failed result.
     */
    async updateBot(profile) {
        const snapshotIds = [];
        try {
            // Step 1: Snapshot volumes before update
            if (profile.volumeName) {
                try {
                    const snap = await this.snapshotManager.snapshot(profile.volumeName);
                    snapshotIds.push(snap.id);
                    logger.info(`Pre-update snapshot for ${profile.id}: ${snap.id}`);
                }
                catch (err) {
                    // Best effort: a failed snapshot must not block the update.
                    logger.warn(`Volume snapshot failed for ${profile.id} — proceeding without backup`, { err });
                }
            }
            // Step 2: Delegate to ContainerUpdater
            const result = await this.updater.updateBot(profile.id);
            if (result.success) {
                // Clean up snapshots on success
                await this.cleanupSnapshots(snapshotIds);
                return { ...result, volumeRestored: false };
            }
            // Step 3: Nuclear rollback — restore volumes if update failed
            const volumeRestored = await this.restoreVolumes(profile.id, snapshotIds);
            return { ...result, volumeRestored };
        }
        catch (err) {
            logger.error(`Unexpected error updating bot ${profile.id}`, { err });
            // Attempt volume restore on unexpected errors too
            const volumeRestored = await this.restoreVolumes(profile.id, snapshotIds);
            return {
                botId: profile.id,
                success: false,
                previousImage: profile.image,
                newImage: profile.image,
                previousDigest: null,
                newDigest: null,
                rolledBack: false,
                volumeRestored,
                error: err instanceof Error ? err.message : String(err),
            };
        }
    }
    /**
     * Handle a bot failure using the strategy's failure policy.
     * A "retry" decision is honoured only while the bot has used fewer than
     * `strategy.maxRetries()` retries; after that the bot is skipped so a
     * strategy that always answers "retry" cannot loop forever.
     *
     * @param botId Bot that just failed.
     * @param result The failed attempt; its `error` text is passed to the strategy.
     * @param allResults Every result so far, used to count this bot's failures.
     * @returns The action to take: "retry", "skip" or "abort".
     */
    handleFailure(botId, result, allResults) {
        const error = new Error(result.error ?? "Unknown error");
        const failCount = allResults.filter((r) => r.botId === botId && !r.success).length;
        const action = this.strategy.onBotFailure(botId, error, failCount);
        if (action !== "retry") {
            return action;
        }
        // Strategies without maxRetries keep the previous, uncapped behaviour.
        const maxRetries = typeof this.strategy.maxRetries === "function" ? this.strategy.maxRetries() : Infinity;
        // failCount failures means failCount - 1 retries have already been used.
        if (failCount > maxRetries) {
            logger.warn(`Bot ${botId} failed ${failCount} times (max retries: ${maxRetries}) — skipping`);
            return "skip";
        }
        return "retry";
    }
    /**
     * Restore a bot's volume from the given snapshots.
     * Snapshots are tried in order and the method stops at the first restore
     * that succeeds.
     *
     * @returns true if a restore succeeded; false if there were no snapshots
     *   or every restore failed.
     */
    async restoreVolumes(botId, snapshotIds) {
        if (snapshotIds.length === 0)
            return false;
        for (const id of snapshotIds) {
            try {
                await this.snapshotManager.restore(id);
                logger.info(`Volume restored for ${botId} from snapshot ${id}`);
                return true;
            }
            catch (err) {
                logger.error(`Volume restore failed for ${botId} snapshot ${id}`, { err });
            }
        }
        return false;
    }
    /**
     * Delete the given snapshots. Failures are logged and otherwise ignored so
     * that cleanup problems never fail an update that already succeeded.
     */
    async cleanupSnapshots(snapshotIds) {
        for (const id of snapshotIds) {
            try {
                await this.snapshotManager.delete(id);
            }
            catch (err) {
                logger.warn(`Failed to clean up snapshot ${id}`, { err });
            }
        }
    }
}
202
/**
 * Resolve after a delay.
 *
 * @param ms Delay in milliseconds, passed straight to `setTimeout`.
 * @returns A promise that resolves once the timer fires.
 */
function sleep(ms) {
    return new Promise((done) => {
        setTimeout(done, ms);
    });
}
@@ -14,6 +14,8 @@ import type { IAffiliateRepository } from "../monetization/affiliate/drizzle-aff
14
14
  import type { IBotBilling } from "../monetization/credits/bot-billing.js";
15
15
  import type { IPhoneNumberRepository } from "../monetization/credits/drizzle-phone-number-repository.js";
16
16
  import { SystemResourceMonitor } from "../observability/system-resources.js";
17
+ import type { RolloutOrchestrator } from "./rollout-orchestrator.js";
18
+ import type { VolumeSnapshotManager } from "./volume-snapshot-manager.js";
17
19
  import { AdminNotifier } from "./admin-notifier.js";
18
20
  import type { IBotInstanceRepository } from "./bot-instance-repository.js";
19
21
  import type { IBotProfileRepository } from "./bot-profile-repository.js";
@@ -79,6 +81,10 @@ export declare function getCapacityPolicy(configOverrides?: Partial<CapacityPoli
79
81
  export declare function getRestoreLogStore(): IRestoreLogStore;
80
82
  export declare function getBackupStatusStore(): IBackupStatusStore;
81
83
  export declare function getSnapshotManager(): SnapshotManager;
84
+ export declare function getVolumeSnapshotManager(): VolumeSnapshotManager;
85
+ export declare function setVolumeSnapshotManager(mgr: VolumeSnapshotManager): void;
86
+ export declare function getRolloutOrchestrator(): RolloutOrchestrator;
87
+ export declare function setRolloutOrchestrator(orch: RolloutOrchestrator): void;
82
88
  export declare function getRestoreService(): RestoreService;
83
89
  /** Call once at server startup to wire up fleet services. */
84
90
  export declare function initFleet(): void;
@@ -105,6 +105,8 @@ let _restoreLogStore = null;
105
105
  let _restoreService = null;
106
106
  let _backupStatusStore = null;
107
107
  let _snapshotManager = null;
108
+ let _volumeSnapshotManager = null;
109
+ let _rolloutOrchestrator = null;
108
110
  const S3_BUCKET = process.env.S3_BUCKET || "wopr-backups";
109
111
  function envInt(key, fallback) {
110
112
  const raw = process.env[key];
@@ -427,6 +429,24 @@ export function getSnapshotManager() {
427
429
  }
428
430
  return _snapshotManager;
429
431
  }
432
/**
 * Return the registered VolumeSnapshotManager.
 *
 * @throws Error if no manager has been registered via setVolumeSnapshotManager().
 */
export function getVolumeSnapshotManager() {
    if (_volumeSnapshotManager) {
        return _volumeSnapshotManager;
    }
    throw new Error("VolumeSnapshotManager not initialized — call setVolumeSnapshotManager() first");
}
438
/**
 * Register the VolumeSnapshotManager returned by getVolumeSnapshotManager().
 *
 * @param mgr Manager instance to store; replaces any previously stored one.
 */
export function setVolumeSnapshotManager(mgr) {
    _volumeSnapshotManager = mgr;
}
441
/**
 * Return the registered RolloutOrchestrator.
 *
 * @throws Error if no orchestrator has been registered via setRolloutOrchestrator().
 */
export function getRolloutOrchestrator() {
    if (_rolloutOrchestrator) {
        return _rolloutOrchestrator;
    }
    throw new Error("RolloutOrchestrator not initialized — call setRolloutOrchestrator() first");
}
447
/**
 * Register the RolloutOrchestrator returned by getRolloutOrchestrator().
 *
 * @param orch Orchestrator instance to store; replaces any previously stored one.
 */
export function setRolloutOrchestrator(orch) {
    _rolloutOrchestrator = orch;
}
430
450
  export function getRestoreService() {
431
451
  if (!_restoreService) {
432
452
  _restoreService = new RestoreService({
@@ -683,6 +703,8 @@ export function _resetForTest() {
683
703
  _restoreService = null;
684
704
  _backupStatusStore = null;
685
705
  _snapshotManager = null;
706
+ _volumeSnapshotManager = null;
707
+ _rolloutOrchestrator = null;
686
708
  _botBilling = null;
687
709
  _phoneNumberRepo = null;
688
710
  _affiliateRepo = null;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@wopr-network/platform-core",
3
- "version": "1.22.0",
3
+ "version": "1.23.0",
4
4
  "type": "module",
5
5
  "main": "dist/index.js",
6
6
  "types": "dist/index.d.ts",
@@ -0,0 +1,321 @@
1
+ import { beforeEach, describe, expect, it, vi } from "vitest";
2
+ import { RolloutOrchestrator } from "../rollout-orchestrator.js";
3
+ import type { IRolloutStrategy } from "../rollout-strategy.js";
4
+ import type { BotProfile } from "../types.js";
5
+ import type { ContainerUpdater, UpdateResult } from "../updater.js";
6
+ import type { VolumeSnapshotManager } from "../volume-snapshot-manager.js";
7
+
8
/**
 * Build a minimal bot profile fixture for the rollout tests.
 *
 * @param id Bot identifier; also used to derive the profile name (`bot-<id>`).
 * @param volumeName Optional volume name. When omitted the `volumeName` key is
 *   still present on the fixture, with the value `undefined`.
 * @returns The fixture, cast to BotProfile.
 */
function makeProfile(id: string, volumeName?: string): BotProfile {
  const fixture = {
    id: id,
    tenantId: "tenant-1",
    name: "bot-" + id,
    description: "",
    image: "ghcr.io/wopr-network/paperclip:managed",
    env: {},
    restartPolicy: "unless-stopped",
    releaseChannel: "stable",
    updatePolicy: "nightly",
    volumeName: volumeName,
  };
  return fixture as BotProfile;
}
22
+
23
/**
 * Build a canned updater result for a bot.
 *
 * @param botId Bot the result belongs to.
 * @param success Whether the simulated update succeeded.
 * @returns A result whose `rolledBack` is the inverse of `success` and whose
 *   `error` is "Health check failed" on failure, `undefined` on success.
 */
function makeResult(botId: string, success: boolean): UpdateResult {
  const failed = !success;
  return {
    botId,
    success,
    previousImage: "old:latest",
    newImage: "new:latest",
    previousDigest: "sha256:old",
    newDigest: "sha256:new",
    rolledBack: failed,
    error: failed ? "Health check failed" : undefined,
  };
}
35
+
36
/**
 * Create a fake ContainerUpdater whose `updateBot` is a vitest mock.
 *
 * @param results Map from bot id to the canned result to return for that bot.
 *   Bots without an entry get a successful result from `makeResult`.
 * @returns An object exposing only the mocked `updateBot`, cast to ContainerUpdater.
 */
function mockUpdater(results: Map<string, UpdateResult>): ContainerUpdater {
  const updateBot = vi.fn(async (botId: string) => {
    const canned = results.get(botId);
    return canned ?? makeResult(botId, true);
  });
  return { updateBot } as unknown as ContainerUpdater;
}
41
+
42
/**
 * Create a fake VolumeSnapshotManager built from vitest mocks.
 *
 * `snapshot(volumeName)` resolves to a record whose id is `<volumeName>-snap`;
 * `restore` and `delete` resolve without doing anything.
 *
 * @returns An object with mocked `snapshot`, `restore` and `delete`, cast to
 *   VolumeSnapshotManager.
 */
function mockSnapshotManager(): VolumeSnapshotManager {
  const snapshot = vi.fn(async (volumeName: string) => {
    const id = `${volumeName}-snap`;
    return {
      id,
      volumeName,
      archivePath: `/backup/${id}.tar`,
      createdAt: new Date(),
      sizeBytes: 1024,
    };
  });
  const restore = vi.fn(async () => {});
  const remove = vi.fn(async () => {});
  return { snapshot, restore, delete: remove } as unknown as VolumeSnapshotManager;
}
55
+
56
/**
 * Build a rollout strategy stub with overridable defaults.
 *
 * Defaults: batches of at most two, no pause between waves, "skip" on failure,
 * two retries, and a 120 000 ms health-check timeout.
 *
 * @param overrides Members that replace the corresponding defaults.
 * @returns A new strategy object; the defaults are never shared between calls.
 */
function mockStrategy(overrides: Partial<IRolloutStrategy> = {}): IRolloutStrategy {
  const defaults: IRolloutStrategy = {
    nextBatch: (remaining) => remaining.slice(0, 2),
    pauseDuration: () => 0,
    onBotFailure: () => "skip",
    maxRetries: () => 2,
    healthCheckTimeout: () => 120_000,
  };
  return Object.assign({}, defaults, overrides);
}
66
+
67
+ describe("RolloutOrchestrator", () => {
68
+ let updater: ReturnType<typeof mockUpdater>;
69
+ let snapMgr: ReturnType<typeof mockSnapshotManager>;
70
+
71
+ beforeEach(() => {
72
+ vi.clearAllMocks();
73
+ updater = mockUpdater(new Map());
74
+ snapMgr = mockSnapshotManager();
75
+ });
76
+
77
+ it("processes all bots in batches", async () => {
78
+ const profiles = [makeProfile("b1", "vol-1"), makeProfile("b2", "vol-2"), makeProfile("b3", "vol-3")];
79
+ const strategy = mockStrategy({ nextBatch: (r) => r.slice(0, 2) });
80
+
81
+ const orch = new RolloutOrchestrator({
82
+ updater,
83
+ snapshotManager: snapMgr,
84
+ strategy,
85
+ getUpdatableProfiles: async () => profiles,
86
+ });
87
+
88
+ const result = await orch.rollout();
89
+
90
+ expect(result.totalBots).toBe(3);
91
+ expect(result.succeeded).toBe(3);
92
+ expect(result.failed).toBe(0);
93
+ expect(result.aborted).toBe(false);
94
+ expect(updater.updateBot).toHaveBeenCalledTimes(3);
95
+ });
96
+
97
+ it("snapshots volumes before updating", async () => {
98
+ const profiles = [makeProfile("b1", "my-volume")];
99
+
100
+ const orch = new RolloutOrchestrator({
101
+ updater,
102
+ snapshotManager: snapMgr,
103
+ strategy: mockStrategy(),
104
+ getUpdatableProfiles: async () => profiles,
105
+ });
106
+
107
+ await orch.rollout();
108
+
109
+ expect(snapMgr.snapshot).toHaveBeenCalledWith("my-volume");
110
+ // On success, snapshot is cleaned up
111
+ expect(snapMgr.delete).toHaveBeenCalledWith("my-volume-snap");
112
+ });
113
+
114
+ it("skips snapshot for bots without volumes", async () => {
115
+ const profiles = [makeProfile("b1")]; // no volumeName
116
+
117
+ const orch = new RolloutOrchestrator({
118
+ updater,
119
+ snapshotManager: snapMgr,
120
+ strategy: mockStrategy(),
121
+ getUpdatableProfiles: async () => profiles,
122
+ });
123
+
124
+ await orch.rollout();
125
+
126
+ expect(snapMgr.snapshot).not.toHaveBeenCalled();
127
+ expect(updater.updateBot).toHaveBeenCalledWith("b1");
128
+ });
129
+
130
+ it("restores volumes on update failure", async () => {
131
+ const failResults = new Map([["b1", makeResult("b1", false)]]);
132
+ updater = mockUpdater(failResults);
133
+ const profiles = [makeProfile("b1", "my-volume")];
134
+
135
+ const orch = new RolloutOrchestrator({
136
+ updater,
137
+ snapshotManager: snapMgr,
138
+ strategy: mockStrategy(),
139
+ getUpdatableProfiles: async () => profiles,
140
+ });
141
+
142
+ const result = await orch.rollout();
143
+
144
+ expect(result.failed).toBe(1);
145
+ expect(result.results[0].volumeRestored).toBe(true);
146
+ expect(snapMgr.restore).toHaveBeenCalledWith("my-volume-snap");
147
+ // Snapshot NOT deleted on failure (restored instead)
148
+ expect(snapMgr.delete).not.toHaveBeenCalled();
149
+ });
150
+
151
+ it("aborts rollout when strategy says abort", async () => {
152
+ const failResults = new Map([["b1", makeResult("b1", false)]]);
153
+ updater = mockUpdater(failResults);
154
+ const profiles = [makeProfile("b1", "v1"), makeProfile("b2", "v2"), makeProfile("b3", "v3")];
155
+ const strategy = mockStrategy({
156
+ nextBatch: (r) => r.slice(0, 1),
157
+ onBotFailure: () => "abort",
158
+ });
159
+
160
+ const orch = new RolloutOrchestrator({
161
+ updater,
162
+ snapshotManager: snapMgr,
163
+ strategy,
164
+ getUpdatableProfiles: async () => profiles,
165
+ });
166
+
167
+ const result = await orch.rollout();
168
+
169
+ expect(result.aborted).toBe(true);
170
+ expect(result.succeeded).toBe(0);
171
+ expect(result.failed).toBe(1);
172
+ expect(result.skipped).toBe(2); // b2, b3 never processed
173
+ expect(updater.updateBot).toHaveBeenCalledTimes(1);
174
+ });
175
+
176
+ it("returns empty result when no bots to update", async () => {
177
+ const orch = new RolloutOrchestrator({
178
+ updater,
179
+ snapshotManager: snapMgr,
180
+ strategy: mockStrategy(),
181
+ getUpdatableProfiles: async () => [],
182
+ });
183
+
184
+ const result = await orch.rollout();
185
+
186
+ expect(result.totalBots).toBe(0);
187
+ expect(result.results).toHaveLength(0);
188
+ });
189
+
190
+ it("rejects concurrent rollouts", async () => {
191
+ const profiles = [makeProfile("b1")];
192
+ // Make updateBot slow
193
+ updater = {
194
+ updateBot: vi.fn(async (botId: string) => {
195
+ await new Promise((r) => setTimeout(r, 100));
196
+ return makeResult(botId, true);
197
+ }),
198
+ } as unknown as ContainerUpdater;
199
+
200
+ const orch = new RolloutOrchestrator({
201
+ updater,
202
+ snapshotManager: snapMgr,
203
+ strategy: mockStrategy(),
204
+ getUpdatableProfiles: async () => profiles,
205
+ });
206
+
207
+ const [r1, r2] = await Promise.all([orch.rollout(), orch.rollout()]);
208
+
209
+ // One succeeds, one is rejected as already running
210
+ const succeeded = [r1, r2].find((r) => r.totalBots > 0);
211
+ const rejected = [r1, r2].find((r) => r.alreadyRunning);
212
+ expect(succeeded).toBeDefined();
213
+ expect(rejected).toBeDefined();
214
+ expect(rejected?.alreadyRunning).toBe(true);
215
+ expect(rejected?.totalBots).toBe(0);
216
+ });
217
+
218
+ it("retries failed bots when strategy says retry", async () => {
219
+ let callCount = 0;
220
+ updater = {
221
+ updateBot: vi.fn(async (botId: string) => {
222
+ callCount++;
223
+ // Fail first attempt, succeed on retry
224
+ if (botId === "b1" && callCount === 1) return makeResult("b1", false);
225
+ return makeResult(botId, true);
226
+ }),
227
+ } as unknown as ContainerUpdater;
228
+
229
+ const profiles = [makeProfile("b1")];
230
+ const strategy = mockStrategy({
231
+ nextBatch: (r) => r.slice(0, 1),
232
+ onBotFailure: (_botId, _err, attempt) => (attempt < 2 ? "retry" : "skip"),
233
+ });
234
+
235
+ const orch = new RolloutOrchestrator({
236
+ updater,
237
+ snapshotManager: snapMgr,
238
+ strategy,
239
+ getUpdatableProfiles: async () => profiles,
240
+ });
241
+
242
+ const result = await orch.rollout();
243
+
244
+ // b1 failed once, retried, succeeded
245
+ expect(updater.updateBot).toHaveBeenCalledTimes(2);
246
+ expect(result.succeeded).toBe(1);
247
+ expect(result.failed).toBe(1); // first attempt counted as failed
248
+ });
249
+
250
+ it("calls onBotUpdated callback for each bot", async () => {
251
+ const profiles = [makeProfile("b1"), makeProfile("b2")];
252
+ const onBotUpdated = vi.fn();
253
+
254
+ const orch = new RolloutOrchestrator({
255
+ updater,
256
+ snapshotManager: snapMgr,
257
+ strategy: mockStrategy(),
258
+ getUpdatableProfiles: async () => profiles,
259
+ onBotUpdated,
260
+ });
261
+
262
+ await orch.rollout();
263
+
264
+ expect(onBotUpdated).toHaveBeenCalledTimes(2);
265
+ });
266
+
267
+ it("calls onRolloutComplete callback", async () => {
268
+ const profiles = [makeProfile("b1")];
269
+ const onRolloutComplete = vi.fn();
270
+
271
+ const orch = new RolloutOrchestrator({
272
+ updater,
273
+ snapshotManager: snapMgr,
274
+ strategy: mockStrategy(),
275
+ getUpdatableProfiles: async () => profiles,
276
+ onRolloutComplete,
277
+ });
278
+
279
+ await orch.rollout();
280
+
281
+ expect(onRolloutComplete).toHaveBeenCalledTimes(1);
282
+ expect(onRolloutComplete).toHaveBeenCalledWith(
283
+ expect.objectContaining({ totalBots: 1, succeeded: 1, aborted: false }),
284
+ );
285
+ });
286
+
287
+ it("continues on snapshot failure (best-effort)", async () => {
288
+ const profiles = [makeProfile("b1", "my-volume")];
289
+ snapMgr.snapshot = vi.fn().mockRejectedValue(new Error("disk full"));
290
+
291
+ const orch = new RolloutOrchestrator({
292
+ updater,
293
+ snapshotManager: snapMgr,
294
+ strategy: mockStrategy(),
295
+ getUpdatableProfiles: async () => profiles,
296
+ });
297
+
298
+ const result = await orch.rollout();
299
+
300
+ // Update still proceeds despite snapshot failure
301
+ expect(result.succeeded).toBe(1);
302
+ expect(updater.updateBot).toHaveBeenCalledWith("b1");
303
+ });
304
+
305
it("isRolling reflects rollout state", async () => {
  const profiles = [makeProfile("b1")];

  const orch = new RolloutOrchestrator({
    updater,
    snapshotManager: snapMgr,
    strategy: mockStrategy(),
    getUpdatableProfiles: async () => profiles,
  });

  // Idle before anything has been started.
  expect(orch.isRolling).toBe(false);

  const promise = orch.rollout();
  // rollout() raises its in-progress flag synchronously, before its first
  // await, so the flag is already observable here even with instant mocks.
  expect(orch.isRolling).toBe(true);

  await promise;
  // The flag is cleared once the rollout has settled.
  expect(orch.isRolling).toBe(false);
});
321
+ });
@@ -1,4 +1,5 @@
1
1
  export * from "./repository-types.js";
2
+ export * from "./rollout-orchestrator.js";
2
3
  export * from "./rollout-strategy.js";
3
4
  export * from "./services.js";
4
5
  export * from "./types.js";
@@ -0,0 +1,262 @@
1
+ /**
2
+ * RolloutOrchestrator — coordinates fleet-wide container updates using
3
+ * pluggable rollout strategies and volume snapshots for nuclear rollback.
4
+ *
5
+ * Sits between ImagePoller (detects new digests) and ContainerUpdater
6
+ * (handles per-bot pull/stop/recreate/health). Adds:
7
+ * - Strategy-driven batching (rolling wave, single bot, immediate)
8
+ * - Pre-update volume snapshots via VolumeSnapshotManager
9
+ * - Volume restore on health check failure (nuclear rollback)
10
+ * - Per-tenant update orchestration
11
+ */
12
+
13
+ import { logger } from "../config/logger.js";
14
+ import type { IRolloutStrategy } from "./rollout-strategy.js";
15
+ import type { BotProfile } from "./types.js";
16
+ import type { ContainerUpdater, UpdateResult } from "./updater.js";
17
+ import type { VolumeSnapshotManager } from "./volume-snapshot-manager.js";
18
+
19
/**
 * Collaborators and optional hooks injected into the RolloutOrchestrator.
 */
export interface RolloutOrchestratorDeps {
  /** Performs the actual per-bot update; the orchestrator calls `updateBot(profile.id)`. */
  updater: ContainerUpdater;
  /** Takes, restores and deletes the pre-update volume snapshots. */
  snapshotManager: VolumeSnapshotManager;
  /** Decides batching, the pause between waves, and what to do when a bot fails. */
  strategy: IRolloutStrategy;
  /**
   * Resolve the bot profiles that should be updated.
   * Called once, without arguments, at the start of every rollout.
   */
  getUpdatableProfiles: () => Promise<BotProfile[]>;
  /** Optional callback after each bot update attempt (success or failure) */
  onBotUpdated?: (result: BotUpdateResult) => void;
  /** Optional callback when a rollout completes */
  onRolloutComplete?: (results: RolloutResult) => void;
}
30
+
31
/**
 * Result of one bot update attempt as reported by the updater, extended with
 * the outcome of the volume rollback.
 */
export interface BotUpdateResult extends UpdateResult {
  /** True when a pre-update volume snapshot was restored after the update failed. */
  volumeRestored: boolean;
}
34
+
35
/**
 * Summary returned by (and passed to the completion hook of) a rollout.
 */
export interface RolloutResult {
  /** Number of profiles the rollout set out to update. */
  totalBots: number;
  /** Count of successful updates. */
  succeeded: number;
  /** Count of failed updates. */
  failed: number;
  /** Count of bots that received no update, e.g. because the rollout stopped first. */
  skipped: number;
  /** True when the strategy's failure policy aborted the rollout. */
  aborted: boolean;
  /** True when a concurrent rollout was already in progress */
  alreadyRunning: boolean;
  /** Individual update results, in the order the updates ran. */
  results: BotUpdateResult[];
}
45
+
46
/**
 * Coordinates a fleet-wide update: asks the strategy for waves of bots,
 * snapshots each bot's volume before delegating to the updater, restores the
 * volume when the update fails, and reports a summary at the end.
 *
 * Only one rollout runs at a time per instance; a concurrent call returns
 * immediately with `alreadyRunning: true`.
 */
export class RolloutOrchestrator {
  private readonly updater: ContainerUpdater;
  private readonly snapshotManager: VolumeSnapshotManager;
  private readonly strategy: IRolloutStrategy;
  private readonly getUpdatableProfiles: () => Promise<BotProfile[]>;
  private readonly onBotUpdated?: (result: BotUpdateResult) => void;
  private readonly onRolloutComplete?: (results: RolloutResult) => void;
  /** Re-entrancy guard: true from the start of rollout() until it settles. */
  private rolling = false;

  constructor(deps: RolloutOrchestratorDeps) {
    this.updater = deps.updater;
    this.snapshotManager = deps.snapshotManager;
    this.strategy = deps.strategy;
    this.getUpdatableProfiles = deps.getUpdatableProfiles;
    this.onBotUpdated = deps.onBotUpdated;
    this.onRolloutComplete = deps.onRolloutComplete;
  }

  /** Whether a rollout is currently in progress. */
  get isRolling(): boolean {
    return this.rolling;
  }

  /**
   * Execute a rollout across all updatable bots.
   * Uses the configured strategy for batching, pausing, and failure handling.
   *
   * @returns A summary of the rollout. `results` holds one entry per update
   *   attempt (a retried bot appears more than once), while `succeeded`,
   *   `failed` and `skipped` count bots, judged by each bot's final attempt.
   *   `onRolloutComplete` is not invoked for the "already running" and
   *   "nothing to update" early returns.
   */
  async rollout(): Promise<RolloutResult> {
    if (this.rolling) {
      logger.warn("Rollout already in progress — skipping");
      return { totalBots: 0, succeeded: 0, failed: 0, skipped: 0, aborted: false, alreadyRunning: true, results: [] };
    }

    // Set synchronously (before the first await) so overlapping calls are rejected.
    this.rolling = true;
    const allResults: BotUpdateResult[] = [];
    let aborted = false;

    try {
      let remaining = await this.getUpdatableProfiles();
      const totalBots = remaining.length;

      if (totalBots === 0) {
        logger.info("Rollout: no bots to update");
        return {
          totalBots: 0,
          succeeded: 0,
          failed: 0,
          skipped: 0,
          aborted: false,
          alreadyRunning: false,
          results: [],
        };
      }

      logger.info(`Rollout starting: ${totalBots} bots to update`);

      while (remaining.length > 0 && !aborted) {
        const batch = this.strategy.nextBatch(remaining);
        // An empty batch means the strategy has nothing more to schedule.
        if (batch.length === 0) break;

        logger.info(`Rollout wave: ${batch.length} bots (${remaining.length} remaining)`);

        // Process batch — each bot sequentially within a wave for safety
        const retryProfiles: BotProfile[] = [];
        for (const profile of batch) {
          if (aborted) break;

          const result = await this.updateBot(profile);
          allResults.push(result);
          this.onBotUpdated?.(result);

          if (!result.success) {
            const action = this.handleFailure(profile.id, result, allResults);
            if (action === "abort") {
              aborted = true;
              logger.warn(`Rollout aborted after bot ${profile.id} failure`);
            } else if (action === "retry") {
              retryProfiles.push(profile);
            }
            // "skip" → don't re-add, bot is dropped
          }
        }

        // Remove processed bots from remaining, but re-queue the ones to retry
        const processedIds = new Set(batch.map((b) => b.id));
        remaining = [...remaining.filter((b) => !processedIds.has(b.id)), ...retryProfiles];

        // Pause between waves (unless aborted or done)
        if (remaining.length > 0 && !aborted) {
          const pause = this.strategy.pauseDuration();
          if (pause > 0) {
            logger.info(`Rollout: pausing ${pause}ms before next wave`);
            await sleep(pause);
          }
        }
      }

      // A retried bot has several entries in allResults. Count bots, not
      // attempts, using each bot's last attempt as its final outcome; otherwise
      // retries inflate `failed` and push `skipped` below zero.
      const finalOutcome = new Map<string, boolean>();
      for (const attempt of allResults) {
        finalOutcome.set(attempt.botId, attempt.success);
      }
      let succeeded = 0;
      for (const ok of finalOutcome.values()) {
        if (ok) succeeded += 1;
      }
      const failed = finalOutcome.size - succeeded;
      const skipped = Math.max(0, totalBots - finalOutcome.size);

      const rolloutResult: RolloutResult = {
        totalBots,
        succeeded,
        failed,
        skipped,
        aborted,
        alreadyRunning: false,
        results: allResults,
      };

      logger.info(`Rollout complete: ${succeeded} succeeded, ${failed} failed, ${skipped} skipped, aborted=${aborted}`);
      this.onRolloutComplete?.(rolloutResult);

      return rolloutResult;
    } finally {
      // Always release the guard, even when a dependency or callback throws.
      this.rolling = false;
    }
  }

  /**
   * Update a single bot with volume snapshot + nuclear rollback.
   *
   * Never throws: an unexpected error from the updater is converted into a
   * failed result (after attempting a volume restore).
   */
  private async updateBot(profile: BotProfile): Promise<BotUpdateResult> {
    const snapshotIds: string[] = [];

    try {
      // Step 1: Snapshot volumes before update (best-effort — a failed
      // snapshot is logged and the update proceeds without a backup)
      if (profile.volumeName) {
        try {
          const snap = await this.snapshotManager.snapshot(profile.volumeName);
          snapshotIds.push(snap.id);
          logger.info(`Pre-update snapshot for ${profile.id}: ${snap.id}`);
        } catch (err) {
          logger.warn(`Volume snapshot failed for ${profile.id} — proceeding without backup`, { err });
        }
      }

      // Step 2: Delegate to ContainerUpdater
      const result = await this.updater.updateBot(profile.id);

      if (result.success) {
        // Clean up snapshots on success
        await this.cleanupSnapshots(snapshotIds);
        return { ...result, volumeRestored: false };
      }

      // Step 3: Nuclear rollback — restore volumes if update failed
      const volumeRestored = await this.restoreVolumes(profile.id, snapshotIds);
      return { ...result, volumeRestored };
    } catch (err) {
      logger.error(`Unexpected error updating bot ${profile.id}`, { err });

      // Attempt volume restore on unexpected errors too
      const volumeRestored = await this.restoreVolumes(profile.id, snapshotIds);

      // No updater result is available here, so synthesize a failed one from the profile.
      return {
        botId: profile.id,
        success: false,
        previousImage: profile.image,
        newImage: profile.image,
        previousDigest: null,
        newDigest: null,
        rolledBack: false,
        volumeRestored,
        error: err instanceof Error ? err.message : String(err),
      };
    }
  }

  /**
   * Ask the strategy what to do about a failed bot.
   *
   * The strategy receives the bot id, the failure wrapped in an Error, and how
   * many times this bot has failed so far in the current rollout (including
   * this attempt). Any retry limit is therefore the strategy's decision; this
   * method does not enforce one itself.
   */
  private handleFailure(
    botId: string,
    result: BotUpdateResult,
    allResults: BotUpdateResult[],
  ): "abort" | "skip" | "retry" {
    const error = new Error(result.error ?? "Unknown error");
    const failCount = allResults.filter((r) => r.botId === botId && !r.success).length;
    return this.strategy.onBotFailure(botId, error, failCount);
  }

  /**
   * Restore a bot's volume from its pre-update snapshots.
   *
   * Snapshots are tried in order and the method returns as soon as one restore
   * succeeds; restore errors are logged and the next snapshot is tried.
   *
   * @returns true if a snapshot was restored, false if there were none or all failed.
   */
  private async restoreVolumes(botId: string, snapshotIds: string[]): Promise<boolean> {
    if (snapshotIds.length === 0) return false;

    for (const id of snapshotIds) {
      try {
        await this.snapshotManager.restore(id);
        logger.info(`Volume restored for ${botId} from snapshot ${id}`);
        return true;
      } catch (err) {
        logger.error(`Volume restore failed for ${botId} snapshot ${id}`, { err });
      }
    }
    return false;
  }

  /**
   * Delete snapshots that are no longer needed. Best-effort: a failed delete
   * is logged and does not affect the update result.
   */
  private async cleanupSnapshots(snapshotIds: string[]): Promise<void> {
    for (const id of snapshotIds) {
      try {
        await this.snapshotManager.delete(id);
      } catch (err) {
        logger.warn(`Failed to clean up snapshot ${id}`, { err });
      }
    }
  }
}
259
+
260
/** Return a promise that resolves (with no value) once `ms` milliseconds have elapsed. */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
@@ -32,6 +32,8 @@ import { SystemResourceMonitor } from "../observability/system-resources.js";
32
32
  // Stub re-exports so existing references compile; consumers must call initPlatformServices().
33
33
  // TODO: Replace with proper DI / service-locator pattern in platform-core.
34
34
  import { DrizzleTwoFactorRepository } from "../security/two-factor-repository.js";
35
+ import type { RolloutOrchestrator } from "./rollout-orchestrator.js";
36
+ import type { VolumeSnapshotManager } from "./volume-snapshot-manager.js";
35
37
 
36
38
  // Platform singletons (getAdminAuditLog, getCreditLedger, etc.) are wired by
37
39
  // the consuming application's own composition root (e.g. wopr-platform's
@@ -136,6 +138,8 @@ let _restoreLogStore: IRestoreLogStore | null = null;
136
138
  let _restoreService: RestoreService | null = null;
137
139
  let _backupStatusStore: IBackupStatusStore | null = null;
138
140
  let _snapshotManager: SnapshotManager | null = null;
141
+ let _volumeSnapshotManager: VolumeSnapshotManager | null = null;
142
+ let _rolloutOrchestrator: RolloutOrchestrator | null = null;
139
143
 
140
144
  const S3_BUCKET = process.env.S3_BUCKET || "wopr-backups";
141
145
 
@@ -537,6 +541,28 @@ export function getSnapshotManager(): SnapshotManager {
537
541
  return _snapshotManager;
538
542
  }
539
543
 
544
/**
 * Return the registered VolumeSnapshotManager.
 *
 * @throws Error when no manager has been registered via setVolumeSnapshotManager().
 */
export function getVolumeSnapshotManager(): VolumeSnapshotManager {
  const registered = _volumeSnapshotManager;
  if (registered) {
    return registered;
  }
  throw new Error("VolumeSnapshotManager not initialized — call setVolumeSnapshotManager() first");
}
550
+
551
/** Register the VolumeSnapshotManager that getVolumeSnapshotManager() will hand out. */
export function setVolumeSnapshotManager(mgr: VolumeSnapshotManager): void {
  _volumeSnapshotManager = mgr;
}
554
+
555
/**
 * Return the registered RolloutOrchestrator.
 *
 * @throws Error when no orchestrator has been registered via setRolloutOrchestrator().
 */
export function getRolloutOrchestrator(): RolloutOrchestrator {
  const registered = _rolloutOrchestrator;
  if (registered) {
    return registered;
  }
  throw new Error("RolloutOrchestrator not initialized — call setRolloutOrchestrator() first");
}
561
+
562
/** Register the RolloutOrchestrator that getRolloutOrchestrator() will hand out. */
export function setRolloutOrchestrator(orch: RolloutOrchestrator): void {
  _rolloutOrchestrator = orch;
}
565
+
540
566
  export function getRestoreService(): RestoreService {
541
567
  if (!_restoreService) {
542
568
  _restoreService = new RestoreService({
@@ -877,6 +903,8 @@ export function _resetForTest(): void {
877
903
  _restoreService = null;
878
904
  _backupStatusStore = null;
879
905
  _snapshotManager = null;
906
+ _volumeSnapshotManager = null;
907
+ _rolloutOrchestrator = null;
880
908
  _botBilling = null;
881
909
  _phoneNumberRepo = null;
882
910
  _affiliateRepo = null;