@wopr-network/platform-core 1.22.0 → 1.23.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/fleet/__tests__/rollout-orchestrator.test.d.ts +1 -0
- package/dist/fleet/__tests__/rollout-orchestrator.test.js +262 -0
- package/dist/fleet/index.d.ts +1 -0
- package/dist/fleet/index.js +1 -0
- package/dist/fleet/rollout-orchestrator.d.ts +69 -0
- package/dist/fleet/rollout-orchestrator.js +204 -0
- package/dist/fleet/services.d.ts +6 -0
- package/dist/fleet/services.js +22 -0
- package/package.json +1 -1
- package/src/fleet/__tests__/rollout-orchestrator.test.ts +321 -0
- package/src/fleet/index.ts +1 -0
- package/src/fleet/rollout-orchestrator.ts +262 -0
- package/src/fleet/services.ts +28 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -0,0 +1,262 @@
|
|
|
1
|
+
import { beforeEach, describe, expect, it, vi } from "vitest";
|
|
2
|
+
import { RolloutOrchestrator } from "../rollout-orchestrator.js";
|
|
3
|
+
function makeProfile(id, volumeName) {
|
|
4
|
+
return {
|
|
5
|
+
id,
|
|
6
|
+
tenantId: "tenant-1",
|
|
7
|
+
name: `bot-${id}`,
|
|
8
|
+
description: "",
|
|
9
|
+
image: "ghcr.io/wopr-network/paperclip:managed",
|
|
10
|
+
env: {},
|
|
11
|
+
restartPolicy: "unless-stopped",
|
|
12
|
+
releaseChannel: "stable",
|
|
13
|
+
updatePolicy: "nightly",
|
|
14
|
+
volumeName,
|
|
15
|
+
};
|
|
16
|
+
}
|
|
17
|
+
function makeResult(botId, success) {
|
|
18
|
+
return {
|
|
19
|
+
botId,
|
|
20
|
+
success,
|
|
21
|
+
previousImage: "old:latest",
|
|
22
|
+
newImage: "new:latest",
|
|
23
|
+
previousDigest: "sha256:old",
|
|
24
|
+
newDigest: "sha256:new",
|
|
25
|
+
rolledBack: !success,
|
|
26
|
+
error: success ? undefined : "Health check failed",
|
|
27
|
+
};
|
|
28
|
+
}
|
|
29
|
+
/**
 * Create a stub updater whose `updateBot` is a vitest mock.
 *
 * @param results - Map of botId to the canned result to return for that bot.
 * @returns An object exposing `updateBot`; bots missing from `results`
 *          resolve to a successful `makeResult(botId, true)`.
 */
function mockUpdater(results) {
    const updateBot = vi.fn(async (botId) => {
        const canned = results.get(botId);
        return canned ?? makeResult(botId, true);
    });
    return { updateBot };
}
|
|
34
|
+
/**
 * Create a stub snapshot manager built from vitest mocks.
 *
 * `snapshot(volumeName)` resolves to a record whose id is `<volumeName>-snap`;
 * `restore` and `delete` resolve without doing anything.
 */
function mockSnapshotManager() {
    const snapshot = vi.fn(async (volumeName) => {
        const id = `${volumeName}-snap`;
        return {
            id,
            volumeName,
            archivePath: `/backup/${volumeName}-snap.tar`,
            createdAt: new Date(),
            sizeBytes: 1024,
        };
    });
    const restore = vi.fn(async () => { });
    const remove = vi.fn(async () => { });
    return { snapshot, restore, delete: remove };
}
|
|
47
|
+
function mockStrategy(overrides = {}) {
|
|
48
|
+
return {
|
|
49
|
+
nextBatch: (remaining) => remaining.slice(0, 2),
|
|
50
|
+
pauseDuration: () => 0,
|
|
51
|
+
onBotFailure: () => "skip",
|
|
52
|
+
maxRetries: () => 2,
|
|
53
|
+
healthCheckTimeout: () => 120_000,
|
|
54
|
+
...overrides,
|
|
55
|
+
};
|
|
56
|
+
}
|
|
57
|
+
describe("RolloutOrchestrator", () => {
    let updater;
    let snapMgr;

    /**
     * Build an orchestrator wired to the current `updater` / `snapMgr` mocks.
     * `overrides` replaces any default dependency (strategy, callbacks, ...).
     * The mocks are read at call time, so tests may reassign `updater` first.
     */
    const buildOrchestrator = (profiles, overrides = {}) => new RolloutOrchestrator({
        updater,
        snapshotManager: snapMgr,
        strategy: mockStrategy(),
        getUpdatableProfiles: async () => profiles,
        ...overrides,
    });

    beforeEach(() => {
        vi.clearAllMocks();
        updater = mockUpdater(new Map());
        snapMgr = mockSnapshotManager();
    });

    it("processes all bots in batches", async () => {
        const profiles = [makeProfile("b1", "vol-1"), makeProfile("b2", "vol-2"), makeProfile("b3", "vol-3")];
        const strategy = mockStrategy({ nextBatch: (r) => r.slice(0, 2) });
        const orch = buildOrchestrator(profiles, { strategy });
        const result = await orch.rollout();
        expect(result.totalBots).toBe(3);
        expect(result.succeeded).toBe(3);
        expect(result.failed).toBe(0);
        expect(result.aborted).toBe(false);
        expect(updater.updateBot).toHaveBeenCalledTimes(3);
    });

    it("snapshots volumes before updating", async () => {
        const orch = buildOrchestrator([makeProfile("b1", "my-volume")]);
        await orch.rollout();
        expect(snapMgr.snapshot).toHaveBeenCalledWith("my-volume");
        // On success, snapshot is cleaned up
        expect(snapMgr.delete).toHaveBeenCalledWith("my-volume-snap");
    });

    it("skips snapshot for bots without volumes", async () => {
        const orch = buildOrchestrator([makeProfile("b1")]); // no volumeName
        await orch.rollout();
        expect(snapMgr.snapshot).not.toHaveBeenCalled();
        expect(updater.updateBot).toHaveBeenCalledWith("b1");
    });

    it("restores volumes on update failure", async () => {
        updater = mockUpdater(new Map([["b1", makeResult("b1", false)]]));
        const orch = buildOrchestrator([makeProfile("b1", "my-volume")]);
        const result = await orch.rollout();
        expect(result.failed).toBe(1);
        expect(result.results[0].volumeRestored).toBe(true);
        expect(snapMgr.restore).toHaveBeenCalledWith("my-volume-snap");
        // Snapshot NOT deleted on failure (restored instead)
        expect(snapMgr.delete).not.toHaveBeenCalled();
    });

    it("aborts rollout when strategy says abort", async () => {
        updater = mockUpdater(new Map([["b1", makeResult("b1", false)]]));
        const profiles = [makeProfile("b1", "v1"), makeProfile("b2", "v2"), makeProfile("b3", "v3")];
        const strategy = mockStrategy({
            nextBatch: (r) => r.slice(0, 1),
            onBotFailure: () => "abort",
        });
        const orch = buildOrchestrator(profiles, { strategy });
        const result = await orch.rollout();
        expect(result.aborted).toBe(true);
        expect(result.succeeded).toBe(0);
        expect(result.failed).toBe(1);
        expect(result.skipped).toBe(2); // b2, b3 never processed
        expect(updater.updateBot).toHaveBeenCalledTimes(1);
    });

    it("returns empty result when no bots to update", async () => {
        const orch = buildOrchestrator([]);
        const result = await orch.rollout();
        expect(result.totalBots).toBe(0);
        expect(result.results).toHaveLength(0);
    });

    it("rejects concurrent rollouts", async () => {
        // Make updateBot slow so the first rollout is still in flight
        // when the second one starts.
        updater = {
            updateBot: vi.fn(async (botId) => {
                await new Promise((r) => setTimeout(r, 100));
                return makeResult(botId, true);
            }),
        };
        const orch = buildOrchestrator([makeProfile("b1")]);
        const [r1, r2] = await Promise.all([orch.rollout(), orch.rollout()]);
        // One succeeds, one is rejected as already running
        const succeeded = [r1, r2].find((r) => r.totalBots > 0);
        const rejected = [r1, r2].find((r) => r.alreadyRunning);
        expect(succeeded).toBeDefined();
        expect(rejected).toBeDefined();
        expect(rejected?.alreadyRunning).toBe(true);
        expect(rejected?.totalBots).toBe(0);
    });

    it("retries failed bots when strategy says retry", async () => {
        let callCount = 0;
        updater = {
            updateBot: vi.fn(async (botId) => {
                callCount++;
                // Fail first attempt, succeed on retry
                if (botId === "b1" && callCount === 1)
                    return makeResult("b1", false);
                return makeResult(botId, true);
            }),
        };
        const strategy = mockStrategy({
            nextBatch: (r) => r.slice(0, 1),
            onBotFailure: (_botId, _err, attempt) => (attempt < 2 ? "retry" : "skip"),
        });
        const orch = buildOrchestrator([makeProfile("b1")], { strategy });
        const result = await orch.rollout();
        // b1 failed once, retried, succeeded
        expect(updater.updateBot).toHaveBeenCalledTimes(2);
        expect(result.succeeded).toBe(1);
        expect(result.failed).toBe(1); // first attempt counted as failed
    });

    it("calls onBotUpdated callback for each bot", async () => {
        const onBotUpdated = vi.fn();
        const orch = buildOrchestrator([makeProfile("b1"), makeProfile("b2")], { onBotUpdated });
        await orch.rollout();
        expect(onBotUpdated).toHaveBeenCalledTimes(2);
    });

    it("calls onRolloutComplete callback", async () => {
        const onRolloutComplete = vi.fn();
        const orch = buildOrchestrator([makeProfile("b1")], { onRolloutComplete });
        await orch.rollout();
        expect(onRolloutComplete).toHaveBeenCalledTimes(1);
        expect(onRolloutComplete).toHaveBeenCalledWith(expect.objectContaining({ totalBots: 1, succeeded: 1, aborted: false }));
    });

    it("continues on snapshot failure (best-effort)", async () => {
        snapMgr.snapshot = vi.fn().mockRejectedValue(new Error("disk full"));
        const orch = buildOrchestrator([makeProfile("b1", "my-volume")]);
        const result = await orch.rollout();
        // Update still proceeds despite snapshot failure
        expect(result.succeeded).toBe(1);
        expect(updater.updateBot).toHaveBeenCalledWith("b1");
    });

    it("isRolling reflects rollout state", async () => {
        const orch = buildOrchestrator([makeProfile("b1")]);
        expect(orch.isRolling).toBe(false);
        const promise = orch.rollout();
        // rollout() runs synchronously up to its first await, and the flag is
        // raised before that await, so it is observable immediately.
        expect(orch.isRolling).toBe(true);
        await promise;
        expect(orch.isRolling).toBe(false);
    });

    it("clears isRolling when profile lookup rejects", async () => {
        const orch = buildOrchestrator([], {
            getUpdatableProfiles: async () => {
                throw new Error("db down");
            },
        });
        await expect(orch.rollout()).rejects.toThrow("db down");
        // A failed rollout must not leave the orchestrator permanently locked.
        expect(orch.isRolling).toBe(false);
    });
});
|
package/dist/fleet/index.d.ts
CHANGED
package/dist/fleet/index.js
CHANGED
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* RolloutOrchestrator — coordinates fleet-wide container updates using
|
|
3
|
+
* pluggable rollout strategies and volume snapshots for nuclear rollback.
|
|
4
|
+
*
|
|
5
|
+
* Sits between ImagePoller (detects new digests) and ContainerUpdater
|
|
6
|
+
* (handles per-bot pull/stop/recreate/health). Adds:
|
|
7
|
+
* - Strategy-driven batching (rolling wave, single bot, immediate)
|
|
8
|
+
* - Pre-update volume snapshots via VolumeSnapshotManager
|
|
9
|
+
* - Volume restore on health check failure (nuclear rollback)
|
|
10
|
+
* - Per-tenant update orchestration
|
|
11
|
+
*/
|
|
12
|
+
import type { IRolloutStrategy } from "./rollout-strategy.js";
|
|
13
|
+
import type { BotProfile } from "./types.js";
|
|
14
|
+
import type { ContainerUpdater, UpdateResult } from "./updater.js";
|
|
15
|
+
import type { VolumeSnapshotManager } from "./volume-snapshot-manager.js";
|
|
16
|
+
/** Collaborators and optional hooks injected into the RolloutOrchestrator constructor. */
export interface RolloutOrchestratorDeps {
    /** Performs the actual per-bot update; its result decides success or failure. */
    updater: ContainerUpdater;
    /** Takes the pre-update volume snapshot and later restores or deletes it. */
    snapshotManager: VolumeSnapshotManager;
    /** Chooses each wave, the pause between waves and the reaction to a failed bot. */
    strategy: IRolloutStrategy;
    /** Resolves the profiles that need updating; takes no arguments. */
    getUpdatableProfiles: () => Promise<BotProfile[]>;
    /** Optional hook invoked after every update attempt (success or failure). */
    onBotUpdated?: (result: UpdateResult & { volumeRestored: boolean }) => void;
    /** Optional hook invoked with the aggregate result when a rollout completes. */
    onRolloutComplete?: (results: RolloutResult) => void;
}
|
|
29
|
+
/** Per-attempt outcome: the updater's result plus the volume rollback status. */
export interface BotUpdateResult extends UpdateResult {
    /** True when a pre-update snapshot was restored after the update failed. */
    volumeRestored: boolean;
}
|
|
32
|
+
/** Aggregate outcome of one `rollout()` call. */
export interface RolloutResult {
    /** Number of profiles returned by `getUpdatableProfiles` for this rollout. */
    totalBots: number;
    /** Number of entries in `results` that succeeded. */
    succeeded: number;
    /** Number of entries in `results` that failed. */
    failed: number;
    /** Bots the rollout did not get to, e.g. because it aborted early. */
    skipped: number;
    /** True when the strategy answered "abort" after a bot failure. */
    aborted: boolean;
    /** True when a concurrent rollout was already in progress */
    alreadyRunning: boolean;
    /** One entry per update attempt, in the order the attempts ran. */
    results: BotUpdateResult[];
}
|
|
42
|
+
/**
 * Type declaration for the fleet rollout coordinator: drives strategy-defined
 * waves of bot updates, snapshotting volumes first and restoring them on failure.
 */
export declare class RolloutOrchestrator {
    private readonly updater;
    private readonly snapshotManager;
    private readonly strategy;
    private readonly getUpdatableProfiles;
    private readonly onBotUpdated?;
    private readonly onRolloutComplete?;
    /** Backing flag for `isRolling`. */
    private rolling;

    constructor(deps: RolloutOrchestratorDeps);

    /** Whether a rollout is currently in progress. */
    get isRolling(): boolean;

    /**
     * Execute a rollout across all updatable bots.
     * Batching, pausing and failure handling are delegated to the configured strategy.
     */
    rollout(): Promise<RolloutResult>;

    /** Update a single bot with volume snapshot + nuclear rollback. */
    private updateBot;

    /**
     * Handle a bot failure using the strategy's failure policy.
     * Retries the update up to maxRetries before escalating.
     */
    private handleFailure;

    /** Restore a bot's volume from its pre-update snapshot. */
    private restoreVolumes;

    /** Delete snapshots that are no longer needed. */
    private cleanupSnapshots;
}
|
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* RolloutOrchestrator — coordinates fleet-wide container updates using
|
|
3
|
+
* pluggable rollout strategies and volume snapshots for nuclear rollback.
|
|
4
|
+
*
|
|
5
|
+
* Sits between ImagePoller (detects new digests) and ContainerUpdater
|
|
6
|
+
* (handles per-bot pull/stop/recreate/health). Adds:
|
|
7
|
+
* - Strategy-driven batching (rolling wave, single bot, immediate)
|
|
8
|
+
* - Pre-update volume snapshots via VolumeSnapshotManager
|
|
9
|
+
* - Volume restore on health check failure (nuclear rollback)
|
|
10
|
+
* - Per-tenant update orchestration
|
|
11
|
+
*/
|
|
12
|
+
import { logger } from "../config/logger.js";
|
|
13
|
+
export class RolloutOrchestrator {
|
|
14
|
+
updater;
|
|
15
|
+
snapshotManager;
|
|
16
|
+
strategy;
|
|
17
|
+
getUpdatableProfiles;
|
|
18
|
+
onBotUpdated;
|
|
19
|
+
onRolloutComplete;
|
|
20
|
+
rolling = false;
|
|
21
|
+
constructor(deps) {
|
|
22
|
+
this.updater = deps.updater;
|
|
23
|
+
this.snapshotManager = deps.snapshotManager;
|
|
24
|
+
this.strategy = deps.strategy;
|
|
25
|
+
this.getUpdatableProfiles = deps.getUpdatableProfiles;
|
|
26
|
+
this.onBotUpdated = deps.onBotUpdated;
|
|
27
|
+
this.onRolloutComplete = deps.onRolloutComplete;
|
|
28
|
+
}
|
|
29
|
+
/** Whether a rollout is currently in progress. */
|
|
30
|
+
get isRolling() {
|
|
31
|
+
return this.rolling;
|
|
32
|
+
}
|
|
33
|
+
/**
|
|
34
|
+
* Execute a rollout across all updatable bots.
|
|
35
|
+
* Uses the configured strategy for batching, pausing, and failure handling.
|
|
36
|
+
*/
|
|
37
|
+
async rollout() {
|
|
38
|
+
if (this.rolling) {
|
|
39
|
+
logger.warn("Rollout already in progress — skipping");
|
|
40
|
+
return { totalBots: 0, succeeded: 0, failed: 0, skipped: 0, aborted: false, alreadyRunning: true, results: [] };
|
|
41
|
+
}
|
|
42
|
+
this.rolling = true;
|
|
43
|
+
const allResults = [];
|
|
44
|
+
let aborted = false;
|
|
45
|
+
try {
|
|
46
|
+
let remaining = await this.getUpdatableProfiles();
|
|
47
|
+
const totalBots = remaining.length;
|
|
48
|
+
if (totalBots === 0) {
|
|
49
|
+
logger.info("Rollout: no bots to update");
|
|
50
|
+
return {
|
|
51
|
+
totalBots: 0,
|
|
52
|
+
succeeded: 0,
|
|
53
|
+
failed: 0,
|
|
54
|
+
skipped: 0,
|
|
55
|
+
aborted: false,
|
|
56
|
+
alreadyRunning: false,
|
|
57
|
+
results: [],
|
|
58
|
+
};
|
|
59
|
+
}
|
|
60
|
+
logger.info(`Rollout starting: ${totalBots} bots to update`);
|
|
61
|
+
while (remaining.length > 0 && !aborted) {
|
|
62
|
+
const batch = this.strategy.nextBatch(remaining);
|
|
63
|
+
if (batch.length === 0)
|
|
64
|
+
break;
|
|
65
|
+
logger.info(`Rollout wave: ${batch.length} bots (${remaining.length} remaining)`);
|
|
66
|
+
// Process batch — each bot sequentially within a wave for safety
|
|
67
|
+
const retryProfiles = [];
|
|
68
|
+
for (const profile of batch) {
|
|
69
|
+
if (aborted)
|
|
70
|
+
break;
|
|
71
|
+
const result = await this.updateBot(profile);
|
|
72
|
+
allResults.push(result);
|
|
73
|
+
this.onBotUpdated?.(result);
|
|
74
|
+
if (!result.success) {
|
|
75
|
+
const action = this.handleFailure(profile.id, result, allResults);
|
|
76
|
+
if (action === "abort") {
|
|
77
|
+
aborted = true;
|
|
78
|
+
logger.warn(`Rollout aborted after bot ${profile.id} failure`);
|
|
79
|
+
}
|
|
80
|
+
else if (action === "retry") {
|
|
81
|
+
retryProfiles.push(profile);
|
|
82
|
+
}
|
|
83
|
+
// "skip" → don't re-add, bot is dropped
|
|
84
|
+
}
|
|
85
|
+
}
|
|
86
|
+
// Remove processed bots from remaining, but re-add retries
|
|
87
|
+
const processedIds = new Set(batch.map((b) => b.id));
|
|
88
|
+
const retryIds = new Set(retryProfiles.map((b) => b.id));
|
|
89
|
+
remaining = [
|
|
90
|
+
...remaining.filter((b) => !processedIds.has(b.id)),
|
|
91
|
+
...retryProfiles.filter((b) => retryIds.has(b.id)),
|
|
92
|
+
];
|
|
93
|
+
// Pause between waves (unless aborted or done)
|
|
94
|
+
if (remaining.length > 0 && !aborted) {
|
|
95
|
+
const pause = this.strategy.pauseDuration();
|
|
96
|
+
if (pause > 0) {
|
|
97
|
+
logger.info(`Rollout: pausing ${pause}ms before next wave`);
|
|
98
|
+
await sleep(pause);
|
|
99
|
+
}
|
|
100
|
+
}
|
|
101
|
+
}
|
|
102
|
+
const succeeded = allResults.filter((r) => r.success).length;
|
|
103
|
+
const failed = allResults.filter((r) => !r.success).length;
|
|
104
|
+
const skipped = totalBots - allResults.length;
|
|
105
|
+
const rolloutResult = {
|
|
106
|
+
totalBots,
|
|
107
|
+
succeeded,
|
|
108
|
+
failed,
|
|
109
|
+
skipped,
|
|
110
|
+
aborted,
|
|
111
|
+
alreadyRunning: false,
|
|
112
|
+
results: allResults,
|
|
113
|
+
};
|
|
114
|
+
logger.info(`Rollout complete: ${succeeded} succeeded, ${failed} failed, ${skipped} skipped, aborted=${aborted}`);
|
|
115
|
+
this.onRolloutComplete?.(rolloutResult);
|
|
116
|
+
return rolloutResult;
|
|
117
|
+
}
|
|
118
|
+
finally {
|
|
119
|
+
this.rolling = false;
|
|
120
|
+
}
|
|
121
|
+
}
|
|
122
|
+
/**
|
|
123
|
+
* Update a single bot with volume snapshot + nuclear rollback.
|
|
124
|
+
*/
|
|
125
|
+
async updateBot(profile) {
|
|
126
|
+
const snapshotIds = [];
|
|
127
|
+
try {
|
|
128
|
+
// Step 1: Snapshot volumes before update
|
|
129
|
+
if (profile.volumeName) {
|
|
130
|
+
try {
|
|
131
|
+
const snap = await this.snapshotManager.snapshot(profile.volumeName);
|
|
132
|
+
snapshotIds.push(snap.id);
|
|
133
|
+
logger.info(`Pre-update snapshot for ${profile.id}: ${snap.id}`);
|
|
134
|
+
}
|
|
135
|
+
catch (err) {
|
|
136
|
+
logger.warn(`Volume snapshot failed for ${profile.id} — proceeding without backup`, { err });
|
|
137
|
+
}
|
|
138
|
+
}
|
|
139
|
+
// Step 2: Delegate to ContainerUpdater
|
|
140
|
+
const result = await this.updater.updateBot(profile.id);
|
|
141
|
+
if (result.success) {
|
|
142
|
+
// Clean up snapshots on success
|
|
143
|
+
await this.cleanupSnapshots(snapshotIds);
|
|
144
|
+
return { ...result, volumeRestored: false };
|
|
145
|
+
}
|
|
146
|
+
// Step 3: Nuclear rollback — restore volumes if update failed
|
|
147
|
+
const volumeRestored = await this.restoreVolumes(profile.id, snapshotIds);
|
|
148
|
+
return { ...result, volumeRestored };
|
|
149
|
+
}
|
|
150
|
+
catch (err) {
|
|
151
|
+
logger.error(`Unexpected error updating bot ${profile.id}`, { err });
|
|
152
|
+
// Attempt volume restore on unexpected errors too
|
|
153
|
+
const volumeRestored = await this.restoreVolumes(profile.id, snapshotIds);
|
|
154
|
+
return {
|
|
155
|
+
botId: profile.id,
|
|
156
|
+
success: false,
|
|
157
|
+
previousImage: profile.image,
|
|
158
|
+
newImage: profile.image,
|
|
159
|
+
previousDigest: null,
|
|
160
|
+
newDigest: null,
|
|
161
|
+
rolledBack: false,
|
|
162
|
+
volumeRestored,
|
|
163
|
+
error: err instanceof Error ? err.message : String(err),
|
|
164
|
+
};
|
|
165
|
+
}
|
|
166
|
+
}
|
|
167
|
+
/**
|
|
168
|
+
* Handle a bot failure using the strategy's failure policy.
|
|
169
|
+
* Retries the update up to maxRetries before escalating.
|
|
170
|
+
*/
|
|
171
|
+
handleFailure(botId, result, allResults) {
|
|
172
|
+
const error = new Error(result.error ?? "Unknown error");
|
|
173
|
+
const failCount = allResults.filter((r) => r.botId === botId && !r.success).length;
|
|
174
|
+
return this.strategy.onBotFailure(botId, error, failCount);
|
|
175
|
+
}
|
|
176
|
+
async restoreVolumes(botId, snapshotIds) {
|
|
177
|
+
if (snapshotIds.length === 0)
|
|
178
|
+
return false;
|
|
179
|
+
for (const id of snapshotIds) {
|
|
180
|
+
try {
|
|
181
|
+
await this.snapshotManager.restore(id);
|
|
182
|
+
logger.info(`Volume restored for ${botId} from snapshot ${id}`);
|
|
183
|
+
return true;
|
|
184
|
+
}
|
|
185
|
+
catch (err) {
|
|
186
|
+
logger.error(`Volume restore failed for ${botId} snapshot ${id}`, { err });
|
|
187
|
+
}
|
|
188
|
+
}
|
|
189
|
+
return false;
|
|
190
|
+
}
|
|
191
|
+
async cleanupSnapshots(snapshotIds) {
|
|
192
|
+
for (const id of snapshotIds) {
|
|
193
|
+
try {
|
|
194
|
+
await this.snapshotManager.delete(id);
|
|
195
|
+
}
|
|
196
|
+
catch (err) {
|
|
197
|
+
logger.warn(`Failed to clean up snapshot ${id}`, { err });
|
|
198
|
+
}
|
|
199
|
+
}
|
|
200
|
+
}
|
|
201
|
+
}
|
|
202
|
+
function sleep(ms) {
|
|
203
|
+
return new Promise((resolve) => setTimeout(resolve, ms));
|
|
204
|
+
}
|
package/dist/fleet/services.d.ts
CHANGED
|
@@ -14,6 +14,8 @@ import type { IAffiliateRepository } from "../monetization/affiliate/drizzle-aff
|
|
|
14
14
|
import type { IBotBilling } from "../monetization/credits/bot-billing.js";
|
|
15
15
|
import type { IPhoneNumberRepository } from "../monetization/credits/drizzle-phone-number-repository.js";
|
|
16
16
|
import { SystemResourceMonitor } from "../observability/system-resources.js";
|
|
17
|
+
import type { RolloutOrchestrator } from "./rollout-orchestrator.js";
|
|
18
|
+
import type { VolumeSnapshotManager } from "./volume-snapshot-manager.js";
|
|
17
19
|
import { AdminNotifier } from "./admin-notifier.js";
|
|
18
20
|
import type { IBotInstanceRepository } from "./bot-instance-repository.js";
|
|
19
21
|
import type { IBotProfileRepository } from "./bot-profile-repository.js";
|
|
@@ -79,6 +81,10 @@ export declare function getCapacityPolicy(configOverrides?: Partial<CapacityPoli
|
|
|
79
81
|
export declare function getRestoreLogStore(): IRestoreLogStore;
|
|
80
82
|
export declare function getBackupStatusStore(): IBackupStatusStore;
|
|
81
83
|
export declare function getSnapshotManager(): SnapshotManager;
|
|
84
|
+
/** Returns the registered VolumeSnapshotManager; throws if none has been set. */
export declare function getVolumeSnapshotManager(): VolumeSnapshotManager;
|
|
85
|
+
/** Registers the VolumeSnapshotManager returned by getVolumeSnapshotManager(). */
export declare function setVolumeSnapshotManager(mgr: VolumeSnapshotManager): void;
|
|
86
|
+
/** Returns the registered RolloutOrchestrator; throws if none has been set. */
export declare function getRolloutOrchestrator(): RolloutOrchestrator;
|
|
87
|
+
/** Registers the RolloutOrchestrator returned by getRolloutOrchestrator(). */
export declare function setRolloutOrchestrator(orch: RolloutOrchestrator): void;
|
|
82
88
|
export declare function getRestoreService(): RestoreService;
|
|
83
89
|
/** Call once at server startup to wire up fleet services. */
|
|
84
90
|
export declare function initFleet(): void;
|
package/dist/fleet/services.js
CHANGED
|
@@ -105,6 +105,8 @@ let _restoreLogStore = null;
|
|
|
105
105
|
let _restoreService = null;
|
|
106
106
|
let _backupStatusStore = null;
|
|
107
107
|
let _snapshotManager = null;
|
|
108
|
+
let _volumeSnapshotManager = null;
|
|
109
|
+
let _rolloutOrchestrator = null;
|
|
108
110
|
const S3_BUCKET = process.env.S3_BUCKET || "wopr-backups";
|
|
109
111
|
function envInt(key, fallback) {
|
|
110
112
|
const raw = process.env[key];
|
|
@@ -427,6 +429,24 @@ export function getSnapshotManager() {
|
|
|
427
429
|
}
|
|
428
430
|
return _snapshotManager;
|
|
429
431
|
}
|
|
432
|
+
export function getVolumeSnapshotManager() {
|
|
433
|
+
if (!_volumeSnapshotManager) {
|
|
434
|
+
throw new Error("VolumeSnapshotManager not initialized — call setVolumeSnapshotManager() first");
|
|
435
|
+
}
|
|
436
|
+
return _volumeSnapshotManager;
|
|
437
|
+
}
|
|
438
|
+
/**
 * Store `mgr` as the module-level VolumeSnapshotManager, replacing any
 * previously stored value. No validation is performed on `mgr`.
 */
export function setVolumeSnapshotManager(mgr) {
|
|
439
|
+
_volumeSnapshotManager = mgr;
|
|
440
|
+
}
|
|
441
|
+
export function getRolloutOrchestrator() {
|
|
442
|
+
if (!_rolloutOrchestrator) {
|
|
443
|
+
throw new Error("RolloutOrchestrator not initialized — call setRolloutOrchestrator() first");
|
|
444
|
+
}
|
|
445
|
+
return _rolloutOrchestrator;
|
|
446
|
+
}
|
|
447
|
+
/**
 * Store `orch` as the module-level RolloutOrchestrator, replacing any
 * previously stored value. No validation is performed on `orch`.
 */
export function setRolloutOrchestrator(orch) {
|
|
448
|
+
_rolloutOrchestrator = orch;
|
|
449
|
+
}
|
|
430
450
|
export function getRestoreService() {
|
|
431
451
|
if (!_restoreService) {
|
|
432
452
|
_restoreService = new RestoreService({
|
|
@@ -683,6 +703,8 @@ export function _resetForTest() {
|
|
|
683
703
|
_restoreService = null;
|
|
684
704
|
_backupStatusStore = null;
|
|
685
705
|
_snapshotManager = null;
|
|
706
|
+
_volumeSnapshotManager = null;
|
|
707
|
+
_rolloutOrchestrator = null;
|
|
686
708
|
_botBilling = null;
|
|
687
709
|
_phoneNumberRepo = null;
|
|
688
710
|
_affiliateRepo = null;
|
package/package.json
CHANGED
|
@@ -0,0 +1,321 @@
|
|
|
1
|
+
import { beforeEach, describe, expect, it, vi } from "vitest";
|
|
2
|
+
import { RolloutOrchestrator } from "../rollout-orchestrator.js";
|
|
3
|
+
import type { IRolloutStrategy } from "../rollout-strategy.js";
|
|
4
|
+
import type { BotProfile } from "../types.js";
|
|
5
|
+
import type { ContainerUpdater, UpdateResult } from "../updater.js";
|
|
6
|
+
import type { VolumeSnapshotManager } from "../volume-snapshot-manager.js";
|
|
7
|
+
|
|
8
|
+
/**
 * Build a minimal bot profile fixture for tenant "tenant-1".
 *
 * @param id - Bot identifier; also used to derive the display name `bot-<id>`.
 * @param volumeName - Optional data volume; left `undefined` for bots without one.
 * @returns The fixture asserted to `BotProfile` (only the fields listed here are populated).
 */
function makeProfile(id: string, volumeName?: string): BotProfile {
  const name = `bot-${id}`;
  const fixture = {
    id,
    tenantId: "tenant-1",
    name,
    description: "",
    image: "ghcr.io/wopr-network/paperclip:managed",
    env: {},
    restartPolicy: "unless-stopped",
    releaseChannel: "stable",
    updatePolicy: "nightly",
    volumeName,
  };
  return fixture as BotProfile;
}
|
|
22
|
+
|
|
23
|
+
/**
 * Build a canned update result fixture.
 *
 * @param botId - Bot the result belongs to.
 * @param success - Whether the simulated update succeeded.
 * @returns A result with fixed image/digest values; a failed result is marked
 *          as rolled back and carries the error "Health check failed".
 */
function makeResult(botId: string, success: boolean): UpdateResult {
  const failed = !success;
  const outcome: UpdateResult = {
    botId,
    success,
    previousImage: "old:latest",
    newImage: "new:latest",
    previousDigest: "sha256:old",
    newDigest: "sha256:new",
    rolledBack: failed,
    error: failed ? "Health check failed" : undefined,
  };
  return outcome;
}
|
|
35
|
+
|
|
36
|
+
/**
 * Create a stub updater whose `updateBot` is a vitest mock.
 *
 * @param results - Map of botId to the canned result to return for that bot.
 * @returns A partial `ContainerUpdater` exposing only `updateBot`; bots missing
 *          from `results` resolve to a successful `makeResult(botId, true)`.
 */
function mockUpdater(results: Map<string, UpdateResult>): ContainerUpdater {
  const updateBot = vi.fn(async (botId: string) => {
    const canned = results.get(botId);
    return canned ?? makeResult(botId, true);
  });
  return { updateBot } as unknown as ContainerUpdater;
}
|
|
41
|
+
|
|
42
|
+
/**
 * Create a stub snapshot manager built from vitest mocks.
 *
 * `snapshot(volumeName)` resolves to a record whose id is `<volumeName>-snap`;
 * `restore` and `delete` resolve without doing anything. Only these three
 * members exist on the returned object.
 */
function mockSnapshotManager(): VolumeSnapshotManager {
  const snapshot = vi.fn(async (volumeName: string) => {
    const id = `${volumeName}-snap`;
    return {
      id,
      volumeName,
      archivePath: `/backup/${volumeName}-snap.tar`,
      createdAt: new Date(),
      sizeBytes: 1024,
    };
  });
  const restore = vi.fn(async () => {});
  const remove = vi.fn(async () => {});
  return { snapshot, restore, delete: remove } as unknown as VolumeSnapshotManager;
}
|
|
55
|
+
|
|
56
|
+
/**
 * Build a rollout strategy stub.
 *
 * Defaults: waves of at most two bots, no pause between waves, failed bots are
 * skipped, two retries, 120 s health-check timeout.
 *
 * @param overrides - Members that replace the corresponding defaults.
 */
function mockStrategy(overrides: Partial<IRolloutStrategy> = {}): IRolloutStrategy {
  const defaults: IRolloutStrategy = {
    nextBatch: (remaining) => remaining.slice(0, 2),
    pauseDuration: () => 0,
    onBotFailure: () => "skip",
    maxRetries: () => 2,
    healthCheckTimeout: () => 120_000,
  };
  return { ...defaults, ...overrides };
}
|
|
66
|
+
|
|
67
|
+
/**
 * Behavioural tests for RolloutOrchestrator: batching, volume snapshot
 * handling, abort/retry policies, the concurrency guard and callbacks.
 */
describe("RolloutOrchestrator", () => {
  let updater: ReturnType<typeof mockUpdater>;
  let snapMgr: ReturnType<typeof mockSnapshotManager>;

  // Fresh mocks for every test so call counts never leak between cases.
  beforeEach(() => {
    vi.clearAllMocks();
    updater = mockUpdater(new Map());
    snapMgr = mockSnapshotManager();
  });

  it("processes all bots in batches", async () => {
    const profiles = [makeProfile("b1", "vol-1"), makeProfile("b2", "vol-2"), makeProfile("b3", "vol-3")];
    // Two bots per wave: three bots therefore need two waves.
    const strategy = mockStrategy({ nextBatch: (r) => r.slice(0, 2) });

    const orch = new RolloutOrchestrator({
      updater,
      snapshotManager: snapMgr,
      strategy,
      getUpdatableProfiles: async () => profiles,
    });

    const result = await orch.rollout();

    expect(result.totalBots).toBe(3);
    expect(result.succeeded).toBe(3);
    expect(result.failed).toBe(0);
    expect(result.aborted).toBe(false);
    expect(updater.updateBot).toHaveBeenCalledTimes(3);
  });

  it("snapshots volumes before updating", async () => {
    const profiles = [makeProfile("b1", "my-volume")];

    const orch = new RolloutOrchestrator({
      updater,
      snapshotManager: snapMgr,
      strategy: mockStrategy(),
      getUpdatableProfiles: async () => profiles,
    });

    await orch.rollout();

    expect(snapMgr.snapshot).toHaveBeenCalledWith("my-volume");
    // On success, snapshot is cleaned up
    expect(snapMgr.delete).toHaveBeenCalledWith("my-volume-snap");
  });

  it("skips snapshot for bots without volumes", async () => {
    const profiles = [makeProfile("b1")]; // no volumeName

    const orch = new RolloutOrchestrator({
      updater,
      snapshotManager: snapMgr,
      strategy: mockStrategy(),
      getUpdatableProfiles: async () => profiles,
    });

    await orch.rollout();

    expect(snapMgr.snapshot).not.toHaveBeenCalled();
    expect(updater.updateBot).toHaveBeenCalledWith("b1");
  });

  it("restores volumes on update failure", async () => {
    const failResults = new Map([["b1", makeResult("b1", false)]]);
    updater = mockUpdater(failResults);
    const profiles = [makeProfile("b1", "my-volume")];

    const orch = new RolloutOrchestrator({
      updater,
      snapshotManager: snapMgr,
      strategy: mockStrategy(),
      getUpdatableProfiles: async () => profiles,
    });

    const result = await orch.rollout();

    expect(result.failed).toBe(1);
    expect(result.results[0].volumeRestored).toBe(true);
    expect(snapMgr.restore).toHaveBeenCalledWith("my-volume-snap");
    // Snapshot NOT deleted on failure (restored instead)
    expect(snapMgr.delete).not.toHaveBeenCalled();
  });

  it("aborts rollout when strategy says abort", async () => {
    const failResults = new Map([["b1", makeResult("b1", false)]]);
    updater = mockUpdater(failResults);
    const profiles = [makeProfile("b1", "v1"), makeProfile("b2", "v2"), makeProfile("b3", "v3")];
    // One bot per wave so the abort happens before b2/b3 are ever touched.
    const strategy = mockStrategy({
      nextBatch: (r) => r.slice(0, 1),
      onBotFailure: () => "abort",
    });

    const orch = new RolloutOrchestrator({
      updater,
      snapshotManager: snapMgr,
      strategy,
      getUpdatableProfiles: async () => profiles,
    });

    const result = await orch.rollout();

    expect(result.aborted).toBe(true);
    expect(result.succeeded).toBe(0);
    expect(result.failed).toBe(1);
    expect(result.skipped).toBe(2); // b2, b3 never processed
    expect(updater.updateBot).toHaveBeenCalledTimes(1);
  });

  it("returns empty result when no bots to update", async () => {
    const orch = new RolloutOrchestrator({
      updater,
      snapshotManager: snapMgr,
      strategy: mockStrategy(),
      getUpdatableProfiles: async () => [],
    });

    const result = await orch.rollout();

    expect(result.totalBots).toBe(0);
    expect(result.results).toHaveLength(0);
  });

  it("rejects concurrent rollouts", async () => {
    const profiles = [makeProfile("b1")];
    // Make updateBot slow
    updater = {
      updateBot: vi.fn(async (botId: string) => {
        await new Promise((r) => setTimeout(r, 100));
        return makeResult(botId, true);
      }),
    } as unknown as ContainerUpdater;

    const orch = new RolloutOrchestrator({
      updater,
      snapshotManager: snapMgr,
      strategy: mockStrategy(),
      getUpdatableProfiles: async () => profiles,
    });

    const [r1, r2] = await Promise.all([orch.rollout(), orch.rollout()]);

    // One succeeds, one is rejected as already running
    const succeeded = [r1, r2].find((r) => r.totalBots > 0);
    const rejected = [r1, r2].find((r) => r.alreadyRunning);
    expect(succeeded).toBeDefined();
    expect(rejected).toBeDefined();
    expect(rejected?.alreadyRunning).toBe(true);
    expect(rejected?.totalBots).toBe(0);
  });

  it("retries failed bots when strategy says retry", async () => {
    let callCount = 0;
    updater = {
      updateBot: vi.fn(async (botId: string) => {
        callCount++;
        // Fail first attempt, succeed on retry
        if (botId === "b1" && callCount === 1) return makeResult("b1", false);
        return makeResult(botId, true);
      }),
    } as unknown as ContainerUpdater;

    const profiles = [makeProfile("b1")];
    const strategy = mockStrategy({
      nextBatch: (r) => r.slice(0, 1),
      onBotFailure: (_botId, _err, attempt) => (attempt < 2 ? "retry" : "skip"),
    });

    const orch = new RolloutOrchestrator({
      updater,
      snapshotManager: snapMgr,
      strategy,
      getUpdatableProfiles: async () => profiles,
    });

    const result = await orch.rollout();

    // b1 failed once, retried, succeeded
    expect(updater.updateBot).toHaveBeenCalledTimes(2);
    expect(result.succeeded).toBe(1);
    expect(result.failed).toBe(1); // first attempt counted as failed
  });

  it("calls onBotUpdated callback for each bot", async () => {
    const profiles = [makeProfile("b1"), makeProfile("b2")];
    const onBotUpdated = vi.fn();

    const orch = new RolloutOrchestrator({
      updater,
      snapshotManager: snapMgr,
      strategy: mockStrategy(),
      getUpdatableProfiles: async () => profiles,
      onBotUpdated,
    });

    await orch.rollout();

    expect(onBotUpdated).toHaveBeenCalledTimes(2);
  });

  it("calls onRolloutComplete callback", async () => {
    const profiles = [makeProfile("b1")];
    const onRolloutComplete = vi.fn();

    const orch = new RolloutOrchestrator({
      updater,
      snapshotManager: snapMgr,
      strategy: mockStrategy(),
      getUpdatableProfiles: async () => profiles,
      onRolloutComplete,
    });

    await orch.rollout();

    expect(onRolloutComplete).toHaveBeenCalledTimes(1);
    expect(onRolloutComplete).toHaveBeenCalledWith(
      expect.objectContaining({ totalBots: 1, succeeded: 1, aborted: false }),
    );
  });

  it("continues on snapshot failure (best-effort)", async () => {
    const profiles = [makeProfile("b1", "my-volume")];
    snapMgr.snapshot = vi.fn().mockRejectedValue(new Error("disk full"));

    const orch = new RolloutOrchestrator({
      updater,
      snapshotManager: snapMgr,
      strategy: mockStrategy(),
      getUpdatableProfiles: async () => profiles,
    });

    const result = await orch.rollout();

    // Update still proceeds despite snapshot failure
    expect(result.succeeded).toBe(1);
    expect(updater.updateBot).toHaveBeenCalledWith("b1");
  });

  it("isRolling reflects rollout state", async () => {
    const profiles = [makeProfile("b1")];

    const orch = new RolloutOrchestrator({
      updater,
      snapshotManager: snapMgr,
      strategy: mockStrategy(),
      getUpdatableProfiles: async () => profiles,
    });

    expect(orch.isRolling).toBe(false);
    const promise = orch.rollout();
    // rollout() raises the flag synchronously, before its first await, so the
    // in-flight state is observable as soon as the call returns.
    expect(orch.isRolling).toBe(true);
    await promise;
    expect(orch.isRolling).toBe(false);
  });
});
|
package/src/fleet/index.ts
CHANGED
|
@@ -0,0 +1,262 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* RolloutOrchestrator — coordinates fleet-wide container updates using
|
|
3
|
+
* pluggable rollout strategies and volume snapshots for nuclear rollback.
|
|
4
|
+
*
|
|
5
|
+
* Sits between ImagePoller (detects new digests) and ContainerUpdater
|
|
6
|
+
* (handles per-bot pull/stop/recreate/health). Adds:
|
|
7
|
+
* - Strategy-driven batching (rolling wave, single bot, immediate)
|
|
8
|
+
* - Pre-update volume snapshots via VolumeSnapshotManager
|
|
9
|
+
* - Volume restore on health check failure (nuclear rollback)
|
|
10
|
+
* - Per-tenant update orchestration
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
import { logger } from "../config/logger.js";
|
|
14
|
+
import type { IRolloutStrategy } from "./rollout-strategy.js";
|
|
15
|
+
import type { BotProfile } from "./types.js";
|
|
16
|
+
import type { ContainerUpdater, UpdateResult } from "./updater.js";
|
|
17
|
+
import type { VolumeSnapshotManager } from "./volume-snapshot-manager.js";
|
|
18
|
+
|
|
19
|
+
/** Collaborators and optional hooks handed to the RolloutOrchestrator constructor. */
export interface RolloutOrchestratorDeps {
  /** Performs the actual per-bot update; its result decides success or failure. */
  updater: ContainerUpdater;
  /** Creates, restores and deletes the volume snapshots taken around each update. */
  snapshotManager: VolumeSnapshotManager;
  /** Chooses each wave, the pause between waves, and the reaction to a failed bot. */
  strategy: IRolloutStrategy;
  /** Supplies the profiles of the bots to update; awaited once per rollout. */
  getUpdatableProfiles: () => Promise<BotProfile[]>;
  /** Optional hook fired after every update attempt, successful or not. */
  onBotUpdated?: (result: BotUpdateResult) => void;
  /** Optional hook fired with the aggregated result once a rollout has processed its bots. */
  onRolloutComplete?: (results: RolloutResult) => void;
}
|
|
30
|
+
|
|
31
|
+
/** Outcome of one update attempt for one bot, extended with volume-rollback information. */
export interface BotUpdateResult extends UpdateResult {
  /** True when the bot's volume was restored from a pre-update snapshot after a failure. */
  volumeRestored: boolean;
}
|
|
34
|
+
|
|
35
|
+
/** Aggregated summary returned by a rollout. */
export interface RolloutResult {
  /** Number of bots that were eligible when the rollout started. */
  totalBots: number;
  /** Number of update attempts that reported success. */
  succeeded: number;
  /** Number of update attempts that reported failure (a retried bot can add more than one). */
  failed: number;
  /** Number of eligible bots left without an update attempt, e.g. after an abort. */
  skipped: number;
  /** True when the strategy stopped the rollout early. */
  aborted: boolean;
  /** True when this call was rejected because another rollout was still running. */
  alreadyRunning: boolean;
  /** Every individual attempt, in the order it was made. */
  results: BotUpdateResult[];
}
|
|
45
|
+
|
|
46
|
+
/**
 * Drives a fleet-wide update: asks the strategy for waves of bots, snapshots
 * each bot's volume, delegates the update to the ContainerUpdater, and restores
 * the volume when the update fails. Only one rollout runs at a time per
 * instance; overlapping calls return immediately with `alreadyRunning: true`.
 */
export class RolloutOrchestrator {
  private readonly updater: ContainerUpdater;
  private readonly snapshotManager: VolumeSnapshotManager;
  private readonly strategy: IRolloutStrategy;
  private readonly getUpdatableProfiles: () => Promise<BotProfile[]>;
  private readonly onBotUpdated?: (result: BotUpdateResult) => void;
  private readonly onRolloutComplete?: (results: RolloutResult) => void;
  /** Re-entrancy guard; set synchronously at the start of rollout(). */
  private rolling = false;

  constructor(deps: RolloutOrchestratorDeps) {
    this.updater = deps.updater;
    this.snapshotManager = deps.snapshotManager;
    this.strategy = deps.strategy;
    this.getUpdatableProfiles = deps.getUpdatableProfiles;
    this.onBotUpdated = deps.onBotUpdated;
    this.onRolloutComplete = deps.onRolloutComplete;
  }

  /** Whether a rollout is currently in progress. */
  get isRolling(): boolean {
    return this.rolling;
  }

  /**
   * Execute a rollout across all updatable bots.
   * Uses the configured strategy for batching, pausing, and failure handling.
   *
   * @returns A summary of the run. `succeeded`/`failed` count update attempts
   *   (a retried bot contributes one entry per attempt), while `skipped`
   *   counts distinct bots that never received an attempt. When another
   *   rollout is already running, an empty result with `alreadyRunning: true`
   *   is returned and nothing is updated.
   */
  async rollout(): Promise<RolloutResult> {
    if (this.rolling) {
      logger.warn("Rollout already in progress — skipping");
      return { totalBots: 0, succeeded: 0, failed: 0, skipped: 0, aborted: false, alreadyRunning: true, results: [] };
    }

    this.rolling = true;
    const allResults: BotUpdateResult[] = [];
    // Distinct bots that received at least one update attempt. Retries add
    // several entries to allResults for the same bot, so the skipped count
    // must be derived from this set rather than from allResults.length.
    const attemptedIds = new Set<string>();
    let aborted = false;

    try {
      let remaining = await this.getUpdatableProfiles();
      const totalBots = remaining.length;

      if (totalBots === 0) {
        logger.info("Rollout: no bots to update");
        return {
          totalBots: 0,
          succeeded: 0,
          failed: 0,
          skipped: 0,
          aborted: false,
          alreadyRunning: false,
          results: [],
        };
      }

      logger.info(`Rollout starting: ${totalBots} bots to update`);

      while (remaining.length > 0 && !aborted) {
        const batch = this.strategy.nextBatch(remaining);
        // An empty wave means the strategy has nothing more to offer; stop
        // instead of spinning forever on the same remaining set.
        if (batch.length === 0) break;

        logger.info(`Rollout wave: ${batch.length} bots (${remaining.length} remaining)`);

        // Process batch — each bot sequentially within a wave for safety
        const retryProfiles: BotProfile[] = [];
        for (const profile of batch) {
          if (aborted) break;

          attemptedIds.add(profile.id);
          const result = await this.updateBot(profile);
          allResults.push(result);
          this.onBotUpdated?.(result);

          if (!result.success) {
            const action = this.handleFailure(profile.id, result, allResults);
            if (action === "abort") {
              aborted = true;
              logger.warn(`Rollout aborted after bot ${profile.id} failure`);
            } else if (action === "retry") {
              retryProfiles.push(profile);
            }
            // "skip" → don't re-add, bot is dropped
          }
        }

        // Remove processed bots from remaining, but re-add retries at the end
        const processedIds = new Set(batch.map((b) => b.id));
        remaining = [...remaining.filter((b) => !processedIds.has(b.id)), ...retryProfiles];

        // Pause between waves (unless aborted or done)
        if (remaining.length > 0 && !aborted) {
          const pause = this.strategy.pauseDuration();
          if (pause > 0) {
            logger.info(`Rollout: pausing ${pause}ms before next wave`);
            await sleep(pause);
          }
        }
      }

      const succeeded = allResults.filter((r) => r.success).length;
      const failed = allResults.filter((r) => !r.success).length;
      const skipped = totalBots - attemptedIds.size;

      const rolloutResult: RolloutResult = {
        totalBots,
        succeeded,
        failed,
        skipped,
        aborted,
        alreadyRunning: false,
        results: allResults,
      };

      logger.info(`Rollout complete: ${succeeded} succeeded, ${failed} failed, ${skipped} skipped, aborted=${aborted}`);
      this.onRolloutComplete?.(rolloutResult);

      return rolloutResult;
    } finally {
      // Always release the guard, even when a dependency or callback throws.
      this.rolling = false;
    }
  }

  /**
   * Update a single bot with volume snapshot + nuclear rollback.
   *
   * Never throws: an unexpected error from the updater is converted into a
   * failed result whose `error` carries the message.
   */
  private async updateBot(profile: BotProfile): Promise<BotUpdateResult> {
    const snapshotIds: string[] = [];

    try {
      // Step 1: Snapshot volumes before update (best-effort — a failed
      // snapshot is logged and the update proceeds without a backup)
      if (profile.volumeName) {
        try {
          const snap = await this.snapshotManager.snapshot(profile.volumeName);
          snapshotIds.push(snap.id);
          logger.info(`Pre-update snapshot for ${profile.id}: ${snap.id}`);
        } catch (err) {
          logger.warn(`Volume snapshot failed for ${profile.id} — proceeding without backup`, { err });
        }
      }

      // Step 2: Delegate to ContainerUpdater
      const result = await this.updater.updateBot(profile.id);

      if (result.success) {
        // Clean up snapshots on success
        await this.cleanupSnapshots(snapshotIds);
        return { ...result, volumeRestored: false };
      }

      // Step 3: Nuclear rollback — restore volumes if update failed
      const volumeRestored = await this.restoreVolumes(profile.id, snapshotIds);
      return { ...result, volumeRestored };
    } catch (err) {
      logger.error(`Unexpected error updating bot ${profile.id}`, { err });

      // Attempt volume restore on unexpected errors too
      const volumeRestored = await this.restoreVolumes(profile.id, snapshotIds);

      // The updater produced no result, so synthesize one from the profile.
      return {
        botId: profile.id,
        success: false,
        previousImage: profile.image,
        newImage: profile.image,
        previousDigest: null,
        newDigest: null,
        rolledBack: false,
        volumeRestored,
        error: err instanceof Error ? err.message : String(err),
      };
    }
  }

  /**
   * Ask the strategy how to react to a failed bot.
   *
   * The strategy receives the number of failed attempts recorded so far for
   * this bot (including the current one) and decides whether to retry, skip
   * or abort; this method applies no retry limit of its own.
   */
  private handleFailure(
    botId: string,
    result: BotUpdateResult,
    allResults: BotUpdateResult[],
  ): "abort" | "skip" | "retry" {
    const error = new Error(result.error ?? "Unknown error");
    const failCount = allResults.filter((r) => r.botId === botId && !r.success).length;
    return this.strategy.onBotFailure(botId, error, failCount);
  }

  /**
   * Try the given snapshots in order and stop at the first successful restore.
   *
   * @returns true when a restore succeeded; false when there were no
   *   snapshots or every restore attempt failed.
   */
  private async restoreVolumes(botId: string, snapshotIds: string[]): Promise<boolean> {
    if (snapshotIds.length === 0) return false;

    for (const id of snapshotIds) {
      try {
        await this.snapshotManager.restore(id);
        logger.info(`Volume restored for ${botId} from snapshot ${id}`);
        return true;
      } catch (err) {
        logger.error(`Volume restore failed for ${botId} snapshot ${id}`, { err });
      }
    }
    return false;
  }

  /** Delete the given snapshots; failures are logged and otherwise ignored. */
  private async cleanupSnapshots(snapshotIds: string[]): Promise<void> {
    for (const id of snapshotIds) {
      try {
        await this.snapshotManager.delete(id);
      } catch (err) {
        logger.warn(`Failed to clean up snapshot ${id}`, { err });
      }
    }
  }
}
|
|
259
|
+
|
|
260
|
+
/** Returns a promise that resolves (with no value) after roughly `ms` milliseconds. */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
|
package/src/fleet/services.ts
CHANGED
|
@@ -32,6 +32,8 @@ import { SystemResourceMonitor } from "../observability/system-resources.js";
|
|
|
32
32
|
// Stub re-exports so existing references compile; consumers must call initPlatformServices().
|
|
33
33
|
// TODO: Replace with proper DI / service-locator pattern in platform-core.
|
|
34
34
|
import { DrizzleTwoFactorRepository } from "../security/two-factor-repository.js";
|
|
35
|
+
import type { RolloutOrchestrator } from "./rollout-orchestrator.js";
|
|
36
|
+
import type { VolumeSnapshotManager } from "./volume-snapshot-manager.js";
|
|
35
37
|
|
|
36
38
|
// Platform singletons (getAdminAuditLog, getCreditLedger, etc.) are wired by
|
|
37
39
|
// the consuming application's own composition root (e.g. wopr-platform's
|
|
@@ -136,6 +138,8 @@ let _restoreLogStore: IRestoreLogStore | null = null;
|
|
|
136
138
|
let _restoreService: RestoreService | null = null;
|
|
137
139
|
let _backupStatusStore: IBackupStatusStore | null = null;
|
|
138
140
|
let _snapshotManager: SnapshotManager | null = null;
|
|
141
|
+
let _volumeSnapshotManager: VolumeSnapshotManager | null = null;
|
|
142
|
+
let _rolloutOrchestrator: RolloutOrchestrator | null = null;
|
|
139
143
|
|
|
140
144
|
const S3_BUCKET = process.env.S3_BUCKET || "wopr-backups";
|
|
141
145
|
|
|
@@ -537,6 +541,28 @@ export function getSnapshotManager(): SnapshotManager {
|
|
|
537
541
|
return _snapshotManager;
|
|
538
542
|
}
|
|
539
543
|
|
|
544
|
+
/**
 * Returns the registered VolumeSnapshotManager.
 *
 * @throws Error when no manager has been registered via setVolumeSnapshotManager().
 */
export function getVolumeSnapshotManager(): VolumeSnapshotManager {
  const registered = _volumeSnapshotManager;
  if (registered) return registered;
  throw new Error("VolumeSnapshotManager not initialized — call setVolumeSnapshotManager() first");
}
|
|
550
|
+
|
|
551
|
+
/** Registers the VolumeSnapshotManager that getVolumeSnapshotManager() will return. */
export function setVolumeSnapshotManager(mgr: VolumeSnapshotManager): void {
  _volumeSnapshotManager = mgr;
}
|
|
554
|
+
|
|
555
|
+
/**
 * Returns the registered RolloutOrchestrator.
 *
 * @throws Error when no orchestrator has been registered via setRolloutOrchestrator().
 */
export function getRolloutOrchestrator(): RolloutOrchestrator {
  const registered = _rolloutOrchestrator;
  if (registered) return registered;
  throw new Error("RolloutOrchestrator not initialized — call setRolloutOrchestrator() first");
}
|
|
561
|
+
|
|
562
|
+
/** Registers the RolloutOrchestrator that getRolloutOrchestrator() will return. */
export function setRolloutOrchestrator(orch: RolloutOrchestrator): void {
  _rolloutOrchestrator = orch;
}
|
|
565
|
+
|
|
540
566
|
export function getRestoreService(): RestoreService {
|
|
541
567
|
if (!_restoreService) {
|
|
542
568
|
_restoreService = new RestoreService({
|
|
@@ -877,6 +903,8 @@ export function _resetForTest(): void {
|
|
|
877
903
|
_restoreService = null;
|
|
878
904
|
_backupStatusStore = null;
|
|
879
905
|
_snapshotManager = null;
|
|
906
|
+
_volumeSnapshotManager = null;
|
|
907
|
+
_rolloutOrchestrator = null;
|
|
880
908
|
_botBilling = null;
|
|
881
909
|
_phoneNumberRepo = null;
|
|
882
910
|
_affiliateRepo = null;
|