@wopr-network/platform-core 1.17.0 → 1.19.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/api/routes/admin-audit-helper.d.ts +1 -1
- package/dist/billing/crypto/evm/__tests__/config.test.js +10 -0
- package/dist/billing/crypto/evm/config.js +12 -0
- package/dist/billing/crypto/evm/types.d.ts +1 -1
- package/dist/fleet/__tests__/rollout-strategy.test.d.ts +1 -0
- package/dist/fleet/__tests__/rollout-strategy.test.js +157 -0
- package/dist/fleet/__tests__/volume-snapshot-manager.test.d.ts +1 -0
- package/dist/fleet/__tests__/volume-snapshot-manager.test.js +171 -0
- package/dist/fleet/index.d.ts +2 -0
- package/dist/fleet/index.js +2 -0
- package/dist/fleet/rollout-strategy.d.ts +52 -0
- package/dist/fleet/rollout-strategy.js +91 -0
- package/dist/fleet/volume-snapshot-manager.d.ts +35 -0
- package/dist/fleet/volume-snapshot-manager.js +185 -0
- package/docs/superpowers/specs/2026-03-14-fleet-auto-update-design.md +300 -0
- package/docs/superpowers/specs/2026-03-14-paperclip-org-integration-design.md +359 -0
- package/docs/superpowers/specs/2026-03-14-role-permissions-design.md +346 -0
- package/package.json +1 -1
- package/src/api/routes/admin-audit-helper.ts +1 -1
- package/src/billing/crypto/evm/__tests__/config.test.ts +12 -0
- package/src/billing/crypto/evm/config.ts +13 -1
- package/src/billing/crypto/evm/types.ts +1 -1
- package/src/fleet/__tests__/rollout-strategy.test.ts +192 -0
- package/src/fleet/__tests__/volume-snapshot-manager.test.ts +218 -0
- package/src/fleet/index.ts +2 -0
- package/src/fleet/rollout-strategy.ts +128 -0
- package/src/fleet/volume-snapshot-manager.ts +213 -0
- package/src/marketplace/volume-installer.test.ts +8 -2
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
import { mkdir, readdir, rm, stat } from "node:fs/promises";
|
|
2
|
+
import { join } from "node:path";
|
|
3
|
+
import { logger } from "../config/logger.js";
|
|
4
|
+
const ALPINE_IMAGE = "alpine:latest";
/** Strict validation for snapshot IDs — prevents path traversal and shell injection. */
const SNAPSHOT_ID_RE = /^[A-Za-z0-9._-]+$/;
/**
 * Throw unless `snapshotId` consists solely of safe filename characters.
 * IDs flow into archive paths and `sh -c` command strings, so anything
 * outside the whitelist is rejected outright.
 */
function validateSnapshotId(snapshotId) {
    if (SNAPSHOT_ID_RE.test(snapshotId)) {
        return;
    }
    throw new Error(`Invalid snapshot ID: ${snapshotId}`);
}
|
|
12
|
+
/**
 * Snapshots and restores Docker named volumes using temporary alpine containers.
 * Used for nuclear rollback during fleet updates — if a container update fails,
 * we roll back both the image AND the data volumes.
 */
export class VolumeSnapshotManager {
    docker;
    backupDir;
    /**
     * @param {object} docker - Dockerode-style client exposing `createContainer()`.
     * @param {string} [backupDir="/data/fleet/snapshots"] - Host directory that
     *     receives `.tar` archives; bind-mounted into the helper containers.
     */
    constructor(docker, backupDir = "/data/fleet/snapshots") {
        this.docker = docker;
        this.backupDir = backupDir;
    }
    /**
     * Create a snapshot of a Docker named volume.
     * Runs a throwaway alpine container that tars the volume's contents into
     * `<backupDir>/<volumeName>-<timestamp>.tar`.
     * @param {string} volumeName - Docker named volume to archive.
     * @returns {Promise<object>} `{ id, volumeName, archivePath, createdAt, sizeBytes }`.
     * @throws If the volume name is unsafe or the helper container exits non-zero.
     */
    async snapshot(volumeName) {
        // The volume name is embedded in the archive path, the tar Cmd, and the
        // bind-mount spec. Restrict it to the same safe charset used for snapshot
        // IDs to prevent path traversal and mount-spec injection (restore/delete
        // already validate; snapshot previously did not).
        if (!/^[A-Za-z0-9._-]+$/.test(volumeName)) {
            throw new Error(`Invalid volume name: ${volumeName}`);
        }
        await mkdir(this.backupDir, { recursive: true });
        // ISO timestamp with ":" and "." replaced so the id is filename-safe.
        const timestamp = new Date().toISOString().replace(/[:.]/g, "-");
        const id = `${volumeName}-${timestamp}`;
        const archivePath = join(this.backupDir, `${id}.tar`);
        const container = await this.docker.createContainer({
            Image: ALPINE_IMAGE,
            Cmd: ["tar", "cf", `/backup/${id}.tar`, "-C", "/source", "."],
            HostConfig: {
                // Source volume mounted read-only; backup dir writable for the archive.
                Binds: [`${volumeName}:/source:ro`, `${this.backupDir}:/backup`],
                AutoRemove: true,
            },
        });
        try {
            await container.start();
            const result = await container.wait();
            if (result.StatusCode !== 0) {
                throw new Error(`Snapshot container exited with code ${result.StatusCode}`);
            }
        }
        catch (err) {
            // AutoRemove handles cleanup, but if start failed the container may still exist
            try {
                await container.remove({ force: true });
            }
            catch {
                // already removed by AutoRemove
            }
            throw err;
        }
        const info = await stat(archivePath);
        const snapshot = {
            id,
            volumeName,
            archivePath,
            createdAt: new Date(),
            sizeBytes: info.size,
        };
        logger.info(`Volume snapshot created: ${id} (${info.size} bytes)`);
        return snapshot;
    }
    /**
     * Restore a volume from a snapshot: wipe the volume, then extract the archive.
     * @param {string} snapshotId - Id produced by snapshot(); strictly validated.
     * @throws If the id is invalid, the archive is missing, or the container fails.
     */
    async restore(snapshotId) {
        validateSnapshotId(snapshotId);
        const archivePath = join(this.backupDir, `${snapshotId}.tar`);
        // Verify archive exists (stat rejects with ENOENT otherwise).
        await stat(archivePath);
        // Extract volume name from snapshot ID (everything before the last ISO timestamp)
        const volumeName = this.extractVolumeName(snapshotId);
        const container = await this.docker.createContainer({
            Image: ALPINE_IMAGE,
            // `find -mindepth 1 -delete` clears every existing entry (including
            // short dotfile names like ".a") and exits 0 on an already-empty
            // volume. The previous `rm -rf ./* ./.??*` form failed the && chain
            // whenever the globs matched nothing, so restoring into an empty
            // volume never reached the tar extraction.
            Cmd: ["sh", "-c", `find /target -mindepth 1 -delete && tar xf /backup/${snapshotId}.tar -C /target`],
            HostConfig: {
                // Target volume writable; backup dir read-only.
                Binds: [`${volumeName}:/target`, `${this.backupDir}:/backup:ro`],
                AutoRemove: true,
            },
        });
        try {
            await container.start();
            const result = await container.wait();
            if (result.StatusCode !== 0) {
                throw new Error(`Restore container exited with code ${result.StatusCode}`);
            }
        }
        catch (err) {
            try {
                await container.remove({ force: true });
            }
            catch {
                // already removed by AutoRemove
            }
            throw err;
        }
        logger.info(`Volume restored from snapshot: ${snapshotId}`);
    }
    /**
     * List all snapshots for a volume, newest first.
     * @param {string} volumeName
     * @returns {Promise<object[]>} Snapshot metadata (createdAt = archive mtime).
     */
    async list(volumeName) {
        let files;
        try {
            files = await readdir(this.backupDir);
        }
        catch {
            // Backup dir doesn't exist yet — no snapshots.
            return [];
        }
        const prefix = `${volumeName}-`;
        const matching = files.filter((f) => f.startsWith(prefix) && f.endsWith(".tar"));
        const snapshots = [];
        for (const file of matching) {
            const id = file.replace(/\.tar$/, "");
            // Prefix matching alone also picks up volumes whose name merely starts
            // with `volumeName` (e.g. "app-db" snapshots listed under "app") —
            // confirm the embedded volume name matches exactly.
            try {
                if (this.extractVolumeName(id) !== volumeName) {
                    continue;
                }
            }
            catch {
                // Malformed id (no timestamp) — not one of ours; skip.
                continue;
            }
            const archivePath = join(this.backupDir, file);
            try {
                const info = await stat(archivePath);
                snapshots.push({
                    id,
                    volumeName,
                    archivePath,
                    createdAt: info.mtime,
                    sizeBytes: info.size,
                });
            }
            catch {
                // File disappeared between readdir and stat — skip
            }
        }
        // Sort newest first
        snapshots.sort((a, b) => b.createdAt.getTime() - a.createdAt.getTime());
        return snapshots;
    }
    /**
     * Delete a snapshot archive. `force: true` makes this a no-op when the
     * archive is already gone.
     * @param {string} snapshotId
     */
    async delete(snapshotId) {
        validateSnapshotId(snapshotId);
        const archivePath = join(this.backupDir, `${snapshotId}.tar`);
        await rm(archivePath, { force: true });
        logger.info(`Volume snapshot deleted: ${snapshotId}`);
    }
    /**
     * Delete all snapshot archives older than maxAge ms (by file mtime).
     * @param {number} maxAgeMs
     * @returns {Promise<number>} Count of archives removed.
     */
    async cleanup(maxAgeMs) {
        let files;
        try {
            files = await readdir(this.backupDir);
        }
        catch {
            // Backup dir doesn't exist — nothing to clean.
            return 0;
        }
        const cutoff = Date.now() - maxAgeMs;
        let deleted = 0;
        for (const file of files) {
            if (!file.endsWith(".tar"))
                continue;
            const archivePath = join(this.backupDir, file);
            try {
                const info = await stat(archivePath);
                if (info.mtime.getTime() < cutoff) {
                    await rm(archivePath, { force: true });
                    deleted++;
                }
            }
            catch {
                // File disappeared — skip
            }
        }
        if (deleted > 0) {
            logger.info(`Volume snapshot cleanup: removed ${deleted} old snapshots`);
        }
        return deleted;
    }
    /**
     * Extract volume name from snapshot ID.
     * Snapshot IDs are `${volumeName}-${ISO timestamp with colons/dots replaced}`.
     * ISO timestamps start with 4 digits (year), so we find the last occurrence
     * of `-YYYY` pattern to split (greedy `.+` backtracks to the final match).
     * @throws If no timestamp pattern is present in the id.
     */
    extractVolumeName(snapshotId) {
        // Match the timestamp part: -YYYY-MM-DDTHH-MM-SS-MMMZ
        const match = snapshotId.match(/^(.+)-\d{4}-\d{2}-\d{2}T/);
        if (!match) {
            throw new Error(`Cannot extract volume name from snapshot ID: ${snapshotId}`);
        }
        return match[1];
    }
}
|
|
@@ -0,0 +1,300 @@
|
|
|
1
|
+
# Fleet Auto-Update with Rolling Waves
|
|
2
|
+
|
|
3
|
+
**Date:** 2026-03-14
|
|
4
|
+
**Status:** Draft
|
|
5
|
+
**Repos:** platform-core, paperclip, paperclip-platform, paperclip-platform-ui
|
|
6
|
+
|
|
7
|
+
## Problem
|
|
8
|
+
|
|
9
|
+
Upstream Paperclip changes land nightly via `upstream-sync.mjs`, which rebases our fork and creates a PR. After manual review and merge, `docker-managed.yml` auto-builds and pushes `ghcr.io/wopr-network/paperclip:managed`. But existing running containers never receive the update. New containers get `:managed` on first pull; old containers are stuck on whatever digest they were created with.
|
|
10
|
+
|
|
11
|
+
platform-core has `ImagePoller` and `ContainerUpdater` classes that are fully implemented and tested but **not wired into the application lifecycle**.
|
|
12
|
+
|
|
13
|
+
## Design
|
|
14
|
+
|
|
15
|
+
### Pipeline Overview
|
|
16
|
+
|
|
17
|
+
```
|
|
18
|
+
paperclipai/paperclip (upstream)
|
|
19
|
+
| nightly 06:00 UTC
|
|
20
|
+
upstream-sync.mjs (rebase + hostedMode guards + changelog generation)
|
|
21
|
+
| creates PR
|
|
22
|
+
human reviews & merges PR
|
|
23
|
+
| push to master
|
|
24
|
+
docker-managed.yml (auto-build)
|
|
25
|
+
| pushes ghcr.io/wopr-network/paperclip:managed
|
|
26
|
+
ImagePoller detects new digest
|
|
27
|
+
| groups bots by tenant
|
|
28
|
+
RolloutOrchestrator executes strategy
|
|
29
|
+
| per-bot update sequence
|
|
30
|
+
ContainerUpdater (snapshot + pull + recreate + health check + rollback)
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
### Human Gate
|
|
34
|
+
|
|
35
|
+
The only human checkpoint is **reviewing and merging the upstream sync PR**. Everything downstream is automatic.
|
|
36
|
+
|
|
37
|
+
### 1. Changelog Generation (changes to `paperclip/scripts/upstream-sync.mjs`)
|
|
38
|
+
|
|
39
|
+
After rebase and hostedMode gap scanning, the sync agent generates two changelogs:
|
|
40
|
+
|
|
41
|
+
**Internal changelog** (`changelogs/internal/YYYY-MM-DD.md`):
|
|
42
|
+
- Full developer-facing diff summary
|
|
43
|
+
- What upstream changed, what guards were added, conflicts resolved
|
|
44
|
+
- For PR review purposes
|
|
45
|
+
|
|
46
|
+
**User-facing changelog** (`changelogs/user-facing/YYYY-MM-DD.json`):
|
|
47
|
+
- Structured format: `{ version, date, sections: [{ title: "New" | "Improved" | "Fixed", items: string[] }] }`
|
|
48
|
+
- Filtered through hosted-mode exclusion list — silently drops anything related to: adapters, model selection, thinking effort, runtime/heartbeat config, provider API keys, CLI, deployment modes, infrastructure, self-hosting
|
|
49
|
+
- Same `HOSTED_MODE_CONTEXT` that drives the guard scanner drives the changelog filter
|
|
50
|
+
|
|
51
|
+
Both files are committed in the sync PR. The user-facing JSON is copied into the Docker image during build (add `COPY changelogs/user-facing/ /app/changelogs/` to `Dockerfile.managed`). If the image exists, its changelog exists.
|
|
52
|
+
|
|
53
|
+
**Changelog retrieval:** After pulling a new image (before starting the update sequence), extract the changelog:
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
docker run --rm ghcr.io/wopr-network/paperclip:managed cat /app/changelogs/latest.json
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
The extracted JSON is stored in the fleet event payload for email and UI consumption. The `latest.json` symlink always points to the most recent changelog file.
|
|
60
|
+
|
|
61
|
+
### 2. Image Detection (wire existing code in `platform-core`)
|
|
62
|
+
|
|
63
|
+
Changes to `src/fleet/services.ts`:
|
|
64
|
+
|
|
65
|
+
- Add `ImagePoller` and `ContainerUpdater` singletons
|
|
66
|
+
- `initFleet()` starts the poller and wires `poller.onUpdateAvailable` to `RolloutOrchestrator`
|
|
67
|
+
- ImagePoller already handles poll intervals per release channel (canary=5m, staging=15m, stable=30m)
|
|
68
|
+
|
|
69
|
+
### 3. Rollout Orchestrator (new: `src/fleet/rollout-orchestrator.ts`)
|
|
70
|
+
|
|
71
|
+
GoF Strategy pattern. The orchestrator is the context; strategies are interchangeable.
|
|
72
|
+
|
|
73
|
+
**`IRolloutStrategy` interface:**
|
|
74
|
+
|
|
75
|
+
```typescript
|
|
76
|
+
interface IRolloutStrategy {
|
|
77
|
+
/** Select next batch from remaining bots */
|
|
78
|
+
nextBatch(remaining: BotProfile[]): BotProfile[];
|
|
79
|
+
/** Milliseconds to wait between waves */
|
|
80
|
+
pauseDuration(): number;
|
|
81
|
+
/** What to do when a single bot update fails */
|
|
82
|
+
onBotFailure(botId: string, error: Error, attempt: number): "abort" | "skip" | "retry";
|
|
83
|
+
/** Max retries per bot before skip/abort */
|
|
84
|
+
maxRetries(): number;
|
|
85
|
+
/** Health check timeout per bot (ms) */
|
|
86
|
+
healthCheckTimeout(): number;
|
|
87
|
+
}
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
**Concrete strategies:**
|
|
91
|
+
|
|
92
|
+
| Strategy | Batch | Pause | Failure | Use Case |
|
|
93
|
+
|----------|-------|-------|---------|----------|
|
|
94
|
+
| `RollingWaveStrategy` | configurable % | configurable | abort on N+ failures | Default for auto-update |
|
|
95
|
+
| `SingleBotStrategy` | 1 bot | N/A | report | Manual per-bot update button |
|
|
96
|
+
| `ImmediateStrategy` | all | 0 | skip | Emergency hotfix |
|
|
97
|
+
|
|
98
|
+
Strategy selection is **admin-controlled only** — users never see this.
|
|
99
|
+
|
|
100
|
+
**Orchestrator flow:**
|
|
101
|
+
|
|
102
|
+
```
|
|
103
|
+
1. Group update-eligible bots by tenant
|
|
104
|
+
2. For each tenant:
|
|
105
|
+
a. Check tenant update mode (auto/manual)
|
|
106
|
+
b. If manual: mark bots as "update available", send notification, stop
|
|
107
|
+
c. If auto: check if current time is within tenant's preferred window
|
|
108
|
+
d. Select strategy (from admin config)
|
|
109
|
+
e. Execute waves:
|
|
110
|
+
- batch = strategy.nextBatch(remaining)
|
|
111
|
+
- for each bot in batch: ContainerUpdater.updateBot()
|
|
112
|
+
- if any failure: strategy.onBotFailure() → abort/skip/retry
|
|
113
|
+
- sleep(strategy.pauseDuration())
|
|
114
|
+
- repeat until remaining is empty
|
|
115
|
+
3. Send notification emails with changelog
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
### 4. Update Sequence Per Bot (major rework of `ContainerUpdater`)
|
|
119
|
+
|
|
120
|
+
Nuclear rollback — image AND volumes roll back together.
|
|
121
|
+
|
|
122
|
+
**Volume Snapshot Mechanism (new: `VolumeSnapshotManager`):**
|
|
123
|
+
|
|
124
|
+
The existing `SnapshotManager` operates on filesystem paths, not Docker named volumes. A new `VolumeSnapshotManager` is needed that snapshots Docker named volumes using a temporary container:
|
|
125
|
+
|
|
126
|
+
```bash
|
|
127
|
+
# Snapshot a named volume to a tar archive:
|
|
128
|
+
docker run --rm -v <volume-name>:/source -v <backup-dir>:/backup alpine \
|
|
129
|
+
tar cf /backup/<volume-name>-<timestamp>.tar -C /source .
|
|
130
|
+
|
|
131
|
+
# Restore a named volume from a tar archive:
|
|
132
|
+
docker run --rm -v <volume-name>:/target -v <backup-dir>:/backup alpine \
|
|
133
|
+
sh -c "find /target -mindepth 1 -delete && tar xf /backup/<volume-name>-<timestamp>.tar -C /target"
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
This is a new class (`src/fleet/volume-snapshot-manager.ts`), not a modification of the existing `SnapshotManager`.
|
|
137
|
+
|
|
138
|
+
**Update sequence:**
|
|
139
|
+
|
|
140
|
+
```
|
|
141
|
+
1. Snapshot /data and /paperclip volumes (via VolumeSnapshotManager)
|
|
142
|
+
2. Record previous image digest (already implemented)
|
|
143
|
+
3. Pull new image
|
|
144
|
+
4. Stop container
|
|
145
|
+
5. Recreate container with new image (named volumes remount automatically)
|
|
146
|
+
6. Start container (PAPERCLIP_MIGRATION_AUTO_APPLY=true runs Drizzle migrations on boot)
|
|
147
|
+
7. Health check: HTTP GET http://container:3100/health, expect {"status":"ok"}
|
|
148
|
+
- Timeout: 120s (increased from current 60s to allow for Drizzle migration time)
|
|
149
|
+
- Poll interval: 5s
|
|
150
|
+
8a. HEALTHY:
|
|
151
|
+
- Delete volume snapshots
|
|
152
|
+
- Emit fleet event: bot.updated
|
|
153
|
+
- Record new digest
|
|
154
|
+
8b. UNHEALTHY:
|
|
155
|
+
- Stop container
|
|
156
|
+
- Restore volume snapshots from step 1 (via VolumeSnapshotManager)
|
|
157
|
+
- Recreate container with OLD image (digest-pinned to prevent re-pulling new)
|
|
158
|
+
- Start container
|
|
159
|
+
- Verify old container is healthy
|
|
160
|
+
- Emit fleet event: bot.update_failed
|
|
161
|
+
- Report to orchestrator (abort/skip/retry per strategy)
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
**Health check upgrade:** Replace `node -e 'process.exit(0)'` in `createContainer()` with:
|
|
165
|
+
|
|
166
|
+
```typescript
|
|
167
|
+
Healthcheck: {
|
|
168
|
+
// Use node+fetch instead of curl — Paperclip's base image (node:lts-trixie-slim)
|
|
169
|
+
// may not have curl installed.
|
|
170
|
+
Test: ["CMD-SHELL", "node -e \"fetch('http://localhost:3100/health').then(r=>process.exit(r.ok?0:1)).catch(()=>process.exit(1))\""],
|
|
171
|
+
Interval: 30_000_000_000,
|
|
172
|
+
Timeout: 10_000_000_000,
|
|
173
|
+
Retries: 3,
|
|
174
|
+
StartPeriod: 60_000_000_000, // 60s for Drizzle migrations on boot
|
|
175
|
+
}
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
**Note:** `HEALTH_CHECK_TIMEOUT_MS` in `ContainerUpdater` must be increased from 60,000 to 120,000 to match the spec's 120s timeout.
|
|
179
|
+
|
|
180
|
+
### 5. Tenant Update Config
|
|
181
|
+
|
|
182
|
+
Stored per-tenant (moves to per-org when org support ships — see `2026-03-14-paperclip-org-integration-design.md`).
|
|
183
|
+
|
|
184
|
+
```typescript
|
|
185
|
+
interface TenantUpdateConfig {
|
|
186
|
+
/** "auto" = rolling wave in preferred window; "manual" = badge + button */
|
|
187
|
+
mode: "auto" | "manual";
|
|
188
|
+
/** Hour of day (UTC) for auto-update window. Only used when mode=auto. */
|
|
189
|
+
preferredHourUtc: number; // 0-23, default 3
|
|
190
|
+
}
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
Default for new tenants: `{ mode: "manual", preferredHourUtc: 3 }`.
|
|
194
|
+
|
|
195
|
+
**Repository interface** (follows the `IFooRepository` pattern used throughout platform-core):
|
|
196
|
+
|
|
197
|
+
```typescript
|
|
198
|
+
export interface ITenantUpdateConfigRepository {
|
|
199
|
+
get(tenantId: string): Promise<TenantUpdateConfig | null>;
|
|
200
|
+
upsert(tenantId: string, config: TenantUpdateConfig): Promise<void>;
|
|
201
|
+
listAutoEnabled(): Promise<Array<{ tenantId: string; config: TenantUpdateConfig }>>;
|
|
202
|
+
}
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
`DrizzleTenantUpdateConfigRepository` implements this against a `tenant_update_configs` table with columns `(tenant_id TEXT PK, mode TEXT, preferred_hour_utc INTEGER, updated_at BIGINT)`.
|
|
206
|
+
|
|
207
|
+
**Audit logging:** All config changes (mode switch, hour change) are logged via `logger.info("Tenant update config changed", { tenantId, oldConfig, newConfig, actorUserId })`. Admin-triggered updates via the `/admin/updates` route include the actor in the log entry.
|
|
208
|
+
|
|
209
|
+
Admin panel can override per-tenant or set global defaults.
|
|
210
|
+
|
|
211
|
+
**Precedence: tenant config overrides per-bot `updatePolicy`.** The existing `BotProfile.updatePolicy` field (per-bot: `on-push`, `nightly`, `manual`, `cron:*`) is superseded by `TenantUpdateConfig` for hosted deployments. The `RolloutOrchestrator` reads tenant config, not bot-level policy. `ImagePoller.shouldAutoUpdate()` is refactored to always return `false` — the poller's only job is to detect new digests and notify the orchestrator, which makes the auto/manual decision based on tenant config.
|
|
212
|
+
|
|
213
|
+
`ImagePoller.isNightlyWindow()` (hardcoded 03:00-03:30 UTC) is superseded by the orchestrator's per-tenant `preferredHourUtc` window check. The poller's nightly logic becomes a no-op.
|
|
214
|
+
|
|
215
|
+
Per-bot `updatePolicy` is preserved in the schema for self-hosted (non-platform) deployments where there is no tenant config.
|
|
216
|
+
|
|
217
|
+
### 6. Admin Controls
|
|
218
|
+
|
|
219
|
+
Admin panel (platform-core admin routes, not user-facing):
|
|
220
|
+
|
|
221
|
+
- **Global update mode**: auto / manual / paused (pause halts all rollouts fleet-wide)
|
|
222
|
+
- **Strategy config**: batch %, pause duration, failure threshold
|
|
223
|
+
- **Default update window**: hour UTC
|
|
224
|
+
- **Per-tenant overrides**: mode, window
|
|
225
|
+
- **Manual triggers**: "roll out now" for a specific image digest
|
|
226
|
+
- **Rollout status dashboard**: which bots updated, which failed, which pending
|
|
227
|
+
|
|
228
|
+
### 7. User-Facing Experience
|
|
229
|
+
|
|
230
|
+
**Auto mode (tenant doesn't know or care):**
|
|
231
|
+
- Updates happen silently during configured window
|
|
232
|
+
- Email after: "Your Paperclip was updated. Here's what's new: [changelog]"
|
|
233
|
+
- Brief downtime during container restart (seconds)
|
|
234
|
+
|
|
235
|
+
**Manual mode:**
|
|
236
|
+
- Email when update available: "A new update is available for your Paperclip. [changelog]"
|
|
237
|
+
- In-app: badge on bot in UI indicating update available
|
|
238
|
+
- Click "Update" → modal shows user-facing changelog with "Update Now" / "Later" buttons
|
|
239
|
+
- "Update Now" triggers `SingleBotStrategy` immediately
|
|
240
|
+
- Email after: "Your Paperclip was updated. Here's what's new: [changelog]"
|
|
241
|
+
|
|
242
|
+
**Both modes:**
|
|
243
|
+
- Admin email on rollback failure
|
|
244
|
+
- Fleet event log for audit
|
|
245
|
+
|
|
246
|
+
### 8. Image Allowlist
|
|
247
|
+
|
|
248
|
+
`FLEET_IMAGE_ALLOWLIST` already allows `ghcr.io/wopr-network/` — covers both WOPR and Paperclip images. Future brands add their prefix.
|
|
249
|
+
|
|
250
|
+
## Files to Create/Modify
|
|
251
|
+
|
|
252
|
+
### platform-core
|
|
253
|
+
|
|
254
|
+
| File | Action | Description |
|
|
255
|
+
|------|--------|-------------|
|
|
256
|
+
| `src/fleet/rollout-orchestrator.ts` | Create | Strategy pattern orchestrator |
|
|
257
|
+
| `src/fleet/rollout-strategies.ts` | Create | RollingWave, SingleBot, Immediate strategies |
|
|
258
|
+
| `src/fleet/services.ts` | Modify | Wire ImagePoller + ContainerUpdater + RolloutOrchestrator into initFleet() |
|
|
259
|
+
| `src/fleet/updater.ts` | Major rework | Add volume snapshot/restore lifecycle, replace FleetManager delegation with direct Docker operations for atomic update, upgrade health check from Docker HEALTHCHECK polling to HTTP GET, increase timeout from 60s to 120s |
|
|
260
|
+
| `src/fleet/volume-snapshot-manager.ts` | Create | Snapshot and restore Docker named volumes using temporary alpine containers |
|
|
261
|
+
| `src/fleet/fleet-manager.ts` | Modify | Upgrade HEALTHCHECK in createContainer() to use node+fetch instead of node -e |
|
|
262
|
+
| `src/fleet/image-poller.ts` | Modify | Wire onUpdateAvailable to orchestrator instead of direct updater |
|
|
263
|
+
| `src/db/schema/tenant-update-config.ts` | Create | Drizzle schema for tenant update preferences |
|
|
264
|
+
| `src/api/routes/admin-updates.ts` | Create | Admin API for update management |
|
|
265
|
+
| `src/fleet/update-notifier.ts` | Create | Email notifications for updates |
|
|
266
|
+
|
|
267
|
+
### paperclip
|
|
268
|
+
|
|
269
|
+
| File | Action | Description |
|
|
270
|
+
|------|--------|-------------|
|
|
271
|
+
| `scripts/upstream-sync.mjs` | Modify | Add changelog generation step |
|
|
272
|
+
| `Dockerfile.managed` | Modify | COPY changelogs into image |
|
|
273
|
+
| `changelogs/` | Create | Directory for generated changelogs |
|
|
274
|
+
|
|
275
|
+
### paperclip-platform-ui
|
|
276
|
+
|
|
277
|
+
| File | Action | Description |
|
|
278
|
+
|------|--------|-------------|
|
|
279
|
+
| Update modal component | Create | Shows changelog, "Update Now" / "Later" |
|
|
280
|
+
| Bot card badge | Modify | Show "Update Available" indicator |
|
|
281
|
+
|
|
282
|
+
## Dependencies
|
|
283
|
+
|
|
284
|
+
- **Implementation work required:**
|
|
285
|
+
- `ImagePoller` and `ContainerUpdater` classes exist and are tested, but have no singleton getters in `services.ts` and are not imported or wired. Docker instance injection needs to be plumbed through.
|
|
286
|
+
- `ContainerUpdater` needs significant enhancement: volume snapshot/restore integration with the new `VolumeSnapshotManager` (the existing `SnapshotManager` operates on filesystem paths, not named volumes — see section 4), HTTP-based health checks (replacing `node -e`), increased timeout from 60s to 120s for migration time.
|
|
287
|
+
- `RolloutOrchestrator` and strategies are entirely new code.
|
|
288
|
+
- `SnapshotManager` exists in `src/backup/` but has no integration with `ContainerUpdater`.
|
|
289
|
+
- **Future:** Org support (see `2026-03-14-paperclip-org-integration-design.md`) — update config moves from tenant to org level after org integration ships
|
|
290
|
+
- **Future:** Cron policy implementation in ImagePoller (currently stubbed)
|
|
291
|
+
|
|
292
|
+
## Risks
|
|
293
|
+
|
|
294
|
+
| Risk | Mitigation |
|
|
295
|
+
|------|------------|
|
|
296
|
+
| Bad upstream migration corrupts data | Nuclear rollback: volume snapshot restored alongside image rollback |
|
|
297
|
+
| Upstream pushes breaking change | Human gate at sync PR review catches this before any image is built |
|
|
298
|
+
| Rolling wave takes too long | ImmediateStrategy available for emergency hotfixes |
|
|
299
|
+
| Health check passes but app is subtly broken | `/health` endpoint queries DB, so migration failures surface. Consider adding deeper health checks later. |
|
|
300
|
+
| Volume snapshots consume disk | Snapshots deleted after successful update. Failed rollbacks alert admin for manual cleanup. |
|