@wopr-network/platform-core 1.17.0 → 1.19.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28) hide show
  1. package/dist/api/routes/admin-audit-helper.d.ts +1 -1
  2. package/dist/billing/crypto/evm/__tests__/config.test.js +10 -0
  3. package/dist/billing/crypto/evm/config.js +12 -0
  4. package/dist/billing/crypto/evm/types.d.ts +1 -1
  5. package/dist/fleet/__tests__/rollout-strategy.test.d.ts +1 -0
  6. package/dist/fleet/__tests__/rollout-strategy.test.js +157 -0
  7. package/dist/fleet/__tests__/volume-snapshot-manager.test.d.ts +1 -0
  8. package/dist/fleet/__tests__/volume-snapshot-manager.test.js +171 -0
  9. package/dist/fleet/index.d.ts +2 -0
  10. package/dist/fleet/index.js +2 -0
  11. package/dist/fleet/rollout-strategy.d.ts +52 -0
  12. package/dist/fleet/rollout-strategy.js +91 -0
  13. package/dist/fleet/volume-snapshot-manager.d.ts +35 -0
  14. package/dist/fleet/volume-snapshot-manager.js +185 -0
  15. package/docs/superpowers/specs/2026-03-14-fleet-auto-update-design.md +300 -0
  16. package/docs/superpowers/specs/2026-03-14-paperclip-org-integration-design.md +359 -0
  17. package/docs/superpowers/specs/2026-03-14-role-permissions-design.md +346 -0
  18. package/package.json +1 -1
  19. package/src/api/routes/admin-audit-helper.ts +1 -1
  20. package/src/billing/crypto/evm/__tests__/config.test.ts +12 -0
  21. package/src/billing/crypto/evm/config.ts +13 -1
  22. package/src/billing/crypto/evm/types.ts +1 -1
  23. package/src/fleet/__tests__/rollout-strategy.test.ts +192 -0
  24. package/src/fleet/__tests__/volume-snapshot-manager.test.ts +218 -0
  25. package/src/fleet/index.ts +2 -0
  26. package/src/fleet/rollout-strategy.ts +128 -0
  27. package/src/fleet/volume-snapshot-manager.ts +213 -0
  28. package/src/marketplace/volume-installer.test.ts +8 -2
@@ -0,0 +1,185 @@
1
+ import { mkdir, readdir, rm, stat } from "node:fs/promises";
2
+ import { join } from "node:path";
3
+ import { logger } from "../config/logger.js";
4
const ALPINE_IMAGE = "alpine:latest";
// Snapshot IDs may contain only letters, digits, dots, underscores, and
// hyphens. Rejecting everything else (slashes, colons, spaces, shell
// metacharacters) lets IDs be embedded safely in file paths and in the
// shell command run by the restore container.
const SNAPSHOT_ID_RE = /^[A-Za-z0-9._-]+$/;
/**
 * Reject snapshot IDs that could enable path traversal or shell injection.
 * @param {string} snapshotId - candidate snapshot ID
 * @throws {Error} when the ID contains characters outside the allowed set
 */
function validateSnapshotId(snapshotId) {
    const isSafe = SNAPSHOT_ID_RE.test(snapshotId);
    if (isSafe) {
        return;
    }
    throw new Error(`Invalid snapshot ID: ${snapshotId}`);
}
12
/**
 * Snapshots and restores Docker named volumes using temporary alpine containers.
 * Used for nuclear rollback during fleet updates — if a container update fails,
 * we roll back both the image AND the data volumes.
 *
 * Archives live in `backupDir` as `<volumeName>-<timestamp>.tar`, where the
 * timestamp is an ISO-8601 string with `:` and `.` replaced by `-`, so a
 * snapshot ID is always filesystem- and shell-safe.
 */
export class VolumeSnapshotManager {
    docker;     // injected Docker client (dockerode-style API — TODO confirm exact interface)
    backupDir;  // host directory holding the snapshot tar archives

    constructor(docker, backupDir = "/data/fleet/snapshots") {
        this.docker = docker;
        this.backupDir = backupDir;
    }

    /**
     * Create a snapshot of a Docker named volume.
     * @param {string} volumeName - Docker volume to archive
     * @returns {Promise<object>} `{ id, volumeName, archivePath, createdAt, sizeBytes }`
     * @throws {Error} on an unsafe volume name or a non-zero tar exit code
     */
    async snapshot(volumeName) {
        // Validate up front: the name is interpolated into a bind spec (a ":"
        // would split it) and joined under backupDir (a "/" or ".." would
        // escape it). Same character set SNAPSHOT_ID_RE enforces for
        // restore()/delete(), so round-tripping an ID always re-validates.
        if (!/^[A-Za-z0-9._-]+$/.test(volumeName)) {
            throw new Error(`Invalid volume name: ${volumeName}`);
        }
        await mkdir(this.backupDir, { recursive: true });
        const timestamp = new Date().toISOString().replace(/[:.]/g, "-");
        const id = `${volumeName}-${timestamp}`;
        const archivePath = join(this.backupDir, `${id}.tar`);
        // Throwaway alpine container: tar the volume (mounted read-only) into
        // the backup directory. Cmd is an argv array — no shell involved.
        const container = await this.docker.createContainer({
            Image: ALPINE_IMAGE,
            Cmd: ["tar", "cf", `/backup/${id}.tar`, "-C", "/source", "."],
            HostConfig: {
                Binds: [`${volumeName}:/source:ro`, `${this.backupDir}:/backup`],
                AutoRemove: true,
            },
        });
        try {
            await container.start();
            const result = await container.wait();
            if (result.StatusCode !== 0) {
                throw new Error(`Snapshot container exited with code ${result.StatusCode}`);
            }
        }
        catch (err) {
            // AutoRemove handles cleanup, but if start failed the container may still exist
            try {
                await container.remove({ force: true });
            }
            catch {
                // already removed by AutoRemove
            }
            throw err;
        }
        const info = await stat(archivePath);
        const snapshot = {
            id,
            volumeName,
            archivePath,
            createdAt: new Date(),
            sizeBytes: info.size,
        };
        logger.info(`Volume snapshot created: ${id} (${info.size} bytes)`);
        return snapshot;
    }

    /**
     * Restore a volume from a snapshot. The target volume is wiped before the
     * archive is unpacked so files deleted since the snapshot do not survive.
     * @param {string} snapshotId - ID previously returned by snapshot()
     * @throws {Error} if the ID is unsafe, the archive is missing, or the
     *   restore container exits non-zero
     */
    async restore(snapshotId) {
        validateSnapshotId(snapshotId);
        const archivePath = join(this.backupDir, `${snapshotId}.tar`);
        // Verify archive exists before touching the target volume.
        await stat(archivePath);
        // Extract volume name from snapshot ID (everything before the last ISO timestamp)
        const volumeName = this.extractVolumeName(snapshotId);
        // Wipe, then unpack. The three globs together cover regular entries and
        // ALL hidden entries — the previous `./* ./.??*` pair missed
        // single-character dotfiles such as `.a`, leaving stale data behind.
        // snapshotId was validated above, so interpolating it into the shell
        // string is safe.
        const container = await this.docker.createContainer({
            Image: ALPINE_IMAGE,
            Cmd: ["sh", "-c", `cd /target && rm -rf ./* ./.[!.]* ./..?* && tar xf /backup/${snapshotId}.tar -C /target`],
            HostConfig: {
                Binds: [`${volumeName}:/target`, `${this.backupDir}:/backup:ro`],
                AutoRemove: true,
            },
        });
        try {
            await container.start();
            const result = await container.wait();
            if (result.StatusCode !== 0) {
                throw new Error(`Restore container exited with code ${result.StatusCode}`);
            }
        }
        catch (err) {
            try {
                await container.remove({ force: true });
            }
            catch {
                // already removed by AutoRemove
            }
            throw err;
        }
        logger.info(`Volume restored from snapshot: ${snapshotId}`);
    }

    /**
     * List all snapshots for a volume, newest first.
     * @param {string} volumeName - volume whose snapshots to list
     * @returns {Promise<object[]>} snapshot metadata; createdAt is archive mtime
     */
    async list(volumeName) {
        let files;
        try {
            files = await readdir(this.backupDir);
        }
        catch {
            // Backup dir doesn't exist yet — nothing has been snapshotted.
            return [];
        }
        const prefix = `${volumeName}-`;
        const matching = files.filter((f) => f.startsWith(prefix) && f.endsWith(".tar"));
        const snapshots = [];
        for (const file of matching) {
            const id = file.replace(/\.tar$/, "");
            // A bare prefix test is ambiguous: "app-" also prefixes snapshots of
            // a volume named "app-db". Parse the ID and require an exact match.
            let owner;
            try {
                owner = this.extractVolumeName(id);
            }
            catch {
                continue; // not a well-formed snapshot ID — skip
            }
            if (owner !== volumeName) {
                continue;
            }
            const archivePath = join(this.backupDir, file);
            try {
                const info = await stat(archivePath);
                snapshots.push({
                    id,
                    volumeName,
                    archivePath,
                    createdAt: info.mtime,
                    sizeBytes: info.size,
                });
            }
            catch {
                // File disappeared between readdir and stat — skip
            }
        }
        // Sort newest first
        snapshots.sort((a, b) => b.createdAt.getTime() - a.createdAt.getTime());
        return snapshots;
    }

    /**
     * Delete a snapshot archive. Succeeds silently if the archive is already
     * gone (`force: true`).
     * @param {string} snapshotId - ID previously returned by snapshot()
     * @throws {Error} when the ID contains unsafe characters
     */
    async delete(snapshotId) {
        validateSnapshotId(snapshotId);
        const archivePath = join(this.backupDir, `${snapshotId}.tar`);
        await rm(archivePath, { force: true });
        logger.info(`Volume snapshot deleted: ${snapshotId}`);
    }

    /**
     * Delete all snapshot archives older than maxAgeMs (judged by file mtime).
     * @param {number} maxAgeMs - age threshold in milliseconds
     * @returns {Promise<number>} count of archives removed
     */
    async cleanup(maxAgeMs) {
        let files;
        try {
            files = await readdir(this.backupDir);
        }
        catch {
            // Backup dir missing — nothing to clean up.
            return 0;
        }
        const cutoff = Date.now() - maxAgeMs;
        let deleted = 0;
        for (const file of files) {
            if (!file.endsWith(".tar"))
                continue;
            const archivePath = join(this.backupDir, file);
            try {
                const info = await stat(archivePath);
                if (info.mtime.getTime() < cutoff) {
                    await rm(archivePath, { force: true });
                    deleted++;
                }
            }
            catch {
                // File disappeared — skip
            }
        }
        if (deleted > 0) {
            logger.info(`Volume snapshot cleanup: removed ${deleted} old snapshots`);
        }
        return deleted;
    }

    /**
     * Extract volume name from snapshot ID.
     * Snapshot IDs are `${volumeName}-${ISO timestamp with colons/dots replaced}`.
     * The greedy `.+` anchors on the LAST `-YYYY-MM-DDT` occurrence, so a volume
     * name that itself contains a date-like segment still parses correctly.
     * @param {string} snapshotId
     * @returns {string} the volume-name portion of the ID
     * @throws {Error} when no timestamp pattern is found
     */
    extractVolumeName(snapshotId) {
        // Match the timestamp part: -YYYY-MM-DDTHH-MM-SS-MMMZ
        const match = snapshotId.match(/^(.+)-\d{4}-\d{2}-\d{2}T/);
        if (!match) {
            throw new Error(`Cannot extract volume name from snapshot ID: ${snapshotId}`);
        }
        return match[1];
    }
}
@@ -0,0 +1,300 @@
1
+ # Fleet Auto-Update with Rolling Waves
2
+
3
+ **Date:** 2026-03-14
4
+ **Status:** Draft
5
+ **Repos:** platform-core, paperclip, paperclip-platform, paperclip-platform-ui
6
+
7
+ ## Problem
8
+
9
+ Upstream Paperclip changes land nightly via `upstream-sync.mjs`, which rebases our fork and creates a PR. After manual review and merge, `docker-managed.yml` auto-builds and pushes `ghcr.io/wopr-network/paperclip:managed`. But existing running containers never receive the update. New containers get `:managed` on first pull; old containers are stuck on whatever digest they were created with.
10
+
11
+ platform-core has `ImagePoller` and `ContainerUpdater` classes that are fully implemented and tested but **not wired into the application lifecycle**.
12
+
13
+ ## Design
14
+
15
+ ### Pipeline Overview
16
+
17
+ ```
18
+ paperclipai/paperclip (upstream)
19
+ | nightly 06:00 UTC
20
+ upstream-sync.mjs (rebase + hostedMode guards + changelog generation)
21
+ | creates PR
22
+ human reviews & merges PR
23
+ | push to master
24
+ docker-managed.yml (auto-build)
25
+ | pushes ghcr.io/wopr-network/paperclip:managed
26
+ ImagePoller detects new digest
27
+ | groups bots by tenant
28
+ RolloutOrchestrator executes strategy
29
+ | per-bot update sequence
30
+ ContainerUpdater (snapshot + pull + recreate + health check + rollback)
31
+ ```
32
+
33
+ ### Human Gate
34
+
35
+ The only human checkpoint is **reviewing and merging the upstream sync PR**. Everything downstream is automatic.
36
+
37
+ ### 1. Changelog Generation (changes to `paperclip/scripts/upstream-sync.mjs`)
38
+
39
+ After rebase and hostedMode gap scanning, the sync agent generates two changelogs:
40
+
41
+ **Internal changelog** (`changelogs/internal/YYYY-MM-DD.md`):
42
+ - Full developer-facing diff summary
43
+ - What upstream changed, what guards were added, conflicts resolved
44
+ - For PR review purposes
45
+
46
+ **User-facing changelog** (`changelogs/user-facing/YYYY-MM-DD.json`):
47
+ - Structured format: `{ version, date, sections: [{ title: "New" | "Improved" | "Fixed", items: string[] }] }`
48
+ - Filtered through hosted-mode exclusion list — silently drops anything related to: adapters, model selection, thinking effort, runtime/heartbeat config, provider API keys, CLI, deployment modes, infrastructure, self-hosting
49
+ - Same `HOSTED_MODE_CONTEXT` that drives the guard scanner drives the changelog filter
50
+
51
+ Both files are committed in the sync PR. The user-facing JSON is copied into the Docker image during build (add `COPY changelogs/user-facing/ /app/changelogs/` to `Dockerfile.managed`). If the image exists, its changelog exists.
52
+
53
+ **Changelog retrieval:** After pulling a new image (before starting the update sequence), extract the changelog:
54
+
55
+ ```bash
56
+ docker run --rm ghcr.io/wopr-network/paperclip:managed cat /app/changelogs/latest.json
57
+ ```
58
+
59
+ The extracted JSON is stored in the fleet event payload for email and UI consumption. The `latest.json` symlink always points to the most recent changelog file.
60
+
61
+ ### 2. Image Detection (wire existing code in `platform-core`)
62
+
63
+ Changes to `src/fleet/services.ts`:
64
+
65
+ - Add `ImagePoller` and `ContainerUpdater` singletons
66
+ - `initFleet()` starts the poller and wires `poller.onUpdateAvailable` to `RolloutOrchestrator`
67
+ - ImagePoller already handles poll intervals per release channel (canary=5m, staging=15m, stable=30m)
68
+
69
+ ### 3. Rollout Orchestrator (new: `src/fleet/rollout-orchestrator.ts`)
70
+
71
+ GoF Strategy pattern. The orchestrator is the context; strategies are interchangeable.
72
+
73
+ **`IRolloutStrategy` interface:**
74
+
75
+ ```typescript
76
+ interface IRolloutStrategy {
77
+ /** Select next batch from remaining bots */
78
+ nextBatch(remaining: BotProfile[]): BotProfile[];
79
+ /** Milliseconds to wait between waves */
80
+ pauseDuration(): number;
81
+ /** What to do when a single bot update fails */
82
+ onBotFailure(botId: string, error: Error, attempt: number): "abort" | "skip" | "retry";
83
+ /** Max retries per bot before skip/abort */
84
+ maxRetries(): number;
85
+ /** Health check timeout per bot (ms) */
86
+ healthCheckTimeout(): number;
87
+ }
88
+ ```
89
+
90
+ **Concrete strategies:**
91
+
92
+ | Strategy | Batch | Pause | Failure | Use Case |
93
+ |----------|-------|-------|---------|----------|
94
+ | `RollingWaveStrategy` | configurable % | configurable | abort on N+ failures | Default for auto-update |
95
+ | `SingleBotStrategy` | 1 bot | N/A | report | Manual per-bot update button |
96
+ | `ImmediateStrategy` | all | 0 | skip | Emergency hotfix |
97
+
98
+ Strategy selection is **admin-controlled only** — users never see this.
99
+
100
+ **Orchestrator flow:**
101
+
102
+ ```
103
+ 1. Group update-eligible bots by tenant
104
+ 2. For each tenant:
105
+ a. Check tenant update mode (auto/manual)
106
+ b. If manual: mark bots as "update available", send notification, stop
107
+ c. If auto: check if current time is within tenant's preferred window
108
+ d. Select strategy (from admin config)
109
+ e. Execute waves:
110
+ - batch = strategy.nextBatch(remaining)
111
+ - for each bot in batch: ContainerUpdater.updateBot()
112
+ - if any failure: strategy.onBotFailure() → abort/skip/retry
113
+ - sleep(strategy.pauseDuration())
114
+ - repeat until remaining is empty
115
+ 3. Send notification emails with changelog
116
+ ```
117
+
118
+ ### 4. Update Sequence Per Bot (major rework of `ContainerUpdater`)
119
+
120
+ Nuclear rollback — image AND volumes roll back together.
121
+
122
+ **Volume Snapshot Mechanism (new: `VolumeSnapshotManager`):**
123
+
124
+ The existing `SnapshotManager` operates on filesystem paths, not Docker named volumes. A new `VolumeSnapshotManager` is needed that snapshots Docker named volumes using a temporary container:
125
+
126
+ ```bash
127
+ # Snapshot a named volume to a tar archive:
128
+ docker run --rm -v <volume-name>:/source -v <backup-dir>:/backup alpine \
129
+ tar cf /backup/<volume-name>-<timestamp>.tar -C /source .
130
+
131
+ # Restore a named volume from a tar archive:
132
+ docker run --rm -v <volume-name>:/target -v <backup-dir>:/backup alpine \
133
+ sh -c "cd /target && rm -rf ./* ./.[!.]* ./..?* && tar xf /backup/<volume-name>-<timestamp>.tar -C /target"
134
+ ```
135
+
136
+ This is a new class (`src/fleet/volume-snapshot-manager.ts`), not a modification of the existing `SnapshotManager`.
137
+
138
+ **Update sequence:**
139
+
140
+ ```
141
+ 1. Snapshot /data and /paperclip volumes (via VolumeSnapshotManager)
142
+ 2. Record previous image digest (already implemented)
143
+ 3. Pull new image
144
+ 4. Stop container
145
+ 5. Recreate container with new image (named volumes remount automatically)
146
+ 6. Start container (PAPERCLIP_MIGRATION_AUTO_APPLY=true runs Drizzle migrations on boot)
147
+ 7. Health check: HTTP GET http://container:3100/health, expect {"status":"ok"}
148
+ - Timeout: 120s (increased from current 60s to allow for Drizzle migration time)
149
+ - Poll interval: 5s
150
+ 8a. HEALTHY:
151
+ - Delete volume snapshots
152
+ - Emit fleet event: bot.updated
153
+ - Record new digest
154
+ 8b. UNHEALTHY:
155
+ - Stop container
156
+ - Restore volume snapshots from step 1 (via VolumeSnapshotManager)
157
+ - Recreate container with OLD image (digest-pinned to prevent re-pulling new)
158
+ - Start container
159
+ - Verify old container is healthy
160
+ - Emit fleet event: bot.update_failed
161
+ - Report to orchestrator (abort/skip/retry per strategy)
162
+ ```
163
+
164
+ **Health check upgrade:** Replace `node -e 'process.exit(0)'` in `createContainer()` with:
165
+
166
+ ```typescript
167
+ Healthcheck: {
168
+ // Use node+fetch instead of curl — Paperclip's base image (node:lts-trixie-slim)
169
+ // may not have curl installed.
170
+ Test: ["CMD-SHELL", "node -e \"fetch('http://localhost:3100/health').then(r=>process.exit(r.ok?0:1)).catch(()=>process.exit(1))\""],
171
+ Interval: 30_000_000_000,
172
+ Timeout: 10_000_000_000,
173
+ Retries: 3,
174
+ StartPeriod: 60_000_000_000, // 60s for Drizzle migrations on boot
175
+ }
176
+ ```
177
+
178
+ **Note:** `HEALTH_CHECK_TIMEOUT_MS` in `ContainerUpdater` must be increased from 60,000 to 120,000 to match the spec's 120s timeout.
179
+
180
+ ### 5. Tenant Update Config
181
+
182
+ Stored per-tenant (moves to per-org when org support ships — see `2026-03-14-paperclip-org-integration-design.md`).
183
+
184
+ ```typescript
185
+ interface TenantUpdateConfig {
186
+ /** "auto" = rolling wave in preferred window; "manual" = badge + button */
187
+ mode: "auto" | "manual";
188
+ /** Hour of day (UTC) for auto-update window. Only used when mode=auto. */
189
+ preferredHourUtc: number; // 0-23, default 3
190
+ }
191
+ ```
192
+
193
+ Default for new tenants: `{ mode: "manual", preferredHourUtc: 3 }`.
194
+
195
+ **Repository interface** (follows the `IFooRepository` pattern used throughout platform-core):
196
+
197
+ ```typescript
198
+ export interface ITenantUpdateConfigRepository {
199
+ get(tenantId: string): Promise<TenantUpdateConfig | null>;
200
+ upsert(tenantId: string, config: TenantUpdateConfig): Promise<void>;
201
+ listAutoEnabled(): Promise<Array<{ tenantId: string; config: TenantUpdateConfig }>>;
202
+ }
203
+ ```
204
+
205
+ `DrizzleTenantUpdateConfigRepository` implements this against a `tenant_update_configs` table with columns `(tenant_id TEXT PK, mode TEXT, preferred_hour_utc INTEGER, updated_at BIGINT)`.
206
+
207
+ **Audit logging:** All config changes (mode switch, hour change) are logged via `logger.info("Tenant update config changed", { tenantId, oldConfig, newConfig, actorUserId })`. Admin-triggered updates via the `/admin/updates` route include the actor in the log entry.
208
+
209
+ Admin panel can override per-tenant or set global defaults.
210
+
211
+ **Precedence: tenant config overrides per-bot `updatePolicy`.** The existing `BotProfile.updatePolicy` field (per-bot: `on-push`, `nightly`, `manual`, `cron:*`) is superseded by `TenantUpdateConfig` for hosted deployments. The `RolloutOrchestrator` reads tenant config, not bot-level policy. `ImagePoller.shouldAutoUpdate()` is refactored to always return `false` — the poller's only job is to detect new digests and notify the orchestrator, which makes the auto/manual decision based on tenant config.
212
+
213
+ `ImagePoller.isNightlyWindow()` (hardcoded 03:00-03:30 UTC) is superseded by the orchestrator's per-tenant `preferredHourUtc` window check. The poller's nightly logic becomes a no-op.
214
+
215
+ Per-bot `updatePolicy` is preserved in the schema for self-hosted (non-platform) deployments where there is no tenant config.
216
+
217
+ ### 6. Admin Controls
218
+
219
+ Admin panel (platform-core admin routes, not user-facing):
220
+
221
+ - **Global update mode**: auto / manual / paused (pause halts all rollouts fleet-wide)
222
+ - **Strategy config**: batch %, pause duration, failure threshold
223
+ - **Default update window**: hour UTC
224
+ - **Per-tenant overrides**: mode, window
225
+ - **Manual triggers**: "roll out now" for a specific image digest
226
+ - **Rollout status dashboard**: which bots updated, which failed, which pending
227
+
228
+ ### 7. User-Facing Experience
229
+
230
+ **Auto mode (tenant doesn't know or care):**
231
+ - Updates happen silently during configured window
232
+ - Email after: "Your Paperclip was updated. Here's what's new: [changelog]"
233
+ - Brief downtime during container restart (seconds)
234
+
235
+ **Manual mode:**
236
+ - Email when update available: "A new update is available for your Paperclip. [changelog]"
237
+ - In-app: badge on bot in UI indicating update available
238
+ - Click "Update" → modal shows user-facing changelog with "Update Now" / "Later" buttons
239
+ - "Update Now" triggers `SingleBotStrategy` immediately
240
+ - Email after: "Your Paperclip was updated. Here's what's new: [changelog]"
241
+
242
+ **Both modes:**
243
+ - Admin email on rollback failure
244
+ - Fleet event log for audit
245
+
246
+ ### 8. Image Allowlist
247
+
248
+ `FLEET_IMAGE_ALLOWLIST` already allows `ghcr.io/wopr-network/` — covers both WOPR and Paperclip images. Future brands add their prefix.
249
+
250
+ ## Files to Create/Modify
251
+
252
+ ### platform-core
253
+
254
+ | File | Action | Description |
255
+ |------|--------|-------------|
256
+ | `src/fleet/rollout-orchestrator.ts` | Create | Strategy pattern orchestrator |
257
+ | `src/fleet/rollout-strategies.ts` | Create | RollingWave, SingleBot, Immediate strategies |
258
+ | `src/fleet/services.ts` | Modify | Wire ImagePoller + ContainerUpdater + RolloutOrchestrator into initFleet() |
259
+ | `src/fleet/updater.ts` | Major rework | Add volume snapshot/restore lifecycle, replace FleetManager delegation with direct Docker operations for atomic update, upgrade health check from Docker HEALTHCHECK polling to HTTP GET, increase timeout from 60s to 120s |
260
+ | `src/fleet/volume-snapshot-manager.ts` | Create | Snapshot and restore Docker named volumes using temporary alpine containers |
261
+ | `src/fleet/fleet-manager.ts` | Modify | Upgrade HEALTHCHECK in createContainer() to use node+fetch instead of node -e |
262
+ | `src/fleet/image-poller.ts` | Modify | Wire onUpdateAvailable to orchestrator instead of direct updater |
263
+ | `src/db/schema/tenant-update-config.ts` | Create | Drizzle schema for tenant update preferences |
264
+ | `src/api/routes/admin-updates.ts` | Create | Admin API for update management |
265
+ | `src/fleet/update-notifier.ts` | Create | Email notifications for updates |
266
+
267
+ ### paperclip
268
+
269
+ | File | Action | Description |
270
+ |------|--------|-------------|
271
+ | `scripts/upstream-sync.mjs` | Modify | Add changelog generation step |
272
+ | `Dockerfile.managed` | Modify | COPY changelogs into image |
273
+ | `changelogs/` | Create | Directory for generated changelogs |
274
+
275
+ ### paperclip-platform-ui
276
+
277
+ | File | Action | Description |
278
+ |------|--------|-------------|
279
+ | Update modal component | Create | Shows changelog, "Update Now" / "Later" |
280
+ | Bot card badge | Modify | Show "Update Available" indicator |
281
+
282
+ ## Dependencies
283
+
284
+ - **Implementation work required:**
285
+ - `ImagePoller` and `ContainerUpdater` classes exist and are tested, but have no singleton getters in `services.ts` and are not imported or wired. Docker instance injection needs to be plumbed through.
286
+ - `ContainerUpdater` needs significant enhancement: volume snapshot/restore integration with `SnapshotManager`, HTTP-based health checks (replacing `node -e`), increased timeout from 60s to 120s for migration time.
287
+ - `RolloutOrchestrator` and strategies are entirely new code.
288
+ - `SnapshotManager` exists in `src/backup/` but has no integration with `ContainerUpdater`.
289
+ - **Future:** Org support (see `2026-03-14-paperclip-org-integration-design.md`) — update config moves from tenant to org level after org integration ships
290
+ - **Future:** Cron policy implementation in ImagePoller (currently stubbed)
291
+
292
+ ## Risks
293
+
294
+ | Risk | Mitigation |
295
+ |------|------------|
296
+ | Bad upstream migration corrupts data | Nuclear rollback: volume snapshot restored alongside image rollback |
297
+ | Upstream pushes breaking change | Human gate at sync PR review catches this before any image is built |
298
+ | Rolling wave takes too long | ImmediateStrategy available for emergency hotfixes |
299
+ | Health check passes but app is subtly broken | `/health` endpoint queries DB, so migration failures surface. Consider adding deeper health checks later. |
300
+ | Volume snapshots consume disk | Snapshots deleted after successful update. Failed rollbacks alert admin for manual cleanup. |