@checkstack/satellite-backend 0.3.6 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +204 -0
- package/drizzle/0001_tiresome_terror.sql +3 -0
- package/drizzle/0002_graceful_mac_gargan.sql +2 -0
- package/drizzle/meta/0001_snapshot.json +102 -0
- package/drizzle/meta/0002_snapshot.json +89 -0
- package/drizzle/meta/_journal.json +14 -0
- package/package.json +22 -13
- package/src/automations.ts +107 -0
- package/src/entity.test.ts +313 -0
- package/src/entity.ts +221 -0
- package/src/heartbeat-monitor.it.test.ts +232 -0
- package/src/heartbeat-monitor.test.ts +156 -83
- package/src/heartbeat-monitor.ts +106 -35
- package/src/hooks.ts +9 -2
- package/src/index.ts +180 -0
- package/src/run-secret-resolver.test.ts +121 -0
- package/src/run-secret-resolver.ts +66 -0
- package/src/satellite-ws-handler.test.ts +267 -0
- package/src/satellite-ws-handler.ts +266 -6
- package/src/schema.ts +22 -1
- package/src/service.test.ts +274 -0
- package/src/service.ts +133 -15
- package/src/status.ts +18 -0
- package/tsconfig.json +18 -0
package/src/heartbeat-monitor.ts
CHANGED
|
@@ -1,70 +1,141 @@
|
|
|
1
1
|
import type { Logger } from "@checkstack/backend-api";
|
|
2
2
|
import type { SignalService } from "@checkstack/signal-common";
|
|
3
3
|
import type { SatelliteService } from "./service";
|
|
4
|
-
import {
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
} from "@checkstack/satellite-common";
|
|
4
|
+
import type { SatelliteConnectionEvent } from "./entity";
|
|
5
|
+
import { computeStatus } from "./status";
|
|
6
|
+
import { SATELLITE_STATUS_CHANGED } from "@checkstack/satellite-common";
|
|
8
7
|
|
|
9
8
|
/**
 * Hook through which the heartbeat monitor pushes the heartbeat-lost
 * (`online` → `offline`) edge into the reactive `satellite-connection` entity
 * (reactive automation engine §10.6). It is bound from `afterPluginsReady`;
 * when no sink is supplied, no entity state is mirrored.
 *
 * Only `lastConnectionEvent` is flipped, to `"heartbeat_lost"`. The already
 * aged `lastHeartbeatAt` is deliberately left alone, because it is the value
 * that made the computed status `offline` to begin with. The change-deriver
 * then re-fires `satellite.heartbeat_lost`. The reverse edge (offline→online)
 * is mirrored as `connected` by the WS handler on reconnect, so the monitor
 * never touches it.
 */
export interface SatelliteHeartbeatEntitySink {
  /** Mirror the heartbeat-lost edge for the satellite with the given id. */
  mirror(satelliteId: string): Promise<void>;
}
|
|
23
|
+
|
|
24
|
+
/**
 * Monitors satellite heartbeats and detects the online→offline transition from
 * DURABLE state alone — no pod-local baseline.
 *
 * ## Horizontal-scale correctness
 *
 * The heartbeat-check job runs under ONE consumer group claimed by a VARYING
 * pod. A process-local "previous status" map is therefore wrong: a pod with an
 * empty map never sees the online→offline edge, so `connectionStatus` could get
 * stuck `online` forever after a pod crash. This monitor instead reads every
 * satellite's durable `(lastHeartbeatAt, lastConnectionEvent)`, computes status
 * via {@link computeStatus}, and detects the heartbeat-lost edge purely from
 * durable state:
 *
 *   computed status is `offline` AND `lastConnectionEvent === "connected"`
 *     ⇒ this satellite just lost its heartbeat (it was last marked connected,
 *       but its heartbeat has now aged past the offline threshold).
 *
 * The mutate that flips `lastConnectionEvent` to `"heartbeat_lost"` is
 * IDEMPOTENT across pods and redelivery: once it is `"heartbeat_lost"`, the
 * predicate above is false, so re-runs (on any pod) are no-ops. Any pod can
 * therefore drive the edge correctly, regardless of which pod (if any) ever
 * observed the satellite online in memory.
 *
 * ## Failure isolation
 *
 * The status-change broadcast is best-effort and the entity mirror is the
 * durable edge. A failure of either is logged and never aborts the sweep, so
 * one bad satellite cannot block the mirror for the others.
 */
export class HeartbeatMonitor {
  /**
   * Pod-local broadcast-dedup ONLY (never the source of truth). The durable
   * `lastConnectionEvent` flip is what makes detection idempotent; this set
   * merely avoids re-broadcasting the same status-change signal from this pod
   * on back-to-back checks. A fresh pod with an empty set still detects +
   * mirrors the edge from durable state — it just also broadcasts once.
   */
  private broadcastedOffline = new Set<string>();

  /**
   * @param service       Source of the durable liveness rows.
   * @param signalService Used to broadcast `SATELLITE_STATUS_CHANGED`.
   * @param logger        Receives the lost-heartbeat info line and any errors.
   * @param entitySink    Optional plug-point that mirrors the heartbeat-lost
   *                      edge into the `satellite-connection` entity; when
   *                      omitted, no entity state is mirrored.
   */
  constructor(
    private service: SatelliteService,
    private signalService: SignalService,
    private logger: Logger,
    private entitySink?: SatelliteHeartbeatEntitySink,
  ) {}

  /**
   * Check all satellites and drive the heartbeat-lost edge for any that have
   * aged out while still marked connected. Called periodically by a recurring
   * queue job; safe to run on any pod and to redeliver.
   *
   * Broadcast and mirror failures are logged per satellite and do not reject
   * the returned promise; only a failure to list the liveness rows does.
   */
  async checkHeartbeats(): Promise<void> {
    const rows = await this.service.listConnectionLiveness();
    const liveIds = new Set(rows.map((r) => r.id));

    for (const row of rows) {
      const status = computeStatus(row.lastHeartbeatAt);
      const lostHeartbeat = this.hasLostHeartbeat({
        status,
        lastConnectionEvent: row.lastConnectionEvent,
      });

      if (!lostHeartbeat) {
        // Still online (or already past the lost edge / never connected):
        // nothing to detect. Clear the broadcast-dedup marker once a satellite
        // is online again so a future lost edge re-broadcasts.
        if (status === "online") this.broadcastedOffline.delete(row.id);
        continue;
      }

      // Durable heartbeat-lost edge: computed offline while still marked
      // `connected`. Detected from durable state, so this fires correctly from
      // ANY pod with no prior in-memory knowledge of the satellite.
      this.logger.info(
        `Satellite ${row.name} (${row.region}) lost heartbeat (online → offline)`,
      );

      // Broadcast the status-change signal once per offline edge from this pod.
      if (!this.broadcastedOffline.has(row.id)) {
        // Mark BEFORE awaiting so an overlapping check on this pod does not
        // broadcast the same edge twice; roll the marker back on failure so
        // the signal is retried instead of being silently lost.
        this.broadcastedOffline.add(row.id);
        try {
          await this.signalService.broadcast(SATELLITE_STATUS_CHANGED, {
            satelliteId: row.id,
            status: "offline",
            name: row.name,
            region: row.region,
          });
        } catch (error) {
          this.broadcastedOffline.delete(row.id);
          // Best-effort signal: log and carry on so the durable entity mirror
          // below (and the remaining satellites) are still processed.
          this.logger.error(
            `Failed to broadcast satellite status change (offline) for ${row.name}:`,
            error,
          );
        }
      }

      // Drive the entity edge. The mutate is idempotent: it flips
      // `lastConnectionEvent` to `"heartbeat_lost"`, after which this branch is
      // never re-entered for the same satellite (re-runs are no-ops).
      if (this.entitySink) {
        try {
          await this.entitySink.mirror(row.id);
        } catch (error) {
          this.logger.error(
            `Failed to mirror satellite-connection (heartbeat_lost) for ${row.name}:`,
            error,
          );
        }
      }
    }

    // Drop broadcast-dedup markers for satellites that no longer exist.
    for (const id of this.broadcastedOffline) {
      if (!liveIds.has(id)) this.broadcastedOffline.delete(id);
    }
  }

  /**
   * Pure predicate: a satellite has just lost its heartbeat when its computed
   * status is `offline` but its last recorded lifecycle edge still says it was
   * `connected`. Once the edge is mirrored (`lastConnectionEvent` becomes
   * `"heartbeat_lost"`), this returns false — the idempotency guarantee.
   */
  private hasLostHeartbeat(props: {
    status: "online" | "offline";
    lastConnectionEvent: SatelliteConnectionEvent | null;
  }): boolean {
    return props.status === "offline" && props.lastConnectionEvent === "connected";
  }
}
|
package/src/hooks.ts
CHANGED
|
@@ -2,8 +2,15 @@ import { createHook } from "@checkstack/backend-api";
|
|
|
2
2
|
|
|
3
3
|
/**
|
|
4
4
|
* Satellite hooks for cross-plugin communication.
|
|
5
|
-
*
|
|
6
|
-
*
|
|
5
|
+
*
|
|
6
|
+
* The connection-lifecycle hooks (`satellite.connected` / `.disconnected` /
|
|
7
|
+
* `.heartbeat_lost`) were removed in Phase 4 (reactive automation engine
|
|
8
|
+
* §10.6): satellite connection state is now the reactive
|
|
9
|
+
* `satellite-connection` entity (see `./entity.ts`), and the equivalent
|
|
10
|
+
* trigger events are derived from its changes.
|
|
11
|
+
*
|
|
12
|
+
* `satellite.removed` stays — it is a deletion/cleanup signal (consumed by
|
|
13
|
+
* healthcheck-backend to scrub the satellite's id), not entity state.
|
|
7
14
|
*/
|
|
8
15
|
export const satelliteHooks = {
|
|
9
16
|
/**
|
package/src/index.ts
CHANGED
|
@@ -9,6 +9,10 @@ import {
|
|
|
9
9
|
} from "@checkstack/satellite-common";
|
|
10
10
|
import { HealthCheckApi } from "@checkstack/healthcheck-common";
|
|
11
11
|
import { healthCheckHooks } from "@checkstack/healthcheck-backend";
|
|
12
|
+
import { ScriptPackagesApi } from "@checkstack/script-packages-common";
|
|
13
|
+
import { scriptPackagesChangedHook } from "@checkstack/script-packages-backend";
|
|
14
|
+
import { secretResolverRef } from "@checkstack/secrets-backend";
|
|
15
|
+
import { resolveSatelliteRunSecrets } from "./run-secret-resolver";
|
|
12
16
|
import { SatelliteService } from "./service";
|
|
13
17
|
import { createSatelliteRouter } from "./router";
|
|
14
18
|
import { HeartbeatMonitor } from "./heartbeat-monitor";
|
|
@@ -16,6 +20,21 @@ import { SatelliteWsHandler } from "./satellite-ws-handler";
|
|
|
16
20
|
import { ConfigRelay } from "./config-relay";
|
|
17
21
|
import { entityKindExtensionPoint } from "@checkstack/gitops-backend";
|
|
18
22
|
import { registerSatelliteGitOpsKinds } from "./satellite-gitops-kinds";
|
|
23
|
+
import {
|
|
24
|
+
automationTriggerExtensionPoint,
|
|
25
|
+
entityExtensionPoint,
|
|
26
|
+
withEntityWrite,
|
|
27
|
+
type EntityHandle,
|
|
28
|
+
} from "@checkstack/automation-backend";
|
|
29
|
+
import {
|
|
30
|
+
SATELLITE_CONNECTION_ENTITY_KIND,
|
|
31
|
+
createSatelliteConnectionRead,
|
|
32
|
+
deriveSatelliteConnectionEvents,
|
|
33
|
+
satelliteChangeToPayload,
|
|
34
|
+
satelliteConnectionStateSchema,
|
|
35
|
+
type SatelliteConnectionState,
|
|
36
|
+
} from "./entity";
|
|
37
|
+
import { satelliteTriggers } from "./automations";
|
|
19
38
|
|
|
20
39
|
// Queue and job constants
|
|
21
40
|
const HEARTBEAT_QUEUE = "satellite-heartbeat";
|
|
@@ -27,6 +46,50 @@ export default createBackendPlugin({
|
|
|
27
46
|
register(env) {
|
|
28
47
|
env.registerAccessRules(satelliteAccessRules);
|
|
29
48
|
|
|
49
|
+
// ─── Automation Platform: reactive connection entity ─────────────
|
|
50
|
+
// Satellite connection state is the `satellite-connection` entity
|
|
51
|
+
// (reactive automation engine §10.6, §9.1), PLUGIN-BACKED (Model B) and
|
|
52
|
+
// COMPUTE-ON-READ: its `status` is DERIVED on read from the DURABLE, shared
|
|
53
|
+
// `satellites.lastHeartbeatAt` column (the single liveness source of truth,
|
|
54
|
+
// same as the admin list), and `lastConnectionEvent` is the only extra
|
|
55
|
+
// durable column (the deriver's event discriminator). There is NO stored
|
|
56
|
+
// status copy and NO framework `entity_state` mirror, so EVERY pod computes
|
|
57
|
+
// the same state AND a stale row self-heals to offline once the heartbeat
|
|
58
|
+
// ages out (this fixes the horizontal-scaling bug twice: the old in-memory
|
|
59
|
+
// map made pod A's satellite invisible to pod B, and the prior fix's stored
|
|
60
|
+
// status got stuck `online` after a pod crash because the heartbeat-lost
|
|
61
|
+
// EDGE was detected pod-locally). The three lifecycle sites (connect /
|
|
62
|
+
// disconnect / heartbeat-lost) write the liveness inputs through
|
|
63
|
+
// `handle.mutate`, and the framework records full transition HISTORY in
|
|
64
|
+
// `entity_transitions`.
|
|
65
|
+
//
|
|
66
|
+
// The `satellite.connected` / `.disconnected` / `.heartbeat_lost` trigger
|
|
67
|
+
// events are DERIVED from its changes (no hook-backed triggers). The
|
|
68
|
+
// ENTITY-DRIVEN triggers below stay registered so they remain in the
|
|
69
|
+
// editor's trigger catalog + payload-introspectable, and a `toPayload`
|
|
70
|
+
// mapper makes the runtime `trigger.payload` match their `payloadSchema`
|
|
71
|
+
// (mirroring incident / catalog / dependency / healthcheck).
|
|
72
|
+
const automationTriggers = env.getExtensionPoint(
|
|
73
|
+
automationTriggerExtensionPoint,
|
|
74
|
+
);
|
|
75
|
+
for (const trigger of satelliteTriggers) {
|
|
76
|
+
automationTriggers.registerTrigger(trigger, pluginMetadata);
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
const entity = env.getExtensionPoint(entityExtensionPoint);
|
|
80
|
+
entity.registerChangeDeriver({
|
|
81
|
+
kind: SATELLITE_CONNECTION_ENTITY_KIND,
|
|
82
|
+
derive: deriveSatelliteConnectionEvents,
|
|
83
|
+
toPayload: satelliteChangeToPayload,
|
|
84
|
+
});
|
|
85
|
+
entity.declareNonReactiveState({
|
|
86
|
+
table: "satellites",
|
|
87
|
+
reason: "bookkeeping",
|
|
88
|
+
note: "lastHeartbeatAt is the raw liveness timestamp; the satellite-connection entity's reactive status is computed from it on read.",
|
|
89
|
+
});
|
|
90
|
+
// Created once in init; reused by the WS handler + heartbeat monitor.
|
|
91
|
+
let satelliteEntityHandle: EntityHandle<SatelliteConnectionState>;
|
|
92
|
+
|
|
30
93
|
// ─── GitOps Entity Kind Registration ─────────────────────────────
|
|
31
94
|
let gitopsService: SatelliteService | undefined;
|
|
32
95
|
const kindRegistry = env.getExtensionPoint(entityKindExtensionPoint);
|
|
@@ -47,6 +110,7 @@ export default createBackendPlugin({
|
|
|
47
110
|
signalService: coreServices.signalService,
|
|
48
111
|
queueManager: coreServices.queueManager,
|
|
49
112
|
wsRegistry: coreServices.wsRegistry,
|
|
113
|
+
secretResolver: secretResolverRef,
|
|
50
114
|
},
|
|
51
115
|
init: async ({ logger, database, rpc, signalService }) => {
|
|
52
116
|
logger.debug("🛰️ Initializing Satellite Backend...");
|
|
@@ -56,6 +120,20 @@ export default createBackendPlugin({
|
|
|
56
120
|
);
|
|
57
121
|
gitopsService = service;
|
|
58
122
|
|
|
123
|
+
// Declare the reactive `satellite-connection` entity once. PLUGIN-
|
|
124
|
+
// BACKED, COMPUTE-ON-READ: `read` computes status from the durable
|
|
125
|
+
// `satellites.lastHeartbeatAt` (+ reads `lastConnectionEvent`) via the
|
|
126
|
+
// service (the source of truth — no stored status copy, no
|
|
127
|
+
// `entity_state` mirror, globally consistent from any pod). The handle
|
|
128
|
+
// is the only typed path that drives connection-state changes (reactive
|
|
129
|
+
// automation engine §4.2); it is reused by the WS handler + heartbeat
|
|
130
|
+
// monitor wired in afterPluginsReady.
|
|
131
|
+
satelliteEntityHandle = entity.defineEntity({
|
|
132
|
+
kind: SATELLITE_CONNECTION_ENTITY_KIND,
|
|
133
|
+
state: satelliteConnectionStateSchema,
|
|
134
|
+
read: createSatelliteConnectionRead(service),
|
|
135
|
+
});
|
|
136
|
+
|
|
59
137
|
const router = createSatelliteRouter({
|
|
60
138
|
service,
|
|
61
139
|
signalService,
|
|
@@ -72,6 +150,7 @@ export default createBackendPlugin({
|
|
|
72
150
|
signalService,
|
|
73
151
|
wsRegistry,
|
|
74
152
|
rpcClient,
|
|
153
|
+
secretResolver,
|
|
75
154
|
onHook,
|
|
76
155
|
}) => {
|
|
77
156
|
const service = new SatelliteService(
|
|
@@ -112,6 +191,71 @@ export default createBackendPlugin({
|
|
|
112
191
|
},
|
|
113
192
|
},
|
|
114
193
|
logger,
|
|
194
|
+
{
|
|
195
|
+
// Drive connect/disconnect through `handle.mutate` (Model B):
|
|
196
|
+
// `apply` UPDATEs the satellite row's durable liveness columns
|
|
197
|
+
// (`lastHeartbeatAt` + `lastConnectionEvent`) — the globally-
|
|
198
|
+
// readable source of truth — and returns the view (status COMPUTED
|
|
199
|
+
// from `lastHeartbeatAt`). The framework snapshots `prev` via
|
|
200
|
+
// `read`, records the transition (durable history), and emits the
|
|
201
|
+
// change; the deriver re-fires the equivalent trigger events.
|
|
202
|
+
mirror: async ({ satelliteId, lastEvent, lastHeartbeatAt }) => {
|
|
203
|
+
await withEntityWrite({
|
|
204
|
+
handle: satelliteEntityHandle,
|
|
205
|
+
id: satelliteId,
|
|
206
|
+
apply: () =>
|
|
207
|
+
service.applyConnectionState({
|
|
208
|
+
satelliteId,
|
|
209
|
+
lastEvent,
|
|
210
|
+
lastHeartbeatAt,
|
|
211
|
+
}),
|
|
212
|
+
});
|
|
213
|
+
},
|
|
214
|
+
},
|
|
215
|
+
{
|
|
216
|
+
// Script-package distribution: carry the desired lockfile hash in
|
|
217
|
+
// assignment payloads + persist per-satellite reconcile state.
|
|
218
|
+
// Satellites pull blobs from CORE (getManifest/downloadBlob),
|
|
219
|
+
// never the registry.
|
|
220
|
+
getDesiredLockfileHash: async () => {
|
|
221
|
+
const spClient = rpcClient.forPlugin(ScriptPackagesApi);
|
|
222
|
+
const state = await spClient.getInstallState();
|
|
223
|
+
return state.lockfileHash;
|
|
224
|
+
},
|
|
225
|
+
reportSyncState: async (input) => {
|
|
226
|
+
const spClient = rpcClient.forPlugin(ScriptPackagesApi);
|
|
227
|
+
await spClient.reportSatelliteSyncState(input);
|
|
228
|
+
},
|
|
229
|
+
getManifest: async ({ lockfileHash }) => {
|
|
230
|
+
const spClient = rpcClient.forPlugin(ScriptPackagesApi);
|
|
231
|
+
const res = await spClient.getManifest({ lockfileHash });
|
|
232
|
+
return res.entries;
|
|
233
|
+
},
|
|
234
|
+
getBlobBase64: async ({ integrity }) => {
|
|
235
|
+
const spClient = rpcClient.forPlugin(ScriptPackagesApi);
|
|
236
|
+
try {
|
|
237
|
+
const res = await spClient.downloadBlob({ integrity });
|
|
238
|
+
return res.data;
|
|
239
|
+
} catch {
|
|
240
|
+
return null;
|
|
241
|
+
}
|
|
242
|
+
},
|
|
243
|
+
},
|
|
244
|
+
{
|
|
245
|
+
// JIT secret delivery: resolve a collector's declared secretEnv
|
|
246
|
+
// (read from the satellite's own assignment) via the central
|
|
247
|
+
// resolver. Values are returned over the WS channel per-run and
|
|
248
|
+
// never persisted.
|
|
249
|
+
resolveRunSecrets: async ({ satelliteId, configId, collectorId }) =>
|
|
250
|
+
resolveSatelliteRunSecrets({
|
|
251
|
+
satelliteId,
|
|
252
|
+
configId,
|
|
253
|
+
collectorId,
|
|
254
|
+
getAssignmentsForSatellite: (id) =>
|
|
255
|
+
configRelay.getAssignmentsForSatellite(id),
|
|
256
|
+
resolver: secretResolver,
|
|
257
|
+
}),
|
|
258
|
+
},
|
|
115
259
|
);
|
|
116
260
|
|
|
117
261
|
// Register satellite WebSocket endpoint via the scoped WS registry
|
|
@@ -124,6 +268,30 @@ export default createBackendPlugin({
|
|
|
124
268
|
service,
|
|
125
269
|
signalService,
|
|
126
270
|
logger,
|
|
271
|
+
{
|
|
272
|
+
// Drive the online → offline (heartbeat-lost) edge through
|
|
273
|
+
// `handle.mutate`. `apply` flips ONLY `lastConnectionEvent` to
|
|
274
|
+
// `"heartbeat_lost"` (the aged `lastHeartbeatAt` is left untouched —
|
|
275
|
+
// it is what made the computed status `offline`). The framework
|
|
276
|
+
// records the transition (durable history) and the deriver re-fires
|
|
277
|
+
// `satellite.heartbeat_lost`. The mutate is idempotent: once
|
|
278
|
+
// `lastConnectionEvent === "heartbeat_lost"`, the monitor's
|
|
279
|
+
// predicate is false and re-runs (on any pod) are no-ops. This is
|
|
280
|
+
// the durable, any-pod offline-on-timeout backstop: a pod that dies
|
|
281
|
+
// without flipping its satellites to offline leaves a stale state
|
|
282
|
+
// only until ANY pod's monitor observes the heartbeat timeout.
|
|
283
|
+
mirror: async (satelliteId) => {
|
|
284
|
+
await withEntityWrite({
|
|
285
|
+
handle: satelliteEntityHandle,
|
|
286
|
+
id: satelliteId,
|
|
287
|
+
apply: () =>
|
|
288
|
+
service.applyConnectionState({
|
|
289
|
+
satelliteId,
|
|
290
|
+
lastEvent: "heartbeat_lost",
|
|
291
|
+
}),
|
|
292
|
+
});
|
|
293
|
+
},
|
|
294
|
+
},
|
|
127
295
|
);
|
|
128
296
|
|
|
129
297
|
const queue = queueManager.getQueue<Record<string, never>>(
|
|
@@ -163,6 +331,18 @@ export default createBackendPlugin({
|
|
|
163
331
|
},
|
|
164
332
|
);
|
|
165
333
|
|
|
334
|
+
// Fan the script-packages.changed broadcast out to THIS instance's
|
|
335
|
+
// connected satellites. Every core instance subscribes in broadcast
|
|
336
|
+
// mode, so each pushes to its own satellites; offline satellites
|
|
337
|
+
// converge via the assignment-carried lockfile hash on reconnect.
|
|
338
|
+
onHook(
|
|
339
|
+
scriptPackagesChangedHook,
|
|
340
|
+
async ({ lockfileHash }) => {
|
|
341
|
+
wsHandler.pushRefreshScriptPackagesToAll(lockfileHash);
|
|
342
|
+
},
|
|
343
|
+
{ mode: "broadcast" },
|
|
344
|
+
);
|
|
345
|
+
|
|
166
346
|
logger.debug("✅ Satellite Backend afterPluginsReady complete.");
|
|
167
347
|
},
|
|
168
348
|
});
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
import { describe, it, expect } from "bun:test";
|
|
2
|
+
import type { SatelliteAssignment } from "@checkstack/satellite-common";
|
|
3
|
+
import type { SecretResolverService } from "@checkstack/secrets-backend";
|
|
4
|
+
import { resolveSatelliteRunSecrets } from "./run-secret-resolver";
|
|
5
|
+
|
|
6
|
+
// Test double for the secret resolver, backed by a fixed name->value map. Its
// resolveForRun mirrors the real one: each declared env entry has every
// ${{ secrets.NAME }} placeholder substituted with the mapped value.
function fakeResolver(values: Record<string, string>): SecretResolverService {
  const placeholder = /\$\{\{\s*secrets\.([a-zA-Z0-9_-]+)\s*\}\}/g;

  // Single lookup path so every entry point fails the same way on a miss.
  const lookup = (name: string): string => {
    if (!(name in values)) throw new Error(`Secret not found: ${name}`);
    return values[name];
  };

  return {
    resolveSecret: async ({ name }) => lookup(name),
    resolveBySchema: async ({ value }) => ({ resolved: value, warnings: [] }),
    resolveForRun: async ({ secretEnv }) => {
      const env: Record<string, string> = {};
      for (const envName of Object.keys(secretEnv)) {
        // `replace` with a global regex substitutes every placeholder and
        // starts matching from index 0 on each call.
        env[envName] = secretEnv[envName].replace(
          placeholder,
          (_match, name: string) => lookup(name),
        );
      }
      // Masking is a pass-through here; these tests only inspect `env`.
      const masking = {
        size: 0,
        maskText: (t) => t,
        maskDeep: (v) => v,
      };
      return { env, masking };
    },
  };
}
|
|
36
|
+
|
|
37
|
+
// Build a minimal SatelliteAssignment fixture: only `configId` and
// `collectors` vary between tests, every other field is a fixed placeholder.
function assignment(
  configId: string,
  collectors: SatelliteAssignment["collectors"],
): SatelliteAssignment {
  const fixture: SatelliteAssignment = {
    configId,
    systemId: "sys-1",
    strategyId: "script",
    config: {},
    collectors,
    intervalSeconds: 60,
  };
  return fixture;
}
|
|
50
|
+
|
|
51
|
+
describe("resolveSatelliteRunSecrets", () => {
  // Every case asks as satellite "sat-1" for collector "col-1"; only the
  // config id, the persisted assignments and the available secrets vary.
  const request = (opts: {
    configId: string;
    assignments: SatelliteAssignment[];
    secrets: Record<string, string>;
  }) =>
    resolveSatelliteRunSecrets({
      satelliteId: "sat-1",
      configId: opts.configId,
      collectorId: "col-1",
      getAssignmentsForSatellite: async () => opts.assignments,
      resolver: fakeResolver(opts.secrets),
    });

  // One assignment ("config-1") holding a single inline-script collector
  // "col-1" with the given collector config.
  const singleCollector = (
    config: Record<string, unknown>,
  ): SatelliteAssignment[] => [
    assignment("config-1", [
      { id: "col-1", collectorId: "inline-script", config },
    ]),
  ];

  it("resolves ONLY the collector's declared secretEnv from the assignment", async () => {
    const env = await request({
      configId: "config-1",
      assignments: singleCollector({
        secretEnv: { API_TOKEN: "${{ secrets.jira_token }}" },
      }),
      // `other` is available but undeclared, so it must not be returned.
      secrets: { jira_token: "real-value", other: "nope" },
    });
    expect(env).toEqual({ API_TOKEN: "real-value" });
  });

  it("throws when the assignment is not assigned to this satellite", async () => {
    const pending = request({
      configId: "missing",
      assignments: [],
      secrets: {},
    });
    await expect(pending).rejects.toThrow(/No assignment/);
  });

  it("throws when the collector declares no secretEnv (least-privilege)", async () => {
    const pending = request({
      configId: "config-1",
      assignments: singleCollector({}),
      secrets: {},
    });
    await expect(pending).rejects.toThrow(/no secretEnv/);
  });

  it("propagates a clear error when a required secret cannot resolve", async () => {
    const pending = request({
      configId: "config-1",
      assignments: singleCollector({
        secretEnv: { TOKEN: "${{ secrets.absent }}" },
      }),
      secrets: {},
    });
    await expect(pending).rejects.toThrow(/Secret not found: absent/);
  });
});
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
import { secretEnvMappingSchema } from "@checkstack/secrets-common";
|
|
2
|
+
import type { SecretResolverService } from "@checkstack/secrets-backend";
|
|
3
|
+
import type { SatelliteAssignment } from "@checkstack/satellite-common";
|
|
4
|
+
|
|
5
|
+
/**
 * Resolve a satellite collector run's secrets just-in-time.
 *
 * Security model (least-privilege, decision 5): the satellite asks by
 * `configId` + `collectorId` only. Core reads the `secretEnv` mapping from
 * the satellite's OWN persisted assignment for that collector — the
 * satellite does not get to choose which secrets — and resolves ONLY those
 * refs via the central resolver. So a compromised satellite cannot request
 * arbitrary secrets; it can only obtain what its assignment already
 * declares it needs.
 *
 * @param satelliteId Satellite whose assignments are consulted.
 * @param configId    `configId` of the assignment the run belongs to.
 * @param collectorId Identifies the collector inside that assignment. An
 *                    entry whose own `id` equals this value wins; otherwise
 *                    the first entry whose `collectorId` equals it is used.
 * @param getAssignmentsForSatellite Loads the satellite's assignments.
 * @param resolver    Central secret resolver used for the declared mapping.
 * @returns The resolved env map. The values are never persisted here.
 * @throws Error when the assignment or collector is not found, when the
 *   collector declares no (or an empty) `secretEnv`, when the declared
 *   `secretEnv` fails validation, or — propagated from the resolver — when a
 *   referenced secret can't be resolved.
 */
export async function resolveSatelliteRunSecrets({
  satelliteId,
  configId,
  collectorId,
  getAssignmentsForSatellite,
  resolver,
}: {
  satelliteId: string;
  configId: string;
  collectorId: string;
  getAssignmentsForSatellite: (
    satelliteId: string,
  ) => Promise<SatelliteAssignment[]>;
  resolver: SecretResolverService;
}): Promise<Record<string, string>> {
  const assignments = await getAssignmentsForSatellite(satelliteId);
  const assignment = assignments.find((a) => a.configId === configId);
  if (!assignment) {
    throw new Error(
      `No assignment "${configId}" for this satellite; cannot deliver secrets.`,
    );
  }

  // Prefer an exact match on the entry's own `id` and only then fall back to
  // `collectorId`. A single combined predicate would let an earlier entry that
  // merely shares the `collectorId` shadow the entry that was actually asked
  // for — and hand out a different collector's secrets.
  const collectors = assignment.collectors ?? [];
  const collector =
    collectors.find((c) => c.id === collectorId) ??
    collectors.find((c) => c.collectorId === collectorId);
  if (!collector) {
    throw new Error(
      `Collector "${collectorId}" not found in assignment "${configId}".`,
    );
  }

  // The declared mapping lives inside the collector's config. Read it
  // defensively so a missing config surfaces as the clear error below rather
  // than a raw TypeError.
  const declared = (
    collector.config as { secretEnv?: unknown } | null | undefined
  )?.secretEnv;
  if (declared == null) {
    throw new Error(
      `Collector "${collectorId}" declares no secretEnv; nothing to resolve.`,
    );
  }

  // Validate the mapping so a malformed config can't smuggle non-template
  // values through, and report that case distinctly from "nothing declared".
  const parsed = secretEnvMappingSchema.safeParse(declared);
  if (!parsed.success) {
    throw new Error(
      `Collector "${collectorId}" has an invalid secretEnv mapping; refusing to resolve.`,
    );
  }
  if (Object.keys(parsed.data).length === 0) {
    throw new Error(
      `Collector "${collectorId}" declares no secretEnv; nothing to resolve.`,
    );
  }

  const { env } = await resolver.resolveForRun({ secretEnv: parsed.data });
  return env;
}
|