@checkstack/satellite-backend 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +153 -0
- package/drizzle/0001_tiresome_terror.sql +3 -0
- package/drizzle/0002_graceful_mac_gargan.sql +2 -0
- package/drizzle/meta/0001_snapshot.json +102 -0
- package/drizzle/meta/0002_snapshot.json +89 -0
- package/drizzle/meta/_journal.json +14 -0
- package/package.json +20 -13
- package/src/automations.ts +65 -24
- package/src/entity.test.ts +313 -0
- package/src/entity.ts +221 -0
- package/src/heartbeat-monitor.it.test.ts +232 -0
- package/src/heartbeat-monitor.test.ts +156 -83
- package/src/heartbeat-monitor.ts +102 -71
- package/src/hooks.ts +9 -39
- package/src/index.ts +168 -9
- package/src/run-secret-resolver.test.ts +121 -0
- package/src/run-secret-resolver.ts +66 -0
- package/src/satellite-ws-handler.test.ts +267 -0
- package/src/satellite-ws-handler.ts +242 -49
- package/src/schema.ts +22 -1
- package/src/service.test.ts +274 -0
- package/src/service.ts +133 -15
- package/src/status.ts +18 -0
- package/tsconfig.json +15 -0
- package/src/automations.test.ts +0 -54
package/src/heartbeat-monitor.ts
CHANGED
|
@@ -1,110 +1,141 @@
|
|
|
1
|
-
import type {
|
|
1
|
+
import type { Logger } from "@checkstack/backend-api";
|
|
2
2
|
import type { SignalService } from "@checkstack/signal-common";
|
|
3
3
|
import type { SatelliteService } from "./service";
|
|
4
|
-
import {
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
} from "@checkstack/satellite-common";
|
|
4
|
+
import type { SatelliteConnectionEvent } from "./entity";
|
|
5
|
+
import { computeStatus } from "./status";
|
|
6
|
+
import { SATELLITE_STATUS_CHANGED } from "@checkstack/satellite-common";
|
|
8
7
|
|
|
9
8
|
/**
|
|
10
|
-
*
|
|
11
|
-
* `satellite
|
|
12
|
-
*
|
|
13
|
-
*
|
|
9
|
+
* Plug-point for driving the heartbeat-lost (`online` → `offline`) edge into
|
|
10
|
+
* the reactive `satellite-connection` entity (reactive automation engine
|
|
11
|
+
* §10.6). Bound from `afterPluginsReady`; when not provided, no entity state is
|
|
12
|
+
* mirrored.
|
|
13
|
+
*
|
|
14
|
+
* The monitor flips ONLY `lastConnectionEvent` to `"heartbeat_lost"` (leaving
|
|
15
|
+
* the already-aged `lastHeartbeatAt` untouched, since it is what made the
|
|
16
|
+
* computed status `offline` in the first place). The change-deriver re-fires
|
|
17
|
+
* `satellite.heartbeat_lost`. The opposite edge (offline→online) is mirrored as
|
|
18
|
+
* `connected` by the WS handler on reconnect, so the monitor leaves it alone.
|
|
14
19
|
*/
|
|
15
|
-
export interface
|
|
16
|
-
|
|
17
|
-
heartbeatLostHook: Hook<{
|
|
18
|
-
satelliteId: string;
|
|
19
|
-
name: string;
|
|
20
|
-
region: string;
|
|
21
|
-
timestamp: string;
|
|
22
|
-
}>;
|
|
20
|
+
export interface SatelliteHeartbeatEntitySink {
|
|
21
|
+
mirror: (satelliteId: string) => Promise<void>;
|
|
23
22
|
}
|
|
24
23
|
|
|
25
24
|
/**
|
|
26
|
-
* Monitors satellite heartbeats and
|
|
27
|
-
*
|
|
25
|
+
* Monitors satellite heartbeats and detects the online→offline transition from
|
|
26
|
+
* DURABLE state alone — no pod-local baseline.
|
|
27
|
+
*
|
|
28
|
+
* ## Horizontal-scale correctness
|
|
29
|
+
*
|
|
30
|
+
* The heartbeat-check job runs under ONE consumer group claimed by a VARYING
|
|
31
|
+
* pod. A process-local "previous status" map is therefore wrong: a pod with an
|
|
32
|
+
* empty map never sees the online→offline edge, so `connectionStatus` could get
|
|
33
|
+
* stuck `online` forever after a pod crash. This monitor instead reads every
|
|
34
|
+
* satellite's durable `(lastHeartbeatAt, lastConnectionEvent)`, computes status
|
|
35
|
+
* via {@link computeStatus} (the same wall-clock liveness rule the entity read
|
|
36
|
+
* uses), and detects the heartbeat-lost edge purely from durable state:
|
|
37
|
+
*
|
|
38
|
+
* computed status is `offline` AND `lastConnectionEvent === "connected"`
|
|
39
|
+
* ⇒ this satellite just lost its heartbeat (it was last marked connected,
|
|
40
|
+
* but its heartbeat has now aged past the offline threshold).
|
|
41
|
+
*
|
|
42
|
+
* The mutate that flips `lastConnectionEvent` to `"heartbeat_lost"` is
|
|
43
|
+
* IDEMPOTENT across pods and redelivery: once it is `"heartbeat_lost"`, the
|
|
44
|
+
* predicate above is false, so re-runs (on any pod) are no-ops, and the entity
|
|
45
|
+
* handle's diff-on-unchanged suppresses any duplicate transition/event. Any pod
|
|
46
|
+
* can therefore drive the edge correctly, regardless of which pod (if any) ever
|
|
47
|
+
* observed the satellite online in memory.
|
|
28
48
|
*/
|
|
29
49
|
export class HeartbeatMonitor {
|
|
30
50
|
/**
|
|
31
|
-
*
|
|
32
|
-
*
|
|
51
|
+
* Pod-local broadcast-dedup ONLY (never the source of truth). The durable
|
|
52
|
+
* `lastConnectionEvent` flip is what makes detection idempotent; this set
|
|
53
|
+
* merely avoids re-broadcasting the same status-change signal from this pod on
|
|
54
|
+
* back-to-back checks. A fresh pod with an empty set still detects + mirrors
|
|
55
|
+
* the edge from durable state — it just also broadcasts once, which is benign.
|
|
33
56
|
*/
|
|
34
|
-
private
|
|
57
|
+
private broadcastedOffline = new Set<string>();
|
|
35
58
|
|
|
36
59
|
constructor(
|
|
37
60
|
private service: SatelliteService,
|
|
38
61
|
private signalService: SignalService,
|
|
39
62
|
private logger: Logger,
|
|
40
|
-
private
|
|
63
|
+
private entitySink?: SatelliteHeartbeatEntitySink,
|
|
41
64
|
) {}
|
|
42
65
|
|
|
43
66
|
/**
|
|
44
|
-
* Check all satellites and
|
|
45
|
-
* Called periodically by a recurring
|
|
67
|
+
* Check all satellites and drive the heartbeat-lost edge for any that have
|
|
68
|
+
* aged out while still marked connected. Called periodically by a recurring
|
|
69
|
+
* queue job; safe to run on any pod and to redeliver.
|
|
46
70
|
*/
|
|
47
71
|
async checkHeartbeats(): Promise<void> {
|
|
48
|
-
const
|
|
72
|
+
const rows = await this.service.listConnectionLiveness();
|
|
73
|
+
const liveIds = new Set(rows.map((r) => r.id));
|
|
49
74
|
|
|
50
|
-
for (const
|
|
51
|
-
const
|
|
52
|
-
const
|
|
75
|
+
for (const row of rows) {
|
|
76
|
+
const status = computeStatus(row.lastHeartbeatAt);
|
|
77
|
+
const lostHeartbeat = this.hasLostHeartbeat({
|
|
78
|
+
status,
|
|
79
|
+
lastConnectionEvent: row.lastConnectionEvent,
|
|
80
|
+
});
|
|
53
81
|
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
);
|
|
82
|
+
if (!lostHeartbeat) {
|
|
83
|
+
// Still online (or already past the lost edge / never connected):
|
|
84
|
+
// nothing to detect. Clear the broadcast-dedup marker once a satellite
|
|
85
|
+
// is back online, so a future lost edge re-broadcasts.
|
|
86
|
+
if (status === "online") this.broadcastedOffline.delete(row.id);
|
|
87
|
+
continue;
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
// Durable heartbeat-lost edge: computed offline while still marked
|
|
91
|
+
// `connected`. Detected from durable state, so this fires correctly from
|
|
92
|
+
// ANY pod with no prior in-memory knowledge of the satellite.
|
|
93
|
+
this.logger.info(
|
|
94
|
+
`Satellite ${row.name} (${row.region}) lost heartbeat (online → offline)`,
|
|
95
|
+
);
|
|
59
96
|
|
|
97
|
+
// Broadcast the status-change signal once per offline edge from this pod.
|
|
98
|
+
if (!this.broadcastedOffline.has(row.id)) {
|
|
99
|
+
this.broadcastedOffline.add(row.id);
|
|
60
100
|
await this.signalService.broadcast(SATELLITE_STATUS_CHANGED, {
|
|
61
|
-
satelliteId:
|
|
62
|
-
status:
|
|
63
|
-
name:
|
|
64
|
-
region:
|
|
101
|
+
satelliteId: row.id,
|
|
102
|
+
status: "offline",
|
|
103
|
+
name: row.name,
|
|
104
|
+
region: row.region,
|
|
65
105
|
});
|
|
106
|
+
}
|
|
66
107
|
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
name: satellite.name,
|
|
79
|
-
region: satellite.region,
|
|
80
|
-
timestamp: new Date().toISOString(),
|
|
81
|
-
});
|
|
82
|
-
} catch (error) {
|
|
83
|
-
this.logger.error(
|
|
84
|
-
`Failed to emit satellite.heartbeat_lost hook for ${satellite.name}:`,
|
|
85
|
-
error,
|
|
86
|
-
);
|
|
87
|
-
}
|
|
108
|
+
// Drive the entity edge. The mutate is idempotent: it flips
|
|
109
|
+
// `lastConnectionEvent` to `"heartbeat_lost"`, after which this branch is
|
|
110
|
+
// not re-entered for that satellite until it reconnects (re-runs are no-ops).
|
|
111
|
+
if (this.entitySink) {
|
|
112
|
+
try {
|
|
113
|
+
await this.entitySink.mirror(row.id);
|
|
114
|
+
} catch (error) {
|
|
115
|
+
this.logger.error(
|
|
116
|
+
`Failed to mirror satellite-connection (heartbeat_lost) for ${row.name}:`,
|
|
117
|
+
error,
|
|
118
|
+
);
|
|
88
119
|
}
|
|
89
120
|
}
|
|
90
|
-
|
|
91
|
-
this.previousStatuses.set(satellite.id, currentStatus);
|
|
92
121
|
}
|
|
93
122
|
|
|
94
|
-
//
|
|
95
|
-
const
|
|
96
|
-
|
|
97
|
-
if (!currentIds.has(trackedId)) {
|
|
98
|
-
this.previousStatuses.delete(trackedId);
|
|
99
|
-
}
|
|
123
|
+
// Drop broadcast-dedup markers for satellites that no longer exist.
|
|
124
|
+
for (const id of this.broadcastedOffline) {
|
|
125
|
+
if (!liveIds.has(id)) this.broadcastedOffline.delete(id);
|
|
100
126
|
}
|
|
101
127
|
}
|
|
102
128
|
|
|
103
129
|
/**
|
|
104
|
-
*
|
|
105
|
-
*
|
|
130
|
+
* Pure predicate: a satellite has just lost its heartbeat when its computed
|
|
131
|
+
* status is `offline` but its last recorded lifecycle edge still says it was
|
|
132
|
+
* `connected`. Once the edge is mirrored (`lastConnectionEvent` becomes
|
|
133
|
+
* `"heartbeat_lost"`), this returns false — the idempotency guarantee.
|
|
106
134
|
*/
|
|
107
|
-
|
|
108
|
-
|
|
135
|
+
private hasLostHeartbeat(props: {
|
|
136
|
+
status: "online" | "offline";
|
|
137
|
+
lastConnectionEvent: SatelliteConnectionEvent | null;
|
|
138
|
+
}): boolean {
|
|
139
|
+
return props.status === "offline" && props.lastConnectionEvent === "connected";
|
|
109
140
|
}
|
|
110
141
|
}
|
package/src/hooks.ts
CHANGED
|
@@ -2,8 +2,15 @@ import { createHook } from "@checkstack/backend-api";
|
|
|
2
2
|
|
|
3
3
|
/**
|
|
4
4
|
* Satellite hooks for cross-plugin communication.
|
|
5
|
-
*
|
|
6
|
-
*
|
|
5
|
+
*
|
|
6
|
+
* The connection-lifecycle hooks (`satellite.connected` / `.disconnected` /
|
|
7
|
+
* `.heartbeat_lost`) were removed in Phase 4 (reactive automation engine
|
|
8
|
+
* §10.6): satellite connection state is now the reactive
|
|
9
|
+
* `satellite-connection` entity (see `./entity.ts`), and the equivalent
|
|
10
|
+
* trigger events are derived from its changes.
|
|
11
|
+
*
|
|
12
|
+
* `satellite.removed` stays — it is a deletion/cleanup signal (consumed by
|
|
13
|
+
* healthcheck-backend to scrub the satellite's id), not entity state.
|
|
7
14
|
*/
|
|
8
15
|
export const satelliteHooks = {
|
|
9
16
|
/**
|
|
@@ -14,41 +21,4 @@ export const satelliteHooks = {
|
|
|
14
21
|
satelliteRemoved: createHook<{
|
|
15
22
|
satelliteId: string;
|
|
16
23
|
}>("satellite.removed"),
|
|
17
|
-
|
|
18
|
-
/**
|
|
19
|
-
* Emitted when a satellite WebSocket completes authentication and
|
|
20
|
-
* registers itself in the in-memory connection map.
|
|
21
|
-
*/
|
|
22
|
-
connected: createHook<{
|
|
23
|
-
satelliteId: string;
|
|
24
|
-
name: string;
|
|
25
|
-
region: string;
|
|
26
|
-
timestamp: string;
|
|
27
|
-
}>("satellite.connected"),
|
|
28
|
-
|
|
29
|
-
/**
|
|
30
|
-
* Emitted when a previously-connected satellite's WebSocket closes
|
|
31
|
-
* (graceful or otherwise). Distinct from `heartbeatLost`: this fires
|
|
32
|
-
* the moment the socket drops, regardless of whether the satellite
|
|
33
|
-
* comes back within the heartbeat window.
|
|
34
|
-
*/
|
|
35
|
-
disconnected: createHook<{
|
|
36
|
-
satelliteId: string;
|
|
37
|
-
name: string;
|
|
38
|
-
region: string;
|
|
39
|
-
timestamp: string;
|
|
40
|
-
}>("satellite.disconnected"),
|
|
41
|
-
|
|
42
|
-
/**
|
|
43
|
-
* Emitted by the heartbeat monitor when a satellite's status
|
|
44
|
-
* transitions from `online` to `offline` — i.e. no heartbeat for
|
|
45
|
-
* longer than `OFFLINE_THRESHOLD_MS`. Used by automations that
|
|
46
|
-
* page on stale satellites.
|
|
47
|
-
*/
|
|
48
|
-
heartbeatLost: createHook<{
|
|
49
|
-
satelliteId: string;
|
|
50
|
-
name: string;
|
|
51
|
-
region: string;
|
|
52
|
-
timestamp: string;
|
|
53
|
-
}>("satellite.heartbeat_lost"),
|
|
54
24
|
} as const;
|
package/src/index.ts
CHANGED
|
@@ -9,6 +9,10 @@ import {
|
|
|
9
9
|
} from "@checkstack/satellite-common";
|
|
10
10
|
import { HealthCheckApi } from "@checkstack/healthcheck-common";
|
|
11
11
|
import { healthCheckHooks } from "@checkstack/healthcheck-backend";
|
|
12
|
+
import { ScriptPackagesApi } from "@checkstack/script-packages-common";
|
|
13
|
+
import { scriptPackagesChangedHook } from "@checkstack/script-packages-backend";
|
|
14
|
+
import { secretResolverRef } from "@checkstack/secrets-backend";
|
|
15
|
+
import { resolveSatelliteRunSecrets } from "./run-secret-resolver";
|
|
12
16
|
import { SatelliteService } from "./service";
|
|
13
17
|
import { createSatelliteRouter } from "./router";
|
|
14
18
|
import { HeartbeatMonitor } from "./heartbeat-monitor";
|
|
@@ -16,9 +20,21 @@ import { SatelliteWsHandler } from "./satellite-ws-handler";
|
|
|
16
20
|
import { ConfigRelay } from "./config-relay";
|
|
17
21
|
import { entityKindExtensionPoint } from "@checkstack/gitops-backend";
|
|
18
22
|
import { registerSatelliteGitOpsKinds } from "./satellite-gitops-kinds";
|
|
19
|
-
import {
|
|
23
|
+
import {
|
|
24
|
+
automationTriggerExtensionPoint,
|
|
25
|
+
entityExtensionPoint,
|
|
26
|
+
withEntityWrite,
|
|
27
|
+
type EntityHandle,
|
|
28
|
+
} from "@checkstack/automation-backend";
|
|
29
|
+
import {
|
|
30
|
+
SATELLITE_CONNECTION_ENTITY_KIND,
|
|
31
|
+
createSatelliteConnectionRead,
|
|
32
|
+
deriveSatelliteConnectionEvents,
|
|
33
|
+
satelliteChangeToPayload,
|
|
34
|
+
satelliteConnectionStateSchema,
|
|
35
|
+
type SatelliteConnectionState,
|
|
36
|
+
} from "./entity";
|
|
20
37
|
import { satelliteTriggers } from "./automations";
|
|
21
|
-
import { satelliteHooks } from "./hooks";
|
|
22
38
|
|
|
23
39
|
// Queue and job constants
|
|
24
40
|
const HEARTBEAT_QUEUE = "satellite-heartbeat";
|
|
@@ -30,7 +46,29 @@ export default createBackendPlugin({
|
|
|
30
46
|
register(env) {
|
|
31
47
|
env.registerAccessRules(satelliteAccessRules);
|
|
32
48
|
|
|
33
|
-
// ─── Automation Platform:
|
|
49
|
+
// ─── Automation Platform: reactive connection entity ─────────────
|
|
50
|
+
// Satellite connection state is the `satellite-connection` entity
|
|
51
|
+
// (reactive automation engine §10.6, §9.1), PLUGIN-BACKED (Model B) and
|
|
52
|
+
// COMPUTE-ON-READ: its `status` is DERIVED on read from the DURABLE, shared
|
|
53
|
+
// `satellites.lastHeartbeatAt` column (the single liveness source of truth,
|
|
54
|
+
// same as the admin list), and `lastConnectionEvent` is the only extra
|
|
55
|
+
// durable column (the deriver's event discriminator). There is NO stored
|
|
56
|
+
// status copy and NO framework `entity_state` mirror, so EVERY pod computes
|
|
57
|
+
// the same state AND a stale row self-heals to offline once the heartbeat
|
|
58
|
+
// ages out (this fixes two horizontal-scaling bugs: the old in-memory
|
|
59
|
+
// map made pod A's satellite invisible to pod B, and the prior fix's stored
|
|
60
|
+
// status got stuck `online` after a pod crash because the heartbeat-lost
|
|
61
|
+
// EDGE was detected pod-locally). The three lifecycle sites (connect /
|
|
62
|
+
// disconnect / heartbeat-lost) write the liveness inputs through
|
|
63
|
+
// `handle.mutate`, and the framework records full transition HISTORY in
|
|
64
|
+
// `entity_transitions`.
|
|
65
|
+
//
|
|
66
|
+
// The `satellite.connected` / `.disconnected` / `.heartbeat_lost` trigger
|
|
67
|
+
// events are DERIVED from its changes (no hook-backed triggers). The
|
|
68
|
+
// ENTITY-DRIVEN triggers below stay registered so they remain in the
|
|
69
|
+
// editor's trigger catalog + payload-introspectable, and a `toPayload`
|
|
70
|
+
// mapper makes the runtime `trigger.payload` match their `payloadSchema`
|
|
71
|
+
// (mirroring incident / catalog / dependency / healthcheck).
|
|
34
72
|
const automationTriggers = env.getExtensionPoint(
|
|
35
73
|
automationTriggerExtensionPoint,
|
|
36
74
|
);
|
|
@@ -38,6 +76,20 @@ export default createBackendPlugin({
|
|
|
38
76
|
automationTriggers.registerTrigger(trigger, pluginMetadata);
|
|
39
77
|
}
|
|
40
78
|
|
|
79
|
+
const entity = env.getExtensionPoint(entityExtensionPoint);
|
|
80
|
+
entity.registerChangeDeriver({
|
|
81
|
+
kind: SATELLITE_CONNECTION_ENTITY_KIND,
|
|
82
|
+
derive: deriveSatelliteConnectionEvents,
|
|
83
|
+
toPayload: satelliteChangeToPayload,
|
|
84
|
+
});
|
|
85
|
+
entity.declareNonReactiveState({
|
|
86
|
+
table: "satellites",
|
|
87
|
+
reason: "bookkeeping",
|
|
88
|
+
note: "lastHeartbeatAt is the raw liveness timestamp; the satellite-connection entity's reactive status is computed from it on read.",
|
|
89
|
+
});
|
|
90
|
+
// Created once in init; reused by the WS handler + heartbeat monitor.
|
|
91
|
+
let satelliteEntityHandle: EntityHandle<SatelliteConnectionState>;
|
|
92
|
+
|
|
41
93
|
// ─── GitOps Entity Kind Registration ─────────────────────────────
|
|
42
94
|
let gitopsService: SatelliteService | undefined;
|
|
43
95
|
const kindRegistry = env.getExtensionPoint(entityKindExtensionPoint);
|
|
@@ -58,6 +110,7 @@ export default createBackendPlugin({
|
|
|
58
110
|
signalService: coreServices.signalService,
|
|
59
111
|
queueManager: coreServices.queueManager,
|
|
60
112
|
wsRegistry: coreServices.wsRegistry,
|
|
113
|
+
secretResolver: secretResolverRef,
|
|
61
114
|
},
|
|
62
115
|
init: async ({ logger, database, rpc, signalService }) => {
|
|
63
116
|
logger.debug("🛰️ Initializing Satellite Backend...");
|
|
@@ -67,6 +120,20 @@ export default createBackendPlugin({
|
|
|
67
120
|
);
|
|
68
121
|
gitopsService = service;
|
|
69
122
|
|
|
123
|
+
// Declare the reactive `satellite-connection` entity once. PLUGIN-
|
|
124
|
+
// BACKED, COMPUTE-ON-READ: `read` computes status from the durable
|
|
125
|
+
// `satellites.lastHeartbeatAt` (+ reads `lastConnectionEvent`) via the
|
|
126
|
+
// service (the source of truth — no stored status copy, no
|
|
127
|
+
// `entity_state` mirror, globally consistent from any pod). The handle
|
|
128
|
+
// is the only typed path that drives connection-state changes (reactive
|
|
129
|
+
// automation engine §4.2); it is reused by the WS handler + heartbeat
|
|
130
|
+
// monitor wired in afterPluginsReady.
|
|
131
|
+
satelliteEntityHandle = entity.defineEntity({
|
|
132
|
+
kind: SATELLITE_CONNECTION_ENTITY_KIND,
|
|
133
|
+
state: satelliteConnectionStateSchema,
|
|
134
|
+
read: createSatelliteConnectionRead(service),
|
|
135
|
+
});
|
|
136
|
+
|
|
70
137
|
const router = createSatelliteRouter({
|
|
71
138
|
service,
|
|
72
139
|
signalService,
|
|
@@ -83,8 +150,8 @@ export default createBackendPlugin({
|
|
|
83
150
|
signalService,
|
|
84
151
|
wsRegistry,
|
|
85
152
|
rpcClient,
|
|
153
|
+
secretResolver,
|
|
86
154
|
onHook,
|
|
87
|
-
emitHook,
|
|
88
155
|
}) => {
|
|
89
156
|
const service = new SatelliteService(
|
|
90
157
|
database as SafeDatabase<typeof schema>,
|
|
@@ -125,9 +192,69 @@ export default createBackendPlugin({
|
|
|
125
192
|
},
|
|
126
193
|
logger,
|
|
127
194
|
{
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
195
|
+
// Drive connect/disconnect through `handle.mutate` (Model B):
|
|
196
|
+
// `apply` UPDATEs the satellite row's durable liveness columns
|
|
197
|
+
// (`lastHeartbeatAt` + `lastConnectionEvent`) — the globally-
|
|
198
|
+
// readable source of truth — and returns the view (status COMPUTED
|
|
199
|
+
// from `lastHeartbeatAt`). The framework snapshots `prev` via
|
|
200
|
+
// `read`, records the transition (durable history), and emits the
|
|
201
|
+
// change; the deriver re-fires the equivalent trigger events.
|
|
202
|
+
mirror: async ({ satelliteId, lastEvent, lastHeartbeatAt }) => {
|
|
203
|
+
await withEntityWrite({
|
|
204
|
+
handle: satelliteEntityHandle,
|
|
205
|
+
id: satelliteId,
|
|
206
|
+
apply: () =>
|
|
207
|
+
service.applyConnectionState({
|
|
208
|
+
satelliteId,
|
|
209
|
+
lastEvent,
|
|
210
|
+
lastHeartbeatAt,
|
|
211
|
+
}),
|
|
212
|
+
});
|
|
213
|
+
},
|
|
214
|
+
},
|
|
215
|
+
{
|
|
216
|
+
// Script-package distribution: carry the desired lockfile hash in
|
|
217
|
+
// assignment payloads + persist per-satellite reconcile state.
|
|
218
|
+
// Satellites pull blobs from CORE (getManifest/downloadBlob),
|
|
219
|
+
// never the registry.
|
|
220
|
+
getDesiredLockfileHash: async () => {
|
|
221
|
+
const spClient = rpcClient.forPlugin(ScriptPackagesApi);
|
|
222
|
+
const state = await spClient.getInstallState();
|
|
223
|
+
return state.lockfileHash;
|
|
224
|
+
},
|
|
225
|
+
reportSyncState: async (input) => {
|
|
226
|
+
const spClient = rpcClient.forPlugin(ScriptPackagesApi);
|
|
227
|
+
await spClient.reportSatelliteSyncState(input);
|
|
228
|
+
},
|
|
229
|
+
getManifest: async ({ lockfileHash }) => {
|
|
230
|
+
const spClient = rpcClient.forPlugin(ScriptPackagesApi);
|
|
231
|
+
const res = await spClient.getManifest({ lockfileHash });
|
|
232
|
+
return res.entries;
|
|
233
|
+
},
|
|
234
|
+
getBlobBase64: async ({ integrity }) => {
|
|
235
|
+
const spClient = rpcClient.forPlugin(ScriptPackagesApi);
|
|
236
|
+
try {
|
|
237
|
+
const res = await spClient.downloadBlob({ integrity });
|
|
238
|
+
return res.data;
|
|
239
|
+
} catch {
|
|
240
|
+
return null;
|
|
241
|
+
}
|
|
242
|
+
},
|
|
243
|
+
},
|
|
244
|
+
{
|
|
245
|
+
// JIT secret delivery: resolve a collector's declared secretEnv
|
|
246
|
+
// (read from the satellite's own assignment) via the central
|
|
247
|
+
// resolver. Values are returned over the WS channel per-run and
|
|
248
|
+
// never persisted.
|
|
249
|
+
resolveRunSecrets: async ({ satelliteId, configId, collectorId }) =>
|
|
250
|
+
resolveSatelliteRunSecrets({
|
|
251
|
+
satelliteId,
|
|
252
|
+
configId,
|
|
253
|
+
collectorId,
|
|
254
|
+
getAssignmentsForSatellite: (id) =>
|
|
255
|
+
configRelay.getAssignmentsForSatellite(id),
|
|
256
|
+
resolver: secretResolver,
|
|
257
|
+
}),
|
|
131
258
|
},
|
|
132
259
|
);
|
|
133
260
|
|
|
@@ -142,8 +269,28 @@ export default createBackendPlugin({
|
|
|
142
269
|
signalService,
|
|
143
270
|
logger,
|
|
144
271
|
{
|
|
145
|
-
|
|
146
|
-
|
|
272
|
+
// Drive the online → offline (heartbeat-lost) edge through
|
|
273
|
+
// `handle.mutate`. `apply` flips ONLY `lastConnectionEvent` to
|
|
274
|
+
// `"heartbeat_lost"` (the aged `lastHeartbeatAt` is left untouched —
|
|
275
|
+
// it is what made the computed status `offline`). The framework
|
|
276
|
+
// records the transition (durable history) and the deriver re-fires
|
|
277
|
+
// `satellite.heartbeat_lost`. The mutate is idempotent: once
|
|
278
|
+
// `lastConnectionEvent === "heartbeat_lost"`, the monitor's
|
|
279
|
+
// predicate is false and re-runs (on any pod) are no-ops. This is
|
|
280
|
+
// the durable, any-pod offline-on-timeout backstop: a pod that dies
|
|
281
|
+
// without flipping its satellites to offline leaves a stale state
|
|
282
|
+
// only until ANY pod's monitor observes the heartbeat timeout.
|
|
283
|
+
mirror: async (satelliteId) => {
|
|
284
|
+
await withEntityWrite({
|
|
285
|
+
handle: satelliteEntityHandle,
|
|
286
|
+
id: satelliteId,
|
|
287
|
+
apply: () =>
|
|
288
|
+
service.applyConnectionState({
|
|
289
|
+
satelliteId,
|
|
290
|
+
lastEvent: "heartbeat_lost",
|
|
291
|
+
}),
|
|
292
|
+
});
|
|
293
|
+
},
|
|
147
294
|
},
|
|
148
295
|
);
|
|
149
296
|
|
|
@@ -184,6 +331,18 @@ export default createBackendPlugin({
|
|
|
184
331
|
},
|
|
185
332
|
);
|
|
186
333
|
|
|
334
|
+
// Fan the script-packages.changed broadcast out to THIS instance's
|
|
335
|
+
// connected satellites. Every core instance subscribes in broadcast
|
|
336
|
+
// mode, so each pushes to its own satellites; offline satellites
|
|
337
|
+
// converge via the assignment-carried lockfile hash on reconnect.
|
|
338
|
+
onHook(
|
|
339
|
+
scriptPackagesChangedHook,
|
|
340
|
+
async ({ lockfileHash }) => {
|
|
341
|
+
wsHandler.pushRefreshScriptPackagesToAll(lockfileHash);
|
|
342
|
+
},
|
|
343
|
+
{ mode: "broadcast" },
|
|
344
|
+
);
|
|
345
|
+
|
|
187
346
|
logger.debug("✅ Satellite Backend afterPluginsReady complete.");
|
|
188
347
|
},
|
|
189
348
|
});
|
|
package/src/run-secret-resolver.test.ts
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
import { describe, it, expect } from "bun:test";
|
|
2
|
+
import type { SatelliteAssignment } from "@checkstack/satellite-common";
|
|
3
|
+
import type { SecretResolverService } from "@checkstack/secrets-backend";
|
|
4
|
+
import { resolveSatelliteRunSecrets } from "./run-secret-resolver";
|
|
5
|
+
|
|
6
|
+
// A resolver that resolves from a fixed name->value map (mirrors the real
|
|
7
|
+
// resolveForRun: substitute ${{ secrets.NAME }} per declared env entry).
|
|
8
|
+
function fakeResolver(values: Record<string, string>): SecretResolverService {
|
|
9
|
+
const TEMPLATE_RE = /\$\{\{\s*secrets\.([a-zA-Z0-9_-]+)\s*\}\}/g;
|
|
10
|
+
return {
|
|
11
|
+
resolveSecret: async ({ name }) => {
|
|
12
|
+
if (!(name in values)) throw new Error(`Secret not found: ${name}`);
|
|
13
|
+
return values[name];
|
|
14
|
+
},
|
|
15
|
+
resolveBySchema: async ({ value }) => ({ resolved: value, warnings: [] }),
|
|
16
|
+
resolveForRun: async ({ secretEnv }) => {
|
|
17
|
+
const env: Record<string, string> = {};
|
|
18
|
+
for (const [envName, template] of Object.entries(secretEnv)) {
|
|
19
|
+
TEMPLATE_RE.lastIndex = 0;
|
|
20
|
+
env[envName] = template.replaceAll(TEMPLATE_RE, (_m, name: string) => {
|
|
21
|
+
if (!(name in values)) throw new Error(`Secret not found: ${name}`);
|
|
22
|
+
return values[name];
|
|
23
|
+
});
|
|
24
|
+
}
|
|
25
|
+
return {
|
|
26
|
+
env,
|
|
27
|
+
masking: {
|
|
28
|
+
size: 0,
|
|
29
|
+
maskText: (t) => t,
|
|
30
|
+
maskDeep: (v) => v,
|
|
31
|
+
},
|
|
32
|
+
};
|
|
33
|
+
},
|
|
34
|
+
};
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
function assignment(
|
|
38
|
+
configId: string,
|
|
39
|
+
collectors: SatelliteAssignment["collectors"],
|
|
40
|
+
): SatelliteAssignment {
|
|
41
|
+
return {
|
|
42
|
+
configId,
|
|
43
|
+
systemId: "sys-1",
|
|
44
|
+
strategyId: "script",
|
|
45
|
+
config: {},
|
|
46
|
+
collectors,
|
|
47
|
+
intervalSeconds: 60,
|
|
48
|
+
};
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
describe("resolveSatelliteRunSecrets", () => {
|
|
52
|
+
it("resolves ONLY the collector's declared secretEnv from the assignment", async () => {
|
|
53
|
+
const assignments = [
|
|
54
|
+
assignment("config-1", [
|
|
55
|
+
{
|
|
56
|
+
id: "col-1",
|
|
57
|
+
collectorId: "inline-script",
|
|
58
|
+
config: { secretEnv: { API_TOKEN: "${{ secrets.jira_token }}" } },
|
|
59
|
+
},
|
|
60
|
+
]),
|
|
61
|
+
];
|
|
62
|
+
const env = await resolveSatelliteRunSecrets({
|
|
63
|
+
satelliteId: "sat-1",
|
|
64
|
+
configId: "config-1",
|
|
65
|
+
collectorId: "col-1",
|
|
66
|
+
getAssignmentsForSatellite: async () => assignments,
|
|
67
|
+
resolver: fakeResolver({ jira_token: "real-value", other: "nope" }),
|
|
68
|
+
});
|
|
69
|
+
expect(env).toEqual({ API_TOKEN: "real-value" });
|
|
70
|
+
});
|
|
71
|
+
|
|
72
|
+
it("throws when the assignment is not assigned to this satellite", async () => {
|
|
73
|
+
await expect(
|
|
74
|
+
resolveSatelliteRunSecrets({
|
|
75
|
+
satelliteId: "sat-1",
|
|
76
|
+
configId: "missing",
|
|
77
|
+
collectorId: "col-1",
|
|
78
|
+
getAssignmentsForSatellite: async () => [],
|
|
79
|
+
resolver: fakeResolver({}),
|
|
80
|
+
}),
|
|
81
|
+
).rejects.toThrow(/No assignment/);
|
|
82
|
+
});
|
|
83
|
+
|
|
84
|
+
it("throws when the collector declares no secretEnv (least-privilege)", async () => {
|
|
85
|
+
const assignments = [
|
|
86
|
+
assignment("config-1", [
|
|
87
|
+
{ id: "col-1", collectorId: "inline-script", config: {} },
|
|
88
|
+
]),
|
|
89
|
+
];
|
|
90
|
+
await expect(
|
|
91
|
+
resolveSatelliteRunSecrets({
|
|
92
|
+
satelliteId: "sat-1",
|
|
93
|
+
configId: "config-1",
|
|
94
|
+
collectorId: "col-1",
|
|
95
|
+
getAssignmentsForSatellite: async () => assignments,
|
|
96
|
+
resolver: fakeResolver({}),
|
|
97
|
+
}),
|
|
98
|
+
).rejects.toThrow(/no secretEnv/);
|
|
99
|
+
});
|
|
100
|
+
|
|
101
|
+
it("propagates a clear error when a required secret cannot resolve", async () => {
|
|
102
|
+
const assignments = [
|
|
103
|
+
assignment("config-1", [
|
|
104
|
+
{
|
|
105
|
+
id: "col-1",
|
|
106
|
+
collectorId: "inline-script",
|
|
107
|
+
config: { secretEnv: { TOKEN: "${{ secrets.absent }}" } },
|
|
108
|
+
},
|
|
109
|
+
]),
|
|
110
|
+
];
|
|
111
|
+
await expect(
|
|
112
|
+
resolveSatelliteRunSecrets({
|
|
113
|
+
satelliteId: "sat-1",
|
|
114
|
+
configId: "config-1",
|
|
115
|
+
collectorId: "col-1",
|
|
116
|
+
getAssignmentsForSatellite: async () => assignments,
|
|
117
|
+
resolver: fakeResolver({}),
|
|
118
|
+
}),
|
|
119
|
+
).rejects.toThrow(/Secret not found: absent/);
|
|
120
|
+
});
|
|
121
|
+
});
|