@checkstack/satellite-backend 0.4.0 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,11 +1,13 @@
1
- import type { Hook, Logger } from "@checkstack/backend-api";
1
+ import type { Logger } from "@checkstack/backend-api";
2
2
  import type {
3
3
  WebSocketRouteHandler,
4
4
  WsConnection,
5
5
  WsConnectionHandlers,
6
6
  } from "@checkstack/backend-api";
7
+ import { extractErrorMessage } from "@checkstack/common";
7
8
  import type { SatelliteService } from "./service";
8
9
  import type { ConfigRelay } from "./config-relay";
10
+ import type { SatelliteConnectionEvent } from "./entity";
9
11
  import {
10
12
  SatelliteToCoreMessageSchema,
11
13
  type CoreToSatelliteMessage,
@@ -14,24 +16,24 @@ import {
14
16
  } from "@checkstack/satellite-common";
15
17
 
16
18
  /**
17
- * Optional plug-point for firing automation triggers when a satellite
18
- * connects or disconnects. Bound from `afterPluginsReady` where
19
- * `emitHook` is available; when not provided, no hook fires.
19
+ * Optional plug-point for driving a satellite connection lifecycle edge into
20
+ * the reactive `satellite-connection` entity (reactive automation engine
21
+ * §10.6). Bound from `afterPluginsReady` where the entity handle is available;
22
+ * when not provided, no entity state is mirrored (graceful no-op in unit tests).
23
+ *
24
+ * The WS handler calls `mirror` at the same connect / disconnect lifecycle
25
+ * points it previously emitted the `satellite.connected` / `.disconnected`
26
+ * hooks; the change-deriver re-fires the equivalent trigger events. The status
27
+ * is COMPUTED on read from `lastHeartbeatAt`, so the sink carries the new
28
+ * heartbeat value for the edge rather than a status: `now` on connect (online),
29
+ * `null` on clean disconnect (offline immediately).
20
30
  */
21
- export interface SatelliteConnectionHookSink {
22
- emitHook: <T>(hook: Hook<T>, payload: T) => Promise<void>;
23
- connectedHook: Hook<{
31
+ export interface SatelliteConnectionEntitySink {
32
+ mirror: (input: {
24
33
  satelliteId: string;
25
- name: string;
26
- region: string;
27
- timestamp: string;
28
- }>;
29
- disconnectedHook: Hook<{
30
- satelliteId: string;
31
- name: string;
32
- region: string;
33
- timestamp: string;
34
- }>;
34
+ lastEvent: SatelliteConnectionEvent;
35
+ lastHeartbeatAt: Date | null;
36
+ }) => Promise<void>;
35
37
  }
36
38
 
37
39
  /**
@@ -45,6 +47,49 @@ export interface SatelliteResultHandler {
45
47
  }): Promise<void>;
46
48
  }
47
49
 
50
+ /**
51
+ * Optional plug-point for script-package distribution to satellites. Wired
52
+ * from `afterPluginsReady` against the script-packages RPC. When absent,
53
+ * satellites simply never receive a `scriptPackagesLockfileHash` or refresh
54
+ * push (graceful no-op on installs without the plugin).
55
+ */
56
+ export interface SatelliteScriptPackageSink {
57
+ /** The desired lockfile hash to carry in assignment payloads, or null. */
58
+ getDesiredLockfileHash(): Promise<string | null>;
59
+ /** Persist a satellite's reconcile state for the admin UI. */
60
+ reportSyncState(input: {
61
+ satelliteId: string;
62
+ lockfileHash: string | null;
63
+ status: "pending" | "syncing" | "ready" | "error";
64
+ errorMessage?: string;
65
+ }): Promise<void>;
66
+ /** Manifest entries for a lockfile hash (for satellite delta diffing). */
67
+ getManifest(input: {
68
+ lockfileHash: string;
69
+ }): Promise<{ name: string; version: string; integrity: string }[]>;
70
+ /** One content-addressed blob as base64, or null if not found. */
71
+ getBlobBase64(input: { integrity: string }): Promise<string | null>;
72
+ }
73
+
74
+ /**
75
+ * Optional plug-point for just-in-time secret delivery to satellites.
76
+ * Wired from `afterPluginsReady` against `secretResolverRef`. When absent,
77
+ * a `request_run_secrets` is answered with an error (no secrets available),
78
+ * so a collector that declares `secretEnv` fails clearly rather than
79
+ * running without it.
80
+ *
81
+ * The resolver reads the declared `secretEnv` from the satellite's persisted
82
+ * assignment (the satellite does not choose which secrets), resolves ONLY
83
+ * those refs, and returns the env map. Resolved values are never persisted.
84
+ */
85
+ export interface SatelliteSecretSink {
86
+ resolveRunSecrets(input: {
87
+ satelliteId: string;
88
+ configId: string;
89
+ collectorId: string;
90
+ }): Promise<Record<string, string>>;
91
+ }
92
+
48
93
  /**
49
94
  * Active satellite connection tracking.
50
95
  */
@@ -58,7 +103,15 @@ interface SatelliteConnection {
58
103
  * Manages authentication, heartbeats, result ingestion, and config pushes.
59
104
  */
60
105
  export class SatelliteWsHandler implements WebSocketRouteHandler {
61
- /** Map of satelliteId → active WebSocket connection */
106
+ /**
107
+ * Pod-local live-socket registry: satelliteId → the WebSocket connection
108
+ * physically held by THIS pod. This is NOT the reactive entity's source of
109
+ * truth (that is the durable `satellites` connection columns, globally
110
+ * readable from any pod). It exists ONLY to route messages — config pushes,
111
+ * script-package refreshes, shutdowns — to a socket this pod actually owns;
112
+ * a satellite connected to another pod is simply absent here. Treat it as
113
+ * transport infrastructure, not state.
114
+ */
62
115
  private connections = new Map<string, SatelliteConnection>();
63
116
 
64
117
  constructor(
@@ -67,12 +120,24 @@ export class SatelliteWsHandler implements WebSocketRouteHandler {
67
120
  private resultHandler: SatelliteResultHandler,
68
121
  private logger: Logger,
69
122
  /**
70
- * Optional. When set, the handler fires `connected` / `disconnected`
71
- * hooks at the same lifecycle points it logs. Wired by
72
- * `afterPluginsReady` so the action graph stays decoupled from
73
- * `emitHook` availability.
123
+ * Optional. When set, the handler mirrors `online` / `offline`
124
+ * connection state into the reactive `satellite-connection` entity at
125
+ * the same lifecycle points it logs. Wired by `afterPluginsReady` so the
126
+ * action graph stays decoupled from entity-handle availability.
74
127
  */
75
- private connectionHookSink?: SatelliteConnectionHookSink,
128
+ private connectionEntitySink?: SatelliteConnectionEntitySink,
129
+ /**
130
+ * Optional. When set, assignment payloads carry the desired script-package
131
+ * lockfile hash and the handler can push `refresh_script_packages` +
132
+ * persist per-satellite sync state.
133
+ */
134
+ private scriptPackageSink?: SatelliteScriptPackageSink,
135
+ /**
136
+ * Optional. When set, the handler answers `request_run_secrets` by
137
+ * resolving the collector's declared secretEnv just-in-time. When
138
+ * unset, such a request is answered with an error.
139
+ */
140
+ private secretSink?: SatelliteSecretSink,
76
141
  ) {}
77
142
 
78
143
  /**
@@ -122,38 +187,47 @@ export class SatelliteWsHandler implements WebSocketRouteHandler {
122
187
  // Track connection
123
188
  this.connections.set(satellite.id, { satellite, ws });
124
189
 
125
- // Fire the automation `connected` hook (best-effort — never
126
- // block the auth handshake on a hook subscriber failure).
127
- if (this.connectionHookSink) {
190
+ // Drive the `connected` edge into the reactive entity (best-effort —
191
+ // never block the auth handshake on a mirror failure). `apply` sets
192
+ // `lastHeartbeatAt = now` so the computed status reads `online`, and
193
+ // `lastConnectionEvent = "connected"`; the change-deriver re-fires the
194
+ // `satellite.connected` trigger event. This is also the connect-time
195
+ // heartbeat write (no separate `updateHeartbeat` needed), and it runs
196
+ // through `handle.mutate` so `prev` is snapshotted BEFORE the write.
197
+ if (this.connectionEntitySink) {
128
198
  try {
129
- await this.connectionHookSink.emitHook(
130
- this.connectionHookSink.connectedHook,
131
- {
132
- satelliteId: satellite.id,
133
- name: satellite.name,
134
- region: satellite.region,
135
- timestamp: new Date().toISOString(),
136
- },
137
- );
199
+ await this.connectionEntitySink.mirror({
200
+ satelliteId: satellite.id,
201
+ lastEvent: "connected",
202
+ lastHeartbeatAt: new Date(),
203
+ });
138
204
  } catch (error) {
139
205
  this.logger.error(
140
- `Failed to emit satellite.connected hook for ${satellite.name}:`,
206
+ `Failed to mirror satellite-connection (connected) for ${satellite.name}:`,
141
207
  error,
142
208
  );
143
209
  }
210
+ } else {
211
+ // No entity sink wired (e.g. unit tests): still record the
212
+ // connect-time heartbeat directly so liveness is correct.
213
+ await this.service.updateHeartbeat(satellite.id, {});
144
214
  }
145
215
 
146
- // Update heartbeat on connect
147
- await this.service.updateHeartbeat(satellite.id, {});
148
-
149
- // Send authenticated response with full config
216
+ // Send authenticated response with full config. Carry the desired
217
+ // script-package lockfile hash as the durable convergence backstop:
218
+ // a satellite that missed a refresh push reconciles on connect.
150
219
  const assignments =
151
220
  await this.configRelay.getAssignmentsForSatellite(satellite.id);
221
+ const scriptPackagesLockfileHash =
222
+ await this.resolveDesiredLockfileHash();
152
223
 
153
224
  this.sendMessage(ws, {
154
225
  type: "authenticated",
155
226
  satelliteId: satellite.id,
156
227
  assignments,
228
+ ...(scriptPackagesLockfileHash === undefined
229
+ ? {}
230
+ : { scriptPackagesLockfileHash }),
157
231
  });
158
232
 
159
233
  this.logger.info(
@@ -184,6 +258,80 @@ export class SatelliteWsHandler implements WebSocketRouteHandler {
184
258
  );
185
259
  break;
186
260
  }
261
+ case "script_package_sync_state": {
262
+ // Persist the satellite's reconcile state for the admin UI.
263
+ try {
264
+ await this.scriptPackageSink?.reportSyncState({
265
+ satelliteId: authenticatedSatellite.id,
266
+ lockfileHash: parsed.lockfileHash,
267
+ status: parsed.status,
268
+ errorMessage: parsed.errorMessage,
269
+ });
270
+ } catch (error) {
271
+ this.logger.error(
272
+ `Failed to persist script-package sync state for ${authenticatedSatellite.name}:`,
273
+ error,
274
+ );
275
+ }
276
+ break;
277
+ }
278
+ case "request_script_package_manifest": {
279
+ const entries =
280
+ (await this.scriptPackageSink?.getManifest({
281
+ lockfileHash: parsed.lockfileHash,
282
+ })) ?? [];
283
+ this.sendMessage(ws, {
284
+ type: "script_package_manifest",
285
+ lockfileHash: parsed.lockfileHash,
286
+ entries,
287
+ });
288
+ break;
289
+ }
290
+ case "request_script_package_blob": {
291
+ const data =
292
+ (await this.scriptPackageSink?.getBlobBase64({
293
+ integrity: parsed.integrity,
294
+ })) ?? null;
295
+ this.sendMessage(ws, {
296
+ type: "script_package_blob",
297
+ integrity: parsed.integrity,
298
+ data,
299
+ });
300
+ break;
301
+ }
302
+ case "request_run_secrets": {
303
+ // JIT secret delivery: resolve ONLY the collector's declared
304
+ // secretEnv (read from the persisted assignment, not chosen by
305
+ // the satellite) and reply with the env map. On any failure,
306
+ // reply with an error so the satellite fails the run clearly.
307
+ if (!this.secretSink) {
308
+ this.sendMessage(ws, {
309
+ type: "run_secrets",
310
+ requestId: parsed.requestId,
311
+ error: "Secret delivery is not available on this core instance.",
312
+ });
313
+ break;
314
+ }
315
+ try {
316
+ const env = await this.secretSink.resolveRunSecrets({
317
+ satelliteId: authenticatedSatellite.id,
318
+ configId: parsed.configId,
319
+ collectorId: parsed.collectorId,
320
+ });
321
+ this.sendMessage(ws, {
322
+ type: "run_secrets",
323
+ requestId: parsed.requestId,
324
+ env,
325
+ });
326
+ } catch (error) {
327
+ this.sendMessage(ws, {
328
+ type: "run_secrets",
329
+ requestId: parsed.requestId,
330
+ error: extractErrorMessage(error),
331
+ });
332
+ }
333
+ break;
334
+ }
187
335
  case "authenticate": {
188
336
  // Already authenticated, ignore duplicate auth attempts
189
337
  this.logger.debug(
@@ -201,19 +349,25 @@ export class SatelliteWsHandler implements WebSocketRouteHandler {
201
349
  this.logger.info(
202
350
  `Satellite disconnected: ${closedSatellite.name} (${closedSatellite.region})`,
203
351
  );
204
- if (this.connectionHookSink) {
205
- // Fire-and-forget — `onClose` is sync, so don't await; we
206
- // don't have a place to surface a rejection anyway.
207
- void this.connectionHookSink
208
- .emitHook(this.connectionHookSink.disconnectedHook, {
352
+ if (this.connectionEntitySink) {
353
+ // Fire-and-forget — `onClose` is sync, so don't await; we don't have
354
+ // a place to surface a rejection anyway. Clear `lastHeartbeatAt`
355
+ // (`null`) so the computed status flips `offline` IMMEDIATELY on a
356
+ // clean disconnect (no waiting for the heartbeat to age out), and set
357
+ // `lastConnectionEvent = "disconnected"` so the deriver re-fires
358
+ // `satellite.disconnected`. Nulling the heartbeat coincides with the
359
+ // "never connected" representation, but `lastConnectionEvent` stays
360
+ // `"disconnected"` (non-null), so the entity still HAS state — the
361
+ // read only omits a satellite whose `lastConnectionEvent` is null.
362
+ void this.connectionEntitySink
363
+ .mirror({
209
364
  satelliteId: closedSatellite.id,
210
- name: closedSatellite.name,
211
- region: closedSatellite.region,
212
- timestamp: new Date().toISOString(),
365
+ lastEvent: "disconnected",
366
+ lastHeartbeatAt: null,
213
367
  })
214
368
  .catch((error: unknown) => {
215
369
  this.logger.error(
216
- `Failed to emit satellite.disconnected hook for ${closedSatellite.name}:`,
370
+ `Failed to mirror satellite-connection (disconnected) for ${closedSatellite.name}:`,
217
371
  error,
218
372
  );
219
373
  });
@@ -233,10 +387,14 @@ export class SatelliteWsHandler implements WebSocketRouteHandler {
233
387
 
234
388
  const assignments =
235
389
  await this.configRelay.getAssignmentsForSatellite(satelliteId);
390
+ const scriptPackagesLockfileHash = await this.resolveDesiredLockfileHash();
236
391
 
237
392
  this.sendMessage(conn.ws, {
238
393
  type: "config_updated",
239
394
  assignments,
395
+ ...(scriptPackagesLockfileHash === undefined
396
+ ? {}
397
+ : { scriptPackagesLockfileHash }),
240
398
  });
241
399
 
242
400
  this.logger.debug(
@@ -244,6 +402,41 @@ export class SatelliteWsHandler implements WebSocketRouteHandler {
244
402
  );
245
403
  }
246
404
 
405
+ /**
406
+ * Push a `refresh_script_packages` to every connected satellite. Called by
407
+ * the `script-packages.changed` broadcast handler so each core instance
408
+ * fans the refresh out to its own satellites. Best-effort liveness; the
409
+ * assignment-carried hash is the durable backstop.
410
+ */
411
+ pushRefreshScriptPackagesToAll(lockfileHash: string): void {
412
+ for (const conn of this.connections.values()) {
413
+ this.sendMessage(conn.ws, {
414
+ type: "refresh_script_packages",
415
+ lockfileHash,
416
+ });
417
+ }
418
+ this.logger.debug(
419
+ `Pushed refresh_script_packages (${lockfileHash}) to ${this.connections.size} satellite(s)`,
420
+ );
421
+ }
422
+
423
+ /**
424
+ * Resolve the desired lockfile hash for assignment payloads. Returns
425
+ * `undefined` when the sink isn't wired (so the field is omitted entirely
426
+ * for version-skew safety), or `string | null` from the sink.
427
+ */
428
+ private async resolveDesiredLockfileHash(): Promise<
429
+ string | null | undefined
430
+ > {
431
+ if (!this.scriptPackageSink) return undefined;
432
+ try {
433
+ return await this.scriptPackageSink.getDesiredLockfileHash();
434
+ } catch (error) {
435
+ this.logger.error("Failed to resolve desired lockfile hash:", error);
436
+ return undefined;
437
+ }
438
+ }
439
+
247
440
  /**
248
441
  * Send a shutdown message to a specific satellite (e.g., on token revocation).
249
442
  */
package/src/schema.ts CHANGED
@@ -19,9 +19,30 @@ export const satellites = pgTable("satellites", {
19
19
  tags: jsonb("tags").$type<Record<string, string>>().default({}).notNull(),
20
20
  /** Bcrypt hash of the satellite's API token */
21
21
  tokenHash: text("token_hash").notNull(),
22
- /** Last heartbeat timestamp — null means never connected */
22
+ /**
23
+ * Last heartbeat timestamp — null means never connected (or cleanly
24
+ * disconnected). This is the SINGLE durable liveness source of truth: the
25
+ * reactive `satellite-connection` entity's `status` and `lastSeenAt` are
26
+ * COMPUTED on read from it (via `computeStatus` / `OFFLINE_THRESHOLD_MS`), so
27
+ * the entity is globally consistent from any pod and self-heals — a stale row
28
+ * reads `offline` once this timestamp ages past the offline threshold, even
29
+ * if the pod that owned the socket crashed without writing offline.
30
+ */
23
31
  lastHeartbeatAt: timestamp("last_heartbeat_at"),
24
32
  /** Satellite version reported on connect/heartbeat */
25
33
  version: text("version"),
34
+ /**
35
+ * Which lifecycle edge produced the latest connection-status change. Preserves
36
+ * the distinction between a socket drop (`disconnected`) and the heartbeat-lost
37
+ * offline edge (`heartbeat_lost`) that a bare status diff cannot encode. This
38
+ * is the ONLY durable connection column the reactive `satellite-connection`
39
+ * entity needs beyond `lastHeartbeatAt`: the deriver reads it as `lastEvent`,
40
+ * and the heartbeat monitor uses it to make heartbeat-lost detection
41
+ * idempotent (once it is `"heartbeat_lost"`, re-runs are no-ops). Nullable: a
42
+ * satellite that never connected has no last event.
43
+ */
44
+ lastConnectionEvent: text("last_connection_event", {
45
+ enum: ["connected", "disconnected", "heartbeat_lost"],
46
+ }),
26
47
  createdAt: timestamp("created_at").defaultNow().notNull(),
27
48
  });