@checkstack/satellite-backend 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +153 -0
- package/drizzle/0001_tiresome_terror.sql +3 -0
- package/drizzle/0002_graceful_mac_gargan.sql +2 -0
- package/drizzle/meta/0001_snapshot.json +102 -0
- package/drizzle/meta/0002_snapshot.json +89 -0
- package/drizzle/meta/_journal.json +14 -0
- package/package.json +20 -13
- package/src/automations.ts +65 -24
- package/src/entity.test.ts +313 -0
- package/src/entity.ts +221 -0
- package/src/heartbeat-monitor.it.test.ts +232 -0
- package/src/heartbeat-monitor.test.ts +156 -83
- package/src/heartbeat-monitor.ts +102 -71
- package/src/hooks.ts +9 -39
- package/src/index.ts +168 -9
- package/src/run-secret-resolver.test.ts +121 -0
- package/src/run-secret-resolver.ts +66 -0
- package/src/satellite-ws-handler.test.ts +267 -0
- package/src/satellite-ws-handler.ts +242 -49
- package/src/schema.ts +22 -1
- package/src/service.test.ts +274 -0
- package/src/service.ts +133 -15
- package/src/status.ts +18 -0
- package/tsconfig.json +15 -0
- package/src/automations.test.ts +0 -54
|
@@ -0,0 +1,232 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Integration test (real Postgres): cross-pod heartbeat-lost detection.
|
|
3
|
+
*
|
|
4
|
+
* This is the DETERMINISTIC backstop for `.agent/rules/state-and-scale.md` that
|
|
5
|
+
* the single-process unit suite structurally cannot provide for the heartbeat
|
|
6
|
+
* monitor. The prior fix made connection STATUS durable but left the
|
|
7
|
+
* online→offline transition DETECTION pod-local (an in-memory `previousStatuses`
|
|
8
|
+
* map): under N pods, the heartbeat-check job runs on a VARYING pod, so a pod
|
|
9
|
+
* with an empty map never observed the satellite online and therefore never
|
|
10
|
+
* fired the `heartbeat_lost` edge — leaving `connectionStatus` stuck `online`
|
|
11
|
+
* forever after a pod crash. This test proves that bug cannot recur.
|
|
12
|
+
*
|
|
13
|
+
* ## Two-pod model (faithful proxy)
|
|
14
|
+
*
|
|
15
|
+
* We model TWO independent "pods" as two independent `SatelliteService` +
|
|
16
|
+
* `HeartbeatMonitor` instances, EACH over its OWN `pg.Pool`, BOTH pointed at the
|
|
17
|
+
* SAME Postgres database + schema (mirroring the automation-backend cross-pod
|
|
18
|
+
* read-consistency IT). Separate pools = separate processes for the property
|
|
19
|
+
* under test (no shared JS heap, so no shared in-memory baseline), one DB =
|
|
20
|
+
* the shared durable substrate N pods share in production.
|
|
21
|
+
*
|
|
22
|
+
* - pod A — the pod that handled the satellite's WS connection.
|
|
23
|
+
* - pod B — a DIFFERENT pod that later claims the heartbeat-check job and has
|
|
24
|
+
* NEVER seen this satellite online in memory.
|
|
25
|
+
*
|
|
26
|
+
* ## What it asserts
|
|
27
|
+
*
|
|
28
|
+
* 1. Pod A connects the satellite (durable write: lastHeartbeatAt=now,
|
|
29
|
+
* lastConnectionEvent="connected"). Pod B's entity read sees it ONLINE.
|
|
30
|
+
* 2. The satellite's heartbeat ages out (we backdate lastHeartbeatAt past the
|
|
31
|
+
* offline threshold — a crashed pod that stopped heartbeating). Pod B's
|
|
32
|
+
* read now self-heals to OFFLINE purely from compute-on-read.
|
|
33
|
+
* 3. Pod B's heartbeat monitor — fresh heap, no prior in-memory knowledge —
|
|
34
|
+
* DETECTS the heartbeat_lost edge from durable state and drives the entity
|
|
35
|
+
* mutate (flips lastConnectionEvent="heartbeat_lost").
|
|
36
|
+
* 4. Re-running the monitor on EITHER pod is a no-op (idempotent across pods +
|
|
37
|
+
* redelivery).
|
|
38
|
+
*
|
|
39
|
+
* Gated behind `CHECKSTACK_IT=1`; connection from `CHECKSTACK_IT_PG_URL`. Each
|
|
40
|
+
* run isolates itself in a freshly created Postgres schema and cleans up.
|
|
41
|
+
*/
|
|
42
|
+
import { afterAll, beforeAll, describe, expect, it } from "bun:test";
|
|
43
|
+
import { drizzle } from "drizzle-orm/node-postgres";
|
|
44
|
+
import { Pool } from "pg";
|
|
45
|
+
|
|
46
|
+
import type { SafeDatabase } from "@checkstack/backend-api";
|
|
47
|
+
import {
|
|
48
|
+
createMockLogger,
|
|
49
|
+
createMockSignalService,
|
|
50
|
+
} from "@checkstack/test-utils-backend";
|
|
51
|
+
|
|
52
|
+
import * as schema from "./schema";
|
|
53
|
+
import { SatelliteService } from "./service";
|
|
54
|
+
import { HeartbeatMonitor } from "./heartbeat-monitor";
|
|
55
|
+
import { OFFLINE_THRESHOLD_MS } from "@checkstack/satellite-common";
|
|
56
|
+
|
|
57
|
+
/**
 * Postgres connection string for the integration run: the value of
 * `CHECKSTACK_IT_PG_URL` when it is set, otherwise a local default.
 */
const PG_URL = process.env.CHECKSTACK_IT_PG_URL ?? "postgres://postgres:postgres@localhost:5432/postgres";

/**
 * Schema name for this run: a fixed prefix followed by a random UUID with its
 * dashes stripped, so the name differs on every run.
 */
const SCHEMA = "it_sat_heartbeat_" + crypto.randomUUID().split("-").join("");
|
|
62
|
+
|
|
63
|
+
/**
 * One simulated pod: an independent `SatelliteService` + `HeartbeatMonitor` over
 * its OWN pool to the shared DB. The monitor's entity sink performs the REAL
 * durable write via this pod's service (no framework handle needed — the
 * property under test is whether DETECTION reads durable state, not the
 * transition-log append).
 */
interface Pod {
  readonly pool: Pool;
  readonly service: SatelliteService;
  readonly monitor: HeartbeatMonitor;
  /**
   * Satellite ids this pod's monitor passed to its entity sink, in call order.
   * Lets a test tell "the monitor did nothing" apart from "the monitor wrote
   * the same value again", which the durable row alone cannot show.
   */
  readonly mirrored: readonly string[];
  /** Close this pod's connection pool. */
  end(): Promise<void>;
}

describe.skipIf(!process.env.CHECKSTACK_IT)(
  "cross-pod heartbeat-lost detection (real Postgres)",
  () => {
    // Every pod created for the suite, so afterAll can close all of them.
    const pods: Pod[] = [];

    /**
     * Build one simulated pod: its own pool (search_path pinned to the per-run
     * schema), its own service, and its own monitor.
     */
    function makePod(): Pod {
      const pool = new Pool({
        connectionString: PG_URL,
        options: `-c search_path=${SCHEMA}`,
      });
      const db = drizzle({
        client: pool,
        schema,
      }) as unknown as SafeDatabase<typeof schema>;
      const service = new SatelliteService(db);
      const logger = createMockLogger();
      const signalService = createMockSignalService();
      const mirrored: string[] = [];

      // The monitor's entity sink: the REAL durable heartbeat_lost write through
      // this pod's service. This is exactly what `index.ts` wires (minus the
      // framework `handle.mutate` wrapper, which is out of scope for the
      // cross-pod DETECTION property). Each invocation is also recorded so the
      // test can assert how many times the edge was fired.
      const monitor = new HeartbeatMonitor(service, signalService, logger, {
        mirror: async (satelliteId) => {
          mirrored.push(satelliteId);
          await service.applyConnectionState({
            satelliteId,
            lastEvent: "heartbeat_lost",
          });
        },
      });

      return { pool, service, monitor, mirrored, end: () => pool.end() };
    }

    let podA: Pod;
    let podB: Pod;

    beforeAll(async () => {
      // A throwaway pool without search_path: it only creates the schema + table.
      const setupPool = new Pool({ connectionString: PG_URL });
      try {
        await setupPool.query(`CREATE SCHEMA IF NOT EXISTS "${SCHEMA}"`);
        // The satellites table the service reads/writes. Only the columns the
        // service touches are needed.
        await setupPool.query(`
          CREATE TABLE "${SCHEMA}".satellites (
            id uuid PRIMARY KEY DEFAULT gen_random_uuid(),
            name text NOT NULL,
            region text NOT NULL,
            tags jsonb NOT NULL DEFAULT '{}',
            token_hash text NOT NULL,
            last_heartbeat_at timestamp,
            version text,
            last_connection_event text,
            created_at timestamp NOT NULL DEFAULT now()
          )
        `);
      } finally {
        await setupPool.end();
      }

      podA = makePod();
      podB = makePod();
      pods.push(podA, podB);
    });

    afterAll(async () => {
      // Close every pod pool, but do not let one failed close prevent the
      // schema from being dropped; the failure is re-thrown afterwards.
      const closeResults = await Promise.allSettled(
        pods.map((pod) => pod.end()),
      );

      const cleanupPool = new Pool({ connectionString: PG_URL });
      try {
        await cleanupPool.query(`DROP SCHEMA IF EXISTS "${SCHEMA}" CASCADE`);
      } finally {
        await cleanupPool.end();
      }

      const failedClose = closeResults.find(
        (result): result is PromiseRejectedResult =>
          result.status === "rejected",
      );
      if (failedClose) {
        throw failedClose.reason;
      }
    });

    /**
     * Insert a satellite row through pod A's pool.
     *
     * @returns the generated id of the new row.
     * @throws if the INSERT did not return a row.
     */
    async function insertSatellite(): Promise<string> {
      const { rows } = await podA.pool.query<{ id: string }>(
        `INSERT INTO "${SCHEMA}".satellites (name, region, token_hash)
         VALUES ('eu-west', 'eu-west-1', 'hash') RETURNING id`,
      );
      const inserted = rows[0];
      if (!inserted) {
        throw new Error("INSERT INTO satellites returned no row");
      }
      return inserted.id;
    }

    /**
     * Read the durable connection columns of one satellite with raw SQL through
     * the given pod's pool (any pod).
     *
     * @throws if no satellite with that id exists.
     */
    async function readRow(
      pod: Pod,
      id: string,
    ): Promise<{ lastHeartbeatAt: Date | null; lastConnectionEvent: string | null }> {
      const { rows } = await pod.pool.query<{
        last_heartbeat_at: Date | null;
        last_connection_event: string | null;
      }>(
        `SELECT last_heartbeat_at, last_connection_event
         FROM "${SCHEMA}".satellites WHERE id = $1`,
        [id],
      );
      const row = rows[0];
      if (!row) {
        throw new Error(`satellite ${id} not found`);
      }
      return {
        lastHeartbeatAt: row.last_heartbeat_at,
        lastConnectionEvent: row.last_connection_event,
      };
    }

    it("pod B detects heartbeat_lost for a satellite it never saw online, idempotently", async () => {
      const id = await insertSatellite();

      // 1. Pod A connects the satellite: durable write of the connected edge.
      await podA.service.applyConnectionState({
        satelliteId: id,
        lastEvent: "connected",
        lastHeartbeatAt: new Date(),
      });

      // Pod B (different pool/heap) reads it ONLINE via compute-on-read.
      const onlineB = await podB.service.getManyConnectionStates([id]);
      expect(onlineB[id]).toBeDefined();
      expect(onlineB[id]!.status).toBe("online");
      expect(onlineB[id]!.lastEvent).toBe("connected");

      // 2. The owning pod crashes / stops heartbeating: backdate the heartbeat
      //    past the offline threshold (simulating elapsed wall-clock time).
      const aged = new Date(Date.now() - OFFLINE_THRESHOLD_MS - 60_000);
      await podA.pool.query(
        `UPDATE "${SCHEMA}".satellites SET last_heartbeat_at = $1 WHERE id = $2`,
        [aged, id],
      );

      // Pod B's read self-heals to OFFLINE from durable state alone — no pod
      // ever wrote "offline"; the status is computed.
      const offlineB = await podB.service.getManyConnectionStates([id]);
      expect(offlineB[id]!.status).toBe("offline");
      // ...but the last recorded edge is still "connected" — the heartbeat_lost
      // edge has NOT been fired yet (this is the bug surface: under the old
      // pod-local design it would NEVER fire on a fresh pod).
      expect(offlineB[id]!.lastEvent).toBe("connected");

      // 3. Pod B's monitor — fresh heap, never saw this satellite online —
      //    detects the edge from DURABLE state and drives the mutate.
      await podB.monitor.checkHeartbeats();

      const afterDetect = await readRow(podB, id);
      expect(afterDetect.lastConnectionEvent).toBe("heartbeat_lost");
      // The edge was fired exactly once, by pod B; pod A's monitor has not run.
      expect(podB.mirrored).toEqual([id]);
      expect(podA.mirrored).toEqual([]);
      // The entity read now reports the heartbeat_lost edge globally (pod A too).
      const lostA = await podA.service.getManyConnectionStates([id]);
      expect(lostA[id]!.lastEvent).toBe("heartbeat_lost");
      expect(lostA[id]!.status).toBe("offline");

      // 4. Idempotent: re-running on pod B AND on pod A is a no-op (the durable
      //    lastConnectionEvent="heartbeat_lost" makes the predicate false).
      await podB.monitor.checkHeartbeats();
      await podA.monitor.checkHeartbeats();
      const afterReRun = await readRow(podA, id);
      expect(afterReRun.lastConnectionEvent).toBe("heartbeat_lost");
      // The row alone cannot prove a no-op: a repeated mirror would rewrite the
      // same "heartbeat_lost" value. The sink call logs can — neither re-run
      // may have invoked the sink again.
      expect(podB.mirrored).toEqual([id]);
      expect(podA.mirrored).toEqual([]);
    });
  },
);
|
|
@@ -1,21 +1,56 @@
|
|
|
1
1
|
import { describe, it, expect, mock, beforeEach } from "bun:test";
|
|
2
|
-
import {
|
|
2
|
+
import {
|
|
3
|
+
HeartbeatMonitor,
|
|
4
|
+
type SatelliteHeartbeatEntitySink,
|
|
5
|
+
} from "./heartbeat-monitor";
|
|
3
6
|
import {
|
|
4
7
|
createMockLogger,
|
|
5
8
|
createMockSignalService,
|
|
6
9
|
type MockSignalService,
|
|
7
10
|
} from "@checkstack/test-utils-backend";
|
|
8
|
-
import {
|
|
11
|
+
import {
|
|
12
|
+
SATELLITE_STATUS_CHANGED,
|
|
13
|
+
OFFLINE_THRESHOLD_MS,
|
|
14
|
+
} from "@checkstack/satellite-common";
|
|
9
15
|
import type { SatelliteService } from "./service";
|
|
10
|
-
import type {
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
16
|
+
import type { SatelliteConnectionEvent } from "./entity";
|
|
17
|
+
|
|
18
|
+
/**
 * Fixture shape for one row handed back by the mocked
 * `listConnectionLiveness` in these tests.
 */
interface LivenessRow {
  id: string;
  name: string;
  region: string;
  /** `null` in fixtures for satellites with no heartbeat on record. */
  lastHeartbeatAt: Date | null;
  /** Last recorded connection edge; `null` in the never-connected fixture. */
  lastConnectionEvent: SatelliteConnectionEvent | null;
}
|
|
25
|
+
|
|
26
|
+
/**
 * Build a recording entity sink for a test.
 *
 * @returns `sink`, whose mocked `mirror` appends each id it receives, and
 *   `mirrors`, the live array of those ids in call order.
 */
function makeEntitySink(): {
  sink: SatelliteHeartbeatEntitySink;
  mirrors: string[];
} {
  const recordedIds: string[] = [];
  const mirror = mock(async (id: string) => {
    recordedIds.push(id);
  });
  return { sink: { mirror }, mirrors: recordedIds };
}
|
|
40
|
+
|
|
41
|
+
/**
 * Stand-in for the service's durable liveness read. Only
 * `listConnectionLiveness` is provided, and it resolves to the very array
 * passed in (not a copy), so a test can change that array between
 * `checkHeartbeats` calls — e.g. set `lastConnectionEvent` to "heartbeat_lost"
 * to imitate the durable write having landed.
 */
const createMockSatelliteService = (rows: LivenessRow[]): SatelliteService => {
  const listConnectionLiveness = mock(async () => rows);
  return { listConnectionLiveness } as unknown as SatelliteService;
};
|
|
18
50
|
|
|
51
|
+
/** Heartbeat timestamp five seconds before now (the "recent heartbeat" fixture). */
const recentHeartbeat = (): Date => {
  const fiveSecondsAgo = Date.now() - 5_000;
  return new Date(fiveSecondsAgo);
};

/** Heartbeat timestamp ten seconds older than `OFFLINE_THRESHOLD_MS` allows. */
const agedHeartbeat = (): Date => {
  const beyondThreshold = Date.now() - OFFLINE_THRESHOLD_MS - 10_000;
  return new Date(beyondThreshold);
};
|
|
53
|
+
|
|
19
54
|
describe("HeartbeatMonitor", () => {
|
|
20
55
|
let signalService: MockSignalService;
|
|
21
56
|
let logger: ReturnType<typeof createMockLogger>;
|
|
@@ -25,49 +60,45 @@ describe("HeartbeatMonitor", () => {
|
|
|
25
60
|
logger = createMockLogger();
|
|
26
61
|
});
|
|
27
62
|
|
|
28
|
-
it("
|
|
63
|
+
it("does NOT detect a lost edge for an online (recent heartbeat) satellite", async () => {
|
|
29
64
|
const service = createMockSatelliteService([
|
|
30
65
|
{
|
|
31
66
|
id: "sat-1",
|
|
32
67
|
name: "eu-west",
|
|
33
68
|
region: "eu-west-1",
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
createdAt: new Date(),
|
|
69
|
+
lastHeartbeatAt: recentHeartbeat(),
|
|
70
|
+
lastConnectionEvent: "connected",
|
|
37
71
|
},
|
|
38
72
|
]);
|
|
73
|
+
const { sink, mirrors } = makeEntitySink();
|
|
74
|
+
const monitor = new HeartbeatMonitor(service, signalService, logger, sink);
|
|
39
75
|
|
|
40
|
-
const monitor = new HeartbeatMonitor(service, signalService, logger);
|
|
41
76
|
await monitor.checkHeartbeats();
|
|
42
77
|
|
|
43
|
-
|
|
78
|
+
expect(mirrors).toHaveLength(0);
|
|
44
79
|
expect(signalService.getRecordedSignals()).toHaveLength(0);
|
|
45
80
|
});
|
|
46
81
|
|
|
47
|
-
it("
|
|
48
|
-
|
|
82
|
+
it("detects heartbeat_lost from DURABLE state with NO prior in-memory knowledge", async () => {
|
|
83
|
+
// The core horizontal-scale property: a fresh monitor (empty heap, as on a
|
|
84
|
+
// pod that never saw this satellite online) still detects the online→offline
|
|
85
|
+
// edge purely from the durable (agedHeartbeat + lastConnectionEvent:
|
|
86
|
+
// "connected") row, mirrors heartbeat_lost, and broadcasts.
|
|
87
|
+
const service = createMockSatelliteService([
|
|
49
88
|
{
|
|
50
89
|
id: "sat-1",
|
|
51
90
|
name: "eu-west",
|
|
52
91
|
region: "eu-west-1",
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
createdAt: new Date(),
|
|
92
|
+
lastHeartbeatAt: agedHeartbeat(),
|
|
93
|
+
lastConnectionEvent: "connected",
|
|
56
94
|
},
|
|
57
|
-
];
|
|
58
|
-
const
|
|
59
|
-
const monitor = new HeartbeatMonitor(service, signalService, logger);
|
|
60
|
-
|
|
61
|
-
// First check: initialize state
|
|
62
|
-
await monitor.checkHeartbeats();
|
|
95
|
+
]);
|
|
96
|
+
const { sink, mirrors } = makeEntitySink();
|
|
97
|
+
const monitor = new HeartbeatMonitor(service, signalService, logger, sink);
|
|
63
98
|
|
|
64
|
-
// Simulate satellite going offline
|
|
65
|
-
satellites[0] = { ...satellites[0], status: "offline" };
|
|
66
99
|
await monitor.checkHeartbeats();
|
|
67
100
|
|
|
68
|
-
expect(
|
|
69
|
-
signalService.wasSignalEmitted(SATELLITE_STATUS_CHANGED.id),
|
|
70
|
-
).toBe(true);
|
|
101
|
+
expect(mirrors).toEqual(["sat-1"]);
|
|
71
102
|
|
|
72
103
|
const recorded = signalService.getRecordedSignalsById(
|
|
73
104
|
SATELLITE_STATUS_CHANGED.id,
|
|
@@ -81,95 +112,137 @@ describe("HeartbeatMonitor", () => {
|
|
|
81
112
|
});
|
|
82
113
|
});
|
|
83
114
|
|
|
84
|
-
it("
|
|
85
|
-
const
|
|
115
|
+
it("is idempotent: re-running after the durable flip is a no-op (any pod, redelivery)", async () => {
|
|
116
|
+
const rows: LivenessRow[] = [
|
|
86
117
|
{
|
|
87
118
|
id: "sat-1",
|
|
88
119
|
name: "eu-west",
|
|
89
120
|
region: "eu-west-1",
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
createdAt: new Date(),
|
|
121
|
+
lastHeartbeatAt: agedHeartbeat(),
|
|
122
|
+
lastConnectionEvent: "connected",
|
|
93
123
|
},
|
|
94
124
|
];
|
|
95
|
-
const service = createMockSatelliteService(
|
|
96
|
-
const
|
|
125
|
+
const service = createMockSatelliteService(rows);
|
|
126
|
+
const { sink, mirrors } = makeEntitySink();
|
|
127
|
+
const monitor = new HeartbeatMonitor(service, signalService, logger, sink);
|
|
97
128
|
|
|
98
|
-
// First check
|
|
129
|
+
// First check detects + mirrors the edge.
|
|
99
130
|
await monitor.checkHeartbeats();
|
|
131
|
+
expect(mirrors).toEqual(["sat-1"]);
|
|
100
132
|
|
|
101
|
-
// Simulate
|
|
102
|
-
|
|
133
|
+
// Simulate the durable write the mirror performed: lastConnectionEvent is
|
|
134
|
+
// now "heartbeat_lost". A re-run (same pod OR another pod, since the
|
|
135
|
+
// predicate is over durable state) detects nothing more.
|
|
136
|
+
rows[0] = { ...rows[0], lastConnectionEvent: "heartbeat_lost" };
|
|
103
137
|
await monitor.checkHeartbeats();
|
|
138
|
+
expect(mirrors).toEqual(["sat-1"]); // still just one
|
|
104
139
|
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
);
|
|
108
|
-
|
|
109
|
-
expect(
|
|
110
|
-
satelliteId: "sat-1",
|
|
111
|
-
status: "online",
|
|
112
|
-
name: "eu-west",
|
|
113
|
-
region: "eu-west-1",
|
|
114
|
-
});
|
|
140
|
+
// A SECOND pod with a fresh (empty) monitor likewise sees a no-op.
|
|
141
|
+
const { sink: sink2, mirrors: mirrors2 } = makeEntitySink();
|
|
142
|
+
const monitor2 = new HeartbeatMonitor(service, signalService, logger, sink2);
|
|
143
|
+
await monitor2.checkHeartbeats();
|
|
144
|
+
expect(mirrors2).toHaveLength(0);
|
|
115
145
|
});
|
|
116
146
|
|
|
117
|
-
it("
|
|
118
|
-
|
|
147
|
+
it("does NOT mirror the offline→online edge (the WS handler owns reconnect)", async () => {
|
|
148
|
+
// A satellite that reconnected: recent heartbeat, lastConnectionEvent already
|
|
149
|
+
// "connected". Nothing for the monitor to do — and crucially it must not
|
|
150
|
+
// treat a heartbeat_lost→connected transition as a lost edge.
|
|
151
|
+
const service = createMockSatelliteService([
|
|
119
152
|
{
|
|
120
153
|
id: "sat-1",
|
|
121
154
|
name: "eu-west",
|
|
122
155
|
region: "eu-west-1",
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
createdAt: new Date(),
|
|
156
|
+
lastHeartbeatAt: recentHeartbeat(),
|
|
157
|
+
lastConnectionEvent: "connected",
|
|
126
158
|
},
|
|
127
|
-
];
|
|
128
|
-
const
|
|
129
|
-
const monitor = new HeartbeatMonitor(service, signalService, logger);
|
|
159
|
+
]);
|
|
160
|
+
const { sink, mirrors } = makeEntitySink();
|
|
161
|
+
const monitor = new HeartbeatMonitor(service, signalService, logger, sink);
|
|
130
162
|
|
|
131
|
-
// First check: initialize
|
|
132
163
|
await monitor.checkHeartbeats();
|
|
133
|
-
|
|
164
|
+
expect(mirrors).toHaveLength(0);
|
|
165
|
+
});
|
|
166
|
+
|
|
167
|
+
it("does NOT detect a lost edge for a cleanly-disconnected satellite", async () => {
|
|
168
|
+
// Clean disconnect already set lastConnectionEvent="disconnected" and nulled
|
|
169
|
+
// the heartbeat: status is offline, but the edge was already recorded, so the
|
|
170
|
+
// monitor must not re-fire heartbeat_lost.
|
|
171
|
+
const service = createMockSatelliteService([
|
|
172
|
+
{
|
|
173
|
+
id: "sat-1",
|
|
174
|
+
name: "eu-west",
|
|
175
|
+
region: "eu-west-1",
|
|
176
|
+
lastHeartbeatAt: null,
|
|
177
|
+
lastConnectionEvent: "disconnected",
|
|
178
|
+
},
|
|
179
|
+
]);
|
|
180
|
+
const { sink, mirrors } = makeEntitySink();
|
|
181
|
+
const monitor = new HeartbeatMonitor(service, signalService, logger, sink);
|
|
182
|
+
|
|
134
183
|
await monitor.checkHeartbeats();
|
|
184
|
+
expect(mirrors).toHaveLength(0);
|
|
185
|
+
expect(signalService.getRecordedSignals()).toHaveLength(0);
|
|
186
|
+
});
|
|
187
|
+
|
|
188
|
+
it("does NOT detect a lost edge for a never-connected satellite", async () => {
|
|
189
|
+
const service = createMockSatelliteService([
|
|
190
|
+
{
|
|
191
|
+
id: "sat-1",
|
|
192
|
+
name: "eu-west",
|
|
193
|
+
region: "eu-west-1",
|
|
194
|
+
lastHeartbeatAt: null,
|
|
195
|
+
lastConnectionEvent: null,
|
|
196
|
+
},
|
|
197
|
+
]);
|
|
198
|
+
const { sink, mirrors } = makeEntitySink();
|
|
199
|
+
const monitor = new HeartbeatMonitor(service, signalService, logger, sink);
|
|
135
200
|
|
|
201
|
+
await monitor.checkHeartbeats();
|
|
202
|
+
expect(mirrors).toHaveLength(0);
|
|
136
203
|
expect(signalService.getRecordedSignals()).toHaveLength(0);
|
|
137
204
|
});
|
|
138
205
|
|
|
139
|
-
it("
|
|
140
|
-
const
|
|
206
|
+
it("broadcasts the offline edge only once per pod across back-to-back checks", async () => {
|
|
207
|
+
const rows: LivenessRow[] = [
|
|
141
208
|
{
|
|
142
209
|
id: "sat-1",
|
|
143
210
|
name: "eu-west",
|
|
144
211
|
region: "eu-west-1",
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
createdAt: new Date(),
|
|
212
|
+
lastHeartbeatAt: agedHeartbeat(),
|
|
213
|
+
lastConnectionEvent: "connected",
|
|
148
214
|
},
|
|
149
215
|
];
|
|
150
|
-
const service = createMockSatelliteService(
|
|
151
|
-
const
|
|
216
|
+
const service = createMockSatelliteService(rows);
|
|
217
|
+
const { sink } = makeEntitySink();
|
|
218
|
+
const monitor = new HeartbeatMonitor(service, signalService, logger, sink);
|
|
152
219
|
|
|
153
|
-
//
|
|
220
|
+
// Two checks BEFORE the durable flip lands (e.g. mirror failed once). The
|
|
221
|
+
// broadcast-dedup set suppresses a second broadcast from THIS pod.
|
|
154
222
|
await monitor.checkHeartbeats();
|
|
155
|
-
|
|
156
|
-
// Satellite removed (empty list returned)
|
|
157
|
-
satellites.length = 0;
|
|
158
223
|
await monitor.checkHeartbeats();
|
|
159
224
|
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
225
|
+
expect(
|
|
226
|
+
signalService.getRecordedSignalsById(SATELLITE_STATUS_CHANGED.id),
|
|
227
|
+
).toHaveLength(1);
|
|
228
|
+
});
|
|
229
|
+
|
|
230
|
+
it("works without an entity sink (broadcast-only)", async () => {
|
|
231
|
+
const service = createMockSatelliteService([
|
|
232
|
+
{
|
|
233
|
+
id: "sat-1",
|
|
234
|
+
name: "eu-west",
|
|
235
|
+
region: "eu-west-1",
|
|
236
|
+
lastHeartbeatAt: agedHeartbeat(),
|
|
237
|
+
lastConnectionEvent: "connected",
|
|
238
|
+
},
|
|
239
|
+
]);
|
|
240
|
+
const monitor = new HeartbeatMonitor(service, signalService, logger);
|
|
241
|
+
|
|
170
242
|
await monitor.checkHeartbeats();
|
|
171
243
|
|
|
172
|
-
|
|
173
|
-
|
|
244
|
+
expect(
|
|
245
|
+
signalService.wasSignalEmitted(SATELLITE_STATUS_CHANGED.id),
|
|
246
|
+
).toBe(true);
|
|
174
247
|
});
|
|
175
248
|
});
|