@checkstack/incident-backend 1.2.0 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +242 -0
- package/package.json +18 -16
- package/src/automations.test.ts +523 -0
- package/src/automations.ts +601 -0
- package/src/hooks.ts +9 -45
- package/src/incident-entity.test.ts +266 -0
- package/src/incident-entity.ts +192 -0
- package/src/index.ts +110 -76
- package/src/router.ts +162 -98
- package/src/service.test.ts +199 -0
- package/src/service.ts +147 -3
- package/tsconfig.json +6 -0
package/src/service.test.ts
CHANGED
|
@@ -1,5 +1,11 @@
|
|
|
1
1
|
import { describe, it, expect, mock, beforeEach } from "bun:test";
|
|
2
2
|
import { IncidentService } from "./service";
|
|
3
|
+
import {
|
|
4
|
+
incidents,
|
|
5
|
+
incidentSystems,
|
|
6
|
+
incidentUpdates,
|
|
7
|
+
incidentLinks,
|
|
8
|
+
} from "./schema";
|
|
3
9
|
|
|
4
10
|
/**
|
|
5
11
|
* Programmable mock DB that records each `select(...).from(...).where(...)`
|
|
@@ -124,3 +130,196 @@ describe("IncidentService.hasActiveIncidentWithSuppression", () => {
|
|
|
124
130
|
expect(dbHelper.getCallCount()).toBe(1);
|
|
125
131
|
});
|
|
126
132
|
});
|
|
133
|
+
|
|
134
|
+
describe("IncidentService.getManyEntityStates (plugin-backed entity read)", () => {
  /**
   * Wires an IncidentService to the scripted select-only mock DB.
   * `scripted` is forwarded untouched to `createProgrammableSelectDb`; the
   * fixtures below supply one row set per expected query, in call order.
   */
  const setup = (scripted: Parameters<typeof createProgrammableSelectDb>[0]) => {
    const helper = createProgrammableSelectDb(scripted);
    const service = new IncidentService(helper.db as never);
    return { helper, service };
  };

  it("returns {} for an empty id set without querying", async () => {
    const { helper, service } = setup([]);

    const states = await service.getManyEntityStates([]);

    expect(states).toEqual({});
    // Short-circuit path: the DB must not have been touched.
    expect(helper.getCallCount()).toBe(0);
  });

  it("projects { status, severity, systemIds } from incidents + junction", async () => {
    const { service } = setup([
      // Query #1 — `incidents` rows for the requested ids.
      [
        { id: "inc-1", status: "investigating", severity: "major" },
        { id: "inc-2", status: "resolved", severity: "minor" },
      ],
      // Query #2 — `incident_systems` junction rows for the ids that exist.
      [
        { incidentId: "inc-1", systemId: "sys-a" },
        { incidentId: "inc-1", systemId: "sys-b" },
        { incidentId: "inc-2", systemId: "sys-c" },
      ],
    ]);

    const states = await service.getManyEntityStates(["inc-1", "inc-2", "inc-x"]);

    expect(states).toEqual({
      "inc-1": {
        status: "investigating",
        severity: "major",
        systemIds: ["sys-a", "sys-b"],
      },
      "inc-2": { status: "resolved", severity: "minor", systemIds: ["sys-c"] },
    });
    // Unknown ids are dropped entirely rather than mapped to null/undefined.
    expect("inc-x" in states).toBe(false);
  });

  it("returns {} when none of the ids exist (no junction query)", async () => {
    const { helper, service } = setup([
      // Empty `incidents` result → the junction query must be skipped.
      [],
    ]);

    const states = await service.getManyEntityStates(["ghost"]);

    expect(states).toEqual({});
    expect(helper.getCallCount()).toBe(1);
  });

  it("yields an empty systemIds array for an incident with no systems", async () => {
    const { service } = setup([
      [{ id: "inc-1", status: "monitoring", severity: "critical" }],
      [], // junction table has no rows for this incident
    ]);

    const states = await service.getManyEntityStates(["inc-1"]);

    expect(states["inc-1"]).toEqual({
      status: "monitoring",
      severity: "critical",
      systemIds: [],
    });
  });
});
|
|
194
|
+
|
|
195
|
+
/**
 * Table-backed fake `db` for the dedup-create path. Models just enough of
 * the Drizzle surface the service touches (select/insert by TABLE IDENTITY,
 * `.from`/`.where`/`.limit`, and a lock-aware `transaction`).
 *
 * Crucially `transaction(fn)` models `pg_advisory_xact_lock`: the lock is
 * taken when `tx.execute(...)` runs SQL carrying the lock key, callers that
 * use the SAME key are serialized, and the lock is released when the
 * transaction callback settles. Concurrent dedup-creates for one system
 * therefore run their find-then-create one-at-a-time — exactly the
 * guarantee M3 needs — while transactions on other keys (or transactions
 * that never take the lock) are not blocked.
 *
 * Because the test confines data to a single system, the (ignored) WHERE
 * clauses don't change which rows match.
 *
 * @returns `db` (typed `unknown`; cast at the call site) and the backing
 *   `store` so tests can assert on the rows that were inserted.
 */
function createDedupFakeDb() {
  const store = {
    incidents: [] as Array<Record<string, unknown>>,
    incidentSystems: [] as Array<Record<string, unknown>>,
    incidentUpdates: [] as Array<Record<string, unknown>>,
    incidentLinks: [] as Array<Record<string, unknown>>,
  };

  // Resolve a schema table object to its bucket in `store` by identity.
  const tableKey = (
    table: unknown,
  ): keyof typeof store | undefined => {
    if (table === incidents) return "incidents";
    if (table === incidentSystems) return "incidentSystems";
    if (table === incidentUpdates) return "incidentUpdates";
    if (table === incidentLinks) return "incidentLinks";
    return undefined;
  };

  // Per-key serialization (the xact-lock model): for each lock key, the
  // promise that settles once every holder queued so far has released.
  const tails = new Map<string, Promise<unknown>>();

  function buildSelect() {
    return (projection?: Record<string, unknown>) => {
      // Keep only the projected column names; no projection → whole rows.
      const project = (
        list: Array<Record<string, unknown>>,
      ): Array<Record<string, unknown>> => {
        if (!projection) return list;
        const keys = Object.keys(projection);
        return list.map((r) => {
          const out: Record<string, unknown> = {};
          for (const k of keys) out[k] = r[k];
          return out;
        });
      };
      const from = (table: unknown) => {
        const key = tableKey(table);
        // Snapshot at `.from()` time; unknown tables read as empty.
        const rows = key ? project([...store[key]]) : [];
        const limit = (n: number) => Promise.resolve(rows.slice(0, n));
        // `.where()` ignores its predicate: every row "matches".
        const where = () =>
          Object.assign(Promise.resolve(rows), { limit });
        return Object.assign(Promise.resolve(rows), { where, limit });
      };
      return { from };
    };
  }

  function buildInsert() {
    return (table: unknown) => ({
      values: (vals: Record<string, unknown>) => {
        const key = tableKey(table);
        // Store a shallow copy so later caller mutations don't leak in.
        if (key) store[key].push({ ...vals });
        return Promise.resolve();
      },
    });
  }

  const db = {
    select: buildSelect(),
    insert: buildInsert(),
    async transaction<T>(fn: (tx: unknown) => Promise<T>): Promise<T> {
      // Locks held by THIS transaction: key → release callback. Tracking
      // them makes a repeated acquire of the same key re-entrant instead of
      // a self-deadlock.
      const held = new Map<string, () => void>();
      const tx = {
        execute: async (sqlObj: unknown) => {
          // Drizzle sql`` carries the interpolated key in its params; the
          // helper interpolates exactly one param (the lock key).
          const params = (sqlObj as { queryChunks?: unknown[] }).queryChunks;
          const found = JSON.stringify(params ?? sqlObj).match(
            /incident\.dedupe-open-for-system:[^"\\]+/,
          );
          const lockKey = found ? found[0] : "default";
          if (!held.has(lockKey)) {
            // Queue behind the current tail for this key, then block until
            // every earlier holder has released. The key is only known
            // here, so the acquire must happen here — not before `fn` runs.
            const prior = tails.get(lockKey) ?? Promise.resolve();
            let release!: () => void;
            const mine = new Promise<void>((r) => (release = r));
            tails.set(
              lockKey,
              prior.then(() => mine),
            );
            held.set(lockKey, release);
            await prior;
          }
          return { rows: [] };
        },
      };
      try {
        return await fn(tx);
      } finally {
        // Transaction end (commit or rollback) releases every lock it took.
        for (const release of held.values()) release();
      }
    },
  };

  return { db: db as unknown, store };
}
|
|
299
|
+
|
|
300
|
+
describe("IncidentService.createIncidentDedupedForSystem (M3)", () => {
  it("two concurrent dedupe creates for one system open exactly ONE incident", async () => {
    const fake = createDedupFakeDb();
    const service = new IncidentService(fake.db as never);

    const systemId = "sys-1";
    const input = {
      title: "Down",
      severity: "critical" as const,
      systemIds: [systemId],
      suppressNotifications: false,
    };

    // Two triggers (think: sustained + flapping) race for the same system.
    // If the per-system lock were missing, each would see "no open incident"
    // and each would insert its own.
    const [first, second] = await Promise.all([
      service.createIncidentDedupedForSystem(input, systemId),
      service.createIncidentDedupedForSystem(input, systemId),
    ]);

    // Only a single incident row may exist afterwards.
    expect(fake.store.incidents).toHaveLength(1);
    // Both callers end up holding the same incident ...
    expect(first.incident.id).toBe(second.incident.id);
    // ... and exactly one of them reports it as reused rather than created.
    const reusedOutcomes = [first, second].filter((outcome) => outcome.reused);
    expect(reusedOutcomes).toHaveLength(1);
  });
});
|
package/src/service.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { eq, and, inArray, ne } from "drizzle-orm";
|
|
2
|
-
import type
|
|
2
|
+
import { withXactLock, type SafeDatabase } from "@checkstack/backend-api";
|
|
3
3
|
import * as schema from "./schema";
|
|
4
4
|
import {
|
|
5
5
|
incidents,
|
|
@@ -17,6 +17,7 @@ import type {
|
|
|
17
17
|
UpdateIncidentInput,
|
|
18
18
|
AddIncidentUpdateInput,
|
|
19
19
|
IncidentStatus,
|
|
20
|
+
IncidentSeverity,
|
|
20
21
|
} from "@checkstack/incident-common";
|
|
21
22
|
|
|
22
23
|
type Db = SafeDatabase<typeof schema>;
|
|
@@ -125,6 +126,62 @@ export class IncidentService {
|
|
|
125
126
|
};
|
|
126
127
|
}
|
|
127
128
|
|
|
129
|
+
/**
 * Batched read of the reactive state of `incident` entities (the Model B
 * plugin-backed `read` accessor).
 *
 * For every requested id that exists, yields `{ status, severity,
 * systemIds }`; ids with no matching incident are left out of the result
 * altogether. Data comes straight from the authoritative `incidents` and
 * `incident_systems` tables — not from framework `entity_state` storage —
 * making this the single source of truth that `handle.mutate` snapshots
 * `prev` from and that `get`/`getMany`/scope enrichment go through.
 *
 * @param ids Incident ids to look up; an empty list performs no query.
 * @returns Map of incident id → reactive state, containing found ids only.
 */
async getManyEntityStates(
  ids: ReadonlyArray<string>,
): Promise<
  Record<string, { status: IncidentStatus; severity: IncidentSeverity; systemIds: string[] }>
> {
  // Nothing requested → skip the database entirely.
  if (ids.length === 0) return {};

  const incidentRows = await this.db
    .select({
      id: incidents.id,
      status: incidents.status,
      severity: incidents.severity,
    })
    .from(incidents)
    .where(inArray(incidents.id, [...ids]));
  // None of the ids exist → the junction lookup would be pointless.
  if (incidentRows.length === 0) return {};

  // Only look up systems for incidents that were actually found.
  const linkRows = await this.db
    .select({
      incidentId: incidentSystems.incidentId,
      systemId: incidentSystems.systemId,
    })
    .from(incidentSystems)
    .where(
      inArray(
        incidentSystems.incidentId,
        incidentRows.map((row) => row.id),
      ),
    );

  // Group junction rows: incident id → system ids, in row order.
  const systemsByIncident = new Map<string, string[]>();
  for (const { incidentId, systemId } of linkRows) {
    const bucket = systemsByIncident.get(incidentId);
    if (bucket === undefined) {
      systemsByIncident.set(incidentId, [systemId]);
    } else {
      bucket.push(systemId);
    }
  }

  const states: Record<
    string,
    { status: IncidentStatus; severity: IncidentSeverity; systemIds: string[] }
  > = {};
  for (const { id, status, severity } of incidentRows) {
    // An incident with no junction rows still gets an (empty) systemIds list.
    states[id] = { status, severity, systemIds: systemsByIncident.get(id) ?? [] };
  }
  return states;
}
|
|
184
|
+
|
|
128
185
|
/**
|
|
129
186
|
* Get active incidents for a system
|
|
130
187
|
*/
|
|
@@ -165,13 +222,18 @@ export class IncidentService {
|
|
|
165
222
|
}
|
|
166
223
|
|
|
167
224
|
/**
|
|
168
|
-
* Create a new incident
|
|
225
|
+
* Create a new incident.
|
|
226
|
+
*
|
|
227
|
+
* `id` may be supplied by the caller so the reactive `incident` entity can
|
|
228
|
+
* be keyed on a known id BEFORE the insert runs (the create's `prev`
|
|
229
|
+
* snapshot must read the not-yet-existing row as absent — see §10.1). When
|
|
230
|
+
* omitted, a fresh id is generated. The id is server-owned either way.
|
|
169
231
|
*/
|
|
170
232
|
async createIncident(
|
|
171
233
|
input: CreateIncidentInput,
|
|
172
234
|
userId?: string,
|
|
235
|
+
id: string = generateId(),
|
|
173
236
|
): Promise<IncidentWithSystems> {
|
|
174
|
-
const id = generateId();
|
|
175
237
|
|
|
176
238
|
await this.db.insert(incidents).values({
|
|
177
239
|
id,
|
|
@@ -407,4 +469,86 @@ export class IncidentService {
|
|
|
407
469
|
|
|
408
470
|
return !!match;
|
|
409
471
|
}
|
|
472
|
+
|
|
473
|
+
/**
 * Look up one OPEN incident (status other than "resolved") that affects
 * `systemId`, returning it together with its systems, or `undefined` when
 * the system has no open incident.
 *
 * Mirrors the dedup semantic of the former auto-incident
 * `findActiveAutoIncident(systemId)`. It backs the opt-in
 * `dedupe_open_for_system` flag of `incident.create`, so a repeated trigger
 * for a system that already has an open incident reuses that incident
 * instead of opening a duplicate.
 *
 * The lookup applies no ordering, so when several open incidents affect the
 * system, which one is returned is left to the database.
 *
 * @param systemId System whose open incident should be located.
 * @returns The open incident as loaded by `getIncident`, or `undefined`.
 */
async findActiveIncidentForSystem(
  systemId: string,
): Promise<IncidentWithSystems | undefined> {
  // Step 1: every incident ever linked to this system.
  const links = await this.db
    .select({ incidentId: incidentSystems.incidentId })
    .from(incidentSystems)
    .where(eq(incidentSystems.systemId, systemId));
  if (links.length === 0) return undefined;

  // Step 2: of those, pick one that has not been resolved yet.
  const candidateIds = links.map((link) => link.incidentId);
  const stillOpen = and(
    inArray(incidents.id, candidateIds),
    ne(incidents.status, "resolved"),
  );
  const [open] = await this.db
    .select({ id: incidents.id })
    .from(incidents)
    .where(stillOpen)
    .limit(1);

  // Step 3: hydrate the full incident (with systems) for the caller.
  return open ? this.getIncident(open.id) : undefined;
}
|
|
501
|
+
|
|
502
|
+
/**
 * Create an incident for a system unless that system already has an open
 * one. Used by the `incident.create` automation action when
 * `dedupe_open_for_system` is set.
 *
 * The check-then-create is serialized per system behind a
 * transaction-scoped advisory lock, so two triggers firing at the same time
 * for one system (e.g. sustained + flapping) cannot both see "no open
 * incident" and both create. The critical section is only a find plus an
 * insert, which is why a transaction-scoped lock fits: it is released
 * automatically at COMMIT and cannot leak.
 *
 * NOTE(review): the dedup lookup keys on `dedupeSystemId` only; this method
 * does not check that `input.systemIds` contains it — confirm callers
 * guarantee that, otherwise a later trigger would not find the incident
 * created here.
 * NOTE(review): the find/create use pooled connections while the lock
 * transaction holds its own — confirm the pool is sized so lock waiters
 * cannot starve the holder.
 *
 * @param input Incident fields used if a new incident has to be created.
 * @param dedupeSystemId System to dedupe on; also part of the lock key.
 * @param userId Forwarded to `createIncident`.
 * @param newId Optional pre-chosen incident id, forwarded to `createIncident`.
 * @returns `{ incident, reused }` — `reused` is true when an already-open
 *   incident was returned instead of a newly created one.
 */
async createIncidentDedupedForSystem(
  input: CreateIncidentInput,
  dedupeSystemId: string,
  userId?: string,
  newId?: string,
  /**
   * Optional wrapper around the actual create (§10.1, 6(a)). When the dedup
   * check decides to CREATE, the create is run through this wrapper INSIDE
   * the advisory lock, so the reactive `incident` entity write (which
   * snapshots `prev` before the insert) is serialized with the check. The
   * wrapper is handed the bound create thunk and MUST invoke it exactly
   * once and return its result. The default simply invokes it
   * (non-reactive).
   */
  onCreate: (
    create: () => Promise<IncidentWithSystems>,
  ) => Promise<IncidentWithSystems> = (create) => create(),
): Promise<{ incident: IncidentWithSystems; reused: boolean }> {
  // Bound create thunk handed to `onCreate` when nothing is open yet.
  const createFresh = (): Promise<IncidentWithSystems> =>
    this.createIncident(input, userId, newId);

  // Critical section. It deliberately runs on `this.db` (the pool) rather
  // than on the lock's `tx`: `pg_advisory_xact_lock` makes every other
  // caller using this key wait at lock-acquire until this transaction
  // commits, so a racing caller's find cannot run until our insert has
  // already been committed. The lock window alone serializes the section.
  const findOrCreate = async (): Promise<{
    incident: IncidentWithSystems;
    reused: boolean;
  }> => {
    const open = await this.findActiveIncidentForSystem(dedupeSystemId);
    if (open) return { incident: open, reused: true };

    // Going through the caller's wrapper keeps the `incident.created` emit
    // inside the dedup lock.
    const created = await onCreate(createFresh);
    return { incident: created, reused: false };
  };

  return withXactLock({
    db: this.db,
    key: `incident.dedupe-open-for-system:${dedupeSystemId}`,
    fn: findOrCreate,
  });
}
|
|
410
554
|
}
|