@checkstack/slo-backend 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +41 -0
- package/drizzle/0000_rainy_kronos.sql +57 -0
- package/drizzle/meta/0000_snapshot.json +370 -0
- package/drizzle/meta/_journal.json +13 -0
- package/drizzle.config.ts +7 -0
- package/package.json +41 -0
- package/src/achievement-evaluator.ts +201 -0
- package/src/hooks.ts +76 -0
- package/src/index.ts +425 -0
- package/src/router.ts +192 -0
- package/src/schema.ts +120 -0
- package/src/service.ts +682 -0
- package/src/slo-engine.test.ts +662 -0
- package/src/slo-engine.ts +425 -0
- package/src/streak-calculator.ts +107 -0
- package/src/weekly-digest.ts +140 -0
- package/tsconfig.json +6 -0
|
@@ -0,0 +1,425 @@
|
|
|
1
|
+
import type { SloService } from "./service";
|
|
2
|
+
import type {
|
|
3
|
+
SloObjective,
|
|
4
|
+
SloStatus,
|
|
5
|
+
} from "@checkstack/slo-common";
|
|
6
|
+
import type { Logger } from "@checkstack/backend-api";
|
|
7
|
+
import type { SignalService } from "@checkstack/signal-common";
|
|
8
|
+
import { SLO_STATUS_CHANGED } from "@checkstack/slo-common";
|
|
9
|
+
|
|
10
|
+
/**
 * Callback used to look up the current health and display name of one
 * upstream system. It is supplied by the caller of the `handle*` methods so
 * the engine can be exercised without real RPC clients.
 */
type UpstreamHealthLookup = (args: {
  upstreamSystemId: string;
}) => Promise<{ isHealthy: boolean; systemName: string }>;

/**
 * Core SLO computation engine.
 *
 * Two responsibilities:
 * 1. Real-time event handler: reacts to SYSTEM_STATUS_CHANGED signals,
 *    creating/splitting/closing downtime events with correct attribution.
 * 2. Status computation: aggregates downtime events for API reads.
 */
export class SloEngine {
  private readonly service: SloService;
  private readonly signalService: SignalService;
  private readonly logger: Logger;
  /** Health lookup for a system; stays undefined until setHealthStatusCallback runs. */
  private _getSystemHealthStatus:
    | ((systemId: string) => Promise<{ isHealthy: boolean }>)
    | undefined;

  constructor({ service, signalService, logger }: {
    service: SloService;
    signalService: SignalService;
    logger: Logger;
  }) {
    this.service = service;
    this.signalService = signalService;
    this.logger = logger;
  }

  /**
   * Set the health status callback. Must be called from afterPluginsReady
   * once the healthcheck RPC client is available.
   *
   * @param callback Resolves whether the given system is currently healthy.
   */
  setHealthStatusCallback(
    callback: (systemId: string) => Promise<{ isHealthy: boolean }>,
  ) {
    this._getSystemHealthStatus = callback;
  }

  /**
   * Reconcile a newly created objective with the current system state.
   * If the system is already unhealthy and the objective has no open
   * downtime event, opens an initial event attributed to "self".
   *
   * Does nothing (apart from a debug log) when no health callback has been
   * set yet. Errors from the health callback or the service propagate.
   *
   * @param objective Id and system id of the objective to reconcile.
   */
  async reconcileObjective({
    objective,
  }: {
    objective: { id: string; systemId: string };
  }): Promise<void> {
    if (!this._getSystemHealthStatus) {
      // Before afterPluginsReady — can't check. Skip gracefully.
      this.logger.debug(
        `SLO ${objective.id}: reconcileObjective skipped — no health callback set`,
      );
      return;
    }

    const health = await this._getSystemHealthStatus(objective.systemId);
    if (health.isHealthy) return;

    // System is already down — check if there's already an event (defensive)
    const openEvents = await this.service.getOpenDowntimeEventsForObjective({
      objectiveId: objective.id,
    });
    if (openEvents.length > 0) return;

    // Open an initial downtime event attributed to self
    await this.service.openDowntimeEvent({
      objectiveId: objective.id,
      systemId: objective.systemId,
      attributionType: "self",
    });

    this.logger.info(
      `SLO ${objective.id}: Initial downtime event — system already degraded at creation time`,
    );
  }

  // ===========================================================================
  // PERSPECTIVE 1: This system's own SLOs
  // ===========================================================================

  /**
   * Handle a system transitioning to unhealthy.
   * Opens a new downtime event for every objective of the system that does
   * not already have one open, so repeated calls do not create duplicates.
   *
   * @param systemId System that became unhealthy.
   * @param getUpstreamHealthStatus Lookup forwarded to determineAttribution.
   */
  async handleSystemDown({
    systemId,
    getUpstreamHealthStatus,
  }: {
    systemId: string;
    getUpstreamHealthStatus: UpstreamHealthLookup;
  }): Promise<void> {
    const objectives = await this.service.getObjectivesForSystem({ systemId });

    for (const objective of objectives) {
      // Check if there's already an open event (idempotent)
      const openEvents = await this.service.getOpenDowntimeEventsForObjective({
        objectiveId: objective.id,
      });
      if (openEvents.length > 0) {
        this.logger.debug(
          `SLO ${objective.id}: Already has open downtime events, skipping`,
        );
        continue;
      }

      const attribution = await this.determineAttribution({
        objective,
        _getUpstreamHealthStatus: getUpstreamHealthStatus,
      });

      await this.service.openDowntimeEvent({
        objectiveId: objective.id,
        systemId,
        attributionType: attribution.type,
        upstreamSystemId: attribution.upstreamSystemId,
        upstreamSystemName: attribution.upstreamSystemName,
      });

      this.logger.info(
        `SLO ${objective.id}: Downtime started (attribution: ${attribution.type}${attribution.upstreamSystemName ? ` → ${attribution.upstreamSystemName}` : ""})`,
      );
    }
  }

  /**
   * Handle a system transitioning to healthy.
   * Closes all open downtime events of the system, then recomputes the status
   * of every objective that had one and broadcasts SLO_STATUS_CHANGED for it.
   * Nothing is broadcast when the system had no open events.
   *
   * @param systemId System that recovered.
   */
  async handleSystemUp({
    systemId,
  }: {
    systemId: string;
  }): Promise<void> {
    const openEvents = await this.service.getOpenDowntimeEvents({ systemId });

    for (const event of openEvents) {
      await this.service.closeDowntimeEvent({ id: event.id });
      this.logger.info(
        `SLO event ${event.id}: Closed (${event.attributionType})`,
      );
    }

    // Recompute and broadcast status for all affected objectives (deduplicated,
    // since one objective may have had several open events).
    const objectiveIds = [...new Set(openEvents.map((e) => e.objectiveId))];
    for (const objectiveId of objectiveIds) {
      const objective = await this.service.getObjective({ id: objectiveId });
      if (!objective) continue;

      const status = await this.computeStatus({ objective });
      await this.signalService.broadcast(SLO_STATUS_CHANGED, {
        systemId,
        objectiveId,
        budgetRemainingPercent: status.errorBudgetRemainingPercent,
        isBreaching: status.isBreaching,
      });
    }
  }

  // ===========================================================================
  // PERSPECTIVE 2: This system as an upstream dependency
  // ===========================================================================

  /**
   * Handle an upstream dependency going down.
   * Splits open "self" events on downstream systems into "upstream" events:
   * the "self" event is closed and a new event attributed to the upstream is
   * opened in its place.
   *
   * Events are left untouched when their objective is missing, uses the
   * "strict" exclusion mode, or lists the upstream in excludedDependencyIds.
   *
   * @param upstreamSystemId System that went down.
   * @param upstreamSystemName Name recorded on the new "upstream" events.
   * @param downstreamSystemIds Systems that depend on the upstream.
   */
  async handleUpstreamDown({
    upstreamSystemId,
    upstreamSystemName,
    downstreamSystemIds,
  }: {
    upstreamSystemId: string;
    upstreamSystemName: string;
    downstreamSystemIds: string[];
  }): Promise<void> {
    for (const downstreamId of downstreamSystemIds) {
      const openSelfEvents = await this.service.getOpenSelfEvents({
        systemId: downstreamId,
      });

      for (const event of openSelfEvents) {
        // Get the objective to check exclusion mode
        const objective = await this.service.getObjective({
          id: event.objectiveId,
        });
        if (!objective || objective.dependencyExclusion === "strict") continue;

        // Check if this upstream is excluded from the objective
        if (objective.excludedDependencyIds?.includes(upstreamSystemId))
          continue;

        // SPLIT: Close "self" event, open "upstream" event
        await this.service.closeDowntimeEvent({ id: event.id });
        await this.service.openDowntimeEvent({
          objectiveId: event.objectiveId,
          systemId: downstreamId,
          attributionType: "upstream",
          upstreamSystemId,
          upstreamSystemName,
        });

        this.logger.info(
          `SLO ${event.objectiveId}: Split event — self → upstream (${upstreamSystemName})`,
        );
      }
    }
  }

  /**
   * Handle an upstream dependency recovering.
   * Closes the open "upstream" events that downstream systems attribute to
   * this upstream. When that leaves an objective without any open event, a
   * follow-up event is opened with the attribution returned by
   * determineAttribution (the recovered upstream is passed as excluded).
   *
   * @param upstreamSystemId System that recovered.
   * @param downstreamSystemIds Systems that depend on the upstream.
   * @param getUpstreamHealthStatus Lookup forwarded to determineAttribution.
   */
  async handleUpstreamUp({
    upstreamSystemId,
    downstreamSystemIds,
    getUpstreamHealthStatus,
  }: {
    upstreamSystemId: string;
    downstreamSystemIds: string[];
    getUpstreamHealthStatus: UpstreamHealthLookup;
  }): Promise<void> {
    for (const downstreamId of downstreamSystemIds) {
      const upstreamEvents = await this.service.getOpenUpstreamEvents({
        systemId: downstreamId,
        upstreamSystemId,
      });

      for (const event of upstreamEvents) {
        const objective = await this.service.getObjective({
          id: event.objectiveId,
        });
        if (!objective) continue;

        // Close the upstream event
        await this.service.closeDowntimeEvent({ id: event.id });

        // The event was still open, so the downstream has not recovered
        // (if it had, handleSystemUp already closed everything).
        const stillOpen = await this.service.getOpenDowntimeEventsForObjective({
          objectiveId: event.objectiveId,
        });
        // Another open event already covers this objective — no replacement needed.
        if (stillOpen.length > 0) continue;

        // The downstream is still down — determine new attribution
        const newAttribution = await this.determineAttribution({
          objective,
          _getUpstreamHealthStatus: getUpstreamHealthStatus,
          _excludeUpstreamId: upstreamSystemId,
        });

        await this.service.openDowntimeEvent({
          objectiveId: event.objectiveId,
          systemId: downstreamId,
          attributionType: newAttribution.type,
          upstreamSystemId: newAttribution.upstreamSystemId,
          upstreamSystemName: newAttribution.upstreamSystemName,
        });

        this.logger.info(
          `SLO ${event.objectiveId}: Split event — upstream (${upstreamSystemId}) → ${newAttribution.type}${newAttribution.upstreamSystemName ? ` (${newAttribution.upstreamSystemName})` : ""}`,
        );
      }
    }
  }

  // ===========================================================================
  // STATUS COMPUTATION
  // ===========================================================================

  /**
   * Compute the current SLO status for display.
   *
   * The window covers the last `windowDays` days ending now. The error budget
   * is the share of that window the target allows to be down. In "strict"
   * mode all recorded downtime consumes the budget; in every other mode only
   * downtime attributed to the system itself does.
   *
   * @param objective Objective to evaluate.
   * @returns Availability, error budget, burn rate and attribution breakdown.
   *   The availability fields are null only when the window length is not
   *   positive; burnRate is null when the objective has no error budget.
   */
  async computeStatus({
    objective,
  }: {
    objective: SloObjective;
  }): Promise<SloStatus> {
    const now = new Date();
    const windowStart = new Date(
      now.getTime() - objective.windowDays * 24 * 60 * 60 * 1000,
    );

    const downtime = await this.service.getDowntimeForWindow({
      objectiveId: objective.id,
      windowStart,
      windowEnd: now,
    });

    const totalWindowMinutes = objective.windowDays * 24 * 60;
    const allowedDowntimeMinutes =
      ((100 - objective.target) / 100) * totalWindowMinutes;

    // What counts depends on the exclusion mode
    const consumedMinutes =
      objective.dependencyExclusion === "strict"
        ? downtime.totalMinutes
        : downtime.selfMinutes;

    const remainingMinutes = Math.max(0, allowedDowntimeMinutes - consumedMinutes);

    let remainingPercent: number;
    if (allowedDowntimeMinutes > 0) {
      remainingPercent = (remainingMinutes / allowedDowntimeMinutes) * 100;
    } else {
      // No budget at all (target of 100%): the budget is intact only while
      // nothing has been consumed. Reporting 100% after downtime would
      // contradict isBreaching below.
      remainingPercent = consumedMinutes > 0 ? 0 : 100;
    }

    const effectiveAvailability =
      totalWindowMinutes > 0
        ? ((totalWindowMinutes - consumedMinutes) / totalWindowMinutes) * 100
        // eslint-disable-next-line unicorn/no-null -- Zod schema uses .nullable()
        : null;

    const strictAvailability =
      totalWindowMinutes > 0
        ? ((totalWindowMinutes - downtime.totalMinutes) / totalWindowMinutes) * 100
        // eslint-disable-next-line unicorn/no-null -- Zod schema uses .nullable()
        : null;

    // Burn rate: how fast are we consuming budget relative to the window?
    // windowStart is derived from `now`, so the elapsed time equals the whole
    // window (clamped to at least one day) and for windowDays >= 1 the burn
    // rate reduces to consumed / allowed.
    const elapsedDays = Math.max(
      (now.getTime() - windowStart.getTime()) / (24 * 60 * 60 * 1000),
      1,
    );
    const expectedConsumption =
      (elapsedDays / objective.windowDays) * allowedDowntimeMinutes;
    const burnRate =
      // eslint-disable-next-line unicorn/no-null -- Zod schema uses .nullable()
      expectedConsumption > 0 ? consumedMinutes / expectedConsumption : null;

    // Check for open downtime events
    const openEvents = await this.service.getOpenDowntimeEventsForObjective({
      objectiveId: objective.id,
    });

    // Build attribution breakdown
    const attribution = downtime.entries.map((entry) => ({
      sourceType: entry.attributionType as "self" | "upstream",
      systemId: entry.upstreamSystemId ?? undefined,
      systemName: entry.upstreamSystemName ?? undefined,
      minutes: entry.totalMinutes,
    }));

    return {
      objectiveId: objective.id,
      systemId: objective.systemId,
      target: objective.target,
      windowDays: objective.windowDays,
      healthCheckConfigurationId: objective.healthCheckConfigurationId,
      // eslint-disable-next-line unicorn/no-null -- Not yet resolving configuration names
      healthCheckConfigurationName: null,
      currentAvailability: effectiveAvailability,
      strictAvailability,
      errorBudgetTotalMinutes: allowedDowntimeMinutes,
      errorBudgetConsumedMinutes: consumedMinutes,
      errorBudgetConsumedStrictMinutes: downtime.totalMinutes,
      errorBudgetRemainingMinutes: remainingMinutes,
      errorBudgetRemainingPercent: remainingPercent,
      burnRate,
      dependencyExclusion: objective.dependencyExclusion,
      isBreaching: effectiveAvailability !== null && effectiveAvailability < objective.target,
      hasOpenDowntime: openEvents.length > 0,
      attribution,
    };
  }

  // ===========================================================================
  // PRIVATE HELPERS
  // ===========================================================================

  /**
   * Determine attribution for a new downtime event based on the objective's
   * dependency exclusion mode and the current health of upstream dependencies.
   *
   * NOTE: only the strict-mode short circuit is implemented. Every path
   * currently returns `{ type: "self" }`; `_getUpstreamHealthStatus` and
   * `_excludeUpstreamId` are accepted but not consulted yet. The callback is
   * part of the signature so the plugin entry point can inject the real
   * dependency and health clients while tests inject fakes.
   *
   * @param objective Objective the new event belongs to.
   * @param _getUpstreamHealthStatus Upstream health lookup (currently unused).
   * @param _excludeUpstreamId Upstream to ignore (currently unused).
   * @returns The attribution to record on the new downtime event.
   */
  private async determineAttribution({
    objective,
    _getUpstreamHealthStatus,
    _excludeUpstreamId,
  }: {
    objective: SloObjective;
    _getUpstreamHealthStatus: UpstreamHealthLookup;
    _excludeUpstreamId?: string;
  }): Promise<{
    type: "self" | "upstream";
    upstreamSystemId?: string;
    upstreamSystemName?: string;
  }> {
    // Strict mode: always self
    if (objective.dependencyExclusion === "strict") {
      return { type: "self" };
    }

    // Non-strict modes: upstream inspection is not implemented yet, so the
    // downtime is attributed to the system itself.
    return { type: "self" };
  }
}
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
import type { SloService } from "./service";
|
|
2
|
+
import type { SloEngine } from "./slo-engine";
|
|
3
|
+
import type { Logger } from "@checkstack/backend-api";
|
|
4
|
+
import type { QueueManager } from "@checkstack/queue-api";
|
|
5
|
+
|
|
6
|
+
/** Queue that carries the daily snapshot trigger. */
const SNAPSHOT_QUEUE = "slo-daily-snapshots";
/** Job id under which the recurring snapshot run is scheduled. */
const SNAPSHOT_JOB_ID = "slo-daily-snapshot-run";
/** Consumer group the snapshot consumer registers with. */
const WORKER_GROUP = "slo-snapshot-worker";

/** Collaborators required to register the daily snapshot job. */
interface StreakCalculatorDeps {
  service: SloService;
  engine: SloEngine;
  logger: Logger;
  queueManager: QueueManager;
}

/**
 * Registers the daily SLO snapshot / streak job on the snapshot queue.
 *
 * First attaches a consumer that delegates to runDailySnapshotJob, then
 * schedules a recurring trigger with the cron pattern "0 0 * * *"
 * (daily at midnight UTC).
 *
 * @param deps Service, engine and logger handed to the job, plus the queue
 *   manager used to obtain the queue.
 */
export async function setupDailySnapshotJob(deps: StreakCalculatorDeps) {
  const { service, engine, logger, queueManager } = deps;
  const snapshotQueue =
    queueManager.getQueue<{ trigger: "scheduled" }>(SNAPSHOT_QUEUE);

  // Consumer: one full snapshot run per delivered trigger.
  const handleSnapshotTrigger = async () => {
    logger.info("Starting daily SLO snapshot job");
    await runDailySnapshotJob({ service, engine, logger });
    logger.info("Completed daily SLO snapshot job");
  };
  // maxRetries: 0 — presumably a failed run is not retried; confirm against the queue API.
  const consumerOptions = { consumerGroup: WORKER_GROUP, maxRetries: 0 };
  await snapshotQueue.consume(handleSnapshotTrigger, consumerOptions);

  // Recurring schedule: daily at midnight UTC (00:00).
  const schedule = { jobId: SNAPSHOT_JOB_ID, cronPattern: "0 0 * * *" };
  await snapshotQueue.scheduleRecurring({ trigger: "scheduled" }, schedule);

  logger.debug("✅ SLO daily snapshot job scheduled (runs at 00:00 UTC)");
}
|
|
48
|
+
|
|
49
|
+
/**
 * Daily snapshot and streak bookkeeping for every objective.
 *
 * Per objective, in this order:
 * 1. Compute the current SLO status.
 * 2. Persist a snapshot dated at today's UTC midnight.
 * 3. Update the streak: increment when the objective is neither breaching
 *    nor in open downtime; reset (if a streak exists) when it is breaching;
 *    leave it untouched when it has open downtime without breaching.
 *
 * A failure for one objective is logged and does not stop the others.
 *
 * NOTE(review): the snapshot stores the streak value read before step 3, so
 * it lags the counter by one run — confirm this is intended.
 *
 * @param deps Service used for persistence, engine used for status
 *   computation, and logger.
 */
export async function runDailySnapshotJob(deps: {
  service: SloService;
  engine: SloEngine;
  logger: Logger;
}) {
  const { service, engine, logger } = deps;

  const allObjectives = await service.listObjectives();

  // Every snapshot of this run shares the same date: today at 00:00 UTC.
  const snapshotDate = new Date();
  snapshotDate.setUTCHours(0, 0, 0, 0);

  /** Snapshot + streak update for a single objective; throws on any failure. */
  const processObjective = async (
    objective: (typeof allObjectives)[number],
  ): Promise<void> => {
    const objectiveId = objective.id;
    const status = await engine.computeStatus({ objective });
    const streak = await service.getStreak({ objectiveId });
    const streakBefore = streak?.currentStreak ?? 0;

    await service.insertDailySnapshot({
      snapshot: {
        objectiveId,
        date: snapshotDate,
        availabilityPercent: status.currentAvailability ?? 100,
        budgetConsumedMinutes: status.errorBudgetConsumedMinutes,
        budgetRemainingPercent: status.errorBudgetRemainingPercent,
        // eslint-disable-next-line unicorn/no-null -- Zod schema uses .nullable()
        burnRate: status.burnRate ?? null,
        streakDays: streakBefore,
      },
    });

    if (status.isBreaching) {
      // Breaching ends the streak; nothing to reset when there is none.
      if (streakBefore > 0) {
        await service.resetStreak({ objectiveId });
        logger.info(
          `SLO ${objective.id}: Streak broken at ${streakBefore} days`,
        );
      }
      return;
    }

    // Within target: the day only counts when no downtime is currently open.
    if (!status.hasOpenDowntime) {
      await service.incrementStreak({ objectiveId });
    }
  };

  for (const objective of allObjectives) {
    try {
      await processObjective(objective);
    } catch (error) {
      logger.error(
        `Failed to process daily snapshot for objective ${objective.id}`,
        { error },
      );
    }
  }
}
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
import type { SloService } from "./service";
|
|
2
|
+
import type { SloEngine } from "./slo-engine";
|
|
3
|
+
import type { Logger, EmitHookFn } from "@checkstack/backend-api";
|
|
4
|
+
import type { QueueManager } from "@checkstack/queue-api";
|
|
5
|
+
import { sloHooks } from "./hooks";
|
|
6
|
+
|
|
7
|
+
/** Queue that carries the weekly digest trigger. */
const DIGEST_QUEUE = "slo-weekly-digest";
/** Job id under which the recurring digest run is scheduled. */
const DIGEST_JOB_ID = "slo-weekly-digest-run";
/** Consumer group the digest consumer registers with. */
const WORKER_GROUP = "slo-digest-worker";

/** Collaborators required to register the weekly digest job. */
interface WeeklyDigestDeps {
  service: SloService;
  engine: SloEngine;
  logger: Logger;
  queueManager: QueueManager;
  emitHook: EmitHookFn;
}

/**
 * Registers the weekly SLO digest job on the digest queue.
 *
 * First attaches a consumer that delegates to runWeeklyDigest, then schedules
 * a recurring trigger with the cron pattern "0 9 * * 1" (Monday 09:00 UTC).
 * The digest itself is emitted as a hook that integration channels
 * (Slack, Teams, etc.) can deliver.
 *
 * @param deps Service, engine, logger and hook emitter handed to the digest
 *   run, plus the queue manager used to obtain the queue.
 */
export async function setupWeeklyDigestJob(deps: WeeklyDigestDeps) {
  const { service, engine, logger, emitHook, queueManager } = deps;
  const digestQueue =
    queueManager.getQueue<{ trigger: "scheduled" }>(DIGEST_QUEUE);

  // Consumer: one digest run per delivered trigger.
  const handleDigestTrigger = async () => {
    logger.info("[slo] Running weekly SLO digest...");
    await runWeeklyDigest({ service, engine, logger, emitHook });
    logger.info("[slo] Weekly SLO digest complete.");
  };
  // maxRetries: 0 — presumably a failed run is not retried; confirm against the queue API.
  const consumerOptions = { consumerGroup: WORKER_GROUP, maxRetries: 0 };
  await digestQueue.consume(handleDigestTrigger, consumerOptions);

  // Recurring schedule: every Monday at 09:00 UTC.
  const schedule = { jobId: DIGEST_JOB_ID, cronPattern: "0 9 * * 1" };
  await digestQueue.scheduleRecurring({ trigger: "scheduled" }, schedule);

  logger.debug("[slo] Weekly digest cron job registered (Monday 09:00 UTC)");
}
|
|
51
|
+
|
|
52
|
+
/**
 * Core logic for the weekly SLO digest.
 * Computes status for all objectives, categorizes them, and emits the digest hook.
 *
 * Status computation is isolated per objective: a failure is logged and that
 * objective is left out of the counts and rankings instead of aborting the
 * whole digest. `totalObjectives` still reports every listed objective, so
 * the three category counts can add up to less than the total when some
 * objectives failed.
 *
 * Categories: "breaching" (isBreaching), "at risk" (not breaching, at most
 * 20% error budget remaining), "healthy" (not breaching, more than 20%).
 * Rankings: top 3 by highest availability, worst 3 by lowest remaining budget.
 *
 * @param deps Service (objectives and streaks), engine (status computation),
 *   logger, and the hook emitter used to publish the digest.
 * @throws Error when status computation failed for every objective, so the
 *   run is reported as failed rather than emitting an all-zero digest.
 */
async function runWeeklyDigest(deps: {
  service: SloService;
  engine: SloEngine;
  logger: Logger;
  emitHook: EmitHookFn;
}) {
  const { service, engine, logger, emitHook } = deps;

  const objectives = await service.listObjectives();
  if (objectives.length === 0) {
    logger.info("[slo] No SLO objectives — skipping weekly digest.");
    return;
  }

  // Compute status for all objectives; a failing objective yields undefined
  // so that one bad objective cannot reject the whole Promise.all.
  const computed = await Promise.all(
    objectives.map(async (obj) => {
      try {
        return {
          objective: obj,
          status: await engine.computeStatus({ objective: obj }),
        };
      } catch (error) {
        logger.error(
          `[slo] Weekly digest: failed to compute status for objective ${obj.id}`,
          { error },
        );
        return undefined;
      }
    }),
  );
  // flatMap drops the failed entries and narrows away `undefined`.
  const statuses = computed.flatMap((entry) => (entry ? [entry] : []));

  if (statuses.length === 0) {
    throw new Error(
      "[slo] Weekly digest aborted — status computation failed for every objective",
    );
  }

  // Categorize
  const breachingCount = statuses.filter(
    (s) => s.status.isBreaching,
  ).length;
  const atRiskCount = statuses.filter(
    (s) =>
      !s.status.isBreaching &&
      s.status.errorBudgetRemainingPercent <= 20,
  ).length;
  const healthyCount = statuses.filter(
    (s) =>
      !s.status.isBreaching &&
      s.status.errorBudgetRemainingPercent > 20,
  ).length;

  // Get streaks for top performers
  const streaks = await service.getAllStreaks();
  const streakMap = new Map(
    streaks.map((s) => [s.objectiveId, s.currentStreak]),
  );

  // Top 3 performers (highest availability); objectives without an
  // availability value are not ranked.
  const topPerformers = statuses
    .filter((s) => s.status.currentAvailability !== null)
    .toSorted(
      (a, b) =>
        (b.status.currentAvailability ?? 0) -
        (a.status.currentAvailability ?? 0),
    )
    .slice(0, 3)
    .map((s) => ({
      // The status carries no display name, so the system id is used.
      systemName: s.status.systemId,
      availability: s.status.currentAvailability ?? 0,
      streakDays: streakMap.get(s.objective.id) ?? 0,
    }));

  // Bottom 3 performers (lowest budget remaining)
  const worstPerformers = statuses
    .toSorted(
      (a, b) =>
        a.status.errorBudgetRemainingPercent -
        b.status.errorBudgetRemainingPercent,
    )
    .slice(0, 3)
    .map((s) => ({
      systemName: s.status.systemId,
      availability: s.status.currentAvailability ?? 0,
      budgetRemainingPercent: s.status.errorBudgetRemainingPercent,
    }));

  await emitHook(sloHooks.sloWeeklyDigest, {
    totalObjectives: objectives.length,
    breachingCount,
    atRiskCount,
    healthyCount,
    topPerformers,
    worstPerformers,
  });

  logger.info(
    `[slo] Weekly digest emitted: ${objectives.length} objectives, ${breachingCount} breaching, ${atRiskCount} at risk, ${healthyCount} healthy`,
  );
}
|