@checkstack/slo-backend 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +41 -0
- package/drizzle/0000_rainy_kronos.sql +57 -0
- package/drizzle/meta/0000_snapshot.json +370 -0
- package/drizzle/meta/_journal.json +13 -0
- package/drizzle.config.ts +7 -0
- package/package.json +41 -0
- package/src/achievement-evaluator.ts +201 -0
- package/src/hooks.ts +76 -0
- package/src/index.ts +425 -0
- package/src/router.ts +192 -0
- package/src/schema.ts +120 -0
- package/src/service.ts +682 -0
- package/src/slo-engine.test.ts +662 -0
- package/src/slo-engine.ts +425 -0
- package/src/streak-calculator.ts +107 -0
- package/src/weekly-digest.ts +140 -0
- package/tsconfig.json +6 -0
package/src/router.ts
ADDED
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
import { implement, ORPCError } from "@orpc/server";
|
|
2
|
+
import {
|
|
3
|
+
sloContract,
|
|
4
|
+
SLO_STATUS_CHANGED,
|
|
5
|
+
} from "@checkstack/slo-common";
|
|
6
|
+
import {
|
|
7
|
+
autoAuthMiddleware,
|
|
8
|
+
type RpcContext,
|
|
9
|
+
} from "@checkstack/backend-api";
|
|
10
|
+
import type { SignalService } from "@checkstack/signal-common";
|
|
11
|
+
import type { SloService } from "./service";
|
|
12
|
+
import type { SloEngine } from "./slo-engine";
|
|
13
|
+
|
|
14
|
+
/**
 * Builds the oRPC router that implements `sloContract`.
 *
 * All procedures are created from a single implementer that is typed with
 * `RpcContext` and passed through `autoAuthMiddleware`.
 *
 * @param deps.service - Persistence-facing SLO service (objectives, downtime
 *   events, snapshots, streaks, achievements).
 * @param deps.engine - SLO engine used to reconcile objectives and compute
 *   their current status.
 * @param deps.signalService - Used to broadcast `SLO_STATUS_CHANGED` after an
 *   objective is created or updated.
 * @returns The router object produced by `os.router(...)`.
 */
export function createRouter({
  service,
  engine,
  signalService,
}: {
  service: SloService;
  engine: SloEngine;
  signalService: SignalService;
}) {
  const os = implement(sloContract)
    .$context<RpcContext>()
    .use(autoAuthMiddleware);

  /** Objective shape accepted by `engine.computeStatus`. */
  type StatusObjective = Parameters<
    typeof engine.computeStatus
  >[0]["objective"];

  /** Resolved result of `engine.computeStatus`. */
  type ObjectiveStatus = Awaited<ReturnType<typeof engine.computeStatus>>;

  /**
   * Objective shape that can be handed to both `engine.reconcileObjective`
   * and `engine.computeStatus`, and that exposes the ids needed for the
   * broadcast payload.
   */
  type ReconcilableObjective = Parameters<
    typeof engine.reconcileObjective
  >[0]["objective"] &
    StatusObjective & { id: string; systemId: string };

  /**
   * Pairs every objective with its freshly computed status.
   * Statuses are computed concurrently; the result keeps the input order.
   */
  function withStatus<T extends StatusObjective>(
    objectives: readonly T[],
  ): Promise<Array<{ objective: T; status: ObjectiveStatus }>> {
    return Promise.all(
      objectives.map(async (objective) => ({
        objective,
        status: await engine.computeStatus({ objective }),
      })),
    );
  }

  /**
   * Reconciles the objective with the engine, recomputes its status and
   * broadcasts `SLO_STATUS_CHANGED` with the resulting budget/breach state.
   * Shared by the create and update handlers so both stay in lockstep.
   */
  async function reconcileAndBroadcast(
    objective: ReconcilableObjective,
  ): Promise<void> {
    await engine.reconcileObjective({ objective });

    const status = await engine.computeStatus({ objective });
    await signalService.broadcast(SLO_STATUS_CHANGED, {
      systemId: objective.systemId,
      objectiveId: objective.id,
      budgetRemainingPercent: status.errorBudgetRemainingPercent,
      isBreaching: status.isBreaching,
    });
  }

  return os.router({
    // =========================================================================
    // OBJECTIVES
    // =========================================================================

    // Lists every objective together with its current status.
    listObjectives: os.listObjectives.handler(async () => {
      const objectives = await service.listObjectives();
      return { objectives: await withStatus(objectives) };
    }),

    // Returns one objective with its status, or null when the id is unknown.
    getObjective: os.getObjective.handler(async ({ input }) => {
      const objective = await service.getObjective({ id: input.id });
      if (!objective) {
        // eslint-disable-next-line unicorn/no-null -- oRPC contract requires null for missing values
        return null;
      }
      const status = await engine.computeStatus({ objective });
      return { objective, status };
    }),

    // Returns all objectives (with status) that belong to a single system.
    getObjectivesForSystem: os.getObjectivesForSystem.handler(
      async ({ input }) => {
        const objectives = await service.getObjectivesForSystem({
          systemId: input.systemId,
        });
        return withStatus(objectives);
      },
    ),

    // Returns objectives (with status) for several systems, keyed by system id.
    getBulkObjectivesForSystems: os.getBulkObjectivesForSystems.handler(
      async ({ input }) => {
        // Repeated ids would only repeat identical work for the same key.
        const uniqueSystemIds = [...new Set(input.systemIds)];

        const entries = await Promise.all(
          uniqueSystemIds.map(async (systemId) => {
            const objectives = await service.getObjectivesForSystem({
              systemId,
            });
            return [systemId, await withStatus(objectives)] as const;
          }),
        );

        // Object.fromEntries defines own properties in input order, so the
        // result does not depend on which lookup finishes first and keys such
        // as "__proto__" are stored as ordinary entries.
        return { systems: Object.fromEntries(entries) };
      },
    ),

    // Creates an objective, then reconciles it and announces its status.
    createObjective: os.createObjective.handler(
      async ({ input }) => {
        const objective = await service.createObjective({ input });

        // Reconcile initial state: if system is already down,
        // open an initial downtime event immediately
        await reconcileAndBroadcast(objective);

        return objective;
      },
    ),

    // Updates an objective; fails with NOT_FOUND when it does not exist.
    updateObjective: os.updateObjective.handler(async ({ input }) => {
      const objective = await service.updateObjective({ input });
      if (!objective) {
        throw new ORPCError("NOT_FOUND", {
          message: "SLO objective not found",
        });
      }

      // Reconcile: if system is currently down but no event exists
      await reconcileAndBroadcast(objective);

      return objective;
    }),

    // Deletes an objective and reports the service's success flag.
    deleteObjective: os.deleteObjective.handler(async ({ input }) => {
      const success = await service.deleteObjective({ id: input.id });
      return { success };
    }),

    // =========================================================================
    // DOWNTIME EVENTS & SNAPSHOTS
    // =========================================================================

    // Recent downtime events for an objective; limit defaults to 50.
    getDowntimeEvents: os.getDowntimeEvents.handler(async ({ input }) => {
      const events = await service.getRecentDowntimeEvents({
        objectiveId: input.objectiveId,
        limit: input.limit ?? 50,
      });
      return { events };
    }),

    // Daily snapshots for an objective between startDate and endDate.
    getDailySnapshots: os.getDailySnapshots.handler(async ({ input }) => {
      const snapshots = await service.getDailySnapshots({
        objectiveId: input.objectiveId,
        startDate: input.startDate,
        endDate: input.endDate,
      });
      return { snapshots };
    }),

    // =========================================================================
    // STREAKS & ACHIEVEMENTS
    // =========================================================================

    // All streak records known to the service.
    getStreaks: os.getStreaks.handler(async () => {
      const streaks = await service.getAllStreaks();
      return { streaks };
    }),

    // Achievements, filtered by the (possibly absent) systemId input.
    getAchievements: os.getAchievements.handler(async ({ input }) => {
      const achievements = await service.getAchievements({
        systemId: input.systemId,
      });
      return { achievements };
    }),

    // Most recent milestones; limit defaults to 20. Only the fields exposed
    // by the contract are forwarded.
    getRecentMilestones: os.getRecentMilestones.handler(async ({ input }) => {
      const achievements = await service.getRecentMilestones({
        limit: input.limit ?? 20,
      });
      return {
        milestones: achievements.map((a) => ({
          systemId: a.systemId,
          achievement: a.achievement,
          unlockedAt: a.unlockedAt,
        })),
      };
    }),
  });
}
|
package/src/schema.ts
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
import {
|
|
2
|
+
pgTable,
|
|
3
|
+
text,
|
|
4
|
+
timestamp,
|
|
5
|
+
doublePrecision,
|
|
6
|
+
integer,
|
|
7
|
+
json,
|
|
8
|
+
} from "drizzle-orm/pg-core";
|
|
9
|
+
|
|
10
|
+
// =============================================================================
|
|
11
|
+
// SLO OBJECTIVES
|
|
12
|
+
// =============================================================================
|
|
13
|
+
|
|
14
|
+
/**
 * `slo_objectives` — one row per SLO definition.
 *
 * A system may own several objectives. A null `healthCheckConfigurationId`
 * means the objective covers the system's aggregate availability across all
 * of its health checks rather than a single check.
 */
export const sloObjectives = pgTable("slo_objectives", {
  id: text("id").primaryKey(),
  systemId: text("system_id").notNull(),
  // Nullable on purpose: null selects the system-wide aggregate.
  healthCheckConfigurationId: text("health_check_configuration_id"),
  target: doublePrecision("target").notNull(),
  windowDays: integer("window_days").notNull(),
  // Stored as free text; defaults to "strict".
  dependencyExclusion: text("dependency_exclusion").notNull().default("strict"),
  // JSON column typed as a list of ids; nullable.
  excludedDependencyIds: json("excluded_dependency_ids").$type<string[]>(),
  // Burn-rate thresholds, each with a column default.
  burnRateWarningPercent: doublePrecision("burn_rate_warning_percent")
    .notNull()
    .default(50),
  burnRateCriticalPercent: doublePrecision("burn_rate_critical_percent")
    .notNull()
    .default(80),
  burnRateFastBurnMultiplier: doublePrecision("burn_rate_fast_burn_multiplier")
    .notNull()
    .default(5),
  // Both timestamps default to the database's current time on insert.
  createdAt: timestamp("created_at").defaultNow().notNull(),
  updatedAt: timestamp("updated_at").defaultNow().notNull(),
});
|
|
39
|
+
|
|
40
|
+
// =============================================================================
|
|
41
|
+
// DOWNTIME EVENTS (event-sourced)
|
|
42
|
+
// =============================================================================
|
|
43
|
+
|
|
44
|
+
/**
 * `slo_downtime_events` — event-sourced downtime log, written in real time
 * on SYSTEM_STATUS_CHANGED.
 *
 * Rows are treated as IMMUTABLE once created. If the attribution changes
 * while an outage is still running (e.g. an upstream fails after the
 * downstream already did), the open row is closed and a fresh row with the
 * new attribution is inserted (event splitting).
 *
 * A null `endTime` marks an outage that is still ongoing.
 */
export const sloDowntimeEvents = pgTable("slo_downtime_events", {
  id: text("id").primaryKey(),
  // Deleting the parent objective removes its events as well.
  objectiveId: text("objective_id")
    .notNull()
    .references(() => sloObjectives.id, { onDelete: "cascade" }),
  systemId: text("system_id").notNull(),
  startTime: timestamp("start_time").notNull(),
  // Null while the outage is open.
  endTime: timestamp("end_time"),
  // Nullable alongside endTime.
  durationSeconds: doublePrecision("duration_seconds"),
  attributionType: text("attribution_type").notNull(),
  // Optional upstream details; both columns are nullable.
  upstreamSystemId: text("upstream_system_id"),
  upstreamSystemName: text("upstream_system_name"),
});
|
|
66
|
+
|
|
67
|
+
// =============================================================================
|
|
68
|
+
// DAILY SNAPSHOTS
|
|
69
|
+
// =============================================================================
|
|
70
|
+
|
|
71
|
+
/**
 * `slo_daily_snapshots` — per-day SLO figures backing the trend charts.
 * Rows are persisted by a daily cron job at UTC midnight.
 */
export const sloDailySnapshots = pgTable("slo_daily_snapshots", {
  id: text("id").primaryKey(),
  // Deleting the parent objective removes its snapshots as well.
  objectiveId: text("objective_id")
    .notNull()
    .references(() => sloObjectives.id, { onDelete: "cascade" }),
  date: timestamp("date").notNull(),
  availabilityPercent: doublePrecision("availability_percent").notNull(),
  budgetConsumedMinutes: doublePrecision("budget_consumed_minutes").notNull(),
  budgetRemainingPercent: doublePrecision("budget_remaining_percent").notNull(),
  // The only nullable metric in this table.
  burnRate: doublePrecision("burn_rate"),
  streakDays: integer("streak_days").notNull().default(0),
});
|
|
87
|
+
|
|
88
|
+
// =============================================================================
|
|
89
|
+
// STREAKS
|
|
90
|
+
// =============================================================================
|
|
91
|
+
|
|
92
|
+
/**
 * `slo_streaks` — streak bookkeeping, one row per SLO objective (1:1; the
 * objective id doubles as the primary key).
 * Updated daily by the streak calculator cron job.
 */
export const sloStreaks = pgTable("slo_streaks", {
  // Primary key and foreign key at once; removed together with the objective.
  objectiveId: text("objective_id")
    .primaryKey()
    .references(() => sloObjectives.id, { onDelete: "cascade" }),
  systemId: text("system_id").notNull(),
  // Both counters start at 0.
  currentStreak: integer("current_streak").notNull().default(0),
  bestStreak: integer("best_streak").notNull().default(0),
  // Nullable timestamps.
  streakStart: timestamp("streak_start"),
  bestStreakEnd: timestamp("best_streak_end"),
});
|
|
106
|
+
|
|
107
|
+
// =============================================================================
|
|
108
|
+
// ACHIEVEMENTS
|
|
109
|
+
// =============================================================================
|
|
110
|
+
|
|
111
|
+
/**
 * `slo_achievements` — achievement log kept at system level only; no user
 * attribution is stored.
 * Idempotency is handled by the achievement evaluator, which checks for an
 * existing entry before inserting; this table declares no uniqueness
 * constraint of its own.
 */
export const sloAchievements = pgTable("slo_achievements", {
  id: text("id").primaryKey(),
  systemId: text("system_id").notNull(),
  achievement: text("achievement").notNull(),
  // Defaults to the database's current time on insert.
  unlockedAt: timestamp("unlocked_at").defaultNow().notNull(),
});
|