@checkstack/slo-backend 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +41 -0
- package/drizzle/0000_rainy_kronos.sql +57 -0
- package/drizzle/meta/0000_snapshot.json +370 -0
- package/drizzle/meta/_journal.json +13 -0
- package/drizzle.config.ts +7 -0
- package/package.json +41 -0
- package/src/achievement-evaluator.ts +201 -0
- package/src/hooks.ts +76 -0
- package/src/index.ts +425 -0
- package/src/router.ts +192 -0
- package/src/schema.ts +120 -0
- package/src/service.ts +682 -0
- package/src/slo-engine.test.ts +662 -0
- package/src/slo-engine.ts +425 -0
- package/src/streak-calculator.ts +107 -0
- package/src/weekly-digest.ts +140 -0
- package/tsconfig.json +6 -0
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
import type { SloService } from "./service";
|
|
2
|
+
import type { SloEngine } from "./slo-engine";
|
|
3
|
+
import type { Logger } from "@checkstack/backend-api";
|
|
4
|
+
import type { AchievementType } from "@checkstack/slo-common";
|
|
5
|
+
|
|
6
|
+
/** Shape of a single entry in the achievement catalogue. */
type AchievementDefinition = {
  type: AchievementType;
  description: string;
  evaluate: (ctx: AchievementContext) => boolean;
};

/** True when `value` is known and is greater than or equal to `minimum`. */
const meetsMinimum = (value: number | undefined, minimum: number): boolean =>
  value !== undefined && value >= minimum;

/** True when `value` is known and is less than or equal to `maximum`. */
const withinMaximum = (value: number | undefined, maximum: number): boolean =>
  value !== undefined && value <= maximum;

/**
 * Catalogue of every achievement a system can earn.
 *
 * Achievements belong to a system, never to a user. Each entry couples the
 * achievement type with a display description and a predicate that decides,
 * from an aggregated AchievementContext, whether the achievement is earned.
 */
const ACHIEVEMENT_DEFINITIONS: AchievementDefinition[] = [
  {
    type: "first_steps",
    description: "System has at least one SLO objective configured",
    evaluate: (ctx) => ctx.objectiveCount >= 1,
  },
  {
    type: "full_coverage",
    description: "System has SLOs covering all health checks",
    // NOTE(review): the predicate only counts objectives (>= 3); it does not
    // look at health checks as the description suggests — confirm intent.
    evaluate: (ctx) => ctx.objectiveCount >= 3,
  },
  {
    type: "iron_uptime",
    description: "7-day reliability streak",
    evaluate: (ctx) => ctx.streakDays >= 7,
  },
  {
    type: "diamond_uptime",
    description: "30-day reliability streak",
    evaluate: (ctx) => ctx.streakDays >= 30,
  },
  {
    type: "nines_club",
    description: "Achieved 99.9% or higher availability in a rolling window",
    evaluate: (ctx) => meetsMinimum(ctx.bestAvailability, 99.9),
  },
  {
    type: "budget_miser",
    description: "Used less than 10% of error budget over a full window",
    // NOTE(review): ">= 90% remaining" also accepts exactly 10% used, while
    // the description says "less than" — confirm which boundary is intended.
    evaluate: (ctx) => meetsMinimum(ctx.budgetRemainingPercent, 90),
  },
  {
    type: "clean_sheet",
    description: "Zero downtime events in the current rolling window",
    evaluate: (ctx) => ctx.hasZeroDowntime,
  },
  {
    type: "cascade_breaker",
    description:
      "System was down but all downtime was attributed to upstream dependencies",
    evaluate: (ctx) => ctx.hasUpstreamOnlyDowntime,
  },
  {
    type: "rapid_recovery",
    description: "Recovered from downtime in under 5 minutes",
    // NOTE(review): "<= 300" also accepts exactly 5 minutes, while the
    // description says "under" — confirm which boundary is intended.
    evaluate: (ctx) => withinMaximum(ctx.fastestRecoverySeconds, 300),
  },
];
|
|
65
|
+
|
|
66
|
+
/**
 * Per-system figures that achievement predicates are evaluated against.
 * Built by evaluateAchievements from all objectives of one system.
 */
interface AchievementContext {
  // --- Counts -------------------------------------------------------------
  /** Number of SLO objectives configured on the system. */
  objectiveCount: number;
  /** Longest current streak among the system's objectives (0 when none). */
  streakDays: number;

  // --- Best-of metrics (undefined when no objective supplied a value) ------
  /** Highest current availability reported by any objective. */
  bestAvailability: number | undefined;
  /** Highest remaining error-budget percentage reported by any objective. */
  budgetRemainingPercent: number | undefined;
  /** Shortest duration, in seconds, among the recent downtime events inspected. */
  fastestRecoverySeconds: number | undefined;

  // --- Flags ----------------------------------------------------------------
  /** True when no objective has consumed any error-budget minutes. */
  hasZeroDowntime: boolean;
  /** True when some objective consumed strict budget minutes but no regular ones. */
  hasUpstreamOnlyDowntime: boolean;
}
|
|
75
|
+
|
|
76
|
+
/** Number of most recent downtime events inspected per objective. */
const RECENT_DOWNTIME_EVENT_LIMIT = 10;

/**
 * Folds the status, streak and recent downtime events of every objective into
 * a single AchievementContext for the owning system.
 *
 * The three lookups for one objective do not depend on each other, so they are
 * issued together; objectives themselves are still processed one at a time.
 *
 * @param objectives - Objectives of one system, as returned by the service.
 * @param service - Source of streaks and recent downtime events.
 * @param engine - Computes the current status of an objective.
 * @returns The aggregated figures the achievement predicates are run against.
 */
async function buildAchievementContext({
  objectives,
  service,
  engine,
}: {
  objectives: Awaited<ReturnType<SloService["getObjectivesForSystem"]>>;
  service: SloService;
  engine: SloEngine;
}): Promise<AchievementContext> {
  let streakDays = 0;
  let bestAvailability: number | undefined;
  let budgetRemainingPercent: number | undefined;
  let hasZeroDowntime = true;
  let hasUpstreamOnlyDowntime = false;
  let fastestRecoverySeconds: number | undefined;

  for (const objective of objectives) {
    const [status, streak, recentEvents] = await Promise.all([
      engine.computeStatus({ objective }),
      service.getStreak({ objectiveId: objective.id }),
      service.getRecentDowntimeEvents({
        objectiveId: objective.id,
        limit: RECENT_DOWNTIME_EVENT_LIMIT,
      }),
    ]);

    // Longest current streak across all objectives.
    if (streak && streak.currentStreak > streakDays) {
      streakDays = streak.currentStreak;
    }

    // Highest availability across all objectives; objectives without a
    // current availability value are skipped.
    const availability = status.currentAvailability;
    if (
      availability !== undefined &&
      availability !== null &&
      (bestAvailability === undefined || availability > bestAvailability)
    ) {
      bestAvailability = availability;
    }

    // Highest remaining error budget across all objectives.
    if (
      budgetRemainingPercent === undefined ||
      status.errorBudgetRemainingPercent > budgetRemainingPercent
    ) {
      budgetRemainingPercent = status.errorBudgetRemainingPercent;
    }

    const consumedMinutes = status.errorBudgetConsumedMinutes;
    const consumedStrictMinutes = status.errorBudgetConsumedStrictMinutes;

    // Any consumed budget on any objective rules out a clean sheet.
    if (consumedMinutes > 0 || consumedStrictMinutes > 0) {
      hasZeroDowntime = false;
    }

    // cascade_breaker: strict budget was consumed while the regular budget
    // was untouched. NOTE(review): presumably the regular figure excludes
    // upstream-attributed downtime — confirm against the engine.
    if (consumedStrictMinutes > 0 && consumedMinutes === 0) {
      hasUpstreamOnlyDowntime = true;
    }

    // Shortest duration among the recent downtime events; events without a
    // duration are ignored.
    for (const event of recentEvents) {
      const duration = event.durationSeconds;
      if (
        duration !== undefined &&
        duration !== null &&
        (fastestRecoverySeconds === undefined ||
          duration < fastestRecoverySeconds)
      ) {
        fastestRecoverySeconds = duration;
      }
    }
  }

  return {
    objectiveCount: objectives.length,
    streakDays,
    bestAvailability,
    budgetRemainingPercent,
    hasZeroDowntime,
    hasUpstreamOnlyDowntime,
    fastestRecoverySeconds,
  };
}

/**
 * Evaluates every achievement definition for one system and unlocks those
 * whose criteria are met.
 *
 * A system without objectives earns nothing and no further lookups are made.
 * An achievement is reported only when `service.unlockAchievement` returns a
 * truthy value for it.
 *
 * @param systemId - System whose achievements are evaluated.
 * @param service - Provides objectives, streaks, downtime events and unlocking.
 * @param engine - Computes the current status of each objective.
 * @param logger - Receives one info entry per reported achievement.
 * @returns The achievement types reported as unlocked by this call.
 */
export async function evaluateAchievements({
  systemId,
  service,
  engine,
  logger,
}: {
  systemId: string;
  service: SloService;
  engine: SloEngine;
  logger: Logger;
}): Promise<AchievementType[]> {
  const objectives = await service.getObjectivesForSystem({ systemId });
  if (objectives.length === 0) return [];

  const context = await buildAchievementContext({
    objectives,
    service,
    engine,
  });

  const newlyUnlocked: AchievementType[] = [];

  for (const definition of ACHIEVEMENT_DEFINITIONS) {
    if (!definition.evaluate(context)) continue;

    const achievement = await service.unlockAchievement({
      systemId,
      achievement: definition.type,
    });
    if (achievement) {
      newlyUnlocked.push(definition.type);
      logger.info(
        `🏆 Achievement unlocked for system ${systemId}: ${definition.type}`,
      );
    }
  }

  return newlyUnlocked;
}
|
|
192
|
+
|
|
193
|
+
/**
 * Lists the type and description of every defined achievement, for display in
 * the frontend. The evaluation predicate of each definition is left out.
 */
export function getAchievementDefinitions() {
  return ACHIEVEMENT_DEFINITIONS.map(({ type, description }) => ({
    type,
    description,
  }));
}
|
package/src/hooks.ts
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
import { createHook } from "@checkstack/backend-api";
|
|
2
|
+
|
|
3
|
+
/** Payload shared by the budget warning and budget critical hooks. */
type SloBudgetThresholdPayload = {
  systemId: string;
  objectiveId: string;
  target: number;
  budgetRemainingPercent: number;
};

/** Payload of the budget exhausted hook. */
type SloBudgetExhaustedPayload = {
  systemId: string;
  objectiveId: string;
  target: number;
};

/** Payload of the streak broken hook. */
type SloStreakBrokenPayload = {
  systemId: string;
  objectiveId: string;
  streak: number;
  bestStreak: number;
};

/** Payload of the achievement unlocked hook. */
type SloAchievementUnlockedPayload = {
  systemId: string;
  achievement: string;
};

/** Payload of the weekly digest hook. */
type SloWeeklyDigestPayload = {
  totalObjectives: number;
  breachingCount: number;
  atRiskCount: number;
  healthyCount: number;
  topPerformers: Array<{
    systemName: string;
    availability: number;
    streakDays: number;
  }>;
  worstPerformers: Array<{
    systemName: string;
    availability: number;
    budgetRemainingPercent: number;
  }>;
};

/**
 * SLO hooks for cross-plugin communication.
 *
 * Other plugins can subscribe to these hooks to react to SLO lifecycle events.
 * They are registered as integration events so they flow through configured
 * notification channels.
 */
export const sloHooks = {
  /** Emitted when an SLO's error budget consumption exceeds the warning threshold. */
  sloBudgetWarning: createHook<SloBudgetThresholdPayload>("slo.budget.warning"),

  /** Emitted when an SLO's error budget consumption exceeds the critical threshold. */
  sloBudgetCritical: createHook<SloBudgetThresholdPayload>(
    "slo.budget.critical",
  ),

  /** Emitted when an SLO's error budget is fully exhausted. */
  sloBudgetExhausted: createHook<SloBudgetExhaustedPayload>(
    "slo.budget.exhausted",
  ),

  /** Emitted when a reliability streak is broken. */
  sloStreakBroken: createHook<SloStreakBrokenPayload>("slo.streak.broken"),

  /** Emitted when a system unlocks a new reliability achievement. */
  sloAchievementUnlocked: createHook<SloAchievementUnlockedPayload>(
    "slo.achievement.unlocked",
  ),

  /**
   * Emitted weekly with a summary of SLO performance across all systems.
   * Delivered through configured notification channels (Slack, Teams, etc.).
   */
  sloWeeklyDigest: createHook<SloWeeklyDigestPayload>("slo.weekly.digest"),
} as const;
|
package/src/index.ts
ADDED
|
@@ -0,0 +1,425 @@
|
|
|
1
|
+
import * as schema from "./schema";
|
|
2
|
+
import type { SafeDatabase } from "@checkstack/backend-api";
|
|
3
|
+
import { z } from "zod";
|
|
4
|
+
import {
|
|
5
|
+
sloAccessRules,
|
|
6
|
+
sloAccess,
|
|
7
|
+
pluginMetadata,
|
|
8
|
+
sloContract,
|
|
9
|
+
sloRoutes,
|
|
10
|
+
} from "@checkstack/slo-common";
|
|
11
|
+
import { createBackendPlugin, coreServices } from "@checkstack/backend-api";
|
|
12
|
+
import { integrationEventExtensionPoint } from "@checkstack/integration-backend";
|
|
13
|
+
import { SloService } from "./service";
|
|
14
|
+
import { SloEngine } from "./slo-engine";
|
|
15
|
+
import { createRouter } from "./router";
|
|
16
|
+
import { DependencyApi } from "@checkstack/dependency-common";
|
|
17
|
+
import { HealthCheckApi } from "@checkstack/healthcheck-common";
|
|
18
|
+
import { catalogHooks } from "@checkstack/catalog-backend";
|
|
19
|
+
import { healthCheckHooks } from "@checkstack/healthcheck-backend";
|
|
20
|
+
import { registerSearchProvider } from "@checkstack/command-backend";
|
|
21
|
+
import { resolveRoute } from "@checkstack/common";
|
|
22
|
+
import { sloHooks } from "./hooks";
|
|
23
|
+
import { setupDailySnapshotJob } from "./streak-calculator";
|
|
24
|
+
import { setupWeeklyDigestJob } from "./weekly-digest";
|
|
25
|
+
import { evaluateAchievements } from "./achievement-evaluator";
|
|
26
|
+
|
|
27
|
+
// =============================================================================
|
|
28
|
+
// Integration Event Payload Schemas
|
|
29
|
+
// =============================================================================
|
|
30
|
+
|
|
31
|
+
// The field sets below are produced by factory functions so that every payload
// schema is built from its own fresh zod instances, exactly as if each object
// had been written out in full.

/** Fields identifying the objective (and its target) an event refers to. */
const objectiveTargetFields = () => ({
  systemId: z.string(),
  objectiveId: z.string(),
  target: z.number(),
});

/** Objective fields plus the remaining error budget percentage. */
const budgetThresholdFields = () => ({
  ...objectiveTargetFields(),
  budgetRemainingPercent: z.number(),
});

/** Fields common to every performer entry of the weekly digest. */
const performerFields = () => ({
  systemName: z.string(),
  availability: z.number(),
});

/** Payload of the "slo.budget.warning" integration event. */
const sloBudgetWarningPayloadSchema = z.object(budgetThresholdFields());

/** Payload of the "slo.budget.critical" integration event. */
const sloBudgetCriticalPayloadSchema = z.object(budgetThresholdFields());

/** Payload of the "slo.budget.exhausted" integration event. */
const sloBudgetExhaustedPayloadSchema = z.object(objectiveTargetFields());

/** Payload of the "slo.streak.broken" integration event. */
const sloStreakBrokenPayloadSchema = z.object({
  systemId: z.string(),
  objectiveId: z.string(),
  streak: z.number(),
  bestStreak: z.number(),
});

/** Payload of the "slo.achievement.unlocked" integration event. */
const sloAchievementUnlockedPayloadSchema = z.object({
  systemId: z.string(),
  achievement: z.string(),
});

/** Payload of the "slo.weekly.digest" integration event. */
const sloWeeklyDigestPayloadSchema = z.object({
  totalObjectives: z.number(),
  breachingCount: z.number(),
  atRiskCount: z.number(),
  healthyCount: z.number(),
  topPerformers: z.array(
    z.object({
      ...performerFields(),
      streakDays: z.number(),
    }),
  ),
  worstPerformers: z.array(
    z.object({
      ...performerFields(),
      budgetRemainingPercent: z.number(),
    }),
  ),
});
|
|
83
|
+
|
|
84
|
+
// =============================================================================
|
|
85
|
+
// Plugin Definition
|
|
86
|
+
// =============================================================================
|
|
87
|
+
|
|
88
|
+
/**
 * SLO backend plugin.
 *
 * `register` declares the access rules and exposes every SLO hook as an
 * integration event. `init` builds the service and engine, registers the RPC
 * router and the command palette entries. `afterPluginsReady` subscribes to
 * health-check and catalog hooks and schedules the daily snapshot and weekly
 * digest jobs.
 */
export default createBackendPlugin({
  metadata: pluginMetadata,
  register(env) {
    env.registerAccessRules(sloAccessRules);

    // Register hooks as integration events
    const integrationEvents = env.getExtensionPoint(
      integrationEventExtensionPoint,
    );

    integrationEvents.registerEvent(
      {
        hook: sloHooks.sloBudgetWarning,
        displayName: "SLO Budget Warning",
        description:
          "Fired when an SLO error budget consumption exceeds the warning threshold",
        category: "SLO",
        payloadSchema: sloBudgetWarningPayloadSchema,
      },
      pluginMetadata,
    );

    integrationEvents.registerEvent(
      {
        hook: sloHooks.sloBudgetCritical,
        displayName: "SLO Budget Critical",
        description:
          "Fired when an SLO error budget consumption exceeds the critical threshold",
        category: "SLO",
        payloadSchema: sloBudgetCriticalPayloadSchema,
      },
      pluginMetadata,
    );

    integrationEvents.registerEvent(
      {
        hook: sloHooks.sloBudgetExhausted,
        displayName: "SLO Budget Exhausted",
        description: "Fired when an SLO error budget is fully consumed",
        category: "SLO",
        payloadSchema: sloBudgetExhaustedPayloadSchema,
      },
      pluginMetadata,
    );

    integrationEvents.registerEvent(
      {
        hook: sloHooks.sloStreakBroken,
        displayName: "SLO Streak Broken",
        description: "Fired when a reliability streak is broken",
        category: "SLO",
        payloadSchema: sloStreakBrokenPayloadSchema,
      },
      pluginMetadata,
    );

    integrationEvents.registerEvent(
      {
        hook: sloHooks.sloAchievementUnlocked,
        displayName: "SLO Achievement Unlocked",
        description:
          "Fired when a system unlocks a new reliability achievement",
        category: "SLO",
        payloadSchema: sloAchievementUnlockedPayloadSchema,
      },
      pluginMetadata,
    );

    integrationEvents.registerEvent(
      {
        hook: sloHooks.sloWeeklyDigest,
        displayName: "SLO Weekly Digest",
        description:
          "Weekly summary of SLO performance across all systems (Monday 09:00 UTC)",
        category: "SLO",
        payloadSchema: sloWeeklyDigestPayloadSchema,
      },
      pluginMetadata,
    );

    // Shared references across init/afterPluginsReady (maintenance-backend pattern).
    // Assigned in init; afterPluginsReady reads it to reach the engine that
    // was handed to the router.
    let sharedEngine: SloEngine;

    env.registerInit({
      schema,
      deps: {
        logger: coreServices.logger,
        rpc: coreServices.rpc,
        signalService: coreServices.signalService,
        rpcClient: coreServices.rpcClient,
        queueManager: coreServices.queueManager,
      },
      init: async ({ logger, database, rpc, signalService }) => {
        logger.debug("🔧 Initializing SLO Backend...");

        const service = new SloService(database as SafeDatabase<typeof schema>);
        const engine = new SloEngine({
          service,
          signalService,
          logger,
        });

        // Store for afterPluginsReady
        sharedEngine = engine;

        const router = createRouter({ service, engine, signalService });
        rpc.registerRouter(router, sloContract);

        // Register command palette entries
        registerSearchProvider({
          pluginMetadata,
          commands: [
            {
              id: "create-slo",
              title: "Create SLO",
              subtitle: "Define a new Service Level Objective",
              iconName: "Target",
              route: resolveRoute(sloRoutes.routes.config) + "?action=create",
              requiredAccessRules: [sloAccess.slo.manage],
            },
            {
              id: "manage-slos",
              title: "Manage SLOs",
              subtitle: "View and configure Service Level Objectives",
              iconName: "Target",
              shortcuts: ["meta+shift+l", "ctrl+shift+l"],
              route: resolveRoute(sloRoutes.routes.overview),
            },
          ],
        });

        logger.debug("✅ SLO Backend initialized.");
      },

      afterPluginsReady: async ({
        database,
        logger,
        onHook,
        emitHook,
        rpcClient,
        signalService,
        queueManager,
      }) => {
        // NOTE(review): a second service/engine pair is created here; only
        // sharedEngine (the router's engine) receives the health status
        // callback below — confirm the hook handlers do not need it as well.
        const typedDb = database as SafeDatabase<typeof schema>;
        const service = new SloService(typedDb);
        const engine = new SloEngine({
          service,
          signalService,
          logger,
        });

        const dependencyClient = rpcClient.forPlugin(DependencyApi);
        const healthCheckClient = rpcClient.forPlugin(HealthCheckApi);

        /**
         * Set health status callback on the shared engine instance
         * (the one used by the router). This enables reconcileObjective
         * to check current system health when SLOs are created.
         */
        sharedEngine.setHealthStatusCallback(async (systemId) => {
          try {
            const status = await healthCheckClient.getSystemHealthStatus({
              systemId,
            });
            logger.debug(
              `SLO reconcile: System ${systemId} health status = ${status.status}`,
            );
            return { isHealthy: status.status === "healthy" };
          } catch (error) {
            logger.warn(
              `SLO reconcile: Failed to get health status for system ${systemId}`,
              { error },
            );
            // Default to healthy if we can't determine status
            return { isHealthy: true };
          }
        });

        /**
         * Helper: check upstream health status via RPC loopback.
         * Injected as a callback into the engine for testability.
         * The system id doubles as the reported system name.
         */
        const getUpstreamHealthStatus = async ({
          upstreamSystemId,
        }: {
          upstreamSystemId: string;
        }) => {
          try {
            const healthStatus = await healthCheckClient.getSystemHealthStatus({
              systemId: upstreamSystemId,
            });
            return {
              isHealthy: healthStatus.status === "healthy",
              systemName: upstreamSystemId,
            };
          } catch (error) {
            // Fail-open: if we can't check upstream, assume healthy — but
            // leave a trace so the fallback is visible in the logs.
            logger.warn(
              `SLO: Failed to get health status for upstream system ${upstreamSystemId}`,
              { error },
            );
            return { isHealthy: true, systemName: upstreamSystemId };
          }
        };

        /**
         * Helper: get downstream dependents of a system.
         * Returns an empty list when the dependency lookup fails.
         */
        const getDownstreamSystemIds = async (
          systemId: string,
        ): Promise<string[]> => {
          try {
            const result = await dependencyClient.getDependencies({
              systemId,
              direction: "downstream",
            });
            return result.dependencies.map((d) => d.sourceSystemId);
          } catch (error) {
            // Best effort: upstream handling is skipped for this event, but
            // the failure is logged instead of being dropped silently.
            logger.warn(
              `SLO: Failed to get downstream systems for system ${systemId}`,
              { error },
            );
            return [];
          }
        };

        // =====================================================================
        // Perspective 1: System goes DOWN — open downtime events
        // =====================================================================
        onHook(
          healthCheckHooks.systemDegraded,
          async (payload) => {
            logger.debug(
              `SLO: System ${payload.systemId} degraded (${payload.previousStatus} → ${payload.newStatus})`,
            );
            await engine.handleSystemDown({
              systemId: payload.systemId,
              getUpstreamHealthStatus,
            });
          },
          { mode: "work-queue", workerGroup: "slo-system-down" },
        );

        // =====================================================================
        // Perspective 1: System goes UP — close downtime events
        // =====================================================================
        onHook(
          healthCheckHooks.systemHealthy,
          async (payload) => {
            logger.debug(`SLO: System ${payload.systemId} recovered`);
            await engine.handleSystemUp({
              systemId: payload.systemId,
            });

            // Also handle Perspective 2 (as upstream)
            const downstreamIds = await getDownstreamSystemIds(
              payload.systemId,
            );
            if (downstreamIds.length > 0) {
              await engine.handleUpstreamUp({
                upstreamSystemId: payload.systemId,
                downstreamSystemIds: downstreamIds,
                getUpstreamHealthStatus,
              });
            }

            // Evaluate achievements on recovery (rapid_recovery, clean_sheet, etc.)
            const unlockedAchievements = await evaluateAchievements({
              systemId: payload.systemId,
              service,
              engine,
              logger,
            });

            // Announce each achievement reported by this evaluation through
            // the registered integration event. A failed emit is logged and
            // does not fail the handler.
            for (const achievement of unlockedAchievements) {
              try {
                await emitHook(sloHooks.sloAchievementUnlocked, {
                  systemId: payload.systemId,
                  achievement,
                });
              } catch (error) {
                logger.warn(
                  `SLO: Failed to emit achievement ${achievement} for system ${payload.systemId}`,
                  { error },
                );
              }
            }
          },
          { mode: "work-queue", workerGroup: "slo-system-up" },
        );

        // =====================================================================
        // Perspective 2: Upstream degraded — split downstream "self" events
        // We re-use the systemDegraded hook, checking downstream systems
        // =====================================================================
        onHook(
          healthCheckHooks.systemDegraded,
          async (payload) => {
            const downstreamIds = await getDownstreamSystemIds(
              payload.systemId,
            );
            if (downstreamIds.length > 0) {
              await engine.handleUpstreamDown({
                upstreamSystemId: payload.systemId,
                upstreamSystemName: payload.systemName ?? payload.systemId,
                downstreamSystemIds: downstreamIds,
              });
            }
          },
          { mode: "work-queue", workerGroup: "slo-upstream-down" },
        );

        // =====================================================================
        // Subscribe to catalog system deletion for cleanup
        // =====================================================================
        onHook(
          catalogHooks.systemDeleted,
          async (payload) => {
            logger.debug(
              `Cleaning up SLO data for deleted system: ${payload.systemId}`,
            );
            await service.deleteObjectivesForSystem({
              systemId: payload.systemId,
            });
            await service.deleteAchievementsForSystem({
              systemId: payload.systemId,
            });
          },
          { mode: "work-queue", workerGroup: "slo-system-cleanup" },
        );

        // =====================================================================
        // Daily snapshot + streak calculation cron job
        // =====================================================================
        await setupDailySnapshotJob({
          service,
          engine,
          logger,
          queueManager,
        });

        // =====================================================================
        // Weekly digest cron job
        // =====================================================================
        await setupWeeklyDigestJob({
          service,
          engine,
          logger,
          queueManager,
          emitHook,
        });

        logger.debug("✅ SLO Backend afterPluginsReady complete.");
      },
    });
  },
});
|
|
423
|
+
|
|
424
|
+
// Re-export hooks for other plugins to use
|
|
425
|
+
export { sloHooks } from "./hooks";
|