@checkstack/anomaly-backend 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,361 @@
1
+ import type { SafeDatabase } from "@checkstack/backend-api";
2
+ import type { CacheProvider } from "@checkstack/cache-api";
3
+ import * as schema from "./schema";
4
+ import { eq, and } from "drizzle-orm";
5
+ import {
6
+ computeThresholds,
7
+ isAnomalous,
8
+ isCategoricalAnomalous,
9
+ resolveEffectiveConfig,
10
+ type FieldBaseline,
11
+ } from "@checkstack/anomaly-common";
12
+ import type { Logger } from "@checkstack/backend-api";
13
+ import type { CatalogApi } from "@checkstack/catalog-common";
14
+ import type { InferClient } from "@checkstack/common";
15
+ import { AnomalyService } from "./service";
16
+ import type { AnomalySettings, AnomalyDirection } from "@checkstack/anomaly-common";
17
+ import type { SignalService } from "@checkstack/signal-common";
18
+ import { ANOMALY_STATE_CHANGED } from "@checkstack/anomaly-common";
19
+ import { dispatchAnomalyNotification } from "./notification";
20
+
21
+ // ─────────────────────────────────────────────────────────────────────────────
22
+ // Inline Fast Detector (Phase 1)
23
+ // ─────────────────────────────────────────────────────────────────────────────
24
+
25
+ import type { CollectorRegistry } from "@checkstack/backend-api";
26
+ import { getHealthResultMeta } from "@checkstack/healthcheck-common";
27
+ import type { z } from "@checkstack/backend-api";
28
+
29
/** A scalar collector field extracted from a health check result. */
type FieldToCheck = {
  path: string;
  value: number | string | boolean;
  collectorId: string;
  fieldName: string;
};

/** Shape returned by `resolveEffectiveConfig`; used to type helper parameters. */
type EffectiveConfig = ReturnType<typeof resolveEffectiveConfig>;

/**
 * Flattens the per-collector result dictionary into the scalar fields that can
 * be checked for anomalies.
 *
 * Entries that are not objects, or that lack a string `_collectorId`, are
 * skipped. Keys starting with `_` are skipped, as are values that are not a
 * number, string or boolean.
 */
function collectScalarFields(result: Record<string, unknown>): FieldToCheck[] {
  const fields: FieldToCheck[] = [];

  // `result` is the `collectors` dictionary where keys are UUIDs, e.g.
  // { "uuid-1234": { "_collectorId": "healthcheck-http.request", "responseTimeMs": 50 } }
  for (const collectorData of Object.values(result)) {
    if (typeof collectorData !== "object" || collectorData === null) continue;

    const data = collectorData as Record<string, unknown>;
    const realCollectorId = data._collectorId;
    if (typeof realCollectorId !== "string") continue;

    for (const [fieldName, value] of Object.entries(data)) {
      if (fieldName.startsWith("_")) continue;
      if (
        typeof value === "number" ||
        typeof value === "string" ||
        typeof value === "boolean"
      ) {
        fields.push({
          // The path is keyed by the collector id, not by the instance UUID.
          path: `collectors.${realCollectorId}.${fieldName}`,
          value,
          collectorId: realCollectorId,
          fieldName,
        });
      }
    }
  }

  return fields;
}

/**
 * Loads the baseline for one field: cache first, then the
 * `anomalyBaselines` table. A row found in the database is written back to the
 * cache. Returns `undefined` when neither source has a baseline.
 */
async function loadFieldBaseline({
  db,
  cache,
  systemId,
  configurationId,
  path,
}: {
  db: SafeDatabase<typeof schema>;
  cache: CacheProvider;
  systemId: string;
  configurationId: string;
  path: string;
}) {
  const cacheKey = `baseline:${configurationId}:${systemId}:${path}`;

  const cached = await cache.get<FieldBaseline>(cacheKey);
  if (cached) return cached;

  const [dbBaseline] = await db
    .select()
    .from(schema.anomalyBaselines)
    .where(
      and(
        eq(schema.anomalyBaselines.systemId, systemId),
        eq(schema.anomalyBaselines.configurationId, configurationId),
        eq(schema.anomalyBaselines.fieldPath, path),
      ),
    )
    .limit(1);

  if (!dbBaseline) return undefined;

  const baseline: FieldBaseline = {
    mean: dbBaseline.mean,
    stdDev: dbBaseline.stdDev,
    trendSlope: dbBaseline.trendSlope,
    sampleCount: dbBaseline.sampleCount,
    computedAt: dbBaseline.computedAt.toISOString(),
    dominantValue: dbBaseline.dominantValue ?? undefined,
    dominantRatio: dbBaseline.dominantRatio ?? undefined,
  };
  await cache.set(cacheKey, baseline, 1000 * 60 * 60); // 1 hour TTL
  return baseline;
}

/**
 * Reads the `x-anomaly-direction` metadata declared on a collector's result
 * schema for `fieldName`. Returns `undefined` when the collector is not
 * registered, its result schema has no `shape`, the field is not part of the
 * shape, or no such metadata is present.
 */
function lookupSchemaDirection(
  collectorRegistry: CollectorRegistry,
  collectorId: string,
  fieldName: string,
): AnomalyDirection | undefined {
  const collector = collectorRegistry.getCollector(collectorId);
  if (!collector) return undefined;

  const collectorSchema = collector.collector.result.schema;
  if (!("shape" in collectorSchema)) return undefined;

  const shape = collectorSchema.shape as Record<string, z.ZodTypeAny>;
  const fieldSchema = shape[fieldName];
  if (!fieldSchema) return undefined;

  const meta = getHealthResultMeta(fieldSchema);
  return meta?.["x-anomaly-direction"];
}

/**
 * Persists the outcome of evaluating one sample and emits the matching side
 * effects (router cache invalidation, state-change signal, notification).
 *
 * Transitions, driven by the stored "spike" row for the field:
 * - anomalous, no row: insert a row in state "suspicious"
 * - anomalous, "suspicious": increment the run count; once it reaches the
 *   row's `confirmationThreshold` the row becomes "anomaly"
 * - anomalous, "anomaly": refresh observed value and deviation
 * - normal, "suspicious": delete the row
 * - normal, "anomaly": mark the row "recovered"
 */
async function applyDetectionOutcome({
  systemId,
  configurationId,
  path,
  value,
  baseline,
  anomalous,
  deviation,
  confirmationThreshold,
  db,
  routerCache,
  logger,
  catalogClient,
  signalService,
}: {
  systemId: string;
  configurationId: string;
  path: string;
  value: number | string | boolean;
  baseline: FieldBaseline;
  anomalous: boolean;
  deviation: number;
  confirmationThreshold: EffectiveConfig["confirmationWindow"];
  db: SafeDatabase<typeof schema>;
  routerCache?: {
    invalidateAnomalies: () => Promise<number>;
  };
  logger: Logger;
  catalogClient: InferClient<typeof CatalogApi>;
  signalService?: SignalService;
}): Promise<void> {
  // NOTE(review): this lookup does not filter by state, and no branch below
  // handles a row whose state is "recovered". Unless such rows are removed
  // elsewhere, a recovered row would suppress all later detection for this
  // field — confirm against the schema / cleanup jobs.
  const [existingAnomaly] = await db
    .select()
    .from(schema.anomalies)
    .where(
      and(
        eq(schema.anomalies.systemId, systemId),
        eq(schema.anomalies.configurationId, configurationId),
        eq(schema.anomalies.fieldPath, path),
        eq(schema.anomalies.kind, "spike"),
      ),
    )
    .limit(1);

  if (!anomalous) {
    if (!existingAnomaly) return;

    if (existingAnomaly.state === "suspicious") {
      // Never confirmed: drop the row.
      await db
        .delete(schema.anomalies)
        .where(eq(schema.anomalies.id, existingAnomaly.id));
      return;
    }

    if (existingAnomaly.state === "anomaly") {
      await db
        .update(schema.anomalies)
        .set({
          state: "recovered",
          recoveredAt: new Date(),
          observedValue: String(value),
        })
        .where(eq(schema.anomalies.id, existingAnomaly.id));
      logger.info(`Anomaly recovered for ${systemId} on ${path}`);

      await routerCache?.invalidateAnomalies();

      // F8: Emit signal for state transition
      if (signalService) {
        await signalService.broadcast(ANOMALY_STATE_CHANGED, {
          systemId,
          anomalyId: existingAnomaly.id,
          newState: "recovered",
        });
      }

      // F1: Sidecar Notification Orchestration
      await dispatchAnomalyNotification({
        action: "recovered",
        systemId,
        fieldPath: path,
        observedValue: value,
        baselineMean: baseline.mean,
        catalogClient,
        logger,
      });
    }
    return;
  }

  if (!existingAnomaly) {
    // Non-numeric samples are stored with direction "above".
    const observedDirection =
      typeof value === "number"
        ? value > baseline.mean
          ? "above"
          : "below"
        : "above";

    const [inserted] = await db
      .insert(schema.anomalies)
      .values({
        systemId,
        configurationId,
        fieldPath: path,
        kind: "spike",
        state: "suspicious",
        direction: observedDirection,
        baselineValue: baseline.mean,
        baselineStdDev: baseline.stdDev,
        observedValue: String(value),
        deviation,
        suspiciousRunCount: 1,
        confirmationThreshold,
      })
      .returning({ id: schema.anomalies.id });

    // Invalidate router-level cache before signal so dashboards that
    // refetch in response see the new "suspicious" entry.
    if (inserted) {
      await routerCache?.invalidateAnomalies();
    }

    // F8: Emit signal for state transition
    if (signalService && inserted) {
      await signalService.broadcast(ANOMALY_STATE_CHANGED, {
        systemId,
        anomalyId: inserted.id,
        newState: "suspicious",
      });
    }
    return;
  }

  if (existingAnomaly.state === "suspicious") {
    const newCount = existingAnomaly.suspiciousRunCount + 1;

    if (newCount < existingAnomaly.confirmationThreshold) {
      // Still within the confirmation window: just record the new sample.
      await db
        .update(schema.anomalies)
        .set({
          suspiciousRunCount: newCount,
          observedValue: String(value),
          deviation,
        })
        .where(eq(schema.anomalies.id, existingAnomaly.id));
      return;
    }

    await db
      .update(schema.anomalies)
      .set({
        state: "anomaly",
        confirmedAt: new Date(),
        observedValue: String(value),
        deviation,
      })
      .where(eq(schema.anomalies.id, existingAnomaly.id));
    logger.warn(`Anomaly confirmed for ${systemId} on ${path}`);

    await routerCache?.invalidateAnomalies();

    // F8: Emit signal for state transition
    if (signalService) {
      await signalService.broadcast(ANOMALY_STATE_CHANGED, {
        systemId,
        anomalyId: existingAnomaly.id,
        newState: "anomaly",
      });
    }

    // F1: Sidecar Notification Orchestration
    await dispatchAnomalyNotification({
      action: "confirmed",
      systemId,
      fieldPath: path,
      observedValue: value,
      baselineMean: baseline.mean,
      catalogClient,
      logger,
    });
    return;
  }

  if (existingAnomaly.state === "anomaly") {
    // Already confirmed: keep the stored observation current.
    await db
      .update(schema.anomalies)
      .set({
        observedValue: String(value),
        deviation,
      })
      .where(eq(schema.anomalies.id, existingAnomaly.id));
  }
}

/**
 * Inline anomaly detector, run for a completed health check.
 *
 * Only results with status "healthy" and a defined `result` are analysed. Every
 * scalar collector field that has a stored baseline, is enabled by the
 * effective anomaly configuration, and has a detection direction (from the
 * configuration, falling back to the collector schema metadata) is compared
 * against its baseline; the outcome is persisted via the anomaly state
 * machine. Fields are processed sequentially.
 *
 * @param systemId - System the check ran against.
 * @param configurationId - Health check configuration that produced the result.
 * @param status - Check status; anything other than "healthy" is ignored.
 * @param latencyMs - Accepted for interface compatibility; unused.
 * @param result - The `collectors` dictionary of the check result.
 * @param timestamp - Accepted for interface compatibility; unused.
 * @param db - Database handle for baselines and anomalies.
 * @param cache - Cache used for baselines.
 * @param routerCache - Optional router-level cache, invalidated on state changes.
 * @param logger - Logger for warnings and state transitions.
 * @param catalogClient - Passed through to notification dispatch.
 * @param signalService - Optional; broadcasts `ANOMALY_STATE_CHANGED`.
 * @param collectorRegistry - Used to read schema-declared anomaly directions.
 */
export async function processCheckCompleted({
  systemId,
  configurationId,
  status,
  latencyMs: _latencyMs,
  result,
  timestamp: _timestamp,
  db,
  cache,
  routerCache,
  logger,
  catalogClient,
  signalService,
  collectorRegistry,
}: {
  systemId: string;
  configurationId: string;
  status: string;
  latencyMs: number | undefined;
  result: Record<string, unknown> | undefined;
  timestamp: string;
  db: SafeDatabase<typeof schema>;
  cache: CacheProvider;
  routerCache?: {
    invalidateAnomalies: () => Promise<number>;
  };
  logger: Logger;
  catalogClient: InferClient<typeof CatalogApi>;
  signalService?: SignalService;
  collectorRegistry: CollectorRegistry;
}): Promise<void> {
  if (!result || status !== "healthy") {
    // Only analyze successful results for anomalies
    return;
  }

  const fieldsToCheck = collectScalarFields(result);
  if (fieldsToCheck.length === 0) return;

  // F2 fix: Fetch config ONCE above the field loop (was N+1 inside the loop)
  const anomalyService = new AnomalyService(db);
  let templateConfig: AnomalySettings | undefined;
  let assignmentConfig: Partial<AnomalySettings> | undefined;
  try {
    const templateRecord =
      await anomalyService.getAnomalyConfig(configurationId);
    templateConfig = templateRecord.data;
    const assignmentRecord = await anomalyService.getAnomalyAssignmentConfig(
      systemId,
      configurationId,
    );
    assignmentConfig = assignmentRecord?.data ?? undefined;
  } catch (error) {
    // Best effort: on failure the loop below still runs, resolving the
    // effective config from whatever was loaded before the error.
    logger.warn(
      `Failed to fetch anomaly configuration for ${configurationId}`,
      error,
    );
  }

  for (const { path, value, collectorId, fieldName } of fieldsToCheck) {
    const baseline = await loadFieldBaseline({
      db,
      cache,
      systemId,
      configurationId,
      path,
    });
    if (!baseline) {
      continue; // Learning phase (no baseline yet)
    }

    const {
      enabled: effectiveEnabled,
      sensitivity: effectiveSensitivity,
      confirmationWindow: effectiveConfirmation,
      direction: effectiveDirection,
    } = resolveEffectiveConfig(path, templateConfig, assignmentConfig);

    if (!effectiveEnabled) {
      continue;
    }

    // A configured direction takes precedence over the schema-declared one.
    const schemaDirection = lookupSchemaDirection(
      collectorRegistry,
      collectorId,
      fieldName,
    );
    const direction = effectiveDirection ?? schemaDirection;

    if (!direction) {
      continue; // No direction configured (e.g. explicitly disabled in schema)
    }

    let anomalous = false;
    let deviation = 0; // Stays 0 for categorical samples (no standard deviation)

    if (direction === "dominance") {
      anomalous = isCategoricalAnomalous(
        value,
        baseline.dominantValue,
        baseline.dominantRatio,
        effectiveSensitivity,
      );
    } else if (typeof value === "number") {
      const thresholds = computeThresholds(
        baseline.mean,
        baseline.stdDev,
        direction,
        effectiveSensitivity,
      );
      anomalous = isAnomalous(value, thresholds);
      deviation =
        baseline.stdDev > 0
          ? Math.abs(value - baseline.mean) / baseline.stdDev
          : 0;
    }
    // A non-numeric value under a numeric direction falls through as
    // "not anomalous" and is still passed to the state machine below.

    await applyDetectionOutcome({
      systemId,
      configurationId,
      path,
      value,
      baseline,
      anomalous,
      deviation,
      confirmationThreshold: effectiveConfirmation,
      db,
      routerCache,
      logger,
      catalogClient,
      signalService,
    });
  }
}