@checkstack/anomaly-backend 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +125 -0
- package/drizzle/0000_soft_amphibian.sql +20 -0
- package/drizzle/0001_warm_spyke.sql +13 -0
- package/drizzle/0002_peaceful_krista_starr.sql +13 -0
- package/drizzle/0003_easy_maginty.sql +2 -0
- package/drizzle/meta/0000_snapshot.json +152 -0
- package/drizzle/meta/0001_snapshot.json +232 -0
- package/drizzle/meta/0002_snapshot.json +307 -0
- package/drizzle/meta/0003_snapshot.json +323 -0
- package/drizzle/meta/_journal.json +34 -0
- package/drizzle.config.ts +7 -0
- package/package.json +39 -0
- package/src/config.ts +8 -0
- package/src/detector.test.ts +894 -0
- package/src/detector.ts +361 -0
- package/src/drift-evaluator.test.ts +383 -0
- package/src/drift-evaluator.ts +231 -0
- package/src/index.ts +4 -0
- package/src/jobs/baseline-analyzer.ts +269 -0
- package/src/notification.ts +139 -0
- package/src/plugin.ts +85 -0
- package/src/router-cache.ts +89 -0
- package/src/router.ts +74 -0
- package/src/schema.ts +87 -0
- package/src/service.ts +163 -0
- package/tsconfig.json +6 -0
package/src/detector.ts
ADDED
|
@@ -0,0 +1,361 @@
|
|
|
1
|
+
import type { SafeDatabase } from "@checkstack/backend-api";
|
|
2
|
+
import type { CacheProvider } from "@checkstack/cache-api";
|
|
3
|
+
import * as schema from "./schema";
|
|
4
|
+
import { eq, and } from "drizzle-orm";
|
|
5
|
+
import {
|
|
6
|
+
computeThresholds,
|
|
7
|
+
isAnomalous,
|
|
8
|
+
isCategoricalAnomalous,
|
|
9
|
+
resolveEffectiveConfig,
|
|
10
|
+
type FieldBaseline,
|
|
11
|
+
} from "@checkstack/anomaly-common";
|
|
12
|
+
import type { Logger } from "@checkstack/backend-api";
|
|
13
|
+
import type { CatalogApi } from "@checkstack/catalog-common";
|
|
14
|
+
import type { InferClient } from "@checkstack/common";
|
|
15
|
+
import { AnomalyService } from "./service";
|
|
16
|
+
import type { AnomalySettings, AnomalyDirection } from "@checkstack/anomaly-common";
|
|
17
|
+
import type { SignalService } from "@checkstack/signal-common";
|
|
18
|
+
import { ANOMALY_STATE_CHANGED } from "@checkstack/anomaly-common";
|
|
19
|
+
import { dispatchAnomalyNotification } from "./notification";
|
|
20
|
+
|
|
21
|
+
// ─────────────────────────────────────────────────────────────────────────────
|
|
22
|
+
// Inline Fast Detector (Phase 1)
|
|
23
|
+
// ─────────────────────────────────────────────────────────────────────────────
|
|
24
|
+
|
|
25
|
+
import type { CollectorRegistry } from "@checkstack/backend-api";
|
|
26
|
+
import { getHealthResultMeta } from "@checkstack/healthcheck-common";
|
|
27
|
+
import type { z } from "@checkstack/backend-api";
|
|
28
|
+
|
|
29
|
+
/** How long a baseline loaded from the database stays in the cache (1 hour). */
const BASELINE_CACHE_TTL_MS = 1000 * 60 * 60;

/** A primitive-valued collector field extracted from a check result. */
type CollectorField = {
  path: string;
  value: number | string | boolean;
  collectorId: string;
  fieldName: string;
};

/**
 * Flattens the `collectors` dictionary of a check result into the list of
 * primitive fields that can be compared against a baseline.
 *
 * `result` is keyed by collector instance UUID, e.g.
 * `{ "uuid-1234": { "_collectorId": "healthcheck-http.request", "responseTimeMs": 50 } }`.
 * Entries without a string `_collectorId`, keys starting with `_`, and
 * non-primitive values are skipped.
 */
function collectPrimitiveFields(
  result: Record<string, unknown>,
): CollectorField[] {
  const fields: CollectorField[] = [];

  for (const collectorData of Object.values(result)) {
    if (typeof collectorData !== "object" || collectorData === null) continue;

    const data = collectorData as Record<string, unknown>;
    const realCollectorId = data._collectorId;
    if (typeof realCollectorId !== "string") continue;

    for (const [fieldName, value] of Object.entries(data)) {
      if (fieldName.startsWith("_")) continue;
      if (
        typeof value === "number" ||
        typeof value === "string" ||
        typeof value === "boolean"
      ) {
        fields.push({
          path: `collectors.${realCollectorId}.${fieldName}`,
          value,
          collectorId: realCollectorId,
          fieldName,
        });
      }
    }
  }

  return fields;
}

/**
 * Returns the baseline for one field, reading the cache first and falling
 * back to the `anomalyBaselines` table. A baseline found in the database is
 * written back to the cache. Returns `undefined` when no baseline exists.
 */
async function loadBaseline({
  db,
  cache,
  systemId,
  configurationId,
  path,
}: {
  db: SafeDatabase<typeof schema>;
  cache: CacheProvider;
  systemId: string;
  configurationId: string;
  path: string;
}): Promise<FieldBaseline | undefined> {
  const cacheKey = `baseline:${configurationId}:${systemId}:${path}`;
  const cached = await cache.get<FieldBaseline>(cacheKey);
  if (cached) return cached;

  const [dbBaseline] = await db
    .select()
    .from(schema.anomalyBaselines)
    .where(
      and(
        eq(schema.anomalyBaselines.systemId, systemId),
        eq(schema.anomalyBaselines.configurationId, configurationId),
        eq(schema.anomalyBaselines.fieldPath, path),
      ),
    )
    .limit(1);

  if (!dbBaseline) return undefined;

  const baseline: FieldBaseline = {
    mean: dbBaseline.mean,
    stdDev: dbBaseline.stdDev,
    trendSlope: dbBaseline.trendSlope,
    sampleCount: dbBaseline.sampleCount,
    computedAt: dbBaseline.computedAt.toISOString(),
    dominantValue: dbBaseline.dominantValue ?? undefined,
    dominantRatio: dbBaseline.dominantRatio ?? undefined,
  };
  await cache.set(cacheKey, baseline, BASELINE_CACHE_TTL_MS);
  return baseline;
}

/**
 * Looks up the `x-anomaly-direction` metadata declared on a collector's
 * result schema for `fieldName`. Returns `undefined` when the collector is
 * not registered, its schema exposes no `shape`, or the field is not declared.
 */
function resolveSchemaDirection(
  collectorRegistry: CollectorRegistry,
  collectorId: string,
  fieldName: string,
): AnomalyDirection | undefined {
  const collector = collectorRegistry.getCollector(collectorId);
  if (!collector) return undefined;

  const collectorSchema = collector.collector.result.schema;
  if ("shape" in collectorSchema) {
    const shape = collectorSchema.shape as Record<string, z.ZodTypeAny>;
    const fieldSchema = shape[fieldName];
    if (fieldSchema) {
      const meta = getHealthResultMeta(fieldSchema);
      return meta?.["x-anomaly-direction"];
    }
  }
  return undefined;
}

/**
 * Inline anomaly detector run for every completed health check.
 *
 * Only results with status `"healthy"` are analyzed. Each primitive collector
 * field is compared against its stored baseline and the matching `spike` row
 * in the `anomalies` table is moved through its lifecycle:
 *
 * - anomalous, no row: insert a row in state `suspicious`
 * - anomalous, `suspicious`: increment the run count, or promote to `anomaly`
 *   once the count reaches the row's `confirmationThreshold`
 * - anomalous, `anomaly`: refresh the observed value and deviation
 * - normal, `suspicious`: delete the row
 * - normal, `anomaly`: mark the row `recovered`
 *
 * Inserts, confirmations, recoveries and dismissals invalidate the router
 * cache. Inserts, confirmations and recoveries also broadcast
 * `ANOMALY_STATE_CHANGED`; confirmations and recoveries additionally dispatch
 * a notification.
 *
 * A failure to load the anomaly configuration is logged and the run proceeds
 * with both configurations `undefined`.
 */
export async function processCheckCompleted({
  systemId,
  configurationId,
  status,
  latencyMs: _latencyMs,
  result,
  timestamp: _timestamp,
  db,
  cache,
  routerCache,
  logger,
  catalogClient,
  signalService,
  collectorRegistry,
}: {
  systemId: string;
  configurationId: string;
  status: string;
  latencyMs: number | undefined;
  result: Record<string, unknown> | undefined;
  timestamp: string;
  db: SafeDatabase<typeof schema>;
  cache: CacheProvider;
  routerCache?: {
    invalidateAnomalies: () => Promise<number>;
  };
  logger: Logger;
  catalogClient: InferClient<typeof CatalogApi>;
  signalService?: SignalService;
  collectorRegistry: CollectorRegistry;
}) {
  if (!result || status !== "healthy") {
    // Only analyze successful results for anomalies
    return;
  }

  const fieldsToCheck = collectPrimitiveFields(result);
  if (fieldsToCheck.length === 0) return;

  // F2 fix: Fetch config ONCE above the field loop (was N+1 inside the loop)
  const anomalyService = new AnomalyService(db);
  let templateConfig: AnomalySettings | undefined;
  let assignmentConfig: Partial<AnomalySettings> | undefined;
  try {
    const templateRecord =
      await anomalyService.getAnomalyConfig(configurationId);
    templateConfig = templateRecord.data;
    const assignmentRecord = await anomalyService.getAnomalyAssignmentConfig(
      systemId,
      configurationId,
    );
    assignmentConfig = assignmentRecord?.data ?? undefined;
  } catch (error) {
    logger.warn(
      `Failed to fetch anomaly configuration for ${configurationId}`,
      error,
    );
  }

  // Fields are processed sequentially so database writes happen in field order.
  for (const { path, value, collectorId, fieldName } of fieldsToCheck) {
    const baseline = await loadBaseline({
      db,
      cache,
      systemId,
      configurationId,
      path,
    });
    if (!baseline) {
      continue; // Learning phase (no baseline yet)
    }

    const {
      enabled: effectiveEnabled,
      sensitivity: effectiveSensitivity,
      confirmationWindow: effectiveConfirmation,
      direction: effectiveDirection,
    } = resolveEffectiveConfig(path, templateConfig, assignmentConfig);

    if (!effectiveEnabled) {
      continue;
    }

    // A configured direction wins over the one declared on the collector schema.
    const schemaDirection = resolveSchemaDirection(
      collectorRegistry,
      collectorId,
      fieldName,
    );
    const direction = effectiveDirection ?? schemaDirection;

    if (!direction) {
      continue; // No direction configured (e.g. explicitly disabled in schema)
    }

    let anomalous = false;
    // Stays 0 for categorical fields: a categorical shift has no standard
    // deviation to measure against.
    let deviation = 0;

    if (direction === "dominance") {
      anomalous = isCategoricalAnomalous(
        value,
        baseline.dominantValue,
        baseline.dominantRatio,
        effectiveSensitivity,
      );
    } else if (typeof value === "number") {
      const thresholds = computeThresholds(
        baseline.mean,
        baseline.stdDev,
        direction,
        effectiveSensitivity,
      );
      anomalous = isAnomalous(value, thresholds);
      deviation =
        baseline.stdDev > 0
          ? Math.abs(value - baseline.mean) / baseline.stdDev
          : 0;
    }

    // NOTE(review): this lookup does not filter on `state`. If it returns a
    // row in state "recovered", none of the branches below act on it —
    // confirm that recovered rows are cleaned up or excluded elsewhere.
    const [existingAnomaly] = await db
      .select()
      .from(schema.anomalies)
      .where(
        and(
          eq(schema.anomalies.systemId, systemId),
          eq(schema.anomalies.configurationId, configurationId),
          eq(schema.anomalies.fieldPath, path),
          eq(schema.anomalies.kind, "spike"),
        ),
      )
      .limit(1);

    if (anomalous) {
      if (!existingAnomaly) {
        const [inserted] = await db
          .insert(schema.anomalies)
          .values({
            systemId,
            configurationId,
            fieldPath: path,
            kind: "spike",
            state: "suspicious",
            // Non-numeric values cannot be compared to the mean; they are
            // recorded as "above".
            direction:
              typeof value === "number"
                ? value > baseline.mean
                  ? "above"
                  : "below"
                : "above",
            baselineValue: baseline.mean,
            baselineStdDev: baseline.stdDev,
            observedValue: String(value),
            deviation,
            suspiciousRunCount: 1,
            confirmationThreshold: effectiveConfirmation,
          })
          .returning({ id: schema.anomalies.id });

        // Invalidate router-level cache before signal so dashboards that
        // refetch in response see the new "suspicious" entry.
        if (inserted) {
          await routerCache?.invalidateAnomalies();
        }

        // F8: Emit signal for state transition
        if (signalService && inserted) {
          await signalService.broadcast(ANOMALY_STATE_CHANGED, {
            systemId,
            anomalyId: inserted.id,
            newState: "suspicious",
          });
        }
      } else if (existingAnomaly.state === "suspicious") {
        const newCount = existingAnomaly.suspiciousRunCount + 1;
        if (newCount >= existingAnomaly.confirmationThreshold) {
          await db
            .update(schema.anomalies)
            .set({
              state: "anomaly",
              confirmedAt: new Date(),
              observedValue: String(value),
              deviation,
            })
            .where(eq(schema.anomalies.id, existingAnomaly.id));
          logger.warn(`Anomaly confirmed for ${systemId} on ${path}`);

          await routerCache?.invalidateAnomalies();

          // F8: Emit signal for state transition
          if (signalService) {
            await signalService.broadcast(ANOMALY_STATE_CHANGED, {
              systemId,
              anomalyId: existingAnomaly.id,
              newState: "anomaly",
            });
          }

          // F1: Sidecar Notification Orchestration
          await dispatchAnomalyNotification({
            action: "confirmed",
            systemId,
            fieldPath: path,
            observedValue: value,
            baselineMean: baseline.mean,
            catalogClient,
            logger,
          });
        } else {
          await db
            .update(schema.anomalies)
            .set({
              suspiciousRunCount: newCount,
              observedValue: String(value),
              deviation,
            })
            .where(eq(schema.anomalies.id, existingAnomaly.id));
        }
      } else if (existingAnomaly.state === "anomaly") {
        await db
          .update(schema.anomalies)
          .set({
            observedValue: String(value),
            deviation,
          })
          .where(eq(schema.anomalies.id, existingAnomaly.id));
      }
    } else if (existingAnomaly) {
      if (existingAnomaly.state === "suspicious") {
        await db
          .delete(schema.anomalies)
          .where(eq(schema.anomalies.id, existingAnomaly.id));

        // The "suspicious" entry was published through the router cache when
        // it was inserted; drop the cached copy so dashboards stop showing an
        // entry that no longer exists.
        await routerCache?.invalidateAnomalies();
      } else if (existingAnomaly.state === "anomaly") {
        await db
          .update(schema.anomalies)
          .set({
            state: "recovered",
            recoveredAt: new Date(),
            observedValue: String(value),
          })
          .where(eq(schema.anomalies.id, existingAnomaly.id));
        logger.info(`Anomaly recovered for ${systemId} on ${path}`);

        await routerCache?.invalidateAnomalies();

        // F8: Emit signal for state transition
        if (signalService) {
          await signalService.broadcast(ANOMALY_STATE_CHANGED, {
            systemId,
            anomalyId: existingAnomaly.id,
            newState: "recovered",
          });
        }

        // F1: Sidecar Notification Orchestration
        await dispatchAnomalyNotification({
          action: "recovered",
          systemId,
          fieldPath: path,
          observedValue: value,
          baselineMean: baseline.mean,
          catalogClient,
          logger,
        });
      }
    }
  }
}
|