@checkstack/anomaly-backend 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +125 -0
- package/drizzle/0000_soft_amphibian.sql +20 -0
- package/drizzle/0001_warm_spyke.sql +13 -0
- package/drizzle/0002_peaceful_krista_starr.sql +13 -0
- package/drizzle/0003_easy_maginty.sql +2 -0
- package/drizzle/meta/0000_snapshot.json +152 -0
- package/drizzle/meta/0001_snapshot.json +232 -0
- package/drizzle/meta/0002_snapshot.json +307 -0
- package/drizzle/meta/0003_snapshot.json +323 -0
- package/drizzle/meta/_journal.json +34 -0
- package/drizzle.config.ts +7 -0
- package/package.json +39 -0
- package/src/config.ts +8 -0
- package/src/detector.test.ts +894 -0
- package/src/detector.ts +361 -0
- package/src/drift-evaluator.test.ts +383 -0
- package/src/drift-evaluator.ts +231 -0
- package/src/index.ts +4 -0
- package/src/jobs/baseline-analyzer.ts +269 -0
- package/src/notification.ts +139 -0
- package/src/plugin.ts +85 -0
- package/src/router-cache.ts +89 -0
- package/src/router.ts +74 -0
- package/src/schema.ts +87 -0
- package/src/service.ts +163 -0
- package/tsconfig.json +6 -0
|
@@ -0,0 +1,269 @@
|
|
|
1
|
+
import type { CollectorRegistry, Logger, SafeDatabase } from "@checkstack/backend-api";
|
|
2
|
+
import type { CacheProvider } from "@checkstack/cache-api";
|
|
3
|
+
import type { CatalogApi } from "@checkstack/catalog-common";
|
|
4
|
+
import type { InferClient } from "@checkstack/common";
|
|
5
|
+
import type { HealthCheckApi, HealthCheckRunResult } from "@checkstack/healthcheck-common";
|
|
6
|
+
import { getHealthResultMeta } from "@checkstack/healthcheck-common";
|
|
7
|
+
import {
|
|
8
|
+
ANOMALY_BASELINE_UPDATED,
|
|
9
|
+
computeDominance,
|
|
10
|
+
computeLinearRegressionSlope,
|
|
11
|
+
computeMean,
|
|
12
|
+
computeStdDev,
|
|
13
|
+
type AnomalyDirection,
|
|
14
|
+
type FieldBaseline,
|
|
15
|
+
} from "@checkstack/anomaly-common";
|
|
16
|
+
import type { QueueManager } from "@checkstack/queue-api";
|
|
17
|
+
import type { SignalService } from "@checkstack/signal-common";
|
|
18
|
+
import type { z } from "zod";
|
|
19
|
+
import * as schema from "../schema";
|
|
20
|
+
import { AnomalyService } from "../service";
|
|
21
|
+
import { evaluateDrift } from "../drift-evaluator";
|
|
22
|
+
|
|
23
|
+
/** Name of the queue that `setupBaselineAnalyzerJob` consumes from and schedules its recurring job on. */
export const BASELINE_ANALYZER_QUEUE = "anomaly-baseline-analyzer";
|
|
24
|
+
|
|
25
|
+
/** Minimum data points required before any baseline is persisted (cold start). */
|
|
26
|
+
const MIN_BASELINE_SAMPLES = 24;
|
|
27
|
+
|
|
28
|
+
/** Samples harvested from one assignment's runs, grouped by field path. */
interface CollectedFieldSamples {
  /** Field path -> sample values, oldest first. */
  fieldValues: Record<string, (string | boolean | number)[]>;
  /** Collector-scoped field path -> `_collectorId` of the collector payload it came from. */
  fieldCollectorIds: Record<string, string>;
  /** Collector-scoped field path -> field name inside the collector payload. */
  fieldNames: Record<string, string>;
}

/**
 * Groups the primitive fields found in a list of runs into per-field sample
 * arrays.
 *
 * Two kinds of paths are produced:
 * - `latencyMs` for the run-level latency (when it is a number), and
 * - `collectors.<_collectorId>.<field>` for every number/string/boolean field
 *   of each collector payload. Fields whose name starts with `_` are skipped.
 *
 * @param runs Runs in DESCENDING timestamp order (as returned by
 *   `getRunsForAnalysis`); runs without a `result` are ignored.
 * @returns Sample arrays in chronological (ascending) order plus the
 *   collector id / field name for every collector-scoped path.
 */
function collectFieldSamples(
  runs: ReadonlyArray<{ result?: unknown }>,
): CollectedFieldSamples {
  const fieldValues: Record<string, (string | boolean | number)[]> = {};
  const fieldCollectorIds: Record<string, string> = {};
  const fieldNames: Record<string, string> = {};

  const append = (path: string, value: string | boolean | number): void => {
    (fieldValues[path] ??= []).push(value);
  };

  // Iterate in reverse so per-field arrays end up chronologically ascending —
  // a property the regression slope relies on.
  for (let i = runs.length - 1; i >= 0; i--) {
    const row = runs[i];
    if (!row.result) continue;

    const result = row.result as HealthCheckRunResult;

    if (typeof result.latencyMs === "number") {
      append("latencyMs", result.latencyMs);
    }

    const collectors = result.metadata?.collectors;
    if (!collectors) continue;

    for (const collectorData of Object.values(collectors)) {
      if (typeof collectorData !== "object" || collectorData === null) continue;
      const data = collectorData as Record<string, unknown>;
      const realCollectorId = data._collectorId;
      if (typeof realCollectorId !== "string") continue;

      for (const [fieldName, value] of Object.entries(data)) {
        // Underscore-prefixed keys (including `_collectorId`) are bookkeeping, not metrics.
        if (fieldName.startsWith("_")) continue;
        if (
          typeof value !== "number" &&
          typeof value !== "string" &&
          typeof value !== "boolean"
        ) {
          continue;
        }
        const fullPath = `collectors.${realCollectorId}.${fieldName}`;
        append(fullPath, value);
        fieldCollectorIds[fullPath] = realCollectorId;
        fieldNames[fullPath] = fieldName;
      }
    }
  }

  return { fieldValues, fieldCollectorIds, fieldNames };
}

/**
 * Computes the baseline statistics for one field.
 *
 * @param values Samples for the field, oldest first.
 * @param isNumeric Whether the samples should be treated as numbers. When
 *   false, `mean`, `stdDev` and `trendSlope` stay at 0 and only the dominance
 *   figures carry information.
 * @returns The baseline record, stamped with the current time.
 */
function computeFieldBaseline(
  values: (string | boolean | number)[],
  isNumeric: boolean,
) {
  let mean = 0;
  let stdDev = 0;
  let trendSlope = 0;
  let dominantValue: string | undefined;
  let dominantRatio: number | undefined;

  if (isNumeric) {
    const numValues = values as number[];
    mean = computeMean(numValues);
    stdDev = computeStdDev(numValues);
    trendSlope = computeLinearRegressionSlope(numValues);
  }

  const dom = computeDominance(values);
  if (dom.dominantValue !== undefined) {
    dominantValue = String(dom.dominantValue);
    dominantRatio = dom.dominantRatio;
  }

  return {
    mean,
    stdDev,
    trendSlope,
    dominantValue,
    dominantRatio,
    sampleCount: values.length,
    computedAt: new Date(),
  };
}

/**
 * Registers the baseline-analyzer queue consumer and schedules it to run at
 * minute 0 of every hour.
 *
 * Each run loads up to 200 runs per assignment from the last seven days and,
 * for every field with at least `MIN_BASELINE_SAMPLES` samples:
 * 1. upserts the baseline row (keyed by system, configuration and field path),
 * 2. writes the baseline to the cache under `baseline:<config>:<system>:<path>`,
 * 3. broadcasts `ANOMALY_BASELINE_UPDATED` for numeric fields when a signal
 *    service is available, and
 * 4. runs drift evaluation for numeric, collector-scoped fields.
 *
 * NOTE(review): nothing in this job invalidates the router-level baselines
 * cache, so list reads may lag a recompute until that cache's TTL expires —
 * confirm whether that is intended.
 *
 * @param db Database handle used for the baseline upsert and passed to drift evaluation.
 * @param cache Cache provider that receives the freshly computed baselines.
 * @param logger Logger for progress and warning output.
 * @param queueManager Provides the `BASELINE_ANALYZER_QUEUE` queue.
 * @param healthCheckClient Source of the runs to analyse.
 * @param signalService Optional; when present, numeric baseline updates are broadcast.
 * @param catalogClient Passed through to drift evaluation.
 * @param collectorRegistry Used to look up the schema-declared anomaly direction of a field.
 */
export async function setupBaselineAnalyzerJob({
  db,
  cache,
  logger,
  queueManager,
  healthCheckClient,
  signalService,
  catalogClient,
  collectorRegistry,
}: {
  db: SafeDatabase<typeof schema>;
  cache: CacheProvider;
  logger: Logger;
  queueManager: QueueManager;
  healthCheckClient: InferClient<typeof HealthCheckApi>;
  signalService?: SignalService;
  catalogClient: InferClient<typeof CatalogApi>;
  collectorRegistry: CollectorRegistry;
}): Promise<void> {
  const queue = queueManager.getQueue(BASELINE_ANALYZER_QUEUE);
  const anomalyService = new AnomalyService(db);

  await queue.consume(
    async (_job) => {
      logger.debug("Running anomaly baseline analyzer background job...");

      const sevenDaysAgo = new Date(Date.now() - 7 * 24 * 60 * 60 * 1000);

      const activeAssignments = await healthCheckClient.getRunsForAnalysis({
        startDate: sevenDaysAgo,
        limitPerAssignment: 200,
      });

      for (const assignment of activeAssignments) {
        // Per-assignment configuration is fetched once and reused per field.
        // A failed lookup is logged and leaves both configs undefined; the
        // baselines themselves are still recomputed.
        let templateConfig;
        let assignmentConfig;
        try {
          const templateRecord = await anomalyService.getAnomalyConfig(
            assignment.configurationId,
          );
          templateConfig = templateRecord.data;
          const assignmentRecord =
            await anomalyService.getAnomalyAssignmentConfig(
              assignment.systemId,
              assignment.configurationId,
            );
          assignmentConfig = assignmentRecord?.data;
        } catch (error) {
          logger.warn(
            `Failed to fetch anomaly config for ${assignment.configurationId}; skipping drift evaluation`,
            error,
          );
        }

        const { fieldValues, fieldCollectorIds, fieldNames } =
          collectFieldSamples(assignment.runs);

        for (const [path, values] of Object.entries(fieldValues)) {
          // Cold start: do not persist a baseline until enough samples exist.
          if (values.length < MIN_BASELINE_SAMPLES) continue;

          // NOTE(review): the field is classified from its oldest sample only;
          // a field whose type changed within the window is not detected here.
          const isNumeric = typeof values[0] === "number";
          const baseline = computeFieldBaseline(values, isNumeric);

          await db
            .insert(schema.anomalyBaselines)
            .values({
              systemId: assignment.systemId,
              configurationId: assignment.configurationId,
              fieldPath: path,
              ...baseline,
            })
            .onConflictDoUpdate({
              target: [
                schema.anomalyBaselines.systemId,
                schema.anomalyBaselines.configurationId,
                schema.anomalyBaselines.fieldPath,
              ],
              set: baseline,
            });

          const cacheKey = `baseline:${assignment.configurationId}:${assignment.systemId}:${path}`;
          await cache.set(
            cacheKey,
            { ...baseline, computedAt: baseline.computedAt.toISOString() },
            // Presumably a TTL in milliseconds (24h) — confirm against CacheProvider.
            1000 * 60 * 60 * 24,
          );

          if (signalService && isNumeric) {
            await signalService.broadcast(ANOMALY_BASELINE_UPDATED, {
              systemId: assignment.systemId,
              configurationId: assignment.configurationId,
              fieldPath: path,
              mean: baseline.mean,
              stdDev: baseline.stdDev,
              sampleCount: baseline.sampleCount,
            });
          }

          // Drift evaluation runs only for numeric fields scoped to a collector
          // (the path layout `collectors.${id}.${field}`). Run-level fields like
          // `latencyMs` are skipped because we have no schema-declared direction
          // for them.
          if (!isNumeric) continue;
          const collectorId = fieldCollectorIds[path];
          const fieldName = fieldNames[path];
          if (!collectorId || !fieldName) continue;

          const schemaDirection = lookupSchemaDirection({
            collectorRegistry,
            collectorId,
            fieldName,
          });

          const baselineDto: FieldBaseline = {
            ...baseline,
            computedAt: baseline.computedAt.toISOString(),
          };

          await evaluateDrift({
            db,
            logger,
            catalogClient,
            signalService,
            systemId: assignment.systemId,
            configurationId: assignment.configurationId,
            fieldPath: path,
            baseline: baselineDto,
            schemaDirection,
            templateConfig,
            assignmentConfig,
          });
        }
      }

      logger.debug("Anomaly baselines successfully recomputed.");
    },
    {
      consumerGroup: "anomaly-workers",
    },
  );

  // Schedule to run every hour
  await queue.scheduleRecurring(
    { trigger: "scheduled" },
    {
      jobId: "hourly-baseline-analysis",
      cronPattern: "0 * * * *",
    },
  );

  logger.debug("Anomaly baseline analyzer job scheduled.");
}
|
|
250
|
+
|
|
251
|
+
/**
 * Reads the `x-anomaly-direction` health-result metadata declared on one field
 * of a collector's result schema.
 *
 * @returns The declared direction, or `undefined` when the collector is not
 *   registered, its result schema exposes no `shape`, the field is not part of
 *   that shape, or the field carries no such metadata.
 */
function lookupSchemaDirection({
  collectorRegistry,
  collectorId,
  fieldName,
}: {
  collectorRegistry: CollectorRegistry;
  collectorId: string;
  fieldName: string;
}): AnomalyDirection | undefined {
  const registered = collectorRegistry.getCollector(collectorId);
  if (registered) {
    const resultSchema = registered.collector.result.schema;
    // Only schemas exposing a `shape` can be inspected field by field.
    if ("shape" in resultSchema) {
      const fields = resultSchema.shape as Record<string, z.ZodTypeAny>;
      const target = fields[fieldName];
      if (target) {
        return getHealthResultMeta(target)?.["x-anomaly-direction"];
      }
    }
  }
  return undefined;
}
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
import type { Logger } from "@checkstack/backend-api";
|
|
2
|
+
import type { CatalogApi } from "@checkstack/catalog-common";
|
|
3
|
+
import { catalogRoutes } from "@checkstack/catalog-common";
|
|
4
|
+
import type { InferClient } from "@checkstack/common";
|
|
5
|
+
import { resolveRoute } from "@checkstack/common";
|
|
6
|
+
|
|
7
|
+
/**
 * Kinds of anomaly notification that can be dispatched: an anomaly being
 * confirmed or recovered, and a trend drift being confirmed or recovered.
 */
export type AnomalyNotificationAction =
  "confirmed" | "recovered" | "drift_confirmed" | "drift_recovered";
|
|
12
|
+
|
|
13
|
+
/** Arguments accepted by `dispatchAnomalyNotification`. */
export interface DispatchAnomalyNotificationInput {
  /** Which notification to send; selects the copy and the importance level. */
  action: AnomalyNotificationAction;
  /** System whose subscribers are notified; also used as the display name when the lookup yields none. */
  systemId: string;
  /** Path of the affected field, shown in bold in the message body. */
  fieldPath: string;
  /** Observed value; numbers are rendered with two decimals, everything else via `String()`. */
  observedValue: string | boolean | number;
  /** Baseline mean, rendered with two decimals. */
  baselineMean: number;
  /** Catalog client used to look up the system and notify its subscribers. */
  catalogClient: InferClient<typeof CatalogApi>;
  /** Receives a warning when dispatching fails. */
  logger: Logger;
  /** Drift-specific: projected change over the baseline window. */
  projectedChange?: number;
}
|
|
24
|
+
|
|
25
|
+
/**
 * Sends one anomaly notification to the subscribers of a system, following the
 * Sidecar Notification Orchestration pattern.
 *
 * The function looks up the system (falling back to the id as display name),
 * resolves the system-detail URL, formats the numbers, builds the copy for the
 * given action, maps the action to an importance level and finally notifies
 * the system's subscribers, group subscribers included.
 *
 * Failures are isolated: any error raised along the way is logged as a warning
 * and never propagates to the caller.
 */
export async function dispatchAnomalyNotification({
  action,
  systemId,
  fieldPath,
  observedValue,
  baselineMean,
  catalogClient,
  logger,
  projectedChange,
}: DispatchAnomalyNotificationInput): Promise<void> {
  try {
    const system = await catalogClient.getSystem({ systemId });
    const systemName = system?.name ?? systemId;
    const actionUrl = resolveRoute(catalogRoutes.routes.systemDetail, {
      systemId,
    });

    let obsStr: string;
    if (typeof observedValue === "number") {
      obsStr = observedValue.toFixed(2);
    } else {
      obsStr = String(observedValue);
    }

    const baseStr = baselineMean.toFixed(2);

    // Non-negative projections get an explicit "+" sign; negatives already carry "-".
    let driftStr = "";
    if (projectedChange !== undefined) {
      const sign = projectedChange >= 0 ? "+" : "";
      driftStr = `${sign}${projectedChange.toFixed(2)}`;
    }

    const copy = buildNotificationCopy({
      action,
      systemName,
      fieldPath,
      obsStr,
      baseStr,
      driftStr,
    });
    const { title, message } = copy;

    const importance = getImportance(action);

    await catalogClient.notifySystemSubscribers({
      systemId,
      title,
      body: message,
      importance,
      action: { label: "View System", url: actionUrl },
      includeGroupSubscribers: true,
    });
  } catch (error) {
    logger.warn(
      `Failed to dispatch anomaly ${action} notification for ${systemId}`,
      error,
    );
  }
}
|
|
84
|
+
|
|
85
|
+
/**
 * Builds the title and message body for an anomaly notification.
 *
 * @param action Notification kind; selects the wording.
 * @param systemName Display name of the system, used in the title.
 * @param fieldPath Affected field, rendered in bold in the message.
 * @param obsStr Pre-formatted observed value.
 * @param baseStr Pre-formatted baseline mean.
 * @param driftStr Pre-formatted projected change; an empty string omits the
 *   projection sentence (only consulted for `drift_confirmed`).
 * @returns The notification title and message.
 * @throws Error when `action` is not one of the known actions (only reachable
 *   when the value bypassed the type system).
 */
function buildNotificationCopy({
  action,
  systemName,
  fieldPath,
  obsStr,
  baseStr,
  driftStr,
}: {
  action: AnomalyNotificationAction;
  systemName: string;
  fieldPath: string;
  obsStr: string;
  baseStr: string;
  driftStr: string;
}): { title: string; message: string } {
  switch (action) {
    case "confirmed": {
      return {
        title: `Anomaly Detected: ${systemName}`,
        message: `Anomaly confirmed for **${fieldPath}**. Observed: ${obsStr}, Baseline: ${baseStr}.`,
      };
    }
    case "recovered": {
      return {
        title: `Anomaly Recovered: ${systemName}`,
        message: `Anomaly recovered for **${fieldPath}**. Observed: ${obsStr}, Baseline: ${baseStr}.`,
      };
    }
    case "drift_confirmed": {
      const projectionFragment =
        driftStr === ""
          ? ""
          : ` Projected change over the baseline window: ${driftStr}.`;
      return {
        title: `Trend Drift Detected: ${systemName}`,
        message: `**${fieldPath}** is drifting. Current mean: ${obsStr}, Baseline: ${baseStr}.${projectionFragment}`,
      };
    }
    case "drift_recovered": {
      return {
        title: `Trend Drift Recovered: ${systemName}`,
        message: `**${fieldPath}** has stabilized. Current mean: ${obsStr}, Baseline: ${baseStr}.`,
      };
    }
    default: {
      // Exhaustiveness guard: adding a new action without a case fails to
      // compile here, and an untyped value fails loudly at runtime instead of
      // returning `undefined`.
      const unhandled: never = action;
      throw new Error(
        `Unhandled anomaly notification action: ${String(unhandled)}`,
      );
    }
  }
}
|
|
130
|
+
|
|
131
|
+
/**
 * Maps a notification action to its importance level (Action-Based Importance
 * Logic per the Sidecar Orchestration standard).
 *
 * @param action The notification action being dispatched.
 * @returns `"info"` for the recovery actions (`recovered`, `drift_recovered`),
 *   `"warning"` for everything else.
 */
export function getImportance(action: AnomalyNotificationAction): "info" | "warning" {
  switch (action) {
    case "recovered":
    case "drift_recovered":
      return "info";
    default:
      return "warning";
  }
}
|
package/src/plugin.ts
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
import { createBackendPlugin, coreServices, type SafeDatabase } from "@checkstack/backend-api";
|
|
2
|
+
import { healthCheckHooks } from "@checkstack/healthcheck-backend";
|
|
3
|
+
import { setupBaselineAnalyzerJob } from "./jobs/baseline-analyzer";
|
|
4
|
+
import { processCheckCompleted } from "./detector";
|
|
5
|
+
import * as schema from "./schema";
|
|
6
|
+
import { CatalogApi } from "@checkstack/catalog-common";
|
|
7
|
+
import { AnomalyService } from "./service";
|
|
8
|
+
import { createRouter } from "./router";
|
|
9
|
+
import { createAnomalyRouterCache, type AnomalyRouterCache } from "./router-cache";
|
|
10
|
+
import { anomalyContract, anomalyAccessRules } from "@checkstack/anomaly-common";
|
|
11
|
+
import { HealthCheckApi } from "@checkstack/healthcheck-common";
|
|
12
|
+
|
|
13
|
+
import { definePluginMetadata } from "@checkstack/common";
|
|
14
|
+
|
|
15
|
+
/**
 * Backend plugin for anomaly detection (`pluginId: "anomaly"`).
 *
 * - `init` sets up the baseline-analyzer job, creates the router cache and
 *   registers the RPC router for `anomalyContract`.
 * - `afterPluginsReady` subscribes to the health-check `checkCompleted` hook
 *   and forwards each payload to `processCheckCompleted`.
 */
export const plugin = createBackendPlugin({
  metadata: definePluginMetadata({ pluginId: "anomaly" }),
  register(env) {
    env.registerAccessRules(anomalyAccessRules);

    // Created in `init` and read lazily by the check-completed hook, so the
    // detector can drop the router cache before broadcasting state change
    // signals.
    let routerCache: AnomalyRouterCache | undefined;

    env.registerInit({
      schema,
      deps: {
        db: coreServices.database,
        logger: coreServices.logger,
        queueManager: coreServices.queueManager,
        cacheManager: coreServices.cacheManager, // Pre-req
        rpcClient: coreServices.rpcClient,
        rpc: coreServices.rpc,
        signalService: coreServices.signalService,
        collectorRegistry: coreServices.collectorRegistry,
      },
      init: async (deps) => {
        const { logger, cacheManager, rpcClient } = deps;
        logger.debug("Initializing Anomaly Detection Backend...");

        const typedDb = deps.db as SafeDatabase<typeof schema>;

        await setupBaselineAnalyzerJob({
          db: typedDb,
          cache: cacheManager.getProvider(),
          logger,
          queueManager: deps.queueManager,
          healthCheckClient: rpcClient.forPlugin(HealthCheckApi),
          signalService: deps.signalService,
          catalogClient: rpcClient.forPlugin(CatalogApi),
          collectorRegistry: deps.collectorRegistry,
        });

        const service = new AnomalyService(typedDb);
        const createdCache = createAnomalyRouterCache({ cacheManager, logger });
        routerCache = createdCache;
        deps.rpc.registerRouter(
          createRouter(service, logger, createdCache),
          anomalyContract,
        );

        logger.debug("Anomaly Detection Backend initialized.");
      },
      afterPluginsReady: async (ctx) => {
        const { onHook, logger, collectorRegistry, signalService } = ctx;
        const cache = ctx.cacheManager.getProvider();
        const typedDb = ctx.db as SafeDatabase<typeof schema>;
        const catalogClient = ctx.rpcClient.forPlugin(CatalogApi);

        onHook(healthCheckHooks.checkCompleted, async (payload) => {
          // `routerCache` is read at hook time, not at registration time.
          await processCheckCompleted({
            ...payload,
            db: typedDb,
            cache,
            routerCache,
            logger,
            catalogClient,
            signalService,
            collectorRegistry,
          });
        });
      },
    });
  },
});
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
import type { CacheManager } from "@checkstack/cache-api";
|
|
2
|
+
import {
|
|
3
|
+
createCachedScope,
|
|
4
|
+
type CachedScope,
|
|
5
|
+
} from "@checkstack/cache-utils";
|
|
6
|
+
import type { Logger } from "@checkstack/backend-api";
|
|
7
|
+
|
|
8
|
+
/**
|
|
9
|
+
* Router-level cache for the anomaly plugin's read APIs.
|
|
10
|
+
*
|
|
11
|
+
* NOTE: This is a *separate* cache from the baseline cache the detector
|
|
12
|
+
* uses (which lives in `detector.ts` / `jobs/baseline-analyzer.ts` and
|
|
13
|
+
* keys by `baseline:...`). Keeping them in distinct prefixes lets us
|
|
14
|
+
* invalidate one without touching the other.
|
|
15
|
+
*
|
|
16
|
+
* 15s TTL because the detector creates/updates anomaly rows on every
|
|
17
|
+
* relevant check completion; we drop the cache on every detector write
|
|
18
|
+
* but TTL acts as a freshness ceiling for any path that forgets.
|
|
19
|
+
*/
|
|
20
|
+
const ANOMALY_TTL_MS = 15_000;
|
|
21
|
+
|
|
22
|
+
/** Key prefix for cached anomaly-list reads; `invalidateAnomalies` drops every key under it. */
const ANOMALIES_PREFIX = "anomalies:";
|
|
23
|
+
/** Key prefix for cached baseline-list reads; `invalidateBaselines` drops every key under it. */
const BASELINES_PREFIX = "baselines:";
|
|
24
|
+
|
|
25
|
+
/**
 * Serialises a value to a deterministic JSON-like string suitable for use as a
 * cache key: object keys are emitted in sorted order, so two inputs that differ
 * only in property order produce the same string.
 *
 * Follows `JSON.stringify` conventions where they matter for key uniqueness:
 * - objects exposing `toJSON` (e.g. `Date`) are serialised through it, so
 *   distinct dates no longer collapse to the same `{}`;
 * - `undefined`, function and symbol property values are omitted from objects;
 * - `undefined`/function/symbol array items (and holes) are emitted as `null`,
 *   so `[undefined]` and `[]` stay distinct;
 * - a top-level `undefined` yields `"null"` rather than a non-string result.
 *
 * @param value Any JSON-compatible value (plus `Date` and other `toJSON` objects).
 * @returns The canonical string form of `value`.
 */
function stableStringify(value: unknown): string {
  // Resolve `toJSON` once, like JSON.stringify does; the result itself is not re-resolved.
  let resolved: unknown = value;
  if (typeof value === "object" && value !== null) {
    const toJson = (value as { toJSON?: unknown }).toJSON;
    if (typeof toJson === "function") {
      resolved = (toJson as () => unknown).call(value);
    }
  }

  if (resolved === null || typeof resolved !== "object") {
    // JSON.stringify returns undefined (not a string) for undefined, functions and symbols.
    return (JSON.stringify(resolved) as string | undefined) ?? "null";
  }

  if (Array.isArray(resolved)) {
    // Array.from visits holes as undefined, so sparse arrays serialise like JSON does.
    return `[${Array.from(resolved, (item) => stableStringify(item)).join(",")}]`;
  }

  const entries = Object.entries(resolved as Record<string, unknown>)
    .filter(
      ([, v]) =>
        v !== undefined && typeof v !== "function" && typeof v !== "symbol",
    )
    .toSorted(([a], [b]) => (a < b ? -1 : a > b ? 1 : 0));

  return `{${entries
    .map(([k, v]) => `${JSON.stringify(k)}:${stableStringify(v)}`)
    .join(",")}}`;
}
|
|
39
|
+
|
|
40
|
+
/** Builds the cache key for an anomaly-list read; a nullish input is keyed like `{}`. */
const anomaliesKey = (input: unknown): string => {
  const fingerprint = stableStringify(input ?? {});
  return ANOMALIES_PREFIX + fingerprint;
};
|
|
42
|
+
/** Builds the cache key for a baseline-list read; a nullish input is keyed like `{}`. */
const baselinesKey = (input: unknown): string => {
  const fingerprint = stableStringify(input ?? {});
  return BASELINES_PREFIX + fingerprint;
};
|
|
44
|
+
|
|
45
|
+
/** Cache facade used by the anomaly router's read endpoints. */
export interface AnomalyRouterCache {
  /**
   * Runs `loader` through the cache under a key derived from `input` in the
   * anomaly-list namespace. Presumably returns the cached value on a hit —
   * confirm against `CachedScope.wrap`.
   */
  wrapAnomalies: <T>(input: unknown, loader: () => Promise<T>) => Promise<T>;

  /** Same as `wrapAnomalies`, but keyed in the baseline-list namespace. */
  wrapBaselines: <T>(input: unknown, loader: () => Promise<T>) => Promise<T>;

  /**
   * Drops every cached anomaly list. The detector calls this after any
   * insert/update/delete on the anomalies table and before the
   * `ANOMALY_STATE_CHANGED` signal goes out, so frontend refetches see fresh
   * data.
   */
  invalidateAnomalies: () => Promise<number>;

  /**
   * Drops every cached baseline list; used after detector writes that affect
   * baselines.
   */
  invalidateBaselines: () => Promise<number>;

  /** The underlying cached scope backing this facade. */
  scope: CachedScope;
}
|
|
65
|
+
|
|
66
|
+
/**
 * Creates the router-level cache for the anomaly plugin.
 *
 * The cache lives in its own scope (`pluginId: "anomaly-router"`) with
 * `ANOMALY_TTL_MS` as the default TTL. Cache errors are reported to `logger`
 * as warnings.
 *
 * @param cacheManager Cache manager the scope is created from.
 * @param logger Receives a warning whenever a cache operation fails.
 * @returns The cache facade together with the underlying scope.
 */
export function createAnomalyRouterCache({
  cacheManager,
  logger,
}: {
  cacheManager: CacheManager;
  logger: Logger;
}): AnomalyRouterCache {
  const reportFailure = (op: string, error: unknown): void => {
    logger.warn(`anomaly router cache ${op} failed: ${String(error)}`);
  };

  const scope = createCachedScope({
    cacheManager,
    pluginId: "anomaly-router",
    defaultTtlMs: ANOMALY_TTL_MS,
    onError: reportFailure,
  });

  const routerCache: AnomalyRouterCache = {
    scope,
    wrapAnomalies(input, loader) {
      return scope.wrap(anomaliesKey(input), loader);
    },
    wrapBaselines(input, loader) {
      return scope.wrap(baselinesKey(input), loader);
    },
    invalidateAnomalies() {
      return scope.invalidatePrefix(ANOMALIES_PREFIX);
    },
    invalidateBaselines() {
      return scope.invalidatePrefix(BASELINES_PREFIX);
    },
  };
  return routerCache;
}
|
package/src/router.ts
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
import { implement } from "@orpc/server";
|
|
2
|
+
import { anomalyContract } from "@checkstack/anomaly-common";
|
|
3
|
+
import type { AnomalyService } from "./service";
|
|
4
|
+
import type { Logger } from "@checkstack/backend-api";
|
|
5
|
+
import type { VersionedRecord } from "@checkstack/backend-api";
|
|
6
|
+
import type { AnomalySettings } from "@checkstack/anomaly-common";
|
|
7
|
+
import type { AnomalyRouterCache } from "./router-cache";
|
|
8
|
+
|
|
9
|
+
/**
 * Builds the RPC router implementing `anomalyContract`.
 *
 * List reads (`getAnomalies`, `getAnomalyBaselines`) go through the router
 * cache; config reads hit the service directly. Both config-update handlers
 * drop the anomaly-list and baseline-list caches before returning.
 *
 * @param service Data access for anomalies, baselines and configuration.
 * @param logger Used for debug output on list reads.
 * @param cache Router cache wrapping the list reads.
 * @returns The router to register for `anomalyContract`.
 */
export function createRouter(
  service: AnomalyService,
  logger: Logger,
  cache: AnomalyRouterCache,
) {
  const os = implement(anomalyContract);

  // Config updates can change which fields are flagged; drop both
  // anomaly-list and baseline-list caches so any immediate refetch by the
  // admin UI sees fresh state.
  const dropListCaches = async (): Promise<void> => {
    await Promise.all([
      cache.invalidateAnomalies(),
      cache.invalidateBaselines(),
    ]);
  };

  const getAnomalies = os.getAnomalies.handler(async ({ input }) => {
    logger.debug("Fetching anomalies", { input });
    return cache.wrapAnomalies(input ?? {}, () =>
      service.getAnomalies(input ?? {}),
    );
  });

  const getAnomalyBaselines = os.getAnomalyBaselines.handler(
    async ({ input }) => {
      logger.debug("Fetching anomaly baselines", { input });
      return cache.wrapBaselines(input, () =>
        service.getAnomalyBaselines(input),
      );
    },
  );

  const getAnomalyConfig = os.getAnomalyConfig.handler(async ({ input }) => {
    const record = await service.getAnomalyConfig(input.configurationId);
    return record;
  });

  const updateAnomalyConfig = os.updateAnomalyConfig.handler(
    async ({ input }) => {
      const updated = await service.updateAnomalyConfig(
        input.configurationId,
        input.config,
      );
      await dropListCaches();
      return updated as VersionedRecord<AnomalySettings>;
    },
  );

  const getAnomalyAssignmentConfig = os.getAnomalyAssignmentConfig.handler(
    async ({ input }) => {
      const record = await service.getAnomalyAssignmentConfig(
        input.systemId,
        input.configurationId,
      );
      // A missing assignment override is reported as null.
      // eslint-disable-next-line unicorn/no-null
      return (record as VersionedRecord<Partial<AnomalySettings>>) ?? null;
    },
  );

  const updateAnomalyAssignmentConfig =
    os.updateAnomalyAssignmentConfig.handler(async ({ input }) => {
      const updated = await service.updateAnomalyAssignmentConfig(
        input.systemId,
        input.configurationId,
        input.config,
      );
      await dropListCaches();
      return updated as VersionedRecord<Partial<AnomalySettings>>;
    });

  return os.router({
    getAnomalies,
    getAnomalyBaselines,
    getAnomalyConfig,
    updateAnomalyConfig,
    getAnomalyAssignmentConfig,
    updateAnomalyAssignmentConfig,
  });
}
|