@checkstack/anomaly-backend 1.1.8 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +133 -0
- package/drizzle/0005_cute_blonde_phantom.sql +3 -0
- package/drizzle/meta/0005_snapshot.json +419 -0
- package/drizzle/meta/_journal.json +7 -0
- package/package.json +16 -14
- package/src/ai-projection.test.ts +29 -0
- package/src/config-read.test.ts +115 -0
- package/src/config.ts +42 -2
- package/src/detector.test.ts +161 -0
- package/src/detector.ts +116 -8
- package/src/drift-evaluator.test.ts +96 -1
- package/src/drift-evaluator.ts +62 -2
- package/src/migration-chain-contract.test.ts +33 -0
- package/src/plugin.ts +19 -0
- package/src/router.ts +25 -6
- package/src/schema.ts +15 -0
- package/src/service.ts +146 -22
- package/src/suppression.test.ts +290 -0
- package/tsconfig.json +3 -0
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
import { describe, test, expect, mock } from "bun:test";
|
|
2
|
+
import { AnomalyService } from "./service";
|
|
3
|
+
import type { SafeDatabase } from "@checkstack/backend-api";
|
|
4
|
+
import type * as schema from "./schema";
|
|
5
|
+
|
|
6
|
+
/**
|
|
7
|
+
* Read-side migrate-then-validate guard. `getAnomalyConfig` and
|
|
8
|
+
* `getAnomalyAssignmentConfig` previously cast the stored jsonb straight to a
|
|
9
|
+
* typed `VersionedRecord`, bypassing the strategy schema entirely. They now run
|
|
10
|
+
* the stored record through `anomalySettingsConfig.parseRecord` /
|
|
11
|
+
* `anomalyAssignmentConfig.parseRecord`, so the data is validated (and migrated
|
|
12
|
+
* once migrations exist) on every read. These tests prove the read path goes
|
|
13
|
+
* through `.parseRecord`: stray keys are stripped and defaults are applied.
|
|
14
|
+
*/
|
|
15
|
+
|
|
16
|
+
/**
 * Builds a stand-in database whose `select().from().where()` chain always
 * resolves to a fixed row set.
 *
 * - `storedConfig === undefined` → the query resolves to no rows.
 * - anything else → the query resolves to one row, `{ config: storedConfig }`.
 *
 * Only that three-step chain is stubbed; per the original author's note it is
 * the only part of the database these read paths touch.
 */
function createSelectMockDb({
  storedConfig,
}: {
  storedConfig: unknown | undefined;
}): SafeDatabase<typeof schema> {
  const rows: { config: unknown }[] = [];
  if (storedConfig !== undefined) {
    rows.push({ config: storedConfig });
  }

  // Each step hands back a freshly created mock for the next step.
  const makeWhere = () => mock(() => Promise.resolve(rows));
  const makeFrom = () => mock(() => ({ where: makeWhere() }));
  const select = mock(() => ({ from: makeFrom() }));

  // The stub is deliberately partial, hence the double cast.
  return { select } as unknown as SafeDatabase<typeof schema>;
}
|
|
32
|
+
|
|
33
|
+
describe("getAnomalyConfig read path", () => {
  // Service backed by a mock database that returns `storedConfig` as the only
  // row (or no row at all when it is `undefined`).
  const serviceFor = (storedConfig: unknown): AnomalyService =>
    new AnomalyService(createSelectMockDb({ storedConfig }));

  test("validates the stored record via parseRecord (strips stray keys)", async () => {
    const service = serviceFor({
      version: 1,
      data: {
        enabled: false,
        baselineWindow: "30d",
        notify: false,
        // Unknown to AnomalySettingsSchema, so parsing is expected to drop it.
        legacyField: "should-be-dropped",
      },
    });

    const { version, data } = await service.getAnomalyConfig("cfg-1");

    expect(version).toBe(1);
    expect(data.enabled).toBe(false);
    expect(data.baselineWindow).toBe("30d");
    expect(data.notify).toBe(false);
    expect("legacyField" in data).toBe(false);
  });

  test("applies schema defaults to a sparse stored record", async () => {
    const service = serviceFor({ version: 1, data: {} });

    const { data } = await service.getAnomalyConfig("cfg-1");

    // An empty `data` can only come back populated if the schema was applied.
    expect(data.enabled).toBe(true);
    expect(data.baselineWindow).toBe("7d");
    expect(data.notify).toBe(true);
  });

  test("returns a validated default wrapper when no row exists", async () => {
    const service = serviceFor(undefined);

    const { version, data } = await service.getAnomalyConfig("cfg-missing");

    expect(version).toBe(1);
    expect(data.enabled).toBe(true);
  });
});
|
|
83
|
+
|
|
84
|
+
describe("getAnomalyAssignmentConfig read path", () => {
  // Looks up the override for a fixed system/configuration pair against a
  // mock database seeded with `storedConfig` (no row when `undefined`).
  const readOverride = (storedConfig: unknown) =>
    new AnomalyService(
      createSelectMockDb({ storedConfig }),
    ).getAnomalyAssignmentConfig("sys-1", "cfg-1");

  test("validates the stored override record via parseRecord", async () => {
    const result = await readOverride({
      version: 1,
      data: {
        enabled: true,
        // Unknown to PartialAnomalySettingsSchema, so parsing should drop it.
        bogus: 123,
      },
    });

    expect(result).toBeDefined();
    expect(result?.data.enabled).toBe(true);
    expect(result && "bogus" in result.data).toBe(false);
  });

  test("returns undefined when no override row exists", async () => {
    const result = await readOverride(undefined);

    expect(result).toBeUndefined();
  });
});
|
package/src/config.ts
CHANGED
|
@@ -1,8 +1,48 @@
|
|
|
1
|
+
import { z } from "zod";
|
|
1
2
|
import { Versioned } from "@checkstack/backend-api";
|
|
2
|
-
import {
|
|
3
|
-
import
|
|
3
|
+
import type { VersionedRecord } from "@checkstack/backend-api";
|
|
4
|
+
import {
|
|
5
|
+
AnomalySettingsSchema,
|
|
6
|
+
PartialAnomalySettingsSchema,
|
|
7
|
+
} from "@checkstack/anomaly-common";
|
|
8
|
+
import type {
|
|
9
|
+
AnomalySettings,
|
|
10
|
+
PartialAnomalySettings,
|
|
11
|
+
} from "@checkstack/anomaly-common";
|
|
12
|
+
|
|
13
|
+
/**
 * Envelope schema for the stored versioned record. Stored configs come back
 * from the jsonb column typed as `unknown`, so we validate the wrapper shape
 * (`{ version, data }`) before handing it to `Versioned.parseRecord`, which
 * then migrates + validates the inner `data` against the strategy schema.
 *
 * `migratedAt` is coerced instead of matched with a strict `z.date()`: JSON
 * has no date type, so a timestamp that has been written to and read back
 * from jsonb arrives as an ISO string rather than a `Date` instance. A strict
 * `z.date()` would reject such a record even though it is well-formed.
 * Coercion accepts both forms and still rejects values that do not produce a
 * valid date.
 */
const versionedEnvelopeSchema = z.object({
  version: z.number(),
  // Left opaque on purpose; the strategy schema validates it afterwards.
  data: z.unknown(),
  migratedAt: z.coerce.date().optional(),
  originalVersion: z.number().optional(),
});
|
|
25
|
+
|
|
26
|
+
/**
 * Checks that a raw stored config (`unknown`, as read from jsonb) has the
 * versioned-record wrapper shape and returns it typed as a
 * {@link VersionedRecord}, ready for `Versioned.parse`/`parseRecord`.
 *
 * Only the envelope is validated here; `data` stays `unknown`.
 *
 * @param stored - The raw value read from storage.
 * @returns The same record, narrowed to the envelope type.
 * @throws If `stored` does not match the envelope schema (zod's `.parse`
 *   throws on validation failure).
 */
export function toVersionedRecord(stored: unknown): VersionedRecord<unknown> {
  const envelope = versionedEnvelopeSchema.parse(stored);
  return envelope;
}
|
|
4
33
|
|
|
5
34
|
/**
 * Versioned config for the full anomaly settings: schema version 1,
 * validated against `AnomalySettingsSchema`.
 */
export const anomalySettingsConfig = new Versioned<AnomalySettings>({
  schema: AnomalySettingsSchema,
  version: 1,
});
|
|
38
|
+
|
|
39
|
+
/**
 * Versioned config for assignment-level overrides, validated against
 * `PartialAnomalySettingsSchema`.
 *
 * It is stored next to the template config and goes through
 * {@link Versioned.parseRecord} on read. The version is 1 and no migrations
 * are registered yet; because reads already pass through this object, a
 * future change of shape only needs a migration step added here.
 */
export const anomalyAssignmentConfig = new Versioned<PartialAnomalySettings>({
  schema: PartialAnomalySettingsSchema,
  version: 1,
});
|
package/src/detector.test.ts
CHANGED
|
@@ -886,6 +886,167 @@ describe("Anomaly Detector — processCheckCompleted", () => {
|
|
|
886
886
|
expect(broadcastPayload).toMatchObject({ newState: "recovered" });
|
|
887
887
|
});
|
|
888
888
|
|
|
889
|
+
// ─── PART A: self-resolution (settled at a new level) ─────────────────
|
|
890
|
+
|
|
891
|
+
test("self-resolves a confirmed anomaly once recent samples settle at a new level", async () => {
  const notificationClient = createMockNotificationClient(["user-1"]);
  const staleBaseline = createBaseline({ mean: 100, stdDev: 10 });
  const cache = createMockCache(new Map([[cacheKeyPrefix, staleBaseline]]));
  // The rolling window already holds four samples at the new level (~200).
  // This run's sample (anomalousResult = 200) is the fifth and completes the
  // window, which should trigger self-resolution.
  const db = createMockDb({
    existingAnomaly: {
      id: "anomaly-stuck",
      systemId,
      configurationId,
      fieldPath: "collectors.http.request.responseTimeMs",
      state: "anomaly",
      suspiciousRunCount: 5,
      confirmationThreshold: 3,
      baselineValue: 100,
      observedValue: "200",
      suppressedAt: null,
      suppressedValue: null,
      metadata: { recentSamples: [200, 200, 200, 200] },
    },
  });

  await processCheckCompleted({
    ...baseProps,
    latencyMs: 50,
    // (200 - 100) / 10 = 10σ away from the stale baseline, so still anomalous.
    result: anomalousResult,
    db: db as never,
    cache,
    logger: createMockLogger() as never,
    catalogClient: createMockCatalogClient() as never,
    notificationClient: notificationClient as never,
  });

  const updates = db._updateCalls;
  expect(updates.length).toBe(1);
  expect(updates[0]).toMatchObject({ state: "recovered" });
  // Exactly one recovery notification goes out.
  expect(notificationClient.notifyForSubscription).toHaveBeenCalledTimes(1);
});
|
|
931
|
+
|
|
932
|
+
test("does not self-resolve while the window is still filling", async () => {
  const staleBaseline = createBaseline({ mean: 100, stdDev: 10 });
  const cache = createMockCache(new Map([[cacheKeyPrefix, staleBaseline]]));
  // Only two samples recorded so far; this run adds the third.
  const db = createMockDb({
    existingAnomaly: {
      id: "anomaly-filling",
      systemId,
      configurationId,
      fieldPath: "collectors.http.request.responseTimeMs",
      state: "anomaly",
      suspiciousRunCount: 5,
      confirmationThreshold: 3,
      baselineValue: 100,
      observedValue: "200",
      suppressedAt: null,
      suppressedValue: null,
      metadata: { recentSamples: [200, 200] },
    },
  });

  await processCheckCompleted({
    ...baseProps,
    latencyMs: 50,
    result: anomalousResult,
    db: db as never,
    cache,
    logger: createMockLogger() as never,
    catalogClient: createMockCatalogClient() as never,
    notificationClient: createMockNotificationClient() as never,
  });

  const updates = db._updateCalls;
  expect(updates.length).toBe(1);
  // The row stays anomalous: the write carries the rolling window and the
  // observed value, but no state transition.
  expect(updates[0]).not.toHaveProperty("state");
  expect(updates[0].metadata).toMatchObject({
    recentSamples: [200, 200, 200],
  });
});
|
|
970
|
+
|
|
971
|
+
// ─── PART B: auto-unsuppress ("changes again") ────────────────────────
|
|
972
|
+
|
|
973
|
+
// Negative case for auto-unsuppress: the anomaly was suppressed at 200 and the
// new sample is also 200, so the value has not "changed again" and the
// suppression must stay in place. (The previous title claimed the opposite of
// what the assertions check.)
test("does not auto-unsuppress a suppressed anomaly while the value stays within the reactivation band", async () => {
  const baseline = createBaseline({ mean: 100, stdDev: 10 });
  const cache = createMockCache(new Map([[cacheKeyPrefix, baseline]]));
  const db = createMockDb({
    existingAnomaly: {
      id: "anomaly-suppressed",
      systemId,
      configurationId,
      fieldPath: "collectors.http.request.responseTimeMs",
      state: "anomaly",
      suspiciousRunCount: 5,
      confirmationThreshold: 3,
      baselineValue: 100,
      observedValue: "200",
      suppressedAt: new Date(),
      suppressedValue: 200, // suppressed at 200; the incoming sample is also 200
      metadata: {},
    },
  });

  // anomalousResult is 200, identical to suppressedValue, so it lies inside
  // the reactivation band and must NOT lift the suppression.
  await processCheckCompleted({
    ...baseProps,
    latencyMs: 50,
    result: anomalousResult,
    db: db as never,
    cache,
    logger: createMockLogger() as never,
    catalogClient: createMockCatalogClient() as never,
    notificationClient: createMockNotificationClient() as never,
  });

  // No recorded update may clear `suppressedAt`.
  const unsuppressed = db._updateCalls.find(
    (c) => c.suppressedAt === null,
  );
  expect(unsuppressed).toBeUndefined();
});
|
|
1009
|
+
|
|
1010
|
+
test("auto-unsuppresses when the value moves outside the reactivation band", async () => {
  // The baseline sits far below the new sample, so the sample is still
  // anomalous and reaches the anomaly branch. The row was suppressed at 50 and
  // the observed value jumps to 200 (>25% move).
  const lowBaseline = createBaseline({ mean: 100, stdDev: 10 });
  const cache = createMockCache(new Map([[cacheKeyPrefix, lowBaseline]]));
  const db = createMockDb({
    existingAnomaly: {
      id: "anomaly-suppressed-2",
      systemId,
      configurationId,
      fieldPath: "collectors.http.request.responseTimeMs",
      state: "anomaly",
      suspiciousRunCount: 5,
      confirmationThreshold: 3,
      baselineValue: 100,
      observedValue: "50",
      suppressedAt: new Date(),
      suppressedValue: 50,
      metadata: {},
    },
  });

  await processCheckCompleted({
    ...baseProps,
    latencyMs: 50,
    // 200 — a long way from the suppressedValue of 50.
    result: anomalousResult,
    db: db as never,
    cache,
    logger: createMockLogger() as never,
    catalogClient: createMockCatalogClient() as never,
    notificationClient: createMockNotificationClient() as never,
  });

  const updates = db._updateCalls;
  expect(updates.length).toBe(1);
  // The single write lifts the suppression.
  expect(updates[0]).toMatchObject({
    suppressedAt: null,
    suppressedValue: null,
  });
});
|
|
1049
|
+
|
|
889
1050
|
// ─── Notification resilience ──────────────────────────────────────────
|
|
890
1051
|
|
|
891
1052
|
test("does not crash when notification dispatch fails", async () => {
|
package/src/detector.ts
CHANGED
|
@@ -7,6 +7,10 @@ import {
|
|
|
7
7
|
isAnomalous,
|
|
8
8
|
isCategoricalAnomalous,
|
|
9
9
|
resolveEffectiveConfig,
|
|
10
|
+
appendRecentSample,
|
|
11
|
+
hasSettledAtNewLevel,
|
|
12
|
+
hasChangedSinceSuppression,
|
|
13
|
+
type AnomalyMetadata,
|
|
10
14
|
type FieldBaseline,
|
|
11
15
|
} from "@checkstack/anomaly-common";
|
|
12
16
|
import type { Logger } from "@checkstack/backend-api";
|
|
@@ -304,7 +308,7 @@ export async function processCheckCompleted({
|
|
|
304
308
|
deviation,
|
|
305
309
|
})
|
|
306
310
|
.where(eq(schema.anomalies.id, existingAnomaly.id));
|
|
307
|
-
logger.
|
|
311
|
+
logger.debug(`Anomaly confirmed for ${systemId} on ${path}`);
|
|
308
312
|
|
|
309
313
|
await routerCache?.invalidateAnomalies();
|
|
310
314
|
|
|
@@ -340,13 +344,112 @@ export async function processCheckCompleted({
|
|
|
340
344
|
.where(eq(schema.anomalies.id, existingAnomaly.id));
|
|
341
345
|
}
|
|
342
346
|
} else if (existingAnomaly.state === "anomaly") {
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
347
|
+
// PART B: a suppressed anomaly auto-unsuppresses once the metric
|
|
348
|
+
// "changes again" — the observed value moves outside the relative band
|
|
349
|
+
// around the value it was suppressed at. We detect that here (the only
|
|
350
|
+
// place fresh samples flow through) before the self-resolution check so
|
|
351
|
+
// a re-activated anomaly resumes normal lifecycle handling.
|
|
352
|
+
const suppressedValue = existingAnomaly.suppressedValue;
|
|
353
|
+
if (
|
|
354
|
+
existingAnomaly.suppressedAt &&
|
|
355
|
+
typeof value === "number" &&
|
|
356
|
+
typeof suppressedValue === "number" &&
|
|
357
|
+
hasChangedSinceSuppression({
|
|
358
|
+
observedValue: value,
|
|
359
|
+
suppressedValue,
|
|
348
360
|
})
|
|
349
|
-
|
|
361
|
+
) {
|
|
362
|
+
await db
|
|
363
|
+
.update(schema.anomalies)
|
|
364
|
+
.set({
|
|
365
|
+
suppressedAt: null,
|
|
366
|
+
suppressedValue: null,
|
|
367
|
+
suppressedBaseline: null,
|
|
368
|
+
observedValue: String(value),
|
|
369
|
+
deviation,
|
|
370
|
+
})
|
|
371
|
+
.where(eq(schema.anomalies.id, existingAnomaly.id));
|
|
372
|
+
await routerCache?.invalidateAnomalies();
|
|
373
|
+
if (signalService) {
|
|
374
|
+
await signalService.broadcast(ANOMALY_STATE_CHANGED, {
|
|
375
|
+
systemId,
|
|
376
|
+
anomalyId: existingAnomaly.id,
|
|
377
|
+
newState: "anomaly",
|
|
378
|
+
});
|
|
379
|
+
}
|
|
380
|
+
continue;
|
|
381
|
+
}
|
|
382
|
+
|
|
383
|
+
// PART A: self-resolution. The value is still anomalous against the
|
|
384
|
+
// (stale) baseline, but if the recent healthy samples have settled into
|
|
385
|
+
// a tight relative band the metric has found a new normal — resolve
|
|
386
|
+
// independently of the slow baseline analyzer. We keep a rolling window
|
|
387
|
+
// of recent samples on the row's metadata (shared Postgres, so every
|
|
388
|
+
// pod sees the same window).
|
|
389
|
+
if (typeof value === "number") {
|
|
390
|
+
const metadata = (existingAnomaly.metadata ??
|
|
391
|
+
{}) as AnomalyMetadata;
|
|
392
|
+
const recentSamples = appendRecentSample(
|
|
393
|
+
metadata.recentSamples,
|
|
394
|
+
value,
|
|
395
|
+
);
|
|
396
|
+
|
|
397
|
+
if (hasSettledAtNewLevel(recentSamples)) {
|
|
398
|
+
await db
|
|
399
|
+
.update(schema.anomalies)
|
|
400
|
+
.set({
|
|
401
|
+
state: "recovered",
|
|
402
|
+
recoveredAt: new Date(),
|
|
403
|
+
observedValue: String(value),
|
|
404
|
+
deviation,
|
|
405
|
+
metadata: { ...metadata, recentSamples: [] },
|
|
406
|
+
})
|
|
407
|
+
.where(eq(schema.anomalies.id, existingAnomaly.id));
|
|
408
|
+
logger.debug(
|
|
409
|
+
`Anomaly self-resolved (settled at new level) for ${systemId} on ${path}`,
|
|
410
|
+
);
|
|
411
|
+
|
|
412
|
+
await routerCache?.invalidateAnomalies();
|
|
413
|
+
|
|
414
|
+
if (signalService) {
|
|
415
|
+
await signalService.broadcast(ANOMALY_STATE_CHANGED, {
|
|
416
|
+
systemId,
|
|
417
|
+
anomalyId: existingAnomaly.id,
|
|
418
|
+
newState: "recovered",
|
|
419
|
+
});
|
|
420
|
+
}
|
|
421
|
+
|
|
422
|
+
await dispatchAnomalyNotification({
|
|
423
|
+
action: "recovered",
|
|
424
|
+
systemId,
|
|
425
|
+
fieldPath: path,
|
|
426
|
+
observedValue: value,
|
|
427
|
+
baselineMean: baseline.mean,
|
|
428
|
+
catalogClient,
|
|
429
|
+
notificationClient,
|
|
430
|
+
db,
|
|
431
|
+
logger,
|
|
432
|
+
});
|
|
433
|
+
continue;
|
|
434
|
+
}
|
|
435
|
+
|
|
436
|
+
await db
|
|
437
|
+
.update(schema.anomalies)
|
|
438
|
+
.set({
|
|
439
|
+
observedValue: String(value),
|
|
440
|
+
deviation,
|
|
441
|
+
metadata: { ...metadata, recentSamples },
|
|
442
|
+
})
|
|
443
|
+
.where(eq(schema.anomalies.id, existingAnomaly.id));
|
|
444
|
+
} else {
|
|
445
|
+
await db
|
|
446
|
+
.update(schema.anomalies)
|
|
447
|
+
.set({
|
|
448
|
+
observedValue: String(value),
|
|
449
|
+
deviation,
|
|
450
|
+
})
|
|
451
|
+
.where(eq(schema.anomalies.id, existingAnomaly.id));
|
|
452
|
+
}
|
|
350
453
|
}
|
|
351
454
|
} else {
|
|
352
455
|
if (existingAnomaly) {
|
|
@@ -361,9 +464,14 @@ export async function processCheckCompleted({
|
|
|
361
464
|
state: "recovered",
|
|
362
465
|
recoveredAt: new Date(),
|
|
363
466
|
observedValue: String(value),
|
|
467
|
+
// Baseline-relative recovery clears any active suppression and the
|
|
468
|
+
// rolling self-resolution window — the row is no longer active.
|
|
469
|
+
suppressedAt: null,
|
|
470
|
+
suppressedValue: null,
|
|
471
|
+
suppressedBaseline: null,
|
|
364
472
|
})
|
|
365
473
|
.where(eq(schema.anomalies.id, existingAnomaly.id));
|
|
366
|
-
logger.
|
|
474
|
+
logger.debug(`Anomaly recovered for ${systemId} on ${path}`);
|
|
367
475
|
|
|
368
476
|
await routerCache?.invalidateAnomalies();
|
|
369
477
|
|
|
@@ -1,7 +1,11 @@
|
|
|
1
1
|
import { describe, test, expect, mock } from "bun:test";
|
|
2
2
|
import { evaluateDrift } from "./drift-evaluator";
|
|
3
3
|
import * as schema from "./schema";
|
|
4
|
-
import
|
|
4
|
+
import {
|
|
5
|
+
STABLE_DRIFT_RESOLUTION_RUN_COUNT,
|
|
6
|
+
type AnomalySettings,
|
|
7
|
+
type FieldBaseline,
|
|
8
|
+
} from "@checkstack/anomaly-common";
|
|
5
9
|
|
|
6
10
|
function createBaseline(overrides: Partial<FieldBaseline> = {}): FieldBaseline {
|
|
7
11
|
return {
|
|
@@ -357,6 +361,97 @@ describe("evaluateDrift", () => {
|
|
|
357
361
|
expect(notification.notifyForSubscription).toHaveBeenCalledTimes(1);
|
|
358
362
|
});
|
|
359
363
|
|
|
364
|
+
// ─── PART A: drift self-resolution (settled at a new level) ──────────
|
|
365
|
+
|
|
366
|
+
// Fixture for "settled at a high new level". The slope test still fires
// (slope × n = 1.5 × 100 = 150, far above 2 × σ = 20), but the projected
// change is small next to the new mean (150 / 10000 = 1.5%, below the band).
// The 7-day window simply has not caught up with the new level yet.
const flatHighMeanBaseline = createBaseline({
  sampleCount: 100,
  trendSlope: 1.5,
  stdDev: 10,
  mean: 10000,
});
|
|
375
|
+
|
|
376
|
+
test("self-resolves a confirmed drift once slope is flat relative to the new mean for N runs", async () => {
  const stuckDrift = {
    id: "drift-stuck",
    state: "anomaly",
    suspiciousRunCount: 2,
    confirmationThreshold: 2,
    // The counter is one short of the threshold, so this run reaches it.
    metadata: { stableDriftRunCount: STABLE_DRIFT_RESOLUTION_RUN_COUNT - 1 },
  };
  const notification = createMockNotificationClient();
  const db = createMockDb({ existingAnomaly: stuckDrift });

  await evaluateDrift({
    ...baseProps,
    baseline: flatHighMeanBaseline,
    schemaDirection: "lower-is-better",
    templateConfig: defaultTemplate,
    db: db as never,
    catalogClient: createMockCatalogClient() as never,
    notificationClient: notification as never,
    logger: createMockLogger() as never,
  });

  const updates = db._updateCalls;
  expect(updates.length).toBe(1);
  expect(updates[0].state).toBe("recovered");
  expect(notification.notifyForSubscription).toHaveBeenCalledTimes(1);
});
|
|
401
|
+
|
|
402
|
+
test("accumulates the flat-run counter without resolving prematurely", async () => {
  // No counter recorded yet, so this flat run should bring it to 1.
  const countingDrift = {
    id: "drift-counting",
    state: "anomaly",
    suspiciousRunCount: 2,
    confirmationThreshold: 2,
    metadata: {},
  };
  const db = createMockDb({ existingAnomaly: countingDrift });

  await evaluateDrift({
    ...baseProps,
    baseline: flatHighMeanBaseline,
    schemaDirection: "lower-is-better",
    templateConfig: defaultTemplate,
    db: db as never,
    catalogClient: createMockCatalogClient() as never,
    notificationClient: createMockNotificationClient() as never,
    logger: createMockLogger() as never,
  });

  const updates = db._updateCalls;
  expect(updates.length).toBe(1);
  // No state transition, only the incremented counter.
  expect(updates[0].state).toBeUndefined();
  expect(updates[0].metadata).toMatchObject({
    stableDriftRunCount: 1,
  });
});
|
|
427
|
+
|
|
428
|
+
test("resets the flat-run counter when drift is steep again", async () => {
  const resteepeningDrift = {
    id: "drift-resteepening",
    state: "anomaly",
    suspiciousRunCount: 2,
    confirmationThreshold: 2,
    metadata: { stableDriftRunCount: 1 },
  };
  const db = createMockDb({ existingAnomaly: resteepeningDrift });

  // With driftingBaseline the mean is 200 and the projected change is 150,
  // i.e. 75% of the mean — nowhere near flat.
  await evaluateDrift({
    ...baseProps,
    baseline: driftingBaseline,
    schemaDirection: "lower-is-better",
    templateConfig: defaultTemplate,
    db: db as never,
    catalogClient: createMockCatalogClient() as never,
    notificationClient: createMockNotificationClient() as never,
    logger: createMockLogger() as never,
  });

  const updates = db._updateCalls;
  expect(updates.length).toBe(1);
  // Still anomalous, and the previously accumulated run is discarded.
  expect(updates[0].state).toBeUndefined();
  expect(updates[0].metadata).toMatchObject({
    stableDriftRunCount: 0,
  });
});
|
|
454
|
+
|
|
360
455
|
test("does nothing when no row and no drift (steady state)", async () => {
|
|
361
456
|
const db = createMockDb();
|
|
362
457
|
await evaluateDrift({
|
package/src/drift-evaluator.ts
CHANGED
|
@@ -8,7 +8,10 @@ import {
|
|
|
8
8
|
ANOMALY_TREND_DETECTED,
|
|
9
9
|
detectDrift,
|
|
10
10
|
resolveEffectiveConfig,
|
|
11
|
+
isDriftFlatRelative,
|
|
12
|
+
STABLE_DRIFT_RESOLUTION_RUN_COUNT,
|
|
11
13
|
type AnomalyDirection,
|
|
14
|
+
type AnomalyMetadata,
|
|
12
15
|
type AnomalySettings,
|
|
13
16
|
type FieldBaseline,
|
|
14
17
|
} from "@checkstack/anomaly-common";
|
|
@@ -170,7 +173,7 @@ export async function evaluateDrift({
|
|
|
170
173
|
deviation: driftResult.deviationSigmas,
|
|
171
174
|
})
|
|
172
175
|
.where(eq(schema.anomalies.id, existing.id));
|
|
173
|
-
logger.
|
|
176
|
+
logger.debug(`Drift confirmed for ${systemId} on ${fieldPath}`);
|
|
174
177
|
|
|
175
178
|
if (signalService) {
|
|
176
179
|
await signalService.broadcast(ANOMALY_STATE_CHANGED, {
|
|
@@ -211,11 +214,65 @@ export async function evaluateDrift({
|
|
|
211
214
|
}
|
|
212
215
|
|
|
213
216
|
if (existing.state === "anomaly") {
|
|
217
|
+
// PART A (drift self-resolution): the slope-based detector still reports
|
|
218
|
+
// drift because the 7-day window straddles the old and new regimes, but
|
|
219
|
+
// if the *projected change relative to the (new) mean* has gone flat for
|
|
220
|
+
// several consecutive analyzer runs, the metric has settled at its new
|
|
221
|
+
// level — resolve independently of the slow window catching up. The
|
|
222
|
+
// run-count lives on the row's metadata (shared Postgres) so it survives
|
|
223
|
+
// across whichever pod claims the analyzer job.
|
|
224
|
+
const metadata = (existing.metadata ?? {}) as AnomalyMetadata;
|
|
225
|
+
const flat = isDriftFlatRelative({
|
|
226
|
+
projectedChange: driftResult.projectedChange,
|
|
227
|
+
mean: baseline.mean,
|
|
228
|
+
});
|
|
229
|
+
const stableDriftRunCount = flat
|
|
230
|
+
? (metadata.stableDriftRunCount ?? 0) + 1
|
|
231
|
+
: 0;
|
|
232
|
+
|
|
233
|
+
if (stableDriftRunCount >= STABLE_DRIFT_RESOLUTION_RUN_COUNT) {
|
|
234
|
+
await db
|
|
235
|
+
.update(schema.anomalies)
|
|
236
|
+
.set({
|
|
237
|
+
state: "recovered",
|
|
238
|
+
recoveredAt: new Date(),
|
|
239
|
+
observedValue: baseline.mean.toString(),
|
|
240
|
+
deviation: driftResult.deviationSigmas,
|
|
241
|
+
metadata: { ...metadata, stableDriftRunCount: 0 },
|
|
242
|
+
})
|
|
243
|
+
.where(eq(schema.anomalies.id, existing.id));
|
|
244
|
+
logger.debug(
|
|
245
|
+
`Drift self-resolved (settled at new level) for ${systemId} on ${fieldPath}`,
|
|
246
|
+
);
|
|
247
|
+
|
|
248
|
+
if (signalService) {
|
|
249
|
+
await signalService.broadcast(ANOMALY_STATE_CHANGED, {
|
|
250
|
+
systemId,
|
|
251
|
+
anomalyId: existing.id,
|
|
252
|
+
newState: "recovered",
|
|
253
|
+
});
|
|
254
|
+
}
|
|
255
|
+
|
|
256
|
+
await dispatchAnomalyNotification({
|
|
257
|
+
action: "drift_recovered",
|
|
258
|
+
systemId,
|
|
259
|
+
fieldPath,
|
|
260
|
+
observedValue: baseline.mean,
|
|
261
|
+
baselineMean: baseline.mean,
|
|
262
|
+
catalogClient,
|
|
263
|
+
notificationClient,
|
|
264
|
+
db,
|
|
265
|
+
logger,
|
|
266
|
+
});
|
|
267
|
+
return;
|
|
268
|
+
}
|
|
269
|
+
|
|
214
270
|
await db
|
|
215
271
|
.update(schema.anomalies)
|
|
216
272
|
.set({
|
|
217
273
|
observedValue: baseline.mean.toString(),
|
|
218
274
|
deviation: driftResult.deviationSigmas,
|
|
275
|
+
metadata: { ...metadata, stableDriftRunCount },
|
|
219
276
|
})
|
|
220
277
|
.where(eq(schema.anomalies.id, existing.id));
|
|
221
278
|
return;
|
|
@@ -241,9 +298,12 @@ export async function evaluateDrift({
|
|
|
241
298
|
state: "recovered",
|
|
242
299
|
recoveredAt: new Date(),
|
|
243
300
|
observedValue: baseline.mean.toString(),
|
|
301
|
+
suppressedAt: null,
|
|
302
|
+
suppressedValue: null,
|
|
303
|
+
suppressedBaseline: null,
|
|
244
304
|
})
|
|
245
305
|
.where(eq(schema.anomalies.id, existing.id));
|
|
246
|
-
logger.
|
|
306
|
+
logger.debug(`Drift recovered for ${systemId} on ${fieldPath}`);
|
|
247
307
|
|
|
248
308
|
if (signalService) {
|
|
249
309
|
await signalService.broadcast(ANOMALY_STATE_CHANGED, {
|