@checkstack/anomaly-backend 1.1.8 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,115 @@
1
+ import { describe, test, expect, mock } from "bun:test";
2
+ import { AnomalyService } from "./service";
3
+ import type { SafeDatabase } from "@checkstack/backend-api";
4
+ import type * as schema from "./schema";
5
+
6
+ /**
7
+ * Read-side migrate-then-validate guard. `getAnomalyConfig` and
8
+ * `getAnomalyAssignmentConfig` previously cast the stored jsonb straight to a
9
+ * typed `VersionedRecord`, bypassing the strategy schema entirely. They now run
10
+ * the stored record through `anomalySettingsConfig.parseRecord` /
11
+ * `anomalyAssignmentConfig.parseRecord`, so the data is validated (and migrated
12
+ * once migrations exist) on every read. These tests prove the read path goes
13
+ * through `.parseRecord`: stray keys are stripped and defaults are applied.
14
+ */
15
+
16
/**
 * Builds a minimal database double for the read-path tests.
 *
 * Only the `select().from().where()` chain is stubbed, because that is the
 * only query shape the service touches on these read paths. The chain
 * resolves to a single `{ config }` row, or to no rows at all when
 * `storedConfig` is `undefined`.
 *
 * @param storedConfig - Raw value exposed as the row's `config` column; pass
 *   `undefined` to simulate a missing row.
 * @returns The stub, cast to the `SafeDatabase` type the service expects.
 */
function createSelectMockDb({
  storedConfig,
}: {
  // `unknown` already includes `undefined`, so no `| undefined` is needed.
  storedConfig: unknown;
}): SafeDatabase<typeof schema> {
  const rows = storedConfig === undefined ? [] : [{ config: storedConfig }];

  // A fresh mock is created at every level of the chain on each call.
  const select = mock(() => {
    const from = mock(() => {
      const where = mock(() => Promise.resolve(rows));
      return { where };
    });
    return { from };
  });

  // The stub intentionally implements just a sliver of the real interface,
  // hence the double cast.
  return { select } as unknown as SafeDatabase<typeof schema>;
}
32
+
33
describe("getAnomalyConfig read path", () => {
  /** Creates a service whose database yields `storedConfig` as its only row. */
  const serviceFor = (storedConfig: unknown): AnomalyService =>
    new AnomalyService(createSelectMockDb({ storedConfig }));

  test("validates the stored record via parseRecord (strips stray keys)", async () => {
    const stored = {
      version: 1,
      data: {
        enabled: false,
        baselineWindow: "30d",
        notify: false,
        // Unknown to AnomalySettingsSchema, so parsing has to drop it.
        legacyField: "should-be-dropped",
      },
    };

    const { version, data } = await serviceFor(stored).getAnomalyConfig("cfg-1");

    expect(version).toBe(1);
    expect(data.enabled).toBe(false);
    expect(data.baselineWindow).toBe("30d");
    expect(data.notify).toBe(false);
    expect("legacyField" in data).toBe(false);
  });

  test("applies schema defaults to a sparse stored record", async () => {
    const sparse = { version: 1, data: {} };

    const { data } = await serviceFor(sparse).getAnomalyConfig("cfg-1");

    // None of these values were stored; they can only come from the
    // AnomalySettingsSchema defaults, which shows validation ran.
    expect(data.enabled).toBe(true);
    expect(data.baselineWindow).toBe("7d");
    expect(data.notify).toBe(true);
  });

  test("returns a validated default wrapper when no row exists", async () => {
    // `undefined` makes the mock database return zero rows.
    const { version, data } =
      await serviceFor(undefined).getAnomalyConfig("cfg-missing");

    expect(version).toBe(1);
    expect(data.enabled).toBe(true);
  });
});
83
+
84
describe("getAnomalyAssignmentConfig read path", () => {
  /** Creates a service whose database yields `storedConfig` as its only row. */
  const serviceFor = (storedConfig: unknown): AnomalyService =>
    new AnomalyService(createSelectMockDb({ storedConfig }));

  test("validates the stored override record via parseRecord", async () => {
    const storedOverride = {
      version: 1,
      data: {
        enabled: true,
        // Not declared on PartialAnomalySettingsSchema, so parsing has to drop it.
        bogus: 123,
      },
    };

    const result = await serviceFor(storedOverride).getAnomalyAssignmentConfig(
      "sys-1",
      "cfg-1",
    );

    expect(result).toBeDefined();
    expect(result?.data.enabled).toBe(true);
    expect(result && "bogus" in result.data).toBe(false);
  });

  test("returns undefined when no override row exists", async () => {
    // `undefined` makes the mock database return zero rows.
    const result = await serviceFor(undefined).getAnomalyAssignmentConfig(
      "sys-1",
      "cfg-1",
    );

    expect(result).toBeUndefined();
  });
});
package/src/config.ts CHANGED
@@ -1,8 +1,48 @@
1
+ import { z } from "zod";
1
2
  import { Versioned } from "@checkstack/backend-api";
2
- import { AnomalySettingsSchema } from "@checkstack/anomaly-common";
3
- import type { AnomalySettings } from "@checkstack/anomaly-common";
3
+ import type { VersionedRecord } from "@checkstack/backend-api";
4
+ import {
5
+ AnomalySettingsSchema,
6
+ PartialAnomalySettingsSchema,
7
+ } from "@checkstack/anomaly-common";
8
+ import type {
9
+ AnomalySettings,
10
+ PartialAnomalySettings,
11
+ } from "@checkstack/anomaly-common";
12
+
13
/**
 * Envelope schema for the stored versioned record. Stored configs come back
 * from the jsonb column typed as `unknown`, so we validate the wrapper shape
 * (`{ version, data }`) before handing it to `Versioned.parseRecord`, which
 * then migrates + validates the inner `data` against the strategy schema.
 *
 * `migratedAt` is coerced instead of being required to be a `Date` instance:
 * JSON (and therefore jsonb) has no date type, so a timestamp written into the
 * envelope is read back as an ISO string, which a plain `z.date()` rejects.
 * NOTE(review): whether `migratedAt` is ever persisted is not visible from this
 * module; coercion keeps reads working if it is, and still accepts real `Date`
 * instances.
 */
const versionedEnvelopeSchema = z.object({
  version: z.number(),
  // The payload stays opaque here; the strategy schema validates it later.
  data: z.unknown(),
  migratedAt: z.coerce.date().optional(),
  originalVersion: z.number().optional(),
});
25
+
26
/**
 * Checks that a raw stored jsonb config has the `{ version, data }` envelope
 * shape and returns it typed as a {@link VersionedRecord}, ready to be passed
 * to `Versioned.parse`/`parseRecord`.
 *
 * @param stored - The untyped value read from storage.
 * @returns The validated envelope; the inner `data` is still unvalidated.
 * @throws If `stored` does not match the envelope schema (zod's `parse`
 *   throws on validation failure).
 */
export function toVersionedRecord(stored: unknown): VersionedRecord<unknown> {
  const envelope = versionedEnvelopeSchema.parse(stored);
  return envelope;
}
4
33
 
5
34
/**
 * Versioned config for the template-level anomaly settings: schema version 1,
 * validated against `AnomalySettingsSchema`.
 */
export const anomalySettingsConfig = new Versioned<AnomalySettings>({
  schema: AnomalySettingsSchema,
  version: 1,
});
38
+
39
/**
 * Versioned config for assignment-level overrides, validated against
 * `PartialAnomalySettingsSchema`. It is stored next to the template config and
 * goes through {@link Versioned.parseRecord} on every read. There are no
 * migrations at `version: 1`; because the read path already runs through the
 * versioned parser, a future reshape only needs a migration step added here.
 */
export const anomalyAssignmentConfig = new Versioned<PartialAnomalySettings>({
  schema: PartialAnomalySettingsSchema,
  version: 1,
});
@@ -886,6 +886,167 @@ describe("Anomaly Detector — processCheckCompleted", () => {
886
886
  expect(broadcastPayload).toMatchObject({ newState: "recovered" });
887
887
  });
888
888
 
889
+ // ─── PART A: self-resolution (settled at a new level) ─────────────────
890
+
891
test("self-resolves a confirmed anomaly once recent samples settle at a new level", async () => {
  // Stale baseline: the incoming 200 is still 10σ above it.
  const staleBaseline = createBaseline({ mean: 100, stdDev: 10 });
  const cache = createMockCache(new Map([[cacheKeyPrefix, staleBaseline]]));
  const catalogClient = createMockCatalogClient();
  const notificationClient = createMockNotificationClient(["user-1"]);

  // The row already holds four samples at the new stable level (~200); the
  // sample processed below (anomalousResult = 200) is the fifth and completes
  // the window, which should trigger self-resolution.
  const existingAnomaly = {
    id: "anomaly-stuck",
    systemId,
    configurationId,
    fieldPath: "collectors.http.request.responseTimeMs",
    state: "anomaly",
    suspiciousRunCount: 5,
    confirmationThreshold: 3,
    baselineValue: 100,
    observedValue: "200",
    suppressedAt: null,
    suppressedValue: null,
    metadata: { recentSamples: [200, 200, 200, 200] },
  };
  const db = createMockDb({ existingAnomaly });

  await processCheckCompleted({
    ...baseProps,
    latencyMs: 50,
    result: anomalousResult,
    db: db as never,
    cache,
    logger: createMockLogger() as never,
    catalogClient: catalogClient as never,
    notificationClient: notificationClient as never,
  });

  expect(db._updateCalls.length).toBe(1);
  const [firstUpdate] = db._updateCalls;
  expect(firstUpdate).toMatchObject({ state: "recovered" });
  // Self-resolution must also send the recovery notification.
  expect(notificationClient.notifyForSubscription).toHaveBeenCalledTimes(1);
});
931
+
932
test("does not self-resolve while the window is still filling", async () => {
  const cache = createMockCache(
    new Map([[cacheKeyPrefix, createBaseline({ mean: 100, stdDev: 10 })]]),
  );

  // Only two samples recorded so far; the one processed below makes three.
  const existingAnomaly = {
    id: "anomaly-filling",
    systemId,
    configurationId,
    fieldPath: "collectors.http.request.responseTimeMs",
    state: "anomaly",
    suspiciousRunCount: 5,
    confirmationThreshold: 3,
    baselineValue: 100,
    observedValue: "200",
    suppressedAt: null,
    suppressedValue: null,
    metadata: { recentSamples: [200, 200] },
  };
  const db = createMockDb({ existingAnomaly });

  await processCheckCompleted({
    ...baseProps,
    latencyMs: 50,
    result: anomalousResult,
    db: db as never,
    cache,
    logger: createMockLogger() as never,
    catalogClient: createMockCatalogClient() as never,
    notificationClient: createMockNotificationClient() as never,
  });

  expect(db._updateCalls.length).toBe(1);
  const [firstUpdate] = db._updateCalls;
  // The row stays anomalous: the update carries the refreshed window and
  // observed value but no `state` transition.
  expect(firstUpdate).not.toHaveProperty("state");
  expect(firstUpdate.metadata).toMatchObject({
    recentSamples: [200, 200, 200],
  });
});
970
+
971
+ // ─── PART B: auto-unsuppress ("changes again") ────────────────────────
972
+
973
// Negative case for auto-unsuppress: the observed value equals the value the
// anomaly was suppressed at, so the suppression must stay in place. The title
// now states that expectation; it previously claimed the opposite of what the
// assertion checks.
test("does not auto-unsuppress a suppressed anomaly while the value stays within the reactivation band", async () => {
  const baseline = createBaseline({ mean: 100, stdDev: 10 });
  const cache = createMockCache(new Map([[cacheKeyPrefix, baseline]]));
  const db = createMockDb({
    existingAnomaly: {
      id: "anomaly-suppressed",
      systemId,
      configurationId,
      fieldPath: "collectors.http.request.responseTimeMs",
      state: "anomaly",
      suspiciousRunCount: 5,
      confirmationThreshold: 3,
      baselineValue: 100,
      observedValue: "200",
      suppressedAt: new Date(),
      // Suppressed at 200; the sample processed below is also 200.
      suppressedValue: 200,
      metadata: {},
    },
  });

  // anomalousResult is 200, i.e. unchanged since suppression, so it is inside
  // the band and must NOT lift the suppression.
  await processCheckCompleted({
    ...baseProps,
    latencyMs: 50,
    result: anomalousResult,
    db: db as never,
    cache,
    logger: createMockLogger() as never,
    catalogClient: createMockCatalogClient() as never,
    notificationClient: createMockNotificationClient() as never,
  });

  // No recorded update may clear `suppressedAt`.
  const unsuppressed = db._updateCalls.find(
    (c) => c.suppressedAt === null,
  );
  expect(unsuppressed).toBeUndefined();
});
1009
+
1010
test("auto-unsuppresses when the value moves outside the reactivation band", async () => {
  // The baseline sits well below the incoming value, so the sample is still
  // anomalous and reaches the anomaly branch. The row was suppressed at 50 and
  // the observed value jumps to 200 (a >25% move).
  const cache = createMockCache(
    new Map([[cacheKeyPrefix, createBaseline({ mean: 100, stdDev: 10 })]]),
  );
  const existingAnomaly = {
    id: "anomaly-suppressed-2",
    systemId,
    configurationId,
    fieldPath: "collectors.http.request.responseTimeMs",
    state: "anomaly",
    suspiciousRunCount: 5,
    confirmationThreshold: 3,
    baselineValue: 100,
    observedValue: "50",
    suppressedAt: new Date(),
    suppressedValue: 50,
    metadata: {},
  };
  const db = createMockDb({ existingAnomaly });

  await processCheckCompleted({
    ...baseProps,
    latencyMs: 50,
    // 200, far away from the suppressedValue of 50.
    result: anomalousResult,
    db: db as never,
    cache,
    logger: createMockLogger() as never,
    catalogClient: createMockCatalogClient() as never,
    notificationClient: createMockNotificationClient() as never,
  });

  expect(db._updateCalls.length).toBe(1);
  const [firstUpdate] = db._updateCalls;
  // The single update must clear the suppression fields.
  expect(firstUpdate).toMatchObject({
    suppressedAt: null,
    suppressedValue: null,
  });
});
1049
+
889
1050
  // ─── Notification resilience ──────────────────────────────────────────
890
1051
 
891
1052
  test("does not crash when notification dispatch fails", async () => {
package/src/detector.ts CHANGED
@@ -7,6 +7,10 @@ import {
7
7
  isAnomalous,
8
8
  isCategoricalAnomalous,
9
9
  resolveEffectiveConfig,
10
+ appendRecentSample,
11
+ hasSettledAtNewLevel,
12
+ hasChangedSinceSuppression,
13
+ type AnomalyMetadata,
10
14
  type FieldBaseline,
11
15
  } from "@checkstack/anomaly-common";
12
16
  import type { Logger } from "@checkstack/backend-api";
@@ -304,7 +308,7 @@ export async function processCheckCompleted({
304
308
  deviation,
305
309
  })
306
310
  .where(eq(schema.anomalies.id, existingAnomaly.id));
307
- logger.warn(`Anomaly confirmed for ${systemId} on ${path}`);
311
+ logger.debug(`Anomaly confirmed for ${systemId} on ${path}`);
308
312
 
309
313
  await routerCache?.invalidateAnomalies();
310
314
 
@@ -340,13 +344,112 @@ export async function processCheckCompleted({
340
344
  .where(eq(schema.anomalies.id, existingAnomaly.id));
341
345
  }
342
346
  } else if (existingAnomaly.state === "anomaly") {
343
- await db
344
- .update(schema.anomalies)
345
- .set({
346
- observedValue: String(value),
347
- deviation,
347
+ // PART B: a suppressed anomaly auto-unsuppresses once the metric
348
+ // "changes again" — the observed value moves outside the relative band
349
+ // around the value it was suppressed at. We detect that here (the only
350
+ // place fresh samples flow through) before the self-resolution check so
351
+ // a re-activated anomaly resumes normal lifecycle handling.
352
+ const suppressedValue = existingAnomaly.suppressedValue;
353
+ if (
354
+ existingAnomaly.suppressedAt &&
355
+ typeof value === "number" &&
356
+ typeof suppressedValue === "number" &&
357
+ hasChangedSinceSuppression({
358
+ observedValue: value,
359
+ suppressedValue,
348
360
  })
349
- .where(eq(schema.anomalies.id, existingAnomaly.id));
361
+ ) {
362
+ await db
363
+ .update(schema.anomalies)
364
+ .set({
365
+ suppressedAt: null,
366
+ suppressedValue: null,
367
+ suppressedBaseline: null,
368
+ observedValue: String(value),
369
+ deviation,
370
+ })
371
+ .where(eq(schema.anomalies.id, existingAnomaly.id));
372
+ await routerCache?.invalidateAnomalies();
373
+ if (signalService) {
374
+ await signalService.broadcast(ANOMALY_STATE_CHANGED, {
375
+ systemId,
376
+ anomalyId: existingAnomaly.id,
377
+ newState: "anomaly",
378
+ });
379
+ }
380
+ continue;
381
+ }
382
+
383
+ // PART A: self-resolution. The value is still anomalous against the
384
+ // (stale) baseline, but if the recent healthy samples have settled into
385
+ // a tight relative band the metric has found a new normal — resolve
386
+ // independently of the slow baseline analyzer. We keep a rolling window
387
+ // of recent samples on the row's metadata (shared Postgres, so every
388
+ // pod sees the same window).
389
+ if (typeof value === "number") {
390
+ const metadata = (existingAnomaly.metadata ??
391
+ {}) as AnomalyMetadata;
392
+ const recentSamples = appendRecentSample(
393
+ metadata.recentSamples,
394
+ value,
395
+ );
396
+
397
+ if (hasSettledAtNewLevel(recentSamples)) {
398
+ await db
399
+ .update(schema.anomalies)
400
+ .set({
401
+ state: "recovered",
402
+ recoveredAt: new Date(),
403
+ observedValue: String(value),
404
+ deviation,
405
+ metadata: { ...metadata, recentSamples: [] },
406
+ })
407
+ .where(eq(schema.anomalies.id, existingAnomaly.id));
408
+ logger.debug(
409
+ `Anomaly self-resolved (settled at new level) for ${systemId} on ${path}`,
410
+ );
411
+
412
+ await routerCache?.invalidateAnomalies();
413
+
414
+ if (signalService) {
415
+ await signalService.broadcast(ANOMALY_STATE_CHANGED, {
416
+ systemId,
417
+ anomalyId: existingAnomaly.id,
418
+ newState: "recovered",
419
+ });
420
+ }
421
+
422
+ await dispatchAnomalyNotification({
423
+ action: "recovered",
424
+ systemId,
425
+ fieldPath: path,
426
+ observedValue: value,
427
+ baselineMean: baseline.mean,
428
+ catalogClient,
429
+ notificationClient,
430
+ db,
431
+ logger,
432
+ });
433
+ continue;
434
+ }
435
+
436
+ await db
437
+ .update(schema.anomalies)
438
+ .set({
439
+ observedValue: String(value),
440
+ deviation,
441
+ metadata: { ...metadata, recentSamples },
442
+ })
443
+ .where(eq(schema.anomalies.id, existingAnomaly.id));
444
+ } else {
445
+ await db
446
+ .update(schema.anomalies)
447
+ .set({
448
+ observedValue: String(value),
449
+ deviation,
450
+ })
451
+ .where(eq(schema.anomalies.id, existingAnomaly.id));
452
+ }
350
453
  }
351
454
  } else {
352
455
  if (existingAnomaly) {
@@ -361,9 +464,14 @@ export async function processCheckCompleted({
361
464
  state: "recovered",
362
465
  recoveredAt: new Date(),
363
466
  observedValue: String(value),
467
+ // Baseline-relative recovery clears any active suppression and the
468
+ // rolling self-resolution window — the row is no longer active.
469
+ suppressedAt: null,
470
+ suppressedValue: null,
471
+ suppressedBaseline: null,
364
472
  })
365
473
  .where(eq(schema.anomalies.id, existingAnomaly.id));
366
- logger.info(`Anomaly recovered for ${systemId} on ${path}`);
474
+ logger.debug(`Anomaly recovered for ${systemId} on ${path}`);
367
475
 
368
476
  await routerCache?.invalidateAnomalies();
369
477
 
@@ -1,7 +1,11 @@
1
1
  import { describe, test, expect, mock } from "bun:test";
2
2
  import { evaluateDrift } from "./drift-evaluator";
3
3
  import * as schema from "./schema";
4
- import type { AnomalySettings, FieldBaseline } from "@checkstack/anomaly-common";
4
+ import {
5
+ STABLE_DRIFT_RESOLUTION_RUN_COUNT,
6
+ type AnomalySettings,
7
+ type FieldBaseline,
8
+ } from "@checkstack/anomaly-common";
5
9
 
6
10
  function createBaseline(overrides: Partial<FieldBaseline> = {}): FieldBaseline {
7
11
  return {
@@ -357,6 +361,97 @@ describe("evaluateDrift", () => {
357
361
  expect(notification.notifyForSubscription).toHaveBeenCalledTimes(1);
358
362
  });
359
363
 
364
+ // ─── PART A: drift self-resolution (settled at a new level) ──────────
365
+
366
// Fixture: a baseline that still counts as drifting statistically
// (slope×n = 1.5 × 100 = 150, far above 2×σ = 20) while the projected change
// is tiny next to the new mean (150 / 10000 = 1.5%, below the band). In other
// words the metric has settled at a high new level that the 7-day window has
// not caught up to yet.
const flatHighMeanBaseline = createBaseline({
  sampleCount: 100,
  trendSlope: 1.5,
  stdDev: 10,
  mean: 10000,
});
375
+
376
test("self-resolves a confirmed drift once slope is flat relative to the new mean for N runs", async () => {
  // The stored counter is one run short of the resolution threshold, so the
  // flat run evaluated below is the one that reaches it.
  const priorFlatRuns = STABLE_DRIFT_RESOLUTION_RUN_COUNT - 1;
  const existingAnomaly = {
    id: "drift-stuck",
    state: "anomaly",
    suspiciousRunCount: 2,
    confirmationThreshold: 2,
    metadata: { stableDriftRunCount: priorFlatRuns },
  };
  const db = createMockDb({ existingAnomaly });
  const notification = createMockNotificationClient();

  await evaluateDrift({
    ...baseProps,
    baseline: flatHighMeanBaseline,
    schemaDirection: "lower-is-better",
    templateConfig: defaultTemplate,
    db: db as never,
    catalogClient: createMockCatalogClient() as never,
    notificationClient: notification as never,
    logger: createMockLogger() as never,
  });

  expect(db._updateCalls.length).toBe(1);
  const [firstUpdate] = db._updateCalls;
  expect(firstUpdate.state).toBe("recovered");
  expect(notification.notifyForSubscription).toHaveBeenCalledTimes(1);
});
401
+
402
test("accumulates the flat-run counter without resolving prematurely", async () => {
  // No counter stored yet: the first flat run should record 1 and leave the
  // state untouched.
  const existingAnomaly = {
    id: "drift-counting",
    state: "anomaly",
    suspiciousRunCount: 2,
    confirmationThreshold: 2,
    metadata: {},
  };
  const db = createMockDb({ existingAnomaly });

  await evaluateDrift({
    ...baseProps,
    baseline: flatHighMeanBaseline,
    schemaDirection: "lower-is-better",
    templateConfig: defaultTemplate,
    db: db as never,
    catalogClient: createMockCatalogClient() as never,
    notificationClient: createMockNotificationClient() as never,
    logger: createMockLogger() as never,
  });

  expect(db._updateCalls.length).toBe(1);
  const [firstUpdate] = db._updateCalls;
  expect(firstUpdate.state).toBeUndefined();
  expect(firstUpdate.metadata).toMatchObject({
    stableDriftRunCount: 1,
  });
});
427
+
428
test("resets the flat-run counter when drift is steep again", async () => {
  // One flat run is already on record; a steep run must wipe it.
  const existingAnomaly = {
    id: "drift-resteepening",
    state: "anomaly",
    suspiciousRunCount: 2,
    confirmationThreshold: 2,
    metadata: { stableDriftRunCount: 1 },
  };
  const db = createMockDb({ existingAnomaly });

  await evaluateDrift({
    ...baseProps,
    // driftingBaseline: mean 200, projectedChange 150 → 75% of mean → not flat.
    baseline: driftingBaseline,
    schemaDirection: "lower-is-better",
    templateConfig: defaultTemplate,
    db: db as never,
    catalogClient: createMockCatalogClient() as never,
    notificationClient: createMockNotificationClient() as never,
    logger: createMockLogger() as never,
  });

  expect(db._updateCalls.length).toBe(1);
  const [firstUpdate] = db._updateCalls;
  expect(firstUpdate.state).toBeUndefined();
  expect(firstUpdate.metadata).toMatchObject({
    stableDriftRunCount: 0,
  });
});
454
+
360
455
  test("does nothing when no row and no drift (steady state)", async () => {
361
456
  const db = createMockDb();
362
457
  await evaluateDrift({
@@ -8,7 +8,10 @@ import {
8
8
  ANOMALY_TREND_DETECTED,
9
9
  detectDrift,
10
10
  resolveEffectiveConfig,
11
+ isDriftFlatRelative,
12
+ STABLE_DRIFT_RESOLUTION_RUN_COUNT,
11
13
  type AnomalyDirection,
14
+ type AnomalyMetadata,
12
15
  type AnomalySettings,
13
16
  type FieldBaseline,
14
17
  } from "@checkstack/anomaly-common";
@@ -170,7 +173,7 @@ export async function evaluateDrift({
170
173
  deviation: driftResult.deviationSigmas,
171
174
  })
172
175
  .where(eq(schema.anomalies.id, existing.id));
173
- logger.warn(`Drift confirmed for ${systemId} on ${fieldPath}`);
176
+ logger.debug(`Drift confirmed for ${systemId} on ${fieldPath}`);
174
177
 
175
178
  if (signalService) {
176
179
  await signalService.broadcast(ANOMALY_STATE_CHANGED, {
@@ -211,11 +214,65 @@ export async function evaluateDrift({
211
214
  }
212
215
 
213
216
  if (existing.state === "anomaly") {
217
+ // PART A (drift self-resolution): the slope-based detector still reports
218
+ // drift because the 7-day window straddles the old and new regimes, but
219
+ // if the *projected change relative to the (new) mean* has gone flat for
220
+ // several consecutive analyzer runs, the metric has settled at its new
221
+ // level — resolve independently of the slow window catching up. The
222
+ // run-count lives on the row's metadata (shared Postgres) so it survives
223
+ // across whichever pod claims the analyzer job.
224
+ const metadata = (existing.metadata ?? {}) as AnomalyMetadata;
225
+ const flat = isDriftFlatRelative({
226
+ projectedChange: driftResult.projectedChange,
227
+ mean: baseline.mean,
228
+ });
229
+ const stableDriftRunCount = flat
230
+ ? (metadata.stableDriftRunCount ?? 0) + 1
231
+ : 0;
232
+
233
+ if (stableDriftRunCount >= STABLE_DRIFT_RESOLUTION_RUN_COUNT) {
234
+ await db
235
+ .update(schema.anomalies)
236
+ .set({
237
+ state: "recovered",
238
+ recoveredAt: new Date(),
239
+ observedValue: baseline.mean.toString(),
240
+ deviation: driftResult.deviationSigmas,
241
+ metadata: { ...metadata, stableDriftRunCount: 0 },
242
+ })
243
+ .where(eq(schema.anomalies.id, existing.id));
244
+ logger.debug(
245
+ `Drift self-resolved (settled at new level) for ${systemId} on ${fieldPath}`,
246
+ );
247
+
248
+ if (signalService) {
249
+ await signalService.broadcast(ANOMALY_STATE_CHANGED, {
250
+ systemId,
251
+ anomalyId: existing.id,
252
+ newState: "recovered",
253
+ });
254
+ }
255
+
256
+ await dispatchAnomalyNotification({
257
+ action: "drift_recovered",
258
+ systemId,
259
+ fieldPath,
260
+ observedValue: baseline.mean,
261
+ baselineMean: baseline.mean,
262
+ catalogClient,
263
+ notificationClient,
264
+ db,
265
+ logger,
266
+ });
267
+ return;
268
+ }
269
+
214
270
  await db
215
271
  .update(schema.anomalies)
216
272
  .set({
217
273
  observedValue: baseline.mean.toString(),
218
274
  deviation: driftResult.deviationSigmas,
275
+ metadata: { ...metadata, stableDriftRunCount },
219
276
  })
220
277
  .where(eq(schema.anomalies.id, existing.id));
221
278
  return;
@@ -241,9 +298,12 @@ export async function evaluateDrift({
241
298
  state: "recovered",
242
299
  recoveredAt: new Date(),
243
300
  observedValue: baseline.mean.toString(),
301
+ suppressedAt: null,
302
+ suppressedValue: null,
303
+ suppressedBaseline: null,
244
304
  })
245
305
  .where(eq(schema.anomalies.id, existing.id));
246
- logger.info(`Drift recovered for ${systemId} on ${fieldPath}`);
306
+ logger.debug(`Drift recovered for ${systemId} on ${fieldPath}`);
247
307
 
248
308
  if (signalService) {
249
309
  await signalService.broadcast(ANOMALY_STATE_CHANGED, {