@checkstack/anomaly-backend 1.1.9 → 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,7 +8,10 @@ import {
8
8
  ANOMALY_TREND_DETECTED,
9
9
  detectDrift,
10
10
  resolveEffectiveConfig,
11
+ isDriftFlatRelative,
12
+ STABLE_DRIFT_RESOLUTION_RUN_COUNT,
11
13
  type AnomalyDirection,
14
+ type AnomalyMetadata,
12
15
  type AnomalySettings,
13
16
  type FieldBaseline,
14
17
  } from "@checkstack/anomaly-common";
@@ -170,7 +173,7 @@ export async function evaluateDrift({
170
173
  deviation: driftResult.deviationSigmas,
171
174
  })
172
175
  .where(eq(schema.anomalies.id, existing.id));
173
- logger.warn(`Drift confirmed for ${systemId} on ${fieldPath}`);
176
+ logger.debug(`Drift confirmed for ${systemId} on ${fieldPath}`);
174
177
 
175
178
  if (signalService) {
176
179
  await signalService.broadcast(ANOMALY_STATE_CHANGED, {
@@ -211,11 +214,65 @@ export async function evaluateDrift({
211
214
  }
212
215
 
213
216
  if (existing.state === "anomaly") {
217
+ // PART A (drift self-resolution): the slope-based detector still reports
218
+ // drift because the 7-day window straddles the old and new regimes, but
219
+ // if the *projected change relative to the (new) mean* has gone flat for
220
+ // several consecutive analyzer runs, the metric has settled at its new
221
+ // level — resolve independently of the slow window catching up. The
222
+ // run-count lives on the row's metadata (shared Postgres) so it survives
223
+ // across whichever pod claims the analyzer job.
224
+ const metadata = (existing.metadata ?? {}) as AnomalyMetadata;
225
+ const flat = isDriftFlatRelative({
226
+ projectedChange: driftResult.projectedChange,
227
+ mean: baseline.mean,
228
+ });
229
+ const stableDriftRunCount = flat
230
+ ? (metadata.stableDriftRunCount ?? 0) + 1
231
+ : 0;
232
+
233
+ if (stableDriftRunCount >= STABLE_DRIFT_RESOLUTION_RUN_COUNT) {
234
+ await db
235
+ .update(schema.anomalies)
236
+ .set({
237
+ state: "recovered",
238
+ recoveredAt: new Date(),
239
+ observedValue: baseline.mean.toString(),
240
+ deviation: driftResult.deviationSigmas,
241
+ metadata: { ...metadata, stableDriftRunCount: 0 },
242
+ })
243
+ .where(eq(schema.anomalies.id, existing.id));
244
+ logger.debug(
245
+ `Drift self-resolved (settled at new level) for ${systemId} on ${fieldPath}`,
246
+ );
247
+
248
+ if (signalService) {
249
+ await signalService.broadcast(ANOMALY_STATE_CHANGED, {
250
+ systemId,
251
+ anomalyId: existing.id,
252
+ newState: "recovered",
253
+ });
254
+ }
255
+
256
+ await dispatchAnomalyNotification({
257
+ action: "drift_recovered",
258
+ systemId,
259
+ fieldPath,
260
+ observedValue: baseline.mean,
261
+ baselineMean: baseline.mean,
262
+ catalogClient,
263
+ notificationClient,
264
+ db,
265
+ logger,
266
+ });
267
+ return;
268
+ }
269
+
214
270
  await db
215
271
  .update(schema.anomalies)
216
272
  .set({
217
273
  observedValue: baseline.mean.toString(),
218
274
  deviation: driftResult.deviationSigmas,
275
+ metadata: { ...metadata, stableDriftRunCount },
219
276
  })
220
277
  .where(eq(schema.anomalies.id, existing.id));
221
278
  return;
@@ -241,9 +298,12 @@ export async function evaluateDrift({
241
298
  state: "recovered",
242
299
  recoveredAt: new Date(),
243
300
  observedValue: baseline.mean.toString(),
301
+ suppressedAt: null,
302
+ suppressedValue: null,
303
+ suppressedBaseline: null,
244
304
  })
245
305
  .where(eq(schema.anomalies.id, existing.id));
246
- logger.info(`Drift recovered for ${systemId} on ${fieldPath}`);
306
+ logger.debug(`Drift recovered for ${systemId} on ${fieldPath}`);
247
307
 
248
308
  if (signalService) {
249
309
  await signalService.broadcast(ANOMALY_STATE_CHANGED, {
@@ -0,0 +1,33 @@
1
+ /**
2
+ * Contract test: every anomaly Versioned config that is stored and read back
3
+ * via the migration chain MUST have a COMPLETE, contiguous chain from version
4
+ * 1 to its current `version`. Pure STRUCTURAL check
5
+ * (`validateMigrationChainFromV1` — no `migrate()` is run), so it carries zero
6
+ * per-config upkeep: the day someone bumps a config's `version` without
7
+ * shipping a covering migration, `parse`/`parseRecord` would silently fail at
8
+ * runtime on a genuinely-v1 stored record — this test turns that into a CI
9
+ * failure instead. See the HTTP plugin's equivalent test for the full
10
+ * rationale.
11
+ *
12
+ * Covers the two module-level Versioned wrappers this package owns: the
13
+ * site-wide settings config and the assignment-level override config.
14
+ */
15
+ import { describe, expect, it } from "bun:test";
16
+ import { anomalySettingsConfig, anomalyAssignmentConfig } from "./config";
17
+
18
+ describe("anomaly config migration-chain contract", () => {
19
+ const configs = [
20
+ { name: "anomaly settings", config: anomalySettingsConfig },
21
+ { name: "anomaly assignment", config: anomalyAssignmentConfig },
22
+ ];
23
+
24
+ it("every registered Versioned config has a complete v1->version chain", () => {
25
+ for (const { name, config } of configs) {
26
+ const problem = config.validateMigrationChainFromV1();
27
+ expect(
28
+ problem,
29
+ `${name} config (version ${config.version}) has a broken migration chain: ${problem}`,
30
+ ).toBeUndefined();
31
+ }
32
+ });
33
+ });
package/src/plugin.ts CHANGED
@@ -1,4 +1,8 @@
1
1
  import { createBackendPlugin, coreServices, type SafeDatabase } from "@checkstack/backend-api";
2
+ import {
3
+ aiToolProjectionExtensionPoint,
4
+ deferredProjectionExecute,
5
+ } from "@checkstack/ai-backend";
2
6
  import { healthCheckHooks } from "@checkstack/healthcheck-backend";
3
7
  import { setupBaselineAnalyzerJob } from "./jobs/baseline-analyzer";
4
8
  import { processCheckCompleted } from "./detector";
@@ -13,6 +17,7 @@ import {
13
17
  anomalyAccessRules,
14
18
  anomalySystemSubscription,
15
19
  anomalyGroupSubscription,
20
+ pluginMetadata,
16
21
  } from "@checkstack/anomaly-common";
17
22
  import { specToRegistration } from "@checkstack/notification-common";
18
23
  import { HealthCheckApi } from "@checkstack/healthcheck-common";
@@ -46,6 +51,20 @@ export const plugin = createBackendPlugin({
46
51
  // Mutable ref populated during init(); the reconciler closure pulls
47
52
  // the service via the lazy accessor at sync time.
48
53
  let gitopsService: AnomalyService | undefined;
54
+ // Expose anomaly's own read-only AI projection so ai-backend never has
55
+ // to import @checkstack/anomaly-common. The projection re-uses the
56
+ // existing getAnomalies contract procedure (read-only, access-gated).
57
+ env.getExtensionPoint(aiToolProjectionExtensionPoint).expose({
58
+ procedure: anomalyContract.getAnomalies,
59
+ sourcePluginMetadata: pluginMetadata,
60
+ procedureKey: "getAnomalies",
61
+ name: "anomaly.explain",
62
+ description:
63
+ "List detected anomalies (statistical sigma/drift) for context. Read-only.",
64
+ effect: "read",
65
+ execute: deferredProjectionExecute,
66
+ });
67
+
49
68
  const kindRegistry = env.getExtensionPoint(entityKindExtensionPoint);
50
69
  registerAnomalyGitOpsKinds({
51
70
  kindRegistry,
package/src/router.ts CHANGED
@@ -7,9 +7,7 @@ import {
7
7
  type Logger,
8
8
  type RealUser,
9
9
  type RpcContext,
10
- type VersionedRecord,
11
10
  } from "@checkstack/backend-api";
12
- import type { AnomalySettings } from "@checkstack/anomaly-common";
13
11
  import type { AnomalyRouterCache } from "./router-cache";
14
12
 
15
13
  export function createRouter(
@@ -57,15 +55,14 @@ export function createRouter(
57
55
  cache.invalidateAnomalies(),
58
56
  cache.invalidateBaselines(),
59
57
  ]);
60
- return result as VersionedRecord<AnomalySettings>;
58
+ return result;
61
59
  }
62
60
  ),
63
61
 
64
62
  getAnomalyAssignmentConfig: os.getAnomalyAssignmentConfig.handler(
65
63
  async ({ input }) => {
66
64
  const result = await service.getAnomalyAssignmentConfig(input.systemId, input.configurationId);
67
-
68
- return (result as VersionedRecord<Partial<AnomalySettings>>) ?? null;
65
+ return result ?? null;
69
66
  }
70
67
  ),
71
68
 
@@ -76,10 +73,32 @@ export function createRouter(
76
73
  cache.invalidateAnomalies(),
77
74
  cache.invalidateBaselines(),
78
75
  ]);
79
- return result as VersionedRecord<Partial<AnomalySettings>>;
76
+ return result;
80
77
  }
81
78
  ),
82
79
 
80
+ suppressAnomaly: os.suppressAnomaly.handler(
81
+ async ({ input }) => {
82
+ const success = await service.suppressAnomaly({
83
+ anomalyId: input.anomalyId,
84
+ systemId: input.systemId,
85
+ });
86
+ await cache.invalidateAnomalies();
87
+ return { success };
88
+ },
89
+ ),
90
+
91
+ unsuppressAnomaly: os.unsuppressAnomaly.handler(
92
+ async ({ input }) => {
93
+ const success = await service.unsuppressAnomaly({
94
+ anomalyId: input.anomalyId,
95
+ systemId: input.systemId,
96
+ });
97
+ await cache.invalidateAnomalies();
98
+ return { success };
99
+ },
100
+ ),
101
+
83
102
  listAnomalyNotificationMutes: os.listAnomalyNotificationMutes.handler(
84
103
  async ({ input, context }) => {
85
104
  const userId = (context.user as RealUser).id;
package/src/schema.ts CHANGED
@@ -52,6 +52,21 @@ export const anomalies = pgTable("anomalies", {
52
52
  startedAt: timestamp("started_at").defaultNow().notNull(),
53
53
  confirmedAt: timestamp("confirmed_at"),
54
54
  recoveredAt: timestamp("recovered_at"),
55
+ /**
56
+ * Global (per-row) suppression. We model suppression as a flag layered on top
57
+ * of `state` rather than a new `suppressed` enum value: the existing
58
+ * suspicious/anomaly/recovered state machine (in both the spike detector and
59
+ * the drift evaluator) stays intact, and un-suppressing simply reveals the
60
+ * underlying state again. A NULL `suppressedAt` means "not suppressed".
61
+ *
62
+ * Lives on the shared `anomalies` row (Postgres) so every horizontally-scaled
63
+ * pod reads the same suppressed/active set — see state-and-scale.md. The
64
+ * snapshot columns capture the value/baseline at suppression time so the
65
+ * inline detector can auto-unsuppress once the metric "changes again".
66
+ */
67
+ suppressedAt: timestamp("suppressed_at"),
68
+ suppressedValue: doublePrecision("suppressed_value"),
69
+ suppressedBaseline: doublePrecision("suppressed_baseline"),
55
70
  metadata: jsonb("metadata").$type<Record<string, unknown>>(),
56
71
  });
57
72
 
package/src/service.ts CHANGED
@@ -1,9 +1,16 @@
1
- import { eq, and, desc, inArray } from "drizzle-orm";
1
+ import { eq, and, desc, inArray, isNull, isNotNull } from "drizzle-orm";
2
2
  import type { SafeDatabase } from "@checkstack/backend-api";
3
3
  import * as schema from "./schema";
4
- import { anomalySettingsConfig } from "./config";
4
+ import {
5
+ anomalySettingsConfig,
6
+ anomalyAssignmentConfig,
7
+ toVersionedRecord,
8
+ } from "./config";
5
9
  import type { VersionedRecord } from "@checkstack/backend-api";
6
- import type { AnomalySettings } from "@checkstack/anomaly-common";
10
+ import type {
11
+ AnomalySettings,
12
+ PartialAnomalySettings,
13
+ } from "@checkstack/anomaly-common";
7
14
 
8
15
  export class AnomalyService {
9
16
  constructor(private readonly db: SafeDatabase<typeof schema>) {}
@@ -13,6 +20,12 @@ export class AnomalyService {
13
20
  configurationId?: string;
14
21
  state?: schema.AnomalyState;
15
22
  kind?: schema.AnomalyKind;
23
+ /**
24
+ * Suppression filter. Defaults to "active": suppressed rows are excluded
25
+ * from the active view. Pass "suppressed" to list only suppressed rows, or
26
+ * "all" to ignore the suppression flag entirely.
27
+ */
28
+ suppression?: "active" | "suppressed" | "all";
16
29
  limit?: number;
17
30
  }) {
18
31
  const conditions = [];
@@ -32,6 +45,13 @@ export class AnomalyService {
32
45
  conditions.push(eq(schema.anomalies.kind, params.kind));
33
46
  }
34
47
 
48
+ const suppression = params.suppression ?? "active";
49
+ if (suppression === "active") {
50
+ conditions.push(isNull(schema.anomalies.suppressedAt));
51
+ } else if (suppression === "suppressed") {
52
+ conditions.push(isNotNull(schema.anomalies.suppressedAt));
53
+ }
54
+
35
55
  const whereClause = conditions.length > 0 ? and(...conditions) : undefined;
36
56
 
37
57
  const results = await this.db
@@ -44,13 +64,112 @@ export class AnomalyService {
44
64
  return results.map((r) => ({
45
65
  ...r,
46
66
  startedAt: r.startedAt.toISOString(),
47
-
67
+
48
68
  confirmedAt: r.confirmedAt?.toISOString() ?? null,
49
-
69
+
50
70
  recoveredAt: r.recoveredAt?.toISOString() ?? null,
71
+
72
+ suppressedAt: r.suppressedAt?.toISOString() ?? null,
51
73
  }));
52
74
  }
53
75
 
76
+ /**
77
+ * Globally suppress a single anomaly row. Snapshots the current observed
78
+ * value and baseline so the inline detector can auto-unsuppress once the
79
+ * metric "changes again" (moves outside the relative reactivation band).
80
+ *
81
+ * The mutation is scoped by BOTH `anomalyId` and `systemId` — the access
82
+ * gate authorizes the caller on `systemId`, so we must verify the target row
83
+ * actually belongs to that system, otherwise a user with `feed.manage` on
84
+ * system A could suppress system B's anomaly by passing B's id (IDOR).
85
+ *
86
+ * Only confirmed (`state === "anomaly"`) rows can be suppressed: the
87
+ * auto-unsuppress re-evaluation lives in the confirmed-anomaly branch of the
88
+ * detector, and a suppressed suspicious row would otherwise still confirm and
89
+ * notify. Returns false if no matching confirmed row exists.
90
+ */
91
+ async suppressAnomaly({
92
+ anomalyId,
93
+ systemId,
94
+ }: {
95
+ anomalyId: string;
96
+ systemId: string;
97
+ }): Promise<boolean> {
98
+ const [existing] = await this.db
99
+ .select()
100
+ .from(schema.anomalies)
101
+ .where(
102
+ and(
103
+ eq(schema.anomalies.id, anomalyId),
104
+ eq(schema.anomalies.systemId, systemId),
105
+ ),
106
+ )
107
+ .limit(1);
108
+
109
+ if (!existing || existing.state !== "anomaly") return false;
110
+
111
+ const observedNumeric = Number(existing.observedValue);
112
+
113
+ await this.db
114
+ .update(schema.anomalies)
115
+ .set({
116
+ suppressedAt: new Date(),
117
+ suppressedValue: Number.isFinite(observedNumeric)
118
+ ? observedNumeric
119
+ : null,
120
+ suppressedBaseline: existing.baselineValue,
121
+ })
122
+ .where(
123
+ and(
124
+ eq(schema.anomalies.id, anomalyId),
125
+ eq(schema.anomalies.systemId, systemId),
126
+ ),
127
+ );
128
+
129
+ return true;
130
+ }
131
+
132
+ /**
133
+ * Clear suppression on a single anomaly row. Scoped by both `anomalyId` and
134
+ * `systemId` for the same IDOR reason as {@link suppressAnomaly}.
135
+ */
136
+ async unsuppressAnomaly({
137
+ anomalyId,
138
+ systemId,
139
+ }: {
140
+ anomalyId: string;
141
+ systemId: string;
142
+ }): Promise<boolean> {
143
+ const [existing] = await this.db
144
+ .select()
145
+ .from(schema.anomalies)
146
+ .where(
147
+ and(
148
+ eq(schema.anomalies.id, anomalyId),
149
+ eq(schema.anomalies.systemId, systemId),
150
+ ),
151
+ )
152
+ .limit(1);
153
+
154
+ if (!existing) return false;
155
+
156
+ await this.db
157
+ .update(schema.anomalies)
158
+ .set({
159
+ suppressedAt: null,
160
+ suppressedValue: null,
161
+ suppressedBaseline: null,
162
+ })
163
+ .where(
164
+ and(
165
+ eq(schema.anomalies.id, anomalyId),
166
+ eq(schema.anomalies.systemId, systemId),
167
+ ),
168
+ );
169
+
170
+ return true;
171
+ }
172
+
54
173
  async getAnomalyBaselines(params: {
55
174
  systemId: string;
56
175
  configurationId: string;
@@ -88,7 +207,10 @@ export class AnomalyService {
88
207
  });
89
208
  }
90
209
 
91
- return result.config as VersionedRecord<AnomalySettings>;
210
+ // Migrate-then-validate the stored record on read. `version: 1` with no
211
+ // migrations today, so this is behavior-preserving now and stays correct
212
+ // once a migration is added.
213
+ return anomalySettingsConfig.parseRecord(toVersionedRecord(result.config));
92
214
  }
93
215
 
94
216
  async updateAnomalyConfig(
@@ -97,7 +219,7 @@ export class AnomalyService {
97
219
  ) {
98
220
  const newConfigRecord = anomalySettingsConfig.create(configData);
99
221
 
100
- const [result] = await this.db
222
+ await this.db
101
223
  .insert(schema.anomalyConfigurations)
102
224
  .values({
103
225
  configurationId,
@@ -106,10 +228,11 @@ export class AnomalyService {
106
228
  .onConflictDoUpdate({
107
229
  target: [schema.anomalyConfigurations.configurationId],
108
230
  set: { config: newConfigRecord },
109
- })
110
- .returning();
231
+ });
111
232
 
112
- return result!.config;
233
+ // Return the validated record we just persisted (typed), rather than the
234
+ // untyped jsonb round-trip.
235
+ return newConfigRecord;
113
236
  }
114
237
 
115
238
  async getAnomalyAssignmentConfig(systemId: string, configurationId: string) {
@@ -123,22 +246,23 @@ export class AnomalyService {
123
246
  ),
124
247
  );
125
248
 
126
- return result
127
- ? (result.config as VersionedRecord<Partial<AnomalySettings>>)
128
- : undefined;
249
+ if (!result) return;
250
+
251
+ // Migrate-then-validate the stored override record on read (see
252
+ // getAnomalyConfig). `version: 1` with no migrations today.
253
+ return anomalyAssignmentConfig.parseRecord(
254
+ toVersionedRecord(result.config),
255
+ );
129
256
  }
130
257
 
131
258
  async updateAnomalyAssignmentConfig(
132
259
  systemId: string,
133
260
  configurationId: string,
134
- configData: Partial<AnomalySettings>,
261
+ configData: PartialAnomalySettings,
135
262
  ) {
136
- const newConfigRecord = {
137
- version: anomalySettingsConfig.version,
138
- data: configData,
139
- };
263
+ const newConfigRecord = anomalyAssignmentConfig.create(configData);
140
264
 
141
- const [result] = await this.db
265
+ await this.db
142
266
  .insert(schema.anomalyAssignments)
143
267
  .values({
144
268
  systemId,
@@ -151,10 +275,10 @@ export class AnomalyService {
151
275
  schema.anomalyAssignments.configurationId,
152
276
  ],
153
277
  set: { config: newConfigRecord },
154
- })
155
- .returning();
278
+ });
156
279
 
157
- return result.config;
280
+ // Return the validated record we just persisted (typed).
281
+ return newConfigRecord;
158
282
  }
159
283
 
160
284
  /**