@checkstack/anomaly-backend 1.2.6 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +78 -0
- package/package.json +9 -9
- package/src/ai-projection.test.ts +4 -5
- package/src/plugin.ts +26 -2
- package/src/service.ts +39 -0
- package/src/system-signals.test.ts +97 -0
- package/src/system-signals.ts +40 -0
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,83 @@
|
|
|
1
1
|
# @checkstack/anomaly-backend
|
|
2
2
|
|
|
3
|
+
## 1.3.0
|
|
4
|
+
|
|
5
|
+
### Minor Changes
|
|
6
|
+
|
|
7
|
+
- dbb76a2: fix(ai): guide the assistant to find all issues and fix the anomaly tool
|
|
8
|
+
|
|
9
|
+
Two assistant problems reported in production:
|
|
10
|
+
|
|
11
|
+
1. Asked "are there any issues?", the model answered from a single source (an
|
|
12
|
+
SLO breach) and missed a system with a failing health check. The chat
|
|
13
|
+
system prompt now instructs the model to check ALL issue sources before
|
|
14
|
+
answering - failing health checks (`healthcheck_status`), breaching/at-risk
|
|
15
|
+
SLOs (`slo_listObjectives`), active anomalies (`anomaly_list`), and open
|
|
16
|
+
incidents (`incident_list`) - and not to stop after the first source. It
|
|
17
|
+
also tells the model that `systemId` must be a real system UUID (resolve a
|
|
18
|
+
name via the catalog tool first) and to never invent ids or filter values.
|
|
19
|
+
|
|
20
|
+
2. The anomaly tool was named `anomaly.explain` but actually LISTS anomalies
|
|
21
|
+
with optional filters. The misleading name led the model to pass a
|
|
22
|
+
non-existent filter value ("Type validation failed") and a system
|
|
23
|
+
name/anomaly id as `systemId` ("a value was malformed"). Renamed to
|
|
24
|
+
`anomaly.list` with a description that spells out the optional filters and
|
|
25
|
+
their valid enum values (state: suspicious|anomaly|recovered, kind:
|
|
26
|
+
spike|drift, suppression: active|suppressed|all) and that `systemId` is a
|
|
27
|
+
system UUID.
|
|
28
|
+
|
|
29
|
+
Also sharpened the `healthcheck.status` and `slo.listObjectives` tool
|
|
30
|
+
descriptions to be use-case oriented ("use when asked what is failing /
|
|
31
|
+
breaching").
|
|
32
|
+
|
|
33
|
+
BREAKING: the anomaly read tool's name changes from `anomaly_explain` to
|
|
34
|
+
`anomaly_list` over the MCP `tools/list` surface. MCP clients referencing it by
|
|
35
|
+
the old name must update.
|
|
36
|
+
|
|
37
|
+
- 0b6f01b: feat(anomaly): contribute anomaly signals to the backend system.issues aggregator
|
|
38
|
+
|
|
39
|
+
The anomaly plugin now registers a `system.issues` contributor (sourceId
|
|
40
|
+
`anomaly`) from its backend `init`, so the AI assistant surfaces confirmed
|
|
41
|
+
anomalies and suspicious states alongside incidents, SLOs, health checks, and
|
|
42
|
+
dependency problems.
|
|
43
|
+
|
|
44
|
+
The contributor enforces its own `anomaly_feed.read` access gate (returning an
|
|
45
|
+
empty map - never throwing - when the principal lacks access; service users are
|
|
46
|
+
trusted), then reads the current problem rows for every system from the shared,
|
|
47
|
+
durable `anomalies` table via a new global `getActiveSignalAnomalies` service
|
|
48
|
+
method (state = anomaly | suspicious, suppressed rows excluded). The answer is
|
|
49
|
+
therefore identical on every pod, and only systems with a current problem appear
|
|
50
|
+
in the result.
|
|
51
|
+
|
|
52
|
+
The row->signal mapping (source/tone/label/detail/href/accessRule/iconName) is
|
|
53
|
+
extracted into a new pure `deriveAnomalySignals` deriver in
|
|
54
|
+
`@checkstack/anomaly-common`, shared by both the backend contributor and the
|
|
55
|
+
frontend `AnomalySignalsFiller` so the two surfaces stay in lockstep. The
|
|
56
|
+
frontend filler now delegates to that deriver with unchanged behavior.
|
|
57
|
+
|
|
58
|
+
### Patch Changes
|
|
59
|
+
|
|
60
|
+
- Updated dependencies [dbb76a2]
|
|
61
|
+
- Updated dependencies [0b6f01b]
|
|
62
|
+
- Updated dependencies [0b6f01b]
|
|
63
|
+
- Updated dependencies [0b6f01b]
|
|
64
|
+
- @checkstack/ai-backend@0.3.0
|
|
65
|
+
- @checkstack/healthcheck-backend@1.7.0
|
|
66
|
+
- @checkstack/anomaly-common@1.4.0
|
|
67
|
+
- @checkstack/healthcheck-common@1.6.0
|
|
68
|
+
- @checkstack/catalog-backend@1.4.8
|
|
69
|
+
- @checkstack/backend-api@0.21.6
|
|
70
|
+
- @checkstack/gitops-backend@0.5.6
|
|
71
|
+
|
|
72
|
+
## 1.2.7
|
|
73
|
+
|
|
74
|
+
### Patch Changes
|
|
75
|
+
|
|
76
|
+
- Updated dependencies [2428bfc]
|
|
77
|
+
- @checkstack/ai-backend@0.2.0
|
|
78
|
+
- @checkstack/catalog-backend@1.4.7
|
|
79
|
+
- @checkstack/healthcheck-backend@1.6.7
|
|
80
|
+
|
|
3
81
|
## 1.2.6
|
|
4
82
|
|
|
5
83
|
### Patch Changes
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@checkstack/anomaly-backend",
|
|
3
|
-
"version": "1.2.6",
|
|
3
|
+
"version": "1.3.0",
|
|
4
4
|
"license": "Elastic-2.0",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "src/index.ts",
|
|
@@ -14,18 +14,18 @@
|
|
|
14
14
|
"lint:code": "eslint . --max-warnings 0"
|
|
15
15
|
},
|
|
16
16
|
"dependencies": {
|
|
17
|
-
"@checkstack/backend-api": "0.21.
|
|
18
|
-
"@checkstack/ai-backend": "0.
|
|
17
|
+
"@checkstack/backend-api": "0.21.6",
|
|
18
|
+
"@checkstack/ai-backend": "0.3.0",
|
|
19
19
|
"@checkstack/common": "0.15.0",
|
|
20
|
-
"@checkstack/anomaly-common": "1.
|
|
20
|
+
"@checkstack/anomaly-common": "1.4.0",
|
|
21
21
|
"@checkstack/signal-common": "0.2.9",
|
|
22
|
-
"@checkstack/healthcheck-common": "1.
|
|
22
|
+
"@checkstack/healthcheck-common": "1.6.0",
|
|
23
23
|
"@checkstack/queue-api": "0.3.12",
|
|
24
24
|
"@checkstack/cache-api": "0.3.12",
|
|
25
25
|
"@checkstack/cache-utils": "0.2.17",
|
|
26
|
-
"@checkstack/healthcheck-backend": "1.
|
|
27
|
-
"@checkstack/catalog-backend": "1.4.
|
|
28
|
-
"@checkstack/gitops-backend": "0.5.
|
|
26
|
+
"@checkstack/healthcheck-backend": "1.7.0",
|
|
27
|
+
"@checkstack/catalog-backend": "1.4.8",
|
|
28
|
+
"@checkstack/gitops-backend": "0.5.6",
|
|
29
29
|
"@checkstack/gitops-common": "0.6.3",
|
|
30
30
|
"@checkstack/catalog-common": "2.3.4",
|
|
31
31
|
"@checkstack/notification-common": "1.3.3",
|
|
@@ -38,7 +38,7 @@
|
|
|
38
38
|
"devDependencies": {
|
|
39
39
|
"@checkstack/drizzle-helper": "0.0.5",
|
|
40
40
|
"@checkstack/scripts": "0.6.1",
|
|
41
|
-
"@checkstack/test-utils-backend": "0.1.
|
|
41
|
+
"@checkstack/test-utils-backend": "0.1.40",
|
|
42
42
|
"@checkstack/tsconfig": "0.0.7",
|
|
43
43
|
"@types/bun": "^1.0.0",
|
|
44
44
|
"date-fns": "^4.4.0",
|
package/src/ai-projection.test.ts
CHANGED
|
@@ -5,20 +5,19 @@ import {
|
|
|
5
5
|
} from "@checkstack/ai-backend";
|
|
6
6
|
import { anomalyContract, pluginMetadata } from "@checkstack/anomaly-common";
|
|
7
7
|
|
|
8
|
-
describe("anomaly AI projection (anomaly.explain)", () => {
|
|
8
|
+
describe("anomaly AI projection (anomaly.list)", () => {
|
|
9
9
|
test("projects getAnomalies as a read-only tool with the source procedure's access rules", () => {
|
|
10
10
|
const tool = buildProjectedTool({
|
|
11
11
|
procedure: anomalyContract.getAnomalies,
|
|
12
12
|
sourcePluginMetadata: pluginMetadata,
|
|
13
13
|
procedureKey: "getAnomalies",
|
|
14
|
-
name: "anomaly.explain",
|
|
15
|
-
description:
|
|
16
|
-
"List detected anomalies (statistical sigma/drift) for context. Read-only.",
|
|
14
|
+
name: "anomaly.list",
|
|
15
|
+
description: "List detected anomalies (statistical spikes / drift).",
|
|
17
16
|
effect: "read",
|
|
18
17
|
execute: deferredProjectionExecute,
|
|
19
18
|
});
|
|
20
19
|
|
|
21
|
-
expect(tool.name).toBe("anomaly.explain");
|
|
20
|
+
expect(tool.name).toBe("anomaly.list");
|
|
22
21
|
expect(tool.effect).toBe("read");
|
|
23
22
|
|
|
24
23
|
// The projection inherits the source procedure's gating — it must NOT
|
package/src/plugin.ts
CHANGED
|
@@ -1,8 +1,11 @@
|
|
|
1
1
|
import { createBackendPlugin, coreServices, type SafeDatabase } from "@checkstack/backend-api";
|
|
2
2
|
import {
|
|
3
3
|
aiToolProjectionExtensionPoint,
|
|
4
|
+
systemSignalsExtensionPoint,
|
|
5
|
+
createSystemAccessResolver,
|
|
4
6
|
deferredProjectionExecute,
|
|
5
7
|
} from "@checkstack/ai-backend";
|
|
8
|
+
import { createAnomalySignalsContributor } from "./system-signals";
|
|
6
9
|
import { healthCheckHooks } from "@checkstack/healthcheck-backend";
|
|
7
10
|
import { setupBaselineAnalyzerJob } from "./jobs/baseline-analyzer";
|
|
8
11
|
import { processCheckCompleted } from "./detector";
|
|
@@ -58,9 +61,16 @@ export const plugin = createBackendPlugin({
|
|
|
58
61
|
procedure: anomalyContract.getAnomalies,
|
|
59
62
|
sourcePluginMetadata: pluginMetadata,
|
|
60
63
|
procedureKey: "getAnomalies",
|
|
61
|
-
name: "anomaly.explain",
|
|
64
|
+
name: "anomaly.list",
|
|
62
65
|
description:
|
|
63
|
-
"List detected anomalies (statistical sigma/drift) for context. Read-only.",
|
|
66
|
+
"List detected anomalies (statistical spikes / drift). Read-only. " +
|
|
67
|
+
"All filters are OPTIONAL - call with no arguments to list every " +
|
|
68
|
+
"anomaly, or narrow with: systemId (a system UUID from the catalog " +
|
|
69
|
+
"tool, never a system name), state (one of: suspicious, anomaly, " +
|
|
70
|
+
"recovered), kind (one of: spike, drift), suppression (one of: " +
|
|
71
|
+
"active, suppressed, all). Each result includes the anomaly's id and " +
|
|
72
|
+
"systemId. There is no per-anomaly 'explain' call - read the returned " +
|
|
73
|
+
"rows directly.",
|
|
64
74
|
effect: "read",
|
|
65
75
|
execute: deferredProjectionExecute,
|
|
66
76
|
});
|
|
@@ -110,6 +120,20 @@ export const plugin = createBackendPlugin({
|
|
|
110
120
|
|
|
111
121
|
const service = new AnomalyService(typedDb);
|
|
112
122
|
gitopsService = service;
|
|
123
|
+
|
|
124
|
+
// Contribute anomaly problem state to the dashboard `system.issues`
|
|
125
|
+
// aggregator. The contributor gates the originating principal on
|
|
126
|
+
// anomaly's own read rule and reads globally from shared Postgres - see
|
|
127
|
+
// createAnomalySignalsContributor.
|
|
128
|
+
env
|
|
129
|
+
.getExtensionPoint(systemSignalsExtensionPoint)
|
|
130
|
+
.contribute(
|
|
131
|
+
createAnomalySignalsContributor({
|
|
132
|
+
service,
|
|
133
|
+
resolver: createSystemAccessResolver(rpcClient),
|
|
134
|
+
}),
|
|
135
|
+
);
|
|
136
|
+
|
|
113
137
|
routerCache = createAnomalyRouterCache({ cacheManager, logger });
|
|
114
138
|
const router = createRouter(service, logger, routerCache);
|
|
115
139
|
rpc.registerRouter(router, anomalyContract);
|
package/src/service.ts
CHANGED
|
@@ -73,6 +73,45 @@ export class AnomalyService {
|
|
|
73
73
|
}));
|
|
74
74
|
}
|
|
75
75
|
|
|
76
|
+
/**
|
|
77
|
+
* Return the current "problem" anomaly rows across ALL systems, for the
|
|
78
|
+
* dashboard `system.issues` aggregator. Mirrors the frontend filler's two
|
|
79
|
+
* active queries (state = anomaly | suspicious, suppressed rows excluded) in a
|
|
80
|
+
* single global read so the backend signals match the frontend ones. Reads
|
|
81
|
+
* from shared, durable storage so every pod returns the same answer.
|
|
82
|
+
*/
|
|
83
|
+
async getActiveSignalAnomalies(): Promise<
|
|
84
|
+
Array<{
|
|
85
|
+
systemId: string;
|
|
86
|
+
configurationId: string;
|
|
87
|
+
fieldPath: string;
|
|
88
|
+
startedAt: string;
|
|
89
|
+
state: schema.AnomalyState;
|
|
90
|
+
}>
|
|
91
|
+
> {
|
|
92
|
+
const rows = await this.db
|
|
93
|
+
.select({
|
|
94
|
+
systemId: schema.anomalies.systemId,
|
|
95
|
+
configurationId: schema.anomalies.configurationId,
|
|
96
|
+
fieldPath: schema.anomalies.fieldPath,
|
|
97
|
+
startedAt: schema.anomalies.startedAt,
|
|
98
|
+
state: schema.anomalies.state,
|
|
99
|
+
})
|
|
100
|
+
.from(schema.anomalies)
|
|
101
|
+
.where(
|
|
102
|
+
and(
|
|
103
|
+
inArray(schema.anomalies.state, ["anomaly", "suspicious"]),
|
|
104
|
+
isNull(schema.anomalies.suppressedAt),
|
|
105
|
+
),
|
|
106
|
+
)
|
|
107
|
+
.orderBy(desc(schema.anomalies.startedAt));
|
|
108
|
+
|
|
109
|
+
return rows.map((r) => ({
|
|
110
|
+
...r,
|
|
111
|
+
startedAt: r.startedAt.toISOString(),
|
|
112
|
+
}));
|
|
113
|
+
}
|
|
114
|
+
|
|
76
115
|
/**
|
|
77
116
|
* Globally suppress a single anomaly row. Snapshots the current observed
|
|
78
117
|
* value and baseline so the inline detector can auto-unsuppress once the
|
package/src/system-signals.test.ts
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
import { describe, expect, test } from "bun:test";
|
|
2
|
+
import type { AuthUser } from "@checkstack/backend-api";
|
|
3
|
+
import { qualifyAccessRuleId } from "@checkstack/common";
|
|
4
|
+
import type { SystemAccessResolver } from "@checkstack/ai-backend";
|
|
5
|
+
import { anomalyAccess } from "@checkstack/anomaly-common";
|
|
6
|
+
import { createAnomalySignalsContributor } from "./system-signals";
|
|
7
|
+
import type { AnomalyService } from "./service";
|
|
8
|
+
|
|
9
|
+
type Rows = Awaited<ReturnType<AnomalyService["getActiveSignalAnomalies"]>>;
|
|
10
|
+
|
|
11
|
+
const stubService = (
|
|
12
|
+
rows: Rows,
|
|
13
|
+
): Pick<AnomalyService, "getActiveSignalAnomalies"> => ({
|
|
14
|
+
getActiveSignalAnomalies: async () => rows,
|
|
15
|
+
});
|
|
16
|
+
|
|
17
|
+
const sampleRows: Rows = [
|
|
18
|
+
{
|
|
19
|
+
systemId: "sys-1",
|
|
20
|
+
configurationId: "cfg-1",
|
|
21
|
+
fieldPath: "latency",
|
|
22
|
+
startedAt: "2026-06-07T10:00:00.000Z",
|
|
23
|
+
state: "anomaly",
|
|
24
|
+
},
|
|
25
|
+
{
|
|
26
|
+
systemId: "sys-2",
|
|
27
|
+
configurationId: "cfg-2",
|
|
28
|
+
fieldPath: "errors",
|
|
29
|
+
startedAt: "2026-06-07T11:00:00.000Z",
|
|
30
|
+
state: "suspicious",
|
|
31
|
+
},
|
|
32
|
+
];
|
|
33
|
+
|
|
34
|
+
// The per-source gate is owned/tested by createGatedSystemSignalsContributor.
|
|
35
|
+
const allowAll: SystemAccessResolver = {
|
|
36
|
+
accessibleSystemIds: async ({ systemIds }) => systemIds,
|
|
37
|
+
};
|
|
38
|
+
const denyAll: SystemAccessResolver = { accessibleSystemIds: async () => [] };
|
|
39
|
+
|
|
40
|
+
const withFeedRead: AuthUser = {
|
|
41
|
+
type: "user",
|
|
42
|
+
id: "u1",
|
|
43
|
+
accessRules: [
|
|
44
|
+
qualifyAccessRuleId(
|
|
45
|
+
{ pluginId: anomalyAccess.feed.read.pluginId },
|
|
46
|
+
anomalyAccess.feed.read,
|
|
47
|
+
),
|
|
48
|
+
],
|
|
49
|
+
};
|
|
50
|
+
|
|
51
|
+
describe("createAnomalySignalsContributor", () => {
|
|
52
|
+
test("uses the anomaly source id", () => {
|
|
53
|
+
const contributor = createAnomalySignalsContributor({
|
|
54
|
+
service: stubService([]),
|
|
55
|
+
resolver: allowAll,
|
|
56
|
+
});
|
|
57
|
+
expect(contributor.sourceId).toBe("anomaly");
|
|
58
|
+
});
|
|
59
|
+
|
|
60
|
+
test("wires the service + shared deriver for an authorized principal", async () => {
|
|
61
|
+
const contributor = createAnomalySignalsContributor({
|
|
62
|
+
service: stubService(sampleRows),
|
|
63
|
+
resolver: allowAll,
|
|
64
|
+
});
|
|
65
|
+
|
|
66
|
+
const map = await contributor.read({ principal: withFeedRead });
|
|
67
|
+
|
|
68
|
+
expect(Object.keys(map.signals).sort()).toEqual(["sys-1", "sys-2"]);
|
|
69
|
+
expect(map.signals["sys-1"]?.[0]).toMatchObject({
|
|
70
|
+
source: "anomaly",
|
|
71
|
+
tone: "warn",
|
|
72
|
+
label: "Anomaly detected",
|
|
73
|
+
});
|
|
74
|
+
expect(map.signals["sys-2"]?.[0]).toMatchObject({
|
|
75
|
+
source: "anomaly",
|
|
76
|
+
tone: "info",
|
|
77
|
+
label: "Suspicious behaviour",
|
|
78
|
+
});
|
|
79
|
+
});
|
|
80
|
+
|
|
81
|
+
test("routes a non-global user through the team gate (no grants -> nothing)", async () => {
|
|
82
|
+
const contributor = createAnomalySignalsContributor({
|
|
83
|
+
service: stubService(sampleRows),
|
|
84
|
+
resolver: denyAll,
|
|
85
|
+
});
|
|
86
|
+
const principal: AuthUser = {
|
|
87
|
+
type: "user",
|
|
88
|
+
id: "u1",
|
|
89
|
+
accessRules: ["catalog.system.read"],
|
|
90
|
+
};
|
|
91
|
+
|
|
92
|
+
expect(await contributor.read({ principal })).toEqual({
|
|
93
|
+
accessible: false,
|
|
94
|
+
signals: {},
|
|
95
|
+
});
|
|
96
|
+
});
|
|
97
|
+
});
|
package/src/system-signals.ts
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
import {
|
|
2
|
+
createGatedSystemSignalsContributor,
|
|
3
|
+
type SystemAccessResolver,
|
|
4
|
+
type SystemSignalsContributor,
|
|
5
|
+
} from "@checkstack/ai-backend";
|
|
6
|
+
import {
|
|
7
|
+
anomalyAccess,
|
|
8
|
+
deriveAnomalySignals,
|
|
9
|
+
ANOMALY_SIGNAL_SOURCE_ID,
|
|
10
|
+
} from "@checkstack/anomaly-common";
|
|
11
|
+
import type { AnomalyService } from "./service";
|
|
12
|
+
|
|
13
|
+
/**
|
|
14
|
+
* The slice of {@link AnomalyService} the contributor needs - the single global
|
|
15
|
+
* read of current problem rows. Narrowed so the contributor (and its test) does
|
|
16
|
+
* not depend on the full service surface.
|
|
17
|
+
*/
|
|
18
|
+
type SignalSource = Pick<AnomalyService, "getActiveSignalAnomalies">;
|
|
19
|
+
|
|
20
|
+
/**
|
|
21
|
+
* Build the anomaly contributor for the dashboard `system.issues` aggregator.
|
|
22
|
+
* Reads active (anomaly/suspicious) rows globally from shared Postgres and runs
|
|
23
|
+
* the SAME deriver the frontend filler uses. The per-source access gate (global
|
|
24
|
+
* `anomaly.feed.read` plus per-system team grants) is applied by
|
|
25
|
+
* {@link createGatedSystemSignalsContributor}.
|
|
26
|
+
*/
|
|
27
|
+
export const createAnomalySignalsContributor = ({
|
|
28
|
+
service,
|
|
29
|
+
resolver,
|
|
30
|
+
}: {
|
|
31
|
+
service: SignalSource;
|
|
32
|
+
resolver: SystemAccessResolver;
|
|
33
|
+
}): SystemSignalsContributor =>
|
|
34
|
+
createGatedSystemSignalsContributor({
|
|
35
|
+
sourceId: ANOMALY_SIGNAL_SOURCE_ID,
|
|
36
|
+
accessRule: anomalyAccess.feed.read,
|
|
37
|
+
resolver,
|
|
38
|
+
readSignals: async () =>
|
|
39
|
+
deriveAnomalySignals({ rows: await service.getActiveSignalAnomalies() }),
|
|
40
|
+
});
|