@thotischner/observability-mcp 3.2.1 → 3.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/analysis/health.d.ts +13 -3
- package/dist/analysis/health.js +32 -16
- package/dist/analysis/health.test.js +31 -5
- package/dist/analysis/library.test.js +1 -1
- package/dist/conformance/mcp-2025-11-25.test.js +64 -0
- package/dist/connectors/loki.js +16 -1
- package/dist/connectors/loki.test.js +33 -2
- package/dist/index.js +146 -15
- package/dist/tools/detect-anomalies.js +45 -10
- package/dist/tools/get-service-health.js +76 -9
- package/dist/tools/handlers.test.js +86 -0
- package/dist/types.d.ts +15 -4
- package/package.json +1 -1
|
package/dist/analysis/health.d.ts
CHANGED
|
@@ -7,12 +7,22 @@ export interface HealthInputs {
|
|
|
7
7
|
logErrorRate: number;
|
|
8
8
|
}
|
|
9
9
|
export interface HealthResult {
|
|
10
|
-
score: number;
|
|
11
|
-
status: HealthStatus;
|
|
10
|
+
/** 0-100, or null when no signal had data (status "unknown"). */
|
|
11
|
+
score: number | null;
|
|
12
|
+
status: HealthStatus | "unknown";
|
|
12
13
|
details: Record<string, {
|
|
13
14
|
score: number;
|
|
14
15
|
value: number;
|
|
15
16
|
threshold: string;
|
|
16
17
|
}>;
|
|
17
18
|
}
|
|
18
|
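-
export declare function calculateHealthScore(inputs: HealthInputs, thresholds: HealthThresholds): HealthResult;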
|
|
19
|
+
/** Which signal families actually returned data. Missing/false families are
|
|
20
|
+
* excluded from the weighted score and the remaining weights are
|
|
21
|
+
* renormalised — so a log-only service is judged on its logs, not on metric
|
|
22
|
+
* zeros coerced to "healthy" (issue #453). Omit for the back-compat
|
|
23
|
+
* all-signals-present behaviour. */
|
|
24
|
+
export interface SignalCoverage {
|
|
25
|
+
metrics?: boolean;
|
|
26
|
+
logs?: boolean;
|
|
27
|
+
}
|
|
28
|
+
export declare function calculateHealthScore(inputs: HealthInputs, thresholds: HealthThresholds, coverage?: SignalCoverage): HealthResult;
|
package/dist/analysis/health.js
CHANGED
|
@@ -1,27 +1,43 @@
|
|
|
1
|
-
export function calculateHealthScore(inputs, thresholds) {
|
|
1
|
+
export function calculateHealthScore(inputs, thresholds, coverage) {
|
|
2
2
|
const w = thresholds.weights;
|
|
3
3
|
const t = thresholds;
|
|
4
|
+
const hasMetrics = coverage?.metrics !== false; // default: present (back-compat)
|
|
5
|
+
const hasLogs = coverage?.logs !== false;
|
|
4
6
|
const cpuScore = scoreFromThreshold(inputs.cpu, t.cpu.good, t.cpu.warn, t.cpu.crit);
|
|
5
7
|
const errorRateScore = scoreFromThreshold(inputs.errorRate, t.errorRate.good, t.errorRate.warn, t.errorRate.crit);
|
|
6
8
|
const latencyScore = scoreFromThreshold(inputs.latencyP99, t.latencyP99.good, t.latencyP99.warn, t.latencyP99.crit);
|
|
7
9
|
const logErrorScore = scoreFromThreshold(inputs.logErrorRate, t.logErrors.good, t.logErrors.warn, t.logErrors.crit);
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
10
|
+
// Only count the families that actually reported data; renormalise by the
|
|
11
|
+
// sum of their weights so a missing family is neither a free "100" nor a
|
|
12
|
+
// free "0". With full coverage the active weights sum to ~1 and this equals
|
|
13
|
+
// the previous behaviour.
|
|
14
|
+
let weighted = 0;
|
|
15
|
+
let activeWeight = 0;
|
|
16
|
+
if (hasMetrics) {
|
|
17
|
+
weighted += cpuScore * w.cpu + errorRateScore * w.errorRate + latencyScore * w.latency;
|
|
18
|
+
activeWeight += w.cpu + w.errorRate + w.latency;
|
|
19
|
+
}
|
|
20
|
+
if (hasLogs) {
|
|
21
|
+
weighted += logErrorScore * w.logErrors;
|
|
22
|
+
activeWeight += w.logErrors;
|
|
23
|
+
}
|
|
24
|
+
const details = {};
|
|
25
|
+
if (hasMetrics) {
|
|
26
|
+
details.cpu = { score: Math.round(cpuScore), value: inputs.cpu, threshold: `warn >${t.cpu.warn}%, crit >${t.cpu.crit}%` };
|
|
27
|
+
details.errorRate = { score: Math.round(errorRateScore), value: inputs.errorRate, threshold: `warn >${t.errorRate.warn}/s, crit >${t.errorRate.crit}/s` };
|
|
28
|
+
details.latencyP99 = { score: Math.round(latencyScore), value: inputs.latencyP99, threshold: `warn >${t.latencyP99.warn}s, crit >${t.latencyP99.crit}s` };
|
|
29
|
+
}
|
|
30
|
+
if (hasLogs) {
|
|
31
|
+
details.logErrors = { score: Math.round(logErrorScore), value: inputs.logErrorRate, threshold: `warn >${t.logErrors.warn}/min, crit >${t.logErrors.crit}/min` };
|
|
32
|
+
}
|
|
33
|
+
// No family reported data → honestly unknown, not a confident 100/healthy.
|
|
34
|
+
if (activeWeight === 0) {
|
|
35
|
+
return { score: null, status: "unknown", details };
|
|
36
|
+
}
|
|
37
|
+
const score = Math.round(Math.max(0, Math.min(100, weighted / activeWeight)));
|
|
13
38
|
const status = score > t.statusBoundaries.healthy ? "healthy" :
|
|
14
39
|
score > t.statusBoundaries.degraded ? "degraded" : "critical";
|
|
15
|
-
return {
|
|
16
|
-
score,
|
|
17
|
-
status,
|
|
18
|
-
details: {
|
|
19
|
-
cpu: { score: Math.round(cpuScore), value: inputs.cpu, threshold: `warn >${t.cpu.warn}%, crit >${t.cpu.crit}%` },
|
|
20
|
-
errorRate: { score: Math.round(errorRateScore), value: inputs.errorRate, threshold: `warn >${t.errorRate.warn}/s, crit >${t.errorRate.crit}/s` },
|
|
21
|
-
latencyP99: { score: Math.round(latencyScore), value: inputs.latencyP99, threshold: `warn >${t.latencyP99.warn}s, crit >${t.latencyP99.crit}s` },
|
|
22
|
-
logErrors: { score: Math.round(logErrorScore), value: inputs.logErrorRate, threshold: `warn >${t.logErrors.warn}/min, crit >${t.logErrors.crit}/min` },
|
|
23
|
-
},
|
|
24
|
-
};
|
|
40
|
+
return { score, status, details };
|
|
25
41
|
}
|
|
26
42
|
function scoreFromThreshold(value, good, warn, crit) {
|
|
27
43
|
if (value <= good)
|
|
package/dist/analysis/health.test.js
CHANGED
|
@@ -1,6 +1,11 @@
|
|
|
1
1
|
import { describe, it } from "node:test";
|
|
2
2
|
import assert from "node:assert/strict";
|
|
3
3
|
import { calculateHealthScore } from "./health.js";
|
|
4
|
+
// Full-coverage scores are never null; narrow for the existing assertions.
|
|
5
|
+
function num(r) {
|
|
6
|
+
assert.notEqual(r.score, null, "expected a numeric score");
|
|
7
|
+
return r.score;
|
|
8
|
+
}
|
|
4
9
|
const defaults = {
|
|
5
10
|
weights: { errorRate: 0.35, latency: 0.25, cpu: 0.20, logErrors: 0.20 },
|
|
6
11
|
cpu: { good: 50, warn: 80, crit: 95 },
|
|
@@ -22,21 +27,21 @@ describe("calculateHealthScore", () => {
|
|
|
22
27
|
cpu: 20, memory: 100_000_000, errorRate: 0.005, latencyP99: 0.3, logErrorRate: 0,
|
|
23
28
|
}, defaults);
|
|
24
29
|
assert.equal(result.status, "healthy");
|
|
25
|
-
assert.ok(result.score > 80);
|
|
30
|
+
assert.ok(num(result) > 80);
|
|
26
31
|
});
|
|
27
32
|
it("returns degraded for elevated values", () => {
|
|
28
33
|
const result = calculateHealthScore({
|
|
29
34
|
cpu: 65, memory: 200_000_000, errorRate: 0.05, latencyP99: 0.8, logErrorRate: 3,
|
|
30
35
|
}, defaults);
|
|
31
36
|
assert.equal(result.status, "degraded");
|
|
32
|
-
assert.ok(result.score > 50 && result.score <= 80, `Expected degraded score 50-80, got ${result.score}`);
|
|
37
|
+
assert.ok(num(result) > 50 && num(result) <= 80, `Expected degraded score 50-80, got ${result.score}`);
|
|
33
38
|
});
|
|
34
39
|
it("returns critical for extreme values", () => {
|
|
35
40
|
const result = calculateHealthScore({
|
|
36
41
|
cpu: 98, memory: 500_000_000, errorRate: 1.0, latencyP99: 5.0, logErrorRate: 50,
|
|
37
42
|
}, defaults);
|
|
38
43
|
assert.equal(result.status, "critical");
|
|
39
|
-
assert.ok(result.score <= 50);
|
|
44
|
+
assert.ok(num(result) <= 50);
|
|
40
45
|
});
|
|
41
46
|
it("score is between 0 and 100", () => {
|
|
42
47
|
for (const cpu of [0, 50, 100]) {
|
|
@@ -44,7 +49,7 @@ describe("calculateHealthScore", () => {
|
|
|
44
49
|
const result = calculateHealthScore({
|
|
45
50
|
cpu, memory: 0, errorRate: err, latencyP99: 0, logErrorRate: 0,
|
|
46
51
|
}, defaults);
|
|
47
|
-
assert.ok(result.score >= 0 && result.score <= 100, `Score ${result.score} out of range`);
|
|
52
|
+
assert.ok(num(result) >= 0 && num(result) <= 100, `Score ${result.score} out of range`);
|
|
48
53
|
}
|
|
49
54
|
}
|
|
50
55
|
});
|
|
@@ -57,7 +62,7 @@ describe("calculateHealthScore", () => {
|
|
|
57
62
|
cpu: 25, memory: 0, errorRate: 0, latencyP99: 0, logErrorRate: 0,
|
|
58
63
|
}, strict);
|
|
59
64
|
// CPU 25% with strict thresholds should lower the score
|
|
60
|
-
assert.ok(result.score < 100);
|
|
65
|
+
assert.ok(num(result) < 100);
|
|
61
66
|
});
|
|
62
67
|
it("includes detail breakdown", () => {
|
|
63
68
|
const result = calculateHealthScore({
|
|
@@ -67,4 +72,25 @@ describe("calculateHealthScore", () => {
|
|
|
67
72
|
assert.ok("errorRate" in result.details);
|
|
68
73
|
assert.ok(result.details.cpu.score < 100);
|
|
69
74
|
});
|
|
75
|
+
it("coverage: no signals at all → score null, status unknown (issue #453)", () => {
|
|
76
|
+
const r = calculateHealthScore({ cpu: 0, memory: 0, errorRate: 0, latencyP99: 0, logErrorRate: 0 }, defaults, { metrics: false, logs: false });
|
|
77
|
+
assert.equal(r.score, null);
|
|
78
|
+
assert.equal(r.status, "unknown");
|
|
79
|
+
assert.deepEqual(r.details, {});
|
|
80
|
+
});
|
|
81
|
+
it("coverage: log-only service is judged on logs, not metric zeros (issue #453)", () => {
|
|
82
|
+
// High log error rate, no metric coverage → must NOT come back healthy.
|
|
83
|
+
const r = calculateHealthScore({ cpu: 0, memory: 0, errorRate: 0, latencyP99: 0, logErrorRate: 50 }, defaults, { metrics: false, logs: true });
|
|
84
|
+
assert.notEqual(r.status, "healthy");
|
|
85
|
+
assert.ok(num(r) < 50, `log-only with 50 errors/min should not be healthy, got ${r.score}`);
|
|
86
|
+
assert.ok(!("cpu" in r.details), "metric details excluded when metrics absent");
|
|
87
|
+
assert.ok("logErrors" in r.details);
|
|
88
|
+
});
|
|
89
|
+
it("coverage: full coverage (default) is unchanged by the coverage param", () => {
|
|
90
|
+
const inputs = { cpu: 65, memory: 0, errorRate: 0.05, latencyP99: 0.8, logErrorRate: 3 };
|
|
91
|
+
const implicit = calculateHealthScore(inputs, defaults);
|
|
92
|
+
const explicit = calculateHealthScore(inputs, defaults, { metrics: true, logs: true });
|
|
93
|
+
assert.equal(implicit.score, explicit.score);
|
|
94
|
+
assert.equal(implicit.status, explicit.status);
|
|
95
|
+
});
|
|
70
96
|
});
|
|
package/dist/analysis/library.test.js
CHANGED
|
@@ -38,7 +38,7 @@ describe("embeddable analysis library", () => {
|
|
|
38
38
|
logErrors: { good: 1, warn: 5, crit: 10 },
|
|
39
39
|
statusBoundaries: { healthy: 80, degraded: 50 },
|
|
40
40
|
});
|
|
41
|
-
assert.ok(r.score >= 0 && r.score <= 100);
|
|
41
|
+
assert.ok(r.score !== null && r.score >= 0 && r.score <= 100);
|
|
42
42
|
assert.ok(["healthy", "degraded", "critical"].includes(r.status));
|
|
43
43
|
});
|
|
44
44
|
});
|
|
package/dist/conformance/mcp-2025-11-25.test.js
CHANGED
|
@@ -375,3 +375,67 @@ test("E2E tools/call: every registered tool dispatches over MCP and returns a Ca
|
|
|
375
375
|
assert.ok(Array.isArray(r.content), `tool ${name} must return content[]`);
|
|
376
376
|
}
|
|
377
377
|
});
|
|
378
|
+
test("E2E tools/list: every builtin tool advertises ToolAnnotations (readOnlyHint)", opts, async () => {
|
|
379
|
+
// AX hardening: all 12 builtin tools are read-only; clients (e.g. Claude)
|
|
380
|
+
// use these hints for auto-approve decisions, so they must be advertised
|
|
381
|
+
// over the live transport — not just present in the registration source.
|
|
382
|
+
const session = await newSession();
|
|
383
|
+
const { response } = await jsonRpc("tools/list", {}, { id: 2, session });
|
|
384
|
+
const r = response.result;
|
|
385
|
+
const tools = r.tools ?? [];
|
|
386
|
+
assert.ok(tools.length >= 12, `expected >=12 tools, got ${tools.length}`);
|
|
387
|
+
// Federated tools (namespaced `<prefix>.<tool>`) proxy upstream metadata and
|
|
388
|
+
// may legitimately lack annotations — only the builtin set is asserted.
|
|
389
|
+
const builtin = tools.filter((t) => t.name && !t.name.includes("."));
|
|
390
|
+
for (const t of builtin) {
|
|
391
|
+
assert.equal(t.annotations?.readOnlyHint, true, `tool ${t.name} must advertise annotations.readOnlyHint=true`);
|
|
392
|
+
assert.ok(t.annotations?.title, `tool ${t.name} must advertise annotations.title`);
|
|
393
|
+
}
|
|
394
|
+
});
|
|
395
|
+
test("E2E: builtin resource agent-usage-guide is listed and readable", opts, async () => {
|
|
396
|
+
// AX: the agent usage guide ships as an MCP resource so clients can pull
|
|
397
|
+
// it into context without a web fetch. Assert list + read over the wire.
|
|
398
|
+
const session = await newSession();
|
|
399
|
+
const list = await jsonRpc("resources/list", {}, { id: 10, session });
|
|
400
|
+
const resources = list.response.result?.resources ?? [];
|
|
401
|
+
assert.ok(resources.some((r) => r.uri === "omcp://guide/agent-usage"), `agent-usage-guide resource must be listed, got ${JSON.stringify(resources.map((r) => r.uri))}`);
|
|
402
|
+
const read = await jsonRpc("resources/read", { uri: "omcp://guide/agent-usage" }, { id: 11, session });
|
|
403
|
+
const contents = read.response.result?.contents ?? [];
|
|
404
|
+
assert.ok((contents[0]?.text ?? "").includes("Triage recipe"), "guide text must round-trip");
|
|
405
|
+
});
|
|
406
|
+
test("E2E: builtin prompts triage-incident + write-postmortem are listed and resolvable", opts, async () => {
|
|
407
|
+
const session = await newSession();
|
|
408
|
+
const list = await jsonRpc("prompts/list", {}, { id: 12, session });
|
|
409
|
+
const prompts = list.response.result?.prompts ?? [];
|
|
410
|
+
for (const name of ["triage-incident", "write-postmortem"]) {
|
|
411
|
+
assert.ok(prompts.some((p) => p.name === name), `prompt ${name} must be listed`);
|
|
412
|
+
}
|
|
413
|
+
const got = await jsonRpc("prompts/get", { name: "triage-incident", arguments: { service: "ci-probe" } }, { id: 13, session });
|
|
414
|
+
const msgs = got.response.result?.messages ?? [];
|
|
415
|
+
assert.ok((msgs[0]?.content?.text ?? "").includes('"ci-probe"'), "prompt must interpolate the service arg");
|
|
416
|
+
});
|
|
417
|
+
test("E2E: /llms.txt is served and reflects the canonical tool registry", opts, async () => {
|
|
418
|
+
// llms.txt convention: LLM-readable summary at the server root. Generated
|
|
419
|
+
// from registry-names.ts, so this also guards against registry drift.
|
|
420
|
+
const base = URL_ENV.replace(/\/mcp\/?$/, "");
|
|
421
|
+
const res = await fetch(`${base}/llms.txt`);
|
|
422
|
+
assert.equal(res.status, 200);
|
|
423
|
+
assert.match(res.headers.get("content-type") ?? "", /text\/plain/);
|
|
424
|
+
const text = await res.text();
|
|
425
|
+
assert.match(text, /^# observability-mcp/, "must start with the llms.txt H1");
|
|
426
|
+
for (const name of ["query_logs", "query_metrics", "enrich_ips", "get_blast_radius"]) {
|
|
427
|
+
assert.ok(text.includes(`- ${name} (`), `tool ${name} must be listed`);
|
|
428
|
+
}
|
|
429
|
+
assert.ok(text.includes("for-agents"), "must link the for-agents guide");
|
|
430
|
+
});
|
|
431
|
+
test("E2E: initialize advertises non-empty instructions pointing at the usage guide (issue #455)", opts, async () => {
|
|
432
|
+
const { response } = await jsonRpc("initialize", {
|
|
433
|
+
protocolVersion: PROTOCOL_VERSION,
|
|
434
|
+
capabilities: {},
|
|
435
|
+
clientInfo: { name: "harness", version: "0" },
|
|
436
|
+
}, { id: 30 });
|
|
437
|
+
const r = response.result;
|
|
438
|
+
assert.ok(r.instructions && r.instructions.length > 0, "initialize.instructions must be populated");
|
|
439
|
+
assert.match(r.instructions, /omcp:\/\/guide\/agent-usage/, "must point at the usage-guide resource");
|
|
440
|
+
assert.match(r.instructions, /aggregate/i, "must carry the filter+aggregate golden rule");
|
|
441
|
+
});
|
package/dist/connectors/loki.js
CHANGED
|
@@ -68,7 +68,12 @@ export function buildAggregateLogQL(streamPipeline, agg, duration) {
|
|
|
68
68
|
if (agg.op === "count_over_time") {
|
|
69
69
|
const stepSec = (agg.step && parseDurationSeconds(agg.step)) || defaultBucketSeconds(durSec);
|
|
70
70
|
const inner = `count_over_time(${streamPipeline} [${stepSec}s])`;
|
|
71
|
-
|
|
71
|
+
// Always wrap in sum() — even with no `by`. A bare count_over_time over a
|
|
72
|
+
// `| json`-piped stream keeps every extracted label (rid/ip/status/…) as
|
|
73
|
+
// its own series, so the "requests over time" headline case returns a
|
|
74
|
+
// high-cardinality mess instead of one bucketed total (issue #452). With
|
|
75
|
+
// no `by` we collapse to a single series; `sum by(...)` for explicit by.
|
|
76
|
+
const logql = `sum${byClause} (${inner})`;
|
|
72
77
|
return { logql, mode: "range", step: `${stepSec}s` };
|
|
73
78
|
}
|
|
74
79
|
// sum / topk: count over the whole window, then aggregate → instant vector.
|
|
@@ -195,6 +200,16 @@ export class LokiConnector {
|
|
|
195
200
|
const url = `/loki/api/v1/query_range?query=${encodeURIComponent(logql)}` +
|
|
196
201
|
`&start=${start}000000000&end=${end}000000000&limit=${limit}`;
|
|
197
202
|
const data = await this.apiGet(url);
|
|
203
|
+
// A log query yields resultType "streams". A metric query (e.g. a
|
|
204
|
+
// raw_query wrapping sum()/count() → vector/matrix) does NOT — and the
|
|
205
|
+
// streams parser below would dereference undefined `.stream`/`.values`
|
|
206
|
+
// and crash on `.level` (issue #452). Fail fast with a clear, actionable
|
|
207
|
+
// message instead of running to timeout on a wrong-shaped result.
|
|
208
|
+
const resultType = data?.data?.resultType;
|
|
209
|
+
if (resultType && resultType !== "streams") {
|
|
210
|
+
throw new Error(`query_logs raw_query returned a '${resultType}' result, but query_logs handles log lines (streams) only. ` +
|
|
211
|
+
"For counts/sums/top-k use the `aggregate` param on query_logs; for arbitrary vector/matrix LogQL use query_metrics raw_query.");
|
|
212
|
+
}
|
|
198
213
|
const entries = [];
|
|
199
214
|
for (const stream of data?.data?.result || []) {
|
|
200
215
|
const labels = stream.stream;
|
|
package/dist/connectors/loki.test.js
CHANGED
|
@@ -96,6 +96,34 @@ describe("Q-LOG1: queryLogs LogQL assembly", () => {
|
|
|
96
96
|
});
|
|
97
97
|
assert.equal(q, '{job="raw"}');
|
|
98
98
|
});
|
|
99
|
+
it("#452: rawQuery returning a vector/matrix fails fast with a clear message (no .level crash)", async () => {
|
|
100
|
+
const conn = new LokiConnector();
|
|
101
|
+
await conn.connect({ name: "loki", type: "loki", url: "http://loki:3100", enabled: true });
|
|
102
|
+
const orig = globalThis.fetch;
|
|
103
|
+
globalThis.fetch = (async (url) => {
|
|
104
|
+
const u = String(url);
|
|
105
|
+
if (u.includes("/label/") && u.includes("/values"))
|
|
106
|
+
return jsonRes({ data: ["app"] });
|
|
107
|
+
// A metric raw_query (sum(count_over_time(...))) returns resultType matrix,
|
|
108
|
+
// not streams — must not be fed to the streams parser.
|
|
109
|
+
if (u.includes("/query_range")) {
|
|
110
|
+
return jsonRes({ data: { resultType: "matrix", result: [{ metric: { url: "/" }, values: [[1000, "5"]] }] } });
|
|
111
|
+
}
|
|
112
|
+
return jsonRes({ data: [] });
|
|
113
|
+
});
|
|
114
|
+
try {
|
|
115
|
+
await conn.queryLogs({ rawQuery: "sum(count_over_time({app=\"x\"} | json [6h]))", duration: "6h" });
|
|
116
|
+
assert.fail("expected a thrown error for a non-streams raw_query result");
|
|
117
|
+
}
|
|
118
|
+
catch (err) {
|
|
119
|
+
const msg = err instanceof Error ? err.message : String(err);
|
|
120
|
+
assert.match(msg, /'matrix' result/);
|
|
121
|
+
assert.match(msg, /aggregate.*param|query_metrics raw_query/);
|
|
122
|
+
}
|
|
123
|
+
finally {
|
|
124
|
+
globalThis.fetch = orig;
|
|
125
|
+
}
|
|
126
|
+
});
|
|
99
127
|
});
|
|
100
128
|
describe("Q-LOG2: parseDurationSeconds / defaultBucketSeconds", () => {
|
|
101
129
|
it("parses m/h/d", () => {
|
|
@@ -118,11 +146,14 @@ describe("Q-LOG2: buildAggregateLogQL", () => {
|
|
|
118
146
|
assert.equal(r.step, "900s");
|
|
119
147
|
assert.equal(r.logql, `sum by (url) (count_over_time(${PIPE} [900s]))`);
|
|
120
148
|
});
|
|
121
|
-
it("count_over_time without by →
|
|
149
|
+
it("count_over_time without by → sum-wrapped (single series), default step (#452)", () => {
|
|
150
|
+
// Regression for issue #452: a bare count_over_time over a `| json` stream
|
|
151
|
+
// keeps every extracted label as its own series. With no `by` we must
|
|
152
|
+
// collapse to one bucketed total via sum(...).
|
|
122
153
|
const r = buildAggregateLogQL(PIPE, { op: "count_over_time" }, "1h");
|
|
123
154
|
assert.equal(r.mode, "range");
|
|
124
155
|
assert.equal(r.step, "60s");
|
|
125
|
-
assert.equal(r.logql, `count_over_time(${PIPE} [60s])`);
|
|
156
|
+
assert.equal(r.logql, `sum (count_over_time(${PIPE} [60s]))`);
|
|
126
157
|
});
|
|
127
158
|
it("sum → instant total per group over the whole window", () => {
|
|
128
159
|
const r = buildAggregateLogQL(PIPE, { op: "sum", by: ["status"] }, "1h");
|
package/dist/index.js
CHANGED
|
@@ -342,6 +342,18 @@ async function main() {
|
|
|
342
342
|
const mcpServer = new McpServer({
|
|
343
343
|
name: "observability-mcp",
|
|
344
344
|
version: SERVER_VERSION,
|
|
345
|
+
}, {
|
|
346
|
+
// `instructions` is the one channel the MCP spec auto-injects into the
|
|
347
|
+
// agent's context on connect (issue #455). Keep it tight: point at the
|
|
348
|
+
// full guide resource + the single rule that prevents the most common
|
|
349
|
+
// mistake (dumping raw rows), + the empty-state contract.
|
|
350
|
+
instructions: "Read MCP resource `omcp://guide/agent-usage` before heavy use. " +
|
|
351
|
+
"Golden rule: filter + aggregate server-side — use `query_logs`/`query_metrics` " +
|
|
352
|
+
"`labels` and `query_logs` `aggregate` to ask for numbers, not haystacks (raw log " +
|
|
353
|
+
"dumps blow past context limits). All tools are read-only. When a result is empty " +
|
|
354
|
+
"or refused, the message names the operator flag that unlocks it (e.g. OMCP_RAW_QUERY) " +
|
|
355
|
+
"— relay it verbatim. Prompts `triage-incident` and `write-postmortem` compose the tools. " +
|
|
356
|
+
"Hit a rough edge? Report it via the agent-report issue template (see the guide).",
|
|
345
357
|
});
|
|
346
358
|
const toolHandlers = new Map();
|
|
347
359
|
// --- Register tools with Zod schemas ---
|
|
@@ -393,15 +405,98 @@ async function main() {
|
|
|
393
405
|
}
|
|
394
406
|
return mcpServer.prompt(name, ...rest);
|
|
395
407
|
});
|
|
396
|
-
//
|
|
397
|
-
|
|
398
|
-
|
|
408
|
+
// --- Builtin resources + prompts (agent experience) -------------------
|
|
409
|
+
// The usage guide is the distilled, agent-validated workflow from issue
|
|
410
|
+
// #415 — served as an MCP resource so a client can pull it into context
|
|
411
|
+
// without a web fetch. Prompts compose the existing read-only tools into
|
|
412
|
+
// the two flows agents run most.
|
|
413
|
+
registerResource("agent-usage-guide", "omcp://guide/agent-usage", {
|
|
414
|
+
description: "How to use this gateway effectively as an agent: the proven filter→aggregate→enrich triage recipe, signal-vs-silence behaviours, and the operator flags that unlock optional tools.",
|
|
415
|
+
mimeType: "text/markdown",
|
|
416
|
+
}, async (uri) => ({
|
|
417
|
+
contents: [
|
|
418
|
+
{
|
|
419
|
+
uri: uri.toString(),
|
|
420
|
+
mimeType: "text/markdown",
|
|
421
|
+
text: [
|
|
422
|
+
"# Agent usage guide (observability-mcp)",
|
|
423
|
+
"",
|
|
424
|
+
"All tools are read-only (`readOnlyHint: true`). The golden rule:",
|
|
425
|
+
"**filter and aggregate server-side — ask for numbers, not haystacks.**",
|
|
426
|
+
"",
|
|
427
|
+
"## Triage recipe (agent-validated, issue #415)",
|
|
428
|
+
'1. `query_logs` with `labels` (exact-match field filters, e.g. {"environment":"prod"})',
|
|
429
|
+
' and `aggregate` ({"op":"topk","by":["ip"],"k":10} or {"op":"count_over_time","step":"15m"})',
|
|
430
|
+
" — pushed down to LogQL, returns a handful of numbers instead of thousands of rows.",
|
|
431
|
+
"2. `enrich_ips` with the IPs from step 1 — offline geo/ASN/hosting-flag lookup",
|
|
432
|
+
" (bot-vs-human signal). Requires OMCP_IP_ENRICH_FILE on the operator side.",
|
|
433
|
+
'3. `query_metrics` with `labels` ({"route":"/checkout"}) and `groupBy` to scope a',
|
|
434
|
+
" curated metric to the slice you care about.",
|
|
435
|
+
"",
|
|
436
|
+
"## Incident flow",
|
|
437
|
+
"`detect_anomalies` (fleet scan) → `get_service_health` (one-service verdict) →",
|
|
438
|
+
"`get_blast_radius` (shared-host impact) → `generate_postmortem` (markdown report).",
|
|
439
|
+
"",
|
|
440
|
+
"## When something is empty or refused",
|
|
441
|
+
"The gateway explains itself: no topology connector → explicit note; no trace",
|
|
442
|
+
"backend → explicit error; `raw_query` disabled → message naming OMCP_RAW_QUERY=on;",
|
|
443
|
+
"redacted values → a `_redacted` count in the result. Relay flag names to your",
|
|
444
|
+
"operator verbatim — the messages are written to be forwarded.",
|
|
445
|
+
"",
|
|
446
|
+
"## Report findings & collaborate",
|
|
447
|
+
"Structured agent reports drive releases here (see issue #415). File a bug or gap:",
|
|
448
|
+
"https://github.com/ThoTischner/observability-mcp/issues/new?template=agent-report.yml",
|
|
449
|
+
"For workflows, ideas, or to compare notes with other agents (collaboration welcome):",
|
|
450
|
+
"https://github.com/ThoTischner/observability-mcp/discussions",
|
|
451
|
+
"Note: posting publicly is an outward-facing action — do it on your operator's behalf, not unprompted.",
|
|
452
|
+
"Full guide: https://thotischner.github.io/observability-mcp/for-agents/",
|
|
453
|
+
].join("\n"),
|
|
454
|
+
},
|
|
455
|
+
],
|
|
456
|
+
}));
|
|
457
|
+
registerPrompt("triage-incident", "Guided incident triage for one service: health verdict, anomaly scan, blast radius, and the log slice that matters.", { service: z.string().describe("Service name as returned by list_services") }, ({ service }) => ({
|
|
458
|
+
messages: [
|
|
459
|
+
{
|
|
460
|
+
role: "user",
|
|
461
|
+
content: {
|
|
462
|
+
type: "text",
|
|
463
|
+
text: [
|
|
464
|
+
`Triage the service "${service}" using the observability-mcp tools, in this order:`,
|
|
465
|
+
`1. get_service_health {"service":"${service}"} — the current verdict and why.`,
|
|
466
|
+
`2. detect_anomalies {"service":"${service}","duration":"1h"} — what is statistically off.`,
|
|
467
|
+
`3. get_blast_radius {"resource":"${service}"} — who else fails if its host fails.`,
|
|
468
|
+
`4. query_logs {"service":"${service}","level":"error","aggregate":{"op":"count_over_time","step":"5m"},"duration":"1h"} — error-volume shape over time; drill into raw rows only for the spike window.`,
|
|
469
|
+
"Then summarise: current state, most likely cause, blast radius, and the next diagnostic step. Prefer aggregated queries over raw log dumps.",
|
|
470
|
+
].join("\n"),
|
|
471
|
+
},
|
|
472
|
+
},
|
|
473
|
+
],
|
|
474
|
+
}));
|
|
475
|
+
registerPrompt("write-postmortem", "Generate and refine a post-incident report for one service over a window.", {
|
|
476
|
+
service: z.string().describe("Service name as returned by list_services"),
|
|
477
|
+
duration: z.string().optional().describe("Look-back window, e.g. '1h', '6h'. Default '1h'."),
|
|
478
|
+
}, ({ service, duration }) => ({
|
|
479
|
+
messages: [
|
|
480
|
+
{
|
|
481
|
+
role: "user",
|
|
482
|
+
content: {
|
|
483
|
+
type: "text",
|
|
484
|
+
text: [
|
|
485
|
+
`Produce a post-mortem for "${service}" over the last ${duration || "1h"}:`,
|
|
486
|
+
`1. generate_postmortem {"service":"${service}","duration":"${duration || "1h"}"} — the stitched report (anomaly timeline, blast radius, traces, log highlights).`,
|
|
487
|
+
`2. Verify its claims: get_anomaly_history {"service":"${service}","duration":"${duration || "1h"}"} for the score timeline, and query_logs with an aggregate for the error shape.`,
|
|
488
|
+
"3. Rewrite the result as a blameless post-mortem: summary, impact, timeline, root-cause hypothesis (with confidence), follow-ups. Mark any section the gateway reported as missing data instead of inventing content.",
|
|
489
|
+
].join("\n"),
|
|
490
|
+
},
|
|
491
|
+
},
|
|
492
|
+
],
|
|
493
|
+
}));
|
|
399
494
|
registerTool("list_sources", [
|
|
400
495
|
"List the configured observability backends (Prometheus, Loki, and any connector) and whether each is currently reachable.",
|
|
401
496
|
"When to use: call this first to learn which source names exist and are healthy before passing `source` to other tools, or to debug why a query returns no data.",
|
|
402
497
|
"Behavior: read-only, no side effects. Returns one entry per source with its name, type, signal types (metrics/logs), and a live up/down status (the backend URL is intentionally not exposed — it may carry embedded credentials). Never throws for an unreachable backend — the backend is reported as down instead.",
|
|
403
498
|
"Related: use `list_services` to see what is monitored within these sources.",
|
|
404
|
-
].join(" "), {}, async () => {
|
|
499
|
+
].join(" "), {}, { title: "List Sources", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: false }, async () => {
|
|
405
500
|
await enforceEntitledAccess(ctx, { tool: "list_sources" });
|
|
406
501
|
return withToolMetrics("list_sources", () => listSourcesHandler(registry, ctx));
|
|
407
502
|
});
|
|
@@ -415,7 +510,7 @@ async function main() {
|
|
|
415
510
|
.string()
|
|
416
511
|
.optional()
|
|
417
512
|
.describe("Optional case-insensitive substring to narrow the result to matching service names (e.g. 'payment'). Omit to list every discovered service."),
|
|
418
|
-
}, async (args) => {
|
|
513
|
+
}, { title: "List Services", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: false }, async (args) => {
|
|
419
514
|
await enforceEntitledAccess(ctx, { tool: "list_services" });
|
|
420
515
|
const result = await withToolMetrics("list_services", () => listServicesHandler(registry, args, ctx));
|
|
421
516
|
return enrichToolServicesText(result, ctx);
|
|
@@ -458,7 +553,7 @@ async function main() {
|
|
|
458
553
|
.string()
|
|
459
554
|
.optional()
|
|
460
555
|
.describe("Optional escape hatch: a verbatim PromQL expression, run as-is over the range — for ad-hoc queries the curated `metric` catalog can't express (any series, any function, broken down by any label). When set, `metric`/`service`/`groupBy`/`labels` are ignored. DISABLED by default; the operator must enable the raw-query capability (OMCP_RAW_QUERY=on) or the call is refused. Still tenant-scoped and source-allow-listed."),
|
|
461
|
-
}, async (args) => {
|
|
556
|
+
}, { title: "Query Metrics", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: false }, async (args) => {
|
|
462
557
|
await enforceEntitledAccess(ctx, { tool: "query_metrics", source: args?.source, service: args?.service });
|
|
463
558
|
const result = await withToolMetrics("query_metrics", () => queryMetricsHandler(registry, args, ctx, { allowRawQuery: RAW_QUERY_ENABLED }));
|
|
464
559
|
return chargeTokenBudget(result, ctx, "query_metrics");
|
|
@@ -466,6 +561,7 @@ async function main() {
|
|
|
466
561
|
registerTool("query_logs", [
|
|
467
562
|
"Fetch recent log entries for ONE service over a look-back window, with a pre-computed summary (error/warning counts and the most frequent error patterns).",
|
|
468
563
|
"When to use: to inspect what a service actually logged, or to investigate an error spike surfaced by `detect_anomalies` / `get_service_health`. For numeric metrics use `query_metrics` instead.",
|
|
564
|
+
"Golden rule: filter + aggregate server-side — pass `labels` to scope and `aggregate` (count_over_time/sum/topk) to get numbers, not raw rows. A high-volume window returned raw will blow past your context limit.",
|
|
469
565
|
"Prerequisites: get the exact service name from `list_services` (the service must expose a logs signal).",
|
|
470
566
|
"Behavior: read-only, no side effects. Returns the matching log entries (newest first, capped by `limit`) plus a summary with total/error/warn counts and top recurring error patterns. No matches yields an empty result with a zeroed summary; an unreachable backend yields a structured explanatory error, never an exception.",
|
|
471
567
|
].join(" "), {
|
|
@@ -525,7 +621,7 @@ async function main() {
|
|
|
525
621
|
.string()
|
|
526
622
|
.optional()
|
|
527
623
|
.describe("Optional escape hatch: a verbatim LogQL log query, run as-is — for selectors/pipelines the curated params can't express. When set, `service`/`labels`/`level`/`query` are ignored and it is mutually exclusive with `aggregate` (express aggregation in the LogQL itself). DISABLED by default; the operator must enable the raw-query capability (OMCP_RAW_QUERY=on) or the call is refused. Redaction still applies to the returned log lines."),
|
|
528
|
-
}, async (args) => {
|
|
624
|
+
}, { title: "Query Logs", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: false }, async (args) => {
|
|
529
625
|
await enforceEntitledAccess(ctx, { tool: "query_logs", source: args?.source, service: args?.service });
|
|
530
626
|
const result = await withToolMetrics("query_logs", () => queryLogsHandler(registry, args, ctx, { allowRawQuery: RAW_QUERY_ENABLED }));
|
|
531
627
|
// Redact PII / secrets from the log payload before it crosses the
|
|
@@ -565,7 +661,7 @@ async function main() {
|
|
|
565
661
|
service: z.string().describe("Service name to filter on."),
|
|
566
662
|
duration: z.string().optional().describe("Rolling window, e.g. '1h', '24h'. Default '1h'."),
|
|
567
663
|
method: z.string().optional().describe("Filter by detector method ('mad' / 'seasonality' / 'correlator'). Optional."),
|
|
568
|
-
}, async (args) => {
|
|
664
|
+
}, { title: "Anomaly History", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: false }, async (args) => {
|
|
569
665
|
await enforceEntitledAccess(ctx, { tool: "get_anomaly_history", service: args?.service });
|
|
570
666
|
const result = await withToolMetrics("get_anomaly_history", () => getAnomalyHistoryHandler(registry, args, ctx));
|
|
571
667
|
return chargeTokenBudget(result, ctx, "get_anomaly_history");
|
|
@@ -580,7 +676,7 @@ async function main() {
|
|
|
580
676
|
service: z.string().describe("Suspected root-cause service."),
|
|
581
677
|
duration: z.string().optional().describe("Window length, e.g. '1h', '6h'. Default '1h'."),
|
|
582
678
|
format: z.enum(["markdown", "json"]).optional().describe("'markdown' (default) or 'json'."),
|
|
583
|
-
}, async (args) => {
|
|
679
|
+
}, { title: "Generate Postmortem", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: false }, async (args) => {
|
|
584
680
|
await enforceEntitledAccess(ctx, { tool: "generate_postmortem", service: args?.service });
|
|
585
681
|
const result = await withToolMetrics("generate_postmortem", () => generatePostmortemHandler(registry, args, ctx));
|
|
586
682
|
return chargeTokenBudget(result, ctx, "generate_postmortem");
|
|
@@ -597,7 +693,7 @@ async function main() {
|
|
|
597
693
|
filter: z.string().optional().describe("Backend-native filter (TraceQL on Tempo, tag query on Jaeger). Optional."),
|
|
598
694
|
limit: z.number().int().positive().optional().describe("Soft cap on returned trace summaries. Default 50."),
|
|
599
695
|
errorsOnly: z.boolean().optional().describe("If true, only traces with at least one error span."),
|
|
600
|
-
}, async (args) => {
|
|
696
|
+
}, { title: "Query Traces", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: false }, async (args) => {
|
|
601
697
|
await enforceEntitledAccess(ctx, { tool: "query_traces", service: args?.service });
|
|
602
698
|
const result = await withToolMetrics("query_traces", () => queryTracesHandler(registry, args, ctx));
|
|
603
699
|
return chargeTokenBudget(result, ctx, "query_traces");
|
|
@@ -611,7 +707,7 @@ async function main() {
|
|
|
611
707
|
service: z
|
|
612
708
|
.string()
|
|
613
709
|
.describe("Required. Exact, case-sensitive service name exactly as returned by `list_services` (e.g. 'payment-service')."),
|
|
614
|
-
}, async (args) => {
|
|
710
|
+
}, { title: "Service Health", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: false }, async (args) => {
|
|
615
711
|
await enforceEntitledAccess(ctx, { tool: "get_service_health", service: args?.service });
|
|
616
712
|
const result = await withToolMetrics("get_service_health", () => getServiceHealthHandler(registry, args, ctx));
|
|
617
713
|
const enriched = enrichToolHealthText(result, String(args?.service ?? ""), ctx);
|
|
@@ -635,7 +731,7 @@ async function main() {
|
|
|
635
731
|
.enum(["low", "medium", "high"])
|
|
636
732
|
.optional()
|
|
637
733
|
.describe("Optional. Detection threshold: 'low' flags only strong deviations (>3σ), 'medium' is balanced (>2σ), 'high' is most sensitive and noisier (>1.5σ). Default: 'medium'."),
|
|
638
|
-
}, async (args) => {
|
|
734
|
+
}, { title: "Detect Anomalies", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: false }, async (args) => {
|
|
639
735
|
await enforceEntitledAccess(ctx, { tool: "detect_anomalies", source: args?.source, service: args?.service });
|
|
640
736
|
// P1: pass the anomaly-history sink so detected scores flow
|
|
641
737
|
// into the TSDB and `get_anomaly_history` returns real data.
|
|
@@ -666,7 +762,7 @@ async function main() {
|
|
|
666
762
|
.max(5000)
|
|
667
763
|
.optional()
|
|
668
764
|
.describe("Optional. Maximum resources to return; edges are trimmed to the kept set. Default 500, max 5000."),
|
|
669
|
-
}, async (args) => {
|
|
765
|
+
}, { title: "Topology Graph", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: false }, async (args) => {
|
|
670
766
|
await enforceEntitledAccess(ctx, { tool: "get_topology", source: args?.source });
|
|
671
767
|
return withToolMetrics("get_topology", () => getTopologyHandler(registry, args, ctx));
|
|
672
768
|
});
|
|
@@ -679,7 +775,7 @@ async function main() {
|
|
|
679
775
|
resource: z
|
|
680
776
|
.string()
|
|
681
777
|
.describe("Required. Resource to evaluate. Accepts the canonical id (e.g. 'k8s:pod:default/checkout-7f89d'), the exact resource name (e.g. 'checkout-7f89d'), or a unique substring of either."),
|
|
682
|
-
}, async (args) => {
|
|
778
|
+
}, { title: "Blast Radius", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: false }, async (args) => {
|
|
683
779
|
await enforceEntitledAccess(ctx, { tool: "get_blast_radius" });
|
|
684
780
|
return withToolMetrics("get_blast_radius", () => getBlastRadiusHandler(registry, args, ctx));
|
|
685
781
|
});
|
|
@@ -692,7 +788,7 @@ async function main() {
|
|
|
692
788
|
ips: z
|
|
693
789
|
.array(z.string())
|
|
694
790
|
.describe("Required. IPv4 address strings to enrich (e.g. ['203.0.113.5','198.51.100.9']). Max 1000 per call; invalid entries are returned with found=false rather than failing the batch."),
|
|
695
|
-
}, async (args) => {
|
|
791
|
+
}, { title: "Enrich IPs", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: false }, async (args) => {
|
|
696
792
|
await enforceEntitledAccess(ctx, { tool: "enrich_ips" });
|
|
697
793
|
return withToolMetrics("enrich_ips", async () => enrichIpsHandler(ipEnrichment, args, ctx));
|
|
698
794
|
});
|
|
@@ -1195,6 +1291,41 @@ async function main() {
|
|
|
1195
1291
|
// enough to skip the request-counter middleware.
|
|
1196
1292
|
let ready = false;
|
|
1197
1293
|
app.get("/healthz", (_req, res) => res.type("text").send("ok"));
|
|
1294
|
+
// /llms.txt — the llms.txt convention (llmstxt.org): a plain-text,
|
|
1295
|
+
// LLM-friendly summary of what this server is and how to use it. The
|
|
1296
|
+
// primary audience of this gateway IS an LLM agent, so the gateway
|
|
1297
|
+
// serves its own. Tool list is generated from the canonical registry
|
|
1298
|
+
// (registry-names.ts) so it can't drift from the real surface.
|
|
1299
|
+
const LLMS_TXT = [
|
|
1300
|
+
"# observability-mcp",
|
|
1301
|
+
"",
|
|
1302
|
+
`> Unified observability gateway for AI agents (v${SERVER_VERSION}). One MCP server`,
|
|
1303
|
+
"> for Prometheus, Loki, and any backend via pluggable connectors — with",
|
|
1304
|
+
"> server-side filtering/aggregation so agents get numbers, not haystacks.",
|
|
1305
|
+
"",
|
|
1306
|
+
"MCP endpoint: POST /mcp (Streamable HTTP) · also stdio (--stdio) and WebSocket (/mcp/ws).",
|
|
1307
|
+
"All tools are read-only and advertise MCP ToolAnnotations (readOnlyHint: true).",
|
|
1308
|
+
"MCP resource omcp://guide/agent-usage carries the agent usage guide;",
|
|
1309
|
+
"prompts triage-incident and write-postmortem compose the tools into workflows.",
|
|
1310
|
+
"",
|
|
1311
|
+
"## Tools",
|
|
1312
|
+
"",
|
|
1313
|
+
...REGISTERED_TOOLS.map((t) => `- ${t.name} (${t.category}): ${t.summary}`),
|
|
1314
|
+
"",
|
|
1315
|
+
"## Connect",
|
|
1316
|
+
"",
|
|
1317
|
+
" claude mcp add observability --transport http http://localhost:3000/mcp",
|
|
1318
|
+
"",
|
|
1319
|
+
"## Docs",
|
|
1320
|
+
"",
|
|
1321
|
+
"- For agents (start here): https://thotischner.github.io/observability-mcp/for-agents/",
|
|
1322
|
+
"- Documentation site: https://thotischner.github.io/observability-mcp/",
|
|
1323
|
+
"- Report a finding (agent-report template): https://github.com/ThoTischner/observability-mcp/issues/new?template=agent-report.yml",
|
|
1324
|
+
"- Discussions (agent collaboration welcome): https://github.com/ThoTischner/observability-mcp/discussions",
|
|
1325
|
+
"- Source: https://github.com/ThoTischner/observability-mcp",
|
|
1326
|
+
"",
|
|
1327
|
+
].join("\n");
|
|
1328
|
+
app.get("/llms.txt", (_req, res) => res.type("text/plain; charset=utf-8").send(LLMS_TXT));
|
|
1198
1329
|
// Procurement-time probe: the MCP spec revisions and transports the
|
|
1199
1330
|
// gateway supports. Static today — kept as a separate endpoint so a
|
|
1200
1331
|
// discovery tool / RFP probe / catalog scanner can resolve our
|
|
@@ -40,18 +40,42 @@ export async function detectAnomaliesHandler(registry, args, ctx = defaultContex
|
|
|
40
40
|
const tenantConnectors = registry.getByTenant(ctx.tenant);
|
|
41
41
|
const metricsConnectors = tenantConnectors.filter((c) => c.signalType === "metrics");
|
|
42
42
|
const logConnectors = tenantConnectors.filter((c) => c.signalType === "logs");
|
|
43
|
-
|
|
43
|
+
// Discover services from BOTH metrics and log connectors, tracking which
|
|
44
|
+
// signal each service exposes. Previously the fleet scan only enumerated
|
|
45
|
+
// metrics connectors, so a log-only service was silently dropped from the
|
|
46
|
+
// scan — and the "all healthy" all-clear never said so (issue #453B). Now
|
|
47
|
+
// log-only services are scanned (via their log error-rate, as the
|
|
48
|
+
// description promises) and the per-service coverage is reported.
|
|
49
|
+
const coverage = new Map();
|
|
50
|
+
const mark = (name, key) => {
|
|
51
|
+
const e = coverage.get(name) ?? { metrics: false, logs: false };
|
|
52
|
+
e[key] = true;
|
|
53
|
+
coverage.set(name, e);
|
|
54
|
+
};
|
|
55
|
+
for (const connector of metricsConnectors) {
|
|
56
|
+
try {
|
|
57
|
+
for (const s of await connector.listServices())
|
|
58
|
+
mark(s.name, "metrics");
|
|
59
|
+
}
|
|
60
|
+
catch { /* connector down — skip */ }
|
|
61
|
+
}
|
|
62
|
+
for (const connector of logConnectors) {
|
|
63
|
+
try {
|
|
64
|
+
for (const s of await connector.listServices())
|
|
65
|
+
mark(s.name, "logs");
|
|
66
|
+
}
|
|
67
|
+
catch { /* connector down — skip */ }
|
|
68
|
+
}
|
|
69
|
+
let serviceNames;
|
|
44
70
|
if (args.service) {
|
|
45
71
|
serviceNames = [args.service];
|
|
72
|
+
if (!coverage.has(args.service)) {
|
|
73
|
+
// Unknown to listServices — still attempt both signal paths.
|
|
74
|
+
coverage.set(args.service, { metrics: metricsConnectors.length > 0, logs: logConnectors.length > 0 });
|
|
75
|
+
}
|
|
46
76
|
}
|
|
47
77
|
else {
|
|
48
|
-
|
|
49
|
-
const services = await connector.listServices();
|
|
50
|
-
for (const s of services) {
|
|
51
|
-
if (!serviceNames.includes(s.name))
|
|
52
|
-
serviceNames.push(s.name);
|
|
53
|
-
}
|
|
54
|
-
}
|
|
78
|
+
serviceNames = [...coverage.keys()];
|
|
55
79
|
}
|
|
56
80
|
const allAnomalies = [];
|
|
57
81
|
const allCorrelations = [];
|
|
@@ -179,14 +203,25 @@ export async function detectAnomaliesHandler(registry, args, ctx = defaultContex
|
|
|
179
203
|
severity: a.severity,
|
|
180
204
|
})))
|
|
181
205
|
: { ranked: [], summary: "" };
|
|
206
|
+
// Per-service coverage so an "all healthy" all-clear is verifiable rather
|
|
207
|
+
// than silently partial: the caller sees exactly which services were
|
|
208
|
+
// scanned and on which signals (issue #453B).
|
|
209
|
+
const scanned = serviceNames.map((name) => {
|
|
210
|
+
const cov = coverage.get(name) ?? { metrics: false, logs: false };
|
|
211
|
+
const signals = [cov.metrics ? "metrics" : null, cov.logs ? "logs" : null].filter(Boolean);
|
|
212
|
+
return { service: name, signals };
|
|
213
|
+
});
|
|
214
|
+
const metricsCount = scanned.filter((s) => s.signals.includes("metrics")).length;
|
|
215
|
+
const logsCount = scanned.filter((s) => s.signals.includes("logs")).length;
|
|
182
216
|
const result = {
|
|
183
217
|
scannedServices: serviceNames.length,
|
|
218
|
+
coverage: { scanned },
|
|
184
219
|
anomalies: allAnomalies,
|
|
185
220
|
correlations: allCorrelations,
|
|
186
221
|
rootCause,
|
|
187
222
|
summary: allAnomalies.length === 0
|
|
188
|
-
?
|
|
189
|
-
: `${allAnomalies.length} anomal${allAnomalies.length === 1 ? "y" : "ies"} detected across ${[...new Set(allAnomalies.map((a) => a.service))].length} service(s).`,
|
|
223
|
+
? `No anomalies across ${serviceNames.length} scanned service(s) (${metricsCount} with metrics, ${logsCount} with logs).`
|
|
224
|
+
: `${allAnomalies.length} anomal${allAnomalies.length === 1 ? "y" : "ies"} detected across ${[...new Set(allAnomalies.map((a) => a.service))].length} of ${serviceNames.length} scanned service(s).`,
|
|
190
225
|
};
|
|
191
226
|
return {
|
|
192
227
|
content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
|
|
@@ -24,23 +24,38 @@ export async function getServiceHealthHandler(registry, args, ctx = defaultConte
|
|
|
24
24
|
const tenantConnectors = registry.getByTenant(ctx.tenant);
|
|
25
25
|
const metricsConnectors = tenantConnectors.filter((c) => c.signalType === "metrics");
|
|
26
26
|
const logConnectors = tenantConnectors.filter((c) => c.signalType === "logs");
|
|
27
|
-
// Gather metrics
|
|
27
|
+
// Gather metrics. Track whether any series actually returned data —
|
|
28
|
+
// absent metrics must NOT be coerced to 0 and read as a confident
|
|
29
|
+
// "healthy" (issue #453).
|
|
28
30
|
let cpu = 0, memory = 0, errorRate = 0, latencyP99 = 0;
|
|
31
|
+
let metricsHadData = false;
|
|
29
32
|
const anomalies = [];
|
|
30
33
|
for (const connector of metricsConnectors) {
|
|
31
34
|
if (!connector.queryMetrics)
|
|
32
35
|
continue;
|
|
33
36
|
try {
|
|
34
37
|
const cpuResult = await connector.queryMetrics({ service: args.service, metric: "cpu", duration: "5m" });
|
|
35
|
-
|
|
38
|
+
if (cpuResult.values.length > 0) {
|
|
39
|
+
cpu = cpuResult.summary.current;
|
|
40
|
+
metricsHadData = true;
|
|
41
|
+
}
|
|
36
42
|
checkAnomaly(cpuResult.values.map(v => v.value), "cpu", args.service, connector.name, anomalies);
|
|
37
43
|
const memResult = await connector.queryMetrics({ service: args.service, metric: "memory", duration: "5m" });
|
|
38
|
-
|
|
44
|
+
if (memResult.values.length > 0) {
|
|
45
|
+
memory = memResult.summary.current / 1_000_000;
|
|
46
|
+
metricsHadData = true;
|
|
47
|
+
} // MB for display
|
|
39
48
|
const errResult = await connector.queryMetrics({ service: args.service, metric: "error_rate", duration: "5m" });
|
|
40
|
-
|
|
49
|
+
if (errResult.values.length > 0) {
|
|
50
|
+
errorRate = errResult.summary.current;
|
|
51
|
+
metricsHadData = true;
|
|
52
|
+
}
|
|
41
53
|
checkAnomaly(errResult.values.map(v => v.value), "error_rate", args.service, connector.name, anomalies);
|
|
42
54
|
const latResult = await connector.queryMetrics({ service: args.service, metric: "latency_p99", duration: "5m" });
|
|
43
|
-
|
|
55
|
+
if (latResult.values.length > 0) {
|
|
56
|
+
latencyP99 = latResult.summary.current;
|
|
57
|
+
metricsHadData = true;
|
|
58
|
+
}
|
|
44
59
|
checkAnomaly(latResult.values.map(v => v.value), "latency_p99", args.service, connector.name, anomalies);
|
|
45
60
|
}
|
|
46
61
|
catch (err) {
|
|
@@ -50,12 +65,15 @@ export async function getServiceHealthHandler(registry, args, ctx = defaultConte
|
|
|
50
65
|
// Gather logs
|
|
51
66
|
let logErrorRate = 0;
|
|
52
67
|
let topErrors = [];
|
|
68
|
+
let logsHadData = false;
|
|
53
69
|
const correlations = [];
|
|
54
70
|
for (const connector of logConnectors) {
|
|
55
71
|
if (!connector.queryLogs)
|
|
56
72
|
continue;
|
|
57
73
|
try {
|
|
58
74
|
const logs = await connector.queryLogs({ service: args.service, duration: "5m", limit: 200 });
|
|
75
|
+
if (logs.summary.total > 0)
|
|
76
|
+
logsHadData = true; // real log coverage in the window
|
|
59
77
|
logErrorRate = logs.summary.errorCount; // errors in 5m window
|
|
60
78
|
topErrors = logs.summary.topPatterns;
|
|
61
79
|
// Cross-signal correlation
|
|
@@ -70,7 +88,31 @@ export async function getServiceHealthHandler(registry, args, ctx = defaultConte
|
|
|
70
88
|
console.error("Health check logs failed for %s:", sanitizeForLog(args.service), err);
|
|
71
89
|
}
|
|
72
90
|
}
|
|
73
|
-
//
|
|
91
|
+
// Honest signal coverage: judge the service only on the families that
|
|
92
|
+
// actually returned data, so a log-only (or absent) service is never
|
|
93
|
+
// coerced to a confident "healthy" from metric zeros (issue #453).
|
|
94
|
+
const coverage = { metrics: metricsHadData, logs: logsHadData };
|
|
95
|
+
// No data at all → either the service doesn't exist (typo / decommissioned)
|
|
96
|
+
// or it isn't monitored. Say so explicitly, like the other tools' empty
|
|
97
|
+
// states — don't return 100/healthy.
|
|
98
|
+
if (!metricsHadData && !logsHadData) {
|
|
99
|
+
const known = await knownServiceNames(tenantConnectors, args.service);
|
|
100
|
+
const note = known
|
|
101
|
+
? `No metric or log data for "${args.service}" in the last 5 minutes — the service exists but has no monitored signals (or was quiet). Health is unknown, not healthy.`
|
|
102
|
+
: `Service "${args.service}" was not found in any connected source. Check the exact name via list_services. (Not reporting a health score for a service that does not exist.)`;
|
|
103
|
+
const result = {
|
|
104
|
+
service: args.service,
|
|
105
|
+
status: "unknown",
|
|
106
|
+
score: null,
|
|
107
|
+
signals: { metrics: null, logs: null },
|
|
108
|
+
anomalies,
|
|
109
|
+
correlations,
|
|
110
|
+
coverage,
|
|
111
|
+
note,
|
|
112
|
+
};
|
|
113
|
+
return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
|
|
114
|
+
}
|
|
115
|
+
// Calculate health score over the covered signals only.
|
|
74
116
|
const { DEFAULT_HEALTH_THRESHOLDS } = await import("../config/loader.js");
|
|
75
117
|
const health = calculateHealthScore({
|
|
76
118
|
cpu,
|
|
@@ -78,22 +120,47 @@ export async function getServiceHealthHandler(registry, args, ctx = defaultConte
|
|
|
78
120
|
errorRate,
|
|
79
121
|
latencyP99,
|
|
80
122
|
logErrorRate,
|
|
81
|
-
}, _thresholds || DEFAULT_HEALTH_THRESHOLDS);
|
|
123
|
+
}, _thresholds || DEFAULT_HEALTH_THRESHOLDS, coverage);
|
|
82
124
|
const result = {
|
|
83
125
|
service: args.service,
|
|
84
126
|
status: health.status,
|
|
85
127
|
score: health.score,
|
|
86
128
|
signals: {
|
|
87
|
-
metrics: { cpu, memory, errorRate, latencyP99 },
|
|
88
|
-
logs: { errorRate: logErrorRate, topErrors },
|
|
129
|
+
metrics: metricsHadData ? { cpu, memory, errorRate, latencyP99 } : null,
|
|
130
|
+
logs: logsHadData ? { errorRate: logErrorRate, topErrors } : null,
|
|
89
131
|
},
|
|
90
132
|
anomalies,
|
|
91
133
|
correlations,
|
|
134
|
+
coverage,
|
|
135
|
+
note: !metricsHadData
|
|
136
|
+
? "No metrics signal for this service — score reflects logs only."
|
|
137
|
+
: !logsHadData
|
|
138
|
+
? "No logs signal for this service — score reflects metrics only."
|
|
139
|
+
: undefined,
|
|
92
140
|
};
|
|
93
141
|
return {
|
|
94
142
|
content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
|
|
95
143
|
};
|
|
96
144
|
}
|
|
145
|
+
/** Best-effort: does any connector in the tenant know this service name?
|
|
146
|
+
* Used only on the no-data path to distinguish "exists but unmonitored/quiet"
|
|
147
|
+
* from "doesn't exist (typo/decommissioned)". A connector that throws is
|
|
148
|
+
* treated as "can't confirm" and skipped. */
|
|
149
|
+
async function knownServiceNames(connectors, service) {
|
|
150
|
+
for (const c of connectors) {
|
|
151
|
+
if (!c.listServices)
|
|
152
|
+
continue;
|
|
153
|
+
try {
|
|
154
|
+
const svcs = await c.listServices();
|
|
155
|
+
if (svcs.some((s) => s.name === service))
|
|
156
|
+
return true;
|
|
157
|
+
}
|
|
158
|
+
catch {
|
|
159
|
+
/* can't confirm via this connector — keep checking */
|
|
160
|
+
}
|
|
161
|
+
}
|
|
162
|
+
return false;
|
|
163
|
+
}
|
|
97
164
|
function checkAnomaly(values, metric, service, source, anomalies) {
|
|
98
165
|
// Robust, metric-type-aware detector (same path as detect_anomalies):
|
|
99
166
|
// latency/error_rate/saturation are one-sided, so a *decrease* (e.g.
|
|
@@ -153,6 +153,38 @@ describe("listServicesHandler", () => {
|
|
|
153
153
|
assert.equal(data.total, 0);
|
|
154
154
|
});
|
|
155
155
|
});
|
|
156
|
+
describe("detectAnomaliesHandler — fleet coverage (issue #453B)", () => {
|
|
157
|
+
it("fleet scan includes log-only services and reports per-service coverage", async () => {
|
|
158
|
+
const reg = createRegistryWithMocks([
|
|
159
|
+
createMockConnector({
|
|
160
|
+
name: "prom1", type: "prometheus", signalType: "metrics",
|
|
161
|
+
listServices: async () => [{ name: "metric-svc", source: "prom1", signalType: "metrics" }],
|
|
162
|
+
queryMetrics: async () => ({
|
|
163
|
+
source: "prom1", service: "metric-svc", metric: "x", unit: "",
|
|
164
|
+
values: Array.from({ length: 30 }, (_, i) => ({ timestamp: new Date(Date.now() - (30 - i) * 9000).toISOString(), value: 20 })),
|
|
165
|
+
summary: { current: 20, average: 20, min: 20, max: 20, trend: "stable" },
|
|
166
|
+
}),
|
|
167
|
+
}),
|
|
168
|
+
createMockConnector({
|
|
169
|
+
name: "loki1", type: "loki", signalType: "logs",
|
|
170
|
+
listServices: async () => [{ name: "log-only-svc", source: "loki1", signalType: "logs" }],
|
|
171
|
+
queryLogs: async () => ({
|
|
172
|
+
source: "loki1", service: "log-only-svc", entries: [],
|
|
173
|
+
summary: { total: 10, errorCount: 0, warnCount: 0, topPatterns: [] },
|
|
174
|
+
}),
|
|
175
|
+
}),
|
|
176
|
+
]);
|
|
177
|
+
const data = JSON.parse((await detectAnomaliesHandler(reg, {})).content[0].text);
|
|
178
|
+
// Both services scanned — the log-only one is NOT silently dropped.
|
|
179
|
+
assert.equal(data.scannedServices, 2);
|
|
180
|
+
const names = data.coverage.scanned.map((s) => s.service).sort();
|
|
181
|
+
assert.deepEqual(names, ["log-only-svc", "metric-svc"]);
|
|
182
|
+
const logOnly = data.coverage.scanned.find((s) => s.service === "log-only-svc");
|
|
183
|
+
assert.deepEqual(logOnly.signals, ["logs"], "log-only service must be scanned via its logs signal");
|
|
184
|
+
// All-clear is no longer silently partial — it states the coverage.
|
|
185
|
+
assert.match(data.summary, /2 scanned service\(s\)/);
|
|
186
|
+
});
|
|
187
|
+
});
|
|
156
188
|
describe("detectAnomaliesHandler — A5 memory/OOM coverage", () => {
|
|
157
189
|
const flatMemory = () => ({
|
|
158
190
|
source: "prom1", service: "payment-service", metric: "memory", unit: "bytes",
|
|
@@ -255,3 +287,57 @@ describe("getServiceHealthHandler — one-sided latency (regression)", () => {
|
|
|
255
287
|
assert.equal(latAnom, undefined, `latency dropping must not be an anomaly, got: ${JSON.stringify(latAnom)}`);
|
|
256
288
|
});
|
|
257
289
|
});
|
|
290
|
+
describe("getServiceHealthHandler — honest no-data / not-found (issue #453)", () => {
|
|
291
|
+
const emptySeries = () => ({
|
|
292
|
+
source: "prom1", service: "x", metric: "x", unit: "",
|
|
293
|
+
values: [],
|
|
294
|
+
summary: { current: 0, average: 0, min: 0, max: 0, trend: "stable" },
|
|
295
|
+
});
|
|
296
|
+
function metricsConnector(known) {
|
|
297
|
+
return {
|
|
298
|
+
connect: async () => { }, disconnect: async () => { },
|
|
299
|
+
healthCheck: async () => ({ status: "up", latencyMs: 1 }),
|
|
300
|
+
getDefaultMetrics: () => [], getMetrics: () => [],
|
|
301
|
+
listServices: async () => known.map((name) => ({ name, source: "prom1", signalType: "metrics" })),
|
|
302
|
+
name: "prom1", type: "prometheus", signalType: "metrics",
|
|
303
|
+
queryMetrics: async () => emptySeries(), // no data for any metric
|
|
304
|
+
};
|
|
305
|
+
}
|
|
306
|
+
function regWith(...mocks) {
|
|
307
|
+
const reg = new ConnectorRegistry();
|
|
308
|
+
for (const m of mocks) {
|
|
309
|
+
reg.connectors.set(m.name, m);
|
|
310
|
+
reg.sourceConfigs.set(m.name, { name: m.name, type: m.type, url: "http://m", enabled: true });
|
|
311
|
+
}
|
|
312
|
+
return reg;
|
|
313
|
+
}
|
|
314
|
+
it("nonexistent service → status unknown, score null, not-found note (not 100/healthy)", async () => {
|
|
315
|
+
const reg = regWith(metricsConnector(["payment-service"])); // does NOT know the queried name
|
|
316
|
+
const data = JSON.parse((await getServiceHealthHandler(reg, { service: "nope-xyz" })).content[0].text);
|
|
317
|
+
assert.equal(data.status, "unknown");
|
|
318
|
+
assert.equal(data.score, null);
|
|
319
|
+
assert.equal(data.signals.metrics, null);
|
|
320
|
+
assert.match(data.note, /not found/i);
|
|
321
|
+
});
|
|
322
|
+
it("log-only service with errors → judged on logs, never 100/healthy from metric zeros", async () => {
|
|
323
|
+
const logs = {
|
|
324
|
+
connect: async () => { }, disconnect: async () => { },
|
|
325
|
+
healthCheck: async () => ({ status: "up", latencyMs: 1 }),
|
|
326
|
+
getDefaultMetrics: () => [], getMetrics: () => [],
|
|
327
|
+
listServices: async () => [{ name: "logapp", source: "loki1", signalType: "logs" }],
|
|
328
|
+
name: "loki1", type: "loki", signalType: "logs",
|
|
329
|
+
queryLogs: async () => ({
|
|
330
|
+
source: "loki1", service: "logapp", entries: [],
|
|
331
|
+
summary: { total: 60, errorCount: 40, warnCount: 0, topPatterns: ["boom"] },
|
|
332
|
+
}),
|
|
333
|
+
};
|
|
334
|
+
const reg = regWith(metricsConnector([]), logs);
|
|
335
|
+
const data = JSON.parse((await getServiceHealthHandler(reg, { service: "logapp" })).content[0].text);
|
|
336
|
+
assert.notEqual(data.status, "healthy");
|
|
337
|
+
assert.notEqual(data.status, "unknown");
|
|
338
|
+
assert.equal(data.signals.metrics, null, "metrics signal must be null when no metric data");
|
|
339
|
+
assert.ok(data.signals.logs, "logs signal must be present");
|
|
340
|
+
assert.deepEqual(data.coverage, { metrics: false, logs: true });
|
|
341
|
+
assert.ok(data.score !== null && data.score < 50, `40 errors/5min log-only must not be healthy, got ${data.score}`);
|
|
342
|
+
});
|
|
343
|
+
});
|
package/dist/types.d.ts
CHANGED
|
@@ -338,20 +338,31 @@ export interface AnomalyReport {
|
|
|
338
338
|
}
|
|
339
339
|
export interface ServiceHealth {
|
|
340
340
|
service: string;
|
|
341
|
-
|
|
342
|
-
|
|
341
|
+
/** "unknown" when the service has no data in any signal (or doesn't exist). */
|
|
342
|
+
status: HealthStatus | "unknown";
|
|
343
|
+
/** 0-100, or null when status is "unknown" (no signal had data). */
|
|
344
|
+
score: number | null;
|
|
343
345
|
signals: {
|
|
346
|
+
/** null when the service exposes no metrics signal / no metric data. */
|
|
344
347
|
metrics: {
|
|
345
348
|
cpu: number;
|
|
346
349
|
memory: number;
|
|
347
350
|
errorRate: number;
|
|
348
351
|
latencyP99: number;
|
|
349
|
-
};
|
|
352
|
+
} | null;
|
|
353
|
+
/** null when the service exposes no logs signal / no log data. */
|
|
350
354
|
logs: {
|
|
351
355
|
errorRate: number;
|
|
352
356
|
topErrors: string[];
|
|
353
|
-
};
|
|
357
|
+
} | null;
|
|
354
358
|
};
|
|
355
359
|
anomalies: AnomalyReport[];
|
|
356
360
|
correlations: string[];
|
|
361
|
+
/** Which signal families actually had data (drives the score weighting). */
|
|
362
|
+
coverage?: {
|
|
363
|
+
metrics: boolean;
|
|
364
|
+
logs: boolean;
|
|
365
|
+
};
|
|
366
|
+
/** Operator-facing explanation when status is "unknown" or coverage is partial. */
|
|
367
|
+
note?: string;
|
|
357
368
|
}
|
package/package.json
CHANGED