@thotischner/observability-mcp 3.3.0 → 3.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/analysis/correlator.js +1 -1
- package/dist/analysis/health.d.ts +13 -3
- package/dist/analysis/health.js +32 -16
- package/dist/analysis/health.test.js +31 -5
- package/dist/analysis/library.test.js +1 -1
- package/dist/conformance/mcp-2025-11-25.test.js +11 -0
- package/dist/connectors/loki.js +16 -1
- package/dist/connectors/loki.test.js +50 -2
- package/dist/connectors/prometheus.js +11 -1
- package/dist/connectors/prometheus.test.js +32 -6
- package/dist/index.js +19 -3
- package/dist/tools/detect-anomalies.js +45 -10
- package/dist/tools/get-service-health.js +76 -9
- package/dist/tools/handlers.test.js +88 -0
- package/dist/tools/query-logs-error-shape.test.d.ts +1 -0
- package/dist/tools/query-logs-error-shape.test.js +32 -0
- package/dist/tools/query-logs.js +5 -2
- package/dist/types.d.ts +21 -6
- package/package.json +1 -1
|
@@ -21,7 +21,7 @@ export function correlateSignals(anomalies, logResults, metricResults) {
|
|
|
21
21
|
for (const metric of serviceMetrics) {
|
|
22
22
|
if (metric.metric === anomaly.metric)
|
|
23
23
|
continue;
|
|
24
|
-
if (metric.summary.trend === "rising") {
|
|
24
|
+
if (metric.summary && metric.summary.trend === "rising") {
|
|
25
25
|
correlations.push(`${anomaly.service}: ${anomaly.metric} anomaly coincides with rising ${metric.metric} ` +
|
|
26
26
|
`(current: ${metric.summary.current.toFixed(2)})`);
|
|
27
27
|
}
|
|
@@ -7,12 +7,22 @@ export interface HealthInputs {
|
|
|
7
7
|
logErrorRate: number;
|
|
8
8
|
}
|
|
9
9
|
export interface HealthResult {
|
|
10
|
-
|
|
11
|
-
|
|
10
|
+
/** 0-100, or null when no signal had data (status "unknown"). */
|
|
11
|
+
score: number | null;
|
|
12
|
+
status: HealthStatus | "unknown";
|
|
12
13
|
details: Record<string, {
|
|
13
14
|
score: number;
|
|
14
15
|
value: number;
|
|
15
16
|
threshold: string;
|
|
16
17
|
}>;
|
|
17
18
|
}
|
|
18
|
-
|
|
19
|
+
/** Which signal families actually returned data. Missing/false families are
|
|
20
|
+
* excluded from the weighted score and the remaining weights are
|
|
21
|
+
* renormalised — so a log-only service is judged on its logs, not on metric
|
|
22
|
+
* zeros coerced to "healthy" (issue #453). Omit for the back-compat
|
|
23
|
+
* all-signals-present behaviour. */
|
|
24
|
+
export interface SignalCoverage {
|
|
25
|
+
metrics?: boolean;
|
|
26
|
+
logs?: boolean;
|
|
27
|
+
}
|
|
28
|
+
export declare function calculateHealthScore(inputs: HealthInputs, thresholds: HealthThresholds, coverage?: SignalCoverage): HealthResult;
|
package/dist/analysis/health.js
CHANGED
|
@@ -1,27 +1,43 @@
|
|
|
1
|
-
export function calculateHealthScore(inputs, thresholds) {
|
|
1
|
+
export function calculateHealthScore(inputs, thresholds, coverage) {
|
|
2
2
|
const w = thresholds.weights;
|
|
3
3
|
const t = thresholds;
|
|
4
|
+
const hasMetrics = coverage?.metrics !== false; // default: present (back-compat)
|
|
5
|
+
const hasLogs = coverage?.logs !== false;
|
|
4
6
|
const cpuScore = scoreFromThreshold(inputs.cpu, t.cpu.good, t.cpu.warn, t.cpu.crit);
|
|
5
7
|
const errorRateScore = scoreFromThreshold(inputs.errorRate, t.errorRate.good, t.errorRate.warn, t.errorRate.crit);
|
|
6
8
|
const latencyScore = scoreFromThreshold(inputs.latencyP99, t.latencyP99.good, t.latencyP99.warn, t.latencyP99.crit);
|
|
7
9
|
const logErrorScore = scoreFromThreshold(inputs.logErrorRate, t.logErrors.good, t.logErrors.warn, t.logErrors.crit);
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
10
|
+
// Only count the families that actually reported data; renormalise by the
|
|
11
|
+
// sum of their weights so a missing family is neither a free "100" nor a
|
|
12
|
+
// free "0". With full coverage the active weights sum to ~1 and this equals
|
|
13
|
+
// the previous behaviour.
|
|
14
|
+
let weighted = 0;
|
|
15
|
+
let activeWeight = 0;
|
|
16
|
+
if (hasMetrics) {
|
|
17
|
+
weighted += cpuScore * w.cpu + errorRateScore * w.errorRate + latencyScore * w.latency;
|
|
18
|
+
activeWeight += w.cpu + w.errorRate + w.latency;
|
|
19
|
+
}
|
|
20
|
+
if (hasLogs) {
|
|
21
|
+
weighted += logErrorScore * w.logErrors;
|
|
22
|
+
activeWeight += w.logErrors;
|
|
23
|
+
}
|
|
24
|
+
const details = {};
|
|
25
|
+
if (hasMetrics) {
|
|
26
|
+
details.cpu = { score: Math.round(cpuScore), value: inputs.cpu, threshold: `warn >${t.cpu.warn}%, crit >${t.cpu.crit}%` };
|
|
27
|
+
details.errorRate = { score: Math.round(errorRateScore), value: inputs.errorRate, threshold: `warn >${t.errorRate.warn}/s, crit >${t.errorRate.crit}/s` };
|
|
28
|
+
details.latencyP99 = { score: Math.round(latencyScore), value: inputs.latencyP99, threshold: `warn >${t.latencyP99.warn}s, crit >${t.latencyP99.crit}s` };
|
|
29
|
+
}
|
|
30
|
+
if (hasLogs) {
|
|
31
|
+
details.logErrors = { score: Math.round(logErrorScore), value: inputs.logErrorRate, threshold: `warn >${t.logErrors.warn}/min, crit >${t.logErrors.crit}/min` };
|
|
32
|
+
}
|
|
33
|
+
// No family reported data → honestly unknown, not a confident 100/healthy.
|
|
34
|
+
if (activeWeight === 0) {
|
|
35
|
+
return { score: null, status: "unknown", details };
|
|
36
|
+
}
|
|
37
|
+
const score = Math.round(Math.max(0, Math.min(100, weighted / activeWeight)));
|
|
13
38
|
const status = score > t.statusBoundaries.healthy ? "healthy" :
|
|
14
39
|
score > t.statusBoundaries.degraded ? "degraded" : "critical";
|
|
15
|
-
return {
|
|
16
|
-
score,
|
|
17
|
-
status,
|
|
18
|
-
details: {
|
|
19
|
-
cpu: { score: Math.round(cpuScore), value: inputs.cpu, threshold: `warn >${t.cpu.warn}%, crit >${t.cpu.crit}%` },
|
|
20
|
-
errorRate: { score: Math.round(errorRateScore), value: inputs.errorRate, threshold: `warn >${t.errorRate.warn}/s, crit >${t.errorRate.crit}/s` },
|
|
21
|
-
latencyP99: { score: Math.round(latencyScore), value: inputs.latencyP99, threshold: `warn >${t.latencyP99.warn}s, crit >${t.latencyP99.crit}s` },
|
|
22
|
-
logErrors: { score: Math.round(logErrorScore), value: inputs.logErrorRate, threshold: `warn >${t.logErrors.warn}/min, crit >${t.logErrors.crit}/min` },
|
|
23
|
-
},
|
|
24
|
-
};
|
|
40
|
+
return { score, status, details };
|
|
25
41
|
}
|
|
26
42
|
function scoreFromThreshold(value, good, warn, crit) {
|
|
27
43
|
if (value <= good)
|
|
@@ -1,6 +1,11 @@
|
|
|
1
1
|
import { describe, it } from "node:test";
|
|
2
2
|
import assert from "node:assert/strict";
|
|
3
3
|
import { calculateHealthScore } from "./health.js";
|
|
4
|
+
// Full-coverage scores are never null; narrow for the existing assertions.
|
|
5
|
+
function num(r) {
|
|
6
|
+
assert.notEqual(r.score, null, "expected a numeric score");
|
|
7
|
+
return r.score;
|
|
8
|
+
}
|
|
4
9
|
const defaults = {
|
|
5
10
|
weights: { errorRate: 0.35, latency: 0.25, cpu: 0.20, logErrors: 0.20 },
|
|
6
11
|
cpu: { good: 50, warn: 80, crit: 95 },
|
|
@@ -22,21 +27,21 @@ describe("calculateHealthScore", () => {
|
|
|
22
27
|
cpu: 20, memory: 100_000_000, errorRate: 0.005, latencyP99: 0.3, logErrorRate: 0,
|
|
23
28
|
}, defaults);
|
|
24
29
|
assert.equal(result.status, "healthy");
|
|
25
|
-
assert.ok(result.score > 80);
|
|
30
|
+
assert.ok(num(result) > 80);
|
|
26
31
|
});
|
|
27
32
|
it("returns degraded for elevated values", () => {
|
|
28
33
|
const result = calculateHealthScore({
|
|
29
34
|
cpu: 65, memory: 200_000_000, errorRate: 0.05, latencyP99: 0.8, logErrorRate: 3,
|
|
30
35
|
}, defaults);
|
|
31
36
|
assert.equal(result.status, "degraded");
|
|
32
|
-
assert.ok(result.score > 50 && result.score <= 80, `Expected degraded score 50-80, got ${result.score}`);
|
|
37
|
+
assert.ok(num(result) > 50 && num(result) <= 80, `Expected degraded score 50-80, got ${result.score}`);
|
|
33
38
|
});
|
|
34
39
|
it("returns critical for extreme values", () => {
|
|
35
40
|
const result = calculateHealthScore({
|
|
36
41
|
cpu: 98, memory: 500_000_000, errorRate: 1.0, latencyP99: 5.0, logErrorRate: 50,
|
|
37
42
|
}, defaults);
|
|
38
43
|
assert.equal(result.status, "critical");
|
|
39
|
-
assert.ok(result.score <= 50);
|
|
44
|
+
assert.ok(num(result) <= 50);
|
|
40
45
|
});
|
|
41
46
|
it("score is between 0 and 100", () => {
|
|
42
47
|
for (const cpu of [0, 50, 100]) {
|
|
@@ -44,7 +49,7 @@ describe("calculateHealthScore", () => {
|
|
|
44
49
|
const result = calculateHealthScore({
|
|
45
50
|
cpu, memory: 0, errorRate: err, latencyP99: 0, logErrorRate: 0,
|
|
46
51
|
}, defaults);
|
|
47
|
-
assert.ok(result.score >= 0 && result.score <= 100, `Score ${result.score} out of range`);
|
|
52
|
+
assert.ok(num(result) >= 0 && num(result) <= 100, `Score ${result.score} out of range`);
|
|
48
53
|
}
|
|
49
54
|
}
|
|
50
55
|
});
|
|
@@ -57,7 +62,7 @@ describe("calculateHealthScore", () => {
|
|
|
57
62
|
cpu: 25, memory: 0, errorRate: 0, latencyP99: 0, logErrorRate: 0,
|
|
58
63
|
}, strict);
|
|
59
64
|
// CPU 25% with strict thresholds should lower the score
|
|
60
|
-
assert.ok(result.score < 100);
|
|
65
|
+
assert.ok(num(result) < 100);
|
|
61
66
|
});
|
|
62
67
|
it("includes detail breakdown", () => {
|
|
63
68
|
const result = calculateHealthScore({
|
|
@@ -67,4 +72,25 @@ describe("calculateHealthScore", () => {
|
|
|
67
72
|
assert.ok("errorRate" in result.details);
|
|
68
73
|
assert.ok(result.details.cpu.score < 100);
|
|
69
74
|
});
|
|
75
|
+
it("coverage: no signals at all → score null, status unknown (issue #453)", () => {
|
|
76
|
+
const r = calculateHealthScore({ cpu: 0, memory: 0, errorRate: 0, latencyP99: 0, logErrorRate: 0 }, defaults, { metrics: false, logs: false });
|
|
77
|
+
assert.equal(r.score, null);
|
|
78
|
+
assert.equal(r.status, "unknown");
|
|
79
|
+
assert.deepEqual(r.details, {});
|
|
80
|
+
});
|
|
81
|
+
it("coverage: log-only service is judged on logs, not metric zeros (issue #453)", () => {
|
|
82
|
+
// High log error rate, no metric coverage → must NOT come back healthy.
|
|
83
|
+
const r = calculateHealthScore({ cpu: 0, memory: 0, errorRate: 0, latencyP99: 0, logErrorRate: 50 }, defaults, { metrics: false, logs: true });
|
|
84
|
+
assert.notEqual(r.status, "healthy");
|
|
85
|
+
assert.ok(num(r) < 50, `log-only with 50 errors/min should not be healthy, got ${r.score}`);
|
|
86
|
+
assert.ok(!("cpu" in r.details), "metric details excluded when metrics absent");
|
|
87
|
+
assert.ok("logErrors" in r.details);
|
|
88
|
+
});
|
|
89
|
+
it("coverage: full coverage (default) is unchanged by the coverage param", () => {
|
|
90
|
+
const inputs = { cpu: 65, memory: 0, errorRate: 0.05, latencyP99: 0.8, logErrorRate: 3 };
|
|
91
|
+
const implicit = calculateHealthScore(inputs, defaults);
|
|
92
|
+
const explicit = calculateHealthScore(inputs, defaults, { metrics: true, logs: true });
|
|
93
|
+
assert.equal(implicit.score, explicit.score);
|
|
94
|
+
assert.equal(implicit.status, explicit.status);
|
|
95
|
+
});
|
|
70
96
|
});
|
|
@@ -38,7 +38,7 @@ describe("embeddable analysis library", () => {
|
|
|
38
38
|
logErrors: { good: 1, warn: 5, crit: 10 },
|
|
39
39
|
statusBoundaries: { healthy: 80, degraded: 50 },
|
|
40
40
|
});
|
|
41
|
-
assert.ok(r.score >= 0 && r.score <= 100);
|
|
41
|
+
assert.ok(r.score !== null && r.score >= 0 && r.score <= 100);
|
|
42
42
|
assert.ok(["healthy", "degraded", "critical"].includes(r.status));
|
|
43
43
|
});
|
|
44
44
|
});
|
|
@@ -428,3 +428,14 @@ test("E2E: /llms.txt is served and reflects the canonical tool registry", opts,
|
|
|
428
428
|
}
|
|
429
429
|
assert.ok(text.includes("for-agents"), "must link the for-agents guide");
|
|
430
430
|
});
|
|
431
|
+
test("E2E: initialize advertises non-empty instructions pointing at the usage guide (issue #455)", opts, async () => {
|
|
432
|
+
const { response } = await jsonRpc("initialize", {
|
|
433
|
+
protocolVersion: PROTOCOL_VERSION,
|
|
434
|
+
capabilities: {},
|
|
435
|
+
clientInfo: { name: "harness", version: "0" },
|
|
436
|
+
}, { id: 30 });
|
|
437
|
+
const r = response.result;
|
|
438
|
+
assert.ok(r.instructions && r.instructions.length > 0, "initialize.instructions must be populated");
|
|
439
|
+
assert.match(r.instructions, /omcp:\/\/guide\/agent-usage/, "must point at the usage-guide resource");
|
|
440
|
+
assert.match(r.instructions, /aggregate/i, "must carry the filter+aggregate golden rule");
|
|
441
|
+
});
|
package/dist/connectors/loki.js
CHANGED
|
@@ -68,7 +68,12 @@ export function buildAggregateLogQL(streamPipeline, agg, duration) {
|
|
|
68
68
|
if (agg.op === "count_over_time") {
|
|
69
69
|
const stepSec = (agg.step && parseDurationSeconds(agg.step)) || defaultBucketSeconds(durSec);
|
|
70
70
|
const inner = `count_over_time(${streamPipeline} [${stepSec}s])`;
|
|
71
|
-
|
|
71
|
+
// Always wrap in sum() — even with no `by`. A bare count_over_time over a
|
|
72
|
+
// `| json`-piped stream keeps every extracted label (rid/ip/status/…) as
|
|
73
|
+
// its own series, so the "requests over time" headline case returns a
|
|
74
|
+
// high-cardinality mess instead of one bucketed total (issue #452). With
|
|
75
|
+
// no `by` we collapse to a single series; `sum by(...)` for explicit by.
|
|
76
|
+
const logql = `sum${byClause} (${inner})`;
|
|
72
77
|
return { logql, mode: "range", step: `${stepSec}s` };
|
|
73
78
|
}
|
|
74
79
|
// sum / topk: count over the whole window, then aggregate → instant vector.
|
|
@@ -195,6 +200,16 @@ export class LokiConnector {
|
|
|
195
200
|
const url = `/loki/api/v1/query_range?query=${encodeURIComponent(logql)}` +
|
|
196
201
|
`&start=${start}000000000&end=${end}000000000&limit=${limit}`;
|
|
197
202
|
const data = await this.apiGet(url);
|
|
203
|
+
// A log query yields resultType "streams". A metric query (e.g. a
|
|
204
|
+
// raw_query wrapping sum()/count() → vector/matrix) does NOT — and the
|
|
205
|
+
// streams parser below would dereference undefined `.stream`/`.values`
|
|
206
|
+
// and crash on `.level` (issue #452). Fail fast with a clear, actionable
|
|
207
|
+
// message instead of running to timeout on a wrong-shaped result.
|
|
208
|
+
const resultType = data?.data?.resultType;
|
|
209
|
+
if (resultType && resultType !== "streams") {
|
|
210
|
+
throw new Error(`query_logs raw_query returned a '${resultType}' result, but query_logs handles log lines (streams) only. ` +
|
|
211
|
+
"For counts/sums/top-k use the `aggregate` param on query_logs; for arbitrary vector/matrix LogQL use query_metrics raw_query.");
|
|
212
|
+
}
|
|
198
213
|
const entries = [];
|
|
199
214
|
for (const stream of data?.data?.result || []) {
|
|
200
215
|
const labels = stream.stream;
|
|
@@ -96,6 +96,34 @@ describe("Q-LOG1: queryLogs LogQL assembly", () => {
|
|
|
96
96
|
});
|
|
97
97
|
assert.equal(q, '{job="raw"}');
|
|
98
98
|
});
|
|
99
|
+
it("#452: rawQuery returning a vector/matrix fails fast with a clear message (no .level crash)", async () => {
|
|
100
|
+
const conn = new LokiConnector();
|
|
101
|
+
await conn.connect({ name: "loki", type: "loki", url: "http://loki:3100", enabled: true });
|
|
102
|
+
const orig = globalThis.fetch;
|
|
103
|
+
globalThis.fetch = (async (url) => {
|
|
104
|
+
const u = String(url);
|
|
105
|
+
if (u.includes("/label/") && u.includes("/values"))
|
|
106
|
+
return jsonRes({ data: ["app"] });
|
|
107
|
+
// A metric raw_query (sum(count_over_time(...))) returns resultType matrix,
|
|
108
|
+
// not streams — must not be fed to the streams parser.
|
|
109
|
+
if (u.includes("/query_range")) {
|
|
110
|
+
return jsonRes({ data: { resultType: "matrix", result: [{ metric: { url: "/" }, values: [[1000, "5"]] }] } });
|
|
111
|
+
}
|
|
112
|
+
return jsonRes({ data: [] });
|
|
113
|
+
});
|
|
114
|
+
try {
|
|
115
|
+
await conn.queryLogs({ rawQuery: "sum(count_over_time({app=\"x\"} | json [6h]))", duration: "6h" });
|
|
116
|
+
assert.fail("expected a thrown error for a non-streams raw_query result");
|
|
117
|
+
}
|
|
118
|
+
catch (err) {
|
|
119
|
+
const msg = err instanceof Error ? err.message : String(err);
|
|
120
|
+
assert.match(msg, /'matrix' result/);
|
|
121
|
+
assert.match(msg, /aggregate.*param|query_metrics raw_query/);
|
|
122
|
+
}
|
|
123
|
+
finally {
|
|
124
|
+
globalThis.fetch = orig;
|
|
125
|
+
}
|
|
126
|
+
});
|
|
99
127
|
});
|
|
100
128
|
describe("Q-LOG2: parseDurationSeconds / defaultBucketSeconds", () => {
|
|
101
129
|
it("parses m/h/d", () => {
|
|
@@ -118,11 +146,31 @@ describe("Q-LOG2: buildAggregateLogQL", () => {
|
|
|
118
146
|
assert.equal(r.step, "900s");
|
|
119
147
|
assert.equal(r.logql, `sum by (url) (count_over_time(${PIPE} [900s]))`);
|
|
120
148
|
});
|
|
121
|
-
it("count_over_time
|
|
149
|
+
it("count_over_time with a label-filter pipeline + no by → valid sum-wrapped LogQL (#452 leftover #2)", () => {
|
|
150
|
+
// The reporter saw an intermittent 400 on a label-filtered count_over_time
|
|
151
|
+
// and wondered if the collapse path emits different LogQL when a filter is
|
|
152
|
+
// present. It does not: the label filter lives in the streamPipeline
|
|
153
|
+
// (identical to the sum/topk path, which works), and the count_over_time
|
|
154
|
+
// branch wraps it verbatim. Assert the emitted LogQL is well-formed —
|
|
155
|
+
// `sum (count_over_time({sel} | json | environment="prod" [step]))` — so
|
|
156
|
+
// any future regression in the generated query is caught here.
|
|
157
|
+
const filtered = '{service_name="app"} | json | environment="prod"';
|
|
158
|
+
const r = buildAggregateLogQL(filtered, { op: "count_over_time", step: "1h" }, "6h");
|
|
159
|
+
assert.equal(r.mode, "range");
|
|
160
|
+
assert.equal(r.step, "3600s");
|
|
161
|
+
assert.equal(r.logql, `sum (count_over_time(${filtered} [3600s]))`);
|
|
162
|
+
// Structural sanity: balanced parens, sum-wrapped, single range selector.
|
|
163
|
+
assert.equal((r.logql.match(/\(/g) || []).length, (r.logql.match(/\)/g) || []).length);
|
|
164
|
+
assert.match(r.logql, /^sum \(count_over_time\(.*\[\d+s\]\)\)$/);
|
|
165
|
+
});
|
|
166
|
+
it("count_over_time without by → sum-wrapped (single series), default step (#452)", () => {
|
|
167
|
+
// Regression for issue #452: a bare count_over_time over a `| json` stream
|
|
168
|
+
// keeps every extracted label as its own series. With no `by` we must
|
|
169
|
+
// collapse to one bucketed total via sum(...).
|
|
122
170
|
const r = buildAggregateLogQL(PIPE, { op: "count_over_time" }, "1h");
|
|
123
171
|
assert.equal(r.mode, "range");
|
|
124
172
|
assert.equal(r.step, "60s");
|
|
125
|
-
assert.equal(r.logql, `count_over_time(${PIPE} [60s])`);
|
|
173
|
+
assert.equal(r.logql, `sum (count_over_time(${PIPE} [60s]))`);
|
|
126
174
|
});
|
|
127
175
|
it("sum → instant total per group over the whole window", () => {
|
|
128
176
|
const r = buildAggregateLogQL(PIPE, { op: "sum", by: ["status"] }, "1h");
|
|
@@ -240,6 +240,10 @@ export class PrometheusConnector {
|
|
|
240
240
|
resolvedSeries: promql,
|
|
241
241
|
resolvedLabel: label,
|
|
242
242
|
};
|
|
243
|
+
if (!result.summary) {
|
|
244
|
+
result.note = `No data: no '${params.metric}' series matched "${params.service}" in this window. ` +
|
|
245
|
+
"The service may expose logs only, or the metric name/label didn't match. Absent ≠ zero — summary is null rather than all-zeros.";
|
|
246
|
+
}
|
|
243
247
|
if (params.groupBy && groups.length > 1) {
|
|
244
248
|
result.groupBy = params.groupBy;
|
|
245
249
|
result.groups = groups;
|
|
@@ -295,6 +299,9 @@ export class PrometheusConnector {
|
|
|
295
299
|
resolvedSeries: rawQuery,
|
|
296
300
|
resolvedLabel: "",
|
|
297
301
|
};
|
|
302
|
+
if (!result.summary) {
|
|
303
|
+
result.note = "No data: the query returned no series in this window. Absent ≠ zero — summary is null rather than all-zeros.";
|
|
304
|
+
}
|
|
298
305
|
if (groups.length > 1)
|
|
299
306
|
result.groups = groups;
|
|
300
307
|
return result;
|
|
@@ -475,7 +482,10 @@ export class PrometheusConnector {
|
|
|
475
482
|
}
|
|
476
483
|
computeSummary(values) {
|
|
477
484
|
if (values.length === 0) {
|
|
478
|
-
|
|
485
|
+
// No data points → no-data, NOT a confident all-zeros reading. Coercing
|
|
486
|
+
// an empty series to {current:0,trend:"stable"} is indistinguishable
|
|
487
|
+
// from a service genuinely idling at 0 (issue #462).
|
|
488
|
+
return null;
|
|
479
489
|
}
|
|
480
490
|
const current = values[values.length - 1];
|
|
481
491
|
const average = values.reduce((a, b) => a + b, 0) / values.length;
|
|
@@ -53,13 +53,9 @@ describe("PrometheusConnector", () => {
|
|
|
53
53
|
});
|
|
54
54
|
});
|
|
55
55
|
describe("computeSummary", () => {
|
|
56
|
-
it("returns
|
|
56
|
+
it("returns null for empty array — no-data, not a false all-zeros reading (#462)", () => {
|
|
57
57
|
const s = proto.computeSummary([]);
|
|
58
|
-
assert.equal(s.current, 0);
|
|
59
|
-
assert.equal(s.average, 0);
|
|
60
|
-
assert.equal(s.min, 0);
|
|
61
|
-
assert.equal(s.max, 0);
|
|
62
|
-
assert.equal(s.trend, "stable");
|
|
58
|
+
assert.equal(s, null);
|
|
63
59
|
});
|
|
64
60
|
it("computes correct summary for values", () => {
|
|
65
61
|
const s = proto.computeSummary([10, 20, 30, 40]);
|
|
@@ -199,4 +195,34 @@ describe("PrometheusConnector", () => {
|
|
|
199
195
|
}
|
|
200
196
|
});
|
|
201
197
|
});
|
|
198
|
+
describe("queryMetrics no-data → null summary, not zero-fill (#462)", () => {
|
|
199
|
+
const fakeSource = { name: "test", type: "prometheus", url: "http://localhost:9090", enabled: true };
|
|
200
|
+
it("an empty result set yields values:[], summary:null, and a no-data note", async () => {
|
|
201
|
+
const connector = new PrometheusConnector();
|
|
202
|
+
await connector.connect({ ...fakeSource });
|
|
203
|
+
const orig = globalThis.fetch;
|
|
204
|
+
// raw_query bypasses the candidate-probe / label-resolve path and runs
|
|
205
|
+
// query_range directly — here it returns an empty result set (the
|
|
206
|
+
// no-data case: a logs-only service has no such metric series).
|
|
207
|
+
globalThis.fetch = (async () => ({
|
|
208
|
+
ok: true,
|
|
209
|
+
status: 200,
|
|
210
|
+
json: async () => ({ data: { result: [] } }),
|
|
211
|
+
}));
|
|
212
|
+
try {
|
|
213
|
+
const result = await connector.queryMetrics({
|
|
214
|
+
service: "",
|
|
215
|
+
metric: "",
|
|
216
|
+
duration: "1h",
|
|
217
|
+
rawQuery: "rate(process_cpu_seconds_total{job=\"logs-only-svc\"}[1m]) * 100",
|
|
218
|
+
});
|
|
219
|
+
assert.deepEqual(result.values, [], "no data points");
|
|
220
|
+
assert.equal(result.summary, null, "summary must be null, not {current:0,...}");
|
|
221
|
+
assert.match(result.note ?? "", /No data/i, "must carry a no-data note");
|
|
222
|
+
}
|
|
223
|
+
finally {
|
|
224
|
+
globalThis.fetch = orig;
|
|
225
|
+
}
|
|
226
|
+
});
|
|
227
|
+
});
|
|
202
228
|
});
|
package/dist/index.js
CHANGED
|
@@ -342,6 +342,18 @@ async function main() {
|
|
|
342
342
|
const mcpServer = new McpServer({
|
|
343
343
|
name: "observability-mcp",
|
|
344
344
|
version: SERVER_VERSION,
|
|
345
|
+
}, {
|
|
346
|
+
// `instructions` is the one channel the MCP spec auto-injects into the
|
|
347
|
+
// agent's context on connect (issue #455). Keep it tight: point at the
|
|
348
|
+
// full guide resource + the single rule that prevents the most common
|
|
349
|
+
// mistake (dumping raw rows), + the empty-state contract.
|
|
350
|
+
instructions: "Read MCP resource `omcp://guide/agent-usage` before heavy use. " +
|
|
351
|
+
"Golden rule: filter + aggregate server-side — use `query_logs`/`query_metrics` " +
|
|
352
|
+
"`labels` and `query_logs` `aggregate` to ask for numbers, not haystacks (raw log " +
|
|
353
|
+
"dumps blow past context limits). All tools are read-only. When a result is empty " +
|
|
354
|
+
"or refused, the message names the operator flag that unlocks it (e.g. OMCP_RAW_QUERY) " +
|
|
355
|
+
"— relay it verbatim. Prompts `triage-incident` and `write-postmortem` compose the tools. " +
|
|
356
|
+
"Hit a rough edge? Report it via the agent-report issue template (see the guide).",
|
|
345
357
|
});
|
|
346
358
|
const toolHandlers = new Map();
|
|
347
359
|
// --- Register tools with Zod schemas ---
|
|
@@ -431,9 +443,12 @@ async function main() {
|
|
|
431
443
|
"redacted values → a `_redacted` count in the result. Relay flag names to your",
|
|
432
444
|
"operator verbatim — the messages are written to be forwarded.",
|
|
433
445
|
"",
|
|
434
|
-
"## Report findings",
|
|
435
|
-
"Structured agent reports drive releases here (see issue #415). File
|
|
446
|
+
"## Report findings & collaborate",
|
|
447
|
+
"Structured agent reports drive releases here (see issue #415). File a bug or gap:",
|
|
436
448
|
"https://github.com/ThoTischner/observability-mcp/issues/new?template=agent-report.yml",
|
|
449
|
+
"For workflows, ideas, or to compare notes with other agents (collaboration welcome):",
|
|
450
|
+
"https://github.com/ThoTischner/observability-mcp/discussions",
|
|
451
|
+
"Note: posting publicly is an outward-facing action — do it on your operator's behalf, not unprompted.",
|
|
437
452
|
"Full guide: https://thotischner.github.io/observability-mcp/for-agents/",
|
|
438
453
|
].join("\n"),
|
|
439
454
|
},
|
|
@@ -507,7 +522,7 @@ async function main() {
|
|
|
507
522
|
"Fetch the raw time-series for ONE metric of ONE service over a look-back window, returned together with pre-computed summary statistics.",
|
|
508
523
|
"When to use: when you need the actual numeric values or the trend of a known metric. For a 'is this service OK?' verdict use `get_service_health`; to find which services are misbehaving use `detect_anomalies`.",
|
|
509
524
|
"Prerequisites: get the exact service name from `list_services` and choose a metric from the list at the end of this description.",
|
|
510
|
-
"Behavior: read-only, no side effects. Returns an ordered array of {timestamp, value} points plus a summary {current, average, min, max, trend}. With `groupBy` set, returns one labelled series per distinct label value under `groups` instead of a single aggregated series. Units depend on the metric (e.g. CPU as %, latency as ms, rates as per-second). An unknown service/metric or an unreachable backend yields a structured explanatory error, never an exception.",
|
|
525
|
+
"Behavior: read-only, no side effects. Returns an ordered array of {timestamp, value} points plus a summary {current, average, min, max, trend}. When no series matched (e.g. a logs-only service has no such metric), `values` is empty and `summary` is `null` (not all-zeros) with a `note` — absent data is not a real zero reading. With `groupBy` set, returns one labelled series per distinct label value under `groups` instead of a single aggregated series. Units depend on the metric (e.g. CPU as %, latency as ms, rates as per-second). An unknown service/metric or an unreachable backend yields a structured explanatory error, never an exception.",
|
|
511
526
|
`Available metrics: ${metricsList}`,
|
|
512
527
|
].join(" "), {
|
|
513
528
|
service: z
|
|
@@ -546,6 +561,7 @@ async function main() {
|
|
|
546
561
|
registerTool("query_logs", [
|
|
547
562
|
"Fetch recent log entries for ONE service over a look-back window, with a pre-computed summary (error/warning counts and the most frequent error patterns).",
|
|
548
563
|
"When to use: to inspect what a service actually logged, or to investigate an error spike surfaced by `detect_anomalies` / `get_service_health`. For numeric metrics use `query_metrics` instead.",
|
|
564
|
+
"Golden rule: filter + aggregate server-side — pass `labels` to scope and `aggregate` (count_over_time/sum/topk) to get numbers, not raw rows. A high-volume window returned raw will blow past your context limit.",
|
|
549
565
|
"Prerequisites: get the exact service name from `list_services` (the service must expose a logs signal).",
|
|
550
566
|
"Behavior: read-only, no side effects. Returns the matching log entries (newest first, capped by `limit`) plus a summary with total/error/warn counts and top recurring error patterns. No matches yields an empty result with a zeroed summary; an unreachable backend yields a structured explanatory error, never an exception.",
|
|
551
567
|
].join(" "), {
|
|
@@ -40,18 +40,42 @@ export async function detectAnomaliesHandler(registry, args, ctx = defaultContex
|
|
|
40
40
|
const tenantConnectors = registry.getByTenant(ctx.tenant);
|
|
41
41
|
const metricsConnectors = tenantConnectors.filter((c) => c.signalType === "metrics");
|
|
42
42
|
const logConnectors = tenantConnectors.filter((c) => c.signalType === "logs");
|
|
43
|
-
|
|
43
|
+
// Discover services from BOTH metrics and log connectors, tracking which
|
|
44
|
+
// signal each service exposes. Previously the fleet scan only enumerated
|
|
45
|
+
// metrics connectors, so a log-only service was silently dropped from the
|
|
46
|
+
// scan — and the "all healthy" all-clear never said so (issue #453B). Now
|
|
47
|
+
// log-only services are scanned (via their log error-rate, as the
|
|
48
|
+
// description promises) and the per-service coverage is reported.
|
|
49
|
+
const coverage = new Map();
|
|
50
|
+
const mark = (name, key) => {
|
|
51
|
+
const e = coverage.get(name) ?? { metrics: false, logs: false };
|
|
52
|
+
e[key] = true;
|
|
53
|
+
coverage.set(name, e);
|
|
54
|
+
};
|
|
55
|
+
for (const connector of metricsConnectors) {
|
|
56
|
+
try {
|
|
57
|
+
for (const s of await connector.listServices())
|
|
58
|
+
mark(s.name, "metrics");
|
|
59
|
+
}
|
|
60
|
+
catch { /* connector down — skip */ }
|
|
61
|
+
}
|
|
62
|
+
for (const connector of logConnectors) {
|
|
63
|
+
try {
|
|
64
|
+
for (const s of await connector.listServices())
|
|
65
|
+
mark(s.name, "logs");
|
|
66
|
+
}
|
|
67
|
+
catch { /* connector down — skip */ }
|
|
68
|
+
}
|
|
69
|
+
let serviceNames;
|
|
44
70
|
if (args.service) {
|
|
45
71
|
serviceNames = [args.service];
|
|
72
|
+
if (!coverage.has(args.service)) {
|
|
73
|
+
// Unknown to listServices — still attempt both signal paths.
|
|
74
|
+
coverage.set(args.service, { metrics: metricsConnectors.length > 0, logs: logConnectors.length > 0 });
|
|
75
|
+
}
|
|
46
76
|
}
|
|
47
77
|
else {
|
|
48
|
-
|
|
49
|
-
const services = await connector.listServices();
|
|
50
|
-
for (const s of services) {
|
|
51
|
-
if (!serviceNames.includes(s.name))
|
|
52
|
-
serviceNames.push(s.name);
|
|
53
|
-
}
|
|
54
|
-
}
|
|
78
|
+
serviceNames = [...coverage.keys()];
|
|
55
79
|
}
|
|
56
80
|
const allAnomalies = [];
|
|
57
81
|
const allCorrelations = [];
|
|
@@ -179,14 +203,25 @@ export async function detectAnomaliesHandler(registry, args, ctx = defaultContex
|
|
|
179
203
|
severity: a.severity,
|
|
180
204
|
})))
|
|
181
205
|
: { ranked: [], summary: "" };
|
|
206
|
+
// Per-service coverage so an "all healthy" all-clear is verifiable rather
|
|
207
|
+
// than silently partial: the caller sees exactly which services were
|
|
208
|
+
// scanned and on which signals (issue #453B).
|
|
209
|
+
const scanned = serviceNames.map((name) => {
|
|
210
|
+
const cov = coverage.get(name) ?? { metrics: false, logs: false };
|
|
211
|
+
const signals = [cov.metrics ? "metrics" : null, cov.logs ? "logs" : null].filter(Boolean);
|
|
212
|
+
return { service: name, signals };
|
|
213
|
+
});
|
|
214
|
+
const metricsCount = scanned.filter((s) => s.signals.includes("metrics")).length;
|
|
215
|
+
const logsCount = scanned.filter((s) => s.signals.includes("logs")).length;
|
|
182
216
|
const result = {
|
|
183
217
|
scannedServices: serviceNames.length,
|
|
218
|
+
coverage: { scanned },
|
|
184
219
|
anomalies: allAnomalies,
|
|
185
220
|
correlations: allCorrelations,
|
|
186
221
|
rootCause,
|
|
187
222
|
summary: allAnomalies.length === 0
|
|
188
|
-
?
|
|
189
|
-
: `${allAnomalies.length} anomal${allAnomalies.length === 1 ? "y" : "ies"} detected across ${[...new Set(allAnomalies.map((a) => a.service))].length} service(s).`,
|
|
223
|
+
? `No anomalies across ${serviceNames.length} scanned service(s) (${metricsCount} with metrics, ${logsCount} with logs).`
|
|
224
|
+
: `${allAnomalies.length} anomal${allAnomalies.length === 1 ? "y" : "ies"} detected across ${[...new Set(allAnomalies.map((a) => a.service))].length} of ${serviceNames.length} scanned service(s).`,
|
|
190
225
|
};
|
|
191
226
|
return {
|
|
192
227
|
content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
|
|
@@ -24,23 +24,38 @@ export async function getServiceHealthHandler(registry, args, ctx = defaultConte
|
|
|
24
24
|
const tenantConnectors = registry.getByTenant(ctx.tenant);
|
|
25
25
|
const metricsConnectors = tenantConnectors.filter((c) => c.signalType === "metrics");
|
|
26
26
|
const logConnectors = tenantConnectors.filter((c) => c.signalType === "logs");
|
|
27
|
-
// Gather metrics
|
|
27
|
+
// Gather metrics. Track whether any series actually returned data —
|
|
28
|
+
// absent metrics must NOT be coerced to 0 and read as a confident
|
|
29
|
+
// "healthy" (issue #453).
|
|
28
30
|
let cpu = 0, memory = 0, errorRate = 0, latencyP99 = 0;
|
|
31
|
+
let metricsHadData = false;
|
|
29
32
|
const anomalies = [];
|
|
30
33
|
for (const connector of metricsConnectors) {
|
|
31
34
|
if (!connector.queryMetrics)
|
|
32
35
|
continue;
|
|
33
36
|
try {
|
|
34
37
|
const cpuResult = await connector.queryMetrics({ service: args.service, metric: "cpu", duration: "5m" });
|
|
35
|
-
|
|
38
|
+
if (cpuResult.summary) {
|
|
39
|
+
cpu = cpuResult.summary.current;
|
|
40
|
+
metricsHadData = true;
|
|
41
|
+
}
|
|
36
42
|
checkAnomaly(cpuResult.values.map(v => v.value), "cpu", args.service, connector.name, anomalies);
|
|
37
43
|
const memResult = await connector.queryMetrics({ service: args.service, metric: "memory", duration: "5m" });
|
|
38
|
-
|
|
44
|
+
if (memResult.summary) {
|
|
45
|
+
memory = memResult.summary.current / 1_000_000;
|
|
46
|
+
metricsHadData = true;
|
|
47
|
+
} // MB for display
|
|
39
48
|
const errResult = await connector.queryMetrics({ service: args.service, metric: "error_rate", duration: "5m" });
|
|
40
|
-
|
|
49
|
+
if (errResult.summary) {
|
|
50
|
+
errorRate = errResult.summary.current;
|
|
51
|
+
metricsHadData = true;
|
|
52
|
+
}
|
|
41
53
|
checkAnomaly(errResult.values.map(v => v.value), "error_rate", args.service, connector.name, anomalies);
|
|
42
54
|
const latResult = await connector.queryMetrics({ service: args.service, metric: "latency_p99", duration: "5m" });
|
|
43
|
-
|
|
55
|
+
if (latResult.summary) {
|
|
56
|
+
latencyP99 = latResult.summary.current;
|
|
57
|
+
metricsHadData = true;
|
|
58
|
+
}
|
|
44
59
|
checkAnomaly(latResult.values.map(v => v.value), "latency_p99", args.service, connector.name, anomalies);
|
|
45
60
|
}
|
|
46
61
|
catch (err) {
|
|
@@ -50,12 +65,15 @@ export async function getServiceHealthHandler(registry, args, ctx = defaultConte
|
|
|
50
65
|
// Gather logs
|
|
51
66
|
let logErrorRate = 0;
|
|
52
67
|
let topErrors = [];
|
|
68
|
+
let logsHadData = false;
|
|
53
69
|
const correlations = [];
|
|
54
70
|
for (const connector of logConnectors) {
|
|
55
71
|
if (!connector.queryLogs)
|
|
56
72
|
continue;
|
|
57
73
|
try {
|
|
58
74
|
const logs = await connector.queryLogs({ service: args.service, duration: "5m", limit: 200 });
|
|
75
|
+
if (logs.summary.total > 0)
|
|
76
|
+
logsHadData = true; // real log coverage in the window
|
|
59
77
|
logErrorRate = logs.summary.errorCount; // errors in 5m window
|
|
60
78
|
topErrors = logs.summary.topPatterns;
|
|
61
79
|
// Cross-signal correlation
|
|
@@ -70,7 +88,31 @@ export async function getServiceHealthHandler(registry, args, ctx = defaultConte
|
|
|
70
88
|
console.error("Health check logs failed for %s:", sanitizeForLog(args.service), err);
|
|
71
89
|
}
|
|
72
90
|
}
|
|
73
|
-
//
|
|
91
|
+
// Honest signal coverage: judge the service only on the families that
|
|
92
|
+
// actually returned data, so a log-only (or absent) service is never
|
|
93
|
+
// coerced to a confident "healthy" from metric zeros (issue #453).
|
|
94
|
+
const coverage = { metrics: metricsHadData, logs: logsHadData };
|
|
95
|
+
// No data at all → either the service doesn't exist (typo / decommissioned)
|
|
96
|
+
// or it isn't monitored. Say so explicitly, like the other tools' empty
|
|
97
|
+
// states — don't return 100/healthy.
|
|
98
|
+
if (!metricsHadData && !logsHadData) {
|
|
99
|
+
const known = await knownServiceNames(tenantConnectors, args.service);
|
|
100
|
+
const note = known
|
|
101
|
+
? `No metric or log data for "${args.service}" in the last 5 minutes — the service exists but has no monitored signals (or was quiet). Health is unknown, not healthy.`
|
|
102
|
+
: `Service "${args.service}" was not found in any connected source. Check the exact name via list_services. (Not reporting a health score for a service that does not exist.)`;
|
|
103
|
+
const result = {
|
|
104
|
+
service: args.service,
|
|
105
|
+
status: "unknown",
|
|
106
|
+
score: null,
|
|
107
|
+
signals: { metrics: null, logs: null },
|
|
108
|
+
anomalies,
|
|
109
|
+
correlations,
|
|
110
|
+
coverage,
|
|
111
|
+
note,
|
|
112
|
+
};
|
|
113
|
+
return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
|
|
114
|
+
}
|
|
115
|
+
// Calculate health score over the covered signals only.
|
|
74
116
|
const { DEFAULT_HEALTH_THRESHOLDS } = await import("../config/loader.js");
|
|
75
117
|
const health = calculateHealthScore({
|
|
76
118
|
cpu,
|
|
@@ -78,22 +120,47 @@ export async function getServiceHealthHandler(registry, args, ctx = defaultConte
|
|
|
78
120
|
errorRate,
|
|
79
121
|
latencyP99,
|
|
80
122
|
logErrorRate,
|
|
81
|
-
}, _thresholds || DEFAULT_HEALTH_THRESHOLDS);
|
|
123
|
+
}, _thresholds || DEFAULT_HEALTH_THRESHOLDS, coverage);
|
|
82
124
|
const result = {
|
|
83
125
|
service: args.service,
|
|
84
126
|
status: health.status,
|
|
85
127
|
score: health.score,
|
|
86
128
|
signals: {
|
|
87
|
-
metrics: { cpu, memory, errorRate, latencyP99 },
|
|
88
|
-
logs: { errorRate: logErrorRate, topErrors },
|
|
129
|
+
metrics: metricsHadData ? { cpu, memory, errorRate, latencyP99 } : null,
|
|
130
|
+
logs: logsHadData ? { errorRate: logErrorRate, topErrors } : null,
|
|
89
131
|
},
|
|
90
132
|
anomalies,
|
|
91
133
|
correlations,
|
|
134
|
+
coverage,
|
|
135
|
+
note: !metricsHadData
|
|
136
|
+
? "No metrics signal for this service — score reflects logs only."
|
|
137
|
+
: !logsHadData
|
|
138
|
+
? "No logs signal for this service — score reflects metrics only."
|
|
139
|
+
: undefined,
|
|
92
140
|
};
|
|
93
141
|
return {
|
|
94
142
|
content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
|
|
95
143
|
};
|
|
96
144
|
}
|
|
145
|
+
/**
 * Best-effort existence check: does any connector in the tenant list a
 * service whose name is exactly `service`?
 *
 * Used on the no-data path to distinguish "exists but unmonitored/quiet"
 * from "doesn't exist (typo/decommissioned)".
 *
 * All connectors are queried concurrently, so the wait is bounded by the
 * slowest connector rather than the sum of all of them. Unlike a sequential
 * scan, this does not stop at the first match: every connector that
 * implements `listServices` is asked once.
 *
 * @param connectors Connectors to consult; those without `listServices` are ignored.
 * @param service Exact service name to look for (compared with `===`).
 * @returns `true` if at least one connector lists the service, otherwise
 *          `false`. Never rejects: a connector whose lookup throws (or
 *          returns something that cannot be searched) counts as
 *          "can't confirm" and contributes `false`.
 */
async function knownServiceNames(connectors, service) {
    const lookups = connectors
        .filter((c) => c.listServices)
        .map(async (c) => {
            try {
                // Called inside the async wrapper so that a synchronous throw
                // is caught here as well, not only a rejected promise.
                const svcs = await c.listServices();
                return svcs.some((s) => s.name === service);
            }
            catch {
                /* can't confirm via this connector — the others still count */
                return false;
            }
        });
    // Each lookup swallows its own failure, so Promise.all cannot reject.
    const outcomes = await Promise.all(lookups);
    return outcomes.includes(true);
}
|
|
97
164
|
function checkAnomaly(values, metric, service, source, anomalies) {
|
|
98
165
|
// Robust, metric-type-aware detector (same path as detect_anomalies):
|
|
99
166
|
// latency/error_rate/saturation are one-sided, so a *decrease* (e.g.
|
|
@@ -153,6 +153,38 @@ describe("listServicesHandler", () => {
|
|
|
153
153
|
assert.equal(data.total, 0);
|
|
154
154
|
});
|
|
155
155
|
});
|
|
156
|
+
describe("detectAnomaliesHandler — fleet coverage (issue #453B)", () => {
    // Thirty flat samples (value 20), 9 s apart, ending just before "now".
    const flatSamples = () => {
        const samples = [];
        for (let i = 0; i < 30; i++) {
            samples.push({ timestamp: new Date(Date.now() - (30 - i) * 9000).toISOString(), value: 20 });
        }
        return samples;
    };
    it("fleet scan includes log-only services and reports per-service coverage", async () => {
        // A metrics source that only knows "metric-svc" ...
        const metricsSource = createMockConnector({
            name: "prom1", type: "prometheus", signalType: "metrics",
            listServices: async () => [{ name: "metric-svc", source: "prom1", signalType: "metrics" }],
            queryMetrics: async () => ({
                source: "prom1", service: "metric-svc", metric: "x", unit: "",
                values: flatSamples(),
                summary: { current: 20, average: 20, min: 20, max: 20, trend: "stable" },
            }),
        });
        // ... and a logs source that only knows "log-only-svc".
        const logsSource = createMockConnector({
            name: "loki1", type: "loki", signalType: "logs",
            listServices: async () => [{ name: "log-only-svc", source: "loki1", signalType: "logs" }],
            queryLogs: async () => ({
                source: "loki1", service: "log-only-svc", entries: [],
                summary: { total: 10, errorCount: 0, warnCount: 0, topPatterns: [] },
            }),
        });
        const reg = createRegistryWithMocks([metricsSource, logsSource]);
        const response = await detectAnomaliesHandler(reg, {});
        const data = JSON.parse(response.content[0].text);
        // Both services scanned — the log-only one is NOT silently dropped.
        assert.equal(data.scannedServices, 2);
        const scanned = data.coverage.scanned;
        const names = scanned.map((entry) => entry.service).sort();
        assert.deepEqual(names, ["log-only-svc", "metric-svc"]);
        const logOnly = scanned.find((entry) => entry.service === "log-only-svc");
        assert.deepEqual(logOnly.signals, ["logs"], "log-only service must be scanned via its logs signal");
        // All-clear is no longer silently partial — it states the coverage.
        assert.match(data.summary, /2 scanned service\(s\)/);
    });
});
|
|
156
188
|
describe("detectAnomaliesHandler — A5 memory/OOM coverage", () => {
|
|
157
189
|
const flatMemory = () => ({
|
|
158
190
|
source: "prom1", service: "payment-service", metric: "memory", unit: "bytes",
|
|
@@ -255,3 +287,59 @@ describe("getServiceHealthHandler — one-sided latency (regression)", () => {
|
|
|
255
287
|
assert.equal(latAnom, undefined, `latency dropping must not be an anomaly, got: ${JSON.stringify(latAnom)}`);
|
|
256
288
|
});
|
|
257
289
|
});
|
|
290
|
+
describe("getServiceHealthHandler — honest no-data / not-found (issue #453)", () => {
    // Connector members every mock here stubs out the same way.
    const lifecycleStubs = () => ({
        connect: async () => { }, disconnect: async () => { },
        healthCheck: async () => ({ status: "up", latencyMs: 1 }),
        getDefaultMetrics: () => [], getMetrics: () => [],
    });
    // A metric result with no data points: `summary` is null (not a zeroed
    // summary), which the health handler reads as "no coverage".
    function emptySeries() {
        return {
            source: "prom1", service: "x", metric: "x", unit: "",
            values: [],
            summary: null,
        };
    }
    // Metrics connector that lists the `known` service names but returns an
    // empty series for every metric query.
    const metricsConnector = (known) => ({
        ...lifecycleStubs(),
        listServices: async () => known.map((name) => ({ name, source: "prom1", signalType: "metrics" })),
        name: "prom1", type: "prometheus", signalType: "metrics",
        queryMetrics: async () => emptySeries(),
    });
    // Registry pre-loaded with the given mocks via its internal maps.
    const regWith = (...mocks) => {
        const reg = new ConnectorRegistry();
        mocks.forEach((m) => {
            reg.connectors.set(m.name, m);
            reg.sourceConfigs.set(m.name, { name: m.name, type: m.type, url: "http://m", enabled: true });
        });
        return reg;
    };
    // Run the handler and decode its JSON text payload.
    const healthOf = async (reg, service) => {
        const response = await getServiceHealthHandler(reg, { service });
        return JSON.parse(response.content[0].text);
    };
    it("nonexistent service → status unknown, score null, not-found note (not 100/healthy)", async () => {
        // The only connector knows "payment-service", not the queried name.
        const reg = regWith(metricsConnector(["payment-service"]));
        const data = await healthOf(reg, "nope-xyz");
        assert.equal(data.status, "unknown");
        assert.equal(data.score, null);
        assert.equal(data.signals.metrics, null);
        assert.match(data.note, /not found/i);
    });
    it("log-only service with errors → judged on logs, never 100/healthy from metric zeros", async () => {
        const logs = {
            ...lifecycleStubs(),
            listServices: async () => [{ name: "logapp", source: "loki1", signalType: "logs" }],
            name: "loki1", type: "loki", signalType: "logs",
            queryLogs: async () => ({
                source: "loki1", service: "logapp", entries: [],
                summary: { total: 60, errorCount: 40, warnCount: 0, topPatterns: ["boom"] },
            }),
        };
        const reg = regWith(metricsConnector([]), logs);
        const data = await healthOf(reg, "logapp");
        assert.notEqual(data.status, "healthy");
        assert.notEqual(data.status, "unknown");
        assert.equal(data.signals.metrics, null, "metrics signal must be null when no metric data");
        assert.ok(data.signals.logs, "logs signal must be present");
        assert.deepEqual(data.coverage, { metrics: false, logs: true });
        assert.ok(data.score !== null && data.score < 50, `40 errors/5min log-only must not be healthy, got ${data.score}`);
    });
});
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
import { describe, it } from "node:test";
|
|
2
|
+
import assert from "node:assert/strict";
|
|
3
|
+
import { ConnectorRegistry } from "../connectors/registry.js";
|
|
4
|
+
import { queryLogsHandler } from "./query-logs.js";
|
|
5
|
+
// Build a ConnectorRegistry that already contains `mock`, registered under
// its own name in both of the registry's internal maps (the connector itself
// and an enabled source config pointing at a placeholder URL).
function regWith(mock) {
    const { name, type } = mock;
    const registry = new ConnectorRegistry();
    registry.connectors.set(name, mock);
    registry.sourceConfigs.set(name, { name, type, url: "http://mock", enabled: true });
    return registry;
}
|
|
12
|
+
describe("queryLogsHandler error response shape (issue #452)", () => {
    it("a failing query reports `window` (the look-back), not `duration` (read as wall-clock)", async () => {
        // The connector throws (as in the raw_query fail-fast case) and the
        // handler answers with a structured error. That error must label the
        // requested look-back as `window`: a `duration:"5m"` on a sub-second
        // failure reads as if the call had hung for five minutes.
        const failingLoki = {
            connect: async () => { }, disconnect: async () => { },
            healthCheck: async () => ({ status: "up", latencyMs: 1 }),
            getDefaultMetrics: () => [], getMetrics: () => [],
            listServices: async () => [],
            name: "loki1", type: "loki", signalType: "logs",
            queryLogs: async () => {
                throw new Error("query_logs raw_query returned a 'matrix' result, but query_logs handles log lines (streams) only.");
            },
        };
        const request = {
            raw_query: 'sum(count_over_time({service_name="x"} | json [1h]))',
            duration: "1h",
        };
        const options = { allowRawQuery: true };
        const result = await queryLogsHandler(regWith(failingLoki), request, undefined, options);
        const data = JSON.parse(result.content[0].text);
        assert.ok(data.error, "must be an error response");
        assert.equal(data.window, "1h", "look-back must be reported as `window`");
        assert.equal("duration" in data, false, "must NOT carry a `duration` field (misread as elapsed time)");
    });
});
|
package/dist/tools/query-logs.js
CHANGED
|
@@ -119,7 +119,8 @@ export async function queryLogsHandler(registry, args, ctx = defaultContext(), o
|
|
|
119
119
|
}
|
|
120
120
|
if (aggResults.length === 0) {
|
|
121
121
|
return {
|
|
122
|
-
|
|
122
|
+
// `window` = the requested look-back, not elapsed time (issue #452).
|
|
123
|
+
content: [{ type: "text", text: JSON.stringify({ error: aggErrors.length ? `Aggregate failed: ${aggErrors.join("; ")}` : "No data returned", service: args.service, window: duration }) }],
|
|
123
124
|
isError: aggErrors.length > 0,
|
|
124
125
|
};
|
|
125
126
|
}
|
|
@@ -160,7 +161,9 @@ export async function queryLogsHandler(registry, args, ctx = defaultContext(), o
|
|
|
160
161
|
text: JSON.stringify({
|
|
161
162
|
error: errors.length > 0 ? `Query failed: ${errors.join("; ")}` : "No logs returned",
|
|
162
163
|
service: args.service,
|
|
163
|
-
|
|
164
|
+
// The requested look-back window, NOT elapsed wall-clock time. Named
|
|
165
|
+
// `window` so a fast failure isn't misread as a 5-minute hang (#452).
|
|
166
|
+
window: duration,
|
|
164
167
|
}),
|
|
165
168
|
},
|
|
166
169
|
],
|
package/dist/types.d.ts
CHANGED
|
@@ -182,7 +182,8 @@ export interface MetricSummary {
|
|
|
182
182
|
export interface MetricGroup {
|
|
183
183
|
key: string;
|
|
184
184
|
values: DataPoint[];
|
|
185
|
-
|
|
185
|
+
/** null when this group has no data points — absent ≠ a real zero reading. */
|
|
186
|
+
summary: MetricSummary | null;
|
|
186
187
|
}
|
|
187
188
|
export interface MetricResult {
|
|
188
189
|
source: string;
|
|
@@ -190,12 +191,15 @@ export interface MetricResult {
|
|
|
190
191
|
metric: string;
|
|
191
192
|
unit: string;
|
|
192
193
|
values: DataPoint[];
|
|
193
|
-
|
|
194
|
+
/** null when `values` is empty (no series matched this service/metric) — a
|
|
195
|
+
* no-data signal, not a confident all-zeros reading (issue #462). */
|
|
196
|
+
summary: MetricSummary | null;
|
|
194
197
|
resolvedSeries?: string;
|
|
195
198
|
resolvedLabel?: string;
|
|
196
199
|
groupBy?: string;
|
|
197
200
|
groups?: MetricGroup[];
|
|
198
201
|
hint?: string;
|
|
202
|
+
note?: string;
|
|
199
203
|
}
|
|
200
204
|
export interface LogEntry {
|
|
201
205
|
timestamp: string;
|
|
@@ -338,20 +342,31 @@ export interface AnomalyReport {
|
|
|
338
342
|
}
|
|
339
343
|
export interface ServiceHealth {
|
|
340
344
|
service: string;
|
|
341
|
-
|
|
342
|
-
|
|
345
|
+
/** "unknown" when the service has no data in any signal (or doesn't exist). */
|
|
346
|
+
status: HealthStatus | "unknown";
|
|
347
|
+
/** 0-100, or null when status is "unknown" (no signal had data). */
|
|
348
|
+
score: number | null;
|
|
343
349
|
signals: {
|
|
350
|
+
/** null when the service exposes no metrics signal / no metric data. */
|
|
344
351
|
metrics: {
|
|
345
352
|
cpu: number;
|
|
346
353
|
memory: number;
|
|
347
354
|
errorRate: number;
|
|
348
355
|
latencyP99: number;
|
|
349
|
-
};
|
|
356
|
+
} | null;
|
|
357
|
+
/** null when the service exposes no logs signal / no log data. */
|
|
350
358
|
logs: {
|
|
351
359
|
errorRate: number;
|
|
352
360
|
topErrors: string[];
|
|
353
|
-
};
|
|
361
|
+
} | null;
|
|
354
362
|
};
|
|
355
363
|
anomalies: AnomalyReport[];
|
|
356
364
|
correlations: string[];
|
|
365
|
+
/** Which signal families actually had data (drives the score weighting). */
|
|
366
|
+
coverage?: {
|
|
367
|
+
metrics: boolean;
|
|
368
|
+
logs: boolean;
|
|
369
|
+
};
|
|
370
|
+
/** Operator-facing explanation when status is "unknown" or coverage is partial. */
|
|
371
|
+
note?: string;
|
|
357
372
|
}
|
package/package.json
CHANGED