@thotischner/observability-mcp 3.3.1 → 3.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/analysis/correlator.js +1 -1
- package/dist/connectors/loki.test.js +17 -0
- package/dist/connectors/prometheus.js +11 -1
- package/dist/connectors/prometheus.test.js +32 -6
- package/dist/index.js +1 -1
- package/dist/tools/get-service-health.js +4 -4
- package/dist/tools/handlers.test.js +3 -1
- package/dist/tools/query-logs-error-shape.test.d.ts +1 -0
- package/dist/tools/query-logs-error-shape.test.js +32 -0
- package/dist/tools/query-logs.js +5 -2
- package/dist/types.d.ts +6 -2
- package/package.json +1 -1
|
@@ -21,7 +21,7 @@ export function correlateSignals(anomalies, logResults, metricResults) {
|
|
|
21
21
|
for (const metric of serviceMetrics) {
|
|
22
22
|
if (metric.metric === anomaly.metric)
|
|
23
23
|
continue;
|
|
24
|
-
if (metric.summary.trend === "rising") {
|
|
24
|
+
if (metric.summary && metric.summary.trend === "rising") {
|
|
25
25
|
correlations.push(`${anomaly.service}: ${anomaly.metric} anomaly coincides with rising ${metric.metric} ` +
|
|
26
26
|
`(current: ${metric.summary.current.toFixed(2)})`);
|
|
27
27
|
}
|
|
@@ -146,6 +146,23 @@ describe("Q-LOG2: buildAggregateLogQL", () => {
|
|
|
146
146
|
assert.equal(r.step, "900s");
|
|
147
147
|
assert.equal(r.logql, `sum by (url) (count_over_time(${PIPE} [900s]))`);
|
|
148
148
|
});
|
|
149
|
+
it("count_over_time with a label-filter pipeline + no by → valid sum-wrapped LogQL (#452 leftover #2)", () => {
|
|
150
|
+
// The reporter saw an intermittent 400 on a label-filtered count_over_time
|
|
151
|
+
// and wondered if the collapse path emits different LogQL when a filter is
|
|
152
|
+
// present. It does not: the label filter lives in the streamPipeline
|
|
153
|
+
// (identical to the sum/topk path, which works), and the count_over_time
|
|
154
|
+
// branch wraps it verbatim. Assert the emitted LogQL is well-formed —
|
|
155
|
+
// `sum (count_over_time({sel} | json | environment="prod" [step]))` — so
|
|
156
|
+
// any future regression in the generated query is caught here.
|
|
157
|
+
const filtered = '{service_name="app"} | json | environment="prod"';
|
|
158
|
+
const r = buildAggregateLogQL(filtered, { op: "count_over_time", step: "1h" }, "6h");
|
|
159
|
+
assert.equal(r.mode, "range");
|
|
160
|
+
assert.equal(r.step, "3600s");
|
|
161
|
+
assert.equal(r.logql, `sum (count_over_time(${filtered} [3600s]))`);
|
|
162
|
+
// Structural sanity: balanced parens, sum-wrapped, single range selector.
|
|
163
|
+
assert.equal((r.logql.match(/\(/g) || []).length, (r.logql.match(/\)/g) || []).length);
|
|
164
|
+
assert.match(r.logql, /^sum \(count_over_time\(.*\[\d+s\]\)\)$/);
|
|
165
|
+
});
|
|
149
166
|
it("count_over_time without by → sum-wrapped (single series), default step (#452)", () => {
|
|
150
167
|
// Regression for issue #452: a bare count_over_time over a `| json` stream
|
|
151
168
|
// keeps every extracted label as its own series. With no `by` we must
|
|
@@ -240,6 +240,10 @@ export class PrometheusConnector {
|
|
|
240
240
|
resolvedSeries: promql,
|
|
241
241
|
resolvedLabel: label,
|
|
242
242
|
};
|
|
243
|
+
if (!result.summary) {
|
|
244
|
+
result.note = `No data: no '${params.metric}' series matched "${params.service}" in this window. ` +
|
|
245
|
+
"The service may expose logs only, or the metric name/label didn't match. Absent ≠ zero — summary is null rather than all-zeros.";
|
|
246
|
+
}
|
|
243
247
|
if (params.groupBy && groups.length > 1) {
|
|
244
248
|
result.groupBy = params.groupBy;
|
|
245
249
|
result.groups = groups;
|
|
@@ -295,6 +299,9 @@ export class PrometheusConnector {
|
|
|
295
299
|
resolvedSeries: rawQuery,
|
|
296
300
|
resolvedLabel: "",
|
|
297
301
|
};
|
|
302
|
+
if (!result.summary) {
|
|
303
|
+
result.note = "No data: the query returned no series in this window. Absent ≠ zero — summary is null rather than all-zeros.";
|
|
304
|
+
}
|
|
298
305
|
if (groups.length > 1)
|
|
299
306
|
result.groups = groups;
|
|
300
307
|
return result;
|
|
@@ -475,7 +482,10 @@ export class PrometheusConnector {
|
|
|
475
482
|
}
|
|
476
483
|
computeSummary(values) {
|
|
477
484
|
if (values.length === 0) {
|
|
478
|
-
|
|
485
|
+
// No data points → no-data, NOT a confident all-zeros reading. Coercing
|
|
486
|
+
// an empty series to {current:0,trend:"stable"} is indistinguishable
|
|
487
|
+
// from a service genuinely idling at 0 (issue #462).
|
|
488
|
+
return null;
|
|
479
489
|
}
|
|
480
490
|
const current = values[values.length - 1];
|
|
481
491
|
const average = values.reduce((a, b) => a + b, 0) / values.length;
|
|
@@ -53,13 +53,9 @@ describe("PrometheusConnector", () => {
|
|
|
53
53
|
});
|
|
54
54
|
});
|
|
55
55
|
describe("computeSummary", () => {
|
|
56
|
-
it("returns
|
|
56
|
+
it("returns null for empty array — no-data, not a false all-zeros reading (#462)", () => {
|
|
57
57
|
const s = proto.computeSummary([]);
|
|
58
|
-
assert.equal(s.current, 0);
|
|
59
|
-
assert.equal(s.average, 0);
|
|
60
|
-
assert.equal(s.min, 0);
|
|
61
|
-
assert.equal(s.max, 0);
|
|
62
|
-
assert.equal(s.trend, "stable");
|
|
58
|
+
assert.equal(s, null);
|
|
63
59
|
});
|
|
64
60
|
it("computes correct summary for values", () => {
|
|
65
61
|
const s = proto.computeSummary([10, 20, 30, 40]);
|
|
@@ -199,4 +195,34 @@ describe("PrometheusConnector", () => {
|
|
|
199
195
|
}
|
|
200
196
|
});
|
|
201
197
|
});
|
|
198
|
+
describe("queryMetrics no-data → null summary, not zero-fill (#462)", () => {
|
|
199
|
+
const fakeSource = { name: "test", type: "prometheus", url: "http://localhost:9090", enabled: true };
|
|
200
|
+
it("an empty result set yields values:[], summary:null, and a no-data note", async () => {
|
|
201
|
+
const connector = new PrometheusConnector();
|
|
202
|
+
await connector.connect({ ...fakeSource });
|
|
203
|
+
const orig = globalThis.fetch;
|
|
204
|
+
// raw_query bypasses the candidate-probe / label-resolve path and runs
|
|
205
|
+
// query_range directly — here it returns an empty result set (the
|
|
206
|
+
// no-data case: a logs-only service has no such metric series).
|
|
207
|
+
globalThis.fetch = (async () => ({
|
|
208
|
+
ok: true,
|
|
209
|
+
status: 200,
|
|
210
|
+
json: async () => ({ data: { result: [] } }),
|
|
211
|
+
}));
|
|
212
|
+
try {
|
|
213
|
+
const result = await connector.queryMetrics({
|
|
214
|
+
service: "",
|
|
215
|
+
metric: "",
|
|
216
|
+
duration: "1h",
|
|
217
|
+
rawQuery: "rate(process_cpu_seconds_total{job=\"logs-only-svc\"}[1m]) * 100",
|
|
218
|
+
});
|
|
219
|
+
assert.deepEqual(result.values, [], "no data points");
|
|
220
|
+
assert.equal(result.summary, null, "summary must be null, not {current:0,...}");
|
|
221
|
+
assert.match(result.note ?? "", /No data/i, "must carry a no-data note");
|
|
222
|
+
}
|
|
223
|
+
finally {
|
|
224
|
+
globalThis.fetch = orig;
|
|
225
|
+
}
|
|
226
|
+
});
|
|
227
|
+
});
|
|
202
228
|
});
|
package/dist/index.js
CHANGED
|
@@ -522,7 +522,7 @@ async function main() {
|
|
|
522
522
|
"Fetch the raw time-series for ONE metric of ONE service over a look-back window, returned together with pre-computed summary statistics.",
|
|
523
523
|
"When to use: when you need the actual numeric values or the trend of a known metric. For a 'is this service OK?' verdict use `get_service_health`; to find which services are misbehaving use `detect_anomalies`.",
|
|
524
524
|
"Prerequisites: get the exact service name from `list_services` and choose a metric from the list at the end of this description.",
|
|
525
|
-
"Behavior: read-only, no side effects. Returns an ordered array of {timestamp, value} points plus a summary {current, average, min, max, trend}. With `groupBy` set, returns one labelled series per distinct label value under `groups` instead of a single aggregated series. Units depend on the metric (e.g. CPU as %, latency as ms, rates as per-second). An unknown service/metric or an unreachable backend yields a structured explanatory error, never an exception.",
|
|
525
|
+
"Behavior: read-only, no side effects. Returns an ordered array of {timestamp, value} points plus a summary {current, average, min, max, trend}. When no series matched (e.g. a logs-only service has no such metric), `values` is empty and `summary` is `null` (not all-zeros) with a `note` — absent data is not a real zero reading. With `groupBy` set, returns one labelled series per distinct label value under `groups` instead of a single aggregated series. Units depend on the metric (e.g. CPU as %, latency as ms, rates as per-second). An unknown service/metric or an unreachable backend yields a structured explanatory error, never an exception.",
|
|
526
526
|
`Available metrics: ${metricsList}`,
|
|
527
527
|
].join(" "), {
|
|
528
528
|
service: z
|
|
@@ -35,24 +35,24 @@ export async function getServiceHealthHandler(registry, args, ctx = defaultConte
|
|
|
35
35
|
continue;
|
|
36
36
|
try {
|
|
37
37
|
const cpuResult = await connector.queryMetrics({ service: args.service, metric: "cpu", duration: "5m" });
|
|
38
|
-
if (cpuResult.
|
|
38
|
+
if (cpuResult.summary) {
|
|
39
39
|
cpu = cpuResult.summary.current;
|
|
40
40
|
metricsHadData = true;
|
|
41
41
|
}
|
|
42
42
|
checkAnomaly(cpuResult.values.map(v => v.value), "cpu", args.service, connector.name, anomalies);
|
|
43
43
|
const memResult = await connector.queryMetrics({ service: args.service, metric: "memory", duration: "5m" });
|
|
44
|
-
if (memResult.
|
|
44
|
+
if (memResult.summary) {
|
|
45
45
|
memory = memResult.summary.current / 1_000_000;
|
|
46
46
|
metricsHadData = true;
|
|
47
47
|
} // MB for display
|
|
48
48
|
const errResult = await connector.queryMetrics({ service: args.service, metric: "error_rate", duration: "5m" });
|
|
49
|
-
if (errResult.
|
|
49
|
+
if (errResult.summary) {
|
|
50
50
|
errorRate = errResult.summary.current;
|
|
51
51
|
metricsHadData = true;
|
|
52
52
|
}
|
|
53
53
|
checkAnomaly(errResult.values.map(v => v.value), "error_rate", args.service, connector.name, anomalies);
|
|
54
54
|
const latResult = await connector.queryMetrics({ service: args.service, metric: "latency_p99", duration: "5m" });
|
|
55
|
-
if (latResult.
|
|
55
|
+
if (latResult.summary) {
|
|
56
56
|
latencyP99 = latResult.summary.current;
|
|
57
57
|
metricsHadData = true;
|
|
58
58
|
}
|
|
@@ -291,7 +291,9 @@ describe("getServiceHealthHandler — honest no-data / not-found (issue #453)",
|
|
|
291
291
|
const emptySeries = () => ({
|
|
292
292
|
source: "prom1", service: "x", metric: "x", unit: "",
|
|
293
293
|
values: [],
|
|
294
|
-
|
|
294
|
+
// No data → null summary (matches the real connector after #462), so the
|
|
295
|
+
// health handler treats it as no-coverage, not a real zero reading.
|
|
296
|
+
summary: null,
|
|
295
297
|
});
|
|
296
298
|
function metricsConnector(known) {
|
|
297
299
|
return {
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
import { describe, it } from "node:test";
|
|
2
|
+
import assert from "node:assert/strict";
|
|
3
|
+
import { ConnectorRegistry } from "../connectors/registry.js";
|
|
4
|
+
import { queryLogsHandler } from "./query-logs.js";
|
|
5
|
+
// Inject a mock connector into the registry's internal maps.
|
|
6
|
+
function regWith(mock) {
|
|
7
|
+
const reg = new ConnectorRegistry();
|
|
8
|
+
reg.connectors.set(mock.name, mock);
|
|
9
|
+
reg.sourceConfigs.set(mock.name, { name: mock.name, type: mock.type, url: "http://mock", enabled: true });
|
|
10
|
+
return reg;
|
|
11
|
+
}
|
|
12
|
+
describe("queryLogsHandler error response shape (issue #452)", () => {
|
|
13
|
+
it("a failing query reports `window` (the look-back), not `duration` (read as wall-clock)", async () => {
|
|
14
|
+
// Mirrors the raw_query fail-fast case: the connector throws, the handler
|
|
15
|
+
// returns a structured error. The look-back window must be labelled
|
|
16
|
+
// `window`, never `duration` — an agent reading duration:"5m" on a <1s
|
|
17
|
+
// failure thinks it hung (the very symptom the fail-fast fix removed).
|
|
18
|
+
const mock = {
|
|
19
|
+
connect: async () => { }, disconnect: async () => { },
|
|
20
|
+
healthCheck: async () => ({ status: "up", latencyMs: 1 }),
|
|
21
|
+
getDefaultMetrics: () => [], getMetrics: () => [],
|
|
22
|
+
listServices: async () => [],
|
|
23
|
+
name: "loki1", type: "loki", signalType: "logs",
|
|
24
|
+
queryLogs: async () => { throw new Error("query_logs raw_query returned a 'matrix' result, but query_logs handles log lines (streams) only."); },
|
|
25
|
+
};
|
|
26
|
+
const result = await queryLogsHandler(regWith(mock), { raw_query: "sum(count_over_time({service_name=\"x\"} | json [1h]))", duration: "1h" }, undefined, { allowRawQuery: true });
|
|
27
|
+
const data = JSON.parse(result.content[0].text);
|
|
28
|
+
assert.ok(data.error, "must be an error response");
|
|
29
|
+
assert.equal(data.window, "1h", "look-back must be reported as `window`");
|
|
30
|
+
assert.equal("duration" in data, false, "must NOT carry a `duration` field (misread as elapsed time)");
|
|
31
|
+
});
|
|
32
|
+
});
|
package/dist/tools/query-logs.js
CHANGED
|
@@ -119,7 +119,8 @@ export async function queryLogsHandler(registry, args, ctx = defaultContext(), o
|
|
|
119
119
|
}
|
|
120
120
|
if (aggResults.length === 0) {
|
|
121
121
|
return {
|
|
122
|
-
|
|
122
|
+
// `window` = the requested look-back, not elapsed time (issue #452).
|
|
123
|
+
content: [{ type: "text", text: JSON.stringify({ error: aggErrors.length ? `Aggregate failed: ${aggErrors.join("; ")}` : "No data returned", service: args.service, window: duration }) }],
|
|
123
124
|
isError: aggErrors.length > 0,
|
|
124
125
|
};
|
|
125
126
|
}
|
|
@@ -160,7 +161,9 @@ export async function queryLogsHandler(registry, args, ctx = defaultContext(), o
|
|
|
160
161
|
text: JSON.stringify({
|
|
161
162
|
error: errors.length > 0 ? `Query failed: ${errors.join("; ")}` : "No logs returned",
|
|
162
163
|
service: args.service,
|
|
163
|
-
|
|
164
|
+
// The requested look-back window, NOT elapsed wall-clock time. Named
|
|
165
|
+
// `window` so a fast failure isn't misread as a 5-minute hang (#452).
|
|
166
|
+
window: duration,
|
|
164
167
|
}),
|
|
165
168
|
},
|
|
166
169
|
],
|
package/dist/types.d.ts
CHANGED
|
@@ -182,7 +182,8 @@ export interface MetricSummary {
|
|
|
182
182
|
export interface MetricGroup {
|
|
183
183
|
key: string;
|
|
184
184
|
values: DataPoint[];
|
|
185
|
-
|
|
185
|
+
/** null when this group has no data points — absent ≠ a real zero reading. */
|
|
186
|
+
summary: MetricSummary | null;
|
|
186
187
|
}
|
|
187
188
|
export interface MetricResult {
|
|
188
189
|
source: string;
|
|
@@ -190,12 +191,15 @@ export interface MetricResult {
|
|
|
190
191
|
metric: string;
|
|
191
192
|
unit: string;
|
|
192
193
|
values: DataPoint[];
|
|
193
|
-
|
|
194
|
+
/** null when `values` is empty (no series matched this service/metric) — a
|
|
195
|
+
* no-data signal, not a confident all-zeros reading (issue #462). */
|
|
196
|
+
summary: MetricSummary | null;
|
|
194
197
|
resolvedSeries?: string;
|
|
195
198
|
resolvedLabel?: string;
|
|
196
199
|
groupBy?: string;
|
|
197
200
|
groups?: MetricGroup[];
|
|
198
201
|
hint?: string;
|
|
202
|
+
note?: string;
|
|
199
203
|
}
|
|
200
204
|
export interface LogEntry {
|
|
201
205
|
timestamp: string;
|
package/package.json
CHANGED