@thotischner/observability-mcp 1.2.1 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -10,7 +10,7 @@ export declare class PrometheusConnector implements ObservabilityConnector {
|
|
|
10
10
|
private metrics;
|
|
11
11
|
private serviceLabels;
|
|
12
12
|
private labelValuesCache;
|
|
13
|
-
private
|
|
13
|
+
private candidateCache;
|
|
14
14
|
private userOverrides;
|
|
15
15
|
connect(config: SourceConfig): Promise<void>;
|
|
16
16
|
getDefaultMetrics(): MetricDefinition[];
|
|
@@ -23,8 +23,10 @@ export declare class PrometheusConnector implements ObservabilityConnector {
|
|
|
23
23
|
listAvailableMetrics(_service: string): Promise<MetricInfo[]>;
|
|
24
24
|
queryMetrics(params: MetricQuery): Promise<MetricResult>;
|
|
25
25
|
private buildQuery;
|
|
26
|
+
private groupKey;
|
|
27
|
+
private getDistinctLabelValues;
|
|
26
28
|
private pickMetricCandidate;
|
|
27
|
-
private
|
|
29
|
+
private seriesExistsForService;
|
|
28
30
|
private resolveServiceLabel;
|
|
29
31
|
private getLabelValues;
|
|
30
32
|
private getUnit;
|
|
@@ -3,11 +3,13 @@ const PROMETHEUS_METRIC_CANDIDATES = {
|
|
|
3
3
|
cpu: [
|
|
4
4
|
{
|
|
5
5
|
seriesName: "process_cpu_seconds_total",
|
|
6
|
+
// rate() preserves all labels — already broken down per-instance.
|
|
6
7
|
query: 'rate(process_cpu_seconds_total{ {{selector}} }[1m]) * 100',
|
|
7
8
|
},
|
|
8
9
|
{
|
|
9
10
|
seriesName: "node_cpu_seconds_total",
|
|
10
11
|
query: '100 - avg(rate(node_cpu_seconds_total{ {{selector}}, mode="idle" }[1m])) * 100',
|
|
12
|
+
groupedQuery: '100 - avg by({{groupBy}}) (rate(node_cpu_seconds_total{ {{selector}}, mode="idle" }[1m])) * 100',
|
|
11
13
|
},
|
|
12
14
|
],
|
|
13
15
|
memory: [
|
|
@@ -24,33 +26,40 @@ const PROMETHEUS_METRIC_CANDIDATES = {
|
|
|
24
26
|
{
|
|
25
27
|
seriesName: "http_requests_total",
|
|
26
28
|
query: 'sum(rate(http_requests_total{ {{selector}} }[1m]))',
|
|
29
|
+
groupedQuery: 'sum by({{groupBy}}) (rate(http_requests_total{ {{selector}} }[1m]))',
|
|
27
30
|
},
|
|
28
31
|
],
|
|
29
32
|
error_rate: [
|
|
30
33
|
{
|
|
31
34
|
seriesName: "http_requests_total",
|
|
32
35
|
query: 'sum(rate(http_requests_total{ {{selector}}, status=~"5.." }[1m]))',
|
|
36
|
+
groupedQuery: 'sum by({{groupBy}}) (rate(http_requests_total{ {{selector}}, status=~"5.." }[1m]))',
|
|
33
37
|
},
|
|
34
38
|
],
|
|
35
39
|
latency_p99: [
|
|
36
40
|
{
|
|
37
41
|
seriesName: "http_request_duration_seconds_bucket",
|
|
38
42
|
query: 'histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{ {{selector}} }[1m])) by (le))',
|
|
43
|
+
groupedQuery: 'histogram_quantile(0.99, sum by(le, {{groupBy}}) (rate(http_request_duration_seconds_bucket{ {{selector}} }[1m])))',
|
|
39
44
|
},
|
|
40
45
|
],
|
|
41
46
|
latency_p50: [
|
|
42
47
|
{
|
|
43
48
|
seriesName: "http_request_duration_seconds_bucket",
|
|
44
49
|
query: 'histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket{ {{selector}} }[1m])) by (le))',
|
|
50
|
+
groupedQuery: 'histogram_quantile(0.50, sum by(le, {{groupBy}}) (rate(http_request_duration_seconds_bucket{ {{selector}} }[1m])))',
|
|
45
51
|
},
|
|
46
52
|
],
|
|
47
53
|
latency_avg: [
|
|
48
54
|
{
|
|
49
55
|
seriesName: "http_request_duration_seconds_sum",
|
|
50
56
|
query: 'sum(rate(http_request_duration_seconds_sum{ {{selector}} }[1m])) / sum(rate(http_request_duration_seconds_count{ {{selector}} }[1m]))',
|
|
57
|
+
groupedQuery: 'sum by({{groupBy}}) (rate(http_request_duration_seconds_sum{ {{selector}} }[1m])) / sum by({{groupBy}}) (rate(http_request_duration_seconds_count{ {{selector}} }[1m]))',
|
|
51
58
|
},
|
|
52
59
|
],
|
|
53
60
|
};
|
|
61
|
+
// Common breakdown labels probed for the auto-hint when no groupBy is set.
|
|
62
|
+
const HINT_BREAKDOWN_LABELS = ["instance", "pod"];
|
|
54
63
|
const DEFAULT_METRIC_META = {
|
|
55
64
|
cpu: { unit: "percent", description: "CPU usage % (auto: prom-client process_cpu_seconds_total or node_exporter node_cpu_seconds_total)" },
|
|
56
65
|
memory: { unit: "bytes", description: "Resident memory bytes (auto: prom-client process_resident_memory_bytes or node_memory used)" },
|
|
@@ -78,7 +87,7 @@ export class PrometheusConnector {
|
|
|
78
87
|
metrics = [];
|
|
79
88
|
serviceLabels = DEFAULT_SERVICE_LABELS;
|
|
80
89
|
labelValuesCache = new Map();
|
|
81
|
-
|
|
90
|
+
candidateCache = new Map();
|
|
82
91
|
userOverrides = new Set();
|
|
83
92
|
async connect(config) {
|
|
84
93
|
this.name = config.name;
|
|
@@ -187,83 +196,168 @@ export class PrometheusConnector {
|
|
|
187
196
|
return metrics;
|
|
188
197
|
}
|
|
189
198
|
async queryMetrics(params) {
|
|
190
|
-
const { promql, label } = await this.buildQuery(params.service, params.metric);
|
|
199
|
+
const { promql, label, candidate } = await this.buildQuery(params.service, params.metric, params.groupBy);
|
|
191
200
|
const { start, end, step } = this.parseTimeRange(params.duration, params.step);
|
|
192
201
|
const data = await this.apiGet(`/api/v1/query_range?query=${encodeURIComponent(promql)}&start=${start}&end=${end}&step=${step}`);
|
|
193
|
-
const
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
202
|
+
const seriesList = data?.data?.result || [];
|
|
203
|
+
// Build groups from each returned series, keyed either by the explicit
|
|
204
|
+
// groupBy label (when grouped) or by a synthesized name from any
|
|
205
|
+
// remaining labels (for naturally per-instance queries like cpu/memory
|
|
206
|
+
// prom-client). Empty when nothing came back.
|
|
207
|
+
const groups = [];
|
|
208
|
+
for (const series of seriesList) {
|
|
209
|
+
const seriesValues = [];
|
|
210
|
+
const rawValues = [];
|
|
211
|
+
for (const [ts, val] of series.values || []) {
|
|
212
|
+
const numVal = parseFloat(val);
|
|
213
|
+
if (!isNaN(numVal)) {
|
|
214
|
+
seriesValues.push({ timestamp: new Date(ts * 1000).toISOString(), value: numVal });
|
|
215
|
+
rawValues.push(numVal);
|
|
216
|
+
}
|
|
201
217
|
}
|
|
218
|
+
groups.push({
|
|
219
|
+
key: this.groupKey(series.metric || {}, params.groupBy),
|
|
220
|
+
values: seriesValues,
|
|
221
|
+
summary: this.computeSummary(rawValues),
|
|
222
|
+
});
|
|
202
223
|
}
|
|
203
|
-
|
|
224
|
+
// Top-level values/summary always reflect the first series (back-compat:
|
|
225
|
+
// single-aggregated queries always return one row, so this is unchanged).
|
|
226
|
+
const top = groups[0] || { values: [], summary: this.computeSummary([]) };
|
|
227
|
+
const result = {
|
|
204
228
|
source: this.name,
|
|
205
229
|
service: params.service,
|
|
206
230
|
metric: params.metric,
|
|
207
231
|
unit: this.getUnit(params.metric),
|
|
208
|
-
values,
|
|
209
|
-
summary:
|
|
232
|
+
values: top.values,
|
|
233
|
+
summary: top.summary,
|
|
210
234
|
resolvedSeries: promql,
|
|
211
235
|
resolvedLabel: label,
|
|
212
236
|
};
|
|
237
|
+
if (params.groupBy && groups.length > 1) {
|
|
238
|
+
result.groupBy = params.groupBy;
|
|
239
|
+
result.groups = groups;
|
|
240
|
+
}
|
|
241
|
+
else if (!params.groupBy && candidate) {
|
|
242
|
+
// Probe common breakdown labels and hint when more than one distinct
|
|
243
|
+
// value exists for this service. Helps the model ask the right
|
|
244
|
+
// follow-up instead of silently looking at an aggregated number.
|
|
245
|
+
const escaped = params.service.replace(/\\/g, "\\\\").replace(/"/g, '\\"');
|
|
246
|
+
for (const breakdownLabel of HINT_BREAKDOWN_LABELS) {
|
|
247
|
+
const distinct = await this.getDistinctLabelValues(candidate.seriesName, label, escaped, breakdownLabel);
|
|
248
|
+
if (distinct.length > 1) {
|
|
249
|
+
result.hint = `${distinct.length} distinct ${breakdownLabel}s exist for this service. Pass groupBy="${breakdownLabel}" to break the result down.`;
|
|
250
|
+
break;
|
|
251
|
+
}
|
|
252
|
+
}
|
|
253
|
+
}
|
|
254
|
+
return result;
|
|
213
255
|
}
|
|
214
256
|
// --- Private helpers ---
|
|
215
|
-
async buildQuery(service, metric) {
|
|
216
|
-
//
|
|
217
|
-
//
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
// their query verbatim.
|
|
257
|
+
async buildQuery(service, metric, groupBy) {
|
|
258
|
+
// Resolve the service-filter label first. Candidate probing uses this
|
|
259
|
+
// label to scope existence checks per-service rather than per-source.
|
|
260
|
+
const escaped = service.replace(/\\/g, "\\\\").replace(/"/g, '\\"');
|
|
261
|
+
let label = "job";
|
|
221
262
|
let template;
|
|
263
|
+
let candidate = null;
|
|
222
264
|
if (!this.userOverrides.has(metric) && PROMETHEUS_METRIC_CANDIDATES[metric]) {
|
|
223
|
-
|
|
224
|
-
|
|
265
|
+
label = await this.resolveServiceLabel(service);
|
|
266
|
+
candidate = await this.pickMetricCandidate(metric, label, escaped);
|
|
267
|
+
const fallback = PROMETHEUS_METRIC_CANDIDATES[metric][0];
|
|
268
|
+
const chosen = candidate || fallback;
|
|
269
|
+
template = (groupBy && chosen.groupedQuery) ? chosen.groupedQuery : chosen.query;
|
|
225
270
|
}
|
|
226
271
|
else {
|
|
227
272
|
const def = this.metrics.find((m) => m.name === metric);
|
|
228
273
|
template = def?.query || `${metric}{ {{selector}} }`;
|
|
229
274
|
}
|
|
230
|
-
const escaped = service.replace(/\\/g, "\\\\").replace(/"/g, '\\"');
|
|
231
275
|
let promql = template;
|
|
232
|
-
let label = "job";
|
|
233
276
|
if (template.includes("{{selector}}")) {
|
|
234
|
-
label
|
|
277
|
+
// Resolve label here for non-candidate paths that haven't done it yet.
|
|
278
|
+
if (label === "job" && !PROMETHEUS_METRIC_CANDIDATES[metric]) {
|
|
279
|
+
label = await this.resolveServiceLabel(service);
|
|
280
|
+
}
|
|
235
281
|
const selector = `${label}="${escaped}"`;
|
|
236
282
|
promql = promql.replace(/\{\{selector\}\}/g, selector);
|
|
237
283
|
}
|
|
284
|
+
if (groupBy && template.includes("{{groupBy}}")) {
|
|
285
|
+
// groupBy is a label name (caller-supplied). Constrain to the same
|
|
286
|
+
// safe character set we use for service names so it can't break out
|
|
287
|
+
// of the by(...) clause.
|
|
288
|
+
const safe = groupBy.replace(/[^a-zA-Z0-9_]/g, "");
|
|
289
|
+
promql = promql.replace(/\{\{groupBy\}\}/g, safe);
|
|
290
|
+
}
|
|
238
291
|
promql = promql.replace(/\{\{service\}\}/g, escaped);
|
|
239
|
-
return { promql, label };
|
|
292
|
+
return { promql, label, candidate };
|
|
293
|
+
}
|
|
294
|
+
groupKey(metric, groupBy) {
|
|
295
|
+
if (groupBy && metric[groupBy] !== undefined)
|
|
296
|
+
return metric[groupBy];
|
|
297
|
+
// No explicit groupBy: synthesize a key from instance/pod/node if any,
|
|
298
|
+
// else from all labels. Useful for naturally-per-series queries like
|
|
299
|
+
// process_resident_memory_bytes (no aggregator dropping labels).
|
|
300
|
+
for (const probe of HINT_BREAKDOWN_LABELS) {
|
|
301
|
+
if (metric[probe])
|
|
302
|
+
return metric[probe];
|
|
303
|
+
}
|
|
304
|
+
const entries = Object.entries(metric);
|
|
305
|
+
if (entries.length === 0)
|
|
306
|
+
return "default";
|
|
307
|
+
return entries.map(([k, v]) => `${k}=${v}`).join(",");
|
|
308
|
+
}
|
|
309
|
+
async getDistinctLabelValues(seriesName, label, escapedService, breakdownLabel) {
|
|
310
|
+
try {
|
|
311
|
+
const matchExpr = `${seriesName}{${label}="${escapedService}"}`;
|
|
312
|
+
const url = `/api/v1/label/${encodeURIComponent(breakdownLabel)}/values?match[]=${encodeURIComponent(matchExpr)}`;
|
|
313
|
+
const data = await this.apiGet(url);
|
|
314
|
+
return data?.data || [];
|
|
315
|
+
}
|
|
316
|
+
catch {
|
|
317
|
+
return [];
|
|
318
|
+
}
|
|
240
319
|
}
|
|
241
|
-
async pickMetricCandidate(metric) {
|
|
320
|
+
async pickMetricCandidate(metric, label, escapedService) {
|
|
242
321
|
const candidates = PROMETHEUS_METRIC_CANDIDATES[metric];
|
|
243
322
|
if (!candidates || candidates.length === 0)
|
|
244
323
|
return null;
|
|
245
324
|
if (candidates.length === 1)
|
|
246
325
|
return candidates[0];
|
|
247
|
-
|
|
326
|
+
// Per-service cache: a source can have BOTH process_* and node_* series
|
|
327
|
+
// present (e.g. an apps stack alongside node_exporter), so probing has
|
|
328
|
+
// to check whether THIS service has the series, not whether the source
|
|
329
|
+
// has it anywhere.
|
|
330
|
+
const cacheKey = `${metric}|${label}|${escapedService}`;
|
|
331
|
+
const cached = this.candidateCache.get(cacheKey);
|
|
332
|
+
if (cached && cached.expiresAt > Date.now())
|
|
333
|
+
return cached.candidate;
|
|
248
334
|
for (const c of candidates) {
|
|
249
|
-
if (
|
|
335
|
+
if (await this.seriesExistsForService(c.seriesName, label, escapedService)) {
|
|
336
|
+
this.candidateCache.set(cacheKey, {
|
|
337
|
+
candidate: c,
|
|
338
|
+
expiresAt: Date.now() + LABEL_CACHE_TTL_MS,
|
|
339
|
+
});
|
|
250
340
|
return c;
|
|
341
|
+
}
|
|
251
342
|
}
|
|
252
|
-
return
|
|
343
|
+
// Nothing found — return first candidate as best-effort. Cache the
|
|
344
|
+
// negative outcome so we don't probe again for 60s.
|
|
345
|
+
const fallback = candidates[0];
|
|
346
|
+
this.candidateCache.set(cacheKey, {
|
|
347
|
+
candidate: fallback,
|
|
348
|
+
expiresAt: Date.now() + LABEL_CACHE_TTL_MS,
|
|
349
|
+
});
|
|
350
|
+
return fallback;
|
|
253
351
|
}
|
|
254
|
-
async
|
|
255
|
-
if (this.metricNamesCache && this.metricNamesCache.expiresAt > Date.now()) {
|
|
256
|
-
return this.metricNamesCache.values;
|
|
257
|
-
}
|
|
352
|
+
async seriesExistsForService(seriesName, label, escapedService) {
|
|
258
353
|
try {
|
|
259
|
-
const
|
|
260
|
-
const
|
|
261
|
-
|
|
262
|
-
return
|
|
354
|
+
const matchExpr = `${seriesName}{${label}="${escapedService}"}`;
|
|
355
|
+
const url = `/api/v1/series?match[]=${encodeURIComponent(matchExpr)}`;
|
|
356
|
+
const data = await this.apiGet(url);
|
|
357
|
+
return Array.isArray(data?.data) && data.data.length > 0;
|
|
263
358
|
}
|
|
264
359
|
catch {
|
|
265
|
-
|
|
266
|
-
return new Set();
|
|
360
|
+
return false;
|
|
267
361
|
}
|
|
268
362
|
}
|
|
269
363
|
async resolveServiceLabel(service) {
|
package/dist/index.js
CHANGED
|
@@ -72,6 +72,7 @@ async function main() {
|
|
|
72
72
|
metric: z.string().describe(`Metric name. Available: ${uniqueNames.join(", ")}`),
|
|
73
73
|
duration: z.string().optional().describe("Time range (e.g. '5m', '1h', '24h'). Default: '5m'"),
|
|
74
74
|
source: z.string().optional().describe("Specific source name. If omitted, queries all metrics backends."),
|
|
75
|
+
groupBy: z.string().optional().describe("Label to break the result down by, e.g. 'instance', 'pod', 'node'. Returns one series per distinct value in 'groups'."),
|
|
75
76
|
}, async (args) => queryMetricsHandler(registry, args));
|
|
76
77
|
mcpServer.tool("query_logs", "Query logs for a service over a given timeframe. Returns log entries with a summary including error/warning counts and top error patterns.", {
|
|
77
78
|
service: z.string().describe("Service name (e.g. 'payment-service')"),
|
|
@@ -21,6 +21,10 @@ export declare const queryMetricsDefinition: {
|
|
|
21
21
|
type: string;
|
|
22
22
|
description: string;
|
|
23
23
|
};
|
|
24
|
+
groupBy: {
|
|
25
|
+
type: string;
|
|
26
|
+
description: string;
|
|
27
|
+
};
|
|
24
28
|
};
|
|
25
29
|
required: string[];
|
|
26
30
|
};
|
|
@@ -30,6 +34,7 @@ export declare function queryMetricsHandler(registry: ConnectorRegistry, args: {
|
|
|
30
34
|
metric: string;
|
|
31
35
|
duration?: string;
|
|
32
36
|
source?: string;
|
|
37
|
+
groupBy?: string;
|
|
33
38
|
}): Promise<{
|
|
34
39
|
content: {
|
|
35
40
|
type: "text";
|
|
@@ -21,6 +21,10 @@ export const queryMetricsDefinition = {
|
|
|
21
21
|
type: "string",
|
|
22
22
|
description: "Specific source name to query. If omitted, queries all metrics backends.",
|
|
23
23
|
},
|
|
24
|
+
groupBy: {
|
|
25
|
+
type: "string",
|
|
26
|
+
description: "Optional label to break the result down by, e.g. 'instance', 'pod', 'node'. When set, the response includes a 'groups' array with one time-series per distinct value. When the service has only one matching series, the result is unchanged.",
|
|
27
|
+
},
|
|
24
28
|
},
|
|
25
29
|
required: ["service", "metric"],
|
|
26
30
|
},
|
|
@@ -36,6 +40,9 @@ export async function queryMetricsHandler(registry, args) {
|
|
|
36
40
|
const metricErr = validateMetricName(args.metric, registry);
|
|
37
41
|
if (metricErr)
|
|
38
42
|
return errorResponse(metricErr);
|
|
43
|
+
if (args.groupBy && !/^[a-zA-Z_][a-zA-Z0-9_]*$/.test(args.groupBy)) {
|
|
44
|
+
return errorResponse(`Invalid groupBy "${args.groupBy}". Must be a valid Prometheus label name (alphanumeric + underscore, starting with letter/underscore).`);
|
|
45
|
+
}
|
|
39
46
|
const connectors = args.source
|
|
40
47
|
? [registry.getByName(args.source)].filter(Boolean)
|
|
41
48
|
: registry.getBySignal("metrics");
|
|
@@ -55,6 +62,7 @@ export async function queryMetricsHandler(registry, args) {
|
|
|
55
62
|
service: args.service,
|
|
56
63
|
metric: args.metric,
|
|
57
64
|
duration,
|
|
65
|
+
groupBy: args.groupBy,
|
|
58
66
|
});
|
|
59
67
|
results.push(result);
|
|
60
68
|
}
|
package/dist/types.d.ts
CHANGED
|
@@ -95,6 +95,7 @@ export interface MetricQuery {
|
|
|
95
95
|
metric: string;
|
|
96
96
|
duration: string;
|
|
97
97
|
step?: string;
|
|
98
|
+
groupBy?: string;
|
|
98
99
|
}
|
|
99
100
|
export interface LogQuery {
|
|
100
101
|
service: string;
|
|
@@ -114,6 +115,11 @@ export interface MetricSummary {
|
|
|
114
115
|
max: number;
|
|
115
116
|
trend: Trend;
|
|
116
117
|
}
|
|
118
|
+
export interface MetricGroup {
|
|
119
|
+
key: string;
|
|
120
|
+
values: DataPoint[];
|
|
121
|
+
summary: MetricSummary;
|
|
122
|
+
}
|
|
117
123
|
export interface MetricResult {
|
|
118
124
|
source: string;
|
|
119
125
|
service: string;
|
|
@@ -123,6 +129,9 @@ export interface MetricResult {
|
|
|
123
129
|
summary: MetricSummary;
|
|
124
130
|
resolvedSeries?: string;
|
|
125
131
|
resolvedLabel?: string;
|
|
132
|
+
groupBy?: string;
|
|
133
|
+
groups?: MetricGroup[];
|
|
134
|
+
hint?: string;
|
|
126
135
|
}
|
|
127
136
|
export interface LogEntry {
|
|
128
137
|
timestamp: string;
|
package/package.json
CHANGED