@thotischner/observability-mcp 1.2.2 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -23,6 +23,8 @@ export declare class PrometheusConnector implements ObservabilityConnector {
23
23
  listAvailableMetrics(_service: string): Promise<MetricInfo[]>;
24
24
  queryMetrics(params: MetricQuery): Promise<MetricResult>;
25
25
  private buildQuery;
26
+ private groupKey;
27
+ private getDistinctLabelValues;
26
28
  private pickMetricCandidate;
27
29
  private seriesExistsForService;
28
30
  private resolveServiceLabel;
@@ -3,11 +3,13 @@ const PROMETHEUS_METRIC_CANDIDATES = {
3
3
  cpu: [
4
4
  {
5
5
  seriesName: "process_cpu_seconds_total",
6
+ // rate() preserves all labels — already broken down per-instance.
6
7
  query: 'rate(process_cpu_seconds_total{ {{selector}} }[1m]) * 100',
7
8
  },
8
9
  {
9
10
  seriesName: "node_cpu_seconds_total",
10
11
  query: '100 - avg(rate(node_cpu_seconds_total{ {{selector}}, mode="idle" }[1m])) * 100',
12
+ groupedQuery: '100 - avg by({{groupBy}}) (rate(node_cpu_seconds_total{ {{selector}}, mode="idle" }[1m])) * 100',
11
13
  },
12
14
  ],
13
15
  memory: [
@@ -24,33 +26,40 @@ const PROMETHEUS_METRIC_CANDIDATES = {
24
26
  {
25
27
  seriesName: "http_requests_total",
26
28
  query: 'sum(rate(http_requests_total{ {{selector}} }[1m]))',
29
+ groupedQuery: 'sum by({{groupBy}}) (rate(http_requests_total{ {{selector}} }[1m]))',
27
30
  },
28
31
  ],
29
32
  error_rate: [
30
33
  {
31
34
  seriesName: "http_requests_total",
32
35
  query: 'sum(rate(http_requests_total{ {{selector}}, status=~"5.." }[1m]))',
36
+ groupedQuery: 'sum by({{groupBy}}) (rate(http_requests_total{ {{selector}}, status=~"5.." }[1m]))',
33
37
  },
34
38
  ],
35
39
  latency_p99: [
36
40
  {
37
41
  seriesName: "http_request_duration_seconds_bucket",
38
42
  query: 'histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{ {{selector}} }[1m])) by (le))',
43
+ groupedQuery: 'histogram_quantile(0.99, sum by(le, {{groupBy}}) (rate(http_request_duration_seconds_bucket{ {{selector}} }[1m])))',
39
44
  },
40
45
  ],
41
46
  latency_p50: [
42
47
  {
43
48
  seriesName: "http_request_duration_seconds_bucket",
44
49
  query: 'histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket{ {{selector}} }[1m])) by (le))',
50
+ groupedQuery: 'histogram_quantile(0.50, sum by(le, {{groupBy}}) (rate(http_request_duration_seconds_bucket{ {{selector}} }[1m])))',
45
51
  },
46
52
  ],
47
53
  latency_avg: [
48
54
  {
49
55
  seriesName: "http_request_duration_seconds_sum",
50
56
  query: 'sum(rate(http_request_duration_seconds_sum{ {{selector}} }[1m])) / sum(rate(http_request_duration_seconds_count{ {{selector}} }[1m]))',
57
+ groupedQuery: 'sum by({{groupBy}}) (rate(http_request_duration_seconds_sum{ {{selector}} }[1m])) / sum by({{groupBy}}) (rate(http_request_duration_seconds_count{ {{selector}} }[1m]))',
51
58
  },
52
59
  ],
53
60
  };
61
+ // Common breakdown labels: probed for the auto-hint when no groupBy is set, and tried first when naming a series group.
62
+ const HINT_BREAKDOWN_LABELS = ["instance", "pod"];
54
63
  const DEFAULT_METRIC_META = {
55
64
  cpu: { unit: "percent", description: "CPU usage % (auto: prom-client process_cpu_seconds_total or node_exporter node_cpu_seconds_total)" },
56
65
  memory: { unit: "bytes", description: "Resident memory bytes (auto: prom-client process_resident_memory_bytes or node_memory used)" },
@@ -187,41 +196,77 @@ export class PrometheusConnector {
187
196
  return metrics;
188
197
  }
189
198
  async queryMetrics(params) {
190
- const { promql, label } = await this.buildQuery(params.service, params.metric);
199
+ const { promql, label, candidate } = await this.buildQuery(params.service, params.metric, params.groupBy);
191
200
  const { start, end, step } = this.parseTimeRange(params.duration, params.step);
192
201
  const data = await this.apiGet(`/api/v1/query_range?query=${encodeURIComponent(promql)}&start=${start}&end=${end}&step=${step}`);
193
- const values = [];
194
- const rawValues = [];
195
- const resultData = data?.data?.result?.[0]?.values || [];
196
- for (const [ts, val] of resultData) {
197
- const numVal = parseFloat(val);
198
- if (!isNaN(numVal)) {
199
- values.push({ timestamp: new Date(ts * 1000).toISOString(), value: numVal });
200
- rawValues.push(numVal);
202
+ const seriesList = data?.data?.result || [];
203
+ // Build groups from each returned series, keyed either by the explicit
204
+ // groupBy label (when grouped) or by a synthesized name from any
205
+ // remaining labels (for naturally per-instance queries like cpu/memory
206
+ // prom-client). Empty when nothing came back.
207
+ const groups = [];
208
+ for (const series of seriesList) {
209
+ const seriesValues = [];
210
+ const rawValues = [];
211
+ for (const [ts, val] of series.values || []) {
212
+ const numVal = parseFloat(val);
213
+ if (!isNaN(numVal)) {
214
+ seriesValues.push({ timestamp: new Date(ts * 1000).toISOString(), value: numVal });
215
+ rawValues.push(numVal);
216
+ }
201
217
  }
218
+ groups.push({
219
+ key: this.groupKey(series.metric || {}, params.groupBy),
220
+ values: seriesValues,
221
+ summary: this.computeSummary(rawValues),
222
+ });
202
223
  }
203
- return {
224
+ // Top-level values/summary always reflect the first series (back-compat:
225
+ // single-aggregated queries always return one row, so this is unchanged).
226
+ const top = groups[0] || { values: [], summary: this.computeSummary([]) };
227
+ const result = {
204
228
  source: this.name,
205
229
  service: params.service,
206
230
  metric: params.metric,
207
231
  unit: this.getUnit(params.metric),
208
- values,
209
- summary: this.computeSummary(rawValues),
232
+ values: top.values,
233
+ summary: top.summary,
210
234
  resolvedSeries: promql,
211
235
  resolvedLabel: label,
212
236
  };
237
+ if (params.groupBy && groups.length > 1) {
238
+ result.groupBy = params.groupBy;
239
+ result.groups = groups;
240
+ }
241
+ else if (!params.groupBy && candidate) {
242
+ // Probe common breakdown labels and hint when more than one distinct
243
+ // value exists for this service. Helps the model ask the right
244
+ // follow-up instead of silently looking at an aggregated number.
245
+ const escaped = params.service.replace(/\\/g, "\\\\").replace(/"/g, '\\"');
246
+ for (const breakdownLabel of HINT_BREAKDOWN_LABELS) {
247
+ const distinct = await this.getDistinctLabelValues(candidate.seriesName, label, escaped, breakdownLabel);
248
+ if (distinct.length > 1) {
249
+ result.hint = `${distinct.length} distinct ${breakdownLabel}s exist for this service. Pass groupBy="${breakdownLabel}" to break the result down.`;
250
+ break;
251
+ }
252
+ }
253
+ }
254
+ return result;
213
255
  }
214
256
  // --- Private helpers ---
215
- async buildQuery(service, metric) {
257
+ async buildQuery(service, metric, groupBy) {
216
258
  // Resolve the service-filter label first. Candidate probing uses this
217
259
  // label to scope existence checks per-service rather than per-source.
218
260
  const escaped = service.replace(/\\/g, "\\\\").replace(/"/g, '\\"');
219
261
  let label = "job";
220
262
  let template;
263
+ let candidate = null;
221
264
  if (!this.userOverrides.has(metric) && PROMETHEUS_METRIC_CANDIDATES[metric]) {
222
265
  label = await this.resolveServiceLabel(service);
223
- const candidate = await this.pickMetricCandidate(metric, label, escaped);
224
- template = candidate?.query || PROMETHEUS_METRIC_CANDIDATES[metric][0].query;
266
+ candidate = await this.pickMetricCandidate(metric, label, escaped);
267
+ const fallback = PROMETHEUS_METRIC_CANDIDATES[metric][0];
268
+ const chosen = candidate || fallback;
269
+ template = (groupBy && chosen.groupedQuery) ? chosen.groupedQuery : chosen.query;
225
270
  }
226
271
  else {
227
272
  const def = this.metrics.find((m) => m.name === metric);
@@ -236,8 +281,41 @@ export class PrometheusConnector {
236
281
  const selector = `${label}="${escaped}"`;
237
282
  promql = promql.replace(/\{\{selector\}\}/g, selector);
238
283
  }
284
+ if (groupBy && template.includes("{{groupBy}}")) {
285
+ // groupBy is a label name (caller-supplied). Constrain to the same
286
+ // safe character set we use for service names so it can't break out
287
+ // of the by(...) clause.
288
+ const safe = groupBy.replace(/[^a-zA-Z0-9_]/g, "");
289
+ promql = promql.replace(/\{\{groupBy\}\}/g, safe);
290
+ }
239
291
  promql = promql.replace(/\{\{service\}\}/g, escaped);
240
- return { promql, label };
292
+ return { promql, label, candidate };
293
+ }
294
+ groupKey(metric, groupBy) {
295
+ if (groupBy && metric[groupBy] !== undefined)
296
+ return metric[groupBy];
297
+ // groupBy unset or absent on this series: synthesize a key from instance/pod if any,
298
+ // else from all labels. Useful for naturally-per-series queries like
299
+ // process_resident_memory_bytes (no aggregator dropping labels).
300
+ for (const probe of HINT_BREAKDOWN_LABELS) {
301
+ if (metric[probe])
302
+ return metric[probe];
303
+ }
304
+ const entries = Object.entries(metric);
305
+ if (entries.length === 0)
306
+ return "default";
307
+ return entries.map(([k, v]) => `${k}=${v}`).join(",");
308
+ }
309
+ async getDistinctLabelValues(seriesName, label, escapedService, breakdownLabel) {
310
+ try {
311
+ const matchExpr = `${seriesName}{${label}="${escapedService}"}`;
312
+ const url = `/api/v1/label/${encodeURIComponent(breakdownLabel)}/values?match[]=${encodeURIComponent(matchExpr)}`;
313
+ const data = await this.apiGet(url);
314
+ return data?.data || [];
315
+ }
316
+ catch {
317
+ return [];
318
+ }
241
319
  }
242
320
  async pickMetricCandidate(metric, label, escapedService) {
243
321
  const candidates = PROMETHEUS_METRIC_CANDIDATES[metric];
package/dist/index.js CHANGED
@@ -72,6 +72,7 @@ async function main() {
72
72
  metric: z.string().describe(`Metric name. Available: ${uniqueNames.join(", ")}`),
73
73
  duration: z.string().optional().describe("Time range (e.g. '5m', '1h', '24h'). Default: '5m'"),
74
74
  source: z.string().optional().describe("Specific source name. If omitted, queries all metrics backends."),
75
+ groupBy: z.string().optional().describe("Label to break the result down by, e.g. 'instance', 'pod', 'node'. Returns one series per distinct value in 'groups'."),
75
76
  }, async (args) => queryMetricsHandler(registry, args));
76
77
  mcpServer.tool("query_logs", "Query logs for a service over a given timeframe. Returns log entries with a summary including error/warning counts and top error patterns.", {
77
78
  service: z.string().describe("Service name (e.g. 'payment-service')"),
@@ -21,6 +21,10 @@ export declare const queryMetricsDefinition: {
21
21
  type: string;
22
22
  description: string;
23
23
  };
24
+ groupBy: {
25
+ type: string;
26
+ description: string;
27
+ };
24
28
  };
25
29
  required: string[];
26
30
  };
@@ -30,6 +34,7 @@ export declare function queryMetricsHandler(registry: ConnectorRegistry, args: {
30
34
  metric: string;
31
35
  duration?: string;
32
36
  source?: string;
37
+ groupBy?: string;
33
38
  }): Promise<{
34
39
  content: {
35
40
  type: "text";
@@ -21,6 +21,10 @@ export const queryMetricsDefinition = {
21
21
  type: "string",
22
22
  description: "Specific source name to query. If omitted, queries all metrics backends.",
23
23
  },
24
+ groupBy: {
25
+ type: "string",
26
+ description: "Optional label to break the result down by, e.g. 'instance', 'pod', 'node'. When set, the response includes a 'groups' array with one time-series per distinct value. When the service has only one matching series, the result is unchanged.",
27
+ },
24
28
  },
25
29
  required: ["service", "metric"],
26
30
  },
@@ -36,6 +40,9 @@ export async function queryMetricsHandler(registry, args) {
36
40
  const metricErr = validateMetricName(args.metric, registry);
37
41
  if (metricErr)
38
42
  return errorResponse(metricErr);
43
+ if (args.groupBy && !/^[a-zA-Z_][a-zA-Z0-9_]*$/.test(args.groupBy)) {
44
+ return errorResponse(`Invalid groupBy "${args.groupBy}". Must be a valid Prometheus label name (alphanumeric + underscore, starting with letter/underscore).`);
45
+ }
39
46
  const connectors = args.source
40
47
  ? [registry.getByName(args.source)].filter(Boolean)
41
48
  : registry.getBySignal("metrics");
@@ -55,6 +62,7 @@ export async function queryMetricsHandler(registry, args) {
55
62
  service: args.service,
56
63
  metric: args.metric,
57
64
  duration,
65
+ groupBy: args.groupBy,
58
66
  });
59
67
  results.push(result);
60
68
  }
package/dist/types.d.ts CHANGED
@@ -95,6 +95,7 @@ export interface MetricQuery {
95
95
  metric: string;
96
96
  duration: string;
97
97
  step?: string;
98
+ groupBy?: string;
98
99
  }
99
100
  export interface LogQuery {
100
101
  service: string;
@@ -114,6 +115,11 @@ export interface MetricSummary {
114
115
  max: number;
115
116
  trend: Trend;
116
117
  }
118
+ export interface MetricGroup {
119
+ key: string;
120
+ values: DataPoint[];
121
+ summary: MetricSummary;
122
+ }
117
123
  export interface MetricResult {
118
124
  source: string;
119
125
  service: string;
@@ -123,6 +129,9 @@ export interface MetricResult {
123
129
  summary: MetricSummary;
124
130
  resolvedSeries?: string;
125
131
  resolvedLabel?: string;
132
+ groupBy?: string;
133
+ groups?: MetricGroup[];
134
+ hint?: string;
126
135
  }
127
136
  export interface LogEntry {
128
137
  timestamp: string;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@thotischner/observability-mcp",
3
- "version": "1.2.2",
3
+ "version": "1.3.0",
4
4
  "description": "Unified observability gateway for AI agents — one MCP server for Prometheus, Loki, and any backend",
5
5
  "type": "module",
6
6
  "license": "MIT",