npm - @thotischner/observability-mcp - Versions diffs - 1.2.2 → 1.3.1 - Mend

@thotischner/observability-mcp 1.2.2 → 1.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7)

package/dist/connectors/prometheus.d.ts +2 -0
package/dist/connectors/prometheus.js +94 -16
package/dist/index.js +41 -32
package/dist/tools/query-metrics.d.ts +5 -0
package/dist/tools/query-metrics.js +8 -0
package/dist/types.d.ts +9 -0
package/package.json +1 -1

package/dist/connectors/prometheus.d.ts CHANGED

@@ -23,6 +23,8 @@ export declare class PrometheusConnector implements ObservabilityConnector {
     listAvailableMetrics(_service: string): Promise<MetricInfo[]>;
     queryMetrics(params: MetricQuery): Promise<MetricResult>;
     private buildQuery;
+    private groupKey;
+    private getDistinctLabelValues;
     private pickMetricCandidate;
     private seriesExistsForService;
     private resolveServiceLabel;

package/dist/connectors/prometheus.js CHANGED

@@ -3,11 +3,13 @@ const PROMETHEUS_METRIC_CANDIDATES = {
     cpu: [
         {
             seriesName: "process_cpu_seconds_total",
+            // rate() preserves all labels — already broken down per-instance.
             query: 'rate(process_cpu_seconds_total{ {{selector}} }[1m]) * 100',
         },
         {
             seriesName: "node_cpu_seconds_total",
             query: '100 - avg(rate(node_cpu_seconds_total{ {{selector}}, mode="idle" }[1m])) * 100',
+            groupedQuery: '100 - avg by({{groupBy}}) (rate(node_cpu_seconds_total{ {{selector}}, mode="idle" }[1m])) * 100',
         },
     ],
     memory: [
@@ -24,33 +26,40 @@ const PROMETHEUS_METRIC_CANDIDATES = {
         {
             seriesName: "http_requests_total",
             query: 'sum(rate(http_requests_total{ {{selector}} }[1m]))',
+            groupedQuery: 'sum by({{groupBy}}) (rate(http_requests_total{ {{selector}} }[1m]))',
         },
     ],
     error_rate: [
         {
             seriesName: "http_requests_total",
             query: 'sum(rate(http_requests_total{ {{selector}}, status=~"5.." }[1m]))',
+            groupedQuery: 'sum by({{groupBy}}) (rate(http_requests_total{ {{selector}}, status=~"5.." }[1m]))',
         },
     ],
     latency_p99: [
         {
             seriesName: "http_request_duration_seconds_bucket",
             query: 'histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{ {{selector}} }[1m])) by (le))',
+            groupedQuery: 'histogram_quantile(0.99, sum by(le, {{groupBy}}) (rate(http_request_duration_seconds_bucket{ {{selector}} }[1m])))',
         },
     ],
     latency_p50: [
         {
             seriesName: "http_request_duration_seconds_bucket",
             query: 'histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket{ {{selector}} }[1m])) by (le))',
+            groupedQuery: 'histogram_quantile(0.50, sum by(le, {{groupBy}}) (rate(http_request_duration_seconds_bucket{ {{selector}} }[1m])))',
         },
     ],
     latency_avg: [
         {
             seriesName: "http_request_duration_seconds_sum",
             query: 'sum(rate(http_request_duration_seconds_sum{ {{selector}} }[1m])) / sum(rate(http_request_duration_seconds_count{ {{selector}} }[1m]))',
+            groupedQuery: 'sum by({{groupBy}}) (rate(http_request_duration_seconds_sum{ {{selector}} }[1m])) / sum by({{groupBy}}) (rate(http_request_duration_seconds_count{ {{selector}} }[1m]))',
         },
     ],
 };
+// Common breakdown labels probed for the auto-hint when no groupBy is set.
+const HINT_BREAKDOWN_LABELS = ["instance", "pod"];
 const DEFAULT_METRIC_META = {
     cpu: { unit: "percent", description: "CPU usage % (auto: prom-client process_cpu_seconds_total or node_exporter node_cpu_seconds_total)" },
     memory: { unit: "bytes", description: "Resident memory bytes (auto: prom-client process_resident_memory_bytes or node_memory used)" },
@@ -187,41 +196,77 @@ export class PrometheusConnector {
         return metrics;
     }
     async queryMetrics(params) {
-        const { promql, label } = await this.buildQuery(params.service, params.metric);
+        const { promql, label, candidate } = await this.buildQuery(params.service, params.metric, params.groupBy);
         const { start, end, step } = this.parseTimeRange(params.duration, params.step);
         const data = await this.apiGet(`/api/v1/query_range?query=${encodeURIComponent(promql)}&start=${start}&end=${end}&step=${step}`);
-        const values = [];
-        const rawValues = [];
-        const resultData = data?.data?.result?.[0]?.values || [];
-        for (const [ts, val] of resultData) {
-            const numVal = parseFloat(val);
-            if (!isNaN(numVal)) {
-                values.push({ timestamp: new Date(ts * 1000).toISOString(), value: numVal });
-                rawValues.push(numVal);
+        const seriesList = data?.data?.result || [];
+        // Build groups from each returned series, keyed either by the explicit
+        // groupBy label (when grouped) or by a synthesized name from any
+        // remaining labels (for naturally per-instance queries like cpu/memory
+        // prom-client). Empty when nothing came back.
+        const groups = [];
+        for (const series of seriesList) {
+            const seriesValues = [];
+            const rawValues = [];
+            for (const [ts, val] of series.values || []) {
+                const numVal = parseFloat(val);
+                if (!isNaN(numVal)) {
+                    seriesValues.push({ timestamp: new Date(ts * 1000).toISOString(), value: numVal });
+                    rawValues.push(numVal);
+                }
             }
+            groups.push({
+                key: this.groupKey(series.metric || {}, params.groupBy),
+                values: seriesValues,
+                summary: this.computeSummary(rawValues),
+            });
         }
-        return {
+        // Top-level values/summary always reflect the first series (back-compat:
+        // single-aggregated queries always return one row, so this is unchanged).
+        const top = groups[0] || { values: [], summary: this.computeSummary([]) };
+        const result = {
             source: this.name,
             service: params.service,
             metric: params.metric,
             unit: this.getUnit(params.metric),
-            values,
-            summary: this.computeSummary(rawValues),
+            values: top.values,
+            summary: top.summary,
             resolvedSeries: promql,
             resolvedLabel: label,
         };
+        if (params.groupBy && groups.length > 1) {
+            result.groupBy = params.groupBy;
+            result.groups = groups;
+        }
+        else if (!params.groupBy && candidate) {
+            // Probe common breakdown labels and hint when more than one distinct
+            // value exists for this service. Helps the model ask the right
+            // follow-up instead of silently looking at an aggregated number.
+            const escaped = params.service.replace(/\\/g, "\\\\").replace(/"/g, '\\"');
+            for (const breakdownLabel of HINT_BREAKDOWN_LABELS) {
+                const distinct = await this.getDistinctLabelValues(candidate.seriesName, label, escaped, breakdownLabel);
+                if (distinct.length > 1) {
+                    result.hint = `${distinct.length} distinct ${breakdownLabel}s exist for this service. Pass groupBy="${breakdownLabel}" to break the result down.`;
+                    break;
+                }
+            }
+        }
+        return result;
     }
     // --- Private helpers ---
-    async buildQuery(service, metric) {
+    async buildQuery(service, metric, groupBy) {
         // Resolve the service-filter label first. Candidate probing uses this
         // label to scope existence checks per-service rather than per-source.
         const escaped = service.replace(/\\/g, "\\\\").replace(/"/g, '\\"');
         let label = "job";
         let template;
+        let candidate = null;
         if (!this.userOverrides.has(metric) && PROMETHEUS_METRIC_CANDIDATES[metric]) {
             label = await this.resolveServiceLabel(service);
-            const candidate = await this.pickMetricCandidate(metric, label, escaped);
-            template = candidate?.query || PROMETHEUS_METRIC_CANDIDATES[metric][0].query;
+            candidate = await this.pickMetricCandidate(metric, label, escaped);
+            const fallback = PROMETHEUS_METRIC_CANDIDATES[metric][0];
+            const chosen = candidate || fallback;
+            template = (groupBy && chosen.groupedQuery) ? chosen.groupedQuery : chosen.query;
         }
         else {
             const def = this.metrics.find((m) => m.name === metric);
@@ -236,8 +281,41 @@ export class PrometheusConnector {
             const selector = `${label}="${escaped}"`;
             promql = promql.replace(/\{\{selector\}\}/g, selector);
         }
+        if (groupBy && template.includes("{{groupBy}}")) {
+            // groupBy is a label name (caller-supplied). Constrain to the same
+            // safe character set we use for service names so it can't break out
+            // of the by(...) clause.
+            const safe = groupBy.replace(/[^a-zA-Z0-9_]/g, "");
+            promql = promql.replace(/\{\{groupBy\}\}/g, safe);
+        }
         promql = promql.replace(/\{\{service\}\}/g, escaped);
-        return { promql, label };
+        return { promql, label, candidate };
+    }
+    groupKey(metric, groupBy) {
+        if (groupBy && metric[groupBy] !== undefined)
+            return metric[groupBy];
+        // No explicit groupBy: synthesize a key from instance/pod/node if any,
+        // else from all labels. Useful for naturally-per-series queries like
+        // process_resident_memory_bytes (no aggregator dropping labels).
+        for (const probe of HINT_BREAKDOWN_LABELS) {
+            if (metric[probe])
+                return metric[probe];
+        }
+        const entries = Object.entries(metric);
+        if (entries.length === 0)
+            return "default";
+        return entries.map(([k, v]) => `${k}=${v}`).join(",");
+    }
+    async getDistinctLabelValues(seriesName, label, escapedService, breakdownLabel) {
+        try {
+            const matchExpr = `${seriesName}{${label}="${escapedService}"}`;
+            const url = `/api/v1/label/${encodeURIComponent(breakdownLabel)}/values?match[]=${encodeURIComponent(matchExpr)}`;
+            const data = await this.apiGet(url);
+            return data?.data || [];
+        }
+        catch {
+            return [];
+        }
     }
     async pickMetricCandidate(metric, label, escapedService) {
         const candidates = PROMETHEUS_METRIC_CANDIDATES[metric];

package/dist/index.js CHANGED

@@ -57,37 +57,45 @@ async function main() {
     const registry = new ConnectorRegistry();
     await registry.initialize(config);
     applyConfigToRuntime(config, registry);
-    const mcpServer = new McpServer({
-        name: "observability-mcp",
-        version: "1.0.0",
-    });
-    // --- Register tools with Zod schemas ---
-    mcpServer.tool("list_sources", "List all configured observability backends and their connection status. Use this to discover what data sources are available.", {}, async () => listSourcesHandler(registry));
-    mcpServer.tool("list_services", "List all monitored services discovered across all connected backends. Returns service names, their data sources, and signal types (metrics/logs).", { filter: z.string().optional().describe("Optional filter to match service names") }, async (args) => listServicesHandler(registry, args));
-    const metricsList = getAvailableMetricNames(registry);
-    const metricNames = registry.getBySignal("metrics").flatMap(c => c.getMetrics().map(m => m.name));
-    const uniqueNames = [...new Set(metricNames)];
-    mcpServer.tool("query_metrics", `Query a specific metric for a service over a given timeframe. Returns time-series data with pre-computed summary statistics (current, average, min, max, trend). Available metrics: ${metricsList}`, {
-        service: z.string().describe("Service name (e.g. 'api-gateway', 'payment-service')"),
-        metric: z.string().describe(`Metric name. Available: ${uniqueNames.join(", ")}`),
-        duration: z.string().optional().describe("Time range (e.g. '5m', '1h', '24h'). Default: '5m'"),
-        source: z.string().optional().describe("Specific source name. If omitted, queries all metrics backends."),
-    }, async (args) => queryMetricsHandler(registry, args));
-    mcpServer.tool("query_logs", "Query logs for a service over a given timeframe. Returns log entries with a summary including error/warning counts and top error patterns.", {
-        service: z.string().describe("Service name (e.g. 'payment-service')"),
-        query: z.string().optional().describe("Optional search query to filter log messages (regex supported)"),
-        duration: z.string().optional().describe("Time range (e.g. '5m', '1h', '24h'). Default: '5m'"),
-        level: z.string().optional().describe("Filter by log level: 'error', 'warn', 'info', 'debug'"),
-        limit: z.number().optional().describe("Maximum log entries to return. Default: 100"),
-    }, async (args) => queryLogsHandler(registry, args));
-    mcpServer.tool("get_service_health", "Get an aggregated health overview for a service combining metrics AND logs. Returns health score (0-100), status (healthy/degraded/critical), key metrics, log error summary, anomalies, and cross-signal correlations.", {
-        service: z.string().describe("Service name to check health for"),
-    }, async (args) => getServiceHealthHandler(registry, args));
-    mcpServer.tool("detect_anomalies", "Scan for anomalies across all monitored services (or a specific one). Uses z-score analysis on metrics, checks log error spikes, and correlates signals. Returns anomalies with severity ratings.", {
-        service: z.string().optional().describe("Specific service to scan. If omitted, scans all."),
-        duration: z.string().optional().describe("Time range to analyze (e.g. '5m', '15m', '1h'). Default: '10m'"),
-        sensitivity: z.enum(["low", "medium", "high"]).optional().describe("Detection sensitivity: low (>3σ), medium (>2σ), high (>1.5σ). Default: 'medium'"),
-    }, async (args) => detectAnomaliesHandler(registry, args));
+    // The MCP SDK Protocol class permits exactly one transport per instance,
+    // so we cannot share a single McpServer across HTTP sessions. Each new
+    // session needs its own server. The factory captures the live registry
+    // by reference so tool handlers always see the current configuration.
+    function createMcpServer() {
+        const mcpServer = new McpServer({
+            name: "observability-mcp",
+            version: "1.3.0",
+        });
+        // --- Register tools with Zod schemas ---
+        mcpServer.tool("list_sources", "List all configured observability backends and their connection status. Use this to discover what data sources are available.", {}, async () => listSourcesHandler(registry));
+        mcpServer.tool("list_services", "List all monitored services discovered across all connected backends. Returns service names, their data sources, and signal types (metrics/logs).", { filter: z.string().optional().describe("Optional filter to match service names") }, async (args) => listServicesHandler(registry, args));
+        const metricsList = getAvailableMetricNames(registry);
+        const metricNames = registry.getBySignal("metrics").flatMap(c => c.getMetrics().map(m => m.name));
+        const uniqueNames = [...new Set(metricNames)];
+        mcpServer.tool("query_metrics", `Query a specific metric for a service over a given timeframe. Returns time-series data with pre-computed summary statistics (current, average, min, max, trend). Available metrics: ${metricsList}`, {
+            service: z.string().describe("Service name (e.g. 'api-gateway', 'payment-service')"),
+            metric: z.string().describe(`Metric name. Available: ${uniqueNames.join(", ")}`),
+            duration: z.string().optional().describe("Time range (e.g. '5m', '1h', '24h'). Default: '5m'"),
+            source: z.string().optional().describe("Specific source name. If omitted, queries all metrics backends."),
+            groupBy: z.string().optional().describe("Label to break the result down by, e.g. 'instance', 'pod', 'node'. Returns one series per distinct value in 'groups'."),
+        }, async (args) => queryMetricsHandler(registry, args));
+        mcpServer.tool("query_logs", "Query logs for a service over a given timeframe. Returns log entries with a summary including error/warning counts and top error patterns.", {
+            service: z.string().describe("Service name (e.g. 'payment-service')"),
+            query: z.string().optional().describe("Optional search query to filter log messages (regex supported)"),
+            duration: z.string().optional().describe("Time range (e.g. '5m', '1h', '24h'). Default: '5m'"),
+            level: z.string().optional().describe("Filter by log level: 'error', 'warn', 'info', 'debug'"),
+            limit: z.number().optional().describe("Maximum log entries to return. Default: 100"),
+        }, async (args) => queryLogsHandler(registry, args));
+        mcpServer.tool("get_service_health", "Get an aggregated health overview for a service combining metrics AND logs. Returns health score (0-100), status (healthy/degraded/critical), key metrics, log error summary, anomalies, and cross-signal correlations.", {
+            service: z.string().describe("Service name to check health for"),
+        }, async (args) => getServiceHealthHandler(registry, args));
+        mcpServer.tool("detect_anomalies", "Scan for anomalies across all monitored services (or a specific one). Uses z-score analysis on metrics, checks log error spikes, and correlates signals. Returns anomalies with severity ratings.", {
+            service: z.string().optional().describe("Specific service to scan. If omitted, scans all."),
+            duration: z.string().optional().describe("Time range to analyze (e.g. '5m', '15m', '1h'). Default: '10m'"),
+            sensitivity: z.enum(["low", "medium", "high"]).optional().describe("Detection sensitivity: low (>3σ), medium (>2σ), high (>1.5σ). Default: 'medium'"),
+        }, async (args) => detectAnomaliesHandler(registry, args));
+        return mcpServer;
+    }
     // --- HTTP server ---
     const app = express();
     app.use(express.json({ limit: "1mb" }));
@@ -378,7 +386,8 @@ async function main() {
                     }
                 }
             };
-            await mcpServer.connect(transport);
+            const sessionMcpServer = createMcpServer();
+            await sessionMcpServer.connect(transport);
         }
         await transport.handleRequest(req, res, req.body);
         // Store session after handling (sessionId is set during handleRequest)

package/dist/tools/query-metrics.d.ts CHANGED

@@ -21,6 +21,10 @@ export declare const queryMetricsDefinition: {
                 type: string;
                 description: string;
             };
+            groupBy: {
+                type: string;
+                description: string;
+            };
         };
         required: string[];
     };
@@ -30,6 +34,7 @@ export declare function queryMetricsHandler(registry: ConnectorRegistry, args: {
     metric: string;
     duration?: string;
     source?: string;
+    groupBy?: string;
 }): Promise<{
     content: {
         type: "text";

package/dist/tools/query-metrics.js CHANGED

@@ -21,6 +21,10 @@ export const queryMetricsDefinition = {
                 type: "string",
                 description: "Specific source name to query. If omitted, queries all metrics backends.",
             },
+            groupBy: {
+                type: "string",
+                description: "Optional label to break the result down by, e.g. 'instance', 'pod', 'node'. When set, the response includes a 'groups' array with one time-series per distinct value. When the service has only one matching series, the result is unchanged.",
+            },
         },
         required: ["service", "metric"],
     },
@@ -36,6 +40,9 @@ export async function queryMetricsHandler(registry, args) {
     const metricErr = validateMetricName(args.metric, registry);
     if (metricErr)
         return errorResponse(metricErr);
+    if (args.groupBy && !/^[a-zA-Z_][a-zA-Z0-9_]*$/.test(args.groupBy)) {
+        return errorResponse(`Invalid groupBy "${args.groupBy}". Must be a valid Prometheus label name (alphanumeric + underscore, starting with letter/underscore).`);
+    }
     const connectors = args.source
         ? [registry.getByName(args.source)].filter(Boolean)
         : registry.getBySignal("metrics");
@@ -55,6 +62,7 @@ export async function queryMetricsHandler(registry, args) {
                 service: args.service,
                 metric: args.metric,
                 duration,
+                groupBy: args.groupBy,
             });
             results.push(result);
         }

package/dist/types.d.ts CHANGED

@@ -95,6 +95,7 @@ export interface MetricQuery {
     metric: string;
     duration: string;
     step?: string;
+    groupBy?: string;
 }
 export interface LogQuery {
     service: string;
@@ -114,6 +115,11 @@ export interface MetricSummary {
     max: number;
     trend: Trend;
 }
+export interface MetricGroup {
+    key: string;
+    values: DataPoint[];
+    summary: MetricSummary;
+}
 export interface MetricResult {
     source: string;
     service: string;
@@ -123,6 +129,9 @@ export interface MetricResult {
     summary: MetricSummary;
     resolvedSeries?: string;
     resolvedLabel?: string;
+    groupBy?: string;
+    groups?: MetricGroup[];
+    hint?: string;
 }
 export interface LogEntry {
     timestamp: string;

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "@thotischner/observability-mcp",
-  "version": "1.2.2",
+  "version": "1.3.1",
   "description": "Unified observability gateway for AI agents — one MCP server for Prometheus, Loki, and any backend",
   "type": "module",
   "license": "MIT",