npm - @thotischner/observability-mcp - Versions diffs - 3.2.1 → 3.3.0 - Mend

@thotischner/observability-mcp 3.2.1 → 3.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3) hide show

package/dist/conformance/mcp-2025-11-25.test.js +53 -0
package/dist/index.js +130 -15
package/package.json +1 -1

package/dist/conformance/mcp-2025-11-25.test.js CHANGED Viewed

@@ -375,3 +375,56 @@ test("E2E tools/call: every registered tool dispatches over MCP and returns a Ca
         assert.ok(Array.isArray(r.content), `tool ${name} must return content[]`);
     }
 });
+test("E2E tools/list: every builtin tool advertises ToolAnnotations (readOnlyHint)", opts, async () => {
+    // AX hardening: all 12 builtin tools are read-only; clients (e.g. Claude)
+    // use these hints for auto-approve decisions, so they must be advertised
+    // over the live transport — not just present in the registration source.
+    const session = await newSession();
+    const { response } = await jsonRpc("tools/list", {}, { id: 2, session });
+    const r = response.result; // no optional chaining here: a response without a result makes the next line throw a TypeError
+    const tools = r.tools ?? [];
+    assert.ok(tools.length >= 12, `expected >=12 tools, got ${tools.length}`); // NOTE(review): counts the unfiltered list (builtin + federated), not the builtin subset computed below — confirm intent
+    // Federated tools (namespaced `<prefix>.<tool>`) proxy upstream metadata and
+    // may legitimately lack annotations — only the builtin set is asserted.
+    const builtin = tools.filter((t) => t.name && !t.name.includes(".")); // builtin = has a name and that name contains no dot
+    for (const t of builtin) {
+        assert.equal(t.annotations?.readOnlyHint, true, `tool ${t.name} must advertise annotations.readOnlyHint=true`);
+        assert.ok(t.annotations?.title, `tool ${t.name} must advertise annotations.title`); // any truthy title passes; the exact text is not checked
+    }
+});
+test("E2E: builtin resource agent-usage-guide is listed and readable", opts, async () => {
+    // AX: the agent usage guide ships as an MCP resource so clients can pull
+    // it into context without a web fetch. Assert list + read over the wire.
+    const session = await newSession();
+    const list = await jsonRpc("resources/list", {}, { id: 10, session });
+    const resources = list.response.result?.resources ?? []; // a missing result or resources field is treated as an empty list
+    assert.ok(resources.some((r) => r.uri === "omcp://guide/agent-usage"), `agent-usage-guide resource must be listed, got ${JSON.stringify(resources.map((r) => r.uri))}`);
+    const read = await jsonRpc("resources/read", { uri: "omcp://guide/agent-usage" }, { id: 11, session }); // same session as the list call
+    const contents = read.response.result?.contents ?? [];
+    assert.ok((contents[0]?.text ?? "").includes("Triage recipe"), "guide text must round-trip"); // only the first contents entry is inspected, via a substring probe
+});
+test("E2E: builtin prompts triage-incident + write-postmortem are listed and resolvable", opts, async () => {
+    const session = await newSession();
+    const list = await jsonRpc("prompts/list", {}, { id: 12, session });
+    const prompts = list.response.result?.prompts ?? []; // a missing result or prompts field is treated as an empty list
+    for (const name of ["triage-incident", "write-postmortem"]) {
+        assert.ok(prompts.some((p) => p.name === name), `prompt ${name} must be listed`);
+    }
+    const got = await jsonRpc("prompts/get", { name: "triage-incident", arguments: { service: "ci-probe" } }, { id: 13, session }); // only triage-incident is resolved; write-postmortem is checked for listing only
+    const msgs = got.response.result?.messages ?? [];
+    assert.ok((msgs[0]?.content?.text ?? "").includes('"ci-probe"'), "prompt must interpolate the service arg"); // expects the service name wrapped in double quotes inside the first message
+});
+test("E2E: /llms.txt is served and reflects the canonical tool registry", opts, async () => {
+    // llms.txt convention: LLM-readable summary at the server root. Generated
+    // from registry-names.ts, so this also guards against registry drift.
+    const base = URL_ENV.replace(/\/mcp\/?$/, ""); // strip a trailing /mcp or /mcp/ so the request targets the server root
+    const res = await fetch(`${base}/llms.txt`);
+    assert.equal(res.status, 200);
+    assert.match(res.headers.get("content-type") ?? "", /text\/plain/); // an absent header is compared as an empty string and therefore fails the match
+    const text = await res.text();
+    assert.match(text, /^# observability-mcp/, "must start with the llms.txt H1");
+    for (const name of ["query_logs", "query_metrics", "enrich_ips", "get_blast_radius"]) {
+        assert.ok(text.includes(`- ${name} (`), `tool ${name} must be listed`); // probes the bullet prefix up to the opening parenthesis, for this sample of four tools
+    }
+    assert.ok(text.includes("for-agents"), "must link the for-agents guide");
+});

package/dist/index.js CHANGED Viewed

@@ -393,15 +393,95 @@ async function main() {
             }
             return mcpServer.prompt(name, ...rest);
         });
-        // Suppress unused-warn — kept for the moment registrations land.
-        void registerResource;
-        void registerPrompt;
+        // --- Builtin resources + prompts (agent experience) -------------------
+        // The usage guide is the distilled, agent-validated workflow from issue
+        // #415 — served as an MCP resource so a client can pull it into context
+        // without a web fetch. Prompts compose the existing read-only tools into
+        // the two flows agents run most.
+        registerResource("agent-usage-guide", "omcp://guide/agent-usage", {
+            description: "How to use this gateway effectively as an agent: the proven filter→aggregate→enrich triage recipe, signal-vs-silence behaviours, and the operator flags that unlock optional tools.",
+            mimeType: "text/markdown",
+        }, async (uri) => ({ // read callback: replies with a single static markdown document
+            contents: [
+                {
+                    uri: uri.toString(), // echoes back the URI that was passed to the callback
+                    mimeType: "text/markdown",
+                    text: [
+                        "# Agent usage guide (observability-mcp)",
+                        "",
+                        "All tools are read-only (`readOnlyHint: true`). The golden rule:",
+                        "**filter and aggregate server-side — ask for numbers, not haystacks.**",
+                        "",
+                        "## Triage recipe (agent-validated, issue #415)",
+                        '1. `query_logs` with `labels` (exact-match field filters, e.g. {"environment":"prod"})',
+                        '   and `aggregate` ({"op":"topk","by":["ip"],"k":10} or {"op":"count_over_time","step":"15m"})',
+                        "   — pushed down to LogQL, returns a handful of numbers instead of thousands of rows.",
+                        "2. `enrich_ips` with the IPs from step 1 — offline geo/ASN/hosting-flag lookup",
+                        "   (bot-vs-human signal). Requires OMCP_IP_ENRICH_FILE on the operator side.",
+                        '3. `query_metrics` with `labels` ({"route":"/checkout"}) and `groupBy` to scope a',
+                        "   curated metric to the slice you care about.",
+                        "",
+                        "## Incident flow",
+                        "`detect_anomalies` (fleet scan) → `get_service_health` (one-service verdict) →",
+                        "`get_blast_radius` (shared-host impact) → `generate_postmortem` (markdown report).",
+                        "",
+                        "## When something is empty or refused",
+                        "The gateway explains itself: no topology connector → explicit note; no trace",
+                        "backend → explicit error; `raw_query` disabled → message naming OMCP_RAW_QUERY=on;",
+                        "redacted values → a `_redacted` count in the result. Relay flag names to your",
+                        "operator verbatim — the messages are written to be forwarded.",
+                        "",
+                        "## Report findings",
+                        "Structured agent reports drive releases here (see issue #415). File one:",
+                        "https://github.com/ThoTischner/observability-mcp/issues/new?template=agent-report.yml",
+                        "Full guide: https://thotischner.github.io/observability-mcp/for-agents/",
+                    ].join("\n"), // the literal lines above are joined into one newline-separated string; nothing is interpolated
+                },
+            ],
+        }));
+        registerPrompt("triage-incident", "Guided incident triage for one service: health verdict, anomaly scan, blast radius, and the log slice that matters.", { service: z.string().describe("Service name as returned by list_services") }, ({ service }) => ({
+            messages: [
+                {
+                    role: "user",
+                    content: {
+                        type: "text",
+                        text: [
+                            `Triage the service "${service}" using the observability-mcp tools, in this order:`,
+                            `1. get_service_health {"service":"${service}"} — the current verdict and why.`,
+                            `2. detect_anomalies {"service":"${service}","duration":"1h"} — what is statistically off.`,
+                            `3. get_blast_radius {"resource":"${service}"} — who else fails if its host fails.`,
+                            `4. query_logs {"service":"${service}","level":"error","aggregate":{"op":"count_over_time","step":"5m"},"duration":"1h"} — error-volume shape over time; drill into raw rows only for the spike window.`,
+                            "Then summarise: current state, most likely cause, blast radius, and the next diagnostic step. Prefer aggregated queries over raw log dumps.",
+                        ].join("\n"),
+                    },
+                },
+            ],
+        }));
+        registerPrompt("write-postmortem", "Generate and refine a post-incident report for one service over a window.", {
+            service: z.string().describe("Service name as returned by list_services"),
+            duration: z.string().optional().describe("Look-back window, e.g. '1h', '6h'. Default '1h'."),
+        }, ({ service, duration }) => ({
+            messages: [
+                {
+                    role: "user",
+                    content: {
+                        type: "text",
+                        text: [
+                            `Produce a post-mortem for "${service}" over the last ${duration || "1h"}:`,
+                            `1. generate_postmortem {"service":"${service}","duration":"${duration || "1h"}"} — the stitched report (anomaly timeline, blast radius, traces, log highlights).`,
+                            `2. Verify its claims: get_anomaly_history {"service":"${service}","duration":"${duration || "1h"}"} for the score timeline, and query_logs with an aggregate for the error shape.`,
+                            "3. Rewrite the result as a blameless post-mortem: summary, impact, timeline, root-cause hypothesis (with confidence), follow-ups. Mark any section the gateway reported as missing data instead of inventing content.",
+                        ].join("\n"),
+                    },
+                },
+            ],
+        }));
         registerTool("list_sources", [
             "List the configured observability backends (Prometheus, Loki, and any connector) and whether each is currently reachable.",
             "When to use: call this first to learn which source names exist and are healthy before passing `source` to other tools, or to debug why a query returns no data.",
             "Behavior: read-only, no side effects. Returns one entry per source with its name, type, signal types (metrics/logs), and a live up/down status (the backend URL is intentionally not exposed — it may carry embedded credentials). Never throws for an unreachable backend — the backend is reported as down instead.",
             "Related: use `list_services` to see what is monitored within these sources.",
-        ].join(" "), {}, async () => {
+        ].join(" "), {}, { title: "List Sources", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: false }, async () => {
             await enforceEntitledAccess(ctx, { tool: "list_sources" });
             return withToolMetrics("list_sources", () => listSourcesHandler(registry, ctx));
         });
@@ -415,7 +495,7 @@ async function main() {
                 .string()
                 .optional()
                 .describe("Optional case-insensitive substring to narrow the result to matching service names (e.g. 'payment'). Omit to list every discovered service."),
-        }, async (args) => {
+        }, { title: "List Services", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: false }, async (args) => {
             await enforceEntitledAccess(ctx, { tool: "list_services" });
             const result = await withToolMetrics("list_services", () => listServicesHandler(registry, args, ctx));
             return enrichToolServicesText(result, ctx);
@@ -458,7 +538,7 @@ async function main() {
                 .string()
                 .optional()
                 .describe("Optional escape hatch: a verbatim PromQL expression, run as-is over the range — for ad-hoc queries the curated `metric` catalog can't express (any series, any function, broken down by any label). When set, `metric`/`service`/`groupBy`/`labels` are ignored. DISABLED by default; the operator must enable the raw-query capability (OMCP_RAW_QUERY=on) or the call is refused. Still tenant-scoped and source-allow-listed."),
-        }, async (args) => {
+        }, { title: "Query Metrics", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: false }, async (args) => {
             await enforceEntitledAccess(ctx, { tool: "query_metrics", source: args?.source, service: args?.service });
             const result = await withToolMetrics("query_metrics", () => queryMetricsHandler(registry, args, ctx, { allowRawQuery: RAW_QUERY_ENABLED }));
             return chargeTokenBudget(result, ctx, "query_metrics");
@@ -525,7 +605,7 @@ async function main() {
                 .string()
                 .optional()
                 .describe("Optional escape hatch: a verbatim LogQL log query, run as-is — for selectors/pipelines the curated params can't express. When set, `service`/`labels`/`level`/`query` are ignored and it is mutually exclusive with `aggregate` (express aggregation in the LogQL itself). DISABLED by default; the operator must enable the raw-query capability (OMCP_RAW_QUERY=on) or the call is refused. Redaction still applies to the returned log lines."),
-        }, async (args) => {
+        }, { title: "Query Logs", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: false }, async (args) => {
             await enforceEntitledAccess(ctx, { tool: "query_logs", source: args?.source, service: args?.service });
             const result = await withToolMetrics("query_logs", () => queryLogsHandler(registry, args, ctx, { allowRawQuery: RAW_QUERY_ENABLED }));
             // Redact PII / secrets from the log payload before it crosses the
@@ -565,7 +645,7 @@ async function main() {
             service: z.string().describe("Service name to filter on."),
             duration: z.string().optional().describe("Rolling window, e.g. '1h', '24h'. Default '1h'."),
             method: z.string().optional().describe("Filter by detector method ('mad' / 'seasonality' / 'correlator'). Optional."),
-        }, async (args) => {
+        }, { title: "Anomaly History", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: false }, async (args) => {
             await enforceEntitledAccess(ctx, { tool: "get_anomaly_history", service: args?.service });
             const result = await withToolMetrics("get_anomaly_history", () => getAnomalyHistoryHandler(registry, args, ctx));
             return chargeTokenBudget(result, ctx, "get_anomaly_history");
@@ -580,7 +660,7 @@ async function main() {
             service: z.string().describe("Suspected root-cause service."),
             duration: z.string().optional().describe("Window length, e.g. '1h', '6h'. Default '1h'."),
             format: z.enum(["markdown", "json"]).optional().describe("'markdown' (default) or 'json'."),
-        }, async (args) => {
+        }, { title: "Generate Postmortem", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: false }, async (args) => {
             await enforceEntitledAccess(ctx, { tool: "generate_postmortem", service: args?.service });
             const result = await withToolMetrics("generate_postmortem", () => generatePostmortemHandler(registry, args, ctx));
             return chargeTokenBudget(result, ctx, "generate_postmortem");
@@ -597,7 +677,7 @@ async function main() {
             filter: z.string().optional().describe("Backend-native filter (TraceQL on Tempo, tag query on Jaeger). Optional."),
             limit: z.number().int().positive().optional().describe("Soft cap on returned trace summaries. Default 50."),
             errorsOnly: z.boolean().optional().describe("If true, only traces with at least one error span."),
-        }, async (args) => {
+        }, { title: "Query Traces", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: false }, async (args) => {
             await enforceEntitledAccess(ctx, { tool: "query_traces", service: args?.service });
             const result = await withToolMetrics("query_traces", () => queryTracesHandler(registry, args, ctx));
             return chargeTokenBudget(result, ctx, "query_traces");
@@ -611,7 +691,7 @@ async function main() {
             service: z
                 .string()
                 .describe("Required. Exact, case-sensitive service name exactly as returned by `list_services` (e.g. 'payment-service')."),
-        }, async (args) => {
+        }, { title: "Service Health", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: false }, async (args) => {
             await enforceEntitledAccess(ctx, { tool: "get_service_health", service: args?.service });
             const result = await withToolMetrics("get_service_health", () => getServiceHealthHandler(registry, args, ctx));
             const enriched = enrichToolHealthText(result, String(args?.service ?? ""), ctx);
@@ -635,7 +715,7 @@ async function main() {
                 .enum(["low", "medium", "high"])
                 .optional()
                 .describe("Optional. Detection threshold: 'low' flags only strong deviations (>3σ), 'medium' is balanced (>2σ), 'high' is most sensitive and noisier (>1.5σ). Default: 'medium'."),
-        }, async (args) => {
+        }, { title: "Detect Anomalies", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: false }, async (args) => {
             await enforceEntitledAccess(ctx, { tool: "detect_anomalies", source: args?.source, service: args?.service });
             // P1: pass the anomaly-history sink so detected scores flow
             // into the TSDB and `get_anomaly_history` returns real data.
@@ -666,7 +746,7 @@ async function main() {
                 .max(5000)
                 .optional()
                 .describe("Optional. Maximum resources to return; edges are trimmed to the kept set. Default 500, max 5000."),
-        }, async (args) => {
+        }, { title: "Topology Graph", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: false }, async (args) => {
             await enforceEntitledAccess(ctx, { tool: "get_topology", source: args?.source });
             return withToolMetrics("get_topology", () => getTopologyHandler(registry, args, ctx));
         });
@@ -679,7 +759,7 @@ async function main() {
             resource: z
                 .string()
                 .describe("Required. Resource to evaluate. Accepts the canonical id (e.g. 'k8s:pod:default/checkout-7f89d'), the exact resource name (e.g. 'checkout-7f89d'), or a unique substring of either."),
-        }, async (args) => {
+        }, { title: "Blast Radius", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: false }, async (args) => {
             await enforceEntitledAccess(ctx, { tool: "get_blast_radius" });
             return withToolMetrics("get_blast_radius", () => getBlastRadiusHandler(registry, args, ctx));
         });
@@ -692,7 +772,7 @@ async function main() {
             ips: z
                 .array(z.string())
                 .describe("Required. IPv4 address strings to enrich (e.g. ['203.0.113.5','198.51.100.9']). Max 1000 per call; invalid entries are returned with found=false rather than failing the batch."),
-        }, async (args) => {
+        }, { title: "Enrich IPs", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: false }, async (args) => {
             await enforceEntitledAccess(ctx, { tool: "enrich_ips" });
             return withToolMetrics("enrich_ips", async () => enrichIpsHandler(ipEnrichment, args, ctx));
         });
@@ -1195,6 +1275,41 @@ async function main() {
     // enough to skip the request-counter middleware.
     let ready = false;
     app.get("/healthz", (_req, res) => res.type("text").send("ok"));
+    // /llms.txt — the llms.txt convention (llmstxt.org): a plain-text,
+    // LLM-friendly summary of what this server is and how to use it. The
+    // primary audience of this gateway IS an LLM agent, so the gateway
+    // serves its own. Tool list is generated from the canonical registry
+    // (registry-names.ts) so it can't drift from the real surface.
+    const LLMS_TXT = [ // evaluated once, when this statement runs; the route below sends the same string on every request
+        "# observability-mcp",
+        "",
+        `> Unified observability gateway for AI agents (v${SERVER_VERSION}). One MCP server`,
+        "> for Prometheus, Loki, and any backend via pluggable connectors — with",
+        "> server-side filtering/aggregation so agents get numbers, not haystacks.",
+        "",
+        "MCP endpoint: POST /mcp (Streamable HTTP) · also stdio (--stdio) and WebSocket (/mcp/ws).",
+        "All tools are read-only and advertise MCP ToolAnnotations (readOnlyHint: true).",
+        "MCP resource omcp://guide/agent-usage carries the agent usage guide;",
+        "prompts triage-incident and write-postmortem compose the tools into workflows.",
+        "",
+        "## Tools",
+        "",
+        ...REGISTERED_TOOLS.map((t) => `- ${t.name} (${t.category}): ${t.summary}`), // one bullet per REGISTERED_TOOLS entry: name, category, summary
+        "",
+        "## Connect",
+        "",
+        "    claude mcp add observability --transport http http://localhost:3000/mcp", // NOTE(review): host and port are hard-coded in this example — confirm they match the address the server actually listens on
+        "",
+        "## Docs",
+        "",
+        "- For agents (start here): https://thotischner.github.io/observability-mcp/for-agents/",
+        "- Documentation site: https://thotischner.github.io/observability-mcp/",
+        "- Report a finding (agent-report template): https://github.com/ThoTischner/observability-mcp/issues/new?template=agent-report.yml",
+        "- Discussions (agent collaboration welcome): https://github.com/ThoTischner/observability-mcp/discussions",
+        "- Source: https://github.com/ThoTischner/observability-mcp",
+        "",
+    ].join("\n");
+    app.get("/llms.txt", (_req, res) => res.type("text/plain; charset=utf-8").send(LLMS_TXT)); // the request object is unused; the response is the prebuilt text with an explicit UTF-8 plain-text content type
     // Procurement-time probe: the MCP spec revisions and transports the
     // gateway supports. Static today — kept as a separate endpoint so a
     // discovery tool / RFP probe / catalog scanner can resolve our

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@thotischner/observability-mcp",
-  "version": "3.2.1",
+  "version": "3.3.0",
   "description": "Unified observability gateway for AI agents — one MCP server for Prometheus, Loki, and any backend",
   "type": "module",
   "license": "Apache-2.0",