npm - @thotischner/observability-mcp - Versions diffs - 3.0.0 → 3.0.1 - Mend

@thotischner/observability-mcp 3.0.0 → 3.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (53) hide show

package/dist/audit/sinks/s3.d.ts +61 -0
package/dist/audit/sinks/s3.js +179 -0
package/dist/audit/sinks/s3.test.d.ts +1 -0
package/dist/audit/sinks/s3.test.js +175 -0
package/dist/auth/policy/batch-dry-run.js +15 -0
package/dist/connectors/loader.d.ts +8 -0
package/dist/connectors/loader.js +49 -0
package/dist/connectors/manifest-hooks.test.d.ts +1 -0
package/dist/connectors/manifest-hooks.test.js +206 -0
package/dist/federation/registry.d.ts +27 -5
package/dist/federation/registry.js +49 -4
package/dist/federation/registry.test.js +79 -3
package/dist/federation/upstream.d.ts +32 -6
package/dist/federation/upstream.js +60 -12
package/dist/federation/upstream.test.d.ts +1 -0
package/dist/federation/upstream.test.js +118 -0
package/dist/index.js +306 -65
package/dist/metrics/self.d.ts +1 -0
package/dist/metrics/self.js +8 -0
package/dist/policy/redact.js +1 -1
package/dist/postmortem/store.d.ts +34 -0
package/dist/postmortem/store.js +113 -0
package/dist/postmortem/store.test.d.ts +1 -0
package/dist/postmortem/store.test.js +118 -0
package/dist/scim/compliance.test.d.ts +1 -0
package/dist/scim/compliance.test.js +169 -0
package/dist/scim/factory.test.d.ts +1 -0
package/dist/scim/factory.test.js +54 -0
package/dist/scim/patch-ops.test.d.ts +1 -0
package/dist/scim/patch-ops.test.js +100 -0
package/dist/scim/redis-store.d.ts +38 -0
package/dist/scim/redis-store.js +178 -0
package/dist/scim/redis-store.test.d.ts +1 -0
package/dist/scim/redis-store.test.js +138 -0
package/dist/scim/routes.d.ts +27 -2
package/dist/scim/routes.js +161 -15
package/dist/scim/store.d.ts +40 -1
package/dist/scim/store.js +23 -5
package/dist/sdk/hook-wrappers.d.ts +39 -0
package/dist/sdk/hook-wrappers.js +113 -0
package/dist/sdk/hook-wrappers.test.d.ts +1 -0
package/dist/sdk/hook-wrappers.test.js +204 -0
package/dist/sdk/index.d.ts +13 -0
package/dist/tools/detect-anomalies.d.ts +12 -1
package/dist/tools/detect-anomalies.js +22 -2
package/dist/tools/topology.js +23 -5
package/dist/tools/topology.test.js +45 -0
package/dist/transport/transportSessionMap.d.ts +70 -0
package/dist/transport/transportSessionMap.js +128 -0
package/dist/transport/transportSessionMap.test.d.ts +1 -0
package/dist/transport/transportSessionMap.test.js +111 -0
package/dist/ui/index.html +856 -101
package/package.json +1 -1

package/dist/federation/upstream.test.js ADDED Viewed

@@ -0,0 +1,118 @@
+import { test } from "node:test";
+import assert from "node:assert/strict";
+import { UpstreamClient } from "./upstream.js";
+test("UpstreamClient: HTTP config — transportKind='http', url surfaced", () => {
+    const cfg = {
+        name: "remote",
+        url: "https://gw.example.com/mcp",
+        bearerToken: "t0k",
+    };
+    const c = new UpstreamClient(cfg);
+    assert.equal(c.transportKind, "http");
+    assert.equal(c.url, "https://gw.example.com/mcp");
+    assert.equal(c.namespacePrefix, "remote");
+    assert.deepEqual(c.getTools(), []);
+});
+test("UpstreamClient: stdio config — transportKind='stdio', url shows command", () => {
+    const cfg = {
+        transport: "stdio",
+        name: "local-mcp",
+        command: "/usr/local/bin/mcp",
+        args: ["--config", "/etc/mcp.yaml"],
+    };
+    const c = new UpstreamClient(cfg);
+    assert.equal(c.transportKind, "stdio");
+    assert.equal(c.url, "stdio:/usr/local/bin/mcp");
+    assert.equal(c.namespacePrefix, "local-mcp");
+});
+test("UpstreamClient: stdio config respects custom namespacePrefix", () => {
+    const cfg = {
+        transport: "stdio",
+        name: "weather",
+        command: "weather-mcp",
+        namespacePrefix: "weather.local",
+    };
+    const c = new UpstreamClient(cfg);
+    assert.equal(c.namespacePrefix, "weather.local");
+});
+test("UpstreamClient: explicit transport='http' is also accepted", () => {
+    const cfg = {
+        transport: "http",
+        name: "gw",
+        url: "https://gw.example.com/mcp",
+    };
+    const c = new UpstreamClient(cfg);
+    assert.equal(c.transportKind, "http");
+});
+test("UpstreamClient: ws transport surfaces the ws:// URL", () => {
+    const cfg = {
+        transport: "ws",
+        name: "gw",
+        url: "wss://gw.example.com/mcp/ws",
+    };
+    const c = new UpstreamClient(cfg);
+    assert.equal(c.transportKind, "ws");
+    assert.equal(c.url, "wss://gw.example.com/mcp/ws");
+});
+test("UpstreamClient: empty args defaults to [] on stdio", () => {
+    const cfg = {
+        transport: "stdio",
+        name: "x",
+        command: "x",
+    };
+    const c = new UpstreamClient(cfg);
+    // Just verifies construction doesn't throw on a minimal stdio config.
+    assert.equal(c.transportKind, "stdio");
+});
+test("UpstreamClient: getStatus initial state", () => {
+    const c = new UpstreamClient({ name: "x", url: "https://x/mcp" });
+    const s = c.getStatus();
+    assert.equal(s.status, "disconnected");
+    assert.equal(s.toolCount, 0);
+    assert.equal(s.lastError, undefined);
+});
+test("UpstreamClient: connect uses injected _transport instead of spawning / fetching", async () => {
+    // Minimal MCP Transport stub that answers the two requests it recognises:
+    // "initialize" and "tools/list". Each reply is delivered through onmessage
+    // on a microtask, echoing the request id, so the pending promises resolve
+    // quickly (no 60s SDK timeout). Other messages are counted but not answered.
+    let started = false;
+    let sentMessages = 0;
+    const fakeTransport = {
+        start: async () => { started = true; },
+        send: async (msg) => {
+            sentMessages += 1;
+            if (msg?.method === "initialize" && msg?.id !== undefined) {
+                queueMicrotask(() => {
+                    fakeTransport.onmessage?.({
+                        jsonrpc: "2.0",
+                        id: msg.id,
+                        result: { protocolVersion: "2024-11-05", capabilities: {}, serverInfo: { name: "fake", version: "1" } },
+                    });
+                });
+            }
+            else if (msg?.method === "tools/list" && msg?.id !== undefined) {
+                queueMicrotask(() => {
+                    fakeTransport.onmessage?.({ jsonrpc: "2.0", id: msg.id, result: { tools: [] } });
+                });
+            }
+        },
+        close: async () => { },
+        onclose: undefined,
+        onerror: undefined,
+        onmessage: undefined,
+    };
+    const c = new UpstreamClient({
+        name: "injected",
+        url: "https://ignored.example/mcp",
+        refreshIntervalMs: 0,
+        _transport: fakeTransport,
+    });
+    await c.connect();
+    await c.close();
+    assert.equal(started, true, "fake transport.start() should have been called");
+    assert.ok(sentMessages >= 1, "fake transport.send() should have received initialize");
+    // NOTE(review): status is never asserted here. The original note says it
+    // reaches "ready" only when initialize + tools/list both succeed and stays
+    // "degraded" on connect-time errors — consider asserting getStatus() before close().
+});

package/dist/index.js CHANGED Viewed

@@ -19,7 +19,7 @@ import { buildSessionAttacher, buildRequireSession, } from "./auth/middleware.js
 import { buildRequirePermissionFromEngine, hasPermission, listGrantedPermissions, DEFAULT_POLICY, } from "./auth/rbac.js";
 import { resolveOidcConfig, buildOidcRuntime } from "./auth/oidc/runtime.js";
 import { registerOidcRoutes } from "./auth/oidc/endpoints.js";
-import { ScimStore } from "./scim/store.js";
+import { createScimStore } from "./scim/store.js";
 import { registerScimRoutes } from "./scim/routes.js";
 import { BuiltinPolicyEngine } from "./auth/policy/engine.js";
 import { loadPolicyFromFile, writePolicyFile, PolicyLoadError, VALID_RESOURCES, VALID_ACTIONS } from "./auth/policy/loader.js";
@@ -40,10 +40,11 @@ import { getPluginLoader } from "./connectors/loader.js";
 import { resolveHubCatalogUrl, describeInstalled, mergeCatalog, fetchHubCatalog, } from "./connectors/hub.js";
 import { isValidConnectorName, installTarball } from "./connectors/install.js";
 import { PluginVerificationError } from "./connectors/verify.js";
-import { selfRegistry, withToolMetrics, apiRequests, mcpActiveSessions } from "./metrics/self.js";
+import { selfRegistry, withToolMetrics, apiRequests, mcpActiveSessions, auditDlqDepth } from "./metrics/self.js";
 import { initOtel } from "./observability/otel.js";
 import { WebSocketServerTransport } from "./transport/websocket.js";
 import { HookRegistry } from "./sdk/hooks.js";
+import { wrapToolHandler, wrapResourceHandler, wrapPromptHandler } from "./sdk/hook-wrappers.js";
 import { UpstreamClient } from "./federation/upstream.js";
 import { FederationRegistry, parseFederationEnv } from "./federation/registry.js";
 import { buildCsrfIssuer, buildCsrfEnforcer, csrfBypassFromEnv } from "./auth/csrf.js";
@@ -56,6 +57,7 @@ import { queryLogsHandler } from "./tools/query-logs.js";
 import { queryTracesHandler } from "./tools/query-traces.js";
 import { getAnomalyHistoryHandler } from "./tools/get-anomaly-history.js";
 import { generatePostmortemHandler } from "./tools/generate-postmortem.js";
+import { PostmortemStore } from "./postmortem/store.js";
 import { AnomalyHistory, fromEnv as anomalyHistoryFromEnv } from "./analysis/history.js";
 import { getServiceHealthHandler, setHealthThresholds } from "./tools/get-service-health.js";
 import { detectAnomaliesHandler } from "./tools/detect-anomalies.js";
@@ -295,11 +297,20 @@ async function main() {
             return result;
         }
     }
+    /**
+     * Returns the McpServer for the given context. The companion
+     * `toolHandlers` map carries every tool registered for this ctx
+     * (post-hook-wrapping) so the in-product Playground UI (Q13) can
+     * invoke a tool without going through the full Streamable HTTP
+     * transport stack. The map is keyed by tool name; values run the
+     * same wrapped handler the McpServer would dispatch over MCP.
+     */
     function createMcpServer(ctx) {
         const mcpServer = new McpServer({
             name: "observability-mcp",
             version: SERVER_VERSION,
         });
+        const toolHandlers = new Map();
         // --- Register tools with Zod schemas ---
         // Product-aware registration: when the active credential is bound
         // to a Product (OMCP_KEY_PRODUCTS), `ctx.allowedTools` carries that
@@ -319,34 +330,39 @@ async function main() {
                 return undefined;
             if (rest.length > 0 && typeof rest[rest.length - 1] === "function") {
                 const originalHandler = rest[rest.length - 1];
-                const wrappedHandler = async (args, extra) => {
-                    const hookCtxBase = {
-                        principal: ctx.principalId,
-                        tenant: ctx.tenant || "default",
-                        target: name,
-                    };
-                    const pre = await hookRegistry.fire("tool_pre_invoke", { ...hookCtxBase, kind: "tool_pre_invoke" }, { args });
-                    if (!pre.allow) {
-                        return {
-                            content: [{ type: "text", text: pre.reason ?? "denied by plugin hook" }],
-                            isError: true,
-                        };
-                    }
-                    const effectiveArgs = pre.payload?.args ?? args;
-                    const result = await originalHandler(effectiveArgs, extra);
-                    const post = await hookRegistry.fire("tool_post_invoke", { ...hookCtxBase, kind: "tool_post_invoke" }, { args: effectiveArgs, result });
-                    if (!post.allow) {
-                        return {
-                            content: [{ type: "text", text: post.reason ?? "denied by plugin hook" }],
-                            isError: true,
-                        };
-                    }
-                    return post.payload?.result ?? result;
-                };
+                const wrappedHandler = wrapToolHandler(hookRegistry, { principal: ctx.principalId, tenant: ctx.tenant || "default", target: name }, originalHandler);
                 rest[rest.length - 1] = wrappedHandler;
+                // Stash for the Playground endpoint — keyed by tool name. The
+                // wrapped handler honours pre/post hooks + the same RBAC the
+                // McpServer dispatch path runs. Per-ctx Map so a different
+                // user's allowedTools never leak.
+                toolHandlers.set(name, wrappedHandler);
             }
             return mcpServer.tool(name, ...rest);
         });
+        // Q12: resource + prompt registrations get the same hook-fan-out
+        // treatment so a plugin's resource_pre_fetch / resource_post_fetch /
+        // prompt_pre_fetch / prompt_post_fetch handlers actually fire when
+        // a future resource/prompt registration lands. The wrappers stay
+        // thin pass-throughs when no hooks are registered (the OSS default).
+        // Wrappers are tested in mcp-server/src/sdk/hook-wrappers.test.ts.
+        const registerResource = ((name, ...rest) => {
+            if (rest.length > 0 && typeof rest[rest.length - 1] === "function") {
+                const originalHandler = rest[rest.length - 1];
+                rest[rest.length - 1] = wrapResourceHandler(hookRegistry, { principal: ctx.principalId, tenant: ctx.tenant || "default", target: name }, originalHandler);
+            }
+            return mcpServer.resource(name, ...rest);
+        });
+        const registerPrompt = ((name, ...rest) => {
+            if (rest.length > 0 && typeof rest[rest.length - 1] === "function") {
+                const originalHandler = rest[rest.length - 1];
+                rest[rest.length - 1] = wrapPromptHandler(hookRegistry, { principal: ctx.principalId, tenant: ctx.tenant || "default", target: name }, originalHandler);
+            }
+            return mcpServer.prompt(name, ...rest);
+        });
+        // Suppress unused-warn — kept for the moment registrations land.
+        void registerResource;
+        void registerPrompt;
         registerTool("list_sources", [
             "List the configured observability backends (Prometheus, Loki, and any connector) and whether each is currently reachable.",
             "When to use: call this first to learn which source names exist and are healthy before passing `source` to other tools, or to debug why a query returns no data.",
@@ -547,7 +563,9 @@ async function main() {
                 .describe("Optional. Detection threshold: 'low' flags only strong deviations (>3σ), 'medium' is balanced (>2σ), 'high' is most sensitive and noisier (>1.5σ). Default: 'medium'."),
         }, async (args) => {
             await enforceEntitledAccess(ctx, { tool: "detect_anomalies", source: args?.source, service: args?.service });
-            return withToolMetrics("detect_anomalies", () => detectAnomaliesHandler(registry, args, ctx));
+            // P1: pass the anomaly-history sink so detected scores flow
+            // into the TSDB and `get_anomaly_history` returns real data.
+            return withToolMetrics("detect_anomalies", () => detectAnomaliesHandler(registry, args, ctx, anomalyHistory));
         });
         registerTool("get_topology", [
             "Return the infrastructure topology graph (Resources and Edges) from every topology-capable connector.",
@@ -599,16 +617,33 @@ async function main() {
         // Product-allow-list gate, so federated tools obey the same policy
         // surface as native ones.
         for (const info of federationRegistry.getNamespacedTools()) {
-            // Upstream's inputSchema is forwarded verbatim. The SDK's
-            // tool() overload signatures don't carry an obvious type for a
-            // dynamic-shape schema, so we cast to `any` at the boundary and
-            // let the upstream contract speak for the validation.
-            registerTool(info.namespacedName, info.description || `Federated from upstream ${info.sourceName}.`, info.inputSchema ?? {}, async (args) => {
+            // The MCP SDK's tool() signature wants a ZodRawShape (a map of
+            // field-name → Zod type), not a raw JSON Schema. Federated
+            // upstreams expose JSON Schema (the wire-format MCP uses on
+            // tools/list); we transcode to a permissive Zod shape so the
+            // SDK accepts the registration. Per-field types are `z.unknown()`
+            // because the upstream will validate the call args anyway; the
+            // local Zod check is only a "this is the field name set" gate.
+            // P7: this transcoding fixes the registration crash that broke
+            // every federation deploy before the E2E test caught it.
+            const upstreamProps = info.inputSchema?.properties ?? {};
+            // Every field is z.unknown().optional() — the SDK only uses this
+            // shape to know the field-name set; the upstream re-validates
+            // against its full JSON Schema (incl. its own `required` list)
+            // when the call arrives. Marking all fields optional here keeps
+            // calls with the upstream-defaults flowing through; without it
+            // the SDK rejects any call that omits a field upstream considers
+            // required even if the upstream would accept the omission.
+            const localShape = {};
+            for (const k of Object.keys(upstreamProps)) {
+                localShape[k] = z.unknown().optional();
+            }
+            registerTool(info.namespacedName, info.description || `Federated from upstream ${info.sourceName}.`, localShape, async (args) => {
                 await enforceEntitledAccess(ctx, { tool: info.namespacedName });
                 return withToolMetrics(info.namespacedName, () => federationRegistry.callNamespacedTool(info.namespacedName, args));
             });
         }
-        return mcpServer;
+        return { mcpServer, toolHandlers };
     }
     // --- Management-plane auth (basic mode) -----------------------------------
     // Off by default. Enable with `OMCP_AUTH=basic` + `OMCP_USERS_FILE` and
@@ -717,7 +752,12 @@ async function main() {
             app.set("trust proxy", trustProxy);
         }
     }
-    app.use(express.json({ limit: "1mb" }));
+    // Parse application/json AND any *+json media type. SCIM clients
+    // (Entra, Okta) send `application/scim+json` per RFC 7644 §3.1 —
+    // without the wildcard the body silently arrives empty and every
+    // SCIM POST/PATCH 400s. The wildcard also future-proofs other
+    // structured-suffix JSON content types.
+    app.use(express.json({ limit: "1mb", type: ["application/json", "application/*+json"] }));
     // Security headers
     app.use((req, res, next) => {
         res.setHeader("X-Content-Type-Options", "nosniff");
@@ -938,11 +978,11 @@ async function main() {
     // (no tools) so the gateway boots regardless of upstream health.
     const federationRegistry = new FederationRegistry();
     for (const cfg of parseFederationEnv()) {
-        const client = new UpstreamClient({
-            name: cfg.name,
-            url: cfg.url,
-            bearerToken: cfg.bearerToken,
-        });
+        const client = new UpstreamClient(cfg.kind === "stdio"
+            ? { transport: "stdio", name: cfg.name, command: cfg.command, args: cfg.args }
+            : cfg.kind === "ws"
+                ? { transport: "ws", name: cfg.name, url: cfg.url }
+                : { name: cfg.name, url: cfg.url, bearerToken: cfg.bearerToken });
         federationRegistry.add(client);
         client.connect().catch((err) => {
             console.warn("federation upstream %s initial connect failed: %s", cfg.name, err instanceof Error ? err.message : String(err));
@@ -1044,6 +1084,24 @@ async function main() {
     // this endpoint when enabled.
     if (process.env.METRICS_ENABLED !== "false") {
         app.get("/metrics", async (_req, res) => {
+            // P9: refresh the audit-webhook DLQ depth before the scrape so
+            // Prometheus sees the current file state rather than whatever
+            // /api/audit/dlq last set. Best-effort; ENOENT or missing-env
+            // resets to 0 (the dlqPath being unset is the normal state).
+            try {
+                const dlqPath = process.env.OMCP_AUDIT_WEBHOOK_DLQ;
+                if (dlqPath) {
+                    const fs = await import("node:fs/promises");
+                    const raw = await fs.readFile(dlqPath, "utf8").catch(() => "");
+                    auditDlqDepth.set(raw.split("\n").filter((l) => l.trim()).length);
+                }
+                else {
+                    auditDlqDepth.set(0);
+                }
+            }
+            catch {
+                auditDlqDepth.set(0);
+            }
             res.set("Content-Type", selfRegistry.contentType);
             res.end(await selfRegistry.metrics());
         });
@@ -1108,6 +1166,37 @@ async function main() {
     app.get("/api/tools/registry", (_req, res) => {
         res.json({ tools: REGISTERED_TOOLS });
     });
+    // Q13: in-product Playground endpoint. Lets the operator invoke a
+    // registered tool against the live gateway without spinning up a
+    // separate MCP client. Re-uses the per-session ctx and the same
+    // wrapped handler the McpServer dispatch path would run (so RBAC,
+    // entitlements, rate-limit, audit, hook fan-out all apply
+    // identically).
+    app.post("/api/playground/invoke", async (req, res) => {
+        const ctx = await gateCtx(req, res);
+        if (!ctx)
+            return;
+        const body = (req.body ?? {});
+        const tool = typeof body.tool === "string" ? body.tool : "";
+        if (!tool) {
+            res.status(400).json({ error: "tool (string) is required" });
+            return;
+        }
+        const { toolHandlers } = createMcpServer(ctx);
+        const handler = toolHandlers.get(tool);
+        if (!handler) {
+            res.status(404).json({ error: `tool '${tool}' is not registered (or not allowed for this credential)` });
+            return;
+        }
+        try {
+            const result = await handler(body.args ?? {}, undefined);
+            res.json({ tool, result });
+        }
+        catch (err) {
+            const message = err instanceof Error ? err.message : String(err);
+            res.status(500).json({ error: message, tool });
+        }
+    });
     // Server info — version, loaded plugins, MCP protocol version, build metadata.
     // Used by the Web UI footer and by operators to confirm what's deployed.
     app.get("/api/info", async (_req, res) => {
@@ -1142,6 +1231,16 @@ async function main() {
                 redaction: REDACTION_ENABLED,
                 trustProxy: !!(process.env.OMCP_TRUST_PROXY && process.env.OMCP_TRUST_PROXY !== "false"),
                 toolRatePerMin: resolveToolRatePerMin(process.env.OMCP_TOOL_RATE_PER_MIN),
+                // P1: posture flags so dashboards can alert when a shipped
+                // capability is configured but doing nothing useful.
+                anomalyHistoryActive: anomalyHistory.isEnabled(),
+                tracesCapabilityCount: registry
+                    .getAll()
+                    .filter((c) => typeof c.queryTraces === "function").length,
+                pluginsVerified: !/^(0|false|no|off)$/i.test(process.env.VERIFY_PLUGINS ?? "true"),
+                scimEnabled: !!process.env.OMCP_SCIM_TOKEN,
+                federationUpstreams: (process.env.OMCP_FEDERATION_UPSTREAMS ?? "")
+                    .split(",").map((s) => s.trim()).filter(Boolean).length,
             },
             plugins: loader.list().map((p) => ({
                 name: p.name,
@@ -1566,6 +1665,46 @@ async function main() {
             scopedTo: tenantFilter || (isAdmin ? null : callerTenant),
         });
     });
+    // --- /api/audit/dlq — webhook-sink dead-letter queue surface (P9) ---
+    // When the audit webhook is configured AND the receiver exhausted
+    // its retry budget, entries land in the DLQ file. This endpoint
+    // surfaces the count + the last N entries so operators can decide
+    // whether to replay manually. Also refreshes the
+    // `obsmcp_audit_webhook_dlq_depth` gauge so the /metrics scrape
+    // alongside it stays accurate.
+    app.get("/api/audit/dlq", need("audit", "read"), async (_req, res) => {
+        const dlqPath = process.env.OMCP_AUDIT_WEBHOOK_DLQ;
+        if (!dlqPath) {
+            auditDlqDepth.set(0);
+            res.json({ enabled: false, path: null, depth: 0, entries: [] });
+            return;
+        }
+        try {
+            const fs = await import("node:fs/promises");
+            const raw = await fs.readFile(dlqPath, "utf8");
+            const lines = raw.split("\n").filter((l) => l.trim());
+            auditDlqDepth.set(lines.length);
+            const tail = lines.slice(-50).map((l) => {
+                try {
+                    return JSON.parse(l);
+                }
+                catch {
+                    return { _raw: l, _parseError: true };
+                }
+            });
+            res.json({ enabled: true, path: dlqPath, depth: lines.length, entries: tail });
+        }
+        catch (err) {
+            const code = err.code;
+            if (code === "ENOENT") {
+                auditDlqDepth.set(0);
+                res.json({ enabled: true, path: dlqPath, depth: 0, entries: [] });
+                return;
+            }
+            console.warn("[/api/audit/dlq] read failed:", err);
+            res.status(500).json({ error: err?.message || "DLQ read failed" });
+        }
+    });
     // --- /api/usage — per-identity MCP rate-limit snapshot -----------------
     // Read-only view of the IdentityRateLimiter's bucket state. Gated by
     // need("audit","read") — the same role set that already sees the
@@ -1716,31 +1855,133 @@ async function main() {
         registerOidcRoutes(app, { sessionCfg, oidc: oidcRuntime });
         console.log("[auth] OIDC endpoints registered: /api/auth/oidc/{login,callback,logout}");
     }
-    // Phase F21: SCIM 2.0 — opt-in. OMCP_SCIM_TOKEN gates access;
-    // OMCP_SCIM_STORE points at the on-disk JSON (mode 0600, atomic).
-    // Multi-replica deployments should plug the F8 SessionStore in
-    // when F21b lands.
+    // Phase F21 / Q6: SCIM 2.0 — opt-in. OMCP_SCIM_TOKEN gates access.
+    // The store backend is chosen by createScimStore from
+    // OMCP_SCIM_BACKEND (file | redis). file (default) → OMCP_SCIM_STORE
+    // on-disk JSON (mode 0600, atomic). redis → a shared snapshot so
+    // multi-replica deployments stay coherent (Q6); the redis client is
+    // built from OMCP_SCIM_REDIS_URL here, mirroring the session store.
     const scimToken = process.env.OMCP_SCIM_TOKEN?.trim();
     if (scimToken) {
-        const scimStorePath = process.env.OMCP_SCIM_STORE?.trim() || "/tmp/scim.json";
-        const scimStore = new ScimStore(scimStorePath);
-        await scimStore.load();
-        registerScimRoutes(app, {
-            store: scimStore,
-            bearerToken: scimToken,
-            audit: (ev) => void mgmtAudit.record({
-                actor: { sub: `scim:${ev.actor}` },
-                tenant: "default",
-                resource: "users",
-                action: ev.action.includes("delete") ? "delete" : "write",
-                method: "SCIM",
-                path: `/scim/v2/${ev.action}`,
-                status: ev.status,
-                target: ev.target,
-            }).catch(() => undefined),
-        });
-        console.log("[scim] /scim/v2/* registered (store: %s)", scimStorePath);
+        try {
+            const scimBackend = (process.env.OMCP_SCIM_BACKEND?.trim() || "file");
+            let scimRedis;
+            if (scimBackend === "redis") {
+                const redisUrl = process.env.OMCP_SCIM_REDIS_URL?.trim();
+                if (!redisUrl)
+                    throw new Error("OMCP_SCIM_BACKEND=redis requires OMCP_SCIM_REDIS_URL");
+                const { createClient } = await import("redis");
+                const client = createClient({ url: redisUrl });
+                client.on("error", (err) => console.warn("[scim] redis client error: %s", err instanceof Error ? err.message : String(err)));
+                await client.connect();
+                scimRedis = client;
+            }
+            const scimStore = await createScimStore({
+                backend: scimBackend,
+                path: process.env.OMCP_SCIM_STORE?.trim() || "/tmp/scim.json",
+                redis: scimRedis,
+                redisKey: process.env.OMCP_SCIM_REDIS_KEY?.trim(),
+            });
+            registerScimRoutes(app, {
+                store: scimStore,
+                bearerToken: scimToken,
+                audit: (ev) => void mgmtAudit.record({
+                    actor: { sub: `scim:${ev.actor}` },
+                    tenant: "default",
+                    resource: "users",
+                    action: ev.action.includes("delete") ? "delete" : "write",
+                    method: "SCIM",
+                    path: `/scim/v2/${ev.action}`,
+                    status: ev.status,
+                    target: ev.target,
+                }).catch(() => undefined),
+            });
+            console.log("[scim] /scim/v2/* registered (backend: %s)", scimBackend);
+        }
+        catch (err) {
+            console.warn("[scim] enable failed (routes not mounted): %s", err instanceof Error ? err.message : String(err));
+        }
     }
+    // Phase P6: Postmortems persistence. /api/postmortems lets the
+    // UI list / open / regenerate / delete previously-generated
+    // reports. Always on: the store path comes from
+    // OMCP_POSTMORTEMS_FILE and falls back to /tmp/postmortems.jsonl,
+    // so the demo works with the env unset — operators who want survival
+    // across restarts mount a PVC and point the env at a path on it.
+    const postmortemStore = new PostmortemStore(process.env.OMCP_POSTMORTEMS_FILE?.trim() || "/tmp/postmortems.jsonl");
+    await postmortemStore.load();
+    // GET /api/postmortems — list (newest-first), tenant-scoped.
+    app.get("/api/postmortems", need("services", "read"), async (req, res) => {
+        const sess = req.session;
+        const tenant = sess?.tenant || "default";
+        const entries = postmortemStore.list(tenant);
+        res.json({
+            total: entries.length,
+            entries: entries.map((e) => ({
+                id: e.id,
+                ts: e.ts,
+                createdBy: e.createdBy,
+                service: e.report.service,
+                window: e.report.window,
+                synopsis: e.report.synopsis,
+            })),
+        });
+    });
+    // GET /api/postmortems/:id — full report (markdown + sections).
+    app.get("/api/postmortems/:id", need("services", "read"), async (req, res) => {
+        const sess = req.session;
+        const tenant = sess?.tenant || "default";
+        const id = String(req.params.id ?? "");
+        const entry = postmortemStore.get(id, tenant);
+        if (!entry) {
+            res.status(404).json({ error: `Postmortem ${id} not found` });
+            return;
+        }
+        res.json(entry);
+    });
+    // POST /api/postmortems — regenerate via the tool handler +
+    // persist. Body: { service, duration?, format? }. Returns the
+    // stored entry with its id.
+    app.post("/api/postmortems", need("services", "write"), async (req, res) => {
+        const body = (req.body ?? {});
+        if (!body.service || typeof body.service !== "string") {
+            res.status(400).json({ error: "service is required" });
+            return;
+        }
+        const sess = req.session;
+        const tenant = sess?.tenant || "default";
+        const createdBy = sess?.sub || sess?.name || "unknown";
+        try {
+            // Force JSON so we get the structured report shape back from
+            // the tool, not just the markdown body. We persist the full
+            // structured report; the markdown lives inside `report.markdown`.
+            const ctx = { ...defaultContext(), tenant, principalId: createdBy };
+            const result = await generatePostmortemHandler(registry, { service: body.service, duration: body.duration, format: "json" }, ctx);
+            const text = result?.content?.[0]?.text;
+            if (!text) {
+                res.status(500).json({ error: "generate_postmortem returned no content" });
+                return;
+            }
+            const report = JSON.parse(text);
+            const stored = await postmortemStore.append({ report, createdBy, tenant });
+            res.status(201).json(stored);
+        }
+        catch (e) {
+            console.warn(`[postmortems] regen failed:`, e);
+            res.status(500).json({ error: e?.message || "internal error" });
+        }
+    });
+    // DELETE /api/postmortems/:id — admin-gated.
+    app.delete("/api/postmortems/:id", need("services", "delete"), async (req, res) => {
+        const sess = req.session;
+        const tenant = sess?.tenant || "default";
+        const ok = await postmortemStore.delete(String(req.params.id ?? ""), tenant);
+        if (!ok) {
+            res.status(404).json({ error: `Postmortem ${req.params.id} not found` });
+            return;
+        }
+        res.status(204).end();
+    });
     // Connectors currently loaded into this server (builtin + filesystem
     // plugins), with manifest metadata — drives the UI "Connectors" page.
     app.get("/api/connectors", (_req, res) => {
@@ -2550,7 +2791,7 @@ async function main() {
     });
     // Stdio transport: one server over stdin/stdout, no HTTP listener.
     if (STDIO) {
-        const server = createMcpServer(defaultContext());
+        const { mcpServer: server } = createMcpServer(defaultContext());
         await server.connect(new StdioServerTransport());
         console.error(`observability-mcp running on stdio transport · connectors: ${registry
             .getAll()
@@ -2723,7 +2964,7 @@ async function main() {
                 }
                 mcpActiveSessions.set(transports.size);
             };
-            const sessionMcpServer = createMcpServer(ctx);
+            const { mcpServer: sessionMcpServer } = createMcpServer(ctx);
             await sessionMcpServer.connect(transport);
         }
         await transport.handleRequest(req, res, req.body);
@@ -2831,7 +3072,7 @@ async function main() {
                 }
                 mcpActiveSessions.set(transports.size);
             };
-            const sessionMcpServer = createMcpServer(ctx);
+            const { mcpServer: sessionMcpServer } = createMcpServer(ctx);
             await sessionMcpServer.connect(transport);
         }
         await transport.handleRequest(req, res, req.body);
@@ -2981,7 +3222,7 @@ async function main() {
         wss.handleUpgrade(req, socket, head, async (ws) => {
             try {
                 const transport = new WebSocketServerTransport(ws);
-                const sessionMcpServer = createMcpServer(auth.ctx);
+                const { mcpServer: sessionMcpServer } = createMcpServer(auth.ctx);
                 await sessionMcpServer.connect(transport);
             }
             catch (err) {

package/dist/metrics/self.d.ts CHANGED Viewed

@@ -5,6 +5,7 @@ export declare const mcpToolLatency: Histogram<"tool">;
 export declare const connectorCalls: Counter<"type" | "source" | "outcome" | "operation">;
 export declare const apiRequests: Counter<"status" | "route" | "method">;
 export declare const mcpActiveSessions: Gauge<string>;
+export declare const auditDlqDepth: Gauge<string>;
 /**
  * Wrap a (potentially async) tool handler to record call count + latency.
  * Outcome is "ok" or "error" — never throws on its own.

package/dist/metrics/self.js CHANGED Viewed

@@ -40,6 +40,14 @@ export const mcpActiveSessions = new Gauge({
     help: "Active MCP Streamable HTTP sessions.",
     registers: [selfRegistry],
 });
+// P9: Audit webhook dead-letter queue depth, exposed as the label-less gauge
+// `obsmcp_audit_webhook_dlq_depth` on `selfRegistry`. Refreshed on each `/metrics` scrape and when the operator hits `/api/audit/dlq`.
+// Stays at 0 when no DLQ file is configured or the file is missing.
+export const auditDlqDepth = new Gauge({
+    name: "obsmcp_audit_webhook_dlq_depth",
+    help: "Number of audit entries waiting in the webhook-sink dead-letter queue.",
+    registers: [selfRegistry],
+});
 /**
  * Wrap a (potentially async) tool handler to record call count + latency.
  * Outcome is "ok" or "error" — never throws on its own.