npm - @thotischner/observability-mcp - Versions diffs - 3.0.0 → 3.1.0 - Mend

@thotischner/observability-mcp 3.0.0 → 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (93) hide show

package/dist/analysis/history.d.ts +36 -2
package/dist/analysis/history.js +60 -2
package/dist/analysis/history.test.js +46 -0
package/dist/audit/sinks/s3.d.ts +61 -0
package/dist/audit/sinks/s3.js +179 -0
package/dist/audit/sinks/s3.test.d.ts +1 -0
package/dist/audit/sinks/s3.test.js +175 -0
package/dist/auth/csrf.d.ts +6 -0
package/dist/auth/csrf.js +4 -0
package/dist/auth/csrf.test.js +22 -0
package/dist/auth/lockout.d.ts +72 -0
package/dist/auth/lockout.js +134 -0
package/dist/auth/lockout.test.d.ts +1 -0
package/dist/auth/lockout.test.js +133 -0
package/dist/auth/middleware.d.ts +5 -0
package/dist/auth/middleware.js +6 -1
package/dist/auth/middleware.test.js +31 -0
package/dist/auth/password-policy.d.ts +52 -0
package/dist/auth/password-policy.js +125 -0
package/dist/auth/password-policy.test.d.ts +1 -0
package/dist/auth/password-policy.test.js +111 -0
package/dist/auth/policy/batch-dry-run.js +15 -0
package/dist/auth/revocation.d.ts +93 -0
package/dist/auth/revocation.js +193 -0
package/dist/auth/revocation.test.d.ts +1 -0
package/dist/auth/revocation.test.js +136 -0
package/dist/auth/session.d.ts +7 -0
package/dist/auth/session.js +6 -0
package/dist/auth/session.test.js +21 -0
package/dist/connectors/interface.d.ts +5 -1
package/dist/connectors/loader.d.ts +8 -0
package/dist/connectors/loader.js +49 -0
package/dist/connectors/loki.d.ts +45 -1
package/dist/connectors/loki.js +141 -8
package/dist/connectors/loki.test.js +171 -1
package/dist/connectors/manifest-hooks.test.d.ts +1 -0
package/dist/connectors/manifest-hooks.test.js +206 -0
package/dist/federation/registry.d.ts +27 -5
package/dist/federation/registry.js +49 -4
package/dist/federation/registry.test.js +79 -3
package/dist/federation/upstream.d.ts +32 -6
package/dist/federation/upstream.js +60 -12
package/dist/federation/upstream.test.d.ts +1 -0
package/dist/federation/upstream.test.js +118 -0
package/dist/index.js +522 -67
package/dist/metrics/self.d.ts +1 -0
package/dist/metrics/self.js +8 -0
package/dist/openapi.js +39 -0
package/dist/openapi.test.js +1 -0
package/dist/policy/redact.js +1 -1
package/dist/postmortem/store.d.ts +34 -0
package/dist/postmortem/store.js +113 -0
package/dist/postmortem/store.test.d.ts +1 -0
package/dist/postmortem/store.test.js +118 -0
package/dist/scim/compliance.test.d.ts +1 -0
package/dist/scim/compliance.test.js +169 -0
package/dist/scim/factory.test.d.ts +1 -0
package/dist/scim/factory.test.js +54 -0
package/dist/scim/patch-ops.test.d.ts +1 -0
package/dist/scim/patch-ops.test.js +100 -0
package/dist/scim/redis-store.d.ts +38 -0
package/dist/scim/redis-store.js +178 -0
package/dist/scim/redis-store.test.d.ts +1 -0
package/dist/scim/redis-store.test.js +138 -0
package/dist/scim/routes.d.ts +27 -2
package/dist/scim/routes.js +161 -15
package/dist/scim/store.d.ts +40 -1
package/dist/scim/store.js +23 -5
package/dist/sdk/hook-wrappers.d.ts +39 -0
package/dist/sdk/hook-wrappers.js +113 -0
package/dist/sdk/hook-wrappers.test.d.ts +1 -0
package/dist/sdk/hook-wrappers.test.js +204 -0
package/dist/sdk/index.d.ts +13 -0
package/dist/security/csp.d.ts +64 -0
package/dist/security/csp.js +135 -0
package/dist/security/csp.test.d.ts +1 -0
package/dist/security/csp.test.js +97 -0
package/dist/tools/detect-anomalies.d.ts +12 -1
package/dist/tools/detect-anomalies.js +22 -2
package/dist/tools/query-logs.d.ts +40 -0
package/dist/tools/query-logs.js +69 -3
package/dist/tools/topology.js +23 -5
package/dist/tools/topology.test.js +45 -0
package/dist/tools/validation.d.ts +13 -0
package/dist/tools/validation.js +74 -0
package/dist/tools/validation.test.js +54 -1
package/dist/transport/transportSessionMap.d.ts +70 -0
package/dist/transport/transportSessionMap.js +128 -0
package/dist/transport/transportSessionMap.test.d.ts +1 -0
package/dist/transport/transportSessionMap.test.js +111 -0
package/dist/types.d.ts +48 -0
package/dist/ui/index.html +898 -116
package/package.json +1 -1

package/dist/index.js CHANGED Viewed

@@ -19,7 +19,11 @@ import { buildSessionAttacher, buildRequireSession, } from "./auth/middleware.js
 import { buildRequirePermissionFromEngine, hasPermission, listGrantedPermissions, DEFAULT_POLICY, } from "./auth/rbac.js";
 import { resolveOidcConfig, buildOidcRuntime } from "./auth/oidc/runtime.js";
 import { registerOidcRoutes } from "./auth/oidc/endpoints.js";
-import { ScimStore } from "./scim/store.js";
+import { RevocationStore } from "./auth/revocation.js";
+import { AccountLockout, lockoutConfigFromEnv, lockoutDisabledFromEnv, } from "./auth/lockout.js";
+import { resolveSessionStore } from "./transport/sessionStore.js";
+import { generateNonce, enforcedCsp, reportOnlyCsp, reportingEndpointsHeader, reportToHeader, summariseViolation, cspStrictReportFromEnv, CSP_NONCE_PLACEHOLDER, } from "./security/csp.js";
+import { createScimStore } from "./scim/store.js";
 import { registerScimRoutes } from "./scim/routes.js";
 import { BuiltinPolicyEngine } from "./auth/policy/engine.js";
 import { loadPolicyFromFile, writePolicyFile, PolicyLoadError, VALID_RESOURCES, VALID_ACTIONS } from "./auth/policy/loader.js";
@@ -40,10 +44,11 @@ import { getPluginLoader } from "./connectors/loader.js";
 import { resolveHubCatalogUrl, describeInstalled, mergeCatalog, fetchHubCatalog, } from "./connectors/hub.js";
 import { isValidConnectorName, installTarball } from "./connectors/install.js";
 import { PluginVerificationError } from "./connectors/verify.js";
-import { selfRegistry, withToolMetrics, apiRequests, mcpActiveSessions } from "./metrics/self.js";
+import { selfRegistry, withToolMetrics, apiRequests, mcpActiveSessions, auditDlqDepth } from "./metrics/self.js";
 import { initOtel } from "./observability/otel.js";
 import { WebSocketServerTransport } from "./transport/websocket.js";
 import { HookRegistry } from "./sdk/hooks.js";
+import { wrapToolHandler, wrapResourceHandler, wrapPromptHandler } from "./sdk/hook-wrappers.js";
 import { UpstreamClient } from "./federation/upstream.js";
 import { FederationRegistry, parseFederationEnv } from "./federation/registry.js";
 import { buildCsrfIssuer, buildCsrfEnforcer, csrfBypassFromEnv } from "./auth/csrf.js";
@@ -56,6 +61,7 @@ import { queryLogsHandler } from "./tools/query-logs.js";
 import { queryTracesHandler } from "./tools/query-traces.js";
 import { getAnomalyHistoryHandler } from "./tools/get-anomaly-history.js";
 import { generatePostmortemHandler } from "./tools/generate-postmortem.js";
+import { PostmortemStore } from "./postmortem/store.js";
 import { AnomalyHistory, fromEnv as anomalyHistoryFromEnv } from "./analysis/history.js";
 import { getServiceHealthHandler, setHealthThresholds } from "./tools/get-service-health.js";
 import { detectAnomaliesHandler } from "./tools/detect-anomalies.js";
@@ -295,11 +301,20 @@ async function main() {
             return result;
         }
     }
+    /**
+     * Returns the McpServer for the given context. The companion
+     * `toolHandlers` map carries every tool registered for this ctx
+     * (post-hook-wrapping) so the in-product Playground UI (Q13) can
+     * invoke a tool without going through the full Streamable HTTP
+     * transport stack. The map is keyed by tool name; values run the
+     * same wrapped handler the McpServer would dispatch over MCP.
+     */
     function createMcpServer(ctx) {
         const mcpServer = new McpServer({
             name: "observability-mcp",
             version: SERVER_VERSION,
         });
+        const toolHandlers = new Map();
         // --- Register tools with Zod schemas ---
         // Product-aware registration: when the active credential is bound
         // to a Product (OMCP_KEY_PRODUCTS), `ctx.allowedTools` carries that
@@ -319,34 +334,39 @@ async function main() {
                 return undefined;
             if (rest.length > 0 && typeof rest[rest.length - 1] === "function") {
                 const originalHandler = rest[rest.length - 1];
-                const wrappedHandler = async (args, extra) => {
-                    const hookCtxBase = {
-                        principal: ctx.principalId,
-                        tenant: ctx.tenant || "default",
-                        target: name,
-                    };
-                    const pre = await hookRegistry.fire("tool_pre_invoke", { ...hookCtxBase, kind: "tool_pre_invoke" }, { args });
-                    if (!pre.allow) {
-                        return {
-                            content: [{ type: "text", text: pre.reason ?? "denied by plugin hook" }],
-                            isError: true,
-                        };
-                    }
-                    const effectiveArgs = pre.payload?.args ?? args;
-                    const result = await originalHandler(effectiveArgs, extra);
-                    const post = await hookRegistry.fire("tool_post_invoke", { ...hookCtxBase, kind: "tool_post_invoke" }, { args: effectiveArgs, result });
-                    if (!post.allow) {
-                        return {
-                            content: [{ type: "text", text: post.reason ?? "denied by plugin hook" }],
-                            isError: true,
-                        };
-                    }
-                    return post.payload?.result ?? result;
-                };
+                const wrappedHandler = wrapToolHandler(hookRegistry, { principal: ctx.principalId, tenant: ctx.tenant || "default", target: name }, originalHandler);
                 rest[rest.length - 1] = wrappedHandler;
+                // Stash for the Playground endpoint — keyed by tool name. The
+                // wrapped handler honours pre/post hooks + the same RBAC the
+                // McpServer dispatch path runs. Per-ctx Map so a different
+                // user's allowedTools never leak.
+                toolHandlers.set(name, wrappedHandler);
             }
             return mcpServer.tool(name, ...rest);
         });
+        // Q12: resource + prompt registrations get the same hook-fan-out
+        // treatment so a plugin's resource_pre_fetch / resource_post_fetch /
+        // prompt_pre_fetch / prompt_post_fetch handlers actually fire when
+        // a future resource/prompt registration lands. The wrappers stay
+        // thin pass-throughs when no hooks are registered (the OSS default).
+        // Wrappers are tested in mcp-server/src/sdk/hook-wrappers.test.ts.
+        const registerResource = ((name, ...rest) => {
+            if (rest.length > 0 && typeof rest[rest.length - 1] === "function") {
+                const originalHandler = rest[rest.length - 1];
+                rest[rest.length - 1] = wrapResourceHandler(hookRegistry, { principal: ctx.principalId, tenant: ctx.tenant || "default", target: name }, originalHandler);
+            }
+            return mcpServer.resource(name, ...rest);
+        });
+        const registerPrompt = ((name, ...rest) => {
+            if (rest.length > 0 && typeof rest[rest.length - 1] === "function") {
+                const originalHandler = rest[rest.length - 1];
+                rest[rest.length - 1] = wrapPromptHandler(hookRegistry, { principal: ctx.principalId, tenant: ctx.tenant || "default", target: name }, originalHandler);
+            }
+            return mcpServer.prompt(name, ...rest);
+        });
+        // Suppress unused-variable warnings — both helpers are kept ready for when resource/prompt registrations land.
+        void registerResource;
+        void registerPrompt;
         registerTool("list_sources", [
             "List the configured observability backends (Prometheus, Loki, and any connector) and whether each is currently reachable.",
             "When to use: call this first to learn which source names exist and are healthy before passing `source` to other tools, or to debug why a query returns no data.",
@@ -547,7 +567,9 @@ async function main() {
                 .describe("Optional. Detection threshold: 'low' flags only strong deviations (>3σ), 'medium' is balanced (>2σ), 'high' is most sensitive and noisier (>1.5σ). Default: 'medium'."),
         }, async (args) => {
             await enforceEntitledAccess(ctx, { tool: "detect_anomalies", source: args?.source, service: args?.service });
-            return withToolMetrics("detect_anomalies", () => detectAnomaliesHandler(registry, args, ctx));
+            // P1: pass the anomaly-history sink so detected scores flow
+            // into the TSDB and `get_anomaly_history` returns real data.
+            return withToolMetrics("detect_anomalies", () => detectAnomaliesHandler(registry, args, ctx, anomalyHistory));
         });
         registerTool("get_topology", [
             "Return the infrastructure topology graph (Resources and Edges) from every topology-capable connector.",
@@ -599,16 +621,33 @@ async function main() {
         // Product-allow-list gate, so federated tools obey the same policy
         // surface as native ones.
         for (const info of federationRegistry.getNamespacedTools()) {
-            // Upstream's inputSchema is forwarded verbatim. The SDK's
-            // tool() overload signatures don't carry an obvious type for a
-            // dynamic-shape schema, so we cast to `any` at the boundary and
-            // let the upstream contract speak for the validation.
-            registerTool(info.namespacedName, info.description || `Federated from upstream ${info.sourceName}.`, info.inputSchema ?? {}, async (args) => {
+            // The MCP SDK's tool() signature wants a ZodRawShape (a map of
+            // field-name → Zod type), not a raw JSON Schema. Federated
+            // upstreams expose JSON Schema (the wire-format MCP uses on
+            // tools/list); we transcode to a permissive Zod shape so the
+            // SDK accepts the registration. Per-field types are `z.unknown()`
+            // because the upstream will validate the call args anyway; the
+            // local Zod check is only a "this is the field name set" gate.
+            // P7: this transcoding fixes the registration crash that broke
+            // every federation deploy before the E2E test caught it.
+            const upstreamProps = info.inputSchema?.properties ?? {};
+            // Every field is z.unknown().optional() — the SDK only uses this
+            // shape to know the field-name set; the upstream re-validates
+            // against its full JSON Schema (incl. its own `required` list)
+            // when the call arrives. Marking all fields optional here keeps
+            // calls with the upstream-defaults flowing through; without it
+            // the SDK rejects any call that omits a field upstream considers
+            // required even if the upstream would accept the omission.
+            const localShape = {};
+            for (const k of Object.keys(upstreamProps)) {
+                localShape[k] = z.unknown().optional();
+            }
+            registerTool(info.namespacedName, info.description || `Federated from upstream ${info.sourceName}.`, localShape, async (args) => {
                 await enforceEntitledAccess(ctx, { tool: info.namespacedName });
                 return withToolMetrics(info.namespacedName, () => federationRegistry.callNamespacedTool(info.namespacedName, args));
             });
         }
-        return mcpServer;
+        return { mcpServer, toolHandlers };
     }
     // --- Management-plane auth (basic mode) -----------------------------------
     // Off by default. Enable with `OMCP_AUTH=basic` + `OMCP_USERS_FILE` and
@@ -688,7 +727,19 @@ async function main() {
     else if (requestedAuthMode !== "anonymous") {
         authMisconfig(`unknown OMCP_AUTH=${requestedAuthMode}`);
     }
-    const authRuntime = { mode: authMode, session: sessionCfg, secretEphemeral, oidc: oidcRuntime };
+    // Session revocation blocklist (Q17). Only meaningful when sessions
+    // exist (basic / oidc); anonymous mode leaves it undefined so the
+    // middleware check is a pure no-op. OMCP_AUTH_REVOCATION_FILE persists
+    // the blocklist across restarts and shares it across replicas when it
+    // points at shared storage; unset = in-memory only.
+    let revocationStore;
+    if (authMode !== "anonymous") {
+        revocationStore = await RevocationStore.create({
+            path: process.env.OMCP_AUTH_REVOCATION_FILE?.trim() || undefined,
+        });
+        console.log(`[auth] session revocation blocklist active — backend=${revocationStore.persistent ? `file (${revocationStore.filePath})` : "memory"}, ${revocationStore.size} existing entr${revocationStore.size === 1 ? "y" : "ies"}`);
+    }
+    const authRuntime = { mode: authMode, session: sessionCfg, secretEphemeral, oidc: oidcRuntime, revocation: revocationStore };
     // --- HTTP server ---
     const app = express();
     // Trust-proxy: when set, Express will read req.ip / req.secure from
@@ -717,13 +768,43 @@ async function main() {
             app.set("trust proxy", trustProxy);
         }
     }
-    app.use(express.json({ limit: "1mb" }));
+    // Parse application/json AND any *+json media type. SCIM clients
+    // (Entra, Okta) send `application/scim+json` per RFC 7644 §3.1 —
+    // without the wildcard the body silently arrives empty and every
+    // SCIM POST/PATCH 400s. The wildcard also future-proofs other
+    // structured-suffix JSON content types.
+    // application/csp-report is the legacy media type browsers use for CSP
+    // violation reports (the modern Reporting API uses application/reports+json,
+    // already covered by the wildcard). Without it the report body arrives empty.
+    app.use(express.json({ limit: "1mb", type: ["application/json", "application/*+json", "application/csp-report"] }));
+    // Q20 — resolve the opt-in strict Report-Only CSP toggle once at boot.
+    // Default off: with ~200 inline handlers the report-only policy would
+    // emit a [Report Only] console message per handler on every page load.
+    const cspStrictReport = cspStrictReportFromEnv();
+    if (cspStrictReport) {
+        console.log("[csp] strict report-only policy ON (OMCP_CSP_STRICT_REPORT) — inline-handler violations will be reported to /api/csp-violations");
+    }
     // Security headers
     app.use((req, res, next) => {
         res.setHeader("X-Content-Type-Options", "nosniff");
         res.setHeader("X-Frame-Options", "DENY");
         res.setHeader("X-XSS-Protection", "1; mode=block");
         res.setHeader("Referrer-Policy", "strict-origin-when-cross-origin");
+        // Q20 — Content-Security-Policy. A per-request nonce is minted and
+        // stashed on res.locals so the UI handler can stamp it into the two
+        // inline <script> blocks. The enforced policy keeps the UI working
+        // (script-src 'unsafe-inline' for the ~200 inline handlers) and is
+        // always on; the strict report-only policy is opt-in (it surfaces the
+        // inline-handler debt but is console-noisy). Both report to
+        // /api/csp-violations.
+        const nonce = generateNonce();
+        res.locals.cspNonce = nonce;
+        res.setHeader("Content-Security-Policy", enforcedCsp());
+        if (cspStrictReport) {
+            res.setHeader("Content-Security-Policy-Report-Only", reportOnlyCsp(nonce));
+        }
+        res.setHeader("Reporting-Endpoints", reportingEndpointsHeader());
+        res.setHeader("Report-To", reportToHeader());
         // Dynamic API responses must never be served from the browser/proxy
         // cache: after a mutation (e.g. installing a connector) the UI
         // re-fetches these GETs immediately, and a heuristically-cached stale
@@ -774,6 +855,11 @@ async function main() {
     const csrfCfg = {
         bypassBearer: csrfBypassFromEnv(),
         secureCookie: (r) => r.secure || r.headers["x-forwarded-proto"] === "https",
+        // CSP violation reports are unauthenticated browser POSTs that by
+        // construction carry no cookie + no custom header — exempt them from
+        // CSRF. The endpoint only records a sanitised summary, so accepting it
+        // cross-site is harmless.
+        skip: (r) => r.method === "POST" && (r.path === "/api/csp-violations" || r.originalUrl.split("?")[0] === "/api/csp-violations"),
     };
     app.use(buildCsrfIssuer(csrfCfg));
     app.use("/api", buildCsrfEnforcer(csrfCfg));
@@ -904,6 +990,36 @@ async function main() {
             .catch((err) => console.warn("AuditLog flushSinks failed:", err));
     });
     const audit = (resource, action) => buildAuditMiddleware({ audit: mgmtAudit, resource, action });
+    // Q20 — CSP violation report sink. Unauthenticated browser POST (exempt
+    // from CSRF via csrfCfg.skip), tightly rate-limited so a misbehaving or
+    // hostile client can't flood the audit log, and only a sanitised summary
+    // (directive / blocked-uri / document-uri) is recorded. Always 204 so the
+    // browser never retries. The report-only strict policy is what drives most
+    // of these today (the inline-handler debt) — they roll into mgmtAudit so an
+    // operator can watch the migration surface shrink.
+    const cspReportRateLimit = rateLimit({
+        windowMs: 60_000,
+        max: 60,
+        standardHeaders: true,
+        legacyHeaders: false,
+        message: { error: "rate limited" },
+    });
+    app.post("/api/csp-violations", cspReportRateLimit, (req, res) => {
+        const summary = summariseViolation(req.body);
+        if (summary) {
+            void mgmtAudit.record({
+                actor: { sub: "browser:csp" },
+                tenant: "default",
+                resource: "settings",
+                action: "read",
+                method: "POST",
+                path: "/api/csp-violations",
+                status: 204,
+                target: `${summary.directive} blocked ${summary.blockedUri}`.slice(0, 256),
+            }).catch(() => { });
+        }
+        res.status(204).end();
+    });
     // Plugin lifecycle hook registry — populated by the loader at boot
     // (one entry per manifest `hooks[]` entry) and mutable at runtime
     // when a connector is installed via /api/connectors/install. Each
@@ -938,11 +1054,11 @@ async function main() {
     // (no tools) so the gateway boots regardless of upstream health.
     const federationRegistry = new FederationRegistry();
     for (const cfg of parseFederationEnv()) {
-        const client = new UpstreamClient({
-            name: cfg.name,
-            url: cfg.url,
-            bearerToken: cfg.bearerToken,
-        });
+        const client = new UpstreamClient(cfg.kind === "stdio"
+            ? { transport: "stdio", name: cfg.name, command: cfg.command, args: cfg.args }
+            : cfg.kind === "ws"
+                ? { transport: "ws", name: cfg.name, url: cfg.url }
+                : { name: cfg.name, url: cfg.url, bearerToken: cfg.bearerToken });
         federationRegistry.add(client);
         client.connect().catch((err) => {
             console.warn("federation upstream %s initial connect failed: %s", cfg.name, err instanceof Error ? err.message : String(err));
@@ -1044,11 +1160,51 @@ async function main() {
     // this endpoint when enabled.
     if (process.env.METRICS_ENABLED !== "false") {
         app.get("/metrics", async (_req, res) => {
+            // P9: refresh the audit-webhook DLQ depth before the scrape so
+            // Prometheus sees the current file state rather than whatever
+            // /api/audit/dlq last set. Best-effort; ENOENT or missing-env
+            // resets to 0 (the dlqPath being unset is the normal state).
+            try {
+                const dlqPath = process.env.OMCP_AUDIT_WEBHOOK_DLQ;
+                if (dlqPath) {
+                    const fs = await import("node:fs/promises");
+                    const raw = await fs.readFile(dlqPath, "utf8").catch(() => "");
+                    auditDlqDepth.set(raw.split("\n").filter((l) => l.trim()).length);
+                }
+                else {
+                    auditDlqDepth.set(0);
+                }
+            }
+            catch {
+                auditDlqDepth.set(0);
+            }
             res.set("Content-Type", selfRegistry.contentType);
             res.end(await selfRegistry.metrics());
         });
     }
-    // Serve Web UI
+    // Serve Web UI. The index page is served dynamically so the per-request
+    // CSP nonce can be stamped into its inline <script> blocks (the rest of
+    // ui/ stays on express.static). Read once at boot; if the file is
+    // missing we fall through to static, which 404s like before.
+    let uiHtmlTemplate = null;
+    try {
+        uiHtmlTemplate = readFileSync(join(__dirname, "ui", "index.html"), "utf8");
+    }
+    catch {
+        uiHtmlTemplate = null;
+    }
+    if (uiHtmlTemplate) {
+        const template = uiHtmlTemplate;
+        const serveIndex = (_req, res) => {
+            const nonce = res.locals.cspNonce ?? "";
+            res.setHeader("Content-Type", "text/html; charset=utf-8");
+            // Index is identity/nonce-specific — never let a proxy cache it.
+            res.setHeader("Cache-Control", "no-store");
+            res.send(template.split(CSP_NONCE_PLACEHOLDER).join(nonce));
+        };
+        app.get("/", serveIndex);
+        app.get("/index.html", serveIndex);
+    }
     app.use(express.static(join(__dirname, "ui")));
     // --- API endpoints for Web UI ---
     // List sources with health status — tenant-scoped.
@@ -1108,6 +1264,37 @@ async function main() {
     app.get("/api/tools/registry", (_req, res) => {
         res.json({ tools: REGISTERED_TOOLS });
     });
+    // Q13: in-product Playground endpoint. Lets the operator invoke a
+    // registered tool against the live gateway without spinning up a
+    // separate MCP client. Re-uses the per-session ctx and the same
+    // wrapped handler the McpServer dispatch path would run (so RBAC,
+    // entitlements, rate-limit, audit, hook fan-out all apply
+    // identically).
+    app.post("/api/playground/invoke", async (req, res) => {
+        const ctx = await gateCtx(req, res);
+        if (!ctx)
+            return;
+        const body = (req.body ?? {});
+        const tool = typeof body.tool === "string" ? body.tool : "";
+        if (!tool) {
+            res.status(400).json({ error: "tool (string) is required" });
+            return;
+        }
+        const { toolHandlers } = createMcpServer(ctx);
+        const handler = toolHandlers.get(tool);
+        if (!handler) {
+            res.status(404).json({ error: `tool '${tool}' is not registered (or not allowed for this credential)` });
+            return;
+        }
+        try {
+            const result = await handler(body.args ?? {}, undefined);
+            res.json({ tool, result });
+        }
+        catch (err) {
+            const message = err instanceof Error ? err.message : String(err);
+            res.status(500).json({ error: message, tool });
+        }
+    });
     // Server info — version, loaded plugins, MCP protocol version, build metadata.
     // Used by the Web UI footer and by operators to confirm what's deployed.
     app.get("/api/info", async (_req, res) => {
@@ -1142,6 +1329,16 @@ async function main() {
                 redaction: REDACTION_ENABLED,
                 trustProxy: !!(process.env.OMCP_TRUST_PROXY && process.env.OMCP_TRUST_PROXY !== "false"),
                 toolRatePerMin: resolveToolRatePerMin(process.env.OMCP_TOOL_RATE_PER_MIN),
+                // P1: posture flags so dashboards can alert when a shipped
+                // capability is configured but doing nothing useful.
+                anomalyHistoryActive: anomalyHistory.isEnabled(),
+                tracesCapabilityCount: registry
+                    .getAll()
+                    .filter((c) => typeof c.queryTraces === "function").length,
+                pluginsVerified: !/^(0|false|no|off)$/i.test(process.env.VERIFY_PLUGINS ?? "true"),
+                scimEnabled: !!process.env.OMCP_SCIM_TOKEN,
+                federationUpstreams: (process.env.OMCP_FEDERATION_UPSTREAMS ?? "")
+                    .split(",").map((s) => s.trim()).filter(Boolean).length,
             },
             plugins: loader.list().map((p) => ({
                 name: p.name,
@@ -1187,6 +1384,10 @@ async function main() {
             },
             permissions: listGrantedPermissions(sess.roles, policyEngineToMap(policyEngine)),
             exp: sess.exp,
+            // The current session's revocation id. Surfaced so an admin can
+            // copy it into POST /api/auth/revocations to kill a specific
+            // session. Absent for legacy cookies issued before sid existed.
+            sid: sess.sid,
             // When the user signed in via OIDC, surface the IdP issuer
             // URL so the UI can render an appropriate badge or link to
             // an IdP-side profile page. Empty / absent in basic mode.
@@ -1566,6 +1767,46 @@ async function main() {
             scopedTo: tenantFilter || (isAdmin ? null : callerTenant),
         });
     });
+    // --- /api/audit/dlq — webhook-sink dead-letter queue surface (P9) ---
+    // When the audit webhook is configured AND the receiver exhausted
+    // its retry budget, entries land in the DLQ file. This endpoint
+    // surfaces the count + the last N entries so operators can decide
+    // whether to replay manually. Also refreshes the
+    // `obsmcp_audit_webhook_dlq_depth` gauge so the /metrics scrape
+    // alongside it stays accurate.
+    app.get("/api/audit/dlq", need("audit", "read"), async (_req, res) => {
+        const dlqPath = process.env.OMCP_AUDIT_WEBHOOK_DLQ;
+        if (!dlqPath) {
+            auditDlqDepth.set(0);
+            res.json({ enabled: false, path: null, depth: 0, entries: [] });
+            return;
+        }
+        try {
+            const fs = await import("node:fs/promises");
+            const raw = await fs.readFile(dlqPath, "utf8");
+            const lines = raw.split("\n").filter((l) => l.trim());
+            auditDlqDepth.set(lines.length);
+            const tail = lines.slice(-50).map((l) => {
+                try {
+                    return JSON.parse(l);
+                }
+                catch {
+                    return { _raw: l, _parseError: true };
+                }
+            });
+            res.json({ enabled: true, path: dlqPath, depth: lines.length, entries: tail });
+        }
+        catch (err) {
+            const code = err.code;
+            if (code === "ENOENT") {
+                auditDlqDepth.set(0);
+                res.json({ enabled: true, path: dlqPath, depth: 0, entries: [] });
+                return;
+            }
+            console.warn("[/api/audit/dlq] read failed:", err);
+            res.status(500).json({ error: err?.message || "DLQ read failed" });
+        }
+    });
     // --- /api/usage — per-identity MCP rate-limit snapshot -----------------
     // Read-only view of the IdentityRateLimiter's bucket state. Gated by
     // need("audit","read") — the same role set that already sees the
@@ -1671,6 +1912,19 @@ async function main() {
             catch { /* ignore — first login will pick it up */ }
         }
     }
+    // Q18 — username-keyed lockout with progressive backoff after repeated
+    // failed logins. The per-IP loginRateLimit above bounds one noisy
+    // source; this bounds a slow / distributed grind against one account.
+    // State lives in the shared SessionStore so a Redis deployment
+    // locks consistently across replicas (and self-cleans via TTL).
+    // Basic mode only — OIDC delegates auth (and lockout) to the IdP.
+    // `lockout` stays undefined when the feature is off; the login
+    // handler checks it for truthiness before use.
+    let lockout;
+    if (authRuntime.mode === "basic" && !lockoutDisabledFromEnv()) {
+        const store = await resolveSessionStore();
+        const policy = lockoutConfigFromEnv();
+        lockout = new AccountLockout(store, policy);
+        // Read the policy only after construction so the log reflects
+        // exactly the object the lockout instance was given.
+        const { maxFailures, windowSeconds, baseLockSeconds, maxLockSeconds } = policy;
+        console.log(`[auth] account lockout active — ${maxFailures} failures / ${windowSeconds}s → lock ${baseLockSeconds}s (×2 up to ${maxLockSeconds}s), backend=${store.backend}`);
+    }
     app.post("/api/auth/login", loginRateLimit, async (req, res) => {
         if (authRuntime.mode !== "basic" || !sessionCfg || !usersStore) {
             res.status(503).json({ error: "auth mode does not accept logins" });
@@ -1684,11 +1938,57 @@ async function main() {
             res.status(400).json({ error: "username and password are required" });
             return;
         }
+        // Gate on the lock BEFORE the (expensive) scrypt verify so a locked
+        // account can't be used to burn CPU. A locked account is a 429 with
+        // Retry-After, never a credential oracle — the response is identical
+        // whether or not the username exists.
+        if (lockout) {
+            const status = await lockout.check(username);
+            if (status.locked) {
+                res.setHeader("Retry-After", String(status.retryAfterSeconds ?? 0));
+                res.status(429).json({
+                    error: "account temporarily locked due to repeated failed logins",
+                    retryAfterSeconds: status.retryAfterSeconds,
+                });
+                void mgmtAudit.record({
+                    actor: { sub: username },
+                    tenant: "default",
+                    resource: "users",
+                    action: "write",
+                    method: "POST",
+                    path: "/api/auth/login",
+                    status: 429,
+                }).catch(() => { });
+                return;
+            }
+        }
         const user = authenticate(username, password, usersStore);
         if (!user) {
+            if (lockout) {
+                const after = await lockout.recordFailure(username);
+                if (after.locked) {
+                    res.setHeader("Retry-After", String(after.retryAfterSeconds ?? 0));
+                    res.status(429).json({
+                        error: "account temporarily locked due to repeated failed logins",
+                        retryAfterSeconds: after.retryAfterSeconds,
+                    });
+                    void mgmtAudit.record({
+                        actor: { sub: username },
+                        tenant: "default",
+                        resource: "users",
+                        action: "write",
+                        method: "POST",
+                        path: "/api/auth/login",
+                        status: 429,
+                    }).catch(() => { });
+                    return;
+                }
+            }
             res.status(401).json({ error: "invalid credentials" });
             return;
         }
+        if (lockout)
+            await lockout.recordSuccess(user.username);
         const { cookie } = issueSession({ sub: user.username, name: user.name, roles: user.roles, tenant: user.tenant }, sessionCfg);
         const secure = req.secure || (req.headers["x-forwarded-proto"] === "https");
         res.setHeader("Set-Cookie", setCookieHeader(cookie, sessionCfg, { secure }));
@@ -1716,31 +2016,161 @@ async function main() {
         registerOidcRoutes(app, { sessionCfg, oidc: oidcRuntime });
         console.log("[auth] OIDC endpoints registered: /api/auth/oidc/{login,callback,logout}");
     }
-    // Phase F21: SCIM 2.0 — opt-in. OMCP_SCIM_TOKEN gates access;
-    // OMCP_SCIM_STORE points at the on-disk JSON (mode 0600, atomic).
-    // Multi-replica deployments should plug the F8 SessionStore in
-    // when F21b lands.
+    // Q17 — session revocation blocklist. Admin-gated (same role tier as
+    // user/role management). A revoked-but-unexpired cookie is rejected by
+    // buildSessionAttacher on the next request. Revoke a single session by
+    // `sid` (read it from /api/me or the audit log) or every current
+    // session for a `sub` ("log this user out everywhere"). The blocklist
+    // is the stateful complement to the otherwise-stateless cookie.
+    //
+    // Body: { sid?, sub?, reason? }. Exactly one of sid / sub must be a
+    // non-blank string (400 otherwise); both are trimmed. `reason` is
+    // truncated to 500 characters. Responses: 503 when no revocation
+    // store exists, 201 with the stored entry on success, 500 when the
+    // store rejects the write.
+    app.post("/api/auth/revocations", need("users", "delete"), audit("users", "write"), async (req, res) => {
+        if (!revocationStore) {
+            res.status(503).json({ error: "revocation requires an auth mode (basic|oidc)" });
+            return;
+        }
+        const body = (req.body || {});
+        const sid = typeof body.sid === "string" && body.sid.trim() ? body.sid.trim() : undefined;
+        const sub = typeof body.sub === "string" && body.sub.trim() ? body.sub.trim() : undefined;
+        const reason = typeof body.reason === "string" ? body.reason.slice(0, 500) : undefined;
+        if ((sid ? 1 : 0) + (sub ? 1 : 0) !== 1) {
+            res.status(400).json({ error: "exactly one of `sid` or `sub` is required" });
+            return;
+        }
+        // The acting identity is recorded alongside the revocation.
+        const by = req.session?.sub;
+        let entry;
+        try {
+            entry = sid
+                ? await revocationStore.revokeSession(sid, { reason, by })
+                : await revocationStore.revokeSubject(sub, { reason, by });
+        }
+        catch (err) {
+            // A rejected store write must still produce a response; without
+            // this the promise rejection escapes the async handler.
+            console.warn("[/api/auth/revocations] revoke failed:", err);
+            res.status(500).json({ error: err?.message || "revocation failed" });
+            return;
+        }
+        res.status(201).json({ ok: true, revocation: entry });
+    });
+    // Lists current revocations; an absent store yields an empty list.
+    app.get("/api/auth/revocations", need("users", "delete"), (_req, res) => {
+        const revocations = revocationStore ? revocationStore.list() : [];
+        res.json({ revocations });
+    });
+    // Phase F21 / Q6: SCIM 2.0 — opt-in. OMCP_SCIM_TOKEN gates access.
+    // The store backend is chosen by createScimStore from
+    // OMCP_SCIM_BACKEND (file | redis). file (default) → OMCP_SCIM_STORE
+    // on-disk JSON (mode 0600, atomic). redis → a shared snapshot so
+    // multi-replica deployments stay coherent (Q6); the redis client is
+    // built from OMCP_SCIM_REDIS_URL here, mirroring the session store.
     const scimToken = process.env.OMCP_SCIM_TOKEN?.trim();
     if (scimToken) {
-        const scimStorePath = process.env.OMCP_SCIM_STORE?.trim() || "/tmp/scim.json";
-        const scimStore = new ScimStore(scimStorePath);
-        await scimStore.load();
-        registerScimRoutes(app, {
-            store: scimStore,
-            bearerToken: scimToken,
-            audit: (ev) => void mgmtAudit.record({
-                actor: { sub: `scim:${ev.actor}` },
-                tenant: "default",
-                resource: "users",
-                action: ev.action.includes("delete") ? "delete" : "write",
-                method: "SCIM",
-                path: `/scim/v2/${ev.action}`,
-                status: ev.status,
-                target: ev.target,
-            }).catch(() => undefined),
-        });
-        console.log("[scim] /scim/v2/* registered (store: %s)", scimStorePath);
+        try { // any throw below is logged by the catch and leaves the SCIM routes unmounted
+            const scimBackend = (process.env.OMCP_SCIM_BACKEND?.trim() || "file"); // unset or blank → "file"
+            let scimRedis; // stays undefined unless the backend is "redis"
+            if (scimBackend === "redis") {
+                const redisUrl = process.env.OMCP_SCIM_REDIS_URL?.trim();
+                if (!redisUrl)
+                    throw new Error("OMCP_SCIM_BACKEND=redis requires OMCP_SCIM_REDIS_URL");
+                const { createClient } = await import("redis"); // imported lazily, only on the redis path
+                const client = createClient({ url: redisUrl });
+                client.on("error", (err) => console.warn("[scim] redis client error: %s", err instanceof Error ? err.message : String(err))); // listener attached before connect()
+                await client.connect();
+                scimRedis = client;
+            }
+            const scimStore = await createScimStore({ // NOTE(review): if this or registerScimRoutes throws after connect() succeeded, the client is not closed here — confirm that is acceptable
+                backend: scimBackend,
+                path: process.env.OMCP_SCIM_STORE?.trim() || "/tmp/scim.json", // passed for either backend
+                redis: scimRedis, // undefined for the file backend
+                redisKey: process.env.OMCP_SCIM_REDIS_KEY?.trim(), // undefined when the env var is unset
+            });
+            registerScimRoutes(app, {
+                store: scimStore,
+                bearerToken: scimToken,
+                audit: (ev) => void mgmtAudit.record({ // fire-and-forget: the promise is not awaited
+                    actor: { sub: `scim:${ev.actor}` },
+                    tenant: "default",
+                    resource: "users",
+                    action: ev.action.includes("delete") ? "delete" : "write", // any action containing "delete" is audited as delete, everything else as write
+                    method: "SCIM",
+                    path: `/scim/v2/${ev.action}`,
+                    status: ev.status,
+                    target: ev.target,
+                }).catch(() => undefined), // an audit-write rejection is swallowed
+            });
+            console.log("[scim] /scim/v2/* registered (backend: %s)", scimBackend);
+        }
+        catch (err) { // not rethrown: execution continues past this block without SCIM
+            console.warn("[scim] enable failed (routes not mounted): %s", err instanceof Error ? err.message : String(err));
+        }
     }
+    // Phase P6: Postmortems persistence. /api/postmortems lets the
+    // UI list / open / regenerate / delete previously-generated
+    // reports. The store is always created; OMCP_POSTMORTEMS_FILE only
+    // picks its path (unset or blank → /tmp/postmortems.jsonl), so the
+    // demo works with no configuration. Operators who want reports to
+    // survive restarts mount a PVC and point the env at a path on it.
+    const postmortemStore = new PostmortemStore(process.env.OMCP_POSTMORTEMS_FILE?.trim() || "/tmp/postmortems.jsonl");
+    await postmortemStore.load(); // awaited before the /api/postmortems routes below are registered
+    // GET /api/postmortems — list (newest-first), tenant-scoped.
+    // Responds with { total, entries } where each entry is a summary
+    // (id, ts, createdBy, service, window, synopsis), not the full report.
+    // Requests without a session tenant fall back to "default".
+    app.get("/api/postmortems", need("services", "read"), async (req, res) => {
+        const tenant = req.session?.tenant || "default";
+        const stored = postmortemStore.list(tenant);
+        const summaries = stored.map(({ id, ts, createdBy, report }) => ({
+            id,
+            ts,
+            createdBy,
+            service: report.service,
+            window: report.window,
+            synopsis: report.synopsis,
+        }));
+        res.json({ total: stored.length, entries: summaries });
+    });
+    // GET /api/postmortems/:id — full report (markdown + sections).
+    // Looks the id up within the caller's tenant ("default" when the
+    // session carries none); 404 when the store returns nothing.
+    app.get("/api/postmortems/:id", need("services", "read"), async (req, res) => {
+        const id = String(req.params.id ?? "");
+        const tenant = req.session?.tenant || "default";
+        const found = postmortemStore.get(id, tenant);
+        if (found) {
+            res.json(found);
+            return;
+        }
+        res.status(404).json({ error: `Postmortem ${id} not found` });
+    });
+    // POST /api/postmortems — regenerate via the tool handler +
+    // persist. Body: { service, duration? }. The tool is always invoked
+    // with format "json"; a `format` field in the body is not read.
+    // Returns 201 with the stored entry (including its id), 400 when
+    // `service` is missing or not a string, 500 on any failure.
+    app.post("/api/postmortems", need("services", "write"), async (req, res) => {
+        const { service, duration } = (req.body ?? {});
+        if (typeof service !== "string" || !service) {
+            res.status(400).json({ error: "service is required" });
+            return;
+        }
+        const session = req.session;
+        const tenant = session?.tenant || "default";
+        const createdBy = session?.sub || session?.name || "unknown";
+        try {
+            // JSON is forced so the tool returns the structured report
+            // rather than only the markdown body; the markdown still
+            // travels inside `report.markdown`.
+            const ctx = { ...defaultContext(), tenant, principalId: createdBy };
+            const toolArgs = { service, duration, format: "json" };
+            const result = await generatePostmortemHandler(registry, toolArgs, ctx);
+            const text = result?.content?.[0]?.text;
+            if (!text) {
+                res.status(500).json({ error: "generate_postmortem returned no content" });
+                return;
+            }
+            // A malformed payload throws here and is reported by the catch.
+            const stored = await postmortemStore.append({ report: JSON.parse(text), createdBy, tenant });
+            res.status(201).json(stored);
+        }
+        catch (e) {
+            console.warn(`[postmortems] regen failed:`, e);
+            res.status(500).json({ error: e?.message || "internal error" });
+        }
+    });
+    // DELETE /api/postmortems/:id — gated by need("services", "delete").
+    // Deletes within the caller's tenant; 204 on success, 404 when the
+    // store reports nothing was removed.
+    app.delete("/api/postmortems/:id", need("services", "delete"), async (req, res) => {
+        const tenant = req.session?.tenant || "default";
+        const removed = await postmortemStore.delete(String(req.params.id ?? ""), tenant);
+        if (removed) {
+            res.status(204).end();
+            return;
+        }
+        res.status(404).json({ error: `Postmortem ${req.params.id} not found` });
+    });
     // Connectors currently loaded into this server (builtin + filesystem
     // plugins), with manifest metadata — drives the UI "Connectors" page.
     app.get("/api/connectors", (_req, res) => {
@@ -2377,6 +2807,31 @@ async function main() {
             tools: filteredTools,
         });
     });
+    // Q21 — per-service anomaly-score sparklines for the Health tab. Reads
+    // the in-process ring of the anomaly-history sink (last hour), tenant-
+    // scoped. MUST be registered before "/api/health/:service" so the
+    // literal path isn't captured as a service name. `enabled` is true once
+    // any score exists; the UI falls back to its client-side trend otherwise.
+    //
+    // Response: { enabled, remoteWrite, windowMs, series } where `series`
+    // maps service name → [{ t, score }] with `t` in epoch milliseconds.
+    // Records whose `ts` does not parse to a finite time are skipped.
+    app.get("/api/health/anomaly-sparklines", (req, res) => {
+        const sess = req.session;
+        const callerTenant = sess?.tenant || "default";
+        // Anonymous (single-tenant) mode: no tenant filter, see everything.
+        const tenant = sess ? callerTenant : undefined;
+        const records = anomalyHistory.recent({ tenant });
+        // Null-prototype map: service names are data, so a name such as
+        // "constructor", "toString" or "__proto__" must not resolve to an
+        // inherited Object.prototype member (which would make `??=` skip
+        // the assignment and `.push` throw).
+        const series = Object.create(null);
+        for (const r of records) {
+            const t = Date.parse(r.ts);
+            if (!Number.isFinite(t))
+                continue;
+            (series[r.service] ??= []).push({ t, score: r.score });
+        }
+        res.json({
+            enabled: records.length > 0,
+            remoteWrite: anomalyHistory.isEnabled(),
+            windowMs: anomalyHistory.windowMs,
+            series,
+        });
+    });
     // Health endpoint for UI dashboard
     app.get("/api/health/:service", async (req, res) => {
         try {
@@ -2550,7 +3005,7 @@ async function main() {
     });
     // Stdio transport: one server over stdin/stdout, no HTTP listener.
     if (STDIO) {
-        const server = createMcpServer(defaultContext());
+        const { mcpServer: server } = createMcpServer(defaultContext());
         await server.connect(new StdioServerTransport());
         console.error(`observability-mcp running on stdio transport · connectors: ${registry
             .getAll()
@@ -2723,7 +3178,7 @@ async function main() {
                 }
                 mcpActiveSessions.set(transports.size);
             };
-            const sessionMcpServer = createMcpServer(ctx);
+            const { mcpServer: sessionMcpServer } = createMcpServer(ctx);
             await sessionMcpServer.connect(transport);
         }
         await transport.handleRequest(req, res, req.body);
@@ -2831,7 +3286,7 @@ async function main() {
                 }
                 mcpActiveSessions.set(transports.size);
             };
-            const sessionMcpServer = createMcpServer(ctx);
+            const { mcpServer: sessionMcpServer } = createMcpServer(ctx);
             await sessionMcpServer.connect(transport);
         }
         await transport.handleRequest(req, res, req.body);
@@ -2981,7 +3436,7 @@ async function main() {
         wss.handleUpgrade(req, socket, head, async (ws) => {
             try {
                 const transport = new WebSocketServerTransport(ws);
-                const sessionMcpServer = createMcpServer(auth.ctx);
+                const { mcpServer: sessionMcpServer } = createMcpServer(auth.ctx);
                 await sessionMcpServer.connect(transport);
             }
             catch (err) {