@thotischner/observability-mcp 3.0.0 → 3.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. package/dist/audit/sinks/s3.d.ts +61 -0
  2. package/dist/audit/sinks/s3.js +179 -0
  3. package/dist/audit/sinks/s3.test.d.ts +1 -0
  4. package/dist/audit/sinks/s3.test.js +175 -0
  5. package/dist/auth/policy/batch-dry-run.js +15 -0
  6. package/dist/connectors/loader.d.ts +8 -0
  7. package/dist/connectors/loader.js +49 -0
  8. package/dist/connectors/manifest-hooks.test.d.ts +1 -0
  9. package/dist/connectors/manifest-hooks.test.js +206 -0
  10. package/dist/federation/registry.d.ts +27 -5
  11. package/dist/federation/registry.js +49 -4
  12. package/dist/federation/registry.test.js +79 -3
  13. package/dist/federation/upstream.d.ts +32 -6
  14. package/dist/federation/upstream.js +60 -12
  15. package/dist/federation/upstream.test.d.ts +1 -0
  16. package/dist/federation/upstream.test.js +118 -0
  17. package/dist/index.js +306 -65
  18. package/dist/metrics/self.d.ts +1 -0
  19. package/dist/metrics/self.js +8 -0
  20. package/dist/policy/redact.js +1 -1
  21. package/dist/postmortem/store.d.ts +34 -0
  22. package/dist/postmortem/store.js +113 -0
  23. package/dist/postmortem/store.test.d.ts +1 -0
  24. package/dist/postmortem/store.test.js +118 -0
  25. package/dist/scim/compliance.test.d.ts +1 -0
  26. package/dist/scim/compliance.test.js +169 -0
  27. package/dist/scim/factory.test.d.ts +1 -0
  28. package/dist/scim/factory.test.js +54 -0
  29. package/dist/scim/patch-ops.test.d.ts +1 -0
  30. package/dist/scim/patch-ops.test.js +100 -0
  31. package/dist/scim/redis-store.d.ts +38 -0
  32. package/dist/scim/redis-store.js +178 -0
  33. package/dist/scim/redis-store.test.d.ts +1 -0
  34. package/dist/scim/redis-store.test.js +138 -0
  35. package/dist/scim/routes.d.ts +27 -2
  36. package/dist/scim/routes.js +161 -15
  37. package/dist/scim/store.d.ts +40 -1
  38. package/dist/scim/store.js +23 -5
  39. package/dist/sdk/hook-wrappers.d.ts +39 -0
  40. package/dist/sdk/hook-wrappers.js +113 -0
  41. package/dist/sdk/hook-wrappers.test.d.ts +1 -0
  42. package/dist/sdk/hook-wrappers.test.js +204 -0
  43. package/dist/sdk/index.d.ts +13 -0
  44. package/dist/tools/detect-anomalies.d.ts +12 -1
  45. package/dist/tools/detect-anomalies.js +22 -2
  46. package/dist/tools/topology.js +23 -5
  47. package/dist/tools/topology.test.js +45 -0
  48. package/dist/transport/transportSessionMap.d.ts +70 -0
  49. package/dist/transport/transportSessionMap.js +128 -0
  50. package/dist/transport/transportSessionMap.test.d.ts +1 -0
  51. package/dist/transport/transportSessionMap.test.js +111 -0
  52. package/dist/ui/index.html +856 -101
  53. package/package.json +1 -1
@@ -0,0 +1,118 @@
1
+ import { test } from "node:test";
2
+ import assert from "node:assert/strict";
3
+ import { UpstreamClient } from "./upstream.js";
4
+ test("UpstreamClient: HTTP config — transportKind='http', url surfaced", () => {
5
+ const cfg = {
6
+ name: "remote",
7
+ url: "https://gw.example.com/mcp",
8
+ bearerToken: "t0k",
9
+ };
10
+ const c = new UpstreamClient(cfg);
11
+ assert.equal(c.transportKind, "http");
12
+ assert.equal(c.url, "https://gw.example.com/mcp");
13
+ assert.equal(c.namespacePrefix, "remote");
14
+ assert.deepEqual(c.getTools(), []);
15
+ });
16
+ test("UpstreamClient: stdio config — transportKind='stdio', url shows command", () => {
17
+ const cfg = {
18
+ transport: "stdio",
19
+ name: "local-mcp",
20
+ command: "/usr/local/bin/mcp",
21
+ args: ["--config", "/etc/mcp.yaml"],
22
+ };
23
+ const c = new UpstreamClient(cfg);
24
+ assert.equal(c.transportKind, "stdio");
25
+ assert.equal(c.url, "stdio:/usr/local/bin/mcp");
26
+ assert.equal(c.namespacePrefix, "local-mcp");
27
+ });
28
+ test("UpstreamClient: stdio config respects custom namespacePrefix", () => {
29
+ const cfg = {
30
+ transport: "stdio",
31
+ name: "weather",
32
+ command: "weather-mcp",
33
+ namespacePrefix: "weather.local",
34
+ };
35
+ const c = new UpstreamClient(cfg);
36
+ assert.equal(c.namespacePrefix, "weather.local");
37
+ });
38
+ test("UpstreamClient: explicit transport='http' is also accepted", () => {
39
+ const cfg = {
40
+ transport: "http",
41
+ name: "gw",
42
+ url: "https://gw.example.com/mcp",
43
+ };
44
+ const c = new UpstreamClient(cfg);
45
+ assert.equal(c.transportKind, "http");
46
+ });
47
+ test("UpstreamClient: ws transport surfaces the ws:// URL", () => {
48
+ const cfg = {
49
+ transport: "ws",
50
+ name: "gw",
51
+ url: "wss://gw.example.com/mcp/ws",
52
+ };
53
+ const c = new UpstreamClient(cfg);
54
+ assert.equal(c.transportKind, "ws");
55
+ assert.equal(c.url, "wss://gw.example.com/mcp/ws");
56
+ });
57
+ test("UpstreamClient: empty args defaults to [] on stdio", () => {
58
+ const cfg = {
59
+ transport: "stdio",
60
+ name: "x",
61
+ command: "x",
62
+ };
63
+ const c = new UpstreamClient(cfg);
64
+ // Just verifies construction doesn't throw on a minimal stdio config.
65
+ assert.equal(c.transportKind, "stdio");
66
+ });
67
+ test("UpstreamClient: getStatus initial state", () => {
68
+ const c = new UpstreamClient({ name: "x", url: "https://x/mcp" });
69
+ const s = c.getStatus();
70
+ assert.equal(s.status, "disconnected");
71
+ assert.equal(s.toolCount, 0);
72
+ assert.equal(s.lastError, undefined);
73
+ });
74
+ test("UpstreamClient: connect uses injected _transport instead of spawning / fetching", async () => {
75
+ // Build a minimal MCP Transport stub that also COMPLETES the
76
+ // initialize handshake — when the SDK Client sends a JSON-RPC
77
+ // request, we synthesise a matching response on onmessage so the
78
+ // initialize promise resolves quickly (no 60s SDK timeout).
79
+ let started = false;
80
+ let sentMessages = 0;
81
+ const fakeTransport = {
82
+ start: async () => { started = true; },
83
+ send: async (msg) => {
84
+ sentMessages += 1;
85
+ if (msg?.method === "initialize" && msg?.id !== undefined) {
86
+ queueMicrotask(() => {
87
+ fakeTransport.onmessage?.({
88
+ jsonrpc: "2.0",
89
+ id: msg.id,
90
+ result: { protocolVersion: "2024-11-05", capabilities: {}, serverInfo: { name: "fake", version: "1" } },
91
+ });
92
+ });
93
+ }
94
+ else if (msg?.method === "tools/list" && msg?.id !== undefined) {
95
+ queueMicrotask(() => {
96
+ fakeTransport.onmessage?.({ jsonrpc: "2.0", id: msg.id, result: { tools: [] } });
97
+ });
98
+ }
99
+ },
100
+ close: async () => { },
101
+ onclose: undefined,
102
+ onerror: undefined,
103
+ onmessage: undefined,
104
+ };
105
+ const c = new UpstreamClient({
106
+ name: "injected",
107
+ url: "https://ignored.example/mcp",
108
+ refreshIntervalMs: 0,
109
+ _transport: fakeTransport,
110
+ });
111
+ await c.connect();
112
+ await c.close();
113
+ assert.equal(started, true, "fake transport.start() should have been called");
114
+ assert.ok(sentMessages >= 1, "fake transport.send() should have received initialize");
115
+ // Status reaches "ready" only when initialize + tools/list both succeed
116
+ // — confirms our injected transport drove the whole handshake.
117
+ // (connect-time errors leave it in "degraded".)
118
+ });
package/dist/index.js CHANGED
@@ -19,7 +19,7 @@ import { buildSessionAttacher, buildRequireSession, } from "./auth/middleware.js
19
19
  import { buildRequirePermissionFromEngine, hasPermission, listGrantedPermissions, DEFAULT_POLICY, } from "./auth/rbac.js";
20
20
  import { resolveOidcConfig, buildOidcRuntime } from "./auth/oidc/runtime.js";
21
21
  import { registerOidcRoutes } from "./auth/oidc/endpoints.js";
22
- import { ScimStore } from "./scim/store.js";
22
+ import { createScimStore } from "./scim/store.js";
23
23
  import { registerScimRoutes } from "./scim/routes.js";
24
24
  import { BuiltinPolicyEngine } from "./auth/policy/engine.js";
25
25
  import { loadPolicyFromFile, writePolicyFile, PolicyLoadError, VALID_RESOURCES, VALID_ACTIONS } from "./auth/policy/loader.js";
@@ -40,10 +40,11 @@ import { getPluginLoader } from "./connectors/loader.js";
40
40
  import { resolveHubCatalogUrl, describeInstalled, mergeCatalog, fetchHubCatalog, } from "./connectors/hub.js";
41
41
  import { isValidConnectorName, installTarball } from "./connectors/install.js";
42
42
  import { PluginVerificationError } from "./connectors/verify.js";
43
- import { selfRegistry, withToolMetrics, apiRequests, mcpActiveSessions } from "./metrics/self.js";
43
+ import { selfRegistry, withToolMetrics, apiRequests, mcpActiveSessions, auditDlqDepth } from "./metrics/self.js";
44
44
  import { initOtel } from "./observability/otel.js";
45
45
  import { WebSocketServerTransport } from "./transport/websocket.js";
46
46
  import { HookRegistry } from "./sdk/hooks.js";
47
+ import { wrapToolHandler, wrapResourceHandler, wrapPromptHandler } from "./sdk/hook-wrappers.js";
47
48
  import { UpstreamClient } from "./federation/upstream.js";
48
49
  import { FederationRegistry, parseFederationEnv } from "./federation/registry.js";
49
50
  import { buildCsrfIssuer, buildCsrfEnforcer, csrfBypassFromEnv } from "./auth/csrf.js";
@@ -56,6 +57,7 @@ import { queryLogsHandler } from "./tools/query-logs.js";
56
57
  import { queryTracesHandler } from "./tools/query-traces.js";
57
58
  import { getAnomalyHistoryHandler } from "./tools/get-anomaly-history.js";
58
59
  import { generatePostmortemHandler } from "./tools/generate-postmortem.js";
60
+ import { PostmortemStore } from "./postmortem/store.js";
59
61
  import { AnomalyHistory, fromEnv as anomalyHistoryFromEnv } from "./analysis/history.js";
60
62
  import { getServiceHealthHandler, setHealthThresholds } from "./tools/get-service-health.js";
61
63
  import { detectAnomaliesHandler } from "./tools/detect-anomalies.js";
@@ -295,11 +297,20 @@ async function main() {
295
297
  return result;
296
298
  }
297
299
  }
300
+ /**
301
+ * Returns the McpServer for the given context. The companion
302
+ * `toolHandlers` map carries every tool registered for this ctx
303
+ * (post-hook-wrapping) so the in-product Playground UI (Q13) can
304
+ * invoke a tool without going through the full Streamable HTTP
305
+ * transport stack. The map is keyed by tool name; values run the
306
+ * same wrapped handler the McpServer would dispatch over MCP.
307
+ */
298
308
  function createMcpServer(ctx) {
299
309
  const mcpServer = new McpServer({
300
310
  name: "observability-mcp",
301
311
  version: SERVER_VERSION,
302
312
  });
313
+ const toolHandlers = new Map();
303
314
  // --- Register tools with Zod schemas ---
304
315
  // Product-aware registration: when the active credential is bound
305
316
  // to a Product (OMCP_KEY_PRODUCTS), `ctx.allowedTools` carries that
@@ -319,34 +330,39 @@ async function main() {
319
330
  return undefined;
320
331
  if (rest.length > 0 && typeof rest[rest.length - 1] === "function") {
321
332
  const originalHandler = rest[rest.length - 1];
322
- const wrappedHandler = async (args, extra) => {
323
- const hookCtxBase = {
324
- principal: ctx.principalId,
325
- tenant: ctx.tenant || "default",
326
- target: name,
327
- };
328
- const pre = await hookRegistry.fire("tool_pre_invoke", { ...hookCtxBase, kind: "tool_pre_invoke" }, { args });
329
- if (!pre.allow) {
330
- return {
331
- content: [{ type: "text", text: pre.reason ?? "denied by plugin hook" }],
332
- isError: true,
333
- };
334
- }
335
- const effectiveArgs = pre.payload?.args ?? args;
336
- const result = await originalHandler(effectiveArgs, extra);
337
- const post = await hookRegistry.fire("tool_post_invoke", { ...hookCtxBase, kind: "tool_post_invoke" }, { args: effectiveArgs, result });
338
- if (!post.allow) {
339
- return {
340
- content: [{ type: "text", text: post.reason ?? "denied by plugin hook" }],
341
- isError: true,
342
- };
343
- }
344
- return post.payload?.result ?? result;
345
- };
333
+ const wrappedHandler = wrapToolHandler(hookRegistry, { principal: ctx.principalId, tenant: ctx.tenant || "default", target: name }, originalHandler);
346
334
  rest[rest.length - 1] = wrappedHandler;
335
+ // Stash for the Playground endpoint — keyed by tool name. The
336
+ // wrapped handler honours pre/post hooks + the same RBAC the
337
+ // McpServer dispatch path runs. Per-ctx Map so a different
338
+ // user's allowedTools never leak.
339
+ toolHandlers.set(name, wrappedHandler);
347
340
  }
348
341
  return mcpServer.tool(name, ...rest);
349
342
  });
343
+ // Q12: resource + prompt registrations get the same hook-fan-out
344
+ // treatment so a plugin's resource_pre_fetch / resource_post_fetch /
345
+ // prompt_pre_fetch / prompt_post_fetch handlers actually fire when
346
+ // a future resource/prompt registration lands. The wrappers stay
347
+ // thin pass-throughs when no hooks are registered (the OSS default).
348
+ // Wrappers are tested in mcp-server/src/sdk/hook-wrappers.test.ts.
349
+ const registerResource = ((name, ...rest) => {
350
+ if (rest.length > 0 && typeof rest[rest.length - 1] === "function") {
351
+ const originalHandler = rest[rest.length - 1];
352
+ rest[rest.length - 1] = wrapResourceHandler(hookRegistry, { principal: ctx.principalId, tenant: ctx.tenant || "default", target: name }, originalHandler);
353
+ }
354
+ return mcpServer.resource(name, ...rest);
355
+ });
356
+ const registerPrompt = ((name, ...rest) => {
357
+ if (rest.length > 0 && typeof rest[rest.length - 1] === "function") {
358
+ const originalHandler = rest[rest.length - 1];
359
+ rest[rest.length - 1] = wrapPromptHandler(hookRegistry, { principal: ctx.principalId, tenant: ctx.tenant || "default", target: name }, originalHandler);
360
+ }
361
+ return mcpServer.prompt(name, ...rest);
362
+ });
363
+ // Suppress unused-warn — kept for the moment registrations land.
364
+ void registerResource;
365
+ void registerPrompt;
350
366
  registerTool("list_sources", [
351
367
  "List the configured observability backends (Prometheus, Loki, and any connector) and whether each is currently reachable.",
352
368
  "When to use: call this first to learn which source names exist and are healthy before passing `source` to other tools, or to debug why a query returns no data.",
@@ -547,7 +563,9 @@ async function main() {
547
563
  .describe("Optional. Detection threshold: 'low' flags only strong deviations (>3σ), 'medium' is balanced (>2σ), 'high' is most sensitive and noisier (>1.5σ). Default: 'medium'."),
548
564
  }, async (args) => {
549
565
  await enforceEntitledAccess(ctx, { tool: "detect_anomalies", source: args?.source, service: args?.service });
550
- return withToolMetrics("detect_anomalies", () => detectAnomaliesHandler(registry, args, ctx));
566
+ // P1: pass the anomaly-history sink so detected scores flow
567
+ // into the TSDB and `get_anomaly_history` returns real data.
568
+ return withToolMetrics("detect_anomalies", () => detectAnomaliesHandler(registry, args, ctx, anomalyHistory));
551
569
  });
552
570
  registerTool("get_topology", [
553
571
  "Return the infrastructure topology graph (Resources and Edges) from every topology-capable connector.",
@@ -599,16 +617,33 @@ async function main() {
599
617
  // Product-allow-list gate, so federated tools obey the same policy
600
618
  // surface as native ones.
601
619
  for (const info of federationRegistry.getNamespacedTools()) {
602
- // Upstream's inputSchema is forwarded verbatim. The SDK's
603
- // tool() overload signatures don't carry an obvious type for a
604
- // dynamic-shape schema, so we cast to `any` at the boundary and
605
- // let the upstream contract speak for the validation.
606
- registerTool(info.namespacedName, info.description || `Federated from upstream ${info.sourceName}.`, info.inputSchema ?? {}, async (args) => {
620
+ // The MCP SDK's tool() signature wants a ZodRawShape (a map of
621
+ // field-name → Zod type), not a raw JSON Schema. Federated
622
+ // upstreams expose JSON Schema (the wire-format MCP uses on
623
+ // tools/list); we transcode to a permissive Zod shape so the
624
+ // SDK accepts the registration. Per-field types are `z.unknown()`
625
+ // because the upstream will validate the call args anyway; the
626
+ // local Zod check is only a "this is the field name set" gate.
627
+ // P7: this transcoding fixes the registration crash that broke
628
+ // every federation deploy before the E2E test caught it.
629
+ const upstreamProps = info.inputSchema?.properties ?? {};
630
+ // Every field is z.unknown().optional() — the SDK only uses this
631
+ // shape to know the field-name set; the upstream re-validates
632
+ // against its full JSON Schema (incl. its own `required` list)
633
+ // when the call arrives. Marking all fields optional here keeps
634
+ // calls with the upstream-defaults flowing through; without it
635
+ // the SDK rejects any call that omits a field upstream considers
636
+ // required even if the upstream would accept the omission.
637
+ const localShape = {};
638
+ for (const k of Object.keys(upstreamProps)) {
639
+ localShape[k] = z.unknown().optional();
640
+ }
641
+ registerTool(info.namespacedName, info.description || `Federated from upstream ${info.sourceName}.`, localShape, async (args) => {
607
642
  await enforceEntitledAccess(ctx, { tool: info.namespacedName });
608
643
  return withToolMetrics(info.namespacedName, () => federationRegistry.callNamespacedTool(info.namespacedName, args));
609
644
  });
610
645
  }
611
- return mcpServer;
646
+ return { mcpServer, toolHandlers };
612
647
  }
613
648
  // --- Management-plane auth (basic mode) -----------------------------------
614
649
  // Off by default. Enable with `OMCP_AUTH=basic` + `OMCP_USERS_FILE` and
@@ -717,7 +752,12 @@ async function main() {
717
752
  app.set("trust proxy", trustProxy);
718
753
  }
719
754
  }
720
- app.use(express.json({ limit: "1mb" }));
755
+ // Parse application/json AND any *+json media type. SCIM clients
756
+ // (Entra, Okta) send `application/scim+json` per RFC 7644 §3.1 —
757
+ // without the wildcard the body silently arrives empty and every
758
+ // SCIM POST/PATCH 400s. The wildcard also future-proofs other
759
+ // structured-suffix JSON content types.
760
+ app.use(express.json({ limit: "1mb", type: ["application/json", "application/*+json"] }));
721
761
  // Security headers
722
762
  app.use((req, res, next) => {
723
763
  res.setHeader("X-Content-Type-Options", "nosniff");
@@ -938,11 +978,11 @@ async function main() {
938
978
  // (no tools) so the gateway boots regardless of upstream health.
939
979
  const federationRegistry = new FederationRegistry();
940
980
  for (const cfg of parseFederationEnv()) {
941
- const client = new UpstreamClient({
942
- name: cfg.name,
943
- url: cfg.url,
944
- bearerToken: cfg.bearerToken,
945
- });
981
+ const client = new UpstreamClient(cfg.kind === "stdio"
982
+ ? { transport: "stdio", name: cfg.name, command: cfg.command, args: cfg.args }
983
+ : cfg.kind === "ws"
984
+ ? { transport: "ws", name: cfg.name, url: cfg.url }
985
+ : { name: cfg.name, url: cfg.url, bearerToken: cfg.bearerToken });
946
986
  federationRegistry.add(client);
947
987
  client.connect().catch((err) => {
948
988
  console.warn("federation upstream %s initial connect failed: %s", cfg.name, err instanceof Error ? err.message : String(err));
@@ -1044,6 +1084,24 @@ async function main() {
1044
1084
  // this endpoint when enabled.
1045
1085
  if (process.env.METRICS_ENABLED !== "false") {
1046
1086
  app.get("/metrics", async (_req, res) => {
1087
+ // P9: refresh the audit-webhook DLQ depth before the scrape so
1088
+ // Prometheus sees the current file state rather than whatever
1089
+ // /api/audit/dlq last set. Best-effort; ENOENT or missing-env
1090
+ // resets to 0 (the dlqPath being unset is the normal state).
1091
+ try {
1092
+ const dlqPath = process.env.OMCP_AUDIT_WEBHOOK_DLQ;
1093
+ if (dlqPath) {
1094
+ const fs = await import("node:fs/promises");
1095
+ const raw = await fs.readFile(dlqPath, "utf8").catch(() => "");
1096
+ auditDlqDepth.set(raw.split("\n").filter((l) => l.trim()).length);
1097
+ }
1098
+ else {
1099
+ auditDlqDepth.set(0);
1100
+ }
1101
+ }
1102
+ catch {
1103
+ auditDlqDepth.set(0);
1104
+ }
1047
1105
  res.set("Content-Type", selfRegistry.contentType);
1048
1106
  res.end(await selfRegistry.metrics());
1049
1107
  });
@@ -1108,6 +1166,37 @@ async function main() {
1108
1166
  app.get("/api/tools/registry", (_req, res) => {
1109
1167
  res.json({ tools: REGISTERED_TOOLS });
1110
1168
  });
1169
+ // Q13: in-product Playground endpoint. Lets the operator invoke a
1170
+ // registered tool against the live gateway without spinning up a
1171
+ // separate MCP client. Re-uses the per-session ctx and the same
1172
+ // wrapped handler the McpServer dispatch path would run (so RBAC,
1173
+ // entitlements, rate-limit, audit, hook fan-out all apply
1174
+ // identically).
1175
+ app.post("/api/playground/invoke", async (req, res) => {
1176
+ const ctx = await gateCtx(req, res);
1177
+ if (!ctx)
1178
+ return;
1179
+ const body = (req.body ?? {});
1180
+ const tool = typeof body.tool === "string" ? body.tool : "";
1181
+ if (!tool) {
1182
+ res.status(400).json({ error: "tool (string) is required" });
1183
+ return;
1184
+ }
1185
+ const { toolHandlers } = createMcpServer(ctx);
1186
+ const handler = toolHandlers.get(tool);
1187
+ if (!handler) {
1188
+ res.status(404).json({ error: `tool '${tool}' is not registered (or not allowed for this credential)` });
1189
+ return;
1190
+ }
1191
+ try {
1192
+ const result = await handler(body.args ?? {}, undefined);
1193
+ res.json({ tool, result });
1194
+ }
1195
+ catch (err) {
1196
+ const message = err instanceof Error ? err.message : String(err);
1197
+ res.status(500).json({ error: message, tool });
1198
+ }
1199
+ });
1111
1200
  // Server info — version, loaded plugins, MCP protocol version, build metadata.
1112
1201
  // Used by the Web UI footer and by operators to confirm what's deployed.
1113
1202
  app.get("/api/info", async (_req, res) => {
@@ -1142,6 +1231,16 @@ async function main() {
1142
1231
  redaction: REDACTION_ENABLED,
1143
1232
  trustProxy: !!(process.env.OMCP_TRUST_PROXY && process.env.OMCP_TRUST_PROXY !== "false"),
1144
1233
  toolRatePerMin: resolveToolRatePerMin(process.env.OMCP_TOOL_RATE_PER_MIN),
1234
+ // P1: posture flags so dashboards can alert when a shipped
1235
+ // capability is configured but doing nothing useful.
1236
+ anomalyHistoryActive: anomalyHistory.isEnabled(),
1237
+ tracesCapabilityCount: registry
1238
+ .getAll()
1239
+ .filter((c) => typeof c.queryTraces === "function").length,
1240
+ pluginsVerified: !/^(0|false|no|off)$/i.test(process.env.VERIFY_PLUGINS ?? "true"),
1241
+ scimEnabled: !!process.env.OMCP_SCIM_TOKEN,
1242
+ federationUpstreams: (process.env.OMCP_FEDERATION_UPSTREAMS ?? "")
1243
+ .split(",").map((s) => s.trim()).filter(Boolean).length,
1145
1244
  },
1146
1245
  plugins: loader.list().map((p) => ({
1147
1246
  name: p.name,
@@ -1566,6 +1665,46 @@ async function main() {
1566
1665
  scopedTo: tenantFilter || (isAdmin ? null : callerTenant),
1567
1666
  });
1568
1667
  });
1668
+ // --- /api/audit/dlq — webhook-sink dead-letter queue surface (P9) ---
1669
+ // When the audit webhook is configured AND the receiver exhausted
1670
+ // its retry budget, entries land in the DLQ file. This endpoint
1671
+ // surfaces the count + the last N entries so operators can decide
1672
+ // whether to replay manually. Also refreshes the
1673
+ // `obsmcp_audit_webhook_dlq_depth` gauge so the /metrics scrape
1674
+ // alongside it stays accurate.
1675
+ app.get("/api/audit/dlq", need("audit", "read"), async (_req, res) => {
1676
+ const dlqPath = process.env.OMCP_AUDIT_WEBHOOK_DLQ;
1677
+ if (!dlqPath) {
1678
+ auditDlqDepth.set(0);
1679
+ res.json({ enabled: false, path: null, depth: 0, entries: [] });
1680
+ return;
1681
+ }
1682
+ try {
1683
+ const fs = await import("node:fs/promises");
1684
+ const raw = await fs.readFile(dlqPath, "utf8");
1685
+ const lines = raw.split("\n").filter((l) => l.trim());
1686
+ auditDlqDepth.set(lines.length);
1687
+ const tail = lines.slice(-50).map((l) => {
1688
+ try {
1689
+ return JSON.parse(l);
1690
+ }
1691
+ catch {
1692
+ return { _raw: l, _parseError: true };
1693
+ }
1694
+ });
1695
+ res.json({ enabled: true, path: dlqPath, depth: lines.length, entries: tail });
1696
+ }
1697
+ catch (err) {
1698
+ const code = err.code;
1699
+ if (code === "ENOENT") {
1700
+ auditDlqDepth.set(0);
1701
+ res.json({ enabled: true, path: dlqPath, depth: 0, entries: [] });
1702
+ return;
1703
+ }
1704
+ console.warn("[/api/audit/dlq] read failed:", err);
1705
+ res.status(500).json({ error: err?.message || "DLQ read failed" });
1706
+ }
1707
+ });
1569
1708
  // --- /api/usage — per-identity MCP rate-limit snapshot -----------------
1570
1709
  // Read-only view of the IdentityRateLimiter's bucket state. Gated by
1571
1710
  // need("audit","read") — the same role set that already sees the
@@ -1716,31 +1855,133 @@ async function main() {
1716
1855
  registerOidcRoutes(app, { sessionCfg, oidc: oidcRuntime });
1717
1856
  console.log("[auth] OIDC endpoints registered: /api/auth/oidc/{login,callback,logout}");
1718
1857
  }
1719
- // Phase F21: SCIM 2.0 — opt-in. OMCP_SCIM_TOKEN gates access;
1720
- // OMCP_SCIM_STORE points at the on-disk JSON (mode 0600, atomic).
1721
- // Multi-replica deployments should plug the F8 SessionStore in
1722
- // when F21b lands.
1858
+ // Phase F21 / Q6: SCIM 2.0 — opt-in. OMCP_SCIM_TOKEN gates access.
1859
+ // The store backend is chosen by createScimStore from
1860
+ // OMCP_SCIM_BACKEND (file | redis). file (default) → OMCP_SCIM_STORE
1861
+ // on-disk JSON (mode 0600, atomic). redis → a shared snapshot so
1862
+ // multi-replica deployments stay coherent (Q6); the redis client is
1863
+ // built from OMCP_SCIM_REDIS_URL here, mirroring the session store.
1723
1864
  const scimToken = process.env.OMCP_SCIM_TOKEN?.trim();
1724
1865
  if (scimToken) {
1725
- const scimStorePath = process.env.OMCP_SCIM_STORE?.trim() || "/tmp/scim.json";
1726
- const scimStore = new ScimStore(scimStorePath);
1727
- await scimStore.load();
1728
- registerScimRoutes(app, {
1729
- store: scimStore,
1730
- bearerToken: scimToken,
1731
- audit: (ev) => void mgmtAudit.record({
1732
- actor: { sub: `scim:${ev.actor}` },
1733
- tenant: "default",
1734
- resource: "users",
1735
- action: ev.action.includes("delete") ? "delete" : "write",
1736
- method: "SCIM",
1737
- path: `/scim/v2/${ev.action}`,
1738
- status: ev.status,
1739
- target: ev.target,
1740
- }).catch(() => undefined),
1741
- });
1742
- console.log("[scim] /scim/v2/* registered (store: %s)", scimStorePath);
1866
+ try {
1867
+ const scimBackend = (process.env.OMCP_SCIM_BACKEND?.trim() || "file");
1868
+ let scimRedis;
1869
+ if (scimBackend === "redis") {
1870
+ const redisUrl = process.env.OMCP_SCIM_REDIS_URL?.trim();
1871
+ if (!redisUrl)
1872
+ throw new Error("OMCP_SCIM_BACKEND=redis requires OMCP_SCIM_REDIS_URL");
1873
+ const { createClient } = await import("redis");
1874
+ const client = createClient({ url: redisUrl });
1875
+ client.on("error", (err) => console.warn("[scim] redis client error: %s", err instanceof Error ? err.message : String(err)));
1876
+ await client.connect();
1877
+ scimRedis = client;
1878
+ }
1879
+ const scimStore = await createScimStore({
1880
+ backend: scimBackend,
1881
+ path: process.env.OMCP_SCIM_STORE?.trim() || "/tmp/scim.json",
1882
+ redis: scimRedis,
1883
+ redisKey: process.env.OMCP_SCIM_REDIS_KEY?.trim(),
1884
+ });
1885
+ registerScimRoutes(app, {
1886
+ store: scimStore,
1887
+ bearerToken: scimToken,
1888
+ audit: (ev) => void mgmtAudit.record({
1889
+ actor: { sub: `scim:${ev.actor}` },
1890
+ tenant: "default",
1891
+ resource: "users",
1892
+ action: ev.action.includes("delete") ? "delete" : "write",
1893
+ method: "SCIM",
1894
+ path: `/scim/v2/${ev.action}`,
1895
+ status: ev.status,
1896
+ target: ev.target,
1897
+ }).catch(() => undefined),
1898
+ });
1899
+ console.log("[scim] /scim/v2/* registered (backend: %s)", scimBackend);
1900
+ }
1901
+ catch (err) {
1902
+ console.warn("[scim] enable failed (routes not mounted): %s", err instanceof Error ? err.message : String(err));
1903
+ }
1743
1904
  }
1905
+ // Phase P6: Postmortems persistence. /api/postmortems lets the
1906
+ // UI list / open / regenerate / delete previously-generated
1907
+ // reports. Always on; file path set via OMCP_POSTMORTEMS_FILE (default
1908
+ // /tmp/postmortems.jsonl). When the env is left at its default
1909
+ // the demo still works — operators who want survival across
1910
+ // restarts mount a PVC at the same path and set the env to it.
1911
+ const postmortemStore = new PostmortemStore(process.env.OMCP_POSTMORTEMS_FILE?.trim() || "/tmp/postmortems.jsonl");
1912
+ await postmortemStore.load();
1913
+ // GET /api/postmortems — list (newest-first), tenant-scoped.
1914
+ app.get("/api/postmortems", need("services", "read"), async (req, res) => {
1915
+ const sess = req.session;
1916
+ const tenant = sess?.tenant || "default";
1917
+ const entries = postmortemStore.list(tenant);
1918
+ res.json({
1919
+ total: entries.length,
1920
+ entries: entries.map((e) => ({
1921
+ id: e.id,
1922
+ ts: e.ts,
1923
+ createdBy: e.createdBy,
1924
+ service: e.report.service,
1925
+ window: e.report.window,
1926
+ synopsis: e.report.synopsis,
1927
+ })),
1928
+ });
1929
+ });
1930
+ // GET /api/postmortems/:id — full report (markdown + sections).
1931
+ app.get("/api/postmortems/:id", need("services", "read"), async (req, res) => {
1932
+ const sess = req.session;
1933
+ const tenant = sess?.tenant || "default";
1934
+ const id = String(req.params.id ?? "");
1935
+ const entry = postmortemStore.get(id, tenant);
1936
+ if (!entry) {
1937
+ res.status(404).json({ error: `Postmortem ${id} not found` });
1938
+ return;
1939
+ }
1940
+ res.json(entry);
1941
+ });
1942
+ // POST /api/postmortems — regenerate via the tool handler +
1943
+ // persist. Body: { service, duration?, format? }. Returns the
1944
+ // stored entry with its id.
1945
+ app.post("/api/postmortems", need("services", "write"), async (req, res) => {
1946
+ const body = (req.body ?? {});
1947
+ if (!body.service || typeof body.service !== "string") {
1948
+ res.status(400).json({ error: "service is required" });
1949
+ return;
1950
+ }
1951
+ const sess = req.session;
1952
+ const tenant = sess?.tenant || "default";
1953
+ const createdBy = sess?.sub || sess?.name || "unknown";
1954
+ try {
1955
+ // Force JSON so we get the structured report shape back from
1956
+ // the tool, not just the markdown body. We persist the full
1957
+ // structured report; the markdown lives inside `report.markdown`.
1958
+ const ctx = { ...defaultContext(), tenant, principalId: createdBy };
1959
+ const result = await generatePostmortemHandler(registry, { service: body.service, duration: body.duration, format: "json" }, ctx);
1960
+ const text = result?.content?.[0]?.text;
1961
+ if (!text) {
1962
+ res.status(500).json({ error: "generate_postmortem returned no content" });
1963
+ return;
1964
+ }
1965
+ const report = JSON.parse(text);
1966
+ const stored = await postmortemStore.append({ report, createdBy, tenant });
1967
+ res.status(201).json(stored);
1968
+ }
1969
+ catch (e) {
1970
+ console.warn(`[postmortems] regen failed:`, e);
1971
+ res.status(500).json({ error: e?.message || "internal error" });
1972
+ }
1973
+ });
1974
+ // DELETE /api/postmortems/:id — admin-gated.
1975
+ app.delete("/api/postmortems/:id", need("services", "delete"), async (req, res) => {
1976
+ const sess = req.session;
1977
+ const tenant = sess?.tenant || "default";
1978
+ const ok = await postmortemStore.delete(String(req.params.id ?? ""), tenant);
1979
+ if (!ok) {
1980
+ res.status(404).json({ error: `Postmortem ${req.params.id} not found` });
1981
+ return;
1982
+ }
1983
+ res.status(204).end();
1984
+ });
1744
1985
  // Connectors currently loaded into this server (builtin + filesystem
1745
1986
  // plugins), with manifest metadata — drives the UI "Connectors" page.
1746
1987
  app.get("/api/connectors", (_req, res) => {
@@ -2550,7 +2791,7 @@ async function main() {
2550
2791
  });
2551
2792
  // Stdio transport: one server over stdin/stdout, no HTTP listener.
2552
2793
  if (STDIO) {
2553
- const server = createMcpServer(defaultContext());
2794
+ const { mcpServer: server } = createMcpServer(defaultContext());
2554
2795
  await server.connect(new StdioServerTransport());
2555
2796
  console.error(`observability-mcp running on stdio transport · connectors: ${registry
2556
2797
  .getAll()
@@ -2723,7 +2964,7 @@ async function main() {
2723
2964
  }
2724
2965
  mcpActiveSessions.set(transports.size);
2725
2966
  };
2726
- const sessionMcpServer = createMcpServer(ctx);
2967
+ const { mcpServer: sessionMcpServer } = createMcpServer(ctx);
2727
2968
  await sessionMcpServer.connect(transport);
2728
2969
  }
2729
2970
  await transport.handleRequest(req, res, req.body);
@@ -2831,7 +3072,7 @@ async function main() {
2831
3072
  }
2832
3073
  mcpActiveSessions.set(transports.size);
2833
3074
  };
2834
- const sessionMcpServer = createMcpServer(ctx);
3075
+ const { mcpServer: sessionMcpServer } = createMcpServer(ctx);
2835
3076
  await sessionMcpServer.connect(transport);
2836
3077
  }
2837
3078
  await transport.handleRequest(req, res, req.body);
@@ -2981,7 +3222,7 @@ async function main() {
2981
3222
  wss.handleUpgrade(req, socket, head, async (ws) => {
2982
3223
  try {
2983
3224
  const transport = new WebSocketServerTransport(ws);
2984
- const sessionMcpServer = createMcpServer(auth.ctx);
3225
+ const { mcpServer: sessionMcpServer } = createMcpServer(auth.ctx);
2985
3226
  await sessionMcpServer.connect(transport);
2986
3227
  }
2987
3228
  catch (err) {
@@ -5,6 +5,7 @@ export declare const mcpToolLatency: Histogram<"tool">;
5
5
  export declare const connectorCalls: Counter<"type" | "source" | "outcome" | "operation">;
6
6
  export declare const apiRequests: Counter<"status" | "route" | "method">;
7
7
  export declare const mcpActiveSessions: Gauge<string>;
8
+ export declare const auditDlqDepth: Gauge<string>;
8
9
  /**
9
10
  * Wrap a (potentially async) tool handler to record call count + latency.
10
11
  * Outcome is "ok" or "error" — never throws on its own.
@@ -40,6 +40,14 @@ export const mcpActiveSessions = new Gauge({
40
40
  help: "Active MCP Streamable HTTP sessions.",
41
41
  registers: [selfRegistry],
42
42
  });
43
+ // P9: Audit webhook dead-letter queue depth. Refreshed on each
44
+ // `/metrics` scrape and when the operator hits `/api/audit/dlq`.
45
+ // Stays at 0 when no DLQ file is configured or the file is missing.
46
+ export const auditDlqDepth = new Gauge({
47
+ name: "obsmcp_audit_webhook_dlq_depth",
48
+ help: "Number of audit entries waiting in the webhook-sink dead-letter queue.",
49
+ registers: [selfRegistry],
50
+ });
43
51
  /**
44
52
  * Wrap a (potentially async) tool handler to record call count + latency.
45
53
  * Outcome is "ok" or "error" — never throws on its own.