@thotischner/observability-mcp 1.8.1 → 3.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (204)
  1. package/dist/analysis/history.d.ts +70 -0
  2. package/dist/analysis/history.js +170 -0
  3. package/dist/analysis/history.test.d.ts +1 -0
  4. package/dist/analysis/history.test.js +141 -0
  5. package/dist/audit/log.d.ts +9 -0
  6. package/dist/audit/log.js +20 -0
  7. package/dist/audit/redaction-bypass.d.ts +67 -0
  8. package/dist/audit/redaction-bypass.js +64 -0
  9. package/dist/audit/redaction-bypass.test.d.ts +1 -0
  10. package/dist/audit/redaction-bypass.test.js +72 -0
  11. package/dist/audit/sinks/s3.d.ts +61 -0
  12. package/dist/audit/sinks/s3.js +179 -0
  13. package/dist/audit/sinks/s3.test.d.ts +1 -0
  14. package/dist/audit/sinks/s3.test.js +175 -0
  15. package/dist/audit/sinks/types.d.ts +18 -0
  16. package/dist/audit/sinks/types.js +1 -0
  17. package/dist/audit/sinks/webhook.d.ts +45 -0
  18. package/dist/audit/sinks/webhook.js +111 -0
  19. package/dist/audit/sinks/webhook.test.d.ts +1 -0
  20. package/dist/audit/sinks/webhook.test.js +162 -0
  21. package/dist/auth/credentials.d.ts +11 -0
  22. package/dist/auth/credentials.js +27 -0
  23. package/dist/auth/credentials.test.js +21 -1
  24. package/dist/auth/csrf.d.ts +26 -0
  25. package/dist/auth/csrf.js +128 -0
  26. package/dist/auth/csrf.test.d.ts +1 -0
  27. package/dist/auth/csrf.test.js +143 -0
  28. package/dist/auth/local-users.d.ts +6 -0
  29. package/dist/auth/local-users.js +11 -0
  30. package/dist/auth/local-users.test.js +41 -0
  31. package/dist/auth/middleware.d.ts +7 -6
  32. package/dist/auth/oidc/dcr.d.ts +70 -0
  33. package/dist/auth/oidc/dcr.js +160 -0
  34. package/dist/auth/oidc/dcr.test.d.ts +1 -0
  35. package/dist/auth/oidc/dcr.test.js +109 -0
  36. package/dist/auth/oidc/endpoints.js +44 -0
  37. package/dist/auth/oidc/profiles.d.ts +22 -0
  38. package/dist/auth/oidc/profiles.js +95 -0
  39. package/dist/auth/oidc/profiles.test.d.ts +1 -0
  40. package/dist/auth/oidc/profiles.test.js +51 -0
  41. package/dist/auth/oidc/runtime.d.ts +3 -0
  42. package/dist/auth/oidc/runtime.js +16 -3
  43. package/dist/auth/oidc/runtime.test.js +1 -0
  44. package/dist/auth/policy/batch-dry-run.d.ts +56 -0
  45. package/dist/auth/policy/batch-dry-run.js +144 -0
  46. package/dist/auth/policy/batch-dry-run.test.d.ts +1 -0
  47. package/dist/auth/policy/batch-dry-run.test.js +140 -0
  48. package/dist/auth/policy/engine.d.ts +20 -4
  49. package/dist/auth/policy/engine.js +16 -2
  50. package/dist/auth/policy/loader.d.ts +11 -1
  51. package/dist/auth/policy/loader.js +37 -0
  52. package/dist/auth/policy/loader.test.d.ts +1 -0
  53. package/dist/auth/policy/loader.test.js +86 -0
  54. package/dist/auth/policy/opa.d.ts +5 -5
  55. package/dist/auth/policy/opa.js +25 -14
  56. package/dist/auth/policy/opa.test.js +48 -0
  57. package/dist/auth/rbac.d.ts +23 -1
  58. package/dist/auth/rbac.js +43 -1
  59. package/dist/auth/rbac.test.js +62 -0
  60. package/dist/cli/index.js +3 -0
  61. package/dist/cli/inspector-config.d.ts +9 -0
  62. package/dist/cli/inspector-config.js +28 -0
  63. package/dist/cli/inspector-config.test.d.ts +1 -0
  64. package/dist/cli/inspector-config.test.js +33 -0
  65. package/dist/cli/lib.d.ts +1 -1
  66. package/dist/cli/lib.js +1 -0
  67. package/dist/conformance/mcp-2025-11-25.test.d.ts +1 -0
  68. package/dist/conformance/mcp-2025-11-25.test.js +206 -0
  69. package/dist/connectors/interface.d.ts +5 -1
  70. package/dist/connectors/loader.d.ts +8 -0
  71. package/dist/connectors/loader.js +55 -4
  72. package/dist/connectors/loader.test.d.ts +1 -0
  73. package/dist/connectors/loader.test.js +78 -0
  74. package/dist/connectors/manifest-hooks.test.d.ts +1 -0
  75. package/dist/connectors/manifest-hooks.test.js +206 -0
  76. package/dist/connectors/prometheus.test.js +31 -13
  77. package/dist/connectors/registry.d.ts +13 -0
  78. package/dist/connectors/registry.js +30 -0
  79. package/dist/connectors/registry.test.js +56 -2
  80. package/dist/context.d.ts +32 -0
  81. package/dist/context.js +35 -0
  82. package/dist/context.test.d.ts +1 -0
  83. package/dist/context.test.js +58 -0
  84. package/dist/federation/registry.d.ts +54 -0
  85. package/dist/federation/registry.js +122 -0
  86. package/dist/federation/registry.test.d.ts +1 -0
  87. package/dist/federation/registry.test.js +206 -0
  88. package/dist/federation/upstream.d.ts +86 -0
  89. package/dist/federation/upstream.js +162 -0
  90. package/dist/federation/upstream.test.d.ts +1 -0
  91. package/dist/federation/upstream.test.js +118 -0
  92. package/dist/index.js +1435 -126
  93. package/dist/metrics/self.d.ts +1 -0
  94. package/dist/metrics/self.js +8 -0
  95. package/dist/middleware/ssrfGuard.d.ts +15 -0
  96. package/dist/middleware/ssrfGuard.js +103 -0
  97. package/dist/middleware/ssrfGuard.test.d.ts +1 -0
  98. package/dist/middleware/ssrfGuard.test.js +81 -0
  99. package/dist/observability/otel.d.ts +20 -0
  100. package/dist/observability/otel.js +118 -0
  101. package/dist/observability/otel.test.d.ts +1 -0
  102. package/dist/observability/otel.test.js +56 -0
  103. package/dist/openapi.js +215 -7
  104. package/dist/openapi.test.js +34 -0
  105. package/dist/policy/redact.js +1 -1
  106. package/dist/postmortem/store.d.ts +34 -0
  107. package/dist/postmortem/store.js +113 -0
  108. package/dist/postmortem/store.test.d.ts +1 -0
  109. package/dist/postmortem/store.test.js +118 -0
  110. package/dist/postmortem/synthesizer.d.ts +83 -0
  111. package/dist/postmortem/synthesizer.js +205 -0
  112. package/dist/postmortem/synthesizer.test.d.ts +1 -0
  113. package/dist/postmortem/synthesizer.test.js +141 -0
  114. package/dist/products/loader.d.ts +31 -3
  115. package/dist/products/loader.js +77 -4
  116. package/dist/products/loader.test.js +90 -1
  117. package/dist/quota/charge.d.ts +28 -0
  118. package/dist/quota/charge.js +30 -0
  119. package/dist/quota/charge.test.d.ts +1 -0
  120. package/dist/quota/charge.test.js +83 -0
  121. package/dist/quota/limiter.d.ts +29 -4
  122. package/dist/quota/limiter.js +64 -8
  123. package/dist/quota/limiter.test.js +86 -0
  124. package/dist/scim/compliance.test.d.ts +1 -0
  125. package/dist/scim/compliance.test.js +169 -0
  126. package/dist/scim/factory.test.d.ts +1 -0
  127. package/dist/scim/factory.test.js +54 -0
  128. package/dist/scim/group-role-map.d.ts +4 -0
  129. package/dist/scim/group-role-map.js +33 -0
  130. package/dist/scim/group-role-map.test.d.ts +1 -0
  131. package/dist/scim/group-role-map.test.js +33 -0
  132. package/dist/scim/patch-ops.test.d.ts +1 -0
  133. package/dist/scim/patch-ops.test.js +100 -0
  134. package/dist/scim/redis-store.d.ts +38 -0
  135. package/dist/scim/redis-store.js +178 -0
  136. package/dist/scim/redis-store.test.d.ts +1 -0
  137. package/dist/scim/redis-store.test.js +138 -0
  138. package/dist/scim/routes.d.ts +40 -0
  139. package/dist/scim/routes.js +395 -0
  140. package/dist/scim/store.d.ts +76 -0
  141. package/dist/scim/store.js +196 -0
  142. package/dist/scim/store.test.d.ts +1 -0
  143. package/dist/scim/store.test.js +121 -0
  144. package/dist/scim/types.d.ts +73 -0
  145. package/dist/scim/types.js +29 -0
  146. package/dist/sdk/hook-wrappers.d.ts +39 -0
  147. package/dist/sdk/hook-wrappers.js +113 -0
  148. package/dist/sdk/hook-wrappers.test.d.ts +1 -0
  149. package/dist/sdk/hook-wrappers.test.js +204 -0
  150. package/dist/sdk/hooks.d.ts +77 -0
  151. package/dist/sdk/hooks.js +72 -0
  152. package/dist/sdk/hooks.test.d.ts +1 -0
  153. package/dist/sdk/hooks.test.js +159 -0
  154. package/dist/sdk/index.d.ts +15 -0
  155. package/dist/sdk/index.js +1 -0
  156. package/dist/sdk/manifest-schema.d.ts +17 -0
  157. package/dist/sdk/manifest-schema.js +21 -0
  158. package/dist/tools/context-seam.test.js +6 -1
  159. package/dist/tools/detect-anomalies.d.ts +12 -1
  160. package/dist/tools/detect-anomalies.js +26 -5
  161. package/dist/tools/generate-postmortem.d.ts +35 -0
  162. package/dist/tools/generate-postmortem.js +191 -0
  163. package/dist/tools/get-anomaly-history.d.ts +35 -0
  164. package/dist/tools/get-anomaly-history.js +126 -0
  165. package/dist/tools/get-service-health.d.ts +1 -1
  166. package/dist/tools/get-service-health.js +4 -3
  167. package/dist/tools/list-services.d.ts +1 -1
  168. package/dist/tools/list-services.js +3 -2
  169. package/dist/tools/list-sources.d.ts +1 -1
  170. package/dist/tools/list-sources.js +6 -2
  171. package/dist/tools/query-logs.d.ts +1 -1
  172. package/dist/tools/query-logs.js +2 -2
  173. package/dist/tools/query-metrics.d.ts +1 -1
  174. package/dist/tools/query-metrics.js +19 -6
  175. package/dist/tools/query-traces.d.ts +47 -0
  176. package/dist/tools/query-traces.js +145 -0
  177. package/dist/tools/query-traces.test.d.ts +1 -0
  178. package/dist/tools/query-traces.test.js +110 -0
  179. package/dist/tools/registry-names.d.ts +35 -0
  180. package/dist/tools/registry-names.js +54 -0
  181. package/dist/tools/registry-names.test.d.ts +1 -0
  182. package/dist/tools/registry-names.test.js +61 -0
  183. package/dist/tools/topology.d.ts +3 -3
  184. package/dist/tools/topology.js +33 -11
  185. package/dist/tools/topology.test.js +45 -0
  186. package/dist/topology/merge.d.ts +22 -0
  187. package/dist/topology/merge.js +178 -0
  188. package/dist/topology/merge.test.d.ts +1 -0
  189. package/dist/topology/merge.test.js +110 -0
  190. package/dist/transport/sessionStore.d.ts +66 -0
  191. package/dist/transport/sessionStore.js +138 -0
  192. package/dist/transport/sessionStore.test.d.ts +1 -0
  193. package/dist/transport/sessionStore.test.js +118 -0
  194. package/dist/transport/transportSessionMap.d.ts +70 -0
  195. package/dist/transport/transportSessionMap.js +128 -0
  196. package/dist/transport/transportSessionMap.test.d.ts +1 -0
  197. package/dist/transport/transportSessionMap.test.js +111 -0
  198. package/dist/transport/websocket.d.ts +35 -0
  199. package/dist/transport/websocket.js +133 -0
  200. package/dist/transport/websocket.test.d.ts +1 -0
  201. package/dist/transport/websocket.test.js +124 -0
  202. package/dist/types.d.ts +51 -0
  203. package/dist/ui/index.html +2529 -145
  204. package/package.json +13 -3
package/dist/index.js CHANGED
@@ -9,35 +9,56 @@ import { z } from "zod";
9
9
  import { loadConfig, saveConfig, DEFAULT_HEALTH_THRESHOLDS, DEFAULT_SETTINGS } from "./config/loader.js";
10
10
  import { ConnectorRegistry, getSupportedTypes } from "./connectors/registry.js";
11
11
  import { isTopologyProvider } from "./connectors/interface.js";
12
- import { defaultContext, principalContext } from "./context.js";
12
+ import { defaultContext, principalContext, sessionContext, allowsTool } from "./context.js";
13
+ import { parseKeyTenants } from "./tenancy/context.js";
13
14
  import { enforceEntitledAccess, enterpriseGateStatus, enterpriseGateInfo, enterprisePolicyView, enterpriseCatalogView, enterpriseAuditTail, authorizeAdmin, updateRbacPolicy, updateCatalog, } from "./enterprise-gate.js";
14
15
  import { loadCredentials, credentialsConfigured, extractToken, resolveToken, } from "./auth/credentials.js";
15
16
  import { issueSession, setCookieHeader, clearCookieHeader, generateSecret, } from "./auth/session.js";
16
- import { readUsersFile, authenticate, } from "./auth/local-users.js";
17
+ import { readUsersFile, writeUsersFile, authenticate, } from "./auth/local-users.js";
17
18
  import { buildSessionAttacher, buildRequireSession, } from "./auth/middleware.js";
18
- import { buildRequirePermission, hasPermission, listGrantedPermissions, DEFAULT_POLICY, } from "./auth/rbac.js";
19
+ import { buildRequirePermissionFromEngine, hasPermission, listGrantedPermissions, DEFAULT_POLICY, } from "./auth/rbac.js";
19
20
  import { resolveOidcConfig, buildOidcRuntime } from "./auth/oidc/runtime.js";
20
21
  import { registerOidcRoutes } from "./auth/oidc/endpoints.js";
22
+ import { createScimStore } from "./scim/store.js";
23
+ import { registerScimRoutes } from "./scim/routes.js";
21
24
  import { BuiltinPolicyEngine } from "./auth/policy/engine.js";
22
- import { loadPolicyFromFile, PolicyLoadError, VALID_RESOURCES, VALID_ACTIONS } from "./auth/policy/loader.js";
25
+ import { loadPolicyFromFile, writePolicyFile, PolicyLoadError, VALID_RESOURCES, VALID_ACTIONS } from "./auth/policy/loader.js";
23
26
  import { OpaPolicyEngine } from "./auth/policy/opa.js";
27
+ import { evaluateBatch, batchResultToCsv } from "./auth/policy/batch-dry-run.js";
24
28
  import { AuditLog } from "./audit/log.js";
25
29
  import { buildAuditMiddleware } from "./audit/middleware.js";
30
+ import { WebhookSink } from "./audit/sinks/webhook.js";
31
+ import { buildBypassBreadcrumb, buildBypassAuditParams } from "./audit/redaction-bypass.js";
26
32
  import { readCatalogFile, CatalogStore } from "./catalog/loader.js";
27
33
  import { readProductsFile, ProductsStore, validateProduct, writeProductsFile, ProductsLoadError } from "./products/loader.js";
34
+ import { REGISTERED_TOOL_NAMES, REGISTERED_TOOLS, unknownToolNames } from "./tools/registry-names.js";
28
35
  import { redactValue } from "./policy/redact.js";
29
- import { IdentityRateLimiter, resolveToolRatePerMin } from "./quota/limiter.js";
36
+ import { IdentityRateLimiter, resolveToolRatePerMin, parseKeyRateLimits } from "./quota/limiter.js";
30
37
  import { TokenBudget, estimateTokensFor, resolveDailyTokenLimit } from "./quota/token-budget.js";
38
+ import { applyBudgetDecision } from "./quota/charge.js";
31
39
  import { getPluginLoader } from "./connectors/loader.js";
32
40
  import { resolveHubCatalogUrl, describeInstalled, mergeCatalog, fetchHubCatalog, } from "./connectors/hub.js";
33
41
  import { isValidConnectorName, installTarball } from "./connectors/install.js";
34
42
  import { PluginVerificationError } from "./connectors/verify.js";
35
- import { selfRegistry, withToolMetrics, apiRequests, mcpActiveSessions } from "./metrics/self.js";
43
+ import { selfRegistry, withToolMetrics, apiRequests, mcpActiveSessions, auditDlqDepth } from "./metrics/self.js";
44
+ import { initOtel } from "./observability/otel.js";
45
+ import { WebSocketServerTransport } from "./transport/websocket.js";
46
+ import { HookRegistry } from "./sdk/hooks.js";
47
+ import { wrapToolHandler, wrapResourceHandler, wrapPromptHandler } from "./sdk/hook-wrappers.js";
48
+ import { UpstreamClient } from "./federation/upstream.js";
49
+ import { FederationRegistry, parseFederationEnv } from "./federation/registry.js";
50
+ import { buildCsrfIssuer, buildCsrfEnforcer, csrfBypassFromEnv } from "./auth/csrf.js";
51
+ import { checkOutboundUrl, ssrfGuardFromEnv } from "./middleware/ssrfGuard.js";
36
52
  import { buildOpenApiSpec } from "./openapi.js";
37
53
  import { listSourcesHandler } from "./tools/list-sources.js";
38
54
  import { listServicesHandler } from "./tools/list-services.js";
39
55
  import { queryMetricsHandler } from "./tools/query-metrics.js";
40
56
  import { queryLogsHandler } from "./tools/query-logs.js";
57
+ import { queryTracesHandler } from "./tools/query-traces.js";
58
+ import { getAnomalyHistoryHandler } from "./tools/get-anomaly-history.js";
59
+ import { generatePostmortemHandler } from "./tools/generate-postmortem.js";
60
+ import { PostmortemStore } from "./postmortem/store.js";
61
+ import { AnomalyHistory, fromEnv as anomalyHistoryFromEnv } from "./analysis/history.js";
41
62
  import { getServiceHealthHandler, setHealthThresholds } from "./tools/get-service-health.js";
42
63
  import { detectAnomaliesHandler } from "./tools/detect-anomalies.js";
43
64
  import { getTopologyHandler, getBlastRadiusHandler } from "./tools/topology.js";
@@ -82,15 +103,7 @@ function qstr(v) {
82
103
  * so a downstream investigator can join the two channels there.
83
104
  */
84
105
  function emitBypassEvent(event, ctx, args) {
85
- console.error(JSON.stringify({
86
- event,
87
- ts: new Date().toISOString(),
88
- auth: ctx.auth,
89
- tool: "query_logs",
90
- service: args?.service ?? null,
91
- correlationId: ctx.correlationId,
92
- ...(event === "redaction_bypass_denied" ? { reason: "credential_not_in_OMCP_KEY_BYPASS_REDACTION" } : {}),
93
- }));
106
+ console.error(JSON.stringify(buildBypassBreadcrumb(event, ctx, args)));
94
107
  }
95
108
  /** Bridge from the new PolicyEngine to the existing
96
109
  * hasPermission/buildRequirePermission signatures (which still take
@@ -127,21 +140,25 @@ function getAvailableMetricNames(registry) {
127
140
  }
128
141
  /** Validate source URL: must be http/https, reject obviously dangerous targets */
129
142
  function validateSourceUrl(url) {
143
+ // Phase F11: delegate to the shared SSRF guard. Strict by default;
144
+ // operators add OMCP_ALLOW_PRIVATE_BACKENDS=true to allow in-cluster
145
+ // backends. Cloud-metadata IPs (AWS 169.254.169.254, GCE
146
+ // fd00:ec2::254) are rejected regardless.
147
+ const v = checkOutboundUrl(url, ssrfGuardFromEnv());
148
+ if (!v.allow)
149
+ return v.reason ?? `URL "${url}" is rejected by the SSRF guard`;
150
+ // Extra Google-metadata-hostname check (DNS-based, not in the
151
+ // numeric guard).
130
152
  try {
131
- const parsed = new URL(url);
132
- if (!["http:", "https:"].includes(parsed.protocol)) {
133
- return `Invalid URL scheme "${parsed.protocol}". Only http and https are allowed.`;
134
- }
135
- // Block cloud metadata endpoints
136
- const host = parsed.hostname.toLowerCase();
137
- if (host === "169.254.169.254" || host === "metadata.google.internal") {
153
+ const host = new URL(url).hostname.toLowerCase();
154
+ if (host === "metadata.google.internal") {
138
155
  return "Access to cloud metadata endpoints is not allowed.";
139
156
  }
140
- return null;
141
157
  }
142
158
  catch {
143
- return `Invalid URL: "${url}"`;
159
+ /* already caught by checkOutboundUrl */
144
160
  }
161
+ return null;
145
162
  }
146
163
  // Hard cap for a downloaded/uploaded connector tarball (defence against
147
164
  // a hostile or accidental huge artifact OOM-ing the server).
@@ -169,6 +186,13 @@ async function main() {
169
186
  if (STDIO) {
170
187
  console.log = (...a) => console.error(...a);
171
188
  }
189
+ // OpenTelemetry self-tracing — opt-in via OMCP_OTEL_ENABLED. Init
190
+ // before express() so HTTP auto-instrumentation captures every
191
+ // /api/* and /mcp request. Skipped in stdio mode (no HTTP surface
192
+ // and the auto-instrumentation would emit noise per stdio call).
193
+ if (!STDIO) {
194
+ await initOtel({ serviceVersion: process.env.npm_package_version });
195
+ }
172
196
  let config = loadConfig();
173
197
  await getPluginLoader().load();
174
198
  const registry = new ConnectorRegistry();
@@ -238,37 +262,7 @@ async function main() {
238
262
  const text = result.content[0]?.text ?? "";
239
263
  const tokens = estimateTokensFor(text);
240
264
  const decision = tokenBudget.check(identityKey(ctx), tokens);
241
- if (decision.allowed || decision.limit === 0)
242
- return result;
243
- // A single request larger than the entire daily cap can never
244
- // succeed by waiting — surface a distinct error code so the
245
- // agent doesn't loop. Otherwise the wait-then-retry path is the
246
- // right answer (and freedAtRetry tells the agent how much they
247
- // can request after the wait).
248
- const requestExceedsCap = tokens > decision.limit;
249
- const errBody = {
250
- error: requestExceedsCap ? "OMCP_TOKEN_REQUEST_EXCEEDS_BUDGET" : "OMCP_TOKEN_BUDGET_EXCEEDED",
251
- tool: toolName,
252
- used: decision.used,
253
- limit: decision.limit,
254
- requested: tokens,
255
- retryAfterSeconds: requestExceedsCap ? 0 : decision.retryAfterSeconds,
256
- freedAtRetry: decision.freedAtRetry,
257
- message: requestExceedsCap
258
- ? `This single response (~${tokens} tokens) is larger than the entire daily budget (${decision.limit}). Retrying won't help — narrow the query (smaller window / lower limit / more selective filter) or raise OMCP_TOOL_DAILY_TOKENS.`
259
- : `Daily token budget exceeded (${decision.used}/${decision.limit} tokens used in the trailing 24h; this call would have added ~${tokens}). Try again in ~${Math.ceil(decision.retryAfterSeconds / 3600)}h or raise OMCP_TOOL_DAILY_TOKENS.`,
260
- };
261
- // Preserve any additional content entries (e.g. a future
262
- // tool returning [text, image]) — only the text payload of the
263
- // first entry is replaced with the error JSON; everything after
264
- // it passes through.
265
- return {
266
- ...result,
267
- content: [
268
- { ...result.content[0], text: JSON.stringify(errBody) },
269
- ...result.content.slice(1),
270
- ],
271
- };
265
+ return applyBudgetDecision(result, decision, tokens, toolName);
272
266
  }
273
267
  const REDACTION_ENABLED = String(process.env.OMCP_REDACTION ?? "on").toLowerCase() !== "off";
274
268
  function redactToolText(result, opts = {}) {
@@ -303,13 +297,73 @@ async function main() {
303
297
  return result;
304
298
  }
305
299
  }
300
+ /**
301
+ * Returns the McpServer for the given context. The companion
302
+ * `toolHandlers` map carries every tool registered for this ctx
303
+ * (post-hook-wrapping) so the in-product Playground UI (Q13) can
304
+ * invoke a tool without going through the full Streamable HTTP
305
+ * transport stack. The map is keyed by tool name; values run the
306
+ * same wrapped handler the McpServer would dispatch over MCP.
307
+ */
306
308
  function createMcpServer(ctx) {
307
309
  const mcpServer = new McpServer({
308
310
  name: "observability-mcp",
309
311
  version: SERVER_VERSION,
310
312
  });
313
+ const toolHandlers = new Map();
311
314
  // --- Register tools with Zod schemas ---
312
- mcpServer.tool("list_sources", [
315
+ // Product-aware registration: when the active credential is bound
316
+ // to a Product (OMCP_KEY_PRODUCTS), `ctx.allowedTools` carries that
317
+ // Product's `tools` allow-list and we skip the registration of any
318
+ // tool not in it. Anonymous + Product-less sessions leave
319
+ // allowedTools undefined and see every tool — the bypass is the
320
+ // back-compat path the open-source default relies on.
321
+ //
322
+ // The wrapper also wires Phase F7 hook fan-out: every tool dispatch
323
+ // fires tool_pre_invoke before the handler and tool_post_invoke after.
324
+ // Plugins can deny the call (allow:false → isError CallToolResult),
325
+ // mutate the args before dispatch, or mutate the result before it
326
+ // reaches the caller. When no hooks are registered (the default in
327
+ // the OSS demo) the wrapper is a thin pass-through.
328
+ const registerTool = ((name, ...rest) => {
329
+ if (!allowsTool(ctx.allowedTools, name))
330
+ return undefined;
331
+ if (rest.length > 0 && typeof rest[rest.length - 1] === "function") {
332
+ const originalHandler = rest[rest.length - 1];
333
+ const wrappedHandler = wrapToolHandler(hookRegistry, { principal: ctx.principalId, tenant: ctx.tenant || "default", target: name }, originalHandler);
334
+ rest[rest.length - 1] = wrappedHandler;
335
+ // Stash for the Playground endpoint — keyed by tool name. The
336
+ // wrapped handler honours pre/post hooks + the same RBAC the
337
+ // McpServer dispatch path runs. Per-ctx Map so a different
338
+ // user's allowedTools never leak.
339
+ toolHandlers.set(name, wrappedHandler);
340
+ }
341
+ return mcpServer.tool(name, ...rest);
342
+ });
343
+ // Q12: resource + prompt registrations get the same hook-fan-out
344
+ // treatment so a plugin's resource_pre_fetch / resource_post_fetch /
345
+ // prompt_pre_fetch / prompt_post_fetch handlers actually fire when
346
+ // a future resource/prompt registration lands. The wrappers stay
347
+ // thin pass-throughs when no hooks are registered (the OSS default).
348
+ // Wrappers are tested in mcp-server/src/sdk/hook-wrappers.test.ts.
349
+ const registerResource = ((name, ...rest) => {
350
+ if (rest.length > 0 && typeof rest[rest.length - 1] === "function") {
351
+ const originalHandler = rest[rest.length - 1];
352
+ rest[rest.length - 1] = wrapResourceHandler(hookRegistry, { principal: ctx.principalId, tenant: ctx.tenant || "default", target: name }, originalHandler);
353
+ }
354
+ return mcpServer.resource(name, ...rest);
355
+ });
356
+ const registerPrompt = ((name, ...rest) => {
357
+ if (rest.length > 0 && typeof rest[rest.length - 1] === "function") {
358
+ const originalHandler = rest[rest.length - 1];
359
+ rest[rest.length - 1] = wrapPromptHandler(hookRegistry, { principal: ctx.principalId, tenant: ctx.tenant || "default", target: name }, originalHandler);
360
+ }
361
+ return mcpServer.prompt(name, ...rest);
362
+ });
363
+ // Suppress unused-warn — kept for the moment registrations land.
364
+ void registerResource;
365
+ void registerPrompt;
366
+ registerTool("list_sources", [
313
367
  "List the configured observability backends (Prometheus, Loki, and any connector) and whether each is currently reachable.",
314
368
  "When to use: call this first to learn which source names exist and are healthy before passing `source` to other tools, or to debug why a query returns no data.",
315
369
  "Behavior: read-only, no side effects. Returns one entry per source with its name, type, configured URL, signal types (metrics/logs), and a live up/down status. Never throws for an unreachable backend — the backend is reported as down instead.",
@@ -318,7 +372,7 @@ async function main() {
318
372
  await enforceEntitledAccess(ctx, { tool: "list_sources" });
319
373
  return withToolMetrics("list_sources", () => listSourcesHandler(registry, ctx));
320
374
  });
321
- mcpServer.tool("list_services", [
375
+ registerTool("list_services", [
322
376
  "Discover the service names that can be queried, aggregated across every connected backend.",
323
377
  "When to use: call this before `query_metrics`, `query_logs`, or `get_service_health` to obtain the exact, case-sensitive service name those tools require.",
324
378
  "Behavior: read-only, no side effects. Returns one entry per service with the service name, the source(s) it was discovered in, and which signals are available for it (metrics, logs, or both).",
@@ -336,7 +390,7 @@ async function main() {
336
390
  const metricsList = getAvailableMetricNames(registry);
337
391
  const metricNames = registry.getBySignal("metrics").flatMap(c => c.getMetrics().map(m => m.name));
338
392
  const uniqueNames = [...new Set(metricNames)];
339
- mcpServer.tool("query_metrics", [
393
+ registerTool("query_metrics", [
340
394
  "Fetch the raw time-series for ONE metric of ONE service over a look-back window, returned together with pre-computed summary statistics.",
341
395
  "When to use: when you need the actual numeric values or the trend of a known metric. For a 'is this service OK?' verdict use `get_service_health`; to find which services are misbehaving use `detect_anomalies`.",
342
396
  "Prerequisites: get the exact service name from `list_services` and choose a metric from the list at the end of this description.",
@@ -366,7 +420,7 @@ async function main() {
366
420
  const result = await withToolMetrics("query_metrics", () => queryMetricsHandler(registry, args, ctx));
367
421
  return chargeTokenBudget(result, ctx, "query_metrics");
368
422
  });
369
- mcpServer.tool("query_logs", [
423
+ registerTool("query_logs", [
370
424
  "Fetch recent log entries for ONE service over a look-back window, with a pre-computed summary (error/warning counts and the most frequent error patterns).",
371
425
  "When to use: to inspect what a service actually logged, or to investigate an error spike surfaced by `detect_anomalies` / `get_service_health`. For numeric metrics use `query_metrics` instead.",
372
426
  "Prerequisites: get the exact service name from `list_services` (the service must expose a logs signal).",
@@ -419,16 +473,7 @@ async function main() {
419
473
  // invocation is tamper-evident alongside the rest of
420
474
  // /api/*. Persists if OMCP_MGMT_AUDIT_FILE is set.
421
475
  emitBypassEvent(bypass ? "redaction_bypass_engaged" : "redaction_bypass_denied", ctx, args);
422
- void mgmtAudit.record({
423
- actor: { sub: ctx.principalId },
424
- tenant: ctx.tenant,
425
- resource: "redaction",
426
- action: "bypass",
427
- method: "MCP",
428
- path: "/mcp/query_logs",
429
- status: bypass ? 200 : 403,
430
- target: args?.service ?? undefined,
431
- }).catch(() => {
476
+ void mgmtAudit.record(buildBypassAuditParams(bypass, ctx, args)).catch(() => {
432
477
  // Audit record is best-effort — losing one entry must not
433
478
  // crash the tool call. The chain itself remains intact.
434
479
  });
@@ -436,7 +481,54 @@ async function main() {
436
481
  const redacted = redactToolText(result, { bypass });
437
482
  return chargeTokenBudget(redacted, ctx, "query_logs");
438
483
  });
439
- mcpServer.tool("get_service_health", [
484
+ registerTool("get_anomaly_history", [
485
+ "Replay historical anomaly scores for a service from the TSDB the gateway writes to (omcp_anomaly_score series).",
486
+ "When to use: post-mortem reconstruction, trend analysis on detector noise, or pulling context for the LLM when an incident is reviewed after the fact.",
487
+ "Prerequisites: the operator must have OMCP_ANOMALY_HISTORY_REMOTE_WRITE configured AND a Prometheus source pointed at the same TSDB so the round-trip closes.",
488
+ "Behavior: read-only. Returns the time-series of scores. Empty result means either no anomalies in the window or history is disabled.",
489
+ "Related: `detect_anomalies` for the live scores; `query_metrics` if you want to write the PromQL by hand.",
490
+ ].join(" "), {
491
+ service: z.string().describe("Service name to filter on."),
492
+ duration: z.string().optional().describe("Rolling window, e.g. '1h', '24h'. Default '1h'."),
493
+ method: z.string().optional().describe("Filter by detector method ('mad' / 'seasonality' / 'correlator'). Optional."),
494
+ }, async (args) => {
495
+ await enforceEntitledAccess(ctx, { tool: "get_anomaly_history", service: args?.service });
496
+ const result = await withToolMetrics("get_anomaly_history", () => getAnomalyHistoryHandler(registry, args, ctx));
497
+ return chargeTokenBudget(result, ctx, "get_anomaly_history");
498
+ });
499
+ registerTool("generate_postmortem", [
500
+ "Stitch the gateway's primitives (anomaly history, blast-radius, traces, log highlights) into a single markdown post-mortem report for one service over a given window.",
501
+ "When to use: after an incident, when the operator or LLM wants 'one document the on-call can read in 60 seconds' instead of poking the individual tools.",
502
+ "Prerequisites: anomaly history requires OMCP_ANOMALY_HISTORY_REMOTE_WRITE + a Prometheus source. Traces require Tempo / Jaeger. Blast-radius requires a topology provider.",
503
+ "Behavior: read-only. Returns markdown by default; pass `format='json'` for the structured shape. Output capped (timeline 20 rows, blast-radius 30 nodes, 10 traces) — JSON shape carries the full data.",
504
+ "Related: `get_anomaly_history`, `query_traces`, `get_blast_radius` for the underlying primitives.",
505
+ ].join(" "), {
506
+ service: z.string().describe("Suspected root-cause service."),
507
+ duration: z.string().optional().describe("Window length, e.g. '1h', '6h'. Default '1h'."),
508
+ format: z.enum(["markdown", "json"]).optional().describe("'markdown' (default) or 'json'."),
509
+ }, async (args) => {
510
+ await enforceEntitledAccess(ctx, { tool: "generate_postmortem", service: args?.service });
511
+ const result = await withToolMetrics("generate_postmortem", () => generatePostmortemHandler(registry, args, ctx));
512
+ return chargeTokenBudget(result, ctx, "generate_postmortem");
513
+ });
514
+ registerTool("query_traces", [
515
+ "Query distributed traces for a service over a given timeframe.",
516
+ "Returns ranked trace summaries (duration, span count, error status) with a p50/p95 aggregate across the returned set.",
517
+ "When to use: investigate tail-latency outliers, walk call chains across services for a specific time window, or pull traces related to an anomaly that the metric/log tools surfaced first.",
518
+ "Prerequisites: get the exact service name from `list_services`. A Tempo / Jaeger / OTLP connector must be configured.",
519
+ "Behavior: read-only. `filter` accepts the backend's native query language (TraceQL on Tempo, tag query on Jaeger). When `errorsOnly=true`, only traces with at least one error span are returned. Default limit is 50.",
520
+ ].join(" "), {
521
+ service: z.string().describe("Service name (e.g. 'payment-service')."),
522
+ duration: z.string().optional().describe("Rolling time window, e.g. '5m', '1h'. Default '15m'."),
523
+ filter: z.string().optional().describe("Backend-native filter (TraceQL on Tempo, tag query on Jaeger). Optional."),
524
+ limit: z.number().int().positive().optional().describe("Soft cap on returned trace summaries. Default 50."),
525
+ errorsOnly: z.boolean().optional().describe("If true, only traces with at least one error span."),
526
+ }, async (args) => {
527
+ await enforceEntitledAccess(ctx, { tool: "query_traces", service: args?.service });
528
+ const result = await withToolMetrics("query_traces", () => queryTracesHandler(registry, args, ctx));
529
+ return chargeTokenBudget(result, ctx, "query_traces");
530
+ });
531
+ registerTool("get_service_health", [
440
532
  "Produce a single aggregated health verdict for ONE service by combining its metrics and logs.",
441
533
  "When to use: the fastest way to answer 'is this service healthy right now and why?'. Use `query_metrics`/`query_logs` to drill into the underlying numbers, or `detect_anomalies` to scan many services at once.",
442
534
  "Prerequisites: get the exact service name from `list_services`.",
@@ -451,7 +543,7 @@ async function main() {
451
543
  const enriched = enrichToolHealthText(result, String(args?.service ?? ""), ctx);
452
544
  return chargeTokenBudget(enriched, ctx, "get_service_health");
453
545
  });
454
- mcpServer.tool("detect_anomalies", [
546
+ registerTool("detect_anomalies", [
455
547
  "Scan one or all monitored services for abnormal behavior and return the findings ranked by severity.",
456
548
  "When to use: the entry point for 'is anything wrong anywhere?' triage. Once a service is flagged, follow up with `get_service_health` for the verdict or `query_metrics`/`query_logs` for the raw evidence.",
457
549
  "Behavior: read-only, no side effects. Applies z-score analysis to metrics, detects log error-rate spikes, and correlates the two. Returns a list of anomalies, each with the affected service, metric/signal, severity, the deviation (e.g. σ and % change), and a short explanation. No anomalies yields an empty list, not an error.",
@@ -471,9 +563,11 @@ async function main() {
471
563
  .describe("Optional. Detection threshold: 'low' flags only strong deviations (>3σ), 'medium' is balanced (>2σ), 'high' is most sensitive and noisier (>1.5σ). Default: 'medium'."),
472
564
  }, async (args) => {
473
565
  await enforceEntitledAccess(ctx, { tool: "detect_anomalies", source: args?.source, service: args?.service });
474
- return withToolMetrics("detect_anomalies", () => detectAnomaliesHandler(registry, args, ctx));
566
+ // P1: pass the anomaly-history sink so detected scores flow
567
+ // into the TSDB and `get_anomaly_history` returns real data.
568
+ return withToolMetrics("detect_anomalies", () => detectAnomaliesHandler(registry, args, ctx, anomalyHistory));
475
569
  });
476
- mcpServer.tool("get_topology", [
570
+ registerTool("get_topology", [
477
571
  "Return the infrastructure topology graph (Resources and Edges) from every topology-capable connector.",
478
572
  "When to use: when an agent needs to reason about which workload runs on which host, who owns whom, or which scope (namespace/project/folder) a resource belongs to. Pair with `get_blast_radius` for shared-host RCA.",
479
573
  "Behavior: read-only, no side effects. Returns `{ sources, resources, edges, total, truncated }`. Filters compose: `source` to one connector, `kind` to one resource type (e.g. 'pod', 'node', 'deployment'), `scope` to members of a namespace/folder/project. Output is capped by `limit` (default 500, max 5000) and edges referencing dropped resources are removed.",
@@ -502,7 +596,7 @@ async function main() {
502
596
  await enforceEntitledAccess(ctx, { tool: "get_topology", source: args?.source });
503
597
  return withToolMetrics("get_topology", () => getTopologyHandler(registry, args, ctx));
504
598
  });
505
- mcpServer.tool("get_blast_radius", [
599
+ registerTool("get_blast_radius", [
506
600
  "Given a resource, return who else fails if its underlying host(s) fail.",
507
601
  "When to use: cross-cutting RCA — when several services degrade together and you suspect a shared host. Works for any RUNS_ON relationship: pod→node, vm→hypervisor, container→host.",
508
602
  "Behavior: read-only, no side effects. Resolves `resource` to a Resource (accepts canonical id, exact name, or unique substring), determines its host(s) via RUNS_ON, then lists every other resource that runs on those hosts, bucketed by ownership root (the terminal `OWNED_BY` target — e.g. the Deployment, not the ReplicaSet). If the target is itself a host, its tenants are reported. Returns a structured error if the resource is ambiguous or unknown.",
@@ -515,7 +609,41 @@ async function main() {
515
609
  await enforceEntitledAccess(ctx, { tool: "get_blast_radius" });
516
610
  return withToolMetrics("get_blast_radius", () => getBlastRadiusHandler(registry, args, ctx));
517
611
  });
518
- return mcpServer;
612
+ // Phase F10: federated tools — every upstream MCP server's tools
613
+ // show up here under `<prefix>.<upstream-tool>`. The handler is a
614
+ // pure proxy: it forwards args verbatim and returns the upstream's
615
+ // CallToolResult unchanged. The wrapping registerTool() at the top
616
+ // of this function still fires F7 lifecycle hooks + the F1
617
+ // Product-allow-list gate, so federated tools obey the same policy
618
+ // surface as native ones.
619
+ for (const info of federationRegistry.getNamespacedTools()) {
620
+ // The MCP SDK's tool() signature wants a ZodRawShape (a map of
621
+ // field-name → Zod type), not a raw JSON Schema. Federated
622
+ // upstreams expose JSON Schema (the wire-format MCP uses on
623
+ // tools/list); we transcode to a permissive Zod shape so the
624
+ // SDK accepts the registration. Per-field types are `z.unknown()`
625
+ // because the upstream will validate the call args anyway; the
626
+ // local Zod check is only a "this is the field name set" gate.
627
+ // P7: this transcoding fixes the registration crash that broke
628
+ // every federation deploy before the E2E test caught it.
629
+ const upstreamProps = info.inputSchema?.properties ?? {};
630
+ // Every field is z.unknown().optional() — the SDK only uses this
631
+ // shape to know the field-name set; the upstream re-validates
632
+ // against its full JSON Schema (incl. its own `required` list)
633
+ // when the call arrives. Marking all fields optional here keeps
634
+ // calls with the upstream-defaults flowing through; without it
635
+ // the SDK rejects any call that omits a field upstream considers
636
+ // required even if the upstream would accept the omission.
637
+ const localShape = {};
638
+ for (const k of Object.keys(upstreamProps)) {
639
+ localShape[k] = z.unknown().optional();
640
+ }
641
+ registerTool(info.namespacedName, info.description || `Federated from upstream ${info.sourceName}.`, localShape, async (args) => {
642
+ await enforceEntitledAccess(ctx, { tool: info.namespacedName });
643
+ return withToolMetrics(info.namespacedName, () => federationRegistry.callNamespacedTool(info.namespacedName, args));
644
+ });
645
+ }
646
+ return { mcpServer, toolHandlers };
519
647
  }
520
648
  // --- Management-plane auth (basic mode) -----------------------------------
521
649
  // Off by default. Enable with `OMCP_AUTH=basic` + `OMCP_USERS_FILE` and
@@ -624,7 +752,12 @@ async function main() {
624
752
  app.set("trust proxy", trustProxy);
625
753
  }
626
754
  }
627
- app.use(express.json({ limit: "1mb" }));
755
+ // Parse application/json AND any *+json media type. SCIM clients
756
+ // (Entra, Okta) send `application/scim+json` per RFC 7644 §3.1 —
757
+ // without the wildcard the body silently arrives empty and every
758
+ // SCIM POST/PATCH 400s. The wildcard also future-proofs other
759
+ // structured-suffix JSON content types.
760
+ app.use(express.json({ limit: "1mb", type: ["application/json", "application/*+json"] }));
628
761
  // Security headers
629
762
  app.use((req, res, next) => {
630
763
  res.setHeader("X-Content-Type-Options", "nosniff");
@@ -673,6 +806,17 @@ async function main() {
673
806
  // there is no string-match-based "is this public?" branch anywhere.
674
807
  app.use(buildSessionAttacher(authRuntime));
675
808
  const requireSession = buildRequireSession(authRuntime);
809
+ // Phase F11: CSRF — double-submit cookie pattern, enforced on every
810
+ // mutating /api/* request. The issuer runs top-of-pipe so any page
811
+ // render leaves a CSRF token cookie the SPA can read + echo back.
812
+ // Bearer-token clients (CI, agents, MCP clients) bypass by default
813
+ // since they can't be a browser confused-deputy.
814
+ const csrfCfg = {
815
+ bypassBearer: csrfBypassFromEnv(),
816
+ secureCookie: (r) => r.secure || r.headers["x-forwarded-proto"] === "https",
817
+ };
818
+ app.use(buildCsrfIssuer(csrfCfg));
819
+ app.use("/api", buildCsrfEnforcer(csrfCfg));
676
820
  // Active policy engine — built-in DEFAULT_POLICY by default. When
677
821
  // OMCP_RBAC_POLICY_FILE is set we load it and ALWAYS abort on
678
822
  // failure: OMCP_AUTH_ALLOW_FALLBACK is for *auth-mode* fallback
@@ -713,22 +857,42 @@ async function main() {
713
857
  return;
714
858
  const resources = [...VALID_RESOURCES];
715
859
  const actions = [...VALID_ACTIONS];
860
+ // Tenant-aware pre-warm: the gate keys cache per
861
+ // (roles, resource, action, tenant) so a tenant-conditional
862
+ // Rego rule that fires for "acme" but not "bigco" produces a
863
+ // distinct cached verdict per tenant. The pre-warm iterates
864
+ // every known declared tenant + "default" so the first user
865
+ // request from a tenant'd identity gets a real decision
866
+ // instead of a warming-deny. OIDC tenants only known at
867
+ // runtime are still subject to first-request warming, but
868
+ // operator-set OMCP_KEY_TENANTS land here.
869
+ const knownTenants = new Set(["default"]);
870
+ // parseKeyTenants is the same parser the credentials layer
871
+ // uses, so the warm set is exactly what the gate will see.
872
+ for (const t of parseKeyTenants(process.env.OMCP_KEY_TENANTS).values()) {
873
+ if (t)
874
+ knownTenants.add(t);
875
+ }
876
+ const tenants = Array.from(knownTenants);
716
877
  const tasks = [];
717
- for (const role of roles) {
718
- for (const resource of resources)
719
- for (const action of actions) {
720
- tasks.push(opaEngine.warmEvaluate([role], resource, action));
721
- }
722
- tasks.push(opaEngine.warmList([role]));
878
+ for (const tenant of tenants) {
879
+ for (const role of roles) {
880
+ for (const resource of resources)
881
+ for (const action of actions) {
882
+ tasks.push(opaEngine.warmEvaluate([role], resource, action, tenant));
883
+ }
884
+ tasks.push(opaEngine.warmList([role], tenant));
885
+ }
723
886
  }
724
887
  try {
725
888
  const settled = await Promise.allSettled(tasks);
726
889
  const failed = settled.filter((s) => s.status === "rejected").length;
890
+ const tlbl = tenants.length === 1 ? "1 tenant" : `${tenants.length} tenants`;
727
891
  if (failed === 0) {
728
- console.log(`[auth] OPA cache pre-warmed: ${settled.length} decisions cached for ${roles.length} role(s)`);
892
+ console.log(`[auth] OPA cache pre-warmed: ${settled.length} decisions cached for ${roles.length} role(s) × ${tlbl}`);
729
893
  }
730
894
  else {
731
- console.warn(`[auth] OPA cache pre-warmed: ${settled.length - failed}/${settled.length} ok, ${failed} failed (gates will retry on first user call)`);
895
+ console.warn(`[auth] OPA cache pre-warmed: ${settled.length - failed}/${settled.length} ok, ${failed} failed across ${tlbl} (gates will retry on first user call)`);
732
896
  }
733
897
  }
734
898
  catch { /* best-effort */ }
@@ -745,20 +909,108 @@ async function main() {
745
909
  process.exit(1);
746
910
  }
747
911
  }
748
- const need = (resource, action) => buildRequirePermission(authRuntime, resource, action, policyEngineToMap(policyEngine));
912
+ // Use the engine-aware variant so tenant (session.tenant) flows into
913
+ // engine.evaluate() — required for tenant-conditional Rego rules
914
+ // (`input.tenant == "acme"` etc.) under OMCP_OPA_URL. Built-in /
915
+ // file-loaded engines ignore the tenant ctx, so the behaviour is
916
+ // unchanged for those deployments.
917
+ const need = (resource, action) => buildRequirePermissionFromEngine(authRuntime, resource, action, policyEngine);
749
918
  // Management-plane audit log. Records one entry per mutating /api/*
750
919
  // request. Writes JSONL to disk when OMCP_MGMT_AUDIT_FILE is set;
751
920
  // otherwise an in-memory ring of the last 500 entries keeps the
752
921
  // /api/audit endpoint useful in the demo / single-user case.
753
- const mgmtAudit = new AuditLog({ file: process.env.OMCP_MGMT_AUDIT_FILE });
922
+ // External audit sinks opt-in via env. Each chained entry is
923
+ // mirrored to every configured sink; the on-disk JSONL master
924
+ // remains the source of truth (the hash chain is never split).
925
+ const auditSinks = [];
926
+ if (process.env.OMCP_AUDIT_WEBHOOK_URL) {
927
+ auditSinks.push(new WebhookSink({
928
+ url: process.env.OMCP_AUDIT_WEBHOOK_URL,
929
+ token: process.env.OMCP_AUDIT_WEBHOOK_TOKEN,
930
+ deadLetterFile: process.env.OMCP_AUDIT_WEBHOOK_DLQ,
931
+ }));
932
+ console.log("AuditLog: webhook sink enabled -> %s%s", process.env.OMCP_AUDIT_WEBHOOK_URL, process.env.OMCP_AUDIT_WEBHOOK_DLQ
933
+ ? ` (DLQ: ${process.env.OMCP_AUDIT_WEBHOOK_DLQ})`
934
+ : "");
935
+ }
936
+ const mgmtAudit = new AuditLog({
937
+ file: process.env.OMCP_MGMT_AUDIT_FILE,
938
+ sinks: auditSinks,
939
+ });
754
940
  await mgmtAudit.bootstrap();
941
+ process.on("SIGTERM", () => {
942
+ mgmtAudit
943
+ .flushSinks()
944
+ .catch((err) => console.warn("AuditLog flushSinks failed:", err));
945
+ });
755
946
  const audit = (resource, action) => buildAuditMiddleware({ audit: mgmtAudit, resource, action });
947
+ // Plugin lifecycle hook registry — populated by the loader at boot
948
+ // (one entry per manifest `hooks[]` entry) and mutable at runtime
949
+ // when a connector is installed via /api/connectors/install. Each
950
+ // tool dispatch in createMcpServer fans through this registry's
951
+ // tool_pre_invoke / tool_post_invoke chains; resource and prompt
952
+ // hooks plug into their respective seams as they ship.
953
+ const hookRegistry = new HookRegistry();
954
+ // Phase F15: anomaly-history sink — opt-in via
955
+ // OMCP_ANOMALY_HISTORY_REMOTE_WRITE. When configured, anomaly
956
+ // scores written via anomalyHistory.record() flush to the
957
+ // configured TSDB on a 10-second timer. The MCP tool
958
+ // get_anomaly_history queries them back via any Prometheus source
959
+ // pointed at the same TSDB.
960
+ //
961
+ // The detector-side hook that records per-anomaly scores is wired
962
+ // up: the `detect_anomalies` tool passes this instance into
963
+ // detectAnomaliesHandler, so detected scores flow into the TSDB
964
+ // alongside any externally-written omcp_anomaly_score metrics and
965
+ // are queryable end-to-end.
966
+ const anomalyHistory = new AnomalyHistory(anomalyHistoryFromEnv());
967
+ anomalyHistory.start();
968
+ if (anomalyHistory.isEnabled()) {
969
+ console.log("AnomalyHistory: TSDB sink enabled (OMCP_ANOMALY_HISTORY_REMOTE_WRITE set)");
970
+ }
971
+ process.on("SIGTERM", () => {
972
+ void anomalyHistory.stop().catch(() => undefined);
973
+ });
974
+ // Federation registry — populated from OMCP_FEDERATION_UPSTREAMS at
975
+ // boot. Each upstream connects, fetches tools/list, and exposes its
976
+ // tools under `<prefix>.<upstream-tool-name>` on the gateway's
977
+ // surface. Failures are logged + the upstream is left in `degraded`
978
+ // (no tools) so the gateway boots regardless of upstream health.
979
+ const federationRegistry = new FederationRegistry();
980
+ for (const cfg of parseFederationEnv()) {
981
+ const client = new UpstreamClient(cfg.kind === "stdio"
982
+ ? { transport: "stdio", name: cfg.name, command: cfg.command, args: cfg.args }
983
+ : cfg.kind === "ws"
984
+ ? { transport: "ws", name: cfg.name, url: cfg.url }
985
+ : { name: cfg.name, url: cfg.url, bearerToken: cfg.bearerToken });
986
+ federationRegistry.add(client);
987
+ client.connect().catch((err) => {
988
+ console.warn("federation upstream %s initial connect failed: %s", cfg.name, err instanceof Error ? err.message : String(err));
989
+ });
990
+ }
991
+ if (federationRegistry.list().length > 0) {
992
+ console.log("federation: %d upstream(s) configured: %s", federationRegistry.list().length, federationRegistry.list().map((u) => `${u.name}=${u.url}`).join(", "));
993
+ }
994
+ process.on("SIGTERM", () => {
995
+ federationRegistry
996
+ .closeAll()
997
+ .catch((err) => console.warn("federation closeAll failed:", err));
998
+ });
756
999
  // Service catalog: optional operator-curated ownership / criticality /
757
1000
  // on-call metadata, keyed on the service name list_services returns.
758
1001
  // No file ⇒ empty catalog, enrichment is a no-op (anonymous demos
759
1002
  // see no behaviour change).
760
1003
  const catalog = new CatalogStore(await readCatalogFile(process.env.OMCP_SERVICE_CATALOG_FILE));
761
- const products = new ProductsStore(await readProductsFile(process.env.OMCP_PRODUCTS_FILE));
1004
+ // Hot-reload aware: passing the path lets `products.maybeReload()`
1005
+ // pick up out-of-band edits to OMCP_PRODUCTS_FILE without a restart.
1006
+ // Each /api/products* handler awaits maybeReload() before reading,
1007
+ // so a `kubectl apply` of an updated ConfigMap or a git-ops edit is
1008
+ // visible on the very next request.
1009
+ const productsPath = process.env.OMCP_PRODUCTS_FILE;
1010
+ const products = new ProductsStore(await readProductsFile(productsPath), { path: productsPath });
1011
+ // Seed the mtime cursor from the file we just loaded so the first
1012
+ // maybeReload() call doesn't redundantly re-parse the boot state.
1013
+ await products.pinMtimeAfterWrite();
762
1014
  // Protected route prefixes. /api/me, /api/auth/*, /api/info,
763
1015
  // /api/openapi.json deliberately don't appear here — they stay public.
764
1016
  for (const prefix of [
@@ -784,6 +1036,39 @@ async function main() {
784
1036
  // enough to skip the request-counter middleware.
785
1037
  let ready = false;
786
1038
  app.get("/healthz", (_req, res) => res.type("text").send("ok"));
1039
+ // Procurement-time probe: the MCP spec revisions and transports the
1040
+ // gateway supports. Static today — kept as a separate endpoint so a
1041
+ // discovery tool / RFP probe / catalog scanner can resolve our
1042
+ // compliance posture without sending a real MCP handshake.
1043
+ // See docs/mcp-conformance.md for the test suite that proves it.
1044
+ app.get("/api/conformance", (_req, res) => {
1045
+ res.json({
1046
+ revisions: ["2025-11-25"],
1047
+ transports: ["streamable-http", "stdio", "websocket"],
1048
+ methods: {
1049
+ // Methods exercised by the conformance harness. "supported"
1050
+ // is the union of methods that return a non -32601 envelope
1051
+ // for any conforming caller. Per-method spec compliance is
1052
+ // proven by src/conformance/mcp-2025-11-25.test.ts.
1053
+ supported: [
1054
+ "initialize",
1055
+ "notifications/initialized",
1056
+ "ping",
1057
+ "tools/list",
1058
+ "tools/call",
1059
+ ],
1060
+ optional: [
1061
+ "resources/list",
1062
+ "resources/read",
1063
+ "prompts/list",
1064
+ "prompts/get",
1065
+ "logging/setLevel",
1066
+ ],
1067
+ },
1068
+ harnessPath: "mcp-server/src/conformance/mcp-2025-11-25.test.ts",
1069
+ docs: "docs/mcp-conformance.md",
1070
+ });
1071
+ });
787
1072
  app.get("/readyz", (_req, res) => {
788
1073
  if (ready)
789
1074
  return res.type("text").send("ok");
@@ -799,6 +1084,24 @@ async function main() {
799
1084
  // this endpoint when enabled.
800
1085
  if (process.env.METRICS_ENABLED !== "false") {
801
1086
  app.get("/metrics", async (_req, res) => {
1087
+ // P9: refresh the audit-webhook DLQ depth before the scrape so
1088
+ // Prometheus sees the current file state rather than whatever
1089
+ // /api/audit/dlq last set. Best-effort; ENOENT or missing-env
1090
+ // resets to 0 (the dlqPath being unset is the normal state).
1091
+ try {
1092
+ const dlqPath = process.env.OMCP_AUDIT_WEBHOOK_DLQ;
1093
+ if (dlqPath) {
1094
+ const fs = await import("node:fs/promises");
1095
+ const raw = await fs.readFile(dlqPath, "utf8").catch(() => "");
1096
+ auditDlqDepth.set(raw.split("\n").filter((l) => l.trim()).length);
1097
+ }
1098
+ else {
1099
+ auditDlqDepth.set(0);
1100
+ }
1101
+ }
1102
+ catch {
1103
+ auditDlqDepth.set(0);
1104
+ }
802
1105
  res.set("Content-Type", selfRegistry.contentType);
803
1106
  res.end(await selfRegistry.metrics());
804
1107
  });
@@ -806,11 +1109,33 @@ async function main() {
806
1109
  // Serve Web UI
807
1110
  app.use(express.static(join(__dirname, "ui")));
808
1111
  // --- API endpoints for Web UI ---
809
- // List sources with health status
810
- app.get("/api/sources", async (_req, res) => {
1112
+ // List sources with health status — tenant-scoped.
1113
+ // Non-admin callers see only their own tenant's sources + globals
1114
+ // (untagged). Admins (users:delete) see everything, with optional
1115
+ // ?tenant=acme drill-down. Anonymous mode (no session) sees
1116
+ // everything — preserves single-tenant default. The `tenant` field
1117
+ // is included on every entry so the UI can render scope badges.
1118
+ app.get("/api/sources", async (req, res) => {
1119
+ const sess = req.session;
1120
+ const isAdmin = hasPermission(sess?.roles, "users", "delete");
1121
+ const callerTenant = sess?.tenant || "default";
1122
+ const requestedTenant = qstr(req.query.tenant);
811
1123
  const health = await registry.healthCheckAll();
812
1124
  const configs = registry.getSourceConfigs();
813
- const sources = configs.map((c) => {
1125
+ const filtered = configs.filter((c) => {
1126
+ // Anonymous: every source.
1127
+ if (!sess)
1128
+ return true;
1129
+ // Admin with ?tenant=X drill-down: untagged + that tenant.
1130
+ if (isAdmin && requestedTenant)
1131
+ return !c.tenant || c.tenant === requestedTenant;
1132
+ // Admin no filter: every source (cross-tenant view).
1133
+ if (isAdmin)
1134
+ return true;
1135
+ // Non-admin: own tenant + untagged.
1136
+ return !c.tenant || c.tenant === callerTenant;
1137
+ });
1138
+ const sources = filtered.map((c) => {
814
1139
  const connector = registry.getByName(c.name);
815
1140
  return {
816
1141
  name: c.name,
@@ -819,6 +1144,7 @@ async function main() {
819
1144
  enabled: c.enabled,
820
1145
  auth: c.auth ? { type: c.auth.type } : undefined,
821
1146
  tls: c.tls || undefined,
1147
+ tenant: c.tenant,
822
1148
  signalType: connector?.signalType || null,
823
1149
  status: health[c.name]?.status || (c.enabled ? "down" : "disabled"),
824
1150
  latencyMs: health[c.name]?.latencyMs || null,
@@ -831,6 +1157,46 @@ async function main() {
831
1157
  app.get("/api/source-types", (_req, res) => {
832
1158
  res.json(getSupportedTypes());
833
1159
  });
1160
+ // Get the registry of MCP tools the server can advertise (name +
1161
+ // category + one-line summary). The Products modal uses this to
1162
+ // populate the tools-allowlist picker so a typo can't happen at
1163
+ // authoring time; the server-side typo guard (PR #343) stays as
1164
+ // defence-in-depth. Open to every viewer — there's nothing
1165
+ // sensitive in the catalogue, it's just static metadata.
1166
+ app.get("/api/tools/registry", (_req, res) => {
1167
+ res.json({ tools: REGISTERED_TOOLS });
1168
+ });
1169
+ // Q13: in-product Playground endpoint. Lets the operator invoke a
1170
+ // registered tool against the live gateway without spinning up a
1171
+ // separate MCP client. Re-uses the per-session ctx and the same
1172
+ // wrapped handler the McpServer dispatch path would run (so RBAC,
1173
+ // entitlements, rate-limit, audit, hook fan-out all apply
1174
+ // identically).
1175
+ app.post("/api/playground/invoke", async (req, res) => {
1176
+ const ctx = await gateCtx(req, res);
1177
+ if (!ctx)
1178
+ return;
1179
+ const body = (req.body ?? {});
1180
+ const tool = typeof body.tool === "string" ? body.tool : "";
1181
+ if (!tool) {
1182
+ res.status(400).json({ error: "tool (string) is required" });
1183
+ return;
1184
+ }
1185
+ const { toolHandlers } = createMcpServer(ctx);
1186
+ const handler = toolHandlers.get(tool);
1187
+ if (!handler) {
1188
+ res.status(404).json({ error: `tool '${tool}' is not registered (or not allowed for this credential)` });
1189
+ return;
1190
+ }
1191
+ try {
1192
+ const result = await handler(body.args ?? {}, undefined);
1193
+ res.json({ tool, result });
1194
+ }
1195
+ catch (err) {
1196
+ const message = err instanceof Error ? err.message : String(err);
1197
+ res.status(500).json({ error: message, tool });
1198
+ }
1199
+ });
834
1200
  // Server info — version, loaded plugins, MCP protocol version, build metadata.
835
1201
  // Used by the Web UI footer and by operators to confirm what's deployed.
836
1202
  app.get("/api/info", async (_req, res) => {
@@ -865,6 +1231,16 @@ async function main() {
865
1231
  redaction: REDACTION_ENABLED,
866
1232
  trustProxy: !!(process.env.OMCP_TRUST_PROXY && process.env.OMCP_TRUST_PROXY !== "false"),
867
1233
  toolRatePerMin: resolveToolRatePerMin(process.env.OMCP_TOOL_RATE_PER_MIN),
1234
+ // P1: posture flags so dashboards can alert when a shipped
1235
+ // capability is configured but doing nothing useful.
1236
+ anomalyHistoryActive: anomalyHistory.isEnabled(),
1237
+ tracesCapabilityCount: registry
1238
+ .getAll()
1239
+ .filter((c) => typeof c.queryTraces === "function").length,
1240
+ pluginsVerified: !/^(0|false|no|off)$/i.test(process.env.VERIFY_PLUGINS ?? "true"),
1241
+ scimEnabled: !!process.env.OMCP_SCIM_TOKEN,
1242
+ federationUpstreams: (process.env.OMCP_FEDERATION_UPSTREAMS ?? "")
1243
+ .split(",").map((s) => s.trim()).filter(Boolean).length,
868
1244
  },
869
1245
  plugins: loader.list().map((p) => ({
870
1246
  name: p.name,
@@ -923,9 +1299,18 @@ async function main() {
923
1299
  // to non-admin sessions.
924
1300
  app.get("/api/policy", need("users", "delete"), (req, res) => {
925
1301
  const map = policyEngineToMap(policyEngine);
926
- // Optional dry-run: ?roles=admin,operator&resource=sources&action=delete
1302
+ // The OPA engine's kind() is prefixed `opa:` (see opa.ts:198).
1303
+ // Surface a `tenantAware` boolean so operators can confirm at a
1304
+ // glance whether the active engine honours session.tenant in
1305
+ // .evaluate() — the BuiltinPolicyEngine ignores tenant ctx; OPA
1306
+ // threads it into the Rego input. This is the property required
1307
+ // for `allow { input.tenant == "acme" }` rules to actually fire.
1308
+ const tenantAware = policyEngine.kind().startsWith("opa:");
1309
+ // Optional dry-run: ?roles=admin,operator&resource=sources&action=delete[&tenant=acme]
927
1310
  // returns { allowed, reason } so operators can probe the active
928
- // engine without writing tests against a checkout.
1311
+ // engine without writing tests against a checkout. Tenant defaults
1312
+ // to the caller's session tenant; an admin can override via the
1313
+ // ?tenant= query string to probe verdicts for any tenant.
929
1314
  const q = req.query;
930
1315
  if (q.resource && q.action) {
931
1316
  const dryRoles = typeof q.roles === "string" ? q.roles.split(",").map((r) => r.trim()).filter(Boolean) : undefined;
@@ -940,12 +1325,30 @@ async function main() {
940
1325
  res.json({ dryRun: { roles: dryRoles ?? [], resource: q.resource, action: q.action, allowed: false, reason: `unknown action '${q.action}' (valid: ${[...VALID_ACTIONS].join(", ")})` } });
941
1326
  return;
942
1327
  }
943
- const result = policyEngine.evaluate(dryRoles, q.resource, q.action);
944
- res.json({ dryRun: { roles: dryRoles ?? [], resource: q.resource, action: q.action, ...result } });
1328
+ const callerSess = req.session;
1329
+ // Tenant resolution: explicit ?tenant= override wins, else the
1330
+ // caller's session tenant. The probe runs at users:delete (admin),
1331
+ // so a cross-tenant override is intentional — exactly how an
1332
+ // operator debugs "why doesn't my tenant-conditional Rego rule
1333
+ // fire for tenant Acme?".
1334
+ const probeTenant = typeof q.tenant === "string" && q.tenant
1335
+ ? q.tenant.trim()
1336
+ : callerSess?.tenant;
1337
+ const result = policyEngine.evaluate(dryRoles, q.resource, q.action, probeTenant ? { tenant: probeTenant } : undefined);
1338
+ res.json({
1339
+ dryRun: {
1340
+ roles: dryRoles ?? [],
1341
+ resource: q.resource,
1342
+ action: q.action,
1343
+ tenant: probeTenant,
1344
+ ...result,
1345
+ },
1346
+ });
945
1347
  return;
946
1348
  }
947
1349
  res.json({
948
1350
  engine: policyEngine.kind(),
1351
+ tenantAware,
949
1352
  policy: map,
950
1353
  roles: policyEngine.roles(),
951
1354
  note: policyEngine.kind() === "builtin"
@@ -953,6 +1356,281 @@ async function main() {
953
1356
  : `policy loaded from ${policyEngine.kind()}; restart to reload.`,
954
1357
  });
955
1358
  });
1359
+ // Phase F16: batch policy dry-run. Evaluates every
1360
+ // (subject × resource × action) cell against the active engine and
1361
+ // returns a matrix the UI heat-map renders. Gated identically to
1362
+ // the single-call dry-run on GET /api/policy. Capped at 100×100×10
1363
+ // cells per request — a single evaluation per cell is cheap on the
1364
+ // BuiltinPolicyEngine but a careless caller could hose an external
1365
+ // OPA, so the limit fences that. Operators get CSV via
1366
+ // Accept: text/csv for ticket attachments.
1367
+ app.post("/api/policy/dry-run-batch", need("users", "delete"), audit("policy", "read"), async (req, res) => {
1368
+ const body = (req.body ?? {});
1369
+ const subjects = Array.isArray(body.subjects) ? body.subjects : [];
1370
+ const resources = Array.isArray(body.resources) ? body.resources : [];
1371
+ const actions = Array.isArray(body.actions) ? body.actions : [];
1372
+ const result = await evaluateBatch(policyEngine, { subjects, resources, actions }, VALID_RESOURCES, VALID_ACTIONS);
1373
+ if (req.headers["accept"]?.toString().includes("text/csv")) {
1374
+ res.type("text/csv").send(batchResultToCsv(result));
1375
+ return;
1376
+ }
1377
+ res.json(result);
1378
+ });
1379
+ // --- /api/subjects — aggregated principals catalogue ------------------
1380
+ // The third k8s-shaped RBAC view: who the deployment knows about.
1381
+ // Three independent sources, returned in three independent arrays so
1382
+ // the UI can table each section separately:
1383
+ // - users : OMCP_USERS_FILE (basic-mode local users). Password
1384
+ // hashes are never returned.
1385
+ // - apiKeys : OMCP_API_KEYS names (the bearer-token catalogue).
1386
+ // Tokens are never returned; only metadata (tenant,
1387
+ // bound product, source allow-list, bypass flag).
1388
+ // - oidcGroups: keys of OMCP_OIDC_ROLE_MAP — every group the
1389
+ // operator has explicitly mapped to an OMCP role.
1390
+ // Runtime-only groups (claims that arrive without an
1391
+ // OMCP-side mapping) are skipped on purpose; they
1392
+ // produce no roles by definition.
1393
+ // Gated identically to /api/policy.
1394
+ app.get("/api/subjects", need("users", "delete"), async (_req, res) => {
1395
+ // Local users.
1396
+ const usersOut = [];
1397
+ if (process.env.OMCP_USERS_FILE) {
1398
+ try {
1399
+ const f = await readUsersFile(process.env.OMCP_USERS_FILE);
1400
+ if (f && Array.isArray(f.users)) {
1401
+ for (const u of f.users) {
1402
+ usersOut.push({
1403
+ username: u.username,
1404
+ name: u.name,
1405
+ roles: u.roles ? u.roles.slice() : [],
1406
+ tenant: u.tenant || "default",
1407
+ });
1408
+ }
1409
+ }
1410
+ }
1411
+ catch (e) {
1412
+ // Read failures don't 500 the whole endpoint — surface an
1413
+ // empty users array; admins can check the boot log for the
1414
+ // file-load diagnostic.
1415
+ console.warn(`[/api/subjects] readUsersFile failed: ${e.message}`);
1416
+ }
1417
+ }
1418
+ // API key credentials (tokens stripped).
1419
+ const apiKeysOut = [];
1420
+ for (const c of loadCredentials()) {
1421
+ apiKeysOut.push({
1422
+ name: c.name,
1423
+ tenant: c.tenant || "default",
1424
+ productId: c.productId,
1425
+ bypassRedaction: !!c.bypassRedaction,
1426
+ allowedSources: c.allowedSources,
1427
+ });
1428
+ }
1429
+ // OIDC groups → role mappings.
1430
+ const oidcGroupsOut = [];
1431
+ const roleMapRaw = process.env.OMCP_OIDC_ROLE_MAP;
1432
+ if (roleMapRaw) {
1433
+ try {
1434
+ const parsed = JSON.parse(roleMapRaw);
1435
+ if (parsed && typeof parsed === "object" && !Array.isArray(parsed)) {
1436
+ for (const [claim, role] of Object.entries(parsed)) {
1437
+ if (typeof role === "string" && claim) {
1438
+ oidcGroupsOut.push({ claim, role });
1439
+ }
1440
+ }
1441
+ }
1442
+ }
1443
+ catch {
1444
+ // The OIDC runtime already rejects an invalid role map at
1445
+ // boot — if parsing fails here it's almost certainly a
1446
+ // transient state during config reload. Surface empty.
1447
+ }
1448
+ }
1449
+ res.json({
1450
+ users: usersOut,
1451
+ apiKeys: apiKeysOut,
1452
+ oidcGroups: oidcGroupsOut,
1453
+ // Surface which env vars actually drive each list so an
1454
+ // admin diagnosing "where is my user?" sees the source path
1455
+ // without having to read the deploy.
1456
+ sources: {
1457
+ users: process.env.OMCP_USERS_FILE || null,
1458
+ apiKeys: process.env.OMCP_API_KEYS ? "OMCP_API_KEYS" : null,
1459
+ oidcGroups: process.env.OMCP_OIDC_ROLE_MAP ? "OMCP_OIDC_ROLE_MAP" : null,
1460
+ },
1461
+ });
1462
+ });
1463
+ // Update a user's roles. Today this is the only binding-shape that
1464
+ // OMCP can actually mutate at runtime: api-key roles aren't stored
1465
+ // anywhere (creds carry sources / tenant / product but not roles),
1466
+ // and OIDC group → role mappings come from OMCP_OIDC_ROLE_MAP which
1467
+ // is read once at boot. The Bindings UI's api-key + oidc rows
1468
+ // explain the env-source path instead of offering an edit affordance.
1469
+ app.put("/api/users/:username/roles", need("users", "delete"), audit("users", "write"), async (req, res) => {
1470
+ const username = String(req.params.username);
1471
+ const path = process.env.OMCP_USERS_FILE;
1472
+ if (!path) {
1473
+ res.status(409).json({ error: "OMCP_USERS_FILE is not configured — basic-mode user roles can't be edited via the API." });
1474
+ return;
1475
+ }
1476
+ const body = req.body;
1477
+ if (!body || !Array.isArray(body.roles) || !body.roles.every((r) => typeof r === "string")) {
1478
+ res.status(400).json({ error: "body must include { roles: string[] }" });
1479
+ return;
1480
+ }
1481
+ const requestedRoles = body.roles;
1482
+ // Reject role names not in the active policy engine's catalogue —
1483
+ // assigning a user a role that grants nothing is almost always a
1484
+ // typo, not intent. Same defence-in-depth posture as the products
1485
+ // typo guard (PR #343).
1486
+ const knownRoles = new Set(policyEngine.roles());
1487
+ const unknown = requestedRoles.filter((r) => !knownRoles.has(r));
1488
+ if (unknown.length > 0) {
1489
+ res.status(422).json({
1490
+ error: `unknown role name(s) for user '${username}': ${unknown.join(", ")}`,
1491
+ code: "OMCP_USER_UNKNOWN_ROLE",
1492
+ unknown,
1493
+ available: Array.from(knownRoles),
1494
+ });
1495
+ return;
1496
+ }
1497
+ const file = await readUsersFile(path);
1498
+ if (!file) {
1499
+ res.status(404).json({ error: `users file at ${path} is unreadable or empty` });
1500
+ return;
1501
+ }
1502
+ const idx = file.users.findIndex((u) => u.username === username);
1503
+ if (idx < 0) {
1504
+ res.status(404).json({ error: `user '${username}' not found` });
1505
+ return;
1506
+ }
1507
+ file.users[idx].roles = requestedRoles;
1508
+ try {
1509
+ await writeUsersFile(path, file);
1510
+ }
1511
+ catch (e) {
1512
+ res.status(500).json({ error: `failed to persist users file: ${e.message}` });
1513
+ return;
1514
+ }
1515
+ // Refresh the in-memory store so the next login picks up the new
1516
+ // role set without a server restart. maybeReloadUsers stat()s the
1517
+ // file's mtime, which we just bumped via the atomic rename.
1518
+ await maybeReloadUsers();
1519
+ res.json({ ok: true, username, roles: requestedRoles });
1520
+ });
1521
+ // Upsert a role in the file-backed RBAC policy. File engine only:
1522
+ // built-in defaults are immutable in source; OPA is the Rego
1523
+ // source of truth. The UI hides the affordance under non-file
1524
+ // engines via the [data-engine-required="file"] CSS gate; the
1525
+ // endpoint enforces the rule too for defence-in-depth.
1526
+ app.put("/api/policy/roles/:name", need("users", "delete"), audit("users", "write"), async (req, res) => {
1527
+ const name = String(req.params.name);
1528
+ // Reject names with shell-unfriendly characters early so the
1529
+ // YAML round-trip can't accidentally produce an exotic key.
1530
+ if (!/^[A-Za-z0-9][A-Za-z0-9._-]{0,63}$/.test(name)) {
1531
+ res.status(400).json({ error: `role name '${name}' must match [A-Za-z0-9][A-Za-z0-9._-]{0,63}` });
1532
+ return;
1533
+ }
1534
+ const policyFile = process.env.OMCP_RBAC_POLICY_FILE?.trim();
1535
+ if (!policyEngine.kind().startsWith("file:")) {
1536
+ // Built-in (immutable source) or OPA (Rego is the source of
1537
+ // truth) — role authoring isn't available. Return distinct
1538
+ // error codes so the UI can show the right hint without
1539
+ // string-matching the message.
1540
+ const code = policyEngine.kind() === "builtin"
1541
+ ? "OMCP_POLICY_ENGINE_BUILTIN"
1542
+ : policyEngine.kind().startsWith("opa:")
1543
+ ? "OMCP_POLICY_ENGINE_OPA"
1544
+ : "OMCP_POLICY_ENGINE_NOT_FILE";
1545
+ res.status(409).json({
1546
+ error: `role authoring requires the file engine — current is '${policyEngine.kind()}'`,
1547
+ code,
1548
+ });
1549
+ return;
1550
+ }
1551
+ if (!policyFile) {
1552
+ res.status(409).json({
1553
+ error: "OMCP_RBAC_POLICY_FILE is not configured — role authoring writes through that file.",
1554
+ code: "OMCP_POLICY_FILE_NOT_SET",
1555
+ });
1556
+ return;
1557
+ }
1558
+ const body = req.body;
1559
+ if (!body || !Array.isArray(body.permissions)) {
1560
+ res.status(400).json({ error: "body must include { permissions: [{resource, action}] }" });
1561
+ return;
1562
+ }
1563
+ const cleanPerms = [];
1564
+ for (let i = 0; i < body.permissions.length; i++) {
1565
+ const p = body.permissions[i];
1566
+ if (!p || typeof p !== "object" || typeof p.resource !== "string" || typeof p.action !== "string") {
1567
+ res.status(400).json({ error: `body.permissions[${i}] must be { resource: string, action: string }` });
1568
+ return;
1569
+ }
1570
+ if (!VALID_RESOURCES.has(p.resource)) {
1571
+ res.status(422).json({
1572
+ error: `unknown resource '${p.resource}'`,
1573
+ code: "OMCP_POLICY_UNKNOWN_RESOURCE",
1574
+ unknown: p.resource,
1575
+ available: [...VALID_RESOURCES],
1576
+ });
1577
+ return;
1578
+ }
1579
+ if (!VALID_ACTIONS.has(p.action)) {
1580
+ res.status(422).json({
1581
+ error: `unknown action '${p.action}'`,
1582
+ code: "OMCP_POLICY_UNKNOWN_ACTION",
1583
+ unknown: p.action,
1584
+ available: [...VALID_ACTIONS],
1585
+ });
1586
+ return;
1587
+ }
1588
+ cleanPerms.push({ resource: p.resource, action: p.action });
1589
+ }
1590
+ // De-duplicate exact (resource, action) pairs so the file
1591
+ // doesn't accumulate redundant entries via re-saves.
1592
+ const seen = new Set();
1593
+ const dedup = [];
1594
+ for (const p of cleanPerms) {
1595
+ const k = p.resource + ":" + p.action;
1596
+ if (seen.has(k))
1597
+ continue;
1598
+ seen.add(k);
1599
+ dedup.push(p);
1600
+ }
1601
+ // Snapshot the existing map (via raw()) and overlay the upsert.
1602
+ // BuiltinPolicyEngine is the only kind that reaches here per the
1603
+ // checks above.
1604
+ const current = {};
1605
+ if (policyEngine instanceof BuiltinPolicyEngine) {
1606
+ for (const [r, ps] of Object.entries(policyEngine.raw())) {
1607
+ current[r] = ps.slice();
1608
+ }
1609
+ }
1610
+ current[name] = dedup;
1611
+ try {
1612
+ await writePolicyFile(policyFile, current);
1613
+ }
1614
+ catch (e) {
1615
+ if (e instanceof PolicyLoadError) {
1616
+ res.status(422).json({ error: e.message });
1617
+ return;
1618
+ }
1619
+ res.status(500).json({ error: `failed to persist policy: ${e.message}` });
1620
+ return;
1621
+ }
1622
+ // Hot-swap the in-memory engine so the next gate evaluation
1623
+ // picks up the new role without a restart. `replace()` mutates
1624
+ // in-place, so existing middleware closures over `policyEngine`
1625
+ // see the new map immediately.
1626
+ if (policyEngine instanceof BuiltinPolicyEngine) {
1627
+ const fresh = loadPolicyFromFile(policyFile);
1628
+ if (fresh instanceof BuiltinPolicyEngine) {
1629
+ policyEngine.replace(fresh.raw());
1630
+ }
1631
+ }
1632
+ res.json({ ok: true, name, permissions: dedup });
1633
+ });
956
1634
  // --- /api/audit — management-plane audit feed -------------------------
957
1635
  // Read-only, gated by the "audit:read" permission so only viewers /
958
1636
  // operators / admins (basically anyone authenticated in the default
@@ -987,6 +1665,46 @@ async function main() {
987
1665
  scopedTo: tenantFilter || (isAdmin ? null : callerTenant),
988
1666
  });
989
1667
  });
1668
+ // --- /api/audit/dlq — webhook-sink dead-letter queue surface (P9) ---
1669
+ // When the audit webhook is configured AND the receiver exhausted
1670
+ // its retry budget, entries land in the DLQ file. This endpoint
1671
+ // surfaces the count + the last N entries so operators can decide
1672
+ // whether to replay manually. Also refreshes the
1673
+ // `obsmcp_audit_webhook_dlq_depth` gauge so the /metrics scrape
1674
+ // alongside it stays accurate.
1675
+ app.get("/api/audit/dlq", need("audit", "read"), async (_req, res) => {
1676
+ const dlqPath = process.env.OMCP_AUDIT_WEBHOOK_DLQ;
1677
+ if (!dlqPath) {
1678
+ auditDlqDepth.set(0);
1679
+ res.json({ enabled: false, path: null, depth: 0, entries: [] });
1680
+ return;
1681
+ }
1682
+ try {
1683
+ const fs = await import("node:fs/promises");
1684
+ const raw = await fs.readFile(dlqPath, "utf8");
1685
+ const lines = raw.split("\n").filter((l) => l.trim());
1686
+ auditDlqDepth.set(lines.length);
1687
+ const tail = lines.slice(-50).map((l) => {
1688
+ try {
1689
+ return JSON.parse(l);
1690
+ }
1691
+ catch {
1692
+ return { _raw: l, _parseError: true };
1693
+ }
1694
+ });
1695
+ res.json({ enabled: true, path: dlqPath, depth: lines.length, entries: tail });
1696
+ }
1697
+ catch (err) {
1698
+ const code = err.code;
1699
+ if (code === "ENOENT") {
1700
+ auditDlqDepth.set(0);
1701
+ res.json({ enabled: true, path: dlqPath, depth: 0, entries: [] });
1702
+ return;
1703
+ }
1704
+ console.warn("[/api/audit/dlq] read failed:", err);
1705
+ res.status(500).json({ error: err?.message || "DLQ read failed" });
1706
+ }
1707
+ });
990
1708
  // --- /api/usage — per-identity MCP rate-limit snapshot -----------------
991
1709
  // Read-only view of the IdentityRateLimiter's bucket state. Gated by
992
1710
  // need("audit","read") — the same role set that already sees the
@@ -1137,6 +1855,133 @@ async function main() {
1137
1855
  registerOidcRoutes(app, { sessionCfg, oidc: oidcRuntime });
1138
1856
  console.log("[auth] OIDC endpoints registered: /api/auth/oidc/{login,callback,logout}");
1139
1857
  }
1858
+ // Phase F21 / Q6: SCIM 2.0 — opt-in. OMCP_SCIM_TOKEN gates access.
1859
+ // The store backend is chosen by createScimStore from
1860
+ // OMCP_SCIM_BACKEND (file | redis). file (default) → OMCP_SCIM_STORE
1861
+ // on-disk JSON (mode 0600, atomic). redis → a shared snapshot so
1862
+ // multi-replica deployments stay coherent (Q6); the redis client is
1863
+ // built from OMCP_SCIM_REDIS_URL here, mirroring the session store.
1864
+ const scimToken = process.env.OMCP_SCIM_TOKEN?.trim();
1865
+ if (scimToken) {
1866
+ try {
1867
+ const scimBackend = (process.env.OMCP_SCIM_BACKEND?.trim() || "file");
1868
+ let scimRedis;
1869
+ if (scimBackend === "redis") {
1870
+ const redisUrl = process.env.OMCP_SCIM_REDIS_URL?.trim();
1871
+ if (!redisUrl)
1872
+ throw new Error("OMCP_SCIM_BACKEND=redis requires OMCP_SCIM_REDIS_URL");
1873
+ const { createClient } = await import("redis");
1874
+ const client = createClient({ url: redisUrl });
1875
+ client.on("error", (err) => console.warn("[scim] redis client error: %s", err instanceof Error ? err.message : String(err)));
1876
+ await client.connect();
1877
+ scimRedis = client;
1878
+ }
1879
+ const scimStore = await createScimStore({
1880
+ backend: scimBackend,
1881
+ path: process.env.OMCP_SCIM_STORE?.trim() || "/tmp/scim.json",
1882
+ redis: scimRedis,
1883
+ redisKey: process.env.OMCP_SCIM_REDIS_KEY?.trim(),
1884
+ });
1885
+ registerScimRoutes(app, {
1886
+ store: scimStore,
1887
+ bearerToken: scimToken,
1888
+ audit: (ev) => void mgmtAudit.record({
1889
+ actor: { sub: `scim:${ev.actor}` },
1890
+ tenant: "default",
1891
+ resource: "users",
1892
+ action: ev.action.includes("delete") ? "delete" : "write",
1893
+ method: "SCIM",
1894
+ path: `/scim/v2/${ev.action}`,
1895
+ status: ev.status,
1896
+ target: ev.target,
1897
+ }).catch(() => undefined),
1898
+ });
1899
+ console.log("[scim] /scim/v2/* registered (backend: %s)", scimBackend);
1900
+ }
1901
+ catch (err) {
1902
+ console.warn("[scim] enable failed (routes not mounted): %s", err instanceof Error ? err.message : String(err));
1903
+ }
1904
+ }
1905
+ // Phase P6: Postmortems persistence. /api/postmortems lets the
1906
+ // UI list / open / regenerate / delete previously-generated
1907
+ // reports. The storage path comes from OMCP_POSTMORTEMS_FILE (default
1908
+ // /tmp/postmortems.jsonl). When the env is left at its default
1909
+ // the demo still works — operators who want survival across
1910
+ // restarts mount a PVC at the same path and set the env to it.
1911
+ const postmortemStore = new PostmortemStore(process.env.OMCP_POSTMORTEMS_FILE?.trim() || "/tmp/postmortems.jsonl");
1912
+ await postmortemStore.load();
1913
+ // GET /api/postmortems — list (newest-first), tenant-scoped.
1914
+ app.get("/api/postmortems", need("services", "read"), async (req, res) => {
1915
+ const sess = req.session;
1916
+ const tenant = sess?.tenant || "default";
1917
+ const entries = postmortemStore.list(tenant);
1918
+ res.json({
1919
+ total: entries.length,
1920
+ entries: entries.map((e) => ({
1921
+ id: e.id,
1922
+ ts: e.ts,
1923
+ createdBy: e.createdBy,
1924
+ service: e.report.service,
1925
+ window: e.report.window,
1926
+ synopsis: e.report.synopsis,
1927
+ })),
1928
+ });
1929
+ });
1930
+ // GET /api/postmortems/:id — full report (markdown + sections).
1931
+ app.get("/api/postmortems/:id", need("services", "read"), async (req, res) => {
1932
+ const sess = req.session;
1933
+ const tenant = sess?.tenant || "default";
1934
+ const id = String(req.params.id ?? "");
1935
+ const entry = postmortemStore.get(id, tenant);
1936
+ if (!entry) {
1937
+ res.status(404).json({ error: `Postmortem ${id} not found` });
1938
+ return;
1939
+ }
1940
+ res.json(entry);
1941
+ });
1942
+ // POST /api/postmortems — regenerate via the tool handler +
1943
+ // persist. Body: { service, duration? }; the format is always forced to JSON. Returns the
1944
+ // stored entry with its id.
1945
+ app.post("/api/postmortems", need("services", "write"), async (req, res) => {
1946
+ const body = (req.body ?? {});
1947
+ if (!body.service || typeof body.service !== "string") {
1948
+ res.status(400).json({ error: "service is required" });
1949
+ return;
1950
+ }
1951
+ const sess = req.session;
1952
+ const tenant = sess?.tenant || "default";
1953
+ const createdBy = sess?.sub || sess?.name || "unknown";
1954
+ try {
1955
+ // Force JSON so we get the structured report shape back from
1956
+ // the tool, not just the markdown body. We persist the full
1957
+ // structured report; the markdown lives inside `report.markdown`.
1958
+ const ctx = { ...defaultContext(), tenant, principalId: createdBy };
1959
+ const result = await generatePostmortemHandler(registry, { service: body.service, duration: body.duration, format: "json" }, ctx);
1960
+ const text = result?.content?.[0]?.text;
1961
+ if (!text) {
1962
+ res.status(500).json({ error: "generate_postmortem returned no content" });
1963
+ return;
1964
+ }
1965
+ const report = JSON.parse(text);
1966
+ const stored = await postmortemStore.append({ report, createdBy, tenant });
1967
+ res.status(201).json(stored);
1968
+ }
1969
+ catch (e) {
1970
+ console.warn(`[postmortems] regen failed:`, e);
1971
+ res.status(500).json({ error: e?.message || "internal error" });
1972
+ }
1973
+ });
1974
+ // DELETE /api/postmortems/:id — admin-gated.
1975
+ app.delete("/api/postmortems/:id", need("services", "delete"), async (req, res) => {
1976
+ const sess = req.session;
1977
+ const tenant = sess?.tenant || "default";
1978
+ const ok = await postmortemStore.delete(String(req.params.id ?? ""), tenant);
1979
+ if (!ok) {
1980
+ res.status(404).json({ error: `Postmortem ${req.params.id} not found` });
1981
+ return;
1982
+ }
1983
+ res.status(204).end();
1984
+ });
1140
1985
  // Connectors currently loaded into this server (builtin + filesystem
1141
1986
  // plugins), with manifest metadata — drives the UI "Connectors" page.
1142
1987
  app.get("/api/connectors", (_req, res) => {
@@ -1328,9 +2173,13 @@ async function main() {
1328
2173
  rmSync(work, { recursive: true, force: true });
1329
2174
  }
1330
2175
  });
1331
- // Add a new source
2176
+ // Add a new source — tenant-aware. Non-admins can only create
2177
+ // sources in their own tenant; admins may set any tenant or leave
2178
+ // unset (global). Untagged inputs default to undefined (global) for
2179
+ // admins and to the caller's own tenant for non-admins, so a
2180
+ // tenant-bound user can't accidentally pollute the global pool.
1332
2181
  app.post("/api/sources", installRateLimit, need("sources", "write"), audit("sources", "write"), async (req, res) => {
1333
- const { name, type, url, enabled, auth, tls } = req.body;
2182
+ const { name, type, url, enabled, auth, tls, tenant: bodyTenant } = req.body;
1334
2183
  if (!name || !type || !url) {
1335
2184
  res.status(400).json({ error: "name, type, and url are required" });
1336
2185
  return;
@@ -1340,22 +2189,40 @@ async function main() {
1340
2189
  res.status(400).json({ error: urlErr });
1341
2190
  return;
1342
2191
  }
2192
+ const sess = req.session;
2193
+ const isAdmin = hasPermission(sess?.roles, "users", "delete");
2194
+ const callerTenant = sess?.tenant || "default";
2195
+ const resolvedTenant = isAdmin
2196
+ ? (typeof bodyTenant === "string" && bodyTenant ? bodyTenant : undefined)
2197
+ : (typeof bodyTenant === "string" && bodyTenant && bodyTenant !== callerTenant
2198
+ ? "__deny__"
2199
+ : callerTenant);
2200
+ if (resolvedTenant === "__deny__") {
2201
+ res.status(403).json({ error: "cannot create source in another tenant" });
2202
+ return;
2203
+ }
1343
2204
  const existing = registry.getSourceConfigs().find((s) => s.name === name);
1344
2205
  if (existing) {
1345
2206
  res.status(409).json({ error: `Source "${name}" already exists` });
1346
2207
  return;
1347
2208
  }
1348
- const source = { name, type, url, enabled: enabled !== false, auth, tls };
2209
+ const source = { name, type, url, enabled: enabled !== false, auth, tls, tenant: resolvedTenant };
1349
2210
  await registry.addSource(source);
1350
2211
  saveConfig(config = { ...config, sources: registry.getSourceConfigs() });
1351
2212
  res.status(201).json({ ok: true, source });
1352
2213
  });
1353
- // Update an existing source
2214
+ // Update an existing source — tenant-aware. Non-admins editing a
2215
+ // cross-tenant source get the same 404 they'd get for "no such
2216
+ // source" (no existence leak). Admins may move a source between
2217
+ // tenants by setting body.tenant; non-admins cannot.
1354
2218
  app.put("/api/sources/:name", need("sources", "write"), audit("sources", "write"), async (req, res) => {
1355
2219
  const oldName = String(req.params.name);
1356
- const { name, type, url, enabled, auth, tls } = req.body;
2220
+ const { name, type, url, enabled, auth, tls, tenant: bodyTenant } = req.body;
1357
2221
  const existing = registry.getSourceConfigs().find((s) => s.name === oldName);
1358
- if (!existing) {
2222
+ const sess = req.session;
2223
+ const isAdmin = hasPermission(sess?.roles, "users", "delete");
2224
+ const callerTenant = sess?.tenant || "default";
2225
+ if (!existing || (!isAdmin && existing.tenant && existing.tenant !== callerTenant)) {
1359
2226
  res.status(404).json({ error: `Source "${oldName}" not found` });
1360
2227
  return;
1361
2228
  }
@@ -1367,6 +2234,19 @@ async function main() {
1367
2234
  return;
1368
2235
  }
1369
2236
  }
2237
+ let nextTenant = existing.tenant;
2238
+ if (bodyTenant !== undefined) {
2239
+ if (!isAdmin) {
2240
+ // Non-admin attempting tenant reassignment — disallow.
2241
+ if (bodyTenant !== existing.tenant) {
2242
+ res.status(403).json({ error: "cannot change source tenant" });
2243
+ return;
2244
+ }
2245
+ }
2246
+ else {
2247
+ nextTenant = typeof bodyTenant === "string" && bodyTenant ? bodyTenant : undefined;
2248
+ }
2249
+ }
1370
2250
  const source = {
1371
2251
  name: name || oldName,
1372
2252
  type: type || existing.type,
@@ -1374,16 +2254,20 @@ async function main() {
1374
2254
  enabled: enabled !== undefined ? enabled : existing.enabled,
1375
2255
  auth: auth !== undefined ? auth : existing.auth,
1376
2256
  tls: tls !== undefined ? tls : existing.tls,
2257
+ tenant: nextTenant,
1377
2258
  };
1378
2259
  await registry.updateSource(oldName, source);
1379
2260
  saveConfig(config = { ...config, sources: registry.getSourceConfigs() });
1380
2261
  res.json({ ok: true, source });
1381
2262
  });
1382
- // Delete a source
2263
+ // Delete a source — same cross-tenant 404 posture.
1383
2264
  app.delete("/api/sources/:name", need("sources", "delete"), audit("sources", "delete"), async (req, res) => {
1384
2265
  const name = String(req.params.name);
1385
2266
  const existing = registry.getSourceConfigs().find((s) => s.name === name);
1386
- if (!existing) {
2267
+ const sess = req.session;
2268
+ const isAdmin = hasPermission(sess?.roles, "users", "delete");
2269
+ const callerTenant = sess?.tenant || "default";
2270
+ if (!existing || (!isAdmin && existing.tenant && existing.tenant !== callerTenant)) {
1387
2271
  res.status(404).json({ error: `Source "${name}" not found` });
1388
2272
  return;
1389
2273
  }
@@ -1417,7 +2301,10 @@ async function main() {
1417
2301
  app.patch("/api/sources/:name/toggle", need("sources", "write"), audit("sources", "write"), async (req, res) => {
1418
2302
  const name = String(req.params.name);
1419
2303
  const existing = registry.getSourceConfigs().find((s) => s.name === name);
1420
- if (!existing) {
2304
+ const sess = req.session;
2305
+ const isAdmin = hasPermission(sess?.roles, "users", "delete");
2306
+ const callerTenant = sess?.tenant || "default";
2307
+ if (!existing || (!isAdmin && existing.tenant && existing.tenant !== callerTenant)) {
1421
2308
  res.status(404).json({ error: `Source "${name}" not found` });
1422
2309
  return;
1423
2310
  }
@@ -1440,7 +2327,10 @@ async function main() {
1440
2327
  try {
1441
2328
  const sess = req.session;
1442
2329
  const callerTenant = sess?.tenant || "default";
1443
- const result = await listServicesHandler(registry, {}, defaultContext());
2330
+ // sessionContext threads the caller's tenant into the handler so
2331
+ // PR #331's per-tenant connector scoping fires for the dashboard
2332
+ // surface too (was previously bypassed with defaultContext()).
2333
+ const result = await listServicesHandler(registry, {}, sessionContext(sess));
1444
2334
  const parsed = parseToolResult(result);
1445
2335
  // Tenant-scope catalog enrichment so a viewer in tenant A
1446
2336
  // doesn't accidentally see acme's owner/SLO metadata on a
@@ -1483,7 +2373,9 @@ async function main() {
1483
2373
  // Same scoping / staging-visibility pattern as /api/catalog. Non-admins
1484
2374
  // see only their own tenant's PUBLISHED products; admins see all
1485
2375
  // tenants by default + staging.
1486
- app.get("/api/products", need("products", "read"), (req, res) => {
2376
+ app.get("/api/products", need("products", "read"), async (req, res) => {
2377
+ // Pick up out-of-band edits before serving — see ProductsStore docs.
2378
+ await products.maybeReload();
1487
2379
  const sess = req.session;
1488
2380
  const isAdmin = hasPermission(sess?.roles, "users", "delete");
1489
2381
  const callerTenant = sess?.tenant || "default";
@@ -1497,6 +2389,67 @@ async function main() {
1497
2389
  includesStaging: includeStaging,
1498
2390
  });
1499
2391
  });
2392
+ // Create a new product (REST convention: POST = create, 409 on
2393
+ // conflict). Same tenancy + typo-guard posture as PUT. The PUT
2394
+ // upsert path remains for the existing UI; new integrations that
2395
+ // want strict create-vs-update semantics use POST.
2396
+ app.post("/api/products", need("products", "write"), audit("products", "write"), async (req, res) => {
2397
+ await products.maybeReload();
2398
+ const sess = req.session;
2399
+ const isAdmin = hasPermission(sess?.roles, "users", "delete");
2400
+ const callerTenant = sess?.tenant || "default";
2401
+ const body = req.body;
2402
+ if (!body || typeof body !== "object" || Array.isArray(body)) {
2403
+ res.status(400).json({ error: "body must be a product object" });
2404
+ return;
2405
+ }
2406
+ if (typeof body.id !== "string" || !body.id) {
2407
+ res.status(400).json({ error: "body.id is required" });
2408
+ return;
2409
+ }
2410
+ let validated;
2411
+ try {
2412
+ validated = validateProduct(body, `POST /api/products`);
2413
+ }
2414
+ catch (e) {
2415
+ if (e instanceof ProductsLoadError) {
2416
+ res.status(400).json({ error: e.message });
2417
+ return;
2418
+ }
2419
+ throw e;
2420
+ }
2421
+ if (validated.tools && validated.tools.length > 0) {
2422
+ const unknown = unknownToolNames(validated.tools);
2423
+ if (unknown.length > 0) {
2424
+ res.status(422).json({
2425
+ error: `unknown tool name(s) in product '${validated.id}': ${unknown.join(", ")}`,
2426
+ code: "OMCP_PRODUCT_UNKNOWN_TOOL",
2427
+ unknown,
2428
+ available: REGISTERED_TOOL_NAMES,
2429
+ });
2430
+ return;
2431
+ }
2432
+ }
2433
+ if (!isAdmin && (validated.tenant || "default") !== callerTenant) {
2434
+ res.status(403).json({ error: "cannot create product in another tenant" });
2435
+ return;
2436
+ }
2437
+ if (products.get(validated.id)) {
2438
+ res.status(409).json({ error: `product '${validated.id}' already exists; use PUT to update` });
2439
+ return;
2440
+ }
2441
+ const next = products.upsert(validated);
2442
+ if (process.env.OMCP_PRODUCTS_FILE) {
2443
+ try {
2444
+ await writeProductsFile(process.env.OMCP_PRODUCTS_FILE, next);
2445
+ await products.pinMtimeAfterWrite();
2446
+ }
2447
+ catch (e) {
2448
+ console.warn(`[products] POST ${validated.id}: failed to persist to ${process.env.OMCP_PRODUCTS_FILE}: ${e.message} — in-memory state is still updated`);
2449
+ }
2450
+ }
2451
+ res.status(201).json({ product: validated, persisted: !!process.env.OMCP_PRODUCTS_FILE });
2452
+ });
1500
2453
  // Upsert a product. Body is the same shape as a single entry
1501
2454
  // in OMCP_PRODUCTS_FILE. The URL-path id must match the body id
1502
2455
  // (defence-in-depth: the gate keys on body, the path keys the
@@ -1504,6 +2457,9 @@ async function main() {
1504
2457
  // updated catalogue back to disk so the change survives a
1505
2458
  // restart; without the file, the upsert is in-memory only.
1506
2459
  app.put("/api/products/:id", need("products", "write"), audit("products", "write"), async (req, res) => {
2460
+ // Hot-reload before mutating so a concurrent on-disk edit isn't
2461
+ // silently clobbered by our in-memory snapshot.
2462
+ await products.maybeReload();
1507
2463
  const id = String(req.params.id);
1508
2464
  const sess = req.session;
1509
2465
  const isAdmin = hasPermission(sess?.roles, "users", "delete");
@@ -1532,6 +2488,23 @@ async function main() {
1532
2488
  }
1533
2489
  throw e;
1534
2490
  }
2491
+ // Typo guard: a Product whose `tools` allow-list names tools
2492
+ // that don't actually register would bind a credential to an
2493
+ // empty /mcp tool surface (silent dead session). Reject with
2494
+ // 422 + a hint of valid tool names so the operator can see the
2495
+ // offending name immediately.
2496
+ if (validated.tools && validated.tools.length > 0) {
2497
+ const unknown = unknownToolNames(validated.tools);
2498
+ if (unknown.length > 0) {
2499
+ res.status(422).json({
2500
+ error: `unknown tool name(s) in product '${id}': ${unknown.join(", ")}`,
2501
+ code: "OMCP_PRODUCT_UNKNOWN_TOOL",
2502
+ unknown,
2503
+ available: REGISTERED_TOOL_NAMES,
2504
+ });
2505
+ return;
2506
+ }
2507
+ }
1535
2508
  // Tenant gate: non-admins can only write into their own tenant.
1536
2509
  if (!isAdmin && (validated.tenant || "default") !== callerTenant) {
1537
2510
  res.status(403).json({ error: "cannot write product into another tenant" });
@@ -1549,6 +2522,10 @@ async function main() {
1549
2522
  if (process.env.OMCP_PRODUCTS_FILE) {
1550
2523
  try {
1551
2524
  await writeProductsFile(process.env.OMCP_PRODUCTS_FILE, next);
2525
+ // Advance our mtime cursor past this write so the next
2526
+ // maybeReload() doesn't treat our own change as an external
2527
+ // edit and re-read what we just persisted.
2528
+ await products.pinMtimeAfterWrite();
1552
2529
  }
1553
2530
  catch (e) {
1554
2531
  console.warn(`[products] PUT ${id}: failed to persist to ${process.env.OMCP_PRODUCTS_FILE}: ${e.message} — in-memory state is still updated`);
@@ -1557,6 +2534,7 @@ async function main() {
1557
2534
  res.json({ product: validated, persisted: !!process.env.OMCP_PRODUCTS_FILE });
1558
2535
  });
1559
2536
  app.delete("/api/products/:id", need("products", "delete"), audit("products", "delete"), async (req, res) => {
2537
+ await products.maybeReload();
1560
2538
  const id = String(req.params.id);
1561
2539
  const sess = req.session;
1562
2540
  const isAdmin = hasPermission(sess?.roles, "users", "delete");
@@ -1574,6 +2552,7 @@ async function main() {
1574
2552
  if (process.env.OMCP_PRODUCTS_FILE) {
1575
2553
  try {
1576
2554
  await writeProductsFile(process.env.OMCP_PRODUCTS_FILE, next);
2555
+ await products.pinMtimeAfterWrite();
1577
2556
  }
1578
2557
  catch (e) {
1579
2558
  console.warn(`[products] DELETE ${id}: failed to persist to ${process.env.OMCP_PRODUCTS_FILE}: ${e.message} — in-memory state is still updated`);
@@ -1584,7 +2563,8 @@ async function main() {
1584
2563
  // Single product by id. Non-admins get a 404 (not 403) on a
1585
2564
  // cross-tenant probe so the existence of the product isn't leaked
1586
2565
  // — same posture as the rest of the tenancy layer.
1587
- app.get("/api/products/:id", need("products", "read"), (req, res) => {
2566
+ app.get("/api/products/:id", need("products", "read"), async (req, res) => {
2567
+ await products.maybeReload();
1588
2568
  const sess = req.session;
1589
2569
  const isAdmin = hasPermission(sess?.roles, "users", "delete");
1590
2570
  const callerTenant = sess?.tenant || "default";
@@ -1603,12 +2583,48 @@ async function main() {
1603
2583
  }
1604
2584
  res.json(p);
1605
2585
  });
2586
// Agent preview for a single product: reports which registered tools
// pass the product's `tools` allow-list, so a UI can show what a
// credential bound to this product would be offered. Visibility rules:
// admins (callers holding users:delete) look up the product across
// tenants; everyone else is scoped to their own tenant and additionally
// cannot see products whose status is "staging". Every "not visible"
// case answers with the same 404 so existence is not revealed.
app.get("/api/products/:id/preview", need("products", "read"), async (req, res) => {
  // Pick up on-disk catalogue edits before answering.
  await products.maybeReload();
  const session = req.session;
  const admin = hasPermission(session?.roles, "users", "delete");
  const tenantScope = admin ? undefined : (session?.tenant || "default");
  const product = products.get(String(req.params.id), tenantScope);
  const hidden = !product || (!admin && product.status === "staging");
  if (hidden) {
    res.status(404).json({ error: "not found" });
    return;
  }
  const { id, name, version, branding, tenant, status, tools } = product;
  // An absent or empty `tools` list means "no filter": the allow-list is
  // left undefined and `unrestricted` is reported as true.
  const allowList = tools && tools.length > 0 ? tools : undefined;
  res.json({
    // Identity/branding fields travel with the tool list so the preview
    // can be rendered under the product's own name.
    product: { id, name, version, branding, tenant, status },
    unrestricted: !allowList,
    tools: REGISTERED_TOOLS.filter((tool) => allowsTool(allowList, tool.name)),
  });
});
1606
2621
  // Health endpoint for UI dashboard
1607
2622
  app.get("/api/health/:service", async (req, res) => {
1608
2623
  try {
1609
- const callerTenant = req.session?.tenant || "default";
2624
+ const sess = req.session;
2625
+ const callerTenant = sess?.tenant || "default";
1610
2626
  const service = String(req.params.service);
1611
- const result = await getServiceHealthHandler(registry, { service }, defaultContext());
2627
+ const result = await getServiceHealthHandler(registry, { service }, sessionContext(sess));
1612
2628
  const parsed = parseToolResult(result);
1613
2629
  const entry = catalog.get(service, callerTenant);
1614
2630
  if (entry && parsed && typeof parsed === "object")
@@ -1622,14 +2638,16 @@ async function main() {
1622
2638
  // Health for all services
1623
2639
  app.get("/api/health", async (req, res) => {
1624
2640
  try {
1625
- const callerTenant = req.session?.tenant || "default";
1626
- const servicesResult = await listServicesHandler(registry, {}, defaultContext());
2641
+ const sess = req.session;
2642
+ const callerTenant = sess?.tenant || "default";
2643
+ const ctx = sessionContext(sess);
2644
+ const servicesResult = await listServicesHandler(registry, {}, ctx);
1627
2645
  const parsed = parseToolResult(servicesResult);
1628
2646
  const services = parsed?.services || [];
1629
2647
  const health = {};
1630
2648
  for (const svc of services) {
1631
2649
  try {
1632
- const result = await getServiceHealthHandler(registry, { service: svc.name }, defaultContext());
2650
+ const result = await getServiceHealthHandler(registry, { service: svc.name }, ctx);
1633
2651
  const h = parseToolResult(result);
1634
2652
  // Same tenant scoping as /api/services to avoid the
1635
2653
  // dashboard cross-tenant catalog leak the reviewer
@@ -1653,12 +2671,18 @@ async function main() {
1653
2671
  // Returns the union of topology snapshots across all topology-capable
1654
2672
  // connectors (today only "kubernetes"). One JSON document so the UI can
1655
2673
  // render summary + grouped views without N round-trips.
1656
- app.get("/api/topology", async (_req, res) => {
2674
+ app.get("/api/topology", async (req, res) => {
1657
2675
  try {
2676
+ const sess = req.session;
2677
+ const callerTenant = sess?.tenant || "default";
1658
2678
  const sources = [];
1659
2679
  const allResources = [];
1660
2680
  const allEdges = [];
1661
- for (const c of registry.getAll()) {
2681
+ // Tenant-scoped: non-anonymous callers only see topology from
2682
+ // connectors their tenant can reach. Anonymous mode keeps the
2683
+ // global view (single-tenant default).
2684
+ const connectors = sess ? registry.getByTenant(callerTenant) : registry.getAll();
2685
+ for (const c of connectors) {
1662
2686
  if (!isTopologyProvider(c))
1663
2687
  continue;
1664
2688
  const snap = await c.getTopologySnapshot();
@@ -1710,9 +2734,19 @@ async function main() {
1710
2734
  // --- Per-Source Metrics API ---
1711
2735
  // Get metrics for a source (active metrics or defaults)
1712
2736
  app.get("/api/sources/:name/metrics", (req, res) => {
1713
- const connector = registry.getByName(String(req.params.name));
2737
+ const name = String(req.params.name);
2738
+ const sess = req.session;
2739
+ const isAdmin = hasPermission(sess?.roles, "users", "delete");
2740
+ const callerTenant = sess?.tenant || "default";
2741
+ // Tenant-aware: getByNameForTenant returns undefined for both
2742
+ // "doesn't exist" and "cross-tenant" — same no-leak posture as
2743
+ // /api/sources GET/PUT/DELETE. Anonymous / admin keep the
2744
+ // single-tenant behaviour by falling back to getByName.
2745
+ const connector = (sess && !isAdmin)
2746
+ ? registry.getByNameForTenant(name, callerTenant)
2747
+ : registry.getByName(name);
1714
2748
  if (!connector) {
1715
- res.status(404).json({ error: `Source "${String(req.params.name)}" not found` });
2749
+ res.status(404).json({ error: `Source "${name}" not found` });
1716
2750
  return;
1717
2751
  }
1718
2752
  res.json({
@@ -1720,11 +2754,15 @@ async function main() {
1720
2754
  defaults: connector.getDefaultMetrics(),
1721
2755
  });
1722
2756
  });
1723
- // Update metrics for a source
2757
+ // Update metrics for a source — tenant-aware mutation.
1724
2758
  app.put("/api/sources/:name/metrics", need("sources", "write"), audit("sources", "write"), async (req, res) => {
1725
2759
  const name = String(req.params.name);
1726
2760
  const sourceIdx = config.sources.findIndex((s) => s.name === name);
1727
- if (sourceIdx === -1) {
2761
+ const sess = req.session;
2762
+ const isAdmin = hasPermission(sess?.roles, "users", "delete");
2763
+ const callerTenant = sess?.tenant || "default";
2764
+ const src = sourceIdx >= 0 ? config.sources[sourceIdx] : undefined;
2765
+ if (!src || (!isAdmin && src.tenant && src.tenant !== callerTenant)) {
1728
2766
  res.status(404).json({ error: `Source "${name}" not found` });
1729
2767
  return;
1730
2768
  }
@@ -1734,11 +2772,15 @@ async function main() {
1734
2772
  saveConfig(config);
1735
2773
  res.json({ ok: true });
1736
2774
  });
1737
- // Reset a source's metrics to connector defaults
2775
+ // Reset a source's metrics to connector defaults — tenant-aware.
1738
2776
  app.delete("/api/sources/:name/metrics", need("sources", "write"), audit("sources", "write"), async (req, res) => {
1739
2777
  const name = String(req.params.name);
1740
2778
  const sourceIdx = config.sources.findIndex((s) => s.name === name);
1741
- if (sourceIdx === -1) {
2779
+ const sess = req.session;
2780
+ const isAdmin = hasPermission(sess?.roles, "users", "delete");
2781
+ const callerTenant = sess?.tenant || "default";
2782
+ const src = sourceIdx >= 0 ? config.sources[sourceIdx] : undefined;
2783
+ if (!src || (!isAdmin && src.tenant && src.tenant !== callerTenant)) {
1742
2784
  res.status(404).json({ error: `Source "${name}" not found` });
1743
2785
  return;
1744
2786
  }
@@ -1749,7 +2791,7 @@ async function main() {
1749
2791
  });
1750
2792
  // Stdio transport: one server over stdin/stdout, no HTTP listener.
1751
2793
  if (STDIO) {
1752
- const server = createMcpServer(defaultContext());
2794
+ const { mcpServer: server } = createMcpServer(defaultContext());
1753
2795
  await server.connect(new StdioServerTransport());
1754
2796
  console.error(`observability-mcp running on stdio transport · connectors: ${registry
1755
2797
  .getAll()
@@ -1760,6 +2802,12 @@ async function main() {
1760
2802
  // MCP Streamable HTTP transport — stateful sessions
1761
2803
  const transports = new Map();
1762
2804
  const sessionLastActive = new Map();
2805
+ // Phase F9: per-session tag identifying the virtual-server slug a
2806
+ // session was issued under (or undefined for the root /mcp surface).
2807
+ // Used to prevent a session minted on /mcp/v/foo from being probed
2808
+ // via /mcp/v/bar — the GET/DELETE handlers refuse the cross-product
2809
+ // lookup.
2810
+ const sessionProduct = new Map();
1763
2811
  const SESSION_TTL_MS = 30 * 60 * 1000; // 30 min idle timeout
1764
2812
  // Clean up idle sessions every 5 minutes
1765
2813
  setInterval(() => {
@@ -1768,6 +2816,7 @@ async function main() {
1768
2816
  if (now - lastActive > SESSION_TTL_MS) {
1769
2817
  transports.delete(sid);
1770
2818
  sessionLastActive.delete(sid);
2819
+ sessionProduct.delete(sid);
1771
2820
  console.log(`Session ${sid} expired (idle)`);
1772
2821
  }
1773
2822
  }
@@ -1781,8 +2830,22 @@ async function main() {
1781
2830
  // 429 with a Retry-After. Anonymous /mcp traffic (no OMCP_API_KEYS
1782
2831
  // configured) bypasses this — the global express-rate-limit IP gate
1783
2832
  // still applies. Override via OMCP_TOOL_RATE_PER_MIN.
2833
// Per-credential rate caps. OMCP_KEY_RATE_PER_MIN (for example
// "agent=600;ci=240") is parsed into a lookup keyed by credential name
// and handed to the limiter as `limitFor`; it is meant to take
// precedence over the global OMCP_TOOL_RATE_PER_MIN for the credentials
// it names.
const keyRateLimits = parseKeyRateLimits(process.env.OMCP_KEY_RATE_PER_MIN);
const toolRateLimiter = new IdentityRateLimiter({
  limit: resolveToolRatePerMin(process.env.OMCP_TOOL_RATE_PER_MIN),
  // No overrides configured -> no per-identity hook at all.
  limitFor: keyRateLimits.size !== 0
    ? (identity) => {
        // Limiter identities are composed as "<tenant> <credName>".
        // Everything after the FIRST space is taken as the credential
        // name, so credential names may contain spaces but a tenant
        // containing one would yield the wrong lookup key. An identity
        // with no space at all is used as-is.
        const firstSpace = identity.indexOf(" ");
        const credName = firstSpace < 0 ? identity : identity.slice(firstSpace + 1);
        return keyRateLimits.get(credName);
      }
    : undefined,
});
1787
2850
  // Per-identity tracker key. Composes tenant + principalId so two
1788
2851
  // credentials of the same name in different tenants don't share
@@ -1822,7 +2885,7 @@ async function main() {
1822
2885
  }
1823
2886
  // Bearer/X-API-Key on every /mcp request; resolve the principal + its
1824
2887
  // coarse source allow-list into the RequestContext.
1825
- function gateCtx(req, res) {
2888
+ async function gateCtx(req, res) {
1826
2889
  if (!credentialsConfigured())
1827
2890
  return defaultContext();
1828
2891
  const cred = resolveToken(extractToken(req.headers), loadCredentials());
@@ -1853,13 +2916,33 @@ async function main() {
1853
2916
  });
1854
2917
  return null;
1855
2918
  }
2919
+ // Resolve the credential's bound Product (OMCP_KEY_PRODUCTS) into
2920
+ // a concrete tools allow-list. Cross-tenant Products are invisible
2921
+ // — products.get() returns undefined when the productId belongs to
2922
+ // another tenant, mirroring the rest of the tenancy layer. A bound
2923
+ // Product whose own `tools` field is absent / empty leaves the
2924
+ // allow-list undefined (== unrestricted), matching the YAML
2925
+ // loader's "no tools key = no restriction" semantics.
2926
+ let allowedTools;
2927
+ if (cred.productId) {
2928
+ // Pick up out-of-band edits to OMCP_PRODUCTS_FILE before each
2929
+ // /mcp request — cheap (one stat), keeps the binding live.
2930
+ // Best-effort: if the catalogue reload fails we keep the prior
2931
+ // good state (the store handles that internally) rather than
2932
+ // failing the request.
2933
+ await products.maybeReload().catch(() => undefined);
2934
+ const p = products.get(cred.productId, credTenant);
2935
+ if (p && p.tools && p.tools.length > 0)
2936
+ allowedTools = p.tools.slice();
2937
+ }
1856
2938
  return principalContext(cred.name, cred.allowedSources, {
1857
2939
  allowBypassRedaction: cred.bypassRedaction,
1858
2940
  tenant: cred.tenant,
2941
+ allowedTools,
1859
2942
  });
1860
2943
  }
1861
2944
  app.post("/mcp", async (req, res) => {
1862
- const ctx = gateCtx(req, res);
2945
+ const ctx = await gateCtx(req, res);
1863
2946
  if (!ctx)
1864
2947
  return;
1865
2948
  const sessionId = req.headers["mcp-session-id"];
@@ -1881,7 +2964,7 @@ async function main() {
1881
2964
  }
1882
2965
  mcpActiveSessions.set(transports.size);
1883
2966
  };
1884
- const sessionMcpServer = createMcpServer(ctx);
2967
+ const { mcpServer: sessionMcpServer } = createMcpServer(ctx);
1885
2968
  await sessionMcpServer.connect(transport);
1886
2969
  }
1887
2970
  await transport.handleRequest(req, res, req.body);
@@ -1895,7 +2978,7 @@ async function main() {
1895
2978
  mcpActiveSessions.set(transports.size);
1896
2979
  });
1897
2980
  app.get("/mcp", async (req, res) => {
1898
- if (!gateCtx(req, res))
2981
+ if (!(await gateCtx(req, res)))
1899
2982
  return;
1900
2983
  const sessionId = req.headers["mcp-session-id"];
1901
2984
  const transport = transports.get(sessionId);
@@ -1906,7 +2989,7 @@ async function main() {
1906
2989
  await transport.handleRequest(req, res);
1907
2990
  });
1908
2991
  app.delete("/mcp", async (req, res) => {
1909
- if (!gateCtx(req, res))
2992
+ if (!(await gateCtx(req, res)))
1910
2993
  return;
1911
2994
  const sessionId = req.headers["mcp-session-id"];
1912
2995
  const transport = transports.get(sessionId);
@@ -1914,18 +2997,244 @@ async function main() {
1914
2997
  await transport.handleRequest(req, res);
1915
2998
  transports.delete(sessionId);
1916
2999
  sessionLastActive.delete(sessionId);
3000
+ sessionProduct.delete(sessionId);
3001
+ }
3002
+ else {
3003
+ res.status(400).json({ error: "No active session" });
3004
+ }
3005
+ });
3006
+ // Phase F9: virtual servers — every Product gets its own MCP
3007
+ // endpoint at /mcp/v/<slug> that exposes only the tools bound to
3008
+ // that Product, with the caller's existing tenant + RBAC scoping
3009
+ // preserved. The narrow ctx flows into createMcpServer's
3010
+ // registerTool gate, so the surface a /mcp/v/<slug> client sees is
3011
+ // strictly product.tools (intersected with any pre-existing
3012
+ // allowedTools the credential already carries).
3013
/**
 * Combine two optional tool allow-lists.
 *
 * A falsy list means "no restriction", so when either side is missing
 * the other side is returned untouched (same reference). When both are
 * present the result is a new array holding the entries of `a` that
 * also occur in `b`, in `a`'s order (duplicates in `a` are kept).
 *
 * @param {string[] | undefined} a - first allow-list, or falsy for none
 * @param {string[] | undefined} b - second allow-list, or falsy for none
 * @returns {string[] | undefined} the combined allow-list
 */
function intersectAllowed(a, b) {
  if (a && b) {
    const permitted = new Set(b);
    return a.filter((tool) => permitted.has(tool));
  }
  return a || b;
}
+ }
3021
/**
 * Resolve the `:slug` of a /mcp/v/<slug> request to a product visible to
 * the caller's tenant and derive the narrowed request context for it.
 *
 * On failure a 404 `{ error: "virtual server not found" }` is written to
 * `res` and `null` is returned; missing, cross-tenant and staging
 * products are deliberately indistinguishable to the caller.
 *
 * @param req     request carrying `params.slug`
 * @param res     response used only for the 404 path
 * @param baseCtx context produced by the credential gate; its `tenant`
 *                (default "default") scopes the product lookup and its
 *                `allowedTools` is intersected with the product's tools
 * @returns {Promise<{product: object, ctx: object} | null>}
 */
async function resolveVirtualProduct(req, res, baseCtx) {
  const slug = req.params.slug;
  if (!slug || typeof slug !== "string") {
    res.status(404).json({ error: "virtual server not found" });
    return null;
  }
  // Best-effort hot reload so newly published products are reachable
  // without a restart; a failed reload must not fail the request.
  await products.maybeReload().catch(() => undefined);
  const tenant = baseCtx.tenant || "default";
  const product = products.get(slug, tenant);
  if (!product || product.status === "staging") {
    res.status(404).json({ error: "virtual server not found" });
    return null;
  }
  // An absent OR empty `tools` list means "unrestricted", matching how
  // the credential gate and the preview route read the same field.
  // Passing an empty array through would instead intersect down to an
  // empty allow-list. The copy keeps the context from aliasing the
  // catalogue's own array.
  const productTools = product.tools && product.tools.length > 0
    ? [...product.tools]
    : undefined;
  const allowedTools = intersectAllowed(baseCtx.allowedTools, productTools);
  const ctx = { ...baseCtx, allowedTools };
  return { product, ctx };
}
+ }
3042
// POST /mcp/v/:slug — JSON-RPC entry point of a product's virtual MCP
// server. Reuses an existing session when the `mcp-session-id` header
// names one that was issued under this same product; otherwise creates a
// fresh transport + McpServer bound to the product-narrowed context.
app.post("/mcp/v/:slug", async (req, res) => {
  // Both helpers return a falsy value when the request must not proceed
  // (resolveVirtualProduct writes its own 404 in that case).
  const baseCtx = await gateCtx(req, res);
  if (!baseCtx)
    return;
  const resolved = await resolveVirtualProduct(req, res, baseCtx);
  if (!resolved)
    return;
  const { ctx, product } = resolved;
  const sessionId = req.headers["mcp-session-id"];
  let transport;
  if (sessionId && transports.has(sessionId)) {
    // Cross-product session probe is rejected: the session is bound to
    // whichever virtual server issued it (root /mcp sessions carry no
    // product tag and therefore never match here).
    if (sessionProduct.get(sessionId) !== product.id) {
      res.status(404).json({ error: "virtual server not found" });
      return;
    }
    transport = transports.get(sessionId);
  }
  else {
    transport = new StreamableHTTPServerTransport({
      sessionIdGenerator: () => randomUUID(),
    });
    transport.onclose = () => {
      for (const [sid, t] of transports) {
        if (t === transport) {
          transports.delete(sid);
          // Drop the idle-timer entry together with the other two maps;
          // otherwise the idle sweeper later reports this already-closed
          // session as "expired".
          sessionLastActive.delete(sid);
          sessionProduct.delete(sid);
          break;
        }
      }
      mcpActiveSessions.set(transports.size);
    };
    const { mcpServer: sessionMcpServer } = createMcpServer(ctx);
    await sessionMcpServer.connect(transport);
  }
  await transport.handleRequest(req, res, req.body);
  // The transport announces a (possibly new) session id via the response
  // header; register it on first sight and refresh its idle timestamp.
  const sid = res.getHeader("mcp-session-id");
  if (sid) {
    if (!transports.has(sid)) {
      transports.set(sid, transport);
      sessionProduct.set(sid, product.id);
    }
    sessionLastActive.set(sid, Date.now());
  }
  mcpActiveSessions.set(transports.size);
});
3089
// GET /mcp/v/:slug — hands the request to the transport of an existing
// session, but only when that session was issued under this product.
// Unknown sessions and sessions tagged with another product get the same
// 400 "No active session" answer.
app.get("/mcp/v/:slug", async (req, res) => {
  const baseCtx = await gateCtx(req, res);
  const resolved = baseCtx ? await resolveVirtualProduct(req, res, baseCtx) : null;
  if (!resolved) {
    // Gate or product resolution declined the request.
    return;
  }
  const sessionId = req.headers["mcp-session-id"];
  const transport = transports.get(sessionId);
  const belongsHere = Boolean(transport) && sessionProduct.get(sessionId) === resolved.product.id;
  if (belongsHere) {
    await transport.handleRequest(req, res);
    return;
  }
  res.status(400).json({ error: "No active session" });
});
3104
// DELETE /mcp/v/:slug — terminates a session issued under this product.
// The transport handles the request first, then the session is removed
// from all three bookkeeping maps. Sessions that are unknown or tagged
// with a different product are answered with 400 "No active session".
app.delete("/mcp/v/:slug", async (req, res) => {
  const baseCtx = await gateCtx(req, res);
  if (!baseCtx) {
    return;
  }
  const resolved = await resolveVirtualProduct(req, res, baseCtx);
  if (!resolved) {
    return;
  }
  const sessionId = req.headers["mcp-session-id"];
  const transport = transports.get(sessionId);
  if (!transport || sessionProduct.get(sessionId) !== resolved.product.id) {
    res.status(400).json({ error: "No active session" });
    return;
  }
  await transport.handleRequest(req, res);
  for (const table of [transports, sessionLastActive, sessionProduct]) {
    table.delete(sessionId);
  }
});
3123
/**
 * Pull a bearer token out of a WebSocket upgrade request.
 *
 * Sources are tried in this order and the first non-empty hit wins:
 *   1. `Authorization: Bearer <token>` header (scheme is case-insensitive)
 *   2. `?token=<token>` query parameter of `req.url`
 *   3. a `bearer.<token>` entry in `Sec-WebSocket-Protocol`
 * The query and subprotocol routes exist because browser WebSocket
 * clients cannot set an Authorization header on the handshake.
 *
 * @param req upgrade request (`headers`, optional `url`)
 * @returns {{token?: string, selectedSubprotocol?: string}} `{}` when no
 *   token was found; `selectedSubprotocol` is set only for source 3 and
 *   holds the full offered subprotocol string.
 */
function extractWsToken(req) {
  const { headers } = req;

  const fromAuthorization = () => {
    const header = headers["authorization"];
    if (typeof header !== "string") {
      return undefined;
    }
    const hit = /^Bearer\s+(.+)$/i.exec(header);
    return hit ? { token: hit[1] } : undefined;
  };

  const fromQuery = () => {
    let queryToken;
    try {
      // req.url is path-only, so parse it against a throwaway base.
      queryToken = new URL(req.url ?? "/", "http://localhost").searchParams.get("token");
    }
    catch {
      /* malformed URL */
      return undefined;
    }
    return queryToken ? { token: queryToken } : undefined;
  };

  const fromSubprotocol = () => {
    const offeredHeader = headers["sec-websocket-protocol"];
    if (typeof offeredHeader !== "string") {
      return undefined;
    }
    for (const raw of offeredHeader.split(",")) {
      const candidate = raw.trim();
      if (candidate.startsWith("bearer.")) {
        return { token: candidate.slice("bearer.".length), selectedSubprotocol: candidate };
      }
    }
    return undefined;
  };

  return fromAuthorization() ?? fromQuery() ?? fromSubprotocol() ?? {};
}
3153
/**
 * Authenticate a WebSocket upgrade request and build its request context.
 *
 * Resolves to one of two shapes:
 *   - `{ ctx, selectedSubprotocol }` when the connection may proceed
 *     (the default context is used when no credentials are configured);
 *   - `{ reject, reason }` when it may not: `reject` is 4401 for a
 *     missing/unknown token and 4429 when the identity's rate limit is
 *     exhausted.
 *
 * @param req upgrade request; the token is taken via extractWsToken
 */
async function gateWsCtx(req) {
  const { token, selectedSubprotocol } = extractWsToken(req);
  if (!credentialsConfigured()) {
    return { ctx: defaultContext(), selectedSubprotocol };
  }
  const unauthorized = (reason) => ({ reject: 4401, reason });
  if (!token) {
    return unauthorized("unauthorized: token required");
  }
  const cred = resolveToken(token, loadCredentials());
  if (!cred) {
    return unauthorized("unauthorized: invalid token");
  }
  const credTenant = cred.tenant || "default";
  // Limiter identity is "<tenant> <credName>".
  const { allowed } = toolRateLimiter.check(`${credTenant} ${cred.name}`);
  if (!allowed) {
    return { reject: 4429, reason: "rate limit exceeded for identity" };
  }
  // A credential bound to a product inherits that product's tools
  // allow-list; an absent or empty list leaves the context unrestricted.
  let allowedTools;
  if (cred.productId) {
    // Best-effort reload: a failure here must not reject the connection.
    await products.maybeReload().catch(() => undefined);
    const boundTools = products.get(cred.productId, credTenant)?.tools;
    if (boundTools && boundTools.length > 0) {
      allowedTools = boundTools.slice();
    }
  }
  const ctx = principalContext(cred.name, cred.allowedSources, {
    allowBypassRedaction: cred.bypassRedaction,
    tenant: cred.tenant,
    allowedTools,
  });
  return { ctx, selectedSubprotocol };
}
1922
3186
  const PORT = parseInt(process.env.PORT || "3000");
1923
- app.listen(PORT, () => {
3187
+ const httpServer = app.listen(PORT, () => {
1924
3188
  ready = true;
1925
3189
  console.log(`observability-mcp server running on port ${PORT}`);
1926
3190
  console.log(` MCP endpoint: http://localhost:${PORT}/mcp`);
3191
+ console.log(` MCP (WS): ws://localhost:${PORT}/mcp/ws`);
1927
3192
  console.log(` Web UI: http://localhost:${PORT}`);
1928
3193
  console.log(` Connectors: ${registry.getAll().map((c) => c.name).join(", ")}`);
1929
3194
  });
3195
// Mount the WebSocket MCP transport at /mcp/ws. Each accepted socket gets
// its own transport and its own McpServer built from that connection's
// context. The `ws` package is loaded with a dynamic import at this
// point rather than at module top level.
const { WebSocketServer } = await import("ws");
const wss = new WebSocketServer({ noServer: true });
httpServer.on("upgrade", async (req, socket, head) => {
  if (!req.url) {
    socket.destroy();
    return;
  }
  // Only /mcp/ws is a WebSocket endpoint; the query string is ignored
  // for routing (it may carry ?token=).
  const path = req.url.split("?")[0];
  if (path !== "/mcp/ws") {
    socket.destroy();
    return;
  }
  let auth;
  try {
    auth = await gateWsCtx(req);
  }
  catch (err) {
    // This listener is async: without the guard a throw from the gate
    // would surface as an unhandled rejection and leave the half-open
    // socket dangling. Fail closed instead.
    console.warn("WS /mcp/ws auth failed:", err);
    socket.destroy();
    return;
  }
  if ("reject" in auth) {
    // The refusal reason is delivered at the WebSocket level: complete
    // the upgrade just long enough to close with a close code + reason.
    // The gate's internal 4429 (rate limited) maps to 1013; any other
    // rejection maps to 1008.
    wss.handleUpgrade(req, socket, head, (ws) => {
      ws.close(auth.reject === 4429 ? 1013 : 1008, auth.reason);
    });
    return;
  }
  wss.handleUpgrade(req, socket, head, async (ws) => {
    try {
      const transport = new WebSocketServerTransport(ws);
      const { mcpServer: sessionMcpServer } = createMcpServer(auth.ctx);
      await sessionMcpServer.connect(transport);
    }
    catch (err) {
      console.warn("WS /mcp/ws session setup failed:", err);
      try {
        ws.close(1011, "server error");
      }
      catch {
        /* socket already gone */
      }
    }
  });
});
1930
3239
  }
1931
3240
  main().catch(console.error);