@thotischner/observability-mcp 1.3.4 → 1.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. package/dist/cli/index.d.ts +2 -0
  2. package/dist/cli/index.js +370 -0
  3. package/dist/cli/lib.d.ts +95 -0
  4. package/dist/cli/lib.js +185 -0
  5. package/dist/cli/lib.test.d.ts +1 -0
  6. package/dist/cli/lib.test.js +134 -0
  7. package/dist/config/loader.test.js +3 -3
  8. package/dist/connectors/hub.d.ts +48 -0
  9. package/dist/connectors/hub.js +51 -0
  10. package/dist/connectors/hub.test.d.ts +1 -0
  11. package/dist/connectors/hub.test.js +52 -0
  12. package/dist/connectors/install.d.ts +24 -0
  13. package/dist/connectors/install.js +100 -0
  14. package/dist/connectors/install.test.d.ts +1 -0
  15. package/dist/connectors/install.test.js +58 -0
  16. package/dist/connectors/loader.d.ts +48 -0
  17. package/dist/connectors/loader.js +222 -0
  18. package/dist/connectors/loki.js +14 -6
  19. package/dist/connectors/loki.test.js +27 -0
  20. package/dist/connectors/registry.d.ts +3 -0
  21. package/dist/connectors/registry.js +16 -16
  22. package/dist/connectors/tls.test.js +3 -3
  23. package/dist/connectors/verify.d.ts +19 -0
  24. package/dist/connectors/verify.js +87 -0
  25. package/dist/connectors/verify.test.d.ts +1 -0
  26. package/dist/connectors/verify.test.js +63 -0
  27. package/dist/index.js +389 -26
  28. package/dist/metrics/instrument-connector.d.ts +8 -0
  29. package/dist/metrics/instrument-connector.js +41 -0
  30. package/dist/metrics/self.d.ts +12 -0
  31. package/dist/metrics/self.js +61 -0
  32. package/dist/openapi.d.ts +2 -0
  33. package/dist/openapi.js +186 -0
  34. package/dist/sdk/index.d.ts +52 -0
  35. package/dist/sdk/index.js +13 -0
  36. package/dist/sdk/manifest-schema.d.ts +28 -0
  37. package/dist/sdk/manifest-schema.js +47 -0
  38. package/dist/sdk/manifest-schema.test.d.ts +1 -0
  39. package/dist/sdk/manifest-schema.test.js +50 -0
  40. package/dist/tools/get-service-health.js +3 -2
  41. package/dist/ui/index.html +687 -115
  42. package/dist/util/sanitize.d.ts +1 -0
  43. package/dist/util/sanitize.js +6 -0
  44. package/package.json +21 -8
package/dist/index.js CHANGED
@@ -3,9 +3,16 @@ import express from "express";
3
3
  import { randomUUID } from "node:crypto";
4
4
  import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
5
5
  import { StreamableHTTPServerTransport } from "@modelcontextprotocol/sdk/server/streamableHttp.js";
6
+ import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
6
7
  import { z } from "zod";
7
8
  import { loadConfig, saveConfig, DEFAULT_HEALTH_THRESHOLDS, DEFAULT_SETTINGS } from "./config/loader.js";
8
9
  import { ConnectorRegistry, getSupportedTypes } from "./connectors/registry.js";
10
+ import { getPluginLoader } from "./connectors/loader.js";
11
+ import { resolveHubCatalogUrl, describeInstalled, mergeCatalog, fetchHubCatalog, } from "./connectors/hub.js";
12
+ import { isValidConnectorName, installTarball } from "./connectors/install.js";
13
+ import { PluginVerificationError } from "./connectors/verify.js";
14
+ import { selfRegistry, withToolMetrics, apiRequests, mcpActiveSessions } from "./metrics/self.js";
15
+ import { buildOpenApiSpec } from "./openapi.js";
9
16
  import { listSourcesHandler } from "./tools/list-sources.js";
10
17
  import { listServicesHandler } from "./tools/list-services.js";
11
18
  import { queryMetricsHandler } from "./tools/query-metrics.js";
@@ -14,7 +21,20 @@ import { getServiceHealthHandler, setHealthThresholds } from "./tools/get-servic
14
21
  import { detectAnomaliesHandler } from "./tools/detect-anomalies.js";
15
22
  import { fileURLToPath } from "node:url";
16
23
  import { dirname, join } from "node:path";
24
+ import { readFileSync, writeFileSync, mkdtempSync, rmSync } from "node:fs";
25
+ import { tmpdir } from "node:os";
17
26
  const __dirname = dirname(fileURLToPath(import.meta.url));
27
+ // Read once at startup; the file is shipped inside the image so this
28
+ // is the source of truth even when the user runs from `npx`.
29
+ const SERVER_VERSION = (() => {
30
+ try {
31
+ const pkg = JSON.parse(readFileSync(join(__dirname, "..", "package.json"), "utf8"));
32
+ return pkg.version ?? "unknown";
33
+ }
34
+ catch {
35
+ return "unknown";
36
+ }
37
+ })();
18
38
  function applyConfigToRuntime(config, registry) {
19
39
  setHealthThresholds(config.healthThresholds);
20
40
  }
@@ -52,8 +72,51 @@ function validateSourceUrl(url) {
52
72
  return `Invalid URL: "${url}"`;
53
73
  }
54
74
  }
75
+ // Hard cap for a downloaded/uploaded connector tarball (defence against
76
+ // a hostile or accidental huge artifact OOM-ing the server).
77
+ const MAX_CONNECTOR_TGZ_BYTES = 64 * 1024 * 1024;
78
+ // Dependency-free fixed-window per-client rate limiter for the runtime
79
+ // connector install/upload routes (expensive: fetch + extract + verify +
80
+ // fs write + loader rescan). Bounds abuse even with ENABLE_UI_INSTALL on.
81
+ const installRateState = new Map();
82
+ function installRateLimit(req, res, next) {
83
+ const WINDOW_MS = 60_000;
84
+ const MAX = 5;
85
+ const now = Date.now();
86
+ if (installRateState.size > 5000) {
87
+ for (const [k, v] of installRateState)
88
+ if (v.resetAt < now)
89
+ installRateState.delete(k);
90
+ }
91
+ const key = req.ip || "unknown";
92
+ let s = installRateState.get(key);
93
+ if (!s || s.resetAt < now) {
94
+ s = { count: 0, resetAt: now + WINDOW_MS };
95
+ installRateState.set(key, s);
96
+ }
97
+ s.count++;
98
+ if (s.count > MAX) {
99
+ res.setHeader("Retry-After", String(Math.ceil((s.resetAt - now) / 1000)));
100
+ res.status(429).json({
101
+ error: "rate limit exceeded — too many connector install attempts, slow down",
102
+ });
103
+ return;
104
+ }
105
+ next();
106
+ }
55
107
  async function main() {
108
+ // Stdio transport mode (MCP catalogs / desktop clients / Glama's
109
+ // mcp-proxy spawn a stdio MCP server and read JSON-RPC from stdout).
110
+ // The protocol stream MUST be the only thing on stdout, so route all
111
+ // console.log to stderr before anything logs.
112
+ const STDIO = process.argv.includes("--stdio") ||
113
+ process.env.MCP_TRANSPORT === "stdio" ||
114
+ !!process.env.MCP_STDIO;
115
+ if (STDIO) {
116
+ console.log = (...a) => console.error(...a);
117
+ }
56
118
  let config = loadConfig();
119
+ await getPluginLoader().load();
57
120
  const registry = new ConnectorRegistry();
58
121
  await registry.initialize(config);
59
122
  applyConfigToRuntime(config, registry);
@@ -64,49 +127,171 @@ async function main() {
64
127
  function createMcpServer() {
65
128
  const mcpServer = new McpServer({
66
129
  name: "observability-mcp",
67
- version: "1.3.0",
130
+ version: SERVER_VERSION,
68
131
  });
69
132
  // --- Register tools with Zod schemas ---
70
- mcpServer.tool("list_sources", "List all configured observability backends and their connection status. Use this to discover what data sources are available.", {}, async () => listSourcesHandler(registry));
71
- mcpServer.tool("list_services", "List all monitored services discovered across all connected backends. Returns service names, their data sources, and signal types (metrics/logs).", { filter: z.string().optional().describe("Optional filter to match service names") }, async (args) => listServicesHandler(registry, args));
133
+ mcpServer.tool("list_sources", [
134
+ "List the configured observability backends (Prometheus, Loki, and any connector) and whether each is currently reachable.",
135
+ "When to use: call this first to learn which source names exist and are healthy before passing `source` to other tools, or to debug why a query returns no data.",
136
+ "Behavior: read-only, no side effects. Returns one entry per source with its name, type, configured URL, signal types (metrics/logs), and a live up/down status. Never throws for an unreachable backend — the backend is reported as down instead.",
137
+ "Related: use `list_services` to see what is monitored within these sources.",
138
+ ].join(" "), {}, async () => withToolMetrics("list_sources", () => listSourcesHandler(registry)));
139
+ mcpServer.tool("list_services", [
140
+ "Discover the service names that can be queried, aggregated across every connected backend.",
141
+ "When to use: call this before `query_metrics`, `query_logs`, or `get_service_health` to obtain the exact, case-sensitive service name those tools require.",
142
+ "Behavior: read-only, no side effects. Returns one entry per service with the service name, the source(s) it was discovered in, and which signals are available for it (metrics, logs, or both).",
143
+ "Related: `list_sources` for backend health; `get_service_health` for a per-service overview.",
144
+ ].join(" "), {
145
+ filter: z
146
+ .string()
147
+ .optional()
148
+ .describe("Optional case-insensitive substring to narrow the result to matching service names (e.g. 'payment'). Omit to list every discovered service."),
149
+ }, async (args) => withToolMetrics("list_services", () => listServicesHandler(registry, args)));
72
150
  const metricsList = getAvailableMetricNames(registry);
73
151
  const metricNames = registry.getBySignal("metrics").flatMap(c => c.getMetrics().map(m => m.name));
74
152
  const uniqueNames = [...new Set(metricNames)];
75
- mcpServer.tool("query_metrics", `Query a specific metric for a service over a given timeframe. Returns time-series data with pre-computed summary statistics (current, average, min, max, trend). Available metrics: ${metricsList}`, {
76
- service: z.string().describe("Service name (e.g. 'api-gateway', 'payment-service')"),
77
- metric: z.string().describe(`Metric name. Available: ${uniqueNames.join(", ")}`),
78
- duration: z.string().optional().describe("Time range (e.g. '5m', '1h', '24h'). Default: '5m'"),
79
- source: z.string().optional().describe("Specific source name. If omitted, queries all metrics backends."),
80
- groupBy: z.string().optional().describe("Label to break the result down by, e.g. 'instance', 'pod', 'node'. Returns one series per distinct value in 'groups'."),
81
- }, async (args) => queryMetricsHandler(registry, args));
82
- mcpServer.tool("query_logs", "Query logs for a service over a given timeframe. Returns log entries with a summary including error/warning counts and top error patterns.", {
83
- service: z.string().describe("Service name (e.g. 'payment-service')"),
84
- query: z.string().optional().describe("Optional search query to filter log messages (regex supported)"),
85
- duration: z.string().optional().describe("Time range (e.g. '5m', '1h', '24h'). Default: '5m'"),
86
- level: z.string().optional().describe("Filter by log level: 'error', 'warn', 'info', 'debug'"),
87
- limit: z.number().optional().describe("Maximum log entries to return. Default: 100"),
88
- }, async (args) => queryLogsHandler(registry, args));
89
- mcpServer.tool("get_service_health", "Get an aggregated health overview for a service combining metrics AND logs. Returns health score (0-100), status (healthy/degraded/critical), key metrics, log error summary, anomalies, and cross-signal correlations.", {
90
- service: z.string().describe("Service name to check health for"),
91
- }, async (args) => getServiceHealthHandler(registry, args));
92
- mcpServer.tool("detect_anomalies", "Scan for anomalies across all monitored services (or a specific one). Uses z-score analysis on metrics, checks log error spikes, and correlates signals. Returns anomalies with severity ratings.", {
93
- service: z.string().optional().describe("Specific service to scan. If omitted, scans all."),
94
- duration: z.string().optional().describe("Time range to analyze (e.g. '5m', '15m', '1h'). Default: '10m'"),
95
- sensitivity: z.enum(["low", "medium", "high"]).optional().describe("Detection sensitivity: low (>3σ), medium (>2σ), high (>1.5σ). Default: 'medium'"),
96
- }, async (args) => detectAnomaliesHandler(registry, args));
153
+ mcpServer.tool("query_metrics", [
154
+ "Fetch the raw time-series for ONE metric of ONE service over a look-back window, returned together with pre-computed summary statistics.",
155
+ "When to use: when you need the actual numeric values or the trend of a known metric. For a 'is this service OK?' verdict use `get_service_health`; to find which services are misbehaving use `detect_anomalies`.",
156
+ "Prerequisites: get the exact service name from `list_services` and choose a metric from the list at the end of this description.",
157
+ "Behavior: read-only, no side effects. Returns an ordered array of {timestamp, value} points plus a summary {current, average, min, max, trend}. With `groupBy` set, returns one labelled series per distinct label value under `groups` instead of a single aggregated series. Units depend on the metric (e.g. CPU as %, latency as ms, rates as per-second). An unknown service/metric or an unreachable backend yields a structured explanatory error, never an exception.",
158
+ `Available metrics: ${metricsList}`,
159
+ ].join(" "), {
160
+ service: z
161
+ .string()
162
+ .describe("Required. Exact, case-sensitive service name exactly as returned by `list_services` (e.g. 'api-gateway', 'payment-service')."),
163
+ metric: z
164
+ .string()
165
+ .describe(`Required. Exact metric name to query. One of: ${uniqueNames.join(", ")}.`),
166
+ duration: z
167
+ .string()
168
+ .optional()
169
+ .describe("Optional. Look-back window ending at 'now', written as <number><unit> with unit s|m|h|d (e.g. '5m', '90m', '1h', '24h'). Default: '5m'."),
170
+ source: z
171
+ .string()
172
+ .optional()
173
+ .describe("Optional. Restrict the query to a single backend by its source name (see `list_sources`). Default: query and merge all metrics backends."),
174
+ groupBy: z
175
+ .string()
176
+ .optional()
177
+ .describe("Optional. Metric label to break the result down by, e.g. 'instance', 'pod', 'node'. When set, the response contains one series per distinct label value under `groups`. Default: a single aggregated series."),
178
+ }, async (args) => withToolMetrics("query_metrics", () => queryMetricsHandler(registry, args)));
179
+ mcpServer.tool("query_logs", [
180
+ "Fetch recent log entries for ONE service over a look-back window, with a pre-computed summary (error/warning counts and the most frequent error patterns).",
181
+ "When to use: to inspect what a service actually logged, or to investigate an error spike surfaced by `detect_anomalies` / `get_service_health`. For numeric metrics use `query_metrics` instead.",
182
+ "Prerequisites: get the exact service name from `list_services` (the service must expose a logs signal).",
183
+ "Behavior: read-only, no side effects. Returns the matching log entries (newest first, capped by `limit`) plus a summary with total/error/warn counts and top recurring error patterns. No matches yields an empty result with a zeroed summary; an unreachable backend yields a structured explanatory error, never an exception.",
184
+ ].join(" "), {
185
+ service: z
186
+ .string()
187
+ .describe("Required. Exact, case-sensitive service name exactly as returned by `list_services` (e.g. 'payment-service')."),
188
+ query: z
189
+ .string()
190
+ .optional()
191
+ .describe("Optional. Filter expression matched against the log message; regular expressions are supported. Omit to return all entries in the window."),
192
+ duration: z
193
+ .string()
194
+ .optional()
195
+ .describe("Optional. Look-back window ending at 'now', written as <number><unit> with unit s|m|h|d (e.g. '5m', '1h', '24h'). Default: '5m'."),
196
+ level: z
197
+ .enum(["error", "warn", "info", "debug"])
198
+ .optional()
199
+ .describe("Optional. Return only entries at this severity. Default: all levels."),
200
+ limit: z
201
+ .number()
202
+ .int()
203
+ .positive()
204
+ .optional()
205
+ .describe("Optional. Maximum number of log entries to return (most recent first). Default: 100."),
206
+ }, async (args) => withToolMetrics("query_logs", () => queryLogsHandler(registry, args)));
207
+ mcpServer.tool("get_service_health", [
208
+ "Produce a single aggregated health verdict for ONE service by combining its metrics and logs.",
209
+ "When to use: the fastest way to answer 'is this service healthy right now and why?'. Use `query_metrics`/`query_logs` to drill into the underlying numbers, or `detect_anomalies` to scan many services at once.",
210
+ "Prerequisites: get the exact service name from `list_services`.",
211
+ "Behavior: read-only, no side effects. Returns a weighted health score (0–100), a status of healthy | degraded | critical, the key contributing metrics, a log error summary, detected anomalies, and cross-signal correlations explaining the score. A service with no data yields an explanatory result rather than an exception.",
212
+ ].join(" "), {
213
+ service: z
214
+ .string()
215
+ .describe("Required. Exact, case-sensitive service name exactly as returned by `list_services` (e.g. 'payment-service')."),
216
+ }, async (args) => withToolMetrics("get_service_health", () => getServiceHealthHandler(registry, args)));
217
+ mcpServer.tool("detect_anomalies", [
218
+ "Scan one or all monitored services for abnormal behavior and return the findings ranked by severity.",
219
+ "When to use: the entry point for 'is anything wrong anywhere?' triage. Once a service is flagged, follow up with `get_service_health` for the verdict or `query_metrics`/`query_logs` for the raw evidence.",
220
+ "Behavior: read-only, no side effects. Applies z-score analysis to metrics, detects log error-rate spikes, and correlates the two. Returns a list of anomalies, each with the affected service, metric/signal, severity, the deviation (e.g. σ and % change), and a short explanation. No anomalies yields an empty list, not an error.",
221
+ "Related: `get_service_health` (single-service verdict), `query_metrics` (raw series behind a flagged metric).",
222
+ ].join(" "), {
223
+ service: z
224
+ .string()
225
+ .optional()
226
+ .describe("Optional. Restrict the scan to one service (exact, case-sensitive name from `list_services`). Default: scan every monitored service."),
227
+ duration: z
228
+ .string()
229
+ .optional()
230
+ .describe("Optional. Look-back window analyzed for anomalies, written as <number><unit> with unit s|m|h|d (e.g. '5m', '15m', '1h'). Default: '10m'."),
231
+ sensitivity: z
232
+ .enum(["low", "medium", "high"])
233
+ .optional()
234
+ .describe("Optional. Detection threshold: 'low' flags only strong deviations (>3σ), 'medium' is balanced (>2σ), 'high' is most sensitive and noisier (>1.5σ). Default: 'medium'."),
235
+ }, async (args) => withToolMetrics("detect_anomalies", () => detectAnomaliesHandler(registry, args)));
97
236
  return mcpServer;
98
237
  }
99
238
  // --- HTTP server ---
100
239
  const app = express();
101
240
  app.use(express.json({ limit: "1mb" }));
102
241
  // Security headers
103
- app.use((_req, res, next) => {
242
+ app.use((req, res, next) => {
104
243
  res.setHeader("X-Content-Type-Options", "nosniff");
105
244
  res.setHeader("X-Frame-Options", "DENY");
106
245
  res.setHeader("X-XSS-Protection", "1; mode=block");
107
246
  res.setHeader("Referrer-Policy", "strict-origin-when-cross-origin");
247
+ // Dynamic API responses must never be served from the browser/proxy
248
+ // cache: after a mutation (e.g. installing a connector) the UI
249
+ // re-fetches these GETs immediately, and a heuristically-cached stale
250
+ // body would make the change "not show up until a page reload".
251
+ if (req.path.startsWith("/api/")) {
252
+ res.setHeader("Cache-Control", "no-store");
253
+ }
254
+ next();
255
+ });
256
+ // API request counter — emitted at response time so the `status` label
257
+ // is the real outcome. /metrics itself is excluded to avoid self-scrape
258
+ // amplification.
259
+ app.use((req, res, next) => {
260
+ if (req.path === "/metrics")
261
+ return next();
262
+ res.on("finish", () => {
263
+ // Group dynamic segments by the registered Express route when we
264
+ // have one, otherwise fall back to the literal path. This keeps
265
+ // label cardinality bounded.
266
+ const route = req.route?.path ?? req.path;
267
+ apiRequests.inc({ route, method: req.method, status: String(res.statusCode) });
268
+ });
108
269
  next();
109
270
  });
271
+ // k8s-convention liveness/readiness probes at the root of the path
272
+ // tree, no /api prefix. Helm chart points its probes here. Cheap
273
+ // enough to skip the request-counter middleware.
274
+ let ready = false;
275
+ app.get("/healthz", (_req, res) => res.type("text").send("ok"));
276
+ app.get("/readyz", (_req, res) => {
277
+ if (ready)
278
+ return res.type("text").send("ok");
279
+ return res.status(503).type("text").send("starting");
280
+ });
281
+ // OpenAPI 3.1 document for the /api/* surface.
282
+ app.get("/api/openapi.json", (_req, res) => {
283
+ res.json(buildOpenApiSpec(SERVER_VERSION));
284
+ });
285
+ // Self-monitoring — Prometheus scrape endpoint.
286
+ // Disabled with METRICS_ENABLED=false for environments that prefer
287
+ // sidecar agents. The Helm chart's ServiceMonitor template targets
288
+ // this endpoint when enabled.
289
+ if (process.env.METRICS_ENABLED !== "false") {
290
+ app.get("/metrics", async (_req, res) => {
291
+ res.set("Content-Type", selfRegistry.contentType);
292
+ res.end(await selfRegistry.metrics());
293
+ });
294
+ }
110
295
  // Serve Web UI
111
296
  app.use(express.static(join(__dirname, "ui")));
112
297
  // --- API endpoints for Web UI ---
@@ -135,6 +320,170 @@ async function main() {
135
320
  app.get("/api/source-types", (_req, res) => {
136
321
  res.json(getSupportedTypes());
137
322
  });
323
+ // Server info — version, loaded plugins, MCP protocol version, build metadata.
324
+ // Used by the Web UI footer and by operators to confirm what's deployed.
325
+ app.get("/api/info", async (_req, res) => {
326
+ const loader = getPluginLoader();
327
+ res.json({
328
+ name: "observability-mcp",
329
+ version: SERVER_VERSION,
330
+ mcpProtocolVersion: "2025-03-26",
331
+ build: {
332
+ commit: process.env.GIT_COMMIT || null,
333
+ date: process.env.BUILD_DATE || null,
334
+ },
335
+ runtime: {
336
+ node: process.version,
337
+ platform: process.platform,
338
+ arch: process.arch,
339
+ },
340
+ plugins: loader.list().map((p) => ({
341
+ name: p.name,
342
+ source: p.source,
343
+ version: p.manifest?.version ?? null,
344
+ signalTypes: p.manifest?.signalTypes ?? null,
345
+ })),
346
+ });
347
+ });
348
+ // Connectors currently loaded into this server (builtin + filesystem
349
+ // plugins), with manifest metadata — drives the UI "Connectors" page.
350
+ app.get("/api/connectors", (_req, res) => {
351
+ res.json({ connectors: describeInstalled(getPluginLoader().list()) });
352
+ });
353
+ // Server-side proxy of the connector hub catalog (so the browser
354
+ // needn't reach the hub directly — works behind a proxy / against a
355
+ // mirror via HUB_CATALOG_URL). Installed status merged in.
356
+ app.get("/api/hub/catalog", async (_req, res) => {
357
+ const url = resolveHubCatalogUrl();
358
+ try {
359
+ const catalog = await fetchHubCatalog(url);
360
+ res.json({
361
+ url,
362
+ connectors: mergeCatalog(catalog, describeInstalled(getPluginLoader().list())),
363
+ });
364
+ }
365
+ catch (e) {
366
+ res.status(502).json({ url, error: e instanceof Error ? e.message : String(e), connectors: [] });
367
+ }
368
+ });
369
+ // Install a connector from the hub into the running server.
370
+ //
371
+ // Runtime code-load is powerful, so this is doubly gated:
372
+ // 1. ENABLE_UI_INSTALL=true must be set (default OFF).
373
+ // 2. PLUGIN_TRUST_ROOT must be configured — install is ALWAYS
374
+ // fail-closed verified (no insecure bypass over HTTP).
375
+ // Only catalog tarballUrls are fetched (no arbitrary URL in the body)
376
+ // to avoid SSRF. The connector persists to PLUGINS_DIR (back it with
377
+ // a PVC on k8s so it survives restarts).
378
+ app.post("/api/connectors/install", installRateLimit, async (req, res) => {
379
+ if (process.env.ENABLE_UI_INSTALL !== "true") {
380
+ return res.status(403).json({
381
+ error: "UI install is disabled. Set ENABLE_UI_INSTALL=true and PLUGIN_TRUST_ROOT to enable it.",
382
+ });
383
+ }
384
+ const trustRootPath = process.env.PLUGIN_TRUST_ROOT;
385
+ if (!trustRootPath) {
386
+ return res.status(412).json({
387
+ error: "PLUGIN_TRUST_ROOT not configured — refusing to install unverified code.",
388
+ });
389
+ }
390
+ const name = (req.body || {}).name;
391
+ const version = (req.body || {}).version;
392
+ if (!isValidConnectorName(name)) {
393
+ return res.status(400).json({ error: "invalid connector name" });
394
+ }
395
+ const pluginsDir = process.env.PLUGINS_DIR ?? "/app/plugins";
396
+ let work = null;
397
+ try {
398
+ const catalog = await fetchHubCatalog(resolveHubCatalogUrl());
399
+ const entry = catalog.connectors.find((c) => c.name === name);
400
+ if (!entry)
401
+ return res.status(404).json({ error: `'${name}' is not in the catalog` });
402
+ if (entry.builtin)
403
+ return res.status(409).json({ error: `'${name}' is builtin — no install needed` });
404
+ const v = version
405
+ ? entry.versions.find((x) => x.version === version)
406
+ : entry.versions.find((x) => x.version === (entry.latest ?? entry.versions[0]?.version)) ?? entry.versions[0];
407
+ if (!v || !v.tarballUrl) {
408
+ return res.status(422).json({ error: `no tarball for ${name}@${version ?? "latest"}` });
409
+ }
410
+ const resp = await fetch(v.tarballUrl);
411
+ if (!resp.ok)
412
+ return res.status(502).json({ error: `tarball download HTTP ${resp.status}` });
413
+ const declared = Number(resp.headers.get("content-length") || 0);
414
+ if (declared > MAX_CONNECTOR_TGZ_BYTES) {
415
+ return res.status(413).json({ error: `tarball too large (${declared} bytes)` });
416
+ }
417
+ const buf = Buffer.from(await resp.arrayBuffer());
418
+ if (buf.length > MAX_CONNECTOR_TGZ_BYTES) {
419
+ return res.status(413).json({ error: `tarball too large (${buf.length} bytes)` });
420
+ }
421
+ work = mkdtempSync(join(tmpdir(), "obsmcp-dl-"));
422
+ const tgz = join(work, "c.tgz");
423
+ writeFileSync(tgz, buf);
424
+ const result = installTarball({ tgzPath: tgz, pluginsDir, trustRootPath, expectedName: name });
425
+ await getPluginLoader().load(); // re-scan so /api/connectors reflects it
426
+ res.json({
427
+ ok: true,
428
+ ...result,
429
+ note: "installed & persisted to PLUGINS_DIR. Add a source of this type to use it; a server restart is recommended for full availability in existing MCP sessions.",
430
+ });
431
+ }
432
+ catch (e) {
433
+ const msg = e instanceof Error ? e.message : String(e);
434
+ const code = e instanceof PluginVerificationError ? 400 : 500;
435
+ res.status(code).json({ error: `install failed (fail-closed): ${msg}` });
436
+ }
437
+ finally {
438
+ if (work)
439
+ rmSync(work, { recursive: true, force: true });
440
+ }
441
+ });
442
+ // Upload a connector bundle (.tgz) and install it into the running
443
+ // server. Same fail-closed guardrails as /install: the upload is
444
+ // ALWAYS verified against PLUGIN_TRUST_ROOT (signature + integrity),
445
+ // so an unsigned/tampered bundle is rejected. Body is the raw tarball
446
+ // bytes (application/octet-stream). Persists to PLUGINS_DIR.
447
+ app.post("/api/connectors/upload", installRateLimit, express.raw({ type: "application/octet-stream", limit: "50mb" }), async (req, res) => {
448
+ if (process.env.ENABLE_UI_INSTALL !== "true") {
449
+ return res.status(403).json({
450
+ error: "UI install is disabled. Set ENABLE_UI_INSTALL=true and PLUGIN_TRUST_ROOT to enable it.",
451
+ });
452
+ }
453
+ const trustRootPath = process.env.PLUGIN_TRUST_ROOT;
454
+ if (!trustRootPath) {
455
+ return res.status(412).json({
456
+ error: "PLUGIN_TRUST_ROOT not configured — refusing to install unverified code.",
457
+ });
458
+ }
459
+ const body = req.body;
460
+ if (!Buffer.isBuffer(body) || body.length === 0) {
461
+ return res.status(400).json({ error: "empty body — POST the connector .tgz as application/octet-stream" });
462
+ }
463
+ const pluginsDir = process.env.PLUGINS_DIR ?? "/app/plugins";
464
+ let work = null;
465
+ try {
466
+ work = mkdtempSync(join(tmpdir(), "obsmcp-up-"));
467
+ const tgz = join(work, "c.tgz");
468
+ writeFileSync(tgz, body);
469
+ const result = installTarball({ tgzPath: tgz, pluginsDir, trustRootPath });
470
+ await getPluginLoader().load(); // re-scan so /api/connectors reflects it
471
+ res.json({
472
+ ok: true,
473
+ ...result,
474
+ note: "uploaded, verified & persisted to PLUGINS_DIR. Add a source of this type to use it; a server restart is recommended for full availability in existing MCP sessions.",
475
+ });
476
+ }
477
+ catch (e) {
478
+ const msg = e instanceof Error ? e.message : String(e);
479
+ const code = e instanceof PluginVerificationError ? 400 : 500;
480
+ res.status(code).json({ error: `upload install failed (fail-closed): ${msg}` });
481
+ }
482
+ finally {
483
+ if (work)
484
+ rmSync(work, { recursive: true, force: true });
485
+ }
486
+ });
138
487
  // Add a new source
139
488
  app.post("/api/sources", async (req, res) => {
140
489
  const { name, type, url, enabled, auth, tls } = req.body;
@@ -352,6 +701,16 @@ async function main() {
352
701
  saveConfig(config);
353
702
  res.json({ ok: true });
354
703
  });
704
+ // Stdio transport: one server over stdin/stdout, no HTTP listener.
705
+ if (STDIO) {
706
+ const server = createMcpServer();
707
+ await server.connect(new StdioServerTransport());
708
+ console.error(`observability-mcp running on stdio transport · connectors: ${registry
709
+ .getAll()
710
+ .map((c) => c.name)
711
+ .join(", ")}`);
712
+ return;
713
+ }
355
714
  // MCP Streamable HTTP transport — stateful sessions
356
715
  const transports = new Map();
357
716
  const sessionLastActive = new Map();
@@ -366,6 +725,7 @@ async function main() {
366
725
  console.log(`Session ${sid} expired (idle)`);
367
726
  }
368
727
  }
728
+ mcpActiveSessions.set(transports.size);
369
729
  }, 5 * 60 * 1000);
370
730
  app.post("/mcp", async (req, res) => {
371
731
  const sessionId = req.headers["mcp-session-id"];
@@ -385,6 +745,7 @@ async function main() {
385
745
  break;
386
746
  }
387
747
  }
748
+ mcpActiveSessions.set(transports.size);
388
749
  };
389
750
  const sessionMcpServer = createMcpServer();
390
751
  await sessionMcpServer.connect(transport);
@@ -397,6 +758,7 @@ async function main() {
397
758
  transports.set(sid, transport);
398
759
  sessionLastActive.set(sid, Date.now());
399
760
  }
761
+ mcpActiveSessions.set(transports.size);
400
762
  });
401
763
  app.get("/mcp", async (req, res) => {
402
764
  const sessionId = req.headers["mcp-session-id"];
@@ -421,6 +783,7 @@ async function main() {
421
783
  });
422
784
  const PORT = parseInt(process.env.PORT || "3000");
423
785
  app.listen(PORT, () => {
786
+ ready = true;
424
787
  console.log(`observability-mcp server running on port ${PORT}`);
425
788
  console.log(` MCP endpoint: http://localhost:${PORT}/mcp`);
426
789
  console.log(` Web UI: http://localhost:${PORT}`);
package/dist/metrics/instrument-connector.d.ts ADDED
@@ -0,0 +1,8 @@
1
+ import type { ObservabilityConnector } from "../connectors/interface.js";
2
+ /**
3
+ * Decorate a connector so every observable backend call increments
4
+ * obsmcp_connector_calls_total{source,type,operation,outcome}. The
5
+ * `source` label is filled in on first `connect()` once the config
6
+ * is known. Keeps connector implementations free of metrics code.
7
+ */
8
+ export declare function instrumentConnector<T extends ObservabilityConnector>(c: T): T;
@@ -0,0 +1,41 @@
1
+ import { connectorCalls } from "./self.js";
2
+ const OPS = [
3
+ "healthCheck",
4
+ "listServices",
5
+ "queryMetrics",
6
+ "queryLogs",
7
+ "listAvailableMetrics",
8
+ ];
9
+ /**
10
+ * Decorate a connector so every observable backend call increments
11
+ * obsmcp_connector_calls_total{source,type,operation,outcome}. The
12
+ * `source` label is filled in on first `connect()` once the config
13
+ * is known. Keeps connector implementations free of metrics code.
14
+ */
15
+ export function instrumentConnector(c) {
16
+ let source = "";
17
+ const type = c.type;
18
+ const wrappedConnect = c.connect.bind(c);
19
+ c.connect = async (config) => {
20
+ source = config.name;
21
+ return wrappedConnect(config);
22
+ };
23
+ for (const op of OPS) {
24
+ const fn = c[op];
25
+ if (typeof fn !== "function")
26
+ continue;
27
+ const bound = fn.bind(c);
28
+ c[op] = async (...args) => {
29
+ try {
30
+ const r = await bound(...args);
31
+ connectorCalls.inc({ source: source || "<pending>", type, operation: op, outcome: "ok" });
32
+ return r;
33
+ }
34
+ catch (err) {
35
+ connectorCalls.inc({ source: source || "<pending>", type, operation: op, outcome: "error" });
36
+ throw err;
37
+ }
38
+ };
39
+ }
40
+ return c;
41
+ }
@@ -0,0 +1,12 @@
1
+ import { Registry, Counter, Histogram, Gauge } from "prom-client";
2
+ export declare const selfRegistry: Registry<"text/plain; version=0.0.4; charset=utf-8">;
3
+ export declare const mcpToolCalls: Counter<"tool" | "outcome">;
4
+ export declare const mcpToolLatency: Histogram<"tool">;
5
+ export declare const connectorCalls: Counter<"type" | "source" | "outcome" | "operation">;
6
+ export declare const apiRequests: Counter<"status" | "route" | "method">;
7
+ export declare const mcpActiveSessions: Gauge<string>;
8
+ /**
9
+ * Wrap a (potentially async) tool handler to record call count + latency.
10
+ * Outcome is "ok" or "error" — never throws on its own.
11
+ */
12
+ export declare function withToolMetrics<T>(tool: string, fn: () => Promise<T>): Promise<T>;
@@ -0,0 +1,61 @@
1
+ // Server self-metrics exposed at /metrics for Prometheus scraping.
2
+ // Pairs with the Helm chart's ServiceMonitor template.
3
+ //
4
+ // Default Node metrics (CPU, memory, event loop lag, heap) come from
5
+ // prom-client's collectDefaultMetrics. On top of that we ship five
6
+ // product-specific metrics (three counters, one histogram, one gauge)
7
+ // that operators actually need to graph: MCP tool calls and latency,
8
+ // connector backend calls, /api/* requests, active session count.
9
+ import { Registry, collectDefaultMetrics, Counter, Histogram, Gauge, } from "prom-client";
10
// Dedicated registry for the server's own metrics; every metric defined in
// this module is registered here via its `registers` option.
+ export const selfRegistry = new Registry();
11
// Attach a constant `service` label to everything served from this registry.
+ selfRegistry.setDefaultLabels({ service: "observability-mcp" });
12
// Default prom-client process metrics, registered on selfRegistry with the
// "obsmcp_" name prefix.
+ collectDefaultMetrics({ register: selfRegistry, prefix: "obsmcp_" });
13
// Counter of MCP tool invocations, labelled by tool name and outcome.
+ export const mcpToolCalls = new Counter({
14
+ name: "obsmcp_mcp_tool_calls_total",
15
+ help: "MCP tool invocations by tool and outcome.",
16
+ labelNames: ["tool", "outcome"],
17
+ registers: [selfRegistry],
18
+ });
19
// Histogram of MCP tool invocation latency, labelled by tool name only.
// The metric name indicates the unit is seconds; bucket upper bounds span
// 0.005 to 10.
+ export const mcpToolLatency = new Histogram({
20
+ name: "obsmcp_mcp_tool_duration_seconds",
21
+ help: "MCP tool invocation latency.",
22
+ labelNames: ["tool"],
23
+ buckets: [0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10],
24
+ registers: [selfRegistry],
25
+ });
26
// Counter of connector calls, labelled by source, connector type, operation
// and outcome.
+ export const connectorCalls = new Counter({
27
+ name: "obsmcp_connector_calls_total",
28
+ help: "Calls to a configured connector, by source and outcome.",
29
+ labelNames: ["source", "type", "operation", "outcome"],
30
+ registers: [selfRegistry],
31
+ });
32
// Counter of Web UI / API requests, labelled by route, HTTP method and status.
+ export const apiRequests = new Counter({
33
+ name: "obsmcp_api_requests_total",
34
+ help: "Web UI / API request count, by route and status.",
35
+ labelNames: ["route", "method", "status"],
36
+ registers: [selfRegistry],
37
+ });
38
// Gauge of active MCP Streamable HTTP sessions; it declares no labels.
+ export const mcpActiveSessions = new Gauge({
39
+ name: "obsmcp_mcp_active_sessions",
40
+ help: "Active MCP Streamable HTTP sessions.",
41
+ registers: [selfRegistry],
42
+ });
43
/**
 * Run a tool handler while recording its metrics.
 *
 * One latency observation is made on mcpToolLatency and one increment on
 * mcpToolCalls per invocation, with outcome "ok" when the handler resolves
 * and "error" when it throws or rejects. The handler's result or error is
 * passed through to the caller unchanged.
 *
 * @param tool Tool name used as the `tool` label on both metrics.
 * @param fn Handler to execute; it is awaited, so it may be sync or async.
 * @returns Whatever `fn` resolves to.
 */
export async function withToolMetrics(tool, fn) {
    const stopTimer = mcpToolLatency.startTimer({ tool });
    // Assume failure until the handler has actually resolved.
    let outcome = "error";
    try {
        const result = await fn();
        outcome = "ok";
        return result;
    }
    finally {
        // Count first, then stop the timer, as in the original ordering.
        mcpToolCalls.inc({ tool, outcome });
        stopTimer();
    }
}
@@ -0,0 +1,2 @@
1
+ import type { OpenAPIV3_1 } from "openapi-types";
2
+ export declare function buildOpenApiSpec(version: string): OpenAPIV3_1.Document;