@thotischner/observability-mcp 1.4.1 → 1.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/analysis/anomaly.d.ts +89 -0
- package/dist/analysis/anomaly.js +235 -0
- package/dist/analysis/anomaly.test.js +149 -1
- package/dist/analysis/backtest.d.ts +31 -0
- package/dist/analysis/backtest.js +206 -0
- package/dist/analysis/backtest.test.d.ts +1 -0
- package/dist/analysis/backtest.test.js +34 -0
- package/dist/analysis/correlator.d.ts +35 -0
- package/dist/analysis/correlator.js +95 -0
- package/dist/analysis/correlator.test.js +60 -1
- package/dist/analysis/health.d.ts +2 -3
- package/dist/analysis/index.d.ts +32 -0
- package/dist/analysis/index.js +29 -0
- package/dist/analysis/library.test.d.ts +1 -0
- package/dist/analysis/library.test.js +44 -0
- package/dist/auth/credentials.d.ts +29 -0
- package/dist/auth/credentials.js +76 -0
- package/dist/auth/credentials.test.d.ts +1 -0
- package/dist/auth/credentials.test.js +57 -0
- package/dist/context.d.ts +27 -0
- package/dist/context.js +18 -0
- package/dist/enterprise-gate.d.ts +132 -0
- package/dist/enterprise-gate.js +510 -0
- package/dist/enterprise-gate.test.d.ts +1 -0
- package/dist/enterprise-gate.test.js +178 -0
- package/dist/index.js +125 -44
- package/dist/net/egress-policy.d.ts +31 -0
- package/dist/net/egress-policy.js +37 -0
- package/dist/net/egress-policy.test.d.ts +1 -0
- package/dist/net/egress-policy.test.js +52 -0
- package/dist/tools/context-seam.test.d.ts +1 -0
- package/dist/tools/context-seam.test.js +23 -0
- package/dist/tools/detect-anomalies.d.ts +2 -1
- package/dist/tools/detect-anomalies.js +47 -11
- package/dist/tools/get-service-health.d.ts +2 -1
- package/dist/tools/get-service-health.js +13 -9
- package/dist/tools/handlers.test.js +104 -0
- package/dist/tools/list-services.d.ts +2 -1
- package/dist/tools/list-services.js +2 -1
- package/dist/tools/list-sources.d.ts +2 -1
- package/dist/tools/list-sources.js +2 -1
- package/dist/tools/query-logs.d.ts +2 -1
- package/dist/tools/query-logs.js +2 -1
- package/dist/tools/query-metrics.d.ts +2 -1
- package/dist/tools/query-metrics.js +9 -1
- package/dist/ui/index.html +1510 -67
- package/package.json +10 -2
package/dist/index.js
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
import express from "express";
|
|
3
|
+
import rateLimit from "express-rate-limit";
|
|
3
4
|
import { randomUUID } from "node:crypto";
|
|
4
5
|
import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
|
|
5
6
|
import { StreamableHTTPServerTransport } from "@modelcontextprotocol/sdk/server/streamableHttp.js";
|
|
@@ -7,6 +8,9 @@ import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js"
|
|
|
7
8
|
import { z } from "zod";
|
|
8
9
|
import { loadConfig, saveConfig, DEFAULT_HEALTH_THRESHOLDS, DEFAULT_SETTINGS } from "./config/loader.js";
|
|
9
10
|
import { ConnectorRegistry, getSupportedTypes } from "./connectors/registry.js";
|
|
11
|
+
import { defaultContext, principalContext } from "./context.js";
|
|
12
|
+
import { enforceEntitledAccess, enterpriseGateStatus, enterpriseGateInfo, enterprisePolicyView, enterpriseCatalogView, enterpriseAuditTail, authorizeAdmin, updateRbacPolicy, updateCatalog, } from "./enterprise-gate.js";
|
|
13
|
+
import { loadCredentials, credentialsConfigured, extractToken, resolveToken, } from "./auth/credentials.js";
|
|
10
14
|
import { getPluginLoader } from "./connectors/loader.js";
|
|
11
15
|
import { resolveHubCatalogUrl, describeInstalled, mergeCatalog, fetchHubCatalog, } from "./connectors/hub.js";
|
|
12
16
|
import { isValidConnectorName, installTarball } from "./connectors/install.js";
|
|
@@ -75,35 +79,18 @@ function validateSourceUrl(url) {
|
|
|
75
79
|
// Hard cap for a downloaded/uploaded connector tarball (defence against
|
|
76
80
|
// a hostile or accidental huge artifact OOM-ing the server).
|
|
77
81
|
const MAX_CONNECTOR_TGZ_BYTES = 64 * 1024 * 1024;
|
|
78
|
-
//
|
|
79
|
-
//
|
|
80
|
-
//
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
}
|
|
91
|
-
const key = req.ip || "unknown";
|
|
92
|
-
let s = installRateState.get(key);
|
|
93
|
-
if (!s || s.resetAt < now) {
|
|
94
|
-
s = { count: 0, resetAt: now + WINDOW_MS };
|
|
95
|
-
installRateState.set(key, s);
|
|
96
|
-
}
|
|
97
|
-
s.count++;
|
|
98
|
-
if (s.count > MAX) {
|
|
99
|
-
res.setHeader("Retry-After", String(Math.ceil((s.resetAt - now) / 1000)));
|
|
100
|
-
res.status(429).json({
|
|
101
|
-
error: "rate limit exceeded — too many connector install attempts, slow down",
|
|
102
|
-
});
|
|
103
|
-
return;
|
|
104
|
-
}
|
|
105
|
-
next();
|
|
106
|
-
}
|
|
82
|
+
// Per-client rate limiter for the expensive runtime routes (connector
|
|
83
|
+
// install/upload: fetch + extract + verify + fs write + loader rescan;
|
|
84
|
+
// add/test source: outbound backend connect). Uses express-rate-limit
|
|
85
|
+
// so the control is explicit and well-tested. Bounds abuse even with
|
|
86
|
+
// ENABLE_UI_INSTALL on.
|
|
87
|
+
const installRateLimit = rateLimit({
|
|
88
|
+
windowMs: 60_000,
|
|
89
|
+
limit: 5,
|
|
90
|
+
standardHeaders: true,
|
|
91
|
+
legacyHeaders: false,
|
|
92
|
+
message: { error: "rate limit exceeded — too many attempts, slow down" },
|
|
93
|
+
});
|
|
107
94
|
async function main() {
|
|
108
95
|
// Stdio transport mode (MCP catalogs / desktop clients / Glama's
|
|
109
96
|
// mcp-proxy spawn a stdio MCP server and read JSON-RPC from stdout).
|
|
@@ -124,7 +111,7 @@ async function main() {
|
|
|
124
111
|
// so we cannot share a single McpServer across HTTP sessions. Each new
|
|
125
112
|
// session needs its own server. The factory captures the live registry
|
|
126
113
|
// by reference so tool handlers always see the current configuration.
|
|
127
|
-
function createMcpServer() {
|
|
114
|
+
function createMcpServer(ctx) {
|
|
128
115
|
const mcpServer = new McpServer({
|
|
129
116
|
name: "observability-mcp",
|
|
130
117
|
version: SERVER_VERSION,
|
|
@@ -135,7 +122,10 @@ async function main() {
|
|
|
135
122
|
"When to use: call this first to learn which source names exist and are healthy before passing `source` to other tools, or to debug why a query returns no data.",
|
|
136
123
|
"Behavior: read-only, no side effects. Returns one entry per source with its name, type, configured URL, signal types (metrics/logs), and a live up/down status. Never throws for an unreachable backend — the backend is reported as down instead.",
|
|
137
124
|
"Related: use `list_services` to see what is monitored within these sources.",
|
|
138
|
-
].join(" "), {}, async () =>
|
|
125
|
+
].join(" "), {}, async () => {
|
|
126
|
+
await enforceEntitledAccess(ctx, { tool: "list_sources" });
|
|
127
|
+
return withToolMetrics("list_sources", () => listSourcesHandler(registry, ctx));
|
|
128
|
+
});
|
|
139
129
|
mcpServer.tool("list_services", [
|
|
140
130
|
"Discover the service names that can be queried, aggregated across every connected backend.",
|
|
141
131
|
"When to use: call this before `query_metrics`, `query_logs`, or `get_service_health` to obtain the exact, case-sensitive service name those tools require.",
|
|
@@ -146,7 +136,10 @@ async function main() {
|
|
|
146
136
|
.string()
|
|
147
137
|
.optional()
|
|
148
138
|
.describe("Optional case-insensitive substring to narrow the result to matching service names (e.g. 'payment'). Omit to list every discovered service."),
|
|
149
|
-
}, async (args) =>
|
|
139
|
+
}, async (args) => {
|
|
140
|
+
await enforceEntitledAccess(ctx, { tool: "list_services" });
|
|
141
|
+
return withToolMetrics("list_services", () => listServicesHandler(registry, args, ctx));
|
|
142
|
+
});
|
|
150
143
|
const metricsList = getAvailableMetricNames(registry);
|
|
151
144
|
const metricNames = registry.getBySignal("metrics").flatMap(c => c.getMetrics().map(m => m.name));
|
|
152
145
|
const uniqueNames = [...new Set(metricNames)];
|
|
@@ -175,7 +168,10 @@ async function main() {
|
|
|
175
168
|
.string()
|
|
176
169
|
.optional()
|
|
177
170
|
.describe("Optional. Metric label to break the result down by, e.g. 'instance', 'pod', 'node'. When set, the response contains one series per distinct label value under `groups`. Default: a single aggregated series."),
|
|
178
|
-
}, async (args) =>
|
|
171
|
+
}, async (args) => {
|
|
172
|
+
await enforceEntitledAccess(ctx, { tool: "query_metrics", source: args?.source, service: args?.service });
|
|
173
|
+
return withToolMetrics("query_metrics", () => queryMetricsHandler(registry, args, ctx));
|
|
174
|
+
});
|
|
179
175
|
mcpServer.tool("query_logs", [
|
|
180
176
|
"Fetch recent log entries for ONE service over a look-back window, with a pre-computed summary (error/warning counts and the most frequent error patterns).",
|
|
181
177
|
"When to use: to inspect what a service actually logged, or to investigate an error spike surfaced by `detect_anomalies` / `get_service_health`. For numeric metrics use `query_metrics` instead.",
|
|
@@ -203,7 +199,10 @@ async function main() {
|
|
|
203
199
|
.positive()
|
|
204
200
|
.optional()
|
|
205
201
|
.describe("Optional. Maximum number of log entries to return (most recent first). Default: 100."),
|
|
206
|
-
}, async (args) =>
|
|
202
|
+
}, async (args) => {
|
|
203
|
+
await enforceEntitledAccess(ctx, { tool: "query_logs", source: args?.source, service: args?.service });
|
|
204
|
+
return withToolMetrics("query_logs", () => queryLogsHandler(registry, args, ctx));
|
|
205
|
+
});
|
|
207
206
|
mcpServer.tool("get_service_health", [
|
|
208
207
|
"Produce a single aggregated health verdict for ONE service by combining its metrics and logs.",
|
|
209
208
|
"When to use: the fastest way to answer 'is this service healthy right now and why?'. Use `query_metrics`/`query_logs` to drill into the underlying numbers, or `detect_anomalies` to scan many services at once.",
|
|
@@ -213,7 +212,10 @@ async function main() {
|
|
|
213
212
|
service: z
|
|
214
213
|
.string()
|
|
215
214
|
.describe("Required. Exact, case-sensitive service name exactly as returned by `list_services` (e.g. 'payment-service')."),
|
|
216
|
-
}, async (args) =>
|
|
215
|
+
}, async (args) => {
|
|
216
|
+
await enforceEntitledAccess(ctx, { tool: "get_service_health", service: args?.service });
|
|
217
|
+
return withToolMetrics("get_service_health", () => getServiceHealthHandler(registry, args, ctx));
|
|
218
|
+
});
|
|
217
219
|
mcpServer.tool("detect_anomalies", [
|
|
218
220
|
"Scan one or all monitored services for abnormal behavior and return the findings ranked by severity.",
|
|
219
221
|
"When to use: the entry point for 'is anything wrong anywhere?' triage. Once a service is flagged, follow up with `get_service_health` for the verdict or `query_metrics`/`query_logs` for the raw evidence.",
|
|
@@ -232,7 +234,10 @@ async function main() {
|
|
|
232
234
|
.enum(["low", "medium", "high"])
|
|
233
235
|
.optional()
|
|
234
236
|
.describe("Optional. Detection threshold: 'low' flags only strong deviations (>3σ), 'medium' is balanced (>2σ), 'high' is most sensitive and noisier (>1.5σ). Default: 'medium'."),
|
|
235
|
-
}, async (args) =>
|
|
237
|
+
}, async (args) => {
|
|
238
|
+
await enforceEntitledAccess(ctx, { tool: "detect_anomalies", source: args?.source, service: args?.service });
|
|
239
|
+
return withToolMetrics("detect_anomalies", () => detectAnomaliesHandler(registry, args, ctx));
|
|
240
|
+
});
|
|
236
241
|
return mcpServer;
|
|
237
242
|
}
|
|
238
243
|
// --- HTTP server ---
|
|
@@ -327,6 +332,7 @@ async function main() {
|
|
|
327
332
|
res.json({
|
|
328
333
|
name: "observability-mcp",
|
|
329
334
|
version: SERVER_VERSION,
|
|
335
|
+
enterpriseGate: await enterpriseGateStatus(),
|
|
330
336
|
mcpProtocolVersion: "2025-03-26",
|
|
331
337
|
build: {
|
|
332
338
|
commit: process.env.GIT_COMMIT || null,
|
|
@@ -350,6 +356,58 @@ async function main() {
|
|
|
350
356
|
app.get("/api/connectors", (_req, res) => {
|
|
351
357
|
res.json({ connectors: describeInstalled(getPluginLoader().list()) });
|
|
352
358
|
});
|
|
359
|
+
// --- Enterprise console (read-only introspection) -------------------
|
|
360
|
+
// Drives the management UI's Enterprise page. Read-only in this phase;
|
|
361
|
+
// never exposes the entitlement token or any key. Same trusted-local
|
|
362
|
+
// management plane as the other /api/* endpoints (see auth-and-tls).
|
|
363
|
+
app.get("/api/enterprise/status", async (_req, res) => {
|
|
364
|
+
try {
|
|
365
|
+
res.json(await enterpriseGateInfo());
|
|
366
|
+
}
|
|
367
|
+
catch (e) {
|
|
368
|
+
res.status(500).json({ error: String(e) });
|
|
369
|
+
}
|
|
370
|
+
});
|
|
371
|
+
app.get("/api/enterprise/policy", (_req, res) => {
|
|
372
|
+
res.json(enterprisePolicyView());
|
|
373
|
+
});
|
|
374
|
+
app.get("/api/enterprise/catalog", (_req, res) => {
|
|
375
|
+
res.json(enterpriseCatalogView());
|
|
376
|
+
});
|
|
377
|
+
// Tail of the enterprise audit log for the management UI's Enterprise page.
//
// `?limit=` is optional. It is truncated to an integer and clamped to the
// range 1..500; anything non-numeric, zero, or negative falls back to the
// default of 50. The lower bound matters: an upper-bound-only check would let
// a negative or fractional count through to enterpriseAuditTail() unchanged.
//
// Responds with whatever enterpriseAuditTail(limit) resolves to, or with
// HTTP 500 and { error } if it throws.
app.get("/api/enterprise/audit", async (req, res) => {
    const requested = Math.trunc(Number(req.query.limit));
    const limit = Number.isNaN(requested) || requested < 1
        ? 50
        : Math.min(requested, 500);
    try {
        res.json(await enterpriseAuditTail(limit));
    }
    catch (e) {
        res.status(500).json({ error: String(e) });
    }
});
|
|
386
|
+
// Phase 2 / Phase 3: edit the RBAC policy and the product catalog. NOT on the
// open local plane — both writes require an API-key principal that the
// CURRENT policy grants `enterprise:admin`.
//
// The two routes follow the same flow, so it lives in one handler factory:
//   1. resolve the caller's credential from the request headers
//      (no credential -> principal is null and authorizeAdmin decides),
//   2. authorizeAdmin(principal): on refusal reply with its status + error,
//   3. applyUpdate(principal, body): on refusal reply with its status + error,
//   4. otherwise reply { ok: true }.
// The whole flow sits in try/catch so that a throw or rejected promise from
// the gate is answered with a JSON 500, matching the read-only enterprise
// routes, rather than relying on the framework to deal with a rejected
// handler promise.
const enterpriseAdminWrite = (applyUpdate) => async (req, res) => {
    try {
        const cred = resolveToken(extractToken(req.headers), loadCredentials());
        const principal = cred ? cred.name : null;
        const authz = await authorizeAdmin(principal);
        if (!authz.ok)
            return res.status(authz.status).json({ error: authz.error });
        const result = await applyUpdate(principal, req.body);
        if (!result.ok)
            return res.status(result.status).json({ error: result.error });
        res.json({ ok: true });
    }
    catch (e) {
        // Only answer if nothing has been written yet; a second response
        // on the same request would itself throw.
        if (!res.headersSent)
            res.status(500).json({ error: String(e) });
    }
};
app.put("/api/enterprise/policy", enterpriseAdminWrite(updateRbacPolicy));
app.put("/api/enterprise/catalog", enterpriseAdminWrite(updateCatalog));
|
|
353
411
|
// Server-side proxy of the connector hub catalog (so the browser
|
|
354
412
|
// needn't reach the hub directly — works behind a proxy / against a
|
|
355
413
|
// mirror via HUB_CATALOG_URL). Installed status merged in.
|
|
@@ -485,7 +543,7 @@ async function main() {
|
|
|
485
543
|
}
|
|
486
544
|
});
|
|
487
545
|
// Add a new source
|
|
488
|
-
app.post("/api/sources", async (req, res) => {
|
|
546
|
+
app.post("/api/sources", installRateLimit, async (req, res) => {
|
|
489
547
|
const { name, type, url, enabled, auth, tls } = req.body;
|
|
490
548
|
if (!name || !type || !url) {
|
|
491
549
|
res.status(400).json({ error: "name, type, and url are required" });
|
|
@@ -548,7 +606,7 @@ async function main() {
|
|
|
548
606
|
res.json({ ok: true });
|
|
549
607
|
});
|
|
550
608
|
// Test a source connection (without saving)
|
|
551
|
-
app.post("/api/sources/test", async (req, res) => {
|
|
609
|
+
app.post("/api/sources/test", installRateLimit, async (req, res) => {
|
|
552
610
|
const { name, type, url, enabled, auth, tls } = req.body;
|
|
553
611
|
if (!type || !url) {
|
|
554
612
|
res.status(400).json({ error: "type and url are required" });
|
|
@@ -594,7 +652,7 @@ async function main() {
|
|
|
594
652
|
// List discovered services
|
|
595
653
|
app.get("/api/services", async (_req, res) => {
|
|
596
654
|
try {
|
|
597
|
-
const result = await listServicesHandler(registry, {});
|
|
655
|
+
const result = await listServicesHandler(registry, {}, defaultContext());
|
|
598
656
|
res.json(parseToolResult(result));
|
|
599
657
|
}
|
|
600
658
|
catch {
|
|
@@ -604,7 +662,7 @@ async function main() {
|
|
|
604
662
|
// Health endpoint for UI dashboard
|
|
605
663
|
app.get("/api/health/:service", async (req, res) => {
|
|
606
664
|
try {
|
|
607
|
-
const result = await getServiceHealthHandler(registry, { service: req.params.service });
|
|
665
|
+
const result = await getServiceHealthHandler(registry, { service: req.params.service }, defaultContext());
|
|
608
666
|
res.json(parseToolResult(result));
|
|
609
667
|
}
|
|
610
668
|
catch {
|
|
@@ -614,13 +672,13 @@ async function main() {
|
|
|
614
672
|
// Health for all services
|
|
615
673
|
app.get("/api/health", async (_req, res) => {
|
|
616
674
|
try {
|
|
617
|
-
const servicesResult = await listServicesHandler(registry, {});
|
|
675
|
+
const servicesResult = await listServicesHandler(registry, {}, defaultContext());
|
|
618
676
|
const parsed = parseToolResult(servicesResult);
|
|
619
677
|
const services = parsed?.services || [];
|
|
620
678
|
const health = {};
|
|
621
679
|
for (const svc of services) {
|
|
622
680
|
try {
|
|
623
|
-
const result = await getServiceHealthHandler(registry, { service: svc.name });
|
|
681
|
+
const result = await getServiceHealthHandler(registry, { service: svc.name }, defaultContext());
|
|
624
682
|
health[svc.name] = parseToolResult(result);
|
|
625
683
|
}
|
|
626
684
|
catch {
|
|
@@ -703,7 +761,7 @@ async function main() {
|
|
|
703
761
|
});
|
|
704
762
|
// Stdio transport: one server over stdin/stdout, no HTTP listener.
|
|
705
763
|
if (STDIO) {
|
|
706
|
-
const server = createMcpServer();
|
|
764
|
+
const server = createMcpServer(defaultContext());
|
|
707
765
|
await server.connect(new StdioServerTransport());
|
|
708
766
|
console.error(`observability-mcp running on stdio transport · connectors: ${registry
|
|
709
767
|
.getAll()
|
|
@@ -727,7 +785,26 @@ async function main() {
|
|
|
727
785
|
}
|
|
728
786
|
mcpActiveSessions.set(transports.size);
|
|
729
787
|
}, 5 * 60 * 1000);
|
|
788
|
+
// Single-tenant auth gate for the /mcp routes.
//
// No credentials configured -> anonymous (current behaviour, fully backward
// compatible): the default context is returned and nothing is checked.
// Credentials configured -> a valid Bearer/X-API-Key is required on every
// /mcp request; the matching credential's name and coarse source allow-list
// are resolved into the RequestContext.
//
// Returns the context to use for this request, or null after a 401 has
// already been written to `res` — callers must stop handling the request
// when they get null.
function gateCtx(req, res) {
    if (!credentialsConfigured())
        return defaultContext();
    const cred = resolveToken(extractToken(req.headers), loadCredentials());
    if (cred)
        return principalContext(cred.name, cred.allowedSources);
    // HTTP requires a 401 to carry at least one challenge so the client
    // can tell which authentication scheme to retry with.
    res.setHeader("WWW-Authenticate", "Bearer");
    res
        .status(401)
        .json({ error: "unauthorized: valid Bearer token or X-API-Key required" });
    return null;
}
|
|
730
804
|
app.post("/mcp", async (req, res) => {
|
|
805
|
+
const ctx = gateCtx(req, res);
|
|
806
|
+
if (!ctx)
|
|
807
|
+
return;
|
|
731
808
|
const sessionId = req.headers["mcp-session-id"];
|
|
732
809
|
let transport;
|
|
733
810
|
if (sessionId && transports.has(sessionId)) {
|
|
@@ -747,7 +824,7 @@ async function main() {
|
|
|
747
824
|
}
|
|
748
825
|
mcpActiveSessions.set(transports.size);
|
|
749
826
|
};
|
|
750
|
-
const sessionMcpServer = createMcpServer();
|
|
827
|
+
const sessionMcpServer = createMcpServer(ctx);
|
|
751
828
|
await sessionMcpServer.connect(transport);
|
|
752
829
|
}
|
|
753
830
|
await transport.handleRequest(req, res, req.body);
|
|
@@ -761,6 +838,8 @@ async function main() {
|
|
|
761
838
|
mcpActiveSessions.set(transports.size);
|
|
762
839
|
});
|
|
763
840
|
app.get("/mcp", async (req, res) => {
|
|
841
|
+
if (!gateCtx(req, res))
|
|
842
|
+
return;
|
|
764
843
|
const sessionId = req.headers["mcp-session-id"];
|
|
765
844
|
const transport = transports.get(sessionId);
|
|
766
845
|
if (!transport) {
|
|
@@ -770,6 +849,8 @@ async function main() {
|
|
|
770
849
|
await transport.handleRequest(req, res);
|
|
771
850
|
});
|
|
772
851
|
app.delete("/mcp", async (req, res) => {
|
|
852
|
+
if (!gateCtx(req, res))
|
|
853
|
+
return;
|
|
773
854
|
const sessionId = req.headers["mcp-session-id"];
|
|
774
855
|
const transport = transports.get(sessionId);
|
|
775
856
|
if (transport) {
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Verifiable offline mode — egress policy.
|
|
3
|
+
*
|
|
4
|
+
* The server performs **no telemetry, analytics, phone-home, or update
|
|
5
|
+
* checks**. The only outbound network calls it ever makes are to backends
|
|
6
|
+
* the operator explicitly configures (Prometheus/Loki/... source URLs) or to
|
|
7
|
+
* an artifact URL the operator/registry explicitly asks it to install.
|
|
8
|
+
*
|
|
9
|
+
* This module is the machine-checkable statement of that guarantee:
|
|
10
|
+
* `egress-policy.test.ts` fails CI if any source file outside the allowlist
|
|
11
|
+
* introduces an outbound call — so the "no data egress" property cannot
|
|
12
|
+
* silently regress.
|
|
13
|
+
*/
|
|
14
|
+
export declare const OFFLINE_STATEMENT: string;
|
|
15
|
+
/** Regex of outbound-call shapes the guard scans for. */
|
|
16
|
+
export declare const OUTBOUND_PATTERN: RegExp;
|
|
17
|
+
/**
|
|
18
|
+
* Files/prefixes permitted to make outbound calls, each with the reason.
|
|
19
|
+
* Anything matching OUTBOUND_PATTERN outside these paths is a policy breach
|
|
20
|
+
* (e.g. a newly added analytics/telemetry module).
|
|
21
|
+
*/
|
|
22
|
+
export declare const EGRESS_ALLOWLIST: ReadonlyArray<{
|
|
23
|
+
prefix: string;
|
|
24
|
+
reason: string;
|
|
25
|
+
}>;
|
|
26
|
+
/**
|
|
27
|
+
* Hard-blocked analytics/telemetry SDKs — matches an *import/require of the
|
|
28
|
+
* package*, not the word in prose, so comments/policy text don't false-positive.
|
|
29
|
+
*/
|
|
30
|
+
export declare const FORBIDDEN_TELEMETRY: RegExp;
|
|
31
|
+
export declare function isEgressAllowed(relPath: string): boolean;
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Verifiable offline mode — egress policy.
|
|
3
|
+
*
|
|
4
|
+
* The server performs **no telemetry, analytics, phone-home, or update
|
|
5
|
+
* checks**. The only outbound network calls it ever makes are to backends
|
|
6
|
+
* the operator explicitly configures (Prometheus/Loki/... source URLs) or to
|
|
7
|
+
* an artifact URL the operator/registry explicitly asks it to install.
|
|
8
|
+
*
|
|
9
|
+
* This module is the machine-checkable statement of that guarantee:
|
|
10
|
+
* `egress-policy.test.ts` fails CI if any source file outside the allowlist
|
|
11
|
+
* introduces an outbound call — so the "no data egress" property cannot
|
|
12
|
+
* silently regress.
|
|
13
|
+
*/
|
|
14
|
+
export const OFFLINE_STATEMENT = "observability-mcp makes no telemetry/analytics/phone-home/update calls. " +
|
|
15
|
+
"Outbound traffic goes only to operator-configured source backends and " +
|
|
16
|
+
"operator/registry-requested plugin artifacts. It runs fully air-gapped.";
|
|
17
|
+
/** Regex of outbound-call shapes the guard scans for. */
|
|
18
|
+
export const OUTBOUND_PATTERN = /\b(fetch\s*\(|https?\.request\s*\(|new\s+WebSocket\s*\(|import\s*\(\s*['"]https?:)/;
|
|
19
|
+
/**
 * Files/prefixes permitted to make outbound calls, each with the reason.
 * Anything matching OUTBOUND_PATTERN outside these paths is a policy breach
 * (e.g. a newly added analytics/telemetry module).
 *
 * Entry convention (enforced by isEgressAllowed): a `prefix` ending in "/"
 * is a directory and covers every path beneath it; any other `prefix` names
 * exactly one file.
 */
export const EGRESS_ALLOWLIST = [
    { prefix: "connectors/", reason: "connectors query operator-configured source backends" },
    { prefix: "cli/index.ts", reason: "CLI fetches a source location the operator passed explicitly" },
    { prefix: "index.ts", reason: "connector-hub plugin install of an operator/registry-requested tarball URL" },
];
/**
 * Hard-blocked analytics/telemetry SDKs — matches an *import/require of the
 * package*, not the word in prose, so comments/policy text don't false-positive.
 */
export const FORBIDDEN_TELEMETRY = /(?:from\s*['"]|require\(\s*['"])[^'"]*(sentry|posthog|mixpanel|amplitude|@segment|datadog-rum|analytics-node|google-analytics)/i;
/**
 * Decide whether a source file may contain outbound network calls.
 *
 * @param {string} relPath - Path relative to the source root; backslashes
 *   are normalised to "/" so Windows-style paths compare equal.
 * @returns {boolean} true when the path is covered by EGRESS_ALLOWLIST.
 *
 * Directory entries (trailing "/") match by prefix. File entries must match
 * the whole path: a plain prefix test would let "index.ts" also allowlist
 * "index.tsx" or "index.ts.bak", silently widening the egress guard.
 */
export function isEgressAllowed(relPath) {
    const p = relPath.replace(/\\/g, "/");
    return EGRESS_ALLOWLIST.some(({ prefix }) => prefix.endsWith("/") ? p.startsWith(prefix) : p === prefix);
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
import { describe, it } from "node:test";
|
|
2
|
+
import assert from "node:assert/strict";
|
|
3
|
+
import { readFileSync, readdirSync, statSync } from "node:fs";
|
|
4
|
+
import { fileURLToPath } from "node:url";
|
|
5
|
+
import { dirname, join, relative } from "node:path";
|
|
6
|
+
import { OUTBOUND_PATTERN, FORBIDDEN_TELEMETRY, isEgressAllowed, EGRESS_ALLOWLIST, } from "./egress-policy.js";
|
|
7
|
+
// Verifiable offline mode: static guard so the "no data egress" guarantee
|
|
8
|
+
// cannot silently regress. Any new outbound call outside the documented
|
|
9
|
+
// allowlist, or any analytics/telemetry SDK anywhere, fails CI here.
|
|
10
|
+
const srcRoot = join(dirname(fileURLToPath(import.meta.url)), "..");
|
|
11
|
+
/**
 * Recursively collect the non-test TypeScript sources beneath `dir`.
 *
 * @param {string} dir - Directory to scan.
 * @returns {string[]} Paths (each `dir` joined with the nested entry names)
 *   of every file whose name ends in ".ts" but not ".test.ts".
 *
 * Entry types come from the directory listing itself instead of a separate
 * stat per entry. Besides saving a syscall per file, this means a dangling
 * symlink no longer aborts the scan and directory symlinks are not followed,
 * so a symlink cycle cannot cause unbounded recursion.
 */
function walk(dir) {
    const out = [];
    for (const entry of readdirSync(dir, { withFileTypes: true })) {
        const p = join(dir, entry.name);
        if (entry.isDirectory())
            out.push(...walk(p));
        else if (entry.name.endsWith(".ts") && !entry.name.endsWith(".test.ts"))
            out.push(p);
    }
    return out;
}
|
|
22
|
+
describe("verifiable offline mode — egress policy", () => {
|
|
23
|
+
const files = walk(srcRoot)
|
|
24
|
+
.map((f) => ({
|
|
25
|
+
rel: relative(srcRoot, f).replace(/\\/g, "/"),
|
|
26
|
+
src: readFileSync(f, "utf8"),
|
|
27
|
+
}))
|
|
28
|
+
// The policy module itself names these tokens by design.
|
|
29
|
+
.filter((f) => f.rel !== "net/egress-policy.ts");
|
|
30
|
+
it("scans a non-trivial number of source files", () => {
|
|
31
|
+
assert.ok(files.length > 20, `only scanned ${files.length} files`);
|
|
32
|
+
});
|
|
33
|
+
it("no outbound call outside the egress allowlist", () => {
|
|
34
|
+
const breaches = files
|
|
35
|
+
.filter((f) => OUTBOUND_PATTERN.test(f.src) && !isEgressAllowed(f.rel))
|
|
36
|
+
.map((f) => f.rel);
|
|
37
|
+
assert.deepEqual(breaches, [], `outbound calls found outside allowlist (${EGRESS_ALLOWLIST.map((a) => a.prefix).join(", ")}): ` +
|
|
38
|
+
`${breaches.join(", ")} — telemetry/phone-home is forbidden; if legitimate, extend EGRESS_ALLOWLIST with a reason`);
|
|
39
|
+
});
|
|
40
|
+
it("no analytics/telemetry SDK anywhere in source", () => {
|
|
41
|
+
const hits = files
|
|
42
|
+
.filter((f) => FORBIDDEN_TELEMETRY.test(f.src))
|
|
43
|
+
.map((f) => f.rel);
|
|
44
|
+
assert.deepEqual(hits, [], `forbidden telemetry/analytics identifiers in: ${hits.join(", ")}`);
|
|
45
|
+
});
|
|
46
|
+
it("allowlisted files are still present (allowlist not stale)", () => {
|
|
47
|
+
for (const { prefix } of EGRESS_ALLOWLIST) {
|
|
48
|
+
const covered = files.some((f) => f.rel === prefix || f.rel.startsWith(prefix));
|
|
49
|
+
assert.ok(covered, `allowlist entry "${prefix}" matches no source file — prune it`);
|
|
50
|
+
}
|
|
51
|
+
});
|
|
52
|
+
});
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
import { describe, it } from "node:test";
|
|
2
|
+
import assert from "node:assert/strict";
|
|
3
|
+
import { readFileSync, readdirSync } from "node:fs";
|
|
4
|
+
import { fileURLToPath } from "node:url";
|
|
5
|
+
import { dirname, join } from "node:path";
|
|
6
|
+
// Keystone guard: every tool handler must accept the RequestContext seam.
|
|
7
|
+
// This prevents a new handler (or a refactor) from silently bypassing the
|
|
8
|
+
// request-scoped context that access-control / scoping / audit attach to.
|
|
9
|
+
const here = dirname(fileURLToPath(import.meta.url));
|
|
10
|
+
describe("RequestContext seam", () => {
|
|
11
|
+
const handlerFiles = readdirSync(here).filter((f) => f.endsWith(".ts") && !f.endsWith(".test.ts"));
|
|
12
|
+
for (const file of handlerFiles) {
|
|
13
|
+
const src = readFileSync(join(here, file), "utf8");
|
|
14
|
+
const hasHandler = /export\s+(async\s+)?function\s+\w*Handler\s*\(/.test(src);
|
|
15
|
+
if (!hasHandler)
|
|
16
|
+
continue;
|
|
17
|
+
it(`${file}: handler accepts a RequestContext`, () => {
|
|
18
|
+
assert.match(src, /_ctx:\s*RequestContext/, `${file} exports a *Handler but does not thread RequestContext — ` +
|
|
19
|
+
`add the ctx seam (see context.ts)`);
|
|
20
|
+
assert.match(src, /from "\.\.\/context\.js"/, `${file} must import from ../context.js`);
|
|
21
|
+
});
|
|
22
|
+
}
|
|
23
|
+
});
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import type { ConnectorRegistry } from "../connectors/registry.js";
|
|
2
|
+
import { type RequestContext } from "../context.js";
|
|
2
3
|
export declare const detectAnomaliesDefinition: {
|
|
3
4
|
name: "detect_anomalies";
|
|
4
5
|
description: string;
|
|
@@ -25,7 +26,7 @@ export declare function detectAnomaliesHandler(registry: ConnectorRegistry, args
|
|
|
25
26
|
service?: string;
|
|
26
27
|
duration?: string;
|
|
27
28
|
sensitivity?: string;
|
|
28
|
-
}): Promise<{
|
|
29
|
+
}, _ctx?: RequestContext): Promise<{
|
|
29
30
|
content: {
|
|
30
31
|
type: "text";
|
|
31
32
|
text: string;
|
|
@@ -1,4 +1,6 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { defaultContext } from "../context.js";
|
|
2
|
+
import { detectAnomaly, classifyMetric } from "../analysis/anomaly.js";
|
|
3
|
+
import { rankRootCause } from "../analysis/correlator.js";
|
|
2
4
|
export const detectAnomaliesDefinition = {
|
|
3
5
|
name: "detect_anomalies",
|
|
4
6
|
description: "Scan for anomalies across all monitored services (or a specific service). Detects metric deviations using z-score analysis against recent baseline, checks log error spikes, and correlates signals across metrics and logs. Returns anomalies with severity ratings and cross-signal correlations.",
|
|
@@ -26,8 +28,12 @@ const SENSITIVITY_THRESHOLDS = {
|
|
|
26
28
|
medium: 2.0,
|
|
27
29
|
high: 1.5,
|
|
28
30
|
};
|
|
29
|
-
const KEY_METRICS = ["cpu", "error_rate", "latency_p99", "request_rate"];
|
|
30
|
-
|
|
31
|
+
const KEY_METRICS = ["cpu", "memory", "error_rate", "latency_p99", "request_rate"];
|
|
32
|
+
// Patterns that signal a serious incident even at warn level and even when
|
|
33
|
+
// the overall error ratio is low (e.g. a memory leak emits a handful of
|
|
34
|
+
// "OutOfMemoryWarning" lines long before it turns into 5xx errors).
|
|
35
|
+
const CRITICAL_LOG_PATTERN = /\b(out\s?of\s?memory|oom|outofmemory|heap (usage|exhaust)|memory leak|panic|fatal|deadlock|segfault|stack overflow|cannot allocate)\b/i;
|
|
36
|
+
export async function detectAnomaliesHandler(registry, args, _ctx = defaultContext()) {
|
|
31
37
|
const duration = args.duration || "10m";
|
|
32
38
|
const threshold = SENSITIVITY_THRESHOLDS[args.sensitivity || "medium"] || 2.0;
|
|
33
39
|
// Discover services to scan
|
|
@@ -56,18 +62,21 @@ export async function detectAnomaliesHandler(registry, args) {
|
|
|
56
62
|
for (const metric of KEY_METRICS) {
|
|
57
63
|
try {
|
|
58
64
|
const result = await connector.queryMetrics({ service: serviceName, metric, duration });
|
|
59
|
-
const
|
|
60
|
-
const anomaly =
|
|
65
|
+
const points = result.values.map((v) => ({ timestamp: v.timestamp, value: v.value }));
|
|
66
|
+
const anomaly = detectAnomaly(points, {
|
|
67
|
+
threshold,
|
|
68
|
+
metricKind: classifyMetric(metric),
|
|
69
|
+
});
|
|
61
70
|
if (anomaly.isAnomaly) {
|
|
62
|
-
const deviationPercent = anomaly.
|
|
71
|
+
const deviationPercent = anomaly.baselineValue === 0
|
|
63
72
|
? 100
|
|
64
|
-
: Math.round(((anomaly.
|
|
73
|
+
: Math.round(((anomaly.recentValue - anomaly.baselineValue) / anomaly.baselineValue) * 100);
|
|
65
74
|
allAnomalies.push({
|
|
66
75
|
metric,
|
|
67
|
-
severity: Math.abs(anomaly.
|
|
68
|
-
description: `${metric}
|
|
69
|
-
currentValue: anomaly.
|
|
70
|
-
baselineValue: anomaly.
|
|
76
|
+
severity: Math.abs(anomaly.score) >= 6 ? "high" : Math.abs(anomaly.score) >= 4 ? "medium" : "low",
|
|
77
|
+
description: `${metric}: ${anomaly.reason}`,
|
|
78
|
+
currentValue: anomaly.recentValue,
|
|
79
|
+
baselineValue: anomaly.baselineValue,
|
|
71
80
|
deviationPercent,
|
|
72
81
|
source: connector.name,
|
|
73
82
|
service: serviceName,
|
|
@@ -85,6 +94,21 @@ export async function detectAnomaliesHandler(registry, args) {
|
|
|
85
94
|
continue;
|
|
86
95
|
try {
|
|
87
96
|
const logs = await connector.queryLogs({ service: serviceName, duration, limit: 500 });
|
|
97
|
+
// Critical-pattern scan — independent of the error-ratio gate, so a
|
|
98
|
+
// warn-level OOM/leak signal is not silently dropped.
|
|
99
|
+
const criticalPattern = logs.summary.topPatterns.find((p) => CRITICAL_LOG_PATTERN.test(p));
|
|
100
|
+
if (criticalPattern) {
|
|
101
|
+
allAnomalies.push({
|
|
102
|
+
metric: "log_critical_pattern",
|
|
103
|
+
severity: "high",
|
|
104
|
+
description: `Critical log pattern detected: "${criticalPattern}"`,
|
|
105
|
+
currentValue: logs.summary.errorCount + logs.summary.warnCount,
|
|
106
|
+
baselineValue: 0,
|
|
107
|
+
deviationPercent: 100,
|
|
108
|
+
source: connector.name,
|
|
109
|
+
service: serviceName,
|
|
110
|
+
});
|
|
111
|
+
}
|
|
88
112
|
if (logs.summary.errorCount > 5) {
|
|
89
113
|
const errorRatio = logs.summary.total > 0
|
|
90
114
|
? logs.summary.errorCount / logs.summary.total
|
|
@@ -123,10 +147,22 @@ export async function detectAnomaliesHandler(registry, args) {
|
|
|
123
147
|
}
|
|
124
148
|
}
|
|
125
149
|
}
|
|
150
|
+
// Dependency-aware root-cause ranking. The service graph / change markers
|
|
151
|
+
// are empty here (no trace source wired yet); ranking then degrades to
|
|
152
|
+
// severity-weighted ordering and still names the most likely culprit
|
|
153
|
+
// instead of just listing "both signals bad".
|
|
154
|
+
const rootCause = allAnomalies.length > 0
|
|
155
|
+
? rankRootCause(allAnomalies.map((a) => ({
|
|
156
|
+
service: a.service,
|
|
157
|
+
metric: a.metric,
|
|
158
|
+
severity: a.severity,
|
|
159
|
+
})))
|
|
160
|
+
: { ranked: [], summary: "" };
|
|
126
161
|
const result = {
|
|
127
162
|
scannedServices: serviceNames.length,
|
|
128
163
|
anomalies: allAnomalies,
|
|
129
164
|
correlations: allCorrelations,
|
|
165
|
+
rootCause,
|
|
130
166
|
summary: allAnomalies.length === 0
|
|
131
167
|
? "All services healthy — no anomalies detected."
|
|
132
168
|
: `${allAnomalies.length} anomal${allAnomalies.length === 1 ? "y" : "ies"} detected across ${[...new Set(allAnomalies.map((a) => a.service))].length} service(s).`,
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import type { ConnectorRegistry } from "../connectors/registry.js";
|
|
2
|
+
import { type RequestContext } from "../context.js";
|
|
2
3
|
import type { HealthThresholds } from "../types.js";
|
|
3
4
|
export declare function setHealthThresholds(t: HealthThresholds): void;
|
|
4
5
|
export declare const getServiceHealthDefinition: {
|
|
@@ -17,7 +18,7 @@ export declare const getServiceHealthDefinition: {
|
|
|
17
18
|
};
|
|
18
19
|
export declare function getServiceHealthHandler(registry: ConnectorRegistry, args: {
|
|
19
20
|
service: string;
|
|
20
|
-
}): Promise<{
|
|
21
|
+
}, _ctx?: RequestContext): Promise<{
|
|
21
22
|
content: {
|
|
22
23
|
type: "text";
|
|
23
24
|
text: string;
|
|
@@ -1,5 +1,6 @@
|
|
|
1
|
+
import { defaultContext } from "../context.js";
|
|
1
2
|
import { calculateHealthScore } from "../analysis/health.js";
|
|
2
|
-
import {
|
|
3
|
+
import { detectRobustAnomaly, classifyMetric } from "../analysis/anomaly.js";
|
|
3
4
|
import { sanitizeForLog } from "../util/sanitize.js";
|
|
4
5
|
let _thresholds = null;
|
|
5
6
|
export function setHealthThresholds(t) {
|
|
@@ -19,7 +20,7 @@ export const getServiceHealthDefinition = {
|
|
|
19
20
|
required: ["service"],
|
|
20
21
|
},
|
|
21
22
|
};
|
|
22
|
-
export async function getServiceHealthHandler(registry, args) {
|
|
23
|
+
export async function getServiceHealthHandler(registry, args, _ctx = defaultContext()) {
|
|
23
24
|
const metricsConnectors = registry.getBySignal("metrics");
|
|
24
25
|
const logConnectors = registry.getBySignal("logs");
|
|
25
26
|
// Gather metrics
|
|
@@ -93,17 +94,20 @@ export async function getServiceHealthHandler(registry, args) {
|
|
|
93
94
|
};
|
|
94
95
|
}
|
|
95
96
|
function checkAnomaly(values, metric, service, source, anomalies) {
|
|
96
|
-
|
|
97
|
+
// Robust, metric-type-aware detector (same path as detect_anomalies):
|
|
98
|
+
// latency/error_rate/saturation are one-sided, so a *decrease* (e.g.
|
|
99
|
+
// latency dropping) is correctly NOT flagged as an anomaly.
|
|
100
|
+
const result = detectRobustAnomaly(values, { metricKind: classifyMetric(metric) });
|
|
97
101
|
if (result.isAnomaly) {
|
|
98
|
-
const deviationPercent = result.
|
|
102
|
+
const deviationPercent = result.baselineValue === 0
|
|
99
103
|
? 100
|
|
100
|
-
: Math.round(((result.
|
|
104
|
+
: Math.round(((result.recentValue - result.baselineValue) / result.baselineValue) * 100);
|
|
101
105
|
anomalies.push({
|
|
102
106
|
metric,
|
|
103
|
-
severity: Math.abs(result.
|
|
104
|
-
description: `${metric}
|
|
105
|
-
currentValue: result.
|
|
106
|
-
baselineValue: result.
|
|
107
|
+
severity: Math.abs(result.score) >= 6 ? "high" : Math.abs(result.score) >= 4 ? "medium" : "low",
|
|
108
|
+
description: `${metric}: ${result.reason}`,
|
|
109
|
+
currentValue: result.recentValue,
|
|
110
|
+
baselineValue: result.baselineValue,
|
|
107
111
|
deviationPercent,
|
|
108
112
|
source,
|
|
109
113
|
service,
|