@thotischner/observability-mcp 1.5.1 → 1.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/sources.yaml +10 -0
- package/dist/connectors/interface.d.ts +11 -1
- package/dist/connectors/interface.js +7 -1
- package/dist/connectors/kubernetes-client.d.ts +3 -0
- package/dist/connectors/kubernetes-client.js +90 -0
- package/dist/connectors/kubernetes-graph.d.ts +73 -0
- package/dist/connectors/kubernetes-graph.js +257 -0
- package/dist/connectors/kubernetes-graph.test.d.ts +1 -0
- package/dist/connectors/kubernetes-graph.test.js +150 -0
- package/dist/connectors/kubernetes.d.ts +52 -0
- package/dist/connectors/kubernetes.js +185 -0
- package/dist/connectors/kubernetes.test.d.ts +1 -0
- package/dist/connectors/kubernetes.test.js +136 -0
- package/dist/connectors/loader.js +6 -0
- package/dist/connectors/topology.test.d.ts +1 -0
- package/dist/connectors/topology.test.js +165 -0
- package/dist/enterprise-gate.d.ts +132 -0
- package/dist/enterprise-gate.js +510 -0
- package/dist/enterprise-gate.test.d.ts +1 -0
- package/dist/enterprise-gate.test.js +178 -0
- package/dist/index.js +152 -6
- package/dist/sdk/index.d.ts +2 -2
- package/dist/sdk/manifest-schema.d.ts +1 -0
- package/dist/sdk/manifest-schema.js +1 -1
- package/dist/tools/get-service-health.js +11 -8
- package/dist/tools/handlers.test.js +31 -0
- package/dist/tools/topology.d.ts +64 -0
- package/dist/tools/topology.js +233 -0
- package/dist/tools/topology.test.d.ts +1 -0
- package/dist/tools/topology.test.js +210 -0
- package/dist/types.d.ts +67 -1
- package/dist/ui/index.html +2333 -67
- package/package.json +3 -2
package/dist/index.js
CHANGED
|
@@ -8,7 +8,9 @@ import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js"
|
|
|
8
8
|
import { z } from "zod";
|
|
9
9
|
import { loadConfig, saveConfig, DEFAULT_HEALTH_THRESHOLDS, DEFAULT_SETTINGS } from "./config/loader.js";
|
|
10
10
|
import { ConnectorRegistry, getSupportedTypes } from "./connectors/registry.js";
|
|
11
|
+
import { isTopologyProvider } from "./connectors/interface.js";
|
|
11
12
|
import { defaultContext, principalContext } from "./context.js";
|
|
13
|
+
import { enforceEntitledAccess, enterpriseGateStatus, enterpriseGateInfo, enterprisePolicyView, enterpriseCatalogView, enterpriseAuditTail, authorizeAdmin, updateRbacPolicy, updateCatalog, } from "./enterprise-gate.js";
|
|
12
14
|
import { loadCredentials, credentialsConfigured, extractToken, resolveToken, } from "./auth/credentials.js";
|
|
13
15
|
import { getPluginLoader } from "./connectors/loader.js";
|
|
14
16
|
import { resolveHubCatalogUrl, describeInstalled, mergeCatalog, fetchHubCatalog, } from "./connectors/hub.js";
|
|
@@ -22,6 +24,7 @@ import { queryMetricsHandler } from "./tools/query-metrics.js";
|
|
|
22
24
|
import { queryLogsHandler } from "./tools/query-logs.js";
|
|
23
25
|
import { getServiceHealthHandler, setHealthThresholds } from "./tools/get-service-health.js";
|
|
24
26
|
import { detectAnomaliesHandler } from "./tools/detect-anomalies.js";
|
|
27
|
+
import { getTopologyHandler, getBlastRadiusHandler } from "./tools/topology.js";
|
|
25
28
|
import { fileURLToPath } from "node:url";
|
|
26
29
|
import { dirname, join } from "node:path";
|
|
27
30
|
import { readFileSync, writeFileSync, mkdtempSync, rmSync } from "node:fs";
|
|
@@ -121,7 +124,10 @@ async function main() {
|
|
|
121
124
|
"When to use: call this first to learn which source names exist and are healthy before passing `source` to other tools, or to debug why a query returns no data.",
|
|
122
125
|
"Behavior: read-only, no side effects. Returns one entry per source with its name, type, configured URL, signal types (metrics/logs), and a live up/down status. Never throws for an unreachable backend — the backend is reported as down instead.",
|
|
123
126
|
"Related: use `list_services` to see what is monitored within these sources.",
|
|
124
|
-
].join(" "), {}, async () =>
|
|
127
|
+
].join(" "), {}, async () => {
|
|
128
|
+
await enforceEntitledAccess(ctx, { tool: "list_sources" });
|
|
129
|
+
return withToolMetrics("list_sources", () => listSourcesHandler(registry, ctx));
|
|
130
|
+
});
|
|
125
131
|
mcpServer.tool("list_services", [
|
|
126
132
|
"Discover the service names that can be queried, aggregated across every connected backend.",
|
|
127
133
|
"When to use: call this before `query_metrics`, `query_logs`, or `get_service_health` to obtain the exact, case-sensitive service name those tools require.",
|
|
@@ -132,7 +138,10 @@ async function main() {
|
|
|
132
138
|
.string()
|
|
133
139
|
.optional()
|
|
134
140
|
.describe("Optional case-insensitive substring to narrow the result to matching service names (e.g. 'payment'). Omit to list every discovered service."),
|
|
135
|
-
}, async (args) =>
|
|
141
|
+
}, async (args) => {
|
|
142
|
+
await enforceEntitledAccess(ctx, { tool: "list_services" });
|
|
143
|
+
return withToolMetrics("list_services", () => listServicesHandler(registry, args, ctx));
|
|
144
|
+
});
|
|
136
145
|
const metricsList = getAvailableMetricNames(registry);
|
|
137
146
|
const metricNames = registry.getBySignal("metrics").flatMap(c => c.getMetrics().map(m => m.name));
|
|
138
147
|
const uniqueNames = [...new Set(metricNames)];
|
|
@@ -161,7 +170,10 @@ async function main() {
|
|
|
161
170
|
.string()
|
|
162
171
|
.optional()
|
|
163
172
|
.describe("Optional. Metric label to break the result down by, e.g. 'instance', 'pod', 'node'. When set, the response contains one series per distinct label value under `groups`. Default: a single aggregated series."),
|
|
164
|
-
}, async (args) =>
|
|
173
|
+
}, async (args) => {
|
|
174
|
+
await enforceEntitledAccess(ctx, { tool: "query_metrics", source: args?.source, service: args?.service });
|
|
175
|
+
return withToolMetrics("query_metrics", () => queryMetricsHandler(registry, args, ctx));
|
|
176
|
+
});
|
|
165
177
|
mcpServer.tool("query_logs", [
|
|
166
178
|
"Fetch recent log entries for ONE service over a look-back window, with a pre-computed summary (error/warning counts and the most frequent error patterns).",
|
|
167
179
|
"When to use: to inspect what a service actually logged, or to investigate an error spike surfaced by `detect_anomalies` / `get_service_health`. For numeric metrics use `query_metrics` instead.",
|
|
@@ -189,7 +201,10 @@ async function main() {
|
|
|
189
201
|
.positive()
|
|
190
202
|
.optional()
|
|
191
203
|
.describe("Optional. Maximum number of log entries to return (most recent first). Default: 100."),
|
|
192
|
-
}, async (args) =>
|
|
204
|
+
}, async (args) => {
|
|
205
|
+
await enforceEntitledAccess(ctx, { tool: "query_logs", source: args?.source, service: args?.service });
|
|
206
|
+
return withToolMetrics("query_logs", () => queryLogsHandler(registry, args, ctx));
|
|
207
|
+
});
|
|
193
208
|
mcpServer.tool("get_service_health", [
|
|
194
209
|
"Produce a single aggregated health verdict for ONE service by combining its metrics and logs.",
|
|
195
210
|
"When to use: the fastest way to answer 'is this service healthy right now and why?'. Use `query_metrics`/`query_logs` to drill into the underlying numbers, or `detect_anomalies` to scan many services at once.",
|
|
@@ -199,7 +214,10 @@ async function main() {
|
|
|
199
214
|
service: z
|
|
200
215
|
.string()
|
|
201
216
|
.describe("Required. Exact, case-sensitive service name exactly as returned by `list_services` (e.g. 'payment-service')."),
|
|
202
|
-
}, async (args) =>
|
|
217
|
+
}, async (args) => {
|
|
218
|
+
await enforceEntitledAccess(ctx, { tool: "get_service_health", service: args?.service });
|
|
219
|
+
return withToolMetrics("get_service_health", () => getServiceHealthHandler(registry, args, ctx));
|
|
220
|
+
});
|
|
203
221
|
mcpServer.tool("detect_anomalies", [
|
|
204
222
|
"Scan one or all monitored services for abnormal behavior and return the findings ranked by severity.",
|
|
205
223
|
"When to use: the entry point for 'is anything wrong anywhere?' triage. Once a service is flagged, follow up with `get_service_health` for the verdict or `query_metrics`/`query_logs` for the raw evidence.",
|
|
@@ -218,7 +236,52 @@ async function main() {
|
|
|
218
236
|
.enum(["low", "medium", "high"])
|
|
219
237
|
.optional()
|
|
220
238
|
.describe("Optional. Detection threshold: 'low' flags only strong deviations (>3σ), 'medium' is balanced (>2σ), 'high' is most sensitive and noisier (>1.5σ). Default: 'medium'."),
|
|
221
|
-
}, async (args) =>
|
|
239
|
+
}, async (args) => {
|
|
240
|
+
await enforceEntitledAccess(ctx, { tool: "detect_anomalies", source: args?.source, service: args?.service });
|
|
241
|
+
return withToolMetrics("detect_anomalies", () => detectAnomaliesHandler(registry, args, ctx));
|
|
242
|
+
});
|
|
243
|
+
mcpServer.tool("get_topology", [
|
|
244
|
+
"Return the infrastructure topology graph (Resources and Edges) from every topology-capable connector.",
|
|
245
|
+
"When to use: when an agent needs to reason about which workload runs on which host, who owns whom, or which scope (namespace/project/folder) a resource belongs to. Pair with `get_blast_radius` for shared-host RCA.",
|
|
246
|
+
"Behavior: read-only, no side effects. Returns `{ sources, resources, edges, total, truncated }`. Filters compose: `source` to one connector, `kind` to one resource type (e.g. 'pod', 'node', 'deployment'), `scope` to members of a namespace/folder/project. Output is capped by `limit` (default 500, max 5000) and edges referencing dropped resources are removed.",
|
|
247
|
+
"Related: `get_blast_radius` to evaluate the impact of a host failure; `list_sources` to discover topology-capable connectors.",
|
|
248
|
+
].join(" "), {
|
|
249
|
+
source: z
|
|
250
|
+
.string()
|
|
251
|
+
.optional()
|
|
252
|
+
.describe("Optional. Restrict the graph to one topology connector by source name (see `list_sources`). Default: merge across all connectors."),
|
|
253
|
+
kind: z
|
|
254
|
+
.string()
|
|
255
|
+
.optional()
|
|
256
|
+
.describe("Optional. Restrict to resources of one kind. Common values for Kubernetes: 'pod', 'node', 'deployment', 'replicaset', 'namespace'. Other connectors may emit different kinds (e.g. 'vm', 'hypervisor', 'volume'). Default: all kinds."),
|
|
257
|
+
scope: z
|
|
258
|
+
.string()
|
|
259
|
+
.optional()
|
|
260
|
+
.describe("Optional. Restrict to resources contained in a scope (anything pointed to by `IN_NAMESPACE` edges). Pass the scope's resource id (e.g. 'k8s:namespace:default') or its name (e.g. 'default'). Default: no scope filter."),
|
|
261
|
+
limit: z
|
|
262
|
+
.number()
|
|
263
|
+
.int()
|
|
264
|
+
.min(1)
|
|
265
|
+
.max(5000)
|
|
266
|
+
.optional()
|
|
267
|
+
.describe("Optional. Maximum resources to return; edges are trimmed to the kept set. Default 500, max 5000."),
|
|
268
|
+
}, async (args) => {
|
|
269
|
+
await enforceEntitledAccess(ctx, { tool: "get_topology", source: args?.source });
|
|
270
|
+
return withToolMetrics("get_topology", () => getTopologyHandler(registry, args, ctx));
|
|
271
|
+
});
|
|
272
|
+
mcpServer.tool("get_blast_radius", [
|
|
273
|
+
"Given a resource, return who else fails if its underlying host(s) fail.",
|
|
274
|
+
"When to use: cross-cutting RCA — when several services degrade together and you suspect a shared host. Works for any RUNS_ON relationship: pod→node, vm→hypervisor, container→host.",
|
|
275
|
+
"Behavior: read-only, no side effects. Resolves `resource` to a Resource (accepts canonical id, exact name, or unique substring), determines its host(s) via RUNS_ON, then lists every other resource that runs on those hosts, bucketed by ownership root (the terminal `OWNED_BY` target — e.g. the Deployment, not the ReplicaSet). If the target is itself a host, its tenants are reported. Returns a structured error if the resource is ambiguous or unknown.",
|
|
276
|
+
"Related: `get_topology` for the full graph; `get_service_health` for the per-service verdict on each co-tenant.",
|
|
277
|
+
].join(" "), {
|
|
278
|
+
resource: z
|
|
279
|
+
.string()
|
|
280
|
+
.describe("Required. Resource to evaluate. Accepts the canonical id (e.g. 'k8s:pod:default/checkout-7f89d'), the exact resource name (e.g. 'checkout-7f89d'), or a unique substring of either."),
|
|
281
|
+
}, async (args) => {
|
|
282
|
+
await enforceEntitledAccess(ctx, { tool: "get_blast_radius" });
|
|
283
|
+
return withToolMetrics("get_blast_radius", () => getBlastRadiusHandler(registry, args, ctx));
|
|
284
|
+
});
|
|
222
285
|
return mcpServer;
|
|
223
286
|
}
|
|
224
287
|
// --- HTTP server ---
|
|
@@ -313,6 +376,7 @@ async function main() {
|
|
|
313
376
|
res.json({
|
|
314
377
|
name: "observability-mcp",
|
|
315
378
|
version: SERVER_VERSION,
|
|
379
|
+
enterpriseGate: await enterpriseGateStatus(),
|
|
316
380
|
mcpProtocolVersion: "2025-03-26",
|
|
317
381
|
build: {
|
|
318
382
|
commit: process.env.GIT_COMMIT || null,
|
|
@@ -336,6 +400,58 @@ async function main() {
|
|
|
336
400
|
app.get("/api/connectors", (_req, res) => {
|
|
337
401
|
res.json({ connectors: describeInstalled(getPluginLoader().list()) });
|
|
338
402
|
});
|
|
403
|
+
// --- Enterprise console ----------------------------------------------
// Drives the management UI's Enterprise page. The GET routes are
// read-only introspection and never expose the entitlement token or any
// key. Same trusted-local management plane as the other /api/* endpoints
// (see auth-and-tls). The PUT routes further down are admin-gated.
app.get("/api/enterprise/status", async (_req, res) => {
    try {
        res.json(await enterpriseGateInfo());
    }
    catch (e) {
        res.status(500).json({ error: String(e) });
    }
});
app.get("/api/enterprise/policy", (_req, res) => {
    res.json(enterprisePolicyView());
});
app.get("/api/enterprise/catalog", (_req, res) => {
    res.json(enterpriseCatalogView());
});
app.get("/api/enterprise/audit", async (req, res) => {
    // `?limit=` is caller-controlled: accept only a finite value >= 1,
    // round it down to an integer and cap it at 500. Anything else
    // (missing, 0, negative, fractional below 1, non-numeric) falls back
    // to the default of 50 instead of being forwarded as-is.
    const requested = Number(req.query.limit);
    const limit = Number.isFinite(requested) && requested >= 1
        ? Math.min(Math.floor(requested), 500)
        : 50;
    try {
        res.json(await enterpriseAuditTail(limit));
    }
    catch (e) {
        res.status(500).json({ error: String(e) });
    }
});
// Shared body of the two admin-only PUT routes below: resolve the
// caller's API key to a principal, require `authorizeAdmin` to accept it,
// then pass the request body to `applyUpdate(principal, body)`.
// Everything runs inside one try/catch so a rejected await is answered
// with a 500 (same shape as the GET routes above) rather than relying on
// how the framework treats a rejected handler promise.
const enterpriseAdminWrite = (applyUpdate) => async (req, res) => {
    try {
        const cred = resolveToken(extractToken(req.headers), loadCredentials());
        const principal = cred ? cred.name : null;
        const authz = await authorizeAdmin(principal);
        if (!authz.ok)
            return res.status(authz.status).json({ error: authz.error });
        const result = await applyUpdate(principal, req.body);
        if (!result.ok)
            return res.status(result.status).json({ error: result.error });
        res.json({ ok: true });
    }
    catch (e) {
        res.status(500).json({ error: String(e) });
    }
};
// Phase 2: edit the RBAC policy. NOT on the open local plane — requires
// an API-key principal the CURRENT policy grants `enterprise:admin`.
app.put("/api/enterprise/policy", enterpriseAdminWrite(updateRbacPolicy));
// Phase 3: edit the product catalog. Same admin model as the RBAC write.
app.put("/api/enterprise/catalog", enterpriseAdminWrite(updateCatalog));
|
|
339
455
|
// Server-side proxy of the connector hub catalog (so the browser
|
|
340
456
|
// needn't reach the hub directly — works behind a proxy / against a
|
|
341
457
|
// mirror via HUB_CATALOG_URL). Installed status merged in.
|
|
@@ -619,6 +735,36 @@ async function main() {
|
|
|
619
735
|
res.status(500).json({ error: "Failed to get health data" });
|
|
620
736
|
}
|
|
621
737
|
});
|
|
738
|
+
// --- Topology API ---
// Returns the union of topology snapshots across all topology-capable
// connectors (today only "kubernetes"). One JSON document so the UI can
// render summary + grouped views without N round-trips.
// A connector whose snapshot call fails is logged and left out of the
// response (it is then absent from `sources`), so one broken backend
// does not turn the whole document into a 500.
app.get("/api/topology", async (_req, res) => {
    try {
        const sources = [];
        const allResources = [];
        const allEdges = [];
        for (const c of registry.getAll()) {
            if (!isTopologyProvider(c))
                continue;
            let snap;
            try {
                snap = await c.getTopologySnapshot();
            }
            catch (err) {
                console.error(`topology snapshot failed for connector '${c.name ?? c.type}':`, err);
                continue;
            }
            sources.push({
                source: snap.source,
                type: c.type,
                revision: snap.revision,
                resources: snap.resources.length,
                edges: snap.edges.length,
            });
            // Append element by element: `push(...array)` passes every
            // element as a call argument, which is bounded by the
            // engine's argument limit for very large snapshots.
            for (const resource of snap.resources)
                allResources.push(resource);
            for (const edge of snap.edges)
                allEdges.push(edge);
        }
        res.json({ sources, resources: allResources, edges: allEdges });
    }
    catch (err) {
        console.error("topology endpoint failed:", err);
        res.status(500).json({ error: "Failed to read topology" });
    }
});
|
|
622
768
|
// --- Settings API ---
|
|
623
769
|
// Get general settings
|
|
624
770
|
app.get("/api/settings", (_req, res) => {
|
package/dist/sdk/index.d.ts
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
export type { ObservabilityConnector } from "../connectors/interface.js";
|
|
2
2
|
export { manifestSchema } from "./manifest-schema.js";
|
|
3
3
|
export type { ValidatedConnectorManifest } from "./manifest-schema.js";
|
|
4
|
-
export type { SignalType, SourceConfig, SourceAuth, SourceTls, ConnectorHealth, ServiceInfo, MetricInfo, MetricQuery, MetricResult, MetricSummary, DataPoint, LogQuery, LogResult, LogEntry, LogSummary, MetricDefinition, } from "../types.js";
|
|
4
|
+
export type { SignalType, SourceConfig, SourceAuth, SourceTls, ConnectorHealth, ServiceInfo, MetricInfo, MetricQuery, MetricResult, MetricSummary, DataPoint, LogQuery, LogResult, LogEntry, LogSummary, MetricDefinition, Resource, Edge, TopologySnapshot, TopologyChangeEvent, TopologyChangeListener, } from "../types.js";
|
|
5
5
|
/**
|
|
6
6
|
* Manifest shape declared in a plugin's `manifest.json`. The server
|
|
7
7
|
* validates plugin manifests against this at load time.
|
|
@@ -18,7 +18,7 @@ export interface ConnectorManifest {
|
|
|
18
18
|
/** Semver of this connector build. */
|
|
19
19
|
version: string;
|
|
20
20
|
description: string;
|
|
21
|
-
signalTypes: Array<"metrics" | "logs" | "traces">;
|
|
21
|
+
signalTypes: Array<"metrics" | "logs" | "traces" | "topology">;
|
|
22
22
|
homepage?: string;
|
|
23
23
|
license?: string;
|
|
24
24
|
logo?: string;
|
|
@@ -15,7 +15,7 @@ export const manifestSchema = z.object({
|
|
|
15
15
|
message: "version must be semver",
|
|
16
16
|
}),
|
|
17
17
|
description: z.string().min(1),
|
|
18
|
-
signalTypes: z.array(z.enum(["metrics", "logs", "traces"])).min(1),
|
|
18
|
+
signalTypes: z.array(z.enum(["metrics", "logs", "traces", "topology"])).min(1),
|
|
19
19
|
homepage: z.string().url().optional(),
|
|
20
20
|
license: z.string().optional(),
|
|
21
21
|
logo: z.string().optional(),
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { defaultContext } from "../context.js";
|
|
2
2
|
import { calculateHealthScore } from "../analysis/health.js";
|
|
3
|
-
import {
|
|
3
|
+
import { detectRobustAnomaly, classifyMetric } from "../analysis/anomaly.js";
|
|
4
4
|
import { sanitizeForLog } from "../util/sanitize.js";
|
|
5
5
|
let _thresholds = null;
|
|
6
6
|
export function setHealthThresholds(t) {
|
|
@@ -94,17 +94,20 @@ export async function getServiceHealthHandler(registry, args, _ctx = defaultCont
|
|
|
94
94
|
};
|
|
95
95
|
}
|
|
96
96
|
function checkAnomaly(values, metric, service, source, anomalies) {
|
|
97
|
-
|
|
97
|
+
// Robust, metric-type-aware detector (same path as detect_anomalies):
|
|
98
|
+
// latency/error_rate/saturation are one-sided, so a *decrease* (e.g.
|
|
99
|
+
// latency dropping) is correctly NOT flagged as an anomaly.
|
|
100
|
+
const result = detectRobustAnomaly(values, { metricKind: classifyMetric(metric) });
|
|
98
101
|
if (result.isAnomaly) {
|
|
99
|
-
const deviationPercent = result.
|
|
102
|
+
const deviationPercent = result.baselineValue === 0
|
|
100
103
|
? 100
|
|
101
|
-
: Math.round(((result.
|
|
104
|
+
: Math.round(((result.recentValue - result.baselineValue) / result.baselineValue) * 100);
|
|
102
105
|
anomalies.push({
|
|
103
106
|
metric,
|
|
104
|
-
severity: Math.abs(result.
|
|
105
|
-
description: `${metric}
|
|
106
|
-
currentValue: result.
|
|
107
|
-
baselineValue: result.
|
|
107
|
+
severity: Math.abs(result.score) >= 6 ? "high" : Math.abs(result.score) >= 4 ? "medium" : "low",
|
|
108
|
+
description: `${metric}: ${result.reason}`,
|
|
109
|
+
currentValue: result.recentValue,
|
|
110
|
+
baselineValue: result.baselineValue,
|
|
108
111
|
deviationPercent,
|
|
109
112
|
source,
|
|
110
113
|
service,
|
|
@@ -4,6 +4,7 @@ import { ConnectorRegistry } from "../connectors/registry.js";
|
|
|
4
4
|
import { listSourcesHandler } from "./list-sources.js";
|
|
5
5
|
import { listServicesHandler } from "./list-services.js";
|
|
6
6
|
import { detectAnomaliesHandler } from "./detect-anomalies.js";
|
|
7
|
+
import { getServiceHealthHandler } from "./get-service-health.js";
|
|
7
8
|
// --- Mock Connector ---
|
|
8
9
|
function createMockConnector(overrides) {
|
|
9
10
|
return {
|
|
@@ -209,3 +210,33 @@ describe("detectAnomaliesHandler — A5 memory/OOM coverage", () => {
|
|
|
209
210
|
assert.equal(data.anomalies.length, 0);
|
|
210
211
|
});
|
|
211
212
|
});
|
|
213
|
+
// Regression test: a latency series that only goes DOWN must not show up
// in the `anomalies` list returned by getServiceHealthHandler.
describe("getServiceHealthHandler — one-sided latency (regression)", () => {
    // Build a metric result around `vals`: one point every 9 s, the last
    // one 9 s before "now".
    // NOTE(review): `average` is set to the FIRST sample, not the mean —
    // presumably a stand-in baseline for a falling series; confirm.
    const series = (vals) => ({
        source: "prom1", service: "payment-service", metric: "x", unit: "",
        values: vals.map((v, i) => ({ timestamp: new Date(Date.now() - (vals.length - i) * 9000).toISOString(), value: v })),
        summary: { current: vals[vals.length - 1], average: vals[0], min: Math.min(...vals), max: Math.max(...vals), trend: "falling" },
    });
    it("a DECREASING latency_p99 is NOT flagged as an anomaly", async () => {
        const reg = new ConnectorRegistry();
        const mock = {
            connect: async () => { }, disconnect: async () => { },
            healthCheck: async () => ({ status: "up", latencyMs: 1 }),
            getDefaultMetrics: () => [], getMetrics: () => [],
            listServices: async () => [{ name: "payment-service", source: "prom1", signalType: "metrics" }],
            name: "prom1", type: "prometheus", signalType: "metrics",
            queryMetrics: async ({ metric }) => {
                if (metric === "latency_p99")
                    return series(Array.from({ length: 30 }, (_, i) => 1.0 - i * 0.025)); // 1.0 → 0.275, strictly down
                if (metric === "cpu")
                    // Flat at 20. The previous `Math.random() < 0 ? 1 : 0`
                    // term was always 0, so the fixture was already
                    // deterministic; say so directly.
                    return series(Array.from({ length: 30 }, () => 20));
                return series(Array.from({ length: 30 }, () => 0.01)); // error_rate flat
            },
        };
        // Register the mock directly on the registry's internal maps.
        reg.connectors.set("prom1", mock);
        reg.sourceConfigs.set("prom1", { name: "prom1", type: "prometheus", url: "http://m", enabled: true });
        const result = await getServiceHealthHandler(reg, { service: "payment-service" });
        const data = JSON.parse(result.content[0].text);
        const latAnom = (data.anomalies || []).find((a) => a.metric === "latency_p99");
        assert.equal(latAnom, undefined, `latency dropping must not be an anomaly, got: ${JSON.stringify(latAnom)}`);
    });
});
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
import type { ConnectorRegistry } from "../connectors/registry.js";
|
|
2
|
+
import type { Resource, Edge } from "../types.js";
|
|
3
|
+
import { type RequestContext } from "../context.js";
|
|
4
|
+
interface AggregatedTopology {
|
|
5
|
+
sources: Array<{
|
|
6
|
+
source: string;
|
|
7
|
+
type: string;
|
|
8
|
+
revision: number;
|
|
9
|
+
resources: number;
|
|
10
|
+
edges: number;
|
|
11
|
+
}>;
|
|
12
|
+
resources: Resource[];
|
|
13
|
+
edges: Edge[];
|
|
14
|
+
}
|
|
15
|
+
export declare function aggregateTopology(registry: ConnectorRegistry): Promise<AggregatedTopology>;
|
|
16
|
+
/**
|
|
17
|
+
* Resolve a caller-supplied identifier to a Resource. Accepts:
|
|
18
|
+
* - exact canonical id (e.g. "k8s:pod:default/checkout-7f89d")
|
|
19
|
+
* - exact resource name (e.g. "checkout-7f89d")
|
|
20
|
+
* - case-insensitive substring of name or id (only used if uniquely matching)
|
|
21
|
+
*
|
|
22
|
+
* Stays generic — no knowledge of kind-specific id grammars.
|
|
23
|
+
*/
|
|
24
|
+
export declare function resolveResource(query: string, resources: Resource[]): Resource | {
|
|
25
|
+
error: string;
|
|
26
|
+
candidates?: string[];
|
|
27
|
+
};
|
|
28
|
+
export declare const getTopologyDefinition: {
|
|
29
|
+
name: "get_topology";
|
|
30
|
+
description: string;
|
|
31
|
+
};
|
|
32
|
+
export interface GetTopologyArgs {
|
|
33
|
+
source?: string;
|
|
34
|
+
kind?: string;
|
|
35
|
+
scope?: string;
|
|
36
|
+
limit?: number;
|
|
37
|
+
}
|
|
38
|
+
export declare function getTopologyHandler(registry: ConnectorRegistry, args?: GetTopologyArgs, _ctx?: RequestContext): Promise<{
|
|
39
|
+
content: {
|
|
40
|
+
type: "text";
|
|
41
|
+
text: string;
|
|
42
|
+
}[];
|
|
43
|
+
}>;
|
|
44
|
+
export declare const getBlastRadiusDefinition: {
|
|
45
|
+
name: "get_blast_radius";
|
|
46
|
+
description: string;
|
|
47
|
+
};
|
|
48
|
+
export interface GetBlastRadiusArgs {
|
|
49
|
+
resource: string;
|
|
50
|
+
}
|
|
51
|
+
export declare function getBlastRadiusHandler(registry: ConnectorRegistry, args: GetBlastRadiusArgs, _ctx?: RequestContext): Promise<{
|
|
52
|
+
isError: boolean;
|
|
53
|
+
content: {
|
|
54
|
+
type: "text";
|
|
55
|
+
text: string;
|
|
56
|
+
}[];
|
|
57
|
+
} | {
|
|
58
|
+
content: {
|
|
59
|
+
type: "text";
|
|
60
|
+
text: string;
|
|
61
|
+
}[];
|
|
62
|
+
isError?: undefined;
|
|
63
|
+
}>;
|
|
64
|
+
export {};
|
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
// MCP tools that expose the infrastructure topology graph to agents.
|
|
2
|
+
//
|
|
3
|
+
// Two tools live here:
|
|
4
|
+
// - `get_topology` — returns the merged resource/edge graph across
|
|
5
|
+
// every topology-capable connector, optionally
|
|
6
|
+
// filtered by source/kind/scope. Useful as a
|
|
7
|
+
// starting point for any cross-cutting question.
|
|
8
|
+
// - `get_blast_radius` — given a resource, returns who else is co-tenant
|
|
9
|
+
// on the same host(s). The canonical "if this
|
|
10
|
+
// host fails, who else fails?" question.
|
|
11
|
+
//
|
|
12
|
+
// Both stay generic — they pivot on the `RUNS_ON` and `OWNED_BY` relations
|
|
13
|
+
// rather than any specific kind. Adding a vCenter/NetBox/AWS topology
|
|
14
|
+
// connector later requires zero changes here.
|
|
15
|
+
import { isTopologyProvider } from "../connectors/interface.js";
|
|
16
|
+
import { defaultContext } from "../context.js";
|
|
17
|
+
/**
 * Merge the topology snapshots of every topology-capable connector in
 * `registry` into a single graph.
 *
 * Connectors are visited in `registry.getAll()` order and awaited one at
 * a time. Best effort per connector: if fetching or reading a snapshot
 * throws, that connector is logged to stderr and contributes NOTHING to
 * the result (no summary entry, no resources, no edges) — the remaining
 * connectors are still merged.
 *
 * @param registry Object exposing `getAll()`, which yields the connectors.
 * @returns {Promise<{sources: object[], resources: object[], edges: object[]}>}
 *   `sources` holds one summary per merged connector
 *   (`source`, `type`, `revision`, and the `resources`/`edges` counts).
 */
export async function aggregateTopology(registry) {
    const sources = [];
    const resources = [];
    const edges = [];
    for (const connector of registry.getAll()) {
        if (!isTopologyProvider(connector))
            continue;
        try {
            const snap = await connector.getTopologySnapshot();
            // Do everything that can throw BEFORE touching the shared
            // accumulators, so a malformed snapshot is dropped as a whole
            // instead of leaving its summary/resources behind without edges.
            const summary = {
                source: snap.source,
                type: connector.type,
                revision: snap.revision,
                resources: snap.resources.length,
                edges: snap.edges.length,
            };
            const snapResources = [...snap.resources];
            const snapEdges = [...snap.edges];
            sources.push(summary);
            // Element-wise append: `push(...array)` passes every element as
            // a call argument and is bounded by the engine's argument limit.
            for (const resource of snapResources)
                resources.push(resource);
            for (const edge of snapEdges)
                edges.push(edge);
        }
        catch (err) {
            // A misbehaving connector must not poison the agent's view of
            // the graph — but leave a trace so the gap can be diagnosed.
            console.error(`topology snapshot failed for connector '${connector.name ?? connector.type}':`, err);
        }
    }
    return { sources, resources, edges };
}
|
|
42
|
+
/**
 * Resolve a caller-supplied identifier to a Resource. Tried in order:
 *   1. exact canonical id (e.g. "k8s:pod:default/checkout-7f89d")
 *   2. exact resource name (e.g. "checkout-7f89d"); several resources
 *      sharing that name is reported as ambiguous
 *   3. case-insensitive substring of name OR id (only used if uniquely
 *      matching; otherwise up to 25 candidate ids are returned)
 *
 * Stays generic — no knowledge of kind-specific id grammars.
 *
 * Never throws for bad input: a missing or non-string query, and
 * resources lacking a string `name`/`id`, yield the structured error /
 * are simply not matched on the missing field.
 *
 * @param query     Identifier supplied by the caller.
 * @param resources Resources to search.
 * @returns The matching Resource, or `{ error, candidates? }`.
 */
export function resolveResource(query, resources) {
    if (typeof query !== "string" || query === "")
        return { error: "Missing resource query" };
    const exactId = resources.find((r) => r.id === query);
    if (exactId)
        return exactId;
    const exactName = resources.filter((r) => r.name === query);
    if (exactName.length === 1)
        return exactName[0];
    if (exactName.length > 1) {
        return {
            error: `Name '${query}' is ambiguous across ${exactName.length} resources; pass the full id`,
            candidates: exactName.map((r) => r.id),
        };
    }
    const needle = query.toLowerCase();
    // A field that is not a string can never contain the needle.
    const contains = (field) => typeof field === "string" && field.toLowerCase().includes(needle);
    const fuzzy = resources.filter((r) => contains(r.name) || contains(r.id));
    if (fuzzy.length === 1)
        return fuzzy[0];
    if (fuzzy.length > 1) {
        return {
            error: `Query '${query}' matched ${fuzzy.length} resources; pass the full id`,
            // Cap the list so an over-broad query does not flood the reply.
            candidates: fuzzy.slice(0, 25).map((r) => r.id),
        };
    }
    return { error: `No resource found matching '${query}'` };
}
|
|
77
|
+
// --- get_topology ------------------------------------------------------
|
|
78
|
+
// Exported name/description pair for the `get_topology` tool.
export const getTopologyDefinition = {
|
|
79
|
+
name: "get_topology",
|
|
80
|
+
description: "Return the infrastructure topology graph as Resources and Edges. Use this when an agent needs to reason about which workload runs where, who owns whom, or which scope (namespace/project/folder) a resource belongs to.",
|
|
81
|
+
};
|
|
82
|
+
/**
 * `get_topology` tool handler: return the merged topology graph,
 * optionally narrowed by source, kind and scope, and capped in size.
 *
 * Filters are all optional and compose conjunctively:
 *   - `source`: keep resources and edges whose `source` equals it
 *   - `kind`:   keep resources whose `kind` equals it
 *   - `scope`:  keep resources that have an `IN_NAMESPACE` edge to a
 *               resource whose id OR name equals it
 *   - `limit`:  maximum resources returned; default 500, clamped to
 *               1..5000. A non-numeric value falls back to the default.
 * Only edges whose both endpoints are in the returned resources are kept.
 *
 * @param registry Passed through to `aggregateTopology`.
 * @param args     `{ source?, kind?, scope?, limit? }`.
 * @param _ctx     Request context; not used by this handler.
 * @returns `{ content: [{ type: "text", text }] }` where `text` is the
 *   pretty-printed JSON of `{ sources, resources, edges, total, truncated }`.
 *   `total` holds the counts of the whole merged graph BEFORE filtering.
 */
export async function getTopologyHandler(registry, args = {}, _ctx = defaultContext()) {
    const agg = await aggregateTopology(registry);
    // Tolerate an explicit `null` as well as a missing argument object.
    const { source, kind, scope, limit: rawLimit } = args ?? {};
    let resources = agg.resources;
    let edges = agg.edges;
    if (source) {
        resources = resources.filter((r) => r.source === source);
        edges = edges.filter((e) => e.source === source);
    }
    if (kind) {
        resources = resources.filter((r) => r.kind === kind);
    }
    if (scope) {
        // Index resources by id once instead of scanning the whole list
        // for every edge. First occurrence wins, like `Array#find`.
        const byId = new Map();
        for (const r of agg.resources) {
            if (!byId.has(r.id))
                byId.set(r.id, r);
        }
        // Match either by scope resource id (e.g. "k8s:namespace:default")
        // or by name (e.g. "default").
        const inScope = new Set();
        for (const e of agg.edges) {
            if (e.relation !== "IN_NAMESPACE")
                continue;
            const target = byId.get(e.to);
            if (target && (target.id === scope || target.name === scope))
                inScope.add(e.from);
        }
        resources = resources.filter((r) => inScope.has(r.id));
    }
    // Soft truncation so an agent can't accidentally pull a 10k-node graph
    // into context — defaults are generous but capped. A value that is not
    // a finite number must not disable the cap, so it uses the default.
    const requested = rawLimit == null ? 500 : Number(rawLimit);
    const limit = Math.min(Math.max(Number.isFinite(requested) ? Math.trunc(requested) : 500, 1), 5000);
    const truncated = resources.length > limit;
    if (truncated) {
        resources = resources.slice(0, limit);
    }
    // Edges must reference resources that survived filtering AND
    // truncation; one pass over the final set covers both.
    const keepIds = new Set(resources.map((r) => r.id));
    edges = edges.filter((e) => keepIds.has(e.from) && keepIds.has(e.to));
    const payload = {
        sources: agg.sources,
        resources,
        edges,
        total: { resources: agg.resources.length, edges: agg.edges.length },
        truncated,
    };
    return {
        content: [{ type: "text", text: JSON.stringify(payload, null, 2) }],
    };
}
|
|
131
|
+
// --- get_blast_radius --------------------------------------------------
|
|
132
|
+
/**
 * Tool descriptor for `get_blast_radius`.
 *
 * Carries only the tool's public name and the human/agent-facing description;
 * the behaviour itself lives in `getBlastRadiusHandler`.
 */
export const getBlastRadiusDefinition = {
    name: "get_blast_radius",
    description: "Given a resource, return the impact set if its underlying host(s) fail. Pivots on the generic RUNS_ON relation, so it works for pod→node, vm→hypervisor, container→host alike. Use this for cross-cutting RCA when several services degrade together.",
};
|
|
136
|
+
/**
 * Handler for the `get_blast_radius` tool.
 *
 * Resolves `args.resource` against the aggregated topology, works out which
 * host the resource sits on via RUNS_ON edges (or treats the resource itself
 * as the host when something runs on it), and reports every tenant of that
 * host grouped by the top of its OWNED_BY chain.
 *
 * @param registry Connector registry, passed straight to `aggregateTopology`.
 * @param {object} args Tool arguments.
 * @param args.resource Query handed to `resolveResource` to pick the target.
 * @param _ctx Not used by this handler.
 * @returns {Promise<object>} A result with one JSON text item. When the
 *   resource cannot be resolved the result also has `isError: true` and the
 *   JSON body is the error object returned by `resolveResource`.
 */
export async function getBlastRadiusHandler(registry, args, _ctx = defaultContext()) {
    const asTextResult = (body) => ({
        content: [{ type: "text", text: JSON.stringify(body, null, 2) }],
    });
    const brief = (res) => ({ id: res.id, name: res.name, kind: res.kind });

    const topology = await aggregateTopology(registry);
    const found = resolveResource(args.resource, topology.resources);
    if ("error" in found) {
        return { isError: true, ...asTextResult(found) };
    }

    // Build the lookup tables in a single pass each.
    const resourceById = new Map();
    for (const res of topology.resources) {
        resourceById.set(res.id, res);
    }
    const hostOfTenant = new Map(); // tenant id -> host id (last RUNS_ON edge wins)
    const tenantsOfHost = new Map(); // host id -> Set of tenant ids
    const ownerOf = new Map(); // resource id -> owner id (first OWNED_BY edge wins)
    for (const edge of topology.edges) {
        switch (edge.relation) {
            case "RUNS_ON": {
                hostOfTenant.set(edge.from, edge.to);
                let tenants = tenantsOfHost.get(edge.to);
                if (!tenants) {
                    tenants = new Set();
                    tenantsOfHost.set(edge.to, tenants);
                }
                tenants.add(edge.from);
                break;
            }
            case "OWNED_BY":
                if (!ownerOf.has(edge.from)) {
                    ownerOf.set(edge.from, edge.to);
                }
                break;
            default:
                break;
        }
    }

    // Follow OWNED_BY upwards; the hop budget stops a cyclic chain from looping forever.
    const findOwnershipRoot = (startId) => {
        let current = startId;
        let hops = 0;
        while (hops < 16) {
            const owner = ownerOf.get(current);
            if (!owner || owner === current) {
                break;
            }
            current = owner;
            hops += 1;
        }
        return current;
    };

    // A resource with incoming RUNS_ON edges is its own host; otherwise use
    // the host it runs on, if any.
    let hostIds = [];
    if (tenantsOfHost.has(found.id)) {
        hostIds = [found.id];
    }
    else if (hostOfTenant.has(found.id)) {
        hostIds = [hostOfTenant.get(found.id)];
    }
    if (hostIds.length === 0) {
        return asTextResult({
            target: brief(found),
            hosts: [],
            note: "This resource has no RUNS_ON edges in the current topology — either it is itself a top-level host with no tenants yet, or its connector does not emit RUNS_ON.",
        });
    }

    // Summarise one host: its tenants bucketed by ownership root, largest bucket first.
    const describeHost = (hostId) => {
        const host = resourceById.get(hostId);
        if (!host) {
            return null;
        }
        const groups = new Map();
        for (const tenantId of tenantsOfHost.get(hostId) || []) {
            const tenant = resourceById.get(tenantId);
            if (!tenant) {
                continue;
            }
            const rootId = findOwnershipRoot(tenantId);
            let group = groups.get(rootId);
            if (!group) {
                const root = resourceById.get(rootId);
                group = {
                    ownershipRoot: rootId,
                    ownershipRootName: root ? root.name : rootId,
                    ownershipRootKind: root ? root.kind : "?",
                    members: [],
                };
                groups.set(rootId, group);
            }
            group.members.push(brief(tenant));
        }
        const coTenants = [...groups.values()];
        coTenants.sort((left, right) => right.members.length - left.members.length);
        return {
            host: brief(host),
            ownershipRoots: coTenants.length,
            coTenants,
        };
    };
    const perHost = hostIds.map(describeHost).filter((entry) => entry !== null);

    // One-line recommendation: hosts carrying two or more ownership roots are
    // the ones whose failure would hit several services at once.
    const sharedCount = perHost.filter((entry) => entry.ownershipRoots > 1).length;
    let summary;
    if (sharedCount > 0) {
        summary = `${sharedCount} of ${perHost.length} host(s) carry ≥2 services — those hosts are blast-radius candidates if they fail.`;
    }
    else {
        summary = `No host carries more than one service besides the target — limited shared-host blast radius.`;
    }
    return asTextResult({
        target: brief(found),
        hosts: perHost,
        summary,
    });
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|