@thotischner/observability-mcp 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/sources.yaml +45 -0
- package/dist/analysis/anomaly.d.ts +24 -0
- package/dist/analysis/anomaly.js +50 -0
- package/dist/analysis/anomaly.test.d.ts +1 -0
- package/dist/analysis/anomaly.test.js +87 -0
- package/dist/analysis/correlator.d.ts +7 -0
- package/dist/analysis/correlator.js +31 -0
- package/dist/analysis/correlator.test.d.ts +1 -0
- package/dist/analysis/correlator.test.js +53 -0
- package/dist/analysis/health.d.ts +19 -0
- package/dist/analysis/health.js +34 -0
- package/dist/analysis/health.test.d.ts +1 -0
- package/dist/analysis/health.test.js +70 -0
- package/dist/config/loader.d.ts +5 -0
- package/dist/config/loader.js +81 -0
- package/dist/config/loader.test.d.ts +1 -0
- package/dist/config/loader.test.js +163 -0
- package/dist/connectors/interface.d.ts +17 -0
- package/dist/connectors/interface.js +1 -0
- package/dist/connectors/loki.d.ts +25 -0
- package/dist/connectors/loki.js +182 -0
- package/dist/connectors/loki.test.d.ts +1 -0
- package/dist/connectors/loki.test.js +111 -0
- package/dist/connectors/prometheus.d.ts +28 -0
- package/dist/connectors/prometheus.js +196 -0
- package/dist/connectors/prometheus.test.d.ts +1 -0
- package/dist/connectors/prometheus.test.js +103 -0
- package/dist/connectors/registry.d.ts +18 -0
- package/dist/connectors/registry.js +90 -0
- package/dist/connectors/registry.test.d.ts +1 -0
- package/dist/connectors/registry.test.js +93 -0
- package/dist/connectors/tls.d.ts +7 -0
- package/dist/connectors/tls.js +25 -0
- package/dist/connectors/tls.test.d.ts +1 -0
- package/dist/connectors/tls.test.js +99 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +421 -0
- package/dist/tools/detect-anomalies.d.ts +33 -0
- package/dist/tools/detect-anomalies.js +137 -0
- package/dist/tools/get-service-health.d.ts +25 -0
- package/dist/tools/get-service-health.js +111 -0
- package/dist/tools/handlers.test.d.ts +1 -0
- package/dist/tools/handlers.test.js +138 -0
- package/dist/tools/list-services.d.ts +22 -0
- package/dist/tools/list-services.js +57 -0
- package/dist/tools/list-sources.d.ts +15 -0
- package/dist/tools/list-sources.js +27 -0
- package/dist/tools/query-logs.d.ts +49 -0
- package/dist/tools/query-logs.js +93 -0
- package/dist/tools/query-metrics.d.ts +44 -0
- package/dist/tools/query-metrics.js +91 -0
- package/dist/tools/validation.d.ts +17 -0
- package/dist/tools/validation.js +45 -0
- package/dist/tools/validation.test.d.ts +1 -0
- package/dist/tools/validation.test.js +84 -0
- package/dist/types.d.ts +171 -0
- package/dist/types.js +1 -0
- package/dist/ui/index.html +675 -0
- package/package.json +35 -0
package/dist/index.js
ADDED
|
@@ -0,0 +1,421 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import express from "express";
|
|
3
|
+
import { randomUUID } from "node:crypto";
|
|
4
|
+
import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
|
|
5
|
+
import { StreamableHTTPServerTransport } from "@modelcontextprotocol/sdk/server/streamableHttp.js";
|
|
6
|
+
import { z } from "zod";
|
|
7
|
+
import { loadConfig, saveConfig, DEFAULT_HEALTH_THRESHOLDS, DEFAULT_SETTINGS } from "./config/loader.js";
|
|
8
|
+
import { ConnectorRegistry, getSupportedTypes } from "./connectors/registry.js";
|
|
9
|
+
import { listSourcesHandler } from "./tools/list-sources.js";
|
|
10
|
+
import { listServicesHandler } from "./tools/list-services.js";
|
|
11
|
+
import { queryMetricsHandler } from "./tools/query-metrics.js";
|
|
12
|
+
import { queryLogsHandler } from "./tools/query-logs.js";
|
|
13
|
+
import { getServiceHealthHandler, setHealthThresholds } from "./tools/get-service-health.js";
|
|
14
|
+
import { detectAnomaliesHandler } from "./tools/detect-anomalies.js";
|
|
15
|
+
import { fileURLToPath } from "node:url";
|
|
16
|
+
import { dirname, join } from "node:path";
|
|
17
|
+
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
18
|
+
/**
 * Push the runtime-tunable parts of a configuration object into the modules
 * that consume them. At present that is only the health thresholds.
 *
 * @param config - Configuration object; only `config.healthThresholds` is read.
 * @param registry - Connector registry. Accepted but not used by this function.
 */
function applyConfigToRuntime(config, registry) {
    const { healthThresholds } = config;
    setHealthThresholds(healthThresholds);
}
|
|
21
|
+
/** Build a dynamic description of available metrics from all connected sources */
|
|
22
|
+
function getAvailableMetricNames(registry) {
|
|
23
|
+
const allMetrics = new Map(); // name → description
|
|
24
|
+
for (const connector of registry.getBySignal("metrics")) {
|
|
25
|
+
for (const m of connector.getMetrics()) {
|
|
26
|
+
if (!allMetrics.has(m.name)) {
|
|
27
|
+
allMetrics.set(m.name, m.description || m.name);
|
|
28
|
+
}
|
|
29
|
+
}
|
|
30
|
+
}
|
|
31
|
+
if (allMetrics.size === 0)
|
|
32
|
+
return "No metrics sources configured.";
|
|
33
|
+
return Array.from(allMetrics.entries())
|
|
34
|
+
.map(([name, desc]) => `${name} (${desc})`)
|
|
35
|
+
.join(", ");
|
|
36
|
+
}
|
|
37
|
+
/**
 * Validate a user-supplied source URL before the server connects to it.
 *
 * Accepts only `http:` and `https:` URLs and rejects the well-known cloud
 * instance-metadata endpoints. This is a deny-list of obviously dangerous
 * targets, not a complete SSRF defence: private and loopback addresses are
 * still allowed.
 *
 * @param url - Candidate URL string.
 * @returns `null` when the URL is acceptable, otherwise a human-readable error message.
 */
function validateSourceUrl(url) {
    let parsed;
    try {
        parsed = new URL(url);
    }
    catch {
        return `Invalid URL: "${url}"`;
    }
    if (parsed.protocol !== "http:" && parsed.protocol !== "https:") {
        return `Invalid URL scheme "${parsed.protocol}". Only http and https are allowed.`;
    }
    // Block cloud metadata endpoints.
    // The URL parser already canonicalises numeric IPv4 spellings (decimal,
    // hex, octal) to dotted form. It keeps a trailing dot on DNS names, so
    // "metadata.google.internal." must be normalised here or it slips through.
    const host = parsed.hostname.toLowerCase().replace(/\.+$/, "");
    const blockedHosts = new Set([
        "169.254.169.254",
        "metadata.google.internal",
        "[fd00:ec2::254]", // IPv6 address of the EC2 instance metadata service
        "[::ffff:a9fe:a9fe]", // IPv4-mapped IPv6 form of 169.254.169.254
    ]);
    if (blockedHosts.has(host)) {
        return "Access to cloud metadata endpoints is not allowed.";
    }
    return null;
}
|
|
55
|
+
/**
 * Boot the observability MCP server.
 *
 * Loads the configuration, initialises the connector registry, registers the
 * MCP tools, and starts a single Express app that serves:
 *   - the static Web UI,
 *   - the JSON REST API used by that UI (`/api/...`),
 *   - the MCP Streamable HTTP endpoint (`/mcp`) with stateful sessions.
 *
 * @returns A promise that settles after `app.listen` has been called. It
 *          rejects if loading the config or initialising the registry throws.
 */
async function main() {
    let config = loadConfig();
    const registry = new ConnectorRegistry();
    await registry.initialize(config);
    applyConfigToRuntime(config, registry);
    const mcpServer = new McpServer({
        name: "observability-mcp",
        version: "1.0.0",
    });
    // --- Register tools with Zod schemas ---
    mcpServer.tool("list_sources", "List all configured observability backends and their connection status. Use this to discover what data sources are available.", {}, async () => listSourcesHandler(registry));
    mcpServer.tool("list_services", "List all monitored services discovered across all connected backends. Returns service names, their data sources, and signal types (metrics/logs).", { filter: z.string().optional().describe("Optional filter to match service names") }, async (args) => listServicesHandler(registry, args));
    // The metric lists below are computed once, here; sources added or changed
    // later through the REST API are not reflected in these tool descriptions.
    const metricsList = getAvailableMetricNames(registry);
    const metricNames = registry.getBySignal("metrics").flatMap(c => c.getMetrics().map(m => m.name));
    const uniqueNames = [...new Set(metricNames)];
    mcpServer.tool("query_metrics", `Query a specific metric for a service over a given timeframe. Returns time-series data with pre-computed summary statistics (current, average, min, max, trend). Available metrics: ${metricsList}`, {
        service: z.string().describe("Service name (e.g. 'api-gateway', 'payment-service')"),
        metric: z.string().describe(`Metric name. Available: ${uniqueNames.join(", ")}`),
        duration: z.string().optional().describe("Time range (e.g. '5m', '1h', '24h'). Default: '5m'"),
        source: z.string().optional().describe("Specific source name. If omitted, queries all metrics backends."),
    }, async (args) => queryMetricsHandler(registry, args));
    mcpServer.tool("query_logs", "Query logs for a service over a given timeframe. Returns log entries with a summary including error/warning counts and top error patterns.", {
        service: z.string().describe("Service name (e.g. 'payment-service')"),
        query: z.string().optional().describe("Optional search query to filter log messages (regex supported)"),
        duration: z.string().optional().describe("Time range (e.g. '5m', '1h', '24h'). Default: '5m'"),
        level: z.string().optional().describe("Filter by log level: 'error', 'warn', 'info', 'debug'"),
        limit: z.number().optional().describe("Maximum log entries to return. Default: 100"),
    }, async (args) => queryLogsHandler(registry, args));
    mcpServer.tool("get_service_health", "Get an aggregated health overview for a service combining metrics AND logs. Returns health score (0-100), status (healthy/degraded/critical), key metrics, log error summary, anomalies, and cross-signal correlations.", {
        service: z.string().describe("Service name to check health for"),
    }, async (args) => getServiceHealthHandler(registry, args));
    mcpServer.tool("detect_anomalies", "Scan for anomalies across all monitored services (or a specific one). Uses z-score analysis on metrics, checks log error spikes, and correlates signals. Returns anomalies with severity ratings.", {
        service: z.string().optional().describe("Specific service to scan. If omitted, scans all."),
        duration: z.string().optional().describe("Time range to analyze (e.g. '5m', '15m', '1h'). Default: '10m'"),
        sensitivity: z.enum(["low", "medium", "high"]).optional().describe("Detection sensitivity: low (>3σ), medium (>2σ), high (>1.5σ). Default: 'medium'"),
    }, async (args) => detectAnomaliesHandler(registry, args));

    // --- Small route helpers ---
    /** True for a non-null, non-array object, i.e. a JSON object body. */
    const isPlainObject = (value) => typeof value === "object" && value !== null && !Array.isArray(value);
    /**
     * Wrap an async route handler so that a rejected promise is logged and
     * answered with a 500 instead of surfacing as an unhandled rejection.
     */
    const asyncRoute = (handler) => async (req, res) => {
        try {
            await handler(req, res);
        }
        catch (err) {
            console.error(`${req.method} ${req.path} failed:`, err);
            if (!res.headersSent) {
                res.status(500).json({ error: "Internal server error" });
            }
        }
    };
    /** Copy the registry's current source list into `config` and persist it. */
    const persistSources = () => {
        config = { ...config, sources: registry.getSourceConfigs() };
        saveConfig(config);
    };
    /** Look up a source config by name in the registry. */
    const findSource = (name) => registry.getSourceConfigs().find((s) => s.name === name);

    // --- HTTP server ---
    const app = express();
    app.use(express.json({ limit: "1mb" }));
    // Security headers
    app.use((_req, res, next) => {
        res.setHeader("X-Content-Type-Options", "nosniff");
        res.setHeader("X-Frame-Options", "DENY");
        res.setHeader("X-XSS-Protection", "1; mode=block");
        res.setHeader("Referrer-Policy", "strict-origin-when-cross-origin");
        next();
    });
    // Serve Web UI
    app.use(express.static(join(__dirname, "ui")));

    // --- API endpoints for Web UI ---
    // List sources with health status
    app.get("/api/sources", asyncRoute(async (_req, res) => {
        const health = await registry.healthCheckAll();
        const configs = registry.getSourceConfigs();
        const sources = configs.map((c) => {
            const connector = registry.getByName(c.name);
            return {
                name: c.name,
                type: c.type,
                url: c.url,
                enabled: c.enabled,
                // Only the auth type is exposed, never the credentials themselves.
                auth: c.auth ? { type: c.auth.type } : undefined,
                tls: c.tls || undefined,
                signalType: connector?.signalType || null,
                status: health[c.name]?.status || (c.enabled ? "down" : "disabled"),
                // `??` rather than `||` so a measured latency of 0 ms is not reported as null.
                latencyMs: health[c.name]?.latencyMs ?? null,
                message: health[c.name]?.message || null,
            };
        });
        res.json(sources);
    }));
    // Get supported connector types
    app.get("/api/source-types", (_req, res) => {
        res.json(getSupportedTypes());
    });
    // Add a new source
    app.post("/api/sources", asyncRoute(async (req, res) => {
        const { name, type, url, enabled, auth, tls } = isPlainObject(req.body) ? req.body : {};
        if (!name || !type || !url) {
            res.status(400).json({ error: "name, type, and url are required" });
            return;
        }
        const urlErr = validateSourceUrl(url);
        if (urlErr) {
            res.status(400).json({ error: urlErr });
            return;
        }
        if (findSource(name)) {
            res.status(409).json({ error: `Source "${name}" already exists` });
            return;
        }
        const source = { name, type, url, enabled: enabled !== false, auth, tls };
        await registry.addSource(source);
        persistSources();
        res.status(201).json({ ok: true, source });
    }));
    // Update an existing source
    app.put("/api/sources/:name", asyncRoute(async (req, res) => {
        const oldName = req.params.name;
        const { name, type, url, enabled, auth, tls } = isPlainObject(req.body) ? req.body : {};
        const existing = findSource(oldName);
        if (!existing) {
            res.status(404).json({ error: `Source "${oldName}" not found` });
            return;
        }
        const newName = name || oldName;
        // A rename must not collide with another source, otherwise two
        // configs end up sharing one name.
        if (newName !== oldName && findSource(newName)) {
            res.status(409).json({ error: `Source "${newName}" already exists` });
            return;
        }
        const newUrl = url || existing.url;
        if (url) {
            const urlErr = validateSourceUrl(newUrl);
            if (urlErr) {
                res.status(400).json({ error: urlErr });
                return;
            }
        }
        // Fields omitted from the request keep their current value.
        const source = {
            name: newName,
            type: type || existing.type,
            url: newUrl,
            enabled: enabled !== undefined ? enabled : existing.enabled,
            auth: auth !== undefined ? auth : existing.auth,
            tls: tls !== undefined ? tls : existing.tls,
        };
        await registry.updateSource(oldName, source);
        persistSources();
        res.json({ ok: true, source });
    }));
    // Delete a source
    app.delete("/api/sources/:name", asyncRoute(async (req, res) => {
        const name = req.params.name;
        if (!findSource(name)) {
            res.status(404).json({ error: `Source "${name}" not found` });
            return;
        }
        await registry.removeSource(name);
        persistSources();
        res.json({ ok: true });
    }));
    // Test a source connection (without saving)
    app.post("/api/sources/test", asyncRoute(async (req, res) => {
        const { name, type, url, enabled, auth, tls } = isPlainObject(req.body) ? req.body : {};
        if (!type || !url) {
            res.status(400).json({ error: "type and url are required" });
            return;
        }
        const urlErr = validateSourceUrl(url);
        if (urlErr) {
            res.status(400).json({ error: urlErr });
            return;
        }
        const result = await registry.testConnection({
            name: name || "test",
            type,
            url,
            enabled: enabled !== false,
            auth,
            tls,
        });
        res.json(result);
    }));
    // Toggle source enabled/disabled
    app.patch("/api/sources/:name/toggle", asyncRoute(async (req, res) => {
        const name = req.params.name;
        const existing = findSource(name);
        if (!existing) {
            res.status(404).json({ error: `Source "${name}" not found` });
            return;
        }
        const updated = { ...existing, enabled: !existing.enabled };
        await registry.updateSource(name, updated);
        persistSources();
        res.json({ ok: true, enabled: updated.enabled });
    }));
    /** Safely parse JSON from MCP tool result */
    function parseToolResult(result) {
        try {
            return JSON.parse(result.content[0]?.text || "{}");
        }
        catch {
            return { error: "Failed to parse tool result" };
        }
    }
    // List discovered services
    app.get("/api/services", async (_req, res) => {
        try {
            const result = await listServicesHandler(registry, {});
            res.json(parseToolResult(result));
        }
        catch {
            res.status(500).json({ error: "Failed to list services" });
        }
    });
    // Health endpoint for UI dashboard
    app.get("/api/health/:service", async (req, res) => {
        try {
            const result = await getServiceHealthHandler(registry, { service: req.params.service });
            res.json(parseToolResult(result));
        }
        catch {
            res.status(500).json({ error: "Failed to get service health" });
        }
    });
    // Health for all services
    app.get("/api/health", async (_req, res) => {
        try {
            const servicesResult = await listServicesHandler(registry, {});
            const parsed = parseToolResult(servicesResult);
            const services = parsed?.services || [];
            const health = {};
            for (const svc of services) {
                // One failing service must not hide the health of the others.
                try {
                    const result = await getServiceHealthHandler(registry, { service: svc.name });
                    health[svc.name] = parseToolResult(result);
                }
                catch {
                    health[svc.name] = { error: "failed to fetch health" };
                }
            }
            res.json(health);
        }
        catch {
            res.status(500).json({ error: "Failed to get health data" });
        }
    });

    // --- Settings API ---
    // Get general settings
    app.get("/api/settings", (_req, res) => {
        res.json(config.settings);
    });
    // Update general settings
    app.put("/api/settings", (req, res) => {
        // Spreading an array or a primitive would persist junk keys into the config.
        if (!isPlainObject(req.body)) {
            res.status(400).json({ error: "Request body must be a JSON object" });
            return;
        }
        config = { ...config, settings: { ...config.settings, ...req.body } };
        saveConfig(config);
        res.json({ ok: true, settings: config.settings });
    });
    // Get defaults (for reset buttons in UI)
    app.get("/api/settings/defaults", (_req, res) => {
        res.json({
            settings: DEFAULT_SETTINGS,
            healthThresholds: DEFAULT_HEALTH_THRESHOLDS,
        });
    });

    // --- Health Thresholds API ---
    app.get("/api/health-thresholds", (_req, res) => {
        res.json(config.healthThresholds);
    });
    app.put("/api/health-thresholds", (req, res) => {
        if (!isPlainObject(req.body)) {
            res.status(400).json({ error: "Request body must be a JSON object" });
            return;
        }
        config = { ...config, healthThresholds: { ...config.healthThresholds, ...req.body } };
        applyConfigToRuntime(config, registry);
        saveConfig(config);
        res.json({ ok: true, healthThresholds: config.healthThresholds });
    });

    // --- Per-Source Metrics API ---
    // Get metrics for a source (active metrics or defaults)
    app.get("/api/sources/:name/metrics", (req, res) => {
        const connector = registry.getByName(req.params.name);
        if (!connector) {
            res.status(404).json({ error: `Source "${req.params.name}" not found` });
            return;
        }
        res.json({
            metrics: connector.getMetrics(),
            defaults: connector.getDefaultMetrics(),
        });
    });
    // Update metrics for a source
    app.put("/api/sources/:name/metrics", asyncRoute(async (req, res) => {
        const name = req.params.name;
        const sourceIdx = config.sources.findIndex((s) => s.name === name);
        if (sourceIdx === -1) {
            res.status(404).json({ error: `Source "${name}" not found` });
            return;
        }
        const metrics = isPlainObject(req.body) ? req.body.metrics : undefined;
        if (metrics !== undefined && metrics !== null && !Array.isArray(metrics)) {
            res.status(400).json({ error: "metrics must be an array" });
            return;
        }
        config.sources[sourceIdx].metrics = metrics || [];
        // Reconnect to pick up new metrics
        await registry.updateSource(name, config.sources[sourceIdx]);
        saveConfig(config);
        res.json({ ok: true });
    }));
    // Reset a source's metrics to connector defaults
    app.delete("/api/sources/:name/metrics", asyncRoute(async (req, res) => {
        const name = req.params.name;
        const sourceIdx = config.sources.findIndex((s) => s.name === name);
        if (sourceIdx === -1) {
            res.status(404).json({ error: `Source "${name}" not found` });
            return;
        }
        delete config.sources[sourceIdx].metrics;
        await registry.updateSource(name, config.sources[sourceIdx]);
        saveConfig(config);
        res.json({ ok: true });
    }));

    // MCP Streamable HTTP transport — stateful sessions
    // NOTE(review): every session's transport is connected to the same
    // McpServer instance. The SDK's session example builds one server per
    // session; confirm that sharing a single instance is supported by the
    // SDK version in use.
    const transports = new Map(); // session id → transport
    const sessionLastActive = new Map(); // session id → time of last POST (ms since epoch)
    const SESSION_TTL_MS = 30 * 60 * 1000; // 30 min idle timeout
    // Clean up idle sessions every 5 minutes
    // NOTE(review): expired transports are only dropped from the maps, not
    // closed — confirm whether they should also be closed explicitly.
    setInterval(() => {
        const now = Date.now();
        for (const [sid, lastActive] of sessionLastActive) {
            if (now - lastActive > SESSION_TTL_MS) {
                transports.delete(sid);
                sessionLastActive.delete(sid);
                console.log(`Session ${sid} expired (idle)`);
            }
        }
    }, 5 * 60 * 1000);
    app.post("/mcp", asyncRoute(async (req, res) => {
        const sessionId = req.headers["mcp-session-id"];
        let transport;
        if (sessionId && transports.has(sessionId)) {
            transport = transports.get(sessionId);
        }
        else {
            transport = new StreamableHTTPServerTransport({
                sessionIdGenerator: () => randomUUID(),
            });
            transport.onclose = () => {
                // Clean up session on close, including its activity timestamp so
                // the idle sweeper does not later report it as expired.
                for (const [sid, t] of transports) {
                    if (t === transport) {
                        transports.delete(sid);
                        sessionLastActive.delete(sid);
                        break;
                    }
                }
            };
            await mcpServer.connect(transport);
        }
        await transport.handleRequest(req, res, req.body);
        // Store session after handling (sessionId is set during handleRequest)
        const sid = res.getHeader("mcp-session-id");
        if (sid) {
            if (!transports.has(sid))
                transports.set(sid, transport);
            sessionLastActive.set(sid, Date.now());
        }
    }));
    app.get("/mcp", asyncRoute(async (req, res) => {
        const sessionId = req.headers["mcp-session-id"];
        const transport = transports.get(sessionId);
        if (!transport) {
            res.status(400).json({ error: "No active session" });
            return;
        }
        await transport.handleRequest(req, res);
    }));
    app.delete("/mcp", asyncRoute(async (req, res) => {
        const sessionId = req.headers["mcp-session-id"];
        const transport = transports.get(sessionId);
        if (transport) {
            await transport.handleRequest(req, res);
            transports.delete(sessionId);
            sessionLastActive.delete(sessionId);
        }
        else {
            res.status(400).json({ error: "No active session" });
        }
    }));

    // An unparsable PORT falls back to the default instead of passing NaN to listen().
    const requestedPort = Number.parseInt(process.env.PORT || "3000", 10);
    if (Number.isNaN(requestedPort)) {
        console.warn(`Invalid PORT "${process.env.PORT}", falling back to 3000`);
    }
    const PORT = Number.isNaN(requestedPort) ? 3000 : requestedPort;
    app.listen(PORT, () => {
        console.log(`observability-mcp server running on port ${PORT}`);
        console.log(`  MCP endpoint: http://localhost:${PORT}/mcp`);
        console.log(`  Web UI:       http://localhost:${PORT}`);
        console.log(`  Connectors:   ${registry.getAll().map((c) => c.name).join(", ")}`);
    });
}
|
|
421
|
+
// Start the server. A startup failure is logged and also reflected in the
// exit code, so a supervisor does not mistake it for a clean shutdown.
main().catch((err) => {
    console.error(err);
    process.exitCode = 1;
});
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
import type { ConnectorRegistry } from "../connectors/registry.js";
|
|
2
|
+
/**
 * Static descriptor of the `detect_anomalies` tool: its name, a description,
 * and a JSON-Schema-shaped input schema with the optional `service`,
 * `duration` and `sensitivity` properties.
 */
export declare const detectAnomaliesDefinition: {
    name: "detect_anomalies";
    description: string;
    inputSchema: {
        type: "object";
        properties: {
            service: { type: string; description: string };
            duration: { type: string; description: string };
            sensitivity: { type: string; enum: string[]; description: string };
        };
    };
};
|
|
24
|
+
/**
 * Handler for the `detect_anomalies` tool.
 *
 * @param registry - Connector registry used to reach the metrics and log backends.
 * @param args - Optional scan filters: a single `service`, a `duration` window and a `sensitivity` level.
 * @returns A tool result whose text content items carry the serialised scan report.
 */
export declare function detectAnomaliesHandler(
    registry: ConnectorRegistry,
    args: { service?: string; duration?: string; sensitivity?: string },
): Promise<{ content: Array<{ type: "text"; text: string }> }>;
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
import { detectRecentAnomaly } from "../analysis/anomaly.js";
|
|
2
|
+
/**
 * Static descriptor of the `detect_anomalies` tool: name, human-readable
 * description, and a JSON-Schema-shaped input schema. All three input
 * properties are optional (the schema declares no `required` list).
 */
export const detectAnomaliesDefinition = {
    name: "detect_anomalies",
    description: "Scan for anomalies across all monitored services (or a specific service). Detects metric deviations using z-score analysis against recent baseline, checks log error spikes, and correlates signals across metrics and logs. Returns anomalies with severity ratings and cross-signal correlations.",
    inputSchema: {
        type: "object",
        properties: {
            service: { type: "string", description: "Specific service to scan. If omitted, scans all services." },
            duration: { type: "string", description: "Time range to analyze (e.g. '5m', '15m', '1h'). Default: '10m'" },
            sensitivity: { type: "string", enum: ["low", "medium", "high"], description: "Detection sensitivity. 'low' = major deviations only (>3σ), 'medium' = moderate (>2σ), 'high' = subtle changes (>1.5σ). Default: 'medium'" },
        },
    },
};
|
|
24
|
+
const SENSITIVITY_THRESHOLDS = {
|
|
25
|
+
low: 3.0,
|
|
26
|
+
medium: 2.0,
|
|
27
|
+
high: 1.5,
|
|
28
|
+
};
|
|
29
|
+
const KEY_METRICS = ["cpu", "error_rate", "latency_p99", "request_rate"];
|
|
30
|
+
export async function detectAnomaliesHandler(registry, args) {
|
|
31
|
+
const duration = args.duration || "10m";
|
|
32
|
+
const threshold = SENSITIVITY_THRESHOLDS[args.sensitivity || "medium"] || 2.0;
|
|
33
|
+
// Discover services to scan
|
|
34
|
+
const metricsConnectors = registry.getBySignal("metrics");
|
|
35
|
+
const logConnectors = registry.getBySignal("logs");
|
|
36
|
+
let serviceNames = [];
|
|
37
|
+
if (args.service) {
|
|
38
|
+
serviceNames = [args.service];
|
|
39
|
+
}
|
|
40
|
+
else {
|
|
41
|
+
for (const connector of metricsConnectors) {
|
|
42
|
+
const services = await connector.listServices();
|
|
43
|
+
for (const s of services) {
|
|
44
|
+
if (!serviceNames.includes(s.name))
|
|
45
|
+
serviceNames.push(s.name);
|
|
46
|
+
}
|
|
47
|
+
}
|
|
48
|
+
}
|
|
49
|
+
const allAnomalies = [];
|
|
50
|
+
const allCorrelations = [];
|
|
51
|
+
for (const serviceName of serviceNames) {
|
|
52
|
+
// Check metrics
|
|
53
|
+
for (const connector of metricsConnectors) {
|
|
54
|
+
if (!connector.queryMetrics)
|
|
55
|
+
continue;
|
|
56
|
+
for (const metric of KEY_METRICS) {
|
|
57
|
+
try {
|
|
58
|
+
const result = await connector.queryMetrics({ service: serviceName, metric, duration });
|
|
59
|
+
const values = result.values.map((v) => v.value);
|
|
60
|
+
const anomaly = detectRecentAnomaly(values, 5, threshold);
|
|
61
|
+
if (anomaly.isAnomaly) {
|
|
62
|
+
const deviationPercent = anomaly.baselineAvg === 0
|
|
63
|
+
? 100
|
|
64
|
+
: Math.round(((anomaly.recentAvg - anomaly.baselineAvg) / anomaly.baselineAvg) * 100);
|
|
65
|
+
allAnomalies.push({
|
|
66
|
+
metric,
|
|
67
|
+
severity: Math.abs(anomaly.zScore) >= 3 ? "high" : Math.abs(anomaly.zScore) >= 2 ? "medium" : "low",
|
|
68
|
+
description: `${metric} is ${anomaly.zScore.toFixed(1)}σ ${anomaly.zScore > 0 ? "above" : "below"} baseline (${anomaly.baselineAvg.toFixed(2)} → ${anomaly.recentAvg.toFixed(2)})`,
|
|
69
|
+
currentValue: anomaly.recentAvg,
|
|
70
|
+
baselineValue: anomaly.baselineAvg,
|
|
71
|
+
deviationPercent,
|
|
72
|
+
source: connector.name,
|
|
73
|
+
service: serviceName,
|
|
74
|
+
});
|
|
75
|
+
}
|
|
76
|
+
}
|
|
77
|
+
catch {
|
|
78
|
+
// Skip metrics that don't exist for this service
|
|
79
|
+
}
|
|
80
|
+
}
|
|
81
|
+
}
|
|
82
|
+
// Check logs for error spikes
|
|
83
|
+
for (const connector of logConnectors) {
|
|
84
|
+
if (!connector.queryLogs)
|
|
85
|
+
continue;
|
|
86
|
+
try {
|
|
87
|
+
const logs = await connector.queryLogs({ service: serviceName, duration, limit: 500 });
|
|
88
|
+
if (logs.summary.errorCount > 5) {
|
|
89
|
+
const errorRatio = logs.summary.total > 0
|
|
90
|
+
? logs.summary.errorCount / logs.summary.total
|
|
91
|
+
: 0;
|
|
92
|
+
if (errorRatio > 0.1) {
|
|
93
|
+
allAnomalies.push({
|
|
94
|
+
metric: "log_error_rate",
|
|
95
|
+
severity: errorRatio > 0.3 ? "high" : errorRatio > 0.15 ? "medium" : "low",
|
|
96
|
+
description: `${Math.round(errorRatio * 100)}% of logs are errors (${logs.summary.errorCount}/${logs.summary.total}). Top: ${logs.summary.topPatterns[0] || "N/A"}`,
|
|
97
|
+
currentValue: logs.summary.errorCount,
|
|
98
|
+
baselineValue: 0,
|
|
99
|
+
deviationPercent: 100,
|
|
100
|
+
source: connector.name,
|
|
101
|
+
service: serviceName,
|
|
102
|
+
});
|
|
103
|
+
}
|
|
104
|
+
}
|
|
105
|
+
}
|
|
106
|
+
catch {
|
|
107
|
+
// Skip if logs unavailable
|
|
108
|
+
}
|
|
109
|
+
}
|
|
110
|
+
}
|
|
111
|
+
// Cross-signal correlation
|
|
112
|
+
if (allAnomalies.length > 0) {
|
|
113
|
+
const servicesWithAnomalies = [...new Set(allAnomalies.map((a) => a.service))];
|
|
114
|
+
for (const svc of servicesWithAnomalies) {
|
|
115
|
+
const svcAnomalies = allAnomalies.filter((a) => a.service === svc);
|
|
116
|
+
const metricTypes = svcAnomalies.map((a) => a.metric).filter((m) => m !== "log_error_rate");
|
|
117
|
+
const hasLogAnomaly = svcAnomalies.some((a) => a.metric === "log_error_rate");
|
|
118
|
+
if (metricTypes.length > 0 && hasLogAnomaly) {
|
|
119
|
+
allCorrelations.push(`${svc}: metric anomalies (${metricTypes.join(", ")}) correlate with elevated log error rate`);
|
|
120
|
+
}
|
|
121
|
+
if (metricTypes.includes("cpu") && metricTypes.includes("latency_p99")) {
|
|
122
|
+
allCorrelations.push(`${svc}: CPU spike and latency increase detected simultaneously — possible resource saturation`);
|
|
123
|
+
}
|
|
124
|
+
}
|
|
125
|
+
}
|
|
126
|
+
const result = {
|
|
127
|
+
scannedServices: serviceNames.length,
|
|
128
|
+
anomalies: allAnomalies,
|
|
129
|
+
correlations: allCorrelations,
|
|
130
|
+
summary: allAnomalies.length === 0
|
|
131
|
+
? "All services healthy — no anomalies detected."
|
|
132
|
+
: `${allAnomalies.length} anomal${allAnomalies.length === 1 ? "y" : "ies"} detected across ${[...new Set(allAnomalies.map((a) => a.service))].length} service(s).`,
|
|
133
|
+
};
|
|
134
|
+
return {
|
|
135
|
+
content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
|
|
136
|
+
};
|
|
137
|
+
}
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
import type { ConnectorRegistry } from "../connectors/registry.js";
|
|
2
|
+
import type { HealthThresholds } from "../types.js";
|
|
3
|
+
/**
 * Install the health thresholds for this module.
 *
 * NOTE(review): the implementation is not part of this declaration file;
 * presumably the thresholds are then used by `getServiceHealthHandler` —
 * confirm against `get-service-health.js`.
 *
 * @param t - The thresholds to install.
 */
export declare function setHealthThresholds(t: HealthThresholds): void;
|
|
4
|
+
/**
 * Static descriptor of the `get_service_health` tool: its name, a
 * description, and a JSON-Schema-shaped input schema with a `service`
 * property and a `required` list.
 */
export declare const getServiceHealthDefinition: {
    name: "get_service_health";
    description: string;
    inputSchema: {
        type: "object";
        properties: {
            service: { type: string; description: string };
        };
        required: string[];
    };
};
|
|
18
|
+
/**
 * Handler for the `get_service_health` tool.
 *
 * @param registry - Connector registry used to reach the configured backends.
 * @param args - The `service` whose health is requested.
 * @returns A tool result made of text content items.
 *          NOTE(review): the text is presumably a JSON document — confirm
 *          against the implementation.
 */
export declare function getServiceHealthHandler(
    registry: ConnectorRegistry,
    args: { service: string },
): Promise<{ content: Array<{ type: "text"; text: string }> }>;
|