@thotischner/observability-mcp 1.4.1 → 1.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. package/dist/analysis/anomaly.d.ts +89 -0
  2. package/dist/analysis/anomaly.js +235 -0
  3. package/dist/analysis/anomaly.test.js +149 -1
  4. package/dist/analysis/backtest.d.ts +31 -0
  5. package/dist/analysis/backtest.js +206 -0
  6. package/dist/analysis/backtest.test.d.ts +1 -0
  7. package/dist/analysis/backtest.test.js +34 -0
  8. package/dist/analysis/correlator.d.ts +35 -0
  9. package/dist/analysis/correlator.js +95 -0
  10. package/dist/analysis/correlator.test.js +60 -1
  11. package/dist/analysis/health.d.ts +2 -3
  12. package/dist/analysis/index.d.ts +32 -0
  13. package/dist/analysis/index.js +29 -0
  14. package/dist/analysis/library.test.d.ts +1 -0
  15. package/dist/analysis/library.test.js +44 -0
  16. package/dist/auth/credentials.d.ts +29 -0
  17. package/dist/auth/credentials.js +76 -0
  18. package/dist/auth/credentials.test.d.ts +1 -0
  19. package/dist/auth/credentials.test.js +57 -0
  20. package/dist/context.d.ts +27 -0
  21. package/dist/context.js +18 -0
  22. package/dist/index.js +53 -44
  23. package/dist/net/egress-policy.d.ts +31 -0
  24. package/dist/net/egress-policy.js +37 -0
  25. package/dist/net/egress-policy.test.d.ts +1 -0
  26. package/dist/net/egress-policy.test.js +52 -0
  27. package/dist/tools/context-seam.test.d.ts +1 -0
  28. package/dist/tools/context-seam.test.js +23 -0
  29. package/dist/tools/detect-anomalies.d.ts +2 -1
  30. package/dist/tools/detect-anomalies.js +47 -11
  31. package/dist/tools/get-service-health.d.ts +2 -1
  32. package/dist/tools/get-service-health.js +2 -1
  33. package/dist/tools/handlers.test.js +73 -0
  34. package/dist/tools/list-services.d.ts +2 -1
  35. package/dist/tools/list-services.js +2 -1
  36. package/dist/tools/list-sources.d.ts +2 -1
  37. package/dist/tools/list-sources.js +2 -1
  38. package/dist/tools/query-logs.d.ts +2 -1
  39. package/dist/tools/query-logs.js +2 -1
  40. package/dist/tools/query-metrics.d.ts +2 -1
  41. package/dist/tools/query-metrics.js +9 -1
  42. package/package.json +10 -2
@@ -1,4 +1,6 @@
1
- import { detectRecentAnomaly } from "../analysis/anomaly.js";
1
+ import { defaultContext } from "../context.js";
2
+ import { detectAnomaly, classifyMetric } from "../analysis/anomaly.js";
3
+ import { rankRootCause } from "../analysis/correlator.js";
2
4
  export const detectAnomaliesDefinition = {
3
5
  name: "detect_anomalies",
4
6
  description: "Scan for anomalies across all monitored services (or a specific service). Detects metric deviations using z-score analysis against recent baseline, checks log error spikes, and correlates signals across metrics and logs. Returns anomalies with severity ratings and cross-signal correlations.",
@@ -26,8 +28,12 @@ const SENSITIVITY_THRESHOLDS = {
26
28
  medium: 2.0,
27
29
  high: 1.5,
28
30
  };
29
- const KEY_METRICS = ["cpu", "error_rate", "latency_p99", "request_rate"];
30
- export async function detectAnomaliesHandler(registry, args) {
31
+ const KEY_METRICS = ["cpu", "memory", "error_rate", "latency_p99", "request_rate"];
32
+ // Patterns that signal a serious incident even at warn level and even when
33
+ // the overall error ratio is low (e.g. a memory leak emits a handful of
34
+ // "OutOfMemoryWarning" lines long before it turns into 5xx errors).
35
+ const CRITICAL_LOG_PATTERN = /\b(out\s?of\s?memory|oom|outofmemory|heap (usage|exhaust)|memory leak|panic|fatal|deadlock|segfault|stack overflow|cannot allocate)\b/i;
36
+ export async function detectAnomaliesHandler(registry, args, _ctx = defaultContext()) {
31
37
  const duration = args.duration || "10m";
32
38
  const threshold = SENSITIVITY_THRESHOLDS[args.sensitivity || "medium"] || 2.0;
33
39
  // Discover services to scan
@@ -56,18 +62,21 @@ export async function detectAnomaliesHandler(registry, args) {
56
62
  for (const metric of KEY_METRICS) {
57
63
  try {
58
64
  const result = await connector.queryMetrics({ service: serviceName, metric, duration });
59
- const values = result.values.map((v) => v.value);
60
- const anomaly = detectRecentAnomaly(values, 5, threshold);
65
+ const points = result.values.map((v) => ({ timestamp: v.timestamp, value: v.value }));
66
+ const anomaly = detectAnomaly(points, {
67
+ threshold,
68
+ metricKind: classifyMetric(metric),
69
+ });
61
70
  if (anomaly.isAnomaly) {
62
- const deviationPercent = anomaly.baselineAvg === 0
71
+ const deviationPercent = anomaly.baselineValue === 0
63
72
  ? 100
64
- : Math.round(((anomaly.recentAvg - anomaly.baselineAvg) / anomaly.baselineAvg) * 100);
73
+ : Math.round(((anomaly.recentValue - anomaly.baselineValue) / anomaly.baselineValue) * 100);
65
74
  allAnomalies.push({
66
75
  metric,
67
- severity: Math.abs(anomaly.zScore) >= 3 ? "high" : Math.abs(anomaly.zScore) >= 2 ? "medium" : "low",
68
- description: `${metric} is ${anomaly.zScore.toFixed(1)}σ ${anomaly.zScore > 0 ? "above" : "below"} baseline (${anomaly.baselineAvg.toFixed(2)} → ${anomaly.recentAvg.toFixed(2)})`,
69
- currentValue: anomaly.recentAvg,
70
- baselineValue: anomaly.baselineAvg,
76
+ severity: Math.abs(anomaly.score) >= 6 ? "high" : Math.abs(anomaly.score) >= 4 ? "medium" : "low",
77
+ description: `${metric}: ${anomaly.reason}`,
78
+ currentValue: anomaly.recentValue,
79
+ baselineValue: anomaly.baselineValue,
71
80
  deviationPercent,
72
81
  source: connector.name,
73
82
  service: serviceName,
@@ -85,6 +94,21 @@ export async function detectAnomaliesHandler(registry, args) {
85
94
  continue;
86
95
  try {
87
96
  const logs = await connector.queryLogs({ service: serviceName, duration, limit: 500 });
97
+ // Critical-pattern scan — independent of the error-ratio gate, so a
98
+ // warn-level OOM/leak signal is not silently dropped.
99
+ const criticalPattern = logs.summary.topPatterns.find((p) => CRITICAL_LOG_PATTERN.test(p));
100
+ if (criticalPattern) {
101
+ allAnomalies.push({
102
+ metric: "log_critical_pattern",
103
+ severity: "high",
104
+ description: `Critical log pattern detected: "${criticalPattern}"`,
105
+ currentValue: logs.summary.errorCount + logs.summary.warnCount,
106
+ baselineValue: 0,
107
+ deviationPercent: 100,
108
+ source: connector.name,
109
+ service: serviceName,
110
+ });
111
+ }
88
112
  if (logs.summary.errorCount > 5) {
89
113
  const errorRatio = logs.summary.total > 0
90
114
  ? logs.summary.errorCount / logs.summary.total
@@ -123,10 +147,22 @@ export async function detectAnomaliesHandler(registry, args) {
123
147
  }
124
148
  }
125
149
  }
150
+ // Dependency-aware root-cause ranking. The service graph / change markers
151
+ // are empty here (no trace source wired yet); ranking then degrades to
152
+ // severity-weighted ordering and still names the most likely culprit
153
+ // instead of just listing "both signals bad".
154
+ const rootCause = allAnomalies.length > 0
155
+ ? rankRootCause(allAnomalies.map((a) => ({
156
+ service: a.service,
157
+ metric: a.metric,
158
+ severity: a.severity,
159
+ })))
160
+ : { ranked: [], summary: "" };
126
161
  const result = {
127
162
  scannedServices: serviceNames.length,
128
163
  anomalies: allAnomalies,
129
164
  correlations: allCorrelations,
165
+ rootCause,
130
166
  summary: allAnomalies.length === 0
131
167
  ? "All services healthy — no anomalies detected."
132
168
  : `${allAnomalies.length} anomal${allAnomalies.length === 1 ? "y" : "ies"} detected across ${[...new Set(allAnomalies.map((a) => a.service))].length} service(s).`,
@@ -1,4 +1,5 @@
1
1
  import type { ConnectorRegistry } from "../connectors/registry.js";
2
+ import { type RequestContext } from "../context.js";
2
3
  import type { HealthThresholds } from "../types.js";
3
4
  export declare function setHealthThresholds(t: HealthThresholds): void;
4
5
  export declare const getServiceHealthDefinition: {
@@ -17,7 +18,7 @@ export declare const getServiceHealthDefinition: {
17
18
  };
18
19
  export declare function getServiceHealthHandler(registry: ConnectorRegistry, args: {
19
20
  service: string;
20
- }): Promise<{
21
+ }, _ctx?: RequestContext): Promise<{
21
22
  content: {
22
23
  type: "text";
23
24
  text: string;
@@ -1,3 +1,4 @@
1
+ import { defaultContext } from "../context.js";
1
2
  import { calculateHealthScore } from "../analysis/health.js";
2
3
  import { detectRecentAnomaly } from "../analysis/anomaly.js";
3
4
  import { sanitizeForLog } from "../util/sanitize.js";
@@ -19,7 +20,7 @@ export const getServiceHealthDefinition = {
19
20
  required: ["service"],
20
21
  },
21
22
  };
22
- export async function getServiceHealthHandler(registry, args) {
23
+ export async function getServiceHealthHandler(registry, args, _ctx = defaultContext()) {
23
24
  const metricsConnectors = registry.getBySignal("metrics");
24
25
  const logConnectors = registry.getBySignal("logs");
25
26
  // Gather metrics
@@ -3,6 +3,7 @@ import assert from "node:assert/strict";
3
3
  import { ConnectorRegistry } from "../connectors/registry.js";
4
4
  import { listSourcesHandler } from "./list-sources.js";
5
5
  import { listServicesHandler } from "./list-services.js";
6
+ import { detectAnomaliesHandler } from "./detect-anomalies.js";
6
7
  // --- Mock Connector ---
7
8
  function createMockConnector(overrides) {
8
9
  return {
@@ -136,3 +137,75 @@ describe("listServicesHandler", () => {
136
137
  assert.equal(data.total, 0);
137
138
  });
138
139
  });
140
+ describe("detectAnomaliesHandler — A5 memory/OOM coverage", () => {
141
+ const flatMemory = () => ({
142
+ source: "prom1", service: "payment-service", metric: "memory", unit: "bytes",
143
+ values: Array.from({ length: 40 }, (_, i) => ({
144
+ timestamp: new Date(Date.now() - (40 - i) * 9000).toISOString(),
145
+ value: 1.3e8 + (i % 3) * 1e6, // noisy, no trend → no metric anomaly
146
+ })),
147
+ summary: { current: 1.3e8, average: 1.3e8, min: 1.28e8, max: 1.33e8, trend: "stable" },
148
+ });
149
+ it("scans the memory metric (now in KEY_METRICS)", async () => {
150
+ const requested = [];
151
+ const reg = createRegistryWithMocks([
152
+ createMockConnector({
153
+ name: "prom1", type: "prometheus", signalType: "metrics",
154
+ listServices: async () => [{ name: "payment-service", source: "prom1", signalType: "metrics" }],
155
+ queryMetrics: async ({ metric }) => {
156
+ requested.push(metric);
157
+ return flatMemory();
158
+ },
159
+ }),
160
+ ]);
161
+ await detectAnomaliesHandler(reg, {});
162
+ assert.ok(requested.includes("memory"), `memory not scanned; got ${requested.join(",")}`);
163
+ });
164
+ it("flags a warn-level OOM log pattern below the error-rate gate", async () => {
165
+ const reg = createRegistryWithMocks([
166
+ createMockConnector({
167
+ name: "prom1", type: "prometheus", signalType: "metrics",
168
+ listServices: async () => [{ name: "payment-service", source: "prom1", signalType: "metrics" }],
169
+ queryMetrics: async () => flatMemory(),
170
+ }),
171
+ createMockConnector({
172
+ name: "loki1", type: "loki", signalType: "logs",
173
+ queryLogs: async () => ({
174
+ source: "loki1", service: "payment-service", entries: [],
175
+ // Only 4 warn-level lines: errorCount below the >5 gate, ratio tiny.
176
+ summary: {
177
+ total: 800, errorCount: 4, warnCount: 4,
178
+ topPatterns: ["OutOfMemoryWarning: heap usage exceeding threshold (4x)"],
179
+ },
180
+ }),
181
+ }),
182
+ ]);
183
+ const result = await detectAnomaliesHandler(reg, {});
184
+ const data = JSON.parse(result.content[0].text);
185
+ const crit = data.anomalies.find((a) => a.metric === "log_critical_pattern");
186
+ assert.ok(crit, "expected a log_critical_pattern anomaly for the OOM warning");
187
+ assert.equal(crit.service, "payment-service");
188
+ assert.equal(crit.severity, "high");
189
+ assert.equal(data.rootCause.ranked[0].service, "payment-service");
190
+ assert.notEqual(data.summary, "All services healthy — no anomalies detected.");
191
+ });
192
+ it("does not flag benign warn patterns", async () => {
193
+ const reg = createRegistryWithMocks([
194
+ createMockConnector({
195
+ name: "prom1", type: "prometheus", signalType: "metrics",
196
+ listServices: async () => [{ name: "order-service", source: "prom1", signalType: "metrics" }],
197
+ queryMetrics: async () => flatMemory(),
198
+ }),
199
+ createMockConnector({
200
+ name: "loki1", type: "loki", signalType: "logs",
201
+ queryLogs: async () => ({
202
+ source: "loki1", service: "order-service", entries: [],
203
+ summary: { total: 500, errorCount: 1, warnCount: 2, topPatterns: ["cache miss for key user:42"] },
204
+ }),
205
+ }),
206
+ ]);
207
+ const result = await detectAnomaliesHandler(reg, {});
208
+ const data = JSON.parse(result.content[0].text);
209
+ assert.equal(data.anomalies.length, 0);
210
+ });
211
+ });
@@ -1,4 +1,5 @@
1
1
  import type { ConnectorRegistry } from "../connectors/registry.js";
2
+ import { type RequestContext } from "../context.js";
2
3
  export declare const listServicesDefinition: {
3
4
  name: "list_services";
4
5
  description: string;
@@ -14,7 +15,7 @@ export declare const listServicesDefinition: {
14
15
  };
15
16
  export declare function listServicesHandler(registry: ConnectorRegistry, args: {
16
17
  filter?: string;
17
- }): Promise<{
18
+ }, _ctx?: RequestContext): Promise<{
18
19
  content: {
19
20
  type: "text";
20
21
  text: string;
@@ -1,3 +1,4 @@
1
+ import { defaultContext } from "../context.js";
1
2
  export const listServicesDefinition = {
2
3
  name: "list_services",
3
4
  description: "List all monitored services discovered across all connected backends. Returns service names, their data sources, and signal types (metrics/logs).",
@@ -11,7 +12,7 @@ export const listServicesDefinition = {
11
12
  },
12
13
  },
13
14
  };
14
- export async function listServicesHandler(registry, args) {
15
+ export async function listServicesHandler(registry, args, _ctx = defaultContext()) {
15
16
  const connectors = registry.getAll();
16
17
  const allServices = [];
17
18
  for (const connector of connectors) {
@@ -1,4 +1,5 @@
1
1
  import type { ConnectorRegistry } from "../connectors/registry.js";
2
+ import { type RequestContext } from "../context.js";
2
3
  export declare const listSourcesDefinition: {
3
4
  name: "list_sources";
4
5
  description: string;
@@ -7,7 +8,7 @@ export declare const listSourcesDefinition: {
7
8
  properties: {};
8
9
  };
9
10
  };
10
- export declare function listSourcesHandler(registry: ConnectorRegistry): Promise<{
11
+ export declare function listSourcesHandler(registry: ConnectorRegistry, _ctx?: RequestContext): Promise<{
11
12
  content: {
12
13
  type: "text";
13
14
  text: string;
@@ -1,3 +1,4 @@
1
+ import { defaultContext } from "../context.js";
1
2
  export const listSourcesDefinition = {
2
3
  name: "list_sources",
3
4
  description: "List all configured observability backends and their connection status. Use this to discover what data sources are available.",
@@ -6,7 +7,7 @@ export const listSourcesDefinition = {
6
7
  properties: {},
7
8
  },
8
9
  };
9
- export async function listSourcesHandler(registry) {
10
+ export async function listSourcesHandler(registry, _ctx = defaultContext()) {
10
11
  const healthResults = await registry.healthCheckAll();
11
12
  const connectors = registry.getAll();
12
13
  const sources = connectors.map((c) => ({
@@ -1,4 +1,5 @@
1
1
  import type { ConnectorRegistry } from "../connectors/registry.js";
2
+ import { type RequestContext } from "../context.js";
2
3
  export declare const queryLogsDefinition: {
3
4
  name: "query_logs";
4
5
  description: string;
@@ -35,7 +36,7 @@ export declare function queryLogsHandler(registry: ConnectorRegistry, args: {
35
36
  duration?: string;
36
37
  level?: string;
37
38
  limit?: number;
38
- }): Promise<{
39
+ }, _ctx?: RequestContext): Promise<{
39
40
  content: {
40
41
  type: "text";
41
42
  text: string;
@@ -1,3 +1,4 @@
1
+ import { defaultContext } from "../context.js";
1
2
  import { validateDuration, validateServiceName, errorResponse } from "./validation.js";
2
3
  export const queryLogsDefinition = {
3
4
  name: "query_logs",
@@ -29,7 +30,7 @@ export const queryLogsDefinition = {
29
30
  required: ["service"],
30
31
  },
31
32
  };
32
- export async function queryLogsHandler(registry, args) {
33
+ export async function queryLogsHandler(registry, args, _ctx = defaultContext()) {
33
34
  const svcErr = validateServiceName(args.service);
34
35
  if (svcErr)
35
36
  return errorResponse(svcErr);
@@ -1,4 +1,5 @@
1
1
  import type { ConnectorRegistry } from "../connectors/registry.js";
2
+ import { type RequestContext } from "../context.js";
2
3
  export declare const queryMetricsDefinition: {
3
4
  name: "query_metrics";
4
5
  description: string;
@@ -35,7 +36,7 @@ export declare function queryMetricsHandler(registry: ConnectorRegistry, args: {
35
36
  duration?: string;
36
37
  source?: string;
37
38
  groupBy?: string;
38
- }): Promise<{
39
+ }, _ctx?: RequestContext): Promise<{
39
40
  content: {
40
41
  type: "text";
41
42
  text: string;
@@ -1,3 +1,4 @@
1
+ import { defaultContext } from "../context.js";
1
2
  import { validateDuration, validateMetricName, validateServiceName, errorResponse } from "./validation.js";
2
3
  export const queryMetricsDefinition = {
3
4
  name: "query_metrics",
@@ -29,7 +30,14 @@ export const queryMetricsDefinition = {
29
30
  required: ["service", "metric"],
30
31
  },
31
32
  };
32
- export async function queryMetricsHandler(registry, args) {
33
+ export async function queryMetricsHandler(registry, args, _ctx = defaultContext()) {
34
+ // Coarse single-tenant source scoping: if the principal is restricted to a
35
+ // source allow-list, deny an explicit out-of-scope source.
36
+ if (_ctx.allowedSources &&
37
+ args.source &&
38
+ !_ctx.allowedSources.includes(args.source)) {
39
+ return errorResponse(`forbidden: source "${args.source}" is not in your allowed sources`);
40
+ }
33
41
  const svcErr = validateServiceName(args.service);
34
42
  if (svcErr)
35
43
  return errorResponse(svcErr);
package/package.json CHANGED
@@ -1,9 +1,9 @@
1
1
  {
2
2
  "name": "@thotischner/observability-mcp",
3
- "version": "1.4.1",
3
+ "version": "1.5.1",
4
4
  "description": "Unified observability gateway for AI agents — one MCP server for Prometheus, Loki, and any backend",
5
5
  "type": "module",
6
- "license": "MIT",
6
+ "license": "Apache-2.0",
7
7
  "mcpName": "io.github.ThoTischner/observability-mcp",
8
8
  "repository": {
9
9
  "type": "git",
@@ -25,6 +25,13 @@
25
25
  "observability-mcp": "./dist/index.js",
26
26
  "omcp": "./dist/cli/index.js"
27
27
  },
28
+ "exports": {
29
+ ".": "./dist/index.js",
30
+ "./analysis": {
31
+ "types": "./dist/analysis/index.d.ts",
32
+ "default": "./dist/analysis/index.js"
33
+ }
34
+ },
28
35
  "files": [
29
36
  "dist",
30
37
  "config"
@@ -37,6 +44,7 @@
37
44
  "dependencies": {
38
45
  "@modelcontextprotocol/sdk": "^1.12.0",
39
46
  "express": "^5.2.1",
47
+ "express-rate-limit": "^7.5.0",
40
48
  "js-yaml": "^4.1.0",
41
49
  "prom-client": "^15.1.0",
42
50
  "zod": "^4.4.3"