@thotischner/observability-mcp 1.1.2 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,11 +3,21 @@ sources:
3
3
  type: prometheus
4
4
  url: http://prometheus:9090
5
5
  enabled: true
6
- # metrics: # Uncomment to override default Prometheus metrics
7
- # - name: custom_metric
8
- # query: 'my_custom_metric{job="{{service}}"}'
9
- # unit: requests
10
- # description: My custom metric
6
+ # The demo services emit synthetic CPU/memory gauges that the chaos
7
+ # endpoints drive directly, so we override the connector defaults
8
+ # (which target prom-client conventions) with the demo-specific names.
9
+ # request_rate / error_rate / latency_* keep the prom-client defaults
10
+ # and apply unchanged because the demo also emits http_requests_total
11
+ # and http_request_duration_seconds_bucket.
12
+ metrics:
13
+ - name: cpu
14
+ query: 'service_cpu_usage_percent{job="{{service}}"}'
15
+ unit: percent
16
+ description: Synthetic CPU gauge driven by /chaos endpoints
17
+ - name: memory
18
+ query: 'service_memory_usage_bytes{job="{{service}}"}'
19
+ unit: bytes
20
+ description: Synthetic memory gauge driven by /chaos endpoints
11
21
 
12
22
  - name: loki
13
23
  type: loki
@@ -18,7 +18,7 @@ export declare class LokiConnector implements ObservabilityConnector {
18
18
  listServices(): Promise<ServiceInfo[]>;
19
19
  queryLogs(params: LogQuery): Promise<LogResult>;
20
20
  private getLabelValues;
21
- private resolveServiceLabel;
21
+ private resolveServiceSelector;
22
22
  private parseLine;
23
23
  private extractTopPatterns;
24
24
  private parseTimeRange;
@@ -61,10 +61,14 @@ export class LokiConnector {
61
61
  const seen = new Map();
62
62
  for (const label of this.serviceLabels) {
63
63
  const values = await this.getLabelValues(label);
64
- for (const name of values) {
65
- if (!seen.has(name)) {
66
- seen.set(name, {
67
- name,
64
+ for (const raw of values) {
65
+ // Docker's loki.source.docker writes container names with a leading '/'
66
+ // (Docker API Names[0] convention). Strip it for display so the name
67
+ // matches what the service-name validator and users will pass back in.
68
+ const display = label === "container" ? raw.replace(/^\//, "") : raw;
69
+ if (!seen.has(display)) {
70
+ seen.set(display, {
71
+ name: display,
68
72
  source: this.name,
69
73
  signalType: "logs",
70
74
  labels: { discoveredVia: label },
@@ -77,11 +81,12 @@ export class LokiConnector {
77
81
  async queryLogs(params) {
78
82
  const { start, end } = this.parseTimeRange(params.duration);
79
83
  const limit = Math.min(Math.max(params.limit || 100, 1), 1000);
80
- // Resolve which label this service identifier lives under. Falls back to
81
- // the first configured label when no exact match is found, preserving
82
- // legacy behavior for callers passing labels that aren't in the cache yet.
83
- const matchedLabel = await this.resolveServiceLabel(params.service);
84
- const service = this.escapeLogQLValue(params.service);
84
+ // Resolve label + actual selector value. For the 'container' label the
85
+ // value stored in Loki may be '/my-app-1' while the caller passes the
86
+ // sanitized 'my-app-1' — return the prefixed form so the LogQL selector
87
+ // matches the real stream.
88
+ const { label: matchedLabel, value: rawValue } = await this.resolveServiceSelector(params.service);
89
+ const service = this.escapeLogQLValue(rawValue);
85
90
  let logql = `{${matchedLabel}="${service}"}`;
86
91
  if (params.level) {
87
92
  const level = this.escapeLogQLValue(params.level);
@@ -148,13 +153,18 @@ export class LokiConnector {
148
153
  return [];
149
154
  }
150
155
  }
151
- async resolveServiceLabel(service) {
156
+ async resolveServiceSelector(service) {
152
157
  for (const label of this.serviceLabels) {
153
158
  const values = await this.getLabelValues(label);
154
159
  if (values.includes(service))
155
- return label;
160
+ return { label, value: service };
161
+ // Container label values are Docker-prefixed with '/'. The caller can't
162
+ // pass that form (validator rejects '/'), so probe the prefixed variant.
163
+ if (label === "container" && values.includes(`/${service}`)) {
164
+ return { label, value: `/${service}` };
165
+ }
156
166
  }
157
- return this.serviceLabels[0] || "service_name";
167
+ return { label: this.serviceLabels[0] || "service_name", value: service };
158
168
  }
159
169
  parseLine(line) {
160
170
  try {
@@ -8,6 +8,8 @@ export declare class PrometheusConnector implements ObservabilityConnector {
8
8
  private auth?;
9
9
  private tlsAgent?;
10
10
  private metrics;
11
+ private serviceLabels;
12
+ private labelValuesCache;
11
13
  connect(config: SourceConfig): Promise<void>;
12
14
  getDefaultMetrics(): MetricDefinition[];
13
15
  getMetrics(): MetricDefinition[];
@@ -19,6 +21,8 @@ export declare class PrometheusConnector implements ObservabilityConnector {
19
21
  listAvailableMetrics(_service: string): Promise<MetricInfo[]>;
20
22
  queryMetrics(params: MetricQuery): Promise<MetricResult>;
21
23
  private buildQuery;
24
+ private resolveServiceLabel;
25
+ private getLabelValues;
22
26
  private getUnit;
23
27
  private parseTimeRange;
24
28
  private computeSummary;
@@ -1,13 +1,57 @@
1
1
  import { buildTlsAgent } from "./tls.js";
2
+ // Defaults target prom-client conventions, the de-facto standard for
3
+ // Node.js/Express instrumentation and what most apps emit out of the box.
4
+ // {{selector}} is replaced at query time with the discovered label/value
5
+ // pair (e.g. job="my-svc"); the connector probes job → service → app →
6
+ // service_name to find which label carries the requested service name.
7
+ // {{service}} (literal value) is still supported for back-compat with
8
+ // user-provided overrides.
2
9
  const DEFAULT_PROMETHEUS_METRICS = [
3
- { name: "cpu", query: 'service_cpu_usage_percent{job="{{service}}"}', unit: "percent", description: "CPU usage percentage" },
4
- { name: "memory", query: 'service_memory_usage_bytes{job="{{service}}"}', unit: "bytes", description: "Memory usage in bytes" },
5
- { name: "error_rate", query: 'rate(http_requests_total{job="{{service}}",status=~"5.."}[1m])', unit: "req/s", description: "HTTP 5xx error rate" },
6
- { name: "request_rate", query: 'rate(http_requests_total{job="{{service}}"}[1m])', unit: "req/s", description: "Total HTTP request rate" },
7
- { name: "latency_p99", query: 'histogram_quantile(0.99, rate(http_request_duration_seconds_bucket{job="{{service}}"}[1m]))', unit: "seconds", description: "99th percentile latency" },
8
- { name: "latency_p50", query: 'histogram_quantile(0.50, rate(http_request_duration_seconds_bucket{job="{{service}}"}[1m]))', unit: "seconds", description: "50th percentile latency" },
9
- { name: "latency_avg", query: 'rate(http_request_duration_seconds_sum{job="{{service}}"}[1m]) / rate(http_request_duration_seconds_count{job="{{service}}"}[1m])', unit: "seconds", description: "Average request latency" },
10
+ {
11
+ name: "cpu",
12
+ query: 'rate(process_cpu_seconds_total{ {{selector}} }[1m]) * 100',
13
+ unit: "percent",
14
+ description: "CPU usage % (rate of process_cpu_seconds_total prom-client default)",
15
+ },
16
+ {
17
+ name: "memory",
18
+ query: 'process_resident_memory_bytes{ {{selector}} }',
19
+ unit: "bytes",
20
+ description: "Resident memory in bytes (prom-client default)",
21
+ },
22
+ {
23
+ name: "request_rate",
24
+ query: 'sum(rate(http_requests_total{ {{selector}} }[1m]))',
25
+ unit: "req/s",
26
+ description: "HTTP request rate",
27
+ },
28
+ {
29
+ name: "error_rate",
30
+ query: 'sum(rate(http_requests_total{ {{selector}}, status=~"5.." }[1m]))',
31
+ unit: "req/s",
32
+ description: "HTTP 5xx error rate",
33
+ },
34
+ {
35
+ name: "latency_p99",
36
+ query: 'histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{ {{selector}} }[1m])) by (le))',
37
+ unit: "seconds",
38
+ description: "99th percentile latency",
39
+ },
40
+ {
41
+ name: "latency_p50",
42
+ query: 'histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket{ {{selector}} }[1m])) by (le))',
43
+ unit: "seconds",
44
+ description: "50th percentile latency",
45
+ },
46
+ {
47
+ name: "latency_avg",
48
+ query: 'sum(rate(http_request_duration_seconds_sum{ {{selector}} }[1m])) / sum(rate(http_request_duration_seconds_count{ {{selector}} }[1m]))',
49
+ unit: "seconds",
50
+ description: "Average request latency",
51
+ },
10
52
  ];
53
+ const DEFAULT_SERVICE_LABELS = ["job", "service", "app", "service_name"];
54
+ const LABEL_CACHE_TTL_MS = 60_000;
11
55
  export class PrometheusConnector {
12
56
  type = "prometheus";
13
57
  signalType = "metrics";
@@ -16,15 +60,27 @@ export class PrometheusConnector {
16
60
  auth;
17
61
  tlsAgent;
18
62
  metrics = [];
63
+ serviceLabels = DEFAULT_SERVICE_LABELS;
64
+ labelValuesCache = new Map();
19
65
  async connect(config) {
20
66
  this.name = config.name;
21
67
  this.baseUrl = config.url.replace(/\/$/, "");
22
68
  this.auth = config.auth;
23
69
  this.tlsAgent = buildTlsAgent(config);
24
- // Use source-level metrics if provided, otherwise connector defaults
25
- this.metrics = config.metrics && config.metrics.length > 0
26
- ? config.metrics
27
- : [...DEFAULT_PROMETHEUS_METRICS];
70
+ // Source-level overrides merge with defaults by name, so users can pin
71
+ // a single metric (e.g. cpu) to a custom query without re-listing the
72
+ // rest. To fully replace the defaults, override every metric explicitly.
73
+ const overrides = new Map((config.metrics || []).map((m) => [m.name, m]));
74
+ this.metrics = DEFAULT_PROMETHEUS_METRICS.map((d) => overrides.get(d.name) || d);
75
+ for (const [name, m] of overrides) {
76
+ if (!DEFAULT_PROMETHEUS_METRICS.some((d) => d.name === name)) {
77
+ this.metrics.push(m);
78
+ }
79
+ }
80
+ const envLabels = process.env.PROMETHEUS_SERVICE_LABELS;
81
+ if (envLabels) {
82
+ this.serviceLabels = envLabels.split(",").map((s) => s.trim()).filter(Boolean);
83
+ }
28
84
  }
29
85
  getDefaultMetrics() {
30
86
  return DEFAULT_PROMETHEUS_METRICS;
@@ -111,7 +167,7 @@ export class PrometheusConnector {
111
167
  return metrics;
112
168
  }
113
169
  async queryMetrics(params) {
114
- const promql = this.buildQuery(params.service, params.metric);
170
+ const { promql, label } = await this.buildQuery(params.service, params.metric);
115
171
  const { start, end, step } = this.parseTimeRange(params.duration, params.step);
116
172
  const data = await this.apiGet(`/api/v1/query_range?query=${encodeURIComponent(promql)}&start=${start}&end=${end}&step=${step}`);
117
173
  const values = [];
@@ -131,15 +187,48 @@ export class PrometheusConnector {
131
187
  unit: this.getUnit(params.metric),
132
188
  values,
133
189
  summary: this.computeSummary(rawValues),
190
+ resolvedSeries: promql,
191
+ resolvedLabel: label,
134
192
  };
135
193
  }
136
194
  // --- Private helpers ---
137
- buildQuery(service, metric) {
195
+ async buildQuery(service, metric) {
138
196
  const def = this.metrics.find((m) => m.name === metric);
139
- if (def) {
140
- return def.query.replace(/\{\{service\}\}/g, service);
197
+ const template = def?.query || `${metric}{ {{selector}} }`;
198
+ const escaped = service.replace(/\\/g, "\\\\").replace(/"/g, '\\"');
199
+ let promql = template;
200
+ let label = "job";
201
+ if (template.includes("{{selector}}")) {
202
+ label = await this.resolveServiceLabel(service);
203
+ const selector = `${label}="${escaped}"`;
204
+ promql = promql.replace(/\{\{selector\}\}/g, selector);
205
+ }
206
+ promql = promql.replace(/\{\{service\}\}/g, escaped);
207
+ return { promql, label };
208
+ }
209
+ async resolveServiceLabel(service) {
210
+ for (const label of this.serviceLabels) {
211
+ const values = await this.getLabelValues(label);
212
+ if (values.includes(service))
213
+ return label;
214
+ }
215
+ return this.serviceLabels[0] || "job";
216
+ }
217
+ async getLabelValues(label) {
218
+ const cached = this.labelValuesCache.get(label);
219
+ if (cached && cached.expiresAt > Date.now()) {
220
+ return cached.values;
221
+ }
222
+ try {
223
+ const data = await this.apiGet(`/api/v1/label/${encodeURIComponent(label)}/values`);
224
+ const values = data?.data || [];
225
+ this.labelValuesCache.set(label, { values, expiresAt: Date.now() + LABEL_CACHE_TTL_MS });
226
+ return values;
227
+ }
228
+ catch {
229
+ this.labelValuesCache.set(label, { values: [], expiresAt: Date.now() + LABEL_CACHE_TTL_MS });
230
+ return [];
141
231
  }
142
- return `${metric}{job="${service}"}`;
143
232
  }
144
233
  getUnit(metric) {
145
234
  const def = this.metrics.find((m) => m.name === metric);
@@ -1,5 +1,9 @@
1
1
  const DURATION_RE = /^\d+[mhd]$/;
2
- const SAFE_LABEL_RE = /^[a-zA-Z0-9_\-.:]+$/;
2
+ // Slashes are valid in Prometheus label values and appear in real-world job
3
+ // names (Grafana Cloud Integrations like "integrations/unix", k8s namespaces,
4
+ // Docker image refs). The PromQL/LogQL injection surface is the surrounding
5
+ // quote/backslash, which we escape separately, not these characters.
6
+ const SAFE_LABEL_RE = /^[a-zA-Z0-9_\-.:/]+$/;
3
7
  export function validateDuration(duration) {
4
8
  if (!DURATION_RE.test(duration)) {
5
9
  return `Invalid duration "${duration}". Expected format: <number><unit> where unit is m (minutes), h (hours), or d (days). Examples: 5m, 1h, 24h, 7d`;
@@ -33,7 +37,7 @@ export function sanitizeLabelValue(value) {
33
37
  }
34
38
  export function validateServiceName(service) {
35
39
  if (!sanitizeLabelValue(service)) {
36
- return `Invalid service name "${service}". Only alphanumeric characters, hyphens, underscores, dots, and colons are allowed (max 128 chars).`;
40
+ return `Invalid service name "${service}". Only alphanumeric characters, hyphens, underscores, dots, colons, and slashes are allowed (max 128 chars).`;
37
41
  }
38
42
  return null;
39
43
  }
package/dist/types.d.ts CHANGED
@@ -121,6 +121,8 @@ export interface MetricResult {
121
121
  unit: string;
122
122
  values: DataPoint[];
123
123
  summary: MetricSummary;
124
+ resolvedSeries?: string;
125
+ resolvedLabel?: string;
124
126
  }
125
127
  export interface LogEntry {
126
128
  timestamp: string;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@thotischner/observability-mcp",
3
- "version": "1.1.2",
3
+ "version": "1.2.0",
4
4
  "description": "Unified observability gateway for AI agents — one MCP server for Prometheus, Loki, and any backend",
5
5
  "type": "module",
6
6
  "license": "MIT",