@thotischner/observability-mcp 1.2.0 → 1.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -10,6 +10,8 @@ export declare class PrometheusConnector implements ObservabilityConnector {
10
10
  private metrics;
11
11
  private serviceLabels;
12
12
  private labelValuesCache;
13
+ private metricNamesCache;
14
+ private userOverrides;
13
15
  connect(config: SourceConfig): Promise<void>;
14
16
  getDefaultMetrics(): MetricDefinition[];
15
17
  getMetrics(): MetricDefinition[];
@@ -21,6 +23,8 @@ export declare class PrometheusConnector implements ObservabilityConnector {
21
23
  listAvailableMetrics(_service: string): Promise<MetricInfo[]>;
22
24
  queryMetrics(params: MetricQuery): Promise<MetricResult>;
23
25
  private buildQuery;
26
+ private pickMetricCandidate;
27
+ private getAllMetricNames;
24
28
  private resolveServiceLabel;
25
29
  private getLabelValues;
26
30
  private getUnit;
@@ -1,55 +1,71 @@
1
1
  import { buildTlsAgent } from "./tls.js";
2
- // Defaults target prom-client conventions, the de-facto standard for
3
- // Node.js/Express instrumentation and what most apps emit out of the box.
4
- // {{selector}} is replaced at query time with the discovered label/value
5
- // pair (e.g. job="my-svc"); the connector probes job → service → app →
6
- // service_name to find which label carries the requested service name.
7
- // {{service}} (literal value) is still supported for back-compat with
8
- // user-provided overrides.
9
- const DEFAULT_PROMETHEUS_METRICS = [
10
- {
11
- name: "cpu",
12
- query: 'rate(process_cpu_seconds_total{ {{selector}} }[1m]) * 100',
13
- unit: "percent",
14
- description: "CPU usage % (rate of process_cpu_seconds_total — prom-client default)",
15
- },
16
- {
17
- name: "memory",
18
- query: 'process_resident_memory_bytes{ {{selector}} }',
19
- unit: "bytes",
20
- description: "Resident memory in bytes (prom-client default)",
21
- },
22
- {
23
- name: "request_rate",
24
- query: 'sum(rate(http_requests_total{ {{selector}} }[1m]))',
25
- unit: "req/s",
26
- description: "HTTP request rate",
27
- },
28
- {
29
- name: "error_rate",
30
- query: 'sum(rate(http_requests_total{ {{selector}}, status=~"5.." }[1m]))',
31
- unit: "req/s",
32
- description: "HTTP 5xx error rate",
33
- },
34
- {
35
- name: "latency_p99",
36
- query: 'histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{ {{selector}} }[1m])) by (le))',
37
- unit: "seconds",
38
- description: "99th percentile latency",
39
- },
40
- {
41
- name: "latency_p50",
42
- query: 'histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket{ {{selector}} }[1m])) by (le))',
43
- unit: "seconds",
44
- description: "50th percentile latency",
45
- },
46
- {
47
- name: "latency_avg",
48
- query: 'sum(rate(http_request_duration_seconds_sum{ {{selector}} }[1m])) / sum(rate(http_request_duration_seconds_count{ {{selector}} }[1m]))',
49
- unit: "seconds",
50
- description: "Average request latency",
51
- },
52
- ];
2
+ const PROMETHEUS_METRIC_CANDIDATES = {
3
+ cpu: [
4
+ {
5
+ seriesName: "process_cpu_seconds_total",
6
+ query: 'rate(process_cpu_seconds_total{ {{selector}} }[1m]) * 100',
7
+ },
8
+ {
9
+ seriesName: "node_cpu_seconds_total",
10
+ query: '100 - avg(rate(node_cpu_seconds_total{ {{selector}}, mode="idle" }[1m])) * 100',
11
+ },
12
+ ],
13
+ memory: [
14
+ {
15
+ seriesName: "process_resident_memory_bytes",
16
+ query: 'process_resident_memory_bytes{ {{selector}} }',
17
+ },
18
+ {
19
+ seriesName: "node_memory_MemTotal_bytes",
20
+ query: '(node_memory_MemTotal_bytes{ {{selector}} } - node_memory_MemAvailable_bytes{ {{selector}} })',
21
+ },
22
+ ],
23
+ request_rate: [
24
+ {
25
+ seriesName: "http_requests_total",
26
+ query: 'sum(rate(http_requests_total{ {{selector}} }[1m]))',
27
+ },
28
+ ],
29
+ error_rate: [
30
+ {
31
+ seriesName: "http_requests_total",
32
+ query: 'sum(rate(http_requests_total{ {{selector}}, status=~"5.." }[1m]))',
33
+ },
34
+ ],
35
+ latency_p99: [
36
+ {
37
+ seriesName: "http_request_duration_seconds_bucket",
38
+ query: 'histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{ {{selector}} }[1m])) by (le))',
39
+ },
40
+ ],
41
+ latency_p50: [
42
+ {
43
+ seriesName: "http_request_duration_seconds_bucket",
44
+ query: 'histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket{ {{selector}} }[1m])) by (le))',
45
+ },
46
+ ],
47
+ latency_avg: [
48
+ {
49
+ seriesName: "http_request_duration_seconds_sum",
50
+ query: 'sum(rate(http_request_duration_seconds_sum{ {{selector}} }[1m])) / sum(rate(http_request_duration_seconds_count{ {{selector}} }[1m]))',
51
+ },
52
+ ],
53
+ };
54
+ const DEFAULT_METRIC_META = {
55
+ cpu: { unit: "percent", description: "CPU usage % (auto: prom-client process_cpu_seconds_total or node_exporter node_cpu_seconds_total)" },
56
+ memory: { unit: "bytes", description: "Resident memory bytes (auto: prom-client process_resident_memory_bytes or node_memory used)" },
57
+ request_rate: { unit: "req/s", description: "HTTP request rate (http_requests_total)" },
58
+ error_rate: { unit: "req/s", description: "HTTP 5xx error rate (http_requests_total filtered by status)" },
59
+ latency_p99: { unit: "seconds", description: "99th percentile latency (http_request_duration_seconds_bucket)" },
60
+ latency_p50: { unit: "seconds", description: "50th percentile latency (http_request_duration_seconds_bucket)" },
61
+ latency_avg: { unit: "seconds", description: "Average request latency (sum/count ratio)" },
62
+ };
63
+ const DEFAULT_PROMETHEUS_METRICS = Object.keys(PROMETHEUS_METRIC_CANDIDATES).map((name) => ({
64
+ name,
65
+ query: PROMETHEUS_METRIC_CANDIDATES[name][0].query,
66
+ unit: DEFAULT_METRIC_META[name].unit,
67
+ description: DEFAULT_METRIC_META[name].description,
68
+ }));
53
69
  const DEFAULT_SERVICE_LABELS = ["job", "service", "app", "service_name"];
54
70
  const LABEL_CACHE_TTL_MS = 60_000;
55
71
  export class PrometheusConnector {
@@ -62,6 +78,8 @@ export class PrometheusConnector {
62
78
  metrics = [];
63
79
  serviceLabels = DEFAULT_SERVICE_LABELS;
64
80
  labelValuesCache = new Map();
81
+ metricNamesCache = null;
82
+ userOverrides = new Set();
65
83
  async connect(config) {
66
84
  this.name = config.name;
67
85
  this.baseUrl = config.url.replace(/\/$/, "");
@@ -69,7 +87,9 @@ export class PrometheusConnector {
69
87
  this.tlsAgent = buildTlsAgent(config);
70
88
  // Source-level overrides merge with defaults by name, so users can pin
71
89
  // a single metric (e.g. cpu) to a custom query without re-listing the
72
- // rest. To fully replace the defaults, override every metric explicitly.
90
+ // rest. Overridden metrics skip candidate probing; the user-supplied
91
+ // query is used verbatim.
92
+ this.userOverrides = new Set((config.metrics || []).map((m) => m.name));
73
93
  const overrides = new Map((config.metrics || []).map((m) => [m.name, m]));
74
94
  this.metrics = DEFAULT_PROMETHEUS_METRICS.map((d) => overrides.get(d.name) || d);
75
95
  for (const [name, m] of overrides) {
@@ -193,8 +213,20 @@ export class PrometheusConnector {
193
213
  }
194
214
  // --- Private helpers ---
195
215
  async buildQuery(service, metric) {
196
- const def = this.metrics.find((m) => m.name === metric);
197
- const template = def?.query || `${metric}{ {{selector}} }`;
216
+ // Pick the query template. For built-in metrics with no user override,
217
+ // probe candidate series in the backend and pick the first that exists
218
+ // (e.g. prom-client process_cpu_seconds_total → falls back to
219
+ // node_exporter node_cpu_seconds_total). User-overridden metrics use
220
+ // their query verbatim.
221
+ let template;
222
+ if (!this.userOverrides.has(metric) && PROMETHEUS_METRIC_CANDIDATES[metric]) {
223
+ const candidate = await this.pickMetricCandidate(metric);
224
+ template = candidate?.query || PROMETHEUS_METRIC_CANDIDATES[metric][0].query;
225
+ }
226
+ else {
227
+ const def = this.metrics.find((m) => m.name === metric);
228
+ template = def?.query || `${metric}{ {{selector}} }`;
229
+ }
198
230
  const escaped = service.replace(/\\/g, "\\\\").replace(/"/g, '\\"');
199
231
  let promql = template;
200
232
  let label = "job";
@@ -206,6 +238,34 @@ export class PrometheusConnector {
206
238
  promql = promql.replace(/\{\{service\}\}/g, escaped);
207
239
  return { promql, label };
208
240
  }
241
+ async pickMetricCandidate(metric) {
242
+ const candidates = PROMETHEUS_METRIC_CANDIDATES[metric];
243
+ if (!candidates || candidates.length === 0)
244
+ return null;
245
+ if (candidates.length === 1)
246
+ return candidates[0];
247
+ const allNames = await this.getAllMetricNames();
248
+ for (const c of candidates) {
249
+ if (allNames.has(c.seriesName))
250
+ return c;
251
+ }
252
+ return candidates[0];
253
+ }
254
+ async getAllMetricNames() {
255
+ if (this.metricNamesCache && this.metricNamesCache.expiresAt > Date.now()) {
256
+ return this.metricNamesCache.values;
257
+ }
258
+ try {
259
+ const data = await this.apiGet("/api/v1/label/__name__/values");
260
+ const values = new Set(data?.data || []);
261
+ this.metricNamesCache = { values, expiresAt: Date.now() + LABEL_CACHE_TTL_MS };
262
+ return values;
263
+ }
264
+ catch {
265
+ this.metricNamesCache = { values: new Set(), expiresAt: Date.now() + LABEL_CACHE_TTL_MS };
266
+ return new Set();
267
+ }
268
+ }
209
269
  async resolveServiceLabel(service) {
210
270
  for (const label of this.serviceLabels) {
211
271
  const values = await this.getLabelValues(label);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@thotischner/observability-mcp",
3
- "version": "1.2.0",
3
+ "version": "1.2.1",
4
4
  "description": "Unified observability gateway for AI agents — one MCP server for Prometheus, Loki, and any backend",
5
5
  "type": "module",
6
6
  "license": "MIT",