@thotischner/observability-mcp 1.2.0 → 1.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -10,6 +10,8 @@ export declare class PrometheusConnector implements ObservabilityConnector {
|
|
|
10
10
|
private metrics;
|
|
11
11
|
private serviceLabels;
|
|
12
12
|
private labelValuesCache;
|
|
13
|
+
private candidateCache;
|
|
14
|
+
private userOverrides;
|
|
13
15
|
connect(config: SourceConfig): Promise<void>;
|
|
14
16
|
getDefaultMetrics(): MetricDefinition[];
|
|
15
17
|
getMetrics(): MetricDefinition[];
|
|
@@ -21,6 +23,8 @@ export declare class PrometheusConnector implements ObservabilityConnector {
|
|
|
21
23
|
listAvailableMetrics(_service: string): Promise<MetricInfo[]>;
|
|
22
24
|
queryMetrics(params: MetricQuery): Promise<MetricResult>;
|
|
23
25
|
private buildQuery;
|
|
26
|
+
private pickMetricCandidate;
|
|
27
|
+
private seriesExistsForService;
|
|
24
28
|
private resolveServiceLabel;
|
|
25
29
|
private getLabelValues;
|
|
26
30
|
private getUnit;
|
|
@@ -1,55 +1,71 @@
|
|
|
1
1
|
import { buildTlsAgent } from "./tls.js";
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
]
|
|
2
|
+
/**
 * Candidate PromQL templates for each built-in metric, in preference order.
 *
 * Each candidate carries:
 *   - `seriesName`: the series whose presence is probed to decide whether the
 *     candidate applies to a given service, and
 *   - `query`: a PromQL template in which `{{selector}}` is later replaced by
 *     the service label matcher.
 *
 * The table is created on a null prototype because it is indexed with
 * caller-supplied metric names: on a plain object a name such as
 * "constructor" or "toString" would resolve to an inherited member and be
 * mistaken for a candidate list.
 */
const PROMETHEUS_METRIC_CANDIDATES = Object.assign(Object.create(null), {
    cpu: [
        {
            seriesName: "process_cpu_seconds_total",
            query: 'rate(process_cpu_seconds_total{ {{selector}} }[1m]) * 100',
        },
        {
            seriesName: "node_cpu_seconds_total",
            query: '100 - avg(rate(node_cpu_seconds_total{ {{selector}}, mode="idle" }[1m])) * 100',
        },
    ],
    memory: [
        {
            seriesName: "process_resident_memory_bytes",
            query: 'process_resident_memory_bytes{ {{selector}} }',
        },
        {
            seriesName: "node_memory_MemTotal_bytes",
            query: '(node_memory_MemTotal_bytes{ {{selector}} } - node_memory_MemAvailable_bytes{ {{selector}} })',
        },
    ],
    request_rate: [
        {
            seriesName: "http_requests_total",
            query: 'sum(rate(http_requests_total{ {{selector}} }[1m]))',
        },
    ],
    error_rate: [
        {
            seriesName: "http_requests_total",
            query: 'sum(rate(http_requests_total{ {{selector}}, status=~"5.." }[1m]))',
        },
    ],
    latency_p99: [
        {
            seriesName: "http_request_duration_seconds_bucket",
            query: 'histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{ {{selector}} }[1m])) by (le))',
        },
    ],
    latency_p50: [
        {
            seriesName: "http_request_duration_seconds_bucket",
            query: 'histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket{ {{selector}} }[1m])) by (le))',
        },
    ],
    latency_avg: [
        {
            seriesName: "http_request_duration_seconds_sum",
            query: 'sum(rate(http_request_duration_seconds_sum{ {{selector}} }[1m])) / sum(rate(http_request_duration_seconds_count{ {{selector}} }[1m]))',
        },
    ],
});
/**
 * Unit and human-readable description for each built-in metric. Keys must
 * mirror PROMETHEUS_METRIC_CANDIDATES, since the default metric list below
 * looks up one entry here per candidate key. Null-prototype for the same
 * reason as the candidate table.
 */
const DEFAULT_METRIC_META = Object.assign(Object.create(null), {
    cpu: { unit: "percent", description: "CPU usage % (auto: prom-client process_cpu_seconds_total or node_exporter node_cpu_seconds_total)" },
    memory: { unit: "bytes", description: "Resident memory bytes (auto: prom-client process_resident_memory_bytes or node_memory used)" },
    request_rate: { unit: "req/s", description: "HTTP request rate (http_requests_total)" },
    error_rate: { unit: "req/s", description: "HTTP 5xx error rate (http_requests_total filtered by status)" },
    latency_p99: { unit: "seconds", description: "99th percentile latency (http_request_duration_seconds_bucket)" },
    latency_p50: { unit: "seconds", description: "50th percentile latency (http_request_duration_seconds_bucket)" },
    latency_avg: { unit: "seconds", description: "Average request latency (sum/count ratio)" },
});
/**
 * Default metric definitions: one per built-in metric, using the first
 * (preferred) candidate's query together with that metric's unit and
 * description.
 */
const DEFAULT_PROMETHEUS_METRICS = Object.keys(PROMETHEUS_METRIC_CANDIDATES).map((name) => ({
    name,
    query: PROMETHEUS_METRIC_CANDIDATES[name][0].query,
    unit: DEFAULT_METRIC_META[name].unit,
    description: DEFAULT_METRIC_META[name].description,
}));
// Label names tried, in this order, when working out which label identifies a service.
const DEFAULT_SERVICE_LABELS = ["job", "service", "app", "service_name"];
// Lifetime of cached lookups, in milliseconds (60 seconds).
const LABEL_CACHE_TTL_MS = 60_000;
|
|
55
71
|
export class PrometheusConnector {
|
|
@@ -62,6 +78,8 @@ export class PrometheusConnector {
|
|
|
62
78
|
metrics = [];
|
|
63
79
|
serviceLabels = DEFAULT_SERVICE_LABELS;
|
|
64
80
|
labelValuesCache = new Map();
|
|
81
|
+
candidateCache = new Map();
|
|
82
|
+
userOverrides = new Set();
|
|
65
83
|
async connect(config) {
|
|
66
84
|
this.name = config.name;
|
|
67
85
|
this.baseUrl = config.url.replace(/\/$/, "");
|
|
@@ -69,7 +87,9 @@ export class PrometheusConnector {
|
|
|
69
87
|
this.tlsAgent = buildTlsAgent(config);
|
|
70
88
|
// Source-level overrides merge with defaults by name, so users can pin
|
|
71
89
|
// a single metric (e.g. cpu) to a custom query without re-listing the
|
|
72
|
-
// rest.
|
|
90
|
+
// rest. Overridden metrics skip candidate probing — the user-supplied
|
|
91
|
+
// query is used verbatim.
|
|
92
|
+
this.userOverrides = new Set((config.metrics || []).map((m) => m.name));
|
|
73
93
|
const overrides = new Map((config.metrics || []).map((m) => [m.name, m]));
|
|
74
94
|
this.metrics = DEFAULT_PROMETHEUS_METRICS.map((d) => overrides.get(d.name) || d);
|
|
75
95
|
for (const [name, m] of overrides) {
|
|
@@ -193,19 +213,75 @@ export class PrometheusConnector {
|
|
|
193
213
|
}
|
|
194
214
|
// --- Private helpers ---
|
|
195
215
|
async buildQuery(service, metric) {
|
|
196
|
-
|
|
197
|
-
|
|
216
|
+
// Resolve the service-filter label first. Candidate probing uses this
|
|
217
|
+
// label to scope existence checks per-service rather than per-source.
|
|
198
218
|
const escaped = service.replace(/\\/g, "\\\\").replace(/"/g, '\\"');
|
|
199
|
-
let promql = template;
|
|
200
219
|
let label = "job";
|
|
201
|
-
|
|
220
|
+
let template;
|
|
221
|
+
if (!this.userOverrides.has(metric) && PROMETHEUS_METRIC_CANDIDATES[metric]) {
|
|
202
222
|
label = await this.resolveServiceLabel(service);
|
|
223
|
+
const candidate = await this.pickMetricCandidate(metric, label, escaped);
|
|
224
|
+
template = candidate?.query || PROMETHEUS_METRIC_CANDIDATES[metric][0].query;
|
|
225
|
+
}
|
|
226
|
+
else {
|
|
227
|
+
const def = this.metrics.find((m) => m.name === metric);
|
|
228
|
+
template = def?.query || `${metric}{ {{selector}} }`;
|
|
229
|
+
}
|
|
230
|
+
let promql = template;
|
|
231
|
+
if (template.includes("{{selector}}")) {
|
|
232
|
+
// Resolve label here for non-candidate paths that haven't done it yet.
|
|
233
|
+
if (label === "job" && !PROMETHEUS_METRIC_CANDIDATES[metric]) {
|
|
234
|
+
label = await this.resolveServiceLabel(service);
|
|
235
|
+
}
|
|
203
236
|
const selector = `${label}="${escaped}"`;
|
|
204
237
|
promql = promql.replace(/\{\{selector\}\}/g, selector);
|
|
205
238
|
}
|
|
206
239
|
promql = promql.replace(/\{\{service\}\}/g, escaped);
|
|
207
240
|
return { promql, label };
|
|
208
241
|
}
|
|
242
|
+
async pickMetricCandidate(metric, label, escapedService) {
|
|
243
|
+
const candidates = PROMETHEUS_METRIC_CANDIDATES[metric];
|
|
244
|
+
if (!candidates || candidates.length === 0)
|
|
245
|
+
return null;
|
|
246
|
+
if (candidates.length === 1)
|
|
247
|
+
return candidates[0];
|
|
248
|
+
// Per-service cache: a source can have BOTH process_* and node_* series
|
|
249
|
+
// present (e.g. an apps stack alongside node_exporter), so probing has
|
|
250
|
+
// to check whether THIS service has the series, not whether the source
|
|
251
|
+
// has it anywhere.
|
|
252
|
+
const cacheKey = `${metric}|${label}|${escapedService}`;
|
|
253
|
+
const cached = this.candidateCache.get(cacheKey);
|
|
254
|
+
if (cached && cached.expiresAt > Date.now())
|
|
255
|
+
return cached.candidate;
|
|
256
|
+
for (const c of candidates) {
|
|
257
|
+
if (await this.seriesExistsForService(c.seriesName, label, escapedService)) {
|
|
258
|
+
this.candidateCache.set(cacheKey, {
|
|
259
|
+
candidate: c,
|
|
260
|
+
expiresAt: Date.now() + LABEL_CACHE_TTL_MS,
|
|
261
|
+
});
|
|
262
|
+
return c;
|
|
263
|
+
}
|
|
264
|
+
}
|
|
265
|
+
// Nothing found — return first candidate as best-effort. Cache the
|
|
266
|
+
// negative outcome so we don't probe again for 60s.
|
|
267
|
+
const fallback = candidates[0];
|
|
268
|
+
this.candidateCache.set(cacheKey, {
|
|
269
|
+
candidate: fallback,
|
|
270
|
+
expiresAt: Date.now() + LABEL_CACHE_TTL_MS,
|
|
271
|
+
});
|
|
272
|
+
return fallback;
|
|
273
|
+
}
|
|
274
|
+
async seriesExistsForService(seriesName, label, escapedService) {
|
|
275
|
+
try {
|
|
276
|
+
const matchExpr = `${seriesName}{${label}="${escapedService}"}`;
|
|
277
|
+
const url = `/api/v1/series?match[]=${encodeURIComponent(matchExpr)}`;
|
|
278
|
+
const data = await this.apiGet(url);
|
|
279
|
+
return Array.isArray(data?.data) && data.data.length > 0;
|
|
280
|
+
}
|
|
281
|
+
catch {
|
|
282
|
+
return false;
|
|
283
|
+
}
|
|
284
|
+
}
|
|
209
285
|
async resolveServiceLabel(service) {
|
|
210
286
|
for (const label of this.serviceLabels) {
|
|
211
287
|
const values = await this.getLabelValues(label);
|
package/package.json
CHANGED