@thotischner/observability-mcp 1.1.3 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/sources.yaml +15 -5
- package/dist/connectors/prometheus.d.ts +8 -0
- package/dist/connectors/prometheus.js +168 -19
- package/dist/tools/validation.js +6 -2
- package/dist/types.d.ts +2 -0
- package/package.json +1 -1
package/config/sources.yaml
CHANGED
|
@@ -3,11 +3,21 @@ sources:
|
|
|
3
3
|
type: prometheus
|
|
4
4
|
url: http://prometheus:9090
|
|
5
5
|
enabled: true
|
|
6
|
-
#
|
|
7
|
-
#
|
|
8
|
-
#
|
|
9
|
-
#
|
|
10
|
-
#
|
|
6
|
+
# The demo services emit synthetic CPU/memory gauges that the chaos
|
|
7
|
+
# endpoints drive directly, so we override the connector defaults
|
|
8
|
+
# (which target prom-client conventions) with the demo-specific names.
|
|
9
|
+
# request_rate / error_rate / latency_* keep the prom-client defaults
|
|
10
|
+
# and apply unchanged because the demo also emits http_requests_total
|
|
11
|
+
# and http_request_duration_seconds_bucket.
|
|
12
|
+
metrics:
|
|
13
|
+
- name: cpu
|
|
14
|
+
query: 'service_cpu_usage_percent{job="{{service}}"}'
|
|
15
|
+
unit: percent
|
|
16
|
+
description: Synthetic CPU gauge driven by /chaos endpoints
|
|
17
|
+
- name: memory
|
|
18
|
+
query: 'service_memory_usage_bytes{job="{{service}}"}'
|
|
19
|
+
unit: bytes
|
|
20
|
+
description: Synthetic memory gauge driven by /chaos endpoints
|
|
11
21
|
|
|
12
22
|
- name: loki
|
|
13
23
|
type: loki
|
|
@@ -8,6 +8,10 @@ export declare class PrometheusConnector implements ObservabilityConnector {
|
|
|
8
8
|
private auth?;
|
|
9
9
|
private tlsAgent?;
|
|
10
10
|
private metrics;
|
|
11
|
+
private serviceLabels;
|
|
12
|
+
private labelValuesCache;
|
|
13
|
+
private metricNamesCache;
|
|
14
|
+
private userOverrides;
|
|
11
15
|
connect(config: SourceConfig): Promise<void>;
|
|
12
16
|
getDefaultMetrics(): MetricDefinition[];
|
|
13
17
|
getMetrics(): MetricDefinition[];
|
|
@@ -19,6 +23,10 @@ export declare class PrometheusConnector implements ObservabilityConnector {
|
|
|
19
23
|
listAvailableMetrics(_service: string): Promise<MetricInfo[]>;
|
|
20
24
|
queryMetrics(params: MetricQuery): Promise<MetricResult>;
|
|
21
25
|
private buildQuery;
|
|
26
|
+
private pickMetricCandidate;
|
|
27
|
+
private getAllMetricNames;
|
|
28
|
+
private resolveServiceLabel;
|
|
29
|
+
private getLabelValues;
|
|
22
30
|
private getUnit;
|
|
23
31
|
private parseTimeRange;
|
|
24
32
|
private computeSummary;
|
|
@@ -1,13 +1,73 @@
|
|
|
1
1
|
import { buildTlsAgent } from "./tls.js";
|
|
2
|
-
const
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
]
|
|
2
|
+
const PROMETHEUS_METRIC_CANDIDATES = {
|
|
3
|
+
cpu: [
|
|
4
|
+
{
|
|
5
|
+
seriesName: "process_cpu_seconds_total",
|
|
6
|
+
query: 'rate(process_cpu_seconds_total{ {{selector}} }[1m]) * 100',
|
|
7
|
+
},
|
|
8
|
+
{
|
|
9
|
+
seriesName: "node_cpu_seconds_total",
|
|
10
|
+
query: '100 - avg(rate(node_cpu_seconds_total{ {{selector}}, mode="idle" }[1m])) * 100',
|
|
11
|
+
},
|
|
12
|
+
],
|
|
13
|
+
memory: [
|
|
14
|
+
{
|
|
15
|
+
seriesName: "process_resident_memory_bytes",
|
|
16
|
+
query: 'process_resident_memory_bytes{ {{selector}} }',
|
|
17
|
+
},
|
|
18
|
+
{
|
|
19
|
+
seriesName: "node_memory_MemTotal_bytes",
|
|
20
|
+
query: '(node_memory_MemTotal_bytes{ {{selector}} } - node_memory_MemAvailable_bytes{ {{selector}} })',
|
|
21
|
+
},
|
|
22
|
+
],
|
|
23
|
+
request_rate: [
|
|
24
|
+
{
|
|
25
|
+
seriesName: "http_requests_total",
|
|
26
|
+
query: 'sum(rate(http_requests_total{ {{selector}} }[1m]))',
|
|
27
|
+
},
|
|
28
|
+
],
|
|
29
|
+
error_rate: [
|
|
30
|
+
{
|
|
31
|
+
seriesName: "http_requests_total",
|
|
32
|
+
query: 'sum(rate(http_requests_total{ {{selector}}, status=~"5.." }[1m]))',
|
|
33
|
+
},
|
|
34
|
+
],
|
|
35
|
+
latency_p99: [
|
|
36
|
+
{
|
|
37
|
+
seriesName: "http_request_duration_seconds_bucket",
|
|
38
|
+
query: 'histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{ {{selector}} }[1m])) by (le))',
|
|
39
|
+
},
|
|
40
|
+
],
|
|
41
|
+
latency_p50: [
|
|
42
|
+
{
|
|
43
|
+
seriesName: "http_request_duration_seconds_bucket",
|
|
44
|
+
query: 'histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket{ {{selector}} }[1m])) by (le))',
|
|
45
|
+
},
|
|
46
|
+
],
|
|
47
|
+
latency_avg: [
|
|
48
|
+
{
|
|
49
|
+
seriesName: "http_request_duration_seconds_sum",
|
|
50
|
+
query: 'sum(rate(http_request_duration_seconds_sum{ {{selector}} }[1m])) / sum(rate(http_request_duration_seconds_count{ {{selector}} }[1m]))',
|
|
51
|
+
},
|
|
52
|
+
],
|
|
53
|
+
};
|
|
54
|
+
const DEFAULT_METRIC_META = {
|
|
55
|
+
cpu: { unit: "percent", description: "CPU usage % (auto: prom-client process_cpu_seconds_total or node_exporter node_cpu_seconds_total)" },
|
|
56
|
+
memory: { unit: "bytes", description: "Resident memory bytes (auto: prom-client process_resident_memory_bytes or node_memory used)" },
|
|
57
|
+
request_rate: { unit: "req/s", description: "HTTP request rate (http_requests_total)" },
|
|
58
|
+
error_rate: { unit: "req/s", description: "HTTP 5xx error rate (http_requests_total filtered by status)" },
|
|
59
|
+
latency_p99: { unit: "seconds", description: "99th percentile latency (http_request_duration_seconds_bucket)" },
|
|
60
|
+
latency_p50: { unit: "seconds", description: "50th percentile latency (http_request_duration_seconds_bucket)" },
|
|
61
|
+
latency_avg: { unit: "seconds", description: "Average request latency (sum/count ratio)" },
|
|
62
|
+
};
|
|
63
|
+
const DEFAULT_PROMETHEUS_METRICS = Object.keys(PROMETHEUS_METRIC_CANDIDATES).map((name) => ({
|
|
64
|
+
name,
|
|
65
|
+
query: PROMETHEUS_METRIC_CANDIDATES[name][0].query,
|
|
66
|
+
unit: DEFAULT_METRIC_META[name].unit,
|
|
67
|
+
description: DEFAULT_METRIC_META[name].description,
|
|
68
|
+
}));
|
|
69
|
+
const DEFAULT_SERVICE_LABELS = ["job", "service", "app", "service_name"];
|
|
70
|
+
const LABEL_CACHE_TTL_MS = 60_000;
|
|
11
71
|
export class PrometheusConnector {
|
|
12
72
|
type = "prometheus";
|
|
13
73
|
signalType = "metrics";
|
|
@@ -16,15 +76,31 @@ export class PrometheusConnector {
|
|
|
16
76
|
auth;
|
|
17
77
|
tlsAgent;
|
|
18
78
|
metrics = [];
|
|
79
|
+
serviceLabels = DEFAULT_SERVICE_LABELS;
|
|
80
|
+
labelValuesCache = new Map();
|
|
81
|
+
metricNamesCache = null;
|
|
82
|
+
userOverrides = new Set();
|
|
19
83
|
async connect(config) {
|
|
20
84
|
this.name = config.name;
|
|
21
85
|
this.baseUrl = config.url.replace(/\/$/, "");
|
|
22
86
|
this.auth = config.auth;
|
|
23
87
|
this.tlsAgent = buildTlsAgent(config);
|
|
24
|
-
//
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
88
|
+
// Source-level overrides merge with defaults by name, so users can pin
|
|
89
|
+
// a single metric (e.g. cpu) to a custom query without re-listing the
|
|
90
|
+
// rest. Overridden metrics skip candidate probing — the user-supplied
|
|
91
|
+
// query is used verbatim.
|
|
92
|
+
this.userOverrides = new Set((config.metrics || []).map((m) => m.name));
|
|
93
|
+
const overrides = new Map((config.metrics || []).map((m) => [m.name, m]));
|
|
94
|
+
this.metrics = DEFAULT_PROMETHEUS_METRICS.map((d) => overrides.get(d.name) || d);
|
|
95
|
+
for (const [name, m] of overrides) {
|
|
96
|
+
if (!DEFAULT_PROMETHEUS_METRICS.some((d) => d.name === name)) {
|
|
97
|
+
this.metrics.push(m);
|
|
98
|
+
}
|
|
99
|
+
}
|
|
100
|
+
const envLabels = process.env.PROMETHEUS_SERVICE_LABELS;
|
|
101
|
+
if (envLabels) {
|
|
102
|
+
this.serviceLabels = envLabels.split(",").map((s) => s.trim()).filter(Boolean);
|
|
103
|
+
}
|
|
28
104
|
}
|
|
29
105
|
getDefaultMetrics() {
|
|
30
106
|
return DEFAULT_PROMETHEUS_METRICS;
|
|
@@ -111,7 +187,7 @@ export class PrometheusConnector {
|
|
|
111
187
|
return metrics;
|
|
112
188
|
}
|
|
113
189
|
async queryMetrics(params) {
|
|
114
|
-
const promql = this.buildQuery(params.service, params.metric);
|
|
190
|
+
const { promql, label } = await this.buildQuery(params.service, params.metric);
|
|
115
191
|
const { start, end, step } = this.parseTimeRange(params.duration, params.step);
|
|
116
192
|
const data = await this.apiGet(`/api/v1/query_range?query=${encodeURIComponent(promql)}&start=${start}&end=${end}&step=${step}`);
|
|
117
193
|
const values = [];
|
|
@@ -131,15 +207,88 @@ export class PrometheusConnector {
|
|
|
131
207
|
unit: this.getUnit(params.metric),
|
|
132
208
|
values,
|
|
133
209
|
summary: this.computeSummary(rawValues),
|
|
210
|
+
resolvedSeries: promql,
|
|
211
|
+
resolvedLabel: label,
|
|
134
212
|
};
|
|
135
213
|
}
|
|
136
214
|
// --- Private helpers ---
|
|
137
|
-
buildQuery(service, metric) {
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
215
|
+
async buildQuery(service, metric) {
|
|
216
|
+
// Pick the query template. For built-in metrics with no user override,
|
|
217
|
+
// probe candidate series in the backend and pick the first that exists
|
|
218
|
+
// (e.g. prom-client process_cpu_seconds_total → falls back to
|
|
219
|
+
// node_exporter node_cpu_seconds_total). User-overridden metrics use
|
|
220
|
+
// their query verbatim.
|
|
221
|
+
let template;
|
|
222
|
+
if (!this.userOverrides.has(metric) && PROMETHEUS_METRIC_CANDIDATES[metric]) {
|
|
223
|
+
const candidate = await this.pickMetricCandidate(metric);
|
|
224
|
+
template = candidate?.query || PROMETHEUS_METRIC_CANDIDATES[metric][0].query;
|
|
225
|
+
}
|
|
226
|
+
else {
|
|
227
|
+
const def = this.metrics.find((m) => m.name === metric);
|
|
228
|
+
template = def?.query || `${metric}{ {{selector}} }`;
|
|
229
|
+
}
|
|
230
|
+
const escaped = service.replace(/\\/g, "\\\\").replace(/"/g, '\\"');
|
|
231
|
+
let promql = template;
|
|
232
|
+
let label = "job";
|
|
233
|
+
if (template.includes("{{selector}}")) {
|
|
234
|
+
label = await this.resolveServiceLabel(service);
|
|
235
|
+
const selector = `${label}="${escaped}"`;
|
|
236
|
+
promql = promql.replace(/\{\{selector\}\}/g, selector);
|
|
237
|
+
}
|
|
238
|
+
promql = promql.replace(/\{\{service\}\}/g, escaped);
|
|
239
|
+
return { promql, label };
|
|
240
|
+
}
|
|
241
|
+
async pickMetricCandidate(metric) {
|
|
242
|
+
const candidates = PROMETHEUS_METRIC_CANDIDATES[metric];
|
|
243
|
+
if (!candidates || candidates.length === 0)
|
|
244
|
+
return null;
|
|
245
|
+
if (candidates.length === 1)
|
|
246
|
+
return candidates[0];
|
|
247
|
+
const allNames = await this.getAllMetricNames();
|
|
248
|
+
for (const c of candidates) {
|
|
249
|
+
if (allNames.has(c.seriesName))
|
|
250
|
+
return c;
|
|
251
|
+
}
|
|
252
|
+
return candidates[0];
|
|
253
|
+
}
|
|
254
|
+
async getAllMetricNames() {
|
|
255
|
+
if (this.metricNamesCache && this.metricNamesCache.expiresAt > Date.now()) {
|
|
256
|
+
return this.metricNamesCache.values;
|
|
257
|
+
}
|
|
258
|
+
try {
|
|
259
|
+
const data = await this.apiGet("/api/v1/label/__name__/values");
|
|
260
|
+
const values = new Set(data?.data || []);
|
|
261
|
+
this.metricNamesCache = { values, expiresAt: Date.now() + LABEL_CACHE_TTL_MS };
|
|
262
|
+
return values;
|
|
263
|
+
}
|
|
264
|
+
catch {
|
|
265
|
+
this.metricNamesCache = { values: new Set(), expiresAt: Date.now() + LABEL_CACHE_TTL_MS };
|
|
266
|
+
return new Set();
|
|
267
|
+
}
|
|
268
|
+
}
|
|
269
|
+
async resolveServiceLabel(service) {
|
|
270
|
+
for (const label of this.serviceLabels) {
|
|
271
|
+
const values = await this.getLabelValues(label);
|
|
272
|
+
if (values.includes(service))
|
|
273
|
+
return label;
|
|
274
|
+
}
|
|
275
|
+
return this.serviceLabels[0] || "job";
|
|
276
|
+
}
|
|
277
|
+
async getLabelValues(label) {
|
|
278
|
+
const cached = this.labelValuesCache.get(label);
|
|
279
|
+
if (cached && cached.expiresAt > Date.now()) {
|
|
280
|
+
return cached.values;
|
|
281
|
+
}
|
|
282
|
+
try {
|
|
283
|
+
const data = await this.apiGet(`/api/v1/label/${encodeURIComponent(label)}/values`);
|
|
284
|
+
const values = data?.data || [];
|
|
285
|
+
this.labelValuesCache.set(label, { values, expiresAt: Date.now() + LABEL_CACHE_TTL_MS });
|
|
286
|
+
return values;
|
|
287
|
+
}
|
|
288
|
+
catch {
|
|
289
|
+
this.labelValuesCache.set(label, { values: [], expiresAt: Date.now() + LABEL_CACHE_TTL_MS });
|
|
290
|
+
return [];
|
|
141
291
|
}
|
|
142
|
-
return `${metric}{job="${service}"}`;
|
|
143
292
|
}
|
|
144
293
|
getUnit(metric) {
|
|
145
294
|
const def = this.metrics.find((m) => m.name === metric);
|
package/dist/tools/validation.js
CHANGED
|
@@ -1,5 +1,9 @@
|
|
|
1
1
|
const DURATION_RE = /^\d+[mhd]$/;
|
|
2
|
-
|
|
2
|
+
// Slashes are valid in Prometheus label values and appear in real-world job
|
|
3
|
+
// names (Grafana Cloud Integrations like "integrations/unix", k8s namespaces,
|
|
4
|
+
// Docker image refs). The PromQL/LogQL injection surface is the surrounding
|
|
5
|
+
// quote/backslash, which we escape separately, not these characters.
|
|
6
|
+
const SAFE_LABEL_RE = /^[a-zA-Z0-9_\-.:/]+$/;
|
|
3
7
|
export function validateDuration(duration) {
|
|
4
8
|
if (!DURATION_RE.test(duration)) {
|
|
5
9
|
return `Invalid duration "${duration}". Expected format: <number><unit> where unit is m (minutes), h (hours), or d (days). Examples: 5m, 1h, 24h, 7d`;
|
|
@@ -33,7 +37,7 @@ export function sanitizeLabelValue(value) {
|
|
|
33
37
|
}
|
|
34
38
|
export function validateServiceName(service) {
|
|
35
39
|
if (!sanitizeLabelValue(service)) {
|
|
36
|
-
return `Invalid service name "${service}". Only alphanumeric characters, hyphens, underscores, dots, and
|
|
40
|
+
return `Invalid service name "${service}". Only alphanumeric characters, hyphens, underscores, dots, colons, and slashes are allowed (max 128 chars).`;
|
|
37
41
|
}
|
|
38
42
|
return null;
|
|
39
43
|
}
|
package/dist/types.d.ts
CHANGED
package/package.json
CHANGED