@thotischner/observability-mcp 1.1.2 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/sources.yaml +15 -5
- package/dist/connectors/loki.d.ts +1 -1
- package/dist/connectors/loki.js +22 -12
- package/dist/connectors/prometheus.d.ts +4 -0
- package/dist/connectors/prometheus.js +105 -16
- package/dist/tools/validation.js +6 -2
- package/dist/types.d.ts +2 -0
- package/package.json +1 -1
package/config/sources.yaml
CHANGED
|
@@ -3,11 +3,21 @@ sources:
|
|
|
3
3
|
type: prometheus
|
|
4
4
|
url: http://prometheus:9090
|
|
5
5
|
enabled: true
|
|
6
|
-
#
|
|
7
|
-
#
|
|
8
|
-
#
|
|
9
|
-
#
|
|
10
|
-
#
|
|
6
|
+
# The demo services emit synthetic CPU/memory gauges that the chaos
|
|
7
|
+
# endpoints drive directly, so we override the connector defaults
|
|
8
|
+
# (which target prom-client conventions) with the demo-specific names.
|
|
9
|
+
# request_rate / error_rate / latency_* keep the prom-client defaults
|
|
10
|
+
# and apply unchanged because the demo also emits http_requests_total
|
|
11
|
+
# and http_request_duration_seconds_bucket.
|
|
12
|
+
metrics:
|
|
13
|
+
- name: cpu
|
|
14
|
+
query: 'service_cpu_usage_percent{job="{{service}}"}'
|
|
15
|
+
unit: percent
|
|
16
|
+
description: Synthetic CPU gauge driven by /chaos endpoints
|
|
17
|
+
- name: memory
|
|
18
|
+
query: 'service_memory_usage_bytes{job="{{service}}"}'
|
|
19
|
+
unit: bytes
|
|
20
|
+
description: Synthetic memory gauge driven by /chaos endpoints
|
|
11
21
|
|
|
12
22
|
- name: loki
|
|
13
23
|
type: loki
|
|
@@ -18,7 +18,7 @@ export declare class LokiConnector implements ObservabilityConnector {
|
|
|
18
18
|
listServices(): Promise<ServiceInfo[]>;
|
|
19
19
|
queryLogs(params: LogQuery): Promise<LogResult>;
|
|
20
20
|
private getLabelValues;
|
|
21
|
-
private
|
|
21
|
+
private resolveServiceSelector;
|
|
22
22
|
private parseLine;
|
|
23
23
|
private extractTopPatterns;
|
|
24
24
|
private parseTimeRange;
|
package/dist/connectors/loki.js
CHANGED
|
@@ -61,10 +61,14 @@ export class LokiConnector {
|
|
|
61
61
|
const seen = new Map();
|
|
62
62
|
for (const label of this.serviceLabels) {
|
|
63
63
|
const values = await this.getLabelValues(label);
|
|
64
|
-
for (const
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
64
|
+
for (const raw of values) {
|
|
65
|
+
// Docker's loki.source.docker writes container names with a leading '/'
|
|
66
|
+
// (Docker API Names[0] convention). Strip it for display so the name
|
|
67
|
+
// matches what the service-name validator and users will pass back in.
|
|
68
|
+
const display = label === "container" ? raw.replace(/^\//, "") : raw;
|
|
69
|
+
if (!seen.has(display)) {
|
|
70
|
+
seen.set(display, {
|
|
71
|
+
name: display,
|
|
68
72
|
source: this.name,
|
|
69
73
|
signalType: "logs",
|
|
70
74
|
labels: { discoveredVia: label },
|
|
@@ -77,11 +81,12 @@ export class LokiConnector {
|
|
|
77
81
|
async queryLogs(params) {
|
|
78
82
|
const { start, end } = this.parseTimeRange(params.duration);
|
|
79
83
|
const limit = Math.min(Math.max(params.limit || 100, 1), 1000);
|
|
80
|
-
// Resolve
|
|
81
|
-
//
|
|
82
|
-
//
|
|
83
|
-
|
|
84
|
-
const
|
|
84
|
+
// Resolve label + actual selector value. For the 'container' label the
|
|
85
|
+
// value stored in Loki may be '/my-app-1' while the caller passes the
|
|
86
|
+
// sanitized 'my-app-1' — return the prefixed form so the LogQL selector
|
|
87
|
+
// matches the real stream.
|
|
88
|
+
const { label: matchedLabel, value: rawValue } = await this.resolveServiceSelector(params.service);
|
|
89
|
+
const service = this.escapeLogQLValue(rawValue);
|
|
85
90
|
let logql = `{${matchedLabel}="${service}"}`;
|
|
86
91
|
if (params.level) {
|
|
87
92
|
const level = this.escapeLogQLValue(params.level);
|
|
@@ -148,13 +153,18 @@ export class LokiConnector {
|
|
|
148
153
|
return [];
|
|
149
154
|
}
|
|
150
155
|
}
|
|
151
|
-
async
|
|
156
|
+
async resolveServiceSelector(service) {
|
|
152
157
|
for (const label of this.serviceLabels) {
|
|
153
158
|
const values = await this.getLabelValues(label);
|
|
154
159
|
if (values.includes(service))
|
|
155
|
-
return label;
|
|
160
|
+
return { label, value: service };
|
|
161
|
+
// Container label values are Docker-prefixed with '/'. The caller can't
|
|
162
|
+
// pass that form (validator rejects '/'), so probe the prefixed variant.
|
|
163
|
+
if (label === "container" && values.includes(`/${service}`)) {
|
|
164
|
+
return { label, value: `/${service}` };
|
|
165
|
+
}
|
|
156
166
|
}
|
|
157
|
-
return this.serviceLabels[0] || "service_name";
|
|
167
|
+
return { label: this.serviceLabels[0] || "service_name", value: service };
|
|
158
168
|
}
|
|
159
169
|
parseLine(line) {
|
|
160
170
|
try {
|
|
@@ -8,6 +8,8 @@ export declare class PrometheusConnector implements ObservabilityConnector {
|
|
|
8
8
|
private auth?;
|
|
9
9
|
private tlsAgent?;
|
|
10
10
|
private metrics;
|
|
11
|
+
private serviceLabels;
|
|
12
|
+
private labelValuesCache;
|
|
11
13
|
connect(config: SourceConfig): Promise<void>;
|
|
12
14
|
getDefaultMetrics(): MetricDefinition[];
|
|
13
15
|
getMetrics(): MetricDefinition[];
|
|
@@ -19,6 +21,8 @@ export declare class PrometheusConnector implements ObservabilityConnector {
|
|
|
19
21
|
listAvailableMetrics(_service: string): Promise<MetricInfo[]>;
|
|
20
22
|
queryMetrics(params: MetricQuery): Promise<MetricResult>;
|
|
21
23
|
private buildQuery;
|
|
24
|
+
private resolveServiceLabel;
|
|
25
|
+
private getLabelValues;
|
|
22
26
|
private getUnit;
|
|
23
27
|
private parseTimeRange;
|
|
24
28
|
private computeSummary;
|
|
@@ -1,13 +1,57 @@
|
|
|
1
1
|
import { buildTlsAgent } from "./tls.js";
|
|
2
|
+
// Defaults target prom-client conventions, the de-facto standard for
|
|
3
|
+
// Node.js/Express instrumentation and what most apps emit out of the box.
|
|
4
|
+
// {{selector}} is replaced at query time with the discovered label/value
|
|
5
|
+
// pair (e.g. job="my-svc"); the connector probes job → service → app →
|
|
6
|
+
// service_name to find which label carries the requested service name.
|
|
7
|
+
// {{service}} (literal value) is still supported for back-compat with
|
|
8
|
+
// user-provided overrides.
|
|
2
9
|
const DEFAULT_PROMETHEUS_METRICS = [
|
|
3
|
-
{
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
{
|
|
10
|
+
{
|
|
11
|
+
name: "cpu",
|
|
12
|
+
query: 'rate(process_cpu_seconds_total{ {{selector}} }[1m]) * 100',
|
|
13
|
+
unit: "percent",
|
|
14
|
+
description: "CPU usage % (rate of process_cpu_seconds_total — prom-client default)",
|
|
15
|
+
},
|
|
16
|
+
{
|
|
17
|
+
name: "memory",
|
|
18
|
+
query: 'process_resident_memory_bytes{ {{selector}} }',
|
|
19
|
+
unit: "bytes",
|
|
20
|
+
description: "Resident memory in bytes (prom-client default)",
|
|
21
|
+
},
|
|
22
|
+
{
|
|
23
|
+
name: "request_rate",
|
|
24
|
+
query: 'sum(rate(http_requests_total{ {{selector}} }[1m]))',
|
|
25
|
+
unit: "req/s",
|
|
26
|
+
description: "HTTP request rate",
|
|
27
|
+
},
|
|
28
|
+
{
|
|
29
|
+
name: "error_rate",
|
|
30
|
+
query: 'sum(rate(http_requests_total{ {{selector}}, status=~"5.." }[1m]))',
|
|
31
|
+
unit: "req/s",
|
|
32
|
+
description: "HTTP 5xx error rate",
|
|
33
|
+
},
|
|
34
|
+
{
|
|
35
|
+
name: "latency_p99",
|
|
36
|
+
query: 'histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket{ {{selector}} }[1m])) by (le))',
|
|
37
|
+
unit: "seconds",
|
|
38
|
+
description: "99th percentile latency",
|
|
39
|
+
},
|
|
40
|
+
{
|
|
41
|
+
name: "latency_p50",
|
|
42
|
+
query: 'histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket{ {{selector}} }[1m])) by (le))',
|
|
43
|
+
unit: "seconds",
|
|
44
|
+
description: "50th percentile latency",
|
|
45
|
+
},
|
|
46
|
+
{
|
|
47
|
+
name: "latency_avg",
|
|
48
|
+
query: 'sum(rate(http_request_duration_seconds_sum{ {{selector}} }[1m])) / sum(rate(http_request_duration_seconds_count{ {{selector}} }[1m]))',
|
|
49
|
+
unit: "seconds",
|
|
50
|
+
description: "Average request latency",
|
|
51
|
+
},
|
|
10
52
|
];
|
|
53
|
+
const DEFAULT_SERVICE_LABELS = ["job", "service", "app", "service_name"];
|
|
54
|
+
const LABEL_CACHE_TTL_MS = 60_000;
|
|
11
55
|
export class PrometheusConnector {
|
|
12
56
|
type = "prometheus";
|
|
13
57
|
signalType = "metrics";
|
|
@@ -16,15 +60,27 @@ export class PrometheusConnector {
|
|
|
16
60
|
auth;
|
|
17
61
|
tlsAgent;
|
|
18
62
|
metrics = [];
|
|
63
|
+
serviceLabels = DEFAULT_SERVICE_LABELS;
|
|
64
|
+
labelValuesCache = new Map();
|
|
19
65
|
async connect(config) {
|
|
20
66
|
this.name = config.name;
|
|
21
67
|
this.baseUrl = config.url.replace(/\/$/, "");
|
|
22
68
|
this.auth = config.auth;
|
|
23
69
|
this.tlsAgent = buildTlsAgent(config);
|
|
24
|
-
//
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
70
|
+
// Source-level overrides merge with defaults by name, so users can pin
|
|
71
|
+
// a single metric (e.g. cpu) to a custom query without re-listing the
|
|
72
|
+
// rest. To fully replace the defaults, override every metric explicitly.
|
|
73
|
+
const overrides = new Map((config.metrics || []).map((m) => [m.name, m]));
|
|
74
|
+
this.metrics = DEFAULT_PROMETHEUS_METRICS.map((d) => overrides.get(d.name) || d);
|
|
75
|
+
for (const [name, m] of overrides) {
|
|
76
|
+
if (!DEFAULT_PROMETHEUS_METRICS.some((d) => d.name === name)) {
|
|
77
|
+
this.metrics.push(m);
|
|
78
|
+
}
|
|
79
|
+
}
|
|
80
|
+
const envLabels = process.env.PROMETHEUS_SERVICE_LABELS;
|
|
81
|
+
if (envLabels) {
|
|
82
|
+
this.serviceLabels = envLabels.split(",").map((s) => s.trim()).filter(Boolean);
|
|
83
|
+
}
|
|
28
84
|
}
|
|
29
85
|
getDefaultMetrics() {
|
|
30
86
|
return DEFAULT_PROMETHEUS_METRICS;
|
|
@@ -111,7 +167,7 @@ export class PrometheusConnector {
|
|
|
111
167
|
return metrics;
|
|
112
168
|
}
|
|
113
169
|
async queryMetrics(params) {
|
|
114
|
-
const promql = this.buildQuery(params.service, params.metric);
|
|
170
|
+
const { promql, label } = await this.buildQuery(params.service, params.metric);
|
|
115
171
|
const { start, end, step } = this.parseTimeRange(params.duration, params.step);
|
|
116
172
|
const data = await this.apiGet(`/api/v1/query_range?query=${encodeURIComponent(promql)}&start=${start}&end=${end}&step=${step}`);
|
|
117
173
|
const values = [];
|
|
@@ -131,15 +187,48 @@ export class PrometheusConnector {
|
|
|
131
187
|
unit: this.getUnit(params.metric),
|
|
132
188
|
values,
|
|
133
189
|
summary: this.computeSummary(rawValues),
|
|
190
|
+
resolvedSeries: promql,
|
|
191
|
+
resolvedLabel: label,
|
|
134
192
|
};
|
|
135
193
|
}
|
|
136
194
|
// --- Private helpers ---
|
|
137
|
-
buildQuery(service, metric) {
|
|
195
|
+
async buildQuery(service, metric) {
|
|
138
196
|
const def = this.metrics.find((m) => m.name === metric);
|
|
139
|
-
|
|
140
|
-
|
|
197
|
+
const template = def?.query || `${metric}{ {{selector}} }`;
|
|
198
|
+
const escaped = service.replace(/\\/g, "\\\\").replace(/"/g, '\\"');
|
|
199
|
+
let promql = template;
|
|
200
|
+
let label = "job";
|
|
201
|
+
if (template.includes("{{selector}}")) {
|
|
202
|
+
label = await this.resolveServiceLabel(service);
|
|
203
|
+
const selector = `${label}="${escaped}"`;
|
|
204
|
+
promql = promql.replace(/\{\{selector\}\}/g, selector);
|
|
205
|
+
}
|
|
206
|
+
promql = promql.replace(/\{\{service\}\}/g, escaped);
|
|
207
|
+
return { promql, label };
|
|
208
|
+
}
|
|
209
|
+
async resolveServiceLabel(service) {
|
|
210
|
+
for (const label of this.serviceLabels) {
|
|
211
|
+
const values = await this.getLabelValues(label);
|
|
212
|
+
if (values.includes(service))
|
|
213
|
+
return label;
|
|
214
|
+
}
|
|
215
|
+
return this.serviceLabels[0] || "job";
|
|
216
|
+
}
|
|
217
|
+
async getLabelValues(label) {
|
|
218
|
+
const cached = this.labelValuesCache.get(label);
|
|
219
|
+
if (cached && cached.expiresAt > Date.now()) {
|
|
220
|
+
return cached.values;
|
|
221
|
+
}
|
|
222
|
+
try {
|
|
223
|
+
const data = await this.apiGet(`/api/v1/label/${encodeURIComponent(label)}/values`);
|
|
224
|
+
const values = data?.data || [];
|
|
225
|
+
this.labelValuesCache.set(label, { values, expiresAt: Date.now() + LABEL_CACHE_TTL_MS });
|
|
226
|
+
return values;
|
|
227
|
+
}
|
|
228
|
+
catch {
|
|
229
|
+
this.labelValuesCache.set(label, { values: [], expiresAt: Date.now() + LABEL_CACHE_TTL_MS });
|
|
230
|
+
return [];
|
|
141
231
|
}
|
|
142
|
-
return `${metric}{job="${service}"}`;
|
|
143
232
|
}
|
|
144
233
|
getUnit(metric) {
|
|
145
234
|
const def = this.metrics.find((m) => m.name === metric);
|
package/dist/tools/validation.js
CHANGED
|
@@ -1,5 +1,9 @@
|
|
|
1
1
|
const DURATION_RE = /^\d+[mhd]$/;
|
|
2
|
-
|
|
2
|
+
// Slashes are valid in Prometheus label values and appear in real-world job
|
|
3
|
+
// names (Grafana Cloud Integrations like "integrations/unix", k8s namespaces,
|
|
4
|
+
// Docker image refs). The PromQL/LogQL injection surface is the surrounding
|
|
5
|
+
// quote/backslash, which we escape separately, not these characters.
|
|
6
|
+
const SAFE_LABEL_RE = /^[a-zA-Z0-9_\-.:/]+$/;
|
|
3
7
|
export function validateDuration(duration) {
|
|
4
8
|
if (!DURATION_RE.test(duration)) {
|
|
5
9
|
return `Invalid duration "${duration}". Expected format: <number><unit> where unit is m (minutes), h (hours), or d (days). Examples: 5m, 1h, 24h, 7d`;
|
|
@@ -33,7 +37,7 @@ export function sanitizeLabelValue(value) {
|
|
|
33
37
|
}
|
|
34
38
|
export function validateServiceName(service) {
|
|
35
39
|
if (!sanitizeLabelValue(service)) {
|
|
36
|
-
return `Invalid service name "${service}". Only alphanumeric characters, hyphens, underscores, dots, and
|
|
40
|
+
return `Invalid service name "${service}". Only alphanumeric characters, hyphens, underscores, dots, colons, and slashes are allowed (max 128 chars).`;
|
|
37
41
|
}
|
|
38
42
|
return null;
|
|
39
43
|
}
|
package/dist/types.d.ts
CHANGED
package/package.json
CHANGED