holmesgpt 0.13.3a0__py3-none-any.whl → 0.14.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of holmesgpt might be problematic.
- holmes/__init__.py +1 -1
- holmes/clients/robusta_client.py +15 -4
- holmes/common/env_vars.py +8 -1
- holmes/config.py +66 -139
- holmes/core/investigation.py +1 -2
- holmes/core/llm.py +295 -52
- holmes/core/models.py +2 -0
- holmes/core/safeguards.py +4 -4
- holmes/core/supabase_dal.py +14 -8
- holmes/core/tool_calling_llm.py +202 -177
- holmes/core/tools.py +260 -25
- holmes/core/tools_utils/data_types.py +81 -0
- holmes/core/tools_utils/tool_context_window_limiter.py +33 -0
- holmes/core/tools_utils/tool_executor.py +2 -2
- holmes/core/toolset_manager.py +150 -3
- holmes/core/tracing.py +6 -1
- holmes/core/transformers/__init__.py +23 -0
- holmes/core/transformers/base.py +62 -0
- holmes/core/transformers/llm_summarize.py +174 -0
- holmes/core/transformers/registry.py +122 -0
- holmes/core/transformers/transformer.py +31 -0
- holmes/main.py +5 -0
- holmes/plugins/prompts/_fetch_logs.jinja2 +10 -1
- holmes/plugins/toolsets/aks-node-health.yaml +46 -0
- holmes/plugins/toolsets/aks.yaml +64 -0
- holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +17 -15
- holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +8 -4
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +7 -3
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +3 -3
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +3 -3
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +7 -3
- holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +4 -4
- holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +7 -3
- holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +7 -3
- holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +7 -3
- holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +7 -3
- holmes/plugins/toolsets/bash/bash_toolset.py +6 -6
- holmes/plugins/toolsets/bash/common/bash.py +7 -7
- holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +5 -3
- holmes/plugins/toolsets/datadog/datadog_api.py +490 -24
- holmes/plugins/toolsets/datadog/datadog_logs_instructions.jinja2 +21 -10
- holmes/plugins/toolsets/datadog/toolset_datadog_general.py +345 -207
- holmes/plugins/toolsets/datadog/toolset_datadog_logs.py +190 -19
- holmes/plugins/toolsets/datadog/toolset_datadog_metrics.py +96 -32
- holmes/plugins/toolsets/datadog/toolset_datadog_rds.py +10 -10
- holmes/plugins/toolsets/datadog/toolset_datadog_traces.py +21 -22
- holmes/plugins/toolsets/git.py +22 -22
- holmes/plugins/toolsets/grafana/common.py +14 -2
- holmes/plugins/toolsets/grafana/grafana_tempo_api.py +473 -0
- holmes/plugins/toolsets/grafana/toolset_grafana.py +4 -4
- holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +5 -4
- holmes/plugins/toolsets/grafana/toolset_grafana_tempo.jinja2 +246 -11
- holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +662 -290
- holmes/plugins/toolsets/grafana/trace_parser.py +1 -1
- holmes/plugins/toolsets/internet/internet.py +3 -3
- holmes/plugins/toolsets/internet/notion.py +3 -3
- holmes/plugins/toolsets/investigator/core_investigation.py +3 -3
- holmes/plugins/toolsets/kafka.py +18 -18
- holmes/plugins/toolsets/kubernetes.yaml +58 -0
- holmes/plugins/toolsets/kubernetes_logs.py +6 -6
- holmes/plugins/toolsets/kubernetes_logs.yaml +32 -0
- holmes/plugins/toolsets/logging_utils/logging_api.py +1 -1
- holmes/plugins/toolsets/mcp/toolset_mcp.py +4 -4
- holmes/plugins/toolsets/newrelic.py +8 -8
- holmes/plugins/toolsets/opensearch/opensearch.py +5 -5
- holmes/plugins/toolsets/opensearch/opensearch_logs.py +7 -7
- holmes/plugins/toolsets/opensearch/opensearch_traces.py +10 -10
- holmes/plugins/toolsets/prometheus/prometheus.py +841 -351
- holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2 +39 -2
- holmes/plugins/toolsets/prometheus/utils.py +28 -0
- holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +6 -4
- holmes/plugins/toolsets/robusta/robusta.py +10 -10
- holmes/plugins/toolsets/runbook/runbook_fetcher.py +4 -4
- holmes/plugins/toolsets/servicenow/servicenow.py +6 -6
- holmes/plugins/toolsets/utils.py +88 -0
- holmes/utils/config_utils.py +91 -0
- holmes/utils/env.py +7 -0
- holmes/utils/holmes_status.py +2 -1
- holmes/utils/sentry_helper.py +41 -0
- holmes/utils/stream.py +9 -0
- {holmesgpt-0.13.3a0.dist-info → holmesgpt-0.14.1.dist-info}/METADATA +11 -15
- {holmesgpt-0.13.3a0.dist-info → holmesgpt-0.14.1.dist-info}/RECORD +85 -75
- holmes/plugins/toolsets/grafana/tempo_api.py +0 -124
- {holmesgpt-0.13.3a0.dist-info → holmesgpt-0.14.1.dist-info}/LICENSE.txt +0 -0
- {holmesgpt-0.13.3a0.dist-info → holmesgpt-0.14.1.dist-info}/WHEEL +0 -0
- {holmesgpt-0.13.3a0.dist-info → holmesgpt-0.14.1.dist-info}/entry_points.txt +0 -0
@@ -1,10 +1,9 @@
 import json
 import logging
 import os
-import re
 import time
 import dateutil.parser
-from typing import Any, Dict,
+from typing import Any, Dict, Optional, Tuple, Type, Union
 from urllib.parse import urljoin
 
 import requests  # type: ignore
@@ -17,11 +16,12 @@ from holmes.core.tools import (
     StructuredToolResult,
     Tool,
     ToolParameter,
-
+    StructuredToolResultStatus,
     Toolset,
     ToolsetTag,
 )
 from holmes.plugins.toolsets.consts import STANDARD_END_DATETIME_TOOL_PARAM_DESCRIPTION
+from holmes.plugins.toolsets.prometheus.utils import parse_duration_to_seconds
 from holmes.plugins.toolsets.service_discovery import PrometheusDiscovery
 from holmes.plugins.toolsets.utils import (
     get_param_or_raise,
@@ -38,23 +38,64 @@ from holmes.plugins.toolsets.logging_utils.logging_api import (
 from holmes.utils.keygen_utils import generate_random_key
 
 PROMETHEUS_RULES_CACHE_KEY = "cached_prometheus_rules"
+PROMETHEUS_METADATA_API_LIMIT = 100  # Default limit for Prometheus metadata APIs (series, labels, metadata) to prevent overwhelming responses
+# Default timeout values for PromQL queries
+DEFAULT_QUERY_TIMEOUT_SECONDS = 20
+MAX_QUERY_TIMEOUT_SECONDS = 180
+# Default character limit for query responses to prevent token limit issues
+DEFAULT_QUERY_RESPONSE_SIZE_LIMIT = 20000
+# Default timeout for metadata API calls (discovery endpoints)
+DEFAULT_METADATA_TIMEOUT_SECONDS = 20
+MAX_METADATA_TIMEOUT_SECONDS = 60
+# Default time window for metadata APIs (in hours)
+DEFAULT_METADATA_TIME_WINDOW_HRS = 1
+# Sample size for data summaries when results are too large
+DATA_SUMMARY_SAMPLE_SIZE = 10
 
 
 class PrometheusConfig(BaseModel):
     # URL is optional because it can be set with an env var
     prometheus_url: Optional[str]
     healthcheck: str = "-/healthy"
-
-
-    #
-
-
-
+
+    # New config for default time window for metadata APIs
+    default_metadata_time_window_hrs: int = DEFAULT_METADATA_TIME_WINDOW_HRS  # Default: only show metrics active in the last hour
+
+    # Query timeout configuration
+    default_query_timeout_seconds: int = (
+        DEFAULT_QUERY_TIMEOUT_SECONDS  # Default timeout for PromQL queries
+    )
+    max_query_timeout_seconds: int = (
+        MAX_QUERY_TIMEOUT_SECONDS  # Maximum allowed timeout for PromQL queries
+    )
+
+    # Metadata API timeout configuration
+    default_metadata_timeout_seconds: int = (
+        DEFAULT_METADATA_TIMEOUT_SECONDS  # Default timeout for metadata/discovery APIs
+    )
+    max_metadata_timeout_seconds: int = (
+        MAX_METADATA_TIMEOUT_SECONDS  # Maximum allowed timeout for metadata APIs
+    )
+
+    # DEPRECATED: These config values are deprecated and will be removed in a future version
+    # Using None as default so we can detect if user explicitly set them
+    metrics_labels_time_window_hrs: Optional[int] = (
+        None  # DEPRECATED - use default_metadata_time_window_hrs instead
+    )
+    metrics_labels_cache_duration_hrs: Optional[int] = (
+        None  # DEPRECATED - no longer used
+    )
+    fetch_labels_with_labels_api: Optional[bool] = None  # DEPRECATED - no longer used
+    fetch_metadata_with_series_api: Optional[bool] = None  # DEPRECATED - no longer used
+
     tool_calls_return_data: bool = True
     headers: Dict = Field(default_factory=dict)
-    rules_cache_duration_seconds:
+    rules_cache_duration_seconds: Optional[int] = 1800  # 30 minutes
     additional_labels: Optional[Dict[str, str]] = None
     prometheus_ssl_enabled: bool = True
+    query_response_size_limit: Optional[int] = (
+        DEFAULT_QUERY_RESPONSE_SIZE_LIMIT  # Limit the max number of characters in a query result to proactively prevent token limit issues (roughly 5-6k tokens)
+    )
 
     @field_validator("prometheus_url")
     def ensure_trailing_slash(cls, v: Optional[str]) -> Optional[str]:
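For context, a minimal sketch of how the new config knobs compose. The field names come from the hunk above and the module path from this diff's file list; the values are illustrative, and real deployments would normally set these through the toolset's YAML config rather than constructing the model directly:

# Sketch only: field names from the diff above; values are illustrative.
from holmes.plugins.toolsets.prometheus.prometheus import PrometheusConfig

config = PrometheusConfig(
    prometheus_url="http://prometheus.monitoring:9090/",
    default_query_timeout_seconds=20,    # default PromQL query timeout
    max_query_timeout_seconds=180,       # hard cap on LLM-requested timeouts
    query_response_size_limit=20000,     # chars; larger results are summarized
    default_metadata_time_window_hrs=1,  # discovery APIs look back one hour
)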
@@ -64,6 +105,26 @@ class PrometheusConfig(BaseModel):
 
     @model_validator(mode="after")
     def validate_prom_config(self):
+        # Check for deprecated config values and print warnings
+        deprecated_configs = []
+        if self.metrics_labels_time_window_hrs is not None:  # Check if explicitly set
+            deprecated_configs.append(
+                "metrics_labels_time_window_hrs (use default_metadata_time_window_hrs instead)"
+            )
+        if (
+            self.metrics_labels_cache_duration_hrs is not None
+        ):  # Check if explicitly set
+            deprecated_configs.append("metrics_labels_cache_duration_hrs")
+        if self.fetch_labels_with_labels_api is not None:  # Check if explicitly set
+            deprecated_configs.append("fetch_labels_with_labels_api")
+        if self.fetch_metadata_with_series_api is not None:  # Check if explicitly set
+            deprecated_configs.append("fetch_metadata_with_series_api")
+
+        if deprecated_configs:
+            logging.warning(
+                f"WARNING: The following Prometheus config values are deprecated and will be removed in a future version: "
+                f"{', '.join(deprecated_configs)}. These configs no longer affect behavior."
+            )
         # If openshift is enabled, and the user didn't configure auth headers, we will try to load the token from the service account.
         if IS_OPENSHIFT:
             if self.healthcheck == "-/healthy":
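The deprecation check above relies on a common pydantic pattern: default the old field to None so "unset" is distinguishable from "explicitly set", then warn once after validation. A standalone sketch with hypothetical field names:

import logging
from typing import Optional
from pydantic import BaseModel, model_validator

class ExampleConfig(BaseModel):
    new_window_hrs: int = 1
    old_window_hrs: Optional[int] = None  # deprecated; None means "not set by user"

    @model_validator(mode="after")
    def warn_on_deprecated(self):
        # Fires only when the user supplied the deprecated field explicitly.
        if self.old_window_hrs is not None:
            logging.warning("old_window_hrs is deprecated; use new_window_hrs")
        return self

ExampleConfig(old_window_hrs=2)  # logs the deprecation warning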
@@ -160,6 +221,8 @@ def do_request(
 
     if isinstance(config, AMPConfig):
         client = config.get_aws_client()  # cached AWSPrometheusConnect
+        # Note: timeout parameter is not supported by prometrix's signed_request
+        # AWS/AMP requests will not respect the timeout setting
         return client.signed_request(  # type: ignore
             method=method,
             url=url,
@@ -181,99 +244,6 @@ def do_request(
     )
 
 
-def filter_metrics_by_type(metrics: Dict, expected_type: str):
-    return {
-        metric_name: metric_data
-        for metric_name, metric_data in metrics.items()
-        if expected_type in metric_data.get("type", "")
-        or metric_data.get("type", "") == "?"
-    }
-
-
-def filter_metrics_by_name(metrics: Dict, pattern: str) -> Dict:
-    regex = re.compile(pattern)
-    return {
-        metric_name: metric_data
-        for metric_name, metric_data in metrics.items()
-        if regex.search(metric_name)
-    }
-
-
-METRICS_SUFFIXES_TO_STRIP = ["_bucket", "_count", "_sum"]
-
-
-def fetch_metadata(
-    prometheus_url: str,
-    headers: Optional[Dict],
-    config,
-    verify_ssl: bool = True,
-) -> Dict:
-    metadata_url = urljoin(prometheus_url, "api/v1/metadata")
-    metadata_response = do_request(
-        config=config,
-        url=metadata_url,
-        headers=headers,
-        timeout=60,
-        verify=verify_ssl,
-        method="GET",
-    )
-    metadata_response.raise_for_status()
-
-    metadata = metadata_response.json()["data"]
-
-    metrics = {}
-    for metric_name, meta_list in metadata.items():
-        if meta_list:
-            metric_type = meta_list[0].get("type", "unknown")
-            metric_description = meta_list[0].get("help", "unknown")
-            metrics[metric_name] = {
-                "type": metric_type,
-                "description": metric_description,
-                "labels": set(),
-            }
-
-    return metrics
-
-
-def fetch_metadata_with_series_api(
-    prometheus_url: str,
-    metric_name: str,
-    headers: Dict,
-    config,
-    verify_ssl: bool = True,
-) -> Dict:
-    url = urljoin(prometheus_url, "api/v1/series")
-    params: Dict = {"match[]": f'{{__name__=~".*{metric_name}.*"}}', "limit": "10000"}
-
-    response = do_request(
-        config=config,
-        url=url,
-        headers=headers,
-        params=params,
-        timeout=60,
-        verify=verify_ssl,
-        method="GET",
-    )
-    response.raise_for_status()
-    metrics = response.json()["data"]
-
-    metadata: Dict = {}
-    for metric_data in metrics:
-        metric_name = metric_data.get("__name__")
-        if not metric_name:
-            continue
-
-        metric = metadata.get(metric_name)
-        if not metric:
-            metric = {"description": "?", "type": "?", "labels": set()}
-            metadata[metric_name] = metric
-
-        labels = {k for k in metric_data.keys() if k != "__name__"}
-        metric["labels"].update(labels)
-
-    return metadata
-
-
 def result_has_data(result: Dict) -> bool:
     data = result.get("data", {})
     if len(data.get("result", [])) > 0:
@@ -284,33 +254,58 @@ def result_has_data(result: Dict) -> bool:
 def adjust_step_for_max_points(
     start_timestamp: str,
     end_timestamp: str,
-    step: float,
+    step: Optional[float] = None,
+    max_points_override: Optional[float] = None,
 ) -> float:
     """
     Adjusts the step parameter to ensure the number of data points doesn't exceed max_points.
-    Max points is controlled by the PROMETHEUS_MAX_GRAPH_POINTS environment variable (default: 300).
 
     Args:
         start_timestamp: RFC3339 formatted start time
        end_timestamp: RFC3339 formatted end time
-        step: The requested step duration in seconds
+        step: The requested step duration in seconds (None for auto-calculation)
+        max_points_override: Optional override for max points (must be <= MAX_GRAPH_POINTS)
 
     Returns:
         Adjusted step value in seconds that ensures points <= max_points
     """
+    # Use override if provided and valid, otherwise use default
+    max_points = MAX_GRAPH_POINTS
+    if max_points_override is not None:
+        if max_points_override > MAX_GRAPH_POINTS:
+            logging.warning(
+                f"max_points override ({max_points_override}) exceeds system limit ({MAX_GRAPH_POINTS}), using {MAX_GRAPH_POINTS}"
+            )
+            max_points = MAX_GRAPH_POINTS
+        elif max_points_override < 1:
+            logging.warning(
+                f"max_points override ({max_points_override}) is invalid, using default {MAX_GRAPH_POINTS}"
+            )
+            max_points = MAX_GRAPH_POINTS
+        else:
+            max_points = max_points_override
+            logging.debug(f"Using max_points override: {max_points}")
 
     start_dt = dateutil.parser.parse(start_timestamp)
     end_dt = dateutil.parser.parse(end_timestamp)
 
     time_range_seconds = (end_dt - start_dt).total_seconds()
 
+    # If no step provided, calculate a reasonable default
+    # Aim for ~60 data points across the time range (1 per minute for hourly, etc)
+    if step is None:
+        step = max(1, time_range_seconds / 60)
+        logging.debug(
+            f"No step provided, defaulting to {step}s for {time_range_seconds}s range"
+        )
+
     current_points = time_range_seconds / step
 
     # If current points exceed max, adjust the step
-    if current_points >
-        adjusted_step = time_range_seconds /
+    if current_points > max_points:
+        adjusted_step = time_range_seconds / max_points
         logging.info(
-            f"Adjusting step from {step}s to {adjusted_step}s to limit points from {current_points:.0f} to {
+            f"Adjusting step from {step}s to {adjusted_step}s to limit points from {current_points:.0f} to {max_points}"
         )
         return adjusted_step
 
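A worked example of the step arithmetic above, assuming MAX_GRAPH_POINTS is 300 (the removed docstring line cited that default via PROMETHEUS_MAX_GRAPH_POINTS, so treat the number as illustrative):

# A 24h range at a 15s step would yield 5760 points:
time_range_seconds = 24 * 3600               # 86400
requested_step = 15
points = time_range_seconds / requested_step  # 5760 > 300

# adjust_step_for_max_points() therefore widens the step:
max_points = 300
adjusted = time_range_seconds / max_points    # 288.0 seconds per sample

# With step=None, the new auto-default targets ~60 points:
auto_step = max(1, time_range_seconds / 60)   # 1440.0 seconds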
@@ -324,168 +319,97 @@ def add_prometheus_auth(prometheus_auth_header: Optional[str]) -> Dict[str, Any]
     return results
 
 
-def fetch_metrics_labels_with_series_api(
-    prometheus_url: str,
-    cache: Optional[TTLCache],
-    metrics_labels_time_window_hrs: Union[int, None],
-    headers: Dict,
-    metric_name: str,
-    config=None,
-    verify_ssl: bool = True,
-) -> dict:
-    """This is a slow query. Takes 5+ seconds to run"""
-    cache_key = f"metrics_labels_series_api:{metric_name}"
-    if cache:
-        cached_result = cache.get(cache_key)
-        if cached_result:
-            return cached_result
-
-    series_url = urljoin(prometheus_url, "api/v1/series")
-    params: dict = {"match[]": f'{{__name__=~".*{metric_name}.*"}}', "limit": "10000"}
-
-    if metrics_labels_time_window_hrs is not None:
-        params["end"] = int(time.time())
-        params["start"] = params["end"] - (metrics_labels_time_window_hrs * 60 * 60)
-
-    series_response = do_request(
-        config=config,
-        url=series_url,
-        headers=headers,
-        params=params,
-        timeout=60,
-        verify=verify_ssl,
-        method="GET",
-    )
-    series_response.raise_for_status()
-    series = series_response.json()["data"]
-
-    metrics_labels: dict = {}
-    for serie in series:
-        metric_name = serie["__name__"]
-        # Add all labels except __name__
-        labels = {k for k in serie.keys() if k != "__name__"}
-        if metric_name in metrics_labels:
-            metrics_labels[metric_name].update(labels)
-        else:
-            metrics_labels[metric_name] = labels
-    if cache:
-        cache.set(cache_key, metrics_labels)
-
-    return metrics_labels
-
-
-def fetch_metrics_labels_with_labels_api(
-    prometheus_url: str,
-    cache: Optional[TTLCache],
-    metrics_labels_time_window_hrs: Union[int, None],
-    metric_names: List[str],
-    headers: Dict,
-    config=None,
-    verify_ssl: bool = True,
-) -> dict:
-    metrics_labels = {}
-
-    for metric_name in metric_names:
-        cache_key = f"metrics_labels_labels_api:{metric_name}"
-        if cache:
-            cached_result = cache.get(cache_key)
-            if cached_result:
-                metrics_labels[metric_name] = cached_result
-
-        url = urljoin(prometheus_url, "api/v1/labels")
-        params: dict = {
-            "match[]": f'{{__name__="{metric_name}"}}',
-        }
-        if metrics_labels_time_window_hrs is not None:
-            params["end"] = int(time.time())
-            params["start"] = params["end"] - (metrics_labels_time_window_hrs * 60 * 60)
-        )
-            prometheus_url=prometheus_url,
-            metric_name=metric_name,
-            headers=headers,
-            config=config,
-            verify_ssl=verify_ssl,
-        )
-        metrics = filter_metrics_by_name(metrics, metric_name)
-
-        if should_fetch_labels:
-            metrics_labels = {}
-            if should_fetch_labels_with_labels_api:
-                metrics_labels = fetch_metrics_labels_with_labels_api(
-                    prometheus_url=prometheus_url,
-                    cache=cache,
-                    metrics_labels_time_window_hrs=metrics_labels_time_window_hrs,
-                    metric_names=list(metrics.keys()),
-                    headers=headers,
-                    config=config,
-                    verify_ssl=verify_ssl,
-                )
-            else:
-                metrics_labels = fetch_metrics_labels_with_series_api(
-                    prometheus_url=prometheus_url,
-                    cache=cache,
-                    metrics_labels_time_window_hrs=metrics_labels_time_window_hrs,
-                    metric_name=metric_name,
-                    headers=headers,
-                    config=config,
-                    verify_ssl=verify_ssl,
-                )
-
-        for metric_name in metrics:
-            if metric_name in metrics_labels:
-                metrics[metric_name]["labels"] = metrics_labels[metric_name]
-
+def create_data_summary_for_large_result(
+    result_data: Dict, query: str, data_size_chars: int, is_range_query: bool = False
+) -> Dict[str, Any]:
+    """
+    Create a summary for large Prometheus results instead of returning full data.
+
+    Args:
+        result_data: The Prometheus data result
+        query: The original PromQL query
+        data_size_chars: Size of the data in characters
+        is_range_query: Whether this is a range query (vs instant query)
+
+    Returns:
+        Dictionary with summary information and suggestions
+    """
+    if is_range_query:
+        series_list = result_data.get("result", [])
+        num_items = len(series_list)
+
+        # Calculate exact total data points across all series
+        total_points = 0
+        for series in series_list:  # Iterate through ALL series for exact count
+            points = len(series.get("values", []))
+            total_points += points
+
+        # Analyze label keys and their cardinality
+        label_cardinality: Dict[str, set] = {}
+        for series in series_list:
+            metric = series.get("metric", {})
+            for label_key, label_value in metric.items():
+                if label_key not in label_cardinality:
+                    label_cardinality[label_key] = set()
+                label_cardinality[label_key].add(label_value)
+
+        # Convert sets to counts for the summary
+        label_summary = {
+            label: len(values) for label, values in label_cardinality.items()
+        }
+        # Sort by cardinality (highest first) for better insights
+        label_summary = dict(
+            sorted(label_summary.items(), key=lambda x: x[1], reverse=True)
+        )
+
+        return {
+            "message": f"Data too large to return ({data_size_chars:,} characters). Query returned {num_items} time series with {total_points:,} total data points.",
+            "series_count": num_items,
+            "total_data_points": total_points,
+            "data_size_characters": data_size_chars,
+            "label_cardinality": label_summary,
+            "suggestion": f'Consider using topk({min(5, num_items)}, {query}) to limit results to the top {min(5, num_items)} series. To also capture remaining data as \'other\': topk({min(5, num_items)}, {query}) or label_replace((sum({query}) - sum(topk({min(5, num_items)}, {query}))), "pod", "other", "", "")',
+        }
+    else:
+        # Instant query
+        result_type = result_data.get("resultType", "")
+        result_list = result_data.get("result", [])
+        num_items = len(result_list)
+
+        # Analyze label keys and their cardinality
+        instant_label_cardinality: Dict[str, set] = {}
+        for item in result_list:
+            if isinstance(item, dict):
+                metric = item.get("metric", {})
+                for label_key, label_value in metric.items():
+                    if label_key not in instant_label_cardinality:
+                        instant_label_cardinality[label_key] = set()
+                    instant_label_cardinality[label_key].add(label_value)
+
+        # Convert sets to counts for the summary
+        label_summary = {
+            label: len(values) for label, values in instant_label_cardinality.items()
+        }
+        # Sort by cardinality (highest first) for better insights
+        label_summary = dict(
+            sorted(label_summary.items(), key=lambda x: x[1], reverse=True)
+        )
+
+        return {
+            "message": f"Data too large to return ({data_size_chars:,} characters). Query returned {num_items} results.",
+            "result_count": num_items,
+            "result_type": result_type,
+            "data_size_characters": data_size_chars,
+            "label_cardinality": label_summary,
+            "suggestion": f'Consider using topk({min(5, num_items)}, {query}) to limit results. To also capture remaining data as \'other\': topk({min(5, num_items)}, {query}) or label_replace((sum({query}) - sum(topk({min(5, num_items)}, {query}))), "instance", "other", "", "")',
+        }
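A sketch of what the new summarizer produces for an oversized instant-query result; the input data below is fabricated for illustration, while the function and its keyword arguments come from the hunk above:

import json
from holmes.plugins.toolsets.prometheus.prometheus import (
    create_data_summary_for_large_result,
)

# Fake oversized vector result: 500 series differing only in the "pod" label.
result_data = {
    "resultType": "vector",
    "result": [
        {"metric": {"pod": f"web-{i}", "namespace": "default"}, "value": [0, "1"]}
        for i in range(500)
    ],
}
size = len(json.dumps(result_data))
summary = create_data_summary_for_large_result(
    result_data, query="up", data_size_chars=size, is_range_query=False
)
# summary["result_count"] == 500
# summary["label_cardinality"] == {"pod": 500, "namespace": 1}
# summary["suggestion"] recommends topk(5, up) to cap the series count.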
@@ -496,13 +420,13 @@ class ListPrometheusRules(BasePrometheusTool):
     ) -> StructuredToolResult:
         if not self.toolset.config or not self.toolset.config.prometheus_url:
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.ERROR,
                 error="Prometheus is not configured. Prometheus URL is missing",
                 params=params,
             )
         if self.toolset.config.is_amp():
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.ERROR,
                 error="Tool not supported in AMP",
                 params=params,
             )
@@ -515,7 +439,7 @@ class ListPrometheusRules(BasePrometheusTool):
                 logging.debug("rules returned from cache")
 
                 return StructuredToolResult(
-                    status=
+                    status=StructuredToolResultStatus.SUCCESS,
                    data=cached_rules,
                     params=params,
                 )
@@ -528,7 +452,7 @@ class ListPrometheusRules(BasePrometheusTool):
                 config=self.toolset.config,
                 url=rules_url,
                 params=params,
-                timeout=
+                timeout=40,
                 verify=self.toolset.config.prometheus_ssl_enabled,
                 headers=self.toolset.config.headers,
                 method="GET",
@@ -539,28 +463,28 @@ class ListPrometheusRules(BasePrometheusTool):
             if self._cache:
                 self._cache.set(PROMETHEUS_RULES_CACHE_KEY, data)
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.SUCCESS,
                 data=data,
                 params=params,
             )
         except requests.Timeout:
             logging.warning("Timeout while fetching prometheus rules", exc_info=True)
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.ERROR,
                 error="Request timed out while fetching rules",
                 params=params,
             )
         except RequestException as e:
             logging.warning("Failed to fetch prometheus rules", exc_info=True)
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.ERROR,
                 error=f"Network error while fetching rules: {str(e)}",
                 params=params,
             )
         except Exception as e:
             logging.warning("Failed to process prometheus rules", exc_info=True)
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.ERROR,
                 error=f"Unexpected error: {str(e)}",
                 params=params,
             )
@@ -569,120 +493,563 @@ class ListPrometheusRules(BasePrometheusTool):
         return f"{toolset_name_for_one_liner(self.toolset.name)}: Fetch Rules"
 
 
-class
+class GetMetricNames(BasePrometheusTool):
+    """Thin wrapper around /api/v1/label/__name__/values - the fastest way to discover metric names"""
+
     def __init__(self, toolset: "PrometheusToolset"):
         super().__init__(
-            name="
-            description=
+            name="get_metric_names",
+            description=(
+                "Get list of metric names using /api/v1/label/__name__/values. "
+                "FASTEST method for metric discovery when you need to explore available metrics. "
+                f"Returns up to {PROMETHEUS_METADATA_API_LIMIT} unique metric names (limit={PROMETHEUS_METADATA_API_LIMIT}). If {PROMETHEUS_METADATA_API_LIMIT} results returned, more may exist - use a more specific filter. "
+                f"ALWAYS use match[] parameter to filter metrics - without it you'll get random {PROMETHEUS_METADATA_API_LIMIT} metrics which is rarely useful. "
+                "Note: Does not return metric metadata (type, description, labels). "
+                "By default returns metrics active in the last 1 hour (configurable via default_metadata_time_window_hrs)."
+            ),
             parameters={
-                "
-                    description=
+                "match": ToolParameter(
+                    description=(
+                        "REQUIRED: PromQL selector to filter metrics. Use regex OR (|) to check multiple patterns in one call - much faster than multiple calls! Examples: "
+                        "'{__name__=~\"node_cpu.*|node_memory.*|node_disk.*\"}' for all node resource metrics, "
+                        "'{__name__=~\"container_cpu.*|container_memory.*|container_network.*\"}' for all container metrics, "
+                        "'{__name__=~\"kube_pod.*|kube_deployment.*|kube_service.*\"}' for multiple Kubernetes object metrics, "
+                        "'{__name__=~\".*cpu.*|.*memory.*|.*disk.*\"}' for all resource metrics, "
+                        "'{namespace=~\"kube-system|default|monitoring\"}' for metrics from multiple namespaces, "
+                        "'{job=~\"prometheus|node-exporter|kube-state-metrics\"}' for metrics from multiple jobs."
+                    ),
+                    type="string",
+                    required=True,
+                ),
+                "start": ToolParameter(
+                    description="Start timestamp (RFC3339 or Unix). Default: 1 hour ago",
                     type="string",
                     required=False,
                 ),
-                "
-                    description="
+                "end": ToolParameter(
+                    description="End timestamp (RFC3339 or Unix). Default: now",
                     type="string",
-                    required=
+                    required=False,
                 ),
             },
             toolset=toolset,
         )
-        self._cache = None
 
     def _invoke(
         self, params: dict, user_approved: bool = False
     ) -> StructuredToolResult:
         if not self.toolset.config or not self.toolset.config.prometheus_url:
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.ERROR,
                 error="Prometheus is not configured. Prometheus URL is missing",
                 params=params,
             )
-        if not self._cache and self.toolset.config.metrics_labels_cache_duration_hrs:
-            self._cache = TTLCache(
-                self.toolset.config.metrics_labels_cache_duration_hrs * 3600  # type: ignore
-            )
         try:
+            match_param = params.get("match")
+            if not match_param:
+                return StructuredToolResult(
+                    status=StructuredToolResultStatus.ERROR,
+                    error="Match parameter is required to filter metrics",
+                    params=params,
+                )
+
+            url = urljoin(
+                self.toolset.config.prometheus_url, "api/v1/label/__name__/values"
             )
+            query_params = {
+                "limit": str(PROMETHEUS_METADATA_API_LIMIT),
+                "match[]": match_param,
+            }
+
+            # Add time parameters - use provided values or defaults
+            if params.get("end"):
+                query_params["end"] = params["end"]
+            else:
+                query_params["end"] = str(int(time.time()))
+
+            if params.get("start"):
+                query_params["start"] = params["start"]
+            elif self.toolset.config.default_metadata_time_window_hrs:
+                # Use default time window
+                query_params["start"] = str(
+                    int(time.time())
+                    - (self.toolset.config.default_metadata_time_window_hrs * 3600)
+                )
+
+            response = do_request(
+                config=self.toolset.config,
+                url=url,
+                params=query_params,
+                timeout=self.toolset.config.default_metadata_timeout_seconds,
+                verify=self.toolset.config.prometheus_ssl_enabled,
+                headers=self.toolset.config.headers,
+                method="GET",
+            )
+            response.raise_for_status()
+            data = response.json()
+
+            # Check if results were truncated
+            if (
+                "data" in data
+                and isinstance(data["data"], list)
+                and len(data["data"]) == PROMETHEUS_METADATA_API_LIMIT
+            ):
+                data["_truncated"] = True
+                data["_message"] = (
+                    f"Results truncated at limit={PROMETHEUS_METADATA_API_LIMIT}. Use a more specific match filter to see additional metrics."
+                )
+
+            return StructuredToolResult(
+                status=StructuredToolResultStatus.SUCCESS,
+                data=data,
+                params=params,
+            )
+        except Exception as e:
+            return StructuredToolResult(
+                status=StructuredToolResultStatus.ERROR,
+                error=str(e),
+                params=params,
+            )
+
+    def get_parameterized_one_liner(self, params) -> str:
+        return f"{toolset_name_for_one_liner(self.toolset.name)}: Get Metric Names"
+
+
+class GetLabelValues(BasePrometheusTool):
+    """Get values for a specific label across all metrics"""
+
+    def __init__(self, toolset: "PrometheusToolset"):
+        super().__init__(
+            name="get_label_values",
+            description=(
+                "Get all values for a specific label using /api/v1/label/{label}/values. "
+                "Use this to discover pods, namespaces, jobs, instances, etc. "
+                f"Returns up to {PROMETHEUS_METADATA_API_LIMIT} unique values (limit={PROMETHEUS_METADATA_API_LIMIT}). If {PROMETHEUS_METADATA_API_LIMIT} results returned, more may exist - use match[] to filter. "
+                "Supports optional match[] parameter to filter. "
+                "By default returns values from metrics active in the last 1 hour (configurable via default_metadata_time_window_hrs)."
+            ),
+            parameters={
+                "label": ToolParameter(
+                    description="Label name to get values for (e.g., 'pod', 'namespace', 'job', 'instance')",
+                    type="string",
+                    required=True,
+                ),
+                "match": ToolParameter(
+                    description=(
+                        "Optional PromQL selector to filter (e.g., '{__name__=~\"kube.*\"}', "
+                        "'{namespace=\"default\"}')."
+                    ),
+                    type="string",
+                    required=False,
+                ),
+                "start": ToolParameter(
+                    description="Start timestamp (RFC3339 or Unix). Default: 1 hour ago",
+                    type="string",
+                    required=False,
+                ),
+                "end": ToolParameter(
+                    description="End timestamp (RFC3339 or Unix). Default: now",
+                    type="string",
+                    required=False,
+                ),
+            },
+            toolset=toolset,
+        )
 
+    def _invoke(
+        self, params: dict, user_approved: bool = False
+    ) -> StructuredToolResult:
+        if not self.toolset.config or not self.toolset.config.prometheus_url:
+            return StructuredToolResult(
+                status=StructuredToolResultStatus.ERROR,
+                error="Prometheus is not configured. Prometheus URL is missing",
+                params=params,
+            )
+        try:
+            label = params.get("label")
+            if not label:
                 return StructuredToolResult(
-                    status=
-                    error="
+                    status=StructuredToolResultStatus.ERROR,
+                    error="Label parameter is required",
                     params=params,
                 )
 
-            prometheus_url
+            url = urljoin(
+                self.toolset.config.prometheus_url, f"api/v1/label/{label}/values"
+            )
+            query_params = {"limit": str(PROMETHEUS_METADATA_API_LIMIT)}
+            if params.get("match"):
+                query_params["match[]"] = params["match"]
+
+            # Add time parameters - use provided values or defaults
+            if params.get("end"):
+                query_params["end"] = params["end"]
+            else:
+                query_params["end"] = str(int(time.time()))
+
+            if params.get("start"):
+                query_params["start"] = params["start"]
+            elif self.toolset.config.default_metadata_time_window_hrs:
+                # Use default time window
+                query_params["start"] = str(
+                    int(time.time())
+                    - (self.toolset.config.default_metadata_time_window_hrs * 3600)
+                )
+
+            response = do_request(
+                config=self.toolset.config,
+                url=url,
+                params=query_params,
+                timeout=self.toolset.config.default_metadata_timeout_seconds,
+                verify=self.toolset.config.prometheus_ssl_enabled,
                 headers=self.toolset.config.headers,
+                method="GET",
+            )
+            response.raise_for_status()
+            data = response.json()
+
+            # Check if results were truncated
+            if (
+                "data" in data
+                and isinstance(data["data"], list)
+                and len(data["data"]) == PROMETHEUS_METADATA_API_LIMIT
+            ):
+                data["_truncated"] = True
+                data["_message"] = (
+                    f"Results truncated at limit={PROMETHEUS_METADATA_API_LIMIT}. Use match[] parameter to filter label '{label}' values."
+                )
+
+            return StructuredToolResult(
+                status=StructuredToolResultStatus.SUCCESS,
+                data=data,
+                params=params,
+            )
+        except Exception as e:
+            return StructuredToolResult(
+                status=StructuredToolResultStatus.ERROR,
+                error=str(e),
+                params=params,
+            )
+
+    def get_parameterized_one_liner(self, params) -> str:
+        label = params.get("label", "")
+        return f"{toolset_name_for_one_liner(self.toolset.name)}: Get {label} Values"
+
+
+class GetAllLabels(BasePrometheusTool):
+    """Get all label names that exist in Prometheus"""
+
+    def __init__(self, toolset: "PrometheusToolset"):
+        super().__init__(
+            name="get_all_labels",
+            description=(
+                "Get list of all label names using /api/v1/labels. "
+                "Use this to discover what labels are available across all metrics. "
+                f"Returns up to {PROMETHEUS_METADATA_API_LIMIT} label names (limit={PROMETHEUS_METADATA_API_LIMIT}). If {PROMETHEUS_METADATA_API_LIMIT} results returned, more may exist - use match[] to filter. "
+                "Supports optional match[] parameter to filter. "
+                "By default returns labels from metrics active in the last 1 hour (configurable via default_metadata_time_window_hrs)."
+            ),
+            parameters={
+                "match": ToolParameter(
+                    description=(
+                        "Optional PromQL selector to filter (e.g., '{__name__=~\"kube.*\"}', "
+                        "'{job=\"prometheus\"}')."
+                    ),
+                    type="string",
+                    required=False,
+                ),
+                "start": ToolParameter(
+                    description="Start timestamp (RFC3339 or Unix). Default: 1 hour ago",
+                    type="string",
+                    required=False,
+                ),
+                "end": ToolParameter(
+                    description="End timestamp (RFC3339 or Unix). Default: now",
+                    type="string",
+                    required=False,
+                ),
+            },
+            toolset=toolset,
+        )
+
+    def _invoke(
+        self, params: dict, user_approved: bool = False
+    ) -> StructuredToolResult:
+        if not self.toolset.config or not self.toolset.config.prometheus_url:
+            return StructuredToolResult(
+                status=StructuredToolResultStatus.ERROR,
+                error="Prometheus is not configured. Prometheus URL is missing",
+                params=params,
+            )
+        try:
+            url = urljoin(self.toolset.config.prometheus_url, "api/v1/labels")
+            query_params = {"limit": str(PROMETHEUS_METADATA_API_LIMIT)}
+            if params.get("match"):
+                query_params["match[]"] = params["match"]
+
+            # Add time parameters - use provided values or defaults
+            if params.get("end"):
+                query_params["end"] = params["end"]
+            else:
+                query_params["end"] = str(int(time.time()))
+
+            if params.get("start"):
+                query_params["start"] = params["start"]
+            elif self.toolset.config.default_metadata_time_window_hrs:
+                # Use default time window
+                query_params["start"] = str(
+                    int(time.time())
+                    - (self.toolset.config.default_metadata_time_window_hrs * 3600)
+                )
+
+            response = do_request(
                 config=self.toolset.config,
-
+                url=url,
+                params=query_params,
+                timeout=self.toolset.config.default_metadata_timeout_seconds,
+                verify=self.toolset.config.prometheus_ssl_enabled,
+                headers=self.toolset.config.headers,
+                method="GET",
             )
+            response.raise_for_status()
+            data = response.json()
+
+            # Check if results were truncated
+            if (
+                "data" in data
+                and isinstance(data["data"], list)
+                and len(data["data"]) == PROMETHEUS_METADATA_API_LIMIT
+            ):
+                data["_truncated"] = True
+                data["_message"] = (
+                    f"Results truncated at limit={PROMETHEUS_METADATA_API_LIMIT}. Use match[] parameter to filter labels."
+                )
 
+            return StructuredToolResult(
+                status=StructuredToolResultStatus.SUCCESS,
+                data=data,
+                params=params,
+            )
+        except Exception as e:
+            return StructuredToolResult(
+                status=StructuredToolResultStatus.ERROR,
+                error=str(e),
+                params=params,
+            )
 
+    def get_parameterized_one_liner(self, params) -> str:
+        return f"{toolset_name_for_one_liner(self.toolset.name)}: Get All Labels"
+
+
+class GetSeries(BasePrometheusTool):
+    """Get time series matching a selector"""
 
+    def __init__(self, toolset: "PrometheusToolset"):
+        super().__init__(
+            name="get_series",
+            description=(
+                "Get time series using /api/v1/series. "
+                "Returns label sets for all time series matching the selector. "
+                "SLOWER than other discovery methods - use only when you need full label sets. "
+                f"Returns up to {PROMETHEUS_METADATA_API_LIMIT} series (limit={PROMETHEUS_METADATA_API_LIMIT}). If {PROMETHEUS_METADATA_API_LIMIT} results returned, more series exist - use more specific selector. "
+                "Requires match[] parameter with PromQL selector. "
+                "By default returns series active in the last 1 hour (configurable via default_metadata_time_window_hrs)."
+            ),
+            parameters={
+                "match": ToolParameter(
+                    description=(
+                        "PromQL selector to match series (e.g., 'up', 'node_cpu_seconds_total', "
+                        "'{__name__=~\"node.*\"}', '{job=\"prometheus\"}', "
+                        '\'{__name__="up",job="prometheus"}\').'
+                    ),
+                    type="string",
+                    required=True,
+                ),
+                "start": ToolParameter(
+                    description="Start timestamp (RFC3339 or Unix). Default: 1 hour ago",
+                    type="string",
+                    required=False,
+                ),
+                "end": ToolParameter(
+                    description="End timestamp (RFC3339 or Unix). Default: now",
+                    type="string",
+                    required=False,
+                ),
+            },
+            toolset=toolset,
+        )
+
+    def _invoke(
+        self, params: dict, user_approved: bool = False
+    ) -> StructuredToolResult:
+        if not self.toolset.config or not self.toolset.config.prometheus_url:
+            return StructuredToolResult(
+                status=StructuredToolResultStatus.ERROR,
+                error="Prometheus is not configured. Prometheus URL is missing",
+                params=params,
+            )
+        try:
+            match = params.get("match")
+            if not match:
+                return StructuredToolResult(
+                    status=StructuredToolResultStatus.ERROR,
+                    error="Match parameter is required",
+                    params=params,
                 )
 
+            url = urljoin(self.toolset.config.prometheus_url, "api/v1/series")
+            query_params = {
+                "match[]": match,
+                "limit": str(PROMETHEUS_METADATA_API_LIMIT),
+            }
+
+            # Add time parameters - use provided values or defaults
+            if params.get("end"):
+                query_params["end"] = params["end"]
+            else:
+                query_params["end"] = str(int(time.time()))
+
+            if params.get("start"):
+                query_params["start"] = params["start"]
+            elif self.toolset.config.default_metadata_time_window_hrs:
+                # Use default time window
+                query_params["start"] = str(
+                    int(time.time())
+                    - (self.toolset.config.default_metadata_time_window_hrs * 3600)
                 )
 
+            response = do_request(
+                config=self.toolset.config,
+                url=url,
+                params=query_params,
+                timeout=self.toolset.config.default_metadata_timeout_seconds,
+                verify=self.toolset.config.prometheus_ssl_enabled,
+                headers=self.toolset.config.headers,
+                method="GET",
+            )
+            response.raise_for_status()
+            data = response.json()
+
+            # Check if results were truncated
+            if (
+                "data" in data
+                and isinstance(data["data"], list)
+                and len(data["data"]) == PROMETHEUS_METADATA_API_LIMIT
+            ):
+                data["_truncated"] = True
+                data["_message"] = (
+                    f"Results truncated at limit={PROMETHEUS_METADATA_API_LIMIT}. Use a more specific match selector to see additional series."
+                )
+
+            return StructuredToolResult(
+                status=StructuredToolResultStatus.SUCCESS,
+                data=data,
+                params=params,
+            )
+        except Exception as e:
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.ERROR,
+                error=str(e),
                 params=params,
             )
 
+    def get_parameterized_one_liner(self, params) -> str:
+        return f"{toolset_name_for_one_liner(self.toolset.name)}: Get Series"
+
+
+class GetMetricMetadata(BasePrometheusTool):
+    """Get metadata (type, description, unit) for metrics"""
+
+    def __init__(self, toolset: "PrometheusToolset"):
+        super().__init__(
+            name="get_metric_metadata",
+            description=(
+                "Get metric metadata using /api/v1/metadata. "
+                "Returns type, help text, and unit for metrics. "
+                "Use after discovering metric names to get their descriptions. "
+                f"Returns up to {PROMETHEUS_METADATA_API_LIMIT} metrics (limit={PROMETHEUS_METADATA_API_LIMIT}). If {PROMETHEUS_METADATA_API_LIMIT} results returned, more may exist - filter by specific metric name. "
+                "Supports optional metric name filter."
+            ),
+            parameters={
+                "metric": ToolParameter(
+                    description=(
+                        "Optional metric name to filter (e.g., 'up', 'node_cpu_seconds_total'). "
+                        "If not provided, returns metadata for all metrics."
+                    ),
+                    type="string",
+                    required=False,
+                ),
+            },
+            toolset=toolset,
+        )
+
+    def _invoke(
+        self, params: dict, user_approved: bool = False
+    ) -> StructuredToolResult:
+        if not self.toolset.config or not self.toolset.config.prometheus_url:
             return StructuredToolResult(
-                status=
-                error="
+                status=StructuredToolResultStatus.ERROR,
+                error="Prometheus is not configured. Prometheus URL is missing",
                 params=params,
             )
-
+        try:
+            url = urljoin(self.toolset.config.prometheus_url, "api/v1/metadata")
+            query_params = {"limit": str(PROMETHEUS_METADATA_API_LIMIT)}
+
+            if params.get("metric"):
+                query_params["metric"] = params["metric"]
+
+            response = do_request(
+                config=self.toolset.config,
+                url=url,
+                params=query_params,
+                timeout=self.toolset.config.default_metadata_timeout_seconds,
+                verify=self.toolset.config.prometheus_ssl_enabled,
+                headers=self.toolset.config.headers,
+                method="GET",
+            )
+            response.raise_for_status()
+            data = response.json()
+
+            # Check if results were truncated (metadata endpoint returns a dict, not a list)
+            if (
+                "data" in data
+                and isinstance(data["data"], dict)
+                and len(data["data"]) == PROMETHEUS_METADATA_API_LIMIT
+            ):
+                data["_truncated"] = True
+                data["_message"] = (
+                    f"Results truncated at limit={PROMETHEUS_METADATA_API_LIMIT}. Use metric parameter to filter by specific metric name."
+                )
+
             return StructuredToolResult(
-                status=
-
+                status=StructuredToolResultStatus.SUCCESS,
+                data=data,
                 params=params,
             )
         except Exception as e:
-            logging.warn("Failed to process prometheus metrics", exc_info=True)
             return StructuredToolResult(
-                status=
-            error=
+                status=StructuredToolResultStatus.ERROR,
+                error=str(e),
                 params=params,
             )
 
     def get_parameterized_one_liner(self, params) -> str:
-
-        return
+        metric = params.get("metric", "all")
+        return (
+            f"{toolset_name_for_one_liner(self.toolset.name)}: Get Metadata ({metric})"
+        )
 
 
 class ExecuteInstantQuery(BasePrometheusTool):
     def __init__(self, toolset: "PrometheusToolset"):
         super().__init__(
             name="execute_prometheus_instant_query",
-            description=
+            description=(
+                f"Execute an instant PromQL query (single point in time). "
+                f"Default timeout is {DEFAULT_QUERY_TIMEOUT_SECONDS} seconds "
+                f"but can be increased up to {MAX_QUERY_TIMEOUT_SECONDS} seconds for complex/slow queries."
+            ),
             parameters={
                 "query": ToolParameter(
                     description="The PromQL query",
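The five discovery tools above share one pattern: cap every metadata call at PROMETHEUS_METADATA_API_LIMIT and flag exact-limit responses as possibly truncated, since the Prometheus HTTP API gives no other signal that results were cut. Stripped to its essence, outside the toolset plumbing (plain requests; the base URL is illustrative):

import requests

LIMIT = 100  # PROMETHEUS_METADATA_API_LIMIT in this file

def get_metric_names(base_url: str, match: str) -> dict:
    resp = requests.get(
        f"{base_url}/api/v1/label/__name__/values",
        params={"match[]": match, "limit": str(LIMIT)},
        timeout=20,
    )
    resp.raise_for_status()
    data = resp.json()
    # Exactly LIMIT results is the only hint the server gives that more exist.
    if isinstance(data.get("data"), list) and len(data["data"]) == LIMIT:
        data["_truncated"] = True
    return data

# get_metric_names("http://localhost:9090", '{__name__=~"node_cpu.*"}')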
@@ -694,6 +1061,15 @@ class ExecuteInstantQuery(BasePrometheusTool):
                     type="string",
                     required=True,
                 ),
+                "timeout": ToolParameter(
+                    description=(
+                        f"Query timeout in seconds. Default: {DEFAULT_QUERY_TIMEOUT_SECONDS}. "
+                        f"Maximum: {MAX_QUERY_TIMEOUT_SECONDS}. "
+                        f"Increase for complex queries that may take longer."
+                    ),
+                    type="number",
+                    required=False,
+                ),
             },
             toolset=toolset,
         )
@@ -703,7 +1079,7 @@ class ExecuteInstantQuery(BasePrometheusTool):
     ) -> StructuredToolResult:
         if not self.toolset.config or not self.toolset.config.prometheus_url:
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.ERROR,
                 error="Prometheus is not configured. Prometheus URL is missing",
                 params=params,
             )
@@ -715,12 +1091,24 @@ class ExecuteInstantQuery(BasePrometheusTool):

             payload = {"query": query}

+            # Get timeout parameter and enforce limits
+            default_timeout = self.toolset.config.default_query_timeout_seconds
+            max_timeout = self.toolset.config.max_query_timeout_seconds
+            timeout = params.get("timeout", default_timeout)
+            if timeout > max_timeout:
+                timeout = max_timeout
+                logging.warning(
+                    f"Timeout requested ({params.get('timeout')}) exceeds maximum ({max_timeout}s), using {max_timeout}s"
+                )
+            elif timeout < 1:
+                timeout = default_timeout  # Min 1 second, but use default if invalid
+
             response = do_request(
                 config=self.toolset.config,
                 url=url,
                 headers=self.toolset.config.headers,
                 data=payload,
-                timeout=
+                timeout=timeout,
                 verify=self.toolset.config.prometheus_ssl_enabled,
                 method="POST",
             )
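The clamping added before do_request is plain bounds enforcement: oversized requests are capped with a warning, and values under one second are treated as invalid and replaced by the default. The same logic as a standalone function, assuming default/max values of 60 and 180 seconds (the real values come from the toolset config):

    import logging

    def clamp_timeout(requested, default_timeout=60, max_timeout=180):
        timeout = requested if requested is not None else default_timeout
        if timeout > max_timeout:
            # Cap and warn, mirroring the behavior in the diff above.
            logging.warning(
                f"Timeout requested ({requested}) exceeds maximum "
                f"({max_timeout}s), using {max_timeout}s"
            )
            return max_timeout
        if timeout < 1:
            return default_timeout  # invalid -> fall back to the default
        return timeout

    clamp_timeout(300)   # -> 180
    clamp_timeout(-5)    # -> 60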
@@ -743,12 +1131,44 @@ class ExecuteInstantQuery(BasePrometheusTool):
                     "query": query,
                 }

+                # Check if data should be included based on size
                 if self.toolset.config.tool_calls_return_data:
-
+                    result_data = data.get("data", {})
+
+                    # Estimate the size of the data
+                    data_str_preview = json.dumps(result_data)
+                    data_size_chars = len(data_str_preview)
+
+                    # Provide summary if data is too large
+                    if (
+                        self.toolset.config.query_response_size_limit
+                        and data_size_chars
+                        > self.toolset.config.query_response_size_limit
+                    ):
+                        response_data["data_summary"] = (
+                            create_data_summary_for_large_result(
+                                result_data,
+                                query,
+                                data_size_chars,
+                                is_range_query=False,
+                            )
+                        )
+                        logging.info(
+                            f"Prometheus instant query returned large dataset: "
+                            f"{response_data['data_summary'].get('result_count', 0)} results, "
+                            f"{data_size_chars:,} characters (limit: {self.toolset.config.query_response_size_limit:,}). "
+                            f"Returning summary instead of full data."
+                        )
+                        # Also add character info to the summary for debugging
+                        response_data["data_summary"]["_debug_info"] = (
+                            f"Data size: {data_size_chars:,} chars exceeded limit of {self.toolset.config.query_response_size_limit:,} chars"
+                        )
+                    else:
+                        response_data["data"] = result_data

                 data_str = json.dumps(response_data, indent=2)
                 return StructuredToolResult(
-                    status=
+                    status=StructuredToolResultStatus.SUCCESS,
                     data=data_str,
                     params=params,
                 )
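The size gate serializes the query result once, measures it in characters, and substitutes a summary when the payload would exceed query_response_size_limit; otherwise the raw data passes through unchanged. Distilled, with summarize() standing in for create_data_summary_for_large_result (defined elsewhere in this module):

    import json

    def gate_result(result_data, size_limit, summarize):
        preview = json.dumps(result_data)  # serialize once to estimate size
        if size_limit and len(preview) > size_limit:
            # Too large to hand back to the model: return a summary instead.
            return {"data_summary": summarize(result_data)}
        return {"data": result_data}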
@@ -764,14 +1184,14 @@ class ExecuteInstantQuery(BasePrometheusTool):
                 except json.JSONDecodeError:
                     pass
                 return StructuredToolResult(
-                    status=
+                    status=StructuredToolResultStatus.ERROR,
                     error=f"Query execution failed. HTTP {response.status_code}: {error_msg}",
                     params=params,
                 )

             # For other status codes, just return the status code and content
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.ERROR,
                 error=f"Query execution failed with unexpected status code: {response.status_code}. Response: {str(response.content)}",
                 params=params,
             )
@@ -779,14 +1199,14 @@ class ExecuteInstantQuery(BasePrometheusTool):
         except RequestException as e:
             logging.info("Failed to connect to Prometheus", exc_info=True)
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.ERROR,
                 error=f"Connection error to Prometheus: {str(e)}",
                 params=params,
             )
         except Exception as e:
             logging.info("Failed to connect to Prometheus", exc_info=True)
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.ERROR,
                 error=f"Unexpected error executing query: {str(e)}",
                 params=params,
             )
@@ -800,7 +1220,12 @@ class ExecuteRangeQuery(BasePrometheusTool):
     def __init__(self, toolset: "PrometheusToolset"):
         super().__init__(
             name="execute_prometheus_range_query",
-            description=
+            description=(
+                f"Generates a graph and Execute a PromQL range query. "
+                f"Default timeout is {DEFAULT_QUERY_TIMEOUT_SECONDS} seconds "
+                f"but can be increased up to {MAX_QUERY_TIMEOUT_SECONDS} seconds for complex/slow queries. "
+                f"Default time range is last 1 hour."
+            ),
             parameters={
                 "query": ToolParameter(
                     description="The PromQL query",
@@ -827,13 +1252,32 @@ class ExecuteRangeQuery(BasePrometheusTool):
                 "step": ToolParameter(
                     description="Query resolution step width in duration format or float number of seconds",
                     type="number",
-                    required=
+                    required=False,
                 ),
                 "output_type": ToolParameter(
                     description="Specifies how to interpret the Prometheus result. Use 'Plain' for raw values, 'Bytes' to format byte values, 'Percentage' to scale 0–1 values into 0–100%, or 'CPUUsage' to convert values to cores (e.g., 500 becomes 500m, 2000 becomes 2).",
                     type="string",
                     required=True,
                 ),
+                "timeout": ToolParameter(
+                    description=(
+                        f"Query timeout in seconds. Default: {DEFAULT_QUERY_TIMEOUT_SECONDS}. "
+                        f"Maximum: {MAX_QUERY_TIMEOUT_SECONDS}. "
+                        f"Increase for complex queries that may take longer."
+                    ),
+                    type="number",
+                    required=False,
+                ),
+                "max_points": ToolParameter(
+                    description=(
+                        f"Maximum number of data points to return. Default: {int(MAX_GRAPH_POINTS)}. "
+                        f"Can be reduced to get fewer data points (e.g., 50 for simpler graphs). "
+                        f"Cannot exceed system limit of {int(MAX_GRAPH_POINTS)}. "
+                        f"If your query would return more points than this limit, the step will be automatically adjusted."
+                    ),
+                    type="number",
+                    required=False,
+                ),
             },
             toolset=toolset,
         )
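Together, timeout and max_points let a caller trade fidelity for speed and payload size on range queries. A hypothetical set of tool-call arguments (values are illustrative; start/end are omitted so the tool falls back to its default last-hour window):

    # Hypothetical arguments for execute_prometheus_range_query.
    params = {
        "query": "avg(container_memory_working_set_bytes) by (pod)",
        "step": 60,             # seconds; widened automatically if needed
        "output_type": "Bytes",
        "timeout": 90,          # seconds
        "max_points": 50,       # coarser graph, smaller response
    }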
@@ -843,7 +1287,7 @@ class ExecuteRangeQuery(BasePrometheusTool):
     ) -> StructuredToolResult:
         if not self.toolset.config or not self.toolset.config.prometheus_url:
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.ERROR,
                 error="Prometheus is not configured. Prometheus URL is missing",
                 params=params,
             )
@@ -857,12 +1301,17 @@ class ExecuteRangeQuery(BasePrometheusTool):
                 end_timestamp=params.get("end"),
                 default_time_span_seconds=DEFAULT_GRAPH_TIME_SPAN_SECONDS,
             )
-            step = params.get("step"
+            step = parse_duration_to_seconds(params.get("step"))
+            max_points = params.get(
+                "max_points"
+            )  # Get the optional max_points parameter

+            # adjust_step_for_max_points handles None case and converts to float
             step = adjust_step_for_max_points(
                 start_timestamp=start,
                 end_timestamp=end,
-                step=
+                step=step,
+                max_points_override=max_points,
             )

             description = params.get("description", "")
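adjust_step_for_max_points itself is not shown in this hunk, but the arithmetic it implies is simple: a window of (end - start) seconds sampled every step seconds yields about (end - start) / step points, so the step must be widened whenever that count would exceed the cap. A sketch under that assumption (MAX_GRAPH_POINTS and the function body are presumed from the call site above, not taken from the package):

    MAX_GRAPH_POINTS = 300  # stand-in for the module-level constant

    def adjust_step_for_max_points(start_timestamp, end_timestamp, step=None,
                                   max_points_override=None):
        # Never allow the override to exceed the system cap.
        cap = min(max_points_override or MAX_GRAPH_POINTS, MAX_GRAPH_POINTS)
        span = end_timestamp - start_timestamp
        if not step:  # handles the None case mentioned in the diff's comment
            return span / cap
        if span / step > cap:  # too many points: widen the step
            return span / cap
        return float(step)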
@@ -874,12 +1323,24 @@ class ExecuteRangeQuery(BasePrometheusTool):
                 "step": step,
             }

+            # Get timeout parameter and enforce limits
+            default_timeout = self.toolset.config.default_query_timeout_seconds
+            max_timeout = self.toolset.config.max_query_timeout_seconds
+            timeout = params.get("timeout", default_timeout)
+            if timeout > max_timeout:
+                timeout = max_timeout
+                logging.warning(
+                    f"Timeout requested ({params.get('timeout')}) exceeds maximum ({max_timeout}s), using {max_timeout}s"
+                )
+            elif timeout < 1:
+                timeout = default_timeout  # Min 1 second, but use default if invalid
+
             response = do_request(
                 config=self.toolset.config,
                 url=url,
                 headers=self.toolset.config.headers,
                 data=payload,
-                timeout=
+                timeout=timeout,
                 verify=self.toolset.config.prometheus_ssl_enabled,
                 method="POST",
             )
@@ -906,12 +1367,42 @@ class ExecuteRangeQuery(BasePrometheusTool):
                     "output_type": output_type,
                 }

+                # Check if data should be included based on size
                 if self.toolset.config.tool_calls_return_data:
-
+                    result_data = data.get("data", {})
+
+                    # Estimate the size of the data
+                    data_str_preview = json.dumps(result_data)
+                    data_size_chars = len(data_str_preview)
+
+                    # Provide summary if data is too large
+                    if (
+                        self.toolset.config.query_response_size_limit
+                        and data_size_chars
+                        > self.toolset.config.query_response_size_limit
+                    ):
+                        response_data["data_summary"] = (
+                            create_data_summary_for_large_result(
+                                result_data, query, data_size_chars, is_range_query=True
+                            )
+                        )
+                        logging.info(
+                            f"Prometheus range query returned large dataset: "
+                            f"{response_data['data_summary'].get('series_count', 0)} series, "
+                            f"{data_size_chars:,} characters (limit: {self.toolset.config.query_response_size_limit:,}). "
+                            f"Returning summary instead of full data."
+                        )
+                        # Also add character info to the summary for debugging
+                        response_data["data_summary"]["_debug_info"] = (
+                            f"Data size: {data_size_chars:,} chars exceeded limit of {self.toolset.config.query_response_size_limit:,} chars"
+                        )
+                    else:
+                        response_data["data"] = result_data
+
                 data_str = json.dumps(response_data, indent=2)

                 return StructuredToolResult(
-                    status=
+                    status=StructuredToolResultStatus.SUCCESS,
                     data=data_str,
                     params=params,
                 )
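The range-query branch reuses the instant-query gating verbatim apart from is_range_query=True and the log line counting series instead of results. From the keys this diff reads and writes (series_count, result_count, _debug_info), the summarized payload has roughly this shape (values illustrative; any further fields of the summary are not visible in this diff):

    # Illustrative response_data once the size limit has been exceeded.
    response_data = {
        "data_summary": {
            "series_count": 42,    # logged for range queries
            # "result_count": 17,  # the instant-query variant logs this key
            "_debug_info": "Data size: 1,234,567 chars exceeded limit of 500,000 chars",
        },
    }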
@@ -926,13 +1417,13 @@ class ExecuteRangeQuery(BasePrometheusTool):
                 except json.JSONDecodeError:
                     pass
                 return StructuredToolResult(
-                    status=
+                    status=StructuredToolResultStatus.ERROR,
                     error=f"Query execution failed. HTTP {response.status_code}: {error_msg}",
                     params=params,
                 )

             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.ERROR,
                 error=f"Query execution failed with unexpected status code: {response.status_code}. Response: {str(response.content)}",
                 params=params,
             )
@@ -940,14 +1431,14 @@ class ExecuteRangeQuery(BasePrometheusTool):
         except RequestException as e:
             logging.info("Failed to connect to Prometheus", exc_info=True)
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.ERROR,
                 error=f"Connection error to Prometheus: {str(e)}",
                 params=params,
             )
         except Exception as e:
             logging.info("Failed to connect to Prometheus", exc_info=True)
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.ERROR,
                 error=f"Unexpected error executing query: {str(e)}",
                 params=params,
             )
@@ -969,7 +1460,11 @@ class PrometheusToolset(Toolset):
             prerequisites=[CallablePrerequisite(callable=self.prerequisites_callable)],
             tools=[
                 ListPrometheusRules(toolset=self),
-
+                GetMetricNames(toolset=self),
+                GetLabelValues(toolset=self),
+                GetAllLabels(toolset=self),
+                GetSeries(toolset=self),
+                GetMetricMetadata(toolset=self),
                 ExecuteInstantQuery(toolset=self),
                 ExecuteRangeQuery(toolset=self),
             ],
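Five discovery tools join ListPrometheusRules and the two query tools. Their class names suggest a one-to-one mapping onto the standard Prometheus HTTP API; the mapping below is inferred from those names, not spelled out in this diff:

    # Presumed tool-to-endpoint mapping (standard Prometheus HTTP API paths).
    ENDPOINTS = {
        "GetMetricNames":    "/api/v1/label/__name__/values",
        "GetLabelValues":    "/api/v1/label/<label>/values",
        "GetAllLabels":      "/api/v1/labels",
        "GetSeries":         "/api/v1/series",
        "GetMetricMetadata": "/api/v1/metadata",
    }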
@@ -1060,13 +1555,8 @@ class PrometheusToolset(Toolset):
                     f"Failed to connect to Prometheus at {url}: HTTP {response.status_code}",
                 )

-        except RequestException:
-            return (
-                False,
-                f"Failed to initialize using url={url}",
-            )
         except Exception as e:
-            logging.exception("Failed to initialize Prometheus")
+            logging.exception("Failed to initialize Prometheus", exc_info=True)
             return (
                 False,
                 f"Failed to initialize using url={url}. Unexpected error: {str(e)}",