holmesgpt 0.14.1a0__py3-none-any.whl → 0.14.3a0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of holmesgpt might be problematic.
- holmes/__init__.py +1 -1
- holmes/clients/robusta_client.py +5 -2
- holmes/common/env_vars.py +8 -2
- holmes/config.py +4 -7
- holmes/core/conversations.py +12 -2
- holmes/core/feedback.py +191 -0
- holmes/core/llm.py +52 -10
- holmes/core/models.py +101 -1
- holmes/core/supabase_dal.py +23 -9
- holmes/core/tool_calling_llm.py +206 -16
- holmes/core/tools.py +20 -7
- holmes/core/tools_utils/token_counting.py +13 -0
- holmes/core/tools_utils/tool_context_window_limiter.py +45 -23
- holmes/core/tools_utils/tool_executor.py +11 -6
- holmes/core/toolset_manager.py +7 -3
- holmes/core/truncation/dal_truncation_utils.py +23 -0
- holmes/interactive.py +146 -14
- holmes/plugins/prompts/_fetch_logs.jinja2 +13 -1
- holmes/plugins/runbooks/__init__.py +6 -1
- holmes/plugins/toolsets/__init__.py +11 -4
- holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +9 -20
- holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +2 -3
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +2 -3
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +6 -4
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +6 -4
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +2 -3
- holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +6 -4
- holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +2 -3
- holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +2 -3
- holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +2 -3
- holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +2 -3
- holmes/plugins/toolsets/bash/bash_toolset.py +4 -7
- holmes/plugins/toolsets/cilium.yaml +284 -0
- holmes/plugins/toolsets/datadog/datadog_api.py +490 -24
- holmes/plugins/toolsets/datadog/datadog_logs_instructions.jinja2 +21 -10
- holmes/plugins/toolsets/datadog/toolset_datadog_general.py +333 -199
- holmes/plugins/toolsets/datadog/toolset_datadog_logs.py +181 -9
- holmes/plugins/toolsets/datadog/toolset_datadog_metrics.py +80 -22
- holmes/plugins/toolsets/datadog/toolset_datadog_rds.py +5 -8
- holmes/plugins/toolsets/datadog/toolset_datadog_traces.py +7 -12
- holmes/plugins/toolsets/git.py +14 -12
- holmes/plugins/toolsets/grafana/grafana_tempo_api.py +23 -42
- holmes/plugins/toolsets/grafana/toolset_grafana.py +2 -3
- holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +2 -1
- holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +21 -39
- holmes/plugins/toolsets/internet/internet.py +2 -3
- holmes/plugins/toolsets/internet/notion.py +2 -3
- holmes/plugins/toolsets/investigator/core_investigation.py +7 -9
- holmes/plugins/toolsets/kafka.py +7 -18
- holmes/plugins/toolsets/logging_utils/logging_api.py +80 -4
- holmes/plugins/toolsets/mcp/toolset_mcp.py +2 -3
- holmes/plugins/toolsets/newrelic/__init__.py +0 -0
- holmes/plugins/toolsets/newrelic/new_relic_api.py +125 -0
- holmes/plugins/toolsets/newrelic/newrelic.jinja2 +41 -0
- holmes/plugins/toolsets/newrelic/newrelic.py +211 -0
- holmes/plugins/toolsets/opensearch/opensearch.py +5 -12
- holmes/plugins/toolsets/opensearch/opensearch_traces.py +3 -6
- holmes/plugins/toolsets/prometheus/prometheus.py +808 -419
- holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2 +27 -11
- holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +3 -6
- holmes/plugins/toolsets/robusta/robusta.py +4 -9
- holmes/plugins/toolsets/runbook/runbook_fetcher.py +93 -13
- holmes/plugins/toolsets/servicenow/servicenow.py +5 -10
- holmes/utils/sentry_helper.py +1 -1
- holmes/utils/stream.py +22 -7
- holmes/version.py +34 -14
- {holmesgpt-0.14.1a0.dist-info → holmesgpt-0.14.3a0.dist-info}/METADATA +7 -9
- {holmesgpt-0.14.1a0.dist-info → holmesgpt-0.14.3a0.dist-info}/RECORD +71 -65
- holmes/core/tools_utils/data_types.py +0 -81
- holmes/plugins/toolsets/newrelic.py +0 -231
- {holmesgpt-0.14.1a0.dist-info → holmesgpt-0.14.3a0.dist-info}/LICENSE.txt +0 -0
- {holmesgpt-0.14.1a0.dist-info → holmesgpt-0.14.3a0.dist-info}/WHEEL +0 -0
- {holmesgpt-0.14.1a0.dist-info → holmesgpt-0.14.3a0.dist-info}/entry_points.txt +0 -0
@@ -1,10 +1,9 @@
 import json
 import logging
 import os
-import re
 import time
 import dateutil.parser
-from typing import Any, Dict,
+from typing import Any, Dict, Optional, Tuple, Type, Union
 from urllib.parse import urljoin
 
 import requests  # type: ignore
@@ -16,11 +15,14 @@ from holmes.core.tools import (
     CallablePrerequisite,
     StructuredToolResult,
     Tool,
+    ToolInvokeContext,
     ToolParameter,
     StructuredToolResultStatus,
     Toolset,
     ToolsetTag,
 )
+from holmes.core.tools_utils.token_counting import count_tool_response_tokens
+from holmes.core.tools_utils.tool_context_window_limiter import get_pct_token_count
 from holmes.plugins.toolsets.consts import STANDARD_END_DATETIME_TOOL_PARAM_DESCRIPTION
 from holmes.plugins.toolsets.prometheus.utils import parse_duration_to_seconds
 from holmes.plugins.toolsets.service_discovery import PrometheusDiscovery
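The two new imports above are used further down to budget tool output by tokens. A minimal sketch of how they combine, assuming a ToolInvokeContext that exposes `llm` and `max_token_count` as shown later in this diff (the helper name here is hypothetical):

```python
from holmes.core.tools_utils.token_counting import count_tool_response_tokens
from holmes.core.tools_utils.tool_context_window_limiter import get_pct_token_count


def effective_token_limit(context, size_limit_pct):
    # Start from the per-tool budget, then optionally tighten it to a
    # percentage of the model's context window.
    limit = context.max_token_count
    if size_limit_pct:
        pct_limit = get_pct_token_count(
            percent_of_total_context_window=size_limit_pct, llm=context.llm
        )
        limit = min(limit, pct_limit)
    return limit
```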
@@ -39,25 +41,59 @@ from holmes.plugins.toolsets.logging_utils.logging_api import (
 from holmes.utils.keygen_utils import generate_random_key
 
 PROMETHEUS_RULES_CACHE_KEY = "cached_prometheus_rules"
+PROMETHEUS_METADATA_API_LIMIT = 100  # Default limit for Prometheus metadata APIs (series, labels, metadata) to prevent overwhelming responses
+# Default timeout values for PromQL queries
+DEFAULT_QUERY_TIMEOUT_SECONDS = 20
+MAX_QUERY_TIMEOUT_SECONDS = 180
+# Default timeout for metadata API calls (discovery endpoints)
+DEFAULT_METADATA_TIMEOUT_SECONDS = 20
+MAX_METADATA_TIMEOUT_SECONDS = 60
+# Default time window for metadata APIs (in hours)
+DEFAULT_METADATA_TIME_WINDOW_HRS = 1
 
 
 class PrometheusConfig(BaseModel):
     # URL is optional because it can be set with an env var
     prometheus_url: Optional[str]
     healthcheck: str = "-/healthy"
-
-
-    #
-
-
-
+
+    # New config for default time window for metadata APIs
+    default_metadata_time_window_hrs: int = DEFAULT_METADATA_TIME_WINDOW_HRS  # Default: only show metrics active in the last hour
+
+    # Query timeout configuration
+    default_query_timeout_seconds: int = (
+        DEFAULT_QUERY_TIMEOUT_SECONDS  # Default timeout for PromQL queries
+    )
+    max_query_timeout_seconds: int = (
+        MAX_QUERY_TIMEOUT_SECONDS  # Maximum allowed timeout for PromQL queries
+    )
+
+    # Metadata API timeout configuration
+    default_metadata_timeout_seconds: int = (
+        DEFAULT_METADATA_TIMEOUT_SECONDS  # Default timeout for metadata/discovery APIs
+    )
+    max_metadata_timeout_seconds: int = (
+        MAX_METADATA_TIMEOUT_SECONDS  # Maximum allowed timeout for metadata APIs
+    )
+
+    # DEPRECATED: These config values are deprecated and will be removed in a future version
+    # Using None as default so we can detect if user explicitly set them
+    metrics_labels_time_window_hrs: Optional[int] = (
+        None  # DEPRECATED - use default_metadata_time_window_hrs instead
+    )
+    metrics_labels_cache_duration_hrs: Optional[int] = (
+        None  # DEPRECATED - no longer used
+    )
+    fetch_labels_with_labels_api: Optional[bool] = None  # DEPRECATED - no longer used
+    fetch_metadata_with_series_api: Optional[bool] = None  # DEPRECATED - no longer used
+
     tool_calls_return_data: bool = True
     headers: Dict = Field(default_factory=dict)
-    rules_cache_duration_seconds:
+    rules_cache_duration_seconds: Optional[int] = 1800  # 30 minutes
     additional_labels: Optional[Dict[str, str]] = None
     prometheus_ssl_enabled: bool = True
-
-
+    query_response_size_limit_pct: Optional[int] = (
+        2  # Limit the max number of tokens that a query result can take to proactively prevent token limit issues. Expressed in % of the model's context window
     )
 
     @field_validator("prometheus_url")
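A sketch of the new configuration surface; the field names come from the diff above, while the URL and values are illustrative only:

```python
config = PrometheusConfig(
    prometheus_url="http://prometheus.example:9090",  # illustrative URL
    default_metadata_time_window_hrs=2,   # look back 2h when listing metrics/labels
    default_query_timeout_seconds=30,     # per-query default timeout
    max_query_timeout_seconds=120,        # ceiling for the new `timeout` tool parameter
    query_response_size_limit_pct=2,      # cap query results at ~2% of the context window
)
```

The old `metrics_labels_*` and `fetch_*` options are still accepted but only trigger the deprecation warning added in the validator below.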
@@ -68,6 +104,26 @@ class PrometheusConfig(BaseModel):
 
     @model_validator(mode="after")
     def validate_prom_config(self):
+        # Check for deprecated config values and print warnings
+        deprecated_configs = []
+        if self.metrics_labels_time_window_hrs is not None:  # Check if explicitly set
+            deprecated_configs.append(
+                "metrics_labels_time_window_hrs (use default_metadata_time_window_hrs instead)"
+            )
+        if (
+            self.metrics_labels_cache_duration_hrs is not None
+        ):  # Check if explicitly set
+            deprecated_configs.append("metrics_labels_cache_duration_hrs")
+        if self.fetch_labels_with_labels_api is not None:  # Check if explicitly set
+            deprecated_configs.append("fetch_labels_with_labels_api")
+        if self.fetch_metadata_with_series_api is not None:  # Check if explicitly set
+            deprecated_configs.append("fetch_metadata_with_series_api")
+
+        if deprecated_configs:
+            logging.warning(
+                f"WARNING: The following Prometheus config values are deprecated and will be removed in a future version: "
+                f"{', '.join(deprecated_configs)}. These configs no longer affect behavior."
+            )
         # If openshift is enabled, and the user didn't configure auth headers, we will try to load the token from the service account.
         if IS_OPENSHIFT:
             if self.healthcheck == "-/healthy":
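As a usage note, setting one of the deprecated fields still validates; the model only logs a warning once it is built. A minimal sketch (values illustrative):

```python
cfg = PrometheusConfig(
    prometheus_url="http://prometheus.example:9090",
    metrics_labels_cache_duration_hrs=1,  # deprecated, no longer affects behavior
)
# Approximate log output:
# WARNING: The following Prometheus config values are deprecated and will be removed
# in a future version: metrics_labels_cache_duration_hrs. These configs no longer affect behavior.
```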
@@ -164,6 +220,8 @@ def do_request(
 
     if isinstance(config, AMPConfig):
         client = config.get_aws_client()  # cached AWSPrometheusConnect
+        # Note: timeout parameter is not supported by prometrix's signed_request
+        # AWS/AMP requests will not respect the timeout setting
         return client.signed_request(  # type: ignore
             method=method,
             url=url,
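The comment above only concerns the AMP branch; for plain HTTP, `do_request` presumably forwards the timeout to the underlying HTTP client, roughly as in this hedged sketch (not the actual implementation):

```python
import requests


def plain_http_request(method, url, headers, params, data, timeout, verify):
    # Non-AMP path: requests honors per-call timeouts, unlike prometrix signed requests.
    return requests.request(
        method=method, url=url, headers=headers,
        params=params, data=data, timeout=timeout, verify=verify,
    )
```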
@@ -185,99 +243,6 @@ def do_request(
     )
 
 
-def filter_metrics_by_type(metrics: Dict, expected_type: str):
-    return {
-        metric_name: metric_data
-        for metric_name, metric_data in metrics.items()
-        if expected_type in metric_data.get("type", "")
-        or metric_data.get("type", "") == "?"
-    }
-
-
-def filter_metrics_by_name(metrics: Dict, pattern: str) -> Dict:
-    regex = re.compile(pattern)
-    return {
-        metric_name: metric_data
-        for metric_name, metric_data in metrics.items()
-        if regex.search(metric_name)
-    }
-
-
-METRICS_SUFFIXES_TO_STRIP = ["_bucket", "_count", "_sum"]
-
-
-def fetch_metadata(
-    prometheus_url: str,
-    headers: Optional[Dict],
-    config,
-    verify_ssl: bool = True,
-) -> Dict:
-    metadata_url = urljoin(prometheus_url, "api/v1/metadata")
-    metadata_response = do_request(
-        config=config,
-        url=metadata_url,
-        headers=headers,
-        timeout=60,
-        verify=verify_ssl,
-        method="GET",
-    )
-    metadata_response.raise_for_status()
-
-    metadata = metadata_response.json()["data"]
-
-    metrics = {}
-    for metric_name, meta_list in metadata.items():
-        if meta_list:
-            metric_type = meta_list[0].get("type", "unknown")
-            metric_description = meta_list[0].get("help", "unknown")
-            metrics[metric_name] = {
-                "type": metric_type,
-                "description": metric_description,
-                "labels": set(),
-            }
-
-    return metrics
-
-
-def fetch_metadata_with_series_api(
-    prometheus_url: str,
-    metric_name: str,
-    headers: Dict,
-    config,
-    verify_ssl: bool = True,
-) -> Dict:
-    url = urljoin(prometheus_url, "api/v1/series")
-    params: Dict = {"match[]": f'{{__name__=~".*{metric_name}.*"}}', "limit": "10000"}
-
-    response = do_request(
-        config=config,
-        url=url,
-        headers=headers,
-        params=params,
-        timeout=60,
-        verify=verify_ssl,
-        method="GET",
-    )
-    response.raise_for_status()
-    metrics = response.json()["data"]
-
-    metadata: Dict = {}
-    for metric_data in metrics:
-        metric_name = metric_data.get("__name__")
-        if not metric_name:
-            continue
-
-        metric = metadata.get(metric_name)
-        if not metric:
-            metric = {"description": "?", "type": "?", "labels": set()}
-            metadata[metric_name] = metric
-
-        labels = {k for k in metric_data.keys() if k != "__name__"}
-        metric["labels"].update(labels)
-
-    return metadata
-
-
 def result_has_data(result: Dict) -> bool:
     data = result.get("data", {})
     if len(data.get("result", [])) > 0:
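The helpers removed above are replaced later in this diff by tools that call the standard Prometheus HTTP API directly with a small result limit. A minimal sketch of that direction (a hypothetical standalone function; the limit mirrors PROMETHEUS_METADATA_API_LIMIT):

```python
import requests
from urllib.parse import urljoin


def list_metric_names(prometheus_url: str, match: str, limit: int = 100, timeout: int = 20):
    # /api/v1/label/__name__/values is the cheapest way to enumerate metric names.
    url = urljoin(prometheus_url, "api/v1/label/__name__/values")
    resp = requests.get(url, params={"match[]": match, "limit": str(limit)}, timeout=timeout)
    resp.raise_for_status()
    return resp.json()["data"]
```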
@@ -289,19 +254,36 @@ def adjust_step_for_max_points(
     start_timestamp: str,
     end_timestamp: str,
     step: Optional[float] = None,
+    max_points_override: Optional[float] = None,
 ) -> float:
     """
     Adjusts the step parameter to ensure the number of data points doesn't exceed max_points.
-    Max points is controlled by the PROMETHEUS_MAX_GRAPH_POINTS environment variable (default: 300).
 
     Args:
         start_timestamp: RFC3339 formatted start time
         end_timestamp: RFC3339 formatted end time
         step: The requested step duration in seconds (None for auto-calculation)
+        max_points_override: Optional override for max points (must be <= MAX_GRAPH_POINTS)
 
     Returns:
         Adjusted step value in seconds that ensures points <= max_points
     """
+    # Use override if provided and valid, otherwise use default
+    max_points = MAX_GRAPH_POINTS
+    if max_points_override is not None:
+        if max_points_override > MAX_GRAPH_POINTS:
+            logging.warning(
+                f"max_points override ({max_points_override}) exceeds system limit ({MAX_GRAPH_POINTS}), using {MAX_GRAPH_POINTS}"
+            )
+            max_points = MAX_GRAPH_POINTS
+        elif max_points_override < 1:
+            logging.warning(
+                f"max_points override ({max_points_override}) is invalid, using default {MAX_GRAPH_POINTS}"
+            )
+            max_points = MAX_GRAPH_POINTS
+        else:
+            max_points = max_points_override
+            logging.debug(f"Using max_points override: {max_points}")
 
     start_dt = dateutil.parser.parse(start_timestamp)
     end_dt = dateutil.parser.parse(end_timestamp)
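A worked example of the adjustment, assuming the default cap of 300 points that the removed docstring mentioned for PROMETHEUS_MAX_GRAPH_POINTS:

```python
time_range_seconds = 6 * 3600                   # a 6-hour query window
step = 30.0                                     # requested step
current_points = time_range_seconds / step      # 720 points, above the 300 cap
if current_points > 300:
    step = time_range_seconds / 300             # adjusted step: 72.0 seconds
```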
@@ -319,10 +301,10 @@ def adjust_step_for_max_points(
     current_points = time_range_seconds / step
 
     # If current points exceed max, adjust the step
-    if current_points >
-        adjusted_step = time_range_seconds /
+    if current_points > max_points:
+        adjusted_step = time_range_seconds / max_points
         logging.info(
-            f"Adjusting step from {step}s to {adjusted_step}s to limit points from {current_points:.0f} to {
+            f"Adjusting step from {step}s to {adjusted_step}s to limit points from {current_points:.0f} to {max_points}"
         )
         return adjusted_step
 
@@ -337,7 +319,7 @@ def add_prometheus_auth(prometheus_auth_header: Optional[str]) -> Dict[str, Any]
 
 
 def create_data_summary_for_large_result(
-    result_data: Dict, query: str,
+    result_data: Dict, query: str, data_size_tokens: int, is_range_query: bool = False
 ) -> Dict[str, Any]:
     """
     Create a summary for large Prometheus results instead of returning full data.
@@ -345,7 +327,7 @@ def create_data_summary_for_large_result(
     Args:
         result_data: The Prometheus data result
         query: The original PromQL query
-
+        data_size_tokens: Size of the data in tokens
         is_range_query: Whether this is a range query (vs instant query)
 
     Returns:
@@ -355,32 +337,36 @@ def create_data_summary_for_large_result(
         series_list = result_data.get("result", [])
         num_items = len(series_list)
 
-        # Calculate
+        # Calculate exact total data points across all series
         total_points = 0
-        for series in series_list
+        for series in series_list:  # Iterate through ALL series for exact count
            points = len(series.get("values", []))
            total_points += points
 
-
-
+        # Analyze label keys and their cardinality
+        label_cardinality: Dict[str, set] = {}
+        for series in series_list:
+            metric = series.get("metric", {})
+            for label_key, label_value in metric.items():
+                if label_key not in label_cardinality:
+                    label_cardinality[label_key] = set()
+                label_cardinality[label_key].add(label_value)
+
+        # Convert sets to counts for the summary
+        label_summary = {
+            label: len(values) for label, values in label_cardinality.items()
+        }
+        # Sort by cardinality (highest first) for better insights
+        label_summary = dict(
+            sorted(label_summary.items(), key=lambda x: x[1], reverse=True)
        )
-        estimated_total_points = avg_points_per_series * num_items
-
-        # Create a sample of just the metadata (labels) without values
-        sample_metrics = []
-        for series in series_list[:10]:  # Sample first 10 series
-            sample_metrics.append(series.get("metric", {}))
-
-        sample_json = json.dumps(sample_metrics, indent=2)
-        if len(sample_json) > 2000:
-            sample_json = sample_json[:2000] + "\n... (truncated)"
 
         return {
-            "message": f"Data too large to return ({
+            "message": f"Data too large to return ({data_size_tokens:,} tokens). Query returned {num_items} time series with {total_points:,} total data points.",
             "series_count": num_items,
-            "
-            "
-            "
+            "total_data_points": total_points,
+            "data_size_tokens": data_size_tokens,
+            "label_cardinality": label_summary,
             "suggestion": f'Consider using topk({min(5, num_items)}, {query}) to limit results to the top {min(5, num_items)} series. To also capture remaining data as \'other\': topk({min(5, num_items)}, {query}) or label_replace((sum({query}) - sum(topk({min(5, num_items)}, {query}))), "pod", "other", "", "")',
         }
     else:
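A toy illustration of the cardinality summary built above:

```python
series_list = [
    {"metric": {"pod": "api-1", "namespace": "prod"}, "values": [[0, "1"], [60, "2"]]},
    {"metric": {"pod": "api-2", "namespace": "prod"}, "values": [[0, "3"]]},
]
cardinality: dict = {}
for series in series_list:
    for key, value in series.get("metric", {}).items():
        cardinality.setdefault(key, set()).add(value)
label_summary = {k: len(v) for k, v in sorted(cardinality.items(), key=lambda kv: -len(kv[1]))}
# label_summary == {"pod": 2, "namespace": 1}; total data points == 3
```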
@@ -389,196 +375,77 @@ def create_data_summary_for_large_result(
         result_list = result_data.get("result", [])
         num_items = len(result_list)
 
-        #
-
-        for item in result_list
+        # Analyze label keys and their cardinality
+        instant_label_cardinality: Dict[str, set] = {}
+        for item in result_list:
             if isinstance(item, dict):
-
-
-
-
-
+                metric = item.get("metric", {})
+                for label_key, label_value in metric.items():
+                    if label_key not in instant_label_cardinality:
+                        instant_label_cardinality[label_key] = set()
+                    instant_label_cardinality[label_key].add(label_value)
+
+        # Convert sets to counts for the summary
+        label_summary = {
+            label: len(values) for label, values in instant_label_cardinality.items()
+        }
+        # Sort by cardinality (highest first) for better insights
+        label_summary = dict(
+            sorted(label_summary.items(), key=lambda x: x[1], reverse=True)
+        )
 
         return {
-            "message": f"Data too large to return ({
+            "message": f"Data too large to return ({data_size_tokens:,} tokens). Query returned {num_items} results.",
             "result_count": num_items,
             "result_type": result_type,
-            "
-            "
+            "data_size_tokens": data_size_tokens,
+            "label_cardinality": label_summary,
             "suggestion": f'Consider using topk({min(5, num_items)}, {query}) to limit results. To also capture remaining data as \'other\': topk({min(5, num_items)}, {query}) or label_replace((sum({query}) - sum(topk({min(5, num_items)}, {query}))), "instance", "other", "", "")',
         }
 
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    if
-
-
-
-
-
-
-
+class MetricsBasedResponse(BaseModel):
+    status: str
+    error_message: Optional[str] = None
+    data: Optional[str] = None
+    random_key: str
+    tool_name: str
+    description: str
+    query: str
+    start: Optional[str] = None
+    end: Optional[str] = None
+    step: Optional[float] = None
+    output_type: Optional[str] = None
+    data_summary: Optional[dict[str, Any]] = None
+
+
+def create_structured_tool_result(
+    params: dict, response: MetricsBasedResponse
+) -> StructuredToolResult:
+    status = StructuredToolResultStatus.SUCCESS
+    if response.error_message or response.status.lower() in ("failed", "error"):
+        status = StructuredToolResultStatus.ERROR
+    elif not response.data:
+        status = StructuredToolResultStatus.NO_DATA
+
+    return StructuredToolResult(
+        status=status,
+        data=response.model_dump_json(indent=2),
         params=params,
-        timeout=60,
-        verify=verify_ssl,
-        method="GET",
     )
-    series_response.raise_for_status()
-    series = series_response.json()["data"]
-
-    metrics_labels: dict = {}
-    for serie in series:
-        metric_name = serie["__name__"]
-        # Add all labels except __name__
-        labels = {k for k in serie.keys() if k != "__name__"}
-        if metric_name in metrics_labels:
-            metrics_labels[metric_name].update(labels)
-        else:
-            metrics_labels[metric_name] = labels
-    if cache:
-        cache.set(cache_key, metrics_labels)
-
-    return metrics_labels
-
-
-def fetch_metrics_labels_with_labels_api(
-    prometheus_url: str,
-    cache: Optional[TTLCache],
-    metrics_labels_time_window_hrs: Union[int, None],
-    metric_names: List[str],
-    headers: Dict,
-    config=None,
-    verify_ssl: bool = True,
-) -> dict:
-    metrics_labels = {}
-
-    for metric_name in metric_names:
-        cache_key = f"metrics_labels_labels_api:{metric_name}"
-        if cache:
-            cached_result = cache.get(cache_key)
-            if cached_result:
-                metrics_labels[metric_name] = cached_result
-
-        url = urljoin(prometheus_url, "api/v1/labels")
-        params: dict = {
-            "match[]": f'{{__name__="{metric_name}"}}',
-        }
-        if metrics_labels_time_window_hrs is not None:
-            params["end"] = int(time.time())
-            params["start"] = params["end"] - (metrics_labels_time_window_hrs * 60 * 60)
-
-        response = do_request(
-            config=config,
-            url=url,
-            headers=headers,
-            params=params,
-            timeout=60,
-            verify=verify_ssl,
-            method="GET",
-        )
-        response.raise_for_status()
-        labels = response.json()["data"]
-        filtered_labels = {label for label in labels if label != "__name__"}
-        metrics_labels[metric_name] = filtered_labels
-
-        if cache:
-            cache.set(cache_key, filtered_labels)
-
-    return metrics_labels
-
-
-def fetch_metrics(
-    prometheus_url: str,
-    cache: Optional[TTLCache],
-    metrics_labels_time_window_hrs: Union[int, None],
-    metric_name: str,
-    should_fetch_labels_with_labels_api: bool,
-    should_fetch_metadata_with_series_api: bool,
-    headers: Dict,
-    config=None,
-    verify_ssl: bool = True,
-) -> dict:
-    metrics = None
-    should_fetch_labels = True
-    if should_fetch_metadata_with_series_api:
-        metrics = fetch_metadata_with_series_api(
-            prometheus_url=prometheus_url,
-            metric_name=metric_name,
-            headers=headers,
-            config=config,
-            verify_ssl=verify_ssl,
-        )
-        should_fetch_labels = False  # series API returns the labels
-    else:
-        metrics = fetch_metadata(
-            prometheus_url=prometheus_url,
-            headers=headers,
-            config=config,
-            verify_ssl=verify_ssl,
-        )
-        metrics = filter_metrics_by_name(metrics, metric_name)
-
-    if should_fetch_labels:
-        metrics_labels = {}
-        if should_fetch_labels_with_labels_api:
-            metrics_labels = fetch_metrics_labels_with_labels_api(
-                prometheus_url=prometheus_url,
-                cache=cache,
-                metrics_labels_time_window_hrs=metrics_labels_time_window_hrs,
-                metric_names=list(metrics.keys()),
-                headers=headers,
-                config=config,
-                verify_ssl=verify_ssl,
-            )
-        else:
-            metrics_labels = fetch_metrics_labels_with_series_api(
-                prometheus_url=prometheus_url,
-                cache=cache,
-                metrics_labels_time_window_hrs=metrics_labels_time_window_hrs,
-                metric_name=metric_name,
-                headers=headers,
-                config=config,
-                verify_ssl=verify_ssl,
-            )
-
-    for metric_name in metrics:
-        if metric_name in metrics_labels:
-            metrics[metric_name]["labels"] = metrics_labels[metric_name]
-
-    return metrics
 
 
 class ListPrometheusRules(BasePrometheusTool):
     def __init__(self, toolset: "PrometheusToolset"):
         super().__init__(
             name="list_prometheus_rules",
-            description="List all defined
+            description="List all defined Prometheus rules (api/v1/rules). Will show the Prometheus rules description, expression and annotations",
             parameters={},
             toolset=toolset,
         )
         self._cache = None
 
-    def _invoke(
-        self, params: dict, user_approved: bool = False
-    ) -> StructuredToolResult:
+    def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
         if not self.toolset.config or not self.toolset.config.prometheus_url:
             return StructuredToolResult(
                 status=StructuredToolResultStatus.ERROR,
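How the new response model maps to a tool-result status, following create_structured_tool_result above (a sketch; the values are illustrative):

```python
ok = MetricsBasedResponse(
    status="success",
    random_key="abc123",
    tool_name="execute_prometheus_instant_query",
    description="demo",
    query="up",
    data='{"resultType": "vector", "result": []}',
)
create_structured_tool_result(params={}, response=ok)  # -> SUCCESS
create_structured_tool_result(params={}, response=ok.model_copy(update={"data": None}))  # -> NO_DATA
create_structured_tool_result(params={}, response=ok.model_copy(update={"error_message": "boom"}))  # -> ERROR
```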
@@ -613,7 +480,7 @@ class ListPrometheusRules(BasePrometheusTool):
                 config=self.toolset.config,
                 url=rules_url,
                 params=params,
-                timeout=
+                timeout=40,
                 verify=self.toolset.config.prometheus_ssl_enabled,
                 headers=self.toolset.config.headers,
                 method="GET",
@@ -654,120 +521,553 @@ class ListPrometheusRules(BasePrometheusTool):
|
|
|
654
521
|
return f"{toolset_name_for_one_liner(self.toolset.name)}: Fetch Rules"
|
|
655
522
|
|
|
656
523
|
|
|
657
|
-
class
|
|
524
|
+
class GetMetricNames(BasePrometheusTool):
|
|
525
|
+
"""Thin wrapper around /api/v1/label/__name__/values - the fastest way to discover metric names"""
|
|
526
|
+
|
|
658
527
|
def __init__(self, toolset: "PrometheusToolset"):
|
|
659
528
|
super().__init__(
|
|
660
|
-
name="
|
|
661
|
-
description=
|
|
529
|
+
name="get_metric_names",
|
|
530
|
+
description=(
|
|
531
|
+
"Get list of metric names using /api/v1/label/__name__/values. "
|
|
532
|
+
"FASTEST method for metric discovery when you need to explore available metrics. "
|
|
533
|
+
f"Returns up to {PROMETHEUS_METADATA_API_LIMIT} unique metric names (limit={PROMETHEUS_METADATA_API_LIMIT}). If {PROMETHEUS_METADATA_API_LIMIT} results returned, more may exist - use a more specific filter. "
|
|
534
|
+
f"ALWAYS use match[] parameter to filter metrics - without it you'll get random {PROMETHEUS_METADATA_API_LIMIT} metrics which is rarely useful. "
|
|
535
|
+
"Note: Does not return metric metadata (type, description, labels). "
|
|
536
|
+
"By default returns metrics active in the last 1 hour (configurable via default_metadata_time_window_hrs)."
|
|
537
|
+
),
|
|
662
538
|
parameters={
|
|
663
|
-
"
|
|
664
|
-
description=
|
|
539
|
+
"match": ToolParameter(
|
|
540
|
+
description=(
|
|
541
|
+
"REQUIRED: PromQL selector to filter metrics. Use regex OR (|) to check multiple patterns in one call - much faster than multiple calls! Examples: "
|
|
542
|
+
"'{__name__=~\"node_cpu.*|node_memory.*|node_disk.*\"}' for all node resource metrics, "
|
|
543
|
+
"'{__name__=~\"container_cpu.*|container_memory.*|container_network.*\"}' for all container metrics, "
|
|
544
|
+
"'{__name__=~\"kube_pod.*|kube_deployment.*|kube_service.*\"}' for multiple Kubernetes object metrics, "
|
|
545
|
+
"'{__name__=~\".*cpu.*|.*memory.*|.*disk.*\"}' for all resource metrics, "
|
|
546
|
+
"'{namespace=~\"kube-system|default|monitoring\"}' for metrics from multiple namespaces, "
|
|
547
|
+
"'{job=~\"prometheus|node-exporter|kube-state-metrics\"}' for metrics from multiple jobs."
|
|
548
|
+
),
|
|
549
|
+
type="string",
|
|
550
|
+
required=True,
|
|
551
|
+
),
|
|
552
|
+
"start": ToolParameter(
|
|
553
|
+
description="Start timestamp (RFC3339 or Unix). Default: 1 hour ago",
|
|
665
554
|
type="string",
|
|
666
555
|
required=False,
|
|
667
556
|
),
|
|
668
|
-
"
|
|
669
|
-
description="
|
|
557
|
+
"end": ToolParameter(
|
|
558
|
+
description="End timestamp (RFC3339 or Unix). Default: now",
|
|
670
559
|
type="string",
|
|
671
|
-
required=
|
|
560
|
+
required=False,
|
|
672
561
|
),
|
|
673
562
|
},
|
|
674
563
|
toolset=toolset,
|
|
675
564
|
)
|
|
676
|
-
self._cache = None
|
|
677
565
|
|
|
678
|
-
def _invoke(
|
|
679
|
-
self, params: dict, user_approved: bool = False
|
|
680
|
-
) -> StructuredToolResult:
|
|
566
|
+
def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
|
|
681
567
|
if not self.toolset.config or not self.toolset.config.prometheus_url:
|
|
682
568
|
return StructuredToolResult(
|
|
683
569
|
status=StructuredToolResultStatus.ERROR,
|
|
684
570
|
error="Prometheus is not configured. Prometheus URL is missing",
|
|
685
571
|
params=params,
|
|
686
572
|
)
|
|
687
|
-
if not self._cache and self.toolset.config.metrics_labels_cache_duration_hrs:
|
|
688
|
-
self._cache = TTLCache(
|
|
689
|
-
self.toolset.config.metrics_labels_cache_duration_hrs * 3600 # type: ignore
|
|
690
|
-
)
|
|
691
573
|
try:
|
|
692
|
-
|
|
693
|
-
|
|
694
|
-
|
|
574
|
+
match_param = params.get("match")
|
|
575
|
+
if not match_param:
|
|
576
|
+
return StructuredToolResult(
|
|
577
|
+
status=StructuredToolResultStatus.ERROR,
|
|
578
|
+
error="Match parameter is required to filter metrics",
|
|
579
|
+
params=params,
|
|
580
|
+
)
|
|
581
|
+
|
|
582
|
+
url = urljoin(
|
|
583
|
+
self.toolset.config.prometheus_url, "api/v1/label/__name__/values"
|
|
584
|
+
)
|
|
585
|
+
query_params = {
|
|
586
|
+
"limit": str(PROMETHEUS_METADATA_API_LIMIT),
|
|
587
|
+
"match[]": match_param,
|
|
588
|
+
}
|
|
589
|
+
|
|
590
|
+
# Add time parameters - use provided values or defaults
|
|
591
|
+
if params.get("end"):
|
|
592
|
+
query_params["end"] = params["end"]
|
|
593
|
+
else:
|
|
594
|
+
query_params["end"] = str(int(time.time()))
|
|
595
|
+
|
|
596
|
+
if params.get("start"):
|
|
597
|
+
query_params["start"] = params["start"]
|
|
598
|
+
elif self.toolset.config.default_metadata_time_window_hrs:
|
|
599
|
+
# Use default time window
|
|
600
|
+
query_params["start"] = str(
|
|
601
|
+
int(time.time())
|
|
602
|
+
- (self.toolset.config.default_metadata_time_window_hrs * 3600)
|
|
603
|
+
)
|
|
604
|
+
|
|
605
|
+
response = do_request(
|
|
606
|
+
config=self.toolset.config,
|
|
607
|
+
url=url,
|
|
608
|
+
params=query_params,
|
|
609
|
+
timeout=self.toolset.config.default_metadata_timeout_seconds,
|
|
610
|
+
verify=self.toolset.config.prometheus_ssl_enabled,
|
|
611
|
+
headers=self.toolset.config.headers,
|
|
612
|
+
method="GET",
|
|
695
613
|
)
|
|
614
|
+
response.raise_for_status()
|
|
615
|
+
data = response.json()
|
|
616
|
+
|
|
617
|
+
# Check if results were truncated
|
|
618
|
+
if (
|
|
619
|
+
"data" in data
|
|
620
|
+
and isinstance(data["data"], list)
|
|
621
|
+
and len(data["data"]) == PROMETHEUS_METADATA_API_LIMIT
|
|
622
|
+
):
|
|
623
|
+
data["_truncated"] = True
|
|
624
|
+
data["_message"] = (
|
|
625
|
+
f"Results truncated at limit={PROMETHEUS_METADATA_API_LIMIT}. Use a more specific match filter to see additional metrics."
|
|
626
|
+
)
|
|
696
627
|
|
|
697
|
-
|
|
698
|
-
|
|
628
|
+
return StructuredToolResult(
|
|
629
|
+
status=StructuredToolResultStatus.SUCCESS,
|
|
630
|
+
data=data,
|
|
631
|
+
params=params,
|
|
632
|
+
)
|
|
633
|
+
except Exception as e:
|
|
634
|
+
return StructuredToolResult(
|
|
635
|
+
status=StructuredToolResultStatus.ERROR,
|
|
636
|
+
error=str(e),
|
|
637
|
+
params=params,
|
|
638
|
+
)
|
|
639
|
+
|
|
640
|
+
def get_parameterized_one_liner(self, params) -> str:
|
|
641
|
+
return f"{toolset_name_for_one_liner(self.toolset.name)}: Get Metric Names"
|
|
642
|
+
|
|
643
|
+
|
|
644
|
+
class GetLabelValues(BasePrometheusTool):
|
|
645
|
+
"""Get values for a specific label across all metrics"""
|
|
646
|
+
|
|
647
|
+
def __init__(self, toolset: "PrometheusToolset"):
|
|
648
|
+
super().__init__(
|
|
649
|
+
name="get_label_values",
|
|
650
|
+
description=(
|
|
651
|
+
"Get all values for a specific label using /api/v1/label/{label}/values. "
|
|
652
|
+
"Use this to discover pods, namespaces, jobs, instances, etc. "
|
|
653
|
+
f"Returns up to {PROMETHEUS_METADATA_API_LIMIT} unique values (limit={PROMETHEUS_METADATA_API_LIMIT}). If {PROMETHEUS_METADATA_API_LIMIT} results returned, more may exist - use match[] to filter. "
|
|
654
|
+
"Supports optional match[] parameter to filter. "
|
|
655
|
+
"By default returns values from metrics active in the last 1 hour (configurable via default_metadata_time_window_hrs)."
|
|
656
|
+
),
|
|
657
|
+
parameters={
|
|
658
|
+
"label": ToolParameter(
|
|
659
|
+
description="Label name to get values for (e.g., 'pod', 'namespace', 'job', 'instance')",
|
|
660
|
+
type="string",
|
|
661
|
+
required=True,
|
|
662
|
+
),
|
|
663
|
+
"match": ToolParameter(
|
|
664
|
+
description=(
|
|
665
|
+
"Optional PromQL selector to filter (e.g., '{__name__=~\"kube.*\"}', "
|
|
666
|
+
"'{namespace=\"default\"}')."
|
|
667
|
+
),
|
|
668
|
+
type="string",
|
|
669
|
+
required=False,
|
|
670
|
+
),
|
|
671
|
+
"start": ToolParameter(
|
|
672
|
+
description="Start timestamp (RFC3339 or Unix). Default: 1 hour ago",
|
|
673
|
+
type="string",
|
|
674
|
+
required=False,
|
|
675
|
+
),
|
|
676
|
+
"end": ToolParameter(
|
|
677
|
+
description="End timestamp (RFC3339 or Unix). Default: now",
|
|
678
|
+
type="string",
|
|
679
|
+
required=False,
|
|
680
|
+
),
|
|
681
|
+
},
|
|
682
|
+
toolset=toolset,
|
|
683
|
+
)
|
|
684
|
+
|
|
685
|
+
def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
|
|
686
|
+
if not self.toolset.config or not self.toolset.config.prometheus_url:
|
|
687
|
+
return StructuredToolResult(
|
|
688
|
+
status=StructuredToolResultStatus.ERROR,
|
|
689
|
+
error="Prometheus is not configured. Prometheus URL is missing",
|
|
690
|
+
params=params,
|
|
691
|
+
)
|
|
692
|
+
try:
|
|
693
|
+
label = params.get("label")
|
|
694
|
+
if not label:
|
|
699
695
|
return StructuredToolResult(
|
|
700
696
|
status=StructuredToolResultStatus.ERROR,
|
|
701
|
-
error="
|
|
697
|
+
error="Label parameter is required",
|
|
702
698
|
params=params,
|
|
703
699
|
)
|
|
704
700
|
|
|
705
|
-
|
|
706
|
-
prometheus_url
|
|
707
|
-
|
|
708
|
-
|
|
709
|
-
|
|
710
|
-
|
|
711
|
-
|
|
712
|
-
|
|
701
|
+
url = urljoin(
|
|
702
|
+
self.toolset.config.prometheus_url, f"api/v1/label/{label}/values"
|
|
703
|
+
)
|
|
704
|
+
query_params = {"limit": str(PROMETHEUS_METADATA_API_LIMIT)}
|
|
705
|
+
if params.get("match"):
|
|
706
|
+
query_params["match[]"] = params["match"]
|
|
707
|
+
|
|
708
|
+
# Add time parameters - use provided values or defaults
|
|
709
|
+
if params.get("end"):
|
|
710
|
+
query_params["end"] = params["end"]
|
|
711
|
+
else:
|
|
712
|
+
query_params["end"] = str(int(time.time()))
|
|
713
|
+
|
|
714
|
+
if params.get("start"):
|
|
715
|
+
query_params["start"] = params["start"]
|
|
716
|
+
elif self.toolset.config.default_metadata_time_window_hrs:
|
|
717
|
+
# Use default time window
|
|
718
|
+
query_params["start"] = str(
|
|
719
|
+
int(time.time())
|
|
720
|
+
- (self.toolset.config.default_metadata_time_window_hrs * 3600)
|
|
721
|
+
)
|
|
722
|
+
|
|
723
|
+
response = do_request(
|
|
713
724
|
config=self.toolset.config,
|
|
714
|
-
|
|
725
|
+
url=url,
|
|
726
|
+
params=query_params,
|
|
727
|
+
timeout=self.toolset.config.default_metadata_timeout_seconds,
|
|
728
|
+
verify=self.toolset.config.prometheus_ssl_enabled,
|
|
729
|
+
headers=self.toolset.config.headers,
|
|
730
|
+
method="GET",
|
|
715
731
|
)
|
|
732
|
+
response.raise_for_status()
|
|
733
|
+
data = response.json()
|
|
734
|
+
|
|
735
|
+
# Check if results were truncated
|
|
736
|
+
if (
|
|
737
|
+
"data" in data
|
|
738
|
+
and isinstance(data["data"], list)
|
|
739
|
+
and len(data["data"]) == PROMETHEUS_METADATA_API_LIMIT
|
|
740
|
+
):
|
|
741
|
+
data["_truncated"] = True
|
|
742
|
+
data["_message"] = (
|
|
743
|
+
f"Results truncated at limit={PROMETHEUS_METADATA_API_LIMIT}. Use match[] parameter to filter label '{label}' values."
|
|
744
|
+
)
|
|
745
|
+
|
|
746
|
+
return StructuredToolResult(
|
|
747
|
+
status=StructuredToolResultStatus.SUCCESS,
|
|
748
|
+
data=data,
|
|
749
|
+
params=params,
|
|
750
|
+
)
|
|
751
|
+
except Exception as e:
|
|
752
|
+
return StructuredToolResult(
|
|
753
|
+
status=StructuredToolResultStatus.ERROR,
|
|
754
|
+
error=str(e),
|
|
755
|
+
params=params,
|
|
756
|
+
)
|
|
757
|
+
|
|
758
|
+
def get_parameterized_one_liner(self, params) -> str:
|
|
759
|
+
label = params.get("label", "")
|
|
760
|
+
return f"{toolset_name_for_one_liner(self.toolset.name)}: Get {label} Values"
|
|
716
761
|
|
|
717
|
-
type_filter = params.get("type_filter")
|
|
718
|
-
if type_filter:
|
|
719
|
-
metrics = filter_metrics_by_type(metrics, type_filter)
|
|
720
762
|
|
|
721
|
-
|
|
722
|
-
|
|
763
|
+
class GetAllLabels(BasePrometheusTool):
|
|
764
|
+
"""Get all label names that exist in Prometheus"""
|
|
723
765
|
|
|
724
|
-
|
|
725
|
-
|
|
726
|
-
|
|
766
|
+
def __init__(self, toolset: "PrometheusToolset"):
|
|
767
|
+
super().__init__(
|
|
768
|
+
name="get_all_labels",
|
|
769
|
+
description=(
|
|
770
|
+
"Get list of all label names using /api/v1/labels. "
|
|
771
|
+
"Use this to discover what labels are available across all metrics. "
|
|
772
|
+
f"Returns up to {PROMETHEUS_METADATA_API_LIMIT} label names (limit={PROMETHEUS_METADATA_API_LIMIT}). If {PROMETHEUS_METADATA_API_LIMIT} results returned, more may exist - use match[] to filter. "
|
|
773
|
+
"Supports optional match[] parameter to filter. "
|
|
774
|
+
"By default returns labels from metrics active in the last 1 hour (configurable via default_metadata_time_window_hrs)."
|
|
775
|
+
),
|
|
776
|
+
parameters={
|
|
777
|
+
"match": ToolParameter(
|
|
778
|
+
description=(
|
|
779
|
+
"Optional PromQL selector to filter (e.g., '{__name__=~\"kube.*\"}', "
|
|
780
|
+
"'{job=\"prometheus\"}')."
|
|
781
|
+
),
|
|
782
|
+
type="string",
|
|
783
|
+
required=False,
|
|
784
|
+
),
|
|
785
|
+
"start": ToolParameter(
|
|
786
|
+
description="Start timestamp (RFC3339 or Unix). Default: 1 hour ago",
|
|
787
|
+
type="string",
|
|
788
|
+
required=False,
|
|
789
|
+
),
|
|
790
|
+
"end": ToolParameter(
|
|
791
|
+
description="End timestamp (RFC3339 or Unix). Default: now",
|
|
792
|
+
type="string",
|
|
793
|
+
required=False,
|
|
794
|
+
),
|
|
795
|
+
},
|
|
796
|
+
toolset=toolset,
|
|
797
|
+
)
|
|
798
|
+
|
|
799
|
+
def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
|
|
800
|
+
if not self.toolset.config or not self.toolset.config.prometheus_url:
|
|
801
|
+
return StructuredToolResult(
|
|
802
|
+
status=StructuredToolResultStatus.ERROR,
|
|
803
|
+
error="Prometheus is not configured. Prometheus URL is missing",
|
|
804
|
+
params=params,
|
|
805
|
+
)
|
|
806
|
+
try:
|
|
807
|
+
url = urljoin(self.toolset.config.prometheus_url, "api/v1/labels")
|
|
808
|
+
query_params = {"limit": str(PROMETHEUS_METADATA_API_LIMIT)}
|
|
809
|
+
if params.get("match"):
|
|
810
|
+
query_params["match[]"] = params["match"]
|
|
811
|
+
|
|
812
|
+
# Add time parameters - use provided values or defaults
|
|
813
|
+
if params.get("end"):
|
|
814
|
+
query_params["end"] = params["end"]
|
|
815
|
+
else:
|
|
816
|
+
query_params["end"] = str(int(time.time()))
|
|
817
|
+
|
|
818
|
+
if params.get("start"):
|
|
819
|
+
query_params["start"] = params["start"]
|
|
820
|
+
elif self.toolset.config.default_metadata_time_window_hrs:
|
|
821
|
+
# Use default time window
|
|
822
|
+
query_params["start"] = str(
|
|
823
|
+
int(time.time())
|
|
824
|
+
- (self.toolset.config.default_metadata_time_window_hrs * 3600)
|
|
727
825
|
)
|
|
728
|
-
|
|
729
|
-
|
|
826
|
+
|
|
827
|
+
response = do_request(
|
|
828
|
+
config=self.toolset.config,
|
|
829
|
+
url=url,
|
|
830
|
+
params=query_params,
|
|
831
|
+
timeout=self.toolset.config.default_metadata_timeout_seconds,
|
|
832
|
+
verify=self.toolset.config.prometheus_ssl_enabled,
|
|
833
|
+
headers=self.toolset.config.headers,
|
|
834
|
+
method="GET",
|
|
835
|
+
)
|
|
836
|
+
response.raise_for_status()
|
|
837
|
+
data = response.json()
|
|
838
|
+
|
|
839
|
+
# Check if results were truncated
|
|
840
|
+
if (
|
|
841
|
+
"data" in data
|
|
842
|
+
and isinstance(data["data"], list)
|
|
843
|
+
and len(data["data"]) == PROMETHEUS_METADATA_API_LIMIT
|
|
844
|
+
):
|
|
845
|
+
data["_truncated"] = True
|
|
846
|
+
data["_message"] = (
|
|
847
|
+
f"Results truncated at limit={PROMETHEUS_METADATA_API_LIMIT}. Use match[] parameter to filter labels."
|
|
730
848
|
)
|
|
731
849
|
|
|
732
|
-
table_output = "\n".join(output)
|
|
733
850
|
return StructuredToolResult(
|
|
734
851
|
status=StructuredToolResultStatus.SUCCESS,
|
|
735
|
-
data=
|
|
852
|
+
data=data,
|
|
853
|
+
params=params,
|
|
854
|
+
)
|
|
855
|
+
except Exception as e:
|
|
856
|
+
return StructuredToolResult(
|
|
857
|
+
status=StructuredToolResultStatus.ERROR,
|
|
858
|
+
error=str(e),
|
|
736
859
|
params=params,
|
|
737
860
|
)
|
|
738
861
|
|
|
739
|
-
|
|
740
|
-
|
|
862
|
+
def get_parameterized_one_liner(self, params) -> str:
|
|
863
|
+
return f"{toolset_name_for_one_liner(self.toolset.name)}: Get All Labels"
|
|
864
|
+
|
|
865
|
+
|
|
866
|
+
class GetSeries(BasePrometheusTool):
|
|
867
|
+
"""Get time series matching a selector"""
|
|
868
|
+
|
|
869
|
+
def __init__(self, toolset: "PrometheusToolset"):
|
|
870
|
+
super().__init__(
|
|
871
|
+
name="get_series",
|
|
872
|
+
description=(
|
|
873
|
+
"Get time series using /api/v1/series. "
|
|
874
|
+
"Returns label sets for all time series matching the selector. "
|
|
875
|
+
"SLOWER than other discovery methods - use only when you need full label sets. "
|
|
876
|
+
f"Returns up to {PROMETHEUS_METADATA_API_LIMIT} series (limit={PROMETHEUS_METADATA_API_LIMIT}). If {PROMETHEUS_METADATA_API_LIMIT} results returned, more series exist - use more specific selector. "
|
|
877
|
+
"Requires match[] parameter with PromQL selector. "
|
|
878
|
+
"By default returns series active in the last 1 hour (configurable via default_metadata_time_window_hrs)."
|
|
879
|
+
),
|
|
880
|
+
parameters={
|
|
881
|
+
"match": ToolParameter(
|
|
882
|
+
description=(
|
|
883
|
+
"PromQL selector to match series (e.g., 'up', 'node_cpu_seconds_total', "
|
|
884
|
+
"'{__name__=~\"node.*\"}', '{job=\"prometheus\"}', "
|
|
885
|
+
'\'{__name__="up",job="prometheus"}\').'
|
|
886
|
+
),
|
|
887
|
+
type="string",
|
|
888
|
+
required=True,
|
|
889
|
+
),
|
|
890
|
+
"start": ToolParameter(
|
|
891
|
+
description="Start timestamp (RFC3339 or Unix). Default: 1 hour ago",
|
|
892
|
+
type="string",
|
|
893
|
+
required=False,
|
|
894
|
+
),
|
|
895
|
+
"end": ToolParameter(
|
|
896
|
+
description="End timestamp (RFC3339 or Unix). Default: now",
|
|
897
|
+
type="string",
|
|
898
|
+
required=False,
|
|
899
|
+
),
|
|
900
|
+
},
|
|
901
|
+
toolset=toolset,
|
|
902
|
+
)
|
|
903
|
+
|
|
904
|
+
def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
|
|
905
|
+
if not self.toolset.config or not self.toolset.config.prometheus_url:
|
|
741
906
|
return StructuredToolResult(
|
|
742
907
|
status=StructuredToolResultStatus.ERROR,
|
|
743
|
-
error="
|
|
908
|
+
error="Prometheus is not configured. Prometheus URL is missing",
|
|
744
909
|
params=params,
|
|
745
910
|
)
|
|
746
|
-
|
|
747
|
-
|
|
911
|
+
try:
|
|
912
|
+
match = params.get("match")
|
|
913
|
+
if not match:
|
|
914
|
+
return StructuredToolResult(
|
|
915
|
+
status=StructuredToolResultStatus.ERROR,
|
|
916
|
+
error="Match parameter is required",
|
|
917
|
+
params=params,
|
|
918
|
+
)
|
|
919
|
+
|
|
920
|
+
url = urljoin(self.toolset.config.prometheus_url, "api/v1/series")
|
|
921
|
+
query_params = {
|
|
922
|
+
"match[]": match,
|
|
923
|
+
"limit": str(PROMETHEUS_METADATA_API_LIMIT),
|
|
924
|
+
}
|
|
925
|
+
|
|
926
|
+
# Add time parameters - use provided values or defaults
|
|
927
|
+
if params.get("end"):
|
|
928
|
+
query_params["end"] = params["end"]
|
|
929
|
+
else:
|
|
930
|
+
query_params["end"] = str(int(time.time()))
|
|
931
|
+
|
|
932
|
+
if params.get("start"):
|
|
933
|
+
query_params["start"] = params["start"]
|
|
934
|
+
elif self.toolset.config.default_metadata_time_window_hrs:
|
|
935
|
+
# Use default time window
|
|
936
|
+
query_params["start"] = str(
|
|
937
|
+
int(time.time())
|
|
938
|
+
- (self.toolset.config.default_metadata_time_window_hrs * 3600)
|
|
939
|
+
)
|
|
940
|
+
|
|
941
|
+
response = do_request(
|
|
942
|
+
config=self.toolset.config,
|
|
943
|
+
url=url,
|
|
944
|
+
params=query_params,
|
|
945
|
+
timeout=self.toolset.config.default_metadata_timeout_seconds,
|
|
946
|
+
verify=self.toolset.config.prometheus_ssl_enabled,
|
|
947
|
+
headers=self.toolset.config.headers,
|
|
948
|
+
method="GET",
|
|
949
|
+
)
|
|
950
|
+
response.raise_for_status()
|
|
951
|
+
data = response.json()
|
|
952
|
+
|
|
953
|
+
# Check if results were truncated
|
|
954
|
+
if (
|
|
955
|
+
"data" in data
|
|
956
|
+
and isinstance(data["data"], list)
|
|
957
|
+
and len(data["data"]) == PROMETHEUS_METADATA_API_LIMIT
|
|
958
|
+
):
|
|
959
|
+
data["_truncated"] = True
|
|
960
|
+
data["_message"] = (
|
|
961
|
+
f"Results truncated at limit={PROMETHEUS_METADATA_API_LIMIT}. Use a more specific match selector to see additional series."
|
|
962
|
+
)
|
|
963
|
+
|
|
964
|
+
return StructuredToolResult(
|
|
965
|
+
status=StructuredToolResultStatus.SUCCESS,
|
|
966
|
+
data=data,
|
|
967
|
+
params=params,
|
|
968
|
+
)
|
|
969
|
+
except Exception as e:
|
|
970
|
+
return StructuredToolResult(
|
|
971
|
+
status=StructuredToolResultStatus.ERROR,
|
|
972
|
+
error=str(e),
|
|
973
|
+
params=params,
|
|
974
|
+
)
|
|
975
|
+
|
|
976
|
+
def get_parameterized_one_liner(self, params) -> str:
|
|
977
|
+
return f"{toolset_name_for_one_liner(self.toolset.name)}: Get Series"
|
|
978
|
+
|
|
979
|
+
|
|
980
|
+
class GetMetricMetadata(BasePrometheusTool):
|
|
981
|
+
"""Get metadata (type, description, unit) for metrics"""
|
|
982
|
+
|
|
983
|
+
def __init__(self, toolset: "PrometheusToolset"):
|
|
984
|
+
super().__init__(
|
|
985
|
+
name="get_metric_metadata",
|
|
986
|
+
description=(
|
|
987
|
+
"Get metric metadata using /api/v1/metadata. "
|
|
988
|
+
"Returns type, help text, and unit for metrics. "
|
|
989
|
+
"Use after discovering metric names to get their descriptions. "
|
|
990
|
+
f"Returns up to {PROMETHEUS_METADATA_API_LIMIT} metrics (limit={PROMETHEUS_METADATA_API_LIMIT}). If {PROMETHEUS_METADATA_API_LIMIT} results returned, more may exist - filter by specific metric name. "
|
|
991
|
+
"Supports optional metric name filter."
|
|
992
|
+
),
|
|
993
|
+
parameters={
|
|
994
|
+
"metric": ToolParameter(
|
|
995
|
+
description=(
|
|
996
|
+
"Optional metric name to filter (e.g., 'up', 'node_cpu_seconds_total'). "
|
|
997
|
+
"If not provided, returns metadata for all metrics."
|
|
998
|
+
),
|
|
999
|
+
type="string",
|
|
1000
|
+
required=False,
|
|
1001
|
+
),
|
|
1002
|
+
},
|
|
1003
|
+
toolset=toolset,
|
|
1004
|
+
)
|
|
1005
|
+
|
|
1006
|
+
def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
|
|
1007
|
+
if not self.toolset.config or not self.toolset.config.prometheus_url:
|
|
748
1008
|
return StructuredToolResult(
|
|
749
1009
|
status=StructuredToolResultStatus.ERROR,
|
|
750
|
-
error=
|
|
1010
|
+
error="Prometheus is not configured. Prometheus URL is missing",
|
|
1011
|
+
params=params,
|
|
1012
|
+
)
|
|
1013
|
+
try:
|
|
1014
|
+
url = urljoin(self.toolset.config.prometheus_url, "api/v1/metadata")
|
|
1015
|
+
query_params = {"limit": str(PROMETHEUS_METADATA_API_LIMIT)}
|
|
1016
|
+
|
|
1017
|
+
if params.get("metric"):
|
|
1018
|
+
query_params["metric"] = params["metric"]
|
|
1019
|
+
|
|
1020
|
+
response = do_request(
|
|
1021
|
+
config=self.toolset.config,
|
|
1022
|
+
url=url,
|
|
1023
|
+
params=query_params,
|
|
1024
|
+
timeout=self.toolset.config.default_metadata_timeout_seconds,
|
|
1025
|
+
verify=self.toolset.config.prometheus_ssl_enabled,
|
|
1026
|
+
headers=self.toolset.config.headers,
|
|
1027
|
+
method="GET",
|
|
1028
|
+
)
|
|
1029
|
+
response.raise_for_status()
|
|
1030
|
+
data = response.json()
|
|
1031
|
+
|
|
1032
|
+
# Check if results were truncated (metadata endpoint returns a dict, not a list)
|
|
1033
|
+
if (
|
|
1034
|
+
"data" in data
|
|
1035
|
+
and isinstance(data["data"], dict)
|
|
1036
|
+
and len(data["data"]) == PROMETHEUS_METADATA_API_LIMIT
|
|
1037
|
+
):
|
|
1038
|
+
data["_truncated"] = True
|
|
1039
|
+
data["_message"] = (
|
|
1040
|
+
f"Results truncated at limit={PROMETHEUS_METADATA_API_LIMIT}. Use metric parameter to filter by specific metric name."
|
|
1041
|
+
)
|
|
1042
|
+
|
|
1043
|
+
return StructuredToolResult(
|
|
1044
|
+
status=StructuredToolResultStatus.SUCCESS,
|
|
1045
|
+
data=data,
|
|
751
1046
|
params=params,
|
|
752
1047
|
)
|
|
753
1048
|
except Exception as e:
|
|
754
|
-
logging.warn("Failed to process prometheus metrics", exc_info=True)
|
|
755
1049
|
return StructuredToolResult(
|
|
756
1050
|
status=StructuredToolResultStatus.ERROR,
|
|
757
|
-
error=
|
|
1051
|
+
error=str(e),
|
|
758
1052
|
params=params,
|
|
759
1053
|
)
|
|
760
1054
|
|
|
761
1055
|
def get_parameterized_one_liner(self, params) -> str:
|
|
762
|
-
|
|
763
|
-
return
|
|
1056
|
+
metric = params.get("metric", "all")
|
|
1057
|
+
return (
|
|
1058
|
+
f"{toolset_name_for_one_liner(self.toolset.name)}: Get Metadata ({metric})"
|
|
1059
|
+
)
|
|
764
1060
|
|
|
765
1061
|
|
|
766
1062
|
class ExecuteInstantQuery(BasePrometheusTool):
|
|
767
1063
|
def __init__(self, toolset: "PrometheusToolset"):
|
|
768
1064
|
super().__init__(
|
|
769
1065
|
name="execute_prometheus_instant_query",
|
|
770
|
-
description=
|
|
1066
|
+
description=(
|
|
1067
|
+
f"Execute an instant PromQL query (single point in time). "
|
|
1068
|
+
f"Default timeout is {DEFAULT_QUERY_TIMEOUT_SECONDS} seconds "
|
|
1069
|
+
f"but can be increased up to {MAX_QUERY_TIMEOUT_SECONDS} seconds for complex/slow queries."
|
|
1070
|
+
),
|
|
771
1071
|
parameters={
|
|
772
1072
|
"query": ToolParameter(
|
|
773
1073
|
description="The PromQL query",
|
|
@@ -779,13 +1079,20 @@ class ExecuteInstantQuery(BasePrometheusTool):
                     type="string",
                     required=True,
                 ),
+                "timeout": ToolParameter(
+                    description=(
+                        f"Query timeout in seconds. Default: {DEFAULT_QUERY_TIMEOUT_SECONDS}. "
+                        f"Maximum: {MAX_QUERY_TIMEOUT_SECONDS}. "
+                        f"Increase for complex queries that may take longer."
+                    ),
+                    type="number",
+                    required=False,
+                ),
             },
             toolset=toolset,
         )
 
-    def _invoke(
-        self, params: dict, user_approved: bool = False
-    ) -> StructuredToolResult:
+    def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
         if not self.toolset.config or not self.toolset.config.prometheus_url:
             return StructuredToolResult(
                 status=StructuredToolResultStatus.ERROR,
@@ -800,12 +1107,24 @@ class ExecuteInstantQuery(BasePrometheusTool):
 
         payload = {"query": query}
 
+        # Get timeout parameter and enforce limits
+        default_timeout = self.toolset.config.default_query_timeout_seconds
+        max_timeout = self.toolset.config.max_query_timeout_seconds
+        timeout = params.get("timeout", default_timeout)
+        if timeout > max_timeout:
+            timeout = max_timeout
+            logging.warning(
+                f"Timeout requested ({params.get('timeout')}) exceeds maximum ({max_timeout}s), using {max_timeout}s"
+            )
+        elif timeout < 1:
+            timeout = default_timeout  # Min 1 second, but use default if invalid
+
         response = do_request(
             config=self.toolset.config,
             url=url,
             headers=self.toolset.config.headers,
             data=payload,
-            timeout=
+            timeout=timeout,
             verify=self.toolset.config.prometheus_ssl_enabled,
             method="POST",
         )
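Example of the clamping behaviour with the module-level defaults defined earlier in this diff (20s default, 180s maximum):

```python
requested = {"timeout": 500}
timeout = requested.get("timeout", 20)
if timeout > 180:
    timeout = 180    # clamped to max_query_timeout_seconds
elif timeout < 1:
    timeout = 20     # invalid values fall back to the default
# timeout == 180
```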
@@ -819,51 +1138,64 @@ class ExecuteInstantQuery(BasePrometheusTool):
                 error_message = (
                     "The prometheus query returned no result. Is the query correct?"
                 )
-            response_data =
-
-
-
-
-
-
-
-
+            response_data = MetricsBasedResponse(
+                status=status,
+                error_message=error_message,
+                random_key=generate_random_key(),
+                tool_name=self.name,
+                description=description,
+                query=query,
+            )
+            structured_tool_result: StructuredToolResult
             # Check if data should be included based on size
             if self.toolset.config.tool_calls_return_data:
                 result_data = data.get("data", {})
+                response_data.data = result_data
+
+                structured_tool_result = create_structured_tool_result(
+                    params=params, response=response_data
+                )
+                token_count = count_tool_response_tokens(
+                    llm=context.llm, structured_tool_result=structured_tool_result
+                )

-
-
-
+                token_limit = context.max_token_count
+                if self.toolset.config.query_response_size_limit_pct:
+                    custom_token_limit = get_pct_token_count(
+                        percent_of_total_context_window=self.toolset.config.query_response_size_limit_pct,
+                        llm=context.llm,
+                    )
+                    if custom_token_limit < token_limit:
+                        token_limit = custom_token_limit

                 # Provide summary if data is too large
-                if
-
-
-                    > self.toolset.config.query_response_size_limit
-                ):
-                    response_data["data_summary"] = (
+                if token_count > token_limit:
+                    response_data.data = None
+                    response_data.data_summary = (
                         create_data_summary_for_large_result(
                             result_data,
                             query,
-
+                            token_count,
                             is_range_query=False,
                         )
                     )
                     logging.info(
                         f"Prometheus instant query returned large dataset: "
-                        f"{response_data
-                        f"{
+                        f"{response_data.data_summary.get('result_count', 0)} results, "
+                        f"{token_count:,} tokens (limit: {token_limit:,}). "
+                        f"Returning summary instead of full data."
+                    )
+                    # Also add token info to the summary for debugging
+                    response_data.data_summary["_debug_info"] = (
+                        f"Data size: {token_count:,} tokens exceeded limit of {token_limit:,} tokens"
                     )
                 else:
-                    response_data
+                    response_data.data = result_data

-
-
-                    status=StructuredToolResultStatus.SUCCESS,
-                    data=data_str,
-                    params=params,
+                structured_tool_result = create_structured_tool_result(
+                    params=params, response=response_data
                 )
+            return structured_tool_result

             # Handle known Prometheus error status codes
             error_msg = "Unknown error occurred"
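
The new size check works in tokens rather than characters: serialize the tool result, count tokens against the active model, then take the smaller of the per-call budget and an optional percentage of the model's context window, and fall back to a summary when the payload does not fit. A hedged sketch of that decision, independent of the holmes helpers (the function and field names below are stand-ins, not the package's API):

from typing import Optional

def effective_token_limit(per_call_budget: int, context_window: int, size_limit_pct: Optional[float]) -> int:
    """Smaller of the per-call budget and an optional percentage of the model's context window."""
    limit = per_call_budget
    if size_limit_pct:
        pct_limit = int(context_window * size_limit_pct / 100)
        limit = min(limit, pct_limit)
    return limit

def fit_or_summarize(result: dict, token_count: int, limit: int) -> dict:
    """Keep the full Prometheus payload when it fits, otherwise return only a summary."""
    if token_count <= limit:
        return {"data": result}
    return {
        "data_summary": {
            "result_count": len(result.get("result", [])),
            "_debug_info": f"Data size: {token_count:,} tokens exceeded limit of {limit:,} tokens",
        }
    }
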
@@ -912,7 +1244,12 @@ class ExecuteRangeQuery(BasePrometheusTool):
     def __init__(self, toolset: "PrometheusToolset"):
         super().__init__(
             name="execute_prometheus_range_query",
-            description=
+            description=(
+                f"Generates a graph and executes a PromQL range query. "
+                f"Default timeout is {DEFAULT_QUERY_TIMEOUT_SECONDS} seconds "
+                f"but can be increased up to {MAX_QUERY_TIMEOUT_SECONDS} seconds for complex/slow queries. "
+                f"Default time range is last 1 hour."
+            ),
             parameters={
                 "query": ToolParameter(
                     description="The PromQL query",
@@ -946,13 +1283,30 @@ class ExecuteRangeQuery(BasePrometheusTool):
                     type="string",
                     required=True,
                 ),
+                "timeout": ToolParameter(
+                    description=(
+                        f"Query timeout in seconds. Default: {DEFAULT_QUERY_TIMEOUT_SECONDS}. "
+                        f"Maximum: {MAX_QUERY_TIMEOUT_SECONDS}. "
+                        f"Increase for complex queries that may take longer."
+                    ),
+                    type="number",
+                    required=False,
+                ),
+                "max_points": ToolParameter(
+                    description=(
+                        f"Maximum number of data points to return. Default: {int(MAX_GRAPH_POINTS)}. "
+                        f"Can be reduced to get fewer data points (e.g., 50 for simpler graphs). "
+                        f"Cannot exceed system limit of {int(MAX_GRAPH_POINTS)}. "
+                        f"If your query would return more points than this limit, the step will be automatically adjusted."
+                    ),
+                    type="number",
+                    required=False,
+                ),
             },
             toolset=toolset,
         )

-    def _invoke(
-        self, params: dict, user_approved: bool = False
-    ) -> StructuredToolResult:
+    def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
         if not self.toolset.config or not self.toolset.config.prometheus_url:
             return StructuredToolResult(
                 status=StructuredToolResultStatus.ERROR,
@@ -970,12 +1324,16 @@ class ExecuteRangeQuery(BasePrometheusTool):
             default_time_span_seconds=DEFAULT_GRAPH_TIME_SPAN_SECONDS,
         )
         step = parse_duration_to_seconds(params.get("step"))
+        max_points = params.get(
+            "max_points"
+        )  # Get the optional max_points parameter

         # adjust_step_for_max_points handles None case and converts to float
         step = adjust_step_for_max_points(
             start_timestamp=start,
             end_timestamp=end,
             step=step,
+            max_points_override=max_points,
         )

         description = params.get("description", "")
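
The body of adjust_step_for_max_points is not part of this hunk; the intent of the new max_points_override argument is that the step can only widen, so the returned series never exceeds the point budget. A rough sketch under that assumption (the rounding behaviour and the real MAX_GRAPH_POINTS value may differ; 300 is a placeholder):

import math
from typing import Optional

MAX_GRAPH_POINTS = 300.0  # placeholder for the system limit referenced in the parameter description

def adjust_step_for_max_points(start_timestamp: float, end_timestamp: float,
                               step: Optional[float], max_points_override: Optional[float] = None) -> float:
    """Widen the step so (end - start) / step never exceeds the allowed number of points."""
    span = end_timestamp - start_timestamp
    max_points = min(max_points_override or MAX_GRAPH_POINTS, MAX_GRAPH_POINTS)
    step = step or max(span / max_points, 1.0)  # pick a sane default when no step was given
    if span / step > max_points:
        step = math.ceil(span / max_points)
    return float(step)
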
@@ -987,12 +1345,24 @@ class ExecuteRangeQuery(BasePrometheusTool):
             "step": step,
         }

+        # Get timeout parameter and enforce limits
+        default_timeout = self.toolset.config.default_query_timeout_seconds
+        max_timeout = self.toolset.config.max_query_timeout_seconds
+        timeout = params.get("timeout", default_timeout)
+        if timeout > max_timeout:
+            timeout = max_timeout
+            logging.warning(
+                f"Timeout requested ({params.get('timeout')}) exceeds maximum ({max_timeout}s), using {max_timeout}s"
+            )
+        elif timeout < 1:
+            timeout = default_timeout  # Min 1 second, but use default if invalid
+
         response = do_request(
             config=self.toolset.config,
             url=url,
             headers=self.toolset.config.headers,
             data=payload,
-            timeout=
+            timeout=timeout,
             verify=self.toolset.config.prometheus_ssl_enabled,
             method="POST",
         )
@@ -1006,54 +1376,69 @@ class ExecuteRangeQuery(BasePrometheusTool):
                 error_message = (
                     "The prometheus query returned no result. Is the query correct?"
                 )
-            response_data =
-
-
-
-
-
-
-
-
-
-
-
+            response_data = MetricsBasedResponse(
+                status=status,
+                error_message=error_message,
+                random_key=generate_random_key(),
+                tool_name=self.name,
+                description=description,
+                query=query,
+                start=start,
+                end=end,
+                step=step,
+                output_type=output_type,
+            )
+
+            structured_tool_result: StructuredToolResult

             # Check if data should be included based on size
             if self.toolset.config.tool_calls_return_data:
                 result_data = data.get("data", {})
+                response_data.data = result_data
+                structured_tool_result = create_structured_tool_result(
+                    params=params, response=response_data
+                )

-
-
-
+                token_count = count_tool_response_tokens(
+                    llm=context.llm, structured_tool_result=structured_tool_result
+                )
+
+                token_limit = context.max_token_count
+                if self.toolset.config.query_response_size_limit_pct:
+                    custom_token_limit = get_pct_token_count(
+                        percent_of_total_context_window=self.toolset.config.query_response_size_limit_pct,
+                        llm=context.llm,
+                    )
+                    if custom_token_limit < token_limit:
+                        token_limit = custom_token_limit

                 # Provide summary if data is too large
-                if
-
-
-                    > self.toolset.config.query_response_size_limit
-                ):
-                    response_data["data_summary"] = (
+                if token_count > token_limit:
+                    response_data.data = None
+                    response_data.data_summary = (
                         create_data_summary_for_large_result(
-                            result_data, query,
+                            result_data, query, token_count, is_range_query=True
                         )
                     )
                     logging.info(
                         f"Prometheus range query returned large dataset: "
-                        f"{response_data
-                        f"{
+                        f"{response_data.data_summary.get('series_count', 0)} series, "
+                        f"{token_count:,} tokens (limit: {token_limit:,}). "
+                        f"Returning summary instead of full data."
+                    )
+                    # Also add token info to the summary for debugging
+                    response_data.data_summary["_debug_info"] = (
+                        f"Data size: {token_count:,} tokens exceeded limit of {token_limit:,} tokens"
                    )
                 else:
-                    response_data
-
-                    data_str = json.dumps(response_data, indent=2)
+                    response_data.data = result_data

-
-
-                    data=data_str,
-                    params=params,
+                structured_tool_result = create_structured_tool_result(
+                    params=params, response=response_data
                 )

+            return structured_tool_result
+
         error_msg = "Unknown error occurred"
         if response.status_code in [400, 429]:
             try:
@@ -1107,7 +1492,11 @@ class PrometheusToolset(Toolset):
             prerequisites=[CallablePrerequisite(callable=self.prerequisites_callable)],
             tools=[
                 ListPrometheusRules(toolset=self),
-
+                GetMetricNames(toolset=self),
+                GetLabelValues(toolset=self),
+                GetAllLabels(toolset=self),
+                GetSeries(toolset=self),
+                GetMetricMetadata(toolset=self),
                 ExecuteInstantQuery(toolset=self),
                 ExecuteRangeQuery(toolset=self),
             ],
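
The five tools added to the registration above line up with Prometheus's standard metadata endpoints. A rough mapping for orientation (the endpoint paths are the upstream Prometheus HTTP API; the pairing with the tool classes is inferred from their names, not shown in this diff):

# Inferred mapping of the newly registered tools to Prometheus HTTP API endpoints.
PROMETHEUS_METADATA_ENDPOINTS = {
    "GetMetricNames":    "/api/v1/label/__name__/values",  # list all metric names
    "GetLabelValues":    "/api/v1/label/{label}/values",    # values for a given label
    "GetAllLabels":      "/api/v1/labels",                   # every label name
    "GetSeries":         "/api/v1/series",                   # series matching a selector
    "GetMetricMetadata": "/api/v1/metadata",                 # HELP/TYPE/unit metadata per metric
}
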