holmesgpt 0.14.1a0__py3-none-any.whl → 0.14.3a0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of holmesgpt might be problematic. Click here for more details.

Files changed (73)
  1. holmes/__init__.py +1 -1
  2. holmes/clients/robusta_client.py +5 -2
  3. holmes/common/env_vars.py +8 -2
  4. holmes/config.py +4 -7
  5. holmes/core/conversations.py +12 -2
  6. holmes/core/feedback.py +191 -0
  7. holmes/core/llm.py +52 -10
  8. holmes/core/models.py +101 -1
  9. holmes/core/supabase_dal.py +23 -9
  10. holmes/core/tool_calling_llm.py +206 -16
  11. holmes/core/tools.py +20 -7
  12. holmes/core/tools_utils/token_counting.py +13 -0
  13. holmes/core/tools_utils/tool_context_window_limiter.py +45 -23
  14. holmes/core/tools_utils/tool_executor.py +11 -6
  15. holmes/core/toolset_manager.py +7 -3
  16. holmes/core/truncation/dal_truncation_utils.py +23 -0
  17. holmes/interactive.py +146 -14
  18. holmes/plugins/prompts/_fetch_logs.jinja2 +13 -1
  19. holmes/plugins/runbooks/__init__.py +6 -1
  20. holmes/plugins/toolsets/__init__.py +11 -4
  21. holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +9 -20
  22. holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +2 -3
  23. holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +2 -3
  24. holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +6 -4
  25. holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +6 -4
  26. holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +2 -3
  27. holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +6 -4
  28. holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +2 -3
  29. holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +2 -3
  30. holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +2 -3
  31. holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +2 -3
  32. holmes/plugins/toolsets/bash/bash_toolset.py +4 -7
  33. holmes/plugins/toolsets/cilium.yaml +284 -0
  34. holmes/plugins/toolsets/datadog/datadog_api.py +490 -24
  35. holmes/plugins/toolsets/datadog/datadog_logs_instructions.jinja2 +21 -10
  36. holmes/plugins/toolsets/datadog/toolset_datadog_general.py +333 -199
  37. holmes/plugins/toolsets/datadog/toolset_datadog_logs.py +181 -9
  38. holmes/plugins/toolsets/datadog/toolset_datadog_metrics.py +80 -22
  39. holmes/plugins/toolsets/datadog/toolset_datadog_rds.py +5 -8
  40. holmes/plugins/toolsets/datadog/toolset_datadog_traces.py +7 -12
  41. holmes/plugins/toolsets/git.py +14 -12
  42. holmes/plugins/toolsets/grafana/grafana_tempo_api.py +23 -42
  43. holmes/plugins/toolsets/grafana/toolset_grafana.py +2 -3
  44. holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +2 -1
  45. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +21 -39
  46. holmes/plugins/toolsets/internet/internet.py +2 -3
  47. holmes/plugins/toolsets/internet/notion.py +2 -3
  48. holmes/plugins/toolsets/investigator/core_investigation.py +7 -9
  49. holmes/plugins/toolsets/kafka.py +7 -18
  50. holmes/plugins/toolsets/logging_utils/logging_api.py +80 -4
  51. holmes/plugins/toolsets/mcp/toolset_mcp.py +2 -3
  52. holmes/plugins/toolsets/newrelic/__init__.py +0 -0
  53. holmes/plugins/toolsets/newrelic/new_relic_api.py +125 -0
  54. holmes/plugins/toolsets/newrelic/newrelic.jinja2 +41 -0
  55. holmes/plugins/toolsets/newrelic/newrelic.py +211 -0
  56. holmes/plugins/toolsets/opensearch/opensearch.py +5 -12
  57. holmes/plugins/toolsets/opensearch/opensearch_traces.py +3 -6
  58. holmes/plugins/toolsets/prometheus/prometheus.py +808 -419
  59. holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2 +27 -11
  60. holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +3 -6
  61. holmes/plugins/toolsets/robusta/robusta.py +4 -9
  62. holmes/plugins/toolsets/runbook/runbook_fetcher.py +93 -13
  63. holmes/plugins/toolsets/servicenow/servicenow.py +5 -10
  64. holmes/utils/sentry_helper.py +1 -1
  65. holmes/utils/stream.py +22 -7
  66. holmes/version.py +34 -14
  67. {holmesgpt-0.14.1a0.dist-info → holmesgpt-0.14.3a0.dist-info}/METADATA +7 -9
  68. {holmesgpt-0.14.1a0.dist-info → holmesgpt-0.14.3a0.dist-info}/RECORD +71 -65
  69. holmes/core/tools_utils/data_types.py +0 -81
  70. holmes/plugins/toolsets/newrelic.py +0 -231
  71. {holmesgpt-0.14.1a0.dist-info → holmesgpt-0.14.3a0.dist-info}/LICENSE.txt +0 -0
  72. {holmesgpt-0.14.1a0.dist-info → holmesgpt-0.14.3a0.dist-info}/WHEEL +0 -0
  73. {holmesgpt-0.14.1a0.dist-info → holmesgpt-0.14.3a0.dist-info}/entry_points.txt +0 -0
@@ -1,10 +1,9 @@
1
1
  import json
2
2
  import logging
3
3
  import os
4
- import re
5
4
  import time
6
5
  import dateutil.parser
7
- from typing import Any, Dict, List, Optional, Tuple, Type, Union
6
+ from typing import Any, Dict, Optional, Tuple, Type, Union
8
7
  from urllib.parse import urljoin
9
8
 
10
9
  import requests # type: ignore
@@ -16,11 +15,14 @@ from holmes.core.tools import (
16
15
  CallablePrerequisite,
17
16
  StructuredToolResult,
18
17
  Tool,
18
+ ToolInvokeContext,
19
19
  ToolParameter,
20
20
  StructuredToolResultStatus,
21
21
  Toolset,
22
22
  ToolsetTag,
23
23
  )
24
+ from holmes.core.tools_utils.token_counting import count_tool_response_tokens
25
+ from holmes.core.tools_utils.tool_context_window_limiter import get_pct_token_count
24
26
  from holmes.plugins.toolsets.consts import STANDARD_END_DATETIME_TOOL_PARAM_DESCRIPTION
25
27
  from holmes.plugins.toolsets.prometheus.utils import parse_duration_to_seconds
26
28
  from holmes.plugins.toolsets.service_discovery import PrometheusDiscovery
@@ -39,25 +41,59 @@ from holmes.plugins.toolsets.logging_utils.logging_api import (
39
41
  from holmes.utils.keygen_utils import generate_random_key
40
42
 
41
43
  PROMETHEUS_RULES_CACHE_KEY = "cached_prometheus_rules"
44
+ PROMETHEUS_METADATA_API_LIMIT = 100 # Default limit for Prometheus metadata APIs (series, labels, metadata) to prevent overwhelming responses
45
+ # Default timeout values for PromQL queries
46
+ DEFAULT_QUERY_TIMEOUT_SECONDS = 20
47
+ MAX_QUERY_TIMEOUT_SECONDS = 180
48
+ # Default timeout for metadata API calls (discovery endpoints)
49
+ DEFAULT_METADATA_TIMEOUT_SECONDS = 20
50
+ MAX_METADATA_TIMEOUT_SECONDS = 60
51
+ # Default time window for metadata APIs (in hours)
52
+ DEFAULT_METADATA_TIME_WINDOW_HRS = 1
42
53
 
43
54
 
44
55
  class PrometheusConfig(BaseModel):
45
56
  # URL is optional because it can be set with an env var
46
57
  prometheus_url: Optional[str]
47
58
  healthcheck: str = "-/healthy"
48
- # Setting to None will remove the time window from the request for labels
49
- metrics_labels_time_window_hrs: Union[int, None] = 48
50
- # Setting to None will disable the cache
51
- metrics_labels_cache_duration_hrs: Union[int, None] = 12
52
- fetch_labels_with_labels_api: bool = False
53
- fetch_metadata_with_series_api: bool = False
59
+
60
+ # New config for default time window for metadata APIs
61
+ default_metadata_time_window_hrs: int = DEFAULT_METADATA_TIME_WINDOW_HRS # Default: only show metrics active in the last hour
62
+
63
+ # Query timeout configuration
64
+ default_query_timeout_seconds: int = (
65
+ DEFAULT_QUERY_TIMEOUT_SECONDS # Default timeout for PromQL queries
66
+ )
67
+ max_query_timeout_seconds: int = (
68
+ MAX_QUERY_TIMEOUT_SECONDS # Maximum allowed timeout for PromQL queries
69
+ )
70
+
71
+ # Metadata API timeout configuration
72
+ default_metadata_timeout_seconds: int = (
73
+ DEFAULT_METADATA_TIMEOUT_SECONDS # Default timeout for metadata/discovery APIs
74
+ )
75
+ max_metadata_timeout_seconds: int = (
76
+ MAX_METADATA_TIMEOUT_SECONDS # Maximum allowed timeout for metadata APIs
77
+ )
78
+
79
+ # DEPRECATED: These config values are deprecated and will be removed in a future version
80
+ # Using None as default so we can detect if user explicitly set them
81
+ metrics_labels_time_window_hrs: Optional[int] = (
82
+ None # DEPRECATED - use default_metadata_time_window_hrs instead
83
+ )
84
+ metrics_labels_cache_duration_hrs: Optional[int] = (
85
+ None # DEPRECATED - no longer used
86
+ )
87
+ fetch_labels_with_labels_api: Optional[bool] = None # DEPRECATED - no longer used
88
+ fetch_metadata_with_series_api: Optional[bool] = None # DEPRECATED - no longer used
89
+
54
90
  tool_calls_return_data: bool = True
55
91
  headers: Dict = Field(default_factory=dict)
56
- rules_cache_duration_seconds: Union[int, None] = 1800 # 30 minutes
92
+ rules_cache_duration_seconds: Optional[int] = 1800 # 30 minutes
57
93
  additional_labels: Optional[Dict[str, str]] = None
58
94
  prometheus_ssl_enabled: bool = True
59
- query_response_size_limit: Optional[int] = (
60
- 80000 # Limit the max number of characters in a query result to proactively prevent truncation and advise LLM to query less data
95
+ query_response_size_limit_pct: Optional[int] = (
96
+ 2 # Limit the max number of tokens that a query result can take to proactively prevent token limit issues. Expressed in % of the model's context window
61
97
  )
62
98
 
63
99
  @field_validator("prometheus_url")
@@ -68,6 +104,26 @@ class PrometheusConfig(BaseModel):
68
104
 
69
105
  @model_validator(mode="after")
70
106
  def validate_prom_config(self):
107
+ # Check for deprecated config values and print warnings
108
+ deprecated_configs = []
109
+ if self.metrics_labels_time_window_hrs is not None: # Check if explicitly set
110
+ deprecated_configs.append(
111
+ "metrics_labels_time_window_hrs (use default_metadata_time_window_hrs instead)"
112
+ )
113
+ if (
114
+ self.metrics_labels_cache_duration_hrs is not None
115
+ ): # Check if explicitly set
116
+ deprecated_configs.append("metrics_labels_cache_duration_hrs")
117
+ if self.fetch_labels_with_labels_api is not None: # Check if explicitly set
118
+ deprecated_configs.append("fetch_labels_with_labels_api")
119
+ if self.fetch_metadata_with_series_api is not None: # Check if explicitly set
120
+ deprecated_configs.append("fetch_metadata_with_series_api")
121
+
122
+ if deprecated_configs:
123
+ logging.warning(
124
+ f"WARNING: The following Prometheus config values are deprecated and will be removed in a future version: "
125
+ f"{', '.join(deprecated_configs)}. These configs no longer affect behavior."
126
+ )
71
127
  # If openshift is enabled, and the user didn't configure auth headers, we will try to load the token from the service account.
72
128
  if IS_OPENSHIFT:
73
129
  if self.healthcheck == "-/healthy":
@@ -164,6 +220,8 @@ def do_request(
164
220
 
165
221
  if isinstance(config, AMPConfig):
166
222
  client = config.get_aws_client() # cached AWSPrometheusConnect
223
+ # Note: timeout parameter is not supported by prometrix's signed_request
224
+ # AWS/AMP requests will not respect the timeout setting
167
225
  return client.signed_request( # type: ignore
168
226
  method=method,
169
227
  url=url,
@@ -185,99 +243,6 @@ def do_request(
185
243
  )
186
244
 
187
245
 
188
- def filter_metrics_by_type(metrics: Dict, expected_type: str):
189
- return {
190
- metric_name: metric_data
191
- for metric_name, metric_data in metrics.items()
192
- if expected_type in metric_data.get("type", "")
193
- or metric_data.get("type", "") == "?"
194
- }
195
-
196
-
197
- def filter_metrics_by_name(metrics: Dict, pattern: str) -> Dict:
198
- regex = re.compile(pattern)
199
- return {
200
- metric_name: metric_data
201
- for metric_name, metric_data in metrics.items()
202
- if regex.search(metric_name)
203
- }
204
-
205
-
206
- METRICS_SUFFIXES_TO_STRIP = ["_bucket", "_count", "_sum"]
207
-
208
-
209
- def fetch_metadata(
210
- prometheus_url: str,
211
- headers: Optional[Dict],
212
- config,
213
- verify_ssl: bool = True,
214
- ) -> Dict:
215
- metadata_url = urljoin(prometheus_url, "api/v1/metadata")
216
- metadata_response = do_request(
217
- config=config,
218
- url=metadata_url,
219
- headers=headers,
220
- timeout=60,
221
- verify=verify_ssl,
222
- method="GET",
223
- )
224
- metadata_response.raise_for_status()
225
-
226
- metadata = metadata_response.json()["data"]
227
-
228
- metrics = {}
229
- for metric_name, meta_list in metadata.items():
230
- if meta_list:
231
- metric_type = meta_list[0].get("type", "unknown")
232
- metric_description = meta_list[0].get("help", "unknown")
233
- metrics[metric_name] = {
234
- "type": metric_type,
235
- "description": metric_description,
236
- "labels": set(),
237
- }
238
-
239
- return metrics
240
-
241
-
242
- def fetch_metadata_with_series_api(
243
- prometheus_url: str,
244
- metric_name: str,
245
- headers: Dict,
246
- config,
247
- verify_ssl: bool = True,
248
- ) -> Dict:
249
- url = urljoin(prometheus_url, "api/v1/series")
250
- params: Dict = {"match[]": f'{{__name__=~".*{metric_name}.*"}}', "limit": "10000"}
251
-
252
- response = do_request(
253
- config=config,
254
- url=url,
255
- headers=headers,
256
- params=params,
257
- timeout=60,
258
- verify=verify_ssl,
259
- method="GET",
260
- )
261
- response.raise_for_status()
262
- metrics = response.json()["data"]
263
-
264
- metadata: Dict = {}
265
- for metric_data in metrics:
266
- metric_name = metric_data.get("__name__")
267
- if not metric_name:
268
- continue
269
-
270
- metric = metadata.get(metric_name)
271
- if not metric:
272
- metric = {"description": "?", "type": "?", "labels": set()}
273
- metadata[metric_name] = metric
274
-
275
- labels = {k for k in metric_data.keys() if k != "__name__"}
276
- metric["labels"].update(labels)
277
-
278
- return metadata
279
-
280
-
281
246
  def result_has_data(result: Dict) -> bool:
282
247
  data = result.get("data", {})
283
248
  if len(data.get("result", [])) > 0:
@@ -289,19 +254,36 @@ def adjust_step_for_max_points(
289
254
  start_timestamp: str,
290
255
  end_timestamp: str,
291
256
  step: Optional[float] = None,
257
+ max_points_override: Optional[float] = None,
292
258
  ) -> float:
293
259
  """
294
260
  Adjusts the step parameter to ensure the number of data points doesn't exceed max_points.
295
- Max points is controlled by the PROMETHEUS_MAX_GRAPH_POINTS environment variable (default: 300).
296
261
 
297
262
  Args:
298
263
  start_timestamp: RFC3339 formatted start time
299
264
  end_timestamp: RFC3339 formatted end time
300
265
  step: The requested step duration in seconds (None for auto-calculation)
266
+ max_points_override: Optional override for max points (must be <= MAX_GRAPH_POINTS)
301
267
 
302
268
  Returns:
303
269
  Adjusted step value in seconds that ensures points <= max_points
304
270
  """
271
+ # Use override if provided and valid, otherwise use default
272
+ max_points = MAX_GRAPH_POINTS
273
+ if max_points_override is not None:
274
+ if max_points_override > MAX_GRAPH_POINTS:
275
+ logging.warning(
276
+ f"max_points override ({max_points_override}) exceeds system limit ({MAX_GRAPH_POINTS}), using {MAX_GRAPH_POINTS}"
277
+ )
278
+ max_points = MAX_GRAPH_POINTS
279
+ elif max_points_override < 1:
280
+ logging.warning(
281
+ f"max_points override ({max_points_override}) is invalid, using default {MAX_GRAPH_POINTS}"
282
+ )
283
+ max_points = MAX_GRAPH_POINTS
284
+ else:
285
+ max_points = max_points_override
286
+ logging.debug(f"Using max_points override: {max_points}")
305
287
 
306
288
  start_dt = dateutil.parser.parse(start_timestamp)
307
289
  end_dt = dateutil.parser.parse(end_timestamp)
@@ -319,10 +301,10 @@ def adjust_step_for_max_points(
319
301
  current_points = time_range_seconds / step
320
302
 
321
303
  # If current points exceed max, adjust the step
322
- if current_points > MAX_GRAPH_POINTS:
323
- adjusted_step = time_range_seconds / MAX_GRAPH_POINTS
304
+ if current_points > max_points:
305
+ adjusted_step = time_range_seconds / max_points
324
306
  logging.info(
325
- f"Adjusting step from {step}s to {adjusted_step}s to limit points from {current_points:.0f} to {MAX_GRAPH_POINTS}"
307
+ f"Adjusting step from {step}s to {adjusted_step}s to limit points from {current_points:.0f} to {max_points}"
326
308
  )
327
309
  return adjusted_step
328
310
 
@@ -337,7 +319,7 @@ def add_prometheus_auth(prometheus_auth_header: Optional[str]) -> Dict[str, Any]
337
319
 
338
320
 
339
321
  def create_data_summary_for_large_result(
340
- result_data: Dict, query: str, data_size_chars: int, is_range_query: bool = False
322
+ result_data: Dict, query: str, data_size_tokens: int, is_range_query: bool = False
341
323
  ) -> Dict[str, Any]:
342
324
  """
343
325
  Create a summary for large Prometheus results instead of returning full data.
@@ -345,7 +327,7 @@ def create_data_summary_for_large_result(
345
327
  Args:
346
328
  result_data: The Prometheus data result
347
329
  query: The original PromQL query
348
- data_size_chars: Size of the data in characters
330
+ data_size_tokens: Size of the data in tokens
349
331
  is_range_query: Whether this is a range query (vs instant query)
350
332
 
351
333
  Returns:
@@ -355,32 +337,36 @@ def create_data_summary_for_large_result(
355
337
  series_list = result_data.get("result", [])
356
338
  num_items = len(series_list)
357
339
 
358
- # Calculate statistics for range queries
340
+ # Calculate exact total data points across all series
359
341
  total_points = 0
360
- for series in series_list[:10]: # Sample first 10 series
342
+ for series in series_list: # Iterate through ALL series for exact count
361
343
  points = len(series.get("values", []))
362
344
  total_points += points
363
345
 
364
- avg_points_per_series = (
365
- total_points / min(10, num_items) if num_items > 0 else 0
346
+ # Analyze label keys and their cardinality
347
+ label_cardinality: Dict[str, set] = {}
348
+ for series in series_list:
349
+ metric = series.get("metric", {})
350
+ for label_key, label_value in metric.items():
351
+ if label_key not in label_cardinality:
352
+ label_cardinality[label_key] = set()
353
+ label_cardinality[label_key].add(label_value)
354
+
355
+ # Convert sets to counts for the summary
356
+ label_summary = {
357
+ label: len(values) for label, values in label_cardinality.items()
358
+ }
359
+ # Sort by cardinality (highest first) for better insights
360
+ label_summary = dict(
361
+ sorted(label_summary.items(), key=lambda x: x[1], reverse=True)
366
362
  )
367
- estimated_total_points = avg_points_per_series * num_items
368
-
369
- # Create a sample of just the metadata (labels) without values
370
- sample_metrics = []
371
- for series in series_list[:10]: # Sample first 10 series
372
- sample_metrics.append(series.get("metric", {}))
373
-
374
- sample_json = json.dumps(sample_metrics, indent=2)
375
- if len(sample_json) > 2000:
376
- sample_json = sample_json[:2000] + "\n... (truncated)"
377
363
 
378
364
  return {
379
- "message": f"Data too large to return ({data_size_chars:,} characters). Query returned {num_items} time series with approximately {estimated_total_points:,.0f} total data points.",
365
+ "message": f"Data too large to return ({data_size_tokens:,} tokens). Query returned {num_items} time series with {total_points:,} total data points.",
380
366
  "series_count": num_items,
381
- "estimated_total_points": int(estimated_total_points),
382
- "data_size_characters": data_size_chars,
383
- "sample_data": sample_json,
367
+ "total_data_points": total_points,
368
+ "data_size_tokens": data_size_tokens,
369
+ "label_cardinality": label_summary,
384
370
  "suggestion": f'Consider using topk({min(5, num_items)}, {query}) to limit results to the top {min(5, num_items)} series. To also capture remaining data as \'other\': topk({min(5, num_items)}, {query}) or label_replace((sum({query}) - sum(topk({min(5, num_items)}, {query}))), "pod", "other", "", "")',
385
371
  }
386
372
  else:
@@ -389,196 +375,77 @@ def create_data_summary_for_large_result(
389
375
  result_list = result_data.get("result", [])
390
376
  num_items = len(result_list)
391
377
 
392
- # Create a sample of just the metadata (labels) without values
393
- sample_metrics = []
394
- for item in result_list[:10]: # Sample first 10 results
378
+ # Analyze label keys and their cardinality
379
+ instant_label_cardinality: Dict[str, set] = {}
380
+ for item in result_list:
395
381
  if isinstance(item, dict):
396
- sample_metrics.append(item.get("metric", {}))
397
-
398
- sample_json = json.dumps(sample_metrics, indent=2)
399
- if len(sample_json) > 2000:
400
- sample_json = sample_json[:2000] + "\n... (truncated)"
382
+ metric = item.get("metric", {})
383
+ for label_key, label_value in metric.items():
384
+ if label_key not in instant_label_cardinality:
385
+ instant_label_cardinality[label_key] = set()
386
+ instant_label_cardinality[label_key].add(label_value)
387
+
388
+ # Convert sets to counts for the summary
389
+ label_summary = {
390
+ label: len(values) for label, values in instant_label_cardinality.items()
391
+ }
392
+ # Sort by cardinality (highest first) for better insights
393
+ label_summary = dict(
394
+ sorted(label_summary.items(), key=lambda x: x[1], reverse=True)
395
+ )
401
396
 
402
397
  return {
403
- "message": f"Data too large to return ({data_size_chars:,} characters). Query returned {num_items} results.",
398
+ "message": f"Data too large to return ({data_size_tokens:,} tokens). Query returned {num_items} results.",
404
399
  "result_count": num_items,
405
400
  "result_type": result_type,
406
- "data_size_characters": data_size_chars,
407
- "sample_data": sample_json,
401
+ "data_size_tokens": data_size_tokens,
402
+ "label_cardinality": label_summary,
408
403
  "suggestion": f'Consider using topk({min(5, num_items)}, {query}) to limit results. To also capture remaining data as \'other\': topk({min(5, num_items)}, {query}) or label_replace((sum({query}) - sum(topk({min(5, num_items)}, {query}))), "instance", "other", "", "")',
409
404
  }
410
405
 
411
406
 
412
- def fetch_metrics_labels_with_series_api(
413
- prometheus_url: str,
414
- headers: Dict[str, str],
415
- cache: Optional[TTLCache],
416
- metrics_labels_time_window_hrs: Union[int, None],
417
- metric_name: str,
418
- config=None,
419
- verify_ssl: bool = True,
420
- ) -> dict:
421
- """This is a slow query. Takes 5+ seconds to run"""
422
- cache_key = f"metrics_labels_series_api:{metric_name}"
423
- if cache:
424
- cached_result = cache.get(cache_key)
425
- if cached_result:
426
- return cached_result
427
-
428
- series_url = urljoin(prometheus_url, "api/v1/series")
429
- params: dict = {"match[]": f'{{__name__=~".*{metric_name}.*"}}', "limit": "10000"}
430
-
431
- if metrics_labels_time_window_hrs is not None:
432
- params["end"] = int(time.time())
433
- params["start"] = params["end"] - (metrics_labels_time_window_hrs * 60 * 60)
434
-
435
- series_response = do_request(
436
- config=config,
437
- url=series_url,
438
- headers=headers,
407
+ class MetricsBasedResponse(BaseModel):
408
+ status: str
409
+ error_message: Optional[str] = None
410
+ data: Optional[str] = None
411
+ random_key: str
412
+ tool_name: str
413
+ description: str
414
+ query: str
415
+ start: Optional[str] = None
416
+ end: Optional[str] = None
417
+ step: Optional[float] = None
418
+ output_type: Optional[str] = None
419
+ data_summary: Optional[dict[str, Any]] = None
420
+
421
+
422
+ def create_structured_tool_result(
423
+ params: dict, response: MetricsBasedResponse
424
+ ) -> StructuredToolResult:
425
+ status = StructuredToolResultStatus.SUCCESS
426
+ if response.error_message or response.status.lower() in ("failed", "error"):
427
+ status = StructuredToolResultStatus.ERROR
428
+ elif not response.data:
429
+ status = StructuredToolResultStatus.NO_DATA
430
+
431
+ return StructuredToolResult(
432
+ status=status,
433
+ data=response.model_dump_json(indent=2),
439
434
  params=params,
440
- timeout=60,
441
- verify=verify_ssl,
442
- method="GET",
443
435
  )
444
- series_response.raise_for_status()
445
- series = series_response.json()["data"]
446
-
447
- metrics_labels: dict = {}
448
- for serie in series:
449
- metric_name = serie["__name__"]
450
- # Add all labels except __name__
451
- labels = {k for k in serie.keys() if k != "__name__"}
452
- if metric_name in metrics_labels:
453
- metrics_labels[metric_name].update(labels)
454
- else:
455
- metrics_labels[metric_name] = labels
456
- if cache:
457
- cache.set(cache_key, metrics_labels)
458
-
459
- return metrics_labels
460
-
461
-
462
- def fetch_metrics_labels_with_labels_api(
463
- prometheus_url: str,
464
- cache: Optional[TTLCache],
465
- metrics_labels_time_window_hrs: Union[int, None],
466
- metric_names: List[str],
467
- headers: Dict,
468
- config=None,
469
- verify_ssl: bool = True,
470
- ) -> dict:
471
- metrics_labels = {}
472
-
473
- for metric_name in metric_names:
474
- cache_key = f"metrics_labels_labels_api:{metric_name}"
475
- if cache:
476
- cached_result = cache.get(cache_key)
477
- if cached_result:
478
- metrics_labels[metric_name] = cached_result
479
-
480
- url = urljoin(prometheus_url, "api/v1/labels")
481
- params: dict = {
482
- "match[]": f'{{__name__="{metric_name}"}}',
483
- }
484
- if metrics_labels_time_window_hrs is not None:
485
- params["end"] = int(time.time())
486
- params["start"] = params["end"] - (metrics_labels_time_window_hrs * 60 * 60)
487
-
488
- response = do_request(
489
- config=config,
490
- url=url,
491
- headers=headers,
492
- params=params,
493
- timeout=60,
494
- verify=verify_ssl,
495
- method="GET",
496
- )
497
- response.raise_for_status()
498
- labels = response.json()["data"]
499
- filtered_labels = {label for label in labels if label != "__name__"}
500
- metrics_labels[metric_name] = filtered_labels
501
-
502
- if cache:
503
- cache.set(cache_key, filtered_labels)
504
-
505
- return metrics_labels
506
-
507
-
508
- def fetch_metrics(
509
- prometheus_url: str,
510
- cache: Optional[TTLCache],
511
- metrics_labels_time_window_hrs: Union[int, None],
512
- metric_name: str,
513
- should_fetch_labels_with_labels_api: bool,
514
- should_fetch_metadata_with_series_api: bool,
515
- headers: Dict,
516
- config=None,
517
- verify_ssl: bool = True,
518
- ) -> dict:
519
- metrics = None
520
- should_fetch_labels = True
521
- if should_fetch_metadata_with_series_api:
522
- metrics = fetch_metadata_with_series_api(
523
- prometheus_url=prometheus_url,
524
- metric_name=metric_name,
525
- headers=headers,
526
- config=config,
527
- verify_ssl=verify_ssl,
528
- )
529
- should_fetch_labels = False # series API returns the labels
530
- else:
531
- metrics = fetch_metadata(
532
- prometheus_url=prometheus_url,
533
- headers=headers,
534
- config=config,
535
- verify_ssl=verify_ssl,
536
- )
537
- metrics = filter_metrics_by_name(metrics, metric_name)
538
-
539
- if should_fetch_labels:
540
- metrics_labels = {}
541
- if should_fetch_labels_with_labels_api:
542
- metrics_labels = fetch_metrics_labels_with_labels_api(
543
- prometheus_url=prometheus_url,
544
- cache=cache,
545
- metrics_labels_time_window_hrs=metrics_labels_time_window_hrs,
546
- metric_names=list(metrics.keys()),
547
- headers=headers,
548
- config=config,
549
- verify_ssl=verify_ssl,
550
- )
551
- else:
552
- metrics_labels = fetch_metrics_labels_with_series_api(
553
- prometheus_url=prometheus_url,
554
- cache=cache,
555
- metrics_labels_time_window_hrs=metrics_labels_time_window_hrs,
556
- metric_name=metric_name,
557
- headers=headers,
558
- config=config,
559
- verify_ssl=verify_ssl,
560
- )
561
-
562
- for metric_name in metrics:
563
- if metric_name in metrics_labels:
564
- metrics[metric_name]["labels"] = metrics_labels[metric_name]
565
-
566
- return metrics
567
436
 
568
437
 
569
438
  class ListPrometheusRules(BasePrometheusTool):
570
439
  def __init__(self, toolset: "PrometheusToolset"):
571
440
  super().__init__(
572
441
  name="list_prometheus_rules",
573
- description="List all defined prometheus rules. Will show the prometheus rules description, expression and annotations",
442
+ description="List all defined Prometheus rules (api/v1/rules). Will show the Prometheus rules description, expression and annotations",
574
443
  parameters={},
575
444
  toolset=toolset,
576
445
  )
577
446
  self._cache = None
578
447
 
579
- def _invoke(
580
- self, params: dict, user_approved: bool = False
581
- ) -> StructuredToolResult:
448
+ def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
582
449
  if not self.toolset.config or not self.toolset.config.prometheus_url:
583
450
  return StructuredToolResult(
584
451
  status=StructuredToolResultStatus.ERROR,
@@ -613,7 +480,7 @@ class ListPrometheusRules(BasePrometheusTool):
613
480
  config=self.toolset.config,
614
481
  url=rules_url,
615
482
  params=params,
616
- timeout=180,
483
+ timeout=40,
617
484
  verify=self.toolset.config.prometheus_ssl_enabled,
618
485
  headers=self.toolset.config.headers,
619
486
  method="GET",
@@ -654,120 +521,553 @@ class ListPrometheusRules(BasePrometheusTool):
654
521
  return f"{toolset_name_for_one_liner(self.toolset.name)}: Fetch Rules"
655
522
 
656
523
 
657
- class ListAvailableMetrics(BasePrometheusTool):
524
+ class GetMetricNames(BasePrometheusTool):
525
+ """Thin wrapper around /api/v1/label/__name__/values - the fastest way to discover metric names"""
526
+
658
527
  def __init__(self, toolset: "PrometheusToolset"):
659
528
  super().__init__(
660
- name="list_available_metrics",
661
- description="List all the available metrics to query from prometheus, including their types (counter, gauge, histogram, summary) and available labels.",
529
+ name="get_metric_names",
530
+ description=(
531
+ "Get list of metric names using /api/v1/label/__name__/values. "
532
+ "FASTEST method for metric discovery when you need to explore available metrics. "
533
+ f"Returns up to {PROMETHEUS_METADATA_API_LIMIT} unique metric names (limit={PROMETHEUS_METADATA_API_LIMIT}). If {PROMETHEUS_METADATA_API_LIMIT} results returned, more may exist - use a more specific filter. "
534
+ f"ALWAYS use match[] parameter to filter metrics - without it you'll get random {PROMETHEUS_METADATA_API_LIMIT} metrics which is rarely useful. "
535
+ "Note: Does not return metric metadata (type, description, labels). "
536
+ "By default returns metrics active in the last 1 hour (configurable via default_metadata_time_window_hrs)."
537
+ ),
662
538
  parameters={
663
- "type_filter": ToolParameter(
664
- description="Optional filter to only return a specific metric type. Can be one of counter, gauge, histogram, summary",
539
+ "match": ToolParameter(
540
+ description=(
541
+ "REQUIRED: PromQL selector to filter metrics. Use regex OR (|) to check multiple patterns in one call - much faster than multiple calls! Examples: "
542
+ "'{__name__=~\"node_cpu.*|node_memory.*|node_disk.*\"}' for all node resource metrics, "
543
+ "'{__name__=~\"container_cpu.*|container_memory.*|container_network.*\"}' for all container metrics, "
544
+ "'{__name__=~\"kube_pod.*|kube_deployment.*|kube_service.*\"}' for multiple Kubernetes object metrics, "
545
+ "'{__name__=~\".*cpu.*|.*memory.*|.*disk.*\"}' for all resource metrics, "
546
+ "'{namespace=~\"kube-system|default|monitoring\"}' for metrics from multiple namespaces, "
547
+ "'{job=~\"prometheus|node-exporter|kube-state-metrics\"}' for metrics from multiple jobs."
548
+ ),
549
+ type="string",
550
+ required=True,
551
+ ),
552
+ "start": ToolParameter(
553
+ description="Start timestamp (RFC3339 or Unix). Default: 1 hour ago",
665
554
  type="string",
666
555
  required=False,
667
556
  ),
668
- "name_filter": ToolParameter(
669
- description="Only the metrics partially or fully matching this name will be returned",
557
+ "end": ToolParameter(
558
+ description="End timestamp (RFC3339 or Unix). Default: now",
670
559
  type="string",
671
- required=True,
560
+ required=False,
672
561
  ),
673
562
  },
674
563
  toolset=toolset,
675
564
  )
676
- self._cache = None
677
565
 
678
- def _invoke(
679
- self, params: dict, user_approved: bool = False
680
- ) -> StructuredToolResult:
566
+ def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
681
567
  if not self.toolset.config or not self.toolset.config.prometheus_url:
682
568
  return StructuredToolResult(
683
569
  status=StructuredToolResultStatus.ERROR,
684
570
  error="Prometheus is not configured. Prometheus URL is missing",
685
571
  params=params,
686
572
  )
687
- if not self._cache and self.toolset.config.metrics_labels_cache_duration_hrs:
688
- self._cache = TTLCache(
689
- self.toolset.config.metrics_labels_cache_duration_hrs * 3600 # type: ignore
690
- )
691
573
  try:
692
- prometheus_url = self.toolset.config.prometheus_url
693
- metrics_labels_time_window_hrs = (
694
- self.toolset.config.metrics_labels_time_window_hrs
574
+ match_param = params.get("match")
575
+ if not match_param:
576
+ return StructuredToolResult(
577
+ status=StructuredToolResultStatus.ERROR,
578
+ error="Match parameter is required to filter metrics",
579
+ params=params,
580
+ )
581
+
582
+ url = urljoin(
583
+ self.toolset.config.prometheus_url, "api/v1/label/__name__/values"
584
+ )
585
+ query_params = {
586
+ "limit": str(PROMETHEUS_METADATA_API_LIMIT),
587
+ "match[]": match_param,
588
+ }
589
+
590
+ # Add time parameters - use provided values or defaults
591
+ if params.get("end"):
592
+ query_params["end"] = params["end"]
593
+ else:
594
+ query_params["end"] = str(int(time.time()))
595
+
596
+ if params.get("start"):
597
+ query_params["start"] = params["start"]
598
+ elif self.toolset.config.default_metadata_time_window_hrs:
599
+ # Use default time window
600
+ query_params["start"] = str(
601
+ int(time.time())
602
+ - (self.toolset.config.default_metadata_time_window_hrs * 3600)
603
+ )
604
+
605
+ response = do_request(
606
+ config=self.toolset.config,
607
+ url=url,
608
+ params=query_params,
609
+ timeout=self.toolset.config.default_metadata_timeout_seconds,
610
+ verify=self.toolset.config.prometheus_ssl_enabled,
611
+ headers=self.toolset.config.headers,
612
+ method="GET",
695
613
  )
614
+ response.raise_for_status()
615
+ data = response.json()
616
+
617
+ # Check if results were truncated
618
+ if (
619
+ "data" in data
620
+ and isinstance(data["data"], list)
621
+ and len(data["data"]) == PROMETHEUS_METADATA_API_LIMIT
622
+ ):
623
+ data["_truncated"] = True
624
+ data["_message"] = (
625
+ f"Results truncated at limit={PROMETHEUS_METADATA_API_LIMIT}. Use a more specific match filter to see additional metrics."
626
+ )
696
627
 
697
- name_filter = params.get("name_filter")
698
- if not name_filter:
628
+ return StructuredToolResult(
629
+ status=StructuredToolResultStatus.SUCCESS,
630
+ data=data,
631
+ params=params,
632
+ )
633
+ except Exception as e:
634
+ return StructuredToolResult(
635
+ status=StructuredToolResultStatus.ERROR,
636
+ error=str(e),
637
+ params=params,
638
+ )
639
+
640
+ def get_parameterized_one_liner(self, params) -> str:
641
+ return f"{toolset_name_for_one_liner(self.toolset.name)}: Get Metric Names"
642
+
643
+
644
+ class GetLabelValues(BasePrometheusTool):
645
+ """Get values for a specific label across all metrics"""
646
+
647
+ def __init__(self, toolset: "PrometheusToolset"):
648
+ super().__init__(
649
+ name="get_label_values",
650
+ description=(
651
+ "Get all values for a specific label using /api/v1/label/{label}/values. "
652
+ "Use this to discover pods, namespaces, jobs, instances, etc. "
653
+ f"Returns up to {PROMETHEUS_METADATA_API_LIMIT} unique values (limit={PROMETHEUS_METADATA_API_LIMIT}). If {PROMETHEUS_METADATA_API_LIMIT} results returned, more may exist - use match[] to filter. "
654
+ "Supports optional match[] parameter to filter. "
655
+ "By default returns values from metrics active in the last 1 hour (configurable via default_metadata_time_window_hrs)."
656
+ ),
657
+ parameters={
658
+ "label": ToolParameter(
659
+ description="Label name to get values for (e.g., 'pod', 'namespace', 'job', 'instance')",
660
+ type="string",
661
+ required=True,
662
+ ),
663
+ "match": ToolParameter(
664
+ description=(
665
+ "Optional PromQL selector to filter (e.g., '{__name__=~\"kube.*\"}', "
666
+ "'{namespace=\"default\"}')."
667
+ ),
668
+ type="string",
669
+ required=False,
670
+ ),
671
+ "start": ToolParameter(
672
+ description="Start timestamp (RFC3339 or Unix). Default: 1 hour ago",
673
+ type="string",
674
+ required=False,
675
+ ),
676
+ "end": ToolParameter(
677
+ description="End timestamp (RFC3339 or Unix). Default: now",
678
+ type="string",
679
+ required=False,
680
+ ),
681
+ },
682
+ toolset=toolset,
683
+ )
684
+
685
+ def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
686
+ if not self.toolset.config or not self.toolset.config.prometheus_url:
687
+ return StructuredToolResult(
688
+ status=StructuredToolResultStatus.ERROR,
689
+ error="Prometheus is not configured. Prometheus URL is missing",
690
+ params=params,
691
+ )
692
+ try:
693
+ label = params.get("label")
694
+ if not label:
699
695
  return StructuredToolResult(
700
696
  status=StructuredToolResultStatus.ERROR,
701
- error="Error: cannot run tool 'list_available_metrics'. The param 'name_filter' is required but is missing.",
697
+ error="Label parameter is required",
702
698
  params=params,
703
699
  )
704
700
 
705
- metrics = fetch_metrics(
706
- prometheus_url=prometheus_url,
707
- cache=self._cache,
708
- metrics_labels_time_window_hrs=metrics_labels_time_window_hrs,
709
- metric_name=name_filter,
710
- should_fetch_labels_with_labels_api=self.toolset.config.fetch_labels_with_labels_api,
711
- should_fetch_metadata_with_series_api=self.toolset.config.fetch_metadata_with_series_api,
712
- headers=self.toolset.config.headers,
701
+ url = urljoin(
702
+ self.toolset.config.prometheus_url, f"api/v1/label/{label}/values"
703
+ )
704
+ query_params = {"limit": str(PROMETHEUS_METADATA_API_LIMIT)}
705
+ if params.get("match"):
706
+ query_params["match[]"] = params["match"]
707
+
708
+ # Add time parameters - use provided values or defaults
709
+ if params.get("end"):
710
+ query_params["end"] = params["end"]
711
+ else:
712
+ query_params["end"] = str(int(time.time()))
713
+
714
+ if params.get("start"):
715
+ query_params["start"] = params["start"]
716
+ elif self.toolset.config.default_metadata_time_window_hrs:
717
+ # Use default time window
718
+ query_params["start"] = str(
719
+ int(time.time())
720
+ - (self.toolset.config.default_metadata_time_window_hrs * 3600)
721
+ )
722
+
723
+ response = do_request(
713
724
  config=self.toolset.config,
714
- verify_ssl=self.toolset.config.prometheus_ssl_enabled,
725
+ url=url,
726
+ params=query_params,
727
+ timeout=self.toolset.config.default_metadata_timeout_seconds,
728
+ verify=self.toolset.config.prometheus_ssl_enabled,
729
+ headers=self.toolset.config.headers,
730
+ method="GET",
715
731
  )
732
+ response.raise_for_status()
733
+ data = response.json()
734
+
735
+ # Check if results were truncated
736
+ if (
737
+ "data" in data
738
+ and isinstance(data["data"], list)
739
+ and len(data["data"]) == PROMETHEUS_METADATA_API_LIMIT
740
+ ):
741
+ data["_truncated"] = True
742
+ data["_message"] = (
743
+ f"Results truncated at limit={PROMETHEUS_METADATA_API_LIMIT}. Use match[] parameter to filter label '{label}' values."
744
+ )
745
+
746
+ return StructuredToolResult(
747
+ status=StructuredToolResultStatus.SUCCESS,
748
+ data=data,
749
+ params=params,
750
+ )
751
+ except Exception as e:
752
+ return StructuredToolResult(
753
+ status=StructuredToolResultStatus.ERROR,
754
+ error=str(e),
755
+ params=params,
756
+ )
757
+
758
+ def get_parameterized_one_liner(self, params) -> str:
759
+ label = params.get("label", "")
760
+ return f"{toolset_name_for_one_liner(self.toolset.name)}: Get {label} Values"
716
761
 
717
- type_filter = params.get("type_filter")
718
- if type_filter:
719
- metrics = filter_metrics_by_type(metrics, type_filter)
720
762
 
721
- output = ["Metric | Description | Type | Labels"]
722
- output.append("-" * 100)
763
+ class GetAllLabels(BasePrometheusTool):
764
+ """Get all label names that exist in Prometheus"""
723
765
 
724
- for metric, info in sorted(metrics.items()):
725
- labels_str = (
726
- ", ".join(sorted(info["labels"])) if info["labels"] else "none"
766
+ def __init__(self, toolset: "PrometheusToolset"):
767
+ super().__init__(
768
+ name="get_all_labels",
769
+ description=(
770
+ "Get list of all label names using /api/v1/labels. "
771
+ "Use this to discover what labels are available across all metrics. "
772
+ f"Returns up to {PROMETHEUS_METADATA_API_LIMIT} label names (limit={PROMETHEUS_METADATA_API_LIMIT}). If {PROMETHEUS_METADATA_API_LIMIT} results returned, more may exist - use match[] to filter. "
773
+ "Supports optional match[] parameter to filter. "
774
+ "By default returns labels from metrics active in the last 1 hour (configurable via default_metadata_time_window_hrs)."
775
+ ),
776
+ parameters={
777
+ "match": ToolParameter(
778
+ description=(
779
+ "Optional PromQL selector to filter (e.g., '{__name__=~\"kube.*\"}', "
780
+ "'{job=\"prometheus\"}')."
781
+ ),
782
+ type="string",
783
+ required=False,
784
+ ),
785
+ "start": ToolParameter(
786
+ description="Start timestamp (RFC3339 or Unix). Default: 1 hour ago",
787
+ type="string",
788
+ required=False,
789
+ ),
790
+ "end": ToolParameter(
791
+ description="End timestamp (RFC3339 or Unix). Default: now",
792
+ type="string",
793
+ required=False,
794
+ ),
795
+ },
796
+ toolset=toolset,
797
+ )
798
+
799
+ def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
800
+ if not self.toolset.config or not self.toolset.config.prometheus_url:
801
+ return StructuredToolResult(
802
+ status=StructuredToolResultStatus.ERROR,
803
+ error="Prometheus is not configured. Prometheus URL is missing",
804
+ params=params,
805
+ )
806
+ try:
807
+ url = urljoin(self.toolset.config.prometheus_url, "api/v1/labels")
808
+ query_params = {"limit": str(PROMETHEUS_METADATA_API_LIMIT)}
809
+ if params.get("match"):
810
+ query_params["match[]"] = params["match"]
811
+
812
+ # Add time parameters - use provided values or defaults
813
+ if params.get("end"):
814
+ query_params["end"] = params["end"]
815
+ else:
816
+ query_params["end"] = str(int(time.time()))
817
+
818
+ if params.get("start"):
819
+ query_params["start"] = params["start"]
820
+ elif self.toolset.config.default_metadata_time_window_hrs:
821
+ # Use default time window
822
+ query_params["start"] = str(
823
+ int(time.time())
824
+ - (self.toolset.config.default_metadata_time_window_hrs * 3600)
727
825
  )
728
- output.append(
729
- f"{metric} | {info['description']} | {info['type']} | {labels_str}"
826
+
827
+ response = do_request(
828
+ config=self.toolset.config,
829
+ url=url,
830
+ params=query_params,
831
+ timeout=self.toolset.config.default_metadata_timeout_seconds,
832
+ verify=self.toolset.config.prometheus_ssl_enabled,
833
+ headers=self.toolset.config.headers,
834
+ method="GET",
835
+ )
836
+ response.raise_for_status()
837
+ data = response.json()
838
+
839
+ # Check if results were truncated
840
+ if (
841
+ "data" in data
842
+ and isinstance(data["data"], list)
843
+ and len(data["data"]) == PROMETHEUS_METADATA_API_LIMIT
844
+ ):
845
+ data["_truncated"] = True
846
+ data["_message"] = (
847
+ f"Results truncated at limit={PROMETHEUS_METADATA_API_LIMIT}. Use match[] parameter to filter labels."
730
848
  )
731
849
 
732
- table_output = "\n".join(output)
733
850
  return StructuredToolResult(
734
851
  status=StructuredToolResultStatus.SUCCESS,
735
- data=table_output,
852
+ data=data,
853
+ params=params,
854
+ )
855
+ except Exception as e:
856
+ return StructuredToolResult(
857
+ status=StructuredToolResultStatus.ERROR,
858
+ error=str(e),
736
859
  params=params,
737
860
  )
738
861
 
739
- except requests.Timeout:
740
- logging.warn("Timeout while fetching prometheus metrics", exc_info=True)
862
+ def get_parameterized_one_liner(self, params) -> str:
863
+ return f"{toolset_name_for_one_liner(self.toolset.name)}: Get All Labels"
864
+
865
+
866
+ class GetSeries(BasePrometheusTool):
867
+ """Get time series matching a selector"""
868
+
869
+ def __init__(self, toolset: "PrometheusToolset"):
870
+ super().__init__(
871
+ name="get_series",
872
+ description=(
873
+ "Get time series using /api/v1/series. "
874
+ "Returns label sets for all time series matching the selector. "
875
+ "SLOWER than other discovery methods - use only when you need full label sets. "
876
+ f"Returns up to {PROMETHEUS_METADATA_API_LIMIT} series (limit={PROMETHEUS_METADATA_API_LIMIT}). If {PROMETHEUS_METADATA_API_LIMIT} results returned, more series exist - use more specific selector. "
877
+ "Requires match[] parameter with PromQL selector. "
878
+ "By default returns series active in the last 1 hour (configurable via default_metadata_time_window_hrs)."
879
+ ),
880
+ parameters={
881
+ "match": ToolParameter(
882
+ description=(
883
+ "PromQL selector to match series (e.g., 'up', 'node_cpu_seconds_total', "
884
+ "'{__name__=~\"node.*\"}', '{job=\"prometheus\"}', "
885
+ '\'{__name__="up",job="prometheus"}\').'
886
+ ),
887
+ type="string",
888
+ required=True,
889
+ ),
890
+ "start": ToolParameter(
891
+ description="Start timestamp (RFC3339 or Unix). Default: 1 hour ago",
892
+ type="string",
893
+ required=False,
894
+ ),
895
+ "end": ToolParameter(
896
+ description="End timestamp (RFC3339 or Unix). Default: now",
897
+ type="string",
898
+ required=False,
899
+ ),
900
+ },
901
+ toolset=toolset,
902
+ )
903
+
904
+ def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
905
+ if not self.toolset.config or not self.toolset.config.prometheus_url:
741
906
  return StructuredToolResult(
742
907
  status=StructuredToolResultStatus.ERROR,
743
- error="Request timed out while fetching metrics",
908
+ error="Prometheus is not configured. Prometheus URL is missing",
744
909
  params=params,
745
910
  )
746
- except RequestException as e:
747
- logging.warn("Failed to fetch prometheus metrics", exc_info=True)
911
+ try:
912
+ match = params.get("match")
913
+ if not match:
914
+ return StructuredToolResult(
915
+ status=StructuredToolResultStatus.ERROR,
916
+ error="Match parameter is required",
917
+ params=params,
918
+ )
919
+
920
+ url = urljoin(self.toolset.config.prometheus_url, "api/v1/series")
921
+ query_params = {
922
+ "match[]": match,
923
+ "limit": str(PROMETHEUS_METADATA_API_LIMIT),
924
+ }
925
+
926
+ # Add time parameters - use provided values or defaults
927
+ if params.get("end"):
928
+ query_params["end"] = params["end"]
929
+ else:
930
+ query_params["end"] = str(int(time.time()))
931
+
932
+ if params.get("start"):
933
+ query_params["start"] = params["start"]
934
+ elif self.toolset.config.default_metadata_time_window_hrs:
935
+ # Use default time window
936
+ query_params["start"] = str(
937
+ int(time.time())
938
+ - (self.toolset.config.default_metadata_time_window_hrs * 3600)
939
+ )
940
+
941
+ response = do_request(
942
+ config=self.toolset.config,
943
+ url=url,
944
+ params=query_params,
945
+ timeout=self.toolset.config.default_metadata_timeout_seconds,
946
+ verify=self.toolset.config.prometheus_ssl_enabled,
947
+ headers=self.toolset.config.headers,
948
+ method="GET",
949
+ )
950
+ response.raise_for_status()
951
+ data = response.json()
952
+
953
+ # Check if results were truncated
954
+ if (
955
+ "data" in data
956
+ and isinstance(data["data"], list)
957
+ and len(data["data"]) == PROMETHEUS_METADATA_API_LIMIT
958
+ ):
959
+ data["_truncated"] = True
960
+ data["_message"] = (
961
+ f"Results truncated at limit={PROMETHEUS_METADATA_API_LIMIT}. Use a more specific match selector to see additional series."
962
+ )
963
+
964
+ return StructuredToolResult(
965
+ status=StructuredToolResultStatus.SUCCESS,
966
+ data=data,
967
+ params=params,
968
+ )
969
+ except Exception as e:
970
+ return StructuredToolResult(
971
+ status=StructuredToolResultStatus.ERROR,
972
+ error=str(e),
973
+ params=params,
974
+ )
975
+
976
+ def get_parameterized_one_liner(self, params) -> str:
977
+ return f"{toolset_name_for_one_liner(self.toolset.name)}: Get Series"
978
+
979
+
980
+ class GetMetricMetadata(BasePrometheusTool):
981
+ """Get metadata (type, description, unit) for metrics"""
982
+
983
+ def __init__(self, toolset: "PrometheusToolset"):
984
+ super().__init__(
985
+ name="get_metric_metadata",
986
+ description=(
987
+ "Get metric metadata using /api/v1/metadata. "
988
+ "Returns type, help text, and unit for metrics. "
989
+ "Use after discovering metric names to get their descriptions. "
990
+ f"Returns up to {PROMETHEUS_METADATA_API_LIMIT} metrics (limit={PROMETHEUS_METADATA_API_LIMIT}). If {PROMETHEUS_METADATA_API_LIMIT} results returned, more may exist - filter by specific metric name. "
991
+ "Supports optional metric name filter."
992
+ ),
993
+ parameters={
994
+ "metric": ToolParameter(
995
+ description=(
996
+ "Optional metric name to filter (e.g., 'up', 'node_cpu_seconds_total'). "
997
+ "If not provided, returns metadata for all metrics."
998
+ ),
999
+ type="string",
1000
+ required=False,
1001
+ ),
1002
+ },
1003
+ toolset=toolset,
1004
+ )
1005
+
1006
+ def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
1007
+ if not self.toolset.config or not self.toolset.config.prometheus_url:
748
1008
  return StructuredToolResult(
749
1009
  status=StructuredToolResultStatus.ERROR,
750
- error=f"Network error while fetching metrics: {str(e)}",
1010
+ error="Prometheus is not configured. Prometheus URL is missing",
1011
+ params=params,
1012
+ )
1013
+ try:
1014
+ url = urljoin(self.toolset.config.prometheus_url, "api/v1/metadata")
1015
+ query_params = {"limit": str(PROMETHEUS_METADATA_API_LIMIT)}
1016
+
1017
+ if params.get("metric"):
1018
+ query_params["metric"] = params["metric"]
1019
+
1020
+ response = do_request(
1021
+ config=self.toolset.config,
1022
+ url=url,
1023
+ params=query_params,
1024
+ timeout=self.toolset.config.default_metadata_timeout_seconds,
1025
+ verify=self.toolset.config.prometheus_ssl_enabled,
1026
+ headers=self.toolset.config.headers,
1027
+ method="GET",
1028
+ )
1029
+ response.raise_for_status()
1030
+ data = response.json()
1031
+
1032
+ # Check if results were truncated (metadata endpoint returns a dict, not a list)
1033
+ if (
1034
+ "data" in data
1035
+ and isinstance(data["data"], dict)
1036
+ and len(data["data"]) == PROMETHEUS_METADATA_API_LIMIT
1037
+ ):
1038
+ data["_truncated"] = True
1039
+ data["_message"] = (
1040
+ f"Results truncated at limit={PROMETHEUS_METADATA_API_LIMIT}. Use metric parameter to filter by specific metric name."
1041
+ )
1042
+
1043
+ return StructuredToolResult(
1044
+ status=StructuredToolResultStatus.SUCCESS,
1045
+ data=data,
751
1046
  params=params,
752
1047
  )
753
1048
  except Exception as e:
754
- logging.warn("Failed to process prometheus metrics", exc_info=True)
755
1049
  return StructuredToolResult(
756
1050
  status=StructuredToolResultStatus.ERROR,
757
- error=f"Unexpected error: {str(e)}",
1051
+ error=str(e),
758
1052
  params=params,
759
1053
  )
760
1054
 
761
1055
  def get_parameterized_one_liner(self, params) -> str:
762
- name_filter = params.get("name_filter", "")
763
- return f"{toolset_name_for_one_liner(self.toolset.name)}: Search Metrics ({name_filter})"
1056
+ metric = params.get("metric", "all")
1057
+ return (
1058
+ f"{toolset_name_for_one_liner(self.toolset.name)}: Get Metadata ({metric})"
1059
+ )
764
1060
 
765
1061
 
766
1062
  class ExecuteInstantQuery(BasePrometheusTool):
767
1063
  def __init__(self, toolset: "PrometheusToolset"):
768
1064
  super().__init__(
769
1065
  name="execute_prometheus_instant_query",
770
- description="Execute an instant PromQL query",
1066
+ description=(
1067
+ f"Execute an instant PromQL query (single point in time). "
1068
+ f"Default timeout is {DEFAULT_QUERY_TIMEOUT_SECONDS} seconds "
1069
+ f"but can be increased up to {MAX_QUERY_TIMEOUT_SECONDS} seconds for complex/slow queries."
1070
+ ),
771
1071
  parameters={
772
1072
  "query": ToolParameter(
773
1073
  description="The PromQL query",
@@ -779,13 +1079,20 @@ class ExecuteInstantQuery(BasePrometheusTool):
779
1079
  type="string",
780
1080
  required=True,
781
1081
  ),
1082
+ "timeout": ToolParameter(
1083
+ description=(
1084
+ f"Query timeout in seconds. Default: {DEFAULT_QUERY_TIMEOUT_SECONDS}. "
1085
+ f"Maximum: {MAX_QUERY_TIMEOUT_SECONDS}. "
1086
+ f"Increase for complex queries that may take longer."
1087
+ ),
1088
+ type="number",
1089
+ required=False,
1090
+ ),
782
1091
  },
783
1092
  toolset=toolset,
784
1093
  )
785
1094
 
786
- def _invoke(
787
- self, params: dict, user_approved: bool = False
788
- ) -> StructuredToolResult:
1095
+ def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
789
1096
  if not self.toolset.config or not self.toolset.config.prometheus_url:
790
1097
  return StructuredToolResult(
791
1098
  status=StructuredToolResultStatus.ERROR,
@@ -800,12 +1107,24 @@ class ExecuteInstantQuery(BasePrometheusTool):
800
1107
 
801
1108
  payload = {"query": query}
802
1109
 
1110
+ # Get timeout parameter and enforce limits
1111
+ default_timeout = self.toolset.config.default_query_timeout_seconds
1112
+ max_timeout = self.toolset.config.max_query_timeout_seconds
1113
+ timeout = params.get("timeout", default_timeout)
1114
+ if timeout > max_timeout:
1115
+ timeout = max_timeout
1116
+ logging.warning(
1117
+ f"Timeout requested ({params.get('timeout')}) exceeds maximum ({max_timeout}s), using {max_timeout}s"
1118
+ )
1119
+ elif timeout < 1:
1120
+ timeout = default_timeout # Min 1 second, but use default if invalid
1121
+
803
1122
  response = do_request(
804
1123
  config=self.toolset.config,
805
1124
  url=url,
806
1125
  headers=self.toolset.config.headers,
807
1126
  data=payload,
808
- timeout=60,
1127
+ timeout=timeout,
809
1128
  verify=self.toolset.config.prometheus_ssl_enabled,
810
1129
  method="POST",
811
1130
  )
@@ -819,51 +1138,64 @@ class ExecuteInstantQuery(BasePrometheusTool):
819
1138
  error_message = (
820
1139
  "The prometheus query returned no result. Is the query correct?"
821
1140
  )
822
- response_data = {
823
- "status": status,
824
- "error_message": error_message,
825
- "random_key": generate_random_key(),
826
- "tool_name": self.name,
827
- "description": description,
828
- "query": query,
829
- }
830
-
1141
+ response_data = MetricsBasedResponse(
1142
+ status=status,
1143
+ error_message=error_message,
1144
+ random_key=generate_random_key(),
1145
+ tool_name=self.name,
1146
+ description=description,
1147
+ query=query,
1148
+ )
1149
+ structured_tool_result: StructuredToolResult
831
1150
  # Check if data should be included based on size
832
1151
  if self.toolset.config.tool_calls_return_data:
833
1152
  result_data = data.get("data", {})
1153
+ response_data.data = result_data
1154
+
1155
+ structured_tool_result = create_structured_tool_result(
1156
+ params=params, response=response_data
1157
+ )
1158
+ token_count = count_tool_response_tokens(
1159
+ llm=context.llm, structured_tool_result=structured_tool_result
1160
+ )
834
1161
 
835
- # Estimate the size of the data
836
- data_str_preview = json.dumps(result_data)
837
- data_size_chars = len(data_str_preview)
1162
+ token_limit = context.max_token_count
1163
+ if self.toolset.config.query_response_size_limit_pct:
1164
+ custom_token_limit = get_pct_token_count(
1165
+ percent_of_total_context_window=self.toolset.config.query_response_size_limit_pct,
1166
+ llm=context.llm,
1167
+ )
1168
+ if custom_token_limit < token_limit:
1169
+ token_limit = custom_token_limit
838
1170
 
839
1171
  # Provide summary if data is too large
840
- if (
841
- self.toolset.config.query_response_size_limit
842
- and data_size_chars
843
- > self.toolset.config.query_response_size_limit
844
- ):
845
- response_data["data_summary"] = (
1172
+ if token_count > token_limit:
1173
+ response_data.data = None
1174
+ response_data.data_summary = (
846
1175
  create_data_summary_for_large_result(
847
1176
  result_data,
848
1177
  query,
849
- data_size_chars,
1178
+ token_count,
850
1179
  is_range_query=False,
851
1180
  )
852
1181
  )
853
1182
  logging.info(
854
1183
  f"Prometheus instant query returned large dataset: "
855
- f"{response_data['data_summary'].get('result_count', 0)} results, "
856
- f"{data_size_chars:,} characters. Returning summary instead of full data."
1184
+ f"{response_data.data_summary.get('result_count', 0)} results, "
1185
+ f"{token_count:,} tokens (limit: {token_limit:,}). "
1186
+ f"Returning summary instead of full data."
1187
+ )
1188
+ # Also add token info to the summary for debugging
1189
+ response_data.data_summary["_debug_info"] = (
1190
+ f"Data size: {token_count:,} tokens exceeded limit of {token_limit:,} tokens"
857
1191
  )
858
1192
  else:
859
- response_data["data"] = result_data
1193
+ response_data.data = result_data
860
1194
 
861
- data_str = json.dumps(response_data, indent=2)
862
- return StructuredToolResult(
863
- status=StructuredToolResultStatus.SUCCESS,
864
- data=data_str,
865
- params=params,
1195
+ structured_tool_result = create_structured_tool_result(
1196
+ params=params, response=response_data
866
1197
  )
1198
+ return structured_tool_result
867
1199
 
868
1200
  # Handle known Prometheus error status codes
869
1201
  error_msg = "Unknown error occurred"
@@ -912,7 +1244,12 @@ class ExecuteRangeQuery(BasePrometheusTool):
912
1244
  def __init__(self, toolset: "PrometheusToolset"):
913
1245
  super().__init__(
914
1246
  name="execute_prometheus_range_query",
915
- description="Generates a graph and Execute a PromQL range query",
1247
+ description=(
1248
+ f"Generates a graph and Execute a PromQL range query. "
1249
+ f"Default timeout is {DEFAULT_QUERY_TIMEOUT_SECONDS} seconds "
1250
+ f"but can be increased up to {MAX_QUERY_TIMEOUT_SECONDS} seconds for complex/slow queries. "
1251
+ f"Default time range is last 1 hour."
1252
+ ),
916
1253
  parameters={
917
1254
  "query": ToolParameter(
918
1255
  description="The PromQL query",
@@ -946,13 +1283,30 @@ class ExecuteRangeQuery(BasePrometheusTool):
946
1283
  type="string",
947
1284
  required=True,
948
1285
  ),
1286
+ "timeout": ToolParameter(
1287
+ description=(
1288
+ f"Query timeout in seconds. Default: {DEFAULT_QUERY_TIMEOUT_SECONDS}. "
1289
+ f"Maximum: {MAX_QUERY_TIMEOUT_SECONDS}. "
1290
+ f"Increase for complex queries that may take longer."
1291
+ ),
1292
+ type="number",
1293
+ required=False,
1294
+ ),
1295
+ "max_points": ToolParameter(
1296
+ description=(
1297
+ f"Maximum number of data points to return. Default: {int(MAX_GRAPH_POINTS)}. "
1298
+ f"Can be reduced to get fewer data points (e.g., 50 for simpler graphs). "
1299
+ f"Cannot exceed system limit of {int(MAX_GRAPH_POINTS)}. "
1300
+ f"If your query would return more points than this limit, the step will be automatically adjusted."
1301
+ ),
1302
+ type="number",
1303
+ required=False,
1304
+ ),
949
1305
  },
950
1306
  toolset=toolset,
951
1307
  )
952
1308
 
953
- def _invoke(
954
- self, params: dict, user_approved: bool = False
955
- ) -> StructuredToolResult:
1309
+ def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
956
1310
  if not self.toolset.config or not self.toolset.config.prometheus_url:
957
1311
  return StructuredToolResult(
958
1312
  status=StructuredToolResultStatus.ERROR,
@@ -970,12 +1324,16 @@ class ExecuteRangeQuery(BasePrometheusTool):
970
1324
  default_time_span_seconds=DEFAULT_GRAPH_TIME_SPAN_SECONDS,
971
1325
  )
972
1326
  step = parse_duration_to_seconds(params.get("step"))
1327
+ max_points = params.get(
1328
+ "max_points"
1329
+ ) # Get the optional max_points parameter
973
1330
 
974
1331
  # adjust_step_for_max_points handles None case and converts to float
975
1332
  step = adjust_step_for_max_points(
976
1333
  start_timestamp=start,
977
1334
  end_timestamp=end,
978
1335
  step=step,
1336
+ max_points_override=max_points,
979
1337
  )
980
1338
 
981
1339
  description = params.get("description", "")
@@ -987,12 +1345,24 @@ class ExecuteRangeQuery(BasePrometheusTool):
987
1345
  "step": step,
988
1346
  }
989
1347
 
1348
+ # Get timeout parameter and enforce limits
1349
+ default_timeout = self.toolset.config.default_query_timeout_seconds
1350
+ max_timeout = self.toolset.config.max_query_timeout_seconds
1351
+ timeout = params.get("timeout", default_timeout)
1352
+ if timeout > max_timeout:
1353
+ timeout = max_timeout
1354
+ logging.warning(
1355
+ f"Timeout requested ({params.get('timeout')}) exceeds maximum ({max_timeout}s), using {max_timeout}s"
1356
+ )
1357
+ elif timeout < 1:
1358
+ timeout = default_timeout # Min 1 second, but use default if invalid
1359
+
990
1360
  response = do_request(
991
1361
  config=self.toolset.config,
992
1362
  url=url,
993
1363
  headers=self.toolset.config.headers,
994
1364
  data=payload,
995
- timeout=120,
1365
+ timeout=timeout,
996
1366
  verify=self.toolset.config.prometheus_ssl_enabled,
997
1367
  method="POST",
998
1368
  )
@@ -1006,54 +1376,69 @@ class ExecuteRangeQuery(BasePrometheusTool):
1006
1376
  error_message = (
1007
1377
  "The prometheus query returned no result. Is the query correct?"
1008
1378
  )
1009
- response_data = {
1010
- "status": status,
1011
- "error_message": error_message,
1012
- "random_key": generate_random_key(),
1013
- "tool_name": self.name,
1014
- "description": description,
1015
- "query": query,
1016
- "start": start,
1017
- "end": end,
1018
- "step": step,
1019
- "output_type": output_type,
1020
- }
1379
+ response_data = MetricsBasedResponse(
1380
+ status=status,
1381
+ error_message=error_message,
1382
+ random_key=generate_random_key(),
1383
+ tool_name=self.name,
1384
+ description=description,
1385
+ query=query,
1386
+ start=start,
1387
+ end=end,
1388
+ step=step,
1389
+ output_type=output_type,
1390
+ )
1391
+
1392
+ structured_tool_result: StructuredToolResult
1021
1393
 
1022
1394
  # Check if data should be included based on size
1023
1395
  if self.toolset.config.tool_calls_return_data:
1024
1396
  result_data = data.get("data", {})
1397
+ response_data.data = result_data
1398
+ structured_tool_result = create_structured_tool_result(
1399
+ params=params, response=response_data
1400
+ )
1025
1401
 
1026
- # Estimate the size of the data
1027
- data_str_preview = json.dumps(result_data)
1028
- data_size_chars = len(data_str_preview)
1402
+ token_count = count_tool_response_tokens(
1403
+ llm=context.llm, structured_tool_result=structured_tool_result
1404
+ )
1405
+
1406
+ token_limit = context.max_token_count
1407
+ if self.toolset.config.query_response_size_limit_pct:
1408
+ custom_token_limit = get_pct_token_count(
1409
+ percent_of_total_context_window=self.toolset.config.query_response_size_limit_pct,
1410
+ llm=context.llm,
1411
+ )
1412
+ if custom_token_limit < token_limit:
1413
+ token_limit = custom_token_limit
1029
1414
 
1030
1415
  # Provide summary if data is too large
1031
- if (
1032
- self.toolset.config.query_response_size_limit
1033
- and data_size_chars
1034
- > self.toolset.config.query_response_size_limit
1035
- ):
1036
- response_data["data_summary"] = (
1416
+ if token_count > token_limit:
1417
+ response_data.data = None
1418
+ response_data.data_summary = (
1037
1419
  create_data_summary_for_large_result(
1038
- result_data, query, data_size_chars, is_range_query=True
1420
+ result_data, query, token_count, is_range_query=True
1039
1421
  )
1040
1422
  )
1041
1423
  logging.info(
1042
1424
  f"Prometheus range query returned large dataset: "
1043
- f"{response_data['data_summary'].get('series_count', 0)} series, "
1044
- f"{data_size_chars:,} characters. Returning summary instead of full data."
1425
+ f"{response_data.data_summary.get('series_count', 0)} series, "
1426
+ f"{token_count:,} tokens (limit: {token_limit:,}). "
1427
+ f"Returning summary instead of full data."
1428
+ )
1429
+ # Also add character info to the summary for debugging
1430
+ response_data.data_summary["_debug_info"] = (
1431
+ f"Data size: {token_count:,} tokens exceeded limit of {token_limit:,} tokens"
1045
1432
  )
1046
1433
  else:
1047
- response_data["data"] = result_data
1048
-
1049
- data_str = json.dumps(response_data, indent=2)
1434
+ response_data.data = result_data
1050
1435
 
1051
- return StructuredToolResult(
1052
- status=StructuredToolResultStatus.SUCCESS,
1053
- data=data_str,
1054
- params=params,
1436
+ structured_tool_result = create_structured_tool_result(
1437
+ params=params, response=response_data
1055
1438
  )
1056
1439
 
1440
+ return structured_tool_result
1441
+
1057
1442
  error_msg = "Unknown error occurred"
1058
1443
  if response.status_code in [400, 429]:
1059
1444
  try:
@@ -1107,7 +1492,11 @@ class PrometheusToolset(Toolset):
1107
1492
  prerequisites=[CallablePrerequisite(callable=self.prerequisites_callable)],
1108
1493
  tools=[
1109
1494
  ListPrometheusRules(toolset=self),
1110
- ListAvailableMetrics(toolset=self),
1495
+ GetMetricNames(toolset=self),
1496
+ GetLabelValues(toolset=self),
1497
+ GetAllLabels(toolset=self),
1498
+ GetSeries(toolset=self),
1499
+ GetMetricMetadata(toolset=self),
1111
1500
  ExecuteInstantQuery(toolset=self),
1112
1501
  ExecuteRangeQuery(toolset=self),
1113
1502
  ],