holmesgpt 0.14.1a0__py3-none-any.whl → 0.14.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of holmesgpt might be problematic.

@@ -1,10 +1,9 @@
  import json
  import logging
  import os
- import re
  import time
  import dateutil.parser
- from typing import Any, Dict, List, Optional, Tuple, Type, Union
+ from typing import Any, Dict, Optional, Tuple, Type, Union
  from urllib.parse import urljoin

  import requests  # type: ignore
@@ -39,25 +38,61 @@ from holmes.plugins.toolsets.logging_utils.logging_api import (
  from holmes.utils.keygen_utils import generate_random_key

  PROMETHEUS_RULES_CACHE_KEY = "cached_prometheus_rules"
+ PROMETHEUS_METADATA_API_LIMIT = 100  # Default limit for Prometheus metadata APIs (series, labels, metadata) to prevent overwhelming responses
+ # Default timeout values for PromQL queries
+ DEFAULT_QUERY_TIMEOUT_SECONDS = 20
+ MAX_QUERY_TIMEOUT_SECONDS = 180
+ # Default character limit for query responses to prevent token limit issues
+ DEFAULT_QUERY_RESPONSE_SIZE_LIMIT = 30000
+ # Default timeout for metadata API calls (discovery endpoints)
+ DEFAULT_METADATA_TIMEOUT_SECONDS = 20
+ MAX_METADATA_TIMEOUT_SECONDS = 60
+ # Default time window for metadata APIs (in hours)
+ DEFAULT_METADATA_TIME_WINDOW_HRS = 1


  class PrometheusConfig(BaseModel):
      # URL is optional because it can be set with an env var
      prometheus_url: Optional[str]
      healthcheck: str = "-/healthy"
-     # Setting to None will remove the time window from the request for labels
-     metrics_labels_time_window_hrs: Union[int, None] = 48
-     # Setting to None will disable the cache
-     metrics_labels_cache_duration_hrs: Union[int, None] = 12
-     fetch_labels_with_labels_api: bool = False
-     fetch_metadata_with_series_api: bool = False
+
+     # New config for default time window for metadata APIs
+     default_metadata_time_window_hrs: int = DEFAULT_METADATA_TIME_WINDOW_HRS  # Default: only show metrics active in the last hour
+
+     # Query timeout configuration
+     default_query_timeout_seconds: int = (
+         DEFAULT_QUERY_TIMEOUT_SECONDS  # Default timeout for PromQL queries
+     )
+     max_query_timeout_seconds: int = (
+         MAX_QUERY_TIMEOUT_SECONDS  # Maximum allowed timeout for PromQL queries
+     )
+
+     # Metadata API timeout configuration
+     default_metadata_timeout_seconds: int = (
+         DEFAULT_METADATA_TIMEOUT_SECONDS  # Default timeout for metadata/discovery APIs
+     )
+     max_metadata_timeout_seconds: int = (
+         MAX_METADATA_TIMEOUT_SECONDS  # Maximum allowed timeout for metadata APIs
+     )
+
+     # DEPRECATED: These config values are deprecated and will be removed in a future version
+     # Using None as default so we can detect if user explicitly set them
+     metrics_labels_time_window_hrs: Optional[int] = (
+         None  # DEPRECATED - use default_metadata_time_window_hrs instead
+     )
+     metrics_labels_cache_duration_hrs: Optional[int] = (
+         None  # DEPRECATED - no longer used
+     )
+     fetch_labels_with_labels_api: Optional[bool] = None  # DEPRECATED - no longer used
+     fetch_metadata_with_series_api: Optional[bool] = None  # DEPRECATED - no longer used
+
      tool_calls_return_data: bool = True
      headers: Dict = Field(default_factory=dict)
-     rules_cache_duration_seconds: Union[int, None] = 1800  # 30 minutes
+     rules_cache_duration_seconds: Optional[int] = 1800  # 30 minutes
      additional_labels: Optional[Dict[str, str]] = None
      prometheus_ssl_enabled: bool = True
      query_response_size_limit: Optional[int] = (
-         80000  # Limit the max number of characters in a query result to proactively prevent truncation and advise LLM to query less data
+         DEFAULT_QUERY_RESPONSE_SIZE_LIMIT  # Limit the max number of characters in a query result to proactively prevent token limit issues (roughly 5-6k tokens)
      )

      @field_validator("prometheus_url")
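
Note: taken together, the new fields give operators direct control over query and discovery timeouts. A minimal sketch of how they might be set when constructing the config (the URL and values below are illustrative, not from this release):

    config = PrometheusConfig(
        prometheus_url="http://prometheus.example:9090",  # placeholder URL
        default_query_timeout_seconds=30,    # per-query default; requests are clamped to max_query_timeout_seconds
        max_query_timeout_seconds=120,
        default_metadata_time_window_hrs=2,  # widen discovery from the 1-hour default
    )
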
@@ -68,6 +103,26 @@ class PrometheusConfig(BaseModel):

      @model_validator(mode="after")
      def validate_prom_config(self):
+         # Check for deprecated config values and print warnings
+         deprecated_configs = []
+         if self.metrics_labels_time_window_hrs is not None:  # Check if explicitly set
+             deprecated_configs.append(
+                 "metrics_labels_time_window_hrs (use default_metadata_time_window_hrs instead)"
+             )
+         if (
+             self.metrics_labels_cache_duration_hrs is not None
+         ):  # Check if explicitly set
+             deprecated_configs.append("metrics_labels_cache_duration_hrs")
+         if self.fetch_labels_with_labels_api is not None:  # Check if explicitly set
+             deprecated_configs.append("fetch_labels_with_labels_api")
+         if self.fetch_metadata_with_series_api is not None:  # Check if explicitly set
+             deprecated_configs.append("fetch_metadata_with_series_api")
+
+         if deprecated_configs:
+             logging.warning(
+                 f"WARNING: The following Prometheus config values are deprecated and will be removed in a future version: "
+                 f"{', '.join(deprecated_configs)}. These configs no longer affect behavior."
+             )
          # If openshift is enabled, and the user didn't configure auth headers, we will try to load the token from the service account.
          if IS_OPENSHIFT:
              if self.healthcheck == "-/healthy":
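
Note: the deprecation check works because every deprecated field now defaults to None, so any non-None value must have been supplied explicitly by the user. A standalone sketch of the same sentinel pattern (names are illustrative):

    import logging
    from typing import Optional
    from pydantic import BaseModel, model_validator

    class ExampleConfig(BaseModel):
        old_setting: Optional[int] = None  # None means "never set by the user"

        @model_validator(mode="after")
        def warn_on_deprecated(self):
            if self.old_setting is not None:  # explicitly set -> warn
                logging.warning("old_setting is deprecated and no longer affects behavior")
            return self
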
@@ -164,6 +219,8 @@ def do_request(

      if isinstance(config, AMPConfig):
          client = config.get_aws_client()  # cached AWSPrometheusConnect
+         # Note: timeout parameter is not supported by prometrix's signed_request
+         # AWS/AMP requests will not respect the timeout setting
          return client.signed_request(  # type: ignore
              method=method,
              url=url,
@@ -185,99 +242,6 @@ def do_request(
      )


- def filter_metrics_by_type(metrics: Dict, expected_type: str):
-     return {
-         metric_name: metric_data
-         for metric_name, metric_data in metrics.items()
-         if expected_type in metric_data.get("type", "")
-         or metric_data.get("type", "") == "?"
-     }
-
-
- def filter_metrics_by_name(metrics: Dict, pattern: str) -> Dict:
-     regex = re.compile(pattern)
-     return {
-         metric_name: metric_data
-         for metric_name, metric_data in metrics.items()
-         if regex.search(metric_name)
-     }
-
-
- METRICS_SUFFIXES_TO_STRIP = ["_bucket", "_count", "_sum"]
-
-
- def fetch_metadata(
-     prometheus_url: str,
-     headers: Optional[Dict],
-     config,
-     verify_ssl: bool = True,
- ) -> Dict:
-     metadata_url = urljoin(prometheus_url, "api/v1/metadata")
-     metadata_response = do_request(
-         config=config,
-         url=metadata_url,
-         headers=headers,
-         timeout=60,
-         verify=verify_ssl,
-         method="GET",
-     )
-     metadata_response.raise_for_status()
-
-     metadata = metadata_response.json()["data"]
-
-     metrics = {}
-     for metric_name, meta_list in metadata.items():
-         if meta_list:
-             metric_type = meta_list[0].get("type", "unknown")
-             metric_description = meta_list[0].get("help", "unknown")
-             metrics[metric_name] = {
-                 "type": metric_type,
-                 "description": metric_description,
-                 "labels": set(),
-             }
-
-     return metrics
-
-
- def fetch_metadata_with_series_api(
-     prometheus_url: str,
-     metric_name: str,
-     headers: Dict,
-     config,
-     verify_ssl: bool = True,
- ) -> Dict:
-     url = urljoin(prometheus_url, "api/v1/series")
-     params: Dict = {"match[]": f'{{__name__=~".*{metric_name}.*"}}', "limit": "10000"}
-
-     response = do_request(
-         config=config,
-         url=url,
-         headers=headers,
-         params=params,
-         timeout=60,
-         verify=verify_ssl,
-         method="GET",
-     )
-     response.raise_for_status()
-     metrics = response.json()["data"]
-
-     metadata: Dict = {}
-     for metric_data in metrics:
-         metric_name = metric_data.get("__name__")
-         if not metric_name:
-             continue
-
-         metric = metadata.get(metric_name)
-         if not metric:
-             metric = {"description": "?", "type": "?", "labels": set()}
-             metadata[metric_name] = metric
-
-         labels = {k for k in metric_data.keys() if k != "__name__"}
-         metric["labels"].update(labels)
-
-     return metadata
-
-
  def result_has_data(result: Dict) -> bool:
      data = result.get("data", {})
      if len(data.get("result", [])) > 0:
@@ -289,19 +253,36 @@ def adjust_step_for_max_points(
      start_timestamp: str,
      end_timestamp: str,
      step: Optional[float] = None,
+     max_points_override: Optional[float] = None,
  ) -> float:
      """
      Adjusts the step parameter to ensure the number of data points doesn't exceed max_points.
-     Max points is controlled by the PROMETHEUS_MAX_GRAPH_POINTS environment variable (default: 300).

      Args:
          start_timestamp: RFC3339 formatted start time
          end_timestamp: RFC3339 formatted end time
          step: The requested step duration in seconds (None for auto-calculation)
+         max_points_override: Optional override for max points (must be <= MAX_GRAPH_POINTS)

      Returns:
          Adjusted step value in seconds that ensures points <= max_points
      """
+     # Use override if provided and valid, otherwise use default
+     max_points = MAX_GRAPH_POINTS
+     if max_points_override is not None:
+         if max_points_override > MAX_GRAPH_POINTS:
+             logging.warning(
+                 f"max_points override ({max_points_override}) exceeds system limit ({MAX_GRAPH_POINTS}), using {MAX_GRAPH_POINTS}"
+             )
+             max_points = MAX_GRAPH_POINTS
+         elif max_points_override < 1:
+             logging.warning(
+                 f"max_points override ({max_points_override}) is invalid, using default {MAX_GRAPH_POINTS}"
+             )
+             max_points = MAX_GRAPH_POINTS
+         else:
+             max_points = max_points_override
+             logging.debug(f"Using max_points override: {max_points}")

      start_dt = dateutil.parser.parse(start_timestamp)
      end_dt = dateutil.parser.parse(end_timestamp)
@@ -319,10 +300,10 @@ def adjust_step_for_max_points(
      current_points = time_range_seconds / step

      # If current points exceed max, adjust the step
-     if current_points > MAX_GRAPH_POINTS:
-         adjusted_step = time_range_seconds / MAX_GRAPH_POINTS
+     if current_points > max_points:
+         adjusted_step = time_range_seconds / max_points
          logging.info(
-             f"Adjusting step from {step}s to {adjusted_step}s to limit points from {current_points:.0f} to {MAX_GRAPH_POINTS}"
+             f"Adjusting step from {step}s to {adjusted_step}s to limit points from {current_points:.0f} to {max_points}"
          )
          return adjusted_step

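Note: as a worked example of the adjustment, assuming MAX_GRAPH_POINTS is 300: a one-hour range queried at a 1-second step would produce 3600 points, so the step is stretched to 3600 / 300 = 12 seconds; passing max_points_override=100 would stretch it to 36 seconds instead. The core arithmetic, in isolation:

    time_range_seconds = 3600  # one-hour range
    step = 1.0                 # requested step
    max_points = 300           # assumed MAX_GRAPH_POINTS value
    if time_range_seconds / step > max_points:
        step = time_range_seconds / max_points  # -> 12.0 seconds
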
@@ -355,32 +336,36 @@ def create_data_summary_for_large_result(
          series_list = result_data.get("result", [])
          num_items = len(series_list)

-         # Calculate statistics for range queries
+         # Calculate exact total data points across all series
          total_points = 0
-         for series in series_list[:10]:  # Sample first 10 series
+         for series in series_list:  # Iterate through ALL series for exact count
              points = len(series.get("values", []))
              total_points += points

-         avg_points_per_series = (
-             total_points / min(10, num_items) if num_items > 0 else 0
+         # Analyze label keys and their cardinality
+         label_cardinality: Dict[str, set] = {}
+         for series in series_list:
+             metric = series.get("metric", {})
+             for label_key, label_value in metric.items():
+                 if label_key not in label_cardinality:
+                     label_cardinality[label_key] = set()
+                 label_cardinality[label_key].add(label_value)
+
+         # Convert sets to counts for the summary
+         label_summary = {
+             label: len(values) for label, values in label_cardinality.items()
+         }
+         # Sort by cardinality (highest first) for better insights
+         label_summary = dict(
+             sorted(label_summary.items(), key=lambda x: x[1], reverse=True)
          )
-         estimated_total_points = avg_points_per_series * num_items
-
-         # Create a sample of just the metadata (labels) without values
-         sample_metrics = []
-         for series in series_list[:10]:  # Sample first 10 series
-             sample_metrics.append(series.get("metric", {}))
-
-         sample_json = json.dumps(sample_metrics, indent=2)
-         if len(sample_json) > 2000:
-             sample_json = sample_json[:2000] + "\n... (truncated)"

          return {
-             "message": f"Data too large to return ({data_size_chars:,} characters). Query returned {num_items} time series with approximately {estimated_total_points:,.0f} total data points.",
+             "message": f"Data too large to return ({data_size_chars:,} characters). Query returned {num_items} time series with {total_points:,} total data points.",
              "series_count": num_items,
-             "estimated_total_points": int(estimated_total_points),
+             "total_data_points": total_points,
              "data_size_characters": data_size_chars,
-             "sample_data": sample_json,
+             "label_cardinality": label_summary,
              "suggestion": f'Consider using topk({min(5, num_items)}, {query}) to limit results to the top {min(5, num_items)} series. To also capture remaining data as \'other\': topk({min(5, num_items)}, {query}) or label_replace((sum({query}) - sum(topk({min(5, num_items)}, {query}))), "pod", "other", "", "")',
          }
      else:
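
Note: the label_cardinality summary replaces the old truncated sample with an exact per-label count of distinct values. On a toy result it behaves like this (data is illustrative):

    series_list = [
        {"metric": {"pod": "web-1", "namespace": "default"}},
        {"metric": {"pod": "web-2", "namespace": "default"}},
    ]
    label_cardinality: dict = {}
    for series in series_list:
        for key, value in series["metric"].items():
            label_cardinality.setdefault(key, set()).add(value)
    summary = {k: len(v) for k, v in label_cardinality.items()}
    # {"pod": 2, "namespace": 1} - "pod" sorts first, having the highest cardinality
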
@@ -389,188 +374,40 @@ def create_data_summary_for_large_result(
          result_list = result_data.get("result", [])
          num_items = len(result_list)

-         # Create a sample of just the metadata (labels) without values
-         sample_metrics = []
-         for item in result_list[:10]:  # Sample first 10 results
+         # Analyze label keys and their cardinality
+         instant_label_cardinality: Dict[str, set] = {}
+         for item in result_list:
              if isinstance(item, dict):
-                 sample_metrics.append(item.get("metric", {}))
-
-         sample_json = json.dumps(sample_metrics, indent=2)
-         if len(sample_json) > 2000:
-             sample_json = sample_json[:2000] + "\n... (truncated)"
+                 metric = item.get("metric", {})
+                 for label_key, label_value in metric.items():
+                     if label_key not in instant_label_cardinality:
+                         instant_label_cardinality[label_key] = set()
+                     instant_label_cardinality[label_key].add(label_value)
+
+         # Convert sets to counts for the summary
+         label_summary = {
+             label: len(values) for label, values in instant_label_cardinality.items()
+         }
+         # Sort by cardinality (highest first) for better insights
+         label_summary = dict(
+             sorted(label_summary.items(), key=lambda x: x[1], reverse=True)
+         )

          return {
              "message": f"Data too large to return ({data_size_chars:,} characters). Query returned {num_items} results.",
              "result_count": num_items,
              "result_type": result_type,
              "data_size_characters": data_size_chars,
-             "sample_data": sample_json,
+             "label_cardinality": label_summary,
              "suggestion": f'Consider using topk({min(5, num_items)}, {query}) to limit results. To also capture remaining data as \'other\': topk({min(5, num_items)}, {query}) or label_replace((sum({query}) - sum(topk({min(5, num_items)}, {query}))), "instance", "other", "", "")',
          }


- def fetch_metrics_labels_with_series_api(
-     prometheus_url: str,
-     headers: Dict[str, str],
-     cache: Optional[TTLCache],
-     metrics_labels_time_window_hrs: Union[int, None],
-     metric_name: str,
-     config=None,
-     verify_ssl: bool = True,
- ) -> dict:
-     """This is a slow query. Takes 5+ seconds to run"""
-     cache_key = f"metrics_labels_series_api:{metric_name}"
-     if cache:
-         cached_result = cache.get(cache_key)
-         if cached_result:
-             return cached_result
-
-     series_url = urljoin(prometheus_url, "api/v1/series")
-     params: dict = {"match[]": f'{{__name__=~".*{metric_name}.*"}}', "limit": "10000"}
-
-     if metrics_labels_time_window_hrs is not None:
-         params["end"] = int(time.time())
-         params["start"] = params["end"] - (metrics_labels_time_window_hrs * 60 * 60)
-
-     series_response = do_request(
-         config=config,
-         url=series_url,
-         headers=headers,
-         params=params,
-         timeout=60,
-         verify=verify_ssl,
-         method="GET",
-     )
-     series_response.raise_for_status()
-     series = series_response.json()["data"]
-
-     metrics_labels: dict = {}
-     for serie in series:
-         metric_name = serie["__name__"]
-         # Add all labels except __name__
-         labels = {k for k in serie.keys() if k != "__name__"}
-         if metric_name in metrics_labels:
-             metrics_labels[metric_name].update(labels)
-         else:
-             metrics_labels[metric_name] = labels
-     if cache:
-         cache.set(cache_key, metrics_labels)
-
-     return metrics_labels
-
-
- def fetch_metrics_labels_with_labels_api(
-     prometheus_url: str,
-     cache: Optional[TTLCache],
-     metrics_labels_time_window_hrs: Union[int, None],
-     metric_names: List[str],
-     headers: Dict,
-     config=None,
-     verify_ssl: bool = True,
- ) -> dict:
-     metrics_labels = {}
-
-     for metric_name in metric_names:
-         cache_key = f"metrics_labels_labels_api:{metric_name}"
-         if cache:
-             cached_result = cache.get(cache_key)
-             if cached_result:
-                 metrics_labels[metric_name] = cached_result
-
-         url = urljoin(prometheus_url, "api/v1/labels")
-         params: dict = {
-             "match[]": f'{{__name__="{metric_name}"}}',
-         }
-         if metrics_labels_time_window_hrs is not None:
-             params["end"] = int(time.time())
-             params["start"] = params["end"] - (metrics_labels_time_window_hrs * 60 * 60)
-
-         response = do_request(
-             config=config,
-             url=url,
-             headers=headers,
-             params=params,
-             timeout=60,
-             verify=verify_ssl,
-             method="GET",
-         )
-         response.raise_for_status()
-         labels = response.json()["data"]
-         filtered_labels = {label for label in labels if label != "__name__"}
-         metrics_labels[metric_name] = filtered_labels
-
-         if cache:
-             cache.set(cache_key, filtered_labels)
-
-     return metrics_labels
-
-
- def fetch_metrics(
-     prometheus_url: str,
-     cache: Optional[TTLCache],
-     metrics_labels_time_window_hrs: Union[int, None],
-     metric_name: str,
-     should_fetch_labels_with_labels_api: bool,
-     should_fetch_metadata_with_series_api: bool,
-     headers: Dict,
-     config=None,
-     verify_ssl: bool = True,
- ) -> dict:
-     metrics = None
-     should_fetch_labels = True
-     if should_fetch_metadata_with_series_api:
-         metrics = fetch_metadata_with_series_api(
-             prometheus_url=prometheus_url,
-             metric_name=metric_name,
-             headers=headers,
-             config=config,
-             verify_ssl=verify_ssl,
-         )
-         should_fetch_labels = False  # series API returns the labels
-     else:
-         metrics = fetch_metadata(
-             prometheus_url=prometheus_url,
-             headers=headers,
-             config=config,
-             verify_ssl=verify_ssl,
-         )
-         metrics = filter_metrics_by_name(metrics, metric_name)
-
-     if should_fetch_labels:
-         metrics_labels = {}
-         if should_fetch_labels_with_labels_api:
-             metrics_labels = fetch_metrics_labels_with_labels_api(
-                 prometheus_url=prometheus_url,
-                 cache=cache,
-                 metrics_labels_time_window_hrs=metrics_labels_time_window_hrs,
-                 metric_names=list(metrics.keys()),
-                 headers=headers,
-                 config=config,
-                 verify_ssl=verify_ssl,
-             )
-         else:
-             metrics_labels = fetch_metrics_labels_with_series_api(
-                 prometheus_url=prometheus_url,
-                 cache=cache,
-                 metrics_labels_time_window_hrs=metrics_labels_time_window_hrs,
-                 metric_name=metric_name,
-                 headers=headers,
-                 config=config,
-                 verify_ssl=verify_ssl,
-             )
-
-     for metric_name in metrics:
-         if metric_name in metrics_labels:
-             metrics[metric_name]["labels"] = metrics_labels[metric_name]
-
-     return metrics
-
-
  class ListPrometheusRules(BasePrometheusTool):
      def __init__(self, toolset: "PrometheusToolset"):
          super().__init__(
              name="list_prometheus_rules",
-             description="List all defined prometheus rules. Will show the prometheus rules description, expression and annotations",
+             description="List all defined Prometheus rules (api/v1/rules). Will show the Prometheus rules description, expression and annotations",
              parameters={},
              toolset=toolset,
          )
@@ -613,7 +450,7 @@ class ListPrometheusRules(BasePrometheusTool):
                  config=self.toolset.config,
                  url=rules_url,
                  params=params,
-                 timeout=180,
+                 timeout=40,
                  verify=self.toolset.config.prometheus_ssl_enabled,
                  headers=self.toolset.config.headers,
                  method="GET",
@@ -654,26 +491,47 @@ class ListPrometheusRules(BasePrometheusTool):
          return f"{toolset_name_for_one_liner(self.toolset.name)}: Fetch Rules"


- class ListAvailableMetrics(BasePrometheusTool):
+ class GetMetricNames(BasePrometheusTool):
+     """Thin wrapper around /api/v1/label/__name__/values - the fastest way to discover metric names"""
+
      def __init__(self, toolset: "PrometheusToolset"):
          super().__init__(
-             name="list_available_metrics",
-             description="List all the available metrics to query from prometheus, including their types (counter, gauge, histogram, summary) and available labels.",
+             name="get_metric_names",
+             description=(
+                 "Get list of metric names using /api/v1/label/__name__/values. "
+                 "FASTEST method for metric discovery when you need to explore available metrics. "
+                 f"Returns up to {PROMETHEUS_METADATA_API_LIMIT} unique metric names (limit={PROMETHEUS_METADATA_API_LIMIT}). If {PROMETHEUS_METADATA_API_LIMIT} results returned, more may exist - use a more specific filter. "
+                 f"ALWAYS use match[] parameter to filter metrics - without it you'll get random {PROMETHEUS_METADATA_API_LIMIT} metrics which is rarely useful. "
+                 "Note: Does not return metric metadata (type, description, labels). "
+                 "By default returns metrics active in the last 1 hour (configurable via default_metadata_time_window_hrs)."
+             ),
              parameters={
-                 "type_filter": ToolParameter(
-                     description="Optional filter to only return a specific metric type. Can be one of counter, gauge, histogram, summary",
+                 "match": ToolParameter(
+                     description=(
+                         "REQUIRED: PromQL selector to filter metrics. Use regex OR (|) to check multiple patterns in one call - much faster than multiple calls! Examples: "
+                         "'{__name__=~\"node_cpu.*|node_memory.*|node_disk.*\"}' for all node resource metrics, "
+                         "'{__name__=~\"container_cpu.*|container_memory.*|container_network.*\"}' for all container metrics, "
+                         "'{__name__=~\"kube_pod.*|kube_deployment.*|kube_service.*\"}' for multiple Kubernetes object metrics, "
+                         "'{__name__=~\".*cpu.*|.*memory.*|.*disk.*\"}' for all resource metrics, "
+                         "'{namespace=~\"kube-system|default|monitoring\"}' for metrics from multiple namespaces, "
+                         "'{job=~\"prometheus|node-exporter|kube-state-metrics\"}' for metrics from multiple jobs."
+                     ),
+                     type="string",
+                     required=True,
+                 ),
+                 "start": ToolParameter(
+                     description="Start timestamp (RFC3339 or Unix). Default: 1 hour ago",
                      type="string",
                      required=False,
                  ),
-                 "name_filter": ToolParameter(
-                     description="Only the metrics partially or fully matching this name will be returned",
+                 "end": ToolParameter(
+                     description="End timestamp (RFC3339 or Unix). Default: now",
                      type="string",
-                     required=True,
+                     required=False,
                  ),
              },
              toolset=toolset,
          )
-         self._cache = None

      def _invoke(
          self, params: dict, user_approved: bool = False
@@ -684,90 +542,512 @@ class ListAvailableMetrics(BasePrometheusTool):
                  error="Prometheus is not configured. Prometheus URL is missing",
                  params=params,
              )
-         if not self._cache and self.toolset.config.metrics_labels_cache_duration_hrs:
-             self._cache = TTLCache(
-                 self.toolset.config.metrics_labels_cache_duration_hrs * 3600  # type: ignore
-             )
          try:
-             prometheus_url = self.toolset.config.prometheus_url
-             metrics_labels_time_window_hrs = (
-                 self.toolset.config.metrics_labels_time_window_hrs
+             match_param = params.get("match")
+             if not match_param:
+                 return StructuredToolResult(
+                     status=StructuredToolResultStatus.ERROR,
+                     error="Match parameter is required to filter metrics",
+                     params=params,
+                 )
+
+             url = urljoin(
+                 self.toolset.config.prometheus_url, "api/v1/label/__name__/values"
+             )
+             query_params = {
+                 "limit": str(PROMETHEUS_METADATA_API_LIMIT),
+                 "match[]": match_param,
+             }
+
+             # Add time parameters - use provided values or defaults
+             if params.get("end"):
+                 query_params["end"] = params["end"]
+             else:
+                 query_params["end"] = str(int(time.time()))
+
+             if params.get("start"):
+                 query_params["start"] = params["start"]
+             elif self.toolset.config.default_metadata_time_window_hrs:
+                 # Use default time window
+                 query_params["start"] = str(
+                     int(time.time())
+                     - (self.toolset.config.default_metadata_time_window_hrs * 3600)
+                 )
+
+             response = do_request(
+                 config=self.toolset.config,
+                 url=url,
+                 params=query_params,
+                 timeout=self.toolset.config.default_metadata_timeout_seconds,
+                 verify=self.toolset.config.prometheus_ssl_enabled,
+                 headers=self.toolset.config.headers,
+                 method="GET",
+             )
+             response.raise_for_status()
+             data = response.json()
+
+             # Check if results were truncated
+             if (
+                 "data" in data
+                 and isinstance(data["data"], list)
+                 and len(data["data"]) == PROMETHEUS_METADATA_API_LIMIT
+             ):
+                 data["_truncated"] = True
+                 data["_message"] = (
+                     f"Results truncated at limit={PROMETHEUS_METADATA_API_LIMIT}. Use a more specific match filter to see additional metrics."
+                 )
+
+             return StructuredToolResult(
+                 status=StructuredToolResultStatus.SUCCESS,
+                 data=data,
+                 params=params,
+             )
+         except Exception as e:
+             return StructuredToolResult(
+                 status=StructuredToolResultStatus.ERROR,
+                 error=str(e),
+                 params=params,
              )

-             name_filter = params.get("name_filter")
-             if not name_filter:
+     def get_parameterized_one_liner(self, params) -> str:
+         return f"{toolset_name_for_one_liner(self.toolset.name)}: Get Metric Names"
+
+
+ class GetLabelValues(BasePrometheusTool):
+     """Get values for a specific label across all metrics"""
+
+     def __init__(self, toolset: "PrometheusToolset"):
+         super().__init__(
+             name="get_label_values",
+             description=(
+                 "Get all values for a specific label using /api/v1/label/{label}/values. "
+                 "Use this to discover pods, namespaces, jobs, instances, etc. "
+                 f"Returns up to {PROMETHEUS_METADATA_API_LIMIT} unique values (limit={PROMETHEUS_METADATA_API_LIMIT}). If {PROMETHEUS_METADATA_API_LIMIT} results returned, more may exist - use match[] to filter. "
+                 "Supports optional match[] parameter to filter. "
+                 "By default returns values from metrics active in the last 1 hour (configurable via default_metadata_time_window_hrs)."
+             ),
+             parameters={
+                 "label": ToolParameter(
+                     description="Label name to get values for (e.g., 'pod', 'namespace', 'job', 'instance')",
+                     type="string",
+                     required=True,
+                 ),
+                 "match": ToolParameter(
+                     description=(
+                         "Optional PromQL selector to filter (e.g., '{__name__=~\"kube.*\"}', "
+                         "'{namespace=\"default\"}')."
+                     ),
+                     type="string",
+                     required=False,
+                 ),
+                 "start": ToolParameter(
+                     description="Start timestamp (RFC3339 or Unix). Default: 1 hour ago",
+                     type="string",
+                     required=False,
+                 ),
+                 "end": ToolParameter(
+                     description="End timestamp (RFC3339 or Unix). Default: now",
+                     type="string",
+                     required=False,
+                 ),
+             },
+             toolset=toolset,
+         )
+
+     def _invoke(
+         self, params: dict, user_approved: bool = False
+     ) -> StructuredToolResult:
+         if not self.toolset.config or not self.toolset.config.prometheus_url:
+             return StructuredToolResult(
+                 status=StructuredToolResultStatus.ERROR,
+                 error="Prometheus is not configured. Prometheus URL is missing",
+                 params=params,
+             )
+         try:
+             label = params.get("label")
+             if not label:
                  return StructuredToolResult(
                      status=StructuredToolResultStatus.ERROR,
-                     error="Error: cannot run tool 'list_available_metrics'. The param 'name_filter' is required but is missing.",
+                     error="Label parameter is required",
                      params=params,
                  )

-             metrics = fetch_metrics(
-                 prometheus_url=prometheus_url,
-                 cache=self._cache,
-                 metrics_labels_time_window_hrs=metrics_labels_time_window_hrs,
-                 metric_name=name_filter,
-                 should_fetch_labels_with_labels_api=self.toolset.config.fetch_labels_with_labels_api,
-                 should_fetch_metadata_with_series_api=self.toolset.config.fetch_metadata_with_series_api,
-                 headers=self.toolset.config.headers,
+             url = urljoin(
+                 self.toolset.config.prometheus_url, f"api/v1/label/{label}/values"
+             )
+             query_params = {"limit": str(PROMETHEUS_METADATA_API_LIMIT)}
+             if params.get("match"):
+                 query_params["match[]"] = params["match"]
+
+             # Add time parameters - use provided values or defaults
+             if params.get("end"):
+                 query_params["end"] = params["end"]
+             else:
+                 query_params["end"] = str(int(time.time()))
+
+             if params.get("start"):
+                 query_params["start"] = params["start"]
+             elif self.toolset.config.default_metadata_time_window_hrs:
+                 # Use default time window
+                 query_params["start"] = str(
+                     int(time.time())
+                     - (self.toolset.config.default_metadata_time_window_hrs * 3600)
+                 )
+
+             response = do_request(
                  config=self.toolset.config,
-                 verify_ssl=self.toolset.config.prometheus_ssl_enabled,
+                 url=url,
+                 params=query_params,
+                 timeout=self.toolset.config.default_metadata_timeout_seconds,
+                 verify=self.toolset.config.prometheus_ssl_enabled,
+                 headers=self.toolset.config.headers,
+                 method="GET",
              )
+             response.raise_for_status()
+             data = response.json()
+
+             # Check if results were truncated
+             if (
+                 "data" in data
+                 and isinstance(data["data"], list)
+                 and len(data["data"]) == PROMETHEUS_METADATA_API_LIMIT
+             ):
+                 data["_truncated"] = True
+                 data["_message"] = (
+                     f"Results truncated at limit={PROMETHEUS_METADATA_API_LIMIT}. Use match[] parameter to filter label '{label}' values."
+                 )

-             type_filter = params.get("type_filter")
-             if type_filter:
-                 metrics = filter_metrics_by_type(metrics, type_filter)
+             return StructuredToolResult(
+                 status=StructuredToolResultStatus.SUCCESS,
+                 data=data,
+                 params=params,
+             )
+         except Exception as e:
+             return StructuredToolResult(
+                 status=StructuredToolResultStatus.ERROR,
+                 error=str(e),
+                 params=params,
+             )
+
+     def get_parameterized_one_liner(self, params) -> str:
+         label = params.get("label", "")
+         return f"{toolset_name_for_one_liner(self.toolset.name)}: Get {label} Values"
+
+
+ class GetAllLabels(BasePrometheusTool):
+     """Get all label names that exist in Prometheus"""

-             output = ["Metric | Description | Type | Labels"]
-             output.append("-" * 100)
+     def __init__(self, toolset: "PrometheusToolset"):
+         super().__init__(
+             name="get_all_labels",
+             description=(
+                 "Get list of all label names using /api/v1/labels. "
+                 "Use this to discover what labels are available across all metrics. "
+                 f"Returns up to {PROMETHEUS_METADATA_API_LIMIT} label names (limit={PROMETHEUS_METADATA_API_LIMIT}). If {PROMETHEUS_METADATA_API_LIMIT} results returned, more may exist - use match[] to filter. "
+                 "Supports optional match[] parameter to filter. "
+                 "By default returns labels from metrics active in the last 1 hour (configurable via default_metadata_time_window_hrs)."
+             ),
+             parameters={
+                 "match": ToolParameter(
+                     description=(
+                         "Optional PromQL selector to filter (e.g., '{__name__=~\"kube.*\"}', "
+                         "'{job=\"prometheus\"}')."
+                     ),
+                     type="string",
+                     required=False,
+                 ),
+                 "start": ToolParameter(
+                     description="Start timestamp (RFC3339 or Unix). Default: 1 hour ago",
+                     type="string",
+                     required=False,
+                 ),
+                 "end": ToolParameter(
+                     description="End timestamp (RFC3339 or Unix). Default: now",
+                     type="string",
+                     required=False,
+                 ),
+             },
+             toolset=toolset,
+         )

-             for metric, info in sorted(metrics.items()):
-                 labels_str = (
-                     ", ".join(sorted(info["labels"])) if info["labels"] else "none"
+     def _invoke(
+         self, params: dict, user_approved: bool = False
+     ) -> StructuredToolResult:
+         if not self.toolset.config or not self.toolset.config.prometheus_url:
+             return StructuredToolResult(
+                 status=StructuredToolResultStatus.ERROR,
+                 error="Prometheus is not configured. Prometheus URL is missing",
+                 params=params,
+             )
+         try:
+             url = urljoin(self.toolset.config.prometheus_url, "api/v1/labels")
+             query_params = {"limit": str(PROMETHEUS_METADATA_API_LIMIT)}
+             if params.get("match"):
+                 query_params["match[]"] = params["match"]
+
+             # Add time parameters - use provided values or defaults
+             if params.get("end"):
+                 query_params["end"] = params["end"]
+             else:
+                 query_params["end"] = str(int(time.time()))
+
+             if params.get("start"):
+                 query_params["start"] = params["start"]
+             elif self.toolset.config.default_metadata_time_window_hrs:
+                 # Use default time window
+                 query_params["start"] = str(
+                     int(time.time())
+                     - (self.toolset.config.default_metadata_time_window_hrs * 3600)
                  )
-                 output.append(
-                     f"{metric} | {info['description']} | {info['type']} | {labels_str}"
+
+             response = do_request(
+                 config=self.toolset.config,
+                 url=url,
+                 params=query_params,
+                 timeout=self.toolset.config.default_metadata_timeout_seconds,
+                 verify=self.toolset.config.prometheus_ssl_enabled,
+                 headers=self.toolset.config.headers,
+                 method="GET",
+             )
+             response.raise_for_status()
+             data = response.json()
+
+             # Check if results were truncated
+             if (
+                 "data" in data
+                 and isinstance(data["data"], list)
+                 and len(data["data"]) == PROMETHEUS_METADATA_API_LIMIT
+             ):
+                 data["_truncated"] = True
+                 data["_message"] = (
+                     f"Results truncated at limit={PROMETHEUS_METADATA_API_LIMIT}. Use match[] parameter to filter labels."
                  )

-             table_output = "\n".join(output)
              return StructuredToolResult(
                  status=StructuredToolResultStatus.SUCCESS,
-                 data=table_output,
+                 data=data,
+                 params=params,
+             )
+         except Exception as e:
+             return StructuredToolResult(
+                 status=StructuredToolResultStatus.ERROR,
+                 error=str(e),
                  params=params,
              )

-         except requests.Timeout:
-             logging.warn("Timeout while fetching prometheus metrics", exc_info=True)
+     def get_parameterized_one_liner(self, params) -> str:
+         return f"{toolset_name_for_one_liner(self.toolset.name)}: Get All Labels"
+
+
+ class GetSeries(BasePrometheusTool):
+     """Get time series matching a selector"""
+
+     def __init__(self, toolset: "PrometheusToolset"):
+         super().__init__(
+             name="get_series",
+             description=(
+                 "Get time series using /api/v1/series. "
+                 "Returns label sets for all time series matching the selector. "
+                 "SLOWER than other discovery methods - use only when you need full label sets. "
+                 f"Returns up to {PROMETHEUS_METADATA_API_LIMIT} series (limit={PROMETHEUS_METADATA_API_LIMIT}). If {PROMETHEUS_METADATA_API_LIMIT} results returned, more series exist - use more specific selector. "
+                 "Requires match[] parameter with PromQL selector. "
+                 "By default returns series active in the last 1 hour (configurable via default_metadata_time_window_hrs)."
+             ),
+             parameters={
+                 "match": ToolParameter(
+                     description=(
+                         "PromQL selector to match series (e.g., 'up', 'node_cpu_seconds_total', "
+                         "'{__name__=~\"node.*\"}', '{job=\"prometheus\"}', "
+                         '\'{__name__="up",job="prometheus"}\').'
+                     ),
+                     type="string",
+                     required=True,
+                 ),
+                 "start": ToolParameter(
+                     description="Start timestamp (RFC3339 or Unix). Default: 1 hour ago",
+                     type="string",
+                     required=False,
+                 ),
+                 "end": ToolParameter(
+                     description="End timestamp (RFC3339 or Unix). Default: now",
+                     type="string",
+                     required=False,
+                 ),
+             },
+             toolset=toolset,
+         )
+
+     def _invoke(
+         self, params: dict, user_approved: bool = False
+     ) -> StructuredToolResult:
+         if not self.toolset.config or not self.toolset.config.prometheus_url:
              return StructuredToolResult(
                  status=StructuredToolResultStatus.ERROR,
-                 error="Request timed out while fetching metrics",
+                 error="Prometheus is not configured. Prometheus URL is missing",
                  params=params,
              )
-         except RequestException as e:
-             logging.warn("Failed to fetch prometheus metrics", exc_info=True)
+         try:
+             match = params.get("match")
+             if not match:
+                 return StructuredToolResult(
+                     status=StructuredToolResultStatus.ERROR,
+                     error="Match parameter is required",
+                     params=params,
+                 )
+
+             url = urljoin(self.toolset.config.prometheus_url, "api/v1/series")
+             query_params = {
+                 "match[]": match,
+                 "limit": str(PROMETHEUS_METADATA_API_LIMIT),
+             }
+
+             # Add time parameters - use provided values or defaults
+             if params.get("end"):
+                 query_params["end"] = params["end"]
+             else:
+                 query_params["end"] = str(int(time.time()))
+
+             if params.get("start"):
+                 query_params["start"] = params["start"]
+             elif self.toolset.config.default_metadata_time_window_hrs:
+                 # Use default time window
+                 query_params["start"] = str(
+                     int(time.time())
+                     - (self.toolset.config.default_metadata_time_window_hrs * 3600)
+                 )
+
+             response = do_request(
+                 config=self.toolset.config,
+                 url=url,
+                 params=query_params,
+                 timeout=self.toolset.config.default_metadata_timeout_seconds,
+                 verify=self.toolset.config.prometheus_ssl_enabled,
+                 headers=self.toolset.config.headers,
+                 method="GET",
+             )
+             response.raise_for_status()
+             data = response.json()
+
+             # Check if results were truncated
+             if (
+                 "data" in data
+                 and isinstance(data["data"], list)
+                 and len(data["data"]) == PROMETHEUS_METADATA_API_LIMIT
+             ):
+                 data["_truncated"] = True
+                 data["_message"] = (
+                     f"Results truncated at limit={PROMETHEUS_METADATA_API_LIMIT}. Use a more specific match selector to see additional series."
+                 )
+
+             return StructuredToolResult(
+                 status=StructuredToolResultStatus.SUCCESS,
+                 data=data,
+                 params=params,
+             )
+         except Exception as e:
+             return StructuredToolResult(
+                 status=StructuredToolResultStatus.ERROR,
+                 error=str(e),
+                 params=params,
+             )
+
+     def get_parameterized_one_liner(self, params) -> str:
+         return f"{toolset_name_for_one_liner(self.toolset.name)}: Get Series"
+
+
+ class GetMetricMetadata(BasePrometheusTool):
+     """Get metadata (type, description, unit) for metrics"""
+
+     def __init__(self, toolset: "PrometheusToolset"):
+         super().__init__(
+             name="get_metric_metadata",
+             description=(
+                 "Get metric metadata using /api/v1/metadata. "
+                 "Returns type, help text, and unit for metrics. "
+                 "Use after discovering metric names to get their descriptions. "
+                 f"Returns up to {PROMETHEUS_METADATA_API_LIMIT} metrics (limit={PROMETHEUS_METADATA_API_LIMIT}). If {PROMETHEUS_METADATA_API_LIMIT} results returned, more may exist - filter by specific metric name. "
+                 "Supports optional metric name filter."
+             ),
+             parameters={
+                 "metric": ToolParameter(
+                     description=(
+                         "Optional metric name to filter (e.g., 'up', 'node_cpu_seconds_total'). "
+                         "If not provided, returns metadata for all metrics."
+                     ),
+                     type="string",
+                     required=False,
+                 ),
+             },
+             toolset=toolset,
+         )
+
+     def _invoke(
+         self, params: dict, user_approved: bool = False
+     ) -> StructuredToolResult:
+         if not self.toolset.config or not self.toolset.config.prometheus_url:
              return StructuredToolResult(
                  status=StructuredToolResultStatus.ERROR,
-                 error=f"Network error while fetching metrics: {str(e)}",
+                 error="Prometheus is not configured. Prometheus URL is missing",
+                 params=params,
+             )
+         try:
+             url = urljoin(self.toolset.config.prometheus_url, "api/v1/metadata")
+             query_params = {"limit": str(PROMETHEUS_METADATA_API_LIMIT)}
+
+             if params.get("metric"):
+                 query_params["metric"] = params["metric"]
+
+             response = do_request(
+                 config=self.toolset.config,
+                 url=url,
+                 params=query_params,
+                 timeout=self.toolset.config.default_metadata_timeout_seconds,
+                 verify=self.toolset.config.prometheus_ssl_enabled,
+                 headers=self.toolset.config.headers,
+                 method="GET",
+             )
+             response.raise_for_status()
+             data = response.json()
+
+             # Check if results were truncated (metadata endpoint returns a dict, not a list)
+             if (
+                 "data" in data
+                 and isinstance(data["data"], dict)
+                 and len(data["data"]) == PROMETHEUS_METADATA_API_LIMIT
+             ):
+                 data["_truncated"] = True
+                 data["_message"] = (
+                     f"Results truncated at limit={PROMETHEUS_METADATA_API_LIMIT}. Use metric parameter to filter by specific metric name."
+                 )
+
+             return StructuredToolResult(
+                 status=StructuredToolResultStatus.SUCCESS,
+                 data=data,
                  params=params,
              )
          except Exception as e:
-             logging.warn("Failed to process prometheus metrics", exc_info=True)
              return StructuredToolResult(
                  status=StructuredToolResultStatus.ERROR,
-                 error=f"Unexpected error: {str(e)}",
+                 error=str(e),
                  params=params,
              )

      def get_parameterized_one_liner(self, params) -> str:
-         name_filter = params.get("name_filter", "")
-         return f"{toolset_name_for_one_liner(self.toolset.name)}: Search Metrics ({name_filter})"
+         metric = params.get("metric", "all")
+         return (
+             f"{toolset_name_for_one_liner(self.toolset.name)}: Get Metadata ({metric})"
+         )


  class ExecuteInstantQuery(BasePrometheusTool):
      def __init__(self, toolset: "PrometheusToolset"):
          super().__init__(
              name="execute_prometheus_instant_query",
-             description="Execute an instant PromQL query",
+             description=(
+                 f"Execute an instant PromQL query (single point in time). "
+                 f"Default timeout is {DEFAULT_QUERY_TIMEOUT_SECONDS} seconds "
+                 f"but can be increased up to {MAX_QUERY_TIMEOUT_SECONDS} seconds for complex/slow queries."
+             ),
              parameters={
                  "query": ToolParameter(
                      description="The PromQL query",
@@ -779,6 +1059,15 @@ class ExecuteInstantQuery(BasePrometheusTool):
                      type="string",
                      required=True,
                  ),
+                 "timeout": ToolParameter(
+                     description=(
+                         f"Query timeout in seconds. Default: {DEFAULT_QUERY_TIMEOUT_SECONDS}. "
+                         f"Maximum: {MAX_QUERY_TIMEOUT_SECONDS}. "
+                         f"Increase for complex queries that may take longer."
+                     ),
+                     type="number",
+                     required=False,
+                 ),
              },
              toolset=toolset,
          )
@@ -800,12 +1089,24 @@ class ExecuteInstantQuery(BasePrometheusTool):

              payload = {"query": query}

+             # Get timeout parameter and enforce limits
+             default_timeout = self.toolset.config.default_query_timeout_seconds
+             max_timeout = self.toolset.config.max_query_timeout_seconds
+             timeout = params.get("timeout", default_timeout)
+             if timeout > max_timeout:
+                 timeout = max_timeout
+                 logging.warning(
+                     f"Timeout requested ({params.get('timeout')}) exceeds maximum ({max_timeout}s), using {max_timeout}s"
+                 )
+             elif timeout < 1:
+                 timeout = default_timeout  # Min 1 second, but use default if invalid
+
              response = do_request(
                  config=self.toolset.config,
                  url=url,
                  headers=self.toolset.config.headers,
                  data=payload,
-                 timeout=60,
+                 timeout=timeout,
                  verify=self.toolset.config.prometheus_ssl_enabled,
                  method="POST",
              )
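
Note: the clamp above is the same in both query tools; factored into a helper it reads as follows (the function name is illustrative, not part of the package):

    def clamp_timeout(requested, default_timeout=20, max_timeout=180):
        # Fall back to the default when unset or invalid; never exceed the maximum.
        timeout = requested if requested is not None else default_timeout
        if timeout > max_timeout:
            return max_timeout
        if timeout < 1:
            return default_timeout
        return timeout

    clamp_timeout(None)  # -> 20 (default)
    clamp_timeout(600)   # -> 180 (clamped to the maximum)
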
@@ -853,7 +1154,12 @@ class ExecuteInstantQuery(BasePrometheusTool):
                  logging.info(
                      f"Prometheus instant query returned large dataset: "
                      f"{response_data['data_summary'].get('result_count', 0)} results, "
-                     f"{data_size_chars:,} characters. Returning summary instead of full data."
+                     f"{data_size_chars:,} characters (limit: {self.toolset.config.query_response_size_limit:,}). "
+                     f"Returning summary instead of full data."
+                 )
+                 # Also add character info to the summary for debugging
+                 response_data["data_summary"]["_debug_info"] = (
+                     f"Data size: {data_size_chars:,} chars exceeded limit of {self.toolset.config.query_response_size_limit:,} chars"
                  )
              else:
                  response_data["data"] = result_data
@@ -912,7 +1218,12 @@ class ExecuteRangeQuery(BasePrometheusTool):
      def __init__(self, toolset: "PrometheusToolset"):
          super().__init__(
              name="execute_prometheus_range_query",
-             description="Generates a graph and Execute a PromQL range query",
+             description=(
+                 f"Generates a graph and Execute a PromQL range query. "
+                 f"Default timeout is {DEFAULT_QUERY_TIMEOUT_SECONDS} seconds "
+                 f"but can be increased up to {MAX_QUERY_TIMEOUT_SECONDS} seconds for complex/slow queries. "
+                 f"Default time range is last 1 hour."
+             ),
              parameters={
                  "query": ToolParameter(
                      description="The PromQL query",
@@ -946,6 +1257,25 @@ class ExecuteRangeQuery(BasePrometheusTool):
                      type="string",
                      required=True,
                  ),
+                 "timeout": ToolParameter(
+                     description=(
+                         f"Query timeout in seconds. Default: {DEFAULT_QUERY_TIMEOUT_SECONDS}. "
+                         f"Maximum: {MAX_QUERY_TIMEOUT_SECONDS}. "
+                         f"Increase for complex queries that may take longer."
+                     ),
+                     type="number",
+                     required=False,
+                 ),
+                 "max_points": ToolParameter(
+                     description=(
+                         f"Maximum number of data points to return. Default: {int(MAX_GRAPH_POINTS)}. "
+                         f"Can be reduced to get fewer data points (e.g., 50 for simpler graphs). "
+                         f"Cannot exceed system limit of {int(MAX_GRAPH_POINTS)}. "
+                         f"If your query would return more points than this limit, the step will be automatically adjusted."
+                     ),
+                     type="number",
+                     required=False,
+                 ),
              },
              toolset=toolset,
          )
@@ -970,12 +1300,16 @@ class ExecuteRangeQuery(BasePrometheusTool):
                  default_time_span_seconds=DEFAULT_GRAPH_TIME_SPAN_SECONDS,
              )
              step = parse_duration_to_seconds(params.get("step"))
+             max_points = params.get(
+                 "max_points"
+             )  # Get the optional max_points parameter

              # adjust_step_for_max_points handles None case and converts to float
              step = adjust_step_for_max_points(
                  start_timestamp=start,
                  end_timestamp=end,
                  step=step,
+                 max_points_override=max_points,
              )

              description = params.get("description", "")
@@ -987,12 +1321,24 @@ class ExecuteRangeQuery(BasePrometheusTool):
                  "step": step,
              }

+             # Get timeout parameter and enforce limits
+             default_timeout = self.toolset.config.default_query_timeout_seconds
+             max_timeout = self.toolset.config.max_query_timeout_seconds
+             timeout = params.get("timeout", default_timeout)
+             if timeout > max_timeout:
+                 timeout = max_timeout
+                 logging.warning(
+                     f"Timeout requested ({params.get('timeout')}) exceeds maximum ({max_timeout}s), using {max_timeout}s"
+                 )
+             elif timeout < 1:
+                 timeout = default_timeout  # Min 1 second, but use default if invalid
+
              response = do_request(
                  config=self.toolset.config,
                  url=url,
                  headers=self.toolset.config.headers,
                  data=payload,
-                 timeout=120,
+                 timeout=timeout,
                  verify=self.toolset.config.prometheus_ssl_enabled,
                  method="POST",
              )
@@ -1041,7 +1387,12 @@ class ExecuteRangeQuery(BasePrometheusTool):
                  logging.info(
                      f"Prometheus range query returned large dataset: "
                      f"{response_data['data_summary'].get('series_count', 0)} series, "
-                     f"{data_size_chars:,} characters. Returning summary instead of full data."
+                     f"{data_size_chars:,} characters (limit: {self.toolset.config.query_response_size_limit:,}). "
+                     f"Returning summary instead of full data."
+                 )
+                 # Also add character info to the summary for debugging
+                 response_data["data_summary"]["_debug_info"] = (
+                     f"Data size: {data_size_chars:,} chars exceeded limit of {self.toolset.config.query_response_size_limit:,} chars"
                  )
              else:
                  response_data["data"] = result_data
@@ -1107,7 +1458,11 @@ class PrometheusToolset(Toolset):
              prerequisites=[CallablePrerequisite(callable=self.prerequisites_callable)],
              tools=[
                  ListPrometheusRules(toolset=self),
-                 ListAvailableMetrics(toolset=self),
+                 GetMetricNames(toolset=self),
+                 GetLabelValues(toolset=self),
+                 GetAllLabels(toolset=self),
+                 GetSeries(toolset=self),
+                 GetMetricMetadata(toolset=self),
                  ExecuteInstantQuery(toolset=self),
                  ExecuteRangeQuery(toolset=self),
              ],
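
Note: the five discovery tools registered above are thin wrappers over Prometheus's standard metadata endpoints, so the equivalent request can be reproduced outside the toolset; a sketch using requests, assuming a placeholder URL, selector, and the default 1-hour window (the limit mirrors PROMETHEUS_METADATA_API_LIMIT):

    import time
    import requests

    now = int(time.time())
    resp = requests.get(
        "http://prometheus.example:9090/api/v1/label/__name__/values",  # placeholder URL
        params={
            "match[]": '{__name__=~"node_cpu.*|node_memory.*"}',
            "limit": "100",            # mirrors PROMETHEUS_METADATA_API_LIMIT
            "start": str(now - 3600),  # last hour, per the default window
            "end": str(now),
        },
        timeout=20,
    )
    resp.raise_for_status()
    print(resp.json()["data"])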