holmesgpt 0.13.2__py3-none-any.whl → 0.16.2a0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134) hide show
  1. holmes/__init__.py +1 -1
  2. holmes/clients/robusta_client.py +17 -4
  3. holmes/common/env_vars.py +40 -1
  4. holmes/config.py +114 -144
  5. holmes/core/conversations.py +53 -14
  6. holmes/core/feedback.py +191 -0
  7. holmes/core/investigation.py +18 -22
  8. holmes/core/llm.py +489 -88
  9. holmes/core/models.py +103 -1
  10. holmes/core/openai_formatting.py +13 -0
  11. holmes/core/prompt.py +1 -1
  12. holmes/core/safeguards.py +4 -4
  13. holmes/core/supabase_dal.py +293 -100
  14. holmes/core/tool_calling_llm.py +423 -323
  15. holmes/core/tools.py +311 -33
  16. holmes/core/tools_utils/token_counting.py +14 -0
  17. holmes/core/tools_utils/tool_context_window_limiter.py +57 -0
  18. holmes/core/tools_utils/tool_executor.py +13 -8
  19. holmes/core/toolset_manager.py +155 -4
  20. holmes/core/tracing.py +6 -1
  21. holmes/core/transformers/__init__.py +23 -0
  22. holmes/core/transformers/base.py +62 -0
  23. holmes/core/transformers/llm_summarize.py +174 -0
  24. holmes/core/transformers/registry.py +122 -0
  25. holmes/core/transformers/transformer.py +31 -0
  26. holmes/core/truncation/compaction.py +59 -0
  27. holmes/core/truncation/dal_truncation_utils.py +23 -0
  28. holmes/core/truncation/input_context_window_limiter.py +218 -0
  29. holmes/interactive.py +177 -24
  30. holmes/main.py +7 -4
  31. holmes/plugins/prompts/_fetch_logs.jinja2 +26 -1
  32. holmes/plugins/prompts/_general_instructions.jinja2 +1 -2
  33. holmes/plugins/prompts/_runbook_instructions.jinja2 +23 -12
  34. holmes/plugins/prompts/conversation_history_compaction.jinja2 +88 -0
  35. holmes/plugins/prompts/generic_ask.jinja2 +2 -4
  36. holmes/plugins/prompts/generic_ask_conversation.jinja2 +2 -1
  37. holmes/plugins/prompts/generic_ask_for_issue_conversation.jinja2 +2 -1
  38. holmes/plugins/prompts/generic_investigation.jinja2 +2 -1
  39. holmes/plugins/prompts/investigation_procedure.jinja2 +48 -0
  40. holmes/plugins/prompts/kubernetes_workload_ask.jinja2 +2 -1
  41. holmes/plugins/prompts/kubernetes_workload_chat.jinja2 +2 -1
  42. holmes/plugins/runbooks/__init__.py +117 -18
  43. holmes/plugins/runbooks/catalog.json +2 -0
  44. holmes/plugins/toolsets/__init__.py +21 -8
  45. holmes/plugins/toolsets/aks-node-health.yaml +46 -0
  46. holmes/plugins/toolsets/aks.yaml +64 -0
  47. holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +26 -36
  48. holmes/plugins/toolsets/azure_sql/azure_sql_toolset.py +0 -1
  49. holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +10 -7
  50. holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +9 -6
  51. holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +8 -6
  52. holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +8 -6
  53. holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +9 -6
  54. holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +9 -7
  55. holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +9 -6
  56. holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +9 -6
  57. holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +9 -6
  58. holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +9 -6
  59. holmes/plugins/toolsets/bash/bash_toolset.py +10 -13
  60. holmes/plugins/toolsets/bash/common/bash.py +7 -7
  61. holmes/plugins/toolsets/cilium.yaml +284 -0
  62. holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +5 -3
  63. holmes/plugins/toolsets/datadog/datadog_api.py +490 -24
  64. holmes/plugins/toolsets/datadog/datadog_logs_instructions.jinja2 +21 -10
  65. holmes/plugins/toolsets/datadog/toolset_datadog_general.py +349 -216
  66. holmes/plugins/toolsets/datadog/toolset_datadog_logs.py +190 -19
  67. holmes/plugins/toolsets/datadog/toolset_datadog_metrics.py +101 -44
  68. holmes/plugins/toolsets/datadog/toolset_datadog_rds.py +13 -16
  69. holmes/plugins/toolsets/datadog/toolset_datadog_traces.py +25 -31
  70. holmes/plugins/toolsets/git.py +51 -46
  71. holmes/plugins/toolsets/grafana/common.py +15 -3
  72. holmes/plugins/toolsets/grafana/grafana_api.py +46 -24
  73. holmes/plugins/toolsets/grafana/grafana_tempo_api.py +454 -0
  74. holmes/plugins/toolsets/grafana/loki/instructions.jinja2 +9 -0
  75. holmes/plugins/toolsets/grafana/loki/toolset_grafana_loki.py +117 -0
  76. holmes/plugins/toolsets/grafana/toolset_grafana.py +211 -91
  77. holmes/plugins/toolsets/grafana/toolset_grafana_dashboard.jinja2 +27 -0
  78. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.jinja2 +246 -11
  79. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +653 -293
  80. holmes/plugins/toolsets/grafana/trace_parser.py +1 -1
  81. holmes/plugins/toolsets/internet/internet.py +6 -7
  82. holmes/plugins/toolsets/internet/notion.py +5 -6
  83. holmes/plugins/toolsets/investigator/core_investigation.py +42 -34
  84. holmes/plugins/toolsets/kafka.py +25 -36
  85. holmes/plugins/toolsets/kubernetes.yaml +58 -84
  86. holmes/plugins/toolsets/kubernetes_logs.py +6 -6
  87. holmes/plugins/toolsets/kubernetes_logs.yaml +32 -0
  88. holmes/plugins/toolsets/logging_utils/logging_api.py +80 -4
  89. holmes/plugins/toolsets/mcp/toolset_mcp.py +181 -55
  90. holmes/plugins/toolsets/newrelic/__init__.py +0 -0
  91. holmes/plugins/toolsets/newrelic/new_relic_api.py +125 -0
  92. holmes/plugins/toolsets/newrelic/newrelic.jinja2 +41 -0
  93. holmes/plugins/toolsets/newrelic/newrelic.py +163 -0
  94. holmes/plugins/toolsets/opensearch/opensearch.py +10 -17
  95. holmes/plugins/toolsets/opensearch/opensearch_logs.py +7 -7
  96. holmes/plugins/toolsets/opensearch/opensearch_ppl_query_docs.jinja2 +1616 -0
  97. holmes/plugins/toolsets/opensearch/opensearch_query_assist.py +78 -0
  98. holmes/plugins/toolsets/opensearch/opensearch_query_assist_instructions.jinja2 +223 -0
  99. holmes/plugins/toolsets/opensearch/opensearch_traces.py +13 -16
  100. holmes/plugins/toolsets/openshift.yaml +283 -0
  101. holmes/plugins/toolsets/prometheus/prometheus.py +915 -390
  102. holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2 +43 -2
  103. holmes/plugins/toolsets/prometheus/utils.py +28 -0
  104. holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +9 -10
  105. holmes/plugins/toolsets/robusta/robusta.py +236 -65
  106. holmes/plugins/toolsets/robusta/robusta_instructions.jinja2 +26 -9
  107. holmes/plugins/toolsets/runbook/runbook_fetcher.py +137 -26
  108. holmes/plugins/toolsets/service_discovery.py +1 -1
  109. holmes/plugins/toolsets/servicenow_tables/instructions.jinja2 +83 -0
  110. holmes/plugins/toolsets/servicenow_tables/servicenow_tables.py +426 -0
  111. holmes/plugins/toolsets/utils.py +88 -0
  112. holmes/utils/config_utils.py +91 -0
  113. holmes/utils/default_toolset_installation_guide.jinja2 +1 -22
  114. holmes/utils/env.py +7 -0
  115. holmes/utils/global_instructions.py +75 -10
  116. holmes/utils/holmes_status.py +2 -1
  117. holmes/utils/holmes_sync_toolsets.py +0 -2
  118. holmes/utils/krr_utils.py +188 -0
  119. holmes/utils/sentry_helper.py +41 -0
  120. holmes/utils/stream.py +61 -7
  121. holmes/version.py +34 -14
  122. holmesgpt-0.16.2a0.dist-info/LICENSE +178 -0
  123. {holmesgpt-0.13.2.dist-info → holmesgpt-0.16.2a0.dist-info}/METADATA +29 -27
  124. {holmesgpt-0.13.2.dist-info → holmesgpt-0.16.2a0.dist-info}/RECORD +126 -102
  125. holmes/core/performance_timing.py +0 -72
  126. holmes/plugins/toolsets/grafana/tempo_api.py +0 -124
  127. holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +0 -110
  128. holmes/plugins/toolsets/newrelic.py +0 -231
  129. holmes/plugins/toolsets/servicenow/install.md +0 -37
  130. holmes/plugins/toolsets/servicenow/instructions.jinja2 +0 -3
  131. holmes/plugins/toolsets/servicenow/servicenow.py +0 -219
  132. holmesgpt-0.13.2.dist-info/LICENSE.txt +0 -21
  133. {holmesgpt-0.13.2.dist-info → holmesgpt-0.16.2a0.dist-info}/WHEEL +0 -0
  134. {holmesgpt-0.13.2.dist-info → holmesgpt-0.16.2a0.dist-info}/entry_points.txt +0 -0
@@ -1,10 +1,9 @@
1
1
  import json
2
2
  import logging
3
3
  import os
4
- import re
5
4
  import time
6
5
  import dateutil.parser
7
- from typing import Any, Dict, List, Optional, Tuple, Type, Union
6
+ from typing import Any, Dict, Optional, Tuple, Type, Union
8
7
  from urllib.parse import urljoin
9
8
 
10
9
  import requests # type: ignore
@@ -16,12 +15,16 @@ from holmes.core.tools import (
16
15
  CallablePrerequisite,
17
16
  StructuredToolResult,
18
17
  Tool,
18
+ ToolInvokeContext,
19
19
  ToolParameter,
20
- ToolResultStatus,
20
+ StructuredToolResultStatus,
21
21
  Toolset,
22
22
  ToolsetTag,
23
23
  )
24
+ from holmes.core.tools_utils.token_counting import count_tool_response_tokens
25
+ from holmes.core.tools_utils.tool_context_window_limiter import get_pct_token_count
24
26
  from holmes.plugins.toolsets.consts import STANDARD_END_DATETIME_TOOL_PARAM_DESCRIPTION
27
+ from holmes.plugins.toolsets.prometheus.utils import parse_duration_to_seconds
25
28
  from holmes.plugins.toolsets.service_discovery import PrometheusDiscovery
26
29
  from holmes.plugins.toolsets.utils import (
27
30
  get_param_or_raise,
@@ -38,24 +41,64 @@ from holmes.plugins.toolsets.logging_utils.logging_api import (
38
41
  from holmes.utils.keygen_utils import generate_random_key
39
42
 
40
43
  PROMETHEUS_RULES_CACHE_KEY = "cached_prometheus_rules"
44
+ PROMETHEUS_METADATA_API_LIMIT = 100 # Default limit for Prometheus metadata APIs (series, labels, metadata) to prevent overwhelming responses
45
+ # Default timeout values for PromQL queries
46
+ DEFAULT_QUERY_TIMEOUT_SECONDS = 20
47
+ MAX_QUERY_TIMEOUT_SECONDS = 180
48
+ # Default timeout for metadata API calls (discovery endpoints)
49
+ DEFAULT_METADATA_TIMEOUT_SECONDS = 20
50
+ MAX_METADATA_TIMEOUT_SECONDS = 60
51
+ # Default time window for metadata APIs (in hours)
52
+ DEFAULT_METADATA_TIME_WINDOW_HRS = 1
41
53
 
42
54
 
43
55
  class PrometheusConfig(BaseModel):
44
56
  # URL is optional because it can be set with an env var
45
57
  prometheus_url: Optional[str]
46
58
  healthcheck: str = "-/healthy"
47
- # Setting to None will remove the time window from the request for labels
48
- metrics_labels_time_window_hrs: Union[int, None] = 48
49
- # Setting to None will disable the cache
50
- metrics_labels_cache_duration_hrs: Union[int, None] = 12
51
- fetch_labels_with_labels_api: bool = False
52
- fetch_metadata_with_series_api: bool = False
59
+
60
+ # New config for default time window for metadata APIs
61
+ default_metadata_time_window_hrs: int = DEFAULT_METADATA_TIME_WINDOW_HRS # Default: only show metrics active in the last hour
62
+
63
+ # Query timeout configuration
64
+ default_query_timeout_seconds: int = (
65
+ DEFAULT_QUERY_TIMEOUT_SECONDS # Default timeout for PromQL queries
66
+ )
67
+ max_query_timeout_seconds: int = (
68
+ MAX_QUERY_TIMEOUT_SECONDS # Maximum allowed timeout for PromQL queries
69
+ )
70
+
71
+ # Metadata API timeout configuration
72
+ default_metadata_timeout_seconds: int = (
73
+ DEFAULT_METADATA_TIMEOUT_SECONDS # Default timeout for metadata/discovery APIs
74
+ )
75
+ max_metadata_timeout_seconds: int = (
76
+ MAX_METADATA_TIMEOUT_SECONDS # Maximum allowed timeout for metadata APIs
77
+ )
78
+
79
+ # DEPRECATED: These config values are deprecated and will be removed in a future version
80
+ # Using None as default so we can detect if user explicitly set them
81
+ metrics_labels_time_window_hrs: Optional[int] = (
82
+ None # DEPRECATED - use default_metadata_time_window_hrs instead
83
+ )
84
+ metrics_labels_cache_duration_hrs: Optional[int] = (
85
+ None # DEPRECATED - no longer used
86
+ )
87
+ fetch_labels_with_labels_api: Optional[bool] = None # DEPRECATED - no longer used
88
+ fetch_metadata_with_series_api: Optional[bool] = None # DEPRECATED - no longer used
89
+
53
90
  tool_calls_return_data: bool = True
54
91
  headers: Dict = Field(default_factory=dict)
55
- rules_cache_duration_seconds: Union[int, None] = 1800 # 30 minutes
92
+ rules_cache_duration_seconds: Optional[int] = 1800 # 30 minutes
56
93
  additional_labels: Optional[Dict[str, str]] = None
57
94
  prometheus_ssl_enabled: bool = True
58
95
 
96
+ # Custom limit to the max number of tokens that a query result can take to proactively
97
+ # prevent token limit issues. Expressed in % of the model's context window.
98
+ # This limit only overrides the global limit for all tools (TOOL_MAX_ALLOCATED_CONTEXT_WINDOW_PCT)
99
+ # if it is lower.
100
+ query_response_size_limit_pct: Optional[int] = None
101
+
59
102
  @field_validator("prometheus_url")
60
103
  def ensure_trailing_slash(cls, v: Optional[str]) -> Optional[str]:
61
104
  if v is not None and not v.endswith("/"):
@@ -64,6 +107,26 @@ class PrometheusConfig(BaseModel):
64
107
 
65
108
  @model_validator(mode="after")
66
109
  def validate_prom_config(self):
110
+ # Check for deprecated config values and print warnings
111
+ deprecated_configs = []
112
+ if self.metrics_labels_time_window_hrs is not None: # Check if explicitly set
113
+ deprecated_configs.append(
114
+ "metrics_labels_time_window_hrs (use default_metadata_time_window_hrs instead)"
115
+ )
116
+ if (
117
+ self.metrics_labels_cache_duration_hrs is not None
118
+ ): # Check if explicitly set
119
+ deprecated_configs.append("metrics_labels_cache_duration_hrs")
120
+ if self.fetch_labels_with_labels_api is not None: # Check if explicitly set
121
+ deprecated_configs.append("fetch_labels_with_labels_api")
122
+ if self.fetch_metadata_with_series_api is not None: # Check if explicitly set
123
+ deprecated_configs.append("fetch_metadata_with_series_api")
124
+
125
+ if deprecated_configs:
126
+ logging.warning(
127
+ f"WARNING: The following Prometheus config values are deprecated and will be removed in a future version: "
128
+ f"{', '.join(deprecated_configs)}. These configs no longer affect behavior."
129
+ )
67
130
  # If openshift is enabled, and the user didn't configure auth headers, we will try to load the token from the service account.
68
131
  if IS_OPENSHIFT:
69
132
  if self.healthcheck == "-/healthy":
@@ -160,6 +223,8 @@ def do_request(
160
223
 
161
224
  if isinstance(config, AMPConfig):
162
225
  client = config.get_aws_client() # cached AWSPrometheusConnect
226
+ # Note: timeout parameter is not supported by prometrix's signed_request
227
+ # AWS/AMP requests will not respect the timeout setting
163
228
  return client.signed_request( # type: ignore
164
229
  method=method,
165
230
  url=url,
@@ -181,99 +246,6 @@ def do_request(
181
246
  )
182
247
 
183
248
 
184
- def filter_metrics_by_type(metrics: Dict, expected_type: str):
185
- return {
186
- metric_name: metric_data
187
- for metric_name, metric_data in metrics.items()
188
- if expected_type in metric_data.get("type", "")
189
- or metric_data.get("type", "") == "?"
190
- }
191
-
192
-
193
- def filter_metrics_by_name(metrics: Dict, pattern: str) -> Dict:
194
- regex = re.compile(pattern)
195
- return {
196
- metric_name: metric_data
197
- for metric_name, metric_data in metrics.items()
198
- if regex.search(metric_name)
199
- }
200
-
201
-
202
- METRICS_SUFFIXES_TO_STRIP = ["_bucket", "_count", "_sum"]
203
-
204
-
205
- def fetch_metadata(
206
- prometheus_url: str,
207
- headers: Optional[Dict],
208
- config,
209
- verify_ssl: bool = True,
210
- ) -> Dict:
211
- metadata_url = urljoin(prometheus_url, "api/v1/metadata")
212
- metadata_response = do_request(
213
- config=config,
214
- url=metadata_url,
215
- headers=headers,
216
- timeout=60,
217
- verify=verify_ssl,
218
- method="GET",
219
- )
220
- metadata_response.raise_for_status()
221
-
222
- metadata = metadata_response.json()["data"]
223
-
224
- metrics = {}
225
- for metric_name, meta_list in metadata.items():
226
- if meta_list:
227
- metric_type = meta_list[0].get("type", "unknown")
228
- metric_description = meta_list[0].get("help", "unknown")
229
- metrics[metric_name] = {
230
- "type": metric_type,
231
- "description": metric_description,
232
- "labels": set(),
233
- }
234
-
235
- return metrics
236
-
237
-
238
- def fetch_metadata_with_series_api(
239
- prometheus_url: str,
240
- metric_name: str,
241
- headers: Dict,
242
- config,
243
- verify_ssl: bool = True,
244
- ) -> Dict:
245
- url = urljoin(prometheus_url, "api/v1/series")
246
- params: Dict = {"match[]": f'{{__name__=~".*{metric_name}.*"}}', "limit": "10000"}
247
-
248
- response = do_request(
249
- config=config,
250
- url=url,
251
- headers=headers,
252
- params=params,
253
- timeout=60,
254
- verify=verify_ssl,
255
- method="GET",
256
- )
257
- response.raise_for_status()
258
- metrics = response.json()["data"]
259
-
260
- metadata: Dict = {}
261
- for metric_data in metrics:
262
- metric_name = metric_data.get("__name__")
263
- if not metric_name:
264
- continue
265
-
266
- metric = metadata.get(metric_name)
267
- if not metric:
268
- metric = {"description": "?", "type": "?", "labels": set()}
269
- metadata[metric_name] = metric
270
-
271
- labels = {k for k in metric_data.keys() if k != "__name__"}
272
- metric["labels"].update(labels)
273
-
274
- return metadata
275
-
276
-
277
249
  def result_has_data(result: Dict) -> bool:
278
250
  data = result.get("data", {})
279
251
  if len(data.get("result", [])) > 0:
@@ -284,33 +256,58 @@ def result_has_data(result: Dict) -> bool:
284
256
  def adjust_step_for_max_points(
285
257
  start_timestamp: str,
286
258
  end_timestamp: str,
287
- step: float,
259
+ step: Optional[float] = None,
260
+ max_points_override: Optional[float] = None,
288
261
  ) -> float:
289
262
  """
290
263
  Adjusts the step parameter to ensure the number of data points doesn't exceed max_points.
291
- Max points is controlled by the PROMETHEUS_MAX_GRAPH_POINTS environment variable (default: 300).
292
264
 
293
265
  Args:
294
266
  start_timestamp: RFC3339 formatted start time
295
267
  end_timestamp: RFC3339 formatted end time
296
- step: The requested step duration in seconds
268
+ step: The requested step duration in seconds (None for auto-calculation)
269
+ max_points_override: Optional override for max points (must be <= MAX_GRAPH_POINTS)
297
270
 
298
271
  Returns:
299
272
  Adjusted step value in seconds that ensures points <= max_points
300
273
  """
274
+ # Use override if provided and valid, otherwise use default
275
+ max_points = MAX_GRAPH_POINTS
276
+ if max_points_override is not None:
277
+ if max_points_override > MAX_GRAPH_POINTS:
278
+ logging.warning(
279
+ f"max_points override ({max_points_override}) exceeds system limit ({MAX_GRAPH_POINTS}), using {MAX_GRAPH_POINTS}"
280
+ )
281
+ max_points = MAX_GRAPH_POINTS
282
+ elif max_points_override < 1:
283
+ logging.warning(
284
+ f"max_points override ({max_points_override}) is invalid, using default {MAX_GRAPH_POINTS}"
285
+ )
286
+ max_points = MAX_GRAPH_POINTS
287
+ else:
288
+ max_points = max_points_override
289
+ logging.debug(f"Using max_points override: {max_points}")
301
290
 
302
291
  start_dt = dateutil.parser.parse(start_timestamp)
303
292
  end_dt = dateutil.parser.parse(end_timestamp)
304
293
 
305
294
  time_range_seconds = (end_dt - start_dt).total_seconds()
306
295
 
296
+ # If no step provided, calculate a reasonable default
297
+ # Aim for ~60 data points across the time range (1 per minute for hourly, etc)
298
+ if step is None:
299
+ step = max(1, time_range_seconds / 60)
300
+ logging.debug(
301
+ f"No step provided, defaulting to {step}s for {time_range_seconds}s range"
302
+ )
303
+
307
304
  current_points = time_range_seconds / step
308
305
 
309
306
  # If current points exceed max, adjust the step
310
- if current_points > MAX_GRAPH_POINTS:
311
- adjusted_step = time_range_seconds / MAX_GRAPH_POINTS
307
+ if current_points > max_points:
308
+ adjusted_step = time_range_seconds / max_points
312
309
  logging.info(
313
- f"Adjusting step from {step}s to {adjusted_step}s to limit points from {current_points:.0f} to {MAX_GRAPH_POINTS}"
310
+ f"Adjusting step from {step}s to {adjusted_step}s to limit points from {current_points:.0f} to {max_points}"
314
311
  )
315
312
  return adjusted_step
316
313
 
@@ -324,185 +321,143 @@ def add_prometheus_auth(prometheus_auth_header: Optional[str]) -> Dict[str, Any]
324
321
  return results
325
322
 
326
323
 
327
- def fetch_metrics_labels_with_series_api(
328
- prometheus_url: str,
329
- headers: Dict[str, str],
330
- cache: Optional[TTLCache],
331
- metrics_labels_time_window_hrs: Union[int, None],
332
- metric_name: str,
333
- config=None,
334
- verify_ssl: bool = True,
335
- ) -> dict:
336
- """This is a slow query. Takes 5+ seconds to run"""
337
- cache_key = f"metrics_labels_series_api:{metric_name}"
338
- if cache:
339
- cached_result = cache.get(cache_key)
340
- if cached_result:
341
- return cached_result
342
-
343
- series_url = urljoin(prometheus_url, "api/v1/series")
344
- params: dict = {"match[]": f'{{__name__=~".*{metric_name}.*"}}', "limit": "10000"}
345
-
346
- if metrics_labels_time_window_hrs is not None:
347
- params["end"] = int(time.time())
348
- params["start"] = params["end"] - (metrics_labels_time_window_hrs * 60 * 60)
349
-
350
- series_response = do_request(
351
- config=config,
352
- url=series_url,
353
- headers=headers,
354
- params=params,
355
- timeout=60,
356
- verify=verify_ssl,
357
- method="GET",
358
- )
359
- series_response.raise_for_status()
360
- series = series_response.json()["data"]
361
-
362
- metrics_labels: dict = {}
363
- for serie in series:
364
- metric_name = serie["__name__"]
365
- # Add all labels except __name__
366
- labels = {k for k in serie.keys() if k != "__name__"}
367
- if metric_name in metrics_labels:
368
- metrics_labels[metric_name].update(labels)
369
- else:
370
- metrics_labels[metric_name] = labels
371
- if cache:
372
- cache.set(cache_key, metrics_labels)
373
-
374
- return metrics_labels
375
-
376
-
377
- def fetch_metrics_labels_with_labels_api(
378
- prometheus_url: str,
379
- cache: Optional[TTLCache],
380
- metrics_labels_time_window_hrs: Union[int, None],
381
- metric_names: List[str],
382
- headers: Dict,
383
- config=None,
384
- verify_ssl: bool = True,
385
- ) -> dict:
386
- metrics_labels = {}
387
-
388
- for metric_name in metric_names:
389
- cache_key = f"metrics_labels_labels_api:{metric_name}"
390
- if cache:
391
- cached_result = cache.get(cache_key)
392
- if cached_result:
393
- metrics_labels[metric_name] = cached_result
394
-
395
- url = urljoin(prometheus_url, "api/v1/labels")
396
- params: dict = {
397
- "match[]": f'{{__name__="{metric_name}"}}',
398
- }
399
- if metrics_labels_time_window_hrs is not None:
400
- params["end"] = int(time.time())
401
- params["start"] = params["end"] - (metrics_labels_time_window_hrs * 60 * 60)
324
+ def create_data_summary_for_large_result(
325
+ result_data: Dict, query: str, data_size_tokens: int, is_range_query: bool = False
326
+ ) -> Dict[str, Any]:
327
+ """
328
+ Create a summary for large Prometheus results instead of returning full data.
402
329
 
403
- response = do_request(
404
- config=config,
405
- url=url,
406
- headers=headers,
407
- params=params,
408
- timeout=60,
409
- verify=verify_ssl,
410
- method="GET",
411
- )
412
- response.raise_for_status()
413
- labels = response.json()["data"]
414
- filtered_labels = {label for label in labels if label != "__name__"}
415
- metrics_labels[metric_name] = filtered_labels
416
-
417
- if cache:
418
- cache.set(cache_key, filtered_labels)
419
-
420
- return metrics_labels
421
-
422
-
423
- def fetch_metrics(
424
- prometheus_url: str,
425
- cache: Optional[TTLCache],
426
- metrics_labels_time_window_hrs: Union[int, None],
427
- metric_name: str,
428
- should_fetch_labels_with_labels_api: bool,
429
- should_fetch_metadata_with_series_api: bool,
430
- headers: Dict,
431
- config=None,
432
- verify_ssl: bool = True,
433
- ) -> dict:
434
- metrics = None
435
- should_fetch_labels = True
436
- if should_fetch_metadata_with_series_api:
437
- metrics = fetch_metadata_with_series_api(
438
- prometheus_url=prometheus_url,
439
- metric_name=metric_name,
440
- headers=headers,
441
- config=config,
442
- verify_ssl=verify_ssl,
330
+ Args:
331
+ result_data: The Prometheus data result
332
+ query: The original PromQL query
333
+ data_size_tokens: Size of the data in tokens
334
+ is_range_query: Whether this is a range query (vs instant query)
335
+
336
+ Returns:
337
+ Dictionary with summary information and suggestions
338
+ """
339
+ if is_range_query:
340
+ series_list = result_data.get("result", [])
341
+ num_items = len(series_list)
342
+
343
+ # Calculate exact total data points across all series
344
+ total_points = 0
345
+ for series in series_list: # Iterate through ALL series for exact count
346
+ points = len(series.get("values", []))
347
+ total_points += points
348
+
349
+ # Analyze label keys and their cardinality
350
+ label_cardinality: Dict[str, set] = {}
351
+ for series in series_list:
352
+ metric = series.get("metric", {})
353
+ for label_key, label_value in metric.items():
354
+ if label_key not in label_cardinality:
355
+ label_cardinality[label_key] = set()
356
+ label_cardinality[label_key].add(label_value)
357
+
358
+ # Convert sets to counts for the summary
359
+ label_summary = {
360
+ label: len(values) for label, values in label_cardinality.items()
361
+ }
362
+ # Sort by cardinality (highest first) for better insights
363
+ label_summary = dict(
364
+ sorted(label_summary.items(), key=lambda x: x[1], reverse=True)
443
365
  )
444
- should_fetch_labels = False # series API returns the labels
366
+
367
+ return {
368
+ "message": f"Data too large to return ({data_size_tokens:,} tokens). Query returned {num_items} time series with {total_points:,} total data points.",
369
+ "series_count": num_items,
370
+ "total_data_points": total_points,
371
+ "data_size_tokens": data_size_tokens,
372
+ "label_cardinality": label_summary,
373
+ "suggestion": f'Consider using topk({min(5, num_items)}, {query}) to limit results to the top {min(5, num_items)} series. To also capture remaining data as \'other\': topk({min(5, num_items)}, {query}) or label_replace((sum({query}) - sum(topk({min(5, num_items)}, {query}))), "pod", "other", "", "")',
374
+ }
445
375
  else:
446
- metrics = fetch_metadata(
447
- prometheus_url=prometheus_url,
448
- headers=headers,
449
- config=config,
450
- verify_ssl=verify_ssl,
376
+ # Instant query
377
+ result_type = result_data.get("resultType", "")
378
+ result_list = result_data.get("result", [])
379
+ num_items = len(result_list)
380
+
381
+ # Analyze label keys and their cardinality
382
+ instant_label_cardinality: Dict[str, set] = {}
383
+ for item in result_list:
384
+ if isinstance(item, dict):
385
+ metric = item.get("metric", {})
386
+ for label_key, label_value in metric.items():
387
+ if label_key not in instant_label_cardinality:
388
+ instant_label_cardinality[label_key] = set()
389
+ instant_label_cardinality[label_key].add(label_value)
390
+
391
+ # Convert sets to counts for the summary
392
+ label_summary = {
393
+ label: len(values) for label, values in instant_label_cardinality.items()
394
+ }
395
+ # Sort by cardinality (highest first) for better insights
396
+ label_summary = dict(
397
+ sorted(label_summary.items(), key=lambda x: x[1], reverse=True)
451
398
  )
452
- metrics = filter_metrics_by_name(metrics, metric_name)
453
399
 
454
- if should_fetch_labels:
455
- metrics_labels = {}
456
- if should_fetch_labels_with_labels_api:
457
- metrics_labels = fetch_metrics_labels_with_labels_api(
458
- prometheus_url=prometheus_url,
459
- cache=cache,
460
- metrics_labels_time_window_hrs=metrics_labels_time_window_hrs,
461
- metric_names=list(metrics.keys()),
462
- headers=headers,
463
- config=config,
464
- verify_ssl=verify_ssl,
465
- )
466
- else:
467
- metrics_labels = fetch_metrics_labels_with_series_api(
468
- prometheus_url=prometheus_url,
469
- cache=cache,
470
- metrics_labels_time_window_hrs=metrics_labels_time_window_hrs,
471
- metric_name=metric_name,
472
- headers=headers,
473
- config=config,
474
- verify_ssl=verify_ssl,
475
- )
400
+ return {
401
+ "message": f"Data too large to return ({data_size_tokens:,} tokens). Query returned {num_items} results.",
402
+ "result_count": num_items,
403
+ "result_type": result_type,
404
+ "data_size_tokens": data_size_tokens,
405
+ "label_cardinality": label_summary,
406
+ "suggestion": f'Consider using topk({min(5, num_items)}, {query}) to limit results. To also capture remaining data as \'other\': topk({min(5, num_items)}, {query}) or label_replace((sum({query}) - sum(topk({min(5, num_items)}, {query}))), "instance", "other", "", "")',
407
+ }
476
408
 
477
- for metric_name in metrics:
478
- if metric_name in metrics_labels:
479
- metrics[metric_name]["labels"] = metrics_labels[metric_name]
480
409
 
481
- return metrics
410
+ class MetricsBasedResponse(BaseModel):
411
+ status: str
412
+ error_message: Optional[str] = None
413
+ data: Optional[str] = None
414
+ random_key: str
415
+ tool_name: str
416
+ description: str
417
+ query: str
418
+ start: Optional[str] = None
419
+ end: Optional[str] = None
420
+ step: Optional[float] = None
421
+ output_type: Optional[str] = None
422
+ data_summary: Optional[dict[str, Any]] = None
423
+
424
+
425
+ def create_structured_tool_result(
426
+ params: dict, response: MetricsBasedResponse
427
+ ) -> StructuredToolResult:
428
+ status = StructuredToolResultStatus.SUCCESS
429
+ if response.error_message or response.status.lower() in ("failed", "error"):
430
+ status = StructuredToolResultStatus.ERROR
431
+ elif not response.data:
432
+ status = StructuredToolResultStatus.NO_DATA
433
+
434
+ return StructuredToolResult(
435
+ status=status,
436
+ data=response.model_dump_json(indent=2),
437
+ params=params,
438
+ )
482
439
 
483
440
 
484
441
  class ListPrometheusRules(BasePrometheusTool):
485
442
  def __init__(self, toolset: "PrometheusToolset"):
486
443
  super().__init__(
487
444
  name="list_prometheus_rules",
488
- description="List all defined prometheus rules. Will show the prometheus rules description, expression and annotations",
445
+ description="List all defined Prometheus rules (api/v1/rules). Will show the Prometheus rules description, expression and annotations",
489
446
  parameters={},
490
447
  toolset=toolset,
491
448
  )
492
449
  self._cache = None
493
450
 
494
- def _invoke(
495
- self, params: dict, user_approved: bool = False
496
- ) -> StructuredToolResult:
451
+ def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
497
452
  if not self.toolset.config or not self.toolset.config.prometheus_url:
498
453
  return StructuredToolResult(
499
- status=ToolResultStatus.ERROR,
454
+ status=StructuredToolResultStatus.ERROR,
500
455
  error="Prometheus is not configured. Prometheus URL is missing",
501
456
  params=params,
502
457
  )
503
458
  if self.toolset.config.is_amp():
504
459
  return StructuredToolResult(
505
- status=ToolResultStatus.ERROR,
460
+ status=StructuredToolResultStatus.ERROR,
506
461
  error="Tool not supported in AMP",
507
462
  params=params,
508
463
  )
@@ -515,7 +470,7 @@ class ListPrometheusRules(BasePrometheusTool):
515
470
  logging.debug("rules returned from cache")
516
471
 
517
472
  return StructuredToolResult(
518
- status=ToolResultStatus.SUCCESS,
473
+ status=StructuredToolResultStatus.SUCCESS,
519
474
  data=cached_rules,
520
475
  params=params,
521
476
  )
@@ -528,7 +483,7 @@ class ListPrometheusRules(BasePrometheusTool):
528
483
  config=self.toolset.config,
529
484
  url=rules_url,
530
485
  params=params,
531
- timeout=180,
486
+ timeout=40,
532
487
  verify=self.toolset.config.prometheus_ssl_enabled,
533
488
  headers=self.toolset.config.headers,
534
489
  method="GET",
@@ -539,28 +494,28 @@ class ListPrometheusRules(BasePrometheusTool):
539
494
  if self._cache:
540
495
  self._cache.set(PROMETHEUS_RULES_CACHE_KEY, data)
541
496
  return StructuredToolResult(
542
- status=ToolResultStatus.SUCCESS,
497
+ status=StructuredToolResultStatus.SUCCESS,
543
498
  data=data,
544
499
  params=params,
545
500
  )
546
501
  except requests.Timeout:
547
502
  logging.warning("Timeout while fetching prometheus rules", exc_info=True)
548
503
  return StructuredToolResult(
549
- status=ToolResultStatus.ERROR,
504
+ status=StructuredToolResultStatus.ERROR,
550
505
  error="Request timed out while fetching rules",
551
506
  params=params,
552
507
  )
553
508
  except RequestException as e:
554
509
  logging.warning("Failed to fetch prometheus rules", exc_info=True)
555
510
  return StructuredToolResult(
556
- status=ToolResultStatus.ERROR,
511
+ status=StructuredToolResultStatus.ERROR,
557
512
  error=f"Network error while fetching rules: {str(e)}",
558
513
  params=params,
559
514
  )
560
515
  except Exception as e:
561
516
  logging.warning("Failed to process prometheus rules", exc_info=True)
562
517
  return StructuredToolResult(
563
- status=ToolResultStatus.ERROR,
518
+ status=StructuredToolResultStatus.ERROR,
564
519
  error=f"Unexpected error: {str(e)}",
565
520
  params=params,
566
521
  )
@@ -569,120 +524,553 @@ class ListPrometheusRules(BasePrometheusTool):
569
524
  return f"{toolset_name_for_one_liner(self.toolset.name)}: Fetch Rules"
570
525
 
571
526
 
572
- class ListAvailableMetrics(BasePrometheusTool):
527
+ class GetMetricNames(BasePrometheusTool):
528
+ """Thin wrapper around /api/v1/label/__name__/values - the fastest way to discover metric names"""
529
+
573
530
  def __init__(self, toolset: "PrometheusToolset"):
574
531
  super().__init__(
575
- name="list_available_metrics",
576
- description="List all the available metrics to query from prometheus, including their types (counter, gauge, histogram, summary) and available labels.",
532
+ name="get_metric_names",
533
+ description=(
534
+ "Get list of metric names using /api/v1/label/__name__/values. "
535
+ "FASTEST method for metric discovery when you need to explore available metrics. "
536
+ f"Returns up to {PROMETHEUS_METADATA_API_LIMIT} unique metric names (limit={PROMETHEUS_METADATA_API_LIMIT}). If {PROMETHEUS_METADATA_API_LIMIT} results returned, more may exist - use a more specific filter. "
537
+ f"ALWAYS use match[] parameter to filter metrics - without it you'll get random {PROMETHEUS_METADATA_API_LIMIT} metrics which is rarely useful. "
538
+ "Note: Does not return metric metadata (type, description, labels). "
539
+ "By default returns metrics active in the last 1 hour (configurable via default_metadata_time_window_hrs)."
540
+ ),
577
541
  parameters={
578
- "type_filter": ToolParameter(
579
- description="Optional filter to only return a specific metric type. Can be one of counter, gauge, histogram, summary",
542
+ "match": ToolParameter(
543
+ description=(
544
+ "REQUIRED: PromQL selector to filter metrics. Use regex OR (|) to check multiple patterns in one call - much faster than multiple calls! Examples: "
545
+ "'{__name__=~\"node_cpu.*|node_memory.*|node_disk.*\"}' for all node resource metrics, "
546
+ "'{__name__=~\"container_cpu.*|container_memory.*|container_network.*\"}' for all container metrics, "
547
+ "'{__name__=~\"kube_pod.*|kube_deployment.*|kube_service.*\"}' for multiple Kubernetes object metrics, "
548
+ "'{__name__=~\".*cpu.*|.*memory.*|.*disk.*\"}' for all resource metrics, "
549
+ "'{namespace=~\"kube-system|default|monitoring\"}' for metrics from multiple namespaces, "
550
+ "'{job=~\"prometheus|node-exporter|kube-state-metrics\"}' for metrics from multiple jobs."
551
+ ),
552
+ type="string",
553
+ required=True,
554
+ ),
555
+ "start": ToolParameter(
556
+ description="Start timestamp (RFC3339 or Unix). Default: 1 hour ago",
580
557
  type="string",
581
558
  required=False,
582
559
  ),
583
- "name_filter": ToolParameter(
584
- description="Only the metrics partially or fully matching this name will be returned",
560
+ "end": ToolParameter(
561
+ description="End timestamp (RFC3339 or Unix). Default: now",
585
562
  type="string",
586
- required=True,
563
+ required=False,
587
564
  ),
588
565
  },
589
566
  toolset=toolset,
590
567
  )
591
- self._cache = None
592
568
 
593
- def _invoke(
594
- self, params: dict, user_approved: bool = False
595
- ) -> StructuredToolResult:
569
+ def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
596
570
  if not self.toolset.config or not self.toolset.config.prometheus_url:
597
571
  return StructuredToolResult(
598
- status=ToolResultStatus.ERROR,
572
+ status=StructuredToolResultStatus.ERROR,
599
573
  error="Prometheus is not configured. Prometheus URL is missing",
600
574
  params=params,
601
575
  )
602
- if not self._cache and self.toolset.config.metrics_labels_cache_duration_hrs:
603
- self._cache = TTLCache(
604
- self.toolset.config.metrics_labels_cache_duration_hrs * 3600 # type: ignore
605
- )
606
576
  try:
607
- prometheus_url = self.toolset.config.prometheus_url
608
- metrics_labels_time_window_hrs = (
609
- self.toolset.config.metrics_labels_time_window_hrs
577
+ match_param = params.get("match")
578
+ if not match_param:
579
+ return StructuredToolResult(
580
+ status=StructuredToolResultStatus.ERROR,
581
+ error="Match parameter is required to filter metrics",
582
+ params=params,
583
+ )
584
+
585
+ url = urljoin(
586
+ self.toolset.config.prometheus_url, "api/v1/label/__name__/values"
610
587
  )
588
+ query_params = {
589
+ "limit": str(PROMETHEUS_METADATA_API_LIMIT),
590
+ "match[]": match_param,
591
+ }
592
+
593
+ # Add time parameters - use provided values or defaults
594
+ if params.get("end"):
595
+ query_params["end"] = params["end"]
596
+ else:
597
+ query_params["end"] = str(int(time.time()))
598
+
599
+ if params.get("start"):
600
+ query_params["start"] = params["start"]
601
+ elif self.toolset.config.default_metadata_time_window_hrs:
602
+ # Use default time window
603
+ query_params["start"] = str(
604
+ int(time.time())
605
+ - (self.toolset.config.default_metadata_time_window_hrs * 3600)
606
+ )
607
+
608
+ response = do_request(
609
+ config=self.toolset.config,
610
+ url=url,
611
+ params=query_params,
612
+ timeout=self.toolset.config.default_metadata_timeout_seconds,
613
+ verify=self.toolset.config.prometheus_ssl_enabled,
614
+ headers=self.toolset.config.headers,
615
+ method="GET",
616
+ )
617
+ response.raise_for_status()
618
+ data = response.json()
619
+
620
+ # Check if results were truncated
621
+ if (
622
+ "data" in data
623
+ and isinstance(data["data"], list)
624
+ and len(data["data"]) == PROMETHEUS_METADATA_API_LIMIT
625
+ ):
626
+ data["_truncated"] = True
627
+ data["_message"] = (
628
+ f"Results truncated at limit={PROMETHEUS_METADATA_API_LIMIT}. Use a more specific match filter to see additional metrics."
629
+ )
630
+
631
+ return StructuredToolResult(
632
+ status=StructuredToolResultStatus.SUCCESS,
633
+ data=data,
634
+ params=params,
635
+ )
636
+ except Exception as e:
637
+ return StructuredToolResult(
638
+ status=StructuredToolResultStatus.ERROR,
639
+ error=str(e),
640
+ params=params,
641
+ )
642
+
643
+ def get_parameterized_one_liner(self, params) -> str:
644
+ return f"{toolset_name_for_one_liner(self.toolset.name)}: Get Metric Names"
645
+
646
+
647
+ class GetLabelValues(BasePrometheusTool):
648
+ """Get values for a specific label across all metrics"""
649
+
650
+ def __init__(self, toolset: "PrometheusToolset"):
651
+ super().__init__(
652
+ name="get_label_values",
653
+ description=(
654
+ "Get all values for a specific label using /api/v1/label/{label}/values. "
655
+ "Use this to discover pods, namespaces, jobs, instances, etc. "
656
+ f"Returns up to {PROMETHEUS_METADATA_API_LIMIT} unique values (limit={PROMETHEUS_METADATA_API_LIMIT}). If {PROMETHEUS_METADATA_API_LIMIT} results returned, more may exist - use match[] to filter. "
657
+ "Supports optional match[] parameter to filter. "
658
+ "By default returns values from metrics active in the last 1 hour (configurable via default_metadata_time_window_hrs)."
659
+ ),
660
+ parameters={
661
+ "label": ToolParameter(
662
+ description="Label name to get values for (e.g., 'pod', 'namespace', 'job', 'instance')",
663
+ type="string",
664
+ required=True,
665
+ ),
666
+ "match": ToolParameter(
667
+ description=(
668
+ "Optional PromQL selector to filter (e.g., '{__name__=~\"kube.*\"}', "
669
+ "'{namespace=\"default\"}')."
670
+ ),
671
+ type="string",
672
+ required=False,
673
+ ),
674
+ "start": ToolParameter(
675
+ description="Start timestamp (RFC3339 or Unix). Default: 1 hour ago",
676
+ type="string",
677
+ required=False,
678
+ ),
679
+ "end": ToolParameter(
680
+ description="End timestamp (RFC3339 or Unix). Default: now",
681
+ type="string",
682
+ required=False,
683
+ ),
684
+ },
685
+ toolset=toolset,
686
+ )
611
687
 
612
- name_filter = params.get("name_filter")
613
- if not name_filter:
688
+ def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
689
+ if not self.toolset.config or not self.toolset.config.prometheus_url:
690
+ return StructuredToolResult(
691
+ status=StructuredToolResultStatus.ERROR,
692
+ error="Prometheus is not configured. Prometheus URL is missing",
693
+ params=params,
694
+ )
695
+ try:
696
+ label = params.get("label")
697
+ if not label:
614
698
  return StructuredToolResult(
615
- status=ToolResultStatus.ERROR,
616
- error="Error: cannot run tool 'list_available_metrics'. The param 'name_filter' is required but is missing.",
699
+ status=StructuredToolResultStatus.ERROR,
700
+ error="Label parameter is required",
617
701
  params=params,
618
702
  )
619
703
 
620
- metrics = fetch_metrics(
621
- prometheus_url=prometheus_url,
622
- cache=self._cache,
623
- metrics_labels_time_window_hrs=metrics_labels_time_window_hrs,
624
- metric_name=name_filter,
625
- should_fetch_labels_with_labels_api=self.toolset.config.fetch_labels_with_labels_api,
626
- should_fetch_metadata_with_series_api=self.toolset.config.fetch_metadata_with_series_api,
704
+ url = urljoin(
705
+ self.toolset.config.prometheus_url, f"api/v1/label/{label}/values"
706
+ )
707
+ query_params = {"limit": str(PROMETHEUS_METADATA_API_LIMIT)}
708
+ if params.get("match"):
709
+ query_params["match[]"] = params["match"]
710
+
711
+ # Add time parameters - use provided values or defaults
712
+ if params.get("end"):
713
+ query_params["end"] = params["end"]
714
+ else:
715
+ query_params["end"] = str(int(time.time()))
716
+
717
+ if params.get("start"):
718
+ query_params["start"] = params["start"]
719
+ elif self.toolset.config.default_metadata_time_window_hrs:
720
+ # Use default time window
721
+ query_params["start"] = str(
722
+ int(time.time())
723
+ - (self.toolset.config.default_metadata_time_window_hrs * 3600)
724
+ )
725
+
726
+ response = do_request(
727
+ config=self.toolset.config,
728
+ url=url,
729
+ params=query_params,
730
+ timeout=self.toolset.config.default_metadata_timeout_seconds,
731
+ verify=self.toolset.config.prometheus_ssl_enabled,
627
732
  headers=self.toolset.config.headers,
733
+ method="GET",
734
+ )
735
+ response.raise_for_status()
736
+ data = response.json()
737
+
738
+ # Check if results were truncated
739
+ if (
740
+ "data" in data
741
+ and isinstance(data["data"], list)
742
+ and len(data["data"]) == PROMETHEUS_METADATA_API_LIMIT
743
+ ):
744
+ data["_truncated"] = True
745
+ data["_message"] = (
746
+ f"Results truncated at limit={PROMETHEUS_METADATA_API_LIMIT}. Use match[] parameter to filter label '{label}' values."
747
+ )
748
+
749
+ return StructuredToolResult(
750
+ status=StructuredToolResultStatus.SUCCESS,
751
+ data=data,
752
+ params=params,
753
+ )
754
+ except Exception as e:
755
+ return StructuredToolResult(
756
+ status=StructuredToolResultStatus.ERROR,
757
+ error=str(e),
758
+ params=params,
759
+ )
760
+
761
+ def get_parameterized_one_liner(self, params) -> str:
762
+ label = params.get("label", "")
763
+ return f"{toolset_name_for_one_liner(self.toolset.name)}: Get {label} Values"
764
+
765
+
766
+ class GetAllLabels(BasePrometheusTool):
767
+ """Get all label names that exist in Prometheus"""
768
+
769
+ def __init__(self, toolset: "PrometheusToolset"):
770
+ super().__init__(
771
+ name="get_all_labels",
772
+ description=(
773
+ "Get list of all label names using /api/v1/labels. "
774
+ "Use this to discover what labels are available across all metrics. "
775
+ f"Returns up to {PROMETHEUS_METADATA_API_LIMIT} label names (limit={PROMETHEUS_METADATA_API_LIMIT}). If {PROMETHEUS_METADATA_API_LIMIT} results returned, more may exist - use match[] to filter. "
776
+ "Supports optional match[] parameter to filter. "
777
+ "By default returns labels from metrics active in the last 1 hour (configurable via default_metadata_time_window_hrs)."
778
+ ),
779
+ parameters={
780
+ "match": ToolParameter(
781
+ description=(
782
+ "Optional PromQL selector to filter (e.g., '{__name__=~\"kube.*\"}', "
783
+ "'{job=\"prometheus\"}')."
784
+ ),
785
+ type="string",
786
+ required=False,
787
+ ),
788
+ "start": ToolParameter(
789
+ description="Start timestamp (RFC3339 or Unix). Default: 1 hour ago",
790
+ type="string",
791
+ required=False,
792
+ ),
793
+ "end": ToolParameter(
794
+ description="End timestamp (RFC3339 or Unix). Default: now",
795
+ type="string",
796
+ required=False,
797
+ ),
798
+ },
799
+ toolset=toolset,
800
+ )
801
+
802
+ def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
803
+ if not self.toolset.config or not self.toolset.config.prometheus_url:
804
+ return StructuredToolResult(
805
+ status=StructuredToolResultStatus.ERROR,
806
+ error="Prometheus is not configured. Prometheus URL is missing",
807
+ params=params,
808
+ )
809
+ try:
810
+ url = urljoin(self.toolset.config.prometheus_url, "api/v1/labels")
811
+ query_params = {"limit": str(PROMETHEUS_METADATA_API_LIMIT)}
812
+ if params.get("match"):
813
+ query_params["match[]"] = params["match"]
814
+
815
+ # Add time parameters - use provided values or defaults
816
+ if params.get("end"):
817
+ query_params["end"] = params["end"]
818
+ else:
819
+ query_params["end"] = str(int(time.time()))
820
+
821
+ if params.get("start"):
822
+ query_params["start"] = params["start"]
823
+ elif self.toolset.config.default_metadata_time_window_hrs:
824
+ # Use default time window
825
+ query_params["start"] = str(
826
+ int(time.time())
827
+ - (self.toolset.config.default_metadata_time_window_hrs * 3600)
828
+ )
829
+
830
+ response = do_request(
628
831
  config=self.toolset.config,
629
- verify_ssl=self.toolset.config.prometheus_ssl_enabled,
832
+ url=url,
833
+ params=query_params,
834
+ timeout=self.toolset.config.default_metadata_timeout_seconds,
835
+ verify=self.toolset.config.prometheus_ssl_enabled,
836
+ headers=self.toolset.config.headers,
837
+ method="GET",
838
+ )
839
+ response.raise_for_status()
840
+ data = response.json()
841
+
842
+ # Check if results were truncated
843
+ if (
844
+ "data" in data
845
+ and isinstance(data["data"], list)
846
+ and len(data["data"]) == PROMETHEUS_METADATA_API_LIMIT
847
+ ):
848
+ data["_truncated"] = True
849
+ data["_message"] = (
850
+ f"Results truncated at limit={PROMETHEUS_METADATA_API_LIMIT}. Use match[] parameter to filter labels."
851
+ )
852
+
853
+ return StructuredToolResult(
854
+ status=StructuredToolResultStatus.SUCCESS,
855
+ data=data,
856
+ params=params,
857
+ )
858
+ except Exception as e:
859
+ return StructuredToolResult(
860
+ status=StructuredToolResultStatus.ERROR,
861
+ error=str(e),
862
+ params=params,
630
863
  )
631
864
 
632
- type_filter = params.get("type_filter")
633
- if type_filter:
634
- metrics = filter_metrics_by_type(metrics, type_filter)
865
+ def get_parameterized_one_liner(self, params) -> str:
866
+ return f"{toolset_name_for_one_liner(self.toolset.name)}: Get All Labels"
635
867
 
636
- output = ["Metric | Description | Type | Labels"]
637
- output.append("-" * 100)
638
868
 
639
- for metric, info in sorted(metrics.items()):
640
- labels_str = (
641
- ", ".join(sorted(info["labels"])) if info["labels"] else "none"
869
+ class GetSeries(BasePrometheusTool):
870
+ """Get time series matching a selector"""
871
+
872
+ def __init__(self, toolset: "PrometheusToolset"):
873
+ super().__init__(
874
+ name="get_series",
875
+ description=(
876
+ "Get time series using /api/v1/series. "
877
+ "Returns label sets for all time series matching the selector. "
878
+ "SLOWER than other discovery methods - use only when you need full label sets. "
879
+ f"Returns up to {PROMETHEUS_METADATA_API_LIMIT} series (limit={PROMETHEUS_METADATA_API_LIMIT}). If {PROMETHEUS_METADATA_API_LIMIT} results returned, more series exist - use more specific selector. "
880
+ "Requires match[] parameter with PromQL selector. "
881
+ "By default returns series active in the last 1 hour (configurable via default_metadata_time_window_hrs)."
882
+ ),
883
+ parameters={
884
+ "match": ToolParameter(
885
+ description=(
886
+ "PromQL selector to match series (e.g., 'up', 'node_cpu_seconds_total', "
887
+ "'{__name__=~\"node.*\"}', '{job=\"prometheus\"}', "
888
+ '\'{__name__="up",job="prometheus"}\').'
889
+ ),
890
+ type="string",
891
+ required=True,
892
+ ),
893
+ "start": ToolParameter(
894
+ description="Start timestamp (RFC3339 or Unix). Default: 1 hour ago",
895
+ type="string",
896
+ required=False,
897
+ ),
898
+ "end": ToolParameter(
899
+ description="End timestamp (RFC3339 or Unix). Default: now",
900
+ type="string",
901
+ required=False,
902
+ ),
903
+ },
904
+ toolset=toolset,
905
+ )
906
+
907
+ def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
908
+ if not self.toolset.config or not self.toolset.config.prometheus_url:
909
+ return StructuredToolResult(
910
+ status=StructuredToolResultStatus.ERROR,
911
+ error="Prometheus is not configured. Prometheus URL is missing",
912
+ params=params,
913
+ )
914
+ try:
915
+ match = params.get("match")
916
+ if not match:
917
+ return StructuredToolResult(
918
+ status=StructuredToolResultStatus.ERROR,
919
+ error="Match parameter is required",
920
+ params=params,
921
+ )
922
+
923
+ url = urljoin(self.toolset.config.prometheus_url, "api/v1/series")
924
+ query_params = {
925
+ "match[]": match,
926
+ "limit": str(PROMETHEUS_METADATA_API_LIMIT),
927
+ }
928
+
929
+ # Add time parameters - use provided values or defaults
930
+ if params.get("end"):
931
+ query_params["end"] = params["end"]
932
+ else:
933
+ query_params["end"] = str(int(time.time()))
934
+
935
+ if params.get("start"):
936
+ query_params["start"] = params["start"]
937
+ elif self.toolset.config.default_metadata_time_window_hrs:
938
+ # Use default time window
939
+ query_params["start"] = str(
940
+ int(time.time())
941
+ - (self.toolset.config.default_metadata_time_window_hrs * 3600)
642
942
  )
643
- output.append(
644
- f"{metric} | {info['description']} | {info['type']} | {labels_str}"
943
+
944
+ response = do_request(
945
+ config=self.toolset.config,
946
+ url=url,
947
+ params=query_params,
948
+ timeout=self.toolset.config.default_metadata_timeout_seconds,
949
+ verify=self.toolset.config.prometheus_ssl_enabled,
950
+ headers=self.toolset.config.headers,
951
+ method="GET",
952
+ )
953
+ response.raise_for_status()
954
+ data = response.json()
955
+
956
+ # Check if results were truncated
957
+ if (
958
+ "data" in data
959
+ and isinstance(data["data"], list)
960
+ and len(data["data"]) == PROMETHEUS_METADATA_API_LIMIT
961
+ ):
962
+ data["_truncated"] = True
963
+ data["_message"] = (
964
+ f"Results truncated at limit={PROMETHEUS_METADATA_API_LIMIT}. Use a more specific match selector to see additional series."
645
965
  )
646
966
 
647
- table_output = "\n".join(output)
648
967
  return StructuredToolResult(
649
- status=ToolResultStatus.SUCCESS,
650
- data=table_output,
968
+ status=StructuredToolResultStatus.SUCCESS,
969
+ data=data,
970
+ params=params,
971
+ )
972
+ except Exception as e:
973
+ return StructuredToolResult(
974
+ status=StructuredToolResultStatus.ERROR,
975
+ error=str(e),
651
976
  params=params,
652
977
  )
653
978
 
654
- except requests.Timeout:
655
- logging.warn("Timeout while fetching prometheus metrics", exc_info=True)
979
+ def get_parameterized_one_liner(self, params) -> str:
980
+ return f"{toolset_name_for_one_liner(self.toolset.name)}: Get Series"
981
+
982
+
983
+ class GetMetricMetadata(BasePrometheusTool):
984
+ """Get metadata (type, description, unit) for metrics"""
985
+
986
+ def __init__(self, toolset: "PrometheusToolset"):
987
+ super().__init__(
988
+ name="get_metric_metadata",
989
+ description=(
990
+ "Get metric metadata using /api/v1/metadata. "
991
+ "Returns type, help text, and unit for metrics. "
992
+ "Use after discovering metric names to get their descriptions. "
993
+ f"Returns up to {PROMETHEUS_METADATA_API_LIMIT} metrics (limit={PROMETHEUS_METADATA_API_LIMIT}). If {PROMETHEUS_METADATA_API_LIMIT} results returned, more may exist - filter by specific metric name. "
994
+ "Supports optional metric name filter."
995
+ ),
996
+ parameters={
997
+ "metric": ToolParameter(
998
+ description=(
999
+ "Optional metric name to filter (e.g., 'up', 'node_cpu_seconds_total'). "
1000
+ "If not provided, returns metadata for all metrics."
1001
+ ),
1002
+ type="string",
1003
+ required=False,
1004
+ ),
1005
+ },
1006
+ toolset=toolset,
1007
+ )
1008
+
1009
+ def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
1010
+ if not self.toolset.config or not self.toolset.config.prometheus_url:
656
1011
  return StructuredToolResult(
657
- status=ToolResultStatus.ERROR,
658
- error="Request timed out while fetching metrics",
1012
+ status=StructuredToolResultStatus.ERROR,
1013
+ error="Prometheus is not configured. Prometheus URL is missing",
659
1014
  params=params,
660
1015
  )
661
- except RequestException as e:
662
- logging.warn("Failed to fetch prometheus metrics", exc_info=True)
1016
+ try:
1017
+ url = urljoin(self.toolset.config.prometheus_url, "api/v1/metadata")
1018
+ query_params = {"limit": str(PROMETHEUS_METADATA_API_LIMIT)}
1019
+
1020
+ if params.get("metric"):
1021
+ query_params["metric"] = params["metric"]
1022
+
1023
+ response = do_request(
1024
+ config=self.toolset.config,
1025
+ url=url,
1026
+ params=query_params,
1027
+ timeout=self.toolset.config.default_metadata_timeout_seconds,
1028
+ verify=self.toolset.config.prometheus_ssl_enabled,
1029
+ headers=self.toolset.config.headers,
1030
+ method="GET",
1031
+ )
1032
+ response.raise_for_status()
1033
+ data = response.json()
1034
+
1035
+ # Check if results were truncated (metadata endpoint returns a dict, not a list)
1036
+ if (
1037
+ "data" in data
1038
+ and isinstance(data["data"], dict)
1039
+ and len(data["data"]) == PROMETHEUS_METADATA_API_LIMIT
1040
+ ):
1041
+ data["_truncated"] = True
1042
+ data["_message"] = (
1043
+ f"Results truncated at limit={PROMETHEUS_METADATA_API_LIMIT}. Use metric parameter to filter by specific metric name."
1044
+ )
1045
+
663
1046
  return StructuredToolResult(
664
- status=ToolResultStatus.ERROR,
665
- error=f"Network error while fetching metrics: {str(e)}",
1047
+ status=StructuredToolResultStatus.SUCCESS,
1048
+ data=data,
666
1049
  params=params,
667
1050
  )
668
1051
  except Exception as e:
669
- logging.warn("Failed to process prometheus metrics", exc_info=True)
670
1052
  return StructuredToolResult(
671
- status=ToolResultStatus.ERROR,
672
- error=f"Unexpected error: {str(e)}",
1053
+ status=StructuredToolResultStatus.ERROR,
1054
+ error=str(e),
673
1055
  params=params,
674
1056
  )
675
1057
 
676
1058
  def get_parameterized_one_liner(self, params) -> str:
677
- name_filter = params.get("name_filter", "")
678
- return f"{toolset_name_for_one_liner(self.toolset.name)}: Search Metrics ({name_filter})"
1059
+ metric = params.get("metric", "all")
1060
+ return (
1061
+ f"{toolset_name_for_one_liner(self.toolset.name)}: Get Metadata ({metric})"
1062
+ )
679
1063
 
680
1064
 
681
1065
  class ExecuteInstantQuery(BasePrometheusTool):
682
1066
  def __init__(self, toolset: "PrometheusToolset"):
683
1067
  super().__init__(
684
1068
  name="execute_prometheus_instant_query",
685
- description="Execute an instant PromQL query",
1069
+ description=(
1070
+ f"Execute an instant PromQL query (single point in time). "
1071
+ f"Default timeout is {DEFAULT_QUERY_TIMEOUT_SECONDS} seconds "
1072
+ f"but can be increased up to {MAX_QUERY_TIMEOUT_SECONDS} seconds for complex/slow queries."
1073
+ ),
686
1074
  parameters={
687
1075
  "query": ToolParameter(
688
1076
  description="The PromQL query",
@@ -694,16 +1082,23 @@ class ExecuteInstantQuery(BasePrometheusTool):
694
1082
  type="string",
695
1083
  required=True,
696
1084
  ),
1085
+ "timeout": ToolParameter(
1086
+ description=(
1087
+ f"Query timeout in seconds. Default: {DEFAULT_QUERY_TIMEOUT_SECONDS}. "
1088
+ f"Maximum: {MAX_QUERY_TIMEOUT_SECONDS}. "
1089
+ f"Increase for complex queries that may take longer."
1090
+ ),
1091
+ type="number",
1092
+ required=False,
1093
+ ),
697
1094
  },
698
1095
  toolset=toolset,
699
1096
  )
700
1097
 
701
- def _invoke(
702
- self, params: dict, user_approved: bool = False
703
- ) -> StructuredToolResult:
1098
+ def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
704
1099
  if not self.toolset.config or not self.toolset.config.prometheus_url:
705
1100
  return StructuredToolResult(
706
- status=ToolResultStatus.ERROR,
1101
+ status=StructuredToolResultStatus.ERROR,
707
1102
  error="Prometheus is not configured. Prometheus URL is missing",
708
1103
  params=params,
709
1104
  )
@@ -715,12 +1110,24 @@ class ExecuteInstantQuery(BasePrometheusTool):
715
1110
 
716
1111
  payload = {"query": query}
717
1112
 
1113
+ # Get timeout parameter and enforce limits
1114
+ default_timeout = self.toolset.config.default_query_timeout_seconds
1115
+ max_timeout = self.toolset.config.max_query_timeout_seconds
1116
+ timeout = params.get("timeout", default_timeout)
1117
+ if timeout > max_timeout:
1118
+ timeout = max_timeout
1119
+ logging.warning(
1120
+ f"Timeout requested ({params.get('timeout')}) exceeds maximum ({max_timeout}s), using {max_timeout}s"
1121
+ )
1122
+ elif timeout < 1:
1123
+ timeout = default_timeout # Min 1 second, but use default if invalid
1124
+
718
1125
  response = do_request(
719
1126
  config=self.toolset.config,
720
1127
  url=url,
721
1128
  headers=self.toolset.config.headers,
722
1129
  data=payload,
723
- timeout=60,
1130
+ timeout=timeout,
724
1131
  verify=self.toolset.config.prometheus_ssl_enabled,
725
1132
  method="POST",
726
1133
  )
@@ -734,24 +1141,64 @@ class ExecuteInstantQuery(BasePrometheusTool):
734
1141
  error_message = (
735
1142
  "The prometheus query returned no result. Is the query correct?"
736
1143
  )
737
- response_data = {
738
- "status": status,
739
- "error_message": error_message,
740
- "random_key": generate_random_key(),
741
- "tool_name": self.name,
742
- "description": description,
743
- "query": query,
744
- }
745
-
1144
+ response_data = MetricsBasedResponse(
1145
+ status=status,
1146
+ error_message=error_message,
1147
+ random_key=generate_random_key(),
1148
+ tool_name=self.name,
1149
+ description=description,
1150
+ query=query,
1151
+ )
1152
+ structured_tool_result: StructuredToolResult
1153
+ # Check if data should be included based on size
746
1154
  if self.toolset.config.tool_calls_return_data:
747
- response_data["data"] = data.get("data")
1155
+ result_data = data.get("data", {})
1156
+ response_data.data = result_data
748
1157
 
749
- data_str = json.dumps(response_data, indent=2)
750
- return StructuredToolResult(
751
- status=ToolResultStatus.SUCCESS,
752
- data=data_str,
753
- params=params,
1158
+ structured_tool_result = create_structured_tool_result(
1159
+ params=params, response=response_data
1160
+ )
1161
+ token_count = count_tool_response_tokens(
1162
+ llm=context.llm, structured_tool_result=structured_tool_result
1163
+ )
1164
+
1165
+ token_limit = context.max_token_count
1166
+ if self.toolset.config.query_response_size_limit_pct:
1167
+ custom_token_limit = get_pct_token_count(
1168
+ percent_of_total_context_window=self.toolset.config.query_response_size_limit_pct,
1169
+ llm=context.llm,
1170
+ )
1171
+ if custom_token_limit < token_limit:
1172
+ token_limit = custom_token_limit
1173
+
1174
+ # Provide summary if data is too large
1175
+ if token_count > token_limit:
1176
+ response_data.data = None
1177
+ response_data.data_summary = (
1178
+ create_data_summary_for_large_result(
1179
+ result_data,
1180
+ query,
1181
+ token_count,
1182
+ is_range_query=False,
1183
+ )
1184
+ )
1185
+ logging.info(
1186
+ f"Prometheus instant query returned large dataset: "
1187
+ f"{response_data.data_summary.get('result_count', 0)} results, "
1188
+ f"{token_count:,} tokens (limit: {token_limit:,}). "
1189
+ f"Returning summary instead of full data."
1190
+ )
1191
+ # Also add token info to the summary for debugging
1192
+ response_data.data_summary["_debug_info"] = (
1193
+ f"Data size: {token_count:,} tokens exceeded limit of {token_limit:,} tokens"
1194
+ )
1195
+ else:
1196
+ response_data.data = result_data
1197
+
1198
+ structured_tool_result = create_structured_tool_result(
1199
+ params=params, response=response_data
754
1200
  )
1201
+ return structured_tool_result
755
1202
 
756
1203
  # Handle known Prometheus error status codes
757
1204
  error_msg = "Unknown error occurred"
@@ -764,14 +1211,14 @@ class ExecuteInstantQuery(BasePrometheusTool):
764
1211
  except json.JSONDecodeError:
765
1212
  pass
766
1213
  return StructuredToolResult(
767
- status=ToolResultStatus.ERROR,
1214
+ status=StructuredToolResultStatus.ERROR,
768
1215
  error=f"Query execution failed. HTTP {response.status_code}: {error_msg}",
769
1216
  params=params,
770
1217
  )
771
1218
 
772
1219
  # For other status codes, just return the status code and content
773
1220
  return StructuredToolResult(
774
- status=ToolResultStatus.ERROR,
1221
+ status=StructuredToolResultStatus.ERROR,
775
1222
  error=f"Query execution failed with unexpected status code: {response.status_code}. Response: {str(response.content)}",
776
1223
  params=params,
777
1224
  )
@@ -779,14 +1226,14 @@ class ExecuteInstantQuery(BasePrometheusTool):
779
1226
  except RequestException as e:
780
1227
  logging.info("Failed to connect to Prometheus", exc_info=True)
781
1228
  return StructuredToolResult(
782
- status=ToolResultStatus.ERROR,
1229
+ status=StructuredToolResultStatus.ERROR,
783
1230
  error=f"Connection error to Prometheus: {str(e)}",
784
1231
  params=params,
785
1232
  )
786
1233
  except Exception as e:
787
1234
  logging.info("Failed to connect to Prometheus", exc_info=True)
788
1235
  return StructuredToolResult(
789
- status=ToolResultStatus.ERROR,
1236
+ status=StructuredToolResultStatus.ERROR,
790
1237
  error=f"Unexpected error executing query: {str(e)}",
791
1238
  params=params,
792
1239
  )
@@ -800,7 +1247,12 @@ class ExecuteRangeQuery(BasePrometheusTool):
800
1247
  def __init__(self, toolset: "PrometheusToolset"):
801
1248
  super().__init__(
802
1249
  name="execute_prometheus_range_query",
803
- description="Generates a graph and Execute a PromQL range query",
1250
+ description=(
1251
+ f"Generates a graph and Execute a PromQL range query. "
1252
+ f"Default timeout is {DEFAULT_QUERY_TIMEOUT_SECONDS} seconds "
1253
+ f"but can be increased up to {MAX_QUERY_TIMEOUT_SECONDS} seconds for complex/slow queries. "
1254
+ f"Default time range is last 1 hour."
1255
+ ),
804
1256
  parameters={
805
1257
  "query": ToolParameter(
806
1258
  description="The PromQL query",
@@ -827,23 +1279,40 @@ class ExecuteRangeQuery(BasePrometheusTool):
827
1279
  "step": ToolParameter(
828
1280
  description="Query resolution step width in duration format or float number of seconds",
829
1281
  type="number",
830
- required=True,
1282
+ required=False,
831
1283
  ),
832
1284
  "output_type": ToolParameter(
833
1285
  description="Specifies how to interpret the Prometheus result. Use 'Plain' for raw values, 'Bytes' to format byte values, 'Percentage' to scale 0–1 values into 0–100%, or 'CPUUsage' to convert values to cores (e.g., 500 becomes 500m, 2000 becomes 2).",
834
1286
  type="string",
835
1287
  required=True,
836
1288
  ),
1289
+ "timeout": ToolParameter(
1290
+ description=(
1291
+ f"Query timeout in seconds. Default: {DEFAULT_QUERY_TIMEOUT_SECONDS}. "
1292
+ f"Maximum: {MAX_QUERY_TIMEOUT_SECONDS}. "
1293
+ f"Increase for complex queries that may take longer."
1294
+ ),
1295
+ type="number",
1296
+ required=False,
1297
+ ),
1298
+ "max_points": ToolParameter(
1299
+ description=(
1300
+ f"Maximum number of data points to return. Default: {int(MAX_GRAPH_POINTS)}. "
1301
+ f"Can be reduced to get fewer data points (e.g., 50 for simpler graphs). "
1302
+ f"Cannot exceed system limit of {int(MAX_GRAPH_POINTS)}. "
1303
+ f"If your query would return more points than this limit, the step will be automatically adjusted."
1304
+ ),
1305
+ type="number",
1306
+ required=False,
1307
+ ),
837
1308
  },
838
1309
  toolset=toolset,
839
1310
  )
840
1311
 
841
- def _invoke(
842
- self, params: dict, user_approved: bool = False
843
- ) -> StructuredToolResult:
1312
+ def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
844
1313
  if not self.toolset.config or not self.toolset.config.prometheus_url:
845
1314
  return StructuredToolResult(
846
- status=ToolResultStatus.ERROR,
1315
+ status=StructuredToolResultStatus.ERROR,
847
1316
  error="Prometheus is not configured. Prometheus URL is missing",
848
1317
  params=params,
849
1318
  )
@@ -857,12 +1326,17 @@ class ExecuteRangeQuery(BasePrometheusTool):
857
1326
  end_timestamp=params.get("end"),
858
1327
  default_time_span_seconds=DEFAULT_GRAPH_TIME_SPAN_SECONDS,
859
1328
  )
860
- step = params.get("step", "")
1329
+ step = parse_duration_to_seconds(params.get("step"))
1330
+ max_points = params.get(
1331
+ "max_points"
1332
+ ) # Get the optional max_points parameter
861
1333
 
1334
+ # adjust_step_for_max_points handles None case and converts to float
862
1335
  step = adjust_step_for_max_points(
863
1336
  start_timestamp=start,
864
1337
  end_timestamp=end,
865
- step=float(step) if step else MAX_GRAPH_POINTS,
1338
+ step=step,
1339
+ max_points_override=max_points,
866
1340
  )
867
1341
 
868
1342
  description = params.get("description", "")
@@ -874,12 +1348,24 @@ class ExecuteRangeQuery(BasePrometheusTool):
874
1348
  "step": step,
875
1349
  }
876
1350
 
1351
+ # Get timeout parameter and enforce limits
1352
+ default_timeout = self.toolset.config.default_query_timeout_seconds
1353
+ max_timeout = self.toolset.config.max_query_timeout_seconds
1354
+ timeout = params.get("timeout", default_timeout)
1355
+ if timeout > max_timeout:
1356
+ timeout = max_timeout
1357
+ logging.warning(
1358
+ f"Timeout requested ({params.get('timeout')}) exceeds maximum ({max_timeout}s), using {max_timeout}s"
1359
+ )
1360
+ elif timeout < 1:
1361
+ timeout = default_timeout # Min 1 second, but use default if invalid
1362
+
877
1363
  response = do_request(
878
1364
  config=self.toolset.config,
879
1365
  url=url,
880
1366
  headers=self.toolset.config.headers,
881
1367
  data=payload,
882
- timeout=120,
1368
+ timeout=timeout,
883
1369
  verify=self.toolset.config.prometheus_ssl_enabled,
884
1370
  method="POST",
885
1371
  )
@@ -893,29 +1379,69 @@ class ExecuteRangeQuery(BasePrometheusTool):
893
1379
  error_message = (
894
1380
  "The prometheus query returned no result. Is the query correct?"
895
1381
  )
896
- response_data = {
897
- "status": status,
898
- "error_message": error_message,
899
- "random_key": generate_random_key(),
900
- "tool_name": self.name,
901
- "description": description,
902
- "query": query,
903
- "start": start,
904
- "end": end,
905
- "step": step,
906
- "output_type": output_type,
907
- }
1382
+ response_data = MetricsBasedResponse(
1383
+ status=status,
1384
+ error_message=error_message,
1385
+ random_key=generate_random_key(),
1386
+ tool_name=self.name,
1387
+ description=description,
1388
+ query=query,
1389
+ start=start,
1390
+ end=end,
1391
+ step=step,
1392
+ output_type=output_type,
1393
+ )
908
1394
 
1395
+ structured_tool_result: StructuredToolResult
1396
+
1397
+ # Check if data should be included based on size
909
1398
  if self.toolset.config.tool_calls_return_data:
910
- response_data["data"] = data.get("data")
911
- data_str = json.dumps(response_data, indent=2)
1399
+ result_data = data.get("data", {})
1400
+ response_data.data = result_data
1401
+ structured_tool_result = create_structured_tool_result(
1402
+ params=params, response=response_data
1403
+ )
912
1404
 
913
- return StructuredToolResult(
914
- status=ToolResultStatus.SUCCESS,
915
- data=data_str,
916
- params=params,
1405
+ token_count = count_tool_response_tokens(
1406
+ llm=context.llm, structured_tool_result=structured_tool_result
1407
+ )
1408
+
1409
+ token_limit = context.max_token_count
1410
+ if self.toolset.config.query_response_size_limit_pct:
1411
+ custom_token_limit = get_pct_token_count(
1412
+ percent_of_total_context_window=self.toolset.config.query_response_size_limit_pct,
1413
+ llm=context.llm,
1414
+ )
1415
+ if custom_token_limit < token_limit:
1416
+ token_limit = custom_token_limit
1417
+
1418
+ # Provide summary if data is too large
1419
+ if token_count > token_limit:
1420
+ response_data.data = None
1421
+ response_data.data_summary = (
1422
+ create_data_summary_for_large_result(
1423
+ result_data, query, token_count, is_range_query=True
1424
+ )
1425
+ )
1426
+ logging.info(
1427
+ f"Prometheus range query returned large dataset: "
1428
+ f"{response_data.data_summary.get('series_count', 0)} series, "
1429
+ f"{token_count:,} tokens (limit: {token_limit:,}). "
1430
+ f"Returning summary instead of full data."
1431
+ )
1432
+ # Also add character info to the summary for debugging
1433
+ response_data.data_summary["_debug_info"] = (
1434
+ f"Data size: {token_count:,} tokens exceeded limit of {token_limit:,} tokens"
1435
+ )
1436
+ else:
1437
+ response_data.data = result_data
1438
+
1439
+ structured_tool_result = create_structured_tool_result(
1440
+ params=params, response=response_data
917
1441
  )
918
1442
 
1443
+ return structured_tool_result
1444
+
919
1445
  error_msg = "Unknown error occurred"
920
1446
  if response.status_code in [400, 429]:
921
1447
  try:
@@ -926,13 +1452,13 @@ class ExecuteRangeQuery(BasePrometheusTool):
926
1452
  except json.JSONDecodeError:
927
1453
  pass
928
1454
  return StructuredToolResult(
929
- status=ToolResultStatus.ERROR,
1455
+ status=StructuredToolResultStatus.ERROR,
930
1456
  error=f"Query execution failed. HTTP {response.status_code}: {error_msg}",
931
1457
  params=params,
932
1458
  )
933
1459
 
934
1460
  return StructuredToolResult(
935
- status=ToolResultStatus.ERROR,
1461
+ status=StructuredToolResultStatus.ERROR,
936
1462
  error=f"Query execution failed with unexpected status code: {response.status_code}. Response: {str(response.content)}",
937
1463
  params=params,
938
1464
  )
@@ -940,14 +1466,14 @@ class ExecuteRangeQuery(BasePrometheusTool):
940
1466
  except RequestException as e:
941
1467
  logging.info("Failed to connect to Prometheus", exc_info=True)
942
1468
  return StructuredToolResult(
943
- status=ToolResultStatus.ERROR,
1469
+ status=StructuredToolResultStatus.ERROR,
944
1470
  error=f"Connection error to Prometheus: {str(e)}",
945
1471
  params=params,
946
1472
  )
947
1473
  except Exception as e:
948
1474
  logging.info("Failed to connect to Prometheus", exc_info=True)
949
1475
  return StructuredToolResult(
950
- status=ToolResultStatus.ERROR,
1476
+ status=StructuredToolResultStatus.ERROR,
951
1477
  error=f"Unexpected error executing query: {str(e)}",
952
1478
  params=params,
953
1479
  )
@@ -969,7 +1495,11 @@ class PrometheusToolset(Toolset):
969
1495
  prerequisites=[CallablePrerequisite(callable=self.prerequisites_callable)],
970
1496
  tools=[
971
1497
  ListPrometheusRules(toolset=self),
972
- ListAvailableMetrics(toolset=self),
1498
+ GetMetricNames(toolset=self),
1499
+ GetLabelValues(toolset=self),
1500
+ GetAllLabels(toolset=self),
1501
+ GetSeries(toolset=self),
1502
+ GetMetricMetadata(toolset=self),
973
1503
  ExecuteInstantQuery(toolset=self),
974
1504
  ExecuteRangeQuery(toolset=self),
975
1505
  ],
@@ -1060,13 +1590,8 @@ class PrometheusToolset(Toolset):
1060
1590
  f"Failed to connect to Prometheus at {url}: HTTP {response.status_code}",
1061
1591
  )
1062
1592
 
1063
- except RequestException:
1064
- return (
1065
- False,
1066
- f"Failed to initialize using url={url}",
1067
- )
1068
1593
  except Exception as e:
1069
- logging.exception("Failed to initialize Prometheus")
1594
+ logging.debug("Failed to initialize Prometheus", exc_info=True)
1070
1595
  return (
1071
1596
  False,
1072
1597
  f"Failed to initialize using url={url}. Unexpected error: {str(e)}",