holmesgpt 0.13.2__py3-none-any.whl → 0.18.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (188)
  1. holmes/__init__.py +3 -5
  2. holmes/clients/robusta_client.py +20 -6
  3. holmes/common/env_vars.py +58 -3
  4. holmes/common/openshift.py +1 -1
  5. holmes/config.py +123 -148
  6. holmes/core/conversations.py +71 -15
  7. holmes/core/feedback.py +191 -0
  8. holmes/core/investigation.py +31 -39
  9. holmes/core/investigation_structured_output.py +3 -3
  10. holmes/core/issue.py +1 -1
  11. holmes/core/llm.py +508 -88
  12. holmes/core/models.py +108 -4
  13. holmes/core/openai_formatting.py +14 -1
  14. holmes/core/prompt.py +48 -3
  15. holmes/core/runbooks.py +1 -0
  16. holmes/core/safeguards.py +8 -6
  17. holmes/core/supabase_dal.py +295 -100
  18. holmes/core/tool_calling_llm.py +489 -428
  19. holmes/core/tools.py +325 -56
  20. holmes/core/tools_utils/token_counting.py +21 -0
  21. holmes/core/tools_utils/tool_context_window_limiter.py +40 -0
  22. holmes/core/tools_utils/tool_executor.py +0 -13
  23. holmes/core/tools_utils/toolset_utils.py +1 -0
  24. holmes/core/toolset_manager.py +191 -5
  25. holmes/core/tracing.py +19 -3
  26. holmes/core/transformers/__init__.py +23 -0
  27. holmes/core/transformers/base.py +63 -0
  28. holmes/core/transformers/llm_summarize.py +175 -0
  29. holmes/core/transformers/registry.py +123 -0
  30. holmes/core/transformers/transformer.py +32 -0
  31. holmes/core/truncation/compaction.py +94 -0
  32. holmes/core/truncation/dal_truncation_utils.py +23 -0
  33. holmes/core/truncation/input_context_window_limiter.py +219 -0
  34. holmes/interactive.py +228 -31
  35. holmes/main.py +23 -40
  36. holmes/plugins/interfaces.py +2 -1
  37. holmes/plugins/prompts/__init__.py +2 -1
  38. holmes/plugins/prompts/_fetch_logs.jinja2 +31 -6
  39. holmes/plugins/prompts/_general_instructions.jinja2 +1 -2
  40. holmes/plugins/prompts/_runbook_instructions.jinja2 +24 -12
  41. holmes/plugins/prompts/base_user_prompt.jinja2 +7 -0
  42. holmes/plugins/prompts/conversation_history_compaction.jinja2 +89 -0
  43. holmes/plugins/prompts/generic_ask.jinja2 +0 -4
  44. holmes/plugins/prompts/generic_ask_conversation.jinja2 +0 -1
  45. holmes/plugins/prompts/generic_ask_for_issue_conversation.jinja2 +0 -1
  46. holmes/plugins/prompts/generic_investigation.jinja2 +0 -1
  47. holmes/plugins/prompts/investigation_procedure.jinja2 +50 -1
  48. holmes/plugins/prompts/kubernetes_workload_ask.jinja2 +0 -1
  49. holmes/plugins/prompts/kubernetes_workload_chat.jinja2 +0 -1
  50. holmes/plugins/runbooks/__init__.py +145 -17
  51. holmes/plugins/runbooks/catalog.json +2 -0
  52. holmes/plugins/sources/github/__init__.py +4 -2
  53. holmes/plugins/sources/prometheus/models.py +1 -0
  54. holmes/plugins/toolsets/__init__.py +44 -27
  55. holmes/plugins/toolsets/aks-node-health.yaml +46 -0
  56. holmes/plugins/toolsets/aks.yaml +64 -0
  57. holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +38 -47
  58. holmes/plugins/toolsets/azure_sql/apis/alert_monitoring_api.py +3 -2
  59. holmes/plugins/toolsets/azure_sql/apis/azure_sql_api.py +2 -1
  60. holmes/plugins/toolsets/azure_sql/apis/connection_failure_api.py +3 -2
  61. holmes/plugins/toolsets/azure_sql/apis/connection_monitoring_api.py +3 -1
  62. holmes/plugins/toolsets/azure_sql/apis/storage_analysis_api.py +3 -1
  63. holmes/plugins/toolsets/azure_sql/azure_sql_toolset.py +12 -13
  64. holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +15 -12
  65. holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +15 -12
  66. holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +11 -11
  67. holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +11 -9
  68. holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +15 -12
  69. holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +15 -15
  70. holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +11 -8
  71. holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +11 -8
  72. holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +11 -8
  73. holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +11 -8
  74. holmes/plugins/toolsets/azure_sql/utils.py +0 -32
  75. holmes/plugins/toolsets/bash/argocd/__init__.py +3 -3
  76. holmes/plugins/toolsets/bash/aws/__init__.py +4 -4
  77. holmes/plugins/toolsets/bash/azure/__init__.py +4 -4
  78. holmes/plugins/toolsets/bash/bash_toolset.py +11 -15
  79. holmes/plugins/toolsets/bash/common/bash.py +23 -13
  80. holmes/plugins/toolsets/bash/common/bash_command.py +1 -1
  81. holmes/plugins/toolsets/bash/common/stringify.py +1 -1
  82. holmes/plugins/toolsets/bash/kubectl/__init__.py +2 -1
  83. holmes/plugins/toolsets/bash/kubectl/constants.py +0 -1
  84. holmes/plugins/toolsets/bash/kubectl/kubectl_get.py +3 -4
  85. holmes/plugins/toolsets/bash/parse_command.py +12 -13
  86. holmes/plugins/toolsets/cilium.yaml +284 -0
  87. holmes/plugins/toolsets/connectivity_check.py +124 -0
  88. holmes/plugins/toolsets/coralogix/api.py +132 -119
  89. holmes/plugins/toolsets/coralogix/coralogix.jinja2 +14 -0
  90. holmes/plugins/toolsets/coralogix/toolset_coralogix.py +219 -0
  91. holmes/plugins/toolsets/coralogix/utils.py +15 -79
  92. holmes/plugins/toolsets/datadog/datadog_api.py +525 -26
  93. holmes/plugins/toolsets/datadog/datadog_logs_instructions.jinja2 +55 -11
  94. holmes/plugins/toolsets/datadog/datadog_metrics_instructions.jinja2 +3 -3
  95. holmes/plugins/toolsets/datadog/datadog_models.py +59 -0
  96. holmes/plugins/toolsets/datadog/datadog_url_utils.py +213 -0
  97. holmes/plugins/toolsets/datadog/instructions_datadog_traces.jinja2 +165 -28
  98. holmes/plugins/toolsets/datadog/toolset_datadog_general.py +417 -241
  99. holmes/plugins/toolsets/datadog/toolset_datadog_logs.py +234 -214
  100. holmes/plugins/toolsets/datadog/toolset_datadog_metrics.py +167 -79
  101. holmes/plugins/toolsets/datadog/toolset_datadog_traces.py +374 -363
  102. holmes/plugins/toolsets/elasticsearch/__init__.py +6 -0
  103. holmes/plugins/toolsets/elasticsearch/elasticsearch.py +834 -0
  104. holmes/plugins/toolsets/elasticsearch/opensearch_ppl_query_docs.jinja2 +1616 -0
  105. holmes/plugins/toolsets/elasticsearch/opensearch_query_assist.py +78 -0
  106. holmes/plugins/toolsets/elasticsearch/opensearch_query_assist_instructions.jinja2 +223 -0
  107. holmes/plugins/toolsets/git.py +54 -50
  108. holmes/plugins/toolsets/grafana/base_grafana_toolset.py +16 -4
  109. holmes/plugins/toolsets/grafana/common.py +13 -29
  110. holmes/plugins/toolsets/grafana/grafana_tempo_api.py +455 -0
  111. holmes/plugins/toolsets/grafana/loki/instructions.jinja2 +25 -0
  112. holmes/plugins/toolsets/grafana/loki/toolset_grafana_loki.py +191 -0
  113. holmes/plugins/toolsets/grafana/loki_api.py +4 -0
  114. holmes/plugins/toolsets/grafana/toolset_grafana.py +293 -89
  115. holmes/plugins/toolsets/grafana/toolset_grafana_dashboard.jinja2 +49 -0
  116. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.jinja2 +246 -11
  117. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +820 -292
  118. holmes/plugins/toolsets/grafana/trace_parser.py +4 -3
  119. holmes/plugins/toolsets/internet/internet.py +15 -16
  120. holmes/plugins/toolsets/internet/notion.py +9 -11
  121. holmes/plugins/toolsets/investigator/core_investigation.py +44 -36
  122. holmes/plugins/toolsets/investigator/model.py +3 -1
  123. holmes/plugins/toolsets/json_filter_mixin.py +134 -0
  124. holmes/plugins/toolsets/kafka.py +36 -42
  125. holmes/plugins/toolsets/kubernetes.yaml +317 -113
  126. holmes/plugins/toolsets/kubernetes_logs.py +9 -9
  127. holmes/plugins/toolsets/kubernetes_logs.yaml +32 -0
  128. holmes/plugins/toolsets/logging_utils/logging_api.py +94 -8
  129. holmes/plugins/toolsets/mcp/toolset_mcp.py +218 -64
  130. holmes/plugins/toolsets/newrelic/new_relic_api.py +165 -0
  131. holmes/plugins/toolsets/newrelic/newrelic.jinja2 +65 -0
  132. holmes/plugins/toolsets/newrelic/newrelic.py +320 -0
  133. holmes/plugins/toolsets/openshift.yaml +283 -0
  134. holmes/plugins/toolsets/prometheus/prometheus.py +1202 -421
  135. holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2 +54 -5
  136. holmes/plugins/toolsets/prometheus/utils.py +28 -0
  137. holmes/plugins/toolsets/rabbitmq/api.py +23 -4
  138. holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +13 -14
  139. holmes/plugins/toolsets/robusta/robusta.py +239 -68
  140. holmes/plugins/toolsets/robusta/robusta_instructions.jinja2 +26 -9
  141. holmes/plugins/toolsets/runbook/runbook_fetcher.py +157 -27
  142. holmes/plugins/toolsets/service_discovery.py +1 -1
  143. holmes/plugins/toolsets/servicenow_tables/instructions.jinja2 +83 -0
  144. holmes/plugins/toolsets/servicenow_tables/servicenow_tables.py +426 -0
  145. holmes/plugins/toolsets/utils.py +88 -0
  146. holmes/utils/config_utils.py +91 -0
  147. holmes/utils/connection_utils.py +31 -0
  148. holmes/utils/console/result.py +10 -0
  149. holmes/utils/default_toolset_installation_guide.jinja2 +1 -22
  150. holmes/utils/env.py +7 -0
  151. holmes/utils/file_utils.py +2 -1
  152. holmes/utils/global_instructions.py +60 -11
  153. holmes/utils/holmes_status.py +6 -4
  154. holmes/utils/holmes_sync_toolsets.py +0 -2
  155. holmes/utils/krr_utils.py +188 -0
  156. holmes/utils/log.py +15 -0
  157. holmes/utils/markdown_utils.py +2 -3
  158. holmes/utils/memory_limit.py +58 -0
  159. holmes/utils/sentry_helper.py +64 -0
  160. holmes/utils/stream.py +69 -8
  161. holmes/utils/tags.py +4 -3
  162. holmes/version.py +37 -15
  163. holmesgpt-0.18.4.dist-info/LICENSE +178 -0
  164. {holmesgpt-0.13.2.dist-info → holmesgpt-0.18.4.dist-info}/METADATA +35 -31
  165. holmesgpt-0.18.4.dist-info/RECORD +258 -0
  166. holmes/core/performance_timing.py +0 -72
  167. holmes/plugins/toolsets/aws.yaml +0 -80
  168. holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +0 -112
  169. holmes/plugins/toolsets/datadog/datadog_traces_formatter.py +0 -310
  170. holmes/plugins/toolsets/datadog/toolset_datadog_rds.py +0 -739
  171. holmes/plugins/toolsets/grafana/grafana_api.py +0 -42
  172. holmes/plugins/toolsets/grafana/tempo_api.py +0 -124
  173. holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +0 -110
  174. holmes/plugins/toolsets/newrelic.py +0 -231
  175. holmes/plugins/toolsets/opensearch/opensearch.py +0 -257
  176. holmes/plugins/toolsets/opensearch/opensearch_logs.py +0 -161
  177. holmes/plugins/toolsets/opensearch/opensearch_traces.py +0 -218
  178. holmes/plugins/toolsets/opensearch/opensearch_traces_instructions.jinja2 +0 -12
  179. holmes/plugins/toolsets/opensearch/opensearch_utils.py +0 -166
  180. holmes/plugins/toolsets/servicenow/install.md +0 -37
  181. holmes/plugins/toolsets/servicenow/instructions.jinja2 +0 -3
  182. holmes/plugins/toolsets/servicenow/servicenow.py +0 -219
  183. holmes/utils/keygen_utils.py +0 -6
  184. holmesgpt-0.13.2.dist-info/LICENSE.txt +0 -21
  185. holmesgpt-0.13.2.dist-info/RECORD +0 -234
  186. /holmes/plugins/toolsets/{opensearch → newrelic}/__init__.py +0 -0
  187. {holmesgpt-0.13.2.dist-info → holmesgpt-0.18.4.dist-info}/WHEEL +0 -0
  188. {holmesgpt-0.13.2.dist-info → holmesgpt-0.18.4.dist-info}/entry_points.txt +0 -0
holmes/plugins/toolsets/prometheus/prometheus.py
@@ -1,27 +1,41 @@
 import json
 import logging
 import os
-import re
 import time
-import dateutil.parser
-from typing import Any, Dict, List, Optional, Tuple, Type, Union
+from typing import Any, Dict, Optional, Tuple, Type, Union
 from urllib.parse import urljoin
 
+import dateutil.parser
 import requests  # type: ignore
-from pydantic import BaseModel, field_validator, Field, model_validator
-from requests import RequestException
+from prometrix.auth import PrometheusAuthorization
 from prometrix.connect.aws_connect import AWSPrometheusConnect
+from prometrix.models.prometheus_config import (
+    AzurePrometheusConfig as PrometrixAzureConfig,
+)
 from prometrix.models.prometheus_config import PrometheusConfig as BasePrometheusConfig
+from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
+from requests import RequestException
+from requests.exceptions import SSLError  # type: ignore
+
+from holmes.common.env_vars import IS_OPENSHIFT, MAX_GRAPH_POINTS
+from holmes.common.openshift import load_openshift_token
 from holmes.core.tools import (
     CallablePrerequisite,
     StructuredToolResult,
+    StructuredToolResultStatus,
     Tool,
+    ToolInvokeContext,
     ToolParameter,
-    ToolResultStatus,
     Toolset,
     ToolsetTag,
 )
+from holmes.core.tools_utils.token_counting import count_tool_response_tokens
+from holmes.core.tools_utils.tool_context_window_limiter import get_pct_token_count
 from holmes.plugins.toolsets.consts import STANDARD_END_DATETIME_TOOL_PARAM_DESCRIPTION
+from holmes.plugins.toolsets.logging_utils.logging_api import (
+    DEFAULT_GRAPH_TIME_SPAN_SECONDS,
+)
+from holmes.plugins.toolsets.prometheus.utils import parse_duration_to_seconds
 from holmes.plugins.toolsets.service_discovery import PrometheusDiscovery
 from holmes.plugins.toolsets.utils import (
     get_param_or_raise,
@@ -30,31 +44,76 @@ from holmes.plugins.toolsets.utils import (
     toolset_name_for_one_liner,
 )
 from holmes.utils.cache import TTLCache
-from holmes.common.env_vars import IS_OPENSHIFT, MAX_GRAPH_POINTS
-from holmes.common.openshift import load_openshift_token
-from holmes.plugins.toolsets.logging_utils.logging_api import (
-    DEFAULT_GRAPH_TIME_SPAN_SECONDS,
-)
-from holmes.utils.keygen_utils import generate_random_key
 
 PROMETHEUS_RULES_CACHE_KEY = "cached_prometheus_rules"
+PROMETHEUS_METADATA_API_LIMIT = 100  # Default limit for Prometheus metadata APIs (series, labels, metadata) to prevent overwhelming responses
+# Default timeout values for PromQL queries
+DEFAULT_QUERY_TIMEOUT_SECONDS = 20
+MAX_QUERY_TIMEOUT_SECONDS = 180
+# Default timeout for metadata API calls (discovery endpoints)
+DEFAULT_METADATA_TIMEOUT_SECONDS = 20
+MAX_METADATA_TIMEOUT_SECONDS = 60
+# Default time window for metadata APIs (in hours)
+DEFAULT_METADATA_TIME_WINDOW_HRS = 1
+
+
+def format_ssl_error_message(prometheus_url: str, error: SSLError) -> str:
+    """Format a clear SSL error message with remediation steps."""
+    return (
+        f"SSL certificate verification failed when connecting to Prometheus at {prometheus_url}. "
+        f"Error: {str(error)}. "
+        f"To disable SSL verification, set 'verify_ssl: false' in your configuration. "
+        f"For Helm deployments, add this to your values.yaml:\n"
+        f"  toolsets:\n"
+        f"    prometheus/metrics:\n"
+        f"      config:\n"
+        f"        verify_ssl: false"
+    )
 
 
 class PrometheusConfig(BaseModel):
+    """Prometheus toolset configuration.
+
+    Deprecated config names (still accepted but not in schema):
+    - default_metadata_time_window_hrs -> discover_metrics_from_last_hours
+    - default_query_timeout_seconds -> query_timeout_seconds_default
+    - max_query_timeout_seconds -> query_timeout_seconds_hard_max
+    - default_metadata_timeout_seconds -> metadata_timeout_seconds_default
+    - max_metadata_timeout_seconds -> metadata_timeout_seconds_hard_max
+    - metrics_labels_time_window_hrs -> discover_metrics_from_last_hours
+    - prometheus_ssl_enabled -> verify_ssl
+    - metrics_labels_cache_duration_hrs (no longer used)
+    - fetch_labels_with_labels_api (no longer used)
+    - fetch_metadata_with_series_api (no longer used)
+    """
+
+    model_config = ConfigDict(extra="allow")
+
     # URL is optional because it can be set with an env var
-    prometheus_url: Optional[str]
-    healthcheck: str = "-/healthy"
-    # Setting to None will remove the time window from the request for labels
-    metrics_labels_time_window_hrs: Union[int, None] = 48
-    # Setting to None will disable the cache
-    metrics_labels_cache_duration_hrs: Union[int, None] = 12
-    fetch_labels_with_labels_api: bool = False
-    fetch_metadata_with_series_api: bool = False
+    prometheus_url: Optional[str] = None
+
+    # Discovery API time window - only return metrics with data in the last N hours
+    discover_metrics_from_last_hours: int = DEFAULT_METADATA_TIME_WINDOW_HRS
+
+    # Query timeout configuration
+    query_timeout_seconds_default: int = DEFAULT_QUERY_TIMEOUT_SECONDS
+    query_timeout_seconds_hard_max: int = MAX_QUERY_TIMEOUT_SECONDS
+
+    # Metadata API timeout configuration
+    metadata_timeout_seconds_default: int = DEFAULT_METADATA_TIMEOUT_SECONDS
+    metadata_timeout_seconds_hard_max: int = MAX_METADATA_TIMEOUT_SECONDS
+
     tool_calls_return_data: bool = True
     headers: Dict = Field(default_factory=dict)
-    rules_cache_duration_seconds: Union[int, None] = 1800  # 30 minutes
+    rules_cache_duration_seconds: Optional[int] = 1800  # 30 minutes
    additional_labels: Optional[Dict[str, str]] = None
-    prometheus_ssl_enabled: bool = True
+    verify_ssl: bool = True
+
+    # Custom limit to the max number of tokens that a query result can take to proactively
+    # prevent token limit issues. Expressed in % of the model's context window.
+    # This limit only overrides the global limit for all tools (TOOL_MAX_ALLOCATED_CONTEXT_WINDOW_PCT)
+    # if it is lower.
+    query_response_size_limit_pct: Optional[int] = None
 
     @field_validator("prometheus_url")
     def ensure_trailing_slash(cls, v: Optional[str]) -> Optional[str]:
@@ -64,11 +123,52 @@ class PrometheusConfig(BaseModel):
 
     @model_validator(mode="after")
     def validate_prom_config(self):
+        # Handle deprecated config names passed as extra fields
+        # These are accepted via extra="allow" but not defined in schema
+        extra = self.model_extra or {}
+        deprecated_with_replacement = []
+
+        # Map of old names -> new names
+        deprecated_mappings = {
+            "default_metadata_time_window_hrs": "discover_metrics_from_last_hours",
+            "default_query_timeout_seconds": "query_timeout_seconds_default",
+            "max_query_timeout_seconds": "query_timeout_seconds_hard_max",
+            "default_metadata_timeout_seconds": "metadata_timeout_seconds_default",
+            "max_metadata_timeout_seconds": "metadata_timeout_seconds_hard_max",
+            "metrics_labels_time_window_hrs": "discover_metrics_from_last_hours",
+            "prometheus_ssl_enabled": "verify_ssl",
+        }
+
+        for old_name, new_name in deprecated_mappings.items():
+            if old_name in extra:
+                setattr(self, new_name, extra[old_name])
+                deprecated_with_replacement.append(f"{old_name} -> {new_name}")
+
+        if deprecated_with_replacement:
+            logging.warning(
+                f"Prometheus config uses deprecated names. Please update: "
+                f"{', '.join(deprecated_with_replacement)}"
+            )
+
+        # Check for deprecated config values that no longer have any effect
+        deprecated_no_effect = [
+            name
+            for name in [
+                "metrics_labels_cache_duration_hrs",
+                "fetch_labels_with_labels_api",
+                "fetch_metadata_with_series_api",
+            ]
+            if name in extra
+        ]
+
+        if deprecated_no_effect:
+            logging.warning(
+                f"The following Prometheus config values are deprecated and have no effect: "
+                f"{', '.join(deprecated_no_effect)}"
+            )
+
         # If openshift is enabled, and the user didn't configure auth headers, we will try to load the token from the service account.
         if IS_OPENSHIFT:
-            if self.healthcheck == "-/healthy":
-                self.healthcheck = "api/v1/query?query=up"
-
             if self.headers.get("Authorization"):
                 return self
 
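Reading the validator above: because `model_config` now sets `extra="allow"`, old config keys are still accepted, land in `model_extra`, and are copied onto their new fields with a deprecation warning. A minimal sketch of the resulting behavior (illustrative values; not part of the diff):

    # Illustrative sketch: deprecated keys are remapped when the model validates.
    cfg = PrometheusConfig(
        prometheus_url="http://prometheus:9090/",  # placeholder URL
        prometheus_ssl_enabled=False,              # deprecated name, accepted via extra="allow"
        metrics_labels_cache_duration_hrs=12,      # deprecated and now a no-op
    )
    assert cfg.verify_ssl is False  # copied from prometheus_ssl_enabled by validate_prom_config
    # Two warnings are logged: one listing "prometheus_ssl_enabled -> verify_ssl",
    # one noting metrics_labels_cache_duration_hrs no longer has any effect.
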
@@ -88,8 +188,7 @@ class AMPConfig(PrometheusConfig):
     aws_secret_access_key: Optional[str] = None
     aws_region: str
     aws_service_name: str = "aps"
-    healthcheck: str = "api/v1/query?query=up"
-    prometheus_ssl_enabled: bool = False
+    verify_ssl: bool = False
     assume_role_arn: Optional[str] = None
 
     # Refresh the AWS client (and its STS creds) every N seconds (default: 15 minutes)
@@ -113,7 +212,7 @@ class AMPConfig(PrometheusConfig):
         try:
             base_config = BasePrometheusConfig(
                 url=self.prometheus_url,
-                disable_ssl=not self.prometheus_ssl_enabled,
+                disable_ssl=not self.verify_ssl,
                 additional_labels=self.additional_labels,
             )
             self._aws_client = AWSPrometheusConnect(
@@ -132,12 +231,155 @@ class AMPConfig(PrometheusConfig):
         return self._aws_client
 
 
+class AzurePrometheusConfig(PrometheusConfig):
+    azure_resource: Optional[str] = None
+    azure_metadata_endpoint: Optional[str] = None
+    azure_token_endpoint: Optional[str] = None
+    azure_use_managed_id: bool = False
+    azure_client_id: Optional[str] = None
+    azure_client_secret: Optional[str] = None
+    azure_tenant_id: Optional[str] = None
+    verify_ssl: bool = True
+
+    # Refresh the Azure bearer token every N seconds (default: 15 minutes)
+    refresh_interval_seconds: int = 900
+
+    _prometrix_config: Optional[PrometrixAzureConfig] = None
+    _token_created_at: float = 0.0
+
+    @staticmethod
+    def _load_from_env_or_default(
+        config_value: Optional[str], env_var: str, default: Optional[str] = None
+    ) -> Optional[str]:
+        """Load value from config, environment variable, or use default."""
+        if config_value:
+            return config_value
+        return os.environ.get(env_var, default)
+
+    def __init__(self, **data):
+        super().__init__(**data)
+        # Load from environment variables if not provided in config
+        self.azure_client_id = self._load_from_env_or_default(
+            self.azure_client_id, "AZURE_CLIENT_ID"
+        )
+        self.azure_tenant_id = self._load_from_env_or_default(
+            self.azure_tenant_id, "AZURE_TENANT_ID"
+        )
+        self.azure_client_secret = self._load_from_env_or_default(
+            self.azure_client_secret, "AZURE_CLIENT_SECRET"
+        )
+
+        # Set defaults from environment if not provided
+        self.azure_resource = self._load_from_env_or_default(
+            self.azure_resource,
+            "AZURE_RESOURCE",
+            "https://prometheus.monitor.azure.com",
+        )
+        # from https://learn.microsoft.com/en-us/entra/identity/managed-identities-azure-resources/how-to-use-vm-token
+        self.azure_metadata_endpoint = self._load_from_env_or_default(
+            self.azure_metadata_endpoint,
+            "AZURE_METADATA_ENDPOINT",
+            "http://169.254.169.254/metadata/identity/oauth2/token",
+        )
+        self.azure_token_endpoint = self._load_from_env_or_default(
+            self.azure_token_endpoint, "AZURE_TOKEN_ENDPOINT"
+        )
+        if not self.azure_token_endpoint and self.azure_tenant_id:
+            self.azure_token_endpoint = (
+                f"https://login.microsoftonline.com/{self.azure_tenant_id}/oauth2/token"
+            )
+
+        # Check if managed identity should be used
+        if not self.azure_use_managed_id:
+            self.azure_use_managed_id = os.environ.get(
+                "AZURE_USE_MANAGED_ID", "false"
+            ).lower() in ("true", "1")
+
+        # Convert None to empty string for prometrix compatibility (prometrix checks != "")
+        azure_client_id = self.azure_client_id or ""
+        azure_tenant_id = self.azure_tenant_id or ""
+        azure_client_secret = self.azure_client_secret or ""
+        azure_resource = self.azure_resource or ""
+        azure_metadata_endpoint = self.azure_metadata_endpoint or ""
+        azure_token_endpoint = self.azure_token_endpoint or ""
+
+        # Create prometrix Azure config
+        self._prometrix_config = PrometrixAzureConfig(
+            url=self.prometheus_url,
+            azure_resource=azure_resource,
+            azure_metadata_endpoint=azure_metadata_endpoint,
+            azure_token_endpoint=azure_token_endpoint,
+            azure_use_managed_id=self.azure_use_managed_id,
+            azure_client_id=azure_client_id,
+            azure_client_secret=azure_client_secret,
+            azure_tenant_id=azure_tenant_id,
+            disable_ssl=not self.verify_ssl,
+            additional_labels=self.additional_labels,
+        )
+        # Ensure prometrix gets a real bool (not string) for managed identity
+        # fixing internal prometrix config issue
+        object.__setattr__(
+            self._prometrix_config,
+            "azure_use_managed_id",
+            bool(self.azure_use_managed_id),
+        )
+
+        PrometheusAuthorization.azure_authorization(self._prometrix_config)
+
+    @staticmethod
+    def is_azure_config(config: dict[str, Any]) -> bool:
+        """Check if config dict or environment variables indicate Azure Prometheus config."""
+        # Check for explicit Azure fields in config
+        if (
+            "azure_client_id" in config
+            or "azure_tenant_id" in config
+            or "azure_use_managed_id" in config
+        ):
+            return True
+
+        # Check for Azure environment variables
+        if os.environ.get("AZURE_CLIENT_ID") or os.environ.get("AZURE_TENANT_ID"):
+            return True
+
+        return False
+
+    def is_amp(self) -> bool:
+        return False
+
+    def _should_refresh_token(self) -> bool:
+        if not PrometheusAuthorization.bearer_token:
+            return True
+        return (time.time() - self._token_created_at) >= self.refresh_interval_seconds
+
+    def request_new_token(self) -> bool:
+        """Request a new Azure access token using prometrix."""
+        success = PrometheusAuthorization.request_new_token(self._prometrix_config)
+        if success:
+            self._token_created_at = time.time()
+        return success
+
+    def get_authorization_headers(self) -> Dict[str, str]:
+        # Request new token if needed
+        if self._should_refresh_token():
+            if not self.request_new_token():
+                logging.error("Failed to request new Azure access token")
+                return {}
+            self._token_created_at = time.time()
+
+        headers = PrometheusAuthorization.get_authorization_headers(
+            self._prometrix_config
+        )
+        if not headers.get("Authorization"):
+            logging.warning("No authorization header generated for Azure Prometheus")
+        return headers
+
+
 class BasePrometheusTool(Tool):
     toolset: "PrometheusToolset"
 
 
 def do_request(
-    config,  # PrometheusConfig | AMPConfig
+    config,  # PrometheusConfig | AMPConfig | AzurePrometheusConfig
     url: str,
     params: Optional[Dict] = None,
     data: Optional[Dict] = None,
@@ -149,17 +391,20 @@ def do_request(
     """
     Route a request through either:
     - AWSPrometheusConnect (SigV4) when config is AMPConfig
+    - Azure bearer token auth when config is AzurePrometheusConfig
     - plain requests otherwise
 
     method defaults to GET so callers can omit it for reads.
     """
     if verify is None:
-        verify = config.prometheus_ssl_enabled
+        verify = config.verify_ssl
     if headers is None:
         headers = config.headers or {}
 
     if isinstance(config, AMPConfig):
         client = config.get_aws_client()  # cached AWSPrometheusConnect
+        # Note: timeout parameter is not supported by prometrix's signed_request
+        # AWS/AMP requests will not respect the timeout setting
         return client.signed_request(  # type: ignore
             method=method,
             url=url,
@@ -169,7 +414,21 @@ def do_request(
             headers=headers,
         )
 
-    # Non-AMP: plain HTTP
+    if isinstance(config, AzurePrometheusConfig):
+        # Merge Azure authorization headers with provided headers
+        azure_headers = config.get_authorization_headers()
+        headers = {**azure_headers, **headers}
+        return requests.request(
+            method=method,
+            url=url,
+            headers=headers,
+            params=params,
+            data=data,
+            timeout=timeout,
+            verify=verify,
+        )
+
+    # Non-AMP, Non-Azure: plain HTTP
     return requests.request(
         method=method,
         url=url,
@@ -181,99 +440,6 @@ def do_request(
     )
 
 
-def filter_metrics_by_type(metrics: Dict, expected_type: str):
-    return {
-        metric_name: metric_data
-        for metric_name, metric_data in metrics.items()
-        if expected_type in metric_data.get("type", "")
-        or metric_data.get("type", "") == "?"
-    }
-
-
-def filter_metrics_by_name(metrics: Dict, pattern: str) -> Dict:
-    regex = re.compile(pattern)
-    return {
-        metric_name: metric_data
-        for metric_name, metric_data in metrics.items()
-        if regex.search(metric_name)
-    }
-
-
-METRICS_SUFFIXES_TO_STRIP = ["_bucket", "_count", "_sum"]
-
-
-def fetch_metadata(
-    prometheus_url: str,
-    headers: Optional[Dict],
-    config,
-    verify_ssl: bool = True,
-) -> Dict:
-    metadata_url = urljoin(prometheus_url, "api/v1/metadata")
-    metadata_response = do_request(
-        config=config,
-        url=metadata_url,
-        headers=headers,
-        timeout=60,
-        verify=verify_ssl,
-        method="GET",
-    )
-    metadata_response.raise_for_status()
-
-    metadata = metadata_response.json()["data"]
-
-    metrics = {}
-    for metric_name, meta_list in metadata.items():
-        if meta_list:
-            metric_type = meta_list[0].get("type", "unknown")
-            metric_description = meta_list[0].get("help", "unknown")
-            metrics[metric_name] = {
-                "type": metric_type,
-                "description": metric_description,
-                "labels": set(),
-            }
-
-    return metrics
-
-
-def fetch_metadata_with_series_api(
-    prometheus_url: str,
-    metric_name: str,
-    headers: Dict,
-    config,
-    verify_ssl: bool = True,
-) -> Dict:
-    url = urljoin(prometheus_url, "api/v1/series")
-    params: Dict = {"match[]": f'{{__name__=~".*{metric_name}.*"}}', "limit": "10000"}
-
-    response = do_request(
-        config=config,
-        url=url,
-        headers=headers,
-        params=params,
-        timeout=60,
-        verify=verify_ssl,
-        method="GET",
-    )
-    response.raise_for_status()
-    metrics = response.json()["data"]
-
-    metadata: Dict = {}
-    for metric_data in metrics:
-        metric_name = metric_data.get("__name__")
-        if not metric_name:
-            continue
-
-        metric = metadata.get(metric_name)
-        if not metric:
-            metric = {"description": "?", "type": "?", "labels": set()}
-            metadata[metric_name] = metric
-
-        labels = {k for k in metric_data.keys() if k != "__name__"}
-        metric["labels"].update(labels)
-
-    return metadata
-
-
 def result_has_data(result: Dict) -> bool:
     data = result.get("data", {})
     if len(data.get("result", [])) > 0:
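With the routing in `do_request` above, callers no longer branch on auth themselves: AMP configs go through SigV4 `signed_request`, Azure configs get bearer-token headers merged in, and everything else is a plain `requests.request`. A hedged usage sketch (URLs are placeholders; `api/v1/query` is the standard Prometheus HTTP API endpoint):

    # Illustrative only: exercising the three do_request branches from the diff above.
    plain = PrometheusConfig(prometheus_url="http://prometheus:9090/")  # placeholder
    resp = do_request(
        config=plain,
        url=urljoin(plain.prometheus_url, "api/v1/query"),
        params={"query": "up"},
        timeout=plain.query_timeout_seconds_default,
    )  # plain requests.request; verify defaults to plain.verify_ssl

    azure = AzurePrometheusConfig(
        prometheus_url="https://example.eastus.prometheus.monitor.azure.com/",  # placeholder
        azure_use_managed_id=True,
    )
    resp = do_request(
        config=azure,
        url=urljoin(azure.prometheus_url, "api/v1/query"),
        params={"query": "up"},
    )  # same call path, but config.get_authorization_headers() is merged into headers

    # An AMPConfig instead routes through the cached AWSPrometheusConnect.signed_request,
    # which, per the note in the diff, does not honor the timeout parameter.
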
@@ -284,33 +450,58 @@ def result_has_data(result: Dict) -> bool:
 def adjust_step_for_max_points(
     start_timestamp: str,
     end_timestamp: str,
-    step: float,
+    step: Optional[float] = None,
+    max_points_override: Optional[float] = None,
 ) -> float:
     """
     Adjusts the step parameter to ensure the number of data points doesn't exceed max_points.
-    Max points is controlled by the PROMETHEUS_MAX_GRAPH_POINTS environment variable (default: 300).
 
     Args:
         start_timestamp: RFC3339 formatted start time
         end_timestamp: RFC3339 formatted end time
-        step: The requested step duration in seconds
+        step: The requested step duration in seconds (None for auto-calculation)
+        max_points_override: Optional override for max points (must be <= MAX_GRAPH_POINTS)
 
     Returns:
         Adjusted step value in seconds that ensures points <= max_points
     """
+    # Use override if provided and valid, otherwise use default
+    max_points = MAX_GRAPH_POINTS
+    if max_points_override is not None:
+        if max_points_override > MAX_GRAPH_POINTS:
+            logging.warning(
+                f"max_points override ({max_points_override}) exceeds system limit ({MAX_GRAPH_POINTS}), using {MAX_GRAPH_POINTS}"
+            )
+            max_points = MAX_GRAPH_POINTS
+        elif max_points_override < 1:
+            logging.warning(
+                f"max_points override ({max_points_override}) is invalid, using default {MAX_GRAPH_POINTS}"
+            )
+            max_points = MAX_GRAPH_POINTS
+        else:
+            max_points = max_points_override
+            logging.debug(f"Using max_points override: {max_points}")
 
     start_dt = dateutil.parser.parse(start_timestamp)
     end_dt = dateutil.parser.parse(end_timestamp)
 
     time_range_seconds = (end_dt - start_dt).total_seconds()
 
+    # If no step provided, calculate a reasonable default
+    # Aim for ~60 data points across the time range (1 per minute for hourly, etc)
+    if step is None:
+        step = max(1, time_range_seconds / 60)
+        logging.debug(
+            f"No step provided, defaulting to {step}s for {time_range_seconds}s range"
+        )
+
     current_points = time_range_seconds / step
 
     # If current points exceed max, adjust the step
-    if current_points > MAX_GRAPH_POINTS:
-        adjusted_step = time_range_seconds / MAX_GRAPH_POINTS
+    if current_points > max_points:
+        adjusted_step = time_range_seconds / max_points
         logging.info(
-            f"Adjusting step from {step}s to {adjusted_step}s to limit points from {current_points:.0f} to {MAX_GRAPH_POINTS}"
+            f"Adjusting step from {step}s to {adjusted_step}s to limit points from {current_points:.0f} to {max_points}"
         )
         return adjusted_step
 
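A worked example of the adjustment above, assuming MAX_GRAPH_POINTS is 300 (the real value comes from holmes.common.env_vars, so it may differ):

    # 6-hour range = 21600 seconds; a 10s step would yield 2160 points (> 300),
    # so the step is stretched to 21600 / 300 = 72s.
    step = adjust_step_for_max_points(
        start_timestamp="2024-01-01T00:00:00Z",
        end_timestamp="2024-01-01T06:00:00Z",
        step=10,
    )
    assert step == 72.0

    # With step=None the auto-default aims for ~60 points: 21600 / 60 = 360s,
    # which is already under the cap. (The unchanged-step return path sits in
    # code elided from the hunk above.)
    adjust_step_for_max_points(
        start_timestamp="2024-01-01T00:00:00Z",
        end_timestamp="2024-01-01T06:00:00Z",
    )
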
@@ -324,185 +515,149 @@ def add_prometheus_auth(prometheus_auth_header: Optional[str]) -> Dict[str, Any]
     return results
 
 
-def fetch_metrics_labels_with_series_api(
-    prometheus_url: str,
-    headers: Dict[str, str],
-    cache: Optional[TTLCache],
-    metrics_labels_time_window_hrs: Union[int, None],
-    metric_name: str,
-    config=None,
-    verify_ssl: bool = True,
-) -> dict:
-    """This is a slow query. Takes 5+ seconds to run"""
-    cache_key = f"metrics_labels_series_api:{metric_name}"
-    if cache:
-        cached_result = cache.get(cache_key)
-        if cached_result:
-            return cached_result
-
-    series_url = urljoin(prometheus_url, "api/v1/series")
-    params: dict = {"match[]": f'{{__name__=~".*{metric_name}.*"}}', "limit": "10000"}
-
-    if metrics_labels_time_window_hrs is not None:
-        params["end"] = int(time.time())
-        params["start"] = params["end"] - (metrics_labels_time_window_hrs * 60 * 60)
-
-    series_response = do_request(
-        config=config,
-        url=series_url,
-        headers=headers,
-        params=params,
-        timeout=60,
-        verify=verify_ssl,
-        method="GET",
-    )
-    series_response.raise_for_status()
-    series = series_response.json()["data"]
-
-    metrics_labels: dict = {}
-    for serie in series:
-        metric_name = serie["__name__"]
-        # Add all labels except __name__
-        labels = {k for k in serie.keys() if k != "__name__"}
-        if metric_name in metrics_labels:
-            metrics_labels[metric_name].update(labels)
-        else:
-            metrics_labels[metric_name] = labels
-    if cache:
-        cache.set(cache_key, metrics_labels)
-
-    return metrics_labels
-
-
-def fetch_metrics_labels_with_labels_api(
-    prometheus_url: str,
-    cache: Optional[TTLCache],
-    metrics_labels_time_window_hrs: Union[int, None],
-    metric_names: List[str],
-    headers: Dict,
-    config=None,
-    verify_ssl: bool = True,
-) -> dict:
-    metrics_labels = {}
-
-    for metric_name in metric_names:
-        cache_key = f"metrics_labels_labels_api:{metric_name}"
-        if cache:
-            cached_result = cache.get(cache_key)
-            if cached_result:
-                metrics_labels[metric_name] = cached_result
-
-        url = urljoin(prometheus_url, "api/v1/labels")
-        params: dict = {
-            "match[]": f'{{__name__="{metric_name}"}}',
-        }
-        if metrics_labels_time_window_hrs is not None:
-            params["end"] = int(time.time())
-            params["start"] = params["end"] - (metrics_labels_time_window_hrs * 60 * 60)
+def create_data_summary_for_large_result(
+    result_data: Dict, query: str, data_size_tokens: int, is_range_query: bool = False
+) -> Dict[str, Any]:
+    """
+    Create a summary for large Prometheus results instead of returning full data.
 
-        response = do_request(
-            config=config,
-            url=url,
-            headers=headers,
-            params=params,
-            timeout=60,
-            verify=verify_ssl,
-            method="GET",
-        )
-        response.raise_for_status()
-        labels = response.json()["data"]
-        filtered_labels = {label for label in labels if label != "__name__"}
-        metrics_labels[metric_name] = filtered_labels
-
-        if cache:
-            cache.set(cache_key, filtered_labels)
-
-    return metrics_labels
-
-
-def fetch_metrics(
-    prometheus_url: str,
-    cache: Optional[TTLCache],
-    metrics_labels_time_window_hrs: Union[int, None],
-    metric_name: str,
-    should_fetch_labels_with_labels_api: bool,
-    should_fetch_metadata_with_series_api: bool,
-    headers: Dict,
-    config=None,
-    verify_ssl: bool = True,
-) -> dict:
-    metrics = None
-    should_fetch_labels = True
-    if should_fetch_metadata_with_series_api:
-        metrics = fetch_metadata_with_series_api(
-            prometheus_url=prometheus_url,
-            metric_name=metric_name,
-            headers=headers,
-            config=config,
-            verify_ssl=verify_ssl,
+    Args:
+        result_data: The Prometheus data result
+        query: The original PromQL query
+        data_size_tokens: Size of the data in tokens
+        is_range_query: Whether this is a range query (vs instant query)
+
+    Returns:
+        Dictionary with summary information and suggestions
+    """
+    if is_range_query:
+        series_list = result_data.get("result", [])
+        num_items = len(series_list)
+
+        # Calculate exact total data points across all series
+        total_points = 0
+        for series in series_list:  # Iterate through ALL series for exact count
+            points = len(series.get("values", []))
+            total_points += points
+
+        # Analyze label keys and their cardinality
+        label_cardinality: Dict[str, set] = {}
+        for series in series_list:
+            metric = series.get("metric", {})
+            for label_key, label_value in metric.items():
+                if label_key not in label_cardinality:
+                    label_cardinality[label_key] = set()
+                label_cardinality[label_key].add(label_value)
+
+        # Convert sets to counts for the summary
+        label_summary = {
+            label: len(values) for label, values in label_cardinality.items()
+        }
+        # Sort by cardinality (highest first) for better insights
+        label_summary = dict(
+            sorted(label_summary.items(), key=lambda x: x[1], reverse=True)
         )
-        should_fetch_labels = False  # series API returns the labels
+
+        return {
+            "message": f"Data too large to return ({data_size_tokens:,} tokens). Query returned {num_items} time series with {total_points:,} total data points.",
+            "series_count": num_items,
+            "total_data_points": total_points,
+            "data_size_tokens": data_size_tokens,
+            "label_cardinality": label_summary,
+            "suggestion": f'Consider using topk({min(5, num_items)}, {query}) to limit results to the top {min(5, num_items)} series. To also capture remaining data as \'other\': topk({min(5, num_items)}, {query}) or label_replace((sum({query}) - sum(topk({min(5, num_items)}, {query}))), "pod", "other", "", "")',
+        }
     else:
-        metrics = fetch_metadata(
-            prometheus_url=prometheus_url,
-            headers=headers,
-            config=config,
-            verify_ssl=verify_ssl,
+        # Instant query
+        result_type = result_data.get("resultType", "")
+        result_list = result_data.get("result", [])
+        num_items = len(result_list)
+
+        # Analyze label keys and their cardinality
+        instant_label_cardinality: Dict[str, set] = {}
+        for item in result_list:
+            if isinstance(item, dict):
+                metric = item.get("metric", {})
+                for label_key, label_value in metric.items():
+                    if label_key not in instant_label_cardinality:
+                        instant_label_cardinality[label_key] = set()
+                    instant_label_cardinality[label_key].add(label_value)
+
+        # Convert sets to counts for the summary
+        label_summary = {
+            label: len(values) for label, values in instant_label_cardinality.items()
+        }
+        # Sort by cardinality (highest first) for better insights
+        label_summary = dict(
+            sorted(label_summary.items(), key=lambda x: x[1], reverse=True)
         )
-        metrics = filter_metrics_by_name(metrics, metric_name)
 
-    if should_fetch_labels:
-        metrics_labels = {}
-        if should_fetch_labels_with_labels_api:
-            metrics_labels = fetch_metrics_labels_with_labels_api(
-                prometheus_url=prometheus_url,
-                cache=cache,
-                metrics_labels_time_window_hrs=metrics_labels_time_window_hrs,
-                metric_names=list(metrics.keys()),
-                headers=headers,
-                config=config,
-                verify_ssl=verify_ssl,
-            )
-        else:
-            metrics_labels = fetch_metrics_labels_with_series_api(
-                prometheus_url=prometheus_url,
-                cache=cache,
-                metrics_labels_time_window_hrs=metrics_labels_time_window_hrs,
-                metric_name=metric_name,
-                headers=headers,
-                config=config,
-                verify_ssl=verify_ssl,
-            )
+        return {
+            "message": f"Data too large to return ({data_size_tokens:,} tokens). Query returned {num_items} results.",
+            "result_count": num_items,
+            "result_type": result_type,
+            "data_size_tokens": data_size_tokens,
+            "label_cardinality": label_summary,
+            "suggestion": f'Consider using topk({min(5, num_items)}, {query}) to limit results. To also capture remaining data as \'other\': topk({min(5, num_items)}, {query}) or label_replace((sum({query}) - sum(topk({min(5, num_items)}, {query}))), "instance", "other", "", "")',
+        }
 
-    for metric_name in metrics:
-        if metric_name in metrics_labels:
-            metrics[metric_name]["labels"] = metrics_labels[metric_name]
 
-    return metrics
+class MetricsBasedResponse(BaseModel):
+    status: str
+    error_message: Optional[str] = None
+    data: Optional[str] = None
+    tool_name: str
+    description: str
+    query: str
+    start: Optional[str] = None
+    end: Optional[str] = None
+    step: Optional[float] = None
+    output_type: Optional[str] = None
+    data_summary: Optional[dict[str, Any]] = None
+
+
+def create_structured_tool_result(
+    params: dict, response: MetricsBasedResponse
+) -> StructuredToolResult:
+    status = StructuredToolResultStatus.SUCCESS
+    error = None
+    if response.error_message or response.status.lower() in ("failed", "error"):
+        status = StructuredToolResultStatus.ERROR
+        error = (
+            response.error_message
+            if response.error_message
+            else "Unknown Prometheus error"
+        )
+    elif not response.data:
+        status = StructuredToolResultStatus.NO_DATA
+
+    return StructuredToolResult(
+        status=status,
+        data=response,
+        params=params,
+        error=error,
+    )
 
 
 class ListPrometheusRules(BasePrometheusTool):
     def __init__(self, toolset: "PrometheusToolset"):
         super().__init__(
             name="list_prometheus_rules",
-            description="List all defined prometheus rules. Will show the prometheus rules description, expression and annotations",
+            description="List all defined Prometheus rules (api/v1/rules). Will show the Prometheus rules description, expression and annotations",
             parameters={},
             toolset=toolset,
         )
         self._cache = None
 
-    def _invoke(
-        self, params: dict, user_approved: bool = False
-    ) -> StructuredToolResult:
+    def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
         if not self.toolset.config or not self.toolset.config.prometheus_url:
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 error="Prometheus is not configured. Prometheus URL is missing",
                 params=params,
             )
         if self.toolset.config.is_amp():
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 error="Tool not supported in AMP",
                 params=params,
             )
@@ -515,7 +670,7 @@ class ListPrometheusRules(BasePrometheusTool):
                 logging.debug("rules returned from cache")
 
                 return StructuredToolResult(
-                    status=ToolResultStatus.SUCCESS,
+                    status=StructuredToolResultStatus.SUCCESS,
                     data=cached_rules,
                     params=params,
                 )
@@ -528,8 +683,8 @@ class ListPrometheusRules(BasePrometheusTool):
                 config=self.toolset.config,
                 url=rules_url,
                 params=params,
-                timeout=180,
-                verify=self.toolset.config.prometheus_ssl_enabled,
+                timeout=40,
+                verify=self.toolset.config.verify_ssl,
                 headers=self.toolset.config.headers,
                 method="GET",
             )
@@ -539,28 +694,35 @@ class ListPrometheusRules(BasePrometheusTool):
             if self._cache:
                 self._cache.set(PROMETHEUS_RULES_CACHE_KEY, data)
             return StructuredToolResult(
-                status=ToolResultStatus.SUCCESS,
+                status=StructuredToolResultStatus.SUCCESS,
                 data=data,
                 params=params,
             )
         except requests.Timeout:
             logging.warning("Timeout while fetching prometheus rules", exc_info=True)
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 error="Request timed out while fetching rules",
                 params=params,
             )
+        except SSLError as e:
+            logging.warning("SSL error while fetching prometheus rules", exc_info=True)
+            return StructuredToolResult(
+                status=StructuredToolResultStatus.ERROR,
+                error=format_ssl_error_message(self.toolset.config.prometheus_url, e),
+                params=params,
+            )
         except RequestException as e:
             logging.warning("Failed to fetch prometheus rules", exc_info=True)
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 error=f"Network error while fetching rules: {str(e)}",
                 params=params,
            )
         except Exception as e:
             logging.warning("Failed to process prometheus rules", exc_info=True)
             return StructuredToolResult(
-                status=ToolResultStatus.ERROR,
+                status=StructuredToolResultStatus.ERROR,
                 error=f"Unexpected error: {str(e)}",
                 params=params,
             )
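For reference, the status mapping implemented by `create_structured_tool_result` (added in the hunk further above): any `error_message` or a `failed`/`error` status maps to ERROR, with "Unknown Prometheus error" as the fallback text; an empty `data` field maps to NO_DATA; everything else is SUCCESS. A small sketch with made-up field values (the `tool_name` here is hypothetical):

    # Illustrative sketch of the MetricsBasedResponse -> StructuredToolResult mapping.
    ok = MetricsBasedResponse(
        status="success",
        data='{"resultType": "vector", "result": []}',
        tool_name="example_query_tool",  # hypothetical name
        description="up by instance",
        query="up",
    )
    assert create_structured_tool_result({}, ok).status == StructuredToolResultStatus.SUCCESS

    no_data = ok.model_copy(update={"data": None})
    assert create_structured_tool_result({}, no_data).status == StructuredToolResultStatus.NO_DATA

    failed = ok.model_copy(update={"status": "error"})
    res = create_structured_tool_result({}, failed)
    assert res.status == StructuredToolResultStatus.ERROR
    assert res.error == "Unknown Prometheus error"
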
@@ -569,120 +731,553 @@ class ListPrometheusRules(BasePrometheusTool):
569
731
  return f"{toolset_name_for_one_liner(self.toolset.name)}: Fetch Rules"
570
732
 
571
733
 
572
- class ListAvailableMetrics(BasePrometheusTool):
734
+ class GetMetricNames(BasePrometheusTool):
735
+ """Thin wrapper around /api/v1/label/__name__/values - the fastest way to discover metric names"""
736
+
573
737
  def __init__(self, toolset: "PrometheusToolset"):
574
738
  super().__init__(
575
- name="list_available_metrics",
576
- description="List all the available metrics to query from prometheus, including their types (counter, gauge, histogram, summary) and available labels.",
739
+ name="get_metric_names",
740
+ description=(
741
+ "Get list of metric names using /api/v1/label/__name__/values. "
742
+ "FASTEST method for metric discovery when you need to explore available metrics. "
743
+ f"Returns up to {PROMETHEUS_METADATA_API_LIMIT} unique metric names (limit={PROMETHEUS_METADATA_API_LIMIT}). If {PROMETHEUS_METADATA_API_LIMIT} results returned, more may exist - use a more specific filter. "
744
+ f"ALWAYS use match[] parameter to filter metrics - without it you'll get random {PROMETHEUS_METADATA_API_LIMIT} metrics which is rarely useful. "
745
+ "Note: Does not return metric metadata (type, description, labels). "
746
+ "By default returns metrics active in the last 1 hour (configurable via default_metadata_time_window_hrs)."
747
+ ),
577
748
  parameters={
578
- "type_filter": ToolParameter(
579
- description="Optional filter to only return a specific metric type. Can be one of counter, gauge, histogram, summary",
749
+ "match": ToolParameter(
750
+ description=(
751
+ "REQUIRED: PromQL selector to filter metrics. Use regex OR (|) to check multiple patterns in one call - much faster than multiple calls! Examples: "
752
+ "'{__name__=~\"node_cpu.*|node_memory.*|node_disk.*\"}' for all node resource metrics, "
753
+ "'{__name__=~\"container_cpu.*|container_memory.*|container_network.*\"}' for all container metrics, "
754
+ "'{__name__=~\"kube_pod.*|kube_deployment.*|kube_service.*\"}' for multiple Kubernetes object metrics, "
755
+ "'{__name__=~\".*cpu.*|.*memory.*|.*disk.*\"}' for all resource metrics, "
756
+ "'{namespace=~\"kube-system|default|monitoring\"}' for metrics from multiple namespaces, "
757
+ "'{job=~\"prometheus|node-exporter|kube-state-metrics\"}' for metrics from multiple jobs."
758
+ ),
759
+ type="string",
760
+ required=True,
761
+ ),
762
+ "start": ToolParameter(
763
+ description="Start timestamp (RFC3339 or Unix). Default: 1 hour ago",
580
764
  type="string",
581
765
  required=False,
582
766
  ),
583
- "name_filter": ToolParameter(
584
- description="Only the metrics partially or fully matching this name will be returned",
767
+ "end": ToolParameter(
768
+ description="End timestamp (RFC3339 or Unix). Default: now",
585
769
  type="string",
586
- required=True,
770
+ required=False,
587
771
  ),
588
772
  },
589
773
  toolset=toolset,
590
774
  )
591
- self._cache = None
592
775
 
593
- def _invoke(
594
- self, params: dict, user_approved: bool = False
595
- ) -> StructuredToolResult:
776
+ def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
596
777
  if not self.toolset.config or not self.toolset.config.prometheus_url:
597
778
  return StructuredToolResult(
598
- status=ToolResultStatus.ERROR,
779
+ status=StructuredToolResultStatus.ERROR,
599
780
  error="Prometheus is not configured. Prometheus URL is missing",
600
781
  params=params,
601
782
  )
602
- if not self._cache and self.toolset.config.metrics_labels_cache_duration_hrs:
603
- self._cache = TTLCache(
604
- self.toolset.config.metrics_labels_cache_duration_hrs * 3600 # type: ignore
605
- )
606
783
  try:
607
- prometheus_url = self.toolset.config.prometheus_url
608
- metrics_labels_time_window_hrs = (
609
- self.toolset.config.metrics_labels_time_window_hrs
784
+ match_param = params.get("match")
785
+ if not match_param:
786
+ return StructuredToolResult(
787
+ status=StructuredToolResultStatus.ERROR,
788
+ error="Match parameter is required to filter metrics",
789
+ params=params,
790
+ )
791
+
792
+ url = urljoin(
793
+ self.toolset.config.prometheus_url, "api/v1/label/__name__/values"
610
794
  )
795
+ query_params = {
796
+ "limit": str(PROMETHEUS_METADATA_API_LIMIT),
797
+ "match[]": match_param,
798
+ }
799
+
800
+ # Add time parameters - use provided values or defaults
801
+ if params.get("end"):
802
+ query_params["end"] = params["end"]
803
+ else:
804
+ query_params["end"] = str(int(time.time()))
805
+
806
+ if params.get("start"):
807
+ query_params["start"] = params["start"]
808
+ elif self.toolset.config.discover_metrics_from_last_hours:
809
+ # Use default time window
810
+ query_params["start"] = str(
811
+ int(time.time())
812
+ - (self.toolset.config.discover_metrics_from_last_hours * 3600)
813
+ )
814
+
815
+ response = do_request(
816
+ config=self.toolset.config,
817
+ url=url,
818
+ params=query_params,
819
+ timeout=self.toolset.config.metadata_timeout_seconds_default,
820
+ verify=self.toolset.config.verify_ssl,
821
+ headers=self.toolset.config.headers,
822
+ method="GET",
823
+ )
824
+ response.raise_for_status()
825
+ data = response.json()
826
+
827
+ # Check if results were truncated
828
+ if (
829
+ "data" in data
830
+ and isinstance(data["data"], list)
831
+ and len(data["data"]) == PROMETHEUS_METADATA_API_LIMIT
832
+ ):
833
+ data["_truncated"] = True
834
+ data["_message"] = (
835
+ f"Results truncated at limit={PROMETHEUS_METADATA_API_LIMIT}. Use a more specific match filter to see additional metrics."
836
+ )
837
+
838
+ return StructuredToolResult(
839
+ status=StructuredToolResultStatus.SUCCESS,
+ data=data,
+ params=params,
+ )
+ except Exception as e:
+ return StructuredToolResult(
+ status=StructuredToolResultStatus.ERROR,
+ error=str(e),
+ params=params,
+ )
+
+ def get_parameterized_one_liner(self, params) -> str:
+ return f"{toolset_name_for_one_liner(self.toolset.name)}: Get Metric Names"
+
+
+ class GetLabelValues(BasePrometheusTool):
+ """Get values for a specific label across all metrics"""

- name_filter = params.get("name_filter")
- if not name_filter:
+ def __init__(self, toolset: "PrometheusToolset"):
+ super().__init__(
+ name="get_label_values",
+ description=(
+ "Get all values for a specific label using /api/v1/label/{label}/values. "
+ "Use this to discover pods, namespaces, jobs, instances, etc. "
+ f"Returns up to {PROMETHEUS_METADATA_API_LIMIT} unique values (limit={PROMETHEUS_METADATA_API_LIMIT}). If {PROMETHEUS_METADATA_API_LIMIT} results returned, more may exist - use match[] to filter. "
+ "Supports optional match[] parameter to filter. "
+ "By default returns values from metrics active in the last 1 hour (configurable via default_metadata_time_window_hrs)."
+ ),
+ parameters={
+ "label": ToolParameter(
+ description="Label name to get values for (e.g., 'pod', 'namespace', 'job', 'instance')",
+ type="string",
+ required=True,
+ ),
+ "match": ToolParameter(
+ description=(
+ "Optional PromQL selector to filter (e.g., '{__name__=~\"kube.*\"}', "
+ "'{namespace=\"default\"}')."
+ ),
+ type="string",
+ required=False,
+ ),
+ "start": ToolParameter(
+ description="Start timestamp (RFC3339 or Unix). Default: 1 hour ago",
+ type="string",
+ required=False,
+ ),
+ "end": ToolParameter(
+ description="End timestamp (RFC3339 or Unix). Default: now",
+ type="string",
+ required=False,
+ ),
+ },
+ toolset=toolset,
+ )
+
+ def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
+ if not self.toolset.config or not self.toolset.config.prometheus_url:
+ return StructuredToolResult(
+ status=StructuredToolResultStatus.ERROR,
+ error="Prometheus is not configured. Prometheus URL is missing",
+ params=params,
+ )
+ try:
+ label = params.get("label")
+ if not label:
  return StructuredToolResult(
- status=ToolResultStatus.ERROR,
- error="Error: cannot run tool 'list_available_metrics'. The param 'name_filter' is required but is missing.",
+ status=StructuredToolResultStatus.ERROR,
+ error="Label parameter is required",
  params=params,
  )

- metrics = fetch_metrics(
- prometheus_url=prometheus_url,
- cache=self._cache,
- metrics_labels_time_window_hrs=metrics_labels_time_window_hrs,
- metric_name=name_filter,
- should_fetch_labels_with_labels_api=self.toolset.config.fetch_labels_with_labels_api,
- should_fetch_metadata_with_series_api=self.toolset.config.fetch_metadata_with_series_api,
+ url = urljoin(
+ self.toolset.config.prometheus_url, f"api/v1/label/{label}/values"
+ )
+ query_params = {"limit": str(PROMETHEUS_METADATA_API_LIMIT)}
+ if params.get("match"):
+ query_params["match[]"] = params["match"]
+
+ # Add time parameters - use provided values or defaults
+ if params.get("end"):
+ query_params["end"] = params["end"]
+ else:
+ query_params["end"] = str(int(time.time()))
+
+ if params.get("start"):
+ query_params["start"] = params["start"]
+ elif self.toolset.config.discover_metrics_from_last_hours:
+ # Use default time window
+ query_params["start"] = str(
+ int(time.time())
+ - (self.toolset.config.discover_metrics_from_last_hours * 3600)
+ )
+
+ response = do_request(
+ config=self.toolset.config,
+ url=url,
+ params=query_params,
+ timeout=self.toolset.config.metadata_timeout_seconds_default,
+ verify=self.toolset.config.verify_ssl,
  headers=self.toolset.config.headers,
+ method="GET",
+ )
+ response.raise_for_status()
+ data = response.json()
+
+ # Check if results were truncated
+ if (
+ "data" in data
+ and isinstance(data["data"], list)
+ and len(data["data"]) == PROMETHEUS_METADATA_API_LIMIT
+ ):
+ data["_truncated"] = True
+ data["_message"] = (
+ f"Results truncated at limit={PROMETHEUS_METADATA_API_LIMIT}. Use match[] parameter to filter label '{label}' values."
+ )
+
+ return StructuredToolResult(
+ status=StructuredToolResultStatus.SUCCESS,
+ data=data,
+ params=params,
+ )
+ except Exception as e:
+ return StructuredToolResult(
+ status=StructuredToolResultStatus.ERROR,
+ error=str(e),
+ params=params,
+ )
+
+ def get_parameterized_one_liner(self, params) -> str:
+ label = params.get("label", "")
+ return f"{toolset_name_for_one_liner(self.toolset.name)}: Get {label} Values"
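As context for the new tool: a minimal sketch of the /api/v1/label/{label}/values call that get_label_values wraps, assuming a Prometheus reachable at localhost:9090 and plain requests rather than the toolset's do_request helper (the URL and the 1000 limit are illustrative stand-ins):

import time
import requests

PROMETHEUS_URL = "http://localhost:9090"  # assumed local Prometheus
LIMIT = 1000  # stand-in for PROMETHEUS_METADATA_API_LIMIT

resp = requests.get(
    f"{PROMETHEUS_URL}/api/v1/label/pod/values",
    params={
        "limit": str(LIMIT),
        "match[]": '{namespace="default"}',    # optional selector filter
        "start": str(int(time.time()) - 3600),  # last hour, as the tool defaults to
        "end": str(int(time.time())),
    },
    timeout=10,
)
resp.raise_for_status()
print(resp.json()["data"][:5])  # first few pod names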
+
+
+ class GetAllLabels(BasePrometheusTool):
+ """Get all label names that exist in Prometheus"""
+
+ def __init__(self, toolset: "PrometheusToolset"):
+ super().__init__(
+ name="get_all_labels",
+ description=(
+ "Get list of all label names using /api/v1/labels. "
+ "Use this to discover what labels are available across all metrics. "
+ f"Returns up to {PROMETHEUS_METADATA_API_LIMIT} label names (limit={PROMETHEUS_METADATA_API_LIMIT}). If {PROMETHEUS_METADATA_API_LIMIT} results returned, more may exist - use match[] to filter. "
+ "Supports optional match[] parameter to filter. "
+ "By default returns labels from metrics active in the last 1 hour (configurable via default_metadata_time_window_hrs)."
+ ),
+ parameters={
+ "match": ToolParameter(
+ description=(
+ "Optional PromQL selector to filter (e.g., '{__name__=~\"kube.*\"}', "
+ "'{job=\"prometheus\"}')."
+ ),
+ type="string",
+ required=False,
+ ),
+ "start": ToolParameter(
+ description="Start timestamp (RFC3339 or Unix). Default: 1 hour ago",
+ type="string",
+ required=False,
+ ),
+ "end": ToolParameter(
+ description="End timestamp (RFC3339 or Unix). Default: now",
+ type="string",
+ required=False,
+ ),
+ },
+ toolset=toolset,
+ )
+
+ def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
+ if not self.toolset.config or not self.toolset.config.prometheus_url:
+ return StructuredToolResult(
+ status=StructuredToolResultStatus.ERROR,
+ error="Prometheus is not configured. Prometheus URL is missing",
+ params=params,
+ )
+ try:
+ url = urljoin(self.toolset.config.prometheus_url, "api/v1/labels")
+ query_params = {"limit": str(PROMETHEUS_METADATA_API_LIMIT)}
+ if params.get("match"):
+ query_params["match[]"] = params["match"]
+
+ # Add time parameters - use provided values or defaults
+ if params.get("end"):
+ query_params["end"] = params["end"]
+ else:
+ query_params["end"] = str(int(time.time()))
+
+ if params.get("start"):
+ query_params["start"] = params["start"]
+ elif self.toolset.config.discover_metrics_from_last_hours:
+ # Use default time window
+ query_params["start"] = str(
+ int(time.time())
+ - (self.toolset.config.discover_metrics_from_last_hours * 3600)
+ )
+
+ response = do_request(
  config=self.toolset.config,
- verify_ssl=self.toolset.config.prometheus_ssl_enabled,
+ url=url,
+ params=query_params,
+ timeout=self.toolset.config.metadata_timeout_seconds_default,
+ verify=self.toolset.config.verify_ssl,
+ headers=self.toolset.config.headers,
+ method="GET",
  )
+ response.raise_for_status()
+ data = response.json()
+
+ # Check if results were truncated
+ if (
+ "data" in data
+ and isinstance(data["data"], list)
+ and len(data["data"]) == PROMETHEUS_METADATA_API_LIMIT
+ ):
+ data["_truncated"] = True
+ data["_message"] = (
+ f"Results truncated at limit={PROMETHEUS_METADATA_API_LIMIT}. Use match[] parameter to filter labels."
+ )
+
+ return StructuredToolResult(
+ status=StructuredToolResultStatus.SUCCESS,
+ data=data,
+ params=params,
+ )
+ except Exception as e:
+ return StructuredToolResult(
+ status=StructuredToolResultStatus.ERROR,
+ error=str(e),
+ params=params,
+ )
+
+ def get_parameterized_one_liner(self, params) -> str:
+ return f"{toolset_name_for_one_liner(self.toolset.name)}: Get All Labels"
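The same truncation check recurs inline in each discovery tool above. As a sketch, the pattern could be factored into a small helper like this (the helper name is hypothetical, not part of the package):

def flag_truncation(data: dict, limit: int, hint: str) -> dict:
    """Mark a metadata response whose 'data' list hit the limit.

    When exactly `limit` items come back, more may exist server-side,
    so the result is annotated rather than silently passed through.
    """
    items = data.get("data")
    if isinstance(items, list) and len(items) == limit:
        data["_truncated"] = True
        data["_message"] = f"Results truncated at limit={limit}. {hint}"
    return data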

- type_filter = params.get("type_filter")
- if type_filter:
- metrics = filter_metrics_by_type(metrics, type_filter)

- output = ["Metric | Description | Type | Labels"]
- output.append("-" * 100)
+ class GetSeries(BasePrometheusTool):
+ """Get time series matching a selector"""

- for metric, info in sorted(metrics.items()):
- labels_str = (
- ", ".join(sorted(info["labels"])) if info["labels"] else "none"
+ def __init__(self, toolset: "PrometheusToolset"):
+ super().__init__(
+ name="get_series",
+ description=(
+ "Get time series using /api/v1/series. "
+ "Returns label sets for all time series matching the selector. "
+ "SLOWER than other discovery methods - use only when you need full label sets. "
+ f"Returns up to {PROMETHEUS_METADATA_API_LIMIT} series (limit={PROMETHEUS_METADATA_API_LIMIT}). If {PROMETHEUS_METADATA_API_LIMIT} results returned, more series exist - use more specific selector. "
+ "Requires match[] parameter with PromQL selector. "
+ "By default returns series active in the last 1 hour (configurable via default_metadata_time_window_hrs)."
+ ),
+ parameters={
+ "match": ToolParameter(
+ description=(
+ "PromQL selector to match series (e.g., 'up', 'node_cpu_seconds_total', "
+ "'{__name__=~\"node.*\"}', '{job=\"prometheus\"}', "
+ '\'{__name__="up",job="prometheus"}\').'
+ ),
+ type="string",
+ required=True,
+ ),
+ "start": ToolParameter(
+ description="Start timestamp (RFC3339 or Unix). Default: 1 hour ago",
+ type="string",
+ required=False,
+ ),
+ "end": ToolParameter(
+ description="End timestamp (RFC3339 or Unix). Default: now",
+ type="string",
+ required=False,
+ ),
+ },
+ toolset=toolset,
+ )
+
+ def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
+ if not self.toolset.config or not self.toolset.config.prometheus_url:
+ return StructuredToolResult(
+ status=StructuredToolResultStatus.ERROR,
+ error="Prometheus is not configured. Prometheus URL is missing",
+ params=params,
+ )
+ try:
+ match = params.get("match")
+ if not match:
+ return StructuredToolResult(
+ status=StructuredToolResultStatus.ERROR,
+ error="Match parameter is required",
+ params=params,
  )
- output.append(
- f"{metric} | {info['description']} | {info['type']} | {labels_str}"
+
+ url = urljoin(self.toolset.config.prometheus_url, "api/v1/series")
+ query_params = {
+ "match[]": match,
+ "limit": str(PROMETHEUS_METADATA_API_LIMIT),
+ }
+
+ # Add time parameters - use provided values or defaults
+ if params.get("end"):
+ query_params["end"] = params["end"]
+ else:
+ query_params["end"] = str(int(time.time()))
+
+ if params.get("start"):
+ query_params["start"] = params["start"]
+ elif self.toolset.config.discover_metrics_from_last_hours:
+ # Use default time window
+ query_params["start"] = str(
+ int(time.time())
+ - (self.toolset.config.discover_metrics_from_last_hours * 3600)
  )

- table_output = "\n".join(output)
+ response = do_request(
+ config=self.toolset.config,
+ url=url,
+ params=query_params,
+ timeout=self.toolset.config.metadata_timeout_seconds_default,
+ verify=self.toolset.config.verify_ssl,
+ headers=self.toolset.config.headers,
+ method="GET",
+ )
+ response.raise_for_status()
+ data = response.json()
+
+ # Check if results were truncated
+ if (
+ "data" in data
+ and isinstance(data["data"], list)
+ and len(data["data"]) == PROMETHEUS_METADATA_API_LIMIT
+ ):
+ data["_truncated"] = True
+ data["_message"] = (
+ f"Results truncated at limit={PROMETHEUS_METADATA_API_LIMIT}. Use a more specific match selector to see additional series."
+ )
+
+ return StructuredToolResult(
+ status=StructuredToolResultStatus.SUCCESS,
+ data=data,
+ params=params,
+ )
+ except Exception as e:
  return StructuredToolResult(
- status=ToolResultStatus.SUCCESS,
- data=table_output,
+ status=StructuredToolResultStatus.ERROR,
+ error=str(e),
  params=params,
  )

- except requests.Timeout:
- logging.warn("Timeout while fetching prometheus metrics", exc_info=True)
+ def get_parameterized_one_liner(self, params) -> str:
+ return f"{toolset_name_for_one_liner(self.toolset.name)}: Get Series"
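For comparison with the cheaper label endpoints, a minimal sketch of the /api/v1/series call that get_series wraps, again with plain requests against an assumed local server (URL and limit illustrative):

import requests

resp = requests.get(
    "http://localhost:9090/api/v1/series",  # assumed endpoint for illustration
    params={"match[]": '{__name__=~"node.*"}', "limit": "1000"},
    timeout=10,
)
resp.raise_for_status()
for series in resp.json()["data"][:3]:
    print(series)  # full label set per series, e.g. {'__name__': 'node_load1', 'instance': ...}

This is the endpoint the tool description flags as SLOWER: the server must materialize every matching label set, so a tight match[] selector matters more here than elsewhere.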
+
+
+ class GetMetricMetadata(BasePrometheusTool):
+ """Get metadata (type, description, unit) for metrics"""
+
+ def __init__(self, toolset: "PrometheusToolset"):
+ super().__init__(
+ name="get_metric_metadata",
+ description=(
+ "Get metric metadata using /api/v1/metadata. "
+ "Returns type, help text, and unit for metrics. "
+ "Use after discovering metric names to get their descriptions. "
+ f"Returns up to {PROMETHEUS_METADATA_API_LIMIT} metrics (limit={PROMETHEUS_METADATA_API_LIMIT}). If {PROMETHEUS_METADATA_API_LIMIT} results returned, more may exist - filter by specific metric name. "
+ "Supports optional metric name filter."
+ ),
+ parameters={
+ "metric": ToolParameter(
+ description=(
+ "Optional metric name to filter (e.g., 'up', 'node_cpu_seconds_total'). "
+ "If not provided, returns metadata for all metrics."
+ ),
+ type="string",
+ required=False,
+ ),
+ },
+ toolset=toolset,
+ )
+
+ def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
+ if not self.toolset.config or not self.toolset.config.prometheus_url:
  return StructuredToolResult(
- status=ToolResultStatus.ERROR,
- error="Request timed out while fetching metrics",
+ status=StructuredToolResultStatus.ERROR,
+ error="Prometheus is not configured. Prometheus URL is missing",
  params=params,
  )
- except RequestException as e:
- logging.warn("Failed to fetch prometheus metrics", exc_info=True)
+ try:
+ url = urljoin(self.toolset.config.prometheus_url, "api/v1/metadata")
+ query_params = {"limit": str(PROMETHEUS_METADATA_API_LIMIT)}
+
+ if params.get("metric"):
+ query_params["metric"] = params["metric"]
+
+ response = do_request(
+ config=self.toolset.config,
+ url=url,
+ params=query_params,
+ timeout=self.toolset.config.metadata_timeout_seconds_default,
+ verify=self.toolset.config.verify_ssl,
+ headers=self.toolset.config.headers,
+ method="GET",
+ )
+ response.raise_for_status()
+ data = response.json()
+
+ # Check if results were truncated (metadata endpoint returns a dict, not a list)
+ if (
+ "data" in data
+ and isinstance(data["data"], dict)
+ and len(data["data"]) == PROMETHEUS_METADATA_API_LIMIT
+ ):
+ data["_truncated"] = True
+ data["_message"] = (
+ f"Results truncated at limit={PROMETHEUS_METADATA_API_LIMIT}. Use metric parameter to filter by specific metric name."
+ )
+
  return StructuredToolResult(
- status=ToolResultStatus.ERROR,
- error=f"Network error while fetching metrics: {str(e)}",
+ status=StructuredToolResultStatus.SUCCESS,
+ data=data,
  params=params,
  )
  except Exception as e:
- logging.warn("Failed to process prometheus metrics", exc_info=True)
  return StructuredToolResult(
- status=ToolResultStatus.ERROR,
- error=f"Unexpected error: {str(e)}",
+ status=StructuredToolResultStatus.ERROR,
+ error=str(e),
  params=params,
  )

  def get_parameterized_one_liner(self, params) -> str:
- name_filter = params.get("name_filter", "")
- return f"{toolset_name_for_one_liner(self.toolset.name)}: Search Metrics ({name_filter})"
+ metric = params.get("metric", "all")
+ return (
+ f"{toolset_name_for_one_liner(self.toolset.name)}: Get Metadata ({metric})"
+ )
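Taken together, the five discovery tools that replace list_available_metrics mirror a typical manual workflow against the Prometheus HTTP API. A rough sketch of that chain with plain requests against an assumed local server (the package itself routes these through do_request with auth headers and configured timeouts):

import requests

base = "http://localhost:9090/api/v1"  # illustrative target

# 1. discover metric names (what get_metric_names wraps)
names = requests.get(f"{base}/label/__name__/values", timeout=10).json()["data"]

# 2. fetch type and help text for one of them (what get_metric_metadata wraps)
meta = requests.get(f"{base}/metadata", params={"metric": names[0]}, timeout=10).json()

# 3. only then run an actual PromQL query against it
result = requests.get(f"{base}/query", params={"query": names[0]}, timeout=10).json()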


  class ExecuteInstantQuery(BasePrometheusTool):
  def __init__(self, toolset: "PrometheusToolset"):
  super().__init__(
  name="execute_prometheus_instant_query",
- description="Execute an instant PromQL query",
+ description=(
+ f"Execute an instant PromQL query (single point in time). "
+ f"Default timeout is {DEFAULT_QUERY_TIMEOUT_SECONDS} seconds "
+ f"but can be increased up to {MAX_QUERY_TIMEOUT_SECONDS} seconds for complex/slow queries."
+ ),
  parameters={
  "query": ToolParameter(
  description="The PromQL query",
@@ -694,16 +1289,23 @@ class ExecuteInstantQuery(BasePrometheusTool):
  type="string",
  required=True,
  ),
+ "timeout": ToolParameter(
+ description=(
+ f"Query timeout in seconds. Default: {DEFAULT_QUERY_TIMEOUT_SECONDS}. "
+ f"Maximum: {MAX_QUERY_TIMEOUT_SECONDS}. "
+ f"Increase for complex queries that may take longer."
+ ),
+ type="number",
+ required=False,
+ ),
  },
  toolset=toolset,
  )

- def _invoke(
- self, params: dict, user_approved: bool = False
- ) -> StructuredToolResult:
+ def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
  if not self.toolset.config or not self.toolset.config.prometheus_url:
  return StructuredToolResult(
- status=ToolResultStatus.ERROR,
+ status=StructuredToolResultStatus.ERROR,
  error="Prometheus is not configured. Prometheus URL is missing",
  params=params,
  )
@@ -715,13 +1317,25 @@ class ExecuteInstantQuery(BasePrometheusTool):

  payload = {"query": query}

+ # Get timeout parameter and enforce limits
+ default_timeout = self.toolset.config.query_timeout_seconds_default
+ max_timeout = self.toolset.config.query_timeout_seconds_hard_max
+ timeout = params.get("timeout", default_timeout)
+ if timeout > max_timeout:
+ timeout = max_timeout
+ logging.warning(
+ f"Timeout requested ({params.get('timeout')}) exceeds maximum ({max_timeout}s), using {max_timeout}s"
+ )
+ elif timeout < 1:
+ timeout = default_timeout # Min 1 second, but use default if invalid
+
  response = do_request(
  config=self.toolset.config,
  url=url,
  headers=self.toolset.config.headers,
  data=payload,
- timeout=60,
- verify=self.toolset.config.prometheus_ssl_enabled,
+ timeout=timeout,
+ verify=self.toolset.config.verify_ssl,
  method="POST",
  )
@@ -734,24 +1348,68 @@ class ExecuteInstantQuery(BasePrometheusTool):
  error_message = (
  "The prometheus query returned no result. Is the query correct?"
  )
- response_data = {
- "status": status,
- "error_message": error_message,
- "random_key": generate_random_key(),
- "tool_name": self.name,
- "description": description,
- "query": query,
- }
-
+ response_data = MetricsBasedResponse(
+ status=status,
+ error_message=error_message,
+ tool_name=self.name,
+ description=description,
+ query=query,
+ )
+ structured_tool_result: StructuredToolResult
+ # Check if data should be included based on size
  if self.toolset.config.tool_calls_return_data:
- response_data["data"] = data.get("data")
+ result_data = data.get("data", {})
+ response_data.data = result_data

- data_str = json.dumps(response_data, indent=2)
- return StructuredToolResult(
- status=ToolResultStatus.SUCCESS,
- data=data_str,
- params=params,
+ structured_tool_result = create_structured_tool_result(
+ params=params, response=response_data
+ )
+ tool_call_id = context.tool_call_id
+ tool_name = context.tool_name
+ token_count = count_tool_response_tokens(
+ llm=context.llm,
+ structured_tool_result=structured_tool_result,
+ tool_call_id=tool_call_id,
+ tool_name=tool_name,
+ )
+
+ token_limit = context.max_token_count
+ if self.toolset.config.query_response_size_limit_pct:
+ custom_token_limit = get_pct_token_count(
+ percent_of_total_context_window=self.toolset.config.query_response_size_limit_pct,
+ llm=context.llm,
+ )
+ if custom_token_limit < token_limit:
+ token_limit = custom_token_limit
+
+ # Provide summary if data is too large
+ if token_count > token_limit:
+ response_data.data = None
+ response_data.data_summary = (
+ create_data_summary_for_large_result(
+ result_data,
+ query,
+ token_count,
+ is_range_query=False,
+ )
+ )
+ logging.info(
+ f"Prometheus instant query returned large dataset: "
+ f"{response_data.data_summary.get('result_count', 0)} results, "
+ f"{token_count:,} tokens (limit: {token_limit:,}). "
+ f"Returning summary instead of full data."
+ )
+ # Also add token info to the summary for debugging
+ response_data.data_summary["_debug_info"] = (
+ f"Data size: {token_count:,} tokens exceeded limit of {token_limit:,} tokens"
+ )
+ else:
+ response_data.data = result_data
+
+ structured_tool_result = create_structured_tool_result(
+ params=params, response=response_data
  )
+ return structured_tool_result

  # Handle known Prometheus error status codes
  error_msg = "Unknown error occurred"
@@ -764,29 +1422,36 @@ class ExecuteInstantQuery(BasePrometheusTool):
  except json.JSONDecodeError:
  pass
  return StructuredToolResult(
- status=ToolResultStatus.ERROR,
+ status=StructuredToolResultStatus.ERROR,
  error=f"Query execution failed. HTTP {response.status_code}: {error_msg}",
  params=params,
  )

  # For other status codes, just return the status code and content
  return StructuredToolResult(
- status=ToolResultStatus.ERROR,
+ status=StructuredToolResultStatus.ERROR,
  error=f"Query execution failed with unexpected status code: {response.status_code}. Response: {str(response.content)}",
  params=params,
  )

+ except SSLError as e:
+ logging.warning("SSL error while executing Prometheus query", exc_info=True)
+ return StructuredToolResult(
+ status=StructuredToolResultStatus.ERROR,
+ error=format_ssl_error_message(self.toolset.config.prometheus_url, e),
+ params=params,
+ )
  except RequestException as e:
  logging.info("Failed to connect to Prometheus", exc_info=True)
  return StructuredToolResult(
- status=ToolResultStatus.ERROR,
+ status=StructuredToolResultStatus.ERROR,
  error=f"Connection error to Prometheus: {str(e)}",
  params=params,
  )
  except Exception as e:
  logging.info("Failed to connect to Prometheus", exc_info=True)
  return StructuredToolResult(
- status=ToolResultStatus.ERROR,
+ status=StructuredToolResultStatus.ERROR,
  error=f"Unexpected error executing query: {str(e)}",
  params=params,
  )
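Both query tools clamp the caller-supplied timeout the same way. A self-contained restatement of that rule, assuming the defaults shown in the example config further down (20s default, 180s hard maximum); the helper name is illustrative, not the package's:

from typing import Optional

def clamp_timeout(requested: Optional[float], default: float, hard_max: float) -> float:
    """Cap requested timeouts at the hard maximum; fall back to the
    default for missing or non-positive values."""
    timeout = requested if requested is not None else default
    if timeout > hard_max:
        return hard_max
    if timeout < 1:
        return default
    return timeout

assert clamp_timeout(None, 20, 180) == 20   # no request -> default
assert clamp_timeout(600, 20, 180) == 180   # over the cap -> hard max
assert clamp_timeout(0, 20, 180) == 20      # invalid -> default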
@@ -800,7 +1465,12 @@ class ExecuteRangeQuery(BasePrometheusTool):
  def __init__(self, toolset: "PrometheusToolset"):
  super().__init__(
  name="execute_prometheus_range_query",
- description="Generates a graph and Execute a PromQL range query",
+ description=(
+ f"Generates a graph and executes a PromQL range query. "
+ f"Default timeout is {DEFAULT_QUERY_TIMEOUT_SECONDS} seconds "
+ f"but can be increased up to {MAX_QUERY_TIMEOUT_SECONDS} seconds for complex/slow queries. "
+ f"Default time range is last 1 hour."
+ ),
  parameters={
  "query": ToolParameter(
  description="The PromQL query",
@@ -827,23 +1497,40 @@ class ExecuteRangeQuery(BasePrometheusTool):
  "step": ToolParameter(
  description="Query resolution step width in duration format or float number of seconds",
  type="number",
- required=True,
+ required=False,
  ),
  "output_type": ToolParameter(
  description="Specifies how to interpret the Prometheus result. Use 'Plain' for raw values, 'Bytes' to format byte values, 'Percentage' to scale 0–1 values into 0–100%, or 'CPUUsage' to convert values to cores (e.g., 500 becomes 500m, 2000 becomes 2).",
  type="string",
  required=True,
  ),
+ "timeout": ToolParameter(
+ description=(
+ f"Query timeout in seconds. Default: {DEFAULT_QUERY_TIMEOUT_SECONDS}. "
+ f"Maximum: {MAX_QUERY_TIMEOUT_SECONDS}. "
+ f"Increase for complex queries that may take longer."
+ ),
+ type="number",
+ required=False,
+ ),
+ "max_points": ToolParameter(
+ description=(
+ f"Maximum number of data points to return. Default: {int(MAX_GRAPH_POINTS)}. "
+ f"Can be reduced to get fewer data points (e.g., 50 for simpler graphs). "
+ f"Cannot exceed system limit of {int(MAX_GRAPH_POINTS)}. "
+ f"If your query would return more points than this limit, the step will be automatically adjusted."
+ ),
+ type="number",
+ required=False,
+ ),
  },
  toolset=toolset,
  )

- def _invoke(
- self, params: dict, user_approved: bool = False
- ) -> StructuredToolResult:
+ def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
  if not self.toolset.config or not self.toolset.config.prometheus_url:
  return StructuredToolResult(
- status=ToolResultStatus.ERROR,
+ status=StructuredToolResultStatus.ERROR,
  error="Prometheus is not configured. Prometheus URL is missing",
  params=params,
  )
@@ -857,12 +1544,17 @@ class ExecuteRangeQuery(BasePrometheusTool):
  end_timestamp=params.get("end"),
  default_time_span_seconds=DEFAULT_GRAPH_TIME_SPAN_SECONDS,
  )
- step = params.get("step", "")
+ step = parse_duration_to_seconds(params.get("step"))
+ max_points = params.get(
+ "max_points"
+ ) # Get the optional max_points parameter

+ # adjust_step_for_max_points handles None case and converts to float
  step = adjust_step_for_max_points(
  start_timestamp=start,
  end_timestamp=end,
- step=float(step) if step else MAX_GRAPH_POINTS,
+ step=step,
+ max_points_override=max_points,
  )

  description = params.get("description", "")
@@ -874,13 +1566,25 @@ class ExecuteRangeQuery(BasePrometheusTool):
  "step": step,
  }

+ # Get timeout parameter and enforce limits
+ default_timeout = self.toolset.config.query_timeout_seconds_default
+ max_timeout = self.toolset.config.query_timeout_seconds_hard_max
+ timeout = params.get("timeout", default_timeout)
+ if timeout > max_timeout:
+ timeout = max_timeout
+ logging.warning(
+ f"Timeout requested ({params.get('timeout')}) exceeds maximum ({max_timeout}s), using {max_timeout}s"
+ )
+ elif timeout < 1:
+ timeout = default_timeout # Min 1 second, but use default if invalid
+
  response = do_request(
  config=self.toolset.config,
  url=url,
  headers=self.toolset.config.headers,
  data=payload,
- timeout=120,
- verify=self.toolset.config.prometheus_ssl_enabled,
+ timeout=timeout,
+ verify=self.toolset.config.verify_ssl,
  method="POST",
  )
@@ -893,29 +1597,73 @@ class ExecuteRangeQuery(BasePrometheusTool):
  error_message = (
  "The prometheus query returned no result. Is the query correct?"
  )
- response_data = {
- "status": status,
- "error_message": error_message,
- "random_key": generate_random_key(),
- "tool_name": self.name,
- "description": description,
- "query": query,
- "start": start,
- "end": end,
- "step": step,
- "output_type": output_type,
- }
+ response_data = MetricsBasedResponse(
+ status=status,
+ error_message=error_message,
+ tool_name=self.name,
+ description=description,
+ query=query,
+ start=start,
+ end=end,
+ step=step,
+ output_type=output_type,
+ )
+
+ structured_tool_result: StructuredToolResult

+ # Check if data should be included based on size
  if self.toolset.config.tool_calls_return_data:
- response_data["data"] = data.get("data")
- data_str = json.dumps(response_data, indent=2)
+ result_data = data.get("data", {})
+ response_data.data = result_data
+ structured_tool_result = create_structured_tool_result(
+ params=params, response=response_data
+ )

- return StructuredToolResult(
- status=ToolResultStatus.SUCCESS,
- data=data_str,
- params=params,
+ tool_call_id = context.tool_call_id
+ tool_name = context.tool_name
+ token_count = count_tool_response_tokens(
+ llm=context.llm,
+ structured_tool_result=structured_tool_result,
+ tool_call_id=tool_call_id,
+ tool_name=tool_name,
+ )
+
+ token_limit = context.max_token_count
+ if self.toolset.config.query_response_size_limit_pct:
+ custom_token_limit = get_pct_token_count(
+ percent_of_total_context_window=self.toolset.config.query_response_size_limit_pct,
+ llm=context.llm,
+ )
+ if custom_token_limit < token_limit:
+ token_limit = custom_token_limit
+
+ # Provide summary if data is too large
+ if token_count > token_limit:
+ response_data.data = None
+ response_data.data_summary = (
+ create_data_summary_for_large_result(
+ result_data, query, token_count, is_range_query=True
+ )
+ )
+ logging.info(
+ f"Prometheus range query returned large dataset: "
+ f"{response_data.data_summary.get('series_count', 0)} series, "
+ f"{token_count:,} tokens (limit: {token_limit:,}). "
+ f"Returning summary instead of full data."
+ )
+ # Also add token info to the summary for debugging
+ response_data.data_summary["_debug_info"] = (
+ f"Data size: {token_count:,} tokens exceeded limit of {token_limit:,} tokens"
+ )
+ else:
+ response_data.data = result_data
+
+ structured_tool_result = create_structured_tool_result(
+ params=params, response=response_data
  )

+ return structured_tool_result
+
  error_msg = "Unknown error occurred"
  if response.status_code in [400, 429]:
  try:
@@ -926,28 +1674,37 @@ class ExecuteRangeQuery(BasePrometheusTool):
  except json.JSONDecodeError:
  pass
  return StructuredToolResult(
- status=ToolResultStatus.ERROR,
+ status=StructuredToolResultStatus.ERROR,
  error=f"Query execution failed. HTTP {response.status_code}: {error_msg}",
  params=params,
  )

  return StructuredToolResult(
- status=ToolResultStatus.ERROR,
+ status=StructuredToolResultStatus.ERROR,
  error=f"Query execution failed with unexpected status code: {response.status_code}. Response: {str(response.content)}",
  params=params,
  )

+ except SSLError as e:
+ logging.warning(
+ "SSL error while executing Prometheus range query", exc_info=True
+ )
+ return StructuredToolResult(
+ status=StructuredToolResultStatus.ERROR,
+ error=format_ssl_error_message(self.toolset.config.prometheus_url, e),
+ params=params,
+ )
  except RequestException as e:
  logging.info("Failed to connect to Prometheus", exc_info=True)
  return StructuredToolResult(
- status=ToolResultStatus.ERROR,
+ status=StructuredToolResultStatus.ERROR,
  error=f"Connection error to Prometheus: {str(e)}",
  params=params,
  )
  except Exception as e:
  logging.info("Failed to connect to Prometheus", exc_info=True)
  return StructuredToolResult(
- status=ToolResultStatus.ERROR,
+ status=StructuredToolResultStatus.ERROR,
  error=f"Unexpected error executing query: {str(e)}",
  params=params,
  )
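The size guard added to both query tools reduces to one rule: the effective token budget is the per-tool limit from the invoke context, optionally tightened by query_response_size_limit_pct of the model's context window, and results over budget are replaced by a summary. A minimal restatement (names illustrative):

from typing import Optional

def fit_or_summarize(payload_tokens: int, context_limit: int,
                     pct_limit: Optional[int]) -> str:
    """Return 'summary' when the tool response would blow the budget,
    otherwise 'full data'."""
    budget = context_limit
    if pct_limit is not None:
        budget = min(budget, pct_limit)  # the stricter limit wins
    return "summary" if payload_tokens > budget else "full data"

assert fit_or_summarize(5_000, 8_000, None) == "full data"
assert fit_or_summarize(5_000, 8_000, 2_000) == "summary"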
@@ -958,7 +1715,7 @@ class ExecuteRangeQuery(BasePrometheusTool):


  class PrometheusToolset(Toolset):
- config: Optional[Union[PrometheusConfig, AMPConfig]] = None
+ config: Optional[Union[PrometheusConfig, AMPConfig, AzurePrometheusConfig]] = None

  def __init__(self):
  super().__init__(
@@ -969,7 +1726,11 @@ class PrometheusToolset(Toolset):
  prerequisites=[CallablePrerequisite(callable=self.prerequisites_callable)],
  tools=[
  ListPrometheusRules(toolset=self),
- ListAvailableMetrics(toolset=self),
+ GetMetricNames(toolset=self),
+ GetLabelValues(toolset=self),
+ GetAllLabels(toolset=self),
+ GetSeries(toolset=self),
+ GetMetricMetadata(toolset=self),
  ExecuteInstantQuery(toolset=self),
  ExecuteRangeQuery(toolset=self),
  ],
@@ -987,16 +1748,36 @@ class PrometheusToolset(Toolset):

  def determine_prometheus_class(
  self, config: dict[str, Any]
- ) -> Type[Union[PrometheusConfig, AMPConfig]]:
+ ) -> Type[Union[PrometheusConfig, AMPConfig, AzurePrometheusConfig]]:
  has_aws_fields = "aws_region" in config
- return AMPConfig if has_aws_fields else PrometheusConfig
+ if has_aws_fields:
+ return AMPConfig
+
+ # Check for Azure config using static method
+ is_azure = AzurePrometheusConfig.is_azure_config(config)
+ if is_azure:
+ logging.info("Detected Azure Managed Prometheus configuration")
+ return AzurePrometheusConfig if is_azure else PrometheusConfig
+
+ def _disable_azure_incompatible_tools(self):
+ """
+ Azure Managed Prometheus does not support some APIs.
+ Remove unsupported tools.
+ """
+ incompatible = {
+ "get_label_values",
+ "get_metric_metadata",
+ "list_prometheus_rules",
+ }
+ self.tools = [t for t in self.tools if t.name not in incompatible]
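In effect, config-class selection is a three-way decision followed by tool pruning on Azure. A runnable sketch of that order (class names as strings for illustration; looks_like_azure is a stand-in, since AzurePrometheusConfig.is_azure_config's real criteria live in the package):

def looks_like_azure(config: dict) -> bool:
    # Stand-in for AzurePrometheusConfig.is_azure_config; here, any
    # azure_* key marks the config as Azure Managed Prometheus.
    return any(key.startswith("azure_") for key in config)

def pick_config_class(config: dict) -> str:
    """AWS fields win, then Azure detection, otherwise vanilla Prometheus."""
    if "aws_region" in config:
        return "AMPConfig"
    if looks_like_azure(config):
        return "AzurePrometheusConfig"
    return "PrometheusConfig"

assert pick_config_class({"aws_region": "us-east-1"}) == "AMPConfig"
assert pick_config_class({"prometheus_url": "http://prom:9090"}) == "PrometheusConfig"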

  def prerequisites_callable(self, config: dict[str, Any]) -> Tuple[bool, str]:
  try:
  if config:
  config_cls = self.determine_prometheus_class(config)
  self.config = config_cls(**config) # type: ignore
-
+ if isinstance(self.config, AzurePrometheusConfig):
+ self._disable_azure_incompatible_tools()
  self._reload_llm_instructions()
  return self._is_healthy()
  except Exception:
@@ -1041,14 +1822,14 @@ class PrometheusToolset(Toolset):
  f"Toolset {self.name} failed to initialize because prometheus is not configured correctly",
  )

- url = urljoin(self.config.prometheus_url, self.config.healthcheck)
+ url = urljoin(self.config.prometheus_url, "api/v1/query?query=up")
  try:
  response = do_request(
  config=self.config,
  url=url,
  headers=self.config.headers,
  timeout=10,
- verify=self.config.prometheus_ssl_enabled,
+ verify=self.config.verify_ssl,
  method="GET",
  )
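The health check now issues a trivial instant query ('up') instead of hitting a configurable healthcheck path, which also works on managed backends that lack the classic /-/healthy endpoint. A minimal sketch with plain requests (the package routes this through do_request with headers and SSL settings):

import requests

def prometheus_is_healthy(base_url: str) -> bool:
    """Probe a Prometheus-compatible endpoint with a cheap instant query."""
    resp = requests.get(
        f"{base_url.rstrip('/')}/api/v1/query",
        params={"query": "up"},
        timeout=10,
    )
    return resp.status_code == 200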
@@ -1060,13 +1841,8 @@ class PrometheusToolset(Toolset):
  f"Failed to connect to Prometheus at {url}: HTTP {response.status_code}",
  )

- except RequestException:
- return (
- False,
- f"Failed to initialize using url={url}",
- )
  except Exception as e:
- logging.exception("Failed to initialize Prometheus")
+ logging.debug("Failed to initialize Prometheus", exc_info=True)
  return (
  False,
  f"Failed to initialize using url={url}. Unexpected error: {str(e)}",
@@ -1074,6 +1850,11 @@ class PrometheusToolset(Toolset):

  def get_example_config(self):
  example_config = PrometheusConfig(
- prometheus_url="http://robusta-kube-prometheus-st-prometheus:9090"
+ prometheus_url="http://prometheus-server.monitoring.svc.cluster.local:9090",
+ headers={"Authorization": "Basic <base64_encoded_credentials>"},
+ discover_metrics_from_last_hours=1,
+ query_timeout_seconds_default=20,
+ query_timeout_seconds_hard_max=180,
+ verify_ssl=True,
  )
  return example_config.model_dump()
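As a usage note, get_example_config returns a plain dict via Pydantic's model_dump, so the richer example now surfaces the new tuning knobs directly. A sketch (the module path is an assumption based on the file layout, not confirmed by this diff):

from holmes.plugins.toolsets.prometheus.prometheus import PrometheusToolset  # assumed path

toolset = PrometheusToolset()
print(toolset.get_example_config())
# Expected keys, per the example above: prometheus_url, headers,
# discover_metrics_from_last_hours, query_timeout_seconds_default,
# query_timeout_seconds_hard_max, verify_ssl, ...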