holmesgpt 0.11.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of holmesgpt might be problematic. Click here for more details.

Files changed (183) hide show
  1. holmes/.git_archival.json +7 -0
  2. holmes/__init__.py +76 -0
  3. holmes/__init__.py.bak +76 -0
  4. holmes/clients/robusta_client.py +24 -0
  5. holmes/common/env_vars.py +47 -0
  6. holmes/config.py +526 -0
  7. holmes/core/__init__.py +0 -0
  8. holmes/core/conversations.py +578 -0
  9. holmes/core/investigation.py +152 -0
  10. holmes/core/investigation_structured_output.py +264 -0
  11. holmes/core/issue.py +54 -0
  12. holmes/core/llm.py +250 -0
  13. holmes/core/models.py +157 -0
  14. holmes/core/openai_formatting.py +51 -0
  15. holmes/core/performance_timing.py +72 -0
  16. holmes/core/prompt.py +42 -0
  17. holmes/core/resource_instruction.py +17 -0
  18. holmes/core/runbooks.py +26 -0
  19. holmes/core/safeguards.py +120 -0
  20. holmes/core/supabase_dal.py +540 -0
  21. holmes/core/tool_calling_llm.py +798 -0
  22. holmes/core/tools.py +566 -0
  23. holmes/core/tools_utils/__init__.py +0 -0
  24. holmes/core/tools_utils/tool_executor.py +65 -0
  25. holmes/core/tools_utils/toolset_utils.py +52 -0
  26. holmes/core/toolset_manager.py +418 -0
  27. holmes/interactive.py +229 -0
  28. holmes/main.py +1041 -0
  29. holmes/plugins/__init__.py +0 -0
  30. holmes/plugins/destinations/__init__.py +6 -0
  31. holmes/plugins/destinations/slack/__init__.py +2 -0
  32. holmes/plugins/destinations/slack/plugin.py +163 -0
  33. holmes/plugins/interfaces.py +32 -0
  34. holmes/plugins/prompts/__init__.py +48 -0
  35. holmes/plugins/prompts/_current_date_time.jinja2 +1 -0
  36. holmes/plugins/prompts/_default_log_prompt.jinja2 +11 -0
  37. holmes/plugins/prompts/_fetch_logs.jinja2 +36 -0
  38. holmes/plugins/prompts/_general_instructions.jinja2 +86 -0
  39. holmes/plugins/prompts/_global_instructions.jinja2 +12 -0
  40. holmes/plugins/prompts/_runbook_instructions.jinja2 +13 -0
  41. holmes/plugins/prompts/_toolsets_instructions.jinja2 +56 -0
  42. holmes/plugins/prompts/generic_ask.jinja2 +36 -0
  43. holmes/plugins/prompts/generic_ask_conversation.jinja2 +32 -0
  44. holmes/plugins/prompts/generic_ask_for_issue_conversation.jinja2 +50 -0
  45. holmes/plugins/prompts/generic_investigation.jinja2 +42 -0
  46. holmes/plugins/prompts/generic_post_processing.jinja2 +13 -0
  47. holmes/plugins/prompts/generic_ticket.jinja2 +12 -0
  48. holmes/plugins/prompts/investigation_output_format.jinja2 +32 -0
  49. holmes/plugins/prompts/kubernetes_workload_ask.jinja2 +84 -0
  50. holmes/plugins/prompts/kubernetes_workload_chat.jinja2 +39 -0
  51. holmes/plugins/runbooks/README.md +22 -0
  52. holmes/plugins/runbooks/__init__.py +100 -0
  53. holmes/plugins/runbooks/catalog.json +14 -0
  54. holmes/plugins/runbooks/jira.yaml +12 -0
  55. holmes/plugins/runbooks/kube-prometheus-stack.yaml +10 -0
  56. holmes/plugins/runbooks/networking/dns_troubleshooting_instructions.md +66 -0
  57. holmes/plugins/runbooks/upgrade/upgrade_troubleshooting_instructions.md +44 -0
  58. holmes/plugins/sources/github/__init__.py +77 -0
  59. holmes/plugins/sources/jira/__init__.py +123 -0
  60. holmes/plugins/sources/opsgenie/__init__.py +93 -0
  61. holmes/plugins/sources/pagerduty/__init__.py +147 -0
  62. holmes/plugins/sources/prometheus/__init__.py +0 -0
  63. holmes/plugins/sources/prometheus/models.py +104 -0
  64. holmes/plugins/sources/prometheus/plugin.py +154 -0
  65. holmes/plugins/toolsets/__init__.py +171 -0
  66. holmes/plugins/toolsets/aks-node-health.yaml +65 -0
  67. holmes/plugins/toolsets/aks.yaml +86 -0
  68. holmes/plugins/toolsets/argocd.yaml +70 -0
  69. holmes/plugins/toolsets/atlas_mongodb/instructions.jinja2 +8 -0
  70. holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +307 -0
  71. holmes/plugins/toolsets/aws.yaml +76 -0
  72. holmes/plugins/toolsets/azure_sql/__init__.py +0 -0
  73. holmes/plugins/toolsets/azure_sql/apis/alert_monitoring_api.py +600 -0
  74. holmes/plugins/toolsets/azure_sql/apis/azure_sql_api.py +309 -0
  75. holmes/plugins/toolsets/azure_sql/apis/connection_failure_api.py +445 -0
  76. holmes/plugins/toolsets/azure_sql/apis/connection_monitoring_api.py +251 -0
  77. holmes/plugins/toolsets/azure_sql/apis/storage_analysis_api.py +317 -0
  78. holmes/plugins/toolsets/azure_sql/azure_base_toolset.py +55 -0
  79. holmes/plugins/toolsets/azure_sql/azure_sql_instructions.jinja2 +137 -0
  80. holmes/plugins/toolsets/azure_sql/azure_sql_toolset.py +183 -0
  81. holmes/plugins/toolsets/azure_sql/install.md +66 -0
  82. holmes/plugins/toolsets/azure_sql/tools/__init__.py +1 -0
  83. holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +324 -0
  84. holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +243 -0
  85. holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +205 -0
  86. holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +249 -0
  87. holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +373 -0
  88. holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +237 -0
  89. holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +172 -0
  90. holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +170 -0
  91. holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +188 -0
  92. holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +180 -0
  93. holmes/plugins/toolsets/azure_sql/utils.py +83 -0
  94. holmes/plugins/toolsets/bash/__init__.py +0 -0
  95. holmes/plugins/toolsets/bash/bash_instructions.jinja2 +14 -0
  96. holmes/plugins/toolsets/bash/bash_toolset.py +208 -0
  97. holmes/plugins/toolsets/bash/common/bash.py +52 -0
  98. holmes/plugins/toolsets/bash/common/config.py +14 -0
  99. holmes/plugins/toolsets/bash/common/stringify.py +25 -0
  100. holmes/plugins/toolsets/bash/common/validators.py +24 -0
  101. holmes/plugins/toolsets/bash/grep/__init__.py +52 -0
  102. holmes/plugins/toolsets/bash/kubectl/__init__.py +100 -0
  103. holmes/plugins/toolsets/bash/kubectl/constants.py +96 -0
  104. holmes/plugins/toolsets/bash/kubectl/kubectl_describe.py +66 -0
  105. holmes/plugins/toolsets/bash/kubectl/kubectl_events.py +88 -0
  106. holmes/plugins/toolsets/bash/kubectl/kubectl_get.py +108 -0
  107. holmes/plugins/toolsets/bash/kubectl/kubectl_logs.py +20 -0
  108. holmes/plugins/toolsets/bash/kubectl/kubectl_run.py +46 -0
  109. holmes/plugins/toolsets/bash/kubectl/kubectl_top.py +81 -0
  110. holmes/plugins/toolsets/bash/parse_command.py +103 -0
  111. holmes/plugins/toolsets/confluence.yaml +19 -0
  112. holmes/plugins/toolsets/consts.py +5 -0
  113. holmes/plugins/toolsets/coralogix/api.py +158 -0
  114. holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +103 -0
  115. holmes/plugins/toolsets/coralogix/utils.py +181 -0
  116. holmes/plugins/toolsets/datadog.py +153 -0
  117. holmes/plugins/toolsets/docker.yaml +46 -0
  118. holmes/plugins/toolsets/git.py +756 -0
  119. holmes/plugins/toolsets/grafana/__init__.py +0 -0
  120. holmes/plugins/toolsets/grafana/base_grafana_toolset.py +54 -0
  121. holmes/plugins/toolsets/grafana/common.py +68 -0
  122. holmes/plugins/toolsets/grafana/grafana_api.py +31 -0
  123. holmes/plugins/toolsets/grafana/loki_api.py +89 -0
  124. holmes/plugins/toolsets/grafana/tempo_api.py +124 -0
  125. holmes/plugins/toolsets/grafana/toolset_grafana.py +102 -0
  126. holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +102 -0
  127. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.jinja2 +10 -0
  128. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +299 -0
  129. holmes/plugins/toolsets/grafana/trace_parser.py +195 -0
  130. holmes/plugins/toolsets/helm.yaml +42 -0
  131. holmes/plugins/toolsets/internet/internet.py +275 -0
  132. holmes/plugins/toolsets/internet/notion.py +137 -0
  133. holmes/plugins/toolsets/kafka.py +638 -0
  134. holmes/plugins/toolsets/kubernetes.yaml +255 -0
  135. holmes/plugins/toolsets/kubernetes_logs.py +426 -0
  136. holmes/plugins/toolsets/kubernetes_logs.yaml +42 -0
  137. holmes/plugins/toolsets/logging_utils/__init__.py +0 -0
  138. holmes/plugins/toolsets/logging_utils/logging_api.py +217 -0
  139. holmes/plugins/toolsets/logging_utils/types.py +0 -0
  140. holmes/plugins/toolsets/mcp/toolset_mcp.py +135 -0
  141. holmes/plugins/toolsets/newrelic.py +222 -0
  142. holmes/plugins/toolsets/opensearch/__init__.py +0 -0
  143. holmes/plugins/toolsets/opensearch/opensearch.py +245 -0
  144. holmes/plugins/toolsets/opensearch/opensearch_logs.py +151 -0
  145. holmes/plugins/toolsets/opensearch/opensearch_traces.py +211 -0
  146. holmes/plugins/toolsets/opensearch/opensearch_traces_instructions.jinja2 +12 -0
  147. holmes/plugins/toolsets/opensearch/opensearch_utils.py +166 -0
  148. holmes/plugins/toolsets/prometheus/prometheus.py +818 -0
  149. holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2 +38 -0
  150. holmes/plugins/toolsets/rabbitmq/api.py +398 -0
  151. holmes/plugins/toolsets/rabbitmq/rabbitmq_instructions.jinja2 +37 -0
  152. holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +222 -0
  153. holmes/plugins/toolsets/robusta/__init__.py +0 -0
  154. holmes/plugins/toolsets/robusta/robusta.py +235 -0
  155. holmes/plugins/toolsets/robusta/robusta_instructions.jinja2 +24 -0
  156. holmes/plugins/toolsets/runbook/__init__.py +0 -0
  157. holmes/plugins/toolsets/runbook/runbook_fetcher.py +78 -0
  158. holmes/plugins/toolsets/service_discovery.py +92 -0
  159. holmes/plugins/toolsets/servicenow/install.md +37 -0
  160. holmes/plugins/toolsets/servicenow/instructions.jinja2 +3 -0
  161. holmes/plugins/toolsets/servicenow/servicenow.py +198 -0
  162. holmes/plugins/toolsets/slab.yaml +20 -0
  163. holmes/plugins/toolsets/utils.py +137 -0
  164. holmes/plugins/utils.py +14 -0
  165. holmes/utils/__init__.py +0 -0
  166. holmes/utils/cache.py +84 -0
  167. holmes/utils/cert_utils.py +40 -0
  168. holmes/utils/default_toolset_installation_guide.jinja2 +44 -0
  169. holmes/utils/definitions.py +13 -0
  170. holmes/utils/env.py +53 -0
  171. holmes/utils/file_utils.py +56 -0
  172. holmes/utils/global_instructions.py +20 -0
  173. holmes/utils/holmes_status.py +22 -0
  174. holmes/utils/holmes_sync_toolsets.py +80 -0
  175. holmes/utils/markdown_utils.py +55 -0
  176. holmes/utils/pydantic_utils.py +54 -0
  177. holmes/utils/robusta.py +10 -0
  178. holmes/utils/tags.py +97 -0
  179. holmesgpt-0.11.5.dist-info/LICENSE.txt +21 -0
  180. holmesgpt-0.11.5.dist-info/METADATA +400 -0
  181. holmesgpt-0.11.5.dist-info/RECORD +183 -0
  182. holmesgpt-0.11.5.dist-info/WHEEL +4 -0
  183. holmesgpt-0.11.5.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,818 @@
1
+ import json
2
+ import logging
3
+ import os
4
+ import random
5
+ import re
6
+ import string
7
+ import time
8
+ from typing import Any, Dict, List, Optional, Tuple, Union
9
+ from urllib.parse import urljoin
10
+
11
+ import requests # type: ignore
12
+ from pydantic import BaseModel, field_validator
13
+ from requests import RequestException
14
+
15
+ from holmes.core.tools import (
16
+ CallablePrerequisite,
17
+ StructuredToolResult,
18
+ Tool,
19
+ ToolParameter,
20
+ ToolResultStatus,
21
+ Toolset,
22
+ ToolsetTag,
23
+ )
24
+ from holmes.plugins.toolsets.consts import STANDARD_END_DATETIME_TOOL_PARAM_DESCRIPTION
25
+ from holmes.plugins.toolsets.service_discovery import PrometheusDiscovery
26
+ from holmes.plugins.toolsets.utils import (
27
+ get_param_or_raise,
28
+ process_timestamps_to_rfc3339,
29
+ standard_start_datetime_tool_param_description,
30
+ )
31
+ from holmes.utils.cache import TTLCache
32
+
33
# Cache key under which the full /api/v1/rules payload is stored (see ListPrometheusRules).
PROMETHEUS_RULES_CACHE_KEY = "cached_prometheus_rules"
# Default query window in seconds (1 hour) when a range query omits start/end.
DEFAULT_TIME_SPAN_SECONDS = 3600
35
+
36
+
37
class PrometheusConfig(BaseModel):
    """Configuration for the Prometheus toolset (validated by pydantic)."""

    # URL is optional because it can be set with an env var
    prometheus_url: Optional[str]
    # Path appended to prometheus_url for connectivity checks.
    healthcheck: str = "-/healthy"
    # Setting to None will remove the time window from the request for labels
    metrics_labels_time_window_hrs: Union[int, None] = 48
    # Setting to None will disable the cache
    metrics_labels_cache_duration_hrs: Union[int, None] = 12
    # Use /api/v1/labels (one call per metric) instead of /api/v1/series for labels.
    fetch_labels_with_labels_api: bool = False
    # Use /api/v1/series for metadata (it also yields labels) instead of /api/v1/metadata.
    fetch_metadata_with_series_api: bool = False
    # When False, the query tools omit the raw result payload from their output.
    tool_calls_return_data: bool = True
    # Extra HTTP headers (e.g. authentication) sent with every Prometheus request.
    headers: Dict = {}
    rules_cache_duration_seconds: Union[int, None] = 1800  # 30 minutes
    # Labels to attach in addition to those returned by Prometheus, if configured.
    additional_labels: Optional[Dict[str, str]] = None

    @field_validator("prometheus_url")
    def ensure_trailing_slash(cls, v: Optional[str]) -> Optional[str]:
        # urljoin drops the final path segment unless the base URL ends with "/".
        if v is not None and not v.endswith("/"):
            return v + "/"
        return v
57
+
58
+
59
class BasePrometheusTool(Tool):
    """Base class for Prometheus tools; holds a back-reference to the owning
    toolset so each tool can read the shared config (URL, headers, caching)."""

    toolset: "PrometheusToolset"
61
+
62
+
63
def generate_random_key():
    """Return a 4-character random alphanumeric key used to tag query results."""
    alphabet = string.ascii_letters + string.digits
    return "".join(random.choices(alphabet, k=4))
65
+
66
+
67
def filter_metrics_by_type(metrics: Dict, expected_type: str):
    """Keep only metrics whose declared type contains *expected_type*.

    Metrics with the unknown type marker "?" are always kept, since their
    real type could not be determined from the metadata.
    """
    selected = {}
    for name, data in metrics.items():
        declared = data.get("type", "")
        if expected_type in declared or declared == "?":
            selected[name] = data
    return selected
74
+
75
+
76
def filter_metrics_by_name(metrics: Dict, pattern: str) -> Dict:
    """Return the subset of *metrics* whose name matches the regex *pattern*
    anywhere in the string (re.search semantics, not a full match)."""
    matches = re.compile(pattern).search
    return {name: data for name, data in metrics.items() if matches(name)}
83
+
84
+
85
# Series-name suffixes that belong to histogram/summary sub-series rather than
# a metric of their own (e.g. "foo_bucket" -> parent metric "foo").
METRICS_SUFFIXES_TO_STRIP = ["_bucket", "_count", "_sum"]
86
+
87
+
88
def fetch_metadata(prometheus_url: str, headers: Optional[Dict]) -> Dict:
    """Fetch metric metadata from the Prometheus /api/v1/metadata endpoint.

    Args:
        prometheus_url: Base Prometheus URL (expected to end with "/").
        headers: Optional HTTP headers (e.g. auth) for the request.

    Returns:
        Mapping of metric name -> {"type", "description", "labels"}; "labels"
        is always an empty set here — labels are filled in by a separate call.

    Raises:
        requests.HTTPError: If Prometheus responds with an error status.
    """
    metadata_url = urljoin(prometheus_url, "api/v1/metadata")
    metadata_response = requests.get(
        metadata_url, headers=headers, timeout=60, verify=True
    )

    metadata_response.raise_for_status()

    metadata = metadata_response.json()["data"]

    metrics = {}
    for metric_name, meta_list in metadata.items():
        # Each metric maps to a list of metadata entries; only the first is used.
        if meta_list:
            metric_type = meta_list[0].get("type", "unknown")
            metric_description = meta_list[0].get("help", "unknown")
            metrics[metric_name] = {
                "type": metric_type,
                "description": metric_description,
                "labels": set(),
            }

    return metrics
110
+
111
+
112
def fetch_metadata_with_series_api(
    prometheus_url: str, metric_name: str, headers: Dict
) -> Dict:
    """Discover metrics (and their labels) via /api/v1/series.

    Unlike fetch_metadata, the series API does not return type/description,
    so both are set to the unknown marker "?"; it does, however, return the
    label sets directly. *metric_name* is matched as a substring regex.

    Returns:
        Mapping of metric name -> {"description": "?", "type": "?", "labels": set}.

    Raises:
        requests.HTTPError: If Prometheus responds with an error status.
    """
    url = urljoin(prometheus_url, "api/v1/series")
    params: Dict = {"match[]": f'{{__name__=~".*{metric_name}.*"}}', "limit": "10000"}

    response = requests.get(
        url, headers=headers, timeout=60, params=params, verify=True
    )
    response.raise_for_status()
    metrics = response.json()["data"]

    metadata: Dict = {}
    for metric_data in metrics:
        # NOTE: this deliberately shadows the *metric_name* parameter; from here
        # on it refers to the name of the series being processed.
        metric_name = metric_data.get("__name__")
        if not metric_name:
            continue

        metric = metadata.get(metric_name)
        if not metric:
            metric = {"description": "?", "type": "?", "labels": set()}
            metadata[metric_name] = metric

        # Every key of a series entry other than __name__ is a label name.
        labels = {k for k in metric_data.keys() if k != "__name__"}
        metric["labels"].update(labels)

    return metadata
139
+
140
+
141
def result_has_data(result: Dict) -> bool:
    """Return True when a Prometheus API response contains at least one result series."""
    payload = result.get("data", {})
    return len(payload.get("result", [])) > 0
146
+
147
+
148
def add_prometheus_auth(prometheus_auth_header: Optional[str]) -> Dict[str, Any]:
    """Build request headers carrying the Authorization entry, if one is configured.

    Returns an empty dict when *prometheus_auth_header* is None or empty.
    """
    if not prometheus_auth_header:
        return {}
    return {"Authorization": prometheus_auth_header}
153
+
154
+
155
def fetch_metrics_labels_with_series_api(
    prometheus_url: str,
    headers: Dict[str, str],
    cache: Optional[TTLCache],
    metrics_labels_time_window_hrs: Union[int, None],
    metric_name: str,
) -> dict:
    """Fetch label names for all metrics matching *metric_name* (substring regex)
    via one /api/v1/series call.

    This is a slow query. Takes 5+ seconds to run.

    When *cache* is provided, the whole result is cached under a key derived
    from *metric_name*. A non-None time window restricts the series lookup to
    the last N hours.

    Returns:
        Mapping of metric name -> set of label names (excluding __name__).
    """
    cache_key = f"metrics_labels_series_api:{metric_name}"
    if cache:
        cached_result = cache.get(cache_key)
        if cached_result:
            return cached_result

    series_url = urljoin(prometheus_url, "api/v1/series")
    params: dict = {"match[]": f'{{__name__=~".*{metric_name}.*"}}', "limit": "10000"}

    if metrics_labels_time_window_hrs is not None:
        params["end"] = int(time.time())
        params["start"] = params["end"] - (metrics_labels_time_window_hrs * 60 * 60)

    series_response = requests.get(
        url=series_url, headers=headers, params=params, timeout=60, verify=True
    )
    series_response.raise_for_status()
    series = series_response.json()["data"]

    metrics_labels: dict = {}
    for serie in series:
        # NOTE: shadows the *metric_name* parameter; cache_key was computed
        # from the original value before the loop, so caching is unaffected.
        metric_name = serie["__name__"]
        # Add all labels except __name__
        labels = {k for k in serie.keys() if k != "__name__"}
        if metric_name in metrics_labels:
            metrics_labels[metric_name].update(labels)
        else:
            metrics_labels[metric_name] = labels
    if cache:
        cache.set(cache_key, metrics_labels)

    return metrics_labels
195
+
196
+
197
def fetch_metrics_labels_with_labels_api(
    prometheus_url: str,
    cache: Optional["TTLCache"],
    metrics_labels_time_window_hrs: Union[int, None],
    metric_names: List[str],
    headers: Dict,
) -> dict:
    """Fetch label names for each metric via the Prometheus /api/v1/labels API.

    One HTTP request is issued per metric (unlike the series-API variant,
    which covers all matching metrics in a single call). Per-metric results
    are cached when *cache* is provided.

    Args:
        prometheus_url: Base Prometheus URL (expected to end with "/").
        cache: Optional TTL cache; on a hit the HTTP call for that metric is skipped.
        metrics_labels_time_window_hrs: When set, restricts the lookup to the
            last N hours; None sends the request without a time window.
        metric_names: Exact metric names to look up.
        headers: Extra HTTP headers (e.g. auth) sent with each request.

    Returns:
        Mapping of metric name -> set of label names (excluding __name__).

    Raises:
        requests.HTTPError: If Prometheus responds with an error status.
    """
    metrics_labels = {}

    for metric_name in metric_names:
        cache_key = f"metrics_labels_labels_api:{metric_name}"
        if cache:
            cached_result = cache.get(cache_key)
            if cached_result:
                metrics_labels[metric_name] = cached_result
                # BUG FIX: previously execution fell through and re-queried
                # Prometheus even on a cache hit, defeating the cache. Skip
                # the HTTP call for this metric instead.
                continue

        url = urljoin(prometheus_url, "api/v1/labels")
        params: dict = {
            "match[]": f'{{__name__="{metric_name}"}}',
        }
        if metrics_labels_time_window_hrs is not None:
            params["end"] = int(time.time())
            params["start"] = params["end"] - (metrics_labels_time_window_hrs * 60 * 60)

        response = requests.get(
            url=url, headers=headers, params=params, timeout=60, verify=True
        )
        response.raise_for_status()
        labels = response.json()["data"]
        filtered_labels = {label for label in labels if label != "__name__"}
        metrics_labels[metric_name] = filtered_labels

        if cache:
            cache.set(cache_key, filtered_labels)

    return metrics_labels
233
+
234
+
235
def fetch_metrics(
    prometheus_url: str,
    cache: Optional[TTLCache],
    metrics_labels_time_window_hrs: Union[int, None],
    metric_name: str,
    should_fetch_labels_with_labels_api: bool,
    should_fetch_metadata_with_series_api: bool,
    headers: Dict,
) -> dict:
    """Fetch metrics matching *metric_name* together with their labels.

    Orchestrates the lower-level helpers: metadata comes either from the
    series API (which also yields labels, so no second call is needed) or
    from the metadata API followed by a separate labels lookup via the
    labels API or the series API, depending on the boolean flags.

    Returns:
        Mapping of metric name -> {"type", "description", "labels"}.
    """
    metrics = None
    should_fetch_labels = True
    if should_fetch_metadata_with_series_api:
        metrics = fetch_metadata_with_series_api(
            prometheus_url=prometheus_url, metric_name=metric_name, headers=headers
        )
        should_fetch_labels = False  # series API returns the labels
    else:
        metrics = fetch_metadata(prometheus_url=prometheus_url, headers=headers)
        # The metadata API returns everything; narrow to the requested pattern.
        metrics = filter_metrics_by_name(metrics, metric_name)

    if should_fetch_labels:
        metrics_labels = {}
        if should_fetch_labels_with_labels_api:
            metrics_labels = fetch_metrics_labels_with_labels_api(
                prometheus_url=prometheus_url,
                cache=cache,
                metrics_labels_time_window_hrs=metrics_labels_time_window_hrs,
                metric_names=list(metrics.keys()),
                headers=headers,
            )
        else:
            metrics_labels = fetch_metrics_labels_with_series_api(
                prometheus_url=prometheus_url,
                cache=cache,
                metrics_labels_time_window_hrs=metrics_labels_time_window_hrs,
                metric_name=metric_name,
                headers=headers,
            )

        # Merge label sets into the metadata entries where available.
        for metric_name in metrics:
            if metric_name in metrics_labels:
                metrics[metric_name]["labels"] = metrics_labels[metric_name]

    return metrics
279
+
280
+
281
class ListPrometheusRules(BasePrometheusTool):
    """Tool that returns all Prometheus alerting/recording rules from
    /api/v1/rules, with an optional TTL cache in front of the endpoint."""

    def __init__(self, toolset: "PrometheusToolset"):
        super().__init__(
            name="list_prometheus_rules",
            description="List all defined prometheus rules. Will show the prometheus rules description, expression and annotations",
            parameters={},
            toolset=toolset,
        )
        # Lazily created in _invoke once the cache duration is known from config.
        self._cache = None

    def _invoke(self, params: Any) -> StructuredToolResult:
        """Fetch the rules (from cache when possible); map failures to ERROR results."""
        if not self.toolset.config or not self.toolset.config.prometheus_url:
            return StructuredToolResult(
                status=ToolResultStatus.ERROR,
                error="Prometheus is not configured. Prometheus URL is missing",
                params=params,
            )
        # A duration of None disables caching entirely.
        if not self._cache and self.toolset.config.rules_cache_duration_seconds:
            self._cache = TTLCache(self.toolset.config.rules_cache_duration_seconds)  # type: ignore
        try:
            if self._cache:
                cached_rules = self._cache.get(PROMETHEUS_RULES_CACHE_KEY)
                if cached_rules:
                    logging.debug("rules returned from cache")

                    return StructuredToolResult(
                        status=ToolResultStatus.SUCCESS,
                        data=cached_rules,
                        params=params,
                    )

            prometheus_url = self.toolset.config.prometheus_url

            rules_url = urljoin(prometheus_url, "api/v1/rules")

            # NOTE(review): the tool's invocation params (an empty dict per the
            # declared parameters) are forwarded as query params — confirm intended.
            rules_response = requests.get(
                url=rules_url,
                params=params,
                timeout=180,
                verify=True,
                headers=self.toolset.config.headers,
            )
            rules_response.raise_for_status()
            data = rules_response.json()["data"]

            if self._cache:
                self._cache.set(PROMETHEUS_RULES_CACHE_KEY, data)
            return StructuredToolResult(
                status=ToolResultStatus.SUCCESS,
                data=data,
                params=params,
            )
        except requests.Timeout:
            logging.warning("Timeout while fetching prometheus rules", exc_info=True)
            return StructuredToolResult(
                status=ToolResultStatus.ERROR,
                error="Request timed out while fetching rules",
                params=params,
            )
        except RequestException as e:
            logging.warning("Failed to fetch prometheus rules", exc_info=True)
            return StructuredToolResult(
                status=ToolResultStatus.ERROR,
                error=f"Network error while fetching rules: {str(e)}",
                params=params,
            )
        except Exception as e:
            logging.warning("Failed to process prometheus rules", exc_info=True)
            return StructuredToolResult(
                status=ToolResultStatus.ERROR,
                error=f"Unexpected error: {str(e)}",
                params=params,
            )

    def get_parameterized_one_liner(self, params) -> str:
        """Short human-readable description of this tool invocation."""
        return "list available prometheus rules"
357
+
358
+
359
class ListAvailableMetrics(BasePrometheusTool):
    """Tool that lists Prometheus metrics matching a name filter, including
    their type, description and label names, rendered as a pipe-separated table."""

    def __init__(self, toolset: "PrometheusToolset"):
        super().__init__(
            name="list_available_metrics",
            description="List all the available metrics to query from prometheus, including their types (counter, gauge, histogram, summary) and available labels.",
            parameters={
                "type_filter": ToolParameter(
                    description="Optional filter to only return a specific metric type. Can be one of counter, gauge, histogram, summary",
                    type="string",
                    required=False,
                ),
                "name_filter": ToolParameter(
                    description="Only the metrics partially or fully matching this name will be returned",
                    type="string",
                    required=True,
                ),
            },
            toolset=toolset,
        )
        # Lazily created TTL cache for metric/label lookups (see _invoke).
        self._cache = None

    def _invoke(self, params: Any) -> StructuredToolResult:
        """Fetch matching metrics and render them as a table, or return an ERROR result."""
        if not self.toolset.config or not self.toolset.config.prometheus_url:
            return StructuredToolResult(
                status=ToolResultStatus.ERROR,
                error="Prometheus is not configured. Prometheus URL is missing",
                params=params,
            )
        # Create the cache on first use; a duration of None disables caching.
        if not self._cache and self.toolset.config.metrics_labels_cache_duration_hrs:
            self._cache = TTLCache(
                self.toolset.config.metrics_labels_cache_duration_hrs * 3600  # type: ignore
            )
        try:
            prometheus_url = self.toolset.config.prometheus_url
            metrics_labels_time_window_hrs = (
                self.toolset.config.metrics_labels_time_window_hrs
            )

            name_filter = params.get("name_filter")
            if not name_filter:
                return StructuredToolResult(
                    status=ToolResultStatus.ERROR,
                    error="Error: cannot run tool 'list_available_metrics'. The param 'name_filter' is required but is missing.",
                    params=params,
                )

            metrics = fetch_metrics(
                prometheus_url=prometheus_url,
                cache=self._cache,
                metrics_labels_time_window_hrs=metrics_labels_time_window_hrs,
                metric_name=name_filter,
                should_fetch_labels_with_labels_api=self.toolset.config.fetch_labels_with_labels_api,
                should_fetch_metadata_with_series_api=self.toolset.config.fetch_metadata_with_series_api,
                headers=self.toolset.config.headers,
            )

            if params.get("type_filter"):
                metrics = filter_metrics_by_type(metrics, params.get("type_filter"))

            # Render a simple pipe-separated table, sorted by metric name.
            output = ["Metric | Description | Type | Labels"]
            output.append("-" * 100)

            for metric, info in sorted(metrics.items()):
                labels_str = (
                    ", ".join(sorted(info["labels"])) if info["labels"] else "none"
                )
                output.append(
                    f"{metric} | {info['description']} | {info['type']} | {labels_str}"
                )

            table_output = "\n".join(output)
            return StructuredToolResult(
                status=ToolResultStatus.SUCCESS,
                data=table_output,
                params=params,
            )

        except requests.Timeout:
            # FIX: logging.warn is a deprecated alias of logging.warning.
            logging.warning("Timeout while fetching prometheus metrics", exc_info=True)
            return StructuredToolResult(
                status=ToolResultStatus.ERROR,
                error="Request timed out while fetching metrics",
                params=params,
            )
        except RequestException as e:
            logging.warning("Failed to fetch prometheus metrics", exc_info=True)
            return StructuredToolResult(
                status=ToolResultStatus.ERROR,
                error=f"Network error while fetching metrics: {str(e)}",
                params=params,
            )
        except Exception as e:
            logging.warning("Failed to process prometheus metrics", exc_info=True)
            return StructuredToolResult(
                status=ToolResultStatus.ERROR,
                error=f"Unexpected error: {str(e)}",
                params=params,
            )

    def get_parameterized_one_liner(self, params) -> str:
        """Short human-readable description of this tool invocation."""
        return f'Search Available Prometheus Metrics: name_filter="{params.get("name_filter", "<no filter>")}", type_filter="{params.get("type_filter", "<no filter>")}"'
460
+
461
+
462
class ExecuteInstantQuery(BasePrometheusTool):
    """Tool that runs an instant PromQL query via /api/v1/query and returns
    the JSON-encoded outcome (status, optional data, and a random result key)."""

    def __init__(self, toolset: "PrometheusToolset"):
        super().__init__(
            name="execute_prometheus_instant_query",
            description="Execute an instant PromQL query",
            parameters={
                "query": ToolParameter(
                    description="The PromQL query",
                    type="string",
                    required=True,
                ),
                "description": ToolParameter(
                    description="Describes the query",
                    type="string",
                    required=True,
                ),
            },
            toolset=toolset,
        )

    def _invoke(self, params: Any) -> StructuredToolResult:
        """POST the query to Prometheus and translate the HTTP outcome into a
        StructuredToolResult; network/parsing failures become ERROR results."""
        if not self.toolset.config or not self.toolset.config.prometheus_url:
            return StructuredToolResult(
                status=ToolResultStatus.ERROR,
                error="Prometheus is not configured. Prometheus URL is missing",
                params=params,
            )
        try:
            query = params.get("query", "")
            description = params.get("description", "")

            url = urljoin(self.toolset.config.prometheus_url, "api/v1/query")

            payload = {"query": query}

            response = requests.post(
                url=url, headers=self.toolset.config.headers, data=payload, timeout=60
            )

            if response.status_code == 200:
                data = response.json()
                status = data.get("status")
                error_message = None
                # A syntactically valid query with zero series is surfaced as a
                # "Failed" status so the caller can tell the query matched nothing.
                if status == "success" and not result_has_data(data):
                    status = "Failed"
                    error_message = (
                        "The prometheus query returned no result. Is the query correct?"
                    )
                response_data = {
                    "status": status,
                    "error_message": error_message,
                    "random_key": generate_random_key(),
                    "tool_name": self.name,
                    "description": description,
                    "query": query,
                }

                # Raw result payload is included only when configured to do so.
                if self.toolset.config.tool_calls_return_data:
                    response_data["data"] = data.get("data")

                data_str = json.dumps(response_data, indent=2)
                # NOTE: the tool result is SUCCESS even when the query itself
                # "Failed" (no data) — the embedded status carries that detail.
                return StructuredToolResult(
                    status=ToolResultStatus.SUCCESS,
                    data=data_str,
                    params=params,
                )

            # Handle known Prometheus error status codes
            error_msg = "Unknown error occurred"
            if response.status_code in [400, 429]:
                try:
                    error_data = response.json()
                    error_msg = error_data.get(
                        "error", error_data.get("message", str(response.content))
                    )
                except json.JSONDecodeError:
                    pass
                return StructuredToolResult(
                    status=ToolResultStatus.ERROR,
                    error=f"Query execution failed. HTTP {response.status_code}: {error_msg}",
                    params=params,
                )

            # For other status codes, just return the status code and content
            return StructuredToolResult(
                status=ToolResultStatus.ERROR,
                error=f"Query execution failed with unexpected status code: {response.status_code}. Response: {str(response.content)}",
                params=params,
            )

        except RequestException as e:
            logging.info("Failed to connect to Prometheus", exc_info=True)
            return StructuredToolResult(
                status=ToolResultStatus.ERROR,
                error=f"Connection error to Prometheus: {str(e)}",
                params=params,
            )
        except Exception as e:
            logging.info("Failed to connect to Prometheus", exc_info=True)
            return StructuredToolResult(
                status=ToolResultStatus.ERROR,
                error=f"Unexpected error executing query: {str(e)}",
                params=params,
            )

    def get_parameterized_one_liner(self, params) -> str:
        """Short human-readable description of this tool invocation."""
        query = params.get("query")
        description = params.get("description")
        return f"Execute Prometheus Query (instant): promql='{query}', description='{description}'"
571
+
572
+
573
class ExecuteRangeQuery(BasePrometheusTool):
    """Tool that executes a PromQL range query (``api/v1/query_range``) against
    the configured Prometheus instance.

    On success the result is returned as a JSON string that embeds a
    ``random_key`` so downstream rendering code can correlate the result with
    a generated graph.
    """

    def __init__(self, toolset: "PrometheusToolset"):
        super().__init__(
            name="execute_prometheus_range_query",
            description="Generates a graph and Execute a PromQL range query",
            parameters={
                "query": ToolParameter(
                    description="The PromQL query",
                    type="string",
                    required=True,
                ),
                "description": ToolParameter(
                    description="Describes the query",
                    type="string",
                    required=True,
                ),
                "start": ToolParameter(
                    description=standard_start_datetime_tool_param_description(
                        DEFAULT_TIME_SPAN_SECONDS
                    ),
                    type="string",
                    required=False,
                ),
                "end": ToolParameter(
                    description=STANDARD_END_DATETIME_TOOL_PARAM_DESCRIPTION,
                    type="string",
                    required=False,
                ),
                "step": ToolParameter(
                    description="Query resolution step width in duration format or float number of seconds",
                    type="number",
                    required=True,
                ),
                "output_type": ToolParameter(
                    description="Specifies how to interpret the Prometheus result. Use 'Plain' for raw values, 'Bytes' to format byte values, 'Percentage' to scale 0–1 values into 0–100%, or 'CPUUsage' to convert values to cores (e.g., 500 becomes 500m, 2000 becomes 2).",
                    type="string",
                    required=True,
                ),
            },
            toolset=toolset,
        )

    def _invoke(self, params: Any) -> StructuredToolResult:
        """Execute the range query described by ``params``.

        Returns:
            StructuredToolResult: SUCCESS with a JSON-string payload, or ERROR
            describing what went wrong (missing configuration, HTTP failure,
            connection failure, or an empty result set).
        """
        if not self.toolset.config or not self.toolset.config.prometheus_url:
            return StructuredToolResult(
                status=ToolResultStatus.ERROR,
                error="Prometheus is not configured. Prometheus URL is missing",
                params=params,
            )

        try:
            url = urljoin(self.toolset.config.prometheus_url, "api/v1/query_range")

            query = get_param_or_raise(params, "query")
            # Normalize optional start/end into RFC3339 timestamps, falling
            # back to the standard default time span when omitted.
            (start, end) = process_timestamps_to_rfc3339(
                start_timestamp=params.get("start"),
                end_timestamp=params.get("end"),
                default_time_span_seconds=DEFAULT_TIME_SPAN_SECONDS,
            )
            step = params.get("step", "")
            description = params.get("description", "")
            output_type = params.get("output_type", "Plain")
            payload = {
                "query": query,
                "start": start,
                "end": end,
                "step": step,
            }

            response = requests.post(
                url=url, headers=self.toolset.config.headers, data=payload, timeout=120
            )

            if response.status_code == 200:
                data = response.json()
                status = data.get("status")
                error_message = None
                # Prometheus reports "success" even when the result set is
                # empty; surface that as a failure so the caller (the LLM)
                # can correct the query.
                if status == "success" and not result_has_data(data):
                    status = "Failed"
                    error_message = (
                        "The prometheus query returned no result. Is the query correct?"
                    )
                response_data = {
                    "status": status,
                    "error_message": error_message,
                    "random_key": generate_random_key(),
                    "tool_name": self.name,
                    "description": description,
                    "query": query,
                    "start": start,
                    "end": end,
                    "step": step,
                    "output_type": output_type,
                }

                # Raw series data is only embedded when explicitly enabled,
                # to keep tool output compact.
                if self.toolset.config.tool_calls_return_data:
                    response_data["data"] = data.get("data")
                data_str = json.dumps(response_data, indent=2)
                return StructuredToolResult(
                    status=ToolResultStatus.SUCCESS,
                    data=data_str,
                    params=params,
                )

            # 400/429 responses usually carry a JSON body with a useful
            # "error"/"message" field; try to extract it for the caller.
            error_msg = "Unknown error occurred"
            if response.status_code in [400, 429]:
                try:
                    error_data = response.json()
                    error_msg = error_data.get(
                        "error", error_data.get("message", str(response.content))
                    )
                except json.JSONDecodeError:
                    pass
                return StructuredToolResult(
                    status=ToolResultStatus.ERROR,
                    error=f"Query execution failed. HTTP {response.status_code}: {error_msg}",
                    params=params,
                )

            return StructuredToolResult(
                status=ToolResultStatus.ERROR,
                error=f"Query execution failed with unexpected status code: {response.status_code}. Response: {str(response.content)}",
                params=params,
            )

        except RequestException as e:
            logging.info("Failed to connect to Prometheus", exc_info=True)
            return StructuredToolResult(
                status=ToolResultStatus.ERROR,
                error=f"Connection error to Prometheus: {str(e)}",
                params=params,
            )
        except Exception as e:
            # Fix: this handler also catches non-connection failures (bad
            # params, JSON decode errors), so don't log it as a connection
            # error — that was misleading in the original.
            logging.info(
                "Unexpected error executing Prometheus range query", exc_info=True
            )
            return StructuredToolResult(
                status=ToolResultStatus.ERROR,
                error=f"Unexpected error executing query: {str(e)}",
                params=params,
            )

    def get_parameterized_one_liner(self, params) -> str:
        """Return a one-line, human-readable summary of the query invocation."""
        query = params.get("query")
        start = params.get("start")
        end = params.get("end")
        step = params.get("step")
        description = params.get("description")
        return f"Execute Prometheus Query (range): promql='{query}', start={start}, end={end}, step={step}, description='{description}'"
720
+
721
+
722
class PrometheusToolset(Toolset):
    """Toolset exposing Prometheus metadata and PromQL query tools.

    Configuration comes from the user-supplied config dict when present;
    otherwise from the ``PROMETHEUS_URL`` environment variable or in-cluster
    auto-discovery. A health check against the configured endpoint gates
    toolset activation.
    """

    def __init__(self):
        super().__init__(
            name="prometheus/metrics",
            description="Prometheus integration to fetch metadata and execute PromQL queries",
            docs_url="https://docs.robusta.dev/master/configuration/holmesgpt/toolsets/prometheus.html",
            icon_url="https://upload.wikimedia.org/wikipedia/commons/3/38/Prometheus_software_logo.svg",
            prerequisites=[CallablePrerequisite(callable=self.prerequisites_callable)],
            tools=[
                ListPrometheusRules(toolset=self),
                ListAvailableMetrics(toolset=self),
                ExecuteInstantQuery(toolset=self),
                ExecuteRangeQuery(toolset=self),
            ],
            tags=[
                ToolsetTag.CORE,
            ],
        )
        self._reload_llm_instructions()

    def _reload_llm_instructions(self):
        """(Re)load the LLM instructions from the bundled jinja2 template."""
        template_file_path = os.path.abspath(
            os.path.join(os.path.dirname(__file__), "prometheus_instructions.jinja2")
        )
        self._load_llm_instructions(jinja_template=f"file://{template_file_path}")

    def prerequisites_callable(self, config: dict[str, Any]) -> Tuple[bool, str]:
        """Validate or derive the Prometheus configuration, then health-check it.

        Args:
            config: user-provided toolset configuration; may be empty/None.

        Returns:
            (ok, message) tuple as expected by CallablePrerequisite.
        """
        if config:
            self.config = PrometheusConfig(**config)
            self._reload_llm_instructions()
            return self._is_healthy()

        # No explicit config: fall back to env var, then auto-discovery.
        prometheus_url = os.environ.get("PROMETHEUS_URL")
        if not prometheus_url:
            prometheus_url = self.auto_detect_prometheus_url()
        if not prometheus_url:
            return (
                False,
                "Unable to auto-detect prometheus. Define prometheus_url in the configuration for tool prometheus/metrics",
            )

        self.config = PrometheusConfig(
            prometheus_url=prometheus_url,
            headers=add_prometheus_auth(os.environ.get("PROMETHEUS_AUTH_HEADER")),
        )
        logging.info(f"Prometheus auto discovered at url {prometheus_url}")
        self._reload_llm_instructions()
        return self._is_healthy()

    def auto_detect_prometheus_url(self) -> Optional[str]:
        """Try to discover a Prometheus URL in-cluster, falling back to
        VictoriaMetrics discovery; returns None when neither is found."""
        url: Optional[str] = PrometheusDiscovery.find_prometheus_url()
        if not url:
            url = PrometheusDiscovery.find_vm_url()

        return url

    def _is_healthy(self) -> Tuple[bool, str]:
        """Hit the configured healthcheck endpoint and report (ok, message)."""
        if (
            not hasattr(self, "config")
            or not self.config
            or not self.config.prometheus_url
        ):
            return (
                False,
                f"Toolset {self.name} failed to initialize because prometheus is not configured correctly",
            )

        url = urljoin(self.config.prometheus_url, self.config.healthcheck)
        try:
            response = requests.get(
                url=url, headers=self.config.headers, timeout=10, verify=True
            )

            if response.status_code == 200:
                return True, ""
            else:
                return (
                    False,
                    f"Failed to connect to Prometheus at {url}: HTTP {response.status_code}",
                )

        except RequestException as e:
            # Fix: include the underlying exception so connection failures are
            # diagnosable — the original silently dropped the root cause here.
            return (
                False,
                f"Failed to initialize using url={url}: {str(e)}",
            )
        except Exception as e:
            return (
                False,
                f"Failed to initialize using url={url}. Unexpected error: {str(e)}",
            )

    def get_example_config(self) -> dict:
        """Return an example configuration dict for documentation purposes."""
        example_config = PrometheusConfig(
            prometheus_url="http://robusta-kube-prometheus-st-prometheus:9090"
        )
        return example_config.model_dump()