holmesgpt 0.11.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of holmesgpt might be problematic. Click here for more details.

Files changed (183) hide show
  1. holmes/.git_archival.json +7 -0
  2. holmes/__init__.py +76 -0
  3. holmes/__init__.py.bak +76 -0
  4. holmes/clients/robusta_client.py +24 -0
  5. holmes/common/env_vars.py +47 -0
  6. holmes/config.py +526 -0
  7. holmes/core/__init__.py +0 -0
  8. holmes/core/conversations.py +578 -0
  9. holmes/core/investigation.py +152 -0
  10. holmes/core/investigation_structured_output.py +264 -0
  11. holmes/core/issue.py +54 -0
  12. holmes/core/llm.py +250 -0
  13. holmes/core/models.py +157 -0
  14. holmes/core/openai_formatting.py +51 -0
  15. holmes/core/performance_timing.py +72 -0
  16. holmes/core/prompt.py +42 -0
  17. holmes/core/resource_instruction.py +17 -0
  18. holmes/core/runbooks.py +26 -0
  19. holmes/core/safeguards.py +120 -0
  20. holmes/core/supabase_dal.py +540 -0
  21. holmes/core/tool_calling_llm.py +798 -0
  22. holmes/core/tools.py +566 -0
  23. holmes/core/tools_utils/__init__.py +0 -0
  24. holmes/core/tools_utils/tool_executor.py +65 -0
  25. holmes/core/tools_utils/toolset_utils.py +52 -0
  26. holmes/core/toolset_manager.py +418 -0
  27. holmes/interactive.py +229 -0
  28. holmes/main.py +1041 -0
  29. holmes/plugins/__init__.py +0 -0
  30. holmes/plugins/destinations/__init__.py +6 -0
  31. holmes/plugins/destinations/slack/__init__.py +2 -0
  32. holmes/plugins/destinations/slack/plugin.py +163 -0
  33. holmes/plugins/interfaces.py +32 -0
  34. holmes/plugins/prompts/__init__.py +48 -0
  35. holmes/plugins/prompts/_current_date_time.jinja2 +1 -0
  36. holmes/plugins/prompts/_default_log_prompt.jinja2 +11 -0
  37. holmes/plugins/prompts/_fetch_logs.jinja2 +36 -0
  38. holmes/plugins/prompts/_general_instructions.jinja2 +86 -0
  39. holmes/plugins/prompts/_global_instructions.jinja2 +12 -0
  40. holmes/plugins/prompts/_runbook_instructions.jinja2 +13 -0
  41. holmes/plugins/prompts/_toolsets_instructions.jinja2 +56 -0
  42. holmes/plugins/prompts/generic_ask.jinja2 +36 -0
  43. holmes/plugins/prompts/generic_ask_conversation.jinja2 +32 -0
  44. holmes/plugins/prompts/generic_ask_for_issue_conversation.jinja2 +50 -0
  45. holmes/plugins/prompts/generic_investigation.jinja2 +42 -0
  46. holmes/plugins/prompts/generic_post_processing.jinja2 +13 -0
  47. holmes/plugins/prompts/generic_ticket.jinja2 +12 -0
  48. holmes/plugins/prompts/investigation_output_format.jinja2 +32 -0
  49. holmes/plugins/prompts/kubernetes_workload_ask.jinja2 +84 -0
  50. holmes/plugins/prompts/kubernetes_workload_chat.jinja2 +39 -0
  51. holmes/plugins/runbooks/README.md +22 -0
  52. holmes/plugins/runbooks/__init__.py +100 -0
  53. holmes/plugins/runbooks/catalog.json +14 -0
  54. holmes/plugins/runbooks/jira.yaml +12 -0
  55. holmes/plugins/runbooks/kube-prometheus-stack.yaml +10 -0
  56. holmes/plugins/runbooks/networking/dns_troubleshooting_instructions.md +66 -0
  57. holmes/plugins/runbooks/upgrade/upgrade_troubleshooting_instructions.md +44 -0
  58. holmes/plugins/sources/github/__init__.py +77 -0
  59. holmes/plugins/sources/jira/__init__.py +123 -0
  60. holmes/plugins/sources/opsgenie/__init__.py +93 -0
  61. holmes/plugins/sources/pagerduty/__init__.py +147 -0
  62. holmes/plugins/sources/prometheus/__init__.py +0 -0
  63. holmes/plugins/sources/prometheus/models.py +104 -0
  64. holmes/plugins/sources/prometheus/plugin.py +154 -0
  65. holmes/plugins/toolsets/__init__.py +171 -0
  66. holmes/plugins/toolsets/aks-node-health.yaml +65 -0
  67. holmes/plugins/toolsets/aks.yaml +86 -0
  68. holmes/plugins/toolsets/argocd.yaml +70 -0
  69. holmes/plugins/toolsets/atlas_mongodb/instructions.jinja2 +8 -0
  70. holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +307 -0
  71. holmes/plugins/toolsets/aws.yaml +76 -0
  72. holmes/plugins/toolsets/azure_sql/__init__.py +0 -0
  73. holmes/plugins/toolsets/azure_sql/apis/alert_monitoring_api.py +600 -0
  74. holmes/plugins/toolsets/azure_sql/apis/azure_sql_api.py +309 -0
  75. holmes/plugins/toolsets/azure_sql/apis/connection_failure_api.py +445 -0
  76. holmes/plugins/toolsets/azure_sql/apis/connection_monitoring_api.py +251 -0
  77. holmes/plugins/toolsets/azure_sql/apis/storage_analysis_api.py +317 -0
  78. holmes/plugins/toolsets/azure_sql/azure_base_toolset.py +55 -0
  79. holmes/plugins/toolsets/azure_sql/azure_sql_instructions.jinja2 +137 -0
  80. holmes/plugins/toolsets/azure_sql/azure_sql_toolset.py +183 -0
  81. holmes/plugins/toolsets/azure_sql/install.md +66 -0
  82. holmes/plugins/toolsets/azure_sql/tools/__init__.py +1 -0
  83. holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +324 -0
  84. holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +243 -0
  85. holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +205 -0
  86. holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +249 -0
  87. holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +373 -0
  88. holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +237 -0
  89. holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +172 -0
  90. holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +170 -0
  91. holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +188 -0
  92. holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +180 -0
  93. holmes/plugins/toolsets/azure_sql/utils.py +83 -0
  94. holmes/plugins/toolsets/bash/__init__.py +0 -0
  95. holmes/plugins/toolsets/bash/bash_instructions.jinja2 +14 -0
  96. holmes/plugins/toolsets/bash/bash_toolset.py +208 -0
  97. holmes/plugins/toolsets/bash/common/bash.py +52 -0
  98. holmes/plugins/toolsets/bash/common/config.py +14 -0
  99. holmes/plugins/toolsets/bash/common/stringify.py +25 -0
  100. holmes/plugins/toolsets/bash/common/validators.py +24 -0
  101. holmes/plugins/toolsets/bash/grep/__init__.py +52 -0
  102. holmes/plugins/toolsets/bash/kubectl/__init__.py +100 -0
  103. holmes/plugins/toolsets/bash/kubectl/constants.py +96 -0
  104. holmes/plugins/toolsets/bash/kubectl/kubectl_describe.py +66 -0
  105. holmes/plugins/toolsets/bash/kubectl/kubectl_events.py +88 -0
  106. holmes/plugins/toolsets/bash/kubectl/kubectl_get.py +108 -0
  107. holmes/plugins/toolsets/bash/kubectl/kubectl_logs.py +20 -0
  108. holmes/plugins/toolsets/bash/kubectl/kubectl_run.py +46 -0
  109. holmes/plugins/toolsets/bash/kubectl/kubectl_top.py +81 -0
  110. holmes/plugins/toolsets/bash/parse_command.py +103 -0
  111. holmes/plugins/toolsets/confluence.yaml +19 -0
  112. holmes/plugins/toolsets/consts.py +5 -0
  113. holmes/plugins/toolsets/coralogix/api.py +158 -0
  114. holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +103 -0
  115. holmes/plugins/toolsets/coralogix/utils.py +181 -0
  116. holmes/plugins/toolsets/datadog.py +153 -0
  117. holmes/plugins/toolsets/docker.yaml +46 -0
  118. holmes/plugins/toolsets/git.py +756 -0
  119. holmes/plugins/toolsets/grafana/__init__.py +0 -0
  120. holmes/plugins/toolsets/grafana/base_grafana_toolset.py +54 -0
  121. holmes/plugins/toolsets/grafana/common.py +68 -0
  122. holmes/plugins/toolsets/grafana/grafana_api.py +31 -0
  123. holmes/plugins/toolsets/grafana/loki_api.py +89 -0
  124. holmes/plugins/toolsets/grafana/tempo_api.py +124 -0
  125. holmes/plugins/toolsets/grafana/toolset_grafana.py +102 -0
  126. holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +102 -0
  127. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.jinja2 +10 -0
  128. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +299 -0
  129. holmes/plugins/toolsets/grafana/trace_parser.py +195 -0
  130. holmes/plugins/toolsets/helm.yaml +42 -0
  131. holmes/plugins/toolsets/internet/internet.py +275 -0
  132. holmes/plugins/toolsets/internet/notion.py +137 -0
  133. holmes/plugins/toolsets/kafka.py +638 -0
  134. holmes/plugins/toolsets/kubernetes.yaml +255 -0
  135. holmes/plugins/toolsets/kubernetes_logs.py +426 -0
  136. holmes/plugins/toolsets/kubernetes_logs.yaml +42 -0
  137. holmes/plugins/toolsets/logging_utils/__init__.py +0 -0
  138. holmes/plugins/toolsets/logging_utils/logging_api.py +217 -0
  139. holmes/plugins/toolsets/logging_utils/types.py +0 -0
  140. holmes/plugins/toolsets/mcp/toolset_mcp.py +135 -0
  141. holmes/plugins/toolsets/newrelic.py +222 -0
  142. holmes/plugins/toolsets/opensearch/__init__.py +0 -0
  143. holmes/plugins/toolsets/opensearch/opensearch.py +245 -0
  144. holmes/plugins/toolsets/opensearch/opensearch_logs.py +151 -0
  145. holmes/plugins/toolsets/opensearch/opensearch_traces.py +211 -0
  146. holmes/plugins/toolsets/opensearch/opensearch_traces_instructions.jinja2 +12 -0
  147. holmes/plugins/toolsets/opensearch/opensearch_utils.py +166 -0
  148. holmes/plugins/toolsets/prometheus/prometheus.py +818 -0
  149. holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2 +38 -0
  150. holmes/plugins/toolsets/rabbitmq/api.py +398 -0
  151. holmes/plugins/toolsets/rabbitmq/rabbitmq_instructions.jinja2 +37 -0
  152. holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +222 -0
  153. holmes/plugins/toolsets/robusta/__init__.py +0 -0
  154. holmes/plugins/toolsets/robusta/robusta.py +235 -0
  155. holmes/plugins/toolsets/robusta/robusta_instructions.jinja2 +24 -0
  156. holmes/plugins/toolsets/runbook/__init__.py +0 -0
  157. holmes/plugins/toolsets/runbook/runbook_fetcher.py +78 -0
  158. holmes/plugins/toolsets/service_discovery.py +92 -0
  159. holmes/plugins/toolsets/servicenow/install.md +37 -0
  160. holmes/plugins/toolsets/servicenow/instructions.jinja2 +3 -0
  161. holmes/plugins/toolsets/servicenow/servicenow.py +198 -0
  162. holmes/plugins/toolsets/slab.yaml +20 -0
  163. holmes/plugins/toolsets/utils.py +137 -0
  164. holmes/plugins/utils.py +14 -0
  165. holmes/utils/__init__.py +0 -0
  166. holmes/utils/cache.py +84 -0
  167. holmes/utils/cert_utils.py +40 -0
  168. holmes/utils/default_toolset_installation_guide.jinja2 +44 -0
  169. holmes/utils/definitions.py +13 -0
  170. holmes/utils/env.py +53 -0
  171. holmes/utils/file_utils.py +56 -0
  172. holmes/utils/global_instructions.py +20 -0
  173. holmes/utils/holmes_status.py +22 -0
  174. holmes/utils/holmes_sync_toolsets.py +80 -0
  175. holmes/utils/markdown_utils.py +55 -0
  176. holmes/utils/pydantic_utils.py +54 -0
  177. holmes/utils/robusta.py +10 -0
  178. holmes/utils/tags.py +97 -0
  179. holmesgpt-0.11.5.dist-info/LICENSE.txt +21 -0
  180. holmesgpt-0.11.5.dist-info/METADATA +400 -0
  181. holmesgpt-0.11.5.dist-info/RECORD +183 -0
  182. holmesgpt-0.11.5.dist-info/WHEEL +4 -0
  183. holmesgpt-0.11.5.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,222 @@
1
+ import os
2
+ import logging
3
+ from typing import Any, List, Optional, Tuple
4
+
5
+ from pydantic import BaseModel
6
+ from holmes.core.tools import (
7
+ CallablePrerequisite,
8
+ StructuredToolResult,
9
+ Tool,
10
+ ToolParameter,
11
+ ToolResultStatus,
12
+ Toolset,
13
+ ToolsetTag,
14
+ )
15
+ from requests import RequestException # type: ignore
16
+ from urllib.parse import urljoin
17
+
18
+ from holmes.plugins.toolsets.rabbitmq.api import (
19
+ ClusterConnectionStatus,
20
+ RabbitMQClusterConfig,
21
+ get_cluster_status,
22
+ make_request,
23
+ )
24
+
25
+
26
class RabbitMQConfig(BaseModel):
    """Top-level RabbitMQ toolset configuration: one entry per cluster to monitor."""

    # Each cluster carries its own management URL and credentials.
    clusters: List[RabbitMQClusterConfig]
28
+
29
+
30
class BaseRabbitMQTool(Tool):
    """Base class for RabbitMQ tools; resolves which configured cluster a call targets."""

    toolset: "RabbitMQToolset"

    def _get_cluster_config(self, cluster_id: Optional[str]) -> RabbitMQClusterConfig:
        """Return the cluster config matching ``cluster_id``.

        ``cluster_id`` may be omitted when exactly one cluster is configured.

        Raises:
            ValueError: if the toolset has no config, if ``cluster_id`` is
                missing while several clusters are configured, or if no
                configured cluster matches ``cluster_id``.
        """
        if not self.toolset.config:
            raise ValueError("RabbitMQ is not configured.")
        cluster_ids = [c.id for c in self.toolset.config.clusters]
        if not cluster_id:
            if len(cluster_ids) == 1:
                # cluster id is optional if there is only one configured
                return self.toolset.config.clusters[0]
            if cluster_ids:
                # Bug fix: clusters ARE configured on this branch -- the caller
                # simply did not say which one. The previous message wrongly
                # claimed "No cluster is configured".
                raise ValueError(
                    f"Missing cluster_id: several clusters are configured. Possible cluster_id values are: {', '.join(cluster_ids)}"
                )
            raise ValueError("No cluster is configured")

        for cluster in self.toolset.config.clusters:
            if cluster.id == cluster_id:
                return cluster

        raise ValueError(
            f"Failed to find cluster_id={cluster_id} amongst configured clusters. Possible cluster_id values are: {', '.join(cluster_ids)}"
        )
54
+
55
+
56
class ListConfiguredClusters(BaseRabbitMQTool):
    """Tool exposing the ids of the reachable, configured RabbitMQ clusters."""

    def __init__(self, toolset: "RabbitMQToolset"):
        super().__init__(
            name="list_configured_clusters",
            description="List all configured clusters. Useful to get the id of a configured cluster (cluster_id) and pass as argument to other rabbitmq tool calls.",
            parameters={},
            toolset=toolset,
        )

    def _invoke(self, params: Any) -> StructuredToolResult:
        if not self.toolset.config:
            raise ValueError("RabbitMQ is not configured.")

        # Only advertise clusters whose health check succeeded.
        reachable = []
        for cluster in self.toolset.config.clusters:
            if cluster.connection_status != ClusterConnectionStatus.SUCCESS:
                continue
            reachable.append(
                {
                    "cluster_id": cluster.id,
                    "management_url": cluster.management_url,
                    "connection_status": cluster.connection_status,
                }
            )
        return StructuredToolResult(status=ToolResultStatus.SUCCESS, data=reachable)

    def get_parameterized_one_liner(self, params) -> str:
        return "list configured RabbitMQ clusters"
84
+
85
+
86
class GetRabbitMQClusterStatus(BaseRabbitMQTool):
    """Tool that reports cluster-wide status: nodes, listeners and partitions."""

    def __init__(self, toolset: "RabbitMQToolset"):
        super().__init__(
            name="get_rabbitmq_cluster_status",
            description="Fetches the overall status of the RabbitMQ cluster, including node information, listeners, and partition details. Crucial for detecting split-brain scenarios",
            parameters={
                "cluster_id": ToolParameter(
                    description="The id of the cluster obtained with list_configured_clusters. Only required if more than one rabbitmq cluster is configured.",
                    type="string",
                    required=False,
                ),
            },
            toolset=toolset,
        )

    def _invoke(self, params: Any) -> StructuredToolResult:
        try:
            # Resolve the target cluster, then fetch node/partition details.
            target = self._get_cluster_config(cluster_id=params.get("cluster_id"))
            status = get_cluster_status(target)
        except Exception as e:
            logging.info("Failed to process RabbitMQ cluster status", exc_info=True)
            return StructuredToolResult(
                status=ToolResultStatus.ERROR,
                error=f"Unexpected error fetching RabbitMQ cluster status: {str(e)}",
                data=None,
            )
        return StructuredToolResult(status=ToolResultStatus.SUCCESS, data=status)

    def get_parameterized_one_liner(self, params) -> str:
        return "get RabbitMQ cluster status and partition information"
120
+
121
+
122
class RabbitMQToolset(Toolset):
    """Toolset for diagnosing RabbitMQ cluster health, node status and
    network partitions (split-brain) via the RabbitMQ Management API."""

    def __init__(self):
        super().__init__(
            name="rabbitmq/core",
            description="Provides tools to interact with RabbitMQ to diagnose cluster health, node status, and specifically network partitions (split-brain).",
            docs_url="https://docs.robusta.dev/master/configuration/holmesgpt/toolsets/rabbitmq.html",
            icon_url="https://cdn.worldvectorlogo.com/logos/rabbitmq.svg",
            prerequisites=[CallablePrerequisite(callable=self.prerequisites_callable)],
            tools=[
                ListConfiguredClusters(toolset=self),
                GetRabbitMQClusterStatus(toolset=self),
            ],
            tags=[ToolsetTag.CORE],
        )
        self._reload_llm_instructions()

    def _reload_llm_instructions(self):
        # Load instructions from the jinja2 template file that sits next to
        # this module; the loader expects a file:// URI.
        template_file_path = os.path.abspath(
            os.path.join(os.path.dirname(__file__), "rabbitmq_instructions.jinja2")
        )
        self._load_llm_instructions(jinja_template=f"file://{template_file_path}")

    def prerequisites_callable(self, config: dict[str, Any]) -> Tuple[bool, str]:
        """Parse/validate the toolset config and health-check every cluster.

        Returns (ok, message); message holds the failure reason(s) when not ok.
        """
        if not config or not config.get("clusters"):
            # Attempt to load a single-cluster config from environment
            # variables as fallback when nothing is configured explicitly.
            env_url = os.environ.get("RABBITMQ_MANAGEMENT_URL")
            env_user = os.environ.get("RABBITMQ_USERNAME", "guest")
            env_pass = os.environ.get("RABBITMQ_PASSWORD", "guest")
            if not env_url:
                return (
                    False,
                    "RabbitMQ toolset is misconfigured. 'management_url' is required.",
                )
            config = {
                "clusters": [
                    {
                        "id": "rabbitmq",
                        "management_url": env_url,
                        "username": env_user,
                        "password": env_pass,
                    }
                ]
            }
            logging.info("Loaded RabbitMQ config from environment variables.")

        try:
            self.config = RabbitMQConfig(**config)
        except Exception as e:
            return (False, f"Failed to parse RabbitMQ configuration: {str(e)}")

        return self._check_clusters_config(self.config)

    def _check_clusters_config(self, config: RabbitMQConfig) -> Tuple[bool, str]:
        """Probe each cluster's management API and record its connection status.

        Side effect: mutates each cluster's ``connection_status`` and reloads
        the LLM instructions when at least one cluster is reachable.
        """
        errors = []
        for cluster_config in config.clusters:
            url = urljoin(cluster_config.management_url, "api/overview")

            try:
                data = make_request(
                    config=cluster_config,
                    method="GET",
                    url=url,
                )

                if data:
                    cluster_config.connection_status = ClusterConnectionStatus.SUCCESS
                    self._reload_llm_instructions()
                else:
                    error_message = f"Failed to connect to RabbitMQ Management API for cluster with id={cluster_config.id} at {url}. No data returned"
                    cluster_config.connection_status = ClusterConnectionStatus.ERROR
                    errors.append(error_message)

            except RequestException as e:
                # Network-level failure (DNS, refused connection, timeout...).
                error_message = f"Toolset failed health check for cluster with id={cluster_config.id} at {url} due to a failed http request. Connection error: {str(e)}"
                cluster_config.connection_status = ClusterConnectionStatus.ERROR
                errors.append(error_message)
            except Exception as e:
                error_message = f"Toolset failed health check for cluster with id={cluster_config.id} at {url}: {str(e)}"
                cluster_config.connection_status = ClusterConnectionStatus.ERROR
                errors.append(error_message)

        if errors:
            if len(errors) == 1:
                return (False, errors[0])
            else:
                # Aggregate multiple failures as a bulleted list.
                return (False, "\n".join([f"- {error}" for error in errors]))
        else:
            return (True, "")

    def get_example_config(self):
        # Sample config rendered in docs / setup guides.
        example_config = RabbitMQConfig(
            clusters=[
                RabbitMQClusterConfig(
                    management_url="http://<your-rabbitmq-server-or-service>:15672",
                    username="holmes_user",
                    password="holmes_password",
                )
            ]
        )
        return example_config.model_dump()
File without changes
@@ -0,0 +1,235 @@
1
+ import os
2
+
3
+ import logging
4
+
5
+ from typing import Optional, Dict, Any, List
6
+ from holmes.core.supabase_dal import SupabaseDal
7
+ from holmes.core.tools import (
8
+ StaticPrerequisite,
9
+ Tool,
10
+ ToolParameter,
11
+ Toolset,
12
+ ToolsetTag,
13
+ )
14
+ from holmes.core.tools import StructuredToolResult, ToolResultStatus
15
+
16
+ PARAM_FINDING_ID = "id"
17
+ START_TIME = "start_datetime"
18
+ END_TIME = "end_datetime"
19
+ NAMESPACE = "namespace"
20
+ WORKLOAD = "workload"
21
+
22
+
23
class FetchRobustaFinding(Tool):
    """Fetch a single Robusta finding (e.g. a Prometheus alert or a deployment
    update event) by its id through the Supabase data access layer."""

    _dal: Optional[SupabaseDal]

    def __init__(self, dal: Optional[SupabaseDal]):
        super().__init__(
            name="fetch_finding_by_id",
            description="Fetches a robusta finding. Findings are events, like a Prometheus alert or a deployment update",
            parameters={
                PARAM_FINDING_ID: ToolParameter(
                    description="The id of the finding to fetch",
                    type="string",
                    required=True,
                )
            },
        )
        self._dal = dal

    def _fetch_finding(self, finding_id: str) -> Optional[Dict]:
        """Return the finding payload, or an error dict when the DAL is disabled."""
        if self._dal and self._dal.enabled:
            return self._dal.get_issue_data(finding_id)
        error = f"Failed to find a finding with finding_id={finding_id}: Holmes' data access layer is not enabled."
        logging.error(error)
        return {"error": error}

    def _invoke(self, params: Dict) -> StructuredToolResult:
        finding_id = params[PARAM_FINDING_ID]
        try:
            finding = self._fetch_finding(finding_id)
            if finding:
                return StructuredToolResult(
                    status=ToolResultStatus.SUCCESS,
                    data=finding,
                    params=params,
                )
            return StructuredToolResult(
                status=ToolResultStatus.NO_DATA,
                data=f"Could not find a finding with finding_id={finding_id}",
                params=params,
            )
        except Exception as e:
            # Single structured log with traceback, replacing the previous
            # duplicated logging.error(e) + logging.error(f"...") pair.
            logging.exception(
                f"There was an internal error while fetching finding {finding_id}. {str(e)}"
            )
            return StructuredToolResult(
                status=ToolResultStatus.ERROR,
                data=f"There was an internal error while fetching finding {finding_id}",
                params=params,
            )

    def get_parameterized_one_liner(self, params: Dict) -> str:
        return "Fetch Alert Metadata"
78
+
79
+
80
class FetchResourceRecommendation(Tool):
    """Fetch right-sizing recommendations (cpu/memory requests and limits)
    for a kubernetes workload, based on historical usage."""

    _dal: Optional[SupabaseDal]

    def __init__(self, dal: Optional[SupabaseDal]):
        super().__init__(
            name="fetch_resource_recommendation",
            description="Fetch workload recommendations for resources requests and limits. Returns the current configured resources, as well as recommendation based on actual historical usage.",
            parameters={
                "name": ToolParameter(
                    description="The name of the kubernetes workload.",
                    type="string",
                    required=True,
                ),
                "namespace": ToolParameter(
                    description="The namespace of the kubernetes resource.",
                    type="string",
                    required=True,
                ),
                "kind": ToolParameter(
                    description="The kind of the kubernetes resource. Must be one of: [Deployment, StatefulSet, DaemonSet, Job].",
                    type="string",
                    required=True,
                ),
            },
        )
        self._dal = dal

    def _resource_recommendation(self, params: Dict) -> Optional[List[Dict]]:
        # The DAL must be present and enabled; otherwise behave as "no data".
        if not (self._dal and self._dal.enabled):
            return None
        return self._dal.get_resource_recommendation(
            name=params["name"],
            namespace=params["namespace"],
            kind=params["kind"],
        )

    def _invoke(self, params: Dict) -> StructuredToolResult:
        try:
            found = self._resource_recommendation(params)
        except Exception as e:
            msg = f"There was an internal error while fetching recommendations for {params}. {str(e)}"
            logging.exception(msg)
            return StructuredToolResult(
                status=ToolResultStatus.ERROR,
                data=msg,
                params=params,
            )

        if not found:
            return StructuredToolResult(
                status=ToolResultStatus.NO_DATA,
                data=f"Could not find recommendations for {params}",
                params=params,
            )
        return StructuredToolResult(
            status=ToolResultStatus.SUCCESS,
            data=found,
            params=params,
        )

    def get_parameterized_one_liner(self, params: Dict) -> str:
        return f"Check Historical Resource Utilization: ({str(params)})"
142
+
143
+
144
class FetchConfigurationChanges(Tool):
    """Fetch kubernetes configuration changes recorded by Robusta within a
    time range."""

    _dal: Optional[SupabaseDal]

    def __init__(self, dal: Optional[SupabaseDal]):
        super().__init__(
            name="fetch_configuration_changes",
            description="Fetch configuration changes in a given time range. By default, fetch all cluster changes. Can be filtered on a given namespace or a specific workload",
            # NOTE(review): the description mentions namespace/workload
            # filtering, but only time parameters are declared and forwarded
            # to the DAL -- confirm whether NAMESPACE/WORKLOAD params should
            # be wired up here.
            parameters={
                START_TIME: ToolParameter(
                    description="The starting time boundary for the search period. String in RFC3339 format.",
                    type="string",
                    required=True,
                ),
                END_TIME: ToolParameter(
                    # Bug fix: this previously duplicated the START_TIME text
                    # ("starting time boundary"), misleading the LLM caller.
                    description="The ending time boundary for the search period. String in RFC3339 format.",
                    type="string",
                    required=True,
                ),
            },
        )
        self._dal = dal

    def _fetch_change_history(self, params: Dict) -> Optional[List[Dict]]:
        """Return the recorded changes, or None when the DAL is unavailable."""
        if self._dal and self._dal.enabled:
            return self._dal.get_configuration_changes(
                start_datetime=params["start_datetime"],
                end_datetime=params["end_datetime"],
            )
        return None

    def _invoke(self, params: Dict) -> StructuredToolResult:
        try:
            changes = self._fetch_change_history(params)
            if changes:
                return StructuredToolResult(
                    status=ToolResultStatus.SUCCESS,
                    data=changes,
                    params=params,
                )
            return StructuredToolResult(
                status=ToolResultStatus.NO_DATA,
                data=f"Could not find changes for {params}",
                params=params,
            )
        except Exception as e:
            msg = f"There was an internal error while fetching changes for {params}. {str(e)}"
            logging.exception(msg)
            return StructuredToolResult(
                status=ToolResultStatus.ERROR,
                data=msg,
                params=params,
            )

    def get_parameterized_one_liner(self, params: Dict) -> str:
        return f"Search Change History: ({str(params)})"
200
+
201
+
202
class RobustaToolset(Toolset):
    """Toolset exposing Robusta SaaS data: findings metadata, configuration
    change history and workload resource recommendations."""

    def __init__(self, dal: Optional[SupabaseDal]):
        # The toolset is only usable when a data access layer exists AND is
        # enabled; encode that as a static prerequisite.
        if dal:
            dal_prereq = StaticPrerequisite(
                enabled=dal.enabled, disabled_reason="Data access layer is disabled"
            )
        else:
            dal_prereq = StaticPrerequisite(
                enabled=False,
                disabled_reason="The data access layer is not available",
            )

        super().__init__(
            icon_url="https://cdn.prod.website-files.com/633e9bac8f71dfb7a8e4c9a6/646be7710db810b14133bdb5_logo.svg",
            description="Fetches alerts metadata and change history",
            docs_url="https://docs.robusta.dev/master/configuration/holmesgpt/toolsets/robusta.html",
            name="robusta",
            prerequisites=[dal_prereq],
            tools=[
                FetchRobustaFinding(dal),
                FetchConfigurationChanges(dal),
                FetchResourceRecommendation(dal),
            ],
            tags=[
                ToolsetTag.CORE,
            ],
            is_default=True,
        )

        # LLM guidance lives in a jinja2 template next to this module.
        instructions_path = os.path.abspath(
            os.path.join(os.path.dirname(__file__), "robusta_instructions.jinja2")
        )
        self._load_llm_instructions(jinja_template=f"file://{instructions_path}")

    def get_example_config(self) -> Dict[str, Any]:
        return {}
@@ -0,0 +1,24 @@
1
+ # Configuration and alerts history
2
+ * Use fetch_configuration_changes to get historical configuration changes
3
+ * You must ALWAYS call fetch_configuration_changes when investigating an alert
4
+ * Never respond without calling fetch_configuration_changes
5
+ * When investigating an alert, look at historical configuration changes that happen 4 hours before the alert started
6
+ * If you found a change that caused the alert, you MUST write: 'The issue was introduced by ...' with a short description of the change, and the date of it.
7
+ For example:
8
+ 'The issue was introduced by a change in the environment variables, on 2025-03-28 10:56:00 << { "type": "diff", "evidence_id": "8a4d1369-0e98-4ff2-b180-699d5ff286ab", "title": "Change in environment variables" } >>'
9
+ * Embed the related historical configuration changes only in the 'Conclusions and Possible Root Causes' section
10
+ * Never add the same change more than once to the output
11
+ * Embed it in with the evidence id and a title describing the change. Use this format:
12
+ << { "type": "diff", "evidence_id": "8a4d1369-0e98-4ff2-b180-699d5ff286ab", "title": "Image change on the DB workload" } >>
13
+
14
+ # Resource and efficiency recommendations
15
+ * Use fetch_resource_recommendation to get resource recommendations for a given kubernetes workload
16
+ * Resource recommendations contains memory and cpu recommended request and limits for a given workload
17
+ * When asked if a resource can be optimized, or if a resource is over-utilized, use the fetch_resource_recommendation tool to answer
18
+ * Right sizing of resources is a key to avoiding performance issues
19
+ * Right sizing of resources can also lead to cost savings
20
+
21
+ # Investigating issues
22
+ * If provided an issue id (a.k.a. a finding), use `fetch_finding_by_id` to get more information about that issue
23
+ * You may be given an issue id in the following format: << { "type": "issue", "id": "<the id of the issue>" } >>
24
+ * The issue ID may be inside this prompt if given as part of an investigation. In that case, do call the tool `fetch_finding_by_id` to make sure you have all the information
File without changes
@@ -0,0 +1,78 @@
1
+ import logging
2
+ from typing import Any, Dict
3
+
4
+ from holmes.core.tools import (
5
+ StructuredToolResult,
6
+ Tool,
7
+ ToolParameter,
8
+ ToolResultStatus,
9
+ Toolset,
10
+ ToolsetTag,
11
+ )
12
+ from holmes.plugins.runbooks import get_runbook_by_path
13
+
14
+
15
+ # TODO(mainred): currently we support fetch runbooks hosted internally, in the future we may want to support fetching
16
+ # runbooks from external sources as well.
17
# TODO(mainred): currently we support fetch runbooks hosted internally, in the future we may want to support fetching
# runbooks from external sources as well.
class RunbookFetcher(Tool):
    """Tool that returns the content of an internally hosted runbook."""

    toolset: "RunbookToolset"

    def __init__(self, toolset: "RunbookToolset"):
        super().__init__(
            name="fetch_runbook",
            description="Get runbook content by runbook link. Use this to get troubleshooting steps for incidents",
            parameters={
                # use link as a more generic term for runbook path, considering we may have external links in the future
                "link": ToolParameter(
                    description="The link to the runbook",
                    type="string",
                    required=True,
                ),
            },
            toolset=toolset,  # type: ignore
        )

    def _invoke(self, params: Any) -> StructuredToolResult:
        path: str = params["link"]

        # Bug fix: resolve the runbook path inside the try block. Previously a
        # failing lookup in get_runbook_by_path escaped as an unhandled
        # exception instead of producing a structured ERROR result.
        try:
            runbook_path = get_runbook_by_path(path)
            with open(runbook_path, "r") as file:
                content = file.read()
            return StructuredToolResult(
                status=ToolResultStatus.SUCCESS,
                data=content,
                params=params,
            )
        except Exception as e:
            err_msg = f"Failed to read runbook {path}: {str(e)}"
            logging.error(err_msg)
            return StructuredToolResult(
                status=ToolResultStatus.ERROR,
                error=err_msg,
                params=params,
            )

    def get_parameterized_one_liner(self, params) -> str:
        path: str = params["link"]
        return f"fetched runbook {path}"
59
+
60
+
61
class RunbookToolset(Toolset):
    """Toolset that lets the LLM pull troubleshooting runbooks by link."""

    def __init__(self):
        super().__init__(
            name="runbook",
            description="Fetch runbooks",
            icon_url="https://platform.robusta.dev/demos/runbook.svg",
            docs_url="https://docs.robusta.dev/master/configuration/holmesgpt/toolsets/runbook.html",
            tools=[RunbookFetcher(self)],
            tags=[
                ToolsetTag.CORE,
            ],
            is_default=True,
        )

    def get_example_config(self) -> Dict[str, Any]:
        # No configuration is needed for this toolset.
        return {}
@@ -0,0 +1,92 @@
1
+ import logging
2
+ import os
3
+ from typing import List, Optional
4
+
5
+ from kubernetes import client # type: ignore
6
+ from kubernetes import config # type: ignore
7
+ from kubernetes.client import V1ServiceList # type: ignore
8
+ from kubernetes.client.models.v1_service import V1Service # type: ignore
9
+
10
# DNS suffix used to build in-cluster service URLs; override via the
# CLUSTER_DOMAIN env var for clusters with a non-default domain.
CLUSTER_DOMAIN = os.environ.get("CLUSTER_DOMAIN", "cluster.local")

# Initialize the kubernetes client configuration at import time: prefer
# in-cluster credentials when running inside a pod (KUBERNETES_SERVICE_HOST is
# set by the kubelet), otherwise fall back to the local kubeconfig file.
try:
    if os.getenv("KUBERNETES_SERVICE_HOST"):
        config.load_incluster_config()
    else:
        config.load_kube_config()
except config.config_exception.ConfigException as e:
    # Non-fatal: discovery calls below will simply fail and return None.
    logging.warning(f"Running without kube-config! e={e}")
19
+
20
+
21
def find_service_url(label_selector):
    """
    Get the url of an in-cluster service with a specific label.

    Returns the http URL of the first matching service (using its first
    declared port), or None when nothing matches or the API call fails.
    """
    # we do it this way because there is a weird issue with hikaru's ServiceList.listServiceForAllNamespaces()
    try:
        v1 = client.CoreV1Api()
        svc_list: V1ServiceList = v1.list_service_for_all_namespaces(  # type: ignore
            label_selector=label_selector
        )
        if not svc_list.items:
            return None
        svc: V1Service = svc_list.items[0]
        name = svc.metadata.name
        namespace = svc.metadata.namespace
        port = svc.spec.ports[0].port
        url = f"http://{name}.{namespace}.svc.{CLUSTER_DOMAIN}:{port}"
        logging.info(
            f"discovered service with label-selector: `{label_selector}` at url: `{url}`"
        )
        return url
    except Exception:
        # Bug fix: the old bare "Error finding url" message dropped both the
        # selector and the traceback, making failures impossible to diagnose.
        logging.warning(
            f"Error finding url for label-selector: `{label_selector}`",
            exc_info=True,
        )
        return None
45
+
46
+
47
class ServiceDiscovery:
    @classmethod
    def find_url(cls, selectors: List[str], error_msg: str) -> Optional[str]:
        """
        Try to autodiscover the url of an in-cluster service.

        Probes each label selector in order and returns the first URL found;
        logs error_msg at debug level and returns None when none match.
        """
        for candidate_selector in selectors:
            url = find_service_url(candidate_selector)
            if url:
                return url

        logging.debug(error_msg)
        return None
61
+
62
+
63
class PrometheusDiscovery(ServiceDiscovery):
    """Autodiscovery of Prometheus-compatible metric backends inside the cluster."""

    @classmethod
    def find_prometheus_url(cls) -> Optional[str]:
        # Selectors are ordered: plain Prometheus installs first, then
        # Thanos query frontends as a fallback.
        return super().find_url(
            selectors=[
                "app=kube-prometheus-stack-prometheus",
                "app=prometheus,component=server,release!=kubecost",
                "app=prometheus-server",
                "app=prometheus-operator-prometheus",
                "app=rancher-monitoring-prometheus",
                "app=prometheus-prometheus",
                "app.kubernetes.io/component=query,app.kubernetes.io/name=thanos",
                "app.kubernetes.io/name=thanos-query",
                "app=thanos-query",
                "app=thanos-querier",
            ],
            error_msg="Prometheus url could not be found. Add 'prometheus_url' under your prometheus tools config",
        )

    @classmethod
    def find_vm_url(cls) -> Optional[str]:
        # VictoriaMetrics variants: single-node installs first, then the
        # clustered vmselect query component.
        return super().find_url(
            selectors=[
                "app.kubernetes.io/name=vmsingle",
                "app.kubernetes.io/name=victoria-metrics-single",
                "app.kubernetes.io/name=vmselect",
                "app=vmselect",
            ],
            error_msg="Victoria Metrics url could not be found. Add 'prometheus_url' under your prometheus tools config",
        )