holmesgpt 0.11.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of holmesgpt might be problematic. Click here for more details.
- holmes/.git_archival.json +7 -0
- holmes/__init__.py +76 -0
- holmes/__init__.py.bak +76 -0
- holmes/clients/robusta_client.py +24 -0
- holmes/common/env_vars.py +47 -0
- holmes/config.py +526 -0
- holmes/core/__init__.py +0 -0
- holmes/core/conversations.py +578 -0
- holmes/core/investigation.py +152 -0
- holmes/core/investigation_structured_output.py +264 -0
- holmes/core/issue.py +54 -0
- holmes/core/llm.py +250 -0
- holmes/core/models.py +157 -0
- holmes/core/openai_formatting.py +51 -0
- holmes/core/performance_timing.py +72 -0
- holmes/core/prompt.py +42 -0
- holmes/core/resource_instruction.py +17 -0
- holmes/core/runbooks.py +26 -0
- holmes/core/safeguards.py +120 -0
- holmes/core/supabase_dal.py +540 -0
- holmes/core/tool_calling_llm.py +798 -0
- holmes/core/tools.py +566 -0
- holmes/core/tools_utils/__init__.py +0 -0
- holmes/core/tools_utils/tool_executor.py +65 -0
- holmes/core/tools_utils/toolset_utils.py +52 -0
- holmes/core/toolset_manager.py +418 -0
- holmes/interactive.py +229 -0
- holmes/main.py +1041 -0
- holmes/plugins/__init__.py +0 -0
- holmes/plugins/destinations/__init__.py +6 -0
- holmes/plugins/destinations/slack/__init__.py +2 -0
- holmes/plugins/destinations/slack/plugin.py +163 -0
- holmes/plugins/interfaces.py +32 -0
- holmes/plugins/prompts/__init__.py +48 -0
- holmes/plugins/prompts/_current_date_time.jinja2 +1 -0
- holmes/plugins/prompts/_default_log_prompt.jinja2 +11 -0
- holmes/plugins/prompts/_fetch_logs.jinja2 +36 -0
- holmes/plugins/prompts/_general_instructions.jinja2 +86 -0
- holmes/plugins/prompts/_global_instructions.jinja2 +12 -0
- holmes/plugins/prompts/_runbook_instructions.jinja2 +13 -0
- holmes/plugins/prompts/_toolsets_instructions.jinja2 +56 -0
- holmes/plugins/prompts/generic_ask.jinja2 +36 -0
- holmes/plugins/prompts/generic_ask_conversation.jinja2 +32 -0
- holmes/plugins/prompts/generic_ask_for_issue_conversation.jinja2 +50 -0
- holmes/plugins/prompts/generic_investigation.jinja2 +42 -0
- holmes/plugins/prompts/generic_post_processing.jinja2 +13 -0
- holmes/plugins/prompts/generic_ticket.jinja2 +12 -0
- holmes/plugins/prompts/investigation_output_format.jinja2 +32 -0
- holmes/plugins/prompts/kubernetes_workload_ask.jinja2 +84 -0
- holmes/plugins/prompts/kubernetes_workload_chat.jinja2 +39 -0
- holmes/plugins/runbooks/README.md +22 -0
- holmes/plugins/runbooks/__init__.py +100 -0
- holmes/plugins/runbooks/catalog.json +14 -0
- holmes/plugins/runbooks/jira.yaml +12 -0
- holmes/plugins/runbooks/kube-prometheus-stack.yaml +10 -0
- holmes/plugins/runbooks/networking/dns_troubleshooting_instructions.md +66 -0
- holmes/plugins/runbooks/upgrade/upgrade_troubleshooting_instructions.md +44 -0
- holmes/plugins/sources/github/__init__.py +77 -0
- holmes/plugins/sources/jira/__init__.py +123 -0
- holmes/plugins/sources/opsgenie/__init__.py +93 -0
- holmes/plugins/sources/pagerduty/__init__.py +147 -0
- holmes/plugins/sources/prometheus/__init__.py +0 -0
- holmes/plugins/sources/prometheus/models.py +104 -0
- holmes/plugins/sources/prometheus/plugin.py +154 -0
- holmes/plugins/toolsets/__init__.py +171 -0
- holmes/plugins/toolsets/aks-node-health.yaml +65 -0
- holmes/plugins/toolsets/aks.yaml +86 -0
- holmes/plugins/toolsets/argocd.yaml +70 -0
- holmes/plugins/toolsets/atlas_mongodb/instructions.jinja2 +8 -0
- holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +307 -0
- holmes/plugins/toolsets/aws.yaml +76 -0
- holmes/plugins/toolsets/azure_sql/__init__.py +0 -0
- holmes/plugins/toolsets/azure_sql/apis/alert_monitoring_api.py +600 -0
- holmes/plugins/toolsets/azure_sql/apis/azure_sql_api.py +309 -0
- holmes/plugins/toolsets/azure_sql/apis/connection_failure_api.py +445 -0
- holmes/plugins/toolsets/azure_sql/apis/connection_monitoring_api.py +251 -0
- holmes/plugins/toolsets/azure_sql/apis/storage_analysis_api.py +317 -0
- holmes/plugins/toolsets/azure_sql/azure_base_toolset.py +55 -0
- holmes/plugins/toolsets/azure_sql/azure_sql_instructions.jinja2 +137 -0
- holmes/plugins/toolsets/azure_sql/azure_sql_toolset.py +183 -0
- holmes/plugins/toolsets/azure_sql/install.md +66 -0
- holmes/plugins/toolsets/azure_sql/tools/__init__.py +1 -0
- holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +324 -0
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +243 -0
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +205 -0
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +249 -0
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +373 -0
- holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +237 -0
- holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +172 -0
- holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +170 -0
- holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +188 -0
- holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +180 -0
- holmes/plugins/toolsets/azure_sql/utils.py +83 -0
- holmes/plugins/toolsets/bash/__init__.py +0 -0
- holmes/plugins/toolsets/bash/bash_instructions.jinja2 +14 -0
- holmes/plugins/toolsets/bash/bash_toolset.py +208 -0
- holmes/plugins/toolsets/bash/common/bash.py +52 -0
- holmes/plugins/toolsets/bash/common/config.py +14 -0
- holmes/plugins/toolsets/bash/common/stringify.py +25 -0
- holmes/plugins/toolsets/bash/common/validators.py +24 -0
- holmes/plugins/toolsets/bash/grep/__init__.py +52 -0
- holmes/plugins/toolsets/bash/kubectl/__init__.py +100 -0
- holmes/plugins/toolsets/bash/kubectl/constants.py +96 -0
- holmes/plugins/toolsets/bash/kubectl/kubectl_describe.py +66 -0
- holmes/plugins/toolsets/bash/kubectl/kubectl_events.py +88 -0
- holmes/plugins/toolsets/bash/kubectl/kubectl_get.py +108 -0
- holmes/plugins/toolsets/bash/kubectl/kubectl_logs.py +20 -0
- holmes/plugins/toolsets/bash/kubectl/kubectl_run.py +46 -0
- holmes/plugins/toolsets/bash/kubectl/kubectl_top.py +81 -0
- holmes/plugins/toolsets/bash/parse_command.py +103 -0
- holmes/plugins/toolsets/confluence.yaml +19 -0
- holmes/plugins/toolsets/consts.py +5 -0
- holmes/plugins/toolsets/coralogix/api.py +158 -0
- holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +103 -0
- holmes/plugins/toolsets/coralogix/utils.py +181 -0
- holmes/plugins/toolsets/datadog.py +153 -0
- holmes/plugins/toolsets/docker.yaml +46 -0
- holmes/plugins/toolsets/git.py +756 -0
- holmes/plugins/toolsets/grafana/__init__.py +0 -0
- holmes/plugins/toolsets/grafana/base_grafana_toolset.py +54 -0
- holmes/plugins/toolsets/grafana/common.py +68 -0
- holmes/plugins/toolsets/grafana/grafana_api.py +31 -0
- holmes/plugins/toolsets/grafana/loki_api.py +89 -0
- holmes/plugins/toolsets/grafana/tempo_api.py +124 -0
- holmes/plugins/toolsets/grafana/toolset_grafana.py +102 -0
- holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +102 -0
- holmes/plugins/toolsets/grafana/toolset_grafana_tempo.jinja2 +10 -0
- holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +299 -0
- holmes/plugins/toolsets/grafana/trace_parser.py +195 -0
- holmes/plugins/toolsets/helm.yaml +42 -0
- holmes/plugins/toolsets/internet/internet.py +275 -0
- holmes/plugins/toolsets/internet/notion.py +137 -0
- holmes/plugins/toolsets/kafka.py +638 -0
- holmes/plugins/toolsets/kubernetes.yaml +255 -0
- holmes/plugins/toolsets/kubernetes_logs.py +426 -0
- holmes/plugins/toolsets/kubernetes_logs.yaml +42 -0
- holmes/plugins/toolsets/logging_utils/__init__.py +0 -0
- holmes/plugins/toolsets/logging_utils/logging_api.py +217 -0
- holmes/plugins/toolsets/logging_utils/types.py +0 -0
- holmes/plugins/toolsets/mcp/toolset_mcp.py +135 -0
- holmes/plugins/toolsets/newrelic.py +222 -0
- holmes/plugins/toolsets/opensearch/__init__.py +0 -0
- holmes/plugins/toolsets/opensearch/opensearch.py +245 -0
- holmes/plugins/toolsets/opensearch/opensearch_logs.py +151 -0
- holmes/plugins/toolsets/opensearch/opensearch_traces.py +211 -0
- holmes/plugins/toolsets/opensearch/opensearch_traces_instructions.jinja2 +12 -0
- holmes/plugins/toolsets/opensearch/opensearch_utils.py +166 -0
- holmes/plugins/toolsets/prometheus/prometheus.py +818 -0
- holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2 +38 -0
- holmes/plugins/toolsets/rabbitmq/api.py +398 -0
- holmes/plugins/toolsets/rabbitmq/rabbitmq_instructions.jinja2 +37 -0
- holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +222 -0
- holmes/plugins/toolsets/robusta/__init__.py +0 -0
- holmes/plugins/toolsets/robusta/robusta.py +235 -0
- holmes/plugins/toolsets/robusta/robusta_instructions.jinja2 +24 -0
- holmes/plugins/toolsets/runbook/__init__.py +0 -0
- holmes/plugins/toolsets/runbook/runbook_fetcher.py +78 -0
- holmes/plugins/toolsets/service_discovery.py +92 -0
- holmes/plugins/toolsets/servicenow/install.md +37 -0
- holmes/plugins/toolsets/servicenow/instructions.jinja2 +3 -0
- holmes/plugins/toolsets/servicenow/servicenow.py +198 -0
- holmes/plugins/toolsets/slab.yaml +20 -0
- holmes/plugins/toolsets/utils.py +137 -0
- holmes/plugins/utils.py +14 -0
- holmes/utils/__init__.py +0 -0
- holmes/utils/cache.py +84 -0
- holmes/utils/cert_utils.py +40 -0
- holmes/utils/default_toolset_installation_guide.jinja2 +44 -0
- holmes/utils/definitions.py +13 -0
- holmes/utils/env.py +53 -0
- holmes/utils/file_utils.py +56 -0
- holmes/utils/global_instructions.py +20 -0
- holmes/utils/holmes_status.py +22 -0
- holmes/utils/holmes_sync_toolsets.py +80 -0
- holmes/utils/markdown_utils.py +55 -0
- holmes/utils/pydantic_utils.py +54 -0
- holmes/utils/robusta.py +10 -0
- holmes/utils/tags.py +97 -0
- holmesgpt-0.11.5.dist-info/LICENSE.txt +21 -0
- holmesgpt-0.11.5.dist-info/METADATA +400 -0
- holmesgpt-0.11.5.dist-info/RECORD +183 -0
- holmesgpt-0.11.5.dist-info/WHEEL +4 -0
- holmesgpt-0.11.5.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,222 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import logging
|
|
3
|
+
from typing import Any, List, Optional, Tuple
|
|
4
|
+
|
|
5
|
+
from pydantic import BaseModel
|
|
6
|
+
from holmes.core.tools import (
|
|
7
|
+
CallablePrerequisite,
|
|
8
|
+
StructuredToolResult,
|
|
9
|
+
Tool,
|
|
10
|
+
ToolParameter,
|
|
11
|
+
ToolResultStatus,
|
|
12
|
+
Toolset,
|
|
13
|
+
ToolsetTag,
|
|
14
|
+
)
|
|
15
|
+
from requests import RequestException # type: ignore
|
|
16
|
+
from urllib.parse import urljoin
|
|
17
|
+
|
|
18
|
+
from holmes.plugins.toolsets.rabbitmq.api import (
|
|
19
|
+
ClusterConnectionStatus,
|
|
20
|
+
RabbitMQClusterConfig,
|
|
21
|
+
get_cluster_status,
|
|
22
|
+
make_request,
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class RabbitMQConfig(BaseModel):
    """Top-level toolset configuration: one entry per RabbitMQ cluster."""

    # Each cluster carries its own id, management URL and credentials.
    clusters: List[RabbitMQClusterConfig]
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class BaseRabbitMQTool(Tool):
    """Common base for RabbitMQ tools; resolves the target cluster config."""

    toolset: "RabbitMQToolset"

    def _get_cluster_config(self, cluster_id: Optional[str]) -> RabbitMQClusterConfig:
        """Return the configured cluster matching ``cluster_id``.

        ``cluster_id`` may be omitted when exactly one cluster is configured.
        Raises ValueError when the toolset has no config, when ``cluster_id``
        is required but missing, or when it matches no configured cluster.
        """
        if not self.toolset.config:
            raise ValueError("RabbitMQ is not configured.")
        cluster_ids = [c.id for c in self.toolset.config.clusters]
        if not cluster_id and len(cluster_ids) == 1:
            # cluster id is optional if there is only one configured
            return self.toolset.config.clusters[0]
        elif not cluster_id and len(cluster_ids) > 0:
            # BUGFIX: on this branch clusters ARE configured — the caller just
            # failed to pick one. The previous message wrongly claimed that no
            # cluster was configured, which misleads the LLM/caller.
            raise ValueError(
                f"cluster_id is required because multiple clusters are configured. Possible cluster_id values are: {', '.join(cluster_ids)}"
            )
        elif not cluster_id:
            raise ValueError("No cluster is configured")

        for cluster in self.toolset.config.clusters:
            if cluster.id == cluster_id:
                return cluster

        raise ValueError(
            f"Failed to find cluster_id={cluster_id} amongst configured clusters. Possible cluster_id values are: {', '.join(cluster_ids)}"
        )
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
class ListConfiguredClusters(BaseRabbitMQTool):
    """Tool that lists every configured RabbitMQ cluster that passed its health check."""

    def __init__(self, toolset: "RabbitMQToolset"):
        super().__init__(
            name="list_configured_clusters",
            description="List all configured clusters. Useful to get the id of a configured cluster (cluster_id) and pass as argument to other rabbitmq tool calls.",
            parameters={},
            toolset=toolset,
        )

    def _invoke(self, params: Any) -> StructuredToolResult:
        config = self.toolset.config
        if not config:
            raise ValueError("RabbitMQ is not configured.")

        # Only surface clusters we could actually reach during setup.
        reachable = []
        for cluster in config.clusters:
            if cluster.connection_status != ClusterConnectionStatus.SUCCESS:
                continue
            reachable.append(
                {
                    "cluster_id": cluster.id,
                    "management_url": cluster.management_url,
                    "connection_status": cluster.connection_status,
                }
            )
        return StructuredToolResult(status=ToolResultStatus.SUCCESS, data=reachable)

    def get_parameterized_one_liner(self, params) -> str:
        return "list configured RabbitMQ clusters"
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
class GetRabbitMQClusterStatus(BaseRabbitMQTool):
    """Tool that reports cluster-wide status: nodes, listeners and partitions."""

    def __init__(self, toolset: "RabbitMQToolset"):
        cluster_id_param = ToolParameter(
            description="The id of the cluster obtained with list_configured_clusters. Only required if more than one rabbitmq cluster is configured.",
            type="string",
            required=False,
        )
        super().__init__(
            name="get_rabbitmq_cluster_status",
            description="Fetches the overall status of the RabbitMQ cluster, including node information, listeners, and partition details. Crucial for detecting split-brain scenarios",
            parameters={"cluster_id": cluster_id_param},
            toolset=toolset,
        )

    def _invoke(self, params: Any) -> StructuredToolResult:
        try:
            # Resolve which configured cluster to query, then fetch the node
            # details (these include partition information).
            target = self._get_cluster_config(cluster_id=params.get("cluster_id"))
            status = get_cluster_status(target)
            return StructuredToolResult(status=ToolResultStatus.SUCCESS, data=status)
        except Exception as e:
            logging.info("Failed to process RabbitMQ cluster status", exc_info=True)
            return StructuredToolResult(
                status=ToolResultStatus.ERROR,
                error=f"Unexpected error fetching RabbitMQ cluster status: {str(e)}",
                data=None,
            )

    def get_parameterized_one_liner(self, params) -> str:
        return "get RabbitMQ cluster status and partition information"
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
class RabbitMQToolset(Toolset):
    """Toolset wiring up the RabbitMQ diagnostic tools and validating
    connectivity to each configured cluster's management API."""

    def __init__(self):
        super().__init__(
            name="rabbitmq/core",
            description="Provides tools to interact with RabbitMQ to diagnose cluster health, node status, and specifically network partitions (split-brain).",
            docs_url="https://docs.robusta.dev/master/configuration/holmesgpt/toolsets/rabbitmq.html",
            icon_url="https://cdn.worldvectorlogo.com/logos/rabbitmq.svg",
            prerequisites=[CallablePrerequisite(callable=self.prerequisites_callable)],
            tools=[
                ListConfiguredClusters(toolset=self),
                GetRabbitMQClusterStatus(toolset=self),
            ],
            tags=[ToolsetTag.CORE],
        )
        self._reload_llm_instructions()

    def _reload_llm_instructions(self):
        # Load instructions from the jinja2 template file
        template_file_path = os.path.abspath(
            os.path.join(os.path.dirname(__file__), "rabbitmq_instructions.jinja2")
        )
        self._load_llm_instructions(jinja_template=f"file://{template_file_path}")

    def prerequisites_callable(self, config: dict[str, Any]) -> Tuple[bool, str]:
        """Validate (and possibly build from env vars) the toolset config.

        Returns a (success, message) tuple, as expected by
        CallablePrerequisite. On success, ``self.config`` is populated.
        """
        if not config or not config.get("clusters"):
            # Attempt to load from environment variables as fallback
            env_url = os.environ.get("RABBITMQ_MANAGEMENT_URL")
            # NOTE(review): guest/guest are RabbitMQ's well-known defaults —
            # presumably intentional for local setups; confirm for production.
            env_user = os.environ.get("RABBITMQ_USERNAME", "guest")
            env_pass = os.environ.get("RABBITMQ_PASSWORD", "guest")
            if not env_url:
                return (
                    False,
                    "RabbitMQ toolset is misconfigured. 'management_url' is required.",
                )
            config = {
                "clusters": [
                    {
                        "id": "rabbitmq",
                        "management_url": env_url,
                        "username": env_user,
                        "password": env_pass,
                    }
                ]
            }
            logging.info("Loaded RabbitMQ config from environment variables.")

        try:
            self.config = RabbitMQConfig(**config)
        except Exception as e:
            return (False, f"Failed to parse RabbitMQ configuration: {str(e)}")

        return self._check_clusters_config(self.config)

    def _check_clusters_config(self, config: RabbitMQConfig) -> Tuple[bool, str]:
        """Health-check every cluster via its api/overview endpoint.

        Mutates each cluster's ``connection_status`` and collects one error
        message per unreachable cluster; succeeds only if all are reachable.
        """
        errors = []
        for cluster_config in config.clusters:
            url = urljoin(cluster_config.management_url, "api/overview")

            try:
                data = make_request(
                    config=cluster_config,
                    method="GET",
                    url=url,
                )

                if data:
                    cluster_config.connection_status = ClusterConnectionStatus.SUCCESS
                    # Refresh instructions so the template can reflect the
                    # now-known connection statuses.
                    self._reload_llm_instructions()
                else:
                    error_message = f"Failed to connect to RabbitMQ Management API for cluster with id={cluster_config.id} at {url}. No data returned"
                    cluster_config.connection_status = ClusterConnectionStatus.ERROR
                    errors.append(error_message)

            except RequestException as e:
                error_message = f"Toolset failed health check for cluster with id={cluster_config.id} at {url} due to a failed http request. Connection error: {str(e)}"
                cluster_config.connection_status = ClusterConnectionStatus.ERROR
                errors.append(error_message)
            except Exception as e:
                error_message = f"Toolset failed health check for cluster with id={cluster_config.id} at {url}: {str(e)}"
                cluster_config.connection_status = ClusterConnectionStatus.ERROR
                errors.append(error_message)

        if errors:
            if len(errors) == 1:
                return (False, errors[0])
            else:
                return (False, "\n".join([f"- {error}" for error in errors]))
        else:
            return (True, "")

    def get_example_config(self):
        """Return a sample configuration dict for documentation/UI purposes."""
        example_config = RabbitMQConfig(
            clusters=[
                RabbitMQClusterConfig(
                    management_url="http://<your-rabbitmq-server-or-service>:15672",
                    username="holmes_user",
                    password="holmes_password",
                )
            ]
        )
        return example_config.model_dump()
|
|
File without changes
|
|
@@ -0,0 +1,235 @@
|
|
|
1
|
+
import os
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
|
|
5
|
+
from typing import Optional, Dict, Any, List
|
|
6
|
+
from holmes.core.supabase_dal import SupabaseDal
|
|
7
|
+
from holmes.core.tools import (
|
|
8
|
+
StaticPrerequisite,
|
|
9
|
+
Tool,
|
|
10
|
+
ToolParameter,
|
|
11
|
+
Toolset,
|
|
12
|
+
ToolsetTag,
|
|
13
|
+
)
|
|
14
|
+
from holmes.core.tools import StructuredToolResult, ToolResultStatus
|
|
15
|
+
|
|
16
|
+
# Names of the tool parameters used by the tools in this module.
PARAM_FINDING_ID = "id"
START_TIME = "start_datetime"
END_TIME = "end_datetime"
# NOTE(review): NAMESPACE and WORKLOAD are not referenced in this module's
# visible code — presumably reserved for future filtering; confirm before removing.
NAMESPACE = "namespace"
WORKLOAD = "workload"
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class FetchRobustaFinding(Tool):
    """Tool that fetches a single Robusta finding (alert/event) by its id."""

    _dal: Optional[SupabaseDal]

    def __init__(self, dal: Optional[SupabaseDal]):
        super().__init__(
            name="fetch_finding_by_id",
            description="Fetches a robusta finding. Findings are events, like a Prometheus alert or a deployment update",
            parameters={
                PARAM_FINDING_ID: ToolParameter(
                    description="The id of the finding to fetch",
                    type="string",
                    required=True,
                )
            },
        )
        self._dal = dal

    def _fetch_finding(self, finding_id: str) -> Optional[Dict]:
        """Return the finding data from the DAL, or None when not found."""
        if self._dal and self._dal.enabled:
            return self._dal.get_issue_data(finding_id)
        return None

    def _invoke(self, params: Dict) -> StructuredToolResult:
        finding_id = params[PARAM_FINDING_ID]
        # BUGFIX: a disabled data access layer used to be reported as SUCCESS
        # because the {"error": ...} dict returned for that case is truthy.
        # Report it explicitly as an ERROR result instead.
        if not (self._dal and self._dal.enabled):
            error = f"Failed to find a finding with finding_id={finding_id}: Holmes' data access layer is not enabled."
            logging.error(error)
            return StructuredToolResult(
                status=ToolResultStatus.ERROR,
                error=error,
                params=params,
            )
        try:
            finding = self._fetch_finding(finding_id)
            if finding:
                return StructuredToolResult(
                    status=ToolResultStatus.SUCCESS,
                    data=finding,
                    params=params,
                )
            else:
                return StructuredToolResult(
                    status=ToolResultStatus.NO_DATA,
                    data=f"Could not find a finding with finding_id={finding_id}",
                    params=params,
                )
        except Exception as e:
            logging.error(e)
            logging.error(
                f"There was an internal error while fetching finding {finding_id}. {str(e)}"
            )

            return StructuredToolResult(
                status=ToolResultStatus.ERROR,
                data=f"There was an internal error while fetching finding {finding_id}",
                params=params,
            )

    def get_parameterized_one_liner(self, params: Dict) -> str:
        return "Fetch Alert Metadata"
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
class FetchResourceRecommendation(Tool):
    """Tool returning current and recommended resource requests/limits for a workload."""

    _dal: Optional[SupabaseDal]

    def __init__(self, dal: Optional[SupabaseDal]):
        workload_params = {
            "name": ToolParameter(
                description="The name of the kubernetes workload.",
                type="string",
                required=True,
            ),
            "namespace": ToolParameter(
                description="The namespace of the kubernetes resource.",
                type="string",
                required=True,
            ),
            "kind": ToolParameter(
                description="The kind of the kubernetes resource. Must be one of: [Deployment, StatefulSet, DaemonSet, Job].",
                type="string",
                required=True,
            ),
        }
        super().__init__(
            name="fetch_resource_recommendation",
            description="Fetch workload recommendations for resources requests and limits. Returns the current configured resources, as well as recommendation based on actual historical usage.",
            parameters=workload_params,
        )
        self._dal = dal

    def _resource_recommendation(self, params: Dict) -> Optional[List[Dict]]:
        # Without an enabled data access layer there is nothing to query.
        if not (self._dal and self._dal.enabled):
            return None
        return self._dal.get_resource_recommendation(
            name=params["name"],
            namespace=params["namespace"],
            kind=params["kind"],
        )

    def _invoke(self, params: Dict) -> StructuredToolResult:
        try:
            recommendations = self._resource_recommendation(params)
            if not recommendations:
                return StructuredToolResult(
                    status=ToolResultStatus.NO_DATA,
                    data=f"Could not find recommendations for {params}",
                    params=params,
                )
            return StructuredToolResult(
                status=ToolResultStatus.SUCCESS,
                data=recommendations,
                params=params,
            )
        except Exception as e:
            msg = f"There was an internal error while fetching recommendations for {params}. {str(e)}"
            logging.exception(msg)
            return StructuredToolResult(
                status=ToolResultStatus.ERROR,
                data=msg,
                params=params,
            )

    def get_parameterized_one_liner(self, params: Dict) -> str:
        return f"Check Historical Resource Utilization: ({str(params)})"
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
class FetchConfigurationChanges(Tool):
    """Tool that fetches configuration changes recorded in a given time range."""

    _dal: Optional[SupabaseDal]

    def __init__(self, dal: Optional[SupabaseDal]):
        super().__init__(
            name="fetch_configuration_changes",
            description="Fetch configuration changes in a given time range. By default, fetch all cluster changes. Can be filtered on a given namespace or a specific workload",
            parameters={
                START_TIME: ToolParameter(
                    description="The starting time boundary for the search period. String in RFC3339 format.",
                    type="string",
                    required=True,
                ),
                END_TIME: ToolParameter(
                    # BUGFIX: this previously said "starting" (copy-paste from
                    # START_TIME); it describes the end of the search window.
                    description="The ending time boundary for the search period. String in RFC3339 format.",
                    type="string",
                    required=True,
                ),
            },
        )
        self._dal = dal

    def _fetch_change_history(self, params: Dict) -> Optional[List[Dict]]:
        """Query the DAL for changes in the window; None when DAL is unavailable."""
        if self._dal and self._dal.enabled:
            return self._dal.get_configuration_changes(
                start_datetime=params["start_datetime"],
                end_datetime=params["end_datetime"],
            )
        return None

    def _invoke(self, params: Dict) -> StructuredToolResult:
        try:
            changes = self._fetch_change_history(params)
            if changes:
                return StructuredToolResult(
                    status=ToolResultStatus.SUCCESS,
                    data=changes,
                    params=params,
                )
            else:
                return StructuredToolResult(
                    status=ToolResultStatus.NO_DATA,
                    data=f"Could not find changes for {params}",
                    params=params,
                )
        except Exception as e:
            msg = f"There was an internal error while fetching changes for {params}. {str(e)}"
            logging.exception(msg)
            return StructuredToolResult(
                status=ToolResultStatus.ERROR,
                data=msg,
                params=params,
            )

    def get_parameterized_one_liner(self, params: Dict) -> str:
        return f"Search Change History: ({str(params)})"
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
class RobustaToolset(Toolset):
    """Toolset exposing Robusta platform data: findings, config changes and
    resource recommendations, all backed by the Supabase data access layer."""

    def __init__(self, dal: Optional[SupabaseDal]):
        # The prerequisite mirrors the DAL's availability: without a DAL the
        # toolset is disabled outright; with one, it follows dal.enabled.
        if dal:
            dal_prereq = StaticPrerequisite(
                enabled=dal.enabled, disabled_reason="Data access layer is disabled"
            )
        else:
            dal_prereq = StaticPrerequisite(
                enabled=False,
                disabled_reason="The data access layer is not available",
            )

        super().__init__(
            icon_url="https://cdn.prod.website-files.com/633e9bac8f71dfb7a8e4c9a6/646be7710db810b14133bdb5_logo.svg",
            description="Fetches alerts metadata and change history",
            docs_url="https://docs.robusta.dev/master/configuration/holmesgpt/toolsets/robusta.html",
            name="robusta",
            prerequisites=[dal_prereq],
            tools=[
                FetchRobustaFinding(dal),
                FetchConfigurationChanges(dal),
                FetchResourceRecommendation(dal),
            ],
            tags=[
                ToolsetTag.CORE,
            ],
            is_default=True,
        )
        instructions_path = os.path.abspath(
            os.path.join(os.path.dirname(__file__), "robusta_instructions.jinja2")
        )
        self._load_llm_instructions(jinja_template=f"file://{instructions_path}")

    def get_example_config(self) -> Dict[str, Any]:
        """This toolset needs no user-supplied configuration."""
        return {}
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# Configuration and alerts history
|
|
2
|
+
* Use fetch_configuration_changes to get historical configuration changes
|
|
3
|
+
* You must ALWAYS call fetch_configuration_changes when investigating an alert
|
|
4
|
+
* Never respond without calling fetch_configuration_changes
|
|
5
|
+
* When investigating an alert, look at historical configuration changes that happen 4 hours before the alert started
|
|
6
|
+
* If you found a change that caused the alert, you MUST write: 'The issue was introduced by ...' with a short description of the change, and the date of it.
|
|
7
|
+
For example:
|
|
8
|
+
'The issue was introduced by a change in the environment variables, on 2025-03-28 10:56:00 << { "type": "diff", "evidence_id": "8a4d1369-0e98-4ff2-b180-699d5ff286ab", "title": "Change in environment variables" } >>'
|
|
9
|
+
* Embed the related historical configuration changes only in the 'Conclusions and Possible Root Causes' section
|
|
10
|
+
* Never add the same change more than once to the output
|
|
11
|
+
* Embed it in with the evidence id and a title describing the change. Use this format:
|
|
12
|
+
<< { "type": "diff", "evidence_id": "8a4d1369-0e98-4ff2-b180-699d5ff286ab", "title": "Image change on the DB workload" } >>
|
|
13
|
+
|
|
14
|
+
# Resource and efficiency recommendations
|
|
15
|
+
* Use fetch_resource_recommendation to get resource recommendations for a given kubernetes workload
|
|
16
|
+
* Resource recommendations contains memory and cpu recommended request and limits for a given workload
|
|
17
|
+
* When asked if a resource can be optimized, or if a resource is over-utilized, use the fetch_resource_recommendation tool to answer
|
|
18
|
+
* Right sizing of resources is a key to avoiding performance issues
|
|
19
|
+
* Right sizing of resources can also lead to cost savings
|
|
20
|
+
|
|
21
|
+
# Investigating issues
|
|
22
|
+
* If provided an issue id (a.k.a. a finding), use `fetch_finding_by_id` to get more information about that issue
|
|
23
|
+
* You may be given an issue id in the following format: << { "type": "issue", "id": "<the id of the issue>" } >>
|
|
24
|
+
* The issue ID may be inside this prompt if given as part of an investigation. In that case, do call the tool `fetch_finding_by_id` to make sure you have all the information
|
|
File without changes
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from typing import Any, Dict
|
|
3
|
+
|
|
4
|
+
from holmes.core.tools import (
|
|
5
|
+
StructuredToolResult,
|
|
6
|
+
Tool,
|
|
7
|
+
ToolParameter,
|
|
8
|
+
ToolResultStatus,
|
|
9
|
+
Toolset,
|
|
10
|
+
ToolsetTag,
|
|
11
|
+
)
|
|
12
|
+
from holmes.plugins.runbooks import get_runbook_by_path
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# TODO(mainred): currently we support fetch runbooks hosted internally, in the future we may want to support fetching
|
|
16
|
+
# runbooks from external sources as well.
|
|
17
|
+
class RunbookFetcher(Tool):
    """Tool that reads an internally-hosted runbook file and returns its contents."""

    toolset: "RunbookToolset"

    def __init__(self, toolset: "RunbookToolset"):
        # "link" is a deliberately generic term for the runbook path, since
        # external links may be supported in the future.
        link_param = ToolParameter(
            description="The link to the runbook",
            type="string",
            required=True,
        )
        super().__init__(
            name="fetch_runbook",
            description="Get runbook content by runbook link. Use this to get troubleshooting steps for incidents",
            parameters={"link": link_param},
            toolset=toolset,  # type: ignore
        )

    def _invoke(self, params: Any) -> StructuredToolResult:
        link: str = params["link"]

        resolved_path = get_runbook_by_path(link)
        try:
            with open(resolved_path, "r") as fh:
                content = fh.read()
            return StructuredToolResult(
                status=ToolResultStatus.SUCCESS,
                data=content,
                params=params,
            )
        except Exception as e:
            err_msg = f"Failed to read runbook {resolved_path}: {str(e)}"
            logging.error(err_msg)
            return StructuredToolResult(
                status=ToolResultStatus.ERROR,
                error=err_msg,
                params=params,
            )

    def get_parameterized_one_liner(self, params) -> str:
        path: str = params["link"]
        return f"fetched runbook {path}"
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class RunbookToolset(Toolset):
    """Toolset exposing the internal runbook-fetching tool."""

    def __init__(self):
        super().__init__(
            name="runbook",
            description="Fetch runbooks",
            icon_url="https://platform.robusta.dev/demos/runbook.svg",
            tools=[RunbookFetcher(self)],
            docs_url="https://docs.robusta.dev/master/configuration/holmesgpt/toolsets/runbook.html",
            tags=[ToolsetTag.CORE],
            is_default=True,
        )

    def get_example_config(self) -> Dict[str, Any]:
        """Return an example configuration; this toolset takes no options."""
        return {}
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import os
|
|
3
|
+
from typing import List, Optional
|
|
4
|
+
|
|
5
|
+
from kubernetes import client # type: ignore
|
|
6
|
+
from kubernetes import config # type: ignore
|
|
7
|
+
from kubernetes.client import V1ServiceList # type: ignore
|
|
8
|
+
from kubernetes.client.models.v1_service import V1Service # type: ignore
|
|
9
|
+
|
|
10
|
+
# Cluster DNS suffix used when building in-cluster service URLs; override
# via the CLUSTER_DOMAIN env var for clusters with a non-default domain.
CLUSTER_DOMAIN = os.environ.get("CLUSTER_DOMAIN", "cluster.local")

try:
    # KUBERNETES_SERVICE_HOST is injected by Kubernetes into every pod, so
    # its presence distinguishes in-cluster from local execution.
    if os.getenv("KUBERNETES_SERVICE_HOST"):
        config.load_incluster_config()
    else:
        config.load_kube_config()
except config.config_exception.ConfigException as e:
    # Not fatal at import time: discovery calls below will simply fail to
    # find services and return None.
    logging.warning(f"Running without kube-config! e={e}")
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def find_service_url(label_selector):
    """
    Get the url of an in-cluster service with a specific label.

    Returns the http url of the first matching service, or None when no
    service matches the selector or the lookup fails for any reason.
    """
    # we do it this way because there is a weird issue with hikaru's ServiceList.listServiceForAllNamespaces()
    try:
        v1 = client.CoreV1Api()
        svc_list: V1ServiceList = v1.list_service_for_all_namespaces(  # type: ignore
            label_selector=label_selector
        )
        if not svc_list.items:
            return None
        svc: V1Service = svc_list.items[0]
        name = svc.metadata.name
        namespace = svc.metadata.namespace
        # Uses the first declared port; assumes it is the service's HTTP
        # port -- TODO confirm for multi-port services.
        port = svc.spec.ports[0].port
        url = f"http://{name}.{namespace}.svc.{CLUSTER_DOMAIN}:{port}"
        logging.info(
            f"discovered service with label-selector: `{label_selector}` at url: `{url}`"
        )
        return url
    except Exception:
        # Include the selector and the traceback: the previous constant
        # message ("Error finding url") gave no hint about what failed.
        logging.warning(
            f"Error finding service url for label-selector: `{label_selector}`",
            exc_info=True,
        )
        return None
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class ServiceDiscovery:
    @classmethod
    def find_url(cls, selectors: List[str], error_msg: str) -> Optional[str]:
        """
        Try to autodiscover the url of an in-cluster service.

        Probes each label selector in order and returns the first url that
        resolves; when none do, logs `error_msg` at debug level and returns
        None.
        """
        # Lazily probe selectors and stop at the first hit.
        discovered = next(
            (url for sel in selectors if (url := find_service_url(sel))),
            None,
        )
        if discovered is None:
            logging.debug(error_msg)
        return discovered
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
class PrometheusDiscovery(ServiceDiscovery):
    """Autodiscovery of Prometheus-compatible query endpoints in-cluster."""

    # Label selectors for common Prometheus / Thanos installations,
    # probed in order.
    _PROMETHEUS_SELECTORS = [
        "app=kube-prometheus-stack-prometheus",
        "app=prometheus,component=server,release!=kubecost",
        "app=prometheus-server",
        "app=prometheus-operator-prometheus",
        "app=rancher-monitoring-prometheus",
        "app=prometheus-prometheus",
        "app.kubernetes.io/component=query,app.kubernetes.io/name=thanos",
        "app.kubernetes.io/name=thanos-query",
        "app=thanos-query",
        "app=thanos-querier",
    ]

    # Label selectors for common VictoriaMetrics installations.
    _VICTORIA_METRICS_SELECTORS = [
        "app.kubernetes.io/name=vmsingle",
        "app.kubernetes.io/name=victoria-metrics-single",
        "app.kubernetes.io/name=vmselect",
        "app=vmselect",
    ]

    @classmethod
    def find_prometheus_url(cls) -> Optional[str]:
        """Return the first discovered Prometheus/Thanos url, or None."""
        return super().find_url(
            selectors=cls._PROMETHEUS_SELECTORS,
            error_msg="Prometheus url could not be found. Add 'prometheus_url' under your prometheus tools config",
        )

    @classmethod
    def find_vm_url(cls) -> Optional[str]:
        """Return the first discovered VictoriaMetrics url, or None."""
        return super().find_url(
            selectors=cls._VICTORIA_METRICS_SELECTORS,
            error_msg="Victoria Metrics url could not be found. Add 'prometheus_url' under your prometheus tools config",
        )
|