holmesgpt-0.11.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of holmesgpt might be problematic.
- holmes/.git_archival.json +7 -0
- holmes/__init__.py +76 -0
- holmes/__init__.py.bak +76 -0
- holmes/clients/robusta_client.py +24 -0
- holmes/common/env_vars.py +47 -0
- holmes/config.py +526 -0
- holmes/core/__init__.py +0 -0
- holmes/core/conversations.py +578 -0
- holmes/core/investigation.py +152 -0
- holmes/core/investigation_structured_output.py +264 -0
- holmes/core/issue.py +54 -0
- holmes/core/llm.py +250 -0
- holmes/core/models.py +157 -0
- holmes/core/openai_formatting.py +51 -0
- holmes/core/performance_timing.py +72 -0
- holmes/core/prompt.py +42 -0
- holmes/core/resource_instruction.py +17 -0
- holmes/core/runbooks.py +26 -0
- holmes/core/safeguards.py +120 -0
- holmes/core/supabase_dal.py +540 -0
- holmes/core/tool_calling_llm.py +798 -0
- holmes/core/tools.py +566 -0
- holmes/core/tools_utils/__init__.py +0 -0
- holmes/core/tools_utils/tool_executor.py +65 -0
- holmes/core/tools_utils/toolset_utils.py +52 -0
- holmes/core/toolset_manager.py +418 -0
- holmes/interactive.py +229 -0
- holmes/main.py +1041 -0
- holmes/plugins/__init__.py +0 -0
- holmes/plugins/destinations/__init__.py +6 -0
- holmes/plugins/destinations/slack/__init__.py +2 -0
- holmes/plugins/destinations/slack/plugin.py +163 -0
- holmes/plugins/interfaces.py +32 -0
- holmes/plugins/prompts/__init__.py +48 -0
- holmes/plugins/prompts/_current_date_time.jinja2 +1 -0
- holmes/plugins/prompts/_default_log_prompt.jinja2 +11 -0
- holmes/plugins/prompts/_fetch_logs.jinja2 +36 -0
- holmes/plugins/prompts/_general_instructions.jinja2 +86 -0
- holmes/plugins/prompts/_global_instructions.jinja2 +12 -0
- holmes/plugins/prompts/_runbook_instructions.jinja2 +13 -0
- holmes/plugins/prompts/_toolsets_instructions.jinja2 +56 -0
- holmes/plugins/prompts/generic_ask.jinja2 +36 -0
- holmes/plugins/prompts/generic_ask_conversation.jinja2 +32 -0
- holmes/plugins/prompts/generic_ask_for_issue_conversation.jinja2 +50 -0
- holmes/plugins/prompts/generic_investigation.jinja2 +42 -0
- holmes/plugins/prompts/generic_post_processing.jinja2 +13 -0
- holmes/plugins/prompts/generic_ticket.jinja2 +12 -0
- holmes/plugins/prompts/investigation_output_format.jinja2 +32 -0
- holmes/plugins/prompts/kubernetes_workload_ask.jinja2 +84 -0
- holmes/plugins/prompts/kubernetes_workload_chat.jinja2 +39 -0
- holmes/plugins/runbooks/README.md +22 -0
- holmes/plugins/runbooks/__init__.py +100 -0
- holmes/plugins/runbooks/catalog.json +14 -0
- holmes/plugins/runbooks/jira.yaml +12 -0
- holmes/plugins/runbooks/kube-prometheus-stack.yaml +10 -0
- holmes/plugins/runbooks/networking/dns_troubleshooting_instructions.md +66 -0
- holmes/plugins/runbooks/upgrade/upgrade_troubleshooting_instructions.md +44 -0
- holmes/plugins/sources/github/__init__.py +77 -0
- holmes/plugins/sources/jira/__init__.py +123 -0
- holmes/plugins/sources/opsgenie/__init__.py +93 -0
- holmes/plugins/sources/pagerduty/__init__.py +147 -0
- holmes/plugins/sources/prometheus/__init__.py +0 -0
- holmes/plugins/sources/prometheus/models.py +104 -0
- holmes/plugins/sources/prometheus/plugin.py +154 -0
- holmes/plugins/toolsets/__init__.py +171 -0
- holmes/plugins/toolsets/aks-node-health.yaml +65 -0
- holmes/plugins/toolsets/aks.yaml +86 -0
- holmes/plugins/toolsets/argocd.yaml +70 -0
- holmes/plugins/toolsets/atlas_mongodb/instructions.jinja2 +8 -0
- holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +307 -0
- holmes/plugins/toolsets/aws.yaml +76 -0
- holmes/plugins/toolsets/azure_sql/__init__.py +0 -0
- holmes/plugins/toolsets/azure_sql/apis/alert_monitoring_api.py +600 -0
- holmes/plugins/toolsets/azure_sql/apis/azure_sql_api.py +309 -0
- holmes/plugins/toolsets/azure_sql/apis/connection_failure_api.py +445 -0
- holmes/plugins/toolsets/azure_sql/apis/connection_monitoring_api.py +251 -0
- holmes/plugins/toolsets/azure_sql/apis/storage_analysis_api.py +317 -0
- holmes/plugins/toolsets/azure_sql/azure_base_toolset.py +55 -0
- holmes/plugins/toolsets/azure_sql/azure_sql_instructions.jinja2 +137 -0
- holmes/plugins/toolsets/azure_sql/azure_sql_toolset.py +183 -0
- holmes/plugins/toolsets/azure_sql/install.md +66 -0
- holmes/plugins/toolsets/azure_sql/tools/__init__.py +1 -0
- holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +324 -0
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +243 -0
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +205 -0
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +249 -0
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +373 -0
- holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +237 -0
- holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +172 -0
- holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +170 -0
- holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +188 -0
- holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +180 -0
- holmes/plugins/toolsets/azure_sql/utils.py +83 -0
- holmes/plugins/toolsets/bash/__init__.py +0 -0
- holmes/plugins/toolsets/bash/bash_instructions.jinja2 +14 -0
- holmes/plugins/toolsets/bash/bash_toolset.py +208 -0
- holmes/plugins/toolsets/bash/common/bash.py +52 -0
- holmes/plugins/toolsets/bash/common/config.py +14 -0
- holmes/plugins/toolsets/bash/common/stringify.py +25 -0
- holmes/plugins/toolsets/bash/common/validators.py +24 -0
- holmes/plugins/toolsets/bash/grep/__init__.py +52 -0
- holmes/plugins/toolsets/bash/kubectl/__init__.py +100 -0
- holmes/plugins/toolsets/bash/kubectl/constants.py +96 -0
- holmes/plugins/toolsets/bash/kubectl/kubectl_describe.py +66 -0
- holmes/plugins/toolsets/bash/kubectl/kubectl_events.py +88 -0
- holmes/plugins/toolsets/bash/kubectl/kubectl_get.py +108 -0
- holmes/plugins/toolsets/bash/kubectl/kubectl_logs.py +20 -0
- holmes/plugins/toolsets/bash/kubectl/kubectl_run.py +46 -0
- holmes/plugins/toolsets/bash/kubectl/kubectl_top.py +81 -0
- holmes/plugins/toolsets/bash/parse_command.py +103 -0
- holmes/plugins/toolsets/confluence.yaml +19 -0
- holmes/plugins/toolsets/consts.py +5 -0
- holmes/plugins/toolsets/coralogix/api.py +158 -0
- holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +103 -0
- holmes/plugins/toolsets/coralogix/utils.py +181 -0
- holmes/plugins/toolsets/datadog.py +153 -0
- holmes/plugins/toolsets/docker.yaml +46 -0
- holmes/plugins/toolsets/git.py +756 -0
- holmes/plugins/toolsets/grafana/__init__.py +0 -0
- holmes/plugins/toolsets/grafana/base_grafana_toolset.py +54 -0
- holmes/plugins/toolsets/grafana/common.py +68 -0
- holmes/plugins/toolsets/grafana/grafana_api.py +31 -0
- holmes/plugins/toolsets/grafana/loki_api.py +89 -0
- holmes/plugins/toolsets/grafana/tempo_api.py +124 -0
- holmes/plugins/toolsets/grafana/toolset_grafana.py +102 -0
- holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +102 -0
- holmes/plugins/toolsets/grafana/toolset_grafana_tempo.jinja2 +10 -0
- holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +299 -0
- holmes/plugins/toolsets/grafana/trace_parser.py +195 -0
- holmes/plugins/toolsets/helm.yaml +42 -0
- holmes/plugins/toolsets/internet/internet.py +275 -0
- holmes/plugins/toolsets/internet/notion.py +137 -0
- holmes/plugins/toolsets/kafka.py +638 -0
- holmes/plugins/toolsets/kubernetes.yaml +255 -0
- holmes/plugins/toolsets/kubernetes_logs.py +426 -0
- holmes/plugins/toolsets/kubernetes_logs.yaml +42 -0
- holmes/plugins/toolsets/logging_utils/__init__.py +0 -0
- holmes/plugins/toolsets/logging_utils/logging_api.py +217 -0
- holmes/plugins/toolsets/logging_utils/types.py +0 -0
- holmes/plugins/toolsets/mcp/toolset_mcp.py +135 -0
- holmes/plugins/toolsets/newrelic.py +222 -0
- holmes/plugins/toolsets/opensearch/__init__.py +0 -0
- holmes/plugins/toolsets/opensearch/opensearch.py +245 -0
- holmes/plugins/toolsets/opensearch/opensearch_logs.py +151 -0
- holmes/plugins/toolsets/opensearch/opensearch_traces.py +211 -0
- holmes/plugins/toolsets/opensearch/opensearch_traces_instructions.jinja2 +12 -0
- holmes/plugins/toolsets/opensearch/opensearch_utils.py +166 -0
- holmes/plugins/toolsets/prometheus/prometheus.py +818 -0
- holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2 +38 -0
- holmes/plugins/toolsets/rabbitmq/api.py +398 -0
- holmes/plugins/toolsets/rabbitmq/rabbitmq_instructions.jinja2 +37 -0
- holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +222 -0
- holmes/plugins/toolsets/robusta/__init__.py +0 -0
- holmes/plugins/toolsets/robusta/robusta.py +235 -0
- holmes/plugins/toolsets/robusta/robusta_instructions.jinja2 +24 -0
- holmes/plugins/toolsets/runbook/__init__.py +0 -0
- holmes/plugins/toolsets/runbook/runbook_fetcher.py +78 -0
- holmes/plugins/toolsets/service_discovery.py +92 -0
- holmes/plugins/toolsets/servicenow/install.md +37 -0
- holmes/plugins/toolsets/servicenow/instructions.jinja2 +3 -0
- holmes/plugins/toolsets/servicenow/servicenow.py +198 -0
- holmes/plugins/toolsets/slab.yaml +20 -0
- holmes/plugins/toolsets/utils.py +137 -0
- holmes/plugins/utils.py +14 -0
- holmes/utils/__init__.py +0 -0
- holmes/utils/cache.py +84 -0
- holmes/utils/cert_utils.py +40 -0
- holmes/utils/default_toolset_installation_guide.jinja2 +44 -0
- holmes/utils/definitions.py +13 -0
- holmes/utils/env.py +53 -0
- holmes/utils/file_utils.py +56 -0
- holmes/utils/global_instructions.py +20 -0
- holmes/utils/holmes_status.py +22 -0
- holmes/utils/holmes_sync_toolsets.py +80 -0
- holmes/utils/markdown_utils.py +55 -0
- holmes/utils/pydantic_utils.py +54 -0
- holmes/utils/robusta.py +10 -0
- holmes/utils/tags.py +97 -0
- holmesgpt-0.11.5.dist-info/LICENSE.txt +21 -0
- holmesgpt-0.11.5.dist-info/METADATA +400 -0
- holmesgpt-0.11.5.dist-info/RECORD +183 -0
- holmesgpt-0.11.5.dist-info/WHEEL +4 -0
- holmesgpt-0.11.5.dist-info/entry_points.txt +3 -0
holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2
@@ -0,0 +1,38 @@
+
+# Prometheus/PromQL queries
+* ALWAYS call list_prometheus_rules to get the alert definition
+* Use Prometheus to query metrics from the alert promql
+* Use prometheus to execute promql queries with the tools `execute_prometheus_instant_query` and `execute_prometheus_range_query`
+* To create queries, use 'start_timestamp' and 'end_timestamp' as graphs start and end times
+* ALWAYS embed the execution results into your answer
+* You only need to embed the partial result in your response. Include the "tool_name" and "random_key". For example: << {"type": "promql", "tool_name": "execute_prometheus_range_query", "random_key": "92jf2hf"} >>
+* Use these tools to generate charts that users can see. Here are standard metrics but you can use different ones:
+** For memory consumption: `container_memory_working_set_bytes`
+** For CPU usage: `container_cpu_usage_seconds_total`
+** For CPU throttling: `container_cpu_cfs_throttled_periods_total`
+** For latencies, prefer using `<metric>_sum` / `<metric>_count` over a sliding window
+** Avoid using `<metric>_bucket` unless you know the bucket's boundaries are configured correctly
+** Prefer individual averages like `rate(<metric>_sum) / rate(<metric>_count)`
+** Avoid global averages like `sum(rate(<metric>_sum)) / sum(rate(<metric>_count))` because it hides data and is not generally informative
+* Timestamps MUST be in string date format. For example: '2025-03-15 10:10:08.610862+00:00'
+* Post processing will parse your response, re-run the query from the tool output and create a chart visible to the user
+* Only generate and execute a prometheus query after checking what metrics are available with the `list_available_metrics` tool
+* Check that any node, service, pod, container, app, namespace, etc. mentioned in the query exist in the kubernetes cluster before making a query. Use any appropriate kubectl tool(s) for this
+* The toolcall will return no data to you. That is expected. You MUST however ensure that the query is successful.
+* When doing queries, always extend the time range, to 15 min before and after the alert start time
+* ALWAYS embed the execution results into your answer
+* ALWAYS embed a Prometheus graph in the response. The graph should visualize data related to the incident.
+* Embed at most 2 graphs
+* When embedding multiple graphs, always add line spacing between them
+For example:
+```
+<<{"type": "promql", "tool_name": "execute_prometheus_range_query", "random_key": "lBaA"}>>
+
+<<{"type": "promql", "tool_name": "execute_prometheus_range_query", "random_key": "IKtq"}>>
+```
+{%- if config and config.additional_labels and config.additional_labels.keys()|list|length > 0 %}
+* ALWAYS add the following additional labels to ALL PromQL queries:
+{%- for key, value in config.additional_labels.items() %}
+* {{ key }}="{{ value }}"
+{%- endfor -%}
+{%- endif -%}
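The tail of this template is a conditional Jinja2 block: when the toolset config carries `additional_labels`, each label is appended as a mandatory PromQL label instruction. A simplified sketch of how that block renders using the `jinja2` library (the `Config` stand-in and the label value are illustrative, not from the package, and the whitespace-control checks are pared down):

```python
from jinja2 import Template

# Pared-down version of the additional_labels block from the template above.
template = Template(
    "{%- if config and config.additional_labels %}"
    "* ALWAYS add the following additional labels to ALL PromQL queries:"
    "{%- for key, value in config.additional_labels.items() %}\n"
    '* {{ key }}="{{ value }}"'
    "{%- endfor -%}"
    "{%- endif -%}"
)


class Config:
    # Hypothetical stand-in for the toolset config; only additional_labels matters here.
    additional_labels = {"cluster": "prod-eu-1"}


print(template.render(config=Config()))
# * ALWAYS add the following additional labels to ALL PromQL queries:
# * cluster="prod-eu-1"
```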
holmes/plugins/toolsets/rabbitmq/api.py
@@ -0,0 +1,398 @@
+from enum import Enum
+import logging
+from typing import Any, Dict, List, Optional, Set
+from urllib.parse import urljoin, urlparse
+
+import backoff
+from pydantic import BaseModel
+import requests  # type: ignore
+from requests.auth import HTTPBasicAuth  # type: ignore
+
+# --- Enums and Pydantic Models (Mostly Unchanged) ---
+
+
+class ClusterConnectionStatus(str, Enum):
+    SUCCESS = "success"
+    ERROR = "error"
+
+
+class RabbitMQClusterConfig(BaseModel):
+    id: str = "rabbitmq"  # must be unique
+    management_url: str  # e.g., http://rabbitmq-service:15672
+    username: Optional[str] = None
+    password: Optional[str] = None
+    request_timeout_seconds: int = 30
+    verify_certs: bool = True
+
+    # For internal use
+    connection_status: Optional[ClusterConnectionStatus] = None
+    connection_error: Optional[str] = None
+
+
+class Partition(BaseModel):
+    node: str
+    unreachable_nodes: List[str]  # Nodes that 'node' cannot reach
+
+
+class NodeStatus(BaseModel):
+    node: str
+    running: bool  # Status as reported by the primary connected node
+
+
+class NodeInfo(BaseModel):
+    name: Optional[str] = "unknown"
+    type: Optional[str] = "unknown"
+    running: bool = False
+    mem_used: Optional[int] = None
+    mem_limit: Optional[int] = None
+    mem_alarm: Optional[bool] = None
+    disk_free: Optional[int] = None
+    disk_free_limit: Optional[int] = None
+    disk_free_alarm: Optional[bool] = None
+    fd_used: Optional[int] = None
+    fd_total: Optional[int] = None
+    sockets_used: Optional[int] = None
+    sockets_total: Optional[int] = None
+    uptime: Optional[int] = None
+    partitions: Optional[List[Any]] = None
+    error: Optional[str] = None
+
+
+class ClusterStatus(BaseModel):
+    nodes: List[NodeStatus]  # Overall node running status from primary view
+    network_partitions_detected: bool = False
+    partition_details: List[Partition]  # Combined list of detected partitions
+    raw_node_data: List[NodeInfo]  # Data from the primary connected node
+
+
+# --- Helper Functions (Slight modifications) ---
+
+
+def get_auth(config: RabbitMQClusterConfig) -> Optional[HTTPBasicAuth]:
+    if config.username or config.password:
+        return HTTPBasicAuth(
+            config.username or "guest",
+            config.password or "guest",
+        )
+    else:
+        return None
+
+
+def get_url(base_url: str, endpoint: str) -> str:
+    """Constructs a URL using a base and an endpoint."""
+    # Ensure base_url ends with '/' for urljoin to work predictably
+    if not base_url.endswith("/"):
+        base_url += "/"
+    return urljoin(base_url, endpoint.lstrip("/"))
+
+
+@backoff.on_exception(
+    backoff.expo,
+    requests.exceptions.RequestException,
+    max_tries=3,
+    giveup=lambda e: isinstance(e, requests.exceptions.HTTPError)
+    and e.response.status_code < 500,
+)
+def make_request(
+    config: RabbitMQClusterConfig,
+    method: str,
+    url: str,
+    params: Optional[Dict] = None,
+    data: Optional[Dict] = None,
+) -> Any:
+    """Makes an HTTP request to the RabbitMQ Management API."""
+    headers = {"Content-Type": "application/json"}
+    try:
+        response = requests.request(
+            method=method,
+            url=url,
+            headers=headers,
+            auth=get_auth(config),
+            params=params,
+            json=data,
+            timeout=config.request_timeout_seconds,
+            verify=config.verify_certs,
+        )
+        response.raise_for_status()
+        return response.json()
+    except requests.exceptions.RequestException as e:
+        logging.error(f"Request failed for {method} {url}: {e}")
+        raise  # Re-raise after logging for upstream handling
+
+
+def node_data_to_node_info(node_data: Dict) -> NodeInfo:
+    """Converts raw node data dict to NodeInfo model."""
+    return NodeInfo(**node_data)
+
+
+def get_status_from_node(
+    config: RabbitMQClusterConfig, target_node_name: str
+) -> Optional[List[Dict]]:
+    """
+    Attempts to connect directly to the management API of a specific node.
+    Returns the raw node list from that node's perspective, or None on failure.
+    """
+    try:
+        # Extract hostname from node name (e.g., rabbit@hostname -> hostname)
+        parts = target_node_name.split("@")
+        if len(parts) != 2:
+            logging.debug(
+                f"Could not parse hostname from node name: {target_node_name}"
+            )
+            return None
+        hostname = parts[1]
+
+        # Construct the target node's management URL based on the original config's scheme/port
+        parsed_original_url = urlparse(config.management_url)
+        scheme = parsed_original_url.scheme or "http"
+        port = parsed_original_url.port or (
+            443 if scheme == "https" else 15672
+        )  # Default ports
+        base_target_url = f"{scheme}://{hostname}:{port}"
+
+        target_api_url = get_url(base_target_url, "api/nodes")
+        logging.debug(
+            f"Attempting direct connection to node {target_node_name} via {target_api_url}"
+        )
+
+        # Use the original config for auth, timeout, cert verification etc.
+        data = make_request(
+            config=config,
+            method="GET",
+            url=target_api_url,
+        )
+        # Ensure data is a list as expected from /api/nodes
+        if isinstance(data, list):
+            return data
+        else:
+            logging.debug(
+                f"Unexpected data format received from {target_api_url}: {type(data)}"
+            )
+            return None
+
+    except requests.exceptions.RequestException as e:
+        logging.debug(
+            f"Failed to directly connect to node {target_node_name} via management API: {e}"
+        )
+        return None
+    except Exception:
+        logging.debug(
+            f"Unexpected error trying to get status from node {target_node_name}",
+            exc_info=True,
+        )
+        return None
+
+
+# --- Main Logic Function (Updated) ---
+
+
+def find_node(nodes: List[NodeInfo], node_name: str) -> Optional[NodeInfo]:
+    for node in nodes:
+        if node.name == node_name:
+            return node
+    return None
+
+
+def upsert(nodes: List[NodeInfo], new_node_info: NodeInfo):
+    found_index = -1
+    for i, existing_node in enumerate(nodes):
+        if existing_node.name == new_node_info.name:
+            found_index = i
+            break
+
+    if found_index != -1:
+        nodes[found_index] = new_node_info
+    else:
+        nodes.append(new_node_info)
+
+
+def get_cluster_status(config: RabbitMQClusterConfig) -> ClusterStatus:
+    """
+    Gets cluster status, attempting direct connection to nodes reported as down
+    to detect potential hidden partitions.
+    """
+    raw_nodes_data: List[Dict] = []
+    try:
+        url = get_url(config.management_url, "api/nodes")
+        raw_nodes_data = make_request(
+            config=config,
+            method="GET",
+            url=url,
+        )
+        config.connection_status = ClusterConnectionStatus.SUCCESS
+        config.connection_error = None
+    except Exception as e:
+        logging.error(
+            f"Failed to get primary cluster status from {config.management_url}: {e}"
+        )
+        config.connection_status = ClusterConnectionStatus.ERROR
+        config.connection_error = str(e)
+        # Return an empty/error status if the primary connection fails
+        return ClusterStatus(
+            nodes=[],
+            network_partitions_detected=False,  # Cannot determine
+            partition_details=[],
+            raw_node_data=[],
+        )
+
+    # Process data from the primary connected node
+    detected_partitions: List[Partition] = []
+    primary_nodes: List[NodeInfo] = []
+    nodes_reported_down: Set[str] = set()
+    all_node_names_primary_view: Set[str] = set()
+
+    for node_data in raw_nodes_data:
+        node_info = node_data_to_node_info(node_data)
+        if not node_info.name:
+            continue
+
+        primary_nodes.append(node_info)
+        all_node_names_primary_view.add(node_info.name)
+
+        # Store partitions reported by RabbitMQ itself
+        if node_info.partitions:
+            # Ensure we don't add duplicates if multiple nodes report the same partition
+            partition_exists = any(
+                p.node == node_info.name
+                and set(p.unreachable_nodes) == set(node_info.partitions)
+                for p in detected_partitions
+            )
+            if not partition_exists:
+                detected_partitions.append(
+                    Partition(
+                        node=node_info.name,
+                        unreachable_nodes=list(node_info.partitions),
+                    )
+                )
+
+        # Keep track of nodes reported as down for later direct checks
+        if not node_info.running:
+            nodes_reported_down.add(node_info.name)
+
+    # --- Enhanced Partition Detection ---
+    artificially_detected_partitions: List[Partition] = []
+    for node_name in nodes_reported_down:
+        logging.debug(
+            f"Node {node_name} reported as down by primary. Attempting direct connection."
+        )
+        # Try connecting directly to the node reported as down
+        direct_nodes_data = get_status_from_node(config, node_name)
+        if not direct_nodes_data:
+            continue
+
+        direct_nodes = [
+            node_data_to_node_info(node_data) for node_data in direct_nodes_data
+        ]
+
+        logging.debug(
+            f"Direct connection to {node_name} succeeded. Analyzing its cluster view."
+        )
+        unreachable_by_this_node: List[str] = []
+        this_node = find_node(direct_nodes, node_name)
+
+        if not this_node or not this_node.running:
+            # Ignore this node if it's not running
+            # if this node reports another node as running that was not considered running before, we ignore that
+            # for simplicity as I expect we would get to that node by reaching out directly anyway
+            logging.debug(
+                f"Node {node_name} reported itself as not running upon direct connection."
+            )
+            continue
+        else:
+            # Node is running. Update the primary view with the updated node data
+            logging.info(
+                f"Node {node_name} reported itself as running upon direct connection. Updating primary view."
+            )
+            upsert(primary_nodes, this_node)
+
+        all_node_names_direct_view: Set[str] = set()
+
+        for node in direct_nodes:
+            all_node_names_direct_view.add(node.name)  # type: ignore
+            if not node.running:
+                unreachable_by_this_node.append(node.name)  # type: ignore
+
+        if unreachable_by_this_node:
+            unreachable_nodes_set = set(unreachable_by_this_node)
+
+            # Check if this specific partition view is already covered by RabbitMQ's reporting
+            is_already_reported = any(
+                partition.node == node_name
+                and set(partition.unreachable_nodes) == unreachable_nodes_set
+                for partition in detected_partitions
+            )
+
+            if not is_already_reported:
+                logging.debug(
+                    f"Artificially detecting partition: Node {node_name} cannot reach {unreachable_nodes_set}"
+                )
+                artificially_detected_partitions.append(
+                    Partition(
+                        node=node_name, unreachable_nodes=list(unreachable_nodes_set)
+                    )
+                )
+
+        # Check for nodes present in primary view but MISSING entirely from this node's direct view
+        missing_nodes = all_node_names_primary_view - all_node_names_direct_view
+        if missing_nodes:
+            # Combine missing nodes with those reported as down by this node
+            combined_unreachable = set(unreachable_by_this_node).union(missing_nodes)
+            is_already_reported = any(
+                p.node == node_name and set(p.unreachable_nodes) == combined_unreachable
+                for p in detected_partitions
+                + artificially_detected_partitions  # Check against RMQ and our own detections
+            )
+            if not is_already_reported:
+                logging.debug(
+                    f"Artificially detecting partition: Node {node_name} cannot see (missing or down) {combined_unreachable}"
+                )
+                # Avoid duplicate Partition entries if already added above based only on 'running=False'
+                existing_artificial = next(
+                    (
+                        p
+                        for p in artificially_detected_partitions
+                        if p.node == node_name
+                    ),
+                    None,
+                )
+                if existing_artificial:
+                    existing_artificial.unreachable_nodes = list(combined_unreachable)
+                else:
+                    artificially_detected_partitions.append(
+                        Partition(
+                            node=node_name, unreachable_nodes=list(combined_unreachable)
+                        )
+                    )
+
+        else:
+            logging.debug(
+                f"Direct connection to node {node_name} failed. Assuming it's unreachable."
+            )
+            pass
+
+    # Combine RabbitMQ-reported partitions and artificially detected ones
+    final_partitions = detected_partitions + artificially_detected_partitions
+
+    # Remove potential duplicates (same node reporting same unreachable set)
+    unique_partitions = []
+    seen_partitions = set()
+    for p in final_partitions:
+        # Create a unique key: node_name + sorted tuple of unreachable nodes
+        partition_key = (p.node, tuple(sorted(p.unreachable_nodes)))
+        if partition_key not in seen_partitions:
+            unique_partitions.append(p)
+            seen_partitions.add(partition_key)
+
+    node_statuses: List[NodeStatus] = [
+        NodeStatus(node=node_info.name, running=node_info.running)  # type: ignore
+        for node_info in primary_nodes
+    ]
+
+    cluster_status = ClusterStatus(
+        nodes=node_statuses,  # Keep original running status view
+        network_partitions_detected=True if len(unique_partitions) > 0 else False,
+        partition_details=unique_partitions,
+        raw_node_data=primary_nodes,  # Data from the primary node only
+    )
+
+    return cluster_status
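The partition-detection logic above cross-checks the primary node's cluster view against each "down" node's own view, then deduplicates partitions by `(node, sorted unreachable set)`. For orientation, a minimal usage sketch of this module (the management URL and credentials are placeholders, not values from the package):

```python
# Minimal sketch using the API defined above; URL and credentials are placeholders.
config = RabbitMQClusterConfig(
    management_url="http://localhost:15672",
    username="guest",
    password="guest",
)

status = get_cluster_status(config)
if status.network_partitions_detected:
    for p in status.partition_details:
        print(f"{p.node} cannot reach: {', '.join(p.unreachable_nodes)}")
else:
    print(f"No partitions detected across {len(status.nodes)} node(s)")
```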
holmes/plugins/toolsets/rabbitmq/rabbitmq_instructions.jinja2
@@ -0,0 +1,37 @@
+# RabbitMQ Troubleshooting Guidelines
+
+## Goal
+Your primary goal when using these tools is to diagnose RabbitMQ cluster health issues, with a specific focus on detecting
+**network partitions (split-brain scenarios)** and identifying potential causes like resource exhaustion on individual nodes.
+
+* Use the tools to get the *current* state of the cluster and nodes.
+* Clearly present the key findings from the tool outputs in your analysis.
+
+## Workflow for Split-Brain Diagnosis (Phase I)
+1. **Check Cluster Status:** ALWAYS start by calling `get_rabbitmq_cluster_status`. This is the most important step.
+    * Look for `"network_partitions_detected": true`.
+    * Examine the `"partition_details"` to understand which nodes are reporting inability to reach others. This identifies
+      the members of different sides of the partition.
+    * Check the `"running"` status of all nodes listed in `"nodes"`.
+2. **Investigate Affected Nodes:** If a partition is detected, or if any nodes are reported as not running:
+    * Analyze the node details: Pay close attention to `mem_alarm` and `disk_free_alarm`.
+      Resource exhaustion (memory, disk, file descriptors) is a common reason for nodes becoming unresponsive and causing
+      partitions. Also check if `running` is `false`.
+    * Analyze the status of the kubernetes pods running RabbitMQ. There is typically one kubernetes pod per RabbitMQ node.
+      This can further indicate if a pod is running and if it is healthy or not.
+    * Fetch the logs of any pod that is either partitioned or marked as not healthy by the RabbitMQ API.
+3. **Synthesize Findings:** Based on the cluster status and node details, describe the situation clearly. For example:
+    * "A network partition is detected in the RabbitMQ cluster '{cluster_name}'. Node 'rabbit@hostA' cannot reach
+      ['rabbit@hostB']. Node 'rabbit@hostB' reports a disk space alarm (`disk_free_alarm: true`)."
+    * "Node 'rabbit@hostC' is reported as not running in the cluster status."
+4. **Recommend Remediation Steps (Based on Docs):**
+    * **CRITICAL:** Refer to the official RabbitMQ documentation for handling partitions:
+      * Partitions: https://www.rabbitmq.com/docs/partitions, recovering: https://www.rabbitmq.com/docs/partitions#recovering
+      * Clustering: https://www.rabbitmq.com/docs/clustering
+    * **DO NOT invent recovery procedures.** Your role is to diagnose and *point* to the correct documentation or standard
+      procedures.
+    * Based on the *type* of partition (e.g., resource issue vs. pure network), you can suggest which sections of the
+      documentation are most relevant. For example, if a node has a disk alarm, recommend investigating and resolving the
+      disk space issue on that node *before* attempting partition recovery procedures.
+    * Common manual steps often involve deciding on a "winning" partition, restarting nodes in the "losing" partition(s),
+      and potentially resetting nodes, but **always defer to the official documentation for the exact commands and strategy.**