holmesgpt-0.11.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of holmesgpt might be problematic.
- holmes/.git_archival.json +7 -0
- holmes/__init__.py +76 -0
- holmes/__init__.py.bak +76 -0
- holmes/clients/robusta_client.py +24 -0
- holmes/common/env_vars.py +47 -0
- holmes/config.py +526 -0
- holmes/core/__init__.py +0 -0
- holmes/core/conversations.py +578 -0
- holmes/core/investigation.py +152 -0
- holmes/core/investigation_structured_output.py +264 -0
- holmes/core/issue.py +54 -0
- holmes/core/llm.py +250 -0
- holmes/core/models.py +157 -0
- holmes/core/openai_formatting.py +51 -0
- holmes/core/performance_timing.py +72 -0
- holmes/core/prompt.py +42 -0
- holmes/core/resource_instruction.py +17 -0
- holmes/core/runbooks.py +26 -0
- holmes/core/safeguards.py +120 -0
- holmes/core/supabase_dal.py +540 -0
- holmes/core/tool_calling_llm.py +798 -0
- holmes/core/tools.py +566 -0
- holmes/core/tools_utils/__init__.py +0 -0
- holmes/core/tools_utils/tool_executor.py +65 -0
- holmes/core/tools_utils/toolset_utils.py +52 -0
- holmes/core/toolset_manager.py +418 -0
- holmes/interactive.py +229 -0
- holmes/main.py +1041 -0
- holmes/plugins/__init__.py +0 -0
- holmes/plugins/destinations/__init__.py +6 -0
- holmes/plugins/destinations/slack/__init__.py +2 -0
- holmes/plugins/destinations/slack/plugin.py +163 -0
- holmes/plugins/interfaces.py +32 -0
- holmes/plugins/prompts/__init__.py +48 -0
- holmes/plugins/prompts/_current_date_time.jinja2 +1 -0
- holmes/plugins/prompts/_default_log_prompt.jinja2 +11 -0
- holmes/plugins/prompts/_fetch_logs.jinja2 +36 -0
- holmes/plugins/prompts/_general_instructions.jinja2 +86 -0
- holmes/plugins/prompts/_global_instructions.jinja2 +12 -0
- holmes/plugins/prompts/_runbook_instructions.jinja2 +13 -0
- holmes/plugins/prompts/_toolsets_instructions.jinja2 +56 -0
- holmes/plugins/prompts/generic_ask.jinja2 +36 -0
- holmes/plugins/prompts/generic_ask_conversation.jinja2 +32 -0
- holmes/plugins/prompts/generic_ask_for_issue_conversation.jinja2 +50 -0
- holmes/plugins/prompts/generic_investigation.jinja2 +42 -0
- holmes/plugins/prompts/generic_post_processing.jinja2 +13 -0
- holmes/plugins/prompts/generic_ticket.jinja2 +12 -0
- holmes/plugins/prompts/investigation_output_format.jinja2 +32 -0
- holmes/plugins/prompts/kubernetes_workload_ask.jinja2 +84 -0
- holmes/plugins/prompts/kubernetes_workload_chat.jinja2 +39 -0
- holmes/plugins/runbooks/README.md +22 -0
- holmes/plugins/runbooks/__init__.py +100 -0
- holmes/plugins/runbooks/catalog.json +14 -0
- holmes/plugins/runbooks/jira.yaml +12 -0
- holmes/plugins/runbooks/kube-prometheus-stack.yaml +10 -0
- holmes/plugins/runbooks/networking/dns_troubleshooting_instructions.md +66 -0
- holmes/plugins/runbooks/upgrade/upgrade_troubleshooting_instructions.md +44 -0
- holmes/plugins/sources/github/__init__.py +77 -0
- holmes/plugins/sources/jira/__init__.py +123 -0
- holmes/plugins/sources/opsgenie/__init__.py +93 -0
- holmes/plugins/sources/pagerduty/__init__.py +147 -0
- holmes/plugins/sources/prometheus/__init__.py +0 -0
- holmes/plugins/sources/prometheus/models.py +104 -0
- holmes/plugins/sources/prometheus/plugin.py +154 -0
- holmes/plugins/toolsets/__init__.py +171 -0
- holmes/plugins/toolsets/aks-node-health.yaml +65 -0
- holmes/plugins/toolsets/aks.yaml +86 -0
- holmes/plugins/toolsets/argocd.yaml +70 -0
- holmes/plugins/toolsets/atlas_mongodb/instructions.jinja2 +8 -0
- holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +307 -0
- holmes/plugins/toolsets/aws.yaml +76 -0
- holmes/plugins/toolsets/azure_sql/__init__.py +0 -0
- holmes/plugins/toolsets/azure_sql/apis/alert_monitoring_api.py +600 -0
- holmes/plugins/toolsets/azure_sql/apis/azure_sql_api.py +309 -0
- holmes/plugins/toolsets/azure_sql/apis/connection_failure_api.py +445 -0
- holmes/plugins/toolsets/azure_sql/apis/connection_monitoring_api.py +251 -0
- holmes/plugins/toolsets/azure_sql/apis/storage_analysis_api.py +317 -0
- holmes/plugins/toolsets/azure_sql/azure_base_toolset.py +55 -0
- holmes/plugins/toolsets/azure_sql/azure_sql_instructions.jinja2 +137 -0
- holmes/plugins/toolsets/azure_sql/azure_sql_toolset.py +183 -0
- holmes/plugins/toolsets/azure_sql/install.md +66 -0
- holmes/plugins/toolsets/azure_sql/tools/__init__.py +1 -0
- holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +324 -0
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +243 -0
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +205 -0
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +249 -0
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +373 -0
- holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +237 -0
- holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +172 -0
- holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +170 -0
- holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +188 -0
- holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +180 -0
- holmes/plugins/toolsets/azure_sql/utils.py +83 -0
- holmes/plugins/toolsets/bash/__init__.py +0 -0
- holmes/plugins/toolsets/bash/bash_instructions.jinja2 +14 -0
- holmes/plugins/toolsets/bash/bash_toolset.py +208 -0
- holmes/plugins/toolsets/bash/common/bash.py +52 -0
- holmes/plugins/toolsets/bash/common/config.py +14 -0
- holmes/plugins/toolsets/bash/common/stringify.py +25 -0
- holmes/plugins/toolsets/bash/common/validators.py +24 -0
- holmes/plugins/toolsets/bash/grep/__init__.py +52 -0
- holmes/plugins/toolsets/bash/kubectl/__init__.py +100 -0
- holmes/plugins/toolsets/bash/kubectl/constants.py +96 -0
- holmes/plugins/toolsets/bash/kubectl/kubectl_describe.py +66 -0
- holmes/plugins/toolsets/bash/kubectl/kubectl_events.py +88 -0
- holmes/plugins/toolsets/bash/kubectl/kubectl_get.py +108 -0
- holmes/plugins/toolsets/bash/kubectl/kubectl_logs.py +20 -0
- holmes/plugins/toolsets/bash/kubectl/kubectl_run.py +46 -0
- holmes/plugins/toolsets/bash/kubectl/kubectl_top.py +81 -0
- holmes/plugins/toolsets/bash/parse_command.py +103 -0
- holmes/plugins/toolsets/confluence.yaml +19 -0
- holmes/plugins/toolsets/consts.py +5 -0
- holmes/plugins/toolsets/coralogix/api.py +158 -0
- holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +103 -0
- holmes/plugins/toolsets/coralogix/utils.py +181 -0
- holmes/plugins/toolsets/datadog.py +153 -0
- holmes/plugins/toolsets/docker.yaml +46 -0
- holmes/plugins/toolsets/git.py +756 -0
- holmes/plugins/toolsets/grafana/__init__.py +0 -0
- holmes/plugins/toolsets/grafana/base_grafana_toolset.py +54 -0
- holmes/plugins/toolsets/grafana/common.py +68 -0
- holmes/plugins/toolsets/grafana/grafana_api.py +31 -0
- holmes/plugins/toolsets/grafana/loki_api.py +89 -0
- holmes/plugins/toolsets/grafana/tempo_api.py +124 -0
- holmes/plugins/toolsets/grafana/toolset_grafana.py +102 -0
- holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +102 -0
- holmes/plugins/toolsets/grafana/toolset_grafana_tempo.jinja2 +10 -0
- holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +299 -0
- holmes/plugins/toolsets/grafana/trace_parser.py +195 -0
- holmes/plugins/toolsets/helm.yaml +42 -0
- holmes/plugins/toolsets/internet/internet.py +275 -0
- holmes/plugins/toolsets/internet/notion.py +137 -0
- holmes/plugins/toolsets/kafka.py +638 -0
- holmes/plugins/toolsets/kubernetes.yaml +255 -0
- holmes/plugins/toolsets/kubernetes_logs.py +426 -0
- holmes/plugins/toolsets/kubernetes_logs.yaml +42 -0
- holmes/plugins/toolsets/logging_utils/__init__.py +0 -0
- holmes/plugins/toolsets/logging_utils/logging_api.py +217 -0
- holmes/plugins/toolsets/logging_utils/types.py +0 -0
- holmes/plugins/toolsets/mcp/toolset_mcp.py +135 -0
- holmes/plugins/toolsets/newrelic.py +222 -0
- holmes/plugins/toolsets/opensearch/__init__.py +0 -0
- holmes/plugins/toolsets/opensearch/opensearch.py +245 -0
- holmes/plugins/toolsets/opensearch/opensearch_logs.py +151 -0
- holmes/plugins/toolsets/opensearch/opensearch_traces.py +211 -0
- holmes/plugins/toolsets/opensearch/opensearch_traces_instructions.jinja2 +12 -0
- holmes/plugins/toolsets/opensearch/opensearch_utils.py +166 -0
- holmes/plugins/toolsets/prometheus/prometheus.py +818 -0
- holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2 +38 -0
- holmes/plugins/toolsets/rabbitmq/api.py +398 -0
- holmes/plugins/toolsets/rabbitmq/rabbitmq_instructions.jinja2 +37 -0
- holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +222 -0
- holmes/plugins/toolsets/robusta/__init__.py +0 -0
- holmes/plugins/toolsets/robusta/robusta.py +235 -0
- holmes/plugins/toolsets/robusta/robusta_instructions.jinja2 +24 -0
- holmes/plugins/toolsets/runbook/__init__.py +0 -0
- holmes/plugins/toolsets/runbook/runbook_fetcher.py +78 -0
- holmes/plugins/toolsets/service_discovery.py +92 -0
- holmes/plugins/toolsets/servicenow/install.md +37 -0
- holmes/plugins/toolsets/servicenow/instructions.jinja2 +3 -0
- holmes/plugins/toolsets/servicenow/servicenow.py +198 -0
- holmes/plugins/toolsets/slab.yaml +20 -0
- holmes/plugins/toolsets/utils.py +137 -0
- holmes/plugins/utils.py +14 -0
- holmes/utils/__init__.py +0 -0
- holmes/utils/cache.py +84 -0
- holmes/utils/cert_utils.py +40 -0
- holmes/utils/default_toolset_installation_guide.jinja2 +44 -0
- holmes/utils/definitions.py +13 -0
- holmes/utils/env.py +53 -0
- holmes/utils/file_utils.py +56 -0
- holmes/utils/global_instructions.py +20 -0
- holmes/utils/holmes_status.py +22 -0
- holmes/utils/holmes_sync_toolsets.py +80 -0
- holmes/utils/markdown_utils.py +55 -0
- holmes/utils/pydantic_utils.py +54 -0
- holmes/utils/robusta.py +10 -0
- holmes/utils/tags.py +97 -0
- holmesgpt-0.11.5.dist-info/LICENSE.txt +21 -0
- holmesgpt-0.11.5.dist-info/METADATA +400 -0
- holmesgpt-0.11.5.dist-info/RECORD +183 -0
- holmesgpt-0.11.5.dist-info/WHEEL +4 -0
- holmesgpt-0.11.5.dist-info/entry_points.txt +3 -0
holmes/plugins/toolsets/azure_sql/apis/alert_monitoring_api.py
@@ -0,0 +1,600 @@
from typing import Dict, List, Any
import logging
from datetime import datetime, timezone, timedelta
from azure.core.credentials import TokenCredential
from azure.mgmt.monitor import MonitorManagementClient
from azure.mgmt.resource import ResourceManagementClient


class AlertMonitoringAPI:
    """API client for Azure Monitor alerts related to Azure SQL databases."""

    def __init__(
        self,
        credential: TokenCredential,
        subscription_id: str,
    ):
        self.credential = credential
        self.subscription_id = subscription_id
        self.monitor_client = MonitorManagementClient(credential, subscription_id)
        self.resource_client = ResourceManagementClient(credential, subscription_id)

        # Initialize alerts management client (different from monitor client)
        try:
            # Import here to avoid circular imports
            from azure.mgmt.alertsmanagement import AlertsManagementClient

            self.alerts_client = AlertsManagementClient(credential, subscription_id)
        except ImportError:
            logging.warning(
                "AlertsManagementClient not available, using fallback methods"
            )
            self.alerts_client = None

    def _build_database_resource_id(
        self, resource_group: str, server_name: str, database_name: str
    ) -> str:
        """Build the full Azure resource ID for the SQL database."""
        return (
            f"/subscriptions/{self.subscription_id}/"
            f"resourceGroups/{resource_group}/"
            f"providers/Microsoft.Sql/servers/{server_name}/"
            f"databases/{database_name}"
        )

    def _build_server_resource_id(self, resource_group: str, server_name: str) -> str:
        """Build the full Azure resource ID for the SQL server."""
        return (
            f"/subscriptions/{self.subscription_id}/"
            f"resourceGroups/{resource_group}/"
            f"providers/Microsoft.Sql/servers/{server_name}"
        )

    def get_active_alerts(
        self, resource_group: str, server_name: str, database_name: str
    ) -> Dict[str, Any]:
        """Get currently active alerts for the SQL database and server."""
        try:
            database_resource_id = self._build_database_resource_id(
                resource_group, server_name, database_name
            )
            server_resource_id = self._build_server_resource_id(
                resource_group, server_name
            )

            active_alerts = []

            if self.alerts_client:
                # Get alerts using the AlertsManagement API
                try:
                    # Get database-specific alerts
                    db_alerts = self.alerts_client.alerts.get_all(
                        target_resource=database_resource_id,
                        alert_state="New,Acknowledged",
                    )
                    for alert in db_alerts:
                        active_alerts.append(self._format_alert(alert, "database"))

                    # Get server-level alerts that might affect the database
                    server_alerts = self.alerts_client.alerts.get_all(
                        target_resource=server_resource_id,
                        alert_state="New,Acknowledged",
                    )
                    for alert in server_alerts:
                        active_alerts.append(self._format_alert(alert, "server"))

                except Exception as e:
                    logging.warning(f"AlertsManagement API failed, using fallback: {e}")
                    return self._get_active_alerts_fallback(
                        resource_group, server_name, database_name
                    )
            else:
                # Fallback method using Monitor API
                return self._get_active_alerts_fallback(
                    resource_group, server_name, database_name
                )

            return {
                "database_resource_id": database_resource_id,
                "server_resource_id": server_resource_id,
                "active_alerts": active_alerts,
                "total_count": len(active_alerts),
                "retrieved_at": datetime.now(timezone.utc).isoformat(),
            }

        except Exception as e:
            error_msg = f"Failed to retrieve active alerts: {str(e)}"
            logging.error(error_msg, exc_info=True)
            return {"error": error_msg}

    def _get_active_alerts_fallback(
        self, resource_group: str, server_name: str, database_name: str
    ) -> Dict[str, Any]:
        """Fallback method to get alerts using Monitor API activity log."""
        try:
            database_resource_id = self._build_database_resource_id(
                resource_group, server_name, database_name
            )
            server_resource_id = self._build_server_resource_id(
                resource_group, server_name
            )

            # Get recent activity log entries that might indicate alerts
            end_time = datetime.now(timezone.utc)
            start_time = end_time - timedelta(hours=24)

            filter_query = (
                f"eventTimestamp ge '{start_time.isoformat()}' and "
                f"eventTimestamp le '{end_time.isoformat()}' and "
                f"(resourceId eq '{database_resource_id}' or resourceId eq '{server_resource_id}') and "
                f"(level eq 'Warning' or level eq 'Error')"
            )

            activity_logs = self.monitor_client.activity_logs.list(filter=filter_query)

            alerts = []
            for log_entry in activity_logs:
                if hasattr(log_entry, "operation_name") and log_entry.operation_name:
                    # Convert activity log to alert-like format
                    alert_data = {
                        "id": getattr(log_entry, "event_data_id", "unknown"),
                        "name": getattr(log_entry, "operation_name", {}).get(
                            "value", "Unknown Operation"
                        ),
                        "description": getattr(
                            log_entry, "description", "No description available"
                        ),
                        "severity": self._map_level_to_severity(
                            getattr(log_entry, "level", "Informational")
                        ),
                        "state": "Active",
                        "monitor_condition": "Fired",
                        "fired_time": getattr(
                            log_entry, "event_timestamp", end_time
                        ).isoformat(),
                        "resource_type": "Activity Log Event",
                        "target_resource": getattr(log_entry, "resource_id", ""),
                        "scope": "server"
                        if server_resource_id
                        in str(getattr(log_entry, "resource_id", ""))
                        else "database",
                    }
                    alerts.append(alert_data)

            return {
                "database_resource_id": database_resource_id,
                "server_resource_id": server_resource_id,
                "active_alerts": alerts,
                "total_count": len(alerts),
                "retrieved_at": datetime.now(timezone.utc).isoformat(),
                "method": "activity_log_fallback",
            }

        except Exception as e:
            error_msg = f"Failed to retrieve alerts using fallback method: {str(e)}"
            logging.error(error_msg, exc_info=True)
            return {"error": error_msg}

    def get_alert_history(
        self,
        resource_group: str,
        server_name: str,
        database_name: str,
        hours_back: int = 168,  # Default to 7 days
    ) -> Dict[str, Any]:
        """Get historical alerts for the SQL database and server by fetching metric alert rules."""
        try:
            database_resource_id = self._build_database_resource_id(
                resource_group, server_name, database_name
            )
            server_resource_id = self._build_server_resource_id(
                resource_group, server_name
            )

            end_time = datetime.now(timezone.utc)
            start_time = end_time - timedelta(hours=hours_back)

            historical_alerts = []

            try:
                # Get metric alert rules from the resource group
                logging.info(
                    f"Fetching metric alert rules from resource group: {resource_group}"
                )
                metric_alert_rules = (
                    self.monitor_client.metric_alerts.list_by_resource_group(
                        resource_group
                    )
                )

                relevant_rules = []
                for rule in metric_alert_rules:
                    # Check if this rule applies to our database or server
                    if hasattr(rule, "scopes") and rule.scopes:
                        for scope in rule.scopes:
                            if (
                                database_resource_id in scope
                                or server_resource_id in scope
                            ):
                                relevant_rules.append(rule)
                                break

                logging.info(
                    f"Found {len(relevant_rules)} metric alert rules relevant to our resources"
                )

                # For each relevant rule, try to get alert instances if possible
                for rule in relevant_rules:
                    rule_name = getattr(rule, "name", "Unknown Rule")
                    rule_id = getattr(rule, "id", "Unknown ID")

                    # Create alert entry from rule definition
                    scope_type = (
                        "database"
                        if database_resource_id in str(getattr(rule, "scopes", []))
                        else "server"
                    )

                    alert_data = {
                        "id": rule_id,
                        "name": rule_name,
                        "description": getattr(
                            rule, "description", "Metric alert rule"
                        ),
                        "severity": f"Sev{getattr(rule, 'severity', 3)}",
                        "state": "Rule Configured",  # We can't determine actual firing state without alert instances
                        "monitor_condition": "Configured",
                        "fired_time": datetime.now(
                            timezone.utc
                        ).isoformat(),  # Use current time as placeholder
                        "resource_type": "Metric Alert Rule",
                        "target_resource": str(getattr(rule, "scopes", [])),
                        "scope": scope_type,
                        "rule_enabled": getattr(rule, "enabled", False),
                        "window_size": str(getattr(rule, "window_size", "Unknown")),
                        "evaluation_frequency": str(
                            getattr(rule, "evaluation_frequency", "Unknown")
                        ),
                    }

                    # Add criteria information if available
                    if hasattr(rule, "criteria") and rule.criteria:
                        criteria_info = []
                        if hasattr(rule.criteria, "all_of"):
                            for criterion in rule.criteria.all_of:
                                metric_name = getattr(
                                    criterion, "metric_name", "Unknown"
                                )
                                operator = getattr(criterion, "operator", "Unknown")
                                threshold = getattr(criterion, "threshold", "Unknown")
                                criteria_info.append(
                                    f"{metric_name} {operator} {threshold}"
                                )
                        if criteria_info:
                            alert_data["criteria"] = "; ".join(criteria_info)

                    historical_alerts.append(alert_data)
                    logging.info(f"Added metric alert rule: {rule_name} ({scope_type})")

                # If we have alerts management client, try to get actual alert instances
                if self.alerts_client and historical_alerts:
                    try:
                        logging.info(
                            "Attempting to get alert instances from AlertsManagement API"
                        )
                        # Try to get alert instances for the time period
                        alert_instances = self.alerts_client.alerts.get_all(
                            time_range=f"PT{hours_back}H"  # ISO 8601 duration format
                        )

                        instance_count = 0
                        for alert_instance in alert_instances:
                            instance_count += 1
                            # Check if this instance relates to our resources
                            instance_resource = str(
                                getattr(alert_instance, "target_resource", "")
                            )
                            if (
                                database_resource_id in instance_resource
                                or server_resource_id in instance_resource
                            ):
                                # Update or add alert with actual instance data
                                instance_data = self._format_alert(
                                    alert_instance,
                                    "database"
                                    if database_resource_id in instance_resource
                                    else "server",
                                )
                                historical_alerts.append(instance_data)
                                logging.info(
                                    f"Added alert instance: {instance_data.get('name', 'Unknown')}"
                                )

                        logging.info(f"Processed {instance_count} alert instances")
                    except Exception as e:
                        logging.warning(f"Failed to get alert instances: {e}")
                        # Continue with just the rules data

            except Exception as e:
                logging.warning(
                    f"Failed to get metric alert rules, falling back to activity logs: {e}"
                )
                return self._get_alert_history_fallback(
                    resource_group, server_name, database_name, hours_back
                )

            # Sort by fired time, most recent first
            historical_alerts.sort(key=lambda x: x.get("fired_time", ""), reverse=True)

            # Analyze patterns
            analysis = self._analyze_alert_patterns(historical_alerts)

            return {
                "database_resource_id": database_resource_id,
                "server_resource_id": server_resource_id,
                "time_range": {
                    "start": start_time.isoformat(),
                    "end": end_time.isoformat(),
                    "hours": hours_back,
                },
                "alerts": historical_alerts,
                "total_count": len(historical_alerts),
                "analysis": analysis,
                "retrieved_at": datetime.now(timezone.utc).isoformat(),
                "method": "metric_alerts",
            }

        except Exception as e:
            error_msg = f"Failed to retrieve alert history: {str(e)}"
            logging.error(error_msg, exc_info=True)
            return {"error": error_msg}

    def _get_alert_history_fallback(
        self, resource_group: str, server_name: str, database_name: str, hours_back: int
    ) -> Dict[str, Any]:
        """Fallback method to get alert history using activity logs."""
        try:
            database_resource_id = self._build_database_resource_id(
                resource_group, server_name, database_name
            )
            server_resource_id = self._build_server_resource_id(
                resource_group, server_name
            )

            end_time = datetime.now(timezone.utc)
            start_time = end_time - timedelta(hours=hours_back)

            filter_query = (
                f"eventTimestamp ge '{start_time.isoformat()}' and "
                f"eventTimestamp le '{end_time.isoformat()}' and "
                f"resourceId eq '{database_resource_id}'"
            )

            logging.info(f"Using activity logs fallback with filter: {filter_query}")

            try:
                activity_logs = self.monitor_client.activity_logs.list(
                    filter=filter_query
                )

                alerts = []
                log_count = 0
                for log_entry in activity_logs:
                    log_count += 1

                logging.info(f"Found {log_count} activity log entries")

                # Reset iterator and process entries
                activity_logs = self.monitor_client.activity_logs.list(
                    filter=filter_query
                )
                for log_entry in activity_logs:
                    if hasattr(log_entry, "level") and log_entry.level in [
                        "Warning",
                        "Error",
                        "Critical",
                    ]:
                        alert_data = {
                            "id": getattr(log_entry, "event_data_id", "unknown"),
                            "name": getattr(log_entry, "operation_name", {}).get(
                                "value", "Unknown Operation"
                            ),
                            "description": getattr(
                                log_entry, "description", "No description available"
                            ),
                            "severity": self._map_level_to_severity(
                                getattr(log_entry, "level", "Informational")
                            ),
                            "state": "Resolved",  # Activity logs are historical
                            "monitor_condition": "Resolved",
                            "fired_time": getattr(
                                log_entry, "event_timestamp", end_time
                            ).isoformat(),
                            "resolved_time": getattr(
                                log_entry, "event_timestamp", end_time
                            ).isoformat(),
                            "resource_type": "Activity Log Event",
                            "target_resource": getattr(log_entry, "resource_id", ""),
                            "scope": "server"
                            if server_resource_id
                            in str(getattr(log_entry, "resource_id", ""))
                            else "database",
                            "caller": getattr(log_entry, "caller", "Unknown"),
                            "status": getattr(log_entry, "status", {}).get(
                                "value", "Unknown"
                            ),
                        }
                        alerts.append(alert_data)
                        logging.info(
                            f"Added alert from activity log: {alert_data['name']}"
                        )

                # Sort by fired time, most recent first
                alerts.sort(key=lambda x: x.get("fired_time", ""), reverse=True)

            except Exception as e:
                logging.error(f"Failed to process activity logs: {e}")
                alerts = []

            # Analyze patterns
            analysis = self._analyze_alert_patterns(alerts)

            return {
                "database_resource_id": database_resource_id,
                "server_resource_id": server_resource_id,
                "time_range": {
                    "start": start_time.isoformat(),
                    "end": end_time.isoformat(),
                    "hours": hours_back,
                },
                "alerts": alerts,
                "total_count": len(alerts),
                "analysis": analysis,
                "retrieved_at": datetime.now(timezone.utc).isoformat(),
                "method": "activity_log_fallback",
            }

        except Exception as e:
            error_msg = f"Failed to retrieve alert history using fallback: {str(e)}"
            logging.error(error_msg, exc_info=True)
            return {"error": error_msg}

    def _format_alert(self, alert, scope: str) -> Dict[str, Any]:
        """Format an alert object into a consistent dictionary structure."""
        try:
            # Handle different alert object types
            if hasattr(alert, "properties"):
                props = alert.properties
                return {
                    "id": getattr(alert, "id", "unknown"),
                    "name": getattr(alert, "name", "Unknown Alert"),
                    "description": getattr(
                        props, "description", "No description available"
                    ),
                    "severity": getattr(props, "severity", "Unknown"),
                    "state": getattr(props, "monitor_condition", "Unknown"),
                    "monitor_condition": getattr(props, "monitor_condition", "Unknown"),
                    "fired_time": getattr(
                        props, "fired_time", datetime.now(timezone.utc)
                    ).isoformat(),
                    "resolved_time": getattr(props, "resolved_time", None),
                    "resource_type": getattr(props, "target_resource_type", "Unknown"),
                    "target_resource": getattr(props, "target_resource", ""),
                    "scope": scope,
                }
            else:
                # Fallback for different alert formats
                return {
                    "id": str(getattr(alert, "id", "unknown")),
                    "name": str(getattr(alert, "name", "Unknown Alert")),
                    "description": str(
                        getattr(alert, "description", "No description available")
                    ),
                    "severity": str(getattr(alert, "severity", "Unknown")),
                    "state": str(getattr(alert, "state", "Unknown")),
                    "monitor_condition": str(
                        getattr(alert, "monitor_condition", "Unknown")
                    ),
                    "fired_time": datetime.now(timezone.utc).isoformat(),
                    "resource_type": "Unknown",
                    "target_resource": str(getattr(alert, "target_resource", "")),
                    "scope": scope,
                }
        except Exception as e:
            logging.warning(f"Failed to format alert: {e}")
            return {
                "id": "unknown",
                "name": "Failed to parse alert",
                "description": f"Error parsing alert: {str(e)}",
                "severity": "Unknown",
                "state": "Unknown",
                "scope": scope,
                "fired_time": datetime.now(timezone.utc).isoformat(),
            }

    def _map_level_to_severity(self, level: str) -> str:
        """Map activity log level to alert severity."""
        level_map = {
            "Critical": "Sev0",
            "Error": "Sev1",
            "Warning": "Sev2",
            "Informational": "Sev3",
            "Verbose": "Sev4",
        }
        return level_map.get(level, "Unknown")

    def _analyze_alert_patterns(self, alerts: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Analyze alert patterns to identify trends and issues."""
        if not alerts:
            return {"message": "No alerts to analyze"}

        # Count by severity
        severity_counts: dict = {}
        state_counts: dict = {}
        scope_counts: dict = {}
        resource_type_counts: dict = {}

        for alert in alerts:
            severity = alert.get("severity", "Unknown")
            state = alert.get("state", "Unknown")
            scope = alert.get("scope", "Unknown")
            resource_type = alert.get("resource_type", "Unknown")

            severity_counts[severity] = severity_counts.get(severity, 0) + 1
            state_counts[state] = state_counts.get(state, 0) + 1
            scope_counts[scope] = scope_counts.get(scope, 0) + 1
            resource_type_counts[resource_type] = (
                resource_type_counts.get(resource_type, 0) + 1
            )

        # Find most frequent alert names
        alert_names: dict = {}
        for alert in alerts:
            name = alert.get("name", "Unknown")
            alert_names[name] = alert_names.get(name, 0) + 1

        most_frequent_alerts = sorted(
            alert_names.items(), key=lambda x: x[1], reverse=True
        )[:5]

        return {
            "total_alerts": len(alerts),
            "severity_breakdown": severity_counts,
            "state_breakdown": state_counts,
            "scope_breakdown": scope_counts,
            "resource_type_breakdown": resource_type_counts,
            "most_frequent_alerts": most_frequent_alerts,
            "analysis_notes": self._generate_analysis_notes(
                severity_counts, most_frequent_alerts
            ),
        }

    def _generate_analysis_notes(
        self, severity_counts: Dict, frequent_alerts: List
    ) -> List[str]:
        """Generate human-readable analysis notes."""
        notes = []

        # Severity analysis
        critical_count = severity_counts.get("Sev0", 0) + severity_counts.get(
            "Critical", 0
        )
        error_count = severity_counts.get("Sev1", 0) + severity_counts.get("Error", 0)

        if critical_count > 0:
            notes.append(f"⚠️ {critical_count} critical severity alerts detected")
        if error_count > 0:
            notes.append(f"🔴 {error_count} error severity alerts detected")

        # Frequent alerts analysis
        if frequent_alerts:
            top_alert = frequent_alerts[0]
            if top_alert[1] > 1:
                notes.append(
                    f"🔁 Most frequent alert: '{top_alert[0]}' ({top_alert[1]} occurrences)"
                )

        if not notes:
            notes.append("✅ Alert analysis looks normal")

        return notes