holmesgpt-0.11.5-py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of holmesgpt has been flagged and may be problematic.

Files changed (183)
  1. holmes/.git_archival.json +7 -0
  2. holmes/__init__.py +76 -0
  3. holmes/__init__.py.bak +76 -0
  4. holmes/clients/robusta_client.py +24 -0
  5. holmes/common/env_vars.py +47 -0
  6. holmes/config.py +526 -0
  7. holmes/core/__init__.py +0 -0
  8. holmes/core/conversations.py +578 -0
  9. holmes/core/investigation.py +152 -0
  10. holmes/core/investigation_structured_output.py +264 -0
  11. holmes/core/issue.py +54 -0
  12. holmes/core/llm.py +250 -0
  13. holmes/core/models.py +157 -0
  14. holmes/core/openai_formatting.py +51 -0
  15. holmes/core/performance_timing.py +72 -0
  16. holmes/core/prompt.py +42 -0
  17. holmes/core/resource_instruction.py +17 -0
  18. holmes/core/runbooks.py +26 -0
  19. holmes/core/safeguards.py +120 -0
  20. holmes/core/supabase_dal.py +540 -0
  21. holmes/core/tool_calling_llm.py +798 -0
  22. holmes/core/tools.py +566 -0
  23. holmes/core/tools_utils/__init__.py +0 -0
  24. holmes/core/tools_utils/tool_executor.py +65 -0
  25. holmes/core/tools_utils/toolset_utils.py +52 -0
  26. holmes/core/toolset_manager.py +418 -0
  27. holmes/interactive.py +229 -0
  28. holmes/main.py +1041 -0
  29. holmes/plugins/__init__.py +0 -0
  30. holmes/plugins/destinations/__init__.py +6 -0
  31. holmes/plugins/destinations/slack/__init__.py +2 -0
  32. holmes/plugins/destinations/slack/plugin.py +163 -0
  33. holmes/plugins/interfaces.py +32 -0
  34. holmes/plugins/prompts/__init__.py +48 -0
  35. holmes/plugins/prompts/_current_date_time.jinja2 +1 -0
  36. holmes/plugins/prompts/_default_log_prompt.jinja2 +11 -0
  37. holmes/plugins/prompts/_fetch_logs.jinja2 +36 -0
  38. holmes/plugins/prompts/_general_instructions.jinja2 +86 -0
  39. holmes/plugins/prompts/_global_instructions.jinja2 +12 -0
  40. holmes/plugins/prompts/_runbook_instructions.jinja2 +13 -0
  41. holmes/plugins/prompts/_toolsets_instructions.jinja2 +56 -0
  42. holmes/plugins/prompts/generic_ask.jinja2 +36 -0
  43. holmes/plugins/prompts/generic_ask_conversation.jinja2 +32 -0
  44. holmes/plugins/prompts/generic_ask_for_issue_conversation.jinja2 +50 -0
  45. holmes/plugins/prompts/generic_investigation.jinja2 +42 -0
  46. holmes/plugins/prompts/generic_post_processing.jinja2 +13 -0
  47. holmes/plugins/prompts/generic_ticket.jinja2 +12 -0
  48. holmes/plugins/prompts/investigation_output_format.jinja2 +32 -0
  49. holmes/plugins/prompts/kubernetes_workload_ask.jinja2 +84 -0
  50. holmes/plugins/prompts/kubernetes_workload_chat.jinja2 +39 -0
  51. holmes/plugins/runbooks/README.md +22 -0
  52. holmes/plugins/runbooks/__init__.py +100 -0
  53. holmes/plugins/runbooks/catalog.json +14 -0
  54. holmes/plugins/runbooks/jira.yaml +12 -0
  55. holmes/plugins/runbooks/kube-prometheus-stack.yaml +10 -0
  56. holmes/plugins/runbooks/networking/dns_troubleshooting_instructions.md +66 -0
  57. holmes/plugins/runbooks/upgrade/upgrade_troubleshooting_instructions.md +44 -0
  58. holmes/plugins/sources/github/__init__.py +77 -0
  59. holmes/plugins/sources/jira/__init__.py +123 -0
  60. holmes/plugins/sources/opsgenie/__init__.py +93 -0
  61. holmes/plugins/sources/pagerduty/__init__.py +147 -0
  62. holmes/plugins/sources/prometheus/__init__.py +0 -0
  63. holmes/plugins/sources/prometheus/models.py +104 -0
  64. holmes/plugins/sources/prometheus/plugin.py +154 -0
  65. holmes/plugins/toolsets/__init__.py +171 -0
  66. holmes/plugins/toolsets/aks-node-health.yaml +65 -0
  67. holmes/plugins/toolsets/aks.yaml +86 -0
  68. holmes/plugins/toolsets/argocd.yaml +70 -0
  69. holmes/plugins/toolsets/atlas_mongodb/instructions.jinja2 +8 -0
  70. holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +307 -0
  71. holmes/plugins/toolsets/aws.yaml +76 -0
  72. holmes/plugins/toolsets/azure_sql/__init__.py +0 -0
  73. holmes/plugins/toolsets/azure_sql/apis/alert_monitoring_api.py +600 -0
  74. holmes/plugins/toolsets/azure_sql/apis/azure_sql_api.py +309 -0
  75. holmes/plugins/toolsets/azure_sql/apis/connection_failure_api.py +445 -0
  76. holmes/plugins/toolsets/azure_sql/apis/connection_monitoring_api.py +251 -0
  77. holmes/plugins/toolsets/azure_sql/apis/storage_analysis_api.py +317 -0
  78. holmes/plugins/toolsets/azure_sql/azure_base_toolset.py +55 -0
  79. holmes/plugins/toolsets/azure_sql/azure_sql_instructions.jinja2 +137 -0
  80. holmes/plugins/toolsets/azure_sql/azure_sql_toolset.py +183 -0
  81. holmes/plugins/toolsets/azure_sql/install.md +66 -0
  82. holmes/plugins/toolsets/azure_sql/tools/__init__.py +1 -0
  83. holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +324 -0
  84. holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +243 -0
  85. holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +205 -0
  86. holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +249 -0
  87. holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +373 -0
  88. holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +237 -0
  89. holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +172 -0
  90. holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +170 -0
  91. holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +188 -0
  92. holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +180 -0
  93. holmes/plugins/toolsets/azure_sql/utils.py +83 -0
  94. holmes/plugins/toolsets/bash/__init__.py +0 -0
  95. holmes/plugins/toolsets/bash/bash_instructions.jinja2 +14 -0
  96. holmes/plugins/toolsets/bash/bash_toolset.py +208 -0
  97. holmes/plugins/toolsets/bash/common/bash.py +52 -0
  98. holmes/plugins/toolsets/bash/common/config.py +14 -0
  99. holmes/plugins/toolsets/bash/common/stringify.py +25 -0
  100. holmes/plugins/toolsets/bash/common/validators.py +24 -0
  101. holmes/plugins/toolsets/bash/grep/__init__.py +52 -0
  102. holmes/plugins/toolsets/bash/kubectl/__init__.py +100 -0
  103. holmes/plugins/toolsets/bash/kubectl/constants.py +96 -0
  104. holmes/plugins/toolsets/bash/kubectl/kubectl_describe.py +66 -0
  105. holmes/plugins/toolsets/bash/kubectl/kubectl_events.py +88 -0
  106. holmes/plugins/toolsets/bash/kubectl/kubectl_get.py +108 -0
  107. holmes/plugins/toolsets/bash/kubectl/kubectl_logs.py +20 -0
  108. holmes/plugins/toolsets/bash/kubectl/kubectl_run.py +46 -0
  109. holmes/plugins/toolsets/bash/kubectl/kubectl_top.py +81 -0
  110. holmes/plugins/toolsets/bash/parse_command.py +103 -0
  111. holmes/plugins/toolsets/confluence.yaml +19 -0
  112. holmes/plugins/toolsets/consts.py +5 -0
  113. holmes/plugins/toolsets/coralogix/api.py +158 -0
  114. holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +103 -0
  115. holmes/plugins/toolsets/coralogix/utils.py +181 -0
  116. holmes/plugins/toolsets/datadog.py +153 -0
  117. holmes/plugins/toolsets/docker.yaml +46 -0
  118. holmes/plugins/toolsets/git.py +756 -0
  119. holmes/plugins/toolsets/grafana/__init__.py +0 -0
  120. holmes/plugins/toolsets/grafana/base_grafana_toolset.py +54 -0
  121. holmes/plugins/toolsets/grafana/common.py +68 -0
  122. holmes/plugins/toolsets/grafana/grafana_api.py +31 -0
  123. holmes/plugins/toolsets/grafana/loki_api.py +89 -0
  124. holmes/plugins/toolsets/grafana/tempo_api.py +124 -0
  125. holmes/plugins/toolsets/grafana/toolset_grafana.py +102 -0
  126. holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +102 -0
  127. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.jinja2 +10 -0
  128. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +299 -0
  129. holmes/plugins/toolsets/grafana/trace_parser.py +195 -0
  130. holmes/plugins/toolsets/helm.yaml +42 -0
  131. holmes/plugins/toolsets/internet/internet.py +275 -0
  132. holmes/plugins/toolsets/internet/notion.py +137 -0
  133. holmes/plugins/toolsets/kafka.py +638 -0
  134. holmes/plugins/toolsets/kubernetes.yaml +255 -0
  135. holmes/plugins/toolsets/kubernetes_logs.py +426 -0
  136. holmes/plugins/toolsets/kubernetes_logs.yaml +42 -0
  137. holmes/plugins/toolsets/logging_utils/__init__.py +0 -0
  138. holmes/plugins/toolsets/logging_utils/logging_api.py +217 -0
  139. holmes/plugins/toolsets/logging_utils/types.py +0 -0
  140. holmes/plugins/toolsets/mcp/toolset_mcp.py +135 -0
  141. holmes/plugins/toolsets/newrelic.py +222 -0
  142. holmes/plugins/toolsets/opensearch/__init__.py +0 -0
  143. holmes/plugins/toolsets/opensearch/opensearch.py +245 -0
  144. holmes/plugins/toolsets/opensearch/opensearch_logs.py +151 -0
  145. holmes/plugins/toolsets/opensearch/opensearch_traces.py +211 -0
  146. holmes/plugins/toolsets/opensearch/opensearch_traces_instructions.jinja2 +12 -0
  147. holmes/plugins/toolsets/opensearch/opensearch_utils.py +166 -0
  148. holmes/plugins/toolsets/prometheus/prometheus.py +818 -0
  149. holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2 +38 -0
  150. holmes/plugins/toolsets/rabbitmq/api.py +398 -0
  151. holmes/plugins/toolsets/rabbitmq/rabbitmq_instructions.jinja2 +37 -0
  152. holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +222 -0
  153. holmes/plugins/toolsets/robusta/__init__.py +0 -0
  154. holmes/plugins/toolsets/robusta/robusta.py +235 -0
  155. holmes/plugins/toolsets/robusta/robusta_instructions.jinja2 +24 -0
  156. holmes/plugins/toolsets/runbook/__init__.py +0 -0
  157. holmes/plugins/toolsets/runbook/runbook_fetcher.py +78 -0
  158. holmes/plugins/toolsets/service_discovery.py +92 -0
  159. holmes/plugins/toolsets/servicenow/install.md +37 -0
  160. holmes/plugins/toolsets/servicenow/instructions.jinja2 +3 -0
  161. holmes/plugins/toolsets/servicenow/servicenow.py +198 -0
  162. holmes/plugins/toolsets/slab.yaml +20 -0
  163. holmes/plugins/toolsets/utils.py +137 -0
  164. holmes/plugins/utils.py +14 -0
  165. holmes/utils/__init__.py +0 -0
  166. holmes/utils/cache.py +84 -0
  167. holmes/utils/cert_utils.py +40 -0
  168. holmes/utils/default_toolset_installation_guide.jinja2 +44 -0
  169. holmes/utils/definitions.py +13 -0
  170. holmes/utils/env.py +53 -0
  171. holmes/utils/file_utils.py +56 -0
  172. holmes/utils/global_instructions.py +20 -0
  173. holmes/utils/holmes_status.py +22 -0
  174. holmes/utils/holmes_sync_toolsets.py +80 -0
  175. holmes/utils/markdown_utils.py +55 -0
  176. holmes/utils/pydantic_utils.py +54 -0
  177. holmes/utils/robusta.py +10 -0
  178. holmes/utils/tags.py +97 -0
  179. holmesgpt-0.11.5.dist-info/LICENSE.txt +21 -0
  180. holmesgpt-0.11.5.dist-info/METADATA +400 -0
  181. holmesgpt-0.11.5.dist-info/RECORD +183 -0
  182. holmesgpt-0.11.5.dist-info/WHEEL +4 -0
  183. holmesgpt-0.11.5.dist-info/entry_points.txt +3 -0
holmes/plugins/toolsets/azure_sql/apis/alert_monitoring_api.py (new file)
@@ -0,0 +1,600 @@
+from typing import Dict, List, Any
+import logging
+from datetime import datetime, timezone, timedelta
+from azure.core.credentials import TokenCredential
+from azure.mgmt.monitor import MonitorManagementClient
+from azure.mgmt.resource import ResourceManagementClient
+
+
+class AlertMonitoringAPI:
+    """API client for Azure Monitor alerts related to Azure SQL databases."""
+
+    def __init__(
+        self,
+        credential: TokenCredential,
+        subscription_id: str,
+    ):
+        self.credential = credential
+        self.subscription_id = subscription_id
+        self.monitor_client = MonitorManagementClient(credential, subscription_id)
+        self.resource_client = ResourceManagementClient(credential, subscription_id)
+
+        # Initialize alerts management client (different from monitor client)
+        try:
+            # Import here to avoid circular imports
+            from azure.mgmt.alertsmanagement import AlertsManagementClient
+
+            self.alerts_client = AlertsManagementClient(credential, subscription_id)
+        except ImportError:
+            logging.warning(
+                "AlertsManagementClient not available, using fallback methods"
+            )
+            self.alerts_client = None
+
+    def _build_database_resource_id(
+        self, resource_group: str, server_name: str, database_name: str
+    ) -> str:
+        """Build the full Azure resource ID for the SQL database."""
+        return (
+            f"/subscriptions/{self.subscription_id}/"
+            f"resourceGroups/{resource_group}/"
+            f"providers/Microsoft.Sql/servers/{server_name}/"
+            f"databases/{database_name}"
+        )
+
+    def _build_server_resource_id(self, resource_group: str, server_name: str) -> str:
+        """Build the full Azure resource ID for the SQL server."""
+        return (
+            f"/subscriptions/{self.subscription_id}/"
+            f"resourceGroups/{resource_group}/"
+            f"providers/Microsoft.Sql/servers/{server_name}"
+        )
+
+    def get_active_alerts(
+        self, resource_group: str, server_name: str, database_name: str
+    ) -> Dict[str, Any]:
+        """Get currently active alerts for the SQL database and server."""
+        try:
+            database_resource_id = self._build_database_resource_id(
+                resource_group, server_name, database_name
+            )
+            server_resource_id = self._build_server_resource_id(
+                resource_group, server_name
+            )
+
+            active_alerts = []
+
+            if self.alerts_client:
+                # Get alerts using the AlertsManagement API
+                try:
+                    # Get database-specific alerts
+                    db_alerts = self.alerts_client.alerts.get_all(
+                        target_resource=database_resource_id,
+                        alert_state="New,Acknowledged",
+                    )
+                    for alert in db_alerts:
+                        active_alerts.append(self._format_alert(alert, "database"))
+
+                    # Get server-level alerts that might affect the database
+                    server_alerts = self.alerts_client.alerts.get_all(
+                        target_resource=server_resource_id,
+                        alert_state="New,Acknowledged",
+                    )
+                    for alert in server_alerts:
+                        active_alerts.append(self._format_alert(alert, "server"))
+
+                except Exception as e:
+                    logging.warning(f"AlertsManagement API failed, using fallback: {e}")
+                    return self._get_active_alerts_fallback(
+                        resource_group, server_name, database_name
+                    )
+            else:
+                # Fallback method using Monitor API
+                return self._get_active_alerts_fallback(
+                    resource_group, server_name, database_name
+                )
+
+            return {
+                "database_resource_id": database_resource_id,
+                "server_resource_id": server_resource_id,
+                "active_alerts": active_alerts,
+                "total_count": len(active_alerts),
+                "retrieved_at": datetime.now(timezone.utc).isoformat(),
+            }
+
+        except Exception as e:
+            error_msg = f"Failed to retrieve active alerts: {str(e)}"
+            logging.error(error_msg, exc_info=True)
+            return {"error": error_msg}
+
+    def _get_active_alerts_fallback(
+        self, resource_group: str, server_name: str, database_name: str
+    ) -> Dict[str, Any]:
+        """Fallback method to get alerts using Monitor API activity log."""
+        try:
+            database_resource_id = self._build_database_resource_id(
+                resource_group, server_name, database_name
+            )
+            server_resource_id = self._build_server_resource_id(
+                resource_group, server_name
+            )
+
+            # Get recent activity log entries that might indicate alerts
+            end_time = datetime.now(timezone.utc)
+            start_time = end_time - timedelta(hours=24)
+
+            filter_query = (
+                f"eventTimestamp ge '{start_time.isoformat()}' and "
+                f"eventTimestamp le '{end_time.isoformat()}' and "
+                f"(resourceId eq '{database_resource_id}' or resourceId eq '{server_resource_id}') and "
+                f"(level eq 'Warning' or level eq 'Error')"
+            )
+
+            activity_logs = self.monitor_client.activity_logs.list(filter=filter_query)
+
+            alerts = []
+            for log_entry in activity_logs:
+                if hasattr(log_entry, "operation_name") and log_entry.operation_name:
+                    # Convert activity log to alert-like format
+                    alert_data = {
+                        "id": getattr(log_entry, "event_data_id", "unknown"),
+                        "name": getattr(log_entry, "operation_name", {}).get(
+                            "value", "Unknown Operation"
+                        ),
+                        "description": getattr(
+                            log_entry, "description", "No description available"
+                        ),
+                        "severity": self._map_level_to_severity(
+                            getattr(log_entry, "level", "Informational")
+                        ),
+                        "state": "Active",
+                        "monitor_condition": "Fired",
+                        "fired_time": getattr(
+                            log_entry, "event_timestamp", end_time
+                        ).isoformat(),
+                        "resource_type": "Activity Log Event",
+                        "target_resource": getattr(log_entry, "resource_id", ""),
+                        "scope": "server"
+                        if server_resource_id
+                        in str(getattr(log_entry, "resource_id", ""))
+                        else "database",
+                    }
+                    alerts.append(alert_data)
+
+            return {
+                "database_resource_id": database_resource_id,
+                "server_resource_id": server_resource_id,
+                "active_alerts": alerts,
+                "total_count": len(alerts),
+                "retrieved_at": datetime.now(timezone.utc).isoformat(),
+                "method": "activity_log_fallback",
+            }
+
+        except Exception as e:
+            error_msg = f"Failed to retrieve alerts using fallback method: {str(e)}"
+            logging.error(error_msg, exc_info=True)
+            return {"error": error_msg}
+
+    def get_alert_history(
+        self,
+        resource_group: str,
+        server_name: str,
+        database_name: str,
+        hours_back: int = 168,  # Default to 7 days
+    ) -> Dict[str, Any]:
+        """Get historical alerts for the SQL database and server by fetching metric alert rules."""
+        try:
+            database_resource_id = self._build_database_resource_id(
+                resource_group, server_name, database_name
+            )
+            server_resource_id = self._build_server_resource_id(
+                resource_group, server_name
+            )
+
+            end_time = datetime.now(timezone.utc)
+            start_time = end_time - timedelta(hours=hours_back)
+
+            historical_alerts = []
+
+            try:
+                # Get metric alert rules from the resource group
+                logging.info(
+                    f"Fetching metric alert rules from resource group: {resource_group}"
+                )
+                metric_alert_rules = (
+                    self.monitor_client.metric_alerts.list_by_resource_group(
+                        resource_group
+                    )
+                )
+
+                relevant_rules = []
+                for rule in metric_alert_rules:
+                    # Check if this rule applies to our database or server
+                    if hasattr(rule, "scopes") and rule.scopes:
+                        for scope in rule.scopes:
+                            if (
+                                database_resource_id in scope
+                                or server_resource_id in scope
+                            ):
+                                relevant_rules.append(rule)
+                                break
+
+                logging.info(
+                    f"Found {len(relevant_rules)} metric alert rules relevant to our resources"
+                )
+
+                # For each relevant rule, try to get alert instances if possible
+                for rule in relevant_rules:
+                    rule_name = getattr(rule, "name", "Unknown Rule")
+                    rule_id = getattr(rule, "id", "Unknown ID")
+
+                    # Create alert entry from rule definition
+                    scope_type = (
+                        "database"
+                        if database_resource_id in str(getattr(rule, "scopes", []))
+                        else "server"
+                    )
+
+                    alert_data = {
+                        "id": rule_id,
+                        "name": rule_name,
+                        "description": getattr(
+                            rule, "description", "Metric alert rule"
+                        ),
+                        "severity": f"Sev{getattr(rule, 'severity', 3)}",
+                        "state": "Rule Configured",  # We can't determine actual firing state without alert instances
+                        "monitor_condition": "Configured",
+                        "fired_time": datetime.now(
+                            timezone.utc
+                        ).isoformat(),  # Use current time as placeholder
+                        "resource_type": "Metric Alert Rule",
+                        "target_resource": str(getattr(rule, "scopes", [])),
+                        "scope": scope_type,
+                        "rule_enabled": getattr(rule, "enabled", False),
+                        "window_size": str(getattr(rule, "window_size", "Unknown")),
+                        "evaluation_frequency": str(
+                            getattr(rule, "evaluation_frequency", "Unknown")
+                        ),
+                    }
+
+                    # Add criteria information if available
+                    if hasattr(rule, "criteria") and rule.criteria:
+                        criteria_info = []
+                        if hasattr(rule.criteria, "all_of"):
+                            for criterion in rule.criteria.all_of:
+                                metric_name = getattr(
+                                    criterion, "metric_name", "Unknown"
+                                )
+                                operator = getattr(criterion, "operator", "Unknown")
+                                threshold = getattr(criterion, "threshold", "Unknown")
+                                criteria_info.append(
+                                    f"{metric_name} {operator} {threshold}"
+                                )
+                        if criteria_info:
+                            alert_data["criteria"] = "; ".join(criteria_info)
+
+                    historical_alerts.append(alert_data)
+                    logging.info(f"Added metric alert rule: {rule_name} ({scope_type})")
+
+                # If we have alerts management client, try to get actual alert instances
+                if self.alerts_client and historical_alerts:
+                    try:
+                        logging.info(
+                            "Attempting to get alert instances from AlertsManagement API"
+                        )
+                        # Try to get alert instances for the time period
+                        alert_instances = self.alerts_client.alerts.get_all(
+                            time_range=f"PT{hours_back}H"  # ISO 8601 duration format
+                        )
+
+                        instance_count = 0
+                        for alert_instance in alert_instances:
+                            instance_count += 1
+                            # Check if this instance relates to our resources
+                            instance_resource = str(
+                                getattr(alert_instance, "target_resource", "")
+                            )
+                            if (
+                                database_resource_id in instance_resource
+                                or server_resource_id in instance_resource
+                            ):
+                                # Update or add alert with actual instance data
+                                instance_data = self._format_alert(
+                                    alert_instance,
+                                    "database"
+                                    if database_resource_id in instance_resource
+                                    else "server",
+                                )
+                                historical_alerts.append(instance_data)
+                                logging.info(
+                                    f"Added alert instance: {instance_data.get('name', 'Unknown')}"
+                                )
+
+                        logging.info(f"Processed {instance_count} alert instances")
+                    except Exception as e:
+                        logging.warning(f"Failed to get alert instances: {e}")
+                        # Continue with just the rules data
+
+            except Exception as e:
+                logging.warning(
+                    f"Failed to get metric alert rules, falling back to activity logs: {e}"
+                )
+                return self._get_alert_history_fallback(
+                    resource_group, server_name, database_name, hours_back
+                )
+
+            # Sort by fired time, most recent first
+            historical_alerts.sort(key=lambda x: x.get("fired_time", ""), reverse=True)
+
+            # Analyze patterns
+            analysis = self._analyze_alert_patterns(historical_alerts)
+
+            return {
+                "database_resource_id": database_resource_id,
+                "server_resource_id": server_resource_id,
+                "time_range": {
+                    "start": start_time.isoformat(),
+                    "end": end_time.isoformat(),
+                    "hours": hours_back,
+                },
+                "alerts": historical_alerts,
+                "total_count": len(historical_alerts),
+                "analysis": analysis,
+                "retrieved_at": datetime.now(timezone.utc).isoformat(),
+                "method": "metric_alerts",
+            }
+
+        except Exception as e:
+            error_msg = f"Failed to retrieve alert history: {str(e)}"
+            logging.error(error_msg, exc_info=True)
+            return {"error": error_msg}
+
+    def _get_alert_history_fallback(
+        self, resource_group: str, server_name: str, database_name: str, hours_back: int
+    ) -> Dict[str, Any]:
+        """Fallback method to get alert history using activity logs."""
+        try:
+            database_resource_id = self._build_database_resource_id(
+                resource_group, server_name, database_name
+            )
+            server_resource_id = self._build_server_resource_id(
+                resource_group, server_name
+            )
+
+            end_time = datetime.now(timezone.utc)
+            start_time = end_time - timedelta(hours=hours_back)
+
+            filter_query = (
+                f"eventTimestamp ge '{start_time.isoformat()}' and "
+                f"eventTimestamp le '{end_time.isoformat()}' and "
+                f"resourceId eq '{database_resource_id}'"
+            )
+
+            logging.info(f"Using activity logs fallback with filter: {filter_query}")
+
+            try:
+                activity_logs = self.monitor_client.activity_logs.list(
+                    filter=filter_query
+                )
+
+                alerts = []
+                log_count = 0
+                for log_entry in activity_logs:
+                    log_count += 1
+
+                logging.info(f"Found {log_count} activity log entries")
+
+                # Reset iterator and process entries
+                activity_logs = self.monitor_client.activity_logs.list(
+                    filter=filter_query
+                )
+                for log_entry in activity_logs:
+                    if hasattr(log_entry, "level") and log_entry.level in [
+                        "Warning",
+                        "Error",
+                        "Critical",
+                    ]:
+                        alert_data = {
+                            "id": getattr(log_entry, "event_data_id", "unknown"),
+                            "name": getattr(log_entry, "operation_name", {}).get(
+                                "value", "Unknown Operation"
+                            ),
+                            "description": getattr(
+                                log_entry, "description", "No description available"
+                            ),
+                            "severity": self._map_level_to_severity(
+                                getattr(log_entry, "level", "Informational")
+                            ),
+                            "state": "Resolved",  # Activity logs are historical
+                            "monitor_condition": "Resolved",
+                            "fired_time": getattr(
+                                log_entry, "event_timestamp", end_time
+                            ).isoformat(),
+                            "resolved_time": getattr(
+                                log_entry, "event_timestamp", end_time
+                            ).isoformat(),
+                            "resource_type": "Activity Log Event",
+                            "target_resource": getattr(log_entry, "resource_id", ""),
+                            "scope": "server"
+                            if server_resource_id
+                            in str(getattr(log_entry, "resource_id", ""))
+                            else "database",
+                            "caller": getattr(log_entry, "caller", "Unknown"),
+                            "status": getattr(log_entry, "status", {}).get(
+                                "value", "Unknown"
+                            ),
+                        }
+                        alerts.append(alert_data)
+                        logging.info(
+                            f"Added alert from activity log: {alert_data['name']}"
+                        )
+
+                # Sort by fired time, most recent first
+                alerts.sort(key=lambda x: x.get("fired_time", ""), reverse=True)
+
+            except Exception as e:
+                logging.error(f"Failed to process activity logs: {e}")
+                alerts = []
+
+            # Analyze patterns
+            analysis = self._analyze_alert_patterns(alerts)
+
+            return {
+                "database_resource_id": database_resource_id,
+                "server_resource_id": server_resource_id,
+                "time_range": {
+                    "start": start_time.isoformat(),
+                    "end": end_time.isoformat(),
+                    "hours": hours_back,
+                },
+                "alerts": alerts,
+                "total_count": len(alerts),
+                "analysis": analysis,
+                "retrieved_at": datetime.now(timezone.utc).isoformat(),
+                "method": "activity_log_fallback",
+            }
+
+        except Exception as e:
+            error_msg = f"Failed to retrieve alert history using fallback: {str(e)}"
+            logging.error(error_msg, exc_info=True)
+            return {"error": error_msg}
+
+    def _format_alert(self, alert, scope: str) -> Dict[str, Any]:
+        """Format an alert object into a consistent dictionary structure."""
+        try:
+            # Handle different alert object types
+            if hasattr(alert, "properties"):
+                props = alert.properties
+                return {
+                    "id": getattr(alert, "id", "unknown"),
+                    "name": getattr(alert, "name", "Unknown Alert"),
+                    "description": getattr(
+                        props, "description", "No description available"
+                    ),
+                    "severity": getattr(props, "severity", "Unknown"),
+                    "state": getattr(props, "monitor_condition", "Unknown"),
+                    "monitor_condition": getattr(props, "monitor_condition", "Unknown"),
+                    "fired_time": getattr(
+                        props, "fired_time", datetime.now(timezone.utc)
+                    ).isoformat(),
+                    "resolved_time": getattr(props, "resolved_time", None),
+                    "resource_type": getattr(props, "target_resource_type", "Unknown"),
+                    "target_resource": getattr(props, "target_resource", ""),
+                    "scope": scope,
+                }
+            else:
+                # Fallback for different alert formats
+                return {
+                    "id": str(getattr(alert, "id", "unknown")),
+                    "name": str(getattr(alert, "name", "Unknown Alert")),
+                    "description": str(
+                        getattr(alert, "description", "No description available")
+                    ),
+                    "severity": str(getattr(alert, "severity", "Unknown")),
+                    "state": str(getattr(alert, "state", "Unknown")),
+                    "monitor_condition": str(
+                        getattr(alert, "monitor_condition", "Unknown")
+                    ),
+                    "fired_time": datetime.now(timezone.utc).isoformat(),
+                    "resource_type": "Unknown",
+                    "target_resource": str(getattr(alert, "target_resource", "")),
+                    "scope": scope,
+                }
+        except Exception as e:
+            logging.warning(f"Failed to format alert: {e}")
+            return {
+                "id": "unknown",
+                "name": "Failed to parse alert",
+                "description": f"Error parsing alert: {str(e)}",
+                "severity": "Unknown",
+                "state": "Unknown",
+                "scope": scope,
+                "fired_time": datetime.now(timezone.utc).isoformat(),
+            }
+
+    def _map_level_to_severity(self, level: str) -> str:
+        """Map activity log level to alert severity."""
+        level_map = {
+            "Critical": "Sev0",
+            "Error": "Sev1",
+            "Warning": "Sev2",
+            "Informational": "Sev3",
+            "Verbose": "Sev4",
+        }
+        return level_map.get(level, "Unknown")
+
+    def _analyze_alert_patterns(self, alerts: List[Dict[str, Any]]) -> Dict[str, Any]:
+        """Analyze alert patterns to identify trends and issues."""
+        if not alerts:
+            return {"message": "No alerts to analyze"}
+
+        # Count by severity
+        severity_counts: dict = {}
+        state_counts: dict = {}
+        scope_counts: dict = {}
+        resource_type_counts: dict = {}
+
+        for alert in alerts:
+            severity = alert.get("severity", "Unknown")
+            state = alert.get("state", "Unknown")
+            scope = alert.get("scope", "Unknown")
+            resource_type = alert.get("resource_type", "Unknown")
+
+            severity_counts[severity] = severity_counts.get(severity, 0) + 1
+            state_counts[state] = state_counts.get(state, 0) + 1
+            scope_counts[scope] = scope_counts.get(scope, 0) + 1
+            resource_type_counts[resource_type] = (
+                resource_type_counts.get(resource_type, 0) + 1
+            )
+
+        # Find most frequent alert names
+        alert_names: dict = {}
+        for alert in alerts:
+            name = alert.get("name", "Unknown")
+            alert_names[name] = alert_names.get(name, 0) + 1
+
+        most_frequent_alerts = sorted(
+            alert_names.items(), key=lambda x: x[1], reverse=True
+        )[:5]
+
+        return {
+            "total_alerts": len(alerts),
+            "severity_breakdown": severity_counts,
+            "state_breakdown": state_counts,
+            "scope_breakdown": scope_counts,
+            "resource_type_breakdown": resource_type_counts,
+            "most_frequent_alerts": most_frequent_alerts,
+            "analysis_notes": self._generate_analysis_notes(
+                severity_counts, most_frequent_alerts
+            ),
+        }
+
+    def _generate_analysis_notes(
+        self, severity_counts: Dict, frequent_alerts: List
+    ) -> List[str]:
+        """Generate human-readable analysis notes."""
+        notes = []
+
+        # Severity analysis
+        critical_count = severity_counts.get("Sev0", 0) + severity_counts.get(
+            "Critical", 0
+        )
+        error_count = severity_counts.get("Sev1", 0) + severity_counts.get("Error", 0)
+
+        if critical_count > 0:
+            notes.append(f"⚠️ {critical_count} critical severity alerts detected")
+        if error_count > 0:
+            notes.append(f"🔴 {error_count} error severity alerts detected")
+
+        # Frequent alerts analysis
+        if frequent_alerts:
+            top_alert = frequent_alerts[0]
+            if top_alert[1] > 1:
+                notes.append(
+                    f"🔁 Most frequent alert: '{top_alert[0]}' ({top_alert[1]} occurrences)"
+                )
+
+        if not notes:
+            notes.append("✅ Alert analysis looks normal")
+
+        return notes
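
For orientation, here is a minimal, hypothetical usage sketch of the AlertMonitoringAPI class added above. It is not part of the package diff: the subscription ID, resource group, server, and database names are placeholders, and it assumes the azure-identity package is installed so that DefaultAzureCredential can serve as the TokenCredential the constructor expects.

# Hypothetical usage sketch; not part of holmesgpt 0.11.5. All resource names below are placeholders.
from azure.identity import DefaultAzureCredential

from holmes.plugins.toolsets.azure_sql.apis.alert_monitoring_api import AlertMonitoringAPI

# Authenticate with whatever credential chain is available in the environment.
credential = DefaultAzureCredential()
api = AlertMonitoringAPI(credential, subscription_id="00000000-0000-0000-0000-000000000000")

# Currently firing (New/Acknowledged) alerts for one database and its server.
active = api.get_active_alerts(
    resource_group="example-rg",
    server_name="example-sql-server",
    database_name="example-db",
)
print(active.get("total_count", 0), "active alerts")

# Configured metric alert rules, plus any retrievable alert instances, over the last 7 days.
history = api.get_alert_history(
    resource_group="example-rg",
    server_name="example-sql-server",
    database_name="example-db",
    hours_back=168,
)
for note in history.get("analysis", {}).get("analysis_notes", []):
    print(note)

Both methods return plain dictionaries (with an "error" key on failure), so a caller only needs the dictionary lookups shown above.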