holmesgpt-0.11.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of holmesgpt might be problematic.

Files changed (183):
  1. holmes/.git_archival.json +7 -0
  2. holmes/__init__.py +76 -0
  3. holmes/__init__.py.bak +76 -0
  4. holmes/clients/robusta_client.py +24 -0
  5. holmes/common/env_vars.py +47 -0
  6. holmes/config.py +526 -0
  7. holmes/core/__init__.py +0 -0
  8. holmes/core/conversations.py +578 -0
  9. holmes/core/investigation.py +152 -0
  10. holmes/core/investigation_structured_output.py +264 -0
  11. holmes/core/issue.py +54 -0
  12. holmes/core/llm.py +250 -0
  13. holmes/core/models.py +157 -0
  14. holmes/core/openai_formatting.py +51 -0
  15. holmes/core/performance_timing.py +72 -0
  16. holmes/core/prompt.py +42 -0
  17. holmes/core/resource_instruction.py +17 -0
  18. holmes/core/runbooks.py +26 -0
  19. holmes/core/safeguards.py +120 -0
  20. holmes/core/supabase_dal.py +540 -0
  21. holmes/core/tool_calling_llm.py +798 -0
  22. holmes/core/tools.py +566 -0
  23. holmes/core/tools_utils/__init__.py +0 -0
  24. holmes/core/tools_utils/tool_executor.py +65 -0
  25. holmes/core/tools_utils/toolset_utils.py +52 -0
  26. holmes/core/toolset_manager.py +418 -0
  27. holmes/interactive.py +229 -0
  28. holmes/main.py +1041 -0
  29. holmes/plugins/__init__.py +0 -0
  30. holmes/plugins/destinations/__init__.py +6 -0
  31. holmes/plugins/destinations/slack/__init__.py +2 -0
  32. holmes/plugins/destinations/slack/plugin.py +163 -0
  33. holmes/plugins/interfaces.py +32 -0
  34. holmes/plugins/prompts/__init__.py +48 -0
  35. holmes/plugins/prompts/_current_date_time.jinja2 +1 -0
  36. holmes/plugins/prompts/_default_log_prompt.jinja2 +11 -0
  37. holmes/plugins/prompts/_fetch_logs.jinja2 +36 -0
  38. holmes/plugins/prompts/_general_instructions.jinja2 +86 -0
  39. holmes/plugins/prompts/_global_instructions.jinja2 +12 -0
  40. holmes/plugins/prompts/_runbook_instructions.jinja2 +13 -0
  41. holmes/plugins/prompts/_toolsets_instructions.jinja2 +56 -0
  42. holmes/plugins/prompts/generic_ask.jinja2 +36 -0
  43. holmes/plugins/prompts/generic_ask_conversation.jinja2 +32 -0
  44. holmes/plugins/prompts/generic_ask_for_issue_conversation.jinja2 +50 -0
  45. holmes/plugins/prompts/generic_investigation.jinja2 +42 -0
  46. holmes/plugins/prompts/generic_post_processing.jinja2 +13 -0
  47. holmes/plugins/prompts/generic_ticket.jinja2 +12 -0
  48. holmes/plugins/prompts/investigation_output_format.jinja2 +32 -0
  49. holmes/plugins/prompts/kubernetes_workload_ask.jinja2 +84 -0
  50. holmes/plugins/prompts/kubernetes_workload_chat.jinja2 +39 -0
  51. holmes/plugins/runbooks/README.md +22 -0
  52. holmes/plugins/runbooks/__init__.py +100 -0
  53. holmes/plugins/runbooks/catalog.json +14 -0
  54. holmes/plugins/runbooks/jira.yaml +12 -0
  55. holmes/plugins/runbooks/kube-prometheus-stack.yaml +10 -0
  56. holmes/plugins/runbooks/networking/dns_troubleshooting_instructions.md +66 -0
  57. holmes/plugins/runbooks/upgrade/upgrade_troubleshooting_instructions.md +44 -0
  58. holmes/plugins/sources/github/__init__.py +77 -0
  59. holmes/plugins/sources/jira/__init__.py +123 -0
  60. holmes/plugins/sources/opsgenie/__init__.py +93 -0
  61. holmes/plugins/sources/pagerduty/__init__.py +147 -0
  62. holmes/plugins/sources/prometheus/__init__.py +0 -0
  63. holmes/plugins/sources/prometheus/models.py +104 -0
  64. holmes/plugins/sources/prometheus/plugin.py +154 -0
  65. holmes/plugins/toolsets/__init__.py +171 -0
  66. holmes/plugins/toolsets/aks-node-health.yaml +65 -0
  67. holmes/plugins/toolsets/aks.yaml +86 -0
  68. holmes/plugins/toolsets/argocd.yaml +70 -0
  69. holmes/plugins/toolsets/atlas_mongodb/instructions.jinja2 +8 -0
  70. holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +307 -0
  71. holmes/plugins/toolsets/aws.yaml +76 -0
  72. holmes/plugins/toolsets/azure_sql/__init__.py +0 -0
  73. holmes/plugins/toolsets/azure_sql/apis/alert_monitoring_api.py +600 -0
  74. holmes/plugins/toolsets/azure_sql/apis/azure_sql_api.py +309 -0
  75. holmes/plugins/toolsets/azure_sql/apis/connection_failure_api.py +445 -0
  76. holmes/plugins/toolsets/azure_sql/apis/connection_monitoring_api.py +251 -0
  77. holmes/plugins/toolsets/azure_sql/apis/storage_analysis_api.py +317 -0
  78. holmes/plugins/toolsets/azure_sql/azure_base_toolset.py +55 -0
  79. holmes/plugins/toolsets/azure_sql/azure_sql_instructions.jinja2 +137 -0
  80. holmes/plugins/toolsets/azure_sql/azure_sql_toolset.py +183 -0
  81. holmes/plugins/toolsets/azure_sql/install.md +66 -0
  82. holmes/plugins/toolsets/azure_sql/tools/__init__.py +1 -0
  83. holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +324 -0
  84. holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +243 -0
  85. holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +205 -0
  86. holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +249 -0
  87. holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +373 -0
  88. holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +237 -0
  89. holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +172 -0
  90. holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +170 -0
  91. holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +188 -0
  92. holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +180 -0
  93. holmes/plugins/toolsets/azure_sql/utils.py +83 -0
  94. holmes/plugins/toolsets/bash/__init__.py +0 -0
  95. holmes/plugins/toolsets/bash/bash_instructions.jinja2 +14 -0
  96. holmes/plugins/toolsets/bash/bash_toolset.py +208 -0
  97. holmes/plugins/toolsets/bash/common/bash.py +52 -0
  98. holmes/plugins/toolsets/bash/common/config.py +14 -0
  99. holmes/plugins/toolsets/bash/common/stringify.py +25 -0
  100. holmes/plugins/toolsets/bash/common/validators.py +24 -0
  101. holmes/plugins/toolsets/bash/grep/__init__.py +52 -0
  102. holmes/plugins/toolsets/bash/kubectl/__init__.py +100 -0
  103. holmes/plugins/toolsets/bash/kubectl/constants.py +96 -0
  104. holmes/plugins/toolsets/bash/kubectl/kubectl_describe.py +66 -0
  105. holmes/plugins/toolsets/bash/kubectl/kubectl_events.py +88 -0
  106. holmes/plugins/toolsets/bash/kubectl/kubectl_get.py +108 -0
  107. holmes/plugins/toolsets/bash/kubectl/kubectl_logs.py +20 -0
  108. holmes/plugins/toolsets/bash/kubectl/kubectl_run.py +46 -0
  109. holmes/plugins/toolsets/bash/kubectl/kubectl_top.py +81 -0
  110. holmes/plugins/toolsets/bash/parse_command.py +103 -0
  111. holmes/plugins/toolsets/confluence.yaml +19 -0
  112. holmes/plugins/toolsets/consts.py +5 -0
  113. holmes/plugins/toolsets/coralogix/api.py +158 -0
  114. holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +103 -0
  115. holmes/plugins/toolsets/coralogix/utils.py +181 -0
  116. holmes/plugins/toolsets/datadog.py +153 -0
  117. holmes/plugins/toolsets/docker.yaml +46 -0
  118. holmes/plugins/toolsets/git.py +756 -0
  119. holmes/plugins/toolsets/grafana/__init__.py +0 -0
  120. holmes/plugins/toolsets/grafana/base_grafana_toolset.py +54 -0
  121. holmes/plugins/toolsets/grafana/common.py +68 -0
  122. holmes/plugins/toolsets/grafana/grafana_api.py +31 -0
  123. holmes/plugins/toolsets/grafana/loki_api.py +89 -0
  124. holmes/plugins/toolsets/grafana/tempo_api.py +124 -0
  125. holmes/plugins/toolsets/grafana/toolset_grafana.py +102 -0
  126. holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +102 -0
  127. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.jinja2 +10 -0
  128. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +299 -0
  129. holmes/plugins/toolsets/grafana/trace_parser.py +195 -0
  130. holmes/plugins/toolsets/helm.yaml +42 -0
  131. holmes/plugins/toolsets/internet/internet.py +275 -0
  132. holmes/plugins/toolsets/internet/notion.py +137 -0
  133. holmes/plugins/toolsets/kafka.py +638 -0
  134. holmes/plugins/toolsets/kubernetes.yaml +255 -0
  135. holmes/plugins/toolsets/kubernetes_logs.py +426 -0
  136. holmes/plugins/toolsets/kubernetes_logs.yaml +42 -0
  137. holmes/plugins/toolsets/logging_utils/__init__.py +0 -0
  138. holmes/plugins/toolsets/logging_utils/logging_api.py +217 -0
  139. holmes/plugins/toolsets/logging_utils/types.py +0 -0
  140. holmes/plugins/toolsets/mcp/toolset_mcp.py +135 -0
  141. holmes/plugins/toolsets/newrelic.py +222 -0
  142. holmes/plugins/toolsets/opensearch/__init__.py +0 -0
  143. holmes/plugins/toolsets/opensearch/opensearch.py +245 -0
  144. holmes/plugins/toolsets/opensearch/opensearch_logs.py +151 -0
  145. holmes/plugins/toolsets/opensearch/opensearch_traces.py +211 -0
  146. holmes/plugins/toolsets/opensearch/opensearch_traces_instructions.jinja2 +12 -0
  147. holmes/plugins/toolsets/opensearch/opensearch_utils.py +166 -0
  148. holmes/plugins/toolsets/prometheus/prometheus.py +818 -0
  149. holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2 +38 -0
  150. holmes/plugins/toolsets/rabbitmq/api.py +398 -0
  151. holmes/plugins/toolsets/rabbitmq/rabbitmq_instructions.jinja2 +37 -0
  152. holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +222 -0
  153. holmes/plugins/toolsets/robusta/__init__.py +0 -0
  154. holmes/plugins/toolsets/robusta/robusta.py +235 -0
  155. holmes/plugins/toolsets/robusta/robusta_instructions.jinja2 +24 -0
  156. holmes/plugins/toolsets/runbook/__init__.py +0 -0
  157. holmes/plugins/toolsets/runbook/runbook_fetcher.py +78 -0
  158. holmes/plugins/toolsets/service_discovery.py +92 -0
  159. holmes/plugins/toolsets/servicenow/install.md +37 -0
  160. holmes/plugins/toolsets/servicenow/instructions.jinja2 +3 -0
  161. holmes/plugins/toolsets/servicenow/servicenow.py +198 -0
  162. holmes/plugins/toolsets/slab.yaml +20 -0
  163. holmes/plugins/toolsets/utils.py +137 -0
  164. holmes/plugins/utils.py +14 -0
  165. holmes/utils/__init__.py +0 -0
  166. holmes/utils/cache.py +84 -0
  167. holmes/utils/cert_utils.py +40 -0
  168. holmes/utils/default_toolset_installation_guide.jinja2 +44 -0
  169. holmes/utils/definitions.py +13 -0
  170. holmes/utils/env.py +53 -0
  171. holmes/utils/file_utils.py +56 -0
  172. holmes/utils/global_instructions.py +20 -0
  173. holmes/utils/holmes_status.py +22 -0
  174. holmes/utils/holmes_sync_toolsets.py +80 -0
  175. holmes/utils/markdown_utils.py +55 -0
  176. holmes/utils/pydantic_utils.py +54 -0
  177. holmes/utils/robusta.py +10 -0
  178. holmes/utils/tags.py +97 -0
  179. holmesgpt-0.11.5.dist-info/LICENSE.txt +21 -0
  180. holmesgpt-0.11.5.dist-info/METADATA +400 -0
  181. holmesgpt-0.11.5.dist-info/RECORD +183 -0
  182. holmesgpt-0.11.5.dist-info/WHEEL +4 -0
  183. holmesgpt-0.11.5.dist-info/entry_points.txt +3 -0

holmes/plugins/toolsets/azure_sql/apis/connection_monitoring_api.py
@@ -0,0 +1,251 @@
from typing import Dict, List
import logging
from datetime import datetime, timedelta
from azure.core.credentials import TokenCredential
from azure.monitor.query import MetricsQueryClient
from .azure_sql_api import AzureSQLAPIClient


class ConnectionMonitoringAPI:
    def __init__(
        self,
        credential: TokenCredential,
        subscription_id: str,
    ):
        self.sql_api_client = AzureSQLAPIClient(credential, subscription_id)
        self.metrics_client = MetricsQueryClient(credential)
        self.subscription_id = subscription_id

    def _format_sql_error(self, error: Exception) -> str:
        """Format SQL errors with helpful permission guidance."""
        error_str = str(error)

        # Detect common permission issues
        if (
            "Login failed for user" in error_str
            and "token-identified principal" in error_str
        ):
            return (
                f"Azure AD authentication failed - the service principal lacks database permissions. "
                f"Please ensure the service principal is added as a database user with VIEW SERVER STATE permission. "
                f"Original error: {error_str}"
            )
        elif (
            "permission was denied" in error_str.lower()
            or "view server state" in error_str.lower()
        ):
            return (
                f"Insufficient database permissions - the user needs VIEW SERVER STATE permission to access system views. "
                f"Original error: {error_str}"
            )
        elif "login failed" in error_str.lower():
            return (
                f"Database login failed - check authentication credentials and database access permissions. "
                f"Original error: {error_str}"
            )
        else:
            return error_str

    def get_connection_metrics(
        self,
        resource_group: str,
        server_name: str,
        database_name: str,
        hours_back: int = 2,
    ) -> Dict:
        """Get connection-related metrics from Azure Monitor."""
        resource_id = (
            f"subscriptions/{self.subscription_id}/"
            f"resourceGroups/{resource_group}/"
            f"providers/Microsoft.Sql/servers/{server_name}/"
            f"databases/{database_name}"
        )

        end_time = datetime.now()
        # Use longer timespan for better data availability
        start_time = end_time - timedelta(hours=max(hours_back, 24))

        try:
            metrics_data = self.metrics_client.query_resource(
                resource_uri=resource_id,
                metric_names=[
                    "connection_successful",  # This exists
                    "sessions_count",  # This exists
                    "cpu_percent",  # This exists
                    "storage_percent",  # This exists
                ],
                timespan=(start_time, end_time),
                granularity=timedelta(hours=1),  # Larger granularity for better data
                aggregations=["Maximum", "Average", "Total"],
            )

            result = {}
            for metric in metrics_data.metrics:
                metric_data = []
                for timeseries in metric.timeseries:
                    for data_point in timeseries.data:
                        # Handle None values and pick the best available aggregation
                        value_data = {
                            "timestamp": data_point.timestamp.isoformat(),
                            "maximum": data_point.maximum
                            if data_point.maximum is not None
                            else 0,
                            "average": data_point.average
                            if data_point.average is not None
                            else 0,
                            "total": data_point.total
                            if data_point.total is not None
                            else 0,
                        }
                        metric_data.append(value_data)
                result[metric.name] = metric_data

            return result

        except Exception as e:
            logging.error(f"Failed to get connection metrics: {str(e)}")
            return {"error": str(e)}

    def get_active_connections(
        self, server_name: str, database_name: str
    ) -> List[Dict]:
        """Get currently active connections using DMV."""
        query = """
        SELECT
            s.session_id,
            s.login_name,
            s.host_name,
            s.program_name,
            s.login_time,
            s.last_request_start_time,
            s.last_request_end_time,
            s.status,
            s.cpu_time,
            s.memory_usage,
            s.total_scheduled_time,
            s.total_elapsed_time,
            s.reads,
            s.writes,
            s.logical_reads,
            CASE
                WHEN r.session_id IS NOT NULL THEN 'Active'
                ELSE 'Inactive'
            END as connection_status,
            r.blocking_session_id,
            r.wait_type,
            r.wait_time,
            r.wait_resource
        FROM sys.dm_exec_sessions s
        LEFT JOIN sys.dm_exec_requests r ON s.session_id = r.session_id
        WHERE s.is_user_process = 1
        ORDER BY s.login_time DESC;
        """

        try:
            return self.sql_api_client.execute_query(server_name, database_name, query)
        except Exception as e:
            formatted_error = self._format_sql_error(e)
            logging.error(f"Failed to get active connections: {formatted_error}")
            return []

    def get_connection_summary(self, server_name: str, database_name: str) -> Dict:
        """Get connection summary statistics."""
        query = """
        SELECT
            COUNT(*) as total_connections,
            COUNT(CASE WHEN r.session_id IS NOT NULL THEN 1 END) as active_connections,
            COUNT(CASE WHEN r.session_id IS NULL THEN 1 END) as idle_connections,
            COUNT(CASE WHEN r.blocking_session_id > 0 THEN 1 END) as blocked_connections,
            COUNT(DISTINCT s.login_name) as unique_users,
            COUNT(DISTINCT s.host_name) as unique_hosts,
            MAX(s.login_time) as latest_login,
            MIN(s.login_time) as earliest_login
        FROM sys.dm_exec_sessions s
        LEFT JOIN sys.dm_exec_requests r ON s.session_id = r.session_id
        WHERE s.is_user_process = 1;
        """

        try:
            result = self.sql_api_client.execute_query(
                server_name, database_name, query
            )
            return result[0] if result else {}
        except Exception as e:
            formatted_error = self._format_sql_error(e)
            logging.error(f"Failed to get connection summary: {formatted_error}")
            return {"error": formatted_error}

    def get_failed_connections(
        self, server_name: str, database_name: str, hours_back: int = 24
    ) -> List[Dict]:
        """Get failed connection attempts from extended events or system health."""
        # Note: This query looks for connectivity ring buffer events
        query = f"""
        WITH ConnectivityEvents AS (
            SELECT
                CAST(event_data AS XML) as event_xml,
                timestamp_utc
            FROM sys.fn_xe_file_target_read_file('system_health*.xel', null, null, null)
            WHERE object_name = 'connectivity_ring_buffer_recorded'
            AND timestamp_utc > DATEADD(hour, -{hours_back}, GETUTCDATE())
        )
        SELECT TOP 100
            timestamp_utc,
            event_xml.value('(/Record/ConnectivityTraceRecord/RecordType)[1]', 'varchar(50)') as record_type,
            event_xml.value('(/Record/ConnectivityTraceRecord/RecordSource)[1]', 'varchar(50)') as record_source,
            event_xml.value('(/Record/ConnectivityTraceRecord/Spid)[1]', 'int') as spid,
            event_xml.value('(/Record/ConnectivityTraceRecord/SniConsumerError)[1]', 'int') as sni_consumer_error,
            event_xml.value('(/Record/ConnectivityTraceRecord/State)[1]', 'int') as state,
            event_xml.value('(/Record/ConnectivityTraceRecord/RemoteHost)[1]', 'varchar(100)') as remote_host,
            event_xml.value('(/Record/ConnectivityTraceRecord/RemotePort)[1]', 'varchar(10)') as remote_port
        FROM ConnectivityEvents
        WHERE event_xml.value('(/Record/ConnectivityTraceRecord/RecordType)[1]', 'varchar(50)') LIKE '%Error%'
        ORDER BY timestamp_utc DESC;
        """

        try:
            return self.sql_api_client.execute_query(server_name, database_name, query)
        except Exception as e:
            logging.warning(
                f"Failed to get failed connections (extended events may not be available): {str(e)}"
            )
            # Fallback to a simpler approach using error log if available
            return []

    def get_connection_pool_stats(self, server_name: str, database_name: str) -> Dict:
        """Get connection pool related statistics."""
        query = """
        SELECT
            'Database Connections' as metric_name,
            COUNT(*) as current_value,
            'connections' as unit
        FROM sys.dm_exec_sessions
        WHERE is_user_process = 1
        UNION ALL
        SELECT
            'Active Requests' as metric_name,
            COUNT(*) as current_value,
            'requests' as unit
        FROM sys.dm_exec_requests
        WHERE session_id > 50
        UNION ALL
        SELECT
            'Waiting Tasks' as metric_name,
            COUNT(*) as current_value,
            'tasks' as unit
        FROM sys.dm_os_waiting_tasks
        WHERE session_id > 50;
        """

        try:
            results = self.sql_api_client.execute_query(
                server_name, database_name, query
            )
            return {
                row["metric_name"]: {"value": row["current_value"], "unit": row["unit"]}
                for row in results
            }
        except Exception as e:
            formatted_error = self._format_sql_error(e)
            logging.error(f"Failed to get connection pool stats: {formatted_error}")
            return {"error": formatted_error}
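
For reference, a minimal usage sketch of the ConnectionMonitoringAPI shown above (illustrative only, not part of the packaged code; DefaultAzureCredential comes from the separate azure-identity package, and the subscription and resource names are placeholders):

    from azure.identity import DefaultAzureCredential

    from holmes.plugins.toolsets.azure_sql.apis.connection_monitoring_api import (
        ConnectionMonitoringAPI,
    )

    # Hypothetical usage sketch; subscription and resource names are placeholders.
    credential = DefaultAzureCredential()
    api = ConnectionMonitoringAPI(credential, subscription_id="<subscription-id>")

    # Azure Monitor metrics for the database over the last 2 hours
    # (the method internally widens the timespan to at least 24 hours).
    metrics = api.get_connection_metrics(
        resource_group="my-resource-group",
        server_name="my-sql-server",
        database_name="my-database",
        hours_back=2,
    )

    # DMV-based snapshots; these require VIEW SERVER STATE on the database.
    summary = api.get_connection_summary("my-sql-server", "my-database")
    active_sessions = api.get_active_connections("my-sql-server", "my-database")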

holmes/plugins/toolsets/azure_sql/apis/storage_analysis_api.py
@@ -0,0 +1,317 @@
from typing import Dict, List
import logging
from datetime import datetime, timedelta
from azure.core.credentials import TokenCredential
from azure.monitor.query import MetricsQueryClient
from .azure_sql_api import AzureSQLAPIClient


class StorageAnalysisAPI:
    def __init__(
        self,
        credential: TokenCredential,
        subscription_id: str,
    ):
        self.sql_api_client = AzureSQLAPIClient(credential, subscription_id)
        self.metrics_client = MetricsQueryClient(credential)
        self.subscription_id = subscription_id

    def _format_sql_error(self, error: Exception) -> str:
        """Format SQL errors with helpful permission guidance."""
        error_str = str(error)

        # Detect common permission issues
        if (
            "Login failed for user" in error_str
            and "token-identified principal" in error_str
        ):
            return (
                f"Azure AD authentication failed - the service principal lacks database permissions. "
                f"Please ensure the service principal is added as a database user with appropriate permissions. "
                f"Original error: {error_str}"
            )
        elif "permission was denied" in error_str.lower():
            return (
                f"Insufficient database permissions - check user access rights. "
                f"Original error: {error_str}"
            )
        elif "login failed" in error_str.lower():
            return (
                f"Database login failed - check authentication credentials and database access permissions. "
                f"Original error: {error_str}"
            )
        else:
            return error_str

    def get_storage_metrics(
        self,
        resource_group: str,
        server_name: str,
        database_name: str,
        hours_back: int = 24,
    ) -> Dict:
        """Get storage-related metrics from Azure Monitor."""
        resource_id = (
            f"subscriptions/{self.subscription_id}/"
            f"resourceGroups/{resource_group}/"
            f"providers/Microsoft.Sql/servers/{server_name}/"
            f"databases/{database_name}"
        )

        end_time = datetime.now()
        start_time = end_time - timedelta(hours=hours_back)

        try:
            metrics_data = self.metrics_client.query_resource(
                resource_uri=resource_id,
                metric_names=[
                    "storage_percent",
                    "storage",
                    "allocated_data_storage",
                    "log_write_percent",
                    "tempdb_data_size",
                    "tempdb_log_size",
                    "tempdb_log_used_percent",
                ],
                timespan=(start_time, end_time),
                granularity=timedelta(minutes=15),
                aggregations=["Maximum", "Average", "Minimum"],
            )

            result = {}
            for metric in metrics_data.metrics:
                metric_data = []
                for timeseries in metric.timeseries:
                    for data_point in timeseries.data:
                        metric_data.append(
                            {
                                "timestamp": data_point.timestamp.isoformat(),
                                "maximum": data_point.maximum,
                                "average": data_point.average,
                                "minimum": data_point.minimum,
                            }
                        )
                result[metric.name] = metric_data

            return result

        except Exception as e:
            logging.error(f"Failed to get storage metrics: {str(e)}")
            return {"error": str(e)}

    def get_database_size_details(
        self, server_name: str, database_name: str
    ) -> List[Dict]:
        """Get detailed database size information using DMV."""
        query = """
        SELECT
            DB_NAME() as database_name,
            CASE
                WHEN type_desc = 'ROWS' THEN 'Data'
                WHEN type_desc = 'LOG' THEN 'Log'
                ELSE type_desc
            END as file_type,
            name as logical_name,
            physical_name,
            CAST(size * 8.0 / 1024 AS DECIMAL(10,2)) as size_mb,
            CAST(FILEPROPERTY(name, 'SpaceUsed') * 8.0 / 1024 AS DECIMAL(10,2)) as used_mb,
            CAST((size - FILEPROPERTY(name, 'SpaceUsed')) * 8.0 / 1024 AS DECIMAL(10,2)) as free_mb,
            CAST(FILEPROPERTY(name, 'SpaceUsed') * 100.0 / size AS DECIMAL(5,2)) as used_percent,
            CASE
                WHEN max_size = -1 THEN 'Unlimited'
                WHEN max_size = 268435456 THEN 'Default (2TB)'
                ELSE CAST(max_size * 8.0 / 1024 AS VARCHAR(20)) + ' MB'
            END as max_size,
            is_percent_growth,
            CASE
                WHEN is_percent_growth = 1 THEN CAST(growth AS VARCHAR(10)) + '%'
                ELSE CAST(growth * 8.0 / 1024 AS VARCHAR(20)) + ' MB'
            END as growth_setting,
            state_desc as file_state
        FROM sys.database_files
        ORDER BY type_desc, file_id;
        """

        try:
            return self.sql_api_client.execute_query(server_name, database_name, query)
        except Exception as e:
            logging.error(f"Failed to get database size details: {str(e)}")
            return [{"error": str(e)}]

    def get_storage_summary(self, server_name: str, database_name: str) -> Dict:
        """Get storage summary statistics."""
        query = """
        SELECT
            DB_NAME() as database_name,
            CAST(SUM(CASE WHEN type_desc = 'ROWS' THEN size END) * 8.0 / 1024 AS DECIMAL(10,2)) as total_data_size_mb,
            CAST(SUM(CASE WHEN type_desc = 'ROWS' THEN FILEPROPERTY(name, 'SpaceUsed') END) * 8.0 / 1024 AS DECIMAL(10,2)) as used_data_size_mb,
            CAST(SUM(CASE WHEN type_desc = 'LOG' THEN size END) * 8.0 / 1024 AS DECIMAL(10,2)) as total_log_size_mb,
            CAST(SUM(CASE WHEN type_desc = 'LOG' THEN FILEPROPERTY(name, 'SpaceUsed') END) * 8.0 / 1024 AS DECIMAL(10,2)) as used_log_size_mb,
            CAST((SUM(CASE WHEN type_desc = 'ROWS' THEN size END) +
                  SUM(CASE WHEN type_desc = 'LOG' THEN size END)) * 8.0 / 1024 AS DECIMAL(10,2)) as total_database_size_mb,
            CAST((SUM(CASE WHEN type_desc = 'ROWS' THEN FILEPROPERTY(name, 'SpaceUsed') END) +
                  SUM(CASE WHEN type_desc = 'LOG' THEN FILEPROPERTY(name, 'SpaceUsed') END)) * 8.0 / 1024 AS DECIMAL(10,2)) as total_used_size_mb,
            COUNT(CASE WHEN type_desc = 'ROWS' THEN 1 END) as data_files_count,
            COUNT(CASE WHEN type_desc = 'LOG' THEN 1 END) as log_files_count
        FROM sys.database_files;
        """

        try:
            result = self.sql_api_client.execute_query(
                server_name, database_name, query
            )
            return result[0] if result else {}
        except Exception as e:
            logging.error(f"Failed to get storage summary: {str(e)}")
            return {"error": str(e)}

    def get_table_space_usage(
        self, server_name: str, database_name: str, top_count: int = 20
    ) -> List[Dict]:
        """Get space usage by table/index."""
        query = f"""
        SELECT TOP {top_count}
            SCHEMA_NAME(t.schema_id) as schema_name,
            t.name as table_name,
            i.name as index_name,
            i.type_desc as index_type,
            p.rows as row_count,
            a.total_pages,
            a.used_pages,
            a.data_pages,
            CAST(a.total_pages * 8.0 / 1024 AS DECIMAL(10,2)) as total_space_mb,
            CAST(a.used_pages * 8.0 / 1024 AS DECIMAL(10,2)) as used_space_mb,
            CAST(a.data_pages * 8.0 / 1024 AS DECIMAL(10,2)) as data_space_mb,
            CAST((a.total_pages - a.used_pages) * 8.0 / 1024 AS DECIMAL(10,2)) as unused_space_mb,
            CAST((a.used_pages - a.data_pages) * 8.0 / 1024 AS DECIMAL(10,2)) as index_space_mb
        FROM sys.tables t
        INNER JOIN sys.indexes i ON t.object_id = i.object_id
        INNER JOIN sys.partitions p ON i.object_id = p.object_id AND i.index_id = p.index_id
        INNER JOIN (
            SELECT
                object_id,
                index_id,
                SUM(total_pages) as total_pages,
                SUM(used_pages) as used_pages,
                SUM(data_pages) as data_pages
            FROM sys.allocation_units au
            INNER JOIN sys.partitions p ON
                (au.type IN (1,3) AND au.container_id = p.hobt_id) OR
                (au.type = 2 AND au.container_id = p.partition_id)
            GROUP BY object_id, index_id
        ) a ON i.object_id = a.object_id AND i.index_id = a.index_id
        WHERE t.is_ms_shipped = 0
        ORDER BY a.total_pages DESC;
        """

        try:
            return self.sql_api_client.execute_query(server_name, database_name, query)
        except Exception as e:
            logging.error(f"Failed to get table space usage: {str(e)}")
            return []

    def get_storage_growth_trend(self, server_name: str, database_name: str) -> Dict:
        """Get storage growth trends from backup history."""
        query = """
        WITH BackupSizes AS (
            SELECT
                backup_start_date,
                database_name,
                backup_size,
                compressed_backup_size,
                type as backup_type,
                ROW_NUMBER() OVER (PARTITION BY CONVERT(date, backup_start_date) ORDER BY backup_start_date DESC) as rn
            FROM msdb.dbo.backupset
            WHERE database_name = DB_NAME()
            AND type = 'D' -- Full backups only
            AND backup_start_date >= DATEADD(day, -30, GETDATE())
        )
        SELECT
            CONVERT(date, backup_start_date) as backup_date,
            database_name,
            CAST(backup_size / 1024.0 / 1024.0 AS DECIMAL(10,2)) as backup_size_mb,
            CAST(compressed_backup_size / 1024.0 / 1024.0 AS DECIMAL(10,2)) as compressed_backup_size_mb,
            CAST((backup_size - compressed_backup_size) * 100.0 / backup_size AS DECIMAL(5,2)) as compression_ratio_percent
        FROM BackupSizes
        WHERE rn = 1 -- One backup per day
        ORDER BY backup_date DESC;
        """

        try:
            results = self.sql_api_client.execute_query(
                server_name, database_name, query
            )

            # Calculate growth trend if we have multiple data points
            if len(results) >= 2:
                oldest = results[-1]
                newest = results[0]

                if oldest["backup_size_mb"] and newest["backup_size_mb"]:
                    growth_mb = newest["backup_size_mb"] - oldest["backup_size_mb"]
                    growth_percent = (growth_mb / oldest["backup_size_mb"]) * 100
                    days_diff = (
                        datetime.strptime(str(newest["backup_date"]), "%Y-%m-%d")
                        - datetime.strptime(str(oldest["backup_date"]), "%Y-%m-%d")
                    ).days

                    return {
                        "backup_history": results,
                        "growth_analysis": {
                            "total_growth_mb": round(growth_mb, 2),
                            "growth_percent": round(growth_percent, 2),
                            "days_analyzed": days_diff,
                            "avg_daily_growth_mb": round(growth_mb / days_diff, 2)
                            if days_diff > 0
                            else 0,
                        },
                    }

            return {"backup_history": results, "growth_analysis": None}

        except Exception as e:
            logging.warning(
                f"Failed to get storage growth trend (backup history may not be available): {str(e)}"
            )
            return {"error": str(e)}

    def get_tempdb_usage(self, server_name: str, database_name: str) -> Dict:
        """Get tempdb usage information."""
        query = """
        SELECT
            'TempDB Usage' as metric_type,
            CAST(SUM(size) * 8.0 / 1024 AS DECIMAL(10,2)) as total_size_mb,
            CAST(SUM(FILEPROPERTY(name, 'SpaceUsed')) * 8.0 / 1024 AS DECIMAL(10,2)) as used_size_mb,
            CAST((SUM(size) - SUM(FILEPROPERTY(name, 'SpaceUsed'))) * 8.0 / 1024 AS DECIMAL(10,2)) as free_size_mb,
            CAST(SUM(FILEPROPERTY(name, 'SpaceUsed')) * 100.0 / SUM(size) AS DECIMAL(5,2)) as used_percent
        FROM tempdb.sys.database_files
        WHERE type_desc = 'ROWS'
        UNION ALL
        SELECT
            'TempDB Log' as metric_type,
            CAST(SUM(size) * 8.0 / 1024 AS DECIMAL(10,2)) as total_size_mb,
            CAST(SUM(FILEPROPERTY(name, 'SpaceUsed')) * 8.0 / 1024 AS DECIMAL(10,2)) as used_size_mb,
            CAST((SUM(size) - SUM(FILEPROPERTY(name, 'SpaceUsed'))) * 8.0 / 1024 AS DECIMAL(10,2)) as free_size_mb,
            CAST(SUM(FILEPROPERTY(name, 'SpaceUsed')) * 100.0 / SUM(size) AS DECIMAL(5,2)) as used_percent
        FROM tempdb.sys.database_files
        WHERE type_desc = 'LOG';
        """

        try:
            results = self.sql_api_client.execute_query(
                server_name, database_name, query
            )
            return {
                row["metric_type"]: {
                    "total_size_mb": row["total_size_mb"],
                    "used_size_mb": row["used_size_mb"],
                    "free_size_mb": row["free_size_mb"],
                    "used_percent": row["used_percent"],
                }
                for row in results
            }
        except Exception as e:
            logging.warning(
                f"Failed to get tempdb usage (may not have permissions): {str(e)}"
            )
            return {"error": str(e)}
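
Likewise, a minimal sketch of how the StorageAnalysisAPI above might be driven (illustrative only, not part of the packaged code; the subscription and resource names are placeholders):

    from azure.identity import DefaultAzureCredential

    from holmes.plugins.toolsets.azure_sql.apis.storage_analysis_api import (
        StorageAnalysisAPI,
    )

    # Hypothetical usage; subscription and resource names are placeholders.
    storage_api = StorageAnalysisAPI(DefaultAzureCredential(), "<subscription-id>")

    # File-level and summary sizes via DMVs.
    summary = storage_api.get_storage_summary("my-sql-server", "my-database")
    files = storage_api.get_database_size_details("my-sql-server", "my-database")

    # Ten largest tables/indexes by allocated pages.
    largest = storage_api.get_table_space_usage(
        "my-sql-server", "my-database", top_count=10
    )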

holmes/plugins/toolsets/azure_sql/azure_base_toolset.py
@@ -0,0 +1,55 @@
from typing import Optional, Tuple

from pydantic import BaseModel, ConfigDict

from holmes.core.tools import Tool, Toolset
from holmes.plugins.toolsets.azure_sql.apis.azure_sql_api import AzureSQLAPIClient


class AzureSQLDatabaseConfig(BaseModel):
    subscription_id: str
    resource_group: str
    server_name: str
    database_name: str


class AzureSQLConfig(BaseModel):
    database: AzureSQLDatabaseConfig
    tenant_id: Optional[str]
    client_id: Optional[str]
    client_secret: Optional[str]


class BaseAzureSQLToolset(Toolset):
    model_config = ConfigDict(arbitrary_types_allowed=True)
    _api_client: Optional[AzureSQLAPIClient] = None
    _database_config: Optional[AzureSQLDatabaseConfig] = None

    def api_client(self):
        if not self._api_client:
            raise Exception(
                "Toolset is missing api_client. This is likely a code issue and not a configuration issue"
            )
        else:
            return self._api_client

    def database_config(self):
        if not self._database_config:
            raise Exception(
                "Toolset is missing database_config. This is likely a code issue and not a configuration issue"
            )
        else:
            return self._database_config


class BaseAzureSQLTool(Tool):
    toolset: BaseAzureSQLToolset

    @staticmethod
    def validate_config(
        api_client: AzureSQLAPIClient, database_config: AzureSQLDatabaseConfig
    ) -> Tuple[bool, str]:
        # Each tool is able to validate whether it can work and generate output with this config.
        # The tool should report an error if a permission is missing. e.g. return False, "The client '597a70b9-9f01-4739-ac3e-ac8a934e9ffc' with object id '597a70b9-9f01-4739-ac3e-ac8a934e9ffc' does not have authorization to perform action 'Microsoft.Insights/metricAlerts/read' over scope '/subscriptions/e7a7e3c5-ff48-4ccb-898b-83aa5d2f9097/resourceGroups/arik-aks-dev_group/providers/Microsoft.Insights' or the scope is invalid."
        # The tool should return multiple errors in the return message if there are multiple issues that prevent it from fully working
        return True, ""
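
The comments in validate_config above describe the intended contract: each concrete tool checks whether it can work with the given configuration and reports every missing permission it finds. A hypothetical sketch of that pattern (not part of the package; the class name and the SELECT 1 probe are assumptions, and the tool-execution methods required by holmes.core.tools are omitted):

    from typing import Tuple

    from holmes.plugins.toolsets.azure_sql.apis.azure_sql_api import AzureSQLAPIClient
    from holmes.plugins.toolsets.azure_sql.azure_base_toolset import (
        AzureSQLDatabaseConfig,
        BaseAzureSQLTool,
    )


    class HypotheticalConnectivityTool(BaseAzureSQLTool):
        @staticmethod
        def validate_config(
            api_client: AzureSQLAPIClient, database_config: AzureSQLDatabaseConfig
        ) -> Tuple[bool, str]:
            errors = []
            try:
                # Lightweight probe: a login or permission failure here means the
                # configured identity cannot query the target database.
                api_client.execute_query(
                    database_config.server_name,
                    database_config.database_name,
                    "SELECT 1 AS probe;",
                )
            except Exception as e:
                errors.append(f"Cannot query database: {e}")
            # Collect every problem found so the caller sees all of them at once.
            return (len(errors) == 0, "\n".join(errors))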