holmesgpt 0.13.2__py3-none-any.whl → 0.18.4__py3-none-any.whl

This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (188)
  1. holmes/__init__.py +3 -5
  2. holmes/clients/robusta_client.py +20 -6
  3. holmes/common/env_vars.py +58 -3
  4. holmes/common/openshift.py +1 -1
  5. holmes/config.py +123 -148
  6. holmes/core/conversations.py +71 -15
  7. holmes/core/feedback.py +191 -0
  8. holmes/core/investigation.py +31 -39
  9. holmes/core/investigation_structured_output.py +3 -3
  10. holmes/core/issue.py +1 -1
  11. holmes/core/llm.py +508 -88
  12. holmes/core/models.py +108 -4
  13. holmes/core/openai_formatting.py +14 -1
  14. holmes/core/prompt.py +48 -3
  15. holmes/core/runbooks.py +1 -0
  16. holmes/core/safeguards.py +8 -6
  17. holmes/core/supabase_dal.py +295 -100
  18. holmes/core/tool_calling_llm.py +489 -428
  19. holmes/core/tools.py +325 -56
  20. holmes/core/tools_utils/token_counting.py +21 -0
  21. holmes/core/tools_utils/tool_context_window_limiter.py +40 -0
  22. holmes/core/tools_utils/tool_executor.py +0 -13
  23. holmes/core/tools_utils/toolset_utils.py +1 -0
  24. holmes/core/toolset_manager.py +191 -5
  25. holmes/core/tracing.py +19 -3
  26. holmes/core/transformers/__init__.py +23 -0
  27. holmes/core/transformers/base.py +63 -0
  28. holmes/core/transformers/llm_summarize.py +175 -0
  29. holmes/core/transformers/registry.py +123 -0
  30. holmes/core/transformers/transformer.py +32 -0
  31. holmes/core/truncation/compaction.py +94 -0
  32. holmes/core/truncation/dal_truncation_utils.py +23 -0
  33. holmes/core/truncation/input_context_window_limiter.py +219 -0
  34. holmes/interactive.py +228 -31
  35. holmes/main.py +23 -40
  36. holmes/plugins/interfaces.py +2 -1
  37. holmes/plugins/prompts/__init__.py +2 -1
  38. holmes/plugins/prompts/_fetch_logs.jinja2 +31 -6
  39. holmes/plugins/prompts/_general_instructions.jinja2 +1 -2
  40. holmes/plugins/prompts/_runbook_instructions.jinja2 +24 -12
  41. holmes/plugins/prompts/base_user_prompt.jinja2 +7 -0
  42. holmes/plugins/prompts/conversation_history_compaction.jinja2 +89 -0
  43. holmes/plugins/prompts/generic_ask.jinja2 +0 -4
  44. holmes/plugins/prompts/generic_ask_conversation.jinja2 +0 -1
  45. holmes/plugins/prompts/generic_ask_for_issue_conversation.jinja2 +0 -1
  46. holmes/plugins/prompts/generic_investigation.jinja2 +0 -1
  47. holmes/plugins/prompts/investigation_procedure.jinja2 +50 -1
  48. holmes/plugins/prompts/kubernetes_workload_ask.jinja2 +0 -1
  49. holmes/plugins/prompts/kubernetes_workload_chat.jinja2 +0 -1
  50. holmes/plugins/runbooks/__init__.py +145 -17
  51. holmes/plugins/runbooks/catalog.json +2 -0
  52. holmes/plugins/sources/github/__init__.py +4 -2
  53. holmes/plugins/sources/prometheus/models.py +1 -0
  54. holmes/plugins/toolsets/__init__.py +44 -27
  55. holmes/plugins/toolsets/aks-node-health.yaml +46 -0
  56. holmes/plugins/toolsets/aks.yaml +64 -0
  57. holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +38 -47
  58. holmes/plugins/toolsets/azure_sql/apis/alert_monitoring_api.py +3 -2
  59. holmes/plugins/toolsets/azure_sql/apis/azure_sql_api.py +2 -1
  60. holmes/plugins/toolsets/azure_sql/apis/connection_failure_api.py +3 -2
  61. holmes/plugins/toolsets/azure_sql/apis/connection_monitoring_api.py +3 -1
  62. holmes/plugins/toolsets/azure_sql/apis/storage_analysis_api.py +3 -1
  63. holmes/plugins/toolsets/azure_sql/azure_sql_toolset.py +12 -13
  64. holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +15 -12
  65. holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +15 -12
  66. holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +11 -11
  67. holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +11 -9
  68. holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +15 -12
  69. holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +15 -15
  70. holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +11 -8
  71. holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +11 -8
  72. holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +11 -8
  73. holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +11 -8
  74. holmes/plugins/toolsets/azure_sql/utils.py +0 -32
  75. holmes/plugins/toolsets/bash/argocd/__init__.py +3 -3
  76. holmes/plugins/toolsets/bash/aws/__init__.py +4 -4
  77. holmes/plugins/toolsets/bash/azure/__init__.py +4 -4
  78. holmes/plugins/toolsets/bash/bash_toolset.py +11 -15
  79. holmes/plugins/toolsets/bash/common/bash.py +23 -13
  80. holmes/plugins/toolsets/bash/common/bash_command.py +1 -1
  81. holmes/plugins/toolsets/bash/common/stringify.py +1 -1
  82. holmes/plugins/toolsets/bash/kubectl/__init__.py +2 -1
  83. holmes/plugins/toolsets/bash/kubectl/constants.py +0 -1
  84. holmes/plugins/toolsets/bash/kubectl/kubectl_get.py +3 -4
  85. holmes/plugins/toolsets/bash/parse_command.py +12 -13
  86. holmes/plugins/toolsets/cilium.yaml +284 -0
  87. holmes/plugins/toolsets/connectivity_check.py +124 -0
  88. holmes/plugins/toolsets/coralogix/api.py +132 -119
  89. holmes/plugins/toolsets/coralogix/coralogix.jinja2 +14 -0
  90. holmes/plugins/toolsets/coralogix/toolset_coralogix.py +219 -0
  91. holmes/plugins/toolsets/coralogix/utils.py +15 -79
  92. holmes/plugins/toolsets/datadog/datadog_api.py +525 -26
  93. holmes/plugins/toolsets/datadog/datadog_logs_instructions.jinja2 +55 -11
  94. holmes/plugins/toolsets/datadog/datadog_metrics_instructions.jinja2 +3 -3
  95. holmes/plugins/toolsets/datadog/datadog_models.py +59 -0
  96. holmes/plugins/toolsets/datadog/datadog_url_utils.py +213 -0
  97. holmes/plugins/toolsets/datadog/instructions_datadog_traces.jinja2 +165 -28
  98. holmes/plugins/toolsets/datadog/toolset_datadog_general.py +417 -241
  99. holmes/plugins/toolsets/datadog/toolset_datadog_logs.py +234 -214
  100. holmes/plugins/toolsets/datadog/toolset_datadog_metrics.py +167 -79
  101. holmes/plugins/toolsets/datadog/toolset_datadog_traces.py +374 -363
  102. holmes/plugins/toolsets/elasticsearch/__init__.py +6 -0
  103. holmes/plugins/toolsets/elasticsearch/elasticsearch.py +834 -0
  104. holmes/plugins/toolsets/elasticsearch/opensearch_ppl_query_docs.jinja2 +1616 -0
  105. holmes/plugins/toolsets/elasticsearch/opensearch_query_assist.py +78 -0
  106. holmes/plugins/toolsets/elasticsearch/opensearch_query_assist_instructions.jinja2 +223 -0
  107. holmes/plugins/toolsets/git.py +54 -50
  108. holmes/plugins/toolsets/grafana/base_grafana_toolset.py +16 -4
  109. holmes/plugins/toolsets/grafana/common.py +13 -29
  110. holmes/plugins/toolsets/grafana/grafana_tempo_api.py +455 -0
  111. holmes/plugins/toolsets/grafana/loki/instructions.jinja2 +25 -0
  112. holmes/plugins/toolsets/grafana/loki/toolset_grafana_loki.py +191 -0
  113. holmes/plugins/toolsets/grafana/loki_api.py +4 -0
  114. holmes/plugins/toolsets/grafana/toolset_grafana.py +293 -89
  115. holmes/plugins/toolsets/grafana/toolset_grafana_dashboard.jinja2 +49 -0
  116. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.jinja2 +246 -11
  117. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +820 -292
  118. holmes/plugins/toolsets/grafana/trace_parser.py +4 -3
  119. holmes/plugins/toolsets/internet/internet.py +15 -16
  120. holmes/plugins/toolsets/internet/notion.py +9 -11
  121. holmes/plugins/toolsets/investigator/core_investigation.py +44 -36
  122. holmes/plugins/toolsets/investigator/model.py +3 -1
  123. holmes/plugins/toolsets/json_filter_mixin.py +134 -0
  124. holmes/plugins/toolsets/kafka.py +36 -42
  125. holmes/plugins/toolsets/kubernetes.yaml +317 -113
  126. holmes/plugins/toolsets/kubernetes_logs.py +9 -9
  127. holmes/plugins/toolsets/kubernetes_logs.yaml +32 -0
  128. holmes/plugins/toolsets/logging_utils/logging_api.py +94 -8
  129. holmes/plugins/toolsets/mcp/toolset_mcp.py +218 -64
  130. holmes/plugins/toolsets/newrelic/new_relic_api.py +165 -0
  131. holmes/plugins/toolsets/newrelic/newrelic.jinja2 +65 -0
  132. holmes/plugins/toolsets/newrelic/newrelic.py +320 -0
  133. holmes/plugins/toolsets/openshift.yaml +283 -0
  134. holmes/plugins/toolsets/prometheus/prometheus.py +1202 -421
  135. holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2 +54 -5
  136. holmes/plugins/toolsets/prometheus/utils.py +28 -0
  137. holmes/plugins/toolsets/rabbitmq/api.py +23 -4
  138. holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +13 -14
  139. holmes/plugins/toolsets/robusta/robusta.py +239 -68
  140. holmes/plugins/toolsets/robusta/robusta_instructions.jinja2 +26 -9
  141. holmes/plugins/toolsets/runbook/runbook_fetcher.py +157 -27
  142. holmes/plugins/toolsets/service_discovery.py +1 -1
  143. holmes/plugins/toolsets/servicenow_tables/instructions.jinja2 +83 -0
  144. holmes/plugins/toolsets/servicenow_tables/servicenow_tables.py +426 -0
  145. holmes/plugins/toolsets/utils.py +88 -0
  146. holmes/utils/config_utils.py +91 -0
  147. holmes/utils/connection_utils.py +31 -0
  148. holmes/utils/console/result.py +10 -0
  149. holmes/utils/default_toolset_installation_guide.jinja2 +1 -22
  150. holmes/utils/env.py +7 -0
  151. holmes/utils/file_utils.py +2 -1
  152. holmes/utils/global_instructions.py +60 -11
  153. holmes/utils/holmes_status.py +6 -4
  154. holmes/utils/holmes_sync_toolsets.py +0 -2
  155. holmes/utils/krr_utils.py +188 -0
  156. holmes/utils/log.py +15 -0
  157. holmes/utils/markdown_utils.py +2 -3
  158. holmes/utils/memory_limit.py +58 -0
  159. holmes/utils/sentry_helper.py +64 -0
  160. holmes/utils/stream.py +69 -8
  161. holmes/utils/tags.py +4 -3
  162. holmes/version.py +37 -15
  163. holmesgpt-0.18.4.dist-info/LICENSE +178 -0
  164. {holmesgpt-0.13.2.dist-info → holmesgpt-0.18.4.dist-info}/METADATA +35 -31
  165. holmesgpt-0.18.4.dist-info/RECORD +258 -0
  166. holmes/core/performance_timing.py +0 -72
  167. holmes/plugins/toolsets/aws.yaml +0 -80
  168. holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +0 -112
  169. holmes/plugins/toolsets/datadog/datadog_traces_formatter.py +0 -310
  170. holmes/plugins/toolsets/datadog/toolset_datadog_rds.py +0 -739
  171. holmes/plugins/toolsets/grafana/grafana_api.py +0 -42
  172. holmes/plugins/toolsets/grafana/tempo_api.py +0 -124
  173. holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +0 -110
  174. holmes/plugins/toolsets/newrelic.py +0 -231
  175. holmes/plugins/toolsets/opensearch/opensearch.py +0 -257
  176. holmes/plugins/toolsets/opensearch/opensearch_logs.py +0 -161
  177. holmes/plugins/toolsets/opensearch/opensearch_traces.py +0 -218
  178. holmes/plugins/toolsets/opensearch/opensearch_traces_instructions.jinja2 +0 -12
  179. holmes/plugins/toolsets/opensearch/opensearch_utils.py +0 -166
  180. holmes/plugins/toolsets/servicenow/install.md +0 -37
  181. holmes/plugins/toolsets/servicenow/instructions.jinja2 +0 -3
  182. holmes/plugins/toolsets/servicenow/servicenow.py +0 -219
  183. holmes/utils/keygen_utils.py +0 -6
  184. holmesgpt-0.13.2.dist-info/LICENSE.txt +0 -21
  185. holmesgpt-0.13.2.dist-info/RECORD +0 -234
  186. /holmes/plugins/toolsets/{opensearch → newrelic}/__init__.py +0 -0
  187. {holmesgpt-0.13.2.dist-info → holmesgpt-0.18.4.dist-info}/WHEEL +0 -0
  188. {holmesgpt-0.13.2.dist-info → holmesgpt-0.18.4.dist-info}/entry_points.txt +0 -0
holmes/plugins/toolsets/datadog/toolset_datadog_rds.py
@@ -1,739 +0,0 @@
- import json
- import logging
- import os
- from datetime import datetime, timezone
- from typing import Any, Dict, List, Optional, Tuple
-
-
- from holmes.core.tools import (
-     CallablePrerequisite,
-     StructuredToolResult,
-     Tool,
-     ToolParameter,
-     ToolResultStatus,
-     Toolset,
-     ToolsetTag,
- )
- from holmes.plugins.toolsets.consts import (
-     TOOLSET_CONFIG_MISSING_ERROR,
-     STANDARD_END_DATETIME_TOOL_PARAM_DESCRIPTION,
- )
- from holmes.plugins.toolsets.datadog.datadog_api import (
-     DatadogBaseConfig,
-     DataDogRequestError,
-     execute_datadog_http_request,
-     get_headers,
- )
- from holmes.plugins.toolsets.utils import (
-     get_param_or_raise,
-     process_timestamps_to_int,
-     standard_start_datetime_tool_param_description,
- )
-
- DEFAULT_TIME_SPAN_SECONDS = 3600
- DEFAULT_TOP_INSTANCES = 10
-
- # Metric definitions
- LATENCY_METRICS = [
-     ("aws.rds.read_latency", "Read Latency", "ms"),
-     ("aws.rds.write_latency", "Write Latency", "ms"),
-     ("aws.rds.commit_latency", "Commit Latency", "ms"),
-     ("aws.rds.disk_queue_depth", "Disk Queue Depth", ""),
- ]
-
- RESOURCE_METRICS = [
-     ("aws.rds.cpuutilization", "CPU Utilization", "%"),
-     ("aws.rds.database_connections", "Database Connections", "connections"),
-     ("aws.rds.freeable_memory", "Freeable Memory", "bytes"),
-     ("aws.rds.swap_usage", "Swap Usage", "bytes"),
- ]
-
- STORAGE_METRICS = [
-     ("aws.rds.read_iops", "Read IOPS", "iops"),
-     ("aws.rds.write_iops", "Write IOPS", "iops"),
-     ("aws.rds.burst_balance", "Burst Balance", "%"),
-     ("aws.rds.free_storage_space", "Free Storage Space", "bytes"),
- ]
-
-
- class DatadogRDSConfig(DatadogBaseConfig):
-     default_time_span_seconds: int = DEFAULT_TIME_SPAN_SECONDS
-     default_top_instances: int = DEFAULT_TOP_INSTANCES
-
-
- class BaseDatadogRDSTool(Tool):
-     toolset: "DatadogRDSToolset"
-
-
- class GenerateRDSPerformanceReport(BaseDatadogRDSTool):
-     def __init__(self, toolset: "DatadogRDSToolset"):
-         super().__init__(
-             name="datadog_rds_performance_report",
-             description="Generate a comprehensive performance report for a specific RDS instance including latency, resource utilization, and storage metrics with analysis",
-             parameters={
-                 "db_instance_identifier": ToolParameter(
-                     description="The RDS database instance identifier",
-                     type="string",
-                     required=True,
-                 ),
-                 "start_time": ToolParameter(
-                     description=standard_start_datetime_tool_param_description(
-                         DEFAULT_TIME_SPAN_SECONDS
-                     ),
-                     type="string",
-                     required=False,
-                 ),
-                 "end_time": ToolParameter(
-                     description=STANDARD_END_DATETIME_TOOL_PARAM_DESCRIPTION,
-                     type="string",
-                     required=False,
-                 ),
-             },
-             toolset=toolset,
-         )
-
-     def _invoke(
-         self, params: dict, user_approved: bool = False
-     ) -> StructuredToolResult:
-         if not self.toolset.dd_config:
-             return StructuredToolResult(
-                 status=ToolResultStatus.ERROR,
-                 error=TOOLSET_CONFIG_MISSING_ERROR,
-                 params=params,
-             )
-
-         try:
-             db_instance = get_param_or_raise(params, "db_instance_identifier")
-             start_time, end_time = process_timestamps_to_int(
-                 start=params.get("start_time"),
-                 end=params.get("end_time"),
-                 default_time_span_seconds=self.toolset.dd_config.default_time_span_seconds,
-             )
-
-             report: dict[str, Any] = {
-                 "instance_id": db_instance,
-                 "report_time": datetime.now(timezone.utc).isoformat(),
-                 "time_range": {
-                     "start": datetime.fromtimestamp(
-                         start_time, tz=timezone.utc
-                     ).isoformat(),
-                     "end": datetime.fromtimestamp(
-                         end_time, tz=timezone.utc
-                     ).isoformat(),
-                 },
-                 "sections": {},
-                 "issues": [],
-                 "executive_summary": "",
-             }
-
-             # Collect all metrics
-             all_metrics = []
-             for metric_group, group_name in [
-                 (LATENCY_METRICS, "latency"),
-                 (RESOURCE_METRICS, "resources"),
-                 (STORAGE_METRICS, "storage"),
-             ]:
-                 section_data = self._collect_metrics(
-                     db_instance, metric_group, start_time, end_time
-                 )
-                 if section_data:
-                     report["sections"][group_name] = section_data
-                     all_metrics.extend(section_data.get("metrics", {}).items())
-
-             # Analyze metrics and generate insights
-             self._analyze_metrics(report, all_metrics)
-
-             # Generate executive summary
-             report["executive_summary"] = self._generate_executive_summary(report)
-
-             # Format the report as readable text
-             formatted_report = self._format_report(report)
-
-             return StructuredToolResult(
-                 status=ToolResultStatus.SUCCESS,
-                 data=formatted_report,
-                 params=params,
-             )
-
-         except Exception as e:
-             logging.error(f"Error generating RDS performance report: {str(e)}")
-             return StructuredToolResult(
-                 status=ToolResultStatus.ERROR,
-                 error=f"Failed to generate RDS performance report: {str(e)}",
-                 params=params,
-             )
-
-     def _collect_metrics(
-         self,
-         db_instance: str,
-         metric_list: List[Tuple[str, str, str]],
-         start_time: int,
-         end_time: int,
-     ) -> Dict[str, Any]:
-         """Collect metrics for a specific group"""
-         if not self.toolset.dd_config:
-             raise Exception(TOOLSET_CONFIG_MISSING_ERROR)
-
-         metrics = {}
-
-         for metric_name, display_name, unit in metric_list:
-             query = f"{metric_name}{{dbinstanceidentifier:{db_instance}}}"
-
-             try:
-                 url = f"{self.toolset.dd_config.site_api_url}/api/v1/query"
-                 headers = get_headers(self.toolset.dd_config)
-                 payload = {
-                     "query": query,
-                     "from": start_time,
-                     "to": end_time,
-                 }
-
-                 response = execute_datadog_http_request(
-                     url=url,
-                     headers=headers,
-                     payload_or_params=payload,
-                     timeout=self.toolset.dd_config.request_timeout,
-                     method="GET",
-                 )
-
-                 if response and "series" in response and response["series"]:
-                     series = response["series"][0]
-                     points = series.get("pointlist", [])
-
-                     if points:
-                         values = [p[1] for p in points if p[1] is not None]
-                         if values:
-                             metrics[display_name] = {
-                                 "unit": unit
-                                 or series.get("unit", [{"short_name": ""}])[0].get(
-                                     "short_name", ""
-                                 ),
-                                 "avg": round(sum(values) / len(values), 2),
-                                 "max": round(max(values), 2),
-                                 "min": round(min(values), 2),
-                                 "latest": round(values[-1], 2),
-                                 "data_points": len(values),
-                             }
-             except DataDogRequestError:
-                 continue
-
-         return {"metrics": metrics} if metrics else {}
-
-     def _analyze_metrics(self, report: Dict, all_metrics: List[Tuple[str, Dict]]):
-         """Analyze metrics and generate issues"""
-         for metric_name, data in all_metrics:
-             # Latency analysis
-             if "Latency" in metric_name and metric_name != "Commit Latency":
-                 if data["avg"] > 10:
-                     report["issues"].append(
-                         f"{metric_name} averaging {data['avg']}ms (above 10ms threshold)"
-                     )
-                 if data["max"] > 50:
-                     report["issues"].append(f"{metric_name} peaked at {data['max']}ms")
-
-             # Disk queue depth
-             elif metric_name == "Disk Queue Depth":
-                 if data["avg"] > 5:
-                     report["issues"].append(
-                         f"High disk queue depth (avg: {data['avg']})"
-                     )
-
-             # CPU utilization
-             elif metric_name == "CPU Utilization":
-                 if data["avg"] > 70:
-                     report["issues"].append(
-                         f"High CPU utilization (avg: {data['avg']}%)"
-                     )
-                 if data["max"] > 90:
-                     report["issues"].append(
-                         f"CPU saturation detected (max: {data['max']}%)"
-                     )
-
-             # Memory
-             elif metric_name == "Freeable Memory":
-                 if data["min"] < 100 * 1024 * 1024:  # 100MB
-                     report["issues"].append(
-                         f"Low memory availability (min: {data['min'] / 1024 / 1024:.1f}MB)"
-                     )
-
-             # Swap usage
-             elif metric_name == "Swap Usage":
-                 if data["avg"] > 0:
-                     report["issues"].append(
-                         "Swap usage detected, indicating memory pressure"
-                     )
-
-             # Burst balance
-             elif metric_name == "Burst Balance":
-                 if data["min"] < 30:
-                     report["issues"].append(
-                         f"Low burst balance detected (min: {data['min']}%)"
-                     )
-
-             # IOPS
-             elif "IOPS" in metric_name:
-                 if data["max"] > 3000:
-                     report["issues"].append(
-                         f"High {metric_name} (max: {data['max']} IOPS)"
-                     )
-
-     def _generate_executive_summary(self, report: Dict) -> str:
-         """Generate executive summary"""
-         issue_count = len(report["issues"])
-
-         if issue_count == 0:
-             return "Database is operating within normal parameters. No significant issues detected."
-         elif issue_count <= 2:
-             severity = "Low"
-         elif issue_count <= 5:
-             severity = "Medium"
-         else:
-             severity = "High"
-
-         summary = f"Performance diagnosis: {severity} severity - {issue_count} issues detected.\n\n"
-
-         # Add key findings
-         if any("latency" in issue.lower() for issue in report["issues"]):
-             summary += "• Latency issues affecting database response times\n"
-         if any("cpu" in issue.lower() for issue in report["issues"]):
-             summary += "• CPU resource constraints detected\n"
-         if any(
-             "memory" in issue.lower() or "swap" in issue.lower()
-             for issue in report["issues"]
-         ):
-             summary += "• Memory pressure affecting performance\n"
-         if any(
-             "burst" in issue.lower() or "iops" in issue.lower()
-             for issue in report["issues"]
-         ):
-             summary += "• Storage I/O bottlenecks identified\n"
-
-         return summary
-
-     def _format_report(self, report: Dict) -> str:
-         """Format the report as readable text"""
-         lines = []
-         lines.append(f"RDS Performance Report - {report['instance_id']}")
-         lines.append("=" * 70)
-         lines.append(f"Generated: {report['report_time']}")
-         lines.append(
-             f"Time Range: {report['time_range']['start']} to {report['time_range']['end']}"
-         )
-         lines.append("")
-
-         # Executive Summary
-         lines.append("EXECUTIVE SUMMARY")
-         lines.append("-" * 40)
-         lines.append(report["executive_summary"])
-         lines.append("")
-
-         # Metrics sections
-         for section_name, section_data in report["sections"].items():
-             lines.append(f"{section_name.upper()} METRICS")
-             lines.append("-" * 40)
-
-             if section_data.get("metrics"):
-                 lines.append(
-                     f"{'Metric':<25} {'Avg':>10} {'Max':>10} {'Min':>10} {'Latest':>10} {'Unit':>8}"
-                 )
-                 lines.append("-" * 80)
-
-                 for metric_name, data in section_data["metrics"].items():
-                     lines.append(
-                         f"{metric_name:<25} {data['avg']:>10.2f} {data['max']:>10.2f} "
-                         f"{data['min']:>10.2f} {data['latest']:>10.2f} {data['unit']:>8}"
-                     )
-             lines.append("")
-
-         # Issues
-         if report["issues"]:
-             lines.append(f"ISSUES DETECTED ({len(report['issues'])})")
-             lines.append("-" * 40)
-             for i, issue in enumerate(report["issues"], 1):
-                 lines.append(f"{i}. {issue}")
-             lines.append("")
-
-         return "\n".join(lines)
-
-     def get_parameterized_one_liner(self, params: Dict[str, Any]) -> str:
-         db_instance = params.get("db_instance_identifier", "unknown")
-         return f"Generating performance report for RDS instance: {db_instance}"
-
-
- class GetTopWorstPerformingRDSInstances(BaseDatadogRDSTool):
-     def __init__(self, toolset: "DatadogRDSToolset"):
-         super().__init__(
-             name="datadog_rds_top_worst_performing",
-             description="Get a summarized report of the top worst performing RDS instances based on latency, CPU utilization, and error rates",
-             parameters={
-                 "top_n": ToolParameter(
-                     description=f"Number of worst performing instances to return (default: {DEFAULT_TOP_INSTANCES})",
-                     type="number",
-                     required=False,
-                 ),
-                 "start_time": ToolParameter(
-                     description=standard_start_datetime_tool_param_description(
-                         DEFAULT_TIME_SPAN_SECONDS
-                     ),
-                     type="string",
-                     required=False,
-                 ),
-                 "end_time": ToolParameter(
-                     description=STANDARD_END_DATETIME_TOOL_PARAM_DESCRIPTION,
-                     type="string",
-                     required=False,
-                 ),
-                 "sort_by": ToolParameter(
-                     description="Metric to sort by: 'latency' (default), 'cpu', 'errors', or 'composite'",
-                     type="string",
-                     required=False,
-                 ),
-             },
-             toolset=toolset,
-         )
-
-     def _invoke(
-         self, params: dict, user_approved: bool = False
-     ) -> StructuredToolResult:
-         if not self.toolset.dd_config:
-             return StructuredToolResult(
-                 status=ToolResultStatus.ERROR,
-                 error=TOOLSET_CONFIG_MISSING_ERROR,
-                 params=params,
-             )
-
-         try:
-             top_n = params.get("top_n", self.toolset.dd_config.default_top_instances)
-             sort_by = params.get("sort_by", "latency").lower()
-             start_time, end_time = process_timestamps_to_int(
-                 start=params.get("start_time"),
-                 end=params.get("end_time"),
-                 default_time_span_seconds=self.toolset.dd_config.default_time_span_seconds,
-             )
-
-             # Get all RDS instances
-             instances = self._get_all_rds_instances(start_time, end_time)
-
-             if not instances:
-                 return StructuredToolResult(
-                     status=ToolResultStatus.NO_DATA,
-                     data="No RDS instances found with metrics in the specified time range",
-                     params=params,
-                 )
-
-             # Collect performance data for each instance
-             instance_performance = []
-             for instance_id in instances[:50]:  # Limit to 50 instances to avoid timeout
-                 perf_data = self._get_instance_performance_summary(
-                     instance_id, start_time, end_time
-                 )
-                 if perf_data:
-                     instance_performance.append(perf_data)
-
-             # Sort by the specified metric
-             instance_performance = self._sort_instances(instance_performance, sort_by)
-
-             # Get top N worst performers
-             worst_performers = instance_performance[:top_n]
-
-             # Format the report
-             report = self._format_summary_report(worst_performers, sort_by)
-
-             report += f"\n\nTotal instances analyzed: {len(instance_performance)}"
-             report += f"\n\nInstances:\n{json.dumps(worst_performers, indent=2)}"
-
-             return StructuredToolResult(
-                 status=ToolResultStatus.SUCCESS,
-                 data=report,
-                 params=params,
-             )
-
-         except Exception as e:
-             logging.error(f"Error getting top worst performing RDS instances: {str(e)}")
-             return StructuredToolResult(
-                 status=ToolResultStatus.ERROR,
-                 error=f"Failed to get top worst performing RDS instances: {str(e)}",
-                 params=params,
-             )
-
-     def _get_all_rds_instances(self, start_time: int, end_time: int) -> List[str]:
-         """Get list of all RDS instances with metrics"""
-         if not self.toolset.dd_config:
-             raise Exception(TOOLSET_CONFIG_MISSING_ERROR)
-         try:
-             # Query for any RDS metric grouped by instance
-             query = "avg:aws.rds.cpuutilization{*} by {dbinstanceidentifier}"
-
-             url = f"{self.toolset.dd_config.site_api_url}/api/v1/query"
-             headers = get_headers(self.toolset.dd_config)
-             payload = {
-                 "query": query,
-                 "from": start_time,
-                 "to": end_time,
-             }
-
-             response = execute_datadog_http_request(
-                 url=url,
-                 headers=headers,
-                 payload_or_params=payload,
-                 timeout=self.toolset.dd_config.request_timeout,
-                 method="GET",
-             )
-
-             instances = []
-             if response and "series" in response:
-                 for series in response["series"]:
-                     # Extract instance ID from tags
-                     scope = series.get("scope", "")
-                     if "dbinstanceidentifier:" in scope:
-                         instance_id = scope.split("dbinstanceidentifier:")[1].split(
-                             ","
-                         )[0]
-                         instances.append(instance_id)
-
-             return list(set(instances))  # Remove duplicates
-
-         except Exception as e:
-             logging.error(f"Error getting RDS instances: {str(e)}")
-             return []
-
-     def _get_instance_performance_summary(
-         self, instance_id: str, start_time: int, end_time: int
-     ) -> Optional[Dict]:
-         """Get performance summary for a single instance"""
-
-         if not self.toolset.dd_config:
-             raise Exception(TOOLSET_CONFIG_MISSING_ERROR)
-
-         summary: dict[str, Any] = {
-             "instance_id": instance_id,
-             "metrics": {},
-             "score": 0,  # Composite score for sorting
-             "issues": [],
-         }
-
-         # Key metrics to collect
-         metrics_to_collect = [
-             ("aws.rds.read_latency", "read_latency", 1.0),  # weight for composite score
-             ("aws.rds.write_latency", "write_latency", 1.0),
-             ("aws.rds.cpuutilization", "cpu_utilization", 0.5),
-             ("aws.rds.database_connections", "connections", 0.2),
-             ("aws.rds.burst_balance", "burst_balance", 0.8),
-         ]
-
-         for metric_name, key, weight in metrics_to_collect:
-             query = f"avg:{metric_name}{{dbinstanceidentifier:{instance_id}}}"
-
-             try:
-                 url = f"{self.toolset.dd_config.site_api_url}/api/v1/query"
-                 headers = get_headers(self.toolset.dd_config)
-                 payload = {
-                     "query": query,
-                     "from": start_time,
-                     "to": end_time,
-                 }
-
-                 response = execute_datadog_http_request(
-                     url=url,
-                     headers=headers,
-                     payload_or_params=payload,
-                     timeout=self.toolset.dd_config.request_timeout,
-                     method="GET",
-                 )
-
-                 if response and "series" in response and response["series"]:
-                     series = response["series"][0]
-                     points = series.get("pointlist", [])
-
-                     if points:
-                         values = [p[1] for p in points if p[1] is not None]
-                         if values:
-                             avg_value = sum(values) / len(values)
-                             max_value = max(values)
-
-                             summary["metrics"][key] = {
-                                 "avg": round(avg_value, 2),
-                                 "max": round(max_value, 2),
-                             }
-
-                             # Calculate contribution to composite score
-                             if key in ["read_latency", "write_latency"]:
-                                 # Higher latency = worse performance
-                                 score_contrib = avg_value * weight
-                                 if avg_value > 10:
-                                     summary["issues"].append(
-                                         f"High {key.replace('_', ' ')}: {avg_value:.1f}ms"
-                                     )
-                             elif key == "cpu_utilization":
-                                 # Higher CPU = worse performance
-                                 score_contrib = avg_value * weight
-                                 if avg_value > 70:
-                                     summary["issues"].append(
-                                         f"High CPU: {avg_value:.1f}%"
-                                     )
-                             elif key == "burst_balance":
-                                 # Lower burst balance = worse performance
-                                 score_contrib = (100 - avg_value) * weight
-                                 if avg_value < 30:
-                                     summary["issues"].append(
-                                         f"Low burst balance: {avg_value:.1f}%"
-                                     )
-                             else:
-                                 score_contrib = 0
-
-                             summary["score"] += score_contrib
-
-             except Exception:
-                 continue
-
-         return summary if summary["metrics"] else None
-
-     def _sort_instances(self, instances: List[Dict], sort_by: str) -> List[Dict]:
-         """Sort instances by specified metric"""
-         if sort_by == "latency":
-             # Sort by average of read and write latency
-             def latency_key(inst):
-                 read_lat = inst["metrics"].get("read_latency", {}).get("avg", 0)
-                 write_lat = inst["metrics"].get("write_latency", {}).get("avg", 0)
-                 return (read_lat + write_lat) / 2
-
-             return sorted(instances, key=latency_key, reverse=True)
-
-         elif sort_by == "cpu":
-             return sorted(
-                 instances,
-                 key=lambda x: x["metrics"].get("cpu_utilization", {}).get("avg", 0),
-                 reverse=True,
-             )
-
-         elif sort_by == "composite":
-             return sorted(instances, key=lambda x: x["score"], reverse=True)
-
-         else:  # Default to latency
-             return self._sort_instances(instances, "latency")
-
-     def _format_summary_report(self, instances: List[Dict], sort_by: str) -> str:
-         """Format the summary report"""
-         lines = []
-         lines.append("Top Worst Performing RDS Instances")
-         lines.append("=" * 70)
-         lines.append(f"Sorted by: {sort_by}")
-         lines.append(f"Instances shown: {len(instances)}")
-         lines.append("")
-
-         for rank, inst in enumerate(instances, 1):
-             lines.append(f"{rank}. {inst['instance_id']}")
-             lines.append("-" * 40)
-
-             # Show key metrics
-             metrics = inst["metrics"]
-             if "read_latency" in metrics:
-                 lines.append(
-                     f" Read Latency: {metrics['read_latency']['avg']:.1f}ms avg, {metrics['read_latency']['max']:.1f}ms max"
-                 )
-             if "write_latency" in metrics:
-                 lines.append(
-                     f" Write Latency: {metrics['write_latency']['avg']:.1f}ms avg, {metrics['write_latency']['max']:.1f}ms max"
-                 )
-             if "cpu_utilization" in metrics:
-                 lines.append(
-                     f" CPU Usage: {metrics['cpu_utilization']['avg']:.1f}% avg, {metrics['cpu_utilization']['max']:.1f}% max"
-                 )
-             if "burst_balance" in metrics:
-                 lines.append(
-                     f" Burst Balance: {metrics['burst_balance']['avg']:.1f}% avg"
-                 )
-
-             # Show issues
-             if inst["issues"]:
-                 lines.append(" Issues:")
-                 for issue in inst["issues"]:
-                     lines.append(f" • {issue}")
-
-             lines.append("")
-
-         return "\n".join(lines)
-
-     def get_parameterized_one_liner(self, params: Dict[str, Any]) -> str:
-         top_n = params.get("top_n", DEFAULT_TOP_INSTANCES)
-         sort_by = params.get("sort_by", "latency")
-         return f"Getting top {top_n} worst performing RDS instances sorted by {sort_by}"
-
-
- class DatadogRDSToolset(Toolset):
-     dd_config: Optional[DatadogRDSConfig] = None
-
-     def __init__(self):
-         super().__init__(
-             name="datadog/rds",
-             description="Analyze RDS database performance and identify worst performers using Datadog metrics",
-             tags=[ToolsetTag.CORE],
-             tools=[
-                 GenerateRDSPerformanceReport(toolset=self),
-                 GetTopWorstPerformingRDSInstances(toolset=self),
-             ],
-         )
-
-     def prerequisites_check(self, config: Dict[str, Any]) -> CallablePrerequisite:
-         def check_datadog_connectivity(config_dict: Dict[str, Any]) -> Tuple[bool, str]:
-             """Check Datadog API connectivity and permissions"""
-             try:
-                 # Validate config
-                 self.dd_config = DatadogRDSConfig(**config_dict)
-
-                 # Test API connectivity
-                 url = f"{self.dd_config.site_api_url}/api/v1/validate"
-                 headers = get_headers(self.dd_config)
-
-                 response = execute_datadog_http_request(
-                     url=url,
-                     headers=headers,
-                     payload_or_params={},
-                     timeout=self.dd_config.request_timeout,
-                     method="GET",
-                 )
-
-                 if response and response.get("valid", False):
-                     # Test metrics API access
-                     metrics_url = f"{self.dd_config.site_api_url}/api/v1/metrics"
-                     execute_datadog_http_request(
-                         url=metrics_url,
-                         headers=headers,
-                         payload_or_params={"from": 0},
-                         timeout=self.dd_config.request_timeout,
-                         method="GET",
-                     )
-                     return True, ""
-                 else:
-                     return False, "Invalid Datadog API credentials"
-
-             except DataDogRequestError as e:
-                 if e.status_code == 403:
-                     return False, "Invalid Datadog API keys or insufficient permissions"
-                 else:
-                     return False, f"Datadog API error: {str(e)}"
-             except Exception as e:
-                 return False, f"Failed to initialize Datadog RDS toolset: {str(e)}"
-
-         return CallablePrerequisite(callable=check_datadog_connectivity)
-
-     def post_init(self, config: dict):
-         """Load LLM instructions after initialization"""
-         self._reload_instructions()
-
-     def _reload_instructions(self):
-         """Load RDS analysis specific instructions"""
-         template_file_path = os.path.abspath(
-             os.path.join(os.path.dirname(__file__), "datadog_rds_instructions.jinja2")
-         )
-         self._load_llm_instructions(jinja_template=f"file://{template_file_path}")
-
-     def get_example_config(self) -> Dict[str, Any]:
-         """Get example configuration for this toolset."""
-         return {
-             "dd_api_key": "your-datadog-api-key",
-             "dd_app_key": "your-datadog-application-key",
-             "site_api_url": "https://api.datadoghq.com",
-             "default_time_span_seconds": 3600,
-             "default_top_instances": 10,
-         }