holmesgpt-0.16.2a0-py3-none-any.whl → holmesgpt-0.18.4-py3-none-any.whl

This diff compares the contents of two publicly available package versions as released to their public registries. It is provided for informational purposes only.
Files changed (162)
  1. holmes/__init__.py +3 -5
  2. holmes/clients/robusta_client.py +4 -3
  3. holmes/common/env_vars.py +18 -2
  4. holmes/common/openshift.py +1 -1
  5. holmes/config.py +11 -6
  6. holmes/core/conversations.py +30 -13
  7. holmes/core/investigation.py +21 -25
  8. holmes/core/investigation_structured_output.py +3 -3
  9. holmes/core/issue.py +1 -1
  10. holmes/core/llm.py +50 -31
  11. holmes/core/models.py +19 -17
  12. holmes/core/openai_formatting.py +1 -1
  13. holmes/core/prompt.py +47 -2
  14. holmes/core/runbooks.py +1 -0
  15. holmes/core/safeguards.py +4 -2
  16. holmes/core/supabase_dal.py +4 -2
  17. holmes/core/tool_calling_llm.py +102 -141
  18. holmes/core/tools.py +19 -28
  19. holmes/core/tools_utils/token_counting.py +9 -2
  20. holmes/core/tools_utils/tool_context_window_limiter.py +13 -30
  21. holmes/core/tools_utils/tool_executor.py +0 -18
  22. holmes/core/tools_utils/toolset_utils.py +1 -0
  23. holmes/core/toolset_manager.py +37 -2
  24. holmes/core/tracing.py +13 -2
  25. holmes/core/transformers/__init__.py +1 -1
  26. holmes/core/transformers/base.py +1 -0
  27. holmes/core/transformers/llm_summarize.py +3 -2
  28. holmes/core/transformers/registry.py +2 -1
  29. holmes/core/transformers/transformer.py +1 -0
  30. holmes/core/truncation/compaction.py +37 -2
  31. holmes/core/truncation/input_context_window_limiter.py +3 -2
  32. holmes/interactive.py +52 -8
  33. holmes/main.py +17 -37
  34. holmes/plugins/interfaces.py +2 -1
  35. holmes/plugins/prompts/__init__.py +2 -1
  36. holmes/plugins/prompts/_fetch_logs.jinja2 +5 -5
  37. holmes/plugins/prompts/_runbook_instructions.jinja2 +2 -1
  38. holmes/plugins/prompts/base_user_prompt.jinja2 +7 -0
  39. holmes/plugins/prompts/conversation_history_compaction.jinja2 +2 -1
  40. holmes/plugins/prompts/generic_ask.jinja2 +0 -2
  41. holmes/plugins/prompts/generic_ask_conversation.jinja2 +0 -2
  42. holmes/plugins/prompts/generic_ask_for_issue_conversation.jinja2 +0 -2
  43. holmes/plugins/prompts/generic_investigation.jinja2 +0 -2
  44. holmes/plugins/prompts/investigation_procedure.jinja2 +2 -1
  45. holmes/plugins/prompts/kubernetes_workload_ask.jinja2 +0 -2
  46. holmes/plugins/prompts/kubernetes_workload_chat.jinja2 +0 -2
  47. holmes/plugins/runbooks/__init__.py +32 -3
  48. holmes/plugins/sources/github/__init__.py +4 -2
  49. holmes/plugins/sources/prometheus/models.py +1 -0
  50. holmes/plugins/toolsets/__init__.py +30 -26
  51. holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +13 -12
  52. holmes/plugins/toolsets/azure_sql/apis/alert_monitoring_api.py +3 -2
  53. holmes/plugins/toolsets/azure_sql/apis/azure_sql_api.py +2 -1
  54. holmes/plugins/toolsets/azure_sql/apis/connection_failure_api.py +3 -2
  55. holmes/plugins/toolsets/azure_sql/apis/connection_monitoring_api.py +3 -1
  56. holmes/plugins/toolsets/azure_sql/apis/storage_analysis_api.py +3 -1
  57. holmes/plugins/toolsets/azure_sql/azure_sql_toolset.py +12 -12
  58. holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +7 -7
  59. holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +7 -7
  60. holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +3 -5
  61. holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +3 -3
  62. holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +7 -7
  63. holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +6 -8
  64. holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +3 -3
  65. holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +3 -3
  66. holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +3 -3
  67. holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +3 -3
  68. holmes/plugins/toolsets/azure_sql/utils.py +0 -32
  69. holmes/plugins/toolsets/bash/argocd/__init__.py +3 -3
  70. holmes/plugins/toolsets/bash/aws/__init__.py +4 -4
  71. holmes/plugins/toolsets/bash/azure/__init__.py +4 -4
  72. holmes/plugins/toolsets/bash/bash_toolset.py +2 -3
  73. holmes/plugins/toolsets/bash/common/bash.py +19 -9
  74. holmes/plugins/toolsets/bash/common/bash_command.py +1 -1
  75. holmes/plugins/toolsets/bash/common/stringify.py +1 -1
  76. holmes/plugins/toolsets/bash/kubectl/__init__.py +2 -1
  77. holmes/plugins/toolsets/bash/kubectl/constants.py +0 -1
  78. holmes/plugins/toolsets/bash/kubectl/kubectl_get.py +3 -4
  79. holmes/plugins/toolsets/bash/parse_command.py +12 -13
  80. holmes/plugins/toolsets/connectivity_check.py +124 -0
  81. holmes/plugins/toolsets/coralogix/api.py +132 -119
  82. holmes/plugins/toolsets/coralogix/coralogix.jinja2 +14 -0
  83. holmes/plugins/toolsets/coralogix/toolset_coralogix.py +219 -0
  84. holmes/plugins/toolsets/coralogix/utils.py +15 -79
  85. holmes/plugins/toolsets/datadog/datadog_api.py +36 -3
  86. holmes/plugins/toolsets/datadog/datadog_logs_instructions.jinja2 +34 -1
  87. holmes/plugins/toolsets/datadog/datadog_metrics_instructions.jinja2 +3 -3
  88. holmes/plugins/toolsets/datadog/datadog_models.py +59 -0
  89. holmes/plugins/toolsets/datadog/datadog_url_utils.py +213 -0
  90. holmes/plugins/toolsets/datadog/instructions_datadog_traces.jinja2 +165 -28
  91. holmes/plugins/toolsets/datadog/toolset_datadog_general.py +71 -28
  92. holmes/plugins/toolsets/datadog/toolset_datadog_logs.py +224 -375
  93. holmes/plugins/toolsets/datadog/toolset_datadog_metrics.py +67 -36
  94. holmes/plugins/toolsets/datadog/toolset_datadog_traces.py +360 -343
  95. holmes/plugins/toolsets/elasticsearch/__init__.py +6 -0
  96. holmes/plugins/toolsets/elasticsearch/elasticsearch.py +834 -0
  97. holmes/plugins/toolsets/git.py +7 -8
  98. holmes/plugins/toolsets/grafana/base_grafana_toolset.py +16 -4
  99. holmes/plugins/toolsets/grafana/common.py +2 -30
  100. holmes/plugins/toolsets/grafana/grafana_tempo_api.py +2 -1
  101. holmes/plugins/toolsets/grafana/loki/instructions.jinja2 +18 -2
  102. holmes/plugins/toolsets/grafana/loki/toolset_grafana_loki.py +92 -18
  103. holmes/plugins/toolsets/grafana/loki_api.py +4 -0
  104. holmes/plugins/toolsets/grafana/toolset_grafana.py +109 -25
  105. holmes/plugins/toolsets/grafana/toolset_grafana_dashboard.jinja2 +22 -0
  106. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +201 -33
  107. holmes/plugins/toolsets/grafana/trace_parser.py +3 -2
  108. holmes/plugins/toolsets/internet/internet.py +10 -10
  109. holmes/plugins/toolsets/internet/notion.py +5 -6
  110. holmes/plugins/toolsets/investigator/core_investigation.py +3 -3
  111. holmes/plugins/toolsets/investigator/model.py +3 -1
  112. holmes/plugins/toolsets/json_filter_mixin.py +134 -0
  113. holmes/plugins/toolsets/kafka.py +12 -7
  114. holmes/plugins/toolsets/kubernetes.yaml +260 -30
  115. holmes/plugins/toolsets/kubernetes_logs.py +3 -3
  116. holmes/plugins/toolsets/logging_utils/logging_api.py +16 -6
  117. holmes/plugins/toolsets/mcp/toolset_mcp.py +88 -60
  118. holmes/plugins/toolsets/newrelic/new_relic_api.py +41 -1
  119. holmes/plugins/toolsets/newrelic/newrelic.jinja2 +24 -0
  120. holmes/plugins/toolsets/newrelic/newrelic.py +212 -55
  121. holmes/plugins/toolsets/prometheus/prometheus.py +358 -102
  122. holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2 +11 -3
  123. holmes/plugins/toolsets/rabbitmq/api.py +23 -4
  124. holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +5 -5
  125. holmes/plugins/toolsets/robusta/robusta.py +5 -5
  126. holmes/plugins/toolsets/runbook/runbook_fetcher.py +25 -6
  127. holmes/plugins/toolsets/servicenow_tables/servicenow_tables.py +1 -1
  128. holmes/plugins/toolsets/utils.py +1 -1
  129. holmes/utils/config_utils.py +1 -1
  130. holmes/utils/connection_utils.py +31 -0
  131. holmes/utils/console/result.py +10 -0
  132. holmes/utils/file_utils.py +2 -1
  133. holmes/utils/global_instructions.py +10 -26
  134. holmes/utils/holmes_status.py +4 -3
  135. holmes/utils/log.py +15 -0
  136. holmes/utils/markdown_utils.py +2 -3
  137. holmes/utils/memory_limit.py +58 -0
  138. holmes/utils/sentry_helper.py +23 -0
  139. holmes/utils/stream.py +12 -5
  140. holmes/utils/tags.py +4 -3
  141. holmes/version.py +3 -1
  142. {holmesgpt-0.16.2a0.dist-info → holmesgpt-0.18.4.dist-info}/METADATA +12 -10
  143. holmesgpt-0.18.4.dist-info/RECORD +258 -0
  144. holmes/plugins/toolsets/aws.yaml +0 -80
  145. holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +0 -114
  146. holmes/plugins/toolsets/datadog/datadog_traces_formatter.py +0 -310
  147. holmes/plugins/toolsets/datadog/toolset_datadog_rds.py +0 -736
  148. holmes/plugins/toolsets/grafana/grafana_api.py +0 -64
  149. holmes/plugins/toolsets/opensearch/__init__.py +0 -0
  150. holmes/plugins/toolsets/opensearch/opensearch.py +0 -250
  151. holmes/plugins/toolsets/opensearch/opensearch_logs.py +0 -161
  152. holmes/plugins/toolsets/opensearch/opensearch_traces.py +0 -215
  153. holmes/plugins/toolsets/opensearch/opensearch_traces_instructions.jinja2 +0 -12
  154. holmes/plugins/toolsets/opensearch/opensearch_utils.py +0 -166
  155. holmes/utils/keygen_utils.py +0 -6
  156. holmesgpt-0.16.2a0.dist-info/RECORD +0 -258
  157. holmes/plugins/toolsets/{opensearch → elasticsearch}/opensearch_ppl_query_docs.jinja2 +0 -0
  158. holmes/plugins/toolsets/{opensearch → elasticsearch}/opensearch_query_assist.py +2 -2
  159. holmes/plugins/toolsets/{opensearch → elasticsearch}/opensearch_query_assist_instructions.jinja2 +0 -0
  160. {holmesgpt-0.16.2a0.dist-info → holmesgpt-0.18.4.dist-info}/LICENSE +0 -0
  161. {holmesgpt-0.16.2a0.dist-info → holmesgpt-0.18.4.dist-info}/WHEEL +0 -0
  162. {holmesgpt-0.16.2a0.dist-info → holmesgpt-0.18.4.dist-info}/entry_points.txt +0 -0
holmes/plugins/toolsets/datadog/toolset_datadog_rds.py (deleted)
@@ -1,736 +0,0 @@
- import json
- import logging
- import os
- from datetime import datetime, timezone
- from typing import Any, Dict, List, Optional, Tuple
-
-
- from holmes.core.tools import (
-     CallablePrerequisite,
-     StructuredToolResult,
-     Tool,
-     ToolInvokeContext,
-     ToolParameter,
-     StructuredToolResultStatus,
-     Toolset,
-     ToolsetTag,
- )
- from holmes.plugins.toolsets.consts import (
-     TOOLSET_CONFIG_MISSING_ERROR,
-     STANDARD_END_DATETIME_TOOL_PARAM_DESCRIPTION,
- )
- from holmes.plugins.toolsets.datadog.datadog_api import (
-     DatadogBaseConfig,
-     DataDogRequestError,
-     execute_datadog_http_request,
-     get_headers,
- )
- from holmes.plugins.toolsets.utils import (
-     get_param_or_raise,
-     process_timestamps_to_int,
-     standard_start_datetime_tool_param_description,
- )
-
- DEFAULT_TIME_SPAN_SECONDS = 3600
- DEFAULT_TOP_INSTANCES = 10
-
- # Metric definitions
- LATENCY_METRICS = [
-     ("aws.rds.read_latency", "Read Latency", "ms"),
-     ("aws.rds.write_latency", "Write Latency", "ms"),
-     ("aws.rds.commit_latency", "Commit Latency", "ms"),
-     ("aws.rds.disk_queue_depth", "Disk Queue Depth", ""),
- ]
-
- RESOURCE_METRICS = [
-     ("aws.rds.cpuutilization", "CPU Utilization", "%"),
-     ("aws.rds.database_connections", "Database Connections", "connections"),
-     ("aws.rds.freeable_memory", "Freeable Memory", "bytes"),
-     ("aws.rds.swap_usage", "Swap Usage", "bytes"),
- ]
-
- STORAGE_METRICS = [
-     ("aws.rds.read_iops", "Read IOPS", "iops"),
-     ("aws.rds.write_iops", "Write IOPS", "iops"),
-     ("aws.rds.burst_balance", "Burst Balance", "%"),
-     ("aws.rds.free_storage_space", "Free Storage Space", "bytes"),
- ]
-
-
- class DatadogRDSConfig(DatadogBaseConfig):
-     default_time_span_seconds: int = DEFAULT_TIME_SPAN_SECONDS
-     default_top_instances: int = DEFAULT_TOP_INSTANCES
-
-
- class BaseDatadogRDSTool(Tool):
-     toolset: "DatadogRDSToolset"
-
-
- class GenerateRDSPerformanceReport(BaseDatadogRDSTool):
-     def __init__(self, toolset: "DatadogRDSToolset"):
-         super().__init__(
-             name="datadog_rds_performance_report",
-             description="[datadog/rds toolset] Generate a comprehensive performance report for a specific RDS instance including latency, resource utilization, and storage metrics with analysis",
-             parameters={
-                 "db_instance_identifier": ToolParameter(
-                     description="The RDS database instance identifier",
-                     type="string",
-                     required=True,
-                 ),
-                 "start_time": ToolParameter(
-                     description=standard_start_datetime_tool_param_description(
-                         DEFAULT_TIME_SPAN_SECONDS
-                     ),
-                     type="string",
-                     required=False,
-                 ),
-                 "end_time": ToolParameter(
-                     description=STANDARD_END_DATETIME_TOOL_PARAM_DESCRIPTION,
-                     type="string",
-                     required=False,
-                 ),
-             },
-             toolset=toolset,
-         )
-
-     def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
-         if not self.toolset.dd_config:
-             return StructuredToolResult(
-                 status=StructuredToolResultStatus.ERROR,
-                 error=TOOLSET_CONFIG_MISSING_ERROR,
-                 params=params,
-             )
-
-         try:
-             db_instance = get_param_or_raise(params, "db_instance_identifier")
-             start_time, end_time = process_timestamps_to_int(
-                 start=params.get("start_time"),
-                 end=params.get("end_time"),
-                 default_time_span_seconds=self.toolset.dd_config.default_time_span_seconds,
-             )
-
-             report: dict[str, Any] = {
-                 "instance_id": db_instance,
-                 "report_time": datetime.now(timezone.utc).isoformat(),
-                 "time_range": {
-                     "start": datetime.fromtimestamp(
-                         start_time, tz=timezone.utc
-                     ).isoformat(),
-                     "end": datetime.fromtimestamp(
-                         end_time, tz=timezone.utc
-                     ).isoformat(),
-                 },
-                 "sections": {},
-                 "issues": [],
-                 "executive_summary": "",
-             }
-
-             # Collect all metrics
-             all_metrics = []
-             for metric_group, group_name in [
-                 (LATENCY_METRICS, "latency"),
-                 (RESOURCE_METRICS, "resources"),
-                 (STORAGE_METRICS, "storage"),
-             ]:
-                 section_data = self._collect_metrics(
-                     db_instance, metric_group, start_time, end_time
-                 )
-                 if section_data:
-                     report["sections"][group_name] = section_data
-                     all_metrics.extend(section_data.get("metrics", {}).items())
-
-             # Analyze metrics and generate insights
-             self._analyze_metrics(report, all_metrics)
-
-             # Generate executive summary
-             report["executive_summary"] = self._generate_executive_summary(report)
-
-             # Format the report as readable text
-             formatted_report = self._format_report(report)
-
-             return StructuredToolResult(
-                 status=StructuredToolResultStatus.SUCCESS,
-                 data=formatted_report,
-                 params=params,
-             )
-
-         except Exception as e:
-             logging.error(f"Error generating RDS performance report: {str(e)}")
-             return StructuredToolResult(
-                 status=StructuredToolResultStatus.ERROR,
-                 error=f"Failed to generate RDS performance report: {str(e)}",
-                 params=params,
-             )
-
-     def _collect_metrics(
-         self,
-         db_instance: str,
-         metric_list: List[Tuple[str, str, str]],
-         start_time: int,
-         end_time: int,
-     ) -> Dict[str, Any]:
-         """Collect metrics for a specific group"""
-         if not self.toolset.dd_config:
-             raise Exception(TOOLSET_CONFIG_MISSING_ERROR)
-
-         metrics = {}
-
-         for metric_name, display_name, unit in metric_list:
-             query = f"{metric_name}{{dbinstanceidentifier:{db_instance}}}"
-
-             try:
-                 url = f"{self.toolset.dd_config.site_api_url}/api/v1/query"
-                 headers = get_headers(self.toolset.dd_config)
-                 payload = {
-                     "query": query,
-                     "from": start_time,
-                     "to": end_time,
-                 }
-
-                 response = execute_datadog_http_request(
-                     url=url,
-                     headers=headers,
-                     payload_or_params=payload,
-                     timeout=self.toolset.dd_config.request_timeout,
-                     method="GET",
-                 )
-
-                 if response and "series" in response and response["series"]:
-                     series = response["series"][0]
-                     points = series.get("pointlist", [])
-
-                     if points:
-                         values = [p[1] for p in points if p[1] is not None]
-                         if values:
-                             metrics[display_name] = {
-                                 "unit": unit
-                                 or series.get("unit", [{"short_name": ""}])[0].get(
-                                     "short_name", ""
-                                 ),
-                                 "avg": round(sum(values) / len(values), 2),
-                                 "max": round(max(values), 2),
-                                 "min": round(min(values), 2),
-                                 "latest": round(values[-1], 2),
-                                 "data_points": len(values),
-                             }
-             except DataDogRequestError:
-                 continue
-
-         return {"metrics": metrics} if metrics else {}
-
-     def _analyze_metrics(self, report: Dict, all_metrics: List[Tuple[str, Dict]]):
-         """Analyze metrics and generate issues"""
-         for metric_name, data in all_metrics:
-             # Latency analysis
-             if "Latency" in metric_name and metric_name != "Commit Latency":
-                 if data["avg"] > 10:
-                     report["issues"].append(
-                         f"{metric_name} averaging {data['avg']}ms (above 10ms threshold)"
-                     )
-                 if data["max"] > 50:
-                     report["issues"].append(f"{metric_name} peaked at {data['max']}ms")
-
-             # Disk queue depth
-             elif metric_name == "Disk Queue Depth":
-                 if data["avg"] > 5:
-                     report["issues"].append(
-                         f"High disk queue depth (avg: {data['avg']})"
-                     )
-
-             # CPU utilization
-             elif metric_name == "CPU Utilization":
-                 if data["avg"] > 70:
-                     report["issues"].append(
-                         f"High CPU utilization (avg: {data['avg']}%)"
-                     )
-                 if data["max"] > 90:
-                     report["issues"].append(
-                         f"CPU saturation detected (max: {data['max']}%)"
-                     )
-
-             # Memory
-             elif metric_name == "Freeable Memory":
-                 if data["min"] < 100 * 1024 * 1024:  # 100MB
-                     report["issues"].append(
-                         f"Low memory availability (min: {data['min'] / 1024 / 1024:.1f}MB)"
-                     )
-
-             # Swap usage
-             elif metric_name == "Swap Usage":
-                 if data["avg"] > 0:
-                     report["issues"].append(
-                         "Swap usage detected, indicating memory pressure"
-                     )
-
-             # Burst balance
-             elif metric_name == "Burst Balance":
-                 if data["min"] < 30:
-                     report["issues"].append(
-                         f"Low burst balance detected (min: {data['min']}%)"
-                     )
-
-             # IOPS
-             elif "IOPS" in metric_name:
-                 if data["max"] > 3000:
-                     report["issues"].append(
-                         f"High {metric_name} (max: {data['max']} IOPS)"
-                     )
-
-     def _generate_executive_summary(self, report: Dict) -> str:
-         """Generate executive summary"""
-         issue_count = len(report["issues"])
-
-         if issue_count == 0:
-             return "Database is operating within normal parameters. No significant issues detected."
-         elif issue_count <= 2:
-             severity = "Low"
-         elif issue_count <= 5:
-             severity = "Medium"
-         else:
-             severity = "High"
-
-         summary = f"Performance diagnosis: {severity} severity - {issue_count} issues detected.\n\n"
-
-         # Add key findings
-         if any("latency" in issue.lower() for issue in report["issues"]):
-             summary += "• Latency issues affecting database response times\n"
-         if any("cpu" in issue.lower() for issue in report["issues"]):
-             summary += "• CPU resource constraints detected\n"
-         if any(
-             "memory" in issue.lower() or "swap" in issue.lower()
-             for issue in report["issues"]
-         ):
-             summary += "• Memory pressure affecting performance\n"
-         if any(
-             "burst" in issue.lower() or "iops" in issue.lower()
-             for issue in report["issues"]
-         ):
-             summary += "• Storage I/O bottlenecks identified\n"
-
-         return summary
-
-     def _format_report(self, report: Dict) -> str:
-         """Format the report as readable text"""
-         lines = []
-         lines.append(f"RDS Performance Report - {report['instance_id']}")
-         lines.append("=" * 70)
-         lines.append(f"Generated: {report['report_time']}")
-         lines.append(
-             f"Time Range: {report['time_range']['start']} to {report['time_range']['end']}"
-         )
-         lines.append("")
-
-         # Executive Summary
-         lines.append("EXECUTIVE SUMMARY")
-         lines.append("-" * 40)
-         lines.append(report["executive_summary"])
-         lines.append("")
-
-         # Metrics sections
-         for section_name, section_data in report["sections"].items():
-             lines.append(f"{section_name.upper()} METRICS")
-             lines.append("-" * 40)
-
-             if section_data.get("metrics"):
-                 lines.append(
-                     f"{'Metric':<25} {'Avg':>10} {'Max':>10} {'Min':>10} {'Latest':>10} {'Unit':>8}"
-                 )
-                 lines.append("-" * 80)
-
-                 for metric_name, data in section_data["metrics"].items():
-                     lines.append(
-                         f"{metric_name:<25} {data['avg']:>10.2f} {data['max']:>10.2f} "
-                         f"{data['min']:>10.2f} {data['latest']:>10.2f} {data['unit']:>8}"
-                     )
-             lines.append("")
-
-         # Issues
-         if report["issues"]:
-             lines.append(f"ISSUES DETECTED ({len(report['issues'])})")
-             lines.append("-" * 40)
-             for i, issue in enumerate(report["issues"], 1):
-                 lines.append(f"{i}. {issue}")
-             lines.append("")
-
-         return "\n".join(lines)
-
-     def get_parameterized_one_liner(self, params: Dict[str, Any]) -> str:
-         db_instance = params.get("db_instance_identifier", "unknown")
-         return f"Generating performance report for RDS instance: {db_instance}"
-
-
- class GetTopWorstPerformingRDSInstances(BaseDatadogRDSTool):
-     def __init__(self, toolset: "DatadogRDSToolset"):
-         super().__init__(
-             name="datadog_rds_top_worst_performing",
-             description="[datadog/rds toolset] Get a summarized report of the top worst performing RDS instances based on latency, CPU utilization, and error rates",
-             parameters={
-                 "top_n": ToolParameter(
-                     description=f"Number of worst performing instances to return (default: {DEFAULT_TOP_INSTANCES})",
-                     type="number",
-                     required=False,
-                 ),
-                 "start_time": ToolParameter(
-                     description=standard_start_datetime_tool_param_description(
-                         DEFAULT_TIME_SPAN_SECONDS
-                     ),
-                     type="string",
-                     required=False,
-                 ),
-                 "end_time": ToolParameter(
-                     description=STANDARD_END_DATETIME_TOOL_PARAM_DESCRIPTION,
-                     type="string",
-                     required=False,
-                 ),
-                 "sort_by": ToolParameter(
-                     description="Metric to sort by: 'latency' (default), 'cpu', 'errors', or 'composite'",
-                     type="string",
-                     required=False,
-                 ),
-             },
-             toolset=toolset,
-         )
-
-     def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
-         if not self.toolset.dd_config:
-             return StructuredToolResult(
-                 status=StructuredToolResultStatus.ERROR,
-                 error=TOOLSET_CONFIG_MISSING_ERROR,
-                 params=params,
-             )
-
-         try:
-             top_n = params.get("top_n", self.toolset.dd_config.default_top_instances)
-             sort_by = params.get("sort_by", "latency").lower()
-             start_time, end_time = process_timestamps_to_int(
-                 start=params.get("start_time"),
-                 end=params.get("end_time"),
-                 default_time_span_seconds=self.toolset.dd_config.default_time_span_seconds,
-             )
-
-             # Get all RDS instances
-             instances = self._get_all_rds_instances(start_time, end_time)
-
-             if not instances:
-                 return StructuredToolResult(
-                     status=StructuredToolResultStatus.NO_DATA,
-                     data="No RDS instances found with metrics in the specified time range",
-                     params=params,
-                 )
-
-             # Collect performance data for each instance
-             instance_performance = []
-             for instance_id in instances[:50]:  # Limit to 50 instances to avoid timeout
-                 perf_data = self._get_instance_performance_summary(
-                     instance_id, start_time, end_time
-                 )
-                 if perf_data:
-                     instance_performance.append(perf_data)
-
-             # Sort by the specified metric
-             instance_performance = self._sort_instances(instance_performance, sort_by)
-
-             # Get top N worst performers
-             worst_performers = instance_performance[:top_n]
-
-             # Format the report
-             report = self._format_summary_report(worst_performers, sort_by)
-
-             report += f"\n\nTotal instances analyzed: {len(instance_performance)}"
-             report += f"\n\nInstances:\n{json.dumps(worst_performers, indent=2)}"
-
-             return StructuredToolResult(
-                 status=StructuredToolResultStatus.SUCCESS,
-                 data=report,
-                 params=params,
-             )
-
-         except Exception as e:
-             logging.error(f"Error getting top worst performing RDS instances: {str(e)}")
-             return StructuredToolResult(
-                 status=StructuredToolResultStatus.ERROR,
-                 error=f"Failed to get top worst performing RDS instances: {str(e)}",
-                 params=params,
-             )
-
-     def _get_all_rds_instances(self, start_time: int, end_time: int) -> List[str]:
-         """Get list of all RDS instances with metrics"""
-         if not self.toolset.dd_config:
-             raise Exception(TOOLSET_CONFIG_MISSING_ERROR)
-         try:
-             # Query for any RDS metric grouped by instance
-             query = "avg:aws.rds.cpuutilization{*} by {dbinstanceidentifier}"
-
-             url = f"{self.toolset.dd_config.site_api_url}/api/v1/query"
-             headers = get_headers(self.toolset.dd_config)
-             payload = {
-                 "query": query,
-                 "from": start_time,
-                 "to": end_time,
-             }
-
-             response = execute_datadog_http_request(
-                 url=url,
-                 headers=headers,
-                 payload_or_params=payload,
-                 timeout=self.toolset.dd_config.request_timeout,
-                 method="GET",
-             )
-
-             instances = []
-             if response and "series" in response:
-                 for series in response["series"]:
-                     # Extract instance ID from tags
-                     scope = series.get("scope", "")
-                     if "dbinstanceidentifier:" in scope:
-                         instance_id = scope.split("dbinstanceidentifier:")[1].split(
-                             ","
-                         )[0]
-                         instances.append(instance_id)
-
-             return list(set(instances))  # Remove duplicates
-
-         except Exception as e:
-             logging.error(f"Error getting RDS instances: {str(e)}")
-             return []
-
-     def _get_instance_performance_summary(
-         self, instance_id: str, start_time: int, end_time: int
-     ) -> Optional[Dict]:
-         """Get performance summary for a single instance"""
-
-         if not self.toolset.dd_config:
-             raise Exception(TOOLSET_CONFIG_MISSING_ERROR)
-
-         summary: dict[str, Any] = {
-             "instance_id": instance_id,
-             "metrics": {},
-             "score": 0,  # Composite score for sorting
-             "issues": [],
-         }
-
-         # Key metrics to collect
-         metrics_to_collect = [
-             ("aws.rds.read_latency", "read_latency", 1.0),  # weight for composite score
-             ("aws.rds.write_latency", "write_latency", 1.0),
-             ("aws.rds.cpuutilization", "cpu_utilization", 0.5),
-             ("aws.rds.database_connections", "connections", 0.2),
-             ("aws.rds.burst_balance", "burst_balance", 0.8),
-         ]
-
-         for metric_name, key, weight in metrics_to_collect:
-             query = f"avg:{metric_name}{{dbinstanceidentifier:{instance_id}}}"
-
-             try:
-                 url = f"{self.toolset.dd_config.site_api_url}/api/v1/query"
-                 headers = get_headers(self.toolset.dd_config)
-                 payload = {
-                     "query": query,
-                     "from": start_time,
-                     "to": end_time,
-                 }
-
-                 response = execute_datadog_http_request(
-                     url=url,
-                     headers=headers,
-                     payload_or_params=payload,
-                     timeout=self.toolset.dd_config.request_timeout,
-                     method="GET",
-                 )
-
-                 if response and "series" in response and response["series"]:
-                     series = response["series"][0]
-                     points = series.get("pointlist", [])
-
-                     if points:
-                         values = [p[1] for p in points if p[1] is not None]
-                         if values:
-                             avg_value = sum(values) / len(values)
-                             max_value = max(values)
-
-                             summary["metrics"][key] = {
-                                 "avg": round(avg_value, 2),
-                                 "max": round(max_value, 2),
-                             }
-
-                             # Calculate contribution to composite score
-                             if key in ["read_latency", "write_latency"]:
-                                 # Higher latency = worse performance
-                                 score_contrib = avg_value * weight
-                                 if avg_value > 10:
-                                     summary["issues"].append(
-                                         f"High {key.replace('_', ' ')}: {avg_value:.1f}ms"
-                                     )
-                             elif key == "cpu_utilization":
-                                 # Higher CPU = worse performance
-                                 score_contrib = avg_value * weight
-                                 if avg_value > 70:
-                                     summary["issues"].append(
-                                         f"High CPU: {avg_value:.1f}%"
-                                     )
-                             elif key == "burst_balance":
-                                 # Lower burst balance = worse performance
-                                 score_contrib = (100 - avg_value) * weight
-                                 if avg_value < 30:
-                                     summary["issues"].append(
-                                         f"Low burst balance: {avg_value:.1f}%"
-                                     )
-                             else:
-                                 score_contrib = 0
-
-                             summary["score"] += score_contrib
-
-             except Exception:
-                 continue
-
-         return summary if summary["metrics"] else None
-
-     def _sort_instances(self, instances: List[Dict], sort_by: str) -> List[Dict]:
-         """Sort instances by specified metric"""
-         if sort_by == "latency":
-             # Sort by average of read and write latency
-             def latency_key(inst):
-                 read_lat = inst["metrics"].get("read_latency", {}).get("avg", 0)
-                 write_lat = inst["metrics"].get("write_latency", {}).get("avg", 0)
-                 return (read_lat + write_lat) / 2
-
-             return sorted(instances, key=latency_key, reverse=True)
-
-         elif sort_by == "cpu":
-             return sorted(
-                 instances,
-                 key=lambda x: x["metrics"].get("cpu_utilization", {}).get("avg", 0),
-                 reverse=True,
-             )
-
-         elif sort_by == "composite":
-             return sorted(instances, key=lambda x: x["score"], reverse=True)
-
-         else:  # Default to latency
-             return self._sort_instances(instances, "latency")
-
-     def _format_summary_report(self, instances: List[Dict], sort_by: str) -> str:
-         """Format the summary report"""
-         lines = []
-         lines.append("Top Worst Performing RDS Instances")
-         lines.append("=" * 70)
-         lines.append(f"Sorted by: {sort_by}")
-         lines.append(f"Instances shown: {len(instances)}")
-         lines.append("")
-
-         for rank, inst in enumerate(instances, 1):
-             lines.append(f"{rank}. {inst['instance_id']}")
-             lines.append("-" * 40)
-
-             # Show key metrics
-             metrics = inst["metrics"]
-             if "read_latency" in metrics:
-                 lines.append(
-                     f"  Read Latency: {metrics['read_latency']['avg']:.1f}ms avg, {metrics['read_latency']['max']:.1f}ms max"
-                 )
-             if "write_latency" in metrics:
-                 lines.append(
-                     f"  Write Latency: {metrics['write_latency']['avg']:.1f}ms avg, {metrics['write_latency']['max']:.1f}ms max"
-                 )
-             if "cpu_utilization" in metrics:
-                 lines.append(
-                     f"  CPU Usage: {metrics['cpu_utilization']['avg']:.1f}% avg, {metrics['cpu_utilization']['max']:.1f}% max"
-                 )
-             if "burst_balance" in metrics:
-                 lines.append(
-                     f"  Burst Balance: {metrics['burst_balance']['avg']:.1f}% avg"
-                 )
-
-             # Show issues
-             if inst["issues"]:
-                 lines.append("  Issues:")
-                 for issue in inst["issues"]:
-                     lines.append(f"    • {issue}")
-
-             lines.append("")
-
-         return "\n".join(lines)
-
-     def get_parameterized_one_liner(self, params: Dict[str, Any]) -> str:
-         top_n = params.get("top_n", DEFAULT_TOP_INSTANCES)
-         sort_by = params.get("sort_by", "latency")
-         return f"Getting top {top_n} worst performing RDS instances sorted by {sort_by}"
-
-
- class DatadogRDSToolset(Toolset):
-     dd_config: Optional[DatadogRDSConfig] = None
-
-     def __init__(self):
-         super().__init__(
-             name="datadog/rds",
-             description="Analyze RDS database performance and identify worst performers using Datadog metrics",
-             tags=[ToolsetTag.CORE],
-             tools=[
-                 GenerateRDSPerformanceReport(toolset=self),
-                 GetTopWorstPerformingRDSInstances(toolset=self),
-             ],
-         )
-
-     def prerequisites_check(self, config: Dict[str, Any]) -> CallablePrerequisite:
-         def check_datadog_connectivity(config_dict: Dict[str, Any]) -> Tuple[bool, str]:
-             """Check Datadog API connectivity and permissions"""
-             try:
-                 # Validate config
-                 self.dd_config = DatadogRDSConfig(**config_dict)
-
-                 # Test API connectivity
-                 url = f"{self.dd_config.site_api_url}/api/v1/validate"
-                 headers = get_headers(self.dd_config)
-
-                 response = execute_datadog_http_request(
-                     url=url,
-                     headers=headers,
-                     payload_or_params={},
-                     timeout=self.dd_config.request_timeout,
-                     method="GET",
-                 )
-
-                 if response and response.get("valid", False):
-                     # Test metrics API access
-                     metrics_url = f"{self.dd_config.site_api_url}/api/v1/metrics"
-                     execute_datadog_http_request(
-                         url=metrics_url,
-                         headers=headers,
-                         payload_or_params={"from": 0},
-                         timeout=self.dd_config.request_timeout,
-                         method="GET",
-                     )
-                     return True, ""
-                 else:
-                     return False, "Invalid Datadog API credentials"
-
-             except DataDogRequestError as e:
-                 if e.status_code == 403:
-                     return False, "Invalid Datadog API keys or insufficient permissions"
-                 else:
-                     return False, f"Datadog API error: {str(e)}"
-             except Exception as e:
-                 return False, f"Failed to initialize Datadog RDS toolset: {str(e)}"
-
-         return CallablePrerequisite(callable=check_datadog_connectivity)
-
-     def post_init(self, config: dict):
-         """Load LLM instructions after initialization"""
-         self._reload_instructions()
-
-     def _reload_instructions(self):
-         """Load RDS analysis specific instructions"""
-         template_file_path = os.path.abspath(
-             os.path.join(os.path.dirname(__file__), "datadog_rds_instructions.jinja2")
-         )
-         self._load_llm_instructions(jinja_template=f"file://{template_file_path}")
-
-     def get_example_config(self) -> Dict[str, Any]:
-         """Get example configuration for this toolset."""
-         return {
-             "dd_api_key": "your-datadog-api-key",
-             "dd_app_key": "your-datadog-application-key",
-             "site_api_url": "https://api.datadoghq.com",
-             "default_time_span_seconds": 3600,
-             "default_top_instances": 10,
-         }