holmesgpt 0.12.4__py3-none-any.whl → 0.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (86)
  1. holmes/__init__.py +1 -1
  2. holmes/clients/robusta_client.py +19 -1
  3. holmes/common/env_vars.py +13 -0
  4. holmes/config.py +69 -9
  5. holmes/core/conversations.py +11 -0
  6. holmes/core/investigation.py +16 -3
  7. holmes/core/investigation_structured_output.py +12 -0
  8. holmes/core/llm.py +10 -0
  9. holmes/core/models.py +9 -1
  10. holmes/core/openai_formatting.py +72 -12
  11. holmes/core/prompt.py +13 -0
  12. holmes/core/supabase_dal.py +3 -0
  13. holmes/core/todo_manager.py +88 -0
  14. holmes/core/tool_calling_llm.py +121 -149
  15. holmes/core/tools.py +10 -1
  16. holmes/core/tools_utils/tool_executor.py +7 -2
  17. holmes/core/tools_utils/toolset_utils.py +7 -2
  18. holmes/core/tracing.py +8 -7
  19. holmes/interactive.py +1 -0
  20. holmes/main.py +2 -1
  21. holmes/plugins/prompts/__init__.py +7 -1
  22. holmes/plugins/prompts/_ai_safety.jinja2 +43 -0
  23. holmes/plugins/prompts/_current_date_time.jinja2 +1 -0
  24. holmes/plugins/prompts/_default_log_prompt.jinja2 +4 -2
  25. holmes/plugins/prompts/_fetch_logs.jinja2 +6 -1
  26. holmes/plugins/prompts/_general_instructions.jinja2 +16 -0
  27. holmes/plugins/prompts/_permission_errors.jinja2 +1 -1
  28. holmes/plugins/prompts/_toolsets_instructions.jinja2 +4 -4
  29. holmes/plugins/prompts/generic_ask.jinja2 +4 -3
  30. holmes/plugins/prompts/investigation_procedure.jinja2 +210 -0
  31. holmes/plugins/prompts/kubernetes_workload_ask.jinja2 +4 -0
  32. holmes/plugins/toolsets/__init__.py +19 -6
  33. holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +27 -0
  34. holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +2 -2
  35. holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +2 -1
  36. holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +3 -1
  37. holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +2 -1
  38. holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +2 -1
  39. holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +3 -1
  40. holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +2 -1
  41. holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +2 -1
  42. holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +2 -1
  43. holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +2 -1
  44. holmes/plugins/toolsets/coralogix/api.py +6 -6
  45. holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +7 -1
  46. holmes/plugins/toolsets/datadog/datadog_api.py +20 -8
  47. holmes/plugins/toolsets/datadog/datadog_metrics_instructions.jinja2 +8 -1
  48. holmes/plugins/toolsets/datadog/datadog_rds_instructions.jinja2 +82 -0
  49. holmes/plugins/toolsets/datadog/toolset_datadog_logs.py +12 -5
  50. holmes/plugins/toolsets/datadog/toolset_datadog_metrics.py +20 -11
  51. holmes/plugins/toolsets/datadog/toolset_datadog_rds.py +735 -0
  52. holmes/plugins/toolsets/datadog/toolset_datadog_traces.py +18 -11
  53. holmes/plugins/toolsets/git.py +15 -15
  54. holmes/plugins/toolsets/grafana/grafana_api.py +12 -1
  55. holmes/plugins/toolsets/grafana/toolset_grafana.py +5 -1
  56. holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +9 -4
  57. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +12 -5
  58. holmes/plugins/toolsets/internet/internet.py +2 -1
  59. holmes/plugins/toolsets/internet/notion.py +2 -1
  60. holmes/plugins/toolsets/investigator/__init__.py +0 -0
  61. holmes/plugins/toolsets/investigator/core_investigation.py +157 -0
  62. holmes/plugins/toolsets/investigator/investigator_instructions.jinja2 +253 -0
  63. holmes/plugins/toolsets/investigator/model.py +15 -0
  64. holmes/plugins/toolsets/kafka.py +14 -7
  65. holmes/plugins/toolsets/kubernetes.yaml +7 -7
  66. holmes/plugins/toolsets/kubernetes_logs.py +454 -25
  67. holmes/plugins/toolsets/logging_utils/logging_api.py +115 -55
  68. holmes/plugins/toolsets/mcp/toolset_mcp.py +1 -1
  69. holmes/plugins/toolsets/newrelic.py +8 -3
  70. holmes/plugins/toolsets/opensearch/opensearch.py +8 -4
  71. holmes/plugins/toolsets/opensearch/opensearch_logs.py +9 -2
  72. holmes/plugins/toolsets/opensearch/opensearch_traces.py +6 -2
  73. holmes/plugins/toolsets/prometheus/prometheus.py +149 -44
  74. holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +8 -2
  75. holmes/plugins/toolsets/robusta/robusta.py +4 -4
  76. holmes/plugins/toolsets/runbook/runbook_fetcher.py +6 -5
  77. holmes/plugins/toolsets/servicenow/servicenow.py +18 -3
  78. holmes/plugins/toolsets/utils.py +8 -1
  79. holmes/utils/llms.py +20 -0
  80. holmes/utils/stream.py +90 -0
  81. {holmesgpt-0.12.4.dist-info → holmesgpt-0.13.0.dist-info}/METADATA +48 -35
  82. {holmesgpt-0.12.4.dist-info → holmesgpt-0.13.0.dist-info}/RECORD +85 -75
  83. holmes/utils/robusta.py +0 -9
  84. {holmesgpt-0.12.4.dist-info → holmesgpt-0.13.0.dist-info}/LICENSE.txt +0 -0
  85. {holmesgpt-0.12.4.dist-info → holmesgpt-0.13.0.dist-info}/WHEEL +0 -0
  86. {holmesgpt-0.12.4.dist-info → holmesgpt-0.13.0.dist-info}/entry_points.txt +0 -0
holmes/plugins/toolsets/datadog/toolset_datadog_rds.py (new file)
@@ -0,0 +1,735 @@
+import json
+import logging
+import os
+from datetime import datetime, timezone
+from typing import Any, Dict, List, Optional, Tuple
+
+
+from holmes.core.tools import (
+    CallablePrerequisite,
+    StructuredToolResult,
+    Tool,
+    ToolParameter,
+    ToolResultStatus,
+    Toolset,
+    ToolsetTag,
+)
+from holmes.plugins.toolsets.consts import (
+    TOOLSET_CONFIG_MISSING_ERROR,
+    STANDARD_END_DATETIME_TOOL_PARAM_DESCRIPTION,
+)
+from holmes.plugins.toolsets.datadog.datadog_api import (
+    DatadogBaseConfig,
+    DataDogRequestError,
+    execute_datadog_http_request,
+    get_headers,
+)
+from holmes.plugins.toolsets.utils import (
+    get_param_or_raise,
+    process_timestamps_to_int,
+    standard_start_datetime_tool_param_description,
+)
+
+DEFAULT_TIME_SPAN_SECONDS = 3600
+DEFAULT_TOP_INSTANCES = 10
+
+# Metric definitions
+LATENCY_METRICS = [
+    ("aws.rds.read_latency", "Read Latency", "ms"),
+    ("aws.rds.write_latency", "Write Latency", "ms"),
+    ("aws.rds.commit_latency", "Commit Latency", "ms"),
+    ("aws.rds.disk_queue_depth", "Disk Queue Depth", ""),
+]
+
+RESOURCE_METRICS = [
+    ("aws.rds.cpuutilization", "CPU Utilization", "%"),
+    ("aws.rds.database_connections", "Database Connections", "connections"),
+    ("aws.rds.freeable_memory", "Freeable Memory", "bytes"),
+    ("aws.rds.swap_usage", "Swap Usage", "bytes"),
+]
+
+STORAGE_METRICS = [
+    ("aws.rds.read_iops", "Read IOPS", "iops"),
+    ("aws.rds.write_iops", "Write IOPS", "iops"),
+    ("aws.rds.burst_balance", "Burst Balance", "%"),
+    ("aws.rds.free_storage_space", "Free Storage Space", "bytes"),
+]
+
+
+class DatadogRDSConfig(DatadogBaseConfig):
+    default_time_span_seconds: int = DEFAULT_TIME_SPAN_SECONDS
+    default_top_instances: int = DEFAULT_TOP_INSTANCES
+
+
+class BaseDatadogRDSTool(Tool):
+    toolset: "DatadogRDSToolset"
+
+
+class GenerateRDSPerformanceReport(BaseDatadogRDSTool):
+    def __init__(self, toolset: "DatadogRDSToolset"):
+        super().__init__(
+            name="datadog_rds_performance_report",
+            description="Generate a comprehensive performance report for a specific RDS instance including latency, resource utilization, and storage metrics with analysis",
+            parameters={
+                "db_instance_identifier": ToolParameter(
+                    description="The RDS database instance identifier",
+                    type="string",
+                    required=True,
+                ),
+                "start_time": ToolParameter(
+                    description=standard_start_datetime_tool_param_description(
+                        DEFAULT_TIME_SPAN_SECONDS
+                    ),
+                    type="string",
+                    required=False,
+                ),
+                "end_time": ToolParameter(
+                    description=STANDARD_END_DATETIME_TOOL_PARAM_DESCRIPTION,
+                    type="string",
+                    required=False,
+                ),
+            },
+            toolset=toolset,
+        )
+
+    def _invoke(self, params: Any) -> StructuredToolResult:
+        if not self.toolset.dd_config:
+            return StructuredToolResult(
+                status=ToolResultStatus.ERROR,
+                error=TOOLSET_CONFIG_MISSING_ERROR,
+                params=params,
+            )
+
+        try:
+            db_instance = get_param_or_raise(params, "db_instance_identifier")
+            start_time, end_time = process_timestamps_to_int(
+                start=params.get("start_time"),
+                end=params.get("end_time"),
+                default_time_span_seconds=self.toolset.dd_config.default_time_span_seconds,
+            )
+
+            report: dict[str, Any] = {
+                "instance_id": db_instance,
+                "report_time": datetime.now(timezone.utc).isoformat(),
+                "time_range": {
+                    "start": datetime.fromtimestamp(
+                        start_time, tz=timezone.utc
+                    ).isoformat(),
+                    "end": datetime.fromtimestamp(
+                        end_time, tz=timezone.utc
+                    ).isoformat(),
+                },
+                "sections": {},
+                "issues": [],
+                "executive_summary": "",
+            }
+
+            # Collect all metrics
+            all_metrics = []
+            for metric_group, group_name in [
+                (LATENCY_METRICS, "latency"),
+                (RESOURCE_METRICS, "resources"),
+                (STORAGE_METRICS, "storage"),
+            ]:
+                section_data = self._collect_metrics(
+                    db_instance, metric_group, start_time, end_time
+                )
+                if section_data:
+                    report["sections"][group_name] = section_data
+                    all_metrics.extend(section_data.get("metrics", {}).items())
+
+            # Analyze metrics and generate insights
+            self._analyze_metrics(report, all_metrics)
+
+            # Generate executive summary
+            report["executive_summary"] = self._generate_executive_summary(report)
+
+            # Format the report as readable text
+            formatted_report = self._format_report(report)
+
+            return StructuredToolResult(
+                status=ToolResultStatus.SUCCESS,
+                data=formatted_report,
+                params=params,
+            )
+
+        except Exception as e:
+            logging.error(f"Error generating RDS performance report: {str(e)}")
+            return StructuredToolResult(
+                status=ToolResultStatus.ERROR,
+                error=f"Failed to generate RDS performance report: {str(e)}",
+                params=params,
+            )
+
+    def _collect_metrics(
+        self,
+        db_instance: str,
+        metric_list: List[Tuple[str, str, str]],
+        start_time: int,
+        end_time: int,
+    ) -> Dict[str, Any]:
+        """Collect metrics for a specific group"""
+        if not self.toolset.dd_config:
+            raise Exception(TOOLSET_CONFIG_MISSING_ERROR)
+
+        metrics = {}
+
+        for metric_name, display_name, unit in metric_list:
+            query = f"{metric_name}{{dbinstanceidentifier:{db_instance}}}"
+
+            try:
+                url = f"{self.toolset.dd_config.site_api_url}/api/v1/query"
+                headers = get_headers(self.toolset.dd_config)
+                payload = {
+                    "query": query,
+                    "from": start_time,
+                    "to": end_time,
+                }
+
+                response = execute_datadog_http_request(
+                    url=url,
+                    headers=headers,
+                    payload_or_params=payload,
+                    timeout=self.toolset.dd_config.request_timeout,
+                    method="GET",
+                )
+
+                if response and "series" in response and response["series"]:
+                    series = response["series"][0]
+                    points = series.get("pointlist", [])
+
+                    if points:
+                        values = [p[1] for p in points if p[1] is not None]
+                        if values:
+                            metrics[display_name] = {
+                                "unit": unit
+                                or series.get("unit", [{"short_name": ""}])[0].get(
+                                    "short_name", ""
+                                ),
+                                "avg": round(sum(values) / len(values), 2),
+                                "max": round(max(values), 2),
+                                "min": round(min(values), 2),
+                                "latest": round(values[-1], 2),
+                                "data_points": len(values),
+                            }
+            except DataDogRequestError:
+                continue
+
+        return {"metrics": metrics} if metrics else {}
+
+    def _analyze_metrics(self, report: Dict, all_metrics: List[Tuple[str, Dict]]):
+        """Analyze metrics and generate issues"""
+        for metric_name, data in all_metrics:
+            # Latency analysis
+            if "Latency" in metric_name and metric_name != "Commit Latency":
+                if data["avg"] > 10:
+                    report["issues"].append(
+                        f"{metric_name} averaging {data['avg']}ms (above 10ms threshold)"
+                    )
+                if data["max"] > 50:
+                    report["issues"].append(f"{metric_name} peaked at {data['max']}ms")
+
+            # Disk queue depth
+            elif metric_name == "Disk Queue Depth":
+                if data["avg"] > 5:
+                    report["issues"].append(
+                        f"High disk queue depth (avg: {data['avg']})"
+                    )
+
+            # CPU utilization
+            elif metric_name == "CPU Utilization":
+                if data["avg"] > 70:
+                    report["issues"].append(
+                        f"High CPU utilization (avg: {data['avg']}%)"
+                    )
+                if data["max"] > 90:
+                    report["issues"].append(
+                        f"CPU saturation detected (max: {data['max']}%)"
+                    )
+
+            # Memory
+            elif metric_name == "Freeable Memory":
+                if data["min"] < 100 * 1024 * 1024:  # 100MB
+                    report["issues"].append(
+                        f"Low memory availability (min: {data['min'] / 1024 / 1024:.1f}MB)"
+                    )
+
+            # Swap usage
+            elif metric_name == "Swap Usage":
+                if data["avg"] > 0:
+                    report["issues"].append(
+                        "Swap usage detected, indicating memory pressure"
+                    )
+
+            # Burst balance
+            elif metric_name == "Burst Balance":
+                if data["min"] < 30:
+                    report["issues"].append(
+                        f"Low burst balance detected (min: {data['min']}%)"
+                    )
+
+            # IOPS
+            elif "IOPS" in metric_name:
+                if data["max"] > 3000:
+                    report["issues"].append(
+                        f"High {metric_name} (max: {data['max']} IOPS)"
+                    )
+
+    def _generate_executive_summary(self, report: Dict) -> str:
+        """Generate executive summary"""
+        issue_count = len(report["issues"])
+
+        if issue_count == 0:
+            return "Database is operating within normal parameters. No significant issues detected."
+        elif issue_count <= 2:
+            severity = "Low"
+        elif issue_count <= 5:
+            severity = "Medium"
+        else:
+            severity = "High"
+
+        summary = f"Performance diagnosis: {severity} severity - {issue_count} issues detected.\n\n"
+
+        # Add key findings
+        if any("latency" in issue.lower() for issue in report["issues"]):
+            summary += "• Latency issues affecting database response times\n"
+        if any("cpu" in issue.lower() for issue in report["issues"]):
+            summary += "• CPU resource constraints detected\n"
+        if any(
+            "memory" in issue.lower() or "swap" in issue.lower()
+            for issue in report["issues"]
+        ):
+            summary += "• Memory pressure affecting performance\n"
+        if any(
+            "burst" in issue.lower() or "iops" in issue.lower()
+            for issue in report["issues"]
+        ):
+            summary += "• Storage I/O bottlenecks identified\n"
+
+        return summary
+
+    def _format_report(self, report: Dict) -> str:
+        """Format the report as readable text"""
+        lines = []
+        lines.append(f"RDS Performance Report - {report['instance_id']}")
+        lines.append("=" * 70)
+        lines.append(f"Generated: {report['report_time']}")
+        lines.append(
+            f"Time Range: {report['time_range']['start']} to {report['time_range']['end']}"
+        )
+        lines.append("")
+
+        # Executive Summary
+        lines.append("EXECUTIVE SUMMARY")
+        lines.append("-" * 40)
+        lines.append(report["executive_summary"])
+        lines.append("")
+
+        # Metrics sections
+        for section_name, section_data in report["sections"].items():
+            lines.append(f"{section_name.upper()} METRICS")
+            lines.append("-" * 40)
+
+            if section_data.get("metrics"):
+                lines.append(
+                    f"{'Metric':<25} {'Avg':>10} {'Max':>10} {'Min':>10} {'Latest':>10} {'Unit':>8}"
+                )
+                lines.append("-" * 80)
+
+                for metric_name, data in section_data["metrics"].items():
+                    lines.append(
+                        f"{metric_name:<25} {data['avg']:>10.2f} {data['max']:>10.2f} "
+                        f"{data['min']:>10.2f} {data['latest']:>10.2f} {data['unit']:>8}"
+                    )
+            lines.append("")
+
+        # Issues
+        if report["issues"]:
+            lines.append(f"ISSUES DETECTED ({len(report['issues'])})")
+            lines.append("-" * 40)
+            for i, issue in enumerate(report["issues"], 1):
+                lines.append(f"{i}. {issue}")
+            lines.append("")
+
+        return "\n".join(lines)
+
+    def get_parameterized_one_liner(self, params: Dict[str, Any]) -> str:
+        db_instance = params.get("db_instance_identifier", "unknown")
+        return f"Generating performance report for RDS instance: {db_instance}"
+
+
+class GetTopWorstPerformingRDSInstances(BaseDatadogRDSTool):
+    def __init__(self, toolset: "DatadogRDSToolset"):
+        super().__init__(
+            name="datadog_rds_top_worst_performing",
+            description="Get a summarized report of the top worst performing RDS instances based on latency, CPU utilization, and error rates",
+            parameters={
+                "top_n": ToolParameter(
+                    description=f"Number of worst performing instances to return (default: {DEFAULT_TOP_INSTANCES})",
+                    type="number",
+                    required=False,
+                ),
+                "start_time": ToolParameter(
+                    description=standard_start_datetime_tool_param_description(
+                        DEFAULT_TIME_SPAN_SECONDS
+                    ),
+                    type="string",
+                    required=False,
+                ),
+                "end_time": ToolParameter(
+                    description=STANDARD_END_DATETIME_TOOL_PARAM_DESCRIPTION,
+                    type="string",
+                    required=False,
+                ),
+                "sort_by": ToolParameter(
+                    description="Metric to sort by: 'latency' (default), 'cpu', 'errors', or 'composite'",
+                    type="string",
+                    required=False,
+                ),
+            },
+            toolset=toolset,
+        )
+
+    def _invoke(self, params: Any) -> StructuredToolResult:
+        if not self.toolset.dd_config:
+            return StructuredToolResult(
+                status=ToolResultStatus.ERROR,
+                error=TOOLSET_CONFIG_MISSING_ERROR,
+                params=params,
+            )
+
+        try:
+            top_n = params.get("top_n", self.toolset.dd_config.default_top_instances)
+            sort_by = params.get("sort_by", "latency").lower()
+            start_time, end_time = process_timestamps_to_int(
+                start=params.get("start_time"),
+                end=params.get("end_time"),
+                default_time_span_seconds=self.toolset.dd_config.default_time_span_seconds,
+            )
+
+            # Get all RDS instances
+            instances = self._get_all_rds_instances(start_time, end_time)
+
+            if not instances:
+                return StructuredToolResult(
+                    status=ToolResultStatus.NO_DATA,
+                    data="No RDS instances found with metrics in the specified time range",
+                    params=params,
+                )
+
+            # Collect performance data for each instance
+            instance_performance = []
+            for instance_id in instances[:50]:  # Limit to 50 instances to avoid timeout
+                perf_data = self._get_instance_performance_summary(
+                    instance_id, start_time, end_time
+                )
+                if perf_data:
+                    instance_performance.append(perf_data)
+
+            # Sort by the specified metric
+            instance_performance = self._sort_instances(instance_performance, sort_by)
+
+            # Get top N worst performers
+            worst_performers = instance_performance[:top_n]
+
+            # Format the report
+            report = self._format_summary_report(worst_performers, sort_by)
+
+            report += f"\n\nTotal instances analyzed: {len(instance_performance)}"
+            report += f"\n\nInstances:\n{json.dumps(worst_performers, indent=2)}"
+
+            return StructuredToolResult(
+                status=ToolResultStatus.SUCCESS,
+                data=report,
+                params=params,
+            )
+
+        except Exception as e:
+            logging.error(f"Error getting top worst performing RDS instances: {str(e)}")
+            return StructuredToolResult(
+                status=ToolResultStatus.ERROR,
+                error=f"Failed to get top worst performing RDS instances: {str(e)}",
+                params=params,
+            )
+
+    def _get_all_rds_instances(self, start_time: int, end_time: int) -> List[str]:
+        """Get list of all RDS instances with metrics"""
+        if not self.toolset.dd_config:
+            raise Exception(TOOLSET_CONFIG_MISSING_ERROR)
+        try:
+            # Query for any RDS metric grouped by instance
+            query = "avg:aws.rds.cpuutilization{*} by {dbinstanceidentifier}"
+
+            url = f"{self.toolset.dd_config.site_api_url}/api/v1/query"
+            headers = get_headers(self.toolset.dd_config)
+            payload = {
+                "query": query,
+                "from": start_time,
+                "to": end_time,
+            }
+
+            response = execute_datadog_http_request(
+                url=url,
+                headers=headers,
+                payload_or_params=payload,
+                timeout=self.toolset.dd_config.request_timeout,
+                method="GET",
+            )
+
+            instances = []
+            if response and "series" in response:
+                for series in response["series"]:
+                    # Extract instance ID from tags
+                    scope = series.get("scope", "")
+                    if "dbinstanceidentifier:" in scope:
+                        instance_id = scope.split("dbinstanceidentifier:")[1].split(
+                            ","
+                        )[0]
+                        instances.append(instance_id)
+
+            return list(set(instances))  # Remove duplicates
+
+        except Exception as e:
+            logging.error(f"Error getting RDS instances: {str(e)}")
+            return []
+
+    def _get_instance_performance_summary(
+        self, instance_id: str, start_time: int, end_time: int
+    ) -> Optional[Dict]:
+        """Get performance summary for a single instance"""
+
+        if not self.toolset.dd_config:
+            raise Exception(TOOLSET_CONFIG_MISSING_ERROR)
+
+        summary: dict[str, Any] = {
+            "instance_id": instance_id,
+            "metrics": {},
+            "score": 0,  # Composite score for sorting
+            "issues": [],
+        }
+
+        # Key metrics to collect
+        metrics_to_collect = [
+            ("aws.rds.read_latency", "read_latency", 1.0),  # weight for composite score
+            ("aws.rds.write_latency", "write_latency", 1.0),
+            ("aws.rds.cpuutilization", "cpu_utilization", 0.5),
+            ("aws.rds.database_connections", "connections", 0.2),
+            ("aws.rds.burst_balance", "burst_balance", 0.8),
+        ]
+
+        for metric_name, key, weight in metrics_to_collect:
+            query = f"avg:{metric_name}{{dbinstanceidentifier:{instance_id}}}"
+
+            try:
+                url = f"{self.toolset.dd_config.site_api_url}/api/v1/query"
+                headers = get_headers(self.toolset.dd_config)
+                payload = {
+                    "query": query,
+                    "from": start_time,
+                    "to": end_time,
+                }
+
+                response = execute_datadog_http_request(
+                    url=url,
+                    headers=headers,
+                    payload_or_params=payload,
+                    timeout=self.toolset.dd_config.request_timeout,
+                    method="GET",
+                )
+
+                if response and "series" in response and response["series"]:
+                    series = response["series"][0]
+                    points = series.get("pointlist", [])
+
+                    if points:
+                        values = [p[1] for p in points if p[1] is not None]
+                        if values:
+                            avg_value = sum(values) / len(values)
+                            max_value = max(values)
+
+                            summary["metrics"][key] = {
+                                "avg": round(avg_value, 2),
+                                "max": round(max_value, 2),
+                            }
+
+                            # Calculate contribution to composite score
+                            if key in ["read_latency", "write_latency"]:
+                                # Higher latency = worse performance
+                                score_contrib = avg_value * weight
+                                if avg_value > 10:
+                                    summary["issues"].append(
+                                        f"High {key.replace('_', ' ')}: {avg_value:.1f}ms"
+                                    )
+                            elif key == "cpu_utilization":
+                                # Higher CPU = worse performance
+                                score_contrib = avg_value * weight
+                                if avg_value > 70:
+                                    summary["issues"].append(
+                                        f"High CPU: {avg_value:.1f}%"
+                                    )
+                            elif key == "burst_balance":
+                                # Lower burst balance = worse performance
+                                score_contrib = (100 - avg_value) * weight
+                                if avg_value < 30:
+                                    summary["issues"].append(
+                                        f"Low burst balance: {avg_value:.1f}%"
+                                    )
+                            else:
+                                score_contrib = 0
+
+                            summary["score"] += score_contrib
+
+            except Exception:
+                continue
+
+        return summary if summary["metrics"] else None
+
+    def _sort_instances(self, instances: List[Dict], sort_by: str) -> List[Dict]:
+        """Sort instances by specified metric"""
+        if sort_by == "latency":
+            # Sort by average of read and write latency
+            def latency_key(inst):
+                read_lat = inst["metrics"].get("read_latency", {}).get("avg", 0)
+                write_lat = inst["metrics"].get("write_latency", {}).get("avg", 0)
+                return (read_lat + write_lat) / 2
+
+            return sorted(instances, key=latency_key, reverse=True)
+
+        elif sort_by == "cpu":
+            return sorted(
+                instances,
+                key=lambda x: x["metrics"].get("cpu_utilization", {}).get("avg", 0),
+                reverse=True,
+            )
+
+        elif sort_by == "composite":
+            return sorted(instances, key=lambda x: x["score"], reverse=True)
+
+        else:  # Default to latency
+            return self._sort_instances(instances, "latency")
+
+    def _format_summary_report(self, instances: List[Dict], sort_by: str) -> str:
+        """Format the summary report"""
+        lines = []
+        lines.append("Top Worst Performing RDS Instances")
+        lines.append("=" * 70)
+        lines.append(f"Sorted by: {sort_by}")
+        lines.append(f"Instances shown: {len(instances)}")
+        lines.append("")
+
+        for rank, inst in enumerate(instances, 1):
+            lines.append(f"{rank}. {inst['instance_id']}")
+            lines.append("-" * 40)
+
+            # Show key metrics
+            metrics = inst["metrics"]
+            if "read_latency" in metrics:
+                lines.append(
+                    f"  Read Latency: {metrics['read_latency']['avg']:.1f}ms avg, {metrics['read_latency']['max']:.1f}ms max"
+                )
+            if "write_latency" in metrics:
+                lines.append(
+                    f"  Write Latency: {metrics['write_latency']['avg']:.1f}ms avg, {metrics['write_latency']['max']:.1f}ms max"
+                )
+            if "cpu_utilization" in metrics:
+                lines.append(
+                    f"  CPU Usage: {metrics['cpu_utilization']['avg']:.1f}% avg, {metrics['cpu_utilization']['max']:.1f}% max"
+                )
+            if "burst_balance" in metrics:
+                lines.append(
+                    f"  Burst Balance: {metrics['burst_balance']['avg']:.1f}% avg"
+                )
+
+            # Show issues
+            if inst["issues"]:
+                lines.append("  Issues:")
+                for issue in inst["issues"]:
+                    lines.append(f"    • {issue}")
+
+            lines.append("")
+
+        return "\n".join(lines)
+
+    def get_parameterized_one_liner(self, params: Dict[str, Any]) -> str:
+        top_n = params.get("top_n", DEFAULT_TOP_INSTANCES)
+        sort_by = params.get("sort_by", "latency")
+        return f"Getting top {top_n} worst performing RDS instances sorted by {sort_by}"
+
+
+class DatadogRDSToolset(Toolset):
+    dd_config: Optional[DatadogRDSConfig] = None
+
+    def __init__(self):
+        super().__init__(
+            name="datadog/rds",
+            description="Analyze RDS database performance and identify worst performers using Datadog metrics",
+            tags=[ToolsetTag.CORE],
+            tools=[
+                GenerateRDSPerformanceReport(toolset=self),
+                GetTopWorstPerformingRDSInstances(toolset=self),
+            ],
+        )
+
+    def prerequisites_check(self, config: Dict[str, Any]) -> CallablePrerequisite:
+        def check_datadog_connectivity(config_dict: Dict[str, Any]) -> Tuple[bool, str]:
+            """Check Datadog API connectivity and permissions"""
+            try:
+                # Validate config
+                self.dd_config = DatadogRDSConfig(**config_dict)
+
+                # Test API connectivity
+                url = f"{self.dd_config.site_api_url}/api/v1/validate"
+                headers = get_headers(self.dd_config)
+
+                response = execute_datadog_http_request(
+                    url=url,
+                    headers=headers,
+                    payload_or_params={},
+                    timeout=self.dd_config.request_timeout,
+                    method="GET",
+                )
+
+                if response and response.get("valid", False):
+                    # Test metrics API access
+                    metrics_url = f"{self.dd_config.site_api_url}/api/v1/metrics"
+                    execute_datadog_http_request(
+                        url=metrics_url,
+                        headers=headers,
+                        payload_or_params={"from": 0},
+                        timeout=self.dd_config.request_timeout,
+                        method="GET",
+                    )
+                    return True, ""
+                else:
+                    return False, "Invalid Datadog API credentials"
+
+            except DataDogRequestError as e:
+                if e.status_code == 403:
+                    return False, "Invalid Datadog API keys or insufficient permissions"
+                else:
+                    return False, f"Datadog API error: {str(e)}"
+            except Exception as e:
+                return False, f"Failed to initialize Datadog RDS toolset: {str(e)}"
+
+        return CallablePrerequisite(callable=check_datadog_connectivity)
+
+    def post_init(self, config: dict):
+        """Load LLM instructions after initialization"""
+        self._reload_instructions()
+
+    def _reload_instructions(self):
+        """Load RDS analysis specific instructions"""
+        template_file_path = os.path.abspath(
+            os.path.join(os.path.dirname(__file__), "datadog_rds_instructions.jinja2")
+        )
+        self._load_llm_instructions(jinja_template=f"file://{template_file_path}")
+
+    def get_example_config(self) -> Dict[str, Any]:
+        """Get example configuration for this toolset."""
+        return {
+            "dd_api_key": "your-datadog-api-key",
+            "dd_app_key": "your-datadog-application-key",
+            "site_api_url": "https://api.datadoghq.com",
+            "default_time_span_seconds": 3600,
+            "default_top_instances": 10,
+        }
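
For orientation, here is a minimal usage sketch of the new toolset. It is not part of the diff: it assumes Toolset exposes the tools list passed to its constructor and that CallablePrerequisite keeps the function it is constructed with as .callable; the credential values and the "my-db" instance identifier are placeholders.

# Hypothetical driver; mirrors the classes in the diff above, but credential
# values and the "my-db" instance identifier are placeholders.
from holmes.plugins.toolsets.datadog.toolset_datadog_rds import DatadogRDSToolset

toolset = DatadogRDSToolset()

# prerequisites_check() wraps check_datadog_connectivity, which validates the
# config against /api/v1/validate and stores it on toolset.dd_config.
prereq = toolset.prerequisites_check({})
ok, error = prereq.callable(
    {
        "dd_api_key": "your-datadog-api-key",
        "dd_app_key": "your-datadog-application-key",
        "site_api_url": "https://api.datadoghq.com",
    }
)
if not ok:
    raise SystemExit(f"Datadog prerequisites failed: {error}")

# Tools registered in DatadogRDSToolset.__init__: index 0 is the per-instance
# report, index 1 is the fleet-wide "worst performers" summary.
report_tool, top_tool = toolset.tools

print(report_tool._invoke({"db_instance_identifier": "my-db"}).data)
print(top_tool._invoke({"top_n": 5, "sort_by": "composite"}).data)

One behavioral note visible in the diff: the sort_by description advertises 'errors', but _sort_instances only implements 'latency', 'cpu', and 'composite', falling back to the latency ordering for anything else.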