holmesgpt 0.13.2__py3-none-any.whl → 0.18.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- holmes/__init__.py +3 -5
- holmes/clients/robusta_client.py +20 -6
- holmes/common/env_vars.py +58 -3
- holmes/common/openshift.py +1 -1
- holmes/config.py +123 -148
- holmes/core/conversations.py +71 -15
- holmes/core/feedback.py +191 -0
- holmes/core/investigation.py +31 -39
- holmes/core/investigation_structured_output.py +3 -3
- holmes/core/issue.py +1 -1
- holmes/core/llm.py +508 -88
- holmes/core/models.py +108 -4
- holmes/core/openai_formatting.py +14 -1
- holmes/core/prompt.py +48 -3
- holmes/core/runbooks.py +1 -0
- holmes/core/safeguards.py +8 -6
- holmes/core/supabase_dal.py +295 -100
- holmes/core/tool_calling_llm.py +489 -428
- holmes/core/tools.py +325 -56
- holmes/core/tools_utils/token_counting.py +21 -0
- holmes/core/tools_utils/tool_context_window_limiter.py +40 -0
- holmes/core/tools_utils/tool_executor.py +0 -13
- holmes/core/tools_utils/toolset_utils.py +1 -0
- holmes/core/toolset_manager.py +191 -5
- holmes/core/tracing.py +19 -3
- holmes/core/transformers/__init__.py +23 -0
- holmes/core/transformers/base.py +63 -0
- holmes/core/transformers/llm_summarize.py +175 -0
- holmes/core/transformers/registry.py +123 -0
- holmes/core/transformers/transformer.py +32 -0
- holmes/core/truncation/compaction.py +94 -0
- holmes/core/truncation/dal_truncation_utils.py +23 -0
- holmes/core/truncation/input_context_window_limiter.py +219 -0
- holmes/interactive.py +228 -31
- holmes/main.py +23 -40
- holmes/plugins/interfaces.py +2 -1
- holmes/plugins/prompts/__init__.py +2 -1
- holmes/plugins/prompts/_fetch_logs.jinja2 +31 -6
- holmes/plugins/prompts/_general_instructions.jinja2 +1 -2
- holmes/plugins/prompts/_runbook_instructions.jinja2 +24 -12
- holmes/plugins/prompts/base_user_prompt.jinja2 +7 -0
- holmes/plugins/prompts/conversation_history_compaction.jinja2 +89 -0
- holmes/plugins/prompts/generic_ask.jinja2 +0 -4
- holmes/plugins/prompts/generic_ask_conversation.jinja2 +0 -1
- holmes/plugins/prompts/generic_ask_for_issue_conversation.jinja2 +0 -1
- holmes/plugins/prompts/generic_investigation.jinja2 +0 -1
- holmes/plugins/prompts/investigation_procedure.jinja2 +50 -1
- holmes/plugins/prompts/kubernetes_workload_ask.jinja2 +0 -1
- holmes/plugins/prompts/kubernetes_workload_chat.jinja2 +0 -1
- holmes/plugins/runbooks/__init__.py +145 -17
- holmes/plugins/runbooks/catalog.json +2 -0
- holmes/plugins/sources/github/__init__.py +4 -2
- holmes/plugins/sources/prometheus/models.py +1 -0
- holmes/plugins/toolsets/__init__.py +44 -27
- holmes/plugins/toolsets/aks-node-health.yaml +46 -0
- holmes/plugins/toolsets/aks.yaml +64 -0
- holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +38 -47
- holmes/plugins/toolsets/azure_sql/apis/alert_monitoring_api.py +3 -2
- holmes/plugins/toolsets/azure_sql/apis/azure_sql_api.py +2 -1
- holmes/plugins/toolsets/azure_sql/apis/connection_failure_api.py +3 -2
- holmes/plugins/toolsets/azure_sql/apis/connection_monitoring_api.py +3 -1
- holmes/plugins/toolsets/azure_sql/apis/storage_analysis_api.py +3 -1
- holmes/plugins/toolsets/azure_sql/azure_sql_toolset.py +12 -13
- holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +15 -12
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +15 -12
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +11 -11
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +11 -9
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +15 -12
- holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +15 -15
- holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +11 -8
- holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +11 -8
- holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +11 -8
- holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +11 -8
- holmes/plugins/toolsets/azure_sql/utils.py +0 -32
- holmes/plugins/toolsets/bash/argocd/__init__.py +3 -3
- holmes/plugins/toolsets/bash/aws/__init__.py +4 -4
- holmes/plugins/toolsets/bash/azure/__init__.py +4 -4
- holmes/plugins/toolsets/bash/bash_toolset.py +11 -15
- holmes/plugins/toolsets/bash/common/bash.py +23 -13
- holmes/plugins/toolsets/bash/common/bash_command.py +1 -1
- holmes/plugins/toolsets/bash/common/stringify.py +1 -1
- holmes/plugins/toolsets/bash/kubectl/__init__.py +2 -1
- holmes/plugins/toolsets/bash/kubectl/constants.py +0 -1
- holmes/plugins/toolsets/bash/kubectl/kubectl_get.py +3 -4
- holmes/plugins/toolsets/bash/parse_command.py +12 -13
- holmes/plugins/toolsets/cilium.yaml +284 -0
- holmes/plugins/toolsets/connectivity_check.py +124 -0
- holmes/plugins/toolsets/coralogix/api.py +132 -119
- holmes/plugins/toolsets/coralogix/coralogix.jinja2 +14 -0
- holmes/plugins/toolsets/coralogix/toolset_coralogix.py +219 -0
- holmes/plugins/toolsets/coralogix/utils.py +15 -79
- holmes/plugins/toolsets/datadog/datadog_api.py +525 -26
- holmes/plugins/toolsets/datadog/datadog_logs_instructions.jinja2 +55 -11
- holmes/plugins/toolsets/datadog/datadog_metrics_instructions.jinja2 +3 -3
- holmes/plugins/toolsets/datadog/datadog_models.py +59 -0
- holmes/plugins/toolsets/datadog/datadog_url_utils.py +213 -0
- holmes/plugins/toolsets/datadog/instructions_datadog_traces.jinja2 +165 -28
- holmes/plugins/toolsets/datadog/toolset_datadog_general.py +417 -241
- holmes/plugins/toolsets/datadog/toolset_datadog_logs.py +234 -214
- holmes/plugins/toolsets/datadog/toolset_datadog_metrics.py +167 -79
- holmes/plugins/toolsets/datadog/toolset_datadog_traces.py +374 -363
- holmes/plugins/toolsets/elasticsearch/__init__.py +6 -0
- holmes/plugins/toolsets/elasticsearch/elasticsearch.py +834 -0
- holmes/plugins/toolsets/elasticsearch/opensearch_ppl_query_docs.jinja2 +1616 -0
- holmes/plugins/toolsets/elasticsearch/opensearch_query_assist.py +78 -0
- holmes/plugins/toolsets/elasticsearch/opensearch_query_assist_instructions.jinja2 +223 -0
- holmes/plugins/toolsets/git.py +54 -50
- holmes/plugins/toolsets/grafana/base_grafana_toolset.py +16 -4
- holmes/plugins/toolsets/grafana/common.py +13 -29
- holmes/plugins/toolsets/grafana/grafana_tempo_api.py +455 -0
- holmes/plugins/toolsets/grafana/loki/instructions.jinja2 +25 -0
- holmes/plugins/toolsets/grafana/loki/toolset_grafana_loki.py +191 -0
- holmes/plugins/toolsets/grafana/loki_api.py +4 -0
- holmes/plugins/toolsets/grafana/toolset_grafana.py +293 -89
- holmes/plugins/toolsets/grafana/toolset_grafana_dashboard.jinja2 +49 -0
- holmes/plugins/toolsets/grafana/toolset_grafana_tempo.jinja2 +246 -11
- holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +820 -292
- holmes/plugins/toolsets/grafana/trace_parser.py +4 -3
- holmes/plugins/toolsets/internet/internet.py +15 -16
- holmes/plugins/toolsets/internet/notion.py +9 -11
- holmes/plugins/toolsets/investigator/core_investigation.py +44 -36
- holmes/plugins/toolsets/investigator/model.py +3 -1
- holmes/plugins/toolsets/json_filter_mixin.py +134 -0
- holmes/plugins/toolsets/kafka.py +36 -42
- holmes/plugins/toolsets/kubernetes.yaml +317 -113
- holmes/plugins/toolsets/kubernetes_logs.py +9 -9
- holmes/plugins/toolsets/kubernetes_logs.yaml +32 -0
- holmes/plugins/toolsets/logging_utils/logging_api.py +94 -8
- holmes/plugins/toolsets/mcp/toolset_mcp.py +218 -64
- holmes/plugins/toolsets/newrelic/new_relic_api.py +165 -0
- holmes/plugins/toolsets/newrelic/newrelic.jinja2 +65 -0
- holmes/plugins/toolsets/newrelic/newrelic.py +320 -0
- holmes/plugins/toolsets/openshift.yaml +283 -0
- holmes/plugins/toolsets/prometheus/prometheus.py +1202 -421
- holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2 +54 -5
- holmes/plugins/toolsets/prometheus/utils.py +28 -0
- holmes/plugins/toolsets/rabbitmq/api.py +23 -4
- holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +13 -14
- holmes/plugins/toolsets/robusta/robusta.py +239 -68
- holmes/plugins/toolsets/robusta/robusta_instructions.jinja2 +26 -9
- holmes/plugins/toolsets/runbook/runbook_fetcher.py +157 -27
- holmes/plugins/toolsets/service_discovery.py +1 -1
- holmes/plugins/toolsets/servicenow_tables/instructions.jinja2 +83 -0
- holmes/plugins/toolsets/servicenow_tables/servicenow_tables.py +426 -0
- holmes/plugins/toolsets/utils.py +88 -0
- holmes/utils/config_utils.py +91 -0
- holmes/utils/connection_utils.py +31 -0
- holmes/utils/console/result.py +10 -0
- holmes/utils/default_toolset_installation_guide.jinja2 +1 -22
- holmes/utils/env.py +7 -0
- holmes/utils/file_utils.py +2 -1
- holmes/utils/global_instructions.py +60 -11
- holmes/utils/holmes_status.py +6 -4
- holmes/utils/holmes_sync_toolsets.py +0 -2
- holmes/utils/krr_utils.py +188 -0
- holmes/utils/log.py +15 -0
- holmes/utils/markdown_utils.py +2 -3
- holmes/utils/memory_limit.py +58 -0
- holmes/utils/sentry_helper.py +64 -0
- holmes/utils/stream.py +69 -8
- holmes/utils/tags.py +4 -3
- holmes/version.py +37 -15
- holmesgpt-0.18.4.dist-info/LICENSE +178 -0
- {holmesgpt-0.13.2.dist-info → holmesgpt-0.18.4.dist-info}/METADATA +35 -31
- holmesgpt-0.18.4.dist-info/RECORD +258 -0
- holmes/core/performance_timing.py +0 -72
- holmes/plugins/toolsets/aws.yaml +0 -80
- holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +0 -112
- holmes/plugins/toolsets/datadog/datadog_traces_formatter.py +0 -310
- holmes/plugins/toolsets/datadog/toolset_datadog_rds.py +0 -739
- holmes/plugins/toolsets/grafana/grafana_api.py +0 -42
- holmes/plugins/toolsets/grafana/tempo_api.py +0 -124
- holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +0 -110
- holmes/plugins/toolsets/newrelic.py +0 -231
- holmes/plugins/toolsets/opensearch/opensearch.py +0 -257
- holmes/plugins/toolsets/opensearch/opensearch_logs.py +0 -161
- holmes/plugins/toolsets/opensearch/opensearch_traces.py +0 -218
- holmes/plugins/toolsets/opensearch/opensearch_traces_instructions.jinja2 +0 -12
- holmes/plugins/toolsets/opensearch/opensearch_utils.py +0 -166
- holmes/plugins/toolsets/servicenow/install.md +0 -37
- holmes/plugins/toolsets/servicenow/instructions.jinja2 +0 -3
- holmes/plugins/toolsets/servicenow/servicenow.py +0 -219
- holmes/utils/keygen_utils.py +0 -6
- holmesgpt-0.13.2.dist-info/LICENSE.txt +0 -21
- holmesgpt-0.13.2.dist-info/RECORD +0 -234
- /holmes/plugins/toolsets/{opensearch → newrelic}/__init__.py +0 -0
- {holmesgpt-0.13.2.dist-info → holmesgpt-0.18.4.dist-info}/WHEEL +0 -0
- {holmesgpt-0.13.2.dist-info → holmesgpt-0.18.4.dist-info}/entry_points.txt +0 -0
|
@@ -1,739 +0,0 @@
|
|
|
1
|
-
import json
|
|
2
|
-
import logging
|
|
3
|
-
import os
|
|
4
|
-
from datetime import datetime, timezone
|
|
5
|
-
from typing import Any, Dict, List, Optional, Tuple
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
from holmes.core.tools import (
|
|
9
|
-
CallablePrerequisite,
|
|
10
|
-
StructuredToolResult,
|
|
11
|
-
Tool,
|
|
12
|
-
ToolParameter,
|
|
13
|
-
ToolResultStatus,
|
|
14
|
-
Toolset,
|
|
15
|
-
ToolsetTag,
|
|
16
|
-
)
|
|
17
|
-
from holmes.plugins.toolsets.consts import (
|
|
18
|
-
TOOLSET_CONFIG_MISSING_ERROR,
|
|
19
|
-
STANDARD_END_DATETIME_TOOL_PARAM_DESCRIPTION,
|
|
20
|
-
)
|
|
21
|
-
from holmes.plugins.toolsets.datadog.datadog_api import (
|
|
22
|
-
DatadogBaseConfig,
|
|
23
|
-
DataDogRequestError,
|
|
24
|
-
execute_datadog_http_request,
|
|
25
|
-
get_headers,
|
|
26
|
-
)
|
|
27
|
-
from holmes.plugins.toolsets.utils import (
|
|
28
|
-
get_param_or_raise,
|
|
29
|
-
process_timestamps_to_int,
|
|
30
|
-
standard_start_datetime_tool_param_description,
|
|
31
|
-
)
|
|
32
|
-
|
|
33
|
-
DEFAULT_TIME_SPAN_SECONDS = 3600
|
|
34
|
-
DEFAULT_TOP_INSTANCES = 10
|
|
35
|
-
|
|
36
|
-
# Metric definitions
|
|
37
|
-
LATENCY_METRICS = [
|
|
38
|
-
("aws.rds.read_latency", "Read Latency", "ms"),
|
|
39
|
-
("aws.rds.write_latency", "Write Latency", "ms"),
|
|
40
|
-
("aws.rds.commit_latency", "Commit Latency", "ms"),
|
|
41
|
-
("aws.rds.disk_queue_depth", "Disk Queue Depth", ""),
|
|
42
|
-
]
|
|
43
|
-
|
|
44
|
-
RESOURCE_METRICS = [
|
|
45
|
-
("aws.rds.cpuutilization", "CPU Utilization", "%"),
|
|
46
|
-
("aws.rds.database_connections", "Database Connections", "connections"),
|
|
47
|
-
("aws.rds.freeable_memory", "Freeable Memory", "bytes"),
|
|
48
|
-
("aws.rds.swap_usage", "Swap Usage", "bytes"),
|
|
49
|
-
]
|
|
50
|
-
|
|
51
|
-
STORAGE_METRICS = [
|
|
52
|
-
("aws.rds.read_iops", "Read IOPS", "iops"),
|
|
53
|
-
("aws.rds.write_iops", "Write IOPS", "iops"),
|
|
54
|
-
("aws.rds.burst_balance", "Burst Balance", "%"),
|
|
55
|
-
("aws.rds.free_storage_space", "Free Storage Space", "bytes"),
|
|
56
|
-
]
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
class DatadogRDSConfig(DatadogBaseConfig):
|
|
60
|
-
default_time_span_seconds: int = DEFAULT_TIME_SPAN_SECONDS
|
|
61
|
-
default_top_instances: int = DEFAULT_TOP_INSTANCES
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
class BaseDatadogRDSTool(Tool):
|
|
65
|
-
toolset: "DatadogRDSToolset"
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
class GenerateRDSPerformanceReport(BaseDatadogRDSTool):
|
|
69
|
-
def __init__(self, toolset: "DatadogRDSToolset"):
|
|
70
|
-
super().__init__(
|
|
71
|
-
name="datadog_rds_performance_report",
|
|
72
|
-
description="Generate a comprehensive performance report for a specific RDS instance including latency, resource utilization, and storage metrics with analysis",
|
|
73
|
-
parameters={
|
|
74
|
-
"db_instance_identifier": ToolParameter(
|
|
75
|
-
description="The RDS database instance identifier",
|
|
76
|
-
type="string",
|
|
77
|
-
required=True,
|
|
78
|
-
),
|
|
79
|
-
"start_time": ToolParameter(
|
|
80
|
-
description=standard_start_datetime_tool_param_description(
|
|
81
|
-
DEFAULT_TIME_SPAN_SECONDS
|
|
82
|
-
),
|
|
83
|
-
type="string",
|
|
84
|
-
required=False,
|
|
85
|
-
),
|
|
86
|
-
"end_time": ToolParameter(
|
|
87
|
-
description=STANDARD_END_DATETIME_TOOL_PARAM_DESCRIPTION,
|
|
88
|
-
type="string",
|
|
89
|
-
required=False,
|
|
90
|
-
),
|
|
91
|
-
},
|
|
92
|
-
toolset=toolset,
|
|
93
|
-
)
|
|
94
|
-
|
|
95
|
-
def _invoke(
|
|
96
|
-
self, params: dict, user_approved: bool = False
|
|
97
|
-
) -> StructuredToolResult:
|
|
98
|
-
if not self.toolset.dd_config:
|
|
99
|
-
return StructuredToolResult(
|
|
100
|
-
status=ToolResultStatus.ERROR,
|
|
101
|
-
error=TOOLSET_CONFIG_MISSING_ERROR,
|
|
102
|
-
params=params,
|
|
103
|
-
)
|
|
104
|
-
|
|
105
|
-
try:
|
|
106
|
-
db_instance = get_param_or_raise(params, "db_instance_identifier")
|
|
107
|
-
start_time, end_time = process_timestamps_to_int(
|
|
108
|
-
start=params.get("start_time"),
|
|
109
|
-
end=params.get("end_time"),
|
|
110
|
-
default_time_span_seconds=self.toolset.dd_config.default_time_span_seconds,
|
|
111
|
-
)
|
|
112
|
-
|
|
113
|
-
report: dict[str, Any] = {
|
|
114
|
-
"instance_id": db_instance,
|
|
115
|
-
"report_time": datetime.now(timezone.utc).isoformat(),
|
|
116
|
-
"time_range": {
|
|
117
|
-
"start": datetime.fromtimestamp(
|
|
118
|
-
start_time, tz=timezone.utc
|
|
119
|
-
).isoformat(),
|
|
120
|
-
"end": datetime.fromtimestamp(
|
|
121
|
-
end_time, tz=timezone.utc
|
|
122
|
-
).isoformat(),
|
|
123
|
-
},
|
|
124
|
-
"sections": {},
|
|
125
|
-
"issues": [],
|
|
126
|
-
"executive_summary": "",
|
|
127
|
-
}
|
|
128
|
-
|
|
129
|
-
# Collect all metrics
|
|
130
|
-
all_metrics = []
|
|
131
|
-
for metric_group, group_name in [
|
|
132
|
-
(LATENCY_METRICS, "latency"),
|
|
133
|
-
(RESOURCE_METRICS, "resources"),
|
|
134
|
-
(STORAGE_METRICS, "storage"),
|
|
135
|
-
]:
|
|
136
|
-
section_data = self._collect_metrics(
|
|
137
|
-
db_instance, metric_group, start_time, end_time
|
|
138
|
-
)
|
|
139
|
-
if section_data:
|
|
140
|
-
report["sections"][group_name] = section_data
|
|
141
|
-
all_metrics.extend(section_data.get("metrics", {}).items())
|
|
142
|
-
|
|
143
|
-
# Analyze metrics and generate insights
|
|
144
|
-
self._analyze_metrics(report, all_metrics)
|
|
145
|
-
|
|
146
|
-
# Generate executive summary
|
|
147
|
-
report["executive_summary"] = self._generate_executive_summary(report)
|
|
148
|
-
|
|
149
|
-
# Format the report as readable text
|
|
150
|
-
formatted_report = self._format_report(report)
|
|
151
|
-
|
|
152
|
-
return StructuredToolResult(
|
|
153
|
-
status=ToolResultStatus.SUCCESS,
|
|
154
|
-
data=formatted_report,
|
|
155
|
-
params=params,
|
|
156
|
-
)
|
|
157
|
-
|
|
158
|
-
except Exception as e:
|
|
159
|
-
logging.error(f"Error generating RDS performance report: {str(e)}")
|
|
160
|
-
return StructuredToolResult(
|
|
161
|
-
status=ToolResultStatus.ERROR,
|
|
162
|
-
error=f"Failed to generate RDS performance report: {str(e)}",
|
|
163
|
-
params=params,
|
|
164
|
-
)
|
|
165
|
-
|
|
166
|
-
def _collect_metrics(
|
|
167
|
-
self,
|
|
168
|
-
db_instance: str,
|
|
169
|
-
metric_list: List[Tuple[str, str, str]],
|
|
170
|
-
start_time: int,
|
|
171
|
-
end_time: int,
|
|
172
|
-
) -> Dict[str, Any]:
|
|
173
|
-
"""Collect metrics for a specific group"""
|
|
174
|
-
if not self.toolset.dd_config:
|
|
175
|
-
raise Exception(TOOLSET_CONFIG_MISSING_ERROR)
|
|
176
|
-
|
|
177
|
-
metrics = {}
|
|
178
|
-
|
|
179
|
-
for metric_name, display_name, unit in metric_list:
|
|
180
|
-
query = f"{metric_name}{{dbinstanceidentifier:{db_instance}}}"
|
|
181
|
-
|
|
182
|
-
try:
|
|
183
|
-
url = f"{self.toolset.dd_config.site_api_url}/api/v1/query"
|
|
184
|
-
headers = get_headers(self.toolset.dd_config)
|
|
185
|
-
payload = {
|
|
186
|
-
"query": query,
|
|
187
|
-
"from": start_time,
|
|
188
|
-
"to": end_time,
|
|
189
|
-
}
|
|
190
|
-
|
|
191
|
-
response = execute_datadog_http_request(
|
|
192
|
-
url=url,
|
|
193
|
-
headers=headers,
|
|
194
|
-
payload_or_params=payload,
|
|
195
|
-
timeout=self.toolset.dd_config.request_timeout,
|
|
196
|
-
method="GET",
|
|
197
|
-
)
|
|
198
|
-
|
|
199
|
-
if response and "series" in response and response["series"]:
|
|
200
|
-
series = response["series"][0]
|
|
201
|
-
points = series.get("pointlist", [])
|
|
202
|
-
|
|
203
|
-
if points:
|
|
204
|
-
values = [p[1] for p in points if p[1] is not None]
|
|
205
|
-
if values:
|
|
206
|
-
metrics[display_name] = {
|
|
207
|
-
"unit": unit
|
|
208
|
-
or series.get("unit", [{"short_name": ""}])[0].get(
|
|
209
|
-
"short_name", ""
|
|
210
|
-
),
|
|
211
|
-
"avg": round(sum(values) / len(values), 2),
|
|
212
|
-
"max": round(max(values), 2),
|
|
213
|
-
"min": round(min(values), 2),
|
|
214
|
-
"latest": round(values[-1], 2),
|
|
215
|
-
"data_points": len(values),
|
|
216
|
-
}
|
|
217
|
-
except DataDogRequestError:
|
|
218
|
-
continue
|
|
219
|
-
|
|
220
|
-
return {"metrics": metrics} if metrics else {}
|
|
221
|
-
|
|
222
|
-
def _analyze_metrics(self, report: Dict, all_metrics: List[Tuple[str, Dict]]):
|
|
223
|
-
"""Analyze metrics and generate issues"""
|
|
224
|
-
for metric_name, data in all_metrics:
|
|
225
|
-
# Latency analysis
|
|
226
|
-
if "Latency" in metric_name and metric_name != "Commit Latency":
|
|
227
|
-
if data["avg"] > 10:
|
|
228
|
-
report["issues"].append(
|
|
229
|
-
f"{metric_name} averaging {data['avg']}ms (above 10ms threshold)"
|
|
230
|
-
)
|
|
231
|
-
if data["max"] > 50:
|
|
232
|
-
report["issues"].append(f"{metric_name} peaked at {data['max']}ms")
|
|
233
|
-
|
|
234
|
-
# Disk queue depth
|
|
235
|
-
elif metric_name == "Disk Queue Depth":
|
|
236
|
-
if data["avg"] > 5:
|
|
237
|
-
report["issues"].append(
|
|
238
|
-
f"High disk queue depth (avg: {data['avg']})"
|
|
239
|
-
)
|
|
240
|
-
|
|
241
|
-
# CPU utilization
|
|
242
|
-
elif metric_name == "CPU Utilization":
|
|
243
|
-
if data["avg"] > 70:
|
|
244
|
-
report["issues"].append(
|
|
245
|
-
f"High CPU utilization (avg: {data['avg']}%)"
|
|
246
|
-
)
|
|
247
|
-
if data["max"] > 90:
|
|
248
|
-
report["issues"].append(
|
|
249
|
-
f"CPU saturation detected (max: {data['max']}%)"
|
|
250
|
-
)
|
|
251
|
-
|
|
252
|
-
# Memory
|
|
253
|
-
elif metric_name == "Freeable Memory":
|
|
254
|
-
if data["min"] < 100 * 1024 * 1024: # 100MB
|
|
255
|
-
report["issues"].append(
|
|
256
|
-
f"Low memory availability (min: {data['min'] / 1024 / 1024:.1f}MB)"
|
|
257
|
-
)
|
|
258
|
-
|
|
259
|
-
# Swap usage
|
|
260
|
-
elif metric_name == "Swap Usage":
|
|
261
|
-
if data["avg"] > 0:
|
|
262
|
-
report["issues"].append(
|
|
263
|
-
"Swap usage detected, indicating memory pressure"
|
|
264
|
-
)
|
|
265
|
-
|
|
266
|
-
# Burst balance
|
|
267
|
-
elif metric_name == "Burst Balance":
|
|
268
|
-
if data["min"] < 30:
|
|
269
|
-
report["issues"].append(
|
|
270
|
-
f"Low burst balance detected (min: {data['min']}%)"
|
|
271
|
-
)
|
|
272
|
-
|
|
273
|
-
# IOPS
|
|
274
|
-
elif "IOPS" in metric_name:
|
|
275
|
-
if data["max"] > 3000:
|
|
276
|
-
report["issues"].append(
|
|
277
|
-
f"High {metric_name} (max: {data['max']} IOPS)"
|
|
278
|
-
)
|
|
279
|
-
|
|
280
|
-
def _generate_executive_summary(self, report: Dict) -> str:
|
|
281
|
-
"""Generate executive summary"""
|
|
282
|
-
issue_count = len(report["issues"])
|
|
283
|
-
|
|
284
|
-
if issue_count == 0:
|
|
285
|
-
return "Database is operating within normal parameters. No significant issues detected."
|
|
286
|
-
elif issue_count <= 2:
|
|
287
|
-
severity = "Low"
|
|
288
|
-
elif issue_count <= 5:
|
|
289
|
-
severity = "Medium"
|
|
290
|
-
else:
|
|
291
|
-
severity = "High"
|
|
292
|
-
|
|
293
|
-
summary = f"Performance diagnosis: {severity} severity - {issue_count} issues detected.\n\n"
|
|
294
|
-
|
|
295
|
-
# Add key findings
|
|
296
|
-
if any("latency" in issue.lower() for issue in report["issues"]):
|
|
297
|
-
summary += "• Latency issues affecting database response times\n"
|
|
298
|
-
if any("cpu" in issue.lower() for issue in report["issues"]):
|
|
299
|
-
summary += "• CPU resource constraints detected\n"
|
|
300
|
-
if any(
|
|
301
|
-
"memory" in issue.lower() or "swap" in issue.lower()
|
|
302
|
-
for issue in report["issues"]
|
|
303
|
-
):
|
|
304
|
-
summary += "• Memory pressure affecting performance\n"
|
|
305
|
-
if any(
|
|
306
|
-
"burst" in issue.lower() or "iops" in issue.lower()
|
|
307
|
-
for issue in report["issues"]
|
|
308
|
-
):
|
|
309
|
-
summary += "• Storage I/O bottlenecks identified\n"
|
|
310
|
-
|
|
311
|
-
return summary
|
|
312
|
-
|
|
313
|
-
def _format_report(self, report: Dict) -> str:
|
|
314
|
-
"""Format the report as readable text"""
|
|
315
|
-
lines = []
|
|
316
|
-
lines.append(f"RDS Performance Report - {report['instance_id']}")
|
|
317
|
-
lines.append("=" * 70)
|
|
318
|
-
lines.append(f"Generated: {report['report_time']}")
|
|
319
|
-
lines.append(
|
|
320
|
-
f"Time Range: {report['time_range']['start']} to {report['time_range']['end']}"
|
|
321
|
-
)
|
|
322
|
-
lines.append("")
|
|
323
|
-
|
|
324
|
-
# Executive Summary
|
|
325
|
-
lines.append("EXECUTIVE SUMMARY")
|
|
326
|
-
lines.append("-" * 40)
|
|
327
|
-
lines.append(report["executive_summary"])
|
|
328
|
-
lines.append("")
|
|
329
|
-
|
|
330
|
-
# Metrics sections
|
|
331
|
-
for section_name, section_data in report["sections"].items():
|
|
332
|
-
lines.append(f"{section_name.upper()} METRICS")
|
|
333
|
-
lines.append("-" * 40)
|
|
334
|
-
|
|
335
|
-
if section_data.get("metrics"):
|
|
336
|
-
lines.append(
|
|
337
|
-
f"{'Metric':<25} {'Avg':>10} {'Max':>10} {'Min':>10} {'Latest':>10} {'Unit':>8}"
|
|
338
|
-
)
|
|
339
|
-
lines.append("-" * 80)
|
|
340
|
-
|
|
341
|
-
for metric_name, data in section_data["metrics"].items():
|
|
342
|
-
lines.append(
|
|
343
|
-
f"{metric_name:<25} {data['avg']:>10.2f} {data['max']:>10.2f} "
|
|
344
|
-
f"{data['min']:>10.2f} {data['latest']:>10.2f} {data['unit']:>8}"
|
|
345
|
-
)
|
|
346
|
-
lines.append("")
|
|
347
|
-
|
|
348
|
-
# Issues
|
|
349
|
-
if report["issues"]:
|
|
350
|
-
lines.append(f"ISSUES DETECTED ({len(report['issues'])})")
|
|
351
|
-
lines.append("-" * 40)
|
|
352
|
-
for i, issue in enumerate(report["issues"], 1):
|
|
353
|
-
lines.append(f"{i}. {issue}")
|
|
354
|
-
lines.append("")
|
|
355
|
-
|
|
356
|
-
return "\n".join(lines)
|
|
357
|
-
|
|
358
|
-
def get_parameterized_one_liner(self, params: Dict[str, Any]) -> str:
|
|
359
|
-
db_instance = params.get("db_instance_identifier", "unknown")
|
|
360
|
-
return f"Generating performance report for RDS instance: {db_instance}"
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
class GetTopWorstPerformingRDSInstances(BaseDatadogRDSTool):
|
|
364
|
-
def __init__(self, toolset: "DatadogRDSToolset"):
|
|
365
|
-
super().__init__(
|
|
366
|
-
name="datadog_rds_top_worst_performing",
|
|
367
|
-
description="Get a summarized report of the top worst performing RDS instances based on latency, CPU utilization, and error rates",
|
|
368
|
-
parameters={
|
|
369
|
-
"top_n": ToolParameter(
|
|
370
|
-
description=f"Number of worst performing instances to return (default: {DEFAULT_TOP_INSTANCES})",
|
|
371
|
-
type="number",
|
|
372
|
-
required=False,
|
|
373
|
-
),
|
|
374
|
-
"start_time": ToolParameter(
|
|
375
|
-
description=standard_start_datetime_tool_param_description(
|
|
376
|
-
DEFAULT_TIME_SPAN_SECONDS
|
|
377
|
-
),
|
|
378
|
-
type="string",
|
|
379
|
-
required=False,
|
|
380
|
-
),
|
|
381
|
-
"end_time": ToolParameter(
|
|
382
|
-
description=STANDARD_END_DATETIME_TOOL_PARAM_DESCRIPTION,
|
|
383
|
-
type="string",
|
|
384
|
-
required=False,
|
|
385
|
-
),
|
|
386
|
-
"sort_by": ToolParameter(
|
|
387
|
-
description="Metric to sort by: 'latency' (default), 'cpu', 'errors', or 'composite'",
|
|
388
|
-
type="string",
|
|
389
|
-
required=False,
|
|
390
|
-
),
|
|
391
|
-
},
|
|
392
|
-
toolset=toolset,
|
|
393
|
-
)
|
|
394
|
-
|
|
395
|
-
def _invoke(
|
|
396
|
-
self, params: dict, user_approved: bool = False
|
|
397
|
-
) -> StructuredToolResult:
|
|
398
|
-
if not self.toolset.dd_config:
|
|
399
|
-
return StructuredToolResult(
|
|
400
|
-
status=ToolResultStatus.ERROR,
|
|
401
|
-
error=TOOLSET_CONFIG_MISSING_ERROR,
|
|
402
|
-
params=params,
|
|
403
|
-
)
|
|
404
|
-
|
|
405
|
-
try:
|
|
406
|
-
top_n = params.get("top_n", self.toolset.dd_config.default_top_instances)
|
|
407
|
-
sort_by = params.get("sort_by", "latency").lower()
|
|
408
|
-
start_time, end_time = process_timestamps_to_int(
|
|
409
|
-
start=params.get("start_time"),
|
|
410
|
-
end=params.get("end_time"),
|
|
411
|
-
default_time_span_seconds=self.toolset.dd_config.default_time_span_seconds,
|
|
412
|
-
)
|
|
413
|
-
|
|
414
|
-
# Get all RDS instances
|
|
415
|
-
instances = self._get_all_rds_instances(start_time, end_time)
|
|
416
|
-
|
|
417
|
-
if not instances:
|
|
418
|
-
return StructuredToolResult(
|
|
419
|
-
status=ToolResultStatus.NO_DATA,
|
|
420
|
-
data="No RDS instances found with metrics in the specified time range",
|
|
421
|
-
params=params,
|
|
422
|
-
)
|
|
423
|
-
|
|
424
|
-
# Collect performance data for each instance
|
|
425
|
-
instance_performance = []
|
|
426
|
-
for instance_id in instances[:50]: # Limit to 50 instances to avoid timeout
|
|
427
|
-
perf_data = self._get_instance_performance_summary(
|
|
428
|
-
instance_id, start_time, end_time
|
|
429
|
-
)
|
|
430
|
-
if perf_data:
|
|
431
|
-
instance_performance.append(perf_data)
|
|
432
|
-
|
|
433
|
-
# Sort by the specified metric
|
|
434
|
-
instance_performance = self._sort_instances(instance_performance, sort_by)
|
|
435
|
-
|
|
436
|
-
# Get top N worst performers
|
|
437
|
-
worst_performers = instance_performance[:top_n]
|
|
438
|
-
|
|
439
|
-
# Format the report
|
|
440
|
-
report = self._format_summary_report(worst_performers, sort_by)
|
|
441
|
-
|
|
442
|
-
report += f"\n\nTotal instances analyzed: {len(instance_performance)}"
|
|
443
|
-
report += f"\n\nInstances:\n{json.dumps(worst_performers, indent=2)}"
|
|
444
|
-
|
|
445
|
-
return StructuredToolResult(
|
|
446
|
-
status=ToolResultStatus.SUCCESS,
|
|
447
|
-
data=report,
|
|
448
|
-
params=params,
|
|
449
|
-
)
|
|
450
|
-
|
|
451
|
-
except Exception as e:
|
|
452
|
-
logging.error(f"Error getting top worst performing RDS instances: {str(e)}")
|
|
453
|
-
return StructuredToolResult(
|
|
454
|
-
status=ToolResultStatus.ERROR,
|
|
455
|
-
error=f"Failed to get top worst performing RDS instances: {str(e)}",
|
|
456
|
-
params=params,
|
|
457
|
-
)
|
|
458
|
-
|
|
459
|
-
def _get_all_rds_instances(self, start_time: int, end_time: int) -> List[str]:
|
|
460
|
-
"""Get list of all RDS instances with metrics"""
|
|
461
|
-
if not self.toolset.dd_config:
|
|
462
|
-
raise Exception(TOOLSET_CONFIG_MISSING_ERROR)
|
|
463
|
-
try:
|
|
464
|
-
# Query for any RDS metric grouped by instance
|
|
465
|
-
query = "avg:aws.rds.cpuutilization{*} by {dbinstanceidentifier}"
|
|
466
|
-
|
|
467
|
-
url = f"{self.toolset.dd_config.site_api_url}/api/v1/query"
|
|
468
|
-
headers = get_headers(self.toolset.dd_config)
|
|
469
|
-
payload = {
|
|
470
|
-
"query": query,
|
|
471
|
-
"from": start_time,
|
|
472
|
-
"to": end_time,
|
|
473
|
-
}
|
|
474
|
-
|
|
475
|
-
response = execute_datadog_http_request(
|
|
476
|
-
url=url,
|
|
477
|
-
headers=headers,
|
|
478
|
-
payload_or_params=payload,
|
|
479
|
-
timeout=self.toolset.dd_config.request_timeout,
|
|
480
|
-
method="GET",
|
|
481
|
-
)
|
|
482
|
-
|
|
483
|
-
instances = []
|
|
484
|
-
if response and "series" in response:
|
|
485
|
-
for series in response["series"]:
|
|
486
|
-
# Extract instance ID from tags
|
|
487
|
-
scope = series.get("scope", "")
|
|
488
|
-
if "dbinstanceidentifier:" in scope:
|
|
489
|
-
instance_id = scope.split("dbinstanceidentifier:")[1].split(
|
|
490
|
-
","
|
|
491
|
-
)[0]
|
|
492
|
-
instances.append(instance_id)
|
|
493
|
-
|
|
494
|
-
return list(set(instances)) # Remove duplicates
|
|
495
|
-
|
|
496
|
-
except Exception as e:
|
|
497
|
-
logging.error(f"Error getting RDS instances: {str(e)}")
|
|
498
|
-
return []
|
|
499
|
-
|
|
500
|
-
def _get_instance_performance_summary(
|
|
501
|
-
self, instance_id: str, start_time: int, end_time: int
|
|
502
|
-
) -> Optional[Dict]:
|
|
503
|
-
"""Get performance summary for a single instance"""
|
|
504
|
-
|
|
505
|
-
if not self.toolset.dd_config:
|
|
506
|
-
raise Exception(TOOLSET_CONFIG_MISSING_ERROR)
|
|
507
|
-
|
|
508
|
-
summary: dict[str, Any] = {
|
|
509
|
-
"instance_id": instance_id,
|
|
510
|
-
"metrics": {},
|
|
511
|
-
"score": 0, # Composite score for sorting
|
|
512
|
-
"issues": [],
|
|
513
|
-
}
|
|
514
|
-
|
|
515
|
-
# Key metrics to collect
|
|
516
|
-
metrics_to_collect = [
|
|
517
|
-
("aws.rds.read_latency", "read_latency", 1.0), # weight for composite score
|
|
518
|
-
("aws.rds.write_latency", "write_latency", 1.0),
|
|
519
|
-
("aws.rds.cpuutilization", "cpu_utilization", 0.5),
|
|
520
|
-
("aws.rds.database_connections", "connections", 0.2),
|
|
521
|
-
("aws.rds.burst_balance", "burst_balance", 0.8),
|
|
522
|
-
]
|
|
523
|
-
|
|
524
|
-
for metric_name, key, weight in metrics_to_collect:
|
|
525
|
-
query = f"avg:{metric_name}{{dbinstanceidentifier:{instance_id}}}"
|
|
526
|
-
|
|
527
|
-
try:
|
|
528
|
-
url = f"{self.toolset.dd_config.site_api_url}/api/v1/query"
|
|
529
|
-
headers = get_headers(self.toolset.dd_config)
|
|
530
|
-
payload = {
|
|
531
|
-
"query": query,
|
|
532
|
-
"from": start_time,
|
|
533
|
-
"to": end_time,
|
|
534
|
-
}
|
|
535
|
-
|
|
536
|
-
response = execute_datadog_http_request(
|
|
537
|
-
url=url,
|
|
538
|
-
headers=headers,
|
|
539
|
-
payload_or_params=payload,
|
|
540
|
-
timeout=self.toolset.dd_config.request_timeout,
|
|
541
|
-
method="GET",
|
|
542
|
-
)
|
|
543
|
-
|
|
544
|
-
if response and "series" in response and response["series"]:
|
|
545
|
-
series = response["series"][0]
|
|
546
|
-
points = series.get("pointlist", [])
|
|
547
|
-
|
|
548
|
-
if points:
|
|
549
|
-
values = [p[1] for p in points if p[1] is not None]
|
|
550
|
-
if values:
|
|
551
|
-
avg_value = sum(values) / len(values)
|
|
552
|
-
max_value = max(values)
|
|
553
|
-
|
|
554
|
-
summary["metrics"][key] = {
|
|
555
|
-
"avg": round(avg_value, 2),
|
|
556
|
-
"max": round(max_value, 2),
|
|
557
|
-
}
|
|
558
|
-
|
|
559
|
-
# Calculate contribution to composite score
|
|
560
|
-
if key in ["read_latency", "write_latency"]:
|
|
561
|
-
# Higher latency = worse performance
|
|
562
|
-
score_contrib = avg_value * weight
|
|
563
|
-
if avg_value > 10:
|
|
564
|
-
summary["issues"].append(
|
|
565
|
-
f"High {key.replace('_', ' ')}: {avg_value:.1f}ms"
|
|
566
|
-
)
|
|
567
|
-
elif key == "cpu_utilization":
|
|
568
|
-
# Higher CPU = worse performance
|
|
569
|
-
score_contrib = avg_value * weight
|
|
570
|
-
if avg_value > 70:
|
|
571
|
-
summary["issues"].append(
|
|
572
|
-
f"High CPU: {avg_value:.1f}%"
|
|
573
|
-
)
|
|
574
|
-
elif key == "burst_balance":
|
|
575
|
-
# Lower burst balance = worse performance
|
|
576
|
-
score_contrib = (100 - avg_value) * weight
|
|
577
|
-
if avg_value < 30:
|
|
578
|
-
summary["issues"].append(
|
|
579
|
-
f"Low burst balance: {avg_value:.1f}%"
|
|
580
|
-
)
|
|
581
|
-
else:
|
|
582
|
-
score_contrib = 0
|
|
583
|
-
|
|
584
|
-
summary["score"] += score_contrib
|
|
585
|
-
|
|
586
|
-
except Exception:
|
|
587
|
-
continue
|
|
588
|
-
|
|
589
|
-
return summary if summary["metrics"] else None
|
|
590
|
-
|
|
591
|
-
def _sort_instances(self, instances: List[Dict], sort_by: str) -> List[Dict]:
|
|
592
|
-
"""Sort instances by specified metric"""
|
|
593
|
-
if sort_by == "latency":
|
|
594
|
-
# Sort by average of read and write latency
|
|
595
|
-
def latency_key(inst):
|
|
596
|
-
read_lat = inst["metrics"].get("read_latency", {}).get("avg", 0)
|
|
597
|
-
write_lat = inst["metrics"].get("write_latency", {}).get("avg", 0)
|
|
598
|
-
return (read_lat + write_lat) / 2
|
|
599
|
-
|
|
600
|
-
return sorted(instances, key=latency_key, reverse=True)
|
|
601
|
-
|
|
602
|
-
elif sort_by == "cpu":
|
|
603
|
-
return sorted(
|
|
604
|
-
instances,
|
|
605
|
-
key=lambda x: x["metrics"].get("cpu_utilization", {}).get("avg", 0),
|
|
606
|
-
reverse=True,
|
|
607
|
-
)
|
|
608
|
-
|
|
609
|
-
elif sort_by == "composite":
|
|
610
|
-
return sorted(instances, key=lambda x: x["score"], reverse=True)
|
|
611
|
-
|
|
612
|
-
else: # Default to latency
|
|
613
|
-
return self._sort_instances(instances, "latency")
|
|
614
|
-
|
|
615
|
-
def _format_summary_report(self, instances: List[Dict], sort_by: str) -> str:
|
|
616
|
-
"""Format the summary report"""
|
|
617
|
-
lines = []
|
|
618
|
-
lines.append("Top Worst Performing RDS Instances")
|
|
619
|
-
lines.append("=" * 70)
|
|
620
|
-
lines.append(f"Sorted by: {sort_by}")
|
|
621
|
-
lines.append(f"Instances shown: {len(instances)}")
|
|
622
|
-
lines.append("")
|
|
623
|
-
|
|
624
|
-
for rank, inst in enumerate(instances, 1):
|
|
625
|
-
lines.append(f"{rank}. {inst['instance_id']}")
|
|
626
|
-
lines.append("-" * 40)
|
|
627
|
-
|
|
628
|
-
# Show key metrics
|
|
629
|
-
metrics = inst["metrics"]
|
|
630
|
-
if "read_latency" in metrics:
|
|
631
|
-
lines.append(
|
|
632
|
-
f" Read Latency: {metrics['read_latency']['avg']:.1f}ms avg, {metrics['read_latency']['max']:.1f}ms max"
|
|
633
|
-
)
|
|
634
|
-
if "write_latency" in metrics:
|
|
635
|
-
lines.append(
|
|
636
|
-
f" Write Latency: {metrics['write_latency']['avg']:.1f}ms avg, {metrics['write_latency']['max']:.1f}ms max"
|
|
637
|
-
)
|
|
638
|
-
if "cpu_utilization" in metrics:
|
|
639
|
-
lines.append(
|
|
640
|
-
f" CPU Usage: {metrics['cpu_utilization']['avg']:.1f}% avg, {metrics['cpu_utilization']['max']:.1f}% max"
|
|
641
|
-
)
|
|
642
|
-
if "burst_balance" in metrics:
|
|
643
|
-
lines.append(
|
|
644
|
-
f" Burst Balance: {metrics['burst_balance']['avg']:.1f}% avg"
|
|
645
|
-
)
|
|
646
|
-
|
|
647
|
-
# Show issues
|
|
648
|
-
if inst["issues"]:
|
|
649
|
-
lines.append(" Issues:")
|
|
650
|
-
for issue in inst["issues"]:
|
|
651
|
-
lines.append(f" • {issue}")
|
|
652
|
-
|
|
653
|
-
lines.append("")
|
|
654
|
-
|
|
655
|
-
return "\n".join(lines)
|
|
656
|
-
|
|
657
|
-
def get_parameterized_one_liner(self, params: Dict[str, Any]) -> str:
    """Return a one-line, human-readable description of this tool invocation."""
    count = params.get("top_n", DEFAULT_TOP_INSTANCES)
    ordering = params.get("sort_by", "latency")
    return f"Getting top {count} worst performing RDS instances sorted by {ordering}"
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
class DatadogRDSToolset(Toolset):
    """Toolset exposing Datadog-backed RDS performance-analysis tools."""

    # Populated from the user-supplied config during the prerequisites check.
    dd_config: Optional[DatadogRDSConfig] = None

    def __init__(self):
        super().__init__(
            name="datadog/rds",
            description="Analyze RDS database performance and identify worst performers using Datadog metrics",
            tags=[ToolsetTag.CORE],
            tools=[
                GenerateRDSPerformanceReport(toolset=self),
                GetTopWorstPerformingRDSInstances(toolset=self),
            ],
        )

    def prerequisites_check(self, config: Dict[str, Any]) -> CallablePrerequisite:
        """Build a deferred connectivity check against the Datadog API."""

        def check_datadog_connectivity(config_dict: Dict[str, Any]) -> Tuple[bool, str]:
            """Validate the config, then probe the validate and metrics endpoints."""
            try:
                # Parse/validate the raw config dict into a typed config object.
                self.dd_config = DatadogRDSConfig(**config_dict)

                auth_headers = get_headers(self.dd_config)
                validation = execute_datadog_http_request(
                    url=f"{self.dd_config.site_api_url}/api/v1/validate",
                    headers=auth_headers,
                    payload_or_params={},
                    timeout=self.dd_config.request_timeout,
                    method="GET",
                )

                if not (validation and validation.get("valid", False)):
                    return False, "Invalid Datadog API credentials"

                # Credentials are valid; also confirm the keys can reach the
                # metrics API, which the tools depend on.
                execute_datadog_http_request(
                    url=f"{self.dd_config.site_api_url}/api/v1/metrics",
                    headers=auth_headers,
                    payload_or_params={"from": 0},
                    timeout=self.dd_config.request_timeout,
                    method="GET",
                )
                return True, ""

            except DataDogRequestError as e:
                # 403 specifically indicates bad keys / missing permissions.
                if e.status_code == 403:
                    return False, "Invalid Datadog API keys or insufficient permissions"
                return False, f"Datadog API error: {str(e)}"
            except Exception as e:
                return False, f"Failed to initialize Datadog RDS toolset: {str(e)}"

        return CallablePrerequisite(callable=check_datadog_connectivity)

    def post_init(self, config: dict):
        """Load LLM instructions after initialization."""
        self._reload_instructions()

    def _reload_instructions(self):
        """Point the LLM instructions at the bundled RDS analysis template."""
        template_dir = os.path.dirname(__file__)
        template_file_path = os.path.abspath(
            os.path.join(template_dir, "datadog_rds_instructions.jinja2")
        )
        self._load_llm_instructions(jinja_template=f"file://{template_file_path}")

    def get_example_config(self) -> Dict[str, Any]:
        """Get example configuration for this toolset."""
        return {
            "dd_api_key": "your-datadog-api-key",
            "dd_app_key": "your-datadog-application-key",
            "site_api_url": "https://api.datadoghq.com",
            "default_time_span_seconds": 3600,
            "default_top_instances": 10,
        }
|