holmesgpt 0.11.5__py3-none-any.whl → 0.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of holmesgpt might be problematic.

Files changed (40)
  1. holmes/__init__.py +1 -1
  2. holmes/common/env_vars.py +8 -4
  3. holmes/config.py +52 -13
  4. holmes/core/investigation_structured_output.py +7 -0
  5. holmes/core/llm.py +14 -4
  6. holmes/core/models.py +24 -0
  7. holmes/core/tool_calling_llm.py +48 -6
  8. holmes/core/tools.py +7 -4
  9. holmes/core/toolset_manager.py +24 -5
  10. holmes/core/tracing.py +224 -0
  11. holmes/interactive.py +761 -44
  12. holmes/main.py +59 -127
  13. holmes/plugins/prompts/_fetch_logs.jinja2 +4 -0
  14. holmes/plugins/prompts/kubernetes_workload_ask.jinja2 +2 -10
  15. holmes/plugins/toolsets/__init__.py +10 -2
  16. holmes/plugins/toolsets/azure_sql/apis/azure_sql_api.py +2 -1
  17. holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +3 -0
  18. holmes/plugins/toolsets/datadog/datadog_api.py +161 -0
  19. holmes/plugins/toolsets/datadog/datadog_metrics_instructions.jinja2 +26 -0
  20. holmes/plugins/toolsets/datadog/datadog_traces_formatter.py +310 -0
  21. holmes/plugins/toolsets/datadog/instructions_datadog_traces.jinja2 +51 -0
  22. holmes/plugins/toolsets/datadog/toolset_datadog_logs.py +267 -0
  23. holmes/plugins/toolsets/datadog/toolset_datadog_metrics.py +488 -0
  24. holmes/plugins/toolsets/datadog/toolset_datadog_traces.py +689 -0
  25. holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +3 -0
  26. holmes/plugins/toolsets/internet/internet.py +1 -1
  27. holmes/plugins/toolsets/logging_utils/logging_api.py +9 -3
  28. holmes/plugins/toolsets/opensearch/opensearch_logs.py +3 -0
  29. holmes/plugins/toolsets/utils.py +6 -2
  30. holmes/utils/cache.py +4 -4
  31. holmes/utils/console/consts.py +2 -0
  32. holmes/utils/console/logging.py +95 -0
  33. holmes/utils/console/result.py +37 -0
  34. {holmesgpt-0.11.5.dist-info → holmesgpt-0.12.0.dist-info}/METADATA +3 -4
  35. {holmesgpt-0.11.5.dist-info → holmesgpt-0.12.0.dist-info}/RECORD +38 -29
  36. {holmesgpt-0.11.5.dist-info → holmesgpt-0.12.0.dist-info}/WHEEL +1 -1
  37. holmes/__init__.py.bak +0 -76
  38. holmes/plugins/toolsets/datadog.py +0 -153
  39. {holmesgpt-0.11.5.dist-info → holmesgpt-0.12.0.dist-info}/LICENSE.txt +0 -0
  40. {holmesgpt-0.11.5.dist-info → holmesgpt-0.12.0.dist-info}/entry_points.txt +0 -0
holmes/plugins/toolsets/datadog/datadog_traces_formatter.py
@@ -0,0 +1,310 @@
+ """Formatting utilities for Datadog traces output."""
+
+ from collections import defaultdict
+ from datetime import datetime
+ from typing import Any, Dict, List, Tuple
+
+ from holmes.plugins.toolsets.utils import unix_nano_to_rfc3339
+
+
+ def parse_datadog_span_timestamp(attrs: Dict[str, Any]) -> Tuple[int, int]:
+     """
+     Parse timestamp and duration from Datadog span attributes.
+
+     Returns:
+         Tuple of (start_ns, duration_ns)
+     """
+     custom = attrs.get("custom", {})
+
+     # Get timestamp and convert to nanoseconds
+     start_timestamp = attrs.get("start_timestamp", "")
+     # Check for duration in both custom and direct attributes
+     duration_ns = custom.get("duration", 0) or attrs.get("duration", 0)
+
+     # Check for start time in nanoseconds directly first
+     start_ns = attrs.get("start", 0)
+
+     # If not found, try to parse from timestamp string
+     if not start_ns and start_timestamp:
+         try:
+             dt = datetime.fromisoformat(start_timestamp.replace("Z", "+00:00"))
+             start_ns = int(dt.timestamp() * 1_000_000_000)
+         except (ValueError, TypeError):
+             start_ns = 0
+
+     return start_ns, duration_ns
+
+
+ def format_traces_list(spans: List[Dict[str, Any]], limit: int = 50) -> str:
+     """
+     Format a list of spans grouped by trace ID into a readable output.
+     """
+     if not spans:
+         return ""
+
+     # Group spans by trace_id
+     traces = defaultdict(list)
+     for span in spans:
+         trace_id = span.get("attributes", {}).get("trace_id")
+         if trace_id:
+             traces[trace_id].append(span)
+
+     # Format output
+     output_lines = []
+     output_lines.append(f"Found {len(traces)} traces with matching spans")
+     output_lines.append("")
+
+     for trace_id, trace_spans in list(traces.items())[:limit]:
+         # Find root span and calculate trace duration
+         root_span = None
+         min_start = float("inf")
+         max_end = 0
+
+         for span in trace_spans:
+             attrs = span.get("attributes", {})
+             start_ns, duration_ns = parse_datadog_span_timestamp(attrs)
+             end_ns = start_ns + duration_ns
+
+             if start_ns > 0 and start_ns < min_start:
+                 min_start = start_ns
+
+             if end_ns > max_end:
+                 max_end = end_ns
+
+             # Check if this is a root span (no parent_id)
+             if not attrs.get("parent_id"):
+                 root_span = span
+
+         # If no root span found, use the first span
+         if not root_span and trace_spans:
+             root_span = trace_spans[0]
+
+         # Calculate duration, handling edge cases
+         if min_start == float("inf") or max_end == 0:
+             trace_duration_ms = 0.0
+         else:
+             trace_duration_ms = (max_end - min_start) / 1_000_000
+
+         if root_span:
+             attrs = root_span.get("attributes", {})
+             service_name = attrs.get("service", "unknown")
+             operation_name = attrs.get("operation_name", "unknown")
+             start_time_str = (
+                 unix_nano_to_rfc3339(min_start)
+                 if min_start != float("inf")
+                 else "unknown"
+             )
+
+             output_lines.append(
+                 f"Trace (traceID={trace_id}) (durationMs={trace_duration_ms:.2f})"
+             )
+             output_lines.append(
+                 f"\tstartTime={start_time_str} rootServiceName={service_name} rootTraceName={operation_name}"
+             )
+
+     return "\n".join(output_lines)
+
+
+ def build_span_hierarchy(
+     spans: List[Dict[str, Any]],
+ ) -> Tuple[Dict[str, Dict], List[Dict]]:
+     """
+     Build a hierarchy of spans from a flat list.
+
+     Returns:
+         Tuple of (span_map, root_spans)
+     """
+     span_map = {}
+     root_spans = []
+
+     # First pass: create span objects
+     for span_data in spans:
+         attrs = span_data.get("attributes", {})
+         span_id = attrs.get("span_id", "")
+         parent_id = attrs.get("parent_id", "")
+
+         start_ns, duration_ns = parse_datadog_span_timestamp(attrs)
+
+         span_obj = {
+             "span_id": span_id,
+             "parent_id": parent_id,
+             "name": attrs.get("operation_name", "unknown"),
+             "service": attrs.get("service", "unknown"),
+             "resource": attrs.get("resource_name", ""),
+             "start_ns": start_ns,
+             "duration_ns": duration_ns,
+             "status": attrs.get("status", ""),
+             "tags": attrs.get("tags", []),
+             "children": [],
+             "attributes": attrs,
+         }
+
+         span_map[span_id] = span_obj
+
+         if not parent_id:
+             root_spans.append(span_obj)
+
+     # Second pass: build hierarchy
+     for span_obj in span_map.values():
+         parent_id = span_obj["parent_id"]
+         if parent_id and parent_id in span_map:
+             span_map[parent_id]["children"].append(span_obj)
+         elif parent_id and parent_id not in span_map:
+             # This is an orphaned span (parent not in trace)
+             root_spans.append(span_obj)
+
+     return span_map, root_spans
+
+
+ def format_trace_hierarchy(trace_id: str, spans: List[Dict[str, Any]]) -> str:
+     """
+     Format a trace with its full span hierarchy.
+     """
+     if not spans:
+         return ""
+
+     span_map, root_spans = build_span_hierarchy(spans)
+
+     # Format output
+     output_lines = []
+     output_lines.append(f"Trace ID: {trace_id}")
+     output_lines.append("")
+
+     def format_span_tree(span: Dict[str, Any], level: int = 0) -> None:
+         indent = " " * level
+         duration_ms = span["duration_ns"] / 1_000_000
+
+         output_lines.append(
+             f"{indent}├─ {span['name']} ({span['service']}) - {duration_ms:.2f}ms (span_id={span['span_id']})"
+         )
+
+         start_time_str = unix_nano_to_rfc3339(span["start_ns"])
+         end_time_ns = span["start_ns"] + span["duration_ns"]
+         end_time_str = unix_nano_to_rfc3339(end_time_ns)
+
+         output_lines.append(
+             f"{indent}│ Datetime: start={start_time_str} end={end_time_str}"
+         )
+
+         if span["resource"]:
+             output_lines.append(f"{indent}│ Resource: {span['resource']}")
+
+         if span["status"]:
+             output_lines.append(f"{indent}│ Status: {span['status']}")
+
+         # Show important tags
+         important_tags = [
+             "env",
+             "version",
+             "http.method",
+             "http.status_code",
+             "error.type",
+             "error.message",
+         ]
+         tags_to_show = {}
+
+         for tag in span["tags"]:
+             if isinstance(tag, str) and ":" in tag:
+                 key, value = tag.split(":", 1)
+                 if key in important_tags:
+                     tags_to_show[key] = value
+
+         if tags_to_show:
+             output_lines.append(f"{indent}│ Tags:")
+             for key, value in tags_to_show.items():
+                 output_lines.append(f"{indent}│ {key}: {value}")
+
+         # Sort children by start time
+         sorted_children = sorted(span["children"], key=lambda s: s["start_ns"])
+         for child in sorted_children:
+             format_span_tree(child, level + 1)
+
+     # Format all root spans
+     for root_span in sorted(root_spans, key=lambda s: s["start_ns"]):
+         format_span_tree(root_span)
+
+     return "\n".join(output_lines)
+
+
+ def format_spans_search(
+     spans: List[Dict[str, Any]], max_traces: int = 50, max_spans_per_trace: int = 10
+ ) -> str:
+     """
+     Format spans search results grouped by trace.
+     """
+     if not spans:
+         return ""
+
+     # Format output
+     output_lines = []
+     output_lines.append(f"Found {len(spans)} matching spans")
+     output_lines.append("")
+
+     # Group spans by trace for better readability
+     spans_by_trace = defaultdict(list)
+     for span in spans:
+         trace_id = span.get("attributes", {}).get("trace_id", "unknown")
+         spans_by_trace[trace_id].append(span)
+
+     output_lines.append(f"Spans grouped by {len(spans_by_trace)} traces:")
+     output_lines.append("")
+
+     for trace_id, trace_spans in list(spans_by_trace.items())[:max_traces]:
+         output_lines.append(f"Trace ID: {trace_id}")
+
+         # Sort spans by timestamp within each trace
+         sorted_spans = sorted(
+             trace_spans,
+             key=lambda s: parse_datadog_span_timestamp(s.get("attributes", {}))[0],
+         )
+
+         for span in sorted_spans[:max_spans_per_trace]:
+             attrs = span.get("attributes", {})
+
+             span_id = attrs.get("span_id", "unknown")
+             service = attrs.get("service", "unknown")
+             operation = attrs.get("operation_name", "unknown")
+             resource = attrs.get("resource_name", "")
+
+             start_ns, duration_ns = parse_datadog_span_timestamp(attrs)
+             duration_ms = duration_ns / 1_000_000
+             start_time_str = unix_nano_to_rfc3339(start_ns)
+
+             output_lines.append(f" ├─ {operation} ({service}) - {duration_ms:.2f}ms")
+             output_lines.append(f" │ span_id: {span_id}")
+             output_lines.append(f" │ time: {start_time_str}")
+
+             if resource:
+                 output_lines.append(f" │ resource: {resource}")
+
+             # Show status if error
+             status = attrs.get("status", "")
+             if status and status != "ok":
+                 output_lines.append(f" │ status: {status}")
+
+             # Show important tags
+             tags = attrs.get("tags", [])
+             important_tags = {}
+             for tag in tags:
+                 if isinstance(tag, str) and ":" in tag:
+                     key, value = tag.split(":", 1)
+                     if key in ["env", "version", "http.status_code", "error.type"]:
+                         important_tags[key] = value
+
+             if important_tags:
+                 tags_str = ", ".join([f"{k}={v}" for k, v in important_tags.items()])
+                 output_lines.append(f" │ tags: {tags_str}")
+
+             output_lines.append(" │")
+
+         if len(trace_spans) > max_spans_per_trace:
+             output_lines.append(
+                 f" └─ ... and {len(trace_spans) - max_spans_per_trace} more spans in this trace"
+             )
+
+         output_lines.append("")
+
+     if len(spans_by_trace) > max_traces:
+         output_lines.append(f"... and {len(spans_by_trace) - max_traces} more traces")
+
+     return "\n".join(output_lines)
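As a quick, illustrative aside (not part of the diff), the sketch below shows the span shape these formatters expect: the attribute keys (trace_id, span_id, parent_id, service, operation_name, resource_name, start_timestamp, custom.duration, status, tags) mirror the fields read by parse_datadog_span_timestamp and build_span_hierarchy above, while the two-span trace and all concrete values are invented. It assumes holmesgpt 0.12.0 is installed so the import path resolves.

```python
# Hedged example (not from the package): feeds hand-built Datadog-style spans
# through the new formatters. All values are invented for illustration.
from holmes.plugins.toolsets.datadog.datadog_traces_formatter import (
    format_trace_hierarchy,
    format_traces_list,
)

spans = [
    {
        "attributes": {
            "trace_id": "abc123",
            "span_id": "root-1",
            "parent_id": "",  # empty parent_id marks the root span
            "service": "checkout",
            "operation_name": "POST /checkout",
            "resource_name": "/checkout",
            "start_timestamp": "2024-01-01T00:00:00Z",
            "custom": {"duration": 250_000_000},  # 250 ms, in nanoseconds
            "status": "ok",
            "tags": ["env:production", "http.status_code:200"],
        }
    },
    {
        "attributes": {
            "trace_id": "abc123",
            "span_id": "child-1",
            "parent_id": "root-1",
            "service": "postgres",
            "operation_name": "SELECT orders",
            "start_timestamp": "2024-01-01T00:00:00.050Z",
            "custom": {"duration": 120_000_000},  # 120 ms
        }
    },
]

# One summary line per trace, then the indented parent/child tree for that trace.
print(format_traces_list(spans))
print(format_trace_hierarchy("abc123", spans))
```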
holmes/plugins/toolsets/datadog/instructions_datadog_traces.jinja2
@@ -0,0 +1,51 @@
+ ## Datadog Traces Toolset
+
+ Tools to search and analyze distributed traces from Datadog APM.
+
+ ### Available Tools:
+ - **fetch_datadog_traces** - List traces with filters (service, operation, duration)
+ - **fetch_datadog_trace_by_id** - Get detailed span hierarchy for a specific trace
+ - **fetch_datadog_spans** - Search spans with Datadog query syntax
+
+ ### Common Usage:
+
+ ```python
+ # Find slow traces (>5s) for a service
+ fetch_datadog_traces(service="backend-service", min_duration="5s")
+
+ # Get trace details showing full span hierarchy
+ fetch_datadog_trace_by_id(trace_id="6878d11e0000000064837efe7e97f5f8")
+
+ # Search for errors using Datadog query syntax
+ fetch_datadog_spans(query="@http.status_code:500")
+ fetch_datadog_spans(service="api", query="status:error")
+
+ # Time ranges (default: last hour)
+ fetch_datadog_traces(
+     service="api",
+     start_datetime="-3600",  # 1 hour ago
+     end_datetime="0"  # now
+ )
+ ```
+
+ ### Query Examples:
+
+ ```python
+ # Performance issues
+ fetch_datadog_traces(min_duration="2s", operation="GET /api/products")
+
+ # Errors by service
+ fetch_datadog_spans(service="payment", query="@http.status_code:5*")
+
+ # Database queries
+ fetch_datadog_spans(query="service:postgres @duration:>1000000000")
+
+ # With tags
+ fetch_datadog_spans(tags={"env": "production"}, query="error:true")
+ ```
+
+ ### Tips:
+ - Duration units: ms, s, m (e.g., "500ms", "5s", "1m")
+ - Time: RFC3339 format or negative seconds from now
+ - Rate limit: 300 requests/hour
+ - Default time range: 1 hour
holmes/plugins/toolsets/datadog/toolset_datadog_logs.py
@@ -0,0 +1,267 @@
+ from enum import Enum
+ import json
+ import logging
+ from typing import Any, Optional, Dict, Tuple
+ from holmes.core.tools import (
+     CallablePrerequisite,
+     ToolsetTag,
+ )
+ from pydantic import BaseModel, Field
+ from holmes.core.tools import StructuredToolResult, ToolResultStatus
+ from holmes.plugins.toolsets.consts import TOOLSET_CONFIG_MISSING_ERROR
+ from holmes.plugins.toolsets.datadog.datadog_api import (
+     DatadogBaseConfig,
+     DataDogRequestError,
+     execute_datadog_http_request,
+     get_headers,
+     MAX_RETRY_COUNT_ON_RATE_LIMIT,
+ )
+ from holmes.plugins.toolsets.logging_utils.logging_api import (
+     DEFAULT_TIME_SPAN_SECONDS,
+     BasePodLoggingToolset,
+     FetchPodLogsParams,
+     PodLoggingTool,
+ )
+ from holmes.plugins.toolsets.utils import process_timestamps_to_rfc3339
+
+
+ class DataDogLabelsMapping(BaseModel):
+     pod: str = "pod_name"
+     namespace: str = "kube_namespace"
+
+
+ class DataDogStorageTier(str, Enum):
+     INDEXES = "indexes"
+     ONLINE_ARCHIVES = "online-archives"
+     FLEX = "flex"
+
+
+ DEFAULT_STORAGE_TIERS = [DataDogStorageTier.INDEXES]
+
+
+ class DatadogLogsConfig(DatadogBaseConfig):
+     indexes: list[str] = ["*"]
+     # Ordered list of storage tiers. Works as a fallback: subsequent tiers are queried only if the previous tier yielded no result
+     storage_tiers: list[DataDogStorageTier] = Field(
+         default=DEFAULT_STORAGE_TIERS, min_length=1
+     )
+     labels: DataDogLabelsMapping = DataDogLabelsMapping()
+     page_size: int = 300
+     default_limit: int = 1000
+
+
+ def calculate_page_size(
+     params: FetchPodLogsParams, dd_config: DatadogLogsConfig, logs: list
+ ) -> int:
+     logs_count = len(logs)
+
+     max_logs_count = dd_config.default_limit
+     if params.limit:
+         max_logs_count = params.limit
+
+     return min(dd_config.page_size, max(0, max_logs_count - logs_count))
+
+
+ def fetch_paginated_logs(
+     params: FetchPodLogsParams,
+     dd_config: DatadogLogsConfig,
+     storage_tier: DataDogStorageTier,
+ ) -> list[dict]:
+     limit = params.limit or dd_config.default_limit
+
+     (from_time, to_time) = process_timestamps_to_rfc3339(
+         start_timestamp=params.start_time,
+         end_timestamp=params.end_time,
+         default_time_span_seconds=DEFAULT_TIME_SPAN_SECONDS,
+     )
+
+     url = f"{dd_config.site_api_url}/api/v2/logs/events/search"
+     headers = get_headers(dd_config)
+
+     query = f"{dd_config.labels.namespace}:{params.namespace}"
+     query += f" {dd_config.labels.pod}:{params.pod_name}"
+     if params.filter:
+         filter = params.filter.replace('"', '\\"')
+         query += f' "{filter}"'
+
+     payload: Dict[str, Any] = {
+         "filter": {
+             "from": from_time,
+             "to": to_time,
+             "query": query,
+             "indexes": dd_config.indexes,
+             "storage_tier": storage_tier.value,
+         },
+         "sort": "-timestamp",
+         "page": {"limit": calculate_page_size(params, dd_config, [])},
+     }
+
+     logs, cursor = execute_datadog_http_request(
+         url=url,
+         headers=headers,
+         payload_or_params=payload,
+         timeout=dd_config.request_timeout,
+     )
+
+     while cursor and len(logs) < limit:
+         payload["page"]["cursor"] = cursor
+         new_logs, cursor = execute_datadog_http_request(
+             url=url,
+             headers=headers,
+             payload_or_params=payload,
+             timeout=dd_config.request_timeout,
+         )
+         logs += new_logs
+         payload["page"]["limit"] = calculate_page_size(params, dd_config, logs)
+
+     # Logs are fetched in descending order. The unified logging API follows the pattern of kubectl logs where the oldest logs come first
+     logs.reverse()
+
+     if len(logs) > limit:
+         logs = logs[-limit:]
+     return logs
+
+
+ def format_logs(raw_logs: list[dict]) -> str:
+     logs = []
+
+     for raw_log_item in raw_logs:
+         message = raw_log_item.get("attributes", {}).get(
+             "message", json.dumps(raw_log_item)
+         )
+         logs.append(message)
+
+     return "\n".join(logs)
+
+
+ class DatadogLogsToolset(BasePodLoggingToolset):
+     dd_config: Optional[DatadogLogsConfig] = None
+
+     def __init__(self):
+         super().__init__(
+             name="datadog/logs",
+             description="Toolset for interacting with Datadog to fetch logs",
+             docs_url="https://docs.datadoghq.com/api/latest/logs/",
+             icon_url="https://imgix.datadoghq.com//img/about/presskit/DDlogo.jpg",
+             prerequisites=[CallablePrerequisite(callable=self.prerequisites_callable)],
+             tools=[
+                 PodLoggingTool(self),
+             ],
+             experimental=True,
+             tags=[ToolsetTag.CORE],
+         )
+
+     def logger_name(self) -> str:
+         return "DataDog"
+
+     def fetch_pod_logs(self, params: FetchPodLogsParams) -> StructuredToolResult:
+         if not self.dd_config:
+             return StructuredToolResult(
+                 status=ToolResultStatus.ERROR,
+                 data=TOOLSET_CONFIG_MISSING_ERROR,
+                 params=params.model_dump(),
+             )
+
+         try:
+             raw_logs = []
+             for storage_tier in self.dd_config.storage_tiers:
+                 raw_logs = fetch_paginated_logs(
+                     params, self.dd_config, storage_tier=storage_tier
+                 )
+
+                 if raw_logs:
+                     logs_str = format_logs(raw_logs)
+                     return StructuredToolResult(
+                         status=ToolResultStatus.SUCCESS,
+                         data=logs_str,
+                         params=params.model_dump(),
+                     )
+
+             return StructuredToolResult(
+                 status=ToolResultStatus.NO_DATA,
+                 params=params.model_dump(),
+             )
+
+         except DataDogRequestError as e:
+             logging.exception(e, exc_info=True)
+
+             # Provide a more specific error message for rate limiting failures
+             if e.status_code == 429:
+                 error_msg = f"Datadog API rate limit exceeded. Failed after {MAX_RETRY_COUNT_ON_RATE_LIMIT} retry attempts."
+             else:
+                 error_msg = f"Exception while querying Datadog: {str(e)}"
+
+             return StructuredToolResult(
+                 status=ToolResultStatus.ERROR,
+                 error=error_msg,
+                 params=params.model_dump(),
+                 invocation=json.dumps(e.payload),
+             )
+
+         except Exception as e:
+             logging.exception(
+                 f"Failed to query Datadog logs for params: {params}", exc_info=True
+             )
+             return StructuredToolResult(
+                 status=ToolResultStatus.ERROR,
+                 error=f"Exception while querying Datadog: {str(e)}",
+                 params=params.model_dump(),
+             )
+
+     def _perform_healthcheck(self) -> Tuple[bool, str]:
+         """
+         Perform a healthcheck by fetching a single log from Datadog.
+         Returns (success, error_message).
+         """
+         try:
+             logging.info("Performing Datadog configuration healthcheck...")
+             healthcheck_params = FetchPodLogsParams(
+                 namespace="*",
+                 pod_name="*",
+                 limit=1,
+                 start_time="-172800",  # 48 hours in seconds
+             )
+
+             result = self.fetch_pod_logs(healthcheck_params)
+
+             if result.status == ToolResultStatus.ERROR:
+                 error_msg = result.error or "Unknown error during healthcheck"
+                 logging.error(f"Datadog healthcheck failed: {error_msg}")
+                 return False, f"Datadog healthcheck failed: {error_msg}"
+             elif result.status == ToolResultStatus.NO_DATA:
+                 error_msg = "No logs were found in the last 48 hours using wildcards for pod and namespace. Is the configuration correct?"
+                 logging.error(f"Datadog healthcheck failed: {error_msg}")
+                 return False, f"Datadog healthcheck failed: {error_msg}"
+
+             logging.info("Datadog healthcheck completed successfully")
+             return True, ""
+
+         except Exception as e:
+             logging.exception("Failed during Datadog healthcheck")
+             return False, f"Healthcheck failed with exception: {str(e)}"
+
+     def prerequisites_callable(self, config: dict[str, Any]) -> Tuple[bool, str]:
+         if not config:
+             return (
+                 False,
+                 TOOLSET_CONFIG_MISSING_ERROR,
+             )
+
+         try:
+             dd_config = DatadogLogsConfig(**config)
+             self.dd_config = dd_config
+
+             # Perform healthcheck
+             success, error_msg = self._perform_healthcheck()
+             return success, error_msg
+
+         except Exception as e:
+             logging.exception("Failed to set up Datadog toolset")
+             return (False, f"Failed to parse Datadog configuration: {str(e)}")
+
+     def get_example_config(self) -> Dict[str, Any]:
+         return {
+             "dd_api_key": "your-datadog-api-key",
+             "dd_app_key": "your-datadog-application-key",
+             "site_api_url": "https://api.datadoghq.com",
+         }
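To close, a minimal sketch (not from the package) of how the new logs toolset's configuration and pagination helpers fit together. The config keys are taken from DatadogLogsConfig and get_example_config above; the API keys are placeholders, the namespace, pod name, and limit are invented, and no Datadog API call is made.

```python
# Hedged example (not part of the diff): exercises DatadogLogsConfig,
# calculate_page_size, and format_logs locally, without any network access.
from holmes.plugins.toolsets.datadog.toolset_datadog_logs import (
    DatadogLogsConfig,
    calculate_page_size,
    format_logs,
)
from holmes.plugins.toolsets.logging_utils.logging_api import FetchPodLogsParams

config = DatadogLogsConfig(
    dd_api_key="your-datadog-api-key",          # placeholder
    dd_app_key="your-datadog-application-key",  # placeholder
    site_api_url="https://api.datadoghq.com",
    # Tiers are tried in order; a later tier is queried only if the earlier one returned nothing.
    storage_tiers=["indexes", "online-archives"],
)

params = FetchPodLogsParams(
    namespace="default",
    pod_name="checkout-5d9f",  # invented pod name
    limit=500,
    start_time="-3600",        # last hour, as negative seconds from now
)

# Page sizes shrink as logs accumulate: min(page_size, remaining budget).
print(calculate_page_size(params, config, logs=[]))          # -> 300
print(calculate_page_size(params, config, logs=[{}] * 300))  # -> 200

# format_logs keeps only each event's "message" attribute, one per line.
print(format_logs([{"attributes": {"message": "error: connection refused"}}]))
```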