holmesgpt 0.14.1a0__py3-none-any.whl → 0.14.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of holmesgpt might be problematic. Click here for more details.
- holmes/__init__.py +1 -1
- holmes/clients/robusta_client.py +5 -2
- holmes/common/env_vars.py +2 -2
- holmes/config.py +1 -1
- holmes/core/llm.py +44 -6
- holmes/core/tool_calling_llm.py +9 -1
- holmes/core/toolset_manager.py +2 -2
- holmes/plugins/prompts/_fetch_logs.jinja2 +10 -1
- holmes/plugins/toolsets/datadog/datadog_api.py +490 -24
- holmes/plugins/toolsets/datadog/datadog_logs_instructions.jinja2 +21 -10
- holmes/plugins/toolsets/datadog/toolset_datadog_general.py +329 -190
- holmes/plugins/toolsets/datadog/toolset_datadog_logs.py +181 -9
- holmes/plugins/toolsets/datadog/toolset_datadog_metrics.py +75 -10
- holmes/plugins/toolsets/datadog/toolset_datadog_rds.py +2 -2
- holmes/plugins/toolsets/datadog/toolset_datadog_traces.py +3 -3
- holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +2 -1
- holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +3 -3
- holmes/plugins/toolsets/logging_utils/logging_api.py +1 -1
- holmes/plugins/toolsets/prometheus/prometheus.py +704 -349
- holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2 +27 -11
- {holmesgpt-0.14.1a0.dist-info → holmesgpt-0.14.2.dist-info}/METADATA +2 -2
- {holmesgpt-0.14.1a0.dist-info → holmesgpt-0.14.2.dist-info}/RECORD +25 -25
- {holmesgpt-0.14.1a0.dist-info → holmesgpt-0.14.2.dist-info}/LICENSE.txt +0 -0
- {holmesgpt-0.14.1a0.dist-info → holmesgpt-0.14.2.dist-info}/WHEEL +0 -0
- {holmesgpt-0.14.1a0.dist-info → holmesgpt-0.14.2.dist-info}/entry_points.txt +0 -0
|
@@ -3,6 +3,7 @@ from enum import Enum
|
|
|
3
3
|
import json
|
|
4
4
|
import logging
|
|
5
5
|
from typing import Any, Optional, Dict, Tuple, Set
|
|
6
|
+
from urllib.parse import urlencode
|
|
6
7
|
from holmes.core.tools import (
|
|
7
8
|
CallablePrerequisite,
|
|
8
9
|
ToolsetTag,
|
|
@@ -16,6 +17,8 @@ from holmes.plugins.toolsets.datadog.datadog_api import (
|
|
|
16
17
|
execute_paginated_datadog_http_request,
|
|
17
18
|
get_headers,
|
|
18
19
|
MAX_RETRY_COUNT_ON_RATE_LIMIT,
|
|
20
|
+
enhance_error_message,
|
|
21
|
+
preprocess_time_fields,
|
|
19
22
|
)
|
|
20
23
|
from holmes.plugins.toolsets.logging_utils.logging_api import (
|
|
21
24
|
DEFAULT_TIME_SPAN_SECONDS,
|
|
@@ -99,23 +102,28 @@ def fetch_paginated_logs(
|
|
|
99
102
|
"page": {"limit": calculate_page_size(params, dd_config, [])},
|
|
100
103
|
}
|
|
101
104
|
|
|
105
|
+
# Preprocess time fields to ensure correct format
|
|
106
|
+
processed_payload = preprocess_time_fields(payload, "/api/v2/logs/events/search")
|
|
107
|
+
|
|
102
108
|
logs, cursor = execute_paginated_datadog_http_request(
|
|
103
109
|
url=url,
|
|
104
110
|
headers=headers,
|
|
105
|
-
payload_or_params=
|
|
111
|
+
payload_or_params=processed_payload,
|
|
106
112
|
timeout=dd_config.request_timeout,
|
|
107
113
|
)
|
|
108
114
|
|
|
109
115
|
while cursor and len(logs) < limit:
|
|
110
|
-
|
|
116
|
+
processed_payload["page"]["cursor"] = cursor
|
|
117
|
+
processed_payload["page"]["limit"] = calculate_page_size(
|
|
118
|
+
params, dd_config, logs
|
|
119
|
+
)
|
|
111
120
|
new_logs, cursor = execute_paginated_datadog_http_request(
|
|
112
121
|
url=url,
|
|
113
122
|
headers=headers,
|
|
114
|
-
payload_or_params=
|
|
123
|
+
payload_or_params=processed_payload,
|
|
115
124
|
timeout=dd_config.request_timeout,
|
|
116
125
|
)
|
|
117
126
|
logs += new_logs
|
|
118
|
-
payload["page"]["limit"] = calculate_page_size(params, dd_config, logs)
|
|
119
127
|
|
|
120
128
|
# logs are fetched descending order. Unified logging API follows the pattern of kubectl logs where oldest logs are first
|
|
121
129
|
logs.reverse()
|
|
@@ -129,14 +137,73 @@ def format_logs(raw_logs: list[dict]) -> str:
|
|
|
129
137
|
logs = []
|
|
130
138
|
|
|
131
139
|
for raw_log_item in raw_logs:
|
|
140
|
+
# Extract timestamp - Datadog returns it in ISO format
|
|
141
|
+
timestamp = raw_log_item.get("attributes", {}).get("timestamp", "")
|
|
142
|
+
if not timestamp:
|
|
143
|
+
# Fallback to @timestamp if timestamp is not in attributes
|
|
144
|
+
timestamp = raw_log_item.get("attributes", {}).get("@timestamp", "")
|
|
145
|
+
|
|
146
|
+
# Extract message
|
|
132
147
|
message = raw_log_item.get("attributes", {}).get(
|
|
133
148
|
"message", json.dumps(raw_log_item)
|
|
134
149
|
)
|
|
135
|
-
|
|
150
|
+
|
|
151
|
+
# Format as: [timestamp] message
|
|
152
|
+
if timestamp:
|
|
153
|
+
logs.append(f"[{timestamp}] {message}")
|
|
154
|
+
else:
|
|
155
|
+
logs.append(message)
|
|
136
156
|
|
|
137
157
|
return "\n".join(logs)
|
|
138
158
|
|
|
139
159
|
|
|
160
|
+
def generate_datadog_logs_url(
|
|
161
|
+
dd_config: DatadogLogsConfig,
|
|
162
|
+
params: FetchPodLogsParams,
|
|
163
|
+
storage_tier: DataDogStorageTier,
|
|
164
|
+
) -> str:
|
|
165
|
+
"""Generate a Datadog web UI URL for the logs query."""
|
|
166
|
+
from holmes.plugins.toolsets.utils import process_timestamps_to_int
|
|
167
|
+
from holmes.plugins.toolsets.datadog.datadog_api import convert_api_url_to_app_url
|
|
168
|
+
|
|
169
|
+
# Convert API URL to app URL using the shared helper
|
|
170
|
+
base_url = convert_api_url_to_app_url(dd_config.site_api_url)
|
|
171
|
+
|
|
172
|
+
# Build the query string
|
|
173
|
+
query = f"{dd_config.labels.namespace}:{params.namespace}"
|
|
174
|
+
query += f" {dd_config.labels.pod}:{params.pod_name}"
|
|
175
|
+
if params.filter:
|
|
176
|
+
filter = params.filter.replace('"', '\\"')
|
|
177
|
+
query += f' "{filter}"'
|
|
178
|
+
|
|
179
|
+
# Process timestamps - get Unix timestamps in seconds
|
|
180
|
+
(from_time_seconds, to_time_seconds) = process_timestamps_to_int(
|
|
181
|
+
start=params.start_time,
|
|
182
|
+
end=params.end_time,
|
|
183
|
+
default_time_span_seconds=DEFAULT_TIME_SPAN_SECONDS,
|
|
184
|
+
)
|
|
185
|
+
|
|
186
|
+
# Convert to milliseconds for Datadog web UI
|
|
187
|
+
from_time_ms = from_time_seconds * 1000
|
|
188
|
+
to_time_ms = to_time_seconds * 1000
|
|
189
|
+
|
|
190
|
+
# Build URL parameters matching Datadog's web UI format
|
|
191
|
+
url_params = {
|
|
192
|
+
"query": query,
|
|
193
|
+
"from_ts": str(from_time_ms),
|
|
194
|
+
"to_ts": str(to_time_ms),
|
|
195
|
+
"live": "true",
|
|
196
|
+
"storage": storage_tier.value,
|
|
197
|
+
}
|
|
198
|
+
|
|
199
|
+
# Add indexes if not default
|
|
200
|
+
if dd_config.indexes != ["*"]:
|
|
201
|
+
url_params["index"] = ",".join(dd_config.indexes)
|
|
202
|
+
|
|
203
|
+
# Construct the full URL
|
|
204
|
+
return f"{base_url}/logs?{urlencode(url_params)}"
|
|
205
|
+
|
|
206
|
+
|
|
140
207
|
class DatadogLogsToolset(BasePodLoggingToolset):
|
|
141
208
|
dd_config: Optional[DatadogLogsConfig] = None
|
|
142
209
|
|
|
@@ -181,29 +248,134 @@ class DatadogLogsToolset(BasePodLoggingToolset):
|
|
|
181
248
|
|
|
182
249
|
if raw_logs:
|
|
183
250
|
logs_str = format_logs(raw_logs)
|
|
251
|
+
# Generate Datadog web UI URL
|
|
252
|
+
datadog_url = generate_datadog_logs_url(
|
|
253
|
+
self.dd_config, params, storage_tier
|
|
254
|
+
)
|
|
255
|
+
logs_with_link = f"{logs_str}\n\nView in Datadog: {datadog_url}"
|
|
184
256
|
return StructuredToolResult(
|
|
185
257
|
status=StructuredToolResultStatus.SUCCESS,
|
|
186
|
-
data=
|
|
258
|
+
data=logs_with_link,
|
|
259
|
+
url=datadog_url,
|
|
187
260
|
params=params.model_dump(),
|
|
188
261
|
)
|
|
189
262
|
|
|
263
|
+
# Include detailed diagnostic context
|
|
264
|
+
query = f"{self.dd_config.labels.namespace}:{params.namespace} {self.dd_config.labels.pod}:{params.pod_name}"
|
|
265
|
+
if params.filter:
|
|
266
|
+
query += f' "{params.filter}"'
|
|
267
|
+
|
|
268
|
+
# Get actual time range used
|
|
269
|
+
(from_time, to_time) = process_timestamps_to_rfc3339(
|
|
270
|
+
start_timestamp=params.start_time,
|
|
271
|
+
end_timestamp=params.end_time,
|
|
272
|
+
default_time_span_seconds=DEFAULT_TIME_SPAN_SECONDS,
|
|
273
|
+
)
|
|
274
|
+
|
|
275
|
+
# Generate Datadog web UI URL for the last storage tier checked
|
|
276
|
+
datadog_url = generate_datadog_logs_url(
|
|
277
|
+
self.dd_config, params, self.dd_config.storage_tiers[-1]
|
|
278
|
+
)
|
|
279
|
+
|
|
280
|
+
# Build diagnostic information
|
|
281
|
+
diagnostics: Dict[str, Any] = {
|
|
282
|
+
"query_executed": query,
|
|
283
|
+
"time_range": f"{from_time} to {to_time}",
|
|
284
|
+
"indexes_searched": self.dd_config.indexes,
|
|
285
|
+
"storage_tiers_checked": [
|
|
286
|
+
tier.value for tier in self.dd_config.storage_tiers
|
|
287
|
+
],
|
|
288
|
+
"field_mappings": {
|
|
289
|
+
"namespace_field": self.dd_config.labels.namespace,
|
|
290
|
+
"pod_field": self.dd_config.labels.pod,
|
|
291
|
+
},
|
|
292
|
+
"limit": params.limit or self.dd_config.default_limit,
|
|
293
|
+
"datadog_url": datadog_url,
|
|
294
|
+
}
|
|
295
|
+
|
|
296
|
+
# Format diagnostic info as structured text
|
|
297
|
+
error_msg = (
|
|
298
|
+
f"No logs found.\n\n"
|
|
299
|
+
f"Diagnostic Information:\n"
|
|
300
|
+
f"----------------------\n"
|
|
301
|
+
f"Query executed: {diagnostics['query_executed']}\n"
|
|
302
|
+
f"Time range: {diagnostics['time_range']}\n"
|
|
303
|
+
f"Indexes searched: {diagnostics['indexes_searched']}\n"
|
|
304
|
+
f"Storage tiers checked: {', '.join(str(tier) for tier in diagnostics.get('storage_tiers_checked', []))}\n"
|
|
305
|
+
f"Field mappings:\n"
|
|
306
|
+
f" - Namespace field: {diagnostics.get('field_mappings', {}).get('namespace_field', 'N/A')}\n"
|
|
307
|
+
f" - Pod field: {diagnostics.get('field_mappings', {}).get('pod_field', 'N/A')}\n"
|
|
308
|
+
f"Limit: {diagnostics['limit']}\n\n"
|
|
309
|
+
f"View in Datadog: {diagnostics['datadog_url']}"
|
|
310
|
+
)
|
|
311
|
+
|
|
190
312
|
return StructuredToolResult(
|
|
191
313
|
status=StructuredToolResultStatus.NO_DATA,
|
|
314
|
+
error=error_msg,
|
|
315
|
+
url=datadog_url,
|
|
192
316
|
params=params.model_dump(),
|
|
193
317
|
)
|
|
194
318
|
|
|
195
319
|
except DataDogRequestError as e:
|
|
196
320
|
logging.exception(e, exc_info=True)
|
|
197
321
|
|
|
322
|
+
# Always try to generate Datadog URL for debugging
|
|
323
|
+
try:
|
|
324
|
+
datadog_url = generate_datadog_logs_url(
|
|
325
|
+
self.dd_config, params, self.dd_config.storage_tiers[0]
|
|
326
|
+
)
|
|
327
|
+
except Exception:
|
|
328
|
+
datadog_url = None
|
|
329
|
+
|
|
198
330
|
# Provide more specific error message for rate limiting failures
|
|
199
331
|
if e.status_code == 429:
|
|
200
332
|
error_msg = f"Datadog API rate limit exceeded. Failed after {MAX_RETRY_COUNT_ON_RATE_LIMIT} retry attempts."
|
|
333
|
+
if datadog_url:
|
|
334
|
+
error_msg += f"\nView in Datadog: {datadog_url}"
|
|
335
|
+
elif e.status_code == 400:
|
|
336
|
+
# Use enhanced error message for validation errors
|
|
337
|
+
error_msg = enhance_error_message(
|
|
338
|
+
e,
|
|
339
|
+
"/api/v2/logs/events/search",
|
|
340
|
+
"POST",
|
|
341
|
+
str(self.dd_config.site_api_url),
|
|
342
|
+
)
|
|
343
|
+
|
|
344
|
+
# Add query context
|
|
345
|
+
query = f"{self.dd_config.labels.namespace}:{params.namespace} {self.dd_config.labels.pod}:{params.pod_name}"
|
|
346
|
+
if params.filter:
|
|
347
|
+
query += f' "{params.filter}"'
|
|
348
|
+
error_msg += f"\n\nQuery attempted: {query}"
|
|
349
|
+
|
|
350
|
+
# Add Datadog web UI URL to error message
|
|
351
|
+
if datadog_url:
|
|
352
|
+
error_msg += f"\nView in Datadog: {datadog_url}"
|
|
201
353
|
else:
|
|
202
|
-
|
|
354
|
+
# Include full API error details and query context
|
|
355
|
+
error_msg = (
|
|
356
|
+
f"Datadog API error (status {e.status_code}): {e.response_text}"
|
|
357
|
+
)
|
|
358
|
+
query = f"{self.dd_config.labels.namespace}:{params.namespace} {self.dd_config.labels.pod}:{params.pod_name}"
|
|
359
|
+
if params.filter:
|
|
360
|
+
query += f' "{params.filter}"'
|
|
361
|
+
error_msg += f"\nQuery: {query}"
|
|
362
|
+
|
|
363
|
+
# Get actual time range used
|
|
364
|
+
(from_time, to_time) = process_timestamps_to_rfc3339(
|
|
365
|
+
start_timestamp=params.start_time,
|
|
366
|
+
end_timestamp=params.end_time,
|
|
367
|
+
default_time_span_seconds=DEFAULT_TIME_SPAN_SECONDS,
|
|
368
|
+
)
|
|
369
|
+
error_msg += f"\nTime range: {from_time} to {to_time}"
|
|
370
|
+
|
|
371
|
+
# Add Datadog web UI URL to error message
|
|
372
|
+
if datadog_url:
|
|
373
|
+
error_msg += f"\nView in Datadog: {datadog_url}"
|
|
203
374
|
|
|
204
375
|
return StructuredToolResult(
|
|
205
376
|
status=StructuredToolResultStatus.ERROR,
|
|
206
377
|
error=error_msg,
|
|
378
|
+
url=datadog_url,
|
|
207
379
|
params=params.model_dump(),
|
|
208
380
|
invocation=json.dumps(e.payload),
|
|
209
381
|
)
|
|
@@ -224,7 +396,7 @@ class DatadogLogsToolset(BasePodLoggingToolset):
|
|
|
224
396
|
Returns (success, error_message).
|
|
225
397
|
"""
|
|
226
398
|
try:
|
|
227
|
-
logging.
|
|
399
|
+
logging.debug("Performing Datadog configuration healthcheck...")
|
|
228
400
|
healthcheck_params = FetchPodLogsParams(
|
|
229
401
|
namespace="*",
|
|
230
402
|
pod_name="*",
|
|
@@ -254,7 +426,7 @@ class DatadogLogsToolset(BasePodLoggingToolset):
|
|
|
254
426
|
if not config:
|
|
255
427
|
return (
|
|
256
428
|
False,
|
|
257
|
-
|
|
429
|
+
"Datadog logs toolset requires configuration. Please provide: dd_api_key, dd_app_key, and site_api_url in your Holmes config. For more details, see https://holmesgpt.dev/data-sources/builtin-toolsets/datadog/",
|
|
258
430
|
)
|
|
259
431
|
|
|
260
432
|
try:
|
|
@@ -54,7 +54,7 @@ class ListActiveMetrics(BaseDatadogMetricsTool):
|
|
|
54
54
|
def __init__(self, toolset: "DatadogMetricsToolset"):
|
|
55
55
|
super().__init__(
|
|
56
56
|
name="list_active_datadog_metrics",
|
|
57
|
-
description=f"List active metrics from Datadog for the last {ACTIVE_METRICS_DEFAULT_LOOK_BACK_HOURS} hours. This includes metrics that have actively reported data points, including from pods no longer in the cluster.",
|
|
57
|
+
description=f"[datadog/metrics toolset] List active metrics from Datadog for the last {ACTIVE_METRICS_DEFAULT_LOOK_BACK_HOURS} hours. This includes metrics that have actively reported data points, including from pods no longer in the cluster.",
|
|
58
58
|
parameters={
|
|
59
59
|
"from_time": ToolParameter(
|
|
60
60
|
description=f"Start time for listing metrics. Can be an RFC3339 formatted datetime (e.g. '2023-03-01T10:30:00Z') or a negative integer for relative seconds from now (e.g. -86400 for 24 hours ago). Defaults to {ACTIVE_METRICS_DEFAULT_LOOK_BACK_HOURS} hours ago",
|
|
@@ -149,7 +149,27 @@ class ListActiveMetrics(BaseDatadogMetricsTool):
|
|
|
149
149
|
f"and 'timeseries_query' permissions. Error: {str(e)}"
|
|
150
150
|
)
|
|
151
151
|
else:
|
|
152
|
-
|
|
152
|
+
# Include full API error details for better debugging
|
|
153
|
+
error_msg = (
|
|
154
|
+
f"Datadog API error (status {e.status_code}): {e.response_text}"
|
|
155
|
+
)
|
|
156
|
+
if params:
|
|
157
|
+
# ListActiveMetrics parameters: from_time, host, tag_filter
|
|
158
|
+
if params.get("host"):
|
|
159
|
+
error_msg += f"\nHost filter: {params.get('host')}"
|
|
160
|
+
if params.get("tag_filter"):
|
|
161
|
+
error_msg += f"\nTag filter: {params.get('tag_filter')}"
|
|
162
|
+
|
|
163
|
+
from_time_param = params.get("from_time")
|
|
164
|
+
if from_time_param:
|
|
165
|
+
time_desc = from_time_param
|
|
166
|
+
else:
|
|
167
|
+
time_desc = f"default (last {ACTIVE_METRICS_DEFAULT_LOOK_BACK_HOURS} hours)"
|
|
168
|
+
error_msg += f"\nTime range: {time_desc}"
|
|
169
|
+
|
|
170
|
+
# Note: We cannot generate a Datadog Metrics Explorer URL for ListActiveMetrics
|
|
171
|
+
# because the Metrics Explorer requires a specific metric query,
|
|
172
|
+
# while ListActiveMetrics just lists available metrics without querying any specific one
|
|
153
173
|
|
|
154
174
|
return StructuredToolResult(
|
|
155
175
|
status=StructuredToolResultStatus.ERROR,
|
|
@@ -184,7 +204,7 @@ class QueryMetrics(BaseDatadogMetricsTool):
|
|
|
184
204
|
def __init__(self, toolset: "DatadogMetricsToolset"):
|
|
185
205
|
super().__init__(
|
|
186
206
|
name="query_datadog_metrics",
|
|
187
|
-
description="Query timeseries data from Datadog for a specific metric, including historical data for pods no longer in the cluster",
|
|
207
|
+
description="[datadog/metrics toolset] Query timeseries data from Datadog for a specific metric, including historical data for pods no longer in the cluster",
|
|
188
208
|
parameters={
|
|
189
209
|
"query": ToolParameter(
|
|
190
210
|
description="The metric query string (e.g., 'system.cpu.user{host:myhost}')",
|
|
@@ -261,9 +281,29 @@ class QueryMetrics(BaseDatadogMetricsTool):
|
|
|
261
281
|
output_type = params.get("output_type", "Plain")
|
|
262
282
|
|
|
263
283
|
if not series:
|
|
284
|
+
# Include detailed context in error message
|
|
285
|
+
from_time_param = params.get("from_time")
|
|
286
|
+
to_time_param = params.get("to_time")
|
|
287
|
+
|
|
288
|
+
if from_time_param:
|
|
289
|
+
from_desc = from_time_param
|
|
290
|
+
else:
|
|
291
|
+
from_desc = (
|
|
292
|
+
f"default (last {DEFAULT_TIME_SPAN_SECONDS // 86400} days)"
|
|
293
|
+
)
|
|
294
|
+
|
|
295
|
+
to_desc = to_time_param or "now"
|
|
296
|
+
|
|
297
|
+
error_msg = (
|
|
298
|
+
f"The query returned no data.\n"
|
|
299
|
+
f"Query: {params.get('query', 'not specified')}\n"
|
|
300
|
+
f"Time range: {from_desc} to {to_desc}\n"
|
|
301
|
+
f"Please check your query syntax and ensure data exists for this time range."
|
|
302
|
+
)
|
|
303
|
+
|
|
264
304
|
return StructuredToolResult(
|
|
265
305
|
status=StructuredToolResultStatus.NO_DATA,
|
|
266
|
-
error=
|
|
306
|
+
error=error_msg,
|
|
267
307
|
params=params,
|
|
268
308
|
)
|
|
269
309
|
|
|
@@ -333,7 +373,25 @@ class QueryMetrics(BaseDatadogMetricsTool):
|
|
|
333
373
|
f"and 'timeseries_query' permissions. Error: {str(e)}"
|
|
334
374
|
)
|
|
335
375
|
else:
|
|
336
|
-
|
|
376
|
+
# Include full API error details for better debugging
|
|
377
|
+
error_msg = (
|
|
378
|
+
f"Datadog API error (status {e.status_code}): {e.response_text}"
|
|
379
|
+
)
|
|
380
|
+
if params:
|
|
381
|
+
error_msg += f"\nQuery: {params.get('query', 'not specified')}"
|
|
382
|
+
|
|
383
|
+
from_time_param = params.get("from_time")
|
|
384
|
+
to_time_param = params.get("to_time")
|
|
385
|
+
|
|
386
|
+
if from_time_param:
|
|
387
|
+
from_desc = from_time_param
|
|
388
|
+
else:
|
|
389
|
+
from_desc = (
|
|
390
|
+
f"default (last {DEFAULT_TIME_SPAN_SECONDS // 86400} days)"
|
|
391
|
+
)
|
|
392
|
+
|
|
393
|
+
to_desc = to_time_param or "now"
|
|
394
|
+
error_msg += f"\nTime range: {from_desc} to {to_desc}"
|
|
337
395
|
|
|
338
396
|
return StructuredToolResult(
|
|
339
397
|
status=StructuredToolResultStatus.ERROR,
|
|
@@ -364,7 +422,7 @@ class QueryMetricsMetadata(BaseDatadogMetricsTool):
|
|
|
364
422
|
def __init__(self, toolset: "DatadogMetricsToolset"):
|
|
365
423
|
super().__init__(
|
|
366
424
|
name="get_datadog_metric_metadata",
|
|
367
|
-
description="Get metadata about one or more metrics including their type, description, unit, and other properties",
|
|
425
|
+
description="[datadog/metrics toolset] Get metadata about one or more metrics including their type, description, unit, and other properties",
|
|
368
426
|
parameters={
|
|
369
427
|
"metric_names": ToolParameter(
|
|
370
428
|
description="Comma-separated list of metric names to get metadata for (e.g., 'system.cpu.user, system.mem.used')",
|
|
@@ -480,7 +538,7 @@ class ListMetricTags(BaseDatadogMetricsTool):
|
|
|
480
538
|
def __init__(self, toolset: "DatadogMetricsToolset"):
|
|
481
539
|
super().__init__(
|
|
482
540
|
name="list_datadog_metric_tags",
|
|
483
|
-
description="List all available tags and aggregations for a specific metric. This helps in building queries by showing what dimensions are available for filtering.",
|
|
541
|
+
description="[datadog/metrics toolset] List all available tags and aggregations for a specific metric. This helps in building queries by showing what dimensions are available for filtering.",
|
|
484
542
|
parameters={
|
|
485
543
|
"metric_name": ToolParameter(
|
|
486
544
|
description="The name of the metric to get tags for (e.g., 'system.cpu.user', 'container.memory.usage')",
|
|
@@ -537,7 +595,14 @@ class ListMetricTags(BaseDatadogMetricsTool):
|
|
|
537
595
|
f"permissions. Error: {str(e)}"
|
|
538
596
|
)
|
|
539
597
|
else:
|
|
540
|
-
|
|
598
|
+
# Include full API error details for better debugging
|
|
599
|
+
error_msg = (
|
|
600
|
+
f"Datadog API error (status {e.status_code}): {e.response_text}"
|
|
601
|
+
)
|
|
602
|
+
if params:
|
|
603
|
+
error_msg += (
|
|
604
|
+
f"\nMetric name: {params.get('metric_name', 'not specified')}"
|
|
605
|
+
)
|
|
541
606
|
|
|
542
607
|
return StructuredToolResult(
|
|
543
608
|
status=StructuredToolResultStatus.ERROR,
|
|
@@ -586,7 +651,7 @@ class DatadogMetricsToolset(Toolset):
|
|
|
586
651
|
|
|
587
652
|
def _perform_healthcheck(self, dd_config: DatadogMetricsConfig) -> Tuple[bool, str]:
|
|
588
653
|
try:
|
|
589
|
-
logging.
|
|
654
|
+
logging.debug("Performing Datadog metrics configuration healthcheck...")
|
|
590
655
|
|
|
591
656
|
url = f"{dd_config.site_api_url}/api/v1/validate"
|
|
592
657
|
headers = get_headers(dd_config)
|
|
@@ -615,7 +680,7 @@ class DatadogMetricsToolset(Toolset):
|
|
|
615
680
|
if not config:
|
|
616
681
|
return (
|
|
617
682
|
False,
|
|
618
|
-
|
|
683
|
+
"Datadog metrics toolset requires configuration. Please provide: dd_api_key, dd_app_key, and site_api_url in your Holmes config. For more details, see https://holmesgpt.dev/data-sources/builtin-toolsets/datadog/",
|
|
619
684
|
)
|
|
620
685
|
|
|
621
686
|
try:
|
|
@@ -69,7 +69,7 @@ class GenerateRDSPerformanceReport(BaseDatadogRDSTool):
|
|
|
69
69
|
def __init__(self, toolset: "DatadogRDSToolset"):
|
|
70
70
|
super().__init__(
|
|
71
71
|
name="datadog_rds_performance_report",
|
|
72
|
-
description="Generate a comprehensive performance report for a specific RDS instance including latency, resource utilization, and storage metrics with analysis",
|
|
72
|
+
description="[datadog/rds toolset] Generate a comprehensive performance report for a specific RDS instance including latency, resource utilization, and storage metrics with analysis",
|
|
73
73
|
parameters={
|
|
74
74
|
"db_instance_identifier": ToolParameter(
|
|
75
75
|
description="The RDS database instance identifier",
|
|
@@ -364,7 +364,7 @@ class GetTopWorstPerformingRDSInstances(BaseDatadogRDSTool):
|
|
|
364
364
|
def __init__(self, toolset: "DatadogRDSToolset"):
|
|
365
365
|
super().__init__(
|
|
366
366
|
name="datadog_rds_top_worst_performing",
|
|
367
|
-
description="Get a summarized report of the top worst performing RDS instances based on latency, CPU utilization, and error rates",
|
|
367
|
+
description="[datadog/rds toolset] Get a summarized report of the top worst performing RDS instances based on latency, CPU utilization, and error rates",
|
|
368
368
|
parameters={
|
|
369
369
|
"top_n": ToolParameter(
|
|
370
370
|
description=f"Number of worst performing instances to return (default: {DEFAULT_TOP_INSTANCES})",
|
|
@@ -156,7 +156,7 @@ class FetchDatadogTracesList(BaseDatadogTracesTool):
|
|
|
156
156
|
def __init__(self, toolset: "DatadogTracesToolset"):
|
|
157
157
|
super().__init__(
|
|
158
158
|
name="fetch_datadog_traces",
|
|
159
|
-
description="Fetch a list of traces from Datadog with optional filters",
|
|
159
|
+
description="[datadog/traces toolset] Fetch a list of traces from Datadog with optional filters",
|
|
160
160
|
parameters={
|
|
161
161
|
"service": ToolParameter(
|
|
162
162
|
description="Filter by service name",
|
|
@@ -360,7 +360,7 @@ class FetchDatadogTraceById(BaseDatadogTracesTool):
|
|
|
360
360
|
def __init__(self, toolset: "DatadogTracesToolset"):
|
|
361
361
|
super().__init__(
|
|
362
362
|
name="fetch_datadog_trace_by_id",
|
|
363
|
-
description="Fetch detailed information about a specific trace by its ID",
|
|
363
|
+
description="[datadog/traces toolset] Fetch detailed information about a specific trace by its ID",
|
|
364
364
|
parameters={
|
|
365
365
|
"trace_id": ToolParameter(
|
|
366
366
|
description="The trace ID to fetch details for",
|
|
@@ -499,7 +499,7 @@ class FetchDatadogSpansByFilter(BaseDatadogTracesTool):
|
|
|
499
499
|
def __init__(self, toolset: "DatadogTracesToolset"):
|
|
500
500
|
super().__init__(
|
|
501
501
|
name="fetch_datadog_spans",
|
|
502
|
-
description="Search for spans in Datadog with detailed filters",
|
|
502
|
+
description="[datadog/traces toolset] Search for spans in Datadog with detailed filters",
|
|
503
503
|
parameters={
|
|
504
504
|
"query": ToolParameter(
|
|
505
505
|
description="Datadog search query (e.g., 'service:web-app @http.status_code:500')",
|
|
@@ -14,6 +14,7 @@ from holmes.plugins.toolsets.logging_utils.logging_api import (
|
|
|
14
14
|
LoggingCapability,
|
|
15
15
|
PodLoggingTool,
|
|
16
16
|
DEFAULT_TIME_SPAN_SECONDS,
|
|
17
|
+
DEFAULT_LOG_LIMIT,
|
|
17
18
|
)
|
|
18
19
|
from holmes.plugins.toolsets.utils import (
|
|
19
20
|
process_timestamps_to_rfc3339,
|
|
@@ -94,7 +95,7 @@ class GrafanaLokiToolset(BasePodLoggingToolset):
|
|
|
94
95
|
label_value=params.pod_name,
|
|
95
96
|
start=start,
|
|
96
97
|
end=end,
|
|
97
|
-
limit=params.limit or
|
|
98
|
+
limit=params.limit or DEFAULT_LOG_LIMIT,
|
|
98
99
|
)
|
|
99
100
|
if logs:
|
|
100
101
|
logs.sort(key=lambda x: x["timestamp"])
|
|
@@ -242,9 +242,9 @@ Examples:
|
|
|
242
242
|
import logging
|
|
243
243
|
|
|
244
244
|
logger = logging.getLogger(__name__)
|
|
245
|
-
logger.
|
|
245
|
+
logger.debug(f"Tempo query: {stats_query}")
|
|
246
246
|
|
|
247
|
-
logger.
|
|
247
|
+
logger.debug(f"start: {start}, end: {end}")
|
|
248
248
|
|
|
249
249
|
all_traces_response = api.search_traces_by_query(
|
|
250
250
|
q=stats_query,
|
|
@@ -253,7 +253,7 @@ Examples:
|
|
|
253
253
|
limit=1000,
|
|
254
254
|
)
|
|
255
255
|
|
|
256
|
-
logger.
|
|
256
|
+
logger.debug(f"Response: {all_traces_response}")
|
|
257
257
|
|
|
258
258
|
traces = all_traces_response.get("traces", [])
|
|
259
259
|
if not traces:
|
|
@@ -18,7 +18,7 @@ from holmes.plugins.toolsets.utils import get_param_or_raise
|
|
|
18
18
|
DEFAULT_LOG_LIMIT = 100
|
|
19
19
|
SECONDS_PER_DAY = 24 * 60 * 60
|
|
20
20
|
DEFAULT_TIME_SPAN_SECONDS = 7 * SECONDS_PER_DAY # 1 week in seconds
|
|
21
|
-
DEFAULT_GRAPH_TIME_SPAN_SECONDS = 1 *
|
|
21
|
+
DEFAULT_GRAPH_TIME_SPAN_SECONDS = 1 * 60 * 60 # 1 hour in seconds
|
|
22
22
|
|
|
23
23
|
POD_LOGGING_TOOL_NAME = "fetch_pod_logs"
|
|
24
24
|
|