holmesgpt 0.13.3a0__py3-none-any.whl → 0.14.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of holmesgpt might be problematic. Click here for more details.

Files changed (86)
  1. holmes/__init__.py +1 -1
  2. holmes/clients/robusta_client.py +15 -4
  3. holmes/common/env_vars.py +8 -1
  4. holmes/config.py +66 -139
  5. holmes/core/investigation.py +1 -2
  6. holmes/core/llm.py +295 -52
  7. holmes/core/models.py +2 -0
  8. holmes/core/safeguards.py +4 -4
  9. holmes/core/supabase_dal.py +14 -8
  10. holmes/core/tool_calling_llm.py +202 -177
  11. holmes/core/tools.py +260 -25
  12. holmes/core/tools_utils/data_types.py +81 -0
  13. holmes/core/tools_utils/tool_context_window_limiter.py +33 -0
  14. holmes/core/tools_utils/tool_executor.py +2 -2
  15. holmes/core/toolset_manager.py +150 -3
  16. holmes/core/tracing.py +6 -1
  17. holmes/core/transformers/__init__.py +23 -0
  18. holmes/core/transformers/base.py +62 -0
  19. holmes/core/transformers/llm_summarize.py +174 -0
  20. holmes/core/transformers/registry.py +122 -0
  21. holmes/core/transformers/transformer.py +31 -0
  22. holmes/main.py +5 -0
  23. holmes/plugins/prompts/_fetch_logs.jinja2 +10 -1
  24. holmes/plugins/toolsets/aks-node-health.yaml +46 -0
  25. holmes/plugins/toolsets/aks.yaml +64 -0
  26. holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +17 -15
  27. holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +8 -4
  28. holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +7 -3
  29. holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +3 -3
  30. holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +3 -3
  31. holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +7 -3
  32. holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +4 -4
  33. holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +7 -3
  34. holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +7 -3
  35. holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +7 -3
  36. holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +7 -3
  37. holmes/plugins/toolsets/bash/bash_toolset.py +6 -6
  38. holmes/plugins/toolsets/bash/common/bash.py +7 -7
  39. holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +5 -3
  40. holmes/plugins/toolsets/datadog/datadog_api.py +490 -24
  41. holmes/plugins/toolsets/datadog/datadog_logs_instructions.jinja2 +21 -10
  42. holmes/plugins/toolsets/datadog/toolset_datadog_general.py +345 -207
  43. holmes/plugins/toolsets/datadog/toolset_datadog_logs.py +190 -19
  44. holmes/plugins/toolsets/datadog/toolset_datadog_metrics.py +96 -32
  45. holmes/plugins/toolsets/datadog/toolset_datadog_rds.py +10 -10
  46. holmes/plugins/toolsets/datadog/toolset_datadog_traces.py +21 -22
  47. holmes/plugins/toolsets/git.py +22 -22
  48. holmes/plugins/toolsets/grafana/common.py +14 -2
  49. holmes/plugins/toolsets/grafana/grafana_tempo_api.py +473 -0
  50. holmes/plugins/toolsets/grafana/toolset_grafana.py +4 -4
  51. holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +5 -4
  52. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.jinja2 +246 -11
  53. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +662 -290
  54. holmes/plugins/toolsets/grafana/trace_parser.py +1 -1
  55. holmes/plugins/toolsets/internet/internet.py +3 -3
  56. holmes/plugins/toolsets/internet/notion.py +3 -3
  57. holmes/plugins/toolsets/investigator/core_investigation.py +3 -3
  58. holmes/plugins/toolsets/kafka.py +18 -18
  59. holmes/plugins/toolsets/kubernetes.yaml +58 -0
  60. holmes/plugins/toolsets/kubernetes_logs.py +6 -6
  61. holmes/plugins/toolsets/kubernetes_logs.yaml +32 -0
  62. holmes/plugins/toolsets/logging_utils/logging_api.py +1 -1
  63. holmes/plugins/toolsets/mcp/toolset_mcp.py +4 -4
  64. holmes/plugins/toolsets/newrelic.py +8 -8
  65. holmes/plugins/toolsets/opensearch/opensearch.py +5 -5
  66. holmes/plugins/toolsets/opensearch/opensearch_logs.py +7 -7
  67. holmes/plugins/toolsets/opensearch/opensearch_traces.py +10 -10
  68. holmes/plugins/toolsets/prometheus/prometheus.py +841 -351
  69. holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2 +39 -2
  70. holmes/plugins/toolsets/prometheus/utils.py +28 -0
  71. holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +6 -4
  72. holmes/plugins/toolsets/robusta/robusta.py +10 -10
  73. holmes/plugins/toolsets/runbook/runbook_fetcher.py +4 -4
  74. holmes/plugins/toolsets/servicenow/servicenow.py +6 -6
  75. holmes/plugins/toolsets/utils.py +88 -0
  76. holmes/utils/config_utils.py +91 -0
  77. holmes/utils/env.py +7 -0
  78. holmes/utils/holmes_status.py +2 -1
  79. holmes/utils/sentry_helper.py +41 -0
  80. holmes/utils/stream.py +9 -0
  81. {holmesgpt-0.13.3a0.dist-info → holmesgpt-0.14.1.dist-info}/METADATA +11 -15
  82. {holmesgpt-0.13.3a0.dist-info → holmesgpt-0.14.1.dist-info}/RECORD +85 -75
  83. holmes/plugins/toolsets/grafana/tempo_api.py +0 -124
  84. {holmesgpt-0.13.3a0.dist-info → holmesgpt-0.14.1.dist-info}/LICENSE.txt +0 -0
  85. {holmesgpt-0.13.3a0.dist-info → holmesgpt-0.14.1.dist-info}/WHEEL +0 -0
  86. {holmesgpt-0.13.3a0.dist-info → holmesgpt-0.14.1.dist-info}/entry_points.txt +0 -0
@@ -3,12 +3,13 @@ from enum import Enum
3
3
  import json
4
4
  import logging
5
5
  from typing import Any, Optional, Dict, Tuple, Set
6
+ from urllib.parse import urlencode
6
7
  from holmes.core.tools import (
7
8
  CallablePrerequisite,
8
9
  ToolsetTag,
9
10
  )
10
11
  from pydantic import BaseModel, Field
11
- from holmes.core.tools import StructuredToolResult, ToolResultStatus
12
+ from holmes.core.tools import StructuredToolResult, StructuredToolResultStatus
12
13
  from holmes.plugins.toolsets.consts import TOOLSET_CONFIG_MISSING_ERROR
13
14
  from holmes.plugins.toolsets.datadog.datadog_api import (
14
15
  DatadogBaseConfig,
@@ -16,6 +17,8 @@ from holmes.plugins.toolsets.datadog.datadog_api import (
16
17
  execute_paginated_datadog_http_request,
17
18
  get_headers,
18
19
  MAX_RETRY_COUNT_ON_RATE_LIMIT,
20
+ enhance_error_message,
21
+ preprocess_time_fields,
19
22
  )
20
23
  from holmes.plugins.toolsets.logging_utils.logging_api import (
21
24
  DEFAULT_TIME_SPAN_SECONDS,
@@ -99,23 +102,28 @@ def fetch_paginated_logs(
99
102
  "page": {"limit": calculate_page_size(params, dd_config, [])},
100
103
  }
101
104
 
105
+ # Preprocess time fields to ensure correct format
106
+ processed_payload = preprocess_time_fields(payload, "/api/v2/logs/events/search")
107
+
102
108
  logs, cursor = execute_paginated_datadog_http_request(
103
109
  url=url,
104
110
  headers=headers,
105
- payload_or_params=payload,
111
+ payload_or_params=processed_payload,
106
112
  timeout=dd_config.request_timeout,
107
113
  )
108
114
 
109
115
  while cursor and len(logs) < limit:
110
- payload["page"]["cursor"] = cursor
116
+ processed_payload["page"]["cursor"] = cursor
117
+ processed_payload["page"]["limit"] = calculate_page_size(
118
+ params, dd_config, logs
119
+ )
111
120
  new_logs, cursor = execute_paginated_datadog_http_request(
112
121
  url=url,
113
122
  headers=headers,
114
- payload_or_params=payload,
123
+ payload_or_params=processed_payload,
115
124
  timeout=dd_config.request_timeout,
116
125
  )
117
126
  logs += new_logs
118
- payload["page"]["limit"] = calculate_page_size(params, dd_config, logs)
119
127
 
120
128
  # logs are fetched descending order. Unified logging API follows the pattern of kubectl logs where oldest logs are first
121
129
  logs.reverse()
@@ -129,14 +137,73 @@ def format_logs(raw_logs: list[dict]) -> str:
129
137
  logs = []
130
138
 
131
139
  for raw_log_item in raw_logs:
140
+ # Extract timestamp - Datadog returns it in ISO format
141
+ timestamp = raw_log_item.get("attributes", {}).get("timestamp", "")
142
+ if not timestamp:
143
+ # Fallback to @timestamp if timestamp is not in attributes
144
+ timestamp = raw_log_item.get("attributes", {}).get("@timestamp", "")
145
+
146
+ # Extract message
132
147
  message = raw_log_item.get("attributes", {}).get(
133
148
  "message", json.dumps(raw_log_item)
134
149
  )
135
- logs.append(message)
150
+
151
+ # Format as: [timestamp] message
152
+ if timestamp:
153
+ logs.append(f"[{timestamp}] {message}")
154
+ else:
155
+ logs.append(message)
136
156
 
137
157
  return "\n".join(logs)
138
158
 
139
159
 
160
+ def generate_datadog_logs_url(
161
+ dd_config: DatadogLogsConfig,
162
+ params: FetchPodLogsParams,
163
+ storage_tier: DataDogStorageTier,
164
+ ) -> str:
165
+ """Generate a Datadog web UI URL for the logs query."""
166
+ from holmes.plugins.toolsets.utils import process_timestamps_to_int
167
+ from holmes.plugins.toolsets.datadog.datadog_api import convert_api_url_to_app_url
168
+
169
+ # Convert API URL to app URL using the shared helper
170
+ base_url = convert_api_url_to_app_url(dd_config.site_api_url)
171
+
172
+ # Build the query string
173
+ query = f"{dd_config.labels.namespace}:{params.namespace}"
174
+ query += f" {dd_config.labels.pod}:{params.pod_name}"
175
+ if params.filter:
176
+ filter = params.filter.replace('"', '\\"')
177
+ query += f' "{filter}"'
178
+
179
+ # Process timestamps - get Unix timestamps in seconds
180
+ (from_time_seconds, to_time_seconds) = process_timestamps_to_int(
181
+ start=params.start_time,
182
+ end=params.end_time,
183
+ default_time_span_seconds=DEFAULT_TIME_SPAN_SECONDS,
184
+ )
185
+
186
+ # Convert to milliseconds for Datadog web UI
187
+ from_time_ms = from_time_seconds * 1000
188
+ to_time_ms = to_time_seconds * 1000
189
+
190
+ # Build URL parameters matching Datadog's web UI format
191
+ url_params = {
192
+ "query": query,
193
+ "from_ts": str(from_time_ms),
194
+ "to_ts": str(to_time_ms),
195
+ "live": "true",
196
+ "storage": storage_tier.value,
197
+ }
198
+
199
+ # Add indexes if not default
200
+ if dd_config.indexes != ["*"]:
201
+ url_params["index"] = ",".join(dd_config.indexes)
202
+
203
+ # Construct the full URL
204
+ return f"{base_url}/logs?{urlencode(url_params)}"
205
+
206
+
140
207
  class DatadogLogsToolset(BasePodLoggingToolset):
141
208
  dd_config: Optional[DatadogLogsConfig] = None
142
209
 
@@ -151,11 +218,10 @@ class DatadogLogsToolset(BasePodLoggingToolset):
151
218
  super().__init__(
152
219
  name="datadog/logs",
153
220
  description="Toolset for fetching logs from Datadog, including historical data for pods no longer in the cluster",
154
- docs_url="https://docs.datadoghq.com/api/latest/logs/",
221
+ docs_url="https://holmesgpt.dev/data-sources/builtin-toolsets/datadog/",
155
222
  icon_url="https://imgix.datadoghq.com//img/about/presskit/DDlogo.jpg",
156
223
  prerequisites=[CallablePrerequisite(callable=self.prerequisites_callable)],
157
224
  tools=[], # Initialize with empty tools first
158
- experimental=True,
159
225
  tags=[ToolsetTag.CORE],
160
226
  )
161
227
  # Now that parent is initialized and self.name exists, create the tool
@@ -168,7 +234,7 @@ class DatadogLogsToolset(BasePodLoggingToolset):
168
234
  def fetch_pod_logs(self, params: FetchPodLogsParams) -> StructuredToolResult:
169
235
  if not self.dd_config:
170
236
  return StructuredToolResult(
171
- status=ToolResultStatus.ERROR,
237
+ status=StructuredToolResultStatus.ERROR,
172
238
  data=TOOLSET_CONFIG_MISSING_ERROR,
173
239
  params=params.model_dump(),
174
240
  )
@@ -182,29 +248,134 @@ class DatadogLogsToolset(BasePodLoggingToolset):
182
248
 
183
249
  if raw_logs:
184
250
  logs_str = format_logs(raw_logs)
251
+ # Generate Datadog web UI URL
252
+ datadog_url = generate_datadog_logs_url(
253
+ self.dd_config, params, storage_tier
254
+ )
255
+ logs_with_link = f"{logs_str}\n\nView in Datadog: {datadog_url}"
185
256
  return StructuredToolResult(
186
- status=ToolResultStatus.SUCCESS,
187
- data=logs_str,
257
+ status=StructuredToolResultStatus.SUCCESS,
258
+ data=logs_with_link,
259
+ url=datadog_url,
188
260
  params=params.model_dump(),
189
261
  )
190
262
 
263
+ # Include detailed diagnostic context
264
+ query = f"{self.dd_config.labels.namespace}:{params.namespace} {self.dd_config.labels.pod}:{params.pod_name}"
265
+ if params.filter:
266
+ query += f' "{params.filter}"'
267
+
268
+ # Get actual time range used
269
+ (from_time, to_time) = process_timestamps_to_rfc3339(
270
+ start_timestamp=params.start_time,
271
+ end_timestamp=params.end_time,
272
+ default_time_span_seconds=DEFAULT_TIME_SPAN_SECONDS,
273
+ )
274
+
275
+ # Generate Datadog web UI URL for the last storage tier checked
276
+ datadog_url = generate_datadog_logs_url(
277
+ self.dd_config, params, self.dd_config.storage_tiers[-1]
278
+ )
279
+
280
+ # Build diagnostic information
281
+ diagnostics: Dict[str, Any] = {
282
+ "query_executed": query,
283
+ "time_range": f"{from_time} to {to_time}",
284
+ "indexes_searched": self.dd_config.indexes,
285
+ "storage_tiers_checked": [
286
+ tier.value for tier in self.dd_config.storage_tiers
287
+ ],
288
+ "field_mappings": {
289
+ "namespace_field": self.dd_config.labels.namespace,
290
+ "pod_field": self.dd_config.labels.pod,
291
+ },
292
+ "limit": params.limit or self.dd_config.default_limit,
293
+ "datadog_url": datadog_url,
294
+ }
295
+
296
+ # Format diagnostic info as structured text
297
+ error_msg = (
298
+ f"No logs found.\n\n"
299
+ f"Diagnostic Information:\n"
300
+ f"----------------------\n"
301
+ f"Query executed: {diagnostics['query_executed']}\n"
302
+ f"Time range: {diagnostics['time_range']}\n"
303
+ f"Indexes searched: {diagnostics['indexes_searched']}\n"
304
+ f"Storage tiers checked: {', '.join(str(tier) for tier in diagnostics.get('storage_tiers_checked', []))}\n"
305
+ f"Field mappings:\n"
306
+ f" - Namespace field: {diagnostics.get('field_mappings', {}).get('namespace_field', 'N/A')}\n"
307
+ f" - Pod field: {diagnostics.get('field_mappings', {}).get('pod_field', 'N/A')}\n"
308
+ f"Limit: {diagnostics['limit']}\n\n"
309
+ f"View in Datadog: {diagnostics['datadog_url']}"
310
+ )
311
+
191
312
  return StructuredToolResult(
192
- status=ToolResultStatus.NO_DATA,
313
+ status=StructuredToolResultStatus.NO_DATA,
314
+ error=error_msg,
315
+ url=datadog_url,
193
316
  params=params.model_dump(),
194
317
  )
195
318
 
196
319
  except DataDogRequestError as e:
197
320
  logging.exception(e, exc_info=True)
198
321
 
322
+ # Always try to generate Datadog URL for debugging
323
+ try:
324
+ datadog_url = generate_datadog_logs_url(
325
+ self.dd_config, params, self.dd_config.storage_tiers[0]
326
+ )
327
+ except Exception:
328
+ datadog_url = None
329
+
199
330
  # Provide more specific error message for rate limiting failures
200
331
  if e.status_code == 429:
201
332
  error_msg = f"Datadog API rate limit exceeded. Failed after {MAX_RETRY_COUNT_ON_RATE_LIMIT} retry attempts."
333
+ if datadog_url:
334
+ error_msg += f"\nView in Datadog: {datadog_url}"
335
+ elif e.status_code == 400:
336
+ # Use enhanced error message for validation errors
337
+ error_msg = enhance_error_message(
338
+ e,
339
+ "/api/v2/logs/events/search",
340
+ "POST",
341
+ str(self.dd_config.site_api_url),
342
+ )
343
+
344
+ # Add query context
345
+ query = f"{self.dd_config.labels.namespace}:{params.namespace} {self.dd_config.labels.pod}:{params.pod_name}"
346
+ if params.filter:
347
+ query += f' "{params.filter}"'
348
+ error_msg += f"\n\nQuery attempted: {query}"
349
+
350
+ # Add Datadog web UI URL to error message
351
+ if datadog_url:
352
+ error_msg += f"\nView in Datadog: {datadog_url}"
202
353
  else:
203
- error_msg = f"Exception while querying Datadog: {str(e)}"
354
+ # Include full API error details and query context
355
+ error_msg = (
356
+ f"Datadog API error (status {e.status_code}): {e.response_text}"
357
+ )
358
+ query = f"{self.dd_config.labels.namespace}:{params.namespace} {self.dd_config.labels.pod}:{params.pod_name}"
359
+ if params.filter:
360
+ query += f' "{params.filter}"'
361
+ error_msg += f"\nQuery: {query}"
362
+
363
+ # Get actual time range used
364
+ (from_time, to_time) = process_timestamps_to_rfc3339(
365
+ start_timestamp=params.start_time,
366
+ end_timestamp=params.end_time,
367
+ default_time_span_seconds=DEFAULT_TIME_SPAN_SECONDS,
368
+ )
369
+ error_msg += f"\nTime range: {from_time} to {to_time}"
370
+
371
+ # Add Datadog web UI URL to error message
372
+ if datadog_url:
373
+ error_msg += f"\nView in Datadog: {datadog_url}"
204
374
 
205
375
  return StructuredToolResult(
206
- status=ToolResultStatus.ERROR,
376
+ status=StructuredToolResultStatus.ERROR,
207
377
  error=error_msg,
378
+ url=datadog_url,
208
379
  params=params.model_dump(),
209
380
  invocation=json.dumps(e.payload),
210
381
  )
@@ -214,7 +385,7 @@ class DatadogLogsToolset(BasePodLoggingToolset):
214
385
  f"Failed to query Datadog logs for params: {params}", exc_info=True
215
386
  )
216
387
  return StructuredToolResult(
217
- status=ToolResultStatus.ERROR,
388
+ status=StructuredToolResultStatus.ERROR,
218
389
  error=f"Exception while querying Datadog: {str(e)}",
219
390
  params=params.model_dump(),
220
391
  )
@@ -225,7 +396,7 @@ class DatadogLogsToolset(BasePodLoggingToolset):
225
396
  Returns (success, error_message).
226
397
  """
227
398
  try:
228
- logging.info("Performing Datadog configuration healthcheck...")
399
+ logging.debug("Performing Datadog configuration healthcheck...")
229
400
  healthcheck_params = FetchPodLogsParams(
230
401
  namespace="*",
231
402
  pod_name="*",
@@ -235,11 +406,11 @@ class DatadogLogsToolset(BasePodLoggingToolset):
235
406
 
236
407
  result = self.fetch_pod_logs(healthcheck_params)
237
408
 
238
- if result.status == ToolResultStatus.ERROR:
409
+ if result.status == StructuredToolResultStatus.ERROR:
239
410
  error_msg = result.error or "Unknown error during healthcheck"
240
411
  logging.error(f"Datadog healthcheck failed: {error_msg}")
241
412
  return False, f"Datadog healthcheck failed: {error_msg}"
242
- elif result.status == ToolResultStatus.NO_DATA:
413
+ elif result.status == StructuredToolResultStatus.NO_DATA:
243
414
  error_msg = "No logs were found in the last 48 hours using wildcards for pod and namespace. Is the configuration correct?"
244
415
  logging.error(f"Datadog healthcheck failed: {error_msg}")
245
416
  return False, f"Datadog healthcheck failed: {error_msg}"
@@ -255,7 +426,7 @@ class DatadogLogsToolset(BasePodLoggingToolset):
255
426
  if not config:
256
427
  return (
257
428
  False,
258
- TOOLSET_CONFIG_MISSING_ERROR,
429
+ "Datadog logs toolset requires configuration. Please provide: dd_api_key, dd_app_key, and site_api_url in your Holmes config. For more details, see https://holmesgpt.dev/data-sources/builtin-toolsets/datadog/",
259
430
  )
260
431
 
261
432
  try:
@@ -7,7 +7,7 @@ from holmes.core.tools import (
7
7
  StructuredToolResult,
8
8
  Tool,
9
9
  ToolParameter,
10
- ToolResultStatus,
10
+ StructuredToolResultStatus,
11
11
  Toolset,
12
12
  ToolsetTag,
13
13
  )
@@ -54,7 +54,7 @@ class ListActiveMetrics(BaseDatadogMetricsTool):
54
54
  def __init__(self, toolset: "DatadogMetricsToolset"):
55
55
  super().__init__(
56
56
  name="list_active_datadog_metrics",
57
- description=f"List active metrics from Datadog for the last {ACTIVE_METRICS_DEFAULT_LOOK_BACK_HOURS} hours. This includes metrics that have actively reported data points, including from pods no longer in the cluster.",
57
+ description=f"[datadog/metrics toolset] List active metrics from Datadog for the last {ACTIVE_METRICS_DEFAULT_LOOK_BACK_HOURS} hours. This includes metrics that have actively reported data points, including from pods no longer in the cluster.",
58
58
  parameters={
59
59
  "from_time": ToolParameter(
60
60
  description=f"Start time for listing metrics. Can be an RFC3339 formatted datetime (e.g. '2023-03-01T10:30:00Z') or a negative integer for relative seconds from now (e.g. -86400 for 24 hours ago). Defaults to {ACTIVE_METRICS_DEFAULT_LOOK_BACK_HOURS} hours ago",
@@ -80,7 +80,7 @@ class ListActiveMetrics(BaseDatadogMetricsTool):
80
80
  ) -> StructuredToolResult:
81
81
  if not self.toolset.dd_config:
82
82
  return StructuredToolResult(
83
- status=ToolResultStatus.ERROR,
83
+ status=StructuredToolResultStatus.ERROR,
84
84
  error=TOOLSET_CONFIG_MISSING_ERROR,
85
85
  params=params,
86
86
  )
@@ -121,7 +121,7 @@ class ListActiveMetrics(BaseDatadogMetricsTool):
121
121
  metrics = data.get("metrics", [])
122
122
  if not metrics:
123
123
  return StructuredToolResult(
124
- status=ToolResultStatus.ERROR,
124
+ status=StructuredToolResultStatus.ERROR,
125
125
  data="Your filter returned no metrics. Change your filter and try again",
126
126
  params=params,
127
127
  )
@@ -133,7 +133,7 @@ class ListActiveMetrics(BaseDatadogMetricsTool):
133
133
  output.append(metric)
134
134
 
135
135
  return StructuredToolResult(
136
- status=ToolResultStatus.SUCCESS,
136
+ status=StructuredToolResultStatus.SUCCESS,
137
137
  data="\n".join(output),
138
138
  params=params,
139
139
  )
@@ -149,10 +149,30 @@ class ListActiveMetrics(BaseDatadogMetricsTool):
149
149
  f"and 'timeseries_query' permissions. Error: {str(e)}"
150
150
  )
151
151
  else:
152
- error_msg = f"Exception while querying Datadog: {str(e)}"
152
+ # Include full API error details for better debugging
153
+ error_msg = (
154
+ f"Datadog API error (status {e.status_code}): {e.response_text}"
155
+ )
156
+ if params:
157
+ # ListActiveMetrics parameters: from_time, host, tag_filter
158
+ if params.get("host"):
159
+ error_msg += f"\nHost filter: {params.get('host')}"
160
+ if params.get("tag_filter"):
161
+ error_msg += f"\nTag filter: {params.get('tag_filter')}"
162
+
163
+ from_time_param = params.get("from_time")
164
+ if from_time_param:
165
+ time_desc = from_time_param
166
+ else:
167
+ time_desc = f"default (last {ACTIVE_METRICS_DEFAULT_LOOK_BACK_HOURS} hours)"
168
+ error_msg += f"\nTime range: {time_desc}"
169
+
170
+ # Note: We cannot generate a Datadog Metrics Explorer URL for ListActiveMetrics
171
+ # because the Metrics Explorer requires a specific metric query,
172
+ # while ListActiveMetrics just lists available metrics without querying any specific one
153
173
 
154
174
  return StructuredToolResult(
155
- status=ToolResultStatus.ERROR,
175
+ status=StructuredToolResultStatus.ERROR,
156
176
  error=error_msg,
157
177
  params=params,
158
178
  invocation=json.dumps({"url": url, "params": query_params})
@@ -165,7 +185,7 @@ class ListActiveMetrics(BaseDatadogMetricsTool):
165
185
  f"Failed to query Datadog metrics for params: {params}", exc_info=True
166
186
  )
167
187
  return StructuredToolResult(
168
- status=ToolResultStatus.ERROR,
188
+ status=StructuredToolResultStatus.ERROR,
169
189
  error=f"Exception while querying Datadog: {str(e)}",
170
190
  params=params,
171
191
  )
@@ -184,7 +204,7 @@ class QueryMetrics(BaseDatadogMetricsTool):
184
204
  def __init__(self, toolset: "DatadogMetricsToolset"):
185
205
  super().__init__(
186
206
  name="query_datadog_metrics",
187
- description="Query timeseries data from Datadog for a specific metric, including historical data for pods no longer in the cluster",
207
+ description="[datadog/metrics toolset] Query timeseries data from Datadog for a specific metric, including historical data for pods no longer in the cluster",
188
208
  parameters={
189
209
  "query": ToolParameter(
190
210
  description="The metric query string (e.g., 'system.cpu.user{host:myhost}')",
@@ -222,7 +242,7 @@ class QueryMetrics(BaseDatadogMetricsTool):
222
242
  ) -> StructuredToolResult:
223
243
  if not self.toolset.dd_config:
224
244
  return StructuredToolResult(
225
- status=ToolResultStatus.ERROR,
245
+ status=StructuredToolResultStatus.ERROR,
226
246
  error=TOOLSET_CONFIG_MISSING_ERROR,
227
247
  params=params,
228
248
  )
@@ -261,9 +281,29 @@ class QueryMetrics(BaseDatadogMetricsTool):
261
281
  output_type = params.get("output_type", "Plain")
262
282
 
263
283
  if not series:
284
+ # Include detailed context in error message
285
+ from_time_param = params.get("from_time")
286
+ to_time_param = params.get("to_time")
287
+
288
+ if from_time_param:
289
+ from_desc = from_time_param
290
+ else:
291
+ from_desc = (
292
+ f"default (last {DEFAULT_TIME_SPAN_SECONDS // 86400} days)"
293
+ )
294
+
295
+ to_desc = to_time_param or "now"
296
+
297
+ error_msg = (
298
+ f"The query returned no data.\n"
299
+ f"Query: {params.get('query', 'not specified')}\n"
300
+ f"Time range: {from_desc} to {to_desc}\n"
301
+ f"Please check your query syntax and ensure data exists for this time range."
302
+ )
303
+
264
304
  return StructuredToolResult(
265
- status=ToolResultStatus.NO_DATA,
266
- error="The query returned no data. Please check your query syntax and time range.",
305
+ status=StructuredToolResultStatus.NO_DATA,
306
+ error=error_msg,
267
307
  params=params,
268
308
  )
269
309
 
@@ -317,7 +357,7 @@ class QueryMetrics(BaseDatadogMetricsTool):
317
357
 
318
358
  data_str = json.dumps(response_data, indent=2)
319
359
  return StructuredToolResult(
320
- status=ToolResultStatus.SUCCESS,
360
+ status=StructuredToolResultStatus.SUCCESS,
321
361
  data=data_str,
322
362
  params=params,
323
363
  )
@@ -333,10 +373,28 @@ class QueryMetrics(BaseDatadogMetricsTool):
333
373
  f"and 'timeseries_query' permissions. Error: {str(e)}"
334
374
  )
335
375
  else:
336
- error_msg = f"Exception while querying Datadog: {str(e)}"
376
+ # Include full API error details for better debugging
377
+ error_msg = (
378
+ f"Datadog API error (status {e.status_code}): {e.response_text}"
379
+ )
380
+ if params:
381
+ error_msg += f"\nQuery: {params.get('query', 'not specified')}"
382
+
383
+ from_time_param = params.get("from_time")
384
+ to_time_param = params.get("to_time")
385
+
386
+ if from_time_param:
387
+ from_desc = from_time_param
388
+ else:
389
+ from_desc = (
390
+ f"default (last {DEFAULT_TIME_SPAN_SECONDS // 86400} days)"
391
+ )
392
+
393
+ to_desc = to_time_param or "now"
394
+ error_msg += f"\nTime range: {from_desc} to {to_desc}"
337
395
 
338
396
  return StructuredToolResult(
339
- status=ToolResultStatus.ERROR,
397
+ status=StructuredToolResultStatus.ERROR,
340
398
  error=error_msg,
341
399
  params=params,
342
400
  invocation=json.dumps({"url": url, "params": query_params})
@@ -350,7 +408,7 @@ class QueryMetrics(BaseDatadogMetricsTool):
350
408
  )
351
409
 
352
410
  return StructuredToolResult(
353
- status=ToolResultStatus.ERROR,
411
+ status=StructuredToolResultStatus.ERROR,
354
412
  error=f"Exception while querying Datadog: {str(e)}",
355
413
  params=params,
356
414
  )
@@ -364,7 +422,7 @@ class QueryMetricsMetadata(BaseDatadogMetricsTool):
364
422
  def __init__(self, toolset: "DatadogMetricsToolset"):
365
423
  super().__init__(
366
424
  name="get_datadog_metric_metadata",
367
- description="Get metadata about one or more metrics including their type, description, unit, and other properties",
425
+ description="[datadog/metrics toolset] Get metadata about one or more metrics including their type, description, unit, and other properties",
368
426
  parameters={
369
427
  "metric_names": ToolParameter(
370
428
  description="Comma-separated list of metric names to get metadata for (e.g., 'system.cpu.user, system.mem.used')",
@@ -380,7 +438,7 @@ class QueryMetricsMetadata(BaseDatadogMetricsTool):
380
438
  ) -> StructuredToolResult:
381
439
  if not self.toolset.dd_config:
382
440
  return StructuredToolResult(
383
- status=ToolResultStatus.ERROR,
441
+ status=StructuredToolResultStatus.ERROR,
384
442
  error=TOOLSET_CONFIG_MISSING_ERROR,
385
443
  params=params,
386
444
  )
@@ -396,7 +454,7 @@ class QueryMetricsMetadata(BaseDatadogMetricsTool):
396
454
 
397
455
  if not metric_names:
398
456
  return StructuredToolResult(
399
- status=ToolResultStatus.ERROR,
457
+ status=StructuredToolResultStatus.ERROR,
400
458
  error="metric_names cannot be empty",
401
459
  params=params,
402
460
  )
@@ -442,14 +500,14 @@ class QueryMetricsMetadata(BaseDatadogMetricsTool):
442
500
 
443
501
  if not results and errors:
444
502
  return StructuredToolResult(
445
- status=ToolResultStatus.ERROR,
503
+ status=StructuredToolResultStatus.ERROR,
446
504
  error="Failed to retrieve metadata for all metrics",
447
505
  data=json.dumps(response_data, indent=2),
448
506
  params=params,
449
507
  )
450
508
 
451
509
  return StructuredToolResult(
452
- status=ToolResultStatus.SUCCESS,
510
+ status=StructuredToolResultStatus.SUCCESS,
453
511
  data=json.dumps(response_data, indent=2),
454
512
  params=params,
455
513
  )
@@ -461,7 +519,7 @@ class QueryMetricsMetadata(BaseDatadogMetricsTool):
461
519
  )
462
520
 
463
521
  return StructuredToolResult(
464
- status=ToolResultStatus.ERROR,
522
+ status=StructuredToolResultStatus.ERROR,
465
523
  error=f"Exception while querying Datadog: {str(e)}",
466
524
  params=params,
467
525
  )
@@ -480,7 +538,7 @@ class ListMetricTags(BaseDatadogMetricsTool):
480
538
  def __init__(self, toolset: "DatadogMetricsToolset"):
481
539
  super().__init__(
482
540
  name="list_datadog_metric_tags",
483
- description="List all available tags and aggregations for a specific metric. This helps in building queries by showing what dimensions are available for filtering.",
541
+ description="[datadog/metrics toolset] List all available tags and aggregations for a specific metric. This helps in building queries by showing what dimensions are available for filtering.",
484
542
  parameters={
485
543
  "metric_name": ToolParameter(
486
544
  description="The name of the metric to get tags for (e.g., 'system.cpu.user', 'container.memory.usage')",
@@ -496,7 +554,7 @@ class ListMetricTags(BaseDatadogMetricsTool):
496
554
  ) -> StructuredToolResult:
497
555
  if not self.toolset.dd_config:
498
556
  return StructuredToolResult(
499
- status=ToolResultStatus.ERROR,
557
+ status=StructuredToolResultStatus.ERROR,
500
558
  error=TOOLSET_CONFIG_MISSING_ERROR,
501
559
  params=params,
502
560
  )
@@ -519,7 +577,7 @@ class ListMetricTags(BaseDatadogMetricsTool):
519
577
  )
520
578
 
521
579
  return StructuredToolResult(
522
- status=ToolResultStatus.SUCCESS,
580
+ status=StructuredToolResultStatus.SUCCESS,
523
581
  data=data,
524
582
  params=params,
525
583
  )
@@ -537,10 +595,17 @@ class ListMetricTags(BaseDatadogMetricsTool):
537
595
  f"permissions. Error: {str(e)}"
538
596
  )
539
597
  else:
540
- error_msg = f"Exception while querying Datadog: {str(e)}"
598
+ # Include full API error details for better debugging
599
+ error_msg = (
600
+ f"Datadog API error (status {e.status_code}): {e.response_text}"
601
+ )
602
+ if params:
603
+ error_msg += (
604
+ f"\nMetric name: {params.get('metric_name', 'not specified')}"
605
+ )
541
606
 
542
607
  return StructuredToolResult(
543
- status=ToolResultStatus.ERROR,
608
+ status=StructuredToolResultStatus.ERROR,
544
609
  error=error_msg,
545
610
  params=params,
546
611
  invocation=json.dumps({"url": url, "params": query_params})
@@ -554,7 +619,7 @@ class ListMetricTags(BaseDatadogMetricsTool):
554
619
  exc_info=True,
555
620
  )
556
621
  return StructuredToolResult(
557
- status=ToolResultStatus.ERROR,
622
+ status=StructuredToolResultStatus.ERROR,
558
623
  error=f"Exception while querying Datadog: {str(e)}",
559
624
  params=params,
560
625
  )
@@ -571,7 +636,7 @@ class DatadogMetricsToolset(Toolset):
571
636
  super().__init__(
572
637
  name="datadog/metrics",
573
638
  description="Toolset for fetching metrics and metadata from Datadog, including historical data for pods no longer in the cluster",
574
- docs_url="https://docs.datadoghq.com/api/latest/metrics/",
639
+ docs_url="https://holmesgpt.dev/data-sources/builtin-toolsets/datadog/",
575
640
  icon_url="https://imgix.datadoghq.com//img/about/presskit/DDlogo.jpg",
576
641
  prerequisites=[CallablePrerequisite(callable=self.prerequisites_callable)],
577
642
  tools=[
@@ -580,14 +645,13 @@ class DatadogMetricsToolset(Toolset):
580
645
  QueryMetricsMetadata(toolset=self),
581
646
  ListMetricTags(toolset=self),
582
647
  ],
583
- experimental=True,
584
648
  tags=[ToolsetTag.CORE],
585
649
  )
586
650
  self._reload_instructions()
587
651
 
588
652
  def _perform_healthcheck(self, dd_config: DatadogMetricsConfig) -> Tuple[bool, str]:
589
653
  try:
590
- logging.info("Performing Datadog metrics configuration healthcheck...")
654
+ logging.debug("Performing Datadog metrics configuration healthcheck...")
591
655
 
592
656
  url = f"{dd_config.site_api_url}/api/v1/validate"
593
657
  headers = get_headers(dd_config)
@@ -616,7 +680,7 @@ class DatadogMetricsToolset(Toolset):
616
680
  if not config:
617
681
  return (
618
682
  False,
619
- TOOLSET_CONFIG_MISSING_ERROR,
683
+ "Datadog metrics toolset requires configuration. Please provide: dd_api_key, dd_app_key, and site_api_url in your Holmes config. For more details, see https://holmesgpt.dev/data-sources/builtin-toolsets/datadog/",
620
684
  )
621
685
 
622
686
  try: