holmesgpt 0.13.2__py3-none-any.whl → 0.16.2a0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. holmes/__init__.py +1 -1
  2. holmes/clients/robusta_client.py +17 -4
  3. holmes/common/env_vars.py +40 -1
  4. holmes/config.py +114 -144
  5. holmes/core/conversations.py +53 -14
  6. holmes/core/feedback.py +191 -0
  7. holmes/core/investigation.py +18 -22
  8. holmes/core/llm.py +489 -88
  9. holmes/core/models.py +103 -1
  10. holmes/core/openai_formatting.py +13 -0
  11. holmes/core/prompt.py +1 -1
  12. holmes/core/safeguards.py +4 -4
  13. holmes/core/supabase_dal.py +293 -100
  14. holmes/core/tool_calling_llm.py +423 -323
  15. holmes/core/tools.py +311 -33
  16. holmes/core/tools_utils/token_counting.py +14 -0
  17. holmes/core/tools_utils/tool_context_window_limiter.py +57 -0
  18. holmes/core/tools_utils/tool_executor.py +13 -8
  19. holmes/core/toolset_manager.py +155 -4
  20. holmes/core/tracing.py +6 -1
  21. holmes/core/transformers/__init__.py +23 -0
  22. holmes/core/transformers/base.py +62 -0
  23. holmes/core/transformers/llm_summarize.py +174 -0
  24. holmes/core/transformers/registry.py +122 -0
  25. holmes/core/transformers/transformer.py +31 -0
  26. holmes/core/truncation/compaction.py +59 -0
  27. holmes/core/truncation/dal_truncation_utils.py +23 -0
  28. holmes/core/truncation/input_context_window_limiter.py +218 -0
  29. holmes/interactive.py +177 -24
  30. holmes/main.py +7 -4
  31. holmes/plugins/prompts/_fetch_logs.jinja2 +26 -1
  32. holmes/plugins/prompts/_general_instructions.jinja2 +1 -2
  33. holmes/plugins/prompts/_runbook_instructions.jinja2 +23 -12
  34. holmes/plugins/prompts/conversation_history_compaction.jinja2 +88 -0
  35. holmes/plugins/prompts/generic_ask.jinja2 +2 -4
  36. holmes/plugins/prompts/generic_ask_conversation.jinja2 +2 -1
  37. holmes/plugins/prompts/generic_ask_for_issue_conversation.jinja2 +2 -1
  38. holmes/plugins/prompts/generic_investigation.jinja2 +2 -1
  39. holmes/plugins/prompts/investigation_procedure.jinja2 +48 -0
  40. holmes/plugins/prompts/kubernetes_workload_ask.jinja2 +2 -1
  41. holmes/plugins/prompts/kubernetes_workload_chat.jinja2 +2 -1
  42. holmes/plugins/runbooks/__init__.py +117 -18
  43. holmes/plugins/runbooks/catalog.json +2 -0
  44. holmes/plugins/toolsets/__init__.py +21 -8
  45. holmes/plugins/toolsets/aks-node-health.yaml +46 -0
  46. holmes/plugins/toolsets/aks.yaml +64 -0
  47. holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +26 -36
  48. holmes/plugins/toolsets/azure_sql/azure_sql_toolset.py +0 -1
  49. holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +10 -7
  50. holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +9 -6
  51. holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +8 -6
  52. holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +8 -6
  53. holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +9 -6
  54. holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +9 -7
  55. holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +9 -6
  56. holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +9 -6
  57. holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +9 -6
  58. holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +9 -6
  59. holmes/plugins/toolsets/bash/bash_toolset.py +10 -13
  60. holmes/plugins/toolsets/bash/common/bash.py +7 -7
  61. holmes/plugins/toolsets/cilium.yaml +284 -0
  62. holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +5 -3
  63. holmes/plugins/toolsets/datadog/datadog_api.py +490 -24
  64. holmes/plugins/toolsets/datadog/datadog_logs_instructions.jinja2 +21 -10
  65. holmes/plugins/toolsets/datadog/toolset_datadog_general.py +349 -216
  66. holmes/plugins/toolsets/datadog/toolset_datadog_logs.py +190 -19
  67. holmes/plugins/toolsets/datadog/toolset_datadog_metrics.py +101 -44
  68. holmes/plugins/toolsets/datadog/toolset_datadog_rds.py +13 -16
  69. holmes/plugins/toolsets/datadog/toolset_datadog_traces.py +25 -31
  70. holmes/plugins/toolsets/git.py +51 -46
  71. holmes/plugins/toolsets/grafana/common.py +15 -3
  72. holmes/plugins/toolsets/grafana/grafana_api.py +46 -24
  73. holmes/plugins/toolsets/grafana/grafana_tempo_api.py +454 -0
  74. holmes/plugins/toolsets/grafana/loki/instructions.jinja2 +9 -0
  75. holmes/plugins/toolsets/grafana/loki/toolset_grafana_loki.py +117 -0
  76. holmes/plugins/toolsets/grafana/toolset_grafana.py +211 -91
  77. holmes/plugins/toolsets/grafana/toolset_grafana_dashboard.jinja2 +27 -0
  78. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.jinja2 +246 -11
  79. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +653 -293
  80. holmes/plugins/toolsets/grafana/trace_parser.py +1 -1
  81. holmes/plugins/toolsets/internet/internet.py +6 -7
  82. holmes/plugins/toolsets/internet/notion.py +5 -6
  83. holmes/plugins/toolsets/investigator/core_investigation.py +42 -34
  84. holmes/plugins/toolsets/kafka.py +25 -36
  85. holmes/plugins/toolsets/kubernetes.yaml +58 -84
  86. holmes/plugins/toolsets/kubernetes_logs.py +6 -6
  87. holmes/plugins/toolsets/kubernetes_logs.yaml +32 -0
  88. holmes/plugins/toolsets/logging_utils/logging_api.py +80 -4
  89. holmes/plugins/toolsets/mcp/toolset_mcp.py +181 -55
  90. holmes/plugins/toolsets/newrelic/__init__.py +0 -0
  91. holmes/plugins/toolsets/newrelic/new_relic_api.py +125 -0
  92. holmes/plugins/toolsets/newrelic/newrelic.jinja2 +41 -0
  93. holmes/plugins/toolsets/newrelic/newrelic.py +163 -0
  94. holmes/plugins/toolsets/opensearch/opensearch.py +10 -17
  95. holmes/plugins/toolsets/opensearch/opensearch_logs.py +7 -7
  96. holmes/plugins/toolsets/opensearch/opensearch_ppl_query_docs.jinja2 +1616 -0
  97. holmes/plugins/toolsets/opensearch/opensearch_query_assist.py +78 -0
  98. holmes/plugins/toolsets/opensearch/opensearch_query_assist_instructions.jinja2 +223 -0
  99. holmes/plugins/toolsets/opensearch/opensearch_traces.py +13 -16
  100. holmes/plugins/toolsets/openshift.yaml +283 -0
  101. holmes/plugins/toolsets/prometheus/prometheus.py +915 -390
  102. holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2 +43 -2
  103. holmes/plugins/toolsets/prometheus/utils.py +28 -0
  104. holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +9 -10
  105. holmes/plugins/toolsets/robusta/robusta.py +236 -65
  106. holmes/plugins/toolsets/robusta/robusta_instructions.jinja2 +26 -9
  107. holmes/plugins/toolsets/runbook/runbook_fetcher.py +137 -26
  108. holmes/plugins/toolsets/service_discovery.py +1 -1
  109. holmes/plugins/toolsets/servicenow_tables/instructions.jinja2 +83 -0
  110. holmes/plugins/toolsets/servicenow_tables/servicenow_tables.py +426 -0
  111. holmes/plugins/toolsets/utils.py +88 -0
  112. holmes/utils/config_utils.py +91 -0
  113. holmes/utils/default_toolset_installation_guide.jinja2 +1 -22
  114. holmes/utils/env.py +7 -0
  115. holmes/utils/global_instructions.py +75 -10
  116. holmes/utils/holmes_status.py +2 -1
  117. holmes/utils/holmes_sync_toolsets.py +0 -2
  118. holmes/utils/krr_utils.py +188 -0
  119. holmes/utils/sentry_helper.py +41 -0
  120. holmes/utils/stream.py +61 -7
  121. holmes/version.py +34 -14
  122. holmesgpt-0.16.2a0.dist-info/LICENSE +178 -0
  123. {holmesgpt-0.13.2.dist-info → holmesgpt-0.16.2a0.dist-info}/METADATA +29 -27
  124. {holmesgpt-0.13.2.dist-info → holmesgpt-0.16.2a0.dist-info}/RECORD +126 -102
  125. holmes/core/performance_timing.py +0 -72
  126. holmes/plugins/toolsets/grafana/tempo_api.py +0 -124
  127. holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +0 -110
  128. holmes/plugins/toolsets/newrelic.py +0 -231
  129. holmes/plugins/toolsets/servicenow/install.md +0 -37
  130. holmes/plugins/toolsets/servicenow/instructions.jinja2 +0 -3
  131. holmes/plugins/toolsets/servicenow/servicenow.py +0 -219
  132. holmesgpt-0.13.2.dist-info/LICENSE.txt +0 -21
  133. {holmesgpt-0.13.2.dist-info → holmesgpt-0.16.2a0.dist-info}/WHEEL +0 -0
  134. {holmesgpt-0.13.2.dist-info → holmesgpt-0.16.2a0.dist-info}/entry_points.txt +0 -0
@@ -3,12 +3,13 @@ from enum import Enum
3
3
  import json
4
4
  import logging
5
5
  from typing import Any, Optional, Dict, Tuple, Set
6
+ from urllib.parse import urlencode
6
7
  from holmes.core.tools import (
7
8
  CallablePrerequisite,
8
9
  ToolsetTag,
9
10
  )
10
11
  from pydantic import BaseModel, Field
11
- from holmes.core.tools import StructuredToolResult, ToolResultStatus
12
+ from holmes.core.tools import StructuredToolResult, StructuredToolResultStatus
12
13
  from holmes.plugins.toolsets.consts import TOOLSET_CONFIG_MISSING_ERROR
13
14
  from holmes.plugins.toolsets.datadog.datadog_api import (
14
15
  DatadogBaseConfig,
@@ -16,6 +17,8 @@ from holmes.plugins.toolsets.datadog.datadog_api import (
16
17
  execute_paginated_datadog_http_request,
17
18
  get_headers,
18
19
  MAX_RETRY_COUNT_ON_RATE_LIMIT,
20
+ enhance_error_message,
21
+ preprocess_time_fields,
19
22
  )
20
23
  from holmes.plugins.toolsets.logging_utils.logging_api import (
21
24
  DEFAULT_TIME_SPAN_SECONDS,
@@ -99,23 +102,28 @@ def fetch_paginated_logs(
99
102
  "page": {"limit": calculate_page_size(params, dd_config, [])},
100
103
  }
101
104
 
105
+ # Preprocess time fields to ensure correct format
106
+ processed_payload = preprocess_time_fields(payload, "/api/v2/logs/events/search")
107
+
102
108
  logs, cursor = execute_paginated_datadog_http_request(
103
109
  url=url,
104
110
  headers=headers,
105
- payload_or_params=payload,
111
+ payload_or_params=processed_payload,
106
112
  timeout=dd_config.request_timeout,
107
113
  )
108
114
 
109
115
  while cursor and len(logs) < limit:
110
- payload["page"]["cursor"] = cursor
116
+ processed_payload["page"]["cursor"] = cursor
117
+ processed_payload["page"]["limit"] = calculate_page_size(
118
+ params, dd_config, logs
119
+ )
111
120
  new_logs, cursor = execute_paginated_datadog_http_request(
112
121
  url=url,
113
122
  headers=headers,
114
- payload_or_params=payload,
123
+ payload_or_params=processed_payload,
115
124
  timeout=dd_config.request_timeout,
116
125
  )
117
126
  logs += new_logs
118
- payload["page"]["limit"] = calculate_page_size(params, dd_config, logs)
119
127
 
120
128
  # logs are fetched descending order. Unified logging API follows the pattern of kubectl logs where oldest logs are first
121
129
  logs.reverse()
@@ -129,14 +137,73 @@ def format_logs(raw_logs: list[dict]) -> str:
129
137
  logs = []
130
138
 
131
139
  for raw_log_item in raw_logs:
140
+ # Extract timestamp - Datadog returns it in ISO format
141
+ timestamp = raw_log_item.get("attributes", {}).get("timestamp", "")
142
+ if not timestamp:
143
+ # Fallback to @timestamp if timestamp is not in attributes
144
+ timestamp = raw_log_item.get("attributes", {}).get("@timestamp", "")
145
+
146
+ # Extract message
132
147
  message = raw_log_item.get("attributes", {}).get(
133
148
  "message", json.dumps(raw_log_item)
134
149
  )
135
- logs.append(message)
150
+
151
+ # Format as: [timestamp] message
152
+ if timestamp:
153
+ logs.append(f"[{timestamp}] {message}")
154
+ else:
155
+ logs.append(message)
136
156
 
137
157
  return "\n".join(logs)
138
158
 
139
159
 
160
+ def generate_datadog_logs_url(
161
+ dd_config: DatadogLogsConfig,
162
+ params: FetchPodLogsParams,
163
+ storage_tier: DataDogStorageTier,
164
+ ) -> str:
165
+ """Generate a Datadog web UI URL for the logs query."""
166
+ from holmes.plugins.toolsets.utils import process_timestamps_to_int
167
+ from holmes.plugins.toolsets.datadog.datadog_api import convert_api_url_to_app_url
168
+
169
+ # Convert API URL to app URL using the shared helper
170
+ base_url = convert_api_url_to_app_url(dd_config.site_api_url)
171
+
172
+ # Build the query string
173
+ query = f"{dd_config.labels.namespace}:{params.namespace}"
174
+ query += f" {dd_config.labels.pod}:{params.pod_name}"
175
+ if params.filter:
176
+ filter = params.filter.replace('"', '\\"')
177
+ query += f' "{filter}"'
178
+
179
+ # Process timestamps - get Unix timestamps in seconds
180
+ (from_time_seconds, to_time_seconds) = process_timestamps_to_int(
181
+ start=params.start_time,
182
+ end=params.end_time,
183
+ default_time_span_seconds=DEFAULT_TIME_SPAN_SECONDS,
184
+ )
185
+
186
+ # Convert to milliseconds for Datadog web UI
187
+ from_time_ms = from_time_seconds * 1000
188
+ to_time_ms = to_time_seconds * 1000
189
+
190
+ # Build URL parameters matching Datadog's web UI format
191
+ url_params = {
192
+ "query": query,
193
+ "from_ts": str(from_time_ms),
194
+ "to_ts": str(to_time_ms),
195
+ "live": "true",
196
+ "storage": storage_tier.value,
197
+ }
198
+
199
+ # Add indexes if not default
200
+ if dd_config.indexes != ["*"]:
201
+ url_params["index"] = ",".join(dd_config.indexes)
202
+
203
+ # Construct the full URL
204
+ return f"{base_url}/logs?{urlencode(url_params)}"
205
+
206
+
140
207
  class DatadogLogsToolset(BasePodLoggingToolset):
141
208
  dd_config: Optional[DatadogLogsConfig] = None
142
209
 
@@ -151,11 +218,10 @@ class DatadogLogsToolset(BasePodLoggingToolset):
151
218
  super().__init__(
152
219
  name="datadog/logs",
153
220
  description="Toolset for fetching logs from Datadog, including historical data for pods no longer in the cluster",
154
- docs_url="https://docs.datadoghq.com/api/latest/logs/",
221
+ docs_url="https://holmesgpt.dev/data-sources/builtin-toolsets/datadog/",
155
222
  icon_url="https://imgix.datadoghq.com//img/about/presskit/DDlogo.jpg",
156
223
  prerequisites=[CallablePrerequisite(callable=self.prerequisites_callable)],
157
224
  tools=[], # Initialize with empty tools first
158
- experimental=True,
159
225
  tags=[ToolsetTag.CORE],
160
226
  )
161
227
  # Now that parent is initialized and self.name exists, create the tool
@@ -168,7 +234,7 @@ class DatadogLogsToolset(BasePodLoggingToolset):
168
234
  def fetch_pod_logs(self, params: FetchPodLogsParams) -> StructuredToolResult:
169
235
  if not self.dd_config:
170
236
  return StructuredToolResult(
171
- status=ToolResultStatus.ERROR,
237
+ status=StructuredToolResultStatus.ERROR,
172
238
  data=TOOLSET_CONFIG_MISSING_ERROR,
173
239
  params=params.model_dump(),
174
240
  )
@@ -182,29 +248,134 @@ class DatadogLogsToolset(BasePodLoggingToolset):
182
248
 
183
249
  if raw_logs:
184
250
  logs_str = format_logs(raw_logs)
251
+ # Generate Datadog web UI URL
252
+ datadog_url = generate_datadog_logs_url(
253
+ self.dd_config, params, storage_tier
254
+ )
255
+ logs_with_link = f"{logs_str}\n\nView in Datadog: {datadog_url}"
185
256
  return StructuredToolResult(
186
- status=ToolResultStatus.SUCCESS,
187
- data=logs_str,
257
+ status=StructuredToolResultStatus.SUCCESS,
258
+ data=logs_with_link,
259
+ url=datadog_url,
188
260
  params=params.model_dump(),
189
261
  )
190
262
 
263
+ # Include detailed diagnostic context
264
+ query = f"{self.dd_config.labels.namespace}:{params.namespace} {self.dd_config.labels.pod}:{params.pod_name}"
265
+ if params.filter:
266
+ query += f' "{params.filter}"'
267
+
268
+ # Get actual time range used
269
+ (from_time, to_time) = process_timestamps_to_rfc3339(
270
+ start_timestamp=params.start_time,
271
+ end_timestamp=params.end_time,
272
+ default_time_span_seconds=DEFAULT_TIME_SPAN_SECONDS,
273
+ )
274
+
275
+ # Generate Datadog web UI URL for the last storage tier checked
276
+ datadog_url = generate_datadog_logs_url(
277
+ self.dd_config, params, self.dd_config.storage_tiers[-1]
278
+ )
279
+
280
+ # Build diagnostic information
281
+ diagnostics: Dict[str, Any] = {
282
+ "query_executed": query,
283
+ "time_range": f"{from_time} to {to_time}",
284
+ "indexes_searched": self.dd_config.indexes,
285
+ "storage_tiers_checked": [
286
+ tier.value for tier in self.dd_config.storage_tiers
287
+ ],
288
+ "field_mappings": {
289
+ "namespace_field": self.dd_config.labels.namespace,
290
+ "pod_field": self.dd_config.labels.pod,
291
+ },
292
+ "limit": params.limit or self.dd_config.default_limit,
293
+ "datadog_url": datadog_url,
294
+ }
295
+
296
+ # Format diagnostic info as structured text
297
+ error_msg = (
298
+ f"No logs found.\n\n"
299
+ f"Diagnostic Information:\n"
300
+ f"----------------------\n"
301
+ f"Query executed: {diagnostics['query_executed']}\n"
302
+ f"Time range: {diagnostics['time_range']}\n"
303
+ f"Indexes searched: {diagnostics['indexes_searched']}\n"
304
+ f"Storage tiers checked: {', '.join(str(tier) for tier in diagnostics.get('storage_tiers_checked', []))}\n"
305
+ f"Field mappings:\n"
306
+ f" - Namespace field: {diagnostics.get('field_mappings', {}).get('namespace_field', 'N/A')}\n"
307
+ f" - Pod field: {diagnostics.get('field_mappings', {}).get('pod_field', 'N/A')}\n"
308
+ f"Limit: {diagnostics['limit']}\n\n"
309
+ f"View in Datadog: {diagnostics['datadog_url']}"
310
+ )
311
+
191
312
  return StructuredToolResult(
192
- status=ToolResultStatus.NO_DATA,
313
+ status=StructuredToolResultStatus.NO_DATA,
314
+ error=error_msg,
315
+ url=datadog_url,
193
316
  params=params.model_dump(),
194
317
  )
195
318
 
196
319
  except DataDogRequestError as e:
197
320
  logging.exception(e, exc_info=True)
198
321
 
322
+ # Always try to generate Datadog URL for debugging
323
+ try:
324
+ datadog_url = generate_datadog_logs_url(
325
+ self.dd_config, params, self.dd_config.storage_tiers[0]
326
+ )
327
+ except Exception:
328
+ datadog_url = None
329
+
199
330
  # Provide more specific error message for rate limiting failures
200
331
  if e.status_code == 429:
201
332
  error_msg = f"Datadog API rate limit exceeded. Failed after {MAX_RETRY_COUNT_ON_RATE_LIMIT} retry attempts."
333
+ if datadog_url:
334
+ error_msg += f"\nView in Datadog: {datadog_url}"
335
+ elif e.status_code == 400:
336
+ # Use enhanced error message for validation errors
337
+ error_msg = enhance_error_message(
338
+ e,
339
+ "/api/v2/logs/events/search",
340
+ "POST",
341
+ str(self.dd_config.site_api_url),
342
+ )
343
+
344
+ # Add query context
345
+ query = f"{self.dd_config.labels.namespace}:{params.namespace} {self.dd_config.labels.pod}:{params.pod_name}"
346
+ if params.filter:
347
+ query += f' "{params.filter}"'
348
+ error_msg += f"\n\nQuery attempted: {query}"
349
+
350
+ # Add Datadog web UI URL to error message
351
+ if datadog_url:
352
+ error_msg += f"\nView in Datadog: {datadog_url}"
202
353
  else:
203
- error_msg = f"Exception while querying Datadog: {str(e)}"
354
+ # Include full API error details and query context
355
+ error_msg = (
356
+ f"Datadog API error (status {e.status_code}): {e.response_text}"
357
+ )
358
+ query = f"{self.dd_config.labels.namespace}:{params.namespace} {self.dd_config.labels.pod}:{params.pod_name}"
359
+ if params.filter:
360
+ query += f' "{params.filter}"'
361
+ error_msg += f"\nQuery: {query}"
362
+
363
+ # Get actual time range used
364
+ (from_time, to_time) = process_timestamps_to_rfc3339(
365
+ start_timestamp=params.start_time,
366
+ end_timestamp=params.end_time,
367
+ default_time_span_seconds=DEFAULT_TIME_SPAN_SECONDS,
368
+ )
369
+ error_msg += f"\nTime range: {from_time} to {to_time}"
370
+
371
+ # Add Datadog web UI URL to error message
372
+ if datadog_url:
373
+ error_msg += f"\nView in Datadog: {datadog_url}"
204
374
 
205
375
  return StructuredToolResult(
206
- status=ToolResultStatus.ERROR,
376
+ status=StructuredToolResultStatus.ERROR,
207
377
  error=error_msg,
378
+ url=datadog_url,
208
379
  params=params.model_dump(),
209
380
  invocation=json.dumps(e.payload),
210
381
  )
@@ -214,7 +385,7 @@ class DatadogLogsToolset(BasePodLoggingToolset):
214
385
  f"Failed to query Datadog logs for params: {params}", exc_info=True
215
386
  )
216
387
  return StructuredToolResult(
217
- status=ToolResultStatus.ERROR,
388
+ status=StructuredToolResultStatus.ERROR,
218
389
  error=f"Exception while querying Datadog: {str(e)}",
219
390
  params=params.model_dump(),
220
391
  )
@@ -225,7 +396,7 @@ class DatadogLogsToolset(BasePodLoggingToolset):
225
396
  Returns (success, error_message).
226
397
  """
227
398
  try:
228
- logging.info("Performing Datadog configuration healthcheck...")
399
+ logging.debug("Performing Datadog configuration healthcheck...")
229
400
  healthcheck_params = FetchPodLogsParams(
230
401
  namespace="*",
231
402
  pod_name="*",
@@ -235,11 +406,11 @@ class DatadogLogsToolset(BasePodLoggingToolset):
235
406
 
236
407
  result = self.fetch_pod_logs(healthcheck_params)
237
408
 
238
- if result.status == ToolResultStatus.ERROR:
409
+ if result.status == StructuredToolResultStatus.ERROR:
239
410
  error_msg = result.error or "Unknown error during healthcheck"
240
411
  logging.error(f"Datadog healthcheck failed: {error_msg}")
241
412
  return False, f"Datadog healthcheck failed: {error_msg}"
242
- elif result.status == ToolResultStatus.NO_DATA:
413
+ elif result.status == StructuredToolResultStatus.NO_DATA:
243
414
  error_msg = "No logs were found in the last 48 hours using wildcards for pod and namespace. Is the configuration correct?"
244
415
  logging.error(f"Datadog healthcheck failed: {error_msg}")
245
416
  return False, f"Datadog healthcheck failed: {error_msg}"
@@ -255,7 +426,7 @@ class DatadogLogsToolset(BasePodLoggingToolset):
255
426
  if not config:
256
427
  return (
257
428
  False,
258
- TOOLSET_CONFIG_MISSING_ERROR,
429
+ "Missing config for dd_api_key, dd_app_key, or site_api_url. For details: https://holmesgpt.dev/data-sources/builtin-toolsets/datadog/",
259
430
  )
260
431
 
261
432
  try:
@@ -6,8 +6,9 @@ from holmes.core.tools import (
6
6
  CallablePrerequisite,
7
7
  StructuredToolResult,
8
8
  Tool,
9
+ ToolInvokeContext,
9
10
  ToolParameter,
10
- ToolResultStatus,
11
+ StructuredToolResultStatus,
11
12
  Toolset,
12
13
  ToolsetTag,
13
14
  )
@@ -54,7 +55,7 @@ class ListActiveMetrics(BaseDatadogMetricsTool):
54
55
  def __init__(self, toolset: "DatadogMetricsToolset"):
55
56
  super().__init__(
56
57
  name="list_active_datadog_metrics",
57
- description=f"List active metrics from Datadog for the last {ACTIVE_METRICS_DEFAULT_LOOK_BACK_HOURS} hours. This includes metrics that have actively reported data points, including from pods no longer in the cluster.",
58
+ description=f"[datadog/metrics toolset] List active metrics from Datadog for the last {ACTIVE_METRICS_DEFAULT_LOOK_BACK_HOURS} hours. This includes metrics that have actively reported data points, including from pods no longer in the cluster.",
58
59
  parameters={
59
60
  "from_time": ToolParameter(
60
61
  description=f"Start time for listing metrics. Can be an RFC3339 formatted datetime (e.g. '2023-03-01T10:30:00Z') or a negative integer for relative seconds from now (e.g. -86400 for 24 hours ago). Defaults to {ACTIVE_METRICS_DEFAULT_LOOK_BACK_HOURS} hours ago",
@@ -75,12 +76,10 @@ class ListActiveMetrics(BaseDatadogMetricsTool):
75
76
  toolset=toolset,
76
77
  )
77
78
 
78
- def _invoke(
79
- self, params: dict, user_approved: bool = False
80
- ) -> StructuredToolResult:
79
+ def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
81
80
  if not self.toolset.dd_config:
82
81
  return StructuredToolResult(
83
- status=ToolResultStatus.ERROR,
82
+ status=StructuredToolResultStatus.ERROR,
84
83
  error=TOOLSET_CONFIG_MISSING_ERROR,
85
84
  params=params,
86
85
  )
@@ -121,7 +120,7 @@ class ListActiveMetrics(BaseDatadogMetricsTool):
121
120
  metrics = data.get("metrics", [])
122
121
  if not metrics:
123
122
  return StructuredToolResult(
124
- status=ToolResultStatus.ERROR,
123
+ status=StructuredToolResultStatus.ERROR,
125
124
  data="Your filter returned no metrics. Change your filter and try again",
126
125
  params=params,
127
126
  )
@@ -133,7 +132,7 @@ class ListActiveMetrics(BaseDatadogMetricsTool):
133
132
  output.append(metric)
134
133
 
135
134
  return StructuredToolResult(
136
- status=ToolResultStatus.SUCCESS,
135
+ status=StructuredToolResultStatus.SUCCESS,
137
136
  data="\n".join(output),
138
137
  params=params,
139
138
  )
@@ -149,10 +148,30 @@ class ListActiveMetrics(BaseDatadogMetricsTool):
149
148
  f"and 'timeseries_query' permissions. Error: {str(e)}"
150
149
  )
151
150
  else:
152
- error_msg = f"Exception while querying Datadog: {str(e)}"
151
+ # Include full API error details for better debugging
152
+ error_msg = (
153
+ f"Datadog API error (status {e.status_code}): {e.response_text}"
154
+ )
155
+ if params:
156
+ # ListActiveMetrics parameters: from_time, host, tag_filter
157
+ if params.get("host"):
158
+ error_msg += f"\nHost filter: {params.get('host')}"
159
+ if params.get("tag_filter"):
160
+ error_msg += f"\nTag filter: {params.get('tag_filter')}"
161
+
162
+ from_time_param = params.get("from_time")
163
+ if from_time_param:
164
+ time_desc = from_time_param
165
+ else:
166
+ time_desc = f"default (last {ACTIVE_METRICS_DEFAULT_LOOK_BACK_HOURS} hours)"
167
+ error_msg += f"\nTime range: {time_desc}"
168
+
169
+ # Note: We cannot generate a Datadog Metrics Explorer URL for ListActiveMetrics
170
+ # because the Metrics Explorer requires a specific metric query,
171
+ # while ListActiveMetrics just lists available metrics without querying any specific one
153
172
 
154
173
  return StructuredToolResult(
155
- status=ToolResultStatus.ERROR,
174
+ status=StructuredToolResultStatus.ERROR,
156
175
  error=error_msg,
157
176
  params=params,
158
177
  invocation=json.dumps({"url": url, "params": query_params})
@@ -165,7 +184,7 @@ class ListActiveMetrics(BaseDatadogMetricsTool):
165
184
  f"Failed to query Datadog metrics for params: {params}", exc_info=True
166
185
  )
167
186
  return StructuredToolResult(
168
- status=ToolResultStatus.ERROR,
187
+ status=StructuredToolResultStatus.ERROR,
169
188
  error=f"Exception while querying Datadog: {str(e)}",
170
189
  params=params,
171
190
  )
@@ -184,7 +203,7 @@ class QueryMetrics(BaseDatadogMetricsTool):
184
203
  def __init__(self, toolset: "DatadogMetricsToolset"):
185
204
  super().__init__(
186
205
  name="query_datadog_metrics",
187
- description="Query timeseries data from Datadog for a specific metric, including historical data for pods no longer in the cluster",
206
+ description="[datadog/metrics toolset] Query timeseries data from Datadog for a specific metric, including historical data for pods no longer in the cluster",
188
207
  parameters={
189
208
  "query": ToolParameter(
190
209
  description="The metric query string (e.g., 'system.cpu.user{host:myhost}')",
@@ -217,12 +236,10 @@ class QueryMetrics(BaseDatadogMetricsTool):
217
236
  toolset=toolset,
218
237
  )
219
238
 
220
- def _invoke(
221
- self, params: dict, user_approved: bool = False
222
- ) -> StructuredToolResult:
239
+ def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
223
240
  if not self.toolset.dd_config:
224
241
  return StructuredToolResult(
225
- status=ToolResultStatus.ERROR,
242
+ status=StructuredToolResultStatus.ERROR,
226
243
  error=TOOLSET_CONFIG_MISSING_ERROR,
227
244
  params=params,
228
245
  )
@@ -261,9 +278,29 @@ class QueryMetrics(BaseDatadogMetricsTool):
261
278
  output_type = params.get("output_type", "Plain")
262
279
 
263
280
  if not series:
281
+ # Include detailed context in error message
282
+ from_time_param = params.get("from_time")
283
+ to_time_param = params.get("to_time")
284
+
285
+ if from_time_param:
286
+ from_desc = from_time_param
287
+ else:
288
+ from_desc = (
289
+ f"default (last {DEFAULT_TIME_SPAN_SECONDS // 86400} days)"
290
+ )
291
+
292
+ to_desc = to_time_param or "now"
293
+
294
+ error_msg = (
295
+ f"The query returned no data.\n"
296
+ f"Query: {params.get('query', 'not specified')}\n"
297
+ f"Time range: {from_desc} to {to_desc}\n"
298
+ f"Please check your query syntax and ensure data exists for this time range."
299
+ )
300
+
264
301
  return StructuredToolResult(
265
- status=ToolResultStatus.NO_DATA,
266
- error="The query returned no data. Please check your query syntax and time range.",
302
+ status=StructuredToolResultStatus.NO_DATA,
303
+ error=error_msg,
267
304
  params=params,
268
305
  )
269
306
 
@@ -317,7 +354,7 @@ class QueryMetrics(BaseDatadogMetricsTool):
317
354
 
318
355
  data_str = json.dumps(response_data, indent=2)
319
356
  return StructuredToolResult(
320
- status=ToolResultStatus.SUCCESS,
357
+ status=StructuredToolResultStatus.SUCCESS,
321
358
  data=data_str,
322
359
  params=params,
323
360
  )
@@ -333,10 +370,28 @@ class QueryMetrics(BaseDatadogMetricsTool):
333
370
  f"and 'timeseries_query' permissions. Error: {str(e)}"
334
371
  )
335
372
  else:
336
- error_msg = f"Exception while querying Datadog: {str(e)}"
373
+ # Include full API error details for better debugging
374
+ error_msg = (
375
+ f"Datadog API error (status {e.status_code}): {e.response_text}"
376
+ )
377
+ if params:
378
+ error_msg += f"\nQuery: {params.get('query', 'not specified')}"
379
+
380
+ from_time_param = params.get("from_time")
381
+ to_time_param = params.get("to_time")
382
+
383
+ if from_time_param:
384
+ from_desc = from_time_param
385
+ else:
386
+ from_desc = (
387
+ f"default (last {DEFAULT_TIME_SPAN_SECONDS // 86400} days)"
388
+ )
389
+
390
+ to_desc = to_time_param or "now"
391
+ error_msg += f"\nTime range: {from_desc} to {to_desc}"
337
392
 
338
393
  return StructuredToolResult(
339
- status=ToolResultStatus.ERROR,
394
+ status=StructuredToolResultStatus.ERROR,
340
395
  error=error_msg,
341
396
  params=params,
342
397
  invocation=json.dumps({"url": url, "params": query_params})
@@ -350,7 +405,7 @@ class QueryMetrics(BaseDatadogMetricsTool):
350
405
  )
351
406
 
352
407
  return StructuredToolResult(
353
- status=ToolResultStatus.ERROR,
408
+ status=StructuredToolResultStatus.ERROR,
354
409
  error=f"Exception while querying Datadog: {str(e)}",
355
410
  params=params,
356
411
  )
@@ -364,7 +419,7 @@ class QueryMetricsMetadata(BaseDatadogMetricsTool):
364
419
  def __init__(self, toolset: "DatadogMetricsToolset"):
365
420
  super().__init__(
366
421
  name="get_datadog_metric_metadata",
367
- description="Get metadata about one or more metrics including their type, description, unit, and other properties",
422
+ description="[datadog/metrics toolset] Get metadata about one or more metrics including their type, description, unit, and other properties",
368
423
  parameters={
369
424
  "metric_names": ToolParameter(
370
425
  description="Comma-separated list of metric names to get metadata for (e.g., 'system.cpu.user, system.mem.used')",
@@ -375,12 +430,10 @@ class QueryMetricsMetadata(BaseDatadogMetricsTool):
375
430
  toolset=toolset,
376
431
  )
377
432
 
378
- def _invoke(
379
- self, params: dict, user_approved: bool = False
380
- ) -> StructuredToolResult:
433
+ def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
381
434
  if not self.toolset.dd_config:
382
435
  return StructuredToolResult(
383
- status=ToolResultStatus.ERROR,
436
+ status=StructuredToolResultStatus.ERROR,
384
437
  error=TOOLSET_CONFIG_MISSING_ERROR,
385
438
  params=params,
386
439
  )
@@ -396,7 +449,7 @@ class QueryMetricsMetadata(BaseDatadogMetricsTool):
396
449
 
397
450
  if not metric_names:
398
451
  return StructuredToolResult(
399
- status=ToolResultStatus.ERROR,
452
+ status=StructuredToolResultStatus.ERROR,
400
453
  error="metric_names cannot be empty",
401
454
  params=params,
402
455
  )
@@ -442,14 +495,14 @@ class QueryMetricsMetadata(BaseDatadogMetricsTool):
442
495
 
443
496
  if not results and errors:
444
497
  return StructuredToolResult(
445
- status=ToolResultStatus.ERROR,
498
+ status=StructuredToolResultStatus.ERROR,
446
499
  error="Failed to retrieve metadata for all metrics",
447
500
  data=json.dumps(response_data, indent=2),
448
501
  params=params,
449
502
  )
450
503
 
451
504
  return StructuredToolResult(
452
- status=ToolResultStatus.SUCCESS,
505
+ status=StructuredToolResultStatus.SUCCESS,
453
506
  data=json.dumps(response_data, indent=2),
454
507
  params=params,
455
508
  )
@@ -461,7 +514,7 @@ class QueryMetricsMetadata(BaseDatadogMetricsTool):
461
514
  )
462
515
 
463
516
  return StructuredToolResult(
464
- status=ToolResultStatus.ERROR,
517
+ status=StructuredToolResultStatus.ERROR,
465
518
  error=f"Exception while querying Datadog: {str(e)}",
466
519
  params=params,
467
520
  )
@@ -480,7 +533,7 @@ class ListMetricTags(BaseDatadogMetricsTool):
480
533
  def __init__(self, toolset: "DatadogMetricsToolset"):
481
534
  super().__init__(
482
535
  name="list_datadog_metric_tags",
483
- description="List all available tags and aggregations for a specific metric. This helps in building queries by showing what dimensions are available for filtering.",
536
+ description="[datadog/metrics toolset] List all available tags and aggregations for a specific metric. This helps in building queries by showing what dimensions are available for filtering.",
484
537
  parameters={
485
538
  "metric_name": ToolParameter(
486
539
  description="The name of the metric to get tags for (e.g., 'system.cpu.user', 'container.memory.usage')",
@@ -491,12 +544,10 @@ class ListMetricTags(BaseDatadogMetricsTool):
491
544
  toolset=toolset,
492
545
  )
493
546
 
494
- def _invoke(
495
- self, params: dict, user_approved: bool = False
496
- ) -> StructuredToolResult:
547
+ def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
497
548
  if not self.toolset.dd_config:
498
549
  return StructuredToolResult(
499
- status=ToolResultStatus.ERROR,
550
+ status=StructuredToolResultStatus.ERROR,
500
551
  error=TOOLSET_CONFIG_MISSING_ERROR,
501
552
  params=params,
502
553
  )
@@ -519,7 +570,7 @@ class ListMetricTags(BaseDatadogMetricsTool):
519
570
  )
520
571
 
521
572
  return StructuredToolResult(
522
- status=ToolResultStatus.SUCCESS,
573
+ status=StructuredToolResultStatus.SUCCESS,
523
574
  data=data,
524
575
  params=params,
525
576
  )
@@ -537,10 +588,17 @@ class ListMetricTags(BaseDatadogMetricsTool):
537
588
  f"permissions. Error: {str(e)}"
538
589
  )
539
590
  else:
540
- error_msg = f"Exception while querying Datadog: {str(e)}"
591
+ # Include full API error details for better debugging
592
+ error_msg = (
593
+ f"Datadog API error (status {e.status_code}): {e.response_text}"
594
+ )
595
+ if params:
596
+ error_msg += (
597
+ f"\nMetric name: {params.get('metric_name', 'not specified')}"
598
+ )
541
599
 
542
600
  return StructuredToolResult(
543
- status=ToolResultStatus.ERROR,
601
+ status=StructuredToolResultStatus.ERROR,
544
602
  error=error_msg,
545
603
  params=params,
546
604
  invocation=json.dumps({"url": url, "params": query_params})
@@ -554,7 +612,7 @@ class ListMetricTags(BaseDatadogMetricsTool):
554
612
  exc_info=True,
555
613
  )
556
614
  return StructuredToolResult(
557
- status=ToolResultStatus.ERROR,
615
+ status=StructuredToolResultStatus.ERROR,
558
616
  error=f"Exception while querying Datadog: {str(e)}",
559
617
  params=params,
560
618
  )
@@ -571,7 +629,7 @@ class DatadogMetricsToolset(Toolset):
571
629
  super().__init__(
572
630
  name="datadog/metrics",
573
631
  description="Toolset for fetching metrics and metadata from Datadog, including historical data for pods no longer in the cluster",
574
- docs_url="https://docs.datadoghq.com/api/latest/metrics/",
632
+ docs_url="https://holmesgpt.dev/data-sources/builtin-toolsets/datadog/",
575
633
  icon_url="https://imgix.datadoghq.com//img/about/presskit/DDlogo.jpg",
576
634
  prerequisites=[CallablePrerequisite(callable=self.prerequisites_callable)],
577
635
  tools=[
@@ -580,14 +638,13 @@ class DatadogMetricsToolset(Toolset):
580
638
  QueryMetricsMetadata(toolset=self),
581
639
  ListMetricTags(toolset=self),
582
640
  ],
583
- experimental=True,
584
641
  tags=[ToolsetTag.CORE],
585
642
  )
586
643
  self._reload_instructions()
587
644
 
588
645
  def _perform_healthcheck(self, dd_config: DatadogMetricsConfig) -> Tuple[bool, str]:
589
646
  try:
590
- logging.info("Performing Datadog metrics configuration healthcheck...")
647
+ logging.debug("Performing Datadog metrics configuration healthcheck...")
591
648
 
592
649
  url = f"{dd_config.site_api_url}/api/v1/validate"
593
650
  headers = get_headers(dd_config)
@@ -616,7 +673,7 @@ class DatadogMetricsToolset(Toolset):
616
673
  if not config:
617
674
  return (
618
675
  False,
619
- TOOLSET_CONFIG_MISSING_ERROR,
676
+ "Missing config for dd_api_key, dd_app_key, or site_api_url. For details: https://holmesgpt.dev/data-sources/builtin-toolsets/datadog/",
620
677
  )
621
678
 
622
679
  try: