holmesgpt 0.13.2__py3-none-any.whl → 0.18.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (188)
  1. holmes/__init__.py +3 -5
  2. holmes/clients/robusta_client.py +20 -6
  3. holmes/common/env_vars.py +58 -3
  4. holmes/common/openshift.py +1 -1
  5. holmes/config.py +123 -148
  6. holmes/core/conversations.py +71 -15
  7. holmes/core/feedback.py +191 -0
  8. holmes/core/investigation.py +31 -39
  9. holmes/core/investigation_structured_output.py +3 -3
  10. holmes/core/issue.py +1 -1
  11. holmes/core/llm.py +508 -88
  12. holmes/core/models.py +108 -4
  13. holmes/core/openai_formatting.py +14 -1
  14. holmes/core/prompt.py +48 -3
  15. holmes/core/runbooks.py +1 -0
  16. holmes/core/safeguards.py +8 -6
  17. holmes/core/supabase_dal.py +295 -100
  18. holmes/core/tool_calling_llm.py +489 -428
  19. holmes/core/tools.py +325 -56
  20. holmes/core/tools_utils/token_counting.py +21 -0
  21. holmes/core/tools_utils/tool_context_window_limiter.py +40 -0
  22. holmes/core/tools_utils/tool_executor.py +0 -13
  23. holmes/core/tools_utils/toolset_utils.py +1 -0
  24. holmes/core/toolset_manager.py +191 -5
  25. holmes/core/tracing.py +19 -3
  26. holmes/core/transformers/__init__.py +23 -0
  27. holmes/core/transformers/base.py +63 -0
  28. holmes/core/transformers/llm_summarize.py +175 -0
  29. holmes/core/transformers/registry.py +123 -0
  30. holmes/core/transformers/transformer.py +32 -0
  31. holmes/core/truncation/compaction.py +94 -0
  32. holmes/core/truncation/dal_truncation_utils.py +23 -0
  33. holmes/core/truncation/input_context_window_limiter.py +219 -0
  34. holmes/interactive.py +228 -31
  35. holmes/main.py +23 -40
  36. holmes/plugins/interfaces.py +2 -1
  37. holmes/plugins/prompts/__init__.py +2 -1
  38. holmes/plugins/prompts/_fetch_logs.jinja2 +31 -6
  39. holmes/plugins/prompts/_general_instructions.jinja2 +1 -2
  40. holmes/plugins/prompts/_runbook_instructions.jinja2 +24 -12
  41. holmes/plugins/prompts/base_user_prompt.jinja2 +7 -0
  42. holmes/plugins/prompts/conversation_history_compaction.jinja2 +89 -0
  43. holmes/plugins/prompts/generic_ask.jinja2 +0 -4
  44. holmes/plugins/prompts/generic_ask_conversation.jinja2 +0 -1
  45. holmes/plugins/prompts/generic_ask_for_issue_conversation.jinja2 +0 -1
  46. holmes/plugins/prompts/generic_investigation.jinja2 +0 -1
  47. holmes/plugins/prompts/investigation_procedure.jinja2 +50 -1
  48. holmes/plugins/prompts/kubernetes_workload_ask.jinja2 +0 -1
  49. holmes/plugins/prompts/kubernetes_workload_chat.jinja2 +0 -1
  50. holmes/plugins/runbooks/__init__.py +145 -17
  51. holmes/plugins/runbooks/catalog.json +2 -0
  52. holmes/plugins/sources/github/__init__.py +4 -2
  53. holmes/plugins/sources/prometheus/models.py +1 -0
  54. holmes/plugins/toolsets/__init__.py +44 -27
  55. holmes/plugins/toolsets/aks-node-health.yaml +46 -0
  56. holmes/plugins/toolsets/aks.yaml +64 -0
  57. holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +38 -47
  58. holmes/plugins/toolsets/azure_sql/apis/alert_monitoring_api.py +3 -2
  59. holmes/plugins/toolsets/azure_sql/apis/azure_sql_api.py +2 -1
  60. holmes/plugins/toolsets/azure_sql/apis/connection_failure_api.py +3 -2
  61. holmes/plugins/toolsets/azure_sql/apis/connection_monitoring_api.py +3 -1
  62. holmes/plugins/toolsets/azure_sql/apis/storage_analysis_api.py +3 -1
  63. holmes/plugins/toolsets/azure_sql/azure_sql_toolset.py +12 -13
  64. holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +15 -12
  65. holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +15 -12
  66. holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +11 -11
  67. holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +11 -9
  68. holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +15 -12
  69. holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +15 -15
  70. holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +11 -8
  71. holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +11 -8
  72. holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +11 -8
  73. holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +11 -8
  74. holmes/plugins/toolsets/azure_sql/utils.py +0 -32
  75. holmes/plugins/toolsets/bash/argocd/__init__.py +3 -3
  76. holmes/plugins/toolsets/bash/aws/__init__.py +4 -4
  77. holmes/plugins/toolsets/bash/azure/__init__.py +4 -4
  78. holmes/plugins/toolsets/bash/bash_toolset.py +11 -15
  79. holmes/plugins/toolsets/bash/common/bash.py +23 -13
  80. holmes/plugins/toolsets/bash/common/bash_command.py +1 -1
  81. holmes/plugins/toolsets/bash/common/stringify.py +1 -1
  82. holmes/plugins/toolsets/bash/kubectl/__init__.py +2 -1
  83. holmes/plugins/toolsets/bash/kubectl/constants.py +0 -1
  84. holmes/plugins/toolsets/bash/kubectl/kubectl_get.py +3 -4
  85. holmes/plugins/toolsets/bash/parse_command.py +12 -13
  86. holmes/plugins/toolsets/cilium.yaml +284 -0
  87. holmes/plugins/toolsets/connectivity_check.py +124 -0
  88. holmes/plugins/toolsets/coralogix/api.py +132 -119
  89. holmes/plugins/toolsets/coralogix/coralogix.jinja2 +14 -0
  90. holmes/plugins/toolsets/coralogix/toolset_coralogix.py +219 -0
  91. holmes/plugins/toolsets/coralogix/utils.py +15 -79
  92. holmes/plugins/toolsets/datadog/datadog_api.py +525 -26
  93. holmes/plugins/toolsets/datadog/datadog_logs_instructions.jinja2 +55 -11
  94. holmes/plugins/toolsets/datadog/datadog_metrics_instructions.jinja2 +3 -3
  95. holmes/plugins/toolsets/datadog/datadog_models.py +59 -0
  96. holmes/plugins/toolsets/datadog/datadog_url_utils.py +213 -0
  97. holmes/plugins/toolsets/datadog/instructions_datadog_traces.jinja2 +165 -28
  98. holmes/plugins/toolsets/datadog/toolset_datadog_general.py +417 -241
  99. holmes/plugins/toolsets/datadog/toolset_datadog_logs.py +234 -214
  100. holmes/plugins/toolsets/datadog/toolset_datadog_metrics.py +167 -79
  101. holmes/plugins/toolsets/datadog/toolset_datadog_traces.py +374 -363
  102. holmes/plugins/toolsets/elasticsearch/__init__.py +6 -0
  103. holmes/plugins/toolsets/elasticsearch/elasticsearch.py +834 -0
  104. holmes/plugins/toolsets/elasticsearch/opensearch_ppl_query_docs.jinja2 +1616 -0
  105. holmes/plugins/toolsets/elasticsearch/opensearch_query_assist.py +78 -0
  106. holmes/plugins/toolsets/elasticsearch/opensearch_query_assist_instructions.jinja2 +223 -0
  107. holmes/plugins/toolsets/git.py +54 -50
  108. holmes/plugins/toolsets/grafana/base_grafana_toolset.py +16 -4
  109. holmes/plugins/toolsets/grafana/common.py +13 -29
  110. holmes/plugins/toolsets/grafana/grafana_tempo_api.py +455 -0
  111. holmes/plugins/toolsets/grafana/loki/instructions.jinja2 +25 -0
  112. holmes/plugins/toolsets/grafana/loki/toolset_grafana_loki.py +191 -0
  113. holmes/plugins/toolsets/grafana/loki_api.py +4 -0
  114. holmes/plugins/toolsets/grafana/toolset_grafana.py +293 -89
  115. holmes/plugins/toolsets/grafana/toolset_grafana_dashboard.jinja2 +49 -0
  116. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.jinja2 +246 -11
  117. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +820 -292
  118. holmes/plugins/toolsets/grafana/trace_parser.py +4 -3
  119. holmes/plugins/toolsets/internet/internet.py +15 -16
  120. holmes/plugins/toolsets/internet/notion.py +9 -11
  121. holmes/plugins/toolsets/investigator/core_investigation.py +44 -36
  122. holmes/plugins/toolsets/investigator/model.py +3 -1
  123. holmes/plugins/toolsets/json_filter_mixin.py +134 -0
  124. holmes/plugins/toolsets/kafka.py +36 -42
  125. holmes/plugins/toolsets/kubernetes.yaml +317 -113
  126. holmes/plugins/toolsets/kubernetes_logs.py +9 -9
  127. holmes/plugins/toolsets/kubernetes_logs.yaml +32 -0
  128. holmes/plugins/toolsets/logging_utils/logging_api.py +94 -8
  129. holmes/plugins/toolsets/mcp/toolset_mcp.py +218 -64
  130. holmes/plugins/toolsets/newrelic/new_relic_api.py +165 -0
  131. holmes/plugins/toolsets/newrelic/newrelic.jinja2 +65 -0
  132. holmes/plugins/toolsets/newrelic/newrelic.py +320 -0
  133. holmes/plugins/toolsets/openshift.yaml +283 -0
  134. holmes/plugins/toolsets/prometheus/prometheus.py +1202 -421
  135. holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2 +54 -5
  136. holmes/plugins/toolsets/prometheus/utils.py +28 -0
  137. holmes/plugins/toolsets/rabbitmq/api.py +23 -4
  138. holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +13 -14
  139. holmes/plugins/toolsets/robusta/robusta.py +239 -68
  140. holmes/plugins/toolsets/robusta/robusta_instructions.jinja2 +26 -9
  141. holmes/plugins/toolsets/runbook/runbook_fetcher.py +157 -27
  142. holmes/plugins/toolsets/service_discovery.py +1 -1
  143. holmes/plugins/toolsets/servicenow_tables/instructions.jinja2 +83 -0
  144. holmes/plugins/toolsets/servicenow_tables/servicenow_tables.py +426 -0
  145. holmes/plugins/toolsets/utils.py +88 -0
  146. holmes/utils/config_utils.py +91 -0
  147. holmes/utils/connection_utils.py +31 -0
  148. holmes/utils/console/result.py +10 -0
  149. holmes/utils/default_toolset_installation_guide.jinja2 +1 -22
  150. holmes/utils/env.py +7 -0
  151. holmes/utils/file_utils.py +2 -1
  152. holmes/utils/global_instructions.py +60 -11
  153. holmes/utils/holmes_status.py +6 -4
  154. holmes/utils/holmes_sync_toolsets.py +0 -2
  155. holmes/utils/krr_utils.py +188 -0
  156. holmes/utils/log.py +15 -0
  157. holmes/utils/markdown_utils.py +2 -3
  158. holmes/utils/memory_limit.py +58 -0
  159. holmes/utils/sentry_helper.py +64 -0
  160. holmes/utils/stream.py +69 -8
  161. holmes/utils/tags.py +4 -3
  162. holmes/version.py +37 -15
  163. holmesgpt-0.18.4.dist-info/LICENSE +178 -0
  164. {holmesgpt-0.13.2.dist-info → holmesgpt-0.18.4.dist-info}/METADATA +35 -31
  165. holmesgpt-0.18.4.dist-info/RECORD +258 -0
  166. holmes/core/performance_timing.py +0 -72
  167. holmes/plugins/toolsets/aws.yaml +0 -80
  168. holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +0 -112
  169. holmes/plugins/toolsets/datadog/datadog_traces_formatter.py +0 -310
  170. holmes/plugins/toolsets/datadog/toolset_datadog_rds.py +0 -739
  171. holmes/plugins/toolsets/grafana/grafana_api.py +0 -42
  172. holmes/plugins/toolsets/grafana/tempo_api.py +0 -124
  173. holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +0 -110
  174. holmes/plugins/toolsets/newrelic.py +0 -231
  175. holmes/plugins/toolsets/opensearch/opensearch.py +0 -257
  176. holmes/plugins/toolsets/opensearch/opensearch_logs.py +0 -161
  177. holmes/plugins/toolsets/opensearch/opensearch_traces.py +0 -218
  178. holmes/plugins/toolsets/opensearch/opensearch_traces_instructions.jinja2 +0 -12
  179. holmes/plugins/toolsets/opensearch/opensearch_utils.py +0 -166
  180. holmes/plugins/toolsets/servicenow/install.md +0 -37
  181. holmes/plugins/toolsets/servicenow/instructions.jinja2 +0 -3
  182. holmes/plugins/toolsets/servicenow/servicenow.py +0 -219
  183. holmes/utils/keygen_utils.py +0 -6
  184. holmesgpt-0.13.2.dist-info/LICENSE.txt +0 -21
  185. holmesgpt-0.13.2.dist-info/RECORD +0 -234
  186. /holmes/plugins/toolsets/{opensearch → newrelic}/__init__.py +0 -0
  187. {holmesgpt-0.13.2.dist-info → holmesgpt-0.18.4.dist-info}/WHEEL +0 -0
  188. {holmesgpt-0.13.2.dist-info → holmesgpt-0.18.4.dist-info}/entry_points.txt +0 -0
holmes/plugins/toolsets/datadog/datadog_api.py

@@ -1,13 +1,17 @@
  import json
  import logging
- from typing import Any, Optional, Dict, Union
+ import re
+ import threading
+ from datetime import datetime, timedelta, timezone
+ from typing import Any, Dict, Optional, Tuple, Union
+ from urllib.parse import urlparse, urlunparse
+
  import requests  # type: ignore
  from pydantic import AnyUrl, BaseModel
  from requests.structures import CaseInsensitiveDict  # type: ignore
  from tenacity import retry, retry_if_exception, stop_after_attempt, wait_incrementing
  from tenacity.wait import wait_base

-
  START_RETRY_DELAY = (
      5.0  # Initial fallback delay if datadog does not return a reset_time
  )
@@ -16,6 +20,78 @@ MAX_RETRY_COUNT_ON_RATE_LIMIT = 5

  RATE_LIMIT_REMAINING_SECONDS_HEADER = "X-RateLimit-Reset"

+ # Cache for OpenAPI spec
+ _openapi_spec_cache: Dict[str, Any] = {}
+
+ # Global lock for Datadog API requests to prevent concurrent calls
+ _datadog_request_lock = threading.Lock()
+
+ # Relative time pattern (m = minutes, mo = months)
+ RELATIVE_TIME_PATTERN = re.compile(r"^-?(\d+)([hdwsy]|min|m|mo)$|^now$", re.IGNORECASE)
+
+
+ def convert_api_url_to_app_url(api_url: Union[str, AnyUrl]) -> str:
+     """
+     Convert a Datadog API URL to the corresponding web app URL.
+
+     Handles various URL formats:
+     - https://api.datadoghq.com -> https://app.datadoghq.com
+     - https://api.datadoghq.eu -> https://app.datadoghq.eu
+     - https://api.us5.datadoghq.com -> https://app.us5.datadoghq.com
+     - Also handles URLs with paths like https://api.datadoghq.com/api/v1
+
+     Args:
+         api_url: The API URL to convert
+
+     Returns:
+         The web app URL without trailing slash
+     """
+     url_str = str(api_url)
+     parsed = urlparse(url_str)
+
+     # Replace 'api.' subdomain with 'app.' in the hostname
+     # This handles cases like:
+     # - api.datadoghq.com -> app.datadoghq.com
+     # - api.datadoghq.eu -> app.datadoghq.eu
+     # - api.us5.datadoghq.com -> app.us5.datadoghq.com
+     if parsed.hostname and parsed.hostname.startswith("api."):
+         new_hostname = "app." + parsed.hostname[4:]
+         # Reconstruct the netloc with the new hostname
+         if parsed.port:
+             new_netloc = f"{new_hostname}:{parsed.port}"
+         else:
+             new_netloc = new_hostname
+     else:
+         # If it doesn't start with 'api.', keep the hostname as is
+         # This handles edge cases where the URL might not follow the pattern
+         new_netloc = parsed.netloc
+
+     # Remove any /api path segments if present
+     # Some configurations might include /api/v1 or similar in the base URL
+     new_path = parsed.path
+     if new_path.startswith("/api/"):
+         new_path = new_path[4:]  # Remove '/api' prefix
+     elif new_path == "/api":
+         new_path = "/"
+
+     # Reconstruct the URL with the app subdomain
+     app_url = urlunparse(
+         (
+             parsed.scheme,
+             new_netloc,
+             new_path,
+             "",  # params
+             "",  # query
+             "",  # fragment
+         )
+     )
+
+     # Remove trailing slash
+     if app_url.endswith("/"):
+         app_url = app_url[:-1]
+
+     return app_url
+

  class DatadogBaseConfig(BaseModel):
      """Base configuration for all Datadog toolsets"""
@@ -166,15 +242,38 @@ def execute_datadog_http_request(
      timeout: int,
      method: str = "POST",
  ) -> Any:
-     # Log the request details
-     logging.info("Datadog API Request:")
-     logging.info(f" Method: {method}")
-     logging.info(f" URL: {url}")
-     logging.info(f" Headers: {json.dumps(sanitize_headers(headers), indent=2)}")
-     logging.info(
-         f" {'Params' if method == 'GET' else 'Payload'}: {json.dumps(payload_or_params, indent=2)}"
+     # from my limited testing doing 1 just request at a time is faster because the RATE_LIMIT_REMAINING_SECONDS_HEADER is shorter
+     # Serialize all Datadog API requests to avoid rate limits
+     with _datadog_request_lock:
+         return execute_datadog_http_request_with_retries(
+             url, headers, payload_or_params, timeout, method
+         )
+
+
+ @retry(
+     retry=retry_if_http_429_error(),
+     wait=wait_for_retry_after_header(
+         fallback=wait_incrementing(
+             start=START_RETRY_DELAY, increment=INCREMENT_RETRY_DELAY
+         )
+     ),
+     stop=stop_after_attempt(MAX_RETRY_COUNT_ON_RATE_LIMIT),
+     before_sleep=lambda retry_state: logging.warning(
+         f"DataDog API rate limited. Retrying... "
+         f"(attempt {retry_state.attempt_number}/{MAX_RETRY_COUNT_ON_RATE_LIMIT})"
+     ),
+     reraise=True,
+ )
+ def execute_datadog_http_request_with_retries(
+     url: str,
+     headers: dict,
+     payload_or_params: dict,
+     timeout: int,
+     method: str,
+ ) -> Any:
+     logging.debug(
+         f"Datadog API Request: Method: {method} URL: {url} Headers: {json.dumps(sanitize_headers(headers), indent=2)} {'Params' if method == 'GET' else 'Payload'}: {json.dumps(payload_or_params, indent=2)} Timeout: {timeout}s"
      )
-     logging.info(f" Timeout: {timeout}s")

      if method == "GET":
          response = requests.get(
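A hedged usage sketch (not from the package) of the serialized request wrapper above: the header names and the v2 logs-search body shape follow Datadog's public API, while the keys and time range are placeholders.

```python
# Illustration only: calling the rate-limit-aware wrapper defined above.
from holmes.plugins.toolsets.datadog.datadog_api import execute_datadog_http_request

headers = {
    "DD-API-KEY": "<api-key>",            # placeholder credentials
    "DD-APPLICATION-KEY": "<app-key>",
    "Content-Type": "application/json",
}
payload = {
    "filter": {"from": "2024-01-01T00:00:00Z", "to": "2024-01-02T00:00:00Z", "query": "*"},
    "page": {"limit": 50},
}
data = execute_datadog_http_request(
    url="https://api.datadoghq.com/api/v2/logs/events/search",
    headers=headers,
    payload_or_params=payload,
    timeout=60,
    method="POST",
)  # one request at a time, with automatic retries on HTTP 429
```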
@@ -186,31 +285,431 @@ def execute_datadog_http_request(
          )

      # Log the response details
-     logging.info("Datadog API Response:")
-     logging.info(f" Status Code: {response.status_code}")
-     logging.info(f" Response Headers: {dict(sanitize_headers(response.headers))}")
+     logging.debug(
+         f"Datadog API Response: Status Code: {response.status_code} Response Headers: {dict(sanitize_headers(response.headers))}"
+     )

      if response.status_code == 200:
          response_data = response.json()
-         # Log response size but not full content (could be large)
-         if isinstance(response_data, dict):
-             logging.info(f" Response Keys: {list(response_data.keys())}")
-             if "data" in response_data:
-                 data_len = (
-                     len(response_data["data"])
-                     if isinstance(response_data["data"], list)
-                     else 1
-                 )
-                 logging.info(f" Data Items Count: {data_len}")
-         else:
-             logging.info(f" Response Type: {type(response_data).__name__}")
          return response_data

      else:
-         logging.error(f" Error Response Body: {response.text}")
+         logging.debug(f"Error Response Body: {response.text}")
          raise DataDogRequestError(
              payload=payload_or_params,
              status_code=response.status_code,
              response_text=response.text,
              response_headers=response.headers,
          )
+
+
+ def fetch_openapi_spec(
+     site_api_url: Optional[str] = None, version: str = "both"
+ ) -> Optional[Dict[str, Any]]:
+     """Fetch and cache the Datadog OpenAPI specification.
+
+     Args:
+         site_api_url: Base URL for Datadog API (not used, kept for compatibility)
+         version: Which version to fetch ('v1', 'v2', or 'both')
+
+     Returns:
+         OpenAPI spec as dictionary (combined if 'both'), or None if fetch fails
+     """
+     global _openapi_spec_cache
+
+     # Use version as cache key
+     cache_key = f"openapi_{version}"
+
+     # Check cache first
+     if cache_key in _openapi_spec_cache:
+         return _openapi_spec_cache[cache_key]
+
+     try:
+         import yaml
+
+         # GitHub raw URLs for Datadog's official OpenAPI specs
+         spec_urls = {
+             "v1": "https://raw.githubusercontent.com/DataDog/datadog-api-client-python/master/.generator/schemas/v1/openapi.yaml",
+             "v2": "https://raw.githubusercontent.com/DataDog/datadog-api-client-python/master/.generator/schemas/v2/openapi.yaml",
+         }
+
+         combined_spec: Dict[str, Any] = {
+             "openapi": "3.0.0",
+             "paths": {},
+             "components": {},
+         }
+
+         versions_to_fetch = []
+         if version == "both":
+             versions_to_fetch = ["v1", "v2"]
+         elif version in spec_urls:
+             versions_to_fetch = [version]
+         else:
+             logging.error(f"Invalid version: {version}")
+             return None
+
+         for ver in versions_to_fetch:
+             try:
+                 logging.debug(f"Fetching Datadog OpenAPI spec for {ver}...")
+                 response = requests.get(spec_urls[ver], timeout=30)
+                 if response.status_code == 200:
+                     # Parse YAML to dict
+                     spec = yaml.safe_load(response.text)
+
+                     if version == "both":
+                         # Merge specs
+                         if "paths" in spec:
+                             # Prefix v1 paths with /api/v1 and v2 with /api/v2
+                             for path, methods in spec.get("paths", {}).items():
+                                 prefixed_path = (
+                                     f"/api/{ver}{path}"
+                                     if not path.startswith("/api/")
+                                     else path
+                                 )
+                                 paths_dict = combined_spec.get("paths", {})
+                                 if isinstance(paths_dict, dict):
+                                     paths_dict[prefixed_path] = methods
+
+                         # Merge components
+                         if "components" in spec:
+                             for comp_type, components in spec.get(
+                                 "components", {}
+                             ).items():
+                                 components_dict = combined_spec.get("components", {})
+                                 if isinstance(components_dict, dict):
+                                     if comp_type not in components_dict:
+                                         components_dict[comp_type] = {}
+                                     components_dict[comp_type].update(components)
+                     else:
+                         combined_spec = spec
+
+                     logging.info(f"Successfully fetched OpenAPI spec for {ver}")
+                 else:
+                     logging.warning(
+                         f"Failed to fetch spec for {ver}: HTTP {response.status_code}"
+                     )
+             except Exception as e:
+                 logging.error(f"Failed to fetch spec for {ver}: {e}")
+                 if version != "both":
+                     return None
+
+         if combined_spec["paths"]:
+             _openapi_spec_cache[cache_key] = combined_spec
+             logging.info(
+                 f"Cached OpenAPI spec with {len(combined_spec['paths'])} endpoints"
+             )
+             return combined_spec
+         else:
+             logging.warning("No endpoints found in OpenAPI spec")
+             return None
+
+     except Exception as e:
+         logging.error(f"Error fetching OpenAPI spec: {e}")
+         return None
+
+
+ def get_endpoint_requirements(
+     spec: Dict[str, Any], endpoint: str, method: str
+ ) -> Optional[Dict[str, Any]]:
+     """Extract parameter requirements for a specific endpoint from OpenAPI spec.
+
+     Args:
+         spec: OpenAPI specification
+         endpoint: API endpoint path
+         method: HTTP method
+
+     Returns:
+         Dictionary with parameter requirements, or None if not found
+     """
+     if not spec or "paths" not in spec:
+         return None
+
+     # Normalize endpoint path
+     endpoint = endpoint.strip("/")
+     if not endpoint.startswith("/"):
+         endpoint = "/" + endpoint
+
+     # Find the endpoint in the spec
+     paths = spec.get("paths", {})
+     if endpoint not in paths:
+         # Try to find a matching pattern (e.g., /api/v2/logs/events/search)
+         for path_pattern in paths.keys():
+             if (
+                 path_pattern == endpoint
+                 or path_pattern.replace("{", "").replace("}", "") in endpoint
+             ):
+                 endpoint = path_pattern
+                 break
+         else:
+             return None
+
+     # Get method requirements
+     endpoint_spec = paths.get(endpoint, {})
+     method_spec = endpoint_spec.get(method.lower(), {})
+
+     if not method_spec:
+         return None
+
+     requirements = {
+         "description": method_spec.get("description", ""),
+         "parameters": [],
+         "requestBody": None,
+     }
+
+     # Extract parameters
+     for param in method_spec.get("parameters", []):
+         param_info = {
+             "name": param.get("name"),
+             "in": param.get("in"),  # query, path, header
+             "required": param.get("required", False),
+             "description": param.get("description", ""),
+             "schema": param.get("schema", {}),
+         }
+         requirements["parameters"].append(param_info)
+
+     # Extract request body schema
+     if "requestBody" in method_spec:
+         body = method_spec["requestBody"]
+         content = body.get("content", {})
+         json_content = content.get("application/json", {})
+         requirements["requestBody"] = {
+             "required": body.get("required", False),
+             "schema": json_content.get("schema", {}),
+             "description": body.get("description", ""),
+         }
+
+     return requirements
+
+
+ def convert_relative_time(time_str: str) -> Tuple[str, str]:
+     """Convert relative time strings to RFC3339 format.
+
+     Args:
+         time_str: Time string (e.g., '-24h', 'now', '-7d', '2024-01-01T00:00:00Z')
+
+     Returns:
+         Tuple of (converted_time, format_type) where format_type is 'relative', 'rfc3339', or 'unix'
+     """
+     # Check if already in RFC3339 format
+     try:
+         # Try parsing as RFC3339
+         if "T" in time_str and (
+             time_str.endswith("Z") or "+" in time_str or "-" in time_str[-6:]
+         ):
+             datetime.fromisoformat(time_str.replace("Z", "+00:00"))
+             return time_str, "rfc3339"
+     except (ValueError, AttributeError):
+         pass
+
+     # Check if Unix timestamp
+     try:
+         timestamp = float(time_str)
+         if 1000000000 < timestamp < 2000000000:  # Reasonable Unix timestamp range
+             return time_str, "unix"
+     except (ValueError, TypeError):
+         pass
+
+     # Check for relative time
+     match = RELATIVE_TIME_PATTERN.match(time_str.strip())
+     if not match:
+         # Return as-is if not recognized
+         return time_str, "unknown"
+
+     now = datetime.now(timezone.utc)
+
+     if time_str.lower() == "now":
+         return now.isoformat().replace("+00:00", "Z"), "relative"
+
+     # Parse relative time
+     groups = match.groups()
+     if groups[0] is None:
+         return time_str, "unknown"
+
+     amount = int(groups[0])
+     unit = groups[1].lower()
+
+     # Convert to timedelta
+     if unit == "s":
+         delta = timedelta(seconds=amount)
+     elif unit == "min":
+         delta = timedelta(minutes=amount)
+     elif unit == "m":
+         delta = timedelta(minutes=amount)  # m = minutes
+     elif unit == "h":
+         delta = timedelta(hours=amount)
+     elif unit == "d":
+         delta = timedelta(days=amount)
+     elif unit == "w":
+         delta = timedelta(weeks=amount)
+     elif unit == "mo":
+         delta = timedelta(days=amount * 30)  # mo = months (approximate)
+     elif unit == "y":
+         delta = timedelta(days=amount * 365)  # Approximate
+     else:
+         return time_str, "unknown"
+
+     # Apply delta (subtract if negative relative time)
+     if time_str.startswith("-"):
+         result_time = now - delta
+     else:
+         result_time = now + delta
+
+     return result_time.isoformat().replace("+00:00", "Z"), "relative"
+
+
+ def preprocess_time_fields(payload: Dict[str, Any], endpoint: str) -> Dict[str, Any]:
+     """Preprocess time fields in payload, converting relative times to appropriate format.
+
+     Args:
+         payload: Request payload
+         endpoint: API endpoint
+
+     Returns:
+         Modified payload with converted time fields
+     """
+     # Deep copy to avoid modifying original
+     import copy
+
+     processed = copy.deepcopy(payload)
+
+     # Common time field paths to check
+     time_fields = [
+         ["filter", "from"],
+         ["filter", "to"],
+         ["from"],
+         ["to"],
+         ["start"],
+         ["end"],
+         ["start_time"],
+         ["end_time"],
+     ]
+
+     def get_nested(d, path):
+         """Get nested dictionary value."""
+         for key in path:
+             if isinstance(d, dict) and key in d:
+                 d = d[key]
+             else:
+                 return None
+         return d
+
+     def set_nested(d, path, value):
+         """Set nested dictionary value."""
+         for key in path[:-1]:
+             if key not in d:
+                 d[key] = {}
+             d = d[key]
+         d[path[-1]] = value
+
+     conversions = []
+
+     for field_path in time_fields:
+         value = get_nested(processed, field_path)
+         if value and isinstance(value, str):
+             converted, format_type = convert_relative_time(value)
+             if format_type == "relative":
+                 set_nested(processed, field_path, converted)
+                 conversions.append(
+                     f"{'.'.join(field_path)}: '{value}' -> '{converted}'"
+                 )
+
+     if conversions:
+         logging.info(f"Converted relative time fields: {', '.join(conversions)}")
+
+     return processed
+
+
+ def enhance_error_message(
+     error: DataDogRequestError, endpoint: str, method: str, site_api_url: str
+ ) -> str:
+     """Enhance error message with OpenAPI spec details and format examples.
+
+     Args:
+         error: Original DataDog request error
+         endpoint: API endpoint
+         method: HTTP method
+         site_api_url: Base API URL
+
+     Returns:
+         Enhanced error message
+     """
+     base_msg = f"HTTP error: {error.status_code} - {error.response_text}"
+
+     # For 400 errors, try to provide more context
+     if error.status_code == 400:
+         enhanced_parts = [base_msg]
+
+         # Try to parse error details
+         try:
+             error_body = json.loads(error.response_text)
+             if "errors" in error_body:
+                 enhanced_parts.append(f"\nErrors: {error_body['errors']}")
+
+             # Check for specific field validation errors
+             for err in error_body.get("errors", []):
+                 if "input_validation_error" in str(err):
+                     enhanced_parts.append("\n⚠️ Input validation error detected.")
+
+                     # Add time format help
+                     if any(
+                         field in str(err).lower()
+                         for field in ["from", "to", "time", "date"]
+                     ):
+                         enhanced_parts.append(
+                             "\nTime format requirements:\n"
+                             " - v1 API: Unix timestamps (e.g., 1704067200)\n"
+                             " - v2 API: RFC3339 format (e.g., '2024-01-01T00:00:00Z')\n"
+                             " - NOT supported: Relative times like '-24h', 'now', '-7d'"
+                         )
+         except (json.JSONDecodeError, TypeError):
+             pass
+
+         # Try to fetch OpenAPI spec for more details
+         spec = fetch_openapi_spec(version="both")
+         if spec:
+             requirements = get_endpoint_requirements(spec, endpoint, method)
+             if requirements:
+                 enhanced_parts.append(f"\nEndpoint: {method} {endpoint}")
+                 if requirements["description"]:
+                     enhanced_parts.append(f"Description: {requirements['description']}")
+
+                 # Add parameter requirements
+                 if requirements["parameters"]:
+                     enhanced_parts.append("\nRequired parameters:")
+                     for param in requirements["parameters"]:
+                         if param["required"]:
+                             enhanced_parts.append(
+                                 f" - {param['name']} ({param['in']}): {param['description']}"
+                             )
+
+                 # Add request body schema hints
+                 if (
+                     requirements["requestBody"]
+                     and requirements["requestBody"]["required"]
+                 ):
+                     enhanced_parts.append("\nRequest body is required")
+                     if requirements["requestBody"]["description"]:
+                         enhanced_parts.append(
+                             f"Body: {requirements['requestBody']['description']}"
+                         )
+
+         # Add example for common endpoints
+         if "/logs/events/search" in endpoint:
+             enhanced_parts.append(
+                 "\nExample request body for logs search:\n"
+                 "```json\n"
+                 "{\n"
+                 ' "filter": {\n'
+                 ' "from": "2024-01-01T00:00:00Z",\n'
+                 ' "to": "2024-01-02T00:00:00Z",\n'
+                 ' "query": "*"\n'
+                 " },\n"
+                 ' "sort": "-timestamp",\n'
+                 ' "page": {"limit": 50}\n'
+                 "}\n"
+                 "```"
+             )
+
+         return "\n".join(enhanced_parts)
+
+     return base_msg
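To make the behaviour of the new time helpers easier to follow, a small illustrative sketch (not part of the diff), assuming the functions above are imported from the same module:

```python
# Illustration only -- exercising the new time helpers defined above.
from holmes.plugins.toolsets.datadog.datadog_api import (
    convert_relative_time,
    preprocess_time_fields,
)

print(convert_relative_time("-24h"))        # (RFC3339 string 24 hours in the past, 'relative')
print(convert_relative_time("now"))         # (current UTC time as RFC3339, 'relative')
print(convert_relative_time("1704067200"))  # ('1704067200', 'unix') -- passed through unchanged

payload = {"filter": {"from": "-1h", "to": "now", "query": "*"}}
print(preprocess_time_fields(payload, "/api/v2/logs/events/search"))
# filter.from and filter.to are rewritten to RFC3339 timestamps; everything else is untouched.
```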
holmes/plugins/toolsets/datadog/datadog_logs_instructions.jinja2

@@ -14,26 +14,36 @@ Before running logs queries:

  ### CRITICAL: Pod Name Resolution Workflow

- **When user provides an exact pod name** (e.g., `my-workload-5f9d8b7c4d-x2km9`):
- - FIRST query Datadog directly with that pod name using appropriate tags
+ **IMPORTANT WILDCARD USAGE:**
+ - **ALWAYS use wildcards** when searching for pods unless you have the COMPLETE pod name with all suffixes
+ - Kubernetes pod names include deployment hash + replica ID (e.g., `nginx-ingress-7b9899-x2km9`, `frontend-5f4d3b2a1-abc123`)
+ - When user says "nginx pod" or "frontend pod", search for `nginx-*` or `frontend-*` NOT just `nginx` or `frontend`
+ - Datadog supports wildcards: `*` matches any characters (e.g., `nginx-*`, `*ingress*`, `*-x2km9`)
+ - For partial matches, use wildcards on both sides: `*keyword*` to find logs from any pod containing "keyword"
+
+ **When user provides what looks like a complete pod name** (e.g., `my-workload-5f9d8b7c4d-x2km9`):
+ - Query Datadog directly with that exact pod name
  - Do NOT try to verify if the pod exists in Kubernetes first
  - This allows querying historical pods that have been deleted/replaced

- **When user provides a generic workload name** (e.g., "my-workload", "nginx", "telemetry-processor"):
- - First use `kubectl_find_resource` to find actual pod names
- - Example: `kubectl_find_resource` with "my-workload" finds pods like "my-workload-8f8cdfxyz-c7zdr"
- - Then use those specific pod names in Datadog queries
- - Alternative: Use deployment-level tags when appropriate
+ **When user provides a simple/generic name** (e.g., "nginx", "redis", "payment-service", "auth"):
+ - **DEFAULT ACTION: Use wildcards** - Query with `pod-name-*` pattern
+ - For historical queries (yesterday, last week): ALWAYS use wildcards directly in Datadog
+ - For current issues: Optionally use `kubectl_find_resource` to find exact pod names, but wildcards often work better
+ - Examples:
+   - User says "nginx pod" → Query Datadog with `nginx-*`
+   - User says "redis instance" → Query Datadog with `redis-*`
+   - User says "payment service" → Query Datadog with `payment-*`

- **Why this matters:**
+ **Why wildcards are critical:**
  - Pod names in Datadog are the actual Kubernetes pod names (with random suffixes)
- - Historical pods that no longer exist in the cluster can still have logs in Datadog
- - Deployment/service names alone are NOT pod names (they need the suffix)
+ - Users typically refer to pods by their deployment/service name without suffixes
+ - Without wildcards, queries for "nginx" will find NOTHING when actual pods are named "nginx-7b9899-x2km9"
+ - Historical pods that no longer exist can only be found via Datadog with proper wildcard usage

  ### Time Parameters
  - Use RFC3339 format: `2023-03-01T10:30:00Z`
  - Or relative seconds: `-3600` for 1 hour ago
- - Defaults to 1 hour window if not specified

  ### Common Investigation Patterns

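A small sketch (not part of the diff) of the kind of wildcard query these instructions call for, assuming Datadog's default Kubernetes tags (`kube_namespace`, `pod_name`) noted in the Search Query Guidelines below; the namespace value is a placeholder.

```python
# Sketch only: building a wildcard pod query as the instructions above describe.
def pod_log_query(namespace: str, workload: str) -> str:
    # "nginx" becomes "nginx-*" so the query also matches pods like nginx-7b9899-x2km9
    return f"kube_namespace:{namespace} pod_name:{workload}-*"

print(pod_log_query("prod", "nginx"))  # kube_namespace:prod pod_name:nginx-*
```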
@@ -41,3 +51,37 @@ Before running logs queries:
  1. User asks: "Show logs for my-workload"
  2. Use `kubectl_find_resource` → find pod "my-workload-abc123-xyz"
  3. Query Datadog for pod "my-workload-abc123-xyz" logs
+
+
+ ### Search Query Guidelines
+
+ 1. Avoid using the @timestamp attribute in search queries (e.g. `@timestamp:[2025-12-10T01:00:00.000Z TO 2025-12-10T04:00:00.000Z]`).
+    Rely on the fetch_datadog_logs function's start_datetime and end_datetime parameters for that.
+ 2. Datadog default TAGS for kubernetes are *kube_namespace* and *pod_name*; if a user specifies custom TAGS used in their environment, please use them in your search queries.
+ 3. If you see a useful TAG in a previous fetch_datadog_logs query, use it for further queries.
+
+ ### CRITICAL: Cursor Usage Rules
+ **NEVER parallelize cursor-based calls or reuse cursor values!**
+
+ Cursors are stateful pointers - each one is single-use and represents a unique position in the data stream.
+
+ **WRONG (causes duplicate data):**
+ ```
+ Batch 1 → cursor_A
+ Then call Batch 2, 3, 4 ALL with cursor_A in parallel ❌
+ Result: Duplicate data, incomplete results
+ ```
+
+ **CORRECT (sequential pagination):**
+ ```
+ Batch 1 → cursor_A
+ Wait for response → use cursor_A for Batch 2 → cursor_B
+ Wait for response → use cursor_B for Batch 3 → cursor_C
+ Result: Complete unique data ✅
+ ```
+
+ **Key Rules:**
+ - Each response provides a NEW cursor for the NEXT request
+ - NEVER reuse the same cursor value multiple times
+ - NEVER make parallel calls with the same cursor
+ - Always wait for response before using the returned cursor
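To close, a minimal sketch (not part of the package) of the sequential cursor pagination the rules above require, assuming a hypothetical `fetch_page(cursor)` helper that returns a batch of events plus the next cursor:

```python
# Sketch only -- fetch_page is a hypothetical stand-in for a cursor-based log fetch.
def fetch_all(fetch_page, max_batches: int = 10) -> list:
    events: list = []
    cursor = None                            # None requests the first page
    for _ in range(max_batches):
        batch, cursor = fetch_page(cursor)   # wait for each response before reusing its cursor
        events.extend(batch)
        if not cursor:                       # no next cursor means no more pages
            break
    return events
```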