holmesgpt 0.16.2a0__py3-none-any.whl → 0.18.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162)
  1. holmes/__init__.py +3 -5
  2. holmes/clients/robusta_client.py +4 -3
  3. holmes/common/env_vars.py +18 -2
  4. holmes/common/openshift.py +1 -1
  5. holmes/config.py +11 -6
  6. holmes/core/conversations.py +30 -13
  7. holmes/core/investigation.py +21 -25
  8. holmes/core/investigation_structured_output.py +3 -3
  9. holmes/core/issue.py +1 -1
  10. holmes/core/llm.py +50 -31
  11. holmes/core/models.py +19 -17
  12. holmes/core/openai_formatting.py +1 -1
  13. holmes/core/prompt.py +47 -2
  14. holmes/core/runbooks.py +1 -0
  15. holmes/core/safeguards.py +4 -2
  16. holmes/core/supabase_dal.py +4 -2
  17. holmes/core/tool_calling_llm.py +102 -141
  18. holmes/core/tools.py +19 -28
  19. holmes/core/tools_utils/token_counting.py +9 -2
  20. holmes/core/tools_utils/tool_context_window_limiter.py +13 -30
  21. holmes/core/tools_utils/tool_executor.py +0 -18
  22. holmes/core/tools_utils/toolset_utils.py +1 -0
  23. holmes/core/toolset_manager.py +37 -2
  24. holmes/core/tracing.py +13 -2
  25. holmes/core/transformers/__init__.py +1 -1
  26. holmes/core/transformers/base.py +1 -0
  27. holmes/core/transformers/llm_summarize.py +3 -2
  28. holmes/core/transformers/registry.py +2 -1
  29. holmes/core/transformers/transformer.py +1 -0
  30. holmes/core/truncation/compaction.py +37 -2
  31. holmes/core/truncation/input_context_window_limiter.py +3 -2
  32. holmes/interactive.py +52 -8
  33. holmes/main.py +17 -37
  34. holmes/plugins/interfaces.py +2 -1
  35. holmes/plugins/prompts/__init__.py +2 -1
  36. holmes/plugins/prompts/_fetch_logs.jinja2 +5 -5
  37. holmes/plugins/prompts/_runbook_instructions.jinja2 +2 -1
  38. holmes/plugins/prompts/base_user_prompt.jinja2 +7 -0
  39. holmes/plugins/prompts/conversation_history_compaction.jinja2 +2 -1
  40. holmes/plugins/prompts/generic_ask.jinja2 +0 -2
  41. holmes/plugins/prompts/generic_ask_conversation.jinja2 +0 -2
  42. holmes/plugins/prompts/generic_ask_for_issue_conversation.jinja2 +0 -2
  43. holmes/plugins/prompts/generic_investigation.jinja2 +0 -2
  44. holmes/plugins/prompts/investigation_procedure.jinja2 +2 -1
  45. holmes/plugins/prompts/kubernetes_workload_ask.jinja2 +0 -2
  46. holmes/plugins/prompts/kubernetes_workload_chat.jinja2 +0 -2
  47. holmes/plugins/runbooks/__init__.py +32 -3
  48. holmes/plugins/sources/github/__init__.py +4 -2
  49. holmes/plugins/sources/prometheus/models.py +1 -0
  50. holmes/plugins/toolsets/__init__.py +30 -26
  51. holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +13 -12
  52. holmes/plugins/toolsets/azure_sql/apis/alert_monitoring_api.py +3 -2
  53. holmes/plugins/toolsets/azure_sql/apis/azure_sql_api.py +2 -1
  54. holmes/plugins/toolsets/azure_sql/apis/connection_failure_api.py +3 -2
  55. holmes/plugins/toolsets/azure_sql/apis/connection_monitoring_api.py +3 -1
  56. holmes/plugins/toolsets/azure_sql/apis/storage_analysis_api.py +3 -1
  57. holmes/plugins/toolsets/azure_sql/azure_sql_toolset.py +12 -12
  58. holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +7 -7
  59. holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +7 -7
  60. holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +3 -5
  61. holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +3 -3
  62. holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +7 -7
  63. holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +6 -8
  64. holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +3 -3
  65. holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +3 -3
  66. holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +3 -3
  67. holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +3 -3
  68. holmes/plugins/toolsets/azure_sql/utils.py +0 -32
  69. holmes/plugins/toolsets/bash/argocd/__init__.py +3 -3
  70. holmes/plugins/toolsets/bash/aws/__init__.py +4 -4
  71. holmes/plugins/toolsets/bash/azure/__init__.py +4 -4
  72. holmes/plugins/toolsets/bash/bash_toolset.py +2 -3
  73. holmes/plugins/toolsets/bash/common/bash.py +19 -9
  74. holmes/plugins/toolsets/bash/common/bash_command.py +1 -1
  75. holmes/plugins/toolsets/bash/common/stringify.py +1 -1
  76. holmes/plugins/toolsets/bash/kubectl/__init__.py +2 -1
  77. holmes/plugins/toolsets/bash/kubectl/constants.py +0 -1
  78. holmes/plugins/toolsets/bash/kubectl/kubectl_get.py +3 -4
  79. holmes/plugins/toolsets/bash/parse_command.py +12 -13
  80. holmes/plugins/toolsets/connectivity_check.py +124 -0
  81. holmes/plugins/toolsets/coralogix/api.py +132 -119
  82. holmes/plugins/toolsets/coralogix/coralogix.jinja2 +14 -0
  83. holmes/plugins/toolsets/coralogix/toolset_coralogix.py +219 -0
  84. holmes/plugins/toolsets/coralogix/utils.py +15 -79
  85. holmes/plugins/toolsets/datadog/datadog_api.py +36 -3
  86. holmes/plugins/toolsets/datadog/datadog_logs_instructions.jinja2 +34 -1
  87. holmes/plugins/toolsets/datadog/datadog_metrics_instructions.jinja2 +3 -3
  88. holmes/plugins/toolsets/datadog/datadog_models.py +59 -0
  89. holmes/plugins/toolsets/datadog/datadog_url_utils.py +213 -0
  90. holmes/plugins/toolsets/datadog/instructions_datadog_traces.jinja2 +165 -28
  91. holmes/plugins/toolsets/datadog/toolset_datadog_general.py +71 -28
  92. holmes/plugins/toolsets/datadog/toolset_datadog_logs.py +224 -375
  93. holmes/plugins/toolsets/datadog/toolset_datadog_metrics.py +67 -36
  94. holmes/plugins/toolsets/datadog/toolset_datadog_traces.py +360 -343
  95. holmes/plugins/toolsets/elasticsearch/__init__.py +6 -0
  96. holmes/plugins/toolsets/elasticsearch/elasticsearch.py +834 -0
  97. holmes/plugins/toolsets/git.py +7 -8
  98. holmes/plugins/toolsets/grafana/base_grafana_toolset.py +16 -4
  99. holmes/plugins/toolsets/grafana/common.py +2 -30
  100. holmes/plugins/toolsets/grafana/grafana_tempo_api.py +2 -1
  101. holmes/plugins/toolsets/grafana/loki/instructions.jinja2 +18 -2
  102. holmes/plugins/toolsets/grafana/loki/toolset_grafana_loki.py +92 -18
  103. holmes/plugins/toolsets/grafana/loki_api.py +4 -0
  104. holmes/plugins/toolsets/grafana/toolset_grafana.py +109 -25
  105. holmes/plugins/toolsets/grafana/toolset_grafana_dashboard.jinja2 +22 -0
  106. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +201 -33
  107. holmes/plugins/toolsets/grafana/trace_parser.py +3 -2
  108. holmes/plugins/toolsets/internet/internet.py +10 -10
  109. holmes/plugins/toolsets/internet/notion.py +5 -6
  110. holmes/plugins/toolsets/investigator/core_investigation.py +3 -3
  111. holmes/plugins/toolsets/investigator/model.py +3 -1
  112. holmes/plugins/toolsets/json_filter_mixin.py +134 -0
  113. holmes/plugins/toolsets/kafka.py +12 -7
  114. holmes/plugins/toolsets/kubernetes.yaml +260 -30
  115. holmes/plugins/toolsets/kubernetes_logs.py +3 -3
  116. holmes/plugins/toolsets/logging_utils/logging_api.py +16 -6
  117. holmes/plugins/toolsets/mcp/toolset_mcp.py +88 -60
  118. holmes/plugins/toolsets/newrelic/new_relic_api.py +41 -1
  119. holmes/plugins/toolsets/newrelic/newrelic.jinja2 +24 -0
  120. holmes/plugins/toolsets/newrelic/newrelic.py +212 -55
  121. holmes/plugins/toolsets/prometheus/prometheus.py +358 -102
  122. holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2 +11 -3
  123. holmes/plugins/toolsets/rabbitmq/api.py +23 -4
  124. holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +5 -5
  125. holmes/plugins/toolsets/robusta/robusta.py +5 -5
  126. holmes/plugins/toolsets/runbook/runbook_fetcher.py +25 -6
  127. holmes/plugins/toolsets/servicenow_tables/servicenow_tables.py +1 -1
  128. holmes/plugins/toolsets/utils.py +1 -1
  129. holmes/utils/config_utils.py +1 -1
  130. holmes/utils/connection_utils.py +31 -0
  131. holmes/utils/console/result.py +10 -0
  132. holmes/utils/file_utils.py +2 -1
  133. holmes/utils/global_instructions.py +10 -26
  134. holmes/utils/holmes_status.py +4 -3
  135. holmes/utils/log.py +15 -0
  136. holmes/utils/markdown_utils.py +2 -3
  137. holmes/utils/memory_limit.py +58 -0
  138. holmes/utils/sentry_helper.py +23 -0
  139. holmes/utils/stream.py +12 -5
  140. holmes/utils/tags.py +4 -3
  141. holmes/version.py +3 -1
  142. {holmesgpt-0.16.2a0.dist-info → holmesgpt-0.18.4.dist-info}/METADATA +12 -10
  143. holmesgpt-0.18.4.dist-info/RECORD +258 -0
  144. holmes/plugins/toolsets/aws.yaml +0 -80
  145. holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +0 -114
  146. holmes/plugins/toolsets/datadog/datadog_traces_formatter.py +0 -310
  147. holmes/plugins/toolsets/datadog/toolset_datadog_rds.py +0 -736
  148. holmes/plugins/toolsets/grafana/grafana_api.py +0 -64
  149. holmes/plugins/toolsets/opensearch/__init__.py +0 -0
  150. holmes/plugins/toolsets/opensearch/opensearch.py +0 -250
  151. holmes/plugins/toolsets/opensearch/opensearch_logs.py +0 -161
  152. holmes/plugins/toolsets/opensearch/opensearch_traces.py +0 -215
  153. holmes/plugins/toolsets/opensearch/opensearch_traces_instructions.jinja2 +0 -12
  154. holmes/plugins/toolsets/opensearch/opensearch_utils.py +0 -166
  155. holmes/utils/keygen_utils.py +0 -6
  156. holmesgpt-0.16.2a0.dist-info/RECORD +0 -258
  157. holmes/plugins/toolsets/{opensearch → elasticsearch}/opensearch_ppl_query_docs.jinja2 +0 -0
  158. holmes/plugins/toolsets/{opensearch → elasticsearch}/opensearch_query_assist.py +2 -2
  159. holmes/plugins/toolsets/{opensearch → elasticsearch}/opensearch_query_assist_instructions.jinja2 +0 -0
  160. {holmesgpt-0.16.2a0.dist-info → holmesgpt-0.18.4.dist-info}/LICENSE +0 -0
  161. {holmesgpt-0.16.2a0.dist-info → holmesgpt-0.18.4.dist-info}/WHEEL +0 -0
  162. {holmesgpt-0.16.2a0.dist-info → holmesgpt-0.18.4.dist-info}/entry_points.txt +0 -0
holmes/plugins/toolsets/datadog/datadog_api.py
@@ -1,16 +1,17 @@
 import json
 import logging
 import re
+import threading
 from datetime import datetime, timedelta, timezone
-from typing import Any, Optional, Dict, Union, Tuple
+from typing import Any, Dict, Optional, Tuple, Union
 from urllib.parse import urlparse, urlunparse
+
 import requests  # type: ignore
 from pydantic import AnyUrl, BaseModel
 from requests.structures import CaseInsensitiveDict  # type: ignore
 from tenacity import retry, retry_if_exception, stop_after_attempt, wait_incrementing
 from tenacity.wait import wait_base
 
-
 START_RETRY_DELAY = (
     5.0  # Initial fallback delay if datadog does not return a reset_time
 )
@@ -22,6 +23,9 @@ RATE_LIMIT_REMAINING_SECONDS_HEADER = "X-RateLimit-Reset"
 # Cache for OpenAPI spec
 _openapi_spec_cache: Dict[str, Any] = {}
 
+# Global lock for Datadog API requests to prevent concurrent calls
+_datadog_request_lock = threading.Lock()
+
 # Relative time pattern (m = minutes, mo = months)
 RELATIVE_TIME_PATTERN = re.compile(r"^-?(\d+)([hdwsy]|min|m|mo)$|^now$", re.IGNORECASE)
 
@@ -237,6 +241,35 @@ def execute_datadog_http_request(
     payload_or_params: dict,
     timeout: int,
     method: str = "POST",
+) -> Any:
+    # From limited testing, issuing just one request at a time is faster because the RATE_LIMIT_REMAINING_SECONDS_HEADER is shorter
+    # Serialize all Datadog API requests to avoid rate limits
+    with _datadog_request_lock:
+        return execute_datadog_http_request_with_retries(
+            url, headers, payload_or_params, timeout, method
+        )
+
+
+@retry(
+    retry=retry_if_http_429_error(),
+    wait=wait_for_retry_after_header(
+        fallback=wait_incrementing(
+            start=START_RETRY_DELAY, increment=INCREMENT_RETRY_DELAY
+        )
+    ),
+    stop=stop_after_attempt(MAX_RETRY_COUNT_ON_RATE_LIMIT),
+    before_sleep=lambda retry_state: logging.warning(
+        f"DataDog API rate limited. Retrying... "
+        f"(attempt {retry_state.attempt_number}/{MAX_RETRY_COUNT_ON_RATE_LIMIT})"
+    ),
+    reraise=True,
+)
+def execute_datadog_http_request_with_retries(
+    url: str,
+    headers: dict,
+    payload_or_params: dict,
+    timeout: int,
+    method: str,
 ) -> Any:
     logging.debug(
         f"Datadog API Request: Method: {method} URL: {url} Headers: {json.dumps(sanitize_headers(headers), indent=2)} {'Params' if method == 'GET' else 'Payload'}: {json.dumps(payload_or_params, indent=2)} Timeout: {timeout}s"
@@ -261,7 +294,7 @@ def execute_datadog_http_request(
         return response_data
 
     else:
-        logging.error(f" Error Response Body: {response.text}")
+        logging.debug(f"Error Response Body: {response.text}")
         raise DataDogRequestError(
             payload=payload_or_params,
             status_code=response.status_code,
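
Taken together, the datadog_api.py changes wrap every request in a two-layer guard: a process-wide lock serializes calls, and tenacity retries HTTP 429s (honoring Datadog's rate-limit reset header, with an incrementing fallback). Below is a minimal, self-contained sketch of the same pattern — the names `call_datadog` and `_is_rate_limited`, and the fixed 5-second wait, are illustrative stand-ins rather than the package's actual helpers:

```python
import threading

import requests
from tenacity import retry, retry_if_exception, stop_after_attempt, wait_fixed

MAX_ATTEMPTS = 5  # illustrative; the real module uses MAX_RETRY_COUNT_ON_RATE_LIMIT

_lock = threading.Lock()  # one in-flight request per process


def _is_rate_limited(exc: BaseException) -> bool:
    """Retry predicate: only retry on HTTP 429 responses."""
    return (
        isinstance(exc, requests.HTTPError)
        and exc.response is not None
        and exc.response.status_code == 429
    )


@retry(
    retry=retry_if_exception(_is_rate_limited),
    wait=wait_fixed(5),  # stand-in for the header-aware wait strategy
    stop=stop_after_attempt(MAX_ATTEMPTS),
    reraise=True,
)
def _request_with_retries(url: str, params: dict, timeout: int) -> dict:
    response = requests.get(url, params=params, timeout=timeout)
    response.raise_for_status()  # raises HTTPError on 429, triggering a retry
    return response.json()


def call_datadog(url: str, params: dict, timeout: int = 30) -> dict:
    # Serialize requests so concurrent tool calls don't burn the rate limit.
    with _lock:
        return _request_with_retries(url, params, timeout)
```

Serializing requests looks slower on paper, but as the in-code comment notes, it keeps the rate-limit window short enough that one-at-a-time ends up faster than parallel calls that all hit 429.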
holmes/plugins/toolsets/datadog/datadog_logs_instructions.jinja2
@@ -44,7 +44,6 @@ Before running logs queries:
 ### Time Parameters
 - Use RFC3339 format: `2023-03-01T10:30:00Z`
 - Or relative seconds: `-3600` for 1 hour ago
-- Defaults to 1 hour window if not specified
 
 ### Common Investigation Patterns
 
@@ -52,3 +51,37 @@
 1. User asks: "Show logs for my-workload"
 2. Use `kubectl_find_resource` → find pod "my-workload-abc123-xyz"
 3. Query Datadog for pod "my-workload-abc123-xyz" logs
+
+
+### Search Query Guidelines
+
+1. Avoid using the @timestamp attribute in search queries (for example, @timestamp:[2025-12-10T01:00:00.000Z TO 2025-12-10T04:00:00.000Z]).
+   Rely on the fetch_datadog_logs start_datetime and end_datetime parameters instead.
+2. Datadog's default Kubernetes tags are *kube_namespace* and *pod_name*; if a user specifies custom tags for their environment, use those in your search queries.
+3. If you saw a useful tag in an earlier fetch_datadog_logs query, reuse it in subsequent queries.
+
+### CRITICAL: Cursor Usage Rules
+**NEVER parallelize cursor-based calls or reuse cursor values!**
+
+Cursors are stateful pointers - each one is single-use and represents a unique position in the data stream.
+
+**WRONG (causes duplicate data):**
+```
+Batch 1 → cursor_A
+Then call Batch 2, 3, 4 ALL with cursor_A in parallel ❌
+Result: Duplicate data, incomplete results
+```
+
+**CORRECT (sequential pagination):**
+```
+Batch 1 → cursor_A
+Wait for response → use cursor_A for Batch 2 → cursor_B
+Wait for response → use cursor_B for Batch 3 → cursor_C
+Result: Complete unique data ✅
+```
+
+**Key Rules:**
+- Each response provides a NEW cursor for the NEXT request
+- NEVER reuse the same cursor value multiple times
+- NEVER make parallel calls with the same cursor
+- Always wait for response before using the returned cursor
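
The cursor rules above translate directly into a sequential loop. A rough sketch, assuming a response dict with a `logs` list and an optional `cursor` key (the exact response shape of `fetch_datadog_logs` is not shown in this diff):

```python
from typing import Callable


def fetch_all_pages(
    fetch_datadog_logs: Callable[..., dict],
    query: str,
    max_batches: int = 5,
) -> list:
    """Sequential cursor pagination: each response's cursor feeds the NEXT call."""
    logs: list = []
    cursor = None
    for _ in range(max_batches):
        # One in-flight call at a time; never reuse a cursor across calls.
        response = fetch_datadog_logs(query=query, cursor=cursor)
        logs.extend(response.get("logs", []))
        cursor = response.get("cursor")  # fresh cursor for the next request
        if not cursor:  # no cursor returned means no more pages
            break
    return logs
```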
holmes/plugins/toolsets/datadog/datadog_metrics_instructions.jinja2
@@ -73,7 +73,7 @@ When investigating metrics-related issues:
 
 # Handling queries results
 * ALWAYS embed the execution results into your answer
-* You only need to embed the partial result in your response. Include the "tool_name" and "random_key". For example: << {"type": "datadogql", "tool_name": "query_datadog_metrics", "random_key": "92jf2hf"} >>
+* You only need to embed the partial result in your response. Include the "tool_name" and "tool_call_id". For example: << {"type": "datadogql", "tool_name": "query_datadog_metrics", "tool_call_id": "92jf2hf"} >>
 * Post processing will parse your response, re-run the query from the tool output and create a chart visible to the user
 * You MUST ensure that the query is successful.
 * ALWAYS embed a DataDog graph in the response. The graph should visualize data related to the incident.
@@ -81,6 +81,6 @@ When investigating metrics-related issues:
 * When embedding multiple graphs, always add line spacing between them
 For example:
 
-<<{"type": "datadogql", "tool_name": "query_datadog_metrics", "random_key": "lBaA"}>>
+<<{"type": "datadogql", "tool_name": "query_datadog_metrics", "tool_call_id": "lBaA"}>>
 
-<<{"type": "datadogql", "tool_name": "query_datadog_metrics", "random_key": "IKtq"}>>
+<<{"type": "datadogql", "tool_name": "query_datadog_metrics", "tool_call_id": "IKtq"}>>
holmes/plugins/toolsets/datadog/datadog_models.py (new file)
@@ -0,0 +1,59 @@
+from enum import Enum
+
+from pydantic import Field
+
+from holmes.plugins.toolsets.datadog.datadog_api import DatadogBaseConfig
+from holmes.plugins.toolsets.logging_utils.logging_api import DEFAULT_LOG_LIMIT
+
+# Constants for RDS toolset
+DEFAULT_TIME_SPAN_SECONDS = 3600
+DEFAULT_TOP_INSTANCES = 10
+
+# Constants for general toolset
+MAX_RESPONSE_SIZE = 10 * 1024 * 1024  # 10MB
+
+
+class DataDogStorageTier(str, Enum):
+    """Storage tier enum for Datadog logs."""
+
+    INDEXES = "indexes"
+    ONLINE_ARCHIVES = "online-archives"
+    FLEX = "flex"
+
+
+# Constants for logs toolset
+DEFAULT_STORAGE_TIERS = [DataDogStorageTier.INDEXES]
+
+
+class DatadogMetricsConfig(DatadogBaseConfig):
+    """Configuration for Datadog metrics toolset."""
+
+    default_limit: int = DEFAULT_LOG_LIMIT
+
+
+class DatadogTracesConfig(DatadogBaseConfig):
+    """Configuration for Datadog traces toolset."""
+
+    indexes: list[str] = ["*"]
+
+
+class DatadogLogsConfig(DatadogBaseConfig):
+    """Configuration for Datadog logs toolset."""
+
+    indexes: list[str] = ["*"]
+    # TODO: storage tier currently only works with the first element; add support for multiple storage tiers.
+    storage_tiers: list[DataDogStorageTier] = Field(
+        default_factory=lambda: [DataDogStorageTier.INDEXES], min_length=1
+    )
+
+    compact_logs: bool = True
+    default_limit: int = DEFAULT_LOG_LIMIT
+
+
+class DatadogGeneralConfig(DatadogBaseConfig):
+    """Configuration for general-purpose Datadog toolset."""
+
+    max_response_size: int = MAX_RESPONSE_SIZE
+    allow_custom_endpoints: bool = (
+        False  # If True, allows endpoints not in whitelist (still filtered for safety)
+    )
holmes/plugins/toolsets/datadog/datadog_url_utils.py (new file)
@@ -0,0 +1,213 @@
+import re
+from typing import Any, Dict, Optional
+from urllib.parse import urlencode, urlparse
+
+from holmes.plugins.toolsets.datadog.datadog_api import convert_api_url_to_app_url
+from holmes.plugins.toolsets.datadog.datadog_models import (
+    DatadogGeneralConfig,
+    DatadogLogsConfig,
+    DatadogMetricsConfig,
+    DatadogTracesConfig,
+)
+
+
+def generate_datadog_metrics_explorer_url(
+    dd_config: DatadogMetricsConfig,
+    query: str,
+    from_time: int,
+    to_time: int,
+) -> str:
+    base_url = convert_api_url_to_app_url(dd_config.site_api_url)
+
+    params = {
+        "query": query,
+        "from_ts": from_time * 1000,  # seconds -> ms
+        "to_ts": to_time * 1000,  # seconds -> ms
+        "live": "true",
+    }
+
+    return f"{base_url}/metric/explorer?{urlencode(params)}"
+
+
+def generate_datadog_metrics_list_url(
+    dd_config: DatadogMetricsConfig,
+    from_time: int,
+    host: Optional[str] = None,
+    tag_filter: Optional[str] = None,
+    metric_filter: Optional[str] = None,
+) -> str:
+    base_url = convert_api_url_to_app_url(dd_config.site_api_url)
+
+    params = {}
+    if metric_filter:
+        params["filter"] = metric_filter
+
+    if host:
+        params["host"] = host
+    if tag_filter:
+        params["tag_filter"] = tag_filter
+
+    qs = urlencode(params) if params else ""
+    return f"{base_url}/metric/summary" + (f"?{qs}" if qs else "")
+
+
+def generate_datadog_metric_metadata_url(
+    dd_config: DatadogMetricsConfig,
+    metric_name: str,
+) -> str:
+    base_url = convert_api_url_to_app_url(dd_config.site_api_url)
+    params = {"metric": metric_name}
+    return f"{base_url}/metric/summary?{urlencode(params)}"
+
+
+def generate_datadog_metric_tags_url(
+    dd_config: DatadogMetricsConfig,
+    metric_name: str,
+) -> str:
+    base_url = convert_api_url_to_app_url(dd_config.site_api_url)
+    params = {"metric": metric_name}
+    return f"{base_url}/metric/summary?{urlencode(params)}"
+
+
+def generate_datadog_spans_url(
+    dd_config: DatadogTracesConfig,
+    query: str,
+    from_time_ms: int,
+    to_time_ms: int,
+) -> str:
+    base_url = convert_api_url_to_app_url(dd_config.site_api_url)
+
+    url_params = {
+        "query": query,
+        "from_ts": from_time_ms,
+        "to_ts": to_time_ms,
+        "live": "true",
+    }
+
+    return f"{base_url}/apm/traces?{urlencode(url_params)}"
+
+
+def generate_datadog_spans_analytics_url(
+    dd_config: DatadogTracesConfig,
+    query: str,
+    from_time_ms: int,
+    to_time_ms: int,
+) -> str:
+    base_url = convert_api_url_to_app_url(dd_config.site_api_url)
+
+    url_params = {
+        "query": query,
+        "from_ts": from_time_ms,
+        "to_ts": to_time_ms,
+        "live": "true",
+    }
+
+    return f"{base_url}/apm/analytics?{urlencode(url_params)}"
+
+
+def generate_datadog_logs_url(
+    dd_config: DatadogLogsConfig,
+    params: dict,
+) -> str:
+    base_url = convert_api_url_to_app_url(dd_config.site_api_url)
+    url_params = {
+        "query": params["filter"]["query"],
+        "from_ts": params["filter"]["from"],
+        "to_ts": params["filter"]["to"],
+        "live": "true",
+        "storage": params["filter"]["storage_tier"],
+    }
+
+    if dd_config.indexes != ["*"]:
+        url_params["index"] = ",".join(dd_config.indexes)
+
+    # Construct the full URL
+    return f"{base_url}/logs?{urlencode(url_params)}"
+
+
+def _build_qs(
+    query_params: Optional[Dict[str, Any]], allowed: Optional[set] = None
+) -> str:
+    if not query_params:
+        return ""
+    allowed = allowed or {
+        "filter",
+        "query",
+        "tags",
+        "status",
+        "start",
+        "end",
+        "from",
+        "to",
+    }
+    url_params = {}
+    for k, v in query_params.items():
+        if k not in allowed or v is None:
+            continue
+        if k in ("start", "from"):
+            url_params["from_ts"] = v * 1000
+        elif k in ("end", "to"):
+            url_params["to_ts"] = v * 1000
+        elif k in ("query", "filter", "tags"):
+            url_params["q"] = v
+        else:
+            url_params[k] = v
+    qs = urlencode(url_params) if url_params else ""
+    return f"?{qs}" if qs else ""
+
+
+def generate_datadog_general_url(
+    dd_config: DatadogGeneralConfig,
+    endpoint: str,
+    query_params: Optional[Dict[str, Any]] = None,
+) -> Optional[str]:
+    base_url = convert_api_url_to_app_url(dd_config.site_api_url)
+    path = urlparse(endpoint).path
+
+    if "/logs" in path:
+        return f"{base_url}/logs{_build_qs(query_params, {'start', 'end'})}"
+
+    if "/monitor" in path:
+        qs = _build_qs(query_params, {"filter", "query", "tags", "status"})
+        monitor_id_match = re.search(r"/monitor/(\d+)", path)
+        if monitor_id_match:
+            return f"{base_url}/monitors/{monitor_id_match.group(1)}{qs}"
+        return f"{base_url}/monitors{qs}"
+
+    if "/dashboard" in path:
+        qs = _build_qs(query_params, {"filter", "query", "tags"})
+        if re.match(r"^/api/v\d+/dashboard/[^/]+", path):
+            return f"{base_url}/dashboard/{path.split('/')[-1]}{qs}"
+        return f"{base_url}/dashboard{qs}"
+
+    if "/slo" in path:
+        qs = _build_qs(query_params, {"filter", "query", "tags"})
+        if re.match(r"^/api/v\d+/slo/[^/]+", path):
+            return f"{base_url}/slo/{path.split('/')[-1]}{qs}"
+        return f"{base_url}/slo{qs}"
+
+    if "/events" in path:
+        return f"{base_url}/events{_build_qs(query_params, {'start', 'end'})}"
+
+    if "/incidents" in path:
+        qs = _build_qs(query_params, {"filter", "query", "status"})
+        if re.match(r"^/api/v\d+/incidents/[^/]+", path):
+            return f"{base_url}/incidents/{path.split('/')[-1]}{qs}"
+        return f"{base_url}/incidents{qs}"
+
+    if "/synthetics" in path:
+        qs = _build_qs(query_params, {"filter", "query", "tags", "status"})
+        if re.match(r"^/api/v\d+/synthetics/tests/[^/]+", path):
+            return f"{base_url}/synthetics/tests/{path.split('/')[-1]}{qs}"
+        return f"{base_url}/synthetics/tests{qs}"
+
+    if "/hosts" in path:
+        return f"{base_url}/infrastructure{_build_qs(query_params, {'filter', 'query', 'tags'})}"
+
+    if "/services" in path:
+        return f"{base_url}/apm/services{_build_qs(query_params, {'filter', 'query', 'tags'})}"
+
+    if "/metrics" in path or "/query" in path:
+        return f"{base_url}/metrics/explorer{_build_qs(query_params, {'from', 'to', 'query'})}"
+
+    return f"{base_url}/apm/home"
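
These helpers map API-side queries to deep links in the Datadog app UI. A small usage sketch for the metrics-explorer helper — the base-config field names are assumed, and `convert_api_url_to_app_url` is presumed to map, e.g., api.datadoghq.com to app.datadoghq.com:

```python
import time

from holmes.plugins.toolsets.datadog.datadog_models import DatadogMetricsConfig
from holmes.plugins.toolsets.datadog.datadog_url_utils import (
    generate_datadog_metrics_explorer_url,
)

# Placeholder credentials; the inherited field names are assumptions.
config = DatadogMetricsConfig(
    site_api_url="https://api.datadoghq.com",
    dd_api_key="<api-key>",
    dd_app_key="<app-key>",
)

now = int(time.time())
url = generate_datadog_metrics_explorer_url(
    dd_config=config,
    query="avg:system.cpu.user{*}",
    from_time=now - 3600,  # the helper converts seconds to ms for from_ts/to_ts
    to_time=now,
)
print(url)  # .../metric/explorer?query=...&from_ts=...&to_ts=...&live=true
```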
holmes/plugins/toolsets/datadog/instructions_datadog_traces.jinja2
@@ -3,49 +3,186 @@
 Tools to search and analyze distributed traces from Datadog APM.
 
 ### Available Tools:
-- **fetch_datadog_traces** - List traces with filters (service, operation, duration)
-- **fetch_datadog_trace_by_id** - Get detailed span hierarchy for a specific trace
 - **fetch_datadog_spans** - Search spans with Datadog query syntax
+- **aggregate_datadog_spans** - Aggregate span data into buckets and compute metrics
 
 ### Common Usage:
 
 ```python
-# Find slow traces (>5s) for a service
-fetch_datadog_traces(service="backend-service", min_duration="5s")
+# Search for errors using Datadog query syntax
+fetch_datadog_spans(query="@http.status_code:500", limit=5)
+fetch_datadog_spans(query="service:api status:error", limit=10)
+```
 
-# Get trace details showing full span hierarchy
-fetch_datadog_trace_by_id(trace_id="6878d11e0000000064837efe7e97f5f8")
+### Query Patterns:
 
-# Search for errors using Datadog query syntax
-fetch_datadog_spans(query="@http.status_code:500")
-fetch_datadog_spans(service="api", query="status:error")
+```python
+# Specific HTTP endpoint (any method)
+fetch_datadog_spans(query="@http.route:/api/orders", limit=5)
+
+# HTTP routes containing substring (wildcard search)
+fetch_datadog_spans(query="@http.route:*payment*", limit=5)
+
+# Broad search across all span types
+fetch_datadog_spans(query="resource_name:*user*", limit=10)
+
+# Errors by service with wildcard
+fetch_datadog_spans(query="service:payment @http.status_code:5*", limit=5)
+
+# Database queries with time range (last hour)
+fetch_datadog_spans(
+    query="service:postgres @duration:>1000000000",
+    start_datetime="-3600",  # 1 hour in seconds
+    limit=10
+)
+
+# Production errors
+fetch_datadog_spans(query="env:production error:true", limit=5)
 
-# Time ranges (default: last hour)
-fetch_datadog_traces(
-    service="api",
-    start_datetime="-3600",  # 1 hour ago
-    end_datetime="0"  # now
+# Specific endpoint pattern with custom time range
+fetch_datadog_spans(
+    query='@http.route:*/user/* @http.status_code:>=400',
+    start_datetime="-1800",  # 30 minutes in seconds
+    limit=10
+)
+
+# Combining multiple conditions with wildcards
+fetch_datadog_spans(
+    query='service:*api* @http.route:*/user/* @http.status_code:[400 TO 599]',
+    limit=10
 )
 ```
 
-### Query Examples:
+### Aggregate Examples:
 
 ```python
-# Performance issues
-fetch_datadog_traces(min_duration="2s", operation="GET /api/products")
+# Count spans grouped by status code (last 15 minutes)
+aggregate_datadog_spans(
+    query='resource_name:*api* @http.method:POST',
+    compute=[{"aggregation": "count", "type": "total"}],
+    group_by=[{"facet": "@http.status_code", "limit": 50}],
+    start_datetime="-900"  # 15 minutes in seconds
+)
+
+# Get average duration by service (last hour)
+aggregate_datadog_spans(
+    query='service:*backend* OR service:*api*',
+    compute=[{"aggregation": "avg", "metric": "@duration", "type": "total"}],
+    group_by=[{"facet": "service", "limit": 50}],
+    start_datetime="-3600"  # 1 hour in seconds
+)
+
+# Get P95 latency timeseries by service
+aggregate_datadog_spans(
+    query='@http.route:*/api/* @http.status_code:[200 TO 299]',
+    compute=[{
+        "aggregation": "pc95",
+        "metric": "@duration",
+        "type": "timeseries",
+        "interval": "5m"
+    }],
+    group_by=[{"facet": "service", "limit": 50}]
+)
+
+# Complex aggregation with histogram
+aggregate_datadog_spans(
+    query='resource_name:*product* OR resource_name:*catalog*',
+    compute=[
+        {"aggregation": "avg", "metric": "@duration", "type": "total"},
+        {"aggregation": "count", "type": "total"}
+    ],
+    group_by=[{
+        "facet": "@duration",
+        "histogram": {"interval": 100, "min": 0, "max": 1000},
+        "limit": 50
+    }]
+)
+
+# Error rate calculation by endpoint
+aggregate_datadog_spans(
+    query='@http.route:* @http.status_code:[400 TO 599]',
+    compute=[{"aggregation": "count", "type": "total"}],
+    group_by=[
+        {"facet": "resource_name", "limit": 50},
+        {"facet": "@http.status_code", "limit": 50}
+    ]
+)
+```
+
+### Query Pattern Tips:
+
+| Your Goal | Use This Pattern |
+|-----------|------------------|
+| Specific HTTP endpoint, any method | `@http.route:/api/users` |
+| HTTP routes containing substring | `@http.route:*payment*` |
+| Broad search across all span types | `resource_name:*user*` |
+| Service name patterns | `service:*api*` or `service:payment-*` |
+| Multiple wildcards | `@http.route:*/user/*/profile` |
+| Error status codes | `@http.status_code:5*` or `@http.status_code:[400 TO 599]` |
+
+### General Tips:
+- Wildcards (*) can be used in most fields for flexible pattern matching
+- For aggregations: use @-prefixed attributes (e.g., @duration, @http.status_code)
+- Keep fetch_datadog_spans limit low (5-10) to avoid too much data
+- aggregate_datadog_spans can handle higher limits (50+) for group_by facets
+
+### CRITICAL: Cursor Usage Rules
+**NEVER parallelize cursor-based calls or reuse cursor values!**
+
+Cursors are stateful pointers - each one is single-use and represents a unique position in the data stream.
+
+**WRONG (causes duplicate data):**
+```
+Batch 1 → cursor_A
+Then call Batch 2, 3, 4 ALL with cursor_A in parallel ❌
+Result: Duplicate data, incomplete results
+```
 
-# Errors by service
-fetch_datadog_spans(service="payment", query="@http.status_code:5*")
+**CORRECT (sequential pagination):**
+```
+Batch 1 → cursor_A
+Wait for response → use cursor_A for Batch 2 → cursor_B
+Wait for response → use cursor_B for Batch 3 → cursor_C
+Result: Complete unique data ✅
+```
+
+**Key Rules:**
+- Each response provides a NEW cursor for the NEXT request
+- NEVER reuse the same cursor value multiple times
+- NEVER make parallel calls with the same cursor
+- Always wait for response before using the returned cursor
+
+### Compact Mode Strategy:
+
+The `compact` parameter reduces output size by returning only essential fields. Use this strategy:
 
-# Database queries
-fetch_datadog_spans(query="service:postgres @duration:>1000000000")
+1. **Initial exploration**: Use compact=true with higher limits (50-100) to get an overview
+2. **Detailed investigation**: Use compact=false with lower limits (5-10) for specific spans
 
-# With tags
-fetch_datadog_spans(tags={"env": "production"}, query="error:true")
+```python
+# STEP 1: Initial search with compact mode to find patterns
+fetch_datadog_spans(
+    query="service:api @http.status_code:5*",
+    compact=true,
+    limit=100  # Higher limit safe with compact mode
+)
+
+# STEP 2: Detailed investigation of specific issues
+fetch_datadog_spans(
+    query="service:api @http.status_code:500 resource_name:*/user/*",
+    compact=false,  # Full details for deep analysis
+    limit=10
+)
 ```
 
-### Tips:
-- Duration units: ms, s, m (e.g., "500ms", "5s", "1m")
-- Time: RFC3339 format or negative seconds from now
-- Rate limit: 300 requests/hour
-- Default time range: 1 hour
+**When to use compact=true:**
+- Initial searches to identify patterns
+- When you need to scan many spans for errors or performance issues
+- When looking for specific span IDs or trace IDs
+- When the full span details aren't needed yet
+
+**When to use compact=false (default):**
+- Investigating specific errors
+- Analyzing request/response headers
+- Examining user agent details
+- Debugging authentication issues or HTTP details