holmesgpt 0.13.2__py3-none-any.whl → 0.18.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (188)
  1. holmes/__init__.py +3 -5
  2. holmes/clients/robusta_client.py +20 -6
  3. holmes/common/env_vars.py +58 -3
  4. holmes/common/openshift.py +1 -1
  5. holmes/config.py +123 -148
  6. holmes/core/conversations.py +71 -15
  7. holmes/core/feedback.py +191 -0
  8. holmes/core/investigation.py +31 -39
  9. holmes/core/investigation_structured_output.py +3 -3
  10. holmes/core/issue.py +1 -1
  11. holmes/core/llm.py +508 -88
  12. holmes/core/models.py +108 -4
  13. holmes/core/openai_formatting.py +14 -1
  14. holmes/core/prompt.py +48 -3
  15. holmes/core/runbooks.py +1 -0
  16. holmes/core/safeguards.py +8 -6
  17. holmes/core/supabase_dal.py +295 -100
  18. holmes/core/tool_calling_llm.py +489 -428
  19. holmes/core/tools.py +325 -56
  20. holmes/core/tools_utils/token_counting.py +21 -0
  21. holmes/core/tools_utils/tool_context_window_limiter.py +40 -0
  22. holmes/core/tools_utils/tool_executor.py +0 -13
  23. holmes/core/tools_utils/toolset_utils.py +1 -0
  24. holmes/core/toolset_manager.py +191 -5
  25. holmes/core/tracing.py +19 -3
  26. holmes/core/transformers/__init__.py +23 -0
  27. holmes/core/transformers/base.py +63 -0
  28. holmes/core/transformers/llm_summarize.py +175 -0
  29. holmes/core/transformers/registry.py +123 -0
  30. holmes/core/transformers/transformer.py +32 -0
  31. holmes/core/truncation/compaction.py +94 -0
  32. holmes/core/truncation/dal_truncation_utils.py +23 -0
  33. holmes/core/truncation/input_context_window_limiter.py +219 -0
  34. holmes/interactive.py +228 -31
  35. holmes/main.py +23 -40
  36. holmes/plugins/interfaces.py +2 -1
  37. holmes/plugins/prompts/__init__.py +2 -1
  38. holmes/plugins/prompts/_fetch_logs.jinja2 +31 -6
  39. holmes/plugins/prompts/_general_instructions.jinja2 +1 -2
  40. holmes/plugins/prompts/_runbook_instructions.jinja2 +24 -12
  41. holmes/plugins/prompts/base_user_prompt.jinja2 +7 -0
  42. holmes/plugins/prompts/conversation_history_compaction.jinja2 +89 -0
  43. holmes/plugins/prompts/generic_ask.jinja2 +0 -4
  44. holmes/plugins/prompts/generic_ask_conversation.jinja2 +0 -1
  45. holmes/plugins/prompts/generic_ask_for_issue_conversation.jinja2 +0 -1
  46. holmes/plugins/prompts/generic_investigation.jinja2 +0 -1
  47. holmes/plugins/prompts/investigation_procedure.jinja2 +50 -1
  48. holmes/plugins/prompts/kubernetes_workload_ask.jinja2 +0 -1
  49. holmes/plugins/prompts/kubernetes_workload_chat.jinja2 +0 -1
  50. holmes/plugins/runbooks/__init__.py +145 -17
  51. holmes/plugins/runbooks/catalog.json +2 -0
  52. holmes/plugins/sources/github/__init__.py +4 -2
  53. holmes/plugins/sources/prometheus/models.py +1 -0
  54. holmes/plugins/toolsets/__init__.py +44 -27
  55. holmes/plugins/toolsets/aks-node-health.yaml +46 -0
  56. holmes/plugins/toolsets/aks.yaml +64 -0
  57. holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +38 -47
  58. holmes/plugins/toolsets/azure_sql/apis/alert_monitoring_api.py +3 -2
  59. holmes/plugins/toolsets/azure_sql/apis/azure_sql_api.py +2 -1
  60. holmes/plugins/toolsets/azure_sql/apis/connection_failure_api.py +3 -2
  61. holmes/plugins/toolsets/azure_sql/apis/connection_monitoring_api.py +3 -1
  62. holmes/plugins/toolsets/azure_sql/apis/storage_analysis_api.py +3 -1
  63. holmes/plugins/toolsets/azure_sql/azure_sql_toolset.py +12 -13
  64. holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +15 -12
  65. holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +15 -12
  66. holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +11 -11
  67. holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +11 -9
  68. holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +15 -12
  69. holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +15 -15
  70. holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +11 -8
  71. holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +11 -8
  72. holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +11 -8
  73. holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +11 -8
  74. holmes/plugins/toolsets/azure_sql/utils.py +0 -32
  75. holmes/plugins/toolsets/bash/argocd/__init__.py +3 -3
  76. holmes/plugins/toolsets/bash/aws/__init__.py +4 -4
  77. holmes/plugins/toolsets/bash/azure/__init__.py +4 -4
  78. holmes/plugins/toolsets/bash/bash_toolset.py +11 -15
  79. holmes/plugins/toolsets/bash/common/bash.py +23 -13
  80. holmes/plugins/toolsets/bash/common/bash_command.py +1 -1
  81. holmes/plugins/toolsets/bash/common/stringify.py +1 -1
  82. holmes/plugins/toolsets/bash/kubectl/__init__.py +2 -1
  83. holmes/plugins/toolsets/bash/kubectl/constants.py +0 -1
  84. holmes/plugins/toolsets/bash/kubectl/kubectl_get.py +3 -4
  85. holmes/plugins/toolsets/bash/parse_command.py +12 -13
  86. holmes/plugins/toolsets/cilium.yaml +284 -0
  87. holmes/plugins/toolsets/connectivity_check.py +124 -0
  88. holmes/plugins/toolsets/coralogix/api.py +132 -119
  89. holmes/plugins/toolsets/coralogix/coralogix.jinja2 +14 -0
  90. holmes/plugins/toolsets/coralogix/toolset_coralogix.py +219 -0
  91. holmes/plugins/toolsets/coralogix/utils.py +15 -79
  92. holmes/plugins/toolsets/datadog/datadog_api.py +525 -26
  93. holmes/plugins/toolsets/datadog/datadog_logs_instructions.jinja2 +55 -11
  94. holmes/plugins/toolsets/datadog/datadog_metrics_instructions.jinja2 +3 -3
  95. holmes/plugins/toolsets/datadog/datadog_models.py +59 -0
  96. holmes/plugins/toolsets/datadog/datadog_url_utils.py +213 -0
  97. holmes/plugins/toolsets/datadog/instructions_datadog_traces.jinja2 +165 -28
  98. holmes/plugins/toolsets/datadog/toolset_datadog_general.py +417 -241
  99. holmes/plugins/toolsets/datadog/toolset_datadog_logs.py +234 -214
  100. holmes/plugins/toolsets/datadog/toolset_datadog_metrics.py +167 -79
  101. holmes/plugins/toolsets/datadog/toolset_datadog_traces.py +374 -363
  102. holmes/plugins/toolsets/elasticsearch/__init__.py +6 -0
  103. holmes/plugins/toolsets/elasticsearch/elasticsearch.py +834 -0
  104. holmes/plugins/toolsets/elasticsearch/opensearch_ppl_query_docs.jinja2 +1616 -0
  105. holmes/plugins/toolsets/elasticsearch/opensearch_query_assist.py +78 -0
  106. holmes/plugins/toolsets/elasticsearch/opensearch_query_assist_instructions.jinja2 +223 -0
  107. holmes/plugins/toolsets/git.py +54 -50
  108. holmes/plugins/toolsets/grafana/base_grafana_toolset.py +16 -4
  109. holmes/plugins/toolsets/grafana/common.py +13 -29
  110. holmes/plugins/toolsets/grafana/grafana_tempo_api.py +455 -0
  111. holmes/plugins/toolsets/grafana/loki/instructions.jinja2 +25 -0
  112. holmes/plugins/toolsets/grafana/loki/toolset_grafana_loki.py +191 -0
  113. holmes/plugins/toolsets/grafana/loki_api.py +4 -0
  114. holmes/plugins/toolsets/grafana/toolset_grafana.py +293 -89
  115. holmes/plugins/toolsets/grafana/toolset_grafana_dashboard.jinja2 +49 -0
  116. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.jinja2 +246 -11
  117. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +820 -292
  118. holmes/plugins/toolsets/grafana/trace_parser.py +4 -3
  119. holmes/plugins/toolsets/internet/internet.py +15 -16
  120. holmes/plugins/toolsets/internet/notion.py +9 -11
  121. holmes/plugins/toolsets/investigator/core_investigation.py +44 -36
  122. holmes/plugins/toolsets/investigator/model.py +3 -1
  123. holmes/plugins/toolsets/json_filter_mixin.py +134 -0
  124. holmes/plugins/toolsets/kafka.py +36 -42
  125. holmes/plugins/toolsets/kubernetes.yaml +317 -113
  126. holmes/plugins/toolsets/kubernetes_logs.py +9 -9
  127. holmes/plugins/toolsets/kubernetes_logs.yaml +32 -0
  128. holmes/plugins/toolsets/logging_utils/logging_api.py +94 -8
  129. holmes/plugins/toolsets/mcp/toolset_mcp.py +218 -64
  130. holmes/plugins/toolsets/newrelic/new_relic_api.py +165 -0
  131. holmes/plugins/toolsets/newrelic/newrelic.jinja2 +65 -0
  132. holmes/plugins/toolsets/newrelic/newrelic.py +320 -0
  133. holmes/plugins/toolsets/openshift.yaml +283 -0
  134. holmes/plugins/toolsets/prometheus/prometheus.py +1202 -421
  135. holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2 +54 -5
  136. holmes/plugins/toolsets/prometheus/utils.py +28 -0
  137. holmes/plugins/toolsets/rabbitmq/api.py +23 -4
  138. holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +13 -14
  139. holmes/plugins/toolsets/robusta/robusta.py +239 -68
  140. holmes/plugins/toolsets/robusta/robusta_instructions.jinja2 +26 -9
  141. holmes/plugins/toolsets/runbook/runbook_fetcher.py +157 -27
  142. holmes/plugins/toolsets/service_discovery.py +1 -1
  143. holmes/plugins/toolsets/servicenow_tables/instructions.jinja2 +83 -0
  144. holmes/plugins/toolsets/servicenow_tables/servicenow_tables.py +426 -0
  145. holmes/plugins/toolsets/utils.py +88 -0
  146. holmes/utils/config_utils.py +91 -0
  147. holmes/utils/connection_utils.py +31 -0
  148. holmes/utils/console/result.py +10 -0
  149. holmes/utils/default_toolset_installation_guide.jinja2 +1 -22
  150. holmes/utils/env.py +7 -0
  151. holmes/utils/file_utils.py +2 -1
  152. holmes/utils/global_instructions.py +60 -11
  153. holmes/utils/holmes_status.py +6 -4
  154. holmes/utils/holmes_sync_toolsets.py +0 -2
  155. holmes/utils/krr_utils.py +188 -0
  156. holmes/utils/log.py +15 -0
  157. holmes/utils/markdown_utils.py +2 -3
  158. holmes/utils/memory_limit.py +58 -0
  159. holmes/utils/sentry_helper.py +64 -0
  160. holmes/utils/stream.py +69 -8
  161. holmes/utils/tags.py +4 -3
  162. holmes/version.py +37 -15
  163. holmesgpt-0.18.4.dist-info/LICENSE +178 -0
  164. {holmesgpt-0.13.2.dist-info → holmesgpt-0.18.4.dist-info}/METADATA +35 -31
  165. holmesgpt-0.18.4.dist-info/RECORD +258 -0
  166. holmes/core/performance_timing.py +0 -72
  167. holmes/plugins/toolsets/aws.yaml +0 -80
  168. holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +0 -112
  169. holmes/plugins/toolsets/datadog/datadog_traces_formatter.py +0 -310
  170. holmes/plugins/toolsets/datadog/toolset_datadog_rds.py +0 -739
  171. holmes/plugins/toolsets/grafana/grafana_api.py +0 -42
  172. holmes/plugins/toolsets/grafana/tempo_api.py +0 -124
  173. holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +0 -110
  174. holmes/plugins/toolsets/newrelic.py +0 -231
  175. holmes/plugins/toolsets/opensearch/opensearch.py +0 -257
  176. holmes/plugins/toolsets/opensearch/opensearch_logs.py +0 -161
  177. holmes/plugins/toolsets/opensearch/opensearch_traces.py +0 -218
  178. holmes/plugins/toolsets/opensearch/opensearch_traces_instructions.jinja2 +0 -12
  179. holmes/plugins/toolsets/opensearch/opensearch_utils.py +0 -166
  180. holmes/plugins/toolsets/servicenow/install.md +0 -37
  181. holmes/plugins/toolsets/servicenow/instructions.jinja2 +0 -3
  182. holmes/plugins/toolsets/servicenow/servicenow.py +0 -219
  183. holmes/utils/keygen_utils.py +0 -6
  184. holmesgpt-0.13.2.dist-info/LICENSE.txt +0 -21
  185. holmesgpt-0.13.2.dist-info/RECORD +0 -234
  186. /holmes/plugins/toolsets/{opensearch → newrelic}/__init__.py +0 -0
  187. {holmesgpt-0.13.2.dist-info → holmesgpt-0.18.4.dist-info}/WHEEL +0 -0
  188. {holmesgpt-0.13.2.dist-info → holmesgpt-0.18.4.dist-info}/entry_points.txt +0 -0
@@ -73,7 +73,7 @@ When investigating metrics-related issues:
73
73
 
74
74
  # Handling queries results
75
75
  * ALWAYS embed the execution results into your answer
76
- * You only need to embed the partial result in your response. Include the "tool_name" and "random_key". For example: << {"type": "datadogql", "tool_name": "query_datadog_metrics", "random_key": "92jf2hf"} >>
76
+ * You only need to embed the partial result in your response. Include the "tool_name" and "tool_call_id". For example: << {"type": "datadogql", "tool_name": "query_datadog_metrics", "tool_call_id": "92jf2hf"} >>
77
77
  * Post processing will parse your response, re-run the query from the tool output and create a chart visible to the user
78
78
  * You MUST ensure that the query is successful.
79
79
  * ALWAYS embed a DataDog graph in the response. The graph should visualize data related to the incident.
@@ -81,6 +81,6 @@ When investigating metrics-related issues:
81
81
  * When embedding multiple graphs, always add line spacing between them
82
82
  For example:
83
83
 
84
- <<{"type": "datadogql", "tool_name": "query_datadog_metrics", "random_key": "lBaA"}>>
84
+ <<{"type": "datadogql", "tool_name": "query_datadog_metrics", "tool_call_id": "lBaA"}>>
85
85
 
86
- <<{"type": "datadogql", "tool_name": "query_datadog_metrics", "random_key": "IKtq"}>>
86
+ <<{"type": "datadogql", "tool_name": "query_datadog_metrics", "tool_call_id": "IKtq"}>>
@@ -0,0 +1,59 @@
1
+ from enum import Enum
2
+
3
+ from pydantic import Field
4
+
5
+ from holmes.plugins.toolsets.datadog.datadog_api import DatadogBaseConfig
6
+ from holmes.plugins.toolsets.logging_utils.logging_api import DEFAULT_LOG_LIMIT
7
+
8
+ # Constants for RDS toolset
9
+ DEFAULT_TIME_SPAN_SECONDS = 3600
10
+ DEFAULT_TOP_INSTANCES = 10
11
+
12
+ # Constants for general toolset
13
+ MAX_RESPONSE_SIZE = 10 * 1024 * 1024 # 10MB
14
+
15
+
16
+ class DataDogStorageTier(str, Enum):
17
+ """Storage tier enum for Datadog logs."""
18
+
19
+ INDEXES = "indexes"
20
+ ONLINE_ARCHIVES = "online-archives"
21
+ FLEX = "flex"
22
+
23
+
24
+ # Constants for logs toolset
25
+ DEFAULT_STORAGE_TIERS = [DataDogStorageTier.INDEXES]
26
+
27
+
28
+ class DatadogMetricsConfig(DatadogBaseConfig):
29
+ """Configuration for Datadog metrics toolset."""
30
+
31
+ default_limit: int = DEFAULT_LOG_LIMIT
32
+
33
+
34
+ class DatadogTracesConfig(DatadogBaseConfig):
35
+ """Configuration for Datadog traces toolset."""
36
+
37
+ indexes: list[str] = ["*"]
38
+
39
+
40
+ class DatadogLogsConfig(DatadogBaseConfig):
41
+ """Configuration for Datadog logs toolset."""
42
+
43
+ indexes: list[str] = ["*"]
44
+ # TODO storage tier just works with first element. need to add support for multi storage tiers.
45
+ storage_tiers: list[DataDogStorageTier] = Field(
46
+ default_factory=lambda: [DataDogStorageTier.INDEXES], min_length=1
47
+ )
48
+
49
+ compact_logs: bool = True
50
+ default_limit: int = DEFAULT_LOG_LIMIT
51
+
52
+
53
+ class DatadogGeneralConfig(DatadogBaseConfig):
54
+ """Configuration for general-purpose Datadog toolset."""
55
+
56
+ max_response_size: int = MAX_RESPONSE_SIZE
57
+ allow_custom_endpoints: bool = (
58
+ False # If True, allows endpoints not in whitelist (still filtered for safety)
59
+ )
@@ -0,0 +1,213 @@
1
+ import re
2
+ from typing import Any, Dict, Optional
3
+ from urllib.parse import urlencode, urlparse
4
+
5
+ from holmes.plugins.toolsets.datadog.datadog_api import convert_api_url_to_app_url
6
+ from holmes.plugins.toolsets.datadog.datadog_models import (
7
+ DatadogGeneralConfig,
8
+ DatadogLogsConfig,
9
+ DatadogMetricsConfig,
10
+ DatadogTracesConfig,
11
+ )
12
+
13
+
14
+ def generate_datadog_metrics_explorer_url(
15
+ dd_config: DatadogMetricsConfig,
16
+ query: str,
17
+ from_time: int,
18
+ to_time: int,
19
+ ) -> str:
20
+ base_url = convert_api_url_to_app_url(dd_config.site_api_url)
21
+
22
+ params = {
23
+ "query": query,
24
+ "from_ts": from_time * 1000, # seconds -> ms
25
+ "to_ts": to_time * 1000, # seconds -> ms
26
+ "live": "true",
27
+ }
28
+
29
+ return f"{base_url}/metric/explorer?{urlencode(params)}"
30
+
31
+
32
+ def generate_datadog_metrics_list_url(
33
+ dd_config: DatadogMetricsConfig,
34
+ from_time: int,
35
+ host: Optional[str] = None,
36
+ tag_filter: Optional[str] = None,
37
+ metric_filter: Optional[str] = None,
38
+ ) -> str:
39
+ base_url = convert_api_url_to_app_url(dd_config.site_api_url)
40
+
41
+ params = {}
42
+ if metric_filter:
43
+ params["filter"] = metric_filter
44
+
45
+ if host:
46
+ params["host"] = host
47
+ if tag_filter:
48
+ params["tag_filter"] = tag_filter
49
+
50
+ qs = urlencode(params) if params else ""
51
+ return f"{base_url}/metric/summary" + (f"?{qs}" if qs else "")
52
+
53
+
54
+ def generate_datadog_metric_metadata_url(
55
+ dd_config: DatadogMetricsConfig,
56
+ metric_name: str,
57
+ ) -> str:
58
+ base_url = convert_api_url_to_app_url(dd_config.site_api_url)
59
+ params = {"metric": metric_name}
60
+ return f"{base_url}/metric/summary?{urlencode(params)}"
61
+
62
+
63
+ def generate_datadog_metric_tags_url(
64
+ dd_config: DatadogMetricsConfig,
65
+ metric_name: str,
66
+ ) -> str:
67
+ base_url = convert_api_url_to_app_url(dd_config.site_api_url)
68
+ params = {"metric": metric_name}
69
+ return f"{base_url}/metric/summary?{urlencode(params)}"
70
+
71
+
72
+ def generate_datadog_spans_url(
73
+ dd_config: DatadogTracesConfig,
74
+ query: str,
75
+ from_time_ms: int,
76
+ to_time_ms: int,
77
+ ) -> str:
78
+ base_url = convert_api_url_to_app_url(dd_config.site_api_url)
79
+
80
+ url_params = {
81
+ "query": query,
82
+ "from_ts": from_time_ms,
83
+ "to_ts": to_time_ms,
84
+ "live": "true",
85
+ }
86
+
87
+ return f"{base_url}/apm/traces?{urlencode(url_params)}"
88
+
89
+
90
+ def generate_datadog_spans_analytics_url(
91
+ dd_config: DatadogTracesConfig,
92
+ query: str,
93
+ from_time_ms: int,
94
+ to_time_ms: int,
95
+ ) -> str:
96
+ base_url = convert_api_url_to_app_url(dd_config.site_api_url)
97
+
98
+ url_params = {
99
+ "query": query,
100
+ "from_ts": from_time_ms,
101
+ "to_ts": to_time_ms,
102
+ "live": "true",
103
+ }
104
+
105
+ return f"{base_url}/apm/analytics?{urlencode(url_params)}"
106
+
107
+
108
+ def generate_datadog_logs_url(
109
+ dd_config: DatadogLogsConfig,
110
+ params: dict,
111
+ ) -> str:
112
+ base_url = convert_api_url_to_app_url(dd_config.site_api_url)
113
+ url_params = {
114
+ "query": params["filter"]["query"],
115
+ "from_ts": params["filter"]["from"],
116
+ "to_ts": params["filter"]["to"],
117
+ "live": "true",
118
+ "storage": params["filter"]["storage_tier"],
119
+ }
120
+
121
+ if dd_config.indexes != ["*"]:
122
+ url_params["index"] = ",".join(dd_config.indexes)
123
+
124
+ # Construct the full URL
125
+ return f"{base_url}/logs?{urlencode(url_params)}"
126
+
127
+
128
+ def _build_qs(
129
+ query_params: Optional[Dict[str, Any]], allowed: Optional[set] = None
130
+ ) -> str:
131
+ if not query_params:
132
+ return ""
133
+ allowed = allowed or {
134
+ "filter",
135
+ "query",
136
+ "tags",
137
+ "status",
138
+ "start",
139
+ "end",
140
+ "from",
141
+ "to",
142
+ }
143
+ url_params = {}
144
+ for k, v in query_params.items():
145
+ if k not in allowed or v is None:
146
+ continue
147
+ if k in ("start", "from"):
148
+ url_params["from_ts"] = v * 1000
149
+ elif k in ("end", "to"):
150
+ url_params["to_ts"] = v * 1000
151
+ elif k in ("query", "filter", "tags"):
152
+ url_params["q"] = v
153
+ else:
154
+ url_params[k] = v
155
+ qs = urlencode(url_params) if url_params else ""
156
+ return f"?{qs}" if qs else ""
157
+
158
+
159
+ def generate_datadog_general_url(
160
+ dd_config: DatadogGeneralConfig,
161
+ endpoint: str,
162
+ query_params: Optional[Dict[str, Any]] = None,
163
+ ) -> Optional[str]:
164
+ base_url = convert_api_url_to_app_url(dd_config.site_api_url)
165
+ path = urlparse(endpoint).path
166
+
167
+ if "/logs" in path:
168
+ return f"{base_url}/logs{_build_qs(query_params, {'start', 'end'})}"
169
+
170
+ if "/monitor" in path:
171
+ qs = _build_qs(query_params, {"filter", "query", "tags", "status"})
172
+ monitor_id_match = re.search(r"/monitor/(\d+)", path)
173
+ if monitor_id_match:
174
+ return f"{base_url}/monitors/{monitor_id_match.group(1)}{qs}"
175
+ return f"{base_url}/monitors{qs}"
176
+
177
+ if "/dashboard" in path:
178
+ qs = _build_qs(query_params, {"filter", "query", "tags"})
179
+ if re.match(r"^/api/v\d+/dashboard/[^/]+", path):
180
+ return f"{base_url}/dashboard/{path.split('/')[-1]}{qs}"
181
+ return f"{base_url}/dashboard{qs}"
182
+
183
+ if "/slo" in path:
184
+ qs = _build_qs(query_params, {"filter", "query", "tags"})
185
+ if re.match(r"^/api/v\d+/slo/[^/]+", path):
186
+ return f"{base_url}/slo/{path.split('/')[-1]}{qs}"
187
+ return f"{base_url}/slo{qs}"
188
+
189
+ if "/events" in path:
190
+ return f"{base_url}/events{_build_qs(query_params, {'start', 'end'})}"
191
+
192
+ if "/incidents" in path:
193
+ qs = _build_qs(query_params, {"filter", "query", "status"})
194
+ if re.match(r"^/api/v\d+/incidents/[^/]+", path):
195
+ return f"{base_url}/incidents/{path.split('/')[-1]}{qs}"
196
+ return f"{base_url}/incidents{qs}"
197
+
198
+ if "/synthetics" in path:
199
+ qs = _build_qs(query_params, {"filter", "query", "tags", "status"})
200
+ if re.match(r"^/api/v\d+/synthetics/tests/[^/]+", path):
201
+ return f"{base_url}/synthetics/tests/{path.split('/')[-1]}{qs}"
202
+ return f"{base_url}/synthetics/tests{qs}"
203
+
204
+ if "/hosts" in path:
205
+ return f"{base_url}/infrastructure{_build_qs(query_params, {'filter', 'query', 'tags'})}"
206
+
207
+ if "/services" in path:
208
+ return f"{base_url}/apm/services{_build_qs(query_params, {'filter', 'query', 'tags'})}"
209
+
210
+ if "/metrics" in path or "/query" in path:
211
+ return f"{base_url}/metrics/explorer{_build_qs(query_params, {'from', 'to', 'query'})}"
212
+
213
+ return f"{base_url}/apm/home"
@@ -3,49 +3,186 @@
3
3
  Tools to search and analyze distributed traces from Datadog APM.
4
4
 
5
5
  ### Available Tools:
6
- - **fetch_datadog_traces** - List traces with filters (service, operation, duration)
7
- - **fetch_datadog_trace_by_id** - Get detailed span hierarchy for a specific trace
8
6
  - **fetch_datadog_spans** - Search spans with Datadog query syntax
7
+ - **aggregate_datadog_spans** - Aggregate span data into buckets and compute metrics
9
8
 
10
9
  ### Common Usage:
11
10
 
12
11
  ```python
13
- # Find slow traces (>5s) for a service
14
- fetch_datadog_traces(service="backend-service", min_duration="5s")
12
+ # Search for errors using Datadog query syntax
13
+ fetch_datadog_spans(query="@http.status_code:500", limit=5)
14
+ fetch_datadog_spans(query="service:api status:error", limit=10)
15
+ ```
15
16
 
16
- # Get trace details showing full span hierarchy
17
- fetch_datadog_trace_by_id(trace_id="6878d11e0000000064837efe7e97f5f8")
17
+ ### Query Patterns:
18
18
 
19
- # Search for errors using Datadog query syntax
20
- fetch_datadog_spans(query="@http.status_code:500")
21
- fetch_datadog_spans(service="api", query="status:error")
19
+ ```python
20
+ # Specific HTTP endpoint (any method)
21
+ fetch_datadog_spans(query="@http.route:/api/orders", limit=5)
22
+
23
+ # HTTP routes containing substring (wildcard search)
24
+ fetch_datadog_spans(query="@http.route:*payment*", limit=5)
25
+
26
+ # Broad search across all span types
27
+ fetch_datadog_spans(query="resource_name:*user*", limit=10)
28
+
29
+ # Errors by service with wildcard
30
+ fetch_datadog_spans(query="service:payment @http.status_code:5*", limit=5)
31
+
32
+ # Database queries with time range (last hour)
33
+ fetch_datadog_spans(
34
+ query="service:postgres @duration:>1000000000",
35
+ start_datetime="-3600", # 1 hour in seconds
36
+ limit=10
37
+ )
38
+
39
+ # Production errors
40
+ fetch_datadog_spans(query="env:production error:true", limit=5)
22
41
 
23
- # Time ranges (default: last hour)
24
- fetch_datadog_traces(
25
- service="api",
26
- start_datetime="-3600", # 1 hour ago
27
- end_datetime="0" # now
42
+ # Specific endpoint pattern with custom time range
43
+ fetch_datadog_spans(
44
+ query='@http.route:*/user/* @http.status_code:>=400',
45
+ start_datetime="-1800", # 30 minutes in seconds
46
+ limit=10
47
+ )
48
+
49
+ # Combining multiple conditions with wildcards
50
+ fetch_datadog_spans(
51
+ query='service:*api* @http.route:*/user/* @http.status_code:[400 TO 599]',
52
+ limit=10
28
53
  )
29
54
  ```
30
55
 
31
- ### Query Examples:
56
+ ### Aggregate Examples:
32
57
 
33
58
  ```python
34
- # Performance issues
35
- fetch_datadog_traces(min_duration="2s", operation="GET /api/products")
59
+ # Count spans grouped by status code (last 15 minutes)
60
+ aggregate_datadog_spans(
61
+ query='resource_name:*api* @http.method:POST',
62
+ compute=[{"aggregation": "count", "type": "total"}],
63
+ group_by=[{"facet": "@http.status_code", "limit": 50}],
64
+ start_datetime="-900" # 15 minutes in seconds
65
+ )
66
+
67
+ # Get average duration by service (last hour)
68
+ aggregate_datadog_spans(
69
+ query='service:*backend* OR service:*api*',
70
+ compute=[{"aggregation": "avg", "metric": "@duration", "type": "total"}],
71
+ group_by=[{"facet": "service", "limit": 50}],
72
+ start_datetime="-3600" # 1 hour in seconds
73
+ )
74
+
75
+ # Get P95 latency timeseries by service
76
+ aggregate_datadog_spans(
77
+ query='@http.route:*/api/* @http.status_code:[200 TO 299]',
78
+ compute=[{
79
+ "aggregation": "pc95",
80
+ "metric": "@duration",
81
+ "type": "timeseries",
82
+ "interval": "5m"
83
+ }],
84
+ group_by=[{"facet": "service", "limit": 50}]
85
+ )
86
+
87
+ # Complex aggregation with histogram
88
+ aggregate_datadog_spans(
89
+ query='resource_name:*product* OR resource_name:*catalog*',
90
+ compute=[
91
+ {"aggregation": "avg", "metric": "@duration", "type": "total"},
92
+ {"aggregation": "count", "type": "total"}
93
+ ],
94
+ group_by=[{
95
+ "facet": "@duration",
96
+ "histogram": {"interval": 100, "min": 0, "max": 1000},
97
+ "limit": 50
98
+ }]
99
+ )
100
+
101
+ # Error rate calculation by endpoint
102
+ aggregate_datadog_spans(
103
+ query='@http.route:* @http.status_code:[400 TO 599]',
104
+ compute=[{"aggregation": "count", "type": "total"}],
105
+ group_by=[
106
+ {"facet": "resource_name", "limit": 50},
107
+ {"facet": "@http.status_code", "limit": 50}
108
+ ]
109
+ )
110
+ ```
111
+
112
+ ### Query Pattern Tips:
113
+
114
+ | Your Goal | Use This Pattern |
115
+ |-----------|------------------|
116
+ | Specific HTTP endpoint, any method | `@http.route:/api/users` |
117
+ | HTTP routes containing substring | `@http.route:*payment*` |
118
+ | Broad search across all span types | `resource_name:*user*` |
119
+ | Service name patterns | `service:*api*` or `service:payment-*` |
120
+ | Multiple wildcards | `@http.route:*/user/*/profile` |
121
+ | Error status codes | `@http.status_code:5*` or `@http.status_code:[400 TO 599]` |
122
+
123
+ ### General Tips:
124
+ - Wildcards (*) can be used in most fields for flexible pattern matching
125
+ - For aggregations: use @-prefixed attributes (e.g., @duration, @http.status_code)
126
+ - Keep fetch_datadog_spans limit low (5-10) to avoid too much data
127
+ - aggregate_datadog_spans can handle higher limits (50+) for group_by facets
128
+
129
+ ### CRITICAL: Cursor Usage Rules
130
+ **NEVER parallelize cursor-based calls or reuse cursor values!**
131
+
132
+ Cursors are stateful pointers - each one is single-use and represents a unique position in the data stream.
133
+
134
+ **WRONG (causes duplicate data):**
135
+ ```
136
+ Batch 1 → cursor_A
137
+ Then call Batch 2, 3, 4 ALL with cursor_A in parallel ❌
138
+ Result: Duplicate data, incomplete results
139
+ ```
36
140
 
37
- # Errors by service
38
- fetch_datadog_spans(service="payment", query="@http.status_code:5*")
141
+ **CORRECT (sequential pagination):**
142
+ ```
143
+ Batch 1 → cursor_A
144
+ Wait for response → use cursor_A for Batch 2 → cursor_B
145
+ Wait for response → use cursor_B for Batch 3 → cursor_C
146
+ Result: Complete unique data ✅
147
+ ```
148
+
149
+ **Key Rules:**
150
+ - Each response provides a NEW cursor for the NEXT request
151
+ - NEVER reuse the same cursor value multiple times
152
+ - NEVER make parallel calls with the same cursor
153
+ - Always wait for response before using the returned cursor
154
+
155
+ ### Compact Mode Strategy:
156
+
157
+ The `compact` parameter reduces output size by returning only essential fields. Use this strategy:
39
158
 
40
- # Database queries
41
- fetch_datadog_spans(query="service:postgres @duration:>1000000000")
159
+ 1. **Initial exploration**: Use compact=true with higher limits (50-100) to get an overview
160
+ 2. **Detailed investigation**: Use compact=false with lower limits (5-10) for specific spans
42
161
 
43
- # With tags
44
- fetch_datadog_spans(tags={"env": "production"}, query="error:true")
162
+ ```python
163
+ # STEP 1: Initial search with compact mode to find patterns
164
+ fetch_datadog_spans(
165
+ query="service:api @http.status_code:5*",
166
+ compact=true,
167
+ limit=100 # Higher limit safe with compact mode
168
+ )
169
+
170
+ # STEP 2: Detailed investigation of specific issues
171
+ fetch_datadog_spans(
172
+ query="service:api @http.status_code:500 resource_name:*/user/*",
173
+ compact=false, # Full details for deep analysis
174
+ limit=10
175
+ )
45
176
  ```
46
177
 
47
- ### Tips:
48
- - Duration units: ms, s, m (e.g., "500ms", "5s", "1m")
49
- - Time: RFC3339 format or negative seconds from now
50
- - Rate limit: 300 requests/hour
51
- - Default time range: 1 hour
178
+ **When to use compact=true:**
179
+ - Initial searches to identify patterns
180
+ - When you need to scan many spans for errors or performance issues
181
+ - When looking for specific span IDs or trace IDs
182
+ - When the full span details aren't needed yet
183
+
184
+ **When to use compact=false (default):**
185
+ - Investigating specific errors
186
+ - Analyzing request/response headers
187
+ - Examining user agent details
188
+ - Debugging authentication issues or HTTP details