holmesgpt 0.16.2a0__py3-none-any.whl → 0.18.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- holmes/__init__.py +3 -5
- holmes/clients/robusta_client.py +4 -3
- holmes/common/env_vars.py +18 -2
- holmes/common/openshift.py +1 -1
- holmes/config.py +11 -6
- holmes/core/conversations.py +30 -13
- holmes/core/investigation.py +21 -25
- holmes/core/investigation_structured_output.py +3 -3
- holmes/core/issue.py +1 -1
- holmes/core/llm.py +50 -31
- holmes/core/models.py +19 -17
- holmes/core/openai_formatting.py +1 -1
- holmes/core/prompt.py +47 -2
- holmes/core/runbooks.py +1 -0
- holmes/core/safeguards.py +4 -2
- holmes/core/supabase_dal.py +4 -2
- holmes/core/tool_calling_llm.py +102 -141
- holmes/core/tools.py +19 -28
- holmes/core/tools_utils/token_counting.py +9 -2
- holmes/core/tools_utils/tool_context_window_limiter.py +13 -30
- holmes/core/tools_utils/tool_executor.py +0 -18
- holmes/core/tools_utils/toolset_utils.py +1 -0
- holmes/core/toolset_manager.py +37 -2
- holmes/core/tracing.py +13 -2
- holmes/core/transformers/__init__.py +1 -1
- holmes/core/transformers/base.py +1 -0
- holmes/core/transformers/llm_summarize.py +3 -2
- holmes/core/transformers/registry.py +2 -1
- holmes/core/transformers/transformer.py +1 -0
- holmes/core/truncation/compaction.py +37 -2
- holmes/core/truncation/input_context_window_limiter.py +3 -2
- holmes/interactive.py +52 -8
- holmes/main.py +17 -37
- holmes/plugins/interfaces.py +2 -1
- holmes/plugins/prompts/__init__.py +2 -1
- holmes/plugins/prompts/_fetch_logs.jinja2 +5 -5
- holmes/plugins/prompts/_runbook_instructions.jinja2 +2 -1
- holmes/plugins/prompts/base_user_prompt.jinja2 +7 -0
- holmes/plugins/prompts/conversation_history_compaction.jinja2 +2 -1
- holmes/plugins/prompts/generic_ask.jinja2 +0 -2
- holmes/plugins/prompts/generic_ask_conversation.jinja2 +0 -2
- holmes/plugins/prompts/generic_ask_for_issue_conversation.jinja2 +0 -2
- holmes/plugins/prompts/generic_investigation.jinja2 +0 -2
- holmes/plugins/prompts/investigation_procedure.jinja2 +2 -1
- holmes/plugins/prompts/kubernetes_workload_ask.jinja2 +0 -2
- holmes/plugins/prompts/kubernetes_workload_chat.jinja2 +0 -2
- holmes/plugins/runbooks/__init__.py +32 -3
- holmes/plugins/sources/github/__init__.py +4 -2
- holmes/plugins/sources/prometheus/models.py +1 -0
- holmes/plugins/toolsets/__init__.py +30 -26
- holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +13 -12
- holmes/plugins/toolsets/azure_sql/apis/alert_monitoring_api.py +3 -2
- holmes/plugins/toolsets/azure_sql/apis/azure_sql_api.py +2 -1
- holmes/plugins/toolsets/azure_sql/apis/connection_failure_api.py +3 -2
- holmes/plugins/toolsets/azure_sql/apis/connection_monitoring_api.py +3 -1
- holmes/plugins/toolsets/azure_sql/apis/storage_analysis_api.py +3 -1
- holmes/plugins/toolsets/azure_sql/azure_sql_toolset.py +12 -12
- holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +7 -7
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +7 -7
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +3 -5
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +3 -3
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +7 -7
- holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +6 -8
- holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +3 -3
- holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +3 -3
- holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +3 -3
- holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +3 -3
- holmes/plugins/toolsets/azure_sql/utils.py +0 -32
- holmes/plugins/toolsets/bash/argocd/__init__.py +3 -3
- holmes/plugins/toolsets/bash/aws/__init__.py +4 -4
- holmes/plugins/toolsets/bash/azure/__init__.py +4 -4
- holmes/plugins/toolsets/bash/bash_toolset.py +2 -3
- holmes/plugins/toolsets/bash/common/bash.py +19 -9
- holmes/plugins/toolsets/bash/common/bash_command.py +1 -1
- holmes/plugins/toolsets/bash/common/stringify.py +1 -1
- holmes/plugins/toolsets/bash/kubectl/__init__.py +2 -1
- holmes/plugins/toolsets/bash/kubectl/constants.py +0 -1
- holmes/plugins/toolsets/bash/kubectl/kubectl_get.py +3 -4
- holmes/plugins/toolsets/bash/parse_command.py +12 -13
- holmes/plugins/toolsets/connectivity_check.py +124 -0
- holmes/plugins/toolsets/coralogix/api.py +132 -119
- holmes/plugins/toolsets/coralogix/coralogix.jinja2 +14 -0
- holmes/plugins/toolsets/coralogix/toolset_coralogix.py +219 -0
- holmes/plugins/toolsets/coralogix/utils.py +15 -79
- holmes/plugins/toolsets/datadog/datadog_api.py +36 -3
- holmes/plugins/toolsets/datadog/datadog_logs_instructions.jinja2 +34 -1
- holmes/plugins/toolsets/datadog/datadog_metrics_instructions.jinja2 +3 -3
- holmes/plugins/toolsets/datadog/datadog_models.py +59 -0
- holmes/plugins/toolsets/datadog/datadog_url_utils.py +213 -0
- holmes/plugins/toolsets/datadog/instructions_datadog_traces.jinja2 +165 -28
- holmes/plugins/toolsets/datadog/toolset_datadog_general.py +71 -28
- holmes/plugins/toolsets/datadog/toolset_datadog_logs.py +224 -375
- holmes/plugins/toolsets/datadog/toolset_datadog_metrics.py +67 -36
- holmes/plugins/toolsets/datadog/toolset_datadog_traces.py +360 -343
- holmes/plugins/toolsets/elasticsearch/__init__.py +6 -0
- holmes/plugins/toolsets/elasticsearch/elasticsearch.py +834 -0
- holmes/plugins/toolsets/git.py +7 -8
- holmes/plugins/toolsets/grafana/base_grafana_toolset.py +16 -4
- holmes/plugins/toolsets/grafana/common.py +2 -30
- holmes/plugins/toolsets/grafana/grafana_tempo_api.py +2 -1
- holmes/plugins/toolsets/grafana/loki/instructions.jinja2 +18 -2
- holmes/plugins/toolsets/grafana/loki/toolset_grafana_loki.py +92 -18
- holmes/plugins/toolsets/grafana/loki_api.py +4 -0
- holmes/plugins/toolsets/grafana/toolset_grafana.py +109 -25
- holmes/plugins/toolsets/grafana/toolset_grafana_dashboard.jinja2 +22 -0
- holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +201 -33
- holmes/plugins/toolsets/grafana/trace_parser.py +3 -2
- holmes/plugins/toolsets/internet/internet.py +10 -10
- holmes/plugins/toolsets/internet/notion.py +5 -6
- holmes/plugins/toolsets/investigator/core_investigation.py +3 -3
- holmes/plugins/toolsets/investigator/model.py +3 -1
- holmes/plugins/toolsets/json_filter_mixin.py +134 -0
- holmes/plugins/toolsets/kafka.py +12 -7
- holmes/plugins/toolsets/kubernetes.yaml +260 -30
- holmes/plugins/toolsets/kubernetes_logs.py +3 -3
- holmes/plugins/toolsets/logging_utils/logging_api.py +16 -6
- holmes/plugins/toolsets/mcp/toolset_mcp.py +88 -60
- holmes/plugins/toolsets/newrelic/new_relic_api.py +41 -1
- holmes/plugins/toolsets/newrelic/newrelic.jinja2 +24 -0
- holmes/plugins/toolsets/newrelic/newrelic.py +212 -55
- holmes/plugins/toolsets/prometheus/prometheus.py +358 -102
- holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2 +11 -3
- holmes/plugins/toolsets/rabbitmq/api.py +23 -4
- holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +5 -5
- holmes/plugins/toolsets/robusta/robusta.py +5 -5
- holmes/plugins/toolsets/runbook/runbook_fetcher.py +25 -6
- holmes/plugins/toolsets/servicenow_tables/servicenow_tables.py +1 -1
- holmes/plugins/toolsets/utils.py +1 -1
- holmes/utils/config_utils.py +1 -1
- holmes/utils/connection_utils.py +31 -0
- holmes/utils/console/result.py +10 -0
- holmes/utils/file_utils.py +2 -1
- holmes/utils/global_instructions.py +10 -26
- holmes/utils/holmes_status.py +4 -3
- holmes/utils/log.py +15 -0
- holmes/utils/markdown_utils.py +2 -3
- holmes/utils/memory_limit.py +58 -0
- holmes/utils/sentry_helper.py +23 -0
- holmes/utils/stream.py +12 -5
- holmes/utils/tags.py +4 -3
- holmes/version.py +3 -1
- {holmesgpt-0.16.2a0.dist-info → holmesgpt-0.18.4.dist-info}/METADATA +12 -10
- holmesgpt-0.18.4.dist-info/RECORD +258 -0
- holmes/plugins/toolsets/aws.yaml +0 -80
- holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +0 -114
- holmes/plugins/toolsets/datadog/datadog_traces_formatter.py +0 -310
- holmes/plugins/toolsets/datadog/toolset_datadog_rds.py +0 -736
- holmes/plugins/toolsets/grafana/grafana_api.py +0 -64
- holmes/plugins/toolsets/opensearch/__init__.py +0 -0
- holmes/plugins/toolsets/opensearch/opensearch.py +0 -250
- holmes/plugins/toolsets/opensearch/opensearch_logs.py +0 -161
- holmes/plugins/toolsets/opensearch/opensearch_traces.py +0 -215
- holmes/plugins/toolsets/opensearch/opensearch_traces_instructions.jinja2 +0 -12
- holmes/plugins/toolsets/opensearch/opensearch_utils.py +0 -166
- holmes/utils/keygen_utils.py +0 -6
- holmesgpt-0.16.2a0.dist-info/RECORD +0 -258
- holmes/plugins/toolsets/{opensearch → elasticsearch}/opensearch_ppl_query_docs.jinja2 +0 -0
- holmes/plugins/toolsets/{opensearch → elasticsearch}/opensearch_query_assist.py +2 -2
- holmes/plugins/toolsets/{opensearch → elasticsearch}/opensearch_query_assist_instructions.jinja2 +0 -0
- {holmesgpt-0.16.2a0.dist-info → holmesgpt-0.18.4.dist-info}/LICENSE +0 -0
- {holmesgpt-0.16.2a0.dist-info → holmesgpt-0.18.4.dist-info}/WHEEL +0 -0
- {holmesgpt-0.16.2a0.dist-info → holmesgpt-0.18.4.dist-info}/entry_points.txt +0 -0
--- a/holmes/plugins/toolsets/datadog/datadog_api.py
+++ b/holmes/plugins/toolsets/datadog/datadog_api.py
@@ -1,16 +1,17 @@
 import json
 import logging
 import re
+import threading
 from datetime import datetime, timedelta, timezone
-from typing import Any, Optional,
+from typing import Any, Dict, Optional, Tuple, Union
 from urllib.parse import urlparse, urlunparse
+
 import requests  # type: ignore
 from pydantic import AnyUrl, BaseModel
 from requests.structures import CaseInsensitiveDict  # type: ignore
 from tenacity import retry, retry_if_exception, stop_after_attempt, wait_incrementing
 from tenacity.wait import wait_base
 
-
 START_RETRY_DELAY = (
     5.0  # Initial fallback delay if datadog does not return a reset_time
 )
@@ -22,6 +23,9 @@ RATE_LIMIT_REMAINING_SECONDS_HEADER = "X-RateLimit-Reset"
 # Cache for OpenAPI spec
 _openapi_spec_cache: Dict[str, Any] = {}
 
+# Global lock for Datadog API requests to prevent concurrent calls
+_datadog_request_lock = threading.Lock()
+
 # Relative time pattern (m = minutes, mo = months)
 RELATIVE_TIME_PATTERN = re.compile(r"^-?(\d+)([hdwsy]|min|m|mo)$|^now$", re.IGNORECASE)
 
@@ -237,6 +241,35 @@ def execute_datadog_http_request(
     payload_or_params: dict,
     timeout: int,
     method: str = "POST",
+) -> Any:
+    # from my limited testing doing 1 just request at a time is faster because the RATE_LIMIT_REMAINING_SECONDS_HEADER is shorter
+    # Serialize all Datadog API requests to avoid rate limits
+    with _datadog_request_lock:
+        return execute_datadog_http_request_with_retries(
+            url, headers, payload_or_params, timeout, method
+        )
+
+
+@retry(
+    retry=retry_if_http_429_error(),
+    wait=wait_for_retry_after_header(
+        fallback=wait_incrementing(
+            start=START_RETRY_DELAY, increment=INCREMENT_RETRY_DELAY
+        )
+    ),
+    stop=stop_after_attempt(MAX_RETRY_COUNT_ON_RATE_LIMIT),
+    before_sleep=lambda retry_state: logging.warning(
+        f"DataDog API rate limited. Retrying... "
+        f"(attempt {retry_state.attempt_number}/{MAX_RETRY_COUNT_ON_RATE_LIMIT})"
+    ),
+    reraise=True,
+)
+def execute_datadog_http_request_with_retries(
+    url: str,
+    headers: dict,
+    payload_or_params: dict,
+    timeout: int,
+    method: str,
 ) -> Any:
     logging.debug(
         f"Datadog API Request: Method: {method} URL: {url} Headers: {json.dumps(sanitize_headers(headers), indent=2)} {'Params' if method == 'GET' else 'Payload'}: {json.dumps(payload_or_params, indent=2)} Timeout: {timeout}s"
@@ -261,7 +294,7 @@ def execute_datadog_http_request(
         return response_data
 
     else:
-        logging.
+        logging.debug(f"Error Response Body: {response.text}")
         raise DataDogRequestError(
             payload=payload_or_params,
             status_code=response.status_code,
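The new request path serializes every Datadog call through `_datadog_request_lock` and retries HTTP 429 inside the lock via tenacity. A minimal standalone sketch of that lock-plus-retry pattern, substituting a plain 429 predicate and an incrementing wait for the custom `retry_if_http_429_error` and `wait_for_retry_after_header` helpers defined elsewhere in this module:

```python
import threading

import requests
from tenacity import retry, retry_if_exception, stop_after_attempt, wait_incrementing

_request_lock = threading.Lock()  # one in-flight request at a time, process-wide


def _is_rate_limited(exc: BaseException) -> bool:
    # True only for HTTP 429 responses; other errors are not retried.
    return (
        isinstance(exc, requests.HTTPError)
        and exc.response is not None
        and exc.response.status_code == 429
    )


@retry(
    retry=retry_if_exception(_is_rate_limited),
    wait=wait_incrementing(start=5.0, increment=5.0),  # fallback-style backoff
    stop=stop_after_attempt(5),
    reraise=True,
)
def _get_with_retries(url: str) -> requests.Response:
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()
    return resp


def fetch(url: str) -> requests.Response:
    # Retries run inside the lock, mirroring the diff: the lock is held for the
    # whole call so no other request can trip the rate limiter meanwhile.
    with _request_lock:
        return _get_with_retries(url)
```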
--- a/holmes/plugins/toolsets/datadog/datadog_logs_instructions.jinja2
+++ b/holmes/plugins/toolsets/datadog/datadog_logs_instructions.jinja2
@@ -44,7 +44,6 @@ Before running logs queries:
 ### Time Parameters
 - Use RFC3339 format: `2023-03-01T10:30:00Z`
 - Or relative seconds: `-3600` for 1 hour ago
-- Defaults to 1 hour window if not specified
 
 ### Common Investigation Patterns
 
@@ -52,3 +51,37 @@ Before running logs queries:
 1. User asks: "Show logs for my-workload"
 2. Use `kubectl_find_resource` → find pod "my-workload-abc123-xyz"
 3. Query Datadog for pod "my-workload-abc123-xyz" logs
+
+
+### Search Query Guidelines
+
+1. Avoid using @timestamp Attribute in the search queries (e.g for example @timestamp:[2025-12-10T01:00:00.000Z TO 2025-12-10T04:00:00.000Z)
+   Rely on the fetch_datadog_logs function start_datetime and end_datetime parameters for that.
+2. Datadog default TAGS for kubernetes are *kube_namespace* and *pod_name*, if a user specificy custom TAGS used in his environment please use them in your search queries.
+3. If you see a useful TAG in your Old fetch_datadog_logs query use it for further queries.
+
+### CRITICAL: Cursor Usage Rules
+**NEVER parallelize cursor-based calls or reuse cursor values!**
+
+Cursors are stateful pointers - each one is single-use and represents a unique position in the data stream.
+
+**WRONG (causes duplicate data):**
+```
+Batch 1 → cursor_A
+Then call Batch 2, 3, 4 ALL with cursor_A in parallel ❌
+Result: Duplicate data, incomplete results
+```
+
+**CORRECT (sequential pagination):**
+```
+Batch 1 → cursor_A
+Wait for response → use cursor_A for Batch 2 → cursor_B
+Wait for response → use cursor_B for Batch 3 → cursor_C
+Result: Complete unique data ✅
+```
+
+**Key Rules:**
+- Each response provides a NEW cursor for the NEXT request
+- NEVER reuse the same cursor value multiple times
+- NEVER make parallel calls with the same cursor
+- Always wait for response before using the returned cursor
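The cursor rules added above prescribe strictly sequential pagination. A minimal sketch of the loop they describe, assuming a hypothetical `fetch_datadog_logs(query, cursor=None)` callable that returns a dict with a `logs` list and an optional `next_cursor`:

```python
def fetch_all_logs(fetch_datadog_logs, query: str, max_batches: int = 10) -> list:
    logs: list = []
    cursor = None
    for _ in range(max_batches):
        # One request at a time: never reuse a cursor or call in parallel.
        page = fetch_datadog_logs(query=query, cursor=cursor)
        logs.extend(page["logs"])
        cursor = page.get("next_cursor")  # each response yields a NEW cursor
        if not cursor:  # no cursor means the final page was reached
            break
    return logs
```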
--- a/holmes/plugins/toolsets/datadog/datadog_metrics_instructions.jinja2
+++ b/holmes/plugins/toolsets/datadog/datadog_metrics_instructions.jinja2
@@ -73,7 +73,7 @@ When investigating metrics-related issues:
 
 # Handling queries results
 * ALWAYS embed the execution results into your answer
-* You only need to embed the partial result in your response. Include the "tool_name" and "
+* You only need to embed the partial result in your response. Include the "tool_name" and "tool_call_id". For example: << {"type": "datadogql", "tool_name": "query_datadog_metrics", "tool_call_id": "92jf2hf"} >>
 * Post processing will parse your response, re-run the query from the tool output and create a chart visible to the user
 * You MUST ensure that the query is successful.
 * ALWAYS embed a DataDog graph in the response. The graph should visualize data related to the incident.
@@ -81,6 +81,6 @@ When investigating metrics-related issues:
 * When embedding multiple graphs, always add line spacing between them
 For example:
 
-<<{"type": "datadogql", "tool_name": "query_datadog_metrics", "
+<<{"type": "datadogql", "tool_name": "query_datadog_metrics", "tool_call_id": "lBaA"}>>
 
-<<{"type": "datadogql", "tool_name": "query_datadog_metrics", "
+<<{"type": "datadogql", "tool_name": "query_datadog_metrics", "tool_call_id": "IKtq"}>>
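These completions define the embedding protocol: the model inlines `<< {...} >>` markers carrying `tool_name` and `tool_call_id`, and a post-processing step re-runs the referenced query to render a chart. The post-processor itself is not part of this diff; as a hedged illustration, extracting the markers could look like:

```python
import json
import re

# Matches << {...} >> embeds; DOTALL lets the JSON span multiple lines.
EMBED_RE = re.compile(r"<<\s*(\{.*?\})\s*>>", re.DOTALL)


def extract_embeds(response_text: str) -> list:
    embeds = []
    for match in EMBED_RE.finditer(response_text):
        embed = json.loads(match.group(1))
        if embed.get("type") == "datadogql":
            embeds.append(embed)  # carries "tool_name" and "tool_call_id"
    return embeds


print(extract_embeds('<<{"type": "datadogql", "tool_name": "query_datadog_metrics", "tool_call_id": "lBaA"}>>'))
```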
--- /dev/null
+++ b/holmes/plugins/toolsets/datadog/datadog_models.py
@@ -0,0 +1,59 @@
+from enum import Enum
+
+from pydantic import Field
+
+from holmes.plugins.toolsets.datadog.datadog_api import DatadogBaseConfig
+from holmes.plugins.toolsets.logging_utils.logging_api import DEFAULT_LOG_LIMIT
+
+# Constants for RDS toolset
+DEFAULT_TIME_SPAN_SECONDS = 3600
+DEFAULT_TOP_INSTANCES = 10
+
+# Constants for general toolset
+MAX_RESPONSE_SIZE = 10 * 1024 * 1024  # 10MB
+
+
+class DataDogStorageTier(str, Enum):
+    """Storage tier enum for Datadog logs."""
+
+    INDEXES = "indexes"
+    ONLINE_ARCHIVES = "online-archives"
+    FLEX = "flex"
+
+
+# Constants for logs toolset
+DEFAULT_STORAGE_TIERS = [DataDogStorageTier.INDEXES]
+
+
+class DatadogMetricsConfig(DatadogBaseConfig):
+    """Configuration for Datadog metrics toolset."""
+
+    default_limit: int = DEFAULT_LOG_LIMIT
+
+
+class DatadogTracesConfig(DatadogBaseConfig):
+    """Configuration for Datadog traces toolset."""
+
+    indexes: list[str] = ["*"]
+
+
+class DatadogLogsConfig(DatadogBaseConfig):
+    """Configuration for Datadog logs toolset."""
+
+    indexes: list[str] = ["*"]
+    # TODO storage tier just works with first element. need to add support for multi stoarge tiers.
+    storage_tiers: list[DataDogStorageTier] = Field(
+        default_factory=lambda: [DataDogStorageTier.INDEXES], min_length=1
+    )
+
+    compact_logs: bool = True
+    default_limit: int = DEFAULT_LOG_LIMIT
+
+
+class DatadogGeneralConfig(DatadogBaseConfig):
+    """Configuration for general-purpose Datadog toolset."""
+
+    max_response_size: int = MAX_RESPONSE_SIZE
+    allow_custom_endpoints: bool = (
+        False  # If True, allows endpoints not in whitelist (still filtered for safety)
+    )
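These models split the Datadog configuration into per-toolset Pydantic classes. A usage sketch, assuming the `holmes` package is importable; `site_api_url` is the field the URL helpers below read, and any required credential fields live on `DatadogBaseConfig`, which is not shown in this diff and would also need to be supplied:

```python
from holmes.plugins.toolsets.datadog.datadog_models import (
    DataDogStorageTier,
    DatadogLogsConfig,
)

# Field values are illustrative. storage_tiers must be non-empty
# (min_length=1), and per the TODO only its first element is honored.
config = DatadogLogsConfig(
    site_api_url="https://api.datadoghq.com",
    indexes=["main"],  # anything other than ["*"] is joined into the logs URL
    storage_tiers=[DataDogStorageTier.INDEXES],
    compact_logs=True,
)
```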
--- /dev/null
+++ b/holmes/plugins/toolsets/datadog/datadog_url_utils.py
@@ -0,0 +1,213 @@
+import re
+from typing import Any, Dict, Optional
+from urllib.parse import urlencode, urlparse
+
+from holmes.plugins.toolsets.datadog.datadog_api import convert_api_url_to_app_url
+from holmes.plugins.toolsets.datadog.datadog_models import (
+    DatadogGeneralConfig,
+    DatadogLogsConfig,
+    DatadogMetricsConfig,
+    DatadogTracesConfig,
+)
+
+
+def generate_datadog_metrics_explorer_url(
+    dd_config: DatadogMetricsConfig,
+    query: str,
+    from_time: int,
+    to_time: int,
+) -> str:
+    base_url = convert_api_url_to_app_url(dd_config.site_api_url)
+
+    params = {
+        "query": query,
+        "from_ts": from_time * 1000,  # seconds -> ms
+        "to_ts": to_time * 1000,  # seconds -> ms
+        "live": "true",
+    }
+
+    return f"{base_url}/metric/explorer?{urlencode(params)}"
+
+
+def generate_datadog_metrics_list_url(
+    dd_config: DatadogMetricsConfig,
+    from_time: int,
+    host: Optional[str] = None,
+    tag_filter: Optional[str] = None,
+    metric_filter: Optional[str] = None,
+) -> str:
+    base_url = convert_api_url_to_app_url(dd_config.site_api_url)
+
+    params = {}
+    if metric_filter:
+        params["filter"] = metric_filter
+
+    if host:
+        params["host"] = host
+    if tag_filter:
+        params["tag_filter"] = tag_filter
+
+    qs = urlencode(params) if params else ""
+    return f"{base_url}/metric/summary" + (f"?{qs}" if qs else "")
+
+
+def generate_datadog_metric_metadata_url(
+    dd_config: DatadogMetricsConfig,
+    metric_name: str,
+) -> str:
+    base_url = convert_api_url_to_app_url(dd_config.site_api_url)
+    params = {"metric": metric_name}
+    return f"{base_url}/metric/summary?{urlencode(params)}"
+
+
+def generate_datadog_metric_tags_url(
+    dd_config: DatadogMetricsConfig,
+    metric_name: str,
+) -> str:
+    base_url = convert_api_url_to_app_url(dd_config.site_api_url)
+    params = {"metric": metric_name}
+    return f"{base_url}/metric/summary?{urlencode(params)}"
+
+
+def generate_datadog_spans_url(
+    dd_config: DatadogTracesConfig,
+    query: str,
+    from_time_ms: int,
+    to_time_ms: int,
+) -> str:
+    base_url = convert_api_url_to_app_url(dd_config.site_api_url)
+
+    url_params = {
+        "query": query,
+        "from_ts": from_time_ms,
+        "to_ts": to_time_ms,
+        "live": "true",
+    }
+
+    return f"{base_url}/apm/traces?{urlencode(url_params)}"
+
+
+def generate_datadog_spans_analytics_url(
+    dd_config: DatadogTracesConfig,
+    query: str,
+    from_time_ms: int,
+    to_time_ms: int,
+) -> str:
+    base_url = convert_api_url_to_app_url(dd_config.site_api_url)
+
+    url_params = {
+        "query": query,
+        "from_ts": from_time_ms,
+        "to_ts": to_time_ms,
+        "live": "true",
+    }
+
+    return f"{base_url}/apm/analytics?{urlencode(url_params)}"
+
+
+def generate_datadog_logs_url(
+    dd_config: DatadogLogsConfig,
+    params: dict,
+) -> str:
+    base_url = convert_api_url_to_app_url(dd_config.site_api_url)
+    url_params = {
+        "query": params["filter"]["query"],
+        "from_ts": params["filter"]["from"],
+        "to_ts": params["filter"]["to"],
+        "live": "true",
+        "storage": params["filter"]["storage_tier"],
+    }
+
+    if dd_config.indexes != ["*"]:
+        url_params["index"] = ",".join(dd_config.indexes)
+
+    # Construct the full URL
+    return f"{base_url}/logs?{urlencode(url_params)}"
+
+
+def _build_qs(
+    query_params: Optional[Dict[str, Any]], allowed: Optional[set] = None
+) -> str:
+    if not query_params:
+        return ""
+    allowed = allowed or {
+        "filter",
+        "query",
+        "tags",
+        "status",
+        "start",
+        "end",
+        "from",
+        "to",
+    }
+    url_params = {}
+    for k, v in query_params.items():
+        if k not in allowed or v is None:
+            continue
+        if k in ("start", "from"):
+            url_params["from_ts"] = v * 1000
+        elif k in ("end", "to"):
+            url_params["to_ts"] = v * 1000
+        elif k in ("query", "filter", "tags"):
+            url_params["q"] = v
+        else:
+            url_params[k] = v
+    qs = urlencode(url_params) if url_params else ""
+    return f"?{qs}" if qs else ""
+
+
+def generate_datadog_general_url(
+    dd_config: DatadogGeneralConfig,
+    endpoint: str,
+    query_params: Optional[Dict[str, Any]] = None,
+) -> Optional[str]:
+    base_url = convert_api_url_to_app_url(dd_config.site_api_url)
+    path = urlparse(endpoint).path
+
+    if "/logs" in path:
+        return f"{base_url}/logs{_build_qs(query_params, {'start', 'end'})}"
+
+    if "/monitor" in path:
+        qs = _build_qs(query_params, {"filter", "query", "tags", "status"})
+        monitor_id_match = re.search(r"/monitor/(\d+)", path)
+        if monitor_id_match:
+            return f"{base_url}/monitors/{monitor_id_match.group(1)}{qs}"
+        return f"{base_url}/monitors{qs}"
+
+    if "/dashboard" in path:
+        qs = _build_qs(query_params, {"filter", "query", "tags"})
+        if re.match(r"^/api/v\d+/dashboard/[^/]+", path):
+            return f"{base_url}/dashboard/{path.split('/')[-1]}{qs}"
+        return f"{base_url}/dashboard{qs}"
+
+    if "/slo" in path:
+        qs = _build_qs(query_params, {"filter", "query", "tags"})
+        if re.match(r"^/api/v\d+/slo/[^/]+", path):
+            return f"{base_url}/slo/{path.split('/')[-1]}{qs}"
+        return f"{base_url}/slo{qs}"
+
+    if "/events" in path:
+        return f"{base_url}/events{_build_qs(query_params, {'start', 'end'})}"
+
+    if "/incidents" in path:
+        qs = _build_qs(query_params, {"filter", "query", "status"})
+        if re.match(r"^/api/v\d+/incidents/[^/]+", path):
+            return f"{base_url}/incidents/{path.split('/')[-1]}{qs}"
+        return f"{base_url}/incidents{qs}"
+
+    if "/synthetics" in path:
+        qs = _build_qs(query_params, {"filter", "query", "tags", "status"})
+        if re.match(r"^/api/v\d+/synthetics/tests/[^/]+", path):
+            return f"{base_url}/synthetics/tests/{path.split('/')[-1]}{qs}"
+        return f"{base_url}/synthetics/tests{qs}"
+
+    if "/hosts" in path:
+        return f"{base_url}/infrastructure{_build_qs(query_params, {'filter', 'query', 'tags'})}"
+
+    if "/services" in path:
+        return f"{base_url}/apm/services{_build_qs(query_params, {'filter', 'query', 'tags'})}"
+
+    if "/metrics" in path or "/query" in path:
+        return f"{base_url}/metrics/explorer{_build_qs(query_params, {'from', 'to', 'query'})}"
+
+    return f"{base_url}/apm/home"
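Every helper above follows the same shape: map the API base URL to the app URL, convert epoch seconds to milliseconds where needed, and `urlencode` the parameters. A standalone approximation of `generate_datadog_metrics_explorer_url`, hardcoding the app URL instead of calling `convert_api_url_to_app_url` (which lives in `datadog_api.py`):

```python
from urllib.parse import urlencode

# Stand-in for convert_api_url_to_app_url("https://api.datadoghq.com").
base_url = "https://app.datadoghq.com"

params = {
    "query": "avg:system.cpu.user{*}",
    "from_ts": 1_700_000_000 * 1000,  # helpers take seconds; the app expects ms
    "to_ts": 1_700_003_600 * 1000,
    "live": "true",
}

# e.g. https://app.datadoghq.com/metric/explorer?query=avg%3Asystem.cpu.user%7B%2A%7D&from_ts=...
print(f"{base_url}/metric/explorer?{urlencode(params)}")
```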
--- a/holmes/plugins/toolsets/datadog/instructions_datadog_traces.jinja2
+++ b/holmes/plugins/toolsets/datadog/instructions_datadog_traces.jinja2
@@ -3,49 +3,186 @@
 Tools to search and analyze distributed traces from Datadog APM.
 
 ### Available Tools:
-- **fetch_datadog_traces** - List traces with filters (service, operation, duration)
-- **fetch_datadog_trace_by_id** - Get detailed span hierarchy for a specific trace
 - **fetch_datadog_spans** - Search spans with Datadog query syntax
+- **aggregate_datadog_spans** - Aggregate span data into buckets and compute metrics
 
 ### Common Usage:
 
 ```python
-#
-
+# Search for errors using Datadog query syntax
+fetch_datadog_spans(query="@http.status_code:500", limit=5)
+fetch_datadog_spans(query="service:api status:error", limit=10)
+```
 
-
-fetch_datadog_trace_by_id(trace_id="6878d11e0000000064837efe7e97f5f8")
+### Query Patterns:
 
-
-
-fetch_datadog_spans(
+```python
+# Specific HTTP endpoint (any method)
+fetch_datadog_spans(query="@http.route:/api/orders", limit=5)
+
+# HTTP routes containing substring (wildcard search)
+fetch_datadog_spans(query="@http.route:*payment*", limit=5)
+
+# Broad search across all span types
+fetch_datadog_spans(query="resource_name:*user*", limit=10)
+
+# Errors by service with wildcard
+fetch_datadog_spans(query="service:payment @http.status_code:5*", limit=5)
+
+# Database queries with time range (last hour)
+fetch_datadog_spans(
+    query="service:postgres @duration:>1000000000",
+    start_datetime="-3600",  # 1 hour in seconds
+    limit=10
+)
+
+# Production errors
+fetch_datadog_spans(query="env:production error:true", limit=5)
 
-#
-
-
-    start_datetime="-
-
+# Specific endpoint pattern with custom time range
+fetch_datadog_spans(
+    query='@http.route:*/user/* @http.status_code:>=400',
+    start_datetime="-1800",  # 30 minutes in seconds
+    limit=10
+)
+
+# Combining multiple conditions with wildcards
+fetch_datadog_spans(
+    query='service:*api* @http.route:*/user/* @http.status_code:[400 TO 599]',
+    limit=10
 )
 ```
 
-###
+### Aggregate Examples:
 
 ```python
-#
-
+# Count spans grouped by status code (last 15 minutes)
+aggregate_datadog_spans(
+    query='resource_name:*api* @http.method:POST',
+    compute=[{"aggregation": "count", "type": "total"}],
+    group_by=[{"facet": "@http.status_code", "limit": 50}],
+    start_datetime="-900"  # 15 minutes in seconds
+)
+
+# Get average duration by service (last hour)
+aggregate_datadog_spans(
+    query='service:*backend* OR service:*api*',
+    compute=[{"aggregation": "avg", "metric": "@duration", "type": "total"}],
+    group_by=[{"facet": "service", "limit": 50}],
+    start_datetime="-3600"  # 1 hour in seconds
+)
+
+# Get P95 latency timeseries by service
+aggregate_datadog_spans(
+    query='@http.route:*/api/* @http.status_code:[200 TO 299]',
+    compute=[{
+        "aggregation": "pc95",
+        "metric": "@duration",
+        "type": "timeseries",
+        "interval": "5m"
+    }],
+    group_by=[{"facet": "service", "limit": 50}]
+)
+
+# Complex aggregation with histogram
+aggregate_datadog_spans(
+    query='resource_name:*product* OR resource_name:*catalog*',
+    compute=[
+        {"aggregation": "avg", "metric": "@duration", "type": "total"},
+        {"aggregation": "count", "type": "total"}
+    ],
+    group_by=[{
+        "facet": "@duration",
+        "histogram": {"interval": 100, "min": 0, "max": 1000},
+        "limit": 50
+    }]
+)
+
+# Error rate calculation by endpoint
+aggregate_datadog_spans(
+    query='@http.route:* @http.status_code:[400 TO 599]',
+    compute=[{"aggregation": "count", "type": "total"}],
+    group_by=[
+        {"facet": "resource_name", "limit": 50},
+        {"facet": "@http.status_code", "limit": 50}
+    ]
+)
+```
+
+### Query Pattern Tips:
+
+| Your Goal | Use This Pattern |
+|-----------|------------------|
+| Specific HTTP endpoint, any method | `@http.route:/api/users` |
+| HTTP routes containing substring | `@http.route:*payment*` |
+| Broad search across all span types | `resource_name:*user*` |
+| Service name patterns | `service:*api*` or `service:payment-*` |
+| Multiple wildcards | `@http.route:*/user/*/profile` |
+| Error status codes | `@http.status_code:5*` or `@http.status_code:[400 TO 599]` |
+
+### General Tips:
+- Wildcards (*) can be used in most fields for flexible pattern matching
+- For aggregations: use @-prefixed attributes (e.g., @duration, @http.status_code)
+- Keep fetch_datadog_spans limit low (5-10) to avoid too much data
+- aggregate_datadog_spans can handle higher limits (50+) for group_by facets
+
+### CRITICAL: Cursor Usage Rules
+**NEVER parallelize cursor-based calls or reuse cursor values!**
+
+Cursors are stateful pointers - each one is single-use and represents a unique position in the data stream.
+
+**WRONG (causes duplicate data):**
+```
+Batch 1 → cursor_A
+Then call Batch 2, 3, 4 ALL with cursor_A in parallel ❌
+Result: Duplicate data, incomplete results
+```
 
-
-
+**CORRECT (sequential pagination):**
+```
+Batch 1 → cursor_A
+Wait for response → use cursor_A for Batch 2 → cursor_B
+Wait for response → use cursor_B for Batch 3 → cursor_C
+Result: Complete unique data ✅
+```
+
+**Key Rules:**
+- Each response provides a NEW cursor for the NEXT request
+- NEVER reuse the same cursor value multiple times
+- NEVER make parallel calls with the same cursor
+- Always wait for response before using the returned cursor
+
+### Compact Mode Strategy:
+
+The `compact` parameter reduces output size by returning only essential fields. Use this strategy:
 
-
-
+1. **Initial exploration**: Use compact=true with higher limits (50-100) to get an overview
+2. **Detailed investigation**: Use compact=false with lower limits (5-10) for specific spans
 
-
-
+```python
+# STEP 1: Initial search with compact mode to find patterns
+fetch_datadog_spans(
+    query="service:api @http.status_code:5*",
+    compact=true,
+    limit=100  # Higher limit safe with compact mode
+)
+
+# STEP 2: Detailed investigation of specific issues
+fetch_datadog_spans(
+    query="service:api @http.status_code:500 resource_name:*/user/*",
+    compact=false,  # Full details for deep analysis
+    limit=10
+)
 ```
 
-
--
--
--
--
+**When to use compact=true:**
+- Initial searches to identify patterns
+- When you need to scan many spans for errors or performance issues
+- When looking for specific span IDs or trace IDs
+- When the full span details aren't needed yet
+
+**When to use compact=false (default):**
+- Investigating specific errors
+- Analyzing request/response headers
+- Examining user agent details
+- Debugging authentication issues or HTTP details