holmesgpt 0.13.0__py3-none-any.whl → 0.13.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- holmes/__init__.py +1 -1
- holmes/common/env_vars.py +11 -0
- holmes/config.py +3 -1
- holmes/core/conversations.py +0 -11
- holmes/core/investigation.py +0 -6
- holmes/core/llm.py +63 -2
- holmes/core/prompt.py +0 -2
- holmes/core/supabase_dal.py +2 -2
- holmes/core/todo_tasks_formatter.py +51 -0
- holmes/core/tool_calling_llm.py +277 -101
- holmes/core/tools.py +20 -4
- holmes/core/toolset_manager.py +1 -5
- holmes/core/tracing.py +1 -1
- holmes/interactive.py +63 -2
- holmes/main.py +7 -2
- holmes/plugins/prompts/_fetch_logs.jinja2 +4 -0
- holmes/plugins/prompts/_general_instructions.jinja2 +3 -1
- holmes/plugins/prompts/investigation_procedure.jinja2 +3 -13
- holmes/plugins/runbooks/CLAUDE.md +85 -0
- holmes/plugins/runbooks/README.md +24 -0
- holmes/plugins/toolsets/__init__.py +5 -1
- holmes/plugins/toolsets/argocd.yaml +1 -1
- holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +18 -6
- holmes/plugins/toolsets/aws.yaml +9 -5
- holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +3 -1
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +3 -1
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +3 -1
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +3 -1
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +3 -1
- holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +3 -1
- holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +3 -1
- holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +3 -1
- holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +3 -1
- holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +3 -1
- holmes/plugins/toolsets/bash/argocd/__init__.py +65 -0
- holmes/plugins/toolsets/bash/argocd/constants.py +120 -0
- holmes/plugins/toolsets/bash/aws/__init__.py +66 -0
- holmes/plugins/toolsets/bash/aws/constants.py +529 -0
- holmes/plugins/toolsets/bash/azure/__init__.py +56 -0
- holmes/plugins/toolsets/bash/azure/constants.py +339 -0
- holmes/plugins/toolsets/bash/bash_instructions.jinja2 +6 -7
- holmes/plugins/toolsets/bash/bash_toolset.py +62 -17
- holmes/plugins/toolsets/bash/common/bash_command.py +131 -0
- holmes/plugins/toolsets/bash/common/stringify.py +14 -1
- holmes/plugins/toolsets/bash/common/validators.py +91 -0
- holmes/plugins/toolsets/bash/docker/__init__.py +59 -0
- holmes/plugins/toolsets/bash/docker/constants.py +255 -0
- holmes/plugins/toolsets/bash/helm/__init__.py +61 -0
- holmes/plugins/toolsets/bash/helm/constants.py +92 -0
- holmes/plugins/toolsets/bash/kubectl/__init__.py +80 -79
- holmes/plugins/toolsets/bash/kubectl/constants.py +0 -14
- holmes/plugins/toolsets/bash/kubectl/kubectl_describe.py +38 -56
- holmes/plugins/toolsets/bash/kubectl/kubectl_events.py +28 -76
- holmes/plugins/toolsets/bash/kubectl/kubectl_get.py +39 -99
- holmes/plugins/toolsets/bash/kubectl/kubectl_logs.py +34 -15
- holmes/plugins/toolsets/bash/kubectl/kubectl_run.py +1 -1
- holmes/plugins/toolsets/bash/kubectl/kubectl_top.py +38 -77
- holmes/plugins/toolsets/bash/parse_command.py +106 -32
- holmes/plugins/toolsets/bash/utilities/__init__.py +0 -0
- holmes/plugins/toolsets/bash/utilities/base64_util.py +12 -0
- holmes/plugins/toolsets/bash/utilities/cut.py +12 -0
- holmes/plugins/toolsets/bash/utilities/grep/__init__.py +10 -0
- holmes/plugins/toolsets/bash/utilities/head.py +12 -0
- holmes/plugins/toolsets/bash/utilities/jq.py +79 -0
- holmes/plugins/toolsets/bash/utilities/sed.py +164 -0
- holmes/plugins/toolsets/bash/utilities/sort.py +15 -0
- holmes/plugins/toolsets/bash/utilities/tail.py +12 -0
- holmes/plugins/toolsets/bash/utilities/tr.py +57 -0
- holmes/plugins/toolsets/bash/utilities/uniq.py +12 -0
- holmes/plugins/toolsets/bash/utilities/wc.py +12 -0
- holmes/plugins/toolsets/confluence.yaml +1 -1
- holmes/plugins/toolsets/coralogix/api.py +3 -1
- holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +4 -4
- holmes/plugins/toolsets/coralogix/utils.py +41 -14
- holmes/plugins/toolsets/datadog/datadog_api.py +45 -2
- holmes/plugins/toolsets/datadog/datadog_general_instructions.jinja2 +208 -0
- holmes/plugins/toolsets/datadog/datadog_logs_instructions.jinja2 +43 -0
- holmes/plugins/toolsets/datadog/datadog_metrics_instructions.jinja2 +12 -9
- holmes/plugins/toolsets/datadog/toolset_datadog_general.py +722 -0
- holmes/plugins/toolsets/datadog/toolset_datadog_logs.py +17 -6
- holmes/plugins/toolsets/datadog/toolset_datadog_metrics.py +15 -7
- holmes/plugins/toolsets/datadog/toolset_datadog_rds.py +6 -2
- holmes/plugins/toolsets/datadog/toolset_datadog_traces.py +9 -3
- holmes/plugins/toolsets/docker.yaml +1 -1
- holmes/plugins/toolsets/git.py +15 -5
- holmes/plugins/toolsets/grafana/toolset_grafana.py +25 -4
- holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +4 -4
- holmes/plugins/toolsets/grafana/toolset_grafana_tempo.jinja2 +5 -3
- holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +299 -32
- holmes/plugins/toolsets/helm.yaml +1 -1
- holmes/plugins/toolsets/internet/internet.py +4 -2
- holmes/plugins/toolsets/internet/notion.py +4 -2
- holmes/plugins/toolsets/investigator/core_investigation.py +5 -17
- holmes/plugins/toolsets/investigator/investigator_instructions.jinja2 +1 -5
- holmes/plugins/toolsets/kafka.py +19 -7
- holmes/plugins/toolsets/kubernetes.yaml +5 -5
- holmes/plugins/toolsets/kubernetes_logs.py +4 -4
- holmes/plugins/toolsets/kubernetes_logs.yaml +1 -1
- holmes/plugins/toolsets/logging_utils/logging_api.py +15 -2
- holmes/plugins/toolsets/mcp/toolset_mcp.py +3 -1
- holmes/plugins/toolsets/newrelic.py +8 -4
- holmes/plugins/toolsets/opensearch/opensearch.py +13 -5
- holmes/plugins/toolsets/opensearch/opensearch_logs.py +4 -4
- holmes/plugins/toolsets/opensearch/opensearch_traces.py +9 -6
- holmes/plugins/toolsets/prometheus/prometheus.py +198 -57
- holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +7 -3
- holmes/plugins/toolsets/robusta/robusta.py +10 -4
- holmes/plugins/toolsets/runbook/runbook_fetcher.py +4 -2
- holmes/plugins/toolsets/servicenow/servicenow.py +9 -3
- holmes/plugins/toolsets/slab.yaml +1 -1
- holmes/utils/console/logging.py +6 -1
- {holmesgpt-0.13.0.dist-info → holmesgpt-0.13.2.dist-info}/METADATA +3 -2
- {holmesgpt-0.13.0.dist-info → holmesgpt-0.13.2.dist-info}/RECORD +116 -90
- holmes/core/todo_manager.py +0 -88
- holmes/plugins/toolsets/bash/grep/__init__.py +0 -52
- {holmesgpt-0.13.0.dist-info → holmesgpt-0.13.2.dist-info}/LICENSE.txt +0 -0
- {holmesgpt-0.13.0.dist-info → holmesgpt-0.13.2.dist-info}/WHEEL +0 -0
- {holmesgpt-0.13.0.dist-info → holmesgpt-0.13.2.dist-info}/entry_points.txt +0 -0
```diff
--- /dev/null
+++ b/holmes/plugins/toolsets/bash/utilities/uniq.py
@@ -0,0 +1,12 @@
+from holmes.plugins.toolsets.bash.common.bash_command import (
+    SimpleBashCommand,
+)
+
+
+class UniqCommand(SimpleBashCommand):
+    def __init__(self):
+        super().__init__(
+            name="uniq",
+            allowed_options=[],  # Allow all options except file operations
+            denied_options=[],
+        )
```
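The new `uniq` wrapper (like the other `utilities/` additions: `base64_util`, `cut`, `head`, `jq`, `sed`, `sort`, `tail`, `tr`, `wc`) registers a pipeline utility with empty allow/deny lists. Below is a minimal sketch of what those allow/deny semantics appear to mean, judging only by the inline comment; the real logic lives in the new `bash/common/bash_command.py` and `bash/common/validators.py` (not shown here) and may differ:

```python
from typing import List


def is_option_permitted(option: str, allowed: List[str], denied: List[str]) -> bool:
    """Hypothetical allow/deny check: an empty allow-list permits any option
    that is not explicitly denied (file operations are assumed to be blocked
    by a separate validator, per the comment in uniq.py)."""
    if option in denied:
        return False
    return not allowed or option in allowed


# With UniqCommand's allowed_options=[] and denied_options=[], flags like
# "-c"/"--count" would pass this check unchanged.
assert is_option_permitted("-c", allowed=[], denied=[])
assert not is_option_permitted("--output", allowed=[], denied=["--output"])
```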
```diff
--- a/holmes/plugins/toolsets/confluence.yaml
+++ b/holmes/plugins/toolsets/confluence.yaml
@@ -1,7 +1,7 @@
 toolsets:
   confluence:
     description: "Fetch confluence pages"
-    docs_url: "https://
+    docs_url: "https://holmesgpt.dev/data-sources/builtin-toolsets/confluence/"
     icon_url: "https://platform.robusta.dev/demos/confluence.svg"
     tags:
       - core
```
```diff
--- a/holmes/plugins/toolsets/coralogix/api.py
+++ b/holmes/plugins/toolsets/coralogix/api.py
@@ -106,7 +106,9 @@ def query_logs_for_tier(
     )
     http_status = response.status_code
     if http_status == 200:
-        logs = parse_logs(
+        logs = parse_logs(
+            raw_logs=response.text.strip(), labels_config=config.labels
+        )
         return CoralogixQueryResult(logs=logs, http_status=http_status, error=None)
     else:
         return CoralogixQueryResult(
```
```diff
--- a/holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py
+++ b/holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py
@@ -38,14 +38,14 @@ class CoralogixLogsToolset(BasePodLoggingToolset):
         super().__init__(
             name="coralogix/logs",
             description="Toolset for interacting with Coralogix to fetch logs",
-            docs_url="https://
+            docs_url="https://holmesgpt.dev/data-sources/builtin-toolsets/coralogix-logs/",
             icon_url="https://avatars.githubusercontent.com/u/35295744?s=200&v=4",
             prerequisites=[CallablePrerequisite(callable=self.prerequisites_callable)],
-            tools=[
-                PodLoggingTool(self),
-            ],
+            tools=[],  # Initialize with empty tools first
             tags=[ToolsetTag.CORE],
         )
+        # Now that parent is initialized and self.name exists, create the tool
+        self.tools = [PodLoggingTool(self)]
 
     def get_example_config(self):
         example_config = CoralogixConfig(
```
```diff
--- a/holmes/plugins/toolsets/coralogix/utils.py
+++ b/holmes/plugins/toolsets/coralogix/utils.py
@@ -20,9 +20,10 @@ class CoralogixQueryResult(BaseModel):
 
 
 class CoralogixLabelsConfig(BaseModel):
-    pod: str = "
-    namespace: str = "
-    log_message: str = "
+    pod: str = "resource.attributes.k8s.pod.name"
+    namespace: str = "resource.attributes.k8s.namespace.name"
+    log_message: str = "logRecord.body"
+    timestamp: str = "logRecord.attributes.time"
 
 
 class CoralogixLogsMethodology(str, Enum):
@@ -78,24 +79,43 @@ def normalize_datetime(date_str: Optional[str]) -> str:
     return date_str
 
 
+def extract_field(data_obj: dict[str, Any], field: str):
+    """returns a nested field from a dict
+    e.g. extract_field({"parent": {"child": "value"}}, "parent.child") => value
+    """
+    current_object: Any = data_obj
+    fields = field.split(".")
+
+    for field in fields:
+        if not current_object:
+            return None
+        if isinstance(current_object, dict):
+            current_object = current_object.get(field)
+        else:
+            return None
+
+    return current_object
+
+
 def flatten_structured_log_entries(
     log_entries: List[Dict[str, Any]],
+    labels_config: CoralogixLabelsConfig,
 ) -> List[FlattenedLog]:
     flattened_logs = []
     for log_entry in log_entries:
         try:
-
-
-
-            if log_message:
+            userData = json.loads(log_entry.get("userData", "{}"))
+            log_message = extract_field(userData, labels_config.log_message)
+            timestamp = extract_field(userData, labels_config.timestamp)
+            if not log_message or not timestamp:
+                log_message = json.dumps(userData)
+            else:
                 flattened_logs.append(
                     FlattenedLog(timestamp=timestamp, log_message=log_message)
                 )  # Store as tuple for sorting
 
         except json.JSONDecodeError:
-            logging.error(
-                f"Failed to decode userData JSON: {log_entry.get('userData')}"
-            )
+            logging.error(f"Failed to decode userData JSON: {json.dumps(log_entry)}")
     return flattened_logs
```
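The new `extract_field` helper is what makes the configurable label paths above work: it resolves a dotted path like `resource.attributes.k8s.pod.name` against the parsed `userData` payload and returns `None` when any segment is missing. A quick illustration, using a hypothetical payload shaped to match the new defaults:

```python
from holmes.plugins.toolsets.coralogix.utils import extract_field

# Hypothetical userData payload matching the default label paths above.
user_data = {
    "resource": {"attributes": {"k8s": {"pod": {"name": "my-pod-abc123"}}}},
    "logRecord": {
        "body": "connection refused",
        "attributes": {"time": "2024-01-01T00:00:00Z"},
    },
}

print(extract_field(user_data, "resource.attributes.k8s.pod.name"))  # my-pod-abc123
print(extract_field(user_data, "logRecord.body"))                    # connection refused
print(extract_field(user_data, "missing.path"))                      # None
```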
```diff
--- a/holmes/plugins/toolsets/coralogix/utils.py
+++ b/holmes/plugins/toolsets/coralogix/utils.py
@@ -107,14 +127,16 @@ def stringify_flattened_logs(log_entries: List[FlattenedLog]) -> str:
     return "\n".join(formatted_logs) if formatted_logs else "No logs found."
 
 
-def parse_json_objects(
+def parse_json_objects(
+    json_objects: List[Dict[str, Any]], labels_config: CoralogixLabelsConfig
+) -> List[FlattenedLog]:
     """Extracts timestamp and log values from parsed JSON objects, sorted in ascending order (oldest first)."""
     logs: List[FlattenedLog] = []
 
     for data in json_objects:
         if isinstance(data, dict) and "result" in data and "results" in data["result"]:
             logs += flatten_structured_log_entries(
-                log_entries=data["result"]["results"]
+                log_entries=data["result"]["results"], labels_config=labels_config
             )
         elif isinstance(data, dict) and data.get("warning"):
             logging.info(
```
```diff
--- a/holmes/plugins/toolsets/coralogix/utils.py
+++ b/holmes/plugins/toolsets/coralogix/utils.py
@@ -128,13 +150,18 @@ def parse_json_objects(json_objects: List[Dict[str, Any]]) -> List[FlattenedLog]
     return logs
 
 
-def parse_logs(
+def parse_logs(
+    raw_logs: str,
+    labels_config: CoralogixLabelsConfig,
+) -> List[FlattenedLog]:
     """Processes the HTTP response and extracts only log outputs."""
     try:
         json_objects = parse_json_lines(raw_logs)
         if not json_objects:
             raise Exception("No valid JSON objects found.")
-        return parse_json_objects(
+        return parse_json_objects(
+            json_objects=json_objects, labels_config=labels_config
+        )
     except Exception as e:
         logging.error(
             f"Unexpected error in format_logs for a coralogix API response: {str(e)}"
```
```diff
--- a/holmes/plugins/toolsets/datadog/datadog_api.py
+++ b/holmes/plugins/toolsets/datadog/datadog_api.py
@@ -1,5 +1,6 @@
+import json
 import logging
-from typing import Any, Optional, Dict
+from typing import Any, Optional, Dict, Union
 import requests  # type: ignore
 from pydantic import AnyUrl, BaseModel
 from requests.structures import CaseInsensitiveDict  # type: ignore
@@ -145,6 +146,19 @@ def execute_paginated_datadog_http_request(
     return data, cursor
 
 
+def sanitize_headers(headers: Union[dict, CaseInsensitiveDict]) -> dict:
+    try:
+        return {
+            k: v
+            if ("key" not in k.lower() and "key" not in v.lower())
+            else "[REDACTED]"
+            for k, v in headers.items()
+        }
+    except (AttributeError, TypeError):
+        # Return empty dict for mock objects or other non-dict types
+        return {}
+
+
 def execute_datadog_http_request(
     url: str,
     headers: dict,
```
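Since `sanitize_headers` is used below to log both request and response headers, it is worth noting what it actually redacts: any header whose name or value contains the substring "key", which covers Datadog's `DD-API-KEY` and `DD-APPLICATION-KEY` auth headers. A quick illustration (the values are made up):

```python
from holmes.plugins.toolsets.datadog.datadog_api import sanitize_headers

headers = {
    "DD-API-KEY": "abc123",              # header name contains "key" -> redacted
    "DD-APPLICATION-KEY": "def456",      # header name contains "key" -> redacted
    "Content-Type": "application/json",  # no "key" in name or value -> kept
}
print(sanitize_headers(headers))
# {'DD-API-KEY': '[REDACTED]', 'DD-APPLICATION-KEY': '[REDACTED]',
#  'Content-Type': 'application/json'}
```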
```diff
--- a/holmes/plugins/toolsets/datadog/datadog_api.py
+++ b/holmes/plugins/toolsets/datadog/datadog_api.py
@@ -152,6 +166,16 @@ def execute_datadog_http_request(
     timeout: int,
     method: str = "POST",
 ) -> Any:
+    # Log the request details
+    logging.info("Datadog API Request:")
+    logging.info(f" Method: {method}")
+    logging.info(f" URL: {url}")
+    logging.info(f" Headers: {json.dumps(sanitize_headers(headers), indent=2)}")
+    logging.info(
+        f" {'Params' if method == 'GET' else 'Payload'}: {json.dumps(payload_or_params, indent=2)}"
+    )
+    logging.info(f" Timeout: {timeout}s")
+
     if method == "GET":
         response = requests.get(
             url, headers=headers, params=payload_or_params, timeout=timeout
@@ -161,10 +185,29 @@ def execute_datadog_http_request(
             url, headers=headers, json=payload_or_params, timeout=timeout
         )
 
+    # Log the response details
+    logging.info("Datadog API Response:")
+    logging.info(f" Status Code: {response.status_code}")
+    logging.info(f" Response Headers: {dict(sanitize_headers(response.headers))}")
+
     if response.status_code == 200:
-
+        response_data = response.json()
+        # Log response size but not full content (could be large)
+        if isinstance(response_data, dict):
+            logging.info(f" Response Keys: {list(response_data.keys())}")
+            if "data" in response_data:
+                data_len = (
+                    len(response_data["data"])
+                    if isinstance(response_data["data"], list)
+                    else 1
+                )
+                logging.info(f" Data Items Count: {data_len}")
+        else:
+            logging.info(f" Response Type: {type(response_data).__name__}")
+        return response_data
 
     else:
+        logging.error(f" Error Response Body: {response.text}")
         raise DataDogRequestError(
             payload=payload_or_params,
             status_code=response.status_code,
```
````diff
--- /dev/null
+++ b/holmes/plugins/toolsets/datadog/datadog_general_instructions.jinja2
@@ -0,0 +1,208 @@
+## Datadog General API Tools Usage Guide
+
+### When to Use This Toolset
+
+**PROACTIVELY use the Datadog general toolset when investigating issues to gather comprehensive observability data.**
+
+**Use Datadog for Historical Context When Needed, or check live data when needed:**
+- **When checking current status**: Use current time ranges for real-time monitoring
+- **When investigating past issues**: If asked about problems from yesterday, last week, etc.
+- **When finding root causes**: Look at events/monitors from BEFORE an issue started
+- **When Kubernetes data is missing**: Pods may have been deleted, events expired, etc.
+
+This toolset provides access to critical Datadog resources that can help identify root causes, or health status:
+- **Monitors**: Check alert history, thresholds, and monitor states
+- **Incidents**: Review recent incidents and their timelines
+- **Dashboards**: Access pre-configured dashboards for system overview
+- **SLOs**: Verify service level objectives and error budgets
+- **Events**: Correlate deployments, configuration changes, and system events
+- **Synthetics**: Check endpoint availability and performance
+- **Security**: Review security signals and alerts
+- **Hosts**: Get infrastructure-level information
+
+### When Historical Data is Important
+
+**Kubernetes limitations that Datadog can address:**
+- Kubernetes events expire after 1 hour by default
+- Deleted pods/deployments leave no trace in the cluster
+- Previous configuration values are not retained
+- Past node issues may be resolved without evidence
+
+**Datadog preserves this context when you need it:**
+- Events from before an incident started
+- Monitor triggers on now-deleted resources
+- Past incidents and their resolutions
+- Deployment and configuration change history
+
+### Investigation Workflow
+
+**1. Determine the appropriate time range based on the request:**
+```
+- For current status: Use recent time windows (last hour, last few minutes)
+- For investigating alerts: Query from before the alert started to understand triggers
+- For past issues: Use the specific timeframe when the issue occurred
+- For root cause analysis: Look at events/changes before the problem began
+```
+
+**2. Check relevant monitors and incidents:**
+```
+- Use `datadog_api_get` with `/api/v1/monitor` to list monitors
+- Use `datadog_api_post_search` with `/api/v2/incidents/search` to find recent incidents
+- Check monitor states to understand alert patterns
+```
+
+**3. Correlate with events when investigating issues:**
+```
+- Query `/api/v1/events` with appropriate time range
+- For root cause: Look for events BEFORE the issue started
+- Events often reveal deployments, config changes, or infrastructure updates
+- Especially useful when Kubernetes resources have been deleted/replaced
+```
+
+**4. Check service health and dependencies:**
+```
+- Use `/api/v2/services` to list services and their states
+- Query `/api/v2/services/{service}/dependencies` to understand service relationships
+- This helps identify cascade failures
+```
+
+**5. Review SLOs for service degradation over time:**
+```
+- Query `/api/v1/slo` to check service level objectives
+- Use `/api/v1/slo/{id}/history` to see historical compliance
+- Identify when degradation started (may be before alerts fired)
+- Check if issues are violating SLO targets
+```
+
+### Common Investigation Patterns
+
+**For Kubernetes Pod/Deployment Issues:**
+1. **When pods are missing/deleted**: Query Datadog for historical data about those pods
+2. **For recurring issues**: Check monitor history for patterns
+3. **For deployment problems**: Look for deployment events around issue time
+4. **When Kubernetes events expired**: Use Datadog events for the same timeframe
+
+**For Application Issues:**
+1. **Adjust time range based on issue**: Current for live issues, historical for past problems
+2. Review monitors: `datadog_api_get` with `/api/v1/monitor` filtering by service
+3. Search incidents: `datadog_api_post_search` with `/api/v2/incidents/search`
+4. For degradation: Check SLO history to identify when it started
+
+**For Infrastructure Issues:**
+1. List hosts: `datadog_api_get` with `/api/v1/hosts` to see host status
+2. Check host details: `datadog_api_get` with `/api/v1/hosts/{hostname}`
+3. Review events: Look for infrastructure changes or maintenance
+4. Check monitors: Find infrastructure-related alerts
+
+**For Performance Issues:**
+1. Review synthetics: `datadog_api_get` with `/api/v1/synthetics/tests` for endpoint monitoring
+2. Check SLO history: Track performance degradation over time
+3. Review dashboards: `datadog_api_get` with `/api/v1/dashboard` for performance dashboards
+4. Correlate with events: Find changes that might impact performance
+
+**For Security Issues:**
+1. Search security signals: `datadog_api_post_search` with `/api/v2/security_monitoring/signals/search`
+2. Review security rules: `datadog_api_get` with `/api/v2/security_monitoring/rules`
+3. Check recent incidents: Look for security-related incidents
+
+### Time Parameters
+
+**Choose time ranges based on the investigation context:**
+- Use query parameters for time ranges:
+  - `from`: Start time (Unix timestamp or ISO 8601)
+  - `to`: End time (Unix timestamp or ISO 8601)
+  - Example: `{"from": "2024-01-01T00:00:00Z", "to": "2024-01-02T00:00:00Z"}`
+  - For relative times: `{"from": "-1h"}` for last hour
+- **For root cause analysis**: Query from before the issue started (e.g., if alert fired 2 hours ago, query from "-4h")
+- **For current status**: Use recent time windows (e.g., "-15m" or "-1h")
+- **For historical issues**: Use the specific timeframe when the issue occurred
+
+### Query Examples
+
+**List all monitors with their current state:**
+```
+Tool: datadog_api_get
+Endpoint: /api/v1/monitor
+Query params: {"group_states": "all", "monitor_tags": "env:production"}
+```
+
+**Search for recent incidents:**
+```
+Tool: datadog_api_post_search
+Endpoint: /api/v2/incidents/search
+Body: {
+  "filter": {
+    "created": {
+      "from": "-24h"
+    }
+  },
+  "sort": "-created",
+  "page": {"limit": 10}
+}
+```
+
+**Get events for a specific service:**
+```
+Tool: datadog_api_get
+Endpoint: /api/v1/events
+Query params: {"start": "-3600", "end": "now", "tags": "service:my-service"}
+```
+
+**Check SLO compliance:**
+```
+Tool: datadog_api_get
+Endpoint: /api/v1/slo/{slo_id}/history
+Query params: {"from_ts": 1234567890, "to_ts": 1234567900}
+```
+
+### Best Practices
+
+1. **Always correlate multiple data sources:**
+   - Don't rely on a single metric or log
+   - Cross-reference monitors, events, and incidents
+   - Look for patterns across different data types
+
+2. **Use time windows effectively:**
+   - Start with a broader time range to see patterns
+   - Narrow down once you identify the issue timeframe
+   - Compare with historical data when available
+
+3. **Follow the dependency chain:**
+   - Check upstream services when investigating issues
+   - Use service dependency maps to understand impact
+   - Look for cascade failures
+
+4. **Prioritize based on severity:**
+   - Check critical monitors and P1 incidents first
+   - Review SLO violations for business impact
+   - Focus on customer-facing services
+
+5. **Document findings:**
+   - Note correlations between events and issues
+   - Identify patterns in monitor triggers
+   - Track incident timelines for post-mortems
+
+### Resource Discovery
+
+Use `list_datadog_api_resources` to discover available endpoints:
+- Filter by category: monitors, dashboards, slos, incidents, etc.
+- This helps identify which resources are available for investigation
+- Example: `list_datadog_api_resources` with `{"category": "monitors"}`
+
+### Integration with Other Toolsets
+
+This toolset complements other Datadog toolsets:
+- Use with `datadog/metrics` for detailed metric analysis
+- Combine with `datadog/logs` for log correlation
+- Use alongside `datadog/traces` for distributed tracing
+- Integrate with Kubernetes toolsets for container-level issues
+
+### IMPORTANT: Proactive Usage
+
+**Don't wait for the user to explicitly ask for Datadog data. When investigating any issue:**
+1. Check if there are relevant monitors or incidents
+2. Look for recent events that might be related
+3. Verify service health and SLO compliance
+4. Review any security signals if applicable
+
+This proactive approach often reveals root causes that wouldn't be found through logs or metrics alone.
````
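The query examples in this new file map onto Datadog's public v1/v2 endpoints. For orientation, here is a minimal sketch of the raw request behind the "list all monitors" example; the site URL and env-var names are assumptions, and in practice the toolset's `datadog_api_get` wraps this call:

```python
import os

import requests

# Assumed env-var names, for illustration only; the toolset reads its own config.
headers = {
    "DD-API-KEY": os.environ["DD_API_KEY"],
    "DD-APPLICATION-KEY": os.environ["DD_APP_KEY"],
}
resp = requests.get(
    "https://api.datadoghq.com/api/v1/monitor",  # assumes the default US1 site
    headers=headers,
    params={"group_states": "all", "monitor_tags": "env:production"},
    timeout=30,
)
resp.raise_for_status()
for monitor in resp.json():  # the v1 monitor list returns a JSON array
    print(monitor["id"], monitor.get("overall_state"), monitor["name"])
```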
```diff
--- /dev/null
+++ b/holmes/plugins/toolsets/datadog/datadog_logs_instructions.jinja2
@@ -0,0 +1,43 @@
+## Datadog Logs Tools Usage Guide
+
+Before running logs queries:
+
+** You are often (but not always) running in a kubernetes environment. So users might ask you questions about kubernetes workloads without explicitly stating their type.
+** When getting ambiguous questions, use kubectl_find_resource to find the resource you are being asked about!
+** Find the involved resource name and kind
+** If you can't figure out what is the type of the resource, ask the user for more information and don't guess
+
+
+### General guideline
+- This toolset is used to read pod logs.
+- Assume the pod should have logs. If logs not found, try to adjust the query
+
+### CRITICAL: Pod Name Resolution Workflow
+
+**When user provides an exact pod name** (e.g., `my-workload-5f9d8b7c4d-x2km9`):
+- FIRST query Datadog directly with that pod name using appropriate tags
+- Do NOT try to verify if the pod exists in Kubernetes first
+- This allows querying historical pods that have been deleted/replaced
+
+**When user provides a generic workload name** (e.g., "my-workload", "nginx", "telemetry-processor"):
+- First use `kubectl_find_resource` to find actual pod names
+- Example: `kubectl_find_resource` with "my-workload" → finds pods like "my-workload-8f8cdfxyz-c7zdr"
+- Then use those specific pod names in Datadog queries
+- Alternative: Use deployment-level tags when appropriate
+
+**Why this matters:**
+- Pod names in Datadog are the actual Kubernetes pod names (with random suffixes)
+- Historical pods that no longer exist in the cluster can still have logs in Datadog
+- Deployment/service names alone are NOT pod names (they need the suffix)
+
+### Time Parameters
+- Use RFC3339 format: `2023-03-01T10:30:00Z`
+- Or relative seconds: `-3600` for 1 hour ago
+- Defaults to 1 hour window if not specified
+
+### Common Investigation Patterns
+
+**For Pod/Container Metrics (MOST COMMON):**
+1. User asks: "Show logs for my-workload"
+2. Use `kubectl_find_resource` → find pod "my-workload-abc123-xyz"
+3. Query Datadog for pod "my-workload-abc123-xyz" logs
```
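For context on where those exact pod names end up: the workflow above boils down to a Datadog log search filtered on the full pod name, suffix included. A minimal sketch of the equivalent raw v2 logs query follows; the `pod_name` tag is an assumption based on Datadog's standard Kubernetes tagging, and the env-var names are illustrative:

```python
import os

import requests

body = {
    "filter": {
        "query": "pod_name:my-workload-abc123-xyz",  # exact pod name, suffix included
        "from": "now-1h",
        "to": "now",
    },
    "page": {"limit": 100},
}
resp = requests.post(
    "https://api.datadoghq.com/api/v2/logs/events/search",
    headers={
        "DD-API-KEY": os.environ["DD_API_KEY"],
        "DD-APPLICATION-KEY": os.environ["DD_APP_KEY"],
        "Content-Type": "application/json",
    },
    json=body,
    timeout=30,
)
resp.raise_for_status()
for event in resp.json().get("data", []):
    print(event["attributes"]["timestamp"], event["attributes"]["message"])
```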
```diff
--- a/holmes/plugins/toolsets/datadog/datadog_metrics_instructions.jinja2
+++ b/holmes/plugins/toolsets/datadog/datadog_metrics_instructions.jinja2
@@ -32,19 +32,22 @@ When investigating metrics-related issues:
 - IMPORTANT: This toolset DOES NOT support promql queries.
 
 ### CRITICAL: Pod Name Resolution Workflow
-When users ask for metrics about a deployment, service, or workload (e.g., "my-workload", "nginx-deployment"):
 
-**
-
-
-
-
-
+**When user provides an exact pod name** (e.g., `my-workload-5f9d8b7c4d-x2km9`):
+- Query Datadog directly with that pod name using appropriate metrics and tags
+- Do NOT try to verify if the pod exists in Kubernetes first
+- This allows querying historical pods that have been deleted/replaced
+
+**When user provides a generic workload name** (e.g., "my-workload", "nginx", "telemetry-processor"):
+- First use `kubectl_find_resource` to find actual pod names
+- Example: `kubectl_find_resource` with "my-workload" → finds pods like "my-workload-8f8cdfxyz-c7zdr"
+- Then use those specific pod names in Datadog queries
+- Alternative: Use deployment-level tags when appropriate
 
 **Why this matters:**
 - Pod names in Datadog are the actual Kubernetes pod names (with random suffixes)
--
--
+- Historical pods that no longer exist in the cluster can still have metrics in Datadog
+- Deployment/service names alone are NOT pod names (they need the suffix)
 
 ### Time Parameters
 - Use RFC3339 format: `2023-03-01T10:30:00Z`
```