holmesgpt 0.13.1__py3-none-any.whl → 0.13.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. holmes/__init__.py +1 -1
  2. holmes/common/env_vars.py +7 -0
  3. holmes/config.py +3 -1
  4. holmes/core/conversations.py +0 -11
  5. holmes/core/investigation.py +0 -6
  6. holmes/core/llm.py +60 -1
  7. holmes/core/prompt.py +0 -2
  8. holmes/core/supabase_dal.py +2 -2
  9. holmes/core/todo_tasks_formatter.py +51 -0
  10. holmes/core/tool_calling_llm.py +166 -91
  11. holmes/core/tools.py +20 -4
  12. holmes/interactive.py +63 -2
  13. holmes/main.py +0 -1
  14. holmes/plugins/prompts/_general_instructions.jinja2 +3 -1
  15. holmes/plugins/prompts/investigation_procedure.jinja2 +3 -13
  16. holmes/plugins/toolsets/__init__.py +5 -1
  17. holmes/plugins/toolsets/argocd.yaml +1 -1
  18. holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +18 -6
  19. holmes/plugins/toolsets/aws.yaml +9 -5
  20. holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +3 -1
  21. holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +3 -1
  22. holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +3 -1
  23. holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +3 -1
  24. holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +3 -1
  25. holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +3 -1
  26. holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +3 -1
  27. holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +3 -1
  28. holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +3 -1
  29. holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +3 -1
  30. holmes/plugins/toolsets/bash/bash_toolset.py +31 -20
  31. holmes/plugins/toolsets/confluence.yaml +1 -1
  32. holmes/plugins/toolsets/coralogix/api.py +3 -1
  33. holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +4 -4
  34. holmes/plugins/toolsets/coralogix/utils.py +41 -14
  35. holmes/plugins/toolsets/datadog/datadog_api.py +45 -2
  36. holmes/plugins/toolsets/datadog/datadog_general_instructions.jinja2 +208 -0
  37. holmes/plugins/toolsets/datadog/datadog_logs_instructions.jinja2 +43 -0
  38. holmes/plugins/toolsets/datadog/datadog_metrics_instructions.jinja2 +12 -9
  39. holmes/plugins/toolsets/datadog/toolset_datadog_general.py +722 -0
  40. holmes/plugins/toolsets/datadog/toolset_datadog_logs.py +17 -6
  41. holmes/plugins/toolsets/datadog/toolset_datadog_metrics.py +15 -7
  42. holmes/plugins/toolsets/datadog/toolset_datadog_rds.py +6 -2
  43. holmes/plugins/toolsets/datadog/toolset_datadog_traces.py +9 -3
  44. holmes/plugins/toolsets/docker.yaml +1 -1
  45. holmes/plugins/toolsets/git.py +15 -5
  46. holmes/plugins/toolsets/grafana/toolset_grafana.py +25 -4
  47. holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +4 -4
  48. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.jinja2 +5 -3
  49. holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +299 -32
  50. holmes/plugins/toolsets/helm.yaml +1 -1
  51. holmes/plugins/toolsets/internet/internet.py +4 -2
  52. holmes/plugins/toolsets/internet/notion.py +4 -2
  53. holmes/plugins/toolsets/investigator/core_investigation.py +5 -17
  54. holmes/plugins/toolsets/investigator/investigator_instructions.jinja2 +1 -5
  55. holmes/plugins/toolsets/kafka.py +19 -7
  56. holmes/plugins/toolsets/kubernetes.yaml +5 -5
  57. holmes/plugins/toolsets/kubernetes_logs.py +4 -4
  58. holmes/plugins/toolsets/kubernetes_logs.yaml +1 -1
  59. holmes/plugins/toolsets/logging_utils/logging_api.py +15 -2
  60. holmes/plugins/toolsets/mcp/toolset_mcp.py +3 -1
  61. holmes/plugins/toolsets/newrelic.py +8 -4
  62. holmes/plugins/toolsets/opensearch/opensearch.py +13 -5
  63. holmes/plugins/toolsets/opensearch/opensearch_logs.py +4 -4
  64. holmes/plugins/toolsets/opensearch/opensearch_traces.py +9 -6
  65. holmes/plugins/toolsets/prometheus/prometheus.py +193 -82
  66. holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +7 -3
  67. holmes/plugins/toolsets/robusta/robusta.py +10 -4
  68. holmes/plugins/toolsets/runbook/runbook_fetcher.py +4 -2
  69. holmes/plugins/toolsets/servicenow/servicenow.py +9 -3
  70. holmes/plugins/toolsets/slab.yaml +1 -1
  71. {holmesgpt-0.13.1.dist-info → holmesgpt-0.13.2.dist-info}/METADATA +3 -2
  72. {holmesgpt-0.13.1.dist-info → holmesgpt-0.13.2.dist-info}/RECORD +75 -72
  73. holmes/core/todo_manager.py +0 -88
  74. {holmesgpt-0.13.1.dist-info → holmesgpt-0.13.2.dist-info}/LICENSE.txt +0 -0
  75. {holmesgpt-0.13.1.dist-info → holmesgpt-0.13.2.dist-info}/WHEEL +0 -0
  76. {holmesgpt-0.13.1.dist-info → holmesgpt-0.13.2.dist-info}/entry_points.txt +0 -0
holmes/plugins/toolsets/coralogix/utils.py
@@ -20,9 +20,10 @@ class CoralogixQueryResult(BaseModel):
 
 
 class CoralogixLabelsConfig(BaseModel):
-    pod: str = "kubernetes.pod_name"
-    namespace: str = "kubernetes.namespace_name"
-    log_message: str = "log"
+    pod: str = "resource.attributes.k8s.pod.name"
+    namespace: str = "resource.attributes.k8s.namespace.name"
+    log_message: str = "logRecord.body"
+    timestamp: str = "logRecord.attributes.time"
 
 
 class CoralogixLogsMethodology(str, Enum):
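The new defaults resolve pod, namespace, message, and timestamp from OTel-style nested paths instead of flat keys. As a minimal sketch (assuming direct construction of the pydantic model; how the toolset loads this config is outside this hunk), an account still shipping the old flat schema could presumably override the fields:

```python
from holmes.plugins.toolsets.coralogix.utils import CoralogixLabelsConfig

# New defaults: OTel-style nested paths.
otel_labels = CoralogixLabelsConfig()

# Hypothetical override reproducing the pre-0.13.2 flat schema
# (the "time" key matches what the previous code read for timestamps).
legacy_labels = CoralogixLabelsConfig(
    pod="kubernetes.pod_name",
    namespace="kubernetes.namespace_name",
    log_message="log",
    timestamp="time",
)
```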
@@ -78,24 +79,43 @@ def normalize_datetime(date_str: Optional[str]) -> str:
     return date_str
 
 
+def extract_field(data_obj: dict[str, Any], field: str):
+    """returns a nested field from a dict
+    e.g. extract_field({"parent": {"child": "value"}}, "parent.child") => value
+    """
+    current_object: Any = data_obj
+    fields = field.split(".")
+
+    for field in fields:
+        if not current_object:
+            return None
+        if isinstance(current_object, dict):
+            current_object = current_object.get(field)
+        else:
+            return None
+
+    return current_object
+
+
 def flatten_structured_log_entries(
     log_entries: List[Dict[str, Any]],
+    labels_config: CoralogixLabelsConfig,
 ) -> List[FlattenedLog]:
     flattened_logs = []
     for log_entry in log_entries:
         try:
-            user_data = json.loads(log_entry.get("userData", "{}"))
-            timestamp = normalize_datetime(user_data.get("time"))
-            log_message = user_data.get("log", "")
-            if log_message:
+            userData = json.loads(log_entry.get("userData", "{}"))
+            log_message = extract_field(userData, labels_config.log_message)
+            timestamp = extract_field(userData, labels_config.timestamp)
+            if not log_message or not timestamp:
+                log_message = json.dumps(userData)
+            else:
                 flattened_logs.append(
                     FlattenedLog(timestamp=timestamp, log_message=log_message)
                 )  # Store as tuple for sorting
 
         except json.JSONDecodeError:
-            logging.error(
-                f"Failed to decode userData JSON: {log_entry.get('userData')}"
-            )
+            logging.error(f"Failed to decode userData JSON: {json.dumps(log_entry)}")
     return flattened_logs
 
 
@@ -107,14 +127,16 @@ def stringify_flattened_logs(log_entries: List[FlattenedLog]) -> str:
     return "\n".join(formatted_logs) if formatted_logs else "No logs found."
 
 
-def parse_json_objects(json_objects: List[Dict[str, Any]]) -> List[FlattenedLog]:
+def parse_json_objects(
+    json_objects: List[Dict[str, Any]], labels_config: CoralogixLabelsConfig
+) -> List[FlattenedLog]:
     """Extracts timestamp and log values from parsed JSON objects, sorted in ascending order (oldest first)."""
     logs: List[FlattenedLog] = []
 
     for data in json_objects:
         if isinstance(data, dict) and "result" in data and "results" in data["result"]:
             logs += flatten_structured_log_entries(
-                log_entries=data["result"]["results"]
+                log_entries=data["result"]["results"], labels_config=labels_config
             )
         elif isinstance(data, dict) and data.get("warning"):
             logging.info(
@@ -128,13 +150,18 @@ def parse_json_objects(json_objects: List[Dict[str, Any]]) -> List[FlattenedLog]:
     return logs
 
 
-def parse_logs(raw_logs: str) -> List[FlattenedLog]:
+def parse_logs(
+    raw_logs: str,
+    labels_config: CoralogixLabelsConfig,
+) -> List[FlattenedLog]:
     """Processes the HTTP response and extracts only log outputs."""
     try:
         json_objects = parse_json_lines(raw_logs)
         if not json_objects:
             raise Exception("No valid JSON objects found.")
-        return parse_json_objects(json_objects)
+        return parse_json_objects(
+            json_objects=json_objects, labels_config=labels_config
+        )
     except Exception as e:
         logging.error(
             f"Unexpected error in format_logs for a coralogix API response: {str(e)}"
holmes/plugins/toolsets/datadog/datadog_api.py
@@ -1,5 +1,6 @@
+import json
 import logging
-from typing import Any, Optional, Dict
+from typing import Any, Optional, Dict, Union
 import requests  # type: ignore
 from pydantic import AnyUrl, BaseModel
 from requests.structures import CaseInsensitiveDict  # type: ignore
@@ -145,6 +146,19 @@ def execute_paginated_datadog_http_request(
     return data, cursor
 
 
+def sanitize_headers(headers: Union[dict, CaseInsensitiveDict]) -> dict:
+    try:
+        return {
+            k: v
+            if ("key" not in k.lower() and "key" not in v.lower())
+            else "[REDACTED]"
+            for k, v in headers.items()
+        }
+    except (AttributeError, TypeError):
+        # Return empty dict for mock objects or other non-dict types
+        return {}
+
+
 def execute_datadog_http_request(
     url: str,
     headers: dict,
@@ -152,6 +166,16 @@
     timeout: int,
     method: str = "POST",
 ) -> Any:
+    # Log the request details
+    logging.info("Datadog API Request:")
+    logging.info(f" Method: {method}")
+    logging.info(f" URL: {url}")
+    logging.info(f" Headers: {json.dumps(sanitize_headers(headers), indent=2)}")
+    logging.info(
+        f" {'Params' if method == 'GET' else 'Payload'}: {json.dumps(payload_or_params, indent=2)}"
+    )
+    logging.info(f" Timeout: {timeout}s")
+
     if method == "GET":
         response = requests.get(
             url, headers=headers, params=payload_or_params, timeout=timeout
@@ -161,10 +185,29 @@
             url, headers=headers, json=payload_or_params, timeout=timeout
         )
 
+    # Log the response details
+    logging.info("Datadog API Response:")
+    logging.info(f" Status Code: {response.status_code}")
+    logging.info(f" Response Headers: {dict(sanitize_headers(response.headers))}")
+
     if response.status_code == 200:
-        return response.json()
+        response_data = response.json()
+        # Log response size but not full content (could be large)
+        if isinstance(response_data, dict):
+            logging.info(f" Response Keys: {list(response_data.keys())}")
+            if "data" in response_data:
+                data_len = (
+                    len(response_data["data"])
+                    if isinstance(response_data["data"], list)
+                    else 1
+                )
+                logging.info(f" Data Items Count: {data_len}")
+        else:
+            logging.info(f" Response Type: {type(response_data).__name__}")
+        return response_data
 
     else:
+        logging.error(f" Error Response Body: {response.text}")
         raise DataDogRequestError(
             payload=payload_or_params,
             status_code=response.status_code,
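`sanitize_headers` keeps the new request/response logging above from leaking credentials: any header whose name or value contains "key" is replaced with "[REDACTED]". A quick sketch of the behavior (the header values are placeholders; DD-API-KEY and DD-APPLICATION-KEY are the standard Datadog auth headers):

```python
from holmes.plugins.toolsets.datadog.datadog_api import sanitize_headers

headers = {
    "DD-API-KEY": "abc123",              # name contains "key" -> redacted
    "DD-APPLICATION-KEY": "def456",      # name contains "key" -> redacted
    "Content-Type": "application/json",  # left as-is
}

print(sanitize_headers(headers))
# {'DD-API-KEY': '[REDACTED]', 'DD-APPLICATION-KEY': '[REDACTED]', 'Content-Type': 'application/json'}
```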
holmes/plugins/toolsets/datadog/datadog_general_instructions.jinja2 (new file)
@@ -0,0 +1,208 @@
+## Datadog General API Tools Usage Guide
+
+### When to Use This Toolset
+
+**PROACTIVELY use the Datadog general toolset when investigating issues to gather comprehensive observability data.**
+
+**Use Datadog for Historical Context When Needed, or check live data when needed:**
+- **When checking current status**: Use current time ranges for real-time monitoring
+- **When investigating past issues**: If asked about problems from yesterday, last week, etc.
+- **When finding root causes**: Look at events/monitors from BEFORE an issue started
+- **When Kubernetes data is missing**: Pods may have been deleted, events expired, etc.
+
+This toolset provides access to critical Datadog resources that can help identify root causes, or health status:
+- **Monitors**: Check alert history, thresholds, and monitor states
+- **Incidents**: Review recent incidents and their timelines
+- **Dashboards**: Access pre-configured dashboards for system overview
+- **SLOs**: Verify service level objectives and error budgets
+- **Events**: Correlate deployments, configuration changes, and system events
+- **Synthetics**: Check endpoint availability and performance
+- **Security**: Review security signals and alerts
+- **Hosts**: Get infrastructure-level information
+
+### When Historical Data is Important
+
+**Kubernetes limitations that Datadog can address:**
+- Kubernetes events expire after 1 hour by default
+- Deleted pods/deployments leave no trace in the cluster
+- Previous configuration values are not retained
+- Past node issues may be resolved without evidence
+
+**Datadog preserves this context when you need it:**
+- Events from before an incident started
+- Monitor triggers on now-deleted resources
+- Past incidents and their resolutions
+- Deployment and configuration change history
+
+### Investigation Workflow
+
+**1. Determine the appropriate time range based on the request:**
+```
+- For current status: Use recent time windows (last hour, last few minutes)
+- For investigating alerts: Query from before the alert started to understand triggers
+- For past issues: Use the specific timeframe when the issue occurred
+- For root cause analysis: Look at events/changes before the problem began
+```
+
+**2. Check relevant monitors and incidents:**
+```
+- Use `datadog_api_get` with `/api/v1/monitor` to list monitors
+- Use `datadog_api_post_search` with `/api/v2/incidents/search` to find recent incidents
+- Check monitor states to understand alert patterns
+```
+
+**3. Correlate with events when investigating issues:**
+```
+- Query `/api/v1/events` with appropriate time range
+- For root cause: Look for events BEFORE the issue started
+- Events often reveal deployments, config changes, or infrastructure updates
+- Especially useful when Kubernetes resources have been deleted/replaced
+```
+
+**4. Check service health and dependencies:**
+```
+- Use `/api/v2/services` to list services and their states
+- Query `/api/v2/services/{service}/dependencies` to understand service relationships
+- This helps identify cascade failures
+```
+
+**5. Review SLOs for service degradation over time:**
+```
+- Query `/api/v1/slo` to check service level objectives
+- Use `/api/v1/slo/{id}/history` to see historical compliance
+- Identify when degradation started (may be before alerts fired)
+- Check if issues are violating SLO targets
+```
+
+### Common Investigation Patterns
+
+**For Kubernetes Pod/Deployment Issues:**
+1. **When pods are missing/deleted**: Query Datadog for historical data about those pods
+2. **For recurring issues**: Check monitor history for patterns
+3. **For deployment problems**: Look for deployment events around issue time
+4. **When Kubernetes events expired**: Use Datadog events for the same timeframe
+
+**For Application Issues:**
+1. **Adjust time range based on issue**: Current for live issues, historical for past problems
+2. Review monitors: `datadog_api_get` with `/api/v1/monitor` filtering by service
+3. Search incidents: `datadog_api_post_search` with `/api/v2/incidents/search`
+4. For degradation: Check SLO history to identify when it started
+
+**For Infrastructure Issues:**
+1. List hosts: `datadog_api_get` with `/api/v1/hosts` to see host status
+2. Check host details: `datadog_api_get` with `/api/v1/hosts/{hostname}`
+3. Review events: Look for infrastructure changes or maintenance
+4. Check monitors: Find infrastructure-related alerts
+
+**For Performance Issues:**
+1. Review synthetics: `datadog_api_get` with `/api/v1/synthetics/tests` for endpoint monitoring
+2. Check SLO history: Track performance degradation over time
+3. Review dashboards: `datadog_api_get` with `/api/v1/dashboard` for performance dashboards
+4. Correlate with events: Find changes that might impact performance
+
+**For Security Issues:**
+1. Search security signals: `datadog_api_post_search` with `/api/v2/security_monitoring/signals/search`
+2. Review security rules: `datadog_api_get` with `/api/v2/security_monitoring/rules`
+3. Check recent incidents: Look for security-related incidents
+
+### Time Parameters
+
+**Choose time ranges based on the investigation context:**
+- Use query parameters for time ranges:
+- `from`: Start time (Unix timestamp or ISO 8601)
+- `to`: End time (Unix timestamp or ISO 8601)
+- Example: `{"from": "2024-01-01T00:00:00Z", "to": "2024-01-02T00:00:00Z"}`
+- For relative times: `{"from": "-1h"}` for last hour
+- **For root cause analysis**: Query from before the issue started (e.g., if alert fired 2 hours ago, query from "-4h")
+- **For current status**: Use recent time windows (e.g., "-15m" or "-1h")
+- **For historical issues**: Use the specific timeframe when the issue occurred
+
+### Query Examples
+
+**List all monitors with their current state:**
+```
+Tool: datadog_api_get
+Endpoint: /api/v1/monitor
+Query params: {"group_states": "all", "monitor_tags": "env:production"}
+```
+
+**Search for recent incidents:**
+```
+Tool: datadog_api_post_search
+Endpoint: /api/v2/incidents/search
+Body: {
+  "filter": {
+    "created": {
+      "from": "-24h"
+    }
+  },
+  "sort": "-created",
+  "page": {"limit": 10}
+}
+```
+
+**Get events for a specific service:**
+```
+Tool: datadog_api_get
+Endpoint: /api/v1/events
+Query params: {"start": "-3600", "end": "now", "tags": "service:my-service"}
+```
+
+**Check SLO compliance:**
+```
+Tool: datadog_api_get
+Endpoint: /api/v1/slo/{slo_id}/history
+Query params: {"from_ts": 1234567890, "to_ts": 1234567900}
+```
+
+### Best Practices
+
+1. **Always correlate multiple data sources:**
+   - Don't rely on a single metric or log
+   - Cross-reference monitors, events, and incidents
+   - Look for patterns across different data types
+
+2. **Use time windows effectively:**
+   - Start with a broader time range to see patterns
+   - Narrow down once you identify the issue timeframe
+   - Compare with historical data when available
+
+3. **Follow the dependency chain:**
+   - Check upstream services when investigating issues
+   - Use service dependency maps to understand impact
+   - Look for cascade failures
+
+4. **Prioritize based on severity:**
+   - Check critical monitors and P1 incidents first
+   - Review SLO violations for business impact
+   - Focus on customer-facing services
+
+5. **Document findings:**
+   - Note correlations between events and issues
+   - Identify patterns in monitor triggers
+   - Track incident timelines for post-mortems
+
+### Resource Discovery
+
+Use `list_datadog_api_resources` to discover available endpoints:
+- Filter by category: monitors, dashboards, slos, incidents, etc.
+- This helps identify which resources are available for investigation
+- Example: `list_datadog_api_resources` with `{"category": "monitors"}`
+
+### Integration with Other Toolsets
+
+This toolset complements other Datadog toolsets:
+- Use with `datadog/metrics` for detailed metric analysis
+- Combine with `datadog/logs` for log correlation
+- Use alongside `datadog/traces` for distributed tracing
+- Integrate with Kubernetes toolsets for container-level issues
+
+### IMPORTANT: Proactive Usage
+
+**Don't wait for the user to explicitly ask for Datadog data. When investigating any issue:**
+1. Check if there are relevant monitors or incidents
+2. Look for recent events that might be related
+3. Verify service health and SLO compliance
+4. Review any security signals if applicable
+
+This proactive approach often reveals root causes that wouldn't be found through logs or metrics alone.
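The Query Examples in this new instructions file are phrased in terms of the toolset's `datadog_api_get` / `datadog_api_post_search` tools. For reference, the first example corresponds roughly to the following raw call against Datadog's public monitors API; a sketch only, in which the US1 host, the `DD_API_KEY` / `DD_APP_KEY` environment variable names, and the printed fields are assumptions rather than part of the toolset:

```python
import os
import requests

resp = requests.get(
    "https://api.datadoghq.com/api/v1/monitor",  # assumes the US1 Datadog site
    headers={
        "DD-API-KEY": os.environ["DD_API_KEY"],
        "DD-APPLICATION-KEY": os.environ["DD_APP_KEY"],
    },
    params={"group_states": "all", "monitor_tags": "env:production"},
    timeout=30,
)
resp.raise_for_status()
for monitor in resp.json():
    print(monitor["id"], monitor.get("overall_state"), monitor["name"])
```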
holmes/plugins/toolsets/datadog/datadog_logs_instructions.jinja2 (new file)
@@ -0,0 +1,43 @@
+## Datadog Logs Tools Usage Guide
+
+Before running logs queries:
+
+** You are often (but not always) running in a kubernetes environment. So users might ask you questions about kubernetes workloads without explicitly stating their type.
+** When getting ambiguous questions, use kubectl_find_resource to find the resource you are being asked about!
+** Find the involved resource name and kind
+** If you can't figure out what is the type of the resource, ask the user for more information and don't guess
+
+
+### General guideline
+- This toolset is used to read pod logs.
+- Assume the pod should have logs. If logs not found, try to adjust the query
+
+### CRITICAL: Pod Name Resolution Workflow
+
+**When user provides an exact pod name** (e.g., `my-workload-5f9d8b7c4d-x2km9`):
+- FIRST query Datadog directly with that pod name using appropriate tags
+- Do NOT try to verify if the pod exists in Kubernetes first
+- This allows querying historical pods that have been deleted/replaced
+
+**When user provides a generic workload name** (e.g., "my-workload", "nginx", "telemetry-processor"):
+- First use `kubectl_find_resource` to find actual pod names
+- Example: `kubectl_find_resource` with "my-workload" → finds pods like "my-workload-8f8cdfxyz-c7zdr"
+- Then use those specific pod names in Datadog queries
+- Alternative: Use deployment-level tags when appropriate
+
+**Why this matters:**
+- Pod names in Datadog are the actual Kubernetes pod names (with random suffixes)
+- Historical pods that no longer exist in the cluster can still have logs in Datadog
+- Deployment/service names alone are NOT pod names (they need the suffix)
+
+### Time Parameters
+- Use RFC3339 format: `2023-03-01T10:30:00Z`
+- Or relative seconds: `-3600` for 1 hour ago
+- Defaults to 1 hour window if not specified
+
+### Common Investigation Patterns
+
+**For Pod/Container Metrics (MOST COMMON):**
+1. User asks: "Show logs for my-workload"
+2. Use `kubectl_find_resource` → find pod "my-workload-abc123-xyz"
+3. Query Datadog for pod "my-workload-abc123-xyz" logs
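Step 3 of the pattern above maps onto Datadog's Logs Search API once the pod name is resolved. A rough sketch with plain `requests` for context (the `pod_name` tag value, the time range, and the `DD_API_KEY` / `DD_APP_KEY` environment variables are illustrative assumptions; inside HolmesGPT the `datadog/logs` toolset performs the log retrieval for you):

```python
import os
import requests

resp = requests.post(
    "https://api.datadoghq.com/api/v2/logs/events/search",  # assumes the US1 site
    headers={
        "DD-API-KEY": os.environ["DD_API_KEY"],
        "DD-APPLICATION-KEY": os.environ["DD_APP_KEY"],
        "Content-Type": "application/json",
    },
    json={
        "filter": {
            "query": "pod_name:my-workload-abc123-xyz",  # resolved pod name, not the workload name
            "from": "now-1h",
            "to": "now",
        },
        "sort": "timestamp",
        "page": {"limit": 50},
    },
    timeout=30,
)
resp.raise_for_status()
for event in resp.json().get("data", []):
    attrs = event["attributes"]
    print(attrs["timestamp"], attrs.get("message", ""))
```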
holmes/plugins/toolsets/datadog/datadog_metrics_instructions.jinja2
@@ -32,19 +32,22 @@ When investigating metrics-related issues:
 - IMPORTANT: This toolset DOES NOT support promql queries.
 
 ### CRITICAL: Pod Name Resolution Workflow
-When users ask for metrics about a deployment, service, or workload (e.g., "my-workload", "nginx-deployment"):
 
-**ALWAYS follow this two-step process:**
-1. **First**: Use `kubectl_find_resource` to find the actual pod names
-   - Example: `kubectl_find_resource` with "my-workload" finds pods like "my-workload-8f8cdfxyz-c7zdr"
-2. **Then**: Use those specific pod names in Datadog queries
-   - Correct: `container.cpu.usage{pod_name:my-workload-8f8cdfxyz-c7zdr}`
-   - WRONG: `container.cpu.usage{pod_name:my-workload}` This will return no data!
+**When user provides an exact pod name** (e.g., `my-workload-5f9d8b7c4d-x2km9`):
+- Query Datadog directly with that pod name using appropriate metrics and tags
+- Do NOT try to verify if the pod exists in Kubernetes first
+- This allows querying historical pods that have been deleted/replaced
+
+**When user provides a generic workload name** (e.g., "my-workload", "nginx", "telemetry-processor"):
+- First use `kubectl_find_resource` to find actual pod names
+- Example: `kubectl_find_resource` with "my-workload" → finds pods like "my-workload-8f8cdfxyz-c7zdr"
+- Then use those specific pod names in Datadog queries
+- Alternative: Use deployment-level tags when appropriate
 
 **Why this matters:**
 - Pod names in Datadog are the actual Kubernetes pod names (with random suffixes)
-- Deployment/service names are NOT pod names
-- Using deployment names as pod_name filters will always return empty results
+- Historical pods that no longer exist in the cluster can still have metrics in Datadog
+- Deployment/service names alone are NOT pod names (they need the suffix)
 
 ### Time Parameters
 - Use RFC3339 format: `2023-03-01T10:30:00Z`