holmesgpt 0.11.5__py3-none-any.whl → 0.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of holmesgpt might be problematic. Click here for more details.

Files changed (40)
  1. holmes/__init__.py +1 -1
  2. holmes/common/env_vars.py +8 -4
  3. holmes/config.py +52 -13
  4. holmes/core/investigation_structured_output.py +7 -0
  5. holmes/core/llm.py +14 -4
  6. holmes/core/models.py +24 -0
  7. holmes/core/tool_calling_llm.py +48 -6
  8. holmes/core/tools.py +7 -4
  9. holmes/core/toolset_manager.py +24 -5
  10. holmes/core/tracing.py +224 -0
  11. holmes/interactive.py +761 -44
  12. holmes/main.py +59 -127
  13. holmes/plugins/prompts/_fetch_logs.jinja2 +4 -0
  14. holmes/plugins/prompts/kubernetes_workload_ask.jinja2 +2 -10
  15. holmes/plugins/toolsets/__init__.py +10 -2
  16. holmes/plugins/toolsets/azure_sql/apis/azure_sql_api.py +2 -1
  17. holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +3 -0
  18. holmes/plugins/toolsets/datadog/datadog_api.py +161 -0
  19. holmes/plugins/toolsets/datadog/datadog_metrics_instructions.jinja2 +26 -0
  20. holmes/plugins/toolsets/datadog/datadog_traces_formatter.py +310 -0
  21. holmes/plugins/toolsets/datadog/instructions_datadog_traces.jinja2 +51 -0
  22. holmes/plugins/toolsets/datadog/toolset_datadog_logs.py +267 -0
  23. holmes/plugins/toolsets/datadog/toolset_datadog_metrics.py +488 -0
  24. holmes/plugins/toolsets/datadog/toolset_datadog_traces.py +689 -0
  25. holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +3 -0
  26. holmes/plugins/toolsets/internet/internet.py +1 -1
  27. holmes/plugins/toolsets/logging_utils/logging_api.py +9 -3
  28. holmes/plugins/toolsets/opensearch/opensearch_logs.py +3 -0
  29. holmes/plugins/toolsets/utils.py +6 -2
  30. holmes/utils/cache.py +4 -4
  31. holmes/utils/console/consts.py +2 -0
  32. holmes/utils/console/logging.py +95 -0
  33. holmes/utils/console/result.py +37 -0
  34. {holmesgpt-0.11.5.dist-info → holmesgpt-0.12.0.dist-info}/METADATA +3 -4
  35. {holmesgpt-0.11.5.dist-info → holmesgpt-0.12.0.dist-info}/RECORD +38 -29
  36. {holmesgpt-0.11.5.dist-info → holmesgpt-0.12.0.dist-info}/WHEEL +1 -1
  37. holmes/__init__.py.bak +0 -76
  38. holmes/plugins/toolsets/datadog.py +0 -153
  39. {holmesgpt-0.11.5.dist-info → holmesgpt-0.12.0.dist-info}/LICENSE.txt +0 -0
  40. {holmesgpt-0.11.5.dist-info → holmesgpt-0.12.0.dist-info}/entry_points.txt +0 -0
holmes/main.py CHANGED
@@ -1,5 +1,6 @@
1
1
  # ruff: noqa: E402
2
2
  import os
3
+ import sys
3
4
 
4
5
  from holmes.utils.cert_utils import add_custom_certificate
5
6
 
@@ -15,14 +16,11 @@ import json
15
16
  import logging
16
17
  import socket
17
18
  import uuid
18
- import warnings
19
- from enum import Enum
19
+ from datetime import datetime
20
20
  from pathlib import Path
21
21
  from typing import List, Optional
22
22
 
23
23
  import typer
24
- from rich.console import Console
25
- from rich.logging import RichHandler
26
24
  from rich.markdown import Markdown
27
25
  from rich.rule import Rule
28
26
 
@@ -35,13 +33,16 @@ from holmes.config import (
35
33
  )
36
34
  from holmes.core.prompt import build_initial_ask_messages
37
35
  from holmes.core.resource_instruction import ResourceInstructionDocument
38
- from holmes.core.tool_calling_llm import LLMResult
39
36
  from holmes.core.tools import pretty_print_toolset_status
37
+ from holmes.core.tracing import SpanType, TracingFactory
40
38
  from holmes.interactive import run_interactive_loop
41
39
  from holmes.plugins.destinations import DestinationType
42
40
  from holmes.plugins.interfaces import Issue
43
41
  from holmes.plugins.prompts import load_and_render_prompt
44
42
  from holmes.plugins.sources.opsgenie import OPSGENIE_TEAM_INTEGRATION_KEY_HELP
43
+ from holmes.utils.console.consts import system_prompt_help
44
+ from holmes.utils.console.logging import init_logging
45
+ from holmes.utils.console.result import handle_result
45
46
  from holmes.utils.file_utils import write_json_file
46
47
 
47
48
  app = typer.Typer(add_completion=False, pretty_exceptions_show_locals=False)
@@ -68,94 +69,6 @@ toolset_app = typer.Typer(
68
69
  app.add_typer(toolset_app, name="toolset")
69
70
 
70
71
 
71
- class Verbosity(Enum):
72
- NORMAL = 0
73
- LOG_QUERIES = 1 # TODO: currently unused
74
- VERBOSE = 2
75
- VERY_VERBOSE = 3
76
-
77
-
78
- def cli_flags_to_verbosity(verbose_flags: List[bool]) -> Verbosity:
79
- if verbose_flags is None or len(verbose_flags) == 0:
80
- return Verbosity.NORMAL
81
- elif len(verbose_flags) == 1:
82
- return Verbosity.LOG_QUERIES
83
- elif len(verbose_flags) == 2:
84
- return Verbosity.VERBOSE
85
- else:
86
- return Verbosity.VERY_VERBOSE
87
-
88
-
89
- def suppress_noisy_logs():
90
- # disable INFO logs from OpenAI
91
- logging.getLogger("httpx").setLevel(logging.WARNING)
92
- # disable INFO logs from LiteLLM
93
- logging.getLogger("LiteLLM").setLevel(logging.WARNING)
94
- # disable INFO logs from AWS (relevant when using bedrock)
95
- logging.getLogger("boto3").setLevel(logging.WARNING)
96
- logging.getLogger("botocore").setLevel(logging.WARNING)
97
- # when running in --verbose mode we don't want to see DEBUG logs from these libraries
98
- logging.getLogger("openai._base_client").setLevel(logging.INFO)
99
- logging.getLogger("httpcore").setLevel(logging.INFO)
100
- logging.getLogger("markdown_it").setLevel(logging.INFO)
101
- # suppress UserWarnings from the slack_sdk module
102
- warnings.filterwarnings("ignore", category=UserWarning, module="slack_sdk.*")
103
-
104
-
105
- def init_logging(verbose_flags: Optional[List[bool]] = None):
106
- verbosity = cli_flags_to_verbosity(verbose_flags) # type: ignore
107
-
108
- if verbosity == Verbosity.VERY_VERBOSE:
109
- logging.basicConfig(
110
- level=logging.DEBUG,
111
- format="%(message)s",
112
- handlers=[
113
- RichHandler(
114
- show_level=False,
115
- markup=True,
116
- show_time=False,
117
- show_path=False,
118
- console=Console(width=None),
119
- )
120
- ],
121
- )
122
- elif verbosity == Verbosity.VERBOSE:
123
- logging.basicConfig(
124
- level=logging.INFO,
125
- format="%(message)s",
126
- handlers=[
127
- RichHandler(
128
- show_level=False,
129
- markup=True,
130
- show_time=False,
131
- show_path=False,
132
- console=Console(width=None),
133
- )
134
- ],
135
- )
136
- logging.getLogger().setLevel(logging.DEBUG)
137
- suppress_noisy_logs()
138
- else:
139
- logging.basicConfig(
140
- level=logging.INFO,
141
- format="%(message)s",
142
- handlers=[
143
- RichHandler(
144
- show_level=False,
145
- markup=True,
146
- show_time=False,
147
- show_path=False,
148
- console=Console(width=None),
149
- )
150
- ],
151
- )
152
- suppress_noisy_logs()
153
-
154
- logging.debug(f"verbosity is {verbosity}")
155
-
156
- return Console()
157
-
158
-
159
72
  # Common cli options
160
73
  # The defaults for options that are also in the config file MUST be None or else the cli defaults will override settings in the config file
161
74
  opt_api_key: Optional[str] = typer.Option(
@@ -231,9 +144,6 @@ opt_documents: Optional[str] = typer.Option(
231
144
  help="Additional documents to provide the LLM (typically URLs to runbooks)",
232
145
  )
233
146
 
234
- # Common help texts
235
- system_prompt_help = "Advanced. System prompt for LLM. Values starting with builtin:// are loaded from holmes/plugins/prompts, values starting with file:// are loaded from the given path, other values are interpreted as a prompt string"
236
-
237
147
 
238
148
  def parse_documents(documents: Optional[str]) -> List[ResourceInstructionDocument]:
239
149
  resource_documents = []
@@ -247,35 +157,6 @@ def parse_documents(documents: Optional[str]) -> List[ResourceInstructionDocumen
247
157
  return resource_documents
248
158
 
249
159
 
250
- def handle_result(
251
- result: LLMResult,
252
- console: Console,
253
- destination: DestinationType,
254
- config: Config,
255
- issue: Issue,
256
- show_tool_output: bool,
257
- add_separator: bool,
258
- ):
259
- if destination == DestinationType.CLI:
260
- if show_tool_output and result.tool_calls:
261
- for tool_call in result.tool_calls:
262
- console.print("[bold magenta]Used Tool:[/bold magenta]", end="")
263
- # we need to print this separately with markup=False because it contains arbitrary text and we don't want console.print to interpret it
264
- console.print(
265
- f"{tool_call.description}. Output=\n{tool_call.result}",
266
- markup=False,
267
- )
268
-
269
- console.print("[bold green]AI:[/bold green]", end=" ")
270
- console.print(Markdown(result.result)) # type: ignore
271
- if add_separator:
272
- console.print(Rule())
273
-
274
- elif destination == DestinationType.SLACK:
275
- slack = config.create_slack_destination()
276
- slack.send_issue(issue, result)
277
-
278
-
279
160
  # TODO: add streaming output
280
161
  @app.command()
281
162
  def ask(
@@ -323,11 +204,31 @@ def ask(
323
204
  "-i/-n",
324
205
  help="Enter interactive mode after the initial question? For scripting, disable this with --no-interactive",
325
206
  ),
207
+ refresh_toolsets: bool = typer.Option(
208
+ False,
209
+ "--refresh-toolsets",
210
+ help="Refresh the toolsets status",
211
+ ),
212
+ trace: Optional[str] = typer.Option(
213
+ None,
214
+ "--trace",
215
+ help="Enable tracing to the specified provider (e.g., 'braintrust')",
216
+ ),
326
217
  ):
327
218
  """
328
219
  Ask any question and answer using available tools
329
220
  """
330
221
  console = init_logging(verbose) # type: ignore
222
+
223
+ # Detect and read piped input
224
+ piped_data = None
225
+ if not sys.stdin.isatty():
226
+ piped_data = sys.stdin.read().strip()
227
+ if interactive:
228
+ console.print(
229
+ "[bold yellow]Interactive mode disabled when reading piped input[/bold yellow]"
230
+ )
231
+ interactive = False
331
232
  config = Config.load_from_file(
332
233
  config_file,
333
234
  api_key=api_key,
@@ -338,8 +239,17 @@ def ask(
338
239
  slack_channel=slack_channel,
339
240
  )
340
241
 
242
+ # Create tracer if trace option is provided
243
+ tracer = TracingFactory.create_tracer(trace, project="HolmesGPT-CLI")
244
+ experiment_name = f"holmes-ask-{datetime.now().strftime('%Y%m%d_%H%M%S')}"
245
+ tracer.start_experiment(
246
+ experiment_name=experiment_name, metadata={"prompt": prompt or "holmes-ask"}
247
+ )
248
+
341
249
  ai = config.create_console_toolcalling_llm(
342
250
  dal=None, # type: ignore
251
+ refresh_toolsets=refresh_toolsets, # flag to refresh the toolset status
252
+ tracer=tracer,
343
253
  )
344
254
  template_context = {
345
255
  "toolsets": ai.tool_executor.toolsets,
@@ -360,11 +270,20 @@ def ask(
360
270
  console.print(
361
271
  f"[bold yellow]Loaded prompt from file {prompt_file}[/bold yellow]"
362
272
  )
363
- elif not prompt and not interactive:
273
+ elif not prompt and not interactive and not piped_data:
364
274
  raise typer.BadParameter(
365
275
  "Either the 'prompt' argument or the --prompt-file option must be provided (unless using --interactive mode)."
366
276
  )
367
277
 
278
+ # Handle piped data
279
+ if piped_data:
280
+ if prompt:
281
+ # User provided both piped data and a prompt
282
+ prompt = f"Here's some piped output:\n\n{piped_data}\n\n{prompt}"
283
+ else:
284
+ # Only piped data, no prompt - ask what to do with it
285
+ prompt = f"Here's some piped output:\n\n{piped_data}\n\nWhat can you tell me about this output?"
286
+
368
287
  if echo_request and not interactive and prompt:
369
288
  console.print("[bold yellow]User:[/bold yellow] " + prompt)
370
289
 
@@ -377,6 +296,7 @@ def ask(
377
296
  include_file,
378
297
  post_processing_prompt,
379
298
  show_tool_output,
299
+ tracer,
380
300
  )
381
301
  return
382
302
 
@@ -387,7 +307,16 @@ def ask(
387
307
  include_file,
388
308
  )
389
309
 
390
- response = ai.call(messages, post_processing_prompt)
310
+ with tracer.start_trace(
311
+ f'holmes ask "{prompt}"', span_type=SpanType.TASK
312
+ ) as trace_span:
313
+ trace_span.log(input=prompt, metadata={"type": "user_question"})
314
+ response = ai.call(messages, post_processing_prompt, trace_span=trace_span)
315
+ trace_span.log(
316
+ output=response.result,
317
+ )
318
+ trace_url = tracer.get_trace_url()
319
+
391
320
  messages = response.messages # type: ignore # Update messages with the full history
392
321
 
393
322
  if json_output_file:
@@ -410,6 +339,9 @@ def ask(
410
339
  False, # type: ignore
411
340
  )
412
341
 
342
+ if trace_url:
343
+ console.print(f"🔍 View trace: {trace_url}")
344
+
413
345
 
414
346
  @investigate_app.command()
415
347
  def alertmanager(
holmes/plugins/prompts/_fetch_logs.jinja2 CHANGED
@@ -3,6 +3,7 @@
3
3
  {%- set k8s_base_ts = toolsets | selectattr("name", "equalto", "kubernetes/logs") | selectattr("fetch_pod_logs", "defined") | first -%}
4
4
  {%- set k8s_yaml_ts = toolsets | selectattr("name", "equalto", "kubernetes/logs") | rejectattr("fetch_pod_logs", "defined") | first -%}
5
5
  {%- set opensearch_ts = toolsets | selectattr("name", "equalto", "opensearch/logs") | first -%}
6
+ {%- set datadog_ts = toolsets | selectattr("name", "equalto", "datadog/logs") | first -%}
6
7
 
7
8
  # Logs
8
9
  {% if loki_ts and loki_ts.status == "enabled" -%}
@@ -19,6 +20,8 @@
19
20
  {% include '_default_log_prompt.jinja2' %}
20
21
  {%- elif k8s_base_ts and k8s_base_ts.status == "enabled" -%}
21
22
  {% include '_default_log_prompt.jinja2' %}
23
+ {%- elif datadog_ts and datadog_ts.status == "enabled" -%}
24
+ {% include '_default_log_prompt.jinja2' %}
22
25
  {%- elif k8s_yaml_ts and k8s_yaml_ts.status == "enabled" -%}
23
26
  * if the user wants to find a specific term in a pod's logs, use kubectl_logs_grep
24
27
  * use both kubectl_previous_logs and kubectl_logs when reading logs. Treat the output of both as a single unified logs stream
@@ -33,4 +36,5 @@
33
36
  ** 'grafana/loki'
34
37
  ** 'opensearch/logs'
35
38
  ** 'coralogix/logs'
39
+ ** 'datadog/logs'
36
40
  {%- endif -%}
holmes/plugins/prompts/kubernetes_workload_ask.jinja2 CHANGED
@@ -10,7 +10,6 @@ Global Instructions
10
10
  You may receive a set of “Global Instructions” that describe how to perform certain tasks, handle certain situations, or apply certain best practices. They are not mandatory for every request, but serve as a reference resource and must be used if the current scenario or user request aligns with one of the described methods or conditions.
11
11
  Use these rules when deciding how to apply them:
12
12
 
13
- * If the user prompt includes Global Instructions, treat them as a reference resource.
14
13
  * Some Global Instructions may describe how to handle specific tasks or scenarios. If the user's current request or the instructions in a triple quotes section reference one of these tasks, ALWAYS follow the Global Instruction for that task.
15
14
  * Some Global Instructions may define general conditions that always apply if a certain scenario occurs (e.g., "whenever investigating a memory issue, always check resource limits"). If such a condition matches the current situation, apply the Global Instruction accordingly.
16
15
  * If user's prompt or the instructions in a triple quotes section direct you to perform a task (e.g., “Find owner”) and there is a Global Instruction on how to do that task, ALWAYS follow the Global Instructions on how to perform it.
@@ -41,10 +40,6 @@ In general:
41
40
  * do not give an answer like "Pod's node affinity/selector doesn't match any available nodes" because that doesn't include data on WHICH label doesn't match
42
41
  * if investigating an issue on many pods, there is no need to check more than 3 individual pods in the same deployment. pick up to a representative 3 from each deployment if relevant
43
42
  * if you find errors and warning in a pods logs and you believe they indicate a real issue. consider the pod as not healthy.
44
- * if the user says something isn't working, ALWAYS:
45
- ** use kubectl_describe on the owner workload + individual pods and look for any transient issues they might have been referring to
46
- ** check the application aspects by accessing the application logs and other relevant tools
47
- ** look for misconfigured ingresses/services etc
48
43
 
49
44
  {% include '_toolsets_instructions.jinja2' %}
50
45
 
@@ -53,9 +48,7 @@ In general:
53
48
  Style guide:
54
49
  * Be painfully concise.
55
50
  * Leave out "the" and filler words when possible.
56
- * Be terse but not at the expense of leaving out important data like the root cause and how to fix.
57
- * if asked by Global Instructions or instructions in a triple single quotes section to explicitly include something in the answer, don't leave it out.
58
- * return a json object with the following schema as a result:
51
+ * your answer should ONLY return a json object with the following schema as a result:
59
52
  {
60
53
  "type": "object",
61
54
  "properties": {
@@ -69,13 +62,12 @@ Style guide:
69
62
  }
70
63
  },
71
64
  "required": [
72
- "reasoning",
65
+ "root_cause_summary",
73
66
  "workload_healthy"
74
67
  ]
75
68
  }
76
69
 
77
70
 
78
-
79
71
  {% if alerts %}
80
72
  Here are issues and configuration changes that happend to this kubernetes workload in recent time. Check if these can help you understand the issue.
81
73
  {% for a in alerts %}
holmes/plugins/toolsets/__init__.py CHANGED
@@ -14,7 +14,13 @@ from holmes.core.tools import Toolset, ToolsetType, ToolsetYamlFromConfig, YAMLT
14
14
  from holmes.plugins.toolsets.coralogix.toolset_coralogix_logs import (
15
15
  CoralogixLogsToolset,
16
16
  )
17
- from holmes.plugins.toolsets.datadog import DatadogToolset
17
+ from holmes.plugins.toolsets.datadog.toolset_datadog_logs import DatadogLogsToolset
18
+ from holmes.plugins.toolsets.datadog.toolset_datadog_metrics import (
19
+ DatadogMetricsToolset,
20
+ )
21
+ from holmes.plugins.toolsets.datadog.toolset_datadog_traces import (
22
+ DatadogTracesToolset,
23
+ )
18
24
  from holmes.plugins.toolsets.kubernetes_logs import KubernetesLogsToolset
19
25
  from holmes.plugins.toolsets.git import GitToolset
20
26
  from holmes.plugins.toolsets.grafana.toolset_grafana import GrafanaToolset
@@ -68,7 +74,9 @@ def load_python_toolsets(dal: Optional[SupabaseDal]) -> List[Toolset]:
68
74
  GrafanaToolset(),
69
75
  NotionToolset(),
70
76
  KafkaToolset(),
71
- DatadogToolset(),
77
+ DatadogLogsToolset(),
78
+ DatadogMetricsToolset(),
79
+ DatadogTracesToolset(),
72
80
  PrometheusToolset(),
73
81
  OpenSearchLogsToolset(),
74
82
  OpenSearchTracesToolset(),
holmes/plugins/toolsets/azure_sql/apis/azure_sql_api.py CHANGED
@@ -1,5 +1,4 @@
1
1
  from typing import Dict, List
2
- import pyodbc
3
2
  import logging
4
3
  import struct
5
4
  from azure.core.credentials import TokenCredential
@@ -38,6 +37,8 @@ class AzureSQLAPIClient:
38
37
  self, server_name: str, database_name: str, query: str
39
38
  ) -> List[Dict]:
40
39
  """Execute a T-SQL query against the Azure SQL database."""
40
+ import pyodbc # type: ignore
41
+
41
42
  conn = None
42
43
  cursor = None
43
44
 
holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py CHANGED
@@ -62,6 +62,9 @@ class CoralogixLogsToolset(BasePodLoggingToolset):
62
62
  def coralogix_config(self) -> Optional[CoralogixConfig]:
63
63
  return self.config
64
64
 
65
+ def logger_name(self) -> str:
66
+ return "Coralogix"
67
+
65
68
  def fetch_pod_logs(self, params: FetchPodLogsParams) -> StructuredToolResult:
66
69
  if not self.coralogix_config:
67
70
  return StructuredToolResult(
holmes/plugins/toolsets/datadog/datadog_api.py ADDED
@@ -0,0 +1,161 @@
1
+ import logging
2
+ from typing import Any, Optional, Dict
3
+ import requests # type: ignore
4
+ from pydantic import AnyUrl, BaseModel
5
+ from requests.structures import CaseInsensitiveDict # type: ignore
6
+ from tenacity import retry, retry_if_exception, stop_after_attempt, wait_incrementing
7
+ from tenacity.wait import wait_base
8
+
9
+
10
+ START_RETRY_DELAY = (
11
+ 5.0 # Initial fallback delay if datadog does not return a reset_time
12
+ )
13
+ INCREMENT_RETRY_DELAY = 5.0 # Delay increment after each rate limit, if datadog does not return a reset_time
14
+ MAX_RETRY_COUNT_ON_RATE_LIMIT = 5
15
+
16
+ RATE_LIMIT_REMAINING_SECONDS_HEADER = "X-RateLimit-Reset"
17
+
18
+
19
+ class DatadogBaseConfig(BaseModel):
20
+ """Base configuration for all Datadog toolsets"""
21
+
22
+ dd_api_key: str
23
+ dd_app_key: str
24
+ site_api_url: AnyUrl
25
+ request_timeout: int = 60
26
+
27
+
28
+ class DataDogRequestError(Exception):
29
+ payload: dict
30
+ status_code: int
31
+ response_text: str
32
+ response_headers: CaseInsensitiveDict[str]
33
+
34
+ def __init__(
35
+ self,
36
+ payload: dict,
37
+ status_code: int,
38
+ response_text: str,
39
+ response_headers: CaseInsensitiveDict[str],
40
+ ):
41
+ super().__init__(f"HTTP error: {status_code} - {response_text}")
42
+ self.payload = payload
43
+ self.status_code = status_code
44
+ self.response_text = response_text
45
+ self.response_headers = response_headers
46
+
47
+
48
+ def get_headers(dd_config: DatadogBaseConfig) -> Dict[str, str]:
49
+ """Get standard headers for Datadog API requests.
50
+
51
+ Args:
52
+ dd_config: Datadog configuration object
53
+
54
+ Returns:
55
+ Dictionary of headers for Datadog API requests
56
+ """
57
+ return {
58
+ "Content-Type": "application/json",
59
+ "DD-API-KEY": dd_config.dd_api_key,
60
+ "DD-APPLICATION-KEY": dd_config.dd_app_key,
61
+ }
62
+
63
+
64
+ def extract_cursor(data: dict) -> Optional[str]:
65
+ """Extract cursor for paginating through Datadog logs API responses."""
66
+ if data is None:
67
+ return None
68
+ meta = data.get("meta", {})
69
+ if meta is None:
70
+ return None
71
+ page = meta.get("page", {})
72
+ if page is None:
73
+ return None
74
+ return page.get("after", None)
75
+
76
+
77
+ class retry_if_http_429_error(retry_if_exception):
78
+ def __init__(self):
79
+ def is_http_429_error(exception):
80
+ return (
81
+ isinstance(exception, DataDogRequestError)
82
+ and exception.status_code == 429
83
+ )
84
+
85
+ super().__init__(predicate=is_http_429_error)
86
+
87
+
88
+ class wait_for_retry_after_header(wait_base):
89
+ def __init__(self, fallback):
90
+ self.fallback = fallback
91
+
92
+ def __call__(self, retry_state):
93
+ if retry_state.outcome:
94
+ exc = retry_state.outcome.exception()
95
+
96
+ if isinstance(exc, DataDogRequestError) and exc.response_headers.get(
97
+ RATE_LIMIT_REMAINING_SECONDS_HEADER
98
+ ):
99
+ reset_time_header = exc.response_headers.get(
100
+ RATE_LIMIT_REMAINING_SECONDS_HEADER
101
+ )
102
+ if reset_time_header:
103
+ try:
104
+ reset_time = int(reset_time_header)
105
+ wait_time = max(0, reset_time) + 0.1
106
+ return wait_time
107
+ except ValueError:
108
+ logging.warning(
109
+ f"Received invalid {RATE_LIMIT_REMAINING_SECONDS_HEADER} header value from datadog: {reset_time_header}"
110
+ )
111
+
112
+ return self.fallback(retry_state)
113
+
114
+
115
+ @retry(
116
+ retry=retry_if_http_429_error(),
117
+ wait=wait_for_retry_after_header(
118
+ fallback=wait_incrementing(
119
+ start=START_RETRY_DELAY, increment=INCREMENT_RETRY_DELAY
120
+ )
121
+ ),
122
+ stop=stop_after_attempt(MAX_RETRY_COUNT_ON_RATE_LIMIT),
123
+ before_sleep=lambda retry_state: logging.warning(
124
+ f"DataDog API rate limited. Retrying... "
125
+ f"(attempt {retry_state.attempt_number}/{MAX_RETRY_COUNT_ON_RATE_LIMIT})"
126
+ ),
127
+ reraise=True,
128
+ )
129
+ def execute_datadog_http_request(
130
+ url: str,
131
+ headers: dict,
132
+ payload_or_params: dict,
133
+ timeout: int,
134
+ method: str = "POST",
135
+ ) -> Any:
136
+ if method == "GET":
137
+ response = requests.get(
138
+ url, headers=headers, params=payload_or_params, timeout=timeout
139
+ )
140
+ else:
141
+ response = requests.post(
142
+ url, headers=headers, json=payload_or_params, timeout=timeout
143
+ )
144
+
145
+ if response.status_code == 200:
146
+ response_data = response.json()
147
+
148
+ if method == "POST" and response_data and "data" in response_data:
149
+ cursor = extract_cursor(response_data)
150
+ data = response_data.get("data", [])
151
+ return data, cursor
152
+ else:
153
+ return response_data
154
+
155
+ else:
156
+ raise DataDogRequestError(
157
+ payload=payload_or_params,
158
+ status_code=response.status_code,
159
+ response_text=response.text,
160
+ response_headers=response.headers,
161
+ )
holmes/plugins/toolsets/datadog/datadog_metrics_instructions.jinja2 ADDED
@@ -0,0 +1,26 @@
1
+ ## Datadog Metrics Tools Usage Guide
2
+
3
+ When investigating metrics-related issues:
4
+
5
+ 1. **Start with `list_active_datadog_metrics`** to discover available metrics
6
+ - Use filters like `host` or `tag_filter` to narrow results
7
+ - Default shows metrics from last 24 hours
8
+
9
+ 2. **Use `query_datadog_metrics`** to fetch actual metric data
10
+ - Query syntax: `metric_name{tag:value}`
11
+ - Example: `system.cpu.user{host:myhost}`
12
+ - Returns timeseries data with timestamps and values
13
+
14
+ 3. **Use `get_datadog_metric_metadata`** to understand metric properties
15
+ - Provides metric type (gauge/count/rate), unit, and description
16
+ - Accepts comma-separated list for batch queries
17
+
18
+ ### Time Parameters
19
+ - Use RFC3339 format: `2023-03-01T10:30:00Z`
20
+ - Or relative seconds: `-3600` for 1 hour ago
21
+ - Defaults to 1 hour window if not specified
22
+
23
+ ### Common Patterns
24
+ - CPU investigation: First list metrics with `tag_filter:kube_node_name:nodename`, then query specific metrics
25
+ - Memory issues: Look for `system.mem.*` or `kubernetes.memory.*` metrics
26
+ - Container metrics: Filter by pod/container tags