holmesgpt 0.11.5__py3-none-any.whl → 0.12.0a0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of holmesgpt might be problematic. Click here for more details.
- holmes/__init__.py +1 -1
- holmes/common/env_vars.py +8 -4
- holmes/config.py +54 -14
- holmes/core/investigation_structured_output.py +7 -0
- holmes/core/llm.py +14 -4
- holmes/core/models.py +24 -0
- holmes/core/tool_calling_llm.py +48 -6
- holmes/core/tools.py +7 -4
- holmes/core/toolset_manager.py +24 -5
- holmes/core/tracing.py +224 -0
- holmes/interactive.py +761 -44
- holmes/main.py +59 -127
- holmes/plugins/prompts/_fetch_logs.jinja2 +4 -0
- holmes/plugins/prompts/kubernetes_workload_ask.jinja2 +2 -10
- holmes/plugins/toolsets/__init__.py +10 -2
- holmes/plugins/toolsets/azure_sql/apis/azure_sql_api.py +2 -1
- holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +3 -0
- holmes/plugins/toolsets/datadog/datadog_api.py +161 -0
- holmes/plugins/toolsets/datadog/datadog_metrics_instructions.jinja2 +26 -0
- holmes/plugins/toolsets/datadog/datadog_traces_formatter.py +310 -0
- holmes/plugins/toolsets/datadog/instructions_datadog_traces.jinja2 +51 -0
- holmes/plugins/toolsets/datadog/toolset_datadog_logs.py +267 -0
- holmes/plugins/toolsets/datadog/toolset_datadog_metrics.py +488 -0
- holmes/plugins/toolsets/datadog/toolset_datadog_traces.py +689 -0
- holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +3 -0
- holmes/plugins/toolsets/internet/internet.py +1 -1
- holmes/plugins/toolsets/logging_utils/logging_api.py +9 -3
- holmes/plugins/toolsets/opensearch/opensearch_logs.py +3 -0
- holmes/plugins/toolsets/utils.py +6 -2
- holmes/utils/cache.py +4 -4
- holmes/utils/console/consts.py +2 -0
- holmes/utils/console/logging.py +95 -0
- holmes/utils/console/result.py +37 -0
- holmes/utils/robusta.py +2 -3
- {holmesgpt-0.11.5.dist-info → holmesgpt-0.12.0a0.dist-info}/METADATA +3 -4
- {holmesgpt-0.11.5.dist-info → holmesgpt-0.12.0a0.dist-info}/RECORD +39 -30
- {holmesgpt-0.11.5.dist-info → holmesgpt-0.12.0a0.dist-info}/WHEEL +1 -1
- holmes/__init__.py.bak +0 -76
- holmes/plugins/toolsets/datadog.py +0 -153
- {holmesgpt-0.11.5.dist-info → holmesgpt-0.12.0a0.dist-info}/LICENSE.txt +0 -0
- {holmesgpt-0.11.5.dist-info → holmesgpt-0.12.0a0.dist-info}/entry_points.txt +0 -0
holmes/main.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
# ruff: noqa: E402
|
|
2
2
|
import os
|
|
3
|
+
import sys
|
|
3
4
|
|
|
4
5
|
from holmes.utils.cert_utils import add_custom_certificate
|
|
5
6
|
|
|
@@ -15,14 +16,11 @@ import json
|
|
|
15
16
|
import logging
|
|
16
17
|
import socket
|
|
17
18
|
import uuid
|
|
18
|
-
import warnings
|
|
19
|
-
from enum import Enum
|
|
19
|
+
from datetime import datetime
|
|
20
20
|
from pathlib import Path
|
|
21
21
|
from typing import List, Optional
|
|
22
22
|
|
|
23
23
|
import typer
|
|
24
|
-
from rich.console import Console
|
|
25
|
-
from rich.logging import RichHandler
|
|
26
24
|
from rich.markdown import Markdown
|
|
27
25
|
from rich.rule import Rule
|
|
28
26
|
|
|
@@ -35,13 +33,16 @@ from holmes.config import (
|
|
|
35
33
|
)
|
|
36
34
|
from holmes.core.prompt import build_initial_ask_messages
|
|
37
35
|
from holmes.core.resource_instruction import ResourceInstructionDocument
|
|
38
|
-
from holmes.core.tool_calling_llm import LLMResult
|
|
39
36
|
from holmes.core.tools import pretty_print_toolset_status
|
|
37
|
+
from holmes.core.tracing import SpanType, TracingFactory
|
|
40
38
|
from holmes.interactive import run_interactive_loop
|
|
41
39
|
from holmes.plugins.destinations import DestinationType
|
|
42
40
|
from holmes.plugins.interfaces import Issue
|
|
43
41
|
from holmes.plugins.prompts import load_and_render_prompt
|
|
44
42
|
from holmes.plugins.sources.opsgenie import OPSGENIE_TEAM_INTEGRATION_KEY_HELP
|
|
43
|
+
from holmes.utils.console.consts import system_prompt_help
|
|
44
|
+
from holmes.utils.console.logging import init_logging
|
|
45
|
+
from holmes.utils.console.result import handle_result
|
|
45
46
|
from holmes.utils.file_utils import write_json_file
|
|
46
47
|
|
|
47
48
|
app = typer.Typer(add_completion=False, pretty_exceptions_show_locals=False)
|
|
@@ -68,94 +69,6 @@ toolset_app = typer.Typer(
|
|
|
68
69
|
app.add_typer(toolset_app, name="toolset")
|
|
69
70
|
|
|
70
71
|
|
|
71
|
-
class Verbosity(Enum):
|
|
72
|
-
NORMAL = 0
|
|
73
|
-
LOG_QUERIES = 1 # TODO: currently unused
|
|
74
|
-
VERBOSE = 2
|
|
75
|
-
VERY_VERBOSE = 3
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
def cli_flags_to_verbosity(verbose_flags: List[bool]) -> Verbosity:
|
|
79
|
-
if verbose_flags is None or len(verbose_flags) == 0:
|
|
80
|
-
return Verbosity.NORMAL
|
|
81
|
-
elif len(verbose_flags) == 1:
|
|
82
|
-
return Verbosity.LOG_QUERIES
|
|
83
|
-
elif len(verbose_flags) == 2:
|
|
84
|
-
return Verbosity.VERBOSE
|
|
85
|
-
else:
|
|
86
|
-
return Verbosity.VERY_VERBOSE
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
def suppress_noisy_logs():
|
|
90
|
-
# disable INFO logs from OpenAI
|
|
91
|
-
logging.getLogger("httpx").setLevel(logging.WARNING)
|
|
92
|
-
# disable INFO logs from LiteLLM
|
|
93
|
-
logging.getLogger("LiteLLM").setLevel(logging.WARNING)
|
|
94
|
-
# disable INFO logs from AWS (relevant when using bedrock)
|
|
95
|
-
logging.getLogger("boto3").setLevel(logging.WARNING)
|
|
96
|
-
logging.getLogger("botocore").setLevel(logging.WARNING)
|
|
97
|
-
# when running in --verbose mode we don't want to see DEBUG logs from these libraries
|
|
98
|
-
logging.getLogger("openai._base_client").setLevel(logging.INFO)
|
|
99
|
-
logging.getLogger("httpcore").setLevel(logging.INFO)
|
|
100
|
-
logging.getLogger("markdown_it").setLevel(logging.INFO)
|
|
101
|
-
# suppress UserWarnings from the slack_sdk module
|
|
102
|
-
warnings.filterwarnings("ignore", category=UserWarning, module="slack_sdk.*")
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
def init_logging(verbose_flags: Optional[List[bool]] = None):
|
|
106
|
-
verbosity = cli_flags_to_verbosity(verbose_flags) # type: ignore
|
|
107
|
-
|
|
108
|
-
if verbosity == Verbosity.VERY_VERBOSE:
|
|
109
|
-
logging.basicConfig(
|
|
110
|
-
level=logging.DEBUG,
|
|
111
|
-
format="%(message)s",
|
|
112
|
-
handlers=[
|
|
113
|
-
RichHandler(
|
|
114
|
-
show_level=False,
|
|
115
|
-
markup=True,
|
|
116
|
-
show_time=False,
|
|
117
|
-
show_path=False,
|
|
118
|
-
console=Console(width=None),
|
|
119
|
-
)
|
|
120
|
-
],
|
|
121
|
-
)
|
|
122
|
-
elif verbosity == Verbosity.VERBOSE:
|
|
123
|
-
logging.basicConfig(
|
|
124
|
-
level=logging.INFO,
|
|
125
|
-
format="%(message)s",
|
|
126
|
-
handlers=[
|
|
127
|
-
RichHandler(
|
|
128
|
-
show_level=False,
|
|
129
|
-
markup=True,
|
|
130
|
-
show_time=False,
|
|
131
|
-
show_path=False,
|
|
132
|
-
console=Console(width=None),
|
|
133
|
-
)
|
|
134
|
-
],
|
|
135
|
-
)
|
|
136
|
-
logging.getLogger().setLevel(logging.DEBUG)
|
|
137
|
-
suppress_noisy_logs()
|
|
138
|
-
else:
|
|
139
|
-
logging.basicConfig(
|
|
140
|
-
level=logging.INFO,
|
|
141
|
-
format="%(message)s",
|
|
142
|
-
handlers=[
|
|
143
|
-
RichHandler(
|
|
144
|
-
show_level=False,
|
|
145
|
-
markup=True,
|
|
146
|
-
show_time=False,
|
|
147
|
-
show_path=False,
|
|
148
|
-
console=Console(width=None),
|
|
149
|
-
)
|
|
150
|
-
],
|
|
151
|
-
)
|
|
152
|
-
suppress_noisy_logs()
|
|
153
|
-
|
|
154
|
-
logging.debug(f"verbosity is {verbosity}")
|
|
155
|
-
|
|
156
|
-
return Console()
|
|
157
|
-
|
|
158
|
-
|
|
159
72
|
# Common cli options
|
|
160
73
|
# The defaults for options that are also in the config file MUST be None or else the cli defaults will override settings in the config file
|
|
161
74
|
opt_api_key: Optional[str] = typer.Option(
|
|
@@ -231,9 +144,6 @@ opt_documents: Optional[str] = typer.Option(
|
|
|
231
144
|
help="Additional documents to provide the LLM (typically URLs to runbooks)",
|
|
232
145
|
)
|
|
233
146
|
|
|
234
|
-
# Common help texts
|
|
235
|
-
system_prompt_help = "Advanced. System prompt for LLM. Values starting with builtin:// are loaded from holmes/plugins/prompts, values starting with file:// are loaded from the given path, other values are interpreted as a prompt string"
|
|
236
|
-
|
|
237
147
|
|
|
238
148
|
def parse_documents(documents: Optional[str]) -> List[ResourceInstructionDocument]:
|
|
239
149
|
resource_documents = []
|
|
@@ -247,35 +157,6 @@ def parse_documents(documents: Optional[str]) -> List[ResourceInstructionDocumen
|
|
|
247
157
|
return resource_documents
|
|
248
158
|
|
|
249
159
|
|
|
250
|
-
def handle_result(
|
|
251
|
-
result: LLMResult,
|
|
252
|
-
console: Console,
|
|
253
|
-
destination: DestinationType,
|
|
254
|
-
config: Config,
|
|
255
|
-
issue: Issue,
|
|
256
|
-
show_tool_output: bool,
|
|
257
|
-
add_separator: bool,
|
|
258
|
-
):
|
|
259
|
-
if destination == DestinationType.CLI:
|
|
260
|
-
if show_tool_output and result.tool_calls:
|
|
261
|
-
for tool_call in result.tool_calls:
|
|
262
|
-
console.print("[bold magenta]Used Tool:[/bold magenta]", end="")
|
|
263
|
-
# we need to print this separately with markup=False because it contains arbitrary text and we don't want console.print to interpret it
|
|
264
|
-
console.print(
|
|
265
|
-
f"{tool_call.description}. Output=\n{tool_call.result}",
|
|
266
|
-
markup=False,
|
|
267
|
-
)
|
|
268
|
-
|
|
269
|
-
console.print("[bold green]AI:[/bold green]", end=" ")
|
|
270
|
-
console.print(Markdown(result.result)) # type: ignore
|
|
271
|
-
if add_separator:
|
|
272
|
-
console.print(Rule())
|
|
273
|
-
|
|
274
|
-
elif destination == DestinationType.SLACK:
|
|
275
|
-
slack = config.create_slack_destination()
|
|
276
|
-
slack.send_issue(issue, result)
|
|
277
|
-
|
|
278
|
-
|
|
279
160
|
# TODO: add streaming output
|
|
280
161
|
@app.command()
|
|
281
162
|
def ask(
|
|
@@ -323,11 +204,31 @@ def ask(
|
|
|
323
204
|
"-i/-n",
|
|
324
205
|
help="Enter interactive mode after the initial question? For scripting, disable this with --no-interactive",
|
|
325
206
|
),
|
|
207
|
+
refresh_toolsets: bool = typer.Option(
|
|
208
|
+
False,
|
|
209
|
+
"--refresh-toolsets",
|
|
210
|
+
help="Refresh the toolsets status",
|
|
211
|
+
),
|
|
212
|
+
trace: Optional[str] = typer.Option(
|
|
213
|
+
None,
|
|
214
|
+
"--trace",
|
|
215
|
+
help="Enable tracing to the specified provider (e.g., 'braintrust')",
|
|
216
|
+
),
|
|
326
217
|
):
|
|
327
218
|
"""
|
|
328
219
|
Ask any question and answer using available tools
|
|
329
220
|
"""
|
|
330
221
|
console = init_logging(verbose) # type: ignore
|
|
222
|
+
|
|
223
|
+
# Detect and read piped input
|
|
224
|
+
piped_data = None
|
|
225
|
+
if not sys.stdin.isatty():
|
|
226
|
+
piped_data = sys.stdin.read().strip()
|
|
227
|
+
if interactive:
|
|
228
|
+
console.print(
|
|
229
|
+
"[bold yellow]Interactive mode disabled when reading piped input[/bold yellow]"
|
|
230
|
+
)
|
|
231
|
+
interactive = False
|
|
331
232
|
config = Config.load_from_file(
|
|
332
233
|
config_file,
|
|
333
234
|
api_key=api_key,
|
|
@@ -338,8 +239,17 @@ def ask(
|
|
|
338
239
|
slack_channel=slack_channel,
|
|
339
240
|
)
|
|
340
241
|
|
|
242
|
+
# Create tracer if trace option is provided
|
|
243
|
+
tracer = TracingFactory.create_tracer(trace, project="HolmesGPT-CLI")
|
|
244
|
+
experiment_name = f"holmes-ask-{datetime.now().strftime('%Y%m%d_%H%M%S')}"
|
|
245
|
+
tracer.start_experiment(
|
|
246
|
+
experiment_name=experiment_name, metadata={"prompt": prompt or "holmes-ask"}
|
|
247
|
+
)
|
|
248
|
+
|
|
341
249
|
ai = config.create_console_toolcalling_llm(
|
|
342
250
|
dal=None, # type: ignore
|
|
251
|
+
refresh_toolsets=refresh_toolsets, # flag to refresh the toolset status
|
|
252
|
+
tracer=tracer,
|
|
343
253
|
)
|
|
344
254
|
template_context = {
|
|
345
255
|
"toolsets": ai.tool_executor.toolsets,
|
|
@@ -360,11 +270,20 @@ def ask(
|
|
|
360
270
|
console.print(
|
|
361
271
|
f"[bold yellow]Loaded prompt from file {prompt_file}[/bold yellow]"
|
|
362
272
|
)
|
|
363
|
-
elif not prompt and not interactive:
|
|
273
|
+
elif not prompt and not interactive and not piped_data:
|
|
364
274
|
raise typer.BadParameter(
|
|
365
275
|
"Either the 'prompt' argument or the --prompt-file option must be provided (unless using --interactive mode)."
|
|
366
276
|
)
|
|
367
277
|
|
|
278
|
+
# Handle piped data
|
|
279
|
+
if piped_data:
|
|
280
|
+
if prompt:
|
|
281
|
+
# User provided both piped data and a prompt
|
|
282
|
+
prompt = f"Here's some piped output:\n\n{piped_data}\n\n{prompt}"
|
|
283
|
+
else:
|
|
284
|
+
# Only piped data, no prompt - ask what to do with it
|
|
285
|
+
prompt = f"Here's some piped output:\n\n{piped_data}\n\nWhat can you tell me about this output?"
|
|
286
|
+
|
|
368
287
|
if echo_request and not interactive and prompt:
|
|
369
288
|
console.print("[bold yellow]User:[/bold yellow] " + prompt)
|
|
370
289
|
|
|
@@ -377,6 +296,7 @@ def ask(
|
|
|
377
296
|
include_file,
|
|
378
297
|
post_processing_prompt,
|
|
379
298
|
show_tool_output,
|
|
299
|
+
tracer,
|
|
380
300
|
)
|
|
381
301
|
return
|
|
382
302
|
|
|
@@ -387,7 +307,16 @@ def ask(
|
|
|
387
307
|
include_file,
|
|
388
308
|
)
|
|
389
309
|
|
|
390
|
-
|
|
310
|
+
with tracer.start_trace(
|
|
311
|
+
f'holmes ask "{prompt}"', span_type=SpanType.TASK
|
|
312
|
+
) as trace_span:
|
|
313
|
+
trace_span.log(input=prompt, metadata={"type": "user_question"})
|
|
314
|
+
response = ai.call(messages, post_processing_prompt, trace_span=trace_span)
|
|
315
|
+
trace_span.log(
|
|
316
|
+
output=response.result,
|
|
317
|
+
)
|
|
318
|
+
trace_url = tracer.get_trace_url()
|
|
319
|
+
|
|
391
320
|
messages = response.messages # type: ignore # Update messages with the full history
|
|
392
321
|
|
|
393
322
|
if json_output_file:
|
|
@@ -410,6 +339,9 @@ def ask(
|
|
|
410
339
|
False, # type: ignore
|
|
411
340
|
)
|
|
412
341
|
|
|
342
|
+
if trace_url:
|
|
343
|
+
console.print(f"🔍 View trace: {trace_url}")
|
|
344
|
+
|
|
413
345
|
|
|
414
346
|
@investigate_app.command()
|
|
415
347
|
def alertmanager(
|
|
holmes/plugins/prompts/_fetch_logs.jinja2
CHANGED
@@ -3,6 +3,7 @@
|
|
|
3
3
|
{%- set k8s_base_ts = toolsets | selectattr("name", "equalto", "kubernetes/logs") | selectattr("fetch_pod_logs", "defined") | first -%}
|
|
4
4
|
{%- set k8s_yaml_ts = toolsets | selectattr("name", "equalto", "kubernetes/logs") | rejectattr("fetch_pod_logs", "defined") | first -%}
|
|
5
5
|
{%- set opensearch_ts = toolsets | selectattr("name", "equalto", "opensearch/logs") | first -%}
|
|
6
|
+
{%- set datadog_ts = toolsets | selectattr("name", "equalto", "datadog/logs") | first -%}
|
|
6
7
|
|
|
7
8
|
# Logs
|
|
8
9
|
{% if loki_ts and loki_ts.status == "enabled" -%}
|
|
@@ -19,6 +20,8 @@
|
|
|
19
20
|
{% include '_default_log_prompt.jinja2' %}
|
|
20
21
|
{%- elif k8s_base_ts and k8s_base_ts.status == "enabled" -%}
|
|
21
22
|
{% include '_default_log_prompt.jinja2' %}
|
|
23
|
+
{%- elif datadog_ts and datadog_ts.status == "enabled" -%}
|
|
24
|
+
{% include '_default_log_prompt.jinja2' %}
|
|
22
25
|
{%- elif k8s_yaml_ts and k8s_yaml_ts.status == "enabled" -%}
|
|
23
26
|
* if the user wants to find a specific term in a pod's logs, use kubectl_logs_grep
|
|
24
27
|
* use both kubectl_previous_logs and kubectl_logs when reading logs. Treat the output of both as a single unified logs stream
|
|
@@ -33,4 +36,5 @@
|
|
|
33
36
|
** 'grafana/loki'
|
|
34
37
|
** 'opensearch/logs'
|
|
35
38
|
** 'coralogix/logs'
|
|
39
|
+
** 'datadog/logs'
|
|
36
40
|
{%- endif -%}
|
|
holmes/plugins/prompts/kubernetes_workload_ask.jinja2
CHANGED
@@ -10,7 +10,6 @@ Global Instructions
|
|
|
10
10
|
You may receive a set of “Global Instructions” that describe how to perform certain tasks, handle certain situations, or apply certain best practices. They are not mandatory for every request, but serve as a reference resource and must be used if the current scenario or user request aligns with one of the described methods or conditions.
|
|
11
11
|
Use these rules when deciding how to apply them:
|
|
12
12
|
|
|
13
|
-
* If the user prompt includes Global Instructions, treat them as a reference resource.
|
|
14
13
|
* Some Global Instructions may describe how to handle specific tasks or scenarios. If the user's current request or the instructions in a triple quotes section reference one of these tasks, ALWAYS follow the Global Instruction for that task.
|
|
15
14
|
* Some Global Instructions may define general conditions that always apply if a certain scenario occurs (e.g., "whenever investigating a memory issue, always check resource limits"). If such a condition matches the current situation, apply the Global Instruction accordingly.
|
|
16
15
|
* If user's prompt or the instructions in a triple quotes section direct you to perform a task (e.g., “Find owner”) and there is a Global Instruction on how to do that task, ALWAYS follow the Global Instructions on how to perform it.
|
|
@@ -41,10 +40,6 @@ In general:
|
|
|
41
40
|
* do not give an answer like "Pod's node affinity/selector doesn't match any available nodes" because that doesn't include data on WHICH label doesn't match
|
|
42
41
|
* if investigating an issue on many pods, there is no need to check more than 3 individual pods in the same deployment. pick up to a representative 3 from each deployment if relevant
|
|
43
42
|
* if you find errors and warning in a pods logs and you believe they indicate a real issue. consider the pod as not healthy.
|
|
44
|
-
* if the user says something isn't working, ALWAYS:
|
|
45
|
-
** use kubectl_describe on the owner workload + individual pods and look for any transient issues they might have been referring to
|
|
46
|
-
** check the application aspects by accessing the application logs and other relevant tools
|
|
47
|
-
** look for misconfigured ingresses/services etc
|
|
48
43
|
|
|
49
44
|
{% include '_toolsets_instructions.jinja2' %}
|
|
50
45
|
|
|
@@ -53,9 +48,7 @@ In general:
|
|
|
53
48
|
Style guide:
|
|
54
49
|
* Be painfully concise.
|
|
55
50
|
* Leave out "the" and filler words when possible.
|
|
56
|
-
*
|
|
57
|
-
* if asked by Global Instructions or instructions in a triple single quotes section to explicitly include something in the answer, don't leave it out.
|
|
58
|
-
* return a json object with the following schema as a result:
|
|
51
|
+
* your answer should ONLY return a json object with the following schema as a result:
|
|
59
52
|
{
|
|
60
53
|
"type": "object",
|
|
61
54
|
"properties": {
|
|
@@ -69,13 +62,12 @@ Style guide:
|
|
|
69
62
|
}
|
|
70
63
|
},
|
|
71
64
|
"required": [
|
|
72
|
-
"
|
|
65
|
+
"root_cause_summary",
|
|
73
66
|
"workload_healthy"
|
|
74
67
|
]
|
|
75
68
|
}
|
|
76
69
|
|
|
77
70
|
|
|
78
|
-
|
|
79
71
|
{% if alerts %}
|
|
80
72
|
Here are issues and configuration changes that happend to this kubernetes workload in recent time. Check if these can help you understand the issue.
|
|
81
73
|
{% for a in alerts %}
|
|
holmes/plugins/toolsets/__init__.py
CHANGED
@@ -14,7 +14,13 @@ from holmes.core.tools import Toolset, ToolsetType, ToolsetYamlFromConfig, YAMLT
|
|
|
14
14
|
from holmes.plugins.toolsets.coralogix.toolset_coralogix_logs import (
|
|
15
15
|
CoralogixLogsToolset,
|
|
16
16
|
)
|
|
17
|
-
from holmes.plugins.toolsets.datadog import DatadogToolset
|
|
17
|
+
from holmes.plugins.toolsets.datadog.toolset_datadog_logs import DatadogLogsToolset
|
|
18
|
+
from holmes.plugins.toolsets.datadog.toolset_datadog_metrics import (
|
|
19
|
+
DatadogMetricsToolset,
|
|
20
|
+
)
|
|
21
|
+
from holmes.plugins.toolsets.datadog.toolset_datadog_traces import (
|
|
22
|
+
DatadogTracesToolset,
|
|
23
|
+
)
|
|
18
24
|
from holmes.plugins.toolsets.kubernetes_logs import KubernetesLogsToolset
|
|
19
25
|
from holmes.plugins.toolsets.git import GitToolset
|
|
20
26
|
from holmes.plugins.toolsets.grafana.toolset_grafana import GrafanaToolset
|
|
@@ -68,7 +74,9 @@ def load_python_toolsets(dal: Optional[SupabaseDal]) -> List[Toolset]:
|
|
|
68
74
|
GrafanaToolset(),
|
|
69
75
|
NotionToolset(),
|
|
70
76
|
KafkaToolset(),
|
|
71
|
-
|
|
77
|
+
DatadogLogsToolset(),
|
|
78
|
+
DatadogMetricsToolset(),
|
|
79
|
+
DatadogTracesToolset(),
|
|
72
80
|
PrometheusToolset(),
|
|
73
81
|
OpenSearchLogsToolset(),
|
|
74
82
|
OpenSearchTracesToolset(),
|
|
holmes/plugins/toolsets/azure_sql/apis/azure_sql_api.py
CHANGED
@@ -1,5 +1,4 @@
|
|
|
1
1
|
from typing import Dict, List
|
|
2
|
-
import pyodbc
|
|
3
2
|
import logging
|
|
4
3
|
import struct
|
|
5
4
|
from azure.core.credentials import TokenCredential
|
|
@@ -38,6 +37,8 @@ class AzureSQLAPIClient:
|
|
|
38
37
|
self, server_name: str, database_name: str, query: str
|
|
39
38
|
) -> List[Dict]:
|
|
40
39
|
"""Execute a T-SQL query against the Azure SQL database."""
|
|
40
|
+
import pyodbc # type: ignore
|
|
41
|
+
|
|
41
42
|
conn = None
|
|
42
43
|
cursor = None
|
|
43
44
|
|
|
holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py
CHANGED
@@ -62,6 +62,9 @@ class CoralogixLogsToolset(BasePodLoggingToolset):
|
|
|
62
62
|
def coralogix_config(self) -> Optional[CoralogixConfig]:
|
|
63
63
|
return self.config
|
|
64
64
|
|
|
65
|
+
def logger_name(self) -> str:
|
|
66
|
+
return "Coralogix"
|
|
67
|
+
|
|
65
68
|
def fetch_pod_logs(self, params: FetchPodLogsParams) -> StructuredToolResult:
|
|
66
69
|
if not self.coralogix_config:
|
|
67
70
|
return StructuredToolResult(
|
|
holmes/plugins/toolsets/datadog/datadog_api.py
ADDED
@@ -0,0 +1,161 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from typing import Any, Optional, Dict
|
|
3
|
+
import requests # type: ignore
|
|
4
|
+
from pydantic import AnyUrl, BaseModel
|
|
5
|
+
from requests.structures import CaseInsensitiveDict # type: ignore
|
|
6
|
+
from tenacity import retry, retry_if_exception, stop_after_attempt, wait_incrementing
|
|
7
|
+
from tenacity.wait import wait_base
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
START_RETRY_DELAY = (
|
|
11
|
+
5.0 # Initial fallback delay if datadog does not return a reset_time
|
|
12
|
+
)
|
|
13
|
+
INCREMENT_RETRY_DELAY = 5.0 # Delay increment after each rate limit, if datadog does not return a reset_time
|
|
14
|
+
MAX_RETRY_COUNT_ON_RATE_LIMIT = 5
|
|
15
|
+
|
|
16
|
+
RATE_LIMIT_REMAINING_SECONDS_HEADER = "X-RateLimit-Reset"
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class DatadogBaseConfig(BaseModel):
|
|
20
|
+
"""Base configuration for all Datadog toolsets"""
|
|
21
|
+
|
|
22
|
+
dd_api_key: str
|
|
23
|
+
dd_app_key: str
|
|
24
|
+
site_api_url: AnyUrl
|
|
25
|
+
request_timeout: int = 60
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class DataDogRequestError(Exception):
|
|
29
|
+
payload: dict
|
|
30
|
+
status_code: int
|
|
31
|
+
response_text: str
|
|
32
|
+
response_headers: CaseInsensitiveDict[str]
|
|
33
|
+
|
|
34
|
+
def __init__(
|
|
35
|
+
self,
|
|
36
|
+
payload: dict,
|
|
37
|
+
status_code: int,
|
|
38
|
+
response_text: str,
|
|
39
|
+
response_headers: CaseInsensitiveDict[str],
|
|
40
|
+
):
|
|
41
|
+
super().__init__(f"HTTP error: {status_code} - {response_text}")
|
|
42
|
+
self.payload = payload
|
|
43
|
+
self.status_code = status_code
|
|
44
|
+
self.response_text = response_text
|
|
45
|
+
self.response_headers = response_headers
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def get_headers(dd_config: DatadogBaseConfig) -> Dict[str, str]:
|
|
49
|
+
"""Get standard headers for Datadog API requests.
|
|
50
|
+
|
|
51
|
+
Args:
|
|
52
|
+
dd_config: Datadog configuration object
|
|
53
|
+
|
|
54
|
+
Returns:
|
|
55
|
+
Dictionary of headers for Datadog API requests
|
|
56
|
+
"""
|
|
57
|
+
return {
|
|
58
|
+
"Content-Type": "application/json",
|
|
59
|
+
"DD-API-KEY": dd_config.dd_api_key,
|
|
60
|
+
"DD-APPLICATION-KEY": dd_config.dd_app_key,
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def extract_cursor(data: dict) -> Optional[str]:
|
|
65
|
+
"""Extract cursor for paginating through Datadog logs API responses."""
|
|
66
|
+
if data is None:
|
|
67
|
+
return None
|
|
68
|
+
meta = data.get("meta", {})
|
|
69
|
+
if meta is None:
|
|
70
|
+
return None
|
|
71
|
+
page = meta.get("page", {})
|
|
72
|
+
if page is None:
|
|
73
|
+
return None
|
|
74
|
+
return page.get("after", None)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
class retry_if_http_429_error(retry_if_exception):
|
|
78
|
+
def __init__(self):
|
|
79
|
+
def is_http_429_error(exception):
|
|
80
|
+
return (
|
|
81
|
+
isinstance(exception, DataDogRequestError)
|
|
82
|
+
and exception.status_code == 429
|
|
83
|
+
)
|
|
84
|
+
|
|
85
|
+
super().__init__(predicate=is_http_429_error)
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
class wait_for_retry_after_header(wait_base):
|
|
89
|
+
def __init__(self, fallback):
|
|
90
|
+
self.fallback = fallback
|
|
91
|
+
|
|
92
|
+
def __call__(self, retry_state):
|
|
93
|
+
if retry_state.outcome:
|
|
94
|
+
exc = retry_state.outcome.exception()
|
|
95
|
+
|
|
96
|
+
if isinstance(exc, DataDogRequestError) and exc.response_headers.get(
|
|
97
|
+
RATE_LIMIT_REMAINING_SECONDS_HEADER
|
|
98
|
+
):
|
|
99
|
+
reset_time_header = exc.response_headers.get(
|
|
100
|
+
RATE_LIMIT_REMAINING_SECONDS_HEADER
|
|
101
|
+
)
|
|
102
|
+
if reset_time_header:
|
|
103
|
+
try:
|
|
104
|
+
reset_time = int(reset_time_header)
|
|
105
|
+
wait_time = max(0, reset_time) + 0.1
|
|
106
|
+
return wait_time
|
|
107
|
+
except ValueError:
|
|
108
|
+
logging.warning(
|
|
109
|
+
f"Received invalid {RATE_LIMIT_REMAINING_SECONDS_HEADER} header value from datadog: {reset_time_header}"
|
|
110
|
+
)
|
|
111
|
+
|
|
112
|
+
return self.fallback(retry_state)
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
@retry(
|
|
116
|
+
retry=retry_if_http_429_error(),
|
|
117
|
+
wait=wait_for_retry_after_header(
|
|
118
|
+
fallback=wait_incrementing(
|
|
119
|
+
start=START_RETRY_DELAY, increment=INCREMENT_RETRY_DELAY
|
|
120
|
+
)
|
|
121
|
+
),
|
|
122
|
+
stop=stop_after_attempt(MAX_RETRY_COUNT_ON_RATE_LIMIT),
|
|
123
|
+
before_sleep=lambda retry_state: logging.warning(
|
|
124
|
+
f"DataDog API rate limited. Retrying... "
|
|
125
|
+
f"(attempt {retry_state.attempt_number}/{MAX_RETRY_COUNT_ON_RATE_LIMIT})"
|
|
126
|
+
),
|
|
127
|
+
reraise=True,
|
|
128
|
+
)
|
|
129
|
+
def execute_datadog_http_request(
|
|
130
|
+
url: str,
|
|
131
|
+
headers: dict,
|
|
132
|
+
payload_or_params: dict,
|
|
133
|
+
timeout: int,
|
|
134
|
+
method: str = "POST",
|
|
135
|
+
) -> Any:
|
|
136
|
+
if method == "GET":
|
|
137
|
+
response = requests.get(
|
|
138
|
+
url, headers=headers, params=payload_or_params, timeout=timeout
|
|
139
|
+
)
|
|
140
|
+
else:
|
|
141
|
+
response = requests.post(
|
|
142
|
+
url, headers=headers, json=payload_or_params, timeout=timeout
|
|
143
|
+
)
|
|
144
|
+
|
|
145
|
+
if response.status_code == 200:
|
|
146
|
+
response_data = response.json()
|
|
147
|
+
|
|
148
|
+
if method == "POST" and response_data and "data" in response_data:
|
|
149
|
+
cursor = extract_cursor(response_data)
|
|
150
|
+
data = response_data.get("data", [])
|
|
151
|
+
return data, cursor
|
|
152
|
+
else:
|
|
153
|
+
return response_data
|
|
154
|
+
|
|
155
|
+
else:
|
|
156
|
+
raise DataDogRequestError(
|
|
157
|
+
payload=payload_or_params,
|
|
158
|
+
status_code=response.status_code,
|
|
159
|
+
response_text=response.text,
|
|
160
|
+
response_headers=response.headers,
|
|
161
|
+
)
|
|
holmes/plugins/toolsets/datadog/datadog_metrics_instructions.jinja2
ADDED
@@ -0,0 +1,26 @@
|
|
|
1
|
+
## Datadog Metrics Tools Usage Guide
|
|
2
|
+
|
|
3
|
+
When investigating metrics-related issues:
|
|
4
|
+
|
|
5
|
+
1. **Start with `list_active_datadog_metrics`** to discover available metrics
|
|
6
|
+
- Use filters like `host` or `tag_filter` to narrow results
|
|
7
|
+
- Default shows metrics from last 24 hours
|
|
8
|
+
|
|
9
|
+
2. **Use `query_datadog_metrics`** to fetch actual metric data
|
|
10
|
+
- Query syntax: `metric_name{tag:value}`
|
|
11
|
+
- Example: `system.cpu.user{host:myhost}`
|
|
12
|
+
- Returns timeseries data with timestamps and values
|
|
13
|
+
|
|
14
|
+
3. **Use `get_datadog_metric_metadata`** to understand metric properties
|
|
15
|
+
- Provides metric type (gauge/count/rate), unit, and description
|
|
16
|
+
- Accepts comma-separated list for batch queries
|
|
17
|
+
|
|
18
|
+
### Time Parameters
|
|
19
|
+
- Use RFC3339 format: `2023-03-01T10:30:00Z`
|
|
20
|
+
- Or relative seconds: `-3600` for 1 hour ago
|
|
21
|
+
- Defaults to 1 hour window if not specified
|
|
22
|
+
|
|
23
|
+
### Common Patterns
|
|
24
|
+
- CPU investigation: First list metrics with `tag_filter:kube_node_name:nodename`, then query specific metrics
|
|
25
|
+
- Memory issues: Look for `system.mem.*` or `kubernetes.memory.*` metrics
|
|
26
|
+
- Container metrics: Filter by pod/container tags
|