holmesgpt 0.12.6__py3-none-any.whl → 0.13.1__py3-none-any.whl
- holmes/__init__.py +1 -1
- holmes/clients/robusta_client.py +19 -1
- holmes/common/env_vars.py +17 -0
- holmes/config.py +69 -9
- holmes/core/conversations.py +11 -0
- holmes/core/investigation.py +16 -3
- holmes/core/investigation_structured_output.py +12 -0
- holmes/core/llm.py +13 -1
- holmes/core/models.py +9 -1
- holmes/core/openai_formatting.py +72 -12
- holmes/core/prompt.py +13 -0
- holmes/core/supabase_dal.py +3 -0
- holmes/core/todo_manager.py +88 -0
- holmes/core/tool_calling_llm.py +230 -157
- holmes/core/tools.py +10 -1
- holmes/core/tools_utils/tool_executor.py +7 -2
- holmes/core/tools_utils/toolset_utils.py +7 -2
- holmes/core/toolset_manager.py +1 -5
- holmes/core/tracing.py +4 -3
- holmes/interactive.py +1 -0
- holmes/main.py +9 -2
- holmes/plugins/prompts/__init__.py +7 -1
- holmes/plugins/prompts/_current_date_time.jinja2 +1 -0
- holmes/plugins/prompts/_default_log_prompt.jinja2 +4 -2
- holmes/plugins/prompts/_fetch_logs.jinja2 +10 -1
- holmes/plugins/prompts/_general_instructions.jinja2 +14 -0
- holmes/plugins/prompts/_permission_errors.jinja2 +1 -1
- holmes/plugins/prompts/_toolsets_instructions.jinja2 +4 -4
- holmes/plugins/prompts/generic_ask.jinja2 +4 -3
- holmes/plugins/prompts/investigation_procedure.jinja2 +210 -0
- holmes/plugins/prompts/kubernetes_workload_ask.jinja2 +2 -0
- holmes/plugins/runbooks/CLAUDE.md +85 -0
- holmes/plugins/runbooks/README.md +24 -0
- holmes/plugins/toolsets/__init__.py +19 -6
- holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +27 -0
- holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +2 -2
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +2 -1
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +3 -1
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +2 -1
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +2 -1
- holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +3 -1
- holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +2 -1
- holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +2 -1
- holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +2 -1
- holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +2 -1
- holmes/plugins/toolsets/bash/argocd/__init__.py +65 -0
- holmes/plugins/toolsets/bash/argocd/constants.py +120 -0
- holmes/plugins/toolsets/bash/aws/__init__.py +66 -0
- holmes/plugins/toolsets/bash/aws/constants.py +529 -0
- holmes/plugins/toolsets/bash/azure/__init__.py +56 -0
- holmes/plugins/toolsets/bash/azure/constants.py +339 -0
- holmes/plugins/toolsets/bash/bash_instructions.jinja2 +6 -7
- holmes/plugins/toolsets/bash/bash_toolset.py +47 -13
- holmes/plugins/toolsets/bash/common/bash_command.py +131 -0
- holmes/plugins/toolsets/bash/common/stringify.py +14 -1
- holmes/plugins/toolsets/bash/common/validators.py +91 -0
- holmes/plugins/toolsets/bash/docker/__init__.py +59 -0
- holmes/plugins/toolsets/bash/docker/constants.py +255 -0
- holmes/plugins/toolsets/bash/helm/__init__.py +61 -0
- holmes/plugins/toolsets/bash/helm/constants.py +92 -0
- holmes/plugins/toolsets/bash/kubectl/__init__.py +80 -79
- holmes/plugins/toolsets/bash/kubectl/constants.py +0 -14
- holmes/plugins/toolsets/bash/kubectl/kubectl_describe.py +38 -56
- holmes/plugins/toolsets/bash/kubectl/kubectl_events.py +28 -76
- holmes/plugins/toolsets/bash/kubectl/kubectl_get.py +39 -99
- holmes/plugins/toolsets/bash/kubectl/kubectl_logs.py +34 -15
- holmes/plugins/toolsets/bash/kubectl/kubectl_run.py +1 -1
- holmes/plugins/toolsets/bash/kubectl/kubectl_top.py +38 -77
- holmes/plugins/toolsets/bash/parse_command.py +106 -32
- holmes/plugins/toolsets/bash/utilities/__init__.py +0 -0
- holmes/plugins/toolsets/bash/utilities/base64_util.py +12 -0
- holmes/plugins/toolsets/bash/utilities/cut.py +12 -0
- holmes/plugins/toolsets/bash/utilities/grep/__init__.py +10 -0
- holmes/plugins/toolsets/bash/utilities/head.py +12 -0
- holmes/plugins/toolsets/bash/utilities/jq.py +79 -0
- holmes/plugins/toolsets/bash/utilities/sed.py +164 -0
- holmes/plugins/toolsets/bash/utilities/sort.py +15 -0
- holmes/plugins/toolsets/bash/utilities/tail.py +12 -0
- holmes/plugins/toolsets/bash/utilities/tr.py +57 -0
- holmes/plugins/toolsets/bash/utilities/uniq.py +12 -0
- holmes/plugins/toolsets/bash/utilities/wc.py +12 -0
- holmes/plugins/toolsets/coralogix/api.py +6 -6
- holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +7 -1
- holmes/plugins/toolsets/datadog/datadog_api.py +20 -8
- holmes/plugins/toolsets/datadog/datadog_metrics_instructions.jinja2 +8 -1
- holmes/plugins/toolsets/datadog/datadog_rds_instructions.jinja2 +82 -0
- holmes/plugins/toolsets/datadog/toolset_datadog_logs.py +12 -5
- holmes/plugins/toolsets/datadog/toolset_datadog_metrics.py +20 -11
- holmes/plugins/toolsets/datadog/toolset_datadog_rds.py +735 -0
- holmes/plugins/toolsets/datadog/toolset_datadog_traces.py +18 -11
- holmes/plugins/toolsets/git.py +15 -15
- holmes/plugins/toolsets/grafana/grafana_api.py +12 -1
- holmes/plugins/toolsets/grafana/toolset_grafana.py +5 -1
- holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +9 -4
- holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +12 -5
- holmes/plugins/toolsets/internet/internet.py +2 -1
- holmes/plugins/toolsets/internet/notion.py +2 -1
- holmes/plugins/toolsets/investigator/__init__.py +0 -0
- holmes/plugins/toolsets/investigator/core_investigation.py +157 -0
- holmes/plugins/toolsets/investigator/investigator_instructions.jinja2 +253 -0
- holmes/plugins/toolsets/investigator/model.py +15 -0
- holmes/plugins/toolsets/kafka.py +14 -7
- holmes/plugins/toolsets/kubernetes_logs.py +454 -25
- holmes/plugins/toolsets/logging_utils/logging_api.py +115 -55
- holmes/plugins/toolsets/mcp/toolset_mcp.py +1 -1
- holmes/plugins/toolsets/newrelic.py +8 -3
- holmes/plugins/toolsets/opensearch/opensearch.py +8 -4
- holmes/plugins/toolsets/opensearch/opensearch_logs.py +9 -2
- holmes/plugins/toolsets/opensearch/opensearch_traces.py +6 -2
- holmes/plugins/toolsets/prometheus/prometheus.py +179 -44
- holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +8 -2
- holmes/plugins/toolsets/robusta/robusta.py +4 -4
- holmes/plugins/toolsets/runbook/runbook_fetcher.py +6 -5
- holmes/plugins/toolsets/servicenow/servicenow.py +18 -3
- holmes/plugins/toolsets/utils.py +8 -1
- holmes/utils/console/logging.py +6 -1
- holmes/utils/llms.py +20 -0
- holmes/utils/stream.py +90 -0
- {holmesgpt-0.12.6.dist-info → holmesgpt-0.13.1.dist-info}/METADATA +47 -34
- {holmesgpt-0.12.6.dist-info → holmesgpt-0.13.1.dist-info}/RECORD +123 -91
- holmes/plugins/toolsets/bash/grep/__init__.py +0 -52
- holmes/utils/robusta.py +0 -9
- {holmesgpt-0.12.6.dist-info → holmesgpt-0.13.1.dist-info}/LICENSE.txt +0 -0
- {holmesgpt-0.12.6.dist-info → holmesgpt-0.13.1.dist-info}/WHEEL +0 -0
- {holmesgpt-0.12.6.dist-info → holmesgpt-0.13.1.dist-info}/entry_points.txt +0 -0
holmes/plugins/toolsets/prometheus/prometheus.py
CHANGED

@@ -1,14 +1,16 @@
 import json
 import logging
+import boto3
 import os
 import re
 import time
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Tuple, Type, Union
 from urllib.parse import urljoin

 import requests  # type: ignore
 from pydantic import BaseModel, field_validator, Field, model_validator
 from requests import RequestException
+from requests_aws4auth import AWS4Auth

 from holmes.core.tools import (
     CallablePrerequisite,
@@ -25,14 +27,17 @@ from holmes.plugins.toolsets.utils import (
     get_param_or_raise,
     process_timestamps_to_rfc3339,
     standard_start_datetime_tool_param_description,
+    toolset_name_for_one_liner,
 )
 from holmes.utils.cache import TTLCache
 from holmes.common.env_vars import IS_OPENSHIFT
 from holmes.common.openshift import load_openshift_token
+from holmes.plugins.toolsets.logging_utils.logging_api import (
+    DEFAULT_TIME_SPAN_SECONDS,
+)
 from holmes.utils.keygen_utils import generate_random_key

 PROMETHEUS_RULES_CACHE_KEY = "cached_prometheus_rules"
-DEFAULT_TIME_SPAN_SECONDS = 3600


 class PrometheusConfig(BaseModel):
@@ -49,6 +54,7 @@ class PrometheusConfig(BaseModel):
     headers: Dict = Field(default_factory=dict)
     rules_cache_duration_seconds: Union[int, None] = 1800  # 30 minutes
     additional_labels: Optional[Dict[str, str]] = None
+    prometheus_ssl_enabled: bool = True

     @field_validator("prometheus_url")
     def ensure_trailing_slash(cls, v: Optional[str]) -> Optional[str]:
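
The new `prometheus_ssl_enabled` flag (default `True`) drives the `verify=` argument on every request the toolset makes. A minimal sketch, assuming the Pydantic model is constructed directly; the URL is illustrative, not from the diff:

    # Hedged sketch: exercising the new SSL flag on PrometheusConfig.
    from holmes.plugins.toolsets.prometheus.prometheus import PrometheusConfig

    config = PrometheusConfig(
        prometheus_url="http://prometheus.monitoring.svc:9090/",  # illustrative URL
        prometheus_ssl_enabled=False,  # skip TLS verification, e.g. for self-signed certs
    )
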
@@ -73,6 +79,63 @@ class PrometheusConfig(BaseModel):

         return self

+    def is_amp(self) -> bool:
+        return False
+
+    def get_auth(self) -> Any:
+        return None
+
+
+class AMPConfig(PrometheusConfig):
+    aws_access_key: Optional[str] = None
+    aws_secret_access_key: Optional[str] = None
+    aws_region: str
+    aws_service_name: str = "aps"
+    healthcheck: str = "api/v1/query?query=up"
+    prometheus_ssl_enabled: bool = False
+
+    def is_amp(self) -> bool:
+        return True
+
+    def _build_irsa_auth(self) -> Optional[AWS4Auth]:
+        """Try IRSA (or default AWS provider chain)."""
+        session = boto3.Session()
+        creds = session.get_credentials()
+        if creds is None:
+            return None
+        frozen = creds.get_frozen_credentials()
+        return AWS4Auth(
+            frozen.access_key,
+            frozen.secret_key,
+            self.aws_region,
+            self.aws_service_name,
+            session_token=frozen.token,
+        )
+
+    def _build_static_aws_auth(self) -> Optional[AWS4Auth]:
+        """Fallback: static credentials from config."""
+        if self.aws_access_key and self.aws_secret_access_key:
+            return AWS4Auth(
+                self.aws_access_key,
+                self.aws_secret_access_key,
+                self.aws_region,
+                self.aws_service_name,
+            )
+        return None
+
+    def get_auth(self):
+        # Prefer IRSA, fallback to static
+        irsa_auth = self._build_irsa_auth()
+        if irsa_auth:
+            return irsa_auth
+        static_auth = self._build_static_aws_auth()
+        if static_auth:
+            return static_auth
+        raise RuntimeError(
+            "No AWS credentials available. Tried IRSA and static keys. "
+            "Ensure IRSA is configured on the service account or provide aws_access_key/aws_secret_access_key."
+        )
+

 class BasePrometheusTool(Tool):
     toolset: "PrometheusToolset"
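
`AMPConfig.get_auth()` returns a `requests_aws4auth.AWS4Auth` signer that `requests` applies to each call. A minimal sketch of the same SigV4 flow outside the toolset, assuming credentials resolve via the default boto3 chain; the region and workspace URL are illustrative:

    # Hedged sketch: a SigV4-signed query against Amazon Managed Prometheus.
    import boto3
    import requests
    from requests_aws4auth import AWS4Auth

    session = boto3.Session()  # resolves IRSA / env vars / shared config
    frozen = session.get_credentials().get_frozen_credentials()  # assumes credentials exist
    auth = AWS4Auth(
        frozen.access_key,
        frozen.secret_key,
        "us-east-1",  # aws_region (illustrative)
        "aps",        # aws_service_name used by AMPConfig
        session_token=frozen.token,
    )
    resp = requests.get(
        "https://aps-workspaces.us-east-1.amazonaws.com/workspaces/ws-example/api/v1/query",  # hypothetical workspace
        params={"query": "up"},  # same query as AMPConfig's healthcheck
        auth=auth,
        timeout=60,
    )
    resp.raise_for_status()
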
@@ -99,10 +162,15 @@ def filter_metrics_by_name(metrics: Dict, pattern: str) -> Dict:
 METRICS_SUFFIXES_TO_STRIP = ["_bucket", "_count", "_sum"]


-def fetch_metadata(prometheus_url: str, headers: Optional[Dict]) -> Dict:
+def fetch_metadata(
+    prometheus_url: str,
+    headers: Optional[Dict],
+    auth=None,
+    verify_ssl: bool = True,
+) -> Dict:
     metadata_url = urljoin(prometheus_url, "api/v1/metadata")
     metadata_response = requests.get(
-        metadata_url, headers=headers, timeout=60, verify=
+        metadata_url, headers=headers, timeout=60, verify=verify_ssl, auth=auth
     )

     metadata_response.raise_for_status()
@@ -124,13 +192,17 @@ def fetch_metadata(prometheus_url: str, headers: Optional[Dict]) -> Dict


 def fetch_metadata_with_series_api(
-    prometheus_url: str,
+    prometheus_url: str,
+    metric_name: str,
+    headers: Dict,
+    auth=None,
+    verify_ssl: bool = True,
 ) -> Dict:
     url = urljoin(prometheus_url, "api/v1/series")
     params: Dict = {"match[]": f'{{__name__=~".*{metric_name}.*"}}', "limit": "10000"}

     response = requests.get(
-        url, headers=headers, timeout=60, params=params, verify=
+        url, headers=headers, timeout=60, params=params, auth=auth, verify=verify_ssl
     )
     response.raise_for_status()
     metrics = response.json()["data"]
@@ -172,6 +244,8 @@ def fetch_metrics_labels_with_series_api(
     cache: Optional[TTLCache],
     metrics_labels_time_window_hrs: Union[int, None],
     metric_name: str,
+    auth=None,
+    verify_ssl: bool = True,
 ) -> dict:
     """This is a slow query. Takes 5+ seconds to run"""
     cache_key = f"metrics_labels_series_api:{metric_name}"
@@ -188,7 +262,12 @@ def fetch_metrics_labels_with_series_api(
         params["start"] = params["end"] - (metrics_labels_time_window_hrs * 60 * 60)

     series_response = requests.get(
-        url=series_url,
+        url=series_url,
+        headers=headers,
+        params=params,
+        auth=auth,
+        timeout=60,
+        verify=verify_ssl,
     )
     series_response.raise_for_status()
     series = series_response.json()["data"]
@@ -214,6 +293,8 @@ def fetch_metrics_labels_with_labels_api(
     metrics_labels_time_window_hrs: Union[int, None],
     metric_names: List[str],
     headers: Dict,
+    auth=None,
+    verify_ssl: bool = True,
 ) -> dict:
     metrics_labels = {}

@@ -233,7 +314,12 @@
         params["start"] = params["end"] - (metrics_labels_time_window_hrs * 60 * 60)

     response = requests.get(
-        url=url,
+        url=url,
+        headers=headers,
+        params=params,
+        auth=auth,
+        timeout=60,
+        verify=verify_ssl,
     )
     response.raise_for_status()
     labels = response.json()["data"]
@@ -254,16 +340,27 @@ def fetch_metrics(
     should_fetch_labels_with_labels_api: bool,
     should_fetch_metadata_with_series_api: bool,
     headers: Dict,
+    auth=None,
+    verify_ssl: bool = True,
 ) -> dict:
     metrics = None
     should_fetch_labels = True
     if should_fetch_metadata_with_series_api:
         metrics = fetch_metadata_with_series_api(
-            prometheus_url=prometheus_url,
+            prometheus_url=prometheus_url,
+            metric_name=metric_name,
+            headers=headers,
+            auth=auth,
+            verify_ssl=verify_ssl,
         )
         should_fetch_labels = False  # series API returns the labels
     else:
-        metrics = fetch_metadata(
+        metrics = fetch_metadata(
+            prometheus_url=prometheus_url,
+            headers=headers,
+            auth=auth,
+            verify_ssl=verify_ssl,
+        )
         metrics = filter_metrics_by_name(metrics, metric_name)

     if should_fetch_labels:
@@ -275,6 +372,8 @@ def fetch_metrics(
             metrics_labels_time_window_hrs=metrics_labels_time_window_hrs,
             metric_names=list(metrics.keys()),
             headers=headers,
+            auth=auth,
+            verify_ssl=verify_ssl,
         )
     else:
         metrics_labels = fetch_metrics_labels_with_series_api(
@@ -283,6 +382,8 @@ def fetch_metrics(
             metrics_labels_time_window_hrs=metrics_labels_time_window_hrs,
             metric_name=metric_name,
             headers=headers,
+            auth=auth,
+            verify_ssl=verify_ssl,
         )

     for metric_name in metrics:
@@ -309,6 +410,12 @@ class ListPrometheusRules(BasePrometheusTool):
                 error="Prometheus is not configured. Prometheus URL is missing",
                 params=params,
             )
+        if self.toolset.config.is_amp():
+            return StructuredToolResult(
+                status=ToolResultStatus.ERROR,
+                error="Tool not supported in AMP",
+                params=params,
+            )
         if not self._cache and self.toolset.config.rules_cache_duration_seconds:
             self._cache = TTLCache(self.toolset.config.rules_cache_duration_seconds)  # type: ignore
         try:
@@ -330,8 +437,9 @@ class ListPrometheusRules(BasePrometheusTool):
             rules_response = requests.get(
                 url=rules_url,
                 params=params,
+                auth=self.toolset.config.get_auth(),
                 timeout=180,
-                verify=
+                verify=self.toolset.config.prometheus_ssl_enabled,
                 headers=self.toolset.config.headers,
             )
             rules_response.raise_for_status()
@@ -367,7 +475,7 @@ class ListPrometheusRules(BasePrometheusTool):
             )

     def get_parameterized_one_liner(self, params) -> str:
-        return "
+        return f"{toolset_name_for_one_liner(self.toolset.name)}: Fetch Rules"


 class ListAvailableMetrics(BasePrometheusTool):
@@ -424,6 +532,8 @@ class ListAvailableMetrics(BasePrometheusTool):
                 should_fetch_labels_with_labels_api=self.toolset.config.fetch_labels_with_labels_api,
                 should_fetch_metadata_with_series_api=self.toolset.config.fetch_metadata_with_series_api,
                 headers=self.toolset.config.headers,
+                auth=self.toolset.config.get_auth(),
+                verify_ssl=self.toolset.config.prometheus_ssl_enabled,
             )

             if params.get("type_filter"):
@@ -470,7 +580,8 @@ class ListAvailableMetrics(BasePrometheusTool):
             )

     def get_parameterized_one_liner(self, params) -> str:
-
+        name_filter = params.get("name_filter", "")
+        return f"{toolset_name_for_one_liner(self.toolset.name)}: Search Metrics ({name_filter})"


 class ExecuteInstantQuery(BasePrometheusTool):
@@ -509,7 +620,11 @@ class ExecuteInstantQuery(BasePrometheusTool):
             payload = {"query": query}

             response = requests.post(
-                url=url,
+                url=url,
+                headers=self.toolset.config.headers,
+                auth=self.toolset.config.get_auth(),
+                data=payload,
+                timeout=60,
             )

             if response.status_code == 200:
@@ -579,9 +694,8 @@ class ExecuteInstantQuery(BasePrometheusTool):
             )

     def get_parameterized_one_liner(self, params) -> str:
-
-
-        return f"Execute Prometheus Query (instant): promql='{query}', description='{description}'"
+        description = params.get("description", "")
+        return f"{toolset_name_for_one_liner(self.toolset.name)}: Query ({description})"


 class ExecuteRangeQuery(BasePrometheusTool):
@@ -654,7 +768,11 @@ class ExecuteRangeQuery(BasePrometheusTool):
             }

             response = requests.post(
-                url=url,
+                url=url,
+                headers=self.toolset.config.headers,
+                auth=self.toolset.config.get_auth(),
+                data=payload,
+                timeout=120,
             )

             if response.status_code == 200:
@@ -726,15 +844,13 @@ class ExecuteRangeQuery(BasePrometheusTool):
             )

     def get_parameterized_one_liner(self, params) -> str:
-
-
-        end = params.get("end")
-        step = params.get("step")
-        description = params.get("description")
-        return f"Execute Prometheus Query (range): promql='{query}', start={start}, end={end}, step={step}, description='{description}'"
+        description = params.get("description", "")
+        return f"{toolset_name_for_one_liner(self.toolset.name)}: Query ({description})"


 class PrometheusToolset(Toolset):
+    config: Optional[Union[PrometheusConfig, AMPConfig]] = None
+
     def __init__(self):
         super().__init__(
             name="prometheus/metrics",
@@ -760,28 +876,43 @@ class PrometheusToolset(Toolset):
         )
         self._load_llm_instructions(jinja_template=f"file://{template_file_path}")

-    def
-
-
-
+    def determine_prometheus_class(
+        self, config: dict[str, Any]
+    ) -> Type[Union[PrometheusConfig, AMPConfig]]:
+        has_aws_fields = "aws_region" in config
+        return AMPConfig if has_aws_fields else PrometheusConfig

-
-
-
+    def prerequisites_callable(self, config: dict[str, Any]) -> Tuple[bool, str]:
+        try:
+            if config:
+                config_cls = self.determine_prometheus_class(config)
+                self.config = config_cls(**config)  # type: ignore
+
+                self._reload_llm_instructions()
+                return self._is_healthy()
+        except Exception:
+            logging.exception("Failed to create prometheus config")
+            return False, "Failed to create prometheus config"
+        try:
+            prometheus_url = os.environ.get("PROMETHEUS_URL")
             if not prometheus_url:
-
-
-
-
+                prometheus_url = self.auto_detect_prometheus_url()
+                if not prometheus_url:
+                    return (
+                        False,
+                        "Unable to auto-detect prometheus. Define prometheus_url in the configuration for tool prometheus/metrics",
+                    )

-
-
-
-
-
-
-
+            self.config = PrometheusConfig(
+                prometheus_url=prometheus_url,
+                headers=add_prometheus_auth(os.environ.get("PROMETHEUS_AUTH_HEADER")),
+            )
+            logging.info(f"Prometheus auto discovered at url {prometheus_url}")
+            self._reload_llm_instructions()
+            return self._is_healthy()
+        except Exception as e:
+            logging.exception("Failed to set up prometheus")
+            return False, str(e)

     def auto_detect_prometheus_url(self) -> Optional[str]:
         url: Optional[str] = PrometheusDiscovery.find_prometheus_url()
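
The dispatch above hinges entirely on whether the incoming config dict carries `aws_region`. A quick sketch of both branches, using the `determine_prometheus_class` method as added in this diff (URLs are illustrative):

    # Hedged sketch: which config class prerequisites_callable will instantiate.
    from holmes.plugins.toolsets.prometheus.prometheus import (
        AMPConfig,
        PrometheusConfig,
        PrometheusToolset,
    )

    toolset = PrometheusToolset()
    amp_cfg = {
        "prometheus_url": "https://aps-workspaces.us-east-1.amazonaws.com/workspaces/ws-example/",  # hypothetical
        "aws_region": "us-east-1",
    }
    plain_cfg = {"prometheus_url": "http://prometheus.monitoring.svc:9090/"}

    assert toolset.determine_prometheus_class(amp_cfg) is AMPConfig
    assert toolset.determine_prometheus_class(plain_cfg) is PrometheusConfig
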
@@ -804,7 +935,11 @@ class PrometheusToolset(Toolset):
         url = urljoin(self.config.prometheus_url, self.config.healthcheck)
         try:
             response = requests.get(
-                url=url,
+                url=url,
+                headers=self.config.headers,
+                auth=self.config.get_auth(),
+                timeout=10,
+                verify=self.config.prometheus_ssl_enabled,
             )

             if response.status_code == 200:
holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py
CHANGED

@@ -21,6 +21,7 @@ from holmes.plugins.toolsets.rabbitmq.api import (
     get_cluster_status,
     make_request,
 )
+from holmes.plugins.toolsets.utils import toolset_name_for_one_liner


 class RabbitMQConfig(BaseModel):
@@ -80,7 +81,9 @@ class ListConfiguredClusters(BaseRabbitMQTool):
         )

     def get_parameterized_one_liner(self, params) -> str:
-        return
+        return (
+            f"{toolset_name_for_one_liner(self.toolset.name)}: List RabbitMQ Clusters"
+        )


 class GetRabbitMQClusterStatus(BaseRabbitMQTool):
@@ -116,7 +119,10 @@ class GetRabbitMQClusterStatus(BaseRabbitMQTool):
         )

     def get_parameterized_one_liner(self, params) -> str:
-
+        cluster_id = params.get("cluster_id", "")
+        if cluster_id:
+            return f"{toolset_name_for_one_liner(self.toolset.name)}: Get Cluster Status ({cluster_id})"
+        return f"{toolset_name_for_one_liner(self.toolset.name)}: Get Cluster Status"


 class RabbitMQToolset(Toolset):
holmes/plugins/toolsets/robusta/robusta.py
CHANGED

@@ -74,7 +74,7 @@ class FetchRobustaFinding(Tool):
         )

     def get_parameterized_one_liner(self, params: Dict) -> str:
-        return "Fetch Alert Metadata"
+        return "Robusta: Fetch Alert Metadata"


 class FetchResourceRecommendation(Tool):
@@ -138,7 +138,7 @@ class FetchResourceRecommendation(Tool):
         )

     def get_parameterized_one_liner(self, params: Dict) -> str:
-        return f"Check Historical Resource Utilization: ({str(params)})"
+        return f"Robusta: Check Historical Resource Utilization: ({str(params)})"


 class FetchConfigurationChanges(Tool):
@@ -196,14 +196,14 @@ class FetchConfigurationChanges(Tool):
         )

     def get_parameterized_one_liner(self, params: Dict) -> str:
-        return
+        return "Robusta: Search Change History"


 class RobustaToolset(Toolset):
     def __init__(self, dal: Optional[SupabaseDal]):
         dal_prereq = StaticPrerequisite(
             enabled=True if dal else False,
-            disabled_reason="
+            disabled_reason="Integration with Robusta cloud is disabled",
         )
         if dal:
             dal_prereq = StaticPrerequisite(
holmes/plugins/toolsets/runbook/runbook_fetcher.py
CHANGED

@@ -12,6 +12,7 @@ from holmes.core.tools import (
 )

 from holmes.plugins.runbooks import get_runbook_by_path, DEFAULT_RUNBOOK_SEARCH_PATH
+from holmes.plugins.toolsets.utils import toolset_name_for_one_liner


 # TODO(mainred): currently we support fetch runbooks hosted internally, in the future we may want to support fetching
@@ -82,10 +83,10 @@ class RunbookFetcher(Tool):
 4. ❌ *Could not analyze process mailbox sizes* - Observer tool not enabled in container. Enable remote shell or observer_cli for process introspection.
 5. ✅ *Check pod memory limits* - container limit 4Gi, requests 2Gi
 6. ✅ *Verify BEAM startup arguments* - `+S 4:4 +P 1048576`, no memory instrumentation flags enabled
-7. ❌ *Could not retrieve APM traces* - Datadog traces toolset is disabled. You can enable it by following https://
-8. ❌ *Could not query Erlang metrics* - Prometheus integration is not connected. Enable it via https://
+7. ❌ *Could not retrieve APM traces* - Datadog traces toolset is disabled. You can enable it by following https://holmesgpt.dev/data-sources/builtin-toolsets/datadog/
+8. ❌ *Could not query Erlang metrics* - Prometheus integration is not connected. Enable it via https://holmesgpt.dev/data-sources/builtin-toolsets/prometheus/
 9. ✅ *Examine recent deployments* - app version 2.1.3 deployed 4 hours ago, coincides with memory spike
-10. ❌ *Could not check Stripe API status* - No toolset for Stripe integration exists. To monitor Stripe or similar third-party APIs, add a [custom toolset](https://
+10. ❌ *Could not check Stripe API status* - No toolset for Stripe integration exists. To monitor Stripe or similar third-party APIs, add a [custom toolset](https://holmesgpt.dev/data-sources/custom-toolsets/) or use a [remote MCP server](https://holmesgpt.dev/data-sources/remote-mcp-servers/)

 **Root cause:** Memory leak in `gen_server` logic introduced in v2.1.3. BEAM VM hitting memory limit, causing out-of-memory crashes.
@@ -107,8 +108,8 @@ class RunbookFetcher(Tool):
         )

     def get_parameterized_one_liner(self, params) -> str:
-        path: str = params
-        return f"
+        path: str = params.get("link", "")
+        return f"{toolset_name_for_one_liner(self.toolset.name)}: Fetch Runbook {path}"


 class RunbookToolset(Toolset):
holmes/plugins/toolsets/servicenow/servicenow.py
CHANGED

@@ -15,9 +15,11 @@ from holmes.core.tools import StructuredToolResult, ToolResultStatus
 from holmes.plugins.toolsets.utils import (
     process_timestamps_to_rfc3339,
     standard_start_datetime_tool_param_description,
+    toolset_name_for_one_liner,
+)
+from holmes.plugins.toolsets.logging_utils.logging_api import (
+    DEFAULT_TIME_SPAN_SECONDS,
 )
-
-DEFAULT_TIME_SPAN_SECONDS = 3600


 class ServiceNowConfig(BaseModel):
@@ -92,7 +94,8 @@ class ServiceNowBaseTool(Tool):
         )

     def get_parameterized_one_liner(self, params) -> str:
-
+        # Default implementation - will be overridden by subclasses
+        return f"{toolset_name_for_one_liner(self.toolset.name)}: ServiceNow {self.name} {params}"


 class ReturnChangesInTimerange(ServiceNowBaseTool):
@@ -108,6 +111,10 @@ class ReturnChangesInTimerange(ServiceNowBaseTool):
         )
     }

+    def get_parameterized_one_liner(self, params) -> str:
+        start = params.get("start", "last hour")
+        return f"{toolset_name_for_one_liner(self.toolset.name)}: Get Change Requests ({start})"
+
     def _invoke(self, params: Any) -> StructuredToolResult:
         parsed_params = {}
         try:
@@ -147,6 +154,10 @@ class ReturnChange(ServiceNowBaseTool):
         )
     }

+    def get_parameterized_one_liner(self, params) -> str:
+        sys_id = params.get("sys_id", "")
+        return f"{toolset_name_for_one_liner(self.toolset.name)}: Get Change Details ({sys_id})"
+
     def _invoke(self, params: Any) -> StructuredToolResult:
         try:
             url = "https://{instance}.service-now.com/api/now/v2/table/change_request/{sys_id}".format(
@@ -175,6 +186,10 @@ class ReturnChangesWithKeyword(ServiceNowBaseTool):
         )
     }

+    def get_parameterized_one_liner(self, params) -> str:
+        keyword = params.get("keyword", "")
+        return f"{toolset_name_for_one_liner(self.toolset.name)}: Search Changes ({keyword})"
+
     def _invoke(self, params: Any) -> StructuredToolResult:
         parsed_params = {}
         try:
holmes/plugins/toolsets/utils.py
CHANGED

@@ -2,7 +2,7 @@ import datetime
 import time
 from typing import Dict, Optional, Tuple, Union

-from dateutil import parser
+from dateutil import parser


 def standard_start_datetime_tool_param_description(time_span_seconds: int):
@@ -139,3 +139,10 @@ def get_param_or_raise(dict: Dict, param: str) -> str:
     if not value:
         raise Exception(f'Missing param "{param}"')
     return value
+
+
+def toolset_name_for_one_liner(toolset_name: str) -> str:
+    name = toolset_name
+    if "/" in toolset_name:
+        name = toolset_name.split("/")[0]
+    return name.capitalize()
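
This helper backs the many one-liner rewrites above: it keeps only the segment before the first `/` and capitalizes it, so namespaced toolset names render as a clean prefix. For example ("prometheus/metrics" is a name confirmed by this diff; "kafka" is illustrative):

    # Behavior of toolset_name_for_one_liner as added above.
    from holmes.plugins.toolsets.utils import toolset_name_for_one_liner

    assert toolset_name_for_one_liner("prometheus/metrics") == "Prometheus"
    assert toolset_name_for_one_liner("kafka") == "Kafka"
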
holmes/utils/console/logging.py
CHANGED

@@ -41,9 +41,14 @@ def suppress_noisy_logs():
     warnings.filterwarnings("ignore", category=UserWarning, module="slack_sdk.*")


-def init_logging(verbose_flags: Optional[List[bool]] = None):
+def init_logging(verbose_flags: Optional[List[bool]] = None, log_costs: bool = False):
     verbosity = cli_flags_to_verbosity(verbose_flags)  # type: ignore

+    # Setup cost logger if requested
+    if log_costs:
+        cost_logger = logging.getLogger("holmes.costs")
+        cost_logger.setLevel(logging.DEBUG)
+
     if verbosity == Verbosity.VERY_VERBOSE:
         logging.basicConfig(
             force=True,
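
The new `log_costs` flag only raises the level of a dedicated `holmes.costs` logger; emitting records to it remains the caller's job. A hedged sketch of the intended flow, with an illustrative record that is not taken from the diff:

    # Hedged sketch: enabling and writing to the "holmes.costs" logger.
    import logging

    from holmes.utils.console.logging import init_logging

    init_logging(verbose_flags=[], log_costs=True)
    logging.getLogger("holmes.costs").debug(
        "llm call cost_usd=%.4f model=%s", 0.0123, "openai/gpt-4o"  # illustrative values
    )
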
holmes/utils/llms.py
ADDED

@@ -0,0 +1,20 @@
+import fnmatch
+from typing import List
+
+
+def model_matches_list(model: str, model_list: List[str]) -> bool:
+    """
+    Check if a model matches any pattern in a list of model patterns.
+
+    Args:
+        model: The name of an LLM model (e.g., "azure/gpt", "openai/gpt-4o")
+        model_list: List of model patterns that may include wildcards
+            (e.g., ["azure/*", "*/mistral", "openai/gpt-*"])
+
+    Returns:
+        True if the model matches any pattern in the list, False otherwise
+    """
+    for pattern in model_list:
+        if fnmatch.fnmatchcase(model, pattern):
+            return True
+    return False
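
Matching uses `fnmatch.fnmatchcase`, so `*` also crosses the `provider/model` separator, as the docstring's patterns imply:

    # Usage per the docstring above (model names are illustrative).
    from holmes.utils.llms import model_matches_list

    assert model_matches_list("azure/gpt-4o", ["azure/*"])
    assert model_matches_list("bedrock/mistral", ["*/mistral"])
    assert not model_matches_list("openai/gpt-4o", ["azure/*", "*/mistral"])
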