holmesgpt 0.12.4__py3-none-any.whl → 0.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of holmesgpt might be problematic.
- holmes/__init__.py +1 -1
- holmes/clients/robusta_client.py +19 -1
- holmes/common/env_vars.py +13 -0
- holmes/config.py +69 -9
- holmes/core/conversations.py +11 -0
- holmes/core/investigation.py +16 -3
- holmes/core/investigation_structured_output.py +12 -0
- holmes/core/llm.py +10 -0
- holmes/core/models.py +9 -1
- holmes/core/openai_formatting.py +72 -12
- holmes/core/prompt.py +13 -0
- holmes/core/supabase_dal.py +3 -0
- holmes/core/todo_manager.py +88 -0
- holmes/core/tool_calling_llm.py +121 -149
- holmes/core/tools.py +10 -1
- holmes/core/tools_utils/tool_executor.py +7 -2
- holmes/core/tools_utils/toolset_utils.py +7 -2
- holmes/core/tracing.py +8 -7
- holmes/interactive.py +1 -0
- holmes/main.py +2 -1
- holmes/plugins/prompts/__init__.py +7 -1
- holmes/plugins/prompts/_ai_safety.jinja2 +43 -0
- holmes/plugins/prompts/_current_date_time.jinja2 +1 -0
- holmes/plugins/prompts/_default_log_prompt.jinja2 +4 -2
- holmes/plugins/prompts/_fetch_logs.jinja2 +6 -1
- holmes/plugins/prompts/_general_instructions.jinja2 +16 -0
- holmes/plugins/prompts/_permission_errors.jinja2 +1 -1
- holmes/plugins/prompts/_toolsets_instructions.jinja2 +4 -4
- holmes/plugins/prompts/generic_ask.jinja2 +4 -3
- holmes/plugins/prompts/investigation_procedure.jinja2 +210 -0
- holmes/plugins/prompts/kubernetes_workload_ask.jinja2 +4 -0
- holmes/plugins/toolsets/__init__.py +19 -6
- holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +27 -0
- holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +2 -2
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +2 -1
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +3 -1
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +2 -1
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +2 -1
- holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +3 -1
- holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +2 -1
- holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +2 -1
- holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +2 -1
- holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +2 -1
- holmes/plugins/toolsets/coralogix/api.py +6 -6
- holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +7 -1
- holmes/plugins/toolsets/datadog/datadog_api.py +20 -8
- holmes/plugins/toolsets/datadog/datadog_metrics_instructions.jinja2 +8 -1
- holmes/plugins/toolsets/datadog/datadog_rds_instructions.jinja2 +82 -0
- holmes/plugins/toolsets/datadog/toolset_datadog_logs.py +12 -5
- holmes/plugins/toolsets/datadog/toolset_datadog_metrics.py +20 -11
- holmes/plugins/toolsets/datadog/toolset_datadog_rds.py +735 -0
- holmes/plugins/toolsets/datadog/toolset_datadog_traces.py +18 -11
- holmes/plugins/toolsets/git.py +15 -15
- holmes/plugins/toolsets/grafana/grafana_api.py +12 -1
- holmes/plugins/toolsets/grafana/toolset_grafana.py +5 -1
- holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +9 -4
- holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +12 -5
- holmes/plugins/toolsets/internet/internet.py +2 -1
- holmes/plugins/toolsets/internet/notion.py +2 -1
- holmes/plugins/toolsets/investigator/__init__.py +0 -0
- holmes/plugins/toolsets/investigator/core_investigation.py +157 -0
- holmes/plugins/toolsets/investigator/investigator_instructions.jinja2 +253 -0
- holmes/plugins/toolsets/investigator/model.py +15 -0
- holmes/plugins/toolsets/kafka.py +14 -7
- holmes/plugins/toolsets/kubernetes.yaml +7 -7
- holmes/plugins/toolsets/kubernetes_logs.py +454 -25
- holmes/plugins/toolsets/logging_utils/logging_api.py +115 -55
- holmes/plugins/toolsets/mcp/toolset_mcp.py +1 -1
- holmes/plugins/toolsets/newrelic.py +8 -3
- holmes/plugins/toolsets/opensearch/opensearch.py +8 -4
- holmes/plugins/toolsets/opensearch/opensearch_logs.py +9 -2
- holmes/plugins/toolsets/opensearch/opensearch_traces.py +6 -2
- holmes/plugins/toolsets/prometheus/prometheus.py +149 -44
- holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +8 -2
- holmes/plugins/toolsets/robusta/robusta.py +4 -4
- holmes/plugins/toolsets/runbook/runbook_fetcher.py +6 -5
- holmes/plugins/toolsets/servicenow/servicenow.py +18 -3
- holmes/plugins/toolsets/utils.py +8 -1
- holmes/utils/llms.py +20 -0
- holmes/utils/stream.py +90 -0
- {holmesgpt-0.12.4.dist-info → holmesgpt-0.13.0.dist-info}/METADATA +48 -35
- {holmesgpt-0.12.4.dist-info → holmesgpt-0.13.0.dist-info}/RECORD +85 -75
- holmes/utils/robusta.py +0 -9
- {holmesgpt-0.12.4.dist-info → holmesgpt-0.13.0.dist-info}/LICENSE.txt +0 -0
- {holmesgpt-0.12.4.dist-info → holmesgpt-0.13.0.dist-info}/WHEEL +0 -0
- {holmesgpt-0.12.4.dist-info → holmesgpt-0.13.0.dist-info}/entry_points.txt +0 -0

holmes/plugins/toolsets/prometheus/prometheus.py
CHANGED

@@ -3,12 +3,13 @@ import logging
 import os
 import re
 import time
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Tuple, Type, Union
 from urllib.parse import urljoin
 
 import requests  # type: ignore
 from pydantic import BaseModel, field_validator, Field, model_validator
 from requests import RequestException
+from requests_aws4auth import AWS4Auth
 
 from holmes.core.tools import (
     CallablePrerequisite,
@@ -25,14 +26,17 @@ from holmes.plugins.toolsets.utils import (
     get_param_or_raise,
     process_timestamps_to_rfc3339,
     standard_start_datetime_tool_param_description,
+    toolset_name_for_one_liner,
 )
 from holmes.utils.cache import TTLCache
 from holmes.common.env_vars import IS_OPENSHIFT
 from holmes.common.openshift import load_openshift_token
+from holmes.plugins.toolsets.logging_utils.logging_api import (
+    DEFAULT_TIME_SPAN_SECONDS,
+)
 from holmes.utils.keygen_utils import generate_random_key
 
 PROMETHEUS_RULES_CACHE_KEY = "cached_prometheus_rules"
-DEFAULT_TIME_SPAN_SECONDS = 3600
 
 
 class PrometheusConfig(BaseModel):
@@ -49,6 +53,7 @@ class PrometheusConfig(BaseModel):
     headers: Dict = Field(default_factory=dict)
     rules_cache_duration_seconds: Union[int, None] = 1800  # 30 minutes
     additional_labels: Optional[Dict[str, str]] = None
+    prometheus_ssl_enabled: bool = True
 
     @field_validator("prometheus_url")
     def ensure_trailing_slash(cls, v: Optional[str]) -> Optional[str]:
@@ -73,6 +78,32 @@ class PrometheusConfig(BaseModel):
 
         return self
 
+    def is_amp(self) -> bool:
+        return False
+
+    def get_auth(self) -> Any:
+        return None
+
+
+class AMPConfig(PrometheusConfig):
+    aws_access_key: str
+    aws_secret_access_key: str
+    aws_region: str
+    aws_service_name: str = "aps"
+    healthcheck: str = "api/v1/query?query=up"  # Override for AMP
+    prometheus_ssl_enabled: bool = False
+
+    def is_amp(self) -> bool:
+        return True
+
+    def get_auth(self):
+        return AWS4Auth(
+            self.aws_access_key,  # type: ignore
+            self.aws_secret_access_key,  # type: ignore
+            self.aws_region,  # type: ignore
+            self.aws_service_name,  # type: ignore
+        )
+
 
 class BasePrometheusTool(Tool):
     toolset: "PrometheusToolset"
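
Taken together, `AMPConfig.get_auth()` is what lets every `requests` call in this file carry AWS SigV4 signing for Amazon Managed Prometheus. A minimal standalone sketch of the same signing flow, assuming `requests_aws4auth` is installed; the workspace URL and credentials are placeholders:

import requests
from requests_aws4auth import AWS4Auth

# Placeholders: a real AMP workspace query endpoint and IAM credentials.
PROMETHEUS_URL = "https://aps-workspaces.us-east-1.amazonaws.com/workspaces/ws-12345/"

auth = AWS4Auth(
    "AKIA...",     # aws_access_key
    "secret...",   # aws_secret_access_key
    "us-east-1",   # aws_region
    "aps",         # aws_service_name, the default in AMPConfig
)

# Mirrors ExecuteInstantQuery below: POST api/v1/query with the auth object attached.
response = requests.post(
    PROMETHEUS_URL + "api/v1/query",
    data={"query": "up"},
    auth=auth,
    timeout=60,
)
print(response.json())
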
@@ -99,10 +130,15 @@ def filter_metrics_by_name(metrics: Dict, pattern: str) -> Dict:
 METRICS_SUFFIXES_TO_STRIP = ["_bucket", "_count", "_sum"]
 
 
-def fetch_metadata(prometheus_url: str, headers: Optional[Dict]) -> Dict:
+def fetch_metadata(
+    prometheus_url: str,
+    headers: Optional[Dict],
+    auth=None,
+    verify_ssl: bool = True,
+) -> Dict:
     metadata_url = urljoin(prometheus_url, "api/v1/metadata")
     metadata_response = requests.get(
-        metadata_url, headers=headers, timeout=60, verify=
+        metadata_url, headers=headers, timeout=60, verify=verify_ssl, auth=auth
     )
 
     metadata_response.raise_for_status()
@@ -124,13 +160,17 @@ def fetch_metadata(prometheus_url: str, headers: Optional[Dict]) -> Dict:
 
 
 def fetch_metadata_with_series_api(
-    prometheus_url: str,
+    prometheus_url: str,
+    metric_name: str,
+    headers: Dict,
+    auth=None,
+    verify_ssl: bool = True,
 ) -> Dict:
     url = urljoin(prometheus_url, "api/v1/series")
     params: Dict = {"match[]": f'{{__name__=~".*{metric_name}.*"}}', "limit": "10000"}
 
     response = requests.get(
-        url, headers=headers, timeout=60, params=params, verify=
+        url, headers=headers, timeout=60, params=params, auth=auth, verify=verify_ssl
     )
     response.raise_for_status()
     metrics = response.json()["data"]
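
Both metadata helpers now accept `auth` and `verify_ssl` and pass them straight through to `requests.get`. A small sketch of calling the updated helper directly, assuming holmesgpt 0.13.0 is installed and a Prometheus server answers at the placeholder URL:

from holmes.plugins.toolsets.prometheus.prometheus import fetch_metadata

# Placeholder URL; the trailing slash matters because the helpers use urljoin().
metadata = fetch_metadata(
    prometheus_url="http://localhost:9090/",
    headers={},
    auth=None,          # or an AWS4Auth instance for AMP
    verify_ssl=False,   # e.g. for self-signed certificates
)
# The metadata endpoint returns a dict keyed by metric name.
print(list(metadata)[:10])
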
@@ -172,6 +212,8 @@ def fetch_metrics_labels_with_series_api(
     cache: Optional[TTLCache],
     metrics_labels_time_window_hrs: Union[int, None],
     metric_name: str,
+    auth=None,
+    verify_ssl: bool = True,
 ) -> dict:
     """This is a slow query. Takes 5+ seconds to run"""
     cache_key = f"metrics_labels_series_api:{metric_name}"
@@ -188,7 +230,12 @@ def fetch_metrics_labels_with_series_api(
         params["start"] = params["end"] - (metrics_labels_time_window_hrs * 60 * 60)
 
     series_response = requests.get(
-        url=series_url,
+        url=series_url,
+        headers=headers,
+        params=params,
+        auth=auth,
+        timeout=60,
+        verify=verify_ssl,
     )
     series_response.raise_for_status()
     series = series_response.json()["data"]
@@ -214,6 +261,8 @@ def fetch_metrics_labels_with_labels_api(
     metrics_labels_time_window_hrs: Union[int, None],
     metric_names: List[str],
     headers: Dict,
+    auth=None,
+    verify_ssl: bool = True,
 ) -> dict:
     metrics_labels = {}
 
@@ -233,7 +282,12 @@ def fetch_metrics_labels_with_labels_api(
         params["start"] = params["end"] - (metrics_labels_time_window_hrs * 60 * 60)
 
     response = requests.get(
-        url=url,
+        url=url,
+        headers=headers,
+        params=params,
+        auth=auth,
+        timeout=60,
+        verify=verify_ssl,
     )
     response.raise_for_status()
     labels = response.json()["data"]
@@ -254,16 +308,27 @@ def fetch_metrics(
     should_fetch_labels_with_labels_api: bool,
     should_fetch_metadata_with_series_api: bool,
     headers: Dict,
+    auth=None,
+    verify_ssl: bool = True,
 ) -> dict:
     metrics = None
     should_fetch_labels = True
     if should_fetch_metadata_with_series_api:
         metrics = fetch_metadata_with_series_api(
-            prometheus_url=prometheus_url,
+            prometheus_url=prometheus_url,
+            metric_name=metric_name,
+            headers=headers,
+            auth=auth,
+            verify_ssl=verify_ssl,
         )
         should_fetch_labels = False  # series API returns the labels
     else:
-        metrics = fetch_metadata(
+        metrics = fetch_metadata(
+            prometheus_url=prometheus_url,
+            headers=headers,
+            auth=auth,
+            verify_ssl=verify_ssl,
+        )
         metrics = filter_metrics_by_name(metrics, metric_name)
 
     if should_fetch_labels:
@@ -275,6 +340,8 @@ def fetch_metrics(
             metrics_labels_time_window_hrs=metrics_labels_time_window_hrs,
             metric_names=list(metrics.keys()),
             headers=headers,
+            auth=auth,
+            verify_ssl=verify_ssl,
         )
     else:
         metrics_labels = fetch_metrics_labels_with_series_api(
@@ -283,6 +350,8 @@ def fetch_metrics(
             metrics_labels_time_window_hrs=metrics_labels_time_window_hrs,
             metric_name=metric_name,
             headers=headers,
+            auth=auth,
+            verify_ssl=verify_ssl,
         )
 
     for metric_name in metrics:
@@ -309,6 +378,12 @@ class ListPrometheusRules(BasePrometheusTool):
                 error="Prometheus is not configured. Prometheus URL is missing",
                 params=params,
             )
+        if self.toolset.config.is_amp():
+            return StructuredToolResult(
+                status=ToolResultStatus.ERROR,
+                error="Tool not supported in AMP",
+                params=params,
+            )
         if not self._cache and self.toolset.config.rules_cache_duration_seconds:
             self._cache = TTLCache(self.toolset.config.rules_cache_duration_seconds)  # type: ignore
         try:
@@ -330,8 +405,9 @@ class ListPrometheusRules(BasePrometheusTool):
             rules_response = requests.get(
                 url=rules_url,
                 params=params,
+                auth=self.toolset.config.get_auth(),
                 timeout=180,
-                verify=
+                verify=self.toolset.config.prometheus_ssl_enabled,
                 headers=self.toolset.config.headers,
             )
             rules_response.raise_for_status()
@@ -367,7 +443,7 @@ class ListPrometheusRules(BasePrometheusTool):
             )
 
     def get_parameterized_one_liner(self, params) -> str:
-        return "
+        return f"{toolset_name_for_one_liner(self.toolset.name)}: Fetch Rules"
 
 
 class ListAvailableMetrics(BasePrometheusTool):
@@ -424,6 +500,8 @@ class ListAvailableMetrics(BasePrometheusTool):
                 should_fetch_labels_with_labels_api=self.toolset.config.fetch_labels_with_labels_api,
                 should_fetch_metadata_with_series_api=self.toolset.config.fetch_metadata_with_series_api,
                 headers=self.toolset.config.headers,
+                auth=self.toolset.config.get_auth(),
+                verify_ssl=self.toolset.config.prometheus_ssl_enabled,
             )
 
             if params.get("type_filter"):
@@ -470,7 +548,8 @@ class ListAvailableMetrics(BasePrometheusTool):
         )
 
     def get_parameterized_one_liner(self, params) -> str:
-
+        name_filter = params.get("name_filter", "")
+        return f"{toolset_name_for_one_liner(self.toolset.name)}: Search Metrics ({name_filter})"
 
 
 class ExecuteInstantQuery(BasePrometheusTool):
@@ -509,7 +588,11 @@ class ExecuteInstantQuery(BasePrometheusTool):
             payload = {"query": query}
 
             response = requests.post(
-                url=url,
+                url=url,
+                headers=self.toolset.config.headers,
+                auth=self.toolset.config.get_auth(),
+                data=payload,
+                timeout=60,
             )
 
             if response.status_code == 200:
@@ -579,9 +662,8 @@ class ExecuteInstantQuery(BasePrometheusTool):
             )
 
     def get_parameterized_one_liner(self, params) -> str:
-        query = params.get("query")
-        description = params.get("description")
-        return f"Execute Prometheus Query (instant): promql='{query}', description='{description}'"
+        description = params.get("description", "")
+        return f"{toolset_name_for_one_liner(self.toolset.name)}: Query ({description})"
 
 
 class ExecuteRangeQuery(BasePrometheusTool):
@@ -654,7 +736,11 @@ class ExecuteRangeQuery(BasePrometheusTool):
             }
 
             response = requests.post(
-                url=url,
+                url=url,
+                headers=self.toolset.config.headers,
+                auth=self.toolset.config.get_auth(),
+                data=payload,
+                timeout=120,
             )
 
             if response.status_code == 200:
@@ -726,15 +812,13 @@ class ExecuteRangeQuery(BasePrometheusTool):
         )
 
     def get_parameterized_one_liner(self, params) -> str:
-        query = params.get("query")
-        start = params.get("start")
-        end = params.get("end")
-        step = params.get("step")
-        description = params.get("description")
-        return f"Execute Prometheus Query (range): promql='{query}', start={start}, end={end}, step={step}, description='{description}'"
+        description = params.get("description", "")
+        return f"{toolset_name_for_one_liner(self.toolset.name)}: Query ({description})"
 
 
 class PrometheusToolset(Toolset):
+    config: Optional[Union[PrometheusConfig, AMPConfig]] = None
+
     def __init__(self):
         super().__init__(
             name="prometheus/metrics",
@@ -760,28 +844,45 @@ class PrometheusToolset(Toolset):
         )
         self._load_llm_instructions(jinja_template=f"file://{template_file_path}")
 
-    def
-
-
-
-
+    def determine_prometheus_class(
+        self, config: dict[str, Any]
+    ) -> Type[Union[PrometheusConfig, AMPConfig]]:
+        has_aws_credentials = (
+            "aws_access_key" in config or "aws_secret_access_key" in config
+        )
+        return AMPConfig if has_aws_credentials else PrometheusConfig
 
-
-
-
+    def prerequisites_callable(self, config: dict[str, Any]) -> Tuple[bool, str]:
+        try:
+            if config:
+                config_cls = self.determine_prometheus_class(config)
+                self.config = config_cls(**config)  # type: ignore
+
+                self._reload_llm_instructions()
+                return self._is_healthy()
+        except Exception:
+            logging.exception("Failed to create prometheus config")
+            return False, "Failed to create prometheus config"
+        try:
+            prometheus_url = os.environ.get("PROMETHEUS_URL")
             if not prometheus_url:
-
-
-
-
+                prometheus_url = self.auto_detect_prometheus_url()
+                if not prometheus_url:
+                    return (
+                        False,
+                        "Unable to auto-detect prometheus. Define prometheus_url in the configuration for tool prometheus/metrics",
+                    )
 
-
-
-
-
-
-
-
+            self.config = PrometheusConfig(
+                prometheus_url=prometheus_url,
+                headers=add_prometheus_auth(os.environ.get("PROMETHEUS_AUTH_HEADER")),
+            )
+            logging.info(f"Prometheus auto discovered at url {prometheus_url}")
+            self._reload_llm_instructions()
+            return self._is_healthy()
+        except Exception as e:
+            logging.exception("Failed to set up prometheus")
+            return False, str(e)
 
     def auto_detect_prometheus_url(self) -> Optional[str]:
         url: Optional[str] = PrometheusDiscovery.find_prometheus_url()
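
Config class selection hinges only on whether AWS keys are present in the toolset config. A quick sketch, assuming the 0.13.0 package is installed; the workspace URL and credentials are placeholders:

from holmes.plugins.toolsets.prometheus.prometheus import PrometheusToolset

toolset = PrometheusToolset()

# Keys named aws_access_key / aws_secret_access_key route the config to AMPConfig.
amp_config = {
    "prometheus_url": "https://aps-workspaces.us-east-1.amazonaws.com/workspaces/ws-12345/",  # placeholder
    "aws_access_key": "AKIA...",           # placeholder
    "aws_secret_access_key": "secret...",  # placeholder
    "aws_region": "us-east-1",
}
print(toolset.determine_prometheus_class(amp_config))   # <class '...AMPConfig'>

# Without AWS keys, the plain PrometheusConfig is selected.
print(toolset.determine_prometheus_class({"prometheus_url": "http://localhost:9090/"}))  # <class '...PrometheusConfig'>
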
@@ -804,7 +905,11 @@ class PrometheusToolset(Toolset):
         url = urljoin(self.config.prometheus_url, self.config.healthcheck)
         try:
             response = requests.get(
-                url=url,
+                url=url,
+                headers=self.config.headers,
+                auth=self.config.get_auth(),
+                timeout=10,
+                verify=self.config.prometheus_ssl_enabled,
             )
 
             if response.status_code == 200:

holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py
CHANGED

@@ -21,6 +21,7 @@ from holmes.plugins.toolsets.rabbitmq.api import (
     get_cluster_status,
     make_request,
 )
+from holmes.plugins.toolsets.utils import toolset_name_for_one_liner
 
 
 class RabbitMQConfig(BaseModel):
@@ -80,7 +81,9 @@ class ListConfiguredClusters(BaseRabbitMQTool):
         )
 
     def get_parameterized_one_liner(self, params) -> str:
-        return
+        return (
+            f"{toolset_name_for_one_liner(self.toolset.name)}: List RabbitMQ Clusters"
+        )
 
 
 class GetRabbitMQClusterStatus(BaseRabbitMQTool):
@@ -116,7 +119,10 @@ class GetRabbitMQClusterStatus(BaseRabbitMQTool):
         )
 
     def get_parameterized_one_liner(self, params) -> str:
-
+        cluster_id = params.get("cluster_id", "")
+        if cluster_id:
+            return f"{toolset_name_for_one_liner(self.toolset.name)}: Get Cluster Status ({cluster_id})"
+        return f"{toolset_name_for_one_liner(self.toolset.name)}: Get Cluster Status"
 
 
 class RabbitMQToolset(Toolset):

holmes/plugins/toolsets/robusta/robusta.py
CHANGED

@@ -74,7 +74,7 @@ class FetchRobustaFinding(Tool):
         )
 
     def get_parameterized_one_liner(self, params: Dict) -> str:
-        return "Fetch Alert Metadata"
+        return "Robusta: Fetch Alert Metadata"
 
 
 class FetchResourceRecommendation(Tool):
@@ -138,7 +138,7 @@ class FetchResourceRecommendation(Tool):
         )
 
     def get_parameterized_one_liner(self, params: Dict) -> str:
-        return f"Check Historical Resource Utilization: ({str(params)})"
+        return f"Robusta: Check Historical Resource Utilization: ({str(params)})"
 
 
 class FetchConfigurationChanges(Tool):
@@ -196,14 +196,14 @@ class FetchConfigurationChanges(Tool):
         )
 
     def get_parameterized_one_liner(self, params: Dict) -> str:
-        return
+        return "Robusta: Search Change History"
 
 
 class RobustaToolset(Toolset):
     def __init__(self, dal: Optional[SupabaseDal]):
         dal_prereq = StaticPrerequisite(
             enabled=True if dal else False,
-            disabled_reason="
+            disabled_reason="Integration with Robusta cloud is disabled",
         )
         if dal:
             dal_prereq = StaticPrerequisite(

holmes/plugins/toolsets/runbook/runbook_fetcher.py
CHANGED

@@ -12,6 +12,7 @@ from holmes.core.tools import (
 )
 
 from holmes.plugins.runbooks import get_runbook_by_path, DEFAULT_RUNBOOK_SEARCH_PATH
+from holmes.plugins.toolsets.utils import toolset_name_for_one_liner
 
 
 # TODO(mainred): currently we support fetch runbooks hosted internally, in the future we may want to support fetching
@@ -82,10 +83,10 @@ class RunbookFetcher(Tool):
     4. ❌ *Could not analyze process mailbox sizes* - Observer tool not enabled in container. Enable remote shell or observer_cli for process introspection.
     5. ✅ *Check pod memory limits* - container limit 4Gi, requests 2Gi
     6. ✅ *Verify BEAM startup arguments* - `+S 4:4 +P 1048576`, no memory instrumentation flags enabled
-    7. ❌ *Could not retrieve APM traces* - Datadog traces toolset is disabled. You can enable it by following https://
-    8. ❌ *Could not query Erlang metrics* - Prometheus integration is not connected. Enable it via https://
+    7. ❌ *Could not retrieve APM traces* - Datadog traces toolset is disabled. You can enable it by following https://holmesgpt.dev/data-sources/builtin-toolsets/datadog/
+    8. ❌ *Could not query Erlang metrics* - Prometheus integration is not connected. Enable it via https://holmesgpt.dev/data-sources/builtin-toolsets/prometheus/
     9. ✅ *Examine recent deployments* - app version 2.1.3 deployed 4 hours ago, coincides with memory spike
-    10. ❌ *Could not check Stripe API status* - No toolset for Stripe integration exists. To monitor Stripe or similar third-party APIs, add a [custom toolset](https://
+    10. ❌ *Could not check Stripe API status* - No toolset for Stripe integration exists. To monitor Stripe or similar third-party APIs, add a [custom toolset](https://holmesgpt.dev/data-sources/custom-toolsets/) or use a [remote MCP server](https://holmesgpt.dev/data-sources/remote-mcp-servers/)
 
     **Root cause:** Memory leak in `gen_server` logic introduced in v2.1.3. BEAM VM hitting memory limit, causing out-of-memory crashes.
 
@@ -107,8 +108,8 @@ class RunbookFetcher(Tool):
     )
 
     def get_parameterized_one_liner(self, params) -> str:
-        path: str = params
-        return f"
+        path: str = params.get("link", "")
+        return f"{toolset_name_for_one_liner(self.toolset.name)}: Fetch Runbook {path}"
 
 
 class RunbookToolset(Toolset):

holmes/plugins/toolsets/servicenow/servicenow.py
CHANGED

@@ -15,9 +15,11 @@ from holmes.core.tools import StructuredToolResult, ToolResultStatus
 from holmes.plugins.toolsets.utils import (
     process_timestamps_to_rfc3339,
     standard_start_datetime_tool_param_description,
+    toolset_name_for_one_liner,
+)
+from holmes.plugins.toolsets.logging_utils.logging_api import (
+    DEFAULT_TIME_SPAN_SECONDS,
 )
-
-DEFAULT_TIME_SPAN_SECONDS = 3600
 
 
 class ServiceNowConfig(BaseModel):
@@ -92,7 +94,8 @@ class ServiceNowBaseTool(Tool):
     )
 
     def get_parameterized_one_liner(self, params) -> str:
-
+        # Default implementation - will be overridden by subclasses
+        return f"{toolset_name_for_one_liner(self.toolset.name)}: ServiceNow {self.name} {params}"
 
 
 class ReturnChangesInTimerange(ServiceNowBaseTool):
@@ -108,6 +111,10 @@ class ReturnChangesInTimerange(ServiceNowBaseTool):
         )
     }
 
+    def get_parameterized_one_liner(self, params) -> str:
+        start = params.get("start", "last hour")
+        return f"{toolset_name_for_one_liner(self.toolset.name)}: Get Change Requests ({start})"
+
     def _invoke(self, params: Any) -> StructuredToolResult:
         parsed_params = {}
         try:
@@ -147,6 +154,10 @@ class ReturnChange(ServiceNowBaseTool):
         )
     }
 
+    def get_parameterized_one_liner(self, params) -> str:
+        sys_id = params.get("sys_id", "")
+        return f"{toolset_name_for_one_liner(self.toolset.name)}: Get Change Details ({sys_id})"
+
     def _invoke(self, params: Any) -> StructuredToolResult:
         try:
             url = "https://{instance}.service-now.com/api/now/v2/table/change_request/{sys_id}".format(
@@ -175,6 +186,10 @@ class ReturnChangesWithKeyword(ServiceNowBaseTool):
         )
     }
 
+    def get_parameterized_one_liner(self, params) -> str:
+        keyword = params.get("keyword", "")
+        return f"{toolset_name_for_one_liner(self.toolset.name)}: Search Changes ({keyword})"
+
     def _invoke(self, params: Any) -> StructuredToolResult:
         parsed_params = {}
         try:
holmes/plugins/toolsets/utils.py
CHANGED

@@ -2,7 +2,7 @@ import datetime
 import time
 from typing import Dict, Optional, Tuple, Union
 
-from dateutil import parser
+from dateutil import parser
 
 
 def standard_start_datetime_tool_param_description(time_span_seconds: int):
@@ -139,3 +139,10 @@ def get_param_or_raise(dict: Dict, param: str) -> str:
     if not value:
         raise Exception(f'Missing param "{param}"')
     return value
+
+
+def toolset_name_for_one_liner(toolset_name: str) -> str:
+    name = toolset_name
+    if "/" in toolset_name:
+        name = toolset_name.split("/")[0]
+    return name.capitalize()
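
The new helper keeps only the segment before the first `/` and capitalizes it, which is where prefixes like `Prometheus:` in the one-liners above come from. A quick check (only `prometheus/metrics` is a toolset name confirmed by this diff; the other inputs are illustrative):

from holmes.plugins.toolsets.utils import toolset_name_for_one_liner

print(toolset_name_for_one_liner("prometheus/metrics"))  # -> "Prometheus"
print(toolset_name_for_one_liner("datadog/rds"))         # -> "Datadog" (illustrative name)
print(toolset_name_for_one_liner("runbook"))             # -> "Runbook" (no "/" to split on)
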
holmes/utils/llms.py
ADDED

@@ -0,0 +1,20 @@
+import fnmatch
+from typing import List
+
+
+def model_matches_list(model: str, model_list: List[str]) -> bool:
+    """
+    Check if a model matches any pattern in a list of model patterns.
+
+    Args:
+        model: The name of an LLM model (e.g., "azure/gpt", "openai/gpt-4o")
+        model_list: List of model patterns that may include wildcards
+            (e.g., ["azure/*", "*/mistral", "openai/gpt-*"])
+
+    Returns:
+        True if the model matches any pattern in the list, False otherwise
+    """
+    for pattern in model_list:
+        if fnmatch.fnmatchcase(model, pattern):
+            return True
+    return False
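
`fnmatch.fnmatchcase` gives case-sensitive shell-style globbing, so matching behaves as the docstring's examples suggest:

from holmes.utils.llms import model_matches_list

patterns = ["azure/*", "openai/gpt-*"]
print(model_matches_list("azure/gpt-4o", patterns))      # True, via "azure/*"
print(model_matches_list("openai/gpt-4o", patterns))     # True, via "openai/gpt-*"
print(model_matches_list("anthropic/claude", patterns))  # False
print(model_matches_list("AZURE/gpt", patterns))         # False: matching is case-sensitive
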
holmes/utils/stream.py
ADDED

@@ -0,0 +1,90 @@
+import json
+from enum import Enum
+from typing import Generator, Optional, List
+import litellm
+from pydantic import BaseModel, Field
+from holmes.core.investigation_structured_output import process_response_into_sections
+from functools import partial
+
+
+class StreamEvents(str, Enum):
+    ANSWER_END = "ai_answer_end"
+    START_TOOL = "start_tool_calling"
+    TOOL_RESULT = "tool_calling_result"
+    ERROR = "error"
+    AI_MESSAGE = "ai_message"
+
+
+class StreamMessage(BaseModel):
+    event: StreamEvents
+    data: dict = Field(default={})
+
+
+def create_sse_message(event_type: str, data: Optional[dict] = None):
+    if data is None:
+        data = {}
+    return f"event: {event_type}\ndata: {json.dumps(data)}\n\n"
+
+
+def create_sse_error_message(description: str, error_code: int, msg: str):
+    return create_sse_message(
+        StreamEvents.ERROR.value,
+        {
+            "description": description,
+            "error_code": error_code,
+            "msg": msg,
+            "success": False,
+        },
+    )
+
+
+create_rate_limit_error_message = partial(
+    create_sse_error_message,
+    error_code=5204,
+    msg="Rate limit exceeded",
+)
+
+
+def stream_investigate_formatter(
+    call_stream: Generator[StreamMessage, None, None], runbooks
+):
+    try:
+        for message in call_stream:
+            if message.event == StreamEvents.ANSWER_END:
+                (text_response, sections) = process_response_into_sections(  # type: ignore
+                    message.data.get("content")
+                )
+
+                yield create_sse_message(
+                    StreamEvents.ANSWER_END.value,
+                    {
+                        "sections": sections or {},
+                        "analysis": text_response,
+                        "instructions": runbooks or [],
+                    },
+                )
+            else:
+                yield create_sse_message(message.event.value, message.data)
+    except litellm.exceptions.RateLimitError as e:
+        yield create_rate_limit_error_message(str(e))
+
+
+def stream_chat_formatter(
+    call_stream: Generator[StreamMessage, None, None],
+    followups: Optional[List[dict]] = None,
+):
+    try:
+        for message in call_stream:
+            if message.event == StreamEvents.ANSWER_END:
+                yield create_sse_message(
+                    StreamEvents.ANSWER_END.value,
+                    {
+                        "analysis": message.data.get("content"),
+                        "conversation_history": message.data.get("messages"),
+                        "follow_up_actions": followups,
+                    },
+                )
+            else:
+                yield create_sse_message(message.event.value, message.data)
+    except litellm.exceptions.RateLimitError as e:
+        yield create_rate_limit_error_message(str(e))
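
For reference, `create_sse_message` produces standard Server-Sent Events framing (an `event:` line, a `data:` line with a JSON payload, then a blank line). A small sketch; the `data` fields here are illustrative and not dictated by this file:

from holmes.utils.stream import StreamEvents, StreamMessage, create_sse_message

msg = StreamMessage(
    event=StreamEvents.START_TOOL,
    data={"tool_name": "prometheus_query", "id": "call_1"},  # illustrative payload
)
print(create_sse_message(msg.event.value, msg.data), end="")
# event: start_tool_calling
# data: {"tool_name": "prometheus_query", "id": "call_1"}
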