holmesgpt 0.13.2__py3-none-any.whl → 0.16.2a0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- holmes/__init__.py +1 -1
- holmes/clients/robusta_client.py +17 -4
- holmes/common/env_vars.py +40 -1
- holmes/config.py +114 -144
- holmes/core/conversations.py +53 -14
- holmes/core/feedback.py +191 -0
- holmes/core/investigation.py +18 -22
- holmes/core/llm.py +489 -88
- holmes/core/models.py +103 -1
- holmes/core/openai_formatting.py +13 -0
- holmes/core/prompt.py +1 -1
- holmes/core/safeguards.py +4 -4
- holmes/core/supabase_dal.py +293 -100
- holmes/core/tool_calling_llm.py +423 -323
- holmes/core/tools.py +311 -33
- holmes/core/tools_utils/token_counting.py +14 -0
- holmes/core/tools_utils/tool_context_window_limiter.py +57 -0
- holmes/core/tools_utils/tool_executor.py +13 -8
- holmes/core/toolset_manager.py +155 -4
- holmes/core/tracing.py +6 -1
- holmes/core/transformers/__init__.py +23 -0
- holmes/core/transformers/base.py +62 -0
- holmes/core/transformers/llm_summarize.py +174 -0
- holmes/core/transformers/registry.py +122 -0
- holmes/core/transformers/transformer.py +31 -0
- holmes/core/truncation/compaction.py +59 -0
- holmes/core/truncation/dal_truncation_utils.py +23 -0
- holmes/core/truncation/input_context_window_limiter.py +218 -0
- holmes/interactive.py +177 -24
- holmes/main.py +7 -4
- holmes/plugins/prompts/_fetch_logs.jinja2 +26 -1
- holmes/plugins/prompts/_general_instructions.jinja2 +1 -2
- holmes/plugins/prompts/_runbook_instructions.jinja2 +23 -12
- holmes/plugins/prompts/conversation_history_compaction.jinja2 +88 -0
- holmes/plugins/prompts/generic_ask.jinja2 +2 -4
- holmes/plugins/prompts/generic_ask_conversation.jinja2 +2 -1
- holmes/plugins/prompts/generic_ask_for_issue_conversation.jinja2 +2 -1
- holmes/plugins/prompts/generic_investigation.jinja2 +2 -1
- holmes/plugins/prompts/investigation_procedure.jinja2 +48 -0
- holmes/plugins/prompts/kubernetes_workload_ask.jinja2 +2 -1
- holmes/plugins/prompts/kubernetes_workload_chat.jinja2 +2 -1
- holmes/plugins/runbooks/__init__.py +117 -18
- holmes/plugins/runbooks/catalog.json +2 -0
- holmes/plugins/toolsets/__init__.py +21 -8
- holmes/plugins/toolsets/aks-node-health.yaml +46 -0
- holmes/plugins/toolsets/aks.yaml +64 -0
- holmes/plugins/toolsets/atlas_mongodb/mongodb_atlas.py +26 -36
- holmes/plugins/toolsets/azure_sql/azure_sql_toolset.py +0 -1
- holmes/plugins/toolsets/azure_sql/tools/analyze_connection_failures.py +10 -7
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_connections.py +9 -6
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_health_status.py +8 -6
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_performance.py +8 -6
- holmes/plugins/toolsets/azure_sql/tools/analyze_database_storage.py +9 -6
- holmes/plugins/toolsets/azure_sql/tools/get_active_alerts.py +9 -7
- holmes/plugins/toolsets/azure_sql/tools/get_slow_queries.py +9 -6
- holmes/plugins/toolsets/azure_sql/tools/get_top_cpu_queries.py +9 -6
- holmes/plugins/toolsets/azure_sql/tools/get_top_data_io_queries.py +9 -6
- holmes/plugins/toolsets/azure_sql/tools/get_top_log_io_queries.py +9 -6
- holmes/plugins/toolsets/bash/bash_toolset.py +10 -13
- holmes/plugins/toolsets/bash/common/bash.py +7 -7
- holmes/plugins/toolsets/cilium.yaml +284 -0
- holmes/plugins/toolsets/coralogix/toolset_coralogix_logs.py +5 -3
- holmes/plugins/toolsets/datadog/datadog_api.py +490 -24
- holmes/plugins/toolsets/datadog/datadog_logs_instructions.jinja2 +21 -10
- holmes/plugins/toolsets/datadog/toolset_datadog_general.py +349 -216
- holmes/plugins/toolsets/datadog/toolset_datadog_logs.py +190 -19
- holmes/plugins/toolsets/datadog/toolset_datadog_metrics.py +101 -44
- holmes/plugins/toolsets/datadog/toolset_datadog_rds.py +13 -16
- holmes/plugins/toolsets/datadog/toolset_datadog_traces.py +25 -31
- holmes/plugins/toolsets/git.py +51 -46
- holmes/plugins/toolsets/grafana/common.py +15 -3
- holmes/plugins/toolsets/grafana/grafana_api.py +46 -24
- holmes/plugins/toolsets/grafana/grafana_tempo_api.py +454 -0
- holmes/plugins/toolsets/grafana/loki/instructions.jinja2 +9 -0
- holmes/plugins/toolsets/grafana/loki/toolset_grafana_loki.py +117 -0
- holmes/plugins/toolsets/grafana/toolset_grafana.py +211 -91
- holmes/plugins/toolsets/grafana/toolset_grafana_dashboard.jinja2 +27 -0
- holmes/plugins/toolsets/grafana/toolset_grafana_tempo.jinja2 +246 -11
- holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +653 -293
- holmes/plugins/toolsets/grafana/trace_parser.py +1 -1
- holmes/plugins/toolsets/internet/internet.py +6 -7
- holmes/plugins/toolsets/internet/notion.py +5 -6
- holmes/plugins/toolsets/investigator/core_investigation.py +42 -34
- holmes/plugins/toolsets/kafka.py +25 -36
- holmes/plugins/toolsets/kubernetes.yaml +58 -84
- holmes/plugins/toolsets/kubernetes_logs.py +6 -6
- holmes/plugins/toolsets/kubernetes_logs.yaml +32 -0
- holmes/plugins/toolsets/logging_utils/logging_api.py +80 -4
- holmes/plugins/toolsets/mcp/toolset_mcp.py +181 -55
- holmes/plugins/toolsets/newrelic/__init__.py +0 -0
- holmes/plugins/toolsets/newrelic/new_relic_api.py +125 -0
- holmes/plugins/toolsets/newrelic/newrelic.jinja2 +41 -0
- holmes/plugins/toolsets/newrelic/newrelic.py +163 -0
- holmes/plugins/toolsets/opensearch/opensearch.py +10 -17
- holmes/plugins/toolsets/opensearch/opensearch_logs.py +7 -7
- holmes/plugins/toolsets/opensearch/opensearch_ppl_query_docs.jinja2 +1616 -0
- holmes/plugins/toolsets/opensearch/opensearch_query_assist.py +78 -0
- holmes/plugins/toolsets/opensearch/opensearch_query_assist_instructions.jinja2 +223 -0
- holmes/plugins/toolsets/opensearch/opensearch_traces.py +13 -16
- holmes/plugins/toolsets/openshift.yaml +283 -0
- holmes/plugins/toolsets/prometheus/prometheus.py +915 -390
- holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2 +43 -2
- holmes/plugins/toolsets/prometheus/utils.py +28 -0
- holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py +9 -10
- holmes/plugins/toolsets/robusta/robusta.py +236 -65
- holmes/plugins/toolsets/robusta/robusta_instructions.jinja2 +26 -9
- holmes/plugins/toolsets/runbook/runbook_fetcher.py +137 -26
- holmes/plugins/toolsets/service_discovery.py +1 -1
- holmes/plugins/toolsets/servicenow_tables/instructions.jinja2 +83 -0
- holmes/plugins/toolsets/servicenow_tables/servicenow_tables.py +426 -0
- holmes/plugins/toolsets/utils.py +88 -0
- holmes/utils/config_utils.py +91 -0
- holmes/utils/default_toolset_installation_guide.jinja2 +1 -22
- holmes/utils/env.py +7 -0
- holmes/utils/global_instructions.py +75 -10
- holmes/utils/holmes_status.py +2 -1
- holmes/utils/holmes_sync_toolsets.py +0 -2
- holmes/utils/krr_utils.py +188 -0
- holmes/utils/sentry_helper.py +41 -0
- holmes/utils/stream.py +61 -7
- holmes/version.py +34 -14
- holmesgpt-0.16.2a0.dist-info/LICENSE +178 -0
- {holmesgpt-0.13.2.dist-info → holmesgpt-0.16.2a0.dist-info}/METADATA +29 -27
- {holmesgpt-0.13.2.dist-info → holmesgpt-0.16.2a0.dist-info}/RECORD +126 -102
- holmes/core/performance_timing.py +0 -72
- holmes/plugins/toolsets/grafana/tempo_api.py +0 -124
- holmes/plugins/toolsets/grafana/toolset_grafana_loki.py +0 -110
- holmes/plugins/toolsets/newrelic.py +0 -231
- holmes/plugins/toolsets/servicenow/install.md +0 -37
- holmes/plugins/toolsets/servicenow/instructions.jinja2 +0 -3
- holmes/plugins/toolsets/servicenow/servicenow.py +0 -219
- holmesgpt-0.13.2.dist-info/LICENSE.txt +0 -21
- {holmesgpt-0.13.2.dist-info → holmesgpt-0.16.2a0.dist-info}/WHEEL +0 -0
- {holmesgpt-0.13.2.dist-info → holmesgpt-0.16.2a0.dist-info}/entry_points.txt +0 -0
holmes/plugins/toolsets/prometheus/prometheus_instructions.jinja2

@@ -1,6 +1,27 @@
 
 # Prometheus/PromQL queries
-
+
+## Efficient Metric Discovery (when needed)
+* When you need to discover metrics, use `get_metric_names` with filters - it's the fastest method
+* Combine multiple patterns with regex OR (|) to reduce API calls:
+  - `{__name__=~"node_cpu.*|node_memory.*|node_disk.*"}` - get all node resource metrics in one call
+  - `{__name__=~"container.*|pod.*|kube.*"}` - get all Kubernetes-related metrics
+  - `{namespace=~"example1|example2|example3"}` - metrics from multiple namespaces
+* Use `get_metric_metadata` after discovering names to get types/descriptions if needed
+* Use `get_label_values` to discover pods, namespaces, jobs: e.g., get_label_values(label="pod")
+* Only use `get_series` when you need full label sets (slower than other methods)
+
+## Retrying queries that return too much data
+* When a Prometheus query returns too much data (e.g., truncation error), you MUST retry with a more specific query, fewer data points, or topk/bottomk
+* NEVER EVER EVER answer a question based on Prometheus data that was truncated as you might be missing important information and give the totally wrong answer
+* Prefer telling the user you can't answer the question because of too much data rather than answering based on incomplete data
+* You are also able to show graphs to the user (using the promql embed functionality mentioned below) so you can show users graphs and THEY can interpret the data themselves, even if you can't answer.
+* Do NOT hesitate to try alternative queries and try to reduce the amount of data returned until you get a successful query
+* Be extremely, extremely cautious when answering based on get_label_values because the existence of a label value says NOTHING about the metric value itself (is it high, low, or perhaps the label exists in Prometheus but it's an older series not present right now)
+* DO NOT give answers about metrics based on what 'is typically the case' or 'common knowledge' - if you can't see the actual metric value, you MUST NEVER EVER answer about it - just tell the user your limitations due to the size of the data
+
+## Alert Investigation & Query Execution
+* When investigating a Prometheus alert, ALWAYS call list_prometheus_rules to get the alert definition
 * Use Prometheus to query metrics from the alert promql
 * Use prometheus to execute promql queries with the tools `execute_prometheus_instant_query` and `execute_prometheus_range_query`
 * To create queries, use 'start_timestamp' and 'end_timestamp' as graphs start and end times
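Note: the regex-OR guidance above maps onto a single series-metadata request against the standard Prometheus HTTP API. A minimal sketch, assuming direct access to a Prometheus endpoint (how the `get_metric_names` tool issues this internally is not shown in this diff):

```python
import requests

PROM_URL = "http://localhost:9090"  # hypothetical Prometheus endpoint

# One discovery call with a regex-OR selector instead of three separate calls
resp = requests.get(
    f"{PROM_URL}/api/v1/label/__name__/values",
    params={"match[]": '{__name__=~"node_cpu.*|node_memory.*|node_disk.*"}'},
    timeout=30,
)
resp.raise_for_status()
metric_names = resp.json()["data"]  # e.g. ["node_cpu_seconds_total", ...]
print(f"discovered {len(metric_names)} node resource metrics in one call")
```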
@@ -16,9 +37,29 @@
 ** Avoid global averages like `sum(rate(<metric>_sum)) / sum(rate(<metric>_count))` because it hides data and is not generally informative
 * Timestamps MUST be in string date format. For example: '2025-03-15 10:10:08.610862+00:00'
 * Post processing will parse your response, re-run the query from the tool output and create a chart visible to the user
-*
+* When unsure about available metrics, use `get_metric_names` with appropriate filters (combine multiple patterns with | for efficiency). Then use `get_metric_metadata` if you need descriptions/types
 * Check that any node, service, pod, container, app, namespace, etc. mentioned in the query exist in the kubernetes cluster before making a query. Use any appropriate kubectl tool(s) for this
 * The toolcall will return no data to you. That is expected. You MUST however ensure that the query is successful.
+
+## Handling High-Cardinality Metrics
+* CRITICAL: When querying metrics that may return many time series (>10), ALWAYS use aggregation to limit results
+* ALWAYS use `topk()` or `bottomk()` to limit the number of series returned
+* Standard pattern for high-cardinality queries:
+  - Use `topk(5, <your_query>)` to get the top 5 series
+  - Example: `topk(5, rate(container_cpu_usage_seconds_total{namespace="example"}[5m]))`
+  - This prevents context overflow and focuses on the most relevant data
+* To also capture the aggregate of remaining series as "other":
+```
+topk(5, rate(container_cpu_usage_seconds_total{namespace="example"}[5m])) or label_replace((sum(rate(container_cpu_usage_seconds_total{namespace="example"}[5m])) - sum(topk(5, rate(container_cpu_usage_seconds_total{namespace="example"}[5m])))), "pod", "other", "", "")
+```
+* Common high-cardinality scenarios requiring topk():
+  - Pod-level metrics in namespaces with many pods
+  - Container-level CPU/memory metrics
+  - HTTP metrics with many endpoints or status codes
+  - Any query returning more than 10 time series
+* For initial exploration, you may use instant queries with `count()` to check cardinality:
+  - Example: `count(count by (pod) (container_cpu_usage_seconds_total{namespace="example"}))`
+  - If count > 10, use topk() in your range query
 * When doing queries, always extend the time range, to 15 min before and after the alert start time
 * ALWAYS embed the execution results into your answer
 * ALWAYS embed a Prometheus graph in the response. The graph should visualize data related to the incident.
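Note: the count()-then-topk() pattern above is mechanical. A minimal sketch, assuming direct access to the standard Prometheus HTTP API (inside HolmesGPT this would go through `execute_prometheus_instant_query` / `execute_prometheus_range_query`):

```python
import requests

PROM_URL = "http://localhost:9090"  # hypothetical endpoint
base = 'rate(container_cpu_usage_seconds_total{namespace="example"}[5m])'

# Step 1: instant query counting the number of per-pod series
result = requests.get(
    f"{PROM_URL}/api/v1/query",
    params={"query": f"count(count by (pod) ({base}))"},
    timeout=30,
).json()["data"]["result"]
cardinality = int(float(result[0]["value"][1])) if result else 0

# Step 2: only wrap in topk() when the series count would flood the context
query = f"topk(5, {base})" if cardinality > 10 else base
print(f"{cardinality} series -> running: {query}")
```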
holmes/plugins/toolsets/prometheus/utils.py (new file)

@@ -0,0 +1,28 @@
+import re
+from typing import Optional, Union
+
+
+def parse_duration_to_seconds(v: Optional[Union[str, float, int]]) -> Optional[float]:
+    if v is None:
+        return None
+    if isinstance(v, (int, float)):
+        return float(v)
+    s = v.strip().lower()
+    if s.isdigit():
+        return float(int(s))
+
+    units = {"s": 1, "m": 60, "h": 3600, "d": 86400}
+
+    # Check for partial time formats (e.g., 1h30m, 5m12s, 1d2h30m)
+    pattern = r"(\d+(?:\.\d+)?)(d|h|m|s)"
+    matches = re.findall(pattern, s)
+
+    if matches:
+        total_seconds = 0.0
+        for value_str, unit in matches:
+            value = float(value_str)
+            total_seconds += value * units[unit]
+        return float(int(total_seconds))
+
+    # fallback: try float seconds
+    return float(s)
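The helper accepts plain numbers, digit strings, and compound unit strings; note that compound totals are truncated to whole seconds by the `float(int(...))` round-trip, while the bare-float fallback is not. A quick usage sketch (import path inferred from the file listing above):

```python
from holmes.plugins.toolsets.prometheus.utils import parse_duration_to_seconds

assert parse_duration_to_seconds(None) is None
assert parse_duration_to_seconds(90) == 90.0         # numbers pass through
assert parse_duration_to_seconds("90") == 90.0       # digit-only strings too
assert parse_duration_to_seconds("1h30m") == 5400.0  # compound formats
assert parse_duration_to_seconds("1d2h") == 93600.0
assert parse_duration_to_seconds("2.5m") == 150.0    # fractional values allowed
assert parse_duration_to_seconds("1.5s") == 1.0      # truncated to whole seconds
assert parse_duration_to_seconds("0.5") == 0.5       # fallback: float seconds
```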
holmes/plugins/toolsets/rabbitmq/toolset_rabbitmq.py

@@ -7,8 +7,9 @@ from holmes.core.tools import (
     CallablePrerequisite,
     StructuredToolResult,
     Tool,
+    ToolInvokeContext,
     ToolParameter,
-
+    StructuredToolResultStatus,
     Toolset,
     ToolsetTag,
 )

@@ -63,9 +64,7 @@ class ListConfiguredClusters(BaseRabbitMQTool):
             toolset=toolset,
         )
 
-    def _invoke(
-        self, params: dict, user_approved: bool = False
-    ) -> StructuredToolResult:
+    def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
         if not self.toolset.config:
             raise ValueError("RabbitMQ is not configured.")
 

@@ -79,7 +78,7 @@ class ListConfiguredClusters(BaseRabbitMQTool):
            if c.connection_status == ClusterConnectionStatus.SUCCESS
         ]
         return StructuredToolResult(
-            status=
+            status=StructuredToolResultStatus.SUCCESS, data=available_clusters
         )
 
     def get_parameterized_one_liner(self, params) -> str:

@@ -103,21 +102,21 @@ class GetRabbitMQClusterStatus(BaseRabbitMQTool):
             toolset=toolset,
         )
 
-    def _invoke(
-        self, params: dict, user_approved: bool = False
-    ) -> StructuredToolResult:
+    def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
         try:
             # Fetch node details which include partition info
             cluster_config = self._get_cluster_config(
                 cluster_id=params.get("cluster_id")
             )
             result = get_cluster_status(cluster_config)
-            return StructuredToolResult(
+            return StructuredToolResult(
+                status=StructuredToolResultStatus.SUCCESS, data=result
+            )
 
         except Exception as e:
             logging.info("Failed to process RabbitMQ cluster status", exc_info=True)
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.ERROR,
                 error=f"Unexpected error fetching RabbitMQ cluster status: {str(e)}",
                 data=None,
             )
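Note: the recurring mechanical change across these toolsets is the `_invoke` signature, where the `user_approved: bool` flag is replaced by a `ToolInvokeContext` argument. A minimal sketch of a tool written against the new contract (the tool itself is hypothetical; only the imported names and the signature come from the diff):

```python
from holmes.core.tools import (
    StructuredToolResult,
    StructuredToolResultStatus,
    Tool,
    ToolInvokeContext,
    ToolParameter,
)


class EchoTool(Tool):  # hypothetical example tool
    def __init__(self):
        super().__init__(
            name="echo",
            description="Echoes back the supplied target.",
            parameters={
                "target": ToolParameter(
                    description="Name to echo back", type="string", required=True
                ),
            },
        )

    # New-style signature: a ToolInvokeContext replaces the user_approved flag
    def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
        return StructuredToolResult(
            status=StructuredToolResultStatus.SUCCESS,
            data=f"echo: {params['target']}",
            params=params,
        )

    def get_parameterized_one_liner(self, params: dict) -> str:
        return f"Echo {params}"
```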
holmes/plugins/toolsets/robusta/robusta.py

@@ -3,21 +3,27 @@ import os
 import logging
 
 from typing import Optional, Dict, Any, List
-from holmes.
+from holmes.common.env_vars import load_bool
+from holmes.core.supabase_dal import SupabaseDal, FindingType
 from holmes.core.tools import (
     StaticPrerequisite,
     Tool,
+    ToolInvokeContext,
     ToolParameter,
     Toolset,
     ToolsetTag,
 )
-from holmes.core.tools import StructuredToolResult,
+from holmes.core.tools import StructuredToolResult, StructuredToolResultStatus
+
+PULL_EXTERNAL_FINDINGS = load_bool("PULL_EXTERNAL_FINDINGS", False)
 
 PARAM_FINDING_ID = "id"
 START_TIME = "start_datetime"
 END_TIME = "end_datetime"
 NAMESPACE = "namespace"
 WORKLOAD = "workload"
+DEFAULT_LIMIT_CHANGE_ROWS = 100
+MAX_LIMIT_CHANGE_ROWS = 200
 
 
 class FetchRobustaFinding(Tool):
@@ -26,7 +32,7 @@ class FetchRobustaFinding(Tool):
     def __init__(self, dal: Optional[SupabaseDal]):
         super().__init__(
             name="fetch_finding_by_id",
-            description="Fetches a robusta finding. Findings are events, like a Prometheus alert or a deployment update",
+            description="Fetches a robusta finding. Findings are events, like a Prometheus alert or a deployment update and configuration change.",
             parameters={
                 PARAM_FINDING_ID: ToolParameter(
                     description="The id of the finding to fetch",
@@ -45,21 +51,19 @@
         logging.error(error)
         return {"error": error}
 
-    def _invoke(
-        self, params: dict, user_approved: bool = False
-    ) -> StructuredToolResult:
+    def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
         finding_id = params[PARAM_FINDING_ID]
         try:
             finding = self._fetch_finding(finding_id)
             if finding:
                 return StructuredToolResult(
-                    status=
+                    status=StructuredToolResultStatus.SUCCESS,
                     data=finding,
                     params=params,
                 )
             else:
                 return StructuredToolResult(
-                    status=
+                    status=StructuredToolResultStatus.NO_DATA,
                     data=f"Could not find a finding with finding_id={finding_id}",
                     params=params,
                 )
@@ -70,13 +74,13 @@
         )
 
         return StructuredToolResult(
-            status=
+            status=StructuredToolResultStatus.ERROR,
            data=f"There was an internal error while fetching finding {finding_id}",
             params=params,
         )
 
     def get_parameterized_one_liner(self, params: Dict) -> str:
-        return "Robusta: Fetch
+        return f"Robusta: Fetch finding data {params}"
 
 
 class FetchResourceRecommendation(Tool):
@@ -85,124 +89,285 @@ class FetchResourceRecommendation(Tool):
     def __init__(self, dal: Optional[SupabaseDal]):
         super().__init__(
             name="fetch_resource_recommendation",
-            description=
+            description=(
+                "Fetch KRR (Kubernetes Resource Recommendations) for CPU and memory optimization. "
+                "KRR provides AI-powered recommendations based on actual historical usage patterns for right-sizing workloads. "
+                "Supports two usage modes: "
+                "(1) Specific workload lookup - Use name_pattern with an exact name, namespace, and kind to get recommendations for a single workload. "
+                "(2) Discovery mode - Use limit and sort_by to get a ranked list of top optimization opportunities. Optionally filter by namespace, name_pattern (wildcards supported), kind, or container. "
+                "Returns current configured resources alongside recommended values. In discovery mode, results are sorted by potential savings."
+            ),
             parameters={
-                "
-                    description="
+                "limit": ToolParameter(
+                    description="Maximum number of recommendations to return (default: 10, max: 100).",
+                    type="integer",
+                    required=False,
+                ),
+                "sort_by": ToolParameter(
+                    description=(
+                        "Field to sort recommendations by potential savings. Options: "
+                        "'cpu_total' (default) - Total CPU savings (requests + limits), "
+                        "'memory_total' - Total memory savings (requests + limits), "
+                        "'cpu_requests' - CPU requests savings, "
+                        "'memory_requests' - Memory requests savings, "
+                        "'cpu_limits' - CPU limits savings, "
+                        "'memory_limits' - Memory limits savings, "
+                        "'priority' - Use scan priority field."
+                    ),
                     type="string",
-                    required=
+                    required=False,
                 ),
                 "namespace": ToolParameter(
-                    description="
+                    description="Filter by Kubernetes namespace (exact match). Leave empty to search all namespaces.",
                     type="string",
-                    required=
+                    required=False,
+                ),
+                "name_pattern": ToolParameter(
+                    description=(
+                        "Filter by workload name pattern. Supports SQL LIKE patterns: "
+                        "Use '%' as wildcard (e.g., '%app%' matches any name containing 'app', "
+                        "'prod-%' matches names starting with 'prod-'). "
+                        "Leave empty to match all names."
+                    ),
+                    type="string",
+                    required=False,
                 ),
                 "kind": ToolParameter(
-                    description=
+                    description=(
+                        "Filter by Kubernetes resource kind. "
+                        "Must be one of: Deployment, StatefulSet, DaemonSet, Job. "
+                        "Leave empty to include all kinds."
+                    ),
                     type="string",
-                    required=
+                    required=False,
+                ),
+                "container": ToolParameter(
+                    description="Filter by container name (exact match). Leave empty to include all containers.",
+                    type="string",
+                    required=False,
                 ),
             },
         )
         self._dal = dal
 
-    def
+    def _fetch_recommendations(self, params: Dict) -> Optional[List[Dict]]:
         if self._dal and self._dal.enabled:
+            # Set default values
+            limit = min(params.get("limit", 10) or 10, 100)
+            sort_by = params.get("sort_by") or "cpu_total"
+
             return self._dal.get_resource_recommendation(
-
-
-
+                limit=limit,
+                sort_by=sort_by,
+                namespace=params.get("namespace"),
+                name_pattern=params.get("name_pattern"),
+                kind=params.get("kind"),
+                container=params.get("container"),
             )
         return None
 
-    def _invoke(
-        self, params: dict, user_approved: bool = False
-    ) -> StructuredToolResult:
+    def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
         try:
-            recommendations = self.
+            recommendations = self._fetch_recommendations(params)
             if recommendations:
                 return StructuredToolResult(
-                    status=
+                    status=StructuredToolResultStatus.SUCCESS,
                     data=recommendations,
                     params=params,
                 )
             else:
                 return StructuredToolResult(
-                    status=
-                    data=f"Could not find recommendations
+                    status=StructuredToolResultStatus.NO_DATA,
+                    data=f"Could not find any recommendations with filters: {params}",
                     params=params,
                 )
         except Exception as e:
-            msg = f"There was an
+            msg = f"There was an error while fetching top recommendations for {params}. {str(e)}"
             logging.exception(msg)
             return StructuredToolResult(
-                status=
-
+                status=StructuredToolResultStatus.ERROR,
+                error=msg,
                 params=params,
             )
 
     def get_parameterized_one_liner(self, params: Dict) -> str:
-        return f"Robusta:
+        return f"Robusta: Fetch KRR Recommendations ({str(params)})"
 
 
-class
+class FetchConfigurationChangesMetadataBase(Tool):
     _dal: Optional[SupabaseDal]
 
-    def __init__(
+    def __init__(
+        self,
+        dal: Optional[SupabaseDal],
+        name: str,
+        description: str,
+        add_cluster_filter: bool = True,
+    ):
+        """
+        We need separate tools for external and cluster configuration changes due to the different cluster parameters that are not on "external" changes like 'workload' and 'namespace'.
+        add_cluster_filter: adds the namespace and workload parameters for configuration changes tool.
+        """
+        parameters = {
+            START_TIME: ToolParameter(
+                description="The starting time boundary for the search period. String in RFC3339 format.",
+                type="string",
+                required=True,
+            ),
+            END_TIME: ToolParameter(
+                description="The ending time boundary for the search period. String in RFC3339 format.",
+                type="string",
+                required=True,
+            ),
+            "limit": ToolParameter(
+                description=f"Maximum number of rows to return. Default is {DEFAULT_LIMIT_CHANGE_ROWS} and the maximum is 200",
+                type="integer",
+                required=False,
+            ),
+        }
+
+        if add_cluster_filter:
+            parameters.update(
+                {
+                    "namespace": ToolParameter(
+                        description="The Kubernetes namespace name for filtering configuration changes",
+                        type="string",
+                        required=False,
+                    ),
+                    "workload": ToolParameter(
+                        description="Kubernetes resource name to filter configuration changes (e.g., Pod, Deployment, Job, etc.). Must be the full name. For Pods, include the exact generated suffix.",
+                        type="string",
+                        required=False,
+                    ),
+                }
+            )
+
         super().__init__(
-            name=
-            description=
-            parameters=
-                START_TIME: ToolParameter(
-                    description="The starting time boundary for the search period. String in RFC3339 format.",
-                    type="string",
-                    required=True,
-                ),
-                END_TIME: ToolParameter(
-                    description="The starting time boundary for the search period. String in RFC3339 format.",
-                    type="string",
-                    required=True,
-                ),
-            },
+            name=name,
+            description=description,
+            parameters=parameters,
         )
         self._dal = dal
 
-    def
+    def _fetch_issues(
+        self,
+        params: Dict,
+        cluster: Optional[str] = None,
+        finding_type: FindingType = FindingType.CONFIGURATION_CHANGE,
+    ) -> Optional[List[Dict]]:
         if self._dal and self._dal.enabled:
-            return self._dal.
+            return self._dal.get_issues_metadata(
                 start_datetime=params["start_datetime"],
                 end_datetime=params["end_datetime"],
+                limit=min(
+                    params.get("limit") or DEFAULT_LIMIT_CHANGE_ROWS,
+                    MAX_LIMIT_CHANGE_ROWS,
+                ),
+                ns=params.get("namespace"),
+                workload=params.get("workload"),
+                cluster=cluster,
+                finding_type=finding_type,
             )
         return None
 
-    def _invoke(
-        self, params: dict, user_approved: bool = False
-    ) -> StructuredToolResult:
+    def _invoke(self, params: dict, context: ToolInvokeContext) -> StructuredToolResult:
         try:
-            changes = self.
+            changes = self._fetch_issues(params)
             if changes:
                 return StructuredToolResult(
-                    status=
+                    status=StructuredToolResultStatus.SUCCESS,
                     data=changes,
                     params=params,
                 )
             else:
                 return StructuredToolResult(
-                    status=
-                    data=f"
+                    status=StructuredToolResultStatus.NO_DATA,
+                    data=f"{self.name} found no data. {params}",
                     params=params,
                 )
         except Exception as e:
             msg = f"There was an internal error while fetching changes for {params}. {str(e)}"
             logging.exception(msg)
             return StructuredToolResult(
-                status=
+                status=StructuredToolResultStatus.ERROR,
                 data=msg,
                 params=params,
             )
 
     def get_parameterized_one_liner(self, params: Dict) -> str:
-        return "Robusta: Search Change History"
+        return f"Robusta: Search Change History {params}"
+
+
+class FetchConfigurationChangesMetadata(FetchConfigurationChangesMetadataBase):
+    def __init__(self, dal: Optional[SupabaseDal]):
+        super().__init__(
+            dal=dal,
+            name="fetch_configuration_changes_metadata",
+            description=(
+                "Fetch configuration changes metadata in a given time range. "
+                "By default, fetch all cluster changes. Can be filtered on a given namespace or a specific kubernetes resource. "
+                "Use fetch_finding_by_id to get detailed change of one specific configuration change."
+            ),
+        )
+
+
+class FetchExternalConfigurationChangesMetadata(FetchConfigurationChangesMetadataBase):
+    """
+    Fetch configuration changes from external sources, e.g., LaunchDarkly changes.
+    It needs to be a separate tool due to the different cluster parameter used in the DAL method like workload and namespace.
+    """
+
+    def __init__(self, dal: Optional[SupabaseDal]):
+        super().__init__(
+            dal=dal,
+            name="fetch_external_configuration_changes_metadata",
+            description=(
+                "Fetch external configuration changes metadata in a given time range. "
+                "Fetches configuration changes from external sources. "
+                "Use fetch_finding_by_id to get detailed change of one specific configuration change."
+            ),
+            add_cluster_filter=False,
+        )
+
+    def _fetch_issues(self, params: Dict) -> Optional[List[Dict]]:  # type: ignore
+        return super()._fetch_issues(params, cluster="external")
+
+    def get_parameterized_one_liner(self, params: Dict) -> str:
+        return f"Robusta: Search External Change History {params}"
+
+
+class FetchResourceIssuesMetadata(FetchConfigurationChangesMetadataBase):
+    def __init__(self, dal: Optional[SupabaseDal]):
+        super().__init__(
+            dal=dal,
+            name="fetch_resource_issues_metadata",
+            description=(
+                "Fetch issues and alert metadata in a given time range. "
+                "Must be filtered on a given namespace and specific kubernetes resource, such as pod, deployment, job, etc. "
+                "Use fetch_finding_by_id to get further information on a specific issue or alert."
+            ),
+            add_cluster_filter=False,
+        )
+        self.parameters.update(
+            {
+                "namespace": ToolParameter(
+                    description="The Kubernetes namespace name for filtering issues and alerts",
+                    type="string",
+                    required=True,
+                ),
+                "workload": ToolParameter(
+                    description="Kubernetes resource name to filter issues and alerts (e.g., Pod, Deployment, Job, etc.). Must be the full name. For Pods, include the exact generated suffix.",
+                    type="string",
+                    required=True,
+                ),
+            }
+        )
+
+    def _fetch_issues(self, params: Dict) -> Optional[List[Dict]]:  # type: ignore
+        return super()._fetch_issues(params, finding_type=FindingType.ISSUE)
+
+    def get_parameterized_one_liner(self, params: Dict) -> str:
+        return f"Robusta: fetch resource issues metadata {params}"
 
 
 class RobustaToolset(Toolset):
@@ -216,17 +381,23 @@ class RobustaToolset(Toolset):
             enabled=dal.enabled, disabled_reason="Data access layer is disabled"
         )
 
+        tools = [
+            FetchRobustaFinding(dal),
+            FetchConfigurationChangesMetadata(dal),
+            FetchResourceRecommendation(dal),
+            FetchResourceIssuesMetadata(dal),
+        ]
+
+        if PULL_EXTERNAL_FINDINGS:
+            tools.append(FetchExternalConfigurationChangesMetadata(dal))
+
         super().__init__(
             icon_url="https://cdn.prod.website-files.com/633e9bac8f71dfb7a8e4c9a6/646be7710db810b14133bdb5_logo.svg",
             description="Fetches alerts metadata and change history",
             docs_url="https://holmesgpt.dev/data-sources/builtin-toolsets/robusta/",
             name="robusta",
             prerequisites=[dal_prereq],
-            tools=
-                FetchRobustaFinding(dal),
-                FetchConfigurationChanges(dal),
-                FetchResourceRecommendation(dal),
-            ],
+            tools=tools,
             tags=[
                 ToolsetTag.CORE,
             ],
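Note: `FetchExternalConfigurationChangesMetadata` is opt-in via the `PULL_EXTERNAL_FINDINGS` environment variable, which is evaluated once at module import time. A sketch of enabling it, assuming `load_bool` follows the usual truthy-string convention (its implementation is not part of this diff):

```python
import os

# Must be set before the module is imported, because the flag is read at
# import time via PULL_EXTERNAL_FINDINGS = load_bool("PULL_EXTERNAL_FINDINGS", False).
os.environ["PULL_EXTERNAL_FINDINGS"] = "true"

from holmes.plugins.toolsets.robusta.robusta import RobustaToolset  # noqa: E402

# A RobustaToolset built from here on also registers
# fetch_external_configuration_changes_metadata alongside the default tools.
```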
holmes/plugins/toolsets/robusta/robusta_instructions.jinja2

@@ -1,7 +1,10 @@
 # Configuration and alerts history
-* Use
-*
-*
+* Use fetch_configuration_changes_metadata to get historical configuration changes in a cluster or for a specific workload.
+* If a change seems important to the investigation, use fetch_finding_by_id with the configuration change ID to get full details of the change.
+* You must ALWAYS call fetch_configuration_changes_metadata when investigating an alert
+* Never respond without calling fetch_configuration_changes_metadata
+* When investigating a resource (pod, deployment, or job), if no relevant information is available from the live cluster at the time of investigation, call the fetch_resource_issues_metadata function to retrieve its historical alert data.
+* You can use fetch_resource_issues_metadata to get issues context for a specific kubernetes resource. Start with a 4-hour window and try to expand to a 24-hour window if nothing comes up.
 * When investigating an alert, look at historical configuration changes that happened 4 hours before the alert started
 * If you found a change that caused the alert, you MUST write: 'The issue was introduced by ...' with a short description of the change, and the date of it.
 For example:
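For concreteness, a tool call implementing the 4-hour lookback described above might carry arguments like these (parameter names come from the tool schema in robusta.py; the values are illustrative):

```python
# Illustrative arguments for fetch_configuration_changes_metadata, covering
# the 4 hours before an alert that fired at 2025-03-15T10:10:08Z.
params = {
    "start_datetime": "2025-03-15T06:10:08Z",  # RFC3339, required
    "end_datetime": "2025-03-15T10:10:08Z",    # RFC3339, required
    "namespace": "prod",                       # optional filter
    "workload": "db-7c9b5d4f6-x2x1q",          # optional; full name incl. pod suffix
    "limit": 100,                              # optional; default 100, max 200
}
```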
@@ -11,12 +14,26 @@ For example:
 * Embed it in with the evidence id and a title describing the change. Use this format:
 << { "type": "diff", "evidence_id": "8a4d1369-0e98-4ff2-b180-699d5ff286ab", "title": "Image change on the DB workload" } >>
 
-# Resource and efficiency recommendations
-*
-*
-
-
-
+# Resource and efficiency recommendations (KRR)
+* KRR (Kubernetes Resource Recommendations) provides AI-powered recommendations for right-sizing CPU and memory requests/limits
+* Use fetch_resource_recommendation for all KRR queries - it supports two modes:
+  - **Discovery mode**: Get a ranked list of top optimization opportunities across multiple workloads
+    - Use limit and sort_by parameters to control ranking (CPU savings, memory savings, or priority)
+    - Supports filtering by namespace, name_pattern (with wildcards like '%app%'), kind, and container
+    - Returns up to 100 recommendations sorted by potential impact
+    - Use this for questions like "top recommendations", "cost savings opportunities", "what to optimize"
+  - **Specific lookup mode**: Get recommendations for a single known workload
+    - Use name_pattern with exact workload name, along with namespace and kind
+    - Best for focused analysis when you already know which workload to investigate
+* When asked if a resource can be optimized, or if resources are over-utilized, use fetch_resource_recommendation to answer
+* When asked about "GPU workloads" or filtering out GPU-based resources, you can use filters like name_pattern or namespace to exclude them
+* Right-sizing of resources is key to avoiding performance issues and achieving cost savings
+* Examples of questions that use fetch_resource_recommendation:
+  - "Show me top CPU recommendations" → Use limit=10, sort_by='cpu_total'
+  - "What are the biggest memory optimization opportunities?" → Use limit=10, sort_by='memory_total'
+  - "Show me top KRR recommendations for non-GPU workloads" → Use name_pattern filter or namespace filter
+  - "Find workloads in namespace X that can save the most CPU" → Use namespace='X', sort_by='cpu_total'
+  - "Get recommendations for deployment nginx in namespace prod" → Use name_pattern='nginx', namespace='prod', kind='Deployment'
 
 # Investigating issues
 * If provided an issue id (a.k.a. a finding), use `fetch_finding_by_id` to get more information about that issue